2008-08-21 05:46:33

by Jared Hulbert

[permalink] [raw]
Subject: [PATCH 04/10] AXFS: axfs_inode.c

This is the main filesystem logic.

Signed-off-by: Jared Hulbert <[email protected]>
---
diff --git a/fs/axfs/axfs_inode.c b/fs/axfs/axfs_inode.c
new file mode 100644
index 0000000..cbf3dd1
--- /dev/null
+++ b/fs/axfs/axfs_inode.c
@@ -0,0 +1,488 @@
+/*
+ * Advanced XIP File System for Linux - AXFS
+ * Readonly, compressed, and XIP filesystem for Linux systems big and small
+ *
+ * Copyright(c) 2008 Numonyx
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * Authors:
+ * Eric Anderson
+ * Jared Hulbert <[email protected]>
+ * Sujaya Srinivasan
+ * Justin Treon
+ *
+ * Project url: http://axfs.sourceforge.net
+ *
+ * Borrowed heavily from fs/cramfs/inode.c by Linus Torvalds
+ *
+ * axfs_inode.c -
+ * Contains the most of the filesystem logic with the major exception of the
+ * mounting infrastructure.
+ *
+ */
+
+#include <linux/axfs.h>
+#include <linux/pagemap.h>
+
+/***************** functions in other axfs files ******************************/
+int axfs_get_sb(struct file_system_type *, int, const char *, void *,
+ struct vfsmount *);
+void axfs_kill_super(struct super_block *);
+void axfs_profiling_add(struct axfs_super *, unsigned long, unsigned int);
+int axfs_copy_mtd(struct super_block *, void *, u64, u64);
+int axfs_copy_block(struct super_block *, void *, u64, u64);
+/******************************************************************************/
+static int axfs_readdir(struct file *, void *, filldir_t);
+static int axfs_mmap(struct file *, struct vm_area_struct *);
+static ssize_t axfs_file_read(struct file *, char __user *, size_t, loff_t *);
+static int axfs_readpage(struct file *, struct page *);
+static int axfs_fault(struct vm_area_struct *, struct vm_fault *);
+static struct dentry *axfs_lookup(struct inode *, struct dentry *,
+ struct nameidata *);
+static int axfs_get_xip_mem(struct address_space *, pgoff_t, int, void **,
+ unsigned long *);
+
+/******************************************************************************/
+
+static struct file_operations axfs_directory_operations = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+ .readdir = axfs_readdir,
+};
+
+static struct file_operations axfs_fops = {
+ .read = axfs_file_read,
+ .aio_read = generic_file_aio_read,
+ .mmap = axfs_mmap,
+};
+
+static struct address_space_operations axfs_aops = {
+ .readpage = axfs_readpage,
+ .get_xip_mem = axfs_get_xip_mem,
+};
+
+static struct inode_operations axfs_dir_inode_operations = {
+ .lookup = axfs_lookup,
+};
+
+static struct vm_operations_struct axfs_vm_ops = {
+ .fault = axfs_fault,
+};
+
+static int axfs_copy_data(struct super_block *sb, void *dst,
+ struct axfs_region_desc *region, u64 offset, u64 len)
+{
+ u64 mmapped = 0;
+ u64 end = region->fsoffset + offset + len;
+ u64 begin = region->fsoffset + offset;
+ u64 left;
+ void *addr;
+ void *newdst;
+ struct axfs_super *sbi = AXFS_SB(sb);
+
+ if (len == 0)
+ return 0;
+
+ if (region->virt_addr) {
+ if (sbi->mmap_size >= end) {
+ mmapped = len;
+ } else if (sbi->mmap_size > begin) {
+ mmapped = sbi->mmap_size - begin;
+ }
+ }
+
+ if (mmapped) {
+ addr = (void *)(region->virt_addr + offset);
+ memcpy(dst, addr, mmapped);
+ }
+
+ newdst = (void *)(dst + mmapped);
+ left = len - mmapped;
+
+ if (left == 0)
+ return len;
+
+ if (AXFS_HAS_BDEV(sb)) {
+ return axfs_copy_block(sb, newdst, begin + mmapped, left);
+ } else if (AXFS_HAS_MTD(sb)) {
+ return axfs_copy_mtd(sb, newdst, begin + mmapped, left);
+ } else {
+ return 0;
+ }
+}
+
+static int axfs_iget5_test(struct inode *inode, void *opaque)
+{
+ u64 *inode_number = (u64 *) opaque;
+
+ if (inode->i_sb == NULL) {
+ printk(KERN_ERR "axfs_iget5_test:"
+ " the super block is set to null\n");
+ }
+ if (inode->i_ino == *inode_number)
+ return 1; /* matches */
+ else
+ return 0; /* does not match */
+}
+
+static int axfs_iget5_set(struct inode *inode, void *opaque)
+{
+ u64 *inode_number = (u64 *) opaque;
+
+ if (inode->i_sb == NULL) {
+ printk(KERN_ERR "axfs_iget5_set:"
+ " the super block is set to null \n");
+ }
+ inode->i_ino = *inode_number;
+ return 0;
+}
+
+struct inode *axfs_create_vfs_inode(struct super_block *sb, int ino)
+{
+ struct axfs_super *sbi = AXFS_SB(sb);
+ struct inode *inode;
+ u64 size;
+
+ inode = iget5_locked(sb, ino, axfs_iget5_test, axfs_iget5_set, &ino);
+
+ if (!(inode && (inode->i_state & I_NEW)))
+ return inode;
+
+ inode->i_mode = AXFS_GET_MODE(sbi, ino);
+ inode->i_uid = AXFS_GET_UID(sbi, ino);
+ size = AXFS_GET_INODE_FILE_SIZE(sbi, ino);
+ inode->i_size = size;
+ inode->i_blocks = AXFS_GET_INODE_NUM_ENTRIES(sbi, ino);
+ inode->i_blkbits = PAGE_CACHE_SIZE * 8;
+ inode->i_gid = AXFS_GET_GID(sbi, ino);
+
+ inode->i_mtime = inode->i_atime = inode->i_ctime = sbi->timestamp;
+ inode->i_ino = ino;
+
+ if (S_ISREG(inode->i_mode)) {
+ inode->i_fop = &axfs_fops;
+ inode->i_data.a_ops = &axfs_aops;
+ inode->i_mapping->a_ops = &axfs_aops;
+ } else if (S_ISDIR(inode->i_mode)) {
+ inode->i_op = &axfs_dir_inode_operations;
+ inode->i_fop = &axfs_directory_operations;
+ } else if (S_ISLNK(inode->i_mode)) {
+ inode->i_op = &page_symlink_inode_operations;
+ inode->i_data.a_ops = &axfs_aops;
+ } else {
+ inode->i_size = 0;
+ inode->i_blocks = 0;
+ init_special_inode(inode, inode->i_mode, old_decode_dev(size));
+ }
+ unlock_new_inode(inode);
+
+ return inode;
+}
+
+
+static int axfs_mmap(struct file *file, struct vm_area_struct *vma)
+{
+
+ file_accessed(file);
+
+ vma->vm_ops = &axfs_vm_ops;
+
+ vma->vm_flags |= VM_CAN_NONLINEAR | VM_MIXEDMAP;
+
+ return 0;
+}
+
+static struct dentry *axfs_lookup(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd)
+{
+ struct super_block *sb = dir->i_sb;
+ struct axfs_super *sbi = AXFS_SB(sb);
+ u64 ino_number = dir->i_ino;
+ u64 dir_index = 0;
+ u64 entry;
+ char *name;
+ int namelen, err;
+
+ while (dir_index < AXFS_GET_INODE_NUM_ENTRIES(sbi, ino_number)) {
+ entry = AXFS_GET_INODE_ARRAY_INDEX(sbi, ino_number);
+ entry += dir_index;
+
+ name = AXFS_GET_INODE_NAME(sbi, entry);
+ namelen = strlen(name);
+
+ /* fast test, the entries are sorted alphabetically and the
+ * first letter is smaller than the first letter in the search
+ * name then it isn't in this directory. Keeps this loop from
+ * needing to scan through always.
+ */
+ if (dentry->d_name.name[0] < name[0])
+ break;
+
+ dir_index++;
+
+ /* Quick check that the name is roughly the right length */
+ if (dentry->d_name.len != namelen)
+ continue;
+
+ err = memcmp(dentry->d_name.name, name, namelen);
+ if (err > 0)
+ continue;
+
+ /* The file name isn't present in the directory. */
+ if (err < 0)
+ break;
+
+ d_add(dentry, axfs_create_vfs_inode(dir->i_sb, entry));
+ goto out;
+
+ }
+ d_add(dentry, NULL);
+
+out:
+ return NULL;
+}
+
+static int axfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+ struct inode *inode = filp->f_dentry->d_inode;
+ struct super_block *sb = inode->i_sb;
+ struct axfs_super *sbi = AXFS_SB(sb);
+ u64 ino_number = inode->i_ino;
+ u64 entry;
+ loff_t dir_index;
+ char *name;
+ int namelen, mode;
+ int err = 0;
+
+ /* Get the current index into the directory and verify it is not beyond
+ the end of the list */
+ dir_index = filp->f_pos;
+ if (dir_index >= AXFS_GET_INODE_NUM_ENTRIES(sbi, ino_number))
+ goto out;
+
+ /* Verify the inode is for a directory */
+ if (!(S_ISDIR(inode->i_mode))) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ while (dir_index < AXFS_GET_INODE_NUM_ENTRIES(sbi, ino_number)) {
+ entry = AXFS_GET_INODE_ARRAY_INDEX(sbi, ino_number) + dir_index;
+
+ name = (char *)AXFS_GET_INODE_NAME(sbi, entry);
+ namelen = strlen(name);
+
+ mode = (int)AXFS_GET_MODE(sbi, entry);
+ err = filldir(dirent, name, namelen, dir_index, entry, mode);
+
+ if (err)
+ break;
+
+ dir_index++;
+ filp->f_pos = dir_index;
+ }
+
+out:
+ return 0;
+}
+
+/******************************************************************************
+ *
+ * axfs_fault
+ *
+ * Description: This function is mapped into the VMA operations vector, and
+ * gets called on a page fault. Depending on whether the page
+ * is XIP or compressed, xip_file_fault or filemap_fault is
+ * called. This function also logs when a fault occurs when
+ * profiling is on.
+ *
+ * Parameters:
+ * (IN) vma - The virtual memory area corresponding to a file
+ *
+ * (IN) vmf - The fault info pass in by the fault handler
+ *
+ * Returns:
+ * 0 or error number
+ *
+ *****************************************************************************/
+static int axfs_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct file *file = vma->vm_file;
+ struct inode *inode = file->f_dentry->d_inode;
+ struct super_block *sb = inode->i_sb;
+ struct axfs_super *sbi = AXFS_SB(sb);
+ u64 ino_number = inode->i_ino;
+ u64 array_index;
+
+ array_index = AXFS_GET_INODE_ARRAY_INDEX(sbi, ino_number) + vmf->pgoff;
+
+ /* if that pages are marked for write they will probably end up in RAM
+ therefore we don't want their counts for being XIP'd */
+ if (!(vma->vm_flags & VM_WRITE))
+ axfs_profiling_add(sbi, array_index, ino_number);
+
+ /* figure out if the node is XIP or compressed and call the
+ appropriate function
+ */
+ if (AXFS_IS_NODE_XIP(sbi, array_index))
+ return xip_file_fault(vma, vmf);
+ return filemap_fault(vma, vmf);
+
+}
+
+
+/******************************************************************************
+ *
+ * axfs_file_read
+ *
+ * Description: axfs_file_read is mapped into the file_operations vector for
+ * all axfs files. It loops through the pages to be read and calls
+ * either do_sync_read (if the page is a compressed one) or
+ * xip_file_read (if the page is XIP).
+ *
+ * Parameters:
+ * (IN) filp - file to be read
+ *
+ * (OUT) buf - user buffer that is filled with the data that we read.
+ *
+ * (IN) len - length of file to be read
+ *
+ * (IN) ppos - offset within the file to read from
+ *
+ * Returns:
+ * actual size of data read.
+ *
+ *****************************************************************************/
+static ssize_t axfs_file_read(struct file *filp, char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct inode *inode = filp->f_dentry->d_inode;
+ struct super_block *sb = inode->i_sb;
+ struct axfs_super *sbi = AXFS_SB(sb);
+ size_t read = 0, total_read = 0;
+ size_t readlength, actual_size, file_size, remaining;
+ u64 ino_number = inode->i_ino;
+ u64 size, array_index;
+
+ file_size = AXFS_GET_INODE_FILE_SIZE(sbi, ino_number);
+ remaining = file_size - *ppos;
+ actual_size = len > remaining ? remaining : len;
+ readlength = actual_size < PAGE_SIZE ? actual_size : PAGE_SIZE;
+
+ for (size = actual_size; size > 0; size -= read) {
+ array_index = AXFS_GET_INODE_ARRAY_INDEX(sbi, ino_number);
+ array_index += *ppos >> PAGE_SHIFT;
+
+ if (AXFS_IS_NODE_XIP(sbi, array_index)) {
+ read = xip_file_read(filp, buf, readlength, ppos);
+ } else {
+ read = do_sync_read(filp, buf, readlength, ppos);
+ }
+ buf += read;
+ total_read += read;
+
+ if ((len - total_read < PAGE_SIZE) && (total_read != len))
+ readlength = len - total_read;
+ }
+
+ return total_read;
+}
+
+static int axfs_readpage(struct file *file, struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ struct super_block *sb = inode->i_sb;
+ struct axfs_super *sbi = AXFS_SB(sb);
+ u64 array_index, node_index, cnode_index, maxblock, ofs;
+ u64 ino_number = inode->i_ino;
+ u32 max_len, cnode_offset;
+ u32 cblk_size = sbi->cblock_size;
+ u32 len = 0;
+ u8 node_type;
+ void *pgdata;
+ void *src;
+ void *cblk0 = sbi->cblock_buffer[0];
+ void *cblk1 = sbi->cblock_buffer[1];
+
+ maxblock = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ pgdata = kmap(page);
+
+ if (page->index >= maxblock)
+ goto out;
+
+ array_index = AXFS_GET_INODE_ARRAY_INDEX(sbi, ino_number);
+ array_index += page->index;
+
+ node_index = AXFS_GET_NODE_INDEX(sbi, array_index);
+ node_type = AXFS_GET_NODE_TYPE(sbi, array_index);
+
+ if (node_type == Compressed) {
+ /* node is in compessed region */
+ cnode_offset = AXFS_GET_CNODE_OFFSET(sbi, node_index);
+ cnode_index = AXFS_GET_CNODE_INDEX(sbi, node_index);
+ down_write(&sbi->lock);
+ if (cnode_index != sbi->current_cnode_index) {
+ /* uncompress only necessary if different cblock */
+ ofs = AXFS_GET_CBLOCK_OFFSET(sbi, cnode_index);
+ len = AXFS_GET_CBLOCK_OFFSET(sbi, cnode_index + 1);
+ len -= ofs;
+ axfs_copy_data(sb, cblk1, &(sbi->compressed), ofs, len);
+ axfs_uncompress_block(cblk0, cblk_size, cblk1, len);
+ sbi->current_cnode_index = cnode_index;
+ }
+ downgrade_write(&sbi->lock);
+ max_len = cblk_size - cnode_offset;
+ len = max_len > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : max_len;
+ src = (void *)((unsigned long)cblk0 + cnode_offset);
+ memcpy(pgdata, src, len);
+ up_read(&sbi->lock);
+ } else if (node_type == Byte_Aligned) {
+ /* node is in BA region */
+ ofs = AXFS_GET_BANODE_OFFSET(sbi, node_index);
+ max_len = sbi->byte_aligned.size - ofs;
+ len = max_len > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : max_len;
+ axfs_copy_data(sb, pgdata, &(sbi->byte_aligned), ofs, len);
+ } else {
+ /* node is XIP */
+ ofs = node_index << PAGE_SHIFT;
+ len = PAGE_CACHE_SIZE;
+ axfs_copy_data(sb, pgdata, &(sbi->xip), ofs, len);
+ }
+
+out:
+ memset(pgdata + len, 0, PAGE_CACHE_SIZE - len);
+ kunmap(page);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ unlock_page(page);
+ return 0;
+}
+
+static int axfs_get_xip_mem(struct address_space *mapping, pgoff_t offset,
+ int create, void **kaddr, unsigned long *pfn)
+{
+ struct inode *inode = mapping->host;
+ struct super_block *sb = inode->i_sb;
+ struct axfs_super *sbi = AXFS_SB(sb);
+ u64 ino_number = inode->i_ino;
+ u64 ino_index, node_index;
+
+ ino_index = AXFS_GET_INODE_ARRAY_INDEX(sbi, ino_number);
+ ino_index += offset;
+
+ node_index = AXFS_GET_NODE_INDEX(sbi, ino_index);
+
+ *kaddr = (void *)(sbi->xip.virt_addr + (node_index << PAGE_SHIFT));
+ if (AXFS_PHYSADDR_IS_VALID(sbi)) {
+ *pfn = (AXFS_GET_XIP_REGION_PHYSADDR(sbi) >> PAGE_SHIFT);
+ *pfn += node_index;
+ } else {
+ *pfn = page_to_pfn(virt_to_page((unsigned long)*kaddr));
+ }
+
+ return 0;
+}
+


2008-08-21 08:37:20

by Carsten Otte

[permalink] [raw]
Subject: Re: [PATCH 04/10] AXFS: axfs_inode.c

Jared Hulbert wrote:
> +/***************** functions in other axfs files ******************************/
> +int axfs_get_sb(struct file_system_type *, int, const char *, void *,
> + struct vfsmount *);
This is neither implemented nor used in axfs_inode.c - why define it here?

> +void axfs_kill_super(struct super_block *);
Same for this one.


> +void axfs_profiling_add(struct axfs_super *, unsigned long, unsigned int);
> +int axfs_copy_mtd(struct super_block *, void *, u64, u64);
> +int axfs_copy_block(struct super_block *, void *, u64, u64);
These are used, but not implemented here. Please consider putting them
in a header file


> +static int axfs_copy_data(struct super_block *sb, void *dst,
> + struct axfs_region_desc *region, u64 offset, u64 len)
> +{
> + u64 mmapped = 0;
> + u64 end = region->fsoffset + offset + len;
> + u64 begin = region->fsoffset + offset;
> + u64 left;
> + void *addr;
> + void *newdst;
> + struct axfs_super *sbi = AXFS_SB(sb);
> +
> + if (len == 0)
> + return 0;
> +
> + if (region->virt_addr) {
> + if (sbi->mmap_size >= end) {
> + mmapped = len;
> + } else if (sbi->mmap_size > begin) {
> + mmapped = sbi->mmap_size - begin;
> + }
> + }
You can save braces and make the code more readable here:
=> if (sbi->mmap_size >= end)
mmapped = len;
else if (sbi->mmap_size > begin)
mmapped = si->mmap_size - begin;


> +struct inode *axfs_create_vfs_inode(struct super_block *sb, int ino)
> +{
> + struct axfs_super *sbi = AXFS_SB(sb);
> + struct inode *inode;
> + u64 size;
[SNIP]
> + size = AXFS_GET_INODE_FILE_SIZE(sbi, ino);
> + inode->i_size = size;
The variable size is not needed. Do
inode->i_size = AXFS_GET_INODE_FILE_SIZE(sbi, ino);


> +static struct dentry *axfs_lookup(struct inode *dir, struct dentry *dentry,
> + struct nameidata *nd)
> +{
> + struct super_block *sb = dir->i_sb;
> + struct axfs_super *sbi = AXFS_SB(sb);
> + u64 ino_number = dir->i_ino;
> + u64 dir_index = 0;
> + u64 entry;
> + char *name;
> + int namelen, err;
> +
> + while (dir_index < AXFS_GET_INODE_NUM_ENTRIES(sbi, ino_number)) {
> + entry = AXFS_GET_INODE_ARRAY_INDEX(sbi, ino_number);
> + entry += dir_index;
> +
> + name = AXFS_GET_INODE_NAME(sbi, entry);
> + namelen = strlen(name);
> +
> + /* fast test, the entries are sorted alphabetically and the
> + * first letter is smaller than the first letter in the search
> + * name then it isn't in this directory. Keeps this loop from
> + * needing to scan through always.
> + */
> + if (dentry->d_name.name[0] < name[0])
> + break;
> +
> + dir_index++;
> +
> + /* Quick check that the name is roughly the right length */
> + if (dentry->d_name.len != namelen)
> + continue;
> +
> + err = memcmp(dentry->d_name.name, name, namelen);
> + if (err > 0)
> + continue;
> +
> + /* The file name isn't present in the directory. */
> + if (err < 0)
> + break;
Very ingenious way to compare strings. strncmp also stops after the
first character if it does'nt fit. I doubt this has a measurable
performance advantage over using strncmp, please consider to replace
this logic to make the code smaller and more readable. See lib/string.c.

> +static int axfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
> +{
> + struct inode *inode = filp->f_dentry->d_inode;
> + struct super_block *sb = inode->i_sb;
> + struct axfs_super *sbi = AXFS_SB(sb);
> + u64 ino_number = inode->i_ino;
> + u64 entry;
> + loff_t dir_index;
> + char *name;
> + int namelen, mode;
> + int err = 0;
> +
> + /* Get the current index into the directory and verify it is not beyond
> + the end of the list */
> + dir_index = filp->f_pos;
> + if (dir_index >= AXFS_GET_INODE_NUM_ENTRIES(sbi, ino_number))
> + goto out;
> +
> + /* Verify the inode is for a directory */
> + if (!(S_ISDIR(inode->i_mode))) {
> + err = -EINVAL;
> + goto out;
> + }
Well, -ENOTDIR would be the correct return code. You can remove that
sanity check alltogether, vfs_readdir makes sure this is the right
file type. If you really want to check, make it
BUG_ON(!S_ISDIR(inode->i_mode));

> +static int axfs_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> +{
> + struct file *file = vma->vm_file;
> + struct inode *inode = file->f_dentry->d_inode;
> + struct super_block *sb = inode->i_sb;
> + struct axfs_super *sbi = AXFS_SB(sb);
> + u64 ino_number = inode->i_ino;
> + u64 array_index;
> +
> + array_index = AXFS_GET_INODE_ARRAY_INDEX(sbi, ino_number) + vmf->pgoff;
> +
> + /* if that pages are marked for write they will probably end up in RAM
> + therefore we don't want their counts for being XIP'd */
> + if (!(vma->vm_flags & VM_WRITE))
> + axfs_profiling_add(sbi, array_index, ino_number);
Thats very inacurate profiling, it does never count MAP_PRIVATE
mappings which is the regular case for executables and libraries. When
booting an enterprise distro, my sniff test shows that only about 5%
of the MAP_PRIVATE mappings get COW'ed. To get correct statistics, it
might be a good idea to find a way to add here and substract during
cow. Or to scan these mappings when the profiling information is being
retrieved - the readonly bit in the pte gives the right indication for
MIXEDMAP mappings.

2008-08-21 11:37:42

by Arnd Bergmann

[permalink] [raw]
Subject: Re: [PATCH 04/10] AXFS: axfs_inode.c

On Thursday 21 August 2008, Jared Hulbert wrote:
> +/***************** functions in other axfs files ******************************/
> +int axfs_get_sb(struct file_system_type *, int, const char *, void *,
> + struct vfsmount *);
> +void axfs_kill_super(struct super_block *);
> +void axfs_profiling_add(struct axfs_super *, unsigned long, unsigned int);
> +int axfs_copy_mtd(struct super_block *, void *, u64, u64);
> +int axfs_copy_block(struct super_block *, void *, u64, u64);

*Never* put extern declarations into a .c file, that's what headers are for.
If you ever change the definition, the compiler doesn't get a chance to
warn you otherwise.

> +/******************************************************************************/
> +static int axfs_readdir(struct file *, void *, filldir_t);
> +static int axfs_mmap(struct file *, struct vm_area_struct *);
> +static ssize_t axfs_file_read(struct file *, char __user *, size_t, loff_t *);
> +static int axfs_readpage(struct file *, struct page *);
> +static int axfs_fault(struct vm_area_struct *, struct vm_fault *);
> +static struct dentry *axfs_lookup(struct inode *, struct dentry *,
> + struct nameidata *);
> +static int axfs_get_xip_mem(struct address_space *, pgoff_t, int, void **,
> + unsigned long *);

For style reasons, also please don't put static forward declarations anywhere,
but define the functions in the right order so you don't need them.

Arnd <><

2008-08-21 12:18:24

by Arnd Bergmann

[permalink] [raw]
Subject: Re: [PATCH 04/10] AXFS: axfs_inode.c

On Thursday 21 August 2008, Jared Hulbert wrote:> + array_index = AXFS_GET_INODE_ARRAY_INDEX(sbi, ino_number);> + array_index += page->index;> +> + node_index = AXFS_GET_NODE_INDEX(sbi, array_index);> + node_type = AXFS_GET_NODE_TYPE(sbi, array_index);> +> +???????if (node_type == Compressed) {> +???????????????/* node is in compessed region */> +???????????????cnode_offset = AXFS_GET_CNODE_OFFSET(sbi, node_index);> +???????????????cnode_index = AXFS_GET_CNODE_INDEX(sbi, node_index);> +???????????????down_write(&sbi->lock);> +???????????????if (cnode_index != sbi->current_cnode_index) {> +???????????????????????/* uncompress only necessary if different cblock */> +???????????????????????ofs = AXFS_GET_CBLOCK_OFFSET(sbi, cnode_index);> +???????????????????????len = AXFS_GET_CBLOCK_OFFSET(sbi, cnode_index + 1);> +???????????????????????len -= ofs;> +???????????????????????axfs_copy_data(sb, cblk1, &(sbi->compressed), ofs, len);> +???????????????????????axfs_uncompress_block(cblk0, cblk_size, cblk1, len);> +???????????????????????sbi->current_cnode_index = cnode_index;> +???????????????}> +???????????????downgrade_write(&sbi->lock);> +???????????????max_len = cblk_size - cnode_offset;> +???????????????len = max_len > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : max_len;> +???????????????src = (void *)((unsigned long)cblk0 + cnode_offset);> +???????????????memcpy(pgdata, src, len);> +???????????????up_read(&sbi->lock);
This looks very nice, but could use some comments about how the data isactually stored on disk. It took me some time to figure out that it actuallyallows to do tail merging into compressed blocks, which I was about to suggestyou implement ;-). Cramfs doesn't have them, and I found that they are themain reason why squashfs compresses better than cramfs, besides the defaultblock size, which you can change on either one.
Have you seen any benefit of the rwsem over a simple mutex? I would guessthat you can never even get into the situation where you get concurrentreaders since I haven't found a single down_read() in your code, onlydowngrade_write().
Arnd <><????{.n?+???????+%?????ݶ??w??{.n?+????{??G?????{ay?ʇڙ?,j??f???h?????????z_??(?階?ݢj"???m??????G????????????&???~???iO???z??v?^?m???? ????????I?

2008-08-21 15:07:04

by Jared Hulbert

[permalink] [raw]
Subject: Re: [PATCH 04/10] AXFS: axfs_inode.c

> Have you seen any benefit of the rwsem over a simple mutex? I would guess
> that you can never even get into the situation where you get concurrent
> readers since I haven't found a single down_read() in your code, only
> downgrade_write()

We implemented a rwsem here because you can get concurrent readers.
My understanding is that downgrade_write() puts the rewem into the
same state as down_read(). Am I mistaken?

2008-08-21 15:14:48

by Arnd Bergmann

[permalink] [raw]
Subject: Re: [PATCH 04/10] AXFS: axfs_inode.c

On Thursday 21 August 2008, Jared Hulbert wrote:
> > Have you seen any benefit of the rwsem over a simple mutex? I would guess
> > that you can never even get into the situation where you get concurrent
> > readers since I haven't found a single down_read() in your code, only
> > downgrade_write()
>
> We implemented a rwsem here because you can get concurrent readers.
> My understanding is that downgrade_write() puts the rewem into the
> same state as down_read().  Am I mistaken?

Your interpretation of downgrade_write is correct, but if every thread
always does

down_write();
serialized_code();
downgrade_write();
parallel_code();
up_read();

Then you still won't have any concurrency, because each thread trying
to down_write() will be blocked until the previous one has done its up_read(),
causing parallel_code() to be serialized as well.

In addition to that, I'd still consider it better to use a simple mutex
if parallel_code() is a much faster operation than serialized_code(), as it
is in your case, where only the memcpy is parallel and that is much slower
than the deflate.

Arnd <><

2008-08-22 00:32:50

by Phillip Lougher

[permalink] [raw]
Subject: Re: [PATCH 04/10] AXFS: axfs_inode.c

Jared Hulbert wrote:

> +
> +static int axfs_iget5_test(struct inode *inode, void *opaque)
> +{
> + u64 *inode_number = (u64 *) opaque;
> +
> + if (inode->i_sb == NULL) {
> + printk(KERN_ERR "axfs_iget5_test:"
> + " the super block is set to null\n");
> + }
> + if (inode->i_ino == *inode_number)
> + return 1; /* matches */
> + else
> + return 0; /* does not match */
> +}
> +

This implies inode_numbers are unique in AXFS? If so you can get rid of
the axfs_iget5_set/test logic. This is only necessary for filesystems
with non-unique inode numbers like cramfs.

> +
> +struct inode *axfs_create_vfs_inode(struct super_block *sb, int ino)
> +{
> + struct axfs_super *sbi = AXFS_SB(sb);
> + struct inode *inode;
> + u64 size;
> +
> + inode = iget5_locked(sb, ino, axfs_iget5_test, axfs_iget5_set, &ino);

If inode_numbers are unique, use iget_locked here.

> +
> + if (!(inode && (inode->i_state & I_NEW)))
> + return inode;
> +
> + inode->i_mode = AXFS_GET_MODE(sbi, ino);
> + inode->i_uid = AXFS_GET_UID(sbi, ino);
> + size = AXFS_GET_INODE_FILE_SIZE(sbi, ino);
> + inode->i_size = size;

What's the reason for splitting this into two lines, rather than

inode->i_size = AXFS_GET_INODE_FILE_SIZE(sbi, ino);


> + inode->i_blocks = AXFS_GET_INODE_NUM_ENTRIES(sbi, ino);
> + inode->i_blkbits = PAGE_CACHE_SIZE * 8;
> + inode->i_gid = AXFS_GET_GID(sbi, ino);
> +
> + inode->i_mtime = inode->i_atime = inode->i_ctime = sbi->timestamp;

No unique per inode time? Will cause problems using AXFS for archives
etc. where preserving timestamps is important.


> + inode->i_ino = ino;
> +

Unnecessary, set by iget_locked/iget_locked5

> +
> +static int axfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
> +{
> + struct inode *inode = filp->f_dentry->d_inode;
> + struct super_block *sb = inode->i_sb;
> + struct axfs_super *sbi = AXFS_SB(sb);
> + u64 ino_number = inode->i_ino;
> + u64 entry;
> + loff_t dir_index;
> + char *name;
> + int namelen, mode;
> + int err = 0;
> +
> + /* Get the current index into the directory and verify it is not beyond
> + the end of the list */
> + dir_index = filp->f_pos;
> + if (dir_index >= AXFS_GET_INODE_NUM_ENTRIES(sbi, ino_number))
> + goto out;
> +
> + /* Verify the inode is for a directory */
> + if (!(S_ISDIR(inode->i_mode))) {
> + err = -EINVAL;
> + goto out;
> + }
> +
> + while (dir_index < AXFS_GET_INODE_NUM_ENTRIES(sbi, ino_number)) {
> + entry = AXFS_GET_INODE_ARRAY_INDEX(sbi, ino_number) + dir_index;
> +
> + name = (char *)AXFS_GET_INODE_NAME(sbi, entry);

One to one mapping between inode number and inode name? No hard link
support...?

> + namelen = strlen(name);
> +
> + mode = (int)AXFS_GET_MODE(sbi, entry);
> + err = filldir(dirent, name, namelen, dir_index, entry, mode);
> +
> + if (err)
> + break;
> +
> + dir_index++;
> + filp->f_pos = dir_index;
> + }
> +
> +out:
> + return 0;
> +}

Are "." and ".." stored in the directory? If not then axfs_readdir
should fabricate them to avoid confusing applications that expect
readdir(3) to return them.


> +static ssize_t axfs_file_read(struct file *filp, char __user *buf, size_t len,
> + loff_t *ppos)

> + actual_size = len > remaining ? remaining : len;
> + readlength = actual_size < PAGE_SIZE ? actual_size : PAGE_SIZE;

Use min() or min_t()

> +
> +static int axfs_readpage(struct file *file, struct page *page)
> +{
> +
> + if (node_type == Compressed) {
> + /* node is in compessed region */
> + cnode_offset = AXFS_GET_CNODE_OFFSET(sbi, node_index);
> + cnode_index = AXFS_GET_CNODE_INDEX(sbi, node_index);
> + down_write(&sbi->lock);
> + if (cnode_index != sbi->current_cnode_index) {
> + /* uncompress only necessary if different cblock */
> + ofs = AXFS_GET_CBLOCK_OFFSET(sbi, cnode_index);
> + len = AXFS_GET_CBLOCK_OFFSET(sbi, cnode_index + 1);
> + len -= ofs;
> + axfs_copy_data(sb, cblk1, &(sbi->compressed), ofs, len);
> + axfs_uncompress_block(cblk0, cblk_size, cblk1, len);
> + sbi->current_cnode_index = cnode_index;

I assume compressed blocks can be larger than PAGE_CACHE_SIZE? This
suffers from the rather obvious inefficiency that you decompress a big
block > PAGE_CACHE_SIZE, but only copy one PAGE_CACHE_SIZE page out of
it. If multiple files are being read simultaneously (a common
occurrence), then each is going to replace your one cached uncompressed
block (sbi->current_cnode_index), leading to decompressing the same
blocks over and over again on sequential file access.

readpage file A, index 1 -> decompress block X
readpage file B, index 1 -> decompress block Y (replaces X)
readpage file A, index 2 -> repeated decompress of block X (replaces Y)
readpage file B, index 2 -> repeated decompress of block Y (replaces X)

and so on.

> + }
> + downgrade_write(&sbi->lock);
> + max_len = cblk_size - cnode_offset;
> + len = max_len > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : max_len;

Again, min() or min_t(). Lots of these.

Phillip

2008-08-22 02:22:50

by Phillip Lougher

[permalink] [raw]
Subject: Re: [PATCH 04/10] AXFS: axfs_inode.c

Arnd Bergmann wrote:
> On Thursday 21 August 2008, Jared Hulbert wrote:
>> + array_index = AXFS_GET_INODE_ARRAY_INDEX(sbi, ino_number);
>> + array_index += page->index;
>> +
>> + node_index = AXFS_GET_NODE_INDEX(sbi, array_index);
>> + node_type = AXFS_GET_NODE_TYPE(sbi, array_index);
>> +
>> + if (node_type == Compressed) {
>> + /* node is in compessed region */
>> + cnode_offset = AXFS_GET_CNODE_OFFSET(sbi, node_index);
>> + cnode_index = AXFS_GET_CNODE_INDEX(sbi, node_index);
>> + down_write(&sbi->lock);
>> + if (cnode_index != sbi->current_cnode_index) {
>> + /* uncompress only necessary if different cblock */
>> + ofs = AXFS_GET_CBLOCK_OFFSET(sbi, cnode_index);
>> + len = AXFS_GET_CBLOCK_OFFSET(sbi, cnode_index + 1);
>> + len -= ofs;
>> + axfs_copy_data(sb, cblk1, &(sbi->compressed), ofs, len);
>> + axfs_uncompress_block(cblk0, cblk_size, cblk1, len);
>> + sbi->current_cnode_index = cnode_index;
>> + }
>> + downgrade_write(&sbi->lock);
>> + max_len = cblk_size - cnode_offset;
>> + len = max_len > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : max_len;
>> + src = (void *)((unsigned long)cblk0 + cnode_offset);
>> + memcpy(pgdata, src, len);
>> + up_read(&sbi->lock);
>
> This looks very nice, but could use some comments about how the data is
> actually stored on disk. It took me some time to figure out that it actually
> allows to do tail merging into compressed blocks, which I was about to suggest
> you implement ;-). Cramfs doesn't have them, and I found that they are the
> main reason why squashfs compresses better than cramfs, besides the default
> block size, which you can change on either one.

Squashfs has much larger block sizes than cramfs (last time I looked it
was limited to 4K blocks), and it compresses the metadata which helps to
get better compression. But tail merging (fragments in Squashfs
terminology) is obviously a major reason why Squashfs gets good compression.

The AXFS code is rather obscure but it doesn't look to me that it does
tail merging. The following code wouldn't work if the block in question
was a tail contained in a larger block. It assumes the block extends to
the end of the compressed block (cblk_size - cnode_offset).

>> + max_len = cblk_size - cnode_offset;
>> + len = max_len > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE :
max_len;
>> + src = (void *)((unsigned long)cblk0 + cnode_offset);
>> + memcpy(pgdata, src, len);

Perhaps the AXFS authors could clarify this?

Phillip

2008-08-22 03:24:15

by Jared Hulbert

[permalink] [raw]
Subject: Re: [PATCH 04/10] AXFS: axfs_inode.c

> Squashfs has much larger block sizes than cramfs (last time I looked it was
> limited to 4K blocks), and it compresses the metadata which helps to get
> better compression. But tail merging (fragments in Squashfs terminology) is
> obviously a major reason why Squashfs gets good compression.
>
> The AXFS code is rather obscure but it doesn't look to me that it does tail
> merging. The following code wouldn't work if the block in question was a
> tail contained in a larger block. It assumes the block extends to the end
> of the compressed block (cblk_size - cnode_offset).

A c_block is the unit that gets compressed. It can contain multiple
c_nodes. The c_block can be PAGE_SIZE to 4GB in size, in theory :)
The c_nodes can be 1B to PAGE_SIZE. in any alignment. I pack many
tails as c_nodes in a c_block.

>>> + max_len = cblk_size - cnode_offset;
>>> + len = max_len > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE :
>>> max_len;
>>> + src = (void *)((unsigned long)cblk0 + cnode_offset);
>>> + memcpy(pgdata, src, len);
>
> Perhaps the AXFS authors could clarify this?

The memcpy in question copies a c_node to the page. The len is either
the max length of a c_node and size of the buffer I'm copying to
(PAGE_CACHE_SIZE) or it is the difference between the beginning of the
c_node in the c_block and the end of the c_block, whichever is
smaller. The confusion is probably because of the fact that this
copies extra crap to the page for tails.

2008-08-22 03:28:04

by Jared Hulbert

[permalink] [raw]
Subject: Re: [PATCH 04/10] AXFS: axfs_inode.c

> I assume compressed blocks can be larger than PAGE_CACHE_SIZE? This suffers
> from the rather obvious inefficiency that you decompress a big block >
> PAGE_CACHE_SIZE, but only copy one PAGE_CACHE_SIZE page out of it. If
> multiple files are being read simultaneously (a common occurrence), then
> each is going to replace your one cached uncompressed block
> (sbi->current_cnode_index), leading to decompressing the same blocks over
> and over again on sequential file access.
>
> readpage file A, index 1 -> decompress block X
> readpage file B, index 1 -> decompress block Y (replaces X)
> readpage file A, index 2 -> repeated decompress of block X (replaces Y)
> readpage file B, index 2 -> repeated decompress of block Y (replaces X)
>
> and so on.

Yep. Been thinking about optimizing it. So far it hasn't been an
issue for my customers. Most fs traffic being on the XIP pages. Once
I get a good automated performance test up we'll probably look into
something to improve this.

2008-08-22 03:29:53

by Phillip Lougher

[permalink] [raw]
Subject: Re: [PATCH 04/10] AXFS: axfs_inode.c

Jared Hulbert wrote:

>
> The memcpy in question copies a c_node to the page. The len is either
> the max length of a c_node and size of the buffer I'm copying to
> (PAGE_CACHE_SIZE) or it is the difference between the beginning of the
> c_node in the c_block and the end of the c_block, whichever is
> smaller. The confusion is probably because of the fact that this
> copies extra crap to the page for tails.


Ah yes, that's where I got confused :) Glad to see AXFS uses tail packing.


Phillip

2008-08-22 03:46:50

by Phillip Lougher

[permalink] [raw]
Subject: Re: [PATCH 04/10] AXFS: axfs_inode.c

Jared Hulbert wrote:
>> I assume compressed blocks can be larger than PAGE_CACHE_SIZE? This suffers
>> from the rather obvious inefficiency that you decompress a big block >
>> PAGE_CACHE_SIZE, but only copy one PAGE_CACHE_SIZE page out of it. If
>> multiple files are being read simultaneously (a common occurrence), then
>> each is going to replace your one cached uncompressed block
>> (sbi->current_cnode_index), leading to decompressing the same blocks over
>> and over again on sequential file access.
>>
>> readpage file A, index 1 -> decompress block X
>> readpage file B, index 1 -> decompress block Y (replaces X)
>> readpage file A, index 2 -> repeated decompress of block X (replaces Y)
>> readpage file B, index 2 -> repeated decompress of block Y (replaces X)
>>
>> and so on.
>
> Yep. Been thinking about optimizing it. So far it hasn't been an
> issue for my customers. Most fs traffic being on the XIP pages. Once
> I get a good automated performance test up we'll probably look into
> something to improve this.

It's relatively easy to solve. Squashfs explicitly pushes the extra
pages into the pagecache (so subsequent readpages find them there and
don't call readpage on squashfs again).

Phillip

2008-08-22 10:00:43

by Arnd Bergmann

[permalink] [raw]
Subject: Re: [PATCH 04/10] AXFS: axfs_inode.c

On Friday 22 August 2008, Phillip Lougher wrote:
> >
> > This looks very nice, but could use some comments about how the data is
> > actually stored on disk. It took me some time to figure out that it actually
> > allows to do tail merging into compressed blocks, which I was about to suggest
> > you implement ;-). Cramfs doesn't have them, and I found that they are the
> > main reason why squashfs compresses better than cramfs, besides the default
> > block size, which you can change on either one.
>
> Squashfs has much larger block sizes than cramfs (last time I looked it
> was limited to 4K blocks), and it compresses the metadata which helps to
> get better compression. ?But tail merging (fragments in Squashfs
> terminology) is obviously a major reason why Squashfs gets good compression.

The *default* block size in cramfs is smaller than in squashfs, but they both
have user selectable block sizes. I found the impact of compressed metadata
to be almost zero. I hacked up a mksquashfs to avoid tail merging, and found
that the image size for squashfs and cramfs is practically identical if you
use the same block size and no tail merging.

> The AXFS code is rather obscure but it doesn't look to me that it does
> tail merging. ?The following code wouldn't work if the block in question
> was a tail contained in a larger block. ?It assumes the block extends to
> the end of the compressed block (cblk_size - cnode_offset).

yes, I thought the same thing when I first read that code, and was about
to send a lengthy reply about how it should be changed when I saw that
it already does exactly that ;-).

Arnd <><

2008-08-22 17:08:53

by Phillip Lougher

[permalink] [raw]
Subject: Re: [PATCH 04/10] AXFS: axfs_inode.c

Arnd Bergmann wrote:
> On Friday 22 August 2008, Phillip Lougher wrote:
>>> This looks very nice, but could use some comments about how the data is
>>> actually stored on disk. It took me some time to figure out that it actually
>>> allows to do tail merging into compressed blocks, which I was about to suggest
>>> you implement ;-). Cramfs doesn't have them, and I found that they are the
>>> main reason why squashfs compresses better than cramfs, besides the default
>>> block size, which you can change on either one.
>> Squashfs has much larger block sizes than cramfs (last time I looked it
>> was limited to 4K blocks), and it compresses the metadata which helps to
>> get better compression. But tail merging (fragments in Squashfs
>> terminology) is obviously a major reason why Squashfs gets good compression.
>
> The *default* block size in cramfs is smaller than in squashfs, but they both
> have user selectable block sizes. I found the impact of compressed metadata
> to be almost zero.

Squashfs stores significantly more metadata than cramfs. Remember
cramfs has no support for filesystems > ~ 16Mbytes, no inode timestamps,
truncates uid/gids, no hard-links, no nlink counts, no hashed
directories, no unique inode numbers. If Squashfs didn't compress the
metadata it would be significantly larger than cramfs.

Cheers

Phillip

2008-08-22 17:20:32

by Jörn Engel

[permalink] [raw]
Subject: Re: [PATCH 04/10] AXFS: axfs_inode.c

On Fri, 22 August 2008 18:08:51 +0100, Phillip Lougher wrote:
>
> Squashfs stores significantly more metadata than cramfs. Remember
> cramfs has no support for filesystems > ~ 16Mbytes, no inode timestamps,
> truncates uid/gids, no hard-links, no nlink counts, no hashed
> directories, no unique inode numbers. If Squashfs didn't compress the
> metadata it would be significantly larger than cramfs.

Elsewhere in this maze of threads Arnd claimed to have tested the
benefits of metadata compression - and it making little impact.

My guess is that it would make a large impact if metadata would be a
significant part of the filesystem image. Usually metadata is close
enough to 0% to be mistaken for statistical noise. So compressing it
makes a significant impact on an insignificant amount of data.

Jörn

--
One of my most productive days was throwing away 1000 lines of code.
-- Ken Thompson.

2008-08-22 18:04:29

by Jared Hulbert

[permalink] [raw]
Subject: Re: [PATCH 04/10] AXFS: axfs_inode.c

> Elsewhere in this maze of threads Arnd claimed to have tested the
> benefits of metadata compression - and it making little impact.
>
> My guess is that it would make a large impact if metadata would be a
> significant part of the filesystem image. Usually metadata is close
> enough to 0% to be mistaken for statistical noise. So compressing it
> makes a significant impact on an insignificant amount of data.

Like I said early it depends on the value you assign to significant.
For a fs sizes I started designing to 16MB-64MB if you have to track a
whole bunch of 8Byte numbers for every symlink, dev node, inode, and
page. It adds up quickly to a couple of MB, that can translate to
lots of money. Even at only an extra $0.25 a system * 12Million units
= $3Million.