2008-06-09 08:46:34

by Maxim Shchetynin

[permalink] [raw]
Subject: AZFS file system proposal

Hello,

there are some users which have interest on such kind of file system like azfs. Please, have a look at this version of a diff file which introduces a first version of azfs to 2.6.26. This file system may be useful for example on IBM CellBlades where user can mount DDR2 memory of Axon controller as a disk and to be able to access it directly without any caching mechanism in between.

Subject: azfs: initial submit of azfs, a non-buffered filesystem

From: Maxim Shchetynin <[email protected]>

Non-buffered filesystem for block devices with a gendisk and
with direct_access() method in gendisk->fops.
AZFS does not buffer outgoing traffic and is doing no read ahead.
It supports mount options (given with -o) bs=x,uid=x,gid=x.
If block-size (bs) is not specified AZFS uses block-size used
by block device. Though mmap() method is available only if
block-size equals to or is greater than the system page size.

Signed-off-by: Maxim Shchetynin <[email protected]>

diff -Nuar linux-2.6.26-rc5/arch/powerpc/configs/cell_defconfig linux-2.6.26-rc5-azfs/arch/powerpc/configs/cell_defconfig
--- linux-2.6.26-rc5/arch/powerpc/configs/cell_defconfig 2008-06-05 05:10:44.000000000 +0200
+++ linux-2.6.26-rc5-azfs/arch/powerpc/configs/cell_defconfig 2008-06-06 11:53:34.000000000 +0200
@@ -240,6 +240,7 @@
# CPU Frequency drivers
#
CONFIG_AXON_RAM=m
+CONFIG_AZ_FS=m
# CONFIG_FSL_ULI1575 is not set

#
diff -Nuar linux-2.6.26-rc5/fs/Kconfig linux-2.6.26-rc5-azfs/fs/Kconfig
--- linux-2.6.26-rc5/fs/Kconfig 2008-06-05 05:10:44.000000000 +0200
+++ linux-2.6.26-rc5-azfs/fs/Kconfig 2008-06-06 16:55:11.616419992 +0200
@@ -360,6 +360,17 @@
If you are not using a security module that requires using
extended attributes for file security labels, say N.

+config AZ_FS
+ tristate "AZFS filesystem support"
+ help
+ Non-buffered filesystem for block devices with a gendisk and
+ with direct_access() method in gendisk->fops.
+ AZFS does not buffer outgoing traffic and is doing no read ahead.
+ It supports mount options (given with -o) bs=x,uid=x,gid=x.
+ If block-size (bs) is not specified AZFS uses block-size used
+ by block device. Though mmap() method is available only if
+ block-size equals to or is greater than system page size.
+
config JFS_FS
tristate "JFS filesystem support"
select NLS
diff -Nuar linux-2.6.26-rc5/fs/Makefile linux-2.6.26-rc5-azfs/fs/Makefile
--- linux-2.6.26-rc5/fs/Makefile 2008-06-05 05:10:44.000000000 +0200
+++ linux-2.6.26-rc5-azfs/fs/Makefile 2008-06-06 11:53:34.000000000 +0200
@@ -119,3 +119,4 @@
obj-$(CONFIG_DEBUG_FS) += debugfs/
obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_GFS2_FS) += gfs2/
+obj-$(CONFIG_AZ_FS) += azfs.o
diff -Nuar linux-2.6.26-rc5/fs/azfs.c linux-2.6.26-rc5-azfs/fs/azfs.c
--- linux-2.6.26-rc5/fs/azfs.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.26-rc5-azfs/fs/azfs.c 2008-06-06 17:46:23.587653053 +0200
@@ -0,0 +1,1179 @@
+/*
+ * (C) Copyright IBM Deutschland Entwicklung GmbH 2007
+ *
+ * Author: Maxim Shchetynin <[email protected]>
+ *
+ * Non-buffered filesystem driver.
+ * It registers a filesystem which may be used for all kind of block devices
+ * which have a direct_access() method in block_device_operations.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
+#include <linux/cache.h>
+#include <linux/dcache.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/genhd.h>
+#include <linux/kernel.h>
+#include <linux/limits.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/mm.h>
+#include <linux/mm_types.h>
+#include <linux/mutex.h>
+#include <linux/namei.h>
+#include <linux/pagemap.h>
+#include <linux/parser.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/stat.h>
+#include <linux/statfs.h>
+#include <linux/string.h>
+#include <linux/time.h>
+#include <linux/types.h>
+#include <linux/aio.h>
+#include <linux/uio.h>
+#include <asm/bug.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/string.h>
+
+#define AZFS_FILESYSTEM_NAME "azfs"
+#define AZFS_FILESYSTEM_FLAGS FS_REQUIRES_DEV
+
+#define AZFS_SUPERBLOCK_MAGIC 0xABBA1972
+#define AZFS_SUPERBLOCK_FLAGS MS_NOEXEC | \
+ MS_SYNCHRONOUS | \
+ MS_DIRSYNC | \
+ MS_ACTIVE
+
+#define AZFS_BDI_CAPABILITIES BDI_CAP_NO_ACCT_DIRTY | \
+ BDI_CAP_NO_WRITEBACK | \
+ BDI_CAP_MAP_COPY | \
+ BDI_CAP_MAP_DIRECT | \
+ BDI_CAP_VMFLAGS
+
+#define AZFS_CACHE_FLAGS SLAB_HWCACHE_ALIGN | \
+ SLAB_RECLAIM_ACCOUNT | \
+ SLAB_MEM_SPREAD
+
+enum azfs_direction {
+ AZFS_MMAP,
+ AZFS_READ,
+ AZFS_WRITE
+};
+
+struct azfs_super {
+ struct list_head list;
+ unsigned long media_size;
+ unsigned long block_size;
+ unsigned short block_shift;
+ unsigned long sector_size;
+ unsigned short sector_shift;
+ uid_t uid;
+ gid_t gid;
+ unsigned long ph_addr;
+ unsigned long io_addr;
+ struct block_device *blkdev;
+ struct dentry *root;
+ struct list_head block_list;
+ rwlock_t lock;
+};
+
+struct azfs_super_list {
+ struct list_head head;
+ spinlock_t lock;
+};
+
+struct azfs_block {
+ struct list_head list;
+ unsigned long id;
+ unsigned long count;
+};
+
+struct azfs_znode {
+ struct list_head block_list;
+ rwlock_t lock;
+ loff_t size;
+ struct inode vfs_inode;
+};
+
+static struct azfs_super_list super_list;
+static struct kmem_cache *azfs_znode_cache __read_mostly = NULL;
+static struct kmem_cache *azfs_block_cache __read_mostly = NULL;
+
+#define I2Z(inode) \
+ container_of(inode, struct azfs_znode, vfs_inode)
+
+#define for_each_block(block, block_list) \
+ list_for_each_entry(block, block_list, list)
+#define for_each_block_reverse(block, block_list) \
+ list_for_each_entry_reverse(block, block_list, list)
+#define for_each_block_safe(block, ding, block_list) \
+ list_for_each_entry_safe(block, ding, block_list, list)
+#define for_each_block_safe_reverse(block, ding, block_list) \
+ list_for_each_entry_safe_reverse(block, ding, block_list, list)
+
+/**
+ * azfs_block_init - create and initialise a new block in a list
+ * @block_list: destination list
+ * @id: block id
+ * @count: size of a block
+ */
+static inline struct azfs_block*
+azfs_block_init(struct list_head *block_list,
+ unsigned long id, unsigned long count)
+{
+ struct azfs_block *block;
+
+ block = kmem_cache_alloc(azfs_block_cache, GFP_KERNEL);
+ if (!block)
+ return NULL;
+
+ block->id = id;
+ block->count = count;
+
+ INIT_LIST_HEAD(&block->list);
+ list_add_tail(&block->list, block_list);
+
+ return block;
+}
+
+/**
+ * azfs_block_free - remove block from a list and free it back in cache
+ * @block: block to be removed
+ */
+static inline void
+azfs_block_free(struct azfs_block *block)
+{
+ list_del(&block->list);
+ kmem_cache_free(azfs_block_cache, block);
+}
+
+/**
+ * azfs_block_move - move block to another list
+ * @block: block to be moved
+ * @block_list: destination list
+ */
+static inline void
+azfs_block_move(struct azfs_block *block, struct list_head *block_list)
+{
+ list_move_tail(&block->list, block_list);
+}
+
+/**
+ * azfs_recherche - get real address of a part of a file
+ * @inode: inode
+ * @direction: data direction
+ * @from: offset for read/write operation
+ * @size: pointer to a value of the amount of data to be read/written
+ */
+static unsigned long
+azfs_recherche(struct inode *inode, enum azfs_direction direction,
+ unsigned long from, unsigned long *size)
+{
+ struct azfs_super *super;
+ struct azfs_znode *znode;
+ struct azfs_block *block;
+ unsigned long block_id, west, east;
+
+ super = inode->i_sb->s_fs_info;
+ znode = I2Z(inode);
+
+ if (from + *size > znode->size) {
+ i_size_write(inode, from + *size);
+ inode->i_op->truncate(inode);
+ }
+
+ read_lock(&znode->lock);
+
+ if (list_empty(&znode->block_list)) {
+ read_unlock(&znode->lock);
+ return 0;
+ }
+
+ block_id = from >> super->block_shift;
+
+ for_each_block(block, &znode->block_list) {
+ if (block->count > block_id)
+ break;
+ block_id -= block->count;
+ }
+
+ west = from % super->block_size;
+ east = ((block->count - block_id) << super->block_shift) - west;
+
+ if (*size > east)
+ *size = east;
+
+ block_id = ((block->id + block_id) << super->block_shift) + west;
+
+ read_unlock(&znode->lock);
+
+ block_id += direction == AZFS_MMAP ? super->ph_addr : super->io_addr;
+
+ return block_id;
+}
+
+static struct inode*
+azfs_new_inode(struct super_block *, struct inode *, int, dev_t);
+
+/**
+ * azfs_mknod - mknod() method for inode_operations
+ * @dir, @dentry, @mode, @dev: see inode_operations methods
+ */
+static int
+azfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
+{
+ struct inode *inode;
+
+ inode = azfs_new_inode(dir->i_sb, dir, mode, dev);
+ if (!inode)
+ return -ENOSPC;
+
+ if (S_ISREG(mode))
+ I2Z(inode)->size = 0;
+
+ dget(dentry);
+ d_instantiate(dentry, inode);
+
+ return 0;
+}
+
+/**
+ * azfs_create - create() method for inode_operations
+ * @dir, @dentry, @mode, @nd: see inode_operations methods
+ */
+static int
+azfs_create(struct inode *dir, struct dentry *dentry, int mode,
+ struct nameidata *nd)
+{
+ return azfs_mknod(dir, dentry, mode | S_IFREG, 0);
+}
+
+/**
+ * azfs_mkdir - mkdir() method for inode_operations
+ * @dir, @dentry, @mode: see inode_operations methods
+ */
+static int
+azfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+ int rc;
+
+ rc = azfs_mknod(dir, dentry, mode | S_IFDIR, 0);
+ if (rc == 0)
+ inc_nlink(dir);
+
+ return rc;
+}
+
+/**
+ * azfs_symlink - symlink() method for inode_operations
+ * @dir, @dentry, @name: see inode_operations methods
+ */
+static int
+azfs_symlink(struct inode *dir, struct dentry *dentry, const char *name)
+{
+ struct inode *inode;
+ int rc;
+
+ inode = azfs_new_inode(dir->i_sb, dir, S_IFLNK | S_IRWXUGO, 0);
+ if (!inode)
+ return -ENOSPC;
+
+ rc = page_symlink(inode, name, strlen(name) + 1);
+ if (rc) {
+ iput(inode);
+ return rc;
+ }
+
+ dget(dentry);
+ d_instantiate(dentry, inode);
+
+ return 0;
+}
+
+/**
+ * azfs_aio_read - aio_read() method for file_operations
+ * @iocb, @iov, @nr_segs, @pos: see file_operations methods
+ */
+static ssize_t
+azfs_aio_read(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct inode *inode;
+ void *ziel;
+ unsigned long pin;
+ unsigned long size, todo, step;
+ ssize_t rc;
+
+ inode = iocb->ki_filp->f_mapping->host;
+
+ mutex_lock(&inode->i_mutex);
+
+ if (pos >= i_size_read(inode)) {
+ rc = 0;
+ goto out;
+ }
+
+ ziel = iov->iov_base;
+ todo = min((loff_t) iov->iov_len, i_size_read(inode) - pos);
+
+ for (step = todo; step; step -= size) {
+ size = step;
+ pin = azfs_recherche(inode, AZFS_READ, pos, &size);
+ if (!pin) {
+ rc = -ENOSPC;
+ goto out;
+ }
+ if (copy_to_user(ziel, (void*) pin, size)) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ iocb->ki_pos += size;
+ pos += size;
+ ziel += size;
+ }
+
+ rc = todo;
+
+out:
+ mutex_unlock(&inode->i_mutex);
+
+ return rc;
+}
+
+/**
+ * azfs_aio_write - aio_write() method for file_operations
+ * @iocb, @iov, @nr_segs, @pos: see file_operations methods
+ */
+static ssize_t
+azfs_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct inode *inode;
+ void *quell;
+ unsigned long pin;
+ unsigned long size, todo, step;
+ ssize_t rc;
+
+ inode = iocb->ki_filp->f_mapping->host;
+
+ quell = iov->iov_base;
+ todo = iov->iov_len;
+
+ mutex_lock(&inode->i_mutex);
+
+ for (step = todo; step; step -= size) {
+ size = step;
+ pin = azfs_recherche(inode, AZFS_WRITE, pos, &size);
+ if (!pin) {
+ rc = -ENOSPC;
+ goto out;
+ }
+ if (copy_from_user((void*) pin, quell, size)) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ iocb->ki_pos += size;
+ pos += size;
+ quell += size;
+ }
+
+ rc = todo;
+
+out:
+ mutex_unlock(&inode->i_mutex);
+
+ return rc;
+}
+
+/**
+ * azfs_open - open() method for file_operations
+ * @inode, @file: see file_operations methods
+ */
+static int
+azfs_open(struct inode *inode, struct file *file)
+{
+ file->private_data = inode;
+
+ if (file->f_flags & O_TRUNC) {
+ i_size_write(inode, 0);
+ inode->i_op->truncate(inode);
+ }
+ if (file->f_flags & O_APPEND)
+ inode->i_fop->llseek(file, 0, SEEK_END);
+
+ return 0;
+}
+
+/**
+ * azfs_mmap - mmap() method for file_operations
+ * @file, @vm: see file_operations methods
+ */
+static int
+azfs_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct azfs_super *super;
+ struct azfs_znode *znode;
+ struct inode *inode;
+ unsigned long cursor, pin;
+ unsigned long todo, size, vm_start;
+ pgprot_t page_prot;
+
+ inode = file->private_data;
+ znode = I2Z(inode);
+ super = inode->i_sb->s_fs_info;
+
+ if (super->block_size < PAGE_SIZE)
+ return -EINVAL;
+
+ cursor = vma->vm_pgoff << super->block_shift;
+ todo = vma->vm_end - vma->vm_start;
+
+ if (cursor + todo > i_size_read(inode))
+ return -EINVAL;
+
+ page_prot = pgprot_val(vma->vm_page_prot);
+ page_prot |= (_PAGE_NO_CACHE | _PAGE_RW);
+ page_prot &= ~_PAGE_GUARDED;
+ vma->vm_page_prot = __pgprot(page_prot);
+
+ vm_start = vma->vm_start;
+ for (size = todo; todo; todo -= size, size = todo) {
+ pin = azfs_recherche(inode, AZFS_MMAP, cursor, &size);
+ if (!pin)
+ return -EAGAIN;
+ pin >>= PAGE_SHIFT;
+ if (remap_pfn_range(vma, vm_start, pin, size, vma->vm_page_prot))
+ return -EAGAIN;
+
+ vm_start += size;
+ cursor += size;
+ }
+
+ return 0;
+}
+
+/**
+ * azfs_truncate - truncate() method for inode_operations
+ * @inode: see inode_operations methods
+ */
+static void
+azfs_truncate(struct inode *inode)
+{
+ struct azfs_super *super;
+ struct azfs_znode *znode;
+ struct azfs_block *block, *ding, *knoten, *west, *east;
+ unsigned long id, count;
+ signed long delta;
+
+ super = inode->i_sb->s_fs_info;
+ znode = I2Z(inode);
+
+ delta = i_size_read(inode) + (super->block_size - 1);
+ delta >>= super->block_shift;
+ delta -= inode->i_blocks;
+
+ if (delta == 0) {
+ znode->size = i_size_read(inode);
+ return;
+ }
+
+ write_lock(&znode->lock);
+
+ while (delta > 0) {
+ west = east = NULL;
+
+ write_lock(&super->lock);
+
+ if (list_empty(&super->block_list)) {
+ write_unlock(&super->lock);
+ break;
+ }
+
+ for (count = delta; count; count--) {
+ for_each_block(block, &super->block_list)
+ if (block->count >= count) {
+ east = block;
+ break;
+ }
+ if (east)
+ break;
+ }
+
+ for_each_block_reverse(block, &znode->block_list) {
+ if (block->id + block->count == east->id)
+ west = block;
+ break;
+ }
+
+ if (east->count == count) {
+ if (west) {
+ west->count += east->count;
+ azfs_block_free(east);
+ } else {
+ azfs_block_move(east, &znode->block_list);
+ }
+ } else {
+ if (west) {
+ west->count += count;
+ } else {
+ if (!azfs_block_init(&znode->block_list,
+ east->id, count)) {
+ write_unlock(&super->lock);
+ break;
+ }
+ }
+
+ east->id += count;
+ east->count -= count;
+ }
+
+ write_unlock(&super->lock);
+
+ inode->i_blocks += count;
+
+ delta -= count;
+ }
+
+ while (delta < 0) {
+ for_each_block_safe_reverse(block, knoten, &znode->block_list) {
+ id = block->id;
+ count = block->count;
+ if ((signed long) count + delta > 0) {
+ block->count += delta;
+ id += block->count;
+ count -= block->count;
+ block = NULL;
+ }
+
+ west = east = NULL;
+
+ write_lock(&super->lock);
+
+ for_each_block(ding, &super->block_list) {
+ if (!west && (ding->id + ding->count == id))
+ west = ding;
+ else if (!east && (id + count == ding->id))
+ east = ding;
+ if (west && east)
+ break;
+ }
+
+ if (west && east) {
+ west->count += count + east->count;
+ azfs_block_free(east);
+ if (block)
+ azfs_block_free(block);
+ } else if (west) {
+ west->count += count;
+ if (block)
+ azfs_block_free(block);
+ } else if (east) {
+ east->id -= count;
+ east->count += count;
+ if (block)
+ azfs_block_free(block);
+ } else {
+ if (!block) {
+ if (!azfs_block_init(&super->block_list,
+ id, count)) {
+ write_unlock(&super->lock);
+ break;
+ }
+ } else {
+ azfs_block_move(block, &super->block_list);
+ }
+ }
+
+ write_unlock(&super->lock);
+
+ inode->i_blocks -= count;
+
+ delta += count;
+
+ break;
+ }
+ }
+
+ write_unlock(&znode->lock);
+
+ znode->size = min(i_size_read(inode),
+ (loff_t) inode->i_blocks << super->block_shift);
+}
+
+/**
+ * azfs_getattr - getattr() method for inode_operations
+ * @mnt, @dentry, @stat: see inode_operations methods
+ */
+static int
+azfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+ struct azfs_super *super;
+ struct inode *inode;
+ unsigned short shift;
+
+ inode = dentry->d_inode;
+ super = inode->i_sb->s_fs_info;
+
+ generic_fillattr(inode, stat);
+ stat->blocks = inode->i_blocks;
+ shift = super->block_shift - super->sector_shift;
+ if (shift)
+ stat->blocks <<= shift;
+
+ return 0;
+}
+
+static const struct address_space_operations azfs_aops = {
+ .write_begin = simple_write_begin,
+ .write_end = simple_write_end
+};
+
+static struct backing_dev_info azfs_bdi = {
+ .ra_pages = 0,
+ .capabilities = AZFS_BDI_CAPABILITIES
+};
+
+static struct inode_operations azfs_dir_iops = {
+ .create = azfs_create,
+ .lookup = simple_lookup,
+ .link = simple_link,
+ .unlink = simple_unlink,
+ .symlink = azfs_symlink,
+ .mkdir = azfs_mkdir,
+ .rmdir = simple_rmdir,
+ .mknod = azfs_mknod,
+ .rename = simple_rename
+};
+
+static const struct file_operations azfs_reg_fops = {
+ .llseek = generic_file_llseek,
+ .aio_read = azfs_aio_read,
+ .aio_write = azfs_aio_write,
+ .open = azfs_open,
+ .mmap = azfs_mmap,
+ .fsync = simple_sync_file,
+};
+
+static struct inode_operations azfs_reg_iops = {
+ .truncate = azfs_truncate,
+ .getattr = azfs_getattr
+};
+
+/**
+ * azfs_new_inode - cook a new inode
+ * @sb: super-block
+ * @dir: parent directory
+ * @mode: file mode
+ * @dev: to be forwarded to init_special_inode()
+ */
+static struct inode*
+azfs_new_inode(struct super_block *sb, struct inode *dir, int mode, dev_t dev)
+{
+ struct azfs_super *super;
+ struct inode *inode;
+
+ inode = new_inode(sb);
+ if (!inode)
+ return NULL;
+
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+ inode->i_mode = mode;
+ if (dir) {
+ dir->i_mtime = dir->i_ctime = inode->i_mtime;
+ inode->i_uid = current->fsuid;
+ if (dir->i_mode & S_ISGID) {
+ if (S_ISDIR(mode))
+ inode->i_mode |= S_ISGID;
+ inode->i_gid = dir->i_gid;
+ } else {
+ inode->i_gid = current->fsgid;
+ }
+ } else {
+ super = sb->s_fs_info;
+ inode->i_uid = super->uid;
+ inode->i_gid = super->gid;
+ }
+
+ inode->i_blocks = 0;
+ inode->i_mapping->a_ops = &azfs_aops;
+ inode->i_mapping->backing_dev_info = &azfs_bdi;
+
+ switch (mode & S_IFMT) {
+ case S_IFDIR:
+ inode->i_op = &azfs_dir_iops;
+ inode->i_fop = &simple_dir_operations;
+ inc_nlink(inode);
+ break;
+
+ case S_IFREG:
+ inode->i_op = &azfs_reg_iops;
+ inode->i_fop = &azfs_reg_fops;
+ break;
+
+ case S_IFLNK:
+ inode->i_op = &page_symlink_inode_operations;
+ break;
+
+ default:
+ init_special_inode(inode, mode, dev);
+ break;
+ }
+
+ return inode;
+}
+
+/**
+ * azfs_alloc_inode - alloc_inode() method for super_operations
+ * @sb: see super_operations methods
+ */
+static struct inode*
+azfs_alloc_inode(struct super_block *sb)
+{
+ struct azfs_znode *znode;
+
+ znode = kmem_cache_alloc(azfs_znode_cache, GFP_KERNEL);
+
+ INIT_LIST_HEAD(&znode->block_list);
+ rwlock_init(&znode->lock);
+
+ inode_init_once(&znode->vfs_inode);
+
+ return znode ? &znode->vfs_inode : NULL;
+}
+
+/**
+ * azfs_destroy_inode - destroy_inode() method for super_operations
+ * @inode: see super_operations methods
+ */
+static void
+azfs_destroy_inode(struct inode *inode)
+{
+ kmem_cache_free(azfs_znode_cache, I2Z(inode));
+}
+
+/**
+ * azfs_delete_inode - delete_inode() method for super_operations
+ * @inode: see super_operations methods
+ */
+static void
+azfs_delete_inode(struct inode *inode)
+{
+ if (S_ISREG(inode->i_mode)) {
+ i_size_write(inode, 0);
+ azfs_truncate(inode);
+ }
+ truncate_inode_pages(&inode->i_data, 0);
+ clear_inode(inode);
+}
+
+/**
+ * azfs_statfs - statfs() method for super_operations
+ * @dentry, @stat: see super_operations methods
+ */
+static int
+azfs_statfs(struct dentry *dentry, struct kstatfs *stat)
+{
+ struct super_block *sb;
+ struct azfs_super *super;
+ struct inode *inode;
+ unsigned long inodes, blocks;
+
+ sb = dentry->d_sb;
+ super = sb->s_fs_info;
+
+ inodes = blocks = 0;
+ mutex_lock(&sb->s_lock);
+ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+ inodes++;
+ blocks += inode->i_blocks;
+ }
+ mutex_unlock(&sb->s_lock);
+
+ stat->f_type = AZFS_SUPERBLOCK_MAGIC;
+ stat->f_bsize = super->block_size;
+ stat->f_blocks = super->media_size >> super->block_shift;
+ stat->f_bfree = stat->f_blocks - blocks;
+ stat->f_bavail = stat->f_blocks - blocks;
+ stat->f_files = inodes + blocks;
+ stat->f_ffree = blocks + 1;
+ stat->f_namelen = NAME_MAX;
+
+ return 0;
+}
+
+static struct super_operations azfs_ops = {
+ .alloc_inode = azfs_alloc_inode,
+ .destroy_inode = azfs_destroy_inode,
+ .drop_inode = generic_delete_inode,
+ .delete_inode = azfs_delete_inode,
+ .statfs = azfs_statfs
+};
+
+enum {
+ Opt_blocksize_short,
+ Opt_blocksize_long,
+ Opt_uid,
+ Opt_gid,
+ Opt_err
+};
+
+static match_table_t tokens = {
+ {Opt_blocksize_short, "bs=%u"},
+ {Opt_blocksize_long, "blocksize=%u"},
+ {Opt_uid, "uid=%u"},
+ {Opt_gid, "gid=%u"},
+ {Opt_err, NULL}
+};
+
+/**
+ * azfs_parse_mount_parameters - parse options given to mount with -o
+ * @sb: super block
+ * @options: comma separated options
+ */
+static int
+azfs_parse_mount_parameters(struct super_block *sb, char *options)
+{
+ struct azfs_super *super;
+ char *option;
+ int token, value;
+ substring_t args[MAX_OPT_ARGS];
+
+ super = sb->s_fs_info;
+
+ while ((option = strsep(&options, ",")) != NULL) {
+ if (!*option)
+ continue;
+
+ token = match_token(option, tokens, args);
+ switch (token) {
+ case Opt_blocksize_short:
+ case Opt_blocksize_long:
+ if (match_int(&args[0], &value))
+ goto syntax_error;
+ super->block_size = value;
+ break;
+
+ case Opt_uid:
+ if (match_int(&args[0], &value))
+ goto syntax_error;
+ super->uid = value;
+ break;
+
+ case Opt_gid:
+ if (match_int(&args[0], &value))
+ goto syntax_error;
+ super->gid = value;
+ break;
+
+ default:
+ goto syntax_error;
+ }
+ }
+
+ return 1;
+
+syntax_error:
+ printk(KERN_ERR "%s: invalid mount option\n",
+ AZFS_FILESYSTEM_NAME);
+
+ return 0;
+}
+
+/**
+ * azfs_fill_super - fill_super routine for get_sb
+ * @sb, @data, @silent: see file_system_type methods
+ */
+static int
+azfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+ struct gendisk *disk;
+ struct azfs_super *super = NULL, *knoten;
+ struct azfs_block *block = NULL;
+ struct inode *inode = NULL;
+ void *kaddr;
+ unsigned long pfn;
+ int rc;
+
+ BUG_ON(!sb->s_bdev);
+
+ disk = sb->s_bdev->bd_disk;
+
+ if (!disk || !disk->queue) {
+ printk(KERN_ERR "%s needs a block device which has a gendisk "
+ "with a queue\n",
+ AZFS_FILESYSTEM_NAME);
+ return -ENOSYS;
+ }
+
+ if (!disk->fops->direct_access) {
+ printk(KERN_ERR "%s needs a block device with a "
+ "direct_access() method\n",
+ AZFS_FILESYSTEM_NAME);
+ return -ENOSYS;
+ }
+
+ if (!get_device(disk->driverfs_dev)) {
+ printk(KERN_ERR "%s cannot get reference to device driver\n",
+ AZFS_FILESYSTEM_NAME);
+ return -EFAULT;
+ }
+
+ sb->s_magic = AZFS_SUPERBLOCK_MAGIC;
+ sb->s_flags = AZFS_SUPERBLOCK_FLAGS;
+ sb->s_op = &azfs_ops;
+ sb->s_maxbytes = get_capacity(disk) * disk->queue->hardsect_size;
+ sb->s_time_gran = 1;
+
+ spin_lock(&super_list.lock);
+ list_for_each_entry(knoten, &super_list.head, list)
+ if (knoten->blkdev == sb->s_bdev) {
+ super = knoten;
+ break;
+ }
+ spin_unlock(&super_list.lock);
+
+ if (super) {
+ if (strlen((char*) data))
+ printk(KERN_WARNING "/dev/%s was already mounted with "
+ "%s before, it will be mounted with "
+ "mount options used last time, "
+ "options just given would be ignored\n",
+ disk->disk_name, AZFS_FILESYSTEM_NAME);
+ sb->s_fs_info = super;
+ } else {
+ super = kzalloc(sizeof(struct azfs_super), GFP_KERNEL);
+ if (!super) {
+ rc = -ENOMEM;
+ goto failed;
+ }
+ sb->s_fs_info = super;
+
+ if (!azfs_parse_mount_parameters(sb, (char*) data)) {
+ rc = -EINVAL;
+ goto failed;
+ }
+
+ inode = azfs_new_inode(sb, NULL, S_IFDIR | S_IRWXUGO, 0);
+ if (!inode) {
+ rc = -ENOMEM;
+ goto failed;
+ }
+
+ super->root = d_alloc_root(inode);
+ if (!super->root) {
+ rc = -ENOMEM;
+ goto failed;
+ }
+ dget(super->root);
+
+ INIT_LIST_HEAD(&super->list);
+ INIT_LIST_HEAD(&super->block_list);
+ rwlock_init(&super->lock);
+
+ super->media_size = sb->s_maxbytes;
+
+ if (!super->block_size)
+ super->block_size = sb->s_blocksize;
+ super->block_shift = blksize_bits(super->block_size);
+
+ super->sector_size = disk->queue->hardsect_size;
+ super->sector_shift = blksize_bits(super->sector_size);
+
+ super->blkdev = sb->s_bdev;
+
+ block = azfs_block_init(&super->block_list,
+ 0, super->media_size >> super->block_shift);
+ if (!block) {
+ rc = -ENOMEM;
+ goto failed;
+ }
+
+ rc = disk->fops->direct_access(super->blkdev, 0, &kaddr, &pfn);
+ if (rc < 0) {
+ rc = -EFAULT;
+ goto failed;
+ }
+ super->ph_addr = (unsigned long) kaddr;
+
+ super->io_addr = (unsigned long) ioremap_flags(
+ super->ph_addr, super->media_size, _PAGE_NO_CACHE);
+ if (!super->io_addr) {
+ rc = -EFAULT;
+ goto failed;
+ }
+
+ spin_lock(&super_list.lock);
+ list_add(&super->list, &super_list.head);
+ spin_unlock(&super_list.lock);
+ }
+
+ sb->s_root = super->root;
+ disk->driverfs_dev->driver_data = super;
+ disk->driverfs_dev->platform_data = sb;
+
+ if (super->block_size < PAGE_SIZE)
+ printk(KERN_INFO "Block size on %s is smaller then system "
+ "page size: mmap() would not be supported\n",
+ disk->disk_name);
+
+ return 0;
+
+failed:
+ if (super) {
+ sb->s_root = NULL;
+ sb->s_fs_info = NULL;
+ if (block)
+ azfs_block_free(block);
+ if (super->root)
+ dput(super->root);
+ if (inode)
+ iput(inode);
+ disk->driverfs_dev->driver_data = NULL;
+ kfree(super);
+ disk->driverfs_dev->platform_data = NULL;
+ put_device(disk->driverfs_dev);
+ }
+
+ return rc;
+}
+
+/**
+ * azfs_get_sb - get_sb() method for file_system_type
+ * @fs_type, @flags, @dev_name, @data, @mount: see file_system_type methods
+ */
+static int
+azfs_get_sb(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data, struct vfsmount *mount)
+{
+ return get_sb_bdev(fs_type, flags,
+ dev_name, data, azfs_fill_super, mount);
+}
+
+/**
+ * azfs_kill_sb - kill_sb() method for file_system_type
+ * @sb: see file_system_type methods
+ */
+static void
+azfs_kill_sb(struct super_block *sb)
+{
+ sb->s_root = NULL;
+ kill_block_super(sb);
+}
+
+static struct file_system_type azfs_fs = {
+ .owner = THIS_MODULE,
+ .name = AZFS_FILESYSTEM_NAME,
+ .get_sb = azfs_get_sb,
+ .kill_sb = azfs_kill_sb,
+ .fs_flags = AZFS_FILESYSTEM_FLAGS
+};
+
+/**
+ * azfs_init
+ */
+static int __init
+azfs_init(void)
+{
+ int rc;
+
+ INIT_LIST_HEAD(&super_list.head);
+ spin_lock_init(&super_list.lock);
+
+ azfs_znode_cache = kmem_cache_create("azfs_znode_cache",
+ sizeof(struct azfs_znode), 0, AZFS_CACHE_FLAGS, NULL);
+ if (!azfs_znode_cache) {
+ printk(KERN_ERR "Could not allocate inode cache for %s\n",
+ AZFS_FILESYSTEM_NAME);
+ rc = -ENOMEM;
+ goto failed;
+ }
+
+ azfs_block_cache = kmem_cache_create("azfs_block_cache",
+ sizeof(struct azfs_block), 0, AZFS_CACHE_FLAGS, NULL);
+ if (!azfs_block_cache) {
+ printk(KERN_ERR "Could not allocate block cache for %s\n",
+ AZFS_FILESYSTEM_NAME);
+ rc = -ENOMEM;
+ goto failed;
+ }
+
+ rc = register_filesystem(&azfs_fs);
+ if (rc != 0) {
+ printk(KERN_ERR "Could not register %s\n",
+ AZFS_FILESYSTEM_NAME);
+ goto failed;
+ }
+
+ return 0;
+
+failed:
+ if (azfs_block_cache)
+ kmem_cache_destroy(azfs_block_cache);
+
+ if (azfs_znode_cache)
+ kmem_cache_destroy(azfs_znode_cache);
+
+ return rc;
+}
+
+/**
+ * azfs_exit
+ */
+static void __exit
+azfs_exit(void)
+{
+ struct azfs_super *super, *SUPER;
+ struct azfs_block *block, *knoten;
+ struct gendisk *disk;
+
+ spin_lock(&super_list.lock);
+ list_for_each_entry_safe(super, SUPER, &super_list.head, list) {
+ disk = super->blkdev->bd_disk;
+ list_del(&super->list);
+ iounmap((void*) super->io_addr);
+ write_lock(&super->lock);
+ for_each_block_safe(block, knoten, &super->block_list)
+ azfs_block_free(block);
+ write_unlock(&super->lock);
+ disk->driverfs_dev->driver_data = NULL;
+ disk->driverfs_dev->platform_data = NULL;
+ kfree(super);
+ put_device(disk->driverfs_dev);
+ }
+ spin_unlock(&super_list.lock);
+
+ unregister_filesystem(&azfs_fs);
+
+ kmem_cache_destroy(azfs_block_cache);
+ kmem_cache_destroy(azfs_znode_cache);
+}
+
+module_init(azfs_init);
+module_exit(azfs_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Maxim Shchetynin <[email protected]>");
+MODULE_DESCRIPTION("Non-buffered file system for IO devices");


Mit freundlichen Grüßen / met vriendelijke groeten / avec regards

Maxim V. Shchetynin
Linux Kernel Entwicklung
IBM Deutschland Entwicklung GmbH
Linux für Cell, Abteilung 3250
Schönaicher Straße 220
71032 Böblingen

Vorsitzender des Aufsichtsrats: Johann Weihen
Geschäftsführung: Herbert Kircher
Sitz der Gesellschaft: Böblingen
Registriergericht: Amtsgericht Stuttgart, HRB 243294

Fahr nur so schnell wie dein Schutzengel fliegen kann!


2008-06-09 12:55:59

by Matthew Wilcox

[permalink] [raw]
Subject: Re: AZFS file system proposal

On Mon, Jun 09, 2008 at 10:46:50AM +0200, Maxim Shchetynin wrote:
> +config AZ_FS
> + tristate "AZFS filesystem support"
> + help
> + Non-buffered filesystem for block devices with a gendisk and
> + with direct_access() method in gendisk->fops.
> + AZFS does not buffer outgoing traffic and is doing no read ahead.
> + It supports mount options (given with -o) bs=x,uid=x,gid=x.
> + If block-size (bs) is not specified AZFS uses block-size used
> + by block device. Though mmap() method is available only if
> + block-size equals to or is greater than system page size.

This is a terrible description. I can't even suggest better wording
because I don't understand it. The correct wording would not mention
gendisks, fops or direct_access(). It would tell the user what they
might use it for.

--
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours. We can't possibly take such
a retrograde step."

2008-06-10 08:49:33

by Maxim Shchetynin

[permalink] [raw]
Subject: Re: AZFS file system proposal

On Mon, 9 Jun 2008 06:55:30 -0600
Matthew Wilcox <[email protected]> wrote:

> On Mon, Jun 09, 2008 at 10:46:50AM +0200, Maxim Shchetynin wrote:
> > +config AZ_FS
> > + tristate "AZFS filesystem support"
> > + help
> > + Non-buffered filesystem for block devices with a gendisk and
> > + with direct_access() method in gendisk->fops.
> > + AZFS does not buffer outgoing traffic and is doing no read ahead.
> > + It supports mount options (given with -o) bs=x,uid=x,gid=x.
> > + If block-size (bs) is not specified AZFS uses block-size used
> > + by block device. Though mmap() method is available only if
> > + block-size equals to or is greater than system page size.
>
> This is a terrible description. I can't even suggest better wording
> because I don't understand it. The correct wording would not mention
> gendisks, fops or direct_access(). It would tell the user what they
> might use it for.
>

What do you think about the following - shorter description and a better place for it (moved to "Pseudo filesystems"):

--- linux-2.6.26-rc5/fs/Kconfig 2008-06-05 05:10:44.000000000 +0200
+++ linux-2.6.26-rc5-azfs/fs/Kconfig 2008-06-10 10:43:24.326686538 +0200
@@ -1017,6 +1017,19 @@
config HUGETLB_PAGE
def_bool HUGETLBFS

+config AZ_FS
+ tristate "AZFS filesystem support"
+ help
+ azfs is a file system for I/O attached memory backing. It requires
+ a block device with direct_access capability, e.g. axonram.
+ Mounting such device with azfs gives memory mapped access to the
+ underlying memory to user space.
+
+ To compile this file system support as a module, choose M here: the
+ module will be called azfs.
+
+ If unsure, say N.
+
config CONFIGFS_FS
tristate "Userspace-driven configuration filesystem"
depends on SYSFS

--
Mit freundlichen Grüßen / met vriendelijke groeten / avec regards

Maxim V. Shchetynin
Linux Kernel Entwicklung
IBM Deutschland Entwicklung GmbH
Linux für Cell, Abteilung 3250
Schönaicher Straße 220
71032 Böblingen

Vorsitzender des Aufsichtsrats: Johann Weihen
Geschäftsführung: Herbert Kircher
Sitz der Gesellschaft: Böblingen
Registriergericht: Amtsgericht Stuttgart, HRB 243294

Fahr nur so schnell wie dein Schutzengel fliegen kann!

2008-06-10 22:02:50

by Jan Engelhardt

[permalink] [raw]
Subject: Re: AZFS file system proposal


On Tuesday 2008-06-10 10:49, Maxim Shchetynin wrote:
>
>What do you think about the following - shorter description and a better place for it (moved to "Pseudo filesystems"):
>
>--- linux-2.6.26-rc5/fs/Kconfig 2008-06-05 05:10:44.000000000 +0200
>+++ linux-2.6.26-rc5-azfs/fs/Kconfig 2008-06-10 10:43:24.326686538 +0200
>@@ -1017,6 +1017,19 @@
> config HUGETLB_PAGE
> def_bool HUGETLBFS
>
>+config AZ_FS
>+ tristate "AZFS filesystem support"

This is slightly redundant, it should probably be
tristate "AZ filesystem support"

>+ help
>+ azfs is a file system for I/O attached memory backing. It requires
>+ a block device with direct_access capability, e.g. axonram.

(a) What is axonram?
(b) why is axonram direct_access, and my other devices (including
standard PC RAM) not?

>+ Mounting such device with azfs gives memory mapped access to the
>+ underlying memory to user space.

Can't I just mmap(/dev/theblockdevice), why would I need to go through
azfs?

2008-06-17 09:07:18

by Maxim Shchetynin

[permalink] [raw]
Subject: Re: AZFS file system proposal

> >+config AZ_FS
> >+ tristate "AZFS filesystem support"
>
> This is slightly redundant, it should probably be
> tristate "AZ filesystem support"

I have called it the same way as other file systems (in Kconfig they say "JFS filesystem support", "XFS filesystem support", "NTFS file system support", ...)

> >+ help
> >+ azfs is a file system for I/O attached memory backing. It requires
> >+ a block device with direct_access capability, e.g. axonram.
>
> (a) What is axonram?
> (b) why is axonram direct_access, and my other devices (including
> standard PC RAM) not?

Axonram is a module specific for IBM CellBlade and allows access to a DDR2 memory attached onto Axon controller.
You don't need direct_access and not azfs for main memory because there is already a pretty nice tmpfs for it.

> >+ Mounting such device with azfs gives memory mapped access to the
> >+ underlying memory to user space.
>
> Can't I just mmap(/dev/theblockdevice), why would I need to go through
> azfs?

Yes, you can mmap it. But what does it have to do with a file system. I would say you can either mmap it or place a file system on it.

--
Mit freundlichen Grüßen / met vriendelijke groeten / avec regards

Maxim V. Shchetynin
Linux Kernel Entwicklung
IBM Deutschland Entwicklung GmbH
Linux für Cell, Abteilung 3250
Schönaicher Straße 220
71032 Böblingen

Vorsitzender des Aufsichtsrats: Johann Weihen
Geschäftsführung: Herbert Kircher
Sitz der Gesellschaft: Böblingen
Registriergericht: Amtsgericht Stuttgart, HRB 243294

Fahr nur so schnell wie dein Schutzengel fliegen kann!

2008-06-17 09:35:40

by Jan Engelhardt

[permalink] [raw]
Subject: Re: AZFS file system proposal


On Tuesday 2008-06-17 11:06, Maxim Shchetynin wrote:
>
>> (a) What is axonram?
>> (b) why is axonram direct_access, and my other devices (including
>> standard PC RAM) not?
>
>[Axonram is a module specific for IBM CellBlade and allows access to
>a DDR2 memory attached onto Axon controller.] You don't need
>direct_access and not azfs for main memory because there is already
>a pretty nice tmpfs for it.

So the reason azfs could be useful is...?

>> >+ Mounting such device with azfs gives memory mapped
>> >access to the + underlying memory to user space.
>>
>> Can't I just mmap(/dev/theblockdevice), why would I need to go through
>> azfs?
>
>Yes, you can mmap it. But what does it have to do with a file system.

Exactly, that's what I was asking myself.

>I would say you can either mmap it or place a file system on it.

The help text implies that I *need* azfs to mmap it - and that sounds
like a Rube Goldberg machine.

2008-06-17 10:53:55

by Jörn Engel

[permalink] [raw]
Subject: Re: AZFS file system proposal

On Tue, 17 June 2008 11:35:10 +0200, Jan Engelhardt wrote:
> On Tuesday 2008-06-17 11:06, Maxim Shchetynin wrote:
>
> >I would say you can either mmap it or place a file system on it.
>
> The help text implies that I *need* azfs to mmap it - and that sounds
> like a Rube Goldberg machine.

Some people actually prefer filesystems over raw devices for a variety
of reasons:
- each file brings its own address space, which offers memory protection
from other processes,
- files can have owners and permission bits,
- files hide the fragmentation of the underlying device from users,
- a file system provides a common and well-understood api for devices
with less common or well-understood apis,
- etc.

Those reasons are as valid for azfs as for any other filesystem. I have
no doubt that azfs is useful. It probably wouldn't hurt to express the
merits of the filesystem and the problems it is supposed to solve a
little better. So far most criticism was based on the fact that noone
understood what the hell it was all about.

My personal question when looking at this is: Why not use ext2? It
appears to me that an ext2 mounted with '-o xip' would solve the same
problems.

Jörn

--
To announce that there must be no criticism of the President, or that we
are to stand by the President, right or wrong, is not only unpatriotic
and servile, but is morally treasonable to the American public.
-- Theodore Roosevelt, Kansas City Star, 1918

2008-06-17 11:57:53

by Maxim Shchetynin

[permalink] [raw]
Subject: Re: AZFS file system proposal

> >> (a) What is axonram?
> >> (b) why is axonram direct_access, and my other devices (including
> >> standard PC RAM) not?
> >
> >[Axonram is a module specific for IBM CellBlade and allows access to
> >a DDR2 memory attached onto Axon controller.] You don't need
> >direct_access and not azfs for main memory because there is already
> >a pretty nice tmpfs for it.
>
> So the reason azfs could be useful is...?

The "?" at the end of your sentence might mean a question. I would be really happy to give an answer on it if I could understand it.

> >> >+ Mounting such device with azfs gives memory mapped
> >> >access to the + underlying memory to user space.
> >>
> >> Can't I just mmap(/dev/theblockdevice), why would I need to go through
> >> azfs?
> >
> >Yes, you can mmap it. But what does it have to do with a file system.
>
> Exactly, that's what I was asking myself.

I am glad I could make you feel comfortable with that answer.

> >I would say you can either mmap it or place a file system on it.
>
> The help text implies that I *need* azfs to mmap it - and that sounds
> like a Rube Goldberg machine.

Does it? If it is really so, then only because my english is not perfect. I didn't want to compel someone to do mmap or to disturb user's rights in any other way. By the way, you can also mmap some files on AZFS like on any other file system. But mmap wouldn't work if you mount AZFS with a block size less than the system page size.

--
Mit freundlichen Grüßen / met vriendelijke groeten / avec regards

Maxim V. Shchetynin
Linux Kernel Entwicklung
IBM Deutschland Entwicklung GmbH
Linux für Cell, Abteilung 3250
Schönaicher Straße 220
71032 Böblingen

Vorsitzender des Aufsichtsrats: Johann Weihen
Geschäftsführung: Herbert Kircher
Sitz der Gesellschaft: Böblingen
Registriergericht: Amtsgericht Stuttgart, HRB 243294

Fahr nur so schnell wie dein Schutzengel fliegen kann!

2008-06-17 14:06:54

by Maxim Shchetynin

[permalink] [raw]
Subject: Re: AZFS file system proposal

On Tue, 17 Jun 2008 12:53:00 +0200
Jörn Engel <[email protected]> wrote:

> On Tue, 17 June 2008 11:35:10 +0200, Jan Engelhardt wrote:
> > On Tuesday 2008-06-17 11:06, Maxim Shchetynin wrote:
> >
> > >I would say you can either mmap it or place a file system on it.
> >
> > The help text implies that I *need* azfs to mmap it - and that sounds
> > like a Rube Goldberg machine.
>
> Some people actually prefer filesystems over raw devices for a variety
> of reasons:
> - each file brings its own address space, which offers memory protection
> from other processes,
> - files can have owners and permission bits,
> - files hide the fragmentation of the underlying device from users,
> - a file system provides a common and well-understood api for devices
> with less common or well-understood apis,
> - etc.
>
> Those reasons are as valid for azfs as for any other filesystem. I have
> no doubt that azfs is useful. It probably wouldn't hurt to express the
> merits of the filesystem and the problems it is supposed to solve a
> little better. So far most criticism was based on the fact that noone
> understood what the hell it was all about.
>
> My personal question when looking at this is: Why not use ext2? It

Same reason we don't use ext2 for file system in RAM - we don't need any expensive caching and readahead mechanisms for devices which are as fast as main memory (or almost as fast).

> appears to me that an ext2 mounted with '-o xip' would solve the same
> problems.

--
Mit freundlichen Grüßen / met vriendelijke groeten / avec regards

Maxim V. Shchetynin
Linux Kernel Entwicklung
IBM Deutschland Entwicklung GmbH
Linux für Cell, Abteilung 3250
Schönaicher Straße 220
71032 Böblingen

Vorsitzender des Aufsichtsrats: Johann Weihen
Geschäftsführung: Herbert Kircher
Sitz der Gesellschaft: Böblingen
Registriergericht: Amtsgericht Stuttgart, HRB 243294

Fahr nur so schnell wie dein Schutzengel fliegen kann!

2008-06-17 14:36:39

by Jan Engelhardt

[permalink] [raw]
Subject: Re: AZFS file system proposal


On Tuesday 2008-06-17 13:57, Maxim Shchetynin wrote:

>> >> (a) What is axonram?
>> >> (b) why is axonram direct_access, and my other devices (including
>> >> standard PC RAM) not?
>> >
>> >[Axonram is a module specific for IBM CellBlade and allows access to
>> >a DDR2 memory attached onto Axon controller.] You don't need
>> >direct_access and not azfs for main memory because there is already
>> >a pretty nice tmpfs for it.
>>
>> So the reason azfs could be useful is...?
>
>The "?" at the end of your sentence might mean a question. I would
>be really happy to give an answer on it if I could understand it.

If one does not need direct_access nor azfs, and ext2 plus XIP minus
caching or tmpfs plus XIP can achieve the same effect, what can azfs
that we could not already do without azfs?

2008-06-17 14:45:49

by Jörn Engel

[permalink] [raw]
Subject: Re: AZFS file system proposal

On Tue, 17 June 2008 16:06:28 +0200, Maxim Shchetynin wrote:
> >
> > My personal question when looking at this is: Why not use ext2? It
>
> Same reason we don't use ext2 for file system in RAM - we don't need any expensive caching and readahead mechanisms for devices which are as fast as main memory (or almost as fast).

Well, with the '-o xip' option I mentioned below, there is no caching or
readahead. The direct_access method you use as well was developed for
ext2. Or rather for xip2fs, another new filesystem. After some review,
that was shot down and a mount option was added to ext2 instead.

> > appears to me that an ext2 mounted with '-o xip' would solve the same
> > problems.

Jörn

--
Homo Sapiens is a goal, not a description.
-- unknown

2008-06-17 15:01:46

by Dmitri Vorobiev

[permalink] [raw]
Subject: Re: AZFS file system proposal

Maxim Shchetynin wrote:
> Hello,
>
> there are some users which have interest on such kind of file system like azfs. Please, have a look at this version of a diff file which introduces a first version of azfs to 2.6.26. This file system may be useful for example on IBM CellBlades where user can mount DDR2 memory of Axon controller as a disk and to be able to access it directly without any caching mechanism in between.
>
> Subject: azfs: initial submit of azfs, a non-buffered filesystem
>
> From: Maxim Shchetynin <[email protected]>
>
> Non-buffered filesystem for block devices with a gendisk and
> with direct_access() method in gendisk->fops.
> AZFS does not buffer outgoing traffic and is doing no read ahead.
> It supports mount options (given with -o) bs=x,uid=x,gid=x.
> If block-size (bs) is not specified AZFS uses block-size used
> by block device. Though mmap() method is available only if
> block-size equals to or is greater than the system page size.
>
> Signed-off-by: Maxim Shchetynin <[email protected]>
>
> diff -Nuar linux-2.6.26-rc5/arch/powerpc/configs/cell_defconfig linux-2.6.26-rc5-azfs/arch/powerpc/configs/cell_defconfig
> --- linux-2.6.26-rc5/arch/powerpc/configs/cell_defconfig 2008-06-05 05:10:44.000000000 +0200
> +++ linux-2.6.26-rc5-azfs/arch/powerpc/configs/cell_defconfig 2008-06-06 11:53:34.000000000 +0200
> @@ -240,6 +240,7 @@
> # CPU Frequency drivers
> #
> CONFIG_AXON_RAM=m
> +CONFIG_AZ_FS=m
> # CONFIG_FSL_ULI1575 is not set
>
> #
> diff -Nuar linux-2.6.26-rc5/fs/Kconfig linux-2.6.26-rc5-azfs/fs/Kconfig
> --- linux-2.6.26-rc5/fs/Kconfig 2008-06-05 05:10:44.000000000 +0200
> +++ linux-2.6.26-rc5-azfs/fs/Kconfig 2008-06-06 16:55:11.616419992 +0200
> @@ -360,6 +360,17 @@
> If you are not using a security module that requires using
> extended attributes for file security labels, say N.
>
> +config AZ_FS
> + tristate "AZFS filesystem support"
> + help
> + Non-buffered filesystem for block devices with a gendisk and
> + with direct_access() method in gendisk->fops.
> + AZFS does not buffer outgoing traffic and is doing no read ahead.
> + It supports mount options (given with -o) bs=x,uid=x,gid=x.
> + If block-size (bs) is not specified AZFS uses block-size used
> + by block device. Though mmap() method is available only if
> + block-size equals to or is greater than system page size.
> +
> config JFS_FS
> tristate "JFS filesystem support"
> select NLS
> diff -Nuar linux-2.6.26-rc5/fs/Makefile linux-2.6.26-rc5-azfs/fs/Makefile
> --- linux-2.6.26-rc5/fs/Makefile 2008-06-05 05:10:44.000000000 +0200
> +++ linux-2.6.26-rc5-azfs/fs/Makefile 2008-06-06 11:53:34.000000000 +0200
> @@ -119,3 +119,4 @@
> obj-$(CONFIG_DEBUG_FS) += debugfs/
> obj-$(CONFIG_OCFS2_FS) += ocfs2/
> obj-$(CONFIG_GFS2_FS) += gfs2/
> +obj-$(CONFIG_AZ_FS) += azfs.o
> diff -Nuar linux-2.6.26-rc5/fs/azfs.c linux-2.6.26-rc5-azfs/fs/azfs.c
> --- linux-2.6.26-rc5/fs/azfs.c 1970-01-01 01:00:00.000000000 +0100
> +++ linux-2.6.26-rc5-azfs/fs/azfs.c 2008-06-06 17:46:23.587653053 +0200
> @@ -0,0 +1,1179 @@
> +/*
> + * (C) Copyright IBM Deutschland Entwicklung GmbH 2007
> + *
> + * Author: Maxim Shchetynin <[email protected]>
> + *
> + * Non-buffered filesystem driver.
> + * It registers a filesystem which may be used for all kind of block devices
> + * which have a direct_access() method in block_device_operations.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2, or (at your option)
> + * any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
> + */
> +
> +#include <linux/backing-dev.h>
> +#include <linux/blkdev.h>
> +#include <linux/cache.h>
> +#include <linux/dcache.h>
> +#include <linux/device.h>
> +#include <linux/err.h>
> +#include <linux/fs.h>
> +#include <linux/genhd.h>
> +#include <linux/kernel.h>
> +#include <linux/limits.h>
> +#include <linux/list.h>
> +#include <linux/module.h>
> +#include <linux/mount.h>
> +#include <linux/mm.h>
> +#include <linux/mm_types.h>
> +#include <linux/mutex.h>
> +#include <linux/namei.h>
> +#include <linux/pagemap.h>
> +#include <linux/parser.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +#include <linux/stat.h>
> +#include <linux/statfs.h>
> +#include <linux/string.h>
> +#include <linux/time.h>
> +#include <linux/types.h>
> +#include <linux/aio.h>
> +#include <linux/uio.h>
> +#include <asm/bug.h>
> +#include <asm/page.h>
> +#include <asm/pgtable.h>
> +#include <asm/string.h>
> +
> +#define AZFS_FILESYSTEM_NAME "azfs"
> +#define AZFS_FILESYSTEM_FLAGS FS_REQUIRES_DEV
> +
> +#define AZFS_SUPERBLOCK_MAGIC 0xABBA1972
> +#define AZFS_SUPERBLOCK_FLAGS MS_NOEXEC | \
> + MS_SYNCHRONOUS | \
> + MS_DIRSYNC | \
> + MS_ACTIVE
> +
> +#define AZFS_BDI_CAPABILITIES BDI_CAP_NO_ACCT_DIRTY | \
> + BDI_CAP_NO_WRITEBACK | \
> + BDI_CAP_MAP_COPY | \
> + BDI_CAP_MAP_DIRECT | \
> + BDI_CAP_VMFLAGS
> +
> +#define AZFS_CACHE_FLAGS SLAB_HWCACHE_ALIGN | \
> + SLAB_RECLAIM_ACCOUNT | \
> + SLAB_MEM_SPREAD
> +
> +enum azfs_direction {
> + AZFS_MMAP,
> + AZFS_READ,
> + AZFS_WRITE
> +};
> +
> +struct azfs_super {
> + struct list_head list;
> + unsigned long media_size;
> + unsigned long block_size;
> + unsigned short block_shift;
> + unsigned long sector_size;
> + unsigned short sector_shift;
> + uid_t uid;
> + gid_t gid;
> + unsigned long ph_addr;
> + unsigned long io_addr;
> + struct block_device *blkdev;
> + struct dentry *root;
> + struct list_head block_list;
> + rwlock_t lock;
> +};
> +
> +struct azfs_super_list {
> + struct list_head head;
> + spinlock_t lock;
> +};
> +
> +struct azfs_block {
> + struct list_head list;
> + unsigned long id;
> + unsigned long count;
> +};
> +
> +struct azfs_znode {
> + struct list_head block_list;
> + rwlock_t lock;
> + loff_t size;
> + struct inode vfs_inode;
> +};
> +
> +static struct azfs_super_list super_list;
> +static struct kmem_cache *azfs_znode_cache __read_mostly = NULL;
> +static struct kmem_cache *azfs_block_cache __read_mostly = NULL;
> +
> +#define I2Z(inode) \
> + container_of(inode, struct azfs_znode, vfs_inode)
> +
> +#define for_each_block(block, block_list) \
> + list_for_each_entry(block, block_list, list)
> +#define for_each_block_reverse(block, block_list) \
> + list_for_each_entry_reverse(block, block_list, list)
> +#define for_each_block_safe(block, ding, block_list) \
> + list_for_each_entry_safe(block, ding, block_list, list)
> +#define for_each_block_safe_reverse(block, ding, block_list) \
> + list_for_each_entry_safe_reverse(block, ding, block_list, list)
> +
> +/**
> + * azfs_block_init - create and initialise a new block in a list
> + * @block_list: destination list
> + * @id: block id
> + * @count: size of a block
> + */
> +static inline struct azfs_block*
> +azfs_block_init(struct list_head *block_list,
> + unsigned long id, unsigned long count)
> +{
> + struct azfs_block *block;
> +
> + block = kmem_cache_alloc(azfs_block_cache, GFP_KERNEL);
> + if (!block)
> + return NULL;
> +
> + block->id = id;
> + block->count = count;
> +
> + INIT_LIST_HEAD(&block->list);
> + list_add_tail(&block->list, block_list);
> +
> + return block;
> +}
> +
> +/**
> + * azfs_block_free - remove block from a list and free it back in cache
> + * @block: block to be removed
> + */
> +static inline void
> +azfs_block_free(struct azfs_block *block)
> +{
> + list_del(&block->list);
> + kmem_cache_free(azfs_block_cache, block);
> +}
> +
> +/**
> + * azfs_block_move - move block to another list
> + * @block: block to be moved
> + * @block_list: destination list
> + */
> +static inline void
> +azfs_block_move(struct azfs_block *block, struct list_head *block_list)
> +{
> + list_move_tail(&block->list, block_list);
> +}
> +
> +/**
> + * azfs_recherche - get real address of a part of a file
> + * @inode: inode
> + * @direction: data direction
> + * @from: offset for read/write operation
> + * @size: pointer to a value of the amount of data to be read/written
> + */
> +static unsigned long
> +azfs_recherche(struct inode *inode, enum azfs_direction direction,

At the risk of being damned by the entire francophone world, I'd still
suggest using an English keyword for the function name here.

> + unsigned long from, unsigned long *size)
> +{
> + struct azfs_super *super;
> + struct azfs_znode *znode;
> + struct azfs_block *block;
> + unsigned long block_id, west, east;
> +
> + super = inode->i_sb->s_fs_info;
> + znode = I2Z(inode);
> +
> + if (from + *size > znode->size) {
> + i_size_write(inode, from + *size);
> + inode->i_op->truncate(inode);
> + }
> +
> + read_lock(&znode->lock);
> +
> + if (list_empty(&znode->block_list)) {
> + read_unlock(&znode->lock);
> + return 0;
> + }
> +
> + block_id = from >> super->block_shift;
> +
> + for_each_block(block, &znode->block_list) {
> + if (block->count > block_id)
> + break;
> + block_id -= block->count;
> + }
> +
> + west = from % super->block_size;
> + east = ((block->count - block_id) << super->block_shift) - west;
> +
> + if (*size > east)
> + *size = east;
> +
> + block_id = ((block->id + block_id) << super->block_shift) + west;
> +
> + read_unlock(&znode->lock);
> +
> + block_id += direction == AZFS_MMAP ? super->ph_addr : super->io_addr;
> +
> + return block_id;
> +}
> +
> +static struct inode*
> +azfs_new_inode(struct super_block *, struct inode *, int, dev_t);

Would it not be better to place this function prototype along with the
bunch of macro definitions you have above into a private header?

> +
> +/**
> + * azfs_mknod - mknod() method for inode_operations
> + * @dir, @dentry, @mode, @dev: see inode_operations methods
> + */
> +static int
> +azfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
> +{
> + struct inode *inode;
> +
> + inode = azfs_new_inode(dir->i_sb, dir, mode, dev);
> + if (!inode)
> + return -ENOSPC;
> +
> + if (S_ISREG(mode))
> + I2Z(inode)->size = 0;
> +
> + dget(dentry);
> + d_instantiate(dentry, inode);
> +
> + return 0;
> +}
> +
> +/**
> + * azfs_create - create() method for inode_operations
> + * @dir, @dentry, @mode, @nd: see inode_operations methods
> + */
> +static int
> +azfs_create(struct inode *dir, struct dentry *dentry, int mode,
> + struct nameidata *nd)
> +{
> + return azfs_mknod(dir, dentry, mode | S_IFREG, 0);
> +}
> +
> +/**
> + * azfs_mkdir - mkdir() method for inode_operations
> + * @dir, @dentry, @mode: see inode_operations methods
> + */
> +static int
> +azfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
> +{
> + int rc;
> +
> + rc = azfs_mknod(dir, dentry, mode | S_IFDIR, 0);
> + if (rc == 0)

Maybe "if (!rc)" ?

> + inc_nlink(dir);
> +
> + return rc;
> +}
> +
> +/**
> + * azfs_symlink - symlink() method for inode_operations
> + * @dir, @dentry, @name: see inode_operations methods
> + */
> +static int
> +azfs_symlink(struct inode *dir, struct dentry *dentry, const char *name)
> +{
> + struct inode *inode;
> + int rc;
> +
> + inode = azfs_new_inode(dir->i_sb, dir, S_IFLNK | S_IRWXUGO, 0);
> + if (!inode)
> + return -ENOSPC;
> +
> + rc = page_symlink(inode, name, strlen(name) + 1);
> + if (rc) {
> + iput(inode);
> + return rc;
> + }
> +
> + dget(dentry);
> + d_instantiate(dentry, inode);
> +
> + return 0;
> +}
> +
> +/**
> + * azfs_aio_read - aio_read() method for file_operations
> + * @iocb, @iov, @nr_segs, @pos: see file_operations methods
> + */
> +static ssize_t
> +azfs_aio_read(struct kiocb *iocb, const struct iovec *iov,
> + unsigned long nr_segs, loff_t pos)
> +{
> + struct inode *inode;
> + void *ziel;

void *target?

> + unsigned long pin;
> + unsigned long size, todo, step;
> + ssize_t rc;
> +
> + inode = iocb->ki_filp->f_mapping->host;
> +
> + mutex_lock(&inode->i_mutex);
> +
> + if (pos >= i_size_read(inode)) {
> + rc = 0;
> + goto out;
> + }
> +
> + ziel = iov->iov_base;
> + todo = min((loff_t) iov->iov_len, i_size_read(inode) - pos);
> +
> + for (step = todo; step; step -= size) {
> + size = step;
> + pin = azfs_recherche(inode, AZFS_READ, pos, &size);
> + if (!pin) {
> + rc = -ENOSPC;
> + goto out;
> + }
> + if (copy_to_user(ziel, (void*) pin, size)) {
> + rc = -EFAULT;
> + goto out;
> + }
> +
> + iocb->ki_pos += size;
> + pos += size;
> + ziel += size;
> + }
> +
> + rc = todo;
> +
> +out:
> + mutex_unlock(&inode->i_mutex);
> +
> + return rc;
> +}
> +
> +/**
> + * azfs_aio_write - aio_write() method for file_operations
> + * @iocb, @iov, @nr_segs, @pos: see file_operations methods
> + */
> +static ssize_t
> +azfs_aio_write(struct kiocb *iocb, const struct iovec *iov,
> + unsigned long nr_segs, loff_t pos)
> +{
> + struct inode *inode;
> + void *quell;

void *source?

> + unsigned long pin;
> + unsigned long size, todo, step;
> + ssize_t rc;
> +
> + inode = iocb->ki_filp->f_mapping->host;
> +
> + quell = iov->iov_base;
> + todo = iov->iov_len;
> +
> + mutex_lock(&inode->i_mutex);
> +
> + for (step = todo; step; step -= size) {
> + size = step;
> + pin = azfs_recherche(inode, AZFS_WRITE, pos, &size);
> + if (!pin) {
> + rc = -ENOSPC;
> + goto out;
> + }
> + if (copy_from_user((void*) pin, quell, size)) {
> + rc = -EFAULT;
> + goto out;
> + }
> +
> + iocb->ki_pos += size;
> + pos += size;
> + quell += size;
> + }
> +
> + rc = todo;
> +
> +out:
> + mutex_unlock(&inode->i_mutex);
> +
> + return rc;
> +}
> +
> +/**
> + * azfs_open - open() method for file_operations
> + * @inode, @file: see file_operations methods
> + */
> +static int
> +azfs_open(struct inode *inode, struct file *file)
> +{
> + file->private_data = inode;
> +
> + if (file->f_flags & O_TRUNC) {
> + i_size_write(inode, 0);
> + inode->i_op->truncate(inode);
> + }
> + if (file->f_flags & O_APPEND)
> + inode->i_fop->llseek(file, 0, SEEK_END);
> +
> + return 0;
> +}
> +
> +/**
> + * azfs_mmap - mmap() method for file_operations
> + * @file, @vm: see file_operations methods
> + */
> +static int
> +azfs_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> + struct azfs_super *super;
> + struct azfs_znode *znode;
> + struct inode *inode;
> + unsigned long cursor, pin;
> + unsigned long todo, size, vm_start;
> + pgprot_t page_prot;
> +
> + inode = file->private_data;
> + znode = I2Z(inode);
> + super = inode->i_sb->s_fs_info;
> +
> + if (super->block_size < PAGE_SIZE)
> + return -EINVAL;
> +
> + cursor = vma->vm_pgoff << super->block_shift;
> + todo = vma->vm_end - vma->vm_start;
> +
> + if (cursor + todo > i_size_read(inode))
> + return -EINVAL;
> +
> + page_prot = pgprot_val(vma->vm_page_prot);
> + page_prot |= (_PAGE_NO_CACHE | _PAGE_RW);
> + page_prot &= ~_PAGE_GUARDED;
> + vma->vm_page_prot = __pgprot(page_prot);
> +
> + vm_start = vma->vm_start;
> + for (size = todo; todo; todo -= size, size = todo) {
> + pin = azfs_recherche(inode, AZFS_MMAP, cursor, &size);
> + if (!pin)
> + return -EAGAIN;
> + pin >>= PAGE_SHIFT;
> + if (remap_pfn_range(vma, vm_start, pin, size, vma->vm_page_prot))
> + return -EAGAIN;
> +
> + vm_start += size;
> + cursor += size;
> + }
> +
> + return 0;
> +}
> +
> +/**
> + * azfs_truncate - truncate() method for inode_operations
> + * @inode: see inode_operations methods
> + */
> +static void
> +azfs_truncate(struct inode *inode)
> +{
> + struct azfs_super *super;
> + struct azfs_znode *znode;
> + struct azfs_block *block, *ding, *knoten, *west, *east;

The risk of me getting damned increases with that, but maybe it would be
better to use an English keyword for "knoten"?

> + unsigned long id, count;
> + signed long delta;
> +
> + super = inode->i_sb->s_fs_info;
> + znode = I2Z(inode);
> +
> + delta = i_size_read(inode) + (super->block_size - 1);
> + delta >>= super->block_shift;
> + delta -= inode->i_blocks;
> +
> + if (delta == 0) {
> + znode->size = i_size_read(inode);
> + return;
> + }
> +
> + write_lock(&znode->lock);
> +
> + while (delta > 0) {
> + west = east = NULL;
> +
> + write_lock(&super->lock);
> +
> + if (list_empty(&super->block_list)) {
> + write_unlock(&super->lock);
> + break;
> + }
> +
> + for (count = delta; count; count--) {
> + for_each_block(block, &super->block_list)
> + if (block->count >= count) {
> + east = block;
> + break;
> + }
> + if (east)
> + break;
> + }
> +
> + for_each_block_reverse(block, &znode->block_list) {
> + if (block->id + block->count == east->id)
> + west = block;
> + break;
> + }
> +
> + if (east->count == count) {
> + if (west) {
> + west->count += east->count;
> + azfs_block_free(east);
> + } else {
> + azfs_block_move(east, &znode->block_list);
> + }
> + } else {
> + if (west) {
> + west->count += count;
> + } else {
> + if (!azfs_block_init(&znode->block_list,
> + east->id, count)) {
> + write_unlock(&super->lock);
> + break;
> + }
> + }
> +
> + east->id += count;
> + east->count -= count;
> + }
> +
> + write_unlock(&super->lock);
> +
> + inode->i_blocks += count;
> +
> + delta -= count;
> + }
> +
> + while (delta < 0) {
> + for_each_block_safe_reverse(block, knoten, &znode->block_list) {
> + id = block->id;
> + count = block->count;
> + if ((signed long) count + delta > 0) {
> + block->count += delta;
> + id += block->count;
> + count -= block->count;
> + block = NULL;
> + }
> +
> + west = east = NULL;
> +
> + write_lock(&super->lock);
> +
> + for_each_block(ding, &super->block_list) {
> + if (!west && (ding->id + ding->count == id))
> + west = ding;
> + else if (!east && (id + count == ding->id))
> + east = ding;
> + if (west && east)
> + break;
> + }
> +
> + if (west && east) {
> + west->count += count + east->count;
> + azfs_block_free(east);
> + if (block)
> + azfs_block_free(block);
> + } else if (west) {
> + west->count += count;
> + if (block)
> + azfs_block_free(block);
> + } else if (east) {
> + east->id -= count;
> + east->count += count;
> + if (block)
> + azfs_block_free(block);
> + } else {
> + if (!block) {
> + if (!azfs_block_init(&super->block_list,
> + id, count)) {
> + write_unlock(&super->lock);
> + break;
> + }
> + } else {
> + azfs_block_move(block, &super->block_list);
> + }
> + }
> +
> + write_unlock(&super->lock);
> +
> + inode->i_blocks -= count;
> +
> + delta += count;
> +
> + break;
> + }
> + }
> +
> + write_unlock(&znode->lock);
> +
> + znode->size = min(i_size_read(inode),
> + (loff_t) inode->i_blocks << super->block_shift);
> +}
> +
> +/**
> + * azfs_getattr - getattr() method for inode_operations
> + * @mnt, @dentry, @stat: see inode_operations methods
> + */
> +static int
> +azfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
> +{
> + struct azfs_super *super;
> + struct inode *inode;
> + unsigned short shift;
> +
> + inode = dentry->d_inode;
> + super = inode->i_sb->s_fs_info;
> +
> + generic_fillattr(inode, stat);
> + stat->blocks = inode->i_blocks;
> + shift = super->block_shift - super->sector_shift;
> + if (shift)
> + stat->blocks <<= shift;
> +
> + return 0;
> +}
> +
> +static const struct address_space_operations azfs_aops = {
> + .write_begin = simple_write_begin,
> + .write_end = simple_write_end
> +};
> +
> +static struct backing_dev_info azfs_bdi = {
> + .ra_pages = 0,
> + .capabilities = AZFS_BDI_CAPABILITIES
> +};
> +
> +static struct inode_operations azfs_dir_iops = {
> + .create = azfs_create,
> + .lookup = simple_lookup,
> + .link = simple_link,
> + .unlink = simple_unlink,
> + .symlink = azfs_symlink,
> + .mkdir = azfs_mkdir,
> + .rmdir = simple_rmdir,
> + .mknod = azfs_mknod,
> + .rename = simple_rename
> +};
> +
> +static const struct file_operations azfs_reg_fops = {
> + .llseek = generic_file_llseek,
> + .aio_read = azfs_aio_read,
> + .aio_write = azfs_aio_write,
> + .open = azfs_open,
> + .mmap = azfs_mmap,
> + .fsync = simple_sync_file,
> +};
> +
> +static struct inode_operations azfs_reg_iops = {
> + .truncate = azfs_truncate,
> + .getattr = azfs_getattr
> +};
> +
> +/**
> + * azfs_new_inode - cook a new inode
> + * @sb: super-block
> + * @dir: parent directory
> + * @mode: file mode
> + * @dev: to be forwarded to init_special_inode()
> + */
> +static struct inode*
> +azfs_new_inode(struct super_block *sb, struct inode *dir, int mode, dev_t dev)
> +{
> + struct azfs_super *super;
> + struct inode *inode;
> +
> + inode = new_inode(sb);
> + if (!inode)
> + return NULL;
> +
> + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
> +
> + inode->i_mode = mode;
> + if (dir) {
> + dir->i_mtime = dir->i_ctime = inode->i_mtime;
> + inode->i_uid = current->fsuid;
> + if (dir->i_mode & S_ISGID) {
> + if (S_ISDIR(mode))
> + inode->i_mode |= S_ISGID;
> + inode->i_gid = dir->i_gid;
> + } else {
> + inode->i_gid = current->fsgid;
> + }
> + } else {
> + super = sb->s_fs_info;
> + inode->i_uid = super->uid;
> + inode->i_gid = super->gid;
> + }
> +
> + inode->i_blocks = 0;
> + inode->i_mapping->a_ops = &azfs_aops;
> + inode->i_mapping->backing_dev_info = &azfs_bdi;
> +
> + switch (mode & S_IFMT) {
> + case S_IFDIR:
> + inode->i_op = &azfs_dir_iops;
> + inode->i_fop = &simple_dir_operations;
> + inc_nlink(inode);
> + break;
> +
> + case S_IFREG:
> + inode->i_op = &azfs_reg_iops;
> + inode->i_fop = &azfs_reg_fops;
> + break;
> +
> + case S_IFLNK:
> + inode->i_op = &page_symlink_inode_operations;
> + break;
> +
> + default:
> + init_special_inode(inode, mode, dev);
> + break;
> + }
> +
> + return inode;
> +}
> +
> +/**
> + * azfs_alloc_inode - alloc_inode() method for super_operations
> + * @sb: see super_operations methods
> + */
> +static struct inode*
> +azfs_alloc_inode(struct super_block *sb)
> +{
> + struct azfs_znode *znode;
> +
> + znode = kmem_cache_alloc(azfs_znode_cache, GFP_KERNEL);
> +
> + INIT_LIST_HEAD(&znode->block_list);
> + rwlock_init(&znode->lock);
> +
> + inode_init_once(&znode->vfs_inode);
> +
> + return znode ? &znode->vfs_inode : NULL;
> +}
> +
> +/**
> + * azfs_destroy_inode - destroy_inode() method for super_operations
> + * @inode: see super_operations methods
> + */
> +static void
> +azfs_destroy_inode(struct inode *inode)
> +{
> + kmem_cache_free(azfs_znode_cache, I2Z(inode));
> +}
> +
> +/**
> + * azfs_delete_inode - delete_inode() method for super_operations
> + * @inode: see super_operations methods
> + */
> +static void
> +azfs_delete_inode(struct inode *inode)
> +{
> + if (S_ISREG(inode->i_mode)) {
> + i_size_write(inode, 0);
> + azfs_truncate(inode);
> + }
> + truncate_inode_pages(&inode->i_data, 0);
> + clear_inode(inode);
> +}
> +
> +/**
> + * azfs_statfs - statfs() method for super_operations
> + * @dentry, @stat: see super_operations methods
> + */
> +static int
> +azfs_statfs(struct dentry *dentry, struct kstatfs *stat)
> +{
> + struct super_block *sb;
> + struct azfs_super *super;
> + struct inode *inode;
> + unsigned long inodes, blocks;
> +
> + sb = dentry->d_sb;
> + super = sb->s_fs_info;
> +
> + inodes = blocks = 0;
> + mutex_lock(&sb->s_lock);
> + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
> + inodes++;
> + blocks += inode->i_blocks;
> + }
> + mutex_unlock(&sb->s_lock);
> +
> + stat->f_type = AZFS_SUPERBLOCK_MAGIC;
> + stat->f_bsize = super->block_size;
> + stat->f_blocks = super->media_size >> super->block_shift;
> + stat->f_bfree = stat->f_blocks - blocks;
> + stat->f_bavail = stat->f_blocks - blocks;
> + stat->f_files = inodes + blocks;
> + stat->f_ffree = blocks + 1;
> + stat->f_namelen = NAME_MAX;
> +
> + return 0;
> +}
> +
> +static struct super_operations azfs_ops = {
> + .alloc_inode = azfs_alloc_inode,
> + .destroy_inode = azfs_destroy_inode,
> + .drop_inode = generic_delete_inode,
> + .delete_inode = azfs_delete_inode,
> + .statfs = azfs_statfs
> +};
> +
> +enum {
> + Opt_blocksize_short,
> + Opt_blocksize_long,
> + Opt_uid,
> + Opt_gid,
> + Opt_err
> +};
> +
> +static match_table_t tokens = {
> + {Opt_blocksize_short, "bs=%u"},
> + {Opt_blocksize_long, "blocksize=%u"},
> + {Opt_uid, "uid=%u"},
> + {Opt_gid, "gid=%u"},
> + {Opt_err, NULL}
> +};
> +
> +/**
> + * azfs_parse_mount_parameters - parse options given to mount with -o
> + * @sb: super block
> + * @options: comma separated options
> + */
> +static int
> +azfs_parse_mount_parameters(struct super_block *sb, char *options)
> +{
> + struct azfs_super *super;
> + char *option;
> + int token, value;
> + substring_t args[MAX_OPT_ARGS];
> +
> + super = sb->s_fs_info;
> +
> + while ((option = strsep(&options, ",")) != NULL) {
> + if (!*option)
> + continue;
> +
> + token = match_token(option, tokens, args);
> + switch (token) {
> + case Opt_blocksize_short:
> + case Opt_blocksize_long:
> + if (match_int(&args[0], &value))
> + goto syntax_error;
> + super->block_size = value;
> + break;
> +
> + case Opt_uid:
> + if (match_int(&args[0], &value))
> + goto syntax_error;
> + super->uid = value;
> + break;
> +
> + case Opt_gid:
> + if (match_int(&args[0], &value))
> + goto syntax_error;
> + super->gid = value;
> + break;
> +
> + default:
> + goto syntax_error;
> + }
> + }
> +
> + return 1;
> +
> +syntax_error:
> + printk(KERN_ERR "%s: invalid mount option\n",
> + AZFS_FILESYSTEM_NAME);
> +
> + return 0;
> +}
> +
> +/**
> + * azfs_fill_super - fill_super routine for get_sb
> + * @sb, @data, @silent: see file_system_type methods
> + */
> +static int
> +azfs_fill_super(struct super_block *sb, void *data, int silent)
> +{
> + struct gendisk *disk;
> + struct azfs_super *super = NULL, *knoten;
> + struct azfs_block *block = NULL;
> + struct inode *inode = NULL;
> + void *kaddr;
> + unsigned long pfn;
> + int rc;
> +
> + BUG_ON(!sb->s_bdev);
> +
> + disk = sb->s_bdev->bd_disk;
> +
> + if (!disk || !disk->queue) {
> + printk(KERN_ERR "%s needs a block device which has a gendisk "
> + "with a queue\n",
> + AZFS_FILESYSTEM_NAME);
> + return -ENOSYS;
> + }
> +
> + if (!disk->fops->direct_access) {
> + printk(KERN_ERR "%s needs a block device with a "
> + "direct_access() method\n",
> + AZFS_FILESYSTEM_NAME);
> + return -ENOSYS;
> + }
> +
> + if (!get_device(disk->driverfs_dev)) {
> + printk(KERN_ERR "%s cannot get reference to device driver\n",
> + AZFS_FILESYSTEM_NAME);
> + return -EFAULT;
> + }
> +
> + sb->s_magic = AZFS_SUPERBLOCK_MAGIC;
> + sb->s_flags = AZFS_SUPERBLOCK_FLAGS;
> + sb->s_op = &azfs_ops;
> + sb->s_maxbytes = get_capacity(disk) * disk->queue->hardsect_size;
> + sb->s_time_gran = 1;
> +
> + spin_lock(&super_list.lock);
> + list_for_each_entry(knoten, &super_list.head, list)
> + if (knoten->blkdev == sb->s_bdev) {
> + super = knoten;
> + break;
> + }
> + spin_unlock(&super_list.lock);
> +
> + if (super) {
> + if (strlen((char*) data))
> + printk(KERN_WARNING "/dev/%s was already mounted with "
> + "%s before, it will be mounted with "
> + "mount options used last time, "
> + "options just given would be ignored\n",
> + disk->disk_name, AZFS_FILESYSTEM_NAME);
> + sb->s_fs_info = super;
> + } else {
> + super = kzalloc(sizeof(struct azfs_super), GFP_KERNEL);
> + if (!super) {
> + rc = -ENOMEM;
> + goto failed;
> + }
> + sb->s_fs_info = super;
> +
> + if (!azfs_parse_mount_parameters(sb, (char*) data)) {
> + rc = -EINVAL;
> + goto failed;
> + }
> +
> + inode = azfs_new_inode(sb, NULL, S_IFDIR | S_IRWXUGO, 0);
> + if (!inode) {
> + rc = -ENOMEM;
> + goto failed;
> + }
> +
> + super->root = d_alloc_root(inode);
> + if (!super->root) {
> + rc = -ENOMEM;
> + goto failed;
> + }
> + dget(super->root);
> +
> + INIT_LIST_HEAD(&super->list);
> + INIT_LIST_HEAD(&super->block_list);
> + rwlock_init(&super->lock);
> +
> + super->media_size = sb->s_maxbytes;
> +
> + if (!super->block_size)
> + super->block_size = sb->s_blocksize;
> + super->block_shift = blksize_bits(super->block_size);
> +
> + super->sector_size = disk->queue->hardsect_size;
> + super->sector_shift = blksize_bits(super->sector_size);
> +
> + super->blkdev = sb->s_bdev;
> +
> + block = azfs_block_init(&super->block_list,
> + 0, super->media_size >> super->block_shift);
> + if (!block) {
> + rc = -ENOMEM;
> + goto failed;
> + }
> +
> + rc = disk->fops->direct_access(super->blkdev, 0, &kaddr, &pfn);
> + if (rc < 0) {
> + rc = -EFAULT;
> + goto failed;
> + }
> + super->ph_addr = (unsigned long) kaddr;
> +
> + super->io_addr = (unsigned long) ioremap_flags(
> + super->ph_addr, super->media_size, _PAGE_NO_CACHE);
> + if (!super->io_addr) {
> + rc = -EFAULT;
> + goto failed;
> + }
> +
> + spin_lock(&super_list.lock);
> + list_add(&super->list, &super_list.head);
> + spin_unlock(&super_list.lock);
> + }
> +
> + sb->s_root = super->root;
> + disk->driverfs_dev->driver_data = super;
> + disk->driverfs_dev->platform_data = sb;
> +
> + if (super->block_size < PAGE_SIZE)
> + printk(KERN_INFO "Block size on %s is smaller then system "
> + "page size: mmap() would not be supported\n",
> + disk->disk_name);
> +
> + return 0;
> +
> +failed:
> + if (super) {
> + sb->s_root = NULL;
> + sb->s_fs_info = NULL;
> + if (block)
> + azfs_block_free(block);
> + if (super->root)
> + dput(super->root);
> + if (inode)
> + iput(inode);
> + disk->driverfs_dev->driver_data = NULL;
> + kfree(super);
> + disk->driverfs_dev->platform_data = NULL;
> + put_device(disk->driverfs_dev);
> + }
> +
> + return rc;
> +}
> +
> +/**
> + * azfs_get_sb - get_sb() method for file_system_type
> + * @fs_type, @flags, @dev_name, @data, @mount: see file_system_type methods
> + */
> +static int
> +azfs_get_sb(struct file_system_type *fs_type, int flags,
> + const char *dev_name, void *data, struct vfsmount *mount)
> +{
> + return get_sb_bdev(fs_type, flags,
> + dev_name, data, azfs_fill_super, mount);
> +}
> +
> +/**
> + * azfs_kill_sb - kill_sb() method for file_system_type
> + * @sb: see file_system_type methods
> + */
> +static void
> +azfs_kill_sb(struct super_block *sb)
> +{
> + sb->s_root = NULL;
> + kill_block_super(sb);
> +}
> +
> +static struct file_system_type azfs_fs = {
> + .owner = THIS_MODULE,
> + .name = AZFS_FILESYSTEM_NAME,
> + .get_sb = azfs_get_sb,
> + .kill_sb = azfs_kill_sb,
> + .fs_flags = AZFS_FILESYSTEM_FLAGS
> +};
> +
> +/**
> + * azfs_init
> + */
> +static int __init
> +azfs_init(void)
> +{
> + int rc;
> +
> + INIT_LIST_HEAD(&super_list.head);
> + spin_lock_init(&super_list.lock);
> +
> + azfs_znode_cache = kmem_cache_create("azfs_znode_cache",
> + sizeof(struct azfs_znode), 0, AZFS_CACHE_FLAGS, NULL);
> + if (!azfs_znode_cache) {
> + printk(KERN_ERR "Could not allocate inode cache for %s\n",
> + AZFS_FILESYSTEM_NAME);
> + rc = -ENOMEM;
> + goto failed;
> + }
> +
> + azfs_block_cache = kmem_cache_create("azfs_block_cache",
> + sizeof(struct azfs_block), 0, AZFS_CACHE_FLAGS, NULL);
> + if (!azfs_block_cache) {
> + printk(KERN_ERR "Could not allocate block cache for %s\n",
> + AZFS_FILESYSTEM_NAME);
> + rc = -ENOMEM;
> + goto failed;
> + }
> +
> + rc = register_filesystem(&azfs_fs);
> + if (rc != 0) {
> + printk(KERN_ERR "Could not register %s\n",
> + AZFS_FILESYSTEM_NAME);
> + goto failed;
> + }
> +
> + return 0;
> +
> +failed:
> + if (azfs_block_cache)
> + kmem_cache_destroy(azfs_block_cache);
> +
> + if (azfs_znode_cache)
> + kmem_cache_destroy(azfs_znode_cache);
> +
> + return rc;
> +}
> +
> +/**
> + * azfs_exit
> + */
> +static void __exit
> +azfs_exit(void)
> +{
> + struct azfs_super *super, *SUPER;

I think that yelling in deep desperation like that is not quite in
agreement with the kernel coding style.

> + struct azfs_block *block, *knoten;
> + struct gendisk *disk;
> +
> + spin_lock(&super_list.lock);
> + list_for_each_entry_safe(super, SUPER, &super_list.head, list) {
> + disk = super->blkdev->bd_disk;
> + list_del(&super->list);
> + iounmap((void*) super->io_addr);
> + write_lock(&super->lock);
> + for_each_block_safe(block, knoten, &super->block_list)
> + azfs_block_free(block);
> + write_unlock(&super->lock);
> + disk->driverfs_dev->driver_data = NULL;
> + disk->driverfs_dev->platform_data = NULL;
> + kfree(super);
> + put_device(disk->driverfs_dev);
> + }
> + spin_unlock(&super_list.lock);
> +
> + unregister_filesystem(&azfs_fs);
> +
> + kmem_cache_destroy(azfs_block_cache);
> + kmem_cache_destroy(azfs_znode_cache);
> +}
> +
> +module_init(azfs_init);
> +module_exit(azfs_exit);
> +
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("Maxim Shchetynin <[email protected]>");
> +MODULE_DESCRIPTION("Non-buffered file system for IO devices");
>

An unprecedented lack of comments in this driver can hardly boost the
reader's attention. Besides, I personally think that a kind of a design
document could be extremely useful - basically, explain the purpose of
the filesystem, the basic idead behind it, etc.

Thanks,
Dmitri

2008-06-17 15:52:22

by Jörn Engel

[permalink] [raw]
Subject: Re: AZFS file system proposal

On Tue, 17 June 2008 16:36:28 +0200, Jan Engelhardt wrote:
>
> tmpfs plus XIP

That looks seriously disturbed. Normal filesystems have a backing store
plus a page cache. Tmpfs removes the backing store and keeps everything
in the page cache. XIP removes the page cache and leaves everything in
the backing store - which is memory.

Would tmpfs plus XIP remove both the page cache and the backing store?

Jörn

--
Data dominates. If you've chosen the right data structures and organized
things well, the algorithms will almost always be self-evident. Data
structures, not algorithms, are central to programming.
-- Rob Pike

2008-06-18 11:15:48

by Maxim Shchetynin

[permalink] [raw]
Subject: Re: AZFS file system proposal

> >> >> (a) What is axonram?
> >> >> (b) why is axonram direct_access, and my other devices (including
> >> >> standard PC RAM) not?
> >> >
> >> >[Axonram is a module specific for IBM CellBlade and allows access to
> >> >a DDR2 memory attached onto Axon controller.] You don't need
> >> >direct_access and not azfs for main memory because there is already
> >> >a pretty nice tmpfs for it.
> >>
> >> So the reason azfs could be useful is...?
> >
> >The "?" at the end of your sentence might mean a question. I would
> >be really happy to give an answer on it if I could understand it.
>
> If one does not need direct_access nor azfs, and ext2 plus XIP minus
> caching or tmpfs plus XIP can achieve the same effect, what can azfs
> that we could not already do without azfs?

Our users want to have a file system with bigger block sizes (64KB, 16MB, ...). They also don't want to keep metadata on DDR2 media, but to be able to use a complete DDR2 for their applications data.

--
Mit freundlichen Grüßen / met vriendelijke groeten / avec regards

Maxim V. Shchetynin
Linux Kernel Entwicklung
IBM Deutschland Entwicklung GmbH
Linux für Cell, Abteilung 3250
Schönaicher Straße 220
71032 Böblingen

Vorsitzender des Aufsichtsrats: Johann Weihen
Geschäftsführung: Herbert Kircher
Sitz der Gesellschaft: Böblingen
Registriergericht: Amtsgericht Stuttgart, HRB 243294

Fahr nur so schnell wie dein Schutzengel fliegen kann!

2008-06-18 11:22:04

by Maxim Shchetynin

[permalink] [raw]
Subject: Re: AZFS file system proposal

On Tue, 17 Jun 2008 16:36:28 +0200 (CEST)
Jan Engelhardt <[email protected]> wrote:

>
> On Tuesday 2008-06-17 13:57, Maxim Shchetynin wrote:
>
> >> >> (a) What is axonram?
> >> >> (b) why is axonram direct_access, and my other devices (including
> >> >> standard PC RAM) not?
> >> >
> >> >[Axonram is a module specific for IBM CellBlade and allows access to
> >> >a DDR2 memory attached onto Axon controller.] You don't need
> >> >direct_access and not azfs for main memory because there is already
> >> >a pretty nice tmpfs for it.
> >>
> >> So the reason azfs could be useful is...?
> >
> >The "?" at the end of your sentence might mean a question. I would
> >be really happy to give an answer on it if I could understand it.
>
> If one does not need direct_access nor azfs, and ext2 plus XIP minus
> caching or tmpfs plus XIP can achieve the same effect, what can azfs
> that we could not already do without azfs?

tmpfs plus xip? I'm 100% sure noone have tryed it yet. Does anyone want to? I not.

--
Mit freundlichen Grüßen / met vriendelijke groeten / avec regards

Maxim V. Shchetynin
Linux Kernel Entwicklung
IBM Deutschland Entwicklung GmbH
Linux für Cell, Abteilung 3250
Schönaicher Straße 220
71032 Böblingen

Vorsitzender des Aufsichtsrats: Johann Weihen
Geschäftsführung: Herbert Kircher
Sitz der Gesellschaft: Böblingen
Registriergericht: Amtsgericht Stuttgart, HRB 243294

Fahr nur so schnell wie dein Schutzengel fliegen kann!

2008-06-18 11:27:30

by Christoph Hellwig

[permalink] [raw]
Subject: Re: AZFS file system proposal

> +#define AZFS_FILESYSTEM_NAME "azfs"
> +#define AZFS_FILESYSTEM_FLAGS FS_REQUIRES_DEV
> +
> +#define AZFS_SUPERBLOCK_MAGIC 0xABBA1972
> +#define AZFS_SUPERBLOCK_FLAGS MS_NOEXEC | \
> + MS_SYNCHRONOUS | \
> + MS_DIRSYNC | \
> + MS_ACTIVE
> +
> +#define AZFS_BDI_CAPABILITIES BDI_CAP_NO_ACCT_DIRTY | \
> + BDI_CAP_NO_WRITEBACK | \
> + BDI_CAP_MAP_COPY | \
> + BDI_CAP_MAP_DIRECT | \
> + BDI_CAP_VMFLAGS
> +
> +#define AZFS_CACHE_FLAGS SLAB_HWCACHE_ALIGN | \
> + SLAB_RECLAIM_ACCOUNT | \
> + SLAB_MEM_SPREAD
> +
> +enum azfs_direction {
> + AZFS_MMAP,
> + AZFS_READ,
> + AZFS_WRITE
> +};
> +
> +struct azfs_super {
> + struct list_head list;
> + unsigned long media_size;
> + unsigned long block_size;
> + unsigned short block_shift;
> + unsigned long sector_size;
> + unsigned short sector_shift;
> + uid_t uid;
> + gid_t gid;
> + unsigned long ph_addr;
> + unsigned long io_addr;
> + struct block_device *blkdev;
> + struct dentry *root;
> + struct list_head block_list;
> + rwlock_t lock;
> +};
> +
> +struct azfs_super_list {
> + struct list_head head;
> + spinlock_t lock;
> +};
> +
> +struct azfs_block {
> + struct list_head list;
> + unsigned long id;
> + unsigned long count;
> +};
> +
> +struct azfs_znode {
> + struct list_head block_list;
> + rwlock_t lock;
> + loff_t size;
> + struct inode vfs_inode;
> +};
> +
> +static struct azfs_super_list super_list;
> +static struct kmem_cache *azfs_znode_cache __read_mostly = NULL;
> +static struct kmem_cache *azfs_block_cache __read_mostly = NULL;

> +static unsigned long
> +azfs_recherche(struct inode *inode, enum azfs_direction direction,
> + unsigned long from, unsigned long *size)
> +{
> + struct azfs_super *super;
> + struct azfs_znode *znode;
> + struct azfs_block *block;
> + unsigned long block_id, west, east;
> +
> + super = inode->i_sb->s_fs_info;
> + znode = I2Z(inode);
> +
> + if (from + *size > znode->size) {
> + i_size_write(inode, from + *size);
> + inode->i_op->truncate(inode);
> + }
> +
> + read_lock(&znode->lock);
> +
> + if (list_empty(&znode->block_list)) {
> + read_unlock(&znode->lock);
> + return 0;
> + }
> +
> + block_id = from >> super->block_shift;
> +
> + for_each_block(block, &znode->block_list) {
> + if (block->count > block_id)
> + break;
> + block_id -= block->count;
> + }
> +
> + west = from % super->block_size;
> + east = ((block->count - block_id) << super->block_shift) - west;
> +
> + if (*size > east)
> + *size = east;
> +
> + block_id = ((block->id + block_id) << super->block_shift) + west;
> +
> + read_unlock(&znode->lock);
> +
> + block_id += direction == AZFS_MMAP ? super->ph_addr : super->io_addr;
> +
> + return block_id;
> +}

> +azfs_aio_read(struct kiocb *iocb, const struct iovec *iov,
> + unsigned long nr_segs, loff_t pos)
> +{
> + struct inode *inode;
> + void *ziel;
> + unsigned long pin;
> + unsigned long size, todo, step;
> + ssize_t rc;
> +
> + inode = iocb->ki_filp->f_mapping->host;
> +
> + mutex_lock(&inode->i_mutex);
> +
> + if (pos >= i_size_read(inode)) {
> + rc = 0;
> + goto out;
> + }
> +
> + ziel = iov->iov_base;
> + todo = min((loff_t) iov->iov_len, i_size_read(inode) - pos);
> +
> + for (step = todo; step; step -= size) {
> + size = step;
> + pin = azfs_recherche(inode, AZFS_READ, pos, &size);
> + if (!pin) {
> + rc = -ENOSPC;
> + goto out;
> + }
> + if (copy_to_user(ziel, (void*) pin, size)) {
> + rc = -EFAULT;
> + goto out;
> + }
> +
> + iocb->ki_pos += size;
> + pos += size;
> + ziel += size;
> + }
> +
> + rc = todo;
> +
> +out:
> + mutex_unlock(&inode->i_mutex);
> +
> + return rc;
> +}
> +
> +/**
> + * azfs_aio_write - aio_write() method for file_operations
> + * @iocb, @iov, @nr_segs, @pos: see file_operations methods
> + */
> +static ssize_t
> +azfs_aio_write(struct kiocb *iocb, const struct iovec *iov,
> + unsigned long nr_segs, loff_t pos)
> +{
> + struct inode *inode;
> + void *quell;
> + unsigned long pin;
> + unsigned long size, todo, step;
> + ssize_t rc;
> +
> + inode = iocb->ki_filp->f_mapping->host;
> +
> + quell = iov->iov_base;
> + todo = iov->iov_len;
> +
> + mutex_lock(&inode->i_mutex);
> +
> + for (step = todo; step; step -= size) {
> + size = step;
> + pin = azfs_recherche(inode, AZFS_WRITE, pos, &size);
> + if (!pin) {
> + rc = -ENOSPC;
> + goto out;
> + }
> + if (copy_from_user((void*) pin, quell, size)) {
> + rc = -EFAULT;
> + goto out;
> + }
> +
> + iocb->ki_pos += size;
> + pos += size;
> + quell += size;
> + }
> +
> + rc = todo;
> +
> +out:
> + mutex_unlock(&inode->i_mutex);
> +
> + return rc;
> +}
> +
> +/**
> + * azfs_open - open() method for file_operations
> + * @inode, @file: see file_operations methods

> +static int
> +azfs_open(struct inode *inode, struct file *file)
> +{
> + file->private_data = inode;
> +
> + if (file->f_flags & O_TRUNC) {
> + i_size_write(inode, 0);
> + inode->i_op->truncate(inode);
> + }
> + if (file->f_flags & O_APPEND)
> + inode->i_fop->llseek(file, 0, SEEK_END);
> +
> + return 0;

truncate and seeking are done by the VFS, not need to do it.
Also no need to stuff the inode in file->private_data because
it's always available through file->f_path.dentry->inode.

> +/**
> + * azfs_alloc_inode - alloc_inode() method for super_operations
> + * @sb: see super_operations methods
> + */
> +static struct inode*
> +azfs_alloc_inode(struct super_block *sb)
> +{
> + struct azfs_znode *znode;
> +
> + znode = kmem_cache_alloc(azfs_znode_cache, GFP_KERNEL);
> +
> + INIT_LIST_HEAD(&znode->block_list);
> + rwlock_init(&znode->lock);

You need to check for an NULL pointer from kmem_cache_alloc
here.

> +/**
> + * azfs_delete_inode - delete_inode() method for super_operations
> + * @inode: see super_operations methods
> + */
> +static void
> +azfs_delete_inode(struct inode *inode)
> +{
> + if (S_ISREG(inode->i_mode)) {
> + i_size_write(inode, 0);
> + azfs_truncate(inode);
> + }
> + truncate_inode_pages(&inode->i_data, 0);
> + clear_inode(inode);
> +}

> +/**
> + * azfs_fill_super - fill_super routine for get_sb
> + * @sb, @data, @silent: see file_system_type methods
> + */
> +static int
> +azfs_fill_super(struct super_block *sb, void *data, int silent)
> +{
> + struct gendisk *disk;
> + struct azfs_super *super = NULL, *knoten;
> + struct azfs_block *block = NULL;
> + struct inode *inode = NULL;
> + void *kaddr;
> + unsigned long pfn;
> + int rc;
> +
> + BUG_ON(!sb->s_bdev);
> +
> + disk = sb->s_bdev->bd_disk;
> +
> + if (!disk || !disk->queue) {

This won't ever be zero, no need to check.

> + if (!get_device(disk->driverfs_dev)) {
> + printk(KERN_ERR "%s cannot get reference to device driver\n",
> + AZFS_FILESYSTEM_NAME);
> + return -EFAULT;
> + }

You don't need another reference, the disk won't go away while the
block device is open.

> + spin_lock(&super_list.lock);
> + list_for_each_entry(knoten, &super_list.head, list)
> + if (knoten->blkdev == sb->s_bdev) {
> + super = knoten;
> + break;
> + }
> + spin_unlock(&super_list.lock);

This can't happen. get_sb_bdev already searches for the same superblock
already existing and doesn't even call into fill_super in that case.

> +
> +/**
> + * azfs_kill_sb - kill_sb() method for file_system_type
> + * @sb: see file_system_type methods
> + */
> +static void
> +azfs_kill_sb(struct super_block *sb)
> +{
> + sb->s_root = NULL;

Very bad idea, if you set sb->s_root to zero before calling
generic_shutdown_super it will miss a lot of the taerdown activity.

> + spin_lock(&super_list.lock);
> + list_for_each_entry_safe(super, SUPER, &super_list.head, list) {
> + disk = super->blkdev->bd_disk;
> + list_del(&super->list);
> + iounmap((void*) super->io_addr);
> + write_lock(&super->lock);
> + for_each_block_safe(block, knoten, &super->block_list)
> + azfs_block_free(block);
> + write_unlock(&super->lock);
> + disk->driverfs_dev->driver_data = NULL;
> + disk->driverfs_dev->platform_data = NULL;
> + kfree(super);
> + put_device(disk->driverfs_dev);

All this teardown should happen in ->put_super, and with this and
the above comment there should be need for a list of all superblocks.

2008-06-18 14:01:40

by Maxim Shchetynin

[permalink] [raw]
Subject: Re: AZFS file system proposal

I have updated my patch and will post it here in a few minutes.
Thank you very much Dmitri, all your comments are useful.

> > +static unsigned long
> > +azfs_recherche(struct inode *inode, enum azfs_direction direction,
>
> At the risk of being damned by the entire francophone world, I'd still
> suggest using an English keyword for the function name here.

Fixed.

> > +static struct inode*
> > +azfs_new_inode(struct super_block *, struct inode *, int, dev_t);
>
> Would it not be better to place this function prototype along with the
> bunch of macro definitions you have above into a private header?

I have a pretty few such definitions and would like to avoid adding to kernel an unnecessary .h file with just a couple of lines.

> > + if (rc == 0)
>
> Maybe "if (!rc)" ?

Not critical, but fixed.

> > + void *ziel;
>
> void *target?

Fixed.

> > + void *quell;
>
> void *source?

Fixed.

> > + struct azfs_block *block, *ding, *knoten, *west, *east;
>
> The risk of me getting damned increases with that, but maybe it would be
> better to use an English keyword for "knoten"?

Fixed.

> > + struct azfs_super *super, *SUPER;
>
> I think that yelling in deep desperation like that is not quite in
> agreement with the kernel coding style.

Fixed.

> An unprecedented lack of comments in this driver can hardly boost the
> reader's attention. Besides, I personally think that a kind of a design
> document could be extremely useful - basically, explain the purpose of
> the filesystem, the basic idead behind it, etc.

I have added Documentation/filesystems/azfs.txt file with a short description of what AZFS is.

--
Mit freundlichen Grüßen / met vriendelijke groeten / avec regards

Maxim V. Shchetynin
Linux Kernel Entwicklung
IBM Deutschland Entwicklung GmbH
Linux für Cell, Abteilung 3250
Schönaicher Straße 220
71032 Böblingen

Vorsitzender des Aufsichtsrats: Johann Weihen
Geschäftsführung: Herbert Kircher
Sitz der Gesellschaft: Böblingen
Registriergericht: Amtsgericht Stuttgart, HRB 243294

Fahr nur so schnell wie dein Schutzengel fliegen kann!

2008-06-18 14:04:20

by Maxim Shchetynin

[permalink] [raw]
Subject: Re: AZFS file system proposal

Thank you very much Christoph for your comments. Some of them I have already found useful and fixed my code accordinly. Some other your comments I find interesting but would like to investigate them a little bit more.
I will send an updated version of my patch soon.

> truncate and seeking are done by the VFS, not need to do it.

I think VFS is doing it not exactly the same way which I need. But I will look at it once more again to make sure.

> Also no need to stuff the inode in file->private_data because
> it's always available through file->f_path.dentry->inode.

Fixed.

> You need to check for an NULL pointer from kmem_cache_alloc
> here.

Fixed.

> > + if (!disk || !disk->queue) {
>
> This won't ever be zero, no need to check.

Agree 50%. I have removed a printk from here but have left the following line;
BUG_ON(!disk || !disk->queue);

> > + if (!get_device(disk->driverfs_dev)) {
> > + printk(KERN_ERR "%s cannot get reference to device driver\n",
> > + AZFS_FILESYSTEM_NAME);
> > + return -EFAULT;
> > + }
>
> You don't need another reference, the disk won't go away while the
> block device is open.

Not agree. I leave get_device here but remove the printk.

> > + spin_lock(&super_list.lock);
> > + list_for_each_entry(knoten, &super_list.head, list)
> > + if (knoten->blkdev == sb->s_bdev) {
> > + super = knoten;
> > + break;
> > + }
> > + spin_unlock(&super_list.lock);
>
> This can't happen. get_sb_bdev already searches for the same superblock
> already existing and doesn't even call into fill_super in that case.

I will check it.

> > +static void
> > +azfs_kill_sb(struct super_block *sb)
> > +{
> > + sb->s_root = NULL;
>
> Very bad idea, if you set sb->s_root to zero before calling
> generic_shutdown_super it will miss a lot of the taerdown activity.

I need it because I want to keep all super blocks and inodes to make it possible to mount the same AZFS partition later and to let user see all his files again. Don't forget - AZFS keeps all the inode data in RAM.

> > + spin_lock(&super_list.lock);
> > + list_for_each_entry_safe(super, SUPER, &super_list.head, list) {
> > + disk = super->blkdev->bd_disk;
> > + list_del(&super->list);
> > + iounmap((void*) super->io_addr);
> > + write_lock(&super->lock);
> > + for_each_block_safe(block, knoten, &super->block_list)
> > + azfs_block_free(block);
> > + write_unlock(&super->lock);
> > + disk->driverfs_dev->driver_data = NULL;
> > + disk->driverfs_dev->platform_data = NULL;
> > + kfree(super);
> > + put_device(disk->driverfs_dev);
>
> All this teardown should happen in ->put_super, and with this and
> the above comment there should be need for a list of all superblocks.

Same thing - super blocks and inodes of unmounted file systems are still in RAM.

--
Mit freundlichen Grüßen / met vriendelijke groeten / avec regards

Maxim V. Shchetynin
Linux Kernel Entwicklung
IBM Deutschland Entwicklung GmbH
Linux für Cell, Abteilung 3250
Schönaicher Straße 220
71032 Böblingen

Vorsitzender des Aufsichtsrats: Johann Weihen
Geschäftsführung: Herbert Kircher
Sitz der Gesellschaft: Böblingen
Registriergericht: Amtsgericht Stuttgart, HRB 243294

Fahr nur so schnell wie dein Schutzengel fliegen kann!

2008-06-18 14:06:39

by Maxim Shchetynin

[permalink] [raw]
Subject: Re: AZFS file system proposal

AZFS patch updated accordinly to comments of Christoph Hellwig and Dmitri Vorobiev.

Subject: azfs: initial submit of azfs, a non-buffered filesystem

From: Maxim Shchetynin <[email protected]>

AZFS is a file system which keeps all files on memory mapped random
access storage. It was designed to work on the axonram device driver
for IBM QS2x blade servers, but can operate on any block device
that exports a direct_access method.

Signed-off-by: Maxim Shchetynin <[email protected]>

diff -Nuar linux-2.6.26-rc6/Documentation/filesystems/azfs.txt linux-2.6.26-rc6-azfs/Documentation/filesystems/azfs.txt
--- linux-2.6.26-rc6/Documentation/filesystems/azfs.txt 1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.26-rc6-azfs/Documentation/filesystems/azfs.txt 2008-06-18 14:27:03.492902242 +0200
@@ -0,0 +1,22 @@
+AZFS is a file system which keeps all files on memory mapped random
+access storage. It was designed to work on the axonram device driver
+for IBM QS2x blade servers, but can operate on any block device
+that exports a direct_access method.
+
+Everything in AZFS is temporary in the sense that all the data stored
+therein is lost when you switch off or reboot a system. If you unmount
+an AZFS instance, all the data will be kept on device as long your system
+is not shut down or rebooted. You can later mount AZFS on from device again
+to get access to your files.
+
+AZFS uses a block device only for data but not for file information.
+All inodes (file and directory information) is kept in RAM.
+
+When you mount AZFS you are able to specify a file system block size with
+'-o bs=<size in bytes>' option. There are no software limitations for
+a block size but you would not be able to mmap files on AZFS if block size
+is less than a system page size. If no '-o bs' option is specified on mount
+a block size of the used block device is used as a default block size for AZFS.
+
+Other available mount options for AZFS are '-o uid=<id>' and '-o gid=<id>',
+which allow you to set the owner and group of the root of the file system.
diff -Nuar linux-2.6.26-rc6/arch/powerpc/configs/cell_defconfig linux-2.6.26-rc6-azfs/arch/powerpc/configs/cell_defconfig
--- linux-2.6.26-rc6/arch/powerpc/configs/cell_defconfig 2008-06-12 23:22:24.000000000 +0200
+++ linux-2.6.26-rc6-azfs/arch/powerpc/configs/cell_defconfig 2008-06-16 11:15:37.000000000 +0200
@@ -240,6 +240,7 @@
# CPU Frequency drivers
#
CONFIG_AXON_RAM=m
+CONFIG_AZ_FS=m
# CONFIG_FSL_ULI1575 is not set

#
diff -Nuar linux-2.6.26-rc6/fs/Kconfig linux-2.6.26-rc6-azfs/fs/Kconfig
--- linux-2.6.26-rc6/fs/Kconfig 2008-06-12 23:22:24.000000000 +0200
+++ linux-2.6.26-rc6-azfs/fs/Kconfig 2008-06-16 11:17:34.000000000 +0200
@@ -1017,6 +1017,21 @@
config HUGETLB_PAGE
def_bool HUGETLBFS

+config AZ_FS
+ tristate "AZFS filesystem support"
+ help
+ azfs is a file system for I/O attached memory backing. It requires
+ a block device with direct_access capability, e.g. axonram.
+ Mounting such device with azfs gives memory mapped access to the
+ underlying memory to user space.
+
+ Read <file:Documentation/filesystems/azfs.txt> for details.
+
+ To compile this file system support as a module, choose M here: the
+ module will be called azfs.
+
+ If unsure, say N.
+
config CONFIGFS_FS
tristate "Userspace-driven configuration filesystem"
depends on SYSFS
diff -Nuar linux-2.6.26-rc6/fs/Makefile linux-2.6.26-rc6-azfs/fs/Makefile
--- linux-2.6.26-rc6/fs/Makefile 2008-06-12 23:22:24.000000000 +0200
+++ linux-2.6.26-rc6-azfs/fs/Makefile 2008-06-16 11:17:50.000000000 +0200
@@ -119,3 +119,4 @@
obj-$(CONFIG_DEBUG_FS) += debugfs/
obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_GFS2_FS) += gfs2/
+obj-$(CONFIG_AZ_FS) += azfs.o
diff -Nuar linux-2.6.26-rc6/fs/azfs.c linux-2.6.26-rc6-azfs/fs/azfs.c
--- linux-2.6.26-rc6/fs/azfs.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.26-rc6-azfs/fs/azfs.c 2008-06-18 15:56:13.252266896 +0200
@@ -0,0 +1,1171 @@
+/*
+ * (C) Copyright IBM Deutschland Entwicklung GmbH 2007
+ *
+ * Author: Maxim Shchetynin <[email protected]>
+ *
+ * Non-buffered filesystem driver.
+ * It registers a filesystem which may be used for all kind of block devices
+ * which have a direct_access() method in block_device_operations.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
+#include <linux/cache.h>
+#include <linux/dcache.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/genhd.h>
+#include <linux/kernel.h>
+#include <linux/limits.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/mm.h>
+#include <linux/mm_types.h>
+#include <linux/mutex.h>
+#include <linux/namei.h>
+#include <linux/pagemap.h>
+#include <linux/parser.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/stat.h>
+#include <linux/statfs.h>
+#include <linux/string.h>
+#include <linux/time.h>
+#include <linux/types.h>
+#include <linux/aio.h>
+#include <linux/uio.h>
+#include <asm/bug.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/string.h>
+
+#define AZFS_FILESYSTEM_NAME "azfs"
+#define AZFS_FILESYSTEM_FLAGS FS_REQUIRES_DEV
+
+#define AZFS_SUPERBLOCK_MAGIC 0xABBA1972
+#define AZFS_SUPERBLOCK_FLAGS MS_NOEXEC | \
+ MS_SYNCHRONOUS | \
+ MS_DIRSYNC | \
+ MS_ACTIVE
+
+#define AZFS_BDI_CAPABILITIES BDI_CAP_NO_ACCT_DIRTY | \
+ BDI_CAP_NO_WRITEBACK | \
+ BDI_CAP_MAP_COPY | \
+ BDI_CAP_MAP_DIRECT | \
+ BDI_CAP_VMFLAGS
+
+#define AZFS_CACHE_FLAGS SLAB_HWCACHE_ALIGN | \
+ SLAB_RECLAIM_ACCOUNT | \
+ SLAB_MEM_SPREAD
+
+enum azfs_direction {
+ AZFS_MMAP,
+ AZFS_READ,
+ AZFS_WRITE
+};
+
+struct azfs_super {
+ struct list_head list;
+ unsigned long media_size;
+ unsigned long block_size;
+ unsigned short block_shift;
+ unsigned long sector_size;
+ unsigned short sector_shift;
+ uid_t uid;
+ gid_t gid;
+ unsigned long ph_addr;
+ unsigned long io_addr;
+ struct block_device *blkdev;
+ struct dentry *root;
+ struct list_head block_list;
+ rwlock_t lock;
+};
+
+struct azfs_super_list {
+ struct list_head head;
+ spinlock_t lock;
+};
+
+struct azfs_block {
+ struct list_head list;
+ unsigned long id;
+ unsigned long count;
+};
+
+struct azfs_znode {
+ struct list_head block_list;
+ rwlock_t lock;
+ loff_t size;
+ struct inode vfs_inode;
+};
+
+static struct azfs_super_list super_list;
+static struct kmem_cache *azfs_znode_cache __read_mostly = NULL;
+static struct kmem_cache *azfs_block_cache __read_mostly = NULL;
+
+#define I2Z(inode) \
+ container_of(inode, struct azfs_znode, vfs_inode)
+
+#define for_each_block(block, block_list) \
+ list_for_each_entry(block, block_list, list)
+#define for_each_block_reverse(block, block_list) \
+ list_for_each_entry_reverse(block, block_list, list)
+#define for_each_block_safe(block, temp, block_list) \
+ list_for_each_entry_safe(block, temp, block_list, list)
+#define for_each_block_safe_reverse(block, temp, block_list) \
+ list_for_each_entry_safe_reverse(block, temp, block_list, list)
+
+/**
+ * azfs_block_init - create and initialise a new block in a list
+ * @block_list: destination list
+ * @id: block id
+ * @count: size of a block
+ */
+static inline struct azfs_block*
+azfs_block_init(struct list_head *block_list,
+ unsigned long id, unsigned long count)
+{
+ struct azfs_block *block;
+
+ block = kmem_cache_alloc(azfs_block_cache, GFP_KERNEL);
+ if (!block)
+ return NULL;
+
+ block->id = id;
+ block->count = count;
+
+ INIT_LIST_HEAD(&block->list);
+ list_add_tail(&block->list, block_list);
+
+ return block;
+}
+
+/**
+ * azfs_block_free - remove block from a list and free it back in cache
+ * @block: block to be removed
+ */
+static inline void
+azfs_block_free(struct azfs_block *block)
+{
+ list_del(&block->list);
+ kmem_cache_free(azfs_block_cache, block);
+}
+
+/**
+ * azfs_block_move - move block to another list
+ * @block: block to be moved
+ * @block_list: destination list
+ */
+static inline void
+azfs_block_move(struct azfs_block *block, struct list_head *block_list)
+{
+ list_move_tail(&block->list, block_list);
+}
+
+/**
+ * azfs_block_find - get real address of a part of a file
+ * @inode: inode
+ * @direction: data direction
+ * @from: offset for read/write operation
+ * @size: pointer to a value of the amount of data to be read/written
+ */
+static unsigned long
+azfs_block_find(struct inode *inode, enum azfs_direction direction,
+ unsigned long from, unsigned long *size)
+{
+ struct azfs_super *super;
+ struct azfs_znode *znode;
+ struct azfs_block *block;
+ unsigned long block_id, west, east;
+
+ super = inode->i_sb->s_fs_info;
+ znode = I2Z(inode);
+
+ if (from + *size > znode->size) {
+ i_size_write(inode, from + *size);
+ inode->i_op->truncate(inode);
+ }
+
+ read_lock(&znode->lock);
+
+ if (list_empty(&znode->block_list)) {
+ read_unlock(&znode->lock);
+ return 0;
+ }
+
+ block_id = from >> super->block_shift;
+
+ for_each_block(block, &znode->block_list) {
+ if (block->count > block_id)
+ break;
+ block_id -= block->count;
+ }
+
+ west = from % super->block_size;
+ east = ((block->count - block_id) << super->block_shift) - west;
+
+ if (*size > east)
+ *size = east;
+
+ block_id = ((block->id + block_id) << super->block_shift) + west;
+
+ read_unlock(&znode->lock);
+
+ block_id += direction == AZFS_MMAP ? super->ph_addr : super->io_addr;
+
+ return block_id;
+}
+
+static struct inode*
+azfs_new_inode(struct super_block *, struct inode *, int, dev_t);
+
+/**
+ * azfs_mknod - mknod() method for inode_operations
+ * @dir, @dentry, @mode, @dev: see inode_operations methods
+ */
+static int
+azfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
+{
+ struct inode *inode;
+
+ inode = azfs_new_inode(dir->i_sb, dir, mode, dev);
+ if (!inode)
+ return -ENOSPC;
+
+ if (S_ISREG(mode))
+ I2Z(inode)->size = 0;
+
+ dget(dentry);
+ d_instantiate(dentry, inode);
+
+ return 0;
+}
+
+/**
+ * azfs_create - create() method for inode_operations
+ * @dir, @dentry, @mode, @nd: see inode_operations methods
+ */
+static int
+azfs_create(struct inode *dir, struct dentry *dentry, int mode,
+ struct nameidata *nd)
+{
+ return azfs_mknod(dir, dentry, mode | S_IFREG, 0);
+}
+
+/**
+ * azfs_mkdir - mkdir() method for inode_operations
+ * @dir, @dentry, @mode: see inode_operations methods
+ */
+static int
+azfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+ int rc;
+
+ rc = azfs_mknod(dir, dentry, mode | S_IFDIR, 0);
+ if (!rc)
+ inc_nlink(dir);
+
+ return rc;
+}
+
+/**
+ * azfs_symlink - symlink() method for inode_operations
+ * @dir, @dentry, @name: see inode_operations methods
+ */
+static int
+azfs_symlink(struct inode *dir, struct dentry *dentry, const char *name)
+{
+ struct inode *inode;
+ int rc;
+
+ inode = azfs_new_inode(dir->i_sb, dir, S_IFLNK | S_IRWXUGO, 0);
+ if (!inode)
+ return -ENOSPC;
+
+ rc = page_symlink(inode, name, strlen(name) + 1);
+ if (rc) {
+ iput(inode);
+ return rc;
+ }
+
+ dget(dentry);
+ d_instantiate(dentry, inode);
+
+ return 0;
+}
+
+/**
+ * azfs_aio_read - aio_read() method for file_operations
+ * @iocb, @iov, @nr_segs, @pos: see file_operations methods
+ */
+static ssize_t
+azfs_aio_read(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct inode *inode;
+ void *target;
+ unsigned long pin;
+ unsigned long size, todo, step;
+ ssize_t rc;
+
+ inode = iocb->ki_filp->f_mapping->host;
+
+ mutex_lock(&inode->i_mutex);
+
+ if (pos >= i_size_read(inode)) {
+ rc = 0;
+ goto out;
+ }
+
+ target = iov->iov_base;
+ todo = min((loff_t) iov->iov_len, i_size_read(inode) - pos);
+
+ for (step = todo; step; step -= size) {
+ size = step;
+ pin = azfs_block_find(inode, AZFS_READ, pos, &size);
+ if (!pin) {
+ rc = -ENOSPC;
+ goto out;
+ }
+ if (copy_to_user(target, (void*) pin, size)) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ iocb->ki_pos += size;
+ pos += size;
+ target += size;
+ }
+
+ rc = todo;
+
+out:
+ mutex_unlock(&inode->i_mutex);
+
+ return rc;
+}
+
+/**
+ * azfs_aio_write - aio_write() method for file_operations
+ * @iocb, @iov, @nr_segs, @pos: see file_operations methods
+ */
+static ssize_t
+azfs_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct inode *inode;
+ void *source;
+ unsigned long pin;
+ unsigned long size, todo, step;
+ ssize_t rc;
+
+ inode = iocb->ki_filp->f_mapping->host;
+
+ source = iov->iov_base;
+ todo = iov->iov_len;
+
+ mutex_lock(&inode->i_mutex);
+
+ for (step = todo; step; step -= size) {
+ size = step;
+ pin = azfs_block_find(inode, AZFS_WRITE, pos, &size);
+ if (!pin) {
+ rc = -ENOSPC;
+ goto out;
+ }
+ if (copy_from_user((void*) pin, source, size)) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ iocb->ki_pos += size;
+ pos += size;
+ source += size;
+ }
+
+ rc = todo;
+
+out:
+ mutex_unlock(&inode->i_mutex);
+
+ return rc;
+}
+
+/**
+ * azfs_open - open() method for file_operations
+ * @inode, @file: see file_operations methods
+ */
+static int
+azfs_open(struct inode *inode, struct file *file)
+{
+ if (file->f_flags & O_TRUNC) {
+ i_size_write(inode, 0);
+ inode->i_op->truncate(inode);
+ }
+ if (file->f_flags & O_APPEND)
+ inode->i_fop->llseek(file, 0, SEEK_END);
+
+ return 0;
+}
+
+/**
+ * azfs_mmap - mmap() method for file_operations
+ * @file, @vm: see file_operations methods
+ */
+static int
+azfs_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct azfs_super *super;
+ struct azfs_znode *znode;
+ struct inode *inode;
+ unsigned long cursor, pin;
+ unsigned long todo, size, vm_start;
+ pgprot_t page_prot;
+
+ inode = file->f_dentry->d_inode;
+ znode = I2Z(inode);
+ super = inode->i_sb->s_fs_info;
+
+ if (super->block_size < PAGE_SIZE)
+ return -EINVAL;
+
+ cursor = vma->vm_pgoff << super->block_shift;
+ todo = vma->vm_end - vma->vm_start;
+
+ if (cursor + todo > i_size_read(inode))
+ return -EINVAL;
+
+ page_prot = pgprot_val(vma->vm_page_prot);
+ page_prot |= (_PAGE_NO_CACHE | _PAGE_RW);
+ page_prot &= ~_PAGE_GUARDED;
+ vma->vm_page_prot = __pgprot(page_prot);
+
+ vm_start = vma->vm_start;
+ for (size = todo; todo; todo -= size, size = todo) {
+ pin = azfs_block_find(inode, AZFS_MMAP, cursor, &size);
+ if (!pin)
+ return -EAGAIN;
+ pin >>= PAGE_SHIFT;
+ if (remap_pfn_range(vma, vm_start, pin, size, vma->vm_page_prot))
+ return -EAGAIN;
+
+ vm_start += size;
+ cursor += size;
+ }
+
+ return 0;
+}
+
+/**
+ * azfs_truncate - truncate() method for inode_operations
+ * @inode: see inode_operations methods
+ */
+static void
+azfs_truncate(struct inode *inode)
+{
+ struct azfs_super *super;
+ struct azfs_znode *znode;
+ struct azfs_block *block, *tmp_block, *temp, *west, *east;
+ unsigned long id, count;
+ signed long delta;
+
+ super = inode->i_sb->s_fs_info;
+ znode = I2Z(inode);
+
+ delta = i_size_read(inode) + (super->block_size - 1);
+ delta >>= super->block_shift;
+ delta -= inode->i_blocks;
+
+ if (delta == 0) {
+ znode->size = i_size_read(inode);
+ return;
+ }
+
+ write_lock(&znode->lock);
+
+ while (delta > 0) {
+ west = east = NULL;
+
+ write_lock(&super->lock);
+
+ if (list_empty(&super->block_list)) {
+ write_unlock(&super->lock);
+ break;
+ }
+
+ for (count = delta; count; count--) {
+ for_each_block(block, &super->block_list)
+ if (block->count >= count) {
+ east = block;
+ break;
+ }
+ if (east)
+ break;
+ }
+
+ for_each_block_reverse(block, &znode->block_list) {
+ if (block->id + block->count == east->id)
+ west = block;
+ break;
+ }
+
+ if (east->count == count) {
+ if (west) {
+ west->count += east->count;
+ azfs_block_free(east);
+ } else {
+ azfs_block_move(east, &znode->block_list);
+ }
+ } else {
+ if (west) {
+ west->count += count;
+ } else {
+ if (!azfs_block_init(&znode->block_list,
+ east->id, count)) {
+ write_unlock(&super->lock);
+ break;
+ }
+ }
+
+ east->id += count;
+ east->count -= count;
+ }
+
+ write_unlock(&super->lock);
+
+ inode->i_blocks += count;
+
+ delta -= count;
+ }
+
+ while (delta < 0) {
+ for_each_block_safe_reverse(block, tmp_block, &znode->block_list) {
+ id = block->id;
+ count = block->count;
+ if ((signed long) count + delta > 0) {
+ block->count += delta;
+ id += block->count;
+ count -= block->count;
+ block = NULL;
+ }
+
+ west = east = NULL;
+
+ write_lock(&super->lock);
+
+ for_each_block(temp, &super->block_list) {
+ if (!west && (temp->id + temp->count == id))
+ west = temp;
+ else if (!east && (id + count == temp->id))
+ east = temp;
+ if (west && east)
+ break;
+ }
+
+ if (west && east) {
+ west->count += count + east->count;
+ azfs_block_free(east);
+ if (block)
+ azfs_block_free(block);
+ } else if (west) {
+ west->count += count;
+ if (block)
+ azfs_block_free(block);
+ } else if (east) {
+ east->id -= count;
+ east->count += count;
+ if (block)
+ azfs_block_free(block);
+ } else {
+ if (!block) {
+ if (!azfs_block_init(&super->block_list,
+ id, count)) {
+ write_unlock(&super->lock);
+ break;
+ }
+ } else {
+ azfs_block_move(block, &super->block_list);
+ }
+ }
+
+ write_unlock(&super->lock);
+
+ inode->i_blocks -= count;
+
+ delta += count;
+
+ break;
+ }
+ }
+
+ write_unlock(&znode->lock);
+
+ znode->size = min(i_size_read(inode),
+ (loff_t) inode->i_blocks << super->block_shift);
+}
+
+/**
+ * azfs_getattr - getattr() method for inode_operations
+ * @mnt, @dentry, @stat: see inode_operations methods
+ */
+static int
+azfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+ struct azfs_super *super;
+ struct inode *inode;
+ unsigned short shift;
+
+ inode = dentry->d_inode;
+ super = inode->i_sb->s_fs_info;
+
+ generic_fillattr(inode, stat);
+ stat->blocks = inode->i_blocks;
+ shift = super->block_shift - super->sector_shift;
+ if (shift)
+ stat->blocks <<= shift;
+
+ return 0;
+}
+
+static const struct address_space_operations azfs_aops = {
+ .write_begin = simple_write_begin,
+ .write_end = simple_write_end
+};
+
+static struct backing_dev_info azfs_bdi = {
+ .ra_pages = 0,
+ .capabilities = AZFS_BDI_CAPABILITIES
+};
+
+static struct inode_operations azfs_dir_iops = {
+ .create = azfs_create,
+ .lookup = simple_lookup,
+ .link = simple_link,
+ .unlink = simple_unlink,
+ .symlink = azfs_symlink,
+ .mkdir = azfs_mkdir,
+ .rmdir = simple_rmdir,
+ .mknod = azfs_mknod,
+ .rename = simple_rename
+};
+
+static const struct file_operations azfs_reg_fops = {
+ .llseek = generic_file_llseek,
+ .aio_read = azfs_aio_read,
+ .aio_write = azfs_aio_write,
+ .open = azfs_open,
+ .mmap = azfs_mmap,
+ .fsync = simple_sync_file,
+};
+
+static struct inode_operations azfs_reg_iops = {
+ .truncate = azfs_truncate,
+ .getattr = azfs_getattr
+};
+
+/**
+ * azfs_new_inode - cook a new inode
+ * @sb: super-block
+ * @dir: parent directory
+ * @mode: file mode
+ * @dev: to be forwarded to init_special_inode()
+ */
+static struct inode*
+azfs_new_inode(struct super_block *sb, struct inode *dir, int mode, dev_t dev)
+{
+ struct azfs_super *super;
+ struct inode *inode;
+
+ inode = new_inode(sb);
+ if (!inode)
+ return NULL;
+
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+ inode->i_mode = mode;
+ if (dir) {
+ dir->i_mtime = dir->i_ctime = inode->i_mtime;
+ inode->i_uid = current->fsuid;
+ if (dir->i_mode & S_ISGID) {
+ if (S_ISDIR(mode))
+ inode->i_mode |= S_ISGID;
+ inode->i_gid = dir->i_gid;
+ } else {
+ inode->i_gid = current->fsgid;
+ }
+ } else {
+ super = sb->s_fs_info;
+ inode->i_uid = super->uid;
+ inode->i_gid = super->gid;
+ }
+
+ inode->i_blocks = 0;
+ inode->i_mapping->a_ops = &azfs_aops;
+ inode->i_mapping->backing_dev_info = &azfs_bdi;
+
+ switch (mode & S_IFMT) {
+ case S_IFDIR:
+ inode->i_op = &azfs_dir_iops;
+ inode->i_fop = &simple_dir_operations;
+ inc_nlink(inode);
+ break;
+
+ case S_IFREG:
+ inode->i_op = &azfs_reg_iops;
+ inode->i_fop = &azfs_reg_fops;
+ break;
+
+ case S_IFLNK:
+ inode->i_op = &page_symlink_inode_operations;
+ break;
+
+ default:
+ init_special_inode(inode, mode, dev);
+ break;
+ }
+
+ return inode;
+}
+
+/**
+ * azfs_alloc_inode - alloc_inode() method for super_operations
+ * @sb: see super_operations methods
+ */
+static struct inode*
+azfs_alloc_inode(struct super_block *sb)
+{
+ struct azfs_znode *znode;
+
+ znode = kmem_cache_alloc(azfs_znode_cache, GFP_KERNEL);
+ if (znode) {
+ INIT_LIST_HEAD(&znode->block_list);
+ rwlock_init(&znode->lock);
+
+ inode_init_once(&znode->vfs_inode);
+
+ return &znode->vfs_inode;
+ }
+
+ return NULL;
+}
+
+/**
+ * azfs_destroy_inode - destroy_inode() method for super_operations
+ * @inode: see super_operations methods
+ */
+static void
+azfs_destroy_inode(struct inode *inode)
+{
+ kmem_cache_free(azfs_znode_cache, I2Z(inode));
+}
+
+/**
+ * azfs_delete_inode - delete_inode() method for super_operations
+ * @inode: see super_operations methods
+ */
+static void
+azfs_delete_inode(struct inode *inode)
+{
+ if (S_ISREG(inode->i_mode)) {
+ i_size_write(inode, 0);
+ azfs_truncate(inode);
+ }
+ truncate_inode_pages(&inode->i_data, 0);
+ clear_inode(inode);
+}
+
+/**
+ * azfs_statfs - statfs() method for super_operations
+ * @dentry, @stat: see super_operations methods
+ */
+static int
+azfs_statfs(struct dentry *dentry, struct kstatfs *stat)
+{
+ struct super_block *sb;
+ struct azfs_super *super;
+ struct inode *inode;
+ unsigned long inodes, blocks;
+
+ sb = dentry->d_sb;
+ super = sb->s_fs_info;
+
+ inodes = blocks = 0;
+ mutex_lock(&sb->s_lock);
+ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+ inodes++;
+ blocks += inode->i_blocks;
+ }
+ mutex_unlock(&sb->s_lock);
+
+ stat->f_type = AZFS_SUPERBLOCK_MAGIC;
+ stat->f_bsize = super->block_size;
+ stat->f_blocks = super->media_size >> super->block_shift;
+ stat->f_bfree = stat->f_blocks - blocks;
+ stat->f_bavail = stat->f_blocks - blocks;
+ stat->f_files = inodes + blocks;
+ stat->f_ffree = blocks + 1;
+ stat->f_namelen = NAME_MAX;
+
+ return 0;
+}
+
+static struct super_operations azfs_ops = {
+ .alloc_inode = azfs_alloc_inode,
+ .destroy_inode = azfs_destroy_inode,
+ .drop_inode = generic_delete_inode,
+ .delete_inode = azfs_delete_inode,
+ .statfs = azfs_statfs
+};
+
+enum {
+ Opt_blocksize_short,
+ Opt_blocksize_long,
+ Opt_uid,
+ Opt_gid,
+ Opt_err
+};
+
+static match_table_t tokens = {
+ {Opt_blocksize_short, "bs=%u"},
+ {Opt_blocksize_long, "blocksize=%u"},
+ {Opt_uid, "uid=%u"},
+ {Opt_gid, "gid=%u"},
+ {Opt_err, NULL}
+};
+
+/**
+ * azfs_parse_mount_parameters - parse options given to mount with -o
+ * @super: azfs super block extension
+ * @options: comma separated options
+ */
+static int
+azfs_parse_mount_parameters(struct azfs_super *super, char *options)
+{
+ char *option;
+ int token, value;
+ substring_t args[MAX_OPT_ARGS];
+
+ if (!options)
+ return 1;
+
+ while ((option = strsep(&options, ",")) != NULL) {
+ if (!*option)
+ continue;
+
+ token = match_token(option, tokens, args);
+ switch (token) {
+ case Opt_blocksize_short:
+ case Opt_blocksize_long:
+ if (match_int(&args[0], &value))
+ goto syntax_error;
+ super->block_size = value;
+ break;
+
+ case Opt_uid:
+ if (match_int(&args[0], &value))
+ goto syntax_error;
+ super->uid = value;
+ break;
+
+ case Opt_gid:
+ if (match_int(&args[0], &value))
+ goto syntax_error;
+ super->gid = value;
+ break;
+
+ default:
+ goto syntax_error;
+ }
+ }
+
+ return 1;
+
+syntax_error:
+ printk(KERN_ERR "%s: invalid mount option\n",
+ AZFS_FILESYSTEM_NAME);
+
+ return 0;
+}
+
+/**
+ * azfs_fill_super - fill_super routine for get_sb
+ * @sb, @data, @silent: see file_system_type methods
+ */
+static int
+azfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+ struct gendisk *disk;
+ struct azfs_super *super = NULL, *tmp_super;
+ struct azfs_block *block = NULL;
+ struct inode *inode = NULL;
+ void *kaddr;
+ unsigned long pfn;
+ int rc;
+
+ BUG_ON(!sb->s_bdev);
+
+ disk = sb->s_bdev->bd_disk;
+
+ BUG_ON(!disk || !disk->queue);
+
+ if (!disk->fops->direct_access) {
+ printk(KERN_ERR "%s needs a block device with a "
+ "direct_access() method\n",
+ AZFS_FILESYSTEM_NAME);
+ return -ENOSYS;
+ }
+
+ get_device(disk->driverfs_dev);
+
+ sb->s_magic = AZFS_SUPERBLOCK_MAGIC;
+ sb->s_flags = AZFS_SUPERBLOCK_FLAGS;
+ sb->s_op = &azfs_ops;
+ sb->s_maxbytes = get_capacity(disk) * disk->queue->hardsect_size;
+ sb->s_time_gran = 1;
+
+ spin_lock(&super_list.lock);
+ list_for_each_entry(tmp_super, &super_list.head, list)
+ if (tmp_super->blkdev == sb->s_bdev) {
+ super = tmp_super;
+ break;
+ }
+ spin_unlock(&super_list.lock);
+
+ if (super) {
+ if (data && strlen((char*) data))
+ printk(KERN_WARNING "/dev/%s was already mounted with "
+ "%s before, it will be mounted with "
+ "mount options used last time, "
+ "options just given would be ignored\n",
+ disk->disk_name, AZFS_FILESYSTEM_NAME);
+ sb->s_fs_info = super;
+ } else {
+ super = kzalloc(sizeof(struct azfs_super), GFP_KERNEL);
+ if (!super) {
+ rc = -ENOMEM;
+ goto failed;
+ }
+ sb->s_fs_info = super;
+
+ if (!azfs_parse_mount_parameters(super, (char*) data)) {
+ rc = -EINVAL;
+ goto failed;
+ }
+
+ inode = azfs_new_inode(sb, NULL, S_IFDIR | S_IRWXUGO, 0);
+ if (!inode) {
+ rc = -ENOMEM;
+ goto failed;
+ }
+
+ super->root = d_alloc_root(inode);
+ if (!super->root) {
+ rc = -ENOMEM;
+ goto failed;
+ }
+ dget(super->root);
+
+ INIT_LIST_HEAD(&super->list);
+ INIT_LIST_HEAD(&super->block_list);
+ rwlock_init(&super->lock);
+
+ super->media_size = sb->s_maxbytes;
+
+ if (!super->block_size)
+ super->block_size = sb->s_blocksize;
+ super->block_shift = blksize_bits(super->block_size);
+
+ super->sector_size = disk->queue->hardsect_size;
+ super->sector_shift = blksize_bits(super->sector_size);
+
+ super->blkdev = sb->s_bdev;
+
+ block = azfs_block_init(&super->block_list,
+ 0, super->media_size >> super->block_shift);
+ if (!block) {
+ rc = -ENOMEM;
+ goto failed;
+ }
+
+ rc = disk->fops->direct_access(super->blkdev, 0, &kaddr, &pfn);
+ if (rc < 0) {
+ rc = -EFAULT;
+ goto failed;
+ }
+ super->ph_addr = (unsigned long) kaddr;
+
+ super->io_addr = (unsigned long) ioremap_flags(
+ super->ph_addr, super->media_size, _PAGE_NO_CACHE);
+ if (!super->io_addr) {
+ rc = -EFAULT;
+ goto failed;
+ }
+
+ spin_lock(&super_list.lock);
+ list_add(&super->list, &super_list.head);
+ spin_unlock(&super_list.lock);
+ }
+
+ sb->s_root = super->root;
+ disk->driverfs_dev->driver_data = super;
+ disk->driverfs_dev->platform_data = sb;
+
+ if (super->block_size < PAGE_SIZE)
+ printk(KERN_INFO "Block size on %s is smaller then system "
+ "page size: mmap() would not be supported\n",
+ disk->disk_name);
+
+ return 0;
+
+failed:
+ if (super) {
+ sb->s_root = NULL;
+ sb->s_fs_info = NULL;
+ if (block)
+ azfs_block_free(block);
+ if (super->root)
+ dput(super->root);
+ if (inode)
+ iput(inode);
+ disk->driverfs_dev->driver_data = NULL;
+ kfree(super);
+ disk->driverfs_dev->platform_data = NULL;
+ put_device(disk->driverfs_dev);
+ }
+
+ return rc;
+}
+
+/**
+ * azfs_get_sb - get_sb() method for file_system_type
+ * @fs_type, @flags, @dev_name, @data, @mount: see file_system_type methods
+ */
+static int
+azfs_get_sb(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data, struct vfsmount *mount)
+{
+ return get_sb_bdev(fs_type, flags,
+ dev_name, data, azfs_fill_super, mount);
+}
+
+/**
+ * azfs_kill_sb - kill_sb() method for file_system_type
+ * @sb: see file_system_type methods
+ */
+static void
+azfs_kill_sb(struct super_block *sb)
+{
+ sb->s_root = NULL;
+ kill_block_super(sb);
+}
+
+static struct file_system_type azfs_fs = {
+ .owner = THIS_MODULE,
+ .name = AZFS_FILESYSTEM_NAME,
+ .get_sb = azfs_get_sb,
+ .kill_sb = azfs_kill_sb,
+ .fs_flags = AZFS_FILESYSTEM_FLAGS
+};
+
+/**
+ * azfs_init
+ */
+static int __init
+azfs_init(void)
+{
+ int rc;
+
+ INIT_LIST_HEAD(&super_list.head);
+ spin_lock_init(&super_list.lock);
+
+ azfs_znode_cache = kmem_cache_create("azfs_znode_cache",
+ sizeof(struct azfs_znode), 0, AZFS_CACHE_FLAGS, NULL);
+ if (!azfs_znode_cache) {
+ printk(KERN_ERR "Could not allocate inode cache for %s\n",
+ AZFS_FILESYSTEM_NAME);
+ rc = -ENOMEM;
+ goto failed;
+ }
+
+ azfs_block_cache = kmem_cache_create("azfs_block_cache",
+ sizeof(struct azfs_block), 0, AZFS_CACHE_FLAGS, NULL);
+ if (!azfs_block_cache) {
+ printk(KERN_ERR "Could not allocate block cache for %s\n",
+ AZFS_FILESYSTEM_NAME);
+ rc = -ENOMEM;
+ goto failed;
+ }
+
+ rc = register_filesystem(&azfs_fs);
+ if (rc != 0) {
+ printk(KERN_ERR "Could not register %s\n",
+ AZFS_FILESYSTEM_NAME);
+ goto failed;
+ }
+
+ return 0;
+
+failed:
+ if (azfs_block_cache)
+ kmem_cache_destroy(azfs_block_cache);
+
+ if (azfs_znode_cache)
+ kmem_cache_destroy(azfs_znode_cache);
+
+ return rc;
+}
+
+/**
+ * azfs_exit
+ */
+static void __exit
+azfs_exit(void)
+{
+ struct azfs_super *super, *tmp_super;
+ struct azfs_block *block, *tmp_block;
+ struct gendisk *disk;
+
+ spin_lock(&super_list.lock);
+ list_for_each_entry_safe(super, tmp_super, &super_list.head, list) {
+ disk = super->blkdev->bd_disk;
+ list_del(&super->list);
+ iounmap((void*) super->io_addr);
+ write_lock(&super->lock);
+ for_each_block_safe(block, tmp_block, &super->block_list)
+ azfs_block_free(block);
+ write_unlock(&super->lock);
+ disk->driverfs_dev->driver_data = NULL;
+ disk->driverfs_dev->platform_data = NULL;
+ kfree(super);
+ put_device(disk->driverfs_dev);
+ }
+ spin_unlock(&super_list.lock);
+
+ unregister_filesystem(&azfs_fs);
+
+ kmem_cache_destroy(azfs_block_cache);
+ kmem_cache_destroy(azfs_znode_cache);
+}
+
+module_init(azfs_init);
+module_exit(azfs_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Maxim Shchetynin <[email protected]>");
+MODULE_DESCRIPTION("Non-buffered file system for IO devices");

--
Mit freundlichen Grüßen / met vriendelijke groeten / avec regards

Maxim V. Shchetynin
Linux Kernel Entwicklung
IBM Deutschland Entwicklung GmbH
Linux für Cell, Abteilung 3250
Schönaicher Straße 220
71032 Böblingen

Vorsitzender des Aufsichtsrats: Johann Weihen
Geschäftsführung: Herbert Kircher
Sitz der Gesellschaft: Böblingen
Registriergericht: Amtsgericht Stuttgart, HRB 243294

Fahr nur so schnell wie dein Schutzengel fliegen kann!

2008-06-18 20:57:17

by Jörn Engel

[permalink] [raw]
Subject: Re: AZFS file system proposal

On Wed, 18 June 2008 13:15:14 +0200, Maxim Shchetynin wrote:
>
> Our users want to have a file system with bigger block sizes (64KB, 16MB, ...).

Good reason.

> They also don't want to keep metadata on DDR2 media, but to be able to use a complete DDR2 for their applications data.

This I find surprising. Isn't DDR2 actually the slower type of memory?
So you spend fast memory to store metadata in order to save slow memory?

Jörn

--
The wise man seeks everything in himself; the ignorant man tries to get
everything from somebody else.
-- unknown

2008-07-01 15:00:51

by Arnd Bergmann

[permalink] [raw]
Subject: Re: AZFS file system proposal

On Wednesday 18 June 2008, Maxim Shchetynin wrote:
> AZFS patch updated accordinly to comments of Christoph Hellwig and Dmitri Vorobiev.

Sorry for my not commenting earlier on this. I'm finally collecting my
2.6.27 patches and stumbled over it again. There are a few details
that I hope we can fix up quickly, other than that, it looks good now,
great work!

> Subject: azfs: initial submit of azfs, a non-buffered filesystem

Please make the patch subject the actual subject of your email next time,
and put the introductory text below the Signed-off-by: lines, separated
by a "---" line. That will make the standard tools work without extra
effort on my side. Also, please always Cc the person you want to merge
the patch, in this case probably me.

> diff -Nuar linux-2.6.26-rc6/fs/Makefile linux-2.6.26-rc6-azfs/fs/Makefile
> --- linux-2.6.26-rc6/fs/Makefile 2008-06-12 23:22:24.000000000 +0200
> +++ linux-2.6.26-rc6-azfs/fs/Makefile 2008-06-16 11:17:50.000000000 +0200
> @@ -119,3 +119,4 @@
> obj-$(CONFIG_DEBUG_FS) += debugfs/
> obj-$(CONFIG_OCFS2_FS) += ocfs2/
> obj-$(CONFIG_GFS2_FS) += gfs2/
> +obj-$(CONFIG_AZ_FS) += azfs.o
> diff -Nuar linux-2.6.26-rc6/fs/azfs.c linux-2.6.26-rc6-azfs/fs/azfs.c
> --- linux-2.6.26-rc6/fs/azfs.c 1970-01-01 01:00:00.000000000 +0100
> +++ linux-2.6.26-rc6-azfs/fs/azfs.c 2008-06-18 15:56:13.252266896 +0200

All other file systems are in separate directories, so it would be better
to rename fs/azfs.c to fs/azfs/inode.c

> +#define AZFS_FILESYSTEM_NAME "azfs"
> +#define AZFS_FILESYSTEM_FLAGS FS_REQUIRES_DEV
> +
> +#define AZFS_SUPERBLOCK_MAGIC 0xABBA1972
> +#define AZFS_SUPERBLOCK_FLAGS MS_NOEXEC | \
> + MS_SYNCHRONOUS | \
> + MS_DIRSYNC | \
> + MS_ACTIVE

Why MS_NOEXEC? What happens on a remount if the user does not specifies
-o remount,exec?

> +/**
> + * azfs_block_find - get real address of a part of a file
> + * @inode: inode
> + * @direction: data direction
> + * @from: offset for read/write operation
> + * @size: pointer to a value of the amount of data to be read/written
> + */
> +static unsigned long
> +azfs_block_find(struct inode *inode, enum azfs_direction direction,
> + unsigned long from, unsigned long *size)
> +{
> + struct azfs_super *super;
> + struct azfs_znode *znode;
> + struct azfs_block *block;
> + unsigned long block_id, west, east;
> +
> + super = inode->i_sb->s_fs_info;
> + znode = I2Z(inode);
> +
> + if (from + *size > znode->size) {
> + i_size_write(inode, from + *size);
> + inode->i_op->truncate(inode);
> + }
> +
> + read_lock(&znode->lock);
> +
> + if (list_empty(&znode->block_list)) {
> + read_unlock(&znode->lock);
> + return 0;
> + }
> +
> + block_id = from >> super->block_shift;
> +
> + for_each_block(block, &znode->block_list) {
> + if (block->count > block_id)
> + break;
> + block_id -= block->count;
> + }
> +
> + west = from % super->block_size;
> + east = ((block->count - block_id) << super->block_shift) - west;
> +
> + if (*size > east)
> + *size = east;
> +
> + block_id = ((block->id + block_id) << super->block_shift) + west;
> +
> + read_unlock(&znode->lock);
> +
> + block_id += direction == AZFS_MMAP ? super->ph_addr : super->io_addr;
> +
> + return block_id;
> +}

This overloading of the return type to mean either a pointer or an offset
on the block device is rather confusing. Why not just return the raw block_id
before the last += and leave that part up to the caller?

static void __iomem *
azfs_block_addr(struct inode *inode, enum azfs_direction direction,
unsigned long from, unsigned long *size)
{
struct azfs_super *super;
unsigned long offset;
void __iomem *p;

super = inode->i_sb->s_fs_info;
offset = azfs_block_find(inode, super, 0, from, size);
p = super->ph_addr + offset;

return p;
}

> + target = iov->iov_base;
> + todo = min((loff_t) iov->iov_len, i_size_read(inode) - pos);
> +
> + for (step = todo; step; step -= size) {
> + size = step;
> + pin = azfs_block_find(inode, AZFS_READ, pos, &size);
> + if (!pin) {
> + rc = -ENOSPC;
> + goto out;
> + }
> + if (copy_to_user(target, (void*) pin, size)) {
> + rc = -EFAULT;
> + goto out;
> + }

Question to the powerpc folks: is copy_to_user safe for an __iomem source?
Should there be two copies (memcpy_fromio and copy_to_user) instead?

> + page_prot = pgprot_val(vma->vm_page_prot);
> + page_prot |= (_PAGE_NO_CACHE | _PAGE_RW);
> + page_prot &= ~_PAGE_GUARDED;
> + vma->vm_page_prot = __pgprot(page_prot);

The pgprot modifications rely on powerpc specific flags, but the
file system should not really need to be powerpc only.

The flags we want are more or less the same as PAGE_AGP, because
both are I/O mapped memory that needs to be uncached but should
not be guarded, for performance reasons.

Maybe we can introduce a new PAGE_IOMEM here that we can use
in all places that need something like this. In spufs we need
the same flags for the local store mappings.

I wouldn't hold up merging the file system for this problem, but
until it is solved, the Kconfig entry should probably have
a "depends on PPC".

Arnd <><

2008-07-07 15:40:57

by Maxim Shchetynin

[permalink] [raw]
Subject: Re: AZFS file system proposal

Thank you Arnd for your comments. I have changed my patch accordinly (I will send it in a few minutes).

> > Subject: azfs: initial submit of azfs, a non-buffered filesystem
>
> Please make the patch subject the actual subject of your email next time,
> and put the introductory text below the Signed-off-by: lines, separated
> by a "---" line. That will make the standard tools work without extra
> effort on my side. Also, please always Cc the person you want to merge
> the patch, in this case probably me.

Done.

> All other file systems are in separate directories, so it would be better
> to rename fs/azfs.c to fs/azfs/inode.c

Done.

> > +#define AZFS_SUPERBLOCK_FLAGS MS_NOEXEC | \
> > + MS_SYNCHRONOUS | \
> > + MS_DIRSYNC | \
> > + MS_ACTIVE
>
> Why MS_NOEXEC? What happens on a remount if the user does not specifies
> -o remount,exec?

I also don't see any reason of keeping MS_NOEXEC - have just removed it.

> > +static unsigned long
> > +azfs_block_find(struct inode *inode, enum azfs_direction direction,
> > + unsigned long from, unsigned long *size)
> > +{
> > ...
> > +}
>
> This overloading of the return type to mean either a pointer or an offset
> on the block device is rather confusing. Why not just return the raw block_id
> before the last += and leave that part up to the caller?

Changed.

> > + if (copy_to_user(target, (void*) pin, size)) {
> > + rc = -EFAULT;
> > + goto out;
> > + }
>
> Question to the powerpc folks: is copy_to_user safe for an __iomem source?
> Should there be two copies (memcpy_fromio and copy_to_user) instead?

I leave this question open.

> > + page_prot = pgprot_val(vma->vm_page_prot);
> > + page_prot |= (_PAGE_NO_CACHE | _PAGE_RW);
> > + page_prot &= ~_PAGE_GUARDED;
> > + vma->vm_page_prot = __pgprot(page_prot);
>
> The pgprot modifications rely on powerpc specific flags, but the
> file system should not really need to be powerpc only.
>
> The flags we want are more or less the same as PAGE_AGP, because
> both are I/O mapped memory that needs to be uncached but should
> not be guarded, for performance reasons.
>
> Maybe we can introduce a new PAGE_IOMEM here that we can use
> in all places that need something like this. In spufs we need
> the same flags for the local store mappings.
>
> I wouldn't hold up merging the file system for this problem, but
> until it is solved, the Kconfig entry should probably have
> a "depends on PPC".

Done.

--
Mit freundlichen Grüßen / met vriendelijke groeten / avec regards

Maxim V. Shchetynin
Linux Kernel Entwicklung
IBM Deutschland Entwicklung GmbH
Linux für Cell, Abteilung 3250
Schönaicher Straße 220
71032 Böblingen

Vorsitzender des Aufsichtsrats: Johann Weihen
Geschäftsführung: Herbert Kircher
Sitz der Gesellschaft: Böblingen
Registriergericht: Amtsgericht Stuttgart, HRB 243294

Fahr nur so schnell wie dein Schutzengel fliegen kann!

2008-07-07 15:42:35

by Maxim Shchetynin

[permalink] [raw]
Subject: azfs: initial submit of azfs, a non-buffered filesystem

AZFS is a file system which keeps all files on memory mapped random
access storage. It was designed to work on the axonram device driver
for IBM QS2x blade servers, but can operate on any block device
that exports a direct_access method.

Signed-off-by: Maxim Shchetynin <[email protected]>
---

diff -Nuar linux-2.6.26-rc9/Documentation/filesystems/azfs.txt linux-2.6.26-rc9-azfs/Documentation/filesystems/azfs.txt
--- linux-2.6.26-rc9/Documentation/filesystems/azfs.txt 1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.26-rc9-azfs/Documentation/filesystems/azfs.txt 2008-07-07 13:43:45.235739896 +0200
@@ -0,0 +1,22 @@
+AZFS is a file system which keeps all files on memory mapped random
+access storage. It was designed to work on the axonram device driver
+for IBM QS2x blade servers, but can operate on any block device
+that exports a direct_access method.
+
+Everything in AZFS is temporary in the sense that all the data stored
+therein is lost when you switch off or reboot a system. If you unmount
+an AZFS instance, all the data will be kept on device as long your system
+is not shut down or rebooted. You can later mount AZFS on from device again
+to get access to your files.
+
+AZFS uses a block device only for data but not for file information.
+All inodes (file and directory information) is kept in RAM.
+
+When you mount AZFS you are able to specify a file system block size with
+'-o bs=<size in bytes>' option. There are no software limitations for
+a block size but you would not be able to mmap files on AZFS if block size
+is less than a system page size. If no '-o bs' option is specified on mount
+a block size of the used block device is used as a default block size for AZFS.
+
+Other available mount options for AZFS are '-o uid=<id>' and '-o gid=<id>',
+which allow you to set the owner and group of the root of the file system.
diff -Nuar linux-2.6.26-rc9/arch/powerpc/configs/cell_defconfig linux-2.6.26-rc9-azfs/arch/powerpc/configs/cell_defconfig
--- linux-2.6.26-rc9/arch/powerpc/configs/cell_defconfig 2008-07-06 00:53:22.000000000 +0200
+++ linux-2.6.26-rc9-azfs/arch/powerpc/configs/cell_defconfig 2008-07-07 13:43:45.244738607 +0200
@@ -240,6 +240,7 @@
# CPU Frequency drivers
#
CONFIG_AXON_RAM=m
+CONFIG_AZ_FS=m
# CONFIG_FSL_ULI1575 is not set

#
diff -Nuar linux-2.6.26-rc9/fs/Kconfig linux-2.6.26-rc9-azfs/fs/Kconfig
--- linux-2.6.26-rc9/fs/Kconfig 2008-07-06 00:53:22.000000000 +0200
+++ linux-2.6.26-rc9-azfs/fs/Kconfig 2008-07-07 13:45:29.397644341 +0200
@@ -1017,6 +1017,22 @@
config HUGETLB_PAGE
def_bool HUGETLBFS

+config AZ_FS
+ tristate "AZFS filesystem support"
+ depends on PPC
+ help
+ azfs is a file system for I/O attached memory backing. It requires
+ a block device with direct_access capability, e.g. axonram.
+ Mounting such device with azfs gives memory mapped access to the
+ underlying memory to user space.
+
+ Read <file:Documentation/filesystems/azfs.txt> for details.
+
+ To compile this file system support as a module, choose M here: the
+ module will be called azfs.
+
+ If unsure, say N.
+
config CONFIGFS_FS
tristate "Userspace-driven configuration filesystem"
depends on SYSFS
diff -Nuar linux-2.6.26-rc9/fs/Makefile linux-2.6.26-rc9-azfs/fs/Makefile
--- linux-2.6.26-rc9/fs/Makefile 2008-07-06 00:53:22.000000000 +0200
+++ linux-2.6.26-rc9-azfs/fs/Makefile 2008-07-07 13:45:49.436832234 +0200
@@ -119,3 +119,4 @@
obj-$(CONFIG_DEBUG_FS) += debugfs/
obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_GFS2_FS) += gfs2/
+obj-$(CONFIG_AZ_FS) += azfs/
diff -Nuar linux-2.6.26-rc9/fs/azfs/Makefile linux-2.6.26-rc9-azfs/fs/azfs/Makefile
--- linux-2.6.26-rc9/fs/azfs/Makefile 1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.26-rc9-azfs/fs/azfs/Makefile 2008-07-07 13:46:38.413264402 +0200
@@ -0,0 +1,7 @@
+#
+# Makefile for azfs routines
+#
+
+obj-$(CONFIG_AZ_FS) += azfs.o
+
+azfs-y := inode.o
diff -Nuar linux-2.6.26-rc9/fs/azfs/inode.c linux-2.6.26-rc9-azfs/fs/azfs/inode.c
--- linux-2.6.26-rc9/fs/azfs/inode.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.26-rc9-azfs/fs/azfs/inode.c 2008-07-07 17:31:06.183098986 +0200
@@ -0,0 +1,1176 @@
+/*
+ * (C) Copyright IBM Deutschland Entwicklung GmbH 2007
+ *
+ * Author: Maxim Shchetynin <[email protected]>
+ *
+ * Non-buffered filesystem driver.
+ * It registers a filesystem which may be used for all kind of block devices
+ * which have a direct_access() method in block_device_operations.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
+#include <linux/cache.h>
+#include <linux/dcache.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/genhd.h>
+#include <linux/kernel.h>
+#include <linux/limits.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/mm.h>
+#include <linux/mm_types.h>
+#include <linux/mutex.h>
+#include <linux/namei.h>
+#include <linux/pagemap.h>
+#include <linux/parser.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/stat.h>
+#include <linux/statfs.h>
+#include <linux/string.h>
+#include <linux/time.h>
+#include <linux/types.h>
+#include <linux/aio.h>
+#include <linux/uio.h>
+#include <asm/bug.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/string.h>
+
+#define AZFS_FILESYSTEM_NAME "azfs"
+#define AZFS_FILESYSTEM_FLAGS FS_REQUIRES_DEV
+
+#define AZFS_SUPERBLOCK_MAGIC 0xABBA1972
+#define AZFS_SUPERBLOCK_FLAGS MS_SYNCHRONOUS | \
+ MS_DIRSYNC | \
+ MS_ACTIVE
+
+#define AZFS_BDI_CAPABILITIES BDI_CAP_NO_ACCT_DIRTY | \
+ BDI_CAP_NO_WRITEBACK | \
+ BDI_CAP_MAP_COPY | \
+ BDI_CAP_MAP_DIRECT | \
+ BDI_CAP_VMFLAGS
+
+#define AZFS_CACHE_FLAGS SLAB_HWCACHE_ALIGN | \
+ SLAB_RECLAIM_ACCOUNT | \
+ SLAB_MEM_SPREAD
+
+struct azfs_super {
+ struct list_head list;
+ unsigned long media_size;
+ unsigned long block_size;
+ unsigned short block_shift;
+ unsigned long sector_size;
+ unsigned short sector_shift;
+ uid_t uid;
+ gid_t gid;
+ unsigned long ph_addr;
+ unsigned long io_addr;
+ struct block_device *blkdev;
+ struct dentry *root;
+ struct list_head block_list;
+ rwlock_t lock;
+};
+
+struct azfs_super_list {
+ struct list_head head;
+ spinlock_t lock;
+};
+
+struct azfs_block {
+ struct list_head list;
+ unsigned long id;
+ unsigned long count;
+};
+
+struct azfs_znode {
+ struct list_head block_list;
+ rwlock_t lock;
+ loff_t size;
+ struct inode vfs_inode;
+};
+
+static struct azfs_super_list super_list;
+static struct kmem_cache *azfs_znode_cache __read_mostly = NULL;
+static struct kmem_cache *azfs_block_cache __read_mostly = NULL;
+
+#define I2S(inode) \
+ inode->i_sb->s_fs_info
+#define I2Z(inode) \
+ container_of(inode, struct azfs_znode, vfs_inode)
+
+#define for_each_block(block, block_list) \
+ list_for_each_entry(block, block_list, list)
+#define for_each_block_reverse(block, block_list) \
+ list_for_each_entry_reverse(block, block_list, list)
+#define for_each_block_safe(block, temp, block_list) \
+ list_for_each_entry_safe(block, temp, block_list, list)
+#define for_each_block_safe_reverse(block, temp, block_list) \
+ list_for_each_entry_safe_reverse(block, temp, block_list, list)
+
+/**
+ * azfs_block_init - create and initialise a new block in a list
+ * @block_list: destination list
+ * @id: block id
+ * @count: size of a block
+ */
+static inline struct azfs_block*
+azfs_block_init(struct list_head *block_list,
+ unsigned long id, unsigned long count)
+{
+ struct azfs_block *block;
+
+ block = kmem_cache_alloc(azfs_block_cache, GFP_KERNEL);
+ if (!block)
+ return NULL;
+
+ block->id = id;
+ block->count = count;
+
+ INIT_LIST_HEAD(&block->list);
+ list_add_tail(&block->list, block_list);
+
+ return block;
+}
+
+/**
+ * azfs_block_free - remove block from a list and free it back in cache
+ * @block: block to be removed
+ */
+static inline void
+azfs_block_free(struct azfs_block *block)
+{
+ list_del(&block->list);
+ kmem_cache_free(azfs_block_cache, block);
+}
+
+/**
+ * azfs_block_move - move block to another list
+ * @block: block to be moved
+ * @block_list: destination list
+ */
+static inline void
+azfs_block_move(struct azfs_block *block, struct list_head *block_list)
+{
+ list_move_tail(&block->list, block_list);
+}
+
+/**
+ * azfs_block_find - get a block id of a part of a file
+ * @inode: inode
+ * @from: offset for read/write operation
+ * @size: pointer to a value of the amount of data to be read/written
+ */
+static unsigned long
+azfs_block_find(struct inode *inode, unsigned long from, unsigned long *size)
+{
+ struct azfs_super *super;
+ struct azfs_znode *znode;
+ struct azfs_block *block;
+ unsigned long block_id, west, east;
+
+ super = I2S(inode);
+ znode = I2Z(inode);
+
+ read_lock(&znode->lock);
+
+ while (from + *size > znode->size) {
+ read_unlock(&znode->lock);
+ i_size_write(inode, from + *size);
+ inode->i_op->truncate(inode);
+ read_lock(&znode->lock);
+ }
+
+ if (list_empty(&znode->block_list)) {
+ read_unlock(&znode->lock);
+ *size = 0;
+ return 0;
+ }
+
+ block_id = from >> super->block_shift;
+
+ for_each_block(block, &znode->block_list) {
+ if (block->count > block_id)
+ break;
+ block_id -= block->count;
+ }
+
+ west = from % super->block_size;
+ east = ((block->count - block_id) << super->block_shift) - west;
+
+ if (*size > east)
+ *size = east;
+
+ block_id = ((block->id + block_id) << super->block_shift) + west;
+
+ read_unlock(&znode->lock);
+
+ return block_id;
+}
+
+static struct inode*
+azfs_new_inode(struct super_block *, struct inode *, int, dev_t);
+
+/**
+ * azfs_mknod - mknod() method for inode_operations
+ * @dir, @dentry, @mode, @dev: see inode_operations methods
+ */
+static int
+azfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
+{
+ struct inode *inode;
+
+ inode = azfs_new_inode(dir->i_sb, dir, mode, dev);
+ if (!inode)
+ return -ENOSPC;
+
+ if (S_ISREG(mode))
+ I2Z(inode)->size = 0;
+
+ dget(dentry);
+ d_instantiate(dentry, inode);
+
+ return 0;
+}
+
+/**
+ * azfs_create - create() method for inode_operations
+ * @dir, @dentry, @mode, @nd: see inode_operations methods
+ */
+static int
+azfs_create(struct inode *dir, struct dentry *dentry, int mode,
+ struct nameidata *nd)
+{
+ return azfs_mknod(dir, dentry, mode | S_IFREG, 0);
+}
+
+/**
+ * azfs_mkdir - mkdir() method for inode_operations
+ * @dir, @dentry, @mode: see inode_operations methods
+ */
+static int
+azfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+ int rc;
+
+ rc = azfs_mknod(dir, dentry, mode | S_IFDIR, 0);
+ if (!rc)
+ inc_nlink(dir);
+
+ return rc;
+}
+
+/**
+ * azfs_symlink - symlink() method for inode_operations
+ * @dir, @dentry, @name: see inode_operations methods
+ */
+static int
+azfs_symlink(struct inode *dir, struct dentry *dentry, const char *name)
+{
+ struct inode *inode;
+ int rc;
+
+ inode = azfs_new_inode(dir->i_sb, dir, S_IFLNK | S_IRWXUGO, 0);
+ if (!inode)
+ return -ENOSPC;
+
+ rc = page_symlink(inode, name, strlen(name) + 1);
+ if (rc) {
+ iput(inode);
+ return rc;
+ }
+
+ dget(dentry);
+ d_instantiate(dentry, inode);
+
+ return 0;
+}
+
+/**
+ * azfs_aio_read - aio_read() method for file_operations
+ * @iocb, @iov, @nr_segs, @pos: see file_operations methods
+ */
+static ssize_t
+azfs_aio_read(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct azfs_super *super;
+ struct inode *inode;
+ void *target;
+ unsigned long pin;
+ unsigned long size, todo, step;
+ ssize_t rc;
+
+ inode = iocb->ki_filp->f_mapping->host;
+ super = I2S(inode);
+
+ mutex_lock(&inode->i_mutex);
+
+ if (pos >= i_size_read(inode)) {
+ rc = 0;
+ goto out;
+ }
+
+ target = iov->iov_base;
+ todo = min((loff_t) iov->iov_len, i_size_read(inode) - pos);
+
+ for (step = todo; step; step -= size) {
+ size = step;
+ pin = azfs_block_find(inode, pos, &size);
+ if (!size) {
+ rc = -ENOSPC;
+ goto out;
+ }
+ pin += super->io_addr;
+ if (copy_to_user(target, (void*) pin, size)) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ iocb->ki_pos += size;
+ pos += size;
+ target += size;
+ }
+
+ rc = todo;
+
+out:
+ mutex_unlock(&inode->i_mutex);
+
+ return rc;
+}
+
+/**
+ * azfs_aio_write - aio_write() method for file_operations
+ * @iocb, @iov, @nr_segs, @pos: see file_operations methods
+ */
+static ssize_t
+azfs_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct azfs_super *super;
+ struct inode *inode;
+ void *source;
+ unsigned long pin;
+ unsigned long size, todo, step;
+ ssize_t rc;
+
+ inode = iocb->ki_filp->f_mapping->host;
+ super = I2S(inode);
+
+ source = iov->iov_base;
+ todo = iov->iov_len;
+
+ mutex_lock(&inode->i_mutex);
+
+ for (step = todo; step; step -= size) {
+ size = step;
+ pin = azfs_block_find(inode, pos, &size);
+ if (!size) {
+ rc = -ENOSPC;
+ goto out;
+ }
+ pin += super->io_addr;
+ if (copy_from_user((void*) pin, source, size)) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ iocb->ki_pos += size;
+ pos += size;
+ source += size;
+ }
+
+ rc = todo;
+
+out:
+ mutex_unlock(&inode->i_mutex);
+
+ return rc;
+}
+
+/**
+ * azfs_open - open() method for file_operations
+ * @inode, @file: see file_operations methods
+ */
+static int
+azfs_open(struct inode *inode, struct file *file)
+{
+ if (file->f_flags & O_TRUNC) {
+ i_size_write(inode, 0);
+ inode->i_op->truncate(inode);
+ }
+ if (file->f_flags & O_APPEND)
+ inode->i_fop->llseek(file, 0, SEEK_END);
+
+ return 0;
+}
+
+/**
+ * azfs_mmap - mmap() method for file_operations
+ * @file, @vm: see file_operations methods
+ */
+static int
+azfs_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct azfs_super *super;
+ struct azfs_znode *znode;
+ struct inode *inode;
+ unsigned long cursor, pin;
+ unsigned long todo, size, vm_start;
+ pgprot_t page_prot;
+
+ inode = file->f_dentry->d_inode;
+ znode = I2Z(inode);
+ super = I2S(inode);
+
+ if (super->block_size < PAGE_SIZE)
+ return -EINVAL;
+
+ cursor = vma->vm_pgoff << super->block_shift;
+ todo = vma->vm_end - vma->vm_start;
+
+ if (cursor + todo > i_size_read(inode))
+ return -EINVAL;
+
+ page_prot = pgprot_val(vma->vm_page_prot);
+#ifdef CONFIG_PPC
+ page_prot |= (_PAGE_NO_CACHE | _PAGE_RW);
+ page_prot &= ~_PAGE_GUARDED;
+#else
+#warning You need to set in pgprot the PAGE_* flags specific to you architecture
+#endif
+ vma->vm_page_prot = __pgprot(page_prot);
+
+ vm_start = vma->vm_start;
+ for (size = todo; todo; todo -= size, size = todo) {
+ pin = azfs_block_find(inode, cursor, &size);
+ if (!size)
+ return -EAGAIN;
+ pin += super->ph_addr;
+ pin >>= PAGE_SHIFT;
+ if (remap_pfn_range(vma, vm_start, pin, size, vma->vm_page_prot))
+ return -EAGAIN;
+
+ vm_start += size;
+ cursor += size;
+ }
+
+ return 0;
+}
+
+/**
+ * azfs_truncate - truncate() method for inode_operations
+ * @inode: see inode_operations methods
+ */
+static void
+azfs_truncate(struct inode *inode)
+{
+ struct azfs_super *super;
+ struct azfs_znode *znode;
+ struct azfs_block *block, *tmp_block, *temp, *west, *east;
+ unsigned long id, count;
+ signed long delta;
+
+ super = I2S(inode);
+ znode = I2Z(inode);
+
+ delta = i_size_read(inode) + (super->block_size - 1);
+ delta >>= super->block_shift;
+ delta -= inode->i_blocks;
+
+ if (delta == 0) {
+ znode->size = i_size_read(inode);
+ return;
+ }
+
+ write_lock(&znode->lock);
+
+ while (delta > 0) {
+ west = east = NULL;
+
+ write_lock(&super->lock);
+
+ if (list_empty(&super->block_list)) {
+ write_unlock(&super->lock);
+ break;
+ }
+
+ for (count = delta; count; count--) {
+ for_each_block(block, &super->block_list)
+ if (block->count >= count) {
+ east = block;
+ break;
+ }
+ if (east)
+ break;
+ }
+
+ for_each_block_reverse(block, &znode->block_list) {
+ if (block->id + block->count == east->id)
+ west = block;
+ break;
+ }
+
+ if (east->count == count) {
+ if (west) {
+ west->count += east->count;
+ azfs_block_free(east);
+ } else {
+ azfs_block_move(east, &znode->block_list);
+ }
+ } else {
+ if (west) {
+ west->count += count;
+ } else {
+ if (!azfs_block_init(&znode->block_list,
+ east->id, count)) {
+ write_unlock(&super->lock);
+ break;
+ }
+ }
+
+ east->id += count;
+ east->count -= count;
+ }
+
+ write_unlock(&super->lock);
+
+ inode->i_blocks += count;
+
+ delta -= count;
+ }
+
+ while (delta < 0) {
+ for_each_block_safe_reverse(block, tmp_block, &znode->block_list) {
+ id = block->id;
+ count = block->count;
+ if ((signed long) count + delta > 0) {
+ block->count += delta;
+ id += block->count;
+ count -= block->count;
+ block = NULL;
+ }
+
+ west = east = NULL;
+
+ write_lock(&super->lock);
+
+ for_each_block(temp, &super->block_list) {
+ if (!west && (temp->id + temp->count == id))
+ west = temp;
+ else if (!east && (id + count == temp->id))
+ east = temp;
+ if (west && east)
+ break;
+ }
+
+ if (west && east) {
+ west->count += count + east->count;
+ azfs_block_free(east);
+ if (block)
+ azfs_block_free(block);
+ } else if (west) {
+ west->count += count;
+ if (block)
+ azfs_block_free(block);
+ } else if (east) {
+ east->id -= count;
+ east->count += count;
+ if (block)
+ azfs_block_free(block);
+ } else {
+ if (!block) {
+ if (!azfs_block_init(&super->block_list,
+ id, count)) {
+ write_unlock(&super->lock);
+ break;
+ }
+ } else {
+ azfs_block_move(block, &super->block_list);
+ }
+ }
+
+ write_unlock(&super->lock);
+
+ inode->i_blocks -= count;
+
+ delta += count;
+
+ break;
+ }
+ }
+
+ write_unlock(&znode->lock);
+
+ znode->size = min(i_size_read(inode),
+ (loff_t) inode->i_blocks << super->block_shift);
+}
+
+/**
+ * azfs_getattr - getattr() method for inode_operations
+ * @mnt, @dentry, @stat: see inode_operations methods
+ */
+static int
+azfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+ struct azfs_super *super;
+ struct inode *inode;
+ unsigned short shift;
+
+ inode = dentry->d_inode;
+ super = I2S(inode);
+
+ generic_fillattr(inode, stat);
+ stat->blocks = inode->i_blocks;
+ shift = super->block_shift - super->sector_shift;
+ if (shift)
+ stat->blocks <<= shift;
+
+ return 0;
+}
+
+static const struct address_space_operations azfs_aops = {
+ .write_begin = simple_write_begin,
+ .write_end = simple_write_end
+};
+
+static struct backing_dev_info azfs_bdi = {
+ .ra_pages = 0,
+ .capabilities = AZFS_BDI_CAPABILITIES
+};
+
+static struct inode_operations azfs_dir_iops = {
+ .create = azfs_create,
+ .lookup = simple_lookup,
+ .link = simple_link,
+ .unlink = simple_unlink,
+ .symlink = azfs_symlink,
+ .mkdir = azfs_mkdir,
+ .rmdir = simple_rmdir,
+ .mknod = azfs_mknod,
+ .rename = simple_rename
+};
+
+static const struct file_operations azfs_reg_fops = {
+ .llseek = generic_file_llseek,
+ .aio_read = azfs_aio_read,
+ .aio_write = azfs_aio_write,
+ .open = azfs_open,
+ .mmap = azfs_mmap,
+ .fsync = simple_sync_file,
+};
+
+static struct inode_operations azfs_reg_iops = {
+ .truncate = azfs_truncate,
+ .getattr = azfs_getattr
+};
+
+/**
+ * azfs_new_inode - cook a new inode
+ * @sb: super-block
+ * @dir: parent directory
+ * @mode: file mode
+ * @dev: to be forwarded to init_special_inode()
+ */
+static struct inode*
+azfs_new_inode(struct super_block *sb, struct inode *dir, int mode, dev_t dev)
+{
+ struct azfs_super *super;
+ struct inode *inode;
+
+ inode = new_inode(sb);
+ if (!inode)
+ return NULL;
+
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+ inode->i_mode = mode;
+ if (dir) {
+ dir->i_mtime = dir->i_ctime = inode->i_mtime;
+ inode->i_uid = current->fsuid;
+ if (dir->i_mode & S_ISGID) {
+ if (S_ISDIR(mode))
+ inode->i_mode |= S_ISGID;
+ inode->i_gid = dir->i_gid;
+ } else {
+ inode->i_gid = current->fsgid;
+ }
+ } else {
+ super = sb->s_fs_info;
+ inode->i_uid = super->uid;
+ inode->i_gid = super->gid;
+ }
+
+ inode->i_blocks = 0;
+ inode->i_mapping->a_ops = &azfs_aops;
+ inode->i_mapping->backing_dev_info = &azfs_bdi;
+
+ switch (mode & S_IFMT) {
+ case S_IFDIR:
+ inode->i_op = &azfs_dir_iops;
+ inode->i_fop = &simple_dir_operations;
+ inc_nlink(inode);
+ break;
+
+ case S_IFREG:
+ inode->i_op = &azfs_reg_iops;
+ inode->i_fop = &azfs_reg_fops;
+ break;
+
+ case S_IFLNK:
+ inode->i_op = &page_symlink_inode_operations;
+ break;
+
+ default:
+ init_special_inode(inode, mode, dev);
+ break;
+ }
+
+ return inode;
+}
+
+/**
+ * azfs_alloc_inode - alloc_inode() method for super_operations
+ * @sb: see super_operations methods
+ */
+static struct inode*
+azfs_alloc_inode(struct super_block *sb)
+{
+ struct azfs_znode *znode;
+
+ znode = kmem_cache_alloc(azfs_znode_cache, GFP_KERNEL);
+ if (znode) {
+ INIT_LIST_HEAD(&znode->block_list);
+ rwlock_init(&znode->lock);
+
+ inode_init_once(&znode->vfs_inode);
+
+ return &znode->vfs_inode;
+ }
+
+ return NULL;
+}
+
+/**
+ * azfs_destroy_inode - destroy_inode() method for super_operations
+ * @inode: see super_operations methods
+ */
+static void
+azfs_destroy_inode(struct inode *inode)
+{
+ kmem_cache_free(azfs_znode_cache, I2Z(inode));
+}
+
+/**
+ * azfs_delete_inode - delete_inode() method for super_operations
+ * @inode: see super_operations methods
+ */
+static void
+azfs_delete_inode(struct inode *inode)
+{
+ if (S_ISREG(inode->i_mode)) {
+ i_size_write(inode, 0);
+ azfs_truncate(inode);
+ }
+ truncate_inode_pages(&inode->i_data, 0);
+ clear_inode(inode);
+}
+
+/**
+ * azfs_statfs - statfs() method for super_operations
+ * @dentry, @stat: see super_operations methods
+ */
+static int
+azfs_statfs(struct dentry *dentry, struct kstatfs *stat)
+{
+ struct super_block *sb;
+ struct azfs_super *super;
+ struct inode *inode;
+ unsigned long inodes, blocks;
+
+ sb = dentry->d_sb;
+ super = sb->s_fs_info;
+
+ inodes = blocks = 0;
+ mutex_lock(&sb->s_lock);
+ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+ inodes++;
+ blocks += inode->i_blocks;
+ }
+ mutex_unlock(&sb->s_lock);
+
+ stat->f_type = AZFS_SUPERBLOCK_MAGIC;
+ stat->f_bsize = super->block_size;
+ stat->f_blocks = super->media_size >> super->block_shift;
+ stat->f_bfree = stat->f_blocks - blocks;
+ stat->f_bavail = stat->f_blocks - blocks;
+ stat->f_files = inodes + blocks;
+ stat->f_ffree = blocks + 1;
+ stat->f_namelen = NAME_MAX;
+
+ return 0;
+}
+
+static struct super_operations azfs_ops = {
+ .alloc_inode = azfs_alloc_inode,
+ .destroy_inode = azfs_destroy_inode,
+ .drop_inode = generic_delete_inode,
+ .delete_inode = azfs_delete_inode,
+ .statfs = azfs_statfs
+};
+
+enum {
+ Opt_blocksize_short,
+ Opt_blocksize_long,
+ Opt_uid,
+ Opt_gid,
+ Opt_err
+};
+
+static match_table_t tokens = {
+ {Opt_blocksize_short, "bs=%u"},
+ {Opt_blocksize_long, "blocksize=%u"},
+ {Opt_uid, "uid=%u"},
+ {Opt_gid, "gid=%u"},
+ {Opt_err, NULL}
+};
+
+/**
+ * azfs_parse_mount_parameters - parse options given to mount with -o
+ * @super: azfs super block extension
+ * @options: comma separated options
+ */
+static int
+azfs_parse_mount_parameters(struct azfs_super *super, char *options)
+{
+ char *option;
+ int token, value;
+ substring_t args[MAX_OPT_ARGS];
+
+ if (!options)
+ return 1;
+
+ while ((option = strsep(&options, ",")) != NULL) {
+ if (!*option)
+ continue;
+
+ token = match_token(option, tokens, args);
+ switch (token) {
+ case Opt_blocksize_short:
+ case Opt_blocksize_long:
+ if (match_int(&args[0], &value))
+ goto syntax_error;
+ super->block_size = value;
+ break;
+
+ case Opt_uid:
+ if (match_int(&args[0], &value))
+ goto syntax_error;
+ super->uid = value;
+ break;
+
+ case Opt_gid:
+ if (match_int(&args[0], &value))
+ goto syntax_error;
+ super->gid = value;
+ break;
+
+ default:
+ goto syntax_error;
+ }
+ }
+
+ return 1;
+
+syntax_error:
+ printk(KERN_ERR "%s: invalid mount option\n",
+ AZFS_FILESYSTEM_NAME);
+
+ return 0;
+}
+
+/**
+ * azfs_fill_super - fill_super routine for get_sb
+ * @sb, @data, @silent: see file_system_type methods
+ */
+static int
+azfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+ struct gendisk *disk;
+ struct azfs_super *super = NULL, *tmp_super;
+ struct azfs_block *block = NULL;
+ struct inode *inode = NULL;
+ void *kaddr;
+ unsigned long pfn;
+ int rc;
+
+ BUG_ON(!sb->s_bdev);
+
+ disk = sb->s_bdev->bd_disk;
+
+ BUG_ON(!disk || !disk->queue);
+
+ if (!disk->fops->direct_access) {
+ printk(KERN_ERR "%s needs a block device with a "
+ "direct_access() method\n",
+ AZFS_FILESYSTEM_NAME);
+ return -ENOSYS;
+ }
+
+ get_device(disk->driverfs_dev);
+
+ sb->s_magic = AZFS_SUPERBLOCK_MAGIC;
+ sb->s_flags = AZFS_SUPERBLOCK_FLAGS;
+ sb->s_op = &azfs_ops;
+ sb->s_maxbytes = get_capacity(disk) * disk->queue->hardsect_size;
+ sb->s_time_gran = 1;
+
+ spin_lock(&super_list.lock);
+ list_for_each_entry(tmp_super, &super_list.head, list)
+ if (tmp_super->blkdev == sb->s_bdev) {
+ super = tmp_super;
+ break;
+ }
+ spin_unlock(&super_list.lock);
+
+ if (super) {
+ if (data && strlen((char*) data))
+ printk(KERN_WARNING "/dev/%s was already mounted with "
+ "%s before, it will be mounted with "
+ "mount options used last time, "
+ "options just given would be ignored\n",
+ disk->disk_name, AZFS_FILESYSTEM_NAME);
+ sb->s_fs_info = super;
+ } else {
+ super = kzalloc(sizeof(struct azfs_super), GFP_KERNEL);
+ if (!super) {
+ rc = -ENOMEM;
+ goto failed;
+ }
+ sb->s_fs_info = super;
+
+ if (!azfs_parse_mount_parameters(super, (char*) data)) {
+ rc = -EINVAL;
+ goto failed;
+ }
+
+ inode = azfs_new_inode(sb, NULL, S_IFDIR | S_IRWXUGO, 0);
+ if (!inode) {
+ rc = -ENOMEM;
+ goto failed;
+ }
+
+ super->root = d_alloc_root(inode);
+ if (!super->root) {
+ rc = -ENOMEM;
+ goto failed;
+ }
+ dget(super->root);
+
+ INIT_LIST_HEAD(&super->list);
+ INIT_LIST_HEAD(&super->block_list);
+ rwlock_init(&super->lock);
+
+ super->media_size = sb->s_maxbytes;
+
+ if (!super->block_size)
+ super->block_size = sb->s_blocksize;
+ super->block_shift = blksize_bits(super->block_size);
+
+ super->sector_size = disk->queue->hardsect_size;
+ super->sector_shift = blksize_bits(super->sector_size);
+
+ super->blkdev = sb->s_bdev;
+
+ block = azfs_block_init(&super->block_list,
+ 0, super->media_size >> super->block_shift);
+ if (!block) {
+ rc = -ENOMEM;
+ goto failed;
+ }
+
+ rc = disk->fops->direct_access(super->blkdev, 0, &kaddr, &pfn);
+ if (rc < 0) {
+ rc = -EFAULT;
+ goto failed;
+ }
+ super->ph_addr = (unsigned long) kaddr;
+
+ super->io_addr = (unsigned long) ioremap_flags(
+ super->ph_addr, super->media_size, _PAGE_NO_CACHE);
+ if (!super->io_addr) {
+ rc = -EFAULT;
+ goto failed;
+ }
+
+ spin_lock(&super_list.lock);
+ list_add(&super->list, &super_list.head);
+ spin_unlock(&super_list.lock);
+ }
+
+ sb->s_root = super->root;
+ disk->driverfs_dev->driver_data = super;
+ disk->driverfs_dev->platform_data = sb;
+
+ if (super->block_size < PAGE_SIZE)
+ printk(KERN_INFO "Block size on %s is smaller then system "
+ "page size: mmap() would not be supported\n",
+ disk->disk_name);
+
+ return 0;
+
+failed:
+ if (super) {
+ sb->s_root = NULL;
+ sb->s_fs_info = NULL;
+ if (block)
+ azfs_block_free(block);
+ if (super->root)
+ dput(super->root);
+ if (inode)
+ iput(inode);
+ disk->driverfs_dev->driver_data = NULL;
+ kfree(super);
+ disk->driverfs_dev->platform_data = NULL;
+ put_device(disk->driverfs_dev);
+ }
+
+ return rc;
+}
+
+/**
+ * azfs_get_sb - get_sb() method for file_system_type
+ * @fs_type, @flags, @dev_name, @data, @mount: see file_system_type methods
+ */
+static int
+azfs_get_sb(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data, struct vfsmount *mount)
+{
+ return get_sb_bdev(fs_type, flags,
+ dev_name, data, azfs_fill_super, mount);
+}
+
+/**
+ * azfs_kill_sb - kill_sb() method for file_system_type
+ * @sb: see file_system_type methods
+ */
+static void
+azfs_kill_sb(struct super_block *sb)
+{
+ sb->s_root = NULL;
+ kill_block_super(sb);
+}
+
+static struct file_system_type azfs_fs = {
+ .owner = THIS_MODULE,
+ .name = AZFS_FILESYSTEM_NAME,
+ .get_sb = azfs_get_sb,
+ .kill_sb = azfs_kill_sb,
+ .fs_flags = AZFS_FILESYSTEM_FLAGS
+};
+
+/**
+ * azfs_init
+ */
+static int __init
+azfs_init(void)
+{
+ int rc;
+
+ INIT_LIST_HEAD(&super_list.head);
+ spin_lock_init(&super_list.lock);
+
+ azfs_znode_cache = kmem_cache_create("azfs_znode_cache",
+ sizeof(struct azfs_znode), 0, AZFS_CACHE_FLAGS, NULL);
+ if (!azfs_znode_cache) {
+ printk(KERN_ERR "Could not allocate inode cache for %s\n",
+ AZFS_FILESYSTEM_NAME);
+ rc = -ENOMEM;
+ goto failed;
+ }
+
+ azfs_block_cache = kmem_cache_create("azfs_block_cache",
+ sizeof(struct azfs_block), 0, AZFS_CACHE_FLAGS, NULL);
+ if (!azfs_block_cache) {
+ printk(KERN_ERR "Could not allocate block cache for %s\n",
+ AZFS_FILESYSTEM_NAME);
+ rc = -ENOMEM;
+ goto failed;
+ }
+
+ rc = register_filesystem(&azfs_fs);
+ if (rc != 0) {
+ printk(KERN_ERR "Could not register %s\n",
+ AZFS_FILESYSTEM_NAME);
+ goto failed;
+ }
+
+ return 0;
+
+failed:
+ if (azfs_block_cache)
+ kmem_cache_destroy(azfs_block_cache);
+
+ if (azfs_znode_cache)
+ kmem_cache_destroy(azfs_znode_cache);
+
+ return rc;
+}
+
+/**
+ * azfs_exit
+ */
+static void __exit
+azfs_exit(void)
+{
+ struct azfs_super *super, *tmp_super;
+ struct azfs_block *block, *tmp_block;
+ struct gendisk *disk;
+
+ spin_lock(&super_list.lock);
+ list_for_each_entry_safe(super, tmp_super, &super_list.head, list) {
+ disk = super->blkdev->bd_disk;
+ list_del(&super->list);
+ iounmap((void*) super->io_addr);
+ write_lock(&super->lock);
+ for_each_block_safe(block, tmp_block, &super->block_list)
+ azfs_block_free(block);
+ write_unlock(&super->lock);
+ disk->driverfs_dev->driver_data = NULL;
+ disk->driverfs_dev->platform_data = NULL;
+ kfree(super);
+ put_device(disk->driverfs_dev);
+ }
+ spin_unlock(&super_list.lock);
+
+ unregister_filesystem(&azfs_fs);
+
+ kmem_cache_destroy(azfs_block_cache);
+ kmem_cache_destroy(azfs_znode_cache);
+}
+
+module_init(azfs_init);
+module_exit(azfs_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Maxim Shchetynin <[email protected]>");
+MODULE_DESCRIPTION("Non-buffered file system for IO devices");

--
Mit freundlichen Grüßen / met vriendelijke groeten / avec regards

Maxim V. Shchetynin
Linux Kernel Entwicklung
IBM Deutschland Entwicklung GmbH
Linux für Cell, Abteilung 3250
Schönaicher Straße 220
71032 Böblingen

Vorsitzender des Aufsichtsrats: Johann Weihen
Geschäftsführung: Herbert Kircher
Sitz der Gesellschaft: Böblingen
Registriergericht: Amtsgericht Stuttgart, HRB 243294

Fahr nur so schnell wie dein Schutzengel fliegen kann!

2008-07-07 19:37:56

by Uli Luckas

[permalink] [raw]
Subject: Re: azfs: initial submit of azfs, a non-buffered filesystem

On Monday, 7. July 2008, Maxim Shchetynin wrote:
> AZFS is a file system which keeps all files on memory mapped random
> access storage.
Hi Maxim,
do you mean "memory backed" instead of "memory mapped"?

regards
Uli

--

------- ROAD ...the handyPC Company - - - ) ) )

Uli Luckas
Software Development

ROAD GmbH
Bennigsenstr. 14 | 12159 Berlin | Germany
fon: +49 (30) 230069 - 64 | fax: +49 (30) 230069 - 69
url: http://www.road.de

Amtsgericht Charlottenburg: HRB 96688 B
Managing directors: Hans-Peter Constien, Hubertus von Streit

2008-07-08 09:11:35

by Maxim Shchetynin

[permalink] [raw]
Subject: Re: azfs: initial submit of azfs, a non-buffered filesystem

Am Mon, 7 Jul 2008 21:37:43 +0200
schrieb Uli Luckas <[email protected]>:

> > AZFS is a file system which keeps all files on memory mapped random
> > access storage.
> Hi Maxim,
> do you mean "memory backed" instead of "memory mapped"?

Right, I have corrected this already in my patch.
Thank you.

--
Mit freundlichen Grüßen / met vriendelijke groeten / avec regards

Maxim V. Shchetynin
Linux Kernel Entwicklung
IBM Deutschland Entwicklung GmbH
Linux für Cell, Abteilung 3250
Schönaicher Straße 220
71032 Böblingen

Vorsitzender des Aufsichtsrats: Johann Weihen
Geschäftsführung: Herbert Kircher
Sitz der Gesellschaft: Böblingen
Registriergericht: Amtsgericht Stuttgart, HRB 243294

Fahr nur so schnell wie dein Schutzengel fliegen kann!

2008-07-08 14:45:40

by Arnd Bergmann

[permalink] [raw]
Subject: Re: AZFS file system proposal

On Monday 07 July 2008, Maxim Shchetynin wrote:
> > > +           if (copy_to_user(target, (void*) pin, size)) {
> > > +                   rc = -EFAULT;
> > > +                   goto out;
> > > +           }
> >
> > Question to the powerpc folks: is copy_to_user safe for an __iomem source?
> > Should there be two copies (memcpy_fromio and copy_to_user) instead?
>
> I leave this question open.
>

Cc:'ing some more people that might have more of a clue on this question.
_memcpy_fromio does a "sync" at the start and an "eieio" at the end.
IFAICT, neither are needed here because the source is always memory.

It also handles unaligned memory accesses, which copy_to_user should
also do correctly, so it *looks* like it should work with just a
copy_to_user, but it still feels wrong to use an __iomem pointer
as the source for a copy_to_user.

Any ideas?

Arnd <><

2008-07-09 06:54:33

by Benjamin Herrenschmidt

[permalink] [raw]
Subject: Re: AZFS file system proposal


> Cc:'ing some more people that might have more of a clue on this question.
> _memcpy_fromio does a "sync" at the start and an "eieio" at the end.
> IFAICT, neither are needed here because the source is always memory.
>
> It also handles unaligned memory accesses, which copy_to_user should
> also do correctly, so it *looks* like it should work with just a
> copy_to_user, but it still feels wrong to use an __iomem pointer
> as the source for a copy_to_user.
>
> Any ideas?

It's a bit nasty yes. The problem is that copy_to/from_user might
do cache tricks which will blow up if the area is non-cacheable.

We have a similar problem with Mark's work on faster copy functions
since things like sys_read() can be called on userspace non-cacheable
memory such as spu local stores.

So I'm not 100% sure what the right approach here. Our copy_tofrom_user
today does dcbt on the source for example, which I hope only turns into
a no-op... The risk is if we start using dcbz.

Cheers,
Ben.

2008-07-09 08:59:21

by Benjamin Herrenschmidt

[permalink] [raw]
Subject: Re: AZFS file system proposal

On Tue, 2008-07-01 at 16:59 +0200, Arnd Bergmann wrote:
> I wouldn't hold up merging the file system for this problem, but
> until it is solved, the Kconfig entry should probably have
> a "depends on PPC".

Better, use an ifdef for powerpc flags, and #else to pgprot_noncached.

Ben.

2008-07-09 09:14:03

by Maxim Shchetynin

[permalink] [raw]
Subject: Re: AZFS file system proposal

Am Wed, 09 Jul 2008 18:58:38 +1000
schrieb Benjamin Herrenschmidt <[email protected]>:

> On Tue, 2008-07-01 at 16:59 +0200, Arnd Bergmann wrote:
> > I wouldn't hold up merging the file system for this problem, but
> > until it is solved, the Kconfig entry should probably have
> > a "depends on PPC".
>
> Better, use an ifdef for powerpc flags, and #else to pgprot_noncached.

Thank you Ben. Then, how about this?

azfs_mmap(struct file *file, struct vm_area_struct *vma)
{
...
...
...
#ifdef CONFIG_PPC
pgprot_t page_prot;
#endif
...
...
...
#ifdef CONFIG_PPC
page_prot = pgprot_val(vma->vm_page_prot);
page_prot |= (_PAGE_NO_CACHE | _PAGE_RW);
page_prot &= ~_PAGE_GUARDED;
vma->vm_page_prot = __pgprot(page_prot);
#else
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
#endif
...
...
...

--
Mit freundlichen Grüßen / met vriendelijke groeten / avec regards

Maxim V. Shchetynin
Linux Kernel Entwicklung
IBM Deutschland Research & Development GmbH
Linux für Cell, Abteilung 3250
Schönaicher Straße 220
71032 Böblingen

Vorsitzender des Aufsichtsrats: Martin Jetter
Geschäftsführung: Herbert Kircher
Sitz der Gesellschaft: Böblingen
Registriergericht: Amtsgericht Stuttgart, HRB 243294

Fahr nur so schnell wie dein Schutzengel fliegen kann!

2008-07-09 09:24:32

by Benjamin Herrenschmidt

[permalink] [raw]
Subject: Re: AZFS file system proposal

On Wed, 2008-07-09 at 11:14 +0200, Maxim Shchetynin wrote:
> Am Wed, 09 Jul 2008 18:58:38 +1000
> schrieb Benjamin Herrenschmidt <[email protected]>:
>
> > On Tue, 2008-07-01 at 16:59 +0200, Arnd Bergmann wrote:
> > > I wouldn't hold up merging the file system for this problem, but
> > > until it is solved, the Kconfig entry should probably have
> > > a "depends on PPC".
> >
> > Better, use an ifdef for powerpc flags, and #else to pgprot_noncached.
>
> Thank you Ben. Then, how about this?
>
> azfs_mmap(struct file *file, struct vm_area_struct *vma)
> {
> ...
> ...
> ...
> #ifdef CONFIG_PPC
> pgprot_t page_prot;
> #endif
> ...
> ...
> ...
> #ifdef CONFIG_PPC
> page_prot = pgprot_val(vma->vm_page_prot);
> page_prot |= (_PAGE_NO_CACHE | _PAGE_RW);
> page_prot &= ~_PAGE_GUARDED;
> vma->vm_page_prot = __pgprot(page_prot);
> #else
> vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
> #endif
> ...

I'd rather do

pgprot_t prot;

#ifdef CONFIG_PPC
prot = <whatever>
#else
prot = pgprot_noncached(...)
#endif
vma->vm_page_prot = prot;

To limit the number of ifdef's

Cheers,
Ben.

2008-07-09 10:58:58

by Maxim Shchetynin

[permalink] [raw]
Subject: Re: AZFS file system proposal

> I'd rather do
>
> pgprot_t prot;
>
> #ifdef CONFIG_PPC
> prot = <whatever>
> #else
> prot = pgprot_noncached(...)
> #endif
> vma->vm_page_prot = prot;

I have changed my patch accordinly. Thank you.

--
Mit freundlichen Grüßen / met vriendelijke groeten / avec regards

Maxim V. Shchetynin
Linux Kernel Entwicklung
IBM Deutschland Research & Development GmbH
Linux für Cell, Abteilung 3250
Schönaicher Straße 220
71032 Böblingen

Vorsitzender des Aufsichtsrats: Martin Jetter
Geschäftsführung: Herbert Kircher
Sitz der Gesellschaft: Böblingen
Registriergericht: Amtsgericht Stuttgart, HRB 243294

Fahr nur so schnell wie dein Schutzengel fliegen kann!