From: Yulei Zhang <[email protected]>
dmemfs (Direct Memory filesystem) is device memory or reserved
memory based filesystem. This kind of memory is special as it
is not managed by kernel and it is without 'struct page'.
The original purpose of dmemfs is to drop the usage of
'struct page' to save extra system memory.
This patch introduces the basic framework of dmemfs and only
mkdir and create regular file are supported.
Signed-off-by: Xiao Guangrong <[email protected]>
Signed-off-by: Yulei Zhang <[email protected]>
---
fs/Kconfig | 1 +
fs/Makefile | 1 +
fs/dmemfs/Kconfig | 13 ++
fs/dmemfs/Makefile | 7 +
fs/dmemfs/inode.c | 275 +++++++++++++++++++++++++++++++++++++
include/uapi/linux/magic.h | 1 +
6 files changed, 298 insertions(+)
create mode 100644 fs/dmemfs/Kconfig
create mode 100644 fs/dmemfs/Makefile
create mode 100644 fs/dmemfs/inode.c
diff --git a/fs/Kconfig b/fs/Kconfig
index aa4c12282301..18e72089426f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -41,6 +41,7 @@ source "fs/btrfs/Kconfig"
source "fs/nilfs2/Kconfig"
source "fs/f2fs/Kconfig"
source "fs/zonefs/Kconfig"
+source "fs/dmemfs/Kconfig"
config FS_DAX
bool "Direct Access (DAX) support"
diff --git a/fs/Makefile b/fs/Makefile
index 1c7b0e3f6daa..10e0302c5902 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -136,3 +136,4 @@ obj-$(CONFIG_EFIVAR_FS) += efivarfs/
obj-$(CONFIG_EROFS_FS) += erofs/
obj-$(CONFIG_VBOXSF_FS) += vboxsf/
obj-$(CONFIG_ZONEFS_FS) += zonefs/
+obj-$(CONFIG_DMEM_FS) += dmemfs/
diff --git a/fs/dmemfs/Kconfig b/fs/dmemfs/Kconfig
new file mode 100644
index 000000000000..d2894a513de0
--- /dev/null
+++ b/fs/dmemfs/Kconfig
@@ -0,0 +1,13 @@
+config DMEM_FS
+ tristate "Direct Memory filesystem support"
+ help
+ dmemfs (Direct Memory filesystem) is device memory or reserved
+ memory based filesystem. This kind of memory is special as it
+ is not managed by kernel and it is without 'struct page'.
+
+ The original purpose of dmemfs is saving extra memory of
+ 'struct page' that reduces the total cost of ownership (TCO)
+ for cloud providers.
+
+ To compile this file system support as a module, choose M here: the
+ module will be called dmemfs.
diff --git a/fs/dmemfs/Makefile b/fs/dmemfs/Makefile
new file mode 100644
index 000000000000..73bdc9cbc87e
--- /dev/null
+++ b/fs/dmemfs/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the linux dmem-filesystem routines.
+#
+obj-$(CONFIG_DMEM_FS) += dmemfs.o
+
+dmemfs-y += inode.o
diff --git a/fs/dmemfs/inode.c b/fs/dmemfs/inode.c
new file mode 100644
index 000000000000..6a8a2d9f94e9
--- /dev/null
+++ b/fs/dmemfs/inode.c
@@ -0,0 +1,275 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/dmemfs/inode.c
+ *
+ * Authors:
+ * Xiao Guangrong <[email protected]>
+ * Chen Zhuo <[email protected]>
+ * Haiwei Li <[email protected]>
+ * Yulei Zhang <[email protected]>
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/file.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/capability.h>
+#include <linux/magic.h>
+#include <linux/mman.h>
+#include <linux/statfs.h>
+#include <linux/pagemap.h>
+#include <linux/parser.h>
+#include <linux/pfn_t.h>
+#include <linux/pagevec.h>
+#include <linux/fs_parser.h>
+#include <linux/seq_file.h>
+
+MODULE_AUTHOR("Tencent Corporation");
+MODULE_LICENSE("GPL v2");
+
+struct dmemfs_mount_opts {
+ unsigned long dpage_size;
+};
+
+struct dmemfs_fs_info {
+ struct dmemfs_mount_opts mount_opts;
+};
+
+enum dmemfs_param {
+ Opt_dpagesize,
+};
+
+const struct fs_parameter_spec dmemfs_fs_parameters[] = {
+ fsparam_string("pagesize", Opt_dpagesize),
+ {}
+};
+
+static int check_dpage_size(unsigned long dpage_size)
+{
+ if (dpage_size != PAGE_SIZE && dpage_size != PMD_SIZE &&
+ dpage_size != PUD_SIZE)
+ return -EINVAL;
+
+ return 0;
+}
+
+static struct inode *
+dmemfs_get_inode(struct super_block *sb, const struct inode *dir, umode_t mode,
+ dev_t dev);
+
+static int
+dmemfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+{
+ struct inode *inode = dmemfs_get_inode(dir->i_sb, dir, mode, dev);
+ int error = -ENOSPC;
+
+ if (inode) {
+ d_instantiate(dentry, inode);
+ dget(dentry); /* Extra count - pin the dentry in core */
+ error = 0;
+ dir->i_mtime = dir->i_ctime = current_time(inode);
+ }
+ return error;
+}
+
+static int dmemfs_create(struct inode *dir, struct dentry *dentry,
+ umode_t mode, bool excl)
+{
+ return dmemfs_mknod(dir, dentry, mode | S_IFREG, 0);
+}
+
+static int dmemfs_mkdir(struct inode *dir, struct dentry *dentry,
+ umode_t mode)
+{
+ int retval = dmemfs_mknod(dir, dentry, mode | S_IFDIR, 0);
+
+ if (!retval)
+ inc_nlink(dir);
+ return retval;
+}
+
+static const struct inode_operations dmemfs_dir_inode_operations = {
+ .create = dmemfs_create,
+ .lookup = simple_lookup,
+ .unlink = simple_unlink,
+ .mkdir = dmemfs_mkdir,
+ .rmdir = simple_rmdir,
+ .rename = simple_rename,
+};
+
+static const struct inode_operations dmemfs_file_inode_operations = {
+ .setattr = simple_setattr,
+ .getattr = simple_getattr,
+};
+
+int dmemfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ return 0;
+}
+
+static const struct file_operations dmemfs_file_operations = {
+ .mmap = dmemfs_file_mmap,
+};
+
+static int dmemfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct dmemfs_fs_info *fsi = fc->s_fs_info;
+ struct fs_parse_result result;
+ int opt, ret;
+
+ opt = fs_parse(fc, dmemfs_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_dpagesize:
+ fsi->mount_opts.dpage_size = memparse(param->string, NULL);
+ ret = check_dpage_size(fsi->mount_opts.dpage_size);
+ if (ret) {
+ pr_warn("dmemfs: unknown pagesize %x.\n",
+ result.uint_32);
+ return ret;
+ }
+ break;
+ default:
+ pr_warn("dmemfs: unknown mount option [%x].\n",
+ opt);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+struct inode *dmemfs_get_inode(struct super_block *sb,
+ const struct inode *dir, umode_t mode, dev_t dev)
+{
+ struct inode *inode = new_inode(sb);
+
+ if (inode) {
+ inode->i_ino = get_next_ino();
+ inode_init_owner(inode, dir, mode);
+ inode->i_mapping->a_ops = &empty_aops;
+ mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
+ mapping_set_unevictable(inode->i_mapping);
+ inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ switch (mode & S_IFMT) {
+ default:
+ init_special_inode(inode, mode, dev);
+ break;
+ case S_IFREG:
+ inode->i_op = &dmemfs_file_inode_operations;
+ inode->i_fop = &dmemfs_file_operations;
+ break;
+ case S_IFDIR:
+ inode->i_op = &dmemfs_dir_inode_operations;
+ inode->i_fop = &simple_dir_operations;
+
+ /*
+ * directory inodes start off with i_nlink == 2
+ * (for "." entry)
+ */
+ inc_nlink(inode);
+ break;
+ case S_IFLNK:
+ inode->i_op = &page_symlink_inode_operations;
+ break;
+ }
+ }
+ return inode;
+}
+
+static int dmemfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ simple_statfs(dentry, buf);
+ buf->f_bsize = dentry->d_sb->s_blocksize;
+
+ return 0;
+}
+
+static const struct super_operations dmemfs_ops = {
+ .statfs = dmemfs_statfs,
+ .drop_inode = generic_delete_inode,
+};
+
+static int
+dmemfs_fill_super(struct super_block *sb, struct fs_context *fc)
+{
+ struct inode *inode;
+ struct dmemfs_fs_info *fsi = sb->s_fs_info;
+
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_blocksize = fsi->mount_opts.dpage_size;
+ sb->s_blocksize_bits = ilog2(fsi->mount_opts.dpage_size);
+ sb->s_magic = DMEMFS_MAGIC;
+ sb->s_op = &dmemfs_ops;
+ sb->s_time_gran = 1;
+
+ inode = dmemfs_get_inode(sb, NULL, S_IFDIR, 0);
+ sb->s_root = d_make_root(inode);
+ if (!sb->s_root)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int dmemfs_get_tree(struct fs_context *fc)
+{
+ return get_tree_nodev(fc, dmemfs_fill_super);
+}
+
+static void dmemfs_free_fc(struct fs_context *fc)
+{
+ kfree(fc->s_fs_info);
+}
+
+static const struct fs_context_operations dmemfs_context_ops = {
+ .free = dmemfs_free_fc,
+ .parse_param = dmemfs_parse_param,
+ .get_tree = dmemfs_get_tree,
+};
+
+int dmemfs_init_fs_context(struct fs_context *fc)
+{
+ struct dmemfs_fs_info *fsi;
+
+ fsi = kzalloc(sizeof(*fsi), GFP_KERNEL);
+ if (!fsi)
+ return -ENOMEM;
+
+ fsi->mount_opts.dpage_size = PAGE_SIZE;
+ fc->s_fs_info = fsi;
+ fc->ops = &dmemfs_context_ops;
+ return 0;
+}
+
+static void dmemfs_kill_sb(struct super_block *sb)
+{
+ kill_litter_super(sb);
+}
+
+static struct file_system_type dmemfs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "dmemfs",
+ .init_fs_context = dmemfs_init_fs_context,
+ .kill_sb = dmemfs_kill_sb,
+};
+
+static int __init dmemfs_init(void)
+{
+ int ret;
+
+ ret = register_filesystem(&dmemfs_fs_type);
+
+ return ret;
+}
+
+static void __exit dmemfs_uninit(void)
+{
+ unregister_filesystem(&dmemfs_fs_type);
+}
+
+module_init(dmemfs_init)
+module_exit(dmemfs_uninit)
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index f3956fc11de6..3fbd06661c8c 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -97,5 +97,6 @@
#define DEVMEM_MAGIC 0x454d444d /* "DMEM" */
#define Z3FOLD_MAGIC 0x33
#define PPC_CMM_MAGIC 0xc7571590
+#define DMEMFS_MAGIC 0x2ace90c6
#endif /* __LINUX_MAGIC_H__ */
--
2.28.0
On Thu, Oct 08, 2020 at 03:53:51PM +0800, [email protected] wrote:
> +static struct inode *
> +dmemfs_get_inode(struct super_block *sb, const struct inode *dir, umode_t mode,
> + dev_t dev);
WTF is 'dev' for?
> +static int
> +dmemfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
> +{
> + struct inode *inode = dmemfs_get_inode(dir->i_sb, dir, mode, dev);
> + int error = -ENOSPC;
> +
> + if (inode) {
> + d_instantiate(dentry, inode);
> + dget(dentry); /* Extra count - pin the dentry in core */
> + error = 0;
> + dir->i_mtime = dir->i_ctime = current_time(inode);
> + }
> + return error;
> +}
... same here, seeing that you only call that thing from the next two functions
and you do *not* provide ->mknod() as a method (unsurprisingly - what would
device nodes do there?)
> +static int dmemfs_create(struct inode *dir, struct dentry *dentry,
> + umode_t mode, bool excl)
> +{
> + return dmemfs_mknod(dir, dentry, mode | S_IFREG, 0);
> +}
> +
> +static int dmemfs_mkdir(struct inode *dir, struct dentry *dentry,
> + umode_t mode)
> +{
> + int retval = dmemfs_mknod(dir, dentry, mode | S_IFDIR, 0);
> +
> + if (!retval)
> + inc_nlink(dir);
> + return retval;
> +}
> +int dmemfs_file_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> + return 0;
> +}
> +
> +static const struct file_operations dmemfs_file_operations = {
> + .mmap = dmemfs_file_mmap,
> +};
Er... Is that a placeholder for later in the series? Because as it is,
it makes no sense whatsoever - "it can be mmapped, but any access to the
mapped area will segfault".
> +struct inode *dmemfs_get_inode(struct super_block *sb,
> + const struct inode *dir, umode_t mode, dev_t dev)
> +{
> + struct inode *inode = new_inode(sb);
> +
> + if (inode) {
> + inode->i_ino = get_next_ino();
> + inode_init_owner(inode, dir, mode);
> + inode->i_mapping->a_ops = &empty_aops;
> + mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
> + mapping_set_unevictable(inode->i_mapping);
> + inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
> + switch (mode & S_IFMT) {
> + default:
> + init_special_inode(inode, mode, dev);
> + break;
> + case S_IFREG:
> + inode->i_op = &dmemfs_file_inode_operations;
> + inode->i_fop = &dmemfs_file_operations;
> + break;
> + case S_IFDIR:
> + inode->i_op = &dmemfs_dir_inode_operations;
> + inode->i_fop = &simple_dir_operations;
> +
> + /*
> + * directory inodes start off with i_nlink == 2
> + * (for "." entry)
> + */
> + inc_nlink(inode);
> + break;
> + case S_IFLNK:
> + inode->i_op = &page_symlink_inode_operations;
> + break;
Where would symlinks come from? Or anything other than regular files and
directories, for that matter...
On Wed, Nov 11, 2020 at 4:04 AM Al Viro <[email protected]> wrote:
>
> On Thu, Oct 08, 2020 at 03:53:51PM +0800, [email protected] wrote:
>
> > +static struct inode *
> > +dmemfs_get_inode(struct super_block *sb, const struct inode *dir, umode_t mode,
> > + dev_t dev);
>
> WTF is 'dev' for?
>
> > +static int
> > +dmemfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
> > +{
> > + struct inode *inode = dmemfs_get_inode(dir->i_sb, dir, mode, dev);
> > + int error = -ENOSPC;
> > +
> > + if (inode) {
> > + d_instantiate(dentry, inode);
> > + dget(dentry); /* Extra count - pin the dentry in core */
> > + error = 0;
> > + dir->i_mtime = dir->i_ctime = current_time(inode);
> > + }
> > + return error;
> > +}
>
> ... same here, seeing that you only call that thing from the next two functions
> and you do *not* provide ->mknod() as a method (unsurprisingly - what would
> device nodes do there?)
>
Thanks for pointing this out. we may need support the mknod method, otherwise
the dev is redundant and need to be removed.
> > +static int dmemfs_create(struct inode *dir, struct dentry *dentry,
> > + umode_t mode, bool excl)
> > +{
> > + return dmemfs_mknod(dir, dentry, mode | S_IFREG, 0);
> > +}
> > +
> > +static int dmemfs_mkdir(struct inode *dir, struct dentry *dentry,
> > + umode_t mode)
> > +{
> > + int retval = dmemfs_mknod(dir, dentry, mode | S_IFDIR, 0);
> > +
> > + if (!retval)
> > + inc_nlink(dir);
> > + return retval;
> > +}
>
> > +int dmemfs_file_mmap(struct file *file, struct vm_area_struct *vma)
> > +{
> > + return 0;
> > +}
> > +
> > +static const struct file_operations dmemfs_file_operations = {
> > + .mmap = dmemfs_file_mmap,
> > +};
>
> Er... Is that a placeholder for later in the series? Because as it is,
> it makes no sense whatsoever - "it can be mmapped, but any access to the
> mapped area will segfault".
>
Yes, we seperate the full implementation for dmemfs_file_mmap into
patch 05/35, it
will assign the interfaces to handle the page fault.
> > +struct inode *dmemfs_get_inode(struct super_block *sb,
> > + const struct inode *dir, umode_t mode, dev_t dev)
> > +{
> > + struct inode *inode = new_inode(sb);
> > +
> > + if (inode) {
> > + inode->i_ino = get_next_ino();
> > + inode_init_owner(inode, dir, mode);
> > + inode->i_mapping->a_ops = &empty_aops;
> > + mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
> > + mapping_set_unevictable(inode->i_mapping);
> > + inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
> > + switch (mode & S_IFMT) {
> > + default:
> > + init_special_inode(inode, mode, dev);
> > + break;
> > + case S_IFREG:
> > + inode->i_op = &dmemfs_file_inode_operations;
> > + inode->i_fop = &dmemfs_file_operations;
> > + break;
> > + case S_IFDIR:
> > + inode->i_op = &dmemfs_dir_inode_operations;
> > + inode->i_fop = &simple_dir_operations;
> > +
> > + /*
> > + * directory inodes start off with i_nlink == 2
> > + * (for "." entry)
> > + */
> > + inc_nlink(inode);
> > + break;
> > + case S_IFLNK:
> > + inode->i_op = &page_symlink_inode_operations;
> > + break;
>
> Where would symlinks come from? Or anything other than regular files and
> directories, for that matter...
You are right, so far it just supports regular files and directories.
On Wed, Nov 11, 2020 at 04:53:00PM +0800, yulei zhang wrote:
> > ... same here, seeing that you only call that thing from the next two functions
> > and you do *not* provide ->mknod() as a method (unsurprisingly - what would
> > device nodes do there?)
> >
>
> Thanks for pointing this out. we may need support the mknod method, otherwise
> the dev is redundant and need to be removed.
I'd suggest turning that into (static) __create_file(....) with
static int dmemfs_create(struct inode *dir, struct dentry *dentry,
umode_t mode, bool excl)
{
return __create_file(dir, dentry, mode | S_IFREG);
}
static int dmemfs_mkdir(struct inode *dir, struct dentry *dentry,
umode_t mode)
{
return __create_file(dir, dentry, mode | S_IFDIR);
}
(i.e. even inc_nlink() of parent folded into that).
[snip]
> Yes, we seperate the full implementation for dmemfs_file_mmap into
> patch 05/35, it
> will assign the interfaces to handle the page fault.
It would be less confusing to move the introduction of ->mmap() to that patch,
then.
On Thu, Nov 12, 2020 at 7:09 AM Al Viro <[email protected]> wrote:
>
> On Wed, Nov 11, 2020 at 04:53:00PM +0800, yulei zhang wrote:
>
> > > ... same here, seeing that you only call that thing from the next two functions
> > > and you do *not* provide ->mknod() as a method (unsurprisingly - what would
> > > device nodes do there?)
> > >
> >
> > Thanks for pointing this out. we may need support the mknod method, otherwise
> > the dev is redundant and need to be removed.
>
> I'd suggest turning that into (static) __create_file(....) with
>
> static int dmemfs_create(struct inode *dir, struct dentry *dentry,
> umode_t mode, bool excl)
> {
> return __create_file(dir, dentry, mode | S_IFREG);
> }
>
> static int dmemfs_mkdir(struct inode *dir, struct dentry *dentry,
> umode_t mode)
> {
> return __create_file(dir, dentry, mode | S_IFDIR);
> }
>
> (i.e. even inc_nlink() of parent folded into that).
>
> [snip]
>
> > Yes, we seperate the full implementation for dmemfs_file_mmap into
> > patch 05/35, it
> > will assign the interfaces to handle the page fault.
>
> It would be less confusing to move the introduction of ->mmap() to that patch,
> then.
Thanks for the suggestion. will refactor the patches accordingly.