Posting the latest version to the virtio mailing list as well. I guess patches
15 and 16 are the most interesting to the virt community.
The reasons for creating a new fs are spelled out in the previous posting:
https://lore.kernel.org/linux-fsdevel/[email protected]/
Git tree for this version is available here:
git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse.git#virtiofs-v4
Thanks,
Miklos
---
David Howells (3):
vfs: Create fs_context-aware mount_bdev() replacement
fuse: convert to use the new mount API
vfs: subtype handling moved to fuse
Miklos Szeredi (2):
fuse: delete dentry if timeout is zero
fuse: dissociate DESTROY from fuseblk
Stefan Hajnoczi (7):
fuse: export fuse_request_end()
fuse: export fuse_len_args()
fuse: export fuse_get_unique()
fuse: extract fuse_fill_super_common()
fuse: add fuse_iqueue_ops callbacks
virtio-fs: add virtiofs filesystem
virtio-fs: add Documentation/filesystems/virtiofs.rst
Vivek Goyal (4):
fuse: export fuse_send_init_request()
fuse: export fuse_dequeue_forget() function
fuse: separate fuse device allocation and installation in fuse_conn
fuse: allow skipping control interface and forced unmount
Documentation/filesystems/index.rst | 10 +
Documentation/filesystems/virtiofs.rst | 60 ++
MAINTAINERS | 11 +
fs/fs_context.c | 16 +-
fs/fuse/Kconfig | 11 +
fs/fuse/Makefile | 1 +
fs/fuse/cuse.c | 4 +-
fs/fuse/dev.c | 93 +-
fs/fuse/dir.c | 28 +-
fs/fuse/fuse_i.h | 118 ++-
fs/fuse/inode.c | 464 +++++-----
fs/fuse/virtio_fs.c | 1072 ++++++++++++++++++++++++
fs/namespace.c | 2 -
fs/proc_namespace.c | 2 +-
fs/super.c | 116 ++-
include/linux/fs_context.h | 10 +-
include/uapi/linux/virtio_fs.h | 19 +
include/uapi/linux/virtio_ids.h | 1 +
18 files changed, 1774 insertions(+), 264 deletions(-)
create mode 100644 Documentation/filesystems/virtiofs.rst
create mode 100644 fs/fuse/virtio_fs.c
create mode 100644 include/uapi/linux/virtio_fs.h
--
2.21.0
From: David Howells <[email protected]>
Convert the fuse filesystem to the new internal mount API as the old
one will be obsoleted and removed. This allows greater flexibility in
communication of mount parameters between userspace, the VFS and the
filesystem.
See Documentation/filesystems/mount_api.txt for more information.
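For context, this is roughly how a mount looks through the new API from
userspace. A hedged sketch only: at the time of this series the
fsopen(2)/fsconfig(2)/fsmount(2) syscalls have no glibc wrappers, so the raw
x86-64 syscall numbers are spelled out, and devfd is assumed to be a
previously opened /dev/fuse descriptor (error handling of the fsconfig
calls is elided):

  #include <stdio.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  #define __NR_fsopen          430     /* x86-64 */
  #define __NR_fsconfig        431
  #define __NR_fsmount         432
  #define FSCONFIG_SET_STRING    1
  #define FSCONFIG_CMD_CREATE    6

  /* Sketch: each parameter is sent individually instead of as one
   * comma-separated options string. */
  static int mount_fuse(int devfd)
  {
          char fdstr[16];
          int fsfd;

          fsfd = syscall(__NR_fsopen, "fuse", 0);
          if (fsfd < 0)
                  return -1;

          snprintf(fdstr, sizeof(fdstr), "%d", devfd);
          syscall(__NR_fsconfig, fsfd, FSCONFIG_SET_STRING, "fd", fdstr, 0);
          syscall(__NR_fsconfig, fsfd, FSCONFIG_SET_STRING, "rootmode", "40000", 0);
          syscall(__NR_fsconfig, fsfd, FSCONFIG_SET_STRING, "user_id", "0", 0);
          syscall(__NR_fsconfig, fsfd, FSCONFIG_SET_STRING, "group_id", "0", 0);
          syscall(__NR_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);

          /* returns a mount fd; attach it with move_mount(2) */
          return syscall(__NR_fsmount, fsfd, 0, 0);
  }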
Signed-off-by: David Howells <[email protected]>
Signed-off-by: Al Viro <[email protected]>
Signed-off-by: Miklos Szeredi <[email protected]>
---
fs/fuse/inode.c | 292 +++++++++++++++++++++++++++---------------------
1 file changed, 167 insertions(+), 125 deletions(-)
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 4bb885b0f032..2597ed237ada 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -15,7 +15,8 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
-#include <linux/parser.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/statfs.h>
#include <linux/random.h>
#include <linux/sched.h>
@@ -59,7 +60,13 @@ MODULE_PARM_DESC(max_user_congthresh,
/** Congestion starts at 75% of maximum */
#define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4)
-struct fuse_mount_data {
+#ifdef CONFIG_BLOCK
+static struct file_system_type fuseblk_fs_type;
+#endif
+
+struct fuse_fs_context {
+ const char *subtype;
+ bool is_bdev;
int fd;
unsigned rootmode;
kuid_t user_id;
@@ -443,6 +450,8 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
}
enum {
+ OPT_SOURCE,
+ OPT_SUBTYPE,
OPT_FD,
OPT_ROOTMODE,
OPT_USER_ID,
@@ -454,111 +463,110 @@ enum {
OPT_ERR
};
-static const match_table_t tokens = {
- {OPT_FD, "fd=%u"},
- {OPT_ROOTMODE, "rootmode=%o"},
- {OPT_USER_ID, "user_id=%u"},
- {OPT_GROUP_ID, "group_id=%u"},
- {OPT_DEFAULT_PERMISSIONS, "default_permissions"},
- {OPT_ALLOW_OTHER, "allow_other"},
- {OPT_MAX_READ, "max_read=%u"},
- {OPT_BLKSIZE, "blksize=%u"},
- {OPT_ERR, NULL}
+static const struct fs_parameter_spec fuse_param_specs[] = {
+ fsparam_string ("source", OPT_SOURCE),
+ fsparam_u32 ("fd", OPT_FD),
+ fsparam_u32oct ("rootmode", OPT_ROOTMODE),
+ fsparam_u32 ("user_id", OPT_USER_ID),
+ fsparam_u32 ("group_id", OPT_GROUP_ID),
+ fsparam_flag ("default_permissions", OPT_DEFAULT_PERMISSIONS),
+ fsparam_flag ("allow_other", OPT_ALLOW_OTHER),
+ fsparam_u32 ("max_read", OPT_MAX_READ),
+ fsparam_u32 ("blksize", OPT_BLKSIZE),
+ __fsparam(fs_param_is_string, "subtype", OPT_SUBTYPE,
+ fs_param_v_optional),
+ {}
+};
+
+static const struct fs_parameter_description fuse_fs_parameters = {
+ .name = "fuse",
+ .specs = fuse_param_specs,
};
-static int fuse_match_uint(substring_t *s, unsigned int *res)
+static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- int err = -ENOMEM;
- char *buf = match_strdup(s);
- if (buf) {
- err = kstrtouint(buf, 10, res);
- kfree(buf);
+ struct fs_parse_result result;
+ struct fuse_fs_context *ctx = fc->fs_private;
+ int opt;
+
+ opt = fs_parse(fc, &fuse_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case OPT_SOURCE:
+ if (fc->source)
+ return invalf(fc, "fuse: Multiple sources specified");
+ fc->source = param->string;
+ param->string = NULL;
+ break;
+
+ case OPT_SUBTYPE:
+ if (ctx->subtype)
+ return invalf(fc, "fuse: Multiple subtypes specified");
+ ctx->subtype = param->string;
+ param->string = NULL;
+ return 0;
+
+ case OPT_FD:
+ ctx->fd = result.uint_32;
+ ctx->fd_present = 1;
+ break;
+
+ case OPT_ROOTMODE:
+ if (!fuse_valid_type(result.uint_32))
+ return invalf(fc, "fuse: Invalid rootmode");
+ ctx->rootmode = result.uint_32;
+ ctx->rootmode_present = 1;
+ break;
+
+ case OPT_USER_ID:
+ ctx->user_id = make_kuid(fc->user_ns, result.uint_32);
+ if (!uid_valid(ctx->user_id))
+ return invalf(fc, "fuse: Invalid user_id");
+ ctx->user_id_present = 1;
+ break;
+
+ case OPT_GROUP_ID:
+ ctx->group_id = make_kgid(fc->user_ns, result.uint_32);
+ if (!gid_valid(ctx->group_id))
+ return invalf(fc, "fuse: Invalid group_id");
+ ctx->group_id_present = 1;
+ break;
+
+ case OPT_DEFAULT_PERMISSIONS:
+ ctx->default_permissions = 1;
+ break;
+
+ case OPT_ALLOW_OTHER:
+ ctx->allow_other = 1;
+ break;
+
+ case OPT_MAX_READ:
+ ctx->max_read = result.uint_32;
+ break;
+
+ case OPT_BLKSIZE:
+ if (!ctx->is_bdev)
+ return invalf(fc, "fuse: blksize only supported for fuseblk");
+ ctx->blksize = result.uint_32;
+ break;
+
+ default:
+ return -EINVAL;
}
- return err;
+
+ return 0;
}
-static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev,
- struct user_namespace *user_ns)
+static void fuse_free_fc(struct fs_context *fc)
{
- char *p;
- memset(d, 0, sizeof(struct fuse_mount_data));
- d->max_read = ~0;
- d->blksize = FUSE_DEFAULT_BLKSIZE;
-
- while ((p = strsep(&opt, ",")) != NULL) {
- int token;
- int value;
- unsigned uv;
- substring_t args[MAX_OPT_ARGS];
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case OPT_FD:
- if (match_int(&args[0], &value))
- return 0;
- d->fd = value;
- d->fd_present = 1;
- break;
-
- case OPT_ROOTMODE:
- if (match_octal(&args[0], &value))
- return 0;
- if (!fuse_valid_type(value))
- return 0;
- d->rootmode = value;
- d->rootmode_present = 1;
- break;
-
- case OPT_USER_ID:
- if (fuse_match_uint(&args[0], &uv))
- return 0;
- d->user_id = make_kuid(user_ns, uv);
- if (!uid_valid(d->user_id))
- return 0;
- d->user_id_present = 1;
- break;
-
- case OPT_GROUP_ID:
- if (fuse_match_uint(&args[0], &uv))
- return 0;
- d->group_id = make_kgid(user_ns, uv);
- if (!gid_valid(d->group_id))
- return 0;
- d->group_id_present = 1;
- break;
-
- case OPT_DEFAULT_PERMISSIONS:
- d->default_permissions = 1;
- break;
-
- case OPT_ALLOW_OTHER:
- d->allow_other = 1;
- break;
-
- case OPT_MAX_READ:
- if (match_int(&args[0], &value))
- return 0;
- d->max_read = value;
- break;
-
- case OPT_BLKSIZE:
- if (!is_bdev || match_int(&args[0], &value))
- return 0;
- d->blksize = value;
- break;
-
- default:
- return 0;
- }
- }
+ struct fuse_fs_context *ctx = fc->fs_private;
- if (!d->fd_present || !d->rootmode_present ||
- !d->user_id_present || !d->group_id_present)
- return 0;
-
- return 1;
+ if (ctx) {
+ kfree(ctx->subtype);
+ kfree(ctx);
+ }
}
static int fuse_show_options(struct seq_file *m, struct dentry *root)
@@ -1075,12 +1083,12 @@ void fuse_dev_free(struct fuse_dev *fud)
}
EXPORT_SYMBOL_GPL(fuse_dev_free);
-static int fuse_fill_super(struct super_block *sb, void *data, int silent)
+static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
{
+ struct fuse_fs_context *ctx = fsc->fs_private;
struct fuse_dev *fud;
struct fuse_conn *fc;
struct inode *root;
- struct fuse_mount_data d;
struct file *file;
struct dentry *root_dentry;
struct fuse_req *init_req;
@@ -1093,19 +1101,19 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION);
- if (!parse_fuse_opt(data, &d, is_bdev, sb->s_user_ns))
- goto err;
-
if (is_bdev) {
#ifdef CONFIG_BLOCK
err = -EINVAL;
- if (!sb_set_blocksize(sb, d.blksize))
+ if (!sb_set_blocksize(sb, ctx->blksize))
goto err;
#endif
} else {
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
}
+
+ sb->s_subtype = ctx->subtype;
+ ctx->subtype = NULL;
sb->s_magic = FUSE_SUPER_MAGIC;
sb->s_op = &fuse_super_operations;
sb->s_xattr = fuse_xattr_handlers;
@@ -1116,7 +1124,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
if (sb->s_user_ns != &init_user_ns)
sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER;
- file = fget(d.fd);
+ file = fget(ctx->fd);
err = -EINVAL;
if (!file)
goto err;
@@ -1159,17 +1167,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
fc->dont_mask = 1;
sb->s_flags |= SB_POSIXACL;
- fc->default_permissions = d.default_permissions;
- fc->allow_other = d.allow_other;
- fc->user_id = d.user_id;
- fc->group_id = d.group_id;
- fc->max_read = max_t(unsigned, 4096, d.max_read);
+ fc->default_permissions = ctx->default_permissions;
+ fc->allow_other = ctx->allow_other;
+ fc->user_id = ctx->user_id;
+ fc->group_id = ctx->group_id;
+ fc->max_read = max_t(unsigned, 4096, ctx->max_read);
/* Used by get_root_inode() */
sb->s_fs_info = fc;
err = -ENOMEM;
- root = fuse_get_root_inode(sb, d.rootmode);
+ root = fuse_get_root_inode(sb, ctx->rootmode);
sb->s_d_op = &fuse_root_dentry_operations;
root_dentry = d_make_root(root);
if (!root_dentry)
@@ -1229,11 +1237,50 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
return err;
}
-static struct dentry *fuse_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name,
- void *raw_data)
+static int fuse_get_tree(struct fs_context *fc)
{
- return mount_nodev(fs_type, flags, raw_data, fuse_fill_super);
+ struct fuse_fs_context *ctx = fc->fs_private;
+
+ if (!ctx->fd_present || !ctx->rootmode_present ||
+ !ctx->user_id_present || !ctx->group_id_present)
+ return -EINVAL;
+
+#ifdef CONFIG_BLOCK
+ if (ctx->is_bdev)
+ return vfs_get_block_super(fc, fuse_fill_super);
+#endif
+
+ return get_tree_nodev(fc, fuse_fill_super);
+}
+
+static const struct fs_context_operations fuse_context_ops = {
+ .free = fuse_free_fc,
+ .parse_param = fuse_parse_param,
+ .get_tree = fuse_get_tree,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+static int fuse_init_fs_context(struct fs_context *fc)
+{
+ struct fuse_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct fuse_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->max_read = ~0;
+ ctx->blksize = FUSE_DEFAULT_BLKSIZE;
+
+#ifdef CONFIG_BLOCK
+ if (fc->fs_type == &fuseblk_fs_type)
+ ctx->is_bdev = true;
+#endif
+
+ fc->fs_private = ctx;
+ fc->ops = &fuse_context_ops;
+ return 0;
}
static void fuse_sb_destroy(struct super_block *sb)
@@ -1262,19 +1309,13 @@ static struct file_system_type fuse_fs_type = {
.owner = THIS_MODULE,
.name = "fuse",
.fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT,
- .mount = fuse_mount,
+ .init_fs_context = fuse_init_fs_context,
+ .parameters = &fuse_fs_parameters,
.kill_sb = fuse_kill_sb_anon,
};
MODULE_ALIAS_FS("fuse");
#ifdef CONFIG_BLOCK
-static struct dentry *fuse_mount_blk(struct file_system_type *fs_type,
- int flags, const char *dev_name,
- void *raw_data)
-{
- return mount_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super);
-}
-
static void fuse_kill_sb_blk(struct super_block *sb)
{
fuse_sb_destroy(sb);
@@ -1284,7 +1325,8 @@ static void fuse_kill_sb_blk(struct super_block *sb)
static struct file_system_type fuseblk_fs_type = {
.owner = THIS_MODULE,
.name = "fuseblk",
- .mount = fuse_mount_blk,
+ .init_fs_context = fuse_init_fs_context,
+ .parameters = &fuse_fs_parameters,
.kill_sb = fuse_kill_sb_blk,
.fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE,
};
--
2.21.0
From: David Howells <[email protected]>
With subtype handling moved into fuse, the now-unused vfs code can be removed.
Don't pass an empty subtype (same as if the ->parse callback isn't called).
The bits that are left involve determining whether it's permitted to split the
filesystem type string passed in to mount(2). Consequently, we cannot get rid
of the FS_HAS_SUBTYPE flag unless we define that a type string with a dot in
it always indicates a subtype specification.
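For illustration, a simplified sketch (not a verbatim quote of the remaining
VFS code) of the splitting rule that FS_HAS_SUBTYPE still guards:

  /* mount(2) with type "fuse.sshfs" splits at the first dot only
   * because fuse sets FS_HAS_SUBTYPE; for any other filesystem the
   * dot would just be part of the type name.
   */
  char *subtype = strchr(fstype, '.');

  if (subtype && (type->fs_flags & FS_HAS_SUBTYPE)) {
          subtype++;                      /* -> "sshfs" */
          if (!*subtype)
                  return -EINVAL;         /* "fuse." is rejected */
  }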
Signed-off-by: David Howells <[email protected]>
Signed-off-by: Al Viro <[email protected]>
Signed-off-by: Miklos Szeredi <[email protected]>
---
fs/fs_context.c | 14 --------------
fs/fuse/inode.c | 3 +--
fs/namespace.c | 2 --
fs/proc_namespace.c | 2 +-
fs/super.c | 5 -----
include/linux/fs_context.h | 1 -
6 files changed, 2 insertions(+), 25 deletions(-)
diff --git a/fs/fs_context.c b/fs/fs_context.c
index 270ecae32216..f6dee3b2b7de 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -508,7 +508,6 @@ void put_fs_context(struct fs_context *fc)
put_net(fc->net_ns);
put_user_ns(fc->user_ns);
put_cred(fc->cred);
- kfree(fc->subtype);
put_fc_log(fc);
put_filesystem(fc->fs_type);
kfree(fc->source);
@@ -575,17 +574,6 @@ static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param)
return 0;
}
- if ((fc->fs_type->fs_flags & FS_HAS_SUBTYPE) &&
- strcmp(param->key, "subtype") == 0) {
- if (param->type != fs_value_is_string)
- return invalf(fc, "VFS: Legacy: Non-string subtype");
- if (fc->subtype)
- return invalf(fc, "VFS: Legacy: Multiple subtype");
- fc->subtype = param->string;
- param->string = NULL;
- return 0;
- }
-
if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS)
return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options");
@@ -742,8 +730,6 @@ void vfs_clean_context(struct fs_context *fc)
fc->s_fs_info = NULL;
fc->sb_flags = 0;
security_free_mnt_opts(&fc->security);
- kfree(fc->subtype);
- fc->subtype = NULL;
kfree(fc->source);
fc->source = NULL;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2597ed237ada..e3375ce8e97f 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -473,8 +473,7 @@ static const struct fs_parameter_spec fuse_param_specs[] = {
fsparam_flag ("allow_other", OPT_ALLOW_OTHER),
fsparam_u32 ("max_read", OPT_MAX_READ),
fsparam_u32 ("blksize", OPT_BLKSIZE),
- __fsparam(fs_param_is_string, "subtype", OPT_SUBTYPE,
- fs_param_v_optional),
+ fsparam_string ("subtype", OPT_SUBTYPE),
{}
};
diff --git a/fs/namespace.c b/fs/namespace.c
index d28d30b13043..105f995543f6 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2768,8 +2768,6 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
put_filesystem(type);
return -EINVAL;
}
- } else {
- subtype = "";
}
}
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index e16fb8f2049e..273ee82d8aa9 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -88,7 +88,7 @@ static inline void mangle(struct seq_file *m, const char *s)
static void show_type(struct seq_file *m, struct super_block *sb)
{
mangle(m, sb->s_type->name);
- if (sb->s_subtype && sb->s_subtype[0]) {
+ if (sb->s_subtype) {
seq_putc(m, '.');
mangle(m, sb->s_subtype);
}
diff --git a/fs/super.c b/fs/super.c
index 80b56bc7d2db..e30a4279784c 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1535,11 +1535,6 @@ int vfs_get_tree(struct fs_context *fc)
sb = fc->root->d_sb;
WARN_ON(!sb->s_bdi);
- if (fc->subtype && !sb->s_subtype) {
- sb->s_subtype = fc->subtype;
- fc->subtype = NULL;
- }
-
/*
* Write barrier is for super_cache_count(). We place it before setting
* SB_BORN as the data dependency between the two functions is the
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index ed5b4349671e..461d37912bed 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -97,7 +97,6 @@ struct fs_context {
const struct cred *cred; /* The mounter's credentials */
struct fc_log *log; /* Logging buffer */
const char *source; /* The source name (eg. dev path) */
- const char *subtype; /* The subtype to set on the superblock */
void *security; /* Linux S&M options */
void *s_fs_info; /* Proposed s_fs_info */
fmode_t bdev_mode; /* File open mode for bdev */
--
2.21.0
From: Stefan Hajnoczi <[email protected]>
virtio-fs will need to complete requests from outside fs/fuse/dev.c. Make
the symbol visible.
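The virtio-fs completion path later in this series then finishes requests
roughly like this (sketch; locking and flag handling elided):

  /* Sketch: complete a request from the virtqueue completion handler,
   * i.e. from outside fs/fuse/dev.c. */
  copy_args_from_argbuf(req);     /* copy reply out of the bounce buffer */
  fuse_request_end(fc, req);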
Signed-off-by: Stefan Hajnoczi <[email protected]>
Signed-off-by: Miklos Szeredi <[email protected]>
---
fs/fuse/dev.c | 19 ++++++++++---------
fs/fuse/fuse_i.h | 5 +++++
2 files changed, 15 insertions(+), 9 deletions(-)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index fdb85895737b..137b3de511ac 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -427,7 +427,7 @@ static void flush_bg_queue(struct fuse_conn *fc)
* the 'end' callback is called if given, else the reference to the
* request is released
*/
-static void request_end(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req)
{
struct fuse_iqueue *fiq = &fc->iq;
@@ -480,6 +480,7 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
put_request:
fuse_put_request(fc, req);
}
+EXPORT_SYMBOL_GPL(fuse_request_end);
static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
{
@@ -567,12 +568,12 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
req->in.h.unique = fuse_get_unique(fiq);
queue_request(fiq, req);
/* acquire extra reference, since request is still needed
- after request_end() */
+ after fuse_request_end() */
__fuse_get_request(req);
spin_unlock(&fiq->waitq.lock);
request_wait_answer(fc, req);
- /* Pairs with smp_wmb() in request_end() */
+ /* Pairs with smp_wmb() in fuse_request_end() */
smp_rmb();
}
}
@@ -1302,7 +1303,7 @@ __releases(fiq->waitq.lock)
* the pending list and copies request data to userspace buffer. If
* no reply is needed (FORGET) or request has been aborted or there
* was an error during the copying then it's finished by calling
- * request_end(). Otherwise add it to the processing list, and set
+ * fuse_request_end(). Otherwise add it to the processing list, and set
* the 'sent' flag.
*/
static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
@@ -1380,7 +1381,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
/* SETXATTR is special, since it may contain too large data */
if (in->h.opcode == FUSE_SETXATTR)
req->out.h.error = -E2BIG;
- request_end(fc, req);
+ fuse_request_end(fc, req);
goto restart;
}
spin_lock(&fpq->lock);
@@ -1423,7 +1424,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
if (!test_bit(FR_PRIVATE, &req->flags))
list_del_init(&req->list);
spin_unlock(&fpq->lock);
- request_end(fc, req);
+ fuse_request_end(fc, req);
return err;
err_unlock:
@@ -1931,7 +1932,7 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
* the write buffer. The request is then searched on the processing
* list by the unique ID found in the header. If found, then remove
* it from the list and copy the rest of the buffer to the request.
- * The request is finished by calling request_end()
+ * The request is finished by calling fuse_request_end().
*/
static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
struct fuse_copy_state *cs, size_t nbytes)
@@ -2018,7 +2019,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
list_del_init(&req->list);
spin_unlock(&fpq->lock);
- request_end(fc, req);
+ fuse_request_end(fc, req);
out:
return err ? err : nbytes;
@@ -2158,7 +2159,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head)
req->out.h.error = -ECONNABORTED;
clear_bit(FR_SENT, &req->flags);
list_del_init(&req->list);
- request_end(fc, req);
+ fuse_request_end(fc, req);
}
}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 24dbca777775..67521103d3b2 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -956,6 +956,11 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args);
void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req);
+/**
+ * End a finished request
+ */
+void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req);
+
/* Abort all requests */
void fuse_abort_conn(struct fuse_conn *fc);
void fuse_wait_aborted(struct fuse_conn *fc);
--
2.21.0
From: David Howells <[email protected]>
Create vfs_get_block_super(), an fs_context-aware replacement for
mount_bdev(). It caches the block device pointer and file open mode in the
fs_context struct so that this information can be passed into sget_fc()'s
test and set functions.
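A converted blockdev-backed filesystem then needs only a one-line
->get_tree() (hypothetical examplefs, mirroring the fuseblk conversion
earlier in this series):

  static int examplefs_get_tree(struct fs_context *fc)
  {
          return vfs_get_block_super(fc, examplefs_fill_super);
  }

  static const struct fs_context_operations examplefs_context_ops = {
          .get_tree       = examplefs_get_tree,
  };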
Signed-off-by: David Howells <[email protected]>
cc: Jens Axboe <[email protected]>
cc: [email protected]
Signed-off-by: Al Viro <[email protected]>
---
fs/fs_context.c | 2 +
fs/super.c | 111 ++++++++++++++++++++++++++++++++++++-
include/linux/fs_context.h | 9 +++
3 files changed, 121 insertions(+), 1 deletion(-)
diff --git a/fs/fs_context.c b/fs/fs_context.c
index 103643c68e3f..270ecae32216 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -501,6 +501,8 @@ void put_fs_context(struct fs_context *fc)
if (fc->need_free && fc->ops && fc->ops->free)
fc->ops->free(fc);
+ if (fc->dev_destructor)
+ fc->dev_destructor(fc);
security_free_mnt_opts(&fc->security);
put_net(fc->net_ns);
diff --git a/fs/super.c b/fs/super.c
index 5960578a4076..80b56bc7d2db 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1212,6 +1212,109 @@ int get_tree_single(struct fs_context *fc,
EXPORT_SYMBOL(get_tree_single);
#ifdef CONFIG_BLOCK
+static void fc_bdev_destructor(struct fs_context *fc)
+{
+ if (fc->bdev) {
+ blkdev_put(fc->bdev, fc->bdev_mode);
+ fc->bdev = NULL;
+ }
+}
+
+static int set_bdev_super_fc(struct super_block *s, struct fs_context *fc)
+{
+ s->s_mode = fc->bdev_mode;
+ s->s_bdev = fc->bdev;
+ s->s_dev = s->s_bdev->bd_dev;
+ s->s_bdi = bdi_get(s->s_bdev->bd_bdi);
+ fc->bdev = NULL;
+ return 0;
+}
+
+static int test_bdev_super_fc(struct super_block *s, struct fs_context *fc)
+{
+ return s->s_bdev == fc->bdev;
+}
+
+/**
+ * vfs_get_block_super - Get a superblock based on a single block device
+ * @fc: The filesystem context holding the parameters
+ * @fill_super: Helper to initialise a new superblock
+ */
+int vfs_get_block_super(struct fs_context *fc,
+ int (*fill_super)(struct super_block *,
+ struct fs_context *))
+{
+ struct block_device *bdev;
+ struct super_block *s;
+ int error = 0;
+
+ fc->bdev_mode = FMODE_READ | FMODE_EXCL;
+ if (!(fc->sb_flags & SB_RDONLY))
+ fc->bdev_mode |= FMODE_WRITE;
+
+ if (!fc->source)
+ return invalf(fc, "No source specified");
+
+ bdev = blkdev_get_by_path(fc->source, fc->bdev_mode, fc->fs_type);
+ if (IS_ERR(bdev)) {
+ errorf(fc, "%s: Can't open blockdev", fc->source);
+ return PTR_ERR(bdev);
+ }
+
+ fc->dev_destructor = fc_bdev_destructor;
+ fc->bdev = bdev;
+
+ /* Once the superblock is inserted into the list by sget_fc(), s_umount
+ * will protect the lockfs code from trying to start a snapshot while
+ * we are mounting
+ */
+ mutex_lock(&bdev->bd_fsfreeze_mutex);
+ if (bdev->bd_fsfreeze_count > 0) {
+ mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
+ return -EBUSY;
+ }
+
+ fc->sb_flags |= SB_NOSEC;
+ s = sget_fc(fc, test_bdev_super_fc, set_bdev_super_fc);
+ mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ if (IS_ERR(s))
+ return PTR_ERR(s);
+
+ if (s->s_root) {
+ /* Don't summarily change the RO/RW state. */
+ if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) {
+ warnf(fc, "%pg: Can't mount, would change RO state", bdev);
+ error = -EBUSY;
+ goto error_sb;
+ }
+
+ /* Leave fc->bdev to fc_bdev_destructor() to clean up to avoid
+ * locking conflicts.
+ */
+ } else {
+ snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
+ sb_set_blocksize(s, block_size(bdev));
+ error = fill_super(s, fc);
+ if (error)
+ goto error_sb;
+
+ s->s_flags |= SB_ACTIVE;
+ bdev->bd_super = s;
+ }
+
+ BUG_ON(fc->root);
+ fc->root = dget(s->s_root);
+ return 0;
+
+error_sb:
+ deactivate_locked_super(s);
+ /* Leave fc->bdev to fc_bdev_destructor() to clean up */
+ return error;
+}
+EXPORT_SYMBOL(vfs_get_block_super);
+
+
static int set_bdev_super(struct super_block *s, void *data)
{
s->s_bdev = data;
@@ -1411,8 +1515,13 @@ int vfs_get_tree(struct fs_context *fc)
* on the superblock.
*/
error = fc->ops->get_tree(fc);
- if (error < 0)
+ if (error < 0) {
+ if (fc->dev_destructor) {
+ fc->dev_destructor(fc);
+ fc->dev_destructor = NULL;
+ }
return error;
+ }
if (!fc->root) {
pr_err("Filesystem %s get_tree() didn't set fc->root\n",
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index 7c6fe3d47fa6..ed5b4349671e 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -88,6 +88,9 @@ struct fs_context {
struct mutex uapi_mutex; /* Userspace access mutex */
struct file_system_type *fs_type;
void *fs_private; /* The filesystem's context */
+ union {
+ struct block_device *bdev; /* The backing blockdev (if applicable) */
+ };
struct dentry *root; /* The root and superblock */
struct user_namespace *user_ns; /* The user namespace for this mount */
struct net *net_ns; /* The network namespace for this mount */
@@ -97,6 +100,7 @@ struct fs_context {
const char *subtype; /* The subtype to set on the superblock */
void *security; /* Linux S&M options */
void *s_fs_info; /* Proposed s_fs_info */
+ fmode_t bdev_mode; /* File open mode for bdev */
unsigned int sb_flags; /* Proposed superblock flags (SB_*) */
unsigned int sb_flags_mask; /* Superblock flags that were changed */
unsigned int s_iflags; /* OR'd with sb->s_iflags */
@@ -105,6 +109,7 @@ struct fs_context {
enum fs_context_phase phase:8; /* The phase the context is in */
bool need_free:1; /* Need to call ops->free() */
bool global:1; /* Goes into &init_user_ns */
+ void (*dev_destructor)(struct fs_context *fc); /* For block or mtd */
};
struct fs_context_operations {
@@ -154,6 +159,10 @@ extern int get_tree_single(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc));
+extern int vfs_get_block_super(struct fs_context *fc,
+ int (*fill_super)(struct super_block *sb,
+ struct fs_context *fc));
+
extern const struct file_operations fscontext_fops;
/*
--
2.21.0
From: Stefan Hajnoczi <[email protected]>
virtio-fs will need to query the length of fuse_arg lists. Make the symbol
visible.
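For example, virtio-fs uses it later in this series to size a physically
contiguous bounce buffer for a request's arguments (sketch):

  /* Sketch: total bytes needed for the in and out argument lists. */
  len = fuse_len_args(req->in.numargs, (struct fuse_arg *)req->in.args) +
        fuse_len_args(req->out.numargs, req->out.args);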
Signed-off-by: Stefan Hajnoczi <[email protected]>
Signed-off-by: Miklos Szeredi <[email protected]>
---
fs/fuse/dev.c | 7 ++++---
fs/fuse/fuse_i.h | 5 +++++
2 files changed, 9 insertions(+), 3 deletions(-)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 137b3de511ac..985654560d1a 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -350,7 +350,7 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
}
EXPORT_SYMBOL_GPL(fuse_put_request);
-static unsigned len_args(unsigned numargs, struct fuse_arg *args)
+unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args)
{
unsigned nbytes = 0;
unsigned i;
@@ -360,6 +360,7 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args)
return nbytes;
}
+EXPORT_SYMBOL_GPL(fuse_len_args);
static u64 fuse_get_unique(struct fuse_iqueue *fiq)
{
@@ -375,7 +376,7 @@ static unsigned int fuse_req_hash(u64 unique)
static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req)
{
req->in.h.len = sizeof(struct fuse_in_header) +
- len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
+ fuse_len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
list_add_tail(&req->list, &fiq->pending);
wake_up_locked(&fiq->waitq);
kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
@@ -1912,7 +1913,7 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
if (out->h.error)
return nbytes != reqsize ? -EINVAL : 0;
- reqsize += len_args(out->numargs, out->args);
+ reqsize += fuse_len_args(out->numargs, out->args);
if (reqsize < nbytes || (reqsize > nbytes && !out->argvar))
return -EINVAL;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 67521103d3b2..81e436c9620a 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1098,4 +1098,9 @@ int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type);
/* readdir.c */
int fuse_readdir(struct file *file, struct dir_context *ctx);
+/**
+ * Return the number of bytes in an arguments list
+ */
+unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args);
+
#endif /* _FS_FUSE_I_H */
--
2.21.0
From: Vivek Goyal <[email protected]>
This will be used by virtio-fs to send the init request to the fuse server
after initialization of the virtqueues.
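virtio-fs can then kick off INIT once its queues are up, along these lines
(sketch; error handling elided):

  /* Sketch: send FUSE_INIT from virtio-fs after virtqueue setup. */
  struct fuse_req *init_req = fuse_request_alloc(0);

  if (init_req)
          fuse_send_init(fc, init_req);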
Signed-off-by: Vivek Goyal <[email protected]>
Signed-off-by: Miklos Szeredi <[email protected]>
---
fs/fuse/dev.c | 1 +
fs/fuse/fuse_i.h | 1 +
fs/fuse/inode.c | 3 ++-
3 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 985654560d1a..bd2e5958d2f9 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -139,6 +139,7 @@ void fuse_request_free(struct fuse_req *req)
fuse_req_pages_free(req);
kmem_cache_free(fuse_req_cachep, req);
}
+EXPORT_SYMBOL_GPL(fuse_request_free);
void __fuse_get_request(struct fuse_req *req)
{
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 81e436c9620a..8ced5e74e5a8 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -994,6 +994,7 @@ void fuse_conn_put(struct fuse_conn *fc);
struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc);
void fuse_dev_free(struct fuse_dev *fud);
+void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req);
/**
* Add connection to control filesystem
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index e3375ce8e97f..31cf0c47da13 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -962,7 +962,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
wake_up_all(&fc->blocked_waitq);
}
-static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
{
struct fuse_init_in *arg = &req->misc.init_in;
@@ -992,6 +992,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
req->end = process_init_reply;
fuse_request_send_background(fc, req);
}
+EXPORT_SYMBOL_GPL(fuse_send_init);
static void fuse_free_conn(struct fuse_conn *fc)
{
--
2.21.0
From: Vivek Goyal <[email protected]>
As of now fuse_dev_alloc() both allocates a fuse device and installs it in
the fuse_conn list, and it can fail if the device allocation fails.
virtio-fs needs to initialize multiple fuse devices (one per virtio queue).
It initializes one fuse device as part of the call to fuse_fill_super_common()
and the rest of the devices are allocated and installed after that.
But we can't afford to fail after calling fuse_fill_super_common(), as there
is no way to undo all the actions it has done.
So, to avoid failures after the call to fuse_fill_super_common(),
pre-allocate all fuse devices early and install them into the fuse
connection later.
This patch provides two separate helpers for fuse device allocation and
fuse device installation in fuse_conn.
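A sketch of the intended call pattern in virtio-fs (illustrative names,
error handling abbreviated):

  /* Sketch: pre-allocate the per-queue fuse devices before calling
   * fuse_fill_super_common(), so no allocation can fail afterwards;
   * install them into the connection once the super block is set up. */
  for (i = 0; i < fs->nvqs; i++) {
          fs->vqs[i].fud = fuse_dev_alloc();
          if (!fs->vqs[i].fud)
                  goto err_free;          /* safe: nothing installed yet */
  }

  err = fuse_fill_super_common(sb, ctx);
  if (err < 0)
          goto err_free;

  for (i = 0; i < fs->nvqs; i++)
          fuse_dev_install(fs->vqs[i].fud, fc);   /* void: cannot fail */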
Signed-off-by: Vivek Goyal <[email protected]>
Signed-off-by: Miklos Szeredi <[email protected]>
---
fs/fuse/cuse.c | 2 +-
fs/fuse/dev.c | 2 +-
fs/fuse/fuse_i.h | 4 +++-
fs/fuse/inode.c | 25 +++++++++++++++++++++----
4 files changed, 26 insertions(+), 7 deletions(-)
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 7bad5d428ce1..c0204e3e811f 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -505,7 +505,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
*/
fuse_conn_init(&cc->fc, file->f_cred->user_ns, &fuse_dev_fiq_ops, NULL);
- fud = fuse_dev_alloc(&cc->fc);
+ fud = fuse_dev_alloc_install(&cc->fc);
if (!fud) {
kfree(cc);
return -ENOMEM;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 0bdb98a3b251..2df28ca7a654 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -2337,7 +2337,7 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new)
if (new->private_data)
return -EINVAL;
- fud = fuse_dev_alloc(fc);
+ fud = fuse_dev_alloc_install(fc);
if (!fud)
return -ENOMEM;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 617eca6046d2..21a2e86bbdf2 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1056,7 +1056,9 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
*/
void fuse_conn_put(struct fuse_conn *fc);
-struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc);
+struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc);
+struct fuse_dev *fuse_dev_alloc(void);
+void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc);
void fuse_dev_free(struct fuse_dev *fud);
void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index e42e600e885b..a9e5b106e315 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1028,7 +1028,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
return 0;
}
-struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc)
+struct fuse_dev *fuse_dev_alloc(void)
{
struct fuse_dev *fud;
struct list_head *pq;
@@ -1044,16 +1044,33 @@ struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc)
}
fud->pq.processing = pq;
- fud->fc = fuse_conn_get(fc);
fuse_pqueue_init(&fud->pq);
+ return fud;
+}
+EXPORT_SYMBOL_GPL(fuse_dev_alloc);
+
+void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc)
+{
+ fud->fc = fuse_conn_get(fc);
spin_lock(&fc->lock);
list_add_tail(&fud->entry, &fc->devices);
spin_unlock(&fc->lock);
+}
+EXPORT_SYMBOL_GPL(fuse_dev_install);
+struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc)
+{
+ struct fuse_dev *fud;
+
+ fud = fuse_dev_alloc();
+ if (!fud)
+ return NULL;
+
+ fuse_dev_install(fud, fc);
return fud;
}
-EXPORT_SYMBOL_GPL(fuse_dev_alloc);
+EXPORT_SYMBOL_GPL(fuse_dev_alloc_install);
void fuse_dev_free(struct fuse_dev *fud)
{
@@ -1115,7 +1132,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
if (sb->s_user_ns != &init_user_ns)
sb->s_xattr = fuse_no_acl_xattr_handlers;
- fud = fuse_dev_alloc(fc);
+ fud = fuse_dev_alloc_install(fc);
if (!fud)
goto err;
--
2.21.0
Don't hold onto a dentry in the LRU list if it will need to be looked up
again at the next access anyway. Only do this if explicitly enabled,
otherwise it could result in a performance regression.
A more advanced version of this patch would periodically flush out dentries
from the LRU that have gone stale.
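A connection opts in explicitly (sketch; virtio-fs does this later in the
series):

  /* Sketch: with delete_stale set and a zero entry timeout, the dentry
   * gets DCACHE_OP_DELETE, so the final dput() frees it immediately
   * instead of parking it on the LRU. */
  fc->delete_stale = 1;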
Signed-off-by: Miklos Szeredi <[email protected]>
---
fs/fuse/dir.c | 28 +++++++++++++++++++++++++---
fs/fuse/fuse_i.h | 3 +++
2 files changed, 28 insertions(+), 3 deletions(-)
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index dd0f64f7bc06..d44f11ac22ec 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -29,12 +29,28 @@ union fuse_dentry {
struct rcu_head rcu;
};
-static inline void fuse_dentry_settime(struct dentry *entry, u64 time)
+static void fuse_dentry_settime(struct dentry *dentry, u64 time)
{
- ((union fuse_dentry *) entry->d_fsdata)->time = time;
+ struct fuse_conn *fc = get_fuse_conn_super(dentry->d_sb);
+ bool delete = !time && fc->delete_stale;
+ /*
+ * Mess with DCACHE_OP_DELETE because dput() will be faster without it.
+ * Don't care about races, either way it's just an optimization
+ */
+ if ((!delete && (dentry->d_flags & DCACHE_OP_DELETE)) ||
+ (delete && !(dentry->d_flags & DCACHE_OP_DELETE))) {
+ spin_lock(&dentry->d_lock);
+ if (!delete)
+ dentry->d_flags &= ~DCACHE_OP_DELETE;
+ else
+ dentry->d_flags |= DCACHE_OP_DELETE;
+ spin_unlock(&dentry->d_lock);
+ }
+
+ ((union fuse_dentry *) dentry->d_fsdata)->time = time;
}
-static inline u64 fuse_dentry_time(struct dentry *entry)
+static inline u64 fuse_dentry_time(const struct dentry *entry)
{
return ((union fuse_dentry *) entry->d_fsdata)->time;
}
@@ -255,8 +271,14 @@ static void fuse_dentry_release(struct dentry *dentry)
kfree_rcu(fd, rcu);
}
+static int fuse_dentry_delete(const struct dentry *dentry)
+{
+ return time_before64(fuse_dentry_time(dentry), get_jiffies_64());
+}
+
const struct dentry_operations fuse_dentry_operations = {
.d_revalidate = fuse_dentry_revalidate,
+ .d_delete = fuse_dentry_delete,
.d_init = fuse_dentry_init,
.d_release = fuse_dentry_release,
};
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 21a2e86bbdf2..700df42520ec 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -780,6 +780,9 @@ struct fuse_conn {
/** Does the filesystem support copy_file_range? */
unsigned no_copy_file_range:1;
+ /* Delete dentries that have gone stale */
+ unsigned int delete_stale:1;
+
/** The number of requests waiting for completion */
atomic_t num_waiting;
--
2.21.0
Allow virtio-fs to also send a DESTROY request.
Signed-off-by: Miklos Szeredi <[email protected]>
---
fs/fuse/fuse_i.h | 9 +++++++++
fs/fuse/inode.c | 12 ++++++++----
2 files changed, 17 insertions(+), 4 deletions(-)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 700df42520ec..48a3db6870ae 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -555,6 +555,7 @@ struct fuse_fs_context {
bool group_id_present:1;
bool default_permissions:1;
bool allow_other:1;
+ bool destroy:1;
unsigned int max_read;
unsigned int blksize;
const char *subtype;
@@ -1072,6 +1073,13 @@ void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req);
*/
int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx);
+/**
+ * Disassociate fuse connection from superblock and kill the superblock
+ *
+ * Calls kill_anon_super(), do not use with bdev mounts.
+ */
+void fuse_kill_sb_anon(struct super_block *sb);
+
/**
* Add connection to control filesystem
*/
@@ -1184,5 +1192,6 @@ unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args);
* Get the next unique ID for a request
*/
u64 fuse_get_unique(struct fuse_iqueue *fiq);
+void fuse_free_conn(struct fuse_conn *fc);
#endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index a9e5b106e315..10b777ece3b8 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -982,11 +982,12 @@ void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
}
EXPORT_SYMBOL_GPL(fuse_send_init);
-static void fuse_free_conn(struct fuse_conn *fc)
+void fuse_free_conn(struct fuse_conn *fc)
{
WARN_ON(!list_empty(&fc->devices));
kfree_rcu(fc, rcu);
}
+EXPORT_SYMBOL_GPL(fuse_free_conn);
static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
{
@@ -1162,7 +1163,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
/* Root dentry doesn't have .d_revalidate */
sb->s_d_op = &fuse_dentry_operations;
- if (ctx->is_bdev) {
+ if (ctx->destroy) {
fc->destroy_req = fuse_request_alloc(0);
if (!fc->destroy_req)
goto err_put_root;
@@ -1291,8 +1292,10 @@ static int fuse_init_fs_context(struct fs_context *fc)
ctx->blksize = FUSE_DEFAULT_BLKSIZE;
#ifdef CONFIG_BLOCK
- if (fc->fs_type == &fuseblk_fs_type)
+ if (fc->fs_type == &fuseblk_fs_type) {
ctx->is_bdev = true;
+ ctx->destroy = true;
+ }
#endif
fc->fs_private = ctx;
@@ -1316,11 +1319,12 @@ static void fuse_sb_destroy(struct super_block *sb)
}
}
-static void fuse_kill_sb_anon(struct super_block *sb)
+void fuse_kill_sb_anon(struct super_block *sb)
{
fuse_sb_destroy(sb);
kill_anon_super(sb);
}
+EXPORT_SYMBOL_GPL(fuse_kill_sb_anon);
static struct file_system_type fuse_fs_type = {
.owner = THIS_MODULE,
--
2.21.0
From: Vivek Goyal <[email protected]>
virtio-fs does not support aborting requests which are being processed,
i.e. requests which have already been sent to the fuse daemon on the host.
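virtio-fs is expected to set both flags when filling the super block
(sketch):

  /* Sketch: skip the fusectl entry and MNT_FORCE aborts, since requests
   * already sent to the host cannot be aborted. */
  ctx->no_control = true;
  ctx->no_force_umount = true;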
Signed-off-by: Vivek Goyal <[email protected]>
Signed-off-by: Miklos Szeredi <[email protected]>
---
fs/fuse/fuse_i.h | 8 ++++++++
fs/fuse/inode.c | 7 ++++++-
2 files changed, 14 insertions(+), 1 deletion(-)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 48a3db6870ae..dbf73e5d5b38 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -556,6 +556,8 @@ struct fuse_fs_context {
bool default_permissions:1;
bool allow_other:1;
bool destroy:1;
+ bool no_control:1;
+ bool no_force_umount:1;
unsigned int max_read;
unsigned int blksize;
const char *subtype;
@@ -784,6 +786,12 @@ struct fuse_conn {
/* Delete dentries that have gone stale */
unsigned int delete_stale:1;
+ /** Do not create entry in fusectl fs */
+ unsigned int no_control:1;
+
+ /** Do not allow MNT_FORCE umount */
+ unsigned int no_force_umount:1;
+
/** The number of requests waiting for completion */
atomic_t num_waiting;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 10b777ece3b8..7fa0dcc6f565 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -364,7 +364,10 @@ void fuse_unlock_inode(struct inode *inode, bool locked)
static void fuse_umount_begin(struct super_block *sb)
{
- fuse_abort_conn(get_fuse_conn_super(sb));
+ struct fuse_conn *fc = get_fuse_conn_super(sb);
+
+ if (!fc->no_force_umount)
+ fuse_abort_conn(fc);
}
static void fuse_send_destroy(struct fuse_conn *fc)
@@ -1153,6 +1156,8 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
fc->user_id = ctx->user_id;
fc->group_id = ctx->group_id;
fc->max_read = max_t(unsigned, 4096, ctx->max_read);
+ fc->no_control = ctx->no_control;
+ fc->no_force_umount = ctx->no_force_umount;
err = -ENOMEM;
root = fuse_get_root_inode(sb, ctx->rootmode);
--
2.21.0
From: Stefan Hajnoczi <[email protected]>
Add a basic file system module for virtio-fs. This does not yet contain
shared data support between host and guest or metadata coherency speedups.
However, it is already significantly faster than virtio-9p.
Design Overview
===============
With the goal of designing something with better performance and local file
system semantics, the following ideas were pursued:
- Use the fuse protocol (instead of 9p) for communication between guest and
host. The guest kernel is the fuse client and a fuse server runs on the
host to serve the requests.
- For data access inside the guest, mmap portions of files into the QEMU
address space and let the guest access this memory using DAX. That way the
guest page cache is bypassed and there is only one copy of the data (on the
host). This will also enable mmap(MAP_SHARED) between guests.
- For metadata coherency, there is a shared memory region containing a
version number associated with the metadata; any guest changing metadata
updates the version number, and other guests refresh their metadata on the
next access. This is yet to be implemented.
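From the guest's point of view the result mounts like any other filesystem,
using the device's tag as the source (sketch matching the virtiofs.rst
added later in this series):

  /* Sketch: guest-side equivalent of "mount -t virtiofs myfs /mnt",
   * where "myfs" is the tag configured on the virtio-fs device. */
  #include <sys/mount.h>

  int r = mount("myfs", "/mnt", "virtiofs", 0, NULL);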
How virtio-fs differs from existing approaches
==============================================
The unique idea behind virtio-fs is to take advantage of the co-location of
the virtual machine and hypervisor to avoid communication (vmexits).
DAX allows file contents to be accessed without communication with the
hypervisor. The shared memory region for metadata avoids communication in
the common case where metadata is unchanged.
By replacing expensive communication with cheaper shared memory accesses,
we expect to achieve better performance than approaches based on network
file system protocols. In addition, this also makes it easier to achieve
local file system semantics (coherency).
These techniques are not applicable to network file system protocols since
the communications channel is bypassed by taking advantage of shared memory
on a local machine. This is why we decided to build virtio-fs rather than
focus on 9P or NFS.
Caching Modes
=============
Like virtio-9p, different caching modes are supported, which also determine
the coherency level. The "cache=FOO" and "writeback" options control the
level of coherence between the guest and host filesystems.
- cache=none
metadata, data and pathname lookup are not cached in the guest. They are
always fetched from the host and any changes are immediately pushed to the
host.
- cache=always
metadata, data and pathname lookup are cached in the guest and never expire.
- cache=auto
metadata and pathname lookup caches expire after a configured amount of
time (default is 1 second). Data is cached while the file is open
(close-to-open consistency).
- writeback/no_writeback
These options control the writeback strategy. If writeback is disabled,
normal writes are immediately synchronized with the host fs. If writeback
is enabled, writes may be cached in the guest until the file is closed or
an fsync(2) is performed. This option has no effect on mmapped writes or
writes going through the DAX mechanism.
Signed-off-by: Stefan Hajnoczi <[email protected]>
Signed-off-by: Vivek Goyal <[email protected]>
Signed-off-by: Miklos Szeredi <[email protected]>
---
fs/fuse/Kconfig | 11 +
fs/fuse/Makefile | 1 +
fs/fuse/fuse_i.h | 5 +
fs/fuse/virtio_fs.c | 1072 +++++++++++++++++++++++++++++++
include/uapi/linux/virtio_fs.h | 19 +
include/uapi/linux/virtio_ids.h | 1 +
6 files changed, 1109 insertions(+)
create mode 100644 fs/fuse/virtio_fs.c
create mode 100644 include/uapi/linux/virtio_fs.h
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index 24fc5a5c1b97..0635cba19971 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -27,3 +27,14 @@ config CUSE
If you want to develop or use a userspace character device
based on CUSE, answer Y or M.
+
+config VIRTIO_FS
+ tristate "Virtio Filesystem"
+ depends on FUSE_FS
+ select VIRTIO
+ help
+ The Virtio Filesystem allows guests to mount file systems from the
+ host.
+
+ If you want to share files between guests or with the host, answer Y
+ or M.
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 9485019c2a14..6419a2b3510d 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -5,5 +5,6 @@
obj-$(CONFIG_FUSE_FS) += fuse.o
obj-$(CONFIG_CUSE) += cuse.o
+obj-$(CONFIG_VIRTIO_FS) += virtio_fs.o
fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index dbf73e5d5b38..85e2dcad68c1 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -444,6 +444,11 @@ struct fuse_req {
/** Request is stolen from fuse_file->reserved_req */
struct file *stolen_file;
+
+#if IS_ENABLED(CONFIG_VIRTIO_FS)
+ /** virtio-fs's physically contiguous buffer for in and out args */
+ void *argbuf;
+#endif
};
struct fuse_iqueue;
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
new file mode 100644
index 000000000000..197e79e536f9
--- /dev/null
+++ b/fs/fuse/virtio_fs.c
@@ -0,0 +1,1072 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * virtio-fs: Virtio Filesystem
+ * Copyright (C) 2018 Red Hat, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/virtio.h>
+#include <linux/virtio_fs.h>
+#include <linux/delay.h>
+#include <linux/fs_context.h>
+#include <linux/highmem.h>
+#include "fuse_i.h"
+
+/* List of virtio-fs device instances and a lock for the list */
+static DEFINE_MUTEX(virtio_fs_mutex);
+static LIST_HEAD(virtio_fs_instances);
+
+enum {
+ VQ_HIPRIO,
+ VQ_REQUEST
+};
+
+/* Per-virtqueue state */
+struct virtio_fs_vq {
+ spinlock_t lock;
+ struct virtqueue *vq; /* protected by ->lock */
+ struct work_struct done_work;
+ struct list_head queued_reqs;
+ struct delayed_work dispatch_work;
+ struct fuse_dev *fud;
+ bool connected;
+ long in_flight;
+ char name[24];
+} ____cacheline_aligned_in_smp;
+
+/* A virtio-fs device instance */
+struct virtio_fs {
+ struct list_head list; /* on virtio_fs_instances */
+ char *tag;
+ struct virtio_fs_vq *vqs;
+ unsigned int nvqs; /* number of virtqueues */
+ unsigned int num_queues; /* number of request queues */
+};
+
+struct virtio_fs_forget {
+ struct fuse_in_header ih;
+ struct fuse_forget_in arg;
+ /* This request can be temporarily queued on virt queue */
+ struct list_head list;
+};
+
+static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq)
+{
+ struct virtio_fs *fs = vq->vdev->priv;
+
+ return &fs->vqs[vq->index];
+}
+
+static inline struct fuse_pqueue *vq_to_fpq(struct virtqueue *vq)
+{
+ return &vq_to_fsvq(vq)->fud->pq;
+}
+
+/* Add a new instance to the list or return -EEXIST if tag name exists */
+static int virtio_fs_add_instance(struct virtio_fs *fs)
+{
+ struct virtio_fs *fs2;
+ bool duplicate = false;
+
+ mutex_lock(&virtio_fs_mutex);
+
+ list_for_each_entry(fs2, &virtio_fs_instances, list) {
+ if (strcmp(fs->tag, fs2->tag) == 0)
+ duplicate = true;
+ }
+
+ if (!duplicate)
+ list_add_tail(&fs->list, &virtio_fs_instances);
+
+ mutex_unlock(&virtio_fs_mutex);
+
+ if (duplicate)
+ return -EEXIST;
+ return 0;
+}
+
+/* Return the virtio_fs with a given tag, or NULL */
+static struct virtio_fs *virtio_fs_find_instance(const char *tag)
+{
+ struct virtio_fs *fs;
+
+ mutex_lock(&virtio_fs_mutex);
+
+ list_for_each_entry(fs, &virtio_fs_instances, list) {
+ if (strcmp(fs->tag, tag) == 0)
+ goto found;
+ }
+
+ fs = NULL; /* not found */
+
+found:
+ mutex_unlock(&virtio_fs_mutex);
+
+ return fs;
+}
+
+static void virtio_fs_free_devs(struct virtio_fs *fs)
+{
+ unsigned int i;
+
+ /* TODO lock */
+
+ for (i = 0; i < fs->nvqs; i++) {
+ struct virtio_fs_vq *fsvq = &fs->vqs[i];
+
+ if (!fsvq->fud)
+ continue;
+
+ flush_work(&fsvq->done_work);
+ flush_delayed_work(&fsvq->dispatch_work);
+
+ /* TODO need to quiesce/end_requests/decrement dev_count */
+ fuse_dev_free(fsvq->fud);
+ fsvq->fud = NULL;
+ }
+}
+
+/* Read filesystem name from virtio config into fs->tag (devm-allocated). */
+static int virtio_fs_read_tag(struct virtio_device *vdev, struct virtio_fs *fs)
+{
+ char tag_buf[sizeof_field(struct virtio_fs_config, tag)];
+ char *end;
+ size_t len;
+
+ virtio_cread_bytes(vdev, offsetof(struct virtio_fs_config, tag),
+ &tag_buf, sizeof(tag_buf));
+ end = memchr(tag_buf, '\0', sizeof(tag_buf));
+ if (end == tag_buf)
+ return -EINVAL; /* empty tag */
+ if (!end)
+ end = &tag_buf[sizeof(tag_buf)];
+
+ len = end - tag_buf;
+ fs->tag = devm_kmalloc(&vdev->dev, len + 1, GFP_KERNEL);
+ if (!fs->tag)
+ return -ENOMEM;
+ memcpy(fs->tag, tag_buf, len);
+ fs->tag[len] = '\0';
+ return 0;
+}
+
+/* Work function for hiprio completion */
+static void virtio_fs_hiprio_done_work(struct work_struct *work)
+{
+ struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
+ done_work);
+ struct virtqueue *vq = fsvq->vq;
+
+ /* Free completed FUSE_FORGET requests */
+ spin_lock(&fsvq->lock);
+ do {
+ unsigned int len;
+ void *req;
+
+ virtqueue_disable_cb(vq);
+
+ while ((req = virtqueue_get_buf(vq, &len)) != NULL) {
+ kfree(req);
+ fsvq->in_flight--;
+ }
+ } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq)));
+ spin_unlock(&fsvq->lock);
+}
+
+static void virtio_fs_dummy_dispatch_work(struct work_struct *work)
+{
+}
+
+static void virtio_fs_hiprio_dispatch_work(struct work_struct *work)
+{
+ struct virtio_fs_forget *forget;
+ struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
+ dispatch_work.work);
+ struct virtqueue *vq = fsvq->vq;
+ struct scatterlist sg;
+ struct scatterlist *sgs[] = {&sg};
+ bool notify;
+ int ret;
+
+ pr_debug("virtio-fs: worker %s called.\n", __func__);
+ while (1) {
+ spin_lock(&fsvq->lock);
+ forget = list_first_entry_or_null(&fsvq->queued_reqs,
+ struct virtio_fs_forget, list);
+ if (!forget) {
+ spin_unlock(&fsvq->lock);
+ return;
+ }
+
+ list_del(&forget->list);
+ if (!fsvq->connected) {
+ spin_unlock(&fsvq->lock);
+ kfree(forget);
+ continue;
+ }
+
+ sg_init_one(&sg, forget, sizeof(*forget));
+
+ /* Enqueue the request */
+ dev_dbg(&vq->vdev->dev, "%s\n", __func__);
+ ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC);
+ if (ret < 0) {
+ if (ret == -ENOMEM || ret == -ENOSPC) {
+ pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later\n",
+ ret);
+ list_add_tail(&forget->list,
+ &fsvq->queued_reqs);
+ schedule_delayed_work(&fsvq->dispatch_work,
+ msecs_to_jiffies(1));
+ } else {
+ pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n",
+ ret);
+ kfree(forget);
+ }
+ spin_unlock(&fsvq->lock);
+ return;
+ }
+
+ fsvq->in_flight++;
+ notify = virtqueue_kick_prepare(vq);
+ spin_unlock(&fsvq->lock);
+
+ if (notify)
+ virtqueue_notify(vq);
+ pr_debug("virtio-fs: worker %s dispatched one forget request.\n",
+ __func__);
+ }
+}
+
+/* Allocate and copy args into req->argbuf */
+static int copy_args_to_argbuf(struct fuse_req *req)
+{
+ unsigned int offset = 0;
+ unsigned int num_in;
+ unsigned int num_out;
+ unsigned int len;
+ unsigned int i;
+
+ num_in = req->in.numargs - req->in.argpages;
+ num_out = req->out.numargs - req->out.argpages;
+ len = fuse_len_args(num_in, (struct fuse_arg *)req->in.args) +
+ fuse_len_args(num_out, req->out.args);
+
+ req->argbuf = kmalloc(len, GFP_ATOMIC);
+ if (!req->argbuf)
+ return -ENOMEM;
+
+ for (i = 0; i < num_in; i++) {
+ memcpy(req->argbuf + offset,
+ req->in.args[i].value,
+ req->in.args[i].size);
+ offset += req->in.args[i].size;
+ }
+
+ return 0;
+}
+
+/* Copy args out of and free req->argbuf */
+static void copy_args_from_argbuf(struct fuse_req *req)
+{
+ unsigned int remaining;
+ unsigned int offset;
+ unsigned int num_in;
+ unsigned int num_out;
+ unsigned int i;
+
+ remaining = req->out.h.len - sizeof(req->out.h);
+ num_in = req->in.numargs - req->in.argpages;
+ num_out = req->out.numargs - req->out.argpages;
+ offset = fuse_len_args(num_in, (struct fuse_arg *)req->in.args);
+
+ for (i = 0; i < num_out; i++) {
+ unsigned int argsize = req->out.args[i].size;
+
+ if (req->out.argvar &&
+ i == req->out.numargs - 1 &&
+ argsize > remaining) {
+ argsize = remaining;
+ }
+
+ memcpy(req->out.args[i].value, req->argbuf + offset, argsize);
+ offset += argsize;
+
+ if (i != req->out.numargs - 1)
+ remaining -= argsize;
+ }
+
+ /* Store the actual size of the variable-length arg */
+ if (req->out.argvar)
+ req->out.args[req->out.numargs - 1].size = remaining;
+
+ kfree(req->argbuf);
+ req->argbuf = NULL;
+}
+
+/* Work function for request completion */
+static void virtio_fs_requests_done_work(struct work_struct *work)
+{
+ struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
+ done_work);
+ struct fuse_pqueue *fpq = &fsvq->fud->pq;
+ struct fuse_conn *fc = fsvq->fud->fc;
+ struct virtqueue *vq = fsvq->vq;
+ struct fuse_req *req;
+ struct fuse_req *next;
+ unsigned int len, i, thislen;
+ struct page *page;
+ LIST_HEAD(reqs);
+
+ /* Collect completed requests off the virtqueue */
+ spin_lock(&fsvq->lock);
+ do {
+ virtqueue_disable_cb(vq);
+
+ while ((req = virtqueue_get_buf(vq, &len)) != NULL) {
+ spin_lock(&fpq->lock);
+ list_move_tail(&req->list, &reqs);
+ spin_unlock(&fpq->lock);
+ }
+ } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq)));
+ spin_unlock(&fsvq->lock);
+
+ /* End requests */
+ list_for_each_entry_safe(req, next, &reqs, list) {
+ /* TODO check unique */
+ /* TODO fuse_len_args(out) against oh.len */
+
+ copy_args_from_argbuf(req);
+
+ if (req->out.argpages && req->out.page_zeroing) {
+ len = req->out.args[req->out.numargs - 1].size;
+ for (i = 0; i < req->num_pages; i++) {
+ thislen = req->page_descs[i].length;
+ if (len < thislen) {
+ WARN_ON(req->page_descs[i].offset);
+ page = req->pages[i];
+ zero_user_segment(page, len, thislen);
+ len = 0;
+ } else {
+ len -= thislen;
+ }
+ }
+ }
+
+ spin_lock(&fpq->lock);
+ clear_bit(FR_SENT, &req->flags);
+ list_del_init(&req->list);
+ spin_unlock(&fpq->lock);
+
+ fuse_request_end(fc, req);
+ }
+}
+
+/* Virtqueue interrupt handler */
+static void virtio_fs_vq_done(struct virtqueue *vq)
+{
+ struct virtio_fs_vq *fsvq = vq_to_fsvq(vq);
+
+ dev_dbg(&vq->vdev->dev, "%s %s\n", __func__, fsvq->name);
+
+ schedule_work(&fsvq->done_work);
+}
+
+/* Initialize virtqueues */
+static int virtio_fs_setup_vqs(struct virtio_device *vdev,
+ struct virtio_fs *fs)
+{
+ struct virtqueue **vqs;
+ vq_callback_t **callbacks;
+ const char **names;
+ unsigned int i;
+ int ret;
+
+ virtio_cread(vdev, struct virtio_fs_config, num_queues,
+ &fs->num_queues);
+ if (fs->num_queues == 0)
+ return -EINVAL;
+
+ fs->nvqs = 1 + fs->num_queues;
+
+ fs->vqs = devm_kcalloc(&vdev->dev, fs->nvqs,
+ sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL);
+ if (!fs->vqs)
+ return -ENOMEM;
+
+ vqs = kmalloc_array(fs->nvqs, sizeof(vqs[VQ_HIPRIO]), GFP_KERNEL);
+ callbacks = kmalloc_array(fs->nvqs, sizeof(callbacks[VQ_HIPRIO]),
+ GFP_KERNEL);
+ names = kmalloc_array(fs->nvqs, sizeof(names[VQ_HIPRIO]), GFP_KERNEL);
+ if (!vqs || !callbacks || !names) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ callbacks[VQ_HIPRIO] = virtio_fs_vq_done;
+ snprintf(fs->vqs[VQ_HIPRIO].name, sizeof(fs->vqs[VQ_HIPRIO].name),
+ "hiprio");
+ names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name;
+ INIT_WORK(&fs->vqs[VQ_HIPRIO].done_work, virtio_fs_hiprio_done_work);
+ INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].queued_reqs);
+ INIT_DELAYED_WORK(&fs->vqs[VQ_HIPRIO].dispatch_work,
+ virtio_fs_hiprio_dispatch_work);
+ spin_lock_init(&fs->vqs[VQ_HIPRIO].lock);
+
+ /* Initialize the requests virtqueues */
+ for (i = VQ_REQUEST; i < fs->nvqs; i++) {
+ spin_lock_init(&fs->vqs[i].lock);
+ INIT_WORK(&fs->vqs[i].done_work, virtio_fs_requests_done_work);
+ INIT_DELAYED_WORK(&fs->vqs[i].dispatch_work,
+ virtio_fs_dummy_dispatch_work);
+ INIT_LIST_HEAD(&fs->vqs[i].queued_reqs);
+ snprintf(fs->vqs[i].name, sizeof(fs->vqs[i].name),
+ "requests.%u", i - VQ_REQUEST);
+ callbacks[i] = virtio_fs_vq_done;
+ names[i] = fs->vqs[i].name;
+ }
+
+ ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, names, NULL);
+ if (ret < 0)
+ goto out;
+
+ for (i = 0; i < fs->nvqs; i++) {
+ fs->vqs[i].vq = vqs[i];
+ fs->vqs[i].connected = true;
+ }
+out:
+ kfree(names);
+ kfree(callbacks);
+ kfree(vqs);
+ return ret;
+}
+
+/* Free virtqueues (device must already be reset) */
+static void virtio_fs_cleanup_vqs(struct virtio_device *vdev,
+ struct virtio_fs *fs)
+{
+ vdev->config->del_vqs(vdev);
+}
+
+static int virtio_fs_probe(struct virtio_device *vdev)
+{
+ struct virtio_fs *fs;
+ int ret;
+
+ fs = devm_kzalloc(&vdev->dev, sizeof(*fs), GFP_KERNEL);
+ if (!fs)
+ return -ENOMEM;
+ vdev->priv = fs;
+
+ ret = virtio_fs_read_tag(vdev, fs);
+ if (ret < 0)
+ goto out;
+
+ ret = virtio_fs_setup_vqs(vdev, fs);
+ if (ret < 0)
+ goto out;
+
+ /* TODO vq affinity */
+ /* TODO populate notifications vq */
+
+ /* Bring the device online in case the filesystem is mounted and
+ * requests need to be sent before we return.
+ */
+ virtio_device_ready(vdev);
+
+ ret = virtio_fs_add_instance(fs);
+ if (ret < 0)
+ goto out_vqs;
+
+ return 0;
+
+out_vqs:
+ vdev->config->reset(vdev);
+ virtio_fs_cleanup_vqs(vdev, fs);
+
+out:
+ vdev->priv = NULL;
+ return ret;
+}
+
+static void virtio_fs_remove(struct virtio_device *vdev)
+{
+ struct virtio_fs *fs = vdev->priv;
+
+ virtio_fs_free_devs(fs);
+
+ vdev->config->reset(vdev);
+ virtio_fs_cleanup_vqs(vdev, fs);
+
+ mutex_lock(&virtio_fs_mutex);
+ list_del(&fs->list);
+ mutex_unlock(&virtio_fs_mutex);
+
+ vdev->priv = NULL;
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int virtio_fs_freeze(struct virtio_device *vdev)
+{
+ return 0; /* TODO */
+}
+
+static int virtio_fs_restore(struct virtio_device *vdev)
+{
+ return 0; /* TODO */
+}
+#endif /* CONFIG_PM_SLEEP */
+
+static const struct virtio_device_id id_table[] = {
+ { VIRTIO_ID_FS, VIRTIO_DEV_ANY_ID },
+ {},
+};
+
+static const unsigned int feature_table[] = {};
+
+static struct virtio_driver virtio_fs_driver = {
+ .driver.name = KBUILD_MODNAME,
+ .driver.owner = THIS_MODULE,
+ .id_table = id_table,
+ .feature_table = feature_table,
+ .feature_table_size = ARRAY_SIZE(feature_table),
+ /* TODO validate config_get != NULL */
+ .probe = virtio_fs_probe,
+ .remove = virtio_fs_remove,
+#ifdef CONFIG_PM_SLEEP
+ .freeze = virtio_fs_freeze,
+ .restore = virtio_fs_restore,
+#endif
+};
+
+static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq)
+__releases(fiq->waitq.lock)
+{
+ struct fuse_forget_link *link;
+ struct virtio_fs_forget *forget;
+ struct scatterlist sg;
+ struct scatterlist *sgs[] = {&sg};
+ struct virtio_fs *fs;
+ struct virtqueue *vq;
+ struct virtio_fs_vq *fsvq;
+ bool notify;
+ u64 unique;
+ int ret;
+
+ link = fuse_dequeue_forget(fiq, 1, NULL);
+ unique = fuse_get_unique(fiq);
+
+ fs = fiq->priv;
+ fsvq = &fs->vqs[VQ_HIPRIO];
+ spin_unlock(&fiq->waitq.lock);
+
+ /* Allocate a buffer for the request */
+ forget = kmalloc(sizeof(*forget), GFP_NOFS | __GFP_NOFAIL);
+
+ forget->ih = (struct fuse_in_header){
+ .opcode = FUSE_FORGET,
+ .nodeid = link->forget_one.nodeid,
+ .unique = unique,
+ .len = sizeof(*forget),
+ };
+ forget->arg = (struct fuse_forget_in){
+ .nlookup = link->forget_one.nlookup,
+ };
+
+ sg_init_one(&sg, forget, sizeof(*forget));
+
+ /* Enqueue the request */
+ vq = fsvq->vq;
+ dev_dbg(&vq->vdev->dev, "%s\n", __func__);
+ spin_lock(&fsvq->lock);
+
+ ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC);
+ if (ret < 0) {
+ if (ret == -ENOMEM || ret == -ENOSPC) {
+ pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later.\n",
+ ret);
+ list_add_tail(&forget->list, &fsvq->queued_reqs);
+ schedule_delayed_work(&fsvq->dispatch_work,
+ msecs_to_jiffies(1));
+ } else {
+ pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n",
+ ret);
+ kfree(forget);
+ }
+ spin_unlock(&fsvq->lock);
+ goto out;
+ }
+
+ fsvq->in_flight++;
+ notify = virtqueue_kick_prepare(vq);
+
+ spin_unlock(&fsvq->lock);
+
+ if (notify)
+ virtqueue_notify(vq);
+out:
+ kfree(link);
+}
+
+static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq)
+__releases(fiq->waitq.lock)
+{
+ /* TODO */
+ spin_unlock(&fiq->waitq.lock);
+}
+
+/* Return the number of scatter-gather list elements required */
+static unsigned int sg_count_fuse_req(struct fuse_req *req)
+{
+ unsigned int total_sgs = 1 /* fuse_in_header */;
+
+ if (req->in.numargs - req->in.argpages)
+ total_sgs += 1;
+
+ if (req->in.argpages)
+ total_sgs += req->num_pages;
+
+ if (!test_bit(FR_ISREPLY, &req->flags))
+ return total_sgs;
+
+ total_sgs += 1 /* fuse_out_header */;
+
+ if (req->out.numargs - req->out.argpages)
+ total_sgs += 1;
+
+ if (req->out.argpages)
+ total_sgs += req->num_pages;
+
+ return total_sgs;
+}
+
+/* Add pages to scatter-gather list and return number of elements used */
+static unsigned int sg_init_fuse_pages(struct scatterlist *sg,
+ struct page **pages,
+ struct fuse_page_desc *page_descs,
+ unsigned int num_pages,
+ unsigned int total_len)
+{
+ unsigned int i;
+ unsigned int this_len;
+
+ for (i = 0; i < num_pages && total_len; i++) {
+ sg_init_table(&sg[i], 1);
+ this_len = min(page_descs[i].length, total_len);
+ sg_set_page(&sg[i], pages[i], this_len, page_descs[i].offset);
+ total_len -= this_len;
+ }
+
+ return i;
+}
+
+/* Add args to scatter-gather list and return number of elements used */
+static unsigned int sg_init_fuse_args(struct scatterlist *sg,
+ struct fuse_req *req,
+ struct fuse_arg *args,
+ unsigned int numargs,
+ bool argpages,
+ void *argbuf,
+ unsigned int *len_used)
+{
+ unsigned int total_sgs = 0;
+ unsigned int len;
+
+ len = fuse_len_args(numargs - argpages, args);
+ if (len)
+ sg_init_one(&sg[total_sgs++], argbuf, len);
+
+ if (argpages)
+ total_sgs += sg_init_fuse_pages(&sg[total_sgs],
+ req->pages,
+ req->page_descs,
+ req->num_pages,
+ args[numargs - 1].size);
+
+ if (len_used)
+ *len_used = len;
+
+ return total_sgs;
+}
+
+/* Add a request to a virtqueue and kick the device */
+static int virtio_fs_enqueue_req(struct virtqueue *vq, struct fuse_req *req)
+{
+ /* requests need at least 4 elements */
+ struct scatterlist *stack_sgs[6];
+ struct scatterlist stack_sg[ARRAY_SIZE(stack_sgs)];
+ struct scatterlist **sgs = stack_sgs;
+ struct scatterlist *sg = stack_sg;
+ struct virtio_fs_vq *fsvq;
+ unsigned int argbuf_used = 0;
+ unsigned int out_sgs = 0;
+ unsigned int in_sgs = 0;
+ unsigned int total_sgs;
+ unsigned int i;
+ int ret;
+ bool notify;
+
+ /* Does the sglist fit on the stack? */
+ total_sgs = sg_count_fuse_req(req);
+ if (total_sgs > ARRAY_SIZE(stack_sgs)) {
+ sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), GFP_ATOMIC);
+ sg = kmalloc_array(total_sgs, sizeof(sg[0]), GFP_ATOMIC);
+ if (!sgs || !sg) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /* Use a bounce buffer since stack args cannot be mapped */
+ ret = copy_args_to_argbuf(req);
+ if (ret < 0)
+ goto out;
+
+ /* Request elements */
+ sg_init_one(&sg[out_sgs++], &req->in.h, sizeof(req->in.h));
+ out_sgs += sg_init_fuse_args(&sg[out_sgs], req,
+ (struct fuse_arg *)req->in.args,
+ req->in.numargs, req->in.argpages,
+ req->argbuf, &argbuf_used);
+
+ /* Reply elements */
+ if (test_bit(FR_ISREPLY, &req->flags)) {
+ sg_init_one(&sg[out_sgs + in_sgs++],
+ &req->out.h, sizeof(req->out.h));
+ in_sgs += sg_init_fuse_args(&sg[out_sgs + in_sgs], req,
+ req->out.args, req->out.numargs,
+ req->out.argpages,
+ req->argbuf + argbuf_used, NULL);
+ }
+
+ WARN_ON(out_sgs + in_sgs != total_sgs);
+
+ for (i = 0; i < total_sgs; i++)
+ sgs[i] = &sg[i];
+
+ fsvq = vq_to_fsvq(vq);
+ spin_lock(&fsvq->lock);
+
+ ret = virtqueue_add_sgs(vq, sgs, out_sgs, in_sgs, req, GFP_ATOMIC);
+ if (ret < 0) {
+ /* TODO handle full virtqueue */
+ spin_unlock(&fsvq->lock);
+ goto out;
+ }
+
+ notify = virtqueue_kick_prepare(vq);
+
+ spin_unlock(&fsvq->lock);
+
+ if (notify)
+ virtqueue_notify(vq);
+
+out:
+ if (ret < 0 && req->argbuf) {
+ kfree(req->argbuf);
+ req->argbuf = NULL;
+ }
+ if (sgs != stack_sgs) {
+ kfree(sgs);
+ kfree(sg);
+ }
+
+ return ret;
+}
+
+static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq)
+__releases(fiq->waitq.lock)
+{
+ unsigned int queue_id = VQ_REQUEST; /* TODO multiqueue */
+ struct virtio_fs *fs;
+ struct fuse_conn *fc;
+ struct fuse_req *req;
+ struct fuse_pqueue *fpq;
+ int ret;
+
+ WARN_ON(list_empty(&fiq->pending));
+ req = list_last_entry(&fiq->pending, struct fuse_req, list);
+ clear_bit(FR_PENDING, &req->flags);
+ list_del_init(&req->list);
+ WARN_ON(!list_empty(&fiq->pending));
+ spin_unlock(&fiq->waitq.lock);
+
+ fs = fiq->priv;
+ fc = fs->vqs[queue_id].fud->fc;
+
+ dev_dbg(&fs->vqs[queue_id].vq->vdev->dev,
+ "%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u\n",
+ __func__, req->in.h.opcode, req->in.h.unique, req->in.h.nodeid,
+ req->in.h.len, fuse_len_args(req->out.numargs, req->out.args));
+
+ fpq = &fs->vqs[queue_id].fud->pq;
+ spin_lock(&fpq->lock);
+ if (!fpq->connected) {
+ spin_unlock(&fpq->lock);
+ req->out.h.error = -ENODEV;
+ pr_err("virtio-fs: %s disconnected\n", __func__);
+ fuse_request_end(fc, req);
+ return;
+ }
+ list_add_tail(&req->list, fpq->processing);
+ spin_unlock(&fpq->lock);
+ set_bit(FR_SENT, &req->flags);
+ /* matches barrier in request_wait_answer() */
+ smp_mb__after_atomic();
+ /* TODO check for FR_INTERRUPTED? */
+
+retry:
+ ret = virtio_fs_enqueue_req(fs->vqs[queue_id].vq, req);
+ if (ret < 0) {
+ if (ret == -ENOMEM || ret == -ENOSPC) {
+ /* Virtqueue full. Retry submission */
+ usleep_range(20, 30);
+ goto retry;
+ }
+ req->out.h.error = ret;
+ pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n", ret);
+ fuse_request_end(fc, req);
+ return;
+ }
+}
+
+static void virtio_fs_flush_hiprio_queue(struct virtio_fs_vq *fsvq)
+{
+ struct virtio_fs_forget *forget;
+
+ WARN_ON(fsvq->in_flight < 0);
+
+ /* Go through pending forget requests and free them */
+ spin_lock(&fsvq->lock);
+ while (1) {
+ forget = list_first_entry_or_null(&fsvq->queued_reqs,
+ struct virtio_fs_forget, list);
+ if (!forget)
+ break;
+ list_del(&forget->list);
+ kfree(forget);
+ }
+
+ spin_unlock(&fsvq->lock);
+
+ /* Wait for in-flight requests to finish. */
+ while (1) {
+ spin_lock(&fsvq->lock);
+ if (!fsvq->in_flight) {
+ spin_unlock(&fsvq->lock);
+ break;
+ }
+ spin_unlock(&fsvq->lock);
+ usleep_range(1000, 2000);
+ }
+}
+
+static const struct fuse_iqueue_ops virtio_fs_fiq_ops = {
+ .wake_forget_and_unlock = virtio_fs_wake_forget_and_unlock,
+ .wake_interrupt_and_unlock = virtio_fs_wake_interrupt_and_unlock,
+ .wake_pending_and_unlock = virtio_fs_wake_pending_and_unlock,
+};
+
+static int virtio_fs_fill_super(struct super_block *sb)
+{
+ struct fuse_conn *fc = get_fuse_conn_super(sb);
+ struct virtio_fs *fs = fc->iq.priv;
+ unsigned int i;
+ int err;
+ struct fuse_req *init_req;
+ struct fuse_fs_context ctx = {
+ .rootmode = S_IFDIR,
+ .default_permissions = 1,
+ .allow_other = 1,
+ .max_read = UINT_MAX,
+ .blksize = 512,
+ .destroy = true,
+ .no_control = true,
+ .no_force_umount = true,
+ };
+
+ /* TODO lock */
+ if (fs->vqs[VQ_REQUEST].fud) {
+ pr_err("virtio-fs: device already in use\n");
+ err = -EBUSY;
+ goto err;
+ }
+
+ err = -ENOMEM;
+ /* Allocate fuse_dev for hiprio and notification queues */
+ for (i = 0; i < VQ_REQUEST; i++) {
+ struct virtio_fs_vq *fsvq = &fs->vqs[i];
+
+ fsvq->fud = fuse_dev_alloc();
+ if (!fsvq->fud)
+ goto err_free_fuse_devs;
+ }
+
+ init_req = fuse_request_alloc(0);
+ if (!init_req)
+ goto err_free_fuse_devs;
+ __set_bit(FR_BACKGROUND, &init_req->flags);
+
+ ctx.fudptr = (void **)&fs->vqs[VQ_REQUEST].fud;
+ err = fuse_fill_super_common(sb, &ctx);
+ if (err < 0)
+ goto err_free_init_req;
+
+ fc = fs->vqs[VQ_REQUEST].fud->fc;
+
+ /* TODO take fuse_mutex around this loop? */
+ for (i = 0; i < fs->nvqs; i++) {
+ struct virtio_fs_vq *fsvq = &fs->vqs[i];
+
+ if (i == VQ_REQUEST)
+ continue; /* already initialized */
+ fuse_dev_install(fsvq->fud, fc);
+ atomic_inc(&fc->dev_count);
+ }
+
+ fuse_send_init(fc, init_req);
+ return 0;
+
+err_free_init_req:
+ fuse_request_free(init_req);
+err_free_fuse_devs:
+ for (i = 0; i < fs->nvqs; i++)
+ fuse_dev_free(fs->vqs[i].fud);
+err:
+ return err;
+}
+
+static void virtio_kill_sb(struct super_block *sb)
+{
+ struct fuse_conn *fc = get_fuse_conn_super(sb);
+ struct virtio_fs *vfs;
+ struct virtio_fs_vq *fsvq;
+
+ /* If mount failed, we can still be called without any fc */
+ if (!fc)
+ return fuse_kill_sb_anon(sb);
+
+ vfs = fc->iq.priv;
+ fsvq = &vfs->vqs[VQ_HIPRIO];
+
+ /* Stop forget queue. Soon destroy will be sent */
+ spin_lock(&fsvq->lock);
+ fsvq->connected = false;
+ spin_unlock(&fsvq->lock);
+ virtio_fs_flush_hiprio_queue(fsvq);
+
+ fuse_kill_sb_anon(sb);
+ virtio_fs_free_devs(vfs);
+}
+
+static int virtio_fs_test_super(struct super_block *sb,
+ struct fs_context *fsc)
+{
+ struct fuse_conn *fc = fsc->s_fs_info;
+
+ return fc->iq.priv == get_fuse_conn_super(sb)->iq.priv;
+}
+
+static int virtio_fs_set_super(struct super_block *sb,
+ struct fs_context *fsc)
+{
+ int err;
+
+ err = get_anon_bdev(&sb->s_dev);
+ if (!err)
+ fuse_conn_get(fsc->s_fs_info);
+
+ return err;
+}
+
+static int virtio_fs_get_tree(struct fs_context *fsc)
+{
+ struct virtio_fs *fs;
+ struct super_block *sb;
+ struct fuse_conn *fc;
+ int err;
+
+ fs = virtio_fs_find_instance(fsc->source);
+ if (!fs) {
+ pr_info("virtio-fs: tag <%s> not found\n", fsc->source);
+ return -EINVAL;
+ }
+
+ fc = kzalloc(sizeof(struct fuse_conn), GFP_KERNEL);
+ if (!fc)
+ return -ENOMEM;
+
+ fuse_conn_init(fc, get_user_ns(current_user_ns()), &virtio_fs_fiq_ops,
+ fs);
+ fc->release = fuse_free_conn;
+ fc->delete_stale = true;
+
+ fsc->s_fs_info = fc;
+ sb = sget_fc(fsc, virtio_fs_test_super, virtio_fs_set_super);
+ fuse_conn_put(fc);
+ if (IS_ERR(sb))
+ return PTR_ERR(sb);
+
+ if (!sb->s_root) {
+ err = virtio_fs_fill_super(sb);
+ if (err) {
+ deactivate_locked_super(sb);
+ return err;
+ }
+
+ sb->s_flags |= SB_ACTIVE;
+ }
+
+ WARN_ON(fsc->root);
+ fsc->root = dget(sb->s_root);
+ return 0;
+}
+
+static const struct fs_context_operations virtio_fs_context_ops = {
+ .get_tree = virtio_fs_get_tree,
+};
+
+static int virtio_fs_init_fs_context(struct fs_context *fsc)
+{
+ fsc->ops = &virtio_fs_context_ops;
+ return 0;
+}
+
+static struct file_system_type virtio_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "virtiofs",
+ .init_fs_context = virtio_fs_init_fs_context,
+ .kill_sb = virtio_kill_sb,
+};
+
+static int __init virtio_fs_init(void)
+{
+ int ret;
+
+ ret = register_virtio_driver(&virtio_fs_driver);
+ if (ret < 0)
+ return ret;
+
+ ret = register_filesystem(&virtio_fs_type);
+ if (ret < 0) {
+ unregister_virtio_driver(&virtio_fs_driver);
+ return ret;
+ }
+
+ return 0;
+}
+module_init(virtio_fs_init);
+
+static void __exit virtio_fs_exit(void)
+{
+ unregister_filesystem(&virtio_fs_type);
+ unregister_virtio_driver(&virtio_fs_driver);
+}
+module_exit(virtio_fs_exit);
+
+MODULE_AUTHOR("Stefan Hajnoczi <[email protected]>");
+MODULE_DESCRIPTION("Virtio Filesystem");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_FS(KBUILD_MODNAME);
+MODULE_DEVICE_TABLE(virtio, id_table);
diff --git a/include/uapi/linux/virtio_fs.h b/include/uapi/linux/virtio_fs.h
new file mode 100644
index 000000000000..b5e99c217c86
--- /dev/null
+++ b/include/uapi/linux/virtio_fs.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+
+#ifndef _UAPI_LINUX_VIRTIO_FS_H
+#define _UAPI_LINUX_VIRTIO_FS_H
+
+#include <linux/types.h>
+#include <linux/virtio_ids.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_types.h>
+
+struct virtio_fs_config {
+ /* Filesystem name (UTF-8, not NUL-terminated, padded with NULs) */
+ __u8 tag[36];
+
+ /* Number of request queues */
+ __u32 num_queues;
+} __attribute__((packed));
+
+#endif /* _UAPI_LINUX_VIRTIO_FS_H */
diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h
index 348fd0176f75..585e07b27333 100644
--- a/include/uapi/linux/virtio_ids.h
+++ b/include/uapi/linux/virtio_ids.h
@@ -44,6 +44,7 @@
#define VIRTIO_ID_VSOCK 19 /* virtio vsock transport */
#define VIRTIO_ID_CRYPTO 20 /* virtio crypto */
#define VIRTIO_ID_IOMMU 23 /* virtio IOMMU */
+#define VIRTIO_ID_FS 26 /* virtio filesystem */
#define VIRTIO_ID_PMEM 27 /* virtio pmem */
#endif /* _LINUX_VIRTIO_IDS_H */
--
2.21.0
From: Stefan Hajnoczi <[email protected]>
The /dev/fuse device uses fiq->waitq and fasync to signal that requests are
available. These mechanisms do not apply to virtio-fs. This patch
introduces callbacks so alternative behavior can be used.
Note that queue_interrupt() changes along these lines:
spin_lock(&fiq->waitq.lock);
wake_up_locked(&fiq->waitq);
+ kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
spin_unlock(&fiq->waitq.lock);
- kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
Since queue_request() and queue_forget() also call kill_fasync() inside
the spinlock, this should be safe.
Signed-off-by: Stefan Hajnoczi <[email protected]>
Signed-off-by: Miklos Szeredi <[email protected]>
---
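(Note after the ---, not part of the commit: for illustration, a
transport hooks in by passing its own ops and a private pointer to
fuse_conn_init().  The "mydev" names in this untested sketch are made
up.)

	static void mydev_wake_and_unlock(struct fuse_iqueue *fiq)
	__releases(fiq->waitq.lock)
	{
		struct mydev *md = fiq->priv;	/* fiq_priv from fuse_conn_init() */

		spin_unlock(&fiq->waitq.lock);
		mydev_kick(md);			/* hypothetical doorbell */
	}

	static const struct fuse_iqueue_ops mydev_fiq_ops = {
		.wake_forget_and_unlock		= mydev_wake_and_unlock,
		.wake_interrupt_and_unlock	= mydev_wake_and_unlock,
		.wake_pending_and_unlock	= mydev_wake_and_unlock,
	};

	/* ... at connection setup: */
	fuse_conn_init(fc, sb->s_user_ns, &mydev_fiq_ops, md);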
fs/fuse/cuse.c | 2 +-
fs/fuse/dev.c | 50 ++++++++++++++++++++++++++++++++----------------
fs/fuse/fuse_i.h | 42 +++++++++++++++++++++++++++++++++++++++-
fs/fuse/inode.c | 13 +++++++++----
4 files changed, 85 insertions(+), 22 deletions(-)
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index d011a1ad1d4f..7bad5d428ce1 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -503,7 +503,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
* Limit the cuse channel to requests that can
* be represented in file->f_cred->user_ns.
*/
- fuse_conn_init(&cc->fc, file->f_cred->user_ns);
+ fuse_conn_init(&cc->fc, file->f_cred->user_ns, &fuse_dev_fiq_ops, NULL);
fud = fuse_dev_alloc(&cc->fc);
if (!fud) {
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index c0c30a225e78..0bdb98a3b251 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -375,13 +375,33 @@ static unsigned int fuse_req_hash(u64 unique)
return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS);
}
-static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req)
+/**
+ * A new request is available, wake fiq->waitq
+ */
+static void fuse_dev_wake_and_unlock(struct fuse_iqueue *fiq)
+__releases(fiq->waitq.lock)
{
- req->in.h.len = sizeof(struct fuse_in_header) +
- fuse_len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
- list_add_tail(&req->list, &fiq->pending);
wake_up_locked(&fiq->waitq);
kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
+ spin_unlock(&fiq->waitq.lock);
+}
+
+const struct fuse_iqueue_ops fuse_dev_fiq_ops = {
+ .wake_forget_and_unlock = fuse_dev_wake_and_unlock,
+ .wake_interrupt_and_unlock = fuse_dev_wake_and_unlock,
+ .wake_pending_and_unlock = fuse_dev_wake_and_unlock,
+};
+EXPORT_SYMBOL_GPL(fuse_dev_fiq_ops);
+
+static void queue_request_and_unlock(struct fuse_iqueue *fiq,
+ struct fuse_req *req)
+__releases(fiq->waitq.lock)
+{
+ req->in.h.len = sizeof(struct fuse_in_header) +
+ fuse_len_args(req->in.numargs,
+ (struct fuse_arg *) req->in.args);
+ list_add_tail(&req->list, &fiq->pending);
+ fiq->ops->wake_pending_and_unlock(fiq);
}
void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
@@ -396,12 +416,11 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
if (fiq->connected) {
fiq->forget_list_tail->next = forget;
fiq->forget_list_tail = forget;
- wake_up_locked(&fiq->waitq);
- kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
+ fiq->ops->wake_forget_and_unlock(fiq);
} else {
kfree(forget);
+ spin_unlock(&fiq->waitq.lock);
}
- spin_unlock(&fiq->waitq.lock);
}
static void flush_bg_queue(struct fuse_conn *fc)
@@ -417,8 +436,7 @@ static void flush_bg_queue(struct fuse_conn *fc)
fc->active_background++;
spin_lock(&fiq->waitq.lock);
req->in.h.unique = fuse_get_unique(fiq);
- queue_request(fiq, req);
- spin_unlock(&fiq->waitq.lock);
+ queue_request_and_unlock(fiq, req);
}
}
@@ -506,10 +524,10 @@ static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
spin_unlock(&fiq->waitq.lock);
return 0;
}
- wake_up_locked(&fiq->waitq);
- kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
+ fiq->ops->wake_interrupt_and_unlock(fiq);
+ } else {
+ spin_unlock(&fiq->waitq.lock);
}
- spin_unlock(&fiq->waitq.lock);
return 0;
}
@@ -569,11 +587,10 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
req->out.h.error = -ENOTCONN;
} else {
req->in.h.unique = fuse_get_unique(fiq);
- queue_request(fiq, req);
/* acquire extra reference, since request is still needed
after fuse_request_end() */
__fuse_get_request(req);
- spin_unlock(&fiq->waitq.lock);
+ queue_request_and_unlock(fiq, req);
request_wait_answer(fc, req);
/* Pairs with smp_wmb() in fuse_request_end() */
@@ -706,10 +723,11 @@ static int fuse_request_send_notify_reply(struct fuse_conn *fc,
req->in.h.unique = unique;
spin_lock(&fiq->waitq.lock);
if (fiq->connected) {
- queue_request(fiq, req);
+ queue_request_and_unlock(fiq, req);
err = 0;
+ } else {
+ spin_unlock(&fiq->waitq.lock);
}
- spin_unlock(&fiq->waitq.lock);
return err;
}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index d0c8f131fb5d..617eca6046d2 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -446,6 +446,39 @@ struct fuse_req {
struct file *stolen_file;
};
+struct fuse_iqueue;
+
+/**
+ * Input queue callbacks
+ *
+ * Input queue signalling is device-specific. For example, the /dev/fuse file
+ * uses fiq->waitq and fasync to wake processes that are waiting on queue
+ * readiness. These callbacks allow other device types to respond to input
+ * queue activity.
+ */
+struct fuse_iqueue_ops {
+ /**
+ * Signal that a forget has been queued
+ */
+ void (*wake_forget_and_unlock)(struct fuse_iqueue *fiq)
+ __releases(fiq->waitq.lock);
+
+ /**
+ * Signal that an INTERRUPT request has been queued
+ */
+ void (*wake_interrupt_and_unlock)(struct fuse_iqueue *fiq)
+ __releases(fiq->waitq.lock);
+
+ /**
+ * Signal that a request has been queued
+ */
+ void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq)
+ __releases(fiq->waitq.lock);
+};
+
+/** /dev/fuse input queue operations */
+extern const struct fuse_iqueue_ops fuse_dev_fiq_ops;
+
struct fuse_iqueue {
/** Connection established */
unsigned connected;
@@ -471,6 +504,12 @@ struct fuse_iqueue {
/** O_ASYNC requests */
struct fasync_struct *fasync;
+
+ /** Device-specific callbacks */
+ const struct fuse_iqueue_ops *ops;
+
+ /** Device-specific state */
+ void *priv;
};
#define FUSE_PQ_HASH_BITS 8
@@ -1009,7 +1048,8 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
/**
* Initialize fuse_conn
*/
-void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns);
+void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
+ const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv);
/**
* Release reference to fuse_conn
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 048816c95b69..e42e600e885b 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -569,7 +569,9 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
return 0;
}
-static void fuse_iqueue_init(struct fuse_iqueue *fiq)
+static void fuse_iqueue_init(struct fuse_iqueue *fiq,
+ const struct fuse_iqueue_ops *ops,
+ void *priv)
{
memset(fiq, 0, sizeof(struct fuse_iqueue));
init_waitqueue_head(&fiq->waitq);
@@ -577,6 +579,8 @@ static void fuse_iqueue_init(struct fuse_iqueue *fiq)
INIT_LIST_HEAD(&fiq->interrupts);
fiq->forget_list_tail = &fiq->forget_list_head;
fiq->connected = 1;
+ fiq->ops = ops;
+ fiq->priv = priv;
}
static void fuse_pqueue_init(struct fuse_pqueue *fpq)
@@ -590,7 +594,8 @@ static void fuse_pqueue_init(struct fuse_pqueue *fpq)
fpq->connected = 1;
}
-void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns)
+void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
+ const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv)
{
memset(fc, 0, sizeof(*fc));
spin_lock_init(&fc->lock);
@@ -600,7 +605,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns)
atomic_set(&fc->dev_count, 1);
init_waitqueue_head(&fc->blocked_waitq);
init_waitqueue_head(&fc->reserved_req_waitq);
- fuse_iqueue_init(&fc->iq);
+ fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv);
INIT_LIST_HEAD(&fc->bg_queue);
INIT_LIST_HEAD(&fc->entry);
INIT_LIST_HEAD(&fc->devices);
@@ -1205,7 +1210,7 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
if (!fc)
goto err_free_init_req;
- fuse_conn_init(fc, sb->s_user_ns);
+ fuse_conn_init(fc, sb->s_user_ns, &fuse_dev_fiq_ops, NULL);
fc->release = fuse_free_conn;
sb->s_fs_info = fc;
--
2.21.0
From: Stefan Hajnoczi <[email protected]>
virtio-fs will need unique IDs for FORGET requests from outside
fs/fuse/dev.c. Make the symbol visible.
Signed-off-by: Stefan Hajnoczi <[email protected]>
Signed-off-by: Miklos Szeredi <[email protected]>
---
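(Note after the ---, not part of the commit: the virtio-fs queue
callbacks later in the series are entered with fiq->waitq.lock held and
use the new export like this:)

	link = fuse_dequeue_forget(fiq, 1, NULL);
	unique = fuse_get_unique(fiq);
	spin_unlock(&fiq->waitq.lock);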
fs/fuse/dev.c | 3 ++-
fs/fuse/fuse_i.h | 5 +++++
2 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index bd2e5958d2f9..167f476fbe16 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -363,11 +363,12 @@ unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args)
}
EXPORT_SYMBOL_GPL(fuse_len_args);
-static u64 fuse_get_unique(struct fuse_iqueue *fiq)
+u64 fuse_get_unique(struct fuse_iqueue *fiq)
{
fiq->reqctr += FUSE_REQ_ID_STEP;
return fiq->reqctr;
}
+EXPORT_SYMBOL_GPL(fuse_get_unique);
static unsigned int fuse_req_hash(u64 unique)
{
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 8ced5e74e5a8..7e19c936ece8 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1104,4 +1104,9 @@ int fuse_readdir(struct file *file, struct dir_context *ctx);
*/
unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args);
+/**
+ * Get the next unique ID for a request
+ */
+u64 fuse_get_unique(struct fuse_iqueue *fiq);
+
#endif /* _FS_FUSE_I_H */
--
2.21.0
From: Stefan Hajnoczi <[email protected]>
Add information about the new "virtiofs" file system.
Signed-off-by: Stefan Hajnoczi <[email protected]>
Signed-off-by: Miklos Szeredi <[email protected]>
---
Documentation/filesystems/index.rst | 10 +++++
Documentation/filesystems/virtiofs.rst | 60 ++++++++++++++++++++++++++
MAINTAINERS | 11 +++++
3 files changed, 81 insertions(+)
create mode 100644 Documentation/filesystems/virtiofs.rst
diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst
index 2de2fe2ab078..56e94bfc580f 100644
--- a/Documentation/filesystems/index.rst
+++ b/Documentation/filesystems/index.rst
@@ -32,3 +32,13 @@ filesystem implementations.
journalling
fscrypt
+
+Filesystems
+===========
+
+Documentation for filesystem implementations.
+
+.. toctree::
+ :maxdepth: 2
+
+ virtiofs
diff --git a/Documentation/filesystems/virtiofs.rst b/Documentation/filesystems/virtiofs.rst
new file mode 100644
index 000000000000..4f338e3cb3f7
--- /dev/null
+++ b/Documentation/filesystems/virtiofs.rst
@@ -0,0 +1,60 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===================================================
+virtiofs: virtio-fs host<->guest shared file system
+===================================================
+
+- Copyright (C) 2019 Red Hat, Inc.
+
+Introduction
+============
+The virtiofs file system for Linux implements a driver for the paravirtualized
+VIRTIO "virtio-fs" device for guest<->host file system sharing. It allows a
+guest to mount a directory that has been exported on the host.
+
+Guests often require access to files residing on the host or remote systems.
+Use cases include making files available to new guests during installation,
+booting from a root file system located on the host, persistent storage for
+stateless or ephemeral guests, and sharing a directory between guests.
+
+Although it is possible to use existing network file systems for some of these
+tasks, they require configuration steps that are hard to automate and they
+expose the storage network to the guest. The virtio-fs device was designed to
+solve these problems by providing file system access without networking.
+
+Furthermore the virtio-fs device takes advantage of the co-location of the
+guest and host to increase performance and provide semantics that are not
+possible with network file systems.
+
+Usage
+=====
+Mount file system with tag ``myfs`` on ``/mnt``:
+
+.. code-block:: sh
+
+ guest# mount -t virtiofs myfs /mnt
+
+Please see https://virtio-fs.gitlab.io/ for details on how to configure QEMU
+and the virtiofsd daemon.
+
+Internals
+=========
+Since the virtio-fs device uses the FUSE protocol for file system requests, the
+virtiofs file system for Linux is integrated closely with the FUSE file system
+client. The guest acts as the FUSE client while the host acts as the FUSE
+server. The /dev/fuse interface between the kernel and userspace is replaced
+with the virtio-fs device interface.
+
+FUSE requests are placed into a virtqueue and processed by the host. The
+response portion of the buffer is filled in by the host and the guest handles
+the request completion.
+
+Mapping /dev/fuse to virtqueues requires solving differences in semantics
+between /dev/fuse and virtqueues. Each time the /dev/fuse device is read, the
+FUSE client may choose which request to transfer, making it possible to
+prioritize certain requests over others. Virtqueues have queue semantics and
+it is not possible to change the order of requests that have been enqueued.
+This is especially important if the virtqueue becomes full since it is then
+impossible to add high priority requests. In order to address this difference,
+the virtio-fs device uses a "hiprio" virtqueue specifically for requests that
+have priority over normal requests.
diff --git a/MAINTAINERS b/MAINTAINERS
index 9cbcf167bdd0..459b3fa8e25e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17117,6 +17117,17 @@ S: Supported
F: drivers/s390/virtio/
F: arch/s390/include/uapi/asm/virtio-ccw.h
+VIRTIO FILE SYSTEM
+M: Stefan Hajnoczi <[email protected]>
+M: Miklos Szeredi <[email protected]>
+L: [email protected]
+L: [email protected]
+W: https://virtio-fs.gitlab.io/
+S: Supported
+F: fs/fuse/virtio_fs.c
+F: include/uapi/linux/virtio_fs.h
+F: Documentation/filesystems/virtiofs.rst
+
VIRTIO GPU DRIVER
M: David Airlie <[email protected]>
M: Gerd Hoffmann <[email protected]>
--
2.21.0
From: Stefan Hajnoczi <[email protected]>
fuse_fill_super() includes code to process the fd= option and link the
struct fuse_dev to the fd's struct file. In virtio-fs there is no file
descriptor because /dev/fuse is not used.
This patch extracts fuse_fill_super_common() so that both classic fuse and
virtio-fs can share the code to initialize a mount.
Signed-off-by: Stefan Hajnoczi <[email protected]>
Signed-off-by: Miklos Szeredi <[email protected]>
---
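(Note after the ---, not part of the commit: the transport-specific
piece that remains is where the fuse_dev pointer gets stored, condensed
from the two users in this series:)

	/* /dev/fuse: */
	ctx->fudptr = &file->private_data;
	err = fuse_fill_super_common(sb, ctx);

	/* virtio-fs, later in the series: */
	ctx.fudptr = (void **)&fs->vqs[VQ_REQUEST].fud;
	err = fuse_fill_super_common(sb, &ctx);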
fs/fuse/fuse_i.h | 27 ++++++++++
fs/fuse/inode.c | 133 +++++++++++++++++++++++------------------------
2 files changed, 91 insertions(+), 69 deletions(-)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 6533be37873f..d0c8f131fb5d 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -504,6 +504,26 @@ struct fuse_dev {
struct list_head entry;
};
+struct fuse_fs_context {
+ int fd;
+ unsigned int rootmode;
+ kuid_t user_id;
+ kgid_t group_id;
+ bool is_bdev:1;
+ bool fd_present:1;
+ bool rootmode_present:1;
+ bool user_id_present:1;
+ bool group_id_present:1;
+ bool default_permissions:1;
+ bool allow_other:1;
+ unsigned int max_read;
+ unsigned int blksize;
+ const char *subtype;
+
+ /* fuse_dev pointer to fill in, should contain NULL on entry */
+ void **fudptr;
+};
+
/**
* A Fuse connection.
*
@@ -1000,6 +1020,13 @@ struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc);
void fuse_dev_free(struct fuse_dev *fud);
void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req);
+/**
+ * Fill in superblock and initialize fuse connection
+ * @sb: partially-initialized superblock to fill in
+ * @ctx: mount context
+ */
+int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx);
+
/**
* Add connection to control filesystem
*/
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 31cf0c47da13..048816c95b69 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -64,23 +64,6 @@ MODULE_PARM_DESC(max_user_congthresh,
static struct file_system_type fuseblk_fs_type;
#endif
-struct fuse_fs_context {
- const char *subtype;
- bool is_bdev;
- int fd;
- unsigned rootmode;
- kuid_t user_id;
- kgid_t group_id;
- unsigned fd_present:1;
- unsigned rootmode_present:1;
- unsigned user_id_present:1;
- unsigned group_id_present:1;
- unsigned default_permissions:1;
- unsigned allow_other:1;
- unsigned max_read;
- unsigned blksize;
-};
-
struct fuse_forget_link *fuse_alloc_forget(void)
{
return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL);
@@ -1083,17 +1066,13 @@ void fuse_dev_free(struct fuse_dev *fud)
}
EXPORT_SYMBOL_GPL(fuse_dev_free);
-static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
+int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
{
- struct fuse_fs_context *ctx = fsc->fs_private;
struct fuse_dev *fud;
- struct fuse_conn *fc;
+ struct fuse_conn *fc = get_fuse_conn_super(sb);
struct inode *root;
- struct file *file;
struct dentry *root_dentry;
- struct fuse_req *init_req;
int err;
- int is_bdev = sb->s_bdev != NULL;
err = -EINVAL;
if (sb->s_flags & SB_MANDLOCK)
@@ -1101,7 +1080,7 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION);
- if (is_bdev) {
+ if (ctx->is_bdev) {
#ifdef CONFIG_BLOCK
err = -EINVAL;
if (!sb_set_blocksize(sb, ctx->blksize))
@@ -1124,19 +1103,6 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
if (sb->s_user_ns != &init_user_ns)
sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER;
- file = fget(ctx->fd);
- err = -EINVAL;
- if (!file)
- goto err;
-
- /*
- * Require mount to happen from the same user namespace which
- * opened /dev/fuse to prevent potential attacks.
- */
- if (file->f_op != &fuse_dev_operations ||
- file->f_cred->user_ns != sb->s_user_ns)
- goto err_fput;
-
/*
* If we are not in the initial user namespace posix
* acls must be translated.
@@ -1144,17 +1110,9 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
if (sb->s_user_ns != &init_user_ns)
sb->s_xattr = fuse_no_acl_xattr_handlers;
- fc = kmalloc(sizeof(*fc), GFP_KERNEL);
- err = -ENOMEM;
- if (!fc)
- goto err_fput;
-
- fuse_conn_init(fc, sb->s_user_ns);
- fc->release = fuse_free_conn;
-
fud = fuse_dev_alloc(fc);
if (!fud)
- goto err_put_conn;
+ goto err;
fc->dev = sb->s_dev;
fc->sb = sb;
@@ -1173,9 +1131,6 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
fc->group_id = ctx->group_id;
fc->max_read = max_t(unsigned, 4096, ctx->max_read);
- /* Used by get_root_inode() */
- sb->s_fs_info = fc;
-
err = -ENOMEM;
root = fuse_get_root_inode(sb, ctx->rootmode);
sb->s_d_op = &fuse_root_dentry_operations;
@@ -1185,20 +1140,15 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
/* Root dentry doesn't have .d_revalidate */
sb->s_d_op = &fuse_dentry_operations;
- init_req = fuse_request_alloc(0);
- if (!init_req)
- goto err_put_root;
- __set_bit(FR_BACKGROUND, &init_req->flags);
-
- if (is_bdev) {
+ if (ctx->is_bdev) {
fc->destroy_req = fuse_request_alloc(0);
if (!fc->destroy_req)
- goto err_free_init_req;
+ goto err_put_root;
}
mutex_lock(&fuse_mutex);
err = -EINVAL;
- if (file->private_data)
+ if (*ctx->fudptr)
goto err_unlock;
err = fuse_ctl_add_conn(fc);
@@ -1207,30 +1157,75 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
list_add_tail(&fc->entry, &fuse_conn_list);
sb->s_root = root_dentry;
- file->private_data = fud;
+ *ctx->fudptr = fud;
mutex_unlock(&fuse_mutex);
- /*
- * atomic_dec_and_test() in fput() provides the necessary
- * memory barrier for file->private_data to be visible on all
- * CPUs after this
- */
- fput(file);
-
- fuse_send_init(fc, init_req);
-
return 0;
err_unlock:
mutex_unlock(&fuse_mutex);
- err_free_init_req:
- fuse_request_free(init_req);
err_put_root:
dput(root_dentry);
err_dev_free:
fuse_dev_free(fud);
+ err:
+ return err;
+}
+EXPORT_SYMBOL_GPL(fuse_fill_super_common);
+
+static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
+{
+ struct fuse_fs_context *ctx = fsc->fs_private;
+ struct file *file;
+ int err;
+ struct fuse_req *init_req;
+ struct fuse_conn *fc;
+
+ err = -EINVAL;
+ file = fget(ctx->fd);
+ if (!file)
+ goto err;
+
+ /*
+ * Require mount to happen from the same user namespace which
+ * opened /dev/fuse to prevent potential attacks.
+ */
+ if ((file->f_op != &fuse_dev_operations) ||
+ (file->f_cred->user_ns != sb->s_user_ns))
+ goto err_fput;
+
+ init_req = fuse_request_alloc(0);
+ if (!init_req)
+ goto err_fput;
+ __set_bit(FR_BACKGROUND, &init_req->flags);
+
+ ctx->fudptr = &file->private_data;
+
+ fc = kmalloc(sizeof(*fc), GFP_KERNEL);
+ err = -ENOMEM;
+ if (!fc)
+ goto err_free_init_req;
+
+ fuse_conn_init(fc, sb->s_user_ns);
+ fc->release = fuse_free_conn;
+ sb->s_fs_info = fc;
+
+ err = fuse_fill_super_common(sb, ctx);
+ if (err)
+ goto err_put_conn;
+ /*
+ * atomic_dec_and_test() in fput() provides the necessary
+ * memory barrier for file->private_data to be visible on all
+ * CPUs after this
+ */
+ fput(file);
+ fuse_send_init(get_fuse_conn_super(sb), init_req);
+ return 0;
+
err_put_conn:
fuse_conn_put(fc);
sb->s_fs_info = NULL;
+ err_free_init_req:
+ fuse_request_free(init_req);
err_fput:
fput(file);
err:
--
2.21.0
From: Vivek Goyal <[email protected]>
File systems like virtio-fs should not have to manipulate the forget
list data structures directly. Provide a helper function and use that instead.
Rename dequeue_forget() to fuse_dequeue_forget() and export it so that
stacked filesystems can use it.
Signed-off-by: Vivek Goyal <[email protected]>
Signed-off-by: Miklos Szeredi <[email protected]>
---
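(Note after the ---, not part of the commit: callers must hold
fiq->waitq.lock, e.g. the batched dequeue in dev.c:)

	unsigned int count;
	struct fuse_forget_link *head;

	head = fuse_dequeue_forget(fiq, max_forgets, &count);
	spin_unlock(&fiq->waitq.lock);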
fs/fuse/dev.c | 13 +++++++------
fs/fuse/fuse_i.h | 4 ++++
2 files changed, 11 insertions(+), 6 deletions(-)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 167f476fbe16..c0c30a225e78 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1185,9 +1185,9 @@ __releases(fiq->waitq.lock)
return err ? err : reqsize;
}
-static struct fuse_forget_link *dequeue_forget(struct fuse_iqueue *fiq,
- unsigned max,
- unsigned *countp)
+struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq,
+ unsigned int max,
+ unsigned int *countp)
{
struct fuse_forget_link *head = fiq->forget_list_head.next;
struct fuse_forget_link **newhead = &head;
@@ -1206,6 +1206,7 @@ static struct fuse_forget_link *dequeue_forget(struct fuse_iqueue *fiq,
return head;
}
+EXPORT_SYMBOL(fuse_dequeue_forget);
static int fuse_read_single_forget(struct fuse_iqueue *fiq,
struct fuse_copy_state *cs,
@@ -1213,7 +1214,7 @@ static int fuse_read_single_forget(struct fuse_iqueue *fiq,
__releases(fiq->waitq.lock)
{
int err;
- struct fuse_forget_link *forget = dequeue_forget(fiq, 1, NULL);
+ struct fuse_forget_link *forget = fuse_dequeue_forget(fiq, 1, NULL);
struct fuse_forget_in arg = {
.nlookup = forget->forget_one.nlookup,
};
@@ -1261,7 +1262,7 @@ __releases(fiq->waitq.lock)
}
max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one);
- head = dequeue_forget(fiq, max_forgets, &count);
+ head = fuse_dequeue_forget(fiq, max_forgets, &count);
spin_unlock(&fiq->waitq.lock);
arg.count = count;
@@ -2249,7 +2250,7 @@ void fuse_abort_conn(struct fuse_conn *fc)
clear_bit(FR_PENDING, &req->flags);
list_splice_tail_init(&fiq->pending, &to_end);
while (forget_pending(fiq))
- kfree(dequeue_forget(fiq, 1, NULL));
+ kfree(fuse_dequeue_forget(fiq, 1, NULL));
wake_up_all_locked(&fiq->waitq);
spin_unlock(&fiq->waitq.lock);
kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 7e19c936ece8..6533be37873f 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -820,6 +820,10 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
struct fuse_forget_link *fuse_alloc_forget(void);
+struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq,
+ unsigned int max,
+ unsigned int *countp);
+
/* Used by READDIRPLUS */
void fuse_force_forget(struct file *file, u64 nodeid);
--
2.21.0
On Tue, Sep 03, 2019 at 01:42:02PM +0200, Miklos Szeredi wrote:
> From: Stefan Hajnoczi <[email protected]>
>
> Add a basic file system module for virtio-fs. This does not yet contain
> shared data support between host and guest or metadata coherency speedups.
> However it is already significantly faster than virtio-9p.
>
> Design Overview
> ===============
>
> With the goal of designing something with better performance and local file
> system semantics, a bunch of ideas were proposed.
>
> - Use fuse protocol (instead of 9p) for communication between guest and
> host. Guest kernel will be fuse client and a fuse server will run on
> host to serve the requests.
>
> - For data access inside guest, mmap portion of file in QEMU address space
> and guest accesses this memory using dax. That way guest page cache is
> bypassed and there is only one copy of data (on host). This will also
> enable mmap(MAP_SHARED) between guests.
>
> - For metadata coherency, there is a shared memory region which contains
> version number associated with metadata and any guest changing metadata
> updates version number and other guests refresh metadata on next access.
> This is yet to be implemented.
>
> How virtio-fs differs from existing approaches
> ==============================================
>
> The unique idea behind virtio-fs is to take advantage of the co-location of
> the virtual machine and hypervisor to avoid communication (vmexits).
>
> DAX allows file contents to be accessed without communication with the
> hypervisor. The shared memory region for metadata avoids communication in
> the common case where metadata is unchanged.
>
> By replacing expensive communication with cheaper shared memory accesses,
> we expect to achieve better performance than approaches based on network
> file system protocols. In addition, this also makes it easier to achieve
> local file system semantics (coherency).
>
> These techniques are not applicable to network file system protocols since
> the communications channel is bypassed by taking advantage of shared memory
> on a local machine. This is why we decided to build virtio-fs rather than
> focus on 9P or NFS.
>
> Caching Modes
> =============
>
> Like virtio-9p, different caching modes are supported which determine the
> coherency level as well. The “cache=FOO” and “writeback” options control
> the level of coherence between the guest and host filesystems.
>
> - cache=none
> metadata, data and pathname lookup are not cached in guest. They are
> always fetched from host and any changes are immediately pushed to host.
>
> - cache=always
> metadata, data and pathname lookup are cached in guest and never expire.
>
> - cache=auto
> metadata and pathname lookup cache expires after a configured amount of
> time (default is 1 second). Data is cached while the file is open
> (close to open consistency).
>
> - writeback/no_writeback
> These options control the writeback strategy. If writeback is disabled,
> then normal writes will immediately be synchronized with the host fs.
> If writeback is enabled, then writes may be cached in the guest until
> the file is closed or an fsync(2) performed. This option has no effect
> on mmap-ed writes or writes going through the DAX mechanism.
>
> Signed-off-by: Stefan Hajnoczi <[email protected]>
> Signed-off-by: Vivek Goyal <[email protected]>
> Signed-off-by: Miklos Szeredi <[email protected]>
What's with all of the TODOs? Some of these are really scary;
it looks like they need to be figured out before this is merged.
Endianness of the fuse header also looks wrong.
> ---
> fs/fuse/Kconfig | 11 +
> fs/fuse/Makefile | 1 +
> fs/fuse/fuse_i.h | 5 +
> fs/fuse/virtio_fs.c | 1072 +++++++++++++++++++++++++++++++
> include/uapi/linux/virtio_fs.h | 19 +
> include/uapi/linux/virtio_ids.h | 1 +
> 6 files changed, 1109 insertions(+)
> create mode 100644 fs/fuse/virtio_fs.c
> create mode 100644 include/uapi/linux/virtio_fs.h
>
> diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
> index 24fc5a5c1b97..0635cba19971 100644
> --- a/fs/fuse/Kconfig
> +++ b/fs/fuse/Kconfig
> @@ -27,3 +27,14 @@ config CUSE
>
> If you want to develop or use a userspace character device
> based on CUSE, answer Y or M.
> +
> +config VIRTIO_FS
> + tristate "Virtio Filesystem"
> + depends on FUSE_FS
> + select VIRTIO
> + help
> + The Virtio Filesystem allows guests to mount file systems from the
> + host.
> +
> + If you want to share files between guests or with the host, answer Y
> + or M.
> diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
> index 9485019c2a14..6419a2b3510d 100644
> --- a/fs/fuse/Makefile
> +++ b/fs/fuse/Makefile
> @@ -5,5 +5,6 @@
>
> obj-$(CONFIG_FUSE_FS) += fuse.o
> obj-$(CONFIG_CUSE) += cuse.o
> +obj-$(CONFIG_VIRTIO_FS) += virtio_fs.o
>
> fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index dbf73e5d5b38..85e2dcad68c1 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -444,6 +444,11 @@ struct fuse_req {
>
> /** Request is stolen from fuse_file->reserved_req */
> struct file *stolen_file;
> +
> +#if IS_ENABLED(CONFIG_VIRTIO_FS)
> + /** virtio-fs's physically contiguous buffer for in and out args */
> + void *argbuf;
> +#endif
> };
>
> struct fuse_iqueue;
> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
> new file mode 100644
> index 000000000000..197e79e536f9
> --- /dev/null
> +++ b/fs/fuse/virtio_fs.c
> @@ -0,0 +1,1072 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * virtio-fs: Virtio Filesystem
> + * Copyright (C) 2018 Red Hat, Inc.
> + */
> +
> +#include <linux/fs.h>
> +#include <linux/module.h>
> +#include <linux/virtio.h>
> +#include <linux/virtio_fs.h>
> +#include <linux/delay.h>
> +#include <linux/fs_context.h>
> +#include <linux/highmem.h>
> +#include "fuse_i.h"
> +
> +/* List of virtio-fs device instances and a lock for the list */
> +static DEFINE_MUTEX(virtio_fs_mutex);
> +static LIST_HEAD(virtio_fs_instances);
> +
> +enum {
> + VQ_HIPRIO,
> + VQ_REQUEST
> +};
> +
> +/* Per-virtqueue state */
> +struct virtio_fs_vq {
> + spinlock_t lock;
> + struct virtqueue *vq; /* protected by ->lock */
> + struct work_struct done_work;
> + struct list_head queued_reqs;
> + struct delayed_work dispatch_work;
> + struct fuse_dev *fud;
> + bool connected;
> + long in_flight;
> + char name[24];
I'd keep names somewhere separate as they are not used on data path.
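Untested sketch of what I mean - a parallel array in struct virtio_fs
that is only touched at setup time and in debug prints:

	struct virtio_fs {
		struct list_head list;		/* on virtio_fs_instances */
		char *tag;
		struct virtio_fs_vq *vqs;
		char (*vq_names)[24];		/* parallel to vqs[], cold */
		unsigned int nvqs;
		unsigned int num_queues;
	};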
> +} ____cacheline_aligned_in_smp;
> +
> +/* A virtio-fs device instance */
> +struct virtio_fs {
> + struct list_head list; /* on virtio_fs_instances */
> + char *tag;
> + struct virtio_fs_vq *vqs;
> + unsigned int nvqs; /* number of virtqueues */
> + unsigned int num_queues; /* number of request queues */
> +};
> +
> +struct virtio_fs_forget {
> + struct fuse_in_header ih;
> + struct fuse_forget_in arg;
These structures are all native-endian.
Passing them to the host will make cross-endian setups painful to support,
and hardware implementations impossible.
How about converting everything to LE?
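Sketch only (a real conversion would cover every fuse_in_header field;
these struct names are made up):

	struct virtio_fs_forget_req {		/* device-visible, fixed LE */
		__le32 len;
		__le32 opcode;
		__le64 unique;
		__le64 nodeid;
		__le64 nlookup;
	};

	struct virtio_fs_forget {		/* driver-private wrapper */
		struct list_head list;
		struct virtio_fs_forget_req req;
	};

Then sg_init_one(&sg, &forget->req, sizeof(forget->req)) and the
list_head never reaches the device.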
> + /* This request can be temporarily queued on virt queue */
> + struct list_head list;
> +};
> +
> +static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq)
> +{
> + struct virtio_fs *fs = vq->vdev->priv;
> +
> + return &fs->vqs[vq->index];
> +}
> +
> +static inline struct fuse_pqueue *vq_to_fpq(struct virtqueue *vq)
> +{
> + return &vq_to_fsvq(vq)->fud->pq;
> +}
> +
> +/* Add a new instance to the list or return -EEXIST if tag name exists*/
> +static int virtio_fs_add_instance(struct virtio_fs *fs)
> +{
> + struct virtio_fs *fs2;
> + bool duplicate = false;
> +
> + mutex_lock(&virtio_fs_mutex);
> +
> + list_for_each_entry(fs2, &virtio_fs_instances, list) {
> + if (strcmp(fs->tag, fs2->tag) == 0)
> + duplicate = true;
> + }
> +
> + if (!duplicate)
> + list_add_tail(&fs->list, &virtio_fs_instances);
This is O(N^2) as it's presumably called for each instance.
Doesn't scale - please switch to a tree or such.
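E.g. (untested) an rbtree keyed on the tag, still under
virtio_fs_mutex, assuming a struct rb_node member is added to struct
virtio_fs:

	static struct rb_root virtio_fs_instances = RB_ROOT;

	static struct virtio_fs *virtio_fs_find_instance(const char *tag)
	{
		struct rb_node *n = virtio_fs_instances.rb_node;

		while (n) {
			struct virtio_fs *fs =
				rb_entry(n, struct virtio_fs, node);
			int cmp = strcmp(tag, fs->tag);

			if (cmp < 0)
				n = n->rb_left;
			else if (cmp > 0)
				n = n->rb_right;
			else
				return fs;
		}
		return NULL;
	}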
> +
> + mutex_unlock(&virtio_fs_mutex);
> +
> + if (duplicate)
> + return -EEXIST;
> + return 0;
> +}
> +
> +/* Return the virtio_fs with a given tag, or NULL */
> +static struct virtio_fs *virtio_fs_find_instance(const char *tag)
> +{
> + struct virtio_fs *fs;
> +
> + mutex_lock(&virtio_fs_mutex);
> +
> + list_for_each_entry(fs, &virtio_fs_instances, list) {
> + if (strcmp(fs->tag, tag) == 0)
> + goto found;
> + }
> +
> + fs = NULL; /* not found */
> +
> +found:
> + mutex_unlock(&virtio_fs_mutex);
> +
> + return fs;
> +}
> +
> +static void virtio_fs_free_devs(struct virtio_fs *fs)
> +{
> + unsigned int i;
> +
> + /* TODO lock */
Doesn't inspire confidence, does it?
> +
> + for (i = 0; i < fs->nvqs; i++) {
> + struct virtio_fs_vq *fsvq = &fs->vqs[i];
> +
> + if (!fsvq->fud)
> + continue;
> +
> + flush_work(&fsvq->done_work);
> + flush_delayed_work(&fsvq->dispatch_work);
> +
> + /* TODO need to quiesce/end_requests/decrement dev_count */
Indeed. Won't this crash if we don't?
> + fuse_dev_free(fsvq->fud);
> + fsvq->fud = NULL;
> + }
> +}
> +
> +/* Read filesystem name from virtio config into fs->tag (must kfree()). */
> +static int virtio_fs_read_tag(struct virtio_device *vdev, struct virtio_fs *fs)
> +{
> + char tag_buf[sizeof_field(struct virtio_fs_config, tag)];
> + char *end;
> + size_t len;
> +
> + virtio_cread_bytes(vdev, offsetof(struct virtio_fs_config, tag),
> + &tag_buf, sizeof(tag_buf));
> + end = memchr(tag_buf, '\0', sizeof(tag_buf));
> + if (end == tag_buf)
> + return -EINVAL; /* empty tag */
> + if (!end)
> + end = &tag_buf[sizeof(tag_buf)];
> +
> + len = end - tag_buf;
> + fs->tag = devm_kmalloc(&vdev->dev, len + 1, GFP_KERNEL);
> + if (!fs->tag)
> + return -ENOMEM;
> + memcpy(fs->tag, tag_buf, len);
> + fs->tag[len] = '\0';
> + return 0;
> +}
> +
> +/* Work function for hiprio completion */
> +static void virtio_fs_hiprio_done_work(struct work_struct *work)
> +{
> + struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
> + done_work);
> + struct virtqueue *vq = fsvq->vq;
> +
> + /* Free completed FUSE_FORGET requests */
> + spin_lock(&fsvq->lock);
> + do {
> + unsigned int len;
> + void *req;
> +
> + virtqueue_disable_cb(vq);
> +
> + while ((req = virtqueue_get_buf(vq, &len)) != NULL) {
> + kfree(req);
> + fsvq->in_flight--;
> + }
> + } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq)));
> + spin_unlock(&fsvq->lock);
> +}
> +
> +static void virtio_fs_dummy_dispatch_work(struct work_struct *work)
> +{
> +}
> +
> +static void virtio_fs_hiprio_dispatch_work(struct work_struct *work)
> +{
> + struct virtio_fs_forget *forget;
> + struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
> + dispatch_work.work);
> + struct virtqueue *vq = fsvq->vq;
> + struct scatterlist sg;
> + struct scatterlist *sgs[] = {&sg};
> + bool notify;
> + int ret;
> +
> + pr_debug("virtio-fs: worker %s called.\n", __func__);
> + while (1) {
> + spin_lock(&fsvq->lock);
> + forget = list_first_entry_or_null(&fsvq->queued_reqs,
> + struct virtio_fs_forget, list);
> + if (!forget) {
> + spin_unlock(&fsvq->lock);
> + return;
> + }
> +
> + list_del(&forget->list);
> + if (!fsvq->connected) {
> + spin_unlock(&fsvq->lock);
> + kfree(forget);
> + continue;
> + }
> +
> + sg_init_one(&sg, forget, sizeof(*forget));
This passes the host a structure that includes "struct list_head list";
not a good idea.
> +
> + /* Enqueue the request */
> + dev_dbg(&vq->vdev->dev, "%s\n", __func__);
> + ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC);
This would be simpler as virtqueue_add_outbuf().
Also - why GFP_ATOMIC?
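I.e. (same semantics):

	ret = virtqueue_add_outbuf(vq, &sg, 1, forget, GFP_ATOMIC);

GFP_ATOMIC is presumably because fsvq->lock is held across the call; if
so, a comment saying that would help.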
> + if (ret < 0) {
> + if (ret == -ENOMEM || ret == -ENOSPC) {
> + pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later\n",
> + ret);
> + list_add_tail(&forget->list,
> + &fsvq->queued_reqs);
> + schedule_delayed_work(&fsvq->dispatch_work,
> + msecs_to_jiffies(1));
Can't we requeue after some buffers get consumed?
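E.g. (untested) kick the dispatch worker from the completion side
instead of polling on a timer:

	/* at the end of virtio_fs_hiprio_done_work(), under fsvq->lock */
	if (!list_empty(&fsvq->queued_reqs))
		schedule_delayed_work(&fsvq->dispatch_work, 0);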
> + } else {
> + pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n",
> + ret);
> + kfree(forget);
> + }
> + spin_unlock(&fsvq->lock);
> + return;
> + }
> +
> + fsvq->in_flight++;
> + notify = virtqueue_kick_prepare(vq);
> + spin_unlock(&fsvq->lock);
> +
> + if (notify)
> + virtqueue_notify(vq);
> + pr_debug("virtio-fs: worker %s dispatched one forget request.\n",
> + __func__);
> + }
> +}
> +
> +/* Allocate and copy args into req->argbuf */
> +static int copy_args_to_argbuf(struct fuse_req *req)
> +{
> + unsigned int offset = 0;
> + unsigned int num_in;
> + unsigned int num_out;
> + unsigned int len;
> + unsigned int i;
> +
> + num_in = req->in.numargs - req->in.argpages;
> + num_out = req->out.numargs - req->out.argpages;
> + len = fuse_len_args(num_in, (struct fuse_arg *)req->in.args) +
> + fuse_len_args(num_out, req->out.args);
> +
> + req->argbuf = kmalloc(len, GFP_ATOMIC);
> + if (!req->argbuf)
> + return -ENOMEM;
> +
> + for (i = 0; i < num_in; i++) {
> + memcpy(req->argbuf + offset,
> + req->in.args[i].value,
> + req->in.args[i].size);
> + offset += req->in.args[i].size;
> + }
> +
> + return 0;
> +}
> +
> +/* Copy args out of and free req->argbuf */
> +static void copy_args_from_argbuf(struct fuse_req *req)
> +{
> + unsigned int remaining;
> + unsigned int offset;
> + unsigned int num_in;
> + unsigned int num_out;
> + unsigned int i;
> +
> + remaining = req->out.h.len - sizeof(req->out.h);
> + num_in = req->in.numargs - req->in.argpages;
> + num_out = req->out.numargs - req->out.argpages;
> + offset = fuse_len_args(num_in, (struct fuse_arg *)req->in.args);
> +
> + for (i = 0; i < num_out; i++) {
> + unsigned int argsize = req->out.args[i].size;
> +
> + if (req->out.argvar &&
> + i == req->out.numargs - 1 &&
> + argsize > remaining) {
> + argsize = remaining;
> + }
> +
> + memcpy(req->out.args[i].value, req->argbuf + offset, argsize);
> + offset += argsize;
> +
> + if (i != req->out.numargs - 1)
> + remaining -= argsize;
> + }
> +
> + /* Store the actual size of the variable-length arg */
> + if (req->out.argvar)
> + req->out.args[req->out.numargs - 1].size = remaining;
> +
> + kfree(req->argbuf);
> + req->argbuf = NULL;
> +}
> +
> +/* Work function for request completion */
> +static void virtio_fs_requests_done_work(struct work_struct *work)
> +{
> + struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
> + done_work);
> + struct fuse_pqueue *fpq = &fsvq->fud->pq;
> + struct fuse_conn *fc = fsvq->fud->fc;
> + struct virtqueue *vq = fsvq->vq;
> + struct fuse_req *req;
> + struct fuse_req *next;
> + unsigned int len, i, thislen;
> + struct page *page;
> + LIST_HEAD(reqs);
> +
> + /* Collect completed requests off the virtqueue */
> + spin_lock(&fsvq->lock);
> + do {
> + virtqueue_disable_cb(vq);
> +
> + while ((req = virtqueue_get_buf(vq, &len)) != NULL) {
> + spin_lock(&fpq->lock);
> + list_move_tail(&req->list, &reqs);
> + spin_unlock(&fpq->lock);
> + }
> + } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq)));
> + spin_unlock(&fsvq->lock);
> +
> + /* End requests */
> + list_for_each_entry_safe(req, next, &reqs, list) {
> + /* TODO check unique */
> + /* TODO fuse_len_args(out) against oh.len */
> +
> + copy_args_from_argbuf(req);
> +
> + if (req->out.argpages && req->out.page_zeroing) {
> + len = req->out.args[req->out.numargs - 1].size;
> + for (i = 0; i < req->num_pages; i++) {
> + thislen = req->page_descs[i].length;
> + if (len < thislen) {
> + WARN_ON(req->page_descs[i].offset);
> + page = req->pages[i];
> + zero_user_segment(page, len, thislen);
> + len = 0;
> + } else {
> + len -= thislen;
> + }
> + }
> + }
> +
> + spin_lock(&fpq->lock);
> + clear_bit(FR_SENT, &req->flags);
> + list_del_init(&req->list);
> + spin_unlock(&fpq->lock);
> +
> + fuse_request_end(fc, req);
> + }
> +}
> +
> +/* Virtqueue interrupt handler */
> +static void virtio_fs_vq_done(struct virtqueue *vq)
> +{
> + struct virtio_fs_vq *fsvq = vq_to_fsvq(vq);
> +
> + dev_dbg(&vq->vdev->dev, "%s %s\n", __func__, fsvq->name);
> +
> + schedule_work(&fsvq->done_work);
> +}
> +
> +/* Initialize virtqueues */
> +static int virtio_fs_setup_vqs(struct virtio_device *vdev,
> + struct virtio_fs *fs)
> +{
> + struct virtqueue **vqs;
> + vq_callback_t **callbacks;
> + const char **names;
> + unsigned int i;
> + int ret;
> +
> + virtio_cread(vdev, struct virtio_fs_config, num_queues,
> + &fs->num_queues);
> + if (fs->num_queues == 0)
> + return -EINVAL;
> +
> + fs->nvqs = 1 + fs->num_queues;
> +
> + fs->vqs = devm_kcalloc(&vdev->dev, fs->nvqs,
> + sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL);
> + if (!fs->vqs)
> + return -ENOMEM;
> +
> + vqs = kmalloc_array(fs->nvqs, sizeof(vqs[VQ_HIPRIO]), GFP_KERNEL);
> + callbacks = kmalloc_array(fs->nvqs, sizeof(callbacks[VQ_HIPRIO]),
> + GFP_KERNEL);
> + names = kmalloc_array(fs->nvqs, sizeof(names[VQ_HIPRIO]), GFP_KERNEL);
> + if (!vqs || !callbacks || !names) {
> + ret = -ENOMEM;
> + goto out;
> + }
> +
> + callbacks[VQ_HIPRIO] = virtio_fs_vq_done;
> + snprintf(fs->vqs[VQ_HIPRIO].name, sizeof(fs->vqs[VQ_HIPRIO].name),
> + "hiprio");
> + names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name;
> + INIT_WORK(&fs->vqs[VQ_HIPRIO].done_work, virtio_fs_hiprio_done_work);
> + INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].queued_reqs);
> + INIT_DELAYED_WORK(&fs->vqs[VQ_HIPRIO].dispatch_work,
> + virtio_fs_hiprio_dispatch_work);
> + spin_lock_init(&fs->vqs[VQ_HIPRIO].lock);
> +
> + /* Initialize the requests virtqueues */
> + for (i = VQ_REQUEST; i < fs->nvqs; i++) {
> + spin_lock_init(&fs->vqs[i].lock);
> + INIT_WORK(&fs->vqs[i].done_work, virtio_fs_requests_done_work);
> + INIT_DELAYED_WORK(&fs->vqs[i].dispatch_work,
> + virtio_fs_dummy_dispatch_work);
> + INIT_LIST_HEAD(&fs->vqs[i].queued_reqs);
> + snprintf(fs->vqs[i].name, sizeof(fs->vqs[i].name),
> + "requests.%u", i - VQ_REQUEST);
> + callbacks[i] = virtio_fs_vq_done;
> + names[i] = fs->vqs[i].name;
> + }
> +
> + ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, names, NULL);
> + if (ret < 0)
> + goto out;
> +
> + for (i = 0; i < fs->nvqs; i++) {
> + fs->vqs[i].vq = vqs[i];
> + fs->vqs[i].connected = true;
> + }
> +out:
> + kfree(names);
> + kfree(callbacks);
> + kfree(vqs);
> + return ret;
> +}
> +
> +/* Free virtqueues (device must already be reset) */
> +static void virtio_fs_cleanup_vqs(struct virtio_device *vdev,
> + struct virtio_fs *fs)
> +{
> + vdev->config->del_vqs(vdev);
> +}
> +
> +static int virtio_fs_probe(struct virtio_device *vdev)
> +{
> + struct virtio_fs *fs;
> + int ret;
> +
> + fs = devm_kzalloc(&vdev->dev, sizeof(*fs), GFP_KERNEL);
> + if (!fs)
> + return -ENOMEM;
> + vdev->priv = fs;
> +
> + ret = virtio_fs_read_tag(vdev, fs);
> + if (ret < 0)
> + goto out;
> +
> + ret = virtio_fs_setup_vqs(vdev, fs);
> + if (ret < 0)
> + goto out;
> +
> + /* TODO vq affinity */
> + /* TODO populate notifications vq */
what's notifications vq?
> +
> + /* Bring the device online in case the filesystem is mounted and
> + * requests need to be sent before we return.
> + */
> + virtio_device_ready(vdev);
> +
> + ret = virtio_fs_add_instance(fs);
> + if (ret < 0)
> + goto out_vqs;
> +
> + return 0;
> +
> +out_vqs:
> + vdev->config->reset(vdev);
> + virtio_fs_cleanup_vqs(vdev, fs);
> +
> +out:
> + vdev->priv = NULL;
> + return ret;
> +}
> +
> +static void virtio_fs_remove(struct virtio_device *vdev)
> +{
> + struct virtio_fs *fs = vdev->priv;
> +
> + virtio_fs_free_devs(fs);
> +
> + vdev->config->reset(vdev);
> + virtio_fs_cleanup_vqs(vdev, fs);
> +
> + mutex_lock(&virtio_fs_mutex);
> + list_del(&fs->list);
> + mutex_unlock(&virtio_fs_mutex);
> +
> + vdev->priv = NULL;
> +}
> +
> +#ifdef CONFIG_PM_SLEEP
> +static int virtio_fs_freeze(struct virtio_device *vdev)
> +{
> + return 0; /* TODO */
> +}
> +
> +static int virtio_fs_restore(struct virtio_device *vdev)
> +{
> + return 0; /* TODO */
> +}
Is this really a good idea? I'd rather it was implemented,
but if not possible at all disabling PM seems better than just
keep going.
> +#endif /* CONFIG_PM_SLEEP */
> +
> +const static struct virtio_device_id id_table[] = {
> + { VIRTIO_ID_FS, VIRTIO_DEV_ANY_ID },
> + {},
> +};
> +
> +const static unsigned int feature_table[] = {};
> +
> +static struct virtio_driver virtio_fs_driver = {
> + .driver.name = KBUILD_MODNAME,
> + .driver.owner = THIS_MODULE,
> + .id_table = id_table,
> + .feature_table = feature_table,
> + .feature_table_size = ARRAY_SIZE(feature_table),
> + /* TODO validate config_get != NULL */
Why?
> + .probe = virtio_fs_probe,
> + .remove = virtio_fs_remove,
> +#ifdef CONFIG_PM_SLEEP
> + .freeze = virtio_fs_freeze,
> + .restore = virtio_fs_restore,
> +#endif
> +};
> +
> +static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq)
> +__releases(fiq->waitq.lock)
> +{
> + struct fuse_forget_link *link;
> + struct virtio_fs_forget *forget;
> + struct scatterlist sg;
> + struct scatterlist *sgs[] = {&sg};
> + struct virtio_fs *fs;
> + struct virtqueue *vq;
> + struct virtio_fs_vq *fsvq;
> + bool notify;
> + u64 unique;
> + int ret;
> +
> + link = fuse_dequeue_forget(fiq, 1, NULL);
> + unique = fuse_get_unique(fiq);
> +
> + fs = fiq->priv;
> + fsvq = &fs->vqs[VQ_HIPRIO];
> + spin_unlock(&fiq->waitq.lock);
> +
> + /* Allocate a buffer for the request */
> + forget = kmalloc(sizeof(*forget), GFP_NOFS | __GFP_NOFAIL);
> +
> + forget->ih = (struct fuse_in_header){
> + .opcode = FUSE_FORGET,
> + .nodeid = link->forget_one.nodeid,
> + .unique = unique,
> + .len = sizeof(*forget),
> + };
> + forget->arg = (struct fuse_forget_in){
> + .nlookup = link->forget_one.nlookup,
> + };
> +
> + sg_init_one(&sg, forget, sizeof(*forget));
> +
> + /* Enqueue the request */
> + vq = fsvq->vq;
> + dev_dbg(&vq->vdev->dev, "%s\n", __func__);
> + spin_lock(&fsvq->lock);
> +
> + ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC);
> + if (ret < 0) {
> + if (ret == -ENOMEM || ret == -ENOSPC) {
> + pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later.\n",
> + ret);
> + list_add_tail(&forget->list, &fsvq->queued_reqs);
> + schedule_delayed_work(&fsvq->dispatch_work,
> + msecs_to_jiffies(1));
> + } else {
> + pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n",
> + ret);
> + kfree(forget);
> + }
> + spin_unlock(&fsvq->lock);
> + goto out;
> + }
code duplicated from virtio_fs_hiprio_dispatch_work
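A small shared helper would avoid the duplication; just a sketch (the
helper name is invented, not a concrete proposal):

static int virtio_fs_enqueue_forget(struct virtio_fs_vq *fsvq,
				    struct virtio_fs_forget *forget)
{
	struct scatterlist sg;
	struct scatterlist *sgs[] = {&sg};
	bool notify;
	int ret;

	sg_init_one(&sg, forget, sizeof(*forget));

	spin_lock(&fsvq->lock);
	ret = virtqueue_add_sgs(fsvq->vq, sgs, 1, 0, forget, GFP_ATOMIC);
	if (ret < 0) {
		/* Let the caller decide whether to requeue or drop */
		spin_unlock(&fsvq->lock);
		return ret;
	}

	fsvq->in_flight++;
	notify = virtqueue_kick_prepare(fsvq->vq);
	spin_unlock(&fsvq->lock);

	if (notify)
		virtqueue_notify(fsvq->vq);
	return 0;
}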
> +
> + fsvq->in_flight++;
> + notify = virtqueue_kick_prepare(vq);
> +
> + spin_unlock(&fsvq->lock);
> +
> + if (notify)
> + virtqueue_notify(vq);
> +out:
> + kfree(link);
> +}
> +
> +static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq)
> +__releases(fiq->waitq.lock)
> +{
> + /* TODO */
what's needed?
> + spin_unlock(&fiq->waitq.lock);
> +}
> +
> +/* Return the number of scatter-gather list elements required */
> +static unsigned int sg_count_fuse_req(struct fuse_req *req)
> +{
> + unsigned int total_sgs = 1 /* fuse_in_header */;
> +
> + if (req->in.numargs - req->in.argpages)
> + total_sgs += 1;
> +
> + if (req->in.argpages)
> + total_sgs += req->num_pages;
> +
> + if (!test_bit(FR_ISREPLY, &req->flags))
> + return total_sgs;
> +
> + total_sgs += 1 /* fuse_out_header */;
> +
> + if (req->out.numargs - req->out.argpages)
> + total_sgs += 1;
> +
> + if (req->out.argpages)
> + total_sgs += req->num_pages;
> +
> + return total_sgs;
> +}
> +
> +/* Add pages to scatter-gather list and return number of elements used */
> +static unsigned int sg_init_fuse_pages(struct scatterlist *sg,
> + struct page **pages,
> + struct fuse_page_desc *page_descs,
> + unsigned int num_pages,
> + unsigned int total_len)
> +{
> + unsigned int i;
> + unsigned int this_len;
> +
> + for (i = 0; i < num_pages && total_len; i++) {
> + sg_init_table(&sg[i], 1);
> + this_len = min(page_descs[i].length, total_len);
> + sg_set_page(&sg[i], pages[i], this_len, page_descs[i].offset);
> + total_len -= this_len;
> + }
> +
> + return i;
> +}
> +
> +/* Add args to scatter-gather list and return number of elements used */
> +static unsigned int sg_init_fuse_args(struct scatterlist *sg,
> + struct fuse_req *req,
> + struct fuse_arg *args,
> + unsigned int numargs,
> + bool argpages,
> + void *argbuf,
> + unsigned int *len_used)
> +{
> + unsigned int total_sgs = 0;
> + unsigned int len;
> +
> + len = fuse_len_args(numargs - argpages, args);
> + if (len)
> + sg_init_one(&sg[total_sgs++], argbuf, len);
> +
> + if (argpages)
> + total_sgs += sg_init_fuse_pages(&sg[total_sgs],
> + req->pages,
> + req->page_descs,
> + req->num_pages,
> + args[numargs - 1].size);
> +
> + if (len_used)
> + *len_used = len;
> +
> + return total_sgs;
> +}
> +
> +/* Add a request to a virtqueue and kick the device */
> +static int virtio_fs_enqueue_req(struct virtqueue *vq, struct fuse_req *req)
> +{
> + /* requests need at least 4 elements */
> + struct scatterlist *stack_sgs[6];
> + struct scatterlist stack_sg[ARRAY_SIZE(stack_sgs)];
> + struct scatterlist **sgs = stack_sgs;
> + struct scatterlist *sg = stack_sg;
> + struct virtio_fs_vq *fsvq;
> + unsigned int argbuf_used = 0;
> + unsigned int out_sgs = 0;
> + unsigned int in_sgs = 0;
> + unsigned int total_sgs;
> + unsigned int i;
> + int ret;
> + bool notify;
> +
> + /* Does the sglist fit on the stack? */
> + total_sgs = sg_count_fuse_req(req);
> + if (total_sgs > ARRAY_SIZE(stack_sgs)) {
> + sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), GFP_ATOMIC);
> + sg = kmalloc_array(total_sgs, sizeof(sg[0]), GFP_ATOMIC);
> + if (!sgs || !sg) {
> + ret = -ENOMEM;
> + goto out;
> + }
> + }
> +
> + /* Use a bounce buffer since stack args cannot be mapped */
> + ret = copy_args_to_argbuf(req);
> + if (ret < 0)
> + goto out;
> +
> + /* Request elements */
> + sg_init_one(&sg[out_sgs++], &req->in.h, sizeof(req->in.h));
> + out_sgs += sg_init_fuse_args(&sg[out_sgs], req,
> + (struct fuse_arg *)req->in.args,
> + req->in.numargs, req->in.argpages,
> + req->argbuf, &argbuf_used);
> +
> + /* Reply elements */
> + if (test_bit(FR_ISREPLY, &req->flags)) {
> + sg_init_one(&sg[out_sgs + in_sgs++],
> + &req->out.h, sizeof(req->out.h));
> + in_sgs += sg_init_fuse_args(&sg[out_sgs + in_sgs], req,
> + req->out.args, req->out.numargs,
> + req->out.argpages,
> + req->argbuf + argbuf_used, NULL);
> + }
> +
> + WARN_ON(out_sgs + in_sgs != total_sgs);
> +
> + for (i = 0; i < total_sgs; i++)
> + sgs[i] = &sg[i];
> +
> + fsvq = vq_to_fsvq(vq);
> + spin_lock(&fsvq->lock);
> +
> + ret = virtqueue_add_sgs(vq, sgs, out_sgs, in_sgs, req, GFP_ATOMIC);
> + if (ret < 0) {
> + /* TODO handle full virtqueue */
Indeed. What prevents this?
> + spin_unlock(&fsvq->lock);
> + goto out;
> + }
> +
> + notify = virtqueue_kick_prepare(vq);
> +
> + spin_unlock(&fsvq->lock);
> +
> + if (notify)
> + virtqueue_notify(vq);
> +
> +out:
> + if (ret < 0 && req->argbuf) {
> + kfree(req->argbuf);
> + req->argbuf = NULL;
> + }
> + if (sgs != stack_sgs) {
> + kfree(sgs);
> + kfree(sg);
> + }
> +
> + return ret;
> +}
> +
> +static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq)
> +__releases(fiq->waitq.lock)
> +{
> + unsigned int queue_id = VQ_REQUEST; /* TODO multiqueue */
> + struct virtio_fs *fs;
> + struct fuse_conn *fc;
> + struct fuse_req *req;
> + struct fuse_pqueue *fpq;
> + int ret;
> +
> + WARN_ON(list_empty(&fiq->pending));
> + req = list_last_entry(&fiq->pending, struct fuse_req, list);
> + clear_bit(FR_PENDING, &req->flags);
> + list_del_init(&req->list);
> + WARN_ON(!list_empty(&fiq->pending));
> + spin_unlock(&fiq->waitq.lock);
> +
> + fs = fiq->priv;
> + fc = fs->vqs[queue_id].fud->fc;
> +
> + dev_dbg(&fs->vqs[queue_id].vq->vdev->dev,
> + "%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u\n",
> + __func__, req->in.h.opcode, req->in.h.unique, req->in.h.nodeid,
> + req->in.h.len, fuse_len_args(req->out.numargs, req->out.args));
> +
> + fpq = &fs->vqs[queue_id].fud->pq;
> + spin_lock(&fpq->lock);
> + if (!fpq->connected) {
> + spin_unlock(&fpq->lock);
> + req->out.h.error = -ENODEV;
> + pr_err("virtio-fs: %s disconnected\n", __func__);
> + fuse_request_end(fc, req);
> + return;
> + }
> + list_add_tail(&req->list, fpq->processing);
> + spin_unlock(&fpq->lock);
> + set_bit(FR_SENT, &req->flags);
> + /* matches barrier in request_wait_answer() */
> + smp_mb__after_atomic();
> + /* TODO check for FR_INTERRUPTED? */
?
> +
> +retry:
> + ret = virtio_fs_enqueue_req(fs->vqs[queue_id].vq, req);
> + if (ret < 0) {
> + if (ret == -ENOMEM || ret == -ENOSPC) {
> + /* Virtqueue full. Retry submission */
> + usleep_range(20, 30);
again, why not respond to completion?
> + goto retry;
> + }
> + req->out.h.error = ret;
> + pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n", ret);
> + fuse_request_end(fc, req);
> + return;
> + }
> +}
> +
> +static void virtio_fs_flush_hiprio_queue(struct virtio_fs_vq *fsvq)
> +{
> + struct virtio_fs_forget *forget;
> +
> + WARN_ON(fsvq->in_flight < 0);
> +
> + /* Go through pending forget requests and free them */
> + spin_lock(&fsvq->lock);
> + while (1) {
> + forget = list_first_entry_or_null(&fsvq->queued_reqs,
> + struct virtio_fs_forget, list);
> + if (!forget)
> + break;
> + list_del(&forget->list);
> + kfree(forget);
> + }
> +
> + spin_unlock(&fsvq->lock);
> +
> + /* Wait for in flight requests to finish.*/
> + while (1) {
> + spin_lock(&fsvq->lock);
> + if (!fsvq->in_flight) {
> + spin_unlock(&fsvq->lock);
> + break;
> + }
> + spin_unlock(&fsvq->lock);
> + usleep_range(1000, 2000);
Same thing here. Can we use e.g. a completion and not usleep?
> + }
> +}
> +
> +const static struct fuse_iqueue_ops virtio_fs_fiq_ops = {
> + .wake_forget_and_unlock = virtio_fs_wake_forget_and_unlock,
> + .wake_interrupt_and_unlock = virtio_fs_wake_interrupt_and_unlock,
> + .wake_pending_and_unlock = virtio_fs_wake_pending_and_unlock,
> +};
> +
> +static int virtio_fs_fill_super(struct super_block *sb)
> +{
> + struct fuse_conn *fc = get_fuse_conn_super(sb);
> + struct virtio_fs *fs = fc->iq.priv;
> + unsigned int i;
> + int err;
> + struct fuse_req *init_req;
> + struct fuse_fs_context ctx = {
> + .rootmode = S_IFDIR,
> + .default_permissions = 1,
> + .allow_other = 1,
> + .max_read = UINT_MAX,
> + .blksize = 512,
> + .destroy = true,
> + .no_control = true,
> + .no_force_umount = true,
> + };
> +
> + /* TODO lock */
what does this refer to?
> + if (fs->vqs[VQ_REQUEST].fud) {
> + pr_err("virtio-fs: device already in use\n");
> + err = -EBUSY;
> + goto err;
> + }
> +
> + err = -ENOMEM;
> + /* Allocate fuse_dev for hiprio and notification queues */
> + for (i = 0; i < VQ_REQUEST; i++) {
> + struct virtio_fs_vq *fsvq = &fs->vqs[i];
> +
> + fsvq->fud = fuse_dev_alloc();
> + if (!fsvq->fud)
> + goto err_free_fuse_devs;
> + }
> +
> + init_req = fuse_request_alloc(0);
> + if (!init_req)
> + goto err_free_fuse_devs;
> + __set_bit(FR_BACKGROUND, &init_req->flags);
> +
> + ctx.fudptr = (void **)&fs->vqs[VQ_REQUEST].fud;
> + err = fuse_fill_super_common(sb, &ctx);
> + if (err < 0)
> + goto err_free_init_req;
> +
> + fc = fs->vqs[VQ_REQUEST].fud->fc;
> +
> + /* TODO take fuse_mutex around this loop? */
Don't we need to figure this kind of thing out before this
code lands upstream?
> + for (i = 0; i < fs->nvqs; i++) {
> + struct virtio_fs_vq *fsvq = &fs->vqs[i];
> +
> + if (i == VQ_REQUEST)
> + continue; /* already initialized */
> + fuse_dev_install(fsvq->fud, fc);
> + atomic_inc(&fc->dev_count);
> + }
> +
> + fuse_send_init(fc, init_req);
> + return 0;
> +
> +err_free_init_req:
> + fuse_request_free(init_req);
> +err_free_fuse_devs:
> + for (i = 0; i < fs->nvqs; i++)
> + fuse_dev_free(fs->vqs[i].fud);
> +err:
> + return err;
> +}
> +
> +static void virtio_kill_sb(struct super_block *sb)
> +{
> + struct fuse_conn *fc = get_fuse_conn_super(sb);
> + struct virtio_fs *vfs;
> + struct virtio_fs_vq *fsvq;
> +
> + /* If mount failed, we can still be called without any fc */
> + if (!fc)
> + return fuse_kill_sb_anon(sb);
> +
> + vfs = fc->iq.priv;
> + fsvq = &vfs->vqs[VQ_HIPRIO];
> +
> + /* Stop forget queue. Soon destroy will be sent */
> + spin_lock(&fsvq->lock);
> + fsvq->connected = false;
> + spin_unlock(&fsvq->lock);
> + virtio_fs_flush_hiprio_queue(fsvq);
> +
> + fuse_kill_sb_anon(sb);
> + virtio_fs_free_devs(vfs);
> +}
> +
> +static int virtio_fs_test_super(struct super_block *sb,
> + struct fs_context *fsc)
> +{
> + struct fuse_conn *fc = fsc->s_fs_info;
> +
> + return fc->iq.priv == get_fuse_conn_super(sb)->iq.priv;
> +}
> +
> +static int virtio_fs_set_super(struct super_block *sb,
> + struct fs_context *fsc)
> +{
> + int err;
> +
> + err = get_anon_bdev(&sb->s_dev);
> + if (!err)
> + fuse_conn_get(fsc->s_fs_info);
> +
> + return err;
> +}
> +
> +static int virtio_fs_get_tree(struct fs_context *fsc)
> +{
> + struct virtio_fs *fs;
> + struct super_block *sb;
> + struct fuse_conn *fc;
> + int err;
> +
> + fs = virtio_fs_find_instance(fsc->source);
> + if (!fs) {
> + pr_info("virtio-fs: tag <%s> not found\n", fsc->source);
> + return -EINVAL;
> + }
> +
> + fc = kzalloc(sizeof(struct fuse_conn), GFP_KERNEL);
> + if (!fc)
> + return -ENOMEM;
> +
> + fuse_conn_init(fc, get_user_ns(current_user_ns()), &virtio_fs_fiq_ops,
> + fs);
> + fc->release = fuse_free_conn;
> + fc->delete_stale = true;
> +
> + fsc->s_fs_info = fc;
> + sb = sget_fc(fsc, virtio_fs_test_super, virtio_fs_set_super);
> + fuse_conn_put(fc);
> + if (IS_ERR(sb))
> + return PTR_ERR(sb);
> +
> + if (!sb->s_root) {
> + err = virtio_fs_fill_super(sb);
> + if (err) {
> + deactivate_locked_super(sb);
> + return err;
> + }
> +
> + sb->s_flags |= SB_ACTIVE;
> + }
> +
> + WARN_ON(fsc->root);
> + fsc->root = dget(sb->s_root);
> + return 0;
> +}
> +
> +static const struct fs_context_operations virtio_fs_context_ops = {
> + .get_tree = virtio_fs_get_tree,
> +};
> +
> +static int virtio_fs_init_fs_context(struct fs_context *fsc)
> +{
> + fsc->ops = &virtio_fs_context_ops;
> + return 0;
> +}
> +
> +static struct file_system_type virtio_fs_type = {
> + .owner = THIS_MODULE,
> + .name = "virtiofs",
> + .init_fs_context = virtio_fs_init_fs_context,
> + .kill_sb = virtio_kill_sb,
> +};
> +
> +static int __init virtio_fs_init(void)
> +{
> + int ret;
> +
> + ret = register_virtio_driver(&virtio_fs_driver);
> + if (ret < 0)
> + return ret;
> +
> + ret = register_filesystem(&virtio_fs_type);
> + if (ret < 0) {
> + unregister_virtio_driver(&virtio_fs_driver);
> + return ret;
> + }
> +
> + return 0;
> +}
> +module_init(virtio_fs_init);
> +
> +static void __exit virtio_fs_exit(void)
> +{
> + unregister_filesystem(&virtio_fs_type);
> + unregister_virtio_driver(&virtio_fs_driver);
> +}
> +module_exit(virtio_fs_exit);
> +
> +MODULE_AUTHOR("Stefan Hajnoczi <[email protected]>");
> +MODULE_DESCRIPTION("Virtio Filesystem");
> +MODULE_LICENSE("GPL");
> +MODULE_ALIAS_FS(KBUILD_MODNAME);
> +MODULE_DEVICE_TABLE(virtio, id_table);
> diff --git a/include/uapi/linux/virtio_fs.h b/include/uapi/linux/virtio_fs.h
> new file mode 100644
> index 000000000000..b5e99c217c86
> --- /dev/null
> +++ b/include/uapi/linux/virtio_fs.h
> @@ -0,0 +1,19 @@
> +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
> +
> +#ifndef _UAPI_LINUX_VIRTIO_FS_H
> +#define _UAPI_LINUX_VIRTIO_FS_H
> +
> +#include <linux/types.h>
> +#include <linux/virtio_ids.h>
> +#include <linux/virtio_config.h>
> +#include <linux/virtio_types.h>
> +
> +struct virtio_fs_config {
> + /* Filesystem name (UTF-8, not NUL-terminated, padded with NULs) */
> + __u8 tag[36];
> +
> + /* Number of request queues */
> + __u32 num_queues;
> +} __attribute__((packed));
> +
> +#endif /* _UAPI_LINUX_VIRTIO_FS_H */
> diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h
> index 348fd0176f75..585e07b27333 100644
> --- a/include/uapi/linux/virtio_ids.h
> +++ b/include/uapi/linux/virtio_ids.h
> @@ -44,6 +44,7 @@
> #define VIRTIO_ID_VSOCK 19 /* virtio vsock transport */
> #define VIRTIO_ID_CRYPTO 20 /* virtio crypto */
> #define VIRTIO_ID_IOMMU 23 /* virtio IOMMU */
> +#define VIRTIO_ID_FS 26 /* virtio filesystem */
> #define VIRTIO_ID_PMEM 27 /* virtio pmem */
>
> #endif /* _LINUX_VIRTIO_IDS_H */
> --
> 2.21.0
On Tue, Sep 03, 2019 at 09:55:49AM -0400, Michael S. Tsirkin wrote:
> On Tue, Sep 03, 2019 at 01:42:02PM +0200, Miklos Szeredi wrote:
> Endian-ness for fuse header also looks wrong.
[...]
> > +struct virtio_fs_forget {
> > + struct fuse_in_header ih;
> > + struct fuse_forget_in arg;
>
> These structures are all native endian.
>
> Passing them to host will make cross-endian setups painful to support,
> and hardware implementations impossible.
>
> How about converting everything to LE?
The driver dictates the endianness of the FUSE protocol session. The
virtio-fs device specification states that the device looks at the first
request's fuse_in_header::opcode field to detect the guest endianness.
If it sees FUSE_INIT in its native endianness then no byte-swapping is
necessary. If it sees FUSE_INIT in the opposite endianness then
byte-swapping is necessary on the device side.
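For illustration, the device-side check could look roughly like this
(host-side code, not part of this patch; the helper name is made up):

#include <stdbool.h>
#include <stdint.h>

#define FUSE_INIT 26

static uint32_t bswap32(uint32_t x)
{
	return ((x & 0x000000ffu) << 24) | ((x & 0x0000ff00u) << 8) |
	       ((x & 0x00ff0000u) >> 8) | ((x & 0xff000000u) >> 24);
}

/* Inspect the opcode of the first request of a FUSE session. */
static bool session_needs_byteswap(uint32_t first_opcode)
{
	if (first_opcode == FUSE_INIT)
		return false;	/* driver uses the device's endianness */
	if (bswap32(first_opcode) == FUSE_INIT)
		return true;	/* driver uses the opposite endianness */
	/* Neither form is FUSE_INIT: reject the session. */
	return false;
}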
On Wed, Sep 04, 2019 at 07:16:30PM +0100, Stefan Hajnoczi wrote:
> On Tue, Sep 03, 2019 at 09:55:49AM -0400, Michael S. Tsirkin wrote:
> > On Tue, Sep 03, 2019 at 01:42:02PM +0200, Miklos Szeredi wrote:
> > Endian-ness for fuse header also looks wrong.
> [...]
> > > +struct virtio_fs_forget {
> > > + struct fuse_in_header ih;
> > > + struct fuse_forget_in arg;
> >
> > These structures are all native endian.
> >
> > Passing them to host will make cross-endian setups painful to support,
> > and hardware implementations impossible.
> >
> > How about converting everything to LE?
>
> The driver dictates the endianness of the FUSE protocol session. The
> virtio-fs device specification states that the device looks at the first
> request's fuse_in_header::opcode field to detect the guest endianness.
>
> If it sees FUSE_INIT in its native endianness then no byte-swapping is
> necessary. If it sees FUSE_INIT in the opposite endianness then
> byte-swapping is necessary on the device side.
You are right, please ignore the comment. We do need to reserve the
byte-swapped FUSE_INIT value, though, to make sure future versions of
FUSE never send it. I sent a patch to that effect; let's see whether it
gets accepted.
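The idea, roughly (the exact name and placement are up for review):
FUSE_INIT is opcode 26, so its byte-swapped 32-bit form is 26 << 24 =
436207616, and that value must never be assigned to a real opcode:

enum fuse_opcode_reserved {
	/* Reserved: FUSE_INIT (26) as seen from the opposite endianness */
	FUSE_INIT_BSWAP_RESERVED = 436207616,	/* 26 << 24 */
};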
--
MST
On Tue, Sep 03, 2019 at 09:55:49AM -0400, Michael S. Tsirkin wrote:
[..]
> What's with all of the TODOs? Some of these are really scary,
> looks like they need to be figured out before this is merged.
Hi Michael,
One of the issues I noticed is races w.r.t. device removal and super
block initialization. I am about to post a set of patches which take
care of these races and also get rid of some of the scary TODOs. Other
TODOs, like suspend/restore and multiqueue support, are improvements
which we can do over a period of time.
[..]
> > +/* Per-virtqueue state */
> > +struct virtio_fs_vq {
> > + spinlock_t lock;
> > + struct virtqueue *vq; /* protected by ->lock */
> > + struct work_struct done_work;
> > + struct list_head queued_reqs;
> > + struct delayed_work dispatch_work;
> > + struct fuse_dev *fud;
> > + bool connected;
> > + long in_flight;
> > + char name[24];
>
> I'd keep names somewhere separate as they are not used on data path.
OK, this sounds like a nice-to-have. Will take care of it once the base
patch gets merged.
[..]
> > +struct virtio_fs_forget {
> > + struct fuse_in_header ih;
> > + struct fuse_forget_in arg;
>
> These structures are all native endian.
>
> Passing them to host will make cross-endian setups painful to support,
> and hardware implementations impossible.
>
> How about converting everything to LE?
So it looks like the endianness issue is now resolved (going by the
other emails), and I will not worry about it.
[..]
> > +/* Add a new instance to the list or return -EEXIST if tag name exists*/
> > +static int virtio_fs_add_instance(struct virtio_fs *fs)
> > +{
> > + struct virtio_fs *fs2;
> > + bool duplicate = false;
> > +
> > + mutex_lock(&virtio_fs_mutex);
> > +
> > + list_for_each_entry(fs2, &virtio_fs_instances, list) {
> > + if (strcmp(fs->tag, fs2->tag) == 0)
> > + duplicate = true;
> > + }
> > +
> > + if (!duplicate)
> > + list_add_tail(&fs->list, &virtio_fs_instances);
>
>
> This is O(N^2) as it's presumably called for each istance.
> Doesn't scale - please switch to a tree or such.
This is O(N) and not O(N^2), right? Adding a device is O(N), and the
search during mount is O(N).
This is not a frequent event at all, and the number of virtiofs
instances per guest is expected to be fairly small (say, fewer than 10),
so I really don't think there is any value in converting this list into
a tree.
[..]
> > +static void virtio_fs_free_devs(struct virtio_fs *fs)
> > +{
> > + unsigned int i;
> > +
> > + /* TODO lock */
>
> Doesn't inspire confidence, does it?
Agreed. Getting rid of this in the set of fixes I am about to post.
>
> > +
> > + for (i = 0; i < fs->nvqs; i++) {
> > + struct virtio_fs_vq *fsvq = &fs->vqs[i];
> > +
> > + if (!fsvq->fud)
> > + continue;
> > +
> > + flush_work(&fsvq->done_work);
> > + flush_delayed_work(&fsvq->dispatch_work);
> > +
> > + /* TODO need to quiesce/end_requests/decrement dev_count */
>
> Indeed. Won't this crash if we don't?
Took care of this as well.
[..]
> > +static void virtio_fs_hiprio_dispatch_work(struct work_struct *work)
> > +{
[..]
> > + sg_init_one(&sg, forget, sizeof(*forget));
>
> This passes to host a structure including "struct list_head list";
>
> Not a good idea.
OK, the host does not have to see "struct list_head list"; it is only
needed by the guest. Will look into getting rid of this.
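Something along these lines, perhaps (a sketch only; the struct names
are invented): keep the wire format in its own struct and wrap it with
the guest-side bookkeeping:

struct virtio_fs_forget_req {
	struct fuse_in_header ih;	/* what the device sees */
	struct fuse_forget_in arg;
};

struct virtio_fs_forget {
	struct list_head list;		/* guest-only requeue bookkeeping */
	struct virtio_fs_forget_req req;
};

Then only the wire part gets mapped into the sg list:

	sg_init_one(&sg, &forget->req, sizeof(forget->req));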
>
>
> > +
> > + /* Enqueue the request */
> > + dev_dbg(&vq->vdev->dev, "%s\n", __func__);
> > + ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC);
>
>
> This is easier as add_outbuf.
Will look into it.
>
> Also - why GFP_ATOMIC?
Hmm, maybe it can be GFP_KERNEL. I don't see an atomic context here.
Will look into it.
>
> > + if (ret < 0) {
> > + if (ret == -ENOMEM || ret == -ENOSPC) {
> > + pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later\n",
> > + ret);
> > + list_add_tail(&forget->list,
> > + &fsvq->queued_reqs);
> > + schedule_delayed_work(&fsvq->dispatch_work,
> > + msecs_to_jiffies(1));
>
> Can't we we requeue after some buffers get consumed?
That's what the dispatch work is doing: it retries queueing the request
after a while.
[..]
> > +static int virtio_fs_probe(struct virtio_device *vdev)
> > +{
> > + struct virtio_fs *fs;
> > + int ret;
> > +
> > + fs = devm_kzalloc(&vdev->dev, sizeof(*fs), GFP_KERNEL);
> > + if (!fs)
> > + return -ENOMEM;
> > + vdev->priv = fs;
> > +
> > + ret = virtio_fs_read_tag(vdev, fs);
> > + if (ret < 0)
> > + goto out;
> > +
> > + ret = virtio_fs_setup_vqs(vdev, fs);
> > + if (ret < 0)
> > + goto out;
> > +
> > + /* TODO vq affinity */
> > + /* TODO populate notifications vq */
>
> what's notifications vq?
It has not been implemented yet. At some point we want to have a
notification queue so that the host can send notifications to the
guest. Will get rid of this comment for now.
[..]
> > +#ifdef CONFIG_PM_SLEEP
> > +static int virtio_fs_freeze(struct virtio_device *vdev)
> > +{
> > + return 0; /* TODO */
> > +}
> > +
> > +static int virtio_fs_restore(struct virtio_device *vdev)
> > +{
> > + return 0; /* TODO */
> > +}
>
> Is this really a good idea? I'd rather it was implemented,
> but if not possible at all disabling PM seems better than just
> keep going.
I agree. Will look into disabling it.
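Perhaps just failing .freeze until save/restore is actually
implemented, something like this (a sketch; assumes an error return
from .freeze is enough to veto the suspend):

static int virtio_fs_freeze(struct virtio_device *vdev)
{
	/* TODO: save device state so that restore can rebuild it */
	pr_warn("virtio-fs: suspend/resume not yet supported\n");
	return -EOPNOTSUPP;
}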
>
> > +#endif /* CONFIG_PM_SLEEP */
> > +
> > +const static struct virtio_device_id id_table[] = {
> > + { VIRTIO_ID_FS, VIRTIO_DEV_ANY_ID },
> > + {},
> > +};
> > +
> > +const static unsigned int feature_table[] = {};
> > +
> > +static struct virtio_driver virtio_fs_driver = {
> > + .driver.name = KBUILD_MODNAME,
> > + .driver.owner = THIS_MODULE,
> > + .id_table = id_table,
> > + .feature_table = feature_table,
> > + .feature_table_size = ARRAY_SIZE(feature_table),
> > + /* TODO validate config_get != NULL */
>
> Why?
I don't know. Stefan, do you remember why you put this comment here? If
not, I will get rid of it.
>
> > + .probe = virtio_fs_probe,
> > + .remove = virtio_fs_remove,
> > +#ifdef CONFIG_PM_SLEEP
> > + .freeze = virtio_fs_freeze,
> > + .restore = virtio_fs_restore,
> > +#endif
> > +};
> > +
> > +static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq)
> > +__releases(fiq->waitq.lock)
> > +{
[..]
> > + /* Enqueue the request */
> > + vq = fsvq->vq;
> > + dev_dbg(&vq->vdev->dev, "%s\n", __func__);
> > + spin_lock(&fsvq->lock);
> > +
> > + ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC);
> > + if (ret < 0) {
> > + if (ret == -ENOMEM || ret == -ENOSPC) {
> > + pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later.\n",
> > + ret);
> > + list_add_tail(&forget->list, &fsvq->queued_reqs);
> > + schedule_delayed_work(&fsvq->dispatch_work,
> > + msecs_to_jiffies(1));
> > + } else {
> > + pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n",
> > + ret);
> > + kfree(forget);
> > + }
> > + spin_unlock(&fsvq->lock);
> > + goto out;
> > + }
>
>
> code duplicated from virtio_fs_hiprio_dispatch_work
Will look into reusing the code.
>
> > +
> > + fsvq->in_flight++;
> > + notify = virtqueue_kick_prepare(vq);
> > +
> > + spin_unlock(&fsvq->lock);
> > +
> > + if (notify)
> > + virtqueue_notify(vq);
> > +out:
> > + kfree(link);
> > +}
> > +
> > +static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq)
> > +__releases(fiq->waitq.lock)
> > +{
> > + /* TODO */
>
> what's needed?
We have not implemented the interrupt mechanism yet. It interrupts
existing pending requests; it's not a must-have.
[..]
> > +static int virtio_fs_enqueue_req(struct virtqueue *vq, struct fuse_req *req)
> > +{
[..]
> > +
> > + fsvq = vq_to_fsvq(vq);
> > + spin_lock(&fsvq->lock);
> > +
> > + ret = virtqueue_add_sgs(vq, sgs, out_sgs, in_sgs, req, GFP_ATOMIC);
> > + if (ret < 0) {
> > + /* TODO handle full virtqueue */
>
> Indeed. What prevents this?
So for forget requests (VQ_HIPRIO), I already handled this by retrying
submission of the request with the help of a worker.
For regular requests (VQ_REQUEST), I have never run into the virtqueue
being full so far, which is why I never worried about it.
So nothing prevents this, but we have not hit it yet; hence it's a TODO
item. It would be nice to retry when the virtqueue gets full (instead of
returning an error to the caller).
[..]
> > +static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq)
> > +__releases(fiq->waitq.lock)
> > +{
[..]
> > + list_add_tail(&req->list, fpq->processing);
> > + spin_unlock(&fpq->lock);
> > + set_bit(FR_SENT, &req->flags);
> > + /* matches barrier in request_wait_answer() */
> > + smp_mb__after_atomic();
> > + /* TODO check for FR_INTERRUPTED? */
>
>
> ?
Hmm, we don't support FR_INTERRUPTED. Stefan, do you remember why this
TODO is here? If not, I will get rid of it.
>
> > +
> > +retry:
> > + ret = virtio_fs_enqueue_req(fs->vqs[queue_id].vq, req);
> > + if (ret < 0) {
> > + if (ret == -ENOMEM || ret == -ENOSPC) {
> > + /* Virtqueue full. Retry submission */
> > + usleep_range(20, 30);
>
> again, why not respond to completion?
Using usleep() was a quick fix. Will look into using a completion in
the second round of cleanup; in the first round I am taking care of the
scarier TODOs, which deal with races w.r.t. device removal.
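The rough shape I have in mind is below (a sketch only; "space_avail"
is a hypothetical new completion field on struct virtio_fs_vq, and the
helper name is made up):

static int virtio_fs_enqueue_req_wait(struct virtio_fs_vq *fsvq,
				      struct fuse_req *req)
{
	int ret;

	for (;;) {
		ret = virtio_fs_enqueue_req(fsvq->vq, req);
		if (ret != -ENOMEM && ret != -ENOSPC)
			return ret;
		/* Virtqueue full: sleep until done_work reaps buffers */
		wait_for_completion(&fsvq->space_avail);
		reinit_completion(&fsvq->space_avail);
	}
}

virtio_fs_requests_done_work() would then call
complete_all(&fsvq->space_avail) once it has reaped completed buffers
off the virtqueue.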
>
> > + goto retry;
> > + }
> > + req->out.h.error = ret;
> > + pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n", ret);
> > + fuse_request_end(fc, req);
> > + return;
> > + }
> > +}
> > +
> > +static void virtio_fs_flush_hiprio_queue(struct virtio_fs_vq *fsvq)
> > +{
[..]
> > + /* Wait for in flight requests to finish.*/
> > + while (1) {
> > + spin_lock(&fsvq->lock);
> > + if (!fsvq->in_flight) {
> > + spin_unlock(&fsvq->lock);
> > + break;
> > + }
> > + spin_unlock(&fsvq->lock);
> > + usleep_range(1000, 2000);
>
> Same thing here. Can we use e.g. a completion and not usleep?
Same as above; second-round cleanup.
>
> > + }
> > +}
> > +
> > +const static struct fuse_iqueue_ops virtio_fs_fiq_ops = {
> > + .wake_forget_and_unlock = virtio_fs_wake_forget_and_unlock,
> > + .wake_interrupt_and_unlock = virtio_fs_wake_interrupt_and_unlock,
> > + .wake_pending_and_unlock = virtio_fs_wake_pending_and_unlock,
> > +};
> > +
> > +static int virtio_fs_fill_super(struct super_block *sb)
> > +{
[..]
> > +
> > + /* TODO lock */
>
> what does this refer to?
Took care of this in the first round of cleanup.
>
[..]
> > + err = fuse_fill_super_common(sb, &ctx);
> > + if (err < 0)
> > + goto err_free_init_req;
> > +
> > + fc = fs->vqs[VQ_REQUEST].fud->fc;
> > +
> > + /* TODO take fuse_mutex around this loop? */
>
> Don't we need to figure this kind of thing out before this
> code lands upstream?
I think we don't need this TODO. The fuse_conn object and the
associated super block are still being formed, and my cleanup has taken
care of the additional locking.
Thanks
Vivek
On Thu, Sep 05, 2019 at 03:15:15PM -0400, Vivek Goyal wrote:
> [..]
> > > +	/* TODO validate config_get != NULL */
> >
> > Why?
>
> I don't know. Stefan, do you remember why you put this comment here? If
> not, I will get rid of it.
This comment can be removed.

> [..]
> > > +	/* TODO check for FR_INTERRUPTED? */
> >
> > ?
>
> Hmm, we don't support FR_INTERRUPTED. Stefan, do you remember why this
> TODO is here? If not, I will get rid of it.
We don't support FUSE_INTERRUPT yet. The purpose of this comment is
that when we do support FUSE_INTERRUPT we'll need to follow
fuse_dev_do_read() in queuing a FUSE_INTERRUPT here.
Stefan
On Fri, Sep 06, 2019 at 11:22:09AM +0100, Stefan Hajnoczi wrote:
> [..]
> > > > +	/* TODO check for FR_INTERRUPTED? */
> > >
> > > ?
> >
> > Hmm, we don't support FR_INTERRUPTED. Stefan, do you remember why
> > this TODO is here? If not, I will get rid of it.
>
> We don't support FUSE_INTERRUPT yet. The purpose of this comment is
> that when we do support FUSE_INTERRUPT we'll need to follow
> fuse_dev_do_read() in queuing a FUSE_INTERRUPT here.
OK, so please write this explicitly in the comment.
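Something like:

	/*
	 * TODO support FR_INTERRUPTED: FUSE_INTERRUPT is not implemented
	 * yet.  When it is, follow fuse_dev_do_read() and queue a
	 * FUSE_INTERRUPT request here if FR_INTERRUPTED is set.
	 */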
--
MST