2008-08-28 18:20:42

by Tejun Heo

[permalink] [raw]
Subject: [PATCHSET] CUSE: implement CUSE

This patchset implements CUSE - Character device in Userspace. Except
for the initialization sequence and the creation of a character device
instead of a mount, CUSE isn't very different from FUSE.

This patchset consists of the following five patches.

0001-FUSE-add-fuse_-prefix-to-several-functions.patch
0002-FUSE-export-symbols-to-be-used-by-CUSE.patch
0003-FUSE-separate-out-fuse_conn_init-from-new_conn.patch
0004-FUSE-add-fuse_conn-release.patch
0005-CUSE-implement-CUSE-Character-device-in-Userspace.patch

0001-0004 prepare FUSE for the CUSE addition and 0005 implements CUSE.
Corresponding libfuse changes will be posted separately.

This patchset is on top of...

2.6.27-rc4 (b8e6c91c74e9f0279b7c51048779b3d62da60b88)
+ [1] 9p-use-single-poller patchset
+ [2] wait-kill-is_sync_wait
+ [3] poll-allow-f_op_poll-to-sleep
+ [4] uevent updates (2 patches)
+ [5] char_dev-add-release
+ [6] extend-FUSE patchset

The first three patches above ([1]-[3]) allow f_op->poll() to sleep and
0007 depends on it.

This patchset is available in the following git tree.

http://git.kernel.org/?p=linux/kernel/git/tj/misc.git;a=shortlog;h=cuse
git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc.git cuse

and contains the following changes.

fs/Kconfig | 10
fs/fuse/Makefile | 1
fs/fuse/cuse.c | 634 +++++++++++++++++++++++++++++++++++++++++++++++++++
fs/fuse/dev.c | 32 +-
fs/fuse/dir.c | 34 +-
fs/fuse/file.c | 60 ++--
fs/fuse/fuse_i.h | 46 +++
fs/fuse/inode.c | 143 ++++++-----
include/linux/cuse.h | 40 +++
include/linux/fuse.h | 2
10 files changed, 882 insertions(+), 120 deletions(-)

Thanks.

--
tejun

[1] http://thread.gmane.org/gmane.linux.kernel/726098
[2] http://article.gmane.org/gmane.linux.kernel/726176
[3] http://article.gmane.org/gmane.linux.kernel/726178
[4] http://thread.gmane.org/gmane.linux.kernel/727127
[5] http://article.gmane.org/gmane.linux.kernel/727133
[6] http://thread.gmane.org/gmane.linux.kernel/727161


2008-08-28 18:20:59

by Tejun Heo

[permalink] [raw]
Subject: [PATCH 1/5] FUSE: add fuse_ prefix to several functions

Add fuse_ prefix to request_send*() and get_root_inode() as some of
those functions will be exported for CUSE. With or without CUSE
export, having the function names scoped is a good idea for
debuggability.

Signed-off-by: Tejun Heo <[email protected]>
---
fs/fuse/dev.c | 23 ++++++++++++-----------
fs/fuse/dir.c | 34 +++++++++++++++++-----------------
fs/fuse/file.c | 30 +++++++++++++++---------------
fs/fuse/fuse_i.h | 9 +++++----
fs/fuse/inode.c | 12 ++++++------
5 files changed, 55 insertions(+), 53 deletions(-)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 1c422f9..b448dfd 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -380,7 +380,7 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
}
}

-void request_send(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
{
req->isreply = 1;
spin_lock(&fc->lock);
@@ -399,8 +399,8 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req)
spin_unlock(&fc->lock);
}

-static void request_send_nowait_locked(struct fuse_conn *fc,
- struct fuse_req *req)
+static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
+ struct fuse_req *req)
{
req->background = 1;
fc->num_background++;
@@ -414,11 +414,11 @@ static void request_send_nowait_locked(struct fuse_conn *fc,
flush_bg_queue(fc);
}

-static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
{
spin_lock(&fc->lock);
if (fc->connected) {
- request_send_nowait_locked(fc, req);
+ fuse_request_send_nowait_locked(fc, req);
spin_unlock(&fc->lock);
} else {
req->out.h.error = -ENOTCONN;
@@ -426,16 +426,16 @@ static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
}
}

-void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
{
req->isreply = 0;
- request_send_nowait(fc, req);
+ fuse_request_send_nowait(fc, req);
}

-void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
{
req->isreply = 1;
- request_send_nowait(fc, req);
+ fuse_request_send_nowait(fc, req);
}

/*
@@ -443,10 +443,11 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
*
* fc->connected must have been checked previously
*/
-void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_send_background_locked(struct fuse_conn *fc,
+ struct fuse_req *req)
{
req->isreply = 1;
- request_send_nowait_locked(fc, req);
+ fuse_request_send_nowait_locked(fc, req);
}

/*
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index fd03330..fa28925 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -189,7 +189,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
parent = dget_parent(entry);
fuse_lookup_init(fc, req, get_node_id(parent->d_inode),
&entry->d_name, &outarg);
- request_send(fc, req);
+ fuse_request_send(fc, req);
dput(parent);
err = req->out.h.error;
fuse_put_request(fc, req);
@@ -283,7 +283,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
attr_version = fuse_get_attr_version(fc);

fuse_lookup_init(fc, req, nodeid, name, outarg);
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
/* Zero nodeid is same as -ENOENT, but with valid timeout */
@@ -369,7 +369,7 @@ static void fuse_sync_release(struct fuse_conn *fc, struct fuse_file *ff,
{
fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE);
ff->reserved_req->force = 1;
- request_send(fc, ff->reserved_req);
+ fuse_request_send(fc, ff->reserved_req);
fuse_put_request(fc, ff->reserved_req);
kfree(ff);
}
@@ -432,7 +432,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
req->out.args[0].value = &outentry;
req->out.args[1].size = sizeof(outopen);
req->out.args[1].value = &outopen;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
if (err) {
if (err == -ENOSYS)
@@ -502,7 +502,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
else
req->out.args[0].size = sizeof(outarg);
req->out.args[0].value = &outarg;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
if (err)
@@ -631,7 +631,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
req->in.numargs = 1;
req->in.args[0].size = entry->d_name.len + 1;
req->in.args[0].value = entry->d_name.name;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
if (!err) {
@@ -662,7 +662,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
req->in.numargs = 1;
req->in.args[0].size = entry->d_name.len + 1;
req->in.args[0].value = entry->d_name.name;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
if (!err) {
@@ -695,7 +695,7 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
req->in.args[1].value = oldent->d_name.name;
req->in.args[2].size = newent->d_name.len + 1;
req->in.args[2].value = newent->d_name.name;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
if (!err) {
@@ -811,7 +811,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
else
req->out.args[0].size = sizeof(outarg);
req->out.args[0].value = &outarg;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
if (!err) {
@@ -904,7 +904,7 @@ static int fuse_access(struct inode *inode, int mask)
req->in.numargs = 1;
req->in.args[0].size = sizeof(inarg);
req->in.args[0].value = &inarg;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
if (err == -ENOSYS) {
@@ -1026,7 +1026,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
req->num_pages = 1;
req->pages[0] = page;
fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR);
- request_send(fc, req);
+ fuse_request_send(fc, req);
nbytes = req->out.args[0].size;
err = req->out.h.error;
fuse_put_request(fc, req);
@@ -1060,7 +1060,7 @@ static char *read_link(struct dentry *dentry)
req->out.numargs = 1;
req->out.args[0].size = PAGE_SIZE - 1;
req->out.args[0].value = link;
- request_send(fc, req);
+ fuse_request_send(fc, req);
if (req->out.h.error) {
free_page((unsigned long) link);
link = ERR_PTR(req->out.h.error);
@@ -1266,7 +1266,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
else
req->out.args[0].size = sizeof(outarg);
req->out.args[0].value = &outarg;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
if (err) {
@@ -1360,7 +1360,7 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
req->in.args[1].value = name;
req->in.args[2].size = size;
req->in.args[2].value = value;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
if (err == -ENOSYS) {
@@ -1406,7 +1406,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
req->out.args[0].size = sizeof(outarg);
req->out.args[0].value = &outarg;
}
- request_send(fc, req);
+ fuse_request_send(fc, req);
ret = req->out.h.error;
if (!ret)
ret = size ? req->out.args[0].size : outarg.size;
@@ -1456,7 +1456,7 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
req->out.args[0].size = sizeof(outarg);
req->out.args[0].value = &outarg;
}
- request_send(fc, req);
+ fuse_request_send(fc, req);
ret = req->out.h.error;
if (!ret)
ret = size ? req->out.args[0].size : outarg.size;
@@ -1489,7 +1489,7 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
req->in.numargs = 1;
req->in.args[0].size = strlen(name) + 1;
req->in.args[0].value = name;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
if (err == -ENOSYS) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 5d704e3..40895ed 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -39,7 +39,7 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
req->out.numargs = 1;
req->out.args[0].size = sizeof(*outargp);
req->out.args[0].value = outargp;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);

@@ -91,7 +91,7 @@ static void fuse_file_put(struct fuse_file *ff)
struct inode *inode = req->misc.release.dentry->d_inode;
struct fuse_conn *fc = get_fuse_conn(inode);
req->end = fuse_release_end;
- request_send_background(fc, req);
+ fuse_request_send_background(fc, req);
kfree(ff);
}
}
@@ -286,7 +286,7 @@ static int fuse_flush(struct file *file, fl_owner_t id)
req->in.args[0].size = sizeof(inarg);
req->in.args[0].value = &inarg;
req->force = 1;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
if (err == -ENOSYS) {
@@ -350,7 +350,7 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
req->in.numargs = 1;
req->in.args[0].size = sizeof(inarg);
req->in.args[0].value = &inarg;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
if (err == -ENOSYS) {
@@ -403,7 +403,7 @@ static size_t fuse_send_read(struct fuse_req *req, struct file *file,
}
if (file->f_flags & O_NONBLOCK)
inarg->read_flags |= FUSE_READ_NONBLOCK;
- request_send(fc, req);
+ fuse_request_send(fc, req);
return req->out.args[0].size;
}

@@ -516,9 +516,9 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file,
struct fuse_file *ff = file->private_data;
req->ff = fuse_file_get(ff);
req->end = fuse_readpages_end;
- request_send_background(fc, req);
+ fuse_request_send_background(fc, req);
} else {
- request_send(fc, req);
+ fuse_request_send(fc, req);
fuse_readpages_end(fc, req);
}
}
@@ -646,7 +646,7 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file,
}
if (file->f_flags & O_NONBLOCK)
inarg->write_flags |= FUSE_WRITE_NONBLOCK;
- request_send(fc, req);
+ fuse_request_send(fc, req);
return req->misc.write.out.size;
}

@@ -1089,7 +1089,7 @@ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)

req->in.args[1].size = inarg->size;
fi->writectr++;
- request_send_background_locked(fc, req);
+ fuse_request_send_background_locked(fc, req);
return;

out_free:
@@ -1335,7 +1335,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl)
req->out.numargs = 1;
req->out.args[0].size = sizeof(outarg);
req->out.args[0].value = &outarg;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
if (!err)
@@ -1367,7 +1367,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
return PTR_ERR(req);

fuse_lk_fill(req, file, fl, opcode, pid, flock);
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
/* locking is restartable */
if (err == -EINTR)
@@ -1443,7 +1443,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
req->out.numargs = 1;
req->out.args[0].size = sizeof(outarg);
req->out.args[0].value = &outarg;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
if (err == -ENOSYS)
@@ -1483,7 +1483,7 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
req->out.numargs = 1;
req->out.args[0].size = sizeof(outarg);
req->out.args[0].value = &outarg;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);

@@ -1681,7 +1681,7 @@ static long fuse_file_do_ioctl(struct file *file, unsigned int cmd,
req->out.argpages = 1;
req->out.argvar = 1;

- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
transferred = req->out.args[1].size;
fuse_put_request(fc, req);
@@ -1847,7 +1847,7 @@ static unsigned fuse_file_poll(struct file *file, poll_table *wait)
req->out.numargs = 1;
req->out.args[0].size = sizeof(outarg);
req->out.args[0].value = &outarg;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);

diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index f884160..c5c11d7 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -619,19 +619,20 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
/**
* Send a request (synchronous)
*/
-void request_send(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req);

/**
* Send a request with no reply
*/
-void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);

/**
* Send a request in the background
*/
-void request_send_background(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);

-void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_request_send_background_locked(struct fuse_conn *fc,
+ struct fuse_req *req);

/* Abort all requests */
void fuse_abort_conn(struct fuse_conn *fc);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 088ba6e..c8806bb 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -94,7 +94,7 @@ void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
req->in.numargs = 1;
req->in.args[0].size = sizeof(struct fuse_forget_in);
req->in.args[0].value = inarg;
- request_send_noreply(fc, req);
+ fuse_request_send_noreply(fc, req);
}

static void fuse_clear_inode(struct inode *inode)
@@ -269,7 +269,7 @@ static void fuse_send_destroy(struct fuse_conn *fc)
fc->destroy_req = NULL;
req->in.h.opcode = FUSE_DESTROY;
req->force = 1;
- request_send(fc, req);
+ fuse_request_send(fc, req);
fuse_put_request(fc, req);
}
}
@@ -334,7 +334,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
req->out.args[0].size =
fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg);
req->out.args[0].value = &outarg;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
if (!err)
convert_fuse_statfs(buf, &outarg.st);
@@ -543,7 +543,7 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
return fc;
}

-static struct inode *get_root_inode(struct super_block *sb, unsigned mode)
+static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
{
struct fuse_attr attr;
memset(&attr, 0, sizeof(attr));
@@ -797,7 +797,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
req->out.args[0].size = sizeof(struct fuse_init_out);
req->out.args[0].value = &req->misc.init_out;
req->end = process_init_reply;
- request_send_background(fc, req);
+ fuse_request_send_background(fc, req);
}

static int fuse_fill_super(struct super_block *sb, void *data, int silent)
@@ -851,7 +851,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
sb->s_fs_info = fc;

err = -ENOMEM;
- root = get_root_inode(sb, d.rootmode);
+ root = fuse_get_root_inode(sb, d.rootmode);
if (!root)
goto err;

--
1.5.4.5

2008-08-28 18:21:20

by Tejun Heo

[permalink] [raw]
Subject: [PATCH 2/5] FUSE: export symbols to be used by CUSE

Export the following symbols for CUSE.

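fuse_request_alloc()
fuse_get_req()
fuse_put_request()
fuse_request_send()
fuse_abort_conn()
fuse_dev_release()
fuse_dev_operations
fuse_conn_put()
fuse_conn_get()
fuse_get_root_inode()
fuse_super_operations
fuse_send_init()
fuse_flush()
fuse_fsync()
fuse_direct_io()
fuse_file_lock()
fuse_file_flock()
fuse_file_llseek()
fuse_file_ioctl()
fuse_file_compat_ioctl()
fuse_file_poll()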

Signed-off-by: Tejun Heo <[email protected]>
---
fs/fuse/dev.c | 9 ++++++++-
fs/fuse/file.c | 30 ++++++++++++++++++++----------
fs/fuse/fuse_i.h | 29 +++++++++++++++++++++++++++++
fs/fuse/inode.c | 11 ++++++++---
4 files changed, 65 insertions(+), 14 deletions(-)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index b448dfd..75e2775 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -46,6 +46,7 @@ struct fuse_req *fuse_request_alloc(void)
fuse_request_init(req);
return req;
}
+EXPORT_SYMBOL_GPL(fuse_request_alloc);

struct fuse_req *fuse_request_alloc_nofs(void)
{
@@ -124,6 +125,7 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc)
atomic_dec(&fc->num_waiting);
return ERR_PTR(err);
}
+EXPORT_SYMBOL_GPL(fuse_get_req);

/*
* Return request in fuse_file->reserved_req. However that may
@@ -208,6 +210,7 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
fuse_request_free(req);
}
}
+EXPORT_SYMBOL_GPL(fuse_put_request);

static unsigned len_args(unsigned numargs, struct fuse_arg *args)
{
@@ -398,6 +401,7 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
}
spin_unlock(&fc->lock);
}
+EXPORT_SYMBOL_GPL(fuse_request_send);

static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
struct fuse_req *req)
@@ -1092,8 +1096,9 @@ void fuse_abort_conn(struct fuse_conn *fc)
}
spin_unlock(&fc->lock);
}
+EXPORT_SYMBOL_GPL(fuse_abort_conn);

-static int fuse_dev_release(struct inode *inode, struct file *file)
+int fuse_dev_release(struct inode *inode, struct file *file)
{
struct fuse_conn *fc = fuse_get_conn(file);
if (fc) {
@@ -1108,6 +1113,7 @@ static int fuse_dev_release(struct inode *inode, struct file *file)

return 0;
}
+EXPORT_SYMBOL_GPL(fuse_dev_release);

static int fuse_dev_fasync(int fd, struct file *file, int on)
{
@@ -1130,6 +1136,7 @@ const struct file_operations fuse_dev_operations = {
.release = fuse_dev_release,
.fasync = fuse_dev_fasync,
};
+EXPORT_SYMBOL_GPL(fuse_dev_operations);

static struct miscdevice fuse_miscdevice = {
.minor = FUSE_MINOR,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 40895ed..38b0bfb 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -12,6 +12,7 @@
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/sched.h>
+#include <linux/module.h>

static const struct file_operations fuse_direct_io_file_operations;

@@ -261,7 +262,7 @@ static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
return 0;
}

-static int fuse_flush(struct file *file, fl_owner_t id)
+int fuse_flush(struct file *file, fl_owner_t id)
{
struct inode *inode = file->f_path.dentry->d_inode;
struct fuse_conn *fc = get_fuse_conn(inode);
@@ -295,6 +296,7 @@ static int fuse_flush(struct file *file, fl_owner_t id)
}
return err;
}
+EXPORT_SYMBOL_GPL(fuse_flush);

/*
* Wait for all pending writepages on the inode to finish.
@@ -363,10 +365,11 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
return err;
}

-static int fuse_fsync(struct file *file, struct dentry *de, int datasync)
+int fuse_fsync(struct file *file, struct dentry *de, int datasync)
{
return fuse_fsync_common(file, de, datasync, 0);
}
+EXPORT_SYMBOL_GPL(fuse_fsync);

void fuse_read_fill(struct fuse_req *req, struct file *file,
struct inode *inode, loff_t pos, size_t count, int opcode)
@@ -961,8 +964,8 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
return 0;
}

-static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos, int write)
+ssize_t fuse_direct_io(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos, int write)
{
struct inode *inode = file->f_path.dentry->d_inode;
struct fuse_conn *fc = get_fuse_conn(inode);
@@ -1027,6 +1030,7 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,

return res;
}
+EXPORT_SYMBOL_GPL(fuse_direct_io);

static ssize_t fuse_direct_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
@@ -1376,7 +1380,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
return err;
}

-static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
+int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
{
struct inode *inode = file->f_path.dentry->d_inode;
struct fuse_conn *fc = get_fuse_conn(inode);
@@ -1398,8 +1402,9 @@ static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
}
return err;
}
+EXPORT_SYMBOL_GPL(fuse_file_lock);

-static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
+int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
{
struct inode *inode = file->f_path.dentry->d_inode;
struct fuse_conn *fc = get_fuse_conn(inode);
@@ -1415,6 +1420,7 @@ static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)

return err;
}
+EXPORT_SYMBOL_GPL(fuse_file_flock);

static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
{
@@ -1452,7 +1458,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
return err ? 0 : outarg.block;
}

-static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
+loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
{
loff_t retval = -EINVAL;
struct inode *inode = file->f_path.dentry->d_inode;
@@ -1512,6 +1518,7 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
mutex_unlock(&inode->i_mutex);
return retval;
}
+EXPORT_SYMBOL_GPL(fuse_file_llseek);

static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
unsigned int nr_segs, size_t bytes, bool to_user)
@@ -1738,17 +1745,19 @@ static long fuse_file_do_ioctl(struct file *file, unsigned int cmd,
return err ? err : outarg.result;
}

-static long fuse_file_ioctl(struct file *file, unsigned int cmd,
+long fuse_file_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
return fuse_file_do_ioctl(file, cmd, arg, 0);
}
+EXPORT_SYMBOL_GPL(fuse_file_ioctl);

-static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
+long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
return fuse_file_do_ioctl(file, cmd, arg, FUSE_IOCTL_COMPAT);
}
+EXPORT_SYMBOL_GPL(fuse_file_compat_ioctl);

/*
* All files which have been polled are linked to RB tree
@@ -1811,7 +1820,7 @@ static void fuse_register_polled_file(struct fuse_conn *fc,
spin_unlock(&fc->lock);
}

-static unsigned fuse_file_poll(struct file *file, poll_table *wait)
+unsigned fuse_file_poll(struct file *file, poll_table *wait)
{
struct inode *inode = file->f_dentry->d_inode;
struct fuse_file *ff = file->private_data;
@@ -1859,6 +1868,7 @@ static unsigned fuse_file_poll(struct file *file, poll_table *wait)
}
return POLLERR;
}
+EXPORT_SYMBOL_GPL(fuse_file_poll);

/*
* This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index c5c11d7..bc55f6d 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -488,10 +488,14 @@ static inline u64 get_node_id(struct inode *inode)
}

/** Device operations */
+extern const struct super_operations fuse_super_operations;
+
extern const struct file_operations fuse_dev_operations;

extern struct dentry_operations fuse_dentry_operations;

+struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode);
+
/**
* Get a filled in inode
*/
@@ -503,6 +507,11 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
struct fuse_entry_out *outarg, struct inode **inode);

/**
+ * Send INIT command
+ */
+void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req);
+
+/**
* Send FORGET command
*/
void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
@@ -539,6 +548,21 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
int isdir);

/**
+ * Exported file operations
+ */
+loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin);
+unsigned fuse_file_poll(struct file *file, poll_table *wait);
+ssize_t fuse_direct_io(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos, int write);
+int fuse_flush(struct file *file, fl_owner_t id);
+int fuse_fsync(struct file *file, struct dentry *de, int datasync);
+int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl);
+int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl);
+long fuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg);
+
+/**
* Notify poll wakeup
*/
int fuse_notify_poll_wakeup(struct fuse_conn *fc,
@@ -581,6 +605,11 @@ void fuse_truncate(struct address_space *mapping, loff_t offset);
int fuse_dev_init(void);

/**
+ * Release the client device
+ */
+int fuse_dev_release(struct inode *inode, struct file *file);
+
+/**
* Cleanup the client device
*/
void fuse_dev_cleanup(void);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index c8806bb..fae8732 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -536,14 +536,16 @@ void fuse_conn_put(struct fuse_conn *fc)
kfree(fc);
}
}
+EXPORT_SYMBOL_GPL(fuse_conn_put);

struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
{
atomic_inc(&fc->count);
return fc;
}
+EXPORT_SYMBOL_GPL(fuse_conn_get);

-static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
+struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
{
struct fuse_attr attr;
memset(&attr, 0, sizeof(attr));
@@ -553,6 +555,7 @@ static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
attr.nlink = 1;
return fuse_iget(sb, 1, 0, &attr, 0, 0);
}
+EXPORT_SYMBOL_GPL(fuse_get_root_inode);

struct fuse_inode_handle
{
@@ -722,7 +725,7 @@ static const struct export_operations fuse_export_operations = {
.get_parent = fuse_get_parent,
};

-static const struct super_operations fuse_super_operations = {
+const struct super_operations fuse_super_operations = {
.alloc_inode = fuse_alloc_inode,
.destroy_inode = fuse_destroy_inode,
.clear_inode = fuse_clear_inode,
@@ -733,6 +736,7 @@ static const struct super_operations fuse_super_operations = {
.statfs = fuse_statfs,
.show_options = fuse_show_options,
};
+EXPORT_SYMBOL_GPL(fuse_super_operations);

static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
{
@@ -776,7 +780,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
wake_up_all(&fc->blocked_waitq);
}

-static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
{
struct fuse_init_in *arg = &req->misc.init_in;

@@ -799,6 +803,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
req->end = process_init_reply;
fuse_request_send_background(fc, req);
}
+EXPORT_SYMBOL_GPL(fuse_send_init);

static int fuse_fill_super(struct super_block *sb, void *data, int silent)
{
--
1.5.4.5

2008-08-28 18:21:52

by Tejun Heo

[permalink] [raw]
Subject: [PATCH 4/5] FUSE: add fuse_conn->release()

Add fuse_conn->release() so that fuse_conn can be embedded in other
structures. If unspecified, the original action - kfree() - is done.

Signed-off-by: Tejun Heo <[email protected]>
---
fs/fuse/fuse_i.h | 3 +++
fs/fuse/inode.c | 6 +++++-
2 files changed, 8 insertions(+), 1 deletions(-)

diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 4795264..67f33e8 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -465,6 +465,9 @@ struct fuse_conn {

/** Version counter for attribute changes */
u64 attr_version;
+
+ /** Called on final put. If implemented, should free the connection */
+ void (*release)(struct fuse_conn *);
};

static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 8d092ea..b99bb95 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -543,7 +543,11 @@ void fuse_conn_put(struct fuse_conn *fc)
fuse_request_free(fc->destroy_req);
mutex_destroy(&fc->inst_mutex);
bdi_destroy(&fc->bdi);
- kfree(fc);
+
+ if (fc->release)
+ fc->release(fc);
+ else
+ kfree(fc);
}
}
EXPORT_SYMBOL_GPL(fuse_conn_put);
--
1.5.4.5

2008-08-28 18:21:35

by Tejun Heo

[permalink] [raw]
Subject: [PATCH 3/5] FUSE: separate out fuse_conn_init() from new_conn()

Separate out fuse_conn_init() from new_conn() and, while at it,
initialize fuse_conn->entry during conn initialization.

This will be used by CUSE.

Signed-off-by: Tejun Heo <[email protected]>
---
fs/fuse/fuse_i.h | 5 ++
fs/fuse/inode.c | 116 +++++++++++++++++++++++++++++------------------------
2 files changed, 68 insertions(+), 53 deletions(-)

diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index bc55f6d..4795264 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -679,6 +679,11 @@ void fuse_invalidate_entry_cache(struct dentry *entry);
struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);

/**
+ * Initialize fuse_conn
+ */
+int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb);
+
+/**
* Release reference to fuse_conn
*/
void fuse_conn_put(struct fuse_conn *fc);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index fae8732..8d092ea 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -462,66 +462,76 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
return 0;
}

-static struct fuse_conn *new_conn(struct super_block *sb)
+int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb)
{
- struct fuse_conn *fc;
int err;

- fc = kzalloc(sizeof(*fc), GFP_KERNEL);
- if (fc) {
- spin_lock_init(&fc->lock);
- mutex_init(&fc->inst_mutex);
- atomic_set(&fc->count, 1);
- init_waitqueue_head(&fc->waitq);
- init_waitqueue_head(&fc->blocked_waitq);
- init_waitqueue_head(&fc->reserved_req_waitq);
- INIT_LIST_HEAD(&fc->pending);
- INIT_LIST_HEAD(&fc->processing);
- INIT_LIST_HEAD(&fc->io);
- INIT_LIST_HEAD(&fc->interrupts);
- INIT_LIST_HEAD(&fc->bg_queue);
- atomic_set(&fc->num_waiting, 0);
- fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
- fc->bdi.unplug_io_fn = default_unplug_io_fn;
- /* fuse does it's own writeback accounting */
- fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
- fc->polled_files = RB_ROOT;
- fc->dev = sb->s_dev;
- err = bdi_init(&fc->bdi);
- if (err)
- goto error_kfree;
- if (sb->s_bdev) {
- err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
- MAJOR(fc->dev), MINOR(fc->dev));
- } else {
- err = bdi_register_dev(&fc->bdi, fc->dev);
- }
- if (err)
- goto error_bdi_destroy;
- /*
- * For a single fuse filesystem use max 1% of dirty +
- * writeback threshold.
- *
- * This gives about 1M of write buffer for memory maps on a
- * machine with 1G and 10% dirty_ratio, which should be more
- * than enough.
- *
- * Privileged users can raise it by writing to
- *
- * /sys/class/bdi/<bdi>/max_ratio
- */
- bdi_set_max_ratio(&fc->bdi, 1);
- fc->reqctr = 0;
- fc->blocked = 1;
- fc->attr_version = 1;
- get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
+ memset(fc, 0, sizeof(*fc));
+ spin_lock_init(&fc->lock);
+ mutex_init(&fc->inst_mutex);
+ atomic_set(&fc->count, 1);
+ init_waitqueue_head(&fc->waitq);
+ init_waitqueue_head(&fc->blocked_waitq);
+ init_waitqueue_head(&fc->reserved_req_waitq);
+ INIT_LIST_HEAD(&fc->pending);
+ INIT_LIST_HEAD(&fc->processing);
+ INIT_LIST_HEAD(&fc->io);
+ INIT_LIST_HEAD(&fc->interrupts);
+ INIT_LIST_HEAD(&fc->bg_queue);
+ INIT_LIST_HEAD(&fc->entry);
+ atomic_set(&fc->num_waiting, 0);
+ fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
+ fc->bdi.unplug_io_fn = default_unplug_io_fn;
+ /* fuse does it's own writeback accounting */
+ fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
+ fc->polled_files = RB_ROOT;
+ fc->dev = sb->s_dev;
+ err = bdi_init(&fc->bdi);
+ if (err)
+ goto error_mutex_destroy;
+ if (sb->s_bdev) {
+ err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
+ MAJOR(fc->dev), MINOR(fc->dev));
+ } else {
+ err = bdi_register_dev(&fc->bdi, fc->dev);
}
- return fc;
+ if (err)
+ goto error_bdi_destroy;
+ /*
+ * For a single fuse filesystem use max 1% of dirty +
+ * writeback threshold.
+ *
+ * This gives about 1M of write buffer for memory maps on a
+ * machine with 1G and 10% dirty_ratio, which should be more
+ * than enough.
+ *
+ * Privileged users can raise it by writing to
+ *
+ * /sys/class/bdi/<bdi>/max_ratio
+ */
+ bdi_set_max_ratio(&fc->bdi, 1);
+ fc->reqctr = 0;
+ fc->blocked = 1;
+ fc->attr_version = 1;
+ get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));

-error_bdi_destroy:
+ return 0;
+
+ error_bdi_destroy:
bdi_destroy(&fc->bdi);
-error_kfree:
+ error_mutex_destroy:
mutex_destroy(&fc->inst_mutex);
+ return err;
+}
+EXPORT_SYMBOL_GPL(fuse_conn_init);
+
+static struct fuse_conn *new_conn(struct super_block *sb)
+{
+ struct fuse_conn *fc;
+
+ fc = kmalloc(sizeof(*fc), GFP_KERNEL);
+ if (fc && fuse_conn_init(fc, sb) == 0)
+ return fc;
kfree(fc);
return NULL;
}
--
1.5.4.5

2008-08-28 18:22:12

by Tejun Heo

[permalink] [raw]
Subject: [PATCH 5/5] CUSE: implement CUSE - Character device in Userspace

CUSE enables implementing character devices in userspace. With recent
additions of nonblock, lseek, ioctl and poll support, FUSE already has
most of what's necessary to implement character devices. All CUSE has
to do is bond all those components - FUSE, chardev and the driver
model - nicely.

Due to the number of different objects involved and many ways an
instance can fail, object lifetime rules are a tad bit complex.
Please take a look at the comment on top of fs/fuse/cuse.c for
details.

Other than that, it's mostly straightforward. The client opens
/dev/cuse, kernel starts conversation with CUSE_INIT. The client
tells CUSE which device it wants to create. CUSE creates the device
for the client and the rest works the same way as in a direct IO FUSE
session.

Each CUSE device has a corresponding directory /sys/class/cuse/DEVNAME
(which is symlink to /sys/devices/virtual/class/DEVNAME if
SYSFS_DEPRECATED is turned off) which hosts "waiting" and "abort"
among other things. Those two files have the same meaning as the FUSE
control files.

The only notable feature lacking compared to an in-kernel implementation
is mmap support.

Signed-off-by: Tejun Heo <[email protected]>
---
fs/Kconfig | 10 +
fs/fuse/Makefile | 1 +
fs/fuse/cuse.c | 634 ++++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/cuse.h | 40 ++++
include/linux/fuse.h | 2 +
5 files changed, 687 insertions(+), 0 deletions(-)
create mode 100644 fs/fuse/cuse.c
create mode 100644 include/linux/cuse.h

diff --git a/fs/Kconfig b/fs/Kconfig
index d387358..3da7551 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -648,6 +648,16 @@ config FUSE_FS
If you want to develop a userspace FS, or if you want to use
a filesystem based on FUSE, answer Y or M.

+config CUSE
+ tristate "Character device in Userspace support"
+ depends on FUSE_FS
+ help
+ This FUSE extension allows character devices to be
+ implemented in userspace.
+
+ If you want to develop or use userspace character device
+ based on CUSE, answer Y or M.
+
config GENERIC_ACL
bool
select FS_POSIX_ACL
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 7243706..e95eeb4 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -3,5 +3,6 @@
#

obj-$(CONFIG_FUSE_FS) += fuse.o
+obj-$(CONFIG_CUSE) += cuse.o

fuse-objs := dev.o dir.o file.o inode.o control.o
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
new file mode 100644
index 0000000..23aa995
--- /dev/null
+++ b/fs/fuse/cuse.c
@@ -0,0 +1,634 @@
+/*
+ * CUSE: Character device in Userspace
+ *
+ * Copyright (C) 2008 SUSE Linux Products GmbH
+ * Copyright (C) 2008 Tejun Heo <[email protected]>
+ *
+ * This file is released under the GPLv2.
+ *
+ * CUSE bridges a few objects to implement a character device using
+ * userland backend. The lifetime rules of the involved objects are a
+ * bit complex.
+ *
+ * cuse_conn : contains fuse_conn and serves as bonding structure
+ * channel : file handle connected to the userland CUSE client
+ * cdev : the implemented character device
+ * mnt : vfsmount which serves dentry and inode for cdev
+ * dev : generic device for cdev
+ *
+ * Note that 'channel' is what 'dev' is in FUSE. As CUSE deals with
+ * devices, it's called 'channel' to reduce confusion.
+ *
+ * channel determines when the character device dies. When channel is
+ * closed, everything should begin to destruct. As cuse_conn and mnt
+ * dereference each other unlike FUSE, both should be destructed at
+ * the same time. This is achieved by giving the base reference of
+ * cuse_conn to mnt and never referencing cuse_conn directly, so both
+ * channel and cdev have reference to mnt which in turn has single
+ * reference to cuse_conn.
+ *
+ * On CUSE client disconnect, cuse_channel_release() unregisters dev,
+ * deletes cdev and puts mnt. When the cdev is released, it puts mnt
+ * which in turn puts the cuse_conn on release.
+ *
+ * cuse_conn_get/put() takes cuse_conn and manipulates the reference
+ * count of mnt for convenience.
+ */
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/kdev_t.h>
+#include <linux/kthread.h>
+#include <linux/miscdevice.h>
+#include <linux/cuse.h>
+
+#include "fuse_i.h"
+
+#define CUSE_SUPER_MAGIC 0x43555345
+
+struct cuse_conn { /* per-device bonding structure; allocated in cuse_fill_super(), freed in cuse_fc_release() */
+ struct fuse_conn fc; /* embedded FUSE connection; fc_to_cc() maps back */
+ struct cdev cdev; /* the character device; cdev_to_cc() maps back */
+ struct vfsmount *mnt; /* kern_mount() of cuse_fs, set in cuse_channel_open(); cuse_conn_get/put() act on its refcount */
+ struct device *dev; /* class device; set by cuse_init_worker() under cuse_disconnect_lock */
+ bool cdev_added:1; /* set by cuse_init_worker() under cuse_disconnect_lock */
+ bool disconnected:1; /* channel disconnected; set by cuse_channel_release() under cuse_disconnect_lock (shares a word with cdev_added) */
+ char *uevent_envp[UEVENT_NUM_ENVP + 1]; /* pointers into uevent_env_buf; walked until a NULL entry by cuse_class_dev_uevent() */
+ char *uevent_env_buf; /* copy of the client's hotplug info; freed in cuse_fc_release() */
+};
+
+#define fc_to_cc(_fc) container_of((_fc), struct cuse_conn, fc)
+#define cdev_to_cc(_cdev) container_of((_cdev), struct cuse_conn, cdev)
+
+/*
+ * cuse_conn has no reference count of its own; these helpers take and
+ * drop references on the vfsmount it is bound to.  They are real
+ * functions rather than macros so that the argument is evaluated
+ * exactly once and is type checked.
+ */
+
+/* Grab a reference on @cc (via its mount) and return @cc. */
+static inline struct cuse_conn *cuse_conn_get(struct cuse_conn *cc)
+{
+	mntget(cc->mnt);
+	return cc;
+}
+
+/* Drop a reference previously obtained with cuse_conn_get(). */
+static inline void cuse_conn_put(struct cuse_conn *cc)
+{
+	mntput(cc->mnt);
+}
+
+static struct class *cuse_class; /* device class "cuse", created in cuse_init() */
+static DEFINE_SPINLOCK(cuse_disconnect_lock); /* taken around updates of cuse_conn->disconnected, ->dev and ->cdev_added */
+
+/* llseek on the frontend chardev: forward to the backing file stored by cuse_open(). */
+static loff_t cuse_file_llseek(struct file *file, loff_t offset, int origin)
+{
+	struct file *cfile = file->private_data;
+
+	return fuse_file_llseek(cfile, offset, origin);
+}
+
+/* read on the frontend chardev: direct IO on the backing file (final argument 0). */
+static ssize_t cuse_direct_read(struct file *file, char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	struct file *cfile = file->private_data;
+
+	return fuse_direct_io(cfile, buf, count, ppos, 0);
+}
+
+/*
+ * write on the frontend chardev: direct IO on the backing file (final
+ * argument 1).
+ *
+ * Neither locking nor generic_write_checks() is done here; the CUSE
+ * client is responsible for locking and sanity checks.
+ */
+static ssize_t cuse_direct_write(struct file *file, const char __user *buf,
+				 size_t count, loff_t *ppos)
+{
+	struct file *cfile = file->private_data;
+
+	return fuse_direct_io(cfile, buf, count, ppos, 1);
+}
+
+/*
+ * open on the frontend chardev: open the root dentry of the private
+ * cuse mount with the caller's f_flags and remember the resulting
+ * backing file in file->private_data for the other frontend fops.
+ *
+ * Returns 0 on success or the error encoded in dentry_open()'s result.
+ */
+static int cuse_open(struct inode *inode, struct file *file)
+{
+	struct cuse_conn *cc = cdev_to_cc(inode->i_cdev);
+	struct dentry *root = dget(cc->mnt->mnt_root);
+	struct vfsmount *mnt = mntget(cc->mnt);
+	struct file *backing;
+
+	backing = dentry_open(root, mnt, file->f_flags);
+	if (IS_ERR(backing))
+		return PTR_ERR(backing);
+
+	file->private_data = backing;
+	return 0;
+}
+
+/* flush on the frontend chardev: forward to the backing file. */
+static int cuse_flush(struct file *file, fl_owner_t id)
+{
+	struct file *cfile = file->private_data;
+
+	return fuse_flush(cfile, id);
+}
+
+/* release of the frontend chardev: close the backing file opened by cuse_open(). */
+static int cuse_release(struct inode *inode, struct file *file)
+{
+	struct file *cfile = file->private_data;
+
+	return filp_close(cfile, NULL);
+}
+
+/*
+ * fsync on the frontend chardev: forward to the backing file.
+ *
+ * @de is the dentry of the frontend device node, which does not belong
+ * to the cuse mount.  fuse_fsync() must be handed the dentry that goes
+ * with the backing file instead, so that file and dentry refer to the
+ * same (FUSE) inode.
+ */
+static int cuse_fsync(struct file *file, struct dentry *de, int datasync)
+{
+	struct file *cfile = file->private_data;
+
+	return fuse_fsync(cfile, cfile->f_path.dentry, datasync);
+}
+
+/* poll on the frontend chardev: forward to the backing file. */
+static unsigned cuse_file_poll(struct file *file, poll_table *wait)
+{
+	struct file *cfile = file->private_data;
+
+	return fuse_file_poll(cfile, wait);
+}
+
+/* ioctl on the frontend chardev: forward to the backing file. */
+static long cuse_file_ioctl(struct file *file, unsigned int cmd,
+			    unsigned long arg)
+{
+	struct file *cfile = file->private_data;
+
+	return fuse_file_ioctl(cfile, cmd, arg);
+}
+
+/* compat ioctl on the frontend chardev: forward to the backing file. */
+static long cuse_file_compat_ioctl(struct file *file, unsigned int cmd,
+				   unsigned long arg)
+{
+	struct file *cfile = file->private_data;
+
+	return fuse_file_compat_ioctl(cfile, cmd, arg);
+}
+
+static const struct file_operations cuse_frontend_fops = { /* fops of the user-visible chardev; installed with cdev_init() in cuse_init_worker() */
+ .llseek = cuse_file_llseek,
+ .read = cuse_direct_read,
+ .write = cuse_direct_write,
+ .open = cuse_open, /* stores the backing file in file->private_data */
+ .flush = cuse_flush,
+ .release = cuse_release, /* closes the backing file */
+ .fsync = cuse_fsync,
+ .poll = cuse_file_poll,
+ .unlocked_ioctl = cuse_file_ioctl,
+ .compat_ioctl = cuse_file_compat_ioctl,
+};
+
+/*
+ * fuse_conn->release callback, installed by cuse_fill_super(): free
+ * the enclosing cuse_conn together with its uevent environment buffer.
+ */
+static void cuse_fc_release(struct fuse_conn *fc)
+{
+	struct cuse_conn *conn = fc_to_cc(fc);
+	char *env_buf = conn->uevent_env_buf;
+
+	kfree(env_buf);
+	kfree(conn);
+}
+
+/*
+ * cuse_fill_super - set up the superblock of a private cuse mount
+ * @sb: superblock being filled
+ * @data: mount data (unused)
+ * @silent: unused
+ *
+ * Allocates the cuse_conn, initializes the embedded fuse_conn, hands
+ * the initial connection reference to @sb via s_fs_info and creates
+ * the root inode and dentry.
+ *
+ * Returns 0 on success or a negative errno.  On failure nothing stays
+ * allocated: once the connection has been initialized it is released
+ * with fuse_conn_put() (which ends up in cuse_fc_release()), because
+ * s_root is still unset at that point and the superblock teardown
+ * therefore cannot be relied upon to drop the reference for us.
+ */
+static int cuse_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct cuse_conn *cc;
+	struct dentry *root_dentry;
+	struct inode *root;
+	int rc;
+
+	sb->s_magic = CUSE_SUPER_MAGIC;
+	sb->s_op = &fuse_super_operations;
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+
+	cc = kzalloc(sizeof(*cc), GFP_KERNEL);
+	if (!cc)
+		return -ENOMEM;
+
+	rc = fuse_conn_init(&cc->fc, sb);
+	if (rc) {
+		/* fuse_conn_init() cleaned up after itself, only memory is left */
+		kfree(cc);
+		return rc;
+	}
+
+	/* cuse isn't accessible to mortal users, give it some latitude */
+	cc->fc.flags = FUSE_ALLOW_OTHER;
+	cc->fc.user_id = current->euid;
+	cc->fc.group_id = current->egid;
+	cc->fc.max_read = FUSE_MAX_PAGES_PER_REQ * PAGE_SIZE;
+	cc->fc.release = cuse_fc_release;
+
+	/* the initial cc refcnt now belongs to sb */
+	sb->s_fs_info = &cc->fc;
+
+	rc = -ENOMEM;
+	root = fuse_get_root_inode(sb, S_IFREG);
+	if (!root)
+		goto err_put_conn;
+
+	root_dentry = d_alloc_root(root);
+	if (!root_dentry)
+		goto err_iput;
+
+	sb->s_root = root_dentry;
+
+	return 0;
+
+ err_iput:
+	/* drop the inode while s_fs_info still points at the connection */
+	iput(root);
+ err_put_conn:
+	sb->s_fs_info = NULL;
+	fuse_conn_put(&cc->fc);
+	return rc;
+}
+
+/* get_sb callback of cuse_fs: device-less superblock filled by cuse_fill_super(). */
+static int cuse_get_sb(struct file_system_type *fs_type, int flags,
+		       const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	int err;
+
+	err = get_sb_nodev(fs_type, flags, data, cuse_fill_super, mnt);
+	return err;
+}
+
+static struct file_system_type cuse_fs = { /* internal filesystem; mounted with kern_mount() in cuse_channel_open() */
+ .name = "cuse",
+ .get_sb = cuse_get_sb,
+ .kill_sb = kill_anon_super,
+};
+
+/*
+ * cuse_parse_one - parse one entry of a packed info buffer
+ * @pp: in/out cursor into the buffer
+ * @end: one past the last byte of the buffer
+ * @keyp: out, the key (or the whole entry if @valp is NULL)
+ * @valp: out, the value; may be NULL if entries are not to be split
+ *
+ * The buffer [*@pp, @end) is a sequence of '\0'-terminated strings.
+ * Any '\0' bytes at the cursor are skipped, then one string is
+ * consumed.  If @valp is given, the string is split in place at the
+ * first '=' into key and value (the value is empty if there is no
+ * '=') and both are whitespace-stripped; otherwise the whole string
+ * is stripped and returned as the key.  The buffer is modified.
+ *
+ * On success *@pp is left on the terminator of the consumed string,
+ * which the next call skips.
+ *
+ * Returns 1 if an entry was parsed, 0 if the buffer is exhausted and
+ * -EINVAL if the buffer does not end in '\0' or the key is empty.
+ */
+static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp)
+{
+	char *cur = *pp;
+	char *key;
+	char *val = NULL;
+
+	/* skip terminators left over from the previous entry */
+	for (; cur < end; cur++)
+		if (*cur != '\0')
+			break;
+	if (cur == end)
+		return 0;
+
+	/* without a final terminator strlen() below could run past @end */
+	if (end[-1] != '\0') {
+		printk(KERN_ERR "CUSE: info not properly terminated\n");
+		return -EINVAL;
+	}
+
+	key = cur;
+	cur += strlen(cur);	/* cur now sits on the entry's terminator */
+
+	if (valp) {
+		val = key;
+		strsep(&val, "=");
+		if (!val)
+			val = cur;	/* no '=': empty value */
+		val = strstrip(val);
+	}
+	key = strstrip(key);
+
+	if (*key == '\0') {
+		printk(KERN_ERR "CUSE: zero length info key specified\n");
+		return -EINVAL;
+	}
+
+	*pp = cur;
+	*keyp = key;
+	if (valp)
+		*valp = val;
+
+	return 1;
+}
+
+struct cuse_devinfo { /* result of cuse_parse_devinfo() */
+ const char *name; /* value of the "DEVNAME" key; points into the parsed buffer */
+};
+
+/*
+ * cuse_parse_devinfo - parse the device info sent by the CUSE client
+ * @p: buffer of '\0'-separated "KEY=VALUE" strings (modified in place)
+ * @len: length of the buffer
+ * @devinfo: out, filled from the recognized keys
+ *
+ * Only "DEVNAME" is recognized; other keys are reported with a warning
+ * and ignored.  Returns 0 on success, the error from cuse_parse_one()
+ * on malformed input, or -EINVAL if no non-empty DEVNAME was given.
+ */
+static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo)
+{
+	char *end = p + len;
+	char *key, *val;
+	int rc;
+
+	while ((rc = cuse_parse_one(&p, end, &key, &val)) > 0) {
+		if (strcmp(key, "DEVNAME") != 0) {
+			printk(KERN_WARNING "CUSE: unknown device info \"%s\"\n",
+			       key);
+			continue;
+		}
+		devinfo->name = val;
+	}
+	if (rc < 0)
+		return rc;
+
+	if (devinfo->name == NULL || devinfo->name[0] == '\0') {
+		printk(KERN_ERR "CUSE: DEVNAME unspecified\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * cuse_parse_hotplug_envp - split hotplug info into an envp array
+ * @p: buffer of '\0'-separated strings (modified in place)
+ * @len: length of the buffer
+ * @envp: out, receives one pointer into @p per entry
+ * @max: maximum number of entries to store in @envp
+ *
+ * Entries are stored unsplit (whitespace-stripped).  No terminating
+ * NULL is written to @envp by this function.  Returns 0 on success,
+ * the error from cuse_parse_one() on malformed input, or -ENOMEM if
+ * the buffer holds more than @max entries.
+ */
+static int cuse_parse_hotplug_envp(char *p, size_t len, char **envp, int max)
+{
+	char *end = p + len;
+	char *entry;
+	int nr = 0;
+	int rc;
+
+	while ((rc = cuse_parse_one(&p, end, &entry, NULL)) > 0) {
+		if (nr >= max) {
+			printk(KERN_ERR "CUSE: too many hotplug info entries\n");
+			return -ENOMEM;
+		}
+		envp[nr] = entry;
+		nr++;
+	}
+
+	return rc < 0 ? rc : 0;
+}
+
+/* release callback of the class device allocated in cuse_init_worker(). */
+static void cuse_gendev_release(struct device *dev)
+{
+	struct device *gendev = dev;
+
+	kfree(gendev);
+}
+
+/* cdev release callback: drop the reference cuse_init_worker() took for the cdev. */
+static void cuse_cdev_release(struct cdev *cdev)
+{
+	struct cuse_conn *cc = cdev_to_cc(cdev);
+
+	cuse_conn_put(cc);
+}
+
+/*
+ * cuse_init_worker - kthread body completing the setup of a CUSE device
+ * @data: the cuse_conn, carrying the cuse_conn_get() reference taken by
+ *	  cuse_channel_open(); it is dropped before returning
+ *
+ * This runs as a kernel thread started from cuse_channel_open()
+ * because the remaining setup requires conversing with the userland
+ * client over the channel.  It
+ *
+ *  1. sends CUSE_INIT and waits for the reply, which carries the
+ *     requested major/minor followed by device info and hotplug info,
+ *  2. parses the device info (DEVNAME) and keeps a parsed copy of the
+ *     hotplug info in @cc for uevents,
+ *  3. registers the chrdev region, the class device and the cdev,
+ *  4. hands the device and cdev over to the channel under
+ *     cuse_disconnect_lock, unless the channel has been closed in the
+ *     meantime, in which case everything is torn down again.
+ *
+ * Returns 0 on success or a negative errno; on failure the connection
+ * is aborted with fuse_abort_conn().
+ */
+static int cuse_init_worker(void *data)
+{
+	struct cuse_conn *cc = data;
+	struct cuse_init_in iin = { };
+	struct cuse_init_out iout = { };
+	struct cuse_devinfo devinfo = { };
+	struct fuse_req *req;
+	struct page *page = NULL;
+	struct device *dev;
+	bool disconnected;
+	dev_t devt;
+	int rc;
+
+	BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE);
+
+	/* identify ourself and query what the CUSE client wants */
+	req = fuse_get_req(&cc->fc);
+	if (IS_ERR(req)) {
+		rc = PTR_ERR(req);
+		goto out;
+	}
+
+	/* two zeroed pages: room for device info plus hotplug info */
+	rc = -ENOMEM;
+	page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 1);
+	if (!page)
+		goto out;
+
+	req->pages[0] = nth_page(page, 0);
+	req->pages[1] = nth_page(page, 1);
+	req->num_pages = 2;
+
+	req->in.h.opcode = CUSE_INIT;
+	req->in.h.nodeid = get_node_id(cc->mnt->mnt_sb->s_root->d_inode);
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(iin);
+	req->in.args[0].value = &iin;
+
+	iin.ver_major = CUSE_KERNEL_VERSION;
+	iin.ver_minor = CUSE_KERNEL_MINOR_VERSION;
+
+	req->out.numargs = 2;
+	req->out.args[0].size = sizeof(iout);
+	req->out.args[0].value = &iout;
+	req->out.args[1].size = 2 * CUSE_INIT_INFO_MAX;
+	req->out.argpages = 1;
+	req->out.argvar = 1;
+
+	fuse_request_send(&cc->fc, req);
+	rc = req->out.h.error;
+	if (rc)
+		goto out;
+
+	rc = -EOVERFLOW;
+	if (iout.dev_info_len > CUSE_INIT_INFO_MAX ||
+	    iout.hotplug_info_len > CUSE_INIT_INFO_MAX)
+		goto out;
+
+	rc = cuse_parse_devinfo(page_address(page), iout.dev_info_len,
+				&devinfo);
+	if (rc)
+		goto out;
+
+	/* hotplug info is also used during device release, copy and parse */
+	rc = -ENOMEM;
+	cc->uevent_env_buf = kmalloc(iout.hotplug_info_len, GFP_KERNEL);
+	if (!cc->uevent_env_buf)
+		goto out;
+
+	/* the hotplug info directly follows the device info in the reply */
+	memcpy(cc->uevent_env_buf, page_address(page) + iout.dev_info_len,
+	       iout.hotplug_info_len);
+
+	rc = cuse_parse_hotplug_envp(cc->uevent_env_buf, iout.hotplug_info_len,
+				     cc->uevent_envp, UEVENT_NUM_ENVP);
+	if (rc)
+		goto out;
+
+	/* a zero major asks for a dynamically allocated one */
+	devt = MKDEV(iout.dev_major, iout.dev_minor);
+	if (!MAJOR(devt))
+		rc = alloc_chrdev_region(&devt, MINOR(devt), 1, devinfo.name);
+	else
+		rc = register_chrdev_region(devt, 1, devinfo.name);
+	if (rc) {
+		printk(KERN_ERR "CUSE: failed to register chrdev region\n");
+		goto out;
+	}
+
+	/* We now have MAJ, MIN and name. Let's create the device */
+	rc = -ENOMEM;
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev)
+		goto out_unregister_chrdev_region;
+	device_initialize(dev);
+	dev->class = cuse_class;
+	dev->devt = devt;
+	dev->release = cuse_gendev_release;
+	dev_set_drvdata(dev, cc);
+	dev_set_name(dev, "%s", devinfo.name);
+
+	/* if device_add() fails the device only needs to be put */
+	rc = device_add(dev);
+	if (rc)
+		goto out_put_device;
+
+	/* register cdev */
+	cdev_init(&cc->cdev, &cuse_frontend_fops);
+	cc->cdev.owner = THIS_MODULE;
+	cc->cdev.release = cuse_cdev_release;
+	kobject_set_name(&cc->cdev.kobj, "%s", devinfo.name);
+
+	/* from here on the device is registered and must be deleted too */
+	rc = cdev_add(&cc->cdev, devt, 1);
+	if (rc)
+		goto out_device_del;
+	cuse_conn_get(cc);	/* will be released on cdev final put */
+
+	/* transfer dev and cdev ownership to channel */
+	spin_lock(&cuse_disconnect_lock);
+	disconnected = cc->disconnected;
+	if (!disconnected) {
+		cc->dev = dev;
+		cc->cdev_added = true;
+	}
+	spin_unlock(&cuse_disconnect_lock);
+
+	if (disconnected) {
+		/* channel went away under us, don't report success */
+		rc = -ENOTCONN;
+		goto out_cdev_del;
+	}
+
+	rc = 0;
+	goto out;
+
+ out_cdev_del:
+	cdev_del(&cc->cdev);
+ out_device_del:
+	device_del(dev);
+ out_put_device:
+	put_device(dev);
+ out_unregister_chrdev_region:
+	unregister_chrdev_region(devt, 1);
+ out:
+	if (!IS_ERR(req))
+		fuse_put_request(&cc->fc, req);
+	if (page)
+		__free_pages(page, 1);
+
+	if (rc)
+		fuse_abort_conn(&cc->fc);
+
+	cuse_conn_put(cc);
+	return rc;
+}
+
+/*
+ * cuse_channel_open - open of /dev/cuse by a CUSE client
+ *
+ * Creates a private cuse mount (and with it the cuse_conn, see
+ * cuse_fill_super()), attaches the connection to @file, queues the
+ * FUSE init request and starts cuse_init_worker() in a kthread to
+ * carry out the rest of the initialization.
+ *
+ * Returns 0 on success or a negative errno.  On failure every
+ * reference taken here is dropped again; @file itself is left to the
+ * caller, as ->release is not invoked for a failed open.
+ */
+static int cuse_channel_open(struct inode *inode, struct file *file)
+{
+	struct cuse_conn *cc;
+	struct vfsmount *mnt;
+	struct fuse_req *init_req;
+	struct task_struct *worker;
+	int rc;
+
+	/* Set up cuse_conn.  cuse_conn will be created when filling
+	 * in superblock for the following kern_mount().
+	 */
+	mnt = kern_mount(&cuse_fs);
+	if (IS_ERR(mnt))
+		return PTR_ERR(mnt);
+
+	cc = fc_to_cc(get_fuse_conn_super(mnt->mnt_sb));
+	cc->mnt = mnt;
+
+	/* let's send fuse init request */
+	rc = -ENOMEM;
+	init_req = fuse_request_alloc();
+	if (!init_req)
+		goto err_cc_put;
+
+	cc->fc.connected = 1;
+	file->private_data = fuse_conn_get(&cc->fc);
+	fuse_send_init(&cc->fc, init_req);
+
+	/* Okay, FUSE part of initialization is complete.  The rest of
+	 * the initialization is a bit more involved and requires
+	 * conversing with userland.  Start a kthread.
+	 */
+	worker = kthread_run(cuse_init_worker, cuse_conn_get(cc),
+			     "cuse-init-pid%d", current->pid);
+	if (IS_ERR(worker)) {
+		rc = PTR_ERR(worker);
+		goto err_abort;
+	}
+
+	return 0;
+
+ err_abort:
+	/* end the already queued init request */
+	fuse_abort_conn(&cc->fc);
+	/* undo fuse_conn_get() done for file->private_data */
+	file->private_data = NULL;
+	fuse_conn_put(&cc->fc);
+	/* the worker never ran, drop the reference taken on its behalf */
+	cuse_conn_put(cc);
+ err_cc_put:
+	/* drop the base reference obtained from kern_mount() */
+	cuse_conn_put(cc);
+	return rc;
+}
+
+/*
+ * cuse_channel_release - the CUSE client closed the channel
+ *
+ * Marks the connection disconnected (so that a still running
+ * cuse_init_worker() will not hand a device over anymore), releases
+ * the FUSE device side, then unregisters whatever the worker managed
+ * to register and drops the channel's reference on the connection.
+ *
+ * Returns 0, or the error from fuse_dev_release().
+ */
+static int cuse_channel_release(struct inode *inode, struct file *file)
+{
+	struct cuse_conn *cc = fc_to_cc(file->private_data);
+	int err;
+
+	spin_lock(&cuse_disconnect_lock);
+	cc->disconnected = true;
+	spin_unlock(&cuse_disconnect_lock);
+
+	err = fuse_dev_release(inode, file);
+	if (err != 0)
+		return err;
+
+	if (cc->dev != NULL)
+		device_unregister(cc->dev);
+
+	if (cc->cdev_added) {
+		unregister_chrdev_region(cc->cdev.dev, 1);
+		cdev_del(&cc->cdev);
+	}
+
+	cuse_conn_put(cc);
+	return 0;
+}
+
+static struct file_operations cuse_channel_fops; /* initialized during init */
+
+/*
+ * dev_uevent callback of the cuse class: add every string the client
+ * supplied as hotplug info to the uevent environment.  Stops at the
+ * first NULL entry; returns 0 or the first add_uevent_var() error.
+ */
+static int cuse_class_dev_uevent(struct device *dev,
+				 struct kobj_uevent_env *env)
+{
+	struct cuse_conn *cc = dev_get_drvdata(dev);
+	char **envp;
+
+	for (envp = cc->uevent_envp; *envp; envp++) {
+		int err = add_uevent_var(env, "%s", *envp);
+
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+/*
+ * sysfs "waiting" attribute: print the connection's num_waiting
+ * counter.  Only referenced from cuse_class_dev_attrs[], hence static.
+ */
+static ssize_t cuse_class_waiting_show(struct device *dev,
+				       struct device_attribute *attr,
+				       char *buf)
+{
+	struct cuse_conn *cc = dev_get_drvdata(dev);
+
+	return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting));
+}
+
+/*
+ * sysfs "abort" attribute: any write aborts the connection.  The
+ * written data is ignored and reported as fully consumed.  Only
+ * referenced from cuse_class_dev_attrs[], hence static.
+ */
+static ssize_t cuse_class_abort_store(struct device *dev,
+				      struct device_attribute *attr,
+				      const char *buf, size_t count)
+{
+	struct cuse_conn *cc = dev_get_drvdata(dev);
+
+	fuse_abort_conn(&cc->fc);
+	return count;
+}
+
+static struct device_attribute cuse_class_dev_attrs[] = { /* installed as cuse_class->dev_attrs in cuse_init() */
+ __ATTR(waiting, S_IFREG | 0400, cuse_class_waiting_show, NULL), /* read-only */
+ __ATTR(abort, S_IFREG | 0200, NULL, cuse_class_abort_store), /* write-only */
+ { } /* terminator */
+};
+
+static struct miscdevice cuse_miscdev = { /* the "cuse" channel device opened by CUSE clients */
+ .minor = MISC_DYNAMIC_MINOR, /* use dynamic for now */
+ .name = "cuse",
+ .fops = &cuse_channel_fops, /* filled in by cuse_init() before registration */
+};
+
+/*
+ * Module init: build the channel fops from fuse_dev_operations, create
+ * the "cuse" class, register the channel misc device and finally the
+ * cuse filesystem type.  Anything already set up is undone on failure.
+ */
+static int __init cuse_init(void)
+{
+	int err;
+
+	/* start from fuse_dev_operations and override what differs */
+	cuse_channel_fops = fuse_dev_operations;
+	cuse_channel_fops.owner = THIS_MODULE;
+	cuse_channel_fops.open = cuse_channel_open;
+	cuse_channel_fops.release = cuse_channel_release;
+
+	cuse_class = class_create(THIS_MODULE, "cuse");
+	if (IS_ERR(cuse_class))
+		return PTR_ERR(cuse_class);
+	cuse_class->dev_uevent = cuse_class_dev_uevent;
+	cuse_class->dev_attrs = cuse_class_dev_attrs;
+
+	err = misc_register(&cuse_miscdev);
+	if (err)
+		goto err_class_destroy;
+
+	err = register_filesystem(&cuse_fs);
+	if (!err)
+		return 0;
+
+	misc_deregister(&cuse_miscdev);
+ err_class_destroy:
+	class_destroy(cuse_class);
+	return err;
+}
+
+static void __exit cuse_exit(void) /* module exit: undo cuse_init() in reverse order */
+{
+ unregister_filesystem(&cuse_fs);
+ misc_deregister(&cuse_miscdev);
+ class_destroy(cuse_class);
+}
+
+module_init(cuse_init);
+module_exit(cuse_exit);
+
+MODULE_AUTHOR("Tejun Heo <[email protected]>");
+MODULE_DESCRIPTION("Character device in Userspace");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/cuse.h b/include/linux/cuse.h
new file mode 100644
index 0000000..e875723
--- /dev/null
+++ b/include/linux/cuse.h
@@ -0,0 +1,40 @@
+/*
+ * CUSE: Character device in Userspace
+ * Copyright (C) 2008 SUSE Linux Products GmbH
+ * Copyright (C) 2008 Tejun Heo <[email protected]>
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef _CUSE_H_
+#define _CUSE_H_
+
+#include <linux/major.h>
+#include <linux/miscdevice.h>
+#include <linux/fuse.h>
+
+#define CUSE_KERNEL_VERSION 0
+#define CUSE_KERNEL_MINOR_VERSION 1
+
+#define CUSE_KERNEL_MAJOR MISC_MAJOR
+#define CUSE_KERNEL_MINOR MISC_DYNAMIC_MINOR
+
+#define CUSE_INIT_INFO_MAX 4096
+
+enum cuse_opcode { /* CUSE-specific opcodes, numbered from CUSE_BASE */
+ CUSE_INIT = CUSE_BASE, /* sent by the kernel to ask the client which device to create */
+};
+
+struct cuse_init_in { /* CUSE_INIT request payload (kernel to client) */
+ __u32 ver_major; /* kernel sends CUSE_KERNEL_VERSION */
+ __u32 ver_minor; /* kernel sends CUSE_KERNEL_MINOR_VERSION */
+};
+
+struct cuse_init_out { /* CUSE_INIT reply header; followed by device info, then hotplug info */
+ __u32 dev_major; /* chardev major */ /* 0 makes the kernel allocate one dynamically */
+ __u32 dev_minor; /* chardev minor */
+ __u32 dev_info_len; /* device info */ /* length in bytes, at most CUSE_INIT_INFO_MAX */
+ __u32 hotplug_info_len; /* uevent envs */ /* length in bytes, at most CUSE_INIT_INFO_MAX */
+};
+
+#endif /*_CUSE_H_*/
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index b772b4a..e55c2f2 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -212,6 +212,8 @@ enum fuse_opcode {
FUSE_LSEEK = 39,
FUSE_IOCTL = 40,
FUSE_POLL = 41,
+
+ CUSE_BASE = 4096,
};

enum fuse_notify_code {
--
1.5.4.5

2008-08-28 20:08:20

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 5/5] CUSE: implement CUSE - Character device in Userspace

On Fri, 29 Aug 2008 03:19:04 +0900
Tejun Heo <[email protected]> wrote:

> CUSE enables implementing character devices in userspace. With recent
> additions of nonblock, lseek, ioctl and poll support, FUSE already has
> most of what's necessary to implement character devices. All CUSE has
> to do is bonding all those components - FUSE, chardev and the driver
> model - nicely.
>
> Due to the number of different objects involved and many ways an
> instance can fail, object lifetime rules are a tad bit complex.
> Please take a look at the comment on top of fs/fuse/cuse.c for
> details.
>
> Other than that, it's mostly straight forward. Client opens
> /dev/cuse, kernel starts conversation with CUSE_INIT. The client
> tells CUSE which device it wants to create. CUSE creates the device
> for the client and the rest works the same way as in a direct IO FUSE
> session.
>
> Each CUSE device has a corresponding directory /sys/class/cuse/DEVNAME
> (which is symlink to /sys/devices/virtual/class/DEVNAME if
> SYSFS_DEPRECATED is turned off) which hosts "waiting" and "abort"
> among other things. Those two files have the same meaning as the FUSE
> control files.
>
> The only notable lacking feature compared to in-kernel implementation
> is mmap support.
>
> ...
>
> +config CUSE
> + tristate "Character device in Userpace support"
> + depends on FUSE_FS

Will this work (usefully) if CONFIG_SYSFS=n?

>
> ...
>
> +#include <linux/fs.h>
> +#include <linux/file.h>
> +#include <linux/stat.h>
> +#include <linux/cdev.h>
> +#include <linux/device.h>
> +#include <linux/kdev_t.h>
> +#include <linux/kthread.h>
> +#include <linux/miscdevice.h>
> +#include <linux/cuse.h>
> +
> +#include "fuse_i.h"
> +
> +#define CUSE_SUPER_MAGIC 0x43555345

Put in include/linux/magic.h?

> +struct cuse_conn {
> + struct fuse_conn fc;
> + struct cdev cdev;
> + struct vfsmount *mnt;
> + struct device *dev;
> + bool cdev_added:1;
> + bool disconnected:1; /* channel disconnected */

I didn't know you could do that with bools.

These two fields will share a word, and modifications of one are racy
wrt modifications of the other. So some form of locking is needed, and
a comment describing that locking here would be beneficial.

> + char *uevent_envp[UEVENT_NUM_ENVP + 1];
> + char *uevent_env_buf;
> +};
> +
> +#define fc_to_cc(_fc) container_of((_fc), struct cuse_conn, fc)
> +#define cdev_to_cc(_cdev) container_of((_cdev), struct cuse_conn, cdev)
> +#define cuse_conn_get(cc) ({mntget((cc)->mnt); cc;})
> +#define cuse_conn_put(cc) mntput((cc)->mnt)

I believe all the above could be implemented in C.

Making that change would fix the bug in cuse_conn_get(), which
references its arg twice.

>
> ...
>
> +static int cuse_fill_super(struct super_block *sb, void *data, int silent)
> +{
> + struct cuse_conn *cc = NULL;

this initialisation wasn't needed.

> + struct dentry *root_dentry = NULL;
> + struct inode *root = NULL;
> + int rc;
> +
> + sb->s_magic = CUSE_SUPER_MAGIC;
> + sb->s_op = &fuse_super_operations;
> + sb->s_maxbytes = MAX_LFS_FILESIZE;
> +
> + cc = kzalloc(sizeof(*cc), GFP_KERNEL);
> + if (!cc)
> + goto err_nomem;
> + rc = fuse_conn_init(&cc->fc, sb);
> + if (rc)
> + goto err;
> +
> + /* cuse isn't accessible to mortal users, give it some latitude */
> + cc->fc.flags = FUSE_ALLOW_OTHER;
> + cc->fc.user_id = current->euid;
> + cc->fc.group_id = current->egid;
> + cc->fc.max_read = FUSE_MAX_PAGES_PER_REQ * PAGE_SIZE;
> + cc->fc.release = cuse_fc_release;
> +
> + /* transfer the initial cc refcnt to sb */
> + sb->s_fs_info = &cc->fc;
> + cc = NULL;
> +
> + root = fuse_get_root_inode(sb, S_IFREG);
> + if (!root)
> + goto err_nomem;
> +
> + root_dentry = d_alloc_root(root);
> + if (!root_dentry)
> + goto err_nomem;
> +
> + sb->s_root = root_dentry;
> +
> + return 0;
> +
> + err_nomem:
> + rc = -ENOMEM;
> + err:
> + if (root_dentry)
> + dput(root_dentry);
> + else if (root)
> + iput(root);
> + kfree(cc);
> + return rc;
> +}
> +
>
> ...
>
> +static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp)
> +{
> + char *p = *pp;
> + char *key, *val;
> +
> + while (p < end && *p == '\0')
> + p++;
> + if (p == end)
> + return 0;
> +
> + if (end[-1] != '\0') {
> + printk(KERN_ERR "CUSE: info not properly terminated\n");
> + return -EINVAL;
> + }
> +
> + key = val = p;
> + p += strlen(p);
> +
> + if (valp) {
> + strsep(&val, "=");
> + if (!val)
> + val = key + strlen(key);
> + key = strstrip(key);
> + val = strstrip(val);
> + } else
> + key = strstrip(key);
> +
> + if (!strlen(key)) {
> + printk(KERN_ERR "CUSE: zero length info key specified\n");
> + return -EINVAL;
> + }
> +
> + *pp = p;
> + *keyp = key;
> + if (valp)
> + *valp = val;
> +
> + return 1;
> +}

OK, I have NFI whatsoever what this thing is doing and I am disinclined
to reverse-engineer it.

If this is parsing something which operators/users provided then it
should have been documented somewhere?

Whether it is or isn't doing that, this function really really really
needs a comment telling readers (ie: me) what it does. Or what it
tries to do, anyway.

> +struct cuse_devinfo {
> + const char *name;
> +};
> +
> +static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo)
> +{
> + char *end = p + len;
> + char *key, *val;
> + int rc;
> +
> + while (true) {
> + rc = cuse_parse_one(&p, end, &key, &val);
> + if (rc < 0)
> + return rc;
> + if (!rc)
> + break;
> + if (strcmp(key, "DEVNAME") == 0)
> + devinfo->name = val;
> + else
> + printk(KERN_WARNING "CUSE: unknown device info \"%s\"\n",
> + key);
> + }
> +
> + if (!devinfo->name || !strlen(devinfo->name)) {
> + printk(KERN_ERR "CUSE: DEVNAME unspecified\n");
> + return -EINVAL;
> + }
> +
> + return 0;
> +}
> +
>
> ...
>
> +static int cuse_init_worker(void *data)

This function seems to be woefully misnamed. The name implies that it
initialises a worker. But it's a kernel thread?

Could you please document your design somehow? What does this kernel
thread do? Why does it exist? etc.

> +{
> + struct cuse_conn *cc = data;
> + struct cuse_init_in iin = { };
> + struct cuse_init_out iout = { };
> + struct cuse_devinfo devinfo = { };
> + struct fuse_req *req;
> + struct page *page = NULL;
> + struct device *dev;
> + bool disconnected;
> + dev_t devt;
> + int rc;
> +
> + BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE);
> +
> + /* identify ourself and query what the CUSE client wants */
> + req = fuse_get_req(&cc->fc);
> + if (IS_ERR(req)) {
> + rc = PTR_ERR(req);
> + goto out;
> + }
> +
> + rc = -ENOMEM;
> + page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 1);
> + if (!page)
> + goto out;
> +
> + req->pages[0] = nth_page(page, 0);
> + req->pages[1] = nth_page(page, 1);
> + req->num_pages = 2;
> +
> + req->in.h.opcode = CUSE_INIT;
> + req->in.h.nodeid = get_node_id(cc->mnt->mnt_sb->s_root->d_inode);
> + req->in.numargs = 1;
> + req->in.args[0].size = sizeof(iin);
> + req->in.args[0].value = &iin;
> +
> + iin.ver_major = CUSE_KERNEL_VERSION;
> + iin.ver_minor = CUSE_KERNEL_MINOR_VERSION;
> +
> + req->out.numargs = 2;
> + req->out.args[0].size = sizeof(iout);
> + req->out.args[0].value = &iout;
> + req->out.args[1].size = 2 * CUSE_INIT_INFO_MAX;
> + req->out.argpages = 1;
> + req->out.argvar = 1;
> +
> + fuse_request_send(&cc->fc, req);
> + rc = req->out.h.error;
> + if (rc)
> + goto out;
> +
> + rc = -EOVERFLOW;
> + if (iout.dev_info_len > CUSE_INIT_INFO_MAX ||
> + iout.hotplug_info_len > CUSE_INIT_INFO_MAX)
> + goto out;
> +
> + rc = cuse_parse_devinfo(page_address(page), iout.dev_info_len,
> + &devinfo);
> + if (rc)
> + goto out;
> +
> + /* hotplug info is also used during device release, copy and parse */

hotplug? What's all this? Seems to have something to do with an
as-yet-undescribed relationship with udev?

> + rc = -ENOMEM;
> + cc->uevent_env_buf = kmalloc(iout.hotplug_info_len, GFP_KERNEL);
> + if (!cc->uevent_env_buf)
> + goto out;
> +
> + memcpy(cc->uevent_env_buf, page_address(page) + iout.dev_info_len,
> + iout.hotplug_info_len);
> +
> + rc = cuse_parse_hotplug_envp(cc->uevent_env_buf, iout.hotplug_info_len,
> + cc->uevent_envp, UEVENT_NUM_ENVP);
> + if (rc)
> + goto out;
> +
> + devt = MKDEV(iout.dev_major, iout.dev_minor);
> + if (!MAJOR(devt))
> + rc = alloc_chrdev_region(&devt, MINOR(devt), 1, devinfo.name);
> + else
> + rc = register_chrdev_region(devt, 1, devinfo.name);
> + if (rc) {
> + printk(KERN_ERR "CUSE: failed to register chrdev region\n");
> + goto out;
> + }
> +
> + /* We now have MAJ, MIN and name. Let's create the device */
> + rc = -ENOMEM;
> + dev = kzalloc(sizeof(*dev), GFP_KERNEL);
> + if (!dev)
> + goto out_unregister_chrdev_region;
> + device_initialize(dev);
> + dev->class = cuse_class;
> + dev->devt = devt;
> + dev->release = cuse_gendev_release;
> + dev_set_drvdata(dev, cc);
> + dev_set_name(dev, "%s", devinfo.name);
> +
> + rc = device_add(dev);
> + if (rc)
> + goto out_put_device;
> +
> + /* register cdev */
> + cdev_init(&cc->cdev, &cuse_frontend_fops);
> + cc->cdev.owner = THIS_MODULE;
> + cc->cdev.release = cuse_cdev_release;
> + kobject_set_name(&cc->cdev.kobj, "%s", devinfo.name);
> +
> + rc = cdev_add(&cc->cdev, devt, 1);
> + if (rc)
> + goto out_put_device;
> + cuse_conn_get(cc); /* will be released on cdev final put */
> +
> + /* transfer dev and cdev ownership to channel */
> + spin_lock(&cuse_disconnect_lock);
> + disconnected = cc->disconnected;
> + if (!disconnected) {
> + cc->dev = dev;
> + cc->cdev_added = true;
> + }
> + spin_unlock(&cuse_disconnect_lock);
> +
> + if (disconnected)
> + goto out_cdev_del;
> +
> + rc = 0;
> + goto out;
> +
> + out_cdev_del:
> + cdev_del(&cc->cdev);
> + out_put_device:
> + put_device(dev);
> + out_unregister_chrdev_region:
> + unregister_chrdev_region(devt, 1);
> + out:
> + if (!IS_ERR(req))
> + fuse_put_request(&cc->fc, req);
> + if (page)
> + __free_pages(page, 1);
> +
> + if (rc)
> + fuse_abort_conn(&cc->fc);
> +
> + cuse_conn_put(cc);
> + return rc;
> +}

So... basically this undocumented kernel thread will for undocumented
reasons create the device node?

An obvious question which the reader of the code will ask is "why
wasn't that done synchronously"?

> +static int cuse_channel_open(struct inode *inode, struct file *file)
> +{
> + struct cuse_conn *cc;
> + struct vfsmount *mnt;
> + struct fuse_req *init_req;
> + struct task_struct *worker;
> + int rc;
> +
> + /* Set up cuse_conn. cuse_conn will be created when filling
> + * in superblock for the following kern_mount().
> + */
> + mnt = kern_mount(&cuse_fs);
> + if (IS_ERR(mnt))
> + return PTR_ERR(mnt);
> +
> + cc = fc_to_cc(get_fuse_conn_super(mnt->mnt_sb));
> + cc->mnt = mnt;
> +
> + /* let's send fuse init request */
> + rc = -ENOMEM;
> + init_req = fuse_request_alloc();
> + if (!init_req)
> + goto err_cc_put;
> +
> + cc->fc.connected = 1;
> + file->private_data = fuse_conn_get(&cc->fc);
> + fuse_send_init(&cc->fc, init_req);
> +
> + /* Okay, FUSE part of initialization is complete. The rest of
> + * the initialization is a bit more involved and requires
> + * conversing with userland. Start a kthread.
> + */
> + worker = kthread_run(cuse_init_worker, cuse_conn_get(cc),
> + "cuse-init-pid%d", current->pid);

current->pid is non-unique in a containerised setup. What are the
implications of this? It needs a comment, because the containerisation
guys will end up coming here and scratching their heads over the same
question.

> + if (IS_ERR(worker)) {
> + fput(file);
> + rc = PTR_ERR(worker);
> + goto err_cc_put;
> + }
> +
> + return 0;
> +
> + err_cc_put:
> + cuse_conn_put(cc);
> + return rc;
> +}
> +
> +static int cuse_channel_release(struct inode *inode, struct file *file)
> +{
> + struct cuse_conn *cc = fc_to_cc(file->private_data);
> + int rc;
> +
> + spin_lock(&cuse_disconnect_lock);
> + cc->disconnected = true;
> + spin_unlock(&cuse_disconnect_lock);
> +
> + rc = fuse_dev_release(inode, file);
> + if (rc)
> + return rc;
> +
> + if (cc->dev)
> + device_unregister(cc->dev);
> + if (cc->cdev_added) {
> + unregister_chrdev_region(cc->cdev.dev, 1);
> + cdev_del(&cc->cdev);
> + }
> + cuse_conn_put(cc);
> +
> + return 0;
> +}
> +
> +static struct file_operations cuse_channel_fops; /* initialized during init */

This reader is wondering what a "channel" is. Understanding high-level
concepts like this is important for understanding the implementation.

> +
> +static int cuse_class_dev_uevent(struct device *dev,
> + struct kobj_uevent_env *env)
> +{
> + struct cuse_conn *cc = dev_get_drvdata(dev);
> + int i, rc;
> +
> + for (i = 0; cc->uevent_envp[i]; i++) {
> + rc = add_uevent_var(env, "%s", cc->uevent_envp[i]);
> + if (rc)
> + return rc;
> + }
> + return 0;
> +}
> +
> +ssize_t cuse_class_waiting_show(struct device *dev,
> + struct device_attribute *attr, char *buf)
> +{
> + struct cuse_conn *cc = dev_get_drvdata(dev);
> +
> + return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting));
> +}

<looks in fuse_i.h>

"The number of requests waiting for completion".

Why did you choose to present this particular field?

> +ssize_t cuse_class_abort_store(struct device *dev,
> + struct device_attribute *attr,
> + const char *buf, size_t count)
> +{
> + struct cuse_conn *cc = dev_get_drvdata(dev);
> +
> + fuse_abort_conn(&cc->fc);
> + return count;
> +}
>
> ...
>
> +static int __init cuse_init(void)
> +{
> + int rc;
> +
> + /* inherit and extend fuse_dev_operations */
> + cuse_channel_fops = fuse_dev_operations;
> + cuse_channel_fops.owner = THIS_MODULE;
> + cuse_channel_fops.open = cuse_channel_open;
> + cuse_channel_fops.release = cuse_channel_release;

Can't these initialisations be performed at compile-time?

> + cuse_class = class_create(THIS_MODULE, "cuse");
> + if (IS_ERR(cuse_class))
> + return PTR_ERR(cuse_class);
> + cuse_class->dev_uevent = cuse_class_dev_uevent;
> + cuse_class->dev_attrs = cuse_class_dev_attrs;
> +
> + rc = misc_register(&cuse_miscdev);
> + if (rc)
> + goto destroy_class;
> + rc = register_filesystem(&cuse_fs);
> + if (rc)
> + goto misc_deregister;
> + return 0;
> +
> + misc_deregister:
> + misc_deregister(&cuse_miscdev);
> + destroy_class:
> + class_destroy(cuse_class);
> + return rc;
> +}
> +
> +static void __exit cuse_exit(void)
> +{
> + unregister_filesystem(&cuse_fs);
> + misc_deregister(&cuse_miscdev);
> + class_destroy(cuse_class);
> +}
> +
> +module_init(cuse_init);
> +module_exit(cuse_exit);
> +
> +MODULE_AUTHOR("Tejun Heo <[email protected]>");
> +MODULE_DESCRIPTION("Character device in Userspace");
> +MODULE_LICENSE("GPL");
> diff --git a/include/linux/cuse.h b/include/linux/cuse.h
> new file mode 100644
> index 0000000..e875723
> --- /dev/null
> +++ b/include/linux/cuse.h
> @@ -0,0 +1,40 @@
> +/*
> + * CUSE: Character device in Userspace
> + * Copyright (C) 2008 SUSE Linux Products GmbH
> + * Copyright (C) 2008 Tejun Heo <[email protected]>
> + *
> + * This file is released under the GPL.
> + */
> +
> +#ifndef _CUSE_H_
> +#define _CUSE_H_
> +
> +#include <linux/major.h>
> +#include <linux/miscdevice.h>
> +#include <linux/fuse.h>
> +
> +#define CUSE_KERNEL_VERSION 0
> +#define CUSE_KERNEL_MINOR_VERSION 1

Some description of the kernel<->userspace versioning design would be
appropriate. Is it bi-directional?

> +#define CUSE_KERNEL_MAJOR MISC_MAJOR
> +#define CUSE_KERNEL_MINOR MISC_DYNAMIC_MINOR
> +
> +#define CUSE_INIT_INFO_MAX 4096
> +
> +enum cuse_opcode {
> + CUSE_INIT = CUSE_BASE,
> +};
> +
> +struct cuse_init_in {
> + __u32 ver_major;
> + __u32 ver_minor;
> +};
> +
> +struct cuse_init_out {
> + __u32 dev_major; /* chardev major */
> + __u32 dev_minor; /* chardev minor */
> + __u32 dev_info_len; /* device info */
> + __u32 hotplug_info_len; /* uevent envs */
> +};
> +
> +#endif /*_CUSE_H_*/
> diff --git a/include/linux/fuse.h b/include/linux/fuse.h
> index b772b4a..e55c2f2 100644
> --- a/include/linux/fuse.h
> +++ b/include/linux/fuse.h
> @@ -212,6 +212,8 @@ enum fuse_opcode {
> FUSE_LSEEK = 39,
> FUSE_IOCTL = 40,
> FUSE_POLL = 41,
> +
> + CUSE_BASE = 4096,
> };
>
> enum fuse_notify_code {

Nice-looking code, but I do not feel able to properly review it with
its current level of description.

2008-08-28 22:21:33

by Greg KH

[permalink] [raw]
Subject: Re: [PATCH 5/5] CUSE: implement CUSE - Character device in Userspace

On Thu, Aug 28, 2008 at 01:07:40PM -0700, Andrew Morton wrote:
> On Fri, 29 Aug 2008 03:19:04 +0900
> Tejun Heo <[email protected]> wrote:
> > +#define fc_to_cc(_fc) container_of((_fc), struct cuse_conn, fc)
> > +#define cdev_to_cc(_cdev) container_of((_cdev), struct cuse_conn, cdev)
> > +#define cuse_conn_get(cc) ({mntget((cc)->mnt); cc;})
> > +#define cuse_conn_put(cc) mntput((cc)->mnt)
>
> I believe all the above could be implemented in C.

"traditionally" container_of() is used in #define, not a function call
as it is just pointer math that can be done at compile time.

thanks,

greg k-h

2008-08-28 22:33:18

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 5/5] CUSE: implement CUSE - Character device in Userspace

On Thu, 28 Aug 2008 15:15:25 -0700
Greg KH <[email protected]> wrote:

> On Thu, Aug 28, 2008 at 01:07:40PM -0700, Andrew Morton wrote:
> > On Fri, 29 Aug 2008 03:19:04 +0900
> > Tejun Heo <[email protected]> wrote:
> > > +#define fc_to_cc(_fc) container_of((_fc), struct cuse_conn, fc)
> > > +#define cdev_to_cc(_cdev) container_of((_cdev), struct cuse_conn, cdev)
> > > +#define cuse_conn_get(cc) ({mntget((cc)->mnt); cc;})
> > > +#define cuse_conn_put(cc) mntput((cc)->mnt)
> >
> > I believe all the above could be implemented in C.
>
> "traditionally" container_of() is used in #define, not a function call
> as it is just pointer math that can be done at compile time.
>

Well yeah. But it isn't a very good tradition.

static inline struct cuse_conn *cdev_to_cc(struct cdev *cdev)
{
return container_of(cdev, struct cuse_conn, cdev);
}

should generate the same code and is prettier.

Unfortunately it has no additional type-safety. You can still pass it
the address of a tty_driver.cdev instead of a cuse_conn.cdev and the
compiler will happily swallow it. Not a big problem in practice though.

2008-08-29 02:10:49

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH 5/5] CUSE: implement CUSE - Character device in Userspace

Hello, Andrew.

Andrew Morton wrote:
>> +config CUSE
>> + tristate "Character device in Userpace support"
>> + depends on FUSE_FS
>
> Will this work (usefully) if CONFIG_SYSFS=n?

Yeah, as much as other character devices are. As long as uevent works,
most stuff should keep working.

>> +#include "fuse_i.h"
>> +
>> +#define CUSE_SUPER_MAGIC 0x43555345
>
> Put in include/linux/magic.h?

Will do.

>> +struct cuse_conn {
>> + struct fuse_conn fc;
>> + struct cdev cdev;
>> + struct vfsmount *mnt;
>> + struct device *dev;
>> + bool cdev_added:1;
>> + bool disconnected:1; /* channel disconnected */
>
> I didn't know you could do that with bools.

Hmm... I thought it was like any other integral types, no?

> These two fields will share a word, and modifications of one are racy
> wrt modifications of the other. So some form of locking is needed, and
> a comment describing that locking here would be beneficial.

Both are protected by cuse_disconnect_lock. Will add comment.

>> + char *uevent_envp[UEVENT_NUM_ENVP + 1];
>> + char *uevent_env_buf;
>> +};
>> +
>> +#define fc_to_cc(_fc) container_of((_fc), struct cuse_conn, fc)
>> +#define cdev_to_cc(_cdev) container_of((_cdev), struct cuse_conn, cdev)
>> +#define cuse_conn_get(cc) ({mntget((cc)->mnt); cc;})
>> +#define cuse_conn_put(cc) mntput((cc)->mnt)
>
> I believe all the above could be implemented in C.
>
> Making that change would fix the bug in cuse_conn_get(), which
> references its arg twice.

Will convert to functions.

>> ...
>>
>> +static int cuse_fill_super(struct super_block *sb, void *data, int silent)
>> +{
>> + struct cuse_conn *cc = NULL;
>
> this initialisation wasn't needed.

The thing is when error handling paths are lumped up using "if (xxx)
destroy(xxx);", it's a good idea to always initialize variables which
will carry allocated resource. For this iteration, it doesn't make any
difference but later if the order of initialization changes and/or a new
sequence is added before cc initialization, it's all too easy to forget
whether cc was initialized to NULL or not. gcc will catch it most of
the time but there's no guarantee, so I think it's better to keep it
this way.

>> +static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp)
>> +{
>> + char *p = *pp;
>> + char *key, *val;
>> +
>> + while (p < end && *p == '\0')
>> + p++;
>> + if (p == end)
>> + return 0;
>
> OK, I have NFI whatsoever what this thing is doing and I am disinclined
> to reverse-engineer it.
>
> If this is parsing something which operators/users provided then it
> should have been documented somewhere?
>
> Whether it is or isn't doing that, this function really really really
> needs a comment telling readers (ie: me) what it does. Or what it
> tries to do, anyway.

Yeah, reading an undocumented parsing function is really painful. Sorry
about lack of comments there. With too many components to update at
hand, I kinda ran out of steam in the last stages where I do the final
review pass through patches and add comments.

The input is packed strings - "key0=val0\0key1=val1\0" - and parse one
pulls one key/val pair out of it.

>> +static int cuse_init_worker(void *data)
>
> This functions seems to be woefully misnamed. The name implies that it
> initialises a worker. But it's a kernel thread?

CUSE initialization worker, it is, meaning that it's a kernel thread
worker for session initialization.

> Could you please document your design somehow? What does this kernel
> thread do? Why does it exist? etc.

Yeap, will do.

>> + /* hotplug info is also used during device release, copy and parse */
>
> hotplug? What's all this? Seems to have something to do with an
> as-yet-undescribed relationship with udev?

These will be fed to uevent verbatim. The only reason why it's
called hotplug_info instead of uevent_envp is because FUSE works on
other platforms and I didn't want to put too much Linuxism into naming.

> So... basically this undocumented kernel thread will for undocumented
> reasons create the device node?
>
> An obvious question which the reader of the code will ask is "why
> wasn't that done synchronously"?

It's briefly mentioned in comment below. It's because the
initialization needs to talk with userland and the talk should happen
over a file which the following cuse_channel_open() creates. So,
cuse_channel_open() can't really talk with the client without completing
open while creating the device needs more information from user.

>> +static int cuse_channel_open(struct inode *inode, struct file *file)
>> +{
>> + struct cuse_conn *cc;
>> + struct vfsmount *mnt;
>> + struct fuse_req *init_req;
>> + struct task_struct *worker;
>> + int rc;
>> +
>> + /* Set up cuse_conn. cuse_conn will be created when filling
>> + * in superblock for the following kern_mount().
>> + */
>> + mnt = kern_mount(&cuse_fs);
>> + if (IS_ERR(mnt))
>> + return PTR_ERR(mnt);
>> +
>> + cc = fc_to_cc(get_fuse_conn_super(mnt->mnt_sb));
>> + cc->mnt = mnt;
>> +
>> + /* let's send fuse init request */
>> + rc = -ENOMEM;
>> + init_req = fuse_request_alloc();
>> + if (!init_req)
>> + goto err_cc_put;
>> +
>> + cc->fc.connected = 1;
>> + file->private_data = fuse_conn_get(&cc->fc);
>> + fuse_send_init(&cc->fc, init_req);
>> +
>> + /* Okay, FUSE part of initialization is complete. The rest of
>> + * the initialization is a bit more involved and requires
>> + * conversing with userland. Start a kthread.
>> + */
>> + worker = kthread_run(cuse_init_worker, cuse_conn_get(cc),
>> + "cuse-init-pid%d", current->pid);
>
> current->pid is non-unique in a containerised setup. What are the
> implications of this? It needs a comment, because the containerisation
> guys will end up coming here and scratching their heads over the same
> question.

Not much. It's just the name of the worker. The name differentiation
is mainly to help debugging a bit when something went wrong and doesn't
have to be unique.

>> +static struct file_operations cuse_channel_fops; /* initialized during init */
>
> This reader is wondering what a "channel" is. Understanding high-level
> concepts like this is important for understanding the implementation.

Yes, and from the comment at the top of the file.

* channel : file handle connected to the userland CUSE client
...
* Note that 'channel' is what 'dev' is in FUSE. As CUSE deals with
* devices, it's called 'channel' to reduce confusion.
*
* channel determines when the character device dies. When channel is
* closed, everything should begin to destruct. As cuse_conn and mnt
...

>> +ssize_t cuse_class_waiting_show(struct device *dev,
>> + struct device_attribute *attr, char *buf)
>> +{
>> + struct cuse_conn *cc = dev_get_drvdata(dev);
>> +
>> + return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting));
>> +}
>
> <looks in fuse_i.h>
>
> "The number of requests waiting for completion".
>
> Why did you choose to present this particular field?

"waiting" and "abort" are what fuse exports as controls in fuse control
fs (fs/fuse/control.c). These are CUSE's counterparts.

>> +static int __init cuse_init(void)
>> +{
>> + int rc;
>> +
>> + /* inherit and extend fuse_dev_operations */
>> + cuse_channel_fops = fuse_dev_operations;
>> + cuse_channel_fops.owner = THIS_MODULE;
>> + cuse_channel_fops.open = cuse_channel_open;
>> + cuse_channel_fops.release = cuse_channel_release;
>
> Can't these initialisations be performed at compile-time?

Only by listing every member. I can't think of a good way to inherit
all and then override some in C initialization. Hmmm.... then again,
maybe it's better to list every member.

Is it something you object to strongly?

>> +#define CUSE_KERNEL_VERSION 0
>> +#define CUSE_KERNEL_MINOR_VERSION 1
>
> Some description of the kernel<->userspace versioning design would be
> appropriate. Is it bi-directional?

It's just like FUSE protocol version. FUSE clients seem to distinguish
supported functionality with it. So, it's more "I'm who" kind of thing
which doesn't have much meaning for the initial version.

> Nice-looking code, but I do not feel able to properly review it with
> its current level of description.

Sorry about that. I'll add proper comments on the next round.

Thanks.

--
tejun

2008-08-29 02:21:22

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 5/5] CUSE: implement CUSE - Character device in Userspace

On Fri, 29 Aug 2008 04:09:16 +0200 Tejun Heo <[email protected]> wrote:

> >> +static int __init cuse_init(void)
> >> +{
> >> + int rc;
> >> +
> >> + /* inherit and extend fuse_dev_operations */
> >> + cuse_channel_fops = fuse_dev_operations;
> >> + cuse_channel_fops.owner = THIS_MODULE;
> >> + cuse_channel_fops.open = cuse_channel_open;
> >> + cuse_channel_fops.release = cuse_channel_release;
> >
> > Can't these initialisations be performed at compile-time?
>
> Only by listing every member. I can't think of a good way to inherit
> all and then override some in C initialization. Hmmm.... then again,
> maybe it's better to list every member.

oop, I failed to note the struct assignment there. The usual rule of
thumb applies: I can be safely ignored.

2008-08-29 05:54:52

by Tejun Heo

[permalink] [raw]
Subject: Re: [fuse-devel] [PATCHSET] CUSE: implement CUSE

Mike Hommey wrote:
> On Fri, Aug 29, 2008 at 03:18:59AM +0900, Tejun Heo wrote:
>> This patchset implements CUSE - Character device in Userspace. Except
>> for initialization sequence and creation of character device instead
>> of a mount, CUSE isn't very different from FUSE.
>
> It would be nice to have BUSE, Block device in Userspace, too.

Thought about that but it's really no different from nbd or loop
depending on your application and block devices don't really implement
the file operations so it won't have too much in common with FUSE.
Also, there's the complication of going out to disk for more memory cases.

--
tejun

2008-08-29 06:26:39

by Mike Hommey

[permalink] [raw]
Subject: Re: [fuse-devel] [PATCHSET] CUSE: implement CUSE

On Fri, Aug 29, 2008 at 03:18:59AM +0900, Tejun Heo wrote:
> This patchset implements CUSE - Character device in Userspace. Except
> for initialization sequence and creation of character device instead
> of a mount, CUSE isn't very different from FUSE.

It would be nice to have BUSE, Block device in Userspace, too.

Mike

2008-08-29 16:00:21

by Nick Bowler

[permalink] [raw]
Subject: Re: [PATCH 5/5] CUSE: implement CUSE - Character device in Userspace

On 04:09 Fri 29 Aug , Tejun Heo wrote:
> Hello, Andrew.
>
> Andrew Morton wrote:
> >> +struct cuse_conn {
> >> + struct fuse_conn fc;
> >> + struct cdev cdev;
> >> + struct vfsmount *mnt;
> >> + struct device *dev;
> >> + bool cdev_added:1;
> >> + bool disconnected:1; /* channel disconnected */
> >
> > I didn't know you could do that with bools.
>
> Hmm... I thought it was like any other integral types, no?
>

Boolean bit-fields are indeed valid. Usual semantics for objects of type
_Bool apply.

C99 6.7.2.1 Structure and union specifiers

4 A bit-field shall have a type that is a qualified or unqualified
version of _Bool, signed int, unsigned int, or some other
implementation-defined type.

--
Nick Bowler, Elliptic Semiconductor (http://www.ellipticsemi.com/)

2008-08-29 18:50:31

by Archie Cobbs

[permalink] [raw]
Subject: Re: [fuse-devel] [PATCHSET] CUSE: implement CUSE

On Fri, Aug 29, 2008 at 12:52 AM, Tejun Heo <[email protected]> wrote:
> Mike Hommey wrote:
> > On Fri, Aug 29, 2008 at 03:18:59AM +0900, Tejun Heo wrote:
> >> This patchset implements CUSE - Character device in Userspace. Except
> >> for initialization sequence and creation of character device instead
> >> of a mount, CUSE isn't very different from FUSE.
> >
> > It would be nice to have BUSE, Block device in Userspace, too.
>
> Thought about that but it's really no different from nbd or loop
> depending on your application and block devices don't really implement
> the file operations so it won't have too much in common with FUSE.

I think BUSE would be useful. For one, it allows you to avoid problems with
the extra caching you get with a loopback device. And NBD is too limiting
for some applications.

For my half-ignorant analysis of the caching issues, see:
http://code.google.com/p/s3backer/wiki/PerformanceConsiderations#Caching

This is also an example of an application where NBD doesn't suffice.

> Also, there's the complication of going out to disk for more memory cases.

Not sure what you mean exactly (my fault), but it seems BUSE would have fewer
places for memory problems (including deadlocks) than loopback over FUSE,
which is the only way to do this kind of stuff now.

-Archie

--
Archie L. Cobbs

2008-08-30 12:32:33

by Tejun Heo

[permalink] [raw]
Subject: Re: [fuse-devel] [PATCHSET] CUSE: implement CUSE

Hello,

Archie Cobbs wrote:
>> Thought about that but it's really no different from nbd or loop
>> depending on your application and block devices don't really implement
>> the file operations so it won't have too much in common with FUSE.
>
> I think BUSE would be useful. For one, it allows you to avoid problems with
> the extra caching you get with a loopback device. And NBD is too limiting
> for some applications.
>
> For my half-ignorant analysis of the caching issues, see:
> http://code.google.com/p/s3backer/wiki/PerformanceConsiderations#Caching
>
> This is also an example of an application where NBD doesn't suffice.
>
>> Also, there's the complication of going out to disk for more memory cases.
>
> Not sure what you mean exactly (my fault), but it seems BUSE would have fewer
> places for memory problems (including deadlocks) than loopback over FUSE,
> which is the only way to do this kind of stuff now.

Yeah, compared to loopback over FUSE, anything would have less
problem. :-) I don't know much about nbd but it's pretty much solving
the same problem so I think it's logical to extend nbd including
giving it a new transport if necessary? Or is there something
fundamentally better when it's done via FUSE?

Thanks.

--
tejun

2008-08-30 16:37:17

by Goswin von Brederlow

[permalink] [raw]
Subject: Re: [fuse-devel] [PATCHSET] CUSE: implement CUSE

Tejun Heo <[email protected]> writes:

> This patchset implements CUSE - Character device in Userspace. Except
> for initialization sequence and creation of character device instead
> of a mount, CUSE isn't very different from FUSE.

Would it be hard to extend this to block devices as well?

MfG
Goswin

2008-08-30 18:56:43

by Mike Hommey

[permalink] [raw]
Subject: Re: [fuse-devel] [PATCHSET] CUSE: implement CUSE

On Sat, Aug 30, 2008 at 02:30:32PM +0200, Tejun Heo wrote:
> Hello,
>
> Archie Cobbs wrote:
> >> Thought about that but it's really no different from nbd or loop
> >> depending on your application and block devices don't really implement
> >> the file operations so it won't have too much in common with FUSE.
> >
> > I think BUSE would be useful. For one, it allows you to avoid problems with
> > the extra caching you get with a loopback device. And NBD is too limiting
> > for some applications.
> >
> > For my half-ignorant analysis of the caching issues, see:
> > http://code.google.com/p/s3backer/wiki/PerformanceConsiderations#Caching
> >
> > This is also an example of an application where NBD doesn't suffice.
> >
> >> Also, there's the complication of going out to disk for more memory cases.
> >
> > Not sure what you mean exactly (my fault), but it seems BUSE would have fewer
> > places for memory problems (including deadlocks) than loopback over FUSE,
> > which is the only way to do this kind of stuff now.
>
> Yeah, compared to loopback over FUSE, anything would have less
> problem. :-) I don't know much about nbd but it's pretty much solving
> the same problem so I think it's logical to extend nbd including
> giving it a new transport if necessary? Or is there something
> fundamentally better when it's done via FUSE?

My gut feeling is that it would have less overhead when done via FUSE
than through nbd, but that could be wrong.

Mike

2008-08-30 22:39:39

by Archie Cobbs

[permalink] [raw]
Subject: Re: [fuse-devel] [PATCHSET] CUSE: implement CUSE

On Sat, Aug 30, 2008 at 7:30 AM, Tejun Heo <[email protected]> wrote:
> Yeah, compared to loopback over FUSE, anything would have less
> problem. :-) I don't know much about nbd but it's pretty much solving
> the same problem so I think it's logical to extend nbd including
> giving it a new transport if necessary? Or is there something
> fundamentally better when it's done via FUSE?

Well, NBD is the bird in hand, but that doesn't mean it's the best way
to do things generically for all block device emulation applications.

I'd even argue that NBD should be removed from the kernel and replaced
by BUSE plus a user-land daemon. A BUSE interface could be a lot more
general, and simpler.

Not to mention that converting all block reads and writes to TCP
operations that talk to another process on the same machine via the
loopback interface seems awfully inefficient.

-Archie

--
Archie L. Cobbs

2008-08-31 04:53:17

by J. R. Okajima

[permalink] [raw]
Subject: Re: [fuse-devel] [PATCHSET] CUSE: implement CUSE


Mike Hommey:
> It would be nice to have BUSE, Block device in Userspace, too.

While it is not based upon FUSE, you may be interested in ULOOP driver
which is based upon the loopback block device.
Here is a README file from http://aufs.sourceforge.net/uloop.txt.
If you want to checkout the source files, please refer to
http://aufs.sourceforge.net/.


Junjiro R. Okajima

----------------------------------------------------------------------

ULOOP -- Loopback block device in userspace
(and a sample for HTTP and generic block device)
Junjiro Okajima

# $Id: 00readme.txt,v 1.6 2008/08/17 23:04:29 sfjro Exp $


0. Introduction
As you know, there is a Loopback block device in Linux, /dev/loop,
which enables you to mount a fs-image local file.
Also it can adopt a userspace program, such as cryptloop.
This sample ULOOP driver makes it generic, and enables it to adopt any
userspace program.
You can give an empty or non-existing file to /dev/loop backend.
When a process reads from /dev/loop, this driver wakes a user process
up and passes the I/O transaction to it. A user process makes the
required block ready and tells the driver. Then the driver completes
the I/O transaction.
Also there are sample scripts or usage for diskless nodes working with
aufs. This driver may work with it well.
The name is unrelated to YouTube. :-)


1. sample for HTTP
Simple 'make' will build ./drivers/block/uloop.ko and ./ulohttp.
Ulohttp application behaves like losetup(8). Additionally, ulohttp is
an actual daemon which handles I/O request.
Here is a syntax.

ulohttp [-b bitmap] [-c cache] device URL

The device is /dev/loopN and the URL is a URL for fs-image file via
HTTP. The http server must support byte range (Range: header).
The bitmap is a new filename or previously specified as the bitmap for
the same URL. Its filesize will be 'the size of the specified fs-image
/ pagesize (usually 4k) / bits in a byte (8)', and round-up to
pagesize.
The cache is a new filename or previously specified as the cache for
the same URL. Its filesize will be 'the size of the specified
fs-image', and round-up to pagesize.
Note that both the bitmap and the cache are re-usable as long as you
don't change the filedata and URL.

When someone reads from the specified /dev/loopN, or accesses a file
on a filesystem after mounting /dev/loopN, ULOOP driver first checks
the corresponding bit in the bitmap file. When the bit is not set,
which means the block is not retrieved yet, it passes the offset and
size of the I/O request to ulohttp daemon.
Ulohttp converts the offset and the size into HTTP GET request with
Range header and send it to the http server.
Retrieving the data from the http server, ulohttp stores it to the
cache file, and tells ULOOP driver that the HTTP transfer completes.
Then the ULOOP driver sets the corresponding bit in the bitmap, and
finishes the I/O request.

In other words, it is equivalent to this operation.
$ wget URL_for_fsimage
$ sudo mount -o loop retrieved_fsimage /mnt
But ULOOP driver and ulohttp retrieves only the data (block) on-demand,
and stores into the cache file. The first access to a block is slow
since it involves HTTP GET, but the next access to the same block is
fast since it is in the local cache file. In this case, the behaviour
is equivalent to the simple /dev/loop device.

o Note
- ulohttp requires libcurl.
- ulohttp doesn't support HTTP PUT or POST, so the device rejects
WRITE operation.
- ulohttp doesn't have a smart exit routine.
- This sample is a "proof-of-concept", do not expect the maturity level
too much.
- This driver and the sample is developed and tested on linux-2.6.21.3.
- If you implement other protocols such as nbd/enbd, iscsi, aoe or
something, instead of http, I guess it will be fantastic. :-)

o Usage
$ make
$ sudo modprobe loop
$ sudo insmod ./drivers/block/uloop.ko
$ dev=/dev/loop7
$ ./ulohttp -b /tmp/b -c /tmp/c $dev http://whatever/you/like
$ sudo mount -o ro $dev /mnt
$ ls /mnt
:::
$ sudo umount /mnt
$ killall ulohttp
$ sudo losetup -d $dev


2. sample for generic block device
The sample `ulohttp' (above) retrieves data from a remote host via
HTTP, and stores it into a local file as a cache. It means you can
reduce the network traffic and the workload on a remote server.
As you can guess easily, this scheme is also effective to a local disk
device, especially when you want to let your disk spin down or power
off. Recent flash memory is getting larger and cheaper. You can cache
the whole contents of your harddrive into a file on your flash.
Here is a sample for it, `ulobdev.' The basic usage is very similar to
`ulohttp'. See above.
Of course, it is available for remote block devices too, such as
nbd/enbd, iscsi and aoe.

You should not mount the backend block device as readwrite, since it
modifies the superblock of the filesystem on the block device even if
you don't write anything to it.

Currently this sample supports readonly mode only.
If someone is interested in this approach and sample, I will add some
features which will support read/write mode and write-back to the
harddrive periodically, and discard/re-create the cache file.


3. libuloop API
- int ulo_init(struct ulo_init *init);
struct ulo_init {
char *path[ULO_Last];
int dev_flags;
unsigned long long size;
};
enum {ULO_DEV, ULO_CACHE, ULO_BITMAP, ULO_Last};

Initializes ULOOP driver. All members in struct ulo_init must be set
before you call ulo_init().
+ path[ULO_DEV]
pathname of loopback device such as "/dev/loopN".
+ path[ULO_CACHE]
pathname of a cache file. A userspace program stores the
real data to this file.
+ path[ULO_BITMAP]
pathname of a bitmap file. The ULOOP driver sets the bit
which corresponds to the block number when the block is
filled by a userspace program. When the bit is not set,
ULOOP driver invokes the userspace program.
+ dev_flags
Flags for open(2) of path[ULO_DEV].
+ size
the size of real data. the ULOOP library set this size to
the cache file after creating it internally.

- int ulo_loop(int sig, ulo_cb_t store, void *arg);
typedef int (*ulo_cb_t)(unsigned long long start, int size, void *arg);

Waits for an I/O request from ULOOP driver. When a user accesses a
ULOOP device, ULOOP driver translates the request to the offset in
the cache file and the requested size, and invokes the user-defined
callback function which is specified by `store.' The function `store'
must fill the data in the cache file following the given offset and
size. You can add an argument `arg' for the callback function.

- extern const struct uloop *uloop;
struct uloop {
int fd[ULO_Last];
int pagesize;
unsigned long long tgt_size, cache_size;
};

A global variable in ULOOP library. Usually you will need
`ulo_cache_fd' only. See below.
#define ulo_dev_fd ({ uloop->fd[ULO_DEV]; })
#define ulo_cache_fd ({ uloop->fd[ULO_CACHE]; })
#define ulo_bitmap_fd ({ uloop->fd[ULO_BITMAP]; })


Enjoy!