Subject: RFC: allow recording and passing of open file descriptors

Hello folks,


here's an attempt to make it possible to record open file descriptors
(struct file*) and make them available to other processes via the file
system. The semantics are similar to dup() or to passing via a Unix socket,
in that the receiving process gets a reference to the same struct file
instance in its fdtable. The big difference here is that we're doing it
directly via the file system (i.e. retrieving is done by a simple open()).
It works pretty much like Plan9's /srv file system:

http://man.cat-v.org/plan_9/3/srv

To achieve that, the first patch introduces the concept of "file boxing",
which means an open file operation can put a reference to another file
into the struct file, which will then be returned to the caller, instead
of the newly created one. The reason for doing it this strange way is that
the new struct file instance is allocated and prepared very early, before
we're calling into the actual file operation - refactoring this so that
the open() file op directly returns a struct file* pointer would be a
massively intrusive change, that I just don't dare to do here.

The second patch introduces a new file system "srvfs" that works like
Plan9's /srv file system.


Another use case for the first patch could be direct fd passing in FUSE,
like Peng Tao and Alessio Balsini are currently working on, via other means.

https://www.spinics.net/lists/linux-fsdevel/msg196163.html


I believe patch 1 should be pretty straightforward and should not do any harm.
(It's not even compiled in unless explicitly enabled by something else.)


have fun,

--mtx

---
Enrico Weigelt, metux IT consult
Free software and Linux embedded engineering
[email protected] -- +49-151-27565287


Subject: [RFC PATCH 1/2] fs: allow filesystems to directly pass an existing struct file

In some scenarios, file systems might want to pass an already opened
struct file instance on an open() call, instead of opening a new one.

This allows techniques similar to the already well-known file descriptor
passing via Unix domain sockets, but now also for plain open() calls.

Signed-off-by: Enrico Weigelt, metux IT consult <[email protected]>
---
fs/Kconfig | 3 +++
fs/internal.h | 6 ++++++
fs/namei.c | 2 +-
fs/open.c | 42 +++++++++++++++++++++++++++++++++++++++++-
include/linux/fs.h | 9 +++++++++
5 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/fs/Kconfig b/fs/Kconfig
index 141a856c50e7..b8b7a77b656c 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -357,4 +357,7 @@ source "fs/unicode/Kconfig"
config IO_WQ
bool

+config FS_BOXED_FILE
+ bool
+
endmenu
diff --git a/fs/internal.h b/fs/internal.h
index 6aeae7ef3380..e5e9cf038a24 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -142,6 +142,12 @@ int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
int chown_common(const struct path *path, uid_t user, gid_t group);
extern int vfs_open(const struct path *, struct file *);

+#ifdef CONFIG_FS_BOXED_FILE
+extern struct file *unbox_file(struct file *);
+#else
+static inline struct file *unbox_file(struct file *f) { return f; }
+#endif
+
/*
* inode.c
*/
diff --git a/fs/namei.c b/fs/namei.c
index 79b0ff9b151e..b186d2d75b63 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3496,7 +3496,7 @@ static struct file *path_openat(struct nameidata *nd,
}
if (likely(!error)) {
if (likely(file->f_mode & FMODE_OPENED))
- return file;
+ return unbox_file(file);
WARN_ON(1);
error = -EINVAL;
}
diff --git a/fs/open.c b/fs/open.c
index e53af13b5835..88daf09ffeb4 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -769,6 +769,46 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
return ksys_fchown(fd, user, group);
}

+#ifdef CONFIG_FS_BOXED_FILE
+/*
+ * Finish up an open procedure before returning the file to the caller.
+ * If the fs stored another file in f->boxed_file, that file is returned
+ * and the freshly allocated f is dropped via fput(); otherwise f is returned
+ * unchanged. NULL and ERR_PTR values are passed through untouched.
+ *
+ * Only supposed to be called by functions like dentry_open() and
+ * path_openat() that allocate a new struct file and pass it to vfs_open() -
+ * the struct file must not have been used in any way in the meantime.
+ */
+struct file *unbox_file(struct file *f)
+{
+ struct file *boxed;
+
+ if (unlikely(!f))
+ return NULL;
+
+ if (IS_ERR(f))
+ return f;
+
+ if (likely(!f->boxed_file))
+ return f;
+
+ /* The fs handed us another struct file (f->boxed_file) that should be
+ passed directly to our callers instead of the one that was newly
+ created for the open procedure.
+
+ The boxed file is expected to already hold a reference (taken by the
+ fs), so its refcount is left as is. The outer file (f) has only just
+ been opened and nobody else has accessed it, so we can simply fput() it.
+ */
+
+ boxed = f->boxed_file;
+ fput(f);
+
+ return boxed;
+}
+#endif /* CONFIG_FS_BOXED_FILE */
+
static int do_dentry_open(struct file *f,
struct inode *inode,
int (*open)(struct inode *, struct file *))
@@ -959,7 +999,7 @@ struct file *dentry_open(const struct path *path, int flags,
f = ERR_PTR(error);
}
}
- return f;
+ return unbox_file(f);
}
EXPORT_SYMBOL(dentry_open);

diff --git a/include/linux/fs.h b/include/linux/fs.h
index c3c88fdb9b2a..a778c5c057ab 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -955,6 +955,15 @@ struct file {
struct address_space *f_mapping;
errseq_t f_wb_err;
errseq_t f_sb_err; /* for syncfs */
+
+#ifdef CONFIG_FS_BOXED_FILE
+ /* Only for file systems that want to pass an *existing* file to the
+ caller of open() instead of the newly created one. The semantics are
+ similar to passing an fd via a unix socket, but done through an
+ open() call instead.
+ */
+ struct file *boxed_file;
+#endif
} __randomize_layout
__attribute__((aligned(4))); /* lest something weird decides that 2 is OK */

--
2.20.1