This patch allows bpf_syscall prog to perform some basic filesystem
operations: create, remove directories and unlink files. Three bpf
helpers are added for this purpose. When combined with the following
patches that allow pinning and getting bpf objects from bpf prog,
this feature can be used to create directory hierarchy in bpffs that
help manage bpf objects purely using bpf progs.
The added helpers subject to the same permission checks as their syscall
version. For example, one can not write to a read-only file system;
The identity of the current process is checked to see whether it has
sufficient permission to perform the operations.
Only directories and files in bpffs can be created or removed by these
helpers. But it won't be too hard to allow these helpers to operate
on files in other filesystems, if we want.
Signed-off-by: Hao Luo <[email protected]>
---
include/linux/bpf.h | 1 +
include/uapi/linux/bpf.h | 26 +++++
kernel/bpf/inode.c | 9 +-
kernel/bpf/syscall.c | 177 +++++++++++++++++++++++++++++++++
tools/include/uapi/linux/bpf.h | 26 +++++
5 files changed, 236 insertions(+), 3 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f19abc59b6cd..fce5e26179f5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1584,6 +1584,7 @@ int bpf_link_new_fd(struct bpf_link *link);
struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd);
struct bpf_link *bpf_link_get_from_fd(u32 ufd);
+bool bpf_path_is_bpf_dir(const struct path *path);
int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
int bpf_obj_get_user(const char __user *pathname, int flags);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index afe3d0d7f5f2..a5dbc794403d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5086,6 +5086,29 @@ union bpf_attr {
* Return
* 0 on success, or a negative error in case of failure. On error
* *dst* buffer is zeroed out.
+ *
+ * long bpf_mkdir(const char *pathname, int pathname_sz, u32 mode)
+ * Description
+ * Attempts to create a directory name *pathname*. The argument
+ * *pathname_sz* specifies the length of the string *pathname*.
+ * The argument *mode* specifies the mode for the new directory. It
+ * is modified by the process's umask. It has the same semantic as
+ * the syscall mkdir(2).
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_rmdir(const char *pathname, int pathname_sz)
+ * Description
+ * Deletes a directory, which must be empty.
+ * Return
+ * 0 on sucess, or a negative error in case of failure.
+ *
+ * long bpf_unlink(const char *pathname, int pathname_sz)
+ * Description
+ * Deletes a name and possibly the file it refers to. It has the
+ * same semantic as the syscall unlink(2).
+ * Return
+ * 0 on success, or a negative error in case of failure.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -5280,6 +5303,9 @@ union bpf_attr {
FN(xdp_load_bytes), \
FN(xdp_store_bytes), \
FN(copy_from_user_task), \
+ FN(mkdir), \
+ FN(rmdir), \
+ FN(unlink), \
/* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 4f841e16779e..3aca00e9e950 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -414,6 +414,11 @@ static const struct inode_operations bpf_dir_iops = {
.unlink = simple_unlink,
};
+bool bpf_path_is_bpf_dir(const struct path *path)
+{
+ return d_inode(path->dentry)->i_op == &bpf_dir_iops;
+}
+
/* pin iterator link into bpffs */
static int bpf_iter_link_pin_kernel(struct dentry *parent,
const char *name, struct bpf_link *link)
@@ -439,7 +444,6 @@ static int bpf_obj_do_pin(const char __user *pathname, void *raw,
enum bpf_type type)
{
struct dentry *dentry;
- struct inode *dir;
struct path path;
umode_t mode;
int ret;
@@ -454,8 +458,7 @@ static int bpf_obj_do_pin(const char __user *pathname, void *raw,
if (ret)
goto out;
- dir = d_inode(path.dentry);
- if (dir->i_op != &bpf_dir_iops) {
+ if (!bpf_path_is_bpf_dir(&path)) {
ret = -EPERM;
goto out;
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index db402ebc5570..07683b791733 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -12,6 +12,7 @@
#include <linux/sched/signal.h>
#include <linux/vmalloc.h>
#include <linux/mmzone.h>
+#include <linux/namei.h>
#include <linux/anon_inodes.h>
#include <linux/fdtable.h>
#include <linux/file.h>
@@ -4867,6 +4868,176 @@ const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
.arg4_type = ARG_PTR_TO_LONG,
};
+BPF_CALL_3(bpf_mkdir, const char *, pathname, int, pathname_sz, u32, raw_mode)
+{
+ struct user_namespace *mnt_userns;
+ struct dentry *dentry;
+ struct path path;
+ umode_t mode;
+ int err;
+
+ if (pathname_sz <= 1 || pathname[pathname_sz - 1])
+ return -EINVAL;
+
+ dentry = kern_path_create(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ if (!bpf_path_is_bpf_dir(&path)) {
+ err = -EPERM;
+ goto err_exit;
+ }
+
+ mode = raw_mode;
+ if (!IS_POSIXACL(path.dentry->d_inode))
+ mode &= ~current_umask();
+ err = security_path_mkdir(&path, dentry, mode);
+ if (err)
+ goto err_exit;
+
+ mnt_userns = mnt_user_ns(path.mnt);
+ err = vfs_mkdir(mnt_userns, d_inode(path.dentry), dentry, mode);
+
+err_exit:
+ done_path_create(&path, dentry);
+ return err;
+}
+
+const struct bpf_func_proto bpf_mkdir_proto = {
+ .func = bpf_mkdir,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_rmdir, const char *, pathname, int, pathname_sz)
+{
+ struct user_namespace *mnt_userns;
+ struct path parent;
+ struct dentry *dentry;
+ int err;
+
+ if (pathname_sz <= 1 || pathname[pathname_sz - 1])
+ return -EINVAL;
+
+ err = kern_path(pathname, 0, &parent);
+ if (err)
+ return err;
+
+ if (!bpf_path_is_bpf_dir(&parent)) {
+ err = -EPERM;
+ goto exit1;
+ }
+
+ err = mnt_want_write(parent.mnt);
+ if (err)
+ goto exit1;
+
+ dentry = kern_path_locked(pathname, &parent);
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ goto exit2;
+ }
+
+ if (d_really_is_negative(dentry)) {
+ err = -ENOENT;
+ goto exit3;
+ }
+
+ err = security_path_rmdir(&parent, dentry);
+ if (err)
+ goto exit3;
+
+ mnt_userns = mnt_user_ns(parent.mnt);
+ err = vfs_rmdir(mnt_userns, d_inode(parent.dentry), dentry);
+exit3:
+ dput(dentry);
+ inode_unlock(d_inode(parent.dentry));
+exit2:
+ mnt_drop_write(parent.mnt);
+exit1:
+ path_put(&parent);
+ return err;
+}
+
+const struct bpf_func_proto bpf_rmdir_proto = {
+ .func = bpf_rmdir,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+BPF_CALL_2(bpf_unlink, const char *, pathname, int, pathname_sz)
+{
+ struct user_namespace *mnt_userns;
+ struct path parent;
+ struct dentry *dentry;
+ struct inode *inode = NULL;
+ int err;
+
+ if (pathname_sz <= 1 || pathname[pathname_sz - 1])
+ return -EINVAL;
+
+ err = kern_path(pathname, 0, &parent);
+ if (err)
+ return err;
+
+ err = mnt_want_write(parent.mnt);
+ if (err)
+ goto exit1;
+
+ dentry = kern_path_locked(pathname, &parent);
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ goto exit2;
+ }
+
+ if (!bpf_path_is_bpf_dir(&parent)) {
+ err = -EPERM;
+ goto exit3;
+ }
+
+ if (d_is_negative(dentry)) {
+ err = -ENOENT;
+ goto exit3;
+ }
+
+ if (d_is_dir(dentry)) {
+ err = -EISDIR;
+ goto exit3;
+ }
+
+ inode = dentry->d_inode;
+ ihold(inode);
+ err = security_path_unlink(&parent, dentry);
+ if (err)
+ goto exit3;
+
+ mnt_userns = mnt_user_ns(parent.mnt);
+ err = vfs_unlink(mnt_userns, d_inode(parent.dentry), dentry, NULL);
+exit3:
+ dput(dentry);
+ inode_unlock(d_inode(parent.dentry));
+ if (inode)
+ iput(inode);
+exit2:
+ mnt_drop_write(parent.mnt);
+exit1:
+ path_put(&parent);
+ return err;
+}
+
+const struct bpf_func_proto bpf_unlink_proto = {
+ .func = bpf_unlink,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
static const struct bpf_func_proto *
syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -4879,6 +5050,12 @@ syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sys_close_proto;
case BPF_FUNC_kallsyms_lookup_name:
return &bpf_kallsyms_lookup_name_proto;
+ case BPF_FUNC_mkdir:
+ return &bpf_mkdir_proto;
+ case BPF_FUNC_rmdir:
+ return &bpf_rmdir_proto;
+ case BPF_FUNC_unlink:
+ return &bpf_unlink_proto;
default:
return tracing_prog_func_proto(func_id, prog);
}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index afe3d0d7f5f2..a5dbc794403d 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5086,6 +5086,29 @@ union bpf_attr {
* Return
* 0 on success, or a negative error in case of failure. On error
* *dst* buffer is zeroed out.
+ *
+ * long bpf_mkdir(const char *pathname, int pathname_sz, u32 mode)
+ * Description
+ * Attempts to create a directory name *pathname*. The argument
+ * *pathname_sz* specifies the length of the string *pathname*.
+ * The argument *mode* specifies the mode for the new directory. It
+ * is modified by the process's umask. It has the same semantic as
+ * the syscall mkdir(2).
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_rmdir(const char *pathname, int pathname_sz)
+ * Description
+ * Deletes a directory, which must be empty.
+ * Return
+ * 0 on sucess, or a negative error in case of failure.
+ *
+ * long bpf_unlink(const char *pathname, int pathname_sz)
+ * Description
+ * Deletes a name and possibly the file it refers to. It has the
+ * same semantic as the syscall unlink(2).
+ * Return
+ * 0 on success, or a negative error in case of failure.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -5280,6 +5303,9 @@ union bpf_attr {
FN(xdp_load_bytes), \
FN(xdp_store_bytes), \
FN(copy_from_user_task), \
+ FN(mkdir), \
+ FN(rmdir), \
+ FN(unlink), \
/* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
--
2.35.1.574.g5d30c73bfb-goog
On Sat, Feb 26, 2022 at 05:13:31AM IST, Hao Luo wrote:
> This patch allows bpf_syscall prog to perform some basic filesystem
> operations: create, remove directories and unlink files. Three bpf
> helpers are added for this purpose. When combined with the following
> patches that allow pinning and getting bpf objects from bpf prog,
> this feature can be used to create directory hierarchy in bpffs that
> help manage bpf objects purely using bpf progs.
>
> The added helpers subject to the same permission checks as their syscall
> version. For example, one can not write to a read-only file system;
> The identity of the current process is checked to see whether it has
> sufficient permission to perform the operations.
>
> Only directories and files in bpffs can be created or removed by these
> helpers. But it won't be too hard to allow these helpers to operate
> on files in other filesystems, if we want.
>
> Signed-off-by: Hao Luo <[email protected]>
> ---
> include/linux/bpf.h | 1 +
> include/uapi/linux/bpf.h | 26 +++++
> kernel/bpf/inode.c | 9 +-
> kernel/bpf/syscall.c | 177 +++++++++++++++++++++++++++++++++
> tools/include/uapi/linux/bpf.h | 26 +++++
> 5 files changed, 236 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index f19abc59b6cd..fce5e26179f5 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1584,6 +1584,7 @@ int bpf_link_new_fd(struct bpf_link *link);
> struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd);
> struct bpf_link *bpf_link_get_from_fd(u32 ufd);
>
> +bool bpf_path_is_bpf_dir(const struct path *path);
> int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
> int bpf_obj_get_user(const char __user *pathname, int flags);
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index afe3d0d7f5f2..a5dbc794403d 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -5086,6 +5086,29 @@ union bpf_attr {
> * Return
> * 0 on success, or a negative error in case of failure. On error
> * *dst* buffer is zeroed out.
> + *
> + * long bpf_mkdir(const char *pathname, int pathname_sz, u32 mode)
> + * Description
> + * Attempts to create a directory name *pathname*. The argument
> + * *pathname_sz* specifies the length of the string *pathname*.
> + * The argument *mode* specifies the mode for the new directory. It
> + * is modified by the process's umask. It has the same semantic as
> + * the syscall mkdir(2).
> + * Return
> + * 0 on success, or a negative error in case of failure.
> + *
> + * long bpf_rmdir(const char *pathname, int pathname_sz)
> + * Description
> + * Deletes a directory, which must be empty.
> + * Return
> + * 0 on sucess, or a negative error in case of failure.
> + *
> + * long bpf_unlink(const char *pathname, int pathname_sz)
> + * Description
> + * Deletes a name and possibly the file it refers to. It has the
> + * same semantic as the syscall unlink(2).
> + * Return
> + * 0 on success, or a negative error in case of failure.
> */
> #define __BPF_FUNC_MAPPER(FN) \
> FN(unspec), \
> @@ -5280,6 +5303,9 @@ union bpf_attr {
> FN(xdp_load_bytes), \
> FN(xdp_store_bytes), \
> FN(copy_from_user_task), \
> + FN(mkdir), \
> + FN(rmdir), \
> + FN(unlink), \
> /* */
>
How about only introducing bpf_sys_mkdirat and bpf_sys_unlinkat? That would be
more useful for other cases in future, and when AT_FDCWD is passed, has the same
functionality as these, but when openat/fget is supported, it would work
relative to other dirfds as well. It can also allow using dirfd of the process
calling read for a iterator (e.g. if it sets the fd number using skel->bss).
unlinkat's AT_REMOVEDIR flag also removes the need for a bpf_rmdir.
WDYT?
> [...]
--
Kartikeya
Hi Kumar,
On Sat, Feb 26, 2022 at 9:18 PM Kumar Kartikeya Dwivedi
<[email protected]> wrote:
>
> On Sat, Feb 26, 2022 at 05:13:31AM IST, Hao Luo wrote:
> > This patch allows bpf_syscall prog to perform some basic filesystem
> > operations: create, remove directories and unlink files. Three bpf
> > helpers are added for this purpose. When combined with the following
> > patches that allow pinning and getting bpf objects from bpf prog,
> > this feature can be used to create directory hierarchy in bpffs that
> > help manage bpf objects purely using bpf progs.
> >
> > The added helpers subject to the same permission checks as their syscall
> > version. For example, one can not write to a read-only file system;
> > The identity of the current process is checked to see whether it has
> > sufficient permission to perform the operations.
> >
> > Only directories and files in bpffs can be created or removed by these
> > helpers. But it won't be too hard to allow these helpers to operate
> > on files in other filesystems, if we want.
> >
> > Signed-off-by: Hao Luo <[email protected]>
> > ---
> > + *
> > + * long bpf_mkdir(const char *pathname, int pathname_sz, u32 mode)
> > + * Description
> > + * Attempts to create a directory name *pathname*. The argument
> > + * *pathname_sz* specifies the length of the string *pathname*.
> > + * The argument *mode* specifies the mode for the new directory. It
> > + * is modified by the process's umask. It has the same semantic as
> > + * the syscall mkdir(2).
> > + * Return
> > + * 0 on success, or a negative error in case of failure.
> > + *
> > + * long bpf_rmdir(const char *pathname, int pathname_sz)
> > + * Description
> > + * Deletes a directory, which must be empty.
> > + * Return
> > + * 0 on sucess, or a negative error in case of failure.
> > + *
> > + * long bpf_unlink(const char *pathname, int pathname_sz)
> > + * Description
> > + * Deletes a name and possibly the file it refers to. It has the
> > + * same semantic as the syscall unlink(2).
> > + * Return
> > + * 0 on success, or a negative error in case of failure.
> > */
> >
>
> How about only introducing bpf_sys_mkdirat and bpf_sys_unlinkat? That would be
> more useful for other cases in future, and when AT_FDCWD is passed, has the same
> functionality as these, but when openat/fget is supported, it would work
> relative to other dirfds as well. It can also allow using dirfd of the process
> calling read for a iterator (e.g. if it sets the fd number using skel->bss).
> unlinkat's AT_REMOVEDIR flag also removes the need for a bpf_rmdir.
>
> WDYT?
>
The idea sounds good to me, more flexible. But I don't have a real use
case for using the added 'dirfd' at this moment. For all the use cases
I can think of, absolute paths will suffice, I think. Unless other
reviewers have opposition, I will try switching to mkdirat and
unlinkat in v2.
On Mon, Feb 28, 2022 at 02:10:39PM -0800, Hao Luo wrote:
> Hi Kumar,
>
> On Sat, Feb 26, 2022 at 9:18 PM Kumar Kartikeya Dwivedi
> <[email protected]> wrote:
> >
> > On Sat, Feb 26, 2022 at 05:13:31AM IST, Hao Luo wrote:
> > > This patch allows bpf_syscall prog to perform some basic filesystem
> > > operations: create, remove directories and unlink files. Three bpf
> > > helpers are added for this purpose. When combined with the following
> > > patches that allow pinning and getting bpf objects from bpf prog,
> > > this feature can be used to create directory hierarchy in bpffs that
> > > help manage bpf objects purely using bpf progs.
> > >
> > > The added helpers subject to the same permission checks as their syscall
> > > version. For example, one can not write to a read-only file system;
> > > The identity of the current process is checked to see whether it has
> > > sufficient permission to perform the operations.
> > >
> > > Only directories and files in bpffs can be created or removed by these
> > > helpers. But it won't be too hard to allow these helpers to operate
> > > on files in other filesystems, if we want.
> > >
> > > Signed-off-by: Hao Luo <[email protected]>
> > > ---
> > > + *
> > > + * long bpf_mkdir(const char *pathname, int pathname_sz, u32 mode)
> > > + * Description
> > > + * Attempts to create a directory name *pathname*. The argument
> > > + * *pathname_sz* specifies the length of the string *pathname*.
> > > + * The argument *mode* specifies the mode for the new directory. It
> > > + * is modified by the process's umask. It has the same semantic as
> > > + * the syscall mkdir(2).
> > > + * Return
> > > + * 0 on success, or a negative error in case of failure.
> > > + *
> > > + * long bpf_rmdir(const char *pathname, int pathname_sz)
> > > + * Description
> > > + * Deletes a directory, which must be empty.
> > > + * Return
> > > + * 0 on sucess, or a negative error in case of failure.
> > > + *
> > > + * long bpf_unlink(const char *pathname, int pathname_sz)
> > > + * Description
> > > + * Deletes a name and possibly the file it refers to. It has the
> > > + * same semantic as the syscall unlink(2).
> > > + * Return
> > > + * 0 on success, or a negative error in case of failure.
> > > */
> > >
> >
> > How about only introducing bpf_sys_mkdirat and bpf_sys_unlinkat? That would be
> > more useful for other cases in future, and when AT_FDCWD is passed, has the same
> > functionality as these, but when openat/fget is supported, it would work
> > relative to other dirfds as well. It can also allow using dirfd of the process
> > calling read for a iterator (e.g. if it sets the fd number using skel->bss).
> > unlinkat's AT_REMOVEDIR flag also removes the need for a bpf_rmdir.
> >
> > WDYT?
> >
>
> The idea sounds good to me, more flexible. But I don't have a real use
> case for using the added 'dirfd' at this moment. For all the use cases
> I can think of, absolute paths will suffice, I think. Unless other
> reviewers have opposition, I will try switching to mkdirat and
> unlinkat in v2.
I'm surprised you don't need "at" variants.
I thought your production setup has a top level cgroup controller and
then inner tasks inside containers manage cgroups on their own.
Since containers are involved they likely run inside their own mountns.
cgroupfs mount is single. So you probably don't even need to bind mount it
inside containers, but bpffs is not a single mount. You need
to bind mount top bpffs inside containers for tasks to access it.
Now for cgroupfs the abs path is not an issue, but for bpffs
the AT_FDCWD becomes a problem. AT_FDCWD is using current mount ns.
Inside container that will be different. Unless you bind mount into exact
same path the full path has different meanings inside and outside of the container.
It seems to me the bpf progs attached to cgroup sleepable events should
be using FD of bpffs. Then when these tracepoints are triggered from
different containers in different mountns they will get the right dir prefix.
What am I missing?
I think non-AT variants are not needed. The prog can always pass AT_FDCWD
if it's really the intent, but passing actual FD seems more error-proof.
On 2/25/22 3:43 PM, Hao Luo wrote:
> This patch allows bpf_syscall prog to perform some basic filesystem
> operations: create, remove directories and unlink files. Three bpf
> helpers are added for this purpose. When combined with the following
> patches that allow pinning and getting bpf objects from bpf prog,
> this feature can be used to create directory hierarchy in bpffs that
> help manage bpf objects purely using bpf progs.
>
> The added helpers subject to the same permission checks as their syscall
> version. For example, one can not write to a read-only file system;
> The identity of the current process is checked to see whether it has
> sufficient permission to perform the operations.
>
> Only directories and files in bpffs can be created or removed by these
> helpers. But it won't be too hard to allow these helpers to operate
> on files in other filesystems, if we want.
>
> Signed-off-by: Hao Luo <[email protected]>
> ---
> include/linux/bpf.h | 1 +
> include/uapi/linux/bpf.h | 26 +++++
> kernel/bpf/inode.c | 9 +-
> kernel/bpf/syscall.c | 177 +++++++++++++++++++++++++++++++++
> tools/include/uapi/linux/bpf.h | 26 +++++
> 5 files changed, 236 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index f19abc59b6cd..fce5e26179f5 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1584,6 +1584,7 @@ int bpf_link_new_fd(struct bpf_link *link);
> struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd);
> struct bpf_link *bpf_link_get_from_fd(u32 ufd);
>
> +bool bpf_path_is_bpf_dir(const struct path *path);
> int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
> int bpf_obj_get_user(const char __user *pathname, int flags);
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index afe3d0d7f5f2..a5dbc794403d 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -5086,6 +5086,29 @@ union bpf_attr {
> * Return
> * 0 on success, or a negative error in case of failure. On error
> * *dst* buffer is zeroed out.
> + *
> + * long bpf_mkdir(const char *pathname, int pathname_sz, u32 mode)
Can we make pathname_sz to be u32 instead of int? pathname_sz should
never be negative any way.
Also, I think it is a good idea to add 'u64 flags' parameter for all
three helpers, so we have room in the future to tune for new use cases.
> + * Description
> + * Attempts to create a directory name *pathname*. The argument
> + * *pathname_sz* specifies the length of the string *pathname*.
> + * The argument *mode* specifies the mode for the new directory. It
> + * is modified by the process's umask. It has the same semantic as
> + * the syscall mkdir(2).
> + * Return
> + * 0 on success, or a negative error in case of failure.
> + *
> + * long bpf_rmdir(const char *pathname, int pathname_sz)
> + * Description
> + * Deletes a directory, which must be empty.
> + * Return
> + * 0 on sucess, or a negative error in case of failure.
> + *
> + * long bpf_unlink(const char *pathname, int pathname_sz)
> + * Description
> + * Deletes a name and possibly the file it refers to. It has the
> + * same semantic as the syscall unlink(2).
> + * Return
> + * 0 on success, or a negative error in case of failure.
> */
> #define __BPF_FUNC_MAPPER(FN) \
> FN(unspec), \
> @@ -5280,6 +5303,9 @@ union bpf_attr {
> FN(xdp_load_bytes), \
> FN(xdp_store_bytes), \
> FN(copy_from_user_task), \
> + FN(mkdir), \
> + FN(rmdir), \
> + FN(unlink), \
> /* */
>
> /* integer value in 'imm' field of BPF_CALL instruction selects which helper
[...]
On Thu, Mar 3, 2022 at 11:13 AM Yonghong Song <[email protected]> wrote:
>
>
>
> On 3/3/22 10:56 AM, Hao Luo wrote:
> > On Wed, Mar 2, 2022 at 12:55 PM Yonghong Song <[email protected]> wrote:
> >> On 2/25/22 3:43 PM, Hao Luo wrote:
> >>> @@ -5086,6 +5086,29 @@ union bpf_attr {
> >>> * Return
> >>> * 0 on success, or a negative error in case of failure. On error
> >>> * *dst* buffer is zeroed out.
> >>> + *
> >>> + * long bpf_mkdir(const char *pathname, int pathname_sz, u32 mode)
> >>
> >> Can we make pathname_sz to be u32 instead of int? pathname_sz should
> >> never be negative any way.
> >>
> >> Also, I think it is a good idea to add 'u64 flags' parameter for all
> >> three helpers, so we have room in the future to tune for new use cases.
> >>
> >
> > SG. Will make this change.
> >
> > Actually, I think I need to cap patthname_sz from above, to ensure
> > pathname_sz isn't too big. Is that necessary? I see there are other
> > helpers that don't have this type of check.
>
> There is no need. The verifier should ensure the memory held by pathname
> will have at least size of pathname_sz.
>
SG. Thanks!
On Wed, Mar 2, 2022 at 11:34 AM Alexei Starovoitov
<[email protected]> wrote:
>
> On Mon, Feb 28, 2022 at 02:10:39PM -0800, Hao Luo wrote:
> > Hi Kumar,
> >
> > On Sat, Feb 26, 2022 at 9:18 PM Kumar Kartikeya Dwivedi
> > <[email protected]> wrote:
> > >
> > > On Sat, Feb 26, 2022 at 05:13:31AM IST, Hao Luo wrote:
> > > > This patch allows bpf_syscall prog to perform some basic filesystem
> > > > operations: create, remove directories and unlink files. Three bpf
> > > > helpers are added for this purpose. When combined with the following
> > > > patches that allow pinning and getting bpf objects from bpf prog,
> > > > this feature can be used to create directory hierarchy in bpffs that
> > > > help manage bpf objects purely using bpf progs.
> > > >
> > > > The added helpers subject to the same permission checks as their syscall
> > > > version. For example, one can not write to a read-only file system;
> > > > The identity of the current process is checked to see whether it has
> > > > sufficient permission to perform the operations.
> > > >
> > > > Only directories and files in bpffs can be created or removed by these
> > > > helpers. But it won't be too hard to allow these helpers to operate
> > > > on files in other filesystems, if we want.
> > > >
> > > > Signed-off-by: Hao Luo <[email protected]>
> > > > ---
> > > > + *
> > > > + * long bpf_mkdir(const char *pathname, int pathname_sz, u32 mode)
> > > > + * Description
> > > > + * Attempts to create a directory name *pathname*. The argument
> > > > + * *pathname_sz* specifies the length of the string *pathname*.
> > > > + * The argument *mode* specifies the mode for the new directory. It
> > > > + * is modified by the process's umask. It has the same semantic as
> > > > + * the syscall mkdir(2).
> > > > + * Return
> > > > + * 0 on success, or a negative error in case of failure.
> > > > + *
> > > > + * long bpf_rmdir(const char *pathname, int pathname_sz)
> > > > + * Description
> > > > + * Deletes a directory, which must be empty.
> > > > + * Return
> > > > + * 0 on sucess, or a negative error in case of failure.
> > > > + *
> > > > + * long bpf_unlink(const char *pathname, int pathname_sz)
> > > > + * Description
> > > > + * Deletes a name and possibly the file it refers to. It has the
> > > > + * same semantic as the syscall unlink(2).
> > > > + * Return
> > > > + * 0 on success, or a negative error in case of failure.
> > > > */
> > > >
> > >
> > > How about only introducing bpf_sys_mkdirat and bpf_sys_unlinkat? That would be
> > > more useful for other cases in future, and when AT_FDCWD is passed, has the same
> > > functionality as these, but when openat/fget is supported, it would work
> > > relative to other dirfds as well. It can also allow using dirfd of the process
> > > calling read for a iterator (e.g. if it sets the fd number using skel->bss).
> > > unlinkat's AT_REMOVEDIR flag also removes the need for a bpf_rmdir.
> > >
> > > WDYT?
> > >
> >
> > The idea sounds good to me, more flexible. But I don't have a real use
> > case for using the added 'dirfd' at this moment. For all the use cases
> > I can think of, absolute paths will suffice, I think. Unless other
> > reviewers have opposition, I will try switching to mkdirat and
> > unlinkat in v2.
>
> I'm surprised you don't need "at" variants.
> I thought your production setup has a top level cgroup controller and
> then inner tasks inside containers manage cgroups on their own.
> Since containers are involved they likely run inside their own mountns.
> cgroupfs mount is single. So you probably don't even need to bind mount it
> inside containers, but bpffs is not a single mount. You need
> to bind mount top bpffs inside containers for tasks to access it.
> Now for cgroupfs the abs path is not an issue, but for bpffs
> the AT_FDCWD becomes a problem. AT_FDCWD is using current mount ns.
> Inside container that will be different. Unless you bind mount into exact
> same path the full path has different meanings inside and outside of the container.
> It seems to me the bpf progs attached to cgroup sleepable events should
> be using FD of bpffs. Then when these tracepoints are triggered from
> different containers in different mountns they will get the right dir prefix.
> What am I missing?
>
Alexei, you are perfectly right. To be honest, I failed to see the
fact that the sleepable tp prog is in the container's mount ns. I
think we can bind mount the top bpffs into exactly the same path
inside container, right? But I haven't tested this idea. Passing FDs
should be better.
> I think non-AT variants are not needed. The prog can always pass AT_FDCWD
> if it's really the intent, but passing actual FD seems more error-proof.
Let's have the AT version. Passing FD seems the right approach, when
we use it in the context of container.
On 3/3/22 10:56 AM, Hao Luo wrote:
> On Wed, Mar 2, 2022 at 12:55 PM Yonghong Song <[email protected]> wrote:
>>
>>
>>
>> On 2/25/22 3:43 PM, Hao Luo wrote:
>>> This patch allows bpf_syscall prog to perform some basic filesystem
>>> operations: create, remove directories and unlink files. Three bpf
>>> helpers are added for this purpose. When combined with the following
>>> patches that allow pinning and getting bpf objects from bpf prog,
>>> this feature can be used to create directory hierarchy in bpffs that
>>> help manage bpf objects purely using bpf progs.
>>>
>>> The added helpers subject to the same permission checks as their syscall
>>> version. For example, one can not write to a read-only file system;
>>> The identity of the current process is checked to see whether it has
>>> sufficient permission to perform the operations.
>>>
>>> Only directories and files in bpffs can be created or removed by these
>>> helpers. But it won't be too hard to allow these helpers to operate
>>> on files in other filesystems, if we want.
>>>
>>> Signed-off-by: Hao Luo <[email protected]>
>>> ---
>>> include/linux/bpf.h | 1 +
>>> include/uapi/linux/bpf.h | 26 +++++
>>> kernel/bpf/inode.c | 9 +-
>>> kernel/bpf/syscall.c | 177 +++++++++++++++++++++++++++++++++
>>> tools/include/uapi/linux/bpf.h | 26 +++++
>>> 5 files changed, 236 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>>> index f19abc59b6cd..fce5e26179f5 100644
>>> --- a/include/linux/bpf.h
>>> +++ b/include/linux/bpf.h
>>> @@ -1584,6 +1584,7 @@ int bpf_link_new_fd(struct bpf_link *link);
>>> struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd);
>>> struct bpf_link *bpf_link_get_from_fd(u32 ufd);
>>>
>>> +bool bpf_path_is_bpf_dir(const struct path *path);
>>> int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
>>> int bpf_obj_get_user(const char __user *pathname, int flags);
>>>
>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>> index afe3d0d7f5f2..a5dbc794403d 100644
>>> --- a/include/uapi/linux/bpf.h
>>> +++ b/include/uapi/linux/bpf.h
>>> @@ -5086,6 +5086,29 @@ union bpf_attr {
>>> * Return
>>> * 0 on success, or a negative error in case of failure. On error
>>> * *dst* buffer is zeroed out.
>>> + *
>>> + * long bpf_mkdir(const char *pathname, int pathname_sz, u32 mode)
>>
>> Can we make pathname_sz to be u32 instead of int? pathname_sz should
>> never be negative any way.
>>
>> Also, I think it is a good idea to add 'u64 flags' parameter for all
>> three helpers, so we have room in the future to tune for new use cases.
>>
>
> SG. Will make this change.
>
> Actually, I think I need to cap patthname_sz from above, to ensure
> pathname_sz isn't too big. Is that necessary? I see there are other
> helpers that don't have this type of check.
There is no need. The verifier should ensure the memory held by pathname
will have at least size of pathname_sz.
>
>>> + * Description
>>> + * Attempts to create a directory name *pathname*. The argument
>>> + * *pathname_sz* specifies the length of the string *pathname*.
>>> + * The argument *mode* specifies the mode for the new directory. It
>>> + * is modified by the process's umask. It has the same semantic as
>>> + * the syscall mkdir(2).
>>> + * Return
>>> + * 0 on success, or a negative error in case of failure.
>>> + *
>>> + * long bpf_rmdir(const char *pathname, int pathname_sz)
>>> + * Description
>>> + * Deletes a directory, which must be empty.
>>> + * Return
>>> + * 0 on sucess, or a negative error in case of failure.
>>> + *
>>> + * long bpf_unlink(const char *pathname, int pathname_sz)
>>> + * Description
>>> + * Deletes a name and possibly the file it refers to. It has the
>>> + * same semantic as the syscall unlink(2).
>>> + * Return
>>> + * 0 on success, or a negative error in case of failure.
>>> */
>>> #define __BPF_FUNC_MAPPER(FN) \
>>> FN(unspec), \
>>> @@ -5280,6 +5303,9 @@ union bpf_attr {
>>> FN(xdp_load_bytes), \
>>> FN(xdp_store_bytes), \
>>> FN(copy_from_user_task), \
>>> + FN(mkdir), \
>>> + FN(rmdir), \
>>> + FN(unlink), \
>>> /* */
>>>
>>> /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>> [...]
On Wed, Mar 2, 2022 at 12:55 PM Yonghong Song <[email protected]> wrote:
>
>
>
> On 2/25/22 3:43 PM, Hao Luo wrote:
> > This patch allows bpf_syscall prog to perform some basic filesystem
> > operations: create, remove directories and unlink files. Three bpf
> > helpers are added for this purpose. When combined with the following
> > patches that allow pinning and getting bpf objects from bpf prog,
> > this feature can be used to create directory hierarchy in bpffs that
> > help manage bpf objects purely using bpf progs.
> >
> > The added helpers subject to the same permission checks as their syscall
> > version. For example, one can not write to a read-only file system;
> > The identity of the current process is checked to see whether it has
> > sufficient permission to perform the operations.
> >
> > Only directories and files in bpffs can be created or removed by these
> > helpers. But it won't be too hard to allow these helpers to operate
> > on files in other filesystems, if we want.
> >
> > Signed-off-by: Hao Luo <[email protected]>
> > ---
> > include/linux/bpf.h | 1 +
> > include/uapi/linux/bpf.h | 26 +++++
> > kernel/bpf/inode.c | 9 +-
> > kernel/bpf/syscall.c | 177 +++++++++++++++++++++++++++++++++
> > tools/include/uapi/linux/bpf.h | 26 +++++
> > 5 files changed, 236 insertions(+), 3 deletions(-)
> >
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index f19abc59b6cd..fce5e26179f5 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -1584,6 +1584,7 @@ int bpf_link_new_fd(struct bpf_link *link);
> > struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd);
> > struct bpf_link *bpf_link_get_from_fd(u32 ufd);
> >
> > +bool bpf_path_is_bpf_dir(const struct path *path);
> > int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
> > int bpf_obj_get_user(const char __user *pathname, int flags);
> >
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index afe3d0d7f5f2..a5dbc794403d 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -5086,6 +5086,29 @@ union bpf_attr {
> > * Return
> > * 0 on success, or a negative error in case of failure. On error
> > * *dst* buffer is zeroed out.
> > + *
> > + * long bpf_mkdir(const char *pathname, int pathname_sz, u32 mode)
>
> Can we make pathname_sz to be u32 instead of int? pathname_sz should
> never be negative any way.
>
> Also, I think it is a good idea to add 'u64 flags' parameter for all
> three helpers, so we have room in the future to tune for new use cases.
>
SG. Will make this change.
Actually, I think I need to cap patthname_sz from above, to ensure
pathname_sz isn't too big. Is that necessary? I see there are other
helpers that don't have this type of check.
> > + * Description
> > + * Attempts to create a directory name *pathname*. The argument
> > + * *pathname_sz* specifies the length of the string *pathname*.
> > + * The argument *mode* specifies the mode for the new directory. It
> > + * is modified by the process's umask. It has the same semantic as
> > + * the syscall mkdir(2).
> > + * Return
> > + * 0 on success, or a negative error in case of failure.
> > + *
> > + * long bpf_rmdir(const char *pathname, int pathname_sz)
> > + * Description
> > + * Deletes a directory, which must be empty.
> > + * Return
> > + * 0 on sucess, or a negative error in case of failure.
> > + *
> > + * long bpf_unlink(const char *pathname, int pathname_sz)
> > + * Description
> > + * Deletes a name and possibly the file it refers to. It has the
> > + * same semantic as the syscall unlink(2).
> > + * Return
> > + * 0 on success, or a negative error in case of failure.
> > */
> > #define __BPF_FUNC_MAPPER(FN) \
> > FN(unspec), \
> > @@ -5280,6 +5303,9 @@ union bpf_attr {
> > FN(xdp_load_bytes), \
> > FN(xdp_store_bytes), \
> > FN(copy_from_user_task), \
> > + FN(mkdir), \
> > + FN(rmdir), \
> > + FN(unlink), \
> > /* */
> >
> > /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> [...]
On Thu, Mar 3, 2022 at 10:50 AM Hao Luo <[email protected]> wrote:
>
> On Wed, Mar 2, 2022 at 11:34 AM Alexei Starovoitov
> <[email protected]> wrote:
> >
> > On Mon, Feb 28, 2022 at 02:10:39PM -0800, Hao Luo wrote:
> > > Hi Kumar,
> > >
> > > On Sat, Feb 26, 2022 at 9:18 PM Kumar Kartikeya Dwivedi
> > > <[email protected]> wrote:
> > > >
> > > > On Sat, Feb 26, 2022 at 05:13:31AM IST, Hao Luo wrote:
> > > > > This patch allows bpf_syscall prog to perform some basic filesystem
> > > > > operations: create, remove directories and unlink files. Three bpf
> > > > > helpers are added for this purpose. When combined with the following
> > > > > patches that allow pinning and getting bpf objects from bpf prog,
> > > > > this feature can be used to create directory hierarchy in bpffs that
> > > > > help manage bpf objects purely using bpf progs.
> > > > >
> > > > > The added helpers subject to the same permission checks as their syscall
> > > > > version. For example, one can not write to a read-only file system;
> > > > > The identity of the current process is checked to see whether it has
> > > > > sufficient permission to perform the operations.
> > > > >
> > > > > Only directories and files in bpffs can be created or removed by these
> > > > > helpers. But it won't be too hard to allow these helpers to operate
> > > > > on files in other filesystems, if we want.
> > > > >
> > > > > Signed-off-by: Hao Luo <[email protected]>
> > > > > ---
> > > > > + *
> > > > > + * long bpf_mkdir(const char *pathname, int pathname_sz, u32 mode)
> > > > > + * Description
> > > > > + * Attempts to create a directory name *pathname*. The argument
> > > > > + * *pathname_sz* specifies the length of the string *pathname*.
> > > > > + * The argument *mode* specifies the mode for the new directory. It
> > > > > + * is modified by the process's umask. It has the same semantic as
> > > > > + * the syscall mkdir(2).
> > > > > + * Return
> > > > > + * 0 on success, or a negative error in case of failure.
> > > > > + *
> > > > > + * long bpf_rmdir(const char *pathname, int pathname_sz)
> > > > > + * Description
> > > > > + * Deletes a directory, which must be empty.
> > > > > + * Return
> > > > > + * 0 on sucess, or a negative error in case of failure.
> > > > > + *
> > > > > + * long bpf_unlink(const char *pathname, int pathname_sz)
> > > > > + * Description
> > > > > + * Deletes a name and possibly the file it refers to. It has the
> > > > > + * same semantic as the syscall unlink(2).
> > > > > + * Return
> > > > > + * 0 on success, or a negative error in case of failure.
> > > > > */
> > > > >
> > > >
> > > > How about only introducing bpf_sys_mkdirat and bpf_sys_unlinkat? That would be
> > > > more useful for other cases in future, and when AT_FDCWD is passed, has the same
> > > > functionality as these, but when openat/fget is supported, it would work
> > > > relative to other dirfds as well. It can also allow using dirfd of the process
> > > > calling read for a iterator (e.g. if it sets the fd number using skel->bss).
> > > > unlinkat's AT_REMOVEDIR flag also removes the need for a bpf_rmdir.
> > > >
> > > > WDYT?
> > > >
> > >
> > > The idea sounds good to me, more flexible. But I don't have a real use
> > > case for using the added 'dirfd' at this moment. For all the use cases
> > > I can think of, absolute paths will suffice, I think. Unless other
> > > reviewers have opposition, I will try switching to mkdirat and
> > > unlinkat in v2.
> >
> > I'm surprised you don't need "at" variants.
> > I thought your production setup has a top level cgroup controller and
> > then inner tasks inside containers manage cgroups on their own.
> > Since containers are involved they likely run inside their own mountns.
> > cgroupfs mount is single. So you probably don't even need to bind mount it
> > inside containers, but bpffs is not a single mount. You need
> > to bind mount top bpffs inside containers for tasks to access it.
> > Now for cgroupfs the abs path is not an issue, but for bpffs
> > the AT_FDCWD becomes a problem. AT_FDCWD is using current mount ns.
> > Inside container that will be different. Unless you bind mount into exact
> > same path the full path has different meanings inside and outside of the container.
> > It seems to me the bpf progs attached to cgroup sleepable events should
> > be using FD of bpffs. Then when these tracepoints are triggered from
> > different containers in different mountns they will get the right dir prefix.
> > What am I missing?
> >
>
> Alexei, you are perfectly right. To be honest, I failed to see the
> fact that the sleepable tp prog is in the container's mount ns. I
> think we can bind mount the top bpffs into exactly the same path
> inside container, right? But I haven't tested this idea. Passing FDs
> should be better.
>
I gave this question more thought. We don't need to bind mount the top
bpffs into the container, instead, we may be able to overlay a bpffs
directory into the container. Here is the workflow in my mind:
For each job, let's say A, the container runtime can create a
directory in bpffs, for example
/sys/fs/bpf/jobs/A
and then create the cgroup for A. The sleepable tracing prog will
create the file:
/sys/fs/bpf/jobs/A/100/stats
100 is the created cgroup's id. Then the container runtime overlays
the bpffs directory into container A in the same path:
[A's container path]/sys/fs/bpf/jobs/A.
A can see the stats at the path within its mount ns:
/sys/fs/bpf/jobs/A/100/stats
When A creates cgroup, it is able to write to the top layer of the
overlayed directory. So it is
/sys/fs/bpf/jobs/A/101/stats
Some of my thoughts:
1. Compared to bind mount top bpffs into container, overlaying a
directory avoids exposing other jobs' stats. This gives better
isolation. I already have a patch for supporting laying bpffs over
other fs, it's not too hard.
2. Once the container runtime has overlayed directory into the
container, it has no need to create more cgroups for this job. It
doesn't need to track the stats of job-created cgroups, which are
mainly for inspection by the job itself. Even if it needs to collect
the stats from those cgroups, it can read from the path in the
container.
3. The overlay path in container doesn't have to be exactly the same
as the path in root mount ns. In the sleepable tracing prog, we may
select paths based on current process's ns. If we choose to do this,
we can further avoid exposing cgroup id and job name to the container.
> > I think non-AT variants are not needed. The prog can always pass AT_FDCWD
> > if it's really the intent, but passing actual FD seems more error-proof.
>
> Let's have the AT version. Passing FD seems the right approach, when
> we use it in the context of container.
On Fri, Mar 4, 2022 at 10:37 AM Hao Luo <[email protected]> wrote:
>
> I gave this question more thought. We don't need to bind mount the top
> bpffs into the container, instead, we may be able to overlay a bpffs
> directory into the container. Here is the workflow in my mind:
I don't quite follow what you mean by 'overlay' here.
Another bpffs mount or future overlayfs that supports bpffs?
> For each job, let's say A, the container runtime can create a
> directory in bpffs, for example
>
> /sys/fs/bpf/jobs/A
>
> and then create the cgroup for A. The sleepable tracing prog will
> create the file:
>
> /sys/fs/bpf/jobs/A/100/stats
>
> 100 is the created cgroup's id. Then the container runtime overlays
> the bpffs directory into container A in the same path:
Why cgroup id ? Wouldn't it be easier to use the same cgroup name
as in cgroupfs ?
> [A's container path]/sys/fs/bpf/jobs/A.
>
> A can see the stats at the path within its mount ns:
>
> /sys/fs/bpf/jobs/A/100/stats
>
> When A creates cgroup, it is able to write to the top layer of the
> overlayed directory. So it is
>
> /sys/fs/bpf/jobs/A/101/stats
>
> Some of my thoughts:
> 1. Compared to bind mount top bpffs into container, overlaying a
> directory avoids exposing other jobs' stats. This gives better
> isolation. I already have a patch for supporting laying bpffs over
> other fs, it's not too hard.
So it's overlayfs combination of bpffs and something like ext4, right?
I thought you found out that overlaryfs has to be upper fs
and lower fs shouldn't be modified underneath.
So if bpffs is a lower fs the writes into it should go
through the upper overlayfs, right?
> 2. Once the container runtime has overlayed directory into the
> container, it has no need to create more cgroups for this job. It
> doesn't need to track the stats of job-created cgroups, which are
> mainly for inspection by the job itself. Even if it needs to collect
> the stats from those cgroups, it can read from the path in the
> container.
> 3. The overlay path in container doesn't have to be exactly the same
> as the path in root mount ns. In the sleepable tracing prog, we may
> select paths based on current process's ns. If we choose to do this,
> we can further avoid exposing cgroup id and job name to the container.
The benefits make sense.
On Sat, Mar 5, 2022 at 3:47 PM Alexei Starovoitov
<[email protected]> wrote:
>
> On Fri, Mar 4, 2022 at 10:37 AM Hao Luo <[email protected]> wrote:
> >
> > I gave this question more thought. We don't need to bind mount the top
> > bpffs into the container, instead, we may be able to overlay a bpffs
> > directory into the container. Here is the workflow in my mind:
>
> I don't quite follow what you mean by 'overlay' here.
> Another bpffs mount or future overlayfs that supports bpffs?
>
> > For each job, let's say A, the container runtime can create a
> > directory in bpffs, for example
> >
> > /sys/fs/bpf/jobs/A
> >
> > and then create the cgroup for A. The sleepable tracing prog will
> > create the file:
> >
> > /sys/fs/bpf/jobs/A/100/stats
> >
> > 100 is the created cgroup's id. Then the container runtime overlays
> > the bpffs directory into container A in the same path:
>
> Why cgroup id ? Wouldn't it be easier to use the same cgroup name
> as in cgroupfs ?
>
Cgroup name isn't unique. We don't need the hierarchy information of
cgroups. We can use a library function to translate cgroup path to
cgroup id. See the get_cgroup_id() in patch 9/9. It works fine in the
selftest.
> > [A's container path]/sys/fs/bpf/jobs/A.
> >
> > A can see the stats at the path within its mount ns:
> >
> > /sys/fs/bpf/jobs/A/100/stats
> >
> > When A creates cgroup, it is able to write to the top layer of the
> > overlayed directory. So it is
> >
> > /sys/fs/bpf/jobs/A/101/stats
> >
> > Some of my thoughts:
> > 1. Compared to bind mount top bpffs into container, overlaying a
> > directory avoids exposing other jobs' stats. This gives better
> > isolation. I already have a patch for supporting laying bpffs over
> > other fs, it's not too hard.
>
> So it's overlayfs combination of bpffs and something like ext4, right?
> I thought you found out that overlaryfs has to be upper fs
> and lower fs shouldn't be modified underneath.
> So if bpffs is a lower fs the writes into it should go
> through the upper overlayfs, right?
>
It's overlayfs combining bpffs and ext4. Bpffs is the upper layer. The
lower layer is an empty ext4 directory. The merged directory is a
directory in the container.
The upper layer contains bpf objects that we want to expose to the
container, for example, the sleepable tracing progs and the iter link
for reading stats. Only the merged directory is visible to the
container and all the updates go through the merged directory.
The following is the example of workflow I'm thinking:
Step 1: We first set up directories and bpf objects needed by containers.
[# ~] ls /sys/fs/bpf/container/upper
tracing_prog iter_link
[# ~] ls /sys/fs/bpf/container/work
[# ~] ls /container
root lower
[# ~] ls /container/root
bpf
[# ~] ls /container/root/bpf
Step 2: Use overlayfs to mount a directory from bpffs into the container's home.
[# ~] mkdir /container/lower
[# ~] mkdir /sys/fs/bpf/container/workdir
[# ~] mount -t overlay overlay -o \
lowerdir=/container/lower,\
upperdir=/sys/fs/bpf/container/upper,\
workdir=/sys/fs/bpf/container/work \
/container/root/bpf
[# ~] ls /container/root/bpf
tracing_prog iter_link
Step 3: pivot root for container, we expect to see the bpf objects are
mapped into container,
[# ~] chroot /container/root
[# ~] ls /
bpf
[# ~] ls /bpf
tracing_prog iter_link
Note:
- I haven't tested Step 3. But Step 1 and step 2 seem to be working as
expected. I am testing the behaviors of the bpf objects, after we
enter the container.
- Only a directory in bpffs is mapped into the container, not the top
bpffs. The path is uniform in all containers, that is, /bpf. The
container should be able to mkdir in /bpf, etc.
> > 2. Once the container runtime has overlayed directory into the
> > container, it has no need to create more cgroups for this job. It
> > doesn't need to track the stats of job-created cgroups, which are
> > mainly for inspection by the job itself. Even if it needs to collect
> > the stats from those cgroups, it can read from the path in the
> > container.
> > 3. The overlay path in container doesn't have to be exactly the same
> > as the path in root mount ns. In the sleepable tracing prog, we may
> > select paths based on current process's ns. If we choose to do this,
> > we can further avoid exposing cgroup id and job name to the container.
>
> The benefits make sense.
On Fri, Feb 25, 2022 at 03:43:31PM -0800, Hao Luo wrote:
> This patch allows bpf_syscall prog to perform some basic filesystem
> operations: create, remove directories and unlink files. Three bpf
> helpers are added for this purpose. When combined with the following
> patches that allow pinning and getting bpf objects from bpf prog,
> this feature can be used to create directory hierarchy in bpffs that
> help manage bpf objects purely using bpf progs.
>
> The added helpers subject to the same permission checks as their syscall
> version. For example, one can not write to a read-only file system;
> The identity of the current process is checked to see whether it has
> sufficient permission to perform the operations.
>
> Only directories and files in bpffs can be created or removed by these
> helpers. But it won't be too hard to allow these helpers to operate
> on files in other filesystems, if we want.
In which contexts can those be called?
> +BPF_CALL_2(bpf_rmdir, const char *, pathname, int, pathname_sz)
> +{
> + struct user_namespace *mnt_userns;
> + struct path parent;
> + struct dentry *dentry;
> + int err;
> +
> + if (pathname_sz <= 1 || pathname[pathname_sz - 1])
> + return -EINVAL;
> +
> + err = kern_path(pathname, 0, &parent);
> + if (err)
> + return err;
> +
> + if (!bpf_path_is_bpf_dir(&parent)) {
> + err = -EPERM;
> + goto exit1;
> + }
> +
> + err = mnt_want_write(parent.mnt);
> + if (err)
> + goto exit1;
> +
> + dentry = kern_path_locked(pathname, &parent);
This can't be right. Ever. There is no promise whatsoever
that these two lookups will resolve to the same place.
> +BPF_CALL_2(bpf_unlink, const char *, pathname, int, pathname_sz)
> +{
> + struct user_namespace *mnt_userns;
> + struct path parent;
> + struct dentry *dentry;
> + struct inode *inode = NULL;
> + int err;
> +
> + if (pathname_sz <= 1 || pathname[pathname_sz - 1])
> + return -EINVAL;
> +
> + err = kern_path(pathname, 0, &parent);
> + if (err)
> + return err;
> +
> + err = mnt_want_write(parent.mnt);
> + if (err)
> + goto exit1;
> +
> + dentry = kern_path_locked(pathname, &parent);
> + if (IS_ERR(dentry)) {
> + err = PTR_ERR(dentry);
> + goto exit2;
> + }
Ditto. NAK; if you want to poke into fs/namei.c guts, do it right.
Or at least discuss that on fsdevel. As it is, it's completely broken.
It's racy *and* it blatantly leaks both vfsmount and dentry references.
NAKed-by: Al Viro <[email protected]>
Hello Al,
On Fri, Mar 11, 2022 at 7:46 PM Al Viro <[email protected]> wrote:
>
> On Fri, Feb 25, 2022 at 03:43:31PM -0800, Hao Luo wrote:
> > This patch allows bpf_syscall prog to perform some basic filesystem
> > operations: create, remove directories and unlink files. Three bpf
> > helpers are added for this purpose. When combined with the following
> > patches that allow pinning and getting bpf objects from bpf prog,
> > this feature can be used to create directory hierarchy in bpffs that
> > help manage bpf objects purely using bpf progs.
> >
> > The added helpers subject to the same permission checks as their syscall
> > version. For example, one can not write to a read-only file system;
> > The identity of the current process is checked to see whether it has
> > sufficient permission to perform the operations.
> >
> > Only directories and files in bpffs can be created or removed by these
> > helpers. But it won't be too hard to allow these helpers to operate
> > on files in other filesystems, if we want.
>
> In which contexts can those be called?
>
In a sleepable context. The plan is to introduce a certain tracepoints
as sleepable, a program that attaches to sleepable tracepoints is
allowed to call these functions. In particular, the first sleepable
tracepoint introduced in this patchset is one at the end of
cgroup_mkdir(). Do you have any advices?
> > +BPF_CALL_2(bpf_rmdir, const char *, pathname, int, pathname_sz)
> > +{
> > + struct user_namespace *mnt_userns;
> > + struct path parent;
> > + struct dentry *dentry;
> > + int err;
> > +
> > + if (pathname_sz <= 1 || pathname[pathname_sz - 1])
> > + return -EINVAL;
> > +
> > + err = kern_path(pathname, 0, &parent);
> > + if (err)
> > + return err;
> > +
> > + if (!bpf_path_is_bpf_dir(&parent)) {
> > + err = -EPERM;
> > + goto exit1;
> > + }
> > +
> > + err = mnt_want_write(parent.mnt);
> > + if (err)
> > + goto exit1;
> > +
> > + dentry = kern_path_locked(pathname, &parent);
>
> This can't be right. Ever. There is no promise whatsoever
> that these two lookups will resolve to the same place.
>
> > +BPF_CALL_2(bpf_unlink, const char *, pathname, int, pathname_sz)
> > +{
> > + struct user_namespace *mnt_userns;
> > + struct path parent;
> > + struct dentry *dentry;
> > + struct inode *inode = NULL;
> > + int err;
> > +
> > + if (pathname_sz <= 1 || pathname[pathname_sz - 1])
> > + return -EINVAL;
> > +
> > + err = kern_path(pathname, 0, &parent);
> > + if (err)
> > + return err;
> > +
> > + err = mnt_want_write(parent.mnt);
> > + if (err)
> > + goto exit1;
> > +
> > + dentry = kern_path_locked(pathname, &parent);
> > + if (IS_ERR(dentry)) {
> > + err = PTR_ERR(dentry);
> > + goto exit2;
> > + }
>
> Ditto. NAK; if you want to poke into fs/namei.c guts, do it right.
> Or at least discuss that on fsdevel. As it is, it's completely broken.
> It's racy *and* it blatantly leaks both vfsmount and dentry references.
>
> NAKed-by: Al Viro <[email protected]>
Thanks Al for taking a look. Actually, there is a simpler approach:
can we export two functions in namei.c that wrap call to do_mkdirat
and do_unlinkat, but take a kernel string as pathname? Then these two
bpf helpers can use them, don't have to be this complicated. Does this
sound good to you?
Thanks!
On Mon, Mar 14, 2022 at 10:07:31AM -0700, Hao Luo wrote:
> Hello Al,
> > In which contexts can those be called?
> >
>
> In a sleepable context. The plan is to introduce a certain tracepoints
> as sleepable, a program that attaches to sleepable tracepoints is
> allowed to call these functions. In particular, the first sleepable
> tracepoint introduced in this patchset is one at the end of
> cgroup_mkdir(). Do you have any advices?
Yes - don't do it, unless you really want a lot of user-triggerable
deadlocks.
Pathname resolution is not locking-agnostic. In particular, you can't
do it if you are under any ->i_rwsem, whether it's shared or exclusive.
That includes cgroup_mkdir() callchains. And if the pathname passed
to these functions will have you walk through the parent directory,
you would get screwed (e.g. if the next component happens to be
inexistent, triggering a lookup, which takes ->i_rwsem shared).
On Mon, Mar 14, 2022 at 4:12 PM Al Viro <[email protected]> wrote:
>
> On Mon, Mar 14, 2022 at 10:07:31AM -0700, Hao Luo wrote:
> > Hello Al,
>
> > > In which contexts can those be called?
> > >
> >
> > In a sleepable context. The plan is to introduce a certain tracepoints
> > as sleepable, a program that attaches to sleepable tracepoints is
> > allowed to call these functions. In particular, the first sleepable
> > tracepoint introduced in this patchset is one at the end of
> > cgroup_mkdir(). Do you have any advices?
>
> Yes - don't do it, unless you really want a lot of user-triggerable
> deadlocks.
>
> Pathname resolution is not locking-agnostic. In particular, you can't
> do it if you are under any ->i_rwsem, whether it's shared or exclusive.
> That includes cgroup_mkdir() callchains. And if the pathname passed
> to these functions will have you walk through the parent directory,
> you would get screwed (e.g. if the next component happens to be
> inexistent, triggering a lookup, which takes ->i_rwsem shared).
I'm thinking of two options, let's see if either can work out:
Option 1: We can put restrictions on the pathname passed into this
helper. We can explicitly require the parameter dirfd to be in bpffs
(we can verify). In addition, we check pathname to be not containing
any dot or dotdot, so the resolved path will end up inside bpffs,
therefore won't take ->i_rwsem that is in the callchain of
cgroup_mkdir().
Option 2: We can avoid pathname resolution entirely. Like above, we
can adjust the semantics of this helper to be: making an immediate
directory under the dirfd passed in. In particular, like above, we can
enforce the dirfd to be in bpffs and pathname to consist of only
alphabet and numbers. With these restrictions, we call vfs_mkdir() to
create directories.
Being able to mkdir from bpf has useful use cases, let's try to make
it happen even with many limitations.
Thanks!
On Tue, Mar 15, 2022 at 10:27 AM Hao Luo <[email protected]> wrote:
>
> On Mon, Mar 14, 2022 at 4:12 PM Al Viro <[email protected]> wrote:
> >
> > On Mon, Mar 14, 2022 at 10:07:31AM -0700, Hao Luo wrote:
> > > Hello Al,
> >
> > > > In which contexts can those be called?
> > > >
> > >
> > > In a sleepable context. The plan is to introduce a certain tracepoints
> > > as sleepable, a program that attaches to sleepable tracepoints is
> > > allowed to call these functions. In particular, the first sleepable
> > > tracepoint introduced in this patchset is one at the end of
> > > cgroup_mkdir(). Do you have any advices?
> >
> > Yes - don't do it, unless you really want a lot of user-triggerable
> > deadlocks.
> >
> > Pathname resolution is not locking-agnostic. In particular, you can't
> > do it if you are under any ->i_rwsem, whether it's shared or exclusive.
> > That includes cgroup_mkdir() callchains. And if the pathname passed
> > to these functions will have you walk through the parent directory,
> > you would get screwed (e.g. if the next component happens to be
> > inexistent, triggering a lookup, which takes ->i_rwsem shared).
>
> I'm thinking of two options, let's see if either can work out:
>
> Option 1: We can put restrictions on the pathname passed into this
> helper. We can explicitly require the parameter dirfd to be in bpffs
> (we can verify). In addition, we check pathname to be not containing
> any dot or dotdot, so the resolved path will end up inside bpffs,
> therefore won't take ->i_rwsem that is in the callchain of
> cgroup_mkdir().
>
> Option 2: We can avoid pathname resolution entirely. Like above, we
> can adjust the semantics of this helper to be: making an immediate
> directory under the dirfd passed in. In particular, like above, we can
> enforce the dirfd to be in bpffs and pathname to consist of only
> alphabet and numbers. With these restrictions, we call vfs_mkdir() to
> create directories.
>
> Being able to mkdir from bpf has useful use cases, let's try to make
> it happen even with many limitations.
Option 3. delegate vfs_mkdir to a worker and wait in the helper.
On Tue, Mar 15, 2022 at 11:59 AM Alexei Starovoitov
<[email protected]> wrote:
>
> On Tue, Mar 15, 2022 at 10:27 AM Hao Luo <[email protected]> wrote:
> >
> > On Mon, Mar 14, 2022 at 4:12 PM Al Viro <[email protected]> wrote:
> > >
> > > On Mon, Mar 14, 2022 at 10:07:31AM -0700, Hao Luo wrote:
> > > > Hello Al,
> > >
> > > > > In which contexts can those be called?
> > > > >
> > > >
> > > > In a sleepable context. The plan is to introduce a certain tracepoints
> > > > as sleepable, a program that attaches to sleepable tracepoints is
> > > > allowed to call these functions. In particular, the first sleepable
> > > > tracepoint introduced in this patchset is one at the end of
> > > > cgroup_mkdir(). Do you have any advices?
> > >
> > > Yes - don't do it, unless you really want a lot of user-triggerable
> > > deadlocks.
> > >
> > > Pathname resolution is not locking-agnostic. In particular, you can't
> > > do it if you are under any ->i_rwsem, whether it's shared or exclusive.
> > > That includes cgroup_mkdir() callchains. And if the pathname passed
> > > to these functions will have you walk through the parent directory,
> > > you would get screwed (e.g. if the next component happens to be
> > > inexistent, triggering a lookup, which takes ->i_rwsem shared).
> >
> > I'm thinking of two options, let's see if either can work out:
> >
> > Option 1: We can put restrictions on the pathname passed into this
> > helper. We can explicitly require the parameter dirfd to be in bpffs
> > (we can verify). In addition, we check pathname to be not containing
> > any dot or dotdot, so the resolved path will end up inside bpffs,
> > therefore won't take ->i_rwsem that is in the callchain of
> > cgroup_mkdir().
> >
> > Option 2: We can avoid pathname resolution entirely. Like above, we
> > can adjust the semantics of this helper to be: making an immediate
> > directory under the dirfd passed in. In particular, like above, we can
> > enforce the dirfd to be in bpffs and pathname to consist of only
> > alphabet and numbers. With these restrictions, we call vfs_mkdir() to
> > create directories.
> >
> > Being able to mkdir from bpf has useful use cases, let's try to make
> > it happen even with many limitations.
>
> Option 3. delegate vfs_mkdir to a worker and wait in the helper.
I meant _dont_ wait, of course.
On Tue, Mar 15, 2022 at 12:02 PM Al Viro <[email protected]> wrote:
>
> On Tue, Mar 15, 2022 at 10:27:39AM -0700, Hao Luo wrote:
>
> > Option 1: We can put restrictions on the pathname passed into this
> > helper. We can explicitly require the parameter dirfd to be in bpffs
> > (we can verify). In addition, we check pathname to be not containing
> > any dot or dotdot, so the resolved path will end up inside bpffs,
> > therefore won't take ->i_rwsem that is in the callchain of
> > cgroup_mkdir().
>
> Won't be enough - mount --bind the parent under itself and there you go...
> Sure, you could prohibit mountpoint crossing, etc., but at that point
> I'd question the usefulness of pathname resolution in the first place.
[Apologies for resend, my response did not get delivered to mail list]
I don't see a use case where we need to bind mount the directories in
bpffs, right now. So in option 1, we can also prohibit mountpoint
crossing.
Pathname resolution is still useful in this case. Imagine we want to
put all the created dirs under a base dir, we can open the base dir
and reuse its fd for multiple mkdirs, for example:
Userspace:
fd = openat(..., "/sys/fs/bpf", ...);
pass fd to the bpf prog
bpf prog:
bpf_mkdirat(fd, "common1", ...);
bpf_mkdirat(fd, "common1/a", ...);
bpf_mkdirat(fd, "common1/b", ...);
bpf_mkdirat(fd, "common2", ...);
...
It would be very inconvenient if we can't resolve multi-level paths.
As Alexei said, another option is to delegate syscall to a worker
thread. IMHO, we could do that in future if we find there is a need
for the full feature of pathname resolution.
Al, does that sound good?
On Tue, Mar 15, 2022 at 10:27:39AM -0700, Hao Luo wrote:
> Option 1: We can put restrictions on the pathname passed into this
> helper. We can explicitly require the parameter dirfd to be in bpffs
> (we can verify). In addition, we check pathname to be not containing
> any dot or dotdot, so the resolved path will end up inside bpffs,
> therefore won't take ->i_rwsem that is in the callchain of
> cgroup_mkdir().
Won't be enough - mount --bind the parent under itself and there you go...
Sure, you could prohibit mountpoint crossing, etc., but at that point
I'd question the usefulness of pathname resolution in the first place.