Hi,
It was noticed that some workloads suffer from contention on
increasing/decrementing the ->usage counter in their credentials,
those refcount operations are associated with overriding/reverting the
current task credentials. (the linked thread adds more context)
In some specialized cases, overlayfs is one of them, the credentials
in question have a longer lifetime than the override/revert "critical
section". In the overlayfs case, the credentials are created when the
fs is mounted and destroyed when it's unmounted. In this case of long
lived credentials, the usage counter doesn't need to be
incremented/decremented.
Add a lighter version of credentials override/revert to be used in
these specialized cases. To make sure that the override/revert calls
are paired, add a cleanup guard macro. This was suggested here:
https://lore.kernel.org/all/20231219-marken-pochen-26d888fb9bb9@brauner/
With a small number of tweaks:
- Used inline functions instead of macros;
- A small change to store the credentials into the passed argument,
the guard is now defined as (note the added '_T ='):
DEFINE_GUARD(cred, const struct cred *, _T = override_creds_light(_T),
revert_creds_light(_T));
- Allow "const" arguments to be used with these kind of guards;
Some comments:
- If patch 1/4 is not a good idea (adding the cast), the alternative
I can see is using some kind of container for the credentials;
- The only user for the backing file ops is overlayfs, so these
changes make sense, but may not make sense in the most general
case;
For the numbers, some from 'perf c2c', before this series:
(edited to fit)
#
# ----- HITM ----- Shared
# Num RmtHitm LclHitm Symbol Object Source:Line Node
# ..... ....... ....... .......................... ................ .................. ....
#
-------------------------
0 412 1028
-------------------------
41.50% 42.22% [k] revert_creds [kernel.vmlinux] atomic64_64.h:39 0 1
15.05% 10.60% [k] override_creds [kernel.vmlinux] atomic64_64.h:25 0 1
0.73% 0.58% [k] init_file [kernel.vmlinux] atomic64_64.h:25 0 1
0.24% 0.10% [k] revert_creds [kernel.vmlinux] cred.h:266 0 1
32.28% 37.16% [k] generic_permission [kernel.vmlinux] mnt_idmapping.h:81 0 1
9.47% 8.75% [k] generic_permission [kernel.vmlinux] mnt_idmapping.h:81 0 1
0.49% 0.58% [k] inode_owner_or_capable [kernel.vmlinux] mnt_idmapping.h:81 0 1
0.24% 0.00% [k] generic_permission [kernel.vmlinux] namei.c:354 0
-------------------------
1 50 103
-------------------------
100.00% 100.00% [k] update_cfs_group [kernel.vmlinux] atomic64_64.h:15 0 1
-------------------------
2 50 98
-------------------------
96.00% 96.94% [k] update_cfs_group [kernel.vmlinux] atomic64_64.h:15 0 1
2.00% 1.02% [k] update_load_avg [kernel.vmlinux] atomic64_64.h:25 0 1
0.00% 2.04% [k] update_load_avg [kernel.vmlinux] fair.c:4118 0
2.00% 0.00% [k] update_cfs_group [kernel.vmlinux] fair.c:3932 0 1
after this series:
#
# ----- HITM ----- Shared
# Num RmtHitm LclHitm Symbol Object Source:Line Node
# ..... ....... ....... .................... ................ ................ ....
#
-------------------------
0 54 88
-------------------------
100.00% 100.00% [k] update_cfs_group [kernel.vmlinux] atomic64_64.h:15 0 1
-------------------------
1 48 83
-------------------------
97.92% 97.59% [k] update_cfs_group [kernel.vmlinux] atomic64_64.h:15 0 1
2.08% 1.20% [k] update_load_avg [kernel.vmlinux] atomic64_64.h:25 0 1
0.00% 1.20% [k] update_load_avg [kernel.vmlinux] fair.c:4118 0 1
-------------------------
2 28 44
-------------------------
85.71% 79.55% [k] generic_permission [kernel.vmlinux] mnt_idmapping.h:81 0 1
14.29% 20.45% [k] generic_permission [kernel.vmlinux] mnt_idmapping.h:81 0 1
The contention is practically gone.
Link: https://lore.kernel.org/all/[email protected]/
Vinicius Costa Gomes (4):
cleanup: Fix discarded const warning when defining guards
cred: Add a light version of override/revert_creds()
overlayfs: Optimize credentials usage
fs: Optimize credentials reference count for backing file ops
fs/backing-file.c | 124 +++++++++++++++++++---------------------
fs/overlayfs/copy_up.c | 4 +-
fs/overlayfs/dir.c | 22 +++----
fs/overlayfs/file.c | 70 ++++++++++++-----------
fs/overlayfs/inode.c | 60 ++++++++++---------
fs/overlayfs/namei.c | 21 ++++---
fs/overlayfs/readdir.c | 18 +++---
fs/overlayfs/util.c | 23 ++++----
fs/overlayfs/xattrs.c | 34 +++++------
include/linux/cleanup.h | 2 +-
include/linux/cred.h | 21 +++++++
kernel/cred.c | 6 +-
12 files changed, 215 insertions(+), 190 deletions(-)
--
2.43.0
When defining guards for const types the void* return implicitly
discards the const modifier. Be explicit about it.
Compiler warning (gcc 13.2.1):
/include/linux/cleanup.h:154:18: warning: return discards ‘const’ qualifier from pointer target type [-Wdiscarded-qualifiers]
154 | { return *_T; }
| ^~~
/include/linux/cred.h:193:1: note: in expansion of macro ‘DEFINE_GUARD’
193 | DEFINE_GUARD(cred, const struct cred *, _T = override_creds_light(_T),
| ^~~~~~~~~~~~
Signed-off-by: Vinicius Costa Gomes <[email protected]>
---
include/linux/cleanup.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index c2d09bc4f976..ac521c4521cd 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -151,7 +151,7 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \
#define DEFINE_GUARD(_name, _type, _lock, _unlock) \
DEFINE_CLASS(_name, _type, if (_T) { _unlock; }, ({ _lock; _T; }), _type _T); \
static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \
- { return *_T; }
+ { return (void *)*_T; }
#define DEFINE_GUARD_COND(_name, _ext, _condlock) \
EXTEND_CLASS(_name, _ext, \
--
2.43.0
Add a light version of override/revert_creds(), this should only be
used when the credentials in question will outlive the critical
section and the critical section doesn't change the ->usage of the
credentials.
To make their usage less error prone, introduce cleanup guards asto be
used like this:
guard(cred)(credentials_to_override_and_restore);
or this:
scoped_guard(cred, credentials_to_override_and_restore) {
/* with credentials overridden */
}
Signed-off-by: Vinicius Costa Gomes <[email protected]>
---
include/linux/cred.h | 21 +++++++++++++++++++++
kernel/cred.c | 6 +++---
2 files changed, 24 insertions(+), 3 deletions(-)
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 2976f534a7a3..e9f2237e4bf8 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -172,6 +172,27 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred)
cred->cap_inheritable));
}
+/*
+ * Override creds without bumping reference count. Caller must ensure
+ * reference remains valid or has taken reference. Almost always not the
+ * interface you want. Use override_creds()/revert_creds() instead.
+ */
+static inline const struct cred *override_creds_light(const struct cred *override_cred)
+{
+ const struct cred *old = current->cred;
+
+ rcu_assign_pointer(current->cred, override_cred);
+ return old;
+}
+
+static inline void revert_creds_light(const struct cred *revert_cred)
+{
+ rcu_assign_pointer(current->cred, revert_cred);
+}
+
+DEFINE_GUARD(cred, const struct cred *, _T = override_creds_light(_T),
+ revert_creds_light(_T));
+
/**
* get_new_cred_many - Get references on a new set of credentials
* @cred: The new credentials to reference
diff --git a/kernel/cred.c b/kernel/cred.c
index c033a201c808..f95f71e3ac1d 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -485,7 +485,7 @@ EXPORT_SYMBOL(abort_creds);
*/
const struct cred *override_creds(const struct cred *new)
{
- const struct cred *old = current->cred;
+ const struct cred *old;
kdebug("override_creds(%p{%ld})", new,
atomic_long_read(&new->usage));
@@ -499,7 +499,7 @@ const struct cred *override_creds(const struct cred *new)
* visible to other threads under RCU.
*/
get_new_cred((struct cred *)new);
- rcu_assign_pointer(current->cred, new);
+ old = override_creds_light(new);
kdebug("override_creds() = %p{%ld}", old,
atomic_long_read(&old->usage));
@@ -521,7 +521,7 @@ void revert_creds(const struct cred *old)
kdebug("revert_creds(%p{%ld})", old,
atomic_long_read(&old->usage));
- rcu_assign_pointer(current->cred, old);
+ revert_creds_light(old);
put_cred(override);
}
EXPORT_SYMBOL(revert_creds);
--
2.43.0
File operations in overlayfs also check against the credentials of the
mounter task, stored in the superblock, this credentials will outlive
most of the operations. For these cases, use the recently introduced
guard statements to guarantee that override/revert_creds() are paired.
Signed-off-by: Vinicius Costa Gomes <[email protected]>
---
fs/overlayfs/copy_up.c | 4 +--
fs/overlayfs/dir.c | 22 +++++++------
fs/overlayfs/file.c | 70 ++++++++++++++++++++++--------------------
fs/overlayfs/inode.c | 60 +++++++++++++++++++-----------------
fs/overlayfs/namei.c | 21 ++++++-------
fs/overlayfs/readdir.c | 18 +++++------
fs/overlayfs/util.c | 23 +++++++-------
fs/overlayfs/xattrs.c | 34 ++++++++++----------
8 files changed, 130 insertions(+), 122 deletions(-)
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index b8e25ca51016..55d1f2b60775 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -1202,7 +1202,8 @@ static int ovl_copy_up_flags(struct dentry *dentry, int flags)
if (err)
return err;
- old_cred = ovl_override_creds(dentry->d_sb);
+ old_cred = ovl_creds(dentry->d_sb);
+ guard(cred)(old_cred);
while (!err) {
struct dentry *next;
struct dentry *parent = NULL;
@@ -1227,7 +1228,6 @@ static int ovl_copy_up_flags(struct dentry *dentry, int flags)
dput(parent);
dput(next);
}
- revert_creds(old_cred);
return err;
}
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 0f8b4a719237..5aa43a3a7b3e 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -687,9 +687,9 @@ static int ovl_set_link_redirect(struct dentry *dentry)
const struct cred *old_cred;
int err;
- old_cred = ovl_override_creds(dentry->d_sb);
+ old_cred = ovl_creds(dentry->d_sb);
+ guard(cred)(old_cred);
err = ovl_set_redirect(dentry, false);
- revert_creds(old_cred);
return err;
}
@@ -894,12 +894,13 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
if (err)
goto out;
- old_cred = ovl_override_creds(dentry->d_sb);
- if (!lower_positive)
- err = ovl_remove_upper(dentry, is_dir, &list);
- else
- err = ovl_remove_and_whiteout(dentry, &list);
- revert_creds(old_cred);
+ old_cred = ovl_creds(dentry->d_sb);
+ scoped_guard(cred, old_cred) {
+ if (!lower_positive)
+ err = ovl_remove_upper(dentry, is_dir, &list);
+ else
+ err = ovl_remove_and_whiteout(dentry, &list);
+ }
if (!err) {
if (is_dir)
clear_nlink(dentry->d_inode);
@@ -1146,7 +1147,8 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
goto out;
}
- old_cred = ovl_override_creds(old->d_sb);
+ old_cred = ovl_creds(old->d_sb);
+ old_cred = override_creds_light(old_cred);
if (!list_empty(&list)) {
opaquedir = ovl_clear_empty(new, &list);
@@ -1279,7 +1281,7 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
out_unlock:
unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
- revert_creds(old_cred);
+ revert_creds_light(old_cred);
if (update_nlink)
ovl_nlink_end(new);
else
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 05536964d37f..482bf78555e2 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -42,7 +42,8 @@ static struct file *ovl_open_realfile(const struct file *file,
if (flags & O_APPEND)
acc_mode |= MAY_APPEND;
- old_cred = ovl_override_creds(inode->i_sb);
+ old_cred = ovl_creds(inode->i_sb);
+ guard(cred)(old_cred);
real_idmap = mnt_idmap(realpath->mnt);
err = inode_permission(real_idmap, realinode, MAY_OPEN | acc_mode);
if (err) {
@@ -54,7 +55,6 @@ static struct file *ovl_open_realfile(const struct file *file,
realfile = backing_file_open(&file->f_path, flags, realpath,
current_cred());
}
- revert_creds(old_cred);
pr_debug("open(%p[%pD2/%c], 0%o) -> (%p, 0%o)\n",
file, file, ovl_whatisit(inode, realinode), file->f_flags,
@@ -214,9 +214,9 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence)
ovl_inode_lock(inode);
real.file->f_pos = file->f_pos;
- old_cred = ovl_override_creds(inode->i_sb);
- ret = vfs_llseek(real.file, offset, whence);
- revert_creds(old_cred);
+ old_cred = ovl_creds(inode->i_sb);
+ scoped_guard(cred, old_cred)
+ ret = vfs_llseek(real.file, offset, whence);
file->f_pos = real.file->f_pos;
ovl_inode_unlock(inode);
@@ -388,7 +388,6 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
struct fd real;
- const struct cred *old_cred;
int ret;
ret = ovl_sync_status(OVL_FS(file_inode(file)->i_sb));
@@ -401,9 +400,11 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
/* Don't sync lower file for fear of receiving EROFS error */
if (file_inode(real.file) == ovl_inode_upper(file_inode(file))) {
- old_cred = ovl_override_creds(file_inode(file)->i_sb);
+ const struct cred *old_cred;
+
+ old_cred = ovl_creds(file_inode(file)->i_sb);
+ guard(cred)(old_cred);
ret = vfs_fsync_range(real.file, start, end, datasync);
- revert_creds(old_cred);
}
fdput(real);
@@ -441,9 +442,9 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len
if (ret)
goto out_unlock;
- old_cred = ovl_override_creds(file_inode(file)->i_sb);
- ret = vfs_fallocate(real.file, mode, offset, len);
- revert_creds(old_cred);
+ old_cred = ovl_creds(file_inode(file)->i_sb);
+ scoped_guard(cred, old_cred)
+ ret = vfs_fallocate(real.file, mode, offset, len);
/* Update size */
ovl_file_modified(file);
@@ -466,9 +467,9 @@ static int ovl_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
if (ret)
return ret;
- old_cred = ovl_override_creds(file_inode(file)->i_sb);
- ret = vfs_fadvise(real.file, offset, len, advice);
- revert_creds(old_cred);
+ old_cred = ovl_creds(file_inode(file)->i_sb);
+ scoped_guard(cred, old_cred)
+ ret = vfs_fadvise(real.file, offset, len, advice);
fdput(real);
@@ -509,25 +510,25 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
goto out_unlock;
}
- old_cred = ovl_override_creds(file_inode(file_out)->i_sb);
- switch (op) {
- case OVL_COPY:
- ret = vfs_copy_file_range(real_in.file, pos_in,
- real_out.file, pos_out, len, flags);
- break;
+ old_cred = ovl_creds(file_inode(file_out)->i_sb);
+ scoped_guard(cred, old_cred)
+ switch (op) {
+ case OVL_COPY:
+ ret = vfs_copy_file_range(real_in.file, pos_in,
+ real_out.file, pos_out, len, flags);
+ break;
- case OVL_CLONE:
- ret = vfs_clone_file_range(real_in.file, pos_in,
- real_out.file, pos_out, len, flags);
- break;
+ case OVL_CLONE:
+ ret = vfs_clone_file_range(real_in.file, pos_in,
+ real_out.file, pos_out, len, flags);
+ break;
- case OVL_DEDUPE:
- ret = vfs_dedupe_file_range_one(real_in.file, pos_in,
- real_out.file, pos_out, len,
- flags);
- break;
- }
- revert_creds(old_cred);
+ case OVL_DEDUPE:
+ ret = vfs_dedupe_file_range_one(real_in.file, pos_in,
+ real_out.file, pos_out, len,
+ flags);
+ break;
+ }
/* Update size */
ovl_file_modified(file_out);
@@ -579,7 +580,6 @@ static loff_t ovl_remap_file_range(struct file *file_in, loff_t pos_in,
static int ovl_flush(struct file *file, fl_owner_t id)
{
struct fd real;
- const struct cred *old_cred;
int err;
err = ovl_real_fdget(file, &real);
@@ -587,9 +587,11 @@ static int ovl_flush(struct file *file, fl_owner_t id)
return err;
if (real.file->f_op->flush) {
- old_cred = ovl_override_creds(file_inode(file)->i_sb);
+ const struct cred *old_cred;
+
+ old_cred = ovl_creds(file_inode(file)->i_sb);
+ guard(cred)(old_cred);
err = real.file->f_op->flush(real.file, id);
- revert_creds(old_cred);
}
fdput(real);
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index c63b31a460be..9047f245ba0b 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -79,9 +79,10 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
goto out_put_write;
inode_lock(upperdentry->d_inode);
- old_cred = ovl_override_creds(dentry->d_sb);
- err = ovl_do_notify_change(ofs, upperdentry, attr);
- revert_creds(old_cred);
+ old_cred = ovl_creds(dentry->d_sb);
+ scoped_guard(cred, old_cred)
+ err = ovl_do_notify_change(ofs, upperdentry, attr);
+
if (!err)
ovl_copyattr(dentry->d_inode);
inode_unlock(upperdentry->d_inode);
@@ -170,7 +171,8 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
metacopy_blocks = ovl_is_metacopy_dentry(dentry);
type = ovl_path_real(dentry, &realpath);
- old_cred = ovl_override_creds(dentry->d_sb);
+ old_cred = ovl_creds(dentry->d_sb);
+ guard(cred)(old_cred);
err = ovl_do_getattr(&realpath, stat, request_mask, flags);
if (err)
goto out;
@@ -281,8 +283,6 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
stat->nlink = dentry->d_inode->i_nlink;
out:
- revert_creds(old_cred);
-
return err;
}
@@ -310,7 +310,9 @@ int ovl_permission(struct mnt_idmap *idmap,
if (err)
return err;
- old_cred = ovl_override_creds(inode->i_sb);
+ old_cred = ovl_creds(inode->i_sb);
+ guard(cred)(old_cred);
+
if (!upperinode &&
!special_file(realinode->i_mode) && mask & MAY_WRITE) {
mask &= ~(MAY_WRITE | MAY_APPEND);
@@ -318,7 +320,6 @@ int ovl_permission(struct mnt_idmap *idmap,
mask |= MAY_READ;
}
err = inode_permission(mnt_idmap(realpath.mnt), realinode, mask);
- revert_creds(old_cred);
return err;
}
@@ -333,9 +334,10 @@ static const char *ovl_get_link(struct dentry *dentry,
if (!dentry)
return ERR_PTR(-ECHILD);
- old_cred = ovl_override_creds(dentry->d_sb);
+ old_cred = ovl_creds(dentry->d_sb);
+ guard(cred)(old_cred);
p = vfs_get_link(ovl_dentry_real(dentry), done);
- revert_creds(old_cred);
+
return p;
}
@@ -468,9 +470,9 @@ struct posix_acl *do_ovl_get_acl(struct mnt_idmap *idmap,
} else {
const struct cred *old_cred;
- old_cred = ovl_override_creds(inode->i_sb);
+ old_cred = ovl_creds(inode->i_sb);
+ guard(cred)(old_cred);
acl = ovl_get_acl_path(&realpath, posix_acl_xattr_name(type), noperm);
- revert_creds(old_cred);
}
return acl;
@@ -496,10 +498,11 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode,
struct posix_acl *real_acl;
ovl_path_lower(dentry, &realpath);
- old_cred = ovl_override_creds(dentry->d_sb);
- real_acl = vfs_get_acl(mnt_idmap(realpath.mnt), realdentry,
- acl_name);
- revert_creds(old_cred);
+ old_cred = ovl_creds(dentry->d_sb);
+ scoped_guard(cred, old_cred)
+ real_acl = vfs_get_acl(mnt_idmap(realpath.mnt), realdentry,
+ acl_name);
+
if (IS_ERR(real_acl)) {
err = PTR_ERR(real_acl);
goto out;
@@ -519,12 +522,13 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode,
if (err)
goto out;
- old_cred = ovl_override_creds(dentry->d_sb);
- if (acl)
- err = ovl_do_set_acl(ofs, realdentry, acl_name, acl);
- else
- err = ovl_do_remove_acl(ofs, realdentry, acl_name);
- revert_creds(old_cred);
+ old_cred = ovl_creds(dentry->d_sb);
+ scoped_guard(cred, old_cred) {
+ if (acl)
+ err = ovl_do_set_acl(ofs, realdentry, acl_name, acl);
+ else
+ err = ovl_do_remove_acl(ofs, realdentry, acl_name);
+ }
ovl_drop_write(dentry);
/* copy c/mtime */
@@ -599,9 +603,9 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (!realinode->i_op->fiemap)
return -EOPNOTSUPP;
- old_cred = ovl_override_creds(inode->i_sb);
+ old_cred = ovl_creds(inode->i_sb);
+ guard(cred)(old_cred);
err = realinode->i_op->fiemap(realinode, fieinfo, start, len);
- revert_creds(old_cred);
return err;
}
@@ -661,7 +665,8 @@ int ovl_fileattr_set(struct mnt_idmap *idmap,
if (err)
goto out;
- old_cred = ovl_override_creds(inode->i_sb);
+ old_cred = ovl_creds(inode->i_sb);
+ guard(cred)(old_cred);
/*
* Store immutable/append-only flags in xattr and clear them
* in upper fileattr (in case they were set by older kernel)
@@ -672,7 +677,6 @@ int ovl_fileattr_set(struct mnt_idmap *idmap,
err = ovl_set_protattr(inode, upperpath.dentry, fa);
if (!err)
err = ovl_real_fileattr_set(&upperpath, fa);
- revert_creds(old_cred);
ovl_drop_write(dentry);
/*
@@ -731,10 +735,10 @@ int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa)
ovl_path_real(dentry, &realpath);
- old_cred = ovl_override_creds(inode->i_sb);
+ old_cred = ovl_creds(inode->i_sb);
+ guard(cred)(old_cred);
err = ovl_real_fileattr_get(&realpath, fa);
ovl_fileattr_prot_flags(inode, fa);
- revert_creds(old_cred);
return err;
}
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 984ffdaeed6c..0b0258c582a0 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -946,13 +946,12 @@ static int ovl_maybe_validate_verity(struct dentry *dentry)
if (!ovl_test_flag(OVL_VERIFIED_DIGEST, inode)) {
const struct cred *old_cred;
- old_cred = ovl_override_creds(dentry->d_sb);
+ old_cred = ovl_creds(dentry->d_sb);
+ guard(cred)(old_cred);
err = ovl_validate_verity(ofs, &metapath, &datapath);
if (err == 0)
ovl_set_flag(OVL_VERIFIED_DIGEST, inode);
-
- revert_creds(old_cred);
}
ovl_inode_unlock(inode);
@@ -984,9 +983,10 @@ static int ovl_maybe_lookup_lowerdata(struct dentry *dentry)
if (ovl_dentry_lowerdata(dentry))
goto out;
- old_cred = ovl_override_creds(dentry->d_sb);
- err = ovl_lookup_data_layers(dentry, redirect, &datapath);
- revert_creds(old_cred);
+ old_cred = ovl_creds(dentry->d_sb);
+ scoped_guard(cred, old_cred)
+ err = ovl_lookup_data_layers(dentry, redirect, &datapath);
+
if (err)
goto out_err;
@@ -1052,7 +1052,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
if (dentry->d_name.len > ofs->namelen)
return ERR_PTR(-ENAMETOOLONG);
- old_cred = ovl_override_creds(dentry->d_sb);
+ old_cred = ovl_creds(dentry->d_sb);
+ guard(cred)(old_cred);
upperdir = ovl_dentry_upper(dentry->d_parent);
if (upperdir) {
d.mnt = ovl_upper_mnt(ofs);
@@ -1331,7 +1332,6 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
ovl_dentry_init_reval(dentry, upperdentry, OVL_I_E(inode));
- revert_creds(old_cred);
if (origin_path) {
dput(origin_path->dentry);
kfree(origin_path);
@@ -1355,7 +1355,6 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
kfree(upperredirect);
out:
kfree(d.redirect);
- revert_creds(old_cred);
return ERR_PTR(err);
}
@@ -1379,7 +1378,8 @@ bool ovl_lower_positive(struct dentry *dentry)
if (!ovl_dentry_upper(dentry))
return true;
- old_cred = ovl_override_creds(dentry->d_sb);
+ old_cred = ovl_creds(dentry->d_sb);
+ guard(cred)(old_cred);
/* Positive upper -> have to look up lower to see whether it exists */
for (i = 0; !done && !positive && i < ovl_numlower(poe); i++) {
struct dentry *this;
@@ -1412,7 +1412,6 @@ bool ovl_lower_positive(struct dentry *dentry)
dput(this);
}
}
- revert_creds(old_cred);
return positive;
}
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index e71156baa7bc..58ea942921fc 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -275,7 +275,8 @@ static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data
struct dentry *dentry, *dir = path->dentry;
const struct cred *old_cred;
- old_cred = ovl_override_creds(rdd->dentry->d_sb);
+ old_cred = ovl_creds(rdd->dentry->d_sb);
+ guard(cred)(old_cred);
err = down_write_killable(&dir->d_inode->i_rwsem);
if (!err) {
@@ -290,7 +291,6 @@ static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data
}
inode_unlock(dir->d_inode);
}
- revert_creds(old_cred);
return err;
}
@@ -755,7 +755,8 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx)
const struct cred *old_cred;
int err;
- old_cred = ovl_override_creds(dentry->d_sb);
+ old_cred = ovl_creds(dentry->d_sb);
+ guard(cred)(old_cred);
if (!ctx->pos)
ovl_dir_reset(file);
@@ -807,7 +808,6 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx)
}
err = 0;
out:
- revert_creds(old_cred);
return err;
}
@@ -857,9 +857,9 @@ static struct file *ovl_dir_open_realfile(const struct file *file,
struct file *res;
const struct cred *old_cred;
- old_cred = ovl_override_creds(file_inode(file)->i_sb);
+ old_cred = ovl_creds(file_inode(file)->i_sb);
+ guard(cred)(old_cred);
res = ovl_path_open(realpath, O_RDONLY | (file->f_flags & O_LARGEFILE));
- revert_creds(old_cred);
return res;
}
@@ -984,9 +984,9 @@ int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
struct rb_root root = RB_ROOT;
const struct cred *old_cred;
- old_cred = ovl_override_creds(dentry->d_sb);
- err = ovl_dir_read_merged(dentry, list, &root);
- revert_creds(old_cred);
+ old_cred = ovl_creds(dentry->d_sb);
+ scoped_guard(cred, old_cred)
+ err = ovl_dir_read_merged(dentry, list, &root);
if (err)
return err;
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 0217094c23ea..7ba8449d920e 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -1157,15 +1157,16 @@ int ovl_nlink_start(struct dentry *dentry)
if (d_is_dir(dentry) || !ovl_test_flag(OVL_INDEX, inode))
return 0;
- old_cred = ovl_override_creds(dentry->d_sb);
- /*
- * The overlay inode nlink should be incremented/decremented IFF the
- * upper operation succeeds, along with nlink change of upper inode.
- * Therefore, before link/unlink/rename, we store the union nlink
- * value relative to the upper inode nlink in an upper inode xattr.
- */
- err = ovl_set_nlink_upper(dentry);
- revert_creds(old_cred);
+ old_cred = ovl_creds(dentry->d_sb);
+ scoped_guard(cred, old_cred) {
+ /*
+ * The overlay inode nlink should be incremented/decremented IFF the
+ * upper operation succeeds, along with nlink change of upper inode.
+ * Therefore, before link/unlink/rename, we store the union nlink
+ * value relative to the upper inode nlink in an upper inode xattr.
+ */
+ err = ovl_set_nlink_upper(dentry);
+ }
if (err)
goto out_drop_write;
@@ -1188,9 +1189,9 @@ void ovl_nlink_end(struct dentry *dentry)
if (ovl_test_flag(OVL_INDEX, inode) && inode->i_nlink == 0) {
const struct cred *old_cred;
- old_cred = ovl_override_creds(dentry->d_sb);
+ old_cred = ovl_creds(dentry->d_sb);
+ guard(cred)(old_cred);
ovl_cleanup_index(dentry);
- revert_creds(old_cred);
}
ovl_inode_unlock(inode);
diff --git a/fs/overlayfs/xattrs.c b/fs/overlayfs/xattrs.c
index 383978e4663c..921a7d086fa1 100644
--- a/fs/overlayfs/xattrs.c
+++ b/fs/overlayfs/xattrs.c
@@ -45,9 +45,9 @@ static int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char
if (!value && !upperdentry) {
ovl_path_lower(dentry, &realpath);
- old_cred = ovl_override_creds(dentry->d_sb);
- err = vfs_getxattr(mnt_idmap(realpath.mnt), realdentry, name, NULL, 0);
- revert_creds(old_cred);
+ old_cred = ovl_creds(dentry->d_sb);
+ scoped_guard(cred, old_cred)
+ err = vfs_getxattr(mnt_idmap(realpath.mnt), realdentry, name, NULL, 0);
if (err < 0)
goto out;
}
@@ -64,15 +64,16 @@ static int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char
if (err)
goto out;
- old_cred = ovl_override_creds(dentry->d_sb);
- if (value) {
- err = ovl_do_setxattr(ofs, realdentry, name, value, size,
- flags);
- } else {
- WARN_ON(flags != XATTR_REPLACE);
- err = ovl_do_removexattr(ofs, realdentry, name);
+ old_cred = ovl_creds(dentry->d_sb);
+ scoped_guard(cred, old_cred) {
+ if (value) {
+ err = ovl_do_setxattr(ofs, realdentry, name, value, size,
+ flags);
+ } else {
+ WARN_ON(flags != XATTR_REPLACE);
+ err = ovl_do_removexattr(ofs, realdentry, name);
+ }
}
- revert_creds(old_cred);
ovl_drop_write(dentry);
/* copy c/mtime */
@@ -89,9 +90,9 @@ static int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char
struct path realpath;
ovl_i_path_real(inode, &realpath);
- old_cred = ovl_override_creds(dentry->d_sb);
+ old_cred = ovl_creds(dentry->d_sb);
+ guard(cred)(old_cred);
res = vfs_getxattr(mnt_idmap(realpath.mnt), realpath.dentry, name, value, size);
- revert_creds(old_cred);
return res;
}
@@ -119,9 +120,9 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
const struct cred *old_cred;
size_t prefix_len, name_len;
- old_cred = ovl_override_creds(dentry->d_sb);
- res = vfs_listxattr(realdentry, list, size);
- revert_creds(old_cred);
+ old_cred = ovl_creds(dentry->d_sb);
+ scoped_guard(cred, old_cred)
+ res = vfs_listxattr(realdentry, list, size);
if (res <= 0 || size == 0)
return res;
@@ -268,4 +269,3 @@ const struct xattr_handler * const *ovl_xattr_handlers(struct ovl_fs *ofs)
return ofs->config.userxattr ? ovl_user_xattr_handlers :
ovl_trusted_xattr_handlers;
}
-
--
2.43.0
For backing file operations, users are expected to pass credentials
that will outlive the backing file common operations.
Use the specialized guard statements to override/revert the
credentials.
Signed-off-by: Vinicius Costa Gomes <[email protected]>
---
fs/backing-file.c | 124 ++++++++++++++++++++++------------------------
1 file changed, 60 insertions(+), 64 deletions(-)
diff --git a/fs/backing-file.c b/fs/backing-file.c
index a681f38d84d8..9874f09f860f 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -140,7 +140,7 @@ ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
struct backing_file_ctx *ctx)
{
struct backing_aio *aio = NULL;
- const struct cred *old_cred;
+ const struct cred *old_cred = ctx->cred;
ssize_t ret;
if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
@@ -153,29 +153,28 @@ ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
!(file->f_mode & FMODE_CAN_ODIRECT))
return -EINVAL;
- old_cred = override_creds(ctx->cred);
- if (is_sync_kiocb(iocb)) {
- rwf_t rwf = iocb_to_rw_flags(flags);
+ scoped_guard(cred, old_cred) {
+ if (is_sync_kiocb(iocb)) {
+ rwf_t rwf = iocb_to_rw_flags(flags);
- ret = vfs_iter_read(file, iter, &iocb->ki_pos, rwf);
- } else {
- ret = -ENOMEM;
- aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
- if (!aio)
- goto out;
+ ret = vfs_iter_read(file, iter, &iocb->ki_pos, rwf);
+ } else {
+ ret = -ENOMEM;
+ aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
+ if (!aio)
+ goto out;
- aio->orig_iocb = iocb;
- kiocb_clone(&aio->iocb, iocb, get_file(file));
- aio->iocb.ki_complete = backing_aio_rw_complete;
- refcount_set(&aio->ref, 2);
- ret = vfs_iocb_iter_read(file, &aio->iocb, iter);
- backing_aio_put(aio);
- if (ret != -EIOCBQUEUED)
- backing_aio_cleanup(aio, ret);
+ aio->orig_iocb = iocb;
+ kiocb_clone(&aio->iocb, iocb, get_file(file));
+ aio->iocb.ki_complete = backing_aio_rw_complete;
+ refcount_set(&aio->ref, 2);
+ ret = vfs_iocb_iter_read(file, &aio->iocb, iter);
+ backing_aio_put(aio);
+ if (ret != -EIOCBQUEUED)
+ backing_aio_cleanup(aio, ret);
+ }
}
out:
- revert_creds(old_cred);
-
if (ctx->accessed)
ctx->accessed(ctx->user_file);
@@ -187,7 +186,7 @@ ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
struct kiocb *iocb, int flags,
struct backing_file_ctx *ctx)
{
- const struct cred *old_cred;
+ const struct cred *old_cred = ctx->cred;
ssize_t ret;
if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
@@ -210,39 +209,37 @@ ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
*/
flags &= ~IOCB_DIO_CALLER_COMP;
- old_cred = override_creds(ctx->cred);
- if (is_sync_kiocb(iocb)) {
- rwf_t rwf = iocb_to_rw_flags(flags);
+ scoped_guard(cred, old_cred) {
+ if (is_sync_kiocb(iocb)) {
+ rwf_t rwf = iocb_to_rw_flags(flags);
- ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf);
- if (ctx->end_write)
- ctx->end_write(ctx->user_file);
- } else {
- struct backing_aio *aio;
+ ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf);
+ if (ctx->end_write)
+ ctx->end_write(ctx->user_file);
+ } else {
+ struct backing_aio *aio;
- ret = backing_aio_init_wq(iocb);
- if (ret)
- goto out;
+ ret = backing_aio_init_wq(iocb);
+ if (ret)
+ return ret;
- ret = -ENOMEM;
- aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
- if (!aio)
- goto out;
+ ret = -ENOMEM;
+ aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
+ if (!aio)
+ return ret;
- aio->orig_iocb = iocb;
- aio->end_write = ctx->end_write;
- kiocb_clone(&aio->iocb, iocb, get_file(file));
- aio->iocb.ki_flags = flags;
- aio->iocb.ki_complete = backing_aio_queue_completion;
- refcount_set(&aio->ref, 2);
- ret = vfs_iocb_iter_write(file, &aio->iocb, iter);
- backing_aio_put(aio);
- if (ret != -EIOCBQUEUED)
- backing_aio_cleanup(aio, ret);
+ aio->orig_iocb = iocb;
+ aio->end_write = ctx->end_write;
+ kiocb_clone(&aio->iocb, iocb, get_file(file));
+ aio->iocb.ki_flags = flags;
+ aio->iocb.ki_complete = backing_aio_queue_completion;
+ refcount_set(&aio->ref, 2);
+ ret = vfs_iocb_iter_write(file, &aio->iocb, iter);
+ backing_aio_put(aio);
+ if (ret != -EIOCBQUEUED)
+ backing_aio_cleanup(aio, ret);
+ }
}
-out:
- revert_creds(old_cred);
-
return ret;
}
EXPORT_SYMBOL_GPL(backing_file_write_iter);
@@ -252,15 +249,15 @@ ssize_t backing_file_splice_read(struct file *in, loff_t *ppos,
unsigned int flags,
struct backing_file_ctx *ctx)
{
- const struct cred *old_cred;
+ const struct cred *old_cred = ctx->cred;
ssize_t ret;
if (WARN_ON_ONCE(!(in->f_mode & FMODE_BACKING)))
return -EIO;
- old_cred = override_creds(ctx->cred);
- ret = vfs_splice_read(in, ppos, pipe, len, flags);
- revert_creds(old_cred);
+ scoped_guard(cred, old_cred) {
+ ret = vfs_splice_read(in, ppos, pipe, len, flags);
+ }
if (ctx->accessed)
ctx->accessed(ctx->user_file);
@@ -274,7 +271,7 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
unsigned int flags,
struct backing_file_ctx *ctx)
{
- const struct cred *old_cred;
+ const struct cred *old_cred = ctx->cred;
ssize_t ret;
if (WARN_ON_ONCE(!(out->f_mode & FMODE_BACKING)))
@@ -284,12 +281,11 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
if (ret)
return ret;
- old_cred = override_creds(ctx->cred);
- file_start_write(out);
- ret = iter_file_splice_write(pipe, out, ppos, len, flags);
- file_end_write(out);
- revert_creds(old_cred);
-
+ scoped_guard(cred, old_cred) {
+ file_start_write(out);
+ ret = iter_file_splice_write(pipe, out, ppos, len, flags);
+ file_end_write(out);
+ }
if (ctx->end_write)
ctx->end_write(ctx->user_file);
@@ -300,7 +296,7 @@ EXPORT_SYMBOL_GPL(backing_file_splice_write);
int backing_file_mmap(struct file *file, struct vm_area_struct *vma,
struct backing_file_ctx *ctx)
{
- const struct cred *old_cred;
+ const struct cred *old_cred = ctx->cred;
int ret;
if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)) ||
@@ -312,9 +308,9 @@ int backing_file_mmap(struct file *file, struct vm_area_struct *vma,
vma_set_file(vma, file);
- old_cred = override_creds(ctx->cred);
- ret = call_mmap(vma->vm_file, vma);
- revert_creds(old_cred);
+ scoped_guard(cred, old_cred) {
+ ret = call_mmap(vma->vm_file, vma);
+ }
if (ctx->accessed)
ctx->accessed(ctx->user_file);
--
2.43.0
cc: fsdevel
On Fri, Jan 26, 2024 at 1:57 AM Vinicius Costa Gomes
<[email protected]> wrote:
>
> Hi,
>
Hi Vinicius,
I have some specific comments about the overlayfs patch,
but first I prefer to provide higher level feedback on the series.
> It was noticed that some workloads suffer from contention on
> increasing/decrementing the ->usage counter in their credentials,
> those refcount operations are associated with overriding/reverting the
> current task credentials. (the linked thread adds more context)
>
> In some specialized cases, overlayfs is one of them, the credentials
> in question have a longer lifetime than the override/revert "critical
> section". In the overlayfs case, the credentials are created when the
> fs is mounted and destroyed when it's unmounted. In this case of long
> lived credentials, the usage counter doesn't need to be
> incremented/decremented.
>
> Add a lighter version of credentials override/revert to be used in
> these specialized cases. To make sure that the override/revert calls
> are paired, add a cleanup guard macro. This was suggested here:
>
> https://lore.kernel.org/all/20231219-marken-pochen-26d888fb9bb9@brauner/
>
> With a small number of tweaks:
> - Used inline functions instead of macros;
> - A small change to store the credentials into the passed argument,
> the guard is now defined as (note the added '_T ='):
>
> DEFINE_GUARD(cred, const struct cred *, _T = override_creds_light(_T),
> revert_creds_light(_T));
>
> - Allow "const" arguments to be used with these kind of guards;
>
> Some comments:
> - If patch 1/4 is not a good idea (adding the cast), the alternative
> I can see is using some kind of container for the credentials;
> - The only user for the backing file ops is overlayfs, so these
> changes make sense, but may not make sense in the most general
> case;
>
> For the numbers, some from 'perf c2c', before this series:
> (edited to fit)
>
> #
> # ----- HITM ----- Shared
> # Num RmtHitm LclHitm Symbol Object Source:Line Node
> # ..... ....... ....... .......................... ................ ................. ....
> #
> -------------------------
> 0 412 1028
> -------------------------
> 41.50% 42.22% [k] revert_creds [kernel.vmlinux] atomic64_64.h:39 0 1
> 15.05% 10.60% [k] override_creds [kernel.vmlinux] atomic64_64.h:25 0 1
> 0.73% 0.58% [k] init_file [kernel.vmlinux] atomic64_64.h:25 0 1
> 0.24% 0.10% [k] revert_creds [kernel.vmlinux] cred.h:266 0 1
> 32.28% 37.16% [k] generic_permission [kernel.vmlinux] mnt_idmapping.h:81 0 1
> 9.47% 8.75% [k] generic_permission [kernel.vmlinux] mnt_idmapping.h:81 0 1
> 0.49% 0.58% [k] inode_owner_or_capable [kernel.vmlinux] mnt_idmapping.h:81 0 1
> 0.24% 0.00% [k] generic_permission [kernel.vmlinux] namei.c:354 0
>
> -------------------------
> 1 50 103
> -------------------------
> 100.00% 100.00% [k] update_cfs_group [kernel.vmlinux] atomic64_64.h:15 0 1
>
> -------------------------
> 2 50 98
> -------------------------
> 96.00% 96.94% [k] update_cfs_group [kernel.vmlinux] atomic64_64.h:15 0 1
> 2.00% 1.02% [k] update_load_avg [kernel.vmlinux] atomic64_64.h:25 0 1
> 0.00% 2.04% [k] update_load_avg [kernel.vmlinux] fair.c:4118 0
> 2.00% 0.00% [k] update_cfs_group [kernel.vmlinux] fair.c:3932 0 1
>
> after this series:
>
> #
> # ----- HITM ----- Shared
> # Num RmtHitm LclHitm Symbol Object Source:Line Node
> # ..... ....... ....... .................... ................ ............... ....
> #
> -------------------------
> 0 54 88
> -------------------------
> 100.00% 100.00% [k] update_cfs_group [kernel.vmlinux] atomic64_64.h:15 0 1
>
> -------------------------
> 1 48 83
> -------------------------
> 97.92% 97.59% [k] update_cfs_group [kernel.vmlinux] atomic64_64.h:15 0 1
> 2.08% 1.20% [k] update_load_avg [kernel.vmlinux] atomic64_64.h:25 0 1
> 0.00% 1.20% [k] update_load_avg [kernel.vmlinux] fair.c:4118 0 1
>
> -------------------------
> 2 28 44
> -------------------------
> 85.71% 79.55% [k] generic_permission [kernel.vmlinux] mnt_idmapping.h:81 0 1
> 14.29% 20.45% [k] generic_permission [kernel.vmlinux] mnt_idmapping.h:81 0 1
>
>
> The contention is practically gone.
That is very impressive.
Can you say which workloads were running during this test?
Specifically, I am wondering how much of the improvement came from
backing_file.c and how much from overlayfs/*.c.
The reason I am asking is because the overlayfs patch is quite large and can
take more time to review, so I am wondering out loud if we are not
better off this
course of action:
1. convert backing_file.c to use new helpers/guards
2. convert overlayfs to use new helpers/guards
#1 should definitely go in via Christian's tree and should get a wider review
from fsdevel (please CC fsdevel next time)
#2 is contained for overlayfs reviewers. Once the helpers are merged
and used by backing_file helpers, overlayfs can be converted independently.
#1 and #2 could both be merged in the same merge cycle, or not, it does not
matter. Most likely, #2 will go through Christian's tree as well, but I think we
need to work according to this merge order.
We can also work on the review in parallel and you may keep the overlayfs
patch in following posts, just wanted us to be on the same page w.r.t to
the process.
Thanks,
Amir.
On Fri, Jan 26, 2024 at 1:57 AM Vinicius Costa Gomes
<[email protected]> wrote:
>
> Add a light version of override/revert_creds(), this should only be
> used when the credentials in question will outlive the critical
> section and the critical section doesn't change the ->usage of the
> credentials.
>
> To make their usage less error prone, introduce cleanup guards asto be
> used like this:
>
> guard(cred)(credentials_to_override_and_restore);
>
> or this:
>
> scoped_guard(cred, credentials_to_override_and_restore) {
> /* with credentials overridden */
> }
>
> Signed-off-by: Vinicius Costa Gomes <[email protected]>
You may add:
Reviewed-by: Amir Goldstein <[email protected]>
I would also add:
Suggested-by: Christian Brauner <[email protected]>
Thanks,
Amir.
> ---
> include/linux/cred.h | 21 +++++++++++++++++++++
> kernel/cred.c | 6 +++---
> 2 files changed, 24 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/cred.h b/include/linux/cred.h
> index 2976f534a7a3..e9f2237e4bf8 100644
> --- a/include/linux/cred.h
> +++ b/include/linux/cred.h
> @@ -172,6 +172,27 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred)
> cred->cap_inheritable));
> }
>
> +/*
> + * Override creds without bumping reference count. Caller must ensure
> + * reference remains valid or has taken reference. Almost always not the
> + * interface you want. Use override_creds()/revert_creds() instead.
> + */
> +static inline const struct cred *override_creds_light(const struct cred *override_cred)
> +{
> + const struct cred *old = current->cred;
> +
> + rcu_assign_pointer(current->cred, override_cred);
> + return old;
> +}
> +
> +static inline void revert_creds_light(const struct cred *revert_cred)
> +{
> + rcu_assign_pointer(current->cred, revert_cred);
> +}
> +
> +DEFINE_GUARD(cred, const struct cred *, _T = override_creds_light(_T),
> + revert_creds_light(_T));
> +
> /**
> * get_new_cred_many - Get references on a new set of credentials
> * @cred: The new credentials to reference
> diff --git a/kernel/cred.c b/kernel/cred.c
> index c033a201c808..f95f71e3ac1d 100644
> --- a/kernel/cred.c
> +++ b/kernel/cred.c
> @@ -485,7 +485,7 @@ EXPORT_SYMBOL(abort_creds);
> */
> const struct cred *override_creds(const struct cred *new)
> {
> - const struct cred *old = current->cred;
> + const struct cred *old;
>
> kdebug("override_creds(%p{%ld})", new,
> atomic_long_read(&new->usage));
> @@ -499,7 +499,7 @@ const struct cred *override_creds(const struct cred *new)
> * visible to other threads under RCU.
> */
> get_new_cred((struct cred *)new);
> - rcu_assign_pointer(current->cred, new);
> + old = override_creds_light(new);
>
> kdebug("override_creds() = %p{%ld}", old,
> atomic_long_read(&old->usage));
> @@ -521,7 +521,7 @@ void revert_creds(const struct cred *old)
> kdebug("revert_creds(%p{%ld})", old,
> atomic_long_read(&old->usage));
>
> - rcu_assign_pointer(current->cred, old);
> + revert_creds_light(old);
> put_cred(override);
> }
> EXPORT_SYMBOL(revert_creds);
> --
> 2.43.0
>
On Fri, Jan 26, 2024 at 1:57 AM Vinicius Costa Gomes
<[email protected]> wrote:
>
> When defining guards for const types the void* return implicitly
> discards the const modifier. Be explicit about it.
>
> Compiler warning (gcc 13.2.1):
>
> ./include/linux/cleanup.h:154:18: warning: return discards ‘const’ qualifier from pointer target type [-Wdiscarded-qualifiers]
> 154 | { return *_T; }
> | ^~~
> ./include/linux/cred.h:193:1: note: in expansion of macro ‘DEFINE_GUARD’
> 193 | DEFINE_GUARD(cred, const struct cred *, _T = override_creds_light(_T),
> | ^~~~~~~~~~~~
>
I did not look closely, but can't you use DEFINE_LOCK_GUARD_1()?
Thanks,
Amir.
On Fri, Jan 26, 2024 at 1:57 AM Vinicius Costa Gomes
<[email protected]> wrote:
>
> For backing file operations, users are expected to pass credentials
> that will outlive the backing file common operations.
>
> Use the specialized guard statements to override/revert the
> credentials.
>
As I wrote before, I prefer to see this patch gets reviewed and merged
before the overlayfs large patch, so please reorder the series.
> Signed-off-by: Vinicius Costa Gomes <[email protected]>
> ---
> fs/backing-file.c | 124 ++++++++++++++++++++++------------------------
> 1 file changed, 60 insertions(+), 64 deletions(-)
>
> diff --git a/fs/backing-file.c b/fs/backing-file.c
> index a681f38d84d8..9874f09f860f 100644
> --- a/fs/backing-file.c
> +++ b/fs/backing-file.c
> @@ -140,7 +140,7 @@ ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
> struct backing_file_ctx *ctx)
> {
> struct backing_aio *aio = NULL;
> - const struct cred *old_cred;
> + const struct cred *old_cred = ctx->cred;
> ssize_t ret;
>
> if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
> @@ -153,29 +153,28 @@ ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
> !(file->f_mode & FMODE_CAN_ODIRECT))
> return -EINVAL;
>
> - old_cred = override_creds(ctx->cred);
> - if (is_sync_kiocb(iocb)) {
> - rwf_t rwf = iocb_to_rw_flags(flags);
> + scoped_guard(cred, old_cred) {
This reads very strage.
Also, I see that e.g. scoped_guard(spinlock_irqsave, ... hides the local var
used for save/restore of flags inside the macro.
Perhaps you use the same technique for scoped_guard(cred, ..
loose the local old_cred variable in all those functions and then the
code will read:
scoped_guard(cred, ctx->cred) {
which is nicer IMO.
> + if (is_sync_kiocb(iocb)) {
> + rwf_t rwf = iocb_to_rw_flags(flags);
>
> - ret = vfs_iter_read(file, iter, &iocb->ki_pos, rwf);
> - } else {
> - ret = -ENOMEM;
> - aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
> - if (!aio)
> - goto out;
> + ret = vfs_iter_read(file, iter, &iocb->ki_pos, rwf);
> + } else {
> + ret = -ENOMEM;
> + aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
> + if (!aio)
> + goto out;
>
> - aio->orig_iocb = iocb;
> - kiocb_clone(&aio->iocb, iocb, get_file(file));
> - aio->iocb.ki_complete = backing_aio_rw_complete;
> - refcount_set(&aio->ref, 2);
> - ret = vfs_iocb_iter_read(file, &aio->iocb, iter);
> - backing_aio_put(aio);
> - if (ret != -EIOCBQUEUED)
> - backing_aio_cleanup(aio, ret);
> + aio->orig_iocb = iocb;
> + kiocb_clone(&aio->iocb, iocb, get_file(file));
> + aio->iocb.ki_complete = backing_aio_rw_complete;
> + refcount_set(&aio->ref, 2);
> + ret = vfs_iocb_iter_read(file, &aio->iocb, iter);
> + backing_aio_put(aio);
> + if (ret != -EIOCBQUEUED)
> + backing_aio_cleanup(aio, ret);
> + }
if possible, I would rather avoid all this churn in functions that mostly
do work with the new cred, so either use guard(cred, ) directly or split a
helper that uses guard(cred, ) form the rest.
Thanks,
Amir.
On Fri, Jan 26, 2024 at 1:57 AM Vinicius Costa Gomes
<[email protected]> wrote:
>
> File operations in overlayfs also check against the credentials of the
> mounter task, stored in the superblock, this credentials will outlive
> most of the operations. For these cases, use the recently introduced
> guard statements to guarantee that override/revert_creds() are paired.
>
> Signed-off-by: Vinicius Costa Gomes <[email protected]>
> ---
> fs/overlayfs/copy_up.c | 4 +--
> fs/overlayfs/dir.c | 22 +++++++------
> fs/overlayfs/file.c | 70 ++++++++++++++++++++++--------------------
> fs/overlayfs/inode.c | 60 +++++++++++++++++++-----------------
> fs/overlayfs/namei.c | 21 ++++++-------
> fs/overlayfs/readdir.c | 18 +++++------
> fs/overlayfs/util.c | 23 +++++++-------
> fs/overlayfs/xattrs.c | 34 ++++++++++----------
> 8 files changed, 130 insertions(+), 122 deletions(-)
>
> diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
> index b8e25ca51016..55d1f2b60775 100644
> --- a/fs/overlayfs/copy_up.c
> +++ b/fs/overlayfs/copy_up.c
> @@ -1202,7 +1202,8 @@ static int ovl_copy_up_flags(struct dentry *dentry, int flags)
> if (err)
> return err;
>
> - old_cred = ovl_override_creds(dentry->d_sb);
> + old_cred = ovl_creds(dentry->d_sb);
> + guard(cred)(old_cred);
> while (!err) {
> struct dentry *next;
> struct dentry *parent = NULL;
> @@ -1227,7 +1228,6 @@ static int ovl_copy_up_flags(struct dentry *dentry, int flags)
> dput(parent);
> dput(next);
> }
> - revert_creds(old_cred);
>
> return err;
> }
> diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
> index 0f8b4a719237..5aa43a3a7b3e 100644
> --- a/fs/overlayfs/dir.c
> +++ b/fs/overlayfs/dir.c
> @@ -687,9 +687,9 @@ static int ovl_set_link_redirect(struct dentry *dentry)
> const struct cred *old_cred;
> int err;
>
> - old_cred = ovl_override_creds(dentry->d_sb);
> + old_cred = ovl_creds(dentry->d_sb);
> + guard(cred)(old_cred);
> err = ovl_set_redirect(dentry, false);
> - revert_creds(old_cred);
>
> return err;
> }
> @@ -894,12 +894,13 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
> if (err)
> goto out;
>
> - old_cred = ovl_override_creds(dentry->d_sb);
> - if (!lower_positive)
> - err = ovl_remove_upper(dentry, is_dir, &list);
> - else
> - err = ovl_remove_and_whiteout(dentry, &list);
> - revert_creds(old_cred);
> + old_cred = ovl_creds(dentry->d_sb);
> + scoped_guard(cred, old_cred) {
> + if (!lower_positive)
> + err = ovl_remove_upper(dentry, is_dir, &list);
> + else
> + err = ovl_remove_and_whiteout(dentry, &list);
> + }
> if (!err) {
> if (is_dir)
> clear_nlink(dentry->d_inode);
> @@ -1146,7 +1147,8 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
> goto out;
> }
>
> - old_cred = ovl_override_creds(old->d_sb);
> + old_cred = ovl_creds(old->d_sb);
> + old_cred = override_creds_light(old_cred);
>
> if (!list_empty(&list)) {
> opaquedir = ovl_clear_empty(new, &list);
> @@ -1279,7 +1281,7 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
> out_unlock:
> unlock_rename(new_upperdir, old_upperdir);
> out_revert_creds:
> - revert_creds(old_cred);
> + revert_creds_light(old_cred);
> if (update_nlink)
> ovl_nlink_end(new);
> else
Most of my comments on this patch are identical to the ones I have made on
backing file, so rather complete that review before moving on to this bigger
patch.
I even wonder if we need a specialized macro for overlayfs
guard(ovl_creds, ofs); or if
guard(cred, ovl_override_creds(dentry->d_sb));
is good enough.
One thing that stands out in functions like ovl_rename() is that,
understandably, you tried to preserve logic, but in fact, the scope of
override_creds/revert_creds() in some of the overlayfs functions ir rather
arbitrary.
The simplest solution for functions like the above is to use guard(cred, ..
and extend the scope till the end of the function.
This needs more careful review, but the end result will be much cleaner.
> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
> index 05536964d37f..482bf78555e2 100644
> --- a/fs/overlayfs/file.c
> +++ b/fs/overlayfs/file.c
> @@ -42,7 +42,8 @@ static struct file *ovl_open_realfile(const struct file *file,
> if (flags & O_APPEND)
> acc_mode |= MAY_APPEND;
>
> - old_cred = ovl_override_creds(inode->i_sb);
> + old_cred = ovl_creds(inode->i_sb);
> + guard(cred)(old_cred);
> real_idmap = mnt_idmap(realpath->mnt);
> err = inode_permission(real_idmap, realinode, MAY_OPEN | acc_mode);
> if (err) {
> @@ -54,7 +55,6 @@ static struct file *ovl_open_realfile(const struct file *file,
> realfile = backing_file_open(&file->f_path, flags, realpath,
> current_cred());
> }
> - revert_creds(old_cred);
>
> pr_debug("open(%p[%pD2/%c], 0%o) -> (%p, 0%o)\n",
> file, file, ovl_whatisit(inode, realinode), file->f_flags,
> @@ -214,9 +214,9 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence)
> ovl_inode_lock(inode);
> real.file->f_pos = file->f_pos;
>
> - old_cred = ovl_override_creds(inode->i_sb);
> - ret = vfs_llseek(real.file, offset, whence);
> - revert_creds(old_cred);
> + old_cred = ovl_creds(inode->i_sb);
> + scoped_guard(cred, old_cred)
> + ret = vfs_llseek(real.file, offset, whence);
>
> file->f_pos = real.file->f_pos;
> ovl_inode_unlock(inode);
> @@ -388,7 +388,6 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
> static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
> {
> struct fd real;
> - const struct cred *old_cred;
> int ret;
>
> ret = ovl_sync_status(OVL_FS(file_inode(file)->i_sb));
> @@ -401,9 +400,11 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
>
> /* Don't sync lower file for fear of receiving EROFS error */
> if (file_inode(real.file) == ovl_inode_upper(file_inode(file))) {
> - old_cred = ovl_override_creds(file_inode(file)->i_sb);
> + const struct cred *old_cred;
> +
> + old_cred = ovl_creds(file_inode(file)->i_sb);
> + guard(cred)(old_cred);
> ret = vfs_fsync_range(real.file, start, end, datasync);
> - revert_creds(old_cred);
> }
>
> fdput(real);
> @@ -441,9 +442,9 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len
> if (ret)
> goto out_unlock;
>
> - old_cred = ovl_override_creds(file_inode(file)->i_sb);
> - ret = vfs_fallocate(real.file, mode, offset, len);
> - revert_creds(old_cred);
> + old_cred = ovl_creds(file_inode(file)->i_sb);
> + scoped_guard(cred, old_cred)
> + ret = vfs_fallocate(real.file, mode, offset, len);
>
> /* Update size */
> ovl_file_modified(file);
> @@ -466,9 +467,9 @@ static int ovl_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
> if (ret)
> return ret;
>
> - old_cred = ovl_override_creds(file_inode(file)->i_sb);
> - ret = vfs_fadvise(real.file, offset, len, advice);
> - revert_creds(old_cred);
> + old_cred = ovl_creds(file_inode(file)->i_sb);
> + scoped_guard(cred, old_cred)
> + ret = vfs_fadvise(real.file, offset, len, advice);
>
> fdput(real);
>
> @@ -509,25 +510,25 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
> goto out_unlock;
> }
>
> - old_cred = ovl_override_creds(file_inode(file_out)->i_sb);
> - switch (op) {
> - case OVL_COPY:
> - ret = vfs_copy_file_range(real_in.file, pos_in,
> - real_out.file, pos_out, len, flags);
> - break;
> + old_cred = ovl_creds(file_inode(file_out)->i_sb);
> + scoped_guard(cred, old_cred)
> + switch (op) {
> + case OVL_COPY:
> + ret = vfs_copy_file_range(real_in.file, pos_in,
> + real_out.file, pos_out, len, flags);
> + break;
>
> - case OVL_CLONE:
> - ret = vfs_clone_file_range(real_in.file, pos_in,
> - real_out.file, pos_out, len, flags);
> - break;
> + case OVL_CLONE:
> + ret = vfs_clone_file_range(real_in.file, pos_in,
> + real_out.file, pos_out, len, flags);
> + break;
>
> - case OVL_DEDUPE:
> - ret = vfs_dedupe_file_range_one(real_in.file, pos_in,
> - real_out.file, pos_out, len,
> - flags);
> - break;
> - }
> - revert_creds(old_cred);
> + case OVL_DEDUPE:
> + ret = vfs_dedupe_file_range_one(real_in.file, pos_in,
> + real_out.file, pos_out, len,
> + flags);
> + break;
> + }
>
> /* Update size */
> ovl_file_modified(file_out);
This is another case where extending the scope to the end of the function
is the simpler/cleaner solution.
Thanks,
Amir.
Hi Amir,
Amir Goldstein <[email protected]> writes:
> cc: fsdevel
>
> On Fri, Jan 26, 2024 at 1:57 AM Vinicius Costa Gomes
> <[email protected]> wrote:
>>
>> Hi,
>>
>
> Hi Vinicius,
>
> I have some specific comments about the overlayfs patch,
> but first I prefer to provide higher level feedback on the series.
>
>> It was noticed that some workloads suffer from contention on
>> increasing/decrementing the ->usage counter in their credentials,
>> those refcount operations are associated with overriding/reverting the
>> current task credentials. (the linked thread adds more context)
>>
>> In some specialized cases, overlayfs is one of them, the credentials
>> in question have a longer lifetime than the override/revert "critical
>> section". In the overlayfs case, the credentials are created when the
>> fs is mounted and destroyed when it's unmounted. In this case of long
>> lived credentials, the usage counter doesn't need to be
>> incremented/decremented.
>>
>> Add a lighter version of credentials override/revert to be used in
>> these specialized cases. To make sure that the override/revert calls
>> are paired, add a cleanup guard macro. This was suggested here:
>>
>> https://lore.kernel.org/all/20231219-marken-pochen-26d888fb9bb9@brauner/
>>
>> With a small number of tweaks:
>> - Used inline functions instead of macros;
>> - A small change to store the credentials into the passed argument,
>> the guard is now defined as (note the added '_T ='):
>>
>> DEFINE_GUARD(cred, const struct cred *, _T = override_creds_light(_T),
>> revert_creds_light(_T));
>>
>> - Allow "const" arguments to be used with these kind of guards;
>>
>> Some comments:
>> - If patch 1/4 is not a good idea (adding the cast), the alternative
>> I can see is using some kind of container for the credentials;
>> - The only user for the backing file ops is overlayfs, so these
>> changes make sense, but may not make sense in the most general
>> case;
>>
>> For the numbers, some from 'perf c2c', before this series:
>> (edited to fit)
>>
>> #
>> # ----- HITM ----- Shared
>> # Num RmtHitm LclHitm Symbol Object Source:Line Node
>> # ..... ....... ....... .......................... ................ .................. ....
>> #
>> -------------------------
>> 0 412 1028
>> -------------------------
>> 41.50% 42.22% [k] revert_creds [kernel.vmlinux] atomic64_64.h:39 0 1
>> 15.05% 10.60% [k] override_creds [kernel.vmlinux] atomic64_64.h:25 0 1
>> 0.73% 0.58% [k] init_file [kernel.vmlinux] atomic64_64.h:25 0 1
>> 0.24% 0.10% [k] revert_creds [kernel.vmlinux] cred.h:266 0 1
>> 32.28% 37.16% [k] generic_permission [kernel.vmlinux] mnt_idmapping.h:81 0 1
>> 9.47% 8.75% [k] generic_permission [kernel.vmlinux] mnt_idmapping.h:81 0 1
>> 0.49% 0.58% [k] inode_owner_or_capable [kernel.vmlinux] mnt_idmapping.h:81 0 1
>> 0.24% 0.00% [k] generic_permission [kernel.vmlinux] namei.c:354 0
>>
>> -------------------------
>> 1 50 103
>> -------------------------
>> 100.00% 100.00% [k] update_cfs_group [kernel.vmlinux] atomic64_64.h:15 0 1
>>
>> -------------------------
>> 2 50 98
>> -------------------------
>> 96.00% 96.94% [k] update_cfs_group [kernel.vmlinux] atomic64_64.h:15 0 1
>> 2.00% 1.02% [k] update_load_avg [kernel.vmlinux] atomic64_64.h:25 0 1
>> 0.00% 2.04% [k] update_load_avg [kernel.vmlinux] fair.c:4118 0
>> 2.00% 0.00% [k] update_cfs_group [kernel.vmlinux] fair.c:3932 0 1
>>
>> after this series:
>>
>> #
>> # ----- HITM ----- Shared
>> # Num RmtHitm LclHitm Symbol Object Source:Line Node
>> # ..... ....... ....... .................... ................ ............... ....
>> #
>> -------------------------
>> 0 54 88
>> -------------------------
>> 100.00% 100.00% [k] update_cfs_group [kernel.vmlinux] atomic64_64.h:15 0 1
>>
>> -------------------------
>> 1 48 83
>> -------------------------
>> 97.92% 97.59% [k] update_cfs_group [kernel.vmlinux] atomic64_64.h:15 0 1
>> 2.08% 1.20% [k] update_load_avg [kernel.vmlinux] atomic64_64.h:25 0 1
>> 0.00% 1.20% [k] update_load_avg [kernel.vmlinux] fairc:4118 0 1
>>
>> -------------------------
>> 2 28 44
>> -------------------------
>> 85.71% 79.55% [k] generic_permission [kernel.vmlinux] mnt_idmapping.h:81 0 1
>> 14.29% 20.45% [k] generic_permission [kernel.vmlinux] mnt_idmapping.h:81 0 1
>>
>>
>> The contention is practically gone.
>
> That is very impressive.
> Can you say which workloads were running during this test?
> Specifically, I am wondering how much of the improvement came from
> backing_file.c and how much from overlayfs/*.c.
>
I received the workload packaged from one of our customer teams, it's a
docker image to run a wordpress/php/nginx thing, totally not my area and
not sure that I can give much more details. The only think that I know
is that this workload does *a lot* of faccessat().
Anyway, I did a experiment removing the backing ops patch, got this
numbers (edited for clarity):
#
# ----- HITM ----- ------- Store Refs ------ Shared
# Num RmtHitm LclHitm L1 Hit L1 Miss N/A Symbol Object Source:Line Node
# ..... ....... ....... ....... ....... ....... ......................... ................. .................. ....
#
---------------------------------------------------
0 79 97 0 0 0
---------------------------------------------------
0.00% 1.03% 0.00% 0.00% 0.00% [k] revert_creds [kernel.kallsyms] atomic64_64.h:39 0 1
1.27% 0.00% 0.00% 0.00% 0.00% [k] init_file [kernel.kallsyms] atomic64_64.h:25 0 1
62.03% 71.13% 0.00% 0.00% 0.00% [k] generic_permission [kernel.kallsyms] mnt_idmapping.h:81 0 1
35.44% 26.80% 0.00% 0.00% 0.00% [k] generic_permission [kernel.kallsyms] mnt_idmapping.h:81 0 1
1.27% 0.00% 0.00% 0.00% 0.00% [k] generic_permission [kernel.kallsyms] mnt_idmapping.h:81 0
0.00% 1.03% 0.00% 0.00% 0.00% [k] generic_permission [kernel.kallsyms] namei.c:354 0 1
---------------------------------------------------
1 52 103 0 0 0
---------------------------------------------------
98.08% 98.06% 0.00% 0.00% 0.00% [k] update_cfs_group [kernel.kallsyms] atomic64_64.h:15 0 1
0.00% 1.94% 0.00% 0.00% 0.00% [k] update_load_avg [kernel.kallsyms] atomic64_64.h:25 0 1
1.92% 0.00% 0.00% 0.00% 0.00% [k] update_cfs_group [kernel.kallsyms] fair.c:3932 0
---------------------------------------------------
2 59 77 0 0 0
---------------------------------------------------
93.22% 98.70% 0.00% 0.00% 0.00% [k] update_cfs_group [kernel.kallsyms] atomic64_64.h:15 0 1
5.08% 1.30% 0.00% 0.00% 0.00% [k] update_cfs_group [kernel.kallsyms] fair.c:3932 0 1
1.69% 0.00% 0.00% 0.00% 0.00% [k] update_load_avg [kernel.kallsyms] atomic64_64.h:25 0 1
So, the main source of contention is not in the backing file ops (but
there is still some). That seems to align with the numbers that Chen Hu
provided[1], that most of the contention was in ovl_permission().
[1] https://lore.kernel.org/all/[email protected]/
> The reason I am asking is because the overlayfs patch is quite large and can
> take more time to review, so I am wondering out loud if we are not
> better off this
> course of action:
>
> 1. convert backing_file.c to use new helpers/guards
> 2. convert overlayfs to use new helpers/guards
>
For this particular workload, (2) is more important. But I am open to
propose (1) first, no problem at all.
Also, if you think that some other way of spliting the series, for
example, one patch per function being converted, would be easier/better,
I can do that too.
> #1 should definitely go in via Christian's tree and should get a wider review
> from fsdevel (please CC fsdevel next time)
>
Of course. Will CC fsdevel.
> #2 is contained for overlayfs reviewers. Once the helpers are merged
> and used by backing_file helpers, overlayfs can be converted independently.
>
> #1 and #2 could both be merged in the same merge cycle, or not, it does not
> matter. Most likely, #2 will go through Christian's tree as well, but I think we
> need to work according to this merge order.
>
> We can also work on the review in parallel and you may keep the overlayfs
> patch in following posts, just wanted us to be on the same page w.r.t to
> the process.
>
> Thanks,
> Amir.
Cheers,
--
Vinicius
Amir Goldstein <[email protected]> writes:
> On Fri, Jan 26, 2024 at 1:57 AM Vinicius Costa Gomes
> <[email protected]> wrote:
>>
>> Add a light version of override/revert_creds(), this should only be
>> used when the credentials in question will outlive the critical
>> section and the critical section doesn't change the ->usage of the
>> credentials.
>>
>> To make their usage less error prone, introduce cleanup guards asto be
>> used like this:
>>
>> guard(cred)(credentials_to_override_and_restore);
>>
>> or this:
>>
>> scoped_guard(cred, credentials_to_override_and_restore) {
>> /* with credentials overridden */
>> }
>>
>> Signed-off-by: Vinicius Costa Gomes <[email protected]>
>
> You may add:
> Reviewed-by: Amir Goldstein <[email protected]>
>
> I would also add:
> Suggested-by: Christian Brauner <[email protected]>
>
Forgot about that one.
Cheers,
--
Vinicius
Amir Goldstein <[email protected]> writes:
> On Fri, Jan 26, 2024 at 1:57 AM Vinicius Costa Gomes
> <[email protected]> wrote:
>>
>> For backing file operations, users are expected to pass credentials
>> that will outlive the backing file common operations.
>>
>> Use the specialized guard statements to override/revert the
>> credentials.
>>
>
> As I wrote before, I prefer to see this patch gets reviewed and merged
> before the overlayfs large patch, so please reorder the series.
>
Sure. Will do.
>> Signed-off-by: Vinicius Costa Gomes <[email protected]>
>> ---
>> fs/backing-file.c | 124 ++++++++++++++++++++++------------------------
>> 1 file changed, 60 insertions(+), 64 deletions(-)
>>
>> diff --git a/fs/backing-file.c b/fs/backing-file.c
>> index a681f38d84d8..9874f09f860f 100644
>> --- a/fs/backing-file.c
>> +++ b/fs/backing-file.c
>> @@ -140,7 +140,7 @@ ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
>> struct backing_file_ctx *ctx)
>> {
>> struct backing_aio *aio = NULL;
>> - const struct cred *old_cred;
>> + const struct cred *old_cred = ctx->cred;
>> ssize_t ret;
>>
>> if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
>> @@ -153,29 +153,28 @@ ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
>> !(file->f_mode & FMODE_CAN_ODIRECT))
>> return -EINVAL;
>>
>> - old_cred = override_creds(ctx->cred);
>> - if (is_sync_kiocb(iocb)) {
>> - rwf_t rwf = iocb_to_rw_flags(flags);
>> + scoped_guard(cred, old_cred) {
>
> This reads very strage.
>
> Also, I see that e.g. scoped_guard(spinlock_irqsave, ... hides the local var
> used for save/restore of flags inside the macro.
>
> Perhaps you use the same technique for scoped_guard(cred, ..
> loose the local old_cred variable in all those functions and then the
> code will read:
>
> scoped_guard(cred, ctx->cred) {
>
> which is nicer IMO.
Most likely using DEFINE_LOCK_GUARD_1() would allow us to use the nicer version.
>
>> + if (is_sync_kiocb(iocb)) {
>> + rwf_t rwf = iocb_to_rw_flags(flags);
>>
>> - ret = vfs_iter_read(file, iter, &iocb->ki_pos, rwf);
>> - } else {
>> - ret = -ENOMEM;
>> - aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
>> - if (!aio)
>> - goto out;
>> + ret = vfs_iter_read(file, iter, &iocb->ki_pos, rwf);
>> + } else {
>> + ret = -ENOMEM;
>> + aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
>> + if (!aio)
>> + goto out;
>>
>> - aio->orig_iocb = iocb;
>> - kiocb_clone(&aio->iocb, iocb, get_file(file));
>> - aio->iocb.ki_complete = backing_aio_rw_complete;
>> - refcount_set(&aio->ref, 2);
>> - ret = vfs_iocb_iter_read(file, &aio->iocb, iter);
>> - backing_aio_put(aio);
>> - if (ret != -EIOCBQUEUED)
>> - backing_aio_cleanup(aio, ret);
>> + aio->orig_iocb = iocb;
>> + kiocb_clone(&aio->iocb, iocb, get_file(file));
>> + aio->iocb.ki_complete = backing_aio_rw_complete;
>> + refcount_set(&aio->ref, 2);
>> + ret = vfs_iocb_iter_read(file, &aio->iocb, iter);
>> + backing_aio_put(aio);
>> + if (ret != -EIOCBQUEUED)
>> + backing_aio_cleanup(aio, ret);
>> + }
>
> if possible, I would rather avoid all this churn in functions that mostly
> do work with the new cred, so either use guard(cred, ) directly or split a
> helper that uses guard(cred, ) form the rest.
>
Yeah, I think what happened is that I tried to keep the scope of the
guard to be as close as possible to override/revert (as you said), and
that caused the churn.
Probably using guard() more will reduce these confusing code changes. I
am going to try that.
> Thanks,
> Amir.
Cheers,
--
Vinicius
Amir Goldstein <[email protected]> writes:
> On Fri, Jan 26, 2024 at 1:57 AM Vinicius Costa Gomes
> <[email protected]> wrote:
>>
>> File operations in overlayfs also check against the credentials of the
>> mounter task, stored in the superblock, this credentials will outlive
>> most of the operations. For these cases, use the recently introduced
>> guard statements to guarantee that override/revert_creds() are paired.
>>
>> Signed-off-by: Vinicius Costa Gomes <[email protected]>
>> ---
>> fs/overlayfs/copy_up.c | 4 +--
>> fs/overlayfs/dir.c | 22 +++++++------
>> fs/overlayfs/file.c | 70 ++++++++++++++++++++++--------------------
>> fs/overlayfs/inode.c | 60 +++++++++++++++++++-----------------
>> fs/overlayfs/namei.c | 21 ++++++-------
>> fs/overlayfs/readdir.c | 18 +++++------
>> fs/overlayfs/util.c | 23 +++++++-------
>> fs/overlayfs/xattrs.c | 34 ++++++++++----------
>> 8 files changed, 130 insertions(+), 122 deletions(-)
>>
>> diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
>> index b8e25ca51016..55d1f2b60775 100644
>> --- a/fs/overlayfs/copy_up.c
>> +++ b/fs/overlayfs/copy_up.c
>> @@ -1202,7 +1202,8 @@ static int ovl_copy_up_flags(struct dentry *dentry, int flags)
>> if (err)
>> return err;
>>
>> - old_cred = ovl_override_creds(dentry->d_sb);
>> + old_cred = ovl_creds(dentry->d_sb);
>> + guard(cred)(old_cred);
>> while (!err) {
>> struct dentry *next;
>> struct dentry *parent = NULL;
>> @@ -1227,7 +1228,6 @@ static int ovl_copy_up_flags(struct dentry *dentry, int flags)
>> dput(parent);
>> dput(next);
>> }
>> - revert_creds(old_cred);
>>
>> return err;
>> }
>> diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
>> index 0f8b4a719237..5aa43a3a7b3e 100644
>> --- a/fs/overlayfs/dir.c
>> +++ b/fs/overlayfs/dir.c
>> @@ -687,9 +687,9 @@ static int ovl_set_link_redirect(struct dentry *dentry)
>> const struct cred *old_cred;
>> int err;
>>
>> - old_cred = ovl_override_creds(dentry->d_sb);
>> + old_cred = ovl_creds(dentry->d_sb);
>> + guard(cred)(old_cred);
>> err = ovl_set_redirect(dentry, false);
>> - revert_creds(old_cred);
>>
>> return err;
>> }
>> @@ -894,12 +894,13 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
>> if (err)
>> goto out;
>>
>> - old_cred = ovl_override_creds(dentry->d_sb);
>> - if (!lower_positive)
>> - err = ovl_remove_upper(dentry, is_dir, &list);
>> - else
>> - err = ovl_remove_and_whiteout(dentry, &list);
>> - revert_creds(old_cred);
>> + old_cred = ovl_creds(dentry->d_sb);
>> + scoped_guard(cred, old_cred) {
>> + if (!lower_positive)
>> + err = ovl_remove_upper(dentry, is_dir, &list);
>> + else
>> + err = ovl_remove_and_whiteout(dentry, &list);
>> + }
>> if (!err) {
>> if (is_dir)
>> clear_nlink(dentry->d_inode);
>> @@ -1146,7 +1147,8 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
>> goto out;
>> }
>>
>> - old_cred = ovl_override_creds(old->d_sb);
>> + old_cred = ovl_creds(old->d_sb);
>> + old_cred = override_creds_light(old_cred);
>>
>> if (!list_empty(&list)) {
>> opaquedir = ovl_clear_empty(new, &list);
>> @@ -1279,7 +1281,7 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
>> out_unlock:
>> unlock_rename(new_upperdir, old_upperdir);
>> out_revert_creds:
>> - revert_creds(old_cred);
>> + revert_creds_light(old_cred);
>> if (update_nlink)
>> ovl_nlink_end(new);
>> else
>
> Most of my comments on this patch are identical to the ones I have made on
> backing file, so rather complete that review before moving on to this bigger
> patch.
>
> I even wonder if we need a specialized macro for overlayfs
> guard(ovl_creds, ofs); or if
> guard(cred, ovl_override_creds(dentry->d_sb));
> is good enough.
>
I think that if the DEFINE_LOCK_GUARD_1() idea works, that might be
unecessary. Let's see.
> One thing that stands out in functions like ovl_rename() is that,
> understandably, you tried to preserve logic, but in fact, the scope of
> override_creds/revert_creds() in some of the overlayfs functions ir rather
> arbitrary.
>
That's very good to learn.
> The simplest solution for functions like the above is to use guard(cred, .
> and extend the scope till the end of the function.
> This needs more careful review, but the end result will be much cleaner.
>
Yeah, increasing the indentation level of whole blocks cause the whole
patch to be much harder to review.
Using more guard() statements will certainly help.
>> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
>> index 05536964d37f..482bf78555e2 100644
>> --- a/fs/overlayfs/file.c
>> +++ b/fs/overlayfs/file.c
>> @@ -42,7 +42,8 @@ static struct file *ovl_open_realfile(const struct file *file,
>> if (flags & O_APPEND)
>> acc_mode |= MAY_APPEND;
>>
>> - old_cred = ovl_override_creds(inode->i_sb);
>> + old_cred = ovl_creds(inode->i_sb);
>> + guard(cred)(old_cred);
>> real_idmap = mnt_idmap(realpath->mnt);
>> err = inode_permission(real_idmap, realinode, MAY_OPEN | acc_mode);
>> if (err) {
>> @@ -54,7 +55,6 @@ static struct file *ovl_open_realfile(const struct file *file,
>> realfile = backing_file_open(&file->f_path, flags, realpath,
>> current_cred());
>> }
>> - revert_creds(old_cred);
>>
>> pr_debug("open(%p[%pD2/%c], 0%o) -> (%p, 0%o)\n",
>> file, file, ovl_whatisit(inode, realinode), file->f_flags,
>> @@ -214,9 +214,9 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence)
>> ovl_inode_lock(inode);
>> real.file->f_pos = file->f_pos;
>>
>> - old_cred = ovl_override_creds(inode->i_sb);
>> - ret = vfs_llseek(real.file, offset, whence);
>> - revert_creds(old_cred);
>> + old_cred = ovl_creds(inode->i_sb);
>> + scoped_guard(cred, old_cred)
>> + ret = vfs_llseek(real.file, offset, whence);
>>
>> file->f_pos = real.file->f_pos;
>> ovl_inode_unlock(inode);
>> @@ -388,7 +388,6 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
>> static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
>> {
>> struct fd real;
>> - const struct cred *old_cred;
>> int ret;
>>
>> ret = ovl_sync_status(OVL_FS(file_inode(file)->i_sb));
>> @@ -401,9 +400,11 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
>>
>> /* Don't sync lower file for fear of receiving EROFS error */
>> if (file_inode(real.file) == ovl_inode_upper(file_inode(file))) {
>> - old_cred = ovl_override_creds(file_inode(file)->i_sb);
>> + const struct cred *old_cred;
>> +
>> + old_cred = ovl_creds(file_inode(file)->i_sb);
>> + guard(cred)(old_cred);
>> ret = vfs_fsync_range(real.file, start, end, datasync);
>> - revert_creds(old_cred);
>> }
>>
>> fdput(real);
>> @@ -441,9 +442,9 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len
>> if (ret)
>> goto out_unlock;
>>
>> - old_cred = ovl_override_creds(file_inode(file)->i_sb);
>> - ret = vfs_fallocate(real.file, mode, offset, len);
>> - revert_creds(old_cred);
>> + old_cred = ovl_creds(file_inode(file)->i_sb);
>> + scoped_guard(cred, old_cred)
>> + ret = vfs_fallocate(real.file, mode, offset, len);
>>
>> /* Update size */
>> ovl_file_modified(file);
>> @@ -466,9 +467,9 @@ static int ovl_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
>> if (ret)
>> return ret;
>>
>> - old_cred = ovl_override_creds(file_inode(file)->i_sb);
>> - ret = vfs_fadvise(real.file, offset, len, advice);
>> - revert_creds(old_cred);
>> + old_cred = ovl_creds(file_inode(file)->i_sb);
>> + scoped_guard(cred, old_cred)
>> + ret = vfs_fadvise(real.file, offset, len, advice);
>>
>> fdput(real);
>>
>> @@ -509,25 +510,25 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
>> goto out_unlock;
>> }
>>
>> - old_cred = ovl_override_creds(file_inode(file_out)->i_sb);
>> - switch (op) {
>> - case OVL_COPY:
>> - ret = vfs_copy_file_range(real_in.file, pos_in,
>> - real_out.file, pos_out, len, flags);
>> - break;
>> + old_cred = ovl_creds(file_inode(file_out)->i_sb);
>> + scoped_guard(cred, old_cred)
>> + switch (op) {
>> + case OVL_COPY:
>> + ret = vfs_copy_file_range(real_in.file, pos_in,
>> + real_out.file, pos_out, len, flags);
>> + break;
>>
>> - case OVL_CLONE:
>> - ret = vfs_clone_file_range(real_in.file, pos_in,
>> - real_out.file, pos_out, len, flags);
>> - break;
>> + case OVL_CLONE:
>> + ret = vfs_clone_file_range(real_in.file, pos_in,
>> + real_out.file, pos_out, len, flags);
>> + break;
>>
>> - case OVL_DEDUPE:
>> - ret = vfs_dedupe_file_range_one(real_in.file, pos_in,
>> - real_out.file, pos_out, len,
>> - flags);
>> - break;
>> - }
>> - revert_creds(old_cred);
>> + case OVL_DEDUPE:
>> + ret = vfs_dedupe_file_range_one(real_in.file, pos_in,
>> + real_out.file, pos_out, len,
>> + flags);
>> + break;
>> + }
>>
>> /* Update size */
>> ovl_file_modified(file_out);
>
> This is another case where extending the scope to the end of the function
> is the simpler/cleaner solution.
>
> Thanks,
> Amir.
--
Vinicius