2022-02-02 20:14:57

by Imran Khan

[permalink] [raw]
Subject: [PATCH v4 2/2] kernfs: Replace per-fs global rwsem with per-fs hashed rwsem.

Having a single rwsem to synchronize all operations across a kernfs
based file system (cgroup, sysfs etc.) does not scale well. Replace
it with a hashed rwsem to reduce contention around single per-fs
rwsem.
Also introduce a perfs rwsem to protect per-fs list of kernfs_super_info.

Signed-off-by: Imran Khan <[email protected]>
---
fs/kernfs/dir.c | 268 +++++++++++++++++++++++++-----------
fs/kernfs/file.c | 6 +-
fs/kernfs/inode.c | 18 ++-
fs/kernfs/kernfs-internal.h | 112 +++++++++++++++
fs/kernfs/mount.c | 13 +-
fs/kernfs/symlink.c | 5 +-
include/linux/kernfs.h | 5 +-
7 files changed, 325 insertions(+), 102 deletions(-)

diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index d26fb3bffda92..89645ba453ab8 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -25,7 +25,9 @@ static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */

static bool kernfs_active(struct kernfs_node *kn)
{
- lockdep_assert_held(&kernfs_root(kn)->kernfs_rwsem);
+ int idx = hash_ptr(kn, NR_KERNFS_LOCK_BITS);
+
+ lockdep_assert_held(&kernfs_root(kn)->kernfs_rwsem[idx]);
return atomic_read(&kn->active) >= 0;
}

@@ -455,35 +457,36 @@ void kernfs_put_active(struct kernfs_node *kn)
* removers may invoke this function concurrently on @kn and all will
* return after draining is complete.
*/
-static void kernfs_drain(struct kernfs_node *kn)
- __releases(&kernfs_root(kn)->kernfs_rwsem)
- __acquires(&kernfs_root(kn)->kernfs_rwsem)
+static void kernfs_drain(struct kernfs_node *kn, struct kernfs_node *anc)
+ __releases(&kernfs_root(anc)->kernfs_rwsem[a_idx])
+ __acquires(&kernfs_root(anc)->kernfs_rwsem[a_idx])
{
struct kernfs_root *root = kernfs_root(kn);
+ int a_idx = hash_ptr(anc, NR_KERNFS_LOCK_BITS);

- lockdep_assert_held_write(&root->kernfs_rwsem);
- WARN_ON_ONCE(kernfs_active(kn));
+ lockdep_assert_held_write(&root->kernfs_rwsem[a_idx]);
+ WARN_ON_ONCE(atomic_read(&kn->active) >= 0);

- up_write(&root->kernfs_rwsem);
+ up_write_kernfs_rwsem(anc);

- if (kernfs_lockdep(kn)) {
- rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
- if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS)
- lock_contended(&kn->dep_map, _RET_IP_);
+ if (kernfs_lockdep(anc)) {
+ rwsem_acquire(&anc->dep_map, 0, 0, _RET_IP_);
+ if (atomic_read(&anc->active) != KN_DEACTIVATED_BIAS)
+ lock_contended(&anc->dep_map, _RET_IP_);
}

/* but everyone should wait for draining */
wait_event(root->deactivate_waitq,
atomic_read(&kn->active) == KN_DEACTIVATED_BIAS);

- if (kernfs_lockdep(kn)) {
- lock_acquired(&kn->dep_map, _RET_IP_);
- rwsem_release(&kn->dep_map, _RET_IP_);
+ if (kernfs_lockdep(anc)) {
+ lock_acquired(&anc->dep_map, _RET_IP_);
+ rwsem_release(&anc->dep_map, _RET_IP_);
}

kernfs_drain_open_files(kn);

- down_write(&root->kernfs_rwsem);
+ down_write_kernfs_rwsem(anc, LOCK_SELF, 0);
}

/**
@@ -718,12 +721,11 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
int kernfs_add_one(struct kernfs_node *kn)
{
struct kernfs_node *parent = kn->parent;
- struct kernfs_root *root = kernfs_root(parent);
struct kernfs_iattrs *ps_iattr;
bool has_ns;
int ret;

- down_write(&root->kernfs_rwsem);
+ down_write_kernfs_rwsem(parent, LOCK_SELF, 0);

ret = -EINVAL;
has_ns = kernfs_ns_enabled(parent);
@@ -754,7 +756,7 @@ int kernfs_add_one(struct kernfs_node *kn)
ps_iattr->ia_mtime = ps_iattr->ia_ctime;
}

- up_write(&root->kernfs_rwsem);
+ up_write_kernfs_rwsem(parent);

/*
* Activate the new node unless CREATE_DEACTIVATED is requested.
@@ -768,7 +770,7 @@ int kernfs_add_one(struct kernfs_node *kn)
return 0;

out_unlock:
- up_write(&root->kernfs_rwsem);
+ up_write_kernfs_rwsem(parent);
return ret;
}

@@ -788,8 +790,9 @@ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
struct rb_node *node = parent->dir.children.rb_node;
bool has_ns = kernfs_ns_enabled(parent);
unsigned int hash;
+ int idx = hash_ptr(parent, NR_KERNFS_LOCK_BITS);

- lockdep_assert_held(&kernfs_root(parent)->kernfs_rwsem);
+ lockdep_assert_held(&kernfs_root(parent)->kernfs_rwsem[idx]);

if (has_ns != (bool)ns) {
WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
@@ -820,8 +823,9 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
{
size_t len;
char *p, *name;
+ int idx = hash_ptr(parent, NR_KERNFS_LOCK_BITS);

- lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem);
+ lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem[idx]);

/* grab kernfs_rename_lock to piggy back on kernfs_pr_cont_buf */
spin_lock_irq(&kernfs_rename_lock);
@@ -860,12 +864,11 @@ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
const char *name, const void *ns)
{
struct kernfs_node *kn;
- struct kernfs_root *root = kernfs_root(parent);

- down_read(&root->kernfs_rwsem);
+ down_read_kernfs_rwsem(parent, LOCK_SELF, 0);
kn = kernfs_find_ns(parent, name, ns);
kernfs_get(kn);
- up_read(&root->kernfs_rwsem);
+ up_read_kernfs_rwsem(parent);

return kn;
}
@@ -885,12 +888,11 @@ struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent,
const char *path, const void *ns)
{
struct kernfs_node *kn;
- struct kernfs_root *root = kernfs_root(parent);

- down_read(&root->kernfs_rwsem);
+ down_read_kernfs_rwsem(parent, LOCK_SELF, 0);
kn = kernfs_walk_ns(parent, path, ns);
kernfs_get(kn);
- up_read(&root->kernfs_rwsem);
+ up_read_kernfs_rwsem(parent);

return kn;
}
@@ -916,11 +918,12 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
return ERR_PTR(-ENOMEM);

idr_init(&root->ino_idr);
- init_rwsem(&root->kernfs_rwsem);
for (lock_count = 0; lock_count < NR_KERNFS_LOCKS; lock_count++) {
spin_lock_init(&root->open_node_locks[lock_count].lock);
mutex_init(&root->open_file_mutex[lock_count].lock);
+ init_rwsem(&root->kernfs_rwsem[lock_count]);
}
+ init_rwsem(&root->supers_rwsem);
INIT_LIST_HEAD(&root->supers);

/*
@@ -1067,12 +1070,12 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
if (parent) {
spin_unlock(&dentry->d_lock);
root = kernfs_root(parent);
- down_read(&root->kernfs_rwsem);
+ down_read_kernfs_rwsem(parent, LOCK_SELF, 0);
if (kernfs_dir_changed(parent, dentry)) {
- up_read(&root->kernfs_rwsem);
+ up_read_kernfs_rwsem(parent);
return 0;
}
- up_read(&root->kernfs_rwsem);
+ up_read_kernfs_rwsem(parent);
} else
spin_unlock(&dentry->d_lock);

@@ -1084,7 +1087,7 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)

kn = kernfs_dentry_node(dentry);
root = kernfs_root(kn);
- down_read(&root->kernfs_rwsem);
+ down_read_kernfs_rwsem(kn, LOCK_SELF, 0);

/* The kernfs node has been deactivated */
if (!kernfs_active(kn))
@@ -1103,10 +1106,10 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
kernfs_info(dentry->d_sb)->ns != kn->ns)
goto out_bad;

- up_read(&root->kernfs_rwsem);
+ up_read_kernfs_rwsem(kn);
return 1;
out_bad:
- up_read(&root->kernfs_rwsem);
+ up_read_kernfs_rwsem(kn);
return 0;
}

@@ -1125,23 +1128,28 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
const void *ns = NULL;

root = kernfs_root(parent);
- down_read(&root->kernfs_rwsem);
+
+ down_read_kernfs_rwsem(parent, LOCK_SELF, 0);
if (kernfs_ns_enabled(parent))
ns = kernfs_info(dir->i_sb)->ns;

kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
+ up_read_kernfs_rwsem(parent);
/* attach dentry and inode */
if (kn) {
/* Inactive nodes are invisible to the VFS so don't
* create a negative.
*/
+ down_read_kernfs_rwsem(kn, LOCK_SELF, 0);
if (!kernfs_active(kn)) {
- up_read(&root->kernfs_rwsem);
+ /* Unlock both node and parent before returning */
+ up_read_kernfs_rwsem(kn);
return NULL;
}
inode = kernfs_get_inode(dir->i_sb, kn);
if (!inode)
inode = ERR_PTR(-ENOMEM);
+ up_read_kernfs_rwsem(kn);
}
/*
* Needed for negative dentry validation.
@@ -1149,9 +1157,10 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
* or transforms from positive dentry in dentry_unlink_inode()
* called from vfs_rmdir().
*/
+ down_read_kernfs_rwsem(parent, LOCK_SELF, 0);
if (!IS_ERR(inode))
kernfs_set_rev(parent, dentry);
- up_read(&root->kernfs_rwsem);
+ up_read_kernfs_rwsem(parent);

/* instantiate and hash (possibly negative) dentry */
return d_splice_alias(inode, dentry);
@@ -1273,8 +1282,9 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
struct kernfs_node *root)
{
struct rb_node *rbn;
+ int idx = hash_ptr(root, NR_KERNFS_LOCK_BITS);

- lockdep_assert_held_write(&kernfs_root(root)->kernfs_rwsem);
+ lockdep_assert_held_write(&kernfs_root(root)->kernfs_rwsem[idx]);

/* if first iteration, visit leftmost descendant which may be root */
if (!pos)
@@ -1309,9 +1319,8 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
void kernfs_activate(struct kernfs_node *kn)
{
struct kernfs_node *pos;
- struct kernfs_root *root = kernfs_root(kn);

- down_write(&root->kernfs_rwsem);
+ down_write_kernfs_rwsem(kn, LOCK_SELF, 0);

pos = NULL;
while ((pos = kernfs_next_descendant_post(pos, kn))) {
@@ -1325,14 +1334,15 @@ void kernfs_activate(struct kernfs_node *kn)
pos->flags |= KERNFS_ACTIVATED;
}

- up_write(&root->kernfs_rwsem);
+ up_write_kernfs_rwsem(kn);
}

static void __kernfs_remove(struct kernfs_node *kn)
{
struct kernfs_node *pos;
+ int idx = hash_ptr(kn, NR_KERNFS_LOCK_BITS);

- lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem);
+ lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem[idx]);

/*
* Short-circuit if non-root @kn has already finished removal.
@@ -1346,9 +1356,16 @@ static void __kernfs_remove(struct kernfs_node *kn)

/* prevent any new usage under @kn by deactivating all nodes */
pos = NULL;
- while ((pos = kernfs_next_descendant_post(pos, kn)))
+ while ((pos = kernfs_next_descendant_post(pos, kn))) {
+ int n_idx = hash_ptr(pos, NR_KERNFS_LOCK_BITS);
+
+ if (n_idx != idx)
+ down_write_kernfs_rwsem(pos, LOCK_SELF, 1);
if (kernfs_active(pos))
atomic_add(KN_DEACTIVATED_BIAS, &pos->active);
+ if (n_idx != idx)
+ up_write_kernfs_rwsem(pos);
+ }

/* deactivate and unlink the subtree node-by-node */
do {
@@ -1369,7 +1386,7 @@ static void __kernfs_remove(struct kernfs_node *kn)
* error paths without worrying about draining.
*/
if (kn->flags & KERNFS_ACTIVATED)
- kernfs_drain(pos);
+ kernfs_drain(pos, kn);
else
WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);

@@ -1402,11 +1419,9 @@ static void __kernfs_remove(struct kernfs_node *kn)
*/
void kernfs_remove(struct kernfs_node *kn)
{
- struct kernfs_root *root = kernfs_root(kn);
-
- down_write(&root->kernfs_rwsem);
+ down_write_kernfs_rwsem(kn, LOCK_SELF, 0);
__kernfs_remove(kn);
- up_write(&root->kernfs_rwsem);
+ up_write_kernfs_rwsem(kn);
}

/**
@@ -1492,9 +1507,8 @@ void kernfs_unbreak_active_protection(struct kernfs_node *kn)
bool kernfs_remove_self(struct kernfs_node *kn)
{
bool ret;
- struct kernfs_root *root = kernfs_root(kn);

- down_write(&root->kernfs_rwsem);
+ down_write_kernfs_rwsem(kn, LOCK_SELF, 0);
kernfs_break_active_protection(kn);

/*
@@ -1522,9 +1536,9 @@ bool kernfs_remove_self(struct kernfs_node *kn)
atomic_read(&kn->active) == KN_DEACTIVATED_BIAS)
break;

- up_write(&root->kernfs_rwsem);
+ up_write_kernfs_rwsem(kn);
schedule();
- down_write(&root->kernfs_rwsem);
+ down_write_kernfs_rwsem(kn, LOCK_SELF, 0);
}
finish_wait(waitq, &wait);
WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb));
@@ -1537,7 +1551,7 @@ bool kernfs_remove_self(struct kernfs_node *kn)
*/
kernfs_unbreak_active_protection(kn);

- up_write(&root->kernfs_rwsem);
+ up_write_kernfs_rwsem(kn);
return ret;
}

@@ -1555,7 +1569,7 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
{
struct kernfs_node *kn;
struct kernfs_root *root;
-
+ int idx, p_idx;
if (!parent) {
WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n",
name);
@@ -1563,13 +1577,15 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
}

root = kernfs_root(parent);
- down_write(&root->kernfs_rwsem);
+ down_write_kernfs_rwsem(parent, LOCK_SELF, 0);

kn = kernfs_find_ns(parent, name, ns);
- if (kn)
+ up_write_kernfs_rwsem(parent);
+ if (kn) {
+ down_write_kernfs_rwsem(kn, LOCK_SELF, 0);
__kernfs_remove(kn);
-
- up_write(&root->kernfs_rwsem);
+ up_write_kernfs_rwsem(kn);
+ }

if (kn)
return 0;
@@ -1590,35 +1606,66 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
struct kernfs_node *old_parent;
struct kernfs_root *root;
const char *old_name = NULL;
- int error;
+ int error, idx, np_idx, p_idx;

/* can't move or rename root */
if (!kn->parent)
return -EINVAL;

root = kernfs_root(kn);
- down_write(&root->kernfs_rwsem);
+
+ /*
+ * Take lock of node's old (current) parent.
+ * If new parent has a different lock, then take that
+ * lock as well.
+ */
+ idx = hash_ptr(kn, NR_KERNFS_LOCK_BITS);
+ p_idx = hash_ptr(kn->parent, NR_KERNFS_LOCK_BITS);
+ np_idx = hash_ptr(new_parent, NR_KERNFS_LOCK_BITS);
+
+ /*
+ * Take only kn's lock. The subsequent kernfs_put
+ * may free up old_parent so if old_parent has a
+ * different lock, we will explicitly release that.
+ */
+ down_write_kernfs_rwsem(kn, LOCK_SELF, 0);
+
+ if (idx != np_idx) /* new parent hashes to different lock */
+ down_write_kernfs_rwsem(new_parent, LOCK_SELF, 1);
+
+ /* old_parent hashes to a different lock */
+ if (idx != p_idx && p_idx != np_idx)
+ down_write_kernfs_rwsem(kn->parent, LOCK_SELF, 2);

error = -ENOENT;
if (!kernfs_active(kn) || !kernfs_active(new_parent) ||
- (new_parent->flags & KERNFS_EMPTY_DIR))
+ (new_parent->flags & KERNFS_EMPTY_DIR)) {
+ if (idx != p_idx && p_idx != np_idx)
+ up_write_kernfs_rwsem(kn->parent);
goto out;
-
+ }
error = 0;
if ((kn->parent == new_parent) && (kn->ns == new_ns) &&
- (strcmp(kn->name, new_name) == 0))
+ (strcmp(kn->name, new_name) == 0)) {
+ if (idx != p_idx && p_idx != np_idx)
+ up_write_kernfs_rwsem(kn->parent);
goto out; /* nothing to rename */
-
+ }
error = -EEXIST;
- if (kernfs_find_ns(new_parent, new_name, new_ns))
+ if (kernfs_find_ns(new_parent, new_name, new_ns)) {
+ if (idx != p_idx && p_idx != np_idx)
+ up_write_kernfs_rwsem(kn->parent);
goto out;
-
+ }
/* rename kernfs_node */
if (strcmp(kn->name, new_name) != 0) {
error = -ENOMEM;
new_name = kstrdup_const(new_name, GFP_KERNEL);
- if (!new_name)
+ if (!new_name) {
+ if (idx != p_idx && p_idx != np_idx)
+ up_write_kernfs_rwsem(kn->parent);
goto out;
+ }
} else {
new_name = NULL;
}
@@ -1646,12 +1693,22 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
kn->hash = kernfs_name_hash(kn->name, kn->ns);
kernfs_link_sibling(kn);

+ /* Release old_parent's lock, if it is different */
+ if (idx != p_idx && p_idx != np_idx)
+ up_write_kernfs_rwsem(old_parent);
kernfs_put(old_parent);
kfree_const(old_name);

error = 0;
out:
- up_write(&root->kernfs_rwsem);
+ /*
+ * If new parent lock has been taken release it.
+ * Lastly release node's lock.
+ */
+ if (idx != np_idx) /* new parent hashes to different lock */
+ up_write_kernfs_rwsem(new_parent);
+
+ up_write_kernfs_rwsem(kn);
return error;
}

@@ -1670,9 +1727,20 @@ static int kernfs_dir_fop_release(struct inode *inode, struct file *filp)
static struct kernfs_node *kernfs_dir_pos(const void *ns,
struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos)
{
+ int idx, p_idx;
+
+ p_idx = hash_ptr(parent, NR_KERNFS_LOCK_BITS);
+ lockdep_assert_held(&kernfs_root(parent)->kernfs_rwsem[p_idx]);
if (pos) {
- int valid = kernfs_active(pos) &&
+ int valid = 0;
+
+ idx = hash_ptr(pos, NR_KERNFS_LOCK_BITS);
+ if (idx != p_idx)
+ down_read_kernfs_rwsem(pos, LOCK_SELF, 1);
+ valid = kernfs_active(pos) &&
pos->parent == parent && hash == pos->hash;
+ if (idx != p_idx)
+ up_read_kernfs_rwsem(pos);
kernfs_put(pos);
if (!valid)
pos = NULL;
@@ -1681,18 +1749,37 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns,
struct rb_node *node = parent->dir.children.rb_node;
while (node) {
pos = rb_to_kn(node);
-
+ idx = hash_ptr(pos, NR_KERNFS_LOCK_BITS);
+ if (idx != p_idx)
+ down_read_kernfs_rwsem(pos, LOCK_SELF, 1);
if (hash < pos->hash)
node = node->rb_left;
else if (hash > pos->hash)
node = node->rb_right;
- else
+ else {
+ if (idx != p_idx)
+ up_read_kernfs_rwsem(pos);
break;
+ }
+ if (idx != p_idx)
+ up_read_kernfs_rwsem(pos);
}
}
/* Skip over entries which are dying/dead or in the wrong namespace */
- while (pos && (!kernfs_active(pos) || pos->ns != ns)) {
- struct rb_node *node = rb_next(&pos->rb);
+ while (pos) {
+ struct rb_node *node;
+
+ idx = hash_ptr(pos, NR_KERNFS_LOCK_BITS);
+ if (idx != p_idx)
+ down_read_kernfs_rwsem(pos, LOCK_SELF, 1);
+ if (kernfs_active(pos) && pos->ns == ns) {
+ if (idx != p_idx)
+ up_read_kernfs_rwsem(pos);
+ break;
+ }
+ node = rb_next(&pos->rb);
+ if (idx != p_idx)
+ up_read_kernfs_rwsem(pos);
if (!node)
pos = NULL;
else
@@ -1704,16 +1791,41 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns,
static struct kernfs_node *kernfs_dir_next_pos(const void *ns,
struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos)
{
+ int idx, p_idx;
+ int unlock_node = 0;
+
+ p_idx = hash_ptr(parent, NR_KERNFS_LOCK_BITS);
+ lockdep_assert_held(&kernfs_root(parent)->kernfs_rwsem[p_idx]);
pos = kernfs_dir_pos(ns, parent, ino, pos);
if (pos) {
+ idx = hash_ptr(pos, NR_KERNFS_LOCK_BITS);
+ if (idx != p_idx)
+ down_read_kernfs_rwsem(pos, LOCK_SELF, 1);
do {
struct rb_node *node = rb_next(&pos->rb);
+ if (idx != p_idx) {
+ up_read_kernfs_rwsem(pos);
+ unlock_node = 0;
+ }
if (!node)
pos = NULL;
- else
+ else {
pos = rb_to_kn(node);
+ if (pos != NULL) {
+ idx = hash_ptr(pos,
+ NR_KERNFS_LOCK_BITS);
+ if (idx != p_idx) {
+ down_read_kernfs_rwsem(pos,
+ LOCK_SELF,
+ 1);
+ unlock_node = 1;
+ }
+ }
+ }
} while (pos && (!kernfs_active(pos) || pos->ns != ns));
}
+ if (unlock_node)
+ up_read_kernfs_rwsem(pos);
return pos;
}

@@ -1729,7 +1841,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
return 0;

root = kernfs_root(parent);
- down_read(&root->kernfs_rwsem);
+ down_read_kernfs_rwsem(parent, LOCK_SELF, 0);

if (kernfs_ns_enabled(parent))
ns = kernfs_info(dentry->d_sb)->ns;
@@ -1746,12 +1858,12 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
file->private_data = pos;
kernfs_get(pos);

- up_read(&root->kernfs_rwsem);
+ up_read_kernfs_rwsem(parent);
if (!dir_emit(ctx, name, len, ino, type))
return 0;
- down_read(&root->kernfs_rwsem);
+ down_read_kernfs_rwsem(parent, LOCK_SELF, 0);
}
- up_read(&root->kernfs_rwsem);
+ up_read_kernfs_rwsem(parent);
file->private_data = NULL;
ctx->pos = INT_MAX;
return 0;
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 018d038b72fdd..5124add292582 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -855,8 +855,9 @@ static void kernfs_notify_workfn(struct work_struct *work)

root = kernfs_root(kn);
/* kick fsnotify */
- down_write(&root->kernfs_rwsem);
+ down_write_kernfs_rwsem(kn, LOCK_SELF, 0);

+ down_write(&root->supers_rwsem);
list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
struct kernfs_node *parent;
struct inode *p_inode = NULL;
@@ -892,8 +893,9 @@ static void kernfs_notify_workfn(struct work_struct *work)

iput(inode);
}
+ up_write(&root->supers_rwsem);

- up_write(&root->kernfs_rwsem);
+ up_write_kernfs_rwsem(kn);
kernfs_put(kn);
goto repeat;
}
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 3d783d80f5daa..a8b16f08a667a 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -99,11 +99,10 @@ int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
{
int ret;
- struct kernfs_root *root = kernfs_root(kn);

- down_write(&root->kernfs_rwsem);
+ down_write_kernfs_rwsem(kn, LOCK_SELF, 0);
ret = __kernfs_setattr(kn, iattr);
- up_write(&root->kernfs_rwsem);
+ up_write_kernfs_rwsem(kn);
return ret;
}

@@ -119,7 +118,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
return -EINVAL;

root = kernfs_root(kn);
- down_write(&root->kernfs_rwsem);
+ down_write_kernfs_rwsem(kn, LOCK_SELF, 0);
error = setattr_prepare(&init_user_ns, dentry, iattr);
if (error)
goto out;
@@ -132,7 +131,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
setattr_copy(&init_user_ns, inode, iattr);

out:
- up_write(&root->kernfs_rwsem);
+ up_write_kernfs_rwsem(kn);
return error;
}

@@ -187,14 +186,13 @@ int kernfs_iop_getattr(struct user_namespace *mnt_userns,
{
struct inode *inode = d_inode(path->dentry);
struct kernfs_node *kn = inode->i_private;
- struct kernfs_root *root = kernfs_root(kn);

- down_read(&root->kernfs_rwsem);
+ down_read_kernfs_rwsem(kn, LOCK_SELF, 0);
spin_lock(&inode->i_lock);
kernfs_refresh_inode(kn, inode);
generic_fillattr(&init_user_ns, inode, stat);
spin_unlock(&inode->i_lock);
- up_read(&root->kernfs_rwsem);
+ up_read_kernfs_rwsem(kn);

return 0;
}
@@ -287,12 +285,12 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns,
kn = inode->i_private;
root = kernfs_root(kn);

- down_read(&root->kernfs_rwsem);
+ down_read_kernfs_rwsem(kn, LOCK_SELF, 0);
spin_lock(&inode->i_lock);
kernfs_refresh_inode(kn, inode);
ret = generic_permission(&init_user_ns, inode, mask);
spin_unlock(&inode->i_lock);
- up_read(&root->kernfs_rwsem);
+ up_read_kernfs_rwsem(kn);

return ret;
}
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index cc49a6cd94154..3f011b323173c 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -19,6 +19,9 @@
#include <linux/kernfs.h>
#include <linux/fs_context.h>

+#define LOCK_SELF 0
+#define LOCK_SELF_AND_PARENT 1
+
struct kernfs_iattrs {
kuid_t ia_uid;
kgid_t ia_gid;
@@ -102,6 +105,115 @@ static inline bool kernfs_dir_changed(struct kernfs_node *parent,
return false;
}

+/*
+ * If both node and it's parent need locking,
+ * lock child first so that kernfs_rename_ns
+ * does not change the parent, leaving us
+ * with old parent here.
+ */
+static inline void down_write_kernfs_rwsem(struct kernfs_node *kn,
+ u8 lock_parent,
+ u8 nesting)
+{
+ int idx, p_idx;
+ struct kernfs_root *root;
+
+ idx = hash_ptr(kn, NR_KERNFS_LOCK_BITS);
+ root = kernfs_root(kn);
+
+ down_write_nested(&root->kernfs_rwsem[idx], nesting);
+
+ kernfs_get(kn);
+
+ if (kn->parent)
+ p_idx = hash_ptr(kn->parent, NR_KERNFS_LOCK_BITS);
+
+ if (kn->parent && lock_parent && p_idx != idx) {
+ /*
+ * Node and parent hash to different locks.
+ * node's lock has already been taken.
+ * Take parent's lock and update token.
+ */
+ down_write_nested(&root->kernfs_rwsem[p_idx],
+ nesting + 1);
+
+ kernfs_get(kn->parent);
+ kn->unlock_parent = 1;
+ }
+}
+
+static inline void up_write_kernfs_rwsem(struct kernfs_node *kn)
+{
+ int p_idx, idx;
+ struct kernfs_root *root;
+
+ /* node lock is already taken in down_xxx so kn->parent is safe */
+ p_idx = hash_ptr(kn->parent, NR_KERNFS_LOCK_BITS);
+ idx = hash_ptr(kn, NR_KERNFS_LOCK_BITS);
+ root = kernfs_root(kn);
+
+ if (kn->unlock_parent) {
+ kn->unlock_parent = 0;
+ up_write(&root->kernfs_rwsem[p_idx]);
+ kernfs_put(kn->parent);
+ }
+
+ up_write(&root->kernfs_rwsem[idx]);
+ kernfs_put(kn);
+}
+
+static inline void down_read_kernfs_rwsem(struct kernfs_node *kn,
+ u8 lock_parent,
+ u8 nesting)
+{
+ int idx, p_idx;
+ struct kernfs_root *root;
+
+ idx = hash_ptr(kn, NR_KERNFS_LOCK_BITS);
+ root = kernfs_root(kn);
+
+ down_read_nested(&root->kernfs_rwsem[idx], nesting);
+
+ kernfs_get(kn);
+
+ if (kn->parent)
+ p_idx = hash_ptr(kn->parent, NR_KERNFS_LOCK_BITS);
+
+ if (kn->parent && lock_parent && p_idx != idx) {
+ /*
+ * Node and parent hash to different locks.
+ * node's lock has already been taken.
+ * Take parent's lock and update token.
+ */
+ down_read_nested(&root->kernfs_rwsem[p_idx],
+ nesting + 1);
+
+ kernfs_get(kn->parent);
+
+ kn->unlock_parent = 1;
+ }
+}
+
+static inline void up_read_kernfs_rwsem(struct kernfs_node *kn)
+{
+ int p_idx, idx;
+ struct kernfs_root *root;
+
+ /* node lock is already taken in down_xxx so kn->parent is safe */
+ p_idx = hash_ptr(kn->parent, NR_KERNFS_LOCK_BITS);
+ idx = hash_ptr(kn, NR_KERNFS_LOCK_BITS);
+ root = kernfs_root(kn);
+
+ if (kn->unlock_parent) {
+ kn->unlock_parent = 0;
+ up_read(&root->kernfs_rwsem[p_idx]);
+ kernfs_put(kn->parent);
+ }
+
+ up_read(&root->kernfs_rwsem[idx]);
+ kernfs_put(kn);
+}
+
extern const struct super_operations kernfs_sops;
extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;

diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index cfa79715fc1a7..ebb7d9a10f47e 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -236,7 +236,6 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *kfc)
{
struct kernfs_super_info *info = kernfs_info(sb);
- struct kernfs_root *kf_root = kfc->root;
struct inode *inode;
struct dentry *root;

@@ -256,9 +255,9 @@ static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *k
sb->s_shrink.seeks = 0;

/* get root inode, initialize and unlock it */
- down_read(&kf_root->kernfs_rwsem);
+ down_read_kernfs_rwsem(info->root->kn, 0, 0);
inode = kernfs_get_inode(sb, info->root->kn);
- up_read(&kf_root->kernfs_rwsem);
+ up_read_kernfs_rwsem(info->root->kn);
if (!inode) {
pr_debug("kernfs: could not get root inode\n");
return -ENOMEM;
@@ -346,9 +345,9 @@ int kernfs_get_tree(struct fs_context *fc)
}
sb->s_flags |= SB_ACTIVE;

- down_write(&root->kernfs_rwsem);
+ down_write(&root->supers_rwsem);
list_add(&info->node, &info->root->supers);
- up_write(&root->kernfs_rwsem);
+ up_write(&root->supers_rwsem);
}

fc->root = dget(sb->s_root);
@@ -375,9 +374,9 @@ void kernfs_kill_sb(struct super_block *sb)
struct kernfs_super_info *info = kernfs_info(sb);
struct kernfs_root *root = info->root;

- down_write(&root->kernfs_rwsem);
+ down_write(&root->supers_rwsem);
list_del(&info->node);
- up_write(&root->kernfs_rwsem);
+ up_write(&root->supers_rwsem);

/*
* Remove the superblock from fs_supers/s_instances
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index 0ab13824822f7..5d4a769e2ab1e 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -113,12 +113,11 @@ static int kernfs_getlink(struct inode *inode, char *path)
struct kernfs_node *kn = inode->i_private;
struct kernfs_node *parent = kn->parent;
struct kernfs_node *target = kn->symlink.target_kn;
- struct kernfs_root *root = kernfs_root(parent);
int error;

- down_read(&root->kernfs_rwsem);
+ down_read_kernfs_rwsem(parent, LOCK_SELF, 0);
error = kernfs_get_target_path(parent, target, path);
- up_read(&root->kernfs_rwsem);
+ up_read_kernfs_rwsem(parent);

return error;
}
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 5bf9f02ce9dce..3b3c3e0b44083 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -179,6 +179,7 @@ struct kernfs_node {
*/
struct kernfs_node *parent;
const char *name;
+ u8 unlock_parent; /* release parent's rwsem */

struct rb_node rb;

@@ -237,9 +238,10 @@ struct kernfs_root {
struct list_head supers;

wait_queue_head_t deactivate_waitq;
- struct rw_semaphore kernfs_rwsem;
struct kernfs_open_node_lock open_node_locks[NR_KERNFS_LOCKS];
struct kernfs_open_file_mutex open_file_mutex[NR_KERNFS_LOCKS];
+ struct rw_semaphore supers_rwsem;
+ struct rw_semaphore kernfs_rwsem[NR_KERNFS_LOCKS];
};

struct kernfs_open_file {
@@ -619,5 +621,4 @@ static inline int kernfs_rename(struct kernfs_node *kn,
{
return kernfs_rename_ns(kn, new_parent, new_name, NULL);
}
-
#endif /* __LINUX_KERNFS_H */
--
2.30.2


2022-02-03 14:38:00

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH v4 2/2] kernfs: Replace per-fs global rwsem with per-fs hashed rwsem.

Hi Imran,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on a70bf4a85b43cb952bd39dd948b103b1b3eb2cf8]

url: https://github.com/0day-ci/linux/commits/Imran-Khan/kernfs-use-hashed-mutex-and-spinlock-in-place-of-global-ones/20220202-225301
base: a70bf4a85b43cb952bd39dd948b103b1b3eb2cf8
config: um-i386_defconfig (https://download.01.org/0day-ci/archive/20220203/[email protected]/config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce (this is a W=1 build):
# https://github.com/0day-ci/linux/commit/e1946294e8a0e6eb1f9876e7521c92f92c8c4af9
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Imran-Khan/kernfs-use-hashed-mutex-and-spinlock-in-place-of-global-ones/20220202-225301
git checkout e1946294e8a0e6eb1f9876e7521c92f92c8c4af9
# save the config file to linux build tree
mkdir build_dir
make W=1 O=build_dir ARCH=um SUBARCH=i386 SHELL=/bin/bash fs/kernfs/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>

All warnings (new ones prefixed by >>):

fs/kernfs/dir.c: In function 'kernfs_dop_revalidate':
>> fs/kernfs/dir.c:1056:22: warning: variable 'root' set but not used [-Wunused-but-set-variable]
1056 | struct kernfs_root *root;
| ^~~~
fs/kernfs/dir.c: In function 'kernfs_iop_lookup':
fs/kernfs/dir.c:1126:22: warning: variable 'root' set but not used [-Wunused-but-set-variable]
1126 | struct kernfs_root *root;
| ^~~~
fs/kernfs/dir.c: In function 'kernfs_remove_by_name_ns':
fs/kernfs/dir.c:1572:11: warning: unused variable 'p_idx' [-Wunused-variable]
1572 | int idx, p_idx;
| ^~~~~
fs/kernfs/dir.c:1572:6: warning: unused variable 'idx' [-Wunused-variable]
1572 | int idx, p_idx;
| ^~~
fs/kernfs/dir.c:1571:22: warning: variable 'root' set but not used [-Wunused-but-set-variable]
1571 | struct kernfs_root *root;
| ^~~~
fs/kernfs/dir.c: In function 'kernfs_rename_ns':
fs/kernfs/dir.c:1607:22: warning: variable 'root' set but not used [-Wunused-but-set-variable]
1607 | struct kernfs_root *root;
| ^~~~
fs/kernfs/dir.c: In function 'kernfs_fop_readdir':
fs/kernfs/dir.c:1837:22: warning: variable 'root' set but not used [-Wunused-but-set-variable]
1837 | struct kernfs_root *root;
| ^~~~


vim +/root +1056 fs/kernfs/dir.c

ea015218f2f7ac Eric W. Biederman 2015-05-13 1052
d826e0365199cc Ian Kent 2021-06-15 1053 static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
d826e0365199cc Ian Kent 2021-06-15 1054 {
d826e0365199cc Ian Kent 2021-06-15 1055 struct kernfs_node *kn;
393c3714081a53 Minchan Kim 2021-11-18 @1056 struct kernfs_root *root;
d826e0365199cc Ian Kent 2021-06-15 1057
d826e0365199cc Ian Kent 2021-06-15 1058 if (flags & LOOKUP_RCU)
d826e0365199cc Ian Kent 2021-06-15 1059 return -ECHILD;
d826e0365199cc Ian Kent 2021-06-15 1060
c7e7c04274b13f Ian Kent 2021-07-16 1061 /* Negative hashed dentry? */
c7e7c04274b13f Ian Kent 2021-07-16 1062 if (d_really_is_negative(dentry)) {
c7e7c04274b13f Ian Kent 2021-07-16 1063 struct kernfs_node *parent;
c7e7c04274b13f Ian Kent 2021-07-16 1064
c7e7c04274b13f Ian Kent 2021-07-16 1065 /* If the kernfs parent node has changed discard and
c7e7c04274b13f Ian Kent 2021-07-16 1066 * proceed to ->lookup.
c7e7c04274b13f Ian Kent 2021-07-16 1067 */
c7e7c04274b13f Ian Kent 2021-07-16 1068 spin_lock(&dentry->d_lock);
c7e7c04274b13f Ian Kent 2021-07-16 1069 parent = kernfs_dentry_node(dentry->d_parent);
c7e7c04274b13f Ian Kent 2021-07-16 1070 if (parent) {
c7e7c04274b13f Ian Kent 2021-07-16 1071 spin_unlock(&dentry->d_lock);
393c3714081a53 Minchan Kim 2021-11-18 1072 root = kernfs_root(parent);
e1946294e8a0e6 Imran Khan 2022-02-03 1073 down_read_kernfs_rwsem(parent, LOCK_SELF, 0);
393c3714081a53 Minchan Kim 2021-11-18 1074 if (kernfs_dir_changed(parent, dentry)) {
e1946294e8a0e6 Imran Khan 2022-02-03 1075 up_read_kernfs_rwsem(parent);
c7e7c04274b13f Ian Kent 2021-07-16 1076 return 0;
c7e7c04274b13f Ian Kent 2021-07-16 1077 }
e1946294e8a0e6 Imran Khan 2022-02-03 1078 up_read_kernfs_rwsem(parent);
393c3714081a53 Minchan Kim 2021-11-18 1079 } else
c7e7c04274b13f Ian Kent 2021-07-16 1080 spin_unlock(&dentry->d_lock);
c7e7c04274b13f Ian Kent 2021-07-16 1081
c7e7c04274b13f Ian Kent 2021-07-16 1082 /* The kernfs parent node hasn't changed, leave the
c7e7c04274b13f Ian Kent 2021-07-16 1083 * dentry negative and return success.
c7e7c04274b13f Ian Kent 2021-07-16 1084 */
c7e7c04274b13f Ian Kent 2021-07-16 1085 return 1;
c7e7c04274b13f Ian Kent 2021-07-16 1086 }
d826e0365199cc Ian Kent 2021-06-15 1087
d826e0365199cc Ian Kent 2021-06-15 1088 kn = kernfs_dentry_node(dentry);
393c3714081a53 Minchan Kim 2021-11-18 1089 root = kernfs_root(kn);
e1946294e8a0e6 Imran Khan 2022-02-03 1090 down_read_kernfs_rwsem(kn, LOCK_SELF, 0);
d826e0365199cc Ian Kent 2021-06-15 1091
d826e0365199cc Ian Kent 2021-06-15 1092 /* The kernfs node has been deactivated */
d826e0365199cc Ian Kent 2021-06-15 1093 if (!kernfs_active(kn))
d826e0365199cc Ian Kent 2021-06-15 1094 goto out_bad;
d826e0365199cc Ian Kent 2021-06-15 1095
d826e0365199cc Ian Kent 2021-06-15 1096 /* The kernfs node has been moved? */
d826e0365199cc Ian Kent 2021-06-15 1097 if (kernfs_dentry_node(dentry->d_parent) != kn->parent)
d826e0365199cc Ian Kent 2021-06-15 1098 goto out_bad;
d826e0365199cc Ian Kent 2021-06-15 1099
d826e0365199cc Ian Kent 2021-06-15 1100 /* The kernfs node has been renamed */
d826e0365199cc Ian Kent 2021-06-15 1101 if (strcmp(dentry->d_name.name, kn->name) != 0)
d826e0365199cc Ian Kent 2021-06-15 1102 goto out_bad;
d826e0365199cc Ian Kent 2021-06-15 1103
d826e0365199cc Ian Kent 2021-06-15 1104 /* The kernfs node has been moved to a different namespace */
d826e0365199cc Ian Kent 2021-06-15 1105 if (kn->parent && kernfs_ns_enabled(kn->parent) &&
d826e0365199cc Ian Kent 2021-06-15 1106 kernfs_info(dentry->d_sb)->ns != kn->ns)
d826e0365199cc Ian Kent 2021-06-15 1107 goto out_bad;
d826e0365199cc Ian Kent 2021-06-15 1108
e1946294e8a0e6 Imran Khan 2022-02-03 1109 up_read_kernfs_rwsem(kn);
d826e0365199cc Ian Kent 2021-06-15 1110 return 1;
d826e0365199cc Ian Kent 2021-06-15 1111 out_bad:
e1946294e8a0e6 Imran Khan 2022-02-03 1112 up_read_kernfs_rwsem(kn);
d826e0365199cc Ian Kent 2021-06-15 1113 return 0;
d826e0365199cc Ian Kent 2021-06-15 1114 }
d826e0365199cc Ian Kent 2021-06-15 1115

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]