Reduce contention around global locks used in kernfs.
PATCH-1: Make the global kernfs_open_file_mutex and kernfs_open_node_lock
hashed locks, where the address of a kernfs_node acts as the hash key.
As a result, kernfs_node objects whose addresses hash to different
values use different kernfs_open_file_mutex and kernfs_open_node_lock
instances, rather than all kernfs_node objects sharing the same
kernfs_open_file_mutex and kernfs_open_node_lock as was the case
earlier.
PATCH-2: Use a hashed rw_semaphore to access permissions, so that we can
avoid contention around the global per-fs rw_semaphore, which is seen
when multiple applications execute inode_permission and walk_component
in parallel while trying to open sysfs file(s).
------------------------------------------------------------------
Changes since v2:
- Remove RFC tag
- Use hashed locks rather than using per kernfs_node specific lock
(Suggested by Tejun Heo <[email protected]>)
Imran Khan (2):
kernfs: use hashed mutex and spinlock in place of global ones.
kernfs: Reduce contention around global per-fs kernfs_rwsem.
fs/kernfs/dir.c | 8 +++++
fs/kernfs/file.c | 65 ++++++++++++++++-----------------
fs/kernfs/inode.c | 35 ++++++++++++------
fs/kernfs/kernfs-internal.h | 71 +++++++++++++++++++++++++++++++++++++
fs/kernfs/mount.c | 11 ++++++
5 files changed, 146 insertions(+), 44 deletions(-)
base-commit: a70bf4a85b43cb952bd39dd948b103b1b3eb2cf8
--
2.30.2
Right now a global mutex (kernfs_open_file_mutex) protects the list of
kernfs_open_file instances corresponding to a sysfs attribute. So even
if different tasks are opening or closing different sysfs files, they
can contend on the osq_lock of this mutex. The contention is more
apparent on large-scale systems with a few hundred CPUs, where most of
the CPUs have running tasks that are opening, accessing or closing
sysfs files at any point in time. Since each list of kernfs_open_file
instances belongs to a kernfs_open_node instance, which in turn
corresponds to one kernfs_node, moving the global kernfs_open_file_mutex
into kernfs_node may look like a fix for this contention, but it has the
unwanted side effect of bloating the kernfs_node size and hence kobject
memory usage.
Also, since kernfs_node->attr.open points to the kernfs_open_node
instance corresponding to the kernfs_node, we could use a
kernfs_node-specific spinlock in place of the current global spinlock,
i.e. kernfs_open_node_lock. But this approach would increase kobject
memory usage as well.
Use hashed locks in place of the above-mentioned global locks to reduce
kernfs access contention without increasing kobject memory usage.
Signed-off-by: Imran Khan <[email protected]>
---
fs/kernfs/file.c | 65 ++++++++++++++++++-------------------
fs/kernfs/kernfs-internal.h | 57 ++++++++++++++++++++++++++++++++
fs/kernfs/mount.c | 11 +++++++
3 files changed, 99 insertions(+), 34 deletions(-)
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 9414a7a60a9f..5fdff8597082 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -18,20 +18,6 @@
#include "kernfs-internal.h"
-/*
- * There's one kernfs_open_file for each open file and one kernfs_open_node
- * for each kernfs_node with one or more open files.
- *
- * kernfs_node->attr.open points to kernfs_open_node. attr.open is
- * protected by kernfs_open_node_lock.
- *
- * filp->private_data points to seq_file whose ->private points to
- * kernfs_open_file. kernfs_open_files are chained at
- * kernfs_open_node->files, which is protected by kernfs_open_file_mutex.
- */
-static DEFINE_SPINLOCK(kernfs_open_node_lock);
-static DEFINE_MUTEX(kernfs_open_file_mutex);
-
struct kernfs_open_node {
atomic_t refcnt;
atomic_t event;
@@ -524,10 +510,13 @@ static int kernfs_get_open_node(struct kernfs_node *kn,
struct kernfs_open_file *of)
{
struct kernfs_open_node *on, *new_on = NULL;
-
+ struct mutex *mutex = NULL;
+ spinlock_t *lock = NULL;
retry:
- mutex_lock(&kernfs_open_file_mutex);
- spin_lock_irq(&kernfs_open_node_lock);
+ mutex = open_file_mutex_ptr(kn);
+ lock = open_node_lock_ptr(kn);
+ mutex_lock(mutex);
+ spin_lock_irq(lock);
if (!kn->attr.open && new_on) {
kn->attr.open = new_on;
@@ -540,8 +529,8 @@ static int kernfs_get_open_node(struct kernfs_node *kn,
list_add_tail(&of->list, &on->files);
}
- spin_unlock_irq(&kernfs_open_node_lock);
- mutex_unlock(&kernfs_open_file_mutex);
+ spin_unlock_irq(lock);
+ mutex_unlock(mutex);
if (on) {
kfree(new_on);
@@ -575,10 +564,15 @@ static void kernfs_put_open_node(struct kernfs_node *kn,
struct kernfs_open_file *of)
{
struct kernfs_open_node *on = kn->attr.open;
+ struct mutex *mutex = NULL;
+ spinlock_t *lock = NULL;
unsigned long flags;
- mutex_lock(&kernfs_open_file_mutex);
- spin_lock_irqsave(&kernfs_open_node_lock, flags);
+ mutex = open_file_mutex_ptr(kn);
+ lock = open_node_lock_ptr(kn);
+
+ mutex_lock(mutex);
+ spin_lock_irqsave(lock, flags);
if (of)
list_del(&of->list);
@@ -588,8 +582,8 @@ static void kernfs_put_open_node(struct kernfs_node *kn,
else
on = NULL;
- spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
- mutex_unlock(&kernfs_open_file_mutex);
+ spin_unlock_irqrestore(lock, flags);
+ mutex_unlock(mutex);
kfree(on);
}
@@ -729,11 +723,11 @@ static void kernfs_release_file(struct kernfs_node *kn,
/*
* @of is guaranteed to have no other file operations in flight and
* we just want to synchronize release and drain paths.
- * @kernfs_open_file_mutex is enough. @of->mutex can't be used
+ * @open_file_mutex is enough. @of->mutex can't be used
* here because drain path may be called from places which can
* cause circular dependency.
*/
- lockdep_assert_held(&kernfs_open_file_mutex);
+ lockdep_assert_held(open_file_mutex_ptr(kn));
if (!of->released) {
/*
@@ -752,9 +746,9 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp)
struct kernfs_open_file *of = kernfs_of(filp);
if (kn->flags & KERNFS_HAS_RELEASE) {
- mutex_lock(&kernfs_open_file_mutex);
+ mutex_lock(open_file_mutex_ptr(kn));
kernfs_release_file(kn, of);
- mutex_unlock(&kernfs_open_file_mutex);
+ mutex_unlock(open_file_mutex_ptr(kn));
}
kernfs_put_open_node(kn, of);
@@ -769,19 +763,23 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
{
struct kernfs_open_node *on;
struct kernfs_open_file *of;
+ struct mutex *mutex = NULL;
+ spinlock_t *lock = NULL;
if (!(kn->flags & (KERNFS_HAS_MMAP | KERNFS_HAS_RELEASE)))
return;
- spin_lock_irq(&kernfs_open_node_lock);
+ lock = open_node_lock_ptr(kn);
+ spin_lock_irq(lock);
on = kn->attr.open;
if (on)
atomic_inc(&on->refcnt);
- spin_unlock_irq(&kernfs_open_node_lock);
+ spin_unlock_irq(lock);
if (!on)
return;
- mutex_lock(&kernfs_open_file_mutex);
+ mutex = open_file_mutex_ptr(kn);
+ mutex_lock(mutex);
list_for_each_entry(of, &on->files, list) {
struct inode *inode = file_inode(of->file);
@@ -793,8 +791,7 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
kernfs_release_file(kn, of);
}
- mutex_unlock(&kernfs_open_file_mutex);
-
+ mutex_unlock(mutex);
kernfs_put_open_node(kn, NULL);
}
@@ -922,13 +919,13 @@ void kernfs_notify(struct kernfs_node *kn)
return;
/* kick poll immediately */
- spin_lock_irqsave(&kernfs_open_node_lock, flags);
+ spin_lock_irqsave(open_node_lock_ptr(kn), flags);
on = kn->attr.open;
if (on) {
atomic_inc(&on->event);
wake_up_interruptible(&on->poll);
}
- spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
+ spin_unlock_irqrestore(open_node_lock_ptr(kn), flags);
/* schedule work to kick fsnotify */
spin_lock_irqsave(&kernfs_notify_lock, flags);
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index f9cc912c31e1..4bdcf7a71845 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -19,6 +19,63 @@
#include <linux/kernfs.h>
#include <linux/fs_context.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/cache.h>
+
+#ifdef CONFIG_SMP
+#define NR_KERNFS_LOCK_BITS (2 * (ilog2(NR_CPUS < 32 ? NR_CPUS : 32)))
+#else
+#define NR_KERNFS_LOCK_BITS 1
+#endif
+
+#define NR_KERNFS_LOCKS (1 << NR_KERNFS_LOCK_BITS)
+
+struct kernfs_open_node_lock {
+ spinlock_t lock;
+} ____cacheline_aligned_in_smp;
+
+struct kernfs_open_file_mutex {
+ struct mutex lock;
+} ____cacheline_aligned_in_smp;
+
+/*
+ * There's one kernfs_open_file for each open file and one kernfs_open_node
+ * for each kernfs_node with one or more open files.
+ *
+ * kernfs_node->attr.open points to kernfs_open_node. attr.open is
+ * protected by open_node_locks[i].
+ *
+ * filp->private_data points to seq_file whose ->private points to
+ * kernfs_open_file. kernfs_open_files are chained at
+ * kernfs_open_node->files, which is protected by open_file_mutex[i].
+ *
+ * To reduce possible contention in sysfs access, arising due to single
+ * locks, use an array of locks and use kernfs_node object address as
+ * hash keys to get the index of these locks.
+ */
+
+struct kernfs_global_locks {
+ struct kernfs_open_node_lock open_node_locks[NR_KERNFS_LOCKS];
+ struct kernfs_open_file_mutex open_file_mutex[NR_KERNFS_LOCKS];
+};
+
+static struct kernfs_global_locks kernfs_global_locks;
+
+static inline spinlock_t *open_node_lock_ptr(struct kernfs_node *kn)
+{
+ int index = hash_ptr(kn, NR_KERNFS_LOCK_BITS);
+
+ return &kernfs_global_locks.open_node_locks[index].lock;
+}
+
+static inline struct mutex *open_file_mutex_ptr(struct kernfs_node *kn)
+{
+ int index = hash_ptr(kn, NR_KERNFS_LOCK_BITS);
+
+ return &kernfs_global_locks.open_file_mutex[index].lock;
+}
+
struct kernfs_iattrs {
kuid_t ia_uid;
kgid_t ia_gid;
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index cfa79715fc1a..accfea3ccae9 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -387,6 +387,16 @@ void kernfs_kill_sb(struct super_block *sb)
kfree(info);
}
+void __init kernfs_lock_init(void)
+{
+ int count;
+
+ for (count = 0; count < NR_KERNFS_LOCKS; count++) {
+ spin_lock_init(&kernfs_global_locks.open_node_locks[count].lock);
+ mutex_init(&kernfs_global_locks.open_file_mutex[count].lock);
+ }
+}
+
void __init kernfs_init(void)
{
kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
@@ -397,4 +407,5 @@ void __init kernfs_init(void)
kernfs_iattrs_cache = kmem_cache_create("kernfs_iattrs_cache",
sizeof(struct kernfs_iattrs),
0, SLAB_PANIC, NULL);
+ kernfs_lock_init();
}
--
2.30.2
Right now a global per-file-system rwsem (kernfs_rwsem) synchronizes
multiple kernfs operations. On a large system with a few hundred CPUs
and a few hundred applications simultaneously trying to access sysfs,
this results in multiple sys_open(s) contending on kernfs_rwsem via
kernfs_iop_permission and kernfs_dop_revalidate.
- 21.42% 21.34% showgids [kernel.kallsyms] [k] up_read
21.34% __libc_start_main
__GI___libc_open
entry_SYSCALL_64_after_hwframe
do_syscall_64
sys_open
do_sys_open
do_filp_open
- path_openat
- 20.05% link_path_walk
- 9.76% walk_component
lookup_fast
- d_revalidate.part.24
- 9.75% kernfs_dop_revalidate
up_read
- 9.46% inode_permission
- __inode_permission
- 9.46% kernfs_iop_permission
up_read
- 0.83% kernfs_iop_get_link
up_read
- 0.80% lookup_fast
d_revalidate.part.24
kernfs_dop_revalidate
up_read
- 21.31% 21.21% showgids [kernel.kallsyms] [k] down_read
21.21% __libc_start_main
__GI___libc_open
entry_SYSCALL_64_after_hwframe
do_syscall_64
sys_open
do_sys_open
do_filp_open
- path_openat
- 19.78% link_path_walk
- 10.62% inode_permission
- __inode_permission
- 10.62% kernfs_iop_permission
down_read
- 8.45% walk_component
lookup_fast
- d_revalidate.part.24
- 8.45% kernfs_dop_revalidate
down_read
- 0.71% kernfs_iop_get_link
down_read
- 0.72% lookup_fast
- d_revalidate.part.24
- 0.72% kernfs_dop_revalidate
down_read
- 0.71% may_open
inode_permission
__inode_permission
kernfs_iop_permission
down_read
Since permissions are specific to a kernfs_node, we can use a hashed
lock to access/modify them. Also use kernfs reference counting to
ensure we are accessing/modifying permissions for an existing
kernfs_node object.
Using this change brings down the above-mentioned down_read/up_read
numbers to ~8%, indicating that contention around kernfs_rwsem has been
reduced to about one third of its earlier value.
Signed-off-by: Imran Khan <[email protected]>
---
fs/kernfs/dir.c | 8 ++++++++
fs/kernfs/inode.c | 35 +++++++++++++++++++++++++----------
fs/kernfs/kernfs-internal.h | 18 ++++++++++++++++--
3 files changed, 49 insertions(+), 12 deletions(-)
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index e6d9772ddb4c..37daaffda718 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -720,6 +720,7 @@ int kernfs_add_one(struct kernfs_node *kn)
struct kernfs_node *parent = kn->parent;
struct kernfs_root *root = kernfs_root(parent);
struct kernfs_iattrs *ps_iattr;
+ struct rw_semaphore *rwsem = NULL;
bool has_ns;
int ret;
@@ -748,11 +749,14 @@ int kernfs_add_one(struct kernfs_node *kn)
goto out_unlock;
/* Update timestamps on the parent */
+ rwsem = iattr_rwsem_ptr(parent);
+ down_write(rwsem);
ps_iattr = parent->iattr;
if (ps_iattr) {
ktime_get_real_ts64(&ps_iattr->ia_ctime);
ps_iattr->ia_mtime = ps_iattr->ia_ctime;
}
+ up_write(rwsem);
up_write(&root->kernfs_rwsem);
@@ -1326,6 +1330,7 @@ void kernfs_activate(struct kernfs_node *kn)
static void __kernfs_remove(struct kernfs_node *kn)
{
struct kernfs_node *pos;
+ struct rw_semaphore *rwsem;
lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem);
@@ -1378,8 +1383,11 @@ static void __kernfs_remove(struct kernfs_node *kn)
/* update timestamps on the parent */
if (ps_iattr) {
+ rwsem = iattr_rwsem_ptr(pos->parent);
+ down_write(rwsem);
ktime_get_real_ts64(&ps_iattr->ia_ctime);
ps_iattr->ia_mtime = ps_iattr->ia_ctime;
+ up_write(rwsem);
}
kernfs_put(pos);
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 3d783d80f5da..7a55ad440bf4 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -99,11 +99,15 @@ int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
{
int ret;
- struct kernfs_root *root = kernfs_root(kn);
+ struct rw_semaphore *rwsem = NULL;
- down_write(&root->kernfs_rwsem);
+ kernfs_get(kn);
+ rwsem = iattr_rwsem_ptr(kn);
+ down_write(rwsem);
ret = __kernfs_setattr(kn, iattr);
- up_write(&root->kernfs_rwsem);
+ up_write(rwsem);
+ kernfs_put(kn);
+
return ret;
}
@@ -112,6 +116,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
{
struct inode *inode = d_inode(dentry);
struct kernfs_node *kn = inode->i_private;
+ struct rw_semaphore *rwsem = NULL;
struct kernfs_root *root;
int error;
@@ -119,7 +124,9 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
return -EINVAL;
root = kernfs_root(kn);
- down_write(&root->kernfs_rwsem);
+ kernfs_get(kn);
+ rwsem = iattr_rwsem_ptr(kn);
+ down_write(rwsem);
error = setattr_prepare(&init_user_ns, dentry, iattr);
if (error)
goto out;
@@ -132,7 +139,8 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
setattr_copy(&init_user_ns, inode, iattr);
out:
- up_write(&root->kernfs_rwsem);
+ up_write(rwsem);
+ kernfs_put(kn);
return error;
}
@@ -187,14 +195,17 @@ int kernfs_iop_getattr(struct user_namespace *mnt_userns,
{
struct inode *inode = d_inode(path->dentry);
struct kernfs_node *kn = inode->i_private;
- struct kernfs_root *root = kernfs_root(kn);
+ struct rw_semaphore *rwsem = NULL;
- down_read(&root->kernfs_rwsem);
+ kernfs_get(kn);
+ rwsem = iattr_rwsem_ptr(kn);
+ down_read(rwsem);
spin_lock(&inode->i_lock);
kernfs_refresh_inode(kn, inode);
generic_fillattr(&init_user_ns, inode, stat);
spin_unlock(&inode->i_lock);
- up_read(&root->kernfs_rwsem);
+ up_read(rwsem);
+ kernfs_put(kn);
return 0;
}
@@ -279,6 +290,7 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns,
{
struct kernfs_node *kn;
struct kernfs_root *root;
+ struct rw_semaphore *rwsem = NULL;
int ret;
if (mask & MAY_NOT_BLOCK)
@@ -287,12 +299,15 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns,
kn = inode->i_private;
root = kernfs_root(kn);
- down_read(&root->kernfs_rwsem);
+ kernfs_get(kn);
+ rwsem = iattr_rwsem_ptr(kn);
+ down_read(rwsem);
spin_lock(&inode->i_lock);
kernfs_refresh_inode(kn, inode);
ret = generic_permission(&init_user_ns, inode, mask);
spin_unlock(&inode->i_lock);
- up_read(&root->kernfs_rwsem);
+ up_read(rwsem);
+ kernfs_put(kn);
return ret;
}
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 4bdcf7a71845..f1977d72369a 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -39,17 +39,23 @@ struct kernfs_open_file_mutex {
struct mutex lock;
} ____cacheline_aligned_in_smp;
+struct kernfs_iattr_rwsem {
+ struct rw_semaphore rwsem;
+} ____cacheline_aligned_in_smp;
+
/*
* There's one kernfs_open_file for each open file and one kernfs_open_node
* for each kernfs_node with one or more open files.
*
* kernfs_node->attr.open points to kernfs_open_node. attr.open is
- * protected by open_node_locks[i].
+ * protected by open_node_locks[i].lock.
*
* filp->private_data points to seq_file whose ->private points to
* kernfs_open_file. kernfs_open_files are chained at
- * kernfs_open_node->files, which is protected by open_file_mutex[i].
+ * kernfs_open_node->files, which is protected by open_file_mutex[i].lock.
*
+ * kernfs_node->iattr points to kernfs_node's attributes and is
+ * protected by iattr_rwsem[i].rwsem
* To reduce possible contention in sysfs access, arising due to single
* locks, use an array of locks and use kernfs_node object address as
* hash keys to get the index of these locks.
@@ -58,6 +64,7 @@ struct kernfs_open_file_mutex {
struct kernfs_global_locks {
struct kernfs_open_node_lock open_node_locks[NR_KERNFS_LOCKS];
struct kernfs_open_file_mutex open_file_mutex[NR_KERNFS_LOCKS];
+ struct kernfs_iattr_rwsem iattr_rwsem[NR_KERNFS_LOCKS];
};
static struct kernfs_global_locks kernfs_global_locks;
@@ -76,6 +83,13 @@ static inline struct mutex *open_file_mutex_ptr(struct kernfs_node *kn)
return &kernfs_global_locks.open_file_mutex[index].lock;
}
+static inline struct rw_semaphore *iattr_rwsem_ptr(struct kernfs_node *kn)
+{
+ int index = hash_ptr(kn, NR_KERNFS_LOCK_BITS);
+
+ return &kernfs_global_locks.iattr_rwsem[index].rwsem;
+}
+
struct kernfs_iattrs {
kuid_t ia_uid;
kgid_t ia_gid;
--
2.30.2
On Thu, Jan 13, 2022 at 09:42:57PM +1100, Imran Khan wrote:
> Reduce contention around global locks used in kernfs.
>
> PATCH-1: Make the global kernfs_open_file_mutex and kernfs_open_node_lock
> hashed locks, where the address of a kernfs_node acts as the hash key.
> As a result, kernfs_node objects whose addresses hash to different
> values use different kernfs_open_file_mutex and kernfs_open_node_lock
> instances, rather than all kernfs_node objects sharing the same
> kernfs_open_file_mutex and kernfs_open_node_lock as was the case
> earlier.
>
> PATCH-2: Use a hashed rw_semaphore to access permissions, so that we can
> avoid contention around the global per-fs rw_semaphore, which is seen
> when multiple applications execute inode_permission and walk_component
> in parallel while trying to open sysfs file(s).
>
> ------------------------------------------------------------------
>
> Changes since v2:
> - Remove RFC tag
> - Use hashed locks rather than using per kernfs_node specific lock
> (Suggested by Tejun Heo <[email protected]>)
>
I will look at this closer after the merge window is closed (can't do
anything until then), but do these changes actually solve your
contention problem? You don't say so here, so it's hard to tell.
thanks,
greg k-h
On Thu, Jan 13, 2022 at 09:42:59PM +1100, Imran Khan wrote:
> Right now a global per-file-system rwsem (kernfs_rwsem) synchronizes
> multiple kernfs operations. On a large system with a few hundred CPUs
> and a few hundred applications simultaneously trying to access sysfs,
> this results in multiple sys_open(s) contending on kernfs_rwsem via
> kernfs_iop_permission and kernfs_dop_revalidate.
>
> - 21.42% 21.34% showgids [kernel.kallsyms] [k] up_read
> 21.34% __libc_start_main
> __GI___libc_open
> entry_SYSCALL_64_after_hwframe
> do_syscall_64
> sys_open
> do_sys_open
> do_filp_open
> - path_openat
> - 20.05% link_path_walk
> - 9.76% walk_component
> lookup_fast
> - d_revalidate.part.24
> - 9.75% kernfs_dop_revalidate
> up_read
> - 9.46% inode_permission
> - __inode_permission
> - 9.46% kernfs_iop_permission
> up_read
> - 0.83% kernfs_iop_get_link
> up_read
> - 0.80% lookup_fast
> d_revalidate.part.24
> kernfs_dop_revalidate
> up_read
>
> - 21.31% 21.21% showgids [kernel.kallsyms] [k] down_read
> 21.21% __libc_start_main
> __GI___libc_open
> entry_SYSCALL_64_after_hwframe
> do_syscall_64
> sys_open
> do_sys_open
> do_filp_open
> - path_openat
> - 19.78% link_path_walk
> - 10.62% inode_permission
> - __inode_permission
> - 10.62% kernfs_iop_permission
> down_read
> - 8.45% walk_component
> lookup_fast
> - d_revalidate.part.24
> - 8.45% kernfs_dop_revalidate
> down_read
> - 0.71% kernfs_iop_get_link
> down_read
> - 0.72% lookup_fast
> - d_revalidate.part.24
> - 0.72% kernfs_dop_revalidate
> down_read
> - 0.71% may_open
> inode_permission
> __inode_permission
> kernfs_iop_permission
> down_read
>
> Since permissions are specific to a kernfs_node, we can use a hashed
> lock to access/modify them. Also use kernfs reference counting to
> ensure we are accessing/modifying permissions for an existing
> kernfs_node object.
>
> Using this change brings down the above-mentioned down_read/up_read
> numbers to ~8%, indicating that contention around kernfs_rwsem has
> been reduced to about one third of its earlier value.
Ah, nevermind, you do post the results here, I should have kept reading.
Nice work!
I'll look at these after 5.17-rc1 is out, thanks!
greg k-h
Hello,
On Thu, Jan 13, 2022 at 09:42:58PM +1100, Imran Khan wrote:
> +#ifdef CONFIG_SMP
> +#define NR_KERNFS_LOCK_BITS (2 * (ilog2(NR_CPUS < 32 ? NR_CPUS : 32)))
How did the 32 limit come to be? It'd be nice to have a comment explaining
that this is something which affects scalability, along with a brief
rationale for the current number.
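(For reference on the arithmetic the macro performs: with NR_CPUS >= 32
it evaluates to 2 * ilog2(32) = 10 bits, i.e. NR_KERNFS_LOCKS = 1024
entries in each lock array, while e.g. a 4-CPU build would get
2 * ilog2(4) = 4 bits, i.e. 16 locks per array.)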
> +static inline spinlock_t *open_node_lock_ptr(struct kernfs_node *kn)
> +{
> + int index = hash_ptr(kn, NR_KERNFS_LOCK_BITS);
> +
> + return &kernfs_global_locks.open_node_locks[index].lock;
> +}
> +
> +static inline struct mutex *open_file_mutex_ptr(struct kernfs_node *kn)
> +{
> + int index = hash_ptr(kn, NR_KERNFS_LOCK_BITS);
> +
> + return &kernfs_global_locks.open_file_mutex[index].lock;
> +}
I wonder whether it'd be useful to provide some helpers so that users don't
have to get the pointer for the lock and then lock it in separate steps.
Would it make sense to provide something which locks and returns the pointer
(or token)?
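For illustration, such a helper could look like the sketch below
(hypothetical code, not from the posted patches; the name
kernfs_open_file_mutex_lock is made up here, while open_file_mutex_ptr()
is from PATCH-1):

/*
 * Sketch: look up the hashed mutex for @kn, take it, and hand the
 * pointer back so the caller can unlock without a second lookup.
 */
static inline struct mutex *kernfs_open_file_mutex_lock(struct kernfs_node *kn)
{
	struct mutex *lock = open_file_mutex_ptr(kn);

	mutex_lock(lock);
	return lock;
}

A caller would then pair it as:

	mutex = kernfs_open_file_mutex_lock(kn);
	/* ... */
	mutex_unlock(mutex);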
Thanks.
--
tejun
Hello,
On Thu, Jan 13, 2022 at 09:42:59PM +1100, Imran Khan wrote:
> @@ -748,11 +749,14 @@ int kernfs_add_one(struct kernfs_node *kn)
> goto out_unlock;
>
> /* Update timestamps on the parent */
> + rwsem = iattr_rwsem_ptr(parent);
> + down_write(rwsem);
> ps_iattr = parent->iattr;
> if (ps_iattr) {
> ktime_get_real_ts64(&ps_iattr->ia_ctime);
> ps_iattr->ia_mtime = ps_iattr->ia_ctime;
> }
> + up_write(rwsem);
>
> up_write(&root->kernfs_rwsem);
Hmmm, so the additions / removals are still fs-global lock protected. Would
it be possible to synchronize them through hashed locks too? We can provide
double locking helpers - look up locks for both parent and child and if
different lock in the defined order (parent first most likely) and record
what happened in a token so that it can be undone later.
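A minimal sketch of what such double-locking helpers might look like
(hypothetical code illustrating the suggestion; none of these names are
in the posted patches, iattr_rwsem_ptr() is from PATCH-2, and address
order is used here as one possible defined order):

/* Token recording which hashed rwsems were actually taken. */
struct kernfs_double_lock_token {
	struct rw_semaphore *first;
	struct rw_semaphore *second;	/* NULL if both hash to the same rwsem */
};

static void kernfs_double_down_write(struct kernfs_node *parent,
				     struct kernfs_node *kn,
				     struct kernfs_double_lock_token *token)
{
	struct rw_semaphore *a = iattr_rwsem_ptr(parent);
	struct rw_semaphore *b = iattr_rwsem_ptr(kn);

	if (a == b) {
		/* Parent and child hash to the same lock: take it once. */
		token->first = a;
		token->second = NULL;
		down_write(a);
		return;
	}

	/* Always acquire in a fixed (address) order to avoid ABBA deadlock. */
	if (a > b)
		swap(a, b);
	token->first = a;
	token->second = b;
	down_write(a);
	down_write_nested(b, SINGLE_DEPTH_NESTING);
}

static void kernfs_double_up_write(struct kernfs_double_lock_token *token)
{
	if (token->second)
		up_write(token->second);
	up_write(token->first);
}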
Without going through the code carefully, I don't remember whether there's
something which depends on global locking but I'm sure we can fix them too.
It'd be really nice if we can make all operations similarly scalable cuz
with heavy stacking addition/removals can get pretty hot too.
Thanks.
--
tejun
Hi Tejun,
Thanks for the review.
On 14/1/22 3:42 am, Tejun Heo wrote:
> Hello,
>
> On Thu, Jan 13, 2022 at 09:42:59PM +1100, Imran Khan wrote:
>> @@ -748,11 +749,14 @@ int kernfs_add_one(struct kernfs_node *kn)
>> goto out_unlock;
>>
>> /* Update timestamps on the parent */
>> + rwsem = iattr_rwsem_ptr(parent);
>> + down_write(rwsem);
>> ps_iattr = parent->iattr;
>> if (ps_iattr) {
>> ktime_get_real_ts64(&ps_iattr->ia_ctime);
>> ps_iattr->ia_mtime = ps_iattr->ia_ctime;
>> }
>> + up_write(rwsem);
>>
>> up_write(&root->kernfs_rwsem);
>
> Hmmm, so the additions / removals are still fs-global lock protected. Would
> it be possible to synchronize them through hashed locks too? We can provide
> double locking helpers - look up locks for both parent and child and if
> different lock in the defined order (parent first most likely) and record
> what happened in a token so that it can be undone later.
>
I have made changes in line with your suggestion to synchronize
addition/removal through hashed locks, but so far I am not using tokens.
I am currently testing these changes (so far no issues seen). Before
floating the next version for review, I wanted to understand the
reasoning behind the need for tokens. Could you please elaborate a bit
on what needs to / may have to be recorded in the tokens? Just one
example will do. It would help me consolidate the next version of this
change without overlooking something.
Thanks
-- Imran
Hello,
On Sat, Jan 15, 2022 at 04:08:10AM +1100, Imran Khan wrote:
> I have made changes in line with your suggestion to synchronize
> addition/removal through hashed locks, but so far I am not using tokens.
> I am currently testing these changes (so far no issues seen). Before
> floating the next version for review, I wanted to understand the
> reasoning behind the need for tokens. Could you please elaborate a bit
> on what needs to / may have to be recorded in the tokens? Just one
> example will do. It would help me consolidate the next version of this
> change without overlooking something.
Oh, just something to carry what needs to be done to unlock. If you
didn't need them and returning pointers to the locks was enough, that's
fine too, but if double locking was necessary, e.g. for removals,
encapsulating it in a struct may be neater.
Thanks.
--
tejun
Greetings,
FYI, we noticed the following commit (built with gcc-9):
commit: 86522249760a6c44da99474289f00fb302a8c393 ("[PATCH v3 2/2] kernfs: Reduce contention around global per-fs kernfs_rwsem.")
url: https://github.com/0day-ci/linux/commits/Imran-Khan/kernfs-use-hashed-mutex-and-spinlock-in-place-of-global-ones/20220113-184429
patch link: https://lore.kernel.org/lkml/[email protected]
in testcase: boot
on test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G
caused the changes below (please refer to the attached dmesg/kmsg for the entire log/backtrace):
If you fix the issue, kindly add the following tag
Reported-by: kernel test robot <[email protected]>
[ 3.958358][ T0] WARNING: CPU: 0 PID: 0 at kernel/locking/rwsem.c:1309 up_write (kernel/locking/rwsem.c:1309 kernel/locking/rwsem.c:1567)
[ 3.959313][ T0] Modules linked in:
[ 3.960313][ T0] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.16.0-next-20220111-00002-g86522249760a #1
[ 3.961315][ T0] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
[ 3.962312][ T0] RIP: 0010:up_write (kernel/locking/rwsem.c:1309 kernel/locking/rwsem.c:1567)
[ 3.963311][ T0] Code: 80 3c 02 00 0f 85 8b 01 00 00 ff 34 24 48 8b 55 00 4d 89 f9 4d 89 f0 48 c7 c6 20 79 0c 9d 48 c7 c7 60 79 0c 9d e8 70 89 5b 02 <0f> 0b 59 e9 ba fe ff ff 48 89 df e8 bc c6 63 00 e9 8c fd ff ff 4c
All code
========
0: 80 3c 02 00 cmpb $0x0,(%rdx,%rax,1)
4: 0f 85 8b 01 00 00 jne 0x195
a: ff 34 24 pushq (%rsp)
d: 48 8b 55 00 mov 0x0(%rbp),%rdx
11: 4d 89 f9 mov %r15,%r9
14: 4d 89 f0 mov %r14,%r8
17: 48 c7 c6 20 79 0c 9d mov $0xffffffff9d0c7920,%rsi
1e: 48 c7 c7 60 79 0c 9d mov $0xffffffff9d0c7960,%rdi
25: e8 70 89 5b 02 callq 0x25b899a
2a:* 0f 0b ud2 <-- trapping instruction
2c: 59 pop %rcx
2d: e9 ba fe ff ff jmpq 0xfffffffffffffeec
32: 48 89 df mov %rbx,%rdi
35: e8 bc c6 63 00 callq 0x63c6f6
3a: e9 8c fd ff ff jmpq 0xfffffffffffffdcb
3f: 4c rex.WR
Code starting with the faulting instruction
===========================================
0: 0f 0b ud2
2: 59 pop %rcx
3: e9 ba fe ff ff jmpq 0xfffffffffffffec2
8: 48 89 df mov %rbx,%rdi
b: e8 bc c6 63 00 callq 0x63c6cc
10: e9 8c fd ff ff jmpq 0xfffffffffffffda1
15: 4c rex.WR
[ 3.965308][ T0] RSP: 0000:ffffffff9e007bb0 EFLAGS: 00010286
[ 3.966309][ T0] RAX: 0000000000000000 RBX: ffffffff9efe76e8 RCX: 0000000000000000
[ 3.967311][ T0] RDX: c0000000ffff7fff RSI: ffffffff9e0078d0 RDI: fffffbfff3c00f68
[ 3.968311][ T0] RBP: ffffffffa0a80140 R08: 0000000000000000 R09: fffffbfff3c00f04
[ 3.969308][ T0] R10: ffffffff9e00781f R11: fffffbfff3c00f03 R12: ffffffffa0a80148
[ 3.970310][ T0] R13: ffffffffa0a801a8 R14: ffffffff9e038500 R15: ffffffff9e038500
[ 3.971309][ T0] FS: 0000000000000000(0000) GS:ffff88839d200000(0000) knlGS:0000000000000000
[ 3.972315][ T0] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 3.973313][ T0] CR2: ffff88843ffff000 CR3: 0000000111e2a000 CR4: 00000000000006b0
[ 3.974314][ T0] Call Trace:
[ 3.975314][ T0] <TASK>
[ 3.976327][ T0] kernfs_add_one (fs/kernfs/dir.c:761)
[ 3.977311][ T0] ? kernfs_get (fs/kernfs/dir.c:513)
[ 3.978323][ T0] kernfs_create_dir_ns (fs/kernfs/dir.c:1008)
[ 3.979317][ T0] sysfs_create_dir_ns (fs/sysfs/dir.c:61)
[ 3.980314][ T0] ? kernfs_activate (fs/kernfs/dir.c:1316)
[ 3.981314][ T0] ? sysfs_create_mount_point (fs/sysfs/dir.c:41)
[ 3.982312][ T0] ? rcu_read_lock_sched_held (kernel/rcu/update.c:123)
[ 3.983313][ T0] ? rcu_read_lock_bh_held (kernel/rcu/update.c:120)
[ 3.984318][ T0] ? rcu_read_lock_held_common (kernel/rcu/update.c:104)
[ 3.985336][ T0] ? rcu_read_lock_sched_held (kernel/rcu/update.c:123)
[ 3.986323][ T0] kobject_add_internal (lib/kobject.c:89 lib/kobject.c:255)
[ 3.987340][ T0] kobject_add (lib/kobject.c:390 lib/kobject.c:442)
[ 3.988320][ T0] ? kobject_add_internal (lib/kobject.c:428)
[ 3.989322][ T0] ? kasan_unpoison (mm/kasan/shadow.c:108 mm/kasan/shadow.c:142)
[ 3.990310][ T0] ? __kasan_slab_alloc (mm/kasan/common.c:431 mm/kasan/common.c:469)
[ 3.991327][ T0] kobject_create_and_add (lib/kobject.c:815)
[ 3.992314][ T0] mnt_init (fs/namespace.c:4372)
[ 3.993319][ T0] ? init_fs_namespace_sysctls (fs/namespace.c:4346)
[ 3.994309][ T0] ? _raw_spin_unlock_irqrestore (arch/x86/include/asm/irqflags.h:45 arch/x86/include/asm/irqflags.h:80 arch/x86/include/asm/irqflags.h:138 include/linux/spinlock_api_smp.h:151 kernel/locking/spinlock.c:194)
[ 3.995308][ T0] ? trace_hardirqs_on (kernel/trace/trace_preemptirq.c:50 (discriminator 22))
[ 3.996312][ T0] ? _raw_spin_unlock_irqrestore (arch/x86/include/asm/irqflags.h:45 arch/x86/include/asm/irqflags.h:80 arch/x86/include/asm/irqflags.h:138 include/linux/spinlock_api_smp.h:151 kernel/locking/spinlock.c:194)
[ 3.997319][ T0] vfs_caches_init (fs/dcache.c:3295)
[ 3.999313][ T0] start_kernel (init/main.c:1123)
[ 4.000320][ T0] secondary_startup_64_no_verify (arch/x86/kernel/head_64.S:300)
[ 4.001349][ T0] </TASK>
[ 4.002309][ T0] irq event stamp: 2311
[ 4.003308][ T0] hardirqs last enabled at (2321): __up_console_sem (arch/x86/include/asm/irqflags.h:45 (discriminator 1) arch/x86/include/asm/irqflags.h:80 (discriminator 1) arch/x86/include/asm/irqflags.h:138 (discriminator 1) kernel/printk/printk.c:256 (discriminator 1))
[ 4.004313][ T0] hardirqs last disabled at (2330): __up_console_sem (kernel/printk/printk.c:254 (discriminator 1))
[ 4.005309][ T0] softirqs last enabled at (0): 0x0
[ 4.006314][ T0] softirqs last disabled at (0): 0x0
[ 4.007312][ T0] ---[ end trace 0000000000000000 ]---
Poking KASLR using RDTSC...
[ 4.013584][ T0] Last level iTLB entries: 4KB 0, 2MB 0, 4MB 0
[ 4.014305][ T0] Last level dTLB entries: 4KB 0, 2MB 0, 4MB 0, 1GB 0
[ 4.015329][ T0] Spectre V1 : Mitigation: usercopy/swapgs barriers and __user pointer sanitization
[ 4.016318][ T0] Spectre V2 : Mitigation: Full generic retpoline
[ 4.017305][ T0] Spectre V2 : Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch
[ 4.018307][ T0] Speculative Store Bypass: Vulnerable
[ 4.019337][ T0] MDS: Vulnerable: Clear CPU buffers attempted, no microcode
[ 4.028751][ T0] Freeing SMP alternatives memory: 40K
[ 4.030698][ T1] smpboot: CPU0: Intel Xeon E312xx (Sandy Bridge) (family: 0x6, model: 0x2a, stepping: 0x1)
[ 4.032896][ T1] cblist_init_generic: Setting adjustable number of callback queues.
[ 4.033312][ T1] cblist_init_generic: Setting shift to 1 and lim to 1.
[ 4.034658][ T1] cblist_init_generic: Setting shift to 1 and lim to 1.
[ 4.035537][ T1] Running RCU-tasks wait API self tests
[ 4.036793][ T1] Performance Events: unsupported p6 CPU model 42 no PMU driver, software events only.
[ 4.037688][ T1] rcu: Hierarchical SRCU implementation.
[ 4.043395][ T10] Callback from call_rcu_tasks_trace() invoked.
[ 4.045071][ T1] NMI watchdog: Perf NMI watchdog permanently disabled
[ 4.046006][ T1] smp: Bringing up secondary CPUs ...
[ 4.047886][ T1] x86: Booting SMP configuration:
[ 4.048326][ T1] .... node #0, CPUs: #1
[ 0.245211][ T0] masked ExtINT on CPU#1
[ 0.245211][ T0] smpboot: CPU 1 Converting physical 0 to logical die 1
[ 4.078488][ T1] smp: Brought up 1 node, 2 CPUs
[ 4.079334][ T1] smpboot: Max logical packages: 2
[ 4.080316][ T1] smpboot: Total of 2 processors activated (9043.99 BogoMIPS)
[ 4.147690][ T9] Callback from call_rcu_tasks_rude() invoked.
[ 4.380370][ T21] node 0 deferred pages initialised in 297ms
[ 4.513597][ T1] allocated 201326592 bytes of page_ext
[ 4.514660][ T1] Node 0, zone DMA: page owner found early allocated 0 pages
[ 4.521659][ T1] Node 0, zone DMA32: page owner found early allocated 0 pages
[ 4.562068][ T1] Node 0, zone Normal: page owner found early allocated 50463 pages
[ 4.563464][ T1] devtmpfs: initialized
[ 4.565474][ T1] x86/mm: Memory block size: 128MB
[ 4.621220][ T1] clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 1911260446275000 ns
[ 4.621383][ T1] futex hash table entries: 512 (order: 4, 65536 bytes, linear)
[ 4.623068][ T1] pinctrl core: initialized pinctrl subsystem
[ 4.625838][ T1] NET: Registered PF_NETLINK/PF_ROUTE protocol family
[ 4.629511][ T1] audit: initializing netlink subsys (disabled)
[ 4.633582][ T27] audit: type=2000 audit(1643043463.357:1): state=initialized audit_enabled=0 res=1
[ 4.633893][ T1] thermal_sys: Registered thermal governor 'fair_share'
[ 4.634312][ T1] thermal_sys: Registered thermal governor 'bang_bang'
[ 4.635316][ T1] thermal_sys: Registered thermal governor 'step_wise'
[ 4.636315][ T1] thermal_sys: Registered thermal governor 'user_space'
[ 4.637493][ T1] cpuidle: using governor menu
[ 4.640928][ T1] acpiphp: ACPI Hot Plug PCI Controller Driver version: 0.5
[ 4.642786][ T1] PCI: Using configuration type 1 for base access
[ 4.717204][ T1] kprobes: kprobe jump-optimization is enabled. All kprobes are optimized if possible.
[ 4.719606][ T1] HugeTLB registered 2.00 MiB page size, pre-allocated 0 pages
[ 4.723437][ T1] cryptd: max_cpu_qlen set to 1000
[ 4.728040][ T1] ACPI: Added _OSI(Module Device)
[ 4.728315][ T1] ACPI: Added _OSI(Processor Device)
[ 4.729315][ T1] ACPI: Added _OSI(3.0 _SCP Extensions)
[ 4.730319][ T1] ACPI: Added _OSI(Processor Aggregator Device)
[ 4.731351][ T1] ACPI: Added _OSI(Linux-Dell-Video)
[ 4.732334][ T1] ACPI: Added _OSI(Linux-Lenovo-NV-HDMI-Audio)
[ 4.733338][ T1] ACPI: Added _OSI(Linux-HPI-Hybrid-Graphics)
[ 4.798850][ T1] ACPI: 1 ACPI AML tables successfully acquired and loaded
[ 4.814463][ T1] ACPI: Interpreter enabled
[ 4.815609][ T1] ACPI: PM: (supports S0 S3 S4 S5)
[ 4.816319][ T1] ACPI: Using IOAPIC for interrupt routing
[ 4.817654][ T1] PCI: Using host bridge windows from ACPI; if necessary, use "pci=nocrs" and report a bug
[ 4.822488][ T1] ACPI: Enabled 2 GPEs in block 00 to 0F
[ 4.947161][ T1] ACPI: PCI Root Bridge [PCI0] (domain 0000 [bus 00-ff])
[ 4.947405][ T1] acpi PNP0A03:00: _OSC: OS supports [ASPM ClockPM Segments MSI HPX-Type3]
[ 4.948319][ T1] acpi PNP0A03:00: _OSC: not requesting OS control; OS requires [ExtendedConfig ASPM ClockPM MSI]
[ 4.949841][ T1] acpi PNP0A03:00: fail to add MMCONFIG information, can't access extended PCI configuration space under this bridge.
[ 4.962351][ T1] acpiphp: Slot [3] registered
[ 4.963771][ T1] acpiphp: Slot [4] registered
[ 4.964799][ T1] acpiphp: Slot [5] registered
[ 4.965783][ T1] acpiphp: Slot [6] registered
[ 4.966822][ T1] acpiphp: Slot [7] registered
[ 4.967783][ T1] acpiphp: Slot [8] registered
[ 4.968810][ T1] acpiphp: Slot [9] registered
[ 4.969785][ T1] acpiphp: Slot [10] registered
[ 4.970792][ T1] acpiphp: Slot [11] registered
To reproduce:
# build kernel
cd linux
cp config-5.16.0-next-20220111-00002-g86522249760a .config
make HOSTCC=gcc-9 CC=gcc-9 ARCH=x86_64 olddefconfig prepare modules_prepare bzImage modules
make HOSTCC=gcc-9 CC=gcc-9 ARCH=x86_64 INSTALL_MOD_PATH=<mod-install-dir> modules_install
cd <mod-install-dir>
find lib/ | cpio -o -H newc --quiet | gzip > modules.cgz
git clone https://github.com/intel/lkp-tests.git
cd lkp-tests
bin/lkp qemu -k <bzImage> -m modules.cgz job-script # job-script is attached in this email
# if come across any failure that blocks the test,
# please remove ~/.lkp and /lkp dir to run from a clean state.
---
0DAY/LKP+ Test Infrastructure Open Source Technology Center
https://lists.01.org/hyperkitty/list/[email protected] Intel Corporation
Thanks,
Oliver Sang
Hi Tejun,
On 14/1/22 3:42 am, Tejun Heo wrote:
> Hello,
>
> On Thu, Jan 13, 2022 at 09:42:59PM +1100, Imran Khan wrote:
>> @@ -748,11 +749,14 @@ int kernfs_add_one(struct kernfs_node *kn)
>> goto out_unlock;
>>
>> /* Update timestamps on the parent */
>> + rwsem = iattr_rwsem_ptr(parent);
>> + down_write(rwsem);
>> ps_iattr = parent->iattr;
>> if (ps_iattr) {
>> ktime_get_real_ts64(&ps_iattr->ia_ctime);
>> ps_iattr->ia_mtime = ps_iattr->ia_ctime;
>> }
>> + up_write(rwsem);
>>
>> up_write(&root->kernfs_rwsem);
>
> Hmmm, so the additions / removals are still fs-global lock protected. Would
> it be possible to synchronize them through hashed locks too? We can provide
> double locking helpers - look up locks for both parent and child and if
> different lock in the defined order (parent first most likely) and record
> what happened in a token so that it can be undone later.
>
> Without going through the code carefully, I don't remember whether there's
> something which depends on global locking but I'm sure we can fix them too.
> It'd be really nice if we can make all operations similarly scalable cuz
> with heavy stacking addition/removals can get pretty hot too.
>
I have replaced the global rwsem with a hashed version in v4 of the
patch set at [1].
I have tried to avoid nested locking because of the following deadlock
scenario:
Say node N11 has parent node N1 and node N22 has parent node N2. Also,
N11 and N2 hash to the same lock, and N1 and N22 hash to the same lock.
In this case, if we have 2 parallel contexts such that one is locking
N11 and its parent and the other is locking N22 and its parent, and
execution happens as below:
Thread 1                        Thread 2
--------                        --------
Take lock of N11
                                Take lock of N22
Wait for lock of N1
                                Wait for lock of N2
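(Note: with the fixed-order double-locking sketched earlier in the
thread, this interleaving cannot occur. Both threads here need the same
pair of locks, so with a defined acquisition order both would contend on
the same lock first, and whichever thread loses that race blocks before
it holds anything; no circular wait can form.)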
The testing that I have done with v4 includes:
1. Multiple boots with systemd and udevd in place to create/remove
   sysfs and cgroupfs entries.
2. CPU hotplug and reading topology attributes from sysfs in parallel.
3. sysfs LTP tests.
4. The above 3 tests with lockdep- and KASAN-enabled kernels.
I will wait for your feedback about the approach taken in v4 of the
patch set [1].
[1]:
https://lore.kernel.org/lkml/[email protected]/
Thanks
-- Imran