Right now a global mutex (kernfs_open_file_mutex) protects the list of
kernfs_open_file instances corresponding to a sysfs attribute, so even
if different tasks are opening or closing different sysfs files they
can contend on the osq_lock of this mutex. The contention is more
apparent on large-scale systems with a few hundred CPUs, where most of
the CPUs have running tasks that are opening, accessing or closing
sysfs files at any point in time. Since each list of kernfs_open_file
instances belongs to a kernfs_open_node instance, which in turn
corresponds to exactly one kernfs_node, move the global
kernfs_open_file_mutex into kernfs_node so that it does not block
access to the kernfs_open_file lists of other kernfs_nodes.
Also, since kernfs_node->attr.open points to the kernfs_open_node
instance corresponding to the kernfs_node, we can use a
kernfs_node-specific spinlock in place of the current global spinlock,
i.e. kernfs_open_node_lock. So make this spinlock local to the
kernfs_node instance as well.
Signed-off-by: Imran Khan <[email protected]>
---
I have kept this patch as an RFC, as I am not sure whether I have
overlooked any scenario(s) where these global locks are needed.
fs/kernfs/dir.c | 2 ++
fs/kernfs/file.c | 48 +++++++++++++++---------------------------
include/linux/kernfs.h | 2 ++
3 files changed, 21 insertions(+), 31 deletions(-)
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index e6d9772ddb4c..cd68ac30f71b 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -603,6 +603,8 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
atomic_set(&kn->count, 1);
atomic_set(&kn->active, KN_DEACTIVATED_BIAS);
RB_CLEAR_NODE(&kn->rb);
+ spin_lock_init(&kn->kernfs_open_node_lock);
+ mutex_init(&kn->kernfs_open_file_mutex);
kn->name = name;
kn->mode = mode;
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 9414a7a60a9f..4114745d80d5 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -18,20 +18,6 @@
#include "kernfs-internal.h"
-/*
- * There's one kernfs_open_file for each open file and one kernfs_open_node
- * for each kernfs_node with one or more open files.
- *
- * kernfs_node->attr.open points to kernfs_open_node. attr.open is
- * protected by kernfs_open_node_lock.
- *
- * filp->private_data points to seq_file whose ->private points to
- * kernfs_open_file. kernfs_open_files are chained at
- * kernfs_open_node->files, which is protected by kernfs_open_file_mutex.
- */
-static DEFINE_SPINLOCK(kernfs_open_node_lock);
-static DEFINE_MUTEX(kernfs_open_file_mutex);
-
struct kernfs_open_node {
atomic_t refcnt;
atomic_t event;
@@ -526,8 +512,8 @@ static int kernfs_get_open_node(struct kernfs_node *kn,
struct kernfs_open_node *on, *new_on = NULL;
retry:
- mutex_lock(&kernfs_open_file_mutex);
- spin_lock_irq(&kernfs_open_node_lock);
+ mutex_lock(&kn->kernfs_open_file_mutex);
+ spin_lock_irq(&kn->kernfs_open_node_lock);
if (!kn->attr.open && new_on) {
kn->attr.open = new_on;
@@ -540,8 +526,8 @@ static int kernfs_get_open_node(struct kernfs_node *kn,
list_add_tail(&of->list, &on->files);
}
- spin_unlock_irq(&kernfs_open_node_lock);
- mutex_unlock(&kernfs_open_file_mutex);
+ spin_unlock_irq(&kn->kernfs_open_node_lock);
+ mutex_unlock(&kn->kernfs_open_file_mutex);
if (on) {
kfree(new_on);
@@ -577,8 +563,8 @@ static void kernfs_put_open_node(struct kernfs_node *kn,
struct kernfs_open_node *on = kn->attr.open;
unsigned long flags;
- mutex_lock(&kernfs_open_file_mutex);
- spin_lock_irqsave(&kernfs_open_node_lock, flags);
+ mutex_lock(&kn->kernfs_open_file_mutex);
+ spin_lock_irqsave(&kn->kernfs_open_node_lock, flags);
if (of)
list_del(&of->list);
@@ -588,8 +574,8 @@ static void kernfs_put_open_node(struct kernfs_node *kn,
else
on = NULL;
- spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
- mutex_unlock(&kernfs_open_file_mutex);
+ spin_unlock_irqrestore(&kn->kernfs_open_node_lock, flags);
+ mutex_unlock(&kn->kernfs_open_file_mutex);
kfree(on);
}
@@ -733,7 +719,7 @@ static void kernfs_release_file(struct kernfs_node *kn,
* here because drain path may be called from places which can
* cause circular dependency.
*/
- lockdep_assert_held(&kernfs_open_file_mutex);
+ lockdep_assert_held(&kn->kernfs_open_file_mutex);
if (!of->released) {
/*
@@ -752,9 +738,9 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp)
struct kernfs_open_file *of = kernfs_of(filp);
if (kn->flags & KERNFS_HAS_RELEASE) {
- mutex_lock(&kernfs_open_file_mutex);
+ mutex_lock(&kn->kernfs_open_file_mutex);
kernfs_release_file(kn, of);
- mutex_unlock(&kernfs_open_file_mutex);
+ mutex_unlock(&kn->kernfs_open_file_mutex);
}
kernfs_put_open_node(kn, of);
@@ -773,15 +759,15 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
if (!(kn->flags & (KERNFS_HAS_MMAP | KERNFS_HAS_RELEASE)))
return;
- spin_lock_irq(&kernfs_open_node_lock);
+ spin_lock_irq(&kn->kernfs_open_node_lock);
on = kn->attr.open;
if (on)
atomic_inc(&on->refcnt);
- spin_unlock_irq(&kernfs_open_node_lock);
+ spin_unlock_irq(&kn->kernfs_open_node_lock);
if (!on)
return;
- mutex_lock(&kernfs_open_file_mutex);
+ mutex_lock(&kn->kernfs_open_file_mutex);
list_for_each_entry(of, &on->files, list) {
struct inode *inode = file_inode(of->file);
@@ -793,7 +779,7 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
kernfs_release_file(kn, of);
}
- mutex_unlock(&kernfs_open_file_mutex);
+ mutex_unlock(&kn->kernfs_open_file_mutex);
kernfs_put_open_node(kn, NULL);
}
@@ -922,13 +908,13 @@ void kernfs_notify(struct kernfs_node *kn)
return;
/* kick poll immediately */
- spin_lock_irqsave(&kernfs_open_node_lock, flags);
+ spin_lock_irqsave(&kn->kernfs_open_node_lock, flags);
on = kn->attr.open;
if (on) {
atomic_inc(&on->event);
wake_up_interruptible(&on->poll);
}
- spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
+ spin_unlock_irqrestore(&kn->kernfs_open_node_lock, flags);
/* schedule work to kick fsnotify */
spin_lock_irqsave(&kernfs_notify_lock, flags);
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 9f650986a81b..22cd01477129 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -162,6 +162,8 @@ struct kernfs_node {
unsigned short flags;
umode_t mode;
struct kernfs_iattrs *iattr;
+ spinlock_t kernfs_open_node_lock;
+ struct mutex kernfs_open_file_mutex;
};
/*
base-commit: 0bafb8f3ebc84525d0ae0fcea22d12151b99312f
--
2.30.2
Hi everyone,
On 16/12/21 2:06 am, Imran Khan wrote:
> [...]
Could someone please provide some feedback about this change? Also, if
there are any issues with this change, can I make these locks per-fs
instead, as has been done in [1]?
[1] https://lore.kernel.org/lkml/[email protected]/
Thanks,
Imran
On Fri, Dec 24, 2021 at 09:52:51AM +1100, Imran Khan wrote:
> Hi everyone,
>
> [...]
> Could someone please provide some feedback about this change? Also, if
> there are any issues with this change, can I make these locks per-fs
> instead, as has been done in [1]?
>
> [1] https://lore.kernel.org/lkml/[email protected]/
Please run some tests to verify that sysfs still works properly and
that this actually takes less time overall. In the conversations the
last time kernfs was changed, there was a lot of discussion about
proving that it actually mattered.
thanks,
greg k-h
Hi Greg,
On 24/12/21 7:40 pm, Greg KH wrote:
> [...]
>
> Please test this using some tests to verify that sysfs is still working
> properly and that this actually takes less time overall. In the
> conversations about the last time kernfs was changed, there were lots of
> discussions about proving that it actually mattered.
>
Thanks for getting back on this.

Yes, sysfs and cgroup are working with this change. I verified the change:

1. Using the LTP sysfs tests.
2. Doing CPU hotplugging and reading CPU topology info from sysfs in
parallel; I was getting either correct topology information or a "No
such file or directory" error (a minimal sketch of the reader side
follows below).
If you could suggest some further tests, I can run those as well.
As far as the overall time taken is concerned, I did not see any
improvement with my test application (I am running 200 instances of it
on a system with 384 CPUs). The main loop of this application is as
follows (one can use any other sysfs hierarchy as well):
/*
 * Needs <stdio.h>, <string.h>, <fcntl.h>, <unistd.h> and <errno.h>;
 * dev_name holds the infiniband device name and is set elsewhere.
 */
for (int loop = 0; loop < 100; loop++)
{
	for (int port_num = 1; port_num < 2; port_num++)
	{
		for (int gid_index = 0; gid_index < 254; gid_index++)
		{
			char ret_buf[64];
			char gid_file_path[1024];
			int ret_fd;
			ssize_t ret_rd;

			memset(ret_buf, 0, sizeof(ret_buf));
			memset(gid_file_path, 0, sizeof(gid_file_path));
			snprintf(gid_file_path, sizeof(gid_file_path),
				 "/sys/class/infiniband/%s/ports/%d/gids/%d",
				 dev_name, port_num, gid_index);

			ret_fd = open(gid_file_path, O_RDONLY | O_CLOEXEC);
			if (ret_fd < 0)
			{
				printf("Failed to open %s\n", gid_file_path);
				continue;
			}

			/* Read the GID */
			ret_rd = read(ret_fd, ret_buf, 40);
			if (ret_rd == -1)
			{
				int saved_errno = errno; /* close() may clobber errno */

				close(ret_fd); /* don't leak the fd on the error path */
				printf("Failed to read from file %s, errno: %d\n",
				       gid_file_path, saved_errno);
				continue;
			}
			close(ret_fd);
		}
	}
}
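(The showgids samples in the perf output below come from 200 concurrent
instances of this test application; the exact invocation is not
reproduced here.)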
The patch just moved the contention from the osq_lock (corresponding
to kernfs_open_file_mutex) to the kernfs_rwsem read-write semaphore. I
have tried to address the kernfs_rwsem contention in v2 of this patch
set at [1]. v2 of the patch set reduces the test execution time to
about half (from ~36 secs to ~18 secs) and cuts the contention around
kernfs_rwsem to roughly a third of the earlier level; the remaining
top entries in the perf profile are:
8.61% 8.55% showgids [kernel.kallsyms] [k] down_read
7.80% 7.75% showgids [kernel.kallsyms] [k] up_read
I will await feedback regarding v2 of this patchset.
[1] https://lore.kernel.org/lkml/[email protected]/
> thanks,
>
> greg k-h