This patchset contains a subset of the patches (after addressing review
comments) discussed at [1]. Since [1] replaces multiple global locks, and
since each of these locks can be removed independently, it was decided that
we should make these changes in parts, i.e. first get one set of
optimizations integrated and then work further on top of those.

The patches in this change set introduce the following changes:
PATCH-1: Remove reference counting for kernfs_open_node.
PATCH-2: Make kernfs_open_node->attr.open RCU protected.
PATCH-3: Change kernfs_notify_list to llist.
PATCH-4: Introduce interface to access kernfs_open_file_mutex.
PATCH-5: Replace global kernfs_open_file_mutex with hashed mutexes.
-------------------------------------------------------------------------
Imran Khan (5):
kernfs: Remove reference counting for kernfs_open_node.
kernfs: make ->attr.open RCU protected.
kernfs: Change kernfs_notify_list to llist.
kernfs: Introduce interface to access global kernfs_open_file_mutex.
kernfs: Replace global kernfs_open_file_mutex with hashed mutexes.
fs/kernfs/file.c | 247 ++++++++++++++++++++----------------
fs/kernfs/kernfs-internal.h | 4 +
fs/kernfs/mount.c | 19 +++
include/linux/kernfs.h | 61 ++++++++-
4 files changed, 223 insertions(+), 108 deletions(-)
base-commit: 088fb7eff3496e0f61fdf68bda89b81a4d0a4434
[1] https://lore.kernel.org/lkml/[email protected]/
--
2.30.2
In the current kernfs design a single mutex, kernfs_open_file_mutex, protects
the list of kernfs_open_file instances corresponding to a sysfs attribute. So
even if different tasks are opening or closing different sysfs files, they can
contend on the osq_lock of this mutex. The contention is more apparent in
large-scale systems with a few hundred CPUs, where most of the CPUs have
running tasks that are opening, accessing or closing sysfs files at any point
of time.

Using hashed mutexes in place of a single global mutex can significantly
reduce contention around the global mutex and hence provide better
scalability. Moreover, as these hashed mutexes are not part of kernfs_node
objects, we will not see any significant change in the memory utilization of
kernfs-based file systems like sysfs, cgroupfs etc.

Modify the interface introduced in the previous patch to make use of hashed
mutexes. Use the kernfs_node address as the hashing key.
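To illustrate the intended calling pattern (a sketch only, not part of this
patch; kernfs_open_file_mutex_ptr() and kernfs_open_file_mutex_lock() are the
helpers from the previous patch, and the caller body shown is assumed):

	struct mutex *mutex;

	mutex = kernfs_open_file_mutex_lock(kn);	/* lock the bucket @kn hashes to */
	/* ... add to / remove from / walk kn's open-file list ... */
	mutex_unlock(mutex);

Callers never name a specific mutex; they only ever hold the bucket that @kn
hashes to, so operations on unrelated nodes usually contend on different
locks. As a worked example of the sizing macro added below in
include/linux/kernfs.h: with CONFIG_SMP and NR_CPUS=8, ilog2(8) = 3, so
NR_KERNFS_LOCK_BITS = 6 and NR_KERNFS_LOCKS = 64, matching the 8-15 row of
the table there.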
Signed-off-by: Imran Khan <[email protected]>
---
fs/kernfs/file.c | 17 ++---------
fs/kernfs/kernfs-internal.h | 4 +++
fs/kernfs/mount.c | 19 +++++++++++++
include/linux/kernfs.h | 57 +++++++++++++++++++++++++++++++++++++
4 files changed, 83 insertions(+), 14 deletions(-)
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 7a60074ec0a0..946a4a8d7e32 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -18,19 +18,6 @@
#include "kernfs-internal.h"
-/*
- * There's one kernfs_open_file for each open file and one kernfs_open_node
- * for each kernfs_node with one or more open files.
- *
- * kernfs_node->attr.open points to kernfs_open_node. attr.open is
- * RCU protected.
- *
- * filp->private_data points to seq_file whose ->private points to
- * kernfs_open_file. kernfs_open_files are chained at
- * kernfs_open_node->files, which is protected by kernfs_open_file_mutex.
- */
-static DEFINE_MUTEX(kernfs_open_file_mutex);
-
struct kernfs_open_node {
struct rcu_head rcu_head;
atomic_t event;
@@ -51,7 +38,9 @@ static LLIST_HEAD(kernfs_notify_list);
static inline struct mutex *kernfs_open_file_mutex_ptr(struct kernfs_node *kn)
{
- return &kernfs_open_file_mutex;
+ int idx = hash_ptr(kn, NR_KERNFS_LOCK_BITS);
+
+ return &kernfs_locks->open_file_mutex[idx];
}
static inline struct mutex *kernfs_open_file_mutex_lock(struct kernfs_node *kn)
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index eeaa779b929c..3ae214d02d44 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -164,4 +164,8 @@ void kernfs_drain_open_files(struct kernfs_node *kn);
*/
extern const struct inode_operations kernfs_symlink_iops;
+/*
+ * kernfs locks
+ */
+extern struct kernfs_global_locks *kernfs_locks;
#endif /* __KERNFS_INTERNAL_H */
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index cfa79715fc1a..d0859f72d2d6 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -20,6 +20,7 @@
#include "kernfs-internal.h"
struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
+struct kernfs_global_locks *kernfs_locks;
static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
{
@@ -387,6 +388,22 @@ void kernfs_kill_sb(struct super_block *sb)
kfree(info);
}
+static void __init kernfs_mutex_init(void)
+{
+ int count;
+
+ for (count = 0; count < NR_KERNFS_LOCKS; count++)
+ mutex_init(&kernfs_locks->open_file_mutex[count]);
+}
+
+static void __init kernfs_lock_init(void)
+{
+ kernfs_locks = kmalloc(sizeof(struct kernfs_global_locks), GFP_KERNEL);
+ WARN_ON(!kernfs_locks);
+
+ kernfs_mutex_init();
+}
+
void __init kernfs_init(void)
{
kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
@@ -397,4 +414,6 @@ void __init kernfs_init(void)
kernfs_iattrs_cache = kmem_cache_create("kernfs_iattrs_cache",
sizeof(struct kernfs_iattrs),
0, SLAB_PANIC, NULL);
+
+ kernfs_lock_init();
}
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 2dd9c8df0f4f..13e703f615f7 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -18,6 +18,7 @@
#include <linux/uidgid.h>
#include <linux/wait.h>
#include <linux/rwsem.h>
+#include <linux/cache.h>
struct file;
struct dentry;
@@ -34,6 +35,62 @@ struct kernfs_fs_context;
struct kernfs_open_node;
struct kernfs_iattrs;
+/*
+ * NR_KERNFS_LOCK_BITS determines size (NR_KERNFS_LOCKS) of hash
+ * table of locks.
+ * Having a small hash table would impact scalability, since
+ * more and more kernfs_node objects will end up using same lock
+ * and having a very large hash table would waste memory.
+ *
+ * At the moment size of hash table of locks is being set based on
+ * the number of CPUs as follows:
+ *
+ * NR_CPU NR_KERNFS_LOCK_BITS NR_KERNFS_LOCKS
+ * 1 1 2
+ * 2-3 2 4
+ * 4-7 4 16
+ * 8-15 6 64
+ * 16-31 8 256
+ * 32 and more 10 1024
+ *
+ * The above relation between NR_CPU and number of locks is based
+ * on some internal experimentation which involved booting qemu
+ * with different values of smp, performing some sysfs operations
+ * on all CPUs and observing how increase in number of locks impacts
+ * completion time of these sysfs operations on each CPU.
+ */
+#ifdef CONFIG_SMP
+#define NR_KERNFS_LOCK_BITS (2 * (ilog2(NR_CPUS < 32 ? NR_CPUS : 32)))
+#else
+#define NR_KERNFS_LOCK_BITS 1
+#endif
+
+#define NR_KERNFS_LOCKS (1 << NR_KERNFS_LOCK_BITS)
+
+/*
+ * There's one kernfs_open_file for each open file and one kernfs_open_node
+ * for each kernfs_node with one or more open files.
+ *
+ * filp->private_data points to seq_file whose ->private points to
+ * kernfs_open_file.
+ *
+ * kernfs_open_files are chained at kernfs_open_node->files, which is
+ * protected by kernfs_global_locks.open_file_mutex[i].
+ *
+ * To reduce possible contention in sysfs access, arising due to single
+ * locks, use an array of locks (e.g. open_file_mutex) and use kernfs_node
+ * object address as hash keys to get the index of these locks.
+ *
+ * Hashed mutexes are safe to use here because operations using these don't
+ * rely on global exclusion.
+ *
+ * In future we intend to replace other global locks with hashed ones as well.
+ * kernfs_global_locks acts as a holder for all such hash tables.
+ */
+struct kernfs_global_locks {
+ struct mutex open_file_mutex[NR_KERNFS_LOCKS];
+};
+
enum kernfs_node_type {
KERNFS_DIR = 0x0001,
KERNFS_FILE = 0x0002,
--
2.30.2
After removal of kernfs_open_node->refcnt in the previous patch,
kernfs_open_node_lock can be removed as well by making ->attr.open RCU
protected. kernfs_put_open_node can delegate freeing of ->attr.open to RCU,
and other readers of ->attr.open can access it under rcu_read_(un)lock.
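In other words, the lifetime management reduces to the classic RCU
publish/retract pattern; a minimal sketch (the actual changes are in the diff
below):

	/* updater, serialized by kernfs_open_file_mutex */
	rcu_assign_pointer(kn->attr.open, new_on);	/* publish */
	...
	rcu_assign_pointer(kn->attr.open, NULL);	/* retract */
	kfree_rcu(on, rcu_head);			/* free after a grace period */

	/* reader */
	rcu_read_lock();
	on = rcu_dereference(kn->attr.open);
	if (on)
		wake_up_interruptible(&on->poll);	/* @on stays valid until unlock */
	rcu_read_unlock();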
Suggested-by: Al Viro <[email protected]>
Signed-off-by: Imran Khan <[email protected]>
---
fs/kernfs/file.c | 120 +++++++++++++++++++++++++----------------
include/linux/kernfs.h | 2 +-
2 files changed, 75 insertions(+), 47 deletions(-)
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index e3abfa843879..36bff71ab263 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -23,16 +23,16 @@
* for each kernfs_node with one or more open files.
*
* kernfs_node->attr.open points to kernfs_open_node. attr.open is
- * protected by kernfs_open_node_lock.
+ * RCU protected.
*
* filp->private_data points to seq_file whose ->private points to
* kernfs_open_file. kernfs_open_files are chained at
* kernfs_open_node->files, which is protected by kernfs_open_file_mutex.
*/
-static DEFINE_SPINLOCK(kernfs_open_node_lock);
static DEFINE_MUTEX(kernfs_open_file_mutex);
struct kernfs_open_node {
+ struct rcu_head rcu_head;
atomic_t event;
wait_queue_head_t poll;
struct list_head files; /* goes through kernfs_open_file.list */
@@ -51,6 +51,38 @@ struct kernfs_open_node {
static DEFINE_SPINLOCK(kernfs_notify_lock);
static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL;
+/*
+ * Raw deref RCU protected kn->attr.open.
+ * The caller guarantees that @on will not vanish in the middle of this
+ * function and hence we can deref ->attr.open raw.
+ */
+static struct kernfs_open_node *kernfs_deref_on_raw(struct kernfs_node *kn)
+{
+ return rcu_dereference_raw(kn->attr.open);
+}
+
+/*
+ * Deref ->attr.open corresponding to @kn while holding kernfs_open_file_mutex.
+ * ->attr.open is modified under kernfs_open_file_mutex. So it can be safely
+ * accessed outside RCU read-side critical section, while holding the mutex.
+ */
+static struct kernfs_open_node *kernfs_deref_on_protected(struct kernfs_node *kn)
+{
+ return rcu_dereference_protected(kn->attr.open,
+ lockdep_is_held(&kernfs_open_file_mutex));
+}
+
+/*
+ * Check ->attr.open corresponding to @kn while holding kernfs_open_file_mutex.
+ * ->attr.open is modified under kernfs_open_file_mutex. So it can be safely
+ * accessed outside RCU read-side critical section, while holding the mutex.
+ */
+static struct kernfs_open_node *kernfs_check_on_protected(struct kernfs_node *kn)
+{
+ return rcu_dereference_check(kn->attr.open,
+ lockdep_is_held(&kernfs_open_file_mutex));
+}
+
static struct kernfs_open_file *kernfs_of(struct file *file)
{
return ((struct seq_file *)file->private_data)->private;
@@ -156,8 +188,9 @@ static void kernfs_seq_stop(struct seq_file *sf, void *v)
static int kernfs_seq_show(struct seq_file *sf, void *v)
{
struct kernfs_open_file *of = sf->private;
+ struct kernfs_open_node *on = kernfs_deref_on_raw(of->kn);
- of->event = atomic_read(&of->kn->attr.open->event);
+ of->event = atomic_read(&unrcu_pointer(on)->event);
return of->kn->attr.ops->seq_show(sf, v);
}
@@ -180,6 +213,7 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
struct kernfs_open_file *of = kernfs_of(iocb->ki_filp);
ssize_t len = min_t(size_t, iov_iter_count(iter), PAGE_SIZE);
const struct kernfs_ops *ops;
+ struct kernfs_open_node *on;
char *buf;
buf = of->prealloc_buf;
@@ -201,7 +235,8 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
goto out_free;
}
- of->event = atomic_read(&of->kn->attr.open->event);
+ on = kernfs_deref_on_raw(of->kn);
+ of->event = atomic_read(&unrcu_pointer(on)->event);
ops = kernfs_ops(of->kn);
if (ops->read)
len = ops->read(of, buf, len, iocb->ki_pos);
@@ -519,36 +554,29 @@ static int kernfs_get_open_node(struct kernfs_node *kn,
{
struct kernfs_open_node *on, *new_on = NULL;
- retry:
mutex_lock(&kernfs_open_file_mutex);
- spin_lock_irq(&kernfs_open_node_lock);
-
- if (!kn->attr.open && new_on) {
- kn->attr.open = new_on;
- new_on = NULL;
- }
-
- on = kn->attr.open;
- if (on)
- list_add_tail(&of->list, &on->files);
-
- spin_unlock_irq(&kernfs_open_node_lock);
- mutex_unlock(&kernfs_open_file_mutex);
+ on = kernfs_deref_on_protected(kn);
if (on) {
- kfree(new_on);
+ list_add_tail(&of->list, &on->files);
+ mutex_unlock(&kernfs_open_file_mutex);
return 0;
+ } else {
+ /* not there, initialize a new one */
+ new_on = kmalloc(sizeof(*new_on), GFP_KERNEL);
+ if (!new_on) {
+ mutex_unlock(&kernfs_open_file_mutex);
+ return -ENOMEM;
+ }
+ atomic_set(&new_on->event, 1);
+ init_waitqueue_head(&new_on->poll);
+ INIT_LIST_HEAD(&new_on->files);
+ list_add_tail(&of->list, &new_on->files);
+ rcu_assign_pointer(kn->attr.open, new_on);
}
+ mutex_unlock(&kernfs_open_file_mutex);
- /* not there, initialize a new one and retry */
- new_on = kmalloc(sizeof(*new_on), GFP_KERNEL);
- if (!new_on)
- return -ENOMEM;
-
- atomic_set(&new_on->event, 1);
- init_waitqueue_head(&new_on->poll);
- INIT_LIST_HEAD(&new_on->files);
- goto retry;
+ return 0;
}
/**
@@ -567,24 +595,25 @@ static int kernfs_get_open_node(struct kernfs_node *kn,
static void kernfs_unlink_open_file(struct kernfs_node *kn,
struct kernfs_open_file *of)
{
- struct kernfs_open_node *on = kn->attr.open;
- unsigned long flags;
+ struct kernfs_open_node *on;
mutex_lock(&kernfs_open_file_mutex);
- spin_lock_irqsave(&kernfs_open_node_lock, flags);
+
+ on = kernfs_deref_on_protected(kn);
+ if (!on) {
+ mutex_unlock(&kernfs_open_file_mutex);
+ return;
+ }
if (of)
list_del(&of->list);
- if (list_empty(&on->files))
- kn->attr.open = NULL;
- else
- on = NULL;
+ if (list_empty(&on->files)) {
+ rcu_assign_pointer(kn->attr.open, NULL);
+ kfree_rcu(on, rcu_head);
+ }
- spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
mutex_unlock(&kernfs_open_file_mutex);
-
- kfree(on);
}
static int kernfs_fop_open(struct inode *inode, struct file *file)
@@ -774,17 +803,16 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
* check under kernfs_open_file_mutex will ensure bailing out if
* ->attr.open became NULL while waiting for the mutex.
*/
- if (!kn->attr.open)
+ if (!rcu_access_pointer(kn->attr.open))
return;
mutex_lock(&kernfs_open_file_mutex);
- if (!kn->attr.open) {
+ on = kernfs_check_on_protected(kn);
+ if (!on) {
mutex_unlock(&kernfs_open_file_mutex);
return;
}
- on = kn->attr.open;
-
list_for_each_entry(of, &on->files, list) {
struct inode *inode = file_inode(of->file);
@@ -815,7 +843,7 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
__poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait)
{
struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry);
- struct kernfs_open_node *on = kn->attr.open;
+ struct kernfs_open_node *on = kernfs_deref_on_raw(kn);
poll_wait(of->file, &on->poll, wait);
@@ -922,13 +950,13 @@ void kernfs_notify(struct kernfs_node *kn)
return;
/* kick poll immediately */
- spin_lock_irqsave(&kernfs_open_node_lock, flags);
- on = kn->attr.open;
+ rcu_read_lock();
+ on = rcu_dereference(kn->attr.open);
if (on) {
atomic_inc(&on->event);
wake_up_interruptible(&on->poll);
}
- spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
+ rcu_read_unlock();
/* schedule work to kick fsnotify */
spin_lock_irqsave(&kernfs_notify_lock, flags);
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index e2ae15a6225e..13f54f078a52 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -114,7 +114,7 @@ struct kernfs_elem_symlink {
struct kernfs_elem_attr {
const struct kernfs_ops *ops;
- struct kernfs_open_node *open;
+ struct kernfs_open_node __rcu *open;
loff_t size;
struct kernfs_node *notify_next; /* for kernfs_notify() */
};
--
2.30.2
The decision to free the kernfs_open_node object in kernfs_put_open_node can
be taken based on whether the kernfs_open_node->files list is empty or not. As
far as kernfs_drain_open_files is concerned, it can't overlap with
kernfs_fops_open and hence can check ->attr.open optimistically (if
->attr.open is NULL) or under kernfs_open_file_mutex (if it needs to traverse
the ->files list). Thus kernfs_drain_open_files can work without the
kernfs_open_node reference count as well.

So remove ->refcnt and modify the above-mentioned users accordingly.
Suggested-by: Al Viro <[email protected]>
Signed-off-by: Imran Khan <[email protected]>
---
fs/kernfs/file.c | 47 ++++++++++++++++++++++++++---------------------
1 file changed, 26 insertions(+), 21 deletions(-)
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 88423069407c..e3abfa843879 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -33,7 +33,6 @@ static DEFINE_SPINLOCK(kernfs_open_node_lock);
static DEFINE_MUTEX(kernfs_open_file_mutex);
struct kernfs_open_node {
- atomic_t refcnt;
atomic_t event;
wait_queue_head_t poll;
struct list_head files; /* goes through kernfs_open_file.list */
@@ -530,10 +529,8 @@ static int kernfs_get_open_node(struct kernfs_node *kn,
}
on = kn->attr.open;
- if (on) {
- atomic_inc(&on->refcnt);
+ if (on)
list_add_tail(&of->list, &on->files);
- }
spin_unlock_irq(&kernfs_open_node_lock);
mutex_unlock(&kernfs_open_file_mutex);
@@ -548,7 +545,6 @@ static int kernfs_get_open_node(struct kernfs_node *kn,
if (!new_on)
return -ENOMEM;
- atomic_set(&new_on->refcnt, 0);
atomic_set(&new_on->event, 1);
init_waitqueue_head(&new_on->poll);
INIT_LIST_HEAD(&new_on->files);
@@ -556,17 +552,19 @@ static int kernfs_get_open_node(struct kernfs_node *kn,
}
/**
- * kernfs_put_open_node - put kernfs_open_node
- * @kn: target kernfs_nodet
+ * kernfs_unlink_open_file - Unlink @of from @kn.
+ *
+ * @kn: target kernfs_node
* @of: associated kernfs_open_file
*
- * Put @kn->attr.open and unlink @of from the files list. If
- * reference count reaches zero, disassociate and free it.
+ * Unlink @of from list of @kn's associated open files. If list of
+ * associated open files becomes empty, disassociate and free
+ * kernfs_open_node.
*
* LOCKING:
* None.
*/
-static void kernfs_put_open_node(struct kernfs_node *kn,
+static void kernfs_unlink_open_file(struct kernfs_node *kn,
struct kernfs_open_file *of)
{
struct kernfs_open_node *on = kn->attr.open;
@@ -578,7 +576,7 @@ static void kernfs_put_open_node(struct kernfs_node *kn,
if (of)
list_del(&of->list);
- if (atomic_dec_and_test(&on->refcnt))
+ if (list_empty(&on->files))
kn->attr.open = NULL;
else
on = NULL;
@@ -706,7 +704,7 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
return 0;
err_put_node:
- kernfs_put_open_node(kn, of);
+ kernfs_unlink_open_file(kn, of);
err_seq_release:
seq_release(inode, file);
err_free:
@@ -752,7 +750,7 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp)
mutex_unlock(&kernfs_open_file_mutex);
}
- kernfs_put_open_node(kn, of);
+ kernfs_unlink_open_file(kn, of);
seq_release(inode, filp);
kfree(of->prealloc_buf);
kfree(of);
@@ -768,15 +766,24 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
if (!(kn->flags & (KERNFS_HAS_MMAP | KERNFS_HAS_RELEASE)))
return;
- spin_lock_irq(&kernfs_open_node_lock);
- on = kn->attr.open;
- if (on)
- atomic_inc(&on->refcnt);
- spin_unlock_irq(&kernfs_open_node_lock);
- if (!on)
+ /*
+ * lockless opportunistic check is safe below because no one is adding to
+ * ->attr.open at this point of time. This check allows early bail out
+ * if ->attr.open is already NULL. kernfs_unlink_open_file makes
+ * ->attr.open NULL only while holding kernfs_open_file_mutex so below
+ * check under kernfs_open_file_mutex will ensure bailing out if
+ * ->attr.open became NULL while waiting for the mutex.
+ */
+ if (!kn->attr.open)
return;
mutex_lock(&kernfs_open_file_mutex);
+ if (!kn->attr.open) {
+ mutex_unlock(&kernfs_open_file_mutex);
+ return;
+ }
+
+ on = kn->attr.open;
list_for_each_entry(of, &on->files, list) {
struct inode *inode = file_inode(of->file);
@@ -789,8 +796,6 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
}
mutex_unlock(&kernfs_open_file_mutex);
-
- kernfs_put_open_node(kn, NULL);
}
/*
--
2.30.2
At present kernfs_notify_list is implemented as a singly linked list of
kernfs_node(s), where the last element points to itself and the value of
->attr.notify_next tells whether a node is present on the list or not.
Both addition and deletion to the list happen under kernfs_notify_lock.

Change kernfs_notify_list to an llist so that additions to the list can
happen locklessly. Consumers (kernfs_notify_workfn) still need to be
serialized among themselves, because there can be multiple concurrent
work items and llist_del_first() callers can race with one another.
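The resulting producer/consumer pattern is roughly as follows (a sketch;
llist_add() is lockless, while llist_del_first() callers must be serialized
among themselves per the llist API rules):

	/* producer, kernfs_notify() */
	kernfs_get(kn);
	llist_add(&kn->attr.notify_next, &kernfs_notify_list);
	schedule_work(&kernfs_notify_work);

	/* consumer, kernfs_notify_workfn() */
	struct llist_node *free;
	struct kernfs_elem_attr *attr;

	free = llist_del_first(&kernfs_notify_list);
	if (free) {
		attr = llist_entry(free, struct kernfs_elem_attr, notify_next);
		kn = attribute_to_node(attr, struct kernfs_node, attr);
	}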
Suggested-by: Al Viro <[email protected]>
Signed-off-by: Imran Khan <[email protected]>
---
fs/kernfs/file.c | 47 ++++++++++++++++++------------------------
include/linux/kernfs.h | 2 +-
2 files changed, 21 insertions(+), 28 deletions(-)
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 36bff71ab263..96c8493003b6 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -38,18 +38,16 @@ struct kernfs_open_node {
struct list_head files; /* goes through kernfs_open_file.list */
};
-/*
- * kernfs_notify() may be called from any context and bounces notifications
- * through a work item. To minimize space overhead in kernfs_node, the
- * pending queue is implemented as a singly linked list of kernfs_nodes.
- * The list is terminated with the self pointer so that whether a
- * kernfs_node is on the list or not can be determined by testing the next
- * pointer for NULL.
+/**
+ * attribute_to_node - get kernfs_node object corresponding to a kernfs attribute
+ * @ptr: &struct kernfs_elem_attr
+ * @type: struct kernfs_node
+ * @member: name of member (i.e attr)
*/
-#define KERNFS_NOTIFY_EOL ((void *)&kernfs_notify_list)
+#define attribute_to_node(ptr, type, member) \
+ container_of(ptr, type, member)
-static DEFINE_SPINLOCK(kernfs_notify_lock);
-static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL;
+static LLIST_HEAD(kernfs_notify_list);
/*
* Raw deref RCU protected kn->attr.open.
@@ -876,18 +874,16 @@ static void kernfs_notify_workfn(struct work_struct *work)
struct kernfs_node *kn;
struct kernfs_super_info *info;
struct kernfs_root *root;
+ struct llist_node *free;
+ struct kernfs_elem_attr *attr;
repeat:
/* pop one off the notify_list */
- spin_lock_irq(&kernfs_notify_lock);
- kn = kernfs_notify_list;
- if (kn == KERNFS_NOTIFY_EOL) {
- spin_unlock_irq(&kernfs_notify_lock);
+ free = llist_del_first(&kernfs_notify_list);
+ if (free == NULL)
return;
- }
- kernfs_notify_list = kn->attr.notify_next;
- kn->attr.notify_next = NULL;
- spin_unlock_irq(&kernfs_notify_lock);
+ attr = llist_entry(free, struct kernfs_elem_attr, notify_next);
+ kn = attribute_to_node(attr, struct kernfs_node, attr);
root = kernfs_root(kn);
/* kick fsnotify */
down_write(&root->kernfs_rwsem);
@@ -943,12 +939,14 @@ static void kernfs_notify_workfn(struct work_struct *work)
void kernfs_notify(struct kernfs_node *kn)
{
static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn);
- unsigned long flags;
struct kernfs_open_node *on;
if (WARN_ON(kernfs_type(kn) != KERNFS_FILE))
return;
+ /* Because we are using llist for kernfs_notify_list */
+ WARN_ON_ONCE(in_nmi());
+
/* kick poll immediately */
rcu_read_lock();
on = rcu_dereference(kn->attr.open);
@@ -959,14 +957,9 @@ void kernfs_notify(struct kernfs_node *kn)
rcu_read_unlock();
/* schedule work to kick fsnotify */
- spin_lock_irqsave(&kernfs_notify_lock, flags);
- if (!kn->attr.notify_next) {
- kernfs_get(kn);
- kn->attr.notify_next = kernfs_notify_list;
- kernfs_notify_list = kn;
- schedule_work(&kernfs_notify_work);
- }
- spin_unlock_irqrestore(&kernfs_notify_lock, flags);
+ kernfs_get(kn);
+ llist_add(&kn->attr.notify_next, &kernfs_notify_list);
+ schedule_work(&kernfs_notify_work);
}
EXPORT_SYMBOL_GPL(kernfs_notify);
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 13f54f078a52..2dd9c8df0f4f 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -116,7 +116,7 @@ struct kernfs_elem_attr {
const struct kernfs_ops *ops;
struct kernfs_open_node __rcu *open;
loff_t size;
- struct kernfs_node *notify_next; /* for kernfs_notify() */
+ struct llist_node notify_next; /* for kernfs_notify() */
};
/*
--
2.30.2
Hello Tejun,
Thank you for reviewing this.
On 6/5/22 6:01 am, Tejun Heo wrote:
> On Thu, Apr 28, 2022 at 03:54:28PM +1000, Imran Khan wrote:
>> +static struct kernfs_open_node *kernfs_deref_on_raw(struct kernfs_node *kn)
>> +{
>> + return rcu_dereference_raw(kn->attr.open);
>> +}
>
> Wrapping the above probably isn't helping anything.
>
This change is using raw deref at a few places, so I thought of putting
raw deref under a wrapper and explaining, in the wrapper's function comment,
under what conditions raw dereferencing is safe.
>> +/*
>> + * Check ->attr.open corresponding to @kn while holding kernfs_open_file_mutex.
>> + * ->attr.open is modified under kernfs_open_file_mutex. So it can be safely
>> + * accessed outside RCU read-side critical section, while holding the mutex.
>> + */
>> +static struct kernfs_open_node *kernfs_check_on_protected(struct kernfs_node *kn)
>> +{
>> + return rcu_dereference_check(kn->attr.open,
>> + lockdep_is_held(&kernfs_open_file_mutex));
>> +}
>
> Maybe name this just kernfs_deref_on()?
>
Sure. I can mention in the function comment that this should be used
only under kernfs_open_file_mutex.
>> @@ -156,8 +188,9 @@ static void kernfs_seq_stop(struct seq_file *sf, void *v)
>> static int kernfs_seq_show(struct seq_file *sf, void *v)
>> {
>> struct kernfs_open_file *of = sf->private;
>> + struct kernfs_open_node *on = kernfs_deref_on_raw(of->kn);
>
> I suppose this is protected by the fact that @of is on @on?
Yes.
> If so, just add
> the condition to the checked version. The condition doesn't have to be
> perfect - e.g. you can just say that neither @on's nor @of's list_head is
> empty. While not comprehensive, it'd still provide meaningful protection
> against mistakes and be easier to understand if the deref accessor clearly
> explains the expectations.
>
Sure. Could you please let me know if the below modification looks good?
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 946a4a8d7e32..a12d5067f8d5 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -56,12 +56,23 @@ static inline struct mutex *kernfs_open_file_mutex_lock(struct kernfs_node *kn)
 /*
  * Raw deref RCU protected kn->attr.open.
- * The caller guarantees that @on will not vanish in the middle of this
- * function and hence we can deref ->attr.open raw.
+ * If both @of->list and @kn->attr.open->files are non-empty, we can safely
+ * assume that @of is on @kn->attr.open, hence @kn->attr.open will not vanish
+ * and raw dereferencing is safe here.
  */
-static struct kernfs_open_node *kernfs_deref_on_raw(struct kernfs_node *kn)
+static struct kernfs_open_node *
+kernfs_deref_on_raw(struct kernfs_open_file *of, struct kernfs_node *kn)
 {
-	return rcu_dereference_raw(kn->attr.open);
+	struct kernfs_open_node *on;
+
+	if (list_empty(&of->list))
+		return NULL;
+
+	on = rcu_dereference_raw(kn->attr.open);
+
+	if (list_empty(&on->files))
+		return NULL;
+	else
+		return on;
 }
 /*
@@ -191,7 +202,10 @@ static void kernfs_seq_stop(struct seq_file *sf, void *v)
 static int kernfs_seq_show(struct seq_file *sf, void *v)
 {
 	struct kernfs_open_file *of = sf->private;
-	struct kernfs_open_node *on = kernfs_deref_on_raw(of->kn);
+	struct kernfs_open_node *on = kernfs_deref_on_raw(of, of->kn);
+
+	if (!on)
+		return -EINVAL;
 	of->event = atomic_read(&unrcu_pointer(on)->event);
@@ -238,7 +252,10 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 		goto out_free;
 	}
-	on = kernfs_deref_on_raw(of->kn);
+	on = kernfs_deref_on_raw(of, of->kn);
+	if (!on)
+		return -EINVAL;
+
 	of->event = atomic_read(&unrcu_pointer(on)->event);
 	ops = kernfs_ops(of->kn);
 	if (ops->read)
@@ -850,7 +867,10 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
 __poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait)
 {
 	struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry);
-	struct kernfs_open_node *on = kernfs_deref_on_raw(kn);
+	struct kernfs_open_node *on = kernfs_deref_on_raw(of, kn);
+
+	if (!on)
+		return EPOLLERR;
 	poll_wait(of->file, &on->poll, wait);
>> @@ -201,7 +235,8 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
>> goto out_free;
[...]
>> @@ -922,13 +950,13 @@ void kernfs_notify(struct kernfs_node *kn)
>> return;
>>
>> /* kick poll immediately */
>> - spin_lock_irqsave(&kernfs_open_node_lock, flags);
>> - on = kn->attr.open;
>> + rcu_read_lock();
>> + on = rcu_dereference(kn->attr.open);
>
> Shouldn't this be kernfs_deref_on() too?
>
kernfs_notify can be invoked w/o holding kernfs_open_file_mutex (e.g. from
cgroup_file_notify), so we have an explicit RCU read-side critical section
here and hence use plain rcu_dereference.
Thanks
-- Imran
> Thanks.
>
On Thu, Apr 28, 2022 at 03:54:28PM +1000, Imran Khan wrote:
> +static struct kernfs_open_node *kernfs_deref_on_raw(struct kernfs_node *kn)
> +{
> + return rcu_dereference_raw(kn->attr.open);
> +}
Wrapping the above probably isn't helping anything.
> +/*
> + * Check ->attr.open corresponding to @kn while holding kernfs_open_file_mutex.
> + * ->attr.open is modified under kernfs_open_file_mutex. So it can be safely
> + * accessed outside RCU read-side critical section, while holding the mutex.
> + */
> +static struct kernfs_open_node *kernfs_check_on_protected(struct kernfs_node *kn)
> +{
> + return rcu_dereference_check(kn->attr.open,
> + lockdep_is_held(&kernfs_open_file_mutex));
> +}
Maybe name this just kernfs_deref_on()?
> @@ -156,8 +188,9 @@ static void kernfs_seq_stop(struct seq_file *sf, void *v)
> static int kernfs_seq_show(struct seq_file *sf, void *v)
> {
> struct kernfs_open_file *of = sf->private;
> + struct kernfs_open_node *on = kernfs_deref_on_raw(of->kn);
I suppose this is protected by the fact that @of is on @on? If so, just add
the condition to the checked version. The condition doesn't have to be
perfect - e.g. you can just say that neither @on's nor @of's list_head is
empty. While not comprehensive, it'd still provide meaningful protection
against mistakes and be easier to understand if the deref accessor clearly
explains the expectations.
> @@ -201,7 +235,8 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
> goto out_free;
> }
>
> - of->event = atomic_read(&of->kn->attr.open->event);
> + on = kernfs_deref_on_raw(of->kn);
> + of->event = atomic_read(&unrcu_pointer(on)->event);
Ditto here.
> @@ -815,7 +843,7 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
> __poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait)
> {
> struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry);
> - struct kernfs_open_node *on = kn->attr.open;
> + struct kernfs_open_node *on = kernfs_deref_on_raw(kn);
and here.
> @@ -922,13 +950,13 @@ void kernfs_notify(struct kernfs_node *kn)
> return;
>
> /* kick poll immediately */
> - spin_lock_irqsave(&kernfs_open_node_lock, flags);
> - on = kn->attr.open;
> + rcu_read_lock();
> + on = rcu_dereference(kn->attr.open);
Shouldn't this be kernfs_deref_on() too?
Thanks.
--
tejun
On Thu, Apr 28, 2022 at 03:54:31PM +1000, Imran Khan wrote:
> +static void __init kernfs_mutex_init(void)
> +{
> + int count;
> +
> + for (count = 0; count < NR_KERNFS_LOCKS; count++)
> + mutex_init(&kernfs_locks->open_file_mutex[count]);
> +}
Does this need to be a separate function?
> +static void __init kernfs_lock_init(void)
> +{
> + kernfs_locks = kmalloc(sizeof(struct kernfs_global_locks), GFP_KERNEL);
> + WARN_ON(!kernfs_locks);
> +
> + kernfs_mutex_init();
> +}
> +
> void __init kernfs_init(void)
> {
> kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
Other than that minor nitpick,
Acked-by: Tejun Heo <[email protected]>
Thanks.
--
tejun
On Thu, Apr 28, 2022 at 03:54:29PM +1000, Imran Khan wrote:
> At present kernfs_notify_list is implemented as a singly linked
> list of kernfs_node(s), where the last element points to itself and
> the value of ->attr.notify_next tells whether a node is present on the
> list or not. Both addition and deletion to the list happen under
> kernfs_notify_lock.
>
> Change kernfs_notify_list to an llist so that additions to the list can
> happen locklessly. Consumers (kernfs_notify_workfn) still need to be
> serialized among themselves, because there can be multiple concurrent
> work items and llist_del_first() callers can race with one another.
>
> Suggested-by: Al Viro <[email protected]>
> Signed-off-by: Imran Khan <[email protected]>
Acked-by: Tejun Heo <[email protected]>
Thanks.
--
tejun
On Thu, Apr 28, 2022 at 03:54:27PM +1000, Imran Khan wrote:
> The decision to free kernfs_open_node object in kernfs_put_open_node can
> be taken based on whether kernfs_open_node->files list is empty or not. As
> far as kernfs_drain_open_files is concerned it can't overlap with
> kernfs_fops_open and hence can check for ->attr.open optimistically
> (if ->attr.open is NULL) or under kernfs_open_file_mutex (if it needs to
> traverse the ->files list). Thus kernfs_drain_open_files can work without
> the kernfs_open_node reference count as well.
> So remove ->refcnt and modify the above mentioned users accordingly.
>
> Suggested-by: Al Viro <[email protected]>
> Signed-off-by: Imran Khan <[email protected]>
Acked-by: Tejun Heo <[email protected]>
Thanks.
--
tejun
Hello Tejun,
On 6/5/22 6:05 am, Tejun Heo wrote:
> On Thu, Apr 28, 2022 at 03:54:31PM +1000, Imran Khan wrote:
>> +static void __init kernfs_mutex_init(void)
>> +{
>> + int count;
>> +
>> + for (count = 0; count < NR_KERNFS_LOCKS; count++)
>> + mutex_init(&kernfs_locks->open_file_mutex[count]);
>> +}
>
> Does this need to be a separate function?
>
Above, I have moved the mutex initialization from kernfs_lock_init into a
separate function, kernfs_mutex_init. Could you please let me know if I
am missing something here?
>> +static void __init kernfs_lock_init(void)
>> +{
>> + kernfs_locks = kmalloc(sizeof(struct kernfs_global_locks), GFP_KERNEL);
>> + WARN_ON(!kernfs_locks);
>> +
>> + kernfs_mutex_init();
>> +}
>> +
>> void __init kernfs_init(void)
>> {
>> kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
>
> Other than that minor nitpick,
>
> Acked-by: Tejun Heo <[email protected]>
>
> Thanks.
>
Thanks,
-- Imran
Hello,
On Fri, May 06, 2022 at 09:12:23AM +1000, Imran Khan wrote:
> On 6/5/22 6:01 am, Tejun Heo wrote:
> > On Thu, Apr 28, 2022 at 03:54:28PM +1000, Imran Khan wrote:
> >> +static struct kernfs_open_node *kernfs_deref_on_raw(struct kernfs_node *kn)
> >> +{
> >> + return rcu_dereference_raw(kn->attr.open);
> >> +}
> >
> > Wrapping the above probably isn't helping anything.
> >
>
> This change is using raw deref at a few places, so I thought of putting
> raw deref under a wrapper and explain in the wrapper function comment
> under what conditions raw dereferencing was safe.
I don't think they need raw deref in the first place and if you *really*
need raw deref, the wrapper explaining why they're needed and how they're
safe is the whole point of it and I don't think the wrapper achieves that.
> >> +/*
> >> + * Check ->attr.open corresponding to @kn while holding kernfs_open_file_mutex.
> >> + * ->attr.open is modified under kernfs_open_file_mutex. So it can be safely
> >> + * accessed outside RCU read-side critical section, while holding the mutex.
> >> + */
> >> +static struct kernfs_open_node *kernfs_check_on_protected(struct kernfs_node *kn)
> >> +{
> >> + return rcu_dereference_check(kn->attr.open,
> >> + lockdep_is_held(&kernfs_open_file_mutex));
> >> +}
> >
> > Maybe name this just kernfs_deref_on()?
> >
>
> Sure. I can mention in the function comment that this should be used
> only under kernfs_open_file_mutex.
and in the check condition, add the conditions that you need to make this
not trigger a warning when used in all the places that you wanna use it.
> +static struct kernfs_open_node *kernfs_deref_on_raw(struct
> kernfs_open_file *of, struct kernfs_node *kn)
> {
> - return rcu_dereference_raw(kn->attr.open);
> + struct kernfs_open_node *on;
> +
> + if (list_empty(&of->list))
> + return NULL;
> +
> + on = rcu_dereference_raw(kn->attr.open);
> +
> + if (list_empty(&on->files))
> + return NULL;
> + else
> + return on;
Put the above conditions in the rcu_dereference_check(). That's what it is
for - describing the additional conditions that make the dereference safe.
Thanks.
--
tejun
Hello Tejun,
Thanks for your reply.
On 13/5/22 3:09 am, Tejun Heo wrote:
> Hello,
>
> On Fri, May 06, 2022 at 09:12:23AM +1000, Imran Khan wrote:
>> On 6/5/22 6:01 am, Tejun Heo wrote:
>>> On Thu, Apr 28, 2022 at 03:54:28PM +1000, Imran Khan wrote:
>>>> +static struct kernfs_open_node *kernfs_deref_on_raw(struct kernfs_node *kn)
>>>> +{
>>>> + return rcu_dereference_raw(kn->attr.open);
>>>> +}
>>>
>>> Wrapping the above probably isn't helping anything.
>>>
>>
>> This change is using raw deref at a few places, so I thought of putting
>> raw deref under a wrapper and explain in the wrapper function comment
>> under what conditions raw dereferencing was safe.
>
> I don't think they need raw deref in the first place and if you *really*
> need raw deref, the wrapper explaining why they're needed and how they're
> safe is the whole point of it and I don't think the wrapper achieves that.
>
Okay. I checked with rcu_access_pointer and CONFIG_PROVE_RCU enabled and did
not observe any warning(s), so rcu_access_pointer should have sufficed here.
But I am using rcu_dereference_check to accommodate checking of @of->list as
well. The checking of @of->list also covers one of your later suggestions. I
have updated the description of the function as well. Could you please let me
know if the below looks okay:
+/*
+ * Deref RCU protected kn->attr.open.
+ * If both @of->list and @kn->attr.open->files are non-empty, we can safely
+ * assume that @of is on @kn->attr.open and hence @kn->attr.open will not
+ * vanish and dereferencing is safe here.
+ */
+static struct kernfs_open_node *
+kernfs_deref_on_check(struct kernfs_open_file *of, struct kernfs_node *kn)
+{
+	struct kernfs_open_node *on;
+
+	on = rcu_dereference_check(kn->attr.open, !list_empty(&of->list));
+
+	if (on && list_empty(&on->files))
+		return NULL;
+	else
+		return on;
+}
+
If this looks okay then I can replace usage of kernfs_deref_on_raw with
kernfs_deref_on_check.
>>>> +/*
>>>> + * Check ->attr.open corresponding to @kn while holding kernfs_open_file_mutex.
>>>> + * ->attr.open is modified under kernfs_open_file_mutex. So it can be safely
>>>> + * accessed outside RCU read-side critical section, while holding the mutex.
>>>> + */
>>>> +static struct kernfs_open_node *kernfs_check_on_protected(struct kernfs_node *kn)
>>>> +{
>>>> + return rcu_dereference_check(kn->attr.open,
>>>> + lockdep_is_held(&kernfs_open_file_mutex));
>>>> +}
>>>
>>> Maybe name this just kernfs_deref_on()?
>>>
>>
>> Sure. I can mention in the function comment that this should be used
>> only under kernfs_open_file_mutex.
>
> and in the check condition, add the conditions that you need to make this
> not trigger a warning when used in all the places that you wanna use it.
>
Since ->attr.open is always accessed/modified under kernfs_open_file_mutex, I
have included this check in the helper, which should be used only while
holding this mutex. Do you mean that I should include some additional checks
as well in the below helper:
+/*
+ * Deref ->attr.open corresponding to @kn while holding kernfs_open_file_mutex.
+ * ->attr.open is modified under kernfs_open_file_mutex. So it can be
+ * safely accessed outside RCU read-side critical section, while holding
+ * the mutex.
+ */
+static struct kernfs_open_node *kernfs_deref_on(struct kernfs_node *kn)
+{
+	return rcu_dereference_protected(kn->attr.open,
+				lockdep_is_held(&kernfs_open_file_mutex));
+}
+
>> +static struct kernfs_open_node *kernfs_deref_on_raw(struct
>> kernfs_open_file *of, struct kernfs_node *kn)
>> {
>> - return rcu_dereference_raw(kn->attr.open);
>> + struct kernfs_open_node *on;
>> +
>> + if (list_empty(&of->list))
>> + return NULL;
>> +
>> + on = rcu_dereference_raw(kn->attr.open);
>> +
>> + if (list_empty(&on->files))
>> + return NULL;
>> + else
>> + return on;
>
> Put the above conditions in the rcu_dereference_check(). That's what it is
> for - describing the additional conditions that make the dereference safe.
>
As mentioned earlier, I have included checking of @of->list in
rcu_dereference_check. I am not sure if we can include checking of on->files
as well, because "on" itself is the result of the dereference here. So I have
kept the check of on->files separate, as shown in the earlier snippet of
kernfs_deref_on_check above.
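To make the ordering problem concrete, the hypothetical variant below is
broken by design, since the check condition would read "on" before the
dereference has assigned it:

	on = rcu_dereference_check(kn->attr.open, !list_empty(&on->files));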
Thanks
-- Imran
Hello,
On Mon, May 16, 2022 at 02:00:50AM +1000, Imran Khan wrote:
> +/*
> + * Deref RCU protected kn->attr.open.
> + * If both @of->list and @kn->attr.open->files are non-empty, we can safely
> + * assume that @of is on @kn->attr.open and hence @kn->attr.open will not
> + * vanish and dereferencing is safe here.
> + */
Maybe use proper function comment starting with /**?
> +static struct kernfs_open_node *
> +kernfs_deref_on_check(struct kernfs_open_file *of, struct kernfs_node *kn)
> +{
> +	struct kernfs_open_node *on;
> +
> +	on = rcu_dereference_check(kn->attr.open, !list_empty(&of->list));
> +
> +	if (on && list_empty(&on->files))
> +		return NULL;
> +	else
> +		return on;
> +}
Why does it need to return NULL on empty on->files? We wanna trigger lockdep
warning cuz that's a bug but it's not like the caller can recover from it
(short of causing unexpected user visible error), so I don't see what the
point is.
> If this looks okay then I can replace usage of kernfs_deref_on_raw with
> kernfs_deref_on_check.
So, this is the main deref function without holding the mutex, right? Then
name it kernfs_deref_open_node() (or on, but it seems a bit confusing to me).
> > and in the check condition, add the conditions that you need to make this
> > not trigger warning when used in all the places that you wanna use it.
> >
>
> Since ->attr.open is always accessed/modified under kernfs_open_file_mutex, I
> have included this check in helper which should be used only while holding this
> mutex. Do you mean that I should include some additional checks as well in the
> below helper:
Yeah, you're right. _protected makes sense for this one.
Thanks.
--
tejun
On Tue, May 17, 2022 at 12:30:26PM +1000, Imran Khan wrote:
> +/**
> + * kernfs_deref_open_node - Get kernfs_open_node corresponding to @kn.
> + *
> + * @kn: target kernfs_node.
> + * @val: RCU dereference will take place only if val is true.
> + *
> + * Fetch and return ->attr.open of @kn when current updater of ->attr.open
> + * ensures that as long as @val is true, other updater(s) of ->attr.open
> + * can not change it. The caller needs to pass value of the condition
> + * (@val) that prevents value of ->attr.open from changing.
> + *
> + * This should ONLY be used by updaters of ->attr.open.
> + */
> +static struct kernfs_open_node *
> +kernfs_deref_open_node(struct kernfs_node *kn, bool val)
> +{
> + return rcu_dereference_protected(kn->attr.open, val);
> +}
> +
> +/**
> + * kernfs_check_open_node - Get kernfs_open_node corresponding to @kn.
> + *
> + * @kn: target kernfs_node.
> + * @val: RCU dereference will take place only if val is true.
> + *
> + * rcu_dereference and return ->attr.open of @kn. This is used in cases
> + * where we can access ->attr.open outside RCU read-side critical section
> + * as long as specified conditions are correct i.e @val is true.
> + *
> + * This should ONLY be used by readers of ->attr.open.
> + */
> +static struct kernfs_open_node *
> +kernfs_check_open_node(struct kernfs_node *kn, bool val)
> +{
> + return rcu_dereference_check(kn->attr.open, val);
> +}
You gotta put the conditions inside these wrappers. These wrappers are
supposed to describe the conditions as strictly as reasonably possible so that
the checks can catch people making mistakes. It doesn't make much sense to
defer the condition to the callers. Just describe what the expected conditions
are in the wrappers as best as you reasonably can.
Thanks.
--
tejun
Hello Tejun,
Thanks for your reply.
On 17/5/22 5:35 am, Tejun Heo wrote:
> Hello,
[...]
> Why does it need to return NULL on empty on->files? We wanna trigger lockdep
> warning cuz that's a bug but it's not like the caller can recover from it
> (short of causing unexpected user visible error), so I don't see what the
> point is.
>
Yeah. Returning NULL for this is wrong. I overdid the checking part because
current users are using on->event or on->poll. Also I have modified the
interface to accommodate different condition checks (please see below), and
this would mean that users of the interface can/should specify the condition
under which the dereferenced pointer will be treated as valid.
>> If this looks okay then I can replace usage of kernfs_deref_on_raw with
>> kernfs_deref_on_check.
>
> So, this is the main deref function without holding the mutex, right? Then
> name it kernfs_deref_open_node() (or on but it seem a bit confusing to me).
>
kernfs_deref_open_node() looks better to me. But I am thinking that for
dereferencing ->attr.open outside an explicit read-side critical section we
need only 2 interfaces: one for the updaters and one for the readers. These
updaters and readers should be able to deref ->attr.open under specific
conditions which guarantee the validity of the dereferenced pointer. For
example, can we just have these 2 interfaces:
+/**
+ * kernfs_deref_open_node - Get kernfs_open_node corresponding to @kn.
+ *
+ * @kn: target kernfs_node.
+ * @val: RCU dereference will take place only if val is true.
+ *
+ * Fetch and return ->attr.open of @kn when current updater of ->attr.open
+ * ensures that as long as @val is true, other updater(s) of ->attr.open
+ * can not change it. The caller needs to pass value of the condition
+ * (@val) that prevents value of ->attr.open from changing.
+ *
+ * This should ONLY be used by updaters of ->attr.open.
+ */
+static struct kernfs_open_node *
+kernfs_deref_open_node(struct kernfs_node *kn, bool val)
+{
+ return rcu_dereference_protected(kn->attr.open, val);
+}
+
+/**
+ * kernfs_check_open_node - Get kernfs_open_node corresponding to @kn.
+ *
+ * @kn: target kernfs_node.
+ * @val: RCU dereference will take place only if val is true.
+ *
+ * rcu_dereference and return ->attr.open of @kn. This is used in cases
+ * where we can access ->attr.open outside RCU read-side critical section
+ * as long as specified conditions are correct i.e @val is true.
+ *
+ * This should ONLY be used by readers of ->attr.open.
+ */
+static struct kernfs_open_node *
+kernfs_check_open_node(struct kernfs_node *kn, bool val)
+{
+ return rcu_dereference_check(kn->attr.open, val);
+}
+
This will avoid the need for different interfaces for different conditions
(holding kernfs_open_file_mutex, @of->list being non-empty, etc.). Current
updaters like kernfs_get_open_node and kernfs_unlink_open_file can specify
lockdep_is_held(&kernfs_open_file_mutex) as the valid condition. Amongst
current readers, kernfs_drain_open_files can specify the same condition, and
kernfs_seq_show, kernfs_file_read_iter etc. can specify !list_empty(&of->list)
as the valid condition. This will also help us if dereferencing is needed
under certain other conditions in future.
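For example, usage would look roughly like this (a sketch, assuming the two
interfaces above):

	/* updater, e.g. kernfs_unlink_open_file(), holding the mutex */
	on = kernfs_deref_open_node(kn,
			lockdep_is_held(&kernfs_open_file_mutex));

	/* reader, e.g. kernfs_seq_show() */
	on = kernfs_check_open_node(of->kn, !list_empty(&of->list));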
For just checking the pointer value we are using rcu_access_pointer anyway,
and that should not need any wrapper.

Could you please let me know if this sounds reasonable? It is not changing
anything that we have discussed so far in this regard, it is just reducing
the number of interfaces.
Thanks
-- Imran