After removal of kernfs_open_node->refcnt in the previous patch,
kernfs_open_node_lock can be removed as well by making ->attr.open
RCU protected. kernfs_put_open_node can delegate freeing of ->attr.open
to RCU, and other readers of ->attr.open can access it under rcu_read_(un)lock.
So make ->attr.open RCU protected and remove global kernfs_open_node_lock.
Suggested-by: Al Viro <[email protected]>
Signed-off-by: Imran Khan <[email protected]>
---
fs/kernfs/file.c | 101 ++++++++++++++++++++++-------------------
include/linux/kernfs.h | 2 +-
2 files changed, 55 insertions(+), 48 deletions(-)
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index aea6968c979e..bc393dcf4efa 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -23,16 +23,16 @@
* for each kernfs_node with one or more open files.
*
* kernfs_node->attr.open points to kernfs_open_node. attr.open is
- * protected by kernfs_open_node_lock.
+ * RCU protected.
*
* filp->private_data points to seq_file whose ->private points to
* kernfs_open_file. kernfs_open_files are chained at
* kernfs_open_node->files, which is protected by kernfs_open_file_mutex.
*/
-static DEFINE_SPINLOCK(kernfs_open_node_lock);
static DEFINE_MUTEX(kernfs_open_file_mutex);
struct kernfs_open_node {
+ struct rcu_head rcu_head;
atomic_t event;
wait_queue_head_t poll;
struct list_head files; /* goes through kernfs_open_file.list */
@@ -156,8 +156,9 @@ static void kernfs_seq_stop(struct seq_file *sf, void *v)
static int kernfs_seq_show(struct seq_file *sf, void *v)
{
struct kernfs_open_file *of = sf->private;
+ struct kernfs_open_node *on = rcu_dereference_raw(of->kn->attr.open);
- of->event = atomic_read(&of->kn->attr.open->event);
+ of->event = atomic_read(&on->event);
return of->kn->attr.ops->seq_show(sf, v);
}
@@ -180,6 +181,7 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
struct kernfs_open_file *of = kernfs_of(iocb->ki_filp);
ssize_t len = min_t(size_t, iov_iter_count(iter), PAGE_SIZE);
const struct kernfs_ops *ops;
+ struct kernfs_open_node *on;
char *buf;
buf = of->prealloc_buf;
@@ -201,7 +203,8 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
goto out_free;
}
- of->event = atomic_read(&of->kn->attr.open->event);
+ on = rcu_dereference_raw(of->kn->attr.open);
+ of->event = atomic_read(&on->event);
ops = kernfs_ops(of->kn);
if (ops->read)
len = ops->read(of, buf, len, iocb->ki_pos);
@@ -519,36 +522,34 @@ static int kernfs_get_open_node(struct kernfs_node *kn,
{
struct kernfs_open_node *on, *new_on = NULL;
- retry:
mutex_lock(&kernfs_open_file_mutex);
- spin_lock_irq(&kernfs_open_node_lock);
-
- if (!kn->attr.open && new_on) {
- kn->attr.open = new_on;
- new_on = NULL;
- }
-
- on = kn->attr.open;
- if (on)
- list_add_tail(&of->list, &on->files);
-
- spin_unlock_irq(&kernfs_open_node_lock);
- mutex_unlock(&kernfs_open_file_mutex);
+ /**
+ * ->attr.open changes under kernfs_open_file_mutex so we don't
+ * need rcu_read_lock to ensure its existence.
+ */
+ on = rcu_dereference_protected(kn->attr.open,
+ lockdep_is_held(&kernfs_open_file_mutex));
if (on) {
- kfree(new_on);
+ list_add_tail(&of->list, &on->files);
+ mutex_unlock(&kernfs_open_file_mutex);
return 0;
+ } else {
+ /* not there, initialize a new one and retry */
+ new_on = kmalloc(sizeof(*new_on), GFP_KERNEL);
+ if (!new_on) {
+ mutex_unlock(&kernfs_open_file_mutex);
+ return -ENOMEM;
+ }
+ atomic_set(&new_on->event, 1);
+ init_waitqueue_head(&new_on->poll);
+ INIT_LIST_HEAD(&new_on->files);
+ list_add_tail(&of->list, &new_on->files);
+ rcu_assign_pointer(kn->attr.open, new_on);
}
+ mutex_unlock(&kernfs_open_file_mutex);
- /* not there, initialize a new one and retry */
- new_on = kmalloc(sizeof(*new_on), GFP_KERNEL);
- if (!new_on)
- return -ENOMEM;
-
- atomic_set(&new_on->event, 1);
- init_waitqueue_head(&new_on->poll);
- INIT_LIST_HEAD(&new_on->files);
- goto retry;
+ return 0;
}
/**
@@ -566,24 +567,30 @@ static int kernfs_get_open_node(struct kernfs_node *kn,
static void kernfs_put_open_node(struct kernfs_node *kn,
struct kernfs_open_file *of)
{
- struct kernfs_open_node *on = kn->attr.open;
- unsigned long flags;
+ struct kernfs_open_node *on;
+
+ /* ->attr.open NULL means there are no more open files */
+ if (rcu_dereference_raw(kn->attr.open) == NULL)
+ return;
mutex_lock(&kernfs_open_file_mutex);
- spin_lock_irqsave(&kernfs_open_node_lock, flags);
+
+ on = rcu_dereference_protected(kn->attr.open,
+ lockdep_is_held(&kernfs_open_file_mutex));
+
+ if (!on) {
+ mutex_unlock(&kernfs_open_file_mutex);
+ return;
+ }
if (of)
list_del(&of->list);
- if (list_empty(&on->files))
- kn->attr.open = NULL;
- else
- on = NULL;
-
- spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
+ if (list_empty(&on->files)) {
+ rcu_assign_pointer(kn->attr.open, NULL);
+ kfree_rcu(on, rcu_head);
+ }
mutex_unlock(&kernfs_open_file_mutex);
-
- kfree(on);
}
static int kernfs_fop_open(struct inode *inode, struct file *file)
@@ -765,12 +772,13 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
if (!(kn->flags & (KERNFS_HAS_MMAP | KERNFS_HAS_RELEASE)))
return;
- on = kn->attr.open;
- if (!on)
+ if (rcu_dereference_raw(kn->attr.open) == NULL)
return;
mutex_lock(&kernfs_open_file_mutex);
- if (!kn->attr.open) {
+ on = rcu_dereference_check(kn->attr.open,
+ lockdep_is_held(&kernfs_open_file_mutex));
+ if (!on) {
mutex_unlock(&kernfs_open_file_mutex);
return;
}
@@ -805,7 +813,7 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
__poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait)
{
struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry);
- struct kernfs_open_node *on = kn->attr.open;
+ struct kernfs_open_node *on = rcu_dereference_raw(kn->attr.open);
poll_wait(of->file, &on->poll, wait);
@@ -912,14 +920,13 @@ void kernfs_notify(struct kernfs_node *kn)
return;
/* kick poll immediately */
- spin_lock_irqsave(&kernfs_open_node_lock, flags);
- on = kn->attr.open;
+ rcu_read_lock();
+ on = rcu_dereference(kn->attr.open);
if (on) {
atomic_inc(&on->event);
wake_up_interruptible(&on->poll);
}
- spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
-
+ rcu_read_unlock();
/* schedule work to kick fsnotify */
spin_lock_irqsave(&kernfs_notify_lock, flags);
if (!kn->attr.notify_next) {
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index e2ae15a6225e..13f54f078a52 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -114,7 +114,7 @@ struct kernfs_elem_symlink {
struct kernfs_elem_attr {
const struct kernfs_ops *ops;
- struct kernfs_open_node *open;
+ struct kernfs_open_node __rcu *open;
loff_t size;
struct kernfs_node *notify_next; /* for kernfs_notify() */
};
--
2.30.2
Hello Tejun,
On 23/4/22 2:19 am, Tejun Heo wrote:
> On Sun, Apr 10, 2022 at 12:37:11PM +1000, Imran Khan wrote:
>> static int kernfs_seq_show(struct seq_file *sf, void *v)
>> {
>> struct kernfs_open_file *of = sf->private;
>> + struct kernfs_open_node *on = rcu_dereference_raw(of->kn->attr.open);
>
> I suppose raw deref is safe because @on can't go away while @of is alive,
> right?
Yes.
> If that's the case, please factor out of -> on dereferencing into a
> helper and put a comment there explaining why the raw deref is safe.
>
Sure, will put dereferencing in a separate helper in next version.
>> @@ -201,7 +203,8 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
>> goto out_free;
>> }
>>
>> - of->event = atomic_read(&of->kn->attr.open->event);
>> + on = rcu_dereference_raw(of->kn->attr.open);
>
> cuz we don't wanna sprinkle raw derefs in multiple places without
> explanation like this.
>
Agree.
> ...
>> /**
>> @@ -566,24 +567,30 @@ static int kernfs_get_open_node(struct kernfs_node *kn,
>> static void kernfs_put_open_node(struct kernfs_node *kn,
>> struct kernfs_open_file *of)
>> {
>> - struct kernfs_open_node *on = kn->attr.open;
>> - unsigned long flags;
>> + struct kernfs_open_node *on;
>> +
>> + /* ->attr.open NULL means there are no more open files */
>> + if (rcu_dereference_raw(kn->attr.open) == NULL)
>> + return;
>
> For pointer value check, what you want is rcu_access_pointer(). That said,
> tho, why is this being called if no one is linked on it? Before removing the
> refcnt, that'd be the same as trying to put a 0 ref. How does that happen?
Yeah this check surely should not be needed. I will remove it in next
version.
> Also, can you please rename it to unlink or something of the sort? It's
> confusing to call it put when there's no refcnt.
>
sure I will rename _put_ to _unlink_.
>>
>> mutex_lock(&kernfs_open_file_mutex);
>> - spin_lock_irqsave(&kernfs_open_node_lock, flags);
>> +
>> + on = rcu_dereference_protected(kn->attr.open,
>> + lockdep_is_held(&kernfs_open_file_mutex));
>
> Again, a better way to do it would be defining a kn -> on accessor which
> encodes the safe way to deref and use it. The deref rule is tied to the
> deref itself not the callsite.
>
Okay I will factor this out in a separate helper.
[...]
>> static int kernfs_fop_open(struct inode *inode, struct file *file)
>> @@ -765,12 +772,13 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
>> if (!(kn->flags & (KERNFS_HAS_MMAP | KERNFS_HAS_RELEASE)))
>> return;
>>
>> - on = kn->attr.open;
>> - if (!on)
>> + if (rcu_dereference_raw(kn->attr.open) == NULL)
>> return;
>
> rcu_access_pointer again and you gotta explain why the lockless check is
> safe.
>
The lockless check is safe because no one will be adding to ->attr.open
at this point in time. This allows early bail out if ->attr.open is
already NULL. And if kernfs_put_open_node makes ->attr.open NULL, it
does this under open_file_mutex so subsequent check under
open_file_mutex will make sure to bail out if kernfs_put_open_node won
the race.
I will put an explanatory comment in the code, explaining the same.
[...]
>> @@ -912,14 +920,13 @@ void kernfs_notify(struct kernfs_node *kn)
>> return;
>>
>> /* kick poll immediately */
>> - spin_lock_irqsave(&kernfs_open_node_lock, flags);
>> - on = kn->attr.open;
>> + rcu_read_lock();
>> + on = rcu_dereference(kn->attr.open);
>> if (on) {
>> atomic_inc(&on->event);
>> wake_up_interruptible(&on->poll);
>> }
>> - spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
>> -
>> + rcu_read_unlock();
>
> An explanation of why this is safe in terms of event ordering would be great
> here.
>
This is safe because we don't need to refcnt ->on in this case. If
writer (kernfs_put_open_node) has already made ->attr.open NULL we will
bail out. If kernfs_notify got an old ->attr.open we can still safely
process the event, even if kernfs_put_open_node updates ->attr.open to
NULL in parallel.
In both the cases the behaviour/order will be same as earlier code that
used kernfs_open_node_lock.
Please let me know if this answers your query or if something is still
missing.
Thanks
-- Imran