2024-04-26 19:55:51

by Lucas Karpinski

[permalink] [raw]
Subject: [RFC v2 0/1] fs/namespace: defer RCU sync for MNT_DETACH umount

Hi all,

Attached is v2 of the umount optimization. Please take a look at v1 for
the original introduction to the problem. Al made it clear in the
previous RFC that if a filesystem is shut down by umount(2), that the
shut down needs to be completed before the return from the syscall.

The change in this version looks to address that by only deferring the
release on lazy umounts.

Lucas

v2:
- Only defer releasing umount'ed filesystems for lazy umounts
v1: https://lore.kernel.org/all/[email protected]/

Lucas Karpinski (1):
fs/namespace: defer RCU sync for MNT_DETACH umount

fs/namespace.c | 51 ++++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 45 insertions(+), 6 deletions(-)

--
2.44.0



2024-04-26 19:56:04

by Lucas Karpinski

[permalink] [raw]
Subject: [RFC v2 1/1] fs/namespace: defer RCU sync for MNT_DETACH umount

Use call_rcu to defer releasing the detached filesystem when calling
namespace_unlock() during a lazy umount.

When detaching (MNT_DETACH) a filesystem, it should not be necessary to
wait for the grace period before completing the syscall. The
expectation that the filesystem is shut down by the time the syscall
returns does not apply in this case.

Calling synchronize_rcu_expedited() has a significant cost on RT kernel
that default to rcupdate.rcu_normal_after_boot=1. The struct mount
umount'ed are queued up for release in a separate list
once the grace period completes while no longer accessible to following
syscalls.

Without patch, on 6.9.0-rc2-rt kernel:
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
0.02756 +- 0.00299 seconds time elapsed ( +- 10.84% )
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
0.04422 +- 0.00521 seconds time elapsed ( +- 11.79% )

With patch, on 6.9.0-rc2-rt kernel:
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount mnt
0.02852 +- 0.00377 seconds time elapsed ( +- 13.20% )
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
0.0030812 +- 0.0000524 seconds time elapsed ( +- 1.70% )

Signed-off-by: Alexander Larsson <[email protected]>
Signed-off-by: Eric Chanudet <[email protected]>
Signed-off-by: Lucas Karpinski <[email protected]>
---
fs/namespace.c | 51 ++++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 45 insertions(+), 6 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 5a51315c6678..df03fc0d1990 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -45,6 +45,11 @@ static unsigned int m_hash_shift __ro_after_init;
static unsigned int mp_hash_mask __ro_after_init;
static unsigned int mp_hash_shift __ro_after_init;

+struct mount_delayed_release {
+ struct rcu_head rcu;
+ struct hlist_head release_list;
+};
+
static __initdata unsigned long mhash_entries;
static int __init set_mhash_entries(char *str)
{
@@ -78,6 +83,7 @@ static struct kmem_cache *mnt_cache __ro_after_init;
static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
+static bool lazy_unlock = false; /* protected by namespace_sem */

struct mount_kattr {
unsigned int attr_set;
@@ -1553,16 +1559,39 @@ int may_umount(struct vfsmount *mnt)

EXPORT_SYMBOL(may_umount);

-static void namespace_unlock(void)
+static void free_mounts(struct hlist_head *mount_list)
{
- struct hlist_head head;
struct hlist_node *p;
struct mount *m;
+
+ hlist_for_each_entry_safe(m, p, mount_list, mnt_umount) {
+ hlist_del(&m->mnt_umount);
+ mntput(&m->mnt);
+ }
+}
+
+static void delayed_mount_release(struct rcu_head *head)
+{
+ struct mount_delayed_release *drelease =
+ container_of(head, struct mount_delayed_release, rcu);
+
+ free_mounts(&drelease->release_list);
+ kfree(drelease);
+}
+
+static void namespace_unlock(void)
+{
+ HLIST_HEAD(head);
LIST_HEAD(list);
+ bool lazy;
+

hlist_move_list(&unmounted, &head);
list_splice_init(&ex_mountpoints, &list);

+ lazy = lazy_unlock;
+ lazy_unlock = false;
+
up_write(&namespace_sem);

shrink_dentry_list(&list);
@@ -1570,12 +1599,21 @@ static void namespace_unlock(void)
if (likely(hlist_empty(&head)))
return;

- synchronize_rcu_expedited();
+ if (lazy) {
+ struct mount_delayed_release *drelease =
+ kmalloc(sizeof(*drelease), GFP_KERNEL);

- hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
- hlist_del(&m->mnt_umount);
- mntput(&m->mnt);
+ if (unlikely(!drelease))
+ goto out;
+
+ hlist_move_list(&head, &drelease->release_list);
+ call_rcu(&drelease->rcu, delayed_mount_release);
+ return;
}
+
+out:
+ synchronize_rcu_expedited();
+ free_mounts(&head);
}

static inline void namespace_lock(void)
@@ -1798,6 +1836,7 @@ static int do_umount(struct mount *mnt, int flags)
}
out:
unlock_mount_hash();
+ lazy_unlock = flags & MNT_DETACH ? true : false;
namespace_unlock();
return retval;
}
--
2.44.0


2024-04-26 20:10:07

by Al Viro

[permalink] [raw]
Subject: Re: [RFC v2 1/1] fs/namespace: defer RCU sync for MNT_DETACH umount

On Fri, Apr 26, 2024 at 03:53:48PM -0400, Lucas Karpinski wrote:

> -static void namespace_unlock(void)
> +static void free_mounts(struct hlist_head *mount_list)
> {
> - struct hlist_head head;
> struct hlist_node *p;
> struct mount *m;
> +
> + hlist_for_each_entry_safe(m, p, mount_list, mnt_umount) {
> + hlist_del(&m->mnt_umount);
> + mntput(&m->mnt);

.. which may block in quite a few ways.

> + }
> +}
> +
> +static void delayed_mount_release(struct rcu_head *head)
> +{
> + struct mount_delayed_release *drelease =
> + container_of(head, struct mount_delayed_release, rcu);
> +
> + free_mounts(&drelease->release_list);

.. and therefore so can this.

> + kfree(drelease);
> +}


> + call_rcu(&drelease->rcu, delayed_mount_release);

.. which is a bad idea, since call_rcu() callbacks are run
from interrupt context. Which makes blocking in them a problem.

2024-04-30 13:29:21

by Lucas Karpinski

[permalink] [raw]
Subject: Re: [RFC v2 1/1] fs/namespace: defer RCU sync for MNT_DETACH umount

On Fri, Apr 26, 2024 at 09:09:41PM +0100, Al Viro wrote:
> > + call_rcu(&drelease->rcu, delayed_mount_release);
>
> ... which is a bad idea, since call_rcu() callbacks are run
> from interrupt context. Which makes blocking in them a problem.
>

Thanks for the quick review.

Documentation/RCU/checklist.rst suggests switching to queue_rcu_work()
function in scenarios where the callback function can block. This seems
like it would fix the issue you found, while still providing similar
performance improvements.

workqueue:
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
0.003066 +- 0.000307 seconds time elapsed ( +- 10.02% )

callrcu:
perf stat -r 10 --null --pre 'mount -t tmpfs tmpfs mnt' -- umount -l mnt
0.0030812 +- 0.0000524 seconds time elapsed ( +- 1.70% )

Regards,
Lucas



2024-04-30 14:27:27

by kernel test robot

[permalink] [raw]
Subject: Re: [RFC v2 1/1] fs/namespace: defer RCU sync for MNT_DETACH umount



Hello,

kernel test robot noticed "WARNING:inconsistent_lock_state" on:

commit: 9a4131bb20fbeb5980c3e21709488b9c4ef2eee5 ("[RFC v2 1/1] fs/namespace: defer RCU sync for MNT_DETACH umount")
url: https://github.com/intel-lab-lkp/linux/commits/Lucas-Karpinski/fs-namespace-defer-RCU-sync-for-MNT_DETACH-umount/20240427-035724
base: https://git.kernel.org/cgit/linux/kernel/git/vfs/vfs.git vfs.all
patch link: https://lore.kernel.org/all/[email protected]/
patch subject: [RFC v2 1/1] fs/namespace: defer RCU sync for MNT_DETACH umount

in testcase: boot

compiler: gcc-7
test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G

(please refer to attached dmesg/kmsg for entire log/backtrace)


+---------------------------------------------------+------------+------------+
| | eea3260250 | 9a4131bb20 |
+---------------------------------------------------+------------+------------+
| WARNING:inconsistent_lock_state | 0 | 12 |
| inconsistent{SOFTIRQ-ON-W}->{IN-SOFTIRQ-W}usage | 0 | 12 |
+---------------------------------------------------+------------+------------+


If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-lkp/[email protected]


[ 19.407878][ C0]
[ 19.408164][ C0] ================================
[ 19.408629][ C0] WARNING: inconsistent lock state
[ 19.409083][ C0] 6.9.0-rc3-00075-g9a4131bb20fb #2 Tainted: G W TN
[ 19.409757][ C0] --------------------------------
[ 19.410209][ C0] inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
[ 19.410777][ C0] ksoftirqd/0/10 [HC0[0]:SC1[3]:HE1:SE0] takes:
[ 19.411306][ C0] ffffffff83066cd0 (mount_lock){+.?.}-{2:2}, at: mntput_no_expire (include/linux/seqlock.h:469 include/linux/seqlock.h:495 include/linux/seqlock.h:823 fs/namespace.c:114 fs/namespace.c:1314)
[ 19.412072][ C0] {SOFTIRQ-ON-W} state was registered at:
[ 19.412574][ C0] lock_acquire (kernel/locking/lockdep.c:467 kernel/locking/lockdep.c:5756)
[ 19.412999][ C0] _raw_spin_lock (include/linux/spinlock_api_smp.h:134 kernel/locking/spinlock.c:154)
[ 19.413421][ C0] vfs_create_mount (include/linux/seqlock.h:469 include/linux/seqlock.h:495 include/linux/seqlock.h:823 fs/namespace.c:114 fs/namespace.c:1122)
[ 19.413862][ C0] fc_mount (fs/namespace.c:1134)
[ 19.414246][ C0] vfs_kern_mount (fs/namespace.c:1148 fs/namespace.c:1181)
[ 19.414713][ C0] kern_mount (fs/namespace.c:5248)
[ 19.415106][ C0] shmem_init (mm/shmem.c:4684)
[ 19.415508][ C0] mnt_init (fs/namespace.c:5232)
[ 19.415907][ C0] vfs_caches_init (fs/dcache.c:3187)
[ 19.416333][ C0] start_kernel (init/main.c:1058)
[ 19.416760][ C0] x86_64_start_reservations (arch/x86/kernel/head64.c:495)
[ 19.417245][ C0] x86_64_start_kernel (arch/x86/kernel/head64.c:488)
[ 19.417694][ C0] common_startup_64 (arch/x86/kernel/head_64.S:421)
[ 19.418133][ C0] irq event stamp: 868370
[ 19.418530][ C0] hardirqs last enabled at (868370): __local_bh_enable_ip (arch/x86/include/asm/paravirt.h:698 kernel/softirq.c:387)
[ 19.419370][ C0] hardirqs last disabled at (868369): __local_bh_enable_ip (kernel/softirq.c:364 (discriminator 1))
[ 19.420216][ C0] softirqs last enabled at (868292): __do_softirq (arch/x86/include/asm/preempt.h:26 kernel/softirq.c:401 kernel/softirq.c:583)
[ 19.421025][ C0] softirqs last disabled at (868297): run_ksoftirqd (kernel/softirq.c:411 kernel/softirq.c:925)
[ 19.421823][ C0]
[ 19.421823][ C0] other info that might help us debug this:
[ 19.422522][ C0] Possible unsafe locking scenario:
[ 19.422522][ C0]
[ 19.423184][ C0] CPU0
[ 19.423510][ C0] ----
[ 19.423838][ C0] lock(mount_lock);
[ 19.424225][ C0] <Interrupt>
[ 19.424567][ C0] lock(mount_lock);
[ 19.425115][ C0]
[ 19.425115][ C0] *** DEADLOCK ***
[ 19.425115][ C0]
[ 19.425858][ C0] 2 locks held by ksoftirqd/0/10:
[ 19.426297][ C0] #0: ffffffff8301d700 (rcu_callback){....}-{0:0}, at: rcu_process_callbacks (include/linux/bottom_half.h:20 kernel/rcu/tiny.c:133)
[ 19.427122][ C0] #1: ffffffff8301d7c0 (rcu_read_lock){....}-{1:2}, at: mntput_no_expire (include/linux/rcupdate.h:329 include/linux/rcupdate.h:781 fs/namespace.c:1299)
[ 19.427921][ C0]
[ 19.427921][ C0] stack backtrace:
[ 19.428459][ C0] CPU: 0 PID: 10 Comm: ksoftirqd/0 Tainted: G W TN 6.9.0-rc3-00075-g9a4131bb20fb #2
[ 19.429328][ C0] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
[ 19.430179][ C0] Call Trace:
[ 19.430505][ C0] <TASK>
[ 19.430805][ C0] dump_stack_lvl (arch/x86/include/asm/paravirt.h:687 (discriminator 3) arch/x86/include/asm/irqflags.h:127 (discriminator 3) lib/dump_stack.c:117 (discriminator 3))
[ 19.431221][ C0] mark_lock+0x473/0x4c0
[ 19.431661][ C0] ? __lock_acquire (kernel/locking/lockdep.c:4599 kernel/locking/lockdep.c:5091)
[ 19.432102][ C0] ? __lock_acquire (include/linux/hash.h:78 kernel/locking/lockdep.c:3759 kernel/locking/lockdep.c:3782 kernel/locking/lockdep.c:3837 kernel/locking/lockdep.c:5137)
[ 19.432550][ C0] ? __lock_acquire (kernel/locking/lockdep.c:4567 kernel/locking/lockdep.c:5091)
[ 19.432989][ C0] __lock_acquire (kernel/locking/lockdep.c:4567 kernel/locking/lockdep.c:5091)
[ 19.433415][ C0] ? __lock_acquire (kernel/locking/lockdep.c:5134 (discriminator 9))
[ 19.433856][ C0] lock_acquire (kernel/locking/lockdep.c:467 kernel/locking/lockdep.c:5756)
[ 19.434269][ C0] ? mntput_no_expire (include/linux/seqlock.h:469 include/linux/seqlock.h:495 include/linux/seqlock.h:823 fs/namespace.c:114 fs/namespace.c:1314)
[ 19.434708][ C0] ? mntput_no_expire (include/linux/rcupdate.h:329 include/linux/rcupdate.h:781 fs/namespace.c:1299)
[ 19.435144][ C0] _raw_spin_lock (include/linux/spinlock_api_smp.h:134 kernel/locking/spinlock.c:154)
[ 19.435555][ C0] ? mntput_no_expire (include/linux/seqlock.h:469 include/linux/seqlock.h:495 include/linux/seqlock.h:823 fs/namespace.c:114 fs/namespace.c:1314)
[ 19.435994][ C0] mntput_no_expire (include/linux/seqlock.h:469 include/linux/seqlock.h:495 include/linux/seqlock.h:823 fs/namespace.c:114 fs/namespace.c:1314)
[ 19.436421][ C0] ? rcu_process_callbacks (include/linux/bottom_half.h:20 kernel/rcu/tiny.c:133)
[ 19.436900][ C0] free_mounts (fs/namespace.c:1571)
[ 19.437291][ C0] ? __x64_sys_mount_setattr (fs/namespace.c:1574)
[ 19.437772][ C0] ? __x64_sys_mount_setattr (fs/namespace.c:1574)
[ 19.438253][ C0] delayed_mount_release (fs/namespace.c:1579)
[ 19.438699][ C0] rcu_process_callbacks (include/linux/rcupdate.h:339 kernel/rcu/tiny.c:103 kernel/rcu/tiny.c:134)
[ 19.439151][ C0] __do_softirq (arch/x86/include/asm/atomic.h:23 include/linux/atomic/atomic-arch-fallback.h:457 include/linux/jump_label.h:260 include/linux/jump_label.h:270 include/trace/events/irq.h:142 kernel/softirq.c:555)
[ 19.439556][ C0] ? smpboot_thread_fn (kernel/smpboot.c:112)
[ 19.439997][ C0] ? sort_range (kernel/smpboot.c:107)
[ 19.440389][ C0] run_ksoftirqd (kernel/softirq.c:411 kernel/softirq.c:925)
[ 19.443032][ C0] smpboot_thread_fn (kernel/smpboot.c:164 (discriminator 3))
[ 19.443510][ C0] kthread (kernel/kthread.c:388)
[ 19.443887][ C0] ? kthread_complete_and_exit (kernel/kthread.c:341)
[ 19.444367][ C0] ret_from_fork (arch/x86/kernel/process.c:153)
[ 19.444788][ C0] ? kthread_complete_and_exit (kernel/kthread.c:341)
[ 19.445269][ C0] ret_from_fork_asm (arch/x86/entry/entry_64.S:253)
[ 19.445696][ C0] </TASK>


The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20240430/[email protected]



--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki


2024-05-01 15:02:01

by Ian Kent

[permalink] [raw]
Subject: Re: [RFC v2 1/1] fs/namespace: defer RCU sync for MNT_DETACH umount

On 30/4/24 21:25, Lucas Karpinski wrote:
> On Fri, Apr 26, 2024 at 09:09:41PM +0100, Al Viro wrote:
>>> + call_rcu(&drelease->rcu, delayed_mount_release);
>> ... which is a bad idea, since call_rcu() callbacks are run
>> from interrupt context. Which makes blocking in them a problem.
>>
> Thanks for the quick review.
>
> Documentation/RCU/checklist.rst suggests switching to queue_rcu_work()
> function in scenarios where the callback function can block. This seems
> like it would fix the issue you found, while still providing similar
> performance improvements.

You know I've been looking at this and you can see that mntput() will
just call

mntput_no_expire() which queues work to do the bulk of the work and returns.


So I'm wondering what would happen to the timing if you simply didn't
call the

rcu wait for the lazy umount case and left everything else as it is.


Ian