Direct calls to local_irq_save()/local_irq_restore() and
preempt_disable()/preempt_enable() are not appropriate for PREEMPT_RT.
To provide better PREEMPT_RT support, add a local_lock_t to struct
memcg_stock_pcp that covers the whole structure, including the embedded
obj_stock structures, and replace local_irq_save()/local_irq_restore()
with local_lock_irqsave()/local_unlock_irqrestore().
Also disable the task/interrupt context optimization for obj_stock, as
it provides no performance gain on PREEMPT_RT. In that case the task
obj_stock is still present but remains unused, and
preempt_disable()/preempt_enable() are not called.
Note that preempt_disable()/preempt_enable() in get_obj_stock() and
put_obj_stock() are not replaced by local_lock()/local_unlock(), because
a task accessing task_obj may be interrupted and then access irq_obj on
the same CPU. Using local_lock() for task_obj access could therefore
trigger a lockdep splat, and using a separate local lock for each
obj_stock would complicate the interaction between obj_stock and the
embedding memcg_stock_pcp structure.
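As a rough sketch of the conversion pattern (hypothetical structure and
function names, not the actual memcontrol.c code):

#include <linux/local_lock.h>
#include <linux/percpu.h>

struct pcp_counter {
        local_lock_t lock;      /* protects the whole per-CPU structure */
        unsigned int count;
};

static DEFINE_PER_CPU(struct pcp_counter, my_counter) = {
        .lock = INIT_LOCAL_LOCK(lock),
};

static void my_counter_add(unsigned int n)
{
        unsigned long flags;

        /* was: local_irq_save(flags); */
        local_lock_irqsave(&my_counter.lock, flags);
        this_cpu_add(my_counter.count, n);
        /* was: local_irq_restore(flags); */
        local_unlock_irqrestore(&my_counter.lock, flags);
}

On !PREEMPT_RT this still disables interrupts as before (plus lockdep
annotation); on PREEMPT_RT the local_lock_t maps to a per-CPU spinlock,
so the section stays preemptible while keeping mutual exclusion.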
Signed-off-by: Waiman Long <[email protected]>
---
mm/memcontrol.c | 51 +++++++++++++++++++++++++++++++------------------
1 file changed, 32 insertions(+), 19 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a09a7d2e0b1b..2c690eceda54 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2096,7 +2096,12 @@ struct obj_stock {
#endif
};
+/*
+ * The local_lock protects the whole memcg_stock_pcp structure including
+ * the embedded obj_stock structures.
+ */
struct memcg_stock_pcp {
+ local_lock_t lock;
struct mem_cgroup *cached; /* this never be root cgroup */
unsigned int nr_pages;
struct obj_stock task_obj;
@@ -2145,7 +2150,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
if (nr_pages > MEMCG_CHARGE_BATCH)
return ret;
- local_irq_save(flags);
+ local_lock_irqsave(&memcg_stock.lock, flags);
stock = this_cpu_ptr(&memcg_stock);
if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
@@ -2153,7 +2158,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
ret = true;
}
- local_irq_restore(flags);
+ local_unlock_irqrestore(&memcg_stock.lock, flags);
return ret;
}
@@ -2189,7 +2194,7 @@ static void drain_local_stock(struct work_struct *dummy)
* drain_stock races is that we always operate on local CPU stock
* here with IRQ disabled
*/
- local_irq_save(flags);
+ local_lock_irqsave(&memcg_stock.lock, flags);
stock = this_cpu_ptr(&memcg_stock);
drain_obj_stock(&stock->irq_obj);
@@ -2198,7 +2203,7 @@ static void drain_local_stock(struct work_struct *dummy)
drain_stock(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&memcg_stock.lock, flags);
}
/*
@@ -2210,7 +2215,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
struct memcg_stock_pcp *stock;
unsigned long flags;
- local_irq_save(flags);
+ local_lock_irqsave(&memcg_stock.lock, flags);
stock = this_cpu_ptr(&memcg_stock);
if (stock->cached != memcg) { /* reset if necessary */
@@ -2223,7 +2228,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
if (stock->nr_pages > MEMCG_CHARGE_BATCH)
drain_stock(stock);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&memcg_stock.lock, flags);
}
/*
@@ -2779,29 +2784,34 @@ static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
* which is cheap in non-preempt kernel. The interrupt context object stock
* can only be accessed after disabling interrupt. User context code can
* access interrupt object stock, but not vice versa.
+ *
+ * This task and interrupt context optimization is disabled for PREEMPT_RT
+ * as there is no performance gain in this case and changes will be made to
+ * irq_obj only.
+ *
+ * For non-PREEMPT_RT, we are not replacing preempt_disable() by local_lock()
+ * as nesting of task_obj and irq_obj are allowed which may cause lockdep
+ * splat if local_lock() is used. Using separate local locks will complicate
+ * the interaction between obj_stock and the broader memcg_stock object.
*/
static inline struct obj_stock *get_obj_stock(unsigned long *pflags)
{
- struct memcg_stock_pcp *stock;
-
- if (likely(in_task())) {
+ if (likely(in_task()) && !IS_ENABLED(CONFIG_PREEMPT_RT)) {
*pflags = 0UL;
preempt_disable();
- stock = this_cpu_ptr(&memcg_stock);
- return &stock->task_obj;
+ return this_cpu_ptr(&memcg_stock.task_obj);
}
- local_irq_save(*pflags);
- stock = this_cpu_ptr(&memcg_stock);
- return &stock->irq_obj;
+ local_lock_irqsave(&memcg_stock.lock, *pflags);
+ return this_cpu_ptr(&memcg_stock.irq_obj);
}
static inline void put_obj_stock(unsigned long flags)
{
- if (likely(in_task()))
+ if (likely(in_task()) && !IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_enable();
else
- local_irq_restore(flags);
+ local_unlock_irqrestore(&memcg_stock.lock, flags);
}
/*
@@ -7088,9 +7098,12 @@ static int __init mem_cgroup_init(void)
cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
memcg_hotplug_cpu_dead);
- for_each_possible_cpu(cpu)
- INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
- drain_local_stock);
+ for_each_possible_cpu(cpu) {
+ struct memcg_stock_pcp *stock = per_cpu_ptr(&memcg_stock, cpu);
+
+ INIT_WORK(&stock->work, drain_local_stock);
+ local_lock_init(&stock->lock);
+ }
for_each_node(node) {
struct mem_cgroup_tree_per_node *rtpn;
--
2.27.0
On Tue, 14 Dec 2021, Waiman Long wrote:
>@@ -2189,7 +2194,7 @@ static void drain_local_stock(struct work_struct *dummy)
> * drain_stock races is that we always operate on local CPU stock
> * here with IRQ disabled
> */
>- local_irq_save(flags);
>+ local_lock_irqsave(&memcg_stock.lock, flags);
>
> stock = this_cpu_ptr(&memcg_stock);
> drain_obj_stock(&stock->irq_obj);
So here there is still the problem that you can end up taking sleeping locks
with irqs disabled via obj_cgroup_put() >> obj_cgroup_release() - ie: the
percpu_ref_switch_lock and css_set_lock. It had occurred to me to promote
the former to a raw spinlock, but doubt we can get away with the latter.
Thanks,
Davidlohr
On 12/14/21 23:36, Davidlohr Bueso wrote:
> On Tue, 14 Dec 2021, Waiman Long wrote:
>
>> @@ -2189,7 +2194,7 @@ static void drain_local_stock(struct
>> work_struct *dummy)
>> * drain_stock races is that we always operate on local CPU stock
>> * here with IRQ disabled
>> */
>> - local_irq_save(flags);
>> + local_lock_irqsave(&memcg_stock.lock, flags);
>>
>> stock = this_cpu_ptr(&memcg_stock);
>> drain_obj_stock(&stock->irq_obj);
>
> So here there is still the problem that you can end up taking sleeping
> locks
> with irqs disabled via obj_cgroup_put() >> obj_cgroup_release() - ie: the
> percpu_ref_switch_lock and css_set_lock. It had occurred to me to promote
> the former to a raw spinlock, but doubt we can get away with the latter.
Yes, it is tricky to avoid all these. One possibility is to delay
obj_cgroup_release() through RCU. However, that is outside the scope of
this patch.
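For illustration only, a sketch of what such a deferral could look like;
every name here other than obj_cgroup_put(), call_rcu(), kmalloc() and
kfree() is hypothetical, and whether the release path is actually safe
to run from an RCU callback is exactly the open question:

struct deferred_objcg_put {
        struct rcu_head rcu;
        struct obj_cgroup *objcg;
};

static void deferred_objcg_put_fn(struct rcu_head *head)
{
        struct deferred_objcg_put *dp =
                container_of(head, struct deferred_objcg_put, rcu);

        /* Runs after a grace period, outside the stock lock / IRQ-off
         * section, so the final put no longer nests under it. */
        obj_cgroup_put(dp->objcg);
        kfree(dp);
}

static void defer_objcg_put(struct obj_cgroup *objcg)
{
        struct deferred_objcg_put *dp = kmalloc(sizeof(*dp), GFP_ATOMIC);

        if (!dp) {
                obj_cgroup_put(objcg);  /* fall back to the direct put */
                return;
        }
        dp->objcg = objcg;
        call_rcu(&dp->rcu, deferred_objcg_put_fn);
}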
Cheers,
Longman
On 2021-12-14 09:44:12 [-0500], Waiman Long wrote:
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2096,7 +2096,12 @@ struct obj_stock {
> #endif
> };
>
> +/*
> + * The local_lock protects the whole memcg_stock_pcp structure including
> + * the embedded obj_stock structures.
> + */
> struct memcg_stock_pcp {
> + local_lock_t lock;
> struct mem_cgroup *cached; /* this never be root cgroup */
> unsigned int nr_pages;
> struct obj_stock task_obj;
> @@ -2145,7 +2150,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
> if (nr_pages > MEMCG_CHARGE_BATCH)
> return ret;
>
> - local_irq_save(flags);
> + local_lock_irqsave(&memcg_stock.lock, flags);
This still does not explain why the lock is acquired here where it
appears to be unrelated to memcg_stock.lock.
>
> stock = this_cpu_ptr(&memcg_stock);
> if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
> @@ -2779,29 +2784,34 @@ static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
> * which is cheap in non-preempt kernel. The interrupt context object stock
> * can only be accessed after disabling interrupt. User context code can
> * access interrupt object stock, but not vice versa.
> + *
> + * This task and interrupt context optimization is disabled for PREEMPT_RT
> + * as there is no performance gain in this case and changes will be made to
> + * irq_obj only.
> + *
> + * For non-PREEMPT_RT, we are not replacing preempt_disable() by local_lock()
> + * as nesting of task_obj and irq_obj are allowed which may cause lockdep
> + * splat if local_lock() is used. Using separate local locks will complicate
> + * the interaction between obj_stock and the broader memcg_stock object.
> */
> static inline struct obj_stock *get_obj_stock(unsigned long *pflags)
> {
> - struct memcg_stock_pcp *stock;
> -
> - if (likely(in_task())) {
> + if (likely(in_task()) && !IS_ENABLED(CONFIG_PREEMPT_RT)) {
> *pflags = 0UL;
> preempt_disable();
> - stock = this_cpu_ptr(&memcg_stock);
> - return &stock->task_obj;
> + return this_cpu_ptr(&memcg_stock.task_obj);
Do we need to keep the memcg_stock.task_obj for !RT?
I'm not really convinced that disabling either preemption or interrupts
is so much better compared to actual locking with lockdep annotation.
Looking at the history, I'm also impressed by the fact that
disabling/enabling interrupts is *so* expensive that all this is
actually worth it.
> }
>
> - local_irq_save(*pflags);
> - stock = this_cpu_ptr(&memcg_stock);
> - return &stock->irq_obj;
> + local_lock_irqsave(&memcg_stock.lock, *pflags);
> + return this_cpu_ptr(&memcg_stock.irq_obj);
> }
>
Sebastian
On 2021-12-17 12:42:51 [+0100], To Waiman Long wrote:
> On 2021-12-14 09:44:12 [-0500], Waiman Long wrote:
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -2096,7 +2096,12 @@ struct obj_stock {
so I made something based on this. Let me clean this up and test it
first. So it is just a heads up :)
Sebastian
On 12/17/21 06:42, Sebastian Andrzej Siewior wrote:
> On 2021-12-14 09:44:12 [-0500], Waiman Long wrote:
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -2096,7 +2096,12 @@ struct obj_stock {
>> #endif
>> };
>>
>> +/*
>> + * The local_lock protects the whole memcg_stock_pcp structure including
>> + * the embedded obj_stock structures.
>> + */
>> struct memcg_stock_pcp {
>> + local_lock_t lock;
>> struct mem_cgroup *cached; /* this never be root cgroup */
>> unsigned int nr_pages;
>> struct obj_stock task_obj;
>> @@ -2145,7 +2150,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
>> if (nr_pages > MEMCG_CHARGE_BATCH)
>> return ret;
>>
>> - local_irq_save(flags);
>> + local_lock_irqsave(&memcg_stock.lock, flags);
> This still does not explain why the lock is acquired here where it
> appears to be unrelated to memcg_stock.lock.
consume_stock() can be called in both task and irq context. The irq
context may include softirq, where interrupts may have been re-enabled
and the code can get interrupted again. The original code just does a
local_irq_save() without documenting why, so I didn't see a need to add
a comment about that.
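To spell that out, a deliberately broken, hypothetical variant (not in
the patch) shows why disabling preemption alone would not be enough here:

static bool consume_stock_broken(struct mem_cgroup *memcg,
                                 unsigned int nr_pages)
{
        struct memcg_stock_pcp *stock;
        bool ret = false;

        preempt_disable();      /* stops preemption/migration, not IRQs */
        stock = this_cpu_ptr(&memcg_stock);
        if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
                /* An interrupt landing here can call consume_stock() or
                 * refill_stock() on this CPU and change stock->nr_pages,
                 * so the update below would work on stale data. */
                stock->nr_pages -= nr_pages;
                ret = true;
        }
        preempt_enable();
        return ret;
}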
>>
>> stock = this_cpu_ptr(&memcg_stock);
>> if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
>> @@ -2779,29 +2784,34 @@ static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
>> * which is cheap in non-preempt kernel. The interrupt context object stock
>> * can only be accessed after disabling interrupt. User context code can
>> * access interrupt object stock, but not vice versa.
>> + *
>> + * This task and interrupt context optimization is disabled for PREEMPT_RT
>> + * as there is no performance gain in this case and changes will be made to
>> + * irq_obj only.
>> + *
>> + * For non-PREEMPT_RT, we are not replacing preempt_disable() by local_lock()
>> + * as nesting of task_obj and irq_obj are allowed which may cause lockdep
>> + * splat if local_lock() is used. Using separate local locks will complicate
>> + * the interaction between obj_stock and the broader memcg_stock object.
>> */
>> static inline struct obj_stock *get_obj_stock(unsigned long *pflags)
>> {
>> - struct memcg_stock_pcp *stock;
>> -
>> - if (likely(in_task())) {
>> + if (likely(in_task()) && !IS_ENABLED(CONFIG_PREEMPT_RT)) {
>> *pflags = 0UL;
>> preempt_disable();
>> - stock = this_cpu_ptr(&memcg_stock);
>> - return &stock->task_obj;
>> + return this_cpu_ptr(&memcg_stock.task_obj);
> Do we need to keep the memcg_stock.task_obj for !RT?
> I'm not really convinced that disabling either preemption or interrupts
> is so much better compared to actual locking with lockdep annotation.
> Looking at the history, I'm also impressed by the fact that
> disabling/enabling interrupts is *so* expensive that all this is
> actually worth it.
For !RT with voluntary or no preemption, preempt_disable() is just a
compiler barrier, so it is definitely cheaper than disabling interrupts.
The performance benefit is smaller with a preemptible but !RT kernel.
Microbenchmark testing shows a performance improvement of a few percent
depending on the exact benchmark.
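For reference, the definitions in include/linux/preempt.h reduce to
roughly the following (paraphrased; see the tree for the exact form):

#ifdef CONFIG_PREEMPT_COUNT             /* e.g. PREEMPT, PREEMPT_RT */
#define preempt_disable() \
do { \
        preempt_count_inc(); \
        barrier(); \
} while (0)
/* preempt_enable() additionally checks whether a reschedule is due */
#else                                   /* voluntary or no preemption */
#define preempt_disable()       barrier()
#define preempt_enable()        barrier()
#endif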
Cheers,
Longman
On 2021-12-17 13:46:53 [-0500], Waiman Long wrote:
> > annotation. Looking at the history, I'm also impressed by the fact that
> > disabling/enabling interrupts is *so* expensive that all this is
> > actually worth it.
>
> For !RT with voluntary or no preemption, preempt_disable() is just a
> compiler barrier, so it is definitely cheaper than disabling interrupts.
> The performance benefit is smaller with a preemptible but !RT kernel.
> Microbenchmark testing shows a performance improvement of a few percent
> depending on the exact benchmark.
Thanks for confirming. I got the feeling that this optimisation is for
!CONFIG_PREEMPTION. So instead of depending on CONFIG_PREEMPT_RT, I'm
leaning towards CONFIG_PREEMPT instead.
> Cheers,
> Longman
Sebastian
Greeting,
FYI, we noticed the following commit (built with gcc-9):
commit: 3928ba024a93a8556532c3c53e651169be83c061 ("[PATCH-next v3] mm/memcg: Properly handle memcg_stock access for PREEMPT_RT")
url: https://github.com/0day-ci/linux/commits/Waiman-Long/mm-memcg-Properly-handle-memcg_stock-access-for-PREEMPT_RT/20211214-224558
patch link: https://lore.kernel.org/lkml/[email protected]
in testcase: boot
on test machine: qemu-system-x86_64 -enable-kvm -cpu Icelake-Server -smp 4 -m 16G
caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):
If you fix the issue, kindly add following tag
Reported-by: kernel test robot <[email protected]>
[ 9.553151][ C0] WARNING: possible recursive locking detected
[ 9.553540][ C0] 5.16.0-rc5-00001-g3928ba024a93 #1 Not tainted
[ 9.553934][ C0] --------------------------------------------
[ 9.554331][ C0] ksoftirqd/0/12 is trying to acquire lock:
[ 9.554690][ C0] ffffffffad262940 (&stock->lock){..-.}-{2:2}, at: local_lock_acquire (include/linux/local_lock_internal.h:28)
[ 9.555355][ C0]
[ 9.555355][ C0] but task is already holding lock:
[ 9.555804][ C0] ffffffffad262940 (&stock->lock){..-.}-{2:2}, at: local_lock_acquire (include/linux/local_lock_internal.h:28)
[ 9.556418][ C0]
[ 9.556418][ C0] other info that might help us debug this:
[ 9.556917][ C0] Possible unsafe locking scenario:
[ 9.556917][ C0]
[ 9.557370][ C0] CPU0
[ 9.557570][ C0] ----
[ 9.557771][ C0] lock(&stock->lock);
[ 9.558026][ C0] lock(&stock->lock);
[ 9.558280][ C0]
[ 9.558280][ C0] *** DEADLOCK ***
[ 9.558280][ C0]
[ 9.558780][ C0] May be due to missing lock nesting notation
[ 9.558780][ C0]
[ 9.559305][ C0] 2 locks held by ksoftirqd/0/12:
[ 9.559611][ C0] #0: ffffffffad221580 (rcu_callback){....}-{0:0}, at: rcu_process_callbacks (kernel/rcu/tiny.c:129)
[ 9.560212][ C0] #1: ffffffffad262940 (&stock->lock){..-.}-{2:2}, at: local_lock_acquire (include/linux/local_lock_internal.h:28)
[ 9.560848][ C0]
[ 9.560848][ C0] stack backtrace:
[ 9.561207][ C0] CPU: 0 PID: 12 Comm: ksoftirqd/0 Not tainted 5.16.0-rc5-00001-g3928ba024a93 #1
[ 9.561764][ C0] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
[ 9.562329][ C0] Call Trace:
[ 9.562553][ C0] <TASK>
[ 9.562736][ C0] validate_chain (kernel/locking/lockdep.c:3790)
[ 9.563035][ C0] __lock_acquire (kernel/locking/lockdep.c:5027)
[ 9.563325][ C0] lock_acquire (kernel/locking/lockdep.c:438 kernel/locking/lockdep.c:5639)
[ 9.563604][ C0] ? put_obj_stock (include/linux/local_lock_internal.h:29)
[ 9.563889][ C0] ? find_held_lock (kernel/locking/lockdep.c:5130)
[ 9.564181][ C0] ? get_obj_stock (include/linux/rcupdate.h:273 include/linux/rcupdate.h:721)
[ 9.564466][ C0] local_lock_acquire (include/linux/local_lock_internal.h:30)
[ 9.564830][ C0] ? put_obj_stock (include/linux/local_lock_internal.h:29)
[ 9.565115][ C0] refill_stock (mm/memcontrol.c:2215 (discriminator 3))
[ 9.565383][ C0] obj_cgroup_uncharge_pages (mm/memcontrol.c:2996)
[ 9.565719][ C0] drain_obj_stock (arch/x86/include/asm/atomic.h:53 include/linux/atomic/atomic-instrumented.h:56 mm/memcontrol.c:3173)
[ 9.566005][ C0] refill_obj_stock (include/linux/percpu-refcount.h:222 include/linux/memcontrol.h:825 mm/memcontrol.c:3228)
[ 9.566297][ C0] memcg_slab_free_hook (mm/slab.h:207 mm/slab.h:365)
[ 9.566608][ C0] ? rcu_process_callbacks (include/linux/rcupdate.h:273 kernel/rcu/tiny.c:102 kernel/rcu/tiny.c:133)
[ 9.566952][ C0] kmem_cache_free (mm/slub.c:3446 mm/slub.c:3514 mm/slub.c:3530)
[ 9.567250][ C0] ? free_inode_nonrcu (fs/inode.c:221)
[ 9.567556][ C0] rcu_process_callbacks (include/linux/rcupdate.h:273 kernel/rcu/tiny.c:102 kernel/rcu/tiny.c:133)
[ 9.567880][ C0] __do_softirq (arch/x86/include/asm/jump_label.h:27 include/linux/jump_label.h:212 include/trace/events/irq.h:142 kernel/softirq.c:559)
[ 9.568158][ C0] ? sort_range (kernel/smpboot.c:107)
[ 9.568426][ C0] run_ksoftirqd (kernel/softirq.c:412 kernel/softirq.c:922)
[ 9.568700][ C0] smpboot_thread_fn (kernel/smpboot.c:164 (discriminator 4))
[ 9.569003][ C0] kthread (kernel/kthread.c:329)
[ 9.569252][ C0] ? set_kthread_struct (kernel/kthread.c:272)
[ 9.569562][ C0] ret_from_fork (arch/x86/entry/entry_64.S:301)
[ 9.569838][ C0] </TASK>
To reproduce:
# build kernel
cd linux
cp config-5.16.0-rc5-00001-g3928ba024a93 .config
make HOSTCC=gcc-9 CC=gcc-9 ARCH=x86_64 olddefconfig prepare modules_prepare bzImage modules
make HOSTCC=gcc-9 CC=gcc-9 ARCH=x86_64 INSTALL_MOD_PATH=<mod-install-dir> modules_install
cd <mod-install-dir>
find lib/ | cpio -o -H newc --quiet | gzip > modules.cgz
git clone https://github.com/intel/lkp-tests.git
cd lkp-tests
bin/lkp qemu -k <bzImage> -m modules.cgz job-script # job-script is attached in this email
# if come across any failure that blocks the test,
# please remove ~/.lkp and /lkp dir to run from a clean state.
---
0DAY/LKP+ Test Infrastructure Open Source Technology Center
https://lists.01.org/hyperkitty/list/[email protected] Intel Corporation
Thanks,
Oliver Sang