This patchset improves the performance of accounted kernel memory allocations
by ~30% as measured by a micro-benchmark [1]. The benchmark is very
straightforward: 1M kmalloc() allocations of 64 bytes each.
Below are the results (in microseconds, best of 100 runs) with kernel memory
accounting disabled, with the original code, and with this patchset applied.
| | Kmem disabled | Original | Patched | Delta |
|-------------+---------------+----------+---------+--------|
| User cgroup | 29764 | 84548 | 59078 | -30.0% |
| Root cgroup | 29742 | 48342 | 31501 | -34.8% |
As we can see, the patchset removes the majority of the overhead when there is
no actual accounting (a task belongs to the root memory cgroup) and almost
halves the accounting overhead otherwise.
The main idea is to get rid of unnecessary memcg to objcg conversions and switch
to a scope-based protection of objcgs, which eliminates extra operations with
objcg reference counters under a rcu read lock. More details are provided in
individual commit descriptions.
v1:
- made the objcg update fully lockless
- fixed !CONFIG_MMU build issues
rfc:
https://lwn.net/Articles/945722/
--
[1]:
/*
 * Micro-benchmark for accounted kmalloc() allocations.
 *
 * Repeats NR_RUNS times: allocate NR_OBJS objects of OBJ_SIZE bytes with
 * GFP_KERNEL_ACCOUNT, time only the allocation loop, then free everything.
 * The fastest run (in microseconds) is printed to the seq_file @m.
 *
 * Returns 0 on success or -ENOMEM if the pointer array cannot be allocated.
 * @v is unused.
 */
static int memory_alloc_test(struct seq_file *m, void *v)
{
	enum { NR_OBJS = 1000000, NR_RUNS = 100, OBJ_SIZE = 64 };
	s64 elapsed, best = LLONG_MAX;
	ktime_t t0, t1;
	unsigned long run, obj;
	void **objs;

	objs = kvmalloc(sizeof(void *) * NR_OBJS, GFP_KERNEL);
	if (!objs)
		return -ENOMEM;

	for (run = 0; run < NR_RUNS; run++) {
		/* Only the allocation loop is inside the timed section. */
		t0 = ktime_get();
		for (obj = 0; obj < NR_OBJS; obj++)
			objs[obj] = kmalloc(OBJ_SIZE, GFP_KERNEL_ACCOUNT);
		t1 = ktime_get();

		/* Keep the minimum over all runs to filter out noise. */
		elapsed = ktime_us_delta(t1, t0);
		if (elapsed < best)
			best = elapsed;

		for (obj = 0; obj < NR_OBJS; obj++)
			kfree(objs[obj]);
	}

	kvfree(objs);
	seq_printf(m, "%lld us\n", best);

	return 0;
}
--
Signed-off-by: Roman Gushchin (Cruise) <[email protected]>
Roman Gushchin (5):
mm: kmem: optimize get_obj_cgroup_from_current()
mm: kmem: add direct objcg pointer to task_struct
mm: kmem: make memcg keep a reference to the original objcg
mm: kmem: scoped objcg protection
percpu: scoped objcg protection
include/linux/memcontrol.h | 24 ++++-
include/linux/sched.h | 4 +
mm/memcontrol.c | 184 ++++++++++++++++++++++++++++++++-----
mm/percpu.c | 8 +-
mm/slab.h | 10 +-
5 files changed, 192 insertions(+), 38 deletions(-)
--
2.42.0
To charge a freshly allocated kernel object to a memory cgroup, the
kernel needs to obtain an objcg pointer. Currently it does so
indirectly, by obtaining the memcg pointer first and then calling
__get_obj_cgroup_from_memcg().
Usually tasks spend their entire life belonging to the same object
cgroup. So it makes sense to save the objcg pointer on task_struct
directly, so it can be obtained faster. It requires some work on fork,
exit and cgroup migrate paths, but these paths are way colder.
To avoid any costly synchronization the following rules are applied:
1) A task sets its objcg pointer itself.
2) If a task is being migrated to another cgroup, the least
significant bit of the objcg pointer is set atomically.
3) On the allocation path the objcg pointer is obtained locklessly
using the READ_ONCE() macro and the least significant bit is
checked. If it's set, the following procedure is used to update
it locklessly:
- task->objcg is zeroed using cmpxchg
- new objcg pointer is obtained
- task->objcg is updated using try_cmpxchg
- operation is repeated if try_cmpxchg fails
This guarantees that no updates will be lost if task migration
is racing against an objcg pointer update. It also allows keeping
both read and write paths fully lockless.
Because the task is keeping a reference to the objcg, it can't go away
while the task is alive.
This commit doesn't change the way the remote memcg charging works.
Signed-off-by: Roman Gushchin (Cruise) <[email protected]>
---
include/linux/memcontrol.h | 10 ++++
include/linux/sched.h | 4 ++
mm/memcontrol.c | 111 ++++++++++++++++++++++++++++++++++---
3 files changed, 116 insertions(+), 9 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ab94ad4597d0..1c1ebb269ac1 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -553,6 +553,16 @@ static inline bool folio_memcg_kmem(struct folio *folio)
return folio->memcg_data & MEMCG_DATA_KMEM;
}
+static inline bool current_objcg_needs_update(struct obj_cgroup *objcg)
+{
+ return (struct obj_cgroup *)((unsigned long)objcg & 0x1);
+}
+
+static inline struct obj_cgroup *
+current_objcg_without_update_flag(struct obj_cgroup *objcg)
+{
+ return (struct obj_cgroup *)((unsigned long)objcg & ~0x1);
+}
#else
static inline bool folio_memcg_kmem(struct folio *folio)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 77f01ac385f7..60de42715b56 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1443,6 +1443,10 @@ struct task_struct {
struct mem_cgroup *active_memcg;
#endif
+#ifdef CONFIG_MEMCG_KMEM
+ struct obj_cgroup *objcg;
+#endif
+
#ifdef CONFIG_BLK_CGROUP
struct gendisk *throttle_disk;
#endif
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 16ac2a5838fb..ec28f9cfc2f0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3001,6 +3001,47 @@ static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
return objcg;
}
+static struct obj_cgroup *current_objcg_update(struct obj_cgroup *old)
+{
+ struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg = NULL, *tmp = old;
+
+ old = current_objcg_without_update_flag(old);
+ if (old)
+ obj_cgroup_put(old);
+
+ rcu_read_lock();
+ do {
+ /* Atomically drop the update bit, */
+ WARN_ON_ONCE(cmpxchg(&current->objcg, tmp, 0) != tmp);
+
+ /* ...obtain the new objcg pointer */
+ memcg = mem_cgroup_from_task(current);
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+ objcg = rcu_dereference(memcg->objcg);
+ if (objcg && obj_cgroup_tryget(objcg))
+ break;
+ objcg = NULL;
+ }
+
+ /*
+ * ...and try atomically set up a new objcg pointer. If it
+ * fails, it means the update flag was set concurrently, so
+ * the whole procedure should be repeated.
+ */
+ tmp = 0;
+ } while (!try_cmpxchg(&current->objcg, &tmp, objcg));
+ rcu_read_unlock();
+
+ return objcg;
+}
+
+/*
+ * Mark @task's cached objcg pointer as stale by atomically setting its least
+ * significant bit. The allocation path checks this bit and refreshes the
+ * pointer when it is set.
+ *
+ * This is called for every task in a migrating taskset, so the bit must be
+ * set on @task itself rather than on current (the task doing the migration).
+ */
+static inline void current_objcg_set_needs_update(struct task_struct *task)
+{
+	/* atomically set the update bit */
+	set_bit(0, (unsigned long *)&task->objcg);
+}
+
__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
{
struct mem_cgroup *memcg;
@@ -3008,19 +3049,26 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
if (in_task()) {
memcg = current->active_memcg;
+ if (unlikely(memcg))
+ goto from_memcg;
- /* Memcg to charge can't be determined. */
- if (likely(!memcg) && (!current->mm || (current->flags & PF_KTHREAD)))
- return NULL;
+ objcg = READ_ONCE(current->objcg);
+ if (unlikely(current_objcg_needs_update(objcg)))
+ objcg = current_objcg_update(objcg);
+
+ if (objcg) {
+ obj_cgroup_get(objcg);
+ return objcg;
+ }
} else {
memcg = this_cpu_read(int_active_memcg);
- if (likely(!memcg))
- return NULL;
+ if (unlikely(memcg))
+ goto from_memcg;
}
+ return NULL;
+from_memcg:
rcu_read_lock();
- if (!memcg)
- memcg = mem_cgroup_from_task(current);
objcg = __get_obj_cgroup_from_memcg(memcg);
rcu_read_unlock();
return objcg;
@@ -6345,6 +6393,7 @@ static void mem_cgroup_move_task(void)
mem_cgroup_clear_mc();
}
}
+
#else /* !CONFIG_MMU */
static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
{
@@ -6358,8 +6407,27 @@ static void mem_cgroup_move_task(void)
}
#endif
+#ifdef CONFIG_MEMCG_KMEM
+static void mem_cgroup_fork(struct task_struct *task)
+{
+ /*
+ * Set the update flag to cause task->objcg to be initialized lazily
+ * on the first allocation.
+ */
+ task->objcg = (struct obj_cgroup *)0x1;
+}
+
+static void mem_cgroup_exit(struct task_struct *task)
+{
+ struct obj_cgroup *objcg = current_objcg_without_update_flag(task->objcg);
+
+ if (objcg)
+ obj_cgroup_put(objcg);
+}
+#endif
+
#ifdef CONFIG_LRU_GEN
-static void mem_cgroup_attach(struct cgroup_taskset *tset)
+static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *css;
@@ -6377,10 +6445,29 @@ static void mem_cgroup_attach(struct cgroup_taskset *tset)
task_unlock(task);
}
#else
+static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset) {}
+#endif /* CONFIG_LRU_GEN */
+
+#ifdef CONFIG_MEMCG_KMEM
+static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset)
+{
+ struct task_struct *task;
+ struct cgroup_subsys_state *css;
+
+ cgroup_taskset_for_each(task, css, tset)
+ current_objcg_set_needs_update(task);
+}
+#else
+static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset) {}
+#endif /* CONFIG_MEMCG_KMEM */
+
+#if defined(CONFIG_LRU_GEN) || defined(CONFIG_MEMCG_KMEM)
static void mem_cgroup_attach(struct cgroup_taskset *tset)
{
+ mem_cgroup_lru_gen_attach(tset);
+ mem_cgroup_kmem_attach(tset);
}
-#endif /* CONFIG_LRU_GEN */
+#endif
static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
{
@@ -6824,9 +6911,15 @@ struct cgroup_subsys memory_cgrp_subsys = {
.css_reset = mem_cgroup_css_reset,
.css_rstat_flush = mem_cgroup_css_rstat_flush,
.can_attach = mem_cgroup_can_attach,
+#if defined(CONFIG_LRU_GEN) || defined(CONFIG_MEMCG_KMEM)
.attach = mem_cgroup_attach,
+#endif
.cancel_attach = mem_cgroup_cancel_attach,
.post_attach = mem_cgroup_move_task,
+#ifdef CONFIG_MEMCG_KMEM
+ .fork = mem_cgroup_fork,
+ .exit = mem_cgroup_exit,
+#endif
.dfl_cftypes = memory_files,
.legacy_cftypes = mem_cgroup_legacy_files,
.early_init = 0,
--
2.42.0
Similar to slab and kmem, switch to a scope-based protection of the
objcg pointer to avoid extra operations with objcg reference counters.
Signed-off-by: Roman Gushchin (Cruise) <[email protected]>
---
mm/percpu.c | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/mm/percpu.c b/mm/percpu.c
index a7665de8485f..f53ba692d67a 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1628,14 +1628,12 @@ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
if (!memcg_kmem_online() || !(gfp & __GFP_ACCOUNT))
return true;
- objcg = get_obj_cgroup_from_current();
+ objcg = current_obj_cgroup();
if (!objcg)
return true;
- if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size))) {
- obj_cgroup_put(objcg);
+ if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size)))
return false;
- }
*objcgp = objcg;
return true;
@@ -1649,6 +1647,7 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
return;
if (likely(chunk && chunk->obj_cgroups)) {
+ obj_cgroup_get(objcg);
chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg;
rcu_read_lock();
@@ -1657,7 +1656,6 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
rcu_read_unlock();
} else {
obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size));
- obj_cgroup_put(objcg);
}
}
--
2.42.0
On Fri, Sep 29, 2023 at 11:00:52AM -0700, Roman Gushchin wrote:
> @@ -553,6 +553,16 @@ static inline bool folio_memcg_kmem(struct folio *folio)
> return folio->memcg_data & MEMCG_DATA_KMEM;
> }
>
> +static inline bool current_objcg_needs_update(struct obj_cgroup *objcg)
> +{
> + return (struct obj_cgroup *)((unsigned long)objcg & 0x1);
> +}
> +
> +static inline struct obj_cgroup *
> +current_objcg_without_update_flag(struct obj_cgroup *objcg)
> +{
> + return (struct obj_cgroup *)((unsigned long)objcg & ~0x1);
> +}
I would slightly prefer naming the bit with a define, and open-coding
the bitops in the current callsites. This makes it clearer that the
actual pointer bits are overloaded in the places where the pointer is
accessed.
> @@ -3001,6 +3001,47 @@ static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
> return objcg;
> }
>
> +static struct obj_cgroup *current_objcg_update(struct obj_cgroup *old)
> +{
> + struct mem_cgroup *memcg;
> + struct obj_cgroup *objcg = NULL, *tmp = old;
> +
> + old = current_objcg_without_update_flag(old);
> + if (old)
> + obj_cgroup_put(old);
> +
> + rcu_read_lock();
> + do {
> + /* Atomically drop the update bit, */
> + WARN_ON_ONCE(cmpxchg(&current->objcg, tmp, 0) != tmp);
> +
> + /* ...obtain the new objcg pointer */
> + memcg = mem_cgroup_from_task(current);
> + for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
> + objcg = rcu_dereference(memcg->objcg);
> + if (objcg && obj_cgroup_tryget(objcg))
> + break;
> + objcg = NULL;
> + }
As per the other thread, it would be great to have a comment here
explaining the scenario(s) when the tryget could fail and we'd have to
defer to an ancestor.
> +
> + /*
> + * ...and try atomically set up a new objcg pointer. If it
> + * fails, it means the update flag was set concurrently, so
> + * the whole procedure should be repeated.
> + */
> + tmp = 0;
> + } while (!try_cmpxchg(&current->objcg, &tmp, objcg));
> + rcu_read_unlock();
> +
> + return objcg;
Overall this looks great to me.
AFAICS the rcu_read_lock() is needed for the mem_cgroup_from_task()
and tryget(). Is it possible to localize it around these operations?
Or am I missing some other effect it has?
> @@ -6358,8 +6407,27 @@ static void mem_cgroup_move_task(void)
> }
> #endif
>
> +#ifdef CONFIG_MEMCG_KMEM
> +static void mem_cgroup_fork(struct task_struct *task)
> +{
> + /*
> + * Set the update flag to cause task->objcg to be initialized lazily
> + * on the first allocation.
> + */
> + task->objcg = (struct obj_cgroup *)0x1;
> +}
I like this open-coding!
Should this mention why it doesn't need to be atomic? Task is in
fork(), no concurrent modifications from allocations or migrations
possible...
None of the feedback is a blocker, though.
Acked-by: Johannes Weiner <[email protected]>
On Fri, Sep 29, 2023 at 11:00:50AM -0700, Roman Gushchin <[email protected]> wrote:
> This patchset improves the performance of accounted kernel memory allocations
> by ~30% as measured by a micro-benchmark [1]. The benchmark is very
> straightforward: 1M of 64 bytes-large kmalloc() allocations.
Nice.
Have you tried how these +34% compose with -34% reported way back [1]
when file lock accounting was added (because your benchmark and lock1
sound quite similar)?
(BTW Is that your motivation (too)?)
Thanks,
Michal
[1] https://lore.kernel.org/r/20210907150757.GE17617@xsang-OptiPlex-9020/
On Wed, Oct 04, 2023 at 08:32:39PM +0200, Michal Koutný wrote:
> On Fri, Sep 29, 2023 at 11:00:50AM -0700, Roman Gushchin <[email protected]> wrote:
> > This patchset improves the performance of accounted kernel memory allocations
> > by ~30% as measured by a micro-benchmark [1]. The benchmark is very
> > straightforward: 1M of 64 bytes-large kmalloc() allocations.
>
> Nice.
Thanks!
> Have you tried how these +34% compose with -34% reported way back [1]
> when file lock accounting was added (because your benchmark and lock1
> sound quite similar)?
No, I haven't. I'm kindly waiting for an automatic report here :)
But if someone can run these tests manually, I'll appreciate it a lot.
> (BTW Is that your motivation (too)?)
Not really, it was on my todo list for a long time and I just got some spare
cycles to figure out missing parts (mostly around targeted/remote charging).
I also plan to try a similar approach to speed up generic memcg charging.
Thanks!
Manually inline memcg_kmem_bypass() and active_memcg() to speed up
get_obj_cgroup_from_current() by avoiding duplicate in_task() checks
and active_memcg() readings.
Also add a likely() annotation to __get_obj_cgroup_from_memcg():
obj_cgroup_tryget() should almost always succeed, except for a very
unlikely race with the memcg deletion path.
Signed-off-by: Roman Gushchin (Cruise) <[email protected]>
Acked-by: Shakeel Butt <[email protected]>
---
mm/memcontrol.c | 34 ++++++++++++++--------------------
1 file changed, 14 insertions(+), 20 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9741d62d0424..16ac2a5838fb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1068,19 +1068,6 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
}
EXPORT_SYMBOL(get_mem_cgroup_from_mm);
-static __always_inline bool memcg_kmem_bypass(void)
-{
- /* Allow remote memcg charging from any context. */
- if (unlikely(active_memcg()))
- return false;
-
- /* Memcg to charge can't be determined. */
- if (!in_task() || !current->mm || (current->flags & PF_KTHREAD))
- return true;
-
- return false;
-}
-
/**
* mem_cgroup_iter - iterate over memory cgroup hierarchy
* @root: hierarchy root
@@ -3007,7 +2994,7 @@ static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
objcg = rcu_dereference(memcg->objcg);
- if (objcg && obj_cgroup_tryget(objcg))
+ if (likely(objcg && obj_cgroup_tryget(objcg)))
break;
objcg = NULL;
}
@@ -3016,16 +3003,23 @@ static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
{
- struct obj_cgroup *objcg = NULL;
struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
- if (memcg_kmem_bypass())
- return NULL;
+ if (in_task()) {
+ memcg = current->active_memcg;
+
+ /* Memcg to charge can't be determined. */
+ if (likely(!memcg) && (!current->mm || (current->flags & PF_KTHREAD)))
+ return NULL;
+ } else {
+ memcg = this_cpu_read(int_active_memcg);
+ if (likely(!memcg))
+ return NULL;
+ }
rcu_read_lock();
- if (unlikely(active_memcg()))
- memcg = active_memcg();
- else
+ if (!memcg)
memcg = mem_cgroup_from_task(current);
objcg = __get_obj_cgroup_from_memcg(memcg);
rcu_read_unlock();
--
2.42.0