by Vasily Averin

[permalink] [raw]

Subject: [RFC PATCH] memcg: adjust memcg for new cgroup allocations

This is first of patches announced here:
https://lore.kernel.org/all/[email protected]/
---
Usually accounted allocations use memory cgroup of current process.
In case of cgroup this looks incorrect: memory cgroup can be created
both from the host and from inside container. At the same time
it is intuitively expected that these both operations should lead
to the same result.
This issue was addressed by Roman Gushchin in commit 3e38e0aaca9e
"mm: memcg: charge memcg percpu memory to the parent cgroup",
He adjusted memcg for allocations called inside mem_cgroup_css_alloc.

However, now we want to enable accounting for some other cgroup-related
resources called from cgroup_mkdir. We would like to guarantee that
all new accounted allocation will be charged to the same memory cgroup.

This patch moves memcg adjustment from mem_cgroup_css_alloc() to its
callers: cgroup_mkdir() and cgroup_apply_control(). In own turn this
requires to create new proxy function mem_cgroup_from_cgroup().

Signed-off-by: Vasily Averin <[email protected]>
---
include/linux/memcontrol.h | 18 ++++++++++++++++++
kernel/cgroup/cgroup.c | 22 ++++++++++++++++------
mm/memcontrol.c | 4 +---
3 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 4d31ce55b1c0..342426d1edbf 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1813,6 +1813,19 @@ static inline struct mem_cgroup *mem_cgroup_or_root(struct mem_cgroup *memcg)
{
return memcg ? memcg : root_mem_cgroup;
}
+
+static inline struct mem_cgroup *mem_cgroup_from_cgroup(struct cgroup *cgroup)
+{
+ struct cgroup_subsys_state *css;
+ struct mem_cgroup *memcg = NULL;
+
+ css = cgroup_get_e_css(cgroup, &memory_cgrp_subsys);
+
+ if (css)
+ memcg = container_of(css, struct mem_cgroup, css);
+
+ return mem_cgroup_or_root(memcg);
+}
#else
static inline bool mem_cgroup_kmem_disabled(void)
{
@@ -1878,6 +1891,11 @@ static inline struct mem_cgroup *mem_cgroup_or_root(struct mem_cgroup *memcg)
{
return NULL;
}
+
+static inline struct mem_cgroup *mem_cgroup_from_cgroup(struct cgroup *cgroup)
+{
+ return NULL;
+}
#endif /* CONFIG_MEMCG_KMEM */

#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index ffaccd6373f1..544d93a8878f 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3247,13 +3247,16 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp)
*/
static int cgroup_apply_control(struct cgroup *cgrp)
{
+ struct mem_cgroup *memcg, *old_memcg;
int ret;

cgroup_propagate_control(cgrp);

+ memcg = mem_cgroup_from_cgroup(cgrp);
+ old_memcg = set_active_memcg(memcg);
ret = cgroup_apply_control_enable(cgrp);
if (ret)
- return ret;
+ goto out_memcg;

/*
* At this point, cgroup_e_css_by_mask() results reflect the new csses
@@ -3261,10 +3264,11 @@ static int cgroup_apply_control(struct cgroup *cgrp)
* css associations of all tasks in the subtree.
*/
ret = cgroup_update_dfl_csses(cgrp);
- if (ret)
- return ret;

- return 0;
+out_memcg:
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+ return ret;
}

/**
@@ -5532,6 +5536,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
{
struct cgroup *parent, *cgrp;
+ struct mem_cgroup *memcg, *old_memcg;
int ret;

/* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
@@ -5547,10 +5552,12 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
goto out_unlock;
}

+ memcg = mem_cgroup_from_cgroup(parent);
+ old_memcg = set_active_memcg(memcg);
cgrp = cgroup_create(parent, name, mode);
if (IS_ERR(cgrp)) {
ret = PTR_ERR(cgrp);
- goto out_unlock;
+ goto out_memcg;
}

/*
@@ -5577,10 +5584,13 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
kernfs_activate(cgrp->kn);

ret = 0;
- goto out_unlock;
+ goto out_memcg;

out_destroy:
cgroup_destroy_locked(cgrp);
+out_memcg:
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
out_unlock:
cgroup_kn_unlock(parent_kn);
return ret;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b69979c9ced5..e170c64e66e2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5239,11 +5239,9 @@ static struct cgroup_subsys_state * __ref
mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
- struct mem_cgroup *memcg, *old_memcg;
+ struct mem_cgroup *memcg;

- old_memcg = set_active_memcg(parent);
memcg = mem_cgroup_alloc();
- set_active_memcg(old_memcg);
if (IS_ERR(memcg))
return ERR_CAST(memcg);

--
2.34.1

2022-08-17 09:43:31

On 8/18/22 16:19, Christian Brauner wrote:
> On Thu, Aug 18, 2022 at 12:12:30PM +0300, Vasily Averin wrote:
>> The patch was announced here:
>> https://lore.kernel.org/all/[email protected]/
>> "5) simple_xattrs: replace list to rb-tree
>> This significantly reduces the search time for existing entries."
>>
>> It was compiled but was not tested yet.
>> ---
>> Currently simple_xattr uses a list to store existing entries.
>> If the list grows, the presence check may be slow and potentially
>> lead to problems. Red-black tree should work more efficiently
>> in this situation.
>>
>> This patch replaces list to rb_tree and switches simple_xattr_* calls
>> to its using.
>>
>> Signed-off-by: Vasily Averin <[email protected]>
>> ---
>
> I think the background for the performance issues in the commit message
> would be helpful and I have a few comments. Also, trying to test whether the
> lockups are gone due to the rbtree switch would be +1.
>
> This will likely conflict with some acl/xattr changes I have lined up so
> if we decide to proceed I wouldn't mind dealing with this series if
> there are no objections.

I would be very grateful if you pick up this issue.
Unfortunately I do not have enough time to process it properly.

I'm agree with all your remarks, however I would like to comment following one.

> I think keeping this rather close to the original code might be nicer.
> I find the code more difficult to follow afterwards. So how about
> (COMPLETELY UNTESTED) sm like:

I had this idea too, however it have one disadvantage in rb-tree scenario:
in the most typical case, when adding a new entry, we run through the tree twice:
first in simple_xattr_rb_search() and then in simple_xattr_rb_insert().
In my patch version we run through the rb-tree once only.

However now I think we can save closest neighbour on "search" stage,
and use it on "insert" stage. This should be safe because both functions
are called under the same spinlock.

> @@ -1077,30 +1139,40 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
> }
>
> spin_lock(&xattrs->lock);
> - list_for_each_entry(xattr, &xattrs->head, list) {
> - if (!strcmp(name, xattr->name)) {
> - if (flags & XATTR_CREATE) {
> - xattr = new_xattr;
> - err = -EEXIST;
> - } else if (new_xattr) {
> - list_replace(&xattr->list, &new_xattr->list);
> - if (removed_size)
> - *removed_size = xattr->size;
> - } else {
> - list_del(&xattr->list);
> - if (removed_size)
> - *removed_size = xattr->size;
> - }
> - goto out;
> + /* Find any matching xattr by name. */
> + xattr = simple_xattr_rb_search(&xattrs->rb_root, name);
> + if (xattr) {
> + if (flags & XATTR_CREATE) {
> + /* Creating request but the xattr already existed. */
> + xattr = new_xattr;
> + err = -EEXIST;
> + } else if (new_xattr) {
> + /* Replace the existing xattr. */
> + rb_replace_node(&xattr->node, &new_xattr->node,
> + &xattrs->rb_root);
> + if (removed_size)
> + *removed_size = xattr->size;
> + } else {
> + /* No new xattr specified so wipe the existing xattr. */
> + rb_erase(&xattr->node, &xattrs->rb_root);
> + if (removed_size)
> + *removed_size = xattr->size;
> }
> + goto out;
> }
> +
> if (flags & XATTR_REPLACE) {
> + /* There's no matching xattr so fail on replace. */
> xattr = new_xattr;
> err = -ENODATA;
> } else {
> - list_add(&new_xattr->list, &xattrs->head);
> - xattr = NULL;
> + /*
> + * We're holding the lock and verified that there's no
> + * pre-existing xattr so this should always succeed.
> + */
> + WARN_ON(!simple_xattr_rb_insert(&xattrs->rb_root, new_xattr))
> }
> +
> out:
> spin_unlock(&xattrs->lock);
> if (xattr) {
>
>
>> - xattr = new_xattr;
>> - err = -ENODATA;
>> - } else {
>> - list_add(&new_xattr->list, &xattrs->head);
>> - xattr = NULL;
>> - }
>> -out:
>> spin_unlock(&xattrs->lock);
>> if (xattr) {
>> kfree(xattr->name);

2022-08-23 19:28:48

by Tejun Heo

[permalink] [raw]

Subject: Re: [RFC PATCH] memcg: adjust memcg for new cgroup allocations

Hello,

On Tue, Aug 23, 2022 at 03:04:31PM +0300, Vasily Averin wrote:
> I would like to properly handle cgroup1 case too.
> To do it we can enable accounting for new 'struct cgroup' objects,
> and bind them to memcg of creator task.

I'm not sure it'd be a good idea to introduce two different behaviors for
handling the same thing. I'd just leave cgroup1 as-is.

Thanks.

--
tejun