2015-12-11 19:54:31

by Johannes Weiner

[permalink] [raw]
Subject: [PATCH 1/4] net: tcp_memcontrol: simplify linkage between socket and page counter fix

Fixlet for the same-named patch currently in mmots. The forward decl
is no longer necessary when the socket directly points to the memcg.

Signed-off-by: Johannes Weiner <[email protected]>
---
include/net/sock.h | 1 -
1 file changed, 1 deletion(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 0db770f..0835627 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -229,7 +229,6 @@ struct sock_common {
/* public: */
};

-struct cg_proto;
/**
* struct sock - network layer representation of sockets
* @__sk_common: shared layout with inet_timewait_sock
--
2.6.3


2015-12-11 19:54:33

by Johannes Weiner

[permalink] [raw]
Subject: [PATCH 2/4] mm: memcontrol: rein in the CONFIG space madness

What CONFIG_INET and CONFIG_LEGACY_KMEM guard inside the memory
controller code is insignificant, having these conditionals is not
worth the complication and fragility that comes with them.

Signed-off-by: Johannes Weiner <[email protected]>
---
include/linux/memcontrol.h | 14 +++++--------
init/Kconfig | 26 +++--------------------
mm/memcontrol.c | 52 +++-------------------------------------------
mm/vmpressure.c | 2 --
4 files changed, 11 insertions(+), 83 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 2bb14d02..47995b4 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -233,9 +233,11 @@ struct mem_cgroup {
*/
struct mem_cgroup_stat_cpu __percpu *stat;

-#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET)
+ unsigned long socket_pressure;
+
+ /* Legacy tcp memory accounting */
struct cg_proto tcp_mem;
-#endif
+
#ifndef CONFIG_SLOB
/* Index in the kmem_cache->memcg_params.memcg_caches array */
int kmemcg_id;
@@ -254,10 +256,6 @@ struct mem_cgroup {
struct wb_domain cgwb_domain;
#endif

-#ifdef CONFIG_INET
- unsigned long socket_pressure;
-#endif
-
/* List of events which userspace want to receive */
struct list_head event_list;
spinlock_t event_list_lock;
@@ -712,15 +710,13 @@ void sock_update_memcg(struct sock *sk);
void sock_release_memcg(struct sock *sk);
bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages);
void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages);
-#if defined(CONFIG_MEMCG) && defined(CONFIG_INET)
+#ifdef CONFIG_MEMCG
extern struct static_key_false memcg_sockets_enabled_key;
#define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key)
static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
{
-#ifdef CONFIG_MEMCG_LEGACY_KMEM
if (memcg->tcp_mem.memory_pressure)
return true;
-#endif
do {
if (time_before(jiffies, memcg->socket_pressure))
return true;
diff --git a/init/Kconfig b/init/Kconfig
index e8cdf02..b8fe7586 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1057,25 +1057,6 @@ config MEMCG_SWAP_ENABLED
For those who want to have the feature enabled by default should
select this option (if, for some reason, they need to disable it
then swapaccount=0 does the trick).
-config MEMCG_LEGACY_KMEM
- bool
-config MEMCG_KMEM
- bool "Legacy Memory Resource Controller Kernel Memory accounting"
- depends on MEMCG
- depends on SLUB || SLAB
- select MEMCG_LEGACY_KMEM
- help
- The Kernel Memory extension for Memory Resource Controller can limit
- the amount of memory used by kernel objects in the system. Those are
- fundamentally different from the entities handled by the standard
- Memory Controller, which are page-based, and can be swapped. Users of
- the kmem extension can use it to guarantee that no group of processes
- will ever exhaust kernel resources alone.
-
- This option affects the ORIGINAL cgroup interface. The cgroup2 memory
- controller includes important in-kernel memory consumers per default.
-
- If you're using cgroup2, say N.

config CGROUP_HUGETLB
bool "HugeTLB Resource Controller for Control Groups"
@@ -1225,10 +1206,9 @@ config USER_NS
to provide different user info for different servers.

When user namespaces are enabled in the kernel it is
- recommended that the MEMCG and MEMCG_KMEM options also be
- enabled and that user-space use the memory control groups to
- limit the amount of memory a memory unprivileged users can
- use.
+ recommended that the MEMCG option also be enabled and that
+ user-space use the memory control groups to limit the amount
+ of memory a memory unprivileged users can use.

If unsure, say N.

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5cf7fd2..422ff3f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2821,11 +2821,9 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
case _KMEM:
counter = &memcg->kmem;
break;
-#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET)
case _TCP:
counter = &memcg->tcp_mem.memory_allocated;
break;
-#endif
default:
BUG();
}
@@ -2985,7 +2983,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
}
#endif /* !CONFIG_SLOB */

-#ifdef CONFIG_MEMCG_LEGACY_KMEM
static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
unsigned long limit)
{
@@ -3003,16 +3000,7 @@ out:
mutex_unlock(&memcg_limit_mutex);
return ret;
}
-#else
-static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
- unsigned long limit)
-{
- return -EINVAL;
-}
-#endif /* CONFIG_MEMCG_LEGACY_KMEM */

-
-#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET)
static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit)
{
int ret;
@@ -3047,12 +3035,6 @@ out:
mutex_unlock(&memcg_limit_mutex);
return ret;
}
-#else
-static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit)
-{
- return -EINVAL;
-}
-#endif /* CONFIG_MEMCG_LEGACY_KMEM && CONFIG_INET */

/*
* The user of this function is...
@@ -3115,11 +3097,9 @@ static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
case _KMEM:
counter = &memcg->kmem;
break;
-#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET)
case _TCP:
counter = &memcg->tcp_mem.memory_allocated;
break;
-#endif
default:
BUG();
}
@@ -4072,7 +4052,6 @@ static struct cftype mem_cgroup_legacy_files[] = {
.seq_show = memcg_numa_stat_show,
},
#endif
-#ifdef CONFIG_MEMCG_LEGACY_KMEM
{
.name = "kmem.limit_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
@@ -4105,7 +4084,6 @@ static struct cftype mem_cgroup_legacy_files[] = {
.seq_show = memcg_slab_show,
},
#endif
-#ifdef CONFIG_INET
{
.name = "kmem.tcp.limit_in_bytes",
.private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
@@ -4129,8 +4107,6 @@ static struct cftype mem_cgroup_legacy_files[] = {
.write = mem_cgroup_reset,
.read_u64 = mem_cgroup_read_u64,
},
-#endif
-#endif
{ }, /* terminate */
};

@@ -4258,15 +4234,13 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
vmpressure_init(&memcg->vmpressure);
INIT_LIST_HEAD(&memcg->event_list);
spin_lock_init(&memcg->event_list_lock);
+ memcg->socket_pressure = jiffies;
#ifndef CONFIG_SLOB
memcg->kmemcg_id = -1;
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
#endif
-#ifdef CONFIG_INET
- memcg->socket_pressure = jiffies;
-#endif
return &memcg->css;

free_out:
@@ -4299,10 +4273,8 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
memcg->soft_limit = PAGE_COUNTER_MAX;
page_counter_init(&memcg->memsw, &parent->memsw);
page_counter_init(&memcg->kmem, &parent->kmem);
-#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET)
page_counter_init(&memcg->tcp_mem.memory_allocated,
&parent->tcp_mem.memory_allocated);
-#endif

/*
* No need to take a reference to the parent because cgroup
@@ -4314,9 +4286,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
memcg->soft_limit = PAGE_COUNTER_MAX;
page_counter_init(&memcg->memsw, NULL);
page_counter_init(&memcg->kmem, NULL);
-#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET)
page_counter_init(&memcg->tcp_mem.memory_allocated, NULL);
-#endif
/*
* Deeper hierachy with use_hierarchy == false doesn't make
* much sense so let cgroup subsystem know about this
@@ -4331,10 +4301,8 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (ret)
return ret;

-#ifdef CONFIG_INET
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_inc(&memcg_sockets_enabled_key);
-#endif

/*
* Make sure the memcg is initialized: mem_cgroup_iter()
@@ -4374,17 +4342,11 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);

-#ifdef CONFIG_INET
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_dec(&memcg_sockets_enabled_key);
-#endif
-
- memcg_free_kmem(memcg);

-#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET)
if (memcg->tcp_mem.active)
static_branch_dec(&memcg_sockets_enabled_key);
-#endif

memcg_free_kmem(memcg);

@@ -5585,8 +5547,6 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
commit_charge(newpage, memcg, true);
}

-#ifdef CONFIG_INET
-
DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
EXPORT_SYMBOL(memcg_sockets_enabled_key);

@@ -5612,10 +5572,8 @@ void sock_update_memcg(struct sock *sk)
memcg = mem_cgroup_from_task(current);
if (memcg == root_mem_cgroup)
goto out;
-#ifdef CONFIG_MEMCG_LEGACY_KMEM
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcp_mem.active)
goto out;
-#endif
if (css_tryget_online(&memcg->css))
sk->sk_memcg = memcg;
out:
@@ -5641,7 +5599,6 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
{
gfp_t gfp_mask = GFP_KERNEL;

-#ifdef CONFIG_MEMCG_LEGACY_KMEM
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
struct page_counter *counter;

@@ -5654,7 +5611,7 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
memcg->tcp_mem.memory_pressure = 1;
return false;
}
-#endif
+
/* Don't block in the packet receive path */
if (in_softirq())
gfp_mask = GFP_NOWAIT;
@@ -5673,19 +5630,16 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
*/
void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
{
-#ifdef CONFIG_MEMCG_LEGACY_KMEM
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
page_counter_uncharge(&memcg->tcp_mem.memory_allocated,
nr_pages);
return;
}
-#endif
+
page_counter_uncharge(&memcg->memory, nr_pages);
css_put_many(&memcg->css, nr_pages);
}

-#endif /* CONFIG_INET */
-
static int __init cgroup_memory(char *s)
{
char *token;
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 8cdeebe..506f03e 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -275,7 +275,6 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,

level = vmpressure_calc_level(scanned, reclaimed);

-#ifdef CONFIG_INET
if (level > VMPRESSURE_LOW) {
/*
* Let the socket buffer allocator know that
@@ -287,7 +286,6 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
*/
memcg->socket_pressure = jiffies + HZ;
}
-#endif
}
}

--
2.6.3

2015-12-11 19:54:36

by Johannes Weiner

[permalink] [raw]
Subject: [PATCH 3/4] mm: memcontrol: flatten struct cg_proto

There are no more external users of struct cg_proto, flatten the
structure into struct mem_cgroup.

Since using those struct members doesn't stand out as much anymore,
add cgroup2 static branches to make it clearer which code is legacy.

Suggested-by: Vladimir Davydov <[email protected]>
Signed-off-by: Johannes Weiner <[email protected]>
---
include/linux/memcontrol.h | 14 ++++++--------
mm/memcontrol.c | 33 +++++++++++++++------------------
2 files changed, 21 insertions(+), 26 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 47995b4..a3869bf 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -85,12 +85,6 @@ enum mem_cgroup_events_target {
MEM_CGROUP_NTARGETS,
};

-struct cg_proto {
- struct page_counter memory_allocated; /* Current allocated memory. */
- int memory_pressure;
- bool active;
-};
-
#ifdef CONFIG_MEMCG
struct mem_cgroup_stat_cpu {
long count[MEM_CGROUP_STAT_NSTATS];
@@ -169,8 +163,11 @@ struct mem_cgroup {

/* Accounted resources */
struct page_counter memory;
+
+ /* Legacy consumer-oriented counters */
struct page_counter memsw;
struct page_counter kmem;
+ struct page_counter tcpmem;

/* Normal memory consumption range */
unsigned long low;
@@ -236,7 +233,8 @@ struct mem_cgroup {
unsigned long socket_pressure;

/* Legacy tcp memory accounting */
- struct cg_proto tcp_mem;
+ bool tcpmem_active;
+ int tcpmem_pressure;

#ifndef CONFIG_SLOB
/* Index in the kmem_cache->memcg_params.memcg_caches array */
@@ -715,7 +713,7 @@ extern struct static_key_false memcg_sockets_enabled_key;
#define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key)
static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
{
- if (memcg->tcp_mem.memory_pressure)
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_pressure)
return true;
do {
if (time_before(jiffies, memcg->socket_pressure))
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 422ff3f..306842c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2822,7 +2822,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
counter = &memcg->kmem;
break;
case _TCP:
- counter = &memcg->tcp_mem.memory_allocated;
+ counter = &memcg->tcpmem;
break;
default:
BUG();
@@ -3007,11 +3007,11 @@ static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit)

mutex_lock(&memcg_limit_mutex);

- ret = page_counter_limit(&memcg->tcp_mem.memory_allocated, limit);
+ ret = page_counter_limit(&memcg->tcpmem, limit);
if (ret)
goto out;

- if (!memcg->tcp_mem.active) {
+ if (!memcg->tcpmem_active) {
/*
* The active flag needs to be written after the static_key
* update. This is what guarantees that the socket activation
@@ -3029,7 +3029,7 @@ static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit)
* patched in yet.
*/
static_branch_inc(&memcg_sockets_enabled_key);
- memcg->tcp_mem.active = true;
+ memcg->tcpmem_active = true;
}
out:
mutex_unlock(&memcg_limit_mutex);
@@ -3098,7 +3098,7 @@ static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
counter = &memcg->kmem;
break;
case _TCP:
- counter = &memcg->tcp_mem.memory_allocated;
+ counter = &memcg->tcpmem;
break;
default:
BUG();
@@ -4273,8 +4273,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
memcg->soft_limit = PAGE_COUNTER_MAX;
page_counter_init(&memcg->memsw, &parent->memsw);
page_counter_init(&memcg->kmem, &parent->kmem);
- page_counter_init(&memcg->tcp_mem.memory_allocated,
- &parent->tcp_mem.memory_allocated);
+ page_counter_init(&memcg->tcpmem, &parent->tcpmem);

/*
* No need to take a reference to the parent because cgroup
@@ -4286,7 +4285,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
memcg->soft_limit = PAGE_COUNTER_MAX;
page_counter_init(&memcg->memsw, NULL);
page_counter_init(&memcg->kmem, NULL);
- page_counter_init(&memcg->tcp_mem.memory_allocated, NULL);
+ page_counter_init(&memcg->tcpmem, NULL);
/*
* Deeper hierachy with use_hierarchy == false doesn't make
* much sense so let cgroup subsystem know about this
@@ -4345,7 +4344,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_dec(&memcg_sockets_enabled_key);

- if (memcg->tcp_mem.active)
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
static_branch_dec(&memcg_sockets_enabled_key);

memcg_free_kmem(memcg);
@@ -5572,7 +5571,7 @@ void sock_update_memcg(struct sock *sk)
memcg = mem_cgroup_from_task(current);
if (memcg == root_mem_cgroup)
goto out;
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcp_mem.active)
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
goto out;
if (css_tryget_online(&memcg->css))
sk->sk_memcg = memcg;
@@ -5600,15 +5599,14 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
gfp_t gfp_mask = GFP_KERNEL;

if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
- struct page_counter *counter;
+ struct page_counter *fail;

- if (page_counter_try_charge(&memcg->tcp_mem.memory_allocated,
- nr_pages, &counter)) {
- memcg->tcp_mem.memory_pressure = 0;
+ if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
+ memcg->tcpmem_pressure = 0;
return true;
}
- page_counter_charge(&memcg->tcp_mem.memory_allocated, nr_pages);
- memcg->tcp_mem.memory_pressure = 1;
+ page_counter_charge(&memcg->tcpmem, nr_pages);
+ memcg->tcpmem_pressure = 1;
return false;
}

@@ -5631,8 +5629,7 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
{
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
- page_counter_uncharge(&memcg->tcp_mem.memory_allocated,
- nr_pages);
+ page_counter_uncharge(&memcg->tcpmem, nr_pages);
return;
}

--
2.6.3

2015-12-11 19:54:39

by Johannes Weiner

[permalink] [raw]
Subject: [PATCH 4/4] mm: memcontrol: clean up alloc, online, offline, free functions

The creation and teardown of struct mem_cgroup is fairly messy and
that has attracted mistakes and subtle bugs before.

The main cause for this is that there is no clear model about what
needs to happen when, and that attracts more chaos. So create one:

1. mem_cgroup_alloc() should allocate struct mem_cgroup and its
auxiliary members and initialize work items, locks etc. so that the
object it returns is fully initialized and in a neutral state.

2. mem_cgroup_css_alloc() will use mem_cgroup_alloc() to obtain a new
memcg object and configure it and the system according to the role
of the new memory-controlled cgroup in the hierarchy.

3. mem_cgroup_css_online() is no longer needed to synchronize with
iterators, but it verifies css->id which isn't available earlier.

4. mem_cgroup_css_offline() implements stuff that needs to happen upon
the user-visible destruction of a cgroup, which includes stopping
all user interfacing as well as releasing certain structures when
continued memory consumption would be unexpected at that point.

5. mem_cgroup_css_free() prepares the system and the memcg object for
the object's disappearance, neutralizes its state, and then gives
it back to mem_cgroup_free().

6. mem_cgroup_free() releases struct mem_cgroup and auxiliary memory.

Signed-off-by: Johannes Weiner <[email protected]>
---
include/linux/memcontrol.h | 3 -
mm/memcontrol.c | 225 +++++++++++++++------------------------------
2 files changed, 75 insertions(+), 153 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index a3869bf..27123e5 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -181,9 +181,6 @@ struct mem_cgroup {
/* vmpressure notifications */
struct vmpressure vmpressure;

- /* css_online() has been completed */
- int initialized;
-
/*
* Should the accounting and control be hierarchical, per subtree?
*/
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 306842c..af8714a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -896,17 +896,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
if (css == &root->css)
break;

- if (css_tryget(css)) {
- /*
- * Make sure the memcg is initialized:
- * mem_cgroup_css_online() orders the the
- * initialization against setting the flag.
- */
- if (smp_load_acquire(&memcg->initialized))
- break;
-
- css_put(css);
- }
+ if (css_tryget(css))
+ break;

memcg = NULL;
}
@@ -2851,37 +2842,14 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
#ifndef CONFIG_SLOB
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
- int err = 0;
int memcg_id;

BUG_ON(memcg->kmemcg_id >= 0);
BUG_ON(memcg->kmem_state);

- /*
- * For simplicity, we won't allow this to be disabled. It also can't
- * be changed if the cgroup has children already, or if tasks had
- * already joined.
- *
- * If tasks join before we set the limit, a person looking at
- * kmem.usage_in_bytes will have no way to determine when it took
- * place, which makes the value quite meaningless.
- *
- * After it first became limited, changes in the value of the limit are
- * of course permitted.
- */
- mutex_lock(&memcg_create_mutex);
- if (cgroup_is_populated(memcg->css.cgroup) ||
- (memcg->use_hierarchy && memcg_has_children(memcg)))
- err = -EBUSY;
- mutex_unlock(&memcg_create_mutex);
- if (err)
- goto out;
-
memcg_id = memcg_alloc_cache_id();
- if (memcg_id < 0) {
- err = memcg_id;
- goto out;
- }
+ if (memcg_id < 0)
+ return memcg_id;

static_branch_inc(&memcg_kmem_enabled_key);
/*
@@ -2892,17 +2860,14 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
*/
memcg->kmemcg_id = memcg_id;
memcg->kmem_state = KMEM_ONLINE;
-out:
- return err;
+
+ return 0;
}

-static int memcg_propagate_kmem(struct mem_cgroup *memcg)
+static int memcg_propagate_kmem(struct mem_cgroup *parent,
+ struct mem_cgroup *memcg)
{
int ret = 0;
- struct mem_cgroup *parent = parent_mem_cgroup(memcg);
-
- if (!parent)
- return 0;

mutex_lock(&memcg_limit_mutex);
/*
@@ -2986,11 +2951,18 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
unsigned long limit)
{
- int ret;
+ int ret = 0;

mutex_lock(&memcg_limit_mutex);
/* Top-level cgroup doesn't propagate from root */
if (!memcg_kmem_online(memcg)) {
+ mutex_lock(&memcg_create_mutex);
+ if (cgroup_is_populated(memcg->css.cgroup) ||
+ (memcg->use_hierarchy && memcg_has_children(memcg)))
+ ret = -EBUSY;
+ mutex_unlock(&memcg_create_mutex);
+ if (ret)
+ goto out;
ret = memcg_online_kmem(memcg);
if (ret)
goto out;
@@ -4145,90 +4117,44 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
kfree(memcg->nodeinfo[node]);
}

-static struct mem_cgroup *mem_cgroup_alloc(void)
-{
- struct mem_cgroup *memcg;
- size_t size;
-
- size = sizeof(struct mem_cgroup);
- size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
-
- memcg = kzalloc(size, GFP_KERNEL);
- if (!memcg)
- return NULL;
-
- memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
- if (!memcg->stat)
- goto out_free;
-
- if (memcg_wb_domain_init(memcg, GFP_KERNEL))
- goto out_free_stat;
-
- return memcg;
-
-out_free_stat:
- free_percpu(memcg->stat);
-out_free:
- kfree(memcg);
- return NULL;
-}
-
-/*
- * At destroying mem_cgroup, references from swap_cgroup can remain.
- * (scanning all at force_empty is too costly...)
- *
- * Instead of clearing all references at force_empty, we remember
- * the number of reference from swap_cgroup and free mem_cgroup when
- * it goes down to 0.
- *
- * Removal of cgroup itself succeeds regardless of refs from swap.
- */
-
-static void __mem_cgroup_free(struct mem_cgroup *memcg)
+static void mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;

- cancel_work_sync(&memcg->high_work);
-
- mem_cgroup_remove_from_trees(memcg);
-
+ memcg_wb_domain_exit(memcg);
for_each_node(node)
free_mem_cgroup_per_zone_info(memcg, node);
-
free_percpu(memcg->stat);
- memcg_wb_domain_exit(memcg);
kfree(memcg);
}

-static struct cgroup_subsys_state * __ref
-mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
+static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *memcg;
- long error = -ENOMEM;
+ size_t size;
int node;

- memcg = mem_cgroup_alloc();
+ size = sizeof(struct mem_cgroup);
+ size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
+
+ memcg = kzalloc(size, GFP_KERNEL);
if (!memcg)
- return ERR_PTR(error);
+ return NULL;
+
+ memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
+ if (!memcg->stat)
+ goto fail;

for_each_node(node)
if (alloc_mem_cgroup_per_zone_info(memcg, node))
- goto free_out;
+ goto fail;

- /* root ? */
- if (parent_css == NULL) {
- root_mem_cgroup = memcg;
- page_counter_init(&memcg->memory, NULL);
- memcg->high = PAGE_COUNTER_MAX;
- memcg->soft_limit = PAGE_COUNTER_MAX;
- page_counter_init(&memcg->memsw, NULL);
- page_counter_init(&memcg->kmem, NULL);
- }
+ if (memcg_wb_domain_init(memcg, GFP_KERNEL))
+ goto fail;

INIT_WORK(&memcg->high_work, high_work_func);
memcg->last_scanned_node = MAX_NUMNODES;
INIT_LIST_HEAD(&memcg->oom_notify);
- memcg->move_charge_at_immigrate = 0;
mutex_init(&memcg->thresholds_lock);
spin_lock_init(&memcg->move_lock);
vmpressure_init(&memcg->vmpressure);
@@ -4241,48 +4167,37 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
#endif
- return &memcg->css;
-
-free_out:
- __mem_cgroup_free(memcg);
- return ERR_PTR(error);
+ return memcg;
+fail:
+ mem_cgroup_free(memcg);
+ return NULL;
}

-static int
-mem_cgroup_css_online(struct cgroup_subsys_state *css)
+static struct cgroup_subsys_state * __ref
+mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- struct mem_cgroup *parent = mem_cgroup_from_css(css->parent);
- int ret;
-
- if (css->id > MEM_CGROUP_ID_MAX)
- return -ENOSPC;
+ struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
+ struct mem_cgroup *memcg;
+ long error = -ENOMEM;

- if (!parent)
- return 0;
+ memcg = mem_cgroup_alloc();
+ if (!memcg)
+ return ERR_PTR(error);

mutex_lock(&memcg_create_mutex);
-
- memcg->use_hierarchy = parent->use_hierarchy;
- memcg->oom_kill_disable = parent->oom_kill_disable;
- memcg->swappiness = mem_cgroup_swappiness(parent);
-
- if (parent->use_hierarchy) {
+ memcg->high = PAGE_COUNTER_MAX;
+ memcg->soft_limit = PAGE_COUNTER_MAX;
+ if (parent)
+ memcg->swappiness = mem_cgroup_swappiness(parent);
+ if (parent && parent->use_hierarchy) {
+ memcg->use_hierarchy = true;
+ memcg->oom_kill_disable = parent->oom_kill_disable;
page_counter_init(&memcg->memory, &parent->memory);
- memcg->high = PAGE_COUNTER_MAX;
- memcg->soft_limit = PAGE_COUNTER_MAX;
page_counter_init(&memcg->memsw, &parent->memsw);
page_counter_init(&memcg->kmem, &parent->kmem);
page_counter_init(&memcg->tcpmem, &parent->tcpmem);
-
- /*
- * No need to take a reference to the parent because cgroup
- * core guarantees its existence.
- */
} else {
page_counter_init(&memcg->memory, NULL);
- memcg->high = PAGE_COUNTER_MAX;
- memcg->soft_limit = PAGE_COUNTER_MAX;
page_counter_init(&memcg->memsw, NULL);
page_counter_init(&memcg->kmem, NULL);
page_counter_init(&memcg->tcpmem, NULL);
@@ -4296,19 +4211,30 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
}
mutex_unlock(&memcg_create_mutex);

- ret = memcg_propagate_kmem(memcg);
- if (ret)
- return ret;
+ /* The following stuff does not apply to the root */
+ if (!parent) {
+ root_mem_cgroup = memcg;
+ return &memcg->css;
+ }
+
+ error = memcg_propagate_kmem(parent, memcg);
+ if (error)
+ goto fail;

if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_inc(&memcg_sockets_enabled_key);

- /*
- * Make sure the memcg is initialized: mem_cgroup_iter()
- * orders reading memcg->initialized against its callers
- * reading the memcg members.
- */
- smp_store_release(&memcg->initialized, 1);
+ return &memcg->css;
+fail:
+ mem_cgroup_free(memcg);
+ return NULL;
+}
+
+static int
+mem_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+ if (css->id > MEM_CGROUP_ID_MAX)
+ return -ENOSPC;

return 0;
}
@@ -4330,10 +4256,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
}
spin_unlock(&memcg->event_list_lock);

- vmpressure_cleanup(&memcg->vmpressure);
-
memcg_offline_kmem(memcg);
-
wb_memcg_offline(memcg);
}

@@ -4347,9 +4270,11 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
static_branch_dec(&memcg_sockets_enabled_key);

+ vmpressure_cleanup(&memcg->vmpressure);
+ cancel_work_sync(&memcg->high_work);
+ mem_cgroup_remove_from_trees(memcg);
memcg_free_kmem(memcg);
-
- __mem_cgroup_free(memcg);
+ mem_cgroup_free(memcg);
}

/**
--
2.6.3

2015-12-12 16:33:48

by Vladimir Davydov

[permalink] [raw]
Subject: Re: [PATCH 2/4] mm: memcontrol: rein in the CONFIG space madness

On Fri, Dec 11, 2015 at 02:54:11PM -0500, Johannes Weiner wrote:
> What CONFIG_INET and CONFIG_LEGACY_KMEM guard inside the memory
> controller code is insignificant, having these conditionals is not
> worth the complication and fragility that comes with them.
>
> Signed-off-by: Johannes Weiner <[email protected]>

Acked-by: Vladimir Davydov <[email protected]>

> @@ -4374,17 +4342,11 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
> {
> struct mem_cgroup *memcg = mem_cgroup_from_css(css);
>
> -#ifdef CONFIG_INET
> if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
> static_branch_dec(&memcg_sockets_enabled_key);
> -#endif
> -
> - memcg_free_kmem(memcg);

I wonder where the second call to memcg_free_kmem comes from. Luckily,
it couldn't result in a breakage. And now it's removed.

>
> -#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET)
> if (memcg->tcp_mem.active)
> static_branch_dec(&memcg_sockets_enabled_key);
> -#endif
>
> memcg_free_kmem(memcg);
>

2015-12-12 16:39:36

by Vladimir Davydov

[permalink] [raw]
Subject: Re: [PATCH 3/4] mm: memcontrol: flatten struct cg_proto

On Fri, Dec 11, 2015 at 02:54:12PM -0500, Johannes Weiner wrote:
> There are no more external users of struct cg_proto, flatten the
> structure into struct mem_cgroup.
>
> Since using those struct members doesn't stand out as much anymore,
> add cgroup2 static branches to make it clearer which code is legacy.
>
> Suggested-by: Vladimir Davydov <[email protected]>
> Signed-off-by: Johannes Weiner <[email protected]>

Looks good to me, thanks!

Acked-by: Vladimir Davydov <[email protected]>

2015-12-12 17:21:10

by Johannes Weiner

[permalink] [raw]
Subject: Re: [PATCH 2/4] mm: memcontrol: rein in the CONFIG space madness

On Sat, Dec 12, 2015 at 07:33:32PM +0300, Vladimir Davydov wrote:
> On Fri, Dec 11, 2015 at 02:54:11PM -0500, Johannes Weiner wrote:
> > What CONFIG_INET and CONFIG_LEGACY_KMEM guard inside the memory
> > controller code is insignificant, having these conditionals is not
> > worth the complication and fragility that comes with them.
> >
> > Signed-off-by: Johannes Weiner <[email protected]>
>
> Acked-by: Vladimir Davydov <[email protected]>
>
> > @@ -4374,17 +4342,11 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
> > {
> > struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> >
> > -#ifdef CONFIG_INET
> > if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
> > static_branch_dec(&memcg_sockets_enabled_key);
> > -#endif
> > -
> > - memcg_free_kmem(memcg);
>
> I wonder where the second call to memcg_free_kmem comes from. Luckily,
> it couldn't result in a breakage. And now it's removed.

Lol, I had to double check my trees to see what's going on as I don't
remember this being part of the patch. But it looks like the double
free came from the "net: drop tcp_memcontrol.c" patch and I must have
removed it again during conflict resolution when rebasing this patch
on top of yours. I must have thought git's auto-merge added it.

However, this causes an underflow of the kmem static branch, so we
will have to fix this directly in "net: drop tcp_memcontrol.c".

Andrew, could you please pick this up? However, it's important to also
then remove the hunk above from THIS patch, the one that deletes the
excessive memcg_free_kmem(). We need exactly one memcg_free_kmem() in
mem_cgroup_css_free(). :-)

From 94a14b7b0f7ed5b5ac88ca285a7e8ec3215ea59c Mon Sep 17 00:00:00 2001
From: Johannes Weiner <[email protected]>
Date: Sat, 12 Dec 2015 12:14:31 -0500
Subject: [PATCH] net: drop tcp_memcontrol.c fix

Signed-off-by: Johannes Weiner <[email protected]>
---
mm/memcontrol.c | 2 --
1 file changed, 2 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5cf7fd2..a4ce8d3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4386,8 +4386,6 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
static_branch_dec(&memcg_sockets_enabled_key);
#endif

- memcg_free_kmem(memcg);
-
__mem_cgroup_free(memcg);
}

--
2.6.4

2015-12-14 17:15:18

by Vladimir Davydov

[permalink] [raw]
Subject: Re: [PATCH 4/4] mm: memcontrol: clean up alloc, online, offline, free functions

On Fri, Dec 11, 2015 at 02:54:13PM -0500, Johannes Weiner wrote:
...
> -static int
> -mem_cgroup_css_online(struct cgroup_subsys_state *css)
> +static struct cgroup_subsys_state * __ref
> +mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
> {
> - struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> - struct mem_cgroup *parent = mem_cgroup_from_css(css->parent);
> - int ret;
> -
> - if (css->id > MEM_CGROUP_ID_MAX)
> - return -ENOSPC;
> + struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
> + struct mem_cgroup *memcg;
> + long error = -ENOMEM;
>
> - if (!parent)
> - return 0;
> + memcg = mem_cgroup_alloc();
> + if (!memcg)
> + return ERR_PTR(error);
>
> mutex_lock(&memcg_create_mutex);

It is pointless to take memcg_create_mutex in ->css_alloc. It won't
prevent setting use_hierarchy for parent after a new child was
allocated, but before it was added to the list of children (see
create_css()). Taking the mutex in ->css_online renders this race
impossible. That is, your cleanup breaks use_hierarchy consistency
check.

Can we drop this use_hierarchy consistency check at all and allow
children of a cgroup with use_hierarchy=1 have use_hierarchy=0? Yeah,
that might result in some strangeness if cgroups are created in parallel
with use_hierarchy flipped, but is it a valid use case? I surmise, one
just sets use_hierarchy for a cgroup once and for good before starting
to create sub-cgroups.

> -
> - memcg->use_hierarchy = parent->use_hierarchy;
> - memcg->oom_kill_disable = parent->oom_kill_disable;
> - memcg->swappiness = mem_cgroup_swappiness(parent);
> -
> - if (parent->use_hierarchy) {
> + memcg->high = PAGE_COUNTER_MAX;
> + memcg->soft_limit = PAGE_COUNTER_MAX;
> + if (parent)
> + memcg->swappiness = mem_cgroup_swappiness(parent);
> + if (parent && parent->use_hierarchy) {
> + memcg->use_hierarchy = true;
> + memcg->oom_kill_disable = parent->oom_kill_disable;

oom_kill_disable was propagated to child cgroup despite use_hierarchy
configuration. I don't see any reason to change this.

> page_counter_init(&memcg->memory, &parent->memory);
> - memcg->high = PAGE_COUNTER_MAX;
> - memcg->soft_limit = PAGE_COUNTER_MAX;
> page_counter_init(&memcg->memsw, &parent->memsw);
> page_counter_init(&memcg->kmem, &parent->kmem);
> page_counter_init(&memcg->tcpmem, &parent->tcpmem);
> -
> - /*
> - * No need to take a reference to the parent because cgroup
> - * core guarantees its existence.
> - */
> } else {
> page_counter_init(&memcg->memory, NULL);
> - memcg->high = PAGE_COUNTER_MAX;
> - memcg->soft_limit = PAGE_COUNTER_MAX;
> page_counter_init(&memcg->memsw, NULL);
> page_counter_init(&memcg->kmem, NULL);
> page_counter_init(&memcg->tcpmem, NULL);
> @@ -4296,19 +4211,30 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
> }
> mutex_unlock(&memcg_create_mutex);
>
> - ret = memcg_propagate_kmem(memcg);
> - if (ret)
> - return ret;
> + /* The following stuff does not apply to the root */
> + if (!parent) {
> + root_mem_cgroup = memcg;
> + return &memcg->css;
> + }
> +
> + error = memcg_propagate_kmem(parent, memcg);

I don't think ->css_alloc is the right place for this function: if
create_css() fails after ->css_alloc and before ->css_online, it'll call
->css_free, which won't cleanup kmem properly.

> + if (error)
> + goto fail;
>
> if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
> static_branch_inc(&memcg_sockets_enabled_key);

Frankly, I don't get why this should live here either. This has nothing
to do with memcg allocation and looks rather like a preparation for
online.

>
> - /*
> - * Make sure the memcg is initialized: mem_cgroup_iter()
> - * orders reading memcg->initialized against its callers
> - * reading the memcg members.
> - */
> - smp_store_release(&memcg->initialized, 1);
> + return &memcg->css;
> +fail:
> + mem_cgroup_free(memcg);
> + return NULL;
> +}
> +
> +static int
> +mem_cgroup_css_online(struct cgroup_subsys_state *css)
> +{
> + if (css->id > MEM_CGROUP_ID_MAX)
> + return -ENOSPC;
>
> return 0;
> }
> @@ -4330,10 +4256,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
> }
> spin_unlock(&memcg->event_list_lock);
>
> - vmpressure_cleanup(&memcg->vmpressure);
> -
> memcg_offline_kmem(memcg);
> -
> wb_memcg_offline(memcg);
> }
>
> @@ -4347,9 +4270,11 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
> if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
> static_branch_dec(&memcg_sockets_enabled_key);
>
> + vmpressure_cleanup(&memcg->vmpressure);

vmpressure->work can be scheduled after offline, so ->css_free is
definitely the right place for vmpressure_cleanup. Looks like you've
just fixed a potential use-after-free bug.

Thanks,
Vladimir

> + cancel_work_sync(&memcg->high_work);
> + mem_cgroup_remove_from_trees(memcg);
> memcg_free_kmem(memcg);
> -
> - __mem_cgroup_free(memcg);
> + mem_cgroup_free(memcg);
> }
>
> /**

2015-12-15 19:39:23

by Johannes Weiner

[permalink] [raw]
Subject: Re: [PATCH 4/4] mm: memcontrol: clean up alloc, online, offline, free functions

Hi Vladimir,

On Mon, Dec 14, 2015 at 08:14:55PM +0300, Vladimir Davydov wrote:
> On Fri, Dec 11, 2015 at 02:54:13PM -0500, Johannes Weiner wrote:
> ...
> > -static int
> > -mem_cgroup_css_online(struct cgroup_subsys_state *css)
> > +static struct cgroup_subsys_state * __ref
> > +mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
> > {
> > - struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> > - struct mem_cgroup *parent = mem_cgroup_from_css(css->parent);
> > - int ret;
> > -
> > - if (css->id > MEM_CGROUP_ID_MAX)
> > - return -ENOSPC;
> > + struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
> > + struct mem_cgroup *memcg;
> > + long error = -ENOMEM;
> >
> > - if (!parent)
> > - return 0;
> > + memcg = mem_cgroup_alloc();
> > + if (!memcg)
> > + return ERR_PTR(error);
> >
> > mutex_lock(&memcg_create_mutex);
>
> It is pointless to take memcg_create_mutex in ->css_alloc. It won't
> prevent setting use_hierarchy for parent after a new child was
> allocated, but before it was added to the list of children (see
> create_css()). Taking the mutex in ->css_online renders this race
> impossible. That is, your cleanup breaks use_hierarchy consistency
> check.
>
> Can we drop this use_hierarchy consistency check at all and allow
> children of a cgroup with use_hierarchy=1 have use_hierarchy=0? Yeah,
> that might result in some strangeness if cgroups are created in parallel
> with use_hierarchy flipped, but is it a valid use case? I surmise, one
> just sets use_hierarchy for a cgroup once and for good before starting
> to create sub-cgroups.

I don't think we have to support airtight exclusion between somebody
changing the parent attribute and creating new children that inherit
these attributes. Everything will still work if this race happens.

Does that mean we have to remove the restriction altogether? I'm not
convinced. We can just keep it for historical purposes so that we do
not *encourage* this weird setting.

I think that's good enough. Let's just remove the memcg_create_mutex.

> > - memcg->use_hierarchy = parent->use_hierarchy;
> > - memcg->oom_kill_disable = parent->oom_kill_disable;
> > - memcg->swappiness = mem_cgroup_swappiness(parent);
> > -
> > - if (parent->use_hierarchy) {
> > + memcg->high = PAGE_COUNTER_MAX;
> > + memcg->soft_limit = PAGE_COUNTER_MAX;
> > + if (parent)
> > + memcg->swappiness = mem_cgroup_swappiness(parent);
> > + if (parent && parent->use_hierarchy) {
> > + memcg->use_hierarchy = true;
> > + memcg->oom_kill_disable = parent->oom_kill_disable;
>
> oom_kill_disable was propagated to child cgroup despite use_hierarchy
> configuration. I don't see any reason to change this.

Just a weird quirk of !use_hierarchy, I guess. Technically, if you're
not in hierarchy mode then your parent is the root, not whatever is in
the css tree above you. And root can't disable OOM killing. So I don't
really care either way. Restored to previous behavior.

> > @@ -4296,19 +4211,30 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
> > }
> > mutex_unlock(&memcg_create_mutex);
> >
> > - ret = memcg_propagate_kmem(memcg);
> > - if (ret)
> > - return ret;
> > + /* The following stuff does not apply to the root */
> > + if (!parent) {
> > + root_mem_cgroup = memcg;
> > + return &memcg->css;
> > + }
> > +
> > + error = memcg_propagate_kmem(parent, memcg);
>
> I don't think ->css_alloc is the right place for this function: if
> create_css() fails after ->css_alloc and before ->css_online, it'll call
> ->css_free, which won't cleanup kmem properly.

Hm, on the other hand, the static branch maintenance is done from
css_free. If online doesn't run we'll have an underflow there.

I think the easiest way to deal with this is to have memcg_free_kmem()
check if offlining has happened, and if not call memcg_offline_kmem().

> > + if (error)
> > + goto fail;
> >
> > if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
> > static_branch_inc(&memcg_sockets_enabled_key);
>
> Frankly, I don't get why this should live here either. This has nothing
> to do with memcg allocation and looks rather like a preparation for
> online.

It matches css_free() as per above. Unless we need synchronization
with showing up as a child in the parent, there is no need to use
online and fragment the initialization. So I'd rather consolidate.

> > @@ -4347,9 +4270,11 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
> > if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
> > static_branch_dec(&memcg_sockets_enabled_key);
> >
> > + vmpressure_cleanup(&memcg->vmpressure);
>
> vmpressure->work can be scheduled after offline, so ->css_free is
> definitely the right place for vmpressure_cleanup. Looks like you've
> just fixed a potential use-after-free bug.

Yay! Since nobody reported this problem, it should be okay to keep it
in this patch.

So, how about the following fixlets on top to address your comments?

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index af8714a..124a802 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -250,13 +250,6 @@ enum res_type {
/* Used for OOM nofiier */
#define OOM_CONTROL (0)

-/*
- * The memcg_create_mutex will be held whenever a new cgroup is created.
- * As a consequence, any change that needs to protect against new child cgroups
- * appearing has to hold it as well.
- */
-static DEFINE_MUTEX(memcg_create_mutex);
-
/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
@@ -2660,14 +2653,6 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
{
bool ret;

- /*
- * The lock does not prevent addition or deletion of children, but
- * it prevents a new child from being initialized based on this
- * parent in css_online(), so it's enough to decide whether
- * hierarchically inherited attributes can still be changed or not.
- */
- lockdep_assert_held(&memcg_create_mutex);
-
rcu_read_lock();
ret = css_next_child(NULL, &memcg->css);
rcu_read_unlock();
@@ -2730,10 +2715,8 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);

- mutex_lock(&memcg_create_mutex);
-
if (memcg->use_hierarchy == val)
- goto out;
+ return 0;

/*
* If parent's use_hierarchy is set, we can't make any modifications
@@ -2752,9 +2735,6 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
} else
retval = -EINVAL;

-out:
- mutex_unlock(&memcg_create_mutex);
-
return retval;
}

@@ -2929,6 +2909,10 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)

static void memcg_free_kmem(struct mem_cgroup *memcg)
{
+ /* css_alloc() failed, offlining didn't happen */
+ if (unlikely(memcg->kmem_state == KMEM_ONLINE))
+ memcg_offline_kmem(memcg);
+
if (memcg->kmem_state == KMEM_ALLOCATED) {
memcg_destroy_kmem_caches(memcg);
static_branch_dec(&memcg_kmem_enabled_key);
@@ -2956,11 +2940,9 @@ static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
mutex_lock(&memcg_limit_mutex);
/* Top-level cgroup doesn't propagate from root */
if (!memcg_kmem_online(memcg)) {
- mutex_lock(&memcg_create_mutex);
if (cgroup_is_populated(memcg->css.cgroup) ||
(memcg->use_hierarchy && memcg_has_children(memcg)))
ret = -EBUSY;
- mutex_unlock(&memcg_create_mutex);
if (ret)
goto out;
ret = memcg_online_kmem(memcg);
@@ -4184,14 +4166,14 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (!memcg)
return ERR_PTR(error);

- mutex_lock(&memcg_create_mutex);
memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
- if (parent)
+ if (parent) {
memcg->swappiness = mem_cgroup_swappiness(parent);
+ memcg->oom_kill_disable = parent->oom_kill_disable;
+ }
if (parent && parent->use_hierarchy) {
memcg->use_hierarchy = true;
- memcg->oom_kill_disable = parent->oom_kill_disable;
page_counter_init(&memcg->memory, &parent->memory);
page_counter_init(&memcg->memsw, &parent->memsw);
page_counter_init(&memcg->kmem, &parent->kmem);
@@ -4209,7 +4191,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (parent != root_mem_cgroup)
memory_cgrp_subsys.broken_hierarchy = true;
}
- mutex_unlock(&memcg_create_mutex);

/* The following stuff does not apply to the root */
if (!parent) {

2015-12-16 12:17:48

by Vladimir Davydov

[permalink] [raw]
Subject: Re: [PATCH 4/4] mm: memcontrol: clean up alloc, online, offline, free functions

On Tue, Dec 15, 2015 at 02:38:58PM -0500, Johannes Weiner wrote:
> On Mon, Dec 14, 2015 at 08:14:55PM +0300, Vladimir Davydov wrote:
> > On Fri, Dec 11, 2015 at 02:54:13PM -0500, Johannes Weiner wrote:
> > ...
> > > -static int
> > > -mem_cgroup_css_online(struct cgroup_subsys_state *css)
> > > +static struct cgroup_subsys_state * __ref
> > > +mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
> > > {
> > > - struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> > > - struct mem_cgroup *parent = mem_cgroup_from_css(css->parent);
> > > - int ret;
> > > -
> > > - if (css->id > MEM_CGROUP_ID_MAX)
> > > - return -ENOSPC;
> > > + struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
> > > + struct mem_cgroup *memcg;
> > > + long error = -ENOMEM;
> > >
> > > - if (!parent)
> > > - return 0;
> > > + memcg = mem_cgroup_alloc();
> > > + if (!memcg)
> > > + return ERR_PTR(error);
> > >
> > > mutex_lock(&memcg_create_mutex);
> >
> > It is pointless to take memcg_create_mutex in ->css_alloc. It won't
> > prevent setting use_hierarchy for parent after a new child was
> > allocated, but before it was added to the list of children (see
> > create_css()). Taking the mutex in ->css_online renders this race
> > impossible. That is, your cleanup breaks use_hierarchy consistency
> > check.
> >
> > Can we drop this use_hierarchy consistency check at all and allow
> > children of a cgroup with use_hierarchy=1 have use_hierarchy=0? Yeah,
> > that might result in some strangeness if cgroups are created in parallel
> > with use_hierarchy flipped, but is it a valid use case? I surmise, one
> > just sets use_hierarchy for a cgroup once and for good before starting
> > to create sub-cgroups.
>
> I don't think we have to support airtight exclusion between somebody
> changing the parent attribute and creating new children that inherit
> these attributes. Everything will still work if this race happens.
>
> Does that mean we have to remove the restriction altogether? I'm not
> convinced. We can just keep it for historical purposes so that we do
> not *encourage* this weird setting.

Well, legacy hierarchy is scheduled to die, so it's too late to
encourage or discourage any setting regarding it.

Besides, hierarchy mode must be enabled for 99% setups, because this is
what systemd does at startup. So I don't think we would hurt anybody by
dropping this check altogether - IMO it'd be fairer than having a check
that might sometimes fail.

It's not something I really care about though, so I don't insist.

>
> I think that's good enough. Let's just remove the memcg_create_mutex.

I'm fine with it, but I think this deserves a comment in the commit
message.

...
> So, how about the following fixlets on top to address your comments?
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index af8714a..124a802 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -250,13 +250,6 @@ enum res_type {
> /* Used for OOM nofiier */
> #define OOM_CONTROL (0)
>
> -/*
> - * The memcg_create_mutex will be held whenever a new cgroup is created.
> - * As a consequence, any change that needs to protect against new child cgroups
> - * appearing has to hold it as well.
> - */
> -static DEFINE_MUTEX(memcg_create_mutex);
> -
> /* Some nice accessors for the vmpressure. */
> struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
> {
> @@ -2660,14 +2653,6 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
> {
> bool ret;
>
> - /*
> - * The lock does not prevent addition or deletion of children, but
> - * it prevents a new child from being initialized based on this
> - * parent in css_online(), so it's enough to decide whether
> - * hierarchically inherited attributes can still be changed or not.
> - */
> - lockdep_assert_held(&memcg_create_mutex);
> -
> rcu_read_lock();
> ret = css_next_child(NULL, &memcg->css);
> rcu_read_unlock();
> @@ -2730,10 +2715,8 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
> struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
>
> - mutex_lock(&memcg_create_mutex);
> -
> if (memcg->use_hierarchy == val)
> - goto out;
> + return 0;
>
> /*
> * If parent's use_hierarchy is set, we can't make any modifications
> @@ -2752,9 +2735,6 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
> } else
> retval = -EINVAL;
>
> -out:
> - mutex_unlock(&memcg_create_mutex);
> -
> return retval;
> }
>
> @@ -2929,6 +2909,10 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
>
> static void memcg_free_kmem(struct mem_cgroup *memcg)
> {
> + /* css_alloc() failed, offlining didn't happen */
> + if (unlikely(memcg->kmem_state == KMEM_ONLINE))

It's not a hot-path, so there's no need in using 'unlikely' here apart
from improving readability, but the comment should be enough.

> + memcg_offline_kmem(memcg);
> +

Calling 'offline' from css_free looks a little bit awkward, but let it
be.

Anyway, it's a really nice cleanup, thanks!

Acked-by: Vladimir Davydov <[email protected]>

> if (memcg->kmem_state == KMEM_ALLOCATED) {
> memcg_destroy_kmem_caches(memcg);
> static_branch_dec(&memcg_kmem_enabled_key);
> @@ -2956,11 +2940,9 @@ static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
> mutex_lock(&memcg_limit_mutex);
> /* Top-level cgroup doesn't propagate from root */
> if (!memcg_kmem_online(memcg)) {
> - mutex_lock(&memcg_create_mutex);
> if (cgroup_is_populated(memcg->css.cgroup) ||
> (memcg->use_hierarchy && memcg_has_children(memcg)))
> ret = -EBUSY;
> - mutex_unlock(&memcg_create_mutex);
> if (ret)
> goto out;
> ret = memcg_online_kmem(memcg);
> @@ -4184,14 +4166,14 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
> if (!memcg)
> return ERR_PTR(error);
>
> - mutex_lock(&memcg_create_mutex);
> memcg->high = PAGE_COUNTER_MAX;
> memcg->soft_limit = PAGE_COUNTER_MAX;
> - if (parent)
> + if (parent) {
> memcg->swappiness = mem_cgroup_swappiness(parent);
> + memcg->oom_kill_disable = parent->oom_kill_disable;
> + }
> if (parent && parent->use_hierarchy) {
> memcg->use_hierarchy = true;
> - memcg->oom_kill_disable = parent->oom_kill_disable;
> page_counter_init(&memcg->memory, &parent->memory);
> page_counter_init(&memcg->memsw, &parent->memsw);
> page_counter_init(&memcg->kmem, &parent->kmem);
> @@ -4209,7 +4191,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
> if (parent != root_mem_cgroup)
> memory_cgrp_subsys.broken_hierarchy = true;
> }
> - mutex_unlock(&memcg_create_mutex);
>
> /* The following stuff does not apply to the root */
> if (!parent) {
>

2015-12-17 00:44:30

by Johannes Weiner

[permalink] [raw]
Subject: Re: [PATCH 4/4] mm: memcontrol: clean up alloc, online, offline, free functions

On Wed, Dec 16, 2015 at 03:17:27PM +0300, Vladimir Davydov wrote:
> On Tue, Dec 15, 2015 at 02:38:58PM -0500, Johannes Weiner wrote:
> > On Mon, Dec 14, 2015 at 08:14:55PM +0300, Vladimir Davydov wrote:
> > > On Fri, Dec 11, 2015 at 02:54:13PM -0500, Johannes Weiner wrote:
> > > ...
> > > > -static int
> > > > -mem_cgroup_css_online(struct cgroup_subsys_state *css)
> > > > +static struct cgroup_subsys_state * __ref
> > > > +mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
> > > > {
> > > > - struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> > > > - struct mem_cgroup *parent = mem_cgroup_from_css(css->parent);
> > > > - int ret;
> > > > -
> > > > - if (css->id > MEM_CGROUP_ID_MAX)
> > > > - return -ENOSPC;
> > > > + struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
> > > > + struct mem_cgroup *memcg;
> > > > + long error = -ENOMEM;
> > > >
> > > > - if (!parent)
> > > > - return 0;
> > > > + memcg = mem_cgroup_alloc();
> > > > + if (!memcg)
> > > > + return ERR_PTR(error);
> > > >
> > > > mutex_lock(&memcg_create_mutex);
> > >
> > > It is pointless to take memcg_create_mutex in ->css_alloc. It won't
> > > prevent setting use_hierarchy for parent after a new child was
> > > allocated, but before it was added to the list of children (see
> > > create_css()). Taking the mutex in ->css_online renders this race
> > > impossible. That is, your cleanup breaks use_hierarchy consistency
> > > check.
> > >
> > > Can we drop this use_hierarchy consistency check at all and allow
> > > children of a cgroup with use_hierarchy=1 have use_hierarchy=0? Yeah,
> > > that might result in some strangeness if cgroups are created in parallel
> > > with use_hierarchy flipped, but is it a valid use case? I surmise, one
> > > just sets use_hierarchy for a cgroup once and for good before starting
> > > to create sub-cgroups.
> >
> > I don't think we have to support airtight exclusion between somebody
> > changing the parent attribute and creating new children that inherit
> > these attributes. Everything will still work if this race happens.
> >
> > Does that mean we have to remove the restriction altogether? I'm not
> > convinced. We can just keep it for historical purposes so that we do
> > not *encourage* this weird setting.
>
> Well, legacy hierarchy is scheduled to die, so it's too late to
> encourage or discourage any setting regarding it.

That's the main reason I don't want to blatantly change the interface
at this point :)

> Besides, hierarchy mode must be enabled for 99% setups, because this is
> what systemd does at startup. So I don't think we would hurt anybody by
> dropping this check altogether - IMO it'd be fairer than having a check
> that might sometimes fail.

Yeah, I don't actually think anybody will run into this in practice,
but also want to keep changes to the legacy interface, user-visible or
not, at a minimum: fixing bugs and refactoring code for v2, basically.

> It's not something I really care about though, so I don't insist.

Thanks! I left it for now just to be safe.

> > @@ -2929,6 +2909,10 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
> >
> > static void memcg_free_kmem(struct mem_cgroup *memcg)
> > {
> > + /* css_alloc() failed, offlining didn't happen */
> > + if (unlikely(memcg->kmem_state == KMEM_ONLINE))
>
> It's not a hot-path, so there's no need in using 'unlikely' here apart
> from improving readability, but the comment should be enough.

Yeah it was entirely done for readability to reinforce that we
(almost) never expect this to happen. The comment elaborates and
explains it a bit, but the unlikely() carries the main signal.

> > + memcg_offline_kmem(memcg);
> > +
>
> Calling 'offline' from css_free looks a little bit awkward, but let it
> be.
>
> Anyway, it's a really nice cleanup, thanks!
>
> Acked-by: Vladimir Davydov <[email protected]>

Thanks!

2015-12-22 23:11:41

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 2/4] mm: memcontrol: reign in the CONFIG space madness

On Sat, 12 Dec 2015 12:20:57 -0500 Johannes Weiner <[email protected]> wrote:

> On Sat, Dec 12, 2015 at 07:33:32PM +0300, Vladimir Davydov wrote:
> > On Fri, Dec 11, 2015 at 02:54:11PM -0500, Johannes Weiner wrote:
> > > What CONFIG_INET and CONFIG_LEGACY_KMEM guard inside the memory
> > > controller code is insignificant, having these conditionals is not
> > > worth the complication and fragility that comes with them.
> > >
> > > Signed-off-by: Johannes Weiner <[email protected]>
> >
> > Acked-by: Vladimir Davydov <[email protected]>
> >
> > > @@ -4374,17 +4342,11 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
> > > {
> > > struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> > >
> > > -#ifdef CONFIG_INET
> > > if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
> > > static_branch_dec(&memcg_sockets_enabled_key);
> > > -#endif
> > > -
> > > - memcg_free_kmem(memcg);
> >
> > I wonder where the second call to memcg_free_kmem comes from. Luckily,
> > it couldn't result in a breakage. And now it's removed.
>
> Lol, I had to double check my trees to see what's going on as I don't
> remember this being part of the patch. But it looks like the double
> free came from the "net: drop tcp_memcontrol.c" patch and I must have
> removed it again during conflict resolution when rebasing this patch
> on top of yours. I must have thought git's auto-merge added it.
>
> However, this causes an underflow of the kmem static branch, so we
> will have to fix this directly in "net: drop tcp_memcontrol.c".
>
> Andrew, could you please pick this up? However, it's important to also
> then remove the hunk above from THIS patch, the one that deletes the
> excessive memcg_free_kmem(). We need exactly one memcg_free_kmem() in
> mem_cgroup_css_free(). :-)

So you want to retain
mm-memcontrol-reign-in-the-config-space-madness.patch's removal of the
ifdef CONFIG_INET?


What I have is

Against net-drop-tcp_memcontrolc.patch:

--- a/mm/memcontrol.c~net-drop-tcp_memcontrolc-fix
+++ a/mm/memcontrol.c
@@ -4421,8 +4421,6 @@ static void mem_cgroup_css_free(struct c
static_branch_dec(&memcg_sockets_enabled_key);
#endif

- memcg_free_kmem(memcg);
-
__mem_cgroup_free(memcg);
}


and against mm-memcontrol-reign-in-the-config-space-madness.patch:

--- a/mm/memcontrol.c~mm-memcontrol-reign-in-the-config-space-madness-fix
+++ a/mm/memcontrol.c
@@ -4380,6 +4380,8 @@ static void mem_cgroup_css_free(struct c
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_dec(&memcg_sockets_enabled_key);

+ memcg_free_kmem(memcg);
+
if (memcg->tcp_mem.active)
static_branch_dec(&memcg_sockets_enabled_key);


Producing

static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);

if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_dec(&memcg_sockets_enabled_key);

memcg_free_kmem(memcg);

if (memcg->tcp_mem.active)
static_branch_dec(&memcg_sockets_enabled_key);

__mem_cgroup_free(memcg);
}

And I did s/reign/rein/;)

2015-12-22 23:15:30

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 2/4] mm: memcontrol: reign in the CONFIG space madness

On Tue, 22 Dec 2015 15:11:38 -0800 Andrew Morton <[email protected]> wrote:

> What I have is

And after a bit of reject resolution in
mm-memcontrol-clean-up-alloc-online-offline-free-functions.patch we
have


static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);

if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_dec(&memcg_sockets_enabled_key);

vmpressure_cleanup(&memcg->vmpressure);
cancel_work_sync(&memcg->high_work);
mem_cgroup_remove_from_trees(memcg);
memcg_free_kmem(memcg);

if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
static_branch_dec(&memcg_sockets_enabled_key);

mem_cgroup_free(memcg);
}

code looks a bit strange. Can we move the static_branch_dec's together
and run cgroup_subsys_on_dfl just once?

2015-12-22 23:38:36

by Johannes Weiner

[permalink] [raw]
Subject: Re: [PATCH 2/4] mm: memcontrol: reign in the CONFIG space madness

On Tue, Dec 22, 2015 at 03:15:27PM -0800, Andrew Morton wrote:
> On Tue, 22 Dec 2015 15:11:38 -0800 Andrew Morton <[email protected]> wrote:
>
> > What I have is
>
> And after a bit of reject resolution in
> mm-memcontrol-clean-up-alloc-online-offline-free-functions.patch we
> have
>
>
> static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
> {
> struct mem_cgroup *memcg = mem_cgroup_from_css(css);
>
> if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
> static_branch_dec(&memcg_sockets_enabled_key);
>
> vmpressure_cleanup(&memcg->vmpressure);
> cancel_work_sync(&memcg->high_work);
> mem_cgroup_remove_from_trees(memcg);
> memcg_free_kmem(memcg);
>
> if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
> static_branch_dec(&memcg_sockets_enabled_key);
>
> mem_cgroup_free(memcg);
> }
>
> code looks a bit strange. Can we move the static_branch_dec's together
> and run cgroup_subsys_on_dfl just once?

Thanks for fixing it up. I think we can at least put the branches next
to each other. Here is what I have in my local tree:

static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);

if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_dec(&memcg_sockets_enabled_key);

if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
static_branch_dec(&memcg_sockets_enabled_key);

vmpressure_cleanup(&memcg->vmpressure);
cancel_work_sync(&memcg->high_work);
mem_cgroup_remove_from_trees(memcg);
memcg_free_kmem(memcg);
mem_cgroup_free(memcg);
}

However, I don't think turning it into this would be an improvement:

if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
if (!cgroup_memory_nosocket)
static_branch_dec(&memcg_sockets_enabled_key);
} else if (memcg->tcpmem_active) {
static_branch_dec(&memcg_sockets_enabled_key);
}

Plus, I'm a little worried that conflating cgroup and cgroup2 blocks
will get us into trouble. Yeah, that code looks a little unusual, but
I can't help but think it's easier to follow the code flow for one
particular mode when the jump labels are always explicit. Then the
brain can easily pattern-match and ignore blocks of the other mode.
It doesn't work the same when we hide keywords in implicit else ifs.