2024-05-03 20:19:04

by Roman Gushchin

Subject: [PATCH v1 0/4] Page counters optimizations

From: Roman Gushchin <[email protected]>

This patchset reorganizes the page_counter structures, which makes
the memory cgroup and hugetlb cgroup structures smaller (by 20%-35%)
and more cache-efficient. It also eliminates the tracking of protected
memory usage where it isn't needed.
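
The gist of the reorganization, as a simplified sketch (the names follow
the patches below; the stand-in types are for illustration only, and
kernel details such as cacheline padding and atomics are omitted):

  /* One counter slot per tracked resource type, instead of one full
   * page_counter object (with its own parent pointer and protection
   * fields) per resource. */
  enum mem_counter_type {
          MCT_MEMORY,
          MCT_SWAP,
          MCT_KMEM,
          MCT_TCPMEM,
          __MCT_NR_ITEMS,
  };

  struct page_counter_sketch {
          long usage[__MCT_NR_ITEMS];
          long max[__MCT_NR_ITEMS];
          long min, low;                      /* protection: first slot only */
          struct page_counter_sketch *parent; /* one parent chain, not four */
  };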

Roman Gushchin (4):
mm: memcg: convert enum res_type to mem_counter_type
mm: memcg: merge multiple page_counters into a single structure
mm: memcg: don't call propagate_protected_usage() needlessly
mm: page_counters: initialize usage using ATOMIC_LONG_INIT() macro

include/linux/hugetlb.h | 4 +-
include/linux/hugetlb_cgroup.h | 9 +-
include/linux/memcontrol.h | 15 +-
include/linux/page_counter.h | 88 +++++++--
mm/hugetlb.c | 14 +-
mm/hugetlb_cgroup.c | 150 +++++---------
mm/memcontrol.c | 351 ++++++++++++++-------------------
mm/page_counter.c | 76 ++++---
8 files changed, 340 insertions(+), 367 deletions(-)

--
2.43.2



2024-05-03 20:19:29

by Roman Gushchin

Subject: [PATCH v1 3/4] mm: memcg: don't call propagate_protected_usage() needlessly

Memory protection (min/low) requires constant tracking of protected
memory usage. propagate_protected_usage() is called on every charge
and uncharge and performs a number of operations even in cases when
the memory protection functionality is not supported at all
(e.g. hugetlb cgroups or memcg swap counters). This is an obvious
inefficiency, which can be addressed by calling
propagate_protected_usage() optionally and only for the right
counter type. This eliminates a number of operations from hot paths.
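
A toy sketch of the gating (stand-in types and names, for illustration
only; the real change is in the diff below):

  #include <stdbool.h>
  #include <stdio.h>

  struct counter {
          bool protection_support;        /* set once at init time */
          long usage;
  };

  static void propagate_protected_usage(struct counter *c, long new_usage)
  {
          /* in the kernel this walks parents and does several atomic ops */
          printf("propagating %ld\n", new_usage);
  }

  static void counter_charge(struct counter *c, int id, long nr_pages)
  {
          c->usage += nr_pages;
          /* the point of the patch: skip the work entirely unless this
           * counter opted in and the charge is for the first (memory) id */
          if (c->protection_support && id == 0)
                  propagate_protected_usage(c, c->usage);
  }

  int main(void)
  {
          struct counter memcg_memory = { .protection_support = true };
          struct counter hugetlb_res = { .protection_support = false };

          counter_charge(&memcg_memory, 0, 4);    /* propagates */
          counter_charge(&hugetlb_res, 0, 4);     /* skipped */
          return 0;
  }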

Signed-off-by: Roman Gushchin <[email protected]>
---
include/linux/page_counter.h | 8 +++++++-
mm/hugetlb_cgroup.c | 2 +-
mm/memcontrol.c | 4 ++--
mm/page_counter.c | 15 ++++++++++++---
4 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index ae6cc080b78b..5d963f54fcb8 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -62,6 +62,7 @@ struct page_counter {
/* Keep all the read most fields in a separete cacheline. */
CACHELINE_PADDING(_pad2_);

+ bool protection_support;
unsigned long min;
unsigned long low;
unsigned long high[__MCT_NR_ITEMS];
@@ -75,8 +76,12 @@ struct page_counter {
#define PAGE_COUNTER_MAX (LONG_MAX / PAGE_SIZE)
#endif

+/*
+ * Protection is supported only for the first counter (with id 0).
+ */
static inline void page_counter_init(struct page_counter *counter,
- struct page_counter *parent)
+ struct page_counter *parent,
+ bool protection_support)
{
int i;

@@ -86,6 +91,7 @@ static inline void page_counter_init(struct page_counter *counter,
}

counter->parent = parent;
+ counter->protection_support = protection_support;
}

static inline unsigned long page_counter_read(struct page_counter *counter,
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 069c7f164dc5..81cb78d0714f 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -79,7 +79,7 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
int idx;

page_counter_init(&h_cgroup->res,
- parent_h_cgroup ? &parent_h_cgroup->res : NULL);
+ parent_h_cgroup ? &parent_h_cgroup->res : NULL, false);

for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
unsigned long limit;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f4511506ef1f..46becae5ff99 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5761,11 +5761,11 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (parent) {
WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable));
- page_counter_init(&memcg->memory, &parent->memory);
+ page_counter_init(&memcg->memory, &parent->memory, true);
} else {
init_memcg_stats();
init_memcg_events();
- page_counter_init(&memcg->memory, NULL);
+ page_counter_init(&memcg->memory, NULL, true);
root_mem_cgroup = memcg;
return &memcg->css;
}
diff --git a/mm/page_counter.c b/mm/page_counter.c
index b6ca3adbc226..5a27e3141ff3 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -60,7 +60,8 @@ void page_counter_cancel(struct page_counter *counter,
new = 0;
atomic_long_set(&counter->usage[id], new);
}
- propagate_protected_usage(counter, new);
+ if (counter->protection_support && id == 0)
+ propagate_protected_usage(counter, new);
}

/**
@@ -76,12 +77,14 @@ void page_counter_charge(struct page_counter *counter,
unsigned long nr_pages)
{
struct page_counter *c;
+ bool track_protection = counter->protection_support && (id == 0);

for (c = counter; c; c = c->parent) {
long new;

new = atomic_long_add_return(nr_pages, &c->usage[id]);
- propagate_protected_usage(c, new);
+ if (track_protection)
+ propagate_protected_usage(c, new);
/*
* This is indeed racy, but we can live with some
* inaccuracy in the watermark.
@@ -107,6 +110,7 @@ bool page_counter_try_charge(struct page_counter *counter,
struct page_counter **fail)
{
struct page_counter *c;
+ bool track_protection = counter->protection_support && (id == 0);

for (c = counter; c; c = c->parent) {
long new;
@@ -136,7 +140,8 @@ bool page_counter_try_charge(struct page_counter *counter,
*fail = c;
goto failed;
}
- propagate_protected_usage(c, new);
+ if (track_protection)
+ propagate_protected_usage(c, new);
/*
* Just like with failcnt, we can live with some
* inaccuracy in the watermark.
@@ -226,6 +231,8 @@ void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
{
struct page_counter *c;

+ WARN_ON_ONCE(!counter->protection_support);
+
WRITE_ONCE(counter->min, nr_pages);

for (c = counter; c; c = c->parent)
@@ -243,6 +250,8 @@ void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
{
struct page_counter *c;

+ WARN_ON_ONCE(!counter->protection_support);
+
WRITE_ONCE(counter->low, nr_pages);

for (c = counter; c; c = c->parent)
--
2.43.2


2024-05-03 20:20:02

by Roman Gushchin

Subject: [PATCH v1 1/4] mm: memcg: convert enum res_type to mem_counter_type

The res_type enum lists all types of memory tracked by memory
cgroups: generic memory, swap, kernel memory, tcp, etc. It's
currently used only for encoding cft->private values of the
corresponding cgroup control files.
To prepare for tracking of various types of memory by a single
page_counter structure, a similar enumeration is needed. Instead
of introducing a completely new enumeration, let's re-purpose
the existing one: rename it into mem_counter_type, change items
names to be more meaningful and move to page_counter.h. The latter
is needed to have the total number of different memory types
available.

This change doesn't introduce any functional difference; it's a pure
refactoring.
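
For illustration, here is how the renamed enum plugs into the existing
cft->private encoding (the MEMFILE_* macros and the enum are copied from
the diff below; the RES_USAGE value is a stand-in for this example):

  #include <stdio.h>

  #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
  #define MEMFILE_TYPE(val)       ((val) >> 16 & 0xffff)
  #define MEMFILE_ATTR(val)       ((val) & 0xffff)

  enum mem_counter_type {
          MCT_MEMORY,             /* cgroup v1 and v2 */
          MCT_SWAP,               /* cgroup v2 only */
          MCT_MEMSW = MCT_SWAP,   /* cgroup v1 only */
          MCT_KMEM,               /* cgroup v1 only */
          MCT_TCPMEM,             /* cgroup v1 only */
  };

  #define RES_USAGE 0             /* stand-in value for this example */

  int main(void)
  {
          int priv = MEMFILE_PRIVATE(MCT_KMEM, RES_USAGE);

          printf("type=%d attr=%d\n", MEMFILE_TYPE(priv), MEMFILE_ATTR(priv));
          return 0;
  }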

Signed-off-by: Roman Gushchin <[email protected]>
---
include/linux/page_counter.h | 8 ++++
mm/memcontrol.c | 91 +++++++++++++++++-------------------
2 files changed, 50 insertions(+), 49 deletions(-)

diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 8cd858d912c4..2486f98a0c71 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -7,6 +7,14 @@
#include <linux/limits.h>
#include <asm/page.h>

+enum mem_counter_type {
+ MCT_MEMORY, /* cgroup v1 and v2 */
+ MCT_SWAP, /* cgroup v2 only */
+ MCT_MEMSW = MCT_SWAP, /* cgroup v1 only */
+ MCT_KMEM, /* cgroup v1 only */
+ MCT_TCPMEM, /* cgroup v1 only */
+};
+
struct page_counter {
/*
* Make sure 'usage' does not share cacheline with any other field. The
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a13d16cde372..894e5b6fe468 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -206,14 +206,6 @@ static struct move_charge_struct {
#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2

-/* for encoding cft->private value on file */
-enum res_type {
- _MEM,
- _MEMSWAP,
- _KMEM,
- _TCP,
-};
-
#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
@@ -4108,16 +4100,16 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
struct page_counter *counter;

switch (MEMFILE_TYPE(cft->private)) {
- case _MEM:
+ case MCT_MEMORY:
counter = &memcg->memory;
break;
- case _MEMSWAP:
+ case MCT_MEMSW:
counter = &memcg->memsw;
break;
- case _KMEM:
+ case MCT_KMEM:
counter = &memcg->kmem;
break;
- case _TCP:
+ case MCT_TCPMEM:
counter = &memcg->tcpmem;
break;
default:
@@ -4273,20 +4265,20 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
break;
}
switch (MEMFILE_TYPE(of_cft(of)->private)) {
- case _MEM:
+ case MCT_MEMORY:
ret = mem_cgroup_resize_max(memcg, nr_pages, false);
break;
- case _MEMSWAP:
+ case MCT_MEMSW:
ret = mem_cgroup_resize_max(memcg, nr_pages, true);
break;
- case _KMEM:
+ case MCT_KMEM:
pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
"Writing any value to this file has no effect. "
"Please report your usecase to [email protected] if you "
"depend on this functionality.\n");
ret = 0;
break;
- case _TCP:
+ case MCT_TCPMEM:
ret = memcg_update_tcp_max(memcg, nr_pages);
break;
}
@@ -4310,16 +4302,16 @@ static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
struct page_counter *counter;

switch (MEMFILE_TYPE(of_cft(of)->private)) {
- case _MEM:
+ case MCT_MEMORY:
counter = &memcg->memory;
break;
- case _MEMSWAP:
+ case MCT_MEMSW:
counter = &memcg->memsw;
break;
- case _KMEM:
+ case MCT_KMEM:
counter = &memcg->kmem;
break;
- case _TCP:
+ case MCT_TCPMEM:
counter = &memcg->tcpmem;
break;
default:
@@ -4706,7 +4698,8 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
}

static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd, const char *args, enum res_type type)
+ struct eventfd_ctx *eventfd, const char *args,
+ enum mem_counter_type type)
{
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
@@ -4720,10 +4713,10 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,

mutex_lock(&memcg->thresholds_lock);

- if (type == _MEM) {
+ if (type == MCT_MEMORY) {
thresholds = &memcg->thresholds;
usage = mem_cgroup_usage(memcg, false);
- } else if (type == _MEMSWAP) {
+ } else if (type == MCT_MEMSW) {
thresholds = &memcg->memsw_thresholds;
usage = mem_cgroup_usage(memcg, true);
} else
@@ -4731,7 +4724,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,

/* Check if a threshold crossed before adding a new one */
if (thresholds->primary)
- __mem_cgroup_threshold(memcg, type == _MEMSWAP);
+ __mem_cgroup_threshold(memcg, type == MCT_MEMSW);

size = thresholds->primary ? thresholds->primary->size + 1 : 1;

@@ -4788,17 +4781,17 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd, const char *args)
{
- return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
+ return __mem_cgroup_usage_register_event(memcg, eventfd, args, MCT_MEMORY);
}

static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd, const char *args)
{
- return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
+ return __mem_cgroup_usage_register_event(memcg, eventfd, args, MCT_MEMSW);
}

static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd, enum res_type type)
+ struct eventfd_ctx *eventfd, enum mem_counter_type type)
{
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
@@ -4807,10 +4800,10 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,

mutex_lock(&memcg->thresholds_lock);

- if (type == _MEM) {
+ if (type == MCT_MEMORY) {
thresholds = &memcg->thresholds;
usage = mem_cgroup_usage(memcg, false);
- } else if (type == _MEMSWAP) {
+ } else if (type == MCT_MEMSW) {
thresholds = &memcg->memsw_thresholds;
usage = mem_cgroup_usage(memcg, true);
} else
@@ -4820,7 +4813,7 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
goto unlock;

/* Check if a threshold crossed before removing */
- __mem_cgroup_threshold(memcg, type == _MEMSWAP);
+ __mem_cgroup_threshold(memcg, type == MCT_MEMSW);

/* Calculate new number of threshold */
size = entries = 0;
@@ -4885,13 +4878,13 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd)
{
- return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
+ return __mem_cgroup_usage_unregister_event(memcg, eventfd, MCT_MEMORY);
}

static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd)
{
- return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
+ return __mem_cgroup_usage_unregister_event(memcg, eventfd, MCT_MEMSW);
}

static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
@@ -5426,30 +5419,30 @@ static int memory_stat_show(struct seq_file *m, void *v);
static struct cftype mem_cgroup_legacy_files[] = {
{
.name = "usage_in_bytes",
- .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
+ .private = MEMFILE_PRIVATE(MCT_MEMORY, RES_USAGE),
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "max_usage_in_bytes",
- .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
+ .private = MEMFILE_PRIVATE(MCT_MEMORY, RES_MAX_USAGE),
.write = mem_cgroup_reset,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "limit_in_bytes",
- .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
+ .private = MEMFILE_PRIVATE(MCT_MEMORY, RES_LIMIT),
.write = mem_cgroup_write,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "soft_limit_in_bytes",
- .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
+ .private = MEMFILE_PRIVATE(MCT_MEMORY, RES_SOFT_LIMIT),
.write = mem_cgroup_write,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "failcnt",
- .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
+ .private = MEMFILE_PRIVATE(MCT_MEMORY, RES_FAILCNT),
.write = mem_cgroup_reset,
.read_u64 = mem_cgroup_read_u64,
},
@@ -5498,24 +5491,24 @@ static struct cftype mem_cgroup_legacy_files[] = {
#endif
{
.name = "kmem.limit_in_bytes",
- .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
+ .private = MEMFILE_PRIVATE(MCT_KMEM, RES_LIMIT),
.write = mem_cgroup_write,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.usage_in_bytes",
- .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
+ .private = MEMFILE_PRIVATE(MCT_KMEM, RES_USAGE),
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.failcnt",
- .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
+ .private = MEMFILE_PRIVATE(MCT_KMEM, RES_FAILCNT),
.write = mem_cgroup_reset,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.max_usage_in_bytes",
- .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
+ .private = MEMFILE_PRIVATE(MCT_KMEM, RES_MAX_USAGE),
.write = mem_cgroup_reset,
.read_u64 = mem_cgroup_read_u64,
},
@@ -5527,24 +5520,24 @@ static struct cftype mem_cgroup_legacy_files[] = {
#endif
{
.name = "kmem.tcp.limit_in_bytes",
- .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
+ .private = MEMFILE_PRIVATE(MCT_TCPMEM, RES_LIMIT),
.write = mem_cgroup_write,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.tcp.usage_in_bytes",
- .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
+ .private = MEMFILE_PRIVATE(MCT_TCPMEM, RES_USAGE),
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.tcp.failcnt",
- .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
+ .private = MEMFILE_PRIVATE(MCT_TCPMEM, RES_FAILCNT),
.write = mem_cgroup_reset,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.tcp.max_usage_in_bytes",
- .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
+ .private = MEMFILE_PRIVATE(MCT_TCPMEM, RES_MAX_USAGE),
.write = mem_cgroup_reset,
.read_u64 = mem_cgroup_read_u64,
},
@@ -8394,24 +8387,24 @@ static struct cftype swap_files[] = {
static struct cftype memsw_files[] = {
{
.name = "memsw.usage_in_bytes",
- .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
+ .private = MEMFILE_PRIVATE(MCT_MEMSW, RES_USAGE),
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "memsw.max_usage_in_bytes",
- .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
+ .private = MEMFILE_PRIVATE(MCT_MEMSW, RES_MAX_USAGE),
.write = mem_cgroup_reset,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "memsw.limit_in_bytes",
- .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
+ .private = MEMFILE_PRIVATE(MCT_MEMSW, RES_LIMIT),
.write = mem_cgroup_write,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "memsw.failcnt",
- .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
+ .private = MEMFILE_PRIVATE(MCT_MEMSW, RES_FAILCNT),
.write = mem_cgroup_reset,
.read_u64 = mem_cgroup_read_u64,
},
--
2.43.2


2024-05-03 20:20:29

by Roman Gushchin

Subject: [PATCH v1 4/4] mm: page_counters: initialize usage using ATOMIC_LONG_INIT() macro

When a page_counter structure is initialized, there is no need to
use an atomic set operation to initialize the usage counters: at this
point the structure is not yet visible to anyone else.
ATOMIC_LONG_INIT() is what should be used in such cases.

Signed-off-by: Roman Gushchin <[email protected]>
---
include/linux/page_counter.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 5d963f54fcb8..af636dd943a7 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -86,7 +86,7 @@ static inline void page_counter_init(struct page_counter *counter,
int i;

for (i = 0; i < __MCT_NR_ITEMS; i++) {
- atomic_long_set(&counter->usage[i], 0);
+ counter->usage[i] = (atomic_long_t)ATOMIC_LONG_INIT(0);
counter->max[i] = PAGE_COUNTER_MAX;
}

--
2.43.2


2024-05-03 20:20:34

by Roman Gushchin

Subject: [PATCH v1 2/4] mm: memcg: merge multiple page_counters into a single structure

Struct page_counter is used by memory and hugetlb cgroups to track
resource usage. Memory cgroups use up to four different counters
(on cgroup v1); hugetlb cgroups use 2 * HUGE_MAX_HSTATE counters.

Every page_counter structure (except those belonging to the root
cgroup) contains a pointer to the corresponding parent structure.
This is obviously wasteful, because for each cgroup there are several
similar chains of page_counter structures, and it's not particularly
cache-efficient either.

Also, every page_counter structure contains the fields required for
tracking protected memory usage, even though they are used only for
the main ("memory") counter and only by memory cgroups.

To address these issues, let's make the page_counter structure support
N counters and use a single page_counter structure per memory or
hugetlb cgroup. N is determined at compile time, depending on the
configuration and the HUGE_MAX_HSTATE value. HUGE_MAX_HSTATE is
relatively small on all existing platforms (the maximum is 5, on sparc),
so it's not too far from the 4 memcg memory types.

Memory min/low functionality is supported only for the first counter.
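
For hugetlb cgroups the 2 * HUGE_MAX_HSTATE counters are laid out as
usage slots followed by reservation slots; a small standalone
illustration of the indexing (the HUGE_MAX_HSTATE value below is a
stand-in):

  #include <stdio.h>

  #define HUGE_MAX_HSTATE 2       /* stand-in value for illustration */

  /* first HUGE_MAX_HSTATE slots: usage; next HUGE_MAX_HSTATE: reservations */
  static int counter_idx(int idx, int rsvd)
  {
          return rsvd ? HUGE_MAX_HSTATE + idx : idx;
  }

  int main(void)
  {
          printf("usage slot for hstate 1: %d\n", counter_idx(1, 0)); /* 1 */
          printf("rsvd slot for hstate 1: %d\n",  counter_idx(1, 1)); /* 3 */
          return 0;
  }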

Memory savings (on x86_64):
original:
memcg:
/* size: 2112, cachelines: 33, members: 46 */
/* sum members: 1987, holes: 8, sum holes: 117 */
/* padding: 8 */
/* paddings: 3, sum paddings: 72 */
/* forced alignments: 7, forced holes: 2, sum forced holes: 96 */
hugetlb:
/* size: 1280, cachelines: 20, members: 8 */
/* sum members: 1224, holes: 1, sum holes: 56 */
/* forced alignments: 3, forced holes: 1, sum forced holes: 56 */

patched:
memcg:
/* size: 1664, cachelines: 26, members: 43 */
/* sum members: 1539, holes: 8, sum holes: 117 */
/* padding: 8 */
/* paddings: 1, sum paddings: 40 */
/* forced alignments: 4, forced holes: 2, sum forced holes: 96 */
hugetlb:
/* size: 832, cachelines: 13, members: 7 */
/* sum members: 776, holes: 1, sum holes: 56 */
/* paddings: 1, sum paddings: 40 */
/* forced alignments: 2, forced holes: 1, sum forced holes: 56 */

So it's a 20-35% reduction in the number of cache lines.

Signed-off-by: Roman Gushchin <[email protected]>
---
include/linux/hugetlb.h | 4 +-
include/linux/hugetlb_cgroup.h | 9 +-
include/linux/memcontrol.h | 15 +-
include/linux/page_counter.h | 72 +++++++--
mm/hugetlb.c | 14 +-
mm/hugetlb_cgroup.c | 150 +++++++-----------
mm/memcontrol.c | 272 ++++++++++++++-------------------
mm/page_counter.c | 61 +++++---
8 files changed, 277 insertions(+), 320 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 68244bb3637a..32aeb2b53bc7 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -67,9 +67,9 @@ struct resv_map {
* here. If these fields are 0, then either the mapping is shared, or
* cgroup accounting is disabled for this resv_map.
*/
- struct page_counter *reservation_counter;
unsigned long pages_per_hpage;
struct cgroup_subsys_state *css;
+ int counter_idx;
#endif
};

@@ -102,8 +102,8 @@ struct file_region {
* file_region in resv_map. These fields hold the info needed to
* uncharge each reservation.
*/
- struct page_counter *reservation_counter;
struct cgroup_subsys_state *css;
+ int counter_idx;
#endif
};

diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index e5d64b8b59c2..7f9ec65494ec 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -36,14 +36,9 @@ struct hugetlb_cgroup {
struct cgroup_subsys_state css;

/*
- * the counter to account for hugepages from hugetlb.
+ * resource counters for hugepages and hugepage reservations
*/
- struct page_counter hugepage[HUGE_MAX_HSTATE];
-
- /*
- * the counter to account for hugepage reservations from hugetlb.
- */
- struct page_counter rsvd_hugepage[HUGE_MAX_HSTATE];
+ struct page_counter res;

atomic_long_t events[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];
atomic_long_t events_local[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 030d34e9d117..bbb99f029bc1 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -187,16 +187,7 @@ struct mem_cgroup {
struct mem_cgroup_id id;

/* Accounted resources */
- struct page_counter memory; /* Both v1 & v2 */
-
- union {
- struct page_counter swap; /* v2 only */
- struct page_counter memsw; /* v1 only */
- };
-
- /* Legacy consumer-oriented counters */
- struct page_counter kmem; /* v1 only */
- struct page_counter tcpmem; /* v1 only */
+ struct page_counter memory;

/* Range enforcement for interrupt charges */
struct work_struct high_work;
@@ -652,7 +643,7 @@ static inline bool mem_cgroup_below_low(struct mem_cgroup *target,
return false;

return READ_ONCE(memcg->memory.elow) >=
- page_counter_read(&memcg->memory);
+ page_counter_read(&memcg->memory, MCT_MEMORY);
}

static inline bool mem_cgroup_below_min(struct mem_cgroup *target,
@@ -662,7 +653,7 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target,
return false;

return READ_ONCE(memcg->memory.emin) >=
- page_counter_read(&memcg->memory);
+ page_counter_read(&memcg->memory, MCT_MEMORY);
}

void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg);
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 2486f98a0c71..ae6cc080b78b 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -7,12 +7,35 @@
#include <linux/limits.h>
#include <asm/page.h>

+/*
+ * Page counters are used by memory and hugetlb cgroups.
+ * Memory cgroups are using up to 4 separate counters:
+ * memory, swap (memory + swap on cgroup v1), kmem and tcpmem.
+ * Hugetlb cgroups are using 2 * HUGE_MAX_HSTATE separate
+ * counters: for tracking the usage and reservations of each
+ * supported hugepage size.
+ */
+
+#ifdef CONFIG_CGROUP_HUGETLB
+#ifdef HUGE_MAX_HSTATE
+#define __MCT_HUGETLB_MAX (HUGE_MAX_HSTATE * 2 - 1)
+#else
+#define __MCT_HUGETLB_MAX 1
+#endif
+#endif /* CONFIG_CGROUP_HUGETLB */
+
enum mem_counter_type {
+#ifdef CONFIG_MEMCG
MCT_MEMORY, /* cgroup v1 and v2 */
MCT_SWAP, /* cgroup v2 only */
MCT_MEMSW = MCT_SWAP, /* cgroup v1 only */
MCT_KMEM, /* cgroup v1 only */
MCT_TCPMEM, /* cgroup v1 only */
+#endif
+#ifdef CONFIG_CGROUP_HUGETLB
+ MCT_HUGETLB_MAX = __MCT_HUGETLB_MAX,
+#endif
+ __MCT_NR_ITEMS,
};

struct page_counter {
@@ -20,7 +43,7 @@ struct page_counter {
* Make sure 'usage' does not share cacheline with any other field. The
* memcg->memory.usage is a hot member of struct mem_cgroup.
*/
- atomic_long_t usage;
+ atomic_long_t usage[__MCT_NR_ITEMS];
CACHELINE_PADDING(_pad1_);

/* effective memory.min and memory.min usage tracking */
@@ -33,16 +56,16 @@ struct page_counter {
atomic_long_t low_usage;
atomic_long_t children_low_usage;

- unsigned long watermark;
- unsigned long failcnt;
+ unsigned long watermark[__MCT_NR_ITEMS];
+ unsigned long failcnt[__MCT_NR_ITEMS];

/* Keep all the read most fields in a separete cacheline. */
CACHELINE_PADDING(_pad2_);

unsigned long min;
unsigned long low;
- unsigned long high;
- unsigned long max;
+ unsigned long high[__MCT_NR_ITEMS];
+ unsigned long max[__MCT_NR_ITEMS];
struct page_counter *parent;
} ____cacheline_internodealigned_in_smp;

@@ -55,38 +78,55 @@ struct page_counter {
static inline void page_counter_init(struct page_counter *counter,
struct page_counter *parent)
{
- atomic_long_set(&counter->usage, 0);
- counter->max = PAGE_COUNTER_MAX;
+ int i;
+
+ for (i = 0; i < __MCT_NR_ITEMS; i++) {
+ atomic_long_set(&counter->usage[i], 0);
+ counter->max[i] = PAGE_COUNTER_MAX;
+ }
+
counter->parent = parent;
}

-static inline unsigned long page_counter_read(struct page_counter *counter)
+static inline unsigned long page_counter_read(struct page_counter *counter,
+ enum mem_counter_type id)
{
- return atomic_long_read(&counter->usage);
+ return atomic_long_read(&counter->usage[id]);
}

-void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages);
-void page_counter_charge(struct page_counter *counter, unsigned long nr_pages);
+void page_counter_cancel(struct page_counter *counter,
+ enum mem_counter_type id,
+ unsigned long nr_pages);
+void page_counter_charge(struct page_counter *counter,
+ enum mem_counter_type id,
+ unsigned long nr_pages);
bool page_counter_try_charge(struct page_counter *counter,
+ enum mem_counter_type id,
unsigned long nr_pages,
struct page_counter **fail);
-void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages);
+void page_counter_uncharge(struct page_counter *counter,
+ enum mem_counter_type id,
+ unsigned long nr_pages);
void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages);
void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages);

static inline void page_counter_set_high(struct page_counter *counter,
+ enum mem_counter_type id,
unsigned long nr_pages)
{
- WRITE_ONCE(counter->high, nr_pages);
+ WRITE_ONCE(counter->high[id], nr_pages);
}

-int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages);
+int page_counter_set_max(struct page_counter *counter,
+ enum mem_counter_type id,
+ unsigned long nr_pages);
int page_counter_memparse(const char *buf, const char *max,
unsigned long *nr_pages);

-static inline void page_counter_reset_watermark(struct page_counter *counter)
+static inline void page_counter_reset_watermark(struct page_counter *counter,
+ enum mem_counter_type id)
{
- counter->watermark = page_counter_read(counter);
+ counter->watermark[id] = page_counter_read(counter, id);
}

#endif /* _LINUX_PAGE_COUNTER_H */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 33d175add044..45dccf0df5ab 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -455,7 +455,7 @@ static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg,
struct file_region *rg)
{
#ifdef CONFIG_CGROUP_HUGETLB
- nrg->reservation_counter = rg->reservation_counter;
+ nrg->counter_idx = rg->counter_idx;
nrg->css = rg->css;
if (rg->css)
css_get(rg->css);
@@ -470,8 +470,6 @@ static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
{
#ifdef CONFIG_CGROUP_HUGETLB
if (h_cg) {
- nrg->reservation_counter =
- &h_cg->rsvd_hugepage[hstate_index(h)];
nrg->css = &h_cg->css;
/*
* The caller will hold exactly one h_cg->css reference for the
@@ -484,6 +482,7 @@ static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
* untouched.
*/
css_get(&h_cg->css);
+ nrg->counter_idx = hstate_index(h);
if (!resv->pages_per_hpage)
resv->pages_per_hpage = pages_per_huge_page(h);
/* pages_per_hpage should be the same for all entries in
@@ -491,8 +490,8 @@ static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
*/
VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
} else {
- nrg->reservation_counter = NULL;
nrg->css = NULL;
+ nrg->counter_idx = 0;
}
#endif
}
@@ -509,7 +508,7 @@ static bool has_same_uncharge_info(struct file_region *rg,
struct file_region *org)
{
#ifdef CONFIG_CGROUP_HUGETLB
- return rg->reservation_counter == org->reservation_counter &&
+ return rg->counter_idx == org->counter_idx &&
rg->css == org->css;

#else
@@ -1068,14 +1067,13 @@ resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
{
#ifdef CONFIG_CGROUP_HUGETLB
if (!h_cg || !h) {
- resv_map->reservation_counter = NULL;
resv_map->pages_per_hpage = 0;
resv_map->css = NULL;
+ resv_map->counter_idx = 0;
} else {
- resv_map->reservation_counter =
- &h_cg->rsvd_hugepage[hstate_index(h)];
resv_map->pages_per_hpage = pages_per_huge_page(h);
resv_map->css = &h_cg->css;
+ resv_map->counter_idx = hstate_index(h);
}
#endif
}
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index e20339a346b9..069c7f164dc5 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -29,25 +29,14 @@

static struct hugetlb_cgroup *root_h_cgroup __read_mostly;

-static inline struct page_counter *
-__hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx,
- bool rsvd)
-{
- if (rsvd)
- return &h_cg->rsvd_hugepage[idx];
- return &h_cg->hugepage[idx];
-}
-
-static inline struct page_counter *
-hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx)
-{
- return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, false);
-}
-
-static inline struct page_counter *
-hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx)
+/*
+ * There are at least 2 * HUGE_MAX_HSTATE counters.
+ * First HUGE_MAX_HSTATE counters are usage counters.
+ * Second HUGE_MAX_HSTATE counters are reservation counters.
+ */
+static inline int counter_idx(int idx, bool rsvd)
{
- return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, true);
+ return rsvd ? HUGE_MAX_HSTATE + idx : idx;
}

static inline
@@ -78,8 +67,7 @@ static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
struct hstate *h;

for_each_hstate(h) {
- if (page_counter_read(
- hugetlb_cgroup_counter_from_cgroup(h_cg, hstate_index(h))))
+ if (page_counter_read(&h_cg->res, hstate_index(h)))
return true;
}
return false;
@@ -90,35 +78,23 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
{
int idx;

+ page_counter_init(&h_cgroup->res,
+ parent_h_cgroup ? &parent_h_cgroup->res : NULL);
+
for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
- struct page_counter *fault_parent = NULL;
- struct page_counter *rsvd_parent = NULL;
unsigned long limit;
int ret;

- if (parent_h_cgroup) {
- fault_parent = hugetlb_cgroup_counter_from_cgroup(
- parent_h_cgroup, idx);
- rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd(
- parent_h_cgroup, idx);
- }
- page_counter_init(hugetlb_cgroup_counter_from_cgroup(h_cgroup,
- idx),
- fault_parent);
- page_counter_init(
- hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx),
- rsvd_parent);
-
limit = round_down(PAGE_COUNTER_MAX,
pages_per_huge_page(&hstates[idx]));

- ret = page_counter_set_max(
- hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx),
- limit);
+ /* hugepages */
+ ret = page_counter_set_max(&h_cgroup->res, counter_idx(idx, false),
+ limit);
VM_BUG_ON(ret);
- ret = page_counter_set_max(
- hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx),
- limit);
+ /* hugepage reservations */
+ ret = page_counter_set_max(&h_cgroup->res, counter_idx(idx, true),
+ limit);
VM_BUG_ON(ret);
}
}
@@ -188,7 +164,6 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
struct page *page)
{
unsigned int nr_pages;
- struct page_counter *counter;
struct hugetlb_cgroup *page_hcg;
struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
struct folio *folio = page_folio(page);
@@ -206,11 +181,10 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
if (!parent) {
parent = root_h_cgroup;
/* root has no limit */
- page_counter_charge(&parent->hugepage[idx], nr_pages);
+ page_counter_charge(&parent->res, idx, nr_pages);
}
- counter = &h_cg->hugepage[idx];
/* Take the pages off the local counter */
- page_counter_cancel(counter, nr_pages);
+ page_counter_cancel(&h_cg->res, idx, nr_pages);

set_hugetlb_cgroup(folio, parent);
out:
@@ -271,9 +245,8 @@ static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
}
rcu_read_unlock();

- if (!page_counter_try_charge(
- __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
- nr_pages, &counter)) {
+ if (!page_counter_try_charge(&h_cg->res, counter_idx(idx, rsvd),
+ nr_pages, &counter)) {
ret = -ENOMEM;
hugetlb_event(h_cg, idx, HUGETLB_MAX);
css_put(&h_cg->css);
@@ -353,9 +326,7 @@ static void __hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
return;
__set_hugetlb_cgroup(folio, NULL, rsvd);

- page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
- rsvd),
- nr_pages);
+ page_counter_uncharge(&h_cg->res, counter_idx(idx, rsvd), nr_pages);

if (rsvd)
css_put(&h_cg->css);
@@ -391,9 +362,7 @@ static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
if (hugetlb_cgroup_disabled() || !h_cg)
return;

- page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
- rsvd),
- nr_pages);
+ page_counter_uncharge(&h_cg->res, counter_idx(idx, rsvd), nr_pages);

if (rsvd)
css_put(&h_cg->css);
@@ -414,11 +383,11 @@ void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages,
void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start,
unsigned long end)
{
- if (hugetlb_cgroup_disabled() || !resv || !resv->reservation_counter ||
- !resv->css)
+ if (hugetlb_cgroup_disabled() || !resv || !resv->css)
return;

- page_counter_uncharge(resv->reservation_counter,
+ page_counter_uncharge(&hugetlb_cgroup_from_css(resv->css)->res,
+ resv->counter_idx,
(end - start) * resv->pages_per_hpage);
css_put(resv->css);
}
@@ -431,9 +400,9 @@ void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages)
return;

- if (rg->reservation_counter && resv->pages_per_hpage &&
- !resv->reservation_counter) {
- page_counter_uncharge(rg->reservation_counter,
+ if (rg->css && resv->pages_per_hpage && !resv->css) {
+ page_counter_uncharge(&hugetlb_cgroup_from_css(rg->css)->res,
+ rg->counter_idx,
nr_pages * resv->pages_per_hpage);
/*
* Only do css_put(rg->css) when we delete the entire region
@@ -485,7 +454,7 @@ static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy)
* counter, so use that.
*/
seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "",
- page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE);
+ page_counter_read(&h_cg->res, idx) * PAGE_SIZE);

/*
* For each node, transverse the css tree to obtain the hierarchical
@@ -511,30 +480,26 @@ static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy)
static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
- struct page_counter *counter;
- struct page_counter *rsvd_counter;
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
-
- counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)];
- rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)];
+ int idx = MEMFILE_IDX(cft->private);

switch (MEMFILE_ATTR(cft->private)) {
case RES_USAGE:
- return (u64)page_counter_read(counter) * PAGE_SIZE;
+ return (u64)page_counter_read(&h_cg->res, counter_idx(idx, false)) * PAGE_SIZE;
case RES_RSVD_USAGE:
- return (u64)page_counter_read(rsvd_counter) * PAGE_SIZE;
+ return (u64)page_counter_read(&h_cg->res, counter_idx(idx, true)) * PAGE_SIZE;
case RES_LIMIT:
- return (u64)counter->max * PAGE_SIZE;
+ return (u64)h_cg->res.max[counter_idx(idx, false)] * PAGE_SIZE;
case RES_RSVD_LIMIT:
- return (u64)rsvd_counter->max * PAGE_SIZE;
+ return (u64)h_cg->res.max[counter_idx(idx, true)] * PAGE_SIZE;
case RES_MAX_USAGE:
- return (u64)counter->watermark * PAGE_SIZE;
+ return (u64)h_cg->res.watermark[counter_idx(idx, false)] * PAGE_SIZE;
case RES_RSVD_MAX_USAGE:
- return (u64)rsvd_counter->watermark * PAGE_SIZE;
+ return (u64)h_cg->res.watermark[counter_idx(idx, true)] * PAGE_SIZE;
case RES_FAILCNT:
- return counter->failcnt;
+ return (u64)h_cg->res.failcnt[counter_idx(idx, false)];
case RES_RSVD_FAILCNT:
- return rsvd_counter->failcnt;
+ return (u64)h_cg->res.failcnt[counter_idx(idx, true)];
default:
BUG();
}
@@ -542,32 +507,29 @@ static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,

static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v)
{
- int idx;
u64 val;
struct cftype *cft = seq_cft(seq);
unsigned long limit;
- struct page_counter *counter;
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
-
- idx = MEMFILE_IDX(cft->private);
- counter = &h_cg->hugepage[idx];
+ int idx = MEMFILE_IDX(cft->private);
+ bool rsvd = false;

limit = round_down(PAGE_COUNTER_MAX,
pages_per_huge_page(&hstates[idx]));

switch (MEMFILE_ATTR(cft->private)) {
case RES_RSVD_USAGE:
- counter = &h_cg->rsvd_hugepage[idx];
+ rsvd = true;
fallthrough;
case RES_USAGE:
- val = (u64)page_counter_read(counter);
+ val = (u64)page_counter_read(&h_cg->res, counter_idx(idx, rsvd));
seq_printf(seq, "%llu\n", val * PAGE_SIZE);
break;
case RES_RSVD_LIMIT:
- counter = &h_cg->rsvd_hugepage[idx];
+ rsvd = true;
fallthrough;
case RES_LIMIT:
- val = (u64)counter->max;
+ val = (u64)h_cg->res.max[counter_idx(idx, rsvd)];
if (val == limit)
seq_puts(seq, "max\n");
else
@@ -586,9 +548,10 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off,
const char *max)
{
- int ret, idx;
+ int ret;
unsigned long nr_pages;
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
+ int idx = MEMFILE_IDX(of_cft(of)->private);
bool rsvd = false;

if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */
@@ -599,7 +562,6 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
if (ret)
return ret;

- idx = MEMFILE_IDX(of_cft(of)->private);
nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx]));

switch (MEMFILE_ATTR(of_cft(of)->private)) {
@@ -608,9 +570,8 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
fallthrough;
case RES_LIMIT:
mutex_lock(&hugetlb_limit_mutex);
- ret = page_counter_set_max(
- __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
- nr_pages);
+ ret = page_counter_set_max(&h_cg->res, counter_idx(idx, rsvd),
+ nr_pages);
mutex_unlock(&hugetlb_limit_mutex);
break;
default:
@@ -636,24 +597,21 @@ static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
int ret = 0;
- struct page_counter *counter, *rsvd_counter;
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
-
- counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)];
- rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(of_cft(of)->private)];
+ int idx = MEMFILE_IDX(of_cft(of)->private);

switch (MEMFILE_ATTR(of_cft(of)->private)) {
case RES_MAX_USAGE:
- page_counter_reset_watermark(counter);
+ page_counter_reset_watermark(&h_cg->res, counter_idx(idx, false));
break;
case RES_RSVD_MAX_USAGE:
- page_counter_reset_watermark(rsvd_counter);
+ page_counter_reset_watermark(&h_cg->res, counter_idx(idx, true));
break;
case RES_FAILCNT:
- counter->failcnt = 0;
+ h_cg->res.failcnt[counter_idx(idx, false)] = 0;
break;
case RES_RSVD_FAILCNT:
- rsvd_counter->failcnt = 0;
+ h_cg->res.failcnt[counter_idx(idx, true)] = 0;
break;
default:
ret = -EINVAL;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 894e5b6fe468..f4511506ef1f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -465,7 +465,7 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,

static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
- unsigned long nr_pages = page_counter_read(&memcg->memory);
+ unsigned long nr_pages = page_counter_read(&memcg->memory, MCT_MEMORY);
unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
unsigned long excess = 0;

@@ -1631,14 +1631,14 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
unsigned long count;
unsigned long limit;

- count = page_counter_read(&memcg->memory);
- limit = READ_ONCE(memcg->memory.max);
+ count = page_counter_read(&memcg->memory, MCT_MEMORY);
+ limit = READ_ONCE(memcg->memory.max[MCT_MEMORY]);
if (count < limit)
margin = limit - count;

if (do_memsw_account()) {
- count = page_counter_read(&memcg->memsw);
- limit = READ_ONCE(memcg->memsw.max);
+ count = page_counter_read(&memcg->memory, MCT_MEMSW);
+ limit = READ_ONCE(memcg->memory.max[MCT_MEMSW]);
if (count < limit)
margin = min(margin, limit - count);
else
@@ -1896,19 +1896,23 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
lockdep_assert_held(&oom_lock);

pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
- K((u64)page_counter_read(&memcg->memory)),
- K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
+ K((u64)page_counter_read(&memcg->memory, MCT_MEMORY)),
+ K((u64)READ_ONCE(memcg->memory.max[MCT_MEMORY])),
+ memcg->memory.failcnt[MCT_MEMORY]);
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
- K((u64)page_counter_read(&memcg->swap)),
- K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
+ K((u64)page_counter_read(&memcg->memory, MCT_SWAP)),
+ K((u64)READ_ONCE(memcg->memory.max[MCT_SWAP])),
+ memcg->memory.failcnt[MCT_SWAP]);
else {
pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
- K((u64)page_counter_read(&memcg->memsw)),
- K((u64)memcg->memsw.max), memcg->memsw.failcnt);
+ K((u64)page_counter_read(&memcg->memory, MCT_MEMSW)),
+ K((u64)memcg->memory.max[MCT_MEMSW]),
+ memcg->memory.failcnt[MCT_MEMSW]);
pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
- K((u64)page_counter_read(&memcg->kmem)),
- K((u64)memcg->kmem.max), memcg->kmem.failcnt);
+ K((u64)page_counter_read(&memcg->memory, MCT_KMEM)),
+ K((u64)memcg->memory.max[MCT_KMEM]),
+ memcg->memory.failcnt[MCT_KMEM]);
}

pr_info("Memory cgroup stats for ");
@@ -1924,18 +1928,18 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
*/
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
- unsigned long max = READ_ONCE(memcg->memory.max);
+ unsigned long max = READ_ONCE(memcg->memory.max[MCT_MEMORY]);

if (do_memsw_account()) {
if (mem_cgroup_swappiness(memcg)) {
/* Calculate swap excess capacity from memsw limit */
- unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
+ unsigned long swap = READ_ONCE(memcg->memory.max[MCT_MEMSW]) - max;

max += min(swap, (unsigned long)total_swap_pages);
}
} else {
if (mem_cgroup_swappiness(memcg))
- max += min(READ_ONCE(memcg->swap.max),
+ max += min(READ_ONCE(memcg->memory.max[MCT_SWAP]),
(unsigned long)total_swap_pages);
}
return max;
@@ -1943,7 +1947,7 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)

unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
{
- return page_counter_read(&memcg->memory);
+ return page_counter_read(&memcg->memory, MCT_MEMORY);
}

static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
@@ -2497,9 +2501,9 @@ static void drain_stock(struct memcg_stock_pcp *stock)
return;

if (stock_pages) {
- page_counter_uncharge(&old->memory, stock_pages);
+ page_counter_uncharge(&old->memory, MCT_MEMORY, stock_pages);
if (do_memsw_account())
- page_counter_uncharge(&old->memsw, stock_pages);
+ page_counter_uncharge(&old->memory, MCT_MEMSW, stock_pages);

WRITE_ONCE(stock->nr_pages, 0);
}
@@ -2625,8 +2629,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
do {
unsigned long pflags;

- if (page_counter_read(&memcg->memory) <=
- READ_ONCE(memcg->memory.high))
+ if (page_counter_read(&memcg->memory, MCT_MEMORY) <=
+ READ_ONCE(memcg->memory.high[MCT_MEMORY]))
continue;

memcg_memory_event(memcg, MEMCG_HIGH);
@@ -2727,8 +2731,8 @@ static u64 mem_find_max_overage(struct mem_cgroup *memcg)
u64 overage, max_overage = 0;

do {
- overage = calculate_overage(page_counter_read(&memcg->memory),
- READ_ONCE(memcg->memory.high));
+ overage = calculate_overage(page_counter_read(&memcg->memory, MCT_MEMORY),
+ READ_ONCE(memcg->memory.high[MCT_MEMORY]));
max_overage = max(overage, max_overage);
} while ((memcg = parent_mem_cgroup(memcg)) &&
!mem_cgroup_is_root(memcg));
@@ -2741,8 +2745,8 @@ static u64 swap_find_max_overage(struct mem_cgroup *memcg)
u64 overage, max_overage = 0;

do {
- overage = calculate_overage(page_counter_read(&memcg->swap),
- READ_ONCE(memcg->swap.high));
+ overage = calculate_overage(page_counter_read(&memcg->memory, MCT_SWAP),
+ READ_ONCE(memcg->memory.high[MCT_SWAP]));
if (overage)
memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
max_overage = max(overage, max_overage);
@@ -2905,16 +2909,15 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
return 0;

if (!do_memsw_account() ||
- page_counter_try_charge(&memcg->memsw, batch, &counter)) {
- if (page_counter_try_charge(&memcg->memory, batch, &counter))
+ page_counter_try_charge(&memcg->memory, MCT_MEMSW, batch, &counter)) {
+ if (page_counter_try_charge(&memcg->memory, MCT_MEMORY, batch, &counter))
goto done_restock;
if (do_memsw_account())
- page_counter_uncharge(&memcg->memsw, batch);
- mem_over_limit = mem_cgroup_from_counter(counter, memory);
+ page_counter_uncharge(&memcg->memory, MCT_MEMSW, batch);
} else {
- mem_over_limit = mem_cgroup_from_counter(counter, memsw);
reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
}
+ mem_over_limit = mem_cgroup_from_counter(counter, memory);

if (batch > nr_pages) {
batch = nr_pages;
@@ -3016,9 +3019,9 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
* being freed very soon. Allow memory usage go over the limit
* temporarily by force charging it.
*/
- page_counter_charge(&memcg->memory, nr_pages);
+ page_counter_charge(&memcg->memory, MCT_MEMORY, nr_pages);
if (do_memsw_account())
- page_counter_charge(&memcg->memsw, nr_pages);
+ page_counter_charge(&memcg->memory, MCT_MEMSW, nr_pages);

return 0;

@@ -3038,10 +3041,10 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
do {
bool mem_high, swap_high;

- mem_high = page_counter_read(&memcg->memory) >
- READ_ONCE(memcg->memory.high);
- swap_high = page_counter_read(&memcg->swap) >
- READ_ONCE(memcg->swap.high);
+ mem_high = page_counter_read(&memcg->memory, MCT_MEMORY) >
+ READ_ONCE(memcg->memory.high[MCT_MEMORY]);
+ swap_high = page_counter_read(&memcg->memory, MCT_SWAP) >
+ READ_ONCE(memcg->memory.high[MCT_SWAP]);

/* Don't bother a random interrupted task */
if (!in_task()) {
@@ -3101,9 +3104,9 @@ void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
if (mem_cgroup_is_root(memcg))
return;

- page_counter_uncharge(&memcg->memory, nr_pages);
+ page_counter_uncharge(&memcg->memory, MCT_MEMORY, nr_pages);
if (do_memsw_account())
- page_counter_uncharge(&memcg->memsw, nr_pages);
+ page_counter_uncharge(&memcg->memory, MCT_MEMSW, nr_pages);
}

static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
@@ -3373,9 +3376,9 @@ static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
if (nr_pages > 0)
- page_counter_charge(&memcg->kmem, nr_pages);
+ page_counter_charge(&memcg->memory, MCT_KMEM, nr_pages);
else
- page_counter_uncharge(&memcg->kmem, -nr_pages);
+ page_counter_uncharge(&memcg->memory, MCT_KMEM, -nr_pages);
}
}

@@ -3877,7 +3880,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
bool drained = false;
int ret;
bool limits_invariant;
- struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
+ enum mem_counter_type id = memsw ? MCT_MEMSW : MCT_MEMORY;

do {
if (signal_pending(current)) {
@@ -3888,18 +3891,18 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
mutex_lock(&memcg_max_mutex);
/*
* Make sure that the new limit (memsw or memory limit) doesn't
- * break our basic invariant rule memory.max <= memsw.max.
+ * break our basic invariant rule max[MCT_MEMORY] <= max[MCT_MEMSW].
*/
- limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
- max <= memcg->memsw.max;
+ limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max[MCT_MEMORY]) :
+ max <= memcg->memory.max[MCT_MEMSW];
if (!limits_invariant) {
mutex_unlock(&memcg_max_mutex);
ret = -EINVAL;
break;
}
- if (max > counter->max)
+ if (max > memcg->memory.max[id])
enlarge = true;
- ret = page_counter_set_max(counter, max);
+ ret = page_counter_set_max(&memcg->memory, id, max);
mutex_unlock(&memcg_max_mutex);

if (!ret)
@@ -4021,7 +4024,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
drain_all_stock(memcg);

/* try to free all pages in this cgroup */
- while (nr_retries && page_counter_read(&memcg->memory)) {
+ while (nr_retries && page_counter_read(&memcg->memory, MCT_MEMORY)) {
if (signal_pending(current))
return -EINTR;

@@ -4077,10 +4080,7 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
if (swap)
val += total_swap_pages - get_nr_swap_pages();
} else {
- if (!swap)
- val = page_counter_read(&memcg->memory);
- else
- val = page_counter_read(&memcg->memsw);
+ val = page_counter_read(&memcg->memory, swap ? MCT_MEMSW : MCT_MEMORY);
}
return val;
}
@@ -4097,38 +4097,22 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- struct page_counter *counter;
-
- switch (MEMFILE_TYPE(cft->private)) {
- case MCT_MEMORY:
- counter = &memcg->memory;
- break;
- case MCT_MEMSW:
- counter = &memcg->memsw;
- break;
- case MCT_KMEM:
- counter = &memcg->kmem;
- break;
- case MCT_TCPMEM:
- counter = &memcg->tcpmem;
- break;
- default:
- BUG();
- }
+ enum mem_counter_type id = MEMFILE_TYPE(cft->private);

switch (MEMFILE_ATTR(cft->private)) {
case RES_USAGE:
- if (counter == &memcg->memory)
+ if (id == MCT_MEMORY)
return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
- if (counter == &memcg->memsw)
+ else if (id == MCT_MEMSW)
return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
- return (u64)page_counter_read(counter) * PAGE_SIZE;
+ else
+ return (u64)page_counter_read(&memcg->memory, id) * PAGE_SIZE;
case RES_LIMIT:
- return (u64)counter->max * PAGE_SIZE;
+ return memcg->memory.max[id] * PAGE_SIZE;
case RES_MAX_USAGE:
- return (u64)counter->watermark * PAGE_SIZE;
+ return memcg->memory.watermark[id] * PAGE_SIZE;
case RES_FAILCNT:
- return counter->failcnt;
+ return memcg->memory.failcnt[id];
case RES_SOFT_LIMIT:
return (u64)READ_ONCE(memcg->soft_limit) * PAGE_SIZE;
default:
@@ -4213,7 +4197,7 @@ static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)

mutex_lock(&memcg_max_mutex);

- ret = page_counter_set_max(&memcg->tcpmem, max);
+ ret = page_counter_set_max(&memcg->memory, MCT_TCPMEM, max);
if (ret)
goto out;

@@ -4299,31 +4283,14 @@ static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- struct page_counter *counter;
-
- switch (MEMFILE_TYPE(of_cft(of)->private)) {
- case MCT_MEMORY:
- counter = &memcg->memory;
- break;
- case MCT_MEMSW:
- counter = &memcg->memsw;
- break;
- case MCT_KMEM:
- counter = &memcg->kmem;
- break;
- case MCT_TCPMEM:
- counter = &memcg->tcpmem;
- break;
- default:
- BUG();
- }
+ enum mem_counter_type id = MEMFILE_TYPE(of_cft(of)->private);

switch (MEMFILE_ATTR(of_cft(of)->private)) {
case RES_MAX_USAGE:
- page_counter_reset_watermark(counter);
+ page_counter_reset_watermark(&memcg->memory, id);
break;
case RES_FAILCNT:
- counter->failcnt = 0;
+ memcg->memory.failcnt[id] = 0;
break;
default:
BUG();
@@ -4531,8 +4498,8 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
/* Hierarchical information */
memory = memsw = PAGE_COUNTER_MAX;
for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
- memory = min(memory, READ_ONCE(mi->memory.max));
- memsw = min(memsw, READ_ONCE(mi->memsw.max));
+ memory = min(memory, READ_ONCE(mi->memory.max[MCT_MEMORY]));
+ memsw = min(memsw, READ_ONCE(mi->memory.max[MCT_MEMSW]));
}
seq_buf_printf(s, "hierarchical_memory_limit %llu\n",
(u64)memory * PAGE_SIZE);
@@ -5016,9 +4983,9 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,

*pheadroom = PAGE_COUNTER_MAX;
while ((parent = parent_mem_cgroup(memcg))) {
- unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
- READ_ONCE(memcg->memory.high));
- unsigned long used = page_counter_read(&memcg->memory);
+ unsigned long ceiling = min(READ_ONCE(memcg->memory.max[MCT_MEMORY]),
+ READ_ONCE(memcg->memory.high[MCT_MEMORY]));
+ unsigned long used = page_counter_read(&memcg->memory, MCT_MEMORY);

*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
memcg = parent;
@@ -5783,30 +5750,22 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (IS_ERR(memcg))
return ERR_CAST(memcg);

- page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
+ page_counter_set_high(&memcg->memory, MCT_MEMORY, PAGE_COUNTER_MAX);
WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
memcg->zswap_max = PAGE_COUNTER_MAX;
WRITE_ONCE(memcg->zswap_writeback,
!parent || READ_ONCE(parent->zswap_writeback));
#endif
- page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
+ page_counter_set_high(&memcg->memory, MCT_SWAP, PAGE_COUNTER_MAX);
if (parent) {
WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable));
-
page_counter_init(&memcg->memory, &parent->memory);
- page_counter_init(&memcg->swap, &parent->swap);
- page_counter_init(&memcg->kmem, &parent->kmem);
- page_counter_init(&memcg->tcpmem, &parent->tcpmem);
} else {
init_memcg_stats();
init_memcg_events();
page_counter_init(&memcg->memory, NULL);
- page_counter_init(&memcg->swap, NULL);
- page_counter_init(&memcg->kmem, NULL);
- page_counter_init(&memcg->tcpmem, NULL);
-
root_mem_cgroup = memcg;
return &memcg->css;
}
@@ -5949,16 +5908,17 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
-
- page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
- page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
- page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
- page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
- page_counter_set_min(&memcg->memory, 0);
- page_counter_set_low(&memcg->memory, 0);
- page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
+ struct page_counter *c = &memcg->memory;
+
+ page_counter_set_max(c, MCT_MEMORY, PAGE_COUNTER_MAX);
+ page_counter_set_max(c, MCT_SWAP, PAGE_COUNTER_MAX);
+ page_counter_set_max(c, MCT_KMEM, PAGE_COUNTER_MAX);
+ page_counter_set_max(c, MCT_TCPMEM, PAGE_COUNTER_MAX);
+ page_counter_set_min(c, 0);
+ page_counter_set_low(c, 0);
+ page_counter_set_high(c, MCT_MEMORY, PAGE_COUNTER_MAX);
WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);
- page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
+ page_counter_set_high(c, MCT_SWAP, PAGE_COUNTER_MAX);
memcg_wb_domain_size_changed(memcg);
}

@@ -6529,7 +6489,7 @@ static void __mem_cgroup_clear_mc(void)
if (mc.moved_swap) {
/* uncharge swap account from the old cgroup */
if (!mem_cgroup_is_root(mc.from))
- page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
+ page_counter_uncharge(&mc.from->memory, MCT_MEMSW, mc.moved_swap);

mem_cgroup_id_put_many(mc.from, mc.moved_swap);

@@ -6538,7 +6498,7 @@ static void __mem_cgroup_clear_mc(void)
* should uncharge to->memory.
*/
if (!mem_cgroup_is_root(mc.to))
- page_counter_uncharge(&mc.to->memory, mc.moved_swap);
+ page_counter_uncharge(&mc.to->memory, MCT_MEMORY, mc.moved_swap);

mc.moved_swap = 0;
}
@@ -6906,7 +6866,7 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);

- return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
+ return (u64)page_counter_read(&memcg->memory, MCT_MEMORY) * PAGE_SIZE;
}

static u64 memory_peak_read(struct cgroup_subsys_state *css,
@@ -6914,7 +6874,7 @@ static u64 memory_peak_read(struct cgroup_subsys_state *css,
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);

- return (u64)memcg->memory.watermark * PAGE_SIZE;
+ return (u64)memcg->memory.watermark[MCT_MEMORY] * PAGE_SIZE;
}

static int memory_min_show(struct seq_file *m, void *v)
@@ -6966,7 +6926,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of,
static int memory_high_show(struct seq_file *m, void *v)
{
return seq_puts_memcg_tunable(m,
- READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.high[MCT_MEMORY]));
}

static ssize_t memory_high_write(struct kernfs_open_file *of,
@@ -6983,10 +6943,10 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
if (err)
return err;

- page_counter_set_high(&memcg->memory, high);
+ page_counter_set_high(&memcg->memory, MCT_MEMORY, high);

for (;;) {
- unsigned long nr_pages = page_counter_read(&memcg->memory);
+ unsigned long nr_pages = page_counter_read(&memcg->memory, MCT_MEMORY);
unsigned long reclaimed;

if (nr_pages <= high)
@@ -7015,7 +6975,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
static int memory_max_show(struct seq_file *m, void *v)
{
return seq_puts_memcg_tunable(m,
- READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.max[MCT_MEMORY]));
}

static ssize_t memory_max_write(struct kernfs_open_file *of,
@@ -7032,10 +6992,10 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (err)
return err;

- xchg(&memcg->memory.max, max);
+ xchg(&memcg->memory.max[MCT_MEMORY], max);

for (;;) {
- unsigned long nr_pages = page_counter_read(&memcg->memory);
+ unsigned long nr_pages = page_counter_read(&memcg->memory, MCT_MEMORY);

if (nr_pages <= max)
break;
@@ -7541,7 +7501,7 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
if (memcg == root)
return;

- usage = page_counter_read(&memcg->memory);
+ usage = page_counter_read(&memcg->memory, MCT_MEMORY);
if (!usage)
return;

@@ -7553,7 +7513,7 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
return;
}

- parent_usage = page_counter_read(&parent->memory);
+ parent_usage = page_counter_read(&parent->memory, MCT_MEMORY);

WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
READ_ONCE(memcg->memory.min),
@@ -7713,9 +7673,9 @@ static void uncharge_batch(const struct uncharge_gather *ug)
unsigned long flags;

if (ug->nr_memory) {
- page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
+ page_counter_uncharge(&ug->memcg->memory, MCT_MEMORY, ug->nr_memory);
if (do_memsw_account())
- page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
+ page_counter_uncharge(&ug->memcg->memory, MCT_MEMSW, ug->nr_memory);
if (ug->nr_kmem)
memcg_account_kmem(ug->memcg, -ug->nr_kmem);
memcg_oom_recover(ug->memcg);
@@ -7854,9 +7814,9 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new)

/* Force-charge the new page. The old one will be freed soon */
if (!mem_cgroup_is_root(memcg)) {
- page_counter_charge(&memcg->memory, nr_pages);
+ page_counter_charge(&memcg->memory, MCT_MEMORY, nr_pages);
if (do_memsw_account())
- page_counter_charge(&memcg->memsw, nr_pages);
+ page_counter_charge(&memcg->memory, MCT_MEMSW, nr_pages);
}

css_get(&memcg->css);
@@ -7964,13 +7924,13 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
struct page_counter *fail;

- if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
+ if (page_counter_try_charge(&memcg->memory, MCT_TCPMEM, nr_pages, &fail)) {
memcg->tcpmem_pressure = 0;
return true;
}
memcg->tcpmem_pressure = 1;
if (gfp_mask & __GFP_NOFAIL) {
- page_counter_charge(&memcg->tcpmem, nr_pages);
+ page_counter_charge(&memcg->memory, MCT_TCPMEM, nr_pages);
return true;
}
return false;
@@ -7992,7 +7952,7 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
{
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
- page_counter_uncharge(&memcg->tcpmem, nr_pages);
+ page_counter_uncharge(&memcg->memory, MCT_TCPMEM, nr_pages);
return;
}

@@ -8126,12 +8086,12 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
folio->memcg_data = 0;

if (!mem_cgroup_is_root(memcg))
- page_counter_uncharge(&memcg->memory, nr_entries);
+ page_counter_uncharge(&memcg->memory, MCT_MEMORY, nr_entries);

if (memcg != swap_memcg) {
if (!mem_cgroup_is_root(swap_memcg))
- page_counter_charge(&swap_memcg->memsw, nr_entries);
- page_counter_uncharge(&memcg->memsw, nr_entries);
+ page_counter_charge(&swap_memcg->memory, MCT_MEMSW, nr_entries);
+ page_counter_uncharge(&memcg->memory, MCT_MEMSW, nr_entries);
}

/*
@@ -8181,7 +8141,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
memcg = mem_cgroup_id_get_online(memcg);

if (!mem_cgroup_is_root(memcg) &&
- !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
+ !page_counter_try_charge(&memcg->memory, MCT_SWAP, nr_pages, &counter)) {
memcg_memory_event(memcg, MEMCG_SWAP_MAX);
memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
mem_cgroup_id_put(memcg);
@@ -8214,9 +8174,9 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
if (memcg) {
if (!mem_cgroup_is_root(memcg)) {
if (do_memsw_account())
- page_counter_uncharge(&memcg->memsw, nr_pages);
+ page_counter_uncharge(&memcg->memory, MCT_MEMSW, nr_pages);
else
- page_counter_uncharge(&memcg->swap, nr_pages);
+ page_counter_uncharge(&memcg->memory, MCT_SWAP, nr_pages);
}
mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
mem_cgroup_id_put_many(memcg, nr_pages);
@@ -8232,8 +8192,8 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
return nr_swap_pages;
for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg))
nr_swap_pages = min_t(long, nr_swap_pages,
- READ_ONCE(memcg->swap.max) -
- page_counter_read(&memcg->swap));
+ READ_ONCE(memcg->memory.max[MCT_SWAP]) -
+ page_counter_read(&memcg->memory, MCT_SWAP));
return nr_swap_pages;
}

@@ -8253,10 +8213,10 @@ bool mem_cgroup_swap_full(struct folio *folio)
return false;

for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
- unsigned long usage = page_counter_read(&memcg->swap);
+ unsigned long usage = page_counter_read(&memcg->memory, MCT_SWAP);

- if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
- usage * 2 >= READ_ONCE(memcg->swap.max))
+ if (usage * 2 >= READ_ONCE(memcg->memory.high[MCT_SWAP]) ||
+ usage * 2 >= READ_ONCE(memcg->memory.max[MCT_SWAP]))
return true;
}

@@ -8281,7 +8241,7 @@ static u64 swap_current_read(struct cgroup_subsys_state *css,
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);

- return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
+ return (u64)page_counter_read(&memcg->memory, MCT_SWAP) * PAGE_SIZE;
}

static u64 swap_peak_read(struct cgroup_subsys_state *css,
@@ -8289,13 +8249,13 @@ static u64 swap_peak_read(struct cgroup_subsys_state *css,
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);

- return (u64)memcg->swap.watermark * PAGE_SIZE;
+ return (u64)memcg->memory.watermark[MCT_SWAP] * PAGE_SIZE;
}

static int swap_high_show(struct seq_file *m, void *v)
{
return seq_puts_memcg_tunable(m,
- READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.high[MCT_SWAP]));
}

static ssize_t swap_high_write(struct kernfs_open_file *of,
@@ -8310,7 +8270,7 @@ static ssize_t swap_high_write(struct kernfs_open_file *of,
if (err)
return err;

- page_counter_set_high(&memcg->swap, high);
+ page_counter_set_high(&memcg->memory, MCT_SWAP, high);

return nbytes;
}
@@ -8318,7 +8278,7 @@ static ssize_t swap_high_write(struct kernfs_open_file *of,
static int swap_max_show(struct seq_file *m, void *v)
{
return seq_puts_memcg_tunable(m,
- READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.max[MCT_SWAP]));
}

static ssize_t swap_max_write(struct kernfs_open_file *of,
@@ -8333,7 +8293,7 @@ static ssize_t swap_max_write(struct kernfs_open_file *of,
if (err)
return err;

- xchg(&memcg->swap.max, max);
+ xchg(&memcg->memory.max[MCT_SWAP], max);

return nbytes;
}
diff --git a/mm/page_counter.c b/mm/page_counter.c
index db20d6452b71..b6ca3adbc226 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -44,18 +44,21 @@ static void propagate_protected_usage(struct page_counter *c,
/**
* page_counter_cancel - take pages out of the local counter
* @counter: counter
+ * @id: memory resource type
* @nr_pages: number of pages to cancel
*/
-void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
+void page_counter_cancel(struct page_counter *counter,
+ enum mem_counter_type id,
+ unsigned long nr_pages)
{
long new;

- new = atomic_long_sub_return(nr_pages, &counter->usage);
+ new = atomic_long_sub_return(nr_pages, &counter->usage[id]);
/* More uncharges than charges? */
if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n",
new, nr_pages)) {
new = 0;
- atomic_long_set(&counter->usage, new);
+ atomic_long_set(&counter->usage[id], new);
}
propagate_protected_usage(counter, new);
}
@@ -63,31 +66,35 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
/**
* page_counter_charge - hierarchically charge pages
* @counter: counter
+ * @id: memory resource type
* @nr_pages: number of pages to charge
*
* NOTE: This does not consider any configured counter limits.
*/
-void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
+void page_counter_charge(struct page_counter *counter,
+ enum mem_counter_type id,
+ unsigned long nr_pages)
{
struct page_counter *c;

for (c = counter; c; c = c->parent) {
long new;

- new = atomic_long_add_return(nr_pages, &c->usage);
+ new = atomic_long_add_return(nr_pages, &c->usage[id]);
propagate_protected_usage(c, new);
/*
* This is indeed racy, but we can live with some
* inaccuracy in the watermark.
*/
- if (new > READ_ONCE(c->watermark))
- WRITE_ONCE(c->watermark, new);
+ if (new > READ_ONCE(c->watermark[id]))
+ WRITE_ONCE(c->watermark[id], new);
}
}

/**
* page_counter_try_charge - try to hierarchically charge pages
* @counter: counter
+ * @id: memory resource type
* @nr_pages: number of pages to charge
* @fail: points first counter to hit its limit, if any
*
@@ -95,6 +102,7 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
* of its ancestors has hit its configured limit.
*/
bool page_counter_try_charge(struct page_counter *counter,
+ enum mem_counter_type id,
unsigned long nr_pages,
struct page_counter **fail)
{
@@ -116,15 +124,15 @@ bool page_counter_try_charge(struct page_counter *counter,
* we either see the new limit or the setter sees the
* counter has changed and retries.
*/
- new = atomic_long_add_return(nr_pages, &c->usage);
- if (new > c->max) {
- atomic_long_sub(nr_pages, &c->usage);
+ new = atomic_long_add_return(nr_pages, &c->usage[id]);
+ if (new > c->max[id]) {
+ atomic_long_sub(nr_pages, &c->usage[id]);
/*
* This is racy, but we can live with some
* inaccuracy in the failcnt which is only used
* to report stats.
*/
- data_race(c->failcnt++);
+ data_race(c->failcnt[id]++);
*fail = c;
goto failed;
}
@@ -133,14 +141,14 @@ bool page_counter_try_charge(struct page_counter *counter,
* Just like with failcnt, we can live with some
* inaccuracy in the watermark.
*/
- if (new > READ_ONCE(c->watermark))
- WRITE_ONCE(c->watermark, new);
+ if (new > READ_ONCE(c->watermark[id]))
+ WRITE_ONCE(c->watermark[id], new);
}
return true;

failed:
for (c = counter; c != *fail; c = c->parent)
- page_counter_cancel(c, nr_pages);
+ page_counter_cancel(c, id, nr_pages);

return false;
}
@@ -148,19 +156,23 @@ bool page_counter_try_charge(struct page_counter *counter,
/**
* page_counter_uncharge - hierarchically uncharge pages
* @counter: counter
+ * @id: memory resource type
* @nr_pages: number of pages to uncharge
*/
-void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
+void page_counter_uncharge(struct page_counter *counter,
+ enum mem_counter_type id,
+ unsigned long nr_pages)
{
struct page_counter *c;

for (c = counter; c; c = c->parent)
- page_counter_cancel(c, nr_pages);
+ page_counter_cancel(c, id, nr_pages);
}

/**
* page_counter_set_max - set the maximum number of pages allowed
* @counter: counter
+ * @id: memory resource type
* @nr_pages: limit to set
*
* Returns 0 on success, -EBUSY if the current number of pages on the
@@ -168,7 +180,9 @@ void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
*
* The caller must serialize invocations on the same counter.
*/
-int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
+int page_counter_set_max(struct page_counter *counter,
+ enum mem_counter_type id,
+ unsigned long nr_pages)
{
for (;;) {
unsigned long old;
@@ -185,17 +199,18 @@ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
* the limit, so if it sees the old limit, we see the
* modified counter and retry.
*/
- usage = page_counter_read(counter);
+ usage = page_counter_read(counter, id);

if (usage > nr_pages)
return -EBUSY;

- old = xchg(&counter->max, nr_pages);
+ old = xchg(&counter->max[id], nr_pages);

- if (page_counter_read(counter) <= usage || nr_pages >= old)
+ if (page_counter_read(counter, id) <= usage ||
+ nr_pages >= old)
return 0;

- counter->max = old;
+ counter->max[id] = old;
cond_resched();
}
}
@@ -214,7 +229,7 @@ void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
WRITE_ONCE(counter->min, nr_pages);

for (c = counter; c; c = c->parent)
- propagate_protected_usage(c, atomic_long_read(&c->usage));
+ propagate_protected_usage(c, atomic_long_read(&c->usage[0]));
}

/**
@@ -231,7 +246,7 @@ void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
WRITE_ONCE(counter->low, nr_pages);

for (c = counter; c; c = c->parent)
- propagate_protected_usage(c, atomic_long_read(&c->usage));
+ propagate_protected_usage(c, atomic_long_read(&c->usage[0]));
}

/**
--
2.43.2


2024-05-03 21:13:32

by Shakeel Butt

[permalink] [raw]
Subject: Re: [PATCH v1 2/4] mm: memcg: merge multiple page_counters into a single structure

On Fri, May 03, 2024 at 01:18:33PM -0700, Roman Gushchin wrote:
[...]
> enum mem_counter_type {
> +#ifdef CONFIG_MEMCG
> MCT_MEMORY, /* cgroup v1 and v2 */
> MCT_SWAP, /* cgroup v2 only */
> MCT_MEMSW = MCT_SWAP, /* cgroup v1 only */
> MCT_KMEM, /* cgroup v1 only */
> MCT_TCPMEM, /* cgroup v1 only */
> +#endif
> +#ifdef CONFIG_CGROUP_HUGETLB
> + MCT_HUGETLB_MAX = __MCT_HUGETLB_MAX,
> +#endif
> + __MCT_NR_ITEMS,
> };
>

Thanks for the awesome work. I haven't gone through all the patches yet
but wanted to ask a quick question. In the above enum are you trying to
do a union between memcg and hugetlb? It took me a while to
understand what you are trying to do.

2024-05-03 21:18:43

by Roman Gushchin

[permalink] [raw]
Subject: Re: [PATCH v1 2/4] mm: memcg: merge multiple page_counters into a single structure

On Fri, May 03, 2024 at 02:11:17PM -0700, Shakeel Butt wrote:
> On Fri, May 03, 2024 at 01:18:33PM -0700, Roman Gushchin wrote:
> [...]
> > enum mem_counter_type {
> > +#ifdef CONFIG_MEMCG
> > MCT_MEMORY, /* cgroup v1 and v2 */
> > MCT_SWAP, /* cgroup v2 only */
> > MCT_MEMSW = MCT_SWAP, /* cgroup v1 only */
> > MCT_KMEM, /* cgroup v1 only */
> > MCT_TCPMEM, /* cgroup v1 only */
> > +#endif
> > +#ifdef CONFIG_CGROUP_HUGETLB
> > + MCT_HUGETLB_MAX = __MCT_HUGETLB_MAX,
> > +#endif
> > + __MCT_NR_ITEMS,
> > };
> >
>
> Thanks for the awesome work. I haven't gone through all the patches yet
> but wanted to ask a quick question. In the above enum are you trying to
> do a union between memcg and hugetlb? It took me a while to
> understand what you are trying to do.

Yep, sort of. So the page_counter structure supports N independent
counters, where N is sufficient for both the memcg and hugetlb cases.

MCT_MEMORY, MCT_SWAP etc. are used directly in the memcontrol.c code,
while the hugetlb code just uses numeric indexes. The MCT_HUGETLB_MAX
magic is needed to define N at compile time.
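
To illustrate (a rough sketch only, not the exact patch layout; the
hugetlb_cgroup field name "res" and the index "idx" below are made up
for the example):

	/* every per-type field becomes an array sized at compile time */
	struct page_counter {
		atomic_long_t usage[__MCT_NR_ITEMS];
		unsigned long watermark[__MCT_NR_ITEMS];
		unsigned long failcnt[__MCT_NR_ITEMS];
		/* ... limits, protection, parent pointer ... */
		unsigned long max[__MCT_NR_ITEMS];
	};

	/* memcg callers use the named indices: */
	page_counter_charge(&memcg->memory, MCT_MEMORY, nr_pages);

	/* hugetlb callers compute a numeric index from the hstate: */
	page_counter_charge(&h_cg->res, idx, nr_pages);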

Thanks!

2024-05-08 02:07:54

by Roman Gushchin

[permalink] [raw]
Subject: Re: [PATCH v1 2/4] mm: memcg: merge multiple page_counters into a single structure


> On May 7, 2024, at 5:26 PM, T.J. Mercier <[email protected]> wrote:
>
> On Fri, May 3, 2024 at 2:18 PM Roman Gushchin <[email protected]> wrote:
>>
>>> On Fri, May 03, 2024 at 02:11:17PM -0700, Shakeel Butt wrote:
>>>> On Fri, May 03, 2024 at 01:18:33PM -0700, Roman Gushchin wrote:
>>> [...]
>>>> enum mem_counter_type {
>>>> +#ifdef CONFIG_MEMCG
>>>> MCT_MEMORY, /* cgroup v1 and v2 */
>>>> MCT_SWAP, /* cgroup v2 only */
>>>> MCT_MEMSW = MCT_SWAP, /* cgroup v1 only */
>>>> MCT_KMEM, /* cgroup v1 only */
>>>> MCT_TCPMEM, /* cgroup v1 only */
>>>> +#endif
>>>> +#ifdef CONFIG_CGROUP_HUGETLB
>>>> + MCT_HUGETLB_MAX = __MCT_HUGETLB_MAX,
>>>> +#endif
>>>> + __MCT_NR_ITEMS,
>>>> };
>>>>
>>>
>>> Thanks for the awesome work. I haven't gone through all the patches yet
>>> but wanted to ask a quick question. In the above enum are you trying to
>>> do a union between memcg and hugetlb? It took me a while to
>>> understand what you are trying to do.
>>
>> Yep, sort of. So the page_counter structure supports N independent
>> counters, where N is sufficient for both the memcg and hugetlb cases.
>>
>> MCT_MEMORY, MCT_SWAP etc. are used directly in the memcontrol.c code,
>> while the hugetlb code just uses numeric indexes. The MCT_HUGETLB_MAX
>> magic is needed to define N at compile time.
>
> Where N is __MCT_NR_ITEMS for all the counter array lengths? That
> doesn't look like it works if MCT_HUGETLB_MAX is small... i.e. when
> both CONFIG_MEMCG and CONFIG_CGROUP_HUGETLB are enabled and
> __MCT_HUGETLB_MAX is 1 or 3, MCT_HUGETLB_MAX would be < MCT_TCPMEM and
> then __MCT_NR_ITEMS would be wrong?
>
> If so, what about:
>
> enum mem_counter_type {
> #ifdef CONFIG_MEMCG
> MCT_MEMORY, /* cgroup v1 and v2 */
> MCT_SWAP, /* cgroup v2 only */
> MCT_MEMSW = MCT_SWAP, /* cgroup v1 only */
> MCT_KMEM, /* cgroup v1 only */
> MCT_TCPMEM, /* cgroup v1 only */
> #endif
> MCT_MEMCG_NR_ITEMS,
> #ifdef CONFIG_CGROUP_HUGETLB
> MCT_HUGETLB_MAX = MCT_MEMCG_NR_ITEMS + __MCT_HUGETLB_MAX,
> #else
> MCT_HUGETLB_MAX = 0,
> #endif
> __MCT_NR_ITEMS = MAX(MCT_MEMCG_NR_ITEMS, MCT_HUGETLB_MAX)
> };

The page_counter structure is not shared between memory and hugetlb cgroups, so N only has to be big enough to accommodate either the 4 memcg counters __or__ the 2 * HUGE_MAX_HSTATE hugetlb counters. Your version has enough space for both.
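
As a rough illustration only, assuming __MCT_HUGETLB_MAX is defined as
2 * HUGE_MAX_HSTATE and an arch with two hstates (the numbers below are
just an example, not taken from the patch):

	MCT_MEMCG_NR_ITEMS = 4			/* memory, swap, kmem, tcpmem */
	__MCT_HUGETLB_MAX  = 2 * 2 = 4
	MCT_HUGETLB_MAX    = 4 + 4 = 8		/* in your variant */
	__MCT_NR_ITEMS     = MAX(4, 8) = 8

which is more than either user strictly needs, but big enough for both.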

Thanks!

2024-05-10 07:16:12

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH v1 2/4] mm: memcg: merge multiple page_counters into a single structure

Hello,

kernel test robot noticed "WARNING:at_mm/page_counter.c:#page_counter_cancel" on:

commit: 214583b2262ef6157ee9834fa23a7da8f2292dd2 ("[PATCH v1 2/4] mm: memcg: merge multiple page_counters into a single structure")
url: https://github.com/intel-lab-lkp/linux/commits/Roman-Gushchin/mm-memcg-convert-enum-res_type-to-mem_counter_type/20240504-042046
base: https://git.kernel.org/cgit/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/all/[email protected]/
patch subject: [PATCH v1 2/4] mm: memcg: merge multiple page_counters into a single structure

in testcase: ltp
version: ltp-x86_64-14c1f76-1_20240504
with following parameters:

disk: 1HDD
fs: xfs
test: syscalls-03

compiler: gcc-13
test machine: 4 threads 1 sockets Intel(R) Core(TM) i3-3220 CPU @ 3.30GHz (Ivy Bridge) with 8G memory

(please refer to attached dmesg/kmsg for entire log/backtrace)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-lkp/[email protected]


kern :warn : [ 551.565920] ------------[ cut here ]------------
kern :warn : [ 551.573137] page_counter underflow: -512 nr_pages=512
kern :warn : [ 551.585841] WARNING: CPU: 0 PID: 6724 at mm/page_counter.c:58 page_counter_cancel (mm/page_counter.c:58 (discriminator 1))
kern :warn : [ 551.810031] CPU: 0 PID: 6724 Comm: memfd_create03 Tainted: G S 6.9.0-rc4-00574-g214583b2262e #1
kern :warn : [ 551.820871] Hardware name: Hewlett-Packard HP Pro 3340 MT/17A1, BIOS 8.07 01/24/2013
kern :warn : [ 551.829368] RIP: 0010:page_counter_cancel (mm/page_counter.c:58 (discriminator 1))
kern :warn : [ 551.835103] Code: 3c 02 00 75 4f 49 c7 04 24 00 00 00 00 31 f6 e9 71 ff ff ff 48 89 ea 48 c7 c7 a0 88 f6 83 c6 05 06 21 d6 03 01 e8 84 d9 72 ff <0f> 0b eb ad 48 89 34 24 e8 d7 94 fb ff 48 8b 34 24 e9 67 ff ff ff
All code
========
0: 3c 02 cmp $0x2,%al
2: 00 75 4f add %dh,0x4f(%rbp)
5: 49 c7 04 24 00 00 00 movq $0x0,(%r12)
c: 00
d: 31 f6 xor %esi,%esi
f: e9 71 ff ff ff jmp 0xffffffffffffff85
14: 48 89 ea mov %rbp,%rdx
17: 48 c7 c7 a0 88 f6 83 mov $0xffffffff83f688a0,%rdi
1e: c6 05 06 21 d6 03 01 movb $0x1,0x3d62106(%rip) # 0x3d6212b
25: e8 84 d9 72 ff call 0xffffffffff72d9ae
2a:* 0f 0b ud2 <-- trapping instruction
2c: eb ad jmp 0xffffffffffffffdb
2e: 48 89 34 24 mov %rsi,(%rsp)
32: e8 d7 94 fb ff call 0xfffffffffffb950e
37: 48 8b 34 24 mov (%rsp),%rsi
3b: e9 67 ff ff ff jmp 0xffffffffffffffa7

Code starting with the faulting instruction
===========================================
0: 0f 0b ud2
2: eb ad jmp 0xffffffffffffffb1
4: 48 89 34 24 mov %rsi,(%rsp)
8: e8 d7 94 fb ff call 0xfffffffffffb94e4
d: 48 8b 34 24 mov (%rsp),%rsi
11: e9 67 ff ff ff jmp 0xffffffffffffff7d
kern :warn : [ 551.854617] RSP: 0018:ffffc9000817fb58 EFLAGS: 00010286
kern :warn : [ 551.860610] RAX: 0000000000000000 RBX: ffff8881001c4100 RCX: ffffffff8239a90e
kern :warn : [ 551.868499] RDX: 1ffff11030706a6c RSI: 0000000000000008 RDI: ffff888183835360
kern :warn : [ 551.876394] RBP: 0000000000000200 R08: 0000000000000001 R09: fffff5200102ff23
kern :warn : [ 551.884295] R10: ffffc9000817f91f R11: 205d363233542020 R12: ffff8881001c4100
kern :warn : [ 551.892184] R13: 0000000000000000 R14: 0000000000000000 R15: ffffffff869a1de8
kern :warn : [ 551.900067] FS: 00007f45c0bc1740(0000) GS:ffff888183800000(0000) knlGS:0000000000000000
kern :warn : [ 551.908910] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
kern :warn : [ 551.915420] CR2: 00007f45c0c73900 CR3: 0000000206448002 CR4: 00000000001706f0
kern :warn : [ 551.923304] Call Trace:
kern :warn : [ 551.926508] <TASK>
kern :warn : [ 551.929366] ? __warn (kernel/panic.c:694)
kern :warn : [ 551.933354] ? page_counter_cancel (mm/page_counter.c:58 (discriminator 1))
kern :warn : [ 551.938467] ? report_bug (lib/bug.c:180 lib/bug.c:219)
kern :warn : [ 551.942892] ? handle_bug (arch/x86/kernel/traps.c:239 (discriminator 1))
kern :warn : [ 551.947142] ? exc_invalid_op (arch/x86/kernel/traps.c:260 (discriminator 1))
kern :warn : [ 551.951741] ? asm_exc_invalid_op (arch/x86/include/asm/idtentry.h:621)
kern :warn : [ 551.956684] ? llist_add_batch (lib/llist.c:33 (discriminator 14))
kern :warn : [ 551.961451] ? page_counter_cancel (mm/page_counter.c:58 (discriminator 1))
kern :warn : [ 551.966564] ? page_counter_cancel (mm/page_counter.c:58 (discriminator 1))
kern :warn : [ 551.971674] page_counter_uncharge (mm/page_counter.c:168 (discriminator 3))
kern :warn : [ 551.976706] hugetlb_cgroup_uncharge_counter (mm/hugetlb_cgroup.c:392)
kern :warn : [ 551.982684] hugetlb_vm_op_close (mm/hugetlb.c:5222)
kern :warn : [ 551.987713] remove_vma (mm/mmap.c:142)
kern :warn : [ 551.991870] do_vmi_align_munmap (mm/mmap.c:2336 mm/mmap.c:2685)
kern :warn : [ 551.996897] ? __pfx_do_vmi_align_munmap (mm/mmap.c:2561)
kern :warn : [ 552.002446] do_vmi_munmap (mm/mmap.c:2757)
kern :warn : [ 552.006948] __vm_munmap (mm/mmap.c:3036)
kern :warn : [ 552.011288] ? __pfx___vm_munmap (mm/mmap.c:3027)
kern :warn : [ 552.016138] ? __pfx_ksys_write (fs/read_write.c:633)
kern :warn : [ 552.020914] __x64_sys_munmap (mm/mmap.c:3050)
kern :warn : [ 552.025509] do_syscall_64 (arch/x86/entry/common.c:52 (discriminator 1) arch/x86/entry/common.c:83 (discriminator 1))
kern :warn : [ 552.029924] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130)
kern :warn : [ 552.035733] RIP: 0033:0x7f45c0cc58f7
kern :warn : [ 552.040067] Code: 00 00 00 48 8b 15 09 05 0d 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 b8 0b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d d9 04 0d 00 f7 d8 64 89 01 48
All code
========
0: 00 00 add %al,(%rax)
2: 00 48 8b add %cl,-0x75(%rax)
5: 15 09 05 0d 00 adc $0xd0509,%eax
a: f7 d8 neg %eax
c: 64 89 02 mov %eax,%fs:(%rdx)
f: 48 c7 c0 ff ff ff ff mov $0xffffffffffffffff,%rax
16: c3 ret
17: 66 2e 0f 1f 84 00 00 cs nopw 0x0(%rax,%rax,1)
1e: 00 00 00
21: 66 90 xchg %ax,%ax
23: b8 0b 00 00 00 mov $0xb,%eax
28: 0f 05 syscall
2a:* 48 3d 01 f0 ff ff cmp $0xfffffffffffff001,%rax <-- trapping instruction
30: 73 01 jae 0x33
32: c3 ret
33: 48 8b 0d d9 04 0d 00 mov 0xd04d9(%rip),%rcx # 0xd0513
3a: f7 d8 neg %eax
3c: 64 89 01 mov %eax,%fs:(%rcx)
3f: 48 rex.W

Code starting with the faulting instruction
===========================================
0: 48 3d 01 f0 ff ff cmp $0xfffffffffffff001,%rax
6: 73 01 jae 0x9
8: c3 ret
9: 48 8b 0d d9 04 0d 00 mov 0xd04d9(%rip),%rcx # 0xd04e9
10: f7 d8 neg %eax
12: 64 89 01 mov %eax,%fs:(%rcx)
15: 48 rex.W


The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20240510/[email protected]

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki