LinuxLists.cc - [PATCH v3 0/2] memcg: variable type fixes

2011-02-17 20:55:08

Subject: [PATCH v3 0/2] memcg: variable type fixes

Changes since -v2:
- Reworded patch 1 commit message.

This is a two part series that is a cleanup of memcg internal
counters. These two patches were originally proposed by Johannes
Weiner [1]. The original series was based on an older mmotm, so I had
to massage the patches a little. The patches are based on
mmotm-2011-02-10-16-26.

Patch 1 implements the idea that memcg only has to use signed types
for some of the counters, but not for the constant monotonically
increasing event counters where the sign-bit is a waste.
Originally proposed in [2].

Patch 2 converts the memcg fundamental page statistics counters to
native words as they should be wide enough for the expected values.
Originally proposed in [3].

References:
[1] https://lkml.org/lkml/2010/11/7/170
[2] https://lkml.org/lkml/2010/11/7/174
[3] https://lkml.org/lkml/2010/11/7/171

Johannes Weiner (2):
memcg: break out event counters from other stats
memcg: use native word page statistics counters

mm/memcontrol.c | 78 ++++++++++++++++++++++++++++++++++++-------------------
1 files changed, 51 insertions(+), 27 deletions(-)

--
1.7.3.1

2011-02-17 20:54:58

by Greg Thelen

[permalink] [raw]

Subject: [PATCH v3 2/2] memcg: use native word page statistics counters

From: Johannes Weiner <[email protected]>

The statistic counters are in units of pages, there is no reason to
make them 64-bit wide on 32-bit machines.

Make them native words. Since they are signed, this leaves 31 bit on
32-bit machines, which can represent roughly 8TB assuming a page size
of 4k.

Signed-off-by: Johannes Weiner <[email protected]>
Signed-off-by: Greg Thelen <[email protected]>
Acked-by: KAMEZAWA Hiroyuki <[email protected]>
Acked-by: Balbir Singh <[email protected]>
---
mm/memcontrol.c | 29 ++++++++++++++---------------
1 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a11ff1e..1c2704a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -107,7 +107,7 @@ enum mem_cgroup_events_index {
};

struct mem_cgroup_stat_cpu {
- s64 count[MEM_CGROUP_STAT_NSTATS];
+ long count[MEM_CGROUP_STAT_NSTATS];
unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
};

@@ -546,11 +546,11 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
* common workload, threashold and synchonization as vmstat[] should be
* implemented.
*/
-static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
- enum mem_cgroup_stat_index idx)
+static long mem_cgroup_read_stat(struct mem_cgroup *mem,
+ enum mem_cgroup_stat_index idx)
{
+ long val = 0;
int cpu;
- s64 val = 0;

get_online_cpus();
for_each_online_cpu(cpu)
@@ -564,9 +564,9 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
return val;
}

-static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
+static long mem_cgroup_local_usage(struct mem_cgroup *mem)
{
- s64 ret;
+ long ret;

ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
@@ -1761,7 +1761,7 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)

spin_lock(&mem->pcp_counter_lock);
for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
- s64 x = per_cpu(mem->stat->count[i], cpu);
+ long x = per_cpu(mem->stat->count[i], cpu);

per_cpu(mem->stat->count[i], cpu) = 0;
mem->nocpu_base.count[i] += x;
@@ -3473,13 +3473,13 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
}

-static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
- enum mem_cgroup_stat_index idx)
+static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem,
+ enum mem_cgroup_stat_index idx)
{
struct mem_cgroup *iter;
- s64 val = 0;
+ long val = 0;

- /* each per cpu's value can be minus.Then, use s64 */
+ /* Per-cpu values can be negative, use a signed accumulator */
for_each_mem_cgroup_tree(iter, mem)
val += mem_cgroup_read_stat(iter, idx);

@@ -3499,12 +3499,11 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
return res_counter_read_u64(&mem->memsw, RES_USAGE);
}

- val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE);
- val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS);
+ val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE);
+ val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS);

if (swap)
- val += mem_cgroup_get_recursive_idx_stat(mem,
- MEM_CGROUP_STAT_SWAPOUT);
+ val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT);

return val << PAGE_SHIFT;
}
--
1.7.3.1

2011-02-17 20:55:13

by Greg Thelen

[permalink] [raw]

Subject: [PATCH v3 1/2] memcg: break out event counters from other stats

From: Johannes Weiner <[email protected]>

For increasing and decreasing per-cpu cgroup usage counters it makes
sense to use signed types, as single per-cpu values might go negative
during updates. But this is not the case for only-ever-increasing
event counters.

All the counters have been signed 64-bit so far, which was enough to
count events even with the sign bit wasted.

This patch:
- divides s64 counters into signed usage counters and unsigned
monotonically increasing event counters.
- converts unsigned event counters into 'unsigned long' rather than
'u64'. This matches the type used by the /proc/vmstat event counters.

The next patch narrows the signed usage counters type (on 32-bit CPUs,
that is).

Signed-off-by: Johannes Weiner <[email protected]>
Signed-off-by: Greg Thelen <[email protected]>
Acked-by: KAMEZAWA Hiroyuki <[email protected]>
Acked-by: Balbir Singh <[email protected]>
---
Changelog since v2:
- Reworded commit message.

mm/memcontrol.c | 49 +++++++++++++++++++++++++++++++++++++------------
1 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b7e3379..a11ff1e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -93,19 +93,22 @@ enum mem_cgroup_stat_index {
MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
- MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
- MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
- /* incremented at every pagein/pageout */
- MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA,
MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */
-
MEM_CGROUP_STAT_NSTATS,
};

+enum mem_cgroup_events_index {
+ MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
+ MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
+ MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */
+ MEM_CGROUP_EVENTS_NSTATS,
+};
+
struct mem_cgroup_stat_cpu {
s64 count[MEM_CGROUP_STAT_NSTATS];
+ unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
};

/*
@@ -577,6 +580,22 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
}

+static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem,
+ enum mem_cgroup_events_index idx)
+{
+ unsigned long val = 0;
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ val += per_cpu(mem->stat->events[idx], cpu);
+#ifdef CONFIG_HOTPLUG_CPU
+ spin_lock(&mem->pcp_counter_lock);
+ val += mem->nocpu_base.events[idx];
+ spin_unlock(&mem->pcp_counter_lock);
+#endif
+ return val;
+}
+
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
bool file, int nr_pages)
{
@@ -589,13 +608,13 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,

/* pagein of a big page is an event. So, ignore page size */
if (nr_pages > 0)
- __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
+ __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
else {
- __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
+ __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
nr_pages = -nr_pages; /* for event */
}

- __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages);
+ __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);

preempt_enable();
}
@@ -617,9 +636,9 @@ static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,

static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift)
{
- s64 val;
+ unsigned long val;

- val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]);
+ val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);

return !(val & ((1 << event_mask_shift) - 1));
}
@@ -1747,6 +1766,12 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
per_cpu(mem->stat->count[i], cpu) = 0;
mem->nocpu_base.count[i] += x;
}
+ for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
+ unsigned long x = per_cpu(mem->stat->events[i], cpu);
+
+ per_cpu(mem->stat->events[i], cpu) = 0;
+ mem->nocpu_base.events[i] += x;
+ }
/* need to clear ON_MOVE value, works as a kind of lock. */
per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
spin_unlock(&mem->pcp_counter_lock);
@@ -3699,9 +3724,9 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
s->stat[MCS_RSS] += val * PAGE_SIZE;
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
- val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
+ val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN);
s->stat[MCS_PGPGIN] += val;
- val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT);
+ val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT);
s->stat[MCS_PGPGOUT] += val;
if (do_swap_account) {
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
--
1.7.3.1