As of v3.11-9444-g3ea67d0 "memcg: add per cgroup writeback pages accounting",
memcg's use of __this_cpu_add(counter, -nr_pages) leads to incorrect statistic
values because the negated nr_pages is not sign extended (the counter is a long,
nr_pages an unsigned int). The obvious memcg fix is __this_cpu_sub(counter,
nr_pages). But that alone doesn't work, because __this_cpu_sub(counter,
nr_pages) is itself implemented as __this_cpu_add(counter, -nr_pages), which
suffers from the same problem. Example:
unsigned int delta = 1
preempt_disable()
this_cpu_write(long_counter, 0)
this_cpu_sub(long_counter, delta)
preempt_enable()
Before this change, long_counter on a 64 bit machine ends up with the value
0xffffffff rather than 0xffffffffffffffff. This is because this_cpu_sub(pcp,
delta) boils down to this_cpu_add(pcp, -delta), which is basically:
long_counter = 0 + 0xffffffff
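For illustration only (a standalone userspace sketch on an LP64 machine, not
part of the series), the same integer promotion behaviour can be reproduced
with a plain long:

#include <stdio.h>

int main(void)
{
	long counter = 0;
	unsigned int nr_pages = 1;

	/* -nr_pages is evaluated as unsigned int: 0xffffffff, zero extended */
	counter += -nr_pages;
	printf("counter += -nr_pages: 0x%lx\n", (unsigned long)counter);
	/* prints 0xffffffff */

	counter = 0;
	/* true subtraction promotes nr_pages to long, so it sign extends */
	counter -= nr_pages;
	printf("counter -= nr_pages:  0x%lx\n", (unsigned long)counter);
	/* prints 0xffffffffffffffff */
	return 0;
}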
Inspection of v3.12-rc6 shows that only new memcg code is affected by this
problem: the new mem_cgroup_move_account_page_stat() is the only place where an
unsigned adjustment is used. All other callers (e.g. shrink_dcache_sb) already
use signed adjustments, so there are no problems before v3.12. I did not audit
the stable kernel trees, though, so something could be hiding in there.
Patch 1 creates a test module for percpu operations which demonstrates the
__this_cpu_sub() problems. This patch is independent and can be discarded if
there is no interest.
Patch 2 fixes __this_cpu_sub() to work with unsigned adjustments.
Patch 3 uses __this_cpu_sub() in memcg.
An alternative smaller solution is for memcg to use:
__this_cpu_add(counter, -(int)nr_pages)
thereby admitting that __this_cpu_add/sub() don't work with unsigned
adjustments. But I felt it was better to fix the core operations to prevent
this in the future.
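For comparison, the effect of that cast in the same userspace sketch as above
(illustrative only, not the proposed patch):

	long counter = 0;
	unsigned int nr_pages = 1;

	/* (int)nr_pages negates to the int -1, which sign extends to long */
	counter += -(int)nr_pages;
	/* counter is now -1 (0xffffffffffffffff), as intended */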
Changes from V1:
- more accurate patch titles, patch logs, and test module description now
referring to per cpu operations rather than per cpu counters.
- move small test code update from patch 2 to patch 1 (where the test is
introduced).
Greg Thelen (3):
percpu: add test module for various percpu operations
percpu: fix this_cpu_sub() subtrahend casting for unsigneds
memcg: use __this_cpu_sub() to dec stats to avoid incorrect subtrahend
casting
arch/x86/include/asm/percpu.h | 3 +-
include/linux/percpu.h | 8 +--
lib/Kconfig.debug | 9 +++
lib/Makefile | 2 +
lib/percpu_test.c | 138 ++++++++++++++++++++++++++++++++++++++++++
mm/memcontrol.c | 2 +-
6 files changed, 156 insertions(+), 6 deletions(-)
create mode 100644 lib/percpu_test.c
--
1.8.4.1
Tests various percpu operations.
Enable with CONFIG_PERCPU_TEST=m.
Signed-off-by: Greg Thelen <[email protected]>
Acked-by: Tejun Heo <[email protected]>
---
lib/Kconfig.debug | 9 ++++
lib/Makefile | 2 +
lib/percpu_test.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 149 insertions(+)
create mode 100644 lib/percpu_test.c
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 06344d9..9fdb452 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1472,6 +1472,15 @@ config INTERVAL_TREE_TEST
help
A benchmark measuring the performance of the interval tree library
+config PERCPU_TEST
+ tristate "Per cpu operations test"
+ depends on m && DEBUG_KERNEL
+ help
+ Enable this option to build a test module which validates per-cpu
+ operations.
+
+ If unsure, say N.
+
config ATOMIC64_SELFTEST
bool "Perform an atomic64_t self-test at boot"
help
diff --git a/lib/Makefile b/lib/Makefile
index f3bb2cb..bb016e1 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -157,6 +157,8 @@ obj-$(CONFIG_INTERVAL_TREE_TEST) += interval_tree_test.o
interval_tree_test-objs := interval_tree_test_main.o interval_tree.o
+obj-$(CONFIG_PERCPU_TEST) += percpu_test.o
+
obj-$(CONFIG_ASN1) += asn1_decoder.o
obj-$(CONFIG_FONT_SUPPORT) += fonts/
diff --git a/lib/percpu_test.c b/lib/percpu_test.c
new file mode 100644
index 0000000..fcca49e
--- /dev/null
+++ b/lib/percpu_test.c
@@ -0,0 +1,138 @@
+#include <linux/module.h>
+
+/* validate @native and @pcp counter values match @expected */
+#define CHECK(native, pcp, expected) \
+ do { \
+ WARN((native) != (expected), \
+ "raw %ld (0x%lx) != expected %ld (0x%lx)", \
+ (long)(native), (long)(native), \
+ (long)(expected), (long)(expected)); \
+ WARN(__this_cpu_read(pcp) != (expected), \
+ "pcp %ld (0x%lx) != expected %ld (0x%lx)", \
+ (long)__this_cpu_read(pcp), (long)__this_cpu_read(pcp), \
+ (long)(expected), (long)(expected)); \
+ } while (0)
+
+static DEFINE_PER_CPU(long, long_counter);
+static DEFINE_PER_CPU(unsigned long, ulong_counter);
+
+static int __init percpu_test_init(void)
+{
+ /*
+ * volatile prevents the compiler from optimizing its uses; otherwise the
+ * +ui_one and -ui_one below would be replaced with inc/dec instructions.
+ */
+ volatile unsigned int ui_one = 1;
+ long l = 0;
+ unsigned long ul = 0;
+
+ pr_info("percpu test start\n");
+
+ preempt_disable();
+
+ l += -1;
+ __this_cpu_add(long_counter, -1);
+ CHECK(l, long_counter, -1);
+
+ l += 1;
+ __this_cpu_add(long_counter, 1);
+ CHECK(l, long_counter, 0);
+
+ ul = 0;
+ __this_cpu_write(ulong_counter, 0);
+
+ ul += 1UL;
+ __this_cpu_add(ulong_counter, 1UL);
+ CHECK(ul, ulong_counter, 1);
+
+ ul += -1UL;
+ __this_cpu_add(ulong_counter, -1UL);
+ CHECK(ul, ulong_counter, 0);
+
+ ul += -(unsigned long)1;
+ __this_cpu_add(ulong_counter, -(unsigned long)1);
+ CHECK(ul, ulong_counter, -1);
+
+ ul = 0;
+ __this_cpu_write(ulong_counter, 0);
+
+ ul -= 1;
+ __this_cpu_dec(ulong_counter);
+ CHECK(ul, ulong_counter, 0xffffffffffffffff);
+ CHECK(ul, ulong_counter, -1);
+
+ l += -ui_one;
+ __this_cpu_add(long_counter, -ui_one);
+ CHECK(l, long_counter, 0xffffffff);
+
+ l += ui_one;
+ __this_cpu_add(long_counter, ui_one);
+ CHECK(l, long_counter, 0x100000000);
+
+
+ l = 0;
+ __this_cpu_write(long_counter, 0);
+
+ l -= ui_one;
+ __this_cpu_sub(long_counter, ui_one);
+ CHECK(l, long_counter, -1);
+
+ l = 0;
+ __this_cpu_write(long_counter, 0);
+
+ l += ui_one;
+ __this_cpu_add(long_counter, ui_one);
+ CHECK(l, long_counter, 1);
+
+ l += -ui_one;
+ __this_cpu_add(long_counter, -ui_one);
+ CHECK(l, long_counter, 0x100000000);
+
+ l = 0;
+ __this_cpu_write(long_counter, 0);
+
+ l -= ui_one;
+ this_cpu_sub(long_counter, ui_one);
+ CHECK(l, long_counter, -1);
+ CHECK(l, long_counter, 0xffffffffffffffff);
+
+ ul = 0;
+ __this_cpu_write(ulong_counter, 0);
+
+ ul += ui_one;
+ __this_cpu_add(ulong_counter, ui_one);
+ CHECK(ul, ulong_counter, 1);
+
+ ul = 0;
+ __this_cpu_write(ulong_counter, 0);
+
+ ul -= ui_one;
+ __this_cpu_sub(ulong_counter, ui_one);
+ CHECK(ul, ulong_counter, -1);
+ CHECK(ul, ulong_counter, 0xffffffffffffffff);
+
+ ul = 3;
+ __this_cpu_write(ulong_counter, 3);
+
+ ul = this_cpu_sub_return(ulong_counter, ui_one);
+ CHECK(ul, ulong_counter, 2);
+
+ ul = __this_cpu_sub_return(ulong_counter, ui_one);
+ CHECK(ul, ulong_counter, 1);
+
+ preempt_enable();
+
+ pr_info("percpu test done\n");
+ return -EAGAIN; /* failing init unloads the module immediately */
+}
+
+static void __exit percpu_test_exit(void)
+{
+}
+
+module_init(percpu_test_init)
+module_exit(percpu_test_exit)
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Greg Thelen");
+MODULE_DESCRIPTION("percpu operations test");
--
1.8.4.1
this_cpu_sub() is implemented as negation and addition.
This patch casts the adjustment to the counter type before negation to
sign extend the adjustment. This helps in cases where the counter
type is wider than an unsigned adjustment. An alternative to this
patch is to declare such operations unsupported, but it seemed useful
to avoid surprises.
This patch specifically helps the following example:
unsigned int delta = 1
preempt_disable()
this_cpu_write(long_counter, 0)
this_cpu_sub(long_counter, delta)
preempt_enable()
Before this change long_counter on a 64 bit machine ends with value
0xffffffff, rather than 0xffffffffffffffff. This is because
this_cpu_sub(pcp, delta) boils down to this_cpu_add(pcp, -delta),
which is basically:
long_counter = 0 + 0xffffffff
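With the cast applied before negation, the adjustment is negated in the
counter's type and therefore sign extends. A minimal userspace illustration of
the same idea (my sketch, not the kernel macro itself; typeof is the GCC
extension the percpu macros already rely on):

	long pcp = 0;
	unsigned int delta = 1;

	pcp += -(delta);		/* old behaviour: 0xffffffff   */
	pcp = 0;
	pcp += -(typeof(pcp))(delta);	/* new behaviour: -1, sign extended */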
Also apply the same cast to:
__this_cpu_sub()
__this_cpu_sub_return()
this_cpu_sub_return()
All percpu_test.ko passes, especially the following cases which
previously failed:
l -= ui_one;
__this_cpu_sub(long_counter, ui_one);
CHECK(l, long_counter, -1);
l -= ui_one;
this_cpu_sub(long_counter, ui_one);
CHECK(l, long_counter, -1);
CHECK(l, long_counter, 0xffffffffffffffff);
ul -= ui_one;
__this_cpu_sub(ulong_counter, ui_one);
CHECK(ul, ulong_counter, -1);
CHECK(ul, ulong_counter, 0xffffffffffffffff);
ul = this_cpu_sub_return(ulong_counter, ui_one);
CHECK(ul, ulong_counter, 2);
ul = __this_cpu_sub_return(ulong_counter, ui_one);
CHECK(ul, ulong_counter, 1);
Signed-off-by: Greg Thelen <[email protected]>
Acked-by: Tejun Heo <[email protected]>
---
arch/x86/include/asm/percpu.h | 3 ++-
include/linux/percpu.h | 8 ++++----
2 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 0da5200..b3e18f8 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -128,7 +128,8 @@ do { \
do { \
typedef typeof(var) pao_T__; \
const int pao_ID__ = (__builtin_constant_p(val) && \
- ((val) == 1 || (val) == -1)) ? (val) : 0; \
+ ((val) == 1 || (val) == -1)) ? \
+ (int)(val) : 0; \
if (0) { \
pao_T__ pao_tmp__; \
pao_tmp__ = (val); \
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index cc88172..c74088a 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -332,7 +332,7 @@ do { \
#endif
#ifndef this_cpu_sub
-# define this_cpu_sub(pcp, val) this_cpu_add((pcp), -(val))
+# define this_cpu_sub(pcp, val) this_cpu_add((pcp), -(typeof(pcp))(val))
#endif
#ifndef this_cpu_inc
@@ -418,7 +418,7 @@ do { \
# define this_cpu_add_return(pcp, val) __pcpu_size_call_return2(this_cpu_add_return_, pcp, val)
#endif
-#define this_cpu_sub_return(pcp, val) this_cpu_add_return(pcp, -(val))
+#define this_cpu_sub_return(pcp, val) this_cpu_add_return(pcp, -(typeof(pcp))(val))
#define this_cpu_inc_return(pcp) this_cpu_add_return(pcp, 1)
#define this_cpu_dec_return(pcp) this_cpu_add_return(pcp, -1)
@@ -586,7 +586,7 @@ do { \
#endif
#ifndef __this_cpu_sub
-# define __this_cpu_sub(pcp, val) __this_cpu_add((pcp), -(val))
+# define __this_cpu_sub(pcp, val) __this_cpu_add((pcp), -(typeof(pcp))(val))
#endif
#ifndef __this_cpu_inc
@@ -668,7 +668,7 @@ do { \
__pcpu_size_call_return2(__this_cpu_add_return_, pcp, val)
#endif
-#define __this_cpu_sub_return(pcp, val) __this_cpu_add_return(pcp, -(val))
+#define __this_cpu_sub_return(pcp, val) __this_cpu_add_return(pcp, -(typeof(pcp))(val))
#define __this_cpu_inc_return(pcp) __this_cpu_add_return(pcp, 1)
#define __this_cpu_dec_return(pcp) __this_cpu_add_return(pcp, -1)
--
1.8.4.1
As of v3.11-9444-g3ea67d0 "memcg: add per cgroup writeback pages
accounting" memcg counter errors are possible when moving charged
memory to a different memcg. Charge movement occurs when processing
writes to memory.force_empty, moving tasks to a memcg with
memory.move_charge_at_immigrate=1, or memcg deletion. An example
showing the error after memory.force_empty:
$ cd /sys/fs/cgroup/memory
$ mkdir x
$ rm /data/tmp/file
$ (echo $BASHPID >> x/tasks && exec mmap_writer /data/tmp/file 1M) &
[1] 13600
$ grep ^mapped x/memory.stat
mapped_file 1048576
$ echo 13600 > tasks
$ echo 1 > x/memory.force_empty
$ grep ^mapped x/memory.stat
mapped_file 4503599627370496
mapped_file should end with 0.
4503599627370496 == 0x10,0000,0000,0000 == 0x100,0000,0000 pages
1048576 == 0x10,0000 == 0x100 pages
This issue only affects the source memcg on 64 bit machines; the
destination memcg counters are correct. So the rmdir case is not too
important because such counters soon disappear with the entire
memcg. But the memory.force_empty and
memory.move_charge_at_immigrate=1 cases are larger problems, as the
bogus counters are visible for the (possibly long) remaining life of
the source memcg.
The problem is due to memcg's use of __this_cpu_add(.., -nr_pages),
which is subtly wrong because it adds the negated unsigned int nr_pages
(1, or 512 for THP) to a signed long percpu counter. When nr_pages=1,
-nr_pages evaluates to 0xffffffff as an unsigned int. On 64 bit machines
stat->count[idx] is a signed 64 bit long, so memcg's attempt to simply
decrement a count (e.g. from 1 to 0) boils down to:
long count = 1
unsigned int nr_pages = 1
count += -nr_pages /* -nr_pages == 0xffff,ffff */
count is now 0x1,0000,0000 instead of 0
The fix is to subtract the unsigned page count rather than adding its
negation. This only works once "percpu: fix this_cpu_sub() subtrahend
casting for unsigneds" is applied to fix this_cpu_sub().
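An illustrative before/after in plain userspace C (my sketch, not the kernel
code itself):

	long count = 1;
	unsigned int nr_pages = 1;

	count += -nr_pages;	/* broken: count becomes 0x100000000 */
	count = 1;
	count -= nr_pages;	/* fixed: nr_pages promotes to long, count == 0 */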
Signed-off-by: Greg Thelen <[email protected]>
Acked-by: Tejun Heo <[email protected]>
---
mm/memcontrol.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index aa8185c..b7ace0f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3773,7 +3773,7 @@ void mem_cgroup_move_account_page_stat(struct mem_cgroup *from,
{
/* Update stat data for mem_cgroup */
preempt_disable();
- __this_cpu_add(from->stat->count[idx], -nr_pages);
+ __this_cpu_sub(from->stat->count[idx], nr_pages);
__this_cpu_add(to->stat->count[idx], nr_pages);
preempt_enable();
}
--
1.8.4.1
On Sun, Oct 27, 2013 at 10:30:16AM -0700, Greg Thelen wrote:
> this_cpu_sub() is implemented as negation and addition.
>
> This patch casts the adjustment to the counter type before negation to
> sign extend the adjustment. This helps in cases where the counter
> type is wider than an unsigned adjustment. An alternative to this
> patch is to declare such operations unsupported, but it seemed useful
> to avoid surprises.
>
> This patch specifically helps the following example:
> unsigned int delta = 1
> preempt_disable()
> this_cpu_write(long_counter, 0)
> this_cpu_sub(long_counter, delta)
> preempt_enable()
>
> Before this change long_counter on a 64 bit machine ends with value
> 0xffffffff, rather than 0xffffffffffffffff. This is because
> this_cpu_sub(pcp, delta) boils down to this_cpu_add(pcp, -delta),
> which is basically:
> long_counter = 0 + 0xffffffff
>
> Also apply the same cast to:
> __this_cpu_sub()
> __this_cpu_sub_return()
> this_cpu_sub_return()
>
> All percpu_test.ko passes, especially the following cases which
> previously failed:
>
> l -= ui_one;
> __this_cpu_sub(long_counter, ui_one);
> CHECK(l, long_counter, -1);
>
> l -= ui_one;
> this_cpu_sub(long_counter, ui_one);
> CHECK(l, long_counter, -1);
> CHECK(l, long_counter, 0xffffffffffffffff);
>
> ul -= ui_one;
> __this_cpu_sub(ulong_counter, ui_one);
> CHECK(ul, ulong_counter, -1);
> CHECK(ul, ulong_counter, 0xffffffffffffffff);
>
> ul = this_cpu_sub_return(ulong_counter, ui_one);
> CHECK(ul, ulong_counter, 2);
>
> ul = __this_cpu_sub_return(ulong_counter, ui_one);
> CHECK(ul, ulong_counter, 1);
>
> Signed-off-by: Greg Thelen <[email protected]>
> Acked-by: Tejun Heo <[email protected]>
FWIW:
Acked-by: Johannes Weiner <[email protected]>
On Sun, Oct 27, 2013 at 10:30:17AM -0700, Greg Thelen wrote:
> As of v3.11-9444-g3ea67d0 "memcg: add per cgroup writeback pages
> accounting" memcg counter errors are possible when moving charged
> memory to a different memcg. Charge movement occurs when processing
> writes to memory.force_empty, moving tasks to a memcg with
> memory.move_charge_at_immigrate=1, or memcg deletion. An example
> showing the error after memory.force_empty:
> $ cd /sys/fs/cgroup/memory
> $ mkdir x
> $ rm /data/tmp/file
> $ (echo $BASHPID >> x/tasks && exec mmap_writer /data/tmp/file 1M) &
> [1] 13600
> $ grep ^mapped x/memory.stat
> mapped_file 1048576
> $ echo 13600 > tasks
> $ echo 1 > x/memory.force_empty
> $ grep ^mapped x/memory.stat
> mapped_file 4503599627370496
>
> mapped_file should end with 0.
> 4503599627370496 == 0x10,0000,0000,0000 == 0x100,0000,0000 pages
> 1048576 == 0x10,0000 == 0x100 pages
>
> This issue only affects the source memcg on 64 bit machines; the
> destination memcg counters are correct. So the rmdir case is not too
> important because such counters soon disappear with the entire
> memcg. But the memory.force_empty and
> memory.move_charge_at_immigrate=1 cases are larger problems, as the
> bogus counters are visible for the (possibly long) remaining life of
> the source memcg.
>
> The problem is due to memcg's use of __this_cpu_add(.., -nr_pages),
> which is subtly wrong because it adds the negated unsigned int nr_pages
> (1, or 512 for THP) to a signed long percpu counter. When nr_pages=1,
> -nr_pages evaluates to 0xffffffff as an unsigned int. On 64 bit machines
> stat->count[idx] is a signed 64 bit long, so memcg's attempt to simply
> decrement a count (e.g. from 1 to 0) boils down to:
> long count = 1
> unsigned int nr_pages = 1
> count += -nr_pages /* -nr_pages == 0xffff,ffff */
> count is now 0x1,0000,0000 instead of 0
>
> The fix is to subtract the unsigned page count rather than adding its
> negation. This only works once "percpu: fix this_cpu_sub() subtrahend
> casting for unsigneds" is applied to fix this_cpu_sub().
>
> Signed-off-by: Greg Thelen <[email protected]>
> Acked-by: Tejun Heo <[email protected]>
Huh, it looked so innocent... At first I thought 2/3 would fix this
case as well but the cast happens only after the negation, so the sign
extension does not happen. Alright, then.
Acked-by: Johannes Weiner <[email protected]>