percpu_pagelist_fraction_sysctl_handler calls proc_dointvec_minmax
and blindly assumes that return value of 0 means success.
In fact the other valid case is when it got a zero length input.
After that it proceeds to a division by percpu_pagelist_fraction
value which is conveniently set to a default of zero, resulting in
division by zero.
Other than checking the bytecount to be more than zero, perhaps
a better default value for percpu_pagelist_fraction would help too.
[ 661.985469] divide error: 0000 [#1] SMP DEBUG_PAGEALLOC
[ 661.985868] Modules linked in: binfmt_misc cfg80211 rfkill rpcsec_gss_krb5 ttm drm_kms_helper drm i2c_piix4 microcode i2c_core joydev serio_raw pcspkr virtio_blk nfsd
[ 661.986008] CPU: 1 PID: 9142 Comm: badarea_io Not tainted 3.15.0-rc2-vm-nfs+ #19
[ 661.986008] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[ 661.986008] task: ffff8800d5aeb6e0 ti: ffff8800d87a2000 task.ti: ffff8800d87a2000
[ 661.986008] RIP: 0010:[<ffffffff81152664>] [<ffffffff81152664>] percpu_pagelist_fraction_sysctl_handler+0x84/0x120
[ 661.988031] RSP: 0018:ffff8800d87a3e78 EFLAGS: 00010246
[ 661.988031] RAX: 0000000000000f89 RBX: ffff88011f7fd000 RCX: 0000000000000000
[ 661.988031] RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000010
[ 661.988031] RBP: ffff8800d87a3e98 R08: ffffffff81d002c8 R09: ffff8800d87a3f50
[ 661.988031] R10: 000000000000000b R11: 0000000000000246 R12: 0000000000000060
[ 661.988031] R13: ffffffff81c3c3e0 R14: ffffffff81cfddf8 R15: ffff8801193b0800
[ 661.988031] FS: 00007f614f1e9740(0000) GS:ffff88011f440000(0000) knlGS:0000000000000000
[ 661.988031] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 661.988031] CR2: 00007f614f1fa000 CR3: 00000000d9291000 CR4: 00000000000006e0
[ 661.988031] Stack:
[ 661.988031] 0000000000000001 ffffffffffffffea ffffffff81c3c3e0 0000000000000000
[ 661.988031] ffff8800d87a3ee8 ffffffff8122b163 ffff8800d87a3f50 00007fff1564969c
[ 661.988031] 0000000000000000 ffff8800d8098f00 00007fff1564969c ffff8800d87a3f50
[ 661.988031] Call Trace:
[ 661.988031] [<ffffffff8122b163>] proc_sys_call_handler+0xb3/0xc0
[ 661.988031] [<ffffffff8122b184>] proc_sys_write+0x14/0x20
[ 661.988031] [<ffffffff811ba93a>] vfs_write+0xba/0x1e0
[ 661.988031] [<ffffffff811bb486>] SyS_write+0x46/0xb0
[ 661.988031] [<ffffffff816db7ff>] tracesys+0xe1/0xe6
[ 661.988031] Code: 1f 84 00 00 00 00 00 48 83 bb b0 06 00 00 00 0f 84 7c 00 00 00 48 63 0d 93 6a e1 00 48 8b 83 b8 06 00 00 31 d2 41 bc 60 00 00 00 <48> f7 f1 ba 01 00 00 00 49 89 c5 48 c1 e8 02 48 85 c0 48 0f 44
Signed-off-by: Oleg Drokin <[email protected]>
CC: Rohit Seth <[email protected]>
---
mm/page_alloc.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5dba293..91d0265 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5854,7 +5854,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
int ret;
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
- if (!write || (ret < 0))
+ if (!write || (ret < 0) || !*length)
return ret;
mutex_lock(&pcp_batch_high_lock);
--
1.9.0
On Sat, 3 May 2014, Oleg Drokin wrote:
> percpu_pagelist_fraction_sysctl_handler calls proc_dointvec_minmax
> and blindly assumes that return value of 0 means success.
> In fact the other valid case is when it got a zero length input.
>
> After that it proceeds to a division by percpu_pagelist_fraction
> value which is conveniently set to a default of zero, resulting in
> division by zero.
>
> Other than checking the bytecount to be more than zero, perhaps
> a better default value for percpu_pagelist_fraction would help too.
>
> [ 661.985469] divide error: 0000 [#1] SMP DEBUG_PAGEALLOC
> [ 661.985868] Modules linked in: binfmt_misc cfg80211 rfkill rpcsec_gss_krb5 ttm drm_kms_helper drm i2c_piix4 microcode i2c_core joydev serio_raw pcspkr virtio_blk nfsd
> [ 661.986008] CPU: 1 PID: 9142 Comm: badarea_io Not tainted 3.15.0-rc2-vm-nfs+ #19
> [ 661.986008] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
> [ 661.986008] task: ffff8800d5aeb6e0 ti: ffff8800d87a2000 task.ti: ffff8800d87a2000
> [ 661.986008] RIP: 0010:[<ffffffff81152664>] [<ffffffff81152664>] percpu_pagelist_fraction_sysctl_handler+0x84/0x120
> [ 661.988031] RSP: 0018:ffff8800d87a3e78 EFLAGS: 00010246
> [ 661.988031] RAX: 0000000000000f89 RBX: ffff88011f7fd000 RCX: 0000000000000000
> [ 661.988031] RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000010
> [ 661.988031] RBP: ffff8800d87a3e98 R08: ffffffff81d002c8 R09: ffff8800d87a3f50
> [ 661.988031] R10: 000000000000000b R11: 0000000000000246 R12: 0000000000000060
> [ 661.988031] R13: ffffffff81c3c3e0 R14: ffffffff81cfddf8 R15: ffff8801193b0800
> [ 661.988031] FS: 00007f614f1e9740(0000) GS:ffff88011f440000(0000) knlGS:0000000000000000
> [ 661.988031] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> [ 661.988031] CR2: 00007f614f1fa000 CR3: 00000000d9291000 CR4: 00000000000006e0
> [ 661.988031] Stack:
> [ 661.988031] 0000000000000001 ffffffffffffffea ffffffff81c3c3e0 0000000000000000
> [ 661.988031] ffff8800d87a3ee8 ffffffff8122b163 ffff8800d87a3f50 00007fff1564969c
> [ 661.988031] 0000000000000000 ffff8800d8098f00 00007fff1564969c ffff8800d87a3f50
> [ 661.988031] Call Trace:
> [ 661.988031] [<ffffffff8122b163>] proc_sys_call_handler+0xb3/0xc0
> [ 661.988031] [<ffffffff8122b184>] proc_sys_write+0x14/0x20
> [ 661.988031] [<ffffffff811ba93a>] vfs_write+0xba/0x1e0
> [ 661.988031] [<ffffffff811bb486>] SyS_write+0x46/0xb0
> [ 661.988031] [<ffffffff816db7ff>] tracesys+0xe1/0xe6
> [ 661.988031] Code: 1f 84 00 00 00 00 00 48 83 bb b0 06 00 00 00 0f 84 7c 00 00 00 48 63 0d 93 6a e1 00 48 8b 83 b8 06 00 00 31 d2 41 bc 60 00 00 00 <48> f7 f1 ba 01 00 00 00 49 89 c5 48 c1 e8 02 48 85 c0 48 0f 44
>
> Signed-off-by: Oleg Drokin <[email protected]>
> CC: Rohit Seth <[email protected]>
> ---
> mm/page_alloc.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 5dba293..91d0265 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5854,7 +5854,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
> int ret;
>
> ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
> - if (!write || (ret < 0))
> + if (!write || (ret < 0) || !*length)
> return ret;
>
> mutex_lock(&pcp_batch_high_lock);
This hasn't made it to linux-next yet (probably because you didn't cc
Andrew Morton, the mm maintainer), but I'm wondering why it's needed.
Shouldn't this value always be >= min_percpu_pagelist_fract?
If there's something going on in proc_dointvec_minmax() that disregards
that minimum then we need to fix it rather than the caller.
Hello!
On Jun 2, 2014, at 9:40 PM, David Rientjes wrote:
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 5dba293..91d0265 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -5854,7 +5854,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
>> int ret;
>>
>> ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
>> - if (!write || (ret < 0))
>> + if (!write || (ret < 0) || !*length)
>> return ret;
>>
>> mutex_lock(&pcp_batch_high_lock);
> This hasn't made it to linux-next yet (probably because you didn't cc
> Andrew Morton, the mm maintainer), but I'm wondering why it's needed.
> Shouldn't this value always be >= min_percpu_pagelist_fract?
>
> If there's something going on in proc_dointvec_minmax() that disregards
> that minimum then we need to fix it rather than the caller.
It's needed because at the very beginning of proc_dointvec_minmax it checks if the length is 0 and if it is, immediately returns success because there's nothing to be done
from its perspective.
And since percpu_pagelist_fraction is 0 from the very beginning, next step is division by this value.
If length is not 0 on the other hand, then there's some value proc_dointvec_minmax can interpret and the min and max checks happen and everything works correctly.
This is kind of hard to fix in proc_dointvec_minmax because there's no passed in value to check against, I think. Sure, you can also check the passed in pointer to value
to make sure it's within range, but that goes beyond the scope of the function.
You might as well just assign a sane value to percpu_pagelist_fraction, which has an added benefit of when you cat that /proc file, you'll get the real value of what the kernel uses instead of 0.
Bye,
Oleg-
On Mon, 2 Jun 2014, Oleg Drokin wrote:
> It's needed because at the very beginning of proc_dointvec_minmax it checks if the length is 0 and if it is, immediately returns success because there's nothing to be done
> from its perspective.
> And since percpu_pagelist_fraction is 0 from the very beginning, next step is division by this value.
>
> If length is not 0 on the other hand, then there's some value proc_dointvec_minmax can interpret and the min and max checks happen and everything works correctly.
>
> This is kind of hard to fix in proc_dointvec_minmax because there's no passed in value to check against, I think. Sure, you can also check the passed in pointer to value
> to make sure it's within range, but that goes beyond the scope of the function.
> You might as well just assign a sane value to percpu_pagelist_fraction, which has an added benefit of when you cat that /proc file, you'll get the real value of what the kernel uses instead of 0.
>
I'm pretty sure we want to allow users to restore the kernel default
behavior if they've already written to this file and now want to change it
back.
What do you think about doing it like this instead?
---
Documentation/sysctl/vm.txt | 3 ++-
kernel/sysctl.c | 3 +--
mm/page_alloc.c | 20 ++++++++++++--------
3 files changed, 15 insertions(+), 11 deletions(-)
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -702,7 +702,8 @@ The batch value of each per cpu pagelist is also updated as a result. It is
set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8)
The initial value is zero. Kernel does not use this value at boot time to set
-the high water marks for each per cpu page list.
+the high water marks for each per cpu page list. If the user writes '0' to this
+sysctl, it will revert to this default behavior.
==============================================================
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -136,7 +136,6 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
static int maxolduid = 65535;
static int minolduid;
-static int min_percpu_pagelist_fract = 8;
static int ngroups_max = NGROUPS_MAX;
static const int cap_last_cap = CAP_LAST_CAP;
@@ -1305,7 +1304,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(percpu_pagelist_fraction),
.mode = 0644,
.proc_handler = percpu_pagelist_fraction_sysctl_handler,
- .extra1 = &min_percpu_pagelist_fract,
+ .extra1 = &zero,
},
#ifdef CONFIG_MMU
{
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -69,6 +69,7 @@
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
+#define MIN_PERCPU_PAGELIST_FRAC (8)
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
@@ -4223,8 +4224,8 @@ static void pageset_set_high(struct per_cpu_pageset *p,
pageset_update(&p->pcp, high, batch);
}
-static void __meminit pageset_set_high_and_batch(struct zone *zone,
- struct per_cpu_pageset *pcp)
+static void pageset_set_high_and_batch(struct zone *zone,
+ struct per_cpu_pageset *pcp)
{
if (percpu_pagelist_fraction)
pageset_set_high(pcp,
@@ -5850,20 +5851,23 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
- unsigned int cpu;
int ret;
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
- if (!write || (ret < 0))
+ if (!write || ret < 0)
return ret;
+ if (percpu_pagelist_fraction &&
+ percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRAC)
+ percpu_pagelist_fraction = MIN_PERCPU_PAGELIST_FRAC;
+
mutex_lock(&pcp_batch_high_lock);
for_each_populated_zone(zone) {
- unsigned long high;
- high = zone->managed_pages / percpu_pagelist_fraction;
+ unsigned int cpu;
+
for_each_possible_cpu(cpu)
- pageset_set_high(per_cpu_ptr(zone->pageset, cpu),
- high);
+ pageset_set_high_and_batch(zone,
+ per_cpu_ptr(zone->pageset, cpu));
}
mutex_unlock(&pcp_batch_high_lock);
return 0;
Hello!
On Jun 2, 2014, at 11:57 PM, David Rientjes wrote:
> I'm pretty sure we want to allow users to restore the kernel default
> behavior if they've already written to this file and now want to change it
> back.
>
> What do you think about doing it like this instead?
> ---
> Documentation/sysctl/vm.txt | 3 ++-
> kernel/sysctl.c | 3 +--
> mm/page_alloc.c | 20 ++++++++++++--------
> 3 files changed, 15 insertions(+), 11 deletions(-)
>
> diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
> --- a/Documentation/sysctl/vm.txt
> +++ b/Documentation/sysctl/vm.txt
> @@ -702,7 +702,8 @@ The batch value of each per cpu pagelist is also updated as a result. It is
> set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8)
>
> The initial value is zero. Kernel does not use this value at boot time to set
> -the high water marks for each per cpu page list.
> +the high water marks for each per cpu page list. If the user writes '0' to this
> +sysctl, it will revert to this default behavior.
I think this is probably a better idea indeed.
Always good to let users return back to defaults too.
Thanks!
If the percpu_pagelist_fraction sysctl is set by the user, it is
impossible to restore it to the kernel default since the user cannot
write 0 to the sysctl.
This patch allows the user to write 0 to restore the default behavior.
It still requires a fraction equal to or larger than 8, however, as stated
by the documentation for sanity. If a value in the range [1, 7] is
written, the sysctl will return EINVAL.
This also fixes a division by zero identified by Oleg that occurs if a
write() occurs with zero length and the value hasn't been changed before
(so that percpu_pagelist_fraction is still 0).
Reported-by: Oleg Drokin <[email protected]>
Cc: [email protected]
Signed-off-by: David Rientjes <[email protected]>
---
Documentation/sysctl/vm.txt | 3 ++-
kernel/sysctl.c | 3 +--
mm/page_alloc.c | 30 +++++++++++++++++++++---------
3 files changed, 24 insertions(+), 12 deletions(-)
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -702,7 +702,8 @@ The batch value of each per cpu pagelist is also updated as a result. It is
set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8)
The initial value is zero. Kernel does not use this value at boot time to set
-the high water marks for each per cpu page list.
+the high water marks for each per cpu page list. If the user writes '0' to this
+sysctl, it will revert to this default behavior.
==============================================================
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -136,7 +136,6 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
static int maxolduid = 65535;
static int minolduid;
-static int min_percpu_pagelist_fract = 8;
static int ngroups_max = NGROUPS_MAX;
static const int cap_last_cap = CAP_LAST_CAP;
@@ -1305,7 +1304,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(percpu_pagelist_fraction),
.mode = 0644,
.proc_handler = percpu_pagelist_fraction_sysctl_handler,
- .extra1 = &min_percpu_pagelist_fract,
+ .extra1 = &zero,
},
#ifdef CONFIG_MMU
{
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -69,6 +69,7 @@
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
+#define MIN_PERCPU_PAGELIST_FRACTION (8)
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
@@ -4107,7 +4108,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
#endif
-static int __meminit zone_batchsize(struct zone *zone)
+static int zone_batchsize(struct zone *zone)
{
#ifdef CONFIG_MMU
int batch;
@@ -4223,8 +4224,8 @@ static void pageset_set_high(struct per_cpu_pageset *p,
pageset_update(&p->pcp, high, batch);
}
-static void __meminit pageset_set_high_and_batch(struct zone *zone,
- struct per_cpu_pageset *pcp)
+static void pageset_set_high_and_batch(struct zone *zone,
+ struct per_cpu_pageset *pcp)
{
if (percpu_pagelist_fraction)
pageset_set_high(pcp,
@@ -5849,21 +5850,32 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
+ const int old_percpu_pagelist_fraction = percpu_pagelist_fraction;
struct zone *zone;
- unsigned int cpu;
int ret;
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
- if (!write || (ret < 0))
+ if (!write || ret < 0)
return ret;
+ /* Sanity checking to avoid pcp imbalance */
+ if (percpu_pagelist_fraction &&
+ percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
+ percpu_pagelist_fraction = old_percpu_pagelist_fraction;
+ return -EINVAL;
+ }
+
+ /* No change? */
+ if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
+ return 0;
+
mutex_lock(&pcp_batch_high_lock);
for_each_populated_zone(zone) {
- unsigned long high;
- high = zone->managed_pages / percpu_pagelist_fraction;
+ unsigned int cpu;
+
for_each_possible_cpu(cpu)
- pageset_set_high(per_cpu_ptr(zone->pageset, cpu),
- high);
+ pageset_set_high_and_batch(zone,
+ per_cpu_ptr(zone->pageset, cpu));
}
mutex_unlock(&pcp_batch_high_lock);
return 0;
Hello!
On Jun 3, 2014, at 9:22 PM, David Rientjes wrote:
>
> @@ -5849,21 +5850,32 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
> int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
> void __user *buffer, size_t *length, loff_t *ppos)
> {
> + const int old_percpu_pagelist_fraction = percpu_pagelist_fraction;
> struct zone *zone;
> - unsigned int cpu;
> int ret;
>
> ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
> - if (!write || (ret < 0))
> + if (!write || ret < 0)
> return ret;
>
> + /* Sanity checking to avoid pcp imbalance */
> + if (percpu_pagelist_fraction &&
> + percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
> + percpu_pagelist_fraction = old_percpu_pagelist_fraction;
> + return -EINVAL;
> + }
> +
> + /* No change? */
> + if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
> + return 0;
> +
> mutex_lock(&pcp_batch_high_lock);
> for_each_populated_zone(zone) {
> - unsigned long high;
> - high = zone->managed_pages / percpu_pagelist_fraction;
> + unsigned int cpu;
> +
> for_each_possible_cpu(cpu)
> - pageset_set_high(per_cpu_ptr(zone->pageset, cpu),
> - high);
> + pageset_set_high_and_batch(zone,
> + per_cpu_ptr(zone->pageset, cpu));
BTW, I just realized this version is racy (as was the previous one).
A parallel writer could write a value of 0 while we are in the middle of pageset_set_high_and_batch
and it's possible that'll result in division by zero still.
Also it's possible an incorrect value might be set for some of the zones.
I imagine we might want to expand the lock area all the way up to before the proc_dointvec_minmax call just to be extra safe.
> }
> mutex_unlock(&pcp_batch_high_lock);
> return 0;
Bye,
Oleg-
If the percpu_pagelist_fraction sysctl is set by the user, it is impossible to
restore it to the kernel default since the user cannot write 0 to the sysctl.
This patch allows the user to write 0 to restore the default behavior. It
still requires a fraction equal to or larger than 8, however, as stated by the
documentation for sanity. If a value in the range [1, 7] is written, the sysctl
will return EINVAL.
Reported-by: Oleg Drokin <[email protected]>
Signed-off-by: David Rientjes <[email protected]>
---
Documentation/sysctl/vm.txt | 3 ++-
kernel/sysctl.c | 3 +--
mm/page_alloc.c | 41 +++++++++++++++++++++++++++++------------
3 files changed, 32 insertions(+), 15 deletions(-)
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -702,7 +702,8 @@ The batch value of each per cpu pagelist is also updated as a result. It is
set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8)
The initial value is zero. Kernel does not use this value at boot time to set
-the high water marks for each per cpu page list.
+the high water marks for each per cpu page list. If the user writes '0' to this
+sysctl, it will revert to this default behavior.
==============================================================
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -136,7 +136,6 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
static int maxolduid = 65535;
static int minolduid;
-static int min_percpu_pagelist_fract = 8;
static int ngroups_max = NGROUPS_MAX;
static const int cap_last_cap = CAP_LAST_CAP;
@@ -1305,7 +1304,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(percpu_pagelist_fraction),
.mode = 0644,
.proc_handler = percpu_pagelist_fraction_sysctl_handler,
- .extra1 = &min_percpu_pagelist_fract,
+ .extra1 = &zero,
},
#ifdef CONFIG_MMU
{
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -69,6 +69,7 @@
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
+#define MIN_PERCPU_PAGELIST_FRACTION (8)
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
@@ -4107,7 +4108,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
#endif
-static int __meminit zone_batchsize(struct zone *zone)
+static int zone_batchsize(struct zone *zone)
{
#ifdef CONFIG_MMU
int batch;
@@ -4223,8 +4224,8 @@ static void pageset_set_high(struct per_cpu_pageset *p,
pageset_update(&p->pcp, high, batch);
}
-static void __meminit pageset_set_high_and_batch(struct zone *zone,
- struct per_cpu_pageset *pcp)
+static void pageset_set_high_and_batch(struct zone *zone,
+ struct per_cpu_pageset *pcp)
{
if (percpu_pagelist_fraction)
pageset_set_high(pcp,
@@ -5850,23 +5851,39 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
- unsigned int cpu;
+ int old_percpu_pagelist_fraction;
int ret;
+ mutex_lock(&pcp_batch_high_lock);
+ old_percpu_pagelist_fraction = percpu_pagelist_fraction;
+
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
- if (!write || (ret < 0))
- return ret;
+ if (!write || ret < 0)
+ goto out;
+
+ /* Sanity checking to avoid pcp imbalance */
+ if (percpu_pagelist_fraction &&
+ percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
+ percpu_pagelist_fraction = old_percpu_pagelist_fraction;
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = 0;
+ /* No change? */
+ if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
+ goto out;
- mutex_lock(&pcp_batch_high_lock);
for_each_populated_zone(zone) {
- unsigned long high;
- high = zone->managed_pages / percpu_pagelist_fraction;
+ unsigned int cpu;
+
for_each_possible_cpu(cpu)
- pageset_set_high(per_cpu_ptr(zone->pageset, cpu),
- high);
+ pageset_set_high_and_batch(zone,
+ per_cpu_ptr(zone->pageset, cpu));
}
+out:
mutex_unlock(&pcp_batch_high_lock);
- return 0;
+ return ret;
}
int hashdist = HASHDIST_DEFAULT;
Hello!
On Jun 4, 2014, at 8:34 PM, David Rientjes wrote:
> @@ -5850,23 +5851,39 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
> void __user *buffer, size_t *length, loff_t *ppos)
> {
> struct zone *zone;
> - unsigned int cpu;
> + int old_percpu_pagelist_fraction;
> int ret;
>
> + mutex_lock(&pcp_batch_high_lock);
> + old_percpu_pagelist_fraction = percpu_pagelist_fraction;
> +
> ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
> - if (!write || (ret < 0))
> - return ret;
> + if (!write || ret < 0)
> + goto out;
> +
> + /* Sanity checking to avoid pcp imbalance */
> + if (percpu_pagelist_fraction &&
> + percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
> + percpu_pagelist_fraction = old_percpu_pagelist_fraction;
> + ret = -EINVAL;
> + goto out;
> + }
> +
> + ret = 0;
Minor nitpick I guess, but ret cannot be anything but 0 here I think (until somebody changes the way proc_dointvec_minmax for write=true operates)?
The patch is good otherwise.
Thanks.-
On Wed, 4 Jun 2014, Oleg Drokin wrote:
> Minor nitpick I guess, but ret cannot be anything but 0 here I think (until somebody changes the way proc_dointvec_minmax for write=true operates)?
>
We need to return 0 regardless of whether proc_dointvec_minmax() changes
in the future, the patch is correct.
Oleg reports a division by zero error on zero-length write() to the
percpu_pagelist_fraction sysctl:
divide error: 0000 [#1] SMP DEBUG_PAGEALLOC
CPU: 1 PID: 9142 Comm: badarea_io Not tainted 3.15.0-rc2-vm-nfs+ #19
Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
task: ffff8800d5aeb6e0 ti: ffff8800d87a2000 task.ti: ffff8800d87a2000
RIP: 0010:[<ffffffff81152664>] [<ffffffff81152664>] percpu_pagelist_fraction_sysctl_handler+0x84/0x120
RSP: 0018:ffff8800d87a3e78 EFLAGS: 00010246
RAX: 0000000000000f89 RBX: ffff88011f7fd000 RCX: 0000000000000000
RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000010
RBP: ffff8800d87a3e98 R08: ffffffff81d002c8 R09: ffff8800d87a3f50
R10: 000000000000000b R11: 0000000000000246 R12: 0000000000000060
R13: ffffffff81c3c3e0 R14: ffffffff81cfddf8 R15: ffff8801193b0800
FS: 00007f614f1e9740(0000) GS:ffff88011f440000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 00007f614f1fa000 CR3: 00000000d9291000 CR4: 00000000000006e0
Stack:
0000000000000001 ffffffffffffffea ffffffff81c3c3e0 0000000000000000
ffff8800d87a3ee8 ffffffff8122b163 ffff8800d87a3f50 00007fff1564969c
0000000000000000 ffff8800d8098f00 00007fff1564969c ffff8800d87a3f50
Call Trace:
[<ffffffff8122b163>] proc_sys_call_handler+0xb3/0xc0
[<ffffffff8122b184>] proc_sys_write+0x14/0x20
[<ffffffff811ba93a>] vfs_write+0xba/0x1e0
[<ffffffff811bb486>] SyS_write+0x46/0xb0
[<ffffffff816db7ff>] tracesys+0xe1/0xe6
However, if the percpu_pagelist_fraction sysctl is set by the user, it is also
impossible to restore it to the kernel default since the user cannot write 0 to
the sysctl.
This patch allows the user to write 0 to restore the default behavior. It
still requires a fraction equal to or larger than 8, however, as stated by the
documentation for sanity. If a value in the range [1, 7] is written, the sysctl
will return EINVAL.
This successfully solves the divide by zero issue at the same time.
Reported-by: Oleg Drokin <[email protected]>
Cc: [email protected]
Signed-off-by: David Rientjes <[email protected]>
---
v3: remove needless ret = 0 assignment per Oleg
rewrote changelog
added [email protected]
Documentation/sysctl/vm.txt | 3 ++-
kernel/sysctl.c | 3 +--
mm/page_alloc.c | 40 ++++++++++++++++++++++++++++------------
3 files changed, 31 insertions(+), 15 deletions(-)
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -702,7 +702,8 @@ The batch value of each per cpu pagelist is also updated as a result. It is
set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8)
The initial value is zero. Kernel does not use this value at boot time to set
-the high water marks for each per cpu page list.
+the high water marks for each per cpu page list. If the user writes '0' to this
+sysctl, it will revert to this default behavior.
==============================================================
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -136,7 +136,6 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
static int maxolduid = 65535;
static int minolduid;
-static int min_percpu_pagelist_fract = 8;
static int ngroups_max = NGROUPS_MAX;
static const int cap_last_cap = CAP_LAST_CAP;
@@ -1328,7 +1327,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(percpu_pagelist_fraction),
.mode = 0644,
.proc_handler = percpu_pagelist_fraction_sysctl_handler,
- .extra1 = &min_percpu_pagelist_fract,
+ .extra1 = &zero,
},
#ifdef CONFIG_MMU
{
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -69,6 +69,7 @@
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
+#define MIN_PERCPU_PAGELIST_FRACTION (8)
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
@@ -4145,7 +4146,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
#endif
-static int __meminit zone_batchsize(struct zone *zone)
+static int zone_batchsize(struct zone *zone)
{
#ifdef CONFIG_MMU
int batch;
@@ -4261,8 +4262,8 @@ static void pageset_set_high(struct per_cpu_pageset *p,
pageset_update(&p->pcp, high, batch);
}
-static void __meminit pageset_set_high_and_batch(struct zone *zone,
- struct per_cpu_pageset *pcp)
+static void pageset_set_high_and_batch(struct zone *zone,
+ struct per_cpu_pageset *pcp)
{
if (percpu_pagelist_fraction)
pageset_set_high(pcp,
@@ -5881,23 +5882,38 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
- unsigned int cpu;
+ int old_percpu_pagelist_fraction;
int ret;
+ mutex_lock(&pcp_batch_high_lock);
+ old_percpu_pagelist_fraction = percpu_pagelist_fraction;
+
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
- if (!write || (ret < 0))
- return ret;
+ if (!write || ret < 0)
+ goto out;
+
+ /* Sanity checking to avoid pcp imbalance */
+ if (percpu_pagelist_fraction &&
+ percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
+ percpu_pagelist_fraction = old_percpu_pagelist_fraction;
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* No change? */
+ if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
+ goto out;
- mutex_lock(&pcp_batch_high_lock);
for_each_populated_zone(zone) {
- unsigned long high;
- high = zone->managed_pages / percpu_pagelist_fraction;
+ unsigned int cpu;
+
for_each_possible_cpu(cpu)
- pageset_set_high(per_cpu_ptr(zone->pageset, cpu),
- high);
+ pageset_set_high_and_batch(zone,
+ per_cpu_ptr(zone->pageset, cpu));
}
+out:
mutex_unlock(&pcp_batch_high_lock);
- return 0;
+ return ret;
}
int hashdist = HASHDIST_DEFAULT;