The same calculation is currently done in three different places.
Factor that code out so that future changes have to be made in only one place.
Signed-off-by: Jerome Marchand <[email protected]>
---
fs/proc/meminfo.c | 5 +----
include/linux/mman.h | 12 ++++++++++++
mm/mmap.c | 4 +---
mm/nommu.c | 3 +--
4 files changed, 15 insertions(+), 9 deletions(-)
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 5aa847a..6f7767d 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -24,7 +24,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
{
struct sysinfo i;
unsigned long committed;
- unsigned long allowed;
struct vmalloc_info vmi;
long cached;
unsigned long pages[NR_LRU_LISTS];
@@ -37,8 +36,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
si_meminfo(&i);
si_swapinfo(&i);
committed = percpu_counter_read_positive(&vm_committed_as);
- allowed = ((totalram_pages - hugetlb_total_pages())
- * sysctl_overcommit_ratio / 100) + total_swap_pages;
cached = global_page_state(NR_FILE_PAGES) -
total_swapcache_pages() - i.bufferram;
@@ -153,7 +150,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
K(global_page_state(NR_UNSTABLE_NFS)),
K(global_page_state(NR_BOUNCE)),
K(global_page_state(NR_WRITEBACK_TEMP)),
- K(allowed),
+ K(vm_commit_limit()),
K(committed),
(unsigned long)VMALLOC_TOTAL >> 10,
vmi.used >> 10,
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 92dc257..d622d34 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -7,6 +7,9 @@
#include <linux/atomic.h>
#include <uapi/linux/mman.h>
+#include <linux/hugetlb.h>
+#include <linux/swap.h>
+
extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
extern struct percpu_counter vm_committed_as;
@@ -87,4 +90,13 @@ calc_vm_flag_bits(unsigned long flags)
_calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) |
_calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED );
}
+
+/*
+ * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
+ */
+static inline unsigned long vm_commit_limit()
+{
+ return ((totalram_pages - hugetlb_total_pages())
+ * sysctl_overcommit_ratio / 100) + total_swap_pages;
+}
#endif /* _LINUX_MMAN_H */
diff --git a/mm/mmap.c b/mm/mmap.c
index 1edbaa3..06c98f8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -179,14 +179,12 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
goto error;
}
- allowed = (totalram_pages - hugetlb_total_pages())
- * sysctl_overcommit_ratio / 100;
+ allowed = vm_commit_limit();
/*
* Reserve some for root
*/
if (!cap_sys_admin)
allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
- allowed += total_swap_pages;
/*
* Don't let a single process grow so big a user can't recover
diff --git a/mm/nommu.c b/mm/nommu.c
index ecd1f15..d8a957b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1948,13 +1948,12 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
goto error;
}
- allowed = totalram_pages * sysctl_overcommit_ratio / 100;
+ allowed = vm_commit_limit();
/*
* Reserve some 3% for root
*/
if (!cap_sys_admin)
allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
- allowed += total_swap_pages;
/*
* Don't let a single process grow so big a user can't recover
--
1.7.7.6
Some applications that run on HPC clusters are designed around the
availability of RAM and the overcommit ratio is fine tuned to get the
maximum usage of memory without swapping. With growing memory, the
1%-of-all-RAM grain provided by overcommit_ratio has become too coarse
for these workloads (on a 2TB machine it represents no less than
20GB).
This patch adds the new overcommit_kbytes sysctl variable, which allows a
much finer granularity.
Signed-off-by: Jerome Marchand <[email protected]>
---
Documentation/sysctl/vm.txt | 12 ++++++++++++
Documentation/vm/overcommit-accounting | 7 ++++---
include/linux/mm.h | 5 +++++
include/linux/mman.h | 13 +++++++++++--
kernel/sysctl.c | 10 +++++++++-
mm/mmap.c | 25 +++++++++++++++++++++++++
mm/nommu.c | 1 +
7 files changed, 67 insertions(+), 6 deletions(-)
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 36ecc26..a23aea1 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -47,6 +47,7 @@ Currently, these files are in /proc/sys/vm:
- numa_zonelist_order
- oom_dump_tasks
- oom_kill_allocating_task
+- overcommit_kbytes
- overcommit_memory
- overcommit_ratio
- page-cluster
@@ -561,6 +562,17 @@ The default value is 0.
==============================================================
+overcommit_kbytes:
+
+When overcommit_memory is set to 2, the committed address space is not
+permitted to exceed swap plus this amount of physical RAM. See below.
+
+Note: overcommit_kbytes is the counterpart of overcommit_ratio. Only one
+of them may be specified at a time. Setting one disables the other (which
+then appears as 0 when read).
+
+==============================================================
+
overcommit_memory:
This value contains a flag that enables memory overcommitment.
diff --git a/Documentation/vm/overcommit-accounting b/Documentation/vm/overcommit-accounting
index 8eaa2fc..cbfaaa6 100644
--- a/Documentation/vm/overcommit-accounting
+++ b/Documentation/vm/overcommit-accounting
@@ -14,8 +14,8 @@ The Linux kernel supports the following overcommit handling modes
2 - Don't overcommit. The total address space commit
for the system is not permitted to exceed swap + a
- configurable percentage (default is 50) of physical RAM.
- Depending on the percentage you use, in most situations
+ configurable amount (default is 50%) of physical RAM.
+ Depending on the amount you use, in most situations
this means a process will not be killed while accessing
pages but will receive errors on memory allocation as
appropriate.
@@ -26,7 +26,8 @@ The Linux kernel supports the following overcommit handling modes
The overcommit policy is set via the sysctl `vm.overcommit_memory'.
-The overcommit percentage is set via `vm.overcommit_ratio'.
+The overcommit amount can be set via `vm.overcommit_ratio' (percentage)
+or `vm.overcommit_kbytes' (absolute value).
The current overcommit limit and amount committed are viewable in
/proc/meminfo as CommitLimit and Committed_AS respectively.
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f022460..15f2b0c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -53,6 +53,11 @@ extern int sysctl_legacy_va_layout;
extern unsigned long sysctl_user_reserve_kbytes;
extern unsigned long sysctl_admin_reserve_kbytes;
+extern int overcommit_ratio_handler(struct ctl_table *, int, void __user *,
+ size_t *, loff_t *);
+extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *,
+ size_t *, loff_t *);
+
#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
/* to align the pointer to the (next) page boundary */
diff --git a/include/linux/mman.h b/include/linux/mman.h
index d622d34..debd0f9 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -12,6 +12,7 @@
extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
+extern unsigned long sysctl_overcommit_kbytes;
extern struct percpu_counter vm_committed_as;
#ifdef CONFIG_SMP
@@ -96,7 +97,15 @@ calc_vm_flag_bits(unsigned long flags)
*/
static inline unsigned long vm_commit_limit()
{
- return ((totalram_pages - hugetlb_total_pages())
- * sysctl_overcommit_ratio / 100) + total_swap_pages;
+ unsigned long allowed;
+
+ if (sysctl_overcommit_kbytes)
+ allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
+ else
+ allowed = ((totalram_pages - hugetlb_total_pages())
+ * sysctl_overcommit_ratio / 100);
+ allowed += total_swap_pages;
+
+ return allowed;
}
#endif /* _LINUX_MMAN_H */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 07f6fc4..e1968a4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -97,6 +97,7 @@
/* External variables not in a header file. */
extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
+extern unsigned long sysctl_overcommit_kbytes;
extern int max_threads;
extern int suid_dumpable;
#ifdef CONFIG_COREDUMP
@@ -1119,7 +1120,14 @@ static struct ctl_table vm_table[] = {
.data = &sysctl_overcommit_ratio,
.maxlen = sizeof(sysctl_overcommit_ratio),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = overcommit_ratio_handler,
+ },
+ {
+ .procname = "overcommit_kbytes",
+ .data = &sysctl_overcommit_kbytes,
+ .maxlen = sizeof(sysctl_overcommit_kbytes),
+ .mode = 0644,
+ .proc_handler = overcommit_kbytes_handler,
},
{
.procname = "page-cluster",
diff --git a/mm/mmap.c b/mm/mmap.c
index 06c98f8..c4a09a5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -86,6 +86,7 @@ EXPORT_SYMBOL(vm_get_page_prot);
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
+unsigned long sysctl_overcommit_kbytes __read_mostly = 0;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
@@ -95,6 +96,30 @@ unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
*/
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
+int overcommit_ratio_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (ret == 0 && write)
+ sysctl_overcommit_kbytes = 0;
+ return ret;
+}
+
+int overcommit_kbytes_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+ if (ret == 0 && write)
+ sysctl_overcommit_ratio = 0;
+ return ret;
+}
+
/*
* The global memory commitment made in the system can be a metric
* that can be used to drive ballooning decisions when Linux is hosted
diff --git a/mm/nommu.c b/mm/nommu.c
index d8a957b..3c4216a 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -60,6 +60,7 @@ unsigned long highest_memmap_pfn;
struct percpu_counter vm_committed_as;
int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio = 50; /* default is 50% */
+unsigned long sysctl_overcommit_kbytes __read_mostly = 0;
int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
--
1.7.7.6
On 08/19/2013 08:17 AM, Jerome Marchand wrote:
> Some applications that run on HPC clusters are designed around the
> availability of RAM and the overcommit ratio is fine tuned to get the
> maximum usage of memory without swapping. With growing memory, the
> 1%-of-all-RAM grain provided by overcommit_ratio has become too coarse
> for these workload (on a 2TB machine it represents no less than
> 20GB).
>
> This patch adds the new overcommit_kbytes sysctl variable that allow a
> much finer grain.
Instead of introducing yet another tunable, why don't we just make the
ratio that comes in from the user more fine-grained?
sysctl overcommit_ratio=0.2
We change the internal 'sysctl_overcommit_ratio' to store tenths or
hundredths of a percent (or whatever), then parse the input as two
integers. I don't think we need fully correct floating point parsing
and rounding here, so it shouldn't be too much of a chore. It'd
probably end up being less code than you have as it stands.
On 08/19/2013 06:55 PM, Dave Hansen wrote:
> On 08/19/2013 08:17 AM, Jerome Marchand wrote:
>> Some applications that run on HPC clusters are designed around the
>> availability of RAM and the overcommit ratio is fine tuned to get the
>> maximum usage of memory without swapping. With growing memory, the
>> 1%-of-all-RAM grain provided by overcommit_ratio has become too coarse
>> for these workload (on a 2TB machine it represents no less than
>> 20GB).
>>
>> This patch adds the new overcommit_kbytes sysctl variable that allow a
>> much finer grain.
>
> Instead of introducing yet another tunable, why don't we just make the
> ratio that comes in from the user more fine-grained?
>
> sysctl overcommit_ratio=0.2
>
> We change the internal 'sysctl_overcommit_ratio' to store tenths or
> hundreths of a percent (or whatever), then parse the input as two
> integers. I don't think we need fully correct floating point parsing
> and rounding here, so it shouldn't be too much of a chore. It'd
> probably end up being less code than you have as it stands.
>
Whatever works for me. I did it in that way to be more consistent with
what has already been done for dirty*_ratio/bytes.
Thanks,
Jerome
On 08/19/2013 06:55 PM, Dave Hansen wrote:
> On 08/19/2013 08:17 AM, Jerome Marchand wrote:
>> Some applications that run on HPC clusters are designed around the
>> availability of RAM and the overcommit ratio is fine tuned to get the
>> maximum usage of memory without swapping. With growing memory, the
>> 1%-of-all-RAM grain provided by overcommit_ratio has become too coarse
>> for these workload (on a 2TB machine it represents no less than
>> 20GB).
>>
>> This patch adds the new overcommit_kbytes sysctl variable that allow a
>> much finer grain.
>
> Instead of introducing yet another tunable, why don't we just make the
> ratio that comes in from the user more fine-grained?
>
> sysctl overcommit_ratio=0.2
>
> We change the internal 'sysctl_overcommit_ratio' to store tenths or
> hundreths of a percent (or whatever), then parse the input as two
> integers. I don't think we need fully correct floating point parsing
> and rounding here, so it shouldn't be too much of a chore. It'd
> probably end up being less code than you have as it stands.
>
Now that I think about it, that could break user space. Sure write access
wouldn't be a problem (one can still write a plain integer), but a script
that reads a fractional value when it expects an integer might not be able
to cope with it.
On 08/21/2013 08:22 AM, Jerome Marchand wrote:
>> > Instead of introducing yet another tunable, why don't we just make the
>> > ratio that comes in from the user more fine-grained?
>> >
>> > sysctl overcommit_ratio=0.2
>> >
>> > We change the internal 'sysctl_overcommit_ratio' to store tenths or
>> > hundreths of a percent (or whatever), then parse the input as two
>> > integers. I don't think we need fully correct floating point parsing
>> > and rounding here, so it shouldn't be too much of a chore. It'd
>> > probably end up being less code than you have as it stands.
>> >
> Now that I think about it, that could break user space. Sure write access
> wouldn't be a problem (one can still write a plain integer), but a script
> that reads a fractional value when it expects an integer might not be able
> to cope with it.
You're right. Something doing FOO=$(cat overcommit_ratio) and then
trying to do arithmetic would just fail loudly. But, it would probably
fail silently if we create another tunable that all of a sudden returns
0 (when the kernel is not _behaving_ like it is set to 0).
I'm not sure there's a good way out of this without breaking (or at
least confusing) _some_ old scripts/programs. Either way has ups and
downs.
The existing dirty_ratio/bytes stuff just annoys me because I end up
having to check two places whenever I go looking for it.
On 08/21/2013 06:23 PM, Dave Hansen wrote:
> On 08/21/2013 08:22 AM, Jerome Marchand wrote:
>>>> Instead of introducing yet another tunable, why don't we just make the
>>>> ratio that comes in from the user more fine-grained?
>>>>
>>>> sysctl overcommit_ratio=0.2
>>>>
>>>> We change the internal 'sysctl_overcommit_ratio' to store tenths or
>>>> hundreths of a percent (or whatever), then parse the input as two
>>>> integers. I don't think we need fully correct floating point parsing
>>>> and rounding here, so it shouldn't be too much of a chore. It'd
>>>> probably end up being less code than you have as it stands.
>>>>
>> Now that I think about it, that could break user space. Sure write access
>> wouldn't be a problem (one can still write a plain integer), but a script
>> that reads a fractional value when it expects an integer might not be able
>> to cope with it.
>
> You're right. Something doing FOO=$(cat overcommit_ratio) and then
> trying do do arithmetic would just fail loudly. But, it would probably
> fail silently if we create another tunable that all of a sudden returns
> 0 (when the kernel is not _behaving_ like it is set to 0).
>
> I'm not sure there's a good way out of this without breakage (or at
> least confusing) of _some_ old scripts/programs. Either way has ups and
> downs.
>
> The existing dirty_ratio/bytes stuff just annoys me because I end up
> having to check two places whenever I go looking for it.
>
Right. Then we could just use some overcommit_fine_ratio internally and
overcommit_ratio would show and set a rounded value. I doubt that a script
that reads 80% would notice the difference if it is actually 79.5%.
We could also use overcommit_kbytes internally, but then overcommit_ratio
would fluctuate if RAM is added/removed (e.g. memory hotplug or balloon
driver). That might be a problem.
Changes since v1:
- use overcommit_ratio_ppm instead of overcommit_kbytes
- keep both variables in sync
Some applications that run on HPC clusters are designed around the
availability of RAM and the overcommit ratio is fine tuned to get the
maximum usage of memory without swapping. With growing memory, the 1%
of all RAM grain provided by overcommit_ratio has become too coarse
for these workloads (on a 2TB machine it represents no less than
20GB).
This patch adds the new overcommit_ratio_ppm sysctl variable, which
allows setting the overcommit ratio with parts-per-million precision.
The old overcommit_ratio variable can still be used to set and read
the ratio with a 1% precision. That way, overcommit_ratio interface
isn't broken in any way that I can imagine.
Signed-off-by: Jerome Marchand <[email protected]>
---
include/linux/mman.h | 6 ++--
include/linux/sysctl.h | 2 +
kernel/sysctl.c | 63 +++++++++++++++++++++++++++++++++++++++++++++--
mm/mmap.c | 2 +-
mm/nommu.c | 2 +-
5 files changed, 67 insertions(+), 8 deletions(-)
diff --git a/include/linux/mman.h b/include/linux/mman.h
index d622d34..24f9c12 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -11,7 +11,7 @@
#include <linux/swap.h>
extern int sysctl_overcommit_memory;
-extern int sysctl_overcommit_ratio;
+extern int sysctl_overcommit_ratio_ppm;
extern struct percpu_counter vm_committed_as;
#ifdef CONFIG_SMP
@@ -96,7 +96,7 @@ calc_vm_flag_bits(unsigned long flags)
*/
static inline unsigned long vm_commit_limit()
{
- return ((totalram_pages - hugetlb_total_pages())
- * sysctl_overcommit_ratio / 100) + total_swap_pages;
+ return ((u64) (totalram_pages - hugetlb_total_pages())
+ * sysctl_overcommit_ratio_ppm / 1000000) + total_swap_pages;
}
#endif /* _LINUX_MMAN_H */
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 14a8ff2..2e2389c 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -51,6 +51,8 @@ extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
extern int proc_dointvec_ms_jiffies(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
+extern int proc_dointvec_percent_ppm(struct ctl_table *, int,
+ void __user *, size_t *, loff_t *);
extern int proc_doulongvec_minmax(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 07f6fc4..a4d2e37 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -96,7 +96,7 @@
/* External variables not in a header file. */
extern int sysctl_overcommit_memory;
-extern int sysctl_overcommit_ratio;
+extern int sysctl_overcommit_ratio_ppm;
extern int max_threads;
extern int suid_dumpable;
#ifdef CONFIG_COREDUMP
@@ -1116,8 +1116,15 @@ static struct ctl_table vm_table[] = {
},
{
.procname = "overcommit_ratio",
- .data = &sysctl_overcommit_ratio,
- .maxlen = sizeof(sysctl_overcommit_ratio),
+ .data = &sysctl_overcommit_ratio_ppm,
+ .maxlen = sizeof(sysctl_overcommit_ratio_ppm),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_percent_ppm,
+ },
+ {
+ .procname = "overcommit_ratio_ppm",
+ .data = &sysctl_overcommit_ratio_ppm,
+ .maxlen = sizeof(sysctl_overcommit_ratio_ppm),
.mode = 0644,
.proc_handler = proc_dointvec,
},
@@ -2433,6 +2440,56 @@ int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
do_proc_dointvec_ms_jiffies_conv, NULL);
}
+static int do_proc_dointvec_percent_ppm_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ unsigned long ppm = (*negp ? -*lvalp : *lvalp) * 10000;
+
+ if (ppm > INT_MAX)
+ return 1;
+ *valp = (int)ppm;
+ } else {
+ int val = *valp;
+ unsigned long lval;
+ if (val < 0) {
+ *negp = true;
+ lval = (unsigned long)-val;
+ } else {
+ *negp = false;
+ lval = (unsigned long)val;
+ }
+ *lvalp = lval / 10000;
+ if (lval % 10000 >= 5000)
+ (*lvalp)++;
+ }
+ return 0;
+}
+
+/**
+ * proc_dointvec_percent_ppm - read a vector of integers as percent and convert it to ppm
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ * @ppos: the current position in the file
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in percents, and are converted
+ * into parts per million.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_percent_ppm(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table, write, buffer, lenp, ppos,
+ do_proc_dointvec_percent_ppm_conv, NULL);
+}
+
static int proc_do_cad_pid(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
diff --git a/mm/mmap.c b/mm/mmap.c
index 06c98f8..bdec0e1 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -85,7 +85,7 @@ pgprot_t vm_get_page_prot(unsigned long vm_flags)
EXPORT_SYMBOL(vm_get_page_prot);
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
-int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
+int sysctl_overcommit_ratio_ppm __read_mostly = 500000; /* default is 50% */
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
diff --git a/mm/nommu.c b/mm/nommu.c
index d8a957b..cf10a9b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -59,7 +59,7 @@ unsigned long max_mapnr;
unsigned long highest_memmap_pfn;
struct percpu_counter vm_committed_as;
int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
-int sysctl_overcommit_ratio = 50; /* default is 50% */
+int sysctl_overcommit_ratio_ppm = 500000; /* default is 50% */
int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
--
1.7.7.6
On 09/05/2013 05:51 AM, Jerome Marchand wrote:
> This patch adds the new overcommit_ratio_ppm sysctl variable that
> allow to set overcommit ratio with a part per million precision.
> The old overcommit_ratio variable can still be used to set and read
> the ratio with a 1% precision. That way, overcommit_ratio interface
> isn't broken in any way that I can imagine.
Looks like a pretty sane solution. Could you also make a Documentation/
update, please?
On 09/05/2013 04:41 PM, Dave Hansen wrote:
> On 09/05/2013 05:51 AM, Jerome Marchand wrote:
>> This patch adds the new overcommit_ratio_ppm sysctl variable that
>> allow to set overcommit ratio with a part per million precision.
>> The old overcommit_ratio variable can still be used to set and read
>> the ratio with a 1% precision. That way, overcommit_ratio interface
>> isn't broken in any way that I can imagine.
>
> Looks like a pretty sane solution. Could you also make a Documentation/
> update, please?
Damn! I forgot. Will do.
Thanks
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to [email protected]. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
>
hi!
> >> This patch adds the new overcommit_ratio_ppm sysctl variable that
> >> allow to set overcommit ratio with a part per million precision.
> >> The old overcommit_ratio variable can still be used to set and read
> >> the ratio with a 1% precision. That way, overcommit_ratio interface
> >> isn't broken in any way that I can imagine.
> >
> > Looks like a pretty sane solution. Could you also make a Documentation/
> > update, please?
>
> Damn! I forgot. Will do.
Actually... would something like overcommit_bytes be a better interface? overcommit_pages?
If the system would normally allow allocating "n" pages, with overcommit
it would allow allocating "n + overcommit_pages" pages. That seems
like the right granularity...
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
On 09/06/2013 12:11 AM, Pavel Machek wrote:
> hi!
>
>>>> This patch adds the new overcommit_ratio_ppm sysctl variable that
>>>> allow to set overcommit ratio with a part per million precision.
>>>> The old overcommit_ratio variable can still be used to set and read
>>>> the ratio with a 1% precision. That way, overcommit_ratio interface
>>>> isn't broken in any way that I can imagine.
>>>
>>> Looks like a pretty sane solution. Could you also make a Documentation/
>>> update, please?
>>
>> Damn! I forgot. Will do.
>
> Actually... would something like overcommit_bytes be better interface? overcommit_pages?
>
> If system would normally allow allocating "n" pages, with overcommit
> it would allow allocating "n + overcommit_pages" pages. That seems
> like right granularity...
>
I don't know what you mean by "normally".
Anyway, I've considered that option: my concern about mixing absolute and
proportional values is that they would diverge if the amount of RAM varies
(e.g. memory hotplug or virt balloon driver).
Changes since v2:
- update documentation
Changes since v1:
- use overcommit_ratio_ppm instead of overcommit_kbytes
- keep both variables in sync
Some applications that run on HPC clusters are designed around the
availability of RAM and the overcommit ratio is fine tuned to get the
maximum usage of memory without swapping. With growing memory, the 1%
of all RAM grain provided by overcommit_ratio has become too coarse
for these workloads (on a 2TB machine it represents no less than
20GB).
This patch adds the new overcommit_ratio_ppm sysctl variable, which
allows setting the overcommit ratio with parts-per-million precision.
The old overcommit_ratio variable can still be used to set and read
the ratio with a 1% precision. That way, overcommit_ratio interface
isn't broken in any way that I can imagine.
Signed-off-by: Jerome Marchand <[email protected]>
---
Documentation/sysctl/vm.txt | 10 +++++
Documentation/vm/overcommit-accounting | 7 ++--
include/linux/mman.h | 6 ++--
include/linux/sysctl.h | 2 +
kernel/sysctl.c | 63 ++++++++++++++++++++++++++++++--
mm/mmap.c | 2 +-
mm/nommu.c | 2 +-
7 files changed, 81 insertions(+), 11 deletions(-)
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 36ecc26..5cd5c53 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -49,6 +49,7 @@ Currently, these files are in /proc/sys/vm:
- oom_kill_allocating_task
- overcommit_memory
- overcommit_ratio
+- overcommit_ratio_ppm
- page-cluster
- panic_on_oom
- percpu_pagelist_fraction
@@ -591,6 +592,15 @@ overcommit_ratio:
When overcommit_memory is set to 2, the committed address
space is not permitted to exceed swap plus this percentage
of physical RAM. See above.
+If overcommit_ratio_ppm has been set, overcommit_ratio shows a
+rounded value.
+
+==============================================================
+
+overcommit_ratio_ppm:
+
+Same as overcommit_ratio, but allows the ratio to be set with a finer
+granularity (parts per million).
==============================================================
diff --git a/Documentation/vm/overcommit-accounting b/Documentation/vm/overcommit-accounting
index 8eaa2fc..15b5ecb 100644
--- a/Documentation/vm/overcommit-accounting
+++ b/Documentation/vm/overcommit-accounting
@@ -14,8 +14,8 @@ The Linux kernel supports the following overcommit handling modes
2 - Don't overcommit. The total address space commit
for the system is not permitted to exceed swap + a
- configurable percentage (default is 50) of physical RAM.
- Depending on the percentage you use, in most situations
+ configurable ratio (default is 50%) of physical RAM.
+ Depending on the ratio you use, in most situations
this means a process will not be killed while accessing
pages but will receive errors on memory allocation as
appropriate.
@@ -26,7 +26,8 @@ The Linux kernel supports the following overcommit handling modes
The overcommit policy is set via the sysctl `vm.overcommit_memory'.
-The overcommit percentage is set via `vm.overcommit_ratio'.
+The overcommit percentage is set via `vm.overcommit_ratio' or
+`vm.overcommit_ratio_ppm'.
The current overcommit limit and amount committed are viewable in
/proc/meminfo as CommitLimit and Committed_AS respectively.
diff --git a/include/linux/mman.h b/include/linux/mman.h
index d622d34..24f9c12 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -11,7 +11,7 @@
#include <linux/swap.h>
extern int sysctl_overcommit_memory;
-extern int sysctl_overcommit_ratio;
+extern int sysctl_overcommit_ratio_ppm;
extern struct percpu_counter vm_committed_as;
#ifdef CONFIG_SMP
@@ -96,7 +96,7 @@ calc_vm_flag_bits(unsigned long flags)
*/
static inline unsigned long vm_commit_limit()
{
- return ((totalram_pages - hugetlb_total_pages())
- * sysctl_overcommit_ratio / 100) + total_swap_pages;
+ return ((u64) (totalram_pages - hugetlb_total_pages())
+ * sysctl_overcommit_ratio_ppm / 1000000) + total_swap_pages;
}
#endif /* _LINUX_MMAN_H */
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 14a8ff2..2e2389c 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -51,6 +51,8 @@ extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
extern int proc_dointvec_ms_jiffies(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
+extern int proc_dointvec_percent_ppm(struct ctl_table *, int,
+ void __user *, size_t *, loff_t *);
extern int proc_doulongvec_minmax(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 07f6fc4..a94ff8d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -96,7 +96,7 @@
/* External variables not in a header file. */
extern int sysctl_overcommit_memory;
-extern int sysctl_overcommit_ratio;
+extern int sysctl_overcommit_ratio_ppm;
extern int max_threads;
extern int suid_dumpable;
#ifdef CONFIG_COREDUMP
@@ -1116,8 +1116,15 @@ static struct ctl_table vm_table[] = {
},
{
.procname = "overcommit_ratio",
- .data = &sysctl_overcommit_ratio,
- .maxlen = sizeof(sysctl_overcommit_ratio),
+ .data = &sysctl_overcommit_ratio_ppm,
+ .maxlen = sizeof(sysctl_overcommit_ratio_ppm),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_percent_ppm,
+ },
+ {
+ .procname = "overcommit_ratio_ppm",
+ .data = &sysctl_overcommit_ratio_ppm,
+ .maxlen = sizeof(sysctl_overcommit_ratio_ppm),
.mode = 0644,
.proc_handler = proc_dointvec,
},
@@ -2433,6 +2440,56 @@ int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
do_proc_dointvec_ms_jiffies_conv, NULL);
}
+static int do_proc_dointvec_percent_ppm_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ unsigned long ppm = (*negp ? -*lvalp : *lvalp) * 10000;
+
+ if (ppm > INT_MAX)
+ return 1;
+ *valp = (int)ppm;
+ } else {
+ int val = *valp;
+ unsigned long lval;
+ if (val < 0) {
+ *negp = true;
+ lval = (unsigned long)-val;
+ } else {
+ *negp = false;
+ lval = (unsigned long)val;
+ }
+ *lvalp = lval / 10000;
+ if (lval % 10000 >= 5000)
+ (*lvalp)++;
+ }
+ return 0;
+}
+
+/**
+ * proc_dointvec_percent_ppm - read a vector of integers as percent and convert it to ppm
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ * @ppos: the current position in the file
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in percents, and are converted
+ * into parts per million.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_percent_ppm(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table, write, buffer, lenp, ppos,
+ do_proc_dointvec_percent_ppm_conv, NULL);
+}
+
static int proc_do_cad_pid(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
diff --git a/mm/mmap.c b/mm/mmap.c
index 3a1bd2c..b996483 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -85,7 +85,7 @@ pgprot_t vm_get_page_prot(unsigned long vm_flags)
EXPORT_SYMBOL(vm_get_page_prot);
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
-int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
+int sysctl_overcommit_ratio_ppm __read_mostly = 500000; /* default is 50% */
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
diff --git a/mm/nommu.c b/mm/nommu.c
index d8a957b..cf10a9b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -59,7 +59,7 @@ unsigned long max_mapnr;
unsigned long highest_memmap_pfn;
struct percpu_counter vm_committed_as;
int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
-int sysctl_overcommit_ratio = 50; /* default is 50% */
+int sysctl_overcommit_ratio_ppm = 500000; /* default is 50% */
int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
--
1.7.7.6