2018-02-08 02:12:11

by Matthew Wilcox

Subject: [RFC] Warn the user when they could overflow mapcount


Kirill and I were talking about trying to overflow page->_mapcount
the other day and realised that the default settings of pid_max and
max_map_count prevent it [1]. But there isn't even documentation to
warn a sysadmin that by changing them they may have opened their system
up to a sufficiently-determined attacker.

I'm not sufficiently wise in the ways of the MM to understand exactly
what goes wrong if we do wrap mapcount. Kirill says:

rmap depends on mapcount to decide when the page is no longer mapped.
If it sees page_mapcount() == 0 due to 32-bit wrap we are screwed;
data corruption, etc.

That seems pretty bad. So here's a patch which adds documentation to the
two sysctls that a sysadmin could use to shoot themselves in the foot,
and adds a warning if they change either of them to a dangerous value.
It's possible to get into a dangerous situation without triggering this
warning (already have the file mapped a lot of times, then lower pid_max,
then raise max_map_count, then map the file a lot more times), but it's
unlikely to happen.

Comments?

[1] map_count counts the number of times that a page is mapped to
userspace; max_map_count restricts the number of times a process can
map a page and pid_max restricts the number of processes that can exist.
So map_count can never be larger than pid_max * max_map_count.
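
(For reference: with the stock defaults of pid_max = 32768 and
max_map_count = 65530, the worst case is 32768 * 65530 = 2,147,287,040,
which stays just below INT_MAX = 2,147,483,647.)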

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 412314eebda6..ec90cd633e99 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -718,6 +718,8 @@ pid_max:
PID allocation wrap value. When the kernel's next PID value
reaches this value, it wraps back to a minimum PID value.
PIDs of value pid_max or larger are not allocated.
+Increasing this value without decreasing vm.max_map_count may
+allow a hostile user to corrupt kernel memory

==============================================================

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index ff234d229cbb..0ab306ea8f80 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -379,7 +379,8 @@ While most applications need less than a thousand maps, certain
programs, particularly malloc debuggers, may consume lots of them,
e.g., up to one or two maps per allocation.

-The default value is 65536.
+The default value is 65530. Increasing this value without decreasing
+pid_max may allow a hostile user to corrupt kernel memory.

=============================================================

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 173d2484f6e3..ebc301b21589 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -123,8 +123,6 @@ extern int mmap_rnd_compat_bits __read_mostly;
#define MAPCOUNT_ELF_CORE_MARGIN (5)
#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)

-extern int sysctl_max_map_count;
-
extern unsigned long sysctl_user_reserve_kbytes;
extern unsigned long sysctl_admin_reserve_kbytes;

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 7633d55d9a24..7bb10c1b3be3 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -4,6 +4,8 @@

#include <linux/rculist.h>

+extern int pid_max;
+
enum pid_type
{
PIDTYPE_PID,
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 992bc9948232..c939f403ad08 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -235,5 +235,9 @@ static inline void setup_sysctl_set(struct ctl_table_set *p,

int sysctl_max_threads(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
+int sysctl_pid_max(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
+int sysctl_max_map_count(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);

#endif /* _LINUX_SYSCTL_H */
diff --git a/kernel/pid.c b/kernel/pid.c
index 5d30c87e3c42..9e230ae214c9 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -61,6 +61,27 @@ int pid_max = PID_MAX_DEFAULT;

int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
+extern int max_map_count;
+
+int sysctl_pid_max(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int ret;
+
+ t = *table;
+ t.data = &pid_max;
+ t.extra1 = &pid_max_min;
+ t.extra2 = &pid_max_max;
+
+ ret = proc_douintvec_minmax(&t, write, buffer, lenp, ppos);
+ if (ret || !write)
+ return ret;
+
+ if ((INT_MAX / max_map_count) > pid_max)
+ pr_warn("pid_max is dangerously large\n");
+ return 0;
+}

/*
* PID-map pages start out as NULL, they get allocated upon
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 0b53eef7d34b..e24becc39020 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -308,7 +308,6 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
return ret;
}

-extern int pid_max;
static int zero = 0;
static struct ctl_table pid_ns_ctl_table[] = {
{
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2fb4e27c636a..a137acc0971f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -105,8 +105,6 @@ extern int core_uses_pid;
extern char core_pattern[];
extern unsigned int core_pipe_limit;
#endif
-extern int pid_max;
-extern int pid_max_min, pid_max_max;
extern int percpu_pagelist_fraction;
extern int latencytop_enabled;
extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max;
@@ -784,12 +782,10 @@ static struct ctl_table kern_table[] = {
#endif
{
.procname = "pid_max",
- .data = &pid_max,
- .maxlen = sizeof (int),
+ .data = NULL,
+ .maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &pid_max_min,
- .extra2 = &pid_max_max,
+ .proc_handler = sysctl_pid_max,
},
{
.procname = "panic_on_oops",
@@ -1454,11 +1450,10 @@ static struct ctl_table vm_table[] = {
#ifdef CONFIG_MMU
{
.procname = "max_map_count",
- .data = &sysctl_max_map_count,
- .maxlen = sizeof(sysctl_max_map_count),
+ .data = NULL,
+ .maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .proc_handler = sysctl_max_map_count,
},
#else
{
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2a6d0325a761..3e9d08a1416a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -663,8 +663,6 @@ extern unsigned long tracing_thresh;

/* PID filtering */

-extern int pid_max;
-
bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
pid_t search_pid);
bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
diff --git a/mm/internal.h b/mm/internal.h
index e6bd35182dae..23b014958eb9 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -79,6 +79,7 @@ static inline void set_page_refcounted(struct page *page)
}

extern unsigned long highest_memmap_pfn;
+extern int max_map_count;

/*
* Maximum number of reclaim retries without progress before the OOM
diff --git a/mm/madvise.c b/mm/madvise.c
index 4d3c922ea1a1..5b66a4a48192 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -147,7 +147,7 @@ static long madvise_behavior(struct vm_area_struct *vma,
*prev = vma;

if (start != vma->vm_start) {
- if (unlikely(mm->map_count >= sysctl_max_map_count)) {
+ if (unlikely(mm->map_count >= max_map_count)) {
error = -ENOMEM;
goto out;
}
@@ -164,7 +164,7 @@ static long madvise_behavior(struct vm_area_struct *vma,
}

if (end != vma->vm_end) {
- if (unlikely(mm->map_count >= sysctl_max_map_count)) {
+ if (unlikely(mm->map_count >= max_map_count)) {
error = -ENOMEM;
goto out;
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 9efdc021ad22..9016dae43fee 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1355,7 +1355,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
return -EOVERFLOW;

/* Too many mappings? */
- if (mm->map_count > sysctl_max_map_count)
+ if (mm->map_count > max_map_count)
return -ENOMEM;

/* Obtain the address to map to. we verify (or select) it and ensure
@@ -2546,7 +2546,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
}

/*
- * __split_vma() bypasses sysctl_max_map_count checking. We use this where it
+ * __split_vma() bypasses max_map_count checking. We use this where it
* has already been checked or doesn't make sense to fail.
*/
int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -2621,7 +2621,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, int new_below)
{
- if (mm->map_count >= sysctl_max_map_count)
+ if (mm->map_count >= max_map_count)
return -ENOMEM;

return __split_vma(mm, vma, addr, new_below);
@@ -2672,7 +2672,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
* not exceed its limit; but let map_count go just above
* its limit temporarily, to help free resources as expected.
*/
- if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
+ if (end < vma->vm_end && mm->map_count >= max_map_count)
return -ENOMEM;

error = __split_vma(mm, vma, start, 0);
@@ -2917,7 +2917,7 @@ static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long
if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
return -ENOMEM;

- if (mm->map_count > sysctl_max_map_count)
+ if (mm->map_count > max_map_count)
return -ENOMEM;

if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
@@ -3532,6 +3532,30 @@ void mm_drop_all_locks(struct mm_struct *mm)
mutex_unlock(&mm_all_locks_mutex);
}

+int max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
+
+int sysctl_max_map_count(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int ret;
+ int min = 0;
+ int max = ~0;
+
+ t = *table;
+ t.data = &max_map_count;
+ t.extra1 = &min;
+ t.extra2 = &max;
+
+ ret = proc_douintvec_minmax(&t, write, buffer, lenp, ppos);
+ if (ret || !write)
+ return ret;
+
+ if ((INT_MAX / max_map_count) > pid_max)
+ pr_warn("max_map_count is dangerously large\n");
+ return 0;
+}
+
/*
* initialise the percpu counter for VM
*/
diff --git a/mm/mremap.c b/mm/mremap.c
index 049470aa1e3e..fdb1d71ab2cc 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -281,7 +281,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
* We'd prefer to avoid failure later on in do_munmap:
* which may split one vma into three before unmapping.
*/
- if (mm->map_count >= sysctl_max_map_count - 3)
+ if (mm->map_count >= max_map_count - 3)
return -ENOMEM;

/*
diff --git a/mm/nommu.c b/mm/nommu.c
index 4b9864b17cb0..4cd9d4b9f473 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1487,7 +1487,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
if (vma->vm_file)
return -ENOMEM;

- if (mm->map_count >= sysctl_max_map_count)
+ if (mm->map_count >= max_map_count)
return -ENOMEM;

region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c7dd9c86e353..9a2edf3925be 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -740,16 +740,14 @@ static inline void rmv_page_order(struct page *page)

/*
* This function checks whether a page is free && is the buddy
- * we can do coalesce a page and its buddy if
+ * we can coalesce a page and its buddy if
* (a) the buddy is not in a hole (check before calling!) &&
* (b) the buddy is in the buddy system &&
* (c) a page and its buddy have the same order &&
* (d) a page and its buddy are in the same zone.
*
- * For recording whether a page is in the buddy system, we set ->_mapcount
- * PAGE_BUDDY_MAPCOUNT_VALUE.
- * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
- * serialized by zone->lock.
+ * For recording whether a page is in the buddy system, we set PG_buddy.
+ * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
*
* For recording page's order, we use page_private(page).
*/
@@ -794,9 +792,8 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
* as necessary, plus some accounting needed to play nicely with other
* parts of the VM system.
* At each level, we keep a list of pages, which are heads of continuous
- * free pages of length of (1 << order) and marked with _mapcount
- * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
- * field.
+ * free pages of length of (1 << order) and marked with PageBuddy().
+ * Page's order is recorded in page_private(page) field.
* So when we are allocating or freeing one, we can derive the state of the
* other. That is, if we allocate a small block, and both were
* free, the remainder of the region must be split into blocks.
diff --git a/mm/util.c b/mm/util.c
index c1250501364f..2ac777548694 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -538,7 +538,6 @@ EXPORT_SYMBOL_GPL(__page_mapcount);
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
int sysctl_overcommit_ratio __read_mostly = 50;
unsigned long sysctl_overcommit_kbytes __read_mostly;
-int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */




2018-02-08 02:58:14

by Jann Horn

Subject: Re: [RFC] Warn the user when they could overflow mapcount

On Thu, Feb 8, 2018 at 3:11 AM, Matthew Wilcox <[email protected]> wrote:
> Kirill and I were talking about trying to overflow page->_mapcount
> the other day and realised that the default settings of pid_max and
> max_map_count prevent it [1]. But there isn't even documentation to
> warn a sysadmin that by changing them they may have opened their system
> up to a sufficiently-determined attacker.
>
> I'm not sufficiently wise in the ways of the MM to understand exactly
> what goes wrong if we do wrap mapcount. Kirill says:
>
> rmap depends on mapcount to decide when the page is no longer mapped.
> If it sees page_mapcount() == 0 due to 32-bit wrap we are screwed;
> data corruption, etc.

How much memory would you need to trigger this? You need one
vm_area_struct per increment, and those are 200 bytes? So at least
800GiB of memory for the vm_area_structs, and maybe more for other
data structures?
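
(Roughly: a full 32-bit wrap takes 2^32 increments, and 2^32 * ~200
bytes per vm_area_struct is about 800 GiB, before counting page tables
and any other per-mapping overhead.)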

I wouldn't be too surprised if there are more 32-bit overflows that
start being realistic once you put something on the order of terabytes
of memory into one machine, given that refcount_t is 32 bits wide -
for example, the i_count. See
https://bugs.chromium.org/p/project-zero/issues/detail?id=809 for an
example where, given a sufficiently high RLIMIT_MEMLOCK, it was
possible to overflow a 32-bit refcounter on a system with just ~32GiB
of free memory (minimum required to store 2^32 64-bit pointers).

On systems with RAM on the order of terabytes, it's probably a good
idea to turn on refcount hardening to make issues like that
non-exploitable for now.

> That seems pretty bad. So here's a patch which adds documentation to the
> two sysctls that a sysadmin could use to shoot themselves in the foot,
> and adds a warning if they change either of them to a dangerous value.

I have negative feelings about this patch, mostly because AFAICS:

- It documents an issue instead of fixing it.
- It likely only addresses a small part of the actual problem.

> It's possible to get into a dangerous situation without triggering this
> warning (already have the file mapped a lot of times, then lower pid_max,
> then raise max_map_count, then map the file a lot more times), but it's
> unlikely to happen.
>
> Comments?
>
> [1] map_count counts the number of times that a page is mapped to
> userspace; max_map_count restricts the number of times a process can
> map a page and pid_max restricts the number of processes that can exist.
> So map_count can never be larger than pid_max * max_map_count.
[...]
> +int sysctl_pid_max(struct ctl_table *table, int write,
> + void __user *buffer, size_t *lenp, loff_t *ppos)
> +{
> + struct ctl_table t;
> + int ret;
> +
> + t = *table;
> + t.data = &pid_max;
> + t.extra1 = &pid_max_min;
> + t.extra2 = &pid_max_max;
> +
> + ret = proc_douintvec_minmax(&t, write, buffer, lenp, ppos);
> + if (ret || !write)
> + return ret;
> +
> + if ((INT_MAX / max_map_count) > pid_max)
> + pr_warn("pid_max is dangerously large\n");

This, reordered, is "if (pid_max * max_map_count < INT_MAX)
pr_warn(...);", no? That doesn't make sense to me. Same thing again
further down.

[...]
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 4d3c922ea1a1..5b66a4a48192 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -147,7 +147,7 @@ static long madvise_behavior(struct vm_area_struct *vma,
> *prev = vma;
>
> if (start != vma->vm_start) {
> - if (unlikely(mm->map_count >= sysctl_max_map_count)) {
> + if (unlikely(mm->map_count >= max_map_count)) {

Why the renaming?

2018-02-08 03:19:00

by Tobin C. Harding

Subject: Re: [RFC] Warn the user when they could overflow mapcount

On Wed, Feb 07, 2018 at 06:11:12PM -0800, Matthew Wilcox wrote:
>
> Kirill and I were talking about trying to overflow page->_mapcount
> the other day and realised that the default settings of pid_max and
> max_map_count prevent it [1]. But there isn't even documentation to
> warn a sysadmin that by changing them they may have opened their system
> up to a sufficiently-determined attacker.
>
> I'm not sufficiently wise in the ways of the MM to understand exactly
> what goes wrong if we do wrap mapcount. Kirill says:
>
> rmap depends on mapcount to decide when the page is no longer mapped.
> If it sees page_mapcount() == 0 due to 32-bit wrap we are screwed;
> data corruption, etc.
>
> That seems pretty bad. So here's a patch which adds documentation to the
> two sysctls that a sysadmin could use to shoot themselves in the foot,
> and adds a warning if they change either of them to a dangerous value.
> It's possible to get into a dangerous situation without triggering this
> warning (already have the file mapped a lot of times, then lower pid_max,
> then raise max_map_count, then map the file a lot more times), but it's
> unlikely to happen.
>
> Comments?
>
> [1] map_count counts the number of times that a page is mapped to
> userspace; max_map_count restricts the number of times a process can
> map a page and pid_max restricts the number of processes that can exist.
> So map_count can never be larger than pid_max * max_map_count.
>
> diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
> index 412314eebda6..ec90cd633e99 100644
> --- a/Documentation/sysctl/kernel.txt
> +++ b/Documentation/sysctl/kernel.txt
> @@ -718,6 +718,8 @@ pid_max:
> PID allocation wrap value. When the kernel's next PID value
> reaches this value, it wraps back to a minimum PID value.
> PIDs of value pid_max or larger are not allocated.
> +Increasing this value without decreasing vm.max_map_count may
> +allow a hostile user to corrupt kernel memory
>
> ==============================================================
>
> diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
> index ff234d229cbb..0ab306ea8f80 100644
> --- a/Documentation/sysctl/vm.txt
> +++ b/Documentation/sysctl/vm.txt
> @@ -379,7 +379,8 @@ While most applications need less than a thousand maps, certain
> programs, particularly malloc debuggers, may consume lots of them,
> e.g., up to one or two maps per allocation.
>
> -The default value is 65536.
> +The default value is 65530. Increasing this value without decreasing
> +pid_max may allow a hostile user to corrupt kernel memory.

Just checking - did you mean the final '0' on this value?

Tobin

2018-02-08 04:05:52

by Matthew Wilcox

Subject: Re: [RFC] Warn the user when they could overflow mapcount

On Thu, Feb 08, 2018 at 03:56:26AM +0100, Jann Horn wrote:
> How much memory would you need to trigger this? You need one
> vm_area_struct per increment, and those are 200 bytes? So at least
> 800GiB of memory for the vm_area_structs, and maybe more for other
> data structures?

That's a good point that I hadn't considered. Systems with that quantity
of memory are becoming available though.

> On systems with RAM on the order of terabytes, it's probably a good
> idea to turn on refcount hardening to make issues like that
> non-exploitable for now.

_mapcount is a bad candidate to be turned into a refcount_t. It's
completely legitimate to go to 0 and then back to 1. Also, we care
about being able to efficiently notice when it goes from 2 to 1 and
then from 1 to 0 (and we currently do that by biasing the count by -1).
I suppose it wouldn't be too hard to notice when we go from 0x7fff'ffff
to 0x8000'0000 and saturate the counter there.
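
A minimal sketch of what that saturation might look like (hypothetical
helper name, not part of any patch in this thread):

static inline void page_mapcount_inc_saturating(struct page *page)
{
	int v = atomic_inc_return(&page->_mapcount);

	/*
	 * Illustrative only: if the increment crossed 0x7fff'ffff ->
	 * 0x8000'0000, pin the counter at INT_MAX rather than letting
	 * it wrap on towards zero.  Ignores races with concurrent unmaps.
	 */
	if (unlikely(v < 0))
		atomic_set(&page->_mapcount, INT_MAX);
}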

> > That seems pretty bad. So here's a patch which adds documentation to the
> > two sysctls that a sysadmin could use to shoot themselves in the foot,
> > and adds a warning if they change either of them to a dangerous value.
>
> I have negative feelings about this patch, mostly because AFAICS:
>
> - It documents an issue instead of fixing it.

I prefer to think of it as warning the sysadmin they're doing something
dangerous, rather than preventing them from doing it ...

> - It likely only addresses a small part of the actual problem.

By this, you mean that there's a more general class of problem, and I make
no attempt to address it?

> > + if ((INT_MAX / max_map_count) > pid_max)
> > + pr_warn("pid_max is dangerously large\n");
>
> This, reordered, is "if (pid_max * max_map_count < INT_MAX)
> pr_warn(...);", no? That doesn't make sense to me. Same thing again
> further down.

I should get more sleep before writing patches.

> > - if (unlikely(mm->map_count >= sysctl_max_map_count)) {
> > + if (unlikely(mm->map_count >= max_map_count)) {
>
> Why the renaming?

Because you can't have a function and an integer with the same name,
and the usual pattern we follow is that sysctl_foo_bar() is the function
to handle the variable foo_bar.

2018-02-08 04:07:52

by Matthew Wilcox

Subject: Re: [RFC] Warn the user when they could overflow mapcount

On Thu, Feb 08, 2018 at 02:18:04PM +1100, Tobin C. Harding wrote:
> > +++ b/Documentation/sysctl/vm.txt
> > @@ -379,7 +379,8 @@ While most applications need less than a thousand maps, certain
> > programs, particularly malloc debuggers, may consume lots of them,
> > e.g., up to one or two maps per allocation.
> >
> > -The default value is 65536.
> > +The default value is 65530. Increasing this value without decreasing
> > +pid_max may allow a hostile user to corrupt kernel memory.
>
> Just checking - did you mean the final '0' on this value?

That's what my laptop emits ...

mm/mmap.c:int max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
include/linux/mm.h:#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
include/linux/mm.h:#define MAPCOUNT_ELF_CORE_MARGIN (5)

should be the same value for everybody.

2018-02-08 18:00:47

by Valdis Klētnieks

Subject: Re: [RFC] Warn the user when they could overflow mapcount

On Thu, 08 Feb 2018 03:56:26 +0100, Jann Horn said:

> I wouldn't be too surprised if there are more 32-bit overflows that
> start being realistic once you put something on the order of terabytes
> of memory into one machine, given that refcount_t is 32 bits wide -
> for example, the i_count. See
> https://bugs.chromium.org/p/project-zero/issues/detail?id=809 for an
> example where, given a sufficiently high RLIMIT_MEMLOCK, it was
> possible to overflow a 32-bit refcounter on a system with just ~32GiB
> of free memory (minimum required to store 2^32 64-bit pointers).
>
> On systems with RAM on the order of terabytes, it's probably a good
> idea to turn on refcount hardening to make issues like that
> non-exploitable for now.

I have at least 10 systems across the hall that have 3T of RAM on them
across our various HPC clusters. So this is indeed no longer a hypothetical
issue.



2018-02-08 18:06:45

by Daniel Micay

Subject: Re: [RFC] Warn the user when they could overflow mapcount

>> That seems pretty bad. So here's a patch which adds documentation to the
>> two sysctls that a sysadmin could use to shoot themselves in the foot,
>> and adds a warning if they change either of them to a dangerous value.
>
> I have negative feelings about this patch, mostly because AFAICS:
>
> - It documents an issue instead of fixing it.
> - It likely only addresses a small part of the actual problem.

The standard max_map_count / pid_max are very low and there are many
situations where either or both need to be raised.

VM fragmentation in long-lived processes is a major issue. There are
allocators like jemalloc designed to minimize VM fragmentation by
never unmapping memory, but they rely on nothing else using mmap
regularly so that all their ranges can stay merged
together, unless they decide to do something like making a 1TB
PROT_NONE mapping up front to slowly consume. If you Google this
sysctl name, you'll find lots of people running into the limit. If
you're using a debugging / hardened allocator designed to use a lot of
guard pages, the standard max_map_count is close to unusable...
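
For reference, the kind of up-front reservation meant above is roughly
the following (a sketch with made-up numbers, not taken from jemalloc):

#include <sys/mman.h>
#include <stdio.h>

int main(void)
{
	/*
	 * Reserve 1TB of address space with PROT_NONE; no memory is
	 * committed thanks to MAP_NORESERVE.  The allocator later flips
	 * pieces to PROT_READ|PROT_WRITE with mprotect() as it consumes
	 * them, so adjacent in-use chunks merge back into a single VMA.
	 */
	void *arena = mmap(NULL, 1UL << 40, PROT_NONE,
			   MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE,
			   -1, 0);

	if (arena == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	return 0;
}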

I think the same thing applies to pid_max. There are too many
reasonable reasons to increase it. Process-per-request is quite
reasonable if you care about robustness / security and want to sandbox
each request handler. Look at Chrome / Chromium: it's currently
process-per-site-instance, but they're moving to having more processes
with site isolation to isolate iframes into their own processes to
work towards enforcing the boundaries between sites at a process
level. It's way worse for fine-grained server-side sandboxing. Using a
lot of processes like this does counter VM fragmentation especially if
long-lived processes doing a lot of work are mostly avoided... but if
your allocators like using guard pages you're still going to hit the
limit.

I do think the default value in the documentation should be corrected, but
if there's a clear problem with raising these it really needs to be
fixed. Google either of the sysctl names and look at all the people
running into issues and needing to raise them. It's only going to
become more common to raise these with people trying to use lots of
fine-grained sandboxing. Process-per-request is back in style.

2018-02-08 18:57:55

by Matthew Wilcox

Subject: Re: [RFC] Warn the user when they could overflow mapcount

On Thu, Feb 08, 2018 at 01:05:33PM -0500, Daniel Micay wrote:
> The standard max_map_count / pid_max are very low and there are many
> situations where either or both need to be raised.

[snip good reasons]

> I do think the default value in the documentation should be corrected, but
> if there's a clear problem with raising these it really needs to be
> fixed. Google either of the sysctl names and look at all the people
> running into issues and needing to raise them. It's only going to
> become more common to raise these with people trying to use lots of
> fine-grained sandboxing. Process-per-request is back in style.

So we should make the count saturate instead, then? That's going to
be interesting.

2018-02-08 19:36:34

by Daniel Micay

Subject: Re: [RFC] Warn the user when they could overflow mapcount

I don't think the kernel can get away with the current approach.
Object sizes and counts on 64-bit should be 64-bit unless there's a
verifiable reason they can get away with 32-bit. Having it leak
memory isn't okay, just much less bad than vulnerabilities exploitable
beyond just denial of service.

Every 32-bit reference count should probably have a short comment
explaining why it can't overflow on 64-bit... if that can't be written
or it's too complicated to demonstrate, it probably needs to be
64-bit. It's one of many pervasive forms of integer overflows in the
kernel... :(

2018-02-08 19:43:38

by Matthew Wilcox

Subject: Re: [RFC] Warn the user when they could overflow mapcount

On Thu, Feb 08, 2018 at 02:33:58PM -0500, Daniel Micay wrote:
> I don't think the kernel can get away with the current approach.
> Object sizes and counts on 64-bit should be 64-bit unless there's a
> verifiable reason they can get away with 32-bit. Having it leak
> memory isn't okay, just much less bad than vulnerabilities exploitable
> beyond just denial of service.
>
> Every 32-bit reference count should probably have a short comment
> explaining why it can't overflow on 64-bit... if that can't be written
> or it's too complicated to demonstrate, it probably needs to be
> 64-bit. It's one of many pervasive forms of integer overflows in the
> kernel... :(

Expanding _mapcount to 64-bit, and for that matter expanding _refcount
to 64-bit too is going to have a severe effect on memory consumption.
It'll take an extra 8 bytes per page of memory in your system, so 2GB
for a machine with 1TB memory (earlier we established this attack isn't
feasible for a machine with less than 1TB).
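
(With 4KB pages, 1TB is 256M struct pages; 8 extra bytes for each of
them is 2GB.)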

It's not something a user is going to hit accidentally; it is only
relevant to an attack scenario. That's a lot of memory to sacrifice to
defend against this attack. I think we should be able to do better.

2018-02-08 19:50:04

by Daniel Micay

Subject: Re: [RFC] Warn the user when they could overflow mapcount

I guess it could saturate and then switch to tracking the count via an
object pointer -> count mapping with a global lock? Whatever the
solution is should probably be a generic one since it's a recurring
issue.

2018-02-08 20:22:04

by Matthew Wilcox

Subject: Re: [RFC] Warn the user when they could overflow mapcount

On Thu, Feb 08, 2018 at 02:48:52PM -0500, Daniel Micay wrote:
> I guess it could saturate and then switch to tracking the count via an
> object pointer -> count mapping with a global lock? Whatever the
> solution is should probably be a generic one since it's a recurring
> issue.

I was thinking of saturating _mapcount at 2 billion (allowing _refcount
the extra space to go into the 2-3 billion range). Once saturated,
disallow all attempts at mapping it until _mapcount has gone below 2
billion again. We can walk the page->mapping->i_mmap tree and find
tasks with more than, say, 10 mappings each, and kill them.

Now that I think about it, though, perhaps the simplest solution is not
to worry about checking whether _mapcount has saturated, and instead when
adding a new mmap, check whether this task already has it mapped 10 times.
If so, refuse the mapping.

Now we can argue that, since pid_max is smaller than 400 million,
_mapcount will never overflow, and so we don't need to check it.
Convincing argument?
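
(For scale, assuming the usual 64-bit PID_MAX_LIMIT of 4*1024*1024: with
at most 10 mappings per process, a single page could be mapped at most
about 42 million times, well short of where a 32-bit _mapcount wraps.)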


2018-02-08 21:39:56

by Matthew Wilcox

Subject: [RFC] Limit mappings to ten per page per process

On Thu, Feb 08, 2018 at 12:21:00PM -0800, Matthew Wilcox wrote:
> Now that I think about it, though, perhaps the simplest solution is not
> to worry about checking whether _mapcount has saturated, and instead when
> adding a new mmap, check whether this task already has it mapped 10 times.
> If so, refuse the mapping.

That turns out to be quite easy. Comments on this approach?

diff --git a/mm/mmap.c b/mm/mmap.c
index 9efdc021ad22..fd64ff662117 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1615,6 +1615,34 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
}

+/**
+ * mmap_max_overlaps - Check the process has not exceeded its quota of mappings.
+ * @mm: The memory map for the process creating the mapping.
+ * @file: The file the mapping is coming from.
+ * @pgoff: The start of the mapping in the file.
+ * @count: The number of pages to map.
+ *
+ * Return: %true if this region of the file has too many overlapping mappings
+ * by this process.
+ */
+bool mmap_max_overlaps(struct mm_struct *mm, struct file *file,
+ pgoff_t pgoff, pgoff_t count)
+{
+ unsigned int overlaps = 0;
+ struct vm_area_struct *vma;
+
+ if (!file)
+ return false;
+
+ vma_interval_tree_foreach(vma, &file->f_mapping->i_mmap,
+ pgoff, pgoff + count) {
+ if (vma->vm_mm == mm)
+ overlaps++;
+ }
+
+ return overlaps > 9;
+}
+
unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf)
@@ -1640,6 +1668,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
return -ENOMEM;
}

+ if (mmap_max_overlaps(mm, file, pgoff, len >> PAGE_SHIFT))
+ return -ENOMEM;
+
/* Clear old maps */
while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
&rb_parent)) {
diff --git a/mm/mremap.c b/mm/mremap.c
index 049470aa1e3e..27cf5cf9fc0f 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -430,6 +430,10 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
(new_len - old_len) >> PAGE_SHIFT))
return ERR_PTR(-ENOMEM);

+ if (mmap_max_overlaps(mm, vma->vm_file, pgoff,
+ (new_len - old_len) >> PAGE_SHIFT))
+ return ERR_PTR(-ENOMEM);
+
if (vma->vm_flags & VM_ACCOUNT) {
unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
if (security_vm_enough_memory_mm(mm, charged))
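
To illustrate how the new limit would surface to userspace (a
hypothetical test program, not part of the patch): with the check
above, the 11th MAP_SHARED mapping of the same file page by a single
process would fail with ENOMEM.

#include <sys/mman.h>
#include <fcntl.h>
#include <stdio.h>

int main(int argc, char **argv)
{
	int fd = open(argv[1], O_RDONLY);
	int i;

	if (fd < 0) {
		perror(argv[1]);
		return 1;
	}

	for (i = 0; i < 16; i++) {
		/* Each mapping starts at file offset 0, so none of them
		 * can merge and every one overlaps the same page. */
		void *p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);

		if (p == MAP_FAILED) {
			/* expected at i == 10 with the patch applied */
			perror("mmap");
			return 1;
		}
	}
	return 0;
}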

2018-02-09 01:48:50

by Daniel Micay

Subject: Re: [RFC] Warn the user when they could overflow mapcount

I think there are likely legitimate programs mapping something a bunch of times.

Falling back to a global object -> count mapping (an rbtree / radix
trie or whatever) with a lock once it hits saturation wouldn't risk
breaking something. It would permanently leave the inline count
saturated and just use the address of the inline counter as the key
for the map to find the 64-bit counter. Once it gets to 0 in the map,
it can delete it from the map and do the standard freeing process,
avoiding leaks. It would really just make it a 64-bit reference count
heavily size optimized for the common case. It would work elsewhere
too, not just this case.
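
A rough sketch of the shape of that fallback (hypothetical names, keyed
by the address of the saturated inline counter; allocation-failure
handling omitted for brevity):

#include <linux/hashtable.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct overflow_count {
	struct hlist_node node;
	void *key;	/* address of the saturated inline counter */
	u64 extra;	/* references beyond the saturation point */
};

static DEFINE_HASHTABLE(overflow_counts, 8);
static DEFINE_SPINLOCK(overflow_lock);

static void overflow_inc(void *key)
{
	struct overflow_count *oc;

	spin_lock(&overflow_lock);
	hash_for_each_possible(overflow_counts, oc, node, (unsigned long)key) {
		if (oc->key == key) {
			oc->extra++;
			goto out;
		}
	}
	oc = kmalloc(sizeof(*oc), GFP_ATOMIC);
	if (oc) {
		oc->key = key;
		oc->extra = 1;
		hash_add(overflow_counts, &oc->node, (unsigned long)key);
	}
out:
	spin_unlock(&overflow_lock);
}

/* Returns true once the overflow part has drained back to zero. */
static bool overflow_dec(void *key)
{
	struct overflow_count *oc;
	bool drained = false;

	spin_lock(&overflow_lock);
	hash_for_each_possible(overflow_counts, oc, node, (unsigned long)key) {
		if (oc->key != key)
			continue;
		if (--oc->extra == 0) {
			hash_del(&oc->node);
			kfree(oc);
			drained = true;
		}
		break;
	}
	spin_unlock(&overflow_lock);
	return drained;
}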

2018-02-09 04:27:05

by Kirill A. Shutemov

Subject: Re: [RFC] Limit mappings to ten per page per process

On Thu, Feb 08, 2018 at 01:37:43PM -0800, Matthew Wilcox wrote:
> On Thu, Feb 08, 2018 at 12:21:00PM -0800, Matthew Wilcox wrote:
> > Now that I think about it, though, perhaps the simplest solution is not
> > to worry about checking whether _mapcount has saturated, and instead when
> > adding a new mmap, check whether this task already has it mapped 10 times.
> > If so, refuse the mapping.
>
> That turns out to be quite easy. Comments on this approach?

This *may* break some remap_file_pages() users.

And it may be rather costly for popular binaries. Consider libc.so.

--
Kirill A. Shutemov

2018-02-14 13:52:49

by Matthew Wilcox

Subject: Re: [RFC] Limit mappings to ten per page per process

On Fri, Feb 09, 2018 at 07:26:09AM +0300, Kirill A. Shutemov wrote:
> On Thu, Feb 08, 2018 at 01:37:43PM -0800, Matthew Wilcox wrote:
> > On Thu, Feb 08, 2018 at 12:21:00PM -0800, Matthew Wilcox wrote:
> > > Now that I think about it, though, perhaps the simplest solution is not
> > > to worry about checking whether _mapcount has saturated, and instead when
> > > adding a new mmap, check whether this task already has it mapped 10 times.
> > > If so, refuse the mapping.
> >
> > That turns out to be quite easy. Comments on this approach?
>
> This *may* break some remap_file_pages() users.

We have some?! ;-) I don't understand the use case where they want to
map the same page of a file multiple times into the same process. I mean,
yes, of course, they might ask for it, but I don't understand why they would.
Do you have any insight here?

> And it may be rather costly for popular binaries. Consider libc.so.

We already walk this tree to insert the mapping; this just adds a second
walk of the tree to check which overlapping mappings exist. I would
expect it to just make the tree cache-hot.

2018-02-14 14:07:28

by Kirill A. Shutemov

Subject: Re: [RFC] Limit mappings to ten per page per process

On Wed, Feb 14, 2018 at 05:51:41AM -0800, Matthew Wilcox wrote:
> On Fri, Feb 09, 2018 at 07:26:09AM +0300, Kirill A. Shutemov wrote:
> > On Thu, Feb 08, 2018 at 01:37:43PM -0800, Matthew Wilcox wrote:
> > > On Thu, Feb 08, 2018 at 12:21:00PM -0800, Matthew Wilcox wrote:
> > > > Now that I think about it, though, perhaps the simplest solution is not
> > > > to worry about checking whether _mapcount has saturated, and instead when
> > > > adding a new mmap, check whether this task already has it mapped 10 times.
> > > > If so, refuse the mapping.
> > >
> > > That turns out to be quite easy. Comments on this approach?
> >
> > This *may* break some remap_file_pages() users.
>
> We have some?! ;-)

I can't prove otherwise :)

> I don't understand the use case where they want to map the same page of
> a file multiple times into the same process. I mean, yes, of course,
> they might ask for it, but I don't understand why they would. Do you
> have any insight here?

Some form of data deduplication? Like having repeating chunks stored once
on persistent storage and in the page cache, but put into memory in
"uncompressed" form.

It's not limited to remap_file_pages(). Plain mmap() can be used for this
too.

--
Kirill A. Shutemov

2018-03-02 22:02:43

by Matthew Wilcox

Subject: [RFC] Handle mapcount overflows


Here's my third effort to handle page->_mapcount overflows.

The idea is to minimise overhead, so we keep a list of users with more
than 5000 mappings. In order to overflow _mapcount, you have to have
2 billion mappings, so you'd need 400,000 tasks to evade the tracking,
and your sysadmin has probably accused you of forkbombing the system
long before then. Not to mention the 6GB of RAM you consumed just in
stacks and the 24GB of RAM you consumed in page tables ... but I digress.

Let's assume that the sysadmin has increased the number of processes to
100,000. You'd need to create 20,000 mappings per process to overflow
_mapcount, and they'd end up on the 'heavy_users' list. Not everybody
on the heavy_users list is going to be guilty, but if we hit an overflow,
we look at everybody on the heavy_users list and if they've got the page
mapped more than 1000 times, they get a SIGSEGV.

I'm not entirely sure how to forcibly tear down a task's mappings, so
I've just left a comment in there to do that. Looking for feedback on
this approach.

diff --git a/mm/internal.h b/mm/internal.h
index 7059a8389194..977852b8329e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -97,6 +97,11 @@ extern void putback_lru_page(struct page *page);
*/
extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);

+#ifdef CONFIG_64BIT
+extern void mm_mapcount_overflow(struct page *page);
+#else
+static inline void mm_mapcount_overflow(struct page *page) { }
+#endif
/*
* in mm/page_alloc.c
*/
diff --git a/mm/mmap.c b/mm/mmap.c
index 9efdc021ad22..575766ec02f8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1315,6 +1315,115 @@ static inline int mlock_future_check(struct mm_struct *mm,
return 0;
}

+#ifdef CONFIG_64BIT
+/*
+ * Machines with more than 2TB of memory can create enough VMAs to overflow
+ * page->_mapcount if they all point to the same page. 32-bit machines do
+ * not need to be concerned.
+ */
+/*
+ * Experimentally determined. gnome-shell currently uses fewer than
+ * 3000 mappings, so should have zero effect on desktop users.
+ */
+#define mm_track_threshold 5000
+static DEFINE_SPINLOCK(heavy_users_lock);
+static DEFINE_IDR(heavy_users);
+
+static void mmap_track_user(struct mm_struct *mm, int max)
+{
+ struct mm_struct *entry;
+ unsigned int id;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock(&heavy_users_lock);
+ idr_for_each_entry(&heavy_users, entry, id) {
+ if (entry == mm)
+ break;
+ if (entry->map_count < mm_track_threshold)
+ idr_remove(&heavy_users, id);
+ }
+ if (!entry)
+ idr_alloc(&heavy_users, mm, 0, 0, GFP_ATOMIC);
+ spin_unlock(&heavy_users_lock);
+}
+
+static void mmap_untrack_user(struct mm_struct *mm)
+{
+ struct mm_struct *entry;
+ unsigned int id;
+
+ spin_lock(&heavy_users_lock);
+ idr_for_each_entry(&heavy_users, entry, id) {
+ if (entry == mm) {
+ idr_remove(&heavy_users, id);
+ break;
+ }
+ }
+ spin_unlock(&heavy_users_lock);
+}
+
+static void kill_mm(struct task_struct *tsk)
+{
+ /* Tear down the mappings first */
+ do_send_sig_info(SIGKILL, SEND_SIG_FORCED, tsk, true);
+}
+
+static void kill_abuser(struct mm_struct *mm)
+{
+ struct task_struct *tsk;
+
+ for_each_process(tsk)
+ if (tsk->mm == mm)
+ break;
+
+ if (down_write_trylock(&mm->mmap_sem)) {
+ kill_mm(tsk);
+ up_write(&mm->mmap_sem);
+ } else {
+ do_send_sig_info(SIGKILL, SEND_SIG_FORCED, tsk, true);
+ }
+}
+
+void mm_mapcount_overflow(struct page *page)
+{
+ struct mm_struct *entry = current->mm;
+ unsigned int id;
+ struct vm_area_struct *vma;
+ struct address_space *mapping = page_mapping(page);
+ unsigned long pgoff = page_to_pgoff(page);
+ unsigned int count = 0;
+
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff + 1) {
+ if (vma->vm_mm == entry)
+ count++;
+ if (count > 1000)
+ kill_mm(current);
+ }
+
+ rcu_read_lock();
+ idr_for_each_entry(&heavy_users, entry, id) {
+ count = 0;
+
+ vma_interval_tree_foreach(vma, &mapping->i_mmap,
+ pgoff, pgoff + 1) {
+ if (vma->vm_mm == entry)
+ count++;
+ if (count > 1000) {
+ kill_abuser(entry);
+ goto out;
+ }
+ }
+ }
+ if (!entry)
+ panic("No abusers found but mapcount exceeded\n");
+out:
+ rcu_read_unlock();
+}
+#else
+static void mmap_track_user(struct mm_struct *mm, int max) { }
+static void mmap_untrack_user(struct mm_struct *mm) { }
+#endif
+
/*
* The caller must hold down_write(&current->mm->mmap_sem).
*/
@@ -1357,6 +1466,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
/* Too many mappings? */
if (mm->map_count > sysctl_max_map_count)
return -ENOMEM;
+ if (mm->map_count > mm_track_threshold)
+ mmap_track_user(mm, mm_track_threshold);

/* Obtain the address to map to. we verify (or select) it and ensure
* that it represents a valid section of the address space.
@@ -2997,6 +3108,8 @@ void exit_mmap(struct mm_struct *mm)
/* mm's last user has gone, and its about to be pulled down */
mmu_notifier_release(mm);

+ mmap_untrack_user(mm);
+
if (mm->locked_vm) {
vma = mm->mmap;
while (vma) {
diff --git a/mm/rmap.c b/mm/rmap.c
index 47db27f8049e..d88acf5c98e9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1190,6 +1190,7 @@ void page_add_file_rmap(struct page *page, bool compound)
VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
__inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
} else {
+ int v;
if (PageTransCompound(page) && page_mapping(page)) {
VM_WARN_ON_ONCE(!PageLocked(page));

@@ -1197,8 +1198,13 @@ void page_add_file_rmap(struct page *page, bool compound)
if (PageMlocked(page))
clear_page_mlock(compound_head(page));
}
- if (!atomic_inc_and_test(&page->_mapcount))
+ v = atomic_inc_return(&page->_mapcount);
+ if (likely(v > 0))
goto out;
+ if (unlikely(v < 0)) {
+ mm_mapcount_overflow(page);
+ goto out;
+ }
}
__mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
out:


2018-03-03 03:35:43

by Matthew Wilcox

Subject: Re: [RFC] Handle mapcount overflows

On Fri, Mar 02, 2018 at 01:26:37PM -0800, Matthew Wilcox wrote:
> Here's my third effort to handle page->_mapcount overflows.

If you like this approach, but wonder if it works, here's a little forkbomb
of a program and a patch to add instrumentation.

In my dmesg, I never see the max mapcount getting above 65539. I see a mix
of "unlucky!", "it him!" and "it me!" messages.

#define _GNU_SOURCE

#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>

int dummy;

int main(int argc, char **argv)
{
int fd = open(argv[1], O_RDWR);
int i;

if (fd < 0) {
perror(argv[1]);
return 1;
}

// Spawn 511 children
for (i = 0; i < 9; i++)
fork();

for (i = 0; i < 5000; i++)
dummy = *(int *)mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
}


diff --git a/mm/mmap.c b/mm/mmap.c
index 575766ec02f8..2b6187156db0 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1325,7 +1325,7 @@ static inline int mlock_future_check(struct mm_struct *mm,
* Experimentally determined. gnome-shell currently uses fewer than
* 3000 mappings, so should have zero effect on desktop users.
*/
-#define mm_track_threshold 5000
+#define mm_track_threshold 50
static DEFINE_SPINLOCK(heavy_users_lock);
static DEFINE_IDR(heavy_users);

@@ -1377,9 +1377,11 @@ static void kill_abuser(struct mm_struct *mm)
break;

if (down_write_trylock(&mm->mmap_sem)) {
+ printk_ratelimited("it him!\n");
kill_mm(tsk);
up_write(&mm->mmap_sem);
} else {
+ printk_ratelimited("unlucky!\n");
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, tsk, true);
}
}
@@ -1396,8 +1398,10 @@ void mm_mapcount_overflow(struct page *page)
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff + 1) {
if (vma->vm_mm == entry)
count++;
- if (count > 1000)
+ if (count > 1000) {
+ printk_ratelimited("it me!\n");
kill_mm(current);
+ }
}

rcu_read_lock();
@@ -1408,7 +1412,7 @@ void mm_mapcount_overflow(struct page *page)
pgoff, pgoff + 1) {
if (vma->vm_mm == entry)
count++;
- if (count > 1000) {
+ if (count > 10) {
kill_abuser(entry);
goto out;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index d88acf5c98e9..3f0509f6f011 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1190,6 +1190,7 @@ void page_add_file_rmap(struct page *page, bool compound)
VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
__inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
} else {
+ static int max = 0;
int v;
if (PageTransCompound(page) && page_mapping(page)) {
VM_WARN_ON_ONCE(!PageLocked(page));
@@ -1199,12 +1200,14 @@ void page_add_file_rmap(struct page *page, bool compound)
clear_page_mlock(compound_head(page));
}
v = atomic_inc_return(&page->_mapcount);
- if (likely(v > 0))
- goto out;
- if (unlikely(v < 0)) {
+ if (unlikely(v > 65535)) {
+ if (max < v) max = v;
+ printk_ratelimited("overflow %d max %d\n", v, max);
mm_mapcount_overflow(page);
goto out;
}
+ if (likely(v > 0))
+ goto out;
}
__mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
out:


2019-05-01 15:12:03

by Jann Horn

Subject: Re: [RFC] Handle mapcount overflows

[extremely slow reply]

On Fri, Mar 2, 2018 at 4:26 PM Matthew Wilcox <[email protected]> wrote:
> Here's my third effort to handle page->_mapcount overflows.
>
> The idea is to minimise overhead, so we keep a list of users with more
> than 5000 mappings. In order to overflow _mapcount, you have to have
> 2 billion mappings, so you'd need 400,000 tasks to evade the tracking,
> and your sysadmin has probably accused you of forkbombing the system
> long before then. Not to mention the 6GB of RAM you consumed just in
> stacks and the 24GB of RAM you consumed in page tables ... but I digress.
>
> Let's assume that the sysadmin has increased the number of processes to
> 100,000. You'd need to create 20,000 mappings per process to overflow
> _mapcount, and they'd end up on the 'heavy_users' list. Not everybody
> on the heavy_users list is going to be guilty, but if we hit an overflow,
> we look at everybody on the heavy_users list and if they've got the page
> mapped more than 1000 times, they get a SIGSEGV.
>
> I'm not entirely sure how to forcibly tear down a task's mappings, so
> I've just left a comment in there to do that. Looking for feedback on
> this approach.
[...]
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 9efdc021ad22..575766ec02f8 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
[...]
> +static void kill_mm(struct task_struct *tsk)
> +{
> + /* Tear down the mappings first */
> + do_send_sig_info(SIGKILL, SEND_SIG_FORCED, tsk, true);
> +}

The mapping teardown could maybe be something like
unmap_mapping_range_vma()? That doesn't remove the VMA, but it gets
rid of the PTEs; and it has the advantage of working without taking
the mmap_sem. And then it isn't even necessarily required to actually
kill the abuser; instead, the abuser would just take a minor fault on
the next access, and the abusers would take away each other's
references, slowing each other down.

> +static void kill_abuser(struct mm_struct *mm)
> +{
> + struct task_struct *tsk;
> +
> + for_each_process(tsk)
> + if (tsk->mm == mm)
> + break;

(There can be multiple processes sharing the ->mm.)

> + if (down_write_trylock(&mm->mmap_sem)) {
> + kill_mm(tsk);
> + up_write(&mm->mmap_sem);
> + } else {
> + do_send_sig_info(SIGKILL, SEND_SIG_FORCED, tsk, true);
> + }

Hmm. Having to fall back if the lock is taken here is kind of bad, I
think. __get_user_pages_locked() with locked==NULL can keep the
mmap_sem blocked arbitrarily long, meaning that an attacker could
force the fallback path, right? For example, __access_remote_vm() uses
get_user_pages_remote() with locked==NULL. And IIRC you can avoid
getting killed by a SIGKILL by being stuck in unkillable disk sleep,
which I think FUSE can create by not responding to a request.

> +}
> +
> +void mm_mapcount_overflow(struct page *page)
> +{
> + struct mm_struct *entry = current->mm;
> + unsigned int id;
> + struct vm_area_struct *vma;
> + struct address_space *mapping = page_mapping(page);
> + unsigned long pgoff = page_to_pgoff(page);
> + unsigned int count = 0;
> +
> + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff + 1) {

I think this needs the i_mmap_rwsem?

> + if (vma->vm_mm == entry)
> + count++;
> + if (count > 1000)
> + kill_mm(current);
> + }
> +
> + rcu_read_lock();
> + idr_for_each_entry(&heavy_users, entry, id) {
> + count = 0;
> +
> + vma_interval_tree_foreach(vma, &mapping->i_mmap,
> + pgoff, pgoff + 1) {
> + if (vma->vm_mm == entry)
> + count++;
> + if (count > 1000) {
> + kill_abuser(entry);
> + goto out;

Even if someone has 1000 mappings of the range in question, that
doesn't necessarily mean that there are actually any non-zero PTEs in
the abuser. This probably needs to get some feedback from
kill_abuser() to figure out whether at least one reference has been
reclaimed.

> + }
> + }
> + }
> + if (!entry)
> + panic("No abusers found but mapcount exceeded\n");
> +out:
> + rcu_read_unlock();
> +}
[...]
> @@ -1357,6 +1466,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
> /* Too many mappings? */
> if (mm->map_count > sysctl_max_map_count)
> return -ENOMEM;
> + if (mm->map_count > mm_track_threshold)
> + mmap_track_user(mm, mm_track_threshold);

I think this check would have to be copied to a few other places;
AFAIK you can e.g. use a series of mremap() calls to create multiple
mappings of the same file page. Something like:

char *addr = mmap(0x100000000, 0x1000, PROT_READ, MAP_SHARED, fd, 0);
for (int i=0; i<1000; i++) {
mremap(addr, 0x1000, 0x2000, 0);
mremap(addr+0x1000, 0x1000, 0x1000, MREMAP_FIXED|MREMAP_MAYMOVE,
0x200000000 + i * 0x1000);
}

> /* Obtain the address to map to. we verify (or select) it and ensure
> * that it represents a valid section of the address space.
> @@ -2997,6 +3108,8 @@ void exit_mmap(struct mm_struct *mm)
> /* mm's last user has gone, and its about to be pulled down */
> mmu_notifier_release(mm);
>
> + mmap_untrack_user(mm);
> +
> if (mm->locked_vm) {
> vma = mm->mmap;
> while (vma) {

I'd move that call further down, to reduce the chance that the task
blocks after being untracked but before actually dropping its
references.