2009-11-04 19:15:48

by Christoph Lameter

Subject: [MM] Make mm counters per cpu instead of atomic

From: Christoph Lameter <[email protected]>
Subject: Make mm counters per cpu

Changing the mm counters to per cpu counters is possible after the introduction
of the generic per cpu operations (currently in percpu and -next).

With that the contention on the counters in mm_struct can be avoided. The
USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems do not
need to perform atomic updates to mm counters anymore. Various code paths
can be simplified since per cpu counter updates are fast and batching
of counter updates is no longer needed.

One price to pay for these improvements is the need to scan over all percpu
counters when the actual count values are needed.

Signed-off-by: Christoph Lameter <[email protected]>

---
fs/proc/task_mmu.c | 14 +++++++++-
include/linux/mm_types.h | 16 ++++--------
include/linux/sched.h | 61 ++++++++++++++++++++---------------------------
kernel/fork.c | 25 ++++++++++++++-----
mm/filemap_xip.c | 2 -
mm/fremap.c | 2 -
mm/init-mm.c | 3 ++
mm/memory.c | 20 +++++++--------
mm/rmap.c | 10 +++----
mm/swapfile.c | 2 -
10 files changed, 84 insertions(+), 71 deletions(-)

Index: linux-2.6/include/linux/mm_types.h
===================================================================
--- linux-2.6.orig/include/linux/mm_types.h 2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/include/linux/mm_types.h 2009-11-04 13:13:42.000000000 -0600
@@ -24,11 +24,10 @@ struct address_space;

#define USE_SPLIT_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)

-#if USE_SPLIT_PTLOCKS
-typedef atomic_long_t mm_counter_t;
-#else /* !USE_SPLIT_PTLOCKS */
-typedef unsigned long mm_counter_t;
-#endif /* !USE_SPLIT_PTLOCKS */
+struct mm_counter {
+ long file;
+ long anon;
+};

/*
* Each physical page in the system has a struct page associated with
@@ -223,11 +222,8 @@ struct mm_struct {
* by mmlist_lock
*/

- /* Special counters, in some configurations protected by the
- * page_table_lock, in other configurations by being atomic.
- */
- mm_counter_t _file_rss;
- mm_counter_t _anon_rss;
+ /* Special percpu counters */
+ struct mm_counter *rss;

unsigned long hiwater_rss; /* High-watermark of RSS usage */
unsigned long hiwater_vm; /* High-water virtual memory usage */
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h 2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/include/linux/sched.h 2009-11-04 13:13:42.000000000 -0600
@@ -385,41 +385,32 @@ arch_get_unmapped_area_topdown(struct fi
extern void arch_unmap_area(struct mm_struct *, unsigned long);
extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);

-#if USE_SPLIT_PTLOCKS
-/*
- * The mm counters are not protected by its page_table_lock,
- * so must be incremented atomically.
- */
-#define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value)
-#define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member))
-#define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member)
-#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
-#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
-
-#else /* !USE_SPLIT_PTLOCKS */
-/*
- * The mm counters are protected by its page_table_lock,
- * so can be incremented directly.
- */
-#define set_mm_counter(mm, member, value) (mm)->_##member = (value)
-#define get_mm_counter(mm, member) ((mm)->_##member)
-#define add_mm_counter(mm, member, value) (mm)->_##member += (value)
-#define inc_mm_counter(mm, member) (mm)->_##member++
-#define dec_mm_counter(mm, member) (mm)->_##member--
-
-#endif /* !USE_SPLIT_PTLOCKS */
-
-#define get_mm_rss(mm) \
- (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
-#define update_hiwater_rss(mm) do { \
- unsigned long _rss = get_mm_rss(mm); \
- if ((mm)->hiwater_rss < _rss) \
- (mm)->hiwater_rss = _rss; \
-} while (0)
-#define update_hiwater_vm(mm) do { \
- if ((mm)->hiwater_vm < (mm)->total_vm) \
- (mm)->hiwater_vm = (mm)->total_vm; \
-} while (0)
+static inline unsigned long get_mm_rss(struct mm_struct *mm)
+{
+ int cpu;
+ unsigned long r = 0;
+
+ for_each_possible_cpu(cpu) {
+ struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
+
+ r = c->file + c->anon;
+ }
+
+ return r;
+}
+
+static inline void update_hiwater_rss(struct mm_struct *mm)
+{
+ unsigned long _rss = get_mm_rss(mm);
+ if (mm->hiwater_rss < _rss)
+ mm->hiwater_rss = _rss;
+}
+
+static inline void update_hiwater_vm(struct mm_struct *mm)
+{
+ if (mm->hiwater_vm < mm->total_vm)
+ mm->hiwater_vm = mm->total_vm;
+}

static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
{
Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c 2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/kernel/fork.c 2009-11-04 13:14:19.000000000 -0600
@@ -444,6 +444,8 @@ static void mm_init_aio(struct mm_struct

static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
{
+ int cpu;
+
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
init_rwsem(&mm->mmap_sem);
@@ -452,8 +454,11 @@ static struct mm_struct * mm_init(struct
(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
mm->core_state = NULL;
mm->nr_ptes = 0;
- set_mm_counter(mm, file_rss, 0);
- set_mm_counter(mm, anon_rss, 0);
+ for_each_possible_cpu(cpu) {
+ struct mm_counter *m;
+
+ memset(m, sizeof(struct mm_counter), 0);
+ }
spin_lock_init(&mm->page_table_lock);
mm->free_area_cache = TASK_UNMAPPED_BASE;
mm->cached_hole_size = ~0UL;
@@ -480,7 +485,13 @@ struct mm_struct * mm_alloc(void)
mm = allocate_mm();
if (mm) {
memset(mm, 0, sizeof(*mm));
- mm = mm_init(mm, current);
+ mm->rss = alloc_percpu(struct mm_counter);
+ if (mm->rss)
+ mm = mm_init(mm, current);
+ else {
+ free_mm(mm);
+ mm = NULL;
+ }
}
return mm;
}
@@ -496,6 +507,7 @@ void __mmdrop(struct mm_struct *mm)
mm_free_pgd(mm);
destroy_context(mm);
mmu_notifier_mm_destroy(mm);
+ free_percpu(mm->rss);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -631,6 +643,9 @@ struct mm_struct *dup_mm(struct task_str
goto fail_nomem;

memcpy(mm, oldmm, sizeof(*mm));
+ mm->rss = alloc_percpu(struct mm_counter);
+ if (!mm->rss)
+ goto fail_nomem;

/* Initializing for Swap token stuff */
mm->token_priority = 0;
@@ -661,15 +676,13 @@ free_pt:
mm->binfmt = NULL;
mmput(mm);

-fail_nomem:
- return NULL;
-
fail_nocontext:
/*
* If init_new_context() failed, we cannot use mmput() to free the mm
* because it calls destroy_context()
*/
mm_free_pgd(mm);
+fail_nomem:
free_mm(mm);
return NULL;
}
Index: linux-2.6/fs/proc/task_mmu.c
===================================================================
--- linux-2.6.orig/fs/proc/task_mmu.c 2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/fs/proc/task_mmu.c 2009-11-04 13:13:42.000000000 -0600
@@ -65,11 +65,21 @@ unsigned long task_vsize(struct mm_struc
int task_statm(struct mm_struct *mm, int *shared, int *text,
int *data, int *resident)
{
- *shared = get_mm_counter(mm, file_rss);
+ int cpu;
+ int anon_rss = 0;
+ int file_rss = 0;
+
+ for_each_possible_cpu(cpu) {
+ struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
+
+ anon_rss += c->anon;
+ file_rss += c->file;
+ }
+ *shared = file_rss;
*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
>> PAGE_SHIFT;
*data = mm->total_vm - mm->shared_vm;
- *resident = *shared + get_mm_counter(mm, anon_rss);
+ *resident = *shared + anon_rss;
return mm->total_vm;
}

Index: linux-2.6/mm/filemap_xip.c
===================================================================
--- linux-2.6.orig/mm/filemap_xip.c 2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/mm/filemap_xip.c 2009-11-04 13:13:42.000000000 -0600
@@ -194,7 +194,7 @@ retry:
flush_cache_page(vma, address, pte_pfn(*pte));
pteval = ptep_clear_flush_notify(vma, address, pte);
page_remove_rmap(page);
- dec_mm_counter(mm, file_rss);
+ __this_cpu_dec(mm->rss->file);
BUG_ON(pte_dirty(pteval));
pte_unmap_unlock(pte, ptl);
page_cache_release(page);
Index: linux-2.6/mm/fremap.c
===================================================================
--- linux-2.6.orig/mm/fremap.c 2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/mm/fremap.c 2009-11-04 13:13:42.000000000 -0600
@@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm
page_remove_rmap(page);
page_cache_release(page);
update_hiwater_rss(mm);
- dec_mm_counter(mm, file_rss);
+ __this_cpu_dec(mm->rss->file);
}
} else {
if (!pte_file(pte))
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c 2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/mm/memory.c 2009-11-04 13:13:42.000000000 -0600
@@ -379,9 +379,9 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
{
if (file_rss)
- add_mm_counter(mm, file_rss, file_rss);
+ __this_cpu_add(mm->rss->file, file_rss);
if (anon_rss)
- add_mm_counter(mm, anon_rss, anon_rss);
+ __this_cpu_add(mm->rss->anon, anon_rss);
}

/*
@@ -1512,7 +1512,7 @@ static int insert_page(struct vm_area_st

/* Ok, finally just insert the thing.. */
get_page(page);
- inc_mm_counter(mm, file_rss);
+ __this_cpu_inc(mm->rss->file);
page_add_file_rmap(page);
set_pte_at(mm, addr, pte, mk_pte(page, prot));

@@ -2148,11 +2148,11 @@ gotten:
if (likely(pte_same(*page_table, orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
- dec_mm_counter(mm, file_rss);
- inc_mm_counter(mm, anon_rss);
+ __this_cpu_dec(mm->rss->file);
+ __this_cpu_inc(mm->rss->anon);
}
} else
- inc_mm_counter(mm, anon_rss);
+ __this_cpu_inc(mm->rss->anon);
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2579,7 +2579,7 @@ static int do_swap_page(struct mm_struct
* discarded at swap_free().
*/

- inc_mm_counter(mm, anon_rss);
+ __this_cpu_inc(mm->rss->anon);
pte = mk_pte(page, vma->vm_page_prot);
if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -2663,7 +2663,7 @@ static int do_anonymous_page(struct mm_s
if (!pte_none(*page_table))
goto release;

- inc_mm_counter(mm, anon_rss);
+ __this_cpu_inc(mm->rss->anon);
page_add_new_anon_rmap(page, vma, address);
setpte:
set_pte_at(mm, address, page_table, entry);
@@ -2817,10 +2817,10 @@ static int __do_fault(struct mm_struct *
if (flags & FAULT_FLAG_WRITE)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (anon) {
- inc_mm_counter(mm, anon_rss);
+ __this_cpu_inc(mm->rss->anon);
page_add_new_anon_rmap(page, vma, address);
} else {
- inc_mm_counter(mm, file_rss);
+ __this_cpu_inc(mm->rss->file);
page_add_file_rmap(page);
if (flags & FAULT_FLAG_WRITE) {
dirty_page = page;
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c 2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/mm/rmap.c 2009-11-04 13:13:42.000000000 -0600
@@ -809,9 +809,9 @@ static int try_to_unmap_one(struct page

if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
if (PageAnon(page))
- dec_mm_counter(mm, anon_rss);
+ __this_cpu_dec(mm->rss->anon);
else
- dec_mm_counter(mm, file_rss);
+ __this_cpu_dec(mm->rss->file);
set_pte_at(mm, address, pte,
swp_entry_to_pte(make_hwpoison_entry(page)));
} else if (PageAnon(page)) {
@@ -829,7 +829,7 @@ static int try_to_unmap_one(struct page
list_add(&mm->mmlist, &init_mm.mmlist);
spin_unlock(&mmlist_lock);
}
- dec_mm_counter(mm, anon_rss);
+ __this_cpu_dec(mm->rss->anon);
} else if (PAGE_MIGRATION) {
/*
* Store the pfn of the page in a special migration
@@ -847,7 +847,7 @@ static int try_to_unmap_one(struct page
entry = make_migration_entry(page, pte_write(pteval));
set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
} else
- dec_mm_counter(mm, file_rss);
+ __this_cpu_dec(mm->rss->file);


page_remove_rmap(page);
@@ -967,7 +967,7 @@ static int try_to_unmap_cluster(unsigned

page_remove_rmap(page);
page_cache_release(page);
- dec_mm_counter(mm, file_rss);
+ __this_cpu_dec(mm->rss->file);
(*mapcount)--;
}
pte_unmap_unlock(pte - 1, ptl);
Index: linux-2.6/mm/swapfile.c
===================================================================
--- linux-2.6.orig/mm/swapfile.c 2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/mm/swapfile.c 2009-11-04 13:13:42.000000000 -0600
@@ -831,7 +831,7 @@ static int unuse_pte(struct vm_area_stru
goto out;
}

- inc_mm_counter(vma->vm_mm, anon_rss);
+ __this_cpu_inc(vma->vm_mm->rss->anon);
get_page(page);
set_pte_at(vma->vm_mm, addr, pte,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
Index: linux-2.6/mm/init-mm.c
===================================================================
--- linux-2.6.orig/mm/init-mm.c 2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/mm/init-mm.c 2009-11-04 13:13:42.000000000 -0600
@@ -8,6 +8,8 @@
#include <asm/atomic.h>
#include <asm/pgtable.h>

+DEFINE_PER_CPU(struct mm_counter, init_mm_counters);
+
struct mm_struct init_mm = {
.mm_rb = RB_ROOT,
.pgd = swapper_pg_dir,
@@ -17,4 +19,5 @@ struct mm_struct init_mm = {
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
.cpu_vm_mask = CPU_MASK_ALL,
+ .rss = &init_mm_counters,
};


2009-11-04 19:18:14

by Christoph Lameter

Subject: Re: [MM] Remove rss batching from copy_page_range()

From: Christoph Lameter <[email protected]>
Subject: Remove rss batching from copy_page_range()

With per cpu counters in mm there is no need for batching
mm counter updates anymore. Update counters directly while
copying pages.

Signed-off-by: Christoph Lameter <[email protected]>

---
mm/memory.c | 27 ++++++++-------------------
1 file changed, 8 insertions(+), 19 deletions(-)

Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c 2009-11-04 12:15:03.000000000 -0600
+++ linux-2.6/mm/memory.c 2009-11-04 13:03:45.000000000 -0600
@@ -376,14 +376,6 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
return 0;
}

-static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
-{
- if (file_rss)
- __this_cpu_add(mm->rss->file, file_rss);
- if (anon_rss)
- __this_cpu_add(mm->rss->anon, anon_rss);
-}
-
/*
* This function is called to print an error when a bad pte
* is found. For example, we might have a PFN-mapped pte in
@@ -575,7 +567,7 @@ out:
static inline void
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
- unsigned long addr, int *rss)
+ unsigned long addr)
{
unsigned long vm_flags = vma->vm_flags;
pte_t pte = *src_pte;
@@ -630,7 +622,10 @@ copy_one_pte(struct mm_struct *dst_mm, s
if (page) {
get_page(page);
page_dup_rmap(page);
- rss[PageAnon(page)]++;
+ if (PageAnon(page))
+ __this_cpu_inc(dst_mm->rss->anon);
+ else
+ __this_cpu_inc(dst_mm->rss->file);
}

out_set_pte:
@@ -645,10 +640,8 @@ static int copy_pte_range(struct mm_stru
pte_t *src_pte, *dst_pte;
spinlock_t *src_ptl, *dst_ptl;
int progress = 0;
- int rss[2];

again:
- rss[1] = rss[0] = 0;
dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
if (!dst_pte)
return -ENOMEM;
@@ -674,14 +667,13 @@ again:
progress++;
continue;
}
- copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
+ copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr);
progress += 8;
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

arch_leave_lazy_mmu_mode();
spin_unlock(src_ptl);
pte_unmap_nested(orig_src_pte);
- add_mm_rss(dst_mm, rss[0], rss[1]);
pte_unmap_unlock(orig_dst_pte, dst_ptl);
cond_resched();
if (addr != end)
@@ -803,8 +795,6 @@ static unsigned long zap_pte_range(struc
struct mm_struct *mm = tlb->mm;
pte_t *pte;
spinlock_t *ptl;
- int file_rss = 0;
- int anon_rss = 0;

pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
arch_enter_lazy_mmu_mode();
@@ -850,14 +840,14 @@ static unsigned long zap_pte_range(struc
set_pte_at(mm, addr, pte,
pgoff_to_pte(page->index));
if (PageAnon(page))
- anon_rss--;
+ __this_cpu_dec(mm->rss->anon);
else {
if (pte_dirty(ptent))
set_page_dirty(page);
if (pte_young(ptent) &&
likely(!VM_SequentialReadHint(vma)))
mark_page_accessed(page);
- file_rss--;
+ __this_cpu_dec(mm->rss->file);
}
page_remove_rmap(page);
if (unlikely(page_mapcount(page) < 0))
@@ -880,7 +870,6 @@ static unsigned long zap_pte_range(struc
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));

- add_mm_rss(mm, file_rss, anon_rss);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);

2009-11-04 21:01:03

by Andi Kleen

Subject: Re: [MM] Make mm counters per cpu instead of atomic

Christoph Lameter <[email protected]> writes:
>
> One price to pay for these improvements is the need to scan over all percpu
> counters when the actual count values are needed.

Do you have numbers how costly alloc_percpu() is? I wonder what this
does to fork() overhead.

-Andi


--
[email protected] -- Speaking for myself only.

2009-11-04 21:02:17

by Andi Kleen

Subject: Re: [MM] Remove rss batching from copy_page_range()

Christoph Lameter <[email protected]> writes:

> From: Christoph Lameter <[email protected]>
> Subject: Remove rss batching from copy_page_range()
>
> With per cpu counters in mm there is no need for batching
> mm counter updates anymore. Update counters directly while
> copying pages.

Hmm, but with all the inlining with some luck the local
counters will be in registers. That will never be the case
with the per cpu counters.

So I'm not sure it's really an improvement?

-Andi

--
[email protected] -- Speaking for myself only.

2009-11-04 22:03:46

by Christoph Lameter

Subject: Re: [MM] Remove rss batching from copy_page_range()

On Wed, 4 Nov 2009, Andi Kleen wrote:

> > With per cpu counters in mm there is no need for batching
> > mm counter updates anymore. Update counters directly while
> > copying pages.
>
> Hmm, but with all the inlining with some luck the local
> counters will be in registers. That will never be the case
> with the per cpu counters.

The function is too big for that to occur and the counters have to be
preserved across function calls. The code is shorter with the patch
applied:

christoph@:~/n/linux-2.6$ size mm/memory.o
text data bss dec hex filename
20140 56 40 20236 4f0c mm/memory.o
christoph@:~/n/linux-2.6$ quilt push
Applying patch mmcounter
patching file include/linux/mm_types.h
patching file include/linux/sched.h
patching file kernel/fork.c
patching file fs/proc/task_mmu.c
patching file mm/filemap_xip.c
patching file mm/fremap.c
patching file mm/memory.c
patching file mm/rmap.c
patching file mm/swapfile.c
patching file mm/init-mm.c

Now at patch mmcounter
christoph@:~/n/linux-2.6$ make mm/memory.o
CHK include/linux/version.h
CHK include/linux/utsrelease.h
UPD include/linux/utsrelease.h
SYMLINK include/asm -> include/asm-x86
CC arch/x86/kernel/asm-offsets.s
GEN include/asm/asm-offsets.h
CALL scripts/checksyscalls.sh
CC mm/memory.o
christoph@:~/n/linux-2.6$ size mm/memory.o
text data bss dec hex filename
20028 56 40 20124 4e9c mm/memory.o
christoph@:~/n/linux-2.6$ quilt push
Applying patch simplify
patching file mm/memory.c

Now at patch simplify
christoph@:~/n/linux-2.6$ make mm/memory.o
CHK include/linux/version.h
CHK include/linux/utsrelease.h
SYMLINK include/asm -> include/asm-x86
CALL scripts/checksyscalls.sh
CC mm/memory.o
christoph@:~/n/linux-2.6$ size mm/memory.o
text data bss dec hex filename
19888 56 40 19984 4e10 mm/memory.o

2009-11-05 00:26:09

by Dave Jones

Subject: Re: [MM] Make mm counters per cpu instead of atomic

On Wed, Nov 04, 2009 at 02:14:41PM -0500, Christoph Lameter wrote:

> + memset(m, sizeof(struct mm_counter), 0);

Args wrong way around.

Dave

2009-11-05 01:19:24

by Kamezawa Hiroyuki

Subject: Re: [MM] Make mm counters per cpu instead of atomic

On Wed, 4 Nov 2009 14:14:41 -0500 (EST)
Christoph Lameter <[email protected]> wrote:

> From: Christoph Lameter <[email protected]>
> Subject: Make mm counters per cpu
>
> Changing the mm counters to per cpu counters is possible after the introduction
> of the generic per cpu operations (currently in percpu and -next).
>
> With that the contention on the counters in mm_struct can be avoided. The
> USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems do not
> need to perform atomic updates to mm counters anymore. Various code paths
> can be simplified since per cpu counter updates are fast and batching
> of counter updates is no longer needed.
>
> One price to pay for these improvements is the need to scan over all percpu
> counters when the actual count values are needed.
>
> Signed-off-by: Christoph Lameter <[email protected]>
>

Hmm, I don't fully understand _new_ percpu but...
In logical (even if not realistic), x86-32 supports up to 512 ? cpus in Kconfig.
BIGSMP.

Then, if 65536 process runs, this consumes

65536(nr_proc) * 8 (size) * 512(cpus) = 256MBytes.

But x86's vmalloc area only has 80? MBytes. I (and my customers) don't have
this kind of extreme machine, but cpus tend to be many-core (and still support
32bit mode) now.

With 32 or 64 cpus,
65536 * 8 * 32 = 16MB
65536 * 8 * 64 = 32MB

And if I add swap_usage,
65536 * 12 * 32 = 24MB.

It's influenced by the number of devices attached to the system, but
people will see more -ENOMEM.

It seems this consumption/footprint is very big.

Thanks,
-Kame

2009-11-05 08:27:38

by Andi Kleen

Subject: Re: [MM] Remove rss batching from copy_page_range()

On Wed, Nov 04, 2009 at 05:02:12PM -0500, Christoph Lameter wrote:
> On Wed, 4 Nov 2009, Andi Kleen wrote:
>
> > > With per cpu counters in mm there is no need for batching
> > > mm counter updates anymore. Update counters directly while
> > > copying pages.
> >
> > Hmm, but with all the inlining with some luck the local
> > counters will be in registers. That will never be the case
> > with the per cpu counters.
>
> The function is too big for that to occur and the counters have to be

If it's only called once then gcc doesn't care about size.

> preserved across function calls. The code is shorter with the patch
> applied:

I see. Thanks for the data.

-Andi

2009-11-05 19:44:53

by Christoph Lameter

Subject: Re: [MM] Make mm counters per cpu instead of atomic

On Wed, 4 Nov 2009, Dave Jones wrote:

> On Wed, Nov 04, 2009 at 02:14:41PM -0500, Christoph Lameter wrote:
>
> > + memset(m, sizeof(struct mm_counter), 0);
>
> Args wrong way around.

Right. It works because percpu_alloc zeroes the data anyways.
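
For reference, a corrected version of that loop would look roughly like the
sketch below (illustrative only; note the posted loop also never assigned the
pointer), though it is redundant since alloc_percpu() already returns zeroed
memory:

	for_each_possible_cpu(cpu) {
		struct mm_counter *m = per_cpu_ptr(mm->rss, cpu);

		/* memset takes (dest, value, size) */
		memset(m, 0, sizeof(struct mm_counter));
	}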

2009-11-05 18:02:57

by Christoph Lameter

Subject: Re: [MM] Make mm counters per cpu instead of atomic

On Thu, 5 Nov 2009, KAMEZAWA Hiroyuki wrote:

> Hmm, I don't fully understand _new_ percpu but...
> In logical (even if not realistic), x86-32 supports up to 512 ? cpus in Kconfig.
> BIGSMP.

x86-32 only supports 32 processors. Plus per cpu areas are only allocated
for the possible processors.

> Then, if 65536 process runs, this consumes
>
> 65536(nr_proc) * 8 (size) * 512(cpus) = 256MBytes.

With 32 possible cpus this results in 16m of per cpu space use.

2009-11-05 20:23:33

by Christoph Lameter

Subject: Re: [MM] Make mm counters per cpu instead of atomic V2

From: Christoph Lameter <[email protected]>
Subject: Make mm counters per cpu V2

Changing the mm counters to per cpu counters is possible after the introduction
of the generic per cpu operations (currently in percpu and -next).

With that the contention on the counters in mm_struct can be avoided. The
USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems do not
need to perform atomic updates to mm counters anymore. Various code paths
can be simplified since per cpu counter updates are fast and batching
of counter updates is no longer needed.

One price to pay for these improvements is the need to scan over all percpu
counters when the actual count values are needed.

V1->V2
- Remove useless and buggy per cpu counter initialization.
alloc_percpu already zeros the values.

Signed-off-by: Christoph Lameter <[email protected]>

---
fs/proc/task_mmu.c | 14 +++++++++-
include/linux/mm_types.h | 16 ++++--------
include/linux/sched.h | 61 ++++++++++++++++++++---------------------------
kernel/fork.c | 18 +++++++++----
mm/filemap_xip.c | 2 -
mm/fremap.c | 2 -
mm/init-mm.c | 3 ++
mm/memory.c | 20 +++++++--------
mm/rmap.c | 10 +++----
mm/swapfile.c | 2 -
10 files changed, 77 insertions(+), 71 deletions(-)

Index: linux-2.6/include/linux/mm_types.h
===================================================================
--- linux-2.6.orig/include/linux/mm_types.h 2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/include/linux/mm_types.h 2009-11-05 09:22:37.000000000 -0600
@@ -24,11 +24,10 @@ struct address_space;

#define USE_SPLIT_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)

-#if USE_SPLIT_PTLOCKS
-typedef atomic_long_t mm_counter_t;
-#else /* !USE_SPLIT_PTLOCKS */
-typedef unsigned long mm_counter_t;
-#endif /* !USE_SPLIT_PTLOCKS */
+struct mm_counter {
+ long file;
+ long anon;
+};

/*
* Each physical page in the system has a struct page associated with
@@ -223,11 +222,8 @@ struct mm_struct {
* by mmlist_lock
*/

- /* Special counters, in some configurations protected by the
- * page_table_lock, in other configurations by being atomic.
- */
- mm_counter_t _file_rss;
- mm_counter_t _anon_rss;
+ /* Special percpu counters */
+ struct mm_counter *rss;

unsigned long hiwater_rss; /* High-watermark of RSS usage */
unsigned long hiwater_vm; /* High-water virtual memory usage */
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h 2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/include/linux/sched.h 2009-11-05 09:22:37.000000000 -0600
@@ -385,41 +385,32 @@ arch_get_unmapped_area_topdown(struct fi
extern void arch_unmap_area(struct mm_struct *, unsigned long);
extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);

-#if USE_SPLIT_PTLOCKS
-/*
- * The mm counters are not protected by its page_table_lock,
- * so must be incremented atomically.
- */
-#define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value)
-#define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member))
-#define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member)
-#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
-#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
-
-#else /* !USE_SPLIT_PTLOCKS */
-/*
- * The mm counters are protected by its page_table_lock,
- * so can be incremented directly.
- */
-#define set_mm_counter(mm, member, value) (mm)->_##member = (value)
-#define get_mm_counter(mm, member) ((mm)->_##member)
-#define add_mm_counter(mm, member, value) (mm)->_##member += (value)
-#define inc_mm_counter(mm, member) (mm)->_##member++
-#define dec_mm_counter(mm, member) (mm)->_##member--
-
-#endif /* !USE_SPLIT_PTLOCKS */
-
-#define get_mm_rss(mm) \
- (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
-#define update_hiwater_rss(mm) do { \
- unsigned long _rss = get_mm_rss(mm); \
- if ((mm)->hiwater_rss < _rss) \
- (mm)->hiwater_rss = _rss; \
-} while (0)
-#define update_hiwater_vm(mm) do { \
- if ((mm)->hiwater_vm < (mm)->total_vm) \
- (mm)->hiwater_vm = (mm)->total_vm; \
-} while (0)
+static inline unsigned long get_mm_rss(struct mm_struct *mm)
+{
+ int cpu;
+ unsigned long r = 0;
+
+ for_each_possible_cpu(cpu) {
+ struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
+
+ r = c->file + c->anon;
+ }
+
+ return r;
+}
+
+static inline void update_hiwater_rss(struct mm_struct *mm)
+{
+ unsigned long _rss = get_mm_rss(mm);
+ if (mm->hiwater_rss < _rss)
+ mm->hiwater_rss = _rss;
+}
+
+static inline void update_hiwater_vm(struct mm_struct *mm)
+{
+ if (mm->hiwater_vm < mm->total_vm)
+ mm->hiwater_vm = mm->total_vm;
+}

static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
{
Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c 2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/kernel/fork.c 2009-11-05 09:25:30.000000000 -0600
@@ -452,8 +452,6 @@ static struct mm_struct * mm_init(struct
(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
mm->core_state = NULL;
mm->nr_ptes = 0;
- set_mm_counter(mm, file_rss, 0);
- set_mm_counter(mm, anon_rss, 0);
spin_lock_init(&mm->page_table_lock);
mm->free_area_cache = TASK_UNMAPPED_BASE;
mm->cached_hole_size = ~0UL;
@@ -480,7 +478,13 @@ struct mm_struct * mm_alloc(void)
mm = allocate_mm();
if (mm) {
memset(mm, 0, sizeof(*mm));
- mm = mm_init(mm, current);
+ mm->rss = alloc_percpu(struct mm_counter);
+ if (mm->rss)
+ mm = mm_init(mm, current);
+ else {
+ free_mm(mm);
+ mm = NULL;
+ }
}
return mm;
}
@@ -496,6 +500,7 @@ void __mmdrop(struct mm_struct *mm)
mm_free_pgd(mm);
destroy_context(mm);
mmu_notifier_mm_destroy(mm);
+ free_percpu(mm->rss);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -631,6 +636,9 @@ struct mm_struct *dup_mm(struct task_str
goto fail_nomem;

memcpy(mm, oldmm, sizeof(*mm));
+ mm->rss = alloc_percpu(struct mm_counter);
+ if (!mm->rss)
+ goto fail_nomem;

/* Initializing for Swap token stuff */
mm->token_priority = 0;
@@ -661,15 +669,13 @@ free_pt:
mm->binfmt = NULL;
mmput(mm);

-fail_nomem:
- return NULL;
-
fail_nocontext:
/*
* If init_new_context() failed, we cannot use mmput() to free the mm
* because it calls destroy_context()
*/
mm_free_pgd(mm);
+fail_nomem:
free_mm(mm);
return NULL;
}
Index: linux-2.6/fs/proc/task_mmu.c
===================================================================
--- linux-2.6.orig/fs/proc/task_mmu.c 2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/fs/proc/task_mmu.c 2009-11-05 09:22:37.000000000 -0600
@@ -65,11 +65,21 @@ unsigned long task_vsize(struct mm_struc
int task_statm(struct mm_struct *mm, int *shared, int *text,
int *data, int *resident)
{
- *shared = get_mm_counter(mm, file_rss);
+ int cpu;
+ int anon_rss = 0;
+ int file_rss = 0;
+
+ for_each_possible_cpu(cpu) {
+ struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
+
+ anon_rss += c->anon;
+ file_rss += c->file;
+ }
+ *shared = file_rss;
*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
>> PAGE_SHIFT;
*data = mm->total_vm - mm->shared_vm;
- *resident = *shared + get_mm_counter(mm, anon_rss);
+ *resident = *shared + anon_rss;
return mm->total_vm;
}

Index: linux-2.6/mm/filemap_xip.c
===================================================================
--- linux-2.6.orig/mm/filemap_xip.c 2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/mm/filemap_xip.c 2009-11-05 09:22:37.000000000 -0600
@@ -194,7 +194,7 @@ retry:
flush_cache_page(vma, address, pte_pfn(*pte));
pteval = ptep_clear_flush_notify(vma, address, pte);
page_remove_rmap(page);
- dec_mm_counter(mm, file_rss);
+ __this_cpu_dec(mm->rss->file);
BUG_ON(pte_dirty(pteval));
pte_unmap_unlock(pte, ptl);
page_cache_release(page);
Index: linux-2.6/mm/fremap.c
===================================================================
--- linux-2.6.orig/mm/fremap.c 2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/mm/fremap.c 2009-11-05 09:22:37.000000000 -0600
@@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm
page_remove_rmap(page);
page_cache_release(page);
update_hiwater_rss(mm);
- dec_mm_counter(mm, file_rss);
+ __this_cpu_dec(mm->rss->file);
}
} else {
if (!pte_file(pte))
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c 2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/mm/memory.c 2009-11-05 09:22:37.000000000 -0600
@@ -379,9 +379,9 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
{
if (file_rss)
- add_mm_counter(mm, file_rss, file_rss);
+ __this_cpu_add(mm->rss->file, file_rss);
if (anon_rss)
- add_mm_counter(mm, anon_rss, anon_rss);
+ __this_cpu_add(mm->rss->anon, anon_rss);
}

/*
@@ -1512,7 +1512,7 @@ static int insert_page(struct vm_area_st

/* Ok, finally just insert the thing.. */
get_page(page);
- inc_mm_counter(mm, file_rss);
+ __this_cpu_inc(mm->rss->file);
page_add_file_rmap(page);
set_pte_at(mm, addr, pte, mk_pte(page, prot));

@@ -2148,11 +2148,11 @@ gotten:
if (likely(pte_same(*page_table, orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
- dec_mm_counter(mm, file_rss);
- inc_mm_counter(mm, anon_rss);
+ __this_cpu_dec(mm->rss->file);
+ __this_cpu_inc(mm->rss->anon);
}
} else
- inc_mm_counter(mm, anon_rss);
+ __this_cpu_inc(mm->rss->anon);
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2579,7 +2579,7 @@ static int do_swap_page(struct mm_struct
* discarded at swap_free().
*/

- inc_mm_counter(mm, anon_rss);
+ __this_cpu_inc(mm->rss->anon);
pte = mk_pte(page, vma->vm_page_prot);
if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -2663,7 +2663,7 @@ static int do_anonymous_page(struct mm_s
if (!pte_none(*page_table))
goto release;

- inc_mm_counter(mm, anon_rss);
+ __this_cpu_inc(mm->rss->anon);
page_add_new_anon_rmap(page, vma, address);
setpte:
set_pte_at(mm, address, page_table, entry);
@@ -2817,10 +2817,10 @@ static int __do_fault(struct mm_struct *
if (flags & FAULT_FLAG_WRITE)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (anon) {
- inc_mm_counter(mm, anon_rss);
+ __this_cpu_inc(mm->rss->anon);
page_add_new_anon_rmap(page, vma, address);
} else {
- inc_mm_counter(mm, file_rss);
+ __this_cpu_inc(mm->rss->file);
page_add_file_rmap(page);
if (flags & FAULT_FLAG_WRITE) {
dirty_page = page;
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c 2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/mm/rmap.c 2009-11-05 09:22:37.000000000 -0600
@@ -809,9 +809,9 @@ static int try_to_unmap_one(struct page

if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
if (PageAnon(page))
- dec_mm_counter(mm, anon_rss);
+ __this_cpu_dec(mm->rss->anon);
else
- dec_mm_counter(mm, file_rss);
+ __this_cpu_dec(mm->rss->file);
set_pte_at(mm, address, pte,
swp_entry_to_pte(make_hwpoison_entry(page)));
} else if (PageAnon(page)) {
@@ -829,7 +829,7 @@ static int try_to_unmap_one(struct page
list_add(&mm->mmlist, &init_mm.mmlist);
spin_unlock(&mmlist_lock);
}
- dec_mm_counter(mm, anon_rss);
+ __this_cpu_dec(mm->rss->anon);
} else if (PAGE_MIGRATION) {
/*
* Store the pfn of the page in a special migration
@@ -847,7 +847,7 @@ static int try_to_unmap_one(struct page
entry = make_migration_entry(page, pte_write(pteval));
set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
} else
- dec_mm_counter(mm, file_rss);
+ __this_cpu_dec(mm->rss->file);


page_remove_rmap(page);
@@ -967,7 +967,7 @@ static int try_to_unmap_cluster(unsigned

page_remove_rmap(page);
page_cache_release(page);
- dec_mm_counter(mm, file_rss);
+ __this_cpu_dec(mm->rss->file);
(*mapcount)--;
}
pte_unmap_unlock(pte - 1, ptl);
Index: linux-2.6/mm/swapfile.c
===================================================================
--- linux-2.6.orig/mm/swapfile.c 2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/mm/swapfile.c 2009-11-05 09:22:37.000000000 -0600
@@ -831,7 +831,7 @@ static int unuse_pte(struct vm_area_stru
goto out;
}

- inc_mm_counter(vma->vm_mm, anon_rss);
+ __this_cpu_inc(vma->vm_mm->rss->anon);
get_page(page);
set_pte_at(vma->vm_mm, addr, pte,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
Index: linux-2.6/mm/init-mm.c
===================================================================
--- linux-2.6.orig/mm/init-mm.c 2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/mm/init-mm.c 2009-11-05 09:22:37.000000000 -0600
@@ -8,6 +8,8 @@
#include <asm/atomic.h>
#include <asm/pgtable.h>

+DEFINE_PER_CPU(struct mm_counter, init_mm_counters);
+
struct mm_struct init_mm = {
.mm_rb = RB_ROOT,
.pgd = swapper_pg_dir,
@@ -17,4 +19,5 @@ struct mm_struct init_mm = {
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
.cpu_vm_mask = CPU_MASK_ALL,
+ .rss = &init_mm_counters,
};

2009-11-05 23:45:12

by Kamezawa Hiroyuki

Subject: Re: [MM] Make mm counters per cpu instead of atomic

On Thu, 5 Nov 2009 10:10:56 -0500 (EST)
Christoph Lameter <[email protected]> wrote:

> On Thu, 5 Nov 2009, KAMEZAWA Hiroyuki wrote:
>
> > Hmm, I don't fully understand _new_ percpu but...
> > In logical (even if not realistic), x86-32 supports up to 512 ? cpus in Kconfig.
> > BIGSMP.
>
> x86-32 only supports 32 processors. Plus per cpu areas are only allocated
> for the possible processors.
>
My number is just from Kconfig.

> > Then, if 65536 process runs, this consumes
> >
> > 65536(nr_proc) * 8 (size) * 512(cpus) = 256MBytes.
>
> With 32 possible cpus this results in 16m of per cpu space use.
>
If swap_usage is added, 24m, 25% of the vmalloc area.
(But, yes, returning -ENOMEM to fork() is ok to me; 65536 procs is an extreme case.)

Thanks,
-Kame

2009-11-06 01:13:39

by Kamezawa Hiroyuki

Subject: Re: [MM] Make mm counters per cpu instead of atomic V2

On Thu, 5 Nov 2009 10:36:06 -0500 (EST)
Christoph Lameter <[email protected]> wrote:

> From: Christoph Lameter <[email protected]>
> Subject: Make mm counters per cpu V2
>
> Changing the mm counters to per cpu counters is possible after the introduction
> of the generic per cpu operations (currently in percpu and -next).
>
> With that the contention on the counters in mm_struct can be avoided. The
> USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems do not
> need to perform atomic updates to mm counters anymore. Various code paths
> can be simplified since per cpu counter updates are fast and batching
> of counter updates is no longer needed.
>
> One price to pay for these improvements is the need to scan over all percpu
> counters when the actual count values are needed.
>
> V1->V2
> - Remove useless and buggy per cpu counter initialization.
> alloc_percpu already zeros the values.
>
> Signed-off-by: Christoph Lameter <[email protected]>
>
Thanks. My small concern is read-side.

This is the result of 'top -b -n 1' with 2000 processes(most of them just sleep)
on my 8cpu, SMP box.

== [Before]
Performance counter stats for 'top -b -n 1' (5 runs):

406.690304 task-clock-msecs # 0.442 CPUs ( +- 3.327% )
32 context-switches # 0.000 M/sec ( +- 0.000% )
0 CPU-migrations # 0.000 M/sec ( +- 0.000% )
718 page-faults # 0.002 M/sec ( +- 0.000% )
987832447 cycles # 2428.955 M/sec ( +- 2.655% )
933831356 instructions # 0.945 IPC ( +- 2.585% )
17383990 cache-references # 42.745 M/sec ( +- 1.676% )
353620 cache-misses # 0.870 M/sec ( +- 0.614% )

0.920712639 seconds time elapsed ( +- 1.609% )

== [After]
Performance counter stats for 'top -b -n 1' (5 runs):

675.926348 task-clock-msecs # 0.568 CPUs ( +- 0.601% )
62 context-switches # 0.000 M/sec ( +- 1.587% )
0 CPU-migrations # 0.000 M/sec ( +- 0.000% )
1095 page-faults # 0.002 M/sec ( +- 0.000% )
1896320818 cycles # 2805.514 M/sec ( +- 1.494% )
1790600289 instructions # 0.944 IPC ( +- 1.333% )
35406398 cache-references # 52.382 M/sec ( +- 0.876% )
722781 cache-misses # 1.069 M/sec ( +- 0.192% )

1.190605561 seconds time elapsed ( +- 0.417% )

Because I know 'ps'-related workloads are used in various ways, my concern is
how this will behave on large SMP systems.

Maybe the usual use of 'ps -elf' will not read the RSS value and will not be affected by this.
If this counter supported a single-thread mode (most apps are single threaded),
the impact would not be big.

Thanks,
-Kame

2009-11-06 03:26:19

by Kamezawa Hiroyuki

Subject: Re: [MM] Make mm counters per cpu instead of atomic V2

On Fri, 6 Nov 2009 10:11:06 +0900
KAMEZAWA Hiroyuki <[email protected]> wrote:
> This is the result of 'top -b -n 1' with 2000 processes(most of them just sleep)
> on my 8cpu, SMP box.
>
> == [Before]
> Performance counter stats for 'top -b -n 1' (5 runs):
>
> 406.690304 task-clock-msecs # 0.442 CPUs ( +- 3.327% )
> 32 context-switches # 0.000 M/sec ( +- 0.000% )
> 0 CPU-migrations # 0.000 M/sec ( +- 0.000% )
> 718 page-faults # 0.002 M/sec ( +- 0.000% )
> 987832447 cycles # 2428.955 M/sec ( +- 2.655% )
> 933831356 instructions # 0.945 IPC ( +- 2.585% )
> 17383990 cache-references # 42.745 M/sec ( +- 1.676% )
> 353620 cache-misses # 0.870 M/sec ( +- 0.614% )
>
> 0.920712639 seconds time elapsed ( +- 1.609% )
>
> == [After]
> Performance counter stats for 'top -b -n 1' (5 runs):
>
> 675.926348 task-clock-msecs # 0.568 CPUs ( +- 0.601% )
> 62 context-switches # 0.000 M/sec ( +- 1.587% )
> 0 CPU-migrations # 0.000 M/sec ( +- 0.000% )
> 1095 page-faults # 0.002 M/sec ( +- 0.000% )
> 1896320818 cycles # 2805.514 M/sec ( +- 1.494% )
> 1790600289 instructions # 0.944 IPC ( +- 1.333% )
> 35406398 cache-references # 52.382 M/sec ( +- 0.876% )
> 722781 cache-misses # 1.069 M/sec ( +- 0.192% )
>
> 1.190605561 seconds time elapsed ( +- 0.417% )
>
> Because I know 'ps'-related workloads are used in various ways, my concern is
> how this will behave on large SMP systems.
>
> Maybe the usual use of 'ps -elf' will not read the RSS value and will not be affected by this.
> If this counter supported a single-thread mode (most apps are single threaded),
> the impact would not be big.
>

I measured extreme-case benefits with the attached program.
Please see the # of page faults. Bigger is better.
Please let me know if my program is buggy.
Excuse:
My .config may not be tuned for an extreme performance challenge, and my host only has 8 cpus.
(memcg is enabled, hahaha...)

The # of page faults is not very stable (affected by task-clock-msecs)
but maybe we have some improvement.

I'd like to see the score of "top" and this on big servers......

BTW, can't we have single-thread-mode for this counter ?
Usual program's read-side will get much benefit.....


==[Before]==
Performance counter stats for './multi-fault 8' (5 runs):

474810.516710 task-clock-msecs # 7.912 CPUs ( +- 0.006% )
10713 context-switches # 0.000 M/sec ( +- 2.529% )
8 CPU-migrations # 0.000 M/sec ( +- 0.000% )
16669105 page-faults # 0.035 M/sec ( +- 0.449% )
1487101488902 cycles # 3131.989 M/sec ( +- 0.012% )
307164795479 instructions # 0.207 IPC ( +- 0.177% )
2355518599 cache-references # 4.961 M/sec ( +- 0.420% )
901969818 cache-misses # 1.900 M/sec ( +- 0.824% )

60.008425257 seconds time elapsed ( +- 0.004% )

==[After]==
Performance counter stats for './multi-fault 8' (5 runs):

474212.969563 task-clock-msecs # 7.902 CPUs ( +- 0.007% )
10281 context-switches # 0.000 M/sec ( +- 0.156% )
9 CPU-migrations # 0.000 M/sec ( +- 0.000% )
16795696 page-faults # 0.035 M/sec ( +- 2.218% )
1485411063159 cycles # 3132.371 M/sec ( +- 0.014% )
305810331186 instructions # 0.206 IPC ( +- 0.133% )
2391293765 cache-references # 5.043 M/sec ( +- 0.737% )
890490519 cache-misses # 1.878 M/sec ( +- 0.212% )

60.010631769 seconds time elapsed ( +- 0.004% )

Thanks,
-Kame

==

/*
* multi-fault.c :: causes 60secs of parallel page fault in multi-thread.
* % gcc -O2 -o multi-fault multi-fault.c -lpthread
* % multi-fault # of cpus.
*/

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>	/* atoi() */
#include <unistd.h>	/* sleep() */
#include <pthread.h>
#include <sched.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

#define NR_THREADS 32
pthread_t threads[NR_THREADS];
/*
* For avoiding contention in page table lock, FAULT area is
* sparse. If FAULT_LENGTH is too large for your cpus, decrease it.
*/
#define MMAP_LENGTH (8 * 1024 * 1024)
#define FAULT_LENGTH (2 * 1024 * 1024)
void *mmap_area[NR_THREADS];
#define PAGE_SIZE 4096

pthread_barrier_t barrier;
int name[NR_THREADS];

void *worker(void *data)
{
        int cpu = *(int *)data;
        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(cpu, &set);
        sched_setaffinity(0, sizeof(set), &set);
        pthread_barrier_wait(&barrier);

        while (1) {
                char *c;
                char *start = mmap_area[cpu];
                char *end = mmap_area[cpu] + FAULT_LENGTH;

                for (c = start; c < end; c += PAGE_SIZE)
                        *c = 0;

                madvise(start, FAULT_LENGTH, MADV_DONTNEED);
        }
        return NULL;
}

int main(int argc, char *argv[])
{
        int i, num, ret;

        if (argc < 2)
                return 0;

        num = atoi(argv[1]);
        if (num > NR_THREADS)
                num = NR_THREADS;       /* don't overflow the fixed-size arrays */

        pthread_barrier_init(&barrier, NULL, num + 1);

        for (i = 0; i < num; i++) {
                name[i] = i;
                ret = pthread_create(&threads[i], NULL, worker, &name[i]);
                if (ret != 0) { /* pthread_create() returns an error number, not -1 */
                        perror("pthread create");
                        return 0;
                }
                mmap_area[i] = mmap(NULL, MMAP_LENGTH,
                                    PROT_WRITE | PROT_READ,
                                    MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
        }
        pthread_barrier_wait(&barrier);
        sleep(60);
        return 0;
}




2009-11-06 04:11:25

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [MM] Make mm counters per cpu instead of atomic V2

On Thu, 5 Nov 2009 10:36:06 -0500 (EST)
Christoph Lameter <[email protected]> wrote:
> +static inline unsigned long get_mm_rss(struct mm_struct *mm)
> +{
> + int cpu;
> + unsigned long r = 0;
> +
> + for_each_possible_cpu(cpu) {
> + struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
> +
> + r = c->file + c->anon;

r += c->file + c->anon;

Thanks,
-Kame
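
For reference, the read side with this correction folded in would look as
below (a sketch assembled from the hunk quoted above; mm->rss is the percpu
pointer introduced by the patch):

static inline unsigned long get_mm_rss(struct mm_struct *mm)
{
        int cpu;
        unsigned long r = 0;

        for_each_possible_cpu(cpu) {
                struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);

                /* accumulate into the running total instead of overwriting it */
                r += c->file + c->anon;
        }

        return r;
}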

2009-11-06 04:18:24

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [MM] Make mm counters per cpu instead of atomic V2

On Thu, 5 Nov 2009 10:36:06 -0500 (EST)
Christoph Lameter <[email protected]> wrote:

> +static inline unsigned long get_mm_rss(struct mm_struct *mm)
> +{
> + int cpu;
> + unsigned long r = 0;
> +
> + for_each_possible_cpu(cpu) {
> + struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
> +
> + r = c->file + c->anon;
> + }
> +
> + return r;
> +}
> +
> +static inline void update_hiwater_rss(struct mm_struct *mm)
> +{
> + unsigned long _rss = get_mm_rss(mm);
> + if (mm->hiwater_rss < _rss)
> + mm->hiwater_rss = _rss;
> +}
> +

I'm sorry my replies are scattered.

Isn't it better to add some filter in the following path ?

==
static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
enum ttu_flags flags)
{
<snip>
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
==

Thanks,
-Kame
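
It is not spelled out what kind of filter is meant here. Purely as a guess,
one option would be to rate-limit the full per-cpu scan rather than doing it
for every unmapped page, roughly along these lines (hiwater_jiffies is a
hypothetical field; this is only a sketch):

static inline void update_hiwater_rss_throttled(struct mm_struct *mm)
{
        /* do the expensive for_each_possible_cpu() walk at most once per jiffy */
        if (mm->hiwater_jiffies != jiffies) {
                mm->hiwater_jiffies = jiffies;
                update_hiwater_rss(mm);
        }
}

The trade-off would be a slightly stale high-water mark between updates.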

2009-11-06 17:34:15

by Christoph Lameter

[permalink] [raw]
Subject: Re: [MM] Make mm counters per cpu instead of atomic V2

On Fri, 6 Nov 2009, KAMEZAWA Hiroyuki wrote:

> BTW, can't we have single-thread-mode for this counter ?
> Usual program's read-side will get much benefit.....

Thanks for the measurements.

A single thread mode would be good. Ideas on how to add that would be
appreciated.

2009-11-06 19:03:26

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [MM] Make mm counters per cpu instead of atomic V2

Christoph Lameter wrote:
> On Fri, 6 Nov 2009, KAMEZAWA Hiroyuki wrote:
>
>> BTW, can't we have single-thread-mode for this counter ?
>> Usual program's read-side will get much benefit.....
>
> Thanks for the measurements.
>
> A single thread mode would be good. Ideas on how to add that would be
> appreciated.
>

Maybe there are some ways.... As a first thought....
==
struct usage_counter {
long rss;
long file;
}


struct mm_struct {
....
atomic_long_t rss; /* only updated when usage_counter is NULL */
atomic_long_t file; /* only updated when usage_counter is NULL */
struct usage_counter *usage; /* percpu counter used when
multi-threaded */
.....
}

And allocate mm->usage only when the first CLONE_THREAD is specified.

if (mm->usage)
access per cpu
else
atomic_long_xxx

and read operation will be

val = atomic_read(mm->rss);
if (mm->usage)
for_each_possible_cpu()....
==
Does "if" seems too costly ?

If this idea is bad, another option is to move the mm counters from
mm_struct to task_struct and do a slow sync, instead of making them percpu.

for example

struct task_struct {
....
mm_counter_t temp_counter;
....
};

struct mm_struct {
.....
atomic_long_t rss;
atomic_long_t file;
};

And add temp_counter's value to mm_struct at some good point.... before the
task sleeps ?
kswapd and the reclaim routines can update mm_struct's counters directly.
Readers just read mm_struct's counters.
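
(A sketch of this second, slow-sync idea, with names borrowed loosely from
the above; again only a sketch, assuming each task accumulates deltas
privately and folds them into mm_struct at sync points:)

static inline void task_add_rss(struct task_struct *tsk, long value)
{
        /* no atomics needed: only this task ever writes its temp_counter */
        tsk->temp_counter.rss += value;
}

/* called at "some good point", e.g. before the task sleeps */
static inline void task_sync_rss(struct task_struct *tsk)
{
        if (tsk->temp_counter.rss) {
                atomic_long_add(tsk->temp_counter.rss, &tsk->mm->rss);
                tsk->temp_counter.rss = 0;
        }
}

Readers would then see slightly stale values until the next sync.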

Thanks,
-Kame








2009-11-06 19:15:07

by Christoph Lameter

[permalink] [raw]
Subject: Re: [MM] Make mm counters per cpu instead of atomic V2

On Sat, 7 Nov 2009, KAMEZAWA Hiroyuki wrote:

> And allocate mm->usage only when the first CLONE_THREAD is specified.

Ok.

> if (mm->usage)
> access per cpu
> else
> atomic_long_xxx

If we just have one thread: Do we need atomic access at all?

> and read operation will be
>
> val = atomic_read(mm->rss);
> if (mm->usage)
> for_each_possible_cpu()....

or
val = mm->rss
for_each_cpu(cpu) val += percpu ...


> ==
> Does "if" seems too costly ?

The above method would avoid the if.
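
(Spelled out, and assuming the per-cpu area is always present as in the
original patch so the walk is safe without a NULL check, that branch-free
read side would be something like:)

static inline unsigned long get_mm_rss(struct mm_struct *mm)
{
        /* base value, only updated while the mm was single threaded ... */
        unsigned long val = atomic_long_read(&mm->rss);
        int cpu;

        /* ... plus whatever has accumulated in the per-cpu counters since */
        for_each_possible_cpu(cpu)
                val += per_cpu_ptr(mm->usage, cpu)->rss;

        return val;
}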

> If this idea is bad, I think moving mm_counter to task_struct from
> mm_struct and doing slow-sync is an idea instead of percpu.

Yeah then the access is effectively percpu as long as preempt is disabled.

But then for the mmap_writer_lock we would need to traverse a doubly
linked list to add up the counters. Bad caching on that one and we would
have to lock the list too. Sigh.

> kswapd and reclaim routine can update mm_struct's counter, directly.
> Readers just read mm_struct's counter.

Would work for rss counters but not for avoiding the rw semaphore I guess.

2009-11-06 19:20:52

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [MM] Make mm counters per cpu instead of atomic V2

Christoph Lameter wrote:
> On Sat, 7 Nov 2009, KAMEZAWA Hiroyuki wrote:
>
>> And allocate mm->usage only when the first CLONE_THREAD is specified.
>
> Ok.
>
>> if (mm->usage)
>> access per cpu
>> else
>> atomic_long_xxx
>
> If we just have one thread: Do we need atomic access at all?
>
Unfortunately, kswapd/vmscan touch this.

Thanks,
-Kame

2009-11-06 19:49:08

by Christoph Lameter

[permalink] [raw]
Subject: Re: [MM] Make mm counters per cpu instead of atomic V2

On Sat, 7 Nov 2009, KAMEZAWA Hiroyuki wrote:

> > If we just have one thread: Do we need atomic access at all?
> >
> Unfortunately, kswapd/vmscan touch this.

Right. And those can also occur from another processor that the process
never has run on before. Argh.

2009-11-10 22:45:16

by Andrew Morton

[permalink] [raw]
Subject: Re: [MM] Make mm counters per cpu instead of atomic V2

On Fri, 6 Nov 2009 10:11:06 +0900
KAMEZAWA Hiroyuki <[email protected]> wrote:

> On Thu, 5 Nov 2009 10:36:06 -0500 (EST)
> Christoph Lameter <[email protected]> wrote:
>
> > From: Christoph Lameter <[email protected]>
> > Subject: Make mm counters per cpu V2
> >
> > Changing the mm counters to per cpu counters is possible after the introduction
> > of the generic per cpu operations (currently in percpu and -next).
> >
> > With that the contention on the counters in mm_struct can be avoided. The
> > USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems do not
> > need to perform atomic updates to mm counters anymore. Various code paths
> > can be simplified since per cpu counter updates are fast and batching
> > of counter updates is no longer needed.
> >
> > One price to pay for these improvements is the need to scan over all percpu
> > counters when the actual count values are needed.
> >
> > V1->V2
> > - Remove useless and buggy per cpu counter initialization.
> > alloc_percpu already zeros the values.
> >
> > Signed-off-by: Christoph Lameter <[email protected]>
> >
> Thanks. My small concern is read-side.

Me too.

For example, with 1000 possible CPUs (possible, not present and not
online), and 1000 processes, ps(1) will have to wallow through a
million cachelines in task_statm().

And then we have get_mm_rss(), which now will hit 1000 cachelines. And
get_mm_rss() is called (via
account_user_time()->acct_update_integrals()) from the clock tick.

Adding a thousand cache misses to the timer interrupt is the sort of
thing which makes people unhappy?

2009-11-10 23:22:29

by Christoph Lameter

[permalink] [raw]
Subject: Re: [MM] Make mm counters per cpu instead of atomic V2

On Tue, 10 Nov 2009, Andrew Morton wrote:

> Adding a thousand cache misses to the timer interrupt is the sort of
> thing which makes people unhappy?

Obviously I was hoping for new ideas instead of just restatements of the
problem.

2009-11-17 06:46:16

by Yanmin Zhang

[permalink] [raw]
Subject: Re: [MM] Make mm counters per cpu instead of atomic

On Wed, 2009-11-04 at 14:14 -0500, Christoph Lameter wrote:
> From: Christoph Lameter <[email protected]>
> Subject: Make mm counters per cpu
>
> Changing the mm counters to per cpu counters is possible after the introduction
> of the generic per cpu operations (currently in percpu and -next).
>
> With that the contention on the counters in mm_struct can be avoided. The
> USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems do not
> need to perform atomic updates to mm counters anymore. Various code paths
> can be simplified since per cpu counter updates are fast and batching
> of counter updates is no longer needed.
>
> One price to pay for these improvements is the need to scan over all percpu
> counters when the actual count values are needed.
>
> Signed-off-by: Christoph Lameter <[email protected]>
>
> ---
> fs/proc/task_mmu.c | 14 +++++++++-
> include/linux/mm_types.h | 16 ++++--------
> include/linux/sched.h | 61 ++++++++++++++++++++---------------------------
> kernel/fork.c | 25 ++++++++++++++-----
> mm/filemap_xip.c | 2 -
> mm/fremap.c | 2 -
> mm/init-mm.c | 3 ++
> mm/memory.c | 20 +++++++--------
> mm/rmap.c | 10 +++----
> mm/swapfile.c | 2 -
> 10 files changed, 84 insertions(+), 71 deletions(-)
>
> Index: linux-2.6/include/linux/mm_types.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mm_types.h 2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/include/linux/mm_types.h 2009-11-04 13:13:42.000000000 -0600
> @@ -24,11 +24,10 @@ struct address_space;

> Index: linux-2.6/kernel/fork.c
> ===================================================================
> --- linux-2.6.orig/kernel/fork.c 2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/kernel/fork.c 2009-11-04 13:14:19.000000000 -0600
> @@ -444,6 +444,8 @@ static void mm_init_aio(struct mm_struct
>
> static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
> {
> + int cpu;
> +
> atomic_set(&mm->mm_users, 1);
> atomic_set(&mm->mm_count, 1);
> init_rwsem(&mm->mmap_sem);
> @@ -452,8 +454,11 @@ static struct mm_struct * mm_init(struct
> (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
> mm->core_state = NULL;
> mm->nr_ptes = 0;
> - set_mm_counter(mm, file_rss, 0);
> - set_mm_counter(mm, anon_rss, 0);
> + for_each_possible_cpu(cpu) {
> + struct mm_counter *m;
> +
> + memset(m, sizeof(struct mm_counter), 0);
The above memset is wrong.
1) m isn't initialized;
2) It seems the 2nd and the 3rd parameters should be interchanged.

2009-11-17 07:29:25

by Yanmin Zhang

[permalink] [raw]
Subject: Re: [MM] Make mm counters per cpu instead of atomic

On Tue, 2009-11-17 at 14:48 +0800, Zhang, Yanmin wrote:
> On Wed, 2009-11-04 at 14:14 -0500, Christoph Lameter wrote:
> > From: Christoph Lameter <[email protected]>
> > Subject: Make mm counters per cpu
> >
> > Changing the mm counters to per cpu counters is possible after the introduction
> > of the generic per cpu operations (currently in percpu and -next).
> >
> > With that the contention on the counters in mm_struct can be avoided. The
> > USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems do not
> > need to perform atomic updates to mm counters anymore. Various code paths
> > can be simplified since per cpu counter updates are fast and batching
> > of counter updates is no longer needed.
> >
> > One price to pay for these improvements is the need to scan over all percpu
> > counters when the actual count values are needed.
> >
> > Signed-off-by: Christoph Lameter <[email protected]>
> >
> > ---
> > fs/proc/task_mmu.c | 14 +++++++++-
> > include/linux/mm_types.h | 16 ++++--------
> > include/linux/sched.h | 61 ++++++++++++++++++++---------------------------
> > kernel/fork.c | 25 ++++++++++++++-----
> > mm/filemap_xip.c | 2 -
> > mm/fremap.c | 2 -
> > mm/init-mm.c | 3 ++
> > mm/memory.c | 20 +++++++--------
> > mm/rmap.c | 10 +++----
> > mm/swapfile.c | 2 -
> > 10 files changed, 84 insertions(+), 71 deletions(-)
> >
> > Index: linux-2.6/include/linux/mm_types.h
> > ===================================================================
> > --- linux-2.6.orig/include/linux/mm_types.h 2009-11-04 13:08:33.000000000 -0600
> > +++ linux-2.6/include/linux/mm_types.h 2009-11-04 13:13:42.000000000 -0600
> > @@ -24,11 +24,10 @@ struct address_space;
>
> > Index: linux-2.6/kernel/fork.c
> > ===================================================================
> > --- linux-2.6.orig/kernel/fork.c 2009-11-04 13:08:33.000000000 -0600
> > +++ linux-2.6/kernel/fork.c 2009-11-04 13:14:19.000000000 -0600
> > @@ -444,6 +444,8 @@ static void mm_init_aio(struct mm_struct
> >
> > static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
> > {
> > + int cpu;
> > +
> > atomic_set(&mm->mm_users, 1);
> > atomic_set(&mm->mm_count, 1);
> > init_rwsem(&mm->mmap_sem);
> > @@ -452,8 +454,11 @@ static struct mm_struct * mm_init(struct
> > (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
> > mm->core_state = NULL;
> > mm->nr_ptes = 0;
> > - set_mm_counter(mm, file_rss, 0);
> > - set_mm_counter(mm, anon_rss, 0);
> > + for_each_possible_cpu(cpu) {
> > + struct mm_counter *m;
> > +
> > + memset(m, sizeof(struct mm_counter), 0);
> Above memset is wrong.
> 1) m isn't initiated;
> 2) It seems the 2nd and the 3rd parameters should be interchanged.
Changing it to below fixes the command hang issue.

	for_each_possible_cpu(cpu) {
		struct mm_counter *m = per_cpu(mm->rss->readers, cpu);

		memset(m, 0, sizeof(struct mm_counter));
	}

2009-11-17 09:32:00

by Yanmin Zhang

[permalink] [raw]
Subject: Re: [MM] Make mm counters per cpu instead of atomic

On Tue, 2009-11-17 at 15:31 +0800, Zhang, Yanmin wrote:
> On Tue, 2009-11-17 at 14:48 +0800, Zhang, Yanmin wrote:
> > On Wed, 2009-11-04 at 14:14 -0500, Christoph Lameter wrote:
> > > From: Christoph Lameter <[email protected]>
> > > Subject: Make mm counters per cpu
> > >
> > > Changing the mm counters to per cpu counters is possible after the introduction
> > > of the generic per cpu operations (currently in percpu and -next).
> > >
> > > With that the contention on the counters in mm_struct can be avoided. The
> > > USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems do not
> > > need to perform atomic updates to mm counters anymore. Various code paths
> > > can be simplified since per cpu counter updates are fast and batching
> > > of counter updates is no longer needed.
> > >
> > > One price to pay for these improvements is the need to scan over all percpu
> > > counters when the actual count values are needed.
> > >
> > > Signed-off-by: Christoph Lameter <[email protected]>
> > >
> > > ---
> > > fs/proc/task_mmu.c | 14 +++++++++-
> > > include/linux/mm_types.h | 16 ++++--------
> > > include/linux/sched.h | 61 ++++++++++++++++++++---------------------------
> > > kernel/fork.c | 25 ++++++++++++++-----
> > > mm/filemap_xip.c | 2 -
> > > mm/fremap.c | 2 -
> > > mm/init-mm.c | 3 ++
> > > mm/memory.c | 20 +++++++--------
> > > mm/rmap.c | 10 +++----
> > > mm/swapfile.c | 2 -
> > > 10 files changed, 84 insertions(+), 71 deletions(-)
> > >
> > > Index: linux-2.6/include/linux/mm_types.h
> > > ===================================================================
> > > --- linux-2.6.orig/include/linux/mm_types.h 2009-11-04 13:08:33.000000000 -0600
> > > +++ linux-2.6/include/linux/mm_types.h 2009-11-04 13:13:42.000000000 -0600
> > > @@ -24,11 +24,10 @@ struct address_space;
> >
> > > Index: linux-2.6/kernel/fork.c
> > > ===================================================================
> > > --- linux-2.6.orig/kernel/fork.c 2009-11-04 13:08:33.000000000 -0600
> > > +++ linux-2.6/kernel/fork.c 2009-11-04 13:14:19.000000000 -0600
> > > @@ -444,6 +444,8 @@ static void mm_init_aio(struct mm_struct
> > >
> > > static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
> > > {
> > > + int cpu;
> > > +
> > > atomic_set(&mm->mm_users, 1);
> > > atomic_set(&mm->mm_count, 1);
> > > init_rwsem(&mm->mmap_sem);
> > > @@ -452,8 +454,11 @@ static struct mm_struct * mm_init(struct
> > > (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
> > > mm->core_state = NULL;
> > > mm->nr_ptes = 0;
> > > - set_mm_counter(mm, file_rss, 0);
> > > - set_mm_counter(mm, anon_rss, 0);
> > > + for_each_possible_cpu(cpu) {
> > > + struct mm_counter *m;
> > > +
> > > + memset(m, sizeof(struct mm_counter), 0);
> > Above memset is wrong.
> > 1) m isn't initiated;
> > 2) It seems the 2nd and the 3rd parameters should be interchanged.
> Changing it to below fixes the command hang issue.
>
> for_each_possible_cpu(cpu) {
> struct mm_counter *m = per_cpu(mm->rss->readers, cpu);
>
> memset(m, 0, sizeof(struct mm_counter));
> }

Sorry. I was too optimistic and had booted another kernel.
The right change above should be:
struct mm_counter *m = per_cpu_ptr(mm->rss, cpu);

The compiler doesn't report an error or warning no matter which member I use there.

With the change, command 'make oldconfig' and a boot command still
hangs.

Yanmin
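
(For reference, the initialization loop with the corrections from this
sub-thread applied would read roughly as below; note that the V1->V2
changelog quoted earlier drops the loop entirely, since alloc_percpu()
already returns zeroed memory.)

	for_each_possible_cpu(cpu) {
		struct mm_counter *m = per_cpu_ptr(mm->rss, cpu);

		memset(m, 0, sizeof(struct mm_counter));
	}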

2009-11-17 17:29:53

by Christoph Lameter

[permalink] [raw]
Subject: Re: [MM] Make mm counters per cpu instead of atomic

On Tue, 17 Nov 2009, Zhang, Yanmin wrote:

> The right change above should be:
> struct mm_counter *m = per_cpu_ptr(mm->rss, cpu);

Right.

> With the change, command 'make oldconfig' and a boot command still
> hangs.

Not sure if it's worth spending more time on this but if you want I will
consolidate the fixes so far and put out another patchset.

Where does it hang during boot?

2009-11-19 00:45:38

by Yanmin Zhang

[permalink] [raw]
Subject: Re: [MM] Make mm counters per cpu instead of atomic

On Tue, 2009-11-17 at 12:25 -0500, Christoph Lameter wrote:
> On Tue, 17 Nov 2009, Zhang, Yanmin wrote:
>
> > The right change above should be:
> > struct mm_counter *m = per_cpu_ptr(mm->rss, cpu);
>
> Right.
>
> > With the change, command 'make oldconfig' and a boot command still
> > hangs.
>
> Not sure if its worth spending more time on this but if you want I will
> consolidate the fixes so far and put out another patchset.
>
> Where does it hang during boot?
>
1) An init boot script calls pidof, and pidof hangs in
access_process_vm => (mutex_lock <=> mutex_unlock), so actually in
mm_reader_lock.
2) 'make oldconfig' hangs in sys_map => msleep, actually in mm_writer_lock.

I will check it today.


2009-11-23 08:48:41

by Yanmin Zhang

[permalink] [raw]
Subject: Re: [MM] Make mm counters per cpu instead of atomic

On Tue, 2009-11-17 at 12:25 -0500, Christoph Lameter wrote:
> On Tue, 17 Nov 2009, Zhang, Yanmin wrote:
>
> > The right change above should be:
> > struct mm_counter *m = per_cpu_ptr(mm->rss, cpu);
>
> Right.
>
> > With the change, command 'make oldconfig' and a boot command still
> > hangs.
>
> Not sure if its worth spending more time on this but if you want I will
> consolidate the fixes so far and put out another patchset.
>
> Where does it hang during boot?
Definitely faint.

1) In function exec_mmap: in the 2nd 'if (old_mm) {', mm_reader_unlock
should be used. Your patch uses mm_reader_lock. I found it when reviewing your
patch, but forgot to fix it when testing.
2) In function madvise: the last unlock should be mm_reader_unlock. Your
patch uses mm_writer_unlock.

It's easy to hit the issues with normal testing. I'm surprised you didn't
hit them.

Another theoretical issue is the below scenario:
Process A gets the read lock on cpu 0 and is scheduled to cpu 2 to unlock. Then
it's scheduled back to cpu 0 to repeat the step. Eventually, cpu 0's reader counter
will overflow. Considering multi-threaded cases, it might overflow faster
than we imagine. When it overflows, processes will hang there.
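
(The locking scheme being tested keeps a per-cpu reader count. The following
is a heavily simplified, hypothetical sketch, with made-up names and the
writer exclusion left out, of why one cpu's count drifts even though the sum
over all cpus stays balanced:)

static inline void mm_reader_lock(struct mm_struct *mm)
{
	this_cpu_inc(*mm->reader_count);	/* process runs on cpu 0: +1 there */
}

static inline void mm_reader_unlock(struct mm_struct *mm)
{
	this_cpu_dec(*mm->reader_count);	/* migrated to cpu 2: -1 there, cpu 0 never comes back down */
}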

2009-11-23 14:40:33

by Christoph Lameter

[permalink] [raw]
Subject: Re: [MM] Make mm counters per cpu instead of atomic

On Mon, 23 Nov 2009, Zhang, Yanmin wrote:

> Another theoretic issue is below scenario:
> Process A get the read lock on cpu 0 and is scheduled to cpu 2 to unlock. Then
> it's scheduled back to cpu 0 to repeat the step. eventually, the reader counter
> will overflow. Considering multiple thread cases, it might be faster to
> overflow than what we imagine. When it overflows, processes will hang there.

True.... We need to find some alternative to per cpu data to scale mmap
sem then.

2009-11-24 08:00:04

by Yanmin Zhang

[permalink] [raw]
Subject: Re: [MM] Make mm counters per cpu instead of atomic

On Mon, 2009-11-23 at 08:31 -0600, Christoph Lameter wrote:
> On Mon, 23 Nov 2009, Zhang, Yanmin wrote:
>
> > Another theoretic issue is below scenario:
> > Process A get the read lock on cpu 0 and is scheduled to cpu 2 to unlock. Then
> > it's scheduled back to cpu 0 to repeat the step. eventually, the reader counter
> > will overflow. Considering multiple thread cases, it might be faster to
> > overflow than what we imagine. When it overflows, processes will hang there.
>
> True.... We need to find some alternative to per cpu data to scale mmap
> sem then.
I ran lots of benchmarks, such as specjbb2005/hackbench/tbench/dbench/iozone
/sysbench_oltp(mysql)/aim7, against the percpu tree (based on 2.6.32-rc7) on a 4*8*2 logical
cpu machine, and didn't find a big difference in results with and without
your patch.


2009-11-24 15:18:41

by Christoph Lameter

[permalink] [raw]
Subject: Re: [MM] Make mm counters per cpu instead of atomic

On Tue, 24 Nov 2009, Zhang, Yanmin wrote:

> > True.... We need to find some alternative to per cpu data to scale mmap
> > sem then.
> I ran lots of benchmarks such like specjbb2005/hackbench/tbench/dbench/iozone
> /sysbench_oltp(mysql)/aim7 against percpu tree(based on 2.6.32-rc7) on a 4*8*2 logical
> cpu machine, and didn't find big result difference between with your patch and without
> your patch.

This affects loads that heavily use mmap_sem. You won't find too many
issues in tests that do not run processes with a large thread count and
cause lots of faults or uses of get_user_pages(). The tests you list are
not of that nature.

2009-11-25 01:20:32

by Yanmin Zhang

[permalink] [raw]
Subject: Re: [MM] Make mm counters per cpu instead of atomic

On Tue, 2009-11-24 at 09:17 -0600, Christoph Lameter wrote:
> On Tue, 24 Nov 2009, Zhang, Yanmin wrote:
>
> > > True.... We need to find some alternative to per cpu data to scale mmap
> > > sem then.
> > I ran lots of benchmarks such like specjbb2005/hackbench/tbench/dbench/iozone
> > /sysbench_oltp(mysql)/aim7 against percpu tree(based on 2.6.32-rc7) on a 4*8*2 logical
> > cpu machine, and didn't find big result difference between with your patch and without
> > your patch.
>
> This affects loads that heavily use mmap_sem. You wont find too many
> issues in tests that do not run processes with a large thread count and
> cause lots of faults or uses of get_user_pages(). The tests you list are
> not of that nature.
sysbench_oltp(mysql) is that kind of workload. Both sysbench and mysql are
multi-threaded. 2 years ago, I investigated a scalability issue with such a
workload and found that mysql causes frequent down_read(mm->mmap_sem). Nick
changed it to down_read to fix it.

But this workload doesn't scale well beyond 64 threads because mysql has some
unreasonably big locks in userspace (implemented as conditional spinlocks).

Yanmin