Passed several tests and one bug was fixed since RFC version.
This patch is against mmotm.
=
From: KAMEZAWA Hiroyuki <[email protected]>
Now, anon_rss and file_rss is counted as RSS and exported via /proc.
RSS usage is important information but one more information which
is often asked by users is "usage of swap".(user support team said.)
This patch counts swap entry usage per process and show it via
/proc/<pid>/status. I think status file is robust against new entry.
Then, it is the first candidate..
After this, /proc/<pid>/status includes following line
<snip>
VmPeak: 315360 kB
VmSize: 315360 kB
VmLck: 0 kB
VmHWM: 180452 kB
VmRSS: 180452 kB
VmData: 311624 kB
VmStk: 84 kB
VmExe: 4 kB
VmLib: 1568 kB
VmPTE: 640 kB
VmSwap: 131240 kB <=== new information
Note:
Because this patch catches swap_pte on page table, this will
not catch shmem's swapout. It's already accounted in per-shmem
inode and we don't need to do more.
Changelog: 2009/11/03
- clean up.
- fixed initialization bug at fork (init_mm())
Acked-by: David Rientjes <[email protected]>
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
fs/proc/task_mmu.c | 9 ++++++---
include/linux/mm_types.h | 1 +
kernel/fork.c | 1 +
mm/memory.c | 30 +++++++++++++++++++++---------
mm/rmap.c | 1 +
mm/swapfile.c | 1 +
6 files changed, 31 insertions(+), 12 deletions(-)
Index: mmotm-2.6.32-Nov2/include/linux/mm_types.h
===================================================================
--- mmotm-2.6.32-Nov2.orig/include/linux/mm_types.h
+++ mmotm-2.6.32-Nov2/include/linux/mm_types.h
@@ -228,6 +228,7 @@ struct mm_struct {
*/
mm_counter_t _file_rss;
mm_counter_t _anon_rss;
+ mm_counter_t _swap_usage;
unsigned long hiwater_rss; /* High-watermark of RSS usage */
unsigned long hiwater_vm; /* High-water virtual memory usage */
Index: mmotm-2.6.32-Nov2/mm/memory.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/mm/memory.c
+++ mmotm-2.6.32-Nov2/mm/memory.c
@@ -376,12 +376,15 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
return 0;
}
-static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
+static inline void
+add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss, int swap_usage)
{
if (file_rss)
add_mm_counter(mm, file_rss, file_rss);
if (anon_rss)
add_mm_counter(mm, anon_rss, anon_rss);
+ if (swap_usage)
+ add_mm_counter(mm, swap_usage, swap_usage);
}
/*
@@ -597,7 +600,9 @@ copy_one_pte(struct mm_struct *dst_mm, s
&src_mm->mmlist);
spin_unlock(&mmlist_lock);
}
- if (is_write_migration_entry(entry) &&
+ if (!is_migration_entry(entry))
+ rss[2]++;
+ else if (is_write_migration_entry(entry) &&
is_cow_mapping(vm_flags)) {
/*
* COW mappings require pages in both parent
@@ -648,11 +653,11 @@ static int copy_pte_range(struct mm_stru
pte_t *src_pte, *dst_pte;
spinlock_t *src_ptl, *dst_ptl;
int progress = 0;
- int rss[2];
+ int rss[3];
swp_entry_t entry = (swp_entry_t){0};
again:
- rss[1] = rss[0] = 0;
+ rss[2] = rss[1] = rss[0] = 0;
dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
if (!dst_pte)
return -ENOMEM;
@@ -688,7 +693,7 @@ again:
arch_leave_lazy_mmu_mode();
spin_unlock(src_ptl);
pte_unmap_nested(orig_src_pte);
- add_mm_rss(dst_mm, rss[0], rss[1]);
+ add_mm_rss(dst_mm, rss[0], rss[1], rss[2]);
pte_unmap_unlock(orig_dst_pte, dst_ptl);
cond_resched();
@@ -818,6 +823,7 @@ static unsigned long zap_pte_range(struc
spinlock_t *ptl;
int file_rss = 0;
int anon_rss = 0;
+ int swap_usage = 0;
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
arch_enter_lazy_mmu_mode();
@@ -887,13 +893,18 @@ static unsigned long zap_pte_range(struc
if (pte_file(ptent)) {
if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
print_bad_pte(vma, addr, ptent, NULL);
- } else if
- (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
- print_bad_pte(vma, addr, ptent, NULL);
+ } else {
+ swp_entry_t ent = pte_to_swp_entry(ptent);
+
+ if (!is_migration_entry(ent))
+ swap_usage--;
+ if (unlikely(!free_swap_and_cache(ent)))
+ print_bad_pte(vma, addr, ptent, NULL);
+ }
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
- add_mm_rss(mm, file_rss, anon_rss);
+ add_mm_rss(mm, file_rss, anon_rss, swap_usage);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
@@ -2595,6 +2606,7 @@ static int do_swap_page(struct mm_struct
*/
inc_mm_counter(mm, anon_rss);
+ dec_mm_counter(mm, swap_usage);
pte = mk_pte(page, vma->vm_page_prot);
if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
Index: mmotm-2.6.32-Nov2/mm/swapfile.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/mm/swapfile.c
+++ mmotm-2.6.32-Nov2/mm/swapfile.c
@@ -837,6 +837,7 @@ static int unuse_pte(struct vm_area_stru
}
inc_mm_counter(vma->vm_mm, anon_rss);
+ dec_mm_counter(vma->vm_mm, swap_usage);
get_page(page);
set_pte_at(vma->vm_mm, addr, pte,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
Index: mmotm-2.6.32-Nov2/fs/proc/task_mmu.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/fs/proc/task_mmu.c
+++ mmotm-2.6.32-Nov2/fs/proc/task_mmu.c
@@ -17,7 +17,7 @@
void task_mem(struct seq_file *m, struct mm_struct *mm)
{
unsigned long data, text, lib;
- unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
+ unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss, swap;
/*
* Note: to minimize their overhead, mm maintains hiwater_vm and
@@ -36,6 +36,7 @@ void task_mem(struct seq_file *m, struct
data = mm->total_vm - mm->shared_vm - mm->stack_vm;
text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
+ swap = get_mm_counter(mm, swap_usage);
seq_printf(m,
"VmPeak:\t%8lu kB\n"
"VmSize:\t%8lu kB\n"
@@ -46,7 +47,8 @@ void task_mem(struct seq_file *m, struct
"VmStk:\t%8lu kB\n"
"VmExe:\t%8lu kB\n"
"VmLib:\t%8lu kB\n"
- "VmPTE:\t%8lu kB\n",
+ "VmPTE:\t%8lu kB\n"
+ "VmSwap:\t%8lu kB\n",
hiwater_vm << (PAGE_SHIFT-10),
(total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
mm->locked_vm << (PAGE_SHIFT-10),
@@ -54,7 +56,8 @@ void task_mem(struct seq_file *m, struct
total_rss << (PAGE_SHIFT-10),
data << (PAGE_SHIFT-10),
mm->stack_vm << (PAGE_SHIFT-10), text, lib,
- (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
+ (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
+ swap << (PAGE_SHIFT - 10));
}
unsigned long task_vsize(struct mm_struct *mm)
Index: mmotm-2.6.32-Nov2/mm/rmap.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/mm/rmap.c
+++ mmotm-2.6.32-Nov2/mm/rmap.c
@@ -834,6 +834,7 @@ static int try_to_unmap_one(struct page
spin_unlock(&mmlist_lock);
}
dec_mm_counter(mm, anon_rss);
+ inc_mm_counter(mm, swap_usage);
} else if (PAGE_MIGRATION) {
/*
* Store the pfn of the page in a special migration
Index: mmotm-2.6.32-Nov2/kernel/fork.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/kernel/fork.c
+++ mmotm-2.6.32-Nov2/kernel/fork.c
@@ -454,6 +454,7 @@ static struct mm_struct * mm_init(struct
mm->nr_ptes = 0;
set_mm_counter(mm, file_rss, 0);
set_mm_counter(mm, anon_rss, 0);
+ set_mm_counter(mm, swap_usage, 0);
spin_lock_init(&mm->page_table_lock);
mm->free_area_cache = TASK_UNMAPPED_BASE;
mm->cached_hole_size = ~0UL;
On Wed, 4 Nov 2009, KAMEZAWA Hiroyuki wrote:
> Now, anon_rss and file_rss is counted as RSS and exported via /proc.
> RSS usage is important information but one more information which
> is often asked by users is "usage of swap".(user support team said.)
Hmmm... Could we do some rework of the counters first so that they are per
cpu?
> On Wed, 4 Nov 2009, KAMEZAWA Hiroyuki wrote:
>
> > Now, anon_rss and file_rss is counted as RSS and exported via /proc.
> > RSS usage is important information but one more information which
> > is often asked by users is "usage of swap".(user support team said.)
>
> Hmmm... Could we do some rework of the counters first so that they are per
> cpu?
per-cpu swap counter?
It seems overkill effort....
On Wed, 4 Nov 2009 14:15:40 -0500 (EST)
Christoph Lameter <[email protected]> wrote:
> On Wed, 4 Nov 2009, KAMEZAWA Hiroyuki wrote:
>
> > Now, anon_rss and file_rss is counted as RSS and exported via /proc.
> > RSS usage is important information but one more information which
> > is often asked by users is "usage of swap".(user support team said.)
>
> Hmmm... Could we do some rework of the counters first so that they are per
> cpu?
>
I don't think swap_usage counter has much costs because it's call path
is always slow path. But, I'm not in hurry. So rework is ok.
I'll post my percpu array counter with some rework, CCing you.
Maybe it can be used in this case.
Thanks,
-Kame
On Thu, 5 Nov 2009 08:25:28 +0900 (JST)
KOSAKI Motohiro <[email protected]> wrote:
> > On Wed, 4 Nov 2009, KAMEZAWA Hiroyuki wrote:
> >
> > > Now, anon_rss and file_rss is counted as RSS and exported via /proc.
> > > RSS usage is important information but one more information which
> > > is often asked by users is "usage of swap".(user support team said.)
> >
> > Hmmm... Could we do some rework of the counters first so that they are per
> > cpu?
>
> per-cpu swap counter?
> It seems overkill effort....
>
I nearly agree with you.
Thanks,
-Kame
On Thu, 5 Nov 2009 09:06:59 +0900
KAMEZAWA Hiroyuki <[email protected]> wrote:
> I'll post my percpu array counter with some rework, CCing you.
> Maybe it can be used in this case.
>
This pach has been on my queue for a month.
I'm glad if I can get advise from you. This patch is for memcg, now.
==
From: KAMEZAWA Hiroyuki <[email protected]>
Now, percpu code is rewritten and it's easy to use in dynamic.
We have lib/percpu_counter.c but it uses
- unsigned long long
- spinlock
so, it tend to be big size and not very optimized.
Another major percpu counter is vm_stat[]. This patch implements
vm_stat[] style counter array in lib/percpu_counter.c
This is designed for introducing vm_stat[] style counter to memcg,
but maybe useful for other people. By using this, counter array
using percpu can be implemented easily in compact structure.
usage in my assumption is like this.
enum {
ELEM_A, ELEM_B, NR_ELEMENTS};
struct hoge {
....
...
DEFINE_COUNTER_ARRAY(name, NR_ELEMENT);
.....
} xxxx;
counter_array_add(_CA(xxxx->name), ELEM_A, val),
Changelog 2009/11/05
- renamed name of structures.
- rewrote all comments
- support "nosync" mode
- fixed !SMP case
- changed percpu value from "char" to "long"
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
include/linux/percpu_counter.h | 107 +++++++++++++++++++++++++++++
lib/percpu_counter.c | 148 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 255 insertions(+)
Index: mmotm-2.6.32-Nov2/include/linux/percpu_counter.h
===================================================================
--- mmotm-2.6.32-Nov2.orig/include/linux/percpu_counter.h
+++ mmotm-2.6.32-Nov2/include/linux/percpu_counter.h
@@ -77,6 +77,59 @@ static inline s64 percpu_counter_read_po
return 1;
}
+/*
+ * Counter Array is an array of counters like percpu_counter, but its idea is
+ * mainly from vm_stat[]. Unlike vm_stat[], this counter uses "int" for the
+ * batch value. If the user wants, this can provide a "nosync" percpu counter.
+ * But in that case, read will be slow.
+ *
+ * One more point is size of this array. This uses cacheline-size+elements
+ * size object and also use element size of percpu area. So, this will use
+ * bigger amount of memory than simple atomic_t.
+ */
+
+struct _pad_counter_array {
+ char elements;
+ char nosync;
+ int batch;
+ long *array;
+#ifdef CONFIG_HOTPLUG_CPU
+ struct list_head list;
+#endif
+} ____cacheline_aligned_in_smp;
+
+struct counter_array {
+ struct _pad_counter_array v;
+ atomic_long_t counters[0];
+};
+
+#define DEFINE_COUNTER_ARRAY(name, elements) \
+ struct {\
+ struct counter_array ca;\
+ long __counters[(elements)]; } name;
+
+#define DEFINE_COUNTER_ARRAY_NOSYNC(name, elements) \
+ struct {\
+ struct counter_array ca; } name;
+/*
+ * For access counters, using this macro is an easy way as
+ * array_counter_add( _CA(object->name), elem, val);
+ */
+#define _CA(x) (&(x)->ca)
+/* about "nosync" see lib/percpu_counter.c for its meaning. */
+int counter_array_init(struct counter_array *ca, int size, int nosync);
+void counter_array_destroy(struct counter_array *ca);
+void counter_array_add(struct counter_array *ca, int idx, int val);
+void __counter_array_add(struct counter_array *ca, int idx, int val, int batch);
+
+static inline long counter_array_read(struct counter_array *ca, int idx)
+{
+ return atomic_long_read(&ca->counters[idx]);
+}
+
+/* take all percpu value into account */
+long counter_array_sum(struct counter_array *ca, int idx);
+
#else
struct percpu_counter {
@@ -129,6 +182,45 @@ static inline s64 percpu_counter_sum(str
return percpu_counter_read(fbc);
}
+struct counter_array {
+ long counters[0];
+};
+#define DEFINE_COUNTER_ARRAY(name, elements) \
+ struct {\
+ struct counter_array ca;\
+ unsigned long counters[(elements)]; } name;
+
+static inline int counter_array_init(struct counter_array *ca,
+ int size, int nosync)
+{
+ return 0;
+}
+
+static inline void counter_array_destroy(struct counter_array *ca)
+{
+}
+
+static inline void
+counter_array_add(struct counter_array *ca, int idx, int val)
+{
+ ca->counters[idx] += val;
+}
+
+static inline void
+__counter_array_add(struct counter_array *ca, int idx, int val, int batch)
+{
+ ca->counters[idx] += val;
+}
+
+static inline long counter_array_read(struct counter_array *ca, int idx)
+{
+ return ca->counters[idx];
+}
+
+static inline long counter_array_sum(struct counter_array *ca, int idx)
+{
+ return ca->counters[idx];
+}
#endif /* CONFIG_SMP */
static inline void percpu_counter_inc(struct percpu_counter *fbc)
@@ -146,4 +238,19 @@ static inline void percpu_counter_sub(st
percpu_counter_add(fbc, -amount);
}
+static inline void counter_array_inc(struct counter_array *ca, int idx)
+{
+ counter_array_add(ca, idx, 1);
+}
+
+static inline void counter_array_dec(struct counter_array *ca, int idx)
+{
+ counter_array_add(ca, idx, -1);
+}
+
+static inline void
+counter_array_sub(struct counter_array *ca, int idx, int val)
+{
+ counter_array_add(ca, idx, -val);
+}
#endif /* _LINUX_PERCPU_COUNTER_H */
Index: mmotm-2.6.32-Nov2/lib/percpu_counter.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/lib/percpu_counter.c
+++ mmotm-2.6.32-Nov2/lib/percpu_counter.c
@@ -144,3 +144,151 @@ static int __init percpu_counter_startup
return 0;
}
module_init(percpu_counter_startup);
+
+/* COUNTER_ARRAY */
+DEFINE_MUTEX(counter_array_mutex);
+LIST_HEAD(counter_arrays);
+#ifdef CONFIG_HOTPLUG_CPU
+#define MAINTAIN_LIST(ca) (!(ca)->v.nosync)
+#else
+#define MAINTAIN_LIST(ca) (0)
+#endif
+
+/**
+ * counter_array_init - initialize counter array with percpu.
+ * @ca: counter array to be initialized
+ * @size: the number of elements in this array
+ * @nosync: need to sync in batch or not
+ *
+ * Initialize counter array which contains elements of @size. Modification
+ * of each value will be cached in percpu area and merged into global atomic
+ * counter in batched manner. If nosync==1, global atomic counter will not be
+ * used, but readers have to use counter_array_sum() always.
+ *
+ * If nosync is specified, this skips entry for a list of CPU HOTPLUG
+ * notification. If you often alloc/free counters, nosync is appreciated.
+ * But you have to use counter_array_sum() to read values. It's trade-off.
+ */
+int counter_array_init(struct counter_array *ca, int size, int nosync)
+{
+ ca->v.array = __alloc_percpu(size * sizeof(long), __alignof__(long));
+ if (!ca->v.array)
+ return -ENOMEM;
+ ca->v.nosync = nosync;
+ ca->v.elements = size;
+
+ if (MAINTAIN_LIST(ca)) {
+ mutex_lock(&counter_array_mutex);
+ list_add(&ca->v.list, &counter_arrays);
+ mutex_unlock(&counter_array_mutex);
+ }
+ return 0;
+}
+
+void counter_array_destroy(struct counter_array *ca)
+{
+ if (MAINTAIN_LIST(ca)) {
+ mutex_lock(&counter_array_mutex);
+ list_del(&ca->v.list);
+ mutex_unlock(&counter_array_mutex);
+ }
+ free_percpu(ca->v.array);
+ ca->v.array = NULL;
+}
+#undef MAINTAIN_LIST
+
+/**
+ * __counter_array_add - add specified value to counter[idx]
+ * @ca: counter array to be modified
+ * @idx: index in counter array
+ * @val: value to be added
+ * @batch: threshold to coalesce the percpu value into the global counter.
+ *
+ * Add specified value to counter[idx]. Users can control how frequently
+ * synchronization will happen by "batch" value. If counter is initialized
+ * as "nosync" counter, no synchronization will happen.
+ */
+void __counter_array_add(struct counter_array *ca, int idx, int val, int batch)
+{
+ long count, *pcount;
+
+ preempt_disable();
+
+ pcount = this_cpu_ptr(ca->v.array);
+ count = pcount[idx] + val;
+ if (!ca->v.nosync && ((count > batch) || (count < -batch))) {
+ atomic_long_add(count, &ca->counters[idx]);
+ pcount[idx] = 0;
+ } else
+ pcount[idx] = count;
+ preempt_enable();
+}
+
+void counter_array_add(struct counter_array *ca, int idx, int val)
+{
+ __counter_array_add(ca, idx, val, percpu_counter_batch);
+}
+
+long counter_array_sum(struct counter_array *ca, int idx)
+{
+ long val, *pcount;
+ int cpu;
+
+ if (ca->v.nosync) {
+ val = 0;
+ /* We don't have CPU HOTPLUG callback */
+ for_each_possible_cpu(cpu) {
+ pcount = per_cpu_ptr(ca->v.array, cpu);
+ val += pcount[idx];
+ }
+ } else {
+ /*
+ * We don't have CPU HOTPLUG callback. There maybe race
+ * but amount of error is below batch value.
+ */
+ val = atomic_long_read(&ca->counters[idx]);
+ for_each_online_cpu(cpu) {
+ pcount = per_cpu_ptr(ca->v.array, cpu);
+ val += pcount[idx];
+ }
+ }
+ return val;
+}
+
+static int __cpuinit counter_array_hotcpu_callback(struct notifier_block *nb,
+ unsigned long action, void *hcpu)
+{
+ struct _pad_counter_array *pca;
+ unsigned int cpu;
+
+ if (action != CPU_DEAD)
+ return NOTIFY_OK;
+
+ cpu = (unsigned long)hcpu;
+ /*
+ * nosync counter is not on this list.
+ */
+ mutex_lock(&counter_array_mutex);
+ list_for_each_entry(pca, &counter_arrays, list) {
+ struct counter_array *ca;
+ long *pcount;
+ int idx;
+
+ pcount = per_cpu_ptr(pca->array, cpu);
+ ca = container_of(pca, struct counter_array, v);
+ for (idx = 0; idx < ca->v.elements; idx++) {
+ atomic_long_add(pcount[idx], &ca->counters[idx]);
+ pcount[idx] = 0;
+ }
+ }
+ mutex_unlock(&counter_array_mutex);
+
+ return NOTIFY_OK;
+}
+
+static int __init counter_array_startup(void)
+{
+ hotcpu_notifier(counter_array_hotcpu_callback, 0);
+ return 0;
+}
+module_init(counter_array_startup);
2009/11/4 KAMEZAWA Hiroyuki <[email protected]>:
>
> Passed several tests and one bug was fixed since RFC version.
> This patch is against mmotm.
> =
> From: KAMEZAWA Hiroyuki <[email protected]>
>
> Now, anon_rss and file_rss is counted as RSS and exported via /proc.
> RSS usage is important information but one more information which
> is often asked by users is "usage of swap".(user support team said.)
>
> This patch counts swap entry usage per process and show it via
> /proc/<pid>/status. I think status file is robust against new entry.
> Then, it is the first candidate..
>
> ?After this, /proc/<pid>/status includes following line
> ?<snip>
> ?VmPeak: ? 315360 kB
> ?VmSize: ? 315360 kB
> ?VmLck: ? ? ? ? 0 kB
> ?VmHWM: ? ?180452 kB
> ?VmRSS: ? ?180452 kB
> ?VmData: ? 311624 kB
> ?VmStk: ? ? ? ?84 kB
> ?VmExe: ? ? ? ? 4 kB
> ?VmLib: ? ? ?1568 kB
> ?VmPTE: ? ? ? 640 kB
> ?VmSwap: ? 131240 kB <=== new information
>
> Note:
> ?Because this patch catches swap_pte on page table, this will
> ?not catch shmem's swapout. It's already accounted in per-shmem
> ?inode and we don't need to do more.
Sidenote: top(1) can show SWAP usage. but it is crazy buggy
implementation. it define
VIRT = SWAP + RES (see man top or actual source code). this patch help
to fix its insane
calculation.
Acked-by: KOSAKI Motohiro <[email protected]>
On Thu, 5 Nov 2009, KOSAKI Motohiro wrote:
> > On Wed, 4 Nov 2009, KAMEZAWA Hiroyuki wrote:
> >
> > > Now, anon_rss and file_rss is counted as RSS and exported via /proc.
> > > RSS usage is important information but one more information which
> > > is often asked by users is "usage of swap".(user support team said.)
> >
> > Hmmm... Could we do some rework of the counters first so that they are per
> > cpu?
>
> per-cpu swap counter?
> It seems overkill effort....
The other alternative is to use atomic ops which are significantly slower
and have an impact on critical sections.
Hi, Kame.
On Wed, Nov 4, 2009 at 3:24 PM, KAMEZAWA Hiroyuki
<[email protected]> wrote:
>
> Passed several tests and one bug was fixed since RFC version.
> This patch is against mmotm.
> =
> From: KAMEZAWA Hiroyuki <[email protected]>
>
> Now, anon_rss and file_rss is counted as RSS and exported via /proc.
> RSS usage is important information but one more information which
> is often asked by users is "usage of swap".(user support team said.)
>
> This patch counts swap entry usage per process and show it via
> /proc/<pid>/status. I think status file is robust against new entry.
> Then, it is the first candidate..
>
> ?After this, /proc/<pid>/status includes following line
> ?<snip>
> ?VmPeak: ? 315360 kB
> ?VmSize: ? 315360 kB
> ?VmLck: ? ? ? ? 0 kB
> ?VmHWM: ? ?180452 kB
> ?VmRSS: ? ?180452 kB
> ?VmData: ? 311624 kB
> ?VmStk: ? ? ? ?84 kB
> ?VmExe: ? ? ? ? 4 kB
> ?VmLib: ? ? ?1568 kB
> ?VmPTE: ? ? ? 640 kB
> ?VmSwap: ? 131240 kB <=== new information
>
> Note:
> ?Because this patch catches swap_pte on page table, this will
> ?not catch shmem's swapout. It's already accounted in per-shmem
> ?inode and we don't need to do more.
>
> Changelog: 2009/11/03
> ?- clean up.
> ?- fixed initialization bug at fork (init_mm())
>
> Acked-by: Acked-by; David Rientjes <[email protected]>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> ---
> ?fs/proc/task_mmu.c ? ? ? | ? ?9 ++++++---
> ?include/linux/mm_types.h | ? ?1 +
> ?kernel/fork.c ? ? ? ? ? ?| ? ?1 +
> ?mm/memory.c ? ? ? ? ? ? ?| ? 30 +++++++++++++++++++++---------
> ?mm/rmap.c ? ? ? ? ? ? ? ?| ? ?1 +
> ?mm/swapfile.c ? ? ? ? ? ?| ? ?1 +
> ?6 files changed, 31 insertions(+), 12 deletions(-)
>
> Index: mmotm-2.6.32-Nov2/include/linux/mm_types.h
> ===================================================================
> --- mmotm-2.6.32-Nov2.orig/include/linux/mm_types.h
> +++ mmotm-2.6.32-Nov2/include/linux/mm_types.h
> @@ -228,6 +228,7 @@ struct mm_struct {
> ? ? ? ? */
> ? ? ? ?mm_counter_t _file_rss;
> ? ? ? ?mm_counter_t _anon_rss;
> + ? ? ? mm_counter_t _swap_usage;
>
> ? ? ? ?unsigned long hiwater_rss; ? ? ?/* High-watermark of RSS usage */
> ? ? ? ?unsigned long hiwater_vm; ? ? ? /* High-water virtual memory usage */
> Index: mmotm-2.6.32-Nov2/mm/memory.c
> ===================================================================
> --- mmotm-2.6.32-Nov2.orig/mm/memory.c
> +++ mmotm-2.6.32-Nov2/mm/memory.c
> @@ -376,12 +376,15 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
> ? ? ? ?return 0;
> ?}
>
> -static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
> +static inline void
> +add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss, int swap_usage)
> ?{
> ? ? ? ?if (file_rss)
> ? ? ? ? ? ? ? ?add_mm_counter(mm, file_rss, file_rss);
> ? ? ? ?if (anon_rss)
> ? ? ? ? ? ? ? ?add_mm_counter(mm, anon_rss, anon_rss);
> + ? ? ? if (swap_usage)
> + ? ? ? ? ? ? ? add_mm_counter(mm, swap_usage, swap_usage);
> ?}
>
> ?/*
> @@ -597,7 +600,9 @@ copy_one_pte(struct mm_struct *dst_mm, s
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? &src_mm->mmlist);
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?spin_unlock(&mmlist_lock);
> ? ? ? ? ? ? ? ? ? ? ? ?}
> - ? ? ? ? ? ? ? ? ? ? ? if (is_write_migration_entry(entry) &&
> + ? ? ? ? ? ? ? ? ? ? ? if (!is_migration_entry(entry))
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? rss[2]++;
First thought I come to is that we believe !is_migration_entry(entry) equal
swap entry?
We began supporting HWPOISON.
HWPOISON would be rare event so some less exact swap accounting may
be allowed, I think. Is this enough to justify that?
> + ? ? ? ? ? ? ? ? ? ? ? else if (is_write_migration_entry(entry) &&
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?is_cow_mapping(vm_flags)) {
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?/*
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? * COW mappings require pages in both parent
> @@ -648,11 +653,11 @@ static int copy_pte_range(struct mm_stru
> ? ? ? ?pte_t *src_pte, *dst_pte;
> ? ? ? ?spinlock_t *src_ptl, *dst_ptl;
> ? ? ? ?int progress = 0;
> - ? ? ? int rss[2];
> + ? ? ? int rss[3];
> ? ? ? ?swp_entry_t entry = (swp_entry_t){0};
>
> ?again:
> - ? ? ? rss[1] = rss[0] = 0;
> + ? ? ? rss[2] = rss[1] = rss[0] = 0;
> ? ? ? ?dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
> ? ? ? ?if (!dst_pte)
> ? ? ? ? ? ? ? ?return -ENOMEM;
> @@ -688,7 +693,7 @@ again:
> ? ? ? ?arch_leave_lazy_mmu_mode();
> ? ? ? ?spin_unlock(src_ptl);
> ? ? ? ?pte_unmap_nested(orig_src_pte);
> - ? ? ? add_mm_rss(dst_mm, rss[0], rss[1]);
> + ? ? ? add_mm_rss(dst_mm, rss[0], rss[1], rss[2]);
> ? ? ? ?pte_unmap_unlock(orig_dst_pte, dst_ptl);
> ? ? ? ?cond_resched();
>
> @@ -818,6 +823,7 @@ static unsigned long zap_pte_range(struc
> ? ? ? ?spinlock_t *ptl;
> ? ? ? ?int file_rss = 0;
> ? ? ? ?int anon_rss = 0;
> + ? ? ? int swap_usage = 0;
>
> ? ? ? ?pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
> ? ? ? ?arch_enter_lazy_mmu_mode();
> @@ -887,13 +893,18 @@ static unsigned long zap_pte_range(struc
> ? ? ? ? ? ? ? ?if (pte_file(ptent)) {
> ? ? ? ? ? ? ? ? ? ? ? ?if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?print_bad_pte(vma, addr, ptent, NULL);
> - ? ? ? ? ? ? ? } else if
> - ? ? ? ? ? ? ? ? (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
> - ? ? ? ? ? ? ? ? ? ? ? print_bad_pte(vma, addr, ptent, NULL);
> + ? ? ? ? ? ? ? } else {
> + ? ? ? ? ? ? ? ? ? ? ? swp_entry_t ent = pte_to_swp_entry(ptent);
> +
> + ? ? ? ? ? ? ? ? ? ? ? if (!is_migration_entry(ent))
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? swap_usage--;
ditto
> + ? ? ? ? ? ? ? ? ? ? ? if (unlikely(!free_swap_and_cache(ent)))
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? print_bad_pte(vma, addr, ptent, NULL);
> + ? ? ? ? ? ? ? }
> ? ? ? ? ? ? ? ?pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
> ? ? ? ?} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
>
> - ? ? ? add_mm_rss(mm, file_rss, anon_rss);
> + ? ? ? add_mm_rss(mm, file_rss, anon_rss, swap_usage);
> ? ? ? ?arch_leave_lazy_mmu_mode();
> ? ? ? ?pte_unmap_unlock(pte - 1, ptl);
>
> @@ -2595,6 +2606,7 @@ static int do_swap_page(struct mm_struct
> ? ? ? ? */
>
> ? ? ? ?inc_mm_counter(mm, anon_rss);
> + ? ? ? dec_mm_counter(mm, swap_usage);
> ? ? ? ?pte = mk_pte(page, vma->vm_page_prot);
> ? ? ? ?if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
> ? ? ? ? ? ? ? ?pte = maybe_mkwrite(pte_mkdirty(pte), vma);
> Index: mmotm-2.6.32-Nov2/mm/swapfile.c
> ===================================================================
> --- mmotm-2.6.32-Nov2.orig/mm/swapfile.c
> +++ mmotm-2.6.32-Nov2/mm/swapfile.c
> @@ -837,6 +837,7 @@ static int unuse_pte(struct vm_area_stru
> ? ? ? ?}
>
> ? ? ? ?inc_mm_counter(vma->vm_mm, anon_rss);
> + ? ? ? dec_mm_counter(vma->vm_mm, swap_usage);
> ? ? ? ?get_page(page);
> ? ? ? ?set_pte_at(vma->vm_mm, addr, pte,
> ? ? ? ? ? ? ? ? ? pte_mkold(mk_pte(page, vma->vm_page_prot)));
> Index: mmotm-2.6.32-Nov2/fs/proc/task_mmu.c
> ===================================================================
> --- mmotm-2.6.32-Nov2.orig/fs/proc/task_mmu.c
> +++ mmotm-2.6.32-Nov2/fs/proc/task_mmu.c
> @@ -17,7 +17,7 @@
> ?void task_mem(struct seq_file *m, struct mm_struct *mm)
> ?{
> ? ? ? ?unsigned long data, text, lib;
> - ? ? ? unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
> + ? ? ? unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss, swap;
>
> ? ? ? ?/*
> ? ? ? ? * Note: to minimize their overhead, mm maintains hiwater_vm and
> @@ -36,6 +36,7 @@ void task_mem(struct seq_file *m, struct
> ? ? ? ?data = mm->total_vm - mm->shared_vm - mm->stack_vm;
> ? ? ? ?text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
> ? ? ? ?lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
> + ? ? ? swap = get_mm_counter(mm, swap_usage);
> ? ? ? ?seq_printf(m,
> ? ? ? ? ? ? ? ?"VmPeak:\t%8lu kB\n"
> ? ? ? ? ? ? ? ?"VmSize:\t%8lu kB\n"
> @@ -46,7 +47,8 @@ void task_mem(struct seq_file *m, struct
> ? ? ? ? ? ? ? ?"VmStk:\t%8lu kB\n"
> ? ? ? ? ? ? ? ?"VmExe:\t%8lu kB\n"
> ? ? ? ? ? ? ? ?"VmLib:\t%8lu kB\n"
> - ? ? ? ? ? ? ? "VmPTE:\t%8lu kB\n",
> + ? ? ? ? ? ? ? "VmPTE:\t%8lu kB\n"
> + ? ? ? ? ? ? ? "VmSwap:\t%8lu kB\n",
> ? ? ? ? ? ? ? ?hiwater_vm << (PAGE_SHIFT-10),
> ? ? ? ? ? ? ? ?(total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
> ? ? ? ? ? ? ? ?mm->locked_vm << (PAGE_SHIFT-10),
> @@ -54,7 +56,8 @@ void task_mem(struct seq_file *m, struct
> ? ? ? ? ? ? ? ?total_rss << (PAGE_SHIFT-10),
> ? ? ? ? ? ? ? ?data << (PAGE_SHIFT-10),
> ? ? ? ? ? ? ? ?mm->stack_vm << (PAGE_SHIFT-10), text, lib,
> - ? ? ? ? ? ? ? (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
> + ? ? ? ? ? ? ? (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
> + ? ? ? ? ? ? ? swap << (PAGE_SHIFT - 10));
> ?}
>
> ?unsigned long task_vsize(struct mm_struct *mm)
> Index: mmotm-2.6.32-Nov2/mm/rmap.c
> ===================================================================
> --- mmotm-2.6.32-Nov2.orig/mm/rmap.c
> +++ mmotm-2.6.32-Nov2/mm/rmap.c
> @@ -834,6 +834,7 @@ static int try_to_unmap_one(struct page
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?spin_unlock(&mmlist_lock);
> ? ? ? ? ? ? ? ? ? ? ? ?}
> ? ? ? ? ? ? ? ? ? ? ? ?dec_mm_counter(mm, anon_rss);
> + ? ? ? ? ? ? ? ? ? ? ? inc_mm_counter(mm, swap_usage);
> ? ? ? ? ? ? ? ?} else if (PAGE_MIGRATION) {
> ? ? ? ? ? ? ? ? ? ? ? ?/*
> ? ? ? ? ? ? ? ? ? ? ? ? * Store the pfn of the page in a special migration
> Index: mmotm-2.6.32-Nov2/kernel/fork.c
> ===================================================================
> --- mmotm-2.6.32-Nov2.orig/kernel/fork.c
> +++ mmotm-2.6.32-Nov2/kernel/fork.c
> @@ -454,6 +454,7 @@ static struct mm_struct * mm_init(struct
> ? ? ? ?mm->nr_ptes = 0;
> ? ? ? ?set_mm_counter(mm, file_rss, 0);
> ? ? ? ?set_mm_counter(mm, anon_rss, 0);
> + ? ? ? set_mm_counter(mm, swap_usage, 0);
> ? ? ? ?spin_lock_init(&mm->page_table_lock);
> ? ? ? ?mm->free_area_cache = TASK_UNMAPPED_BASE;
> ? ? ? ?mm->cached_hole_size = ~0UL;
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to [email protected]. ?For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
>
That's good.
>From now on, we can change scanning of pte to find swap pte
in smaps_pte_range, too. :)
--
Kind regards,
Minchan Kim
On Thu, 5 Nov 2009, KAMEZAWA Hiroyuki wrote:
> +static inline void
> +counter_array_add(struct counter_array *ca, int idx, int val)
> +{
> + ca->counters[idx] += val;
> +}
This is not a per cpu operation and therefore expensive. The new percpu
this_cpu_inc f.e. generates a single x86 instruction for an increment.
> +void __counter_array_add(struct counter_array *ca, int idx, int val, int batch)
> +{
> + long count, *pcount;
> +
> + preempt_disable();
> +
> + pcount = this_cpu_ptr(ca->v.array);
> + count = pcount[idx] + val;
> + if (!ca->v.nosync && ((count > batch) || (count < -batch))) {
> + atomic_long_add(count, &ca->counters[idx]);
> + pcount[idx] = 0;
> + } else
> + pcount[idx] = count;
> + preempt_enable();
> +}
Too expensive to use in critical VM paths. The percpu operations generate
a single instruction instead of the code above. No need for preempt etc.
On Thu, 5 Nov 2009, KAMEZAWA Hiroyuki wrote:
> Anothter major percpu coutner is vm_stat[]. This patch implements
> vm_stat[] style counter array in lib/percpu_counter.c
> This is designed for introducing vm_stat[] style counter to memcg,
> but maybe useful for other people. By using this, counter array
> using percpu can be implemented easily in compact structure.
Note that vm_stat support was written that way because we have extreme
space constraints due to the need to keep statistics per zone and per cpu
and avoid cache line pressure that would result through the use of big
integer arrays per zone and per cpu. For a large number of zones and cpus
this is disastrous.
If you only need to keep statistics per cpu for an entity then the vmstat
approach is overkill. A per cpu allocation of a counter is enough.
On Fri, 6 Nov 2009 00:11:32 +0900
Minchan Kim <[email protected]> wrote:
> Hi, Kame.
>
Hi,
<snip>
> > /*
> > @@ -597,7 +600,9 @@ copy_one_pte(struct mm_struct *dst_mm, s
> > &src_mm->mmlist);
> > spin_unlock(&mmlist_lock);
> > }
> > - if (is_write_migration_entry(entry) &&
> > + if (!is_migration_entry(entry))
> > + rss[2]++;
>
> First thought I come to is that we believe !is_migration_entry(entry) equal
> swap entry?
> We began supporting HWPOISON.
> HWPOISON would be rare event so some less exact swap accounting may
> be allowed, I think. Is this enough to justify that?
>
Ah, ok, I'll fix here.
> > + else if (is_write_migration_entry(entry) &&
> > is_cow_mapping(vm_flags)) {
> > /*
> > * COW mappings require pages in both parent
> > @@ -648,11 +653,11 @@ static int copy_pte_range(struct mm_stru
> > pte_t *src_pte, *dst_pte;
> > spinlock_t *src_ptl, *dst_ptl;
> > int progress = 0;
> > - int rss[2];
> > + int rss[3];
> > swp_entry_t entry = (swp_entry_t){0};
> >
> > again:
> > - rss[1] = rss[0] = 0;
> > + rss[2] = rss[1] = rss[0] = 0;
> > dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
> > if (!dst_pte)
> > return -ENOMEM;
> > @@ -688,7 +693,7 @@ again:
> > arch_leave_lazy_mmu_mode();
> > spin_unlock(src_ptl);
> > pte_unmap_nested(orig_src_pte);
> > - add_mm_rss(dst_mm, rss[0], rss[1]);
> > + add_mm_rss(dst_mm, rss[0], rss[1], rss[2]);
> > pte_unmap_unlock(orig_dst_pte, dst_ptl);
> > cond_resched();
> >
> > @@ -818,6 +823,7 @@ static unsigned long zap_pte_range(struc
> > spinlock_t *ptl;
> > int file_rss = 0;
> > int anon_rss = 0;
> > + int swap_usage = 0;
> >
> > pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
> > arch_enter_lazy_mmu_mode();
> > @@ -887,13 +893,18 @@ static unsigned long zap_pte_range(struc
> > if (pte_file(ptent)) {
> > if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
> > print_bad_pte(vma, addr, ptent, NULL);
> > - } else if
> > - (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
> > - print_bad_pte(vma, addr, ptent, NULL);
> > + } else {
> > + swp_entry_t ent = pte_to_swp_entry(ptent);
> > +
> > + if (!is_migration_entry(ent))
> > + swap_usage--;
>
> ditto
>
ok, will do.
> > + if (unlikely(!free_swap_and_cache(ent)))
> > + print_bad_pte(vma, addr, ptent, NULL);
> > + }
> > pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
> > } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
> >
> > - add_mm_rss(mm, file_rss, anon_rss);
> > + add_mm_rss(mm, file_rss, anon_rss, swap_usage);
> > arch_leave_lazy_mmu_mode();
> > pte_unmap_unlock(pte - 1, ptl);
> >
> > @@ -2595,6 +2606,7 @@ static int do_swap_page(struct mm_struct
> > */
> >
> > inc_mm_counter(mm, anon_rss);
> > + dec_mm_counter(mm, swap_usage);
> > pte = mk_pte(page, vma->vm_page_prot);
> > if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
> > pte = maybe_mkwrite(pte_mkdirty(pte), vma);
> > Index: mmotm-2.6.32-Nov2/mm/swapfile.c
> > ===================================================================
> > --- mmotm-2.6.32-Nov2.orig/mm/swapfile.c
> > +++ mmotm-2.6.32-Nov2/mm/swapfile.c
> > @@ -837,6 +837,7 @@ static int unuse_pte(struct vm_area_stru
> > }
> >
> > inc_mm_counter(vma->vm_mm, anon_rss);
> > + dec_mm_counter(vma->vm_mm, swap_usage);
> > get_page(page);
> > set_pte_at(vma->vm_mm, addr, pte,
> > pte_mkold(mk_pte(page, vma->vm_page_prot)));
> > Index: mmotm-2.6.32-Nov2/fs/proc/task_mmu.c
> > ===================================================================
> > --- mmotm-2.6.32-Nov2.orig/fs/proc/task_mmu.c
> > +++ mmotm-2.6.32-Nov2/fs/proc/task_mmu.c
> > @@ -17,7 +17,7 @@
> > void task_mem(struct seq_file *m, struct mm_struct *mm)
> > {
> > unsigned long data, text, lib;
> > - unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
> > + unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss, swap;
> >
> > /*
> > * Note: to minimize their overhead, mm maintains hiwater_vm and
> > @@ -36,6 +36,7 @@ void task_mem(struct seq_file *m, struct
> > data = mm->total_vm - mm->shared_vm - mm->stack_vm;
> > text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
> > lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
> > + swap = get_mm_counter(mm, swap_usage);
> > seq_printf(m,
> > "VmPeak:\t%8lu kB\n"
> > "VmSize:\t%8lu kB\n"
> > @@ -46,7 +47,8 @@ void task_mem(struct seq_file *m, struct
> > "VmStk:\t%8lu kB\n"
> > "VmExe:\t%8lu kB\n"
> > "VmLib:\t%8lu kB\n"
> > - "VmPTE:\t%8lu kB\n",
> > + "VmPTE:\t%8lu kB\n"
> > + "VmSwap:\t%8lu kB\n",
> > hiwater_vm << (PAGE_SHIFT-10),
> > (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
> > mm->locked_vm << (PAGE_SHIFT-10),
> > @@ -54,7 +56,8 @@ void task_mem(struct seq_file *m, struct
> > total_rss << (PAGE_SHIFT-10),
> > data << (PAGE_SHIFT-10),
> > mm->stack_vm << (PAGE_SHIFT-10), text, lib,
> > - (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
> > + (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
> > + swap << (PAGE_SHIFT - 10));
> > }
> >
> > unsigned long task_vsize(struct mm_struct *mm)
> > Index: mmotm-2.6.32-Nov2/mm/rmap.c
> > ===================================================================
> > --- mmotm-2.6.32-Nov2.orig/mm/rmap.c
> > +++ mmotm-2.6.32-Nov2/mm/rmap.c
> > @@ -834,6 +834,7 @@ static int try_to_unmap_one(struct page
> > spin_unlock(&mmlist_lock);
> > }
> > dec_mm_counter(mm, anon_rss);
> > + inc_mm_counter(mm, swap_usage);
> > } else if (PAGE_MIGRATION) {
> > /*
> > * Store the pfn of the page in a special migration
> > Index: mmotm-2.6.32-Nov2/kernel/fork.c
> > ===================================================================
> > --- mmotm-2.6.32-Nov2.orig/kernel/fork.c
> > +++ mmotm-2.6.32-Nov2/kernel/fork.c
> > @@ -454,6 +454,7 @@ static struct mm_struct * mm_init(struct
> > mm->nr_ptes = 0;
> > set_mm_counter(mm, file_rss, 0);
> > set_mm_counter(mm, anon_rss, 0);
> > + set_mm_counter(mm, swap_usage, 0);
> > spin_lock_init(&mm->page_table_lock);
> > mm->free_area_cache = TASK_UNMAPPED_BASE;
> > mm->cached_hole_size = ~0UL;
> >
> > --
> > To unsubscribe, send a message with 'unsubscribe linux-mm' in
> > the body to [email protected]. For more info on Linux MM,
> > see: http://www.linux-mm.org/ .
> > Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
> >
>
> That's good.
> From now on, we can change scanning of pte to find swap pte
> in smaps_pte_range, too. :)
>
Thanks, I'll update this.
-Kame
Thank you for review.
On Thu, 5 Nov 2009 10:15:36 -0500 (EST)
Christoph Lameter <[email protected]> wrote:
> On Thu, 5 Nov 2009, KAMEZAWA Hiroyuki wrote:
>
> > +static inline void
> > +counter_array_add(struct counter_array *ca, int idx, int val)
> > +{
> > + ca->counters[idx] += val;
> > +}
>
> This is not a per cpu operation and therefore expensive. The new percpu
> this_cpu_inc f.e. generates a single x86 instruction for an increment.
>
This code is for !SMP.
> > +void __counter_array_add(struct counter_array *ca, int idx, int val, int batch)
> > +{
> > + long count, *pcount;
> > +
> > + preempt_disable();
> > +
> > + pcount = this_cpu_ptr(ca->v.array);
> > + count = pcount[idx] + val;
> > + if (!ca->v.nosync && ((count > batch) || (count < -batch))) {
> > + atomic_long_add(count, &ca->counters[idx]);
> > + pcount[idx] = 0;
> > + } else
> > + pcount[idx] = count;
> > + preempt_enable();
> > +}
>
> Too expensive to use in critical VM paths. The percpu operations generate
> a single instruction instead of the code above. No need for preempt etc.
>
Hmm, ok. I'll have to see your patch, more.
I wonder how to use indexed-array and ops like add_return..
Thanks,
-Kame
On Thu, 5 Nov 2009 10:20:18 -0500 (EST)
Christoph Lameter <[email protected]> wrote:
> On Thu, 5 Nov 2009, KAMEZAWA Hiroyuki wrote:
>
> > Another major percpu counter is vm_stat[]. This patch implements
> > vm_stat[] style counter array in lib/percpu_counter.c
> > This is designed for introducing vm_stat[] style counter to memcg,
> > but maybe useful for other people. By using this, counter array
> > using percpu can be implemented easily in compact structure.
>
>
> Note that vm_stat support was written that way because we have extreme
> space constraints due to the need to keep statistics per zone and per cpu
> and avoid cache line pressure that would result through the use of big
> integer arrays per zone and per cpu. For a large number of zones and cpus
> this is disastrous.
>
> If you only need to keep statistics per cpu for an entity then the vmstat
> approach is overkill. A per cpu allocation of a counter is enough.
>
counter per memcg is required.
Memcg uses its own one but I want to remove it. (it doesn't consider memory
placement.)
What I can use under /lib is percpu_counter, but it's really overkill.
My concern on pure percpu counter is "read" side.
Now, we read counters only via status file and sometimes vmscan will read it.
For supporting dirty_ratio, we need to read them more.
I'll check I can move it to pure percpu counter as you do in mm_counters and
see how read side is affected by for_each_possible_cpu(). Anyway, it's
better than current one.
Thanks,
-Kame
From: KAMEZAWA Hiroyuki <[email protected]>
Now, anon_rss and file_rss is counted as RSS and exported via /proc.
RSS usage is important information but one more information which
is often asked by users is "usage of swap".(user support team said.)
This patch counts swap entry usage per process and show it via
/proc/<pid>/status. I think status file is robust against new entry.
Then, it is the first candidate..
After this, /proc/<pid>/status includes following line
<snip>
VmPeak: 315360 kB
VmSize: 315360 kB
VmLck: 0 kB
VmHWM: 180452 kB
VmRSS: 180452 kB
VmData: 311624 kB
VmStk: 84 kB
VmExe: 4 kB
VmLib: 1568 kB
VmPTE: 640 kB
VmSwap: 131240 kB <=== new information
Note:
Because this patch catches swap_pte on page table, this will
not catch shmem's swapout. It's already accounted in per-shmem
inode and we don't need to do more.
Changelog: 2009/11/06
- fixed bad use of is_migration_entry. Now, non_swap_entry() is used.
Changelog: 2009/11/03
- clean up.
- fixed initialization bug at fork (init_mm())
Acked-by: David Rientjes <[email protected]>
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
fs/proc/task_mmu.c | 9 ++++++---
include/linux/mm_types.h | 1 +
kernel/fork.c | 1 +
mm/memory.c | 30 +++++++++++++++++++++---------
mm/rmap.c | 1 +
mm/swapfile.c | 1 +
6 files changed, 31 insertions(+), 12 deletions(-)
Index: mmotm-2.6.32-Nov2/include/linux/mm_types.h
===================================================================
--- mmotm-2.6.32-Nov2.orig/include/linux/mm_types.h
+++ mmotm-2.6.32-Nov2/include/linux/mm_types.h
@@ -228,6 +228,7 @@ struct mm_struct {
*/
mm_counter_t _file_rss;
mm_counter_t _anon_rss;
+ mm_counter_t _swap_usage;
unsigned long hiwater_rss; /* High-watermark of RSS usage */
unsigned long hiwater_vm; /* High-water virtual memory usage */
Index: mmotm-2.6.32-Nov2/mm/memory.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/mm/memory.c
+++ mmotm-2.6.32-Nov2/mm/memory.c
@@ -376,12 +376,15 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
return 0;
}
-static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
+static inline void
+add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss, int swap_usage)
{
if (file_rss)
add_mm_counter(mm, file_rss, file_rss);
if (anon_rss)
add_mm_counter(mm, anon_rss, anon_rss);
+ if (swap_usage)
+ add_mm_counter(mm, swap_usage, swap_usage);
}
/*
@@ -597,7 +600,9 @@ copy_one_pte(struct mm_struct *dst_mm, s
&src_mm->mmlist);
spin_unlock(&mmlist_lock);
}
- if (is_write_migration_entry(entry) &&
+ if (!non_swap_entry(entry))
+ rss[2]++;
+ else if (is_write_migration_entry(entry) &&
is_cow_mapping(vm_flags)) {
/*
* COW mappings require pages in both parent
@@ -648,11 +653,11 @@ static int copy_pte_range(struct mm_stru
pte_t *src_pte, *dst_pte;
spinlock_t *src_ptl, *dst_ptl;
int progress = 0;
- int rss[2];
+ int rss[3];
swp_entry_t entry = (swp_entry_t){0};
again:
- rss[1] = rss[0] = 0;
+ rss[2] = rss[1] = rss[0] = 0;
dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
if (!dst_pte)
return -ENOMEM;
@@ -688,7 +693,7 @@ again:
arch_leave_lazy_mmu_mode();
spin_unlock(src_ptl);
pte_unmap_nested(orig_src_pte);
- add_mm_rss(dst_mm, rss[0], rss[1]);
+ add_mm_rss(dst_mm, rss[0], rss[1], rss[2]);
pte_unmap_unlock(orig_dst_pte, dst_ptl);
cond_resched();
@@ -818,6 +823,7 @@ static unsigned long zap_pte_range(struc
spinlock_t *ptl;
int file_rss = 0;
int anon_rss = 0;
+ int swap_usage = 0;
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
arch_enter_lazy_mmu_mode();
@@ -887,13 +893,18 @@ static unsigned long zap_pte_range(struc
if (pte_file(ptent)) {
if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
print_bad_pte(vma, addr, ptent, NULL);
- } else if
- (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
- print_bad_pte(vma, addr, ptent, NULL);
+ } else {
+ swp_entry_t ent = pte_to_swp_entry(ptent);
+
+ if (!non_swap_entry(ent))
+ swap_usage--;
+ if (unlikely(!free_swap_and_cache(ent)))
+ print_bad_pte(vma, addr, ptent, NULL);
+ }
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
- add_mm_rss(mm, file_rss, anon_rss);
+ add_mm_rss(mm, file_rss, anon_rss, swap_usage);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
@@ -2595,6 +2606,7 @@ static int do_swap_page(struct mm_struct
*/
inc_mm_counter(mm, anon_rss);
+ dec_mm_counter(mm, swap_usage);
pte = mk_pte(page, vma->vm_page_prot);
if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
Index: mmotm-2.6.32-Nov2/mm/swapfile.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/mm/swapfile.c
+++ mmotm-2.6.32-Nov2/mm/swapfile.c
@@ -837,6 +837,7 @@ static int unuse_pte(struct vm_area_stru
}
inc_mm_counter(vma->vm_mm, anon_rss);
+ dec_mm_counter(vma->vm_mm, swap_usage);
get_page(page);
set_pte_at(vma->vm_mm, addr, pte,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
Index: mmotm-2.6.32-Nov2/fs/proc/task_mmu.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/fs/proc/task_mmu.c
+++ mmotm-2.6.32-Nov2/fs/proc/task_mmu.c
@@ -17,7 +17,7 @@
void task_mem(struct seq_file *m, struct mm_struct *mm)
{
unsigned long data, text, lib;
- unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
+ unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss, swap;
/*
* Note: to minimize their overhead, mm maintains hiwater_vm and
@@ -36,6 +36,7 @@ void task_mem(struct seq_file *m, struct
data = mm->total_vm - mm->shared_vm - mm->stack_vm;
text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
+ swap = get_mm_counter(mm, swap_usage);
seq_printf(m,
"VmPeak:\t%8lu kB\n"
"VmSize:\t%8lu kB\n"
@@ -46,7 +47,8 @@ void task_mem(struct seq_file *m, struct
"VmStk:\t%8lu kB\n"
"VmExe:\t%8lu kB\n"
"VmLib:\t%8lu kB\n"
- "VmPTE:\t%8lu kB\n",
+ "VmPTE:\t%8lu kB\n"
+ "VmSwap:\t%8lu kB\n",
hiwater_vm << (PAGE_SHIFT-10),
(total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
mm->locked_vm << (PAGE_SHIFT-10),
@@ -54,7 +56,8 @@ void task_mem(struct seq_file *m, struct
total_rss << (PAGE_SHIFT-10),
data << (PAGE_SHIFT-10),
mm->stack_vm << (PAGE_SHIFT-10), text, lib,
- (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
+ (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
+ swap << (PAGE_SHIFT - 10));
}
unsigned long task_vsize(struct mm_struct *mm)
Index: mmotm-2.6.32-Nov2/mm/rmap.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/mm/rmap.c
+++ mmotm-2.6.32-Nov2/mm/rmap.c
@@ -834,6 +834,7 @@ static int try_to_unmap_one(struct page
spin_unlock(&mmlist_lock);
}
dec_mm_counter(mm, anon_rss);
+ inc_mm_counter(mm, swap_usage);
} else if (PAGE_MIGRATION) {
/*
* Store the pfn of the page in a special migration
Index: mmotm-2.6.32-Nov2/kernel/fork.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/kernel/fork.c
+++ mmotm-2.6.32-Nov2/kernel/fork.c
@@ -454,6 +454,7 @@ static struct mm_struct * mm_init(struct
mm->nr_ptes = 0;
set_mm_counter(mm, file_rss, 0);
set_mm_counter(mm, anon_rss, 0);
+ set_mm_counter(mm, swap_usage, 0);
spin_lock_init(&mm->page_table_lock);
mm->free_area_cache = TASK_UNMAPPED_BASE;
mm->cached_hole_size = ~0UL;
On Fri, Nov 6, 2009 at 1:40 PM, KAMEZAWA Hiroyuki
<[email protected]> wrote:
> From: KAMEZAWA Hiroyuki <[email protected]>
>
> Now, anon_rss and file_rss is counted as RSS and exported via /proc.
> RSS usage is important information but one more information which
> is often asked by users is "usage of swap".(user support team said.)
>
> This patch counts swap entry usage per process and show it via
> /proc/<pid>/status. I think status file is robust against new entry.
> Then, it is the first candidate..
>
>  After this, /proc/<pid>/status includes following line
>  <snip>
>  VmPeak:   315360 kB
>  VmSize:   315360 kB
>  VmLck:         0 kB
>  VmHWM:    180452 kB
>  VmRSS:    180452 kB
>  VmData:   311624 kB
>  VmStk:        84 kB
>  VmExe:         4 kB
>  VmLib:      1568 kB
>  VmPTE:       640 kB
>  VmSwap:   131240 kB <=== new information
>
> Note:
>  Because this patch catches swap_pte on page table, this will
>  not catch shmem's swapout. It's already accounted in per-shmem
>  inode and we don't need to do more.
>
> Changelog: 2009/11/06
>  - fixed bad use of is_migration_entry. Now, non_swap_entry() is used.
> Changelog: 2009/11/03
>  - clean up.
>  - fixed initialization bug at fork (init_mm())
>
> Acked-by: David Rientjes <[email protected]>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
Reviewed-by: Minchan Kim <[email protected]>
--
Kind regards,
Minchan Kim
On Thu 2009-11-05 10:04:01, Christoph Lameter wrote:
> On Thu, 5 Nov 2009, KOSAKI Motohiro wrote:
>
> > > On Wed, 4 Nov 2009, KAMEZAWA Hiroyuki wrote:
> > >
> > > > Now, anon_rss and file_rss is counted as RSS and exported via /proc.
> > > > RSS usage is important information but one more information which
> > > > is often asked by users is "usage of swap".(user support team said.)
> > >
> > > Hmmm... Could we do some rework of the counters first so that they are per
> > > cpu?
> >
> > per-cpu swap counter?
> > It seems overkill effort....
>
> The other alternative is to use atomic ops which are significantly slower
> and have an impact on critical sections.
...but compared to disk i/o, overhead should be almost zero, right?
Keep it simple...
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
Updated Documentation/filesystems/proc.txt
==
From: KAMEZAWA Hiroyuki <[email protected]>
Now, anon_rss and file_rss is counted as RSS and exported via /proc.
RSS usage is important information but one more information which
is often asked by users is "usage of swap".(user support team said.)
This patch counts swap entry usage per process and show it via
/proc/<pid>/status. I think status file is robust against new entry.
Then, it is the first candidate..
After this, /proc/<pid>/status includes following line
<snip>
VmPeak: 315360 kB
VmSize: 315360 kB
VmLck: 0 kB
VmHWM: 180452 kB
VmRSS: 180452 kB
VmData: 311624 kB
VmStk: 84 kB
VmExe: 4 kB
VmLib: 1568 kB
VmPTE: 640 kB
VmSwap: 131240 kB <=== new information
Note:
Because this patch catches swap_pte on page table, this will
not catch shmem's swapout. It's already accounted in per-shmem
inode and we don't need to do more.
Changelog: 2009/11/11
- added an update for Documentation/filesystems/proc.txt
Changelog: 2009/11/06
- fixed bad use of is_migration_entry. Now, non_swap_entry() is used.
Changelog: 2009/11/03
- clean up.
- fixed initialization bug at fork (init_mm())
Reviewed-by: Minchan Kim <[email protected]>
Acked-by: David Rientjes <[email protected]>
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
Documentation/filesystems/proc.txt | 2 ++
fs/proc/task_mmu.c | 9 ++++++---
include/linux/mm_types.h | 1 +
kernel/fork.c | 1 +
mm/memory.c | 30 +++++++++++++++++++++---------
mm/rmap.c | 1 +
mm/swapfile.c | 1 +
7 files changed, 33 insertions(+), 12 deletions(-)
Index: mm-test-kernel/include/linux/mm_types.h
===================================================================
--- mm-test-kernel.orig/include/linux/mm_types.h
+++ mm-test-kernel/include/linux/mm_types.h
@@ -228,6 +228,7 @@ struct mm_struct {
*/
mm_counter_t _file_rss;
mm_counter_t _anon_rss;
+ mm_counter_t _swap_usage;
unsigned long hiwater_rss; /* High-watermark of RSS usage */
unsigned long hiwater_vm; /* High-water virtual memory usage */
Index: mm-test-kernel/mm/memory.c
===================================================================
--- mm-test-kernel.orig/mm/memory.c
+++ mm-test-kernel/mm/memory.c
@@ -376,12 +376,15 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
return 0;
}
-static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
+static inline void
+add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss, int swap_usage)
{
if (file_rss)
add_mm_counter(mm, file_rss, file_rss);
if (anon_rss)
add_mm_counter(mm, anon_rss, anon_rss);
+ if (swap_usage)
+ add_mm_counter(mm, swap_usage, swap_usage);
}
/*
@@ -597,7 +600,9 @@ copy_one_pte(struct mm_struct *dst_mm, s
&src_mm->mmlist);
spin_unlock(&mmlist_lock);
}
- if (is_write_migration_entry(entry) &&
+ if (!non_swap_entry(entry))
+ rss[2]++;
+ else if (is_write_migration_entry(entry) &&
is_cow_mapping(vm_flags)) {
/*
* COW mappings require pages in both parent
@@ -648,11 +653,11 @@ static int copy_pte_range(struct mm_stru
pte_t *src_pte, *dst_pte;
spinlock_t *src_ptl, *dst_ptl;
int progress = 0;
- int rss[2];
+ int rss[3];
swp_entry_t entry = (swp_entry_t){0};
again:
- rss[1] = rss[0] = 0;
+ rss[2] = rss[1] = rss[0] = 0;
dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
if (!dst_pte)
return -ENOMEM;
@@ -688,7 +693,7 @@ again:
arch_leave_lazy_mmu_mode();
spin_unlock(src_ptl);
pte_unmap_nested(orig_src_pte);
- add_mm_rss(dst_mm, rss[0], rss[1]);
+ add_mm_rss(dst_mm, rss[0], rss[1], rss[2]);
pte_unmap_unlock(orig_dst_pte, dst_ptl);
cond_resched();
@@ -818,6 +823,7 @@ static unsigned long zap_pte_range(struc
spinlock_t *ptl;
int file_rss = 0;
int anon_rss = 0;
+ int swap_usage = 0;
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
arch_enter_lazy_mmu_mode();
@@ -887,13 +893,18 @@ static unsigned long zap_pte_range(struc
if (pte_file(ptent)) {
if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
print_bad_pte(vma, addr, ptent, NULL);
- } else if
- (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
- print_bad_pte(vma, addr, ptent, NULL);
+ } else {
+ swp_entry_t ent = pte_to_swp_entry(ptent);
+
+ if (!non_swap_entry(ent))
+ swap_usage--;
+ if (unlikely(!free_swap_and_cache(ent)))
+ print_bad_pte(vma, addr, ptent, NULL);
+ }
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
- add_mm_rss(mm, file_rss, anon_rss);
+ add_mm_rss(mm, file_rss, anon_rss, swap_usage);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
@@ -2595,6 +2606,7 @@ static int do_swap_page(struct mm_struct
*/
inc_mm_counter(mm, anon_rss);
+ dec_mm_counter(mm, swap_usage);
pte = mk_pte(page, vma->vm_page_prot);
if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
Index: mm-test-kernel/mm/swapfile.c
===================================================================
--- mm-test-kernel.orig/mm/swapfile.c
+++ mm-test-kernel/mm/swapfile.c
@@ -837,6 +837,7 @@ static int unuse_pte(struct vm_area_stru
}
inc_mm_counter(vma->vm_mm, anon_rss);
+ dec_mm_counter(vma->vm_mm, swap_usage);
get_page(page);
set_pte_at(vma->vm_mm, addr, pte,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
Index: mm-test-kernel/fs/proc/task_mmu.c
===================================================================
--- mm-test-kernel.orig/fs/proc/task_mmu.c
+++ mm-test-kernel/fs/proc/task_mmu.c
@@ -17,7 +17,7 @@
void task_mem(struct seq_file *m, struct mm_struct *mm)
{
unsigned long data, text, lib;
- unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
+ unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss, swap;
/*
* Note: to minimize their overhead, mm maintains hiwater_vm and
@@ -36,6 +36,7 @@ void task_mem(struct seq_file *m, struct
data = mm->total_vm - mm->shared_vm - mm->stack_vm;
text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
+ swap = get_mm_counter(mm, swap_usage);
seq_printf(m,
"VmPeak:\t%8lu kB\n"
"VmSize:\t%8lu kB\n"
@@ -46,7 +47,8 @@ void task_mem(struct seq_file *m, struct
"VmStk:\t%8lu kB\n"
"VmExe:\t%8lu kB\n"
"VmLib:\t%8lu kB\n"
- "VmPTE:\t%8lu kB\n",
+ "VmPTE:\t%8lu kB\n"
+ "VmSwap:\t%8lu kB\n",
hiwater_vm << (PAGE_SHIFT-10),
(total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
mm->locked_vm << (PAGE_SHIFT-10),
@@ -54,7 +56,8 @@ void task_mem(struct seq_file *m, struct
total_rss << (PAGE_SHIFT-10),
data << (PAGE_SHIFT-10),
mm->stack_vm << (PAGE_SHIFT-10), text, lib,
- (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
+ (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
+ swap << (PAGE_SHIFT - 10));
}
unsigned long task_vsize(struct mm_struct *mm)
Index: mm-test-kernel/mm/rmap.c
===================================================================
--- mm-test-kernel.orig/mm/rmap.c
+++ mm-test-kernel/mm/rmap.c
@@ -834,6 +834,7 @@ static int try_to_unmap_one(struct page
spin_unlock(&mmlist_lock);
}
dec_mm_counter(mm, anon_rss);
+ inc_mm_counter(mm, swap_usage);
} else if (PAGE_MIGRATION) {
/*
* Store the pfn of the page in a special migration
Index: mm-test-kernel/kernel/fork.c
===================================================================
--- mm-test-kernel.orig/kernel/fork.c
+++ mm-test-kernel/kernel/fork.c
@@ -454,6 +454,7 @@ static struct mm_struct * mm_init(struct
mm->nr_ptes = 0;
set_mm_counter(mm, file_rss, 0);
set_mm_counter(mm, anon_rss, 0);
+ set_mm_counter(mm, swap_usage, 0);
spin_lock_init(&mm->page_table_lock);
mm->free_area_cache = TASK_UNMAPPED_BASE;
mm->cached_hole_size = ~0UL;
Index: mm-test-kernel/Documentation/filesystems/proc.txt
===================================================================
--- mm-test-kernel.orig/Documentation/filesystems/proc.txt
+++ mm-test-kernel/Documentation/filesystems/proc.txt
@@ -163,6 +163,7 @@ read the file /proc/PID/status:
VmExe: 68 kB
VmLib: 1412 kB
VmPTE: 20 kb
+ VmSwap: 0 kb
Threads: 1
SigQ: 0/28578
SigPnd: 0000000000000000
@@ -213,6 +214,7 @@ Table 1-2: Contents of the statm files (
VmExe size of text segment
VmLib size of shared library code
VmPTE size of page table entries
+ VmSwap size of swapped out private rss.
Threads number of threads
SigQ number of signals queued/max. number for queue
SigPnd bitmap of pending signals for the thread
On Wed, 11 Nov 2009, KAMEZAWA Hiroyuki wrote:
>
> Index: mm-test-kernel/include/linux/mm_types.h
> ===================================================================
> --- mm-test-kernel.orig/include/linux/mm_types.h
> +++ mm-test-kernel/include/linux/mm_types.h
> @@ -228,6 +228,7 @@ struct mm_struct {
> */
> mm_counter_t _file_rss;
> mm_counter_t _anon_rss;
> + mm_counter_t _swap_usage;
This is going to be another hit on vm performance if we get down this
road.
At least put
#ifdef CONFIG_SWAP ?
around this so that we can switch it off?
> @@ -597,7 +600,9 @@ copy_one_pte(struct mm_struct *dst_mm, s
> &src_mm->mmlist);
> spin_unlock(&mmlist_lock);
> }
> - if (is_write_migration_entry(entry) &&
> + if (!non_swap_entry(entry))
> + rss[2]++;
> + else if (is_write_migration_entry(entry) &&
> is_cow_mapping(vm_flags)) {
> /*
What are the implications for fork performance?
On Thu, 12 Nov 2009 10:20:29 -0500 (EST)
Christoph Lameter <[email protected]> wrote:
> On Wed, 11 Nov 2009, KAMEZAWA Hiroyuki wrote:
>
> >
> > Index: mm-test-kernel/include/linux/mm_types.h
> > ===================================================================
> > --- mm-test-kernel.orig/include/linux/mm_types.h
> > +++ mm-test-kernel/include/linux/mm_types.h
> > @@ -228,6 +228,7 @@ struct mm_struct {
> > */
> > mm_counter_t _file_rss;
> > mm_counter_t _anon_rss;
> > + mm_counter_t _swap_usage;
>
> This is going to be another hit on vm performance if we get down this
> road.
>
> At least put
>
> #ifdef CONFIG_SWAP ?
>
> around this so that we can switch it off?
>
Hmm, okay. But I'm not sure I can do it in clean way.
(Or, I'll wait for your updates for mm_counters, or I do by myself.)
> > @@ -597,7 +600,9 @@ copy_one_pte(struct mm_struct *dst_mm, s
> > &src_mm->mmlist);
> > spin_unlock(&mmlist_lock);
> > }
> > - if (is_write_migration_entry(entry) &&
> > + if (!non_swap_entry(entry))
> > + rss[2]++;
> > + else if (is_write_migration_entry(entry) &&
> > is_cow_mapping(vm_flags)) {
> > /*
>
> What are the implications for fork performance?
This path is executed when page table entry contains a entry of
!pte_none() && !pte_present().
There are not very big chance to reach here.(this path is under unlikely()).
Thanks,
-Kame
On Fri, 13 Nov 2009 10:51:12 +0900
KAMEZAWA Hiroyuki <[email protected]> wrote:
> > > @@ -597,7 +600,9 @@ copy_one_pte(struct mm_struct *dst_mm, s
> > > &src_mm->mmlist);
> > > spin_unlock(&mmlist_lock);
> > > }
> > > - if (is_write_migration_entry(entry) &&
> > > + if (!non_swap_entry(entry))
> > > + rss[2]++;
> > > + else if (is_write_migration_entry(entry) &&
> > > is_cow_mapping(vm_flags)) {
> > > /*
> >
> > What are the implications for fork performance?
>
> This path is executed when page table entry contains a entry of
> !pte_none() && !pte_present().
>
> There are not very big chance to reach here.(this path is under unlikely()).
>
[before]
text data bss dec hex filename
6649003 3221828 10232816 20103647 132c1df vmlinux
[after]
text data bss dec hex filename
6649243 3221828 10232816 20103887 132c2cf vmlinux
Now, 240 bytes of text size..Hmm.
Thanks,
-Kame