Hello,
Matt Mackall brings us many memory-footprint-optimization
opportunities with his pagemap/kpagemap patches. However I wonder if
the binary interface can be improved.
This seq_file based implementation iterates in the unit of vmas. So
super large vmas can be a problem. The next step is to do address
based iteration. That should not be hard.
Andrew, please stop me early if it is at all a wrong interface :)
Thank you,
Fengguang
===
Show a process's memory maps page-by-page in /proc/<pid>/pmaps.
It helps to analyze application's memory footprint in a comprehensive way.
Pages sharing the same state are grouped into a page range.
For each page range, the following fields are exported:
- first page index
- number of pages in the range
- well known page/pte flags
- number of mmap users
Only page flags not expected to disappear in the near future are exported:
Y:young R:referenced A:active U:uptodate P:ptedirty D:dirty W:writeback
Here is a sample output:
# cat /proc/$$/pmaps
08048000-080c9000 r-xp 08048000 00:00 0
32840 129 Y_A_P__ 1
080c9000-080f6000 rwxp 080c9000 00:00 0 [heap]
32969 45 Y_A_P__ 1
f7dba000-f7dc3000 r-xp 00000000 03:00 176633 /lib/libnss_files-2.3.6.so
0 1 Y_AU___ 1
1 1 YR_U___ 1
5 1 YR_U___ 1
8 1 Y_AU___ 1
f7dc3000-f7dc5000 rwxp 00008000 03:00 176633 /lib/libnss_files-2.3.6.so
8 2 Y_A_P__ 1
f7dc5000-f7dcd000 r-xp 00000000 03:00 176635 /lib/libnss_nis-2.3.6.so
0 1 Y_AU___ 1
1 1 YR_U___ 1
4 1 YR_U___ 1
7 1 Y_AU___ 1
f7dcd000-f7dcf000 rwxp 00007000 03:00 176635 /lib/libnss_nis-2.3.6.so
7 2 Y_A_P__ 1
f7dcf000-f7de1000 r-xp 00000000 03:00 176630 /lib/libnsl-2.3.6.so
0 1 Y_AU___ 1
1 3 YR_U___ 1
16 1 YR_U___ 1
f7de1000-f7de3000 rwxp 00011000 03:00 176630 /lib/libnsl-2.3.6.so
17 2 Y_A_P__ 1
[...]
Matt Mackall's pagemap/kpagemap and John Berthels's exmap can achieve the same goals,
and probably more. But this text-based pmaps interface should be easier to use.
Cc: Jeremy Fitzhardinge <[email protected]>
Cc: David Rientjes <[email protected]>
Cc: Matt Mackall <[email protected]>
Cc: John Berthels <[email protected]>
Cc: Nick Piggin <[email protected]>
Signed-off-by: Fengguang Wu <[email protected]>
---
fs/proc/base.c | 5 +
fs/proc/internal.h | 1
fs/proc/task_mmu.c | 170 +++++++++++++++++++++++++++++++++++++++++++
3 files changed, 176 insertions(+)
--- linux-2.6.23-rc2.orig/fs/proc/task_mmu.c
+++ linux-2.6.23-rc2/fs/proc/task_mmu.c
@@ -258,6 +258,126 @@ static void smaps_pte_range(struct vm_ar
cond_resched();
}
+/*
+ * Per-vma walker state for /proc/<pid>/pmaps: the seq_file being written
+ * and the page range currently being accumulated. A NULL ->page means no
+ * range is pending.
+ */
+struct pmaps_private {
+ struct vm_area_struct *vma; /* vma being shown; set by show_pmaps() */
+ struct seq_file *m; /* output destination */
+ pte_t ptent; /* pte of the first page in the pending range */
+ struct page *page; /* first page in the pending range, or NULL */
+ unsigned long offset; /* page_index() of the first page in the range */
+ unsigned long len; /* number of pages in the pending range */
+};
+
+/*
+ * OR of the masks of all non-faked page_flags[] entries; computed once in
+ * task_mmu_init() and used to filter page->flags in pmaps_page_flags().
+ */
+static unsigned long page_mask;
+/*
+ * PG_YOUNG and PG_DIRTY are not real page flags: they borrow the bit
+ * positions of existing flags to carry pte young/dirty state in the word
+ * returned by pmaps_page_flags().
+ */
+#define PG_YOUNG PG_readahead /* reuse any non-relevant flag */
+#define PG_DIRTY PG_lru /* ditto */
+/* NOTE(review): NR_FAKED_FLAG is not referenced anywhere in this patch. */
+#define NR_FAKED_FLAG 2
+/* Number of entries in page_flags[]. */
+#define PG_COUNT (sizeof(page_flags)/sizeof(page_flags[0]))
+
+/*
+ * Page state names, prefixed by their abbreviations.
+ *
+ * One entry describes one exported flag column of the pmaps output.
+ */
+struct pmaps_page_flag {
+ unsigned long mask; /* bit tested in the pmaps_page_flags() result */
+ const char *name; /* "X:long-name"; name[0] is the character printed */
+ bool faked; /* set for pte-derived bits; excluded from page_mask */
+};
+
+/*
+ * Flags exported through pmaps, in output column order. Entries marked
+ * faked come from the pte (see pmaps_page_flags()) rather than from
+ * page->flags, and are therefore left out of page_mask.
+ *
+ * The masks are built with 1UL so that the shift is done in the width of
+ * the unsigned long field it is stored in, not in int.
+ */
+static const struct pmaps_page_flag page_flags[] = {
+	{1UL << PG_YOUNG,	"Y:young",	true},
+	{1UL << PG_referenced,	"R:referenced",	false},
+	{1UL << PG_active,	"A:active",	false},
+
+	{1UL << PG_uptodate,	"U:uptodate",	false},
+	{1UL << PG_DIRTY,	"P:ptedirty",	true},
+	{1UL << PG_dirty,	"D:dirty",	false},
+	{1UL << PG_writeback,	"W:writeback",	false},
+};
+
+/*
+ * Build the flag word reported for one mapped page: the exported bits of
+ * page->flags (filtered through page_mask), plus the faked PG_YOUNG and
+ * PG_DIRTY bits taken from the pte's young/dirty state.
+ *
+ * The faked bits are shifted as unsigned long to match the type of the
+ * word they are OR-ed into.
+ */
+static unsigned long pmaps_page_flags(pte_t ptent, struct page *page)
+{
+	unsigned long flags = page->flags & page_mask;
+
+	if (pte_young(ptent))
+		flags |= 1UL << PG_YOUNG;
+
+	if (pte_dirty(ptent))
+		flags |= 1UL << PG_DIRTY;
+
+	return flags;
+}
+
+/*
+ * Decide whether (ptent, page) can be merged into the range that starts
+ * with (ptent0, page0). Returns 1 when page0 is set and both pages have
+ * the same mapcount and the same pmaps flag word, 0 otherwise.
+ */
+static int pmaps_pages_similiar(pte_t ptent0, struct page* page0,
+				pte_t ptent, struct page* page)
+{
+	return page0 &&
+	       page_mapcount(page0) == page_mapcount(page) &&
+	       pmaps_page_flags(ptent0, page0) == pmaps_page_flags(ptent, page);
+}
+
+/*
+ * Emit one output line for the pending range in @pp: first page index,
+ * page count, one character per page_flags[] entry ('_' when the flag is
+ * clear), and the mapcount of the range's first page. Does nothing when
+ * no range is pending.
+ */
+static void pmaps_show_range(struct pmaps_private *pp)
+{
+	const struct pmaps_page_flag *pf;
+	unsigned long state;
+
+	if (pp->page == NULL)
+		return;
+
+	state = pmaps_page_flags(pp->ptent, pp->page);
+
+	seq_printf(pp->m, "%lu\t%lu\t", pp->offset, pp->len);
+
+	for (pf = page_flags; pf != page_flags + PG_COUNT; pf++) {
+		char c = '_';
+
+		if (state & pf->mask)
+			c = pf->name[0];
+		seq_putc(pp->m, c);
+	}
+
+	seq_printf(pp->m, "\t%d\n", page_mapcount(pp->page));
+}
+
+/*
+ * Page-table walker callback for pmaps. Scans the ptes covering
+ * [addr, end) and folds the mapped pages into the range tracked in
+ * @private (a struct pmaps_private): a page extends the pending range
+ * when its index directly follows it and pmaps_pages_similiar() agrees;
+ * otherwise the pending range is printed and a new one is started.
+ * The last range is left pending for the caller to print.
+ */
+static void pmaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ void *private)
+{
+ struct pmaps_private *pp = private;
+ pte_t *pte, ptent;
+ spinlock_t *ptl;
+ struct page *page;
+ pgoff_t offset;
+
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (; addr != end; pte++, addr += PAGE_SIZE) {
+ ptent = *pte;
+ /* only ptes that are present contribute to the output */
+ if (!pte_present(ptent))
+ continue;
+
+ /* skip ptes for which vm_normal_page() returns no page */
+ page = vm_normal_page(vma, addr, ptent);
+ if (!page)
+ continue;
+
+ offset = page_index(page);
+ if (offset == pp->offset + pp->len &&
+ pmaps_pages_similiar(pp->ptent, pp->page,
+ ptent, page)) {
+ pp->len++;
+ } else {
+ /* flush the pending range (no-op if none) and start a new one */
+ pmaps_show_range(pp);
+ pp->page = page;
+ pp->ptent = ptent;
+ pp->offset = offset;
+ pp->len = 1;
+ }
+ }
+ /* the loop leaves pte one past the last entry visited */
+ pte_unmap_unlock(pte - 1, ptl);
+ cond_resched();
+}
+
static void clear_refs_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
void *private)
@@ -359,6 +479,26 @@ static int show_smap(struct seq_file *m,
return show_map_internal(m, v, &mss);
}
+/*
+ * seq_file ->show handler for /proc/<pid>/pmaps.
+ *
+ * @m: seq_file to print into
+ * @v: the vma to show (a struct vm_area_struct *)
+ *
+ * Prints the usual maps line for the vma followed by one line per range
+ * of similar resident pages. Vmas without an mm and hugetlb vmas produce
+ * no output at all. Always returns 0.
+ */
+static int show_pmaps(struct seq_file *m, void *v)
+{
+	struct vm_area_struct *vma = v;
+	struct pmaps_private pp;
+
+	if (!vma->vm_mm || is_vm_hugetlb_page(vma))
+		return 0;
+
+	memset(&pp, 0, sizeof pp);
+	pp.m = m;
+	pp.vma = vma;
+
+	/*
+	 * Do not hold mm->page_table_lock around the walk: pmaps_pte_range()
+	 * takes the pte lock itself via pte_offset_map_lock() and calls
+	 * cond_resched() after each pte table. Holding a spinlock across
+	 * that would mean rescheduling in atomic context, and would
+	 * self-deadlock wherever the pte lock is page_table_lock.
+	 * NOTE(review): this relies on the seq_file iterator (m_start)
+	 * keeping the vma stable, as the smaps walk does - confirm.
+	 */
+	show_map_internal(m, v, NULL);
+	walk_page_range(vma, pmaps_pte_range, &pp);
+	/* the walker leaves the final range pending; print it now */
+	pmaps_show_range(&pp);
+	return 0;
+}
+
void clear_refs_smap(struct mm_struct *mm)
{
struct vm_area_struct *vma;
@@ -482,6 +622,13 @@ static struct seq_operations proc_pid_sm
.show = show_smap
};
+/*
+ * seq_file iterator for pmaps: reuses the m_start/m_next/m_stop vma
+ * iteration callbacks and only supplies its own ->show.
+ */
+static struct seq_operations proc_pid_pmaps_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_pmaps
+};
+
static int do_maps_open(struct inode *inode, struct file *file,
struct seq_operations *ops)
{
@@ -558,3 +705,26 @@ const struct file_operations proc_smaps_
.llseek = seq_lseek,
.release = seq_release_private,
};
+
+/* ->open for /proc/<pid>/pmaps: do_maps_open() with the pmaps seq_operations. */
+static int pmaps_open(struct inode *inode, struct file *file)
+{
+ return do_maps_open(inode, file, &proc_pid_pmaps_op);
+}
+
+/* File operations for /proc/<pid>/pmaps: a seq_file opened via pmaps_open(). */
+const struct file_operations proc_pmaps_operations = {
+ .open = pmaps_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+/*
+ * Init-time setup: compute page_mask as the union of the masks of every
+ * page_flags[] entry that is not faked. Always returns 0.
+ */
+static __init int task_mmu_init(void)
+{
+	unsigned long mask = 0;
+	int idx = 0;
+
+	while (idx < PG_COUNT) {
+		if (!page_flags[idx].faked)
+			mask |= page_flags[idx].mask;
+		idx++;
+	}
+
+	page_mask = mask;
+	return 0;
+}
+
+module_init(task_mmu_init);
--- linux-2.6.23-rc2.orig/fs/proc/base.c
+++ linux-2.6.23-rc2/fs/proc/base.c
@@ -45,6 +45,9 @@
*
* Paul Mundt <[email protected]>:
* Overall revision about smaps.
+ *
+ * Fengguang Wu <[email protected]>:
+ * Initial version of pmaps.
*/
#include <asm/uaccess.h>
@@ -2070,6 +2073,7 @@ static const struct pid_entry tgid_base_
#ifdef CONFIG_MMU
REG("clear_refs", S_IWUSR, clear_refs),
REG("smaps", S_IRUGO, smaps),
+ REG("pmaps", S_IRUGO, pmaps),
#endif
#ifdef CONFIG_SECURITY
DIR("attr", S_IRUGO|S_IXUGO, attr_dir),
@@ -2356,6 +2360,7 @@ static const struct pid_entry tid_base_s
#ifdef CONFIG_MMU
REG("clear_refs", S_IWUSR, clear_refs),
REG("smaps", S_IRUGO, smaps),
+ REG("pmaps", S_IRUGO, pmaps),
#endif
#ifdef CONFIG_SECURITY
DIR("attr", S_IRUGO|S_IXUGO, attr_dir),
--- linux-2.6.23-rc2.orig/fs/proc/internal.h
+++ linux-2.6.23-rc2/fs/proc/internal.h
@@ -53,6 +53,7 @@ extern const struct file_operations proc
extern const struct file_operations proc_maps_operations;
extern const struct file_operations proc_numa_maps_operations;
extern const struct file_operations proc_smaps_operations;
+extern const struct file_operations proc_pmaps_operations;
void free_proc_entry(struct proc_dir_entry *de);
On Tue, Aug 14, 2007 at 04:52:04PM +0800, Fengguang Wu wrote:
> Hello,
>
> Matt Mackall brings us many memory-footprint-optimization
> opportunities with his pagemap/kpagemap patches. However I wonder if
> the binary interface can be improved.
>
> This seq_file based implementation iterates in the unit of vmas. So
> super large vmas can be a problem. The next step is to to do address
> based iteration. That should not be hard.
>
> Andrew, please stop me early if it is at all a wrong interface :)
>
> Thank you,
> Fengguang
> ===
>
> Show a process's memory maps page-by-page in /proc/<pid>/pmaps.
> It helps to analyze application's memory footprint in a comprehensive way.
>
> Pages share the same states are grouped into a page range.
> For each page range, the following fields are exported:
> - first page index
> - number of pages in the range
> - well known page/pte flags
> - number of mmap users
>
> Only page flags not expected to disappear in the near future are exported:
>
> Y:young R:referenced A:active U:uptodate P:ptedirty D:dirty W:writeback
>
> Here is a sample output:
>
> # cat /proc/$$/pmaps
> 08048000-080c9000 r-xp 08048000 00:00 0
> 32840 129 Y_A_P__ 1
> 080c9000-080f6000 rwxp 080c9000 00:00 0 [heap]
> 32969 45 Y_A_P__ 1
> f7dba000-f7dc3000 r-xp 00000000 03:00 176633 /lib/libnss_files-2.3.6.so
> 0 1 Y_AU___ 1
> 1 1 YR_U___ 1
> 5 1 YR_U___ 1
> 8 1 Y_AU___ 1
> f7dc3000-f7dc5000 rwxp 00008000 03:00 176633 /lib/libnss_files-2.3.6.so
> 8 2 Y_A_P__ 1
> f7dc5000-f7dcd000 r-xp 00000000 03:00 176635 /lib/libnss_nis-2.3.6.so
> 0 1 Y_AU___ 1
> 1 1 YR_U___ 1
> 4 1 YR_U___ 1
> 7 1 Y_AU___ 1
> f7dcd000-f7dcf000 rwxp 00007000 03:00 176635 /lib/libnss_nis-2.3.6.so
> 7 2 Y_A_P__ 1
> f7dcf000-f7de1000 r-xp 00000000 03:00 176630 /lib/libnsl-2.3.6.so
> 0 1 Y_AU___ 1
> 1 3 YR_U___ 1
> 16 1 YR_U___ 1
> f7de1000-f7de3000 rwxp 00011000 03:00 176630 /lib/libnsl-2.3.6.so
> 17 2 Y_A_P__ 1
> [...]
That's a _lot_ of data to generate and parse. I doubt we can watch a
larger app in realtime with this interface. And it doesn't give us
physical page numbers either.
--
Mathematics is the supreme nostalgia of our time.
On Tue, Aug 14, 2007 at 11:26:18AM -0500, Matt Mackall wrote:
> On Tue, Aug 14, 2007 at 04:52:04PM +0800, Fengguang Wu wrote:
> > Hello,
> >
> > Matt Mackall brings us many memory-footprint-optimization
> > opportunities with his pagemap/kpagemap patches. However I wonder if
> > the binary interface can be improved.
> >
> > This seq_file based implementation iterates in the unit of vmas. So
> > super large vmas can be a problem. The next step is to to do address
> > based iteration. That should not be hard.
> >
> > Andrew, please stop me early if it is at all a wrong interface :)
> >
> > Thank you,
> > Fengguang
> > ===
> >
> > Show a process's memory maps page-by-page in /proc/<pid>/pmaps.
> > It helps to analyze application's memory footprint in a comprehensive way.
> >
> > Pages share the same states are grouped into a page range.
> > For each page range, the following fields are exported:
> > - first page index
> > - number of pages in the range
> > - well known page/pte flags
> > - number of mmap users
> >
> > Only page flags not expected to disappear in the near future are exported:
> >
> > Y:young R:referenced A:active U:uptodate P:ptedirty D:dirty W:writeback
> >
> > Here is a sample output:
> >
> > # cat /proc/$$/pmaps
> > 08048000-080c9000 r-xp 08048000 00:00 0
> > 32840 129 Y_A_P__ 1
> > 080c9000-080f6000 rwxp 080c9000 00:00 0 [heap]
> > 32969 45 Y_A_P__ 1
> > f7dba000-f7dc3000 r-xp 00000000 03:00 176633 /lib/libnss_files-2.3.6.so
> > 0 1 Y_AU___ 1
> > 1 1 YR_U___ 1
> > 5 1 YR_U___ 1
> > 8 1 Y_AU___ 1
> > f7dc3000-f7dc5000 rwxp 00008000 03:00 176633 /lib/libnss_files-2.3.6.so
> > 8 2 Y_A_P__ 1
> > f7dc5000-f7dcd000 r-xp 00000000 03:00 176635 /lib/libnss_nis-2.3.6.so
> > 0 1 Y_AU___ 1
> > 1 1 YR_U___ 1
> > 4 1 YR_U___ 1
> > 7 1 Y_AU___ 1
> > f7dcd000-f7dcf000 rwxp 00007000 03:00 176635 /lib/libnss_nis-2.3.6.so
> > 7 2 Y_A_P__ 1
> > f7dcf000-f7de1000 r-xp 00000000 03:00 176630 /lib/libnsl-2.3.6.so
> > 0 1 Y_AU___ 1
> > 1 3 YR_U___ 1
> > 16 1 YR_U___ 1
> > f7de1000-f7de3000 rwxp 00011000 03:00 176630 /lib/libnsl-2.3.6.so
> > 17 2 Y_A_P__ 1
> > [...]
>
> That's a _lot_ of data to generate and parse. I doubt we can watch a
Yes, but won't be too bad because it works in a sparse way.
1) It will only generate output for resident pages, that normally is
much smaller than the mapped size. Take my shell for example, the
(size:rss) ratio is (7:1)!
wfg ~% cat /proc/$$/smaps |grep Size|sum
sum 50552.000
avg 777.723
wfg ~% cat /proc/$$/smaps |grep Rss|sum
sum 7604.000
avg 116.985
2) The page range trick suppresses more output. Look at these lines:
> > 08048000-080c9000 r-xp 08048000 00:00 0
> > 32840 129 Y_A_P__ 1
> > 080c9000-080f6000 rwxp 080c9000 00:00 0 [heap]
> > 32969 45 Y_A_P__ 1
It's interesting to see that the seq_file interface demands some
more programming efforts, and provides this flexibility as well.
3) A 64bit number can be encoded in hex in 8 bytes, no more than its
binary presentation. And often the text string will be much shorter
because of the common small values.
4) String formatting is cheap. Much cheaper than pte/struct page
walkings, and context switches. Not to mention its user friendliness.
> larger app in realtime with this interface. And it doesn't give us
> physical page numbers either.
This could be a problem. Binary interface is discouraging, which is
good and bad. The good thing about it is that we can safely export
the raw PFN/page flag numbers without upsetting normal users. In
this way a possible useful kernel debugging interface can be shipped
everywhere.
If kpagemap is a reasonable kernel debug interface, I do not see the
reason why kpagemap and pmaps cannot coexist :)
On Thu, Aug 16, 2007 at 09:37:01AM +0800, Fengguang Wu wrote:
> 3) A 64bit number can be encoded in hex in 8 bytes, no more than its
~~ sorry, I'm a fool!
> binary presentation. And often the text string will be much shorter
> because of the common small values.
On Tuesday 14 August 2007 17:26, Matt Mackall wrote:
> On Tue, Aug 14, 2007 at 04:52:04PM +0800, Fengguang Wu wrote:
> > Here is a sample output:
> >
> > # cat /proc/$$/pmaps
> > 080c9000-080f6000 rwxp 080c9000 00:00 0 [heap]
> > 32969 45 Y_A_P__ 1
...
> > f7de1000-f7de3000 rwxp 00011000 03:00 176630 /lib/libnsl-2.3.6.so
> > 17 2 Y_A_P__ 1
> > [...]
>
> That's a _lot_ of data to generate and parse. I doubt we can watch a
> larger app in realtime with this interface. And it doesn't give us
> physical page numbers either.
> > 080c9000-080f6000 rwxp 080c9000 00:00 0 [heap]
> > 32969 45 Y_A_P__ 1
^^^^^ ^^
> > f7dba000-f7dc3000 r-xp 00000000 03:00 176633 /lib/libnss_files-2.3.6.so
^^ ^^ ^^^^^^
Since it's a new /proc file, we are not bound by compatibility.
Export *all* numbers in hex, and ensure that they are always even-chars wide
("fff" bad, "0fff" good). This way you can do binary<->string conversions
(both in kernel and in userspace) 8 bits per CPU cycle, or even faster.
--
vda