High-order allocations are obviously more costly, and it's very useful
to know how many of them happen when there are issues
(or suspicions of issues) with memory fragmentation.
This commit changes existing per-zone allocation counters to be
per-zone per-order. These counters are displayed using a new
procfs interface (similar to /proc/buddyinfo):
$ cat /proc/allocinfo
DMA 0 0 0 0 0 \
0 0 0 0 0 0
DMA32 3 0 1 0 0 \
0 0 0 0 0 0
Normal 4997056 23594 10902 23686 931 \
23 122 786 17 1 0
Movable 0 0 0 0 0 \
0 0 0 0 0 0
Device 0 0 0 0 0 \
0 0 0 0 0 0
The existing vmstat interface remains untouched*, and still shows
the total number of single page allocations, so high-order allocations
are represented as a corresponding number of order-0 allocations.
$ cat /proc/vmstat | grep alloc
pgalloc_dma 0
pgalloc_dma32 7
pgalloc_normal 5461660
pgalloc_movable 0
pgalloc_device 0
* I've added device zone for consistency with other zones,
and to avoid messy exclusion of this zone in the code.
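For example (illustrative numbers): a single order-3 allocation from the
Normal zone is counted once in the order-3 column of /proc/allocinfo,
while pgalloc_normal in /proc/vmstat still grows by 8 (1 << 3), exactly
as it does today.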
Signed-off-by: Roman Gushchin <[email protected]>
Suggested-by: Johannes Weiner <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Vladimir Davydov <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
---
arch/s390/appldata/appldata_mem.c | 7 +++
include/linux/mmzone.h | 2 +
include/linux/vm_event_item.h | 19 ++++--
include/linux/vmstat.h | 13 +++++
mm/page_alloc.c | 11 +++-
mm/vmstat.c | 120 +++++++++++++++++++++++++++++++++++---
6 files changed, 158 insertions(+), 14 deletions(-)
diff --git a/arch/s390/appldata/appldata_mem.c b/arch/s390/appldata/appldata_mem.c
index 598df57..06216ff0 100644
--- a/arch/s390/appldata/appldata_mem.c
+++ b/arch/s390/appldata/appldata_mem.c
@@ -81,6 +81,7 @@ static void appldata_get_mem_data(void *data)
static struct sysinfo val;
unsigned long ev[NR_VM_EVENT_ITEMS];
struct appldata_mem_data *mem_data;
+ int order;
mem_data = data;
mem_data->sync_count_1++;
@@ -92,6 +93,12 @@ static void appldata_get_mem_data(void *data)
mem_data->pswpout = ev[PSWPOUT];
mem_data->pgalloc = ev[PGALLOC_NORMAL];
mem_data->pgalloc += ev[PGALLOC_DMA];
+ for (order = 1; order < MAX_ORDER; ++order) {
+ mem_data->pgalloc +=
+ ev[PGALLOC_NORMAL + order * MAX_NR_ZONES] << order;
+ mem_data->pgalloc +=
+ ev[PGALLOC_DMA + order * MAX_NR_ZONES] << order;
+ }
mem_data->pgfault = ev[PGFAULT];
mem_data->pgmajfault = ev[PGMAJFAULT];
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 16532fa..6598285 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -66,6 +66,8 @@ enum migratetype {
/* In mm/page_alloc.c; keep in sync also with show_migration_types() there */
extern char * const migratetype_names[MIGRATE_TYPES];
+extern const char *zone_name(int idx);
+
#ifdef CONFIG_CMA
# define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
# define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA)
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 37e8d31..75bbac8 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -19,12 +19,23 @@
#define HIGHMEM_ZONE(xx)
#endif
-#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, HIGHMEM_ZONE(xx) xx##_MOVABLE
+#ifdef CONFIG_ZONE_DEVICE
+#define DEVICE_ZONE(xx) xx##__DEVICE,
+#else
+#define DEVICE_ZONE(xx)
+#endif
+
+#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, HIGHMEM_ZONE(xx) xx##_MOVABLE, DEVICE_ZONE(xx)
+
+#define PGALLOC_EVENTS_SIZE (MAX_NR_ZONES * MAX_ORDER)
+#define PGALLOC_EVENTS_CUT_SIZE (MAX_NR_ZONES * (MAX_ORDER - 1))
+#define PGALLOC_FIRST_ZONE (PGALLOC_NORMAL - ZONE_NORMAL)
enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
- FOR_ALL_ZONES(PGALLOC),
- FOR_ALL_ZONES(ALLOCSTALL),
- FOR_ALL_ZONES(PGSCAN_SKIP),
+ FOR_ALL_ZONES(PGALLOC)
+ __PGALLOC_LAST = PGALLOC_FIRST_ZONE + PGALLOC_EVENTS_SIZE - 1,
+ FOR_ALL_ZONES(ALLOCSTALL)
+ FOR_ALL_ZONES(PGSCAN_SKIP)
PGFREE, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE,
PGFAULT, PGMAJFAULT,
PGLAZYFREED,
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index b3d85f3..ec30215 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -103,6 +103,19 @@ static inline void vm_events_fold_cpu(int cpu)
#define __count_zid_vm_events(item, zid, delta) \
__count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta)
+static inline void __count_alloc_event(enum zone_type zid, unsigned int order)
+{
+ enum vm_event_item item;
+
+ if (unlikely(order >= MAX_ORDER)) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ item = PGALLOC_FIRST_ZONE + order * MAX_NR_ZONES + zid;
+ __count_vm_events(item, 1);
+}
+
/*
* Zone and node-based page accounting with per cpu differentials.
*/
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 80e4adb..e74b327 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -233,6 +233,13 @@ static char * const zone_names[MAX_NR_ZONES] = {
#endif
};
+const char *zone_name(int zid)
+{
+ if (zid < MAX_NR_ZONES)
+ return zone_names[zid];
+ return NULL;
+}
+
char * const migratetype_names[MIGRATE_TYPES] = {
"Unmovable",
"Movable",
@@ -2779,7 +2786,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
list = &pcp->lists[migratetype];
page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list);
if (page) {
- __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+ __count_alloc_event(page_zonenum(page), order);
zone_statistics(preferred_zone, zone);
}
local_irq_restore(flags);
@@ -2827,7 +2834,7 @@ struct page *rmqueue(struct zone *preferred_zone,
__mod_zone_freepage_state(zone, -(1 << order),
get_pcppage_migratetype(page));
- __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+ __count_alloc_event(page_zonenum(page), order);
zone_statistics(preferred_zone, zone);
local_irq_restore(flags);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9a4441b..cd465f6 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -27,6 +27,7 @@
#include <linux/mm_inline.h>
#include <linux/page_ext.h>
#include <linux/page_owner.h>
+#include <linux/mmzone.h>
#include "internal.h"
@@ -34,18 +35,18 @@
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);
-static void sum_vm_events(unsigned long *ret)
+static void sum_vm_events(unsigned long *ret, int off, size_t nr_events)
{
int cpu;
int i;
- memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
+ memset(ret, 0, nr_events * sizeof(unsigned long));
for_each_online_cpu(cpu) {
struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
- for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
- ret[i] += this->event[i];
+ for (i = 0; i < nr_events; i++)
+ ret[i] += this->event[off + i];
}
}
@@ -57,7 +58,7 @@ static void sum_vm_events(unsigned long *ret)
void all_vm_events(unsigned long *ret)
{
get_online_cpus();
- sum_vm_events(ret);
+ sum_vm_events(ret, 0, NR_VM_EVENT_ITEMS);
put_online_cpus();
}
EXPORT_SYMBOL_GPL(all_vm_events);
@@ -915,8 +916,15 @@ int fragmentation_index(struct zone *zone, unsigned int order)
#define TEXT_FOR_HIGHMEM(xx)
#endif
+#ifdef CONFIG_ZONE_DEVICE
+#define TEXT_FOR_DEVICE(xx) xx "_device",
+#else
+#define TEXT_FOR_DEVICE(xx)
+#endif
+
#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
- TEXT_FOR_HIGHMEM(xx) xx "_movable",
+ TEXT_FOR_HIGHMEM(xx) xx "_movable", \
+ TEXT_FOR_DEVICE(xx)
const char * const vmstat_text[] = {
/* enum zone_stat_item countes */
@@ -1480,12 +1488,86 @@ enum writeback_stat_item {
NR_VM_WRITEBACK_STAT_ITEMS,
};
+static void sum_alloc_events(unsigned long *v)
+{
+ int zid, order, index;
+
+ for (zid = 0; zid < MAX_NR_ZONES; ++zid) {
+ for (order = 1; order < MAX_ORDER; order++) {
+ index = PGALLOC_FIRST_ZONE + zid;
+ v[index] += v[index + order * MAX_NR_ZONES] << order;
+ }
+ }
+}
+
+static int allocinfo_show(struct seq_file *m, void *arg)
+{
+ unsigned long allocs[PGALLOC_EVENTS_SIZE];
+ unsigned int order;
+ int zid;
+
+ if (arg != SEQ_START_TOKEN)
+ return 0;
+
+ get_online_cpus();
+ sum_vm_events(allocs, PGALLOC_FIRST_ZONE, PGALLOC_EVENTS_SIZE);
+ put_online_cpus();
+
+ for (zid = 0; zid < MAX_NR_ZONES; ++zid) {
+ seq_printf(m, "%8s ", zone_name(zid));
+
+ for (order = 0; order < MAX_ORDER; order++)
+ seq_printf(m, "%10lu ",
+ allocs[zid + order * MAX_NR_ZONES]);
+
+ seq_putc(m, '\n');
+ }
+
+ return 0;
+}
+
+static void *allocinfo_start(struct seq_file *m, loff_t *pos)
+{
+ if (*pos)
+ return NULL;
+ return SEQ_START_TOKEN;
+}
+
+static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+ ++*pos;
+ return NULL;
+}
+
+static void allocinfo_stop(struct seq_file *m, void *arg)
+{
+}
+
+static const struct seq_operations allocinfo_op = {
+ .start = allocinfo_start,
+ .next = allocinfo_next,
+ .stop = allocinfo_stop,
+ .show = allocinfo_show,
+};
+
+static int allocinfo_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &allocinfo_op);
+}
+
+static const struct file_operations allocinfo_file_operations = {
+ .open = allocinfo_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
unsigned long *v;
int i, stat_items_size;
- if (*pos >= ARRAY_SIZE(vmstat_text))
+ if (*pos >= ARRAY_SIZE(vmstat_text) + PGALLOC_EVENTS_CUT_SIZE)
return NULL;
stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) +
@@ -1513,6 +1595,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
#ifdef CONFIG_VM_EVENT_COUNTERS
all_vm_events(v);
+ sum_alloc_events(v);
v[PGPGIN] /= 2; /* sectors -> kbytes */
v[PGPGOUT] /= 2;
#endif
@@ -1521,8 +1604,16 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
+ int alloc_event_start = NR_VM_ZONE_STAT_ITEMS +
+ NR_VM_NODE_STAT_ITEMS +
+ NR_VM_WRITEBACK_STAT_ITEMS +
+ PGALLOC_FIRST_ZONE;
+
(*pos)++;
- if (*pos >= ARRAY_SIZE(vmstat_text))
+ if (*pos == alloc_event_start + MAX_NR_ZONES)
+ *(pos) += PGALLOC_EVENTS_CUT_SIZE;
+
+ if (*pos >= ARRAY_SIZE(vmstat_text) + PGALLOC_EVENTS_CUT_SIZE)
return NULL;
return (unsigned long *)m->private + *pos;
}
@@ -1531,6 +1622,18 @@ static int vmstat_show(struct seq_file *m, void *arg)
{
unsigned long *l = arg;
unsigned long off = l - (unsigned long *)m->private;
+ int alloc_event_start = NR_VM_ZONE_STAT_ITEMS +
+ NR_VM_NODE_STAT_ITEMS +
+ NR_VM_WRITEBACK_STAT_ITEMS +
+ PGALLOC_FIRST_ZONE;
+
+ if (off >= alloc_event_start + PGALLOC_EVENTS_SIZE)
+ off -= PGALLOC_EVENTS_CUT_SIZE;
+
+ if (unlikely(off >= ARRAY_SIZE(vmstat_text))) {
+ WARN_ON_ONCE(1);
+ return 0;
+ }
seq_puts(m, vmstat_text[off]);
seq_put_decimal_ull(m, " ", *l);
@@ -1790,6 +1893,7 @@ void __init init_mm_internals(void)
#endif
#ifdef CONFIG_PROC_FS
proc_create("buddyinfo", 0444, NULL, &buddyinfo_file_operations);
+ proc_create("allocinfo", 0444, NULL, &allocinfo_file_operations);
proc_create("pagetypeinfo", 0444, NULL, &pagetypeinfo_file_operations);
proc_create("vmstat", 0444, NULL, &vmstat_file_operations);
proc_create("zoneinfo", 0444, NULL, &zoneinfo_file_operations);
--
2.7.4
On Thu, Jul 06, 2017 at 02:04:31PM +0100, Roman Gushchin wrote:
> High-order allocations are obviously more costly, and it's very useful
> to know how many of them happens, if there are any issues
> (or suspicions) with memory fragmentation.
>
> This commit changes existing per-zone allocation counters to be
> per-zone per-order. These counters are displayed using a new
> procfs interface (similar to /proc/buddyinfo):
>
> $ cat /proc/allocinfo
> DMA 0 0 0 0 0 \
> 0 0 0 0 0 0
> DMA32 3 0 1 0 0 \
> 0 0 0 0 0 0
> Normal 4997056 23594 10902 23686 931 \
> 23 122 786 17 1 0
> Movable 0 0 0 0 0 \
> 0 0 0 0 0 0
> Device 0 0 0 0 0 \
> 0 0 0 0 0 0
>
> The existing vmstat interface remains untouched*, and still shows
> the total number of single page allocations, so high-order allocations
> are represented as a corresponding number of order-0 allocations.
>
> $ cat /proc/vmstat | grep alloc
> pgalloc_dma 0
> pgalloc_dma32 7
> pgalloc_normal 5461660
> pgalloc_movable 0
> pgalloc_device 0
>
> * I've added device zone for consistency with other zones,
> and to avoid messy exclusion of this zone in the code.
>
The alloc counter updates are themselves a surprisingly heavy cost to
the allocation path and this makes it worse for a debugging case that is
relatively rare. I'm extremely reluctant for such a patch to be added
given that the tracepoints can be used to assemble such a monitor even
if it means running a userspace daemon to keep track of it. Would such a
solution be suitable? Failing that if this is a severe issue, would it be
possible to at least make this a compile-time or static tracepoint option?
That way, only people that really need it have to take the penalty.
--
Mel Gorman
SUSE Labs
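For reference, a minimal sketch of the tracepoint-based monitor suggested
above, assuming tracefs is mounted at /sys/kernel/debug/tracing and the
kernel was built with CONFIG_HIST_TRIGGERS; the existing kmem:mm_page_alloc
tracepoint already records the order:

    # per-order allocation counts from the existing tracepoint
    cd /sys/kernel/debug/tracing
    echo 'hist:keys=order' > events/kmem/mm_page_alloc/trigger
    # ... let the workload run ...
    cat events/kmem/mm_page_alloc/hist      # hitcount per order
    echo '!hist:keys=order' > events/kmem/mm_page_alloc/trigger

A userspace daemon would only need to snapshot the hist file periodically
to turn these counts into per-order rates.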
On Thu, Jul 06, 2017 at 02:19:41PM +0100, Mel Gorman wrote:
> On Thu, Jul 06, 2017 at 02:04:31PM +0100, Roman Gushchin wrote:
> > High-order allocations are obviously more costly, and it's very useful
> > to know how many of them happens, if there are any issues
> > (or suspicions) with memory fragmentation.
> >
> > This commit changes existing per-zone allocation counters to be
> > per-zone per-order. These counters are displayed using a new
> > procfs interface (similar to /proc/buddyinfo):
> >
> > $ cat /proc/allocinfo
> > DMA 0 0 0 0 0 \
> > 0 0 0 0 0 0
> > DMA32 3 0 1 0 0 \
> > 0 0 0 0 0 0
> > Normal 4997056 23594 10902 23686 931 \
> > 23 122 786 17 1 0
> > Movable 0 0 0 0 0 \
> > 0 0 0 0 0 0
> > Device 0 0 0 0 0 \
> > 0 0 0 0 0 0
> >
> > The existing vmstat interface remains untouched*, and still shows
> > the total number of single page allocations, so high-order allocations
> > are represented as a corresponding number of order-0 allocations.
> >
> > $ cat /proc/vmstat | grep alloc
> > pgalloc_dma 0
> > pgalloc_dma32 7
> > pgalloc_normal 5461660
> > pgalloc_movable 0
> > pgalloc_device 0
> >
> > * I've added device zone for consistency with other zones,
> > and to avoid messy exclusion of this zone in the code.
> >
>
> The alloc counter updates are themselves a surprisingly heavy cost to
> the allocation path and this makes it worse for a debugging case that is
> relatively rare. I'm extremely reluctant for such a patch to be added
> given that the tracepoints can be used to assemble such a monitor even
> if it means running a userspace daemon to keep track of it. Would such a
> solution be suitable? Failing that if this is a severe issue, would it be
> possible to at least make this a compile-time or static tracepoint option?
> That way, only people that really need it have to take the penalty.
I've tried to measure the difference with my patch applied and without
any accounting at all (__count_alloc_event() redefined to an empty function),
and I wasn't able to find any measurable difference.
Can you please provide more details on what your scenario looked like
when the alloc counters were costly?
As the new counters replace the old ones, and both are per-cpu counters,
I believe the difference should be really small.
If there is a case where the difference is meaningful,
I'll of course make the whole thing a compile-time option.
Thank you!
On Thu, Jul 6, 2017 at 9:19 AM, Mel Gorman <[email protected]> wrote:
> The alloc counter updates are themselves a surprisingly heavy cost to
> the allocation path and this makes it worse for a debugging case that is
> relatively rare. I'm extremely reluctant for such a patch to be added
> given that the tracepoints can be used to assemble such a monitor even
> if it means running a userspace daemon to keep track of it. Would such a
> solution be suitable? Failing that if this is a severe issue, would it be
> possible to at least make this a compile-time or static tracepoint option?
> That way, only people that really need it have to take the penalty.
>
> --
> Mel Gorman
We (Akamai) have been struggling with memory fragmentation issues for
years, and especially with the inability to track positive or negative
changes in fragmentation across allocator changes and kernel versions
without simply looking at how many allocations are failing. We've had
someone toying with reporting the same data by scanning all pages at
report time instead of keeping running stats, although we don't have
working code yet. If it did work, it would avoid the runtime overhead.
I don't believe tracepoints are a workable solution for us, since we
would have to be collecting the data from boot, as well as continually
processing the data in userspace at high cost. Ultimately the
locations and other properties (merge-ability) of the allocations in
the buddy groups are also important, which would be interesting to add
on top of Roman's patch.
On Thu, 6 Jul 2017, Roman Gushchin wrote:
> +#define PGALLOC_EVENTS_SIZE (MAX_NR_ZONES * MAX_ORDER)
> +#define PGALLOC_EVENTS_CUT_SIZE (MAX_NR_ZONES * (MAX_ORDER - 1))
> +#define PGALLOC_FIRST_ZONE (PGALLOC_NORMAL - ZONE_NORMAL)
You are significantly increasing the number of per-cpu counters (ZONES *
MAX_ORDER * cpus!!!). This will increase the cache footprint of critical
functions significantly and thus lead to regressions.
Typically, counters for zones are placed in the zone structures, but that
would also significantly increase the per-zone counters ...
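To put rough, illustrative numbers on that (assuming a common x86_64
config with MAX_NR_ZONES = 5 and MAX_ORDER = 11): the PGALLOC slots in the
per-cpu vm_event_states array grow from 5 to 5 * 11 = 55, i.e. 50 extra
unsigned longs, or about 400 bytes per CPU.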
On Thu, Jul 06, 2017 at 03:46:34PM +0100, Roman Gushchin wrote:
> > The alloc counter updates are themselves a surprisingly heavy cost to
> > the allocation path and this makes it worse for a debugging case that is
> > relatively rare. I'm extremely reluctant for such a patch to be added
> > given that the tracepoints can be used to assemble such a monitor even
> > if it means running a userspace daemon to keep track of it. Would such a
> > solution be suitable? Failing that if this is a severe issue, would it be
> > possible to at least make this a compile-time or static tracepoint option?
> > That way, only people that really need it have to take the penalty.
>
> I've tried to measure the difference with my patch applied and without
> any accounting at all (__count_alloc_event() redefined to an empty function),
> and I wasn't able to find any measurable difference.
> Can you, please, provide more details, how your scenario looked like,
> when alloc coutners were costly?
>
At the time I used a page allocator microbenchmark from mmtests to call
the allocator directly without zeroing pages. Triggering allocations from
userspace generally masks the overhead behind the zeroing costs. It's just a few
cycles, but given that the budget for the page allocator in some circumstances
is tiny, it was noticeable. perf was used to examine the cost.
> As new counters replace an old one, and both are per-cpu counters, I believe,
> that the difference should be really small.
>
Minimally you add a new branch and a small number of computations. It's
small but it's there. The cache footprint of the counters is also increased.
That is hard to take given that it's overhead for everybody on the off-chance
it can debug something.
It's not a strong objection and I won't nak it on this basis, but given
that the same information can easily be obtained using tracepoints
(optionally with lower overhead via systemtap), that the information is rarely
going to be useful (no latency information, for example), and that there is an
increased maintenance cost, it does not seem to be worth it.
Maybe it would be slightly more convincing if there was an example of
real problems in the field that can be debugged with this. For high-order
allocations, I previously found that it was the latency that was of the
most concern and not the absolute count that happened since the system
started. Granted, the same criticism could be leveled at the existing
alloc counters but at least by correlating that value with allocstall,
you can determine what percentage of allocations stalled recently and
optionally ftrace at that point to figure out why. The same steps would
then indicate if it's only high-order allocations that stall; add stack
tracing to figure out where they are coming from and go from there. Even if
the per-order counters exist, all the other debugging steps are necessary
so I'm struggling to see how I would use them properly.
--
Mel Gorman
SUSE Labs
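As an illustration of the "add stack tracing" step described above (a
sketch only; the order threshold is an arbitrary example and tracefs is
assumed at /sys/kernel/debug/tracing):

    # dump a kernel stack trace for every allocation above order 3
    cd /sys/kernel/debug/tracing
    echo 'stacktrace if order > 3' > events/kmem/mm_page_alloc/trigger
    cat trace_pipe      # stream the traces while the workload runs
    echo '!stacktrace' > events/kmem/mm_page_alloc/trigger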
On Thu, Jul 06, 2017 at 10:54:24AM -0400, Debabrata Banerjee wrote:
> On Thu, Jul 6, 2017 at 9:19 AM, Mel Gorman <[email protected]> wrote:
>
> > The alloc counter updates are themselves a surprisingly heavy cost to
> > the allocation path and this makes it worse for a debugging case that is
> > relatively rare. I'm extremely reluctant for such a patch to be added
> > given that the tracepoints can be used to assemble such a monitor even
> > if it means running a userspace daemon to keep track of it. Would such a
> > solution be suitable? Failing that if this is a severe issue, would it be
> > possible to at least make this a compile-time or static tracepoint option?
> > That way, only people that really need it have to take the penalty.
> >
> > --
> > Mel Gorman
>
> We (Akamai) have been struggling with memory fragmentation issues for
> years, and especially the inability to track positive or negative
> changes to fragmentation between allocator changes and kernels without
> simply looking for how many allocations are failing. We've had someone
> toying with trying to report the same data via scanning all pages at
> report time versus keeping running stats, although we don't have
> working code yet. If it did work it would avoid the runtime overhead.
> I don't believe tracepoints are a workable solution for us, since we
> would have to be collecting the data from boot, as well as continually
> processing the data in userspace at high cost. Ultimately the
> locations and other properties (merge-ability) of the allocations in
> the buddy groups are also important, which would be interesting to add
> on-top of Roman's patch.
These counters do not actually help you solve that particular problem.
Knowing how many allocations happened since the system booted doesn't tell
you much about how many failed or why they failed. You don't even know
what frequency they occurred at unless you monitor it constantly, so you're
back to square one whether this information is available from proc or not.
There even is a tracepoint that can be used to track information related
to events that degrade fragmentation (trace_mm_page_alloc_extfrag) although
the primary thing it tells you is that "the probability that an allocation
will fail due to fragmentation in the future is potentially higher".
--
Mel Gorman
SUSE Labs
On Thu, Jul 6, 2017 at 11:51 AM, Mel Gorman <[email protected]> wrote:
>
> These counters do not actually help you solve that particular problem.
> Knowing how many allocations happened since the system booted doesn't tell
> you much about how many failed or why they failed. You don't even know
> what frequency they occured at unless you monitor it constantly so you're
> back to square one whether this information is available from proc or not.
> There even is a tracepoint that can be used to track information related
> to events that degrade fragmentation (trace_mm_page_alloc_extfrag) although
> the primary thing it tells you is that "the probability that an allocation
> will fail due to fragmentation in the future is potentially higher".
I agree these counters don't have enough information, but they're a
start toward a first-order approximation of the current state of memory.
buddyinfo and pagetypeinfo basically show no information now, because
they only involve the small amount of free memory under the watermark,
and all our machines are in this state. As a second-order approximation,
it would be nice to be able to get answers like: "There are
reclaimable high-order allocations of at least this order" and "No
allocations of this order can become available due to unmovable and
unreclaimable allocations".
On Thu, Jul 06, 2017 at 12:12:47PM -0400, Debabrata Banerjee wrote:
> On Thu, Jul 6, 2017 at 11:51 AM, Mel Gorman <[email protected]> wrote:
> >
> > These counters do not actually help you solve that particular problem.
> > Knowing how many allocations happened since the system booted doesn't tell
> > you much about how many failed or why they failed. You don't even know
> > what frequency they occured at unless you monitor it constantly so you're
> > back to square one whether this information is available from proc or not.
> > There even is a tracepoint that can be used to track information related
> > to events that degrade fragmentation (trace_mm_page_alloc_extfrag) although
> > the primary thing it tells you is that "the probability that an allocation
> > will fail due to fragmentation in the future is potentially higher".
>
> I agree these counters don't have enough information, but there a
> start to a first order approximation of the current state of memory.
That incurs a universal cost on the off-chance of debugging, and ultimately
the debugging is only useful in combination with developing kernel patches,
in which case it could be behind a kconfig option.
> buddyinfo and pagetypeinfo basically show no information now, because
They can be used to calculate a fragmentation index at a given point in
time. Admittedly, building a bigger picture requires a full scan of memory
(and that's what was required when fragmentation avoidance was first
being implemented).
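For that point-in-time view, the existing debugfs files already expose a
fragmentation index (a sketch; assumes CONFIG_DEBUG_FS and debugfs mounted
at /sys/kernel/debug):

    # current fragmentation/unusable indexes per zone and order
    cat /sys/kernel/debug/extfrag/extfrag_index
    cat /sys/kernel/debug/extfrag/unusable_index
    cat /proc/buddyinfo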
> they only involve the small amount of free memory under the watermark
> and all our machines are in this state. As second order approximation,
> it would be nice to be able to get answers like: "There are
> reclaimable high order allocations of at least this order" and "None
> of this order allocation can become available due to unmovable and
> unreclaimable allocations"
Which this patch doesn't provide as what you are looking for requires
a full scan of memory to determine. I've done it in the past using a
severe abuse of systemtap to load a module that scans all of memory with
a variation of PAGE_OWNER to identify stack traces of pages that "don't
belong" within a pageblock.
Even *with* that information, your options for tuning an unmodified kernel
are basically limited to increasing min_free_kbytes, altering THP's level
of aggression when compacting or brute forcing with either drop_caches,
compact_node or both. All other options after that require kernel patches
-- altering annotations, altering fallback mechanisms, altering compaction,
improving support for pages that can be migrated etc.
--
Mel Gorman
SUSE Labs
On Thu, Jul 06, 2017 at 04:47:05PM +0100, Mel Gorman wrote:
> On Thu, Jul 06, 2017 at 03:46:34PM +0100, Roman Gushchin wrote:
> > > The alloc counter updates are themselves a surprisingly heavy cost to
> > > the allocation path and this makes it worse for a debugging case that is
> > > relatively rare. I'm extremely reluctant for such a patch to be added
> > > given that the tracepoints can be used to assemble such a monitor even
> > > if it means running a userspace daemon to keep track of it. Would such a
> > > solution be suitable? Failing that if this is a severe issue, would it be
> > > possible to at least make this a compile-time or static tracepoint option?
> > > That way, only people that really need it have to take the penalty.
> >
> > I've tried to measure the difference with my patch applied and without
> > any accounting at all (__count_alloc_event() redefined to an empty function),
> > and I wasn't able to find any measurable difference.
> > Can you, please, provide more details, how your scenario looked like,
> > when alloc coutners were costly?
> >
>
> At the time I used a page allocator microbenchmark from mmtests to call
> the allocator directly without zeroing pages. Triggering allocations from
> userspace generally mask the overhead by the zeroing costs. It's just a few
> cycles but given the budget for the page allocator in some circumstances
> is tiny, it was noticable. perf was used to examine the cost.
I'll try to measure the difference with mmtests.
I agree that it's not a feature that's worth a significant performance penalty,
but if the overhead is small even in a dedicated benchmark, I'd say it's acceptable.
> > As new counters replace an old one, and both are per-cpu counters, I believe,
> > that the difference should be really small.
> >
>
> Minimally you add a new branch and a small number of computations. It's
> small but it's there. The cache footprint of the counters is also increased.
> That is hard to take given that it's overhead for everybody on the off-chance
> it can debug something.
>
> It's not a strong objection and I won't nak it on this basis but given
> that the same information can be easily obtained using tracepoints
> (optionally lower overhead with systemtap), the information is rarely
> going to be useful (no latency information for example) and there is an
> increased maintenance cost then it does not seem to be that useful.
Tracepoints are good for investigations on one machine, but not so convenient
if we are talking about gathering stats from a fleet under production load.
Unfortunately, some memory fragmentation issues are hard to reproduce on
a single dev machine.
> Maybe it would be slightly more convincing if there was an example of
> real problems in the field that can be debugged with this. For high-order
> allocations, I previously found that it was the latency that was of the
> most concern and not the absolute count that happened since the system
> started.
We hit an issue with compaction consuming too much CPU under some specific
conditions, and one of the suspicions was a significant number of high-order
allocations requested by some third-party device drivers.
Knowing the number of allocations is especially helpful for comparing
different kernel versions in such a case, as it's hard to distinguish whether
changes in mm, changes in these drivers, or just workload/environment changes
led to increased or decreased fragmentation.
Roman
On Thu, Jul 06, 2017 at 05:43:04PM +0100, Roman Gushchin wrote:
> > At the time I used a page allocator microbenchmark from mmtests to call
> > the allocator directly without zeroing pages. Triggering allocations from
> > userspace generally mask the overhead by the zeroing costs. It's just a few
> > cycles but given the budget for the page allocator in some circumstances
> > is tiny, it was noticable. perf was used to examine the cost.
>
> I'll try to measure the difference with mmtests.
>
> I agree, that it's not a feature that worth significant performance penalty,
> but if it's small even in a special benchmark, I'd say, it's acceptable.
>
Note that even if you keep the cycle overhead down, the CPU cache footprint
for such a large increase remains. That will be permanent and unfixable which
is why I would like a Kconfig option at the very least for the vast majority
of people that have no intention or ability to debug such a situation.
> > > As new counters replace an old one, and both are per-cpu counters, I believe,
> > > that the difference should be really small.
> > >
> >
> > Minimally you add a new branch and a small number of computations. It's
> > small but it's there. The cache footprint of the counters is also increased.
> > That is hard to take given that it's overhead for everybody on the off-chance
> > it can debug something.
> >
> > It's not a strong objection and I won't nak it on this basis but given
> > that the same information can be easily obtained using tracepoints
> > (optionally lower overhead with systemtap), the information is rarely
> > going to be useful (no latency information for example) and there is an
> > increased maintenance cost then it does not seem to be that useful.
>
> Tracepoints are good for investigations on one machine, not so convenient
> if we are talking about gathering stats from the fleet with production load.
> Unfortunately, some memory fragmentation issues are hard to reproduce on
> a single dev machine.
>
Sure, but just knowing that some high-order allocations occurred in the
past doesn't help either.
> > Maybe it would be slightly more convincing if there was an example of
> > real problems in the field that can be debugged with this. For high-order
> > allocations, I previously found that it was the latency that was of the
> > most concern and not the absolute count that happened since the system
> > started.
>
> We met an issue with compaction consuming too much CPU under some specific
> conditions, and one of the suspicions was a significant number of high-order
> allocations, requested by some third-party device drivers.
>
Even if this was the suspicion, you would have to activate monitoring on
the machine under load at the time the problem is occurring to determine if
the high-order allocations are currently happening or happened in the past.
If you are continually logging this data then logging allocation stalls for
high-order allocations would give you similar information. If you have to
activate a monitor anyway (or an agent that monitors for high CPU usage),
then it might as well be ftrace-based as anything else. Even a
basic systemtap script would be able to capture stack traces only for
allocation requests that take longer than a threshold, to limit the amount
of data recorded. Even *if* you had these counters running on your grid,
they will tell you nothing about how long those allocations take or whether
compaction is involved, and that is what is key to begin debugging the issue.
A basic monitor of /proc/vmstat for the compact_* counters can be used as an
indication of excessive time spent in compaction, although you're back to
ftrace to quantify how much of a problem it is in terms of time. For example,
rapidly increasing compact_fail combined with rapidly increasing
compact_migrate_scanned and compact_free_scanned will tell you that
compaction is active and failing, with the ratio of compact_fail to
compact_success telling you whether it's persistent failure or slow
progress. You'd need top to see whether it's the compaction daemon or
ordinary processes that are consuming all the CPU. If it's the daemon then
that points you in the direction of what potentially needs fixing. If it's
processes then there is a greater problem and ftrace needs to be used to
establish *what* is making the high-order allocation requests and whether
they can be reduced somehow, or whether it's a general fragmentation problem
(in which case your day takes a turn for the worse).
What I'm trying to say is that, in itself, a high-order allocation
count doesn't help debug this class of problem as much as you'd think.
Hopefully the above information is more useful to you in helping debug
what's actually wrong.
> Knowing the number of allocations is especially helpful for comparing
> different kernel versions in a such case, as it's hard to distinguish changes
> in mm, changes in these drivers or just workload/environment changes,
> leaded to an increased or decreased fragmentation.
>
I'm still struggling to see how counters help when an agent that monitors
for high CPU usage could instead be activated to capture tracing and see
whether it's allocation and compaction stalls that are contributing to the
overall load or "something else".
--
Mel Gorman
SUSE Labs
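A minimal sketch of the /proc/vmstat correlation described above, assuming
the compact_* and allocstall_* counter names found in 4.12-era kernels
(deltas between samples can be computed offline):

    # periodically sample compaction and allocation-stall counters
    while sleep 10; do
            date +%s
            grep -E '^(compact_|allocstall)' /proc/vmstat
    done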
On Thu, Jul 6, 2017 at 1:16 PM, Mel Gorman <[email protected]> wrote:
>
> I'm still struggling to see how counters help when an agent that monitors
> for high CPU usage could be activated
>
I suspect Roman has the same problem set as us: the CPU usage is
either always high, or high and service-critical precisely when something
interesting is happening. We'd like to collect data on 200k machines,
and study the results statistically and with respect to time based on
kernel versions, build configs, hardware types, process types, load
patterns, etc. Even finding good candidate machines, at the
right time of day, to manually debug with ftrace is problematic.
Granted, we could be utilizing existing counters like compact_fail
better. Ultimately the data either leads to dealing with certain bad
actors, different vm tunings, or patches to mm.
On Thu, Jul 06, 2017 at 02:00:00PM -0400, Debabrata Banerjee wrote:
> On Thu, Jul 6, 2017 at 1:16 PM, Mel Gorman <[email protected]> wrote:
> >
> > I'm still struggling to see how counters help when an agent that monitors
> > for high CPU usage could be activated
> >
>
> I suspect Roman has the same problem set as us, the CPU usage is
> either always high, high and service critical likely when something
> interesting is happening. We'd like to collect data on 200k machines,
> and study the results statistically and with respect to time based on
> kernel versions, build configs, hardware types, process types, load
> patterns, etc, etc. Even finding good candidate machines and at the
> right time of day to manually debug with ftrace is problematic.
> Granted we could be utilizing existing counters like compact_fail
> better. Ultimately the data either leads to dealing with certain bad
> actors, different vm tunings, or patches to mm.
Same issue as described in the other mail. The number of high-order
allocations that happened in the past or even the recent past does not
give you useful information for debugging high-order allocation stalls
or fragmentation-related issues. If the high-order allocations are
steady then two machines running similar workloads can both have similar
allocation counts but only one of them may be experiencing high latency.
Similarly, with high CPU usage, it may be due to compaction or a whole
variety of other factors. Even doing a statistical analysis is not going
to be enough unless all the relevant variables are accounted for and the
raw allocation count in isolation is one of the weakest variables to
draw conclusions from.
Correlating allocstall with compaction activity from just /proc/vmstat gives
a much better hint as to whether high CPU activity is due to high-order
allocations. Combining it with top will indicate whether it's direct or
indirect costs. If it really is high-order allocations then ftrace to
identify the source of the high-order allocations becomes relevant and if
it's due to fragmentation, it's a case of tracing the allocator itself to
determine why the fragmentation occurred.
The proc file with allocation counts is such a tiny part of debugging this
class of problem that it's almost irrelevant, which is why I think it
should be behind Kconfig at an absolute minimum. If you want to activate
it across production machines then by all means go ahead and if so, I'd
be very interested in hearing what class of problem could be debugged and
either tuned or fixed without needing ftrace to gather more information. I
say "almost irrelevant" because technically, correlating high allocation
counts with a kernel version change may be a relevant factor if a kernel
introduced a new source of high-order allocations but I suspect that's
the exception. It would be much more interesting to correlate increased
latency with a kernel version because it's much more relevant. You may
be able to correlate high allocation counts with particular hardware
(particularly network hardware that cannot scatter/gather) *but* the
same proc file will not tell you if those increased requests are
actually a problem, so the usefulness is diminished.
I'm not saying that fragmentation and high-order allocation stalls are not a
problem because they can be, but the proc file is unlikely to help and even
an extremely basic systemtap script would give you the same information,
work on much older kernels, and with a trivial amount of additional work
gather latency information as well as counts.
--
Mel Gorman
SUSE Labs
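On the latency side, ftrace alone can already provide per-call durations
for the allocator entry point (a sketch; assumes the function_graph tracer
is available and __alloc_pages_nodemask is the entry point, as in 4.12):

    # per-call latency of the page allocator, including reclaim/compaction
    cd /sys/kernel/debug/tracing
    echo __alloc_pages_nodemask > set_graph_function
    echo function_graph > current_tracer
    cat trace_pipe      # shows the duration of each call
    echo nop > current_tracer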
Hi Roman,
[auto build test ERROR on mmotm/master]
[also build test ERROR on next-20170706]
[cannot apply to v4.12]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/Roman-Gushchin/mm-make-allocation-counters-per-order/20170707-091630
base: git://git.cmpxchg.org/linux-mmotm.git master
config: x86_64-randconfig-x019-201727 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
# save the attached .config to linux build tree
make ARCH=x86_64
All errors (new ones prefixed by >>):
mm/vmstat.c: In function 'allocinfo_show':
>> mm/vmstat.c:1513:2: error: implicit declaration of function 'sum_vm_events' [-Werror=implicit-function-declaration]
sum_vm_events(allocs, PGALLOC_FIRST_ZONE, PGALLOC_EVENTS_SIZE);
^~~~~~~~~~~~~
At top level:
mm/vmstat.c:1491:13: warning: 'sum_alloc_events' defined but not used [-Wunused-function]
static void sum_alloc_events(unsigned long *v)
^~~~~~~~~~~~~~~~
Cyclomatic Complexity 5 include/linux/compiler.h:__read_once_size
Cyclomatic Complexity 1 arch/x86/include/asm/bitops.h:fls64
Cyclomatic Complexity 1 include/linux/log2.h:__ilog2_u64
Cyclomatic Complexity 1 include/asm-generic/getorder.h:__get_order
Cyclomatic Complexity 2 arch/x86/include/asm/jump_label.h:arch_static_branch
Cyclomatic Complexity 1 arch/x86/include/asm/atomic64_64.h:atomic64_read
Cyclomatic Complexity 1 include/asm-generic/atomic-long.h:atomic_long_read
Cyclomatic Complexity 1 include/linux/math64.h:div_u64_rem
Cyclomatic Complexity 1 include/linux/math64.h:div_u64
Cyclomatic Complexity 1 include/linux/err.h:ERR_PTR
Cyclomatic Complexity 1 include/linux/spinlock.h:spinlock_check
Cyclomatic Complexity 1 include/linux/spinlock.h:spin_unlock_irqrestore
Cyclomatic Complexity 1 include/linux/nodemask.h:node_state
Cyclomatic Complexity 1 include/linux/mmzone.h:zone_end_pfn
Cyclomatic Complexity 1 include/linux/mmzone.h:populated_zone
Cyclomatic Complexity 2 include/linux/mmzone.h:__nr_to_section
Cyclomatic Complexity 1 include/linux/mmzone.h:__section_mem_map_addr
Cyclomatic Complexity 3 include/linux/mmzone.h:valid_section
Cyclomatic Complexity 1 include/linux/mmzone.h:__pfn_to_section
Cyclomatic Complexity 2 include/linux/mmzone.h:pfn_valid
Cyclomatic Complexity 1 include/linux/mmzone.h:memmap_valid_within
Cyclomatic Complexity 1 include/linux/mm.h:page_zonenum
Cyclomatic Complexity 1 include/linux/mm.h:page_zone
Cyclomatic Complexity 1 include/linux/mm.h:page_to_section
Cyclomatic Complexity 1 include/linux/vmstat.h:global_page_state
Cyclomatic Complexity 1 include/linux/vmstat.h:global_node_page_state
Cyclomatic Complexity 1 include/linux/vmstat.h:zone_page_state
Cyclomatic Complexity 28 include/linux/slab.h:kmalloc_index
Cyclomatic Complexity 68 include/linux/slab.h:kmalloc_large
Cyclomatic Complexity 5 include/linux/slab.h:kmalloc
Cyclomatic Complexity 1 include/linux/cpu.h:cpus_read_lock
Cyclomatic Complexity 1 include/linux/cpu.h:cpus_read_unlock
Cyclomatic Complexity 1 include/linux/cpu.h:get_online_cpus
Cyclomatic Complexity 1 include/linux/cpu.h:put_online_cpus
Cyclomatic Complexity 1 include/linux/proc_fs.h:proc_create
Cyclomatic Complexity 3 mm/vmstat.c:fill_contig_page_info
Cyclomatic Complexity 3 mm/vmstat.c:__fragmentation_index
Cyclomatic Complexity 1 mm/vmstat.c:frag_stop
Cyclomatic Complexity 6 mm/vmstat.c:walk_zones_in_node
Cyclomatic Complexity 1 mm/vmstat.c:frag_show
Cyclomatic Complexity 3 mm/vmstat.c:is_zone_first_populated
Cyclomatic Complexity 1 mm/vmstat.c:zoneinfo_show
Cyclomatic Complexity 2 mm/vmstat.c:allocinfo_start
Cyclomatic Complexity 1 mm/vmstat.c:allocinfo_next
Cyclomatic Complexity 1 mm/vmstat.c:allocinfo_stop
Cyclomatic Complexity 3 mm/vmstat.c:vmstat_next
Cyclomatic Complexity 2 mm/vmstat.c:unusable_free_index
Cyclomatic Complexity 2 mm/vmstat.c:unusable_show
Cyclomatic Complexity 1 mm/vmstat.c:extfrag_show
Cyclomatic Complexity 1 mm/vmstat.c:zoneinfo_open
Cyclomatic Complexity 1 mm/vmstat.c:vmstat_open
Cyclomatic Complexity 1 mm/vmstat.c:pagetypeinfo_open
Cyclomatic Complexity 1 mm/vmstat.c:allocinfo_open
Cyclomatic Complexity 1 mm/vmstat.c:fragmentation_open
Cyclomatic Complexity 1 mm/vmstat.c:extfrag_open
Cyclomatic Complexity 1 mm/vmstat.c:unusable_open
Cyclomatic Complexity 7 mm/vmstat.c:zoneinfo_show_print
Cyclomatic Complexity 2 mm/vmstat.c:pagetypeinfo_showfree
Cyclomatic Complexity 4 mm/vmstat.c:pagetypeinfo_showfree_print
Cyclomatic Complexity 2 mm/vmstat.c:pagetypeinfo_showblockcount
Cyclomatic Complexity 2 mm/vmstat.c:frag_show_print
Cyclomatic Complexity 2 mm/vmstat.c:extfrag_show_print
Cyclomatic Complexity 2 mm/vmstat.c:unusable_show_print
Cyclomatic Complexity 1 mm/vmstat.c:frag_next
Cyclomatic Complexity 2 mm/vmstat.c:frag_start
Cyclomatic Complexity 4 mm/vmstat.c:vmstat_show
Cyclomatic Complexity 1 mm/vmstat.c:vmstat_stop
Cyclomatic Complexity 5 mm/vmstat.c:vmstat_start
Cyclomatic Complexity 8 mm/vmstat.c:pagetypeinfo_showblockcount_print
Cyclomatic Complexity 3 mm/vmstat.c:pagetypeinfo_showmixedcount
Cyclomatic Complexity 2 mm/vmstat.c:pagetypeinfo_show
Cyclomatic Complexity 4 mm/vmstat.c:allocinfo_show
Cyclomatic Complexity 4 mm/vmstat.c:extfrag_debug_init
Cyclomatic Complexity 1 mm/vmstat.c:fragmentation_index
Cyclomatic Complexity 1 mm/vmstat.c:init_mm_internals
cc1: some warnings being treated as errors
vim +/sum_vm_events +1513 mm/vmstat.c
1507 int zid;
1508
1509 if (arg != SEQ_START_TOKEN)
1510 return 0;
1511
1512 get_online_cpus();
> 1513 sum_vm_events(allocs, PGALLOC_FIRST_ZONE, PGALLOC_EVENTS_SIZE);
1514 put_online_cpus();
1515
1516 for (zid = 0; zid < MAX_NR_ZONES; ++zid) {
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
Hi Roman,
[auto build test WARNING on mmotm/master]
[also build test WARNING on next-20170706]
[cannot apply to v4.12]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/Roman-Gushchin/mm-make-allocation-counters-per-order/20170707-091630
base: git://git.cmpxchg.org/linux-mmotm.git master
config: s390-default_defconfig (attached as .config)
compiler: s390x-linux-gnu-gcc (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
wget https://raw.githubusercontent.com/01org/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=s390
All warnings (new ones prefixed by >>):
mm/memcontrol.c: In function 'memory_stat_show':
>> mm/memcontrol.c:5262:1: warning: the frame size of 1136 bytes is larger than 1024 bytes [-Wframe-larger-than=]
}
^
vim +5262 mm/memcontrol.c
f10eb7534 Roman Gushchin 2017-06-30 5246 events[PGSCAN_DIRECT]);
f10eb7534 Roman Gushchin 2017-06-30 5247 seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] +
f10eb7534 Roman Gushchin 2017-06-30 5248 events[PGSTEAL_DIRECT]);
f10eb7534 Roman Gushchin 2017-06-30 5249 seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]);
f10eb7534 Roman Gushchin 2017-06-30 5250 seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]);
f10eb7534 Roman Gushchin 2017-06-30 5251 seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]);
f10eb7534 Roman Gushchin 2017-06-30 5252 seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]);
f10eb7534 Roman Gushchin 2017-06-30 5253
2a2e48854 Johannes Weiner 2017-05-03 5254 seq_printf(m, "workingset_refault %lu\n",
71cd31135 Johannes Weiner 2017-05-03 5255 stat[WORKINGSET_REFAULT]);
2a2e48854 Johannes Weiner 2017-05-03 5256 seq_printf(m, "workingset_activate %lu\n",
71cd31135 Johannes Weiner 2017-05-03 5257 stat[WORKINGSET_ACTIVATE]);
2a2e48854 Johannes Weiner 2017-05-03 5258 seq_printf(m, "workingset_nodereclaim %lu\n",
71cd31135 Johannes Weiner 2017-05-03 5259 stat[WORKINGSET_NODERECLAIM]);
2a2e48854 Johannes Weiner 2017-05-03 5260
587d9f726 Johannes Weiner 2016-01-20 5261 return 0;
587d9f726 Johannes Weiner 2016-01-20 @5262 }
587d9f726 Johannes Weiner 2016-01-20 5263
241994ed8 Johannes Weiner 2015-02-11 5264 static struct cftype memory_files[] = {
241994ed8 Johannes Weiner 2015-02-11 5265 {
241994ed8 Johannes Weiner 2015-02-11 5266 .name = "current",
f5fc3c5d8 Johannes Weiner 2015-11-05 5267 .flags = CFTYPE_NOT_ON_ROOT,
241994ed8 Johannes Weiner 2015-02-11 5268 .read_u64 = memory_current_read,
241994ed8 Johannes Weiner 2015-02-11 5269 },
241994ed8 Johannes Weiner 2015-02-11 5270 {
:::::: The code at line 5262 was first introduced by commit
:::::: 587d9f726aaec52157e4156e50363dbe6cb82bdb mm: memcontrol: basic memory statistics in cgroup2 memory controller
:::::: TO: Johannes Weiner <[email protected]>
:::::: CC: Linus Torvalds <[email protected]>
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
On Thu, Jul 06, 2017 at 04:47:05PM +0100, Mel Gorman wrote:
> On Thu, Jul 06, 2017 at 03:46:34PM +0100, Roman Gushchin wrote:
> > > The alloc counter updates are themselves a surprisingly heavy cost to
> > > the allocation path and this makes it worse for a debugging case that is
> > > relatively rare. I'm extremely reluctant for such a patch to be added
> > > given that the tracepoints can be used to assemble such a monitor even
> > > if it means running a userspace daemon to keep track of it. Would such a
> > > solution be suitable? Failing that if this is a severe issue, would it be
> > > possible to at least make this a compile-time or static tracepoint option?
> > > That way, only people that really need it have to take the penalty.
> >
> > I've tried to measure the difference with my patch applied and without
> > any accounting at all (__count_alloc_event() redefined to an empty function),
> > and I wasn't able to find any measurable difference.
> > Can you, please, provide more details, how your scenario looked like,
> > when alloc coutners were costly?
> >
>
> At the time I used a page allocator microbenchmark from mmtests to call
> the allocator directly without zeroing pages. Triggering allocations from
> userspace generally mask the overhead by the zeroing costs. It's just a few
> cycles but given the budget for the page allocator in some circumstances
> is tiny, it was noticable. perf was used to examine the cost.
>
> > As new counters replace an old one, and both are per-cpu counters, I believe,
> > that the difference should be really small.
> >
>
> Minimally you add a new branch and a small number of computations. It's
> small but it's there. The cache footprint of the counters is also increased.
> That is hard to take given that it's overhead for everybody on the off-chance
> it can debug something.
>
> It's not a strong objection and I won't nak it on this basis but given
> that the same information can be easily obtained using tracepoints
> (optionally lower overhead with systemtap), the information is rarely
> going to be useful (no latency information for example) and there is an
> increased maintenance cost then it does not seem to be that useful.
I ran the page allocator microbenchmark on raw 4.12 and on 4.12 + the change
on my hardware.
Here are the results (raw 4.12 and patched 4.12, correspondingly):
order 0 batch 1 alloc 1942 free 1041 order 0 batch 1 alloc 822 free 451
order 0 batch 1 alloc 596 free 306 order 0 batch 1 alloc 493 free 290
order 0 batch 1 alloc 417 free 245 order 0 batch 1 alloc 526 free 286
order 0 batch 1 alloc 419 free 243 order 0 batch 1 alloc 435 free 255
order 0 batch 1 alloc 411 free 243 order 0 batch 1 alloc 423 free 240
order 0 batch 1 alloc 417 free 241 order 0 batch 1 alloc 406 free 239
order 0 batch 1 alloc 376 free 225 order 0 batch 1 alloc 383 free 219
order 0 batch 1 alloc 416 free 222 order 0 batch 1 alloc 355 free 205
order 0 batch 1 alloc 312 free 183 order 0 batch 1 alloc 438 free 216
order 0 batch 1 alloc 315 free 181 order 0 batch 1 alloc 347 free 194
order 0 batch 1 alloc 305 free 181 order 0 batch 1 alloc 317 free 185
order 0 batch 1 alloc 307 free 179 order 0 batch 1 alloc 329 free 191
order 0 batch 1 alloc 308 free 178 order 0 batch 1 alloc 335 free 192
order 0 batch 1 alloc 314 free 180 order 0 batch 1 alloc 350 free 190
order 0 batch 1 alloc 301 free 180 order 0 batch 1 alloc 319 free 184
order 0 batch 1 alloc 1807 free 1002 order 0 batch 1 alloc 813 free 459
order 0 batch 1 alloc 633 free 302 order 0 batch 1 alloc 500 free 287
order 0 batch 1 alloc 331 free 194 order 0 batch 1 alloc 609 free 300
order 0 batch 1 alloc 332 free 194 order 0 batch 1 alloc 443 free 255
order 0 batch 1 alloc 330 free 194 order 0 batch 1 alloc 410 free 239
order 0 batch 1 alloc 331 free 194 order 0 batch 1 alloc 383 free 222
order 0 batch 1 alloc 386 free 214 order 0 batch 1 alloc 372 free 212
order 0 batch 1 alloc 370 free 212 order 0 batch 1 alloc 342 free 203
order 0 batch 1 alloc 360 free 208 order 0 batch 1 alloc 428 free 216
order 0 batch 1 alloc 324 free 186 order 0 batch 1 alloc 350 free 195
order 0 batch 1 alloc 298 free 179 order 0 batch 1 alloc 320 free 186
order 0 batch 1 alloc 293 free 173 order 0 batch 1 alloc 323 free 187
order 0 batch 1 alloc 296 free 173 order 0 batch 1 alloc 320 free 188
order 0 batch 1 alloc 294 free 173 order 0 batch 1 alloc 321 free 186
order 0 batch 1 alloc 312 free 174 order 0 batch 1 alloc 320 free 189
order 0 batch 1 alloc 1927 free 1042 order 0 batch 1 alloc 2016 free 10
order 0 batch 1 alloc 856 free 522 order 0 batch 1 alloc 1805 free 10
order 0 batch 1 alloc 372 free 225 order 0 batch 1 alloc 1485 free 73
order 0 batch 1 alloc 375 free 224 order 0 batch 1 alloc 732 free 419
order 0 batch 1 alloc 419 free 234 order 0 batch 1 alloc 576 free 327
order 0 batch 1 alloc 389 free 233 order 0 batch 1 alloc 488 free 280
order 0 batch 1 alloc 376 free 223 order 0 batch 1 alloc 390 free 233
order 0 batch 1 alloc 331 free 196 order 0 batch 1 alloc 409 free 227
order 0 batch 1 alloc 328 free 191 order 0 batch 1 alloc 338 free 198
order 0 batch 1 alloc 307 free 182 order 0 batch 1 alloc 320 free 186
order 0 batch 1 alloc 307 free 183 order 0 batch 1 alloc 322 free 187
order 0 batch 1 alloc 304 free 183 order 0 batch 1 alloc 296 free 173
order 0 batch 1 alloc 303 free 183 order 0 batch 1 alloc 297 free 173
order 0 batch 1 alloc 303 free 176 order 0 batch 1 alloc 296 free 173
order 0 batch 1 alloc 294 free 176 order 0 batch 1 alloc 296 free 173
order 0 batch 1 alloc 716 free 433 order 0 batch 1 alloc 725 free 421
order 0 batch 1 alloc 487 free 286 order 0 batch 1 alloc 485 free 287
order 0 batch 1 alloc 499 free 296 order 0 batch 1 alloc 627 free 300
order 0 batch 1 alloc 437 free 253 order 0 batch 1 alloc 406 free 238
order 0 batch 1 alloc 412 free 242 order 0 batch 1 alloc 390 free 226
order 0 batch 1 alloc 391 free 228 order 0 batch 1 alloc 364 free 218
order 0 batch 1 alloc 379 free 220 order 0 batch 1 alloc 353 free 213
order 0 batch 1 alloc 349 free 210 order 0 batch 1 alloc 334 free 200
order 0 batch 1 alloc 350 free 207 order 0 batch 1 alloc 328 free 194
order 0 batch 1 alloc 402 free 229 order 0 batch 1 alloc 406 free 202
order 0 batch 1 alloc 329 free 188 order 0 batch 1 alloc 333 free 188
order 0 batch 1 alloc 310 free 182 order 0 batch 1 alloc 318 free 188
order 0 batch 1 alloc 307 free 180 order 0 batch 1 alloc 319 free 186
order 0 batch 1 alloc 304 free 180 order 0 batch 1 alloc 320 free 183
order 0 batch 1 alloc 307 free 180 order 0 batch 1 alloc 317 free 185
order 0 batch 1 alloc 827 free 479 order 0 batch 1 alloc 667 free 375
order 0 batch 1 alloc 389 free 228 order 0 batch 1 alloc 479 free 276
order 0 batch 1 alloc 509 free 256 order 0 batch 1 alloc 599 free 294
order 0 batch 1 alloc 338 free 204 order 0 batch 1 alloc 412 free 243
order 0 batch 1 alloc 331 free 194 order 0 batch 1 alloc 376 free 227
order 0 batch 1 alloc 318 free 189 order 0 batch 1 alloc 363 free 211
order 0 batch 1 alloc 305 free 180 order 0 batch 1 alloc 343 free 204
order 0 batch 1 alloc 307 free 191 order 0 batch 1 alloc 332 free 200
order 0 batch 1 alloc 304 free 180 order 0 batch 1 alloc 415 free 206
order 0 batch 1 alloc 351 free 195 order 0 batch 1 alloc 328 free 199
order 0 batch 1 alloc 351 free 193 order 0 batch 1 alloc 321 free 185
order 0 batch 1 alloc 315 free 184 order 0 batch 1 alloc 318 free 185
order 0 batch 1 alloc 317 free 194 order 0 batch 1 alloc 319 free 193
order 0 batch 1 alloc 298 free 179 order 0 batch 1 alloc 323 free 187
order 0 batch 1 alloc 293 free 175 order 0 batch 1 alloc 324 free 184
TBH, I can't see any meaningful difference here, but it might
depend on hardware, of course. As most of the allocations are
order 0, and the other nearby counters are not so hot,
the cache footprint increase should not be that significant.
Anyway, I've added a config option.
Thanks!
Roman
From 36c6f4d293469569ca8b53b89ab8eebd358a5fa5 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <[email protected]>
Date: Mon, 3 Jul 2017 19:02:49 +0100
Subject: [v2] mm: make allocation counters per-order
High-order allocations are obviously more costly, and it's very useful
to know how many of them happen when there are issues
(or suspicions of issues) with memory fragmentation.
This commit changes existing per-zone allocation counters to be
per-zone per-order. These counters are displayed using a new
procfs interface (similar to /proc/buddyinfo):
$ cat /proc/allocinfo
DMA 0 0 0 0 0 \
0 0 0 0 0 0
DMA32 3 0 1 0 0 \
0 0 0 0 0 0
Normal 4997056 23594 10902 23686 931 \
23 122 786 17 1 0
Movable 0 0 0 0 0 \
0 0 0 0 0 0
Device 0 0 0 0 0 \
0 0 0 0 0 0
The existing vmstat interface remains untouched*, and still shows
the total number of single page allocations, so high-order allocations
are represented as a corresponding number of order-0 allocations.
$ cat /proc/vmstat | grep alloc
pgalloc_dma 0
pgalloc_dma32 7
pgalloc_normal 5461660
pgalloc_movable 0
pgalloc_device 0
* I've added device zone for consistency with other zones,
and to avoid messy exclusion of this zone in the code.
v2:
The functionality can be enabled/disabled by the PER_ORDER_ALLOC_COUNTERS
config option.
Signed-off-by: Roman Gushchin <[email protected]>
Suggested-by: Johannes Weiner <[email protected]>
Cc: Debabrata Banerjee <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Vladimir Davydov <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
---
arch/s390/appldata/appldata_mem.c | 16 +++++
include/linux/mmzone.h | 2 +
include/linux/vm_event_item.h | 27 ++++++--
include/linux/vmstat.h | 20 ++++++
init/Kconfig | 9 +++
mm/page_alloc.c | 11 +++-
mm/vmstat.c | 128 +++++++++++++++++++++++++++++++++++---
7 files changed, 199 insertions(+), 14 deletions(-)
diff --git a/arch/s390/appldata/appldata_mem.c b/arch/s390/appldata/appldata_mem.c
index 598df57..79679d3 100644
--- a/arch/s390/appldata/appldata_mem.c
+++ b/arch/s390/appldata/appldata_mem.c
@@ -66,6 +66,21 @@ struct appldata_mem_data {
} __packed;
+#ifdef CONFIG_PER_ORDER_ALLOC_COUNTERS
+static inline sum_pgalloc_events(u64 *pgalloc, unsigned long *ev)
+{
+ int order;
+
+ for (order = 1; order < MAX_ORDER; ++order) {
+ pgalloc += ev[PGALLOC_NORMAL + order * MAX_NR_ZONES] << order;
+ pgalloc += ev[PGALLOC_DMA + order * MAX_NR_ZONES] << order;
+ }
+}
+#else
+static inline sum_pgalloc_events(u64 *pgalloc, unsigned long *ev)
+{
+}
+#endif
/*
* appldata_get_mem_data()
@@ -92,6 +107,7 @@ static void appldata_get_mem_data(void *data)
mem_data->pswpout = ev[PSWPOUT];
mem_data->pgalloc = ev[PGALLOC_NORMAL];
mem_data->pgalloc += ev[PGALLOC_DMA];
+ sum_pgalloc_events(&mem_data->pgalloc, ev);
mem_data->pgfault = ev[PGFAULT];
mem_data->pgmajfault = ev[PGMAJFAULT];
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fc14b8b..406dfc4 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -66,6 +66,8 @@ enum migratetype {
/* In mm/page_alloc.c; keep in sync also with show_migration_types() there */
extern char * const migratetype_names[MIGRATE_TYPES];
+extern const char *zone_name(int idx);
+
#ifdef CONFIG_CMA
# define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
# define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA)
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 37e8d31..da94618 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -19,12 +19,31 @@
#define HIGHMEM_ZONE(xx)
#endif
-#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, HIGHMEM_ZONE(xx) xx##_MOVABLE
+#ifdef CONFIG_ZONE_DEVICE
+#define DEVICE_ZONE(xx) xx##__DEVICE,
+#else
+#define DEVICE_ZONE(xx)
+#endif
+
+#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, HIGHMEM_ZONE(xx) xx##_MOVABLE, DEVICE_ZONE(xx)
+
+#ifdef CONFIG_PER_ORDER_ALLOC_COUNTERS
+#define PGALLOC_EVENTS_SIZE (MAX_NR_ZONES * MAX_ORDER)
+#define PGALLOC_EVENTS_CUT_SIZE (MAX_NR_ZONES * (MAX_ORDER - 1))
+#define PGALLOC_FIRST_ZONE (PGALLOC_NORMAL - ZONE_NORMAL)
+#else
+#define PGALLOC_EVENTS_SIZE MAX_NR_ZONES
+#define PGALLOC_EVENTS_CUT_SIZE 0
+#define PGALLOC_FIRST_ZONE (PGALLOC_NORMAL - ZONE_NORMAL)
+#endif
enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
- FOR_ALL_ZONES(PGALLOC),
- FOR_ALL_ZONES(ALLOCSTALL),
- FOR_ALL_ZONES(PGSCAN_SKIP),
+ FOR_ALL_ZONES(PGALLOC)
+#ifdef CONFIG_PER_ORDER_ALLOC_COUNTERS
+ __PGALLOC_LAST = PGALLOC_FIRST_ZONE + PGALLOC_EVENTS_SIZE - 1,
+#endif
+ FOR_ALL_ZONES(ALLOCSTALL)
+ FOR_ALL_ZONES(PGSCAN_SKIP)
PGFREE, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE,
PGFAULT, PGMAJFAULT,
PGLAZYFREED,
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index b3d85f3..bca96fc 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -103,6 +103,26 @@ static inline void vm_events_fold_cpu(int cpu)
#define __count_zid_vm_events(item, zid, delta) \
__count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta)
+#ifdef CONFIG_PER_ORDER_ALLOC_COUNTERS
+static inline void __count_alloc_event(enum zone_type zid, unsigned int order)
+{
+ enum vm_event_item item;
+
+ if (unlikely(order >= MAX_ORDER)) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ item = PGALLOC_FIRST_ZONE + order * MAX_NR_ZONES + zid;
+ __count_vm_events(item, 1);
+}
+#else
+static inline void __count_alloc_event(enum zone_type zid, unsigned int order)
+{
+ __count_zid_vm_events(PGALLOC, zid, 1 << order);
+}
+#endif
+
/*
* Zone and node-based page accounting with per cpu differentials.
*/
diff --git a/init/Kconfig b/init/Kconfig
index 8514b25..164d6f0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1477,6 +1477,15 @@ config VM_EVENT_COUNTERS
on EXPERT systems. /proc/vmstat will only show page counts
if VM event counters are disabled.
+config PER_ORDER_ALLOC_COUNTERS
+ bool "Per-order memory allocation counters"
+ depends on VM_EVENT_COUNTERS && PROC_FS
+ help
+ This option enables splitting per-zone allocation counters
+ into per-zone per-order counters.
+ Per-order counters are exported using the /proc/allocinfo
+ interface, and /proc/vmstat shows accumulated values.
+
config SLUB_DEBUG
default y
bool "Enable SLUB debugging support" if EXPERT
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 80e4adb..e74b327 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -233,6 +233,13 @@ static char * const zone_names[MAX_NR_ZONES] = {
#endif
};
+const char *zone_name(int zid)
+{
+ if (zid < MAX_NR_ZONES)
+ return zone_names[zid];
+ return NULL;
+}
+
char * const migratetype_names[MIGRATE_TYPES] = {
"Unmovable",
"Movable",
@@ -2779,7 +2786,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
list = &pcp->lists[migratetype];
page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list);
if (page) {
- __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+ __count_alloc_event(page_zonenum(page), order);
zone_statistics(preferred_zone, zone);
}
local_irq_restore(flags);
@@ -2827,7 +2834,7 @@ struct page *rmqueue(struct zone *preferred_zone,
__mod_zone_freepage_state(zone, -(1 << order),
get_pcppage_migratetype(page));
- __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+ __count_alloc_event(page_zonenum(page), order);
zone_statistics(preferred_zone, zone);
local_irq_restore(flags);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9a4441b..1d468ed 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -27,6 +27,7 @@
#include <linux/mm_inline.h>
#include <linux/page_ext.h>
#include <linux/page_owner.h>
+#include <linux/mmzone.h>
#include "internal.h"
@@ -34,18 +35,18 @@
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);
-static void sum_vm_events(unsigned long *ret)
+static void sum_vm_events(unsigned long *ret, int off, size_t nr_events)
{
int cpu;
int i;
- memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
+ memset(ret, 0, nr_events * sizeof(unsigned long));
for_each_online_cpu(cpu) {
struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
- for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
- ret[i] += this->event[i];
+ for (i = 0; i < nr_events; i++)
+ ret[i] += this->event[off + i];
}
}
@@ -57,7 +58,7 @@ static void sum_vm_events(unsigned long *ret)
void all_vm_events(unsigned long *ret)
{
get_online_cpus();
- sum_vm_events(ret);
+ sum_vm_events(ret, 0, NR_VM_EVENT_ITEMS);
put_online_cpus();
}
EXPORT_SYMBOL_GPL(all_vm_events);
@@ -915,8 +916,15 @@ int fragmentation_index(struct zone *zone, unsigned int order)
#define TEXT_FOR_HIGHMEM(xx)
#endif
+#ifdef CONFIG_ZONE_DEVICE
+#define TEXT_FOR_DEVICE(xx) xx "_device",
+#else
+#define TEXT_FOR_DEVICE(xx)
+#endif
+
#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
- TEXT_FOR_HIGHMEM(xx) xx "_movable",
+ TEXT_FOR_HIGHMEM(xx) xx "_movable", \
+ TEXT_FOR_DEVICE(xx)
const char * const vmstat_text[] = {
/* enum zone_stat_item countes */
@@ -1480,12 +1488,92 @@ enum writeback_stat_item {
NR_VM_WRITEBACK_STAT_ITEMS,
};
+#ifdef CONFIG_PER_ORDER_ALLOC_COUNTERS
+static void sum_alloc_events(unsigned long *v)
+{
+ int zid, order, index;
+
+ for (zid = 0; zid < MAX_NR_ZONES; ++zid) {
+ for (order = 1; order < MAX_ORDER; order++) {
+ index = PGALLOC_FIRST_ZONE + zid;
+ v[index] += v[index + order * MAX_NR_ZONES] << order;
+ }
+ }
+}
+
+static int allocinfo_show(struct seq_file *m, void *arg)
+{
+ unsigned long allocs[PGALLOC_EVENTS_SIZE];
+ unsigned int order;
+ int zid;
+
+ if (arg != SEQ_START_TOKEN)
+ return 0;
+
+ get_online_cpus();
+ sum_vm_events(allocs, PGALLOC_FIRST_ZONE, PGALLOC_EVENTS_SIZE);
+ put_online_cpus();
+
+ for (zid = 0; zid < MAX_NR_ZONES; ++zid) {
+ seq_printf(m, "%8s ", zone_name(zid));
+
+ for (order = 0; order < MAX_ORDER; order++)
+ seq_printf(m, "%10lu ",
+ allocs[zid + order * MAX_NR_ZONES]);
+
+ seq_putc(m, '\n');
+ }
+
+ return 0;
+}
+
+static void *allocinfo_start(struct seq_file *m, loff_t *pos)
+{
+ if (*pos)
+ return NULL;
+ return SEQ_START_TOKEN;
+}
+
+static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+ ++*pos;
+ return NULL;
+}
+
+static void allocinfo_stop(struct seq_file *m, void *arg)
+{
+}
+
+static const struct seq_operations allocinfo_op = {
+ .start = allocinfo_start,
+ .next = allocinfo_next,
+ .stop = allocinfo_stop,
+ .show = allocinfo_show,
+};
+
+static int allocinfo_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &allocinfo_op);
+}
+
+static const struct file_operations allocinfo_file_operations = {
+ .open = allocinfo_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+#else
+static void sum_alloc_events(unsigned long *v)
+{
+}
+#endif
+
static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
unsigned long *v;
int i, stat_items_size;
- if (*pos >= ARRAY_SIZE(vmstat_text))
+ if (*pos >= ARRAY_SIZE(vmstat_text) + PGALLOC_EVENTS_CUT_SIZE)
return NULL;
stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) +
@@ -1513,6 +1601,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
#ifdef CONFIG_VM_EVENT_COUNTERS
all_vm_events(v);
+ sum_alloc_events(v);
v[PGPGIN] /= 2; /* sectors -> kbytes */
v[PGPGOUT] /= 2;
#endif
@@ -1521,8 +1610,16 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
+ int alloc_event_start = NR_VM_ZONE_STAT_ITEMS +
+ NR_VM_NODE_STAT_ITEMS +
+ NR_VM_WRITEBACK_STAT_ITEMS +
+ PGALLOC_FIRST_ZONE;
+
(*pos)++;
- if (*pos >= ARRAY_SIZE(vmstat_text))
+ if (*pos == alloc_event_start + MAX_NR_ZONES)
+ *(pos) += PGALLOC_EVENTS_CUT_SIZE;
+
+ if (*pos >= ARRAY_SIZE(vmstat_text) + PGALLOC_EVENTS_CUT_SIZE)
return NULL;
return (unsigned long *)m->private + *pos;
}
@@ -1531,6 +1628,18 @@ static int vmstat_show(struct seq_file *m, void *arg)
{
unsigned long *l = arg;
unsigned long off = l - (unsigned long *)m->private;
+ int alloc_event_start = NR_VM_ZONE_STAT_ITEMS +
+ NR_VM_NODE_STAT_ITEMS +
+ NR_VM_WRITEBACK_STAT_ITEMS +
+ PGALLOC_FIRST_ZONE;
+
+ if (off >= alloc_event_start + PGALLOC_EVENTS_SIZE)
+ off -= PGALLOC_EVENTS_CUT_SIZE;
+
+ if (unlikely(off >= ARRAY_SIZE(vmstat_text))) {
+ WARN_ON_ONCE(1);
+ return 0;
+ }
seq_puts(m, vmstat_text[off]);
seq_put_decimal_ull(m, " ", *l);
@@ -1790,6 +1899,9 @@ void __init init_mm_internals(void)
#endif
#ifdef CONFIG_PROC_FS
proc_create("buddyinfo", 0444, NULL, &buddyinfo_file_operations);
+#ifdef CONFIG_PER_ORDER_ALLOC_COUNTERS
+ proc_create("allocinfo", 0444, NULL, &allocinfo_file_operations);
+#endif
proc_create("pagetypeinfo", 0444, NULL, &pagetypeinfo_file_operations);
proc_create("vmstat", 0444, NULL, &vmstat_file_operations);
proc_create("zoneinfo", 0444, NULL, &zoneinfo_file_operations);
--
2.7.4
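A note on consuming the new interface: allocinfo_show() above emits one
'\n' per zone, so the backslash-continued lines in the changelog appear
to be wrapping for the email only; the real format is one line per zone,
a zone name followed by MAX_ORDER counters. A rough sketch of a reader
that folds those counters back into page totals (userspace C, assuming
that format):

/*
 * Sketch of a /proc/allocinfo consumer: for each zone line, fold the
 * per-order counters back into order-0 page equivalents.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/allocinfo", "r");

	if (!f) {
		perror("/proc/allocinfo");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		unsigned long pages = 0;
		unsigned int order = 0;
		char *tok = strtok(line, " \t\n");	/* zone name */

		if (!tok)
			continue;
		printf("%-8s", tok);
		while ((tok = strtok(NULL, " \t\n")) != NULL) {
			/* an order-N counter accounts for count << N pages */
			pages += strtoul(tok, NULL, 10) << order;
			order++;
		}
		printf(" %lu pages\n", pages);
	}
	fclose(f);
	return 0;
}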
Hi Roman,
[auto build test WARNING on mmotm/master]
[also build test WARNING on v4.13-rc1 next-20170714]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/Roman-Gushchin/mm-make-allocation-counters-per-order/20170716-213429
base: git://git.cmpxchg.org/linux-mmotm.git master
config: s390-allyesconfig (attached as .config)
compiler: s390x-linux-gnu-gcc (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
wget https://raw.githubusercontent.com/01org/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=s390
All warnings (new ones prefixed by >>):
>> arch/s390//appldata/appldata_mem.c:70:15: warning: return type defaults to 'int' [-Wreturn-type]
static inline sum_pgalloc_events(u64 *pgalloc, unsigned long *ev)
^~~~~~~~~~~~~~~~~~
arch/s390//appldata/appldata_mem.c: In function 'sum_pgalloc_events':
>> arch/s390//appldata/appldata_mem.c:78:1: warning: control reaches end of non-void function [-Wreturn-type]
}
^
vim +/int +70 arch/s390//appldata/appldata_mem.c
68
69 #ifdef CONFIG_PER_ORDER_ALLOC_COUNTERS
> 70 static inline sum_pgalloc_events(u64 *pgalloc, unsigned long *ev)
71 {
72 int order;
73
74 for (order = 1; order < MAX_ORDER; ++order) {
75 pgalloc += ev[PGALLOC_NORMAL + order * MAX_NR_ZONES] << order;
76 pgalloc += ev[PGALLOC_DMA + order * MAX_NR_ZONES] << order;
77 }
> 78 }
79 #else
80 static inline sum_pgalloc_events(u64 *pgalloc, unsigned long *ev)
81 {
82 }
83 #endif
84
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
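The warning above is about the missing return type of
sum_pgalloc_events(); note that the helper as posted also advances the
u64 pointer itself (pgalloc += ...) rather than adding to the value it
points at, so the per-order pages would never reach mem_data->pgalloc.
A possible way to address both, sketched against the v2 hunk above (not
a posted fix):

#ifdef CONFIG_PER_ORDER_ALLOC_COUNTERS
static inline void sum_pgalloc_events(u64 *pgalloc, unsigned long *ev)
{
	int order;

	/* Fold per-order counts into order-0 page equivalents. */
	for (order = 1; order < MAX_ORDER; ++order) {
		*pgalloc += ev[PGALLOC_NORMAL + order * MAX_NR_ZONES] << order;
		*pgalloc += ev[PGALLOC_DMA + order * MAX_NR_ZONES] << order;
	}
}
#else
static inline void sum_pgalloc_events(u64 *pgalloc, unsigned long *ev)
{
}
#endif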