Use the per cpu allocator functionality to avoid per cpu arrays in struct zone.
This drastically reduces the size of struct zone for systems with large
numbers of processors and allows placement of critical variables of struct
zone in one cacheline even on very large systems.
Another effect is that the pagesets of one processor are placed near one
another. If multiple pagesets from different zones fit into one cacheline
then additional cacheline fetches can be avoided on the hot paths when
allocating memory from multiple zones.
Bootstrap becomes simpler if we use the same scheme for UP, SMP, NUMA. #ifdefs
are reduced and we can drop the zone_pcp macro.
Hotplug handling is also simplified since cpu alloc can bring up and
shut down cpu areas for a specific cpu as a whole. So there is no need to
allocate or free individual pagesets.
Cc: Mel Gorman <[email protected]>
Signed-off-by: Christoph Lameter <[email protected]>
---
include/linux/mm.h | 4 -
include/linux/mmzone.h | 12 ---
mm/page_alloc.c | 156 ++++++++++++++-----------------------------------
mm/vmstat.c | 14 ++--
4 files changed, 55 insertions(+), 131 deletions(-)
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h 2009-09-29 09:30:37.000000000 -0500
+++ linux-2.6/include/linux/mm.h 2009-09-29 09:30:39.000000000 -0500
@@ -1062,11 +1062,7 @@ extern void si_meminfo_node(struct sysin
extern int after_bootmem;
extern void setup_pagesets(void);
-#ifdef CONFIG_NUMA
extern void setup_per_cpu_pageset(void);
-#else
-static inline void setup_per_cpu_pageset(void) {}
-#endif
extern void zone_pcp_update(struct zone *zone);
Index: linux-2.6/include/linux/mmzone.h
===================================================================
--- linux-2.6.orig/include/linux/mmzone.h 2009-09-29 09:30:25.000000000 -0500
+++ linux-2.6/include/linux/mmzone.h 2009-09-29 09:30:39.000000000 -0500
@@ -184,13 +184,7 @@ struct per_cpu_pageset {
s8 stat_threshold;
s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
#endif
-} ____cacheline_aligned_in_smp;
-
-#ifdef CONFIG_NUMA
-#define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)])
-#else
-#define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)])
-#endif
+};
#endif /* !__GENERATING_BOUNDS.H */
@@ -306,10 +300,8 @@ struct zone {
*/
unsigned long min_unmapped_pages;
unsigned long min_slab_pages;
- struct per_cpu_pageset *pageset[NR_CPUS];
-#else
- struct per_cpu_pageset pageset[NR_CPUS];
#endif
+ struct per_cpu_pageset *pageset;
/*
* free areas of different sizes
*/
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c 2009-09-29 09:30:37.000000000 -0500
+++ linux-2.6/mm/page_alloc.c 2009-09-29 09:30:50.000000000 -0500
@@ -1011,7 +1011,7 @@ static void drain_pages(unsigned int cpu
struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
- pset = zone_pcp(zone, cpu);
+ pset = per_cpu_ptr(zone->pageset, cpu);
pcp = &pset->pcp;
local_irq_save(flags);
@@ -1098,7 +1098,7 @@ static void free_hot_cold_page(struct pa
arch_free_page(page, 0);
kernel_map_pages(page, 1, 0);
- pcp = &zone_pcp(zone, get_cpu())->pcp;
+ pcp = &this_cpu_ptr(zone->pageset)->pcp;
migratetype = get_pageblock_migratetype(page);
set_page_private(page, migratetype);
local_irq_save(flags);
@@ -1133,7 +1133,6 @@ static void free_hot_cold_page(struct pa
out:
local_irq_restore(flags);
- put_cpu();
}
void free_hot_page(struct page *page)
@@ -1183,15 +1182,13 @@ struct page *buffered_rmqueue(struct zon
unsigned long flags;
struct page *page;
int cold = !!(gfp_flags & __GFP_COLD);
- int cpu;
again:
- cpu = get_cpu();
if (likely(order == 0)) {
struct per_cpu_pages *pcp;
struct list_head *list;
- pcp = &zone_pcp(zone, cpu)->pcp;
+ pcp = &this_cpu_ptr(zone->pageset)->pcp;
list = &pcp->lists[migratetype];
local_irq_save(flags);
if (list_empty(list)) {
@@ -1234,7 +1231,6 @@ again:
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone);
local_irq_restore(flags);
- put_cpu();
VM_BUG_ON(bad_range(zone, page));
if (prep_new_page(page, order, gfp_flags))
@@ -1243,7 +1239,6 @@ again:
failed:
local_irq_restore(flags);
- put_cpu();
return NULL;
}
@@ -2172,7 +2167,7 @@ void show_free_areas(void)
for_each_online_cpu(cpu) {
struct per_cpu_pageset *pageset;
- pageset = zone_pcp(zone, cpu);
+ pageset = per_cpu_ptr(zone->pageset, cpu);
printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
cpu, pageset->pcp.high,
@@ -3087,7 +3082,6 @@ static void setup_pagelist_highmark(stru
}
-#ifdef CONFIG_NUMA
/*
* Boot pageset table. One per cpu which is going to be used for all
* zones and all nodes. The parameters will be set in such a way
@@ -3095,112 +3089,67 @@ static void setup_pagelist_highmark(stru
* the buddy list. This is safe since pageset manipulation is done
* with interrupts disabled.
*
- * Some NUMA counter updates may also be caught by the boot pagesets.
- *
- * The boot_pagesets must be kept even after bootup is complete for
- * unused processors and/or zones. They do play a role for bootstrapping
- * hotplugged processors.
+ * Some counter updates may also be caught by the boot pagesets.
*
* zoneinfo_show() and maybe other functions do
* not check if the processor is online before following the pageset pointer.
* Other parts of the kernel may not check if the zone is available.
*/
-static struct per_cpu_pageset boot_pageset[NR_CPUS];
-
-/*
- * Dynamically allocate memory for the
- * per cpu pageset array in struct zone.
- */
-static int __cpuinit process_zones(int cpu)
-{
- struct zone *zone, *dzone;
- int node = cpu_to_node(cpu);
-
- node_set_state(node, N_CPU); /* this node has a cpu */
-
- for_each_populated_zone(zone) {
- zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
- GFP_KERNEL, node);
- if (!zone_pcp(zone, cpu))
- goto bad;
-
- setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
-
- if (percpu_pagelist_fraction)
- setup_pagelist_highmark(zone_pcp(zone, cpu),
- (zone->present_pages / percpu_pagelist_fraction));
- }
-
- return 0;
-bad:
- for_each_zone(dzone) {
- if (!populated_zone(dzone))
- continue;
- if (dzone == zone)
- break;
- kfree(zone_pcp(dzone, cpu));
- zone_pcp(dzone, cpu) = &boot_pageset[cpu];
- }
- return -ENOMEM;
-}
-
-static inline void free_zone_pagesets(int cpu)
-{
- struct zone *zone;
-
- for_each_zone(zone) {
- struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
-
- /* Free per_cpu_pageset if it is slab allocated */
- if (pset != &boot_pageset[cpu])
- kfree(pset);
- zone_pcp(zone, cpu) = &boot_pageset[cpu];
- }
-}
+static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
int cpu = (long)hcpu;
- int ret = NOTIFY_OK;
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- if (process_zones(cpu))
- ret = NOTIFY_BAD;
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- free_zone_pagesets(cpu);
+ node_set_state(cpu_to_node(cpu), N_CPU);
break;
default:
break;
}
- return ret;
+ return NOTIFY_OK;
}
static struct notifier_block __cpuinitdata pageset_notifier =
{ &pageset_cpuup_callback, NULL, 0 };
+/*
+ * Allocate per cpu pagesets and initialize them.
+ * Before this call only boot pagesets were available.
+ * Boot pagesets will no longer be used after this call is complete.
+ */
void __init setup_per_cpu_pageset(void)
{
- int err;
+ struct zone *zone;
+ int cpu;
+
+ for_each_populated_zone(zone) {
+ zone->pageset = alloc_percpu(struct per_cpu_pageset);
- /* Initialize per_cpu_pageset for cpu 0.
- * A cpuup callback will do this for every cpu
- * as it comes online
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
+
+ setup_pageset(pcp, zone_batchsize(zone));
+
+ if (percpu_pagelist_fraction)
+ setup_pagelist_highmark(pcp,
+ (zone->present_pages /
+ percpu_pagelist_fraction));
+ }
+ }
+
+ /*
+ * The boot cpu is always the first active.
+ * The boot node has a processor
*/
- err = process_zones(smp_processor_id());
- BUG_ON(err);
+ node_set_state(cpu_to_node(smp_processor_id()), N_CPU);
register_cpu_notifier(&pageset_notifier);
}
-#endif
-
static noinline __init_refok
int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
{
@@ -3254,7 +3203,7 @@ static int __zone_pcp_update(void *data)
struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
- pset = zone_pcp(zone, cpu);
+ pset = per_cpu_ptr(zone->pageset, cpu);
pcp = &pset->pcp;
local_irq_save(flags);
@@ -3272,15 +3221,7 @@ void zone_pcp_update(struct zone *zone)
/*
* Early setup of pagesets.
- *
- * In the NUMA case the pageset setup simply results in all zones pcp
- * pointer being directed at a per cpu pageset with zero batchsize.
- *
- * This means that every free and every allocation occurs directly from
- * the buddy allocator tables.
- *
- * The pageset never queues pages during early boot and is therefore usable
- * for every type of zone.
+ * At this point various allocators are not operational yet.
*/
__meminit void setup_pagesets(void)
{
@@ -3288,23 +3229,15 @@ __meminit void setup_pagesets(void)
struct zone *zone;
for_each_zone(zone) {
-#ifdef CONFIG_NUMA
- unsigned long batch = 0;
-
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
- /* Early boot. Slab allocator not functional yet */
- zone_pcp(zone, cpu) = &boot_pageset[cpu];
- }
-#else
- unsigned long batch = zone_batchsize(zone);
-#endif
+ zone->pageset = &per_cpu_var(boot_pageset);
+ /*
+ * Special pagesets with zero elements so that frees
+ * and allocations are not buffered at all.
+ */
for_each_possible_cpu(cpu)
- setup_pageset(zone_pcp(zone, cpu), batch);
+ setup_pageset(per_cpu_ptr(zone->pageset, cpu), 0);
- if (zone->present_pages)
- printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
- zone->name, zone->present_pages, batch);
}
}
@@ -4818,10 +4751,11 @@ int percpu_pagelist_fraction_sysctl_hand
if (!write || (ret == -EINVAL))
return ret;
for_each_populated_zone(zone) {
- for_each_online_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
unsigned long high;
high = zone->present_pages / percpu_pagelist_fraction;
- setup_pagelist_highmark(zone_pcp(zone, cpu), high);
+ setup_pagelist_highmark(
+ per_cpu_ptr(zone->pageset, cpu), high);
}
}
return 0;
Index: linux-2.6/mm/vmstat.c
===================================================================
--- linux-2.6.orig/mm/vmstat.c 2009-09-29 09:30:25.000000000 -0500
+++ linux-2.6/mm/vmstat.c 2009-09-29 09:30:43.000000000 -0500
@@ -139,7 +139,8 @@ static void refresh_zone_stat_thresholds
threshold = calculate_threshold(zone);
for_each_online_cpu(cpu)
- zone_pcp(zone, cpu)->stat_threshold = threshold;
+ per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ = threshold;
}
}
@@ -149,7 +150,8 @@ static void refresh_zone_stat_thresholds
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
int delta)
{
- struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+ struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
+
s8 *p = pcp->vm_stat_diff + item;
long x;
@@ -202,7 +204,7 @@ EXPORT_SYMBOL(mod_zone_page_state);
*/
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
- struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+ struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
s8 *p = pcp->vm_stat_diff + item;
(*p)++;
@@ -223,7 +225,7 @@ EXPORT_SYMBOL(__inc_zone_page_state);
void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
- struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+ struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
s8 *p = pcp->vm_stat_diff + item;
(*p)--;
@@ -300,7 +302,7 @@ void refresh_cpu_vm_stats(int cpu)
for_each_populated_zone(zone) {
struct per_cpu_pageset *p;
- p = zone_pcp(zone, cpu);
+ p = per_cpu_ptr(zone->pageset, cpu);
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
if (p->vm_stat_diff[i]) {
@@ -738,7 +740,7 @@ static void zoneinfo_show_print(struct s
for_each_online_cpu(i) {
struct per_cpu_pageset *pageset;
- pageset = zone_pcp(zone, i);
+ pageset = per_cpu_ptr(zone->pageset, i);
seq_printf(m,
"\n cpu: %i"
"\n count: %i"
--
On Thu, Oct 01, 2009 at 05:25:34PM -0400, [email protected] wrote:
> Use the per cpu allocator functionality to avoid per cpu arrays in struct zone.
>
> This drastically reduces the size of struct zone for systems with large
> amounts of processors and allows placement of critical variables of struct
> zone in one cacheline even on very large systems.
>
This seems reasonably accurate. The largest shrink is on !NUMA configured
systems but the NUMA case deletes a lot of pointers.
> Another effect is that the pagesets of one processor are placed near one
> another. If multiple pagesets from different zones fit into one cacheline
> then additional cacheline fetches can be avoided on the hot paths when
> allocating memory from multiple zones.
>
Out of curiosity, how common an occurrence is it that a CPU allocates from
multiple zones? I would have thought it was rare, but I never checked
either.
> Bootstrap becomes simpler if we use the same scheme for UP, SMP, NUMA. #ifdefs
> are reduced and we can drop the zone_pcp macro.
>
> Hotplug handling is also simplified since cpu alloc can bring up and
> shut down cpu areas for a specific cpu as a whole. So there is no need to
> allocate or free individual pagesets.
>
> Cc: Mel Gorman <[email protected]>
> Signed-off-by: Christoph Lameter <[email protected]>
>
> ---
> include/linux/mm.h | 4 -
> include/linux/mmzone.h | 12 ---
> mm/page_alloc.c | 156 ++++++++++++++-----------------------------------
> mm/vmstat.c | 14 ++--
> 4 files changed, 55 insertions(+), 131 deletions(-)
>
> Index: linux-2.6/include/linux/mm.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mm.h 2009-09-29 09:30:37.000000000 -0500
> +++ linux-2.6/include/linux/mm.h 2009-09-29 09:30:39.000000000 -0500
> @@ -1062,11 +1062,7 @@ extern void si_meminfo_node(struct sysin
> extern int after_bootmem;
> extern void setup_pagesets(void);
>
> -#ifdef CONFIG_NUMA
> extern void setup_per_cpu_pageset(void);
> -#else
> -static inline void setup_per_cpu_pageset(void) {}
> -#endif
>
> extern void zone_pcp_update(struct zone *zone);
>
> Index: linux-2.6/include/linux/mmzone.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mmzone.h 2009-09-29 09:30:25.000000000 -0500
> +++ linux-2.6/include/linux/mmzone.h 2009-09-29 09:30:39.000000000 -0500
> @@ -184,13 +184,7 @@ struct per_cpu_pageset {
> s8 stat_threshold;
> s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
> #endif
> -} ____cacheline_aligned_in_smp;
> -
> -#ifdef CONFIG_NUMA
> -#define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)])
> -#else
> -#define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)])
> -#endif
> +};
>
> #endif /* !__GENERATING_BOUNDS.H */
>
> @@ -306,10 +300,8 @@ struct zone {
> */
> unsigned long min_unmapped_pages;
> unsigned long min_slab_pages;
> - struct per_cpu_pageset *pageset[NR_CPUS];
> -#else
> - struct per_cpu_pageset pageset[NR_CPUS];
> #endif
> + struct per_cpu_pageset *pageset;
> /*
> * free areas of different sizes
> */
> Index: linux-2.6/mm/page_alloc.c
> ===================================================================
> --- linux-2.6.orig/mm/page_alloc.c 2009-09-29 09:30:37.000000000 -0500
> +++ linux-2.6/mm/page_alloc.c 2009-09-29 09:30:50.000000000 -0500
> @@ -1011,7 +1011,7 @@ static void drain_pages(unsigned int cpu
> struct per_cpu_pageset *pset;
> struct per_cpu_pages *pcp;
>
> - pset = zone_pcp(zone, cpu);
> + pset = per_cpu_ptr(zone->pageset, cpu);
>
> pcp = &pset->pcp;
> local_irq_save(flags);
> @@ -1098,7 +1098,7 @@ static void free_hot_cold_page(struct pa
> arch_free_page(page, 0);
> kernel_map_pages(page, 1, 0);
>
> - pcp = &zone_pcp(zone, get_cpu())->pcp;
> + pcp = &this_cpu_ptr(zone->pageset)->pcp;
> migratetype = get_pageblock_migratetype(page);
> set_page_private(page, migratetype);
> local_irq_save(flags);
> @@ -1133,7 +1133,6 @@ static void free_hot_cold_page(struct pa
>
> out:
> local_irq_restore(flags);
> - put_cpu();
Previously we called get_cpu() to be preemption safe. We then disable
interrupts and potentially take a spinlock later.

Is the point where we disable interrupts a preemption point? Even if it's not
on normal kernels, is it a preemption point on the RT kernel?

If it is a preemption point, what stops us getting rescheduled on another
CPU after the PCP has been looked up? Sorry if this has been brought up and
resolved already; this is my first proper look at this patchset. The same
query applies to any section where

	get_cpu()
	look up PCP structure
	disable interrupts
	stuff
	enable interrupts
	put_cpu()

is converted to

	this_cpu_ptr() looks up PCP
	disable interrupts
	stuff
	enable interrupts
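
To make the concern concrete, the two orderings look roughly like this
(a sketch only, not the exact free_hot_cold_page() code):

	/* lookup while still preemptible: the task could in principle
	 * migrate to another CPU between the lookup and the irq-off
	 * section, so pcp may belong to a different CPU */
	pcp = &this_cpu_ptr(zone->pageset)->pcp;
	local_irq_save(flags);
	/* ... manipulate pcp lists ... */
	local_irq_restore(flags);

versus

	/* lookup under irqs-off: disabling interrupts also prevents
	 * preemption, so the lookup and the use stay on one CPU */
	local_irq_save(flags);
	pcp = &this_cpu_ptr(zone->pageset)->pcp;
	/* ... manipulate pcp lists ... */
	local_irq_restore(flags);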
> }
>
> void free_hot_page(struct page *page)
> @@ -1183,15 +1182,13 @@ struct page *buffered_rmqueue(struct zon
> unsigned long flags;
> struct page *page;
> int cold = !!(gfp_flags & __GFP_COLD);
> - int cpu;
>
> again:
> - cpu = get_cpu();
> if (likely(order == 0)) {
> struct per_cpu_pages *pcp;
> struct list_head *list;
>
> - pcp = &zone_pcp(zone, cpu)->pcp;
> + pcp = &this_cpu_ptr(zone->pageset)->pcp;
> list = &pcp->lists[migratetype];
> local_irq_save(flags);
> if (list_empty(list)) {
> @@ -1234,7 +1231,6 @@ again:
> __count_zone_vm_events(PGALLOC, zone, 1 << order);
> zone_statistics(preferred_zone, zone);
> local_irq_restore(flags);
> - put_cpu();
>
> VM_BUG_ON(bad_range(zone, page));
> if (prep_new_page(page, order, gfp_flags))
> @@ -1243,7 +1239,6 @@ again:
>
> failed:
> local_irq_restore(flags);
> - put_cpu();
> return NULL;
> }
>
> @@ -2172,7 +2167,7 @@ void show_free_areas(void)
> for_each_online_cpu(cpu) {
> struct per_cpu_pageset *pageset;
>
> - pageset = zone_pcp(zone, cpu);
> + pageset = per_cpu_ptr(zone->pageset, cpu);
>
> printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
> cpu, pageset->pcp.high,
> @@ -3087,7 +3082,6 @@ static void setup_pagelist_highmark(stru
> }
>
>
> -#ifdef CONFIG_NUMA
> /*
> * Boot pageset table. One per cpu which is going to be used for all
> * zones and all nodes. The parameters will be set in such a way
> @@ -3095,112 +3089,67 @@ static void setup_pagelist_highmark(stru
> * the buddy list. This is safe since pageset manipulation is done
> * with interrupts disabled.
> *
> - * Some NUMA counter updates may also be caught by the boot pagesets.
> - *
> - * The boot_pagesets must be kept even after bootup is complete for
> - * unused processors and/or zones. They do play a role for bootstrapping
> - * hotplugged processors.
> + * Some counter updates may also be caught by the boot pagesets.
> *
> * zoneinfo_show() and maybe other functions do
> * not check if the processor is online before following the pageset pointer.
> * Other parts of the kernel may not check if the zone is available.
> */
> -static struct per_cpu_pageset boot_pageset[NR_CPUS];
> -
> -/*
> - * Dynamically allocate memory for the
> - * per cpu pageset array in struct zone.
> - */
> -static int __cpuinit process_zones(int cpu)
> -{
> - struct zone *zone, *dzone;
> - int node = cpu_to_node(cpu);
> -
> - node_set_state(node, N_CPU); /* this node has a cpu */
> -
> - for_each_populated_zone(zone) {
> - zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
> - GFP_KERNEL, node);
> - if (!zone_pcp(zone, cpu))
> - goto bad;
> -
> - setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
> -
> - if (percpu_pagelist_fraction)
> - setup_pagelist_highmark(zone_pcp(zone, cpu),
> - (zone->present_pages / percpu_pagelist_fraction));
> - }
> -
> - return 0;
> -bad:
> - for_each_zone(dzone) {
> - if (!populated_zone(dzone))
> - continue;
> - if (dzone == zone)
> - break;
> - kfree(zone_pcp(dzone, cpu));
> - zone_pcp(dzone, cpu) = &boot_pageset[cpu];
> - }
> - return -ENOMEM;
> -}
> -
> -static inline void free_zone_pagesets(int cpu)
> -{
> - struct zone *zone;
> -
> - for_each_zone(zone) {
> - struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
> -
> - /* Free per_cpu_pageset if it is slab allocated */
> - if (pset != &boot_pageset[cpu])
> - kfree(pset);
> - zone_pcp(zone, cpu) = &boot_pageset[cpu];
> - }
> -}
> +static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
>
> static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
> unsigned long action,
> void *hcpu)
> {
> int cpu = (long)hcpu;
> - int ret = NOTIFY_OK;
>
> switch (action) {
> case CPU_UP_PREPARE:
> case CPU_UP_PREPARE_FROZEN:
> - if (process_zones(cpu))
> - ret = NOTIFY_BAD;
> - break;
> - case CPU_UP_CANCELED:
> - case CPU_UP_CANCELED_FROZEN:
> - case CPU_DEAD:
> - case CPU_DEAD_FROZEN:
> - free_zone_pagesets(cpu);
> + node_set_state(cpu_to_node(cpu), N_CPU);
> break;
> default:
> break;
> }
> - return ret;
> + return NOTIFY_OK;
> }
>
> static struct notifier_block __cpuinitdata pageset_notifier =
> { &pageset_cpuup_callback, NULL, 0 };
>
> +/*
> + * Allocate per cpu pagesets and initialize them.
> + * Before this call only boot pagesets were available.
> + * Boot pagesets will no longer be used after this call is complete.
If they are no longer used, do we get the memory back?
> + */
> void __init setup_per_cpu_pageset(void)
> {
> - int err;
> + struct zone *zone;
> + int cpu;
> +
> + for_each_populated_zone(zone) {
> + zone->pageset = alloc_percpu(struct per_cpu_pageset);
>
> - /* Initialize per_cpu_pageset for cpu 0.
> - * A cpuup callback will do this for every cpu
> - * as it comes online
> + for_each_possible_cpu(cpu) {
> + struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
> +
> + setup_pageset(pcp, zone_batchsize(zone));
> +
> + if (percpu_pagelist_fraction)
> + setup_pagelist_highmark(pcp,
> + (zone->present_pages /
> + percpu_pagelist_fraction));
> + }
> + }
This would have been easier to review if you left process_zones() where it
was and converted it to the new API. I'm assuming this is just shuffling
code around.
> +
> + /*
> + * The boot cpu is always the first active.
> + * The boot node has a processor
> */
> - err = process_zones(smp_processor_id());
> - BUG_ON(err);
> + node_set_state(cpu_to_node(smp_processor_id()), N_CPU);
> register_cpu_notifier(&pageset_notifier);
> }
>
> -#endif
> -
> static noinline __init_refok
> int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
> {
> @@ -3254,7 +3203,7 @@ static int __zone_pcp_update(void *data)
> struct per_cpu_pageset *pset;
> struct per_cpu_pages *pcp;
>
> - pset = zone_pcp(zone, cpu);
> + pset = per_cpu_ptr(zone->pageset, cpu);
> pcp = &pset->pcp;
>
> local_irq_save(flags);
> @@ -3272,15 +3221,7 @@ void zone_pcp_update(struct zone *zone)
>
> /*
> * Early setup of pagesets.
> - *
> - * In the NUMA case the pageset setup simply results in all zones pcp
> - * pointer being directed at a per cpu pageset with zero batchsize.
> - *
> - * This means that every free and every allocation occurs directly from
> - * the buddy allocator tables.
> - *
> - * The pageset never queues pages during early boot and is therefore usable
> - * for every type of zone.
> + * At this point various allocators are not operational yet.
> */
> __meminit void setup_pagesets(void)
> {
> @@ -3288,23 +3229,15 @@ __meminit void setup_pagesets(void)
> struct zone *zone;
>
> for_each_zone(zone) {
> -#ifdef CONFIG_NUMA
> - unsigned long batch = 0;
> -
> - for (cpu = 0; cpu < NR_CPUS; cpu++) {
> - /* Early boot. Slab allocator not functional yet */
> - zone_pcp(zone, cpu) = &boot_pageset[cpu];
> - }
> -#else
> - unsigned long batch = zone_batchsize(zone);
> -#endif
> + zone->pageset = &per_cpu_var(boot_pageset);
>
> + /*
> + * Special pagesets with zero elements so that frees
> + * and allocations are not buffered at all.
> + */
> for_each_possible_cpu(cpu)
> - setup_pageset(zone_pcp(zone, cpu), batch);
> + setup_pageset(per_cpu_ptr(zone->pageset, cpu), 0);
>
> - if (zone->present_pages)
> - printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
> - zone->name, zone->present_pages, batch);
> }
> }
>
> @@ -4818,10 +4751,11 @@ int percpu_pagelist_fraction_sysctl_hand
> if (!write || (ret == -EINVAL))
> return ret;
> for_each_populated_zone(zone) {
> - for_each_online_cpu(cpu) {
> + for_each_possible_cpu(cpu) {
> unsigned long high;
> high = zone->present_pages / percpu_pagelist_fraction;
> - setup_pagelist_highmark(zone_pcp(zone, cpu), high);
> + setup_pagelist_highmark(
> + per_cpu_ptr(zone->pageset, cpu), high);
> }
> }
> return 0;
> Index: linux-2.6/mm/vmstat.c
> ===================================================================
> --- linux-2.6.orig/mm/vmstat.c 2009-09-29 09:30:25.000000000 -0500
> +++ linux-2.6/mm/vmstat.c 2009-09-29 09:30:43.000000000 -0500
> @@ -139,7 +139,8 @@ static void refresh_zone_stat_thresholds
> threshold = calculate_threshold(zone);
>
> for_each_online_cpu(cpu)
> - zone_pcp(zone, cpu)->stat_threshold = threshold;
> + per_cpu_ptr(zone->pageset, cpu)->stat_threshold
> + = threshold;
> }
> }
>
> @@ -149,7 +150,8 @@ static void refresh_zone_stat_thresholds
> void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
> int delta)
> {
> - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
> + struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
> +
> s8 *p = pcp->vm_stat_diff + item;
> long x;
>
> @@ -202,7 +204,7 @@ EXPORT_SYMBOL(mod_zone_page_state);
> */
> void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
> {
> - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
> + struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
> s8 *p = pcp->vm_stat_diff + item;
>
> (*p)++;
> @@ -223,7 +225,7 @@ EXPORT_SYMBOL(__inc_zone_page_state);
>
> void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
> {
> - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
> + struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
> s8 *p = pcp->vm_stat_diff + item;
>
> (*p)--;
> @@ -300,7 +302,7 @@ void refresh_cpu_vm_stats(int cpu)
> for_each_populated_zone(zone) {
> struct per_cpu_pageset *p;
>
> - p = zone_pcp(zone, cpu);
> + p = per_cpu_ptr(zone->pageset, cpu);
>
> for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
> if (p->vm_stat_diff[i]) {
> @@ -738,7 +740,7 @@ static void zoneinfo_show_print(struct s
> for_each_online_cpu(i) {
> struct per_cpu_pageset *pageset;
>
> - pageset = zone_pcp(zone, i);
> + pageset = per_cpu_ptr(zone->pageset, i);
> seq_printf(m,
> "\n cpu: %i"
> "\n count: %i"
>
> --
>
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
On Fri, 2 Oct 2009, Mel Gorman wrote:
> On Thu, Oct 01, 2009 at 05:25:34PM -0400, [email protected] wrote:
> > Use the per cpu allocator functionality to avoid per cpu arrays in struct zone.
> >
> > This drastically reduces the size of struct zone for systems with large
> > amounts of processors and allows placement of critical variables of struct
> > zone in one cacheline even on very large systems.
> >
>
> This seems reasonably accurate. The largest shrink is on !NUMA configured
> systems but the NUMA case deletes a lot of pointers.
True, the !NUMA case will then avoid allocating pagesets for unused
zones. But the NUMA case will have the most benefit since the large arrays
in struct zone are gone. Removing the pagesets from struct zone also
increases the cacheability of struct zone information. This is
particularly useful since the size of the pagesets grew with the addition
of the various types of allocation queues.
> > Another effect is that the pagesets of one processor are placed near one
> > another. If multiple pagesets from different zones fit into one cacheline
> > then additional cacheline fetches can be avoided on the hot paths when
> > allocating memory from multiple zones.
> >
>
> Out of curiousity, how common an occurance is it that a CPU allocate from
> multiple zones? I would have thought it was rare but I never checked
> either.
Zone allocations are determined by their use: GFP_KERNEL allocations come from
ZONE_NORMAL whereas typical application pages may come from ZONE_HIGHMEM.
The mix depends on what the kernel and the application are doing.
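
For example (illustrative only; the exact zone chosen also depends on the
machine's zone layout):

	/* a kernel allocation is normally served from ZONE_NORMAL */
	struct page *kpage = alloc_pages(GFP_KERNEL, 0);

	/* a typical user/page cache allocation may be served from
	 * ZONE_HIGHMEM on 32-bit configurations */
	struct page *upage = alloc_pages(GFP_HIGHUSER_MOVABLE, 0);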
> > pcp = &pset->pcp;
> > + pcp = &this_cpu_ptr(zone->pageset)->pcp;
> > migratetype = get_pageblock_migratetype(page);
> > set_page_private(page, migratetype);
> > local_irq_save(flags);
> > @@ -1133,7 +1133,6 @@ static void free_hot_cold_page(struct pa
> >
> > out:
> > local_irq_restore(flags);
> > - put_cpu();
>
> Previously we get_cpu() to be preemption safe. We then disable
> interrupts and potentially take a spinlock later.
Right. We need to move the local_irq_save() up two lines. Why disable
preemption and then, two instructions later, disable interrupts? Isn't this
bloating the code?
> this_cpu_ptr() looks up PCP
> disable interrupts
> enable interrupts
Move disable interrupts before the this_cpu_ptr?
> > +/*
> > + * Allocate per cpu pagesets and initialize them.
> > + * Before this call only boot pagesets were available.
> > + * Boot pagesets will no longer be used after this call is complete.
>
> If they are no longer used, do we get the memory back?
No we need to keep them for onlining new processors.
> > - * as it comes online
> > + for_each_possible_cpu(cpu) {
> > + struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
> > +
> > + setup_pageset(pcp, zone_batchsize(zone));
> > +
> > + if (percpu_pagelist_fraction)
> > + setup_pagelist_highmark(pcp,
> > + (zone->present_pages /
> > + percpu_pagelist_fraction));
> > + }
> > + }
>
> This would have been easier to review if you left process_zones() where it
> was and converted it to the new API. I'm assuming this is just shuffling
> code around.
Yes I think this was the result of reducing #ifdefs.
On Fri, Oct 02, 2009 at 01:39:28PM -0400, Christoph Lameter wrote:
> On Fri, 2 Oct 2009, Mel Gorman wrote:
>
> > On Thu, Oct 01, 2009 at 05:25:34PM -0400, [email protected] wrote:
> > > Use the per cpu allocator functionality to avoid per cpu arrays in struct zone.
> > >
> > > This drastically reduces the size of struct zone for systems with large
> > > amounts of processors and allows placement of critical variables of struct
> > > zone in one cacheline even on very large systems.
> > >
> >
> > This seems reasonably accurate. The largest shrink is on !NUMA configured
> > systems but the NUMA case deletes a lot of pointers.
>
> True, the !NUMA case will then avoid allocating pagesets for unused
> zones. But the NUMA case will have the most benefit since the large arrays
> in struct zone are gone.
Indeed. Out of curiosity, has this patchset been performance tested? I
would expect there to be a small but measurable improvement. If there is
a regression, it might point to poor placement of read/write fields in
the zone.
> Removing the pagesets from struct zone also
> increases the cacheability of struct zone information. This is
> particularly useful since the size of the pagesets grew with the addition
> of the various types of allocation queues.
>
> > > Another effect is that the pagesets of one processor are placed near one
> > > another. If multiple pagesets from different zones fit into one cacheline
> > > then additional cacheline fetches can be avoided on the hot paths when
> > > allocating memory from multiple zones.
> > >
> >
> > Out of curiousity, how common an occurance is it that a CPU allocate from
> > multiple zones? I would have thought it was rare but I never checked
> > either.
>
> zone allocations are determined by their use. GFP_KERNEL allocs come from
> ZONE_NORMAL whereas typical application pages may come from ZONE_HIGHMEM.
> The mix depends on what the kernel and the application are doing.
>
I just wouldn't have expected a significant enough mix to make a measurable
performance difference. It's no biggie.
> > > pcp = &pset->pcp;
> > > + pcp = &this_cpu_ptr(zone->pageset)->pcp;
> > > migratetype = get_pageblock_migratetype(page);
> > > set_page_private(page, migratetype);
> > > local_irq_save(flags);
> > > @@ -1133,7 +1133,6 @@ static void free_hot_cold_page(struct pa
> > >
> > > out:
> > > local_irq_restore(flags);
> > > - put_cpu();
> >
> > Previously we get_cpu() to be preemption safe. We then disable
> > interrupts and potentially take a spinlock later.
>
> Right. WE need to move the local_irq_save() up two lines.
Just so I'm 100% clear, IRQ disabling is considered a preemption point?
> Why disable
> preempt and two instructions later disable interrupts? Isnt this bloating
> the code?
>
By and large, IRQs are disabled at the last possible moment, with the minimum
amount of code in between. While the current location does not make perfect
sense, it was probably many small changes that placed it like this, each
person avoiding keeping IRQs disabled for too long without considering what
the cost of get_cpu() was.
Similar care needs to be taken with the other removals of get_cpu() in
this patch to ensure it's still preemption-safe.
> > this_cpu_ptr() looks up PCP
> > disable interrupts
> > enable interrupts
>
> Move disable interrupts before the this_cpu_ptr?
>
In this case, why not move this_cpu_ptr() down until its first use just
before the if (cold) check? It'll still be within the IRQ disabling but
without significantly increasing the amount of time the IRQ is disabled.
> > > +/*
> > > + * Allocate per cpu pagesets and initialize them.
> > > + * Before this call only boot pagesets were available.
> > > + * Boot pagesets will no longer be used after this call is complete.
> >
> > If they are no longer used, do we get the memory back?
>
> No we need to keep them for onlining new processors.
>
That comment would appear to disagree.
> > > - * as it comes online
> > > + for_each_possible_cpu(cpu) {
> > > + struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
> > > +
> > > + setup_pageset(pcp, zone_batchsize(zone));
> > > +
> > > + if (percpu_pagelist_fraction)
> > > + setup_pagelist_highmark(pcp,
> > > + (zone->present_pages /
> > > + percpu_pagelist_fraction));
> > > + }
> > > + }
> >
> > This would have been easier to review if you left process_zones() where it
> > was and converted it to the new API. I'm assuming this is just shuffling
> > code around.
>
> Yes I think this was the result of reducing #ifdefs.
>
Ok.
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
On Mon, 5 Oct 2009, Mel Gorman wrote:
> > Right. WE need to move the local_irq_save() up two lines.
>
> Just so I'm 100% clear, IRQ disabling is considered a preemption point?
Yes.
> > Move disable interrupts before the this_cpu_ptr?
> >
>
> In this case, why not move this_cpu_ptr() down until its first use just
> before the if (cold) check? It'll still be within the IRQ disabling but
> without significantly increasing the amount of time the IRQ is disabled.
Good idea. I'll put that into the next release.
> > > > + * Before this call only boot pagesets were available.
> > > > + * Boot pagesets will no longer be used after this call is complete.
> > >
> > > If they are no longer used, do we get the memory back?
> >
> > No we need to keep them for onlining new processors.
> >
>
> That comment would appear to disagree.
The comment is accurate for a given processor. Once the pagesets are allocated
for a processor, the boot pageset is no longer used by that processor.
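
In other words, the lifecycle is roughly (sketch, pulling together the
setup_pagesets() and setup_per_cpu_pageset() hunks from the patch):

	/* early boot, setup_pagesets(): every zone points at the
	 * static boot pageset */
	zone->pageset = &per_cpu_var(boot_pageset);

	/* later, setup_per_cpu_pageset(): real per cpu pagesets are
	 * allocated and this processor stops using the boot pageset */
	zone->pageset = alloc_percpu(struct per_cpu_pageset);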
Changes to this patch so far:
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c 2009-10-05 09:49:07.000000000 -0500
+++ linux-2.6/mm/page_alloc.c 2009-10-05 09:48:43.000000000 -0500
@@ -1098,8 +1098,6 @@ static void free_hot_cold_page(struct pa
arch_free_page(page, 0);
kernel_map_pages(page, 1, 0);
- local_irq_save(flags);
- pcp = &this_cpu_ptr(zone->pageset)->pcp;
migratetype = get_pageblock_migratetype(page);
set_page_private(page, migratetype);
if (unlikely(wasMlocked))
@@ -1121,6 +1119,8 @@ static void free_hot_cold_page(struct pa
migratetype = MIGRATE_MOVABLE;
}
+ local_irq_save(flags);
+ pcp = &this_cpu_ptr(zone->pageset)->pcp;
if (cold)
list_add_tail(&page->lru, &pcp->lists[migratetype]);
else
@@ -3120,7 +3120,8 @@ static struct notifier_block __cpuinitda
/*
* Allocate per cpu pagesets and initialize them.
* Before this call only boot pagesets were available.
- * Boot pagesets will no longer be used after this call is complete.
+ * Boot pagesets will no longer be used by this processorr
+ * after setup_per_cpu_pageset().
*/
void __init setup_per_cpu_pageset(void)
{
@@ -3232,11 +3233,11 @@ __meminit void setup_pagesets(void)
zone->pageset = &per_cpu_var(boot_pageset);
/*
- * Special pagesets with zero elements so that frees
+ * Special pagesets with one element so that frees
* and allocations are not buffered at all.
*/
for_each_possible_cpu(cpu)
- setup_pageset(per_cpu_ptr(zone->pageset, cpu), 0);
+ setup_pageset(per_cpu_ptr(zone->pageset, cpu), 1);
}
}
On Mon, Oct 05, 2009 at 10:55:49AM -0400, Christoph Lameter wrote:
>
> Changes to this patch so far:
>
> Index: linux-2.6/mm/page_alloc.c
> ===================================================================
> --- linux-2.6.orig/mm/page_alloc.c 2009-10-05 09:49:07.000000000 -0500
> +++ linux-2.6/mm/page_alloc.c 2009-10-05 09:48:43.000000000 -0500
> @@ -1098,8 +1098,6 @@ static void free_hot_cold_page(struct pa
> arch_free_page(page, 0);
> kernel_map_pages(page, 1, 0);
>
> - local_irq_save(flags);
> - pcp = &this_cpu_ptr(zone->pageset)->pcp;
> migratetype = get_pageblock_migratetype(page);
> set_page_private(page, migratetype);
> if (unlikely(wasMlocked))
Why did you move local_irq_save() ? It should have stayed where it was
because VM counters are updated under the lock. Only the this_cpu_ptr
should be moving.
> @@ -1121,6 +1119,8 @@ static void free_hot_cold_page(struct pa
> migratetype = MIGRATE_MOVABLE;
> }
>
> + local_irq_save(flags);
> + pcp = &this_cpu_ptr(zone->pageset)->pcp;
> if (cold)
> list_add_tail(&page->lru, &pcp->lists[migratetype]);
> else
> @@ -3120,7 +3120,8 @@ static struct notifier_block __cpuinitda
> /*
> * Allocate per cpu pagesets and initialize them.
> * Before this call only boot pagesets were available.
> - * Boot pagesets will no longer be used after this call is complete.
> + * Boot pagesets will no longer be used by this processorr
> + * after setup_per_cpu_pageset().
> */
> void __init setup_per_cpu_pageset(void)
> {
> @@ -3232,11 +3233,11 @@ __meminit void setup_pagesets(void)
> zone->pageset = &per_cpu_var(boot_pageset);
>
> /*
> - * Special pagesets with zero elements so that frees
> + * Special pagesets with one element so that frees
> * and allocations are not buffered at all.
> */
> for_each_possible_cpu(cpu)
> - setup_pageset(per_cpu_ptr(zone->pageset, cpu), 0);
> + setup_pageset(per_cpu_ptr(zone->pageset, cpu), 1);
>
> }
> }
>
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
On Tue, 6 Oct 2009, Mel Gorman wrote:
> > - local_irq_save(flags);
> > - pcp = &this_cpu_ptr(zone->pageset)->pcp;
> > migratetype = get_pageblock_migratetype(page);
> > set_page_private(page, migratetype);
> > if (unlikely(wasMlocked))
>
> Why did you move local_irq_save() ? It should have stayed where it was
> because VM counters are updated under the lock. Only the this_cpu_ptr
> should be moving.
The __count_vm_event()? VM counters may be incremented in a racy way if
convenient. x86 usually produces non-racy code (and with this patchset
will always produce non-racy code) but e.g. IA64 has always had racy
updates. I'd rather shorten the irq-off section.
See the comment in vmstat.h.
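
For illustration, the kind of tolerated racy update meant here is roughly
the following (a sketch, not necessarily the exact vmstat.h code):

	/* per cpu event counter bump without irq/preempt protection;
	 * two contexts racing on the same counter may lose an update,
	 * which is acceptable for event statistics */
	__get_cpu_var(vm_event_states).event[item]++;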
On Tue, Oct 06, 2009 at 12:34:56PM -0400, Christoph Lameter wrote:
> On Tue, 6 Oct 2009, Mel Gorman wrote:
>
> > > - local_irq_save(flags);
> > > - pcp = &this_cpu_ptr(zone->pageset)->pcp;
> > > migratetype = get_pageblock_migratetype(page);
> > > set_page_private(page, migratetype);
> > > if (unlikely(wasMlocked))
> >
> > Why did you move local_irq_save() ? It should have stayed where it was
> > because VM counters are updated under the lock. Only the this_cpu_ptr
> > should be moving.
>
> The __count_vm_event()?
and the __dec_zone_page_state within free_page_mlock(). However, it's already
atomic so it shouldn't be a problem.
> VM counters may be incremented in a racy way if
> convenient. x86 usually produces non racy code (and with this patchset
> will always produce non racy code) but f.e. IA64 has always had racy
> updates. I'd rather shorted the irq off section.
>
The count_vm_event() is now racier than it was and no longer symmetric with
the PGALLOC counting, which still happens with IRQs disabled. The asymmetry
could look very strange if there are a lot more frees than allocs, for example,
because the raciness between the counters is different.
While I have no problem as such with the local_irq_save() moving (although
I would like PGFREE and PGALLOC to both be accounted with IRQs enabled or
both with IRQs disabled), I think it deserves to be in a patch all to itself
and not hidden in an apparently unrelated change.
> See the comment in vmstat.h.
>
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
On Tue, 6 Oct 2009, Mel Gorman wrote:
> While I have no problem as such with the local_irq_save() moving (although
> I would like PGFREE and PGALLOC to be accounted both with or without IRQs
> enabled), I think it deserves to be in a patch all to itself and not hidden
> in an apparently unrelated change.
Ok I have moved the local_irq_save back.
Full patch (will push it back into my git tree if you approve)
From: Christoph Lameter <[email protected]>
Subject: this_cpu_ops: page allocator conversion
Use the per cpu allocator functionality to avoid per cpu arrays in struct zone.
This drastically reduces the size of struct zone for systems with large
numbers of processors and allows placement of critical variables of struct
zone in one cacheline even on very large systems.
Another effect is that the pagesets of one processor are placed near one
another. If multiple pagesets from different zones fit into one cacheline
then additional cacheline fetches can be avoided on the hot paths when
allocating memory from multiple zones.
Bootstrap becomes simpler if we use the same scheme for UP, SMP, NUMA. #ifdefs
are reduced and we can drop the zone_pcp macro.
Hotplug handling is also simplified since cpu alloc can bring up and
shut down cpu areas for a specific cpu as a whole. So there is no need to
allocate or free individual pagesets.
Cc: Mel Gorman <[email protected]>
Signed-off-by: Christoph Lameter <[email protected]>
---
include/linux/mm.h | 4 -
include/linux/mmzone.h | 12 ---
mm/page_alloc.c | 157 ++++++++++++++-----------------------------------
mm/vmstat.c | 14 ++--
4 files changed, 56 insertions(+), 131 deletions(-)
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h 2009-10-06 12:41:19.000000000 -0500
+++ linux-2.6/include/linux/mm.h 2009-10-06 12:41:19.000000000 -0500
@@ -1062,11 +1062,7 @@ extern void si_meminfo_node(struct sysin
extern int after_bootmem;
extern void setup_pagesets(void);
-#ifdef CONFIG_NUMA
extern void setup_per_cpu_pageset(void);
-#else
-static inline void setup_per_cpu_pageset(void) {}
-#endif
extern void zone_pcp_update(struct zone *zone);
Index: linux-2.6/include/linux/mmzone.h
===================================================================
--- linux-2.6.orig/include/linux/mmzone.h 2009-10-05 15:33:08.000000000 -0500
+++ linux-2.6/include/linux/mmzone.h 2009-10-06 12:41:19.000000000 -0500
@@ -184,13 +184,7 @@ struct per_cpu_pageset {
s8 stat_threshold;
s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
#endif
-} ____cacheline_aligned_in_smp;
-
-#ifdef CONFIG_NUMA
-#define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)])
-#else
-#define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)])
-#endif
+};
#endif /* !__GENERATING_BOUNDS.H */
@@ -306,10 +300,8 @@ struct zone {
*/
unsigned long min_unmapped_pages;
unsigned long min_slab_pages;
- struct per_cpu_pageset *pageset[NR_CPUS];
-#else
- struct per_cpu_pageset pageset[NR_CPUS];
#endif
+ struct per_cpu_pageset *pageset;
/*
* free areas of different sizes
*/
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c 2009-10-06 12:41:19.000000000 -0500
+++ linux-2.6/mm/page_alloc.c 2009-10-06 12:43:27.000000000 -0500
@@ -1011,7 +1011,7 @@ static void drain_pages(unsigned int cpu
struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
- pset = zone_pcp(zone, cpu);
+ pset = per_cpu_ptr(zone->pageset, cpu);
pcp = &pset->pcp;
local_irq_save(flags);
@@ -1098,7 +1098,6 @@ static void free_hot_cold_page(struct pa
arch_free_page(page, 0);
kernel_map_pages(page, 1, 0);
- pcp = &zone_pcp(zone, get_cpu())->pcp;
migratetype = get_pageblock_migratetype(page);
set_page_private(page, migratetype);
local_irq_save(flags);
@@ -1121,6 +1120,7 @@ static void free_hot_cold_page(struct pa
migratetype = MIGRATE_MOVABLE;
}
+ pcp = &this_cpu_ptr(zone->pageset)->pcp;
if (cold)
list_add_tail(&page->lru, &pcp->lists[migratetype]);
else
@@ -1133,7 +1133,6 @@ static void free_hot_cold_page(struct pa
out:
local_irq_restore(flags);
- put_cpu();
}
void free_hot_page(struct page *page)
@@ -1183,15 +1182,13 @@ struct page *buffered_rmqueue(struct zon
unsigned long flags;
struct page *page;
int cold = !!(gfp_flags & __GFP_COLD);
- int cpu;
again:
- cpu = get_cpu();
if (likely(order == 0)) {
struct per_cpu_pages *pcp;
struct list_head *list;
- pcp = &zone_pcp(zone, cpu)->pcp;
+ pcp = &this_cpu_ptr(zone->pageset)->pcp;
list = &pcp->lists[migratetype];
local_irq_save(flags);
if (list_empty(list)) {
@@ -1234,7 +1231,6 @@ again:
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone);
local_irq_restore(flags);
- put_cpu();
VM_BUG_ON(bad_range(zone, page));
if (prep_new_page(page, order, gfp_flags))
@@ -1243,7 +1239,6 @@ again:
failed:
local_irq_restore(flags);
- put_cpu();
return NULL;
}
@@ -2172,7 +2167,7 @@ void show_free_areas(void)
for_each_online_cpu(cpu) {
struct per_cpu_pageset *pageset;
- pageset = zone_pcp(zone, cpu);
+ pageset = per_cpu_ptr(zone->pageset, cpu);
printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
cpu, pageset->pcp.high,
@@ -3087,7 +3082,6 @@ static void setup_pagelist_highmark(stru
}
-#ifdef CONFIG_NUMA
/*
* Boot pageset table. One per cpu which is going to be used for all
* zones and all nodes. The parameters will be set in such a way
@@ -3095,112 +3089,68 @@ static void setup_pagelist_highmark(stru
* the buddy list. This is safe since pageset manipulation is done
* with interrupts disabled.
*
- * Some NUMA counter updates may also be caught by the boot pagesets.
- *
- * The boot_pagesets must be kept even after bootup is complete for
- * unused processors and/or zones. They do play a role for bootstrapping
- * hotplugged processors.
+ * Some counter updates may also be caught by the boot pagesets.
*
* zoneinfo_show() and maybe other functions do
* not check if the processor is online before following the pageset pointer.
* Other parts of the kernel may not check if the zone is available.
*/
-static struct per_cpu_pageset boot_pageset[NR_CPUS];
-
-/*
- * Dynamically allocate memory for the
- * per cpu pageset array in struct zone.
- */
-static int __cpuinit process_zones(int cpu)
-{
- struct zone *zone, *dzone;
- int node = cpu_to_node(cpu);
-
- node_set_state(node, N_CPU); /* this node has a cpu */
-
- for_each_populated_zone(zone) {
- zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
- GFP_KERNEL, node);
- if (!zone_pcp(zone, cpu))
- goto bad;
-
- setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
-
- if (percpu_pagelist_fraction)
- setup_pagelist_highmark(zone_pcp(zone, cpu),
- (zone->present_pages / percpu_pagelist_fraction));
- }
-
- return 0;
-bad:
- for_each_zone(dzone) {
- if (!populated_zone(dzone))
- continue;
- if (dzone == zone)
- break;
- kfree(zone_pcp(dzone, cpu));
- zone_pcp(dzone, cpu) = &boot_pageset[cpu];
- }
- return -ENOMEM;
-}
-
-static inline void free_zone_pagesets(int cpu)
-{
- struct zone *zone;
-
- for_each_zone(zone) {
- struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
-
- /* Free per_cpu_pageset if it is slab allocated */
- if (pset != &boot_pageset[cpu])
- kfree(pset);
- zone_pcp(zone, cpu) = &boot_pageset[cpu];
- }
-}
+static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
int cpu = (long)hcpu;
- int ret = NOTIFY_OK;
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- if (process_zones(cpu))
- ret = NOTIFY_BAD;
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- free_zone_pagesets(cpu);
+ node_set_state(cpu_to_node(cpu), N_CPU);
break;
default:
break;
}
- return ret;
+ return NOTIFY_OK;
}
static struct notifier_block __cpuinitdata pageset_notifier =
{ &pageset_cpuup_callback, NULL, 0 };
+/*
+ * Allocate per cpu pagesets and initialize them.
+ * Before this call only boot pagesets were available.
+ * Boot pagesets will no longer be used by this processorr
+ * after setup_per_cpu_pageset().
+ */
void __init setup_per_cpu_pageset(void)
{
- int err;
+ struct zone *zone;
+ int cpu;
+
+ for_each_populated_zone(zone) {
+ zone->pageset = alloc_percpu(struct per_cpu_pageset);
- /* Initialize per_cpu_pageset for cpu 0.
- * A cpuup callback will do this for every cpu
- * as it comes online
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
+
+ setup_pageset(pcp, zone_batchsize(zone));
+
+ if (percpu_pagelist_fraction)
+ setup_pagelist_highmark(pcp,
+ (zone->present_pages /
+ percpu_pagelist_fraction));
+ }
+ }
+
+ /*
+ * The boot cpu is always the first active.
+ * The boot node has a processor
*/
- err = process_zones(smp_processor_id());
- BUG_ON(err);
+ node_set_state(cpu_to_node(smp_processor_id()), N_CPU);
register_cpu_notifier(&pageset_notifier);
}
-#endif
-
static noinline __init_refok
int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
{
@@ -3254,7 +3204,7 @@ static int __zone_pcp_update(void *data)
struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
- pset = zone_pcp(zone, cpu);
+ pset = per_cpu_ptr(zone->pageset, cpu);
pcp = &pset->pcp;
local_irq_save(flags);
@@ -3272,15 +3222,7 @@ void zone_pcp_update(struct zone *zone)
/*
* Early setup of pagesets.
- *
- * In the NUMA case the pageset setup simply results in all zones pcp
- * pointer being directed at a per cpu pageset with zero batchsize.
- *
- * This means that every free and every allocation occurs directly from
- * the buddy allocator tables.
- *
- * The pageset never queues pages during early boot and is therefore usable
- * for every type of zone.
+ * At this point various allocators are not operational yet.
*/
__meminit void setup_pagesets(void)
{
@@ -3288,23 +3230,15 @@ __meminit void setup_pagesets(void)
struct zone *zone;
for_each_zone(zone) {
-#ifdef CONFIG_NUMA
- unsigned long batch = 0;
-
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
- /* Early boot. Slab allocator not functional yet */
- zone_pcp(zone, cpu) = &boot_pageset[cpu];
- }
-#else
- unsigned long batch = zone_batchsize(zone);
-#endif
+ zone->pageset = &per_cpu_var(boot_pageset);
+ /*
+ * Special pagesets with one element so that frees
+ * and allocations are not buffered at all.
+ */
for_each_possible_cpu(cpu)
- setup_pageset(zone_pcp(zone, cpu), batch);
+ setup_pageset(per_cpu_ptr(zone->pageset, cpu), 1);
- if (zone->present_pages)
- printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
- zone->name, zone->present_pages, batch);
}
}
@@ -4818,10 +4752,11 @@ int percpu_pagelist_fraction_sysctl_hand
if (!write || (ret == -EINVAL))
return ret;
for_each_populated_zone(zone) {
- for_each_online_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
unsigned long high;
high = zone->present_pages / percpu_pagelist_fraction;
- setup_pagelist_highmark(zone_pcp(zone, cpu), high);
+ setup_pagelist_highmark(
+ per_cpu_ptr(zone->pageset, cpu), high);
}
}
return 0;
Index: linux-2.6/mm/vmstat.c
===================================================================
--- linux-2.6.orig/mm/vmstat.c 2009-10-05 15:33:08.000000000 -0500
+++ linux-2.6/mm/vmstat.c 2009-10-06 12:43:22.000000000 -0500
@@ -139,7 +139,8 @@ static void refresh_zone_stat_thresholds
threshold = calculate_threshold(zone);
for_each_online_cpu(cpu)
- zone_pcp(zone, cpu)->stat_threshold = threshold;
+ per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ = threshold;
}
}
@@ -149,7 +150,8 @@ static void refresh_zone_stat_thresholds
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
int delta)
{
- struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+ struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
+
s8 *p = pcp->vm_stat_diff + item;
long x;
@@ -202,7 +204,7 @@ EXPORT_SYMBOL(mod_zone_page_state);
*/
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
- struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+ struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
s8 *p = pcp->vm_stat_diff + item;
(*p)++;
@@ -223,7 +225,7 @@ EXPORT_SYMBOL(__inc_zone_page_state);
void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
- struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+ struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
s8 *p = pcp->vm_stat_diff + item;
(*p)--;
@@ -300,7 +302,7 @@ void refresh_cpu_vm_stats(int cpu)
for_each_populated_zone(zone) {
struct per_cpu_pageset *p;
- p = zone_pcp(zone, cpu);
+ p = per_cpu_ptr(zone->pageset, cpu);
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
if (p->vm_stat_diff[i]) {
@@ -738,7 +740,7 @@ static void zoneinfo_show_print(struct s
for_each_online_cpu(i) {
struct per_cpu_pageset *pageset;
- pageset = zone_pcp(zone, i);
+ pageset = per_cpu_ptr(zone->pageset, i);
seq_printf(m,
"\n cpu: %i"
"\n count: %i"
On Tue, Oct 06, 2009 at 01:51:38PM -0400, Christoph Lameter wrote:
> On Tue, 6 Oct 2009, Mel Gorman wrote:
>
> > While I have no problem as such with the local_irq_save() moving (although
> > I would like PGFREE and PGALLOC to be accounted both with or without IRQs
> > enabled), I think it deserves to be in a patch all to itself and not hidden
> > in an apparently unrelated change.
>
> Ok I have moved the local_irq_save back.
>
Thanks.
> Full patch (will push it back into my git tree if you approve)
>
A few more comments, I'm afraid :(
> From: Christoph Lameter <[email protected]>
> Subject: this_cpu_ops: page allocator conversion
>
> Use the per cpu allocator functionality to avoid per cpu arrays in struct zone.
>
> This drastically reduces the size of struct zone for systems with large
> amounts of processors and allows placement of critical variables of struct
> zone in one cacheline even on very large systems.
>
> Another effect is that the pagesets of one processor are placed near one
> another. If multiple pagesets from different zones fit into one cacheline
> then additional cacheline fetches can be avoided on the hot paths when
> allocating memory from multiple zones.
>
> Bootstrap becomes simpler if we use the same scheme for UP, SMP, NUMA. #ifdefs
> are reduced and we can drop the zone_pcp macro.
>
> Hotplug handling is also simplified since cpu alloc can bring up and
> shut down cpu areas for a specific cpu as a whole. So there is no need to
> allocate or free individual pagesets.
>
> Cc: Mel Gorman <[email protected]>
> Signed-off-by: Christoph Lameter <[email protected]>
>
> ---
> include/linux/mm.h | 4 -
> include/linux/mmzone.h | 12 ---
> mm/page_alloc.c | 157 ++++++++++++++-----------------------------------
> mm/vmstat.c | 14 ++--
> 4 files changed, 56 insertions(+), 131 deletions(-)
>
> Index: linux-2.6/include/linux/mm.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mm.h 2009-10-06 12:41:19.000000000 -0500
> +++ linux-2.6/include/linux/mm.h 2009-10-06 12:41:19.000000000 -0500
> @@ -1062,11 +1062,7 @@ extern void si_meminfo_node(struct sysin
> extern int after_bootmem;
> extern void setup_pagesets(void);
>
> -#ifdef CONFIG_NUMA
> extern void setup_per_cpu_pageset(void);
> -#else
> -static inline void setup_per_cpu_pageset(void) {}
> -#endif
>
> extern void zone_pcp_update(struct zone *zone);
>
> Index: linux-2.6/include/linux/mmzone.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mmzone.h 2009-10-05 15:33:08.000000000 -0500
> +++ linux-2.6/include/linux/mmzone.h 2009-10-06 12:41:19.000000000 -0500
> @@ -184,13 +184,7 @@ struct per_cpu_pageset {
> s8 stat_threshold;
> s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
> #endif
> -} ____cacheline_aligned_in_smp;
> -
> -#ifdef CONFIG_NUMA
> -#define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)])
> -#else
> -#define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)])
> -#endif
> +};
>
> #endif /* !__GENERATING_BOUNDS.H */
>
> @@ -306,10 +300,8 @@ struct zone {
> */
> unsigned long min_unmapped_pages;
> unsigned long min_slab_pages;
> - struct per_cpu_pageset *pageset[NR_CPUS];
> -#else
> - struct per_cpu_pageset pageset[NR_CPUS];
> #endif
> + struct per_cpu_pageset *pageset;
> /*
> * free areas of different sizes
> */
> Index: linux-2.6/mm/page_alloc.c
> ===================================================================
> --- linux-2.6.orig/mm/page_alloc.c 2009-10-06 12:41:19.000000000 -0500
> +++ linux-2.6/mm/page_alloc.c 2009-10-06 12:43:27.000000000 -0500
> @@ -1011,7 +1011,7 @@ static void drain_pages(unsigned int cpu
> struct per_cpu_pageset *pset;
> struct per_cpu_pages *pcp;
>
> - pset = zone_pcp(zone, cpu);
> + pset = per_cpu_ptr(zone->pageset, cpu);
>
> pcp = &pset->pcp;
> local_irq_save(flags);
It's not your fault and it doesn't actually matter to the current callers
of drain_pages, but you might as well move the per_cpu_ptr inside the
local_irq_save() here as well while you're changing things.
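Something like the following is what I have in mind (just a sketch of the
ordering, assuming the surrounding for_each_populated_zone() loop is
otherwise untouched):

	local_irq_save(flags);
	/* Look up the pageset only once interrupts are off */
	pset = per_cpu_ptr(zone->pageset, cpu);
	pcp = &pset->pcp;
	free_pcppages_bulk(zone, pcp->count, pcp);
	pcp->count = 0;
	local_irq_restore(flags);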
> @@ -1098,7 +1098,6 @@ static void free_hot_cold_page(struct pa
> arch_free_page(page, 0);
> kernel_map_pages(page, 1, 0);
>
> - pcp = &zone_pcp(zone, get_cpu())->pcp;
> migratetype = get_pageblock_migratetype(page);
> set_page_private(page, migratetype);
> local_irq_save(flags);
> @@ -1121,6 +1120,7 @@ static void free_hot_cold_page(struct pa
> migratetype = MIGRATE_MOVABLE;
> }
>
> + pcp = &this_cpu_ptr(zone->pageset)->pcp;
> if (cold)
> list_add_tail(&page->lru, &pcp->lists[migratetype]);
> else
> @@ -1133,7 +1133,6 @@ static void free_hot_cold_page(struct pa
>
> out:
> local_irq_restore(flags);
> - put_cpu();
> }
>
> void free_hot_page(struct page *page)
> @@ -1183,15 +1182,13 @@ struct page *buffered_rmqueue(struct zon
> unsigned long flags;
> struct page *page;
> int cold = !!(gfp_flags & __GFP_COLD);
> - int cpu;
>
> again:
> - cpu = get_cpu();
> if (likely(order == 0)) {
> struct per_cpu_pages *pcp;
> struct list_head *list;
>
> - pcp = &zone_pcp(zone, cpu)->pcp;
> + pcp = &this_cpu_ptr(zone->pageset)->pcp;
> list = &pcp->lists[migratetype];
> local_irq_save(flags);
I believe this falls foul of the same problem as in the free path. We
are no longer preempt safe and this_cpu_ptr() needs to move within the
local_irq_save().
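To illustrate what I mean (ordering sketch only, not the final hunk):

	/* Not preempt safe: we may be migrated to another cpu between the
	 * lookup and the irq disable, and then manipulate a foreign pcp
	 * list with that cpu's interrupts still enabled.
	 */
	pcp = &this_cpu_ptr(zone->pageset)->pcp;
	list = &pcp->lists[migratetype];
	local_irq_save(flags);

	/* Preempt safe: once interrupts are off we cannot migrate, so the
	 * pageset we look up is guaranteed to be this cpu's.
	 */
	local_irq_save(flags);
	pcp = &this_cpu_ptr(zone->pageset)->pcp;
	list = &pcp->lists[migratetype];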
I didn't spot anything out of the ordinary after this but I haven't tested
the series.
> if (list_empty(list)) {
> @@ -1234,7 +1231,6 @@ again:
> __count_zone_vm_events(PGALLOC, zone, 1 << order);
> zone_statistics(preferred_zone, zone);
> local_irq_restore(flags);
> - put_cpu();
>
> VM_BUG_ON(bad_range(zone, page));
> if (prep_new_page(page, order, gfp_flags))
> @@ -1243,7 +1239,6 @@ again:
>
> failed:
> local_irq_restore(flags);
> - put_cpu();
> return NULL;
> }
>
> @@ -2172,7 +2167,7 @@ void show_free_areas(void)
> for_each_online_cpu(cpu) {
> struct per_cpu_pageset *pageset;
>
> - pageset = zone_pcp(zone, cpu);
> + pageset = per_cpu_ptr(zone->pageset, cpu);
>
> printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
> cpu, pageset->pcp.high,
> @@ -3087,7 +3082,6 @@ static void setup_pagelist_highmark(stru
> }
>
>
> -#ifdef CONFIG_NUMA
> /*
> * Boot pageset table. One per cpu which is going to be used for all
> * zones and all nodes. The parameters will be set in such a way
> @@ -3095,112 +3089,68 @@ static void setup_pagelist_highmark(stru
> * the buddy list. This is safe since pageset manipulation is done
> * with interrupts disabled.
> *
> - * Some NUMA counter updates may also be caught by the boot pagesets.
> - *
> - * The boot_pagesets must be kept even after bootup is complete for
> - * unused processors and/or zones. They do play a role for bootstrapping
> - * hotplugged processors.
> + * Some counter updates may also be caught by the boot pagesets.
> *
> * zoneinfo_show() and maybe other functions do
> * not check if the processor is online before following the pageset pointer.
> * Other parts of the kernel may not check if the zone is available.
> */
> -static struct per_cpu_pageset boot_pageset[NR_CPUS];
> -
> -/*
> - * Dynamically allocate memory for the
> - * per cpu pageset array in struct zone.
> - */
> -static int __cpuinit process_zones(int cpu)
> -{
> - struct zone *zone, *dzone;
> - int node = cpu_to_node(cpu);
> -
> - node_set_state(node, N_CPU); /* this node has a cpu */
> -
> - for_each_populated_zone(zone) {
> - zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
> - GFP_KERNEL, node);
> - if (!zone_pcp(zone, cpu))
> - goto bad;
> -
> - setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
> -
> - if (percpu_pagelist_fraction)
> - setup_pagelist_highmark(zone_pcp(zone, cpu),
> - (zone->present_pages / percpu_pagelist_fraction));
> - }
> -
> - return 0;
> -bad:
> - for_each_zone(dzone) {
> - if (!populated_zone(dzone))
> - continue;
> - if (dzone == zone)
> - break;
> - kfree(zone_pcp(dzone, cpu));
> - zone_pcp(dzone, cpu) = &boot_pageset[cpu];
> - }
> - return -ENOMEM;
> -}
> -
> -static inline void free_zone_pagesets(int cpu)
> -{
> - struct zone *zone;
> -
> - for_each_zone(zone) {
> - struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
> -
> - /* Free per_cpu_pageset if it is slab allocated */
> - if (pset != &boot_pageset[cpu])
> - kfree(pset);
> - zone_pcp(zone, cpu) = &boot_pageset[cpu];
> - }
> -}
> +static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
>
> static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
> unsigned long action,
> void *hcpu)
> {
> int cpu = (long)hcpu;
> - int ret = NOTIFY_OK;
>
> switch (action) {
> case CPU_UP_PREPARE:
> case CPU_UP_PREPARE_FROZEN:
> - if (process_zones(cpu))
> - ret = NOTIFY_BAD;
> - break;
> - case CPU_UP_CANCELED:
> - case CPU_UP_CANCELED_FROZEN:
> - case CPU_DEAD:
> - case CPU_DEAD_FROZEN:
> - free_zone_pagesets(cpu);
> + node_set_state(cpu_to_node(cpu), N_CPU);
> break;
> default:
> break;
> }
> - return ret;
> + return NOTIFY_OK;
> }
>
> static struct notifier_block __cpuinitdata pageset_notifier =
> { &pageset_cpuup_callback, NULL, 0 };
>
> +/*
> + * Allocate per cpu pagesets and initialize them.
> + * Before this call only boot pagesets were available.
> + * Boot pagesets will no longer be used by this processor
> + * after setup_per_cpu_pageset().
> + */
> void __init setup_per_cpu_pageset(void)
> {
> - int err;
> + struct zone *zone;
> + int cpu;
> +
> + for_each_populated_zone(zone) {
> + zone->pageset = alloc_percpu(struct per_cpu_pageset);
>
> - /* Initialize per_cpu_pageset for cpu 0.
> - * A cpuup callback will do this for every cpu
> - * as it comes online
> + for_each_possible_cpu(cpu) {
> + struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
> +
> + setup_pageset(pcp, zone_batchsize(zone));
> +
> + if (percpu_pagelist_fraction)
> + setup_pagelist_highmark(pcp,
> + (zone->present_pages /
> + percpu_pagelist_fraction));
> + }
> + }
> +
> + /*
> + * The boot cpu is always the first active.
> + * The boot node has a processor
> */
> - err = process_zones(smp_processor_id());
> - BUG_ON(err);
> + node_set_state(cpu_to_node(smp_processor_id()), N_CPU);
> register_cpu_notifier(&pageset_notifier);
> }
>
> -#endif
> -
> static noinline __init_refok
> int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
> {
> @@ -3254,7 +3204,7 @@ static int __zone_pcp_update(void *data)
> struct per_cpu_pageset *pset;
> struct per_cpu_pages *pcp;
>
> - pset = zone_pcp(zone, cpu);
> + pset = per_cpu_ptr(zone->pageset, cpu);
> pcp = &pset->pcp;
>
> local_irq_save(flags);
> @@ -3272,15 +3222,7 @@ void zone_pcp_update(struct zone *zone)
>
> /*
> * Early setup of pagesets.
> - *
> - * In the NUMA case the pageset setup simply results in all zones pcp
> - * pointer being directed at a per cpu pageset with zero batchsize.
> - *
> - * This means that every free and every allocation occurs directly from
> - * the buddy allocator tables.
> - *
> - * The pageset never queues pages during early boot and is therefore usable
> - * for every type of zone.
> + * At this point various allocators are not operational yet.
> */
> __meminit void setup_pagesets(void)
> {
> @@ -3288,23 +3230,15 @@ __meminit void setup_pagesets(void)
> struct zone *zone;
>
> for_each_zone(zone) {
> -#ifdef CONFIG_NUMA
> - unsigned long batch = 0;
> -
> - for (cpu = 0; cpu < NR_CPUS; cpu++) {
> - /* Early boot. Slab allocator not functional yet */
> - zone_pcp(zone, cpu) = &boot_pageset[cpu];
> - }
> -#else
> - unsigned long batch = zone_batchsize(zone);
> -#endif
> + zone->pageset = &per_cpu_var(boot_pageset);
>
> + /*
> + * Special pagesets with one element so that frees
> + * and allocations are not buffered at all.
> + */
> for_each_possible_cpu(cpu)
> - setup_pageset(zone_pcp(zone, cpu), batch);
> + setup_pageset(per_cpu_ptr(zone->pageset, cpu), 1);
>
> - if (zone->present_pages)
> - printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
> - zone->name, zone->present_pages, batch);
> }
> }
>
> @@ -4818,10 +4752,11 @@ int percpu_pagelist_fraction_sysctl_hand
> if (!write || (ret == -EINVAL))
> return ret;
> for_each_populated_zone(zone) {
> - for_each_online_cpu(cpu) {
> + for_each_possible_cpu(cpu) {
> unsigned long high;
> high = zone->present_pages / percpu_pagelist_fraction;
> - setup_pagelist_highmark(zone_pcp(zone, cpu), high);
> + setup_pagelist_highmark(
> + per_cpu_ptr(zone->pageset, cpu), high);
> }
> }
> return 0;
> Index: linux-2.6/mm/vmstat.c
> ===================================================================
> --- linux-2.6.orig/mm/vmstat.c 2009-10-05 15:33:08.000000000 -0500
> +++ linux-2.6/mm/vmstat.c 2009-10-06 12:43:22.000000000 -0500
> @@ -139,7 +139,8 @@ static void refresh_zone_stat_thresholds
> threshold = calculate_threshold(zone);
>
> for_each_online_cpu(cpu)
> - zone_pcp(zone, cpu)->stat_threshold = threshold;
> + per_cpu_ptr(zone->pageset, cpu)->stat_threshold
> + = threshold;
> }
> }
>
> @@ -149,7 +150,8 @@ static void refresh_zone_stat_thresholds
> void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
> int delta)
> {
> - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
> + struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
> +
> s8 *p = pcp->vm_stat_diff + item;
> long x;
>
> @@ -202,7 +204,7 @@ EXPORT_SYMBOL(mod_zone_page_state);
> */
> void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
> {
> - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
> + struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
> s8 *p = pcp->vm_stat_diff + item;
>
> (*p)++;
> @@ -223,7 +225,7 @@ EXPORT_SYMBOL(__inc_zone_page_state);
>
> void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
> {
> - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
> + struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
> s8 *p = pcp->vm_stat_diff + item;
>
> (*p)--;
> @@ -300,7 +302,7 @@ void refresh_cpu_vm_stats(int cpu)
> for_each_populated_zone(zone) {
> struct per_cpu_pageset *p;
>
> - p = zone_pcp(zone, cpu);
> + p = per_cpu_ptr(zone->pageset, cpu);
>
> for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
> if (p->vm_stat_diff[i]) {
> @@ -738,7 +740,7 @@ static void zoneinfo_show_print(struct s
> for_each_online_cpu(i) {
> struct per_cpu_pageset *pageset;
>
> - pageset = zone_pcp(zone, i);
> + pageset = per_cpu_ptr(zone->pageset, i);
> seq_printf(m,
> "\n cpu: %i"
> "\n count: %i"
>
--
Mel Gorman
Part-time Phd Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab
On Tue, 6 Oct 2009, Mel Gorman wrote:
> > --- linux-2.6.orig/mm/page_alloc.c 2009-10-06 12:41:19.000000000 -0500
> > +++ linux-2.6/mm/page_alloc.c 2009-10-06 12:43:27.000000000 -0500
> > @@ -1011,7 +1011,7 @@ static void drain_pages(unsigned int cpu
> > struct per_cpu_pageset *pset;
> > struct per_cpu_pages *pcp;
> >
> > - pset = zone_pcp(zone, cpu);
> > + pset = per_cpu_ptr(zone->pageset, cpu);
> >
> > pcp = &pset->pcp;
> > local_irq_save(flags);
>
> It's not your fault and it doesn't actually matter to the current callers
> of drain_pages, but you might as well move the per_cpu_ptr inside the
> local_irq_save() here as well while you're changing things.
The comments before drain_pages() clearly state that the caller must be
pinned to a processor. But let's change it for consistency's sake.
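For reference, all that "pinned" means for a (hypothetical) caller is
something like:

	int cpu;

	cpu = get_cpu();	/* disables preemption, pins us to this cpu */
	drain_pages(cpu);	/* we cannot be migrated until put_cpu() */
	put_cpu();

The existing callers already arrange for that, which is why nothing
changes for them.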
> > - cpu = get_cpu();
> > if (likely(order == 0)) {
> > struct per_cpu_pages *pcp;
> > struct list_head *list;
> >
> > - pcp = &zone_pcp(zone, cpu)->pcp;
> > + pcp = &this_cpu_ptr(zone->pageset)->pcp;
> > list = &pcp->lists[migratetype];
> > local_irq_save(flags);
>
> I believe this falls foul of the same problem as in the free path. We
> are no longer preempt safe and this_cpu_ptr() needs to move within the
> local_irq_save().
Ok.
From: Christoph Lameter <[email protected]>
Subject: this_cpu_ops: page allocator conversion
Use the per cpu allocator functionality to avoid per cpu arrays in struct zone.
This drastically reduces the size of struct zone for systems with large
amounts of processors and allows placement of critical variables of struct
zone in one cacheline even on very large systems.
Another effect is that the pagesets of one processor are placed near one
another. If multiple pagesets from different zones fit into one cacheline
then additional cacheline fetches can be avoided on the hot paths when
allocating memory from multiple zones.
Bootstrap becomes simpler if we use the same scheme for UP, SMP, NUMA. #ifdefs
are reduced and we can drop the zone_pcp macro.
Hotplug handling is also simplified since cpu alloc can bring up and
shut down cpu areas for a specific cpu as a whole. So there is no need to
allocate or free individual pagesets.
Cc: Mel Gorman <[email protected]>
Signed-off-by: Christoph Lameter <[email protected]>
---
include/linux/mm.h | 4 -
include/linux/mmzone.h | 12 ---
mm/page_alloc.c | 161 ++++++++++++++-----------------------------------
mm/vmstat.c | 14 ++--
4 files changed, 58 insertions(+), 133 deletions(-)
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h 2009-10-06 13:54:25.000000000 -0500
+++ linux-2.6/include/linux/mm.h 2009-10-06 13:54:25.000000000 -0500
@@ -1062,11 +1062,7 @@ extern void si_meminfo_node(struct sysin
extern int after_bootmem;
extern void setup_pagesets(void);
-#ifdef CONFIG_NUMA
extern void setup_per_cpu_pageset(void);
-#else
-static inline void setup_per_cpu_pageset(void) {}
-#endif
extern void zone_pcp_update(struct zone *zone);
Index: linux-2.6/include/linux/mmzone.h
===================================================================
--- linux-2.6.orig/include/linux/mmzone.h 2009-10-06 12:48:46.000000000 -0500
+++ linux-2.6/include/linux/mmzone.h 2009-10-06 13:54:25.000000000 -0500
@@ -184,13 +184,7 @@ struct per_cpu_pageset {
s8 stat_threshold;
s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
#endif
-} ____cacheline_aligned_in_smp;
-
-#ifdef CONFIG_NUMA
-#define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)])
-#else
-#define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)])
-#endif
+};
#endif /* !__GENERATING_BOUNDS.H */
@@ -306,10 +300,8 @@ struct zone {
*/
unsigned long min_unmapped_pages;
unsigned long min_slab_pages;
- struct per_cpu_pageset *pageset[NR_CPUS];
-#else
- struct per_cpu_pageset pageset[NR_CPUS];
#endif
+ struct per_cpu_pageset *pageset;
/*
* free areas of different sizes
*/
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c 2009-10-06 13:54:25.000000000 -0500
+++ linux-2.6/mm/page_alloc.c 2009-10-06 13:59:27.000000000 -0500
@@ -1011,10 +1011,10 @@ static void drain_pages(unsigned int cpu
struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
- pset = zone_pcp(zone, cpu);
+ local_irq_save(flags);
+ pset = per_cpu_ptr(zone->pageset, cpu);
pcp = &pset->pcp;
- local_irq_save(flags);
free_pcppages_bulk(zone, pcp->count, pcp);
pcp->count = 0;
local_irq_restore(flags);
@@ -1098,7 +1098,6 @@ static void free_hot_cold_page(struct pa
arch_free_page(page, 0);
kernel_map_pages(page, 1, 0);
- pcp = &zone_pcp(zone, get_cpu())->pcp;
migratetype = get_pageblock_migratetype(page);
set_page_private(page, migratetype);
local_irq_save(flags);
@@ -1121,6 +1120,7 @@ static void free_hot_cold_page(struct pa
migratetype = MIGRATE_MOVABLE;
}
+ pcp = &this_cpu_ptr(zone->pageset)->pcp;
if (cold)
list_add_tail(&page->lru, &pcp->lists[migratetype]);
else
@@ -1133,7 +1133,6 @@ static void free_hot_cold_page(struct pa
out:
local_irq_restore(flags);
- put_cpu();
}
void free_hot_page(struct page *page)
@@ -1183,17 +1182,15 @@ struct page *buffered_rmqueue(struct zon
unsigned long flags;
struct page *page;
int cold = !!(gfp_flags & __GFP_COLD);
- int cpu;
again:
- cpu = get_cpu();
if (likely(order == 0)) {
struct per_cpu_pages *pcp;
struct list_head *list;
- pcp = &zone_pcp(zone, cpu)->pcp;
- list = &pcp->lists[migratetype];
local_irq_save(flags);
+ pcp = &this_cpu_ptr(zone->pageset)->pcp;
+ list = &pcp->lists[migratetype];
if (list_empty(list)) {
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, list,
@@ -1234,7 +1231,6 @@ again:
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone);
local_irq_restore(flags);
- put_cpu();
VM_BUG_ON(bad_range(zone, page));
if (prep_new_page(page, order, gfp_flags))
@@ -1243,7 +1239,6 @@ again:
failed:
local_irq_restore(flags);
- put_cpu();
return NULL;
}
@@ -2172,7 +2167,7 @@ void show_free_areas(void)
for_each_online_cpu(cpu) {
struct per_cpu_pageset *pageset;
- pageset = zone_pcp(zone, cpu);
+ pageset = per_cpu_ptr(zone->pageset, cpu);
printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
cpu, pageset->pcp.high,
@@ -3087,7 +3082,6 @@ static void setup_pagelist_highmark(stru
}
-#ifdef CONFIG_NUMA
/*
* Boot pageset table. One per cpu which is going to be used for all
* zones and all nodes. The parameters will be set in such a way
@@ -3095,112 +3089,68 @@ static void setup_pagelist_highmark(stru
* the buddy list. This is safe since pageset manipulation is done
* with interrupts disabled.
*
- * Some NUMA counter updates may also be caught by the boot pagesets.
- *
- * The boot_pagesets must be kept even after bootup is complete for
- * unused processors and/or zones. They do play a role for bootstrapping
- * hotplugged processors.
+ * Some counter updates may also be caught by the boot pagesets.
*
* zoneinfo_show() and maybe other functions do
* not check if the processor is online before following the pageset pointer.
* Other parts of the kernel may not check if the zone is available.
*/
-static struct per_cpu_pageset boot_pageset[NR_CPUS];
-
-/*
- * Dynamically allocate memory for the
- * per cpu pageset array in struct zone.
- */
-static int __cpuinit process_zones(int cpu)
-{
- struct zone *zone, *dzone;
- int node = cpu_to_node(cpu);
-
- node_set_state(node, N_CPU); /* this node has a cpu */
-
- for_each_populated_zone(zone) {
- zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
- GFP_KERNEL, node);
- if (!zone_pcp(zone, cpu))
- goto bad;
-
- setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
-
- if (percpu_pagelist_fraction)
- setup_pagelist_highmark(zone_pcp(zone, cpu),
- (zone->present_pages / percpu_pagelist_fraction));
- }
-
- return 0;
-bad:
- for_each_zone(dzone) {
- if (!populated_zone(dzone))
- continue;
- if (dzone == zone)
- break;
- kfree(zone_pcp(dzone, cpu));
- zone_pcp(dzone, cpu) = &boot_pageset[cpu];
- }
- return -ENOMEM;
-}
-
-static inline void free_zone_pagesets(int cpu)
-{
- struct zone *zone;
-
- for_each_zone(zone) {
- struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
-
- /* Free per_cpu_pageset if it is slab allocated */
- if (pset != &boot_pageset[cpu])
- kfree(pset);
- zone_pcp(zone, cpu) = &boot_pageset[cpu];
- }
-}
+static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
int cpu = (long)hcpu;
- int ret = NOTIFY_OK;
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- if (process_zones(cpu))
- ret = NOTIFY_BAD;
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- free_zone_pagesets(cpu);
+ node_set_state(cpu_to_node(cpu), N_CPU);
break;
default:
break;
}
- return ret;
+ return NOTIFY_OK;
}
static struct notifier_block __cpuinitdata pageset_notifier =
{ &pageset_cpuup_callback, NULL, 0 };
+/*
+ * Allocate per cpu pagesets and initialize them.
+ * Before this call only boot pagesets were available.
+ * Boot pagesets will no longer be used by this processor
+ * after setup_per_cpu_pageset().
+ */
void __init setup_per_cpu_pageset(void)
{
- int err;
+ struct zone *zone;
+ int cpu;
- /* Initialize per_cpu_pageset for cpu 0.
- * A cpuup callback will do this for every cpu
- * as it comes online
+ for_each_populated_zone(zone) {
+ zone->pageset = alloc_percpu(struct per_cpu_pageset);
+
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
+
+ setup_pageset(pcp, zone_batchsize(zone));
+
+ if (percpu_pagelist_fraction)
+ setup_pagelist_highmark(pcp,
+ (zone->present_pages /
+ percpu_pagelist_fraction));
+ }
+ }
+
+ /*
+ * The boot cpu is always the first active.
+ * The boot node has a processor
*/
- err = process_zones(smp_processor_id());
- BUG_ON(err);
+ node_set_state(cpu_to_node(smp_processor_id()), N_CPU);
register_cpu_notifier(&pageset_notifier);
}
-#endif
-
static noinline __init_refok
int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
{
@@ -3254,7 +3204,7 @@ static int __zone_pcp_update(void *data)
struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
- pset = zone_pcp(zone, cpu);
+ pset = per_cpu_ptr(zone->pageset, cpu);
pcp = &pset->pcp;
local_irq_save(flags);
@@ -3272,15 +3222,7 @@ void zone_pcp_update(struct zone *zone)
/*
* Early setup of pagesets.
- *
- * In the NUMA case the pageset setup simply results in all zones pcp
- * pointer being directed at a per cpu pageset with zero batchsize.
- *
- * This means that every free and every allocation occurs directly from
- * the buddy allocator tables.
- *
- * The pageset never queues pages during early boot and is therefore usable
- * for every type of zone.
+ * At this point various allocators are not operational yet.
*/
__meminit void setup_pagesets(void)
{
@@ -3288,23 +3230,15 @@ __meminit void setup_pagesets(void)
struct zone *zone;
for_each_zone(zone) {
-#ifdef CONFIG_NUMA
- unsigned long batch = 0;
-
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
- /* Early boot. Slab allocator not functional yet */
- zone_pcp(zone, cpu) = &boot_pageset[cpu];
- }
-#else
- unsigned long batch = zone_batchsize(zone);
-#endif
+ zone->pageset = &per_cpu_var(boot_pageset);
+ /*
+ * Special pagesets with one element so that frees
+ * and allocations are not buffered at all.
+ */
for_each_possible_cpu(cpu)
- setup_pageset(zone_pcp(zone, cpu), batch);
+ setup_pageset(per_cpu_ptr(zone->pageset, cpu), 1);
- if (zone->present_pages)
- printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
- zone->name, zone->present_pages, batch);
}
}
@@ -4818,10 +4752,11 @@ int percpu_pagelist_fraction_sysctl_hand
if (!write || (ret == -EINVAL))
return ret;
for_each_populated_zone(zone) {
- for_each_online_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
unsigned long high;
high = zone->present_pages / percpu_pagelist_fraction;
- setup_pagelist_highmark(zone_pcp(zone, cpu), high);
+ setup_pagelist_highmark(
+ per_cpu_ptr(zone->pageset, cpu), high);
}
}
return 0;
Index: linux-2.6/mm/vmstat.c
===================================================================
--- linux-2.6.orig/mm/vmstat.c 2009-10-06 12:48:46.000000000 -0500
+++ linux-2.6/mm/vmstat.c 2009-10-06 13:59:23.000000000 -0500
@@ -139,7 +139,8 @@ static void refresh_zone_stat_thresholds
threshold = calculate_threshold(zone);
for_each_online_cpu(cpu)
- zone_pcp(zone, cpu)->stat_threshold = threshold;
+ per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ = threshold;
}
}
@@ -149,7 +150,8 @@ static void refresh_zone_stat_thresholds
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
int delta)
{
- struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+ struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
+
s8 *p = pcp->vm_stat_diff + item;
long x;
@@ -202,7 +204,7 @@ EXPORT_SYMBOL(mod_zone_page_state);
*/
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
- struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+ struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
s8 *p = pcp->vm_stat_diff + item;
(*p)++;
@@ -223,7 +225,7 @@ EXPORT_SYMBOL(__inc_zone_page_state);
void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
- struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+ struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
s8 *p = pcp->vm_stat_diff + item;
(*p)--;
@@ -300,7 +302,7 @@ void refresh_cpu_vm_stats(int cpu)
for_each_populated_zone(zone) {
struct per_cpu_pageset *p;
- p = zone_pcp(zone, cpu);
+ p = per_cpu_ptr(zone->pageset, cpu);
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
if (p->vm_stat_diff[i]) {
@@ -738,7 +740,7 @@ static void zoneinfo_show_print(struct s
for_each_online_cpu(i) {
struct per_cpu_pageset *pageset;
- pageset = zone_pcp(zone, i);
+ pageset = per_cpu_ptr(zone->pageset, i);
seq_printf(m,
"\n cpu: %i"
"\n count: %i"
On Tue, Oct 06, 2009 at 03:06:27PM -0400, Christoph Lameter wrote:
> On Tue, 6 Oct 2009, Mel Gorman wrote:
>
> > > --- linux-2.6.orig/mm/page_alloc.c 2009-10-06 12:41:19.000000000 -0500
> > > +++ linux-2.6/mm/page_alloc.c 2009-10-06 12:43:27.000000000 -0500
> > > @@ -1011,7 +1011,7 @@ static void drain_pages(unsigned int cpu
> > > struct per_cpu_pageset *pset;
> > > struct per_cpu_pages *pcp;
> > >
> > > - pset = zone_pcp(zone, cpu);
> > > + pset = per_cpu_ptr(zone->pageset, cpu);
> > >
> > > pcp = &pset->pcp;
> > > local_irq_save(flags);
> >
> > It's not your fault and it doesn't actually matter to the current callers
> > of drain_pages, but you might as well move the per_cpu_ptr inside the
> > local_irq_save() here as well while you're changing things.
>
> The comments before drain_pages() clearly state that the caller must be
> pinned to a processor. But let's change it for consistency's sake.
>
I noted the comment all right, hence my saying that it doesn't matter to
the current callers because they obey the rules.
It was consistency I was looking for, but I should have kept quiet because
there are a few oddities like this. It doesn't hurt to fix it, though.
> > > - cpu = get_cpu();
> > > if (likely(order == 0)) {
> > > struct per_cpu_pages *pcp;
> > > struct list_head *list;
> > >
> > > - pcp = &zone_pcp(zone, cpu)->pcp;
> > > + pcp = &this_cpu_ptr(zone->pageset)->pcp;
> > > list = &pcp->lists[migratetype];
> > > local_irq_save(flags);
> >
> > I believe this falls foul of the same problem as in the free path. We
> > are no longer preempt safe and this_cpu_ptr() needs to move within the
> > local_irq_save().
>
> Ok.
>
> From: Christoph Lameter <[email protected]>
> Subject: this_cpu_ops: page allocator conversion
>
> Use the per cpu allocator functionality to avoid per cpu arrays in struct zone.
>
> This drastically reduces the size of struct zone for systems with large
> amounts of processors and allows placement of critical variables of struct
> zone in one cacheline even on very large systems.
>
> Another effect is that the pagesets of one processor are placed near one
> another. If multiple pagesets from different zones fit into one cacheline
> then additional cacheline fetches can be avoided on the hot paths when
> allocating memory from multiple zones.
>
> Bootstrap becomes simpler if we use the same scheme for UP, SMP, NUMA. #ifdefs
> are reduced and we can drop the zone_pcp macro.
>
> Hotplug handling is also simplified since cpu alloc can bring up and
> shut down cpu areas for a specific cpu as a whole. So there is no need to
> allocate or free individual pagesets.
>
> Cc: Mel Gorman <[email protected]>
> Signed-off-by: Christoph Lameter <[email protected]>
>
I can't see anything else to complain about. Performance figures would
be nice but otherwise
Reviewed-by: Mel Gorman <[email protected]>
Thanks
> ---
> include/linux/mm.h | 4 -
> include/linux/mmzone.h | 12 ---
> mm/page_alloc.c | 161 ++++++++++++++-----------------------------------
> mm/vmstat.c | 14 ++--
> 4 files changed, 58 insertions(+), 133 deletions(-)
>
> Index: linux-2.6/include/linux/mm.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mm.h 2009-10-06 13:54:25.000000000 -0500
> +++ linux-2.6/include/linux/mm.h 2009-10-06 13:54:25.000000000 -0500
> @@ -1062,11 +1062,7 @@ extern void si_meminfo_node(struct sysin
> extern int after_bootmem;
> extern void setup_pagesets(void);
>
> -#ifdef CONFIG_NUMA
> extern void setup_per_cpu_pageset(void);
> -#else
> -static inline void setup_per_cpu_pageset(void) {}
> -#endif
>
> extern void zone_pcp_update(struct zone *zone);
>
> Index: linux-2.6/include/linux/mmzone.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mmzone.h 2009-10-06 12:48:46.000000000 -0500
> +++ linux-2.6/include/linux/mmzone.h 2009-10-06 13:54:25.000000000 -0500
> @@ -184,13 +184,7 @@ struct per_cpu_pageset {
> s8 stat_threshold;
> s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
> #endif
> -} ____cacheline_aligned_in_smp;
> -
> -#ifdef CONFIG_NUMA
> -#define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)])
> -#else
> -#define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)])
> -#endif
> +};
>
> #endif /* !__GENERATING_BOUNDS.H */
>
> @@ -306,10 +300,8 @@ struct zone {
> */
> unsigned long min_unmapped_pages;
> unsigned long min_slab_pages;
> - struct per_cpu_pageset *pageset[NR_CPUS];
> -#else
> - struct per_cpu_pageset pageset[NR_CPUS];
> #endif
> + struct per_cpu_pageset *pageset;
> /*
> * free areas of different sizes
> */
> Index: linux-2.6/mm/page_alloc.c
> ===================================================================
> --- linux-2.6.orig/mm/page_alloc.c 2009-10-06 13:54:25.000000000 -0500
> +++ linux-2.6/mm/page_alloc.c 2009-10-06 13:59:27.000000000 -0500
> @@ -1011,10 +1011,10 @@ static void drain_pages(unsigned int cpu
> struct per_cpu_pageset *pset;
> struct per_cpu_pages *pcp;
>
> - pset = zone_pcp(zone, cpu);
> + local_irq_save(flags);
> + pset = per_cpu_ptr(zone->pageset, cpu);
>
> pcp = &pset->pcp;
> - local_irq_save(flags);
> free_pcppages_bulk(zone, pcp->count, pcp);
> pcp->count = 0;
> local_irq_restore(flags);
> @@ -1098,7 +1098,6 @@ static void free_hot_cold_page(struct pa
> arch_free_page(page, 0);
> kernel_map_pages(page, 1, 0);
>
> - pcp = &zone_pcp(zone, get_cpu())->pcp;
> migratetype = get_pageblock_migratetype(page);
> set_page_private(page, migratetype);
> local_irq_save(flags);
> @@ -1121,6 +1120,7 @@ static void free_hot_cold_page(struct pa
> migratetype = MIGRATE_MOVABLE;
> }
>
> + pcp = &this_cpu_ptr(zone->pageset)->pcp;
> if (cold)
> list_add_tail(&page->lru, &pcp->lists[migratetype]);
> else
> @@ -1133,7 +1133,6 @@ static void free_hot_cold_page(struct pa
>
> out:
> local_irq_restore(flags);
> - put_cpu();
> }
>
> void free_hot_page(struct page *page)
> @@ -1183,17 +1182,15 @@ struct page *buffered_rmqueue(struct zon
> unsigned long flags;
> struct page *page;
> int cold = !!(gfp_flags & __GFP_COLD);
> - int cpu;
>
> again:
> - cpu = get_cpu();
> if (likely(order == 0)) {
> struct per_cpu_pages *pcp;
> struct list_head *list;
>
> - pcp = &zone_pcp(zone, cpu)->pcp;
> - list = &pcp->lists[migratetype];
> local_irq_save(flags);
> + pcp = &this_cpu_ptr(zone->pageset)->pcp;
> + list = &pcp->lists[migratetype];
> if (list_empty(list)) {
> pcp->count += rmqueue_bulk(zone, 0,
> pcp->batch, list,
> @@ -1234,7 +1231,6 @@ again:
> __count_zone_vm_events(PGALLOC, zone, 1 << order);
> zone_statistics(preferred_zone, zone);
> local_irq_restore(flags);
> - put_cpu();
>
> VM_BUG_ON(bad_range(zone, page));
> if (prep_new_page(page, order, gfp_flags))
> @@ -1243,7 +1239,6 @@ again:
>
> failed:
> local_irq_restore(flags);
> - put_cpu();
> return NULL;
> }
>
> @@ -2172,7 +2167,7 @@ void show_free_areas(void)
> for_each_online_cpu(cpu) {
> struct per_cpu_pageset *pageset;
>
> - pageset = zone_pcp(zone, cpu);
> + pageset = per_cpu_ptr(zone->pageset, cpu);
>
> printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
> cpu, pageset->pcp.high,
> @@ -3087,7 +3082,6 @@ static void setup_pagelist_highmark(stru
> }
>
>
> -#ifdef CONFIG_NUMA
> /*
> * Boot pageset table. One per cpu which is going to be used for all
> * zones and all nodes. The parameters will be set in such a way
> @@ -3095,112 +3089,68 @@ static void setup_pagelist_highmark(stru
> * the buddy list. This is safe since pageset manipulation is done
> * with interrupts disabled.
> *
> - * Some NUMA counter updates may also be caught by the boot pagesets.
> - *
> - * The boot_pagesets must be kept even after bootup is complete for
> - * unused processors and/or zones. They do play a role for bootstrapping
> - * hotplugged processors.
> + * Some counter updates may also be caught by the boot pagesets.
> *
> * zoneinfo_show() and maybe other functions do
> * not check if the processor is online before following the pageset pointer.
> * Other parts of the kernel may not check if the zone is available.
> */
> -static struct per_cpu_pageset boot_pageset[NR_CPUS];
> -
> -/*
> - * Dynamically allocate memory for the
> - * per cpu pageset array in struct zone.
> - */
> -static int __cpuinit process_zones(int cpu)
> -{
> - struct zone *zone, *dzone;
> - int node = cpu_to_node(cpu);
> -
> - node_set_state(node, N_CPU); /* this node has a cpu */
> -
> - for_each_populated_zone(zone) {
> - zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
> - GFP_KERNEL, node);
> - if (!zone_pcp(zone, cpu))
> - goto bad;
> -
> - setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
> -
> - if (percpu_pagelist_fraction)
> - setup_pagelist_highmark(zone_pcp(zone, cpu),
> - (zone->present_pages / percpu_pagelist_fraction));
> - }
> -
> - return 0;
> -bad:
> - for_each_zone(dzone) {
> - if (!populated_zone(dzone))
> - continue;
> - if (dzone == zone)
> - break;
> - kfree(zone_pcp(dzone, cpu));
> - zone_pcp(dzone, cpu) = &boot_pageset[cpu];
> - }
> - return -ENOMEM;
> -}
> -
> -static inline void free_zone_pagesets(int cpu)
> -{
> - struct zone *zone;
> -
> - for_each_zone(zone) {
> - struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
> -
> - /* Free per_cpu_pageset if it is slab allocated */
> - if (pset != &boot_pageset[cpu])
> - kfree(pset);
> - zone_pcp(zone, cpu) = &boot_pageset[cpu];
> - }
> -}
> +static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
>
> static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
> unsigned long action,
> void *hcpu)
> {
> int cpu = (long)hcpu;
> - int ret = NOTIFY_OK;
>
> switch (action) {
> case CPU_UP_PREPARE:
> case CPU_UP_PREPARE_FROZEN:
> - if (process_zones(cpu))
> - ret = NOTIFY_BAD;
> - break;
> - case CPU_UP_CANCELED:
> - case CPU_UP_CANCELED_FROZEN:
> - case CPU_DEAD:
> - case CPU_DEAD_FROZEN:
> - free_zone_pagesets(cpu);
> + node_set_state(cpu_to_node(cpu), N_CPU);
> break;
> default:
> break;
> }
> - return ret;
> + return NOTIFY_OK;
> }
>
> static struct notifier_block __cpuinitdata pageset_notifier =
> { &pageset_cpuup_callback, NULL, 0 };
>
> +/*
> + * Allocate per cpu pagesets and initialize them.
> + * Before this call only boot pagesets were available.
> + * Boot pagesets will no longer be used by this processor
> + * after setup_per_cpu_pageset().
> + */
> void __init setup_per_cpu_pageset(void)
> {
> - int err;
> + struct zone *zone;
> + int cpu;
>
> - /* Initialize per_cpu_pageset for cpu 0.
> - * A cpuup callback will do this for every cpu
> - * as it comes online
> + for_each_populated_zone(zone) {
> + zone->pageset = alloc_percpu(struct per_cpu_pageset);
> +
> + for_each_possible_cpu(cpu) {
> + struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
> +
> + setup_pageset(pcp, zone_batchsize(zone));
> +
> + if (percpu_pagelist_fraction)
> + setup_pagelist_highmark(pcp,
> + (zone->present_pages /
> + percpu_pagelist_fraction));
> + }
> + }
> +
> + /*
> + * The boot cpu is always the first active.
> + * The boot node has a processor
> */
> - err = process_zones(smp_processor_id());
> - BUG_ON(err);
> + node_set_state(cpu_to_node(smp_processor_id()), N_CPU);
> register_cpu_notifier(&pageset_notifier);
> }
>
> -#endif
> -
> static noinline __init_refok
> int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
> {
> @@ -3254,7 +3204,7 @@ static int __zone_pcp_update(void *data)
> struct per_cpu_pageset *pset;
> struct per_cpu_pages *pcp;
>
> - pset = zone_pcp(zone, cpu);
> + pset = per_cpu_ptr(zone->pageset, cpu);
> pcp = &pset->pcp;
>
> local_irq_save(flags);
> @@ -3272,15 +3222,7 @@ void zone_pcp_update(struct zone *zone)
>
> /*
> * Early setup of pagesets.
> - *
> - * In the NUMA case the pageset setup simply results in all zones pcp
> - * pointer being directed at a per cpu pageset with zero batchsize.
> - *
> - * This means that every free and every allocation occurs directly from
> - * the buddy allocator tables.
> - *
> - * The pageset never queues pages during early boot and is therefore usable
> - * for every type of zone.
> + * At this point various allocators are not operational yet.
> */
> __meminit void setup_pagesets(void)
> {
> @@ -3288,23 +3230,15 @@ __meminit void setup_pagesets(void)
> struct zone *zone;
>
> for_each_zone(zone) {
> -#ifdef CONFIG_NUMA
> - unsigned long batch = 0;
> -
> - for (cpu = 0; cpu < NR_CPUS; cpu++) {
> - /* Early boot. Slab allocator not functional yet */
> - zone_pcp(zone, cpu) = &boot_pageset[cpu];
> - }
> -#else
> - unsigned long batch = zone_batchsize(zone);
> -#endif
> + zone->pageset = &per_cpu_var(boot_pageset);
>
> + /*
> + * Special pagesets with one element so that frees
> + * and allocations are not buffered at all.
> + */
> for_each_possible_cpu(cpu)
> - setup_pageset(zone_pcp(zone, cpu), batch);
> + setup_pageset(per_cpu_ptr(zone->pageset, cpu), 1);
>
> - if (zone->present_pages)
> - printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
> - zone->name, zone->present_pages, batch);
> }
> }
>
> @@ -4818,10 +4752,11 @@ int percpu_pagelist_fraction_sysctl_hand
> if (!write || (ret == -EINVAL))
> return ret;
> for_each_populated_zone(zone) {
> - for_each_online_cpu(cpu) {
> + for_each_possible_cpu(cpu) {
> unsigned long high;
> high = zone->present_pages / percpu_pagelist_fraction;
> - setup_pagelist_highmark(zone_pcp(zone, cpu), high);
> + setup_pagelist_highmark(
> + per_cpu_ptr(zone->pageset, cpu), high);
> }
> }
> return 0;
> Index: linux-2.6/mm/vmstat.c
> ===================================================================
> --- linux-2.6.orig/mm/vmstat.c 2009-10-06 12:48:46.000000000 -0500
> +++ linux-2.6/mm/vmstat.c 2009-10-06 13:59:23.000000000 -0500
> @@ -139,7 +139,8 @@ static void refresh_zone_stat_thresholds
> threshold = calculate_threshold(zone);
>
> for_each_online_cpu(cpu)
> - zone_pcp(zone, cpu)->stat_threshold = threshold;
> + per_cpu_ptr(zone->pageset, cpu)->stat_threshold
> + = threshold;
> }
> }
>
> @@ -149,7 +150,8 @@ static void refresh_zone_stat_thresholds
> void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
> int delta)
> {
> - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
> + struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
> +
> s8 *p = pcp->vm_stat_diff + item;
> long x;
>
> @@ -202,7 +204,7 @@ EXPORT_SYMBOL(mod_zone_page_state);
> */
> void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
> {
> - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
> + struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
> s8 *p = pcp->vm_stat_diff + item;
>
> (*p)++;
> @@ -223,7 +225,7 @@ EXPORT_SYMBOL(__inc_zone_page_state);
>
> void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
> {
> - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
> + struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
> s8 *p = pcp->vm_stat_diff + item;
>
> (*p)--;
> @@ -300,7 +302,7 @@ void refresh_cpu_vm_stats(int cpu)
> for_each_populated_zone(zone) {
> struct per_cpu_pageset *p;
>
> - p = zone_pcp(zone, cpu);
> + p = per_cpu_ptr(zone->pageset, cpu);
>
> for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
> if (p->vm_stat_diff[i]) {
> @@ -738,7 +740,7 @@ static void zoneinfo_show_print(struct s
> for_each_online_cpu(i) {
> struct per_cpu_pageset *pageset;
>
> - pageset = zone_pcp(zone, i);
> + pageset = per_cpu_ptr(zone->pageset, i);
> seq_printf(m,
> "\n cpu: %i"
> "\n count: %i"
>
--
Mel Gorman
Part-time Phd Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab