2009-12-14 22:05:56

by Christoph Lameter

Subject: [this_cpu_xx V7 1/8] this_cpu_ops: page allocator conversion

Use the per cpu allocator functionality to avoid per cpu arrays in struct zone.

This drastically reduces the size of struct zone for systems with large
numbers of processors and allows placement of the critical variables of struct
zone in one cacheline even on very large systems.

Another effect is that the pagesets of one processor are placed near one
another. If multiple pagesets from different zones fit into one cacheline
then additional cacheline fetches can be avoided on the hot paths when
allocating memory from multiple zones.

Bootstrap becomes simpler if we use the same scheme for UP, SMP, NUMA. #ifdefs
are reduced and we can drop the zone_pcp macro.

Hotplug handling is also simplified since the per cpu allocator can bring up
and shut down the per cpu areas for a specific cpu as a whole, so there is no
need to allocate or free individual pagesets.
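
The access pattern changes roughly like this (illustrative sketch, not part
of the diff):

	/* before: per-zone NR_CPUS array, hidden behind the zone_pcp() macro */
	pset = zone_pcp(zone, cpu);

	/* after: one percpu allocation per zone */
	pset = per_cpu_ptr(zone->pageset, cpu);	/* a specific cpu */
	pset = this_cpu_ptr(zone->pageset);	/* the local cpu, irqs disabled */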

V4-V5:
- Fix up cases where per_cpu_ptr is called before irq disable
- Integrate the bootstrap logic that was separate before.

Reviewed-by: Mel Gorman <[email protected]>
Signed-off-by: Christoph Lameter <[email protected]>

---
include/linux/mm.h | 4 -
include/linux/mmzone.h | 12 ---
mm/page_alloc.c | 187 ++++++++++++++++++-------------------------------
mm/vmstat.c | 14 ++-
4 files changed, 81 insertions(+), 136 deletions(-)

Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h 2009-12-14 15:13:43.000000000 -0600
+++ linux-2.6/include/linux/mm.h 2009-12-14 15:20:45.000000000 -0600
@@ -1061,11 +1061,7 @@ extern void si_meminfo(struct sysinfo *
extern void si_meminfo_node(struct sysinfo *val, int nid);
extern int after_bootmem;

-#ifdef CONFIG_NUMA
extern void setup_per_cpu_pageset(void);
-#else
-static inline void setup_per_cpu_pageset(void) {}
-#endif

extern void zone_pcp_update(struct zone *zone);

Index: linux-2.6/include/linux/mmzone.h
===================================================================
--- linux-2.6.orig/include/linux/mmzone.h 2009-12-14 15:13:43.000000000 -0600
+++ linux-2.6/include/linux/mmzone.h 2009-12-14 15:20:45.000000000 -0600
@@ -184,13 +184,7 @@ struct per_cpu_pageset {
s8 stat_threshold;
s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
#endif
-} ____cacheline_aligned_in_smp;
-
-#ifdef CONFIG_NUMA
-#define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)])
-#else
-#define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)])
-#endif
+};

#endif /* !__GENERATING_BOUNDS.H */

@@ -306,10 +300,8 @@ struct zone {
*/
unsigned long min_unmapped_pages;
unsigned long min_slab_pages;
- struct per_cpu_pageset *pageset[NR_CPUS];
-#else
- struct per_cpu_pageset pageset[NR_CPUS];
#endif
+ struct per_cpu_pageset *pageset;
/*
* free areas of different sizes
*/
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c 2009-12-14 15:13:43.000000000 -0600
+++ linux-2.6/mm/page_alloc.c 2009-12-14 15:21:17.000000000 -0600
@@ -1011,10 +1011,10 @@ static void drain_pages(unsigned int cpu
struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;

- pset = zone_pcp(zone, cpu);
+ local_irq_save(flags);
+ pset = per_cpu_ptr(zone->pageset, cpu);

pcp = &pset->pcp;
- local_irq_save(flags);
free_pcppages_bulk(zone, pcp->count, pcp);
pcp->count = 0;
local_irq_restore(flags);
@@ -1098,7 +1098,6 @@ static void free_hot_cold_page(struct pa
arch_free_page(page, 0);
kernel_map_pages(page, 1, 0);

- pcp = &zone_pcp(zone, get_cpu())->pcp;
migratetype = get_pageblock_migratetype(page);
set_page_private(page, migratetype);
local_irq_save(flags);
@@ -1121,6 +1120,7 @@ static void free_hot_cold_page(struct pa
migratetype = MIGRATE_MOVABLE;
}

+ pcp = &this_cpu_ptr(zone->pageset)->pcp;
if (cold)
list_add_tail(&page->lru, &pcp->lists[migratetype]);
else
@@ -1133,7 +1133,6 @@ static void free_hot_cold_page(struct pa

out:
local_irq_restore(flags);
- put_cpu();
}

void free_hot_page(struct page *page)
@@ -1183,17 +1182,15 @@ struct page *buffered_rmqueue(struct zon
unsigned long flags;
struct page *page;
int cold = !!(gfp_flags & __GFP_COLD);
- int cpu;

again:
- cpu = get_cpu();
if (likely(order == 0)) {
struct per_cpu_pages *pcp;
struct list_head *list;

- pcp = &zone_pcp(zone, cpu)->pcp;
- list = &pcp->lists[migratetype];
local_irq_save(flags);
+ pcp = &this_cpu_ptr(zone->pageset)->pcp;
+ list = &pcp->lists[migratetype];
if (list_empty(list)) {
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, list,
@@ -1234,7 +1231,6 @@ again:
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone);
local_irq_restore(flags);
- put_cpu();

VM_BUG_ON(bad_range(zone, page));
if (prep_new_page(page, order, gfp_flags))
@@ -1243,7 +1239,6 @@ again:

failed:
local_irq_restore(flags);
- put_cpu();
return NULL;
}

@@ -2172,7 +2167,7 @@ void show_free_areas(void)
for_each_online_cpu(cpu) {
struct per_cpu_pageset *pageset;

- pageset = zone_pcp(zone, cpu);
+ pageset = per_cpu_ptr(zone->pageset, cpu);

printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
cpu, pageset->pcp.high,
@@ -2734,10 +2729,29 @@ static void build_zonelist_cache(pg_data

#endif /* CONFIG_NUMA */

+/*
+ * Boot pageset table. One per cpu which is going to be used for all
+ * zones and all nodes. The parameters will be set in such a way
+ * that an item put on a list will immediately be handed over to
+ * the buddy list. This is safe since pageset manipulation is done
+ * with interrupts disabled.
+ *
+ * The boot_pagesets must be kept even after bootup is complete for
+ * unused processors and/or zones. They do play a role for bootstrapping
+ * hotplugged processors.
+ *
+ * zoneinfo_show() and maybe other functions do
+ * not check if the processor is online before following the pageset pointer.
+ * Other parts of the kernel may not check if the zone is available.
+ */
+static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
+static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
+
/* return values int ....just for stop_machine() */
static int __build_all_zonelists(void *dummy)
{
int nid;
+ int cpu;

#ifdef CONFIG_NUMA
memset(node_load, 0, sizeof(node_load));
@@ -2748,6 +2762,14 @@ static int __build_all_zonelists(void *d
build_zonelists(pgdat);
build_zonelist_cache(pgdat);
}
+
+ /*
+ * Initialize the boot_pagesets that are going to be used
+ * for bootstrapping processors.
+ */
+ for_each_possible_cpu(cpu)
+ setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+
return 0;
}

@@ -3086,120 +3108,60 @@ static void setup_pagelist_highmark(stru
}


-#ifdef CONFIG_NUMA
-/*
- * Boot pageset table. One per cpu which is going to be used for all
- * zones and all nodes. The parameters will be set in such a way
- * that an item put on a list will immediately be handed over to
- * the buddy list. This is safe since pageset manipulation is done
- * with interrupts disabled.
- *
- * Some NUMA counter updates may also be caught by the boot pagesets.
- *
- * The boot_pagesets must be kept even after bootup is complete for
- * unused processors and/or zones. They do play a role for bootstrapping
- * hotplugged processors.
- *
- * zoneinfo_show() and maybe other functions do
- * not check if the processor is online before following the pageset pointer.
- * Other parts of the kernel may not check if the zone is available.
- */
-static struct per_cpu_pageset boot_pageset[NR_CPUS];
-
-/*
- * Dynamically allocate memory for the
- * per cpu pageset array in struct zone.
- */
-static int __cpuinit process_zones(int cpu)
-{
- struct zone *zone, *dzone;
- int node = cpu_to_node(cpu);
-
- node_set_state(node, N_CPU); /* this node has a cpu */
-
- for_each_populated_zone(zone) {
- zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
- GFP_KERNEL, node);
- if (!zone_pcp(zone, cpu))
- goto bad;
-
- setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
-
- if (percpu_pagelist_fraction)
- setup_pagelist_highmark(zone_pcp(zone, cpu),
- (zone->present_pages / percpu_pagelist_fraction));
- }
-
- return 0;
-bad:
- for_each_zone(dzone) {
- if (!populated_zone(dzone))
- continue;
- if (dzone == zone)
- break;
- kfree(zone_pcp(dzone, cpu));
- zone_pcp(dzone, cpu) = &boot_pageset[cpu];
- }
- return -ENOMEM;
-}
-
-static inline void free_zone_pagesets(int cpu)
-{
- struct zone *zone;
-
- for_each_zone(zone) {
- struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
-
- /* Free per_cpu_pageset if it is slab allocated */
- if (pset != &boot_pageset[cpu])
- kfree(pset);
- zone_pcp(zone, cpu) = &boot_pageset[cpu];
- }
-}
-
static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
int cpu = (long)hcpu;
- int ret = NOTIFY_OK;

switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- if (process_zones(cpu))
- ret = NOTIFY_BAD;
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- free_zone_pagesets(cpu);
+ node_set_state(cpu_to_node(cpu), N_CPU);
break;
default:
break;
}
- return ret;
+ return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata pageset_notifier =
{ &pageset_cpuup_callback, NULL, 0 };

+/*
+ * Allocate per cpu pagesets and initialize them.
+ * Before this call only boot pagesets were available.
+ * Boot pagesets will no longer be used by this processor
+ * after setup_per_cpu_pageset().
+ */
void __init setup_per_cpu_pageset(void)
{
- int err;
+ struct zone *zone;
+ int cpu;
+
+ for_each_populated_zone(zone) {
+ zone->pageset = alloc_percpu(struct per_cpu_pageset);
+
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
+
+ setup_pageset(pcp, zone_batchsize(zone));
+
+ if (percpu_pagelist_fraction)
+ setup_pagelist_highmark(pcp,
+ (zone->present_pages /
+ percpu_pagelist_fraction));
+ }
+ }

- /* Initialize per_cpu_pageset for cpu 0.
- * A cpuup callback will do this for every cpu
- * as it comes online
+ /*
+ * The boot cpu is always the first active.
+ * The boot node has a processor
*/
- err = process_zones(smp_processor_id());
- BUG_ON(err);
+ node_set_state(cpu_to_node(smp_processor_id()), N_CPU);
register_cpu_notifier(&pageset_notifier);
}

-#endif
-
static noinline __init_refok
int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
{
@@ -3253,7 +3215,7 @@ static int __zone_pcp_update(void *data)
struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;

- pset = zone_pcp(zone, cpu);
+ pset = per_cpu_ptr(zone->pageset, cpu);
pcp = &pset->pcp;

local_irq_save(flags);
@@ -3271,21 +3233,13 @@ void zone_pcp_update(struct zone *zone)

static __meminit void zone_pcp_init(struct zone *zone)
{
- int cpu;
- unsigned long batch = zone_batchsize(zone);
+ /* Use boot pagesets until we have the per cpu allocator up */
+ zone->pageset = &per_cpu_var(boot_pageset);

- for (cpu = 0; cpu < NR_CPUS; cpu++) {
-#ifdef CONFIG_NUMA
- /* Early boot. Slab allocator not functional yet */
- zone_pcp(zone, cpu) = &boot_pageset[cpu];
- setup_pageset(&boot_pageset[cpu],0);
-#else
- setup_pageset(zone_pcp(zone,cpu), batch);
-#endif
- }
if (zone->present_pages)
- printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
- zone->name, zone->present_pages, batch);
+ printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
+ zone->name, zone->present_pages,
+ zone_batchsize(zone));
}

__meminit int init_currently_empty_zone(struct zone *zone,
@@ -4799,10 +4753,11 @@ int percpu_pagelist_fraction_sysctl_hand
if (!write || (ret == -EINVAL))
return ret;
for_each_populated_zone(zone) {
- for_each_online_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
unsigned long high;
high = zone->present_pages / percpu_pagelist_fraction;
- setup_pagelist_highmark(zone_pcp(zone, cpu), high);
+ setup_pagelist_highmark(
+ per_cpu_ptr(zone->pageset, cpu), high);
}
}
return 0;
Index: linux-2.6/mm/vmstat.c
===================================================================
--- linux-2.6.orig/mm/vmstat.c 2009-12-14 15:13:43.000000000 -0600
+++ linux-2.6/mm/vmstat.c 2009-12-14 15:20:45.000000000 -0600
@@ -139,7 +139,8 @@ static void refresh_zone_stat_thresholds
threshold = calculate_threshold(zone);

for_each_online_cpu(cpu)
- zone_pcp(zone, cpu)->stat_threshold = threshold;
+ per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ = threshold;
}
}

@@ -149,7 +150,8 @@ static void refresh_zone_stat_thresholds
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
int delta)
{
- struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+ struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
+
s8 *p = pcp->vm_stat_diff + item;
long x;

@@ -202,7 +204,7 @@ EXPORT_SYMBOL(mod_zone_page_state);
*/
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
- struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+ struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
s8 *p = pcp->vm_stat_diff + item;

(*p)++;
@@ -223,7 +225,7 @@ EXPORT_SYMBOL(__inc_zone_page_state);

void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
- struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+ struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
s8 *p = pcp->vm_stat_diff + item;

(*p)--;
@@ -300,7 +302,7 @@ void refresh_cpu_vm_stats(int cpu)
for_each_populated_zone(zone) {
struct per_cpu_pageset *p;

- p = zone_pcp(zone, cpu);
+ p = per_cpu_ptr(zone->pageset, cpu);

for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
if (p->vm_stat_diff[i]) {
@@ -738,7 +740,7 @@ static void zoneinfo_show_print(struct s
for_each_online_cpu(i) {
struct per_cpu_pageset *pageset;

- pageset = zone_pcp(zone, i);
+ pageset = per_cpu_ptr(zone->pageset, i);
seq_printf(m,
"\n cpu: %i"
"\n count: %i"

--


2009-12-15 03:53:44

by Tejun Heo

Subject: Re: [this_cpu_xx V7 1/8] this_cpu_ops: page allocator conversion

Hello,

On 12/15/2009 07:03 AM, Christoph Lameter wrote:
> static __meminit void zone_pcp_init(struct zone *zone)
> {
> - int cpu;
> - unsigned long batch = zone_batchsize(zone);
> + /* Use boot pagesets until we have the per cpu allocator up */
> + zone->pageset = &per_cpu_var(boot_pageset);

Ummm... this scares me a little bit. Before, it was a pointer to a
statically allocated area which can be used from basically anywhere.
Now, it's being initialized to a percpu pointer which won't be
available before setup_per_cpu_areas() is complete and the above
initialization takes place from setup_arch() which is before percpu
initialization. I don't think there's anything which would access
page allocator between the two places, but it still seems a bit risky.
Maybe it's better to keep the boot_pageset a static array? Or am I
misunderstanding something?
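
If I am reading the boot sequence right, the ordering in question is roughly
(simplified sketch):

	start_kernel()
		setup_arch()		/* free_area_init_core() -> zone_pcp_init():
					   zone->pageset = &per_cpu_var(boot_pageset) */
		setup_per_cpu_areas()	/* percpu areas become usable */
		build_all_zonelists()	/* boot_pageset contents initialized (per this patch) */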

Thanks.

--
tejun

2009-12-15 15:05:22

by Christoph Lameter

Subject: Re: [this_cpu_xx V7 1/8] this_cpu_ops: page allocator conversion

On Tue, 15 Dec 2009, Tejun Heo wrote:

> Hello,
>
> On 12/15/2009 07:03 AM, Christoph Lameter wrote:
> > static __meminit void zone_pcp_init(struct zone *zone)
> > {
> > - int cpu;
> > - unsigned long batch = zone_batchsize(zone);
> > + /* Use boot pagesets until we have the per cpu allocator up */
> > + zone->pageset = &per_cpu_var(boot_pageset);
>
> Ummm... this scares me a little bit. Before, it was a pointer to a
> statically allocated area which can be used from basically anywhere.
> Now, it's being initialized to a percpu pointer which won't be
> available before setup_per_cpu_areas() is complete and the above
> initialization takes place from setup_arch() which is before percpu
> initialization. I don't think there's anything which would access
> page allocator between the two places, but it still seems a bit risky.
> Maybe it's better to keep the boot_pageset a static array? Or am I
> misunderstanding something?

A static array would have to be dimensioned to NR_CPUS. That is one thing
we are trying to avoid.

The assignment of the pageset "percpu" pointer does not mean that the pcp
is usable. It must first be properly initialized through setup_pageset().

setup_pageset() is run for each cpu. zone->pageset is the same for all
cpus; that is why it is set in zone_pcp_init() and not in setup_pageset().
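
Condensed from the patch, the two steps look roughly like this:

	/* boot, once per zone (zone_pcp_init): point at the static boot pagesets */
	zone->pageset = &per_cpu_var(boot_pageset);

	/* later (setup_per_cpu_pageset): switch each populated zone to a real
	   percpu allocation and initialize it for every cpu */
	zone->pageset = alloc_percpu(struct per_cpu_pageset);
	for_each_possible_cpu(cpu)
		setup_pageset(per_cpu_ptr(zone->pageset, cpu), zone_batchsize(zone));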

The boot pageset initialization was moved into __build_all_zonelists(). We
could move the zone->pageset initialization there too?


2009-12-16 00:52:42

by Tejun Heo

Subject: Re: [this_cpu_xx V7 1/8] this_cpu_ops: page allocator conversion

Hello, Christoph.

On 12/16/2009 12:04 AM, Christoph Lameter wrote:
> A static array would have to be dimensioned to NR_CPUS. That is one thing
> we are trying to avoid.
>
> The assignment of the pageset "percpu" pointer does not mean that the pcp
> is usable. It must first be properly initialized through setup_pageset().
>
> setup_pageset() is run for each cpu. zone->pageset is the same for all
> cpus; that is why it is set in zone_pcp_init() and not in setup_pageset().
>
> The boot pageset initialization was moved into __build_all_zonelists(). We
> could move the zone->pageset initialization there too?

Maybe that is a bit less scary. (at least for me :-) The reason why
I'm a bit worried is that different architectures handle percpu
pointers differently before setup_per_cpu_areas(). x86 sets up the
offsets and stuff such that cpu0 can access the original percpu
section in the kernel image. ia64 sets up everything properly way
before setup_per_cpu_areas() and in some archs percpu pointers are
completely invalid before setup_per_cpu_areas(). So, percpu pointer
being handled in generic code which is being called before percpu
setup is a bit worrying.

Another thing is that there have been attempts to simplify the memory
initialization stages such that bootmem is removed and the page / k*
allocators can be used earlier, which kind of puts the percpu allocator in
the dependency loop, but I don't think it's something we need to worry
about at this point.

Thanks.

--
tejun

2009-12-16 14:57:07

by Christoph Lameter

Subject: Re: [this_cpu_xx V7 1/8] this_cpu_ops: page allocator conversion

On Wed, 16 Dec 2009, Tejun Heo wrote:

> > The boot pageset initialization was moved into __build_all_zonelists(). We
> > could move the zone->pageset initialization there too?
>
> Maybe that is a bit less scary. (at least for me :-) The reason why

Well, sorry, moving the ->pageset assignment to __build_all_zonelists() does
not work since __build_all_zonelists() does not scan through the zones.
zone_pcp_init() is reliably called first for all zones. I think we should
just leave it as is.

> I'm a bit worried is that different architectures handle percpu
> pointers differently before setup_per_cpu_areas(). x86 sets up the
> offsets and stuff such that cpu0 can access the original percpu
> section in the kernel image. ia64 sets up everything properly way
> before setup_per_cpu_areas() and in some archs percpu pointers are
> completely invalid before setup_per_cpu_areas(). So, percpu pointer
> being handled in generic code which is being called before percpu
> setup is a bit worrying.

True but we are not dereferencing a per cpu pointer here. It is simply the
assignment of the unreferenced native per cpu address generated by the
linker. This address is unaffected by allocator bootstrap.
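
Roughly, a percpu pointer is only offset when it is dereferenced; the plain
assignment touches no percpu memory (simplified):

	/* per_cpu_ptr(ptr, cpu) is essentially ptr + per_cpu_offset(cpu) */
	zone->pageset = &per_cpu_var(boot_pageset);	/* linker-generated address, no deref */
	pset = per_cpu_ptr(zone->pageset, cpu);		/* offset applied only here */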

> Another thing is that there were attempts to simplify memory
> initialization stages such that bootmem is removed and page / k*
> allocators can be used earlier which kind of puts percpu allocator in
> the dependency loop but I don't think it's something we need to worry
> about at this point.

We already worried about this earlier. The initialization we are talking
about does not require the per cpu allocator to be up. It just requires
the static per cpu areas to function with a single pcp while the zonelists
are being built.

The assignment of the final per cpu areas (dynamically allocated) occurs
after all the other memory allocators have been bootstrapped.

2009-12-17 00:22:24

by Tejun Heo

Subject: Re: [this_cpu_xx V7 1/8] this_cpu_ops: page allocator conversion

Hello,

On 12/16/2009 11:55 PM, Christoph Lameter wrote:
>>> The boot pageset initialization was moved into __build_all_zonelists(). We
>>> could move the zone->pageset initialization there too?
>>
>> Maybe that is a bit less scary. (at least for me :-) The reason why
>
> Well, sorry, moving the ->pageset assignment to __build_all_zonelists() does
> not work since __build_all_zonelists() does not scan through the zones.
> zone_pcp_init() is reliably called first for all zones. I think we should
> just leave it as is.

Ah, well, alright.

>> I'm a bit worried is that different architectures handle percpu
>> pointers differently before setup_per_cpu_areas(). x86 sets up the
>> offsets and stuff such that cpu0 can access the original percpu
>> section in the kernel image. ia64 sets up everything properly way
>> before setup_per_cpu_areas() and in some archs percpu pointers are
>> completely invalid before setup_per_cpu_areas(). So, percpu pointer
>> being handled in generic code which is being called before percpu
>> setup is a bit worrying.
>
> True but we are not dereferencing a per cpu pointer here. It is simply the
> assignment of the unreferenced native per cpu address generated by the
> linker. This address is unaffected by allocator bootstrap.

Yeap, the code is fine. Probably I'm just being a bit paranoid. If
you don't mind, can you please add a comment pointing out that the
percpu pointer isn't dereferenced before later initialization is done,
which happens after percpu initialization?

> The assignment of the final per cpu areas (dynamically allocated) occurs
> after all the other memory allocators have been bootstrapped.

Great, thanks for the explanation.

--
tejun