2009-03-06 06:48:33

by Tejun Heo

Subject: [GIT PULL] x86, percpu: implement and use reserved percpu alloc


Hello,

Please pull from the following git vector.

git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc.git tj-percpu

This patchset implements reserved percpu alloc from the first chunk
and uses it for static percpu variables in modules on x86_64. This is
necessary because x86_64 expects symbols to be relocatable using 32bit
relocations but dynamic percpu allocation does not guarantee it,
leading to relocation overflow during module load.

This can happen with any of the x86_64 percpu first chunk allocators,
but the embedding allocator is the most likely to trigger it as the
first chunk is located in the normal kernel memory allocation area
while all the other chunks are allocated in the vmalloc area, which is
very far from the first chunk.

This type of reservation will be necessary for other architectures
too, where the addressing mode used for percpu access uses offsets
with a limited range to access symbols (e.g. ia64 w/ the magic 64k
percpu page).

This problem was first reported and bisected by Mike Galbraith[2].

This patchset is against the current x86/core/percpu[1] and contains
the following patches.

0001-percpu-clean-up-percpu-constants.patch
0002-percpu-cosmetic-renames-in-pcpu_setup_first_chunk.patch
0003-percpu-improve-first-chunk-initial-area-map-handlin.patch
0004-percpu-use-negative-for-auto-for-pcpu_setup_first_c.patch
0005-x86-make-embedding-percpu-allocator-return-excessiv.patch
0006-percpu-add-an-indirection-ptr-for-chunk-page-map-ac.patch
0007-percpu-module-implement-reserved-allocation-and-us.patch
0008-x86-percpu-setup-reserved-percpu-area-for-x86_64.patch

0001-0004 make misc updates to percpu in preparation for later
changes. 0005 fixes a case where excessive memory can be set aside in
the first chunk when using the embedding allocator. 0006-0008
implement reserved allocation and use it for x86_64.
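
For orientation, the case this series addresses is simply a module
with a static percpu variable. The following is a hypothetical
minimal module (names made up, not part of the series); after patch
0007 the module loader serves its DEFINE_PER_CPU variable through
__alloc_reserved_percpu() instead of __alloc_percpu(), so on x86_64 it
lands in the reserved part of the first chunk and stays within 32bit
relocation range.

#include <linux/module.h>
#include <linux/init.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, demo_counter);

static int __init demo_init(void)
{
	int cpu;

	/* touch the per-cpu copies; the storage itself was carved out of
	 * the reserved region of the first chunk at module load time */
	for_each_possible_cpu(cpu)
		per_cpu(demo_counter, cpu) = 0;

	printk(KERN_INFO "demo: cpu0 copy at %p\n", &per_cpu(demo_counter, 0));
	return 0;
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");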

Bryan Wu, can you please review #0001? It contains a small change in
blackfin. It's mostly trivial but just in case.

Diffstat follows.

arch/blackfin/include/asm/percpu.h | 10 -
arch/x86/kernel/setup_percpu.c | 73 +++++++---
include/linux/percpu.h | 60 +++-----
kernel/module.c | 2
mm/percpu.c | 267 ++++++++++++++++++++++++++-----------
5 files changed, 274 insertions(+), 138 deletions(-)

Thanks.

--
tejun

[1] f254f3909efaf59ca2d0f408de2d044dace60706
[2] http://thread.gmane.org/gmane.linux.kernel/801799


2009-03-06 06:47:52

by Tejun Heo

Subject: [PATCH 5/8] x86: make embedding percpu allocator return excessive free space

Impact: reduce unnecessary memory usage on certain configurations

The embedding percpu allocator allocates unit_size *
num_possible_cpus() bytes consecutively and uses it for the first
chunk. However, if the static area is small, this can result in
excessive preallocated free space in the first chunk due to the
PCPU_MIN_UNIT_SIZE restriction.

This patch makes the embedding percpu allocator preallocate only
what's necessary as described by PERCPU_DYNAMIC_RESERVE and return the
leftover to the bootmem allocator.
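
To see the effect, here is a small standalone sketch of the sizing
arithmetic (userspace, illustrative only; the static size is a made-up
example value, the constants are the ones defined elsewhere in this
series):

#include <stdio.h>

#define PAGE_SIZE		4096UL
#define PFN_ALIGN(x)		(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define PCPU_MIN_UNIT_SIZE	(64UL << 10)	/* from patch 1 */
#define PERCPU_DYNAMIC_RESERVE	(24UL << 10)	/* 64bit w/ modules, before patch 8 */

int main(void)
{
	unsigned long static_size = 20UL << 10;	/* hypothetical static percpu size */
	unsigned long pcpue_size  = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
	unsigned long unit_size   = pcpue_size > PCPU_MIN_UNIT_SIZE ?
					pcpue_size : PCPU_MIN_UNIT_SIZE;

	/* per-cpu bytes handed back to bootmem by this patch */
	printf("pcpue_size=%luk unit_size=%luk leftover=%luk\n",
	       pcpue_size >> 10, unit_size >> 10, (unit_size - pcpue_size) >> 10);
	return 0;
}

With these example numbers the first chunk used to set aside 64k per
cpu while only 44k is needed; the remaining 20k per cpu now goes back
to bootmem.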

Signed-off-by: Tejun Heo <[email protected]>
---
arch/x86/kernel/setup_percpu.c | 44 +++++++++++++++++++++++++--------------
1 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index ef3a2cd..38e2b2a 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -241,24 +241,31 @@ static ssize_t __init setup_pcpu_remap(size_t static_size)
* Embedding allocator
*
* The first chunk is sized to just contain the static area plus
- * PERCPU_DYNAMIC_RESERVE and allocated as a contiguous area using
- * bootmem allocator and used as-is without being mapped into vmalloc
- * area. This enables the first chunk to piggy back on the linear
- * physical PMD mapping and doesn't add any additional pressure to
- * TLB.
+ * module and dynamic reserves, and allocated as a contiguous area
+ * using bootmem allocator and used as-is without being mapped into
+ * vmalloc area. This enables the first chunk to piggy back on the
+ * linear physical PMD mapping and doesn't add any additional pressure
+ * to TLB. Note that if the needed size is smaller than the minimum
+ * unit size, the leftover is returned to the bootmem allocator.
*/
static void *pcpue_ptr __initdata;
+static size_t pcpue_size __initdata;
static size_t pcpue_unit_size __initdata;

static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
{
- return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size
- + ((size_t)pageno << PAGE_SHIFT));
+ size_t off = (size_t)pageno << PAGE_SHIFT;
+
+ if (off >= pcpue_size)
+ return NULL;
+
+ return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off);
}

static ssize_t __init setup_pcpu_embed(size_t static_size)
{
unsigned int cpu;
+ size_t dyn_size;

/*
* If large page isn't supported, there's no benefit in doing
@@ -269,25 +276,30 @@ static ssize_t __init setup_pcpu_embed(size_t static_size)
return -EINVAL;

/* allocate and copy */
- pcpue_unit_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
- pcpue_unit_size = max_t(size_t, pcpue_unit_size, PCPU_MIN_UNIT_SIZE);
+ pcpue_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
+ pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
+ dyn_size = pcpue_size - static_size;
+
pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size,
PAGE_SIZE);
if (!pcpue_ptr)
return -ENOMEM;

- for_each_possible_cpu(cpu)
- memcpy(pcpue_ptr + cpu * pcpue_unit_size, __per_cpu_load,
- static_size);
+ for_each_possible_cpu(cpu) {
+ void *ptr = pcpue_ptr + cpu * pcpue_unit_size;
+
+ free_bootmem(__pa(ptr + pcpue_size),
+ pcpue_unit_size - pcpue_size);
+ memcpy(ptr, __per_cpu_load, static_size);
+ }

/* we're ready, commit */
pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
- pcpue_unit_size >> PAGE_SHIFT, pcpue_ptr, static_size);
+ pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);

return pcpu_setup_first_chunk(pcpue_get_page, static_size,
- pcpue_unit_size,
- pcpue_unit_size - static_size, pcpue_ptr,
- NULL);
+ pcpue_unit_size, dyn_size,
+ pcpue_ptr, NULL);
}

/*
--
1.6.0.2

2009-03-06 06:48:16

by Tejun Heo

Subject: [PATCH 1/8] percpu: clean up percpu constants

Impact: cleanup

Make the following cleanups.

* There isn't much arch-specific about PERCPU_MODULE_RESERVE. Always
define it whether arch overrides PERCPU_ENOUGH_ROOM or not.

* blackfin overrides PERCPU_ENOUGH_ROOM to align static area size. Do
it by default.

* percpu allocation sizes don't have much to do with the page size.
  Don't use PAGE_SHIFT in their definition (illustrated below).
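
To illustrate the last point (hypothetical page size, userspace sketch
only): with 64k pages the old page-shift-based definition balloons
while the new one stays put.

#include <stdio.h>

int main(void)
{
	unsigned long page_shift = 16;			/* hypothetical 64k pages */
	unsigned long page_size  = 1UL << page_shift;
	unsigned long old_min    = 16UL << page_shift;	/* old: 16UL << PAGE_SHIFT */
	unsigned long new_min    = ((64UL << 10) + page_size - 1) &
				   ~(page_size - 1);	/* new: PFN_ALIGN(64 << 10) */

	printf("old PCPU_MIN_UNIT_SIZE = %luk\n", old_min >> 10);	/* 1024k */
	printf("new PCPU_MIN_UNIT_SIZE = %luk\n", new_min >> 10);	/*   64k */
	return 0;
}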

Signed-off-by: Tejun Heo <[email protected]>
Cc: Bryan Wu <[email protected]>
---
arch/blackfin/include/asm/percpu.h | 10 ----------
include/linux/percpu.h | 24 +++++++++++++-----------
2 files changed, 13 insertions(+), 21 deletions(-)

diff --git a/arch/blackfin/include/asm/percpu.h b/arch/blackfin/include/asm/percpu.h
index 797c0c1..c94c7bc 100644
--- a/arch/blackfin/include/asm/percpu.h
+++ b/arch/blackfin/include/asm/percpu.h
@@ -3,14 +3,4 @@

#include <asm-generic/percpu.h>

-#ifdef CONFIG_MODULES
-#define PERCPU_MODULE_RESERVE 8192
-#else
-#define PERCPU_MODULE_RESERVE 0
-#endif
-
-#define PERCPU_ENOUGH_ROOM \
- (ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES) + \
- PERCPU_MODULE_RESERVE)
-
#endif /* __ARCH_BLACKFIN_PERCPU__ */
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 545b068..2d34b03 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -5,6 +5,7 @@
#include <linux/slab.h> /* For kmalloc() */
#include <linux/smp.h>
#include <linux/cpumask.h>
+#include <linux/pfn.h>

#include <asm/percpu.h>

@@ -52,17 +53,18 @@
#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)

-/* Enough to cover all DEFINE_PER_CPUs in kernel, including modules. */
-#ifndef PERCPU_ENOUGH_ROOM
+/* enough to cover all DEFINE_PER_CPUs in modules */
#ifdef CONFIG_MODULES
-#define PERCPU_MODULE_RESERVE 8192
+#define PERCPU_MODULE_RESERVE (8 << 10)
#else
-#define PERCPU_MODULE_RESERVE 0
+#define PERCPU_MODULE_RESERVE 0
#endif

+#ifndef PERCPU_ENOUGH_ROOM
#define PERCPU_ENOUGH_ROOM \
- (__per_cpu_end - __per_cpu_start + PERCPU_MODULE_RESERVE)
-#endif /* PERCPU_ENOUGH_ROOM */
+ (ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES) + \
+ PERCPU_MODULE_RESERVE)
+#endif

/*
* Must be an lvalue. Since @var must be a simple identifier,
@@ -79,7 +81,7 @@
#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA

/* minimum unit size, also is the maximum supported allocation size */
-#define PCPU_MIN_UNIT_SIZE (16UL << PAGE_SHIFT)
+#define PCPU_MIN_UNIT_SIZE PFN_ALIGN(64 << 10)

/*
* PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy
@@ -96,15 +98,15 @@
#ifndef PERCPU_DYNAMIC_RESERVE
# if BITS_PER_LONG > 32
# ifdef CONFIG_MODULES
-# define PERCPU_DYNAMIC_RESERVE (6 << PAGE_SHIFT)
+# define PERCPU_DYNAMIC_RESERVE (24 << 10)
# else
-# define PERCPU_DYNAMIC_RESERVE (4 << PAGE_SHIFT)
+# define PERCPU_DYNAMIC_RESERVE (16 << 10)
# endif
# else
# ifdef CONFIG_MODULES
-# define PERCPU_DYNAMIC_RESERVE (4 << PAGE_SHIFT)
+# define PERCPU_DYNAMIC_RESERVE (16 << 10)
# else
-# define PERCPU_DYNAMIC_RESERVE (2 << PAGE_SHIFT)
+# define PERCPU_DYNAMIC_RESERVE (8 << 10)
# endif
# endif
#endif /* PERCPU_DYNAMIC_RESERVE */
--
1.6.0.2

2009-03-06 06:48:51

by Tejun Heo

Subject: [PATCH 3/8] percpu: improve first chunk initial area map handling

Impact: no functional change

When the first chunk is created, its initial area map is not allocated
because kmalloc isn't online yet. The map is allocated and
initialized on the first allocation request on the chunk. This works
fine but the scattering of initialization logic between the init
function and allocation path is a bit confusing.

This patch makes the first chunk initialize and use a minimal
statically allocated map from pcpu_setup_first_chunk(). The map
resizing path still needs to handle this specially but it's more
straightforward and gives more latitude to the init path. This will
ease future changes.
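
As an illustration of the new flow (userspace model, names and sizes
made up; the real code lives in pcpu_setup_first_chunk() and the map
resizing path):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PCPU_DFL_MAP_ALLOC	16	/* default map size */

int main(void)
{
	static int smap[2];		/* boot-time map, not kmalloc'd */
	int static_size = 20 << 10, free_size = 24 << 10;
	int map_used = 0, map_alloc = 2;
	int *map = smap;

	/* what the init path now records up front */
	map[map_used++] = -static_size;	/* negative entry: area in use */
	map[map_used++] = free_size;	/* positive entry: area free    */

	/* first time the map has to grow: a map smaller than the default
	 * size marks a first chunk, so copy instead of realloc'ing */
	if (map_alloc < PCPU_DFL_MAP_ALLOC) {
		int *new = malloc(PCPU_DFL_MAP_ALLOC * sizeof(new[0]));
		if (!new)
			return 1;
		memcpy(new, map, map_used * sizeof(new[0]));
		map = new;
		map_alloc = PCPU_DFL_MAP_ALLOC;
	}

	printf("map_alloc=%d map[0]=%d map[1]=%d\n", map_alloc, map[0], map[1]);
	free(map);			/* map now points at the heap copy */
	return 0;
}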

Signed-off-by: Tejun Heo <[email protected]>
---
mm/percpu.c | 53 +++++++++++++++++++++++++++--------------------------
1 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/mm/percpu.c b/mm/percpu.c
index 9531590..503ccad 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -93,9 +93,6 @@ static size_t pcpu_chunk_struct_size __read_mostly;
void *pcpu_base_addr __read_mostly;
EXPORT_SYMBOL_GPL(pcpu_base_addr);

-/* the size of kernel static area */
-static int pcpu_static_size __read_mostly;
-
/*
* One mutex to rule them all.
*
@@ -316,15 +313,28 @@ static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail)

/* reallocation required? */
if (chunk->map_alloc < target) {
- int new_alloc = chunk->map_alloc;
+ int new_alloc;
int *new;

+ new_alloc = PCPU_DFL_MAP_ALLOC;
while (new_alloc < target)
new_alloc *= 2;

- new = pcpu_realloc(chunk->map,
- chunk->map_alloc * sizeof(new[0]),
- new_alloc * sizeof(new[0]));
+ if (chunk->map_alloc < PCPU_DFL_MAP_ALLOC) {
+ /*
+ * map_alloc smaller than the default size
+ * indicates that the chunk is one of the
+ * first chunks and still using static map.
+ * Allocate a dynamic one and copy.
+ */
+ new = pcpu_realloc(NULL, 0, new_alloc * sizeof(new[0]));
+ if (new)
+ memcpy(new, chunk->map,
+ chunk->map_alloc * sizeof(new[0]));
+ } else
+ new = pcpu_realloc(chunk->map,
+ chunk->map_alloc * sizeof(new[0]),
+ new_alloc * sizeof(new[0]));
if (!new)
return -ENOMEM;

@@ -367,22 +377,6 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
int max_contig = 0;
int i, off;

- /*
- * The static chunk initially doesn't have map attached
- * because kmalloc wasn't available during init. Give it one.
- */
- if (unlikely(!chunk->map)) {
- chunk->map = pcpu_realloc(NULL, 0,
- PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
- if (!chunk->map)
- return -ENOMEM;
-
- chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
- chunk->map[chunk->map_used++] = -pcpu_static_size;
- if (chunk->free_size)
- chunk->map[chunk->map_used++] = chunk->free_size;
- }
-
for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
bool is_last = i + 1 == chunk->map_used;
int head, tail;
@@ -874,12 +868,14 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
pcpu_populate_pte_fn_t populate_pte_fn)
{
static struct vm_struct first_vm;
+ static int smap[2];
struct pcpu_chunk *schunk;
unsigned int cpu;
int nr_pages;
int err, i;

/* santiy checks */
+ BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC);
BUG_ON(!static_size);
BUG_ON(!unit_size && dyn_size);
BUG_ON(unit_size && unit_size < static_size + dyn_size);
@@ -893,7 +889,6 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
PFN_UP(static_size));

- pcpu_static_size = static_size;
pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
@@ -912,14 +907,20 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
schunk = alloc_bootmem(pcpu_chunk_struct_size);
INIT_LIST_HEAD(&schunk->list);
schunk->vm = &first_vm;
+ schunk->map = smap;
+ schunk->map_alloc = ARRAY_SIZE(smap);

if (dyn_size)
schunk->free_size = dyn_size;
else
- schunk->free_size = pcpu_unit_size - pcpu_static_size;
+ schunk->free_size = pcpu_unit_size - static_size;

schunk->contig_hint = schunk->free_size;

+ schunk->map[schunk->map_used++] = -static_size;
+ if (schunk->free_size)
+ schunk->map[schunk->map_used++] = schunk->free_size;
+
/* allocate vm address */
first_vm.flags = VM_ALLOC;
first_vm.size = pcpu_chunk_size;
@@ -948,7 +949,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
*pcpu_chunk_pagep(schunk, cpu, i) = page;
}

- BUG_ON(i < PFN_UP(pcpu_static_size));
+ BUG_ON(i < PFN_UP(static_size));

if (nr_pages < 0)
nr_pages = i;
--
1.6.0.2

2009-03-06 06:49:16

by Tejun Heo

Subject: [PATCH 4/8] percpu: use negative for auto for pcpu_setup_first_chunk() arguments

Impact: argument semantic cleanup

In pcpu_setup_first_chunk(), zero @unit_size and @dyn_size meant
auto-sizing. That's okay for @unit_size as 0 doesn't make sense, but a
dynamic reserve size of 0 is valid. Also, if an arch calculates
@dyn_size from other parameters, it might end up passing in 0
@dyn_size and malfunction when the size is automatically adjusted.

This patch makes both @unit_size and @dyn_size ssize_t and use -1 for
auto sizing.

Signed-off-by: Tejun Heo <[email protected]>
---
arch/x86/kernel/setup_percpu.c | 2 +-
include/linux/percpu.h | 5 ++-
mm/percpu.c | 46 +++++++++++++++++++++------------------
3 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index c29f301..ef3a2cd 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -344,7 +344,7 @@ static ssize_t __init setup_pcpu_4k(size_t static_size)
pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
pcpu4k_nr_static_pages, static_size);

- ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, 0, NULL,
+ ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, -1, -1, NULL,
pcpu4k_populate_pte);
goto out_free_ar;

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index a0b4ea2..a96fc53 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -117,8 +117,9 @@ typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno);
typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr);

extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
- size_t static_size, size_t unit_size,
- size_t dyn_size, void *base_addr,
+ size_t static_size,
+ ssize_t unit_size, ssize_t dyn_size,
+ void *base_addr,
pcpu_populate_pte_fn_t populate_pte_fn);

/*
diff --git a/mm/percpu.c b/mm/percpu.c
index 503ccad..a84cf99 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -824,8 +824,8 @@ EXPORT_SYMBOL_GPL(free_percpu);
* pcpu_setup_first_chunk - initialize the first percpu chunk
* @get_page_fn: callback to fetch page pointer
* @static_size: the size of static percpu area in bytes
- * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto
- * @dyn_size: free size for dynamic allocation in bytes, 0 for auto
+ * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
+ * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
* @base_addr: mapped address, NULL for auto
* @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
*
@@ -842,13 +842,14 @@ EXPORT_SYMBOL_GPL(free_percpu);
* indicates end of pages for the cpu. Note that @get_page_fn() must
* return the same number of pages for all cpus.
*
- * @unit_size, if non-zero, determines unit size and must be aligned
- * to PAGE_SIZE and equal to or larger than @static_size + @dyn_size.
+ * @unit_size, if non-negative, specifies unit size and must be
+ * aligned to PAGE_SIZE and equal to or larger than @static_size +
+ * @dyn_size.
*
- * @dyn_size determines the number of free bytes after the static
- * area in the first chunk. If zero, whatever left is available.
- * Specifying non-zero value make percpu leave the area after
- * @static_size + @dyn_size alone.
+ * @dyn_size, if non-negative, limits the number of bytes available
+ * for dynamic allocation in the first chunk. Specifying non-negative
+ * value make percpu leave alone the area beyond @static_size +
+ * @dyn_size.
*
* Non-null @base_addr means that the caller already allocated virtual
* region for the first chunk and mapped it. percpu must not mess
@@ -863,8 +864,9 @@ EXPORT_SYMBOL_GPL(free_percpu);
* percpu access.
*/
size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
- size_t static_size, size_t unit_size,
- size_t dyn_size, void *base_addr,
+ size_t static_size,
+ ssize_t unit_size, ssize_t dyn_size,
+ void *base_addr,
pcpu_populate_pte_fn_t populate_pte_fn)
{
static struct vm_struct first_vm;
@@ -877,13 +879,17 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
/* santiy checks */
BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC);
BUG_ON(!static_size);
- BUG_ON(!unit_size && dyn_size);
- BUG_ON(unit_size && unit_size < static_size + dyn_size);
- BUG_ON(unit_size & ~PAGE_MASK);
- BUG_ON(base_addr && !unit_size);
+ if (unit_size >= 0) {
+ BUG_ON(unit_size < static_size +
+ (dyn_size >= 0 ? dyn_size : 0));
+ BUG_ON(unit_size & ~PAGE_MASK);
+ } else {
+ BUG_ON(dyn_size >= 0);
+ BUG_ON(base_addr);
+ }
BUG_ON(base_addr && populate_pte_fn);

- if (unit_size)
+ if (unit_size >= 0)
pcpu_unit_pages = unit_size >> PAGE_SHIFT;
else
pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
@@ -894,6 +900,9 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
+ num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);

+ if (dyn_size < 0)
+ dyn_size = pcpu_unit_size - static_size;
+
/*
* Allocate chunk slots. The additional last slot is for
* empty chunks.
@@ -909,12 +918,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
schunk->vm = &first_vm;
schunk->map = smap;
schunk->map_alloc = ARRAY_SIZE(smap);
-
- if (dyn_size)
- schunk->free_size = dyn_size;
- else
- schunk->free_size = pcpu_unit_size - static_size;
-
+ schunk->free_size = dyn_size;
schunk->contig_hint = schunk->free_size;

schunk->map[schunk->map_used++] = -static_size;
--
1.6.0.2

2009-03-06 06:49:33

by Tejun Heo

Subject: [PATCH 2/8] percpu: cosmetic renames in pcpu_setup_first_chunk()

Impact: cosmetic, preparation for future changes

Make the following renames in pcpu_setup_first_chunk() in preparation
for future changes.

* s/free_size/dyn_size/
* s/static_vm/first_vm/
* s/static_chunk/schunk/

Signed-off-by: Tejun Heo <[email protected]>
---
include/linux/percpu.h | 2 +-
mm/percpu.c | 58 ++++++++++++++++++++++++------------------------
2 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 2d34b03..a0b4ea2 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -118,7 +118,7 @@ typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr);

extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
size_t static_size, size_t unit_size,
- size_t free_size, void *base_addr,
+ size_t dyn_size, void *base_addr,
pcpu_populate_pte_fn_t populate_pte_fn);

/*
diff --git a/mm/percpu.c b/mm/percpu.c
index 3d0f545..9531590 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -831,7 +831,7 @@ EXPORT_SYMBOL_GPL(free_percpu);
* @get_page_fn: callback to fetch page pointer
* @static_size: the size of static percpu area in bytes
* @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto
- * @free_size: free size in bytes, 0 for auto
+ * @dyn_size: free size for dynamic allocation in bytes, 0 for auto
* @base_addr: mapped address, NULL for auto
* @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
*
@@ -849,12 +849,12 @@ EXPORT_SYMBOL_GPL(free_percpu);
* return the same number of pages for all cpus.
*
* @unit_size, if non-zero, determines unit size and must be aligned
- * to PAGE_SIZE and equal to or larger than @static_size + @free_size.
+ * to PAGE_SIZE and equal to or larger than @static_size + @dyn_size.
*
- * @free_size determines the number of free bytes after the static
+ * @dyn_size determines the number of free bytes after the static
* area in the first chunk. If zero, whatever left is available.
* Specifying non-zero value make percpu leave the area after
- * @static_size + @free_size alone.
+ * @static_size + @dyn_size alone.
*
* Non-null @base_addr means that the caller already allocated virtual
* region for the first chunk and mapped it. percpu must not mess
@@ -870,19 +870,19 @@ EXPORT_SYMBOL_GPL(free_percpu);
*/
size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
size_t static_size, size_t unit_size,
- size_t free_size, void *base_addr,
+ size_t dyn_size, void *base_addr,
pcpu_populate_pte_fn_t populate_pte_fn)
{
- static struct vm_struct static_vm;
- struct pcpu_chunk *static_chunk;
+ static struct vm_struct first_vm;
+ struct pcpu_chunk *schunk;
unsigned int cpu;
int nr_pages;
int err, i;

/* santiy checks */
BUG_ON(!static_size);
- BUG_ON(!unit_size && free_size);
- BUG_ON(unit_size && unit_size < static_size + free_size);
+ BUG_ON(!unit_size && dyn_size);
+ BUG_ON(unit_size && unit_size < static_size + dyn_size);
BUG_ON(unit_size & ~PAGE_MASK);
BUG_ON(base_addr && !unit_size);
BUG_ON(base_addr && populate_pte_fn);
@@ -908,24 +908,24 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
for (i = 0; i < pcpu_nr_slots; i++)
INIT_LIST_HEAD(&pcpu_slot[i]);

- /* init static_chunk */
- static_chunk = alloc_bootmem(pcpu_chunk_struct_size);
- INIT_LIST_HEAD(&static_chunk->list);
- static_chunk->vm = &static_vm;
+ /* init static chunk */
+ schunk = alloc_bootmem(pcpu_chunk_struct_size);
+ INIT_LIST_HEAD(&schunk->list);
+ schunk->vm = &first_vm;

- if (free_size)
- static_chunk->free_size = free_size;
+ if (dyn_size)
+ schunk->free_size = dyn_size;
else
- static_chunk->free_size = pcpu_unit_size - pcpu_static_size;
+ schunk->free_size = pcpu_unit_size - pcpu_static_size;

- static_chunk->contig_hint = static_chunk->free_size;
+ schunk->contig_hint = schunk->free_size;

/* allocate vm address */
- static_vm.flags = VM_ALLOC;
- static_vm.size = pcpu_chunk_size;
+ first_vm.flags = VM_ALLOC;
+ first_vm.size = pcpu_chunk_size;

if (!base_addr)
- vm_area_register_early(&static_vm, PAGE_SIZE);
+ vm_area_register_early(&first_vm, PAGE_SIZE);
else {
/*
* Pages already mapped. No need to remap into
@@ -933,8 +933,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
* be mapped or unmapped by percpu and is marked
* immutable.
*/
- static_vm.addr = base_addr;
- static_chunk->immutable = true;
+ first_vm.addr = base_addr;
+ schunk->immutable = true;
}

/* assign pages */
@@ -945,7 +945,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,

if (!page)
break;
- *pcpu_chunk_pagep(static_chunk, cpu, i) = page;
+ *pcpu_chunk_pagep(schunk, cpu, i) = page;
}

BUG_ON(i < PFN_UP(pcpu_static_size));
@@ -960,20 +960,20 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
if (populate_pte_fn) {
for_each_possible_cpu(cpu)
for (i = 0; i < nr_pages; i++)
- populate_pte_fn(pcpu_chunk_addr(static_chunk,
+ populate_pte_fn(pcpu_chunk_addr(schunk,
cpu, i));

- err = pcpu_map(static_chunk, 0, nr_pages);
+ err = pcpu_map(schunk, 0, nr_pages);
if (err)
panic("failed to setup static percpu area, err=%d\n",
err);
}

- /* link static_chunk in */
- pcpu_chunk_relocate(static_chunk, -1);
- pcpu_chunk_addr_insert(static_chunk);
+ /* link the first chunk in */
+ pcpu_chunk_relocate(schunk, -1);
+ pcpu_chunk_addr_insert(schunk);

/* we're done */
- pcpu_base_addr = (void *)pcpu_chunk_addr(static_chunk, 0, 0);
+ pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
return pcpu_unit_size;
}
--
1.6.0.2

2009-03-06 06:49:48

by Tejun Heo

Subject: [PATCH 8/8] x86, percpu: setup reserved percpu area for x86_64

Impact: fix relocation overflow during module load

x86_64 uses 32bit relocations for symbol access, so static percpu
symbols, whether in core or modules, must be inside 2GB of the percpu
segment base, which the dynamic percpu allocator doesn't guarantee.
This patch makes x86_64 reserve PERCPU_MODULE_RESERVE bytes in the
first chunk so that module percpu areas are always allocated from the
first chunk, which is always inside the relocatable range.

This problem exists for any percpu allocator but is easily triggered
when using the embedding allocator because with it the second chunk is
located beyond 2GB.

This patch also changes the meaning of PERCPU_DYNAMIC_RESERVE such
that it only indicates the size of the area to reserve for dynamic
allocation, as the static and dynamic areas can now be separate. The
new PERCPU_DYNAMIC_RESERVE is increased by 4k for both 32 and 64bits
as the reserved area separation eats away some allocatable space and
having slightly more headroom (currently between 4 and 8k after a
minimal boot sans module area) makes sense for common case
performance.

x86_32 can address anywhere from anywhere and doesn't need such a
reservation.

Mike Galbraith first reported the problem and bisected it to the
embedding percpu allocator commit.
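
A sketch of the resulting first chunk layout on x86_64 with the embed
allocator (userspace, illustrative only; the static size is a made-up
value, the constants are the ones from this series):

#include <stdio.h>

#define PAGE_SIZE		4096UL
#define PFN_ALIGN(x)		(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define PERCPU_MODULE_RESERVE	(8UL << 10)
#define PERCPU_DYNAMIC_RESERVE	(20UL << 10)	/* 64bit value after this patch */

int main(void)
{
	unsigned long static_size = 20UL << 10;	/* hypothetical */
	unsigned long reserve     = PERCPU_MODULE_RESERVE;
	unsigned long pcpue_size  = PFN_ALIGN(static_size + reserve +
					      PERCPU_DYNAMIC_RESERVE);
	unsigned long dyn_size    = pcpue_size - static_size - reserve;

	printf("unit: [0, %luk) static | [%luk, %luk) module reserve | "
	       "[%luk, %luk) dynamic (%luk)\n",
	       static_size >> 10, static_size >> 10,
	       (static_size + reserve) >> 10, (static_size + reserve) >> 10,
	       pcpue_size >> 10, dyn_size >> 10);
	return 0;
}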

Signed-off-by: Tejun Heo <[email protected]>
Reported-by: Mike Galbraith <[email protected]>
Reported-by: Jaswinder Singh Rajput <[email protected]>
---
arch/x86/kernel/setup_percpu.c | 37 ++++++++++++++++++++++++++++---------
include/linux/percpu.h | 35 ++++++++++++-----------------------
2 files changed, 40 insertions(+), 32 deletions(-)

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index dd4eabc..efa615f 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -42,6 +42,19 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
};
EXPORT_SYMBOL(__per_cpu_offset);

+/*
+ * On x86_64 symbols referenced from code should be reachable using
+ * 32bit relocations. Reserve space for static percpu variables in
+ * modules so that they are always served from the first chunk which
+ * is located at the percpu segment base. On x86_32, anything can
+ * address anywhere. No need to reserve space in the first chunk.
+ */
+#ifdef CONFIG_X86_64
+#define PERCPU_FIRST_CHUNK_RESERVE PERCPU_MODULE_RESERVE
+#else
+#define PERCPU_FIRST_CHUNK_RESERVE 0
+#endif
+
/**
* pcpu_need_numa - determine percpu allocation needs to consider NUMA
*
@@ -141,7 +154,7 @@ static ssize_t __init setup_pcpu_remap(size_t static_size)
{
static struct vm_struct vm;
pg_data_t *last;
- size_t ptrs_size;
+ size_t ptrs_size, dyn_size;
unsigned int cpu;
ssize_t ret;

@@ -169,12 +182,14 @@ proceed:
* Currently supports only single page. Supporting multiple
* pages won't be too difficult if it ever becomes necessary.
*/
- pcpur_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
+ pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
+ PERCPU_DYNAMIC_RESERVE);
if (pcpur_size > PMD_SIZE) {
pr_warning("PERCPU: static data is larger than large page, "
"can't use large page\n");
return -EINVAL;
}
+ dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;

/* allocate pointer array and alloc large pages */
ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
@@ -217,8 +232,9 @@ proceed:
pr_info("PERCPU: Remapped at %p with large pages, static data "
"%zu bytes\n", vm.addr, static_size);

- ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, 0, PMD_SIZE,
- pcpur_size - static_size, vm.addr, NULL);
+ ret = pcpu_setup_first_chunk(pcpur_get_page, static_size,
+ PERCPU_FIRST_CHUNK_RESERVE,
+ PMD_SIZE, dyn_size, vm.addr, NULL);
goto out_free_ar;

enomem:
@@ -276,9 +292,10 @@ static ssize_t __init setup_pcpu_embed(size_t static_size)
return -EINVAL;

/* allocate and copy */
- pcpue_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
+ pcpue_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
+ PERCPU_DYNAMIC_RESERVE);
pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
- dyn_size = pcpue_size - static_size;
+ dyn_size = pcpue_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;

pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size,
PAGE_SIZE);
@@ -297,7 +314,8 @@ static ssize_t __init setup_pcpu_embed(size_t static_size)
pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);

- return pcpu_setup_first_chunk(pcpue_get_page, static_size, 0,
+ return pcpu_setup_first_chunk(pcpue_get_page, static_size,
+ PERCPU_FIRST_CHUNK_RESERVE,
pcpue_unit_size, dyn_size,
pcpue_ptr, NULL);
}
@@ -356,8 +374,9 @@ static ssize_t __init setup_pcpu_4k(size_t static_size)
pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
pcpu4k_nr_static_pages, static_size);

- ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, -1, -1,
- NULL, pcpu4k_populate_pte);
+ ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
+ PERCPU_FIRST_CHUNK_RESERVE, -1, -1, NULL,
+ pcpu4k_populate_pte);
goto out_free_ar;

enomem:
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 8ff1515..54a968b 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -85,31 +85,20 @@

/*
* PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy
- * back on the first chunk if arch is manually allocating and mapping
- * it for faster access (as a part of large page mapping for example).
- * Note that dynamic percpu allocator covers both static and dynamic
- * areas, so these values are bigger than PERCPU_MODULE_RESERVE.
+ * back on the first chunk for dynamic percpu allocation if arch is
+ * manually allocating and mapping it for faster access (as a part of
+ * large page mapping for example).
*
- * On typical configuration with modules, the following values leave
- * about 8k of free space on the first chunk after boot on both x86_32
- * and 64 when module support is enabled. When module support is
- * disabled, it's much tighter.
+ * The following values give between one and two pages of free space
+ * after typical minimal boot (2-way SMP, single disk and NIC) with
+ * both defconfig and a distro config on x86_64 and 32. More
+ * intelligent way to determine this would be nice.
*/
-#ifndef PERCPU_DYNAMIC_RESERVE
-# if BITS_PER_LONG > 32
-# ifdef CONFIG_MODULES
-# define PERCPU_DYNAMIC_RESERVE (24 << 10)
-# else
-# define PERCPU_DYNAMIC_RESERVE (16 << 10)
-# endif
-# else
-# ifdef CONFIG_MODULES
-# define PERCPU_DYNAMIC_RESERVE (16 << 10)
-# else
-# define PERCPU_DYNAMIC_RESERVE (8 << 10)
-# endif
-# endif
-#endif /* PERCPU_DYNAMIC_RESERVE */
+#if BITS_PER_LONG > 32
+#define PERCPU_DYNAMIC_RESERVE (20 << 10)
+#else
+#define PERCPU_DYNAMIC_RESERVE (12 << 10)
+#endif

extern void *pcpu_base_addr;

--
1.6.0.2

2009-03-06 06:50:13

by Tejun Heo

Subject: [PATCH 7/8] percpu, module: implement reserved allocation and use it for module percpu variables

Impact: add reserved allocation functionality and use it for module
percpu variables

This patch implements reserved allocation from the first chunk. When
setting up the first chunk, the arch can ask to set aside a certain
number of bytes right after the core static area, which is then
available only through a separate reserved allocator. This will be
used primarily for module static percpu variables on architectures
with a limited relocation range, to ensure that the module percpu
symbols are inside the relocatable range.

If a reserved area is requested, the first chunk becomes reserved and
isn't available for regular allocation. If the first chunk also
includes a piggy-back dynamic allocation area, a separate chunk
mapping the same region is created to serve dynamic allocation. The
first one is called the static first chunk and the second the dynamic
first chunk. Although they share the page map, their different area
map initializations guarantee that they serve disjoint areas according
to their purposes.

If the arch doesn't set up a reserved area, reserved allocation is
handled like any other allocation.
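
For illustration, this is how the two area maps carve up a single unit
when a reserved area is requested (userspace sketch, hypothetical
sizes; negative map entries mean in use, positive mean free, matching
the convention in mm/percpu.c):

#include <stdio.h>

int main(void)
{
	int static_size   = 20 << 10;	/* hypothetical core static area */
	int reserved_size =  8 << 10;	/* e.g. PERCPU_MODULE_RESERVE    */
	int dyn_size      = 20 << 10;	/* dynamic area in the same unit */

	/* static first chunk: only the reserved area is allocatable,
	 * and only through the reserved allocator */
	int smap[2] = { -static_size, reserved_size };

	/* dynamic first chunk: everything up to static + reserved is
	 * marked used so the two chunks never hand out the same bytes */
	int dmap[2] = { -(static_size + reserved_size), dyn_size };

	printf("schunk map: { %d, %d }\n", smap[0], smap[1]);
	printf("dchunk map: { %d, %d }\n", dmap[0], dmap[1]);
	return 0;
}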

Signed-off-by: Tejun Heo <[email protected]>
---
arch/x86/kernel/setup_percpu.c | 8 +-
include/linux/percpu.h | 10 ++-
kernel/module.c | 2 +-
mm/percpu.c | 153 ++++++++++++++++++++++++++++++++++-----
4 files changed, 144 insertions(+), 29 deletions(-)

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 38e2b2a..dd4eabc 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -217,7 +217,7 @@ proceed:
pr_info("PERCPU: Remapped at %p with large pages, static data "
"%zu bytes\n", vm.addr, static_size);

- ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE,
+ ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, 0, PMD_SIZE,
pcpur_size - static_size, vm.addr, NULL);
goto out_free_ar;

@@ -297,7 +297,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size)
pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);

- return pcpu_setup_first_chunk(pcpue_get_page, static_size,
+ return pcpu_setup_first_chunk(pcpue_get_page, static_size, 0,
pcpue_unit_size, dyn_size,
pcpue_ptr, NULL);
}
@@ -356,8 +356,8 @@ static ssize_t __init setup_pcpu_4k(size_t static_size)
pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
pcpu4k_nr_static_pages, static_size);

- ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, -1, -1, NULL,
- pcpu4k_populate_pte);
+ ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, -1, -1,
+ NULL, pcpu4k_populate_pte);
goto out_free_ar;

enomem:
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index a96fc53..8ff1515 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -117,10 +117,10 @@ typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno);
typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr);

extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
- size_t static_size,
- ssize_t unit_size, ssize_t dyn_size,
- void *base_addr,
- pcpu_populate_pte_fn_t populate_pte_fn);
+ size_t static_size, size_t reserved_size,
+ ssize_t unit_size, ssize_t dyn_size,
+ void *base_addr,
+ pcpu_populate_pte_fn_t populate_pte_fn);

/*
* Use this to get to a cpu's version of the per-cpu object
@@ -129,6 +129,8 @@ extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
*/
#define per_cpu_ptr(ptr, cpu) SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu)))

+extern void *__alloc_reserved_percpu(size_t size, size_t align);
+
#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */

struct percpu_data {
diff --git a/kernel/module.c b/kernel/module.c
index 1f0657a..f0e04d6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -381,7 +381,7 @@ static void *percpu_modalloc(unsigned long size, unsigned long align,
align = PAGE_SIZE;
}

- ptr = __alloc_percpu(size, align);
+ ptr = __alloc_reserved_percpu(size, align);
if (!ptr)
printk(KERN_WARNING
"Could not allocate %lu bytes percpu data\n", size);
diff --git a/mm/percpu.c b/mm/percpu.c
index 5b47d9f..ef8e169 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -94,6 +94,11 @@ static size_t pcpu_chunk_struct_size __read_mostly;
void *pcpu_base_addr __read_mostly;
EXPORT_SYMBOL_GPL(pcpu_base_addr);

+/* optional reserved chunk, only accessible for reserved allocations */
+static struct pcpu_chunk *pcpu_reserved_chunk;
+/* offset limit of the reserved chunk */
+static int pcpu_reserved_chunk_limit;
+
/*
* One mutex to rule them all.
*
@@ -201,13 +206,14 @@ static void *pcpu_realloc(void *p, size_t size, size_t new_size)
*
* This function is called after an allocation or free changed @chunk.
* New slot according to the changed state is determined and @chunk is
- * moved to the slot.
+ * moved to the slot. Note that the reserved chunk is never put on
+ * chunk slots.
*/
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
int nslot = pcpu_chunk_slot(chunk);

- if (oslot != nslot) {
+ if (chunk != pcpu_reserved_chunk && oslot != nslot) {
if (oslot < nslot)
list_move(&chunk->list, &pcpu_slot[nslot]);
else
@@ -255,6 +261,15 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
struct rb_node *n, *parent;
struct pcpu_chunk *chunk;

+ /* is it in the reserved chunk? */
+ if (pcpu_reserved_chunk) {
+ void *start = pcpu_reserved_chunk->vm->addr;
+
+ if (addr >= start && addr < start + pcpu_reserved_chunk_limit)
+ return pcpu_reserved_chunk;
+ }
+
+ /* nah... search the regular ones */
n = *pcpu_chunk_rb_search(addr, &parent);
if (!n) {
/* no exactly matching chunk, the parent is the closest */
@@ -713,9 +728,10 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
}

/**
- * __alloc_percpu - allocate percpu area
+ * pcpu_alloc - the percpu allocator
* @size: size of area to allocate in bytes
* @align: alignment of area (max PAGE_SIZE)
+ * @reserved: allocate from the reserved chunk if available
*
* Allocate percpu area of @size bytes aligned at @align. Might
* sleep. Might trigger writeouts.
@@ -723,7 +739,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
* RETURNS:
* Percpu pointer to the allocated area on success, NULL on failure.
*/
-void *__alloc_percpu(size_t size, size_t align)
+static void *pcpu_alloc(size_t size, size_t align, bool reserved)
{
void *ptr = NULL;
struct pcpu_chunk *chunk;
@@ -737,7 +753,18 @@ void *__alloc_percpu(size_t size, size_t align)

mutex_lock(&pcpu_mutex);

- /* allocate area */
+ /* serve reserved allocations from the reserved chunk if available */
+ if (reserved && pcpu_reserved_chunk) {
+ chunk = pcpu_reserved_chunk;
+ if (size > chunk->contig_hint)
+ goto out_unlock;
+ off = pcpu_alloc_area(chunk, size, align);
+ if (off >= 0)
+ goto area_found;
+ goto out_unlock;
+ }
+
+ /* search through normal chunks */
for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
list_for_each_entry(chunk, &pcpu_slot[slot], list) {
if (size > chunk->contig_hint)
@@ -773,8 +800,41 @@ out_unlock:
mutex_unlock(&pcpu_mutex);
return ptr;
}
+
+/**
+ * __alloc_percpu - allocate dynamic percpu area
+ * @size: size of area to allocate in bytes
+ * @align: alignment of area (max PAGE_SIZE)
+ *
+ * Allocate percpu area of @size bytes aligned at @align. Might
+ * sleep. Might trigger writeouts.
+ *
+ * RETURNS:
+ * Percpu pointer to the allocated area on success, NULL on failure.
+ */
+void *__alloc_percpu(size_t size, size_t align)
+{
+ return pcpu_alloc(size, align, false);
+}
EXPORT_SYMBOL_GPL(__alloc_percpu);

+/**
+ * __alloc_reserved_percpu - allocate reserved percpu area
+ * @size: size of area to allocate in bytes
+ * @align: alignment of area (max PAGE_SIZE)
+ *
+ * Allocate percpu area of @size bytes aligned at @align from reserved
+ * percpu area if arch has set it up; otherwise, allocation is served
+ * from the same dynamic area. Might sleep. Might trigger writeouts.
+ *
+ * RETURNS:
+ * Percpu pointer to the allocated area on success, NULL on failure.
+ */
+void *__alloc_reserved_percpu(size_t size, size_t align)
+{
+ return pcpu_alloc(size, align, true);
+}
+
static void pcpu_kill_chunk(struct pcpu_chunk *chunk)
{
WARN_ON(chunk->immutable);
@@ -826,6 +886,7 @@ EXPORT_SYMBOL_GPL(free_percpu);
* pcpu_setup_first_chunk - initialize the first percpu chunk
* @get_page_fn: callback to fetch page pointer
* @static_size: the size of static percpu area in bytes
+ * @reserved_size: the size of reserved percpu area in bytes
* @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
* @dyn_size: free size for dynamic allocation in bytes, -1 for auto
* @base_addr: mapped address, NULL for auto
@@ -844,14 +905,22 @@ EXPORT_SYMBOL_GPL(free_percpu);
* indicates end of pages for the cpu. Note that @get_page_fn() must
* return the same number of pages for all cpus.
*
+ * @reserved_size, if non-zero, specifies the amount of bytes to
+ * reserve after the static area in the first chunk. This reserves
+ * the first chunk such that it's available only through reserved
+ * percpu allocation. This is primarily used to serve module percpu
+ * static areas on architectures where the addressing model has
+ * limited offset range for symbol relocations to guarantee module
+ * percpu symbols fall inside the relocatable range.
+ *
* @unit_size, if non-negative, specifies unit size and must be
* aligned to PAGE_SIZE and equal to or larger than @static_size +
- * @dyn_size.
+ * @reserved_size + @dyn_size.
*
* @dyn_size, if non-negative, limits the number of bytes available
* for dynamic allocation in the first chunk. Specifying non-negative
* value make percpu leave alone the area beyond @static_size +
- * @dyn_size.
+ * @reserved_size + @dyn_size.
*
* Non-null @base_addr means that the caller already allocated virtual
* region for the first chunk and mapped it. percpu must not mess
@@ -861,28 +930,36 @@ EXPORT_SYMBOL_GPL(free_percpu);
* @populate_pte_fn is used to populate the pagetable. NULL means the
* caller already populated the pagetable.
*
+ * If the first chunk ends up with both reserved and dynamic areas, it
+ * is served by two chunks - one to serve the core static and reserved
+ * areas and the other for the dynamic area. They share the same vm
+ * and page map but uses different area allocation map to stay away
+ * from each other. The latter chunk is circulated in the chunk slots
+ * and available for dynamic allocation like any other chunks.
+ *
* RETURNS:
* The determined pcpu_unit_size which can be used to initialize
* percpu access.
*/
size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
- size_t static_size,
+ size_t static_size, size_t reserved_size,
ssize_t unit_size, ssize_t dyn_size,
void *base_addr,
pcpu_populate_pte_fn_t populate_pte_fn)
{
static struct vm_struct first_vm;
- static int smap[2];
- struct pcpu_chunk *schunk;
+ static int smap[2], dmap[2];
+ struct pcpu_chunk *schunk, *dchunk = NULL;
unsigned int cpu;
int nr_pages;
int err, i;

/* santiy checks */
- BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC);
+ BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
+ ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
BUG_ON(!static_size);
if (unit_size >= 0) {
- BUG_ON(unit_size < static_size +
+ BUG_ON(unit_size < static_size + reserved_size +
(dyn_size >= 0 ? dyn_size : 0));
BUG_ON(unit_size & ~PAGE_MASK);
} else {
@@ -895,7 +972,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
pcpu_unit_pages = unit_size >> PAGE_SHIFT;
else
pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
- PFN_UP(static_size));
+ PFN_UP(static_size + reserved_size));

pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
@@ -903,7 +980,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
+ num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);

if (dyn_size < 0)
- dyn_size = pcpu_unit_size - static_size;
+ dyn_size = pcpu_unit_size - static_size - reserved_size;

/*
* Allocate chunk slots. The additional last slot is for
@@ -914,20 +991,49 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
for (i = 0; i < pcpu_nr_slots; i++)
INIT_LIST_HEAD(&pcpu_slot[i]);

- /* init static chunk */
+ /*
+ * Initialize static chunk. If reserved_size is zero, the
+ * static chunk covers static area + dynamic allocation area
+ * in the first chunk. If reserved_size is not zero, it
+ * covers static area + reserved area (mostly used for module
+ * static percpu allocation).
+ */
schunk = alloc_bootmem(pcpu_chunk_struct_size);
INIT_LIST_HEAD(&schunk->list);
schunk->vm = &first_vm;
schunk->map = smap;
schunk->map_alloc = ARRAY_SIZE(smap);
schunk->page = schunk->page_ar;
- schunk->free_size = dyn_size;
+
+ if (reserved_size) {
+ schunk->free_size = reserved_size;
+ pcpu_reserved_chunk = schunk; /* not for dynamic alloc */
+ } else {
+ schunk->free_size = dyn_size;
+ dyn_size = 0; /* dynamic area covered */
+ }
schunk->contig_hint = schunk->free_size;

schunk->map[schunk->map_used++] = -static_size;
if (schunk->free_size)
schunk->map[schunk->map_used++] = schunk->free_size;

+ pcpu_reserved_chunk_limit = static_size + schunk->free_size;
+
+ /* init dynamic chunk if necessary */
+ if (dyn_size) {
+ dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
+ INIT_LIST_HEAD(&dchunk->list);
+ dchunk->vm = &first_vm;
+ dchunk->map = dmap;
+ dchunk->map_alloc = ARRAY_SIZE(dmap);
+ dchunk->page = schunk->page_ar; /* share page map with schunk */
+
+ dchunk->contig_hint = dchunk->free_size = dyn_size;
+ dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
+ dchunk->map[dchunk->map_used++] = dchunk->free_size;
+ }
+
/* allocate vm address */
first_vm.flags = VM_ALLOC;
first_vm.size = pcpu_chunk_size;
@@ -937,12 +1043,14 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
else {
/*
* Pages already mapped. No need to remap into
- * vmalloc area. In this case the static chunk can't
- * be mapped or unmapped by percpu and is marked
+ * vmalloc area. In this case the first chunks can't
+ * be mapped or unmapped by percpu and are marked
* immutable.
*/
first_vm.addr = base_addr;
schunk->immutable = true;
+ if (dchunk)
+ dchunk->immutable = true;
}

/* assign pages */
@@ -978,8 +1086,13 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
}

/* link the first chunk in */
- pcpu_chunk_relocate(schunk, -1);
- pcpu_chunk_addr_insert(schunk);
+ if (!dchunk) {
+ pcpu_chunk_relocate(schunk, -1);
+ pcpu_chunk_addr_insert(schunk);
+ } else {
+ pcpu_chunk_relocate(dchunk, -1);
+ pcpu_chunk_addr_insert(dchunk);
+ }

/* we're done */
pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
--
1.6.0.2

2009-03-06 06:50:36

by Tejun Heo

Subject: [PATCH 6/8] percpu: add an indirection ptr for chunk page map access

Impact: allow sharing page map, no functional difference yet

Make chunk->page access indirect by adding a pointer and renaming the
actual array to page_ar. This will be used by future changes.

Signed-off-by: Tejun Heo <[email protected]>
---
mm/percpu.c | 5 ++++-
1 files changed, 4 insertions(+), 1 deletions(-)

diff --git a/mm/percpu.c b/mm/percpu.c
index a84cf99..5b47d9f 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -80,7 +80,8 @@ struct pcpu_chunk {
int map_alloc; /* # of map entries allocated */
int *map; /* allocation map */
bool immutable; /* no [de]population allowed */
- struct page *page[]; /* #cpus * UNIT_PAGES */
+ struct page **page; /* points to page array */
+ struct page *page_ar[]; /* #cpus * UNIT_PAGES */
};

static int pcpu_unit_pages __read_mostly;
@@ -696,6 +697,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
chunk->map[chunk->map_used++] = pcpu_unit_size;
+ chunk->page = chunk->page_ar;

chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
if (!chunk->vm) {
@@ -918,6 +920,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
schunk->vm = &first_vm;
schunk->map = smap;
schunk->map_alloc = ARRAY_SIZE(smap);
+ schunk->page = schunk->page_ar;
schunk->free_size = dyn_size;
schunk->contig_hint = schunk->free_size;

--
1.6.0.2

2009-03-06 07:29:58

by Mike Galbraith

Subject: Re: [GIT PULL] x86, percpu: implement and use reserved percpu alloc

On Fri, 2009-03-06 at 15:46 +0900, Tejun Heo wrote:
> Hello,
>
> Please pull from the following git vector.
>
> git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc.git tj-percpu
>
> This patchset implements reserved percpu alloc from the first chunk
> and uses it for static percpu variables in modules on x86_64. This is
> necessary because x86_64 expects symbols to be relocatable using 32bit
> relocations but dynamic percpu allocation does not guarantee it
> leading to relocation overflow during module load.

Tested, my module loading woes are cured.

-Mike

2009-03-06 08:06:59

by Ingo Molnar

Subject: Re: [GIT PULL] x86, percpu: implement and use reserved percpu alloc


* Tejun Heo <[email protected]> wrote:

> Hello,
>
> Please pull from the following git vector.
>
> git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc.git tj-percpu
>
> This patchset implements reserved percpu alloc from the first
> chunk and uses it for static percpu variables in modules on
> x86_64. This is necessary because x86_64 expects symbols to
> be relocatable using 32bit relocations but dynamic percpu
> allocation does not guarantee it leading to relocation
> overflow during module load.
>
> This can happen to any of the x86_64 percpu first chunk
> allocators but the embedding allocator is most likely to
> trigger it as the first chunk is located in the normal kernel
> memory allocation area while all the other chunks will be
> allocated in the vmalloc area which is very far from the first
> chunk.
>
> This type of reservation will be necessary for other
> architectures too where addressing mode used for percpu access
> uses offsets with limited range to access symbols (e.g. ia64
> w/ the magic 64k percpu page).
>
> This problem was first reported and bisected by Mike
> Galbraith[2].
>
> This patchset is against the current x86/core/percpu[1] and contains
> the following patches.
>
> 0001-percpu-clean-up-percpu-constants.patch
> 0002-percpu-cosmetic-renames-in-pcpu_setup_first_chunk.patch
> 0003-percpu-improve-first-chunk-initial-area-map-handlin.patch
> 0004-percpu-use-negative-for-auto-for-pcpu_setup_first_c.patch
> 0005-x86-make-embedding-percpu-allocator-return-excessiv.patch
> 0006-percpu-add-an-indirection-ptr-for-chunk-page-map-ac.patch
> 0007-percpu-module-implement-reserved-allocation-and-us.patch
> 0008-x86-percpu-setup-reserved-percpu-area-for-x86_64.patch
>
> 0001-0004 makes misc updates to percpu in preparation of later
> changes. 0005 fixes a case where excessive memory can be set
> aside in the first chunk when using the embedding allocator.
> 0006-0008 implement reserved allocation and use it for x86_64.
>
> Bryan Wu, can you please review #0001? It contains small
> change in blackfin. It's mostly trivial but just in case.
>
> Diffstat follows.
>
> arch/blackfin/include/asm/percpu.h | 10 -
> arch/x86/kernel/setup_percpu.c | 73 +++++++---
> include/linux/percpu.h | 60 +++-----
> kernel/module.c | 2
> mm/percpu.c | 267 ++++++++++++++++++++++++++-----------
> 5 files changed, 274 insertions(+), 138 deletions(-)

Pulled, thanks Tejun!

Ingo

2009-03-08 04:37:34

by Bryan Wu

Subject: Re: [GIT PULL] x86, percpu: implement and use reserved percpu alloc

On Fri, Mar 6, 2009 at 2:46 PM, Tejun Heo <[email protected]> wrote:
>
> Hello,
>
> Please pull from the following git vector.
>
>   git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc.git tj-percpu
>
> This patchset implements reserved percpu alloc from the first chunk
> and uses it for static percpu variables in modules on x86_64.  This is
> necessary because x86_64 expects symbols to be relocatable using 32bit
> relocations but dynamic percpu allocation does not guarantee it
> leading to relocation overflow during module load.
>
> This can happen to any of the x86_64 percpu first chunk allocators but
> the embedding allocator is most likely to trigger it as the first
> chunk is located in the normal kernel memory allocation area while all
> the other chunks will be allocated in the vmalloc area which is very
> far from the first chunk.
>
> This type of reservation will be necessary for other architectures too
> where addressing mode used for percpu access uses offsets with limited
> range to access symbols (e.g. ia64 w/ the magic 64k percpu page).
>
> This problem was first reported and bisected by Mike Galbraith[2].
>
> This patchset is against the current x86/core/percpu[1] and contains
> the following patches.
>
>   0001-percpu-clean-up-percpu-constants.patch
>   0002-percpu-cosmetic-renames-in-pcpu_setup_first_chunk.patch
>   0003-percpu-improve-first-chunk-initial-area-map-handlin.patch
>   0004-percpu-use-negative-for-auto-for-pcpu_setup_first_c.patch
>   0005-x86-make-embedding-percpu-allocator-return-excessiv.patch
>   0006-percpu-add-an-indirection-ptr-for-chunk-page-map-ac.patch
>   0007-percpu-module-implement-reserved-allocation-and-us.patch
>   0008-x86-percpu-setup-reserved-percpu-area-for-x86_64.patch
>
> 0001-0004 makes misc updates to percpu in preparation of later
> changes.  0005 fixes a case where excessive memory can be set aside in
> the first chunk when using the embedding allocator.  0006-0008
> implement reserved allocation and use it for x86_64.
>
> Bryan Wu, can you please review #0001?  It contains small change in
> blackfin.  It's mostly trivial but just in case.
>

Hi Tejun,

Sorry for the delay, it looks fine for me.

-Bryan

> Diffstat follows.
>
>  arch/blackfin/include/asm/percpu.h |   10 -
>  arch/x86/kernel/setup_percpu.c     |   73 +++++++---
>  include/linux/percpu.h             |   60 +++-----
>  kernel/module.c                    |    2
>  mm/percpu.c                        |  267 ++++++++++++++++++++++++++-----------
>  5 files changed, 274 insertions(+), 138 deletions(-)
>
> Thanks.
>
> --
> tejun
>
> [1] f254f3909efaf59ca2d0f408de2d044dace60706
> [2] http://thread.gmane.org/gmane.linux.kernel/801799
>

2009-03-08 05:02:18

by Bryan Wu

Subject: Re: [PATCH 1/8] percpu: clean up percpu constants

On Fri, Mar 6, 2009 at 2:46 PM, Tejun Heo <[email protected]> wrote:
> Impact: cleaup
>
> Make the following cleanups.
>
> * There isn't much arch-specific about PERCPU_MODULE_RESERVE.  Always
>   define it whether arch overrides PERCPU_ENOUGH_ROOM or not.
>
> * blackfin overrides PERCPU_ENOUGH_ROOM to align static area size.  Do
>   it by default.
>

Acked-by: Bryan Wu <[email protected]>

Thanks
-Bryan

> * percpu allocation sizes doesn't have much to do with the page size.
>   Don't use PAGE_SHIFT in their definition.
>
> Signed-off-by: Tejun Heo <[email protected]>
> Cc: Bryan Wu <[email protected]>
> ---
>  arch/blackfin/include/asm/percpu.h |   10 ----------
>  include/linux/percpu.h             |   24 +++++++++++++-----------
>  2 files changed, 13 insertions(+), 21 deletions(-)
>
> diff --git a/arch/blackfin/include/asm/percpu.h b/arch/blackfin/include/asm/percpu.h
> index 797c0c1..c94c7bc 100644
> --- a/arch/blackfin/include/asm/percpu.h
> +++ b/arch/blackfin/include/asm/percpu.h
> @@ -3,14 +3,4 @@
>
>  #include <asm-generic/percpu.h>
>
> -#ifdef CONFIG_MODULES
> -#define PERCPU_MODULE_RESERVE 8192
> -#else
> -#define PERCPU_MODULE_RESERVE 0
> -#endif
> -
> -#define PERCPU_ENOUGH_ROOM \
> -       (ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES) + \
> -        PERCPU_MODULE_RESERVE)
> -
>  #endif /* __ARCH_BLACKFIN_PERCPU__ */
> diff --git a/include/linux/percpu.h b/include/linux/percpu.h
> index 545b068..2d34b03 100644
> --- a/include/linux/percpu.h
> +++ b/include/linux/percpu.h
> @@ -5,6 +5,7 @@
>  #include <linux/slab.h> /* For kmalloc() */
>  #include <linux/smp.h>
>  #include <linux/cpumask.h>
> +#include <linux/pfn.h>
>
>  #include <asm/percpu.h>
>
> @@ -52,17 +53,18 @@
>  #define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
>  #define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
>
> -/* Enough to cover all DEFINE_PER_CPUs in kernel, including modules. */
> -#ifndef PERCPU_ENOUGH_ROOM
> +/* enough to cover all DEFINE_PER_CPUs in modules */
>  #ifdef CONFIG_MODULES
> -#define PERCPU_MODULE_RESERVE  8192
> +#define PERCPU_MODULE_RESERVE          (8 << 10)
>  #else
> -#define PERCPU_MODULE_RESERVE  0
> +#define PERCPU_MODULE_RESERVE          0
>  #endif
>
> +#ifndef PERCPU_ENOUGH_ROOM
>  #define PERCPU_ENOUGH_ROOM                                             \
> -       (__per_cpu_end - __per_cpu_start + PERCPU_MODULE_RESERVE)
> -#endif /* PERCPU_ENOUGH_ROOM */
> +       (ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES) +      \
> +        PERCPU_MODULE_RESERVE)
> +#endif
>
>  /*
>  * Must be an lvalue. Since @var must be a simple identifier,
> @@ -79,7 +81,7 @@
>  #ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
>
>  /* minimum unit size, also is the maximum supported allocation size */
> -#define PCPU_MIN_UNIT_SIZE             (16UL << PAGE_SHIFT)
> +#define PCPU_MIN_UNIT_SIZE             PFN_ALIGN(64 << 10)
>
>  /*
>  * PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy
> @@ -96,15 +98,15 @@
>  #ifndef PERCPU_DYNAMIC_RESERVE
>  #  if BITS_PER_LONG > 32
>  #    ifdef CONFIG_MODULES
> -#      define PERCPU_DYNAMIC_RESERVE   (6 << PAGE_SHIFT)
> +#      define PERCPU_DYNAMIC_RESERVE   (24 << 10)
>  #    else
> -#      define PERCPU_DYNAMIC_RESERVE   (4 << PAGE_SHIFT)
> +#      define PERCPU_DYNAMIC_RESERVE   (16 << 10)
>  #    endif
>  #  else
>  #    ifdef CONFIG_MODULES
> -#      define PERCPU_DYNAMIC_RESERVE   (4 << PAGE_SHIFT)
> +#      define PERCPU_DYNAMIC_RESERVE   (16 << 10)
>  #    else
> -#      define PERCPU_DYNAMIC_RESERVE   (2 << PAGE_SHIFT)
> +#      define PERCPU_DYNAMIC_RESERVE   (8 << 10)
>  #    endif
>  #  endif
>  #endif /* PERCPU_DYNAMIC_RESERVE */
> --
> 1.6.0.2
>
>