Hi everyone,
Percpu memory using the vmalloc area based chunk allocator lazily
populates chunks by first requesting the full virtual address space
required for the chunk and subsequently adding pages as allocations come
through. To ensure atomic allocations can succeed, a workqueue item is
used to maintain a minimum number of empty pages. In certain scenarios,
such as reported in [1], it is possible that physical memory becomes
quite scarce which can result in either a rather long time spent trying
to find free pages or worse, a kernel panic.
This patchset introduces no-retry semantics to percpu memory by allowing
users to pass through certain gfp flags controlled by a whitelist. Initially,
only __GFP_NORETRY and __GFP_NOWARN are allowed. This should prevent the
eventual kernel panic caused by the workqueue item, and it gives users that
pass the additional flags the flexibility to decide how to handle a percpu
allocation failure. It does not prevent the allocator from panicking should
an allocation made without the additional flags cause an underlying allocator
to trigger out of memory.
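For illustration only (not part of this patchset), a caller that prefers a
fast failure over triggering the OOM killer could pass the whitelisted flags
like this (struct my_counter is a made-up example type):

	struct my_counter __percpu *cnt;

	/* ask the percpu allocator not to retry hard and not to warn */
	cnt = alloc_percpu_gfp(struct my_counter,
			       GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
	if (!cnt)
		return -ENOMEM;	/* degrade gracefully instead of panicking */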
I tested this by running basic allocations with and without the additional
flags. Allocations proceed or fail as expected without triggering the
out-of-memory kernel panic. I still saw the OOM killer activate, but I
attribute that to other programs faulting in the background. Without the
flags, the kernel panics fairly quickly with the expected out-of-memory
panic.
[1] https://lkml.org/lkml/2018/2/12/551
This patchset contains the following 3 patches:
0001-percpu-match-chunk-allocator-declarations-with-defin.patch
0002-percpu-add-__GFP_NORETRY-semantics-to-the-percpu-bal.patch
0003-percpu-allow-select-gfp-to-be-passed-to-underlying-a.patch
0001 fixes out-of-sync declaration and definition parameter names. 0002 adds
no-retry semantics to the workqueue balance path. 0003 enables users to pass
flags through to the underlying percpu memory allocators and also cleans up
the semantics surrounding how the flags are managed.
This patchset is on top of percpu#master 7928b2cbe5.
Diffstat below:
Dennis Zhou (3):
percpu: match chunk allocator declarations with definitions
percpu: add __GFP_NORETRY semantics to the percpu balancing path
percpu: allow select gfp to be passed to underlying allocators
mm/percpu-km.c | 8 ++++----
mm/percpu-vm.c | 18 +++++++++++-------
mm/percpu.c | 52 ++++++++++++++++++++++++++++++++++------------------
3 files changed, 49 insertions(+), 29 deletions(-)
Thanks,
Dennis
The prior patch added support for passing gfp flags through to the
underlying allocators. This patch exposes that to users, allowing them to
pass along gfp flags (currently only __GFP_NORETRY and __GFP_NOWARN) to the
underlying allocators. This lets users decide whether they are ok with
allocations failing so that they can recover in a more graceful way.
Additionally, gfp was previously used as a set of additional flags that were
then combined with the base flags, namely GFP_KERNEL. gfp_percpu_mask is
introduced to build the GFP_KERNEL base and whitelist the allowed gfp flags.
Using it in the appropriate places changes the gfp usage from additional
flags to a complete set, removing the need to always OR in GFP_KERNEL.
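For illustration only, the whitelist behaves as follows (sketch based on the
macro added below):

	gfp_percpu_mask(0);				/* -> GFP_KERNEL */
	gfp_percpu_mask(__GFP_NORETRY | __GFP_NOWARN);	/* -> GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN */
	gfp_percpu_mask(__GFP_HIGH);			/* non-whitelisted bit dropped -> GFP_KERNEL */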
Signed-off-by: Dennis Zhou <[email protected]>
Suggested-by: Daniel Borkmann <[email protected]>
---
mm/percpu-km.c | 2 +-
mm/percpu-vm.c | 4 ++--
mm/percpu.c | 16 ++++++++++------
3 files changed, 13 insertions(+), 9 deletions(-)
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index 0d88d7b..38de70a 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -56,7 +56,7 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
if (!chunk)
return NULL;
- pages = alloc_pages(gfp | GFP_KERNEL, order_base_2(nr_pages));
+ pages = alloc_pages(gfp, order_base_2(nr_pages));
if (!pages) {
pcpu_free_chunk(chunk);
return NULL;
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index ea9906a..c771d86 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -37,7 +37,7 @@ static struct page **pcpu_get_pages(void)
lockdep_assert_held(&pcpu_alloc_mutex);
if (!pages)
- pages = pcpu_mem_zalloc(pages_size, 0);
+ pages = pcpu_mem_zalloc(pages_size, gfp_percpu_mask(0));
return pages;
}
@@ -86,7 +86,7 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
unsigned int cpu, tcpu;
int i;
- gfp |= GFP_KERNEL | __GFP_HIGHMEM;
+ gfp |= __GFP_HIGHMEM;
for_each_possible_cpu(cpu) {
for (i = page_start; i < page_end; i++) {
diff --git a/mm/percpu.c b/mm/percpu.c
index 2489b8b..e35a120 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -91,6 +91,10 @@
#include "percpu-internal.h"
+/* the whitelisted flags that can be passed to the backing allocators */
+#define gfp_percpu_mask(gfp) (((gfp) & (__GFP_NORETRY | __GFP_NOWARN)) | \
+ GFP_KERNEL)
+
/* the slots are sorted by free bytes left, 1-31 bytes share the same slot */
#define PCPU_SLOT_BASE_SHIFT 5
@@ -466,10 +470,9 @@ static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
return NULL;
if (size <= PAGE_SIZE)
- return kzalloc(size, gfp | GFP_KERNEL);
+ return kzalloc(size, gfp);
else
- return __vmalloc(size, gfp | GFP_KERNEL | __GFP_ZERO,
- PAGE_KERNEL);
+ return __vmalloc(size, gfp | __GFP_ZERO, PAGE_KERNEL);
}
/**
@@ -1344,6 +1347,7 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
gfp_t gfp)
{
+ gfp_t pcpu_gfp = gfp_percpu_mask(gfp);
bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
bool do_warn = !(gfp & __GFP_NOWARN);
static int warn_limit = 10;
@@ -1426,7 +1430,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
}
if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
- chunk = pcpu_create_chunk(0);
+ chunk = pcpu_create_chunk(pcpu_gfp);
if (!chunk) {
err = "failed to allocate new chunk";
goto fail;
@@ -1455,7 +1459,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
page_start, page_end) {
WARN_ON(chunk->immutable);
- ret = pcpu_populate_chunk(chunk, rs, re, 0);
+ ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);
spin_lock_irqsave(&pcpu_lock, flags);
if (ret) {
@@ -1576,7 +1580,7 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
static void pcpu_balance_workfn(struct work_struct *work)
{
/* gfp flags passed to underlying allocators */
- gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;
+ gfp_t gfp = gfp_percpu_mask(__GFP_NORETRY | __GFP_NOWARN);
LIST_HEAD(to_free);
struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
struct pcpu_chunk *chunk, *next;
--
1.8.3.1
At some point the function declaration parameters got out of sync with
the function definitions in percpu-vm.c and percpu-km.c. This patch
makes them match again.
Signed-off-by: Dennis Zhou <[email protected]>
---
mm/percpu.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/mm/percpu.c b/mm/percpu.c
index 50e7fdf..e1ea410 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1277,8 +1277,10 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
* pcpu_addr_to_page - translate address to physical address
* pcpu_verify_alloc_info - check alloc_info is acceptable during init
*/
-static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size);
-static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size);
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
+ int page_start, int page_end);
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
+ int page_start, int page_end);
static struct pcpu_chunk *pcpu_create_chunk(void);
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
static struct page *pcpu_addr_to_page(void *addr);
--
1.8.3.1
Percpu memory using the vmalloc area based chunk allocator lazily
populates chunks by first requesting the full virtual address space
required for the chunk and subsequently adding pages as allocations come
through. To ensure atomic allocations can succeed, a workqueue item is
used to maintain a minimum number of empty pages. In certain scenarios,
such as reported in [1], it is possible that physical memory becomes
quite scarce which can result in either a rather long time spent trying
to find free pages or worse, a kernel panic.
This patch adds support for __GFP_NORETRY and __GFP_NOWARN, passing them
through to the underlying allocators. This should prevent any unnecessary
panics potentially caused by the workqueue item. Here, gfp is passed around
as additional flags rather than as a full set of flags. The next patch will
change these semantics to a full set (in this case containing GFP_KERNEL as
the base).
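For illustration only, the difference between the two conventions, using
pcpu_mem_zalloc() as the example:

	/* this patch: @gfp carries only additional bits; the callee ORs in GFP_KERNEL */
	ptr = pcpu_mem_zalloc(size, __GFP_NORETRY);

	/* next patch: @gfp is the complete set and is used as-is */
	ptr = pcpu_mem_zalloc(size, GFP_KERNEL | __GFP_NORETRY);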
[1] https://lkml.org/lkml/2018/2/12/551
Signed-off-by: Dennis Zhou <[email protected]>
Suggested-by: Daniel Borkmann <[email protected]>
Reported-by: [email protected]
---
mm/percpu-km.c | 8 ++++----
mm/percpu-vm.c | 18 +++++++++++-------
mm/percpu.c | 44 +++++++++++++++++++++++++++-----------------
3 files changed, 42 insertions(+), 28 deletions(-)
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index d2a7664..0d88d7b 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -34,7 +34,7 @@
#include <linux/log2.h>
static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
- int page_start, int page_end)
+ int page_start, int page_end, gfp_t gfp)
{
return 0;
}
@@ -45,18 +45,18 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
/* nada */
}
-static struct pcpu_chunk *pcpu_create_chunk(void)
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
{
const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
struct pcpu_chunk *chunk;
struct page *pages;
int i;
- chunk = pcpu_alloc_chunk();
+ chunk = pcpu_alloc_chunk(gfp);
if (!chunk)
return NULL;
- pages = alloc_pages(GFP_KERNEL, order_base_2(nr_pages));
+ pages = alloc_pages(gfp | GFP_KERNEL, order_base_2(nr_pages));
if (!pages) {
pcpu_free_chunk(chunk);
return NULL;
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 9158e5a..ea9906a 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -37,7 +37,7 @@ static struct page **pcpu_get_pages(void)
lockdep_assert_held(&pcpu_alloc_mutex);
if (!pages)
- pages = pcpu_mem_zalloc(pages_size);
+ pages = pcpu_mem_zalloc(pages_size, 0);
return pages;
}
@@ -73,18 +73,21 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk,
* @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
* @page_start: page index of the first page to be allocated
* @page_end: page index of the last page to be allocated + 1
+ * @gfp: allocation flags passed to the underlying allocator
*
* Allocate pages [@page_start,@page_end) into @pages for all units.
* The allocation is for @chunk. Percpu core doesn't care about the
* content of @pages and will pass it verbatim to pcpu_map_pages().
*/
static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
- struct page **pages, int page_start, int page_end)
+ struct page **pages, int page_start, int page_end,
+ gfp_t gfp)
{
- const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM;
unsigned int cpu, tcpu;
int i;
+ gfp |= GFP_KERNEL | __GFP_HIGHMEM;
+
for_each_possible_cpu(cpu) {
for (i = page_start; i < page_end; i++) {
struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
@@ -262,6 +265,7 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
* @chunk: chunk of interest
* @page_start: the start page
* @page_end: the end page
+ * @gfp: allocation flags passed to the underlying memory allocator
*
* For each cpu, populate and map pages [@page_start,@page_end) into
* @chunk.
@@ -270,7 +274,7 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
* pcpu_alloc_mutex, does GFP_KERNEL allocation.
*/
static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
- int page_start, int page_end)
+ int page_start, int page_end, gfp_t gfp)
{
struct page **pages;
@@ -278,7 +282,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
if (!pages)
return -ENOMEM;
- if (pcpu_alloc_pages(chunk, pages, page_start, page_end))
+ if (pcpu_alloc_pages(chunk, pages, page_start, page_end, gfp))
return -ENOMEM;
if (pcpu_map_pages(chunk, pages, page_start, page_end)) {
@@ -325,12 +329,12 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
pcpu_free_pages(chunk, pages, page_start, page_end);
}
-static struct pcpu_chunk *pcpu_create_chunk(void)
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
{
struct pcpu_chunk *chunk;
struct vm_struct **vms;
- chunk = pcpu_alloc_chunk();
+ chunk = pcpu_alloc_chunk(gfp);
if (!chunk)
return NULL;
diff --git a/mm/percpu.c b/mm/percpu.c
index e1ea410..2489b8b 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -447,10 +447,12 @@ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
/**
* pcpu_mem_zalloc - allocate memory
* @size: bytes to allocate
+ * @gfp: allocation flags
*
* Allocate @size bytes. If @size is smaller than PAGE_SIZE,
- * kzalloc() is used; otherwise, vzalloc() is used. The returned
- * memory is always zeroed.
+ * kzalloc() is used; otherwise, the equivalent of vzalloc() is used.
+ * This is to facilitate passing through whitelisted flags. The
+ * returned memory is always zeroed.
*
* CONTEXT:
* Does GFP_KERNEL allocation.
@@ -458,15 +460,16 @@ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
* RETURNS:
* Pointer to the allocated area on success, NULL on failure.
*/
-static void *pcpu_mem_zalloc(size_t size)
+static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
{
if (WARN_ON_ONCE(!slab_is_available()))
return NULL;
if (size <= PAGE_SIZE)
- return kzalloc(size, GFP_KERNEL);
+ return kzalloc(size, gfp | GFP_KERNEL);
else
- return vzalloc(size);
+ return __vmalloc(size, gfp | GFP_KERNEL | __GFP_ZERO,
+ PAGE_KERNEL);
}
/**
@@ -1154,12 +1157,12 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
return chunk;
}
-static struct pcpu_chunk *pcpu_alloc_chunk(void)
+static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
{
struct pcpu_chunk *chunk;
int region_bits;
- chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);
+ chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
if (!chunk)
return NULL;
@@ -1168,17 +1171,17 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
region_bits = pcpu_chunk_map_bits(chunk);
chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) *
- sizeof(chunk->alloc_map[0]));
+ sizeof(chunk->alloc_map[0]), gfp);
if (!chunk->alloc_map)
goto alloc_map_fail;
chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) *
- sizeof(chunk->bound_map[0]));
+ sizeof(chunk->bound_map[0]), gfp);
if (!chunk->bound_map)
goto bound_map_fail;
chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) *
- sizeof(chunk->md_blocks[0]));
+ sizeof(chunk->md_blocks[0]), gfp);
if (!chunk->md_blocks)
goto md_blocks_fail;
@@ -1278,10 +1281,10 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
* pcpu_verify_alloc_info - check alloc_info is acceptable during init
*/
static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
- int page_start, int page_end);
+ int page_start, int page_end, gfp_t gfp);
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
int page_start, int page_end);
-static struct pcpu_chunk *pcpu_create_chunk(void);
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
static struct page *pcpu_addr_to_page(void *addr);
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
@@ -1423,7 +1426,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
}
if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
- chunk = pcpu_create_chunk();
+ chunk = pcpu_create_chunk(0);
if (!chunk) {
err = "failed to allocate new chunk";
goto fail;
@@ -1452,7 +1455,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
page_start, page_end) {
WARN_ON(chunk->immutable);
- ret = pcpu_populate_chunk(chunk, rs, re);
+ ret = pcpu_populate_chunk(chunk, rs, re, 0);
spin_lock_irqsave(&pcpu_lock, flags);
if (ret) {
@@ -1563,10 +1566,17 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
* pcpu_balance_workfn - manage the amount of free chunks and populated pages
* @work: unused
*
- * Reclaim all fully free chunks except for the first one.
+ * Reclaim all fully free chunks except for the first one. This is also
+ * responsible for maintaining the pool of empty populated pages. However,
+ * it is possible that this is called when physical memory is scarce causing
+ * OOM killer to be triggered. We should avoid doing so until an actual
+ * allocation causes the failure as it is possible that requests can be
+ * serviced from already backed regions.
*/
static void pcpu_balance_workfn(struct work_struct *work)
{
+ /* gfp flags passed to underlying allocators */
+ gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;
LIST_HEAD(to_free);
struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
struct pcpu_chunk *chunk, *next;
@@ -1647,7 +1657,7 @@ static void pcpu_balance_workfn(struct work_struct *work)
chunk->nr_pages) {
int nr = min(re - rs, nr_to_pop);
- ret = pcpu_populate_chunk(chunk, rs, rs + nr);
+ ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
if (!ret) {
nr_to_pop -= nr;
spin_lock_irq(&pcpu_lock);
@@ -1664,7 +1674,7 @@ static void pcpu_balance_workfn(struct work_struct *work)
if (nr_to_pop) {
/* ran out of chunks to populate, create a new one and retry */
- chunk = pcpu_create_chunk();
+ chunk = pcpu_create_chunk(gfp);
if (chunk) {
spin_lock_irq(&pcpu_lock);
pcpu_chunk_relocate(chunk, -1);
--
1.8.3.1
On Thu, 15 Feb 2018, Dennis Zhou wrote:
> At some point the function declaration parameters got out of sync with
> the function definitions in percpu-vm.c and percpu-km.c. This patch
> makes them match again.
Acked-by: Christoph Lameter <[email protected]>
Hello,
On Thu, Feb 15, 2018 at 10:08:15AM -0600, Dennis Zhou wrote:
> -static struct pcpu_chunk *pcpu_create_chunk(void)
> +static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
> {
> const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
> struct pcpu_chunk *chunk;
> struct page *pages;
> int i;
>
> - chunk = pcpu_alloc_chunk();
> + chunk = pcpu_alloc_chunk(gfp);
> if (!chunk)
> return NULL;
>
> - pages = alloc_pages(GFP_KERNEL, order_base_2(nr_pages));
> + pages = alloc_pages(gfp | GFP_KERNEL, order_base_2(nr_pages));
Is there a reason to set GFP_KERNEL in this function? I'd prefer
pushing this to the callers.
> diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
> index 9158e5a..ea9906a 100644
> --- a/mm/percpu-vm.c
> +++ b/mm/percpu-vm.c
> @@ -37,7 +37,7 @@ static struct page **pcpu_get_pages(void)
> lockdep_assert_held(&pcpu_alloc_mutex);
>
> if (!pages)
> - pages = pcpu_mem_zalloc(pages_size);
> + pages = pcpu_mem_zalloc(pages_size, 0);
^^^^
because this is confusing
> static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
> - struct page **pages, int page_start, int page_end)
> + struct page **pages, int page_start, int page_end,
> + gfp_t gfp)
> {
> - const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM;
> unsigned int cpu, tcpu;
> int i;
>
> + gfp |= GFP_KERNEL | __GFP_HIGHMEM;
^^
double space
So, setting __GFP_HIGHMEM unconditionally here makes sense because
it's indicating the types of pages we can use (we also accept high
pages); however, I'm not sure GFP_KERNEL makes sense. That's about
"how to allocate" and looks like it should be left to the caller.
Thanks.
--
tejun
Hello,
On Thu, Feb 15, 2018 at 10:08:16AM -0600, Dennis Zhou wrote:
> +/* the whitelisted flags that can be passed to the backing allocators */
> +#define gfp_percpu_mask(gfp) (((gfp) & (__GFP_NORETRY | __GFP_NOWARN)) | \
> + GFP_KERNEL)
Isn't there just one place where gfp comes in from outside? If so,
this looks like a bit of overkill. Can't we just filter there?
Thanks.
--
tejun
Hi Tejun,
On Thu, Feb 15, 2018 at 01:39:09PM -0800, Tejun Heo wrote:
> On Thu, Feb 15, 2018 at 10:08:15AM -0600, Dennis Zhou wrote:
> > -static struct pcpu_chunk *pcpu_create_chunk(void)
> > +static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
> > {
> > const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
> > struct pcpu_chunk *chunk;
> > struct page *pages;
> > int i;
> >
> > - chunk = pcpu_alloc_chunk();
> > + chunk = pcpu_alloc_chunk(gfp);
> > if (!chunk)
> > return NULL;
> >
> > - pages = alloc_pages(GFP_KERNEL, order_base_2(nr_pages));
> > + pages = alloc_pages(gfp | GFP_KERNEL, order_base_2(nr_pages));
>
> Is there a reason to set GFP_KERNEL in this function? I'd prefer
> pushing this to the callers.
>
Not particularly. As I wasn't sure of the original decision to use
GFP_KERNEL for all underlying percpu allocations, I didn't want to remove
functionality while adding the gfp passthrough.
> > diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
> > index 9158e5a..ea9906a 100644
> > --- a/mm/percpu-vm.c
> > +++ b/mm/percpu-vm.c
> > @@ -37,7 +37,7 @@ static struct page **pcpu_get_pages(void)
> > lockdep_assert_held(&pcpu_alloc_mutex);
> >
> > if (!pages)
> > - pages = pcpu_mem_zalloc(pages_size);
> > + pages = pcpu_mem_zalloc(pages_size, 0);
> ^^^^
> because this is confusing
Yeah.. The next patch removes this, as the additional gfp flags convention is weird.
> > static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
> > - struct page **pages, int page_start, int page_end)
> > + struct page **pages, int page_start, int page_end,
> > + gfp_t gfp)
> > {
> > - const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM;
> > unsigned int cpu, tcpu;
> > int i;
> >
> > + gfp |= GFP_KERNEL | __GFP_HIGHMEM;
> ^^
> double space
>
I'll fix this with any other updates.
> So, setting __GFP_HIGHMEM unconditionally here makes sense because
> it's indicating the types of pages we can use (we also accept high
> pages); however, I'm not sure GFP_KERNEL makes sense. That's about
> "how to allocate" and looks like it should be left to the caller.
>
That makes sense. I can remove the forced GFP_KERNEL use in the next
patch, as that patch moves the flags to the caller.
I'd rather be explicit though and whitelist GFP_KERNEL, as I don't have a
full grasp of all the flags. Our use case is a little different because
we ultimately become the owner of the pages until the chunk is freed. So
there are certain flags, such as __GFP_HARDWALL (a poor example) or the
difference between GFP_KERNEL and GFP_USER, which don't make sense here.
Regarding high pages, I think you're referring to GFP_ATOMIC
allocations? We actually never allocate on this path, as allocations must
be served out of parts of chunks that are already backed.
Thanks,
Dennis
Hi,
On Thu, Feb 15, 2018 at 01:41:48PM -0800, Tejun Heo wrote:
> On Thu, Feb 15, 2018 at 10:08:16AM -0600, Dennis Zhou wrote:
> > +/* the whitelisted flags that can be passed to the backing allocators */
> > +#define gfp_percpu_mask(gfp) (((gfp) & (__GFP_NORETRY | __GFP_NOWARN)) | \
> > + GFP_KERNEL)
>
> Isn't there just one place where gfp comes in from outside? If so,
> this looks like a bit of overkill. Can't we just filter there?
>
I agree, but it's also nice to have a single place where flags can be added
or removed. The primary motivator was the "| GFP_KERNEL", but as suggested
in the other patch, that is getting removed. I still lean towards keeping
it, as it's explicit and helps gate both the balance path and the user
allocation path.
Thanks,
Dennis
Percpu memory using the vmalloc area based chunk allocator lazily
populates chunks by first requesting the full virtual address space
required for the chunk and subsequently adding pages as allocations come
through. To ensure atomic allocations can succeed, a workqueue item is
used to maintain a minimum number of empty pages. In certain scenarios,
such as reported in [1], it is possible that physical memory becomes
quite scarce which can result in either a rather long time spent trying
to find free pages or worse, a kernel panic.
This patch adds support for __GFP_NORETRY and __GFP_NOWARN, passing them
through to the underlying allocators. This should prevent any unnecessary
panics potentially caused by the workqueue item. Here, gfp is passed around
as additional flags rather than as a full set of flags. The next patch will
change this to caller-passed semantics.
V2:
Added const modifier to gfp flags in the balance path.
Removed an extra space.
[1] https://lkml.org/lkml/2018/2/12/551
Signed-off-by: Dennis Zhou <[email protected]>
Suggested-by: Daniel Borkmann <[email protected]>
Reported-by: [email protected]
---
mm/percpu-km.c | 8 ++++----
mm/percpu-vm.c | 18 +++++++++++-------
mm/percpu.c | 44 +++++++++++++++++++++++++++-----------------
3 files changed, 42 insertions(+), 28 deletions(-)
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index d2a7664..0d88d7b 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -34,7 +34,7 @@
#include <linux/log2.h>
static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
- int page_start, int page_end)
+ int page_start, int page_end, gfp_t gfp)
{
return 0;
}
@@ -45,18 +45,18 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
/* nada */
}
-static struct pcpu_chunk *pcpu_create_chunk(void)
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
{
const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
struct pcpu_chunk *chunk;
struct page *pages;
int i;
- chunk = pcpu_alloc_chunk();
+ chunk = pcpu_alloc_chunk(gfp);
if (!chunk)
return NULL;
- pages = alloc_pages(GFP_KERNEL, order_base_2(nr_pages));
+ pages = alloc_pages(gfp | GFP_KERNEL, order_base_2(nr_pages));
if (!pages) {
pcpu_free_chunk(chunk);
return NULL;
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 9158e5a..0af71eb 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -37,7 +37,7 @@ static struct page **pcpu_get_pages(void)
lockdep_assert_held(&pcpu_alloc_mutex);
if (!pages)
- pages = pcpu_mem_zalloc(pages_size);
+ pages = pcpu_mem_zalloc(pages_size, 0);
return pages;
}
@@ -73,18 +73,21 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk,
* @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
* @page_start: page index of the first page to be allocated
* @page_end: page index of the last page to be allocated + 1
+ * @gfp: allocation flags passed to the underlying allocator
*
* Allocate pages [@page_start,@page_end) into @pages for all units.
* The allocation is for @chunk. Percpu core doesn't care about the
* content of @pages and will pass it verbatim to pcpu_map_pages().
*/
static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
- struct page **pages, int page_start, int page_end)
+ struct page **pages, int page_start, int page_end,
+ gfp_t gfp)
{
- const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM;
unsigned int cpu, tcpu;
int i;
+ gfp |= GFP_KERNEL | __GFP_HIGHMEM;
+
for_each_possible_cpu(cpu) {
for (i = page_start; i < page_end; i++) {
struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
@@ -262,6 +265,7 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
* @chunk: chunk of interest
* @page_start: the start page
* @page_end: the end page
+ * @gfp: allocation flags passed to the underlying memory allocator
*
* For each cpu, populate and map pages [@page_start,@page_end) into
* @chunk.
@@ -270,7 +274,7 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
* pcpu_alloc_mutex, does GFP_KERNEL allocation.
*/
static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
- int page_start, int page_end)
+ int page_start, int page_end, gfp_t gfp)
{
struct page **pages;
@@ -278,7 +282,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
if (!pages)
return -ENOMEM;
- if (pcpu_alloc_pages(chunk, pages, page_start, page_end))
+ if (pcpu_alloc_pages(chunk, pages, page_start, page_end, gfp))
return -ENOMEM;
if (pcpu_map_pages(chunk, pages, page_start, page_end)) {
@@ -325,12 +329,12 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
pcpu_free_pages(chunk, pages, page_start, page_end);
}
-static struct pcpu_chunk *pcpu_create_chunk(void)
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
{
struct pcpu_chunk *chunk;
struct vm_struct **vms;
- chunk = pcpu_alloc_chunk();
+ chunk = pcpu_alloc_chunk(gfp);
if (!chunk)
return NULL;
diff --git a/mm/percpu.c b/mm/percpu.c
index e1ea410..f97443d 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -447,10 +447,12 @@ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
/**
* pcpu_mem_zalloc - allocate memory
* @size: bytes to allocate
+ * @gfp: allocation flags
*
* Allocate @size bytes. If @size is smaller than PAGE_SIZE,
- * kzalloc() is used; otherwise, vzalloc() is used. The returned
- * memory is always zeroed.
+ * kzalloc() is used; otherwise, the equivalent of vzalloc() is used.
+ * This is to facilitate passing through whitelisted flags. The
+ * returned memory is always zeroed.
*
* CONTEXT:
* Does GFP_KERNEL allocation.
@@ -458,15 +460,16 @@ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
* RETURNS:
* Pointer to the allocated area on success, NULL on failure.
*/
-static void *pcpu_mem_zalloc(size_t size)
+static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
{
if (WARN_ON_ONCE(!slab_is_available()))
return NULL;
if (size <= PAGE_SIZE)
- return kzalloc(size, GFP_KERNEL);
+ return kzalloc(size, gfp | GFP_KERNEL);
else
- return vzalloc(size);
+ return __vmalloc(size, gfp | GFP_KERNEL | __GFP_ZERO,
+ PAGE_KERNEL);
}
/**
@@ -1154,12 +1157,12 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
return chunk;
}
-static struct pcpu_chunk *pcpu_alloc_chunk(void)
+static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
{
struct pcpu_chunk *chunk;
int region_bits;
- chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);
+ chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
if (!chunk)
return NULL;
@@ -1168,17 +1171,17 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
region_bits = pcpu_chunk_map_bits(chunk);
chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) *
- sizeof(chunk->alloc_map[0]));
+ sizeof(chunk->alloc_map[0]), gfp);
if (!chunk->alloc_map)
goto alloc_map_fail;
chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) *
- sizeof(chunk->bound_map[0]));
+ sizeof(chunk->bound_map[0]), gfp);
if (!chunk->bound_map)
goto bound_map_fail;
chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) *
- sizeof(chunk->md_blocks[0]));
+ sizeof(chunk->md_blocks[0]), gfp);
if (!chunk->md_blocks)
goto md_blocks_fail;
@@ -1278,10 +1281,10 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
* pcpu_verify_alloc_info - check alloc_info is acceptable during init
*/
static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
- int page_start, int page_end);
+ int page_start, int page_end, gfp_t gfp);
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
int page_start, int page_end);
-static struct pcpu_chunk *pcpu_create_chunk(void);
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
static struct page *pcpu_addr_to_page(void *addr);
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
@@ -1423,7 +1426,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
}
if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
- chunk = pcpu_create_chunk();
+ chunk = pcpu_create_chunk(0);
if (!chunk) {
err = "failed to allocate new chunk";
goto fail;
@@ -1452,7 +1455,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
page_start, page_end) {
WARN_ON(chunk->immutable);
- ret = pcpu_populate_chunk(chunk, rs, re);
+ ret = pcpu_populate_chunk(chunk, rs, re, 0);
spin_lock_irqsave(&pcpu_lock, flags);
if (ret) {
@@ -1563,10 +1566,17 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
* pcpu_balance_workfn - manage the amount of free chunks and populated pages
* @work: unused
*
- * Reclaim all fully free chunks except for the first one.
+ * Reclaim all fully free chunks except for the first one. This is also
+ * responsible for maintaining the pool of empty populated pages. However,
+ * it is possible that this is called when physical memory is scarce causing
+ * OOM killer to be triggered. We should avoid doing so until an actual
+ * allocation causes the failure as it is possible that requests can be
+ * serviced from already backed regions.
*/
static void pcpu_balance_workfn(struct work_struct *work)
{
+ /* gfp flags passed to underlying allocators */
+ const gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;
LIST_HEAD(to_free);
struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
struct pcpu_chunk *chunk, *next;
@@ -1647,7 +1657,7 @@ static void pcpu_balance_workfn(struct work_struct *work)
chunk->nr_pages) {
int nr = min(re - rs, nr_to_pop);
- ret = pcpu_populate_chunk(chunk, rs, rs + nr);
+ ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
if (!ret) {
nr_to_pop -= nr;
spin_lock_irq(&pcpu_lock);
@@ -1664,7 +1674,7 @@ static void pcpu_balance_workfn(struct work_struct *work)
if (nr_to_pop) {
/* ran out of chunks to populate, create a new one and retry */
- chunk = pcpu_create_chunk();
+ chunk = pcpu_create_chunk(gfp);
if (chunk) {
spin_lock_irq(&pcpu_lock);
pcpu_chunk_relocate(chunk, -1);
--
1.8.3.1
The prior patch added support for passing gfp flags through to the
underlying allocators. This patch exposes that to users, allowing them to
pass along gfp flags (currently only __GFP_NORETRY and __GFP_NOWARN) to the
underlying allocators. This lets users decide whether they are ok with
allocations failing so that they can recover in a more graceful way.
Additionally, the previous patch passed gfp around as additional flags.
Change this to caller-passed semantics instead. GFP_KERNEL is also removed
as the default flag; it continues to be used for internally driven underlying
percpu allocations.
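For illustration only, mirroring the hunks below: GFP_KERNEL is no longer
OR'd in by pcpu_mem_zalloc(), so internal callers pass it explicitly and the
user-facing path forwards only the whitelisted bits:

	/* pcpu_get_pages(): internal allocation, GFP_KERNEL passed explicitly */
	pages = pcpu_mem_zalloc(pages_size, GFP_KERNEL);

	/* pcpu_alloc(): only whitelisted bits from the caller reach the backing allocators */
	gfp_t pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);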
V2:
Removed gfp_percpu_mask in favor of doing it inline.
Removed GFP_KERNEL as a default flag for __alloc_percpu_gfp.
Signed-off-by: Dennis Zhou <[email protected]>
Suggested-by: Daniel Borkmann <[email protected]>
---
mm/percpu-km.c | 2 +-
mm/percpu-vm.c | 4 ++--
mm/percpu.c | 16 +++++++---------
3 files changed, 10 insertions(+), 12 deletions(-)
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index 0d88d7b..38de70a 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -56,7 +56,7 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
if (!chunk)
return NULL;
- pages = alloc_pages(gfp | GFP_KERNEL, order_base_2(nr_pages));
+ pages = alloc_pages(gfp, order_base_2(nr_pages));
if (!pages) {
pcpu_free_chunk(chunk);
return NULL;
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 0af71eb..d8078de 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -37,7 +37,7 @@ static struct page **pcpu_get_pages(void)
lockdep_assert_held(&pcpu_alloc_mutex);
if (!pages)
- pages = pcpu_mem_zalloc(pages_size, 0);
+ pages = pcpu_mem_zalloc(pages_size, GFP_KERNEL);
return pages;
}
@@ -86,7 +86,7 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
unsigned int cpu, tcpu;
int i;
- gfp |= GFP_KERNEL | __GFP_HIGHMEM;
+ gfp |= __GFP_HIGHMEM;
for_each_possible_cpu(cpu) {
for (i = page_start; i < page_end; i++) {
diff --git a/mm/percpu.c b/mm/percpu.c
index f97443d..fa3f854 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -454,9 +454,6 @@ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
* This is to facilitate passing through whitelisted flags. The
* returned memory is always zeroed.
*
- * CONTEXT:
- * Does GFP_KERNEL allocation.
- *
* RETURNS:
* Pointer to the allocated area on success, NULL on failure.
*/
@@ -466,10 +463,9 @@ static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
return NULL;
if (size <= PAGE_SIZE)
- return kzalloc(size, gfp | GFP_KERNEL);
+ return kzalloc(size, gfp);
else
- return __vmalloc(size, gfp | GFP_KERNEL | __GFP_ZERO,
- PAGE_KERNEL);
+ return __vmalloc(size, gfp | __GFP_ZERO, PAGE_KERNEL);
}
/**
@@ -1344,6 +1340,8 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
gfp_t gfp)
{
+ /* whitelisted flags that can be passed to the backing allocators */
+ gfp_t pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
bool do_warn = !(gfp & __GFP_NOWARN);
static int warn_limit = 10;
@@ -1426,7 +1424,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
}
if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
- chunk = pcpu_create_chunk(0);
+ chunk = pcpu_create_chunk(pcpu_gfp);
if (!chunk) {
err = "failed to allocate new chunk";
goto fail;
@@ -1455,7 +1453,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
page_start, page_end) {
WARN_ON(chunk->immutable);
- ret = pcpu_populate_chunk(chunk, rs, re, 0);
+ ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);
spin_lock_irqsave(&pcpu_lock, flags);
if (ret) {
@@ -1576,7 +1574,7 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
static void pcpu_balance_workfn(struct work_struct *work)
{
/* gfp flags passed to underlying allocators */
- const gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;
+ const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
LIST_HEAD(to_free);
struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
struct pcpu_chunk *chunk, *next;
--
1.8.3.1
On Fri, Feb 16, 2018 at 12:09:58PM -0600, Dennis Zhou wrote:
> The prior patch added support for passing gfp flags through to the
> underlying allocators. This patch allows users to pass along gfp flags
> (currently only __GFP_NORETRY and __GFP_NOWARN) to the underlying
> allocators. This should allow users to decide if they are ok with
> failing allocations recovering in a more graceful way.
>
> Additionally, gfp passing was done as additional flags in the previous
> patch. Instead, change this to caller passed semantics. GFP_KERNEL is
> also removed as the default flag. It continues to be used for internally
> caused underlying percpu allocations.
>
> V2:
> Removed gfp_percpu_mask in favor of doing it inline.
> Removed GFP_KERNEL as a default flag for __alloc_percpu_gfp.
>
> Signed-off-by: Dennis Zhou <[email protected]>
> Suggested-by: Daniel Borkmann <[email protected]>
Applied 1-3 to percpu/for-4.16-fixes.
Thanks, Dennis.
--
tejun