This patchset introduces function kvmalloc and kvmalloc_node. These
functions allow reliable allocation of objects of arbitrary size. They
attempt to do allocation with kmalloc and if it fails, use vmalloc. Memory
allocated with these functions should be freed with kvfree.
The patchset makes modification to device mapper to use these functions.
Mikulas
Export the function __vmalloc_node_flags.
Signed-off-by: Mikulas Patocka <[email protected]>
---
include/linux/vmalloc.h | 1 +
mm/vmalloc.c | 4 ++--
2 files changed, 3 insertions(+), 2 deletions(-)
Index: linux-4.1/include/linux/vmalloc.h
===================================================================
--- linux-4.1.orig/include/linux/vmalloc.h 2015-07-02 19:19:43.000000000 +0200
+++ linux-4.1/include/linux/vmalloc.h 2015-07-02 19:20:59.000000000 +0200
@@ -75,6 +75,7 @@ extern void *vmalloc_exec(unsigned long
extern void *vmalloc_32(unsigned long size);
extern void *vmalloc_32_user(unsigned long size);
extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot);
+void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags);
extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask,
pgprot_t prot, unsigned long vm_flags, int node,
Index: linux-4.1/mm/vmalloc.c
===================================================================
--- linux-4.1.orig/mm/vmalloc.c 2015-07-02 19:19:13.000000000 +0200
+++ linux-4.1/mm/vmalloc.c 2015-07-02 19:21:00.000000000 +0200
@@ -1722,12 +1722,12 @@ void *__vmalloc(unsigned long size, gfp_
}
EXPORT_SYMBOL(__vmalloc);
-static inline void *__vmalloc_node_flags(unsigned long size,
- int node, gfp_t flags)
+void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags)
{
return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
node, __builtin_return_address(0));
}
+EXPORT_SYMBOL(__vmalloc_node_flags);
/**
* vmalloc - allocate virtually contiguous memory
Introduce the functions kvmalloc and kvmalloc_node. These functions
provide reliable allocation of object of arbitrary size. They attempt to
do allocation with kmalloc and if it fails, use vmalloc. Memory allocated
with these functions should be freed with kvfree.
Signed-off-by: Mikulas Patocka <[email protected]>
---
include/linux/mm.h | 2 ++
mm/util.c | 37 +++++++++++++++++++++++++++++++++++++
2 files changed, 39 insertions(+)
Index: linux-4.2-rc1/include/linux/mm.h
===================================================================
--- linux-4.2-rc1.orig/include/linux/mm.h 2015-07-07 15:54:36.000000000 +0200
+++ linux-4.2-rc1/include/linux/mm.h 2015-07-07 15:54:58.000000000 +0200
@@ -400,6 +400,8 @@ static inline int is_vmalloc_or_module_a
}
#endif
+extern void *kvmalloc_node(size_t size, gfp_t gfp, int node);
+extern void *kvmalloc(size_t size, gfp_t gfp);
extern void kvfree(const void *addr);
static inline void compound_lock(struct page *page)
Index: linux-4.2-rc1/mm/util.c
===================================================================
--- linux-4.2-rc1.orig/mm/util.c 2015-07-07 15:52:37.000000000 +0200
+++ linux-4.2-rc1/mm/util.c 2015-07-07 15:54:06.000000000 +0200
@@ -316,6 +316,43 @@ unsigned long vm_mmap(struct file *file,
}
EXPORT_SYMBOL(vm_mmap);
+void *kvmalloc_node(size_t size, gfp_t gfp, int node)
+{
+ void *p;
+ unsigned uninitialized_var(noio_flag);
+
+ /* vmalloc doesn't support no-wait allocations */
+ WARN_ON(!(gfp & __GFP_WAIT));
+
+ if (likely(size <= KMALLOC_MAX_SIZE)) {
+ p = kmalloc_node(size, gfp | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN, node);
+ if (likely(p != NULL))
+ return p;
+ }
+ if ((gfp & (__GFP_IO | __GFP_FS)) != (__GFP_IO | __GFP_FS)) {
+ /*
+ * vmalloc allocates page tables with GFP_KERNEL, regardless
+ * of GFP flags passed to it. If we are no GFP_NOIO context,
+ * we call memalloc_noio_save, so that all allocations are
+ * implicitly done with GFP_NOIO.
+ */
+ noio_flag = memalloc_noio_save();
+ gfp |= __GFP_HIGH;
+ }
+ p = __vmalloc_node_flags(size, node, gfp | __GFP_REPEAT | __GFP_HIGHMEM);
+ if ((gfp & (__GFP_IO | __GFP_FS)) != (__GFP_IO | __GFP_FS)) {
+ memalloc_noio_restore(noio_flag);
+ }
+ return p;
+}
+EXPORT_SYMBOL(kvmalloc_node);
+
+void *kvmalloc(size_t size, gfp_t gfp)
+{
+ return kvmalloc_node(size, gfp, NUMA_NO_NODE);
+}
+EXPORT_SYMBOL(kvmalloc);
+
void kvfree(const void *addr)
{
if (is_vmalloc_addr(addr))
Join flags DM_PARAMS_KMALLOC and DM_PARAMS_VMALLOC into just one flag
DM_PARAMS_ALLOC. We can determine if the block was allocated with kmalloc
or vmalloc with the function is_vmalloc_addr, so there is no need to have
separate flags for that.
Signed-off-by: Mikulas Patocka <[email protected]>
---
drivers/md/dm-ioctl.c | 17 +++++++++--------
drivers/md/dm-stats.c | 20 ++++++++++----------
2 files changed, 19 insertions(+), 18 deletions(-)
Index: linux-4.1/drivers/md/dm-ioctl.c
===================================================================
--- linux-4.1.orig/drivers/md/dm-ioctl.c 2015-07-02 18:24:08.000000000 +0200
+++ linux-4.1/drivers/md/dm-ioctl.c 2015-07-02 18:52:52.000000000 +0200
@@ -1668,8 +1668,7 @@ static int check_version(unsigned int cm
return r;
}
-#define DM_PARAMS_KMALLOC 0x0001 /* Params alloced with kmalloc */
-#define DM_PARAMS_VMALLOC 0x0002 /* Params alloced with vmalloc */
+#define DM_PARAMS_ALLOC 0x0001 /* Params alloced with kmalloc or vmalloc */
#define DM_WIPE_BUFFER 0x0010 /* Wipe input buffer before returning from ioctl */
static void free_params(struct dm_ioctl *param, size_t param_size, int param_flags)
@@ -1677,10 +1676,12 @@ static void free_params(struct dm_ioctl
if (param_flags & DM_WIPE_BUFFER)
memset(param, 0, param_size);
- if (param_flags & DM_PARAMS_KMALLOC)
- kfree(param);
- if (param_flags & DM_PARAMS_VMALLOC)
- vfree(param);
+ if (param_flags & DM_PARAMS_ALLOC) {
+ if (is_vmalloc_addr(param))
+ vfree(param);
+ else
+ kfree(param);
+ }
}
static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kernel,
@@ -1715,7 +1716,7 @@ static int copy_params(struct dm_ioctl _
if (param_kernel->data_size <= KMALLOC_MAX_SIZE) {
dmi = kmalloc(param_kernel->data_size, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
if (dmi)
- *param_flags |= DM_PARAMS_KMALLOC;
+ *param_flags |= DM_PARAMS_ALLOC;
}
if (!dmi) {
@@ -1724,7 +1725,7 @@ static int copy_params(struct dm_ioctl _
dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH | __GFP_HIGHMEM, PAGE_KERNEL);
memalloc_noio_restore(noio_flag);
if (dmi)
- *param_flags |= DM_PARAMS_VMALLOC;
+ *param_flags |= DM_PARAMS_ALLOC;
}
if (!dmi) {
Use the function kvmalloc to allocate ioctl parameters.
Signed-off-by: Mikulas Patocka <[email protected]>
---
drivers/md/dm-ioctl.c | 26 +++++---------------------
drivers/md/dm-table.c | 37 +++++++++++++++++++++++++++++++++++++
include/linux/device-mapper.h | 2 ++
3 files changed, 44 insertions(+), 21 deletions(-)
Index: linux-4.1/drivers/md/dm-ioctl.c
===================================================================
--- linux-4.1.orig/drivers/md/dm-ioctl.c 2015-07-02 19:21:15.000000000 +0200
+++ linux-4.1/drivers/md/dm-ioctl.c 2015-07-02 19:21:21.000000000 +0200
@@ -1676,12 +1676,8 @@ static void free_params(struct dm_ioctl
if (param_flags & DM_WIPE_BUFFER)
memset(param, 0, param_size);
- if (param_flags & DM_PARAMS_ALLOC) {
- if (is_vmalloc_addr(param))
- vfree(param);
- else
- kfree(param);
- }
+ if (param_flags & DM_PARAMS_ALLOC)
+ kvfree(param);
}
static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kernel,
@@ -1712,21 +1708,7 @@ static int copy_params(struct dm_ioctl _
* Try to avoid low memory issues when a device is suspended.
* Use kmalloc() rather than vmalloc() when we can.
*/
- dmi = NULL;
- if (param_kernel->data_size <= KMALLOC_MAX_SIZE) {
- dmi = kmalloc(param_kernel->data_size, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
- if (dmi)
- *param_flags |= DM_PARAMS_ALLOC;
- }
-
- if (!dmi) {
- unsigned noio_flag;
- noio_flag = memalloc_noio_save();
- dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH | __GFP_HIGHMEM, PAGE_KERNEL);
- memalloc_noio_restore(noio_flag);
- if (dmi)
- *param_flags |= DM_PARAMS_ALLOC;
- }
+ dmi = kvmalloc(param_kernel->data_size, GFP_NOIO);
if (!dmi) {
if (secure_data && clear_user(user, param_kernel->data_size))
@@ -1734,6 +1716,8 @@ static int copy_params(struct dm_ioctl _
return -ENOMEM;
}
+ *param_flags |= DM_PARAMS_ALLOC;
+
if (copy_from_user(dmi, user, param_kernel->data_size))
goto bad;
Make dm-thin use kvmalloc instead of kmalloc because there was a reported
allocation failure - see
https://bugzilla.redhat.com/show_bug.cgi?id=1225370
Signed-off-by: Mikulas Patocka <[email protected]>
---
drivers/md/dm-thin.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
Index: linux-4.2-rc1/drivers/md/dm-thin.c
===================================================================
--- linux-4.2-rc1.orig/drivers/md/dm-thin.c 2015-07-06 17:32:35.000000000 +0200
+++ linux-4.2-rc1/drivers/md/dm-thin.c 2015-07-06 17:36:28.000000000 +0200
@@ -2791,7 +2791,7 @@ static void __pool_destroy(struct pool *
mempool_destroy(pool->mapping_pool);
dm_deferred_set_destroy(pool->shared_read_ds);
dm_deferred_set_destroy(pool->all_io_ds);
- kfree(pool);
+ kvfree(pool);
}
static struct kmem_cache *_new_mapping_cache;
@@ -2813,7 +2813,7 @@ static struct pool *pool_create(struct m
return (struct pool *)pmd;
}
- pool = kmalloc(sizeof(*pool), GFP_KERNEL);
+ pool = kvmalloc(sizeof(*pool), GFP_KERNEL);
if (!pool) {
*error = "Error allocating memory for pool";
err_p = ERR_PTR(-ENOMEM);
@@ -2908,7 +2908,7 @@ bad_wq:
bad_kcopyd_client:
dm_bio_prison_destroy(pool->prison);
bad_prison:
- kfree(pool);
+ kvfree(pool);
bad_pool:
if (dm_pool_metadata_close(pmd))
DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
Use kvmalloc_node just to clean up the code and remove duplicated logic.
Signed-off-by: Mikulas Patocka <[email protected]>
---
drivers/md/dm-stats.c | 7 +------
1 file changed, 1 insertion(+), 6 deletions(-)
Index: linux-4.1/drivers/md/dm-stats.c
===================================================================
--- linux-4.1.orig/drivers/md/dm-stats.c 2015-07-02 19:21:39.000000000 +0200
+++ linux-4.1/drivers/md/dm-stats.c 2015-07-02 19:23:00.000000000 +0200
@@ -146,12 +146,7 @@ static void *dm_stats_kvzalloc(size_t al
if (!claim_shared_memory(alloc_size))
return NULL;
- if (alloc_size <= KMALLOC_MAX_SIZE) {
- p = kzalloc_node(alloc_size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN, node);
- if (p)
- return p;
- }
- p = vzalloc_node(alloc_size, node);
+ p = kvmalloc_node(alloc_size, GFP_KERNEL | __GFP_ZERO, node);
if (p)
return p;
Make dm_vcalloc use kvmalloc, so that smaller allocations are done with
kmalloc (which is faster).
Signed-off-by: Mikulas Patocka <[email protected]>
---
drivers/md/dm-snap-persistent.c | 2 +-
drivers/md/dm-snap.c | 2 +-
drivers/md/dm-table.c | 8 ++++----
3 files changed, 6 insertions(+), 6 deletions(-)
Index: linux-4.2-rc1/drivers/md/dm-snap-persistent.c
===================================================================
--- linux-4.2-rc1.orig/drivers/md/dm-snap-persistent.c 2015-07-07 15:50:23.000000000 +0200
+++ linux-4.2-rc1/drivers/md/dm-snap-persistent.c 2015-07-07 15:59:43.000000000 +0200
@@ -600,7 +600,7 @@ static void persistent_dtr(struct dm_exc
free_area(ps);
/* Allocated in persistent_read_metadata */
- vfree(ps->callbacks);
+ kvfree(ps->callbacks);
kfree(ps);
}
Index: linux-4.2-rc1/drivers/md/dm-snap.c
===================================================================
--- linux-4.2-rc1.orig/drivers/md/dm-snap.c 2015-07-07 15:50:23.000000000 +0200
+++ linux-4.2-rc1/drivers/md/dm-snap.c 2015-07-07 15:59:43.000000000 +0200
@@ -633,7 +633,7 @@ static void dm_exception_table_exit(stru
kmem_cache_free(mem, ex);
}
- vfree(et->table);
+ kvfree(et->table);
}
static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk)
Index: linux-4.2-rc1/drivers/md/dm-table.c
===================================================================
--- linux-4.2-rc1.orig/drivers/md/dm-table.c 2015-07-07 15:50:25.000000000 +0200
+++ linux-4.2-rc1/drivers/md/dm-table.c 2015-07-07 15:59:43.000000000 +0200
@@ -143,7 +143,7 @@ void *dm_vcalloc(unsigned long nmemb, un
return NULL;
size = nmemb * elem_size;
- addr = vzalloc(size);
+ addr = kvmalloc(size, GFP_KERNEL | __GFP_ZERO);
return addr;
}
@@ -171,7 +171,7 @@ static int alloc_targets(struct dm_table
n_targets = (struct dm_target *) (n_highs + num);
memset(n_highs, -1, sizeof(*n_highs) * num);
- vfree(t->highs);
+ kvfree(t->highs);
t->num_allocated = num;
t->highs = n_highs;
@@ -235,7 +235,7 @@ void dm_table_destroy(struct dm_table *t
/* free the indexes */
if (t->depth >= 2)
- vfree(t->index[t->depth - 2]);
+ kvfree(t->index[t->depth - 2]);
/* free the targets */
for (i = 0; i < t->num_targets; i++) {
@@ -247,7 +247,7 @@ void dm_table_destroy(struct dm_table *t
dm_put_target_type(tgt->type);
}
- vfree(t->highs);
+ kvfree(t->highs);
/* free the device list */
free_devices(&t->devices, t->md);
On Tue, 7 Jul 2015 11:10:09 -0400 (EDT) Mikulas Patocka <[email protected]> wrote:
> Introduce the functions kvmalloc and kvmalloc_node. These functions
> provide reliable allocation of object of arbitrary size. They attempt to
> do allocation with kmalloc and if it fails, use vmalloc. Memory allocated
> with these functions should be freed with kvfree.
Sigh. We've resisted doing this because vmalloc() is somewhat of a bad
thing, and we don't want to make it easy for people to do bad things.
And vmalloc is bad because a) it's slow and b) it does GFP_KERNEL
allocations for page tables and c) it is susceptible to arena
fragmentation.
We'd prefer that people fix their junk so it doesn't depend upon large
contiguous allocations. This isn't userspace - kernel space is hostile
and kernel code should be robust.
So I dunno. Should we continue to make it a bit more awkward to use
vmalloc()? Probably that tactic isn't being very successful - people
will just go ahead and open-code it. And given the surprising amount
of stuff you've placed in kvmalloc_node(), they'll implement it
incorrectly...
How about we compromise: add kvmalloc_node(), but include a BUG_ON("you
suck") to it?
>
> ...
>
> +void *kvmalloc_node(size_t size, gfp_t gfp, int node)
> +{
> + void *p;
> + unsigned uninitialized_var(noio_flag);
> +
> + /* vmalloc doesn't support no-wait allocations */
> + WARN_ON(!(gfp & __GFP_WAIT));
It could be a WARN_ON_ONCE, but that doesn't seem very important.
> + if (likely(size <= KMALLOC_MAX_SIZE)) {
> + p = kmalloc_node(size, gfp | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN, node);
There is no way in which a reader will be able to work out the reason
for this selection of flags. Heck, this reviewer can't work it out
either.
Can we please have a code comment in there which reveals all this?
Also, it would be nice to find a tasteful way of squeezing this into 80
cols.
> + if (likely(p != NULL))
> + return p;
> + }
> + if ((gfp & (__GFP_IO | __GFP_FS)) != (__GFP_IO | __GFP_FS)) {
> + /*
> + * vmalloc allocates page tables with GFP_KERNEL, regardless
> + * of GFP flags passed to it. If we are no GFP_NOIO context,
> + * we call memalloc_noio_save, so that all allocations are
> + * implicitly done with GFP_NOIO.
OK. But why do we turn on __GFP_HIGH?
> + */
> + noio_flag = memalloc_noio_save();
> + gfp |= __GFP_HIGH;
> + }
> + p = __vmalloc_node_flags(size, node, gfp | __GFP_REPEAT | __GFP_HIGHMEM);
Again, please document the __GFP_REPEAT reasoning.
__vmalloc_node_flags() handles __GFP_ZERO, I believe? So we presently
don't have a kvzalloc() - callers are to open-code the __GFP_ZERO.
I suppose we may as well go ahead and add the 4-line wrapper for
kvzalloc().
> + if ((gfp & (__GFP_IO | __GFP_FS)) != (__GFP_IO | __GFP_FS)) {
> + memalloc_noio_restore(noio_flag);
> + }
scripts/checkpatch.pl is your friend!
> + return p;
> +}
> +EXPORT_SYMBOL(kvmalloc_node);
> +
> +void *kvmalloc(size_t size, gfp_t gfp)
> +{
> + return kvmalloc_node(size, gfp, NUMA_NO_NODE);
> +}
> +EXPORT_SYMBOL(kvmalloc);
It's probably better to switch this to a static inline. That's a bit
faster and will save a bit of stack on a stack-heavy code path. Unless
gcc manages to do a tailcall, but it doesn't seem to do that much.
Dne 7.7.2015 v 23:41 Andrew Morton napsal(a):
> On Tue, 7 Jul 2015 11:10:09 -0400 (EDT) Mikulas Patocka <[email protected]> wrote:
>
>> Introduce the functions kvmalloc and kvmalloc_node. These functions
>> provide reliable allocation of object of arbitrary size. They attempt to
>> do allocation with kmalloc and if it fails, use vmalloc. Memory allocated
>> with these functions should be freed with kvfree.
>
> Sigh. We've resisted doing this because vmalloc() is somewhat of a bad
> thing, and we don't want to make it easy for people to do bad things.
>
> And vmalloc is bad because a) it's slow and b) it does GFP_KERNEL
> allocations for page tables and c) it is susceptible to arena
> fragmentation.
>
> We'd prefer that people fix their junk so it doesn't depend upon large
> contiguous allocations. This isn't userspace - kernel space is hostile
> and kernel code should be robust.
>
> So I dunno. Should we continue to make it a bit more awkward to use
> vmalloc()? Probably that tactic isn't being very successful - people
> will just go ahead and open-code it. And given the surprising amount
> of stuff you've placed in kvmalloc_node(), they'll implement it
> incorrectly...
Hi
From my naive view: 4K-128K were nice restriction in the age of 16MB Pentium
machines - but the time has changed and now users need to work with TB of memory.
So if the kernel driver is going to maintain such a huge chunk - it could
hardly fit its resources into KB blocks.
So there are options - you could make complex code inside the driver to
address every little kmalloc-ed chunk (and have a lot of potential for bugs)
or you could always use vmalloc() and leave it on 'slow/GFP_KERNEL'.
So IMHO it's quite right to have the 'middle' road here - if there is enough
memory to proceed with kmalloc - fine and if not - then driver will be
somewhat slower but the coder will not have to spend months of coding
reinvention of the wheel...
Personally I even find 128K pretty small if this limit comes from MB era and
we are in the age of commonly available 32G laptops...
IMHO also it's kind of weird when kernel is not able to satisfy 128K
allocation if there are gigabytes of free RAM in my system - there should be
some defrag process running behind if there is such constrained kmalloc
interface...
Zdenek
On Tue, 7 Jul 2015, Andrew Morton wrote:
> On Tue, 7 Jul 2015 11:10:09 -0400 (EDT) Mikulas Patocka <[email protected]> wrote:
>
> > Introduce the functions kvmalloc and kvmalloc_node. These functions
> > provide reliable allocation of object of arbitrary size. They attempt to
> > do allocation with kmalloc and if it fails, use vmalloc. Memory allocated
> > with these functions should be freed with kvfree.
>
> Sigh. We've resisted doing this because vmalloc() is somewhat of a bad
> thing, and we don't want to make it easy for people to do bad things.
>
> And vmalloc is bad because a) it's slow and b) it does GFP_KERNEL
> allocations for page tables and c) it is susceptible to arena
> fragmentation.
This patch makes less use of vmalloc.
The typical pattern is that someone notices random failures due to memory
fragmentation in some subsystem that uses large kmalloc - so he replaces
kmalloc with vmalloc - and the code gets slower because of that. With this
patch, you can replace many vmalloc users with kvmalloc - and vmalloc will
be used only very rarely, when the memory is too fragmented for kmalloc.
> We'd prefer that people fix their junk so it doesn't depend upon large
> contiguous allocations. This isn't userspace - kernel space is hostile
> and kernel code should be robust.
>
> So I dunno. Should we continue to make it a bit more awkward to use
> vmalloc()? Probably that tactic isn't being very successful - people
> will just go ahead and open-code it. And given the surprising amount
> of stuff you've placed in kvmalloc_node(), they'll implement it
> incorrectly...
>
> How about we compromise: add kvmalloc_node(), but include a BUG_ON("you
> suck") to it?
>
> >
> > ...
> >
> > +void *kvmalloc_node(size_t size, gfp_t gfp, int node)
> > +{
> > + void *p;
> > + unsigned uninitialized_var(noio_flag);
> > +
> > + /* vmalloc doesn't support no-wait allocations */
> > + WARN_ON(!(gfp & __GFP_WAIT));
>
> It could be a WARN_ON_ONCE, but that doesn't seem very important.
>
> > + if (likely(size <= KMALLOC_MAX_SIZE)) {
> > + p = kmalloc_node(size, gfp | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN, node);
>
> There is no way in which a reader will be able to work out the reason
> for this selection of flags. Heck, this reviewer can't work it out
> either.
>
> Can we please have a code comment in there which reveals all this?
>
> Also, it would be nice to find a tasteful way of squeezing this into 80
> cols.
>
> > + if (likely(p != NULL))
> > + return p;
> > + }
> > + if ((gfp & (__GFP_IO | __GFP_FS)) != (__GFP_IO | __GFP_FS)) {
> > + /*
> > + * vmalloc allocates page tables with GFP_KERNEL, regardless
> > + * of GFP flags passed to it. If we are no GFP_NOIO context,
> > + * we call memalloc_noio_save, so that all allocations are
> > + * implicitly done with GFP_NOIO.
>
> OK. But why do we turn on __GFP_HIGH?
>
> > + */
> > + noio_flag = memalloc_noio_save();
> > + gfp |= __GFP_HIGH;
> > + }
> > + p = __vmalloc_node_flags(size, node, gfp | __GFP_REPEAT | __GFP_HIGHMEM);
>
> Again, please document the __GFP_REPEAT reasoning.
>
> __vmalloc_node_flags() handles __GFP_ZERO, I believe? So we presently
> don't have a kvzalloc() - callers are to open-code the __GFP_ZERO.
>
> I suppose we may as well go ahead and add the 4-line wrapper for
> kvzalloc().
>
> > + if ((gfp & (__GFP_IO | __GFP_FS)) != (__GFP_IO | __GFP_FS)) {
> > + memalloc_noio_restore(noio_flag);
> > + }
>
> scripts/checkpatch.pl is your friend!
>
> > + return p;
> > +}
> > +EXPORT_SYMBOL(kvmalloc_node);
> > +
> > +void *kvmalloc(size_t size, gfp_t gfp)
> > +{
> > + return kvmalloc_node(size, gfp, NUMA_NO_NODE);
> > +}
> > +EXPORT_SYMBOL(kvmalloc);
>
> It's probably better to switch this to a static inline. That's a bit
> faster and will save a bit of stack on a stack-heavy code path. Unless
> gcc manages to do a tailcall, but it doesn't seem to do that much.
Here I'm sending next version of the patch with comments added.
From: Mikulas Patocka <[email protected]>
Introduce the functions kvmalloc and kvmalloc_node. These functions
provide reliable allocation of object of arbitrary size. They attempt to
do allocation with kmalloc and if it fails, use vmalloc. Memory allocated
with these functions should be freed with kvfree.
Signed-off-by: Mikulas Patocka <[email protected]>
---
include/linux/mm.h | 5 ++++
mm/util.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 60 insertions(+)
Index: linux-4.2-rc1/include/linux/mm.h
===================================================================
--- linux-4.2-rc1.orig/include/linux/mm.h 2015-07-07 15:58:11.000000000 +0200
+++ linux-4.2-rc1/include/linux/mm.h 2015-07-08 19:22:24.000000000 +0200
@@ -400,6 +400,11 @@ static inline int is_vmalloc_or_module_a
}
#endif
+extern void *kvmalloc_node(size_t size, gfp_t gfp, int node);
+static inline void *kvmalloc(size_t size, gfp_t gfp)
+{
+ return kvmalloc_node(size, gfp, NUMA_NO_NODE);
+}
extern void kvfree(const void *addr);
static inline void compound_lock(struct page *page)
Index: linux-4.2-rc1/mm/util.c
===================================================================
--- linux-4.2-rc1.orig/mm/util.c 2015-07-07 15:58:11.000000000 +0200
+++ linux-4.2-rc1/mm/util.c 2015-07-08 19:22:26.000000000 +0200
@@ -316,6 +316,61 @@ unsigned long vm_mmap(struct file *file,
}
EXPORT_SYMBOL(vm_mmap);
+void *kvmalloc_node(size_t size, gfp_t gfp, int node)
+{
+ void *p;
+ unsigned uninitialized_var(noio_flag);
+
+ /* vmalloc doesn't support no-wait allocations */
+ WARN_ON_ONCE(!(gfp & __GFP_WAIT));
+
+ if (likely(size <= KMALLOC_MAX_SIZE)) {
+ /*
+ * Use __GFP_NORETRY so that we don't loop waiting for the
+ * allocation - we don't have to loop here, if the memory
+ * is too fragmented, we fallback to vmalloc.
+ * Use __GFP_NOMEMALLOC to not allocate from emergency reserves.
+ * This allocation can fail, so we don't need to use
+ * emergency reserves.
+ * Use __GFP_NOWARN to avoid the warning when the allocation
+ * fails because it was too large or because of the above
+ * two flags. There is no need to warn the user because
+ * there is no functionality lost when this allocation
+ * fails - we just fallback to vmalloc.
+ */
+ p = kmalloc_node(size, gfp |
+ __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN, node);
+ if (likely(p != NULL))
+ return p;
+ }
+ if ((gfp & (__GFP_IO | __GFP_FS)) != (__GFP_IO | __GFP_FS)) {
+ /*
+ * vmalloc allocates page tables with GFP_KERNEL, regardless
+ * of GFP flags passed to it. If we are no GFP_NOIO context,
+ * we call memalloc_noio_save, so that all allocations are
+ * implicitly done with GFP_NOIO.
+ */
+ noio_flag = memalloc_noio_save();
+ /*
+ * GFP_NOIO allocations cannot rely on the swapper to free some
+ * memory, so __GFP_HIGH to access the emergency pool, so
+ * that the failure is less likely.
+ */
+ gfp |= __GFP_HIGH;
+ }
+ /*
+ * Use __GFP_REPEAT so that the allocation less likely fails.
+ * Use __GFP_HIGHMEM so that it is possible to allocate pages from high
+ * memory.
+ */
+ p = __vmalloc_node_flags(size, node,
+ gfp | __GFP_REPEAT | __GFP_HIGHMEM);
+ if ((gfp & (__GFP_IO | __GFP_FS)) != (__GFP_IO | __GFP_FS))
+ memalloc_noio_restore(noio_flag);
+ return p;
+}
+EXPORT_SYMBOL(kvmalloc_node);
+
void kvfree(const void *addr)
{
if (is_vmalloc_addr(addr))
On Wed, 8 Jul 2015 19:03:08 -0400 (EDT) Mikulas Patocka <[email protected]> wrote:
>
>
> On Tue, 7 Jul 2015, Andrew Morton wrote:
>
> > On Tue, 7 Jul 2015 11:10:09 -0400 (EDT) Mikulas Patocka <[email protected]> wrote:
> >
> > > Introduce the functions kvmalloc and kvmalloc_node. These functions
> > > provide reliable allocation of object of arbitrary size. They attempt to
> > > do allocation with kmalloc and if it fails, use vmalloc. Memory allocated
> > > with these functions should be freed with kvfree.
> >
> > Sigh. We've resisted doing this because vmalloc() is somewhat of a bad
> > thing, and we don't want to make it easy for people to do bad things.
> >
> > And vmalloc is bad because a) it's slow and b) it does GFP_KERNEL
> > allocations for page tables and c) it is susceptible to arena
> > fragmentation.
>
> This patch makes less use of vmalloc.
>
> The typical pattern is that someone notices random failures due to memory
> fragmentation in some subsystem that uses large kmalloc - so he replaces
> kmalloc with vmalloc - and the code gets slower because of that. With this
> patch, you can replace many vmalloc users with kvmalloc - and vmalloc will
> be used only very rarely, when the memory is too fragmented for kmalloc.
Yes, I guess there is that.
> Here I'm sending next version of the patch with comments added.
You didn't like kvzalloc()? We can always add those later...
> --- linux-4.2-rc1.orig/include/linux/mm.h 2015-07-07 15:58:11.000000000 +0200
> +++ linux-4.2-rc1/include/linux/mm.h 2015-07-08 19:22:24.000000000 +0200
> @@ -400,6 +400,11 @@ static inline int is_vmalloc_or_module_a
> }
> #endif
>
> +extern void *kvmalloc_node(size_t size, gfp_t gfp, int node);
> +static inline void *kvmalloc(size_t size, gfp_t gfp)
> +{
> + return kvmalloc_node(size, gfp, NUMA_NO_NODE);
> +}
> extern void kvfree(const void *addr);
>
> static inline void compound_lock(struct page *page)
> Index: linux-4.2-rc1/mm/util.c
> ===================================================================
> --- linux-4.2-rc1.orig/mm/util.c 2015-07-07 15:58:11.000000000 +0200
> +++ linux-4.2-rc1/mm/util.c 2015-07-08 19:22:26.000000000 +0200
> @@ -316,6 +316,61 @@ unsigned long vm_mmap(struct file *file,
> }
> EXPORT_SYMBOL(vm_mmap);
>
> +void *kvmalloc_node(size_t size, gfp_t gfp, int node)
> +{
> + void *p;
> + unsigned uninitialized_var(noio_flag);
> +
> + /* vmalloc doesn't support no-wait allocations */
> + WARN_ON_ONCE(!(gfp & __GFP_WAIT));
> +
> + if (likely(size <= KMALLOC_MAX_SIZE)) {
> + /*
> + * Use __GFP_NORETRY so that we don't loop waiting for the
> + * allocation - we don't have to loop here, if the memory
> + * is too fragmented, we fallback to vmalloc.
I'm not sure about this decision. The direct reclaim retry code is the
normal default behaviour and becomes more important with larger allocation
attempts. So why turn it off, and make it more likely that we return
vmalloc memory?
> + * Use __GFP_NOMEMALLOC to not allocate from emergency reserves.
> + * This allocation can fail, so we don't need to use
> + * emergency reserves.
> + * Use __GFP_NOWARN to avoid the warning when the allocation
> + * fails because it was too large or because of the above
> + * two flags. There is no need to warn the user because
> + * there is no functionality lost when this allocation
> + * fails - we just fallback to vmalloc.
> + */
> + p = kmalloc_node(size, gfp |
> + __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN, node);
> + if (likely(p != NULL))
> + return p;
> + }
> + if ((gfp & (__GFP_IO | __GFP_FS)) != (__GFP_IO | __GFP_FS)) {
> + /*
> + * vmalloc allocates page tables with GFP_KERNEL, regardless
> + * of GFP flags passed to it. If we are no GFP_NOIO context,
> + * we call memalloc_noio_save, so that all allocations are
> + * implicitly done with GFP_NOIO.
> + */
> + noio_flag = memalloc_noio_save();
> + /*
> + * GFP_NOIO allocations cannot rely on the swapper to free some
> + * memory, so __GFP_HIGH to access the emergency pool, so
> + * that the failure is less likely.
> + */
> + gfp |= __GFP_HIGH;
> + }
> + /*
> + * Use __GFP_REPEAT so that the allocation less likely fails.
> + * Use __GFP_HIGHMEM so that it is possible to allocate pages from high
> + * memory.
> + */
> + p = __vmalloc_node_flags(size, node,
> + gfp | __GFP_REPEAT | __GFP_HIGHMEM);
> + if ((gfp & (__GFP_IO | __GFP_FS)) != (__GFP_IO | __GFP_FS))
> + memalloc_noio_restore(noio_flag);
> + return p;
> +}
> +EXPORT_SYMBOL(kvmalloc_node);
On Wed, 8 Jul 2015, Andrew Morton wrote:
> On Wed, 8 Jul 2015 19:03:08 -0400 (EDT) Mikulas Patocka <[email protected]> wrote:
>
> >
> >
> > On Tue, 7 Jul 2015, Andrew Morton wrote:
> >
> > > On Tue, 7 Jul 2015 11:10:09 -0400 (EDT) Mikulas Patocka <[email protected]> wrote:
> > >
> > > > Introduce the functions kvmalloc and kvmalloc_node. These functions
> > > > provide reliable allocation of object of arbitrary size. They attempt to
> > > > do allocation with kmalloc and if it fails, use vmalloc. Memory allocated
> > > > with these functions should be freed with kvfree.
> > >
> > > Sigh. We've resisted doing this because vmalloc() is somewhat of a bad
> > > thing, and we don't want to make it easy for people to do bad things.
> > >
> > > And vmalloc is bad because a) it's slow and b) it does GFP_KERNEL
> > > allocations for page tables and c) it is susceptible to arena
> > > fragmentation.
> >
> > This patch makes less use of vmalloc.
> >
> > The typical pattern is that someone notices random failures due to memory
> > fragmentation in some subsystem that uses large kmalloc - so he replaces
> > kmalloc with vmalloc - and the code gets slower because of that. With this
> > patch, you can replace many vmalloc users with kvmalloc - and vmalloc will
> > be used only very rarely, when the memory is too fragmented for kmalloc.
>
> Yes, I guess there is that.
>
> > Here I'm sending next version of the patch with comments added.
>
> You didn't like kvzalloc()? We can always add those later...
>
> > --- linux-4.2-rc1.orig/include/linux/mm.h 2015-07-07 15:58:11.000000000 +0200
> > +++ linux-4.2-rc1/include/linux/mm.h 2015-07-08 19:22:24.000000000 +0200
> > @@ -400,6 +400,11 @@ static inline int is_vmalloc_or_module_a
> > }
> > #endif
> >
> > +extern void *kvmalloc_node(size_t size, gfp_t gfp, int node);
> > +static inline void *kvmalloc(size_t size, gfp_t gfp)
> > +{
> > + return kvmalloc_node(size, gfp, NUMA_NO_NODE);
> > +}
> > extern void kvfree(const void *addr);
> >
> > static inline void compound_lock(struct page *page)
> > Index: linux-4.2-rc1/mm/util.c
> > ===================================================================
> > --- linux-4.2-rc1.orig/mm/util.c 2015-07-07 15:58:11.000000000 +0200
> > +++ linux-4.2-rc1/mm/util.c 2015-07-08 19:22:26.000000000 +0200
> > @@ -316,6 +316,61 @@ unsigned long vm_mmap(struct file *file,
> > }
> > EXPORT_SYMBOL(vm_mmap);
> >
> > +void *kvmalloc_node(size_t size, gfp_t gfp, int node)
> > +{
> > + void *p;
> > + unsigned uninitialized_var(noio_flag);
> > +
> > + /* vmalloc doesn't support no-wait allocations */
> > + WARN_ON_ONCE(!(gfp & __GFP_WAIT));
> > +
> > + if (likely(size <= KMALLOC_MAX_SIZE)) {
> > + /*
> > + * Use __GFP_NORETRY so that we don't loop waiting for the
> > + * allocation - we don't have to loop here, if the memory
> > + * is too fragmented, we fallback to vmalloc.
>
> I'm not sure about this decision. The direct reclaim retry code is the
> normal default behaviour and becomes more important with larger allocation
> attempts. So why turn it off, and make it more likely that we return
> vmalloc memory?
It can avoid triggering the OOM killer in case of fragmented memory.
This is general question - if the code can handle allocation failure
gracefully, what gfp flags should it use? Maybe add some flag
__GFP_MAYFAIL instead of __GFP_NORETRY that changes the behavior in
desired way?
Mikulas
On Thu, 9 Jul 2015, Mikulas Patocka wrote:
> > > Index: linux-4.2-rc1/mm/util.c
> > > ===================================================================
> > > --- linux-4.2-rc1.orig/mm/util.c 2015-07-07 15:58:11.000000000 +0200
> > > +++ linux-4.2-rc1/mm/util.c 2015-07-08 19:22:26.000000000 +0200
> > > @@ -316,6 +316,61 @@ unsigned long vm_mmap(struct file *file,
> > > }
> > > EXPORT_SYMBOL(vm_mmap);
> > >
> > > +void *kvmalloc_node(size_t size, gfp_t gfp, int node)
> > > +{
> > > + void *p;
> > > + unsigned uninitialized_var(noio_flag);
> > > +
> > > + /* vmalloc doesn't support no-wait allocations */
> > > + WARN_ON_ONCE(!(gfp & __GFP_WAIT));
> > > +
> > > + if (likely(size <= KMALLOC_MAX_SIZE)) {
> > > + /*
> > > + * Use __GFP_NORETRY so that we don't loop waiting for the
> > > + * allocation - we don't have to loop here, if the memory
> > > + * is too fragmented, we fallback to vmalloc.
> >
> > I'm not sure about this decision. The direct reclaim retry code is the
> > normal default behaviour and becomes more important with larger allocation
> > attempts. So why turn it off, and make it more likely that we return
> > vmalloc memory?
>
> It can avoid triggering the OOM killer in case of fragmented memory.
>
> This is general question - if the code can handle allocation failure
> gracefully, what gfp flags should it use? Maybe add some flag
> __GFP_MAYFAIL instead of __GFP_NORETRY that changes the behavior in
> desired way?
>
There's a misunderstanding in regards to the comment: __GFP_NORETRY
doesn't turn direct reclaim or compaction off, it is still attempted and
with the same priority as any other allocation. This only stops the page
allocator from calling the oom killer, which will free memory or panic the
system, and looping when memory is available.
In regards to the proposal in general, I think it's unnecessary because we
are still left behind with other users who open code their call to
vmalloc. I was interested in commit 058504edd026 ("fs/seq_file: fallback
to vmalloc allocation") since it solved an issue with high memory
fragmentation. Note how it falls back to vmalloc(): _without_ this
__GFP_NORETRY. That's because we only want to fallback when high-order
allocations fail and the page allocator doesn't implicitly loop due to the
order. ext4_kvmalloc(), ext4_kzmalloc() does the same.
The differences in implementations between those that do kmalloc() and
fallback to vmalloc() are different enough that I don't think we need this
addition.
On Tue, Jul 14 2015 at 5:13pm -0400,
David Rientjes <[email protected]> wrote:
> On Thu, 9 Jul 2015, Mikulas Patocka wrote:
>
> > > > Index: linux-4.2-rc1/mm/util.c
> > > > ===================================================================
> > > > --- linux-4.2-rc1.orig/mm/util.c 2015-07-07 15:58:11.000000000 +0200
> > > > +++ linux-4.2-rc1/mm/util.c 2015-07-08 19:22:26.000000000 +0200
> > > > @@ -316,6 +316,61 @@ unsigned long vm_mmap(struct file *file,
> > > > }
> > > > EXPORT_SYMBOL(vm_mmap);
> > > >
> > > > +void *kvmalloc_node(size_t size, gfp_t gfp, int node)
> > > > +{
> > > > + void *p;
> > > > + unsigned uninitialized_var(noio_flag);
> > > > +
> > > > + /* vmalloc doesn't support no-wait allocations */
> > > > + WARN_ON_ONCE(!(gfp & __GFP_WAIT));
> > > > +
> > > > + if (likely(size <= KMALLOC_MAX_SIZE)) {
> > > > + /*
> > > > + * Use __GFP_NORETRY so that we don't loop waiting for the
> > > > + * allocation - we don't have to loop here, if the memory
> > > > + * is too fragmented, we fallback to vmalloc.
> > >
> > > I'm not sure about this decision. The direct reclaim retry code is the
> > > normal default behaviour and becomes more important with larger allocation
> > > attempts. So why turn it off, and make it more likely that we return
> > > vmalloc memory?
> >
> > It can avoid triggering the OOM killer in case of fragmented memory.
> >
> > This is general question - if the code can handle allocation failure
> > gracefully, what gfp flags should it use? Maybe add some flag
> > __GFP_MAYFAIL instead of __GFP_NORETRY that changes the behavior in
> > desired way?
> >
>
> There's a misunderstanding in regards to the comment: __GFP_NORETRY
> doesn't turn direct reclaim or compaction off, it is still attempted and
> with the same priority as any other allocation. This only stops the page
> allocator from calling the oom killer, which will free memory or panic the
> system, and looping when memory is available.
>
> In regards to the proposal in general, I think it's unnecessary because we
> are still left behind with other users who open code their call to
> vmalloc. I was interested in commit 058504edd026 ("fs/seq_file: fallback
> to vmalloc allocation") since it solved an issue with high memory
> fragmentation. Note how it falls back to vmalloc(): _without_ this
> __GFP_NORETRY. That's because we only want to fallback when high-order
> allocations fail and the page allocator doesn't implicitly loop due to the
> order. ext4_kvmalloc(), ext4_kzmalloc() does the same.
>
> The differences in implementations between those that do kmalloc() and
> fallback to vmalloc() are different enough that I don't think we need this
> addition.
Wouldn't mm benefit from acknowledging the pattern people are
open-coding and switching existing code over to official methods for
accomplishing the same?
It is always easier to shoehorn utility functions locally within a
subsystem (be it ext4, dm, etc) but once enough do something in a
similar but different way it really should get elevated.
On Tue, 14 Jul 2015 14:13:16 -0700 (PDT) David Rientjes <[email protected]> wrote:
> There's a misunderstanding in regards to the comment: __GFP_NORETRY
> doesn't turn direct reclaim or compaction off, it is still attempted and
> with the same priority as any other allocation. This only stops the page
> allocator from calling the oom killer, which will free memory or panic the
> system, and looping when memory is available.
include/linux/gfp.h says
* __GFP_NORETRY: The VM implementation must not retry indefinitely.
Can you suggest something more accurate and complete which we can put there?
On Tue, 14 Jul 2015, Mike Snitzer wrote:
> > > > > Index: linux-4.2-rc1/mm/util.c
> > > > > ===================================================================
> > > > > --- linux-4.2-rc1.orig/mm/util.c 2015-07-07 15:58:11.000000000 +0200
> > > > > +++ linux-4.2-rc1/mm/util.c 2015-07-08 19:22:26.000000000 +0200
> > > > > @@ -316,6 +316,61 @@ unsigned long vm_mmap(struct file *file,
> > > > > }
> > > > > EXPORT_SYMBOL(vm_mmap);
> > > > >
> > > > > +void *kvmalloc_node(size_t size, gfp_t gfp, int node)
> > > > > +{
> > > > > + void *p;
> > > > > + unsigned uninitialized_var(noio_flag);
> > > > > +
> > > > > + /* vmalloc doesn't support no-wait allocations */
> > > > > + WARN_ON_ONCE(!(gfp & __GFP_WAIT));
> > > > > +
> > > > > + if (likely(size <= KMALLOC_MAX_SIZE)) {
> > > > > + /*
> > > > > + * Use __GFP_NORETRY so that we don't loop waiting for the
> > > > > + * allocation - we don't have to loop here, if the memory
> > > > > + * is too fragmented, we fallback to vmalloc.
> > > >
> > > > I'm not sure about this decision. The direct reclaim retry code is the
> > > > normal default behaviour and becomes more important with larger allocation
> > > > attempts. So why turn it off, and make it more likely that we return
> > > > vmalloc memory?
> > >
> > > It can avoid triggering the OOM killer in case of fragmented memory.
> > >
> > > This is general question - if the code can handle allocation failure
> > > gracefully, what gfp flags should it use? Maybe add some flag
> > > __GFP_MAYFAIL instead of __GFP_NORETRY that changes the behavior in
> > > desired way?
> > >
> >
> > There's a misunderstanding in regards to the comment: __GFP_NORETRY
> > doesn't turn direct reclaim or compaction off, it is still attempted and
> > with the same priority as any other allocation. This only stops the page
> > allocator from calling the oom killer, which will free memory or panic the
> > system, and looping when memory is available.
> >
> > In regards to the proposal in general, I think it's unnecessary because we
> > are still left behind with other users who open code their call to
> > vmalloc. I was interested in commit 058504edd026 ("fs/seq_file: fallback
> > to vmalloc allocation") since it solved an issue with high memory
> > fragmentation. Note how it falls back to vmalloc(): _without_ this
> > __GFP_NORETRY. That's because we only want to fallback when high-order
> > allocations fail and the page allocator doesn't implicitly loop due to the
> > order. ext4_kvmalloc(), ext4_kzmalloc() does the same.
> >
> > The differences in implementations between those that do kmalloc() and
> > fallback to vmalloc() are different enough that I don't think we need this
> > addition.
>
> Wouldn't mm benefit from acknowledging the pattern people are
> open-coding and switching existing code over to official methods for
> accomplishing the same?
>
Sure, but it's not accomplishing the same thing: things like
ext4_kvmalloc() only want to fallback to vmalloc() when high-order
allocations fail: the function is used for different sizes. This cannot
be converted to kvmalloc_node() since it fallsback immediately when
reclaim fails. Same issue with single_file_open() for the seq_file code.
We could go through every kmalloc() -> vmalloc() fallback for more
examples in the code, but those two instances were the first I looked at
and couldn't be converted to kvmalloc_node() without work.
> It is always easier to shoehorn utility functions locally within a
> subsystem (be it ext4, dm, etc) but once enough do something in a
> similar but different way it really should get elevated.
>
I would argue that
void *ext4_kvmalloc(size_t size, gfp_t flags)
{
void *ret;
ret = kmalloc(size, flags | __GFP_NOWARN);
if (!ret)
ret = __vmalloc(size, flags, PAGE_KERNEL);
return ret;
}
is simple enough that we don't need to convert it to anything.
If all such fallback was done in the same way as the implementation as
kvmalloc_node(), and perhaps only very few exceptions were needed, this
would be helpful. Unfortunately, that isn't the case.
On Tue, Jul 14, 2015 at 02:24:24PM -0700, David Rientjes wrote:
> On Tue, 14 Jul 2015, Mike Snitzer wrote:
>
> > > > > > Index: linux-4.2-rc1/mm/util.c
> > > > > > ===================================================================
> > > > > > --- linux-4.2-rc1.orig/mm/util.c 2015-07-07 15:58:11.000000000 +0200
> > > > > > +++ linux-4.2-rc1/mm/util.c 2015-07-08 19:22:26.000000000 +0200
> > > > > > @@ -316,6 +316,61 @@ unsigned long vm_mmap(struct file *file,
> > > > > > }
> > > > > > EXPORT_SYMBOL(vm_mmap);
> > > > > >
> > > > > > +void *kvmalloc_node(size_t size, gfp_t gfp, int node)
> > > > > > +{
> > > > > > + void *p;
> > > > > > + unsigned uninitialized_var(noio_flag);
> > > > > > +
> > > > > > + /* vmalloc doesn't support no-wait allocations */
> > > > > > + WARN_ON_ONCE(!(gfp & __GFP_WAIT));
> > > > > > +
> > > > > > + if (likely(size <= KMALLOC_MAX_SIZE)) {
> > > > > > + /*
> > > > > > + * Use __GFP_NORETRY so that we don't loop waiting for the
> > > > > > + * allocation - we don't have to loop here, if the memory
> > > > > > + * is too fragmented, we fallback to vmalloc.
> > > > >
> > > > > I'm not sure about this decision. The direct reclaim retry code is the
> > > > > normal default behaviour and becomes more important with larger allocation
> > > > > attempts. So why turn it off, and make it more likely that we return
> > > > > vmalloc memory?
> > > >
> > > > It can avoid triggering the OOM killer in case of fragmented memory.
> > > >
> > > > This is general question - if the code can handle allocation failure
> > > > gracefully, what gfp flags should it use? Maybe add some flag
> > > > __GFP_MAYFAIL instead of __GFP_NORETRY that changes the behavior in
> > > > desired way?
> > > >
> > >
> > > There's a misunderstanding in regards to the comment: __GFP_NORETRY
> > > doesn't turn direct reclaim or compaction off, it is still attempted and
> > > with the same priority as any other allocation. This only stops the page
> > > allocator from calling the oom killer, which will free memory or panic the
> > > system, and looping when memory is available.
> > >
> > > In regards to the proposal in general, I think it's unnecessary because we
> > > are still left behind with other users who open code their call to
> > > vmalloc. I was interested in commit 058504edd026 ("fs/seq_file: fallback
> > > to vmalloc allocation") since it solved an issue with high memory
> > > fragmentation. Note how it falls back to vmalloc(): _without_ this
> > > __GFP_NORETRY. That's because we only want to fallback when high-order
> > > allocations fail and the page allocator doesn't implicitly loop due to the
> > > order. ext4_kvmalloc(), ext4_kzmalloc() does the same.
> > >
> > > The differences in implementations between those that do kmalloc() and
> > > fallback to vmalloc() are different enough that I don't think we need this
> > > addition.
> >
> > Wouldn't mm benefit from acknowledging the pattern people are
> > open-coding and switching existing code over to official methods for
> > accomplishing the same?
> >
>
> Sure, but it's not accomplishing the same thing: things like
> ext4_kvmalloc() only want to fallback to vmalloc() when high-order
> allocations fail: the function is used for different sizes. This cannot
> be converted to kvmalloc_node() since it fallsback immediately when
> reclaim fails. Same issue with single_file_open() for the seq_file code.
> We could go through every kmalloc() -> vmalloc() fallback for more
> examples in the code, but those two instances were the first I looked at
> and couldn't be converted to kvmalloc_node() without work.
>
> > It is always easier to shoehorn utility functions locally within a
> > subsystem (be it ext4, dm, etc) but once enough do something in a
> > similar but different way it really should get elevated.
> >
>
> I would argue that
>
> void *ext4_kvmalloc(size_t size, gfp_t flags)
> {
> void *ret;
>
> ret = kmalloc(size, flags | __GFP_NOWARN);
> if (!ret)
> ret = __vmalloc(size, flags, PAGE_KERNEL);
> return ret;
> }
>
> is simple enough that we don't need to convert it to anything.
Except that it will have problems with GFP_NOFS context when the pte
code inside vmalloc does a GFP_KERNEL allocation. Hence we have
stuff in other subsystems (such as XFS) where we've noticed lockdep
whining about this:
void *
kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
{
unsigned noio_flag = 0;
void *ptr;
gfp_t lflags;
ptr = kmem_zalloc(size, flags | KM_MAYFAIL);
if (ptr)
return ptr;
/*
* __vmalloc() will allocate data pages and auxillary structures (e.g.
* pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context
* here. Hence we need to tell memory reclaim that we are in such a
* context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering
* the filesystem here and potentially deadlocking.
*/
if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
noio_flag = memalloc_noio_save();
lflags = kmem_flags_convert(flags);
ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
memalloc_noio_restore(noio_flag);
return ptr;
}
This allocation context issue needs to be fixed before making
generic kvmalloc() functions available for general use....
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Wed, 15 Jul 2015, Dave Chinner wrote:
> > Sure, but it's not accomplishing the same thing: things like
> > ext4_kvmalloc() only want to fallback to vmalloc() when high-order
> > allocations fail: the function is used for different sizes. This cannot
> > be converted to kvmalloc_node() since it fallsback immediately when
> > reclaim fails. Same issue with single_file_open() for the seq_file code.
> > We could go through every kmalloc() -> vmalloc() fallback for more
> > examples in the code, but those two instances were the first I looked at
> > and couldn't be converted to kvmalloc_node() without work.
> >
> > > It is always easier to shoehorn utility functions locally within a
> > > subsystem (be it ext4, dm, etc) but once enough do something in a
> > > similar but different way it really should get elevated.
> > >
> >
> > I would argue that
> >
> > void *ext4_kvmalloc(size_t size, gfp_t flags)
> > {
> > void *ret;
> >
> > ret = kmalloc(size, flags | __GFP_NOWARN);
> > if (!ret)
> > ret = __vmalloc(size, flags, PAGE_KERNEL);
> > return ret;
> > }
> >
> > is simple enough that we don't need to convert it to anything.
>
> Except that it will have problems with GFP_NOFS context when the pte
> code inside vmalloc does a GFP_KERNEL allocation. Hence we have
> stuff in other subsystems (such as XFS) where we've noticed lockdep
> whining about this:
>
Does anyone have an example of ext4_kvmalloc() having a lockdep violation?
Presumably the GFP_NOFS calls to ext4_kvmalloc() will never have
size > (1 << (PAGE_SHIFT + PAGE_ALLOC_COSTLY_ORDER)) so that kmalloc()
above actually never returns NULL and __vmalloc() only gets used for the
ext4_kvmalloc(..., GFP_KERNEL) call.
It should be fixed, though, probably in the same way as
kmem_zalloc_large() today, but it seems the real fix would be to attack
the whole vmalloc() GFP_KERNEL issue that has been talked about several
times in the past. Then the existing ext4_kvmalloc() implementation
should be fine.
Once that's done, we can revisit the idea of a generalized kvmalloc() or
kvmalloc_node(), but since the implementation such as above is different
from the proposed kvmalloc_node() implementation with respect to
high-order allocations, I doubt a generalized form will be helpful.
On Tue, Jul 14, 2015 at 03:45:40PM -0700, David Rientjes wrote:
> On Wed, 15 Jul 2015, Dave Chinner wrote:
>
> > > Sure, but it's not accomplishing the same thing: things like
> > > ext4_kvmalloc() only want to fallback to vmalloc() when high-order
> > > allocations fail: the function is used for different sizes. This cannot
> > > be converted to kvmalloc_node() since it fallsback immediately when
> > > reclaim fails. Same issue with single_file_open() for the seq_file code.
> > > We could go through every kmalloc() -> vmalloc() fallback for more
> > > examples in the code, but those two instances were the first I looked at
> > > and couldn't be converted to kvmalloc_node() without work.
> > >
> > > > It is always easier to shoehorn utility functions locally within a
> > > > subsystem (be it ext4, dm, etc) but once enough do something in a
> > > > similar but different way it really should get elevated.
> > > >
> > >
> > > I would argue that
> > >
> > > void *ext4_kvmalloc(size_t size, gfp_t flags)
> > > {
> > > void *ret;
> > >
> > > ret = kmalloc(size, flags | __GFP_NOWARN);
> > > if (!ret)
> > > ret = __vmalloc(size, flags, PAGE_KERNEL);
> > > return ret;
> > > }
> > >
> > > is simple enough that we don't need to convert it to anything.
> >
> > Except that it will have problems with GFP_NOFS context when the pte
> > code inside vmalloc does a GFP_KERNEL allocation. Hence we have
> > stuff in other subsystems (such as XFS) where we've noticed lockdep
> > whining about this:
> >
>
> Does anyone have an example of ext4_kvmalloc() having a lockdep violation?
> Presumably the GFP_NOFS calls to ext4_kvmalloc() will never have
> size > (1 << (PAGE_SHIFT + PAGE_ALLOC_COSTLY_ORDER)) so that kmalloc()
> above actually never returns NULL and __vmalloc() only gets used for the
> ext4_kvmalloc(..., GFP_KERNEL) call.
Code inspection is all that is necessary. For example,
fs/ext4/resize.c::add_new_gdb() does:
818 n_group_desc = ext4_kvmalloc((gdb_num + 1) *
819 sizeof(struct buffer_head *),
820 GFP_NOFS);
I have to assume this was done because resize was failing kmalloc()
calls on large filesystems in GFP_NOFS context as the commit message
is less than helpful:
commit f18a5f21c25707b4fe64b326e2b4d150565e7300
Author: Theodore Ts'o <[email protected]>
Date: Mon Aug 1 08:45:38 2011 -0400
ext4: use ext4_kvzalloc()/ext4_kvmalloc() for s_group_desc and s_group_info
Signed-off-by: "Theodore Ts'o" <[email protected]>
> It should be fixed, though, probably in the same way as
> kmem_zalloc_large() today, but it seems the real fix would be to attack
> the whole vmalloc() GFP_KERNEL issue that has been talked about several
> times in the past. Then the existing ext4_kvmalloc() implementation
> should be fine.
Agreed, we really need to ensure that the generic kernel allocation
functions follow the context guidelines they are provided by
callers. I'm not going to hold my breathe waiting for this to
happen, though....
Cheers,
Dave.
--
Dave Chinner
[email protected]