2018-07-03 22:50:11

by Mikulas Patocka

[permalink] [raw]
Subject: [PATCH] mm: set PF_LESS_THROTTLE when allocating memory for i/o

It has been noticed that congestion throttling can slow down allocation
paths that participate in the IO and thus help the memory reclaim.
Stalling those allocations is therefore not productive. Moreover, the
mempool allocator and md variants of the same already implement their own
throttling, which is feedback driven and thus works better. Stalling at
the page allocator is therefore even counterproductive.

PF_LESS_THROTTLE is a task flag denoting allocation context that is
participating in the memory reclaim which fits into these IO paths
model, so use the flag and make the page allocator aware they are
special and they do not really want any dirty data throttling.

The throttling causes stalls on Android - it uses the dm-verity driver
that uses dm-bufio. Allocations in dm-bufio were observed to sleep in
wait_iff_congested repeatedly.

Signed-off-by: Mikulas Patocka <[email protected]>
Acked-by: Michal Hocko <[email protected]> # mempool_alloc and bvec_alloc
Cc: [email protected]

---
block/bio.c | 4 ++++
drivers/md/dm-bufio.c | 14 +++++++++++---
drivers/md/dm-crypt.c | 8 ++++++++
drivers/md/dm-integrity.c | 4 ++++
drivers/md/dm-kcopyd.c | 3 +++
drivers/md/dm-verity-target.c | 4 ++++
drivers/md/dm-writecache.c | 4 ++++
mm/mempool.c | 4 ++++
8 files changed, 42 insertions(+), 3 deletions(-)

Index: linux-2.6/mm/mempool.c
===================================================================
--- linux-2.6.orig/mm/mempool.c 2018-06-29 03:47:16.290000000 +0200
+++ linux-2.6/mm/mempool.c 2018-06-29 03:47:16.270000000 +0200
@@ -369,6 +369,7 @@ void *mempool_alloc(mempool_t *pool, gfp
unsigned long flags;
wait_queue_entry_t wait;
gfp_t gfp_temp;
+ unsigned old_flags;

VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
@@ -381,7 +382,10 @@ void *mempool_alloc(mempool_t *pool, gfp

repeat_alloc:

+ old_flags = current->flags & PF_LESS_THROTTLE;
+ current->flags |= PF_LESS_THROTTLE;
element = pool->alloc(gfp_temp, pool->pool_data);
+ current_restore_flags(old_flags, PF_LESS_THROTTLE);
if (likely(element != NULL))
return element;

Index: linux-2.6/block/bio.c
===================================================================
--- linux-2.6.orig/block/bio.c 2018-06-29 03:47:16.290000000 +0200
+++ linux-2.6/block/bio.c 2018-06-29 03:47:16.270000000 +0200
@@ -217,6 +217,7 @@ fallback:
} else {
struct biovec_slab *bvs = bvec_slabs + *idx;
gfp_t __gfp_mask = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO);
+ unsigned old_flags;

/*
* Make this allocation restricted and don't dump info on
@@ -229,7 +230,10 @@ fallback:
* Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM
* is set, retry with the 1-entry mempool
*/
+ old_flags = current->flags & PF_LESS_THROTTLE;
+ current->flags |= PF_LESS_THROTTLE;
bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
+ current_restore_flags(old_flags, PF_LESS_THROTTLE);
if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) {
*idx = BVEC_POOL_MAX;
goto fallback;
Index: linux-2.6/drivers/md/dm-bufio.c
===================================================================
--- linux-2.6.orig/drivers/md/dm-bufio.c 2018-06-29 03:47:16.290000000 +0200
+++ linux-2.6/drivers/md/dm-bufio.c 2018-06-29 03:47:16.270000000 +0200
@@ -356,6 +356,7 @@ static void __cache_size_refresh(void)
static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
unsigned char *data_mode)
{
+ void *ptr;
if (unlikely(c->slab_cache != NULL)) {
*data_mode = DATA_MODE_SLAB;
return kmem_cache_alloc(c->slab_cache, gfp_mask);
@@ -363,9 +364,14 @@ static void *alloc_buffer_data(struct dm

if (c->block_size <= KMALLOC_MAX_SIZE &&
gfp_mask & __GFP_NORETRY) {
+ unsigned old_flags;
*data_mode = DATA_MODE_GET_FREE_PAGES;
- return (void *)__get_free_pages(gfp_mask,
+ old_flags = current->flags & PF_LESS_THROTTLE;
+ current->flags |= PF_LESS_THROTTLE;
+ ptr = (void *)__get_free_pages(gfp_mask,
c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
+ current_restore_flags(old_flags, PF_LESS_THROTTLE);
+ return ptr;
}

*data_mode = DATA_MODE_VMALLOC;
@@ -381,8 +387,10 @@ static void *alloc_buffer_data(struct dm
*/
if (gfp_mask & __GFP_NORETRY) {
unsigned noio_flag = memalloc_noio_save();
- void *ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
-
+ unsigned old_flags = current->flags & PF_LESS_THROTTLE;
+ current->flags |= PF_LESS_THROTTLE;
+ ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
+ current_restore_flags(old_flags, PF_LESS_THROTTLE);
memalloc_noio_restore(noio_flag);
return ptr;
}
Index: linux-2.6/drivers/md/dm-integrity.c
===================================================================
--- linux-2.6.orig/drivers/md/dm-integrity.c 2018-06-29 03:47:16.290000000 +0200
+++ linux-2.6/drivers/md/dm-integrity.c 2018-06-29 03:47:16.270000000 +0200
@@ -1318,6 +1318,7 @@ static void integrity_metadata(struct wo
int r;

if (ic->internal_hash) {
+ unsigned old_flags;
struct bvec_iter iter;
struct bio_vec bv;
unsigned digest_size = crypto_shash_digestsize(ic->internal_hash);
@@ -1331,8 +1332,11 @@ static void integrity_metadata(struct wo
if (unlikely(ic->mode == 'R'))
goto skip_io;

+ old_flags = current->flags & PF_LESS_THROTTLE;
+ current->flags |= PF_LESS_THROTTLE;
checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space,
GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
+ current_restore_flags(old_flags, PF_LESS_THROTTLE);
if (!checksums)
checksums = checksums_onstack;

Index: linux-2.6/drivers/md/dm-kcopyd.c
===================================================================
--- linux-2.6.orig/drivers/md/dm-kcopyd.c 2018-06-29 03:47:16.290000000 +0200
+++ linux-2.6/drivers/md/dm-kcopyd.c 2018-06-29 03:47:16.270000000 +0200
@@ -245,7 +245,10 @@ static int kcopyd_get_pages(struct dm_kc
*pages = NULL;

do {
+ unsigned old_flags = current->flags & PF_LESS_THROTTLE;
+ current->flags |= PF_LESS_THROTTLE;
pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY | __GFP_KSWAPD_RECLAIM);
+ current_restore_flags(old_flags, PF_LESS_THROTTLE);
if (unlikely(!pl)) {
/* Use reserved pages */
pl = kc->pages;
Index: linux-2.6/drivers/md/dm-verity-target.c
===================================================================
--- linux-2.6.orig/drivers/md/dm-verity-target.c 2018-06-29 03:47:16.290000000 +0200
+++ linux-2.6/drivers/md/dm-verity-target.c 2018-06-29 03:47:16.280000000 +0200
@@ -596,9 +596,13 @@ no_prefetch_cluster:
static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io)
{
struct dm_verity_prefetch_work *pw;
+ unsigned old_flags;

+ old_flags = current->flags & PF_LESS_THROTTLE;
+ current->flags |= PF_LESS_THROTTLE;
pw = kmalloc(sizeof(struct dm_verity_prefetch_work),
GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ current_restore_flags(old_flags, PF_LESS_THROTTLE);

if (!pw)
return;
Index: linux-2.6/drivers/md/dm-writecache.c
===================================================================
--- linux-2.6.orig/drivers/md/dm-writecache.c 2018-06-29 03:47:16.290000000 +0200
+++ linux-2.6/drivers/md/dm-writecache.c 2018-06-29 03:47:16.280000000 +0200
@@ -1473,6 +1473,7 @@ static void __writecache_writeback_pmem(
unsigned max_pages;

while (wbl->size) {
+ unsigned old_flags;
wbl->size--;
e = container_of(wbl->list.prev, struct wc_entry, lru);
list_del(&e->lru);
@@ -1486,6 +1487,8 @@ static void __writecache_writeback_pmem(
bio_set_dev(&wb->bio, wc->dev->bdev);
wb->bio.bi_iter.bi_sector = read_original_sector(wc, e);
wb->page_offset = PAGE_SIZE;
+ old_flags = current->flags & PF_LESS_THROTTLE;
+ current->flags |= PF_LESS_THROTTLE;
if (max_pages <= WB_LIST_INLINE ||
unlikely(!(wb->wc_list = kmalloc(max_pages * sizeof(struct wc_entry *),
GFP_NOIO | __GFP_NORETRY |
@@ -1493,6 +1496,7 @@ static void __writecache_writeback_pmem(
wb->wc_list = wb->wc_list_inline;
max_pages = WB_LIST_INLINE;
}
+ current_restore_flags(old_flags, PF_LESS_THROTTLE);

BUG_ON(!wc_add_block(wb, e, GFP_NOIO));

Index: linux-2.6/drivers/md/dm-crypt.c
===================================================================
--- linux-2.6.orig/drivers/md/dm-crypt.c 2018-06-29 03:47:16.290000000 +0200
+++ linux-2.6/drivers/md/dm-crypt.c 2018-06-29 03:47:16.280000000 +0200
@@ -2181,12 +2181,16 @@ static void *crypt_page_alloc(gfp_t gfp_
{
struct crypt_config *cc = pool_data;
struct page *page;
+ unsigned old_flags;

if (unlikely(percpu_counter_compare(&cc->n_allocated_pages, dm_crypt_pages_per_client) >= 0) &&
likely(gfp_mask & __GFP_NORETRY))
return NULL;

+ old_flags = current->flags & PF_LESS_THROTTLE;
+ current->flags |= PF_LESS_THROTTLE;
page = alloc_page(gfp_mask);
+ current_restore_flags(old_flags, PF_LESS_THROTTLE);
if (likely(page != NULL))
percpu_counter_add(&cc->n_allocated_pages, 1);

@@ -2893,7 +2897,10 @@ static int crypt_map(struct dm_target *t

if (cc->on_disk_tag_size) {
unsigned tag_len = cc->on_disk_tag_size * (bio_sectors(bio) >> cc->sector_shift);
+ unsigned old_flags;

+ old_flags = current->flags & PF_LESS_THROTTLE;
+ current->flags |= PF_LESS_THROTTLE;
if (unlikely(tag_len > KMALLOC_MAX_SIZE) ||
unlikely(!(io->integrity_metadata = kmalloc(tag_len,
GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
@@ -2902,6 +2909,7 @@ static int crypt_map(struct dm_target *t
io->integrity_metadata = mempool_alloc(&cc->tag_pool, GFP_NOIO);
io->integrity_metadata_from_pool = true;
}
+ current_restore_flags(old_flags, PF_LESS_THROTTLE);
}

if (crypt_integrity_aead(cc))


2018-07-05 19:55:21

by John Stoffel

[permalink] [raw]
Subject: Re: [dm-devel] [PATCH] mm: set PF_LESS_THROTTLE when allocating memory for i/o

>>>>> "Mikulas" == Mikulas Patocka <[email protected]> writes:

Mikulas> It has been noticed that congestion throttling can slow down
Mikulas> allocations path that participate in the IO and thus help the
Mikulas> memory reclaim. Stalling those allocation is therefore not
Mikulas> productive. Moreover mempool allocator and md variants of the
Mikulas> same already implement their own throttling which has a
Mikulas> better way to be feedback driven. Stalling at the page
Mikulas> allocator is therefore even counterproductive.

Can you show numbers for this claim? I would think that throttling
needs to be done as close to the disk as possible, and propagate back
up the layers to have this all work well, so that faster devices (and
the layers stacked on them) will work better without stalling on a
slow device.


Mikulas> PF_LESS_THROTTLE is a task flag denoting allocation context that is
Mikulas> participating in the memory reclaim which fits into these IO paths
Mikulas> model, so use the flag and make the page allocator aware they are
Mikulas> special and they do not really want any dirty data throttling.

Mikulas> The throttling causes stalls on Android - it uses the dm-verity driver
Mikulas> that uses dm-bufio. Allocations in dm-bufio were observed to sleep in
Mikulas> wait_iff_congested repeatedly.

Mikulas> Signed-off-by: Mikulas Patocka <[email protected]>
Mikulas> Acked-by: Michal Hocko <[email protected]> # mempool_alloc and bvec_alloc
Mikulas> Cc: [email protected]

Mikulas> ---
Mikulas> block/bio.c | 4 ++++
Mikulas> drivers/md/dm-bufio.c | 14 +++++++++++---
Mikulas> drivers/md/dm-crypt.c | 8 ++++++++
Mikulas> drivers/md/dm-integrity.c | 4 ++++
Mikulas> drivers/md/dm-kcopyd.c | 3 +++
Mikulas> drivers/md/dm-verity-target.c | 4 ++++
Mikulas> drivers/md/dm-writecache.c | 4 ++++
Mikulas> mm/mempool.c | 4 ++++
Mikulas> 8 files changed, 42 insertions(+), 3 deletions(-)

Mikulas> Index: linux-2.6/mm/mempool.c
Mikulas> ===================================================================
Mikulas> --- linux-2.6.orig/mm/mempool.c 2018-06-29 03:47:16.290000000 +0200
Mikulas> +++ linux-2.6/mm/mempool.c 2018-06-29 03:47:16.270000000 +0200
Mikulas> @@ -369,6 +369,7 @@ void *mempool_alloc(mempool_t *pool, gfp
Mikulas> unsigned long flags;
Mikulas> wait_queue_entry_t wait;
Mikulas> gfp_t gfp_temp;
Mikulas> + unsigned old_flags;

Mikulas> VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
Mikulas> might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
Mikulas> @@ -381,7 +382,10 @@ void *mempool_alloc(mempool_t *pool, gfp

Mikulas> repeat_alloc:

Mikulas> + old_flags = current->flags & PF_LESS_THROTTLE;
Mikulas> + current->flags |= PF_LESS_THROTTLE;
Mikulas> element = pool->alloc(gfp_temp, pool->pool_data);
Mikulas> + current_restore_flags(old_flags, PF_LESS_THROTTLE);
Mikulas> if (likely(element != NULL))
Mikulas> return element;

Mikulas> Index: linux-2.6/block/bio.c
Mikulas> ===================================================================
Mikulas> --- linux-2.6.orig/block/bio.c 2018-06-29 03:47:16.290000000 +0200
Mikulas> +++ linux-2.6/block/bio.c 2018-06-29 03:47:16.270000000 +0200
Mikulas> @@ -217,6 +217,7 @@ fallback:
Mikulas> } else {
Mikulas> struct biovec_slab *bvs = bvec_slabs + *idx;
Mikulas> gfp_t __gfp_mask = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO);
Mikulas> + unsigned old_flags;

Mikulas> /*
Mikulas> * Make this allocation restricted and don't dump info on
Mikulas> @@ -229,7 +230,10 @@ fallback:
Mikulas> * Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM
Mikulas> * is set, retry with the 1-entry mempool
Mikulas> */
Mikulas> + old_flags = current->flags & PF_LESS_THROTTLE;
Mikulas> + current->flags |= PF_LESS_THROTTLE;
Mikulas> bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
Mikulas> + current_restore_flags(old_flags, PF_LESS_THROTTLE);
Mikulas> if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) {
Mikulas> *idx = BVEC_POOL_MAX;
Mikulas> goto fallback;
Mikulas> Index: linux-2.6/drivers/md/dm-bufio.c
Mikulas> ===================================================================
Mikulas> --- linux-2.6.orig/drivers/md/dm-bufio.c 2018-06-29 03:47:16.290000000 +0200
Mikulas> +++ linux-2.6/drivers/md/dm-bufio.c 2018-06-29 03:47:16.270000000 +0200
Mikulas> @@ -356,6 +356,7 @@ static void __cache_size_refresh(void)
Mikulas> static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
Mikulas> unsigned char *data_mode)
Mikulas> {
Mikulas> + void *ptr;
Mikulas> if (unlikely(c->slab_cache != NULL)) {
Mikulas> *data_mode = DATA_MODE_SLAB;
Mikulas> return kmem_cache_alloc(c->slab_cache, gfp_mask);
Mikulas> @@ -363,9 +364,14 @@ static void *alloc_buffer_data(struct dm

Mikulas> if (c->block_size <= KMALLOC_MAX_SIZE &&
Mikulas> gfp_mask & __GFP_NORETRY) {
Mikulas> + unsigned old_flags;
Mikulas> *data_mode = DATA_MODE_GET_FREE_PAGES;
Mikulas> - return (void *)__get_free_pages(gfp_mask,
Mikulas> + old_flags = current->flags & PF_LESS_THROTTLE;
Mikulas> + current->flags |= PF_LESS_THROTTLE;
Mikulas> + ptr = (void *)__get_free_pages(gfp_mask,
Mikulas> c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
Mikulas> + current_restore_flags(old_flags, PF_LESS_THROTTLE);
Mikulas> + return ptr;
Mikulas> }

Mikulas> *data_mode = DATA_MODE_VMALLOC;
Mikulas> @@ -381,8 +387,10 @@ static void *alloc_buffer_data(struct dm
Mikulas> */
Mikulas> if (gfp_mask & __GFP_NORETRY) {
Mikulas> unsigned noio_flag = memalloc_noio_save();
Mikulas> - void *ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
Mikulas> -
Mikulas> + unsigned old_flags = current->flags & PF_LESS_THROTTLE;
Mikulas> + current->flags |= PF_LESS_THROTTLE;
Mikulas> + ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
Mikulas> + current_restore_flags(old_flags, PF_LESS_THROTTLE);
Mikulas> memalloc_noio_restore(noio_flag);
Mikulas> return ptr;
Mikulas> }
Mikulas> Index: linux-2.6/drivers/md/dm-integrity.c
Mikulas> ===================================================================
Mikulas> --- linux-2.6.orig/drivers/md/dm-integrity.c 2018-06-29 03:47:16.290000000 +0200
Mikulas> +++ linux-2.6/drivers/md/dm-integrity.c 2018-06-29 03:47:16.270000000 +0200
Mikulas> @@ -1318,6 +1318,7 @@ static void integrity_metadata(struct wo
Mikulas> int r;

Mikulas> if (ic->internal_hash) {
Mikulas> + unsigned old_flags;
Mikulas> struct bvec_iter iter;
Mikulas> struct bio_vec bv;
Mikulas> unsigned digest_size = crypto_shash_digestsize(ic->internal_hash);
Mikulas> @@ -1331,8 +1332,11 @@ static void integrity_metadata(struct wo
Mikulas> if (unlikely(ic->mode == 'R'))
Mikulas> goto skip_io;

Mikulas> + old_flags = current->flags & PF_LESS_THROTTLE;
Mikulas> + current->flags |= PF_LESS_THROTTLE;
Mikulas> checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space,
Mikulas> GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
Mikulas> + current_restore_flags(old_flags, PF_LESS_THROTTLE);
Mikulas> if (!checksums)
Mikulas> checksums = checksums_onstack;

Mikulas> Index: linux-2.6/drivers/md/dm-kcopyd.c
Mikulas> ===================================================================
Mikulas> --- linux-2.6.orig/drivers/md/dm-kcopyd.c 2018-06-29 03:47:16.290000000 +0200
Mikulas> +++ linux-2.6/drivers/md/dm-kcopyd.c 2018-06-29 03:47:16.270000000 +0200
Mikulas> @@ -245,7 +245,10 @@ static int kcopyd_get_pages(struct dm_kc
Mikulas> *pages = NULL;

Mikulas> do {
Mikulas> + unsigned old_flags = current->flags & PF_LESS_THROTTLE;
Mikulas> + current->flags |= PF_LESS_THROTTLE;
Mikulas> pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY | __GFP_KSWAPD_RECLAIM);
Mikulas> + current_restore_flags(old_flags, PF_LESS_THROTTLE);
Mikulas> if (unlikely(!pl)) {
Mikulas> /* Use reserved pages */
Mikulas> pl = kc->pages;
Mikulas> Index: linux-2.6/drivers/md/dm-verity-target.c
Mikulas> ===================================================================
Mikulas> --- linux-2.6.orig/drivers/md/dm-verity-target.c 2018-06-29 03:47:16.290000000 +0200
Mikulas> +++ linux-2.6/drivers/md/dm-verity-target.c 2018-06-29 03:47:16.280000000 +0200
Mikulas> @@ -596,9 +596,13 @@ no_prefetch_cluster:
Mikulas> static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io)
Mikulas> {
Mikulas> struct dm_verity_prefetch_work *pw;
Mikulas> + unsigned old_flags;

Mikulas> + old_flags = current->flags & PF_LESS_THROTTLE;
Mikulas> + current->flags |= PF_LESS_THROTTLE;
Mikulas> pw = kmalloc(sizeof(struct dm_verity_prefetch_work),
Mikulas> GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
Mikulas> + current_restore_flags(old_flags, PF_LESS_THROTTLE);

Mikulas> if (!pw)
Mikulas> return;
Mikulas> Index: linux-2.6/drivers/md/dm-writecache.c
Mikulas> ===================================================================
Mikulas> --- linux-2.6.orig/drivers/md/dm-writecache.c 2018-06-29 03:47:16.290000000 +0200
Mikulas> +++ linux-2.6/drivers/md/dm-writecache.c 2018-06-29 03:47:16.280000000 +0200
Mikulas> @@ -1473,6 +1473,7 @@ static void __writecache_writeback_pmem(
Mikulas> unsigned max_pages;

Mikulas> while (wbl->size) {
Mikulas> + unsigned old_flags;
Mikulas> wbl->size--;
Mikulas> e = container_of(wbl->list.prev, struct wc_entry, lru);
Mikulas> list_del(&e->lru);
Mikulas> @@ -1486,6 +1487,8 @@ static void __writecache_writeback_pmem(
Mikulas> bio_set_dev(&wb->bio, wc->dev->bdev);
Mikulas> wb->bio.bi_iter.bi_sector = read_original_sector(wc, e);
Mikulas> wb->page_offset = PAGE_SIZE;
Mikulas> + old_flags = current->flags & PF_LESS_THROTTLE;
Mikulas> + current->flags |= PF_LESS_THROTTLE;
Mikulas> if (max_pages <= WB_LIST_INLINE ||
Mikulas> unlikely(!(wb->wc_list = kmalloc(max_pages * sizeof(struct wc_entry *),
Mikulas> GFP_NOIO | __GFP_NORETRY |
Mikulas> @@ -1493,6 +1496,7 @@ static void __writecache_writeback_pmem(
Mikulas> wb->wc_list = wb->wc_list_inline;
Mikulas> max_pages = WB_LIST_INLINE;
Mikulas> }
Mikulas> + current_restore_flags(old_flags, PF_LESS_THROTTLE);

Mikulas> BUG_ON(!wc_add_block(wb, e, GFP_NOIO));

Mikulas> Index: linux-2.6/drivers/md/dm-crypt.c
Mikulas> ===================================================================
Mikulas> --- linux-2.6.orig/drivers/md/dm-crypt.c 2018-06-29 03:47:16.290000000 +0200
Mikulas> +++ linux-2.6/drivers/md/dm-crypt.c 2018-06-29 03:47:16.280000000 +0200
Mikulas> @@ -2181,12 +2181,16 @@ static void *crypt_page_alloc(gfp_t gfp_
Mikulas> {
Mikulas> struct crypt_config *cc = pool_data;
Mikulas> struct page *page;
Mikulas> + unsigned old_flags;

Mikulas> if (unlikely(percpu_counter_compare(&cc->n_allocated_pages, dm_crypt_pages_per_client) >= 0) &&
Mikulas> likely(gfp_mask & __GFP_NORETRY))
Mikulas> return NULL;

Mikulas> + old_flags = current->flags & PF_LESS_THROTTLE;
Mikulas> + current->flags |= PF_LESS_THROTTLE;
Mikulas> page = alloc_page(gfp_mask);
Mikulas> + current_restore_flags(old_flags, PF_LESS_THROTTLE);
Mikulas> if (likely(page != NULL))
Mikulas> percpu_counter_add(&cc->n_allocated_pages, 1);

Mikulas> @@ -2893,7 +2897,10 @@ static int crypt_map(struct dm_target *t

Mikulas> if (cc->on_disk_tag_size) {
Mikulas> unsigned tag_len = cc->on_disk_tag_size * (bio_sectors(bio) >> cc->sector_shift);
Mikulas> + unsigned old_flags;

Mikulas> + old_flags = current->flags & PF_LESS_THROTTLE;
Mikulas> + current->flags |= PF_LESS_THROTTLE;
Mikulas> if (unlikely(tag_len > KMALLOC_MAX_SIZE) ||
Mikulas> unlikely(!(io->integrity_metadata = kmalloc(tag_len,
Mikulas> GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
Mikulas> @@ -2902,6 +2909,7 @@ static int crypt_map(struct dm_target *t
Mikulas> io->integrity_metadata = mempool_alloc(&cc->tag_pool, GFP_NOIO);
Mikulas> io->integrity_metadata_from_pool = true;
Mikulas> }
Mikulas> + current_restore_flags(old_flags, PF_LESS_THROTTLE);
Mikulas> }

Mikulas> if (crypt_integrity_aead(cc))

Mikulas> --
Mikulas> dm-devel mailing list
Mikulas> [email protected]
Mikulas> https://www.redhat.com/mailman/listinfo/dm-devel

2018-07-10 01:47:58

by Mikulas Patocka

[permalink] [raw]
Subject: Re: [dm-devel] [PATCH] mm: set PF_LESS_THROTTLE when allocating memory for i/o



On Thu, 5 Jul 2018, John Stoffel wrote:

> >>>>> "Mikulas" == Mikulas Patocka <[email protected]> writes:
>
> Mikulas> It has been noticed that congestion throttling can slow down
> Mikulas> allocations path that participate in the IO and thus help the
> Mikulas> memory reclaim. Stalling those allocation is therefore not
> Mikulas> productive. Moreover mempool allocator and md variants of the
> Mikulas> same already implement their own throttling which has a
> Mikulas> better way to be feedback driven. Stalling at the page
> Mikulas> allocator is therefore even counterproductive.
>
> Can you show numbers for this claim?

This is bug report where this problem is reported:
https://www.redhat.com/archives/dm-devel/2018-June/msg00161.html

A process takes dm-bufio lock, allocates memory and memory allocation
sleeps in wait_iff_congested. kswapd is attempting to take the dm-bufio
lock as well, so sleeping in wait_iff_congested delays kswapd. Delaying
kswapd prevents the system from reclaiming memory.

> I would think that throttling needs to be done as close to the disk as
> possible, and propagate back up the layers to have this all work well,
> so that faster devices (and the layers stacked on them) will work better
> without stalling on a slow device.

The drivers that participate in memory reclaim shouldn't be throttled at
all. If you throttle them, you slow down reclaim - and if you slow down
reclaim, the congestion will not be resolved.

> Mikulas> PF_LESS_THROTTLE is a task flag denoting allocation context that is
> Mikulas> participating in the memory reclaim which fits into these IO paths
> Mikulas> model, so use the flag and make the page allocator aware they are
> Mikulas> special and they do not really want any dirty data throttling.
>
> Mikulas> The throttling causes stalls on Android - it uses the dm-verity driver
> Mikulas> that uses dm-bufio. Allocations in dm-bufio were observed to sleep in
> Mikulas> wait_iff_congested repeatedly.
>
> Mikulas> Signed-off-by: Mikulas Patocka <[email protected]>
> Mikulas> Acked-by: Michal Hocko <[email protected]> # mempool_alloc and bvec_alloc
> Mikulas> Cc: [email protected]
>
> Mikulas> ---
> Mikulas> block/bio.c | 4 ++++
> Mikulas> drivers/md/dm-bufio.c | 14 +++++++++++---
> Mikulas> drivers/md/dm-crypt.c | 8 ++++++++
> Mikulas> drivers/md/dm-integrity.c | 4 ++++
> Mikulas> drivers/md/dm-kcopyd.c | 3 +++
> Mikulas> drivers/md/dm-verity-target.c | 4 ++++
> Mikulas> drivers/md/dm-writecache.c | 4 ++++
> Mikulas> mm/mempool.c | 4 ++++
> Mikulas> 8 files changed, 42 insertions(+), 3 deletions(-)
>
> Mikulas> Index: linux-2.6/mm/mempool.c
> Mikulas> ===================================================================
> Mikulas> --- linux-2.6.orig/mm/mempool.c 2018-06-29 03:47:16.290000000 +0200
> Mikulas> +++ linux-2.6/mm/mempool.c 2018-06-29 03:47:16.270000000 +0200
> Mikulas> @@ -369,6 +369,7 @@ void *mempool_alloc(mempool_t *pool, gfp
> Mikulas> unsigned long flags;
> Mikulas> wait_queue_entry_t wait;
> Mikulas> gfp_t gfp_temp;
> Mikulas> + unsigned old_flags;
>
> Mikulas> VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
> Mikulas> might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
> Mikulas> @@ -381,7 +382,10 @@ void *mempool_alloc(mempool_t *pool, gfp
>
> Mikulas> repeat_alloc:
>
> Mikulas> + old_flags = current->flags & PF_LESS_THROTTLE;
> Mikulas> + current->flags |= PF_LESS_THROTTLE;
> Mikulas> element = pool->alloc(gfp_temp, pool->pool_data);
> Mikulas> + current_restore_flags(old_flags, PF_LESS_THROTTLE);
> Mikulas> if (likely(element != NULL))
> Mikulas> return element;
>
> Mikulas> Index: linux-2.6/block/bio.c
> Mikulas> ===================================================================
> Mikulas> --- linux-2.6.orig/block/bio.c 2018-06-29 03:47:16.290000000 +0200
> Mikulas> +++ linux-2.6/block/bio.c 2018-06-29 03:47:16.270000000 +0200
> Mikulas> @@ -217,6 +217,7 @@ fallback:
> Mikulas> } else {
> Mikulas> struct biovec_slab *bvs = bvec_slabs + *idx;
> Mikulas> gfp_t __gfp_mask = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO);
> Mikulas> + unsigned old_flags;
>
> Mikulas> /*
> Mikulas> * Make this allocation restricted and don't dump info on
> Mikulas> @@ -229,7 +230,10 @@ fallback:
> Mikulas> * Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM
> Mikulas> * is set, retry with the 1-entry mempool
> Mikulas> */
> Mikulas> + old_flags = current->flags & PF_LESS_THROTTLE;
> Mikulas> + current->flags |= PF_LESS_THROTTLE;
> Mikulas> bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
> Mikulas> + current_restore_flags(old_flags, PF_LESS_THROTTLE);
> Mikulas> if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) {
> Mikulas> *idx = BVEC_POOL_MAX;
> Mikulas> goto fallback;
> Mikulas> Index: linux-2.6/drivers/md/dm-bufio.c
> Mikulas> ===================================================================
> Mikulas> --- linux-2.6.orig/drivers/md/dm-bufio.c 2018-06-29 03:47:16.290000000 +0200
> Mikulas> +++ linux-2.6/drivers/md/dm-bufio.c 2018-06-29 03:47:16.270000000 +0200
> Mikulas> @@ -356,6 +356,7 @@ static void __cache_size_refresh(void)
> Mikulas> static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
> Mikulas> unsigned char *data_mode)
> Mikulas> {
> Mikulas> + void *ptr;
> Mikulas> if (unlikely(c->slab_cache != NULL)) {
> Mikulas> *data_mode = DATA_MODE_SLAB;
> Mikulas> return kmem_cache_alloc(c->slab_cache, gfp_mask);
> Mikulas> @@ -363,9 +364,14 @@ static void *alloc_buffer_data(struct dm
>
> Mikulas> if (c->block_size <= KMALLOC_MAX_SIZE &&
> Mikulas> gfp_mask & __GFP_NORETRY) {
> Mikulas> + unsigned old_flags;
> Mikulas> *data_mode = DATA_MODE_GET_FREE_PAGES;
> Mikulas> - return (void *)__get_free_pages(gfp_mask,
> Mikulas> + old_flags = current->flags & PF_LESS_THROTTLE;
> Mikulas> + current->flags |= PF_LESS_THROTTLE;
> Mikulas> + ptr = (void *)__get_free_pages(gfp_mask,
> Mikulas> c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
> Mikulas> + current_restore_flags(old_flags, PF_LESS_THROTTLE);
> Mikulas> + return ptr;
> Mikulas> }
>
> Mikulas> *data_mode = DATA_MODE_VMALLOC;
> Mikulas> @@ -381,8 +387,10 @@ static void *alloc_buffer_data(struct dm
> Mikulas> */
> Mikulas> if (gfp_mask & __GFP_NORETRY) {
> Mikulas> unsigned noio_flag = memalloc_noio_save();
> Mikulas> - void *ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
> Mikulas> -
> Mikulas> + unsigned old_flags = current->flags & PF_LESS_THROTTLE;
> Mikulas> + current->flags |= PF_LESS_THROTTLE;
> Mikulas> + ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
> Mikulas> + current_restore_flags(old_flags, PF_LESS_THROTTLE);
> Mikulas> memalloc_noio_restore(noio_flag);
> Mikulas> return ptr;
> Mikulas> }
> Mikulas> Index: linux-2.6/drivers/md/dm-integrity.c
> Mikulas> ===================================================================
> Mikulas> --- linux-2.6.orig/drivers/md/dm-integrity.c 2018-06-29 03:47:16.290000000 +0200
> Mikulas> +++ linux-2.6/drivers/md/dm-integrity.c 2018-06-29 03:47:16.270000000 +0200
> Mikulas> @@ -1318,6 +1318,7 @@ static void integrity_metadata(struct wo
> Mikulas> int r;
>
> Mikulas> if (ic->internal_hash) {
> Mikulas> + unsigned old_flags;
> Mikulas> struct bvec_iter iter;
> Mikulas> struct bio_vec bv;
> Mikulas> unsigned digest_size = crypto_shash_digestsize(ic->internal_hash);
> Mikulas> @@ -1331,8 +1332,11 @@ static void integrity_metadata(struct wo
> Mikulas> if (unlikely(ic->mode == 'R'))
> Mikulas> goto skip_io;
>
> Mikulas> + old_flags = current->flags & PF_LESS_THROTTLE;
> Mikulas> + current->flags |= PF_LESS_THROTTLE;
> Mikulas> checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space,
> Mikulas> GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
> Mikulas> + current_restore_flags(old_flags, PF_LESS_THROTTLE);
> Mikulas> if (!checksums)
> Mikulas> checksums = checksums_onstack;
>
> Mikulas> Index: linux-2.6/drivers/md/dm-kcopyd.c
> Mikulas> ===================================================================
> Mikulas> --- linux-2.6.orig/drivers/md/dm-kcopyd.c 2018-06-29 03:47:16.290000000 +0200
> Mikulas> +++ linux-2.6/drivers/md/dm-kcopyd.c 2018-06-29 03:47:16.270000000 +0200
> Mikulas> @@ -245,7 +245,10 @@ static int kcopyd_get_pages(struct dm_kc
> Mikulas> *pages = NULL;
>
> Mikulas> do {
> Mikulas> + unsigned old_flags = current->flags & PF_LESS_THROTTLE;
> Mikulas> + current->flags |= PF_LESS_THROTTLE;
> Mikulas> pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY | __GFP_KSWAPD_RECLAIM);
> Mikulas> + current_restore_flags(old_flags, PF_LESS_THROTTLE);
> Mikulas> if (unlikely(!pl)) {
> Mikulas> /* Use reserved pages */
> Mikulas> pl = kc->pages;
> Mikulas> Index: linux-2.6/drivers/md/dm-verity-target.c
> Mikulas> ===================================================================
> Mikulas> --- linux-2.6.orig/drivers/md/dm-verity-target.c 2018-06-29 03:47:16.290000000 +0200
> Mikulas> +++ linux-2.6/drivers/md/dm-verity-target.c 2018-06-29 03:47:16.280000000 +0200
> Mikulas> @@ -596,9 +596,13 @@ no_prefetch_cluster:
> Mikulas> static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io)
> Mikulas> {
> Mikulas> struct dm_verity_prefetch_work *pw;
> Mikulas> + unsigned old_flags;
>
> Mikulas> + old_flags = current->flags & PF_LESS_THROTTLE;
> Mikulas> + current->flags |= PF_LESS_THROTTLE;
> Mikulas> pw = kmalloc(sizeof(struct dm_verity_prefetch_work),
> Mikulas> GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
> Mikulas> + current_restore_flags(old_flags, PF_LESS_THROTTLE);
>
> Mikulas> if (!pw)
> Mikulas> return;
> Mikulas> Index: linux-2.6/drivers/md/dm-writecache.c
> Mikulas> ===================================================================
> Mikulas> --- linux-2.6.orig/drivers/md/dm-writecache.c 2018-06-29 03:47:16.290000000 +0200
> Mikulas> +++ linux-2.6/drivers/md/dm-writecache.c 2018-06-29 03:47:16.280000000 +0200
> Mikulas> @@ -1473,6 +1473,7 @@ static void __writecache_writeback_pmem(
> Mikulas> unsigned max_pages;
>
> Mikulas> while (wbl->size) {
> Mikulas> + unsigned old_flags;
> Mikulas> wbl->size--;
> Mikulas> e = container_of(wbl->list.prev, struct wc_entry, lru);
> Mikulas> list_del(&e->lru);
> Mikulas> @@ -1486,6 +1487,8 @@ static void __writecache_writeback_pmem(
> Mikulas> bio_set_dev(&wb->bio, wc->dev->bdev);
> Mikulas> wb->bio.bi_iter.bi_sector = read_original_sector(wc, e);
> Mikulas> wb->page_offset = PAGE_SIZE;
> Mikulas> + old_flags = current->flags & PF_LESS_THROTTLE;
> Mikulas> + current->flags |= PF_LESS_THROTTLE;
> Mikulas> if (max_pages <= WB_LIST_INLINE ||
> Mikulas> unlikely(!(wb->wc_list = kmalloc(max_pages * sizeof(struct wc_entry *),
> Mikulas> GFP_NOIO | __GFP_NORETRY |
> Mikulas> @@ -1493,6 +1496,7 @@ static void __writecache_writeback_pmem(
> Mikulas> wb->wc_list = wb->wc_list_inline;
> Mikulas> max_pages = WB_LIST_INLINE;
> Mikulas> }
> Mikulas> + current_restore_flags(old_flags, PF_LESS_THROTTLE);
>
> Mikulas> BUG_ON(!wc_add_block(wb, e, GFP_NOIO));
>
> Mikulas> Index: linux-2.6/drivers/md/dm-crypt.c
> Mikulas> ===================================================================
> Mikulas> --- linux-2.6.orig/drivers/md/dm-crypt.c 2018-06-29 03:47:16.290000000 +0200
> Mikulas> +++ linux-2.6/drivers/md/dm-crypt.c 2018-06-29 03:47:16.280000000 +0200
> Mikulas> @@ -2181,12 +2181,16 @@ static void *crypt_page_alloc(gfp_t gfp_
> Mikulas> {
> Mikulas> struct crypt_config *cc = pool_data;
> Mikulas> struct page *page;
> Mikulas> + unsigned old_flags;
>
> Mikulas> if (unlikely(percpu_counter_compare(&cc->n_allocated_pages, dm_crypt_pages_per_client) >= 0) &&
> Mikulas> likely(gfp_mask & __GFP_NORETRY))
> Mikulas> return NULL;
>
> Mikulas> + old_flags = current->flags & PF_LESS_THROTTLE;
> Mikulas> + current->flags |= PF_LESS_THROTTLE;
> Mikulas> page = alloc_page(gfp_mask);
> Mikulas> + current_restore_flags(old_flags, PF_LESS_THROTTLE);
> Mikulas> if (likely(page != NULL))
> Mikulas> percpu_counter_add(&cc->n_allocated_pages, 1);
>
> Mikulas> @@ -2893,7 +2897,10 @@ static int crypt_map(struct dm_target *t
>
> Mikulas> if (cc->on_disk_tag_size) {
> Mikulas> unsigned tag_len = cc->on_disk_tag_size * (bio_sectors(bio) >> cc->sector_shift);
> Mikulas> + unsigned old_flags;
>
> Mikulas> + old_flags = current->flags & PF_LESS_THROTTLE;
> Mikulas> + current->flags |= PF_LESS_THROTTLE;
> Mikulas> if (unlikely(tag_len > KMALLOC_MAX_SIZE) ||
> Mikulas> unlikely(!(io->integrity_metadata = kmalloc(tag_len,
> Mikulas> GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
> Mikulas> @@ -2902,6 +2909,7 @@ static int crypt_map(struct dm_target *t
> Mikulas> io->integrity_metadata = mempool_alloc(&cc->tag_pool, GFP_NOIO);
> Mikulas> io->integrity_metadata_from_pool = true;
> Mikulas> }
> Mikulas> + current_restore_flags(old_flags, PF_LESS_THROTTLE);
> Mikulas> }
>
> Mikulas> if (crypt_integrity_aead(cc))
>
> Mikulas> --
> Mikulas> dm-devel mailing list
> Mikulas> [email protected]
> Mikulas> https://www.redhat.com/mailman/listinfo/dm-devel
>