2022-10-18 20:47:00

by Pavel Begunkov

[permalink] [raw]
Subject: [RFC for-next v2 0/4] enable pcpu bio caching for IRQ I/O

This series implements bio pcpu caching for normal / IRQ-driven I/O
extending REQ_ALLOC_CACHE currently limited to iopoll. The allocation side
still only works from non-irq context, which is the reason it's not enabled
by default, but turning it on for other users (e.g. filesystems) is
just a matter of passing a flag.

t/io_uring with an Optane SSD setup showed +7% for batches of 32 requests
and +4.3% for batches of 8.

IRQ, 128/32/32, cache off
IOPS=59.08M, BW=28.84GiB/s, IOS/call=31/31
IOPS=59.30M, BW=28.96GiB/s, IOS/call=32/32
IOPS=59.97M, BW=29.28GiB/s, IOS/call=31/31
IOPS=59.92M, BW=29.26GiB/s, IOS/call=32/32
IOPS=59.81M, BW=29.20GiB/s, IOS/call=32/31

IRQ, 128/32/32, cache on
IOPS=64.05M, BW=31.27GiB/s, IOS/call=32/31
IOPS=64.22M, BW=31.36GiB/s, IOS/call=32/32
IOPS=64.04M, BW=31.27GiB/s, IOS/call=31/31
IOPS=63.16M, BW=30.84GiB/s, IOS/call=32/32

IRQ, 32/8/8, cache off
IOPS=50.60M, BW=24.71GiB/s, IOS/call=7/8
IOPS=50.22M, BW=24.52GiB/s, IOS/call=8/7
IOPS=49.54M, BW=24.19GiB/s, IOS/call=8/8
IOPS=50.07M, BW=24.45GiB/s, IOS/call=7/7
IOPS=50.46M, BW=24.64GiB/s, IOS/call=8/8

IRQ, 32/8/8, cache on
IOPS=51.39M, BW=25.09GiB/s, IOS/call=8/7
IOPS=52.52M, BW=25.64GiB/s, IOS/call=7/8
IOPS=52.57M, BW=25.67GiB/s, IOS/call=8/8
IOPS=52.58M, BW=25.67GiB/s, IOS/call=8/7
IOPS=52.61M, BW=25.69GiB/s, IOS/call=8/8

The main part is in patch 3. Would be great to take patch 1 separately
for 6.1 for extra safety.

v2: fix botched splicing threshold checks

Pavel Begunkov (4):
bio: safeguard REQ_ALLOC_CACHE bio put
bio: split pcpu cache part of bio_put into a helper
block/bio: add pcpu caching for non-polling bio_put
io_uring/rw: enable bio caches for IRQ rw

block/bio.c | 94 ++++++++++++++++++++++++++++++++++++++++-----------
io_uring/rw.c | 3 +-
2 files changed, 76 insertions(+), 21 deletions(-)

--
2.38.0


2022-10-18 20:50:11

by Pavel Begunkov

[permalink] [raw]
Subject: [RFC for-next v2 1/4] bio: safeguard REQ_ALLOC_CACHE bio put

bio_put() with REQ_ALLOC_CACHE assumes that it's executed not from
an irq context. Let's add a warning if the invariant is not respected,
especially since there are a couple of places removing REQ_POLLED by hand
without also clearing REQ_ALLOC_CACHE.

Signed-off-by: Pavel Begunkov <[email protected]>
---
block/bio.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/bio.c b/block/bio.c
index 7cb7d2ff139b..5b4594daa259 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -741,7 +741,7 @@ void bio_put(struct bio *bio)
return;
}

- if (bio->bi_opf & REQ_ALLOC_CACHE) {
+ if ((bio->bi_opf & REQ_ALLOC_CACHE) && !WARN_ON_ONCE(in_interrupt())) {
struct bio_alloc_cache *cache;

bio_uninit(bio);
--
2.38.0

2022-10-18 20:54:02

by Pavel Begunkov

[permalink] [raw]
Subject: [RFC for-next v2 3/4] block/bio: add pcpu caching for non-polling bio_put

This patch extends REQ_ALLOC_CACHE to IRQ completions, whereas
currently it's limited to iopoll. Instead of guarding the list with
irq toggling on alloc, which is expensive, it keeps an additional
irq-safe list from which bios are spliced in batches to amortise
overhead. On the put side it toggles irqs, but in many cases they're
already disabled and so cheap.

Signed-off-by: Pavel Begunkov <[email protected]>
---
block/bio.c | 64 ++++++++++++++++++++++++++++++++++++++++++++---------
1 file changed, 53 insertions(+), 11 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index ac16cc154476..c2dda2759df5 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -25,9 +25,15 @@
#include "blk-rq-qos.h"
#include "blk-cgroup.h"

+#define ALLOC_CACHE_THRESHOLD 16
+#define ALLOC_CACHE_SLACK 64
+#define ALLOC_CACHE_MAX 512
+
struct bio_alloc_cache {
struct bio *free_list;
+ struct bio *free_list_irq;
unsigned int nr;
+ unsigned int nr_irq;
};

static struct biovec_slab {
@@ -408,6 +414,22 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
queue_work(bs->rescue_workqueue, &bs->rescue_work);
}

+static void bio_alloc_irq_cache_splice(struct bio_alloc_cache *cache)
+{
+ unsigned long flags;
+
+ /* cache->free_list must be empty */
+ if (WARN_ON_ONCE(cache->free_list))
+ return;
+
+ local_irq_save(flags);
+ cache->free_list = cache->free_list_irq;
+ cache->free_list_irq = NULL;
+ cache->nr += cache->nr_irq;
+ cache->nr_irq = 0;
+ local_irq_restore(flags);
+}
+
static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp,
struct bio_set *bs)
@@ -417,9 +439,17 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,

cache = per_cpu_ptr(bs->cache, get_cpu());
if (!cache->free_list) {
- put_cpu();
- return NULL;
+ if (READ_ONCE(cache->nr_irq) < ALLOC_CACHE_THRESHOLD) {
+ put_cpu();
+ return NULL;
+ }
+ bio_alloc_irq_cache_splice(cache);
+ if (!cache->free_list) {
+ put_cpu();
+ return NULL;
+ }
}
+
bio = cache->free_list;
cache->free_list = bio->bi_next;
cache->nr--;
@@ -676,11 +706,8 @@ void guard_bio_eod(struct bio *bio)
bio_truncate(bio, maxsector << 9);
}

-#define ALLOC_CACHE_MAX 512
-#define ALLOC_CACHE_SLACK 64
-
-static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
- unsigned int nr)
+static int __bio_alloc_cache_prune(struct bio_alloc_cache *cache,
+ unsigned int nr)
{
unsigned int i = 0;
struct bio *bio;
@@ -692,6 +719,17 @@ static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
if (++i == nr)
break;
}
+ return i;
+}
+
+static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
+ unsigned int nr)
+{
+ nr -= __bio_alloc_cache_prune(cache, nr);
+ if (!READ_ONCE(cache->free_list)) {
+ bio_alloc_irq_cache_splice(cache);
+ __bio_alloc_cache_prune(cache, nr);
+ }
}

static int bio_cpu_dead(unsigned int cpu, struct hlist_node *node)
@@ -728,6 +766,7 @@ static void bio_alloc_cache_destroy(struct bio_set *bs)
static inline void bio_put_percpu_cache(struct bio *bio)
{
struct bio_alloc_cache *cache;
+ unsigned long flags;

cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
bio_uninit(bio);
@@ -737,12 +776,15 @@ static inline void bio_put_percpu_cache(struct bio *bio)
cache->free_list = bio;
cache->nr++;
} else {
- put_cpu();
- bio_free(bio);
- return;
+ local_irq_save(flags);
+ bio->bi_next = cache->free_list_irq;
+ cache->free_list_irq = bio;
+ cache->nr_irq++;
+ local_irq_restore(flags);
}

- if (cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK)
+ if (READ_ONCE(cache->nr_irq) + cache->nr >
+ ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK)
bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK);
put_cpu();
}
--
2.38.0

2022-10-20 08:37:04

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [RFC for-next v2 3/4] block/bio: add pcpu caching for non-polling bio_put

> + unsigned long flags;
>
> cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
> bio_uninit(bio);
> @@ -737,12 +776,15 @@ static inline void bio_put_percpu_cache(struct bio *bio)
> cache->free_list = bio;
> cache->nr++;
> } else {
> - put_cpu();
> - bio_free(bio);
> - return;
> + local_irq_save(flags);
> + bio->bi_next = cache->free_list_irq;
> + cache->free_list_irq = bio;
> + cache->nr_irq++;
> + local_irq_restore(flags);
> }

Ok, I guess with that my previous comments don't make quite
as much sense any more. I think you can keep flags local in
the branch here, though.

2022-10-20 08:52:46

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [RFC for-next v2 0/4] enable pcpu bio caching for IRQ I/O

On Tue, Oct 18, 2022 at 08:50:54PM +0100, Pavel Begunkov wrote:
> This series implements bio pcpu caching for normal / IRQ-driven I/O
> extending REQ_ALLOC_CACHE currently limited to iopoll. The allocation side
> still only works from non-irq context, which is the reason it's not enabled
> by default, but turning it on for other users (e.g. filesystems) is
> as a matter of passing a flag.
>
> t/io_uring with an Optane SSD setup showed +7% for batches of 32 requests
> and +4.3% for batches of 8.

This looks much nicer to me than the previous attempt exposing the bio
internals to io_uring, thanks.

2022-10-20 08:53:42

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [RFC for-next v2 1/4] bio: safeguard REQ_ALLOC_CACHE bio put

On Tue, Oct 18, 2022 at 08:50:55PM +0100, Pavel Begunkov wrote:
> bio_put() with REQ_ALLOC_CACHE assumes that it's executed not from
> an irq context. Let's add a warning if the invariant is not respected,
> especially since there is a couple of places removing REQ_POLLED by hand
> without also clearing REQ_ALLOC_CACHE.

Looks good:

Reviewed-by: Christoph Hellwig <[email protected]>

2022-10-20 12:37:20

by Pavel Begunkov

[permalink] [raw]
Subject: Re: [RFC for-next v2 3/4] block/bio: add pcpu caching for non-polling bio_put

On 10/20/22 09:31, Christoph Hellwig wrote:
>> + unsigned long flags;
>>
>> cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
>> bio_uninit(bio);
>> @@ -737,12 +776,15 @@ static inline void bio_put_percpu_cache(struct bio *bio)
>> cache->free_list = bio;
>> cache->nr++;
>> } else {
>> - put_cpu();
>> - bio_free(bio);
>> - return;
>> + local_irq_save(flags);
>> + bio->bi_next = cache->free_list_irq;
>> + cache->free_list_irq = bio;
>> + cache->nr_irq++;
>> + local_irq_restore(flags);
>> }
>
> Ok, I guess with that my previous comments don't make quite
> as much sense any more. I think youcan keep flags local in

Yeah, a little bit of oracle coding

> the branch here, though.

Not like it makes any difference but can move it

--
Pavel Begunkov

2022-10-20 13:04:28

by Pavel Begunkov

[permalink] [raw]
Subject: Re: [RFC for-next v2 0/4] enable pcpu bio caching for IRQ I/O

On 10/20/22 09:32, Christoph Hellwig wrote:
> On Tue, Oct 18, 2022 at 08:50:54PM +0100, Pavel Begunkov wrote:
>> This series implements bio pcpu caching for normal / IRQ-driven I/O
>> extending REQ_ALLOC_CACHE currently limited to iopoll. The allocation side
>> still only works from non-irq context, which is the reason it's not enabled
>> by default, but turning it on for other users (e.g. filesystems) is
>> as a matter of passing a flag.
>>
>> t/io_uring with an Optane SSD setup showed +7% for batches of 32 requests
>> and +4.3% for batches of 8.
>
> This looks much nicer to me than the previous attempt exposing the bio
> internals to io_uring, thanks.

Yeah, I saw the one Jens posted before but I wanted this one to be more
generic, i.e. applicable not only to io_uring. Thanks for taking a look.

--
Pavel Begunkov

2022-10-20 13:06:28

by Jens Axboe

[permalink] [raw]
Subject: Re: (subset) [RFC for-next v2 0/4] enable pcpu bio caching for IRQ I/O

On Tue, 18 Oct 2022 20:50:54 +0100, Pavel Begunkov wrote:
> This series implements bio pcpu caching for normal / IRQ-driven I/O
> extending REQ_ALLOC_CACHE currently limited to iopoll. The allocation side
> still only works from non-irq context, which is the reason it's not enabled
> by default, but turning it on for other users (e.g. filesystems) is
> as a matter of passing a flag.
>
> t/io_uring with an Optane SSD setup showed +7% for batches of 32 requests
> and +4.3% for batches of 8.
>
> [...]

Applied, thanks!

[1/4] bio: safeguard REQ_ALLOC_CACHE bio put
commit: d4347d50407daea6237872281ece64c4bdf1ec99

Best regards,
--
Jens Axboe


2022-10-20 13:06:45

by Jens Axboe

[permalink] [raw]
Subject: Re: [RFC for-next v2 0/4] enable pcpu bio caching for IRQ I/O

On 10/20/22 5:40 AM, Pavel Begunkov wrote:
> On 10/20/22 09:32, Christoph Hellwig wrote:
>> On Tue, Oct 18, 2022 at 08:50:54PM +0100, Pavel Begunkov wrote:
>>> This series implements bio pcpu caching for normal / IRQ-driven I/O
>>> extending REQ_ALLOC_CACHE currently limited to iopoll. The allocation side
>>> still only works from non-irq context, which is the reason it's not enabled
>>> by default, but turning it on for other users (e.g. filesystems) is
>>> as a matter of passing a flag.
>>>
>>> t/io_uring with an Optane SSD setup showed +7% for batches of 32 requests
>>> and +4.3% for batches of 8.
>>
>> This looks much nicer to me than the previous attempt exposing the bio
>> internals to io_uring, thanks.
>
> Yeah, I saw the one Jens posted before but I wanted this one to be more
> generic, i.e. applicable not only to io_uring. Thanks for taking a look.

It is indeed better like that, also because we can get rid of the alloc
cache flag long term and just have it be the way that bio allocations
work.

--
Jens Axboe