Hi,
This patch series kills two kinds of atomic operations in block
I/O accounting.
The 1st two patches convert atomic refcount of partition
into percpu refcount.
The 2nd two patches convert partition->in_flight[] into a percpu
variable.
With this change, ~15% throughput improvement can be observed
when running fio(randread) over null blk in a dual-socket
environment.
block/bio.c | 4 ++--
block/blk-core.c | 5 ++--
block/blk-merge.c | 2 +-
block/genhd.c | 9 ++++---
block/partition-generic.c | 17 ++++++-------
drivers/md/dm.c | 10 ++++----
drivers/nvdimm/core.c | 4 ++--
include/linux/genhd.h | 61 +++++++++++++++++++++++++++++++++--------------
8 files changed, 72 insertions(+), 40 deletions(-)
Thanks,
Ming
So the helper can be used in both the generic partition
case and the part0 case.
Signed-off-by: Ming Lei <[email protected]>
---
block/genhd.c | 3 +--
block/partition-generic.c | 3 +--
include/linux/genhd.h | 6 ++++++
3 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/block/genhd.c b/block/genhd.c
index e552e1b..ed3f5b9 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1110,8 +1110,7 @@ static void disk_release(struct device *dev)
disk_release_events(disk);
kfree(disk->random);
disk_replace_part_tbl(disk, NULL);
- free_part_stats(&disk->part0);
- free_part_info(&disk->part0);
+ hd_free_part(&disk->part0);
if (disk->queue)
blk_put_queue(disk->queue);
kfree(disk);
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 0d9e5f9..eca0d02 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -212,8 +212,7 @@ static void part_release(struct device *dev)
{
struct hd_struct *p = dev_to_part(dev);
blk_free_devt(dev->devt);
- free_part_stats(p);
- free_part_info(p);
+ hd_free_part(p);
kfree(p);
}
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index ec274e0..a221220 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -663,6 +663,12 @@ static inline void hd_struct_put(struct hd_struct *part)
__delete_partition(part);
}
+static inline void hd_free_part(struct hd_struct *part)
+{
+ free_part_stats(part);
+ free_part_info(part);
+}
+
/*
* Any access of part->nr_sects which is not protected by partition
* bd_mutex or gendisk bdev bd_mutex, should be done using this
--
1.9.1
Percpu refcount is the perfect match for the partition's case,
and the conversion is quite straightforward.
With the conversion, one pair of atomic inc/dec operations can be
saved in block I/O accounting, which runs in the hot path of block I/O.
Signed-off-by: Ming Lei <[email protected]>
---
block/genhd.c | 6 +++++-
block/partition-generic.c | 9 +++++----
include/linux/genhd.h | 27 +++++++++++++++++----------
3 files changed, 27 insertions(+), 15 deletions(-)
diff --git a/block/genhd.c b/block/genhd.c
index ed3f5b9..3213b66 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1284,7 +1284,11 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
* converted to make use of bd_mutex and sequence counters.
*/
seqcount_init(&disk->part0.nr_sects_seq);
- hd_ref_init(&disk->part0);
+ if (hd_ref_init(&disk->part0)) {
+ hd_free_part(&disk->part0);
+ kfree(disk);
+ return NULL;
+ }
disk->minors = minors;
rand_initialize_disk(disk);
diff --git a/block/partition-generic.c b/block/partition-generic.c
index eca0d02..e771113 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -232,8 +232,9 @@ static void delete_partition_rcu_cb(struct rcu_head *head)
put_device(part_to_dev(part));
}
-void __delete_partition(struct hd_struct *part)
+void __delete_partition(struct percpu_ref *ref)
{
+ struct hd_struct *part = container_of(ref, struct hd_struct, ref);
call_rcu(&part->rcu_head, delete_partition_rcu_cb);
}
@@ -254,7 +255,7 @@ void delete_partition(struct gendisk *disk, int partno)
kobject_put(part->holder_dir);
device_del(part_to_dev(part));
- hd_struct_put(part);
+ hd_struct_kill(part);
}
static ssize_t whole_disk_show(struct device *dev,
@@ -355,8 +356,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
if (!dev_get_uevent_suppress(ddev))
kobject_uevent(&pdev->kobj, KOBJ_ADD);
- hd_ref_init(p);
- return p;
+ if (!hd_ref_init(p))
+ return p;
out_free_info:
free_part_info(p);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index a221220..2adbfa6 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -13,6 +13,7 @@
#include <linux/kdev_t.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
+#include <linux/percpu-refcount.h>
#ifdef CONFIG_BLOCK
@@ -124,7 +125,7 @@ struct hd_struct {
#else
struct disk_stats dkstats;
#endif
- atomic_t ref;
+ struct percpu_ref ref;
struct rcu_head rcu_head;
};
@@ -611,7 +612,7 @@ extern struct hd_struct * __must_check add_partition(struct gendisk *disk,
sector_t len, int flags,
struct partition_meta_info
*info);
-extern void __delete_partition(struct hd_struct *);
+extern void __delete_partition(struct percpu_ref *);
extern void delete_partition(struct gendisk *, int);
extern void printk_all_partitions(void);
@@ -640,33 +641,39 @@ extern ssize_t part_fail_store(struct device *dev,
const char *buf, size_t count);
#endif /* CONFIG_FAIL_MAKE_REQUEST */
-static inline void hd_ref_init(struct hd_struct *part)
+static inline int hd_ref_init(struct hd_struct *part)
{
- atomic_set(&part->ref, 1);
- smp_mb();
+ if (percpu_ref_init(&part->ref, __delete_partition, 0,
+ GFP_KERNEL))
+ return -ENOMEM;
+ return 0;
}
static inline void hd_struct_get(struct hd_struct *part)
{
- atomic_inc(&part->ref);
- smp_mb__after_atomic();
+ percpu_ref_get(&part->ref);
}
static inline int hd_struct_try_get(struct hd_struct *part)
{
- return atomic_inc_not_zero(&part->ref);
+ return percpu_ref_tryget_live(&part->ref);
}
static inline void hd_struct_put(struct hd_struct *part)
{
- if (atomic_dec_and_test(&part->ref))
- __delete_partition(part);
+ percpu_ref_put(&part->ref);
+}
+
+static inline void hd_struct_kill(struct hd_struct *part)
+{
+ percpu_ref_kill(&part->ref);
}
static inline void hd_free_part(struct hd_struct *part)
{
free_part_stats(part);
free_part_info(part);
+ percpu_ref_exit(&part->ref);
}
/*
--
1.9.1
So that it is easier to convert part->in_flight[rw] into a percpu
variable in the following patch.
Signed-off-by: Ming Lei <[email protected]>
---
block/bio.c | 4 ++--
block/blk-core.c | 4 ++--
block/blk-merge.c | 2 +-
drivers/nvdimm/core.c | 4 ++--
include/linux/genhd.h | 4 ++--
5 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/block/bio.c b/block/bio.c
index 2a00d34..fe8807f 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1724,7 +1724,7 @@ void generic_start_io_acct(int rw, unsigned long sectors,
part_round_stats(cpu, part);
part_stat_inc(cpu, part, ios[rw]);
part_stat_add(cpu, part, sectors[rw], sectors);
- part_inc_in_flight(part, rw);
+ part_inc_in_flight(cpu, part, rw);
part_stat_unlock();
}
@@ -1738,7 +1738,7 @@ void generic_end_io_acct(int rw, struct hd_struct *part,
part_stat_add(cpu, part, ticks[rw], duration);
part_round_stats(cpu, part);
- part_dec_in_flight(part, rw);
+ part_dec_in_flight(cpu, part, rw);
part_stat_unlock();
}
diff --git a/block/blk-core.c b/block/blk-core.c
index 82819e6..f180a6d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2194,7 +2194,7 @@ void blk_account_io_done(struct request *req)
part_stat_inc(cpu, part, ios[rw]);
part_stat_add(cpu, part, ticks[rw], duration);
part_round_stats(cpu, part);
- part_dec_in_flight(part, rw);
+ part_dec_in_flight(cpu, part, rw);
hd_struct_put(part);
part_stat_unlock();
@@ -2252,7 +2252,7 @@ void blk_account_io_start(struct request *rq, bool new_io)
hd_struct_get(part);
}
part_round_stats(cpu, part);
- part_inc_in_flight(part, rw);
+ part_inc_in_flight(cpu, part, rw);
rq->part = part;
}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 30a0d9f..cb7c46d 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -449,7 +449,7 @@ static void blk_account_io_merge(struct request *req)
part = req->part;
part_round_stats(cpu, part);
- part_dec_in_flight(part, rq_data_dir(req));
+ part_dec_in_flight(cpu, part, rq_data_dir(req));
hd_struct_put(part);
part_stat_unlock();
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index cb62ec6..053d026 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -224,7 +224,7 @@ void __nd_iostat_start(struct bio *bio, unsigned long *start)
part_round_stats(cpu, &disk->part0);
part_stat_inc(cpu, &disk->part0, ios[rw]);
part_stat_add(cpu, &disk->part0, sectors[rw], bio_sectors(bio));
- part_inc_in_flight(&disk->part0, rw);
+ part_inc_in_flight(cpu, &disk->part0, rw);
part_stat_unlock();
}
EXPORT_SYMBOL(__nd_iostat_start);
@@ -238,7 +238,7 @@ void nd_iostat_end(struct bio *bio, unsigned long start)
part_stat_add(cpu, &disk->part0, ticks[rw], duration);
part_round_stats(cpu, &disk->part0);
- part_dec_in_flight(&disk->part0, rw);
+ part_dec_in_flight(cpu, &disk->part0, rw);
part_stat_unlock();
}
EXPORT_SYMBOL(nd_iostat_end);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 2adbfa6..612ae80 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -381,14 +381,14 @@ static inline void free_part_stats(struct hd_struct *part)
#define part_stat_sub(cpu, gendiskp, field, subnd) \
part_stat_add(cpu, gendiskp, field, -subnd)
-static inline void part_inc_in_flight(struct hd_struct *part, int rw)
+static inline void part_inc_in_flight(int cpu, struct hd_struct *part, int rw)
{
atomic_inc(&part->in_flight[rw]);
if (part->partno)
atomic_inc(&part_to_disk(part)->part0.in_flight[rw]);
}
-static inline void part_dec_in_flight(struct hd_struct *part, int rw)
+static inline void part_dec_in_flight(int cpu, struct hd_struct *part, int rw)
{
atomic_dec(&part->in_flight[rw]);
if (part->partno)
--
1.9.1
So the atomic operations for block I/O accounting can be killed
completely, and it is OK to sum the percpu variables in part_in_flight()
because the function runs at most once per tick.
Signed-off-by: Ming Lei <[email protected]>
---
block/blk-core.c | 1 +
block/partition-generic.c | 5 +++--
drivers/md/dm.c | 10 ++++++----
include/linux/genhd.h | 24 ++++++++++++++++++------
4 files changed, 28 insertions(+), 12 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index f180a6d..0001d4c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1344,6 +1344,7 @@ static void part_round_stats_single(int cpu, struct hd_struct *part,
if (now == part->stamp)
return;
+ /* at most one percpu addition per one tick */
inflight = part_in_flight(part);
if (inflight) {
__part_stat_add(cpu, part, time_in_queue,
diff --git a/block/partition-generic.c b/block/partition-generic.c
index e771113..0a553e7 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -140,8 +140,9 @@ ssize_t part_inflight_show(struct device *dev,
{
struct hd_struct *p = dev_to_part(dev);
- return sprintf(buf, "%8u %8u\n", atomic_read(&p->in_flight[0]),
- atomic_read(&p->in_flight[1]));
+ return sprintf(buf, "%8u %8u\n",
+ part_stat_read(p, in_flight[0]),
+ part_stat_read(p, in_flight[1]));
}
#ifdef CONFIG_FAIL_MAKE_REQUEST
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index de70377..1b6d8be 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -651,9 +651,9 @@ static void start_io_acct(struct dm_io *io)
cpu = part_stat_lock();
part_round_stats(cpu, &dm_disk(md)->part0);
+ part_stat_set(cpu, &dm_disk(md)->part0, in_flight[rw],
+ atomic_inc_return(&md->pending[rw]));
part_stat_unlock();
- atomic_set(&dm_disk(md)->part0.in_flight[rw],
- atomic_inc_return(&md->pending[rw]));
if (unlikely(dm_stats_used(&md->stats)))
dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
@@ -665,7 +665,7 @@ static void end_io_acct(struct dm_io *io)
struct mapped_device *md = io->md;
struct bio *bio = io->bio;
unsigned long duration = jiffies - io->start_time;
- int pending;
+ int pending, cpu;
int rw = bio_data_dir(bio);
generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time);
@@ -679,7 +679,9 @@ static void end_io_acct(struct dm_io *io)
* a flush.
*/
pending = atomic_dec_return(&md->pending[rw]);
- atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
+ cpu = part_stat_lock();
+ part_stat_set(cpu, &dm_disk(md)->part0, in_flight[rw], pending);
+ part_stat_unlock();
pending += atomic_read(&md->pending[rw^0x1]);
/* nudge anyone waiting on suspend queue */
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 612ae80..abe5567 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -86,6 +86,7 @@ struct disk_stats {
unsigned long ticks[2];
unsigned long io_ticks;
unsigned long time_in_queue;
+ unsigned int in_flight[2];
};
#define PARTITION_META_INFO_VOLNAMELTH 64
@@ -119,7 +120,6 @@ struct hd_struct {
int make_it_fail;
#endif
unsigned long stamp;
- atomic_t in_flight[2];
#ifdef CONFIG_SMP
struct disk_stats __percpu *dkstats;
#else
@@ -320,6 +320,9 @@ extern struct hd_struct *disk_map_sector_rcu(struct gendisk *disk,
res; \
})
+#define part_stat_set(cpu, part, field, seted) \
+ (per_cpu_ptr((part)->dkstats, (cpu))->field = (seted))
+
static inline void part_stat_set_all(struct hd_struct *part, int value)
{
int i;
@@ -351,6 +354,9 @@ static inline void free_part_stats(struct hd_struct *part)
#define part_stat_read(part, field) ((part)->dkstats.field)
+#define part_stat_set(cpu, part, field, seted) \
+ ((part)->dkstats.field = (seted))
+
static inline void part_stat_set_all(struct hd_struct *part, int value)
{
memset(&part->dkstats, value, sizeof(struct disk_stats));
@@ -383,21 +389,27 @@ static inline void free_part_stats(struct hd_struct *part)
static inline void part_inc_in_flight(int cpu, struct hd_struct *part, int rw)
{
- atomic_inc(&part->in_flight[rw]);
+ part_stat_inc(cpu, part, in_flight[rw]);
if (part->partno)
- atomic_inc(&part_to_disk(part)->part0.in_flight[rw]);
+ part_stat_inc(cpu, &part_to_disk(part)->part0, in_flight[rw]);
}
static inline void part_dec_in_flight(int cpu, struct hd_struct *part, int rw)
{
- atomic_dec(&part->in_flight[rw]);
+ part_stat_dec(cpu, part, in_flight[rw]);
if (part->partno)
- atomic_dec(&part_to_disk(part)->part0.in_flight[rw]);
+ part_stat_dec(cpu, &part_to_disk(part)->part0, in_flight[rw]);
}
static inline int part_in_flight(struct hd_struct *part)
{
- return atomic_read(&part->in_flight[0]) + atomic_read(&part->in_flight[1]);
+ int res = 0;
+ unsigned int cpu;
+ for_each_possible_cpu(cpu) {
+ res += per_cpu_ptr((part)->dkstats, cpu)->in_flight[0];
+ res += per_cpu_ptr((part)->dkstats, cpu)->in_flight[1];
+ }
+ return res;
}
static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk)
--
1.9.1
On Thu, Jul 16, 2015 at 11:16:45AM +0800, Ming Lei wrote:
> Percpu refcount is the perfect match for partition's case,
> and the conversion is quite straight.
>
> With the convertion, one pair of atomic inc/dec can be saved
> for accounting block I/O, which is run in hot path of block I/O.
>
> Signed-off-by: Ming Lei <[email protected]>
Acked-by: Tejun Heo <[email protected]>
Thanks.
--
tejun
Hello,
On Thu, Jul 16, 2015 at 11:16:47AM +0800, Ming Lei wrote:
...
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -651,9 +651,9 @@ static void start_io_acct(struct dm_io *io)
>
> cpu = part_stat_lock();
> part_round_stats(cpu, &dm_disk(md)->part0);
> + part_stat_set(cpu, &dm_disk(md)->part0, in_flight[rw],
> + atomic_inc_return(&md->pending[rw]));
> part_stat_unlock();
> - atomic_set(&dm_disk(md)->part0.in_flight[rw],
> - atomic_inc_return(&md->pending[rw]));
Why is this correct? Isn't the code trying to transfer its stat to
block stat verbatim? Why does part_stat_set() have @cpu param at all?
Shouldn't it clear the whole thing and set one of the cpu counters to
the target value?
> @@ -679,7 +679,9 @@ static void end_io_acct(struct dm_io *io)
> * a flush.
> */
> pending = atomic_dec_return(&md->pending[rw]);
> - atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
> + cpu = part_stat_lock();
> + part_stat_set(cpu, &dm_disk(md)->part0, in_flight[rw], pending);
> + part_stat_unlock();
Ditto.
Thanks.
--
tejun
On 07/15/2015 09:16 PM, Ming Lei wrote:
> Hi,
>
> This patches kills two kinds of atomic operations in block
> accounting I/O.
>
> The 1st two patches convert atomic refcount of partition
> into percpu refcount.
>
> The 2nd two patches converts partition->in_flight[] into percpu
> variable.
>
> With this change, ~15% throughput improvement can be observed
> when running fio(randread) over null blk in a dual-socket
> environment.
I've played with this before, but always ran into the hurdle that making
part_in_flight() too expensive ended up hurting results in the end.
Making the inc/dec parts of accounting percpu is a no-brainer;
unfortunately the summing then becomes pretty expensive. I'll run this
through some testing and see what kind of results I get.
--
Jens Axboe
On Thu, Jul 16, 2015 at 08:48:41AM -0600, Jens Axboe wrote:
> I've played with this before, but always ran into the hurdle of making
> part_in_flight() too expensive ended up hurting results in the end. Making
> the inc/dec parts of accounting percpu is a no-brainer, unfortunately the
> summing then becomes pretty expensive. I'll run this through some testing
> and see what kind of results I get.
The only place which could be a problem is part_round_stats() and we
can do that *way* lazier. I don't think we're using that internally.
Why are we even invoking it from IO issue / completion path?
Thanks.
--
tejun
On Thu, Jul 16, 2015 at 10:48 PM, Jens Axboe <[email protected]> wrote:
> On 07/15/2015 09:16 PM, Ming Lei wrote:
>>
>> Hi,
>>
>> This patches kills two kinds of atomic operations in block
>> accounting I/O.
>>
>> The 1st two patches convert atomic refcount of partition
>> into percpu refcount.
>>
>> The 2nd two patches converts partition->in_flight[] into percpu
>> variable.
>>
>> With this change, ~15% throughput improvement can be observed
>> when running fio(randread) over null blk in a dual-socket
>> environment.
>
>
> I've played with this before, but always ran into the hurdle of making
> part_in_flight() too expensive ended up hurting results in the end. Making
Yes, it is a bit expensive, but it runs at most once per tick for
each partition.
> the inc/dec parts of accounting percpu is a no-brainer, unfortunately the
> summing then becomes pretty expensive. I'll run this through some testing
> and see what kind of results I get.
The first two patches should be fine, and they still give ~8% improvement
in my test.
>
> --
> Jens Axboe
>
--
Ming Lei
On 07/16/2015 08:59 AM, Tejun Heo wrote:
> On Thu, Jul 16, 2015 at 08:48:41AM -0600, Jens Axboe wrote:
>> I've played with this before, but always ran into the hurdle of making
>> part_in_flight() too expensive ended up hurting results in the end. Making
>> the inc/dec parts of accounting percpu is a no-brainer, unfortunately the
>> summing then becomes pretty expensive. I'll run this through some testing
>> and see what kind of results I get.
>
> The only place which could be a problem is part_round_stats() and we
> can do that *way* lazier. I don't think we're using that internally.
> Why are we even invoking it from IO issue / completion path?
Right, that's where it's called in the fast path. Right now, it's
per-jiffy lazy. As to how/when it's called, I don't believe that has
changed since those stats were introduced. If we can move that out of
the fast path, then the percpu changes are straight forward.
--
Jens Axboe
On 07/16/2015 09:01 AM, Ming Lei wrote:
> On Thu, Jul 16, 2015 at 10:48 PM, Jens Axboe <[email protected]> wrote:
>> On 07/15/2015 09:16 PM, Ming Lei wrote:
>>>
>>> Hi,
>>>
>>> This patches kills two kinds of atomic operations in block
>>> accounting I/O.
>>>
>>> The 1st two patches convert atomic refcount of partition
>>> into percpu refcount.
>>>
>>> The 2nd two patches converts partition->in_flight[] into percpu
>>> variable.
>>>
>>> With this change, ~15% throughput improvement can be observed
>>> when running fio(randread) over null blk in a dual-socket
>>> environment.
>>
>>
>> I've played with this before, but always ran into the hurdle of making
>> part_in_flight() too expensive ended up hurting results in the end. Making
>
> Yes, it is a bit expensive, but it is only run at most one time per tick for
> one partition.
Yup, but that can still be 1000 per second. And up until last year, it
was even worse: 7276d02e241dc. So it's not too surprising if there's
more low hanging fruit :-)
If we can make the rounding more lazy, then we should go ahead and do that.
>> the inc/dec parts of accounting percpu is a no-brainer, unfortunately the
>> summing then becomes pretty expensive. I'll run this through some testing
>> and see what kind of results I get.
>
> The first two patches should be fine, and it still can get ~8% improvement
> in my test.
Agree, those can go right in.
--
Jens Axboe