2014-10-14 10:04:06

by Dmitry Monakhov

Subject: [PATCH 0/4] blkdev: flush optimization

Some filesystems try to optimize barrier flushes by maintaining
fs-specific generation counters, but if we introduce a generic
flush generation counter for the block device, filesystems may use
it to optimize fdatasync(2). The optimization should work when
userspace performs multi-threaded IO with a lot of fdatasync() calls.
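
For illustration, a minimal sketch of how a filesystem consumes the
counter (this mirrors what patches 1 and 4 wire up via
blk_get_flush_idx()/blk_flush_idx_is_stable(); treat it as pseudo-code,
not a standalone implementation):

    /* on write completion (end_io): remember the current flush epoch */
    ei->i_flush_idx = blk_get_flush_idx(q, true);

    /* on fsync/fdatasync: skip the explicit flush if a flush that was
     * queued and completed after our writes has already gone through */
    if (blk_flush_idx_is_stable(q, ei->i_flush_idx))
            needs_barrier = false;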

TOC:
blkdev: add flush generation counter
md: add flush_idx support for stacked devices
ext4: cleanup data integrity sync for nonjournal mode
ext4: Add fdatasync scalability optimization


2014-10-14 10:04:18

by Dmitry Monakhov

Subject: [PATCH 4/4] ext4: Add fdatasync scalability optimization

If the block device supports flush epoch generation we can optimize fsync
with the following logic:
1) track flush_idx on a per-inode basis, updating it inside end_io
2) during fsync, compare the inode's flush_idx with the current device's
flush_idx. If the device generation is newer, it means someone has
already flushed the data from the volatile cache to stable media, so
we can avoid the explicit barrier.

This optimization works well for fsync-intensive workloads such as:
1) mail servers
2) cloud/chunk servers which mostly perform direct-io and fdatasync
3) fdatasync of a bunch of files, which then requires only one barrier
(see the example below).
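
For illustration only, a minimal userspace sketch of workload (3); the
helper below is hypothetical and just shows the pattern that benefits,
assuming the files are opened with O_DIRECT (the dio case) so that all
writes complete before the sync pass:

    #include <unistd.h>

    /* write 4KiB to each file, then fdatasync() them in a second pass */
    static void write_and_sync(int *fd, int fd_num, const char *buf)
    {
            int i;

            for (i = 0; i < fd_num; i++)
                    pwrite(fd[i], buf, 4096, 0);
            for (i = 0; i < fd_num; i++)
                    fdatasync(fd[i]);
    }

Without the optimization every fdatasync() issues its own cache flush;
with it, the flush completed by the first fdatasync() lets the remaining
ones skip theirs.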

The patch was tested on real HW with the power-loss test hwflush-check
(submitted here: http://patchwork.ozlabs.org/patch/244297/).
The following configurations were tested:
ext4.buf_noalloc_fsync PASS
ext4.buf_noalloc_fdatasync PASS
ext4.buf_falloc_fsync PASS
ext4.buf_falloc_fdatasync PASS
ext4.buf_rewrite_fsync PASS
ext4.buf_rewrite_fdatasync PASS
ext4.dio_noalloc_fsync PASS
ext4.dio_noalloc_fdatasync PASS
ext4.dio_falloc_fsync PASS
ext4.dio_falloc_fdatasync PASS
ext4.dio_rewrite_fsync PASS
ext4.dio_rewrite_fdatasync PASS

CC: [email protected]
Signed-off-by: Dmitry Monakhov <[email protected]>
---
fs/ext4/ext4.h | 1 +
fs/ext4/ext4_jbd2.h | 9 +++++++++
fs/ext4/fsync.c | 22 ++++++++++++++++++----
fs/ext4/ialloc.c | 1 +
fs/ext4/indirect.c | 16 ++++++++++++++--
fs/ext4/inode.c | 4 ++++
fs/ext4/page-io.c | 1 +
include/trace/events/ext4.h | 10 ++++++----
8 files changed, 54 insertions(+), 10 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b0c225c..46cb45a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -939,6 +939,7 @@ struct ext4_inode_info {
*/
tid_t i_sync_tid;
tid_t i_datasync_tid;
+ unsigned i_flush_idx;

/* Precomputed uuid+inum+igen checksum for seeding inode checksums */
__u32 i_csum_seed;
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 17c00ff..06df47e 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -381,6 +381,15 @@ static inline void ext4_update_inode_fsync_trans(handle_t *handle,
}
}

+static inline void ext4_update_inode_fsync_fid(struct inode *inode)
+{
+ struct request_queue *q = bdev_get_queue(inode->i_sb->s_bdev);
+
+ if (unlikely(!q))
+ return;
+ ACCESS_ONCE(EXT4_I(inode)->i_flush_idx) = blk_get_flush_idx(q, true);
+}
+
/* super.c */
int ext4_force_commit(struct super_block *sb);

diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 2b0fd69..59fe0d2 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -90,8 +90,10 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct inode *inode = file->f_mapping->host;
struct ext4_inode_info *ei = EXT4_I(inode);
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- int ret = 0, err;
+ struct request_queue *q = bdev_get_queue(inode->i_sb->s_bdev);
+ int ret = 0, barrier_opt = 0, err;
tid_t commit_tid;
+ unsigned flush_idx;
bool needs_barrier = test_opt(inode->i_sb, BARRIER);

J_ASSERT(ext4_journal_current_handle() == NULL);
@@ -108,6 +110,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)

if (!journal) {
ret = __generic_file_fsync(file, start, end, datasync);
+ flush_idx = ACCESS_ONCE(ei->i_flush_idx);
if (!ret && !hlist_empty(&inode->i_dentry))
ret = ext4_sync_parent(inode);
goto out_flush;
@@ -134,19 +137,30 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
ret = ext4_force_commit(inode->i_sb);
goto out;
}
-
commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
+ flush_idx = ACCESS_ONCE(ei->i_flush_idx);
if (needs_barrier &&
- jbd2_trans_will_send_data_barrier(journal, commit_tid))
+ jbd2_trans_will_send_data_barrier(journal, commit_tid)) {
needs_barrier = false;
+ barrier_opt = 1;
+ }
ret = jbd2_complete_transaction(journal, commit_tid);
out_flush:
+ /*
+ * We MUST send a barrier unless we can guarantee that:
+ * Latest write request for given inode was completed before
+ * new flush request was QUEUED and COMPLETED by blkdev.
+ */
+ if (needs_barrier && q && blk_flush_idx_is_stable(q, flush_idx)) {
+ needs_barrier = false;
+ barrier_opt = 2;
+ }
if (needs_barrier) {
err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
if (!ret)
ret = err;
}
out:
- trace_ext4_sync_file_exit(inode, ret);
+ trace_ext4_sync_file_exit(inode, ret, barrier_opt);
return ret;
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 5b87fc3..91581d5 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1052,6 +1052,7 @@ got:
}
}

+ ext4_update_inode_fsync_fid(inode);
if (ext4_handle_valid(handle)) {
ei->i_sync_tid = handle->h_transaction->t_tid;
ei->i_datasync_tid = handle->h_transaction->t_tid;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index e75f840..71a27a5 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -633,6 +633,14 @@ out:
return err;
}

+static void ext4_ind_end_dio(struct kiocb *iocb, loff_t offset,
+ ssize_t size, void *private)
+{
+ if (unlikely(!size))
+ return;
+ ext4_update_inode_fsync_fid(file_inode(iocb->ki_filp));
+}
+
/*
* O_DIRECT for ext3 (or indirect map) based files
*
@@ -697,8 +705,12 @@ retry:
inode_dio_done(inode);
} else {
locked:
- ret = blockdev_direct_IO(rw, iocb, inode, iter,
- offset, ext4_get_block);
+ ret = __blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iter,
+ offset, ext4_get_block,
+ (rw & WRITE) ? ext4_ind_end_dio :
+ NULL, NULL,
+ DIO_LOCKING | DIO_SKIP_HOLES);

if (unlikely((rw & WRITE) && ret < 0)) {
loff_t isize = i_size_read(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3aa26e9..9879634 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2940,6 +2940,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
iocb->private = NULL;
io_end->offset = offset;
io_end->size = size;
+ if (size)
+ ext4_update_inode_fsync_fid(io_end->inode);
+
ext4_put_io_end(io_end);
}

@@ -4035,6 +4038,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_sync_tid = tid;
ei->i_datasync_tid = tid;
}
+ ext4_update_inode_fsync_fid(inode);

if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
if (ei->i_extra_isize == 0) {
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index b24a254..fef8b2e 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -305,6 +305,7 @@ static void ext4_end_bio(struct bio *bio, int error)
if (test_bit(BIO_UPTODATE, &bio->bi_flags))
error = 0;

+ ext4_update_inode_fsync_fid(io_end->inode);
if (error) {
struct inode *inode = io_end->inode;

diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index d4f70a7..4a1989a 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -857,26 +857,28 @@ TRACE_EVENT(ext4_sync_file_enter,
);

TRACE_EVENT(ext4_sync_file_exit,
- TP_PROTO(struct inode *inode, int ret),
+ TP_PROTO(struct inode *inode, int ret, int barrier_opt),

- TP_ARGS(inode, ret),
+ TP_ARGS(inode, ret, barrier_opt),

TP_STRUCT__entry(
__field( dev_t, dev )
__field( ino_t, ino )
__field( int, ret )
+ __field( int, barrier_opt )
),

TP_fast_assign(
__entry->dev = inode->i_sb->s_dev;
__entry->ino = inode->i_ino;
__entry->ret = ret;
+ __entry->barrier_opt = barrier_opt;
),

- TP_printk("dev %d,%d ino %lu ret %d",
+ TP_printk("dev %d,%d ino %lu ret %d, barrier_opt %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
- __entry->ret)
+ __entry->ret, __entry->barrier_opt)
);

TRACE_EVENT(ext4_sync_fs,
--
1.7.1

2014-10-14 10:04:19

by Dmitry Monakhov

Subject: [PATCH 1/4] blkdev: add flush generation counter

PROOF:
*Flush machinery assumptions*
C1. At any given time, only one flush shall be in progress. This makes
double buffering sufficient.
C2. Flush is deferred if any request is executing DATA of its sequence.
This avoids issuing separate POSTFLUSHes for requests which shared
PREFLUSH.
C3. The second condition is ignored if there is a request which has
waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid
starvation in the unlikely case where there is a continuous stream of
FUA (without FLUSH) requests.

So we increment the flush_tag counter in two places:
blk_kick_flush: the place where a flush request is issued
flush_end_io : the place where a flush is completed
By using rule (C1) we can then guarantee that:
if ((flush_tag & 0x1) == 1) then that flush_tag is in progress
if ((flush_tag & 0x1) == 0) then (flush_tag & ~0x1) has completed
In other words we can define it as:

FLUSH_TAG_IDX = (flush_tag + 1) & ~0x1
FLUSH_TAG_STATE = flush_tag & 0x1 ? IN_PROGRESS : COMPLETED

After that we can state the rule for the flush optimization.
We can be sure that our data was flushed only if:
1) the data's bio was completed before a flush request was QUEUED
and COMPLETED.
In terms of formulas we can write it as follows:
is_data_flushed = (blkdev->flush_tag & ~0x1) >
((data->flush_tag + 0x1) & ~0x1)
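
A worked example of the encoding (numbers are illustrative): a data bio
completes while flush_tag == 2, i.e. flush #1 has completed and nothing
is in flight, so (data->flush_tag + 0x1) & ~0x1 == 2. A new flush is
then kicked (flush_tag -> 3) and completes (flush_tag -> 4); now
blkdev->flush_tag & ~0x1 == 4 > 2, so the data is known to be on stable
media and fdatasync may skip its own barrier. Had the bio completed at
flush_tag == 3 (a flush already in flight, possibly issued before the
data reached the cache), the threshold would be 4 and durability is only
proven once the next flush completes at flush_tag == 6.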


In order to support stacked block devices (especially linear dm)
I've implemented the get_flush_idx function as a queue callback.

*Multi-Queue scalability notes*
This implementation tries to make a global optimization across all
hw-queues of a device, which requires a read from each hw-queue, like so:
queue_for_each_hw_ctx(q, hctx, i)
fid += ACCESS_ONCE(hctx->fq->flush_idx)

In my tests I do not see any visible difference in performance on
my hardware (W2600CR: 8 cores x 2 sockets, 2 NUMA nodes).
Really fast HW may prefer to return a flush_id for a single hw-queue;
in order to do so we would have to encode the flush_id with the
hw_queue_id like so:
fid_t = (hctx->fq->flush_id << MQ_SHIFT) | hctx->id
#define MQ_ID(fid) (fid & ((1 << MQ_SHIFT) - 1))
Later blk_flush_idx_is_stable() can treat a fid_t as unstable if
it was obtained from another hw-queue:

bool blk_flush_idx_is_stable(struct request_queue *q, fid_t fid)
{
	int difference;
	fid_t cur = blk_get_flush_idx(q, false);

	if (MQ_ID(fid) != MQ_ID(cur))
		return 0;

	difference = (cur - fid);
	return (difference > 0);
}
Please let me know if you prefer that design to the global one.

CC: Jens Axboe <[email protected]>
CC: Christoph Hellwig <[email protected]>
CC: Ming Lei <[email protected]>
Signed-off-by: Dmitry Monakhov <[email protected]>
---
block/blk-core.c | 1 +
block/blk-flush.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++-
block/blk-mq.c | 5 ++-
block/blk-settings.c | 6 +++++
block/blk-sysfs.c | 11 ++++++++++
block/blk.h | 1 +
include/linux/blkdev.h | 17 ++++++++++++++++
7 files changed, 88 insertions(+), 3 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index ffcb47a..78c7e64 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -714,6 +714,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
q->request_fn = rfn;
q->prep_rq_fn = NULL;
q->unprep_rq_fn = NULL;
+ q->get_flush_idx_fn = get_flush_idx;
q->queue_flags |= QUEUE_FLAG_DEFAULT;

/* Override internal queue lock with supplied lock pointer */
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 20badd7..e264af8 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -147,6 +147,46 @@ static bool blk_flush_queue_rq(struct request *rq, bool add_front)
}

/**
+ * get_flush_idx - return stable flush epoch index for a given queue
+ * @q: request_queue
+ * @queued: return index of the latest queued flush
+ *
+ * Any bio which was completed before some flush request was QUEUED
+ * and COMPLETED is already in permanent store. Upper layers may use this
+ * feature and skip an explicit flush if one has already been done.
+ *
+ * fq->flush_idx counter incremented in two places:
+ * 1)blk_kick_flush: the place where flush request is issued
+ * 2)flush_end_io : the place where flush is completed
+ * And by using rule (C1) we can guarantee that:
+ * if (fq->flush_idx & 0x1 == 1) then flush_idx is in QUEUED
+ * if (fq->flush_idx & 0x1 == 0) then (flush_idx & ~(0x1)) COMPLETED
+ *
+ * In other words we can define it as:
+ * FLUSH_IDX = (flush_idx +1) & ~0x1
+ * FLUSH_STATE = flush_idx & 0x1 ? IN_PROGRESS : COMPLETED
+ *
+ */
+unsigned get_flush_idx(struct request_queue *q, bool queued)
+{
+ struct blk_mq_hw_ctx *hctx;
+ unsigned int i;
+ unsigned fid = 0;
+ unsigned diff = 0;
+
+ if (queued)
+ diff = 0x1;
+ if (!q->mq_ops) {
+ fid = (ACCESS_ONCE(q->fq->flush_idx) + diff) >> 1;
+ } else {
+ queue_for_each_hw_ctx(q, hctx, i)
+ fid += (ACCESS_ONCE(hctx->fq->flush_idx) + diff) >> 1;
+ }
+ return fid;
+}
+EXPORT_SYMBOL(get_flush_idx);
+
+/**
* blk_flush_complete_seq - complete flush sequence
* @rq: FLUSH/FUA request being sequenced
* @fq: flush queue
@@ -232,9 +272,15 @@ static void flush_end_io(struct request *flush_rq, int error)

running = &fq->flush_queue[fq->flush_running_idx];
BUG_ON(fq->flush_pending_idx == fq->flush_running_idx);
-
+ BUG_ON(!(fq->flush_idx & 0x1));
/* account completion of the flush request */
fq->flush_running_idx ^= 1;
+ /* In case of error we have to restore the original flush_idx
+ * which was incremented in blk_kick_flush */
+ if (unlikely(error))
+ fq->flush_idx--;
+ else
+ fq->flush_idx++;

if (!q->mq_ops)
elv_completed_request(q, flush_rq);
@@ -303,6 +349,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
* different from running_idx, which means flush is in flight.
*/
fq->flush_pending_idx ^= 1;
+ fq->flush_idx++;

blk_rq_init(q, flush_rq);

@@ -509,6 +556,7 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
INIT_LIST_HEAD(&fq->flush_queue[0]);
INIT_LIST_HEAD(&fq->flush_queue[1]);
INIT_LIST_HEAD(&fq->flush_data_in_flight);
+ fq->flush_idx = 0;

return fq;

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4e7a314..3b60daf 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1624,9 +1624,10 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
break;
}

- if (i == q->nr_hw_queues)
+ if (i == q->nr_hw_queues) {
+ q->get_flush_idx_fn = get_flush_idx;
return 0;
-
+ }
/*
* Init failed
*/
diff --git a/block/blk-settings.c b/block/blk-settings.c
index aa02247..1b192dc 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -99,6 +99,12 @@ void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn)
}
EXPORT_SYMBOL_GPL(blk_queue_lld_busy);

+void blk_queue_get_flush_idx(struct request_queue *q, get_flush_idx_fn *fn)
+{
+ q->get_flush_idx_fn = fn;
+}
+EXPORT_SYMBOL_GPL(blk_queue_get_flush_idx);
+
/**
* blk_set_default_limits - reset limits to default values
* @lim: the queue_limits structure to reset
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index e8f38a3..533fc0c 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -94,6 +94,11 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count)
return ret;
}

+static ssize_t queue_flush_idx_show(struct request_queue *q, char *page)
+{
+ return sprintf(page, "%u\n", blk_get_flush_idx(q, false));
+}
+
static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
{
int max_sectors_kb = queue_max_sectors(q) >> 1;
@@ -374,6 +379,11 @@ static struct queue_sysfs_entry queue_write_same_max_entry = {
.show = queue_write_same_max_show,
};

+static struct queue_sysfs_entry queue_flush_idx_entry = {
+ .attr = {.name = "flush_index", .mode = S_IRUGO },
+ .show = queue_flush_idx_show,
+};
+
static struct queue_sysfs_entry queue_nonrot_entry = {
.attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
.show = queue_show_nonrot,
@@ -422,6 +432,7 @@ static struct attribute *default_attrs[] = {
&queue_discard_max_entry.attr,
&queue_discard_zeroes_data_entry.attr,
&queue_write_same_max_entry.attr,
+ &queue_flush_idx_entry.attr,
&queue_nonrot_entry.attr,
&queue_nomerges_entry.attr,
&queue_rq_affinity_entry.attr,
diff --git a/block/blk.h b/block/blk.h
index 43b0361..f4fa503 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -18,6 +18,7 @@ struct blk_flush_queue {
unsigned int flush_queue_delayed:1;
unsigned int flush_pending_idx:1;
unsigned int flush_running_idx:1;
+ unsigned int flush_idx;
unsigned long flush_pending_since;
struct list_head flush_queue[2];
struct list_head flush_data_in_flight;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5546392..6fb7bab 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -237,6 +237,7 @@ typedef void (request_fn_proc) (struct request_queue *q);
typedef void (make_request_fn) (struct request_queue *q, struct bio *bio);
typedef int (prep_rq_fn) (struct request_queue *, struct request *);
typedef void (unprep_rq_fn) (struct request_queue *, struct request *);
+typedef unsigned (get_flush_idx_fn) (struct request_queue *q, bool queued);

struct bio_vec;
struct bvec_merge_data {
@@ -332,6 +333,7 @@ struct request_queue {
rq_timed_out_fn *rq_timed_out_fn;
dma_drain_needed_fn *dma_drain_needed;
lld_busy_fn *lld_busy_fn;
+ get_flush_idx_fn *get_flush_idx_fn;

struct blk_mq_ops *mq_ops;

@@ -1030,6 +1032,7 @@ extern int blk_queue_dma_drain(struct request_queue *q,
dma_drain_needed_fn *dma_drain_needed,
void *buf, unsigned int size);
extern void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn);
+extern void blk_queue_get_flush_idx(struct request_queue *q, get_flush_idx_fn *fn);
extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn);
extern void blk_queue_unprep_rq(struct request_queue *, unprep_rq_fn *ufn);
@@ -1180,6 +1183,20 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block,
nr_blocks << (sb->s_blocksize_bits - 9),
gfp_mask);
}
+extern unsigned get_flush_idx(struct request_queue *q, bool queued);
+static inline unsigned blk_get_flush_idx(struct request_queue *q, bool queued)
+{
+ if (unlikely(!q || !q->get_flush_idx_fn))
+ return 0;
+
+ return q->get_flush_idx_fn(q, queued);
+}
+static inline bool blk_flush_idx_is_stable(struct request_queue *q, unsigned id)
+{
+ int difference = (blk_get_flush_idx(q, false) - id);
+
+ return (difference > 0);
+}

extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);

--
1.7.1

2014-10-14 10:04:24

by Dmitry Monakhov

Subject: [PATCH 2/4] md: add flush_idx support for stacked devices

For a single-device mapping it is safe to introduce the get_flush_idx concept.
This covers the most common case: a linear mapping onto a single device.

CC: Alasdair Kergon <[email protected]>
CC: Mike Snitzer <[email protected]>
CC: [email protected]
Signed-off-by: Dmitry Monakhov <[email protected]>
---
drivers/md/dm.c | 31 +++++++++++++++++++++++++++++++
1 files changed, 31 insertions(+), 0 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 32b958d..042d172 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1843,6 +1843,36 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
return r;
}

+static int dm_iter_get_flush_idx(struct dm_target *ti,
+ struct dm_dev *dev, sector_t start,
+ sector_t len, void *data)
+{
+ struct block_device *bdev = dev->bdev;
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ return blk_get_flush_idx(q, *((bool *)data));
+}
+
+static unsigned dm_get_flush_idx(struct request_queue *q, bool queued)
+{
+ struct dm_target *ti;
+ struct mapped_device *md = q->queuedata;
+ struct dm_table *map = dm_get_live_table_fast(md);
+ unsigned fid = 0;
+
+ if (unlikely(!map || dm_table_get_num_targets(map) != 1))
+ goto out;
+
+ ti = dm_table_get_target(map, 0);
+ if (unlikely(!ti || !ti->type->iterate_devices))
+ goto out;
+
+ fid = ti->type->iterate_devices(ti, dm_iter_get_flush_idx, &queued);
+out:
+ dm_put_live_table_fast(md);
+ return fid;
+}
+
/*-----------------------------------------------------------------
* An IDR is used to keep track of allocated minor numbers.
*---------------------------------------------------------------*/
@@ -1962,6 +1992,7 @@ static struct mapped_device *alloc_dev(int minor)
goto bad_queue;

dm_init_md_queue(md);
+ blk_queue_get_flush_idx(md->queue, dm_get_flush_idx);

md->disk = alloc_disk(1);
if (!md->disk)
--
1.7.1

2014-10-14 10:04:16

by Dmitry Monakhov

Subject: [PATCH 3/4] ext4: cleanup data integrity sync for nonjournal mode

There are several rules we must follow during a data integrity sync:
1) a barrier MUST be sent if the barrier option is enabled (and must not
be sent otherwise)
2) if we can guarantee that a barrier will be sent for us,
we can skip the explicit barrier.

Change log:
- Fix needs_barrier initialization according to rule (1)
- Avoid barriers if the barrier option is not enabled in non-journal mode,
according to rule (1)
- Style cleanup of the flush optimization according to rule (2)

CC: [email protected]
Signed-off-by: Dmitry Monakhov <[email protected]>
---
fs/ext4/fsync.c | 13 +++++++------
1 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index a8bc47f..2b0fd69 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -92,7 +92,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
int ret = 0, err;
tid_t commit_tid;
- bool needs_barrier = false;
+ bool needs_barrier = test_opt(inode->i_sb, BARRIER);

J_ASSERT(ext4_journal_current_handle() == NULL);

@@ -107,10 +107,10 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}

if (!journal) {
- ret = generic_file_fsync(file, start, end, datasync);
+ ret = __generic_file_fsync(file, start, end, datasync);
if (!ret && !hlist_empty(&inode->i_dentry))
ret = ext4_sync_parent(inode);
- goto out;
+ goto out_flush;
}

ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
@@ -136,10 +136,11 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}

commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
- if (journal->j_flags & JBD2_BARRIER &&
- !jbd2_trans_will_send_data_barrier(journal, commit_tid))
- needs_barrier = true;
+ if (needs_barrier &&
+ jbd2_trans_will_send_data_barrier(journal, commit_tid))
+ needs_barrier = false;
ret = jbd2_complete_transaction(journal, commit_tid);
+out_flush:
if (needs_barrier) {
err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
if (!ret)
--
1.7.1

2014-10-14 10:57:31

by Christoph Hellwig

Subject: Re: [PATCH 4/4] ext4: Add fdatasync scalability optimization

Much of the bookkeeping here seems generic and should be in common
code and not inside a filesystem.

2014-10-14 11:26:06

by Dmitry Monakhov

Subject: Re: [PATCH 4/4] ext4: Add fdatasync scalability optimization

Christoph Hellwig <[email protected]> writes:

> Much of the bookkeeping here seems generic and should be in common
> code and not inside a filesystem.
Yes. But this means that I need to store the flush_id inside the generic
inode, which will likely be received negatively because VFS people do not
like inode bloat. But if you are OK with that then I'll prepare the patch.



2014-10-14 11:35:49

by Christoph Hellwig

Subject: Re: [PATCH 4/4] ext4: Add fdatasync scalability optimization

On Tue, Oct 14, 2014 at 03:25:54PM +0400, Dmitry Monakhov wrote:
> Christoph Hellwig <[email protected]> writes:
>
> > Much of the bookkeeping here seems generic and should be in common
> > code and not inside a filesystem.
> Yes. But this means that I need to store flush_id inside generic inode
> which likely will be accepted negatively because VFS people do not like
> inode bloating. But if you are OK then I'll prepare the patch.

With my "VFS person critical of struct inode size increases" hat on
I'm still critical of any struct inode increase, so if you can find
an option that say just takes the address of the counter variable
I'd prefer that. But even if we have to increase the inode this might
be one of the cases where it's fine as cache flushing is something
at least all on disk filesystems need to do.

2014-10-14 12:25:29

by Mike Snitzer

Subject: Re: [PATCH 2/4] md: add flush_idx support for stacked devices

On Tue, Oct 14 2014 at 6:03am -0400,
Dmitry Monakhov <[email protected]> wrote:

> For single device mapping it is safe to introduce get_flush_idx conception.
> This cover most common case linear mapping for a single dev.

You really should email the entire patchset rather than forcing the need
to hunt for the remainder of the patches that actually provide context
for this change.

2014-10-16 14:24:32

by Ming Lei

Subject: Re: [PATCH 1/4] blkdev: add flush generation counter

On Tue, Oct 14, 2014 at 12:03 PM, Dmitry Monakhov <[email protected]> wrote:
> PROF:
> *Flush machinery addumptions
> C1. At any given time, only one flush shall be in progress. This is
> double buffering sufficient.
> C2. Flush is deferred if any request is executing DATA of its ence.
> This avoids issuing separate POSTFLUSHes for requests which ed
> PREFLUSH.
> C3. The second condition is ignored if there is a request which has
> waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid
> starvation in the unlikely case where there are continuous am of
> FUA (without FLUSH) requests.
>
> So if we will increment flush_tag counter in two places:
> blk_kick_flush: the place where flush request is issued
> flush_end_io : the place where flush is completed
> And by using rule (C1) we can guarantee that:
> if (flush_tag & 0x1 == 1) then flush_tag is in progress
> if (flush_tag & 0x1 == 0) then (flush_tag & ~(0x1)) completed
> In other words we can define it as:
>
> FLUSH_TAG_IDX = (flush_tag +1) & ~0x1
> FLUSH_TAG_STATE = flush_tag & 0x1 ? IN_PROGRESS : COMPLETED
>
> After that we can define rules for flush optimization:
> We can be sure that our data was flushed only if:
> 1) data's bio was completed before flush request was QUEUED
> and COMPLETED
> So in terms of formulas we can write it like follows:
> is_data_flushed = (blkdev->flush_tag & ~(0x1)) >
> ((data->flush_tag + 0x1) & (~0x1))

It looks like you are just trying to figure out whether the 'data' has
been flushed or not, so care to explain a bit what the optimization is?

>
>
> In order to support stacked block devices (especially linear dm)
> I've implemented get_flush_idx function as queue's callback.
>
> *Mutli-Queue scalability notes*
> This implementation try to makes global optimization for all hw-queues
> for a device which require read from each hw-queue like follows:
> queue_for_each_hw_ctx(q, hctx, i)
> fid += ACCESS_ONCE(hctx->fq->flush_idx)

I am wondering if it can work: suppose request A is submitted
to hw_queue 0, and request B is submitted to hw_queue 1; then
you might think request A has been flushed out when request
B has just been flushed via hctx 1.

>
> In my tests I do not see any visiable difference on performance on
> my hardware (W2600CR: 8cores x 2 sockets, 2numa nodes).
> Really fast HW may prefer to return flush_id for a single hw-queue
> in order to do so we have to encode flush_id with hw_queue_id
> like follows:
> fid_t = (hctx->fq->flush_id << MQ_SHIFT) | hctx->id
> #define MQ_ID(fid) (fid & ((1 << MQ_SHIFT) -1))
> Later blk_flush_idx_is_stable() can assumes fid_t as unstable if
> if was obtained from another hw-queue:
>
> bool blk_flush_idx_is_stable(struct request_queue *q, fid_t fid)
> {
> int difference;
> fid_t cur = blk_get_flush_idx(q, false);
> if (MQ_ID(fid) != MQ_ID(fid))
> return 0;
>
> difference = (blk_get_flush_idx(q, false) - id);
> return (difference > 0);
>
> }
> Please let me know if you prefer that design to global one.
>
> CC: Jens Axboe <[email protected]>
> CC: Christoph Hellwig <[email protected]>
> CC: Ming Lei <[email protected]>
> Signed-off-by: Dmitry Monakhov <[email protected]>
> ---
> block/blk-core.c | 1 +
> block/blk-flush.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++-
> block/blk-mq.c | 5 ++-
> block/blk-settings.c | 6 +++++
> block/blk-sysfs.c | 11 ++++++++++
> block/blk.h | 1 +
> include/linux/blkdev.h | 17 ++++++++++++++++
> 7 files changed, 88 insertions(+), 3 deletions(-)
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index ffcb47a..78c7e64 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -714,6 +714,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
> q->request_fn = rfn;
> q->prep_rq_fn = NULL;
> q->unprep_rq_fn = NULL;
> + q->get_flush_idx_fn = get_flush_idx;
> q->queue_flags |= QUEUE_FLAG_DEFAULT;
>
> /* Override internal queue lock with supplied lock pointer */
> diff --git a/block/blk-flush.c b/block/blk-flush.c
> index 20badd7..e264af8 100644
> --- a/block/blk-flush.c
> +++ b/block/blk-flush.c
> @@ -147,6 +147,46 @@ static bool blk_flush_queue_rq(struct request *rq, bool add_front)
> }
>
> /**
> + * get_flush_idx - return stable flush epoch index for a given queue
> + * @q: request_queue
> + * @queued: return index of latests queued flush
> + *
> + * Any bio which was completed before some flush request was QUEUED
> + * and COMPLETED is already in permanent store. Upper layer may use this
> + * feature and skip explicit flush if already does that
> + *
> + * fq->flush_idx counter incremented in two places:
> + * 1)blk_kick_flush: the place where flush request is issued
> + * 2)flush_end_io : the place where flush is completed
> + * And by using rule (C1) we can guarantee that:
> + * if (fq->flush_idx & 0x1 == 1) then flush_idx is in QUEUED
> + * if (fq->flush_idx & 0x1 == 0) then (flush_idx & ~(0x1)) COMPLETED
> + *
> + * In other words we can define it as:
> + * FLUSH_IDX = (flush_idx +1) & ~0x1
> + * FLUSH_STATE = flush_idx & 0x1 ? IN_PROGRESS : COMPLETED
> + *
> + */
> +unsigned get_flush_idx(struct request_queue *q, bool queued)
> +{
> + struct blk_mq_hw_ctx *hctx;
> + unsigned int i;
> + unsigned fid = 0;
> + unsigned diff = 0;
> +
> + if (queued)
> + diff = 0x1;
> + if (!q->mq_ops) {
> + fid = (ACCESS_ONCE(q->fq->flush_idx) + diff) >> 1;
> + } else {
> + queue_for_each_hw_ctx(q, hctx, i)
> + fid += (ACCESS_ONCE(hctx->fq->flush_idx) + diff) >> 1;
> + }
> + return fid;
> +}
> +EXPORT_SYMBOL(get_flush_idx);
> +
> +/**
> * blk_flush_complete_seq - complete flush sequence
> * @rq: FLUSH/FUA request being sequenced
> * @fq: flush queue
> @@ -232,9 +272,15 @@ static void flush_end_io(struct request *flush_rq, int error)
>
> running = &fq->flush_queue[fq->flush_running_idx];
> BUG_ON(fq->flush_pending_idx == fq->flush_running_idx);
> -
> + BUG_ON(!(fq->flush_idx & 0x1));
> /* account completion of the flush request */
> fq->flush_running_idx ^= 1;
> + /* In case of error we have to restore original flush_idx
> + * which was incremented kick_flush */
> + if (unlikely(error))
> + fq->flush_idx--;
> + else
> + fq->flush_idx++;
>
> if (!q->mq_ops)
> elv_completed_request(q, flush_rq);
> @@ -303,6 +349,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
> * different from running_idx, which means flush is in flight.
> */
> fq->flush_pending_idx ^= 1;
> + fq->flush_idx++;
>
> blk_rq_init(q, flush_rq);
>
> @@ -509,6 +556,7 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
> INIT_LIST_HEAD(&fq->flush_queue[0]);
> INIT_LIST_HEAD(&fq->flush_queue[1]);
> INIT_LIST_HEAD(&fq->flush_data_in_flight);
> + fq->flush_idx = 0;
>
> return fq;
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 4e7a314..3b60daf 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -1624,9 +1624,10 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
> break;
> }
>
> - if (i == q->nr_hw_queues)
> + if (i == q->nr_hw_queues) {
> + q->get_flush_idx_fn = get_flush_idx;
> return 0;
> -
> + }
> /*
> * Init failed
> */
> diff --git a/block/blk-settings.c b/block/blk-settings.c
> index aa02247..1b192dc 100644
> --- a/block/blk-settings.c
> +++ b/block/blk-settings.c
> @@ -99,6 +99,12 @@ void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn)
> }
> EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
>
> +void blk_queue_get_flush_idx(struct request_queue *q, get_flush_idx_fn *fn)
> +{
> + q->get_flush_idx_fn = fn;
> +}
> +EXPORT_SYMBOL_GPL(blk_queue_get_flush_idx);
> +
> /**
> * blk_set_default_limits - reset limits to default values
> * @lim: the queue_limits structure to reset
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index e8f38a3..533fc0c 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -94,6 +94,11 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count)
> return ret;
> }
>
> +static ssize_t queue_flush_idx_show(struct request_queue *q, char *page)
> +{
> + return sprintf(page, "%u\n", blk_get_flush_idx(q, false));
> +}
> +
> static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
> {
> int max_sectors_kb = queue_max_sectors(q) >> 1;
> @@ -374,6 +379,11 @@ static struct queue_sysfs_entry queue_write_same_max_entry = {
> .show = queue_write_same_max_show,
> };
>
> +static struct queue_sysfs_entry queue_flush_idx_entry = {
> + .attr = {.name = "flush_index", .mode = S_IRUGO },
> + .show = queue_flush_idx_show,
> +};
> +
> static struct queue_sysfs_entry queue_nonrot_entry = {
> .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
> .show = queue_show_nonrot,
> @@ -422,6 +432,7 @@ static struct attribute *default_attrs[] = {
> &queue_discard_max_entry.attr,
> &queue_discard_zeroes_data_entry.attr,
> &queue_write_same_max_entry.attr,
> + &queue_flush_idx_entry.attr,
> &queue_nonrot_entry.attr,
> &queue_nomerges_entry.attr,
> &queue_rq_affinity_entry.attr,
> diff --git a/block/blk.h b/block/blk.h
> index 43b0361..f4fa503 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -18,6 +18,7 @@ struct blk_flush_queue {
> unsigned int flush_queue_delayed:1;
> unsigned int flush_pending_idx:1;
> unsigned int flush_running_idx:1;
> + unsigned int flush_idx;
> unsigned long flush_pending_since;
> struct list_head flush_queue[2];
> struct list_head flush_data_in_flight;
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 5546392..6fb7bab 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -237,6 +237,7 @@ typedef void (request_fn_proc) (struct request_queue *q);
> typedef void (make_request_fn) (struct request_queue *q, struct bio *bio);
> typedef int (prep_rq_fn) (struct request_queue *, struct request *);
> typedef void (unprep_rq_fn) (struct request_queue *, struct request *);
> +typedef unsigned (get_flush_idx_fn) (struct request_queue *q, bool queued);
>
> struct bio_vec;
> struct bvec_merge_data {
> @@ -332,6 +333,7 @@ struct request_queue {
> rq_timed_out_fn *rq_timed_out_fn;
> dma_drain_needed_fn *dma_drain_needed;
> lld_busy_fn *lld_busy_fn;
> + get_flush_idx_fn *get_flush_idx_fn;
>
> struct blk_mq_ops *mq_ops;
>
> @@ -1030,6 +1032,7 @@ extern int blk_queue_dma_drain(struct request_queue *q,
> dma_drain_needed_fn *dma_drain_needed,
> void *buf, unsigned int size);
> extern void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn);
> +extern void blk_queue_get_flush_idx(struct request_queue *q, get_flush_idx_fn *fn);
> extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
> extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn);
> extern void blk_queue_unprep_rq(struct request_queue *, unprep_rq_fn *ufn);
> @@ -1180,6 +1183,20 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block,
> nr_blocks << (sb->s_blocksize_bits - 9),
> gfp_mask);
> }
> +extern unsigned get_flush_idx(struct request_queue *q, bool queued);
> +static inline unsigned blk_get_flush_idx(struct request_queue *q, bool queued)
> +{
> + if (unlikely(!q || !q->get_flush_idx_fn))
> + return 0;
> +
> + return q->get_flush_idx_fn(q, queued);
> +}
> +static inline bool blk_flush_idx_is_stable(struct request_queue *q, unsigned id)
> +{
> + int difference = (blk_get_flush_idx(q, false) - id);
> +
> + return (difference > 0);
> +}
>
> extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
>
> --
> 1.7.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2014-10-16 15:15:12

by Dmitry Monakhov

Subject: Re: [PATCH 1/4] blkdev: add flush generation counter

Ming Lei <[email protected]> writes:

> On Tue, Oct 14, 2014 at 12:03 PM, Dmitry Monakhov <[email protected]> wrote:
>> PROF:
>> *Flush machinery addumptions
>> C1. At any given time, only one flush shall be in progress. This is
>> double buffering sufficient.
>> C2. Flush is deferred if any request is executing DATA of its ence.
>> This avoids issuing separate POSTFLUSHes for requests which ed
>> PREFLUSH.
>> C3. The second condition is ignored if there is a request which has
>> waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid
>> starvation in the unlikely case where there are continuous am of
>> FUA (without FLUSH) requests.
>>
>> So if we will increment flush_tag counter in two places:
>> blk_kick_flush: the place where flush request is issued
>> flush_end_io : the place where flush is completed
>> And by using rule (C1) we can guarantee that:
>> if (flush_tag & 0x1 == 1) then flush_tag is in progress
>> if (flush_tag & 0x1 == 0) then (flush_tag & ~(0x1)) completed
>> In other words we can define it as:
>>
>> FLUSH_TAG_IDX = (flush_tag +1) & ~0x1
>> FLUSH_TAG_STATE = flush_tag & 0x1 ? IN_PROGRESS : COMPLETED
>>
>> After that we can define rules for flush optimization:
>> We can be sure that our data was flushed only if:
>> 1) data's bio was completed before flush request was QUEUED
>> and COMPLETED
>> So in terms of formulas we can write it like follows:
>> is_data_flushed = (blkdev->flush_tag & ~(0x1)) >
>> ((data->flush_tag + 0x1) & (~0x1))
>
> Looks you are just trying to figure out if the 'data' is flushed or not,
> so care to explain a bit what the optimization is?
Indeed, I am trying to figure out whether the data was flushed or not.
Inodes on the filesystem may update this ID inside the ->end_io callback.
Later, if the user calls fsync/fdatasync, we may figure out that the inode's
data was already flushed, so we do not have to issue an explicit barrier.
This helps for intensive multi-file dio-aio/fsync workloads:
chunk servers, virt-disk images, mail servers, etc.

for (i = 0; i < fd_num; i++)
        pwrite(fd[i], buf, 4096, 0);
for (i = 0; i < fd_num; i++)
        fdatasync(fd[i]);
Before this optimization we have to issue a barrier for each fdatasync()
one by one; with the optimization enabled we send only one.
>
>>
>>
>> In order to support stacked block devices (especially linear dm)
>> I've implemented get_flush_idx function as queue's callback.
>>
>> *Mutli-Queue scalability notes*
>> This implementation try to makes global optimization for all hw-queues
>> for a device which require read from each hw-queue like follows:
>> queue_for_each_hw_ctx(q, hctx, i)
>> fid += ACCESS_ONCE(hctx->fq->flush_idx)
>
> I am wondering if it can work, suppose request A is submitted
> to hw_queue 0, and request B is submitted to hw_queue 1, then
> you may thought request A has been flushed out when request
> B is just flushed via hctx 1.
Correct, because we then know that since the moment A and B completed,
at least one barrier has been issued and completed (even if via hwctx 2).
A flush has a blkdev/request_queue-wide guarantee regardless of the hwctx.
If that is not the case for MQ then all filesystems are totally broken,
and blkdev_issue_flush MUST be changed to flush all hwctxs.
>
>>
>> In my tests I do not see any visiable difference on performance on
>> my hardware (W2600CR: 8cores x 2 sockets, 2numa nodes).
>> Really fast HW may prefer to return flush_id for a single hw-queue
>> in order to do so we have to encode flush_id with hw_queue_id
>> like follows:
>> fid_t = (hctx->fq->flush_id << MQ_SHIFT) | hctx->id
>> #define MQ_ID(fid) (fid & ((1 << MQ_SHIFT) -1))
>> Later blk_flush_idx_is_stable() can assumes fid_t as unstable if
>> if was obtained from another hw-queue:
>>
>> bool blk_flush_idx_is_stable(struct request_queue *q, fid_t fid)
>> {
>> int difference;
>> fid_t cur = blk_get_flush_idx(q, false);
>> if (MQ_ID(fid) != MQ_ID(fid))
>> return 0;
>>
>> difference = (blk_get_flush_idx(q, false) - id);
>> return (difference > 0);
>>
>> }
>> Please let me know if you prefer that design to global one.
>>
>> CC: Jens Axboe <[email protected]>
>> CC: Christoph Hellwig <[email protected]>
>> CC: Ming Lei <[email protected]>
>> Signed-off-by: Dmitry Monakhov <[email protected]>
>> ---
>> block/blk-core.c | 1 +
>> block/blk-flush.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++-
>> block/blk-mq.c | 5 ++-
>> block/blk-settings.c | 6 +++++
>> block/blk-sysfs.c | 11 ++++++++++
>> block/blk.h | 1 +
>> include/linux/blkdev.h | 17 ++++++++++++++++
>> 7 files changed, 88 insertions(+), 3 deletions(-)
>>
>> diff --git a/block/blk-core.c b/block/blk-core.c
>> index ffcb47a..78c7e64 100644
>> --- a/block/blk-core.c
>> +++ b/block/blk-core.c
>> @@ -714,6 +714,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
>> q->request_fn = rfn;
>> q->prep_rq_fn = NULL;
>> q->unprep_rq_fn = NULL;
>> + q->get_flush_idx_fn = get_flush_idx;
>> q->queue_flags |= QUEUE_FLAG_DEFAULT;
>>
>> /* Override internal queue lock with supplied lock pointer */
>> diff --git a/block/blk-flush.c b/block/blk-flush.c
>> index 20badd7..e264af8 100644
>> --- a/block/blk-flush.c
>> +++ b/block/blk-flush.c
>> @@ -147,6 +147,46 @@ static bool blk_flush_queue_rq(struct request *rq, bool add_front)
>> }
>>
>> /**
>> + * get_flush_idx - return stable flush epoch index for a given queue
>> + * @q: request_queue
>> + * @queued: return index of latests queued flush
>> + *
>> + * Any bio which was completed before some flush request was QUEUED
>> + * and COMPLETED is already in permanent store. Upper layer may use this
>> + * feature and skip explicit flush if already does that
>> + *
>> + * fq->flush_idx counter incremented in two places:
>> + * 1)blk_kick_flush: the place where flush request is issued
>> + * 2)flush_end_io : the place where flush is completed
>> + * And by using rule (C1) we can guarantee that:
>> + * if (fq->flush_idx & 0x1 == 1) then flush_idx is in QUEUED
>> + * if (fq->flush_idx & 0x1 == 0) then (flush_idx & ~(0x1)) COMPLETED
>> + *
>> + * In other words we can define it as:
>> + * FLUSH_IDX = (flush_idx +1) & ~0x1
>> + * FLUSH_STATE = flush_idx & 0x1 ? IN_PROGRESS : COMPLETED
>> + *
>> + */
>> +unsigned get_flush_idx(struct request_queue *q, bool queued)
>> +{
>> + struct blk_mq_hw_ctx *hctx;
>> + unsigned int i;
>> + unsigned fid = 0;
>> + unsigned diff = 0;
>> +
>> + if (queued)
>> + diff = 0x1;
>> + if (!q->mq_ops) {
>> + fid = (ACCESS_ONCE(q->fq->flush_idx) + diff) >> 1;
>> + } else {
>> + queue_for_each_hw_ctx(q, hctx, i)
>> + fid += (ACCESS_ONCE(hctx->fq->flush_idx) + diff) >> 1;
>> + }
>> + return fid;
>> +}
>> +EXPORT_SYMBOL(get_flush_idx);
>> +
>> +/**
>> * blk_flush_complete_seq - complete flush sequence
>> * @rq: FLUSH/FUA request being sequenced
>> * @fq: flush queue
>> @@ -232,9 +272,15 @@ static void flush_end_io(struct request *flush_rq, int error)
>>
>> running = &fq->flush_queue[fq->flush_running_idx];
>> BUG_ON(fq->flush_pending_idx == fq->flush_running_idx);
>> -
>> + BUG_ON(!(fq->flush_idx & 0x1));
>> /* account completion of the flush request */
>> fq->flush_running_idx ^= 1;
>> + /* In case of error we have to restore original flush_idx
>> + * which was incremented kick_flush */
>> + if (unlikely(error))
>> + fq->flush_idx--;
>> + else
>> + fq->flush_idx++;
>>
>> if (!q->mq_ops)
>> elv_completed_request(q, flush_rq);
>> @@ -303,6 +349,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
>> * different from running_idx, which means flush is in flight.
>> */
>> fq->flush_pending_idx ^= 1;
>> + fq->flush_idx++;
>>
>> blk_rq_init(q, flush_rq);
>>
>> @@ -509,6 +556,7 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
>> INIT_LIST_HEAD(&fq->flush_queue[0]);
>> INIT_LIST_HEAD(&fq->flush_queue[1]);
>> INIT_LIST_HEAD(&fq->flush_data_in_flight);
>> + fq->flush_idx = 0;
>>
>> return fq;
>>
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index 4e7a314..3b60daf 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -1624,9 +1624,10 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
>> break;
>> }
>>
>> - if (i == q->nr_hw_queues)
>> + if (i == q->nr_hw_queues) {
>> + q->get_flush_idx_fn = get_flush_idx;
>> return 0;
>> -
>> + }
>> /*
>> * Init failed
>> */
>> diff --git a/block/blk-settings.c b/block/blk-settings.c
>> index aa02247..1b192dc 100644
>> --- a/block/blk-settings.c
>> +++ b/block/blk-settings.c
>> @@ -99,6 +99,12 @@ void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn)
>> }
>> EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
>>
>> +void blk_queue_get_flush_idx(struct request_queue *q, get_flush_idx_fn *fn)
>> +{
>> + q->get_flush_idx_fn = fn;
>> +}
>> +EXPORT_SYMBOL_GPL(blk_queue_get_flush_idx);
>> +
>> /**
>> * blk_set_default_limits - reset limits to default values
>> * @lim: the queue_limits structure to reset
>> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
>> index e8f38a3..533fc0c 100644
>> --- a/block/blk-sysfs.c
>> +++ b/block/blk-sysfs.c
>> @@ -94,6 +94,11 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count)
>> return ret;
>> }
>>
>> +static ssize_t queue_flush_idx_show(struct request_queue *q, char *page)
>> +{
>> + return sprintf(page, "%u\n", blk_get_flush_idx(q, false));
>> +}
>> +
>> static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
>> {
>> int max_sectors_kb = queue_max_sectors(q) >> 1;
>> @@ -374,6 +379,11 @@ static struct queue_sysfs_entry queue_write_same_max_entry = {
>> .show = queue_write_same_max_show,
>> };
>>
>> +static struct queue_sysfs_entry queue_flush_idx_entry = {
>> + .attr = {.name = "flush_index", .mode = S_IRUGO },
>> + .show = queue_flush_idx_show,
>> +};
>> +
>> static struct queue_sysfs_entry queue_nonrot_entry = {
>> .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
>> .show = queue_show_nonrot,
>> @@ -422,6 +432,7 @@ static struct attribute *default_attrs[] = {
>> &queue_discard_max_entry.attr,
>> &queue_discard_zeroes_data_entry.attr,
>> &queue_write_same_max_entry.attr,
>> + &queue_flush_idx_entry.attr,
>> &queue_nonrot_entry.attr,
>> &queue_nomerges_entry.attr,
>> &queue_rq_affinity_entry.attr,
>> diff --git a/block/blk.h b/block/blk.h
>> index 43b0361..f4fa503 100644
>> --- a/block/blk.h
>> +++ b/block/blk.h
>> @@ -18,6 +18,7 @@ struct blk_flush_queue {
>> unsigned int flush_queue_delayed:1;
>> unsigned int flush_pending_idx:1;
>> unsigned int flush_running_idx:1;
>> + unsigned int flush_idx;
>> unsigned long flush_pending_since;
>> struct list_head flush_queue[2];
>> struct list_head flush_data_in_flight;
>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>> index 5546392..6fb7bab 100644
>> --- a/include/linux/blkdev.h
>> +++ b/include/linux/blkdev.h
>> @@ -237,6 +237,7 @@ typedef void (request_fn_proc) (struct request_queue *q);
>> typedef void (make_request_fn) (struct request_queue *q, struct bio *bio);
>> typedef int (prep_rq_fn) (struct request_queue *, struct request *);
>> typedef void (unprep_rq_fn) (struct request_queue *, struct request *);
>> +typedef unsigned (get_flush_idx_fn) (struct request_queue *q, bool queued);
>>
>> struct bio_vec;
>> struct bvec_merge_data {
>> @@ -332,6 +333,7 @@ struct request_queue {
>> rq_timed_out_fn *rq_timed_out_fn;
>> dma_drain_needed_fn *dma_drain_needed;
>> lld_busy_fn *lld_busy_fn;
>> + get_flush_idx_fn *get_flush_idx_fn;
>>
>> struct blk_mq_ops *mq_ops;
>>
>> @@ -1030,6 +1032,7 @@ extern int blk_queue_dma_drain(struct request_queue *q,
>> dma_drain_needed_fn *dma_drain_needed,
>> void *buf, unsigned int size);
>> extern void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn);
>> +extern void blk_queue_get_flush_idx(struct request_queue *q, get_flush_idx_fn *fn);
>> extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
>> extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn);
>> extern void blk_queue_unprep_rq(struct request_queue *, unprep_rq_fn *ufn);
>> @@ -1180,6 +1183,20 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block,
>> nr_blocks << (sb->s_blocksize_bits - 9),
>> gfp_mask);
>> }
>> +extern unsigned get_flush_idx(struct request_queue *q, bool queued);
>> +static inline unsigned blk_get_flush_idx(struct request_queue *q, bool queued)
>> +{
>> + if (unlikely(!q || !q->get_flush_idx_fn))
>> + return 0;
>> +
>> + return q->get_flush_idx_fn(q, queued);
>> +}
>> +static inline bool blk_flush_idx_is_stable(struct request_queue *q, unsigned id)
>> +{
>> + int difference = (blk_get_flush_idx(q, false) - id);
>> +
>> + return (difference > 0);
>> +}
>>
>> extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
>>
>> --
>> 1.7.1
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at http://www.tux.org/lkml/



2014-10-18 09:59:53

by Ming Lei

Subject: Re: [PATCH 1/4] blkdev: add flush generation counter

On Thu, Oct 16, 2014 at 5:14 PM, Dmitry Monakhov <[email protected]> wrote:
> Ming Lei <[email protected]> writes:
>
>> On Tue, Oct 14, 2014 at 12:03 PM, Dmitry Monakhov <[email protected]> wrote:
>>> PROF:
>>> *Flush machinery addumptions
>>> C1. At any given time, only one flush shall be in progress. This is
>>> double buffering sufficient.
>>> C2. Flush is deferred if any request is executing DATA of its ence.
>>> This avoids issuing separate POSTFLUSHes for requests which ed
>>> PREFLUSH.
>>> C3. The second condition is ignored if there is a request which has
>>> waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid
>>> starvation in the unlikely case where there are continuous am of
>>> FUA (without FLUSH) requests.
>>>
>>> So if we will increment flush_tag counter in two places:
>>> blk_kick_flush: the place where flush request is issued
>>> flush_end_io : the place where flush is completed
>>> And by using rule (C1) we can guarantee that:
>>> if (flush_tag & 0x1 == 1) then flush_tag is in progress
>>> if (flush_tag & 0x1 == 0) then (flush_tag & ~(0x1)) completed
>>> In other words we can define it as:
>>>
>>> FLUSH_TAG_IDX = (flush_tag +1) & ~0x1
>>> FLUSH_TAG_STATE = flush_tag & 0x1 ? IN_PROGRESS : COMPLETED
>>>
>>> After that we can define rules for flush optimization:
>>> We can be sure that our data was flushed only if:
>>> 1) data's bio was completed before flush request was QUEUED
>>> and COMPLETED
>>> So in terms of formulas we can write it like follows:
>>> is_data_flushed = (blkdev->flush_tag & ~(0x1)) >
>>> ((data->flush_tag + 0x1) & (~0x1))
>>
>> Looks you are just trying to figure out if the 'data' is flushed or not,
>> so care to explain a bit what the optimization is?
> Indeed I try to understand whenever data was flushed or not.
> inodes on filesystem may update this ID inside ->end_io callback.
> Later if user called fsync/fdatasync we may figure out that inode's
> data was flushed already so we do not have to issue explicit barrier.
> This helps for intensive multi-file dio-aio/fsync workloads
> chunk servers, virt-disk images, mail server, etc.
>
> for (i=0;i<fd_num;i++)
> pwrite(fd[i], buf, 4096, 0)
> for (i=0;i<fd_num;i++)
> fdatasync(fd[i])
> Before this optimization we have to issue a barrier to each fdatasync
> one by one, and send only one when optimization enabled.

Could you share some performance data about the optimization?

The flush command itself shouldn't be very expensive, but flushing the
drive's cache to storage should be. Maybe the following simple
optimization can be done in blk-flush for your dio case (a rough sketch
follows below):

- only issue the flush command to the drive if there are writes completed
since the last flush.

>>>
>>>
>>> In order to support stacked block devices (especially linear dm)
>>> I've implemented get_flush_idx function as queue's callback.
>>>
>>> *Mutli-Queue scalability notes*
>>> This implementation try to makes global optimization for all hw-queues
>>> for a device which require read from each hw-queue like follows:
>>> queue_for_each_hw_ctx(q, hctx, i)
>>> fid += ACCESS_ONCE(hctx->fq->flush_idx)
>>
>> I am wondering if it can work, suppose request A is submitted
>> to hw_queue 0, and request B is submitted to hw_queue 1, then
>> you may thought request A has been flushed out when request
>> B is just flushed via hctx 1.
> Correct, because we know that at the moment A and B was completed
> at least one barrier was issued and completed (even via hwctx 2)
> Flush has blkdev/request_queue-wide guarantee regardless to hwctx.
> If it not the case for MQ then all filesystem are totally broken,
> and blkdev_issue_flush MUST be changed to flush all hwctx.

It is my fault, your approach is correct.


Thanks,
>>
>>>
>>> In my tests I do not see any visiable difference on performance on
>>> my hardware (W2600CR: 8cores x 2 sockets, 2numa nodes).
>>> Really fast HW may prefer to return flush_id for a single hw-queue
>>> in order to do so we have to encode flush_id with hw_queue_id
>>> like follows:
>>> fid_t = (hctx->fq->flush_id << MQ_SHIFT) | hctx->id
>>> #define MQ_ID(fid) (fid & ((1 << MQ_SHIFT) -1))
>>> Later blk_flush_idx_is_stable() can assumes fid_t as unstable if
>>> if was obtained from another hw-queue:
>>>
>>> bool blk_flush_idx_is_stable(struct request_queue *q, fid_t fid)
>>> {
>>> int difference;
>>> fid_t cur = blk_get_flush_idx(q, false);
>>> if (MQ_ID(fid) != MQ_ID(fid))
>>> return 0;
>>>
>>> difference = (blk_get_flush_idx(q, false) - id);
>>> return (difference > 0);
>>>
>>> }
>>> Please let me know if you prefer that design to global one.
>>>
>>> CC: Jens Axboe <[email protected]>
>>> CC: Christoph Hellwig <[email protected]>
>>> CC: Ming Lei <[email protected]>
>>> Signed-off-by: Dmitry Monakhov <[email protected]>
>>> ---
>>> block/blk-core.c | 1 +
>>> block/blk-flush.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++-
>>> block/blk-mq.c | 5 ++-
>>> block/blk-settings.c | 6 +++++
>>> block/blk-sysfs.c | 11 ++++++++++
>>> block/blk.h | 1 +
>>> include/linux/blkdev.h | 17 ++++++++++++++++
>>> 7 files changed, 88 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>> index ffcb47a..78c7e64 100644
>>> --- a/block/blk-core.c
>>> +++ b/block/blk-core.c
>>> @@ -714,6 +714,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
>>> q->request_fn = rfn;
>>> q->prep_rq_fn = NULL;
>>> q->unprep_rq_fn = NULL;
>>> + q->get_flush_idx_fn = get_flush_idx;
>>> q->queue_flags |= QUEUE_FLAG_DEFAULT;
>>>
>>> /* Override internal queue lock with supplied lock pointer */
>>> diff --git a/block/blk-flush.c b/block/blk-flush.c
>>> index 20badd7..e264af8 100644
>>> --- a/block/blk-flush.c
>>> +++ b/block/blk-flush.c
>>> @@ -147,6 +147,46 @@ static bool blk_flush_queue_rq(struct request *rq, bool add_front)
>>> }
>>>
>>> /**
>>> + * get_flush_idx - return stable flush epoch index for a given queue
>>> + * @q: request_queue
>>> + * @queued: return index of latests queued flush
>>> + *
>>> + * Any bio which was completed before some flush request was QUEUED
>>> + * and COMPLETED is already in permanent store. Upper layer may use this
>>> + * feature and skip explicit flush if already does that
>>> + *
>>> + * fq->flush_idx counter incremented in two places:
>>> + * 1)blk_kick_flush: the place where flush request is issued
>>> + * 2)flush_end_io : the place where flush is completed
>>> + * And by using rule (C1) we can guarantee that:
>>> + * if (fq->flush_idx & 0x1 == 1) then flush_idx is in QUEUED
>>> + * if (fq->flush_idx & 0x1 == 0) then (flush_idx & ~(0x1)) COMPLETED
>>> + *
>>> + * In other words we can define it as:
>>> + * FLUSH_IDX = (flush_idx +1) & ~0x1
>>> + * FLUSH_STATE = flush_idx & 0x1 ? IN_PROGRESS : COMPLETED
>>> + *
>>> + */
>>> +unsigned get_flush_idx(struct request_queue *q, bool queued)
>>> +{
>>> + struct blk_mq_hw_ctx *hctx;
>>> + unsigned int i;
>>> + unsigned fid = 0;
>>> + unsigned diff = 0;
>>> +
>>> + if (queued)
>>> + diff = 0x1;
>>> + if (!q->mq_ops) {
>>> + fid = (ACCESS_ONCE(q->fq->flush_idx) + diff) >> 1;
>>> + } else {
>>> + queue_for_each_hw_ctx(q, hctx, i)
>>> + fid += (ACCESS_ONCE(hctx->fq->flush_idx) + diff) >> 1;
>>> + }
>>> + return fid;
>>> +}
>>> +EXPORT_SYMBOL(get_flush_idx);
>>> +
>>> +/**
>>> * blk_flush_complete_seq - complete flush sequence
>>> * @rq: FLUSH/FUA request being sequenced
>>> * @fq: flush queue
>>> @@ -232,9 +272,15 @@ static void flush_end_io(struct request *flush_rq, int error)
>>>
>>> running = &fq->flush_queue[fq->flush_running_idx];
>>> BUG_ON(fq->flush_pending_idx == fq->flush_running_idx);
>>> -
>>> + BUG_ON(!(fq->flush_idx & 0x1));
>>> /* account completion of the flush request */
>>> fq->flush_running_idx ^= 1;
>>> + /* In case of error we have to restore the original flush_idx
>>> + * which was incremented in blk_kick_flush */
>>> + if (unlikely(error))
>>> + fq->flush_idx--;
>>> + else
>>> + fq->flush_idx++;
>>>
>>> if (!q->mq_ops)
>>> elv_completed_request(q, flush_rq);
>>> @@ -303,6 +349,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
>>> * different from running_idx, which means flush is in flight.
>>> */
>>> fq->flush_pending_idx ^= 1;
>>> + fq->flush_idx++;
>>>
>>> blk_rq_init(q, flush_rq);
>>>
>>> @@ -509,6 +556,7 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
>>> INIT_LIST_HEAD(&fq->flush_queue[0]);
>>> INIT_LIST_HEAD(&fq->flush_queue[1]);
>>> INIT_LIST_HEAD(&fq->flush_data_in_flight);
>>> + fq->flush_idx = 0;
>>>
>>> return fq;
>>>
>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>> index 4e7a314..3b60daf 100644
>>> --- a/block/blk-mq.c
>>> +++ b/block/blk-mq.c
>>> @@ -1624,9 +1624,10 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
>>> break;
>>> }
>>>
>>> - if (i == q->nr_hw_queues)
>>> + if (i == q->nr_hw_queues) {
>>> + q->get_flush_idx_fn = get_flush_idx;
>>> return 0;
>>> -
>>> + }
>>> /*
>>> * Init failed
>>> */
>>> diff --git a/block/blk-settings.c b/block/blk-settings.c
>>> index aa02247..1b192dc 100644
>>> --- a/block/blk-settings.c
>>> +++ b/block/blk-settings.c
>>> @@ -99,6 +99,12 @@ void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn)
>>> }
>>> EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
>>>
>>> +void blk_queue_get_flush_idx(struct request_queue *q, get_flush_idx_fn *fn)
>>> +{
>>> + q->get_flush_idx_fn = fn;
>>> +}
>>> +EXPORT_SYMBOL_GPL(blk_queue_get_flush_idx);
>>> +
>>> /**
>>> * blk_set_default_limits - reset limits to default values
>>> * @lim: the queue_limits structure to reset
>>> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
>>> index e8f38a3..533fc0c 100644
>>> --- a/block/blk-sysfs.c
>>> +++ b/block/blk-sysfs.c
>>> @@ -94,6 +94,11 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count)
>>> return ret;
>>> }
>>>
>>> +static ssize_t queue_flush_idx_show(struct request_queue *q, char *page)
>>> +{
>>> + return sprintf(page, "%u\n", blk_get_flush_idx(q, false));
>>> +}
>>> +
>>> static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
>>> {
>>> int max_sectors_kb = queue_max_sectors(q) >> 1;
>>> @@ -374,6 +379,11 @@ static struct queue_sysfs_entry queue_write_same_max_entry = {
>>> .show = queue_write_same_max_show,
>>> };
>>>
>>> +static struct queue_sysfs_entry queue_flush_idx_entry = {
>>> + .attr = {.name = "flush_index", .mode = S_IRUGO },
>>> + .show = queue_flush_idx_show,
>>> +};
>>> +
>>> static struct queue_sysfs_entry queue_nonrot_entry = {
>>> .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
>>> .show = queue_show_nonrot,
>>> @@ -422,6 +432,7 @@ static struct attribute *default_attrs[] = {
>>> &queue_discard_max_entry.attr,
>>> &queue_discard_zeroes_data_entry.attr,
>>> &queue_write_same_max_entry.attr,
>>> + &queue_flush_idx_entry.attr,
>>> &queue_nonrot_entry.attr,
>>> &queue_nomerges_entry.attr,
>>> &queue_rq_affinity_entry.attr,
>>> diff --git a/block/blk.h b/block/blk.h
>>> index 43b0361..f4fa503 100644
>>> --- a/block/blk.h
>>> +++ b/block/blk.h
>>> @@ -18,6 +18,7 @@ struct blk_flush_queue {
>>> unsigned int flush_queue_delayed:1;
>>> unsigned int flush_pending_idx:1;
>>> unsigned int flush_running_idx:1;
>>> + unsigned int flush_idx;
>>> unsigned long flush_pending_since;
>>> struct list_head flush_queue[2];
>>> struct list_head flush_data_in_flight;
>>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>>> index 5546392..6fb7bab 100644
>>> --- a/include/linux/blkdev.h
>>> +++ b/include/linux/blkdev.h
>>> @@ -237,6 +237,7 @@ typedef void (request_fn_proc) (struct request_queue *q);
>>> typedef void (make_request_fn) (struct request_queue *q, struct bio *bio);
>>> typedef int (prep_rq_fn) (struct request_queue *, struct request *);
>>> typedef void (unprep_rq_fn) (struct request_queue *, struct request *);
>>> +typedef unsigned (get_flush_idx_fn) (struct request_queue *q, bool queued);
>>>
>>> struct bio_vec;
>>> struct bvec_merge_data {
>>> @@ -332,6 +333,7 @@ struct request_queue {
>>> rq_timed_out_fn *rq_timed_out_fn;
>>> dma_drain_needed_fn *dma_drain_needed;
>>> lld_busy_fn *lld_busy_fn;
>>> + get_flush_idx_fn *get_flush_idx_fn;
>>>
>>> struct blk_mq_ops *mq_ops;
>>>
>>> @@ -1030,6 +1032,7 @@ extern int blk_queue_dma_drain(struct request_queue *q,
>>> dma_drain_needed_fn *dma_drain_needed,
>>> void *buf, unsigned int size);
>>> extern void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn);
>>> +extern void blk_queue_get_flush_idx(struct request_queue *q, get_flush_idx_fn *fn);
>>> extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
>>> extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn);
>>> extern void blk_queue_unprep_rq(struct request_queue *, unprep_rq_fn *ufn);
>>> @@ -1180,6 +1183,20 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block,
>>> nr_blocks << (sb->s_blocksize_bits - 9),
>>> gfp_mask);
>>> }
>>> +extern unsigned get_flush_idx(struct request_queue *q, bool queued);
>>> +static inline unsigned blk_get_flush_idx(struct request_queue *q, bool queued)
>>> +{
>>> + if (unlikely(!q || !q->get_flush_idx_fn))
>>> + return 0;
>>> +
>>> + return q->get_flush_idx_fn(q, queued);
>>> +}
>>> +static inline bool blk_flush_idx_is_stable(struct request_queue *q, unsigned id)
>>> +{
>>> + int difference = (blk_get_flush_idx(q, false) - id);
>>> +
>>> + return (difference > 0);
>>> +}
>>>
>>> extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
>>>
>>> --
>>> 1.7.1
>>>

2014-10-18 11:52:46

by Dmitry Monakhov

[permalink] [raw]
Subject: Re: [PATCH 1/4] blkdev: add flush generation counter

Ming Lei <[email protected]> writes:

> On Thu, Oct 16, 2014 at 5:14 PM, Dmitry Monakhov <[email protected]> wrote:
>> Ming Lei <[email protected]> writes:
>>
>>> On Tue, Oct 14, 2014 at 12:03 PM, Dmitry Monakhov <[email protected]> wrote:
>>>> PROOF:
>>>> *Flush machinery assumptions*
>>>> C1. At any given time, only one flush shall be in progress. This makes
>>>> double buffering sufficient.
>>>> C2. Flush is deferred if any request is executing DATA of its sequence.
>>>> This avoids issuing separate POSTFLUSHes for requests which shared
>>>> PREFLUSH.
>>>> C3. The second condition is ignored if there is a request which has
>>>> waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid
>>>> starvation in the unlikely case where there is a continuous stream of
>>>> FUA (without FLUSH) requests.
>>>>
>>>> So if we increment the flush_tag counter in two places:
>>>> blk_kick_flush: the place where flush request is issued
>>>> flush_end_io : the place where flush is completed
>>>> And by using rule (C1) we can guarantee that:
>>>> if (flush_tag & 0x1 == 1) then flush_tag is in progress
>>>> if (flush_tag & 0x1 == 0) then (flush_tag & ~(0x1)) completed
>>>> In other words we can define it as:
>>>>
>>>> FLUSH_TAG_IDX = (flush_tag +1) & ~0x1
>>>> FLUSH_TAG_STATE = flush_tag & 0x1 ? IN_PROGRESS : COMPLETED
>>>>
>>>> After that we can define rules for flush optimization:
>>>> We can be sure that our data was flushed only if:
>>>> 1) data's bio was completed before flush request was QUEUED
>>>> and COMPLETED
>>>> So in terms of formulas we can write it as follows:
>>>> is_data_flushed = (blkdev->flush_tag & ~(0x1)) >
>>>> ((data->flush_tag + 0x1) & (~0x1))
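
To make the arithmetic concrete, here is a tiny userspace model of the
parity scheme above; every name in it is illustrative only and is not
part of the patch.

/*
 * Tiny userspace model of the parity scheme above.  flush_tag is bumped
 * once when a flush is issued and once when it completes, so the low bit
 * encodes the state and the remaining bits identify the epoch.
 */
#include <stdbool.h>
#include <stdio.h>

#define FLUSH_TAG_IDX(tag)          (((tag) + 1) & ~0x1u)
#define FLUSH_TAG_IN_PROGRESS(tag)  ((tag) & 0x1u)

/* Data whose bio completed while the counter was 'data_tag' is on stable
 * media once a flush queued after that point has also completed. */
static bool is_data_flushed(unsigned blkdev_tag, unsigned data_tag)
{
        return (blkdev_tag & ~0x1u) > ((data_tag + 0x1u) & ~0x1u);
}

int main(void)
{
        unsigned tag = 0;        /* device-wide flush_tag                 */
        unsigned data_tag = tag; /* sampled when our data bio completed   */

        tag++;                   /* blk_kick_flush(): flush issued -> odd */
        printf("flush in progress: %u\n", FLUSH_TAG_IN_PROGRESS(tag));
        tag++;                   /* flush_end_io(): flush done -> even    */
        printf("epoch %u completed, our data flushed: %d\n",
               FLUSH_TAG_IDX(tag), is_data_flushed(tag, data_tag));
        return 0;
}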
>>>
>>> It looks like you are just trying to figure out whether the 'data' has been
>>> flushed or not, so could you explain a bit what the optimization is?
>> Indeed, I try to determine whether the data was flushed or not.
>> Inodes on the filesystem may update this ID inside the ->end_io callback.
>> Later, if userspace calls fsync/fdatasync, we may figure out that the inode's
>> data was already flushed, so we do not have to issue an explicit barrier.
>> This helps intensive multi-file dio-aio/fsync workloads:
>> chunk servers, virt-disk images, mail servers, etc.
>>
>> for (i=0;i<fd_num;i++)
>> pwrite(fd[i], buf, 4096, 0)
>> for (i=0;i<fd_num;i++)
>> fdatasync(fd[i])
>> Before this optimization we had to issue a barrier for each fdatasync
>> one by one; with the optimization enabled only one is sent.
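
For reference, a compilable version of that pattern could look like the
sketch below (paths and the file count are arbitrary, matching the
/mnt/{1..32} timing run further down):

/*
 * Compilable version of the pattern above: write many files, then
 * fdatasync() each one.  With the flush epoch counter the later
 * fdatasync() calls can piggyback on the first real cache flush.
 */
#include <sys/types.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define FD_NUM 32

int main(void)
{
        char buf[4096], name[32];
        int fd[FD_NUM], i;

        memset(buf, 0x1, sizeof(buf));
        for (i = 0; i < FD_NUM; i++) {
                snprintf(name, sizeof(name), "/mnt/%d", i + 1);
                fd[i] = open(name, O_CREAT | O_WRONLY, 0644);
                if (fd[i] < 0) {
                        perror("open");
                        return 1;
                }
        }
        for (i = 0; i < FD_NUM; i++)
                if (pwrite(fd[i], buf, sizeof(buf), 0) != (ssize_t)sizeof(buf))
                        perror("pwrite");
        for (i = 0; i < FD_NUM; i++)
                if (fdatasync(fd[i]))
                        perror("fdatasync");
        for (i = 0; i < FD_NUM; i++)
                close(fd[i]);
        return 0;
}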
>
> Could you share some performance data about the optimization?
I'm in the middle of a comparison run with the new version of the patch;
I'll post numbers with the new version.
But here are quick timings from a simulation of a "chunk server":
time xfs_io -c "pwrite -S 0x1 -b4096 0 4096000" \
-c "sync_range -w 0 4096000" \
-c "sync_range -a 0 4096000" \
-c "fdatasync" -f /mnt/{1..32} > /dev/null

| time_out | w/o patch | with patch |
|----------+-----------+------------|
| real     | 0m2.448s  | 0m1.547s   |
| user     | 0m0.003s  | 0m0.010s   |
| sys      | 0m0.287s  | 0m0.262s   |

> The flush command itself shouldn't be very expensive, but flushing the
> drive's cache to stable storage should be. Maybe the following simple
> optimization can be done in blk-flush for your dio case:
Yes. But the filesystem is alive, so usually there are other tasks which
also do IO, so a global flag will not help that much. Imagine several
virtual images hosted on the same fs. So per-inode granularity is a good
thing to have. I am not trying to implement "the silver bullet"; I just
want to optimize the things which can be optimized without breaking others.
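
As a rough sketch of what the per-inode scheme looks like to a filesystem
(blk_get_flush_idx()/blk_flush_idx_is_stable() come from this patch, while
the structure and the two functions below are invented for illustration):

#include <linux/blkdev.h>
#include <linux/fs.h>

struct demo_inode_info {
        struct inode    vfs_inode;
        unsigned int    i_flush_idx;    /* flush epoch seen at IO completion */
};

/* Called from the filesystem's ->end_io path once data reached the device:
 * remember the index of the latest *queued* flush (the conservative side). */
static void demo_record_flush_idx(struct demo_inode_info *di)
{
        struct request_queue *q = bdev_get_queue(di->vfs_inode.i_sb->s_bdev);

        if (q)
                di->i_flush_idx = blk_get_flush_idx(q, true);
}

/* Called from ->fsync(): if a full flush was queued and completed after our
 * writes, the explicit cache flush can be skipped. */
static bool demo_flush_already_done(struct demo_inode_info *di)
{
        struct request_queue *q = bdev_get_queue(di->vfs_inode.i_sb->s_bdev);

        return blk_flush_idx_is_stable(q, di->i_flush_idx);
}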
>
> - only issue the flush command to the drive if writes have completed since
> the last flush.
>
>>>>
>>>>
>>>> In order to support stacked block devices (especially linear dm)
>>>> I've implemented the get_flush_idx function as a queue callback.
>>>>
>>>> *Multi-Queue scalability notes*
>>>> This implementation tries to make a global optimization across all
>>>> hw-queues of a device, which requires reading from each hw-queue as follows:
>>>> queue_for_each_hw_ctx(q, hctx, i)
>>>> fid += ACCESS_ONCE(hctx->fq->flush_idx)
>>>
>>> I am wondering if it can work: suppose request A is submitted
>>> to hw_queue 0, and request B is submitted to hw_queue 1; then
>>> you may think request A has been flushed out when request
>>> B has just been flushed via hctx 1.
>> Correct, because we know that by the moment A and B were completed
>> at least one barrier was issued and completed (even via hwctx 2).
>> Flush has a blkdev/request_queue-wide guarantee regardless of hwctx.
>> If that is not the case for MQ then all filesystems are totally broken,
>> and blkdev_issue_flush MUST be changed to flush all hwctx.
>
> It is my fault, your approach is correct.
>
>
> Thanks,
>>>
>>>>
>>>> In my tests I do not see any visible difference in performance on
>>>> my hardware (W2600CR: 8 cores x 2 sockets, 2 NUMA nodes).
>>>> Really fast HW may prefer to return a flush_id for a single hw-queue;
>>>> in order to do so we have to encode flush_id with hw_queue_id
>>>> as follows:
>>>> fid_t = (hctx->fq->flush_id << MQ_SHIFT) | hctx->id
>>>> #define MQ_ID(fid) (fid & ((1 << MQ_SHIFT) -1))
>>>> Later blk_flush_idx_is_stable() can treat a fid_t as unstable if
>>>> it was obtained from another hw-queue:
>>>>
>>>> bool blk_flush_idx_is_stable(struct request_queue *q, fid_t fid)
>>>> {
>>>> int difference;
>>>> fid_t cur = blk_get_flush_idx(q, false);
>>>> if (MQ_ID(fid) != MQ_ID(cur))
>>>> return 0;
>>>>
>>>> difference = (int)(cur - fid);
>>>> return (difference > 0);
>>>>
>>>> }
>>>> Please let me know if you prefer that design to the global one.
>>>>
>>>> CC: Jens Axboe <[email protected]>
>>>> CC: Christoph Hellwig <[email protected]>
>>>> CC: Ming Lei <[email protected]>
>>>> Signed-off-by: Dmitry Monakhov <[email protected]>
>>>> ---
>>>> block/blk-core.c | 1 +
>>>> block/blk-flush.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++-
>>>> block/blk-mq.c | 5 ++-
>>>> block/blk-settings.c | 6 +++++
>>>> block/blk-sysfs.c | 11 ++++++++++
>>>> block/blk.h | 1 +
>>>> include/linux/blkdev.h | 17 ++++++++++++++++
>>>> 7 files changed, 88 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>>> index ffcb47a..78c7e64 100644
>>>> --- a/block/blk-core.c
>>>> +++ b/block/blk-core.c
>>>> @@ -714,6 +714,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
>>>> q->request_fn = rfn;
>>>> q->prep_rq_fn = NULL;
>>>> q->unprep_rq_fn = NULL;
>>>> + q->get_flush_idx_fn = get_flush_idx;
>>>> q->queue_flags |= QUEUE_FLAG_DEFAULT;
>>>>
>>>> /* Override internal queue lock with supplied lock pointer */
>>>> diff --git a/block/blk-flush.c b/block/blk-flush.c
>>>> index 20badd7..e264af8 100644
>>>> --- a/block/blk-flush.c
>>>> +++ b/block/blk-flush.c
>>>> @@ -147,6 +147,46 @@ static bool blk_flush_queue_rq(struct request *rq, bool add_front)
>>>> }
>>>>
>>>> /**
>>>> + * get_flush_idx - return stable flush epoch index for a given queue
>>>> + * @q: request_queue
>>>> + * @queued: return index of the latest queued flush
>>>> + *
>>>> + * Any bio which was completed before some flush request was QUEUED
>>>> + * and COMPLETED is already on permanent storage. Upper layers may use this
>>>> + * feature and skip an explicit flush if one has already been issued.
>>>> + *
>>>> + * The fq->flush_idx counter is incremented in two places:
>>>> + * 1)blk_kick_flush: the place where flush request is issued
>>>> + * 2)flush_end_io : the place where flush is completed
>>>> + * And by using rule (C1) we can guarantee that:
>>>> + * if (fq->flush_idx & 0x1 == 1) then flush_idx is in QUEUED
>>>> + * if (fq->flush_idx & 0x1 == 0) then (flush_idx & ~(0x1)) COMPLETED
>>>> + *
>>>> + * In other words we can define it as:
>>>> + * FLUSH_IDX = (flush_idx +1) & ~0x1
>>>> + * FLUSH_STATE = flush_idx & 0x1 ? IN_PROGRESS : COMPLETED
>>>> + *
>>>> + */
>>>> +unsigned get_flush_idx(struct request_queue *q, bool queued)
>>>> +{
>>>> + struct blk_mq_hw_ctx *hctx;
>>>> + unsigned int i;
>>>> + unsigned fid = 0;
>>>> + unsigned diff = 0;
>>>> +
>>>> + if (queued)
>>>> + diff = 0x1;
>>>> + if (!q->mq_ops) {
>>>> + fid = (ACCESS_ONCE(q->fq->flush_idx) + diff) >> 1;
>>>> + } else {
>>>> + queue_for_each_hw_ctx(q, hctx, i)
>>>> + fid += (ACCESS_ONCE(hctx->fq->flush_idx) + diff) >> 1;
>>>> + }
>>>> + return fid;
>>>> +}
>>>> +EXPORT_SYMBOL(get_flush_idx);
>>>> +
>>>> +/**
>>>> * blk_flush_complete_seq - complete flush sequence
>>>> * @rq: FLUSH/FUA request being sequenced
>>>> * @fq: flush queue
>>>> @@ -232,9 +272,15 @@ static void flush_end_io(struct request *flush_rq, int error)
>>>>
>>>> running = &fq->flush_queue[fq->flush_running_idx];
>>>> BUG_ON(fq->flush_pending_idx == fq->flush_running_idx);
>>>> -
>>>> + BUG_ON(!(fq->flush_idx & 0x1));
>>>> /* account completion of the flush request */
>>>> fq->flush_running_idx ^= 1;
>>>> + /* In case of error we have to restore the original flush_idx
>>>> + * which was incremented in blk_kick_flush */
>>>> + if (unlikely(error))
>>>> + fq->flush_idx--;
>>>> + else
>>>> + fq->flush_idx++;
>>>>
>>>> if (!q->mq_ops)
>>>> elv_completed_request(q, flush_rq);
>>>> @@ -303,6 +349,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
>>>> * different from running_idx, which means flush is in flight.
>>>> */
>>>> fq->flush_pending_idx ^= 1;
>>>> + fq->flush_idx++;
>>>>
>>>> blk_rq_init(q, flush_rq);
>>>>
>>>> @@ -509,6 +556,7 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
>>>> INIT_LIST_HEAD(&fq->flush_queue[0]);
>>>> INIT_LIST_HEAD(&fq->flush_queue[1]);
>>>> INIT_LIST_HEAD(&fq->flush_data_in_flight);
>>>> + fq->flush_idx = 0;
>>>>
>>>> return fq;
>>>>
>>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>>> index 4e7a314..3b60daf 100644
>>>> --- a/block/blk-mq.c
>>>> +++ b/block/blk-mq.c
>>>> @@ -1624,9 +1624,10 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
>>>> break;
>>>> }
>>>>
>>>> - if (i == q->nr_hw_queues)
>>>> + if (i == q->nr_hw_queues) {
>>>> + q->get_flush_idx_fn = get_flush_idx;
>>>> return 0;
>>>> -
>>>> + }
>>>> /*
>>>> * Init failed
>>>> */
>>>> diff --git a/block/blk-settings.c b/block/blk-settings.c
>>>> index aa02247..1b192dc 100644
>>>> --- a/block/blk-settings.c
>>>> +++ b/block/blk-settings.c
>>>> @@ -99,6 +99,12 @@ void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn)
>>>> }
>>>> EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
>>>>
>>>> +void blk_queue_get_flush_idx(struct request_queue *q, get_flush_idx_fn *fn)
>>>> +{
>>>> + q->get_flush_idx_fn = fn;
>>>> +}
>>>> +EXPORT_SYMBOL_GPL(blk_queue_get_flush_idx);
>>>> +
>>>> /**
>>>> * blk_set_default_limits - reset limits to default values
>>>> * @lim: the queue_limits structure to reset
>>>> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
>>>> index e8f38a3..533fc0c 100644
>>>> --- a/block/blk-sysfs.c
>>>> +++ b/block/blk-sysfs.c
>>>> @@ -94,6 +94,11 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count)
>>>> return ret;
>>>> }
>>>>
>>>> +static ssize_t queue_flush_idx_show(struct request_queue *q, char *page)
>>>> +{
>>>> + return sprintf(page, "%u\n", blk_get_flush_idx(q, false));
>>>> +}
>>>> +
>>>> static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
>>>> {
>>>> int max_sectors_kb = queue_max_sectors(q) >> 1;
>>>> @@ -374,6 +379,11 @@ static struct queue_sysfs_entry queue_write_same_max_entry = {
>>>> .show = queue_write_same_max_show,
>>>> };
>>>>
>>>> +static struct queue_sysfs_entry queue_flush_idx_entry = {
>>>> + .attr = {.name = "flush_index", .mode = S_IRUGO },
>>>> + .show = queue_flush_idx_show,
>>>> +};
>>>> +
>>>> static struct queue_sysfs_entry queue_nonrot_entry = {
>>>> .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
>>>> .show = queue_show_nonrot,
>>>> @@ -422,6 +432,7 @@ static struct attribute *default_attrs[] = {
>>>> &queue_discard_max_entry.attr,
>>>> &queue_discard_zeroes_data_entry.attr,
>>>> &queue_write_same_max_entry.attr,
>>>> + &queue_flush_idx_entry.attr,
>>>> &queue_nonrot_entry.attr,
>>>> &queue_nomerges_entry.attr,
>>>> &queue_rq_affinity_entry.attr,
>>>> diff --git a/block/blk.h b/block/blk.h
>>>> index 43b0361..f4fa503 100644
>>>> --- a/block/blk.h
>>>> +++ b/block/blk.h
>>>> @@ -18,6 +18,7 @@ struct blk_flush_queue {
>>>> unsigned int flush_queue_delayed:1;
>>>> unsigned int flush_pending_idx:1;
>>>> unsigned int flush_running_idx:1;
>>>> + unsigned int flush_idx;
>>>> unsigned long flush_pending_since;
>>>> struct list_head flush_queue[2];
>>>> struct list_head flush_data_in_flight;
>>>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>>>> index 5546392..6fb7bab 100644
>>>> --- a/include/linux/blkdev.h
>>>> +++ b/include/linux/blkdev.h
>>>> @@ -237,6 +237,7 @@ typedef void (request_fn_proc) (struct request_queue *q);
>>>> typedef void (make_request_fn) (struct request_queue *q, struct bio *bio);
>>>> typedef int (prep_rq_fn) (struct request_queue *, struct request *);
>>>> typedef void (unprep_rq_fn) (struct request_queue *, struct request *);
>>>> +typedef unsigned (get_flush_idx_fn) (struct request_queue *q, bool queued);
>>>>
>>>> struct bio_vec;
>>>> struct bvec_merge_data {
>>>> @@ -332,6 +333,7 @@ struct request_queue {
>>>> rq_timed_out_fn *rq_timed_out_fn;
>>>> dma_drain_needed_fn *dma_drain_needed;
>>>> lld_busy_fn *lld_busy_fn;
>>>> + get_flush_idx_fn *get_flush_idx_fn;
>>>>
>>>> struct blk_mq_ops *mq_ops;
>>>>
>>>> @@ -1030,6 +1032,7 @@ extern int blk_queue_dma_drain(struct request_queue *q,
>>>> dma_drain_needed_fn *dma_drain_needed,
>>>> void *buf, unsigned int size);
>>>> extern void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn);
>>>> +extern void blk_queue_get_flush_idx(struct request_queue *q, get_flush_idx_fn *fn);
>>>> extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
>>>> extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn);
>>>> extern void blk_queue_unprep_rq(struct request_queue *, unprep_rq_fn *ufn);
>>>> @@ -1180,6 +1183,20 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block,
>>>> nr_blocks << (sb->s_blocksize_bits - 9),
>>>> gfp_mask);
>>>> }
>>>> +extern unsigned get_flush_idx(struct request_queue *q, bool queued);
>>>> +static inline unsigned blk_get_flush_idx(struct request_queue *q, bool queued)
>>>> +{
>>>> + if (unlikely(!q || !q->get_flush_idx_fn))
>>>> + return 0;
>>>> +
>>>> + return q->get_flush_idx_fn(q, queued);
>>>> +}
>>>> +static inline bool blk_flush_idx_is_stable(struct request_queue *q, unsigned id)
>>>> +{
>>>> + int difference = (blk_get_flush_idx(q, false) - id);
>>>> +
>>>> + return (difference > 0);
>>>> +}
>>>>
>>>> extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
>>>>
>>>> --
>>>> 1.7.1
>>>>

