2021-07-26 06:10:07

by Pankaj Gupta

Subject: [RFC v2 0/2] virtio-pmem: Asynchronous flush

From: Pankaj Gupta <[email protected]>

Jeff reported a preflush ordering issue with the existing implementation
of virtio pmem preflush. Dan suggested[1] implementing an asynchronous
flush for virtio pmem using a work queue, as done in md/RAID. This patch
series intends to solve the preflush ordering issue and also makes the
flush asynchronous for the submitting thread.

Submitting this patch series for review. Sorry, it took me a long time to
come back to this due to personal reasons.

RFC v1 -> RFC v2
- More testing and bug fixes.

[1] https://marc.info/?l=linux-kernel&m=157446316409937&w=2

Pankaj Gupta (2):
virtio-pmem: Async virtio-pmem flush
pmem: enable pmem_submit_bio for asynchronous flush

drivers/nvdimm/nd_virtio.c | 72 ++++++++++++++++++++++++++++--------
drivers/nvdimm/pmem.c | 17 ++++++---
drivers/nvdimm/virtio_pmem.c | 10 ++++-
drivers/nvdimm/virtio_pmem.h | 14 +++++++
4 files changed, 91 insertions(+), 22 deletions(-)

--
2.25.1


2021-07-26 06:10:57

by Pankaj Gupta

Subject: [RFC v2 1/2] virtio-pmem: Async virtio-pmem flush

From: Pankaj Gupta <[email protected]>

Implement an asynchronous flush for virtio pmem using a work queue
to solve the preflush ordering issue. Also, coalesce flush requests
when a flush is already in progress.

Signed-off-by: Pankaj Gupta <[email protected]>
---
drivers/nvdimm/nd_virtio.c | 72 ++++++++++++++++++++++++++++--------
drivers/nvdimm/virtio_pmem.c | 10 ++++-
drivers/nvdimm/virtio_pmem.h | 14 +++++++
3 files changed, 79 insertions(+), 17 deletions(-)

diff --git a/drivers/nvdimm/nd_virtio.c b/drivers/nvdimm/nd_virtio.c
index 10351d5b49fa..61b655b583be 100644
--- a/drivers/nvdimm/nd_virtio.c
+++ b/drivers/nvdimm/nd_virtio.c
@@ -97,29 +97,69 @@ static int virtio_pmem_flush(struct nd_region *nd_region)
return err;
};

+static void submit_async_flush(struct work_struct *ws);
+
/* The asynchronous flush callback function */
int async_pmem_flush(struct nd_region *nd_region, struct bio *bio)
{
- /*
- * Create child bio for asynchronous flush and chain with
- * parent bio. Otherwise directly call nd_region flush.
+ /* queue asynchronous flush and coalesce the flush requests */
+ struct virtio_device *vdev = nd_region->provider_data;
+ struct virtio_pmem *vpmem = vdev->priv;
+ ktime_t req_start = ktime_get_boottime();
+
+ spin_lock_irq(&vpmem->lock);
+ /* flush requests wait until ongoing flush completes,
+ * hence coalescing all the pending requests.
*/
- if (bio && bio->bi_iter.bi_sector != -1) {
- struct bio *child = bio_alloc(GFP_ATOMIC, 0);
-
- if (!child)
- return -ENOMEM;
- bio_copy_dev(child, bio);
- child->bi_opf = REQ_PREFLUSH;
- child->bi_iter.bi_sector = -1;
- bio_chain(child, bio);
- submit_bio(child);
- return 0;
+ wait_event_lock_irq(vpmem->sb_wait,
+ !vpmem->flush_bio ||
+ ktime_before(req_start, vpmem->prev_flush_start),
+ vpmem->lock);
+ /* new request after previous flush is completed */
+ if (ktime_after(req_start, vpmem->prev_flush_start)) {
+ WARN_ON(vpmem->flush_bio);
+ vpmem->flush_bio = bio;
+ bio = NULL;
+ }
+ spin_unlock_irq(&vpmem->lock);
+
+ if (!bio) {
+ INIT_WORK(&vpmem->flush_work, submit_async_flush);
+ queue_work(vpmem->pmem_wq, &vpmem->flush_work);
+ return 1;
+ }
+
+ /* flush completed in other context while we waited */
+ if (bio && (bio->bi_opf & REQ_PREFLUSH)) {
+ bio->bi_opf &= ~REQ_PREFLUSH;
+ submit_bio(bio);
+ } else if (bio && (bio->bi_opf & REQ_FUA)) {
+ bio->bi_opf &= ~REQ_FUA;
+ bio_endio(bio);
}
- if (virtio_pmem_flush(nd_region))
- return -EIO;

return 0;
};
EXPORT_SYMBOL_GPL(async_pmem_flush);
+
+static void submit_async_flush(struct work_struct *ws)
+{
+ struct virtio_pmem *vpmem = container_of(ws, struct virtio_pmem, flush_work);
+ struct bio *bio = vpmem->flush_bio;
+
+ vpmem->start_flush = ktime_get_boottime();
+ bio->bi_status = errno_to_blk_status(virtio_pmem_flush(vpmem->nd_region));
+ vpmem->prev_flush_start = vpmem->start_flush;
+ vpmem->flush_bio = NULL;
+ wake_up(&vpmem->sb_wait);
+
+ /* Submit parent bio only for PREFLUSH */
+ if (bio && (bio->bi_opf & REQ_PREFLUSH)) {
+ bio->bi_opf &= ~REQ_PREFLUSH;
+ submit_bio(bio);
+ } else if (bio && (bio->bi_opf & REQ_FUA)) {
+ bio->bi_opf &= ~REQ_FUA;
+ bio_endio(bio);
+ }
+}
MODULE_LICENSE("GPL");
diff --git a/drivers/nvdimm/virtio_pmem.c b/drivers/nvdimm/virtio_pmem.c
index 726c7354d465..56780a6140c7 100644
--- a/drivers/nvdimm/virtio_pmem.c
+++ b/drivers/nvdimm/virtio_pmem.c
@@ -24,6 +24,7 @@ static int init_vq(struct virtio_pmem *vpmem)
return PTR_ERR(vpmem->req_vq);

spin_lock_init(&vpmem->pmem_lock);
+ spin_lock_init(&vpmem->lock);
INIT_LIST_HEAD(&vpmem->req_list);

return 0;
@@ -57,7 +58,12 @@ static int virtio_pmem_probe(struct virtio_device *vdev)
dev_err(&vdev->dev, "failed to initialize virtio pmem vq's\n");
goto out_err;
}
-
+ vpmem->pmem_wq = alloc_workqueue("vpmem_wq", WQ_MEM_RECLAIM, 0);
+ if (!vpmem->pmem_wq) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+ init_waitqueue_head(&vpmem->sb_wait);
virtio_cread_le(vpmem->vdev, struct virtio_pmem_config,
start, &vpmem->start);
virtio_cread_le(vpmem->vdev, struct virtio_pmem_config,
@@ -90,10 +96,12 @@ static int virtio_pmem_probe(struct virtio_device *vdev)
goto out_nd;
}
nd_region->provider_data = dev_to_virtio(nd_region->dev.parent->parent);
+ vpmem->nd_region = nd_region;
return 0;
out_nd:
nvdimm_bus_unregister(vpmem->nvdimm_bus);
out_vq:
+ destroy_workqueue(vpmem->pmem_wq);
vdev->config->del_vqs(vdev);
out_err:
return err;
diff --git a/drivers/nvdimm/virtio_pmem.h b/drivers/nvdimm/virtio_pmem.h
index 0dddefe594c4..d9abc8d052b6 100644
--- a/drivers/nvdimm/virtio_pmem.h
+++ b/drivers/nvdimm/virtio_pmem.h
@@ -35,9 +35,23 @@ struct virtio_pmem {
/* Virtio pmem request queue */
struct virtqueue *req_vq;

+ struct bio *flush_bio;
+ /* prev_flush_start is when the last completed flush was started */
+ ktime_t prev_flush_start, start_flush;
+
+ /* work queue for deferred flush */
+ struct work_struct flush_work;
+ struct workqueue_struct *pmem_wq;
+
+ /* Synchronize flush wait queue data */
+ spinlock_t lock;
+ /* for waiting for previous flush to complete */
+ wait_queue_head_t sb_wait;
+
/* nvdimm bus registers virtio pmem device */
struct nvdimm_bus *nvdimm_bus;
struct nvdimm_bus_descriptor nd_desc;
+ struct nd_region *nd_region;

/* List to store deferred work if virtqueue is full */
struct list_head req_list;
--
2.25.1

2021-07-26 06:11:06

by Pankaj Gupta

Subject: [RFC v2 2/2] pmem: Enable pmem_submit_bio for asynchronous flush

From: Pankaj Gupta <[email protected]>

Return from "pmem_submit_bio" when asynchronous flush is in
process in other context.

Signed-off-by: Pankaj Gupta <[email protected]>
---
drivers/nvdimm/pmem.c | 17 ++++++++++++-----
1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 1e0615b8565e..3ca1fa88a5e7 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -201,8 +201,13 @@ static blk_qc_t pmem_submit_bio(struct bio *bio)
struct pmem_device *pmem = bio->bi_bdev->bd_disk->private_data;
struct nd_region *nd_region = to_region(pmem);

- if (bio->bi_opf & REQ_PREFLUSH)
- ret = nvdimm_flush(nd_region, bio);
+ if ((bio->bi_opf & REQ_PREFLUSH) &&
+ nvdimm_flush(nd_region, bio)) {
+
+ /* asynchronous flush completes in other context */
+ if (nd_region->flush)
+ return BLK_QC_T_NONE;
+ }

do_acct = blk_queue_io_stat(bio->bi_bdev->bd_disk->queue);
if (do_acct)
@@ -222,11 +227,13 @@ static blk_qc_t pmem_submit_bio(struct bio *bio)
if (do_acct)
bio_end_io_acct(bio, start);

- if (bio->bi_opf & REQ_FUA)
+ if (bio->bi_opf & REQ_FUA) {
ret = nvdimm_flush(nd_region, bio);

- if (ret)
- bio->bi_status = errno_to_blk_status(ret);
+ /* asynchronous flush completes in other context */
+ if (nd_region->flush)
+ return BLK_QC_T_NONE;
+ }

bio_endio(bio);
return BLK_QC_T_NONE;
--
2.25.1

2021-08-19 11:10:56

by Pankaj Gupta

Subject: Re: [RFC v2 0/2] virtio-pmem: Asynchronous flush

Gentle ping.

>
> Jeff reported a preflush ordering issue with the existing implementation
> of virtio pmem preflush. Dan suggested[1] implementing an asynchronous
> flush for virtio pmem using a work queue, as done in md/RAID. This patch
> series intends to solve the preflush ordering issue and also makes the
> flush asynchronous for the submitting thread.
>
> Submitting this patch series for review. Sorry, it took me a long time to
> come back to this due to personal reasons.
>
> RFC v1 -> RFC v2
> - More testing and bug fixes.
>
> [1] https://marc.info/?l=linux-kernel&m=157446316409937&w=2
>
> Pankaj Gupta (2):
> virtio-pmem: Async virtio-pmem flush
> pmem: enable pmem_submit_bio for asynchronous flush
>
> drivers/nvdimm/nd_virtio.c | 72 ++++++++++++++++++++++++++++--------
> drivers/nvdimm/pmem.c | 17 ++++++---
> drivers/nvdimm/virtio_pmem.c | 10 ++++-
> drivers/nvdimm/virtio_pmem.h | 14 +++++++
> 4 files changed, 91 insertions(+), 22 deletions(-)
>
> --
> 2.25.1
>

2021-08-25 21:01:24

by Dan Williams

Subject: Re: [RFC v2 1/2] virtio-pmem: Async virtio-pmem flush

On Sun, Jul 25, 2021 at 11:09 PM Pankaj Gupta
<[email protected]> wrote:
>
> From: Pankaj Gupta <[email protected]>
>
> Implement an asynchronous flush for virtio pmem using a work queue
> to solve the preflush ordering issue. Also, coalesce flush requests
> when a flush is already in progress.
>
> Signed-off-by: Pankaj Gupta <[email protected]>
> ---
> drivers/nvdimm/nd_virtio.c | 72 ++++++++++++++++++++++++++++--------
> drivers/nvdimm/virtio_pmem.c | 10 ++++-
> drivers/nvdimm/virtio_pmem.h | 14 +++++++
> 3 files changed, 79 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/nvdimm/nd_virtio.c b/drivers/nvdimm/nd_virtio.c
> index 10351d5b49fa..61b655b583be 100644
> --- a/drivers/nvdimm/nd_virtio.c
> +++ b/drivers/nvdimm/nd_virtio.c
> @@ -97,29 +97,69 @@ static int virtio_pmem_flush(struct nd_region *nd_region)
> return err;
> };
>
> +static void submit_async_flush(struct work_struct *ws);
> +
> /* The asynchronous flush callback function */
> int async_pmem_flush(struct nd_region *nd_region, struct bio *bio)
> {
> - /*
> - * Create child bio for asynchronous flush and chain with
> - * parent bio. Otherwise directly call nd_region flush.
> + /* queue asynchronous flush and coalesce the flush requests */
> + struct virtio_device *vdev = nd_region->provider_data;
> + struct virtio_pmem *vpmem = vdev->priv;
> + ktime_t req_start = ktime_get_boottime();
> +
> + spin_lock_irq(&vpmem->lock);
> + /* flush requests wait until ongoing flush completes,
> + * hence coalescing all the pending requests.
> */
> - if (bio && bio->bi_iter.bi_sector != -1) {
> - struct bio *child = bio_alloc(GFP_ATOMIC, 0);
> -
> - if (!child)
> - return -ENOMEM;
> - bio_copy_dev(child, bio);
> - child->bi_opf = REQ_PREFLUSH;
> - child->bi_iter.bi_sector = -1;
> - bio_chain(child, bio);
> - submit_bio(child);
> - return 0;
> + wait_event_lock_irq(vpmem->sb_wait,
> + !vpmem->flush_bio ||
> + ktime_before(req_start, vpmem->prev_flush_start),
> + vpmem->lock);
> + /* new request after previous flush is completed */
> + if (ktime_after(req_start, vpmem->prev_flush_start)) {
> + WARN_ON(vpmem->flush_bio);
> + vpmem->flush_bio = bio;
> + bio = NULL;
> + }

Why the dance with ->prev_flush_start vs just calling queue_work()
again. queue_work() is naturally coalescing in that if the last work
request has not started execution another queue attempt will be
dropped.
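
With that, the submission side could arguably shrink to something like
this (untested sketch, reusing the names from your patch):

	/*
	 * queue_work() returns false when flush_work is already pending,
	 * i.e. the not-yet-started flush will also cover this request.
	 */
	if (!queue_work(vpmem->pmem_wq, &vpmem->flush_work))
		pr_debug("virtio_pmem: flush coalesced with pending work\n");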

> + spin_unlock_irq(&vpmem->lock);
> +
> + if (!bio) {
> + INIT_WORK(&vpmem->flush_work, submit_async_flush);

I expect this only needs to be initialized once at driver init time.
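
i.e. something like this in the probe path (untested; submit_async_flush()
would need to be visible there, e.g. via a small setup helper in
nd_virtio.c):

	init_waitqueue_head(&vpmem->sb_wait);
	/* one-time setup; avoids re-running INIT_WORK() per request */
	INIT_WORK(&vpmem->flush_work, submit_async_flush);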

> + queue_work(vpmem->pmem_wq, &vpmem->flush_work);
> + return 1;
> + }
> +
> + /* flush completed in other context while we waited */
> + if (bio && (bio->bi_opf & REQ_PREFLUSH)) {
> + bio->bi_opf &= ~REQ_PREFLUSH;
> + submit_bio(bio);
> + } else if (bio && (bio->bi_opf & REQ_FUA)) {
> + bio->bi_opf &= ~REQ_FUA;
> + bio_endio(bio);

It's not clear to me how this happens; shouldn't all flush completions
be driven from the work completion?

> }
> - if (virtio_pmem_flush(nd_region))
> - return -EIO;
>
> return 0;
> };
> EXPORT_SYMBOL_GPL(async_pmem_flush);
> +
> +static void submit_async_flush(struct work_struct *ws)
> +{
> + struct virtio_pmem *vpmem = container_of(ws, struct virtio_pmem, flush_work);
> + struct bio *bio = vpmem->flush_bio;
> +
> + vpmem->start_flush = ktime_get_boottime();
> + bio->bi_status = errno_to_blk_status(virtio_pmem_flush(vpmem->nd_region));
> + vpmem->prev_flush_start = vpmem->start_flush;
> + vpmem->flush_bio = NULL;
> + wake_up(&vpmem->sb_wait);
> +
> + /* Submit parent bio only for PREFLUSH */
> + if (bio && (bio->bi_opf & REQ_PREFLUSH)) {
> + bio->bi_opf &= ~REQ_PREFLUSH;
> + submit_bio(bio);
> + } else if (bio && (bio->bi_opf & REQ_FUA)) {
> + bio->bi_opf &= ~REQ_FUA;
> + bio_endio(bio);
> + }

Shouldn't the wait_event_lock_irq() be here rather than in
async_pmem_flush()? That will cause the workqueue to back up and flush
requests to coalesce.

> +}
> MODULE_LICENSE("GPL");
> diff --git a/drivers/nvdimm/virtio_pmem.c b/drivers/nvdimm/virtio_pmem.c
> index 726c7354d465..56780a6140c7 100644
> --- a/drivers/nvdimm/virtio_pmem.c
> +++ b/drivers/nvdimm/virtio_pmem.c
> @@ -24,6 +24,7 @@ static int init_vq(struct virtio_pmem *vpmem)
> return PTR_ERR(vpmem->req_vq);
>
> spin_lock_init(&vpmem->pmem_lock);
> + spin_lock_init(&vpmem->lock);

Why 2 locks?

2021-08-25 21:11:39

by Pankaj Gupta

Subject: Re: [RFC v2 1/2] virtio-pmem: Async virtio-pmem flush

Hi Dan,

Thank you for the review. Please see my reply inline.

> > Implement an asynchronous flush for virtio pmem using a work queue
> > to solve the preflush ordering issue. Also, coalesce flush requests
> > when a flush is already in progress.
> >
> > Signed-off-by: Pankaj Gupta <[email protected]>
> > ---
> > drivers/nvdimm/nd_virtio.c | 72 ++++++++++++++++++++++++++++--------
> > drivers/nvdimm/virtio_pmem.c | 10 ++++-
> > drivers/nvdimm/virtio_pmem.h | 14 +++++++
> > 3 files changed, 79 insertions(+), 17 deletions(-)
> >
> > diff --git a/drivers/nvdimm/nd_virtio.c b/drivers/nvdimm/nd_virtio.c
> > index 10351d5b49fa..61b655b583be 100644
> > --- a/drivers/nvdimm/nd_virtio.c
> > +++ b/drivers/nvdimm/nd_virtio.c
> > @@ -97,29 +97,69 @@ static int virtio_pmem_flush(struct nd_region *nd_region)
> > return err;
> > };
> >
> > +static void submit_async_flush(struct work_struct *ws);
> > +
> > /* The asynchronous flush callback function */
> > int async_pmem_flush(struct nd_region *nd_region, struct bio *bio)
> > {
> > - /*
> > - * Create child bio for asynchronous flush and chain with
> > - * parent bio. Otherwise directly call nd_region flush.
> > + /* queue asynchronous flush and coalesce the flush requests */
> > + struct virtio_device *vdev = nd_region->provider_data;
> > + struct virtio_pmem *vpmem = vdev->priv;
> > + ktime_t req_start = ktime_get_boottime();
> > +
> > + spin_lock_irq(&vpmem->lock);
> > + /* flush requests wait until ongoing flush completes,
> > + * hence coalescing all the pending requests.
> > */
> > - if (bio && bio->bi_iter.bi_sector != -1) {
> > - struct bio *child = bio_alloc(GFP_ATOMIC, 0);
> > -
> > - if (!child)
> > - return -ENOMEM;
> > - bio_copy_dev(child, bio);
> > - child->bi_opf = REQ_PREFLUSH;
> > - child->bi_iter.bi_sector = -1;
> > - bio_chain(child, bio);
> > - submit_bio(child);
> > - return 0;
> > + wait_event_lock_irq(vpmem->sb_wait,
> > + !vpmem->flush_bio ||
> > + ktime_before(req_start, vpmem->prev_flush_start),
> > + vpmem->lock);
> > + /* new request after previous flush is completed */
> > + if (ktime_after(req_start, vpmem->prev_flush_start)) {
> > + WARN_ON(vpmem->flush_bio);
> > + vpmem->flush_bio = bio;
> > + bio = NULL;
> > + }
>
> Why the dance with ->prev_flush_start vs just calling queue_work()
> again. queue_work() is naturally coalescing in that if the last work
> request has not started execution another queue attempt will be
> dropped.

How will the parent flush request know when the corresponding flush is completed?

>
> > + spin_unlock_irq(&vpmem->lock);
> > +
> > + if (!bio) {
> > + INIT_WORK(&vpmem->flush_work, submit_async_flush);
>
> I expect this only needs to be initialized once at driver init time.

yes, will fix this.
>
> > + queue_work(vpmem->pmem_wq, &vpmem->flush_work);
> > + return 1;
> > + }
> > +
> > + /* flush completed in other context while we waited */
> > + if (bio && (bio->bi_opf & REQ_PREFLUSH)) {
> > + bio->bi_opf &= ~REQ_PREFLUSH;
> > + submit_bio(bio);
> > + } else if (bio && (bio->bi_opf & REQ_FUA)) {
> > + bio->bi_opf &= ~REQ_FUA;
> > + bio_endio(bio);
>
> It's not clear to me how this happens; shouldn't all flush completions
> be driven from the work completion?

Requests should progress after being notified by the ongoing flush
completion event.

>
> > }
> > - if (virtio_pmem_flush(nd_region))
> > - return -EIO;
> >
> > return 0;
> > };
> > EXPORT_SYMBOL_GPL(async_pmem_flush);
> > +
> > +static void submit_async_flush(struct work_struct *ws)
> > +{
> > + struct virtio_pmem *vpmem = container_of(ws, struct virtio_pmem, flush_work);
> > + struct bio *bio = vpmem->flush_bio;
> > +
> > + vpmem->start_flush = ktime_get_boottime();
> > + bio->bi_status = errno_to_blk_status(virtio_pmem_flush(vpmem->nd_region));
> > + vpmem->prev_flush_start = vpmem->start_flush;
> > + vpmem->flush_bio = NULL;
> > + wake_up(&vpmem->sb_wait);
> > +
> > + /* Submit parent bio only for PREFLUSH */
> > + if (bio && (bio->bi_opf & REQ_PREFLUSH)) {
> > + bio->bi_opf &= ~REQ_PREFLUSH;
> > + submit_bio(bio);
> > + } else if (bio && (bio->bi_opf & REQ_FUA)) {
> > + bio->bi_opf &= ~REQ_FUA;
> > + bio_endio(bio);
> > + }
>
> Shouldn't the wait_event_lock_irq() be here rather than in
> async_pmem_flush()? That will cause the workqueue to back up and flush
> requests to coalesce.

But this is the coalesced flush request?

> > +}
> > MODULE_LICENSE("GPL");
> > diff --git a/drivers/nvdimm/virtio_pmem.c b/drivers/nvdimm/virtio_pmem.c
> > index 726c7354d465..56780a6140c7 100644
> > --- a/drivers/nvdimm/virtio_pmem.c
> > +++ b/drivers/nvdimm/virtio_pmem.c
> > @@ -24,6 +24,7 @@ static int init_vq(struct virtio_pmem *vpmem)
> > return PTR_ERR(vpmem->req_vq);
> >
> > spin_lock_init(&vpmem->pmem_lock);
> > + spin_lock_init(&vpmem->lock);
>
> Why 2 locks?

One lock is for the work queue and the other for virtio flush completion.

Thanks,
Pankaj

2021-08-25 21:52:14

by Dan Williams

Subject: Re: [RFC v2 1/2] virtio-pmem: Async virtio-pmem flush

On Wed, Aug 25, 2021 at 1:02 PM Pankaj Gupta
<[email protected]> wrote:
>
> Hi Dan,
>
> Thank you for the review. Please see my reply inline.
>
> > > Implement an asynchronous flush for virtio pmem using a work queue
> > > to solve the preflush ordering issue. Also, coalesce flush requests
> > > when a flush is already in progress.
> > >
> > > Signed-off-by: Pankaj Gupta <[email protected]>
> > > ---
> > > drivers/nvdimm/nd_virtio.c | 72 ++++++++++++++++++++++++++++--------
> > > drivers/nvdimm/virtio_pmem.c | 10 ++++-
> > > drivers/nvdimm/virtio_pmem.h | 14 +++++++
> > > 3 files changed, 79 insertions(+), 17 deletions(-)
> > >
> > > diff --git a/drivers/nvdimm/nd_virtio.c b/drivers/nvdimm/nd_virtio.c
> > > index 10351d5b49fa..61b655b583be 100644
> > > --- a/drivers/nvdimm/nd_virtio.c
> > > +++ b/drivers/nvdimm/nd_virtio.c
> > > @@ -97,29 +97,69 @@ static int virtio_pmem_flush(struct nd_region *nd_region)
> > > return err;
> > > };
> > >
> > > +static void submit_async_flush(struct work_struct *ws);
> > > +
> > > /* The asynchronous flush callback function */
> > > int async_pmem_flush(struct nd_region *nd_region, struct bio *bio)
> > > {
> > > - /*
> > > - * Create child bio for asynchronous flush and chain with
> > > - * parent bio. Otherwise directly call nd_region flush.
> > > + /* queue asynchronous flush and coalesce the flush requests */
> > > + struct virtio_device *vdev = nd_region->provider_data;
> > > + struct virtio_pmem *vpmem = vdev->priv;
> > > + ktime_t req_start = ktime_get_boottime();
> > > +
> > > + spin_lock_irq(&vpmem->lock);
> > > + /* flush requests wait until ongoing flush completes,
> > > + * hence coalescing all the pending requests.
> > > */
> > > - if (bio && bio->bi_iter.bi_sector != -1) {
> > > - struct bio *child = bio_alloc(GFP_ATOMIC, 0);
> > > -
> > > - if (!child)
> > > - return -ENOMEM;
> > > - bio_copy_dev(child, bio);
> > > - child->bi_opf = REQ_PREFLUSH;
> > > - child->bi_iter.bi_sector = -1;
> > > - bio_chain(child, bio);
> > > - submit_bio(child);
> > > - return 0;
> > > + wait_event_lock_irq(vpmem->sb_wait,
> > > + !vpmem->flush_bio ||
> > > + ktime_before(req_start, vpmem->prev_flush_start),
> > > + vpmem->lock);
> > > + /* new request after previous flush is completed */
> > > + if (ktime_after(req_start, vpmem->prev_flush_start)) {
> > > + WARN_ON(vpmem->flush_bio);
> > > + vpmem->flush_bio = bio;
> > > + bio = NULL;
> > > + }
> >
> > Why the dance with ->prev_flush_start vs just calling queue_work()
> > again. queue_work() is naturally coalescing in that if the last work
> > request has not started execution another queue attempt will be
> > dropped.
>
> How will the parent flush request know when the corresponding flush is completed?

The eventual bio_endio() is what signals upper layers that the flush
completed...


Hold on... it's been so long that I forgot that you are copying
md_flush_request() here. It would help immensely if that was mentioned
in the changelog, and at a minimum the code should carry a comment
noting that this was copied from md. In fact it would be extra helpful
if you refactored a common helper that bio-based block drivers could
share for implementing flush handling, but that can come later.
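
The rough shape of such a helper, loosely modeled on md_flush_request()
in drivers/md/md.c (name and signature invented here, untested):

	/*
	 * Returns true if @bio was queued behind an in-flight flush and
	 * will be completed by the flush worker; false if the caller
	 * should go ahead and process @bio itself.
	 */
	bool bio_flush_coalesce(struct bio_flush_ctx *ctx, struct bio *bio,
				work_func_t flush_fn);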

Let me go re-review this with respect to whether the md case is fully
applicable here.

2021-08-25 22:06:31

by Pankaj Gupta

Subject: Re: [RFC v2 1/2] virtio-pmem: Async virtio-pmem flush

> > Hi Dan,
> >
> > Thank you for the review. Please see my reply inline.
> >
> > > > Implement an asynchronous flush for virtio pmem using a work queue
> > > > to solve the preflush ordering issue. Also, coalesce flush requests
> > > > when a flush is already in progress.
> > > >
> > > > Signed-off-by: Pankaj Gupta <[email protected]>
> > > > ---
> > > > drivers/nvdimm/nd_virtio.c | 72 ++++++++++++++++++++++++++++--------
> > > > drivers/nvdimm/virtio_pmem.c | 10 ++++-
> > > > drivers/nvdimm/virtio_pmem.h | 14 +++++++
> > > > 3 files changed, 79 insertions(+), 17 deletions(-)
> > > >
> > > > diff --git a/drivers/nvdimm/nd_virtio.c b/drivers/nvdimm/nd_virtio.c
> > > > index 10351d5b49fa..61b655b583be 100644
> > > > --- a/drivers/nvdimm/nd_virtio.c
> > > > +++ b/drivers/nvdimm/nd_virtio.c
> > > > @@ -97,29 +97,69 @@ static int virtio_pmem_flush(struct nd_region *nd_region)
> > > > return err;
> > > > };
> > > >
> > > > +static void submit_async_flush(struct work_struct *ws);
> > > > +
> > > > /* The asynchronous flush callback function */
> > > > int async_pmem_flush(struct nd_region *nd_region, struct bio *bio)
> > > > {
> > > > - /*
> > > > - * Create child bio for asynchronous flush and chain with
> > > > - * parent bio. Otherwise directly call nd_region flush.
> > > > + /* queue asynchronous flush and coalesce the flush requests */
> > > > + struct virtio_device *vdev = nd_region->provider_data;
> > > > + struct virtio_pmem *vpmem = vdev->priv;
> > > > + ktime_t req_start = ktime_get_boottime();
> > > > +
> > > > + spin_lock_irq(&vpmem->lock);
> > > > + /* flush requests wait until ongoing flush completes,
> > > > + * hence coalescing all the pending requests.
> > > > */
> > > > - if (bio && bio->bi_iter.bi_sector != -1) {
> > > > - struct bio *child = bio_alloc(GFP_ATOMIC, 0);
> > > > -
> > > > - if (!child)
> > > > - return -ENOMEM;
> > > > - bio_copy_dev(child, bio);
> > > > - child->bi_opf = REQ_PREFLUSH;
> > > > - child->bi_iter.bi_sector = -1;
> > > > - bio_chain(child, bio);
> > > > - submit_bio(child);
> > > > - return 0;
> > > > + wait_event_lock_irq(vpmem->sb_wait,
> > > > + !vpmem->flush_bio ||
> > > > + ktime_before(req_start, vpmem->prev_flush_start),
> > > > + vpmem->lock);
> > > > + /* new request after previous flush is completed */
> > > > + if (ktime_after(req_start, vpmem->prev_flush_start)) {
> > > > + WARN_ON(vpmem->flush_bio);
> > > > + vpmem->flush_bio = bio;
> > > > + bio = NULL;
> > > > + }
> > >
> > > Why the dance with ->prev_flush_start vs just calling queue_work()
> > > again. queue_work() is naturally coalescing in that if the last work
> > > request has not started execution another queue attempt will be
> > > dropped.
> >
> > How will the parent flush request know when the corresponding flush is completed?
>
> The eventual bio_endio() is what signals upper layers that the flush
> completed...
>
>
> Hold on... it's been so long that I forgot that you are copying
> md_flush_request() here. It would help immensely if that was mentioned
> in the changelog, and at a minimum the code should carry a comment
> noting that this was copied from md. In fact it would be extra helpful
> if you

My bad. I only mentioned this in the cover letter.

> refactored a common helper that bio-based block drivers could share
> for implementing flush handling, but that can come later.

Sure.
>
> Let me go re-review this with respect to whether the md case is fully
> applicable here.

o.k.

Best regards,
Pankaj

2021-08-25 22:12:08

by Dan Williams

Subject: Re: [RFC v2 1/2] virtio-pmem: Async virtio-pmem flush

On Wed, Aug 25, 2021 at 3:01 PM Pankaj Gupta <[email protected]> wrote:
>
> > > Hi Dan,
> > >
> > > Thank you for the review. Please see my reply inline.
> > >
> > > > > Implement an asynchronous flush for virtio pmem using a work queue
> > > > > to solve the preflush ordering issue. Also, coalesce flush requests
> > > > > when a flush is already in progress.
> > > > >
> > > > > Signed-off-by: Pankaj Gupta <[email protected]>
> > > > > ---
> > > > > drivers/nvdimm/nd_virtio.c | 72 ++++++++++++++++++++++++++++--------
> > > > > drivers/nvdimm/virtio_pmem.c | 10 ++++-
> > > > > drivers/nvdimm/virtio_pmem.h | 14 +++++++
> > > > > 3 files changed, 79 insertions(+), 17 deletions(-)
> > > > >
> > > > > diff --git a/drivers/nvdimm/nd_virtio.c b/drivers/nvdimm/nd_virtio.c
> > > > > index 10351d5b49fa..61b655b583be 100644
> > > > > --- a/drivers/nvdimm/nd_virtio.c
> > > > > +++ b/drivers/nvdimm/nd_virtio.c
> > > > > @@ -97,29 +97,69 @@ static int virtio_pmem_flush(struct nd_region *nd_region)
> > > > > return err;
> > > > > };
> > > > >
> > > > > +static void submit_async_flush(struct work_struct *ws);
> > > > > +
> > > > > /* The asynchronous flush callback function */
> > > > > int async_pmem_flush(struct nd_region *nd_region, struct bio *bio)
> > > > > {
> > > > > - /*
> > > > > - * Create child bio for asynchronous flush and chain with
> > > > > - * parent bio. Otherwise directly call nd_region flush.
> > > > > + /* queue asynchronous flush and coalesce the flush requests */
> > > > > + struct virtio_device *vdev = nd_region->provider_data;
> > > > > + struct virtio_pmem *vpmem = vdev->priv;
> > > > > + ktime_t req_start = ktime_get_boottime();
> > > > > +
> > > > > + spin_lock_irq(&vpmem->lock);
> > > > > + /* flush requests wait until ongoing flush completes,
> > > > > + * hence coalescing all the pending requests.
> > > > > */
> > > > > - if (bio && bio->bi_iter.bi_sector != -1) {
> > > > > - struct bio *child = bio_alloc(GFP_ATOMIC, 0);
> > > > > -
> > > > > - if (!child)
> > > > > - return -ENOMEM;
> > > > > - bio_copy_dev(child, bio);
> > > > > - child->bi_opf = REQ_PREFLUSH;
> > > > > - child->bi_iter.bi_sector = -1;
> > > > > - bio_chain(child, bio);
> > > > > - submit_bio(child);
> > > > > - return 0;
> > > > > + wait_event_lock_irq(vpmem->sb_wait,
> > > > > + !vpmem->flush_bio ||
> > > > > + ktime_before(req_start, vpmem->prev_flush_start),
> > > > > + vpmem->lock);
> > > > > + /* new request after previous flush is completed */
> > > > > + if (ktime_after(req_start, vpmem->prev_flush_start)) {
> > > > > + WARN_ON(vpmem->flush_bio);
> > > > > + vpmem->flush_bio = bio;
> > > > > + bio = NULL;
> > > > > + }
> > > >
> > > > Why the dance with ->prev_flush_start vs just calling queue_work()
> > > > again. queue_work() is naturally coalescing in that if the last work
> > > > request has not started execution another queue attempt will be
> > > > dropped.
> > >
> > > How will the parent flush request know when the corresponding flush is completed?
> >
> > The eventual bio_endio() is what signals upper layers that the flush
> > completed...
> >
> >
> > Hold on... it's been so long that I forgot that you are copying
> > md_flush_request() here. It would help immensely if that was mentioned
> > in the changelog, and at a minimum the code should carry a comment
> > noting that this was copied from md. In fact it would be extra helpful
> > if you
>
> My bad. I only mentioned this in the cover letter.

Yeah, sorry about that. Having come back to this after so long, I just
decided to jump straight into the patches, but even if I had read the
cover letter I still would have given the feedback that the
md_flush_request() heritage should also be noted with a comment in the
code.
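
Even something as simple as this above the coalescing logic would do
(sketch):

	/*
	 * This flush-coalescing scheme is adapted from md_flush_request()
	 * in drivers/md/md.c; see that function for the reference
	 * implementation.
	 */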

2021-08-27 12:41:36

by Pankaj Gupta

Subject: Re: [RFC v2 1/2] virtio-pmem: Async virtio-pmem flush

> > > > > > Implement an asynchronous flush for virtio pmem using a work queue
> > > > > > to solve the preflush ordering issue. Also, coalesce flush requests
> > > > > > when a flush is already in progress.
> > > > > >
> > > > > > Signed-off-by: Pankaj Gupta <[email protected]>
> > > > > > ---
> > > > > > drivers/nvdimm/nd_virtio.c | 72 ++++++++++++++++++++++++++++--------
> > > > > > drivers/nvdimm/virtio_pmem.c | 10 ++++-
> > > > > > drivers/nvdimm/virtio_pmem.h | 14 +++++++
> > > > > > 3 files changed, 79 insertions(+), 17 deletions(-)
> > > > > >
> > > > > > diff --git a/drivers/nvdimm/nd_virtio.c b/drivers/nvdimm/nd_virtio.c
> > > > > > index 10351d5b49fa..61b655b583be 100644
> > > > > > --- a/drivers/nvdimm/nd_virtio.c
> > > > > > +++ b/drivers/nvdimm/nd_virtio.c
> > > > > > @@ -97,29 +97,69 @@ static int virtio_pmem_flush(struct nd_region *nd_region)
> > > > > > return err;
> > > > > > };
> > > > > >
> > > > > > +static void submit_async_flush(struct work_struct *ws);
> > > > > > +
> > > > > > /* The asynchronous flush callback function */
> > > > > > int async_pmem_flush(struct nd_region *nd_region, struct bio *bio)
> > > > > > {
> > > > > > - /*
> > > > > > - * Create child bio for asynchronous flush and chain with
> > > > > > - * parent bio. Otherwise directly call nd_region flush.
> > > > > > + /* queue asynchronous flush and coalesce the flush requests */
> > > > > > + struct virtio_device *vdev = nd_region->provider_data;
> > > > > > + struct virtio_pmem *vpmem = vdev->priv;
> > > > > > + ktime_t req_start = ktime_get_boottime();
> > > > > > +
> > > > > > + spin_lock_irq(&vpmem->lock);
> > > > > > + /* flush requests wait until ongoing flush completes,
> > > > > > + * hence coalescing all the pending requests.
> > > > > > */
> > > > > > - if (bio && bio->bi_iter.bi_sector != -1) {
> > > > > > - struct bio *child = bio_alloc(GFP_ATOMIC, 0);
> > > > > > -
> > > > > > - if (!child)
> > > > > > - return -ENOMEM;
> > > > > > - bio_copy_dev(child, bio);
> > > > > > - child->bi_opf = REQ_PREFLUSH;
> > > > > > - child->bi_iter.bi_sector = -1;
> > > > > > - bio_chain(child, bio);
> > > > > > - submit_bio(child);
> > > > > > - return 0;
> > > > > > + wait_event_lock_irq(vpmem->sb_wait,
> > > > > > + !vpmem->flush_bio ||
> > > > > > + ktime_before(req_start, vpmem->prev_flush_start),
> > > > > > + vpmem->lock);
> > > > > > + /* new request after previous flush is completed */
> > > > > > + if (ktime_after(req_start, vpmem->prev_flush_start)) {
> > > > > > + WARN_ON(vpmem->flush_bio);
> > > > > > + vpmem->flush_bio = bio;
> > > > > > + bio = NULL;
> > > > > > + }
> > > > >
> > > > > Why the dance with ->prev_flush_start vs just calling queue_work()
> > > > > again. queue_work() is naturally coalescing in that if the last work
> > > > > request has not started execution another queue attempt will be
> > > > > dropped.
> > > >
> > > > How will the parent flush request know when the corresponding flush is completed?
> > >
> > > The eventual bio_endio() is what signals upper layers that the flush
> > > completed...
> > >
> > >
> > > Hold on... it's been so long that I forgot that you are copying
> > > md_flush_request() here. It would help immensely if that was mentioned
> > > in the changelog, and at a minimum the code should carry a comment
> > > noting that this was copied from md. In fact it would be extra helpful
> > > if you
> >
> > My bad. I only mentioned this in the cover letter.
>
> Yeah, sorry about that. Having come back to this after so long, I just
> decided to jump straight into the patches, but even if I had read the
> cover letter I still would have given the feedback that the
> md_flush_request() heritage should also be noted with a comment in the
> code.

Sure.

Thanks,
Pankaj

2021-10-18 03:31:59

by Pankaj Gupta

Subject: Re: [RFC v2 0/2] virtio-pmem: Asynchronous flush

Friendly ping!

Thanks,
Pankaj

On Thu, 19 Aug 2021 at 13:08, Pankaj Gupta <[email protected]> wrote:
>
> Gentle ping.
>
> >
> > Jeff reported a preflush ordering issue with the existing implementation
> > of virtio pmem preflush. Dan suggested[1] implementing an asynchronous
> > flush for virtio pmem using a work queue, as done in md/RAID. This patch
> > series intends to solve the preflush ordering issue and also makes the
> > flush asynchronous for the submitting thread.
> >
> > Submitting this patch series for review. Sorry, it took me a long time to
> > come back to this due to personal reasons.
> >
> > RFC v1 -> RFC v2
> > - More testing and bug fixes.
> >
> > [1] https://marc.info/?l=linux-kernel&m=157446316409937&w=2
> >
> > Pankaj Gupta (2):
> > virtio-pmem: Async virtio-pmem flush
> > pmem: enable pmem_submit_bio for asynchronous flush
> >
> > drivers/nvdimm/nd_virtio.c | 72 ++++++++++++++++++++++++++++--------
> > drivers/nvdimm/pmem.c | 17 ++++++---
> > drivers/nvdimm/virtio_pmem.c | 10 ++++-
> > drivers/nvdimm/virtio_pmem.h | 14 +++++++
> > 4 files changed, 91 insertions(+), 22 deletions(-)
> >
> > --
> > 2.25.1
> >