2015-05-05 11:51:00

by Ming Lei

Subject: [PATCH 0/2] block: loop: fix stacked loop and performance regression

Hi,

The 1st patch converts to a per-device workqueue because loop
devices can be stacked.

The 2nd patch limits max active works to 16, so that Fedora 22's
boot performance regression can be fixed.

drivers/block/loop.c | 30 ++++++++++++++----------------
drivers/block/loop.h | 1 +
2 files changed, 15 insertions(+), 16 deletions(-)


Thanks,
Ming Lei


2015-05-05 11:52:06

by Ming Lei

Subject: [PATCH 1/2] block: loop: convert to per-device workqueue

Documentation/workqueue.txt:
If there is dependency among multiple work items used
during memory reclaim, they should be queued to separate
wq each with WQ_MEM_RECLAIM.

Loop devices can be stacked, so we have to convert to a per-device
workqueue. One example is the Fedora live CD.
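To see why a single shared workqueue is a problem here, consider a
minimal sketch (mine, not part of the patch; the helpers are
hypothetical) of two stacked loop devices whose work items land on
one WQ_MEM_RECLAIM workqueue:

/*
 * Under memory pressure, a WQ_MEM_RECLAIM workqueue is only
 * guaranteed its single rescuer thread, which executes one
 * work item at a time.
 */
static void upper_loop_work(struct work_struct *work)
{
	/* blocks until the lower loop device services the I/O */
	wait_for_lower_loop_io();	/* hypothetical helper */
}

static void lower_loop_work(struct work_struct *work)
{
	/* services that I/O against the real backing file */
	service_backing_io();		/* hypothetical helper */
}

If the rescuer picks up upper_loop_work first, it blocks forever,
because lower_loop_work can no longer run on the same rescuer:
deadlock. With one workqueue per device, as below, each level of
the stack gets its own rescuer.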

Fixes: b5dd2f6047ca108001328aac0e8588edd15f1778
Cc: [email protected] (v4.0)
Cc: Justin M. Forbes <[email protected]>
Signed-off-by: Ming Lei <[email protected]>
---
drivers/block/loop.c | 30 ++++++++++++++----------------
drivers/block/loop.h | 1 +
2 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index ae3fcb4..3dc1598 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -86,8 +86,6 @@ static DEFINE_MUTEX(loop_index_mutex);
static int max_part;
static int part_shift;

-static struct workqueue_struct *loop_wq;
-
static int transfer_xor(struct loop_device *lo, int cmd,
struct page *raw_page, unsigned raw_off,
struct page *loop_page, unsigned loop_off,
@@ -725,6 +723,12 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
size = get_loop_size(lo, file);
if ((loff_t)(sector_t)size != size)
goto out_putf;
+ error = -ENOMEM;
+ lo->wq = alloc_workqueue("kloopd%d",
+ WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 0,
+ lo->lo_number);
+ if (!lo->wq)
+ goto out_putf;

error = 0;

@@ -872,6 +876,8 @@ static int loop_clr_fd(struct loop_device *lo)
lo->lo_flags = 0;
if (!part_shift)
lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
+ destroy_workqueue(lo->wq);
+ lo->wq = NULL;
mutex_unlock(&lo->lo_ctl_mutex);
/*
* Need not hold lo_ctl_mutex to fput backing file.
@@ -1425,9 +1431,13 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
struct loop_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
+ struct loop_device *lo = cmd->rq->q->queuedata;

blk_mq_start_request(bd->rq);

+ if (lo->lo_state != Lo_bound)
+ return -EIO;
+
if (cmd->rq->cmd_flags & REQ_WRITE) {
struct loop_device *lo = cmd->rq->q->queuedata;
bool need_sched = true;
@@ -1441,9 +1451,9 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
spin_unlock_irq(&lo->lo_lock);

if (need_sched)
- queue_work(loop_wq, &lo->write_work);
+ queue_work(lo->wq, &lo->write_work);
} else {
- queue_work(loop_wq, &cmd->read_work);
+ queue_work(lo->wq, &cmd->read_work);
}

return BLK_MQ_RQ_QUEUE_OK;
@@ -1455,9 +1465,6 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
struct loop_device *lo = cmd->rq->q->queuedata;
int ret = -EIO;

- if (lo->lo_state != Lo_bound)
- goto failed;
-
if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY))
goto failed;

@@ -1806,13 +1813,6 @@ static int __init loop_init(void)
goto misc_out;
}

- loop_wq = alloc_workqueue("kloopd",
- WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 0);
- if (!loop_wq) {
- err = -ENOMEM;
- goto misc_out;
- }
-
blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
THIS_MODULE, loop_probe, NULL, NULL);

@@ -1850,8 +1850,6 @@ static void __exit loop_exit(void)
blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range);
unregister_blkdev(LOOP_MAJOR, "loop");

- destroy_workqueue(loop_wq);
-
misc_deregister(&loop_misc);
}

diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 301c27f..49564ed 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -54,6 +54,7 @@ struct loop_device {
gfp_t old_gfp_mask;

spinlock_t lo_lock;
+ struct workqueue_struct *wq;
struct list_head write_cmd_head;
struct work_struct write_work;
bool write_started;
--
1.9.1

2015-05-05 11:51:52

by Ming Lei

Subject: [PATCH 2/2] block: loop: avoiding too many pending per work I/O

If there is too much pending per-work I/O, too many
high-priority worker threads can be generated, and
system performance can be affected.

This patch limits the workqueue's max_active parameter to 16.

This patch fixes the Fedora 22 live booting performance
regression seen when booting from squashfs over dm on top
of loop; it looks like the following reasons are related
to the problem:

- unlike other filesystems (such as ext4), squashfs is a bit
special: I observed that increasing the number of I/O jobs
accessing files in squashfs improves I/O performance only a
little, though it makes a big difference for ext4

- nested loop: both squashfs.img and ext3fs.img are mounted
as loop block devices, and ext3fs.img lives inside the squashfs

- during booting, lots of tasks may run concurrently

Fixes: b5dd2f6047ca108001328aac0e8588edd15f1778
Cc: [email protected] (v4.0)
Cc: Justin M. Forbes <[email protected]>
Signed-off-by: Ming Lei <[email protected]>
---
drivers/block/loop.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 3dc1598..1bee523 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -725,7 +725,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
goto out_putf;
error = -ENOMEM;
lo->wq = alloc_workqueue("kloopd%d",
- WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 0,
+ WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 16,
lo->lo_number);
if (!lo->wq)
goto out_putf;
--
1.9.1

2015-05-05 14:00:09

by Tejun Heo

Subject: Re: [PATCH 2/2] block: loop: avoiding too many pending per work I/O

On Tue, May 05, 2015 at 07:49:55PM +0800, Ming Lei wrote:
...
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index 3dc1598..1bee523 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -725,7 +725,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
> goto out_putf;
> error = -ENOMEM;
> lo->wq = alloc_workqueue("kloopd%d",
> - WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 0,
> + WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 16,

It's a bit weird to hard code this to 16 as this effectively becomes a
hidden bottleneck for concurrency. For cases where 16 isn't a good
value, hunting down what's going on can be painful as it's not visible
anywhere. I still think the right knob to control concurrency is
nr_requests for the loop device. You said that for linear IOs, it's
better to have higher nr_requests than concurrency but can you
elaborate why?

Thanks.

--
tejun

2015-05-05 14:00:48

by Tejun Heo

Subject: Re: [PATCH 1/2] block: loop: convert to per-device workqueue

On Tue, May 05, 2015 at 07:49:54PM +0800, Ming Lei wrote:
> Documentation/workqueue.txt:
> If there is dependency among multiple work items used
> during memory reclaim, they should be queued to separate
> wq each with WQ_MEM_RECLAIM.
>
> Loop devices can be stacked, so we have to convert to a per-device
> workqueue. One example is the Fedora live CD.
>
> Fixes: b5dd2f6047ca108001328aac0e8588edd15f1778
> Cc: [email protected] (v4.0)
> Cc: Justin M. Forbes <[email protected]>
> Signed-off-by: Ming Lei <[email protected]>

Acked-by: Tejun Heo <[email protected]>

Thanks.

--
tejun

2015-05-05 16:08:05

by Ming Lei

Subject: Re: [PATCH 2/2] block: loop: avoiding too many pending per work I/O

On Tue, May 5, 2015 at 9:59 PM, Tejun Heo <[email protected]> wrote:
> On Tue, May 05, 2015 at 07:49:55PM +0800, Ming Lei wrote:
> ...
>> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
>> index 3dc1598..1bee523 100644
>> --- a/drivers/block/loop.c
>> +++ b/drivers/block/loop.c
>> @@ -725,7 +725,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
>> goto out_putf;
>> error = -ENOMEM;
>> lo->wq = alloc_workqueue("kloopd%d",
>> - WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 0,
>> + WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 16,
>
> It's a bit weird to hard code this to 16 as this effectively becomes a
> hidden bottleneck for concurrency. For cases where 16 isn't a good
> value, hunting down what's going on can be painful as it's not visible
> anywhere. I still think the right knob to control concurrency is
> nr_requests for the loop device. You said that for linear IOs, it's
> better to have higher nr_requests than concurrency but can you
> elaborate why?

I mean that, in the case of sequential I/O, the I/O is more likely to
hit the page cache, so handling it can be quite quick; it is then often
more efficient to handle the requests in one context (for example, one
by one from the I/O queue) than from different contexts (scheduled to
different worker threads). That can be achieved by setting a bigger
nr_requests (queue depth).
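For reference, nr_requests can be tuned from userspace through sysfs;
a tiny sketch (loop0 and the value 256 are just examples):

#include <stdio.h>

int main(void)
{
	/* the block layer exposes the per-device queue depth here */
	FILE *f = fopen("/sys/block/loop0/queue/nr_requests", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "256\n");
	fclose(f);
	return 0;
}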

Thanks,
Ming Lei

2015-05-05 16:55:56

by Tejun Heo

Subject: Re: [PATCH 2/2] block: loop: avoiding too many pending per work I/O

Hello, Ming.

On Tue, May 05, 2015 at 10:46:10PM +0800, Ming Lei wrote:
> On Tue, May 5, 2015 at 9:59 PM, Tejun Heo <[email protected]> wrote:
> > It's a bit weird to hard code this to 16 as this effectively becomes a
> > hidden bottleneck for concurrency. For cases where 16 isn't a good
> > value, hunting down what's going on can be painful as it's not visible
> > anywhere. I still think the right knob to control concurrency is
> > nr_requests for the loop device. You said that for linear IOs, it's
> > better to have higher nr_requests than concurrency but can you
> > elaborate why?
>
> I mean that, in the case of sequential I/O, the I/O is more likely to
> hit the page cache, so handling it can be quite quick; it is then often
> more efficient to handle the requests in one context (for example, one
> by one from the I/O queue) than from different contexts (scheduled to
> different worker threads). That can be achieved by setting a bigger
> nr_requests (queue depth).

Ah, so it's about the queueing latency. Blocking the issuer on the
get_request side, for the same level of concurrency, would incur much
longer latency before the next I/O can be dispatched. The arbitrary 16
is still bothersome, but I guess it's fine for now; we need to revisit
the whole thing, including the WQ_HIGHPRI flag. Maybe that made sense
when we had only one thread servicing all I/Os, but with high
concurrency I don't think it's a good idea.

Please feel free to add

Acked-by: Tejun Heo <[email protected]>

Thanks.

--
tejun

2015-05-05 19:01:09

by Justin Forbes

Subject: Re: [PATCH 0/2] block: loop: fix stacked loop and performance regression

On Tue, 2015-05-05 at 19:49 +0800, Ming Lei wrote:
> Hi,
>
> The 1st patch converts to a per-device workqueue because loop
> devices can be stacked.
>
> The 2nd patch limits max active works to 16, so that Fedora 22's
> boot performance regression can be fixed.
>
> drivers/block/loop.c | 30 ++++++++++++++----------------
> drivers/block/loop.h | 1 +
> 2 files changed, 15 insertions(+), 16 deletions(-)
>
>
> Thanks,
> Ming Lei
>

Tested with Fedora 22 ISOs, these still solve the problem for us.

Tested-by: Justin M. Forbes <[email protected]>

2015-05-06 03:14:26

by Ming Lei

Subject: Re: [PATCH 2/2] block: loop: avoiding too many pending per work I/O

On Wed, May 6, 2015 at 12:55 AM, Tejun Heo <[email protected]> wrote:
> Hello, Ming.
>
> On Tue, May 05, 2015 at 10:46:10PM +0800, Ming Lei wrote:
>> On Tue, May 5, 2015 at 9:59 PM, Tejun Heo <[email protected]> wrote:
>> > It's a bit weird to hard code this to 16 as this effectively becomes a
>> > hidden bottleneck for concurrency. For cases where 16 isn't a good
>> > value, hunting down what's going on can be painful as it's not visible
>> > anywhere. I still think the right knob to control concurrency is
>> > nr_requests for the loop device. You said that for linear IOs, it's
>> > better to have higher nr_requests than concurrency but can you
>> > elaborate why?
>>
>> I mean that, in the case of sequential I/O, the I/O is more likely to
>> hit the page cache, so handling it can be quite quick; it is then often
>> more efficient to handle the requests in one context (for example, one
>> by one from the I/O queue) than from different contexts (scheduled to
>> different worker threads). That can be achieved by setting a bigger
>> nr_requests (queue depth).
>
> Ah, so it's about the queueing latency. Blocking the issuer on the
> get_request side, for the same level of concurrency, would incur much
> longer latency before the next I/O can be dispatched. The arbitrary 16
> is still bothersome, but I guess it's fine for now; we need to revisit
> the whole thing, including the WQ_HIGHPRI flag. Maybe that made sense
> when we had only one thread servicing all I/Os, but with high
> concurrency I don't think it's a good idea.

Yes, I was thinking about that too, but concurrency can improve
random I/O throughput a lot in my tests.

Also, I have patches to use aio/dio for loop; then one thread is
enough, and both the double cache and the heavy context switching
can be avoided.

I will post them later for review.
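Just to make the direction concrete, here is a very rough sketch of
what the submission could look like (my assumptions, not the actual
patches: it presumes a struct kiocb embedded in struct loop_cmd and a
bio_vec array already built from the request):

static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
{
	struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);

	/* complete the blk-mq request once the backing file I/O is done */
	blk_mq_complete_request(cmd->rq);
}

static int lo_read_aio(struct loop_device *lo, struct loop_cmd *cmd,
		       struct bio_vec *bvec, unsigned long nr_segs,
		       loff_t pos)
{
	struct file *file = lo->lo_backing_file;
	struct iov_iter iter;

	/* wrap the request's pages in a bvec iov_iter ... */
	iov_iter_bvec(&iter, ITER_BVEC | READ, bvec, nr_segs,
		      blk_rq_bytes(cmd->rq));

	/* ... and submit it asynchronously via ->read_iter() */
	cmd->iocb.ki_pos = pos;
	cmd->iocb.ki_filp = file;
	cmd->iocb.ki_complete = lo_rw_aio_complete;
	return file->f_op->read_iter(&cmd->iocb, &iter);
}

One thread is then enough because the submission path no longer blocks
waiting for I/O, and the page cache is bypassed when the backing file
is opened with O_DIRECT.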

>
> Please feel free to add
>
> Acked-by: Tejun Heo <[email protected]>

Thanks for your review and ack!

thanks,
Ming Lei

2015-05-06 05:17:38

by Ming Lei

Subject: Re: [PATCH 2/2] block: loop: avoiding too many pending per work I/O

On Wed, May 6, 2015 at 11:14 AM, Ming Lei <[email protected]> wrote:
> On Wed, May 6, 2015 at 12:55 AM, Tejun Heo <[email protected]> wrote:
>> Hello, Ming.
>>
>> On Tue, May 05, 2015 at 10:46:10PM +0800, Ming Lei wrote:
>>> On Tue, May 5, 2015 at 9:59 PM, Tejun Heo <[email protected]> wrote:
>>> > It's a bit weird to hard code this to 16 as this effectively becomes a
>>> > hidden bottleneck for concurrency. For cases where 16 isn't a good
>>> > value, hunting down what's going on can be painful as it's not visible
>>> > anywhere. I still think the right knob to control concurrency is
>>> > nr_requests for the loop device. You said that for linear IOs, it's
>>> > better to have higher nr_requests than concurrency but can you
>>> > elaborate why?
>>>
>>> I mean that, in the case of sequential I/O, the I/O is more likely to
>>> hit the page cache, so handling it can be quite quick; it is then often
>>> more efficient to handle the requests in one context (for example, one
>>> by one from the I/O queue) than from different contexts (scheduled to
>>> different worker threads). That can be achieved by setting a bigger
>>> nr_requests (queue depth).
>>
>> Ah, so it's about the queueing latency. Blocking the issuer on the
>> get_request side, for the same level of concurrency, would incur much
>> longer latency before the next I/O can be dispatched. The arbitrary 16
>> is still bothersome, but I guess it's fine for now; we need to revisit
>> the whole thing, including the WQ_HIGHPRI flag. Maybe that made sense
>> when we had only one thread servicing all I/Os, but with high
>> concurrency I don't think it's a good idea.
>
> Yes, I was thinking about it too, but concurrency can improve
> random I/O throughput a lot in my tests.

Thinking about it further, the problem becomes very similar to
'Non-blocking buffered file read operations' [1], which was discussed
last year. If a read can be predicted to be served from the page
cache, we handle it in a single thread; otherwise we handle it
concurrently. That approach should be more efficient where it is
possible, I think.
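A minimal sketch of that prediction, under my assumptions (the helper
is made up, and peeking only at the first page is a simplification):

static bool lo_read_likely_cached(struct loop_device *lo, loff_t pos)
{
	struct address_space *mapping = lo->lo_backing_file->f_mapping;
	struct page *page;

	/* peek at the page cache without starting any read */
	page = find_get_page(mapping, pos >> PAGE_CACHE_SHIFT);
	if (!page)
		return false;
	page_cache_release(page);
	return true;
}

The caller would then handle likely-cached reads inline and queue the
rest to lo->wq.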

But I still prefer the dio/aio approach, because the double cache can
be avoided, which was a big win in my previous tests.

[1], https://lwn.net/Articles/612483/

Thanks,
Ming Lei

2015-05-06 07:36:45

by Christoph Hellwig

Subject: Re: [PATCH 2/2] block: loop: avoiding too many pending per work I/O

On Wed, May 06, 2015 at 01:17:33PM +0800, Ming Lei wrote:
> But I still prefer the dio/aio approach, because the double cache can
> be avoided, which was a big win in my previous tests.

Can you respin and resend the patches? With Al's iov_iter work and my
kiocb changes it should be fairly easy and non-intrusive to do.

2015-05-06 11:43:30

by Ming Lei

Subject: Re: [PATCH 2/2] block: loop: avoiding too many pending per work I/O

On Wed, May 6, 2015 at 3:36 PM, Christoph Hellwig <[email protected]> wrote:
> On Wed, May 06, 2015 at 01:17:33PM +0800, Ming Lei wrote:
>> But I still prefer the dio/aio approach, because the double cache can
>> be avoided, which was a big win in my previous tests.
>
> Can you respin and resend the patches? With Al's iov_iter work and my
> kiocb changes it should be fairly easy and non-intrusive to do.

I will post them later.

thanks,

2015-05-22 12:36:35

by Josh Boyer

Subject: Re: [PATCH 2/2] block: loop: avoiding too many pending per work I/O

On Tue, May 5, 2015 at 7:49 AM, Ming Lei <[email protected]> wrote:
> If there is too much pending per-work I/O, too many
> high-priority worker threads can be generated, and
> system performance can be affected.
>
> This patch limits the workqueue's max_active parameter to 16.
>
> This patch fixes the Fedora 22 live booting performance
> regression seen when booting from squashfs over dm on top
> of loop; it looks like the following reasons are related
> to the problem:
>
> - unlike other filesystems (such as ext4), squashfs is a bit
> special: I observed that increasing the number of I/O jobs
> accessing files in squashfs improves I/O performance only a
> little, though it makes a big difference for ext4
>
> - nested loop: both squashfs.img and ext3fs.img are mounted
> as loop block devices, and ext3fs.img lives inside the squashfs
>
> - during booting, lots of tasks may run concurrently
>
> Fixes: b5dd2f6047ca108001328aac0e8588edd15f1778
> Cc: [email protected] (v4.0)
> Cc: Justin M. Forbes <[email protected]>
> Signed-off-by: Ming Lei <[email protected]>

Did we ever come to a conclusion on this and patch 1/2 in the series?
Fedora has them applied to its 4.0.y-based kernels to fix the
performance regression we saw, and we're carrying them in rawhide as
well. I'm curious whether these will go into 4.1, or whether they're
queued at all for 4.2.

josh

> ---
> drivers/block/loop.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index 3dc1598..1bee523 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -725,7 +725,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
> goto out_putf;
> error = -ENOMEM;
> lo->wq = alloc_workqueue("kloopd%d",
> - WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 0,
> + WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 16,
> lo->lo_number);
> if (!lo->wq)
> goto out_putf;
> --
> 1.9.1
>

2015-05-22 13:32:39

by Ming Lei

Subject: Re: [PATCH 2/2] block: loop: avoiding too many pending per work I/O

On Fri, May 22, 2015 at 8:36 PM, Josh Boyer <[email protected]> wrote:
> On Tue, May 5, 2015 at 7:49 AM, Ming Lei <[email protected]> wrote:
>> If there is too much pending per-work I/O, too many
>> high-priority worker threads can be generated, and
>> system performance can be affected.
>>
>> This patch limits the workqueue's max_active parameter to 16.
>>
>> This patch fixes the Fedora 22 live booting performance
>> regression seen when booting from squashfs over dm on top
>> of loop; it looks like the following reasons are related
>> to the problem:
>>
>> - unlike other filesystems (such as ext4), squashfs is a bit
>> special: I observed that increasing the number of I/O jobs
>> accessing files in squashfs improves I/O performance only a
>> little, though it makes a big difference for ext4
>>
>> - nested loop: both squashfs.img and ext3fs.img are mounted
>> as loop block devices, and ext3fs.img lives inside the squashfs
>>
>> - during booting, lots of tasks may run concurrently
>>
>> Fixes: b5dd2f6047ca108001328aac0e8588edd15f1778
>> Cc: [email protected] (v4.0)
>> Cc: Justin M. Forbes <[email protected]>
>> Signed-off-by: Ming Lei <[email protected]>
>
> Did we ever come to conclusion on this and patch 1/2 in the series?
> Fedora has them applied to it's 4.0.y based kernels to fix the
> performance regression we saw, and we're carrying them in rawhide as
> well. I'm curious if these will go into 4.1 or if they're queued at
> all for 4.2?

I saw it queued in the for-next branch of the block tree, so it should
be merged for 4.2.

>
> josh
>
>> ---
>> drivers/block/loop.c | 2 +-
>> 1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
>> index 3dc1598..1bee523 100644
>> --- a/drivers/block/loop.c
>> +++ b/drivers/block/loop.c
>> @@ -725,7 +725,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
>> goto out_putf;
>> error = -ENOMEM;
>> lo->wq = alloc_workqueue("kloopd%d",
>> - WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 0,
>> + WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 16,
>> lo->lo_number);
>> if (!lo->wq)
>> goto out_putf;
>> --
>> 1.9.1
>>

2015-05-26 22:30:23

by Dave Kleikamp

Subject: Re: [PATCH 1/2] block: loop: convert to per-device workqueue

On 05/05/2015 06:49 AM, Ming Lei wrote:
> Documentation/workqueue.txt:
> If there is dependency among multiple work items used
> during memory reclaim, they should be queued to separate
> wq each with WQ_MEM_RECLAIM.
>
> Loop devices can be stacked, so we have to convert to a per-device
> workqueue. One example is the Fedora live CD.

I'm just now looking at this, but I found a minor nit. I'm not sure if
it's worth replacing the patch or just fixing it with a trivial cleanup
patch.

>
> Fixes: b5dd2f6047ca108001328aac0e8588edd15f1778
> Cc: [email protected] (v4.0)
> Cc: Justin M. Forbes <[email protected]>
> Signed-off-by: Ming Lei <[email protected]>
> ---
> drivers/block/loop.c | 30 ++++++++++++++----------------
> drivers/block/loop.h | 1 +
> 2 files changed, 15 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index ae3fcb4..3dc1598 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -86,8 +86,6 @@ static DEFINE_MUTEX(loop_index_mutex);
> static int max_part;
> static int part_shift;
>
> -static struct workqueue_struct *loop_wq;
> -
> static int transfer_xor(struct loop_device *lo, int cmd,
> struct page *raw_page, unsigned raw_off,
> struct page *loop_page, unsigned loop_off,
> @@ -725,6 +723,12 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
> size = get_loop_size(lo, file);
> if ((loff_t)(sector_t)size != size)
> goto out_putf;
> + error = -ENOMEM;
> + lo->wq = alloc_workqueue("kloopd%d",
> + WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 0,
> + lo->lo_number);
> + if (!lo->wq)
> + goto out_putf;
>
> error = 0;
>
> @@ -872,6 +876,8 @@ static int loop_clr_fd(struct loop_device *lo)
> lo->lo_flags = 0;
> if (!part_shift)
> lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
> + destroy_workqueue(lo->wq);
> + lo->wq = NULL;
> mutex_unlock(&lo->lo_ctl_mutex);
> /*
> * Need not hold lo_ctl_mutex to fput backing file.
> @@ -1425,9 +1431,13 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
> const struct blk_mq_queue_data *bd)
> {
> struct loop_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
> + struct loop_device *lo = cmd->rq->q->queuedata;
>
> blk_mq_start_request(bd->rq);
>
> + if (lo->lo_state != Lo_bound)
> + return -EIO;
> +
> if (cmd->rq->cmd_flags & REQ_WRITE) {
> struct loop_device *lo = cmd->rq->q->queuedata;

The above definition of lo becomes redundant now.
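The cleanup would presumably be just a hunk like this (my sketch, not
a posted patch):

 	if (cmd->rq->cmd_flags & REQ_WRITE) {
-		struct loop_device *lo = cmd->rq->q->queuedata;
 		bool need_sched = true;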

> bool need_sched = true;
> @@ -1441,9 +1451,9 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
> spin_unlock_irq(&lo->lo_lock);
>
> if (need_sched)
> - queue_work(loop_wq, &lo->write_work);
> + queue_work(lo->wq, &lo->write_work);
> } else {
> - queue_work(loop_wq, &cmd->read_work);
> + queue_work(lo->wq, &cmd->read_work);
> }
>
> return BLK_MQ_RQ_QUEUE_OK;
> @@ -1455,9 +1465,6 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
> struct loop_device *lo = cmd->rq->q->queuedata;
> int ret = -EIO;
>
> - if (lo->lo_state != Lo_bound)
> - goto failed;
> -
> if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY))
> goto failed;
>
> @@ -1806,13 +1813,6 @@ static int __init loop_init(void)
> goto misc_out;
> }
>
> - loop_wq = alloc_workqueue("kloopd",
> - WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 0);
> - if (!loop_wq) {
> - err = -ENOMEM;
> - goto misc_out;
> - }
> -
> blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
> THIS_MODULE, loop_probe, NULL, NULL);
>
> @@ -1850,8 +1850,6 @@ static void __exit loop_exit(void)
> blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range);
> unregister_blkdev(LOOP_MAJOR, "loop");
>
> - destroy_workqueue(loop_wq);
> -
> misc_deregister(&loop_misc);
> }
>
> diff --git a/drivers/block/loop.h b/drivers/block/loop.h
> index 301c27f..49564ed 100644
> --- a/drivers/block/loop.h
> +++ b/drivers/block/loop.h
> @@ -54,6 +54,7 @@ struct loop_device {
> gfp_t old_gfp_mask;
>
> spinlock_t lo_lock;
> + struct workqueue_struct *wq;
> struct list_head write_cmd_head;
> struct work_struct write_work;
> bool write_started;
>