Date: Sun, 19 Jul 2015 20:12:18 +0800
From: Ming Lei
To: Akinobu Mita
Cc: Linux Kernel Mailing List, Jens Axboe
Subject: Re: [PATCH v3 5/7] blk-mq: avoid inserting requests before establishing new mapping
In-Reply-To: <1437236903-31617-6-git-send-email-akinobu.mita@gmail.com>
References: <1437236903-31617-1-git-send-email-akinobu.mita@gmail.com>
 <1437236903-31617-6-git-send-email-akinobu.mita@gmail.com>

On Sun, Jul 19, 2015 at 12:28 AM, Akinobu Mita <akinobu.mita@gmail.com> wrote:
> Notifier callbacks for the CPU_ONLINE action can run on a CPU other than
> the one which was just onlined.  So it is possible for a process running
> on the just-onlined CPU to insert requests and run the hw queue before
> the new mapping is established by blk_mq_queue_reinit_notify().
>
> This can cause a problem the first time a CPU is onlined after the
> request queue was initialized.  At that point ctx->index_hw for the CPU,
> which is the index into hctx->ctxs[] for this ctx, is still zero,
> because blk_mq_queue_reinit_notify() has not yet been called by the
> notifier callbacks for the CPU_ONLINE action.
>
> For example, suppose there is a single hw queue (hctx) and two CPU
> queues (ctx0 for CPU0, and ctx1 for CPU1).  CPU1 has just been onlined
> and a request is inserted into ctx1->rq_list; bit 0 is set in the
> pending bitmap because ctx1->index_hw is still zero.
>
> Then, while running the hw queue, flush_busy_ctxs() finds bit 0 set in
> the pending bitmap and tries to retrieve requests from
> hctx->ctxs[0]->rq_list.  But hctx->ctxs[0] is a pointer to ctx0, so the
> request in ctx1->rq_list is ignored.
>
> Fix it by ensuring that the new mapping is established before the
> onlined CPU starts running.
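
To make the scenario above concrete, here is a minimal userspace sketch of
the stale-index problem. The ctx/hctx structures and the pending bitmap are
simplified, hypothetical stand-ins, not the actual blk-mq code:

#include <stdio.h>

#define NR_CTX 2

/* Hypothetical, much-simplified stand-ins for blk-mq's per-CPU software
 * queue (ctx) and hardware queue (hctx). */
struct ctx {
        int index_hw;           /* index into hctx.ctxs[] */
        int nr_queued;          /* stands in for ctx->rq_list */
};

struct hctx {
        struct ctx *ctxs[NR_CTX];
        unsigned long pending;  /* stands in for the pending bitmap */
};

/* Roughly what flush_busy_ctxs() does: for each set bit, drain the
 * software queue found via hctx->ctxs[bit]. */
static void flush_busy_ctxs(struct hctx *h)
{
        for (int bit = 0; bit < NR_CTX; bit++) {
                if (h->pending & (1UL << bit)) {
                        printf("draining %d request(s) via ctxs[%d]\n",
                               h->ctxs[bit]->nr_queued, bit);
                        h->ctxs[bit]->nr_queued = 0;
                        h->pending &= ~(1UL << bit);
                }
        }
}

int main(void)
{
        struct ctx ctx0 = { .index_hw = 0 }, ctx1 = { .index_hw = 0 };
        struct hctx h = { .ctxs = { &ctx0, &ctx1 } };

        /*
         * CPU1 was just onlined but the swqueue map has not been rebuilt
         * yet, so ctx1->index_hw is still 0.  Inserting a request sets
         * bit 0, which actually belongs to ctx0.
         */
        ctx1.nr_queued++;
        h.pending |= 1UL << ctx1.index_hw;

        /* ctxs[0] is ctx0 (empty), so ctx1's request is never drained. */
        flush_busy_ctxs(&h);
        printf("requests still stuck in ctx1: %d\n", ctx1.nr_queued);
        return 0;
}

With the patch, the remap is done in the CPU_UP_PREPARE path, so
ctx1->index_hw already points at the right slot before the new CPU can
insert requests.
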

> Signed-off-by: Akinobu Mita
> Cc: Jens Axboe
> Cc: Ming Lei

Reviewed-by: Ming Lei

> ---
>  block/blk-mq-cpumap.c |  9 ++++----
>  block/blk-mq.c        | 59 +++++++++++++++++++++++++++++++++++++++------------
>  block/blk-mq.h        |  3 ++-
>  3 files changed, 52 insertions(+), 19 deletions(-)
>
> diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
> index 1e28ddb..8764c24 100644
> --- a/block/blk-mq-cpumap.c
> +++ b/block/blk-mq-cpumap.c
> @@ -31,7 +31,8 @@ static int get_first_sibling(unsigned int cpu)
>          return cpu;
>  }
>
> -int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
> +int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
> +                            const struct cpumask *online_mask)
>  {
>          unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling;
>          cpumask_var_t cpus;
> @@ -41,7 +42,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
>
>          cpumask_clear(cpus);
>          nr_cpus = nr_uniq_cpus = 0;
> -        for_each_online_cpu(i) {
> +        for_each_cpu(i, online_mask) {
>                  nr_cpus++;
>                  first_sibling = get_first_sibling(i);
>                  if (!cpumask_test_cpu(first_sibling, cpus))
> @@ -51,7 +52,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
>
>          queue = 0;
>          for_each_possible_cpu(i) {
> -                if (!cpu_online(i)) {
> +                if (!cpumask_test_cpu(i, online_mask)) {
>                          map[i] = 0;
>                          continue;
>                  }
> @@ -95,7 +96,7 @@ unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
>          if (!map)
>                  return NULL;
>
> -        if (!blk_mq_update_queue_map(map, set->nr_hw_queues))
> +        if (!blk_mq_update_queue_map(map, set->nr_hw_queues, cpu_online_mask))
>                  return map;
>
>          kfree(map);
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index d5d93e0..d861c70 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -1799,7 +1799,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
>          }
>  }
>
> -static void blk_mq_map_swqueue(struct request_queue *q)
> +static void blk_mq_map_swqueue(struct request_queue *q,
> +                               const struct cpumask *online_mask)
>  {
>          unsigned int i;
>          struct blk_mq_hw_ctx *hctx;
> @@ -1816,7 +1817,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
>           */
>          queue_for_each_ctx(q, ctx, i) {
>                  /* If the cpu isn't online, the cpu is mapped to first hctx */
> -                if (!cpu_online(i))
> +                if (!cpumask_test_cpu(i, online_mask))
>                          continue;
>
>                  hctx = q->mq_ops->map_queue(q, i);
> @@ -1862,7 +1863,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
>          }
>
>          queue_for_each_ctx(q, ctx, i) {
> -                if (!cpu_online(i))
> +                if (!cpumask_test_cpu(i, online_mask))
>                          continue;
>
>                  hctx = q->mq_ops->map_queue(q, i);
> @@ -2047,13 +2048,15 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
>          if (blk_mq_init_hw_queues(q, set))
>                  goto err_hctxs;
>
> +        get_online_cpus();
>          mutex_lock(&all_q_mutex);
>
>          list_add_tail(&q->all_q_node, &all_q_list);
>          blk_mq_add_queue_tag_set(set, q);
> -        blk_mq_map_swqueue(q);
> +        blk_mq_map_swqueue(q, cpu_online_mask);
>
>          mutex_unlock(&all_q_mutex);
> +        put_online_cpus();
>
>          return q;
>
> @@ -2090,13 +2093,14 @@ void blk_mq_free_queue(struct request_queue *q)
>  }
>
>  /* Basically redo blk_mq_init_queue with queue frozen */
> -static void blk_mq_queue_reinit(struct request_queue *q)
> +static void blk_mq_queue_reinit(struct request_queue *q,
> +                                const struct cpumask *online_mask)
>  {
>          WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
>
>          blk_mq_sysfs_unregister(q);
>
> -        blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
> +        blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask);
>
>          /*
>           * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
> @@ -2104,7 +2108,7 @@ static void blk_mq_queue_reinit(struct request_queue *q)
>           * involves free and re-allocate memory, worthy doing?)
>           */
>
> -        blk_mq_map_swqueue(q);
> +        blk_mq_map_swqueue(q, online_mask);
>
>          blk_mq_sysfs_register(q);
>  }
> @@ -2113,16 +2117,43 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
>                                        unsigned long action, void *hcpu)
>  {
>          struct request_queue *q;
> +        int cpu = (unsigned long)hcpu;
> +        /*
> +         * New online cpumask which is going to be set in this hotplug event.
> +         * Declare this cpumask as static, as cpu-hotplug operations are
> +         * invoked one-by-one and dynamically allocating it could fail.
> +         */
> +        static struct cpumask online_new;
>
>          /*
> -         * Before new mappings are established, hotadded cpu might already
> -         * start handling requests. This doesn't break anything as we map
> -         * offline CPUs to first hardware queue. We will re-init the queue
> -         * below to get optimal settings.
> +         * Before a hotadded cpu starts handling requests, new mappings must
> +         * be established.  Otherwise requests in its hw queue might never
> +         * be dispatched.
> +         *
> +         * For example, there is a single hw queue (hctx) and two CPU queues
> +         * (ctx0 for CPU0, and ctx1 for CPU1).
> +         *
> +         * Now CPU1 is just onlined and a request is inserted into
> +         * ctx1->rq_list; bit 0 is set in the pending bitmap as
> +         * ctx1->index_hw is still zero.
> +         *
> +         * And then while running the hw queue, flush_busy_ctxs() finds
> +         * bit 0 set in the pending bitmap and tries to retrieve requests
> +         * in hctx->ctxs[0]->rq_list.  But hctx->ctxs[0] is a pointer to
> +         * ctx0, so the request in ctx1->rq_list is ignored.
>           */
> -        if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
> -            action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
> +        switch (action & ~CPU_TASKS_FROZEN) {
> +        case CPU_DEAD:
> +        case CPU_UP_CANCELED:
> +                cpumask_copy(&online_new, cpu_online_mask);
> +                break;
> +        case CPU_UP_PREPARE:
> +                cpumask_copy(&online_new, cpu_online_mask);
> +                cpumask_set_cpu(cpu, &online_new);
> +                break;
> +        default:
>                  return NOTIFY_OK;
> +        }
>
>          mutex_lock(&all_q_mutex);
>
> @@ -2146,7 +2177,7 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
>          }
>
>          list_for_each_entry(q, &all_q_list, all_q_node)
> -                blk_mq_queue_reinit(q);
> +                blk_mq_queue_reinit(q, &online_new);
>
>          list_for_each_entry(q, &all_q_list, all_q_node)
>                  blk_mq_unfreeze_queue(q);
> diff --git a/block/blk-mq.h b/block/blk-mq.h
> index 6a48c4c..f4fea79 100644
> --- a/block/blk-mq.h
> +++ b/block/blk-mq.h
> @@ -51,7 +51,8 @@ void blk_mq_disable_hotplug(void);
>   * CPU -> queue mappings
>   */
>  extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set);
> -extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);
> +extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
> +                                   const struct cpumask *online_mask);
>  extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
>
>  /*
> --
> 1.9.1
>

--
Ming Lei