MIME-Version: 1.0
In-Reply-To: <1435847397-724-5-git-send-email-akinobu.mita@gmail.com>
References: <1435847397-724-1-git-send-email-akinobu.mita@gmail.com> <1435847397-724-5-git-send-email-akinobu.mita@gmail.com>
Date: Thu, 9 Jul 2015 15:04:34 +0800
Subject: Re: [PATCH v2 4/6] blk-mq: avoid inserting requests before establishing new mapping
From: Ming Lei
To: Akinobu Mita
Cc: Linux Kernel Mailing List, Jens Axboe
Content-Type: text/plain; charset=UTF-8
Sender: linux-kernel-owner@vger.kernel.org
X-Mailing-List: linux-kernel@vger.kernel.org

On Thu, Jul 2, 2015 at 10:29 PM, Akinobu Mita wrote:
> Notifier callbacks for the CPU_ONLINE action can run on a CPU other
> than the CPU which was just onlined.  So it is possible for a process
> running on the just-onlined CPU to insert a request and run the hw
> queue before the new mapping is established by
> blk_mq_queue_reinit_notify().
>
> This can cause a problem when the CPU is onlined for the first time
> since the request queue was initialized.  At that point ctx->index_hw
> for the CPU, which is the index into hctx->ctxs[] for this ctx, is
> still zero before blk_mq_queue_reinit_notify() is called by the
> notifier callbacks for the CPU_ONLINE action.
>
> For example, take a single hw queue (hctx) and two CPU queues
> (ctx0 for CPU0, and ctx1 for CPU1).  Now CPU1 is just onlined and
> a request is inserted into ctx1->rq_list; bit0 is set in the pending
> bitmap because ctx1->index_hw is still zero.
>
> Then, while running the hw queue, flush_busy_ctxs() finds bit0 set
> in the pending bitmap and tries to retrieve requests from
> hctx->ctxs[0]->rq_list.  But hctx->ctxs[0] is a pointer to ctx0, so
> the request in ctx1->rq_list is ignored.
>
> Fix it by ensuring that the new mapping is established before the
> onlined CPU starts running.
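The lost-request scenario described above can be boiled down to a
standalone userspace sketch (illustrative only: these are not the real
blk-mq structures; index_hw and the flush loop mirror the kernel names,
while nr_queued is just a stand-in for rq_list):

    #include <stdio.h>

    #define NR_CTX 2

    struct ctx {
            int index_hw;   /* index into hctx->ctxs[], set up by the queue mapping */
            int nr_queued;  /* stand-in for the per-ctx rq_list */
    };

    struct hctx {
            struct ctx *ctxs[NR_CTX];
            unsigned long pending;  /* one "busy" bit per ctxs[] slot */
    };

    /* Insert: queue a request and mark the ctx busy at its (possibly stale) index_hw. */
    static void insert_request(struct hctx *hctx, struct ctx *ctx)
    {
            ctx->nr_queued++;
            hctx->pending |= 1UL << ctx->index_hw;
    }

    /* Flush: for every busy bit, drain the ctx found at that slot of hctx->ctxs[]. */
    static void flush_busy_ctxs(struct hctx *hctx)
    {
            int bit;

            for (bit = 0; bit < NR_CTX; bit++) {
                    struct ctx *ctx = hctx->ctxs[bit];

                    if (!(hctx->pending & (1UL << bit)))
                            continue;
                    printf("flushing slot %d: %d request(s)\n", bit, ctx->nr_queued);
                    ctx->nr_queued = 0;
                    hctx->pending &= ~(1UL << bit);
            }
    }

    int main(void)
    {
            struct ctx ctx0 = { .index_hw = 0 };
            struct ctx ctx1 = { .index_hw = 0 };  /* stale: should become 1 after remapping */
            struct hctx hctx = { .ctxs = { &ctx0, &ctx1 } };

            /* CPU1 was just onlined; the remap that would set ctx1.index_hw = 1 has not run yet. */
            insert_request(&hctx, &ctx1);
            flush_busy_ctxs(&hctx);
            printf("requests stranded in ctx1: %d\n", ctx1.nr_queued);
            return 0;
    }

Running this flushes slot 0 (ctx0, which is empty) while the request
stays stranded in ctx1, which is exactly the case the patch closes by
remapping before the onlined CPU can insert requests.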
>
> Signed-off-by: Akinobu Mita
> Cc: Jens Axboe
> Cc: Ming Lei
> ---
>  block/blk-mq-cpumap.c |  9 +++++----
>  block/blk-mq.c        | 41 ++++++++++++++++++++++++++---------------
>  block/blk-mq.h        |  3 ++-
>  3 files changed, 33 insertions(+), 20 deletions(-)
>
> diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
> index 1e28ddb..8764c24 100644
> --- a/block/blk-mq-cpumap.c
> +++ b/block/blk-mq-cpumap.c
> @@ -31,7 +31,8 @@ static int get_first_sibling(unsigned int cpu)
>         return cpu;
>  }
>
> -int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
> +int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
> +                           const struct cpumask *online_mask)
>  {
>         unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling;
>         cpumask_var_t cpus;
> @@ -41,7 +42,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
>
>         cpumask_clear(cpus);
>         nr_cpus = nr_uniq_cpus = 0;
> -       for_each_online_cpu(i) {
> +       for_each_cpu(i, online_mask) {
>                 nr_cpus++;
>                 first_sibling = get_first_sibling(i);
>                 if (!cpumask_test_cpu(first_sibling, cpus))
> @@ -51,7 +52,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
>
>         queue = 0;
>         for_each_possible_cpu(i) {
> -               if (!cpu_online(i)) {
> +               if (!cpumask_test_cpu(i, online_mask)) {
>                         map[i] = 0;
>                         continue;
>                 }
> @@ -95,7 +96,7 @@ unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
>         if (!map)
>                 return NULL;
>
> -       if (!blk_mq_update_queue_map(map, set->nr_hw_queues))
> +       if (!blk_mq_update_queue_map(map, set->nr_hw_queues, cpu_online_mask))
>                 return map;
>
>         kfree(map);
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index ff72e18..ad07373 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -1799,7 +1799,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
>         }
>  }
>
> -static void blk_mq_map_swqueue(struct request_queue *q)
> +static void blk_mq_map_swqueue(struct request_queue *q,
> +                              const struct cpumask *online_mask)
>  {
>         unsigned int i;
>         struct blk_mq_hw_ctx *hctx;
> @@ -1816,7 +1817,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
>          */
>         queue_for_each_ctx(q, ctx, i) {
>                 /* If the cpu isn't online, the cpu is mapped to first hctx */
> -               if (!cpu_online(i))
> +               if (!cpumask_test_cpu(i, online_mask))
>                         continue;
>
>                 hctx = q->mq_ops->map_queue(q, i);
> @@ -2046,7 +2047,10 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
>
>         blk_mq_add_queue_tag_set(set, q);
>
> -       blk_mq_map_swqueue(q);
> +       get_online_cpus();
> +       blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, cpu_online_mask);
> +       blk_mq_map_swqueue(q, cpu_online_mask);
> +       put_online_cpus();
>
>         return q;
>
> @@ -2083,13 +2087,14 @@ void blk_mq_free_queue(struct request_queue *q)
>  }
>
>  /* Basically redo blk_mq_init_queue with queue frozen */
> -static void blk_mq_queue_reinit(struct request_queue *q)
> +static void blk_mq_queue_reinit(struct request_queue *q,
> +                               const struct cpumask *online_mask)
>  {
>         WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
>
>         blk_mq_sysfs_unregister(q);
>
> -       blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
> +       blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask);
>
>         /*
>          * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
> @@ -2097,25 +2102,31 @@ static void blk_mq_queue_reinit(struct request_queue *q)
>          * involves free and re-allocate memory, worthy doing?)
>          */
>
> -       blk_mq_map_swqueue(q);
> +       blk_mq_map_swqueue(q, online_mask);
>
>         blk_mq_sysfs_register(q);
>  }
>
> +static struct cpumask online_new;

Better to add a comment on why there isn't a race on this single global
variable.

> +
>  static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
>                                       unsigned long action, void *hcpu)
>  {
>         struct request_queue *q;
> +       int cpu = (unsigned long)hcpu;
>
> -       /*
> -        * Before new mappings are established, hotadded cpu might already
> -        * start handling requests. This doesn't break anything as we map
> -        * offline CPUs to first hardware queue. We will re-init the queue
> -        * below to get optimal settings.
> -        */

You should put a comment here explaining why this change is introduced,
because the race isn't very obvious.

> -       if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
> -           action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
> +       switch (action & ~CPU_TASKS_FROZEN) {
> +       case CPU_DEAD:
> +       case CPU_UP_CANCELED:
> +               cpumask_copy(&online_new, cpu_online_mask);
> +               break;
> +       case CPU_UP_PREPARE:
> +               cpumask_copy(&online_new, cpu_online_mask);
> +               cpumask_set_cpu(cpu, &online_new);
> +               break;
> +       default:
>                 return NOTIFY_OK;
> +       }
>
>         mutex_lock(&all_q_mutex);
>
> @@ -2139,7 +2150,7 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
>         }
>
>         list_for_each_entry(q, &all_q_list, all_q_node)
> -               blk_mq_queue_reinit(q);
> +               blk_mq_queue_reinit(q, &online_new);
>
>         list_for_each_entry(q, &all_q_list, all_q_node)
>                 blk_mq_unfreeze_queue(q);
> diff --git a/block/blk-mq.h b/block/blk-mq.h
> index 6a48c4c..f4fea79 100644
> --- a/block/blk-mq.h
> +++ b/block/blk-mq.h
> @@ -51,7 +51,8 @@ void blk_mq_disable_hotplug(void);
>   * CPU -> queue mappings
>   */
>  extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set);
> -extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);
> +extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
> +                                  const struct cpumask *online_mask);
>  extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
>
>  /*
> --
> 1.9.1
>

--
Ming Lei