Date: Sun, 19 Jul 2015 20:12:18 +0800
From: Ming Lei
To: Akinobu Mita
Cc: Linux Kernel Mailing List, Jens Axboe
Subject: Re: [PATCH v3 5/7] blk-mq: avoid inserting requests before establishing new mapping
In-Reply-To: <1437236903-31617-6-git-send-email-akinobu.mita@gmail.com>
References: <1437236903-31617-1-git-send-email-akinobu.mita@gmail.com>
 <1437236903-31617-6-git-send-email-akinobu.mita@gmail.com>

On Sun, Jul 19, 2015 at 12:28 AM, Akinobu Mita <akinobu.mita@gmail.com> wrote:
> Notifier callbacks for the CPU_ONLINE action can run on a CPU other than
> the one which was just onlined.  So it is possible for a process running
> on the just-onlined CPU to insert requests and run the hw queue before
> the new mapping is established by blk_mq_queue_reinit_notify().
>
> This can cause a problem the first time a CPU is onlined after the
> request queue was initialized.  At that point ctx->index_hw for the CPU,
> which is the index into hctx->ctxs[] for this ctx, is still zero,
> because blk_mq_queue_reinit_notify() has not yet been called by the
> notifier callbacks for the CPU_ONLINE action.
>
> For example, suppose there is a single hw queue (hctx) and two CPU
> queues (ctx0 for CPU0, and ctx1 for CPU1).  CPU1 has just been onlined
> and a request is inserted into ctx1->rq_list; bit 0 is set in the
> pending bitmap because ctx1->index_hw is still zero.
>
> Then, while running the hw queue, flush_busy_ctxs() finds bit 0 set in
> the pending bitmap and tries to retrieve requests from
> hctx->ctxs[0]->rq_list.  But hctx->ctxs[0] is a pointer to ctx0, so the
> request in ctx1->rq_list is ignored.
>
> Fix it by ensuring that the new mapping is established before the
> onlined CPU starts running.
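
To make the scenario above concrete, here is a minimal userspace sketch of
the stale-index problem. The ctx/hctx structures and the pending bitmap are
simplified, hypothetical stand-ins, not the actual blk-mq code:

#include <stdio.h>

#define NR_CTX 2

/* Hypothetical, much-simplified stand-ins for blk-mq's per-CPU software
 * queue (ctx) and hardware queue (hctx). */
struct ctx {
        int index_hw;           /* index into hctx.ctxs[] */
        int nr_queued;          /* stands in for ctx->rq_list */
};

struct hctx {
        struct ctx *ctxs[NR_CTX];
        unsigned long pending;  /* stands in for the pending bitmap */
};

/* Roughly what flush_busy_ctxs() does: for each set bit, drain the
 * software queue found via hctx->ctxs[bit]. */
static void flush_busy_ctxs(struct hctx *h)
{
        for (int bit = 0; bit < NR_CTX; bit++) {
                if (h->pending & (1UL << bit)) {
                        printf("draining %d request(s) via ctxs[%d]\n",
                               h->ctxs[bit]->nr_queued, bit);
                        h->ctxs[bit]->nr_queued = 0;
                        h->pending &= ~(1UL << bit);
                }
        }
}

int main(void)
{
        struct ctx ctx0 = { .index_hw = 0 }, ctx1 = { .index_hw = 0 };
        struct hctx h = { .ctxs = { &ctx0, &ctx1 } };

        /*
         * CPU1 was just onlined but the swqueue map has not been rebuilt
         * yet, so ctx1->index_hw is still 0.  Inserting a request sets
         * bit 0, which actually belongs to ctx0.
         */
        ctx1.nr_queued++;
        h.pending |= 1UL << ctx1.index_hw;

        /* ctxs[0] is ctx0 (empty), so ctx1's request is never drained. */
        flush_busy_ctxs(&h);
        printf("requests still stuck in ctx1: %d\n", ctx1.nr_queued);
        return 0;
}

With the patch, the remap is done in the CPU_UP_PREPARE path, so
ctx1->index_hw already points at the right slot before the new CPU can
insert requests.
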

> Signed-off-by: Akinobu Mita
> Cc: Jens Axboe
> Cc: Ming Lei

Reviewed-by: Ming Lei

> ---
>  block/blk-mq-cpumap.c |  9 ++++----
>  block/blk-mq.c        | 59 +++++++++++++++++++++++++++++++++++++++------------
>  block/blk-mq.h        |  3 ++-
>  3 files changed, 52 insertions(+), 19 deletions(-)
>
> diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
> index 1e28ddb..8764c24 100644
> --- a/block/blk-mq-cpumap.c
> +++ b/block/blk-mq-cpumap.c
> @@ -31,7 +31,8 @@ static int get_first_sibling(unsigned int cpu)
>          return cpu;
>  }
>
> -int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
> +int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
> +                            const struct cpumask *online_mask)
>  {
>          unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling;
>          cpumask_var_t cpus;
> @@ -41,7 +42,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
>
>          cpumask_clear(cpus);
>          nr_cpus = nr_uniq_cpus = 0;
> -        for_each_online_cpu(i) {
> +        for_each_cpu(i, online_mask) {
>                  nr_cpus++;
>                  first_sibling = get_first_sibling(i);
>                  if (!cpumask_test_cpu(first_sibling, cpus))
> @@ -51,7 +52,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
>
>          queue = 0;
>          for_each_possible_cpu(i) {
> -                if (!cpu_online(i)) {
> +                if (!cpumask_test_cpu(i, online_mask)) {
>                          map[i] = 0;
>                          continue;
>                  }
> @@ -95,7 +96,7 @@ unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
>          if (!map)
>                  return NULL;
>
> -        if (!blk_mq_update_queue_map(map, set->nr_hw_queues))
> +        if (!blk_mq_update_queue_map(map, set->nr_hw_queues, cpu_online_mask))
>                  return map;
>
>          kfree(map);
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index d5d93e0..d861c70 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -1799,7 +1799,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
>          }
>  }
>
> -static void blk_mq_map_swqueue(struct request_queue *q)
> +static void blk_mq_map_swqueue(struct request_queue *q,
> +                               const struct cpumask *online_mask)
>  {
>          unsigned int i;
>          struct blk_mq_hw_ctx *hctx;
> @@ -1816,7 +1817,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
>           */
>          queue_for_each_ctx(q, ctx, i) {
>                  /* If the cpu isn't online, the cpu is mapped to first hctx */
> -                if (!cpu_online(i))
> +                if (!cpumask_test_cpu(i, online_mask))
>                          continue;
>
>                  hctx = q->mq_ops->map_queue(q, i);
> @@ -1862,7 +1863,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
>          }
>
>          queue_for_each_ctx(q, ctx, i) {
> -                if (!cpu_online(i))
> +                if (!cpumask_test_cpu(i, online_mask))
>                          continue;
>
>                  hctx = q->mq_ops->map_queue(q, i);
> @@ -2047,13 +2048,15 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
>          if (blk_mq_init_hw_queues(q, set))
>                  goto err_hctxs;
>
> +        get_online_cpus();
>          mutex_lock(&all_q_mutex);
>
>          list_add_tail(&q->all_q_node, &all_q_list);
>          blk_mq_add_queue_tag_set(set, q);
> -        blk_mq_map_swqueue(q);
> +        blk_mq_map_swqueue(q, cpu_online_mask);
>
>          mutex_unlock(&all_q_mutex);
> +        put_online_cpus();
>
>          return q;
>
> @@ -2090,13 +2093,14 @@ void blk_mq_free_queue(struct request_queue *q)
>  }
>
>  /* Basically redo blk_mq_init_queue with queue frozen */
> -static void blk_mq_queue_reinit(struct request_queue *q)
> +static void blk_mq_queue_reinit(struct request_queue *q,
> +                                const struct cpumask *online_mask)
>  {
>          WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
>
>          blk_mq_sysfs_unregister(q);
>
> -        blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
> +        blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask);
>
>          /*
>           * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
> @@ -2104,7 +2108,7 @@ static void blk_mq_queue_reinit(struct request_queue *q)
>           * involves free and re-allocate memory, worthy doing?)
>           */
>
> -        blk_mq_map_swqueue(q);
> +        blk_mq_map_swqueue(q, online_mask);
>
>          blk_mq_sysfs_register(q);
>  }
> @@ -2113,16 +2117,43 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
>                                        unsigned long action, void *hcpu)
>  {
>          struct request_queue *q;
> +        int cpu = (unsigned long)hcpu;
> +        /*
> +         * New online cpumask which is going to be set in this hotplug event.
> +         * Declare this cpumask as static, as cpu-hotplug operations are
> +         * invoked one-by-one and dynamically allocating it could fail.
> +         */
> +        static struct cpumask online_new;
>
>          /*
> -         * Before new mappings are established, hotadded cpu might already
> -         * start handling requests. This doesn't break anything as we map
> -         * offline CPUs to first hardware queue. We will re-init the queue
> -         * below to get optimal settings.
> +         * Before a hotadded cpu starts handling requests, new mappings must
> +         * be established.  Otherwise requests in its hw queue might never
> +         * be dispatched.
> +         *
> +         * For example, there is a single hw queue (hctx) and two CPU queues
> +         * (ctx0 for CPU0, and ctx1 for CPU1).
> +         *
> +         * Now CPU1 is just onlined and a request is inserted into
> +         * ctx1->rq_list; bit 0 is set in the pending bitmap as
> +         * ctx1->index_hw is still zero.
> +         *
> +         * And then while running the hw queue, flush_busy_ctxs() finds
> +         * bit 0 set in the pending bitmap and tries to retrieve requests
> +         * in hctx->ctxs[0]->rq_list.  But hctx->ctxs[0] is a pointer to
> +         * ctx0, so the request in ctx1->rq_list is ignored.
>           */
> -        if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
> -            action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
> +        switch (action & ~CPU_TASKS_FROZEN) {
> +        case CPU_DEAD:
> +        case CPU_UP_CANCELED:
> +                cpumask_copy(&online_new, cpu_online_mask);
> +                break;
> +        case CPU_UP_PREPARE:
> +                cpumask_copy(&online_new, cpu_online_mask);
> +                cpumask_set_cpu(cpu, &online_new);
> +                break;
> +        default:
>                  return NOTIFY_OK;
> +        }
>
>          mutex_lock(&all_q_mutex);
>
> @@ -2146,7 +2177,7 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
>          }
>
>          list_for_each_entry(q, &all_q_list, all_q_node)
> -                blk_mq_queue_reinit(q);
> +                blk_mq_queue_reinit(q, &online_new);
>
>          list_for_each_entry(q, &all_q_list, all_q_node)
>                  blk_mq_unfreeze_queue(q);
> diff --git a/block/blk-mq.h b/block/blk-mq.h
> index 6a48c4c..f4fea79 100644
> --- a/block/blk-mq.h
> +++ b/block/blk-mq.h
> @@ -51,7 +51,8 @@ void blk_mq_disable_hotplug(void);
>   * CPU -> queue mappings
>   */
>  extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set);
> -extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);
> +extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
> +                                   const struct cpumask *online_mask);
>  extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
>
>  /*
> --
> 1.9.1
>

--
Ming Lei