Message-ID: <adf88c55-64b0-6138-fea8-797d2b30b770@grimberg.me>
Date:   Mon, 13 Mar 2023 14:08:43 +0200
MIME-Version: 1.0
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101
 Thunderbird/102.7.1
Subject: Re: [RFC PATCH] nvme-tcp: fix possible crash when only partilal
 queues are allocated by nvme_tcp_alloc_queue
Content-Language: en-US
From:   Sagi Grimberg <sagi@grimberg.me>
To:     zhangyanjun@cestc.cn, kbusch@kernel.org, axboe@fb.com, hch@lst.de
Cc:     linux-kernel@vger.kernel.org, linux-nvme@lists.infradead.org,
        xiang.li@cestc.cn, shaopeijie@cestc.cn
References: <20230313113204.2205730-1-zhangyanjun@cestc.cn>
 <55bfaef2-1faf-ac7a-53b1-60f9e7b8afea@grimberg.me>
In-Reply-To: <55bfaef2-1faf-ac7a-53b1-60f9e7b8afea@grimberg.me>
Content-Type: text/plain; charset=UTF-8; format=flowed
Content-Transfer-Encoding: 8bit
Precedence: bulk


>   So we query that directly freeing these allocated
>> I/O queues may result in this issue. To avoid this issuse, we try to make
>> the following code modifications, but have not verified to be valid yet
>> because of the occasionality of this issue.
>>
>> Now, we can not confirm the caller of queuing on the io_work during
>> allocating all the I/O queues, but we doubt that the socket interface
>> sk_data_ready=nvme_tcp_data_ready initialized by nvme_tcp_alloc_queue
>> may do this action in this interval. Hope to get some suggestions, 
>> Thanks!
> 
> I would start by determining that the running io_work is an io queue or
> the admin queue. It is also possible that the admin queue cleanup after
> we io queues allocation failed is the problematic teardown that is
> causing this bug.
> 
>>
>> Signed-off-by: Yanjun Zhang <zhangyanjun@cestc.cn>
>> ---
>>   drivers/nvme/host/tcp.c | 7 +++++--
>>   1 file changed, 5 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
>> index 9b47dcb2a..5782183b8 100644
>> --- a/drivers/nvme/host/tcp.c
>> +++ b/drivers/nvme/host/tcp.c
>> @@ -1761,6 +1761,7 @@ static int nvme_tcp_alloc_admin_queue(struct 
>> nvme_ctrl *ctrl)
>>   static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
>>   {
>>       int i, ret;
>> +    struct nvme_tcp_ctrl *tcp_ctrl = NULL;
>>       for (i = 1; i < ctrl->queue_count; i++) {
>>           ret = nvme_tcp_alloc_queue(ctrl, i);
>> @@ -1771,9 +1772,11 @@ static int __nvme_tcp_alloc_io_queues(struct 
>> nvme_ctrl *ctrl)
>>       return 0;
>>   out_free_queues:
>> -    for (i--; i >= 1; i--)
>> +    for (i--; i >= 1; i--) {
>> +        tcp_ctrl = to_tcp_ctrl(ctrl);
>> +        __nvme_tcp_stop_queue(&tcp_ctrl->queues[i]);
>>           nvme_tcp_free_queue(ctrl, i);
>> -
>> +    }
>>       return ret;
>>   }
> 
> The crash dump reminds me of a reported issue fixed by:
> 160f3549a907 ("nvme-tcp: fix UAF when detecting digest errors")
> 
> Is this applied in your host kernel?
> 
> I'd also add this trace to your reproduction attempts:
> -- 
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 0ec7d3329595..0cbc8afb41c8 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -919,6 +919,10 @@ static void nvme_tcp_data_ready(struct sock *sk)
> 
>          read_lock_bh(&sk->sk_callback_lock);
>          queue = sk->sk_user_data;
> +       if (!test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags)) {
> +               pr_err("got data on a deallocated queue %d\n", 
> nvme_tcp_queue_id(queue));
> +               WARN_ON_ONCE(1);
> +       }
>          if (likely(queue && queue->rd_enabled) &&
>              !test_bit(NVME_TCP_Q_POLLING, &queue->flags))
>                  queue_work_on(queue->io_cpu, nvme_tcp_wq, 
> &queue->io_work);
> -- 

If indeed this is the io_queue and for some reason it is running,
perhaps the correct fix would be to set the socket ops later in
start_queue. It will also be symmetric because we clear them in
stop_queue:
--
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 0ec7d3329595..b3b1e39966c0 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1617,22 +1617,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl 
*nctrl, int qid)
         if (ret)
                 goto err_init_connect;

-       queue->rd_enabled = true;
         set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
-       nvme_tcp_init_recv_ctx(queue);
-
-       write_lock_bh(&queue->sock->sk->sk_callback_lock);
-       queue->sock->sk->sk_user_data = queue;
-       queue->state_change = queue->sock->sk->sk_state_change;
-       queue->data_ready = queue->sock->sk->sk_data_ready;
-       queue->write_space = queue->sock->sk->sk_write_space;
-       queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
-       queue->sock->sk->sk_state_change = nvme_tcp_state_change;
-       queue->sock->sk->sk_write_space = nvme_tcp_write_space;
-#ifdef CONFIG_NET_RX_BUSY_POLL
-       queue->sock->sk->sk_ll_usec = 1;
-#endif
-       write_unlock_bh(&queue->sock->sk->sk_callback_lock);

         return 0;

@@ -1690,6 +1675,21 @@ static int nvme_tcp_start_queue(struct nvme_ctrl 
*nctrl, int idx)
         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
         int ret;

+       queue->rd_enabled = true;
+       nvme_tcp_init_recv_ctx(queue);
+       write_lock_bh(&queue->sock->sk->sk_callback_lock);
+       queue->sock->sk->sk_user_data = queue;
+       queue->state_change = queue->sock->sk->sk_state_change;
+       queue->data_ready = queue->sock->sk->sk_data_ready;
+       queue->write_space = queue->sock->sk->sk_write_space;
+       queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
+       queue->sock->sk->sk_state_change = nvme_tcp_state_change;
+       queue->sock->sk->sk_write_space = nvme_tcp_write_space;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+       queue->sock->sk->sk_ll_usec = 1;
+#endif
+       write_unlock_bh(&queue->sock->sk->sk_callback_lock);
+
         if (idx)
                 ret = nvmf_connect_io_queue(nctrl, idx);
         else
--