Subject: Re: [PATCH 3/3] nvme: complete request in work queue on CPU with flooded interrupts
To: longli@linuxonhyperv.com, Ingo Molnar, Peter Zijlstra, Keith Busch,
    Jens Axboe, Christoph Hellwig, linux-nvme@lists.infradead.org,
    linux-kernel@vger.kernel.org
Cc: Long Li
From: Sagi Grimberg
Date: Tue, 20 Aug 2019 10:33:38 -0700
Message-ID: <2a30a07f-982c-c291-e263-0cf72ec61235@grimberg.me>
In-Reply-To: <1566281669-48212-4-git-send-email-longli@linuxonhyperv.com>

> From: Long Li
>
> When an NVMe hardware queue is mapped to several CPU queues, it is
> possible that the CPU this hardware queue is bound to is flooded by
> I/O completions returning for the other CPUs.
>
> For example, consider the following scenario:
> 1. CPUs 0, 1, 2 and 3 share the same hardware queue
> 2. the hardware queue interrupts CPU 0 for I/O responses
> 3. processes on CPUs 1, 2 and 3 keep sending I/Os
>
> CPU 0 may be flooded with interrupts from the NVMe device that are I/O
> responses for CPUs 1, 2 and 3. Under heavy I/O load, it is possible
> that CPU 0 spends all its time servicing NVMe and other system
> interrupts and never gets a chance to run in process context.
>
> To fix this, CPU 0 can schedule a work item to complete the I/O
> request when it detects that the scheduler is not making progress.
> This serves multiple purposes:
>
> 1. This CPU has to be scheduled to complete the request. The other
> CPUs can't issue more I/Os until some previous I/Os are completed.
> This helps this CPU get out of NVMe interrupt handling.
>
> 2. This acts as a throttling mechanism for NVMe devices, in that they
> cannot starve a CPU while it services I/Os from other CPUs.
>
> 3. This CPU can make progress on RCU and other work items on its
> queue.

The problem is indeed real, but this is the wrong approach in my mind.

We already have irq_poll, which takes care of properly budgeting polling
cycles so as not to hog the CPU. I've sent an RFC for this particular
problem before [1]. At the time, IIRC, Christoph suggested that we poll
the first batch directly from irq context and reap the rest in the
irqpoll handler.

[1]: http://lists.infradead.org/pipermail/linux-nvme/2016-October/006497.html
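For reference, the general shape of the irq_poll pattern is roughly the
following. This is just a sketch with made-up my_dev names and a
hypothetical my_dev_reap_completions() helper, not actual driver code:

#include <linux/interrupt.h>
#include <linux/irq_poll.h>

#define MY_DEV_POLL_BUDGET	64

struct my_dev {
	struct irq_poll iop;
	int irq;
};

/* Runs in softirq context; reaps up to @budget completions per call. */
static int my_dev_poll(struct irq_poll *iop, int budget)
{
	struct my_dev *dev = container_of(iop, struct my_dev, iop);
	int done = my_dev_reap_completions(dev, budget);  /* hypothetical */

	if (done < budget) {
		/* Queue drained: stop polling and unmask the interrupt. */
		irq_poll_complete(iop);
		enable_irq(dev->irq);
	}
	return done;
}

static irqreturn_t my_dev_irq(int irq, void *data)
{
	struct my_dev *dev = data;

	/* Mask the line and defer the reaping to the poll handler. */
	disable_irq_nosync(irq);
	irq_poll_sched(&dev->iop);
	return IRQ_HANDLED;
}

/* At setup: irq_poll_init(&dev->iop, MY_DEV_POLL_BUDGET, my_dev_poll); */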
How about something like this instead:
--
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 71127a366d3c..84bf16d75109 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -24,6 +24,7 @@
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/sed-opal.h>
 #include <linux/pci-p2pdma.h>
+#include <linux/irq_poll.h>
 
 #include "trace.h"
 #include "nvme.h"
@@ -32,6 +33,7 @@
 #define CQ_SIZE(q)	((q)->q_depth * sizeof(struct nvme_completion))
 
 #define SGES_PER_PAGE	(PAGE_SIZE / sizeof(struct nvme_sgl_desc))
+#define NVME_POLL_BUDGET_IRQ	256
 
 /*
  * These can be higher, but we need to ensure that any command doesn't
@@ -189,6 +191,7 @@ struct nvme_queue {
 	u32 *dbbuf_cq_db;
 	u32 *dbbuf_sq_ei;
 	u32 *dbbuf_cq_ei;
+	struct irq_poll iop;
 	struct completion delete_done;
 };
 
@@ -1015,6 +1018,23 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq, u16 *start,
 	return found;
 }
 
+static int nvme_irqpoll_handler(struct irq_poll *iop, int budget)
+{
+	struct nvme_queue *nvmeq = container_of(iop, struct nvme_queue, iop);
+	struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
+	u16 start, end;
+	int completed;
+
+	completed = nvme_process_cq(nvmeq, &start, &end, budget);
+	nvme_complete_cqes(nvmeq, start, end);
+	if (completed < budget) {
+		irq_poll_complete(&nvmeq->iop);
+		enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
+	}
+
+	return completed;
+}
+
 static irqreturn_t nvme_irq(int irq, void *data)
 {
 	struct nvme_queue *nvmeq = data;
@@ -1028,12 +1048,16 @@ static irqreturn_t nvme_irq(int irq, void *data)
 	rmb();
 	if (nvmeq->cq_head != nvmeq->last_cq_head)
 		ret = IRQ_HANDLED;
-	nvme_process_cq(nvmeq, &start, &end, -1);
+	nvme_process_cq(nvmeq, &start, &end, NVME_POLL_BUDGET_IRQ);
 	nvmeq->last_cq_head = nvmeq->cq_head;
 	wmb();
 
 	if (start != end) {
 		nvme_complete_cqes(nvmeq, start, end);
+		if (nvme_cqe_pending(nvmeq)) {
+			disable_irq_nosync(irq);
+			irq_poll_sched(&nvmeq->iop);
+		}
 		return IRQ_HANDLED;
 	}
 
@@ -1347,6 +1371,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 
 static void nvme_free_queue(struct nvme_queue *nvmeq)
 {
+	irq_poll_disable(&nvmeq->iop);
 	dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq),
 				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
 	if (!nvmeq->sq_cmds)
@@ -1481,6 +1506,7 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
 	nvmeq->dev = dev;
 	spin_lock_init(&nvmeq->sq_lock);
 	spin_lock_init(&nvmeq->cq_poll_lock);
+	irq_poll_init(&nvmeq->iop, NVME_POLL_BUDGET_IRQ, nvme_irqpoll_handler);
 	nvmeq->cq_head = 0;
 	nvmeq->cq_phase = 1;
 	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
--
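To spell out the flow: nvme_irq() still reaps up to NVME_POLL_BUDGET_IRQ
(256) completions directly in hard-irq context, so the common,
non-flooded case pays nothing extra. Only when CQEs are still pending
after that budget does it mask the vector and schedule irq_poll, which
keeps reaping in budgeted batches from softirq context and re-enables
the interrupt once the queue is drained. That bounds the time any single
interrupt can hog a CPU without touching the fast path.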