[pve-devel] [PATCH kernel 2/4] revert buggy NVME setup commit

Fabian Grünbichler f.gruenbichler at proxmox.com
Fri Jan 20 09:50:56 CET 2017


See https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1626894 for the corresponding bug report.

Signed-off-by: Fabian Grünbichler <f.gruenbichler at proxmox.com>
---
 Makefile                                    |   1 +
 nvme-revert-NVMe-only-setup-MSIX-once.patch | 128 ++++++++++++++++++++++++++++
 2 files changed, 129 insertions(+)
 create mode 100644 nvme-revert-NVMe-only-setup-MSIX-once.patch

diff --git a/Makefile b/Makefile
index dfb2060..bbf92e3 100644
--- a/Makefile
+++ b/Makefile
@@ -268,6 +268,7 @@ ${KERNEL_SRC}/README ${KERNEL_CFG_ORG}: ${KERNELSRCTAR}
 	cd ${KERNEL_SRC}; patch -p1 < ../cgroup-cpuset-add-cpuset.remap_cpus.patch
 	cd ${KERNEL_SRC}; patch -p1 < ../0001-Revert-mm-throttle-on-IO-only-when-there-are-too-man.patch
 	cd ${KERNEL_SRC}; patch -p1 < ../0002-Revert-mm-oom-rework-oom-detection.patch
+	cd ${KERNEL_SRC}; patch -p1 < ../nvme-revert-NVMe-only-setup-MSIX-once.patch
 	sed -i ${KERNEL_SRC}/Makefile -e 's/^EXTRAVERSION.*$$/EXTRAVERSION=${EXTRAVERSION}/'
 	touch $@
 
diff --git a/nvme-revert-NVMe-only-setup-MSIX-once.patch b/nvme-revert-NVMe-only-setup-MSIX-once.patch
new file mode 100644
index 0000000..b46221e
--- /dev/null
+++ b/nvme-revert-NVMe-only-setup-MSIX-once.patch
@@ -0,0 +1,128 @@
+From af220b3adff164d1b8b89d7d5c8bb741d6195012 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Fabian=20Gr=C3=BCnbichler?= <f.gruenbichler at proxmox.com>
+Date: Thu, 19 Jan 2017 15:19:46 +0100
+Subject: [PATCH] Revert "UBUNTU: SAUCE: (no-up) NVMe: only setup MSIX once"
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+This reverts commit 96fce9e4025b96b08bfe5196d3380ab9215cb64b.
+
+Signed-off-by: Fabian Grünbichler <f.gruenbichler at proxmox.com>
+---
+ drivers/nvme/host/pci.c | 73 ++++++++++++++++++++++++++++++++++---------------
+ 1 file changed, 51 insertions(+), 22 deletions(-)
+
+diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
+index ae1f695..b9cf5aa 100644
+--- a/drivers/nvme/host/pci.c
++++ b/drivers/nvme/host/pci.c
+@@ -1613,7 +1613,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
+ 	struct pci_dev *pdev = to_pci_dev(dev->dev);
+ 	int result, i, vecs, nr_io_queues, size;
+ 
+-	nr_io_queues = dev->max_qid + 1;
++	nr_io_queues = num_possible_cpus();
+ 	result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
+ 	if (result < 0)
+ 		return result;
+@@ -1653,7 +1653,45 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
+ 		adminq->q_db = dev->dbs;
+ 	}
+ 
+-	dev->max_qid = nr_io_queues - 1;
++	/* Deregister the admin queue's interrupt */
++	free_irq(dev->entry[0].vector, adminq);
++
++	/*
++	 * If we enable msix early due to not intx, disable it again before
++	 * setting up the full range we need.
++	 */
++	if (pdev->msi_enabled)
++		pci_disable_msi(pdev);
++	else if (pdev->msix_enabled)
++		pci_disable_msix(pdev);
++
++	for (i = 0; i < nr_io_queues; i++)
++		dev->entry[i].entry = i;
++	vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues);
++	if (vecs < 0) {
++		vecs = pci_enable_msi_range(pdev, 1, min(nr_io_queues, 32));
++		if (vecs < 0) {
++			vecs = 1;
++		} else {
++			for (i = 0; i < vecs; i++)
++				dev->entry[i].vector = i + pdev->irq;
++		}
++	}
++
++	/*
++	 * Should investigate if there's a performance win from allocating
++	 * more queues than interrupt vectors; it might allow the submission
++	 * path to scale better, even if the receive path is limited by the
++	 * number of interrupts.
++	 */
++	nr_io_queues = vecs;
++	dev->max_qid = nr_io_queues;
++
++	result = queue_request_irq(dev, adminq, adminq->irqname);
++	if (result) {
++		adminq->cq_vector = -1;
++		goto free_queues;
++	}
+ 
+ 	/* Free previously allocated queues that are no longer usable */
+ 	nvme_free_queues(dev, nr_io_queues + 1);
+@@ -1806,7 +1844,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
+ static int nvme_pci_enable(struct nvme_dev *dev)
+ {
+ 	u64 cap;
+-	int result = -ENOMEM, nr_io_queues, i, vecs;
++	int result = -ENOMEM;
+ 	struct pci_dev *pdev = to_pci_dev(dev->dev);
+ 
+ 	if (pci_enable_device_mem(pdev))
+@@ -1823,30 +1861,21 @@ static int nvme_pci_enable(struct nvme_dev *dev)
+ 		goto disable;
+ 	}
+ 
+-	nr_io_queues = num_possible_cpus();
+-
+-	for (i = 0; i < nr_io_queues; i++)
+-		dev->entry[i].entry = i;
+-	vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues);
+-	if (vecs < 0) {
+-		vecs = pci_enable_msi_range(pdev, 1, min(nr_io_queues, 32));
+-		if (vecs < 0) {
+-			result = vecs;
+-			goto disable;
+-		} else {
+-			for (i = 0; i < vecs; i++)
+-				dev->entry[i].vector = i + pdev->irq;
+-		}
++	/*
++	 * Some devices and/or platforms don't advertise or work with INTx
++	 * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll
++	 * adjust this later.
++	 */
++	if (pci_enable_msix(pdev, dev->entry, 1)) {
++		pci_enable_msi(pdev);
++		dev->entry[0].vector = pdev->irq;
+ 	}
+ 
+-	if (vecs < 2) {
+-		dev_err(dev->ctrl.device, "Failed to get enough MSI/MSIX interrupts\n");
+-		result = -ENOSPC;
++	if (!dev->entry[0].vector) {
++		result = -ENODEV;
+ 		goto disable;
+ 	}
+ 
+-	dev->max_qid = vecs - 1;
+-
+ 	cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
+ 
+ 	dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
+-- 
+2.1.4
+
-- 
2.1.4




More information about the pve-devel mailing list