RDMA/core: Introduce shared CQ pool API

Allow a ULP to ask the core to provide a completion queue based on a least-used search on a per-device CQ pools. The device CQ pools grow in a lazy fashion when more CQs are requested. This feature reduces the amount of interrupts when using many QPs. Using shared CQs allows for more effcient completion handling. It also reduces the amount of overhead needed for CQ contexts. Test setup: Intel(R) Xeon(R) Platinum 8176M CPU @ 2.10GHz servers. Running NVMeoF 4KB read IOs over ConnectX-5EX across Spectrum switch. TX-depth = 32. The patch was applied in the nvme driver on both the target and initiator. Four controllers are accessed from each core. In the current test case we have exposed sixteen NVMe namespaces using four different subsystems (four namespaces per subsystem) from one NVM port. Each controller allocated X queues (RDMA QPs) and attached to Y CQs. Before this series we had X == Y, i.e for four controllers we've created total of 4X QPs and 4X CQs. In the shared case, we've created 4X QPs and only X CQs which means that we have four controllers that share a completion queue per core. Until fourteen cores there is no significant change in performance and the number of interrupts per second is less than a million in the current case. ================================================== |Cores|Current KIOPs |Shared KIOPs |improvement| |-----|---------------|--------------|-----------| |14 |2332 |2723 |16.7% | |-----|---------------|--------------|-----------| |20 |2086 |2712 |30% | |-----|---------------|--------------|-----------| |28 |1971 |2669 |35.4% | |================================================= |Cores|Current avg lat|Shared avg lat|improvement| |-----|---------------|--------------|-----------| |14 |767us |657us |14.3% | |-----|---------------|--------------|-----------| |20 |1225us |943us |23% | |-----|---------------|--------------|-----------| |28 |1816us |1341us |26.1% | ======================================================== |Cores|Current interrupts|Shared interrupts|improvement| |-----|------------------|-----------------|-----------| |14 |1.6M/sec |0.4M/sec |72% | |-----|------------------|-----------------|-----------| |20 |2.8M/sec |0.6M/sec |72.4% | |-----|------------------|-----------------|-----------| |28 |2.9M/sec |0.8M/sec |63.4% | ==================================================================== |Cores|Current 99.99th PCTL lat|Shared 99.99th PCTL lat|improvement| |-----|------------------------|-----------------------|-----------| |14 |67ms |6ms |90.9% | |-----|------------------------|-----------------------|-----------| |20 |5ms |6ms |-10% | |-----|------------------------|-----------------------|-----------| |28 |8.7ms |6ms |25.9% | |=================================================================== Performance improvement with sixteen disks (sixteen CQs per core) is comparable. Link: https://lore.kernel.org/r/1590568495-101621-3-git-send-email-yaminf@mellanox.com Signed-off-by: Yamin Friedman <yaminf@mellanox.com> Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com> Reviewed-by: Max Gurtovoy <maxg@mellanox.com> Reviewed-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
author: Yamin Friedman <yaminf@mellanox.com> 2020-05-27 11:34:53 +0300
committer: Jason Gunthorpe <jgg@mellanox.com> 2020-05-29 16:09:02 -0300
commit: c7ff819aefea04944dfcec5f0731b97277df6a9c (patch)
tree: 9ea88cf0c441c822769beee3d858692925ab41d6 /include/rdma
parent: 3446cbd2d523fdaf37f3772082071d1154c419d9 (diff)
1 files changed, 16 insertions, 1 deletions
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index cc515025cbdb..19864da78649 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1588,10 +1588,12 @@ struct ib_ah {
 typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context);
 
 enum ib_poll_context {
-	IB_POLL_DIRECT,		   /* caller context, no hw completions */
 	IB_POLL_SOFTIRQ,	   /* poll from softirq context */
 	IB_POLL_WORKQUEUE,	   /* poll from workqueue */
 	IB_POLL_UNBOUND_WORKQUEUE, /* poll from unbound workqueue */
+	IB_POLL_LAST_POOL_TYPE = IB_POLL_UNBOUND_WORKQUEUE,
+
+	IB_POLL_DIRECT,		   /* caller context, no hw completions */
 };
 
 struct ib_cq {
@@ -1601,9 +1603,11 @@ struct ib_cq {
 	void                  (*event_handler)(struct ib_event *, void *);
 	void                   *cq_context;
 	int               	cqe;
+	unsigned int		cqe_used;
 	atomic_t          	usecnt; /* count number of work queues */
 	enum ib_poll_context	poll_ctx;
 	struct ib_wc		*wc;
+	struct list_head        pool_entry;
 	union {
 		struct irq_poll		iop;
 		struct work_struct	work;
@@ -1615,6 +1619,7 @@ struct ib_cq {
 	ktime_t timestamp;
 	u8 interrupt:1;
 	u8 shared:1;
+	unsigned int comp_vector;
 
 	/*
 	 * Implementation details of the RDMA core, don't use in drivers:
@@ -2734,6 +2739,10 @@ struct ib_device {
 #endif
 
 	u32                          index;
+
+	spinlock_t                   cq_pools_lock;
+	struct list_head             cq_pools[IB_POLL_LAST_POOL_TYPE + 1];
+
 	struct rdma_restrack_root *res;
 
 	const struct uapi_definition   *driver_def;
@@ -4037,6 +4046,12 @@ static inline int ib_req_notify_cq(struct ib_cq *cq,
 	return cq->device->ops.req_notify_cq(cq, flags);
 }
 
+struct ib_cq *ib_cq_pool_get(struct ib_device *dev, unsigned int nr_cqe,
+			     int comp_vector_hint,
+			     enum ib_poll_context poll_ctx);
+
+void ib_cq_pool_put(struct ib_cq *cq, unsigned int nr_cqe);
+
 /**
  * ib_req_ncomp_notif - Request completion notification when there are
  *   at least the specified number of unreaped completions on the CQ.
author	Yamin Friedman <yaminf@mellanox.com>	2020-05-27 11:34:53 +0300
committer	Jason Gunthorpe <jgg@mellanox.com>	2020-05-29 16:09:02 -0300
commit	c7ff819aefea04944dfcec5f0731b97277df6a9c (patch)
tree	9ea88cf0c441c822769beee3d858692925ab41d6 /include/rdma
parent	3446cbd2d523fdaf37f3772082071d1154c419d9 (diff)