author		Linus Torvalds <torvalds@linux-foundation.org>	2016-05-20 14:35:07 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-05-20 14:35:07 -0700
commit		76b584d3125a1f7d8b64e9c522a4555bc2844bde (patch)
tree		c75dc6b134eeae650372df7c6179f1e43a95953b
parent		7992893c5a9fdffa42117f6f749359466e06bdf6 (diff)
parent		c16d2750a08c8ccaf98d65f287a8aec91bb9610d (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma
Pull rdma updates from Doug Ledford:
"Primary 4.7 merge window changes
- Updates to the new Intel X722 iWARP driver
- Updates to the hfi1 driver
- Fixes for the iw_cxgb4 driver
- Misc core fixes
- Generic RDMA READ/WRITE API addition
- SRP updates
- Misc ipoib updates
- Minor mlx5 updates"
* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma: (148 commits)
IB/mlx5: Fire the CQ completion handler from tasklet
net/mlx5_core: Use tasklet for user-space CQ completion events
IB/core: Do not require CAP_NET_ADMIN for packet sniffing
IB/mlx4: Fix unaligned access in send_reply_to_slave
IB/mlx5: Report Scatter FCS device capability when supported
IB/mlx5: Add Scatter FCS support for Raw Packet QP
IB/core: Add Scatter FCS create flag
IB/core: Add Raw Scatter FCS device capability
IB/core: Add extended device capability flags
i40iw: pass hw_stats by reference rather than by value
i40iw: Remove unnecessary synchronize_irq() before free_irq()
i40iw: constify i40iw_vf_cqp_ops structure
IB/mlx5: Add UARs write-combining and non-cached mapping
IB/mlx5: Allow mapping the free running counter on PROT_EXEC
IB/mlx4: Use list_for_each_entry_safe
IB/SA: Use correct free function
IB/core: Fix a potential array overrun in CMA and SA agent
IB/core: Remove unnecessary check in ibnl_rcv_msg
IB/IWPM: Fix a potential skb leak
RDMA/nes: replace custom print_hex_dump()
...
111 files changed, 4116 insertions, 2605 deletions
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index f818538a7f4e..26987d9d7e1c 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -8,9 +8,9 @@ obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o
 obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \
 				$(user_access-y)
 
-ib_core-y :=			packer.o ud_header.o verbs.o cq.o sysfs.o \
+ib_core-y :=			packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
 				device.o fmr_pool.o cache.o netlink.o \
-				roce_gid_mgmt.o
+				roce_gid_mgmt.o mr_pool.o
 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
 ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 93ab0ae97208..f0c91ba3178a 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -800,6 +800,7 @@ int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd,
 	if (id->device != pd->device)
 		return -EINVAL;
 
+	qp_init_attr->port_num = id->port_num;
 	qp = ib_create_qp(pd, qp_init_attr);
 	if (IS_ERR(qp))
 		return PTR_ERR(qp);
@@ -4294,7 +4295,8 @@ static int __init cma_init(void)
 	if (ret)
 		goto err;
 
-	if (ibnl_add_client(RDMA_NL_RDMA_CM, RDMA_NL_RDMA_CM_NUM_OPS, cma_cb_table))
+	if (ibnl_add_client(RDMA_NL_RDMA_CM, ARRAY_SIZE(cma_cb_table),
+			    cma_cb_table))
 		pr_warn("RDMA CMA: failed to add netlink callback\n");
 	cma_configfs_init();
diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c
index e28a160cdab0..f0572049d291 100644
--- a/drivers/infiniband/core/iwcm.c
+++ b/drivers/infiniband/core/iwcm.c
@@ -459,7 +459,7 @@ static void iw_cm_check_wildcard(struct sockaddr_storage *pm_addr,
 	if (pm_addr->ss_family == AF_INET) {
 		struct sockaddr_in *pm4_addr = (struct sockaddr_in *)pm_addr;
 
-		if (pm4_addr->sin_addr.s_addr == INADDR_ANY) {
+		if (pm4_addr->sin_addr.s_addr == htonl(INADDR_ANY)) {
 			struct sockaddr_in *cm4_addr =
 				(struct sockaddr_in *)cm_addr;
 			struct sockaddr_in *cm4_outaddr =
@@ -1175,7 +1175,7 @@ static int __init iw_cm_init(void)
 	if (ret)
 		pr_err("iw_cm: couldn't init iwpm\n");
 
-	ret = ibnl_add_client(RDMA_NL_IWCM, RDMA_NL_IWPM_NUM_OPS,
+	ret = ibnl_add_client(RDMA_NL_IWCM, ARRAY_SIZE(iwcm_nl_cb_table),
 			      iwcm_nl_cb_table);
 	if (ret)
 		pr_err("iw_cm: couldn't register netlink callbacks\n");
diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c
index 9b2bf2fb2b00..b65e06c560d7 100644
--- a/drivers/infiniband/core/iwpm_util.c
+++ b/drivers/infiniband/core/iwpm_util.c
@@ -634,6 +634,7 @@ static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid)
 	if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client,
 			   RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) {
 		pr_warn("%s Unable to put NLMSG_DONE\n", __func__);
+		dev_kfree_skb(skb);
 		return -ENOMEM;
 	}
 	nlh->nlmsg_type = NLMSG_DONE;
diff --git a/drivers/infiniband/core/mr_pool.c b/drivers/infiniband/core/mr_pool.c
new file mode 100644
index 000000000000..49d478b2ea94
--- /dev/null
+++ b/drivers/infiniband/core/mr_pool.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2016 HGST, a Western Digital Company.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <rdma/ib_verbs.h>
+#include <rdma/mr_pool.h>
+
+struct ib_mr *ib_mr_pool_get(struct ib_qp *qp, struct list_head *list)
+{
+	struct ib_mr *mr;
+	unsigned long flags;
+
+	spin_lock_irqsave(&qp->mr_lock, flags);
+	mr = list_first_entry_or_null(list, struct ib_mr, qp_entry);
+	if (mr) {
+		list_del(&mr->qp_entry);
+		qp->mrs_used++;
+	}
+	spin_unlock_irqrestore(&qp->mr_lock, flags);
+
+	return mr;
+}
+EXPORT_SYMBOL(ib_mr_pool_get);
+
+void ib_mr_pool_put(struct ib_qp *qp, struct list_head *list, struct ib_mr *mr)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&qp->mr_lock, flags);
+	list_add(&mr->qp_entry, list);
+	qp->mrs_used--;
+	spin_unlock_irqrestore(&qp->mr_lock, flags);
+}
+EXPORT_SYMBOL(ib_mr_pool_put);
+
+int ib_mr_pool_init(struct ib_qp *qp, struct list_head *list, int nr,
+		enum ib_mr_type type, u32 max_num_sg)
+{
+	struct ib_mr *mr;
+	unsigned long flags;
+	int ret, i;
+
+	for (i = 0; i < nr; i++) {
+		mr = ib_alloc_mr(qp->pd, type, max_num_sg);
+		if (IS_ERR(mr)) {
+			ret = PTR_ERR(mr);
+			goto out;
+		}
+
+		spin_lock_irqsave(&qp->mr_lock, flags);
+		list_add_tail(&mr->qp_entry, list);
+		spin_unlock_irqrestore(&qp->mr_lock, flags);
+	}
+
+	return 0;
+out:
+	ib_mr_pool_destroy(qp, list);
+	return ret;
+}
+EXPORT_SYMBOL(ib_mr_pool_init);
+
+void ib_mr_pool_destroy(struct ib_qp *qp, struct list_head *list)
+{
+	struct ib_mr *mr;
+	unsigned long flags;
+
+	spin_lock_irqsave(&qp->mr_lock, flags);
+	while (!list_empty(list)) {
+		mr = list_first_entry(list, struct ib_mr, qp_entry);
+		list_del(&mr->qp_entry);
+
+		spin_unlock_irqrestore(&qp->mr_lock, flags);
+		ib_dereg_mr(mr);
+		spin_lock_irqsave(&qp->mr_lock, flags);
+	}
+	spin_unlock_irqrestore(&qp->mr_lock, flags);
+}
+EXPORT_SYMBOL(ib_mr_pool_destroy);
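As a reading aid between patches: a minimal sketch of how a consumer might drive the MR pool helpers added above. The example_ function names and the 32/128 sizing are illustrative assumptions, not code from this series:

	/* Hypothetical ULP usage (not from this series): stock a QP with
	 * 32 MEM_REG MRs of up to 128 pages each, then check one out and
	 * back in around an I/O.
	 */
	#include <rdma/mr_pool.h>

	static int example_setup(struct ib_qp *qp)
	{
		return ib_mr_pool_init(qp, &qp->rdma_mrs, 32,
				       IB_MR_TYPE_MEM_REG, 128);
	}

	static int example_io(struct ib_qp *qp)
	{
		struct ib_mr *mr = ib_mr_pool_get(qp, &qp->rdma_mrs);

		if (!mr)
			return -EAGAIN;	/* pool empty, caller backs off */
		/* ... post a registration and the actual work request ... */
		ib_mr_pool_put(qp, &qp->rdma_mrs, mr);
		return 0;
	}

Note the design choice visible in the helpers: get/put only take the per-QP spinlock, so checkout is cheap on the fast path, while allocation and deregistration happen only at pool init/destroy time.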
diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c
index d47df9356779..9b8c20c8209b 100644
--- a/drivers/infiniband/core/netlink.c
+++ b/drivers/infiniband/core/netlink.c
@@ -151,12 +151,11 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	struct ibnl_client *client;
 	int type = nlh->nlmsg_type;
 	int index = RDMA_NL_GET_CLIENT(type);
-	int op = RDMA_NL_GET_OP(type);
+	unsigned int op = RDMA_NL_GET_OP(type);
 
 	list_for_each_entry(client, &client_list, list) {
 		if (client->index == index) {
-			if (op < 0 || op >= client->nops ||
-			    !client->cb_table[op].dump)
+			if (op >= client->nops || !client->cb_table[op].dump)
 				return -EINVAL;
 
 			/*
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
new file mode 100644
index 000000000000..1eb9b1294a63
--- /dev/null
+++ b/drivers/infiniband/core/rw.c
@@ -0,0 +1,727 @@
+/*
+ * Copyright (c) 2016 HGST, a Western Digital Company.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+#include <rdma/mr_pool.h>
+#include <rdma/rw.h>
+
+enum {
+	RDMA_RW_SINGLE_WR,
+	RDMA_RW_MULTI_WR,
+	RDMA_RW_MR,
+	RDMA_RW_SIG_MR,
+};
+
+static bool rdma_rw_force_mr;
+module_param_named(force_mr, rdma_rw_force_mr, bool, 0);
+MODULE_PARM_DESC(force_mr, "Force usage of MRs for RDMA READ/WRITE operations");
+
+/*
+ * Check if the device might use memory registration.  This is currently only
+ * true for iWarp devices. In the future we can hopefully fine tune this based
+ * on HCA driver input.
+ */
+static inline bool rdma_rw_can_use_mr(struct ib_device *dev, u8 port_num)
+{
+	if (rdma_protocol_iwarp(dev, port_num))
+		return true;
+	if (unlikely(rdma_rw_force_mr))
+		return true;
+	return false;
+}
+
+/*
+ * Check if the device will use memory registration for this RW operation.
+ * We currently always use memory registrations for iWarp RDMA READs, and
+ * have a debug option to force usage of MRs.
+ *
+ * XXX: In the future we can hopefully fine tune this based on HCA driver
+ * input.
+ */
+static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u8 port_num,
+		enum dma_data_direction dir, int dma_nents)
+{
+	if (rdma_protocol_iwarp(dev, port_num) && dir == DMA_FROM_DEVICE)
+		return true;
+	if (unlikely(rdma_rw_force_mr))
+		return true;
+	return false;
+}
+
+static inline u32 rdma_rw_max_sge(struct ib_device *dev,
+		enum dma_data_direction dir)
+{
+	return dir == DMA_TO_DEVICE ?
+		dev->attrs.max_sge : dev->attrs.max_sge_rd;
+}
+
+static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev)
+{
+	/* arbitrary limit to avoid allocating gigantic resources */
+	return min_t(u32, dev->attrs.max_fast_reg_page_list_len, 256);
+}
+
+static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num,
+		struct rdma_rw_reg_ctx *reg, struct scatterlist *sg,
+		u32 sg_cnt, u32 offset)
+{
+	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
+	u32 nents = min(sg_cnt, pages_per_mr);
+	int count = 0, ret;
+
+	reg->mr = ib_mr_pool_get(qp, &qp->rdma_mrs);
+	if (!reg->mr)
+		return -EAGAIN;
+
+	if (reg->mr->need_inval) {
+		reg->inv_wr.opcode = IB_WR_LOCAL_INV;
+		reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey;
+		reg->inv_wr.next = &reg->reg_wr.wr;
+		count++;
+	} else {
+		reg->inv_wr.next = NULL;
+	}
+
+	ret = ib_map_mr_sg(reg->mr, sg, nents, &offset, PAGE_SIZE);
+	if (ret < nents) {
+		ib_mr_pool_put(qp, &qp->rdma_mrs, reg->mr);
+		return -EINVAL;
+	}
+
+	reg->reg_wr.wr.opcode = IB_WR_REG_MR;
+	reg->reg_wr.mr = reg->mr;
+	reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE;
+	if (rdma_protocol_iwarp(qp->device, port_num))
+		reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE;
+	count++;
+
+	reg->sge.addr = reg->mr->iova;
+	reg->sge.length = reg->mr->length;
+	return count;
+}
+
+static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+		u8 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset,
+		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
+{
+	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
+	int i, j, ret = 0, count = 0;
+
+	ctx->nr_ops = (sg_cnt + pages_per_mr - 1) / pages_per_mr;
+	ctx->reg = kcalloc(ctx->nr_ops, sizeof(*ctx->reg), GFP_KERNEL);
+	if (!ctx->reg) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < ctx->nr_ops; i++) {
+		struct rdma_rw_reg_ctx *prev = i ? &ctx->reg[i - 1] : NULL;
+		struct rdma_rw_reg_ctx *reg = &ctx->reg[i];
+		u32 nents = min(sg_cnt, pages_per_mr);
+
+		ret = rdma_rw_init_one_mr(qp, port_num, reg, sg, sg_cnt,
+				offset);
+		if (ret < 0)
+			goto out_free;
+		count += ret;
+
+		if (prev) {
+			if (reg->mr->need_inval)
+				prev->wr.wr.next = &reg->inv_wr;
+			else
+				prev->wr.wr.next = &reg->reg_wr.wr;
+		}
+
+		reg->reg_wr.wr.next = &reg->wr.wr;
+
+		reg->wr.wr.sg_list = &reg->sge;
+		reg->wr.wr.num_sge = 1;
+		reg->wr.remote_addr = remote_addr;
+		reg->wr.rkey = rkey;
+		if (dir == DMA_TO_DEVICE) {
+			reg->wr.wr.opcode = IB_WR_RDMA_WRITE;
+		} else if (!rdma_cap_read_inv(qp->device, port_num)) {
+			reg->wr.wr.opcode = IB_WR_RDMA_READ;
+		} else {
+			reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV;
+			reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey;
+		}
+		count++;
+
+		remote_addr += reg->sge.length;
+		sg_cnt -= nents;
+		for (j = 0; j < nents; j++)
+			sg = sg_next(sg);
+		offset = 0;
+	}
+
+	ctx->type = RDMA_RW_MR;
+	return count;
+
+out_free:
+	while (--i >= 0)
+		ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
+	kfree(ctx->reg);
+out:
+	return ret;
+}
+
+static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+		struct scatterlist *sg, u32 sg_cnt, u32 offset,
+		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
+{
+	struct ib_device *dev = qp->pd->device;
+	u32 max_sge = rdma_rw_max_sge(dev, dir);
+	struct ib_sge *sge;
+	u32 total_len = 0, i, j;
+
+	ctx->nr_ops = DIV_ROUND_UP(sg_cnt, max_sge);
+
+	ctx->map.sges = sge = kcalloc(sg_cnt, sizeof(*sge), GFP_KERNEL);
+	if (!ctx->map.sges)
+		goto out;
+
+	ctx->map.wrs = kcalloc(ctx->nr_ops, sizeof(*ctx->map.wrs), GFP_KERNEL);
+	if (!ctx->map.wrs)
+		goto out_free_sges;
+
+	for (i = 0; i < ctx->nr_ops; i++) {
+		struct ib_rdma_wr *rdma_wr = &ctx->map.wrs[i];
+		u32 nr_sge = min(sg_cnt, max_sge);
+
+		if (dir == DMA_TO_DEVICE)
+			rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
+		else
+			rdma_wr->wr.opcode = IB_WR_RDMA_READ;
+		rdma_wr->remote_addr = remote_addr + total_len;
+		rdma_wr->rkey = rkey;
+		rdma_wr->wr.sg_list = sge;
+
+		for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) {
+			rdma_wr->wr.num_sge++;
+
+			sge->addr = ib_sg_dma_address(dev, sg) + offset;
+			sge->length = ib_sg_dma_len(dev, sg) - offset;
+			sge->lkey = qp->pd->local_dma_lkey;
+
+			total_len += sge->length;
+			sge++;
+			sg_cnt--;
+			offset = 0;
+		}
+
+		if (i + 1 < ctx->nr_ops)
+			rdma_wr->wr.next = &ctx->map.wrs[i + 1].wr;
+	}
+
+	ctx->type = RDMA_RW_MULTI_WR;
+	return ctx->nr_ops;
+
+out_free_sges:
+	kfree(ctx->map.sges);
+out:
+	return -ENOMEM;
+}
+
+static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+		struct scatterlist *sg, u32 offset, u64 remote_addr, u32 rkey,
+		enum dma_data_direction dir)
+{
+	struct ib_device *dev = qp->pd->device;
+	struct ib_rdma_wr *rdma_wr = &ctx->single.wr;
+
+	ctx->nr_ops = 1;
+
+	ctx->single.sge.lkey = qp->pd->local_dma_lkey;
+	ctx->single.sge.addr = ib_sg_dma_address(dev, sg) + offset;
+	ctx->single.sge.length = ib_sg_dma_len(dev, sg) - offset;
+
+	memset(rdma_wr, 0, sizeof(*rdma_wr));
+	if (dir == DMA_TO_DEVICE)
+		rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
+	else
+		rdma_wr->wr.opcode = IB_WR_RDMA_READ;
+	rdma_wr->wr.sg_list = &ctx->single.sge;
+	rdma_wr->wr.num_sge = 1;
+	rdma_wr->remote_addr = remote_addr;
+	rdma_wr->rkey = rkey;
+
+	ctx->type = RDMA_RW_SINGLE_WR;
+	return 1;
+}
+
+/**
+ * rdma_rw_ctx_init - initialize a RDMA READ/WRITE context
+ * @ctx:	context to initialize
+ * @qp:		queue pair to operate on
+ * @port_num:	port num to which the connection is bound
+ * @sg:		scatterlist to READ/WRITE from/to
+ * @sg_cnt:	number of entries in @sg
+ * @sg_offset:	current byte offset into @sg
+ * @remote_addr:remote address to read/write (relative to @rkey)
+ * @rkey:	remote key to operate on
+ * @dir:	%DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
+ *
+ * Returns the number of WQEs that will be needed on the workqueue if
+ * successful, or a negative error code.
+ */
+int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
+		struct scatterlist *sg, u32 sg_cnt, u32 sg_offset,
+		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
+{
+	struct ib_device *dev = qp->pd->device;
+	int ret;
+
+	ret = ib_dma_map_sg(dev, sg, sg_cnt, dir);
+	if (!ret)
+		return -ENOMEM;
+	sg_cnt = ret;
+
+	/*
+	 * Skip to the S/G entry that sg_offset falls into:
+	 */
+	for (;;) {
+		u32 len = ib_sg_dma_len(dev, sg);
+
+		if (sg_offset < len)
+			break;
+
+		sg = sg_next(sg);
+		sg_offset -= len;
+		sg_cnt--;
+	}
+
+	ret = -EIO;
+	if (WARN_ON_ONCE(sg_cnt == 0))
+		goto out_unmap_sg;
+
+	if (rdma_rw_io_needs_mr(qp->device, port_num, dir, sg_cnt)) {
+		ret = rdma_rw_init_mr_wrs(ctx, qp, port_num, sg, sg_cnt,
+				sg_offset, remote_addr, rkey, dir);
+	} else if (sg_cnt > 1) {
+		ret = rdma_rw_init_map_wrs(ctx, qp, sg, sg_cnt, sg_offset,
+				remote_addr, rkey, dir);
+	} else {
+		ret = rdma_rw_init_single_wr(ctx, qp, sg, sg_offset,
+				remote_addr, rkey, dir);
+	}
+
+	if (ret < 0)
+		goto out_unmap_sg;
+	return ret;
+
+out_unmap_sg:
+	ib_dma_unmap_sg(dev, sg, sg_cnt, dir);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_rw_ctx_init);
+
+/**
+ * rdma_rw_ctx_signature_init - initialize a RW context with signature offload
+ * @ctx:	context to initialize
+ * @qp:		queue pair to operate on
+ * @port_num:	port num to which the connection is bound
+ * @sg:		scatterlist to READ/WRITE from/to
+ * @sg_cnt:	number of entries in @sg
+ * @prot_sg:	scatterlist to READ/WRITE protection information from/to
+ * @prot_sg_cnt: number of entries in @prot_sg
+ * @sig_attrs:	signature offloading algorithms
+ * @remote_addr:remote address to read/write (relative to @rkey)
+ * @rkey:	remote key to operate on
+ * @dir:	%DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
+ *
+ * Returns the number of WQEs that will be needed on the workqueue if
+ * successful, or a negative error code.
+ */
+int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+		u8 port_num, struct scatterlist *sg, u32 sg_cnt,
+		struct scatterlist *prot_sg, u32 prot_sg_cnt,
+		struct ib_sig_attrs *sig_attrs,
+		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
+{
+	struct ib_device *dev = qp->pd->device;
+	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
+	struct ib_rdma_wr *rdma_wr;
+	struct ib_send_wr *prev_wr = NULL;
+	int count = 0, ret;
+
+	if (sg_cnt > pages_per_mr || prot_sg_cnt > pages_per_mr) {
+		pr_err("SG count too large\n");
+		return -EINVAL;
+	}
+
+	ret = ib_dma_map_sg(dev, sg, sg_cnt, dir);
+	if (!ret)
+		return -ENOMEM;
+	sg_cnt = ret;
+
+	ret = ib_dma_map_sg(dev, prot_sg, prot_sg_cnt, dir);
+	if (!ret) {
+		ret = -ENOMEM;
+		goto out_unmap_sg;
+	}
+	prot_sg_cnt = ret;
+
+	ctx->type = RDMA_RW_SIG_MR;
+	ctx->nr_ops = 1;
+	ctx->sig = kcalloc(1, sizeof(*ctx->sig), GFP_KERNEL);
+	if (!ctx->sig) {
+		ret = -ENOMEM;
+		goto out_unmap_prot_sg;
+	}
+
+	ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->data, sg, sg_cnt, 0);
+	if (ret < 0)
+		goto out_free_ctx;
+	count += ret;
+	prev_wr = &ctx->sig->data.reg_wr.wr;
+
+	if (prot_sg_cnt) {
+		ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->prot,
+				prot_sg, prot_sg_cnt, 0);
+		if (ret < 0)
+			goto out_destroy_data_mr;
+		count += ret;
+
+		if (ctx->sig->prot.inv_wr.next)
+			prev_wr->next = &ctx->sig->prot.inv_wr;
+		else
+			prev_wr->next = &ctx->sig->prot.reg_wr.wr;
+		prev_wr = &ctx->sig->prot.reg_wr.wr;
+	} else {
+		ctx->sig->prot.mr = NULL;
+	}
+
+	ctx->sig->sig_mr = ib_mr_pool_get(qp, &qp->sig_mrs);
+	if (!ctx->sig->sig_mr) {
+		ret = -EAGAIN;
+		goto out_destroy_prot_mr;
+	}
+
+	if (ctx->sig->sig_mr->need_inval) {
+		memset(&ctx->sig->sig_inv_wr, 0, sizeof(ctx->sig->sig_inv_wr));
+
+		ctx->sig->sig_inv_wr.opcode = IB_WR_LOCAL_INV;
+		ctx->sig->sig_inv_wr.ex.invalidate_rkey = ctx->sig->sig_mr->rkey;
+
+		prev_wr->next = &ctx->sig->sig_inv_wr;
+		prev_wr = &ctx->sig->sig_inv_wr;
+	}
+
+	ctx->sig->sig_wr.wr.opcode = IB_WR_REG_SIG_MR;
+	ctx->sig->sig_wr.wr.wr_cqe = NULL;
+	ctx->sig->sig_wr.wr.sg_list = &ctx->sig->data.sge;
+	ctx->sig->sig_wr.wr.num_sge = 1;
+	ctx->sig->sig_wr.access_flags = IB_ACCESS_LOCAL_WRITE;
+	ctx->sig->sig_wr.sig_attrs = sig_attrs;
+	ctx->sig->sig_wr.sig_mr = ctx->sig->sig_mr;
+	if (prot_sg_cnt)
+		ctx->sig->sig_wr.prot = &ctx->sig->prot.sge;
+	prev_wr->next = &ctx->sig->sig_wr.wr;
+	prev_wr = &ctx->sig->sig_wr.wr;
+	count++;
+
+	ctx->sig->sig_sge.addr = 0;
+	ctx->sig->sig_sge.length = ctx->sig->data.sge.length;
+	if (sig_attrs->wire.sig_type != IB_SIG_TYPE_NONE)
+		ctx->sig->sig_sge.length += ctx->sig->prot.sge.length;
+
+	rdma_wr = &ctx->sig->data.wr;
+	rdma_wr->wr.sg_list = &ctx->sig->sig_sge;
+	rdma_wr->wr.num_sge = 1;
+	rdma_wr->remote_addr = remote_addr;
+	rdma_wr->rkey = rkey;
+	if (dir == DMA_TO_DEVICE)
+		rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
+	else
+		rdma_wr->wr.opcode = IB_WR_RDMA_READ;
+	prev_wr->next = &rdma_wr->wr;
+	prev_wr = &rdma_wr->wr;
+	count++;
+
+	return count;
+
+out_destroy_prot_mr:
+	if (prot_sg_cnt)
+		ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr);
+out_destroy_data_mr:
+	ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr);
+out_free_ctx:
+	kfree(ctx->sig);
+out_unmap_prot_sg:
+	ib_dma_unmap_sg(dev, prot_sg, prot_sg_cnt, dir);
+out_unmap_sg:
+	ib_dma_unmap_sg(dev, sg, sg_cnt, dir);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_rw_ctx_signature_init);
+
+/*
+ * Now that we are going to post the WRs we can update the lkey and need_inval
+ * state on the MRs.  If we were doing this at init time, we would get double
+ * or missing invalidations if a context was initialized but not actually
+ * posted.
+ */
+static void rdma_rw_update_lkey(struct rdma_rw_reg_ctx *reg, bool need_inval)
+{
+	reg->mr->need_inval = need_inval;
+	ib_update_fast_reg_key(reg->mr, ib_inc_rkey(reg->mr->lkey));
+	reg->reg_wr.key = reg->mr->lkey;
+	reg->sge.lkey = reg->mr->lkey;
+}
+
+/**
+ * rdma_rw_ctx_wrs - return chain of WRs for a RDMA READ or WRITE operation
+ * @ctx:	context to operate on
+ * @qp:		queue pair to operate on
+ * @port_num:	port num to which the connection is bound
+ * @cqe:	completion queue entry for the last WR
+ * @chain_wr:	WR to append to the posted chain
+ *
+ * Return the WR chain for the set of RDMA READ/WRITE operations described by
+ * @ctx, as well as any memory registration operations needed.  If @chain_wr
+ * is non-NULL the WR it points to will be appended to the chain of WRs posted.
+ * If @chain_wr is not set @cqe must be set so that the caller gets a
+ * completion notification.
+ */
+struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+		u8 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr)
+{
+	struct ib_send_wr *first_wr, *last_wr;
+	int i;
+
+	switch (ctx->type) {
+	case RDMA_RW_SIG_MR:
+		rdma_rw_update_lkey(&ctx->sig->data, true);
+		if (ctx->sig->prot.mr)
+			rdma_rw_update_lkey(&ctx->sig->prot, true);
+
+		ctx->sig->sig_mr->need_inval = true;
+		ib_update_fast_reg_key(ctx->sig->sig_mr,
+			ib_inc_rkey(ctx->sig->sig_mr->lkey));
+		ctx->sig->sig_sge.lkey = ctx->sig->sig_mr->lkey;
+
+		if (ctx->sig->data.inv_wr.next)
+			first_wr = &ctx->sig->data.inv_wr;
+		else
+			first_wr = &ctx->sig->data.reg_wr.wr;
+		last_wr = &ctx->sig->data.wr.wr;
+		break;
+	case RDMA_RW_MR:
+		for (i = 0; i < ctx->nr_ops; i++) {
+			rdma_rw_update_lkey(&ctx->reg[i],
+				ctx->reg[i].wr.wr.opcode !=
+					IB_WR_RDMA_READ_WITH_INV);
+		}
+
+		if (ctx->reg[0].inv_wr.next)
+			first_wr = &ctx->reg[0].inv_wr;
+		else
+			first_wr = &ctx->reg[0].reg_wr.wr;
+		last_wr = &ctx->reg[ctx->nr_ops - 1].wr.wr;
+		break;
+	case RDMA_RW_MULTI_WR:
+		first_wr = &ctx->map.wrs[0].wr;
+		last_wr = &ctx->map.wrs[ctx->nr_ops - 1].wr;
+		break;
+	case RDMA_RW_SINGLE_WR:
+		first_wr = &ctx->single.wr.wr;
+		last_wr = &ctx->single.wr.wr;
+		break;
+	default:
+		BUG();
+	}
+
+	if (chain_wr) {
+		last_wr->next = chain_wr;
+	} else {
+		last_wr->wr_cqe = cqe;
+		last_wr->send_flags |= IB_SEND_SIGNALED;
+	}
+
+	return first_wr;
+}
+EXPORT_SYMBOL(rdma_rw_ctx_wrs);
+
+/**
+ * rdma_rw_ctx_post - post a RDMA READ or RDMA WRITE operation
+ * @ctx:	context to operate on
+ * @qp:		queue pair to operate on
+ * @port_num:	port num to which the connection is bound
+ * @cqe:	completion queue entry for the last WR
+ * @chain_wr:	WR to append to the posted chain
+ *
+ * Post the set of RDMA READ/WRITE operations described by @ctx, as well as
+ * any memory registration operations needed.  If @chain_wr is non-NULL the
+ * WR it points to will be appended to the chain of WRs posted.  If @chain_wr
+ * is not set @cqe must be set so that the caller gets a completion
+ * notification.
+ */
+int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
+		struct ib_cqe *cqe, struct ib_send_wr *chain_wr)
+{
+	struct ib_send_wr *first_wr, *bad_wr;
+
+	first_wr = rdma_rw_ctx_wrs(ctx, qp, port_num, cqe, chain_wr);
+	return ib_post_send(qp, first_wr, &bad_wr);
+}
+EXPORT_SYMBOL(rdma_rw_ctx_post);
+
+/**
+ * rdma_rw_ctx_destroy - release all resources allocated by rdma_rw_ctx_init
+ * @ctx:	context to release
+ * @qp:		queue pair to operate on
+ * @port_num:	port num to which the connection is bound
+ * @sg:		scatterlist that was used for the READ/WRITE
+ * @sg_cnt:	number of entries in @sg
+ * @dir:	%DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
+ */
+void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
+		struct scatterlist *sg, u32 sg_cnt, enum dma_data_direction dir)
+{
+	int i;
+
+	switch (ctx->type) {
+	case RDMA_RW_MR:
+		for (i = 0; i < ctx->nr_ops; i++)
+			ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
+		kfree(ctx->reg);
+		break;
+	case RDMA_RW_MULTI_WR:
+		kfree(ctx->map.wrs);
+		kfree(ctx->map.sges);
+		break;
+	case RDMA_RW_SINGLE_WR:
+		break;
+	default:
+		BUG();
+		break;
+	}
+
+	ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
+}
+EXPORT_SYMBOL(rdma_rw_ctx_destroy);
+
+/**
+ * rdma_rw_ctx_destroy_signature - release all resources allocated by
+ *	rdma_rw_ctx_init_signature
+ * @ctx:	context to release
+ * @qp:		queue pair to operate on
+ * @port_num:	port num to which the connection is bound
+ * @sg:		scatterlist that was used for the READ/WRITE
+ * @sg_cnt:	number of entries in @sg
+ * @prot_sg:	scatterlist that was used for the READ/WRITE of the PI
+ * @prot_sg_cnt: number of entries in @prot_sg
+ * @dir:	%DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
+ */
+void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+		u8 port_num, struct scatterlist *sg, u32 sg_cnt,
+		struct scatterlist *prot_sg, u32 prot_sg_cnt,
+		enum dma_data_direction dir)
+{
+	if (WARN_ON_ONCE(ctx->type != RDMA_RW_SIG_MR))
+		return;
+
+	ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr);
+	ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
+
+	if (ctx->sig->prot.mr) {
+		ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr);
+		ib_dma_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir);
+	}
+
+	ib_mr_pool_put(qp, &qp->sig_mrs, ctx->sig->sig_mr);
+	kfree(ctx->sig);
+}
+EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature);
+
+void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr)
+{
+	u32 factor;
+
+	WARN_ON_ONCE(attr->port_num == 0);
+
+	/*
+	 * Each context needs at least one RDMA READ or WRITE WR.
+	 *
+	 * For some hardware we might need more, eventually we should ask the
+	 * HCA driver for a multiplier here.
+	 */
+	factor = 1;
+
+	/*
+	 * If the devices needs MRs to perform RDMA READ or WRITE operations,
+	 * we'll need two additional MRs for the registrations and the
+	 * invalidation.
+	 */
+	if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN)
+		factor += 6;	/* (inv + reg) * (data + prot + sig) */
+	else if (rdma_rw_can_use_mr(dev, attr->port_num))
+		factor += 2;	/* inv + reg */
+
+	attr->cap.max_send_wr += factor * attr->cap.max_rdma_ctxs;
+
+	/*
+	 * But maybe we were just too high in the sky and the device doesn't
+	 * even support all we need, and we'll have to live with what we get..
+	 */
+	attr->cap.max_send_wr =
+		min_t(u32, attr->cap.max_send_wr, dev->attrs.max_qp_wr);
+}
+
+int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr)
+{
+	struct ib_device *dev = qp->pd->device;
+	u32 nr_mrs = 0, nr_sig_mrs = 0;
+	int ret = 0;
+
+	if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) {
+		nr_sig_mrs = attr->cap.max_rdma_ctxs;
+		nr_mrs = attr->cap.max_rdma_ctxs * 2;
+	} else if (rdma_rw_can_use_mr(dev, attr->port_num)) {
+		nr_mrs = attr->cap.max_rdma_ctxs;
+	}
+
+	if (nr_mrs) {
+		ret = ib_mr_pool_init(qp, &qp->rdma_mrs, nr_mrs,
+				IB_MR_TYPE_MEM_REG,
+				rdma_rw_fr_page_list_len(dev));
+		if (ret) {
+			pr_err("%s: failed to allocated %d MRs\n",
+				__func__, nr_mrs);
+			return ret;
+		}
+	}
+
+	if (nr_sig_mrs) {
+		ret = ib_mr_pool_init(qp, &qp->sig_mrs, nr_sig_mrs,
+				IB_MR_TYPE_SIGNATURE, 2);
+		if (ret) {
+			pr_err("%s: failed to allocated %d SIG MRs\n",
+				__func__, nr_mrs);
+			goto out_free_rdma_mrs;
+		}
+	}
+
+	return 0;
+
+out_free_rdma_mrs:
+	ib_mr_pool_destroy(qp, &qp->rdma_mrs);
+	return ret;
+}
+
+void rdma_rw_cleanup_mrs(struct ib_qp *qp)
+{
+	ib_mr_pool_destroy(qp, &qp->sig_mrs);
+	ib_mr_pool_destroy(qp, &qp->rdma_mrs);
+}
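That completes rw.c, the headline "Generic RDMA READ/WRITE API" of this pull. For orientation, a minimal sketch of how a ULP might drive the new API end to end; the example_ names, the source of remote_addr/rkey, and the CQE handler are assumptions for illustration, not code from this series:

	/* Hypothetical ULP usage (not from this series): push a scatterlist
	 * to the peer and get a completion on done_cqe.  The QP must have
	 * been created with cap.max_rdma_ctxs > 0 so that ib_create_qp()
	 * sized the send queue and MR pools via rdma_rw_init_qp()/
	 * rdma_rw_init_mrs().  In real code the ctx lives in the request
	 * structure and rdma_rw_ctx_destroy() also runs on completion.
	 */
	static int example_rdma_write(struct ib_qp *qp, u8 port_num,
			struct scatterlist *sg, u32 sg_cnt,
			u64 remote_addr, u32 rkey, struct ib_cqe *done_cqe)
	{
		struct rdma_rw_ctx ctx;
		int ret;

		ret = rdma_rw_ctx_init(&ctx, qp, port_num, sg, sg_cnt, 0,
				remote_addr, rkey, DMA_TO_DEVICE);
		if (ret < 0)
			return ret;	/* mapping or MR checkout failed */

		/* done_cqe->done() runs when the last WR completes */
		ret = rdma_rw_ctx_post(&ctx, qp, port_num, done_cqe, NULL);
		if (ret)
			rdma_rw_ctx_destroy(&ctx, qp, port_num, sg, sg_cnt,
					DMA_TO_DEVICE);
		return ret;
	}

The point of the abstraction is visible in the three init paths above: the caller writes the same code whether the device needs a single WR, a chain of WRs, or iWarp-style MR registration with invalidate.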
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index 8a09c0fb268d..3ebd108bcc5f 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -536,7 +536,7 @@ static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask)
 	data = ibnl_put_msg(skb, &nlh, query->seq, 0, RDMA_NL_LS,
 			    RDMA_NL_LS_OP_RESOLVE, NLM_F_REQUEST);
 	if (!data) {
-		kfree_skb(skb);
+		nlmsg_free(skb);
 		return -EMSGSIZE;
 	}
 
@@ -1820,7 +1820,7 @@ static int __init ib_sa_init(void)
 		goto err3;
 	}
 
-	if (ibnl_add_client(RDMA_NL_LS, RDMA_NL_LS_NUM_OPS,
+	if (ibnl_add_client(RDMA_NL_LS, ARRAY_SIZE(ib_sa_cb_table),
 			    ib_sa_cb_table)) {
 		pr_err("Failed to add netlink callback\n");
 		ret = -EINVAL;
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 6fdc7ecdaca0..1a8babb8ee3c 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -1833,7 +1833,8 @@ static int create_qp(struct ib_uverbs_file *file,
 	if (attr.create_flags & ~(IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK |
 				IB_QP_CREATE_CROSS_CHANNEL |
 				IB_QP_CREATE_MANAGED_SEND |
-				IB_QP_CREATE_MANAGED_RECV)) {
+				IB_QP_CREATE_MANAGED_RECV |
+				IB_QP_CREATE_SCATTER_FCS)) {
 		ret = -EINVAL;
 		goto err_put;
 	}
@@ -3088,8 +3089,7 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
 	if (cmd.comp_mask)
 		return -EINVAL;
 
-	if ((cmd.flow_attr.type == IB_FLOW_ATTR_SNIFFER &&
-	     !capable(CAP_NET_ADMIN)) || !capable(CAP_NET_RAW))
+	if (!capable(CAP_NET_RAW))
 		return -EPERM;
 
 	if (cmd.flow_attr.flags >= IB_FLOW_ATTR_FLAGS_RESERVED)
@@ -3655,6 +3655,11 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
 	resp.hca_core_clock = attr.hca_core_clock;
 	resp.response_length += sizeof(resp.hca_core_clock);
 
+	if (ucore->outlen < resp.response_length + sizeof(resp.device_cap_flags_ex))
+		goto end;
+
+	resp.device_cap_flags_ex = attr.device_cap_flags;
+	resp.response_length += sizeof(resp.device_cap_flags_ex);
 end:
 	err = ib_copy_to_udata(ucore, &resp, resp.response_length);
 	return err;
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index b65b3541e732..1d7d4cf442e3 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -48,6 +48,7 @@
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_cache.h>
 #include <rdma/ib_addr.h>
+#include <rdma/rw.h>
 
 #include "core_priv.h"
 
@@ -723,59 +724,89 @@ struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd,
 }
 EXPORT_SYMBOL(ib_open_qp);
 
+static struct ib_qp *ib_create_xrc_qp(struct ib_qp *qp,
+		struct ib_qp_init_attr *qp_init_attr)
+{
+	struct ib_qp *real_qp = qp;
+
+	qp->event_handler = __ib_shared_qp_event_handler;
+	qp->qp_context = qp;
+	qp->pd = NULL;
+	qp->send_cq = qp->recv_cq = NULL;
+	qp->srq = NULL;
+	qp->xrcd = qp_init_attr->xrcd;
+	atomic_inc(&qp_init_attr->xrcd->usecnt);
+	INIT_LIST_HEAD(&qp->open_list);
+
+	qp = __ib_open_qp(real_qp, qp_init_attr->event_handler,
+			  qp_init_attr->qp_context);
+	if (!IS_ERR(qp))
+		__ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp);
+	else
+		real_qp->device->destroy_qp(real_qp);
+	return qp;
+}
+
 struct ib_qp *ib_create_qp(struct ib_pd *pd,
 			   struct ib_qp_init_attr *qp_init_attr)
 {
-	struct ib_qp *qp, *real_qp;
-	struct ib_device *device;
+	struct ib_device *device = pd ? pd->device : qp_init_attr->xrcd->device;
+	struct ib_qp *qp;
+	int ret;
+
+	/*
+	 * If the callers is using the RDMA API calculate the resources
+	 * needed for the RDMA READ/WRITE operations.
+	 *
+	 * Note that these callers need to pass in a port number.
+	 */
+	if (qp_init_attr->cap.max_rdma_ctxs)
+		rdma_rw_init_qp(device, qp_init_attr);
 
-	device = pd ? pd->device : qp_init_attr->xrcd->device;
 	qp = device->create_qp(pd, qp_init_attr, NULL);
+	if (IS_ERR(qp))
+		return qp;
+
+	qp->device = device;
+	qp->real_qp = qp;
+	qp->uobject = NULL;
+	qp->qp_type = qp_init_attr->qp_type;
+
+	atomic_set(&qp->usecnt, 0);
+	qp->mrs_used = 0;
+	spin_lock_init(&qp->mr_lock);
+	INIT_LIST_HEAD(&qp->rdma_mrs);
+	INIT_LIST_HEAD(&qp->sig_mrs);
+
+	if (qp_init_attr->qp_type == IB_QPT_XRC_TGT)
+		return ib_create_xrc_qp(qp, qp_init_attr);
+
+	qp->event_handler = qp_init_attr->event_handler;
+	qp->qp_context = qp_init_attr->qp_context;
+	if (qp_init_attr->qp_type == IB_QPT_XRC_INI) {
+		qp->recv_cq = NULL;
+		qp->srq = NULL;
+	} else {
+		qp->recv_cq = qp_init_attr->recv_cq;
+		atomic_inc(&qp_init_attr->recv_cq->usecnt);
+		qp->srq = qp_init_attr->srq;
+		if (qp->srq)
+			atomic_inc(&qp_init_attr->srq->usecnt);
+	}
 
-	if (!IS_ERR(qp)) {
-		qp->device     = device;
-		qp->real_qp    = qp;
-		qp->uobject    = NULL;
-		qp->qp_type    = qp_init_attr->qp_type;
-
-		atomic_set(&qp->usecnt, 0);
-		if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) {
-			qp->event_handler = __ib_shared_qp_event_handler;
-			qp->qp_context = qp;
-			qp->pd = NULL;
-			qp->send_cq = qp->recv_cq = NULL;
-			qp->srq = NULL;
-			qp->xrcd = qp_init_attr->xrcd;
-			atomic_inc(&qp_init_attr->xrcd->usecnt);
-			INIT_LIST_HEAD(&qp->open_list);
-
-			real_qp = qp;
-			qp = __ib_open_qp(real_qp, qp_init_attr->event_handler,
-					  qp_init_attr->qp_context);
-			if (!IS_ERR(qp))
-				__ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp);
-			else
-				real_qp->device->destroy_qp(real_qp);
-		} else {
-			qp->event_handler = qp_init_attr->event_handler;
-			qp->qp_context = qp_init_attr->qp_context;
-			if (qp_init_attr->qp_type == IB_QPT_XRC_INI) {
-				qp->recv_cq = NULL;
-				qp->srq = NULL;
-			} else {
-				qp->recv_cq = qp_init_attr->recv_cq;
-				atomic_inc(&qp_init_attr->recv_cq->usecnt);
-				qp->srq = qp_init_attr->srq;
-				if (qp->srq)
-					atomic_inc(&qp_init_attr->srq->usecnt);
-			}
+	qp->pd = pd;
+	qp->send_cq = qp_init_attr->send_cq;
+	qp->xrcd = NULL;
 
-			qp->pd = pd;
-			qp->send_cq = qp_init_attr->send_cq;
-			qp->xrcd = NULL;
+	atomic_inc(&pd->usecnt);
+	atomic_inc(&qp_init_attr->send_cq->usecnt);
 
-			atomic_inc(&pd->usecnt);
-			atomic_inc(&qp_init_attr->send_cq->usecnt);
+	if (qp_init_attr->cap.max_rdma_ctxs) {
+		ret = rdma_rw_init_mrs(qp, qp_init_attr);
+		if (ret) {
pr_err("failed to init MR pool ret= %d\n", ret); + ib_destroy_qp(qp); + qp = ERR_PTR(ret); } } @@ -1250,6 +1281,8 @@ int ib_destroy_qp(struct ib_qp *qp) struct ib_srq *srq; int ret; + WARN_ON_ONCE(qp->mrs_used > 0); + if (atomic_read(&qp->usecnt)) return -EBUSY; @@ -1261,6 +1294,9 @@ int ib_destroy_qp(struct ib_qp *qp) rcq = qp->recv_cq; srq = qp->srq; + if (!qp->uobject) + rdma_rw_cleanup_mrs(qp); + ret = qp->device->destroy_qp(qp); if (!ret) { if (pd) @@ -1343,6 +1379,7 @@ struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags) mr->pd = pd; mr->uobject = NULL; atomic_inc(&pd->usecnt); + mr->need_inval = false; } return mr; @@ -1389,6 +1426,7 @@ struct ib_mr *ib_alloc_mr(struct ib_pd *pd, mr->pd = pd; mr->uobject = NULL; atomic_inc(&pd->usecnt); + mr->need_inval = false; } return mr; @@ -1597,6 +1635,7 @@ EXPORT_SYMBOL(ib_set_vf_guid); * @mr: memory region * @sg: dma mapped scatterlist * @sg_nents: number of entries in sg + * @sg_offset: offset in bytes into sg * @page_size: page vector desired page size * * Constraints: @@ -1615,17 +1654,15 @@ EXPORT_SYMBOL(ib_set_vf_guid); * After this completes successfully, the memory region * is ready for registration. */ -int ib_map_mr_sg(struct ib_mr *mr, - struct scatterlist *sg, - int sg_nents, - unsigned int page_size) +int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset, unsigned int page_size) { if (unlikely(!mr->device->map_mr_sg)) return -ENOSYS; mr->page_size = page_size; - return mr->device->map_mr_sg(mr, sg, sg_nents); + return mr->device->map_mr_sg(mr, sg, sg_nents, sg_offset); } EXPORT_SYMBOL(ib_map_mr_sg); @@ -1635,6 +1672,10 @@ EXPORT_SYMBOL(ib_map_mr_sg); * @mr: memory region * @sgl: dma mapped scatterlist * @sg_nents: number of entries in sg + * @sg_offset_p: IN: start offset in bytes into sg + * OUT: offset in bytes for element n of the sg of the first + * byte that has not been processed where n is the return + * value of this function. * @set_page: driver page assignment function pointer * * Core service helper for drivers to convert the largest @@ -1645,23 +1686,26 @@ EXPORT_SYMBOL(ib_map_mr_sg); * Returns the number of sg elements that were assigned to * a page vector. */ -int ib_sg_to_pages(struct ib_mr *mr, - struct scatterlist *sgl, - int sg_nents, - int (*set_page)(struct ib_mr *, u64)) +int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents, + unsigned int *sg_offset_p, int (*set_page)(struct ib_mr *, u64)) { struct scatterlist *sg; u64 last_end_dma_addr = 0; + unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0; unsigned int last_page_off = 0; u64 page_mask = ~((u64)mr->page_size - 1); int i, ret; - mr->iova = sg_dma_address(&sgl[0]); + if (unlikely(sg_nents <= 0 || sg_offset > sg_dma_len(&sgl[0]))) + return -EINVAL; + + mr->iova = sg_dma_address(&sgl[0]) + sg_offset; mr->length = 0; for_each_sg(sgl, sg, sg_nents, i) { - u64 dma_addr = sg_dma_address(sg); - unsigned int dma_len = sg_dma_len(sg); + u64 dma_addr = sg_dma_address(sg) + sg_offset; + u64 prev_addr = dma_addr; + unsigned int dma_len = sg_dma_len(sg) - sg_offset; u64 end_dma_addr = dma_addr + dma_len; u64 page_addr = dma_addr & page_mask; @@ -1685,8 +1729,14 @@ int ib_sg_to_pages(struct ib_mr *mr, do { ret = set_page(mr, page_addr); - if (unlikely(ret < 0)) - return i ? : ret; + if (unlikely(ret < 0)) { + sg_offset = prev_addr - sg_dma_address(sg); + mr->length += prev_addr - dma_addr; + if (sg_offset_p) + *sg_offset_p = sg_offset; + return i || sg_offset ? 
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index 3234a8be16f6..47cb927a0dd6 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -783,15 +783,14 @@ static int iwch_set_page(struct ib_mr *ibmr, u64 addr)
 	return 0;
 }
 
-static int iwch_map_mr_sg(struct ib_mr *ibmr,
-			  struct scatterlist *sg,
-			  int sg_nents)
+static int iwch_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
+		int sg_nents, unsigned int *sg_offset)
 {
 	struct iwch_mr *mhp = to_iwch_mr(ibmr);
 
 	mhp->npages = 0;
 
-	return ib_sg_to_pages(ibmr, sg, sg_nents, iwch_set_page);
+	return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, iwch_set_page);
 }
 
 static int iwch_destroy_qp(struct ib_qp *ib_qp)
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index 651711370d55..a3a67216bce6 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -119,7 +119,7 @@ MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout "
 static int mpa_rev = 2;
 module_param(mpa_rev, int, 0644);
 MODULE_PARM_DESC(mpa_rev, "MPA Revision, 0 supports amso1100, "
-		"1 is RFC0544 spec compliant, 2 is IETF MPA Peer Connect Draft"
+		"1 is RFC5044 spec compliant, 2 is IETF MPA Peer Connect Draft"
 		" compliant (default=2)");
 
 static int markers_enabled;
@@ -145,19 +145,35 @@ static struct sk_buff_head rxq;
 
 static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp);
 static void ep_timeout(unsigned long arg);
 static void connect_reply_upcall(struct c4iw_ep *ep, int status);
+static int sched(struct c4iw_dev *dev, struct sk_buff *skb);
 
 static LIST_HEAD(timeout_list);
 static spinlock_t timeout_lock;
 
+static void deref_cm_id(struct c4iw_ep_common *epc)
+{
+	epc->cm_id->rem_ref(epc->cm_id);
+	epc->cm_id = NULL;
+	set_bit(CM_ID_DEREFED, &epc->history);
+}
+
+static void ref_cm_id(struct c4iw_ep_common *epc)
+{
+	set_bit(CM_ID_REFED, &epc->history);
+	epc->cm_id->add_ref(epc->cm_id);
+}
+
 static void deref_qp(struct c4iw_ep *ep)
 {
 	c4iw_qp_rem_ref(&ep->com.qp->ibqp);
 	clear_bit(QP_REFERENCED, &ep->com.flags);
+	set_bit(QP_DEREFED, &ep->com.history);
 }
 
 static void ref_qp(struct c4iw_ep *ep)
 {
 	set_bit(QP_REFERENCED, &ep->com.flags);
+	set_bit(QP_REFED, &ep->com.history);
 	c4iw_qp_add_ref(&ep->com.qp->ibqp);
 }
 
@@ -201,6 +217,8 @@ static int c4iw_l2t_send(struct c4iw_rdev *rdev, struct sk_buff *skb,
 	error = cxgb4_l2t_send(rdev->lldi.ports[0], skb, l2e);
 	if (error < 0)
 		kfree_skb(skb);
+	else if (error == NET_XMIT_DROP)
+		return -ENOMEM;
 	return error < 0 ? error : 0;
 }
 
@@ -290,12 +308,63 @@ static void *alloc_ep(int size, gfp_t gfp)
 	return epc;
 }
 
+static void remove_ep_tid(struct c4iw_ep *ep)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ep->com.dev->lock, flags);
+	_remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid, 0);
+	spin_unlock_irqrestore(&ep->com.dev->lock, flags);
+}
+
+static void insert_ep_tid(struct c4iw_ep *ep)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ep->com.dev->lock, flags);
+	_insert_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep, ep->hwtid, 0);
+	spin_unlock_irqrestore(&ep->com.dev->lock, flags);
+}
+
+/*
+ * Atomically lookup the ep ptr given the tid and grab a reference on the ep.
+ */
+static struct c4iw_ep *get_ep_from_tid(struct c4iw_dev *dev, unsigned int tid)
+{
+	struct c4iw_ep *ep;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->lock, flags);
+	ep = idr_find(&dev->hwtid_idr, tid);
+	if (ep)
+		c4iw_get_ep(&ep->com);
+	spin_unlock_irqrestore(&dev->lock, flags);
+	return ep;
+}
+
+/*
+ * Atomically lookup the ep ptr given the stid and grab a reference on the ep.
+ */
+static struct c4iw_listen_ep *get_ep_from_stid(struct c4iw_dev *dev,
+					       unsigned int stid)
+{
+	struct c4iw_listen_ep *ep;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->lock, flags);
+	ep = idr_find(&dev->stid_idr, stid);
+	if (ep)
+		c4iw_get_ep(&ep->com);
+	spin_unlock_irqrestore(&dev->lock, flags);
+	return ep;
+}
+
 void _c4iw_free_ep(struct kref *kref)
 {
 	struct c4iw_ep *ep;
 
 	ep = container_of(kref, struct c4iw_ep, com.kref);
-	PDBG("%s ep %p state %s\n", __func__, ep, states[state_read(&ep->com)]);
+	PDBG("%s ep %p state %s\n", __func__, ep, states[ep->com.state]);
 	if (test_bit(QP_REFERENCED, &ep->com.flags))
 		deref_qp(ep);
 	if (test_bit(RELEASE_RESOURCES, &ep->com.flags)) {
@@ -309,10 +378,11 @@ void _c4iw_free_ep(struct kref *kref)
 			(const u32 *)&sin6->sin6_addr.s6_addr, 1);
 		}
-		remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid);
 		cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, ep->hwtid);
 		dst_release(ep->dst);
 		cxgb4_l2t_release(ep->l2t);
+		if (ep->mpa_skb)
+			kfree_skb(ep->mpa_skb);
 	}
 	kfree(ep);
 }
@@ -320,6 +390,15 @@ void _c4iw_free_ep(struct kref *kref)
 static void release_ep_resources(struct c4iw_ep *ep)
 {
 	set_bit(RELEASE_RESOURCES, &ep->com.flags);
+
+	/*
+	 * If we have a hwtid, then remove it from the idr table
+	 * so lookups will no longer find this endpoint.  Otherwise
+	 * we have a race where one thread finds the ep ptr just
+	 * before the other thread is freeing the ep memory.
+	 */
+	if (ep->hwtid != -1)
+		remove_ep_tid(ep);
 	c4iw_put_ep(&ep->com);
 }
 
@@ -432,10 +511,74 @@ static struct dst_entry *find_route(struct c4iw_dev *dev, __be32 local_ip,
 
 static void arp_failure_discard(void *handle, struct sk_buff *skb)
 {
-	PDBG("%s c4iw_dev %p\n", __func__, handle);
+	pr_err(MOD "ARP failure\n");
 	kfree_skb(skb);
 }
 
+static void mpa_start_arp_failure(void *handle, struct sk_buff *skb)
+{
+	pr_err("ARP failure during MPA Negotiation - Closing Connection\n");
+}
+
+enum {
+	NUM_FAKE_CPLS = 2,
+	FAKE_CPL_PUT_EP_SAFE = NUM_CPL_CMDS + 0,
+	FAKE_CPL_PASS_PUT_EP_SAFE = NUM_CPL_CMDS + 1,
+};
+
+static int _put_ep_safe(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct c4iw_ep *ep;
+
+	ep = *((struct c4iw_ep **)(skb->cb + 2 * sizeof(void *)));
+	release_ep_resources(ep);
+	return 0;
+}
+
+static int _put_pass_ep_safe(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct c4iw_ep *ep;
+
+	ep = *((struct c4iw_ep **)(skb->cb + 2 * sizeof(void *)));
+	c4iw_put_ep(&ep->parent_ep->com);
+	release_ep_resources(ep);
+	return 0;
+}
+
+/*
+ * Fake up a special CPL opcode and call sched() so process_work() will call
+ * _put_ep_safe() in a safe context to free the ep resources.  This is needed
+ * because ARP error handlers are called in an ATOMIC context, and
+ * _c4iw_free_ep() needs to block.
+ */
+static void queue_arp_failure_cpl(struct c4iw_ep *ep, struct sk_buff *skb,
+				  int cpl)
+{
+	struct cpl_act_establish *rpl = cplhdr(skb);
+
+	/* Set our special ARP_FAILURE opcode */
+	rpl->ot.opcode = cpl;
+
+	/*
+	 * Save ep in the skb->cb area, after where sched() will save the dev
+	 * ptr.
+	 */
+	*((struct c4iw_ep **)(skb->cb + 2 * sizeof(void *))) = ep;
+	sched(ep->com.dev, skb);
+}
+
+/* Handle an ARP failure for an accept */
+static void pass_accept_rpl_arp_failure(void *handle, struct sk_buff *skb)
+{
+	struct c4iw_ep *ep = handle;
+
+	pr_err(MOD "ARP failure during accept - tid %u -dropping connection\n",
+	       ep->hwtid);
+
+	__state_set(&ep->com, DEAD);
+	queue_arp_failure_cpl(ep, skb, FAKE_CPL_PASS_PUT_EP_SAFE);
+}
+
 /*
  * Handle an ARP failure for an active open.
 */
@@ -444,9 +587,8 @@ static void act_open_req_arp_failure(void *handle, struct sk_buff *skb)
 	struct c4iw_ep *ep = handle;
 
 	printk(KERN_ERR MOD "ARP failure during connect\n");
-	kfree_skb(skb);
 	connect_reply_upcall(ep, -EHOSTUNREACH);
-	state_set(&ep->com, DEAD);
+	__state_set(&ep->com, DEAD);
 	if (ep->com.remote_addr.ss_family == AF_INET6) {
 		struct sockaddr_in6 *sin6 =
 			(struct sockaddr_in6 *)&ep->com.local_addr;
@@ -455,9 +597,7 @@ static void act_open_req_arp_failure(void *handle, struct sk_buff *skb)
 	}
 	remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid);
 	cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid);
-	dst_release(ep->dst);
-	cxgb4_l2t_release(ep->l2t);
-	c4iw_put_ep(&ep->com);
+	queue_arp_failure_cpl(ep, skb, FAKE_CPL_PUT_EP_SAFE);
 }
 
 /*
@@ -466,15 +606,21 @@ static void act_open_req_arp_failure(void *handle, struct sk_buff *skb)
 */
 static void abort_arp_failure(void *handle, struct sk_buff *skb)
 {
-	struct c4iw_rdev *rdev = handle;
+	int ret;
+	struct c4iw_ep *ep = handle;
+	struct c4iw_rdev *rdev = &ep->com.dev->rdev;
 	struct cpl_abort_req *req = cplhdr(skb);
 
 	PDBG("%s rdev %p\n", __func__, rdev);
 	req->cmd = CPL_ABORT_NO_RST;
-	c4iw_ofld_send(rdev, skb);
+	ret = c4iw_ofld_send(rdev, skb);
+	if (ret) {
+		__state_set(&ep->com, DEAD);
+		queue_arp_failure_cpl(ep, skb, FAKE_CPL_PUT_EP_SAFE);
+	}
 }
 
-static void send_flowc(struct c4iw_ep *ep, struct sk_buff *skb)
+static int send_flowc(struct c4iw_ep *ep, struct sk_buff *skb)
 {
 	unsigned int flowclen = 80;
 	struct fw_flowc_wr *flowc;
@@ -530,7 +676,7 @@ static void send_flowc(struct c4iw_ep *ep, struct sk_buff *skb)
 	}
 
 	set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
-	c4iw_ofld_send(&ep->com.dev->rdev, skb);
+	return c4iw_ofld_send(&ep->com.dev->rdev, skb);
 }
 
 static int send_halfclose(struct c4iw_ep *ep, gfp_t gfp)
@@ -568,7 +714,7 @@ static int send_abort(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp)
 		return -ENOMEM;
 	}
 	set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
-	t4_set_arp_err_handler(skb, &ep->com.dev->rdev, abort_arp_failure);
+	t4_set_arp_err_handler(skb, ep, abort_arp_failure);
 	req = (struct cpl_abort_req *) skb_put(skb, wrlen);
 	memset(req, 0, wrlen);
 	INIT_TP_WR(req, ep->hwtid);
@@ -807,10 +953,10 @@ clip_release:
 	return ret;
 }
 
-static void send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb,
-		u8 mpa_rev_to_use)
+static int send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb,
+		u8 mpa_rev_to_use)
 {
-	int mpalen, wrlen;
+	int mpalen, wrlen, ret;
 	struct fw_ofld_tx_data_wr *req;
 	struct mpa_message *mpa;
 	struct mpa_v2_conn_params mpa_v2_params;
@@ -826,7 +972,7 @@ static void send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb,
 	skb = get_skb(skb, wrlen, GFP_KERNEL);
 	if (!skb) {
 		connect_reply_upcall(ep, -ENOMEM);
-		return;
+		return -ENOMEM;
 	}
 	set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
@@ -894,12 +1040,14 @@ static void send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb,
 	t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
 	BUG_ON(ep->mpa_skb);
 	ep->mpa_skb = skb;
-	c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+	ret = c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+	if (ret)
+		return ret;
 	start_ep_timer(ep);
 	__state_set(&ep->com, MPA_REQ_SENT);
 	ep->mpa_attr.initiator = 1;
 	ep->snd_seq += mpalen;
-	return;
+	return ret;
 }
 
 static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen)
@@ -975,7 +1123,7 @@ static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen)
 	 */
 	skb_get(skb);
 	set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
-	t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
+	t4_set_arp_err_handler(skb, NULL, mpa_start_arp_failure);
 	BUG_ON(ep->mpa_skb);
 	ep->mpa_skb = skb;
 	ep->snd_seq += mpalen;
@@ -1060,7 +1208,7 @@ static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen)
 	 * Function fw4_ack() will deref it.
 	 */
 	skb_get(skb);
-	t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
+	t4_set_arp_err_handler(skb, NULL, mpa_start_arp_failure);
 	ep->mpa_skb = skb;
 	__state_set(&ep->com, MPA_REP_SENT);
 	ep->snd_seq += mpalen;
@@ -1074,6 +1222,7 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb)
 	unsigned int tid = GET_TID(req);
 	unsigned int atid = TID_TID_G(ntohl(req->tos_atid));
 	struct tid_info *t = dev->rdev.lldi.tids;
+	int ret;
 
 	ep = lookup_atid(t, atid);
 
@@ -1086,7 +1235,7 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb)
 	/* setup the hwtid for this connection */
 	ep->hwtid = tid;
 	cxgb4_insert_tid(t, ep, tid);
-	insert_handle(dev, &dev->hwtid_idr, ep, ep->hwtid);
+	insert_ep_tid(ep);
 
 	ep->snd_seq = be32_to_cpu(req->snd_isn);
 	ep->rcv_seq = be32_to_cpu(req->rcv_isn);
@@ -1099,13 +1248,22 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb)
 	set_bit(ACT_ESTAB, &ep->com.history);
 
 	/* start MPA negotiation */
-	send_flowc(ep, NULL);
+	ret = send_flowc(ep, NULL);
+	if (ret)
+		goto err;
 	if (ep->retry_with_mpa_v1)
-		send_mpa_req(ep, skb, 1);
+		ret = send_mpa_req(ep, skb, 1);
 	else
-		send_mpa_req(ep, skb, mpa_rev);
+		ret = send_mpa_req(ep, skb, mpa_rev);
+	if (ret)
+		goto err;
 	mutex_unlock(&ep->com.mutex);
 	return 0;
+err:
+	mutex_unlock(&ep->com.mutex);
+	connect_reply_upcall(ep, -ENOMEM);
+	c4iw_ep_disconnect(ep, 0, GFP_KERNEL);
+	return 0;
 }
 
 static void close_complete_upcall(struct c4iw_ep *ep, int status)
@@ -1120,20 +1278,11 @@ static void close_complete_upcall(struct c4iw_ep *ep, int status)
 		PDBG("close complete delivered ep %p cm_id %p tid %u\n",
 		     ep, ep->com.cm_id, ep->hwtid);
 		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
-		ep->com.cm_id->rem_ref(ep->com.cm_id);
-		ep->com.cm_id = NULL;
+		deref_cm_id(&ep->com);
 		set_bit(CLOSE_UPCALL, &ep->com.history);
 	}
 }
 
-static int abort_connection(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp)
-{
-	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
-	__state_set(&ep->com, ABORTING);
-	set_bit(ABORT_CONN, &ep->com.history);
-	return send_abort(ep, skb, gfp);
-}
-
 static void peer_close_upcall(struct c4iw_ep *ep)
 {
 	struct iw_cm_event event;
@@ -1161,8 +1310,7 @@ static void peer_abort_upcall(struct c4iw_ep *ep)
 		PDBG("abort delivered ep %p cm_id %p tid %u\n", ep,
 		     ep->com.cm_id, ep->hwtid);
 		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
-		ep->com.cm_id->rem_ref(ep->com.cm_id);
-		ep->com.cm_id = NULL;
+		deref_cm_id(&ep->com);
 		set_bit(ABORT_UPCALL, &ep->com.history);
 	}
 }
@@ -1205,10 +1353,8 @@ static void connect_reply_upcall(struct c4iw_ep *ep, int status)
 
 	set_bit(CONN_RPL_UPCALL, &ep->com.history);
 	ep->com.cm_id->event_handler(ep->com.cm_id, &event);
 
-	if (status < 0) {
-		ep->com.cm_id->rem_ref(ep->com.cm_id);
-		ep->com.cm_id = NULL;
-	}
+	if (status < 0)
+		deref_cm_id(&ep->com);
 }
 
 static int connect_request_upcall(struct c4iw_ep *ep)
@@ -1301,6 +1447,18 @@ static int update_rx_credits(struct c4iw_ep *ep, u32 credits)
 
 #define RELAXED_IRD_NEGOTIATION 1
 
+/*
+ * process_mpa_reply - process streaming mode MPA reply
+ *
+ * Returns:
+ *
+ * 0 upon success indicating a connect request was delivered to the ULP
+ * or the mpa request is incomplete but valid so far.
+ *
+ * 1 if a failure requires the caller to close the connection.
+ *
+ * 2 if a failure requires the caller to abort the connection.
+ */
 static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
 {
 	struct mpa_message *mpa;
@@ -1316,20 +1474,12 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
 	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
 
 	/*
-	 * Stop mpa timer.  If it expired, then
-	 * we ignore the MPA reply.  process_timeout()
-	 * will abort the connection.
-	 */
-	if (stop_ep_timer(ep))
-		return 0;
-
-	/*
 	 * If we get more than the supported amount of private data
 	 * then we must fail this connection.
 	 */
 	if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) {
 		err = -EINVAL;
-		goto err;
+		goto err_stop_timer;
 	}
 
 	/*
@@ -1351,11 +1501,11 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
 		printk(KERN_ERR MOD "%s MPA version mismatch. Local = %d,"
 		       " Received = %d\n", __func__, mpa_rev, mpa->revision);
 		err = -EPROTO;
-		goto err;
+		goto err_stop_timer;
 	}
 	if (memcmp(mpa->key, MPA_KEY_REP, sizeof(mpa->key))) {
 		err = -EPROTO;
-		goto err;
+		goto err_stop_timer;
 	}
 
 	plen = ntohs(mpa->private_data_size);
@@ -1365,7 +1515,7 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
 	 */
 	if (plen > MPA_MAX_PRIVATE_DATA) {
 		err = -EPROTO;
-		goto err;
+		goto err_stop_timer;
 	}
 
 	/*
@@ -1373,7 +1523,7 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
 	 */
 	if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) {
 		err = -EPROTO;
-		goto err;
+		goto err_stop_timer;
 	}
 
 	ep->plen = (u8) plen;
@@ -1387,10 +1537,18 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
 
 	if (mpa->flags & MPA_REJECT) {
 		err = -ECONNREFUSED;
-		goto err;
+		goto err_stop_timer;
 	}
 
 	/*
+	 * Stop mpa timer.  If it expired, then
+	 * we ignore the MPA reply.  process_timeout()
+	 * will abort the connection.
+	 */
+	if (stop_ep_timer(ep))
+		return 0;
+
+	/*
 	 * If we get here we have accumulated the entire mpa
 	 * start reply message including private data. And
 	 * the MPA header is valid.
@@ -1529,15 +1687,28 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
 		goto out;
 	}
 	goto out;
+err_stop_timer:
+	stop_ep_timer(ep);
 err:
-	__state_set(&ep->com, ABORTING);
-	send_abort(ep, skb, GFP_KERNEL);
+	disconnect = 2;
 out:
 	connect_reply_upcall(ep, err);
 	return disconnect;
 }
 
-static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
+/*
+ * process_mpa_request - process streaming mode MPA request
+ *
+ * Returns:
+ *
+ * 0 upon success indicating a connect request was delivered to the ULP
+ * or the mpa request is incomplete but valid so far.
+ *
+ * 1 if a failure requires the caller to close the connection.
+ *
+ * 2 if a failure requires the caller to abort the connection.
+ */
+static int process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
 {
 	struct mpa_message *mpa;
 	struct mpa_v2_conn_params *mpa_v2_params;
@@ -1549,11 +1720,8 @@ static int process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
 	 * If we get more than the supported amount of private data
 	 * then we must fail this connection.
 	 */
-	if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) {
-		(void)stop_ep_timer(ep);
-		abort_connection(ep, skb, GFP_KERNEL);
-		return;
-	}
+	if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt))
+		goto err_stop_timer;
 
 	PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__);
 
@@ -1569,7 +1737,7 @@ static int process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
 	 * We'll continue process when more data arrives.
 	 */
 	if (ep->mpa_pkt_len < sizeof(*mpa))
-		return;
+		return 0;
 
 	PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__);
 	mpa = (struct mpa_message *) ep->mpa_pkt;
@@ -1580,43 +1748,32 @@ static int process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
 	if (mpa->revision > mpa_rev) {
 		printk(KERN_ERR MOD "%s MPA version mismatch. Local = %d,"
 		       " Received = %d\n", __func__, mpa_rev, mpa->revision);
-		(void)stop_ep_timer(ep);
-		abort_connection(ep, skb, GFP_KERNEL);
-		return;
+		goto err_stop_timer;
 	}
 
-	if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) {
-		(void)stop_ep_timer(ep);
-		abort_connection(ep, skb, GFP_KERNEL);
-		return;
-	}
+	if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key)))
+		goto err_stop_timer;
 
 	plen = ntohs(mpa->private_data_size);
 
 	/*
 	 * Fail if there's too much private data.
 	 */
-	if (plen > MPA_MAX_PRIVATE_DATA) {
-		(void)stop_ep_timer(ep);
-		abort_connection(ep, skb, GFP_KERNEL);
-		return;
-	}
+	if (plen > MPA_MAX_PRIVATE_DATA)
+		goto err_stop_timer;
 
 	/*
 	 * If plen does not account for pkt size
 	 */
-	if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) {
-		(void)stop_ep_timer(ep);
-		abort_connection(ep, skb, GFP_KERNEL);
-		return;
-	}
+	if (ep->mpa_pkt_len > (sizeof(*mpa) + plen))
+		goto err_stop_timer;
 	ep->plen = (u8) plen;
 
 	/*
 	 * If we don't have all the pdata yet, then bail.
 	 */
 	if (ep->mpa_pkt_len < (sizeof(*mpa) + plen))
-		return;
+		return 0;
 
 	/*
 	 * If we get here we have accumulated the entire mpa
@@ -1665,26 +1822,26 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
 	       ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version,
 	       ep->mpa_attr.p2p_type);
 
-	/*
-	 * If the endpoint timer already expired, then we ignore
-	 * the start request.  process_timeout() will abort
-	 * the connection.
-	 */
-	if (!stop_ep_timer(ep)) {
-		__state_set(&ep->com, MPA_REQ_RCVD);
-
-		/* drive upcall */
-		mutex_lock_nested(&ep->parent_ep->com.mutex,
-				  SINGLE_DEPTH_NESTING);
-		if (ep->parent_ep->com.state != DEAD) {
-			if (connect_request_upcall(ep))
-				abort_connection(ep, skb, GFP_KERNEL);
-		} else {
-			abort_connection(ep, skb, GFP_KERNEL);
-		}
-		mutex_unlock(&ep->parent_ep->com.mutex);
+	__state_set(&ep->com, MPA_REQ_RCVD);
+
+	/* drive upcall */
+	mutex_lock_nested(&ep->parent_ep->com.mutex, SINGLE_DEPTH_NESTING);
+	if (ep->parent_ep->com.state != DEAD) {
+		if (connect_request_upcall(ep))
+			goto err_unlock_parent;
+	} else {
+		goto err_unlock_parent;
 	}
-	return;
+	mutex_unlock(&ep->parent_ep->com.mutex);
+	return 0;
+
+err_unlock_parent:
+	mutex_unlock(&ep->parent_ep->com.mutex);
+	goto err_out;
+err_stop_timer:
+	(void)stop_ep_timer(ep);
+err_out:
+	return 2;
 }
 
 static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb)
@@ -1693,11 +1850,10 @@ static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb)
 	struct cpl_rx_data *hdr = cplhdr(skb);
 	unsigned int dlen = ntohs(hdr->len);
 	unsigned int tid = GET_TID(hdr);
-	struct tid_info *t = dev->rdev.lldi.tids;
 	__u8 status = hdr->status;
 	int disconnect = 0;
 
-	ep = lookup_tid(t, tid);
+	ep = get_ep_from_tid(dev, tid);
 	if (!ep)
 		return 0;
 	PDBG("%s ep %p tid %u dlen %u\n", __func__, ep, ep->hwtid, dlen);
@@ -1715,7 +1871,7 @@ static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb)
 		break;
 	case MPA_REQ_WAIT:
 		ep->rcv_seq += dlen;
-		process_mpa_request(ep, skb);
+		disconnect = process_mpa_request(ep, skb);
 		break;
 	case FPDU_MODE: {
 		struct c4iw_qp_attributes attrs;
@@ -1736,7 +1892,8 @@ static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb)
 	}
 	mutex_unlock(&ep->com.mutex);
 	if (disconnect)
-		c4iw_ep_disconnect(ep, 0, GFP_KERNEL);
c4iw_ep_disconnect(ep, disconnect == 2, GFP_KERNEL); + c4iw_put_ep(&ep->com); return 0; } @@ -1746,9 +1903,8 @@ static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb) struct cpl_abort_rpl_rss *rpl = cplhdr(skb); int release = 0; unsigned int tid = GET_TID(rpl); - struct tid_info *t = dev->rdev.lldi.tids; - ep = lookup_tid(t, tid); + ep = get_ep_from_tid(dev, tid); if (!ep) { printk(KERN_WARNING MOD "Abort rpl to freed endpoint\n"); return 0; @@ -1770,10 +1926,11 @@ static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb) if (release) release_ep_resources(ep); + c4iw_put_ep(&ep->com); return 0; } -static void send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid) +static int send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid) { struct sk_buff *skb; struct fw_ofld_connection_wr *req; @@ -1843,7 +2000,7 @@ static void send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid) req->tcb.opt2 = cpu_to_be32((__force u32)req->tcb.opt2); set_wr_txq(skb, CPL_PRIORITY_CONTROL, ep->ctrlq_idx); set_bit(ACT_OFLD_CONN, &ep->com.history); - c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); + return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); } /* @@ -1986,6 +2143,7 @@ static int c4iw_reconnect(struct c4iw_ep *ep) PDBG("%s qp %p cm_id %p\n", __func__, ep->com.qp, ep->com.cm_id); init_timer(&ep->timer); + c4iw_init_wr_wait(&ep->com.wr_wait); /* * Allocate an active TID to initiate a TCP connection. @@ -2069,6 +2227,7 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) struct sockaddr_in *ra; struct sockaddr_in6 *la6; struct sockaddr_in6 *ra6; + int ret = 0; ep = lookup_atid(t, atid); la = (struct sockaddr_in *)&ep->com.local_addr; @@ -2104,9 +2263,10 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) mutex_unlock(&dev->rdev.stats.lock); if (ep->com.local_addr.ss_family == AF_INET && dev->rdev.lldi.enable_fw_ofld_conn) { - send_fw_act_open_req(ep, - TID_TID_G(AOPEN_ATID_G( - ntohl(rpl->atid_status)))); + ret = send_fw_act_open_req(ep, TID_TID_G(AOPEN_ATID_G( + ntohl(rpl->atid_status)))); + if (ret) + goto fail; return 0; } break; @@ -2146,6 +2306,7 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) break; } +fail: connect_reply_upcall(ep, status2errno(status)); state_set(&ep->com, DEAD); @@ -2170,9 +2331,8 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) static int pass_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) { struct cpl_pass_open_rpl *rpl = cplhdr(skb); - struct tid_info *t = dev->rdev.lldi.tids; unsigned int stid = GET_TID(rpl); - struct c4iw_listen_ep *ep = lookup_stid(t, stid); + struct c4iw_listen_ep *ep = get_ep_from_stid(dev, stid); if (!ep) { PDBG("%s stid %d lookup failure!\n", __func__, stid); @@ -2181,7 +2341,7 @@ static int pass_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) PDBG("%s ep %p status %d error %d\n", __func__, ep, rpl->status, status2errno(rpl->status)); c4iw_wake_up(&ep->com.wr_wait, status2errno(rpl->status)); - + c4iw_put_ep(&ep->com); out: return 0; } @@ -2189,17 +2349,17 @@ out: static int close_listsrv_rpl(struct c4iw_dev *dev, struct sk_buff *skb) { struct cpl_close_listsvr_rpl *rpl = cplhdr(skb); - struct tid_info *t = dev->rdev.lldi.tids; unsigned int stid = GET_TID(rpl); - struct c4iw_listen_ep *ep = lookup_stid(t, stid); + struct c4iw_listen_ep *ep = get_ep_from_stid(dev, stid); PDBG("%s ep %p\n", __func__, ep); c4iw_wake_up(&ep->com.wr_wait, status2errno(rpl->status)); + c4iw_put_ep(&ep->com); return 0; } -static void accept_cr(struct c4iw_ep 
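[Editor's note] send_fw_act_open_req() above goes from void to returning the c4iw_l2t_send() status, and act_open_rpl() gains a fail: label for it, so a failed transmit now tears the connection down instead of leaving it waiting forever. The caller-side shape, with hypothetical names:

struct conn;
extern int send_request(struct conn *c);        /* was void before the patch */
extern void fail_connection(struct conn *c, int status);

static int on_open_reply(struct conn *c, int status, int retry_via_fw)
{
        if (retry_via_fw) {
                int ret = send_request(c);

                if (ret)
                        goto fail;      /* unwind instead of hanging */
                return 0;
        }
fail:
        fail_connection(c, status);
        return 0;
}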
*ep, struct sk_buff *skb, - struct cpl_pass_accept_req *req) +static int accept_cr(struct c4iw_ep *ep, struct sk_buff *skb, + struct cpl_pass_accept_req *req) { struct cpl_pass_accept_rpl *rpl; unsigned int mtu_idx; @@ -2287,10 +2447,9 @@ static void accept_cr(struct c4iw_ep *ep, struct sk_buff *skb, rpl->opt0 = cpu_to_be64(opt0); rpl->opt2 = cpu_to_be32(opt2); set_wr_txq(skb, CPL_PRIORITY_SETUP, ep->ctrlq_idx); - t4_set_arp_err_handler(skb, NULL, arp_failure_discard); - c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); + t4_set_arp_err_handler(skb, ep, pass_accept_rpl_arp_failure); - return; + return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); } static void reject_cr(struct c4iw_dev *dev, u32 hwtid, struct sk_buff *skb) @@ -2355,7 +2514,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) unsigned short hdrs; u8 tos = PASS_OPEN_TOS_G(ntohl(req->tos_stid)); - parent_ep = lookup_stid(t, stid); + parent_ep = (struct c4iw_ep *)get_ep_from_stid(dev, stid); if (!parent_ep) { PDBG("%s connect request on invalid stid %d\n", __func__, stid); goto reject; @@ -2468,9 +2627,13 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) init_timer(&child_ep->timer); cxgb4_insert_tid(t, child_ep, hwtid); - insert_handle(dev, &dev->hwtid_idr, child_ep, child_ep->hwtid); - accept_cr(child_ep, skb, req); - set_bit(PASS_ACCEPT_REQ, &child_ep->com.history); + insert_ep_tid(child_ep); + if (accept_cr(child_ep, skb, req)) { + c4iw_put_ep(&parent_ep->com); + release_ep_resources(child_ep); + } else { + set_bit(PASS_ACCEPT_REQ, &child_ep->com.history); + } if (iptype == 6) { sin6 = (struct sockaddr_in6 *)&child_ep->com.local_addr; cxgb4_clip_get(child_ep->com.dev->rdev.lldi.ports[0], @@ -2479,6 +2642,8 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) goto out; reject: reject_cr(dev, hwtid, skb); + if (parent_ep) + c4iw_put_ep(&parent_ep->com); out: return 0; } @@ -2487,10 +2652,10 @@ static int pass_establish(struct c4iw_dev *dev, struct sk_buff *skb) { struct c4iw_ep *ep; struct cpl_pass_establish *req = cplhdr(skb); - struct tid_info *t = dev->rdev.lldi.tids; unsigned int tid = GET_TID(req); + int ret; - ep = lookup_tid(t, tid); + ep = get_ep_from_tid(dev, tid); PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); ep->snd_seq = be32_to_cpu(req->snd_isn); ep->rcv_seq = be32_to_cpu(req->rcv_isn); @@ -2501,10 +2666,15 @@ static int pass_establish(struct c4iw_dev *dev, struct sk_buff *skb) set_emss(ep, ntohs(req->tcp_opt)); dst_confirm(ep->dst); - state_set(&ep->com, MPA_REQ_WAIT); + mutex_lock(&ep->com.mutex); + ep->com.state = MPA_REQ_WAIT; start_ep_timer(ep); - send_flowc(ep, skb); set_bit(PASS_ESTAB, &ep->com.history); + ret = send_flowc(ep, skb); + mutex_unlock(&ep->com.mutex); + if (ret) + c4iw_ep_disconnect(ep, 1, GFP_KERNEL); + c4iw_put_ep(&ep->com); return 0; } @@ -2516,11 +2686,13 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb) struct c4iw_qp_attributes attrs; int disconnect = 1; int release = 0; - struct tid_info *t = dev->rdev.lldi.tids; unsigned int tid = GET_TID(hdr); int ret; - ep = lookup_tid(t, tid); + ep = get_ep_from_tid(dev, tid); + if (!ep) + return 0; + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); dst_confirm(ep->dst); @@ -2592,6 +2764,7 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb) c4iw_ep_disconnect(ep, 0, GFP_KERNEL); if (release) release_ep_resources(ep); + c4iw_put_ep(&ep->com); return 0; } @@ -2604,10 +2777,12 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) 
struct c4iw_qp_attributes attrs; int ret; int release = 0; - struct tid_info *t = dev->rdev.lldi.tids; unsigned int tid = GET_TID(req); - ep = lookup_tid(t, tid); + ep = get_ep_from_tid(dev, tid); + if (!ep) + return 0; + if (is_neg_adv(req->status)) { PDBG("%s Negative advice on abort- tid %u status %d (%s)\n", __func__, ep->hwtid, req->status, @@ -2616,7 +2791,7 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) mutex_lock(&dev->rdev.stats.lock); dev->rdev.stats.neg_adv++; mutex_unlock(&dev->rdev.stats.lock); - return 0; + goto deref_ep; } PDBG("%s ep %p tid %u state %u\n", __func__, ep, ep->hwtid, ep->com.state); @@ -2633,6 +2808,7 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) mutex_lock(&ep->com.mutex); switch (ep->com.state) { case CONNECTING: + c4iw_put_ep(&ep->parent_ep->com); break; case MPA_REQ_WAIT: (void)stop_ep_timer(ep); @@ -2681,7 +2857,7 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) case DEAD: PDBG("%s PEER_ABORT IN DEAD STATE!!!!\n", __func__); mutex_unlock(&ep->com.mutex); - return 0; + goto deref_ep; default: BUG_ON(1); break; @@ -2728,6 +2904,10 @@ out: c4iw_reconnect(ep); } +deref_ep: + c4iw_put_ep(&ep->com); + /* Dereferencing ep, referenced in peer_abort_intr() */ + c4iw_put_ep(&ep->com); return 0; } @@ -2737,16 +2917,18 @@ static int close_con_rpl(struct c4iw_dev *dev, struct sk_buff *skb) struct c4iw_qp_attributes attrs; struct cpl_close_con_rpl *rpl = cplhdr(skb); int release = 0; - struct tid_info *t = dev->rdev.lldi.tids; unsigned int tid = GET_TID(rpl); - ep = lookup_tid(t, tid); + ep = get_ep_from_tid(dev, tid); + if (!ep) + return 0; PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); BUG_ON(!ep); /* The cm_id may be null if we failed to connect */ mutex_lock(&ep->com.mutex); + set_bit(CLOSE_CON_RPL, &ep->com.history); switch (ep->com.state) { case CLOSING: __state_set(&ep->com, MORIBUND); @@ -2774,18 +2956,18 @@ static int close_con_rpl(struct c4iw_dev *dev, struct sk_buff *skb) mutex_unlock(&ep->com.mutex); if (release) release_ep_resources(ep); + c4iw_put_ep(&ep->com); return 0; } static int terminate(struct c4iw_dev *dev, struct sk_buff *skb) { struct cpl_rdma_terminate *rpl = cplhdr(skb); - struct tid_info *t = dev->rdev.lldi.tids; unsigned int tid = GET_TID(rpl); struct c4iw_ep *ep; struct c4iw_qp_attributes attrs; - ep = lookup_tid(t, tid); + ep = get_ep_from_tid(dev, tid); BUG_ON(!ep); if (ep && ep->com.qp) { @@ -2796,6 +2978,7 @@ static int terminate(struct c4iw_dev *dev, struct sk_buff *skb) C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); } else printk(KERN_WARNING MOD "TERM received tid %u no ep/qp\n", tid); + c4iw_put_ep(&ep->com); return 0; } @@ -2811,15 +2994,16 @@ static int fw4_ack(struct c4iw_dev *dev, struct sk_buff *skb) struct cpl_fw4_ack *hdr = cplhdr(skb); u8 credits = hdr->credits; unsigned int tid = GET_TID(hdr); - struct tid_info *t = dev->rdev.lldi.tids; - ep = lookup_tid(t, tid); + ep = get_ep_from_tid(dev, tid); + if (!ep) + return 0; PDBG("%s ep %p tid %u credits %u\n", __func__, ep, ep->hwtid, credits); if (credits == 0) { PDBG("%s 0 credit ack ep %p tid %u state %u\n", __func__, ep, ep->hwtid, state_read(&ep->com)); - return 0; + goto out; } dst_confirm(ep->dst); @@ -2829,7 +3013,13 @@ static int fw4_ack(struct c4iw_dev *dev, struct sk_buff *skb) state_read(&ep->com), ep->mpa_attr.initiator ? 
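[Editor's note] Note the two c4iw_put_ep() calls under deref_ep: above. One balances the reference taken by peer_abort()'s own get_ep_from_tid() lookup; per the in-line comment, the other consumes the reference that peer_abort_intr() took before deferring the work. The invariant in miniature; ep_get()/ep_put() and the queueing helper are stand-ins:

struct ep;
extern struct ep *lookup_and_get(unsigned int tid);
extern void ep_get(struct ep *ep);
extern void ep_put(struct ep *ep);
extern void queue_work_for(struct ep *ep);

static void intr_half(struct ep *ep)
{
        ep_get(ep);                     /* +1, consumed by the work half */
        queue_work_for(ep);
}

static void work_half(struct ep *ep, unsigned int tid)
{
        struct ep *e = lookup_and_get(tid);     /* +1, lookup reference */

        if (e) {
                /* ... handle the abort ... */
                ep_put(e);              /* -1, lookup reference */
        }
        ep_put(ep);                     /* -1, scheduling reference */
}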
1 : 0); kfree_skb(ep->mpa_skb); ep->mpa_skb = NULL; + mutex_lock(&ep->com.mutex); + if (test_bit(STOP_MPA_TIMER, &ep->com.flags)) + stop_ep_timer(ep); + mutex_unlock(&ep->com.mutex); } +out: + c4iw_put_ep(&ep->com); return 0; } @@ -2841,22 +3031,23 @@ int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); mutex_lock(&ep->com.mutex); - if (ep->com.state == DEAD) { + if (ep->com.state != MPA_REQ_RCVD) { mutex_unlock(&ep->com.mutex); c4iw_put_ep(&ep->com); return -ECONNRESET; } set_bit(ULP_REJECT, &ep->com.history); - BUG_ON(ep->com.state != MPA_REQ_RCVD); if (mpa_rev == 0) - abort_connection(ep, NULL, GFP_KERNEL); + disconnect = 2; else { err = send_mpa_reject(ep, pdata, pdata_len); disconnect = 1; } mutex_unlock(&ep->com.mutex); - if (disconnect) - err = c4iw_ep_disconnect(ep, 0, GFP_KERNEL); + if (disconnect) { + stop_ep_timer(ep); + err = c4iw_ep_disconnect(ep, disconnect == 2, GFP_KERNEL); + } c4iw_put_ep(&ep->com); return 0; } @@ -2869,24 +3060,23 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) struct c4iw_ep *ep = to_ep(cm_id); struct c4iw_dev *h = to_c4iw_dev(cm_id->device); struct c4iw_qp *qp = get_qhp(h, conn_param->qpn); + int abort = 0; PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); mutex_lock(&ep->com.mutex); - if (ep->com.state == DEAD) { + if (ep->com.state != MPA_REQ_RCVD) { err = -ECONNRESET; - goto err; + goto err_out; } - BUG_ON(ep->com.state != MPA_REQ_RCVD); BUG_ON(!qp); set_bit(ULP_ACCEPT, &ep->com.history); if ((conn_param->ord > cur_max_read_depth(ep->com.dev)) || (conn_param->ird > cur_max_read_depth(ep->com.dev))) { - abort_connection(ep, NULL, GFP_KERNEL); err = -EINVAL; - goto err; + goto err_abort; } if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { @@ -2898,9 +3088,8 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) ep->ord = conn_param->ord; send_mpa_reject(ep, conn_param->private_data, conn_param->private_data_len); - abort_connection(ep, NULL, GFP_KERNEL); err = -ENOMEM; - goto err; + goto err_abort; } } if (conn_param->ird < ep->ord) { @@ -2908,9 +3097,8 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) ep->ord <= h->rdev.lldi.max_ordird_qp) { conn_param->ird = ep->ord; } else { - abort_connection(ep, NULL, GFP_KERNEL); err = -ENOMEM; - goto err; + goto err_abort; } } } @@ -2929,8 +3117,8 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) PDBG("%s %d ird %d ord %d\n", __func__, __LINE__, ep->ird, ep->ord); - cm_id->add_ref(cm_id); ep->com.cm_id = cm_id; + ref_cm_id(&ep->com); ep->com.qp = qp; ref_qp(ep); @@ -2951,23 +3139,27 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, mask, &attrs, 1); if (err) - goto err1; + goto err_deref_cm_id; + + set_bit(STOP_MPA_TIMER, &ep->com.flags); err = send_mpa_reply(ep, conn_param->private_data, conn_param->private_data_len); if (err) - goto err1; + goto err_deref_cm_id; __state_set(&ep->com, FPDU_MODE); established_upcall(ep); mutex_unlock(&ep->com.mutex); c4iw_put_ep(&ep->com); return 0; -err1: - ep->com.cm_id = NULL; - abort_connection(ep, NULL, GFP_KERNEL); - cm_id->rem_ref(cm_id); -err: +err_deref_cm_id: + deref_cm_id(&ep->com); +err_abort: + abort = 1; +err_out: mutex_unlock(&ep->com.mutex); + if (abort) + c4iw_ep_disconnect(ep, 1, GFP_KERNEL); c4iw_put_ep(&ep->com); return err; } @@ -3067,9 +3259,9 @@ int 
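[Editor's note] c4iw_reject_cr() and c4iw_accept_cr() above stop asserting the endpoint state with BUG_ON() and instead validate it up front, failing with -ECONNRESET when a racing close has already moved the state past MPA_REQ_RCVD. A condensed sketch; struct ep and ep_put() are stand-ins:

#include <linux/errno.h>
#include <linux/mutex.h>

enum ep_state { MPA_REQ_RCVD = 1 /* ... */ };
struct ep {
        struct mutex mutex;
        enum ep_state state;
};
extern void ep_put(struct ep *ep);

static int reject_cr(struct ep *ep)
{
        int err = 0;

        mutex_lock(&ep->mutex);
        if (ep->state != MPA_REQ_RCVD) {        /* was: BUG_ON(...) */
                err = -ECONNRESET;              /* fail the call, don't panic */
                goto out;
        }
        /* ... queue the MPA reject ... */
out:
        mutex_unlock(&ep->mutex);
        ep_put(ep);
        return err;
}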
c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if (peer2peer && ep->ord == 0) ep->ord = 1; - cm_id->add_ref(cm_id); - ep->com.dev = dev; ep->com.cm_id = cm_id; + ref_cm_id(&ep->com); + ep->com.dev = dev; ep->com.qp = get_qhp(dev, conn_param->qpn); if (!ep->com.qp) { PDBG("%s qpn 0x%x not found!\n", __func__, conn_param->qpn); @@ -3108,7 +3300,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) /* * Handle loopback requests to INADDR_ANY. */ - if ((__force int)raddr->sin_addr.s_addr == INADDR_ANY) { + if (raddr->sin_addr.s_addr == htonl(INADDR_ANY)) { err = pick_local_ipaddrs(dev, cm_id); if (err) goto fail1; @@ -3176,7 +3368,7 @@ fail2: remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid); cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid); fail1: - cm_id->rem_ref(cm_id); + deref_cm_id(&ep->com); c4iw_put_ep(&ep->com); out: return err; @@ -3270,8 +3462,8 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) goto fail1; } PDBG("%s ep %p\n", __func__, ep); - cm_id->add_ref(cm_id); ep->com.cm_id = cm_id; + ref_cm_id(&ep->com); ep->com.dev = dev; ep->backlog = backlog; memcpy(&ep->com.local_addr, &cm_id->m_local_addr, @@ -3311,7 +3503,7 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid, ep->com.local_addr.ss_family); fail2: - cm_id->rem_ref(cm_id); + deref_cm_id(&ep->com); c4iw_put_ep(&ep->com); fail1: out: @@ -3350,7 +3542,7 @@ int c4iw_destroy_listen(struct iw_cm_id *cm_id) cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid, ep->com.local_addr.ss_family); done: - cm_id->rem_ref(cm_id); + deref_cm_id(&ep->com); c4iw_put_ep(&ep->com); return err; } @@ -3367,6 +3559,12 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) PDBG("%s ep %p state %s, abrupt %d\n", __func__, ep, states[ep->com.state], abrupt); + /* + * Ref the ep here in case we have fatal errors causing the + * ep to be released and freed. 
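[Editor's note] On the htonl(INADDR_ANY) change above: sin_addr.s_addr holds a big-endian (__be32) value, so comparing it against a host-order constant is only type-correct after conversion. INADDR_ANY happens to be zero, so the old test worked by luck (hence the old __force cast to quiet sparse); with any non-symmetric constant the unconverted compare really does break on little-endian hosts, as this standalone program shows:

#include <arpa/inet.h>
#include <stdio.h>

int main(void)
{
        struct in_addr a = { .s_addr = htonl(INADDR_LOOPBACK) };

        /* wrong on little-endian: 0x0100007f != 0x7f000001 */
        printf("raw compare:   %d\n", a.s_addr == INADDR_LOOPBACK);
        /* correct on every byte order */
        printf("htonl compare: %d\n", a.s_addr == htonl(INADDR_LOOPBACK));
        return 0;
}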
+ */ + c4iw_get_ep(&ep->com); + rdev = &ep->com.dev->rdev; if (c4iw_fatal_error(rdev)) { fatal = 1; @@ -3418,10 +3616,30 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) set_bit(EP_DISC_CLOSE, &ep->com.history); ret = send_halfclose(ep, gfp); } - if (ret) + if (ret) { + set_bit(EP_DISC_FAIL, &ep->com.history); + if (!abrupt) { + stop_ep_timer(ep); + close_complete_upcall(ep, -EIO); + } + if (ep->com.qp) { + struct c4iw_qp_attributes attrs; + + attrs.next_state = C4IW_QP_STATE_ERROR; + ret = c4iw_modify_qp(ep->com.qp->rhp, + ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, + &attrs, 1); + if (ret) + pr_err(MOD + "%s - qp <- error failed!\n", + __func__); + } fatal = 1; + } } mutex_unlock(&ep->com.mutex); + c4iw_put_ep(&ep->com); if (fatal) release_ep_resources(ep); return ret; @@ -3676,7 +3894,7 @@ static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb) struct cpl_pass_accept_req *req = (void *)(rss + 1); struct l2t_entry *e; struct dst_entry *dst; - struct c4iw_ep *lep; + struct c4iw_ep *lep = NULL; u16 window; struct port_info *pi; struct net_device *pdev; @@ -3701,7 +3919,7 @@ static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb) */ stid = (__force int) cpu_to_be32((__force u32) rss->hash_val); - lep = (struct c4iw_ep *)lookup_stid(dev->rdev.lldi.tids, stid); + lep = (struct c4iw_ep *)get_ep_from_stid(dev, stid); if (!lep) { PDBG("%s connect request on invalid stid %d\n", __func__, stid); goto reject; @@ -3802,6 +4020,8 @@ static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb) free_dst: dst_release(dst); reject: + if (lep) + c4iw_put_ep(&lep->com); return 0; } @@ -3809,7 +4029,7 @@ reject: * These are the real handlers that are called from a * work queue. */ -static c4iw_handler_func work_handlers[NUM_CPL_CMDS] = { +static c4iw_handler_func work_handlers[NUM_CPL_CMDS + NUM_FAKE_CPLS] = { [CPL_ACT_ESTABLISH] = act_establish, [CPL_ACT_OPEN_RPL] = act_open_rpl, [CPL_RX_DATA] = rx_data, @@ -3825,7 +4045,9 @@ static c4iw_handler_func work_handlers[NUM_CPL_CMDS] = { [CPL_RDMA_TERMINATE] = terminate, [CPL_FW4_ACK] = fw4_ack, [CPL_FW6_MSG] = deferred_fw6_msg, - [CPL_RX_PKT] = rx_pkt + [CPL_RX_PKT] = rx_pkt, + [FAKE_CPL_PUT_EP_SAFE] = _put_ep_safe, + [FAKE_CPL_PASS_PUT_EP_SAFE] = _put_pass_ep_safe }; static void process_timeout(struct c4iw_ep *ep) @@ -3839,11 +4061,12 @@ static void process_timeout(struct c4iw_ep *ep) set_bit(TIMEDOUT, &ep->com.history); switch (ep->com.state) { case MPA_REQ_SENT: - __state_set(&ep->com, ABORTING); connect_reply_upcall(ep, -ETIMEDOUT); break; case MPA_REQ_WAIT: - __state_set(&ep->com, ABORTING); + case MPA_REQ_RCVD: + case MPA_REP_SENT: + case FPDU_MODE: break; case CLOSING: case MORIBUND: @@ -3853,7 +4076,6 @@ static void process_timeout(struct c4iw_ep *ep) ep->com.qp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); } - __state_set(&ep->com, ABORTING); close_complete_upcall(ep, -ETIMEDOUT); break; case ABORTING: @@ -3871,9 +4093,9 @@ static void process_timeout(struct c4iw_ep *ep) __func__, ep, ep->hwtid, ep->com.state); abort = 0; } - if (abort) - abort_connection(ep, NULL, GFP_KERNEL); mutex_unlock(&ep->com.mutex); + if (abort) + c4iw_ep_disconnect(ep, 1, GFP_KERNEL); c4iw_put_ep(&ep->com); } @@ -4006,10 +4228,10 @@ static int peer_abort_intr(struct c4iw_dev *dev, struct sk_buff *skb) { struct cpl_abort_req_rss *req = cplhdr(skb); struct c4iw_ep *ep; - struct tid_info *t = dev->rdev.lldi.tids; unsigned int tid = GET_TID(req); - ep = lookup_tid(t, tid); + ep = get_ep_from_tid(dev, tid); + /* This EP will be dereferenced in peer_abort() */ if 
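[Editor's note] The handler table above is extended by NUM_FAKE_CPLS synthetic opcodes (FAKE_CPL_PUT_EP_SAFE and FAKE_CPL_PASS_PUT_EP_SAFE): when an endpoint's final release must happen from a context where freeing it is unsafe, a fabricated skb is queued so the release runs later on the same ordered work queue as real hardware CPLs. One plausible shape of such a handler; stashing the pointer at the start of skb->cb and the helper names are assumptions, not the driver's exact code:

#include <linux/skbuff.h>

struct dev;
struct ep;
extern void release_ep(struct ep *ep);

static int _put_ep_safe_sketch(struct dev *dev, struct sk_buff *skb)
{
        struct ep *ep = *(struct ep **)skb->cb; /* stashed by the sender */

        release_ep(ep);         /* safe here: process context, no locks held */
        kfree_skb(skb);
        return 0;
}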
(!ep) { printk(KERN_WARNING MOD "Abort on non-existent endpoint, tid %d\n", tid); @@ -4020,24 +4242,13 @@ static int peer_abort_intr(struct c4iw_dev *dev, struct sk_buff *skb) PDBG("%s Negative advice on abort- tid %u status %d (%s)\n", __func__, ep->hwtid, req->status, neg_adv_str(req->status)); - ep->stats.abort_neg_adv++; - dev->rdev.stats.neg_adv++; - kfree_skb(skb); - return 0; + goto out; } PDBG("%s ep %p tid %u state %u\n", __func__, ep, ep->hwtid, ep->com.state); - /* - * Wake up any threads in rdma_init() or rdma_fini(). - * However, if we are on MPAv2 and want to retry with MPAv1 - * then, don't wake up yet. - */ - if (mpa_rev == 2 && !ep->tried_with_mpa_v1) { - if (ep->com.state != MPA_REQ_SENT) - c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); - } else - c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); + c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); +out: sched(dev, skb); return 0; } diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index df43f871ab61..f6f34a75af27 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -755,6 +755,7 @@ enum c4iw_ep_flags { CLOSE_SENT = 3, TIMEOUT = 4, QP_REFERENCED = 5, + STOP_MPA_TIMER = 7, }; enum c4iw_ep_history { @@ -779,7 +780,13 @@ enum c4iw_ep_history { EP_DISC_ABORT = 18, CONN_RPL_UPCALL = 19, ACT_RETRY_NOMEM = 20, - ACT_RETRY_INUSE = 21 + ACT_RETRY_INUSE = 21, + CLOSE_CON_RPL = 22, + EP_DISC_FAIL = 24, + QP_REFED = 25, + QP_DEREFED = 26, + CM_ID_REFED = 27, + CM_ID_DEREFED = 28, }; struct c4iw_ep_common { @@ -917,9 +924,8 @@ void c4iw_qp_rem_ref(struct ib_qp *qp); struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); -int c4iw_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int sg_nents); +int c4iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); int c4iw_dealloc_mw(struct ib_mw *mw); struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index 008be07d5604..55d0651ee4de 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -86,8 +86,9 @@ static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr, (wait ? FW_WR_COMPL_F : 0)); req->wr.wr_lo = wait ? 
(__force __be64)(unsigned long) &wr_wait : 0L; req->wr.wr_mid = cpu_to_be32(FW_WR_LEN16_V(DIV_ROUND_UP(wr_len, 16))); - req->cmd = cpu_to_be32(ULPTX_CMD_V(ULP_TX_MEM_WRITE)); - req->cmd |= cpu_to_be32(T5_ULP_MEMIO_ORDER_V(1)); + req->cmd = cpu_to_be32(ULPTX_CMD_V(ULP_TX_MEM_WRITE) | + T5_ULP_MEMIO_ORDER_V(1) | + T5_ULP_MEMIO_FID_V(rdev->lldi.rxq_ids[0])); req->dlen = cpu_to_be32(ULP_MEMIO_DATA_LEN_V(len>>5)); req->len16 = cpu_to_be32(DIV_ROUND_UP(wr_len-sizeof(req->wr), 16)); req->lock_addr = cpu_to_be32(ULP_MEMIO_ADDR_V(addr)); @@ -690,15 +691,14 @@ static int c4iw_set_page(struct ib_mr *ibmr, u64 addr) return 0; } -int c4iw_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int sg_nents) +int c4iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) { struct c4iw_mr *mhp = to_c4iw_mr(ibmr); mhp->mpl_len = 0; - return ib_sg_to_pages(ibmr, sg, sg_nents, c4iw_set_page); + return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, c4iw_set_page); } int c4iw_dereg_mr(struct ib_mr *ib_mr) diff --git a/drivers/infiniband/hw/i40iw/i40iw.h b/drivers/infiniband/hw/i40iw/i40iw.h index 819767681445..8b9532034558 100644 --- a/drivers/infiniband/hw/i40iw/i40iw.h +++ b/drivers/infiniband/hw/i40iw/i40iw.h @@ -50,8 +50,6 @@ #include <rdma/ib_pack.h> #include <rdma/rdma_cm.h> #include <rdma/iw_cm.h> -#include <rdma/iw_portmap.h> -#include <rdma/rdma_netlink.h> #include <crypto/hash.h> #include "i40iw_status.h" @@ -254,6 +252,7 @@ struct i40iw_device { u32 arp_table_size; u32 next_arp_index; spinlock_t resource_lock; /* hw resource access */ + spinlock_t qptable_lock; u32 vendor_id; u32 vendor_part_id; u32 of_device_registered; @@ -392,7 +391,7 @@ void i40iw_flush_wqes(struct i40iw_device *iwdev, void i40iw_manage_arp_cache(struct i40iw_device *iwdev, unsigned char *mac_addr, - __be32 *ip_addr, + u32 *ip_addr, bool ipv4, u32 action); @@ -550,7 +549,7 @@ enum i40iw_status_code i40iw_hw_flush_wqes(struct i40iw_device *iwdev, struct i40iw_qp_flush_info *info, bool wait); -void i40iw_copy_ip_ntohl(u32 *dst, u32 *src); +void i40iw_copy_ip_ntohl(u32 *dst, __be32 *src); struct ib_mr *i40iw_reg_phys_mr(struct ib_pd *ib_pd, u64 addr, u64 size, diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c index 38f917a6c778..d2fa72516960 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c @@ -771,6 +771,7 @@ static void i40iw_build_mpa_v2(struct i40iw_cm_node *cm_node, { struct ietf_mpa_v2 *mpa_frame = (struct ietf_mpa_v2 *)start_addr; struct ietf_rtr_msg *rtr_msg = &mpa_frame->rtr_msg; + u16 ctrl_ird, ctrl_ord; /* initialize the upper 5 bytes of the frame */ i40iw_build_mpa_v1(cm_node, start_addr, mpa_key); @@ -779,38 +780,38 @@ static void i40iw_build_mpa_v2(struct i40iw_cm_node *cm_node, /* initialize RTR msg */ if (cm_node->mpav2_ird_ord == IETF_NO_IRD_ORD) { - rtr_msg->ctrl_ird = IETF_NO_IRD_ORD; - rtr_msg->ctrl_ord = IETF_NO_IRD_ORD; + ctrl_ird = IETF_NO_IRD_ORD; + ctrl_ord = IETF_NO_IRD_ORD; } else { - rtr_msg->ctrl_ird = (cm_node->ird_size > IETF_NO_IRD_ORD) ? + ctrl_ird = (cm_node->ird_size > IETF_NO_IRD_ORD) ? IETF_NO_IRD_ORD : cm_node->ird_size; - rtr_msg->ctrl_ord = (cm_node->ord_size > IETF_NO_IRD_ORD) ? + ctrl_ord = (cm_node->ord_size > IETF_NO_IRD_ORD) ? 
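[Editor's note] The c4iw_map_mr_sg() hunk above tracks the 4.7 core API change that threads a new sg_offset parameter down to ib_sg_to_pages(), letting a registration start mid-page in the first scatterlist entry (needed by the new generic RDMA READ/WRITE API). A ULP-side sketch of the matching ib_map_mr_sg() call; register_span() and its offset argument are illustrative:

#include <rdma/ib_verbs.h>

static int register_span(struct ib_mr *mr, struct scatterlist *sgl,
                         int sg_nents, unsigned int first_byte_offset)
{
        unsigned int sg_offset = first_byte_offset;
        int n;

        n = ib_map_mr_sg(mr, sgl, sg_nents, &sg_offset, PAGE_SIZE);
        if (n < 0)
                return n;       /* mapping error */
        if (n < sg_nents) {
                /* only part of the list fit this MR; the remainder
                 * goes into a second registration */
        }
        return n;
}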
IETF_NO_IRD_ORD : cm_node->ord_size; } - rtr_msg->ctrl_ird |= IETF_PEER_TO_PEER; - rtr_msg->ctrl_ird |= IETF_FLPDU_ZERO_LEN; + ctrl_ird |= IETF_PEER_TO_PEER; + ctrl_ird |= IETF_FLPDU_ZERO_LEN; switch (mpa_key) { case MPA_KEY_REQUEST: - rtr_msg->ctrl_ord |= IETF_RDMA0_WRITE; - rtr_msg->ctrl_ord |= IETF_RDMA0_READ; + ctrl_ord |= IETF_RDMA0_WRITE; + ctrl_ord |= IETF_RDMA0_READ; break; case MPA_KEY_REPLY: switch (cm_node->send_rdma0_op) { case SEND_RDMA_WRITE_ZERO: - rtr_msg->ctrl_ord |= IETF_RDMA0_WRITE; + ctrl_ord |= IETF_RDMA0_WRITE; break; case SEND_RDMA_READ_ZERO: - rtr_msg->ctrl_ord |= IETF_RDMA0_READ; + ctrl_ord |= IETF_RDMA0_READ; break; } break; default: break; } - rtr_msg->ctrl_ird = htons(rtr_msg->ctrl_ird); - rtr_msg->ctrl_ord = htons(rtr_msg->ctrl_ord); + rtr_msg->ctrl_ird = htons(ctrl_ird); + rtr_msg->ctrl_ord = htons(ctrl_ord); } /** @@ -2107,7 +2108,7 @@ static bool i40iw_ipv6_is_loopback(u32 *loc_addr, u32 *rem_addr) struct in6_addr raddr6; i40iw_copy_ip_htonl(raddr6.in6_u.u6_addr32, rem_addr); - return (!memcmp(loc_addr, rem_addr, 16) || ipv6_addr_loopback(&raddr6)); + return !memcmp(loc_addr, rem_addr, 16) || ipv6_addr_loopback(&raddr6); } /** @@ -2160,7 +2161,7 @@ static struct i40iw_cm_node *i40iw_make_cm_node( cm_node->tcp_cntxt.rcv_wnd = I40IW_CM_DEFAULT_RCV_WND_SCALED >> I40IW_CM_DEFAULT_RCV_WND_SCALE; ts = current_kernel_time(); - cm_node->tcp_cntxt.loc_seq_num = htonl(ts.tv_nsec); + cm_node->tcp_cntxt.loc_seq_num = ts.tv_nsec; cm_node->tcp_cntxt.mss = iwdev->mss; cm_node->iwdev = iwdev; @@ -2234,7 +2235,7 @@ static void i40iw_rem_ref_cm_node(struct i40iw_cm_node *cm_node) if (cm_node->listener) { i40iw_dec_refcnt_listen(cm_core, cm_node->listener, 0, true); } else { - if (!i40iw_listen_port_in_use(cm_core, htons(cm_node->loc_port)) && + if (!i40iw_listen_port_in_use(cm_core, cm_node->loc_port) && cm_node->apbvt_set && cm_node->iwdev) { i40iw_manage_apbvt(cm_node->iwdev, cm_node->loc_port, @@ -2852,7 +2853,6 @@ static struct i40iw_cm_node *i40iw_create_cm_node( void *private_data, struct i40iw_cm_info *cm_info) { - int ret; struct i40iw_cm_node *cm_node; struct i40iw_cm_listener *loopback_remotelistener; struct i40iw_cm_node *loopback_remotenode; @@ -2922,30 +2922,6 @@ static struct i40iw_cm_node *i40iw_create_cm_node( memcpy(cm_node->pdata_buf, private_data, private_data_len); cm_node->state = I40IW_CM_STATE_SYN_SENT; - ret = i40iw_send_syn(cm_node, 0); - - if (ret) { - if (cm_node->ipv4) - i40iw_debug(cm_node->dev, - I40IW_DEBUG_CM, - "Api - connect() FAILED: dest addr=%pI4", - cm_node->rem_addr); - else - i40iw_debug(cm_node->dev, I40IW_DEBUG_CM, - "Api - connect() FAILED: dest addr=%pI6", - cm_node->rem_addr); - i40iw_rem_ref_cm_node(cm_node); - cm_node = NULL; - } - - if (cm_node) - i40iw_debug(cm_node->dev, - I40IW_DEBUG_CM, - "Api - connect(): port=0x%04x, cm_node=%p, cm_id = %p.\n", - cm_node->rem_port, - cm_node, - cm_node->cm_id); - return cm_node; } @@ -3266,11 +3242,13 @@ static void i40iw_init_tcp_ctx(struct i40iw_cm_node *cm_node, tcp_info->dest_ip_addr3 = cpu_to_le32(cm_node->rem_addr[0]); tcp_info->local_ipaddr3 = cpu_to_le32(cm_node->loc_addr[0]); - tcp_info->arp_idx = cpu_to_le32(i40iw_arp_table(iwqp->iwdev, - &tcp_info->dest_ip_addr3, - true, - NULL, - I40IW_ARP_RESOLVE)); + tcp_info->arp_idx = + cpu_to_le16((u16)i40iw_arp_table( + iwqp->iwdev, + &tcp_info->dest_ip_addr3, + true, + NULL, + I40IW_ARP_RESOLVE)); } else { tcp_info->src_port = cpu_to_le16(cm_node->loc_port); tcp_info->dst_port = cpu_to_le16(cm_node->rem_port); @@ -3282,12 +3260,13 @@ 
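[Editor's note] The i40iw_build_mpa_v2() hunks just above fix a byte-order bug: flag bits were being OR-ed into ctrl_ird/ctrl_ord after those fields had already been htons()-converted. The fix accumulates in host-order locals and converts exactly once on store. The safe shape as a standalone example; the flag value is illustrative:

#include <arpa/inet.h>
#include <stdint.h>

static void set_ctrl(uint16_t *wire_field, uint16_t ird, int peer_to_peer)
{
        uint16_t v = ird;               /* host order throughout */

        if (peer_to_peer)
                v |= 0x8000;            /* illustrative flag bit */
        *wire_field = htons(v);         /* single conversion at the end */
}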
static void i40iw_init_tcp_ctx(struct i40iw_cm_node *cm_node, tcp_info->local_ipaddr1 = cpu_to_le32(cm_node->loc_addr[1]); tcp_info->local_ipaddr2 = cpu_to_le32(cm_node->loc_addr[2]); tcp_info->local_ipaddr3 = cpu_to_le32(cm_node->loc_addr[3]); - tcp_info->arp_idx = cpu_to_le32(i40iw_arp_table( - iwqp->iwdev, - &tcp_info->dest_ip_addr0, - false, - NULL, - I40IW_ARP_RESOLVE)); + tcp_info->arp_idx = + cpu_to_le16((u16)i40iw_arp_table( + iwqp->iwdev, + &tcp_info->dest_ip_addr0, + false, + NULL, + I40IW_ARP_RESOLVE)); } } @@ -3564,7 +3543,6 @@ int i40iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) struct i40iw_cm_node *cm_node; struct ib_qp_attr attr; int passive_state; - struct i40iw_ib_device *iwibdev; struct ib_mr *ibmr; struct i40iw_pd *iwpd; u16 buf_len = 0; @@ -3627,7 +3605,6 @@ int i40iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) !i40iw_ipv4_is_loopback(cm_node->loc_addr[0], cm_node->rem_addr[0])) || (!cm_node->ipv4 && !i40iw_ipv6_is_loopback(cm_node->loc_addr, cm_node->rem_addr))) { - iwibdev = iwdev->iwibdev; iwpd = iwqp->iwpd; tagged_offset = (uintptr_t)iwqp->ietf_mem.va; ibmr = i40iw_reg_phys_mr(&iwpd->ibpd, @@ -3752,6 +3729,7 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) struct sockaddr_in *raddr; struct sockaddr_in6 *laddr6; struct sockaddr_in6 *raddr6; + bool qhash_set = false; int apbvt_set = 0; enum i40iw_status_code status; @@ -3810,6 +3788,7 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) true); if (status) return -EINVAL; + qhash_set = true; } status = i40iw_manage_apbvt(iwdev, cm_info.loc_port, I40IW_MANAGE_APBVT_ADD); if (status) { @@ -3828,23 +3807,8 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) conn_param->private_data_len, (void *)conn_param->private_data, &cm_info); - if (!cm_node) { - i40iw_manage_qhash(iwdev, - &cm_info, - I40IW_QHASH_TYPE_TCP_ESTABLISHED, - I40IW_QHASH_MANAGE_TYPE_DELETE, - NULL, - false); - - if (apbvt_set && !i40iw_listen_port_in_use(&iwdev->cm_core, - cm_info.loc_port)) - i40iw_manage_apbvt(iwdev, - cm_info.loc_port, - I40IW_MANAGE_APBVT_DEL); - cm_id->rem_ref(cm_id); - iwdev->cm_core.stats_connect_errs++; - return -ENOMEM; - } + if (!cm_node) + goto err; i40iw_record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord); if (cm_node->send_rdma0_op == SEND_RDMA_READ_ZERO && @@ -3852,12 +3816,54 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) cm_node->ord_size = 1; cm_node->apbvt_set = apbvt_set; - cm_node->qhash_set = true; + cm_node->qhash_set = qhash_set; iwqp->cm_node = cm_node; cm_node->iwqp = iwqp; iwqp->cm_id = cm_id; i40iw_add_ref(&iwqp->ibqp); + + if (cm_node->state == I40IW_CM_STATE_SYN_SENT) { + if (i40iw_send_syn(cm_node, 0)) { + i40iw_rem_ref_cm_node(cm_node); + goto err; + } + } + + i40iw_debug(cm_node->dev, + I40IW_DEBUG_CM, + "Api - connect(): port=0x%04x, cm_node=%p, cm_id = %p.\n", + cm_node->rem_port, + cm_node, + cm_node->cm_id); return 0; + +err: + if (cm_node) { + if (cm_node->ipv4) + i40iw_debug(cm_node->dev, + I40IW_DEBUG_CM, + "Api - connect() FAILED: dest addr=%pI4", + cm_node->rem_addr); + else + i40iw_debug(cm_node->dev, I40IW_DEBUG_CM, + "Api - connect() FAILED: dest addr=%pI6", + cm_node->rem_addr); + } + i40iw_manage_qhash(iwdev, + &cm_info, + I40IW_QHASH_TYPE_TCP_ESTABLISHED, + I40IW_QHASH_MANAGE_TYPE_DELETE, + NULL, + false); + + if (apbvt_set && !i40iw_listen_port_in_use(&iwdev->cm_core, + cm_info.loc_port)) + 
i40iw_manage_apbvt(iwdev, + cm_info.loc_port, + I40IW_MANAGE_APBVT_DEL); + cm_id->rem_ref(cm_id); + iwdev->cm_core.stats_connect_errs++; + return -ENOMEM; } /** diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.h b/drivers/infiniband/hw/i40iw/i40iw_cm.h index 5f8ceb4a8e84..e9046d9f9645 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.h +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.h @@ -1,6 +1,6 @@ /******************************************************************************* * -* Copyright (c) 2015 Intel Corporation. All rights reserved. +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -291,8 +291,6 @@ struct i40iw_cm_listener { u8 loc_mac[ETH_ALEN]; u32 loc_addr[4]; u16 loc_port; - u32 map_loc_addr[4]; - u16 map_loc_port; struct iw_cm_id *cm_id; atomic_t ref_count; struct i40iw_device *iwdev; @@ -317,8 +315,6 @@ struct i40iw_kmem_info { struct i40iw_cm_node { u32 loc_addr[4], rem_addr[4]; u16 loc_port, rem_port; - u32 map_loc_addr[4], map_rem_addr[4]; - u16 map_loc_port, map_rem_port; u16 vlan_id; enum i40iw_cm_node_state state; u8 loc_mac[ETH_ALEN]; @@ -370,10 +366,6 @@ struct i40iw_cm_info { u16 rem_port; u32 loc_addr[4]; u32 rem_addr[4]; - u16 map_loc_port; - u16 map_rem_port; - u32 map_loc_addr[4]; - u32 map_rem_addr[4]; u16 vlan_id; int backlog; u16 user_pri; diff --git a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c index f05802bf6ca0..2c4b4d072d6a 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c +++ b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c @@ -114,16 +114,21 @@ static enum i40iw_status_code i40iw_cqp_poll_registers( * i40iw_sc_parse_fpm_commit_buf - parse fpm commit buffer * @buf: ptr to fpm commit buffer * @info: ptr to i40iw_hmc_obj_info struct + * @sd: number of SDs for HMC objects * * parses fpm commit info and copy base value * of hmc objects in hmc_info */ static enum i40iw_status_code i40iw_sc_parse_fpm_commit_buf( u64 *buf, - struct i40iw_hmc_obj_info *info) + struct i40iw_hmc_obj_info *info, + u32 *sd) { u64 temp; + u64 size; + u64 base = 0; u32 i, j; + u32 k = 0; u32 low; /* copy base values in obj_info */ @@ -131,10 +136,20 @@ static enum i40iw_status_code i40iw_sc_parse_fpm_commit_buf( i <= I40IW_HMC_IW_PBLE; i++, j += 8) { get_64bit_val(buf, j, &temp); info[i].base = RS_64_1(temp, 32) * 512; + if (info[i].base > base) { + base = info[i].base; + k = i; + } low = (u32)(temp); if (low) info[i].cnt = low; } + size = info[k].cnt * info[k].size + info[k].base; + if (size & 0x1FFFFF) + *sd = (u32)((size >> 21) + 1); /* add 1 for remainder */ + else + *sd = (u32)(size >> 21); + return 0; } @@ -2909,6 +2924,65 @@ static enum i40iw_status_code i40iw_sc_mw_alloc( } /** + * i40iw_sc_mr_fast_register - Posts RDMA fast register mr WR to iwarp qp + * @qp: sc qp struct + * @info: fast mr info + * @post_sq: flag for cqp db to ring + */ +enum i40iw_status_code i40iw_sc_mr_fast_register( + struct i40iw_sc_qp *qp, + struct i40iw_fast_reg_stag_info *info, + bool post_sq) +{ + u64 temp, header; + u64 *wqe; + u32 wqe_idx; + + wqe = i40iw_qp_get_next_send_wqe(&qp->qp_uk, &wqe_idx, I40IW_QP_WQE_MIN_SIZE, + 0, info->wr_id); + if (!wqe) + return I40IW_ERR_QP_TOOMANY_WRS_POSTED; + + i40iw_debug(qp->dev, I40IW_DEBUG_MR, "%s: wr_id[%llxh] wqe_idx[%04d] location[%p]\n", + __func__, info->wr_id, wqe_idx, + &qp->qp_uk.sq_wrtrk_array[wqe_idx].wrid); + temp = (info->addr_type == I40IW_ADDR_TYPE_VA_BASED) 
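[Editor's note] parse_fpm_commit_buf() above now also reports the segment-descriptor count: one SD maps 2 MB of HMC backing memory, so the count is the end address of the highest object rounded up to 2 MB units, which is exactly what the >> 21 shift and the 0x1FFFFF remainder test compute. A standalone check of that arithmetic:

#include <stdint.h>
#include <stdio.h>

static uint32_t sd_count(uint64_t size)
{
        /* one SD per 2 MB (1 << 21), plus one for any remainder */
        return (uint32_t)((size >> 21) + ((size & 0x1FFFFF) ? 1 : 0));
}

int main(void)
{
        printf("%u\n", sd_count(4ull << 20));           /* 4 MB     -> 2 */
        printf("%u\n", sd_count((4ull << 20) + 1));     /* 4 MB + 1 -> 3 */
        return 0;
}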
? (uintptr_t)info->va : info->fbo; + set_64bit_val(wqe, 0, temp); + + temp = RS_64(info->first_pm_pbl_index >> 16, I40IWQPSQ_FIRSTPMPBLIDXHI); + set_64bit_val(wqe, + 8, + LS_64(temp, I40IWQPSQ_FIRSTPMPBLIDXHI) | + LS_64(info->reg_addr_pa >> I40IWQPSQ_PBLADDR_SHIFT, I40IWQPSQ_PBLADDR)); + + set_64bit_val(wqe, + 16, + info->total_len | + LS_64(info->first_pm_pbl_index, I40IWQPSQ_FIRSTPMPBLIDXLO)); + + header = LS_64(info->stag_key, I40IWQPSQ_STAGKEY) | + LS_64(info->stag_idx, I40IWQPSQ_STAGINDEX) | + LS_64(I40IWQP_OP_FAST_REGISTER, I40IWQPSQ_OPCODE) | + LS_64(info->chunk_size, I40IWQPSQ_LPBLSIZE) | + LS_64(info->page_size, I40IWQPSQ_HPAGESIZE) | + LS_64(info->access_rights, I40IWQPSQ_STAGRIGHTS) | + LS_64(info->addr_type, I40IWQPSQ_VABASEDTO) | + LS_64(info->read_fence, I40IWQPSQ_READFENCE) | + LS_64(info->local_fence, I40IWQPSQ_LOCALFENCE) | + LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) | + LS_64(qp->qp_uk.swqe_polarity, I40IWQPSQ_VALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(qp->dev, I40IW_DEBUG_WQE, "FAST_REG WQE", + wqe, I40IW_QP_WQE_MIN_SIZE); + + if (post_sq) + i40iw_qp_post_wr(&qp->qp_uk); + return 0; +} + +/** * i40iw_sc_send_lsmm - send last streaming mode message * @qp: sc qp struct * @lsmm_buf: buffer with lsmm message @@ -3147,7 +3221,7 @@ enum i40iw_status_code i40iw_sc_init_iw_hmc(struct i40iw_sc_dev *dev, u8 hmc_fn_ i40iw_cqp_commit_fpm_values_cmd(dev, &query_fpm_mem, hmc_fn_id); /* parse the fpm_commit_buf and fill hmc obj info */ - i40iw_sc_parse_fpm_commit_buf((u64 *)query_fpm_mem.va, hmc_info->hmc_obj); + i40iw_sc_parse_fpm_commit_buf((u64 *)query_fpm_mem.va, hmc_info->hmc_obj, &hmc_info->sd_table.sd_cnt); mem_size = sizeof(struct i40iw_hmc_sd_entry) * (hmc_info->sd_table.sd_cnt + hmc_info->first_sd_index); ret_code = i40iw_allocate_virt_mem(dev->hw, &virt_mem, mem_size); @@ -3221,7 +3295,9 @@ static enum i40iw_status_code i40iw_sc_configure_iw_fpm(struct i40iw_sc_dev *dev /* parse the fpm_commit_buf and fill hmc obj info */ if (!ret_code) - ret_code = i40iw_sc_parse_fpm_commit_buf(dev->fpm_commit_buf, hmc_info->hmc_obj); + ret_code = i40iw_sc_parse_fpm_commit_buf(dev->fpm_commit_buf, + hmc_info->hmc_obj, + &hmc_info->sd_table.sd_cnt); i40iw_debug_buf(dev, I40IW_DEBUG_HMC, "COMMIT FPM BUFFER", commit_fpm_mem.va, I40IW_COMMIT_FPM_BUF_SIZE); @@ -3469,6 +3545,40 @@ static bool i40iw_ring_full(struct i40iw_sc_cqp *cqp) } /** + * i40iw_est_sd - returns approximate number of SDs for HMC + * @dev: sc device struct + * @hmc_info: hmc structure, size and count for HMC objects + */ +static u64 i40iw_est_sd(struct i40iw_sc_dev *dev, struct i40iw_hmc_info *hmc_info) +{ + int i; + u64 size = 0; + u64 sd; + + for (i = I40IW_HMC_IW_QP; i < I40IW_HMC_IW_PBLE; i++) + size += hmc_info->hmc_obj[i].cnt * hmc_info->hmc_obj[i].size; + + if (dev->is_pf) + size += hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt * hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].size; + + if (size & 0x1FFFFF) + sd = (size >> 21) + 1; /* add 1 for remainder */ + else + sd = size >> 21; + + if (!dev->is_pf) { + /* 2MB alignment for VF PBLE HMC */ + size = hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt * hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].size; + if (size & 0x1FFFFF) + sd += (size >> 21) + 1; /* add 1 for remainder */ + else + sd += size >> 21; + } + + return sd; +} + +/** * i40iw_config_fpm_values - configure HMC objects * @dev: sc device struct * @qp_count: desired qp count @@ -3479,7 +3589,7 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_ u32 i, mem_size; u32 qpwantedoriginal, 
qpwanted, mrwanted, pblewanted; u32 powerof2; - u64 sd_needed, bytes_needed; + u64 sd_needed; u32 loop_count = 0; struct i40iw_hmc_info *hmc_info; @@ -3497,23 +3607,15 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_ return ret_code; } - bytes_needed = 0; - for (i = I40IW_HMC_IW_QP; i < I40IW_HMC_IW_MAX; i++) { + for (i = I40IW_HMC_IW_QP; i < I40IW_HMC_IW_MAX; i++) hmc_info->hmc_obj[i].cnt = hmc_info->hmc_obj[i].max_cnt; - bytes_needed += - (hmc_info->hmc_obj[i].max_cnt) * (hmc_info->hmc_obj[i].size); - i40iw_debug(dev, I40IW_DEBUG_HMC, - "%s i[%04d] max_cnt[0x%04X] size[0x%04llx]\n", - __func__, i, hmc_info->hmc_obj[i].max_cnt, - hmc_info->hmc_obj[i].size); - } - sd_needed = (bytes_needed / I40IW_HMC_DIRECT_BP_SIZE) + 1; /* round up */ + sd_needed = i40iw_est_sd(dev, hmc_info); i40iw_debug(dev, I40IW_DEBUG_HMC, "%s: FW initial max sd_count[%08lld] first_sd_index[%04d]\n", __func__, sd_needed, hmc_info->first_sd_index); i40iw_debug(dev, I40IW_DEBUG_HMC, - "%s: bytes_needed=0x%llx sd count %d where max sd is %d\n", - __func__, bytes_needed, hmc_info->sd_table.sd_cnt, + "%s: sd count %d where max sd is %d\n", + __func__, hmc_info->sd_table.sd_cnt, hmc_fpm_misc->max_sds); qpwanted = min(qp_count, hmc_info->hmc_obj[I40IW_HMC_IW_QP].max_cnt); @@ -3555,11 +3657,7 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_ hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt = pblewanted; /* How much memory is needed for all the objects. */ - bytes_needed = 0; - for (i = I40IW_HMC_IW_QP; i < I40IW_HMC_IW_MAX; i++) - bytes_needed += - (hmc_info->hmc_obj[i].cnt) * (hmc_info->hmc_obj[i].size); - sd_needed = (bytes_needed / I40IW_HMC_DIRECT_BP_SIZE) + 1; + sd_needed = i40iw_est_sd(dev, hmc_info); if ((loop_count > 1000) || ((!(loop_count % 10)) && (qpwanted > qpwantedoriginal * 2 / 3))) { @@ -3580,15 +3678,7 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_ pblewanted -= FPM_MULTIPLIER * 1000; } while (sd_needed > hmc_fpm_misc->max_sds && loop_count < 2000); - bytes_needed = 0; - for (i = I40IW_HMC_IW_QP; i < I40IW_HMC_IW_MAX; i++) { - bytes_needed += (hmc_info->hmc_obj[i].cnt) * (hmc_info->hmc_obj[i].size); - i40iw_debug(dev, I40IW_DEBUG_HMC, - "%s i[%04d] cnt[0x%04x] size[0x%04llx]\n", - __func__, i, hmc_info->hmc_obj[i].cnt, - hmc_info->hmc_obj[i].size); - } - sd_needed = (bytes_needed / I40IW_HMC_DIRECT_BP_SIZE) + 1; /* round up not truncate. 
*/ + sd_needed = i40iw_est_sd(dev, hmc_info); i40iw_debug(dev, I40IW_DEBUG_HMC, "loop_cnt=%d, sd_needed=%lld, qpcnt = %d, cqcnt=%d, mrcnt=%d, pblecnt=%d\n", @@ -3606,8 +3696,6 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_ return ret_code; } - hmc_info->sd_table.sd_cnt = (u32)sd_needed; - mem_size = sizeof(struct i40iw_hmc_sd_entry) * (hmc_info->sd_table.sd_cnt + hmc_info->first_sd_index + 1); ret_code = i40iw_allocate_virt_mem(dev->hw, &virt_mem, mem_size); @@ -3911,11 +3999,11 @@ enum i40iw_status_code i40iw_process_bh(struct i40iw_sc_dev *dev) */ static u32 i40iw_iwarp_opcode(struct i40iw_aeqe_info *info, u8 *pkt) { - u16 *mpa; + __be16 *mpa; u32 opcode = 0xffffffff; if (info->q2_data_written) { - mpa = (u16 *)pkt; + mpa = (__be16 *)pkt; opcode = ntohs(mpa[1]) & 0xf; } return opcode; @@ -3977,7 +4065,7 @@ static int i40iw_bld_terminate_hdr(struct i40iw_sc_qp *qp, if (info->q2_data_written) { /* Use data from offending packet to fill in ddp & rdma hdrs */ pkt = i40iw_locate_mpa(pkt); - ddp_seg_len = ntohs(*(u16 *)pkt); + ddp_seg_len = ntohs(*(__be16 *)pkt); if (ddp_seg_len) { copy_len = 2; termhdr->hdrct = DDP_LEN_FLAG; @@ -4188,13 +4276,13 @@ void i40iw_terminate_connection(struct i40iw_sc_qp *qp, struct i40iw_aeqe_info * void i40iw_terminate_received(struct i40iw_sc_qp *qp, struct i40iw_aeqe_info *info) { u8 *pkt = qp->q2_buf + Q2_BAD_FRAME_OFFSET; - u32 *mpa; + __be32 *mpa; u8 ddp_ctl; u8 rdma_ctl; u16 aeq_id = 0; struct i40iw_terminate_hdr *termhdr; - mpa = (u32 *)i40iw_locate_mpa(pkt); + mpa = (__be32 *)i40iw_locate_mpa(pkt); if (info->q2_data_written) { /* did not validate the frame - do it now */ ddp_ctl = (ntohl(mpa[0]) >> 8) & 0xff; @@ -4559,17 +4647,18 @@ static struct i40iw_pd_ops iw_pd_ops = { }; static struct i40iw_priv_qp_ops iw_priv_qp_ops = { - i40iw_sc_qp_init, - i40iw_sc_qp_create, - i40iw_sc_qp_modify, - i40iw_sc_qp_destroy, - i40iw_sc_qp_flush_wqes, - i40iw_sc_qp_upload_context, - i40iw_sc_qp_setctx, - i40iw_sc_send_lsmm, - i40iw_sc_send_lsmm_nostag, - i40iw_sc_send_rtt, - i40iw_sc_post_wqe0, + .qp_init = i40iw_sc_qp_init, + .qp_create = i40iw_sc_qp_create, + .qp_modify = i40iw_sc_qp_modify, + .qp_destroy = i40iw_sc_qp_destroy, + .qp_flush_wqes = i40iw_sc_qp_flush_wqes, + .qp_upload_context = i40iw_sc_qp_upload_context, + .qp_setctx = i40iw_sc_qp_setctx, + .qp_send_lsmm = i40iw_sc_send_lsmm, + .qp_send_lsmm_nostag = i40iw_sc_send_lsmm_nostag, + .qp_send_rtt = i40iw_sc_send_rtt, + .qp_post_wqe0 = i40iw_sc_post_wqe0, + .iw_mr_fast_register = i40iw_sc_mr_fast_register }; static struct i40iw_priv_cq_ops iw_priv_cq_ops = { diff --git a/drivers/infiniband/hw/i40iw/i40iw_d.h b/drivers/infiniband/hw/i40iw/i40iw_d.h index aab88d65f805..bd942da91a27 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_d.h +++ b/drivers/infiniband/hw/i40iw/i40iw_d.h @@ -1290,7 +1290,7 @@ /* wqe size considering 32 bytes per wqe*/ #define I40IWQP_SW_MIN_WQSIZE 4 /* 128 bytes */ -#define I40IWQP_SW_MAX_WQSIZE 16384 /* 524288 bytes */ +#define I40IWQP_SW_MAX_WQSIZE 2048 /* 2048 bytes */ #define I40IWQP_OP_RDMA_WRITE 0 #define I40IWQP_OP_RDMA_READ 1 @@ -1512,6 +1512,8 @@ enum i40iw_alignment { I40IW_SD_BUF_ALIGNMENT = 0x100 }; +#define I40IW_WQE_SIZE_64 64 + #define I40IW_QP_WQE_MIN_SIZE 32 #define I40IW_QP_WQE_MAX_SIZE 128 diff --git a/drivers/infiniband/hw/i40iw/i40iw_hw.c b/drivers/infiniband/hw/i40iw/i40iw_hw.c index 9fd302425563..3ee0cad96bc6 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_hw.c +++ b/drivers/infiniband/hw/i40iw/i40iw_hw.c @@ -106,7 +106,9 @@ u32 
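[Editor's note] The iw_priv_qp_ops hunk above converts a positional initializer list to designated initializers. That matters here because the struct is growing a new iw_mr_fast_register member: with positional initialization, adding or reordering members can silently bind a callback to the wrong slot, while designated initializers are checked by name. A minimal illustration:

struct ops {
        int (*create)(void);
        int (*destroy)(void);
        int (*fast_register)(void);     /* newly added member */
};

static int do_create(void)  { return 0; }
static int do_destroy(void) { return 0; }

static const struct ops my_ops = {
        .create  = do_create,           /* bound by name, not position */
        .destroy = do_destroy,
        /* .fast_register can be added anywhere without reshuffling */
};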
i40iw_initialize_hw_resources(struct i40iw_device *iwdev) set_bit(2, iwdev->allocated_pds); spin_lock_init(&iwdev->resource_lock); - mrdrvbits = 24 - get_count_order(iwdev->max_mr); + spin_lock_init(&iwdev->qptable_lock); + /* stag index mask has a minimum of 14 bits */ + mrdrvbits = 24 - max(get_count_order(iwdev->max_mr), 14); iwdev->mr_stagmask = ~(((1 << mrdrvbits) - 1) << (32 - mrdrvbits)); return 0; } @@ -301,11 +303,15 @@ void i40iw_process_aeq(struct i40iw_device *iwdev) "%s ae_id = 0x%x bool qp=%d qp_id = %d\n", __func__, info->ae_id, info->qp, info->qp_cq_id); if (info->qp) { + spin_lock_irqsave(&iwdev->qptable_lock, flags); iwqp = iwdev->qp_table[info->qp_cq_id]; if (!iwqp) { + spin_unlock_irqrestore(&iwdev->qptable_lock, flags); i40iw_pr_err("qp_id %d is already freed\n", info->qp_cq_id); continue; } + i40iw_add_ref(&iwqp->ibqp); + spin_unlock_irqrestore(&iwdev->qptable_lock, flags); qp = &iwqp->sc_qp; spin_lock_irqsave(&iwqp->lock, flags); iwqp->hw_tcp_state = info->tcp_state; @@ -411,6 +417,8 @@ void i40iw_process_aeq(struct i40iw_device *iwdev) i40iw_terminate_connection(qp, info); break; } + if (info->qp) + i40iw_rem_ref(&iwqp->ibqp); } while (1); if (aeqcnt) @@ -460,7 +468,7 @@ int i40iw_manage_apbvt(struct i40iw_device *iwdev, u16 accel_local_port, bool ad */ void i40iw_manage_arp_cache(struct i40iw_device *iwdev, unsigned char *mac_addr, - __be32 *ip_addr, + u32 *ip_addr, bool ipv4, u32 action) { @@ -481,7 +489,7 @@ void i40iw_manage_arp_cache(struct i40iw_device *iwdev, cqp_info->cqp_cmd = OP_ADD_ARP_CACHE_ENTRY; info = &cqp_info->in.u.add_arp_cache_entry.info; memset(info, 0, sizeof(*info)); - info->arp_index = cpu_to_le32(arp_index); + info->arp_index = cpu_to_le16((u16)arp_index); info->permanent = true; ether_addr_copy(info->mac_addr, mac_addr); cqp_info->in.u.add_arp_cache_entry.scratch = (uintptr_t)cqp_request; diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c index e41fae2422ab..c963cad92f5a 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_main.c +++ b/drivers/infiniband/hw/i40iw/i40iw_main.c @@ -270,7 +270,6 @@ static void i40iw_disable_irq(struct i40iw_sc_dev *dev, i40iw_wr32(dev->hw, I40E_PFINT_DYN_CTLN(msix_vec->idx - 1), 0); else i40iw_wr32(dev->hw, I40E_VFINT_DYN_CTLN1(msix_vec->idx - 1), 0); - synchronize_irq(msix_vec->irq); free_irq(msix_vec->irq, dev_id); } @@ -1147,10 +1146,7 @@ static enum i40iw_status_code i40iw_alloc_set_mac_ipaddr(struct i40iw_device *iw if (!status) { status = i40iw_add_mac_ipaddr_entry(iwdev, macaddr, (u8)iwdev->mac_ip_table_idx); - if (!status) - status = i40iw_add_mac_ipaddr_entry(iwdev, macaddr, - (u8)iwdev->mac_ip_table_idx); - else + if (status) i40iw_del_macip_entry(iwdev, (u8)iwdev->mac_ip_table_idx); } return status; @@ -1165,7 +1161,7 @@ static void i40iw_add_ipv6_addr(struct i40iw_device *iwdev) struct net_device *ip_dev; struct inet6_dev *idev; struct inet6_ifaddr *ifp; - __be32 local_ipaddr6[4]; + u32 local_ipaddr6[4]; rcu_read_lock(); for_each_netdev_rcu(&init_net, ip_dev) { @@ -1512,6 +1508,7 @@ static enum i40iw_status_code i40iw_setup_init_state(struct i40iw_handler *hdl, I40IW_HMC_PROFILE_DEFAULT; iwdev->max_rdma_vfs = (iwdev->resource_profile != I40IW_HMC_PROFILE_DEFAULT) ? max_rdma_vfs : 0; + iwdev->max_enabled_vfs = iwdev->max_rdma_vfs; iwdev->netdev = ldev->netdev; hdl->client = client; iwdev->mss = (!ldev->params.mtu) ? 
I40IW_DEFAULT_MSS : ldev->params.mtu - I40IW_MTU_TO_MSS; @@ -1531,7 +1528,10 @@ static enum i40iw_status_code i40iw_setup_init_state(struct i40iw_handler *hdl, goto exit; iwdev->obj_next = iwdev->obj_mem; iwdev->push_mode = push_mode; + init_waitqueue_head(&iwdev->vchnl_waitq); + init_waitqueue_head(&dev->vf_reqs); + status = i40iw_initialize_dev(iwdev, ldev); exit: if (status) { @@ -1710,7 +1710,6 @@ static void i40iw_vf_reset(struct i40e_info *ldev, struct i40e_client *client, u for (i = 0; i < I40IW_MAX_PE_ENABLED_VF_COUNT; i++) { if (!dev->vf_dev[i] || (dev->vf_dev[i]->vf_id != vf_id)) continue; - /* free all resources allocated on behalf of vf */ tmp_vfdev = dev->vf_dev[i]; spin_lock_irqsave(&dev->dev_pestat.stats_lock, flags); @@ -1819,8 +1818,6 @@ static int i40iw_virtchnl_receive(struct i40e_info *ldev, dev = &hdl->device.sc_dev; iwdev = dev->back_dev; - i40iw_debug(dev, I40IW_DEBUG_VIRT, "msg %p, message length %u\n", msg, len); - if (dev->vchnl_if.vchnl_recv) { ret_code = dev->vchnl_if.vchnl_recv(dev, vf_id, msg, len); if (!dev->is_pf) { @@ -1832,6 +1829,39 @@ static int i40iw_virtchnl_receive(struct i40e_info *ldev, } /** + * i40iw_vf_clear_to_send - wait to send virtual channel message + * @dev: iwarp device * + * Wait for until virtual channel is clear + * before sending the next message + * + * Returns false if error + * Returns true if clear to send + */ +bool i40iw_vf_clear_to_send(struct i40iw_sc_dev *dev) +{ + struct i40iw_device *iwdev; + wait_queue_t wait; + + iwdev = dev->back_dev; + + if (!wq_has_sleeper(&dev->vf_reqs) && + (atomic_read(&iwdev->vchnl_msgs) == 0)) + return true; /* virtual channel is clear */ + + init_wait(&wait); + add_wait_queue_exclusive(&dev->vf_reqs, &wait); + + if (!wait_event_timeout(dev->vf_reqs, + (atomic_read(&iwdev->vchnl_msgs) == 0), + I40IW_VCHNL_EVENT_TIMEOUT)) + dev->vchnl_up = false; + + remove_wait_queue(&dev->vf_reqs, &wait); + + return dev->vchnl_up; +} + +/** * i40iw_virtchnl_send - send a message through the virtual channel * @dev: iwarp device * @vf_id: virtual function id associated with the message @@ -1848,18 +1878,16 @@ static enum i40iw_status_code i40iw_virtchnl_send(struct i40iw_sc_dev *dev, { struct i40iw_device *iwdev; struct i40e_info *ldev; - enum i40iw_status_code ret_code = I40IW_ERR_BAD_PTR; if (!dev || !dev->back_dev) - return ret_code; + return I40IW_ERR_BAD_PTR; iwdev = dev->back_dev; ldev = iwdev->ldev; if (ldev && ldev->ops && ldev->ops->virtchnl_send) - ret_code = ldev->ops->virtchnl_send(ldev, &i40iw_client, vf_id, msg, len); - - return ret_code; + return ldev->ops->virtchnl_send(ldev, &i40iw_client, vf_id, msg, len); + return I40IW_ERR_BAD_PTR; } /* client interface functions */ diff --git a/drivers/infiniband/hw/i40iw/i40iw_osdep.h b/drivers/infiniband/hw/i40iw/i40iw_osdep.h index 7e20493510e8..80f422bf3967 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_osdep.h +++ b/drivers/infiniband/hw/i40iw/i40iw_osdep.h @@ -172,6 +172,7 @@ struct i40iw_hw; u8 __iomem *i40iw_get_hw_addr(void *dev); void i40iw_ieq_mpa_crc_ae(struct i40iw_sc_dev *dev, struct i40iw_sc_qp *qp); enum i40iw_status_code i40iw_vf_wait_vchnl_resp(struct i40iw_sc_dev *dev); +bool i40iw_vf_clear_to_send(struct i40iw_sc_dev *dev); enum i40iw_status_code i40iw_ieq_check_mpacrc(struct shash_desc *desc, void *addr, u32 length, u32 value); struct i40iw_sc_qp *i40iw_ieq_get_qp(struct i40iw_sc_dev *dev, struct i40iw_puda_buf *buf); diff --git a/drivers/infiniband/hw/i40iw/i40iw_pble.c b/drivers/infiniband/hw/i40iw/i40iw_pble.c index 
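[Editor's note] i40iw_vf_clear_to_send() above gates each new virtual-channel message on completion of the outstanding one: fast path out if the channel is idle, otherwise sleep on a wait queue with a timeout and mark the channel down if it never drains. The same pattern in miniature; the names and the 100 ms timeout are stand-ins, and the driver's version uses an exclusive wait entry rather than plain wait_event_timeout():

#include <linux/atomic.h>
#include <linux/jiffies.h>
#include <linux/types.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(chan_waitq);
static atomic_t chan_msgs = ATOMIC_INIT(0);
static bool chan_up = true;

static bool clear_to_send(void)
{
        if (atomic_read(&chan_msgs) == 0)
                return true;            /* channel already idle */

        if (!wait_event_timeout(chan_waitq,
                                atomic_read(&chan_msgs) == 0,
                                msecs_to_jiffies(100)))
                chan_up = false;        /* timed out: declare it down */

        return chan_up;
}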
ded853d2fad8..85993dc44f6e 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_pble.c +++ b/drivers/infiniband/hw/i40iw/i40iw_pble.c @@ -404,13 +404,14 @@ static enum i40iw_status_code add_pble_pool(struct i40iw_sc_dev *dev, sd_entry->u.pd_table.pd_page_addr.pa : sd_entry->u.bp.addr.pa; if (sd_entry->valid) return 0; - if (dev->is_pf) + if (dev->is_pf) { ret_code = i40iw_hmc_sd_one(dev, hmc_info->hmc_fn_id, sd_reg_val, idx->sd_idx, sd_entry->entry_type, true); - if (ret_code) { - i40iw_pr_err("cqp cmd failed for sd (pbles)\n"); - goto error; + if (ret_code) { + i40iw_pr_err("cqp cmd failed for sd (pbles)\n"); + goto error; + } } sd_entry->valid = true; diff --git a/drivers/infiniband/hw/i40iw/i40iw_puda.c b/drivers/infiniband/hw/i40iw/i40iw_puda.c index 8eb400d8a7a0..e9c6e82af9c7 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_puda.c +++ b/drivers/infiniband/hw/i40iw/i40iw_puda.c @@ -1194,7 +1194,7 @@ static enum i40iw_status_code i40iw_ieq_process_buf(struct i40iw_puda_rsrc *ieq, ioffset = (u16)(buf->data - (u8 *)buf->mem.va); while (datalen) { - fpdu_len = i40iw_ieq_get_fpdu_length(ntohs(*(u16 *)datap)); + fpdu_len = i40iw_ieq_get_fpdu_length(ntohs(*(__be16 *)datap)); if (fpdu_len > pfpdu->max_fpdu_data) { i40iw_debug(ieq->dev, I40IW_DEBUG_IEQ, "%s: error bad fpdu_len\n", __func__); diff --git a/drivers/infiniband/hw/i40iw/i40iw_status.h b/drivers/infiniband/hw/i40iw/i40iw_status.h index b0110c15e044..91c421762f06 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_status.h +++ b/drivers/infiniband/hw/i40iw/i40iw_status.h @@ -95,6 +95,7 @@ enum i40iw_status_code { I40IW_ERR_INVALID_MAC_ADDR = -65, I40IW_ERR_BAD_STAG = -66, I40IW_ERR_CQ_COMPL_ERROR = -67, + I40IW_ERR_QUEUE_DESTROYED = -68 }; #endif diff --git a/drivers/infiniband/hw/i40iw/i40iw_type.h b/drivers/infiniband/hw/i40iw/i40iw_type.h index edb3a8c8267a..16cc61720b53 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_type.h +++ b/drivers/infiniband/hw/i40iw/i40iw_type.h @@ -479,16 +479,17 @@ struct i40iw_sc_dev { struct i40iw_virt_mem ieq_mem; struct i40iw_puda_rsrc *ieq; - struct i40iw_vf_cqp_ops *iw_vf_cqp_ops; + const struct i40iw_vf_cqp_ops *iw_vf_cqp_ops; struct i40iw_hmc_fpm_misc hmc_fpm_misc; u16 qs_handle; - u32 debug_mask; + u32 debug_mask; u16 exception_lan_queue; u8 hmc_fn_id; bool is_pf; bool vchnl_up; u8 vf_id; + wait_queue_head_t vf_reqs; u64 cqp_cmd_stats[OP_SIZE_CQP_STAT_ARRAY]; struct i40iw_vchnl_vf_msg_buffer vchnl_vf_msg_buf; u8 hw_rev; @@ -889,8 +890,8 @@ struct i40iw_qhash_table_info { u32 qp_num; u32 dest_ip[4]; u32 src_ip[4]; - u32 dest_port; - u32 src_port; + u16 dest_port; + u16 src_port; }; struct i40iw_local_mac_ipaddr_entry_info { @@ -1040,6 +1041,9 @@ struct i40iw_priv_qp_ops { void (*qp_send_lsmm_nostag)(struct i40iw_sc_qp *, void *, u32); void (*qp_send_rtt)(struct i40iw_sc_qp *, bool); enum i40iw_status_code (*qp_post_wqe0)(struct i40iw_sc_qp *, u8); + enum i40iw_status_code (*iw_mr_fast_register)(struct i40iw_sc_qp *, + struct i40iw_fast_reg_stag_info *, + bool); }; struct i40iw_priv_cq_ops { @@ -1108,7 +1112,7 @@ struct i40iw_hmc_ops { enum i40iw_status_code (*parse_fpm_query_buf)(u64 *, struct i40iw_hmc_info *, struct i40iw_hmc_fpm_misc *); enum i40iw_status_code (*configure_iw_fpm)(struct i40iw_sc_dev *, u8); - enum i40iw_status_code (*parse_fpm_commit_buf)(u64 *, struct i40iw_hmc_obj_info *); + enum i40iw_status_code (*parse_fpm_commit_buf)(u64 *, struct i40iw_hmc_obj_info *, u32 *sd); enum i40iw_status_code (*create_hmc_object)(struct i40iw_sc_dev *dev, struct i40iw_hmc_create_obj_info *); enum 
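[Editor's note] The add_pble_pool() hunk above is a missing-braces fix: if (dev->is_pf) guarded only the i40iw_hmc_sd_one() call, so the following if (ret_code) check also ran on the VF path, acting on whatever ret_code happened to hold. Distilled, with issue_sd_cmd() as a stand-in:

extern int issue_sd_cmd(void);

static int add_pool(int is_pf)
{
        int ret = 0;

        /* before the fix, the error check escaped the condition:
         *      if (is_pf)
         *              ret = issue_sd_cmd();
         *      if (ret)                <-- also ran when !is_pf
         *              goto error;
         */
        if (is_pf) {
                ret = issue_sd_cmd();
                if (ret)
                        goto error;     /* now PF-only, as intended */
        }
        return 0;
error:
        return ret;
}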
i40iw_status_code (*del_hmc_object)(struct i40iw_sc_dev *dev, diff --git a/drivers/infiniband/hw/i40iw/i40iw_uk.c b/drivers/infiniband/hw/i40iw/i40iw_uk.c index f78c3dc8bdb2..e35faea88c13 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_uk.c +++ b/drivers/infiniband/hw/i40iw/i40iw_uk.c @@ -56,6 +56,9 @@ static enum i40iw_status_code i40iw_nop_1(struct i40iw_qp_uk *qp) wqe_idx = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring); wqe = qp->sq_base[wqe_idx].elem; + + qp->sq_wrtrk_array[wqe_idx].wqe_size = I40IW_QP_WQE_MIN_SIZE; + peek_head = (qp->sq_ring.head + 1) % qp->sq_ring.size; wqe_0 = qp->sq_base[peek_head].elem; if (peek_head) @@ -130,7 +133,10 @@ static void i40iw_qp_ring_push_db(struct i40iw_qp_uk *qp, u32 wqe_idx) */ u64 *i40iw_qp_get_next_send_wqe(struct i40iw_qp_uk *qp, u32 *wqe_idx, - u8 wqe_size) + u8 wqe_size, + u32 total_size, + u64 wr_id + ) { u64 *wqe = NULL; u64 wqe_ptr; @@ -159,6 +165,17 @@ u64 *i40iw_qp_get_next_send_wqe(struct i40iw_qp_uk *qp, if (!*wqe_idx) qp->swqe_polarity = !qp->swqe_polarity; } + + if (((*wqe_idx & 3) == 1) && (wqe_size == I40IW_WQE_SIZE_64)) { + i40iw_nop_1(qp); + I40IW_RING_MOVE_HEAD(qp->sq_ring, ret_code); + if (ret_code) + return NULL; + *wqe_idx = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring); + if (!*wqe_idx) + qp->swqe_polarity = !qp->swqe_polarity; + } + for (i = 0; i < wqe_size / I40IW_QP_WQE_MIN_SIZE; i++) { I40IW_RING_MOVE_HEAD(qp->sq_ring, ret_code); if (ret_code) @@ -169,8 +186,15 @@ u64 *i40iw_qp_get_next_send_wqe(struct i40iw_qp_uk *qp, peek_head = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring); wqe_0 = qp->sq_base[peek_head].elem; - if (peek_head & 0x3) - wqe_0[3] = LS_64(!qp->swqe_polarity, I40IWQPSQ_VALID); + + if (((peek_head & 3) == 1) || ((peek_head & 3) == 3)) { + if (RS_64(wqe_0[3], I40IWQPSQ_VALID) != !qp->swqe_polarity) + wqe_0[3] = LS_64(!qp->swqe_polarity, I40IWQPSQ_VALID); + } + + qp->sq_wrtrk_array[*wqe_idx].wrid = wr_id; + qp->sq_wrtrk_array[*wqe_idx].wr_len = total_size; + qp->sq_wrtrk_array[*wqe_idx].wqe_size = wqe_size; return wqe; } @@ -249,12 +273,9 @@ static enum i40iw_status_code i40iw_rdma_write(struct i40iw_qp_uk *qp, if (ret_code) return ret_code; - wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size); + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, total_size, info->wr_id); if (!wqe) return I40IW_ERR_QP_TOOMANY_WRS_POSTED; - - qp->sq_wrtrk_array[wqe_idx].wrid = info->wr_id; - qp->sq_wrtrk_array[wqe_idx].wr_len = total_size; set_64bit_val(wqe, 16, LS_64(op_info->rem_addr.tag_off, I40IWQPSQ_FRAG_TO)); if (!op_info->rem_addr.stag) @@ -309,12 +330,9 @@ static enum i40iw_status_code i40iw_rdma_read(struct i40iw_qp_uk *qp, ret_code = i40iw_fragcnt_to_wqesize_sq(1, &wqe_size); if (ret_code) return ret_code; - wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size); + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, op_info->lo_addr.len, info->wr_id); if (!wqe) return I40IW_ERR_QP_TOOMANY_WRS_POSTED; - - qp->sq_wrtrk_array[wqe_idx].wrid = info->wr_id; - qp->sq_wrtrk_array[wqe_idx].wr_len = op_info->lo_addr.len; local_fence |= info->local_fence; set_64bit_val(wqe, 16, LS_64(op_info->rem_addr.tag_off, I40IWQPSQ_FRAG_TO)); @@ -366,13 +384,11 @@ static enum i40iw_status_code i40iw_send(struct i40iw_qp_uk *qp, if (ret_code) return ret_code; - wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size); + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, total_size, info->wr_id); if (!wqe) return I40IW_ERR_QP_TOOMANY_WRS_POSTED; read_fence |= info->read_fence; - qp->sq_wrtrk_array[wqe_idx].wrid = info->wr_id; - 
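[Editor's note] The i40iw_qp_get_next_send_wqe() change above moves the wr_id/length bookkeeping into the allocator itself (note every caller dropping its manual sq_wrtrk_array assignments), so the tracking entry is populated before the WQE can possibly complete, and the new wqe_size field feeds the 64-byte-WQE padding logic. The resulting shape, with ring_reserve() standing in for the ring-head arithmetic:

#include <linux/types.h>

struct wrtrk {
        u64 wrid;
        u32 wr_len;
        u8  wqe_size;
};
struct qp {
        struct wrtrk *sq_wrtrk_array;
};
extern u64 *ring_reserve(struct qp *qp, u32 *idx, u8 wqe_size);

static u64 *get_next_send_wqe(struct qp *qp, u32 *idx, u8 wqe_size,
                              u32 total_size, u64 wr_id)
{
        u64 *wqe = ring_reserve(qp, idx, wqe_size);

        if (!wqe)
                return NULL;
        qp->sq_wrtrk_array[*idx].wrid = wr_id;  /* recorded at allocation */
        qp->sq_wrtrk_array[*idx].wr_len = total_size;
        qp->sq_wrtrk_array[*idx].wqe_size = wqe_size;
        return wqe;
}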
qp->sq_wrtrk_array[wqe_idx].wr_len = total_size; set_64bit_val(wqe, 16, 0); header = LS_64(stag_to_inv, I40IWQPSQ_REMSTAG) | LS_64(info->op_type, I40IWQPSQ_OPCODE) | @@ -427,13 +443,11 @@ static enum i40iw_status_code i40iw_inline_rdma_write(struct i40iw_qp_uk *qp, if (ret_code) return ret_code; - wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size); + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, op_info->len, info->wr_id); if (!wqe) return I40IW_ERR_QP_TOOMANY_WRS_POSTED; read_fence |= info->read_fence; - qp->sq_wrtrk_array[wqe_idx].wrid = info->wr_id; - qp->sq_wrtrk_array[wqe_idx].wr_len = op_info->len; set_64bit_val(wqe, 16, LS_64(op_info->rem_addr.tag_off, I40IWQPSQ_FRAG_TO)); @@ -507,14 +521,11 @@ static enum i40iw_status_code i40iw_inline_send(struct i40iw_qp_uk *qp, if (ret_code) return ret_code; - wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size); + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, op_info->len, info->wr_id); if (!wqe) return I40IW_ERR_QP_TOOMANY_WRS_POSTED; read_fence |= info->read_fence; - - qp->sq_wrtrk_array[wqe_idx].wrid = info->wr_id; - qp->sq_wrtrk_array[wqe_idx].wr_len = op_info->len; header = LS_64(stag_to_inv, I40IWQPSQ_REMSTAG) | LS_64(info->op_type, I40IWQPSQ_OPCODE) | LS_64(op_info->len, I40IWQPSQ_INLINEDATALEN) | @@ -574,12 +585,9 @@ static enum i40iw_status_code i40iw_stag_local_invalidate(struct i40iw_qp_uk *qp op_info = &info->op.inv_local_stag; local_fence = info->local_fence; - wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE); + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE, 0, info->wr_id); if (!wqe) return I40IW_ERR_QP_TOOMANY_WRS_POSTED; - - qp->sq_wrtrk_array[wqe_idx].wrid = info->wr_id; - qp->sq_wrtrk_array[wqe_idx].wr_len = 0; set_64bit_val(wqe, 0, 0); set_64bit_val(wqe, 8, LS_64(op_info->target_stag, I40IWQPSQ_LOCSTAG)); @@ -619,12 +627,9 @@ static enum i40iw_status_code i40iw_mw_bind(struct i40iw_qp_uk *qp, op_info = &info->op.bind_window; local_fence |= info->local_fence; - wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE); + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE, 0, info->wr_id); if (!wqe) return I40IW_ERR_QP_TOOMANY_WRS_POSTED; - - qp->sq_wrtrk_array[wqe_idx].wrid = info->wr_id; - qp->sq_wrtrk_array[wqe_idx].wr_len = 0; set_64bit_val(wqe, 0, (uintptr_t)op_info->va); set_64bit_val(wqe, 8, LS_64(op_info->mr_stag, I40IWQPSQ_PARENTMRSTAG) | @@ -760,7 +765,7 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq, enum i40iw_status_code ret_code2 = 0; bool move_cq_head = true; u8 polarity; - u8 addl_frag_cnt, addl_wqes = 0; + u8 addl_wqes = 0; if (cq->avoid_mem_cflct) cqe = (u64 *)I40IW_GET_CURRENT_EXTENDED_CQ_ELEMENT(cq); @@ -797,6 +802,10 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq, info->is_srq = (bool)RS_64(qword3, I40IWCQ_SRQ); qp = (struct i40iw_qp_uk *)(unsigned long)comp_ctx; + if (!qp) { + ret_code = I40IW_ERR_QUEUE_DESTROYED; + goto exit; + } wqe_idx = (u32)RS_64(qword3, I40IW_CQ_WQEIDX); info->qp_handle = (i40iw_qp_handle)(unsigned long)qp; @@ -827,11 +836,8 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq, info->op_type = (u8)RS_64(qword3, I40IWCQ_OP); sw_wqe = qp->sq_base[wqe_idx].elem; get_64bit_val(sw_wqe, 24, &wqe_qword); - addl_frag_cnt = - (u8)RS_64(wqe_qword, I40IWQPSQ_ADDFRAGCNT); - i40iw_fragcnt_to_wqesize_sq(addl_frag_cnt + 1, &addl_wqes); - addl_wqes = (addl_wqes / I40IW_QP_WQE_MIN_SIZE); + addl_wqes = 
qp->sq_wrtrk_array[wqe_idx].wqe_size / I40IW_QP_WQE_MIN_SIZE; I40IW_RING_SET_TAIL(qp->sq_ring, (wqe_idx + addl_wqes)); } else { do { @@ -843,9 +849,7 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq, get_64bit_val(sw_wqe, 24, &wqe_qword); op_type = (u8)RS_64(wqe_qword, I40IWQPSQ_OPCODE); info->op_type = op_type; - addl_frag_cnt = (u8)RS_64(wqe_qword, I40IWQPSQ_ADDFRAGCNT); - i40iw_fragcnt_to_wqesize_sq(addl_frag_cnt + 1, &addl_wqes); - addl_wqes = (addl_wqes / I40IW_QP_WQE_MIN_SIZE); + addl_wqes = qp->sq_wrtrk_array[tail].wqe_size / I40IW_QP_WQE_MIN_SIZE; I40IW_RING_SET_TAIL(qp->sq_ring, (tail + addl_wqes)); if (op_type != I40IWQP_OP_NOP) { info->wr_id = qp->sq_wrtrk_array[tail].wrid; @@ -859,6 +863,7 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq, ret_code = 0; +exit: if (!ret_code && (info->comp_status == I40IW_COMPL_STATUS_FLUSHED)) if (pring && (I40IW_RING_MORE_WORK(*pring))) @@ -893,19 +898,21 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq, * i40iw_get_wqe_shift - get shift count for maximum wqe size * @wqdepth: depth of wq required. * @sge: Maximum Scatter Gather Elements wqe + * @inline_data: Maximum inline data size * @shift: Returns the shift needed based on sge * - * Shift can be used to left shift the wqe size based on sge. - * If sge, == 1, shift =0 (wqe_size of 32 bytes), for sge=2 and 3, shift =1 - * (64 bytes wqes) and 2 otherwise (128 bytes wqe). + * Shift can be used to left shift the wqe size based on number of SGEs and inline data size. + * For 1 SGE or inline data <= 16, shift = 0 (wqe size of 32 bytes). + * For 2 or 3 SGEs or inline data <= 48, shift = 1 (wqe size of 64 bytes). + * Shift of 2 otherwise (wqe size of 128 bytes). */ -enum i40iw_status_code i40iw_get_wqe_shift(u32 wqdepth, u8 sge, u8 *shift) +enum i40iw_status_code i40iw_get_wqe_shift(u32 wqdepth, u32 sge, u32 inline_data, u8 *shift) { u32 size; *shift = 0; - if (sge > 1) - *shift = (sge < 4) ? 1 : 2; + if (sge > 1 || inline_data > 16) + *shift = (sge < 4 && inline_data <= 48) ?
1 : 2; /* check if wqdepth is multiple of 2 or not */ @@ -968,11 +975,11 @@ enum i40iw_status_code i40iw_qp_uk_init(struct i40iw_qp_uk *qp, if (info->max_rq_frag_cnt > I40IW_MAX_WQ_FRAGMENT_COUNT) return I40IW_ERR_INVALID_FRAG_COUNT; - ret_code = i40iw_get_wqe_shift(info->sq_size, info->max_sq_frag_cnt, &sqshift); + ret_code = i40iw_get_wqe_shift(info->sq_size, info->max_sq_frag_cnt, info->max_inline_data, &sqshift); if (ret_code) return ret_code; - ret_code = i40iw_get_wqe_shift(info->rq_size, info->max_rq_frag_cnt, &rqshift); + ret_code = i40iw_get_wqe_shift(info->rq_size, info->max_rq_frag_cnt, 0, &rqshift); if (ret_code) return ret_code; @@ -1097,12 +1104,9 @@ enum i40iw_status_code i40iw_nop(struct i40iw_qp_uk *qp, u64 header, *wqe; u32 wqe_idx; - wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE); + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE, 0, wr_id); if (!wqe) return I40IW_ERR_QP_TOOMANY_WRS_POSTED; - - qp->sq_wrtrk_array[wqe_idx].wrid = wr_id; - qp->sq_wrtrk_array[wqe_idx].wr_len = 0; set_64bit_val(wqe, 0, 0); set_64bit_val(wqe, 8, 0); set_64bit_val(wqe, 16, 0); @@ -1125,7 +1129,7 @@ enum i40iw_status_code i40iw_nop(struct i40iw_qp_uk *qp, * @frag_cnt: number of fragments * @wqe_size: size of sq wqe returned */ -enum i40iw_status_code i40iw_fragcnt_to_wqesize_sq(u8 frag_cnt, u8 *wqe_size) +enum i40iw_status_code i40iw_fragcnt_to_wqesize_sq(u32 frag_cnt, u8 *wqe_size) { switch (frag_cnt) { case 0: @@ -1156,7 +1160,7 @@ enum i40iw_status_code i40iw_fragcnt_to_wqesize_sq(u8 frag_cnt, u8 *wqe_size) * @frag_cnt: number of fragments * @wqe_size: size of rq wqe returned */ -enum i40iw_status_code i40iw_fragcnt_to_wqesize_rq(u8 frag_cnt, u8 *wqe_size) +enum i40iw_status_code i40iw_fragcnt_to_wqesize_rq(u32 frag_cnt, u8 *wqe_size) { switch (frag_cnt) { case 0: diff --git a/drivers/infiniband/hw/i40iw/i40iw_user.h b/drivers/infiniband/hw/i40iw/i40iw_user.h index 5cd971bb8cc7..4627646fe8cd 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_user.h +++ b/drivers/infiniband/hw/i40iw/i40iw_user.h @@ -61,7 +61,7 @@ enum i40iw_device_capabilities_const { I40IW_MAX_CQ_SIZE = 1048575, I40IW_MAX_AEQ_ALLOCATE_COUNT = 255, I40IW_DB_ID_ZERO = 0, - I40IW_MAX_WQ_FRAGMENT_COUNT = 6, + I40IW_MAX_WQ_FRAGMENT_COUNT = 3, I40IW_MAX_SGE_RD = 1, I40IW_MAX_OUTBOUND_MESSAGE_SIZE = 2147483647, I40IW_MAX_INBOUND_MESSAGE_SIZE = 2147483647, @@ -70,8 +70,8 @@ enum i40iw_device_capabilities_const { I40IW_MAX_VF_FPM_ID = 47, I40IW_MAX_VF_PER_PF = 127, I40IW_MAX_SQ_PAYLOAD_SIZE = 2145386496, - I40IW_MAX_INLINE_DATA_SIZE = 112, - I40IW_MAX_PUSHMODE_INLINE_DATA_SIZE = 112, + I40IW_MAX_INLINE_DATA_SIZE = 48, + I40IW_MAX_PUSHMODE_INLINE_DATA_SIZE = 48, I40IW_MAX_IRD_SIZE = 32, I40IW_QPCTX_ENCD_MAXIRD = 3, I40IW_MAX_WQ_ENTRIES = 2048, @@ -102,6 +102,8 @@ enum i40iw_device_capabilities_const { #define I40IW_STAG_INDEX_FROM_STAG(stag) (((stag) && 0xFFFFFF00) >> 8) +#define I40IW_MAX_MR_SIZE 0x10000000000L + struct i40iw_qp_uk; struct i40iw_cq_uk; struct i40iw_srq_uk; @@ -198,7 +200,7 @@ enum i40iw_completion_notify { struct i40iw_post_send { i40iw_sgl sg_list; - u8 num_sges; + u32 num_sges; }; struct i40iw_post_inline_send { @@ -220,7 +222,7 @@ struct i40iw_post_inline_send_w_inv { struct i40iw_rdma_write { i40iw_sgl lo_sg_list; - u8 num_lo_sges; + u32 num_lo_sges; struct i40iw_sge rem_addr; }; @@ -345,7 +347,9 @@ struct i40iw_dev_uk { struct i40iw_sq_uk_wr_trk_info { u64 wrid; - u64 wr_len; + u32 wr_len; + u8 wqe_size; + u8 reserved[3]; }; struct i40iw_qp_quanta { @@ -367,6 +371,8 @@ 
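/*
 * Aside: a minimal, self-contained restatement of the new shift selection
 * in i40iw_get_wqe_shift() above. The helper name is illustrative, not
 * part of the driver; it assumes the 32/64/128-byte WQE quanta described
 * in the kernel-doc comment.
 */
#include <linux/types.h>

static u8 demo_wqe_shift(u32 sge, u32 inline_data)
{
	if (sge <= 1 && inline_data <= 16)
		return 0;	/* 32-byte WQE */
	if (sge < 4 && inline_data <= 48)
		return 1;	/* 64-byte WQE */
	return 2;		/* 128-byte WQE */
}

/*
 * e.g. demo_wqe_shift(1, 0) == 0, demo_wqe_shift(3, 0) == 1 and
 * demo_wqe_shift(1, 64) == 2, matching the table in the comment above.
 */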
struct i40iw_qp_uk { u32 qp_id; u32 sq_size; u32 rq_size; + u32 max_sq_frag_cnt; + u32 max_rq_frag_cnt; struct i40iw_qp_uk_ops ops; bool use_srq; u8 swqe_polarity; @@ -374,8 +380,6 @@ struct i40iw_qp_uk { u8 rwqe_polarity; u8 rq_wqe_size; u8 rq_wqe_size_multiplier; - u8 max_sq_frag_cnt; - u8 max_rq_frag_cnt; bool deferred_flag; }; @@ -404,8 +408,9 @@ struct i40iw_qp_uk_init_info { u32 qp_id; u32 sq_size; u32 rq_size; - u8 max_sq_frag_cnt; - u8 max_rq_frag_cnt; + u32 max_sq_frag_cnt; + u32 max_rq_frag_cnt; + u32 max_inline_data; }; @@ -422,7 +427,10 @@ void i40iw_device_init_uk(struct i40iw_dev_uk *dev); void i40iw_qp_post_wr(struct i40iw_qp_uk *qp); u64 *i40iw_qp_get_next_send_wqe(struct i40iw_qp_uk *qp, u32 *wqe_idx, - u8 wqe_size); + u8 wqe_size, + u32 total_size, + u64 wr_id + ); u64 *i40iw_qp_get_next_recv_wqe(struct i40iw_qp_uk *qp, u32 *wqe_idx); u64 *i40iw_qp_get_next_srq_wqe(struct i40iw_srq_uk *srq, u32 *wqe_idx); @@ -434,9 +442,9 @@ enum i40iw_status_code i40iw_qp_uk_init(struct i40iw_qp_uk *qp, void i40iw_clean_cq(void *queue, struct i40iw_cq_uk *cq); enum i40iw_status_code i40iw_nop(struct i40iw_qp_uk *qp, u64 wr_id, bool signaled, bool post_sq); -enum i40iw_status_code i40iw_fragcnt_to_wqesize_sq(u8 frag_cnt, u8 *wqe_size); -enum i40iw_status_code i40iw_fragcnt_to_wqesize_rq(u8 frag_cnt, u8 *wqe_size); +enum i40iw_status_code i40iw_fragcnt_to_wqesize_sq(u32 frag_cnt, u8 *wqe_size); +enum i40iw_status_code i40iw_fragcnt_to_wqesize_rq(u32 frag_cnt, u8 *wqe_size); enum i40iw_status_code i40iw_inline_data_size_to_wqesize(u32 data_size, u8 *wqe_size); -enum i40iw_status_code i40iw_get_wqe_shift(u32 wqdepth, u8 sge, u8 *shift); +enum i40iw_status_code i40iw_get_wqe_shift(u32 wqdepth, u32 sge, u32 inline_data, u8 *shift); #endif diff --git a/drivers/infiniband/hw/i40iw/i40iw_utils.c b/drivers/infiniband/hw/i40iw/i40iw_utils.c index 1ceec81bd8eb..0e8db0a35141 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_utils.c +++ b/drivers/infiniband/hw/i40iw/i40iw_utils.c @@ -59,7 +59,7 @@ * @action: modify, delete or add */ int i40iw_arp_table(struct i40iw_device *iwdev, - __be32 *ip_addr, + u32 *ip_addr, bool ipv4, u8 *mac_addr, u32 action) @@ -152,7 +152,7 @@ int i40iw_inetaddr_event(struct notifier_block *notifier, struct net_device *upper_dev; struct i40iw_device *iwdev; struct i40iw_handler *hdl; - __be32 local_ipaddr; + u32 local_ipaddr; hdl = i40iw_find_netdev(event_netdev); if (!hdl) @@ -167,11 +167,10 @@ int i40iw_inetaddr_event(struct notifier_block *notifier, switch (event) { case NETDEV_DOWN: if (upper_dev) - local_ipaddr = - ((struct in_device *)upper_dev->ip_ptr)->ifa_list->ifa_address; + local_ipaddr = ntohl( + ((struct in_device *)upper_dev->ip_ptr)->ifa_list->ifa_address); else - local_ipaddr = ifa->ifa_address; - local_ipaddr = ntohl(local_ipaddr); + local_ipaddr = ntohl(ifa->ifa_address); i40iw_manage_arp_cache(iwdev, netdev->dev_addr, &local_ipaddr, @@ -180,11 +179,10 @@ int i40iw_inetaddr_event(struct notifier_block *notifier, return NOTIFY_OK; case NETDEV_UP: if (upper_dev) - local_ipaddr = - ((struct in_device *)upper_dev->ip_ptr)->ifa_list->ifa_address; + local_ipaddr = ntohl( + ((struct in_device *)upper_dev->ip_ptr)->ifa_list->ifa_address); else - local_ipaddr = ifa->ifa_address; - local_ipaddr = ntohl(local_ipaddr); + local_ipaddr = ntohl(ifa->ifa_address); i40iw_manage_arp_cache(iwdev, netdev->dev_addr, &local_ipaddr, @@ -194,12 +192,11 @@ int i40iw_inetaddr_event(struct notifier_block *notifier, case NETDEV_CHANGEADDR: /* Add the address to the IP table */ if 
(upper_dev) - local_ipaddr = - ((struct in_device *)upper_dev->ip_ptr)->ifa_list->ifa_address; + local_ipaddr = ntohl( + ((struct in_device *)upper_dev->ip_ptr)->ifa_list->ifa_address); else - local_ipaddr = ifa->ifa_address; + local_ipaddr = ntohl(ifa->ifa_address); - local_ipaddr = ntohl(local_ipaddr); i40iw_manage_arp_cache(iwdev, netdev->dev_addr, &local_ipaddr, @@ -227,7 +224,7 @@ int i40iw_inet6addr_event(struct notifier_block *notifier, struct net_device *netdev; struct i40iw_device *iwdev; struct i40iw_handler *hdl; - __be32 local_ipaddr6[4]; + u32 local_ipaddr6[4]; hdl = i40iw_find_netdev(event_netdev); if (!hdl) @@ -506,14 +503,19 @@ void i40iw_rem_ref(struct ib_qp *ibqp) struct cqp_commands_info *cqp_info; struct i40iw_device *iwdev; u32 qp_num; + unsigned long flags; iwqp = to_iwqp(ibqp); - if (!atomic_dec_and_test(&iwqp->refcount)) + iwdev = iwqp->iwdev; + spin_lock_irqsave(&iwdev->qptable_lock, flags); + if (!atomic_dec_and_test(&iwqp->refcount)) { + spin_unlock_irqrestore(&iwdev->qptable_lock, flags); return; + } - iwdev = iwqp->iwdev; qp_num = iwqp->ibqp.qp_num; iwdev->qp_table[qp_num] = NULL; + spin_unlock_irqrestore(&iwdev->qptable_lock, flags); cqp_request = i40iw_get_cqp_request(&iwdev->cqp, false); if (!cqp_request) return; @@ -985,21 +987,24 @@ enum i40iw_status_code i40iw_cqp_commit_fpm_values_cmd(struct i40iw_sc_dev *dev, enum i40iw_status_code i40iw_vf_wait_vchnl_resp(struct i40iw_sc_dev *dev) { struct i40iw_device *iwdev = dev->back_dev; - enum i40iw_status_code err_code = 0; int timeout_ret; i40iw_debug(dev, I40IW_DEBUG_VIRT, "%s[%u] dev %p, iwdev %p\n", __func__, __LINE__, dev, iwdev); - atomic_add(2, &iwdev->vchnl_msgs); + + atomic_set(&iwdev->vchnl_msgs, 2); timeout_ret = wait_event_timeout(iwdev->vchnl_waitq, (atomic_read(&iwdev->vchnl_msgs) == 1), I40IW_VCHNL_EVENT_TIMEOUT); atomic_dec(&iwdev->vchnl_msgs); if (!timeout_ret) { i40iw_pr_err("virt channel completion timeout = 0x%x\n", timeout_ret); - err_code = I40IW_ERR_TIMEOUT; + atomic_set(&iwdev->vchnl_msgs, 0); + dev->vchnl_up = false; + return I40IW_ERR_TIMEOUT; } - return err_code; + wake_up(&dev->vf_reqs); + return 0; } /** diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 1fe3b84a06e4..4a740f7a0519 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -63,8 +63,8 @@ static int i40iw_query_device(struct ib_device *ibdev, ether_addr_copy((u8 *)&props->sys_image_guid, iwdev->netdev->dev_addr); props->fw_ver = I40IW_FW_VERSION; props->device_cap_flags = iwdev->device_cap_flags; - props->vendor_id = iwdev->vendor_id; - props->vendor_part_id = iwdev->vendor_part_id; + props->vendor_id = iwdev->ldev->pcidev->vendor; + props->vendor_part_id = iwdev->ldev->pcidev->device; props->hw_ver = (u32)iwdev->sc_dev.hw_rev; props->max_mr_size = I40IW_MAX_OUTBOUND_MESSAGE_SIZE; props->max_qp = iwdev->max_qp; @@ -74,7 +74,7 @@ static int i40iw_query_device(struct ib_device *ibdev, props->max_cqe = iwdev->max_cqe; props->max_mr = iwdev->max_mr; props->max_pd = iwdev->max_pd; - props->max_sge_rd = 1; + props->max_sge_rd = I40IW_MAX_SGE_RD; props->max_qp_rd_atom = I40IW_MAX_IRD_SIZE; props->max_qp_init_rd_atom = props->max_qp_rd_atom; props->atomic_cap = IB_ATOMIC_NONE; @@ -120,7 +120,7 @@ static int i40iw_query_port(struct ib_device *ibdev, props->pkey_tbl_len = 1; props->active_width = IB_WIDTH_4X; props->active_speed = 1; - props->max_msg_sz = 0x80000000; + props->max_msg_sz = I40IW_MAX_OUTBOUND_MESSAGE_SIZE; return 
0; } @@ -437,7 +437,6 @@ void i40iw_free_qp_resources(struct i40iw_device *iwdev, kfree(iwqp->kqp.wrid_mem); iwqp->kqp.wrid_mem = NULL; kfree(iwqp->allocated_buffer); - iwqp->allocated_buffer = NULL; } /** @@ -521,14 +520,12 @@ static int i40iw_setup_kmode_qp(struct i40iw_device *iwdev, enum i40iw_status_code status; struct i40iw_qp_uk_init_info *ukinfo = &info->qp_uk_init_info; - ukinfo->max_sq_frag_cnt = I40IW_MAX_WQ_FRAGMENT_COUNT; - sq_size = i40iw_qp_roundup(ukinfo->sq_size + 1); rq_size = i40iw_qp_roundup(ukinfo->rq_size + 1); - status = i40iw_get_wqe_shift(sq_size, ukinfo->max_sq_frag_cnt, &sqshift); + status = i40iw_get_wqe_shift(sq_size, ukinfo->max_sq_frag_cnt, ukinfo->max_inline_data, &sqshift); if (!status) - status = i40iw_get_wqe_shift(rq_size, ukinfo->max_rq_frag_cnt, &rqshift); + status = i40iw_get_wqe_shift(rq_size, ukinfo->max_rq_frag_cnt, 0, &rqshift); if (status) return -ENOSYS; @@ -609,6 +606,9 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd, if (init_attr->cap.max_inline_data > I40IW_MAX_INLINE_DATA_SIZE) init_attr->cap.max_inline_data = I40IW_MAX_INLINE_DATA_SIZE; + if (init_attr->cap.max_send_sge > I40IW_MAX_WQ_FRAGMENT_COUNT) + init_attr->cap.max_send_sge = I40IW_MAX_WQ_FRAGMENT_COUNT; + memset(&init_info, 0, sizeof(init_info)); sq_size = init_attr->cap.max_send_wr; @@ -618,6 +618,7 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd, init_info.qp_uk_init_info.rq_size = rq_size; init_info.qp_uk_init_info.max_sq_frag_cnt = init_attr->cap.max_send_sge; init_info.qp_uk_init_info.max_rq_frag_cnt = init_attr->cap.max_recv_sge; + init_info.qp_uk_init_info.max_inline_data = init_attr->cap.max_inline_data; mem = kzalloc(sizeof(*iwqp), GFP_KERNEL); if (!mem) @@ -722,8 +723,10 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd, iwarp_info = &iwqp->iwarp_info; iwarp_info->rd_enable = true; iwarp_info->wr_rdresp_en = true; - if (!iwqp->user_mode) + if (!iwqp->user_mode) { + iwarp_info->fast_reg_en = true; iwarp_info->priv_mode_en = true; + } iwarp_info->ddp_ver = 1; iwarp_info->rdmap_ver = 1; @@ -784,6 +787,8 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd, return ERR_PTR(err_code); } } + init_completion(&iwqp->sq_drained); + init_completion(&iwqp->rq_drained); return &iwqp->ibqp; error: @@ -1444,6 +1449,166 @@ static int i40iw_handle_q_mem(struct i40iw_device *iwdev, } /** + * i40iw_hw_alloc_stag - cqp command to allocate stag + * @iwdev: iwarp device + * @iwmr: iwarp mr pointer + */ +static int i40iw_hw_alloc_stag(struct i40iw_device *iwdev, struct i40iw_mr *iwmr) +{ + struct i40iw_allocate_stag_info *info; + struct i40iw_pd *iwpd = to_iwpd(iwmr->ibmr.pd); + enum i40iw_status_code status; + int err = 0; + struct i40iw_cqp_request *cqp_request; + struct cqp_commands_info *cqp_info; + + cqp_request = i40iw_get_cqp_request(&iwdev->cqp, true); + if (!cqp_request) + return -ENOMEM; + + cqp_info = &cqp_request->info; + info = &cqp_info->in.u.alloc_stag.info; + memset(info, 0, sizeof(*info)); + info->page_size = PAGE_SIZE; + info->stag_idx = iwmr->stag >> I40IW_CQPSQ_STAG_IDX_SHIFT; + info->pd_id = iwpd->sc_pd.pd_id; + info->total_len = iwmr->length; + cqp_info->cqp_cmd = OP_ALLOC_STAG; + cqp_info->post_sq = 1; + cqp_info->in.u.alloc_stag.dev = &iwdev->sc_dev; + cqp_info->in.u.alloc_stag.scratch = (uintptr_t)cqp_request; + + status = i40iw_handle_cqp_op(iwdev, cqp_request); + if (status) { + err = -ENOMEM; + i40iw_pr_err("CQP-OP MR Reg fail"); + } + return err; +} + +/** + * i40iw_alloc_mr - register stag for fast memory registration + * @pd: 
ibpd pointer + * @mr_type: memory for stag registration + * @max_num_sg: max number of pages +*/ +static struct ib_mr *i40iw_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg) +{ + struct i40iw_pd *iwpd = to_iwpd(pd); + struct i40iw_device *iwdev = to_iwdev(pd->device); + struct i40iw_pble_alloc *palloc; + struct i40iw_pbl *iwpbl; + struct i40iw_mr *iwmr; + enum i40iw_status_code status; + u32 stag; + int err_code = -ENOMEM; + + iwmr = kzalloc(sizeof(*iwmr), GFP_KERNEL); + if (!iwmr) + return ERR_PTR(-ENOMEM); + + stag = i40iw_create_stag(iwdev); + if (!stag) { + err_code = -EOVERFLOW; + goto err; + } + iwmr->stag = stag; + iwmr->ibmr.rkey = stag; + iwmr->ibmr.lkey = stag; + iwmr->ibmr.pd = pd; + iwmr->ibmr.device = pd->device; + iwpbl = &iwmr->iwpbl; + iwpbl->iwmr = iwmr; + iwmr->type = IW_MEMREG_TYPE_MEM; + palloc = &iwpbl->pble_alloc; + iwmr->page_cnt = max_num_sg; + mutex_lock(&iwdev->pbl_mutex); + status = i40iw_get_pble(&iwdev->sc_dev, iwdev->pble_rsrc, palloc, iwmr->page_cnt); + mutex_unlock(&iwdev->pbl_mutex); + if (!status) + goto err1; + + if (palloc->level != I40IW_LEVEL_1) + goto err2; + err_code = i40iw_hw_alloc_stag(iwdev, iwmr); + if (err_code) + goto err2; + iwpbl->pbl_allocated = true; + i40iw_add_pdusecount(iwpd); + return &iwmr->ibmr; +err2: + i40iw_free_pble(iwdev->pble_rsrc, palloc); +err1: + i40iw_free_stag(iwdev, stag); +err: + kfree(iwmr); + return ERR_PTR(err_code); +} + +/** + * i40iw_set_page - populate pbl list for fmr + * @ibmr: ib mem to access iwarp mr pointer + * @addr: page dma address for pbl list + */ +static int i40iw_set_page(struct ib_mr *ibmr, u64 addr) +{ + struct i40iw_mr *iwmr = to_iwmr(ibmr); + struct i40iw_pbl *iwpbl = &iwmr->iwpbl; + struct i40iw_pble_alloc *palloc = &iwpbl->pble_alloc; + u64 *pbl; + + if (unlikely(iwmr->npages == iwmr->page_cnt)) + return -ENOMEM; + + pbl = (u64 *)palloc->level1.addr; + pbl[iwmr->npages++] = cpu_to_le64(addr); + return 0; +} + +/** + * i40iw_map_mr_sg - map of sg list for fmr + * @ibmr: ib mem to access iwarp mr pointer + * @sg: scatter gather list for fmr + * @sg_nents: number of sg pages + */ +static int i40iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, + int sg_nents, unsigned int *sg_offset) +{ + struct i40iw_mr *iwmr = to_iwmr(ibmr); + + iwmr->npages = 0; + return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, i40iw_set_page); +} + +/** + * i40iw_drain_sq - drain the send queue + * @ibqp: ib qp pointer + */ +static void i40iw_drain_sq(struct ib_qp *ibqp) +{ + struct i40iw_qp *iwqp = to_iwqp(ibqp); + struct i40iw_sc_qp *qp = &iwqp->sc_qp; + + if (I40IW_RING_MORE_WORK(qp->qp_uk.sq_ring)) + wait_for_completion(&iwqp->sq_drained); +} + +/** + * i40iw_drain_rq - drain the receive queue + * @ibqp: ib qp pointer + */ +static void i40iw_drain_rq(struct ib_qp *ibqp) +{ + struct i40iw_qp *iwqp = to_iwqp(ibqp); + struct i40iw_sc_qp *qp = &iwqp->sc_qp; + + if (I40IW_RING_MORE_WORK(qp->qp_uk.rq_ring)) + wait_for_completion(&iwqp->rq_drained); +} + +/** * i40iw_hwreg_mr - send cqp command for memory registration * @iwdev: iwarp device * @iwmr: iwarp mr pointer @@ -1526,14 +1691,16 @@ static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd, struct i40iw_mr *iwmr; struct ib_umem *region; struct i40iw_mem_reg_req req; - u32 pbl_depth = 0; + u64 pbl_depth = 0; u32 stag = 0; u16 access; - u32 region_length; + u64 region_length; bool use_pbles = false; unsigned long flags; int err = -ENOSYS; + if (length > I40IW_MAX_MR_SIZE) + return ERR_PTR(-EINVAL); region = ib_umem_get(pd->uobject->context,
start, length, acc, 0); if (IS_ERR(region)) return (struct ib_mr *)region; @@ -1564,7 +1731,7 @@ static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd, palloc = &iwpbl->pble_alloc; iwmr->type = req.reg_type; - iwmr->page_cnt = pbl_depth; + iwmr->page_cnt = (u32)pbl_depth; switch (req.reg_type) { case IW_MEMREG_TYPE_QP: @@ -1881,12 +2048,14 @@ static int i40iw_post_send(struct ib_qp *ibqp, enum i40iw_status_code ret; int err = 0; unsigned long flags; + bool inv_stag; iwqp = (struct i40iw_qp *)ibqp; ukqp = &iwqp->sc_qp.qp_uk; spin_lock_irqsave(&iwqp->lock, flags); while (ib_wr) { + inv_stag = false; memset(&info, 0, sizeof(info)); info.wr_id = (u64)(ib_wr->wr_id); if ((ib_wr->send_flags & IB_SEND_SIGNALED) || iwqp->sig_all) @@ -1896,19 +2065,28 @@ static int i40iw_post_send(struct ib_qp *ibqp, switch (ib_wr->opcode) { case IB_WR_SEND: - if (ib_wr->send_flags & IB_SEND_SOLICITED) - info.op_type = I40IW_OP_TYPE_SEND_SOL; - else - info.op_type = I40IW_OP_TYPE_SEND; + /* fall-through */ + case IB_WR_SEND_WITH_INV: + if (ib_wr->opcode == IB_WR_SEND) { + if (ib_wr->send_flags & IB_SEND_SOLICITED) + info.op_type = I40IW_OP_TYPE_SEND_SOL; + else + info.op_type = I40IW_OP_TYPE_SEND; + } else { + if (ib_wr->send_flags & IB_SEND_SOLICITED) + info.op_type = I40IW_OP_TYPE_SEND_SOL_INV; + else + info.op_type = I40IW_OP_TYPE_SEND_INV; + } if (ib_wr->send_flags & IB_SEND_INLINE) { info.op.inline_send.data = (void *)(unsigned long)ib_wr->sg_list[0].addr; info.op.inline_send.len = ib_wr->sg_list[0].length; - ret = ukqp->ops.iw_inline_send(ukqp, &info, rdma_wr(ib_wr)->rkey, false); + ret = ukqp->ops.iw_inline_send(ukqp, &info, ib_wr->ex.invalidate_rkey, false); } else { info.op.send.num_sges = ib_wr->num_sge; info.op.send.sg_list = (struct i40iw_sge *)ib_wr->sg_list; - ret = ukqp->ops.iw_send(ukqp, &info, rdma_wr(ib_wr)->rkey, false); + ret = ukqp->ops.iw_send(ukqp, &info, ib_wr->ex.invalidate_rkey, false); } if (ret) @@ -1936,7 +2114,14 @@ static int i40iw_post_send(struct ib_qp *ibqp, if (ret) err = -EIO; break; + case IB_WR_RDMA_READ_WITH_INV: + inv_stag = true; + /* fall-through*/ case IB_WR_RDMA_READ: + if (ib_wr->num_sge > I40IW_MAX_SGE_RD) { + err = -EINVAL; + break; + } info.op_type = I40IW_OP_TYPE_RDMA_READ; info.op.rdma_read.rem_addr.tag_off = rdma_wr(ib_wr)->remote_addr; info.op.rdma_read.rem_addr.stag = rdma_wr(ib_wr)->rkey; @@ -1944,10 +2129,47 @@ static int i40iw_post_send(struct ib_qp *ibqp, info.op.rdma_read.lo_addr.tag_off = ib_wr->sg_list->addr; info.op.rdma_read.lo_addr.stag = ib_wr->sg_list->lkey; info.op.rdma_read.lo_addr.len = ib_wr->sg_list->length; - ret = ukqp->ops.iw_rdma_read(ukqp, &info, false, false); + ret = ukqp->ops.iw_rdma_read(ukqp, &info, inv_stag, false); if (ret) err = -EIO; break; + case IB_WR_LOCAL_INV: + info.op_type = I40IW_OP_TYPE_INV_STAG; + info.op.inv_local_stag.target_stag = ib_wr->ex.invalidate_rkey; + ret = ukqp->ops.iw_stag_local_invalidate(ukqp, &info, true); + if (ret) + err = -EIO; + break; + case IB_WR_REG_MR: + { + struct i40iw_mr *iwmr = to_iwmr(reg_wr(ib_wr)->mr); + int page_shift = ilog2(reg_wr(ib_wr)->mr->page_size); + int flags = reg_wr(ib_wr)->access; + struct i40iw_pble_alloc *palloc = &iwmr->iwpbl.pble_alloc; + struct i40iw_sc_dev *dev = &iwqp->iwdev->sc_dev; + struct i40iw_fast_reg_stag_info info; + + info.access_rights = I40IW_ACCESS_FLAGS_LOCALREAD; + info.access_rights |= i40iw_get_user_access(flags); + info.stag_key = reg_wr(ib_wr)->key & 0xff; + info.stag_idx = reg_wr(ib_wr)->key >> 8; + info.wr_id = ib_wr->wr_id; + + info.addr_type = 
I40IW_ADDR_TYPE_VA_BASED; + info.va = (void *)(uintptr_t)iwmr->ibmr.iova; + info.total_len = iwmr->ibmr.length; + info.first_pm_pbl_index = palloc->level1.idx; + info.local_fence = ib_wr->send_flags & IB_SEND_FENCE; + info.signaled = ib_wr->send_flags & IB_SEND_SIGNALED; + + if (page_shift == 21) + info.page_size = 1; /* 2M page */ + + ret = dev->iw_priv_qp_ops->iw_mr_fast_register(&iwqp->sc_qp, &info, true); + if (ret) + err = -EIO; + break; + } default: err = -EINVAL; i40iw_pr_err(" upost_send bad opcode = 0x%x\n", @@ -2027,6 +2249,7 @@ static int i40iw_poll_cq(struct ib_cq *ibcq, enum i40iw_status_code ret; struct i40iw_cq_uk *ukcq; struct i40iw_sc_qp *qp; + struct i40iw_qp *iwqp; unsigned long flags; iwcq = (struct i40iw_cq *)ibcq; @@ -2037,6 +2260,8 @@ static int i40iw_poll_cq(struct ib_cq *ibcq, ret = ukcq->ops.iw_cq_poll_completion(ukcq, &cq_poll_info, true); if (ret == I40IW_ERR_QUEUE_EMPTY) { break; + } else if (ret == I40IW_ERR_QUEUE_DESTROYED) { + continue; } else if (ret) { if (!cqe_count) cqe_count = -1; @@ -2044,10 +2269,12 @@ static int i40iw_poll_cq(struct ib_cq *ibcq, } entry->wc_flags = 0; entry->wr_id = cq_poll_info.wr_id; - if (!cq_poll_info.error) - entry->status = IB_WC_SUCCESS; - else + if (cq_poll_info.error) { entry->status = IB_WC_WR_FLUSH_ERR; + entry->vendor_err = cq_poll_info.major_err << 16 | cq_poll_info.minor_err; + } else { + entry->status = IB_WC_SUCCESS; + } switch (cq_poll_info.op_type) { case I40IW_OP_TYPE_RDMA_WRITE: @@ -2071,12 +2298,17 @@ static int i40iw_poll_cq(struct ib_cq *ibcq, break; } - entry->vendor_err = - cq_poll_info.major_err << 16 | cq_poll_info.minor_err; entry->ex.imm_data = 0; qp = (struct i40iw_sc_qp *)cq_poll_info.qp_handle; entry->qp = (struct ib_qp *)qp->back_qp; entry->src_qp = cq_poll_info.qp_id; + iwqp = (struct i40iw_qp *)qp->back_qp; + if (iwqp->iwarp_state > I40IW_QP_STATE_RTS) { + if (!I40IW_RING_MORE_WORK(qp->qp_uk.sq_ring)) + complete(&iwqp->sq_drained); + if (!I40IW_RING_MORE_WORK(qp->qp_uk.rq_ring)) + complete(&iwqp->rq_drained); + } entry->byte_len = cq_poll_info.bytes_xfered; entry++; cqe_count++; @@ -2143,7 +2375,6 @@ static int i40iw_get_protocol_stats(struct ib_device *ibdev, struct i40iw_dev_hw_stats *hw_stats = &devstat->hw_stats; struct timespec curr_time; static struct timespec last_rd_time = {0, 0}; - enum i40iw_status_code status = 0; unsigned long flags; curr_time = current_kernel_time(); @@ -2156,11 +2387,8 @@ static int i40iw_get_protocol_stats(struct ib_device *ibdev, spin_unlock_irqrestore(&devstat->stats_lock, flags); } else { if (((u64)curr_time.tv_sec - (u64)last_rd_time.tv_sec) > 1) - status = i40iw_vchnl_vf_get_pe_stats(dev, - &devstat->hw_stats); - - if (status) - return -ENOSYS; + if (i40iw_vchnl_vf_get_pe_stats(dev, &devstat->hw_stats)) + return -ENOSYS; } stats->iw.ipInReceives = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP4RXPKTS] + @@ -2327,6 +2555,10 @@ static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev iwibdev->ibdev.query_device = i40iw_query_device; iwibdev->ibdev.create_ah = i40iw_create_ah; iwibdev->ibdev.destroy_ah = i40iw_destroy_ah; + iwibdev->ibdev.drain_sq = i40iw_drain_sq; + iwibdev->ibdev.drain_rq = i40iw_drain_rq; + iwibdev->ibdev.alloc_mr = i40iw_alloc_mr; + iwibdev->ibdev.map_mr_sg = i40iw_map_mr_sg; iwibdev->ibdev.iwcm = kzalloc(sizeof(*iwibdev->ibdev.iwcm), GFP_KERNEL); if (!iwibdev->ibdev.iwcm) { ib_dealloc_device(&iwibdev->ibdev); diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.h b/drivers/infiniband/hw/i40iw/i40iw_verbs.h index 
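/*
 * Aside: the sq_drained/rq_drained support wired up above is the usual
 * completion-based drain handshake: the CQ poll path completes once the
 * ring is empty and the QP has left RTS, and the drain verb waits on it.
 * A condensed sketch under that assumption (struct and names are
 * illustrative, not the driver's):
 */
#include <linux/completion.h>

struct demo_qp {
	struct completion sq_drained;	/* init_completion() at QP create */
	bool sq_more_work;		/* stands in for I40IW_RING_MORE_WORK() */
};

/* poll path: signal once the send ring has emptied */
static void demo_poll_cq_done(struct demo_qp *qp)
{
	if (!qp->sq_more_work)
		complete(&qp->sq_drained);	/* wakes demo_drain_sq() */
}

/* verb path: ib_drain_sq() ends up waiting here */
static void demo_drain_sq(struct demo_qp *qp)
{
	if (qp->sq_more_work)
		wait_for_completion(&qp->sq_drained);
}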
1101f77080e6..0069be8a5a38 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.h +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.h @@ -92,6 +92,7 @@ struct i40iw_mr { struct ib_umem *region; u16 type; u32 page_cnt; + u32 npages; u32 stag; u64 length; u64 pgaddrmem[MAX_SAVE_PAGE_ADDRS]; @@ -169,5 +170,7 @@ struct i40iw_qp { struct i40iw_pbl *iwpbl; struct i40iw_dma_mem q2_ctx_mem; struct i40iw_dma_mem ietf_mem; + struct completion sq_drained; + struct completion rq_drained; }; #endif diff --git a/drivers/infiniband/hw/i40iw/i40iw_vf.c b/drivers/infiniband/hw/i40iw/i40iw_vf.c index cb0f18340e14..e33d4810965c 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_vf.c +++ b/drivers/infiniband/hw/i40iw/i40iw_vf.c @@ -80,6 +80,6 @@ enum i40iw_status_code i40iw_manage_vf_pble_bp(struct i40iw_sc_cqp *cqp, return 0; } -struct i40iw_vf_cqp_ops iw_vf_cqp_ops = { +const struct i40iw_vf_cqp_ops iw_vf_cqp_ops = { i40iw_manage_vf_pble_bp }; diff --git a/drivers/infiniband/hw/i40iw/i40iw_vf.h b/drivers/infiniband/hw/i40iw/i40iw_vf.h index f649f3a62e13..4359559ece9c 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_vf.h +++ b/drivers/infiniband/hw/i40iw/i40iw_vf.h @@ -57,6 +57,6 @@ enum i40iw_status_code i40iw_manage_vf_pble_bp(struct i40iw_sc_cqp *cqp, u64 scratch, bool post_sq); -extern struct i40iw_vf_cqp_ops iw_vf_cqp_ops; +extern const struct i40iw_vf_cqp_ops iw_vf_cqp_ops; #endif diff --git a/drivers/infiniband/hw/i40iw/i40iw_virtchnl.c b/drivers/infiniband/hw/i40iw/i40iw_virtchnl.c index 6b68f7890b76..3041003c94d2 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_virtchnl.c +++ b/drivers/infiniband/hw/i40iw/i40iw_virtchnl.c @@ -254,7 +254,7 @@ static void vchnl_pf_send_get_hmc_fcn_resp(struct i40iw_sc_dev *dev, static void vchnl_pf_send_get_pe_stats_resp(struct i40iw_sc_dev *dev, u32 vf_id, struct i40iw_virtchnl_op_buf *vchnl_msg, - struct i40iw_dev_hw_stats hw_stats) + struct i40iw_dev_hw_stats *hw_stats) { enum i40iw_status_code ret_code; u8 resp_buffer[sizeof(struct i40iw_virtchnl_resp_buf) + sizeof(struct i40iw_dev_hw_stats) - 1]; @@ -264,7 +264,7 @@ static void vchnl_pf_send_get_pe_stats_resp(struct i40iw_sc_dev *dev, vchnl_msg_resp->iw_chnl_op_ctx = vchnl_msg->iw_chnl_op_ctx; vchnl_msg_resp->iw_chnl_buf_len = sizeof(resp_buffer); vchnl_msg_resp->iw_op_ret_code = I40IW_SUCCESS; - *((struct i40iw_dev_hw_stats *)vchnl_msg_resp->iw_chnl_buf) = hw_stats; + *((struct i40iw_dev_hw_stats *)vchnl_msg_resp->iw_chnl_buf) = *hw_stats; ret_code = dev->vchnl_if.vchnl_send(dev, vf_id, resp_buffer, sizeof(resp_buffer)); if (ret_code) i40iw_debug(dev, I40IW_DEBUG_VIRT, @@ -437,11 +437,9 @@ enum i40iw_status_code i40iw_vchnl_recv_pf(struct i40iw_sc_dev *dev, vchnl_pf_send_get_ver_resp(dev, vf_id, vchnl_msg); return I40IW_SUCCESS; } - for (iw_vf_idx = 0; iw_vf_idx < I40IW_MAX_PE_ENABLED_VF_COUNT; - iw_vf_idx++) { + for (iw_vf_idx = 0; iw_vf_idx < I40IW_MAX_PE_ENABLED_VF_COUNT; iw_vf_idx++) { if (!dev->vf_dev[iw_vf_idx]) { - if (first_avail_iw_vf == - I40IW_MAX_PE_ENABLED_VF_COUNT) + if (first_avail_iw_vf == I40IW_MAX_PE_ENABLED_VF_COUNT) first_avail_iw_vf = iw_vf_idx; continue; } @@ -541,7 +539,7 @@ enum i40iw_status_code i40iw_vchnl_recv_pf(struct i40iw_sc_dev *dev, devstat->ops.iw_hw_stat_read_all(devstat, &devstat->hw_stats); spin_unlock_irqrestore(&dev->dev_pestat.stats_lock, flags); vf_dev->msg_count--; - vchnl_pf_send_get_pe_stats_resp(dev, vf_id, vchnl_msg, devstat->hw_stats); + vchnl_pf_send_get_pe_stats_resp(dev, vf_id, vchnl_msg, &devstat->hw_stats); break; default: i40iw_debug(dev, I40IW_DEBUG_VIRT, @@ -596,23 
+594,25 @@ enum i40iw_status_code i40iw_vchnl_vf_get_ver(struct i40iw_sc_dev *dev, struct i40iw_virtchnl_req vchnl_req; enum i40iw_status_code ret_code; + if (!i40iw_vf_clear_to_send(dev)) + return I40IW_ERR_TIMEOUT; memset(&vchnl_req, 0, sizeof(vchnl_req)); vchnl_req.dev = dev; vchnl_req.parm = vchnl_ver; vchnl_req.parm_len = sizeof(*vchnl_ver); vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg; + ret_code = vchnl_vf_send_get_ver_req(dev, &vchnl_req); - if (!ret_code) { - ret_code = i40iw_vf_wait_vchnl_resp(dev); - if (!ret_code) - ret_code = vchnl_req.ret_code; - else - dev->vchnl_up = false; - } else { + if (ret_code) { i40iw_debug(dev, I40IW_DEBUG_VIRT, "%s Send message failed 0x%0x\n", __func__, ret_code); + return ret_code; } - return ret_code; + ret_code = i40iw_vf_wait_vchnl_resp(dev); + if (ret_code) + return ret_code; + else + return vchnl_req.ret_code; } /** @@ -626,23 +626,25 @@ enum i40iw_status_code i40iw_vchnl_vf_get_hmc_fcn(struct i40iw_sc_dev *dev, struct i40iw_virtchnl_req vchnl_req; enum i40iw_status_code ret_code; + if (!i40iw_vf_clear_to_send(dev)) + return I40IW_ERR_TIMEOUT; memset(&vchnl_req, 0, sizeof(vchnl_req)); vchnl_req.dev = dev; vchnl_req.parm = hmc_fcn; vchnl_req.parm_len = sizeof(*hmc_fcn); vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg; + ret_code = vchnl_vf_send_get_hmc_fcn_req(dev, &vchnl_req); - if (!ret_code) { - ret_code = i40iw_vf_wait_vchnl_resp(dev); - if (!ret_code) - ret_code = vchnl_req.ret_code; - else - dev->vchnl_up = false; - } else { + if (ret_code) { i40iw_debug(dev, I40IW_DEBUG_VIRT, "%s Send message failed 0x%0x\n", __func__, ret_code); + return ret_code; } - return ret_code; + ret_code = i40iw_vf_wait_vchnl_resp(dev); + if (ret_code) + return ret_code; + else + return vchnl_req.ret_code; } /** @@ -660,25 +662,27 @@ enum i40iw_status_code i40iw_vchnl_vf_add_hmc_objs(struct i40iw_sc_dev *dev, struct i40iw_virtchnl_req vchnl_req; enum i40iw_status_code ret_code; + if (!i40iw_vf_clear_to_send(dev)) + return I40IW_ERR_TIMEOUT; memset(&vchnl_req, 0, sizeof(vchnl_req)); vchnl_req.dev = dev; vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg; + ret_code = vchnl_vf_send_add_hmc_objs_req(dev, &vchnl_req, rsrc_type, start_index, rsrc_count); - if (!ret_code) { - ret_code = i40iw_vf_wait_vchnl_resp(dev); - if (!ret_code) - ret_code = vchnl_req.ret_code; - else - dev->vchnl_up = false; - } else { + if (ret_code) { i40iw_debug(dev, I40IW_DEBUG_VIRT, "%s Send message failed 0x%0x\n", __func__, ret_code); + return ret_code; } - return ret_code; + ret_code = i40iw_vf_wait_vchnl_resp(dev); + if (ret_code) + return ret_code; + else + return vchnl_req.ret_code; } /** @@ -696,25 +700,27 @@ enum i40iw_status_code i40iw_vchnl_vf_del_hmc_obj(struct i40iw_sc_dev *dev, struct i40iw_virtchnl_req vchnl_req; enum i40iw_status_code ret_code; + if (!i40iw_vf_clear_to_send(dev)) + return I40IW_ERR_TIMEOUT; memset(&vchnl_req, 0, sizeof(vchnl_req)); vchnl_req.dev = dev; vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg; + ret_code = vchnl_vf_send_del_hmc_objs_req(dev, &vchnl_req, rsrc_type, start_index, rsrc_count); - if (!ret_code) { - ret_code = i40iw_vf_wait_vchnl_resp(dev); - if (!ret_code) - ret_code = vchnl_req.ret_code; - else - dev->vchnl_up = false; - } else { + if (ret_code) { i40iw_debug(dev, I40IW_DEBUG_VIRT, "%s Send message failed 0x%0x\n", __func__, ret_code); + return ret_code; } - return ret_code; + ret_code = i40iw_vf_wait_vchnl_resp(dev); + if (ret_code) + return ret_code; + else + return vchnl_req.ret_code; } /** @@ -728,21 
+734,23 @@ enum i40iw_status_code i40iw_vchnl_vf_get_pe_stats(struct i40iw_sc_dev *dev, struct i40iw_virtchnl_req vchnl_req; enum i40iw_status_code ret_code; + if (!i40iw_vf_clear_to_send(dev)) + return I40IW_ERR_TIMEOUT; memset(&vchnl_req, 0, sizeof(vchnl_req)); vchnl_req.dev = dev; vchnl_req.parm = hw_stats; vchnl_req.parm_len = sizeof(*hw_stats); vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg; + ret_code = vchnl_vf_send_get_pe_stats_req(dev, &vchnl_req); - if (!ret_code) { - ret_code = i40iw_vf_wait_vchnl_resp(dev); - if (!ret_code) - ret_code = vchnl_req.ret_code; - else - dev->vchnl_up = false; - } else { + if (ret_code) { i40iw_debug(dev, I40IW_DEBUG_VIRT, "%s Send message failed 0x%0x\n", __func__, ret_code); + return ret_code; } - return ret_code; + ret_code = i40iw_vf_wait_vchnl_resp(dev); + if (ret_code) + return ret_code; + else + return vchnl_req.ret_code; } diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c index 99451d887266..8f7ad07915b0 100644 --- a/drivers/infiniband/hw/mlx4/mcg.c +++ b/drivers/infiniband/hw/mlx4/mcg.c @@ -96,7 +96,7 @@ struct ib_sa_mcmember_data { u8 scope_join_state; u8 proxy_join; u8 reserved[2]; -}; +} __packed __aligned(4); struct mcast_group { struct ib_sa_mcmember_data rec; @@ -747,14 +747,11 @@ static struct mcast_group *search_relocate_mgid0_group(struct mlx4_ib_demux_ctx __be64 tid, union ib_gid *new_mgid) { - struct mcast_group *group = NULL, *cur_group; + struct mcast_group *group = NULL, *cur_group, *n; struct mcast_req *req; - struct list_head *pos; - struct list_head *n; mutex_lock(&ctx->mcg_table_lock); - list_for_each_safe(pos, n, &ctx->mcg_mgid0_list) { - group = list_entry(pos, struct mcast_group, mgid0_list); + list_for_each_entry_safe(group, n, &ctx->mcg_mgid0_list, mgid0_list) { mutex_lock(&group->lock); if (group->last_req_tid == tid) { if (memcmp(new_mgid, &mgid0, sizeof mgid0)) { diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 1eca01cebe51..6c5ac5d8f32f 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -717,9 +717,8 @@ int mlx4_ib_dealloc_mw(struct ib_mw *mw); struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); -int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int sg_nents); +int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index ce0b5aa8eb9b..631272172a0b 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -528,9 +528,8 @@ static int mlx4_set_page(struct ib_mr *ibmr, u64 addr) return 0; } -int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int sg_nents) +int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) { struct mlx4_ib_mr *mr = to_mmr(ibmr); int rc; @@ -541,7 +540,7 @@ int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, sizeof(u64) * mr->max_pages, DMA_TO_DEVICE); - rc = ib_sg_to_pages(ibmr, sg, sg_nents, mlx4_set_page); + rc = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, mlx4_set_page); ib_dma_sync_single_for_device(ibmr->device, mr->page_map, sizeof(u64) * mr->max_pages, diff --git 
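/*
 * Aside: the new "unsigned int *sg_offset" parameter threaded through
 * every map_mr_sg()/ib_sg_to_pages() call in this series lets a caller
 * begin the mapping part-way into the first scatterlist element; on a
 * partial map the core writes back where it stopped so the caller can
 * resume from that point. The per-driver wrapper is uniform; a sketch
 * with illustrative names (not any one driver's code):
 */
#include <rdma/ib_verbs.h>

static int demo_set_page(struct ib_mr *ibmr, u64 addr)
{
	/* a real driver records @addr in its page/descriptor list here */
	return 0;
}

static int demo_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
			  int sg_nents, unsigned int *sg_offset)
{
	/* reset the per-MR page cursor, then let the core walk the SGL */
	return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, demo_set_page);
}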
a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index a00ba4418de9..dabcc65bd65e 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -879,7 +879,10 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn); cq->mcq.irqn = irqn; - cq->mcq.comp = mlx5_ib_cq_comp; + if (context) + cq->mcq.tasklet_ctx.comp = mlx5_ib_cq_comp; + else + cq->mcq.comp = mlx5_ib_cq_comp; cq->mcq.event = mlx5_ib_cq_event; INIT_LIST_HEAD(&cq->wc_list); diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 4cb81f68d850..c72797cd9e4f 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -38,6 +38,9 @@ #include <linux/dma-mapping.h> #include <linux/slab.h> #include <linux/io-mapping.h> +#if defined(CONFIG_X86) +#include <asm/pat.h> +#endif #include <linux/sched.h> #include <rdma/ib_user_verbs.h> #include <rdma/ib_addr.h> @@ -517,6 +520,10 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_UD_TSO; } + if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && + MLX5_CAP_ETH(dev->mdev, scatter_fcs)) + props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS; + props->vendor_part_id = mdev->pdev->device; props->hw_ver = mdev->pdev->revision; @@ -1068,38 +1075,89 @@ static int get_index(unsigned long offset) return get_arg(offset); } +static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd) +{ + switch (cmd) { + case MLX5_IB_MMAP_WC_PAGE: + return "WC"; + case MLX5_IB_MMAP_REGULAR_PAGE: + return "best effort WC"; + case MLX5_IB_MMAP_NC_PAGE: + return "NC"; + default: + return NULL; + } +} + +static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, + struct vm_area_struct *vma, struct mlx5_uuar_info *uuari) +{ + int err; + unsigned long idx; + phys_addr_t pfn, pa; + pgprot_t prot; + + switch (cmd) { + case MLX5_IB_MMAP_WC_PAGE: +/* Some architectures don't support WC memory */ +#if defined(CONFIG_X86) + if (!pat_enabled()) + return -EPERM; +#elif !(defined(CONFIG_PPC) || (defined(CONFIG_ARM) && defined(CONFIG_MMU))) + return -EPERM; +#endif + /* fall through */ + case MLX5_IB_MMAP_REGULAR_PAGE: + /* For MLX5_IB_MMAP_REGULAR_PAGE do the best effort to get WC */ + prot = pgprot_writecombine(vma->vm_page_prot); + break; + case MLX5_IB_MMAP_NC_PAGE: + prot = pgprot_noncached(vma->vm_page_prot); + break; + default: + return -EINVAL; + } + + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + idx = get_index(vma->vm_pgoff); + if (idx >= uuari->num_uars) + return -EINVAL; + + pfn = uar_index2pfn(dev, uuari->uars[idx].index); + mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn); + + vma->vm_page_prot = prot; + err = io_remap_pfn_range(vma, vma->vm_start, pfn, + PAGE_SIZE, vma->vm_page_prot); + if (err) { + mlx5_ib_err(dev, "io_remap_pfn_range failed with error=%d, vm_start=0x%lx, pfn=%pa, mmap_cmd=%s\n", + err, vma->vm_start, &pfn, mmap_cmd2str(cmd)); + return -EAGAIN; + } + + pa = pfn << PAGE_SHIFT; + mlx5_ib_dbg(dev, "mapped %s at 0x%lx, PA %pa\n", mmap_cmd2str(cmd), + vma->vm_start, &pa); + + return 0; +} + static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) { struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); struct mlx5_uuar_info *uuari = &context->uuari; unsigned long command; - unsigned long idx; phys_addr_t pfn; command = get_command(vma->vm_pgoff); switch (command) { + case MLX5_IB_MMAP_WC_PAGE: 
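/*
 * Aside: uar_mmap() above reduces to picking a caching attribute per
 * mmap command before io_remap_pfn_range(): write-combining for
 * doorbell/blueflame pages where the architecture supports it, and
 * uncached otherwise. Condensed (illustrative helper, not the driver's):
 */
#include <asm/pgtable.h>

static pgprot_t demo_uar_prot(pgprot_t base, bool want_write_combine)
{
	return want_write_combine ? pgprot_writecombine(base)
				  : pgprot_noncached(base);
}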
+ case MLX5_IB_MMAP_NC_PAGE: case MLX5_IB_MMAP_REGULAR_PAGE: - if (vma->vm_end - vma->vm_start != PAGE_SIZE) - return -EINVAL; - - idx = get_index(vma->vm_pgoff); - if (idx >= uuari->num_uars) - return -EINVAL; - - pfn = uar_index2pfn(dev, uuari->uars[idx].index); - mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn 0x%llx\n", idx, - (unsigned long long)pfn); - - vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); - if (io_remap_pfn_range(vma, vma->vm_start, pfn, - PAGE_SIZE, vma->vm_page_prot)) - return -EAGAIN; - - mlx5_ib_dbg(dev, "mapped WC at 0x%lx, PA 0x%llx\n", - vma->vm_start, - (unsigned long long)pfn << PAGE_SHIFT); - break; + return uar_mmap(dev, command, vma, uuari); case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES: return -ENOSYS; @@ -1108,7 +1166,7 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm if (vma->vm_end - vma->vm_start != PAGE_SIZE) return -EINVAL; - if (vma->vm_flags & (VM_WRITE | VM_EXEC)) + if (vma->vm_flags & VM_WRITE) return -EPERM; /* Don't expose to user-space information it shouldn't have */ diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index b46c25542a7c..c4a9825828bc 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -70,6 +70,8 @@ enum { enum mlx5_ib_mmap_cmd { MLX5_IB_MMAP_REGULAR_PAGE = 0, MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES = 1, + MLX5_IB_MMAP_WC_PAGE = 2, + MLX5_IB_MMAP_NC_PAGE = 3, /* 5 is chosen in order to be compatible with old versions of libmlx5 */ MLX5_IB_MMAP_CORE_CLOCK = 5, }; @@ -356,6 +358,7 @@ enum mlx5_ib_qp_flags { MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 5, /* QP uses 1 as its source QP number */ MLX5_IB_QP_SQPN_QP1 = 1 << 6, + MLX5_IB_QP_CAP_SCATTER_FCS = 1 << 7, }; struct mlx5_umr_wr { @@ -712,9 +715,8 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr); struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); -int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int sg_nents); +int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const struct ib_mad_hdr *in, size_t in_mad_size, diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 4d5bff151cdf..8cf2ce50511f 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1751,26 +1751,33 @@ done: static int mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr, struct scatterlist *sgl, - unsigned short sg_nents) + unsigned short sg_nents, + unsigned int *sg_offset_p) { struct scatterlist *sg = sgl; struct mlx5_klm *klms = mr->descs; + unsigned int sg_offset = sg_offset_p ? 
*sg_offset_p : 0; u32 lkey = mr->ibmr.pd->local_dma_lkey; int i; - mr->ibmr.iova = sg_dma_address(sg); + mr->ibmr.iova = sg_dma_address(sg) + sg_offset; mr->ibmr.length = 0; mr->ndescs = sg_nents; for_each_sg(sgl, sg, sg_nents, i) { if (unlikely(i > mr->max_descs)) break; - klms[i].va = cpu_to_be64(sg_dma_address(sg)); - klms[i].bcount = cpu_to_be32(sg_dma_len(sg)); + klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset); + klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset); klms[i].key = cpu_to_be32(lkey); mr->ibmr.length += sg_dma_len(sg); + + sg_offset = 0; } + if (sg_offset_p) + *sg_offset_p = sg_offset; + return i; } @@ -1788,9 +1795,8 @@ static int mlx5_set_page(struct ib_mr *ibmr, u64 addr) return 0; } -int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int sg_nents) +int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) { struct mlx5_ib_mr *mr = to_mmr(ibmr); int n; @@ -1802,9 +1808,10 @@ int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, DMA_TO_DEVICE); if (mr->access_mode == MLX5_ACCESS_MODE_KLM) - n = mlx5_ib_sg_to_klms(mr, sg, sg_nents); + n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset); else - n = ib_sg_to_pages(ibmr, sg, sg_nents, mlx5_set_page); + n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, + mlx5_set_page); ib_dma_sync_single_for_device(ibmr->device, mr->desc_map, mr->desc_size * mr->max_descs, diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 8dee8bc1e0fe..504117657d41 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1028,6 +1028,7 @@ static int get_rq_pas_size(void *qpc) static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev, struct mlx5_ib_rq *rq, void *qpin) { + struct mlx5_ib_qp *mqp = rq->base.container_mibqp; __be64 *pas; __be64 *qp_pas; void *in; @@ -1051,6 +1052,9 @@ static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev, MLX5_SET(rqc, rqc, user_index, MLX5_GET(qpc, qpc, user_index)); MLX5_SET(rqc, rqc, cqn, MLX5_GET(qpc, qpc, cqn_rcv)); + if (mqp->flags & MLX5_IB_QP_CAP_SCATTER_FCS) + MLX5_SET(rqc, rqc, scatter_fcs, 1); + wq = MLX5_ADDR_OF(rqc, rqc, wq); MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); MLX5_SET(wq, wq, end_padding_mode, @@ -1136,11 +1140,12 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, } if (qp->rq.wqe_cnt) { + rq->base.container_mibqp = qp; + err = create_raw_packet_qp_rq(dev, rq, in); if (err) goto err_destroy_sq; - rq->base.container_mibqp = qp; err = create_raw_packet_qp_tir(dev, rq, tdn); if (err) @@ -1252,6 +1257,19 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, return -EOPNOTSUPP; } + if (init_attr->create_flags & IB_QP_CREATE_SCATTER_FCS) { + if (init_attr->qp_type != IB_QPT_RAW_PACKET) { + mlx5_ib_dbg(dev, "Scatter FCS is supported only for Raw Packet QPs"); + return -EOPNOTSUPP; + } + if (!MLX5_CAP_GEN(dev->mdev, eth_net_offloads) || + !MLX5_CAP_ETH(dev->mdev, scatter_fcs)) { + mlx5_ib_dbg(dev, "Scatter FCS isn't supported\n"); + return -EOPNOTSUPP; + } + qp->flags |= MLX5_IB_QP_CAP_SCATTER_FCS; + } + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c index 6d3a169c049b..37331e2fdc5f 100644 --- a/drivers/infiniband/hw/nes/nes_utils.c +++ b/drivers/infiniband/hw/nes/nes_utils.c @@ -44,6 +44,7 @@ #include <linux/ip.h> #include <linux/tcp.h> #include <linux/init.h> +#include 
<linux/kernel.h> #include <asm/io.h> #include <asm/irq.h> @@ -903,70 +904,15 @@ void nes_clc(unsigned long parm) */ void nes_dump_mem(unsigned int dump_debug_level, void *addr, int length) { - char xlate[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', - 'a', 'b', 'c', 'd', 'e', 'f'}; - char *ptr; - char hex_buf[80]; - char ascii_buf[20]; - int num_char; - int num_ascii; - int num_hex; - if (!(nes_debug_level & dump_debug_level)) { return; } - ptr = addr; if (length > 0x100) { nes_debug(dump_debug_level, "Length truncated from %x to %x\n", length, 0x100); length = 0x100; } - nes_debug(dump_debug_level, "Address=0x%p, length=0x%x (%d)\n", ptr, length, length); - - memset(ascii_buf, 0, 20); - memset(hex_buf, 0, 80); - - num_ascii = 0; - num_hex = 0; - for (num_char = 0; num_char < length; num_char++) { - if (num_ascii == 8) { - ascii_buf[num_ascii++] = ' '; - hex_buf[num_hex++] = '-'; - hex_buf[num_hex++] = ' '; - } - - if (*ptr < 0x20 || *ptr > 0x7e) - ascii_buf[num_ascii++] = '.'; - else - ascii_buf[num_ascii++] = *ptr; - hex_buf[num_hex++] = xlate[((*ptr & 0xf0) >> 4)]; - hex_buf[num_hex++] = xlate[*ptr & 0x0f]; - hex_buf[num_hex++] = ' '; - ptr++; - - if (num_ascii >= 17) { - /* output line and reset */ - nes_debug(dump_debug_level, " %s | %s\n", hex_buf, ascii_buf); - memset(ascii_buf, 0, 20); - memset(hex_buf, 0, 80); - num_ascii = 0; - num_hex = 0; - } - } + nes_debug(dump_debug_level, "Address=0x%p, length=0x%x (%d)\n", addr, length, length); - /* output the rest */ - if (num_ascii) { - while (num_ascii < 17) { - if (num_ascii == 8) { - hex_buf[num_hex++] = ' '; - hex_buf[num_hex++] = ' '; - } - hex_buf[num_hex++] = ' '; - hex_buf[num_hex++] = ' '; - hex_buf[num_hex++] = ' '; - num_ascii++; - } - - nes_debug(dump_debug_level, " %s | %s\n", hex_buf, ascii_buf); - } + print_hex_dump(KERN_ERR, PFX, DUMP_PREFIX_NONE, 16, 1, addr, length, true); } diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index fba69a39a7eb..464d6da5fe91 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -402,15 +402,14 @@ static int nes_set_page(struct ib_mr *ibmr, u64 addr) return 0; } -static int nes_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int sg_nents) +static int nes_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, + int sg_nents, unsigned int *sg_offset) { struct nes_mr *nesmr = to_nesmr(ibmr); nesmr->npages = 0; - return ib_sg_to_pages(ibmr, sg, sg_nents, nes_set_page); + return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, nes_set_page); } /** @@ -981,7 +980,7 @@ static int nes_setup_mmap_qp(struct nes_qp *nesqp, struct nes_vnic *nesvnic, /** * nes_free_qp_mem() is to free up the qp's pci_alloc_consistent() memory. 
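/*
 * Aside: print_hex_dump() reproduces the "hex | ascii" layout that the
 * removed open-coded dumper in nes_dump_mem() built by hand. A typical
 * call looks like this (level, prefix and buffer are illustrative):
 */
#include <linux/types.h>
#include <linux/printk.h>

static void demo_dump(const void *buf, size_t len)
{
	/* 16 bytes per row, 1-byte groups, with the ASCII column enabled */
	print_hex_dump(KERN_DEBUG, "demo: ", DUMP_PREFIX_OFFSET,
		       16, 1, buf, len, true);
}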
*/ -static inline void nes_free_qp_mem(struct nes_device *nesdev, +static void nes_free_qp_mem(struct nes_device *nesdev, struct nes_qp *nesqp, int virt_wqs) { unsigned long flags; @@ -1315,6 +1314,8 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd, nes_debug(NES_DBG_QP, "Invalid QP type: %d\n", init_attr->qp_type); return ERR_PTR(-EINVAL); } + init_completion(&nesqp->sq_drained); + init_completion(&nesqp->rq_drained); nesqp->sig_all = (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR); init_timer(&nesqp->terminate_timer); @@ -3452,6 +3453,29 @@ out: return err; } +/** + * nes_drain_sq - drain sq + * @ibqp: pointer to ibqp + */ +static void nes_drain_sq(struct ib_qp *ibqp) +{ + struct nes_qp *nesqp = to_nesqp(ibqp); + + if (nesqp->hwqp.sq_tail != nesqp->hwqp.sq_head) + wait_for_completion(&nesqp->sq_drained); +} + +/** + * nes_drain_rq - drain rq + * @ibqp: pointer to ibqp + */ +static void nes_drain_rq(struct ib_qp *ibqp) +{ + struct nes_qp *nesqp = to_nesqp(ibqp); + + if (nesqp->hwqp.rq_tail != nesqp->hwqp.rq_head) + wait_for_completion(&nesqp->rq_drained); +} /** * nes_poll_cq @@ -3582,6 +3606,13 @@ static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) } } + if (nesqp->iwarp_state > NES_CQP_QP_IWARP_STATE_RTS) { + if (nesqp->hwqp.sq_tail == nesqp->hwqp.sq_head) + complete(&nesqp->sq_drained); + if (nesqp->hwqp.rq_tail == nesqp->hwqp.rq_head) + complete(&nesqp->rq_drained); + } + entry->wr_id = wrid; entry++; cqe_count++; @@ -3754,6 +3785,8 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev) nesibdev->ibdev.req_notify_cq = nes_req_notify_cq; nesibdev->ibdev.post_send = nes_post_send; nesibdev->ibdev.post_recv = nes_post_recv; + nesibdev->ibdev.drain_sq = nes_drain_sq; + nesibdev->ibdev.drain_rq = nes_drain_rq; nesibdev->ibdev.iwcm = kzalloc(sizeof(*nesibdev->ibdev.iwcm), GFP_KERNEL); if (nesibdev->ibdev.iwcm == NULL) { diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h index 70290883d067..e02a5662dc20 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.h +++ b/drivers/infiniband/hw/nes/nes_verbs.h @@ -189,6 +189,8 @@ struct nes_qp { u8 pau_pending; u8 pau_state; __u64 nesuqp_addr; + struct completion sq_drained; + struct completion rq_drained; }; struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index a8496a18e20d..b1a3d91fe8b9 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -3081,13 +3081,12 @@ static int ocrdma_set_page(struct ib_mr *ibmr, u64 addr) return 0; } -int ocrdma_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int sg_nents) +int ocrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) { struct ocrdma_mr *mr = get_ocrdma_mr(ibmr); mr->npages = 0; - return ib_sg_to_pages(ibmr, sg, sg_nents, ocrdma_set_page); + return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, ocrdma_set_page); } diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h index 8b517fd36779..704ef1e9271b 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h @@ -122,8 +122,7 @@ struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *, u64 start, u64 length, struct ib_mr *ocrdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); -int ocrdma_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int 
sg_nents); +int ocrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); #endif /* __OCRDMA_VERBS_H__ */ diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index 3f062f0dd9d8..f253111e682e 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -1090,7 +1090,7 @@ void qib_free_devdata(struct qib_devdata *dd) qib_dbg_ibdev_exit(&dd->verbs_dev); #endif free_percpu(dd->int_counter); - ib_dealloc_device(&dd->verbs_dev.rdi.ibdev); + rvt_dealloc_device(&dd->verbs_dev.rdi); } u64 qib_int_counter(struct qib_devdata *dd) @@ -1183,7 +1183,7 @@ struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra) bail: if (!list_empty(&dd->list)) list_del_init(&dd->list); - ib_dealloc_device(&dd->verbs_dev.rdi.ibdev); + rvt_dealloc_device(&dd->verbs_dev.rdi); return ERR_PTR(ret); } diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index 9088e26d3ac8..444028a3582a 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -230,7 +230,7 @@ bail: * * Return 1 if constructed; otherwise, return 0. */ -int qib_make_rc_req(struct rvt_qp *qp) +int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags) { struct qib_qp_priv *priv = qp->priv; struct qib_ibdev *dev = to_idev(qp->ibqp.device); diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c index a5f07a64b228..b67779256297 100644 --- a/drivers/infiniband/hw/qib/qib_ruc.c +++ b/drivers/infiniband/hw/qib/qib_ruc.c @@ -739,7 +739,7 @@ void qib_do_send(struct rvt_qp *qp) struct qib_qp_priv *priv = qp->priv; struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); struct qib_pportdata *ppd = ppd_from_ibp(ibp); - int (*make_req)(struct rvt_qp *qp); + int (*make_req)(struct rvt_qp *qp, unsigned long *flags); unsigned long flags; if ((qp->ibqp.qp_type == IB_QPT_RC || @@ -781,7 +781,7 @@ void qib_do_send(struct rvt_qp *qp) qp->s_hdrwords = 0; spin_lock_irqsave(&qp->s_lock, flags); } - } while (make_req(qp)); + } while (make_req(qp, &flags)); spin_unlock_irqrestore(&qp->s_lock, flags); } diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c index 7bdbc79ceaa3..1d61bd04f449 100644 --- a/drivers/infiniband/hw/qib/qib_uc.c +++ b/drivers/infiniband/hw/qib/qib_uc.c @@ -45,7 +45,7 @@ * * Return 1 if constructed; otherwise, return 0. */ -int qib_make_uc_req(struct rvt_qp *qp) +int qib_make_uc_req(struct rvt_qp *qp, unsigned long *flags) { struct qib_qp_priv *priv = qp->priv; struct qib_other_headers *ohdr; diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index d9502137de62..846e6c726df7 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -238,7 +238,7 @@ drop: * * Return 1 if constructed; otherwise, return 0. 
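
The qib hunks above and below change the make_req() callbacks to take the caller's saved IRQ flags by pointer: qib_do_send() holds qp->s_lock via spin_lock_irqsave(), and qib_make_ud_req() must drop that lock around the loopback path using the caller's flags rather than a freshly saved copy. A condensed sketch of the idiom, with generic names standing in for the driver's:

#include <linux/spinlock.h>

struct my_qp {
	spinlock_t lock;
};

static void do_loopback(struct my_qp *qp)
{
	/* work that must run without qp->lock held */
}

/* Entered with qp->lock held; *flags is the IRQ state the caller saved
 * with spin_lock_irqsave().  Unlocking with anything else (as the old
 * local_irq_save() dance effectively did) would restore stale state. */
static int make_one_req(struct my_qp *qp, unsigned long *flags)
{
	unsigned long tflags = *flags;

	spin_unlock_irqrestore(&qp->lock, tflags);
	do_loopback(qp);
	spin_lock_irqsave(&qp->lock, tflags);
	*flags = tflags;	/* caller's later unlock uses these */
	return 0;
}
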
*/ -int qib_make_ud_req(struct rvt_qp *qp) +int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags) { struct qib_qp_priv *priv = qp->priv; struct qib_other_headers *ohdr; @@ -294,7 +294,7 @@ int qib_make_ud_req(struct rvt_qp *qp) this_cpu_inc(ibp->pmastats->n_unicast_xmit); lid = ah_attr->dlid & ~((1 << ppd->lmc) - 1); if (unlikely(lid == ppd->lid)) { - unsigned long flags; + unsigned long tflags = *flags; /* * If DMAs are in progress, we can't generate * a completion for the loopback packet since @@ -307,10 +307,10 @@ int qib_make_ud_req(struct rvt_qp *qp) goto bail; } qp->s_cur = next_cur; - local_irq_save(flags); - spin_unlock_irqrestore(&qp->s_lock, flags); + spin_unlock_irqrestore(&qp->s_lock, tflags); qib_ud_loopback(qp, wqe); - spin_lock_irqsave(&qp->s_lock, flags); + spin_lock_irqsave(&qp->s_lock, tflags); + *flags = tflags; qib_send_complete(qp, wqe, IB_WC_SUCCESS); goto done; } diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index 4b76a8d59337..6888f03c6d61 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -430,11 +430,11 @@ void qib_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, void qib_send_rc_ack(struct rvt_qp *qp); -int qib_make_rc_req(struct rvt_qp *qp); +int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags); -int qib_make_uc_req(struct rvt_qp *qp); +int qib_make_uc_req(struct rvt_qp *qp, unsigned long *flags); -int qib_make_ud_req(struct rvt_qp *qp); +int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags); int qib_register_ib_device(struct qib_devdata *); diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index a9e3bcc522c4..0f12c211c385 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -829,13 +829,13 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, case IB_QPT_SMI: case IB_QPT_GSI: case IB_QPT_UD: - qp->allowed_ops = IB_OPCODE_UD_SEND_ONLY & RVT_OPCODE_QP_MASK; + qp->allowed_ops = IB_OPCODE_UD; break; case IB_QPT_RC: - qp->allowed_ops = IB_OPCODE_RC_SEND_ONLY & RVT_OPCODE_QP_MASK; + qp->allowed_ops = IB_OPCODE_RC; break; case IB_QPT_UC: - qp->allowed_ops = IB_OPCODE_UC_SEND_ONLY & RVT_OPCODE_QP_MASK; + qp->allowed_ops = IB_OPCODE_UC; break; default: ret = ERR_PTR(-EINVAL); diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 6caf5272ba1f..e1cc2cc42f25 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -106,6 +106,19 @@ struct rvt_dev_info *rvt_alloc_device(size_t size, int nports) } EXPORT_SYMBOL(rvt_alloc_device); +/** + * rvt_dealloc_device - deallocate rdi + * @rdi: structure to free + * + * Free a structure allocated with rvt_alloc_device() + */ +void rvt_dealloc_device(struct rvt_dev_info *rdi) +{ + kfree(rdi->ports); + ib_dealloc_device(&rdi->ibdev); +} +EXPORT_SYMBOL(rvt_dealloc_device); + static int rvt_query_device(struct ib_device *ibdev, struct ib_device_attr *props, struct ib_udata *uhw) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c index a53fa5fc0dec..1502199c8e56 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -36,6 +36,27 @@ #include "ipoib.h" +struct ipoib_stats { + char stat_string[ETH_GSTRING_LEN]; + int stat_offset; +}; + +#define IPOIB_NETDEV_STAT(m) { \ + .stat_string = #m, \ + .stat_offset = offsetof(struct rtnl_link_stats64, m) } + +static const struct ipoib_stats 
ipoib_gstrings_stats[] = { + IPOIB_NETDEV_STAT(rx_packets), + IPOIB_NETDEV_STAT(tx_packets), + IPOIB_NETDEV_STAT(rx_bytes), + IPOIB_NETDEV_STAT(tx_bytes), + IPOIB_NETDEV_STAT(tx_errors), + IPOIB_NETDEV_STAT(rx_dropped), + IPOIB_NETDEV_STAT(tx_dropped) +}; + +#define IPOIB_GLOBAL_STATS_LEN ARRAY_SIZE(ipoib_gstrings_stats) + static void ipoib_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) { @@ -92,11 +113,57 @@ static int ipoib_set_coalesce(struct net_device *dev, return 0; } +static void ipoib_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats __always_unused *stats, + u64 *data) +{ + int i; + struct net_device_stats *net_stats = &dev->stats; + u8 *p = (u8 *)net_stats; + + for (i = 0; i < IPOIB_GLOBAL_STATS_LEN; i++) + data[i] = *(u64 *)(p + ipoib_gstrings_stats[i].stat_offset); + +} +static void ipoib_get_strings(struct net_device __always_unused *dev, + u32 stringset, u8 *data) +{ + u8 *p = data; + int i; + + switch (stringset) { + case ETH_SS_STATS: + for (i = 0; i < IPOIB_GLOBAL_STATS_LEN; i++) { + memcpy(p, ipoib_gstrings_stats[i].stat_string, + ETH_GSTRING_LEN); + p += ETH_GSTRING_LEN; + } + break; + case ETH_SS_TEST: + default: + break; + } +} +static int ipoib_get_sset_count(struct net_device __always_unused *dev, + int sset) +{ + switch (sset) { + case ETH_SS_STATS: + return IPOIB_GLOBAL_STATS_LEN; + case ETH_SS_TEST: + default: + break; + } + return -EOPNOTSUPP; +} static const struct ethtool_ops ipoib_ethtool_ops = { .get_drvinfo = ipoib_get_drvinfo, .get_coalesce = ipoib_get_coalesce, .set_coalesce = ipoib_set_coalesce, + .get_strings = ipoib_get_strings, + .get_ethtool_stats = ipoib_get_ethtool_stats, + .get_sset_count = ipoib_get_sset_count, }; void ipoib_set_ethtool_ops(struct net_device *dev) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 3643d559ba31..418e5a1c8744 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -51,8 +51,6 @@ MODULE_PARM_DESC(data_debug_level, "Enable data path debug tracing if > 0"); #endif -static DEFINE_MUTEX(pkey_mutex); - struct ipoib_ah *ipoib_create_ah(struct net_device *dev, struct ib_pd *pd, struct ib_ah_attr *attr) { diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index 9a391cc5b9b3..90be56893414 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -236,7 +236,7 @@ int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task, page_vec->npages = 0; page_vec->fake_mr.page_size = SIZE_4K; plen = ib_sg_to_pages(&page_vec->fake_mr, mem->sg, - mem->size, iser_set_page); + mem->size, NULL, iser_set_page); if (unlikely(plen < mem->size)) { iser_err("page vec too short to hold this SG\n"); iser_data_buf_dump(mem, device->ib_device); @@ -446,7 +446,7 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey)); - n = ib_map_mr_sg(mr, mem->sg, mem->size, SIZE_4K); + n = ib_map_mr_sg(mr, mem->sg, mem->size, NULL, SIZE_4K); if (unlikely(n != mem->size)) { iser_err("failed to map sg (%d/%d)\n", n, mem->size); diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index 411e4464ca23..897b5a4993e8 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -33,7 +33,8 @@ #define ISERT_MAX_CONN 8 #define ISER_MAX_RX_CQ_LEN (ISERT_QP_MAX_RECV_DTOS * ISERT_MAX_CONN) -#define 
ISER_MAX_TX_CQ_LEN (ISERT_QP_MAX_REQ_DTOS * ISERT_MAX_CONN) +#define ISER_MAX_TX_CQ_LEN \ + ((ISERT_QP_MAX_REQ_DTOS + ISCSI_DEF_XMIT_CMDS_MAX) * ISERT_MAX_CONN) #define ISER_MAX_CQ_LEN (ISER_MAX_RX_CQ_LEN + ISER_MAX_TX_CQ_LEN + \ ISERT_MAX_CONN) @@ -46,14 +47,6 @@ static LIST_HEAD(device_list); static struct workqueue_struct *isert_comp_wq; static struct workqueue_struct *isert_release_wq; -static void -isert_unmap_cmd(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn); -static int -isert_map_rdma(struct isert_cmd *isert_cmd, struct iscsi_conn *conn); -static void -isert_unreg_rdma(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn); -static int -isert_reg_rdma(struct isert_cmd *isert_cmd, struct iscsi_conn *conn); static int isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd); static int @@ -142,6 +135,7 @@ isert_create_qp(struct isert_conn *isert_conn, attr.recv_cq = comp->cq; attr.cap.max_send_wr = ISERT_QP_MAX_REQ_DTOS + 1; attr.cap.max_recv_wr = ISERT_QP_MAX_RECV_DTOS + 1; + attr.cap.max_rdma_ctxs = ISCSI_DEF_XMIT_CMDS_MAX; attr.cap.max_send_sge = device->ib_device->attrs.max_sge; isert_conn->max_sge = min(device->ib_device->attrs.max_sge, device->ib_device->attrs.max_sge_rd); @@ -270,9 +264,9 @@ isert_alloc_comps(struct isert_device *device) device->ib_device->num_comp_vectors)); isert_info("Using %d CQs, %s supports %d vectors support " - "Fast registration %d pi_capable %d\n", + "pi_capable %d\n", device->comps_used, device->ib_device->name, - device->ib_device->num_comp_vectors, device->use_fastreg, + device->ib_device->num_comp_vectors, device->pi_capable); device->comps = kcalloc(device->comps_used, sizeof(struct isert_comp), @@ -313,18 +307,6 @@ isert_create_device_ib_res(struct isert_device *device) isert_dbg("devattr->max_sge: %d\n", ib_dev->attrs.max_sge); isert_dbg("devattr->max_sge_rd: %d\n", ib_dev->attrs.max_sge_rd); - /* asign function handlers */ - if (ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS && - ib_dev->attrs.device_cap_flags & IB_DEVICE_SIGNATURE_HANDOVER) { - device->use_fastreg = 1; - device->reg_rdma_mem = isert_reg_rdma; - device->unreg_rdma_mem = isert_unreg_rdma; - } else { - device->use_fastreg = 0; - device->reg_rdma_mem = isert_map_rdma; - device->unreg_rdma_mem = isert_unmap_cmd; - } - ret = isert_alloc_comps(device); if (ret) goto out; @@ -417,146 +399,6 @@ isert_device_get(struct rdma_cm_id *cma_id) } static void -isert_conn_free_fastreg_pool(struct isert_conn *isert_conn) -{ - struct fast_reg_descriptor *fr_desc, *tmp; - int i = 0; - - if (list_empty(&isert_conn->fr_pool)) - return; - - isert_info("Freeing conn %p fastreg pool", isert_conn); - - list_for_each_entry_safe(fr_desc, tmp, - &isert_conn->fr_pool, list) { - list_del(&fr_desc->list); - ib_dereg_mr(fr_desc->data_mr); - if (fr_desc->pi_ctx) { - ib_dereg_mr(fr_desc->pi_ctx->prot_mr); - ib_dereg_mr(fr_desc->pi_ctx->sig_mr); - kfree(fr_desc->pi_ctx); - } - kfree(fr_desc); - ++i; - } - - if (i < isert_conn->fr_pool_size) - isert_warn("Pool still has %d regions registered\n", - isert_conn->fr_pool_size - i); -} - -static int -isert_create_pi_ctx(struct fast_reg_descriptor *desc, - struct ib_device *device, - struct ib_pd *pd) -{ - struct pi_context *pi_ctx; - int ret; - - pi_ctx = kzalloc(sizeof(*desc->pi_ctx), GFP_KERNEL); - if (!pi_ctx) { - isert_err("Failed to allocate pi context\n"); - return -ENOMEM; - } - - pi_ctx->prot_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, - ISCSI_ISER_SG_TABLESIZE); - if (IS_ERR(pi_ctx->prot_mr)) { - isert_err("Failed to 
allocate prot frmr err=%ld\n", - PTR_ERR(pi_ctx->prot_mr)); - ret = PTR_ERR(pi_ctx->prot_mr); - goto err_pi_ctx; - } - desc->ind |= ISERT_PROT_KEY_VALID; - - pi_ctx->sig_mr = ib_alloc_mr(pd, IB_MR_TYPE_SIGNATURE, 2); - if (IS_ERR(pi_ctx->sig_mr)) { - isert_err("Failed to allocate signature enabled mr err=%ld\n", - PTR_ERR(pi_ctx->sig_mr)); - ret = PTR_ERR(pi_ctx->sig_mr); - goto err_prot_mr; - } - - desc->pi_ctx = pi_ctx; - desc->ind |= ISERT_SIG_KEY_VALID; - desc->ind &= ~ISERT_PROTECTED; - - return 0; - -err_prot_mr: - ib_dereg_mr(pi_ctx->prot_mr); -err_pi_ctx: - kfree(pi_ctx); - - return ret; -} - -static int -isert_create_fr_desc(struct ib_device *ib_device, struct ib_pd *pd, - struct fast_reg_descriptor *fr_desc) -{ - fr_desc->data_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, - ISCSI_ISER_SG_TABLESIZE); - if (IS_ERR(fr_desc->data_mr)) { - isert_err("Failed to allocate data frmr err=%ld\n", - PTR_ERR(fr_desc->data_mr)); - return PTR_ERR(fr_desc->data_mr); - } - fr_desc->ind |= ISERT_DATA_KEY_VALID; - - isert_dbg("Created fr_desc %p\n", fr_desc); - - return 0; -} - -static int -isert_conn_create_fastreg_pool(struct isert_conn *isert_conn) -{ - struct fast_reg_descriptor *fr_desc; - struct isert_device *device = isert_conn->device; - struct se_session *se_sess = isert_conn->conn->sess->se_sess; - struct se_node_acl *se_nacl = se_sess->se_node_acl; - int i, ret, tag_num; - /* - * Setup the number of FRMRs based upon the number of tags - * available to session in iscsi_target_locate_portal(). - */ - tag_num = max_t(u32, ISCSIT_MIN_TAGS, se_nacl->queue_depth); - tag_num = (tag_num * 2) + ISCSIT_EXTRA_TAGS; - - isert_conn->fr_pool_size = 0; - for (i = 0; i < tag_num; i++) { - fr_desc = kzalloc(sizeof(*fr_desc), GFP_KERNEL); - if (!fr_desc) { - isert_err("Failed to allocate fast_reg descriptor\n"); - ret = -ENOMEM; - goto err; - } - - ret = isert_create_fr_desc(device->ib_device, - device->pd, fr_desc); - if (ret) { - isert_err("Failed to create fastreg descriptor err=%d\n", - ret); - kfree(fr_desc); - goto err; - } - - list_add_tail(&fr_desc->list, &isert_conn->fr_pool); - isert_conn->fr_pool_size++; - } - - isert_dbg("Creating conn %p fastreg pool size=%d", - isert_conn, isert_conn->fr_pool_size); - - return 0; - -err: - isert_conn_free_fastreg_pool(isert_conn); - return ret; -} - -static void isert_init_conn(struct isert_conn *isert_conn) { isert_conn->state = ISER_CONN_INIT; @@ -565,8 +407,6 @@ isert_init_conn(struct isert_conn *isert_conn) init_completion(&isert_conn->login_req_comp); kref_init(&isert_conn->kref); mutex_init(&isert_conn->mutex); - spin_lock_init(&isert_conn->pool_lock); - INIT_LIST_HEAD(&isert_conn->fr_pool); INIT_WORK(&isert_conn->release_work, isert_release_work); } @@ -739,9 +579,6 @@ isert_connect_release(struct isert_conn *isert_conn) BUG_ON(!device); - if (device->use_fastreg) - isert_conn_free_fastreg_pool(isert_conn); - isert_free_rx_descriptors(isert_conn); if (isert_conn->cm_id) rdma_destroy_id(isert_conn->cm_id); @@ -1080,7 +917,6 @@ isert_init_send_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, { struct iser_tx_desc *tx_desc = &isert_cmd->tx_desc; - isert_cmd->iser_ib_op = ISER_IB_SEND; tx_desc->tx_cqe.done = isert_send_done; send_wr->wr_cqe = &tx_desc->tx_cqe; @@ -1160,16 +996,6 @@ isert_put_login_tx(struct iscsi_conn *conn, struct iscsi_login *login, } if (!login->login_failed) { if (login->login_complete) { - if (!conn->sess->sess_ops->SessionType && - isert_conn->device->use_fastreg) { - ret = isert_conn_create_fastreg_pool(isert_conn); - if 
(ret) { - isert_err("Conn: %p failed to create" - " fastreg pool\n", isert_conn); - return ret; - } - } - ret = isert_alloc_rx_descriptors(isert_conn); if (ret) return ret; @@ -1633,97 +1459,26 @@ isert_login_recv_done(struct ib_cq *cq, struct ib_wc *wc) ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); } -static int -isert_map_data_buf(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, - struct scatterlist *sg, u32 nents, u32 length, u32 offset, - enum iser_ib_op_code op, struct isert_data_buf *data) -{ - struct ib_device *ib_dev = isert_conn->cm_id->device; - - data->dma_dir = op == ISER_IB_RDMA_WRITE ? - DMA_TO_DEVICE : DMA_FROM_DEVICE; - - data->len = length - offset; - data->offset = offset; - data->sg_off = data->offset / PAGE_SIZE; - - data->sg = &sg[data->sg_off]; - data->nents = min_t(unsigned int, nents - data->sg_off, - ISCSI_ISER_SG_TABLESIZE); - data->len = min_t(unsigned int, data->len, ISCSI_ISER_SG_TABLESIZE * - PAGE_SIZE); - - data->dma_nents = ib_dma_map_sg(ib_dev, data->sg, data->nents, - data->dma_dir); - if (unlikely(!data->dma_nents)) { - isert_err("Cmd: unable to dma map SGs %p\n", sg); - return -EINVAL; - } - - isert_dbg("Mapped cmd: %p count: %u sg: %p sg_nents: %u rdma_len %d\n", - isert_cmd, data->dma_nents, data->sg, data->nents, data->len); - - return 0; -} - static void -isert_unmap_data_buf(struct isert_conn *isert_conn, struct isert_data_buf *data) +isert_rdma_rw_ctx_destroy(struct isert_cmd *cmd, struct isert_conn *conn) { - struct ib_device *ib_dev = isert_conn->cm_id->device; - - ib_dma_unmap_sg(ib_dev, data->sg, data->nents, data->dma_dir); - memset(data, 0, sizeof(*data)); -} - - - -static void -isert_unmap_cmd(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn) -{ - isert_dbg("Cmd %p\n", isert_cmd); + struct se_cmd *se_cmd = &cmd->iscsi_cmd->se_cmd; + enum dma_data_direction dir = target_reverse_dma_direction(se_cmd); - if (isert_cmd->data.sg) { - isert_dbg("Cmd %p unmap_sg op\n", isert_cmd); - isert_unmap_data_buf(isert_conn, &isert_cmd->data); - } - - if (isert_cmd->rdma_wr) { - isert_dbg("Cmd %p free send_wr\n", isert_cmd); - kfree(isert_cmd->rdma_wr); - isert_cmd->rdma_wr = NULL; - } - - if (isert_cmd->ib_sge) { - isert_dbg("Cmd %p free ib_sge\n", isert_cmd); - kfree(isert_cmd->ib_sge); - isert_cmd->ib_sge = NULL; - } -} - -static void -isert_unreg_rdma(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn) -{ - isert_dbg("Cmd %p\n", isert_cmd); - - if (isert_cmd->fr_desc) { - isert_dbg("Cmd %p free fr_desc %p\n", isert_cmd, isert_cmd->fr_desc); - if (isert_cmd->fr_desc->ind & ISERT_PROTECTED) { - isert_unmap_data_buf(isert_conn, &isert_cmd->prot); - isert_cmd->fr_desc->ind &= ~ISERT_PROTECTED; - } - spin_lock_bh(&isert_conn->pool_lock); - list_add_tail(&isert_cmd->fr_desc->list, &isert_conn->fr_pool); - spin_unlock_bh(&isert_conn->pool_lock); - isert_cmd->fr_desc = NULL; - } + if (!cmd->rw.nr_ops) + return; - if (isert_cmd->data.sg) { - isert_dbg("Cmd %p unmap_sg op\n", isert_cmd); - isert_unmap_data_buf(isert_conn, &isert_cmd->data); + if (isert_prot_cmd(conn, se_cmd)) { + rdma_rw_ctx_destroy_signature(&cmd->rw, conn->qp, + conn->cm_id->port_num, se_cmd->t_data_sg, + se_cmd->t_data_nents, se_cmd->t_prot_sg, + se_cmd->t_prot_nents, dir); + } else { + rdma_rw_ctx_destroy(&cmd->rw, conn->qp, conn->cm_id->port_num, + se_cmd->t_data_sg, se_cmd->t_data_nents, dir); } - isert_cmd->ib_sge = NULL; - isert_cmd->rdma_wr = NULL; + cmd->rw.nr_ops = 0; } static void @@ -1732,7 +1487,6 @@ isert_put_cmd(struct isert_cmd *isert_cmd, bool comp_err) 
struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; struct isert_conn *isert_conn = isert_cmd->conn; struct iscsi_conn *conn = isert_conn->conn; - struct isert_device *device = isert_conn->device; struct iscsi_text_rsp *hdr; isert_dbg("Cmd %p\n", isert_cmd); @@ -1760,7 +1514,7 @@ isert_put_cmd(struct isert_cmd *isert_cmd, bool comp_err) } } - device->unreg_rdma_mem(isert_cmd, isert_conn); + isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn); transport_generic_free_cmd(&cmd->se_cmd, 0); break; case ISCSI_OP_SCSI_TMFUNC: @@ -1894,14 +1648,9 @@ isert_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) isert_dbg("Cmd %p\n", isert_cmd); - if (isert_cmd->fr_desc && isert_cmd->fr_desc->ind & ISERT_PROTECTED) { - ret = isert_check_pi_status(cmd, - isert_cmd->fr_desc->pi_ctx->sig_mr); - isert_cmd->fr_desc->ind &= ~ISERT_PROTECTED; - } + ret = isert_check_pi_status(cmd, isert_cmd->rw.sig->sig_mr); + isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn); - device->unreg_rdma_mem(isert_cmd, isert_conn); - isert_cmd->rdma_wr_num = 0; if (ret) transport_send_check_condition_and_sense(cmd, cmd->pi_err, 0); else @@ -1929,16 +1678,12 @@ isert_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc) isert_dbg("Cmd %p\n", isert_cmd); - if (isert_cmd->fr_desc && isert_cmd->fr_desc->ind & ISERT_PROTECTED) { - ret = isert_check_pi_status(se_cmd, - isert_cmd->fr_desc->pi_ctx->sig_mr); - isert_cmd->fr_desc->ind &= ~ISERT_PROTECTED; - } - iscsit_stop_dataout_timer(cmd); - device->unreg_rdma_mem(isert_cmd, isert_conn); - cmd->write_data_done = isert_cmd->data.len; - isert_cmd->rdma_wr_num = 0; + + if (isert_prot_cmd(isert_conn, se_cmd)) + ret = isert_check_pi_status(se_cmd, isert_cmd->rw.sig->sig_mr); + isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn); + cmd->write_data_done = 0; isert_dbg("Cmd: %p RDMA_READ comp calling execute_cmd\n", isert_cmd); spin_lock_bh(&cmd->istate_lock); @@ -2111,7 +1856,6 @@ isert_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd) { struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); struct isert_conn *isert_conn = conn->context; - struct isert_device *device = isert_conn->device; spin_lock_bh(&conn->cmd_lock); if (!list_empty(&cmd->i_conn_node)) @@ -2120,8 +1864,7 @@ isert_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd) if (cmd->data_direction == DMA_TO_DEVICE) iscsit_stop_dataout_timer(cmd); - - device->unreg_rdma_mem(isert_cmd, isert_conn); + isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn); } static enum target_prot_op @@ -2274,234 +2017,6 @@ isert_put_text_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) return isert_post_response(isert_conn, isert_cmd); } -static int -isert_build_rdma_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, - struct ib_sge *ib_sge, struct ib_rdma_wr *rdma_wr, - u32 data_left, u32 offset) -{ - struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; - struct scatterlist *sg_start, *tmp_sg; - struct isert_device *device = isert_conn->device; - struct ib_device *ib_dev = device->ib_device; - u32 sg_off, page_off; - int i = 0, sg_nents; - - sg_off = offset / PAGE_SIZE; - sg_start = &cmd->se_cmd.t_data_sg[sg_off]; - sg_nents = min(cmd->se_cmd.t_data_nents - sg_off, isert_conn->max_sge); - page_off = offset % PAGE_SIZE; - - rdma_wr->wr.sg_list = ib_sge; - rdma_wr->wr.wr_cqe = &isert_cmd->tx_desc.tx_cqe; - - /* - * Perform mapping of TCM scatterlist memory ib_sge dma_addr. 
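
All of the removed mapping and teardown code in this file collapses into the isert_rdma_rw_ctx_destroy() helper introduced earlier, built on the new rdma_rw_ctx API this series adds. A reduced sketch of that destroy-side pairing; the two rdma_rw_ctx_destroy*() calls are the real API, everything else is placeholder naming:

#include <rdma/rw.h>

static void my_rw_teardown(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
			   u8 port_num, bool has_sig,
			   struct scatterlist *sg, u32 sg_cnt,
			   struct scatterlist *prot_sg, u32 prot_sg_cnt,
			   enum dma_data_direction dir)
{
	if (!ctx->nr_ops)	/* context was never initialized */
		return;

	if (has_sig)		/* T10-PI: also unmaps the protection SGL */
		rdma_rw_ctx_destroy_signature(ctx, qp, port_num, sg, sg_cnt,
					      prot_sg, prot_sg_cnt, dir);
	else
		rdma_rw_ctx_destroy(ctx, qp, port_num, sg, sg_cnt, dir);

	ctx->nr_ops = 0;	/* make a second call harmless */
}
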
- */ - for_each_sg(sg_start, tmp_sg, sg_nents, i) { - isert_dbg("RDMA from SGL dma_addr: 0x%llx dma_len: %u, " - "page_off: %u\n", - (unsigned long long)tmp_sg->dma_address, - tmp_sg->length, page_off); - - ib_sge->addr = ib_sg_dma_address(ib_dev, tmp_sg) + page_off; - ib_sge->length = min_t(u32, data_left, - ib_sg_dma_len(ib_dev, tmp_sg) - page_off); - ib_sge->lkey = device->pd->local_dma_lkey; - - isert_dbg("RDMA ib_sge: addr: 0x%llx length: %u lkey: %x\n", - ib_sge->addr, ib_sge->length, ib_sge->lkey); - page_off = 0; - data_left -= ib_sge->length; - if (!data_left) - break; - ib_sge++; - isert_dbg("Incrementing ib_sge pointer to %p\n", ib_sge); - } - - rdma_wr->wr.num_sge = ++i; - isert_dbg("Set outgoing sg_list: %p num_sg: %u from TCM SGLs\n", - rdma_wr->wr.sg_list, rdma_wr->wr.num_sge); - - return rdma_wr->wr.num_sge; -} - -static int -isert_map_rdma(struct isert_cmd *isert_cmd, struct iscsi_conn *conn) -{ - struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; - struct se_cmd *se_cmd = &cmd->se_cmd; - struct isert_conn *isert_conn = conn->context; - struct isert_data_buf *data = &isert_cmd->data; - struct ib_rdma_wr *rdma_wr; - struct ib_sge *ib_sge; - u32 offset, data_len, data_left, rdma_write_max, va_offset = 0; - int ret = 0, i, ib_sge_cnt; - - offset = isert_cmd->iser_ib_op == ISER_IB_RDMA_READ ? - cmd->write_data_done : 0; - ret = isert_map_data_buf(isert_conn, isert_cmd, se_cmd->t_data_sg, - se_cmd->t_data_nents, se_cmd->data_length, - offset, isert_cmd->iser_ib_op, - &isert_cmd->data); - if (ret) - return ret; - - data_left = data->len; - offset = data->offset; - - ib_sge = kzalloc(sizeof(struct ib_sge) * data->nents, GFP_KERNEL); - if (!ib_sge) { - isert_warn("Unable to allocate ib_sge\n"); - ret = -ENOMEM; - goto unmap_cmd; - } - isert_cmd->ib_sge = ib_sge; - - isert_cmd->rdma_wr_num = DIV_ROUND_UP(data->nents, isert_conn->max_sge); - isert_cmd->rdma_wr = kzalloc(sizeof(struct ib_rdma_wr) * - isert_cmd->rdma_wr_num, GFP_KERNEL); - if (!isert_cmd->rdma_wr) { - isert_dbg("Unable to allocate isert_cmd->rdma_wr\n"); - ret = -ENOMEM; - goto unmap_cmd; - } - - rdma_write_max = isert_conn->max_sge * PAGE_SIZE; - - for (i = 0; i < isert_cmd->rdma_wr_num; i++) { - rdma_wr = &isert_cmd->rdma_wr[i]; - data_len = min(data_left, rdma_write_max); - - rdma_wr->wr.send_flags = 0; - if (isert_cmd->iser_ib_op == ISER_IB_RDMA_WRITE) { - isert_cmd->tx_desc.tx_cqe.done = isert_rdma_write_done; - - rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; - rdma_wr->remote_addr = isert_cmd->read_va + offset; - rdma_wr->rkey = isert_cmd->read_stag; - if (i + 1 == isert_cmd->rdma_wr_num) - rdma_wr->wr.next = &isert_cmd->tx_desc.send_wr; - else - rdma_wr->wr.next = &isert_cmd->rdma_wr[i + 1].wr; - } else { - isert_cmd->tx_desc.tx_cqe.done = isert_rdma_read_done; - - rdma_wr->wr.opcode = IB_WR_RDMA_READ; - rdma_wr->remote_addr = isert_cmd->write_va + va_offset; - rdma_wr->rkey = isert_cmd->write_stag; - if (i + 1 == isert_cmd->rdma_wr_num) - rdma_wr->wr.send_flags = IB_SEND_SIGNALED; - else - rdma_wr->wr.next = &isert_cmd->rdma_wr[i + 1].wr; - } - - ib_sge_cnt = isert_build_rdma_wr(isert_conn, isert_cmd, ib_sge, - rdma_wr, data_len, offset); - ib_sge += ib_sge_cnt; - - offset += data_len; - va_offset += data_len; - data_left -= data_len; - } - - return 0; -unmap_cmd: - isert_unmap_data_buf(isert_conn, data); - - return ret; -} - -static inline void -isert_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr) -{ - u32 rkey; - - memset(inv_wr, 0, sizeof(*inv_wr)); - inv_wr->wr_cqe = NULL; - inv_wr->opcode = IB_WR_LOCAL_INV; - 
inv_wr->ex.invalidate_rkey = mr->rkey; - - /* Bump the key */ - rkey = ib_inc_rkey(mr->rkey); - ib_update_fast_reg_key(mr, rkey); -} - -static int -isert_fast_reg_mr(struct isert_conn *isert_conn, - struct fast_reg_descriptor *fr_desc, - struct isert_data_buf *mem, - enum isert_indicator ind, - struct ib_sge *sge) -{ - struct isert_device *device = isert_conn->device; - struct ib_device *ib_dev = device->ib_device; - struct ib_mr *mr; - struct ib_reg_wr reg_wr; - struct ib_send_wr inv_wr, *bad_wr, *wr = NULL; - int ret, n; - - if (mem->dma_nents == 1) { - sge->lkey = device->pd->local_dma_lkey; - sge->addr = ib_sg_dma_address(ib_dev, &mem->sg[0]); - sge->length = ib_sg_dma_len(ib_dev, &mem->sg[0]); - isert_dbg("sge: addr: 0x%llx length: %u lkey: %x\n", - sge->addr, sge->length, sge->lkey); - return 0; - } - - if (ind == ISERT_DATA_KEY_VALID) - /* Registering data buffer */ - mr = fr_desc->data_mr; - else - /* Registering protection buffer */ - mr = fr_desc->pi_ctx->prot_mr; - - if (!(fr_desc->ind & ind)) { - isert_inv_rkey(&inv_wr, mr); - wr = &inv_wr; - } - - n = ib_map_mr_sg(mr, mem->sg, mem->nents, PAGE_SIZE); - if (unlikely(n != mem->nents)) { - isert_err("failed to map mr sg (%d/%d)\n", - n, mem->nents); - return n < 0 ? n : -EINVAL; - } - - isert_dbg("Use fr_desc %p sg_nents %d offset %u\n", - fr_desc, mem->nents, mem->offset); - - reg_wr.wr.next = NULL; - reg_wr.wr.opcode = IB_WR_REG_MR; - reg_wr.wr.wr_cqe = NULL; - reg_wr.wr.send_flags = 0; - reg_wr.wr.num_sge = 0; - reg_wr.mr = mr; - reg_wr.key = mr->lkey; - reg_wr.access = IB_ACCESS_LOCAL_WRITE; - - if (!wr) - wr = ®_wr.wr; - else - wr->next = ®_wr.wr; - - ret = ib_post_send(isert_conn->qp, wr, &bad_wr); - if (ret) { - isert_err("fast registration failed, ret:%d\n", ret); - return ret; - } - fr_desc->ind &= ~ind; - - sge->lkey = mr->lkey; - sge->addr = mr->iova; - sge->length = mr->length; - - isert_dbg("sge: addr: 0x%llx length: %u lkey: %x\n", - sge->addr, sge->length, sge->lkey); - - return ret; -} - static inline void isert_set_dif_domain(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs, struct ib_sig_domain *domain) @@ -2526,6 +2041,8 @@ isert_set_dif_domain(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs, static int isert_set_sig_attrs(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs) { + memset(sig_attrs, 0, sizeof(*sig_attrs)); + switch (se_cmd->prot_op) { case TARGET_PROT_DIN_INSERT: case TARGET_PROT_DOUT_STRIP: @@ -2547,228 +2064,59 @@ isert_set_sig_attrs(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs) return -EINVAL; } + sig_attrs->check_mask = + (se_cmd->prot_checks & TARGET_DIF_CHECK_GUARD ? 0xc0 : 0) | + (se_cmd->prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x30 : 0) | + (se_cmd->prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x0f : 0); return 0; } -static inline u8 -isert_set_prot_checks(u8 prot_checks) -{ - return (prot_checks & TARGET_DIF_CHECK_GUARD ? 0xc0 : 0) | - (prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x30 : 0) | - (prot_checks & TARGET_DIF_CHECK_REFTAG ? 
0x0f : 0); -} - -static int -isert_reg_sig_mr(struct isert_conn *isert_conn, - struct isert_cmd *isert_cmd, - struct fast_reg_descriptor *fr_desc) -{ - struct se_cmd *se_cmd = &isert_cmd->iscsi_cmd->se_cmd; - struct ib_sig_handover_wr sig_wr; - struct ib_send_wr inv_wr, *bad_wr, *wr = NULL; - struct pi_context *pi_ctx = fr_desc->pi_ctx; - struct ib_sig_attrs sig_attrs; - int ret; - - memset(&sig_attrs, 0, sizeof(sig_attrs)); - ret = isert_set_sig_attrs(se_cmd, &sig_attrs); - if (ret) - goto err; - - sig_attrs.check_mask = isert_set_prot_checks(se_cmd->prot_checks); - - if (!(fr_desc->ind & ISERT_SIG_KEY_VALID)) { - isert_inv_rkey(&inv_wr, pi_ctx->sig_mr); - wr = &inv_wr; - } - - memset(&sig_wr, 0, sizeof(sig_wr)); - sig_wr.wr.opcode = IB_WR_REG_SIG_MR; - sig_wr.wr.wr_cqe = NULL; - sig_wr.wr.sg_list = &isert_cmd->ib_sg[DATA]; - sig_wr.wr.num_sge = 1; - sig_wr.access_flags = IB_ACCESS_LOCAL_WRITE; - sig_wr.sig_attrs = &sig_attrs; - sig_wr.sig_mr = pi_ctx->sig_mr; - if (se_cmd->t_prot_sg) - sig_wr.prot = &isert_cmd->ib_sg[PROT]; - - if (!wr) - wr = &sig_wr.wr; - else - wr->next = &sig_wr.wr; - - ret = ib_post_send(isert_conn->qp, wr, &bad_wr); - if (ret) { - isert_err("fast registration failed, ret:%d\n", ret); - goto err; - } - fr_desc->ind &= ~ISERT_SIG_KEY_VALID; - - isert_cmd->ib_sg[SIG].lkey = pi_ctx->sig_mr->lkey; - isert_cmd->ib_sg[SIG].addr = 0; - isert_cmd->ib_sg[SIG].length = se_cmd->data_length; - if (se_cmd->prot_op != TARGET_PROT_DIN_STRIP && - se_cmd->prot_op != TARGET_PROT_DOUT_INSERT) - /* - * We have protection guards on the wire - * so we need to set a larget transfer - */ - isert_cmd->ib_sg[SIG].length += se_cmd->prot_length; - - isert_dbg("sig_sge: addr: 0x%llx length: %u lkey: %x\n", - isert_cmd->ib_sg[SIG].addr, isert_cmd->ib_sg[SIG].length, - isert_cmd->ib_sg[SIG].lkey); -err: - return ret; -} - static int -isert_handle_prot_cmd(struct isert_conn *isert_conn, - struct isert_cmd *isert_cmd) -{ - struct isert_device *device = isert_conn->device; - struct se_cmd *se_cmd = &isert_cmd->iscsi_cmd->se_cmd; +isert_rdma_rw_ctx_post(struct isert_cmd *cmd, struct isert_conn *conn, + struct ib_cqe *cqe, struct ib_send_wr *chain_wr) +{ + struct se_cmd *se_cmd = &cmd->iscsi_cmd->se_cmd; + enum dma_data_direction dir = target_reverse_dma_direction(se_cmd); + u8 port_num = conn->cm_id->port_num; + u64 addr; + u32 rkey, offset; int ret; - if (!isert_cmd->fr_desc->pi_ctx) { - ret = isert_create_pi_ctx(isert_cmd->fr_desc, - device->ib_device, - device->pd); - if (ret) { - isert_err("conn %p failed to allocate pi_ctx\n", - isert_conn); - return ret; - } - } - - if (se_cmd->t_prot_sg) { - ret = isert_map_data_buf(isert_conn, isert_cmd, - se_cmd->t_prot_sg, - se_cmd->t_prot_nents, - se_cmd->prot_length, - 0, - isert_cmd->iser_ib_op, - &isert_cmd->prot); - if (ret) { - isert_err("conn %p failed to map protection buffer\n", - isert_conn); - return ret; - } - - memset(&isert_cmd->ib_sg[PROT], 0, sizeof(isert_cmd->ib_sg[PROT])); - ret = isert_fast_reg_mr(isert_conn, isert_cmd->fr_desc, - &isert_cmd->prot, - ISERT_PROT_KEY_VALID, - &isert_cmd->ib_sg[PROT]); - if (ret) { - isert_err("conn %p failed to fast reg mr\n", - isert_conn); - goto unmap_prot_cmd; - } - } - - ret = isert_reg_sig_mr(isert_conn, isert_cmd, isert_cmd->fr_desc); - if (ret) { - isert_err("conn %p failed to fast reg mr\n", - isert_conn); - goto unmap_prot_cmd; - } - isert_cmd->fr_desc->ind |= ISERT_PROTECTED; - - return 0; - -unmap_prot_cmd: - if (se_cmd->t_prot_sg) - isert_unmap_data_buf(isert_conn, &isert_cmd->prot); - - return 
ret; -} - -static int -isert_reg_rdma(struct isert_cmd *isert_cmd, struct iscsi_conn *conn) -{ - struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; - struct se_cmd *se_cmd = &cmd->se_cmd; - struct isert_conn *isert_conn = conn->context; - struct fast_reg_descriptor *fr_desc = NULL; - struct ib_rdma_wr *rdma_wr; - struct ib_sge *ib_sg; - u32 offset; - int ret = 0; - unsigned long flags; - - offset = isert_cmd->iser_ib_op == ISER_IB_RDMA_READ ? - cmd->write_data_done : 0; - ret = isert_map_data_buf(isert_conn, isert_cmd, se_cmd->t_data_sg, - se_cmd->t_data_nents, se_cmd->data_length, - offset, isert_cmd->iser_ib_op, - &isert_cmd->data); - if (ret) - return ret; - - if (isert_cmd->data.dma_nents != 1 || - isert_prot_cmd(isert_conn, se_cmd)) { - spin_lock_irqsave(&isert_conn->pool_lock, flags); - fr_desc = list_first_entry(&isert_conn->fr_pool, - struct fast_reg_descriptor, list); - list_del(&fr_desc->list); - spin_unlock_irqrestore(&isert_conn->pool_lock, flags); - isert_cmd->fr_desc = fr_desc; - } - - ret = isert_fast_reg_mr(isert_conn, fr_desc, &isert_cmd->data, - ISERT_DATA_KEY_VALID, &isert_cmd->ib_sg[DATA]); - if (ret) - goto unmap_cmd; - - if (isert_prot_cmd(isert_conn, se_cmd)) { - ret = isert_handle_prot_cmd(isert_conn, isert_cmd); - if (ret) - goto unmap_cmd; - - ib_sg = &isert_cmd->ib_sg[SIG]; + if (dir == DMA_FROM_DEVICE) { + addr = cmd->write_va; + rkey = cmd->write_stag; + offset = cmd->iscsi_cmd->write_data_done; } else { - ib_sg = &isert_cmd->ib_sg[DATA]; + addr = cmd->read_va; + rkey = cmd->read_stag; + offset = 0; } - memcpy(&isert_cmd->s_ib_sge, ib_sg, sizeof(*ib_sg)); - isert_cmd->ib_sge = &isert_cmd->s_ib_sge; - isert_cmd->rdma_wr_num = 1; - memset(&isert_cmd->s_rdma_wr, 0, sizeof(isert_cmd->s_rdma_wr)); - isert_cmd->rdma_wr = &isert_cmd->s_rdma_wr; + if (isert_prot_cmd(conn, se_cmd)) { + struct ib_sig_attrs sig_attrs; - rdma_wr = &isert_cmd->s_rdma_wr; - rdma_wr->wr.sg_list = &isert_cmd->s_ib_sge; - rdma_wr->wr.num_sge = 1; - rdma_wr->wr.wr_cqe = &isert_cmd->tx_desc.tx_cqe; - if (isert_cmd->iser_ib_op == ISER_IB_RDMA_WRITE) { - isert_cmd->tx_desc.tx_cqe.done = isert_rdma_write_done; + ret = isert_set_sig_attrs(se_cmd, &sig_attrs); + if (ret) + return ret; - rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; - rdma_wr->remote_addr = isert_cmd->read_va; - rdma_wr->rkey = isert_cmd->read_stag; - rdma_wr->wr.send_flags = !isert_prot_cmd(isert_conn, se_cmd) ? 
- 0 : IB_SEND_SIGNALED; + WARN_ON_ONCE(offset); + ret = rdma_rw_ctx_signature_init(&cmd->rw, conn->qp, port_num, + se_cmd->t_data_sg, se_cmd->t_data_nents, + se_cmd->t_prot_sg, se_cmd->t_prot_nents, + &sig_attrs, addr, rkey, dir); } else { - isert_cmd->tx_desc.tx_cqe.done = isert_rdma_read_done; - - rdma_wr->wr.opcode = IB_WR_RDMA_READ; - rdma_wr->remote_addr = isert_cmd->write_va; - rdma_wr->rkey = isert_cmd->write_stag; - rdma_wr->wr.send_flags = IB_SEND_SIGNALED; + ret = rdma_rw_ctx_init(&cmd->rw, conn->qp, port_num, + se_cmd->t_data_sg, se_cmd->t_data_nents, + offset, addr, rkey, dir); } - - return 0; - -unmap_cmd: - if (fr_desc) { - spin_lock_irqsave(&isert_conn->pool_lock, flags); - list_add_tail(&fr_desc->list, &isert_conn->fr_pool); - spin_unlock_irqrestore(&isert_conn->pool_lock, flags); + if (ret < 0) { + isert_err("Cmd: %p failed to prepare RDMA res\n", cmd); + return ret; } - isert_unmap_data_buf(isert_conn, &isert_cmd->data); + ret = rdma_rw_ctx_post(&cmd->rw, conn->qp, port_num, cqe, chain_wr); + if (ret < 0) + isert_err("Cmd: %p failed to post RDMA res\n", cmd); return ret; } @@ -2778,21 +2126,17 @@ isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd) struct se_cmd *se_cmd = &cmd->se_cmd; struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); struct isert_conn *isert_conn = conn->context; - struct isert_device *device = isert_conn->device; - struct ib_send_wr *wr_failed; + struct ib_cqe *cqe = NULL; + struct ib_send_wr *chain_wr = NULL; int rc; isert_dbg("Cmd: %p RDMA_WRITE data_length: %u\n", isert_cmd, se_cmd->data_length); - isert_cmd->iser_ib_op = ISER_IB_RDMA_WRITE; - rc = device->reg_rdma_mem(isert_cmd, conn); - if (rc) { - isert_err("Cmd: %p failed to prepare RDMA res\n", isert_cmd); - return rc; - } - - if (!isert_prot_cmd(isert_conn, se_cmd)) { + if (isert_prot_cmd(isert_conn, se_cmd)) { + isert_cmd->tx_desc.tx_cqe.done = isert_rdma_write_done; + cqe = &isert_cmd->tx_desc.tx_cqe; + } else { /* * Build isert_conn->tx_desc for iSCSI response PDU and attach */ @@ -2803,56 +2147,35 @@ isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd) isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); isert_init_send_wr(isert_conn, isert_cmd, &isert_cmd->tx_desc.send_wr); - isert_cmd->s_rdma_wr.wr.next = &isert_cmd->tx_desc.send_wr; - isert_cmd->rdma_wr_num += 1; rc = isert_post_recv(isert_conn, isert_cmd->rx_desc); if (rc) { isert_err("ib_post_recv failed with %d\n", rc); return rc; } - } - rc = ib_post_send(isert_conn->qp, &isert_cmd->rdma_wr->wr, &wr_failed); - if (rc) - isert_warn("ib_post_send() failed for IB_WR_RDMA_WRITE\n"); - - if (!isert_prot_cmd(isert_conn, se_cmd)) - isert_dbg("Cmd: %p posted RDMA_WRITE + Response for iSER Data " - "READ\n", isert_cmd); - else - isert_dbg("Cmd: %p posted RDMA_WRITE for iSER Data READ\n", - isert_cmd); + chain_wr = &isert_cmd->tx_desc.send_wr; + } + isert_rdma_rw_ctx_post(isert_cmd, isert_conn, cqe, chain_wr); + isert_dbg("Cmd: %p posted RDMA_WRITE for iSER Data READ\n", isert_cmd); return 1; } static int isert_get_dataout(struct iscsi_conn *conn, struct iscsi_cmd *cmd, bool recovery) { - struct se_cmd *se_cmd = &cmd->se_cmd; struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); - struct isert_conn *isert_conn = conn->context; - struct isert_device *device = isert_conn->device; - struct ib_send_wr *wr_failed; - int rc; isert_dbg("Cmd: %p RDMA_READ data_length: %u write_data_done: %u\n", - isert_cmd, se_cmd->data_length, cmd->write_data_done); - isert_cmd->iser_ib_op = ISER_IB_RDMA_READ; - rc = 
device->reg_rdma_mem(isert_cmd, conn); - if (rc) { - isert_err("Cmd: %p failed to prepare RDMA res\n", isert_cmd); - return rc; - } + isert_cmd, cmd->se_cmd.data_length, cmd->write_data_done); - rc = ib_post_send(isert_conn->qp, &isert_cmd->rdma_wr->wr, &wr_failed); - if (rc) - isert_warn("ib_post_send() failed for IB_WR_RDMA_READ\n"); + isert_cmd->tx_desc.tx_cqe.done = isert_rdma_read_done; + isert_rdma_rw_ctx_post(isert_cmd, conn->context, + &isert_cmd->tx_desc.tx_cqe, NULL); isert_dbg("Cmd: %p posted RDMA_READ memory for ISER Data WRITE\n", isert_cmd); - return 0; } diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h index 147900cbb578..e512ba941f2f 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.h +++ b/drivers/infiniband/ulp/isert/ib_isert.h @@ -3,6 +3,7 @@ #include <linux/in6.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> +#include <rdma/rw.h> #include <scsi/iser.h> @@ -53,10 +54,7 @@ #define ISERT_MIN_POSTED_RX (ISCSI_DEF_XMIT_CMDS_MAX >> 2) -#define ISERT_INFLIGHT_DATAOUTS 8 - -#define ISERT_QP_MAX_REQ_DTOS (ISCSI_DEF_XMIT_CMDS_MAX * \ - (1 + ISERT_INFLIGHT_DATAOUTS) + \ +#define ISERT_QP_MAX_REQ_DTOS (ISCSI_DEF_XMIT_CMDS_MAX + \ ISERT_MAX_TX_MISC_PDUS + \ ISERT_MAX_RX_MISC_PDUS) @@ -71,13 +69,6 @@ enum isert_desc_type { ISCSI_TX_DATAIN }; -enum iser_ib_op_code { - ISER_IB_RECV, - ISER_IB_SEND, - ISER_IB_RDMA_WRITE, - ISER_IB_RDMA_READ, -}; - enum iser_conn_state { ISER_CONN_INIT, ISER_CONN_UP, @@ -118,42 +109,6 @@ static inline struct iser_tx_desc *cqe_to_tx_desc(struct ib_cqe *cqe) return container_of(cqe, struct iser_tx_desc, tx_cqe); } - -enum isert_indicator { - ISERT_PROTECTED = 1 << 0, - ISERT_DATA_KEY_VALID = 1 << 1, - ISERT_PROT_KEY_VALID = 1 << 2, - ISERT_SIG_KEY_VALID = 1 << 3, -}; - -struct pi_context { - struct ib_mr *prot_mr; - struct ib_mr *sig_mr; -}; - -struct fast_reg_descriptor { - struct list_head list; - struct ib_mr *data_mr; - u8 ind; - struct pi_context *pi_ctx; -}; - -struct isert_data_buf { - struct scatterlist *sg; - int nents; - u32 sg_off; - u32 len; /* cur_rdma_length */ - u32 offset; - unsigned int dma_nents; - enum dma_data_direction dma_dir; -}; - -enum { - DATA = 0, - PROT = 1, - SIG = 2, -}; - struct isert_cmd { uint32_t read_stag; uint32_t write_stag; @@ -166,16 +121,7 @@ struct isert_cmd { struct iscsi_cmd *iscsi_cmd; struct iser_tx_desc tx_desc; struct iser_rx_desc *rx_desc; - enum iser_ib_op_code iser_ib_op; - struct ib_sge *ib_sge; - struct ib_sge s_ib_sge; - int rdma_wr_num; - struct ib_rdma_wr *rdma_wr; - struct ib_rdma_wr s_rdma_wr; - struct ib_sge ib_sg[3]; - struct isert_data_buf data; - struct isert_data_buf prot; - struct fast_reg_descriptor *fr_desc; + struct rdma_rw_ctx rw; struct work_struct comp_work; struct scatterlist sg; }; @@ -210,10 +156,6 @@ struct isert_conn { struct isert_device *device; struct mutex mutex; struct kref kref; - struct list_head fr_pool; - int fr_pool_size; - /* lock to protect fastreg pool */ - spinlock_t pool_lock; struct work_struct release_work; bool logout_posted; bool snd_w_inv; @@ -236,7 +178,6 @@ struct isert_comp { }; struct isert_device { - int use_fastreg; bool pi_capable; int refcount; struct ib_device *ib_device; @@ -244,10 +185,6 @@ struct isert_device { struct isert_comp *comps; int comps_used; struct list_head dev_node; - int (*reg_rdma_mem)(struct isert_cmd *isert_cmd, - struct iscsi_conn *conn); - void (*unreg_rdma_mem)(struct isert_cmd *isert_cmd, - struct isert_conn *isert_conn); }; struct isert_np { diff --git 
a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 369a75e1f44e..646de170ec12 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -70,6 +70,7 @@ static unsigned int indirect_sg_entries; static bool allow_ext_sg; static bool prefer_fr = true; static bool register_always = true; +static bool never_register; static int topspin_workarounds = 1; module_param(srp_sg_tablesize, uint, 0444); @@ -99,6 +100,9 @@ module_param(register_always, bool, 0444); MODULE_PARM_DESC(register_always, "Use memory registration even for contiguous memory regions"); +module_param(never_register, bool, 0444); +MODULE_PARM_DESC(never_register, "Never register memory"); + static const struct kernel_param_ops srp_tmo_ops; static int srp_reconnect_delay = 10; @@ -316,7 +320,7 @@ static struct ib_fmr_pool *srp_alloc_fmr_pool(struct srp_target_port *target) struct ib_fmr_pool_param fmr_param; memset(&fmr_param, 0, sizeof(fmr_param)); - fmr_param.pool_size = target->scsi_host->can_queue; + fmr_param.pool_size = target->mr_pool_size; fmr_param.dirty_watermark = fmr_param.pool_size / 4; fmr_param.cache = 1; fmr_param.max_pages_per_fmr = dev->max_pages_per_mr; @@ -441,23 +445,22 @@ static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target) { struct srp_device *dev = target->srp_host->srp_dev; - return srp_create_fr_pool(dev->dev, dev->pd, - target->scsi_host->can_queue, + return srp_create_fr_pool(dev->dev, dev->pd, target->mr_pool_size, dev->max_pages_per_mr); } /** * srp_destroy_qp() - destroy an RDMA queue pair - * @ch: SRP RDMA channel. + * @qp: RDMA queue pair. * * Drain the qp before destroying it. This avoids that the receive * completion handler can access the queue pair while it is * being destroyed. */ -static void srp_destroy_qp(struct srp_rdma_ch *ch) +static void srp_destroy_qp(struct ib_qp *qp) { - ib_drain_rq(ch->qp); - ib_destroy_qp(ch->qp); + ib_drain_rq(qp); + ib_destroy_qp(qp); } static int srp_create_ch_ib(struct srp_rdma_ch *ch) @@ -469,7 +472,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch) struct ib_qp *qp; struct ib_fmr_pool *fmr_pool = NULL; struct srp_fr_pool *fr_pool = NULL; - const int m = dev->use_fast_reg ? 3 : 1; + const int m = 1 + dev->use_fast_reg * target->mr_per_cmd * 2; int ret; init_attr = kzalloc(sizeof *init_attr, GFP_KERNEL); @@ -530,7 +533,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch) } if (ch->qp) - srp_destroy_qp(ch); + srp_destroy_qp(ch->qp); if (ch->recv_cq) ib_free_cq(ch->recv_cq); if (ch->send_cq) @@ -554,7 +557,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch) return 0; err_qp: - srp_destroy_qp(ch); + srp_destroy_qp(qp); err_send_cq: ib_free_cq(send_cq); @@ -597,7 +600,7 @@ static void srp_free_ch_ib(struct srp_target_port *target, ib_destroy_fmr_pool(ch->fmr_pool); } - srp_destroy_qp(ch); + srp_destroy_qp(ch->qp); ib_free_cq(ch->send_cq); ib_free_cq(ch->recv_cq); @@ -850,7 +853,7 @@ static int srp_alloc_req_data(struct srp_rdma_ch *ch) for (i = 0; i < target->req_ring_size; ++i) { req = &ch->req_ring[i]; - mr_list = kmalloc(target->cmd_sg_cnt * sizeof(void *), + mr_list = kmalloc(target->mr_per_cmd * sizeof(void *), GFP_KERNEL); if (!mr_list) goto out; @@ -1112,7 +1115,7 @@ static struct scsi_cmnd *srp_claim_req(struct srp_rdma_ch *ch, } /** - * srp_free_req() - Unmap data and add request to the free request list. + * srp_free_req() - Unmap data and adjust ch->req_lim. * @ch: SRP RDMA channel. * @req: Request to be freed. 
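
srp_destroy_qp() above now takes the bare ib_qp and keeps the safety rule its comment spells out: flush the receive queue before freeing the QP so a late completion handler cannot touch freed memory. The core pattern in isolation:

#include <rdma/ib_verbs.h>

/* ib_drain_rq() blocks until every previously posted receive has been
 * flushed through the CQ; only then is ib_destroy_qp() safe. */
static void my_destroy_qp(struct ib_qp *qp)
{
	ib_drain_rq(qp);
	ib_destroy_qp(qp);
}
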
* @scmnd: SCSI command associated with @req. @@ -1299,9 +1302,16 @@ static void srp_reg_mr_err_done(struct ib_cq *cq, struct ib_wc *wc) srp_handle_qp_err(cq, wc, "FAST REG"); } +/* + * Map up to sg_nents elements of state->sg where *sg_offset_p is the offset + * where to start in the first element. If sg_offset_p != NULL then + * *sg_offset_p is updated to the offset in state->sg[retval] of the first + * byte that has not yet been mapped. + */ static int srp_map_finish_fr(struct srp_map_state *state, struct srp_request *req, - struct srp_rdma_ch *ch, int sg_nents) + struct srp_rdma_ch *ch, int sg_nents, + unsigned int *sg_offset_p) { struct srp_target_port *target = ch->target; struct srp_device *dev = target->srp_host->srp_dev; @@ -1316,13 +1326,14 @@ static int srp_map_finish_fr(struct srp_map_state *state, WARN_ON_ONCE(!dev->use_fast_reg); - if (sg_nents == 0) - return 0; - if (sg_nents == 1 && target->global_mr) { - srp_map_desc(state, sg_dma_address(state->sg), - sg_dma_len(state->sg), + unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0; + + srp_map_desc(state, sg_dma_address(state->sg) + sg_offset, + sg_dma_len(state->sg) - sg_offset, target->global_mr->rkey); + if (sg_offset_p) + *sg_offset_p = 0; return 1; } @@ -1333,9 +1344,17 @@ static int srp_map_finish_fr(struct srp_map_state *state, rkey = ib_inc_rkey(desc->mr->rkey); ib_update_fast_reg_key(desc->mr, rkey); - n = ib_map_mr_sg(desc->mr, state->sg, sg_nents, dev->mr_page_size); - if (unlikely(n < 0)) + n = ib_map_mr_sg(desc->mr, state->sg, sg_nents, sg_offset_p, + dev->mr_page_size); + if (unlikely(n < 0)) { + srp_fr_pool_put(ch->fr_pool, &desc, 1); + pr_debug("%s: ib_map_mr_sg(%d, %d) returned %d.\n", + dev_name(&req->scmnd->device->sdev_gendev), sg_nents, + sg_offset_p ? *sg_offset_p : -1, n); return n; + } + + WARN_ON_ONCE(desc->mr->length == 0); req->reg_cqe.done = srp_reg_mr_err_done; @@ -1357,8 +1376,10 @@ static int srp_map_finish_fr(struct srp_map_state *state, desc->mr->length, desc->mr->rkey); err = ib_post_send(ch->qp, &wr.wr, &bad_wr); - if (unlikely(err)) + if (unlikely(err)) { + WARN_ON_ONCE(err == -ENOMEM); return err; + } return n; } @@ -1398,7 +1419,7 @@ static int srp_map_sg_entry(struct srp_map_state *state, /* * If the last entry of the MR wasn't a full page, then we need to * close it out and start a new one -- we can only merge at page - * boundries. + * boundaries. 
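
The comment block and hunks above describe the new fifth argument of ib_map_mr_sg(): *sg_offset names the first unmapped byte of the first scatterlist element, and a short mapping updates it so the next MR resumes exactly where the previous one stopped. A self-contained sketch of a multi-MR mapping loop under that contract; the MR array is hypothetical:

#include <linux/errno.h>
#include <linux/scatterlist.h>
#include <rdma/ib_verbs.h>

static int map_with_offsets(struct ib_mr **mrs, int nr_mrs,
			    struct scatterlist *sg, int sg_nents,
			    unsigned int page_size)
{
	unsigned int sg_offset = 0;
	int i, n;

	for (i = 0; i < nr_mrs && sg_nents > 0; i++) {
		n = ib_map_mr_sg(mrs[i], sg, sg_nents, &sg_offset, page_size);
		if (n < 0)
			return n;
		/* n whole elements were consumed; a partially mapped
		 * element is not counted and stays first, with sg_offset
		 * now pointing at its first unmapped byte. */
		for (sg_nents -= n; n > 0; n--)
			sg = sg_next(sg);
	}
	return sg_nents ? -ENOMEM : 0;	/* ran out of MRs */
}
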
*/ ret = 0; if (len != dev->mr_page_size) @@ -1413,10 +1434,9 @@ static int srp_map_sg_fmr(struct srp_map_state *state, struct srp_rdma_ch *ch, struct scatterlist *sg; int i, ret; - state->desc = req->indirect_desc; state->pages = req->map_page; state->fmr.next = req->fmr_list; - state->fmr.end = req->fmr_list + ch->target->cmd_sg_cnt; + state->fmr.end = req->fmr_list + ch->target->mr_per_cmd; for_each_sg(scat, sg, count, i) { ret = srp_map_sg_entry(state, ch, sg, i); @@ -1428,8 +1448,6 @@ static int srp_map_sg_fmr(struct srp_map_state *state, struct srp_rdma_ch *ch, if (ret) return ret; - req->nmdesc = state->nmdesc; - return 0; } @@ -1437,15 +1455,20 @@ static int srp_map_sg_fr(struct srp_map_state *state, struct srp_rdma_ch *ch, struct srp_request *req, struct scatterlist *scat, int count) { + unsigned int sg_offset = 0; + state->desc = req->indirect_desc; state->fr.next = req->fr_list; - state->fr.end = req->fr_list + ch->target->cmd_sg_cnt; + state->fr.end = req->fr_list + ch->target->mr_per_cmd; state->sg = scat; + if (count == 0) + return 0; + while (count) { int i, n; - n = srp_map_finish_fr(state, req, ch, count); + n = srp_map_finish_fr(state, req, ch, count, &sg_offset); if (unlikely(n < 0)) return n; @@ -1454,8 +1477,6 @@ static int srp_map_sg_fr(struct srp_map_state *state, struct srp_rdma_ch *ch, state->sg = sg_next(state->sg); } - req->nmdesc = state->nmdesc; - return 0; } @@ -1475,8 +1496,6 @@ static int srp_map_sg_dma(struct srp_map_state *state, struct srp_rdma_ch *ch, target->global_mr->rkey); } - req->nmdesc = state->nmdesc; - return 0; } @@ -1509,14 +1528,15 @@ static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req, if (dev->use_fast_reg) { state.sg = idb_sg; - sg_set_buf(idb_sg, req->indirect_desc, idb_len); + sg_init_one(idb_sg, req->indirect_desc, idb_len); idb_sg->dma_address = req->indirect_dma_addr; /* hack! */ #ifdef CONFIG_NEED_SG_DMA_LENGTH idb_sg->dma_length = idb_sg->length; /* hack^2 */ #endif - ret = srp_map_finish_fr(&state, req, ch, 1); + ret = srp_map_finish_fr(&state, req, ch, 1, NULL); if (ret < 0) return ret; + WARN_ON_ONCE(ret < 1); } else if (dev->use_fmr) { state.pages = idb_pages; state.pages[0] = (req->indirect_dma_addr & @@ -1534,6 +1554,41 @@ static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req, return 0; } +#if defined(DYNAMIC_DATA_DEBUG) +static void srp_check_mapping(struct srp_map_state *state, + struct srp_rdma_ch *ch, struct srp_request *req, + struct scatterlist *scat, int count) +{ + struct srp_device *dev = ch->target->srp_host->srp_dev; + struct srp_fr_desc **pfr; + u64 desc_len = 0, mr_len = 0; + int i; + + for (i = 0; i < state->ndesc; i++) + desc_len += be32_to_cpu(req->indirect_desc[i].len); + if (dev->use_fast_reg) + for (i = 0, pfr = req->fr_list; i < state->nmdesc; i++, pfr++) + mr_len += (*pfr)->mr->length; + else if (dev->use_fmr) + for (i = 0; i < state->nmdesc; i++) + mr_len += be32_to_cpu(req->indirect_desc[i].len); + if (desc_len != scsi_bufflen(req->scmnd) || + mr_len > scsi_bufflen(req->scmnd)) + pr_err("Inconsistent: scsi len %d <> desc len %lld <> mr len %lld; ndesc %d; nmdesc = %d\n", + scsi_bufflen(req->scmnd), desc_len, mr_len, + state->ndesc, state->nmdesc); +} +#endif + +/** + * srp_map_data() - map SCSI data buffer onto an SRP request + * @scmnd: SCSI command to map + * @ch: SRP RDMA channel + * @req: SRP request + * + * Returns the length in bytes of the SRP_CMD IU or a negative value if + * mapping failed. 
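
srp_check_mapping() above is deliberately expensive, so its caller in srp_map_data() below compiles it in but runs it only when a dynamic-debug flag is flipped at run time. The shape of that gate, assuming CONFIG_DYNAMIC_DEBUG and a placeholder check function:

#include <linux/compiler.h>
#include <linux/dynamic_debug.h>

static void expensive_consistency_check(void)
{
	/* heavyweight validation, e.g. re-walking a mapped scatterlist */
}

static void maybe_check(void)
{
#if defined(CONFIG_DYNAMIC_DEBUG)
	DEFINE_DYNAMIC_DEBUG_METADATA(ddm, "my consistency check");

	if (unlikely(ddm.flags & _DPRINTK_FLAGS_PRINT))
		expensive_consistency_check();
#endif
}
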
+ */ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch, struct srp_request *req) { @@ -1601,11 +1656,23 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch, memset(&state, 0, sizeof(state)); if (dev->use_fast_reg) - srp_map_sg_fr(&state, ch, req, scat, count); + ret = srp_map_sg_fr(&state, ch, req, scat, count); else if (dev->use_fmr) - srp_map_sg_fmr(&state, ch, req, scat, count); + ret = srp_map_sg_fmr(&state, ch, req, scat, count); else - srp_map_sg_dma(&state, ch, req, scat, count); + ret = srp_map_sg_dma(&state, ch, req, scat, count); + req->nmdesc = state.nmdesc; + if (ret < 0) + goto unmap; + +#if defined(DYNAMIC_DEBUG) + { + DEFINE_DYNAMIC_DEBUG_METADATA(ddm, + "Memory mapping consistency check"); + if (unlikely(ddm.flags & _DPRINTK_FLAGS_PRINT)) + srp_check_mapping(&state, ch, req, scat, count); + } +#endif /* We've mapped the request, now pull as much of the indirect * descriptor table as we can into the command buffer. If this @@ -1628,7 +1695,8 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch, !target->allow_ext_sg)) { shost_printk(KERN_ERR, target->scsi_host, "Could not fit S/G list into SRP_CMD\n"); - return -EIO; + ret = -EIO; + goto unmap; } count = min(state.ndesc, target->cmd_sg_cnt); @@ -1646,7 +1714,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch, ret = srp_map_idb(ch, req, state.gen.next, state.gen.end, idb_len, &idb_rkey); if (ret < 0) - return ret; + goto unmap; req->nmdesc++; } else { idb_rkey = cpu_to_be32(target->global_mr->rkey); @@ -1672,6 +1740,12 @@ map_complete: cmd->buf_fmt = fmt; return len; + +unmap: + srp_unmap_data(scmnd, ch, req); + if (ret == -ENOMEM && req->nmdesc >= target->mr_pool_size) + ret = -E2BIG; + return ret; } /* @@ -2564,6 +2638,20 @@ static int srp_reset_host(struct scsi_cmnd *scmnd) return srp_reconnect_rport(target->rport) == 0 ? 
SUCCESS : FAILED; } +static int srp_slave_alloc(struct scsi_device *sdev) +{ + struct Scsi_Host *shost = sdev->host; + struct srp_target_port *target = host_to_target(shost); + struct srp_device *srp_dev = target->srp_host->srp_dev; + struct ib_device *ibdev = srp_dev->dev; + + if (!(ibdev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)) + blk_queue_virt_boundary(sdev->request_queue, + ~srp_dev->mr_page_mask); + + return 0; +} + static int srp_slave_configure(struct scsi_device *sdev) { struct Scsi_Host *shost = sdev->host; @@ -2755,6 +2843,7 @@ static struct scsi_host_template srp_template = { .module = THIS_MODULE, .name = "InfiniBand SRP initiator", .proc_name = DRV_NAME, + .slave_alloc = srp_slave_alloc, .slave_configure = srp_slave_configure, .info = srp_target_info, .queuecommand = srp_queuecommand, @@ -2829,7 +2918,7 @@ static int srp_add_target(struct srp_host *host, struct srp_target_port *target) goto out; } - pr_debug(PFX "%s: SCSI scan succeeded - detected %d LUNs\n", + pr_debug("%s: SCSI scan succeeded - detected %d LUNs\n", dev_name(&target->scsi_host->shost_gendev), srp_sdev_count(target->scsi_host)); @@ -3161,6 +3250,7 @@ static ssize_t srp_create_target(struct device *dev, struct srp_device *srp_dev = host->srp_dev; struct ib_device *ibdev = srp_dev->dev; int ret, node_idx, node, cpu, i; + unsigned int max_sectors_per_mr, mr_per_cmd = 0; bool multich = false; target_host = scsi_host_alloc(&srp_template, @@ -3217,7 +3307,33 @@ static ssize_t srp_create_target(struct device *dev, target->sg_tablesize = target->cmd_sg_cnt; } + if (srp_dev->use_fast_reg || srp_dev->use_fmr) { + /* + * FR and FMR can only map one HCA page per entry. If the + * start address is not aligned on a HCA page boundary two + * entries will be used for the head and the tail although + * these two entries combined contain at most one HCA page of + * data. Hence the "+ 1" in the calculation below. + * + * The indirect data buffer descriptor is contiguous so the + * memory for that buffer will only be registered if + * register_always is true. Hence add one to mr_per_cmd if + * register_always has been set. + */ + max_sectors_per_mr = srp_dev->max_pages_per_mr << + (ilog2(srp_dev->mr_page_size) - 9); + mr_per_cmd = register_always + + (target->scsi_host->max_sectors + 1 + + max_sectors_per_mr - 1) / max_sectors_per_mr; + pr_debug("max_sectors = %u; max_pages_per_mr = %u; mr_page_size = %u; max_sectors_per_mr = %u; mr_per_cmd = %u\n", + target->scsi_host->max_sectors, + srp_dev->max_pages_per_mr, srp_dev->mr_page_size, + max_sectors_per_mr, mr_per_cmd); + } + target_host->sg_tablesize = target->sg_tablesize; + target->mr_pool_size = target->scsi_host->can_queue * mr_per_cmd; + target->mr_per_cmd = mr_per_cmd; target->indirect_size = target->sg_tablesize * sizeof (struct srp_direct_buf); target->max_iu_len = sizeof (struct srp_cmd) + @@ -3414,17 +3530,6 @@ static void srp_add_one(struct ib_device *device) if (!srp_dev) return; - srp_dev->has_fmr = (device->alloc_fmr && device->dealloc_fmr && - device->map_phys_fmr && device->unmap_fmr); - srp_dev->has_fr = (device->attrs.device_cap_flags & - IB_DEVICE_MEM_MGT_EXTENSIONS); - if (!srp_dev->has_fmr && !srp_dev->has_fr) - dev_warn(&device->dev, "neither FMR nor FR is supported\n"); - - srp_dev->use_fast_reg = (srp_dev->has_fr && - (!srp_dev->has_fmr || prefer_fr)); - srp_dev->use_fmr = !srp_dev->use_fast_reg && srp_dev->has_fmr; - /* * Use the smallest page size supported by the HCA, down to a * minimum of 4096 bytes. 
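
The srp_create_target() hunk above boils down to a ceiling division with two corrections: "+ 1" covers the extra head/tail entries an unaligned buffer can need, and register_always adds one MR for the indirect descriptor buffer. The formula in isolation, with one worked set of illustrative numbers:

#include <linux/log2.h>
#include <linux/types.h>

/* Example: max_sectors = 1024 (512 KiB I/O), max_pages_per_mr = 256,
 * mr_page_size = 4096 => max_sectors_per_mr = 256 << (12 - 9) = 2048,
 * so with register_always: 1 + (1024 + 1 + 2047) / 2048 = 1 + 1 = 2. */
static unsigned int mrs_per_cmd(unsigned int max_sectors,
				unsigned int max_pages_per_mr,
				unsigned int mr_page_size,
				bool register_always)
{
	unsigned int max_sectors_per_mr = max_pages_per_mr <<
					  (ilog2(mr_page_size) - 9);

	return register_always +
	       (max_sectors + 1 + max_sectors_per_mr - 1) /
	       max_sectors_per_mr;
}
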
We're unlikely to build large sglists @@ -3435,8 +3540,25 @@ static void srp_add_one(struct ib_device *device) srp_dev->mr_page_mask = ~((u64) srp_dev->mr_page_size - 1); max_pages_per_mr = device->attrs.max_mr_size; do_div(max_pages_per_mr, srp_dev->mr_page_size); + pr_debug("%s: %llu / %u = %llu <> %u\n", __func__, + device->attrs.max_mr_size, srp_dev->mr_page_size, + max_pages_per_mr, SRP_MAX_PAGES_PER_MR); srp_dev->max_pages_per_mr = min_t(u64, SRP_MAX_PAGES_PER_MR, max_pages_per_mr); + + srp_dev->has_fmr = (device->alloc_fmr && device->dealloc_fmr && + device->map_phys_fmr && device->unmap_fmr); + srp_dev->has_fr = (device->attrs.device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS); + if (!never_register && !srp_dev->has_fmr && !srp_dev->has_fr) { + dev_warn(&device->dev, "neither FMR nor FR is supported\n"); + } else if (!never_register && + device->attrs.max_mr_size >= 2 * srp_dev->mr_page_size) { + srp_dev->use_fast_reg = (srp_dev->has_fr && + (!srp_dev->has_fmr || prefer_fr)); + srp_dev->use_fmr = !srp_dev->use_fast_reg && srp_dev->has_fmr; + } + if (srp_dev->use_fast_reg) { srp_dev->max_pages_per_mr = min_t(u32, srp_dev->max_pages_per_mr, @@ -3456,7 +3578,8 @@ static void srp_add_one(struct ib_device *device) if (IS_ERR(srp_dev->pd)) goto free_dev; - if (!register_always || (!srp_dev->has_fmr && !srp_dev->has_fr)) { + if (never_register || !register_always || + (!srp_dev->has_fmr && !srp_dev->has_fr)) { srp_dev->global_mr = ib_get_dma_mr(srp_dev->pd, IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 9e05ce4a04fd..26bb9b0a7a63 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -202,6 +202,8 @@ struct srp_target_port { char target_name[32]; unsigned int scsi_id; unsigned int sg_tablesize; + int mr_pool_size; + int mr_per_cmd; int queue_size; int req_ring_size; int comp_vector; diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 8b42401d4795..2843f1ae75bd 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -765,52 +765,6 @@ static int srpt_post_recv(struct srpt_device *sdev, } /** - * srpt_post_send() - Post an IB send request. - * - * Returns zero upon success and a non-zero value upon failure. - */ -static int srpt_post_send(struct srpt_rdma_ch *ch, - struct srpt_send_ioctx *ioctx, int len) -{ - struct ib_sge list; - struct ib_send_wr wr, *bad_wr; - struct srpt_device *sdev = ch->sport->sdev; - int ret; - - atomic_inc(&ch->req_lim); - - ret = -ENOMEM; - if (unlikely(atomic_dec_return(&ch->sq_wr_avail) < 0)) { - pr_warn("IB send queue full (needed 1)\n"); - goto out; - } - - ib_dma_sync_single_for_device(sdev->device, ioctx->ioctx.dma, len, - DMA_TO_DEVICE); - - list.addr = ioctx->ioctx.dma; - list.length = len; - list.lkey = sdev->pd->local_dma_lkey; - - ioctx->ioctx.cqe.done = srpt_send_done; - wr.next = NULL; - wr.wr_cqe = &ioctx->ioctx.cqe; - wr.sg_list = &list; - wr.num_sge = 1; - wr.opcode = IB_WR_SEND; - wr.send_flags = IB_SEND_SIGNALED; - - ret = ib_post_send(ch->qp, &wr, &bad_wr); - -out: - if (ret < 0) { - atomic_inc(&ch->sq_wr_avail); - atomic_dec(&ch->req_lim); - } - return ret; -} - -/** * srpt_zerolength_write() - Perform a zero-length RDMA write. 
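Stepping back briefly to srp_add_one() above before the srpt changes continue: the registration-mode selection it now performs can be restated as a small decision helper. This is an illustrative paraphrase, not driver code; mr_big_enough stands in for the max_mr_size >= 2 * mr_page_size test:

#include <stdbool.h>

static void choose_reg_mode(bool has_fmr, bool has_fr, bool prefer_fr,
			    bool never_register, bool mr_big_enough,
			    bool *use_fast_reg, bool *use_fmr)
{
	*use_fast_reg = false;
	*use_fmr = false;

	/* never_register, or an MR too small to be useful, leaves both
	 * flags clear and the driver falls back to the global rkey. */
	if (never_register || !mr_big_enough)
		return;

	/* Fast registration wins whenever it is supported, unless FMR is
	 * also supported and prefer_fr is clear. */
	*use_fast_reg = has_fr && (!has_fmr || prefer_fr);
	*use_fmr = !*use_fast_reg && has_fmr;
}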
* * A quote from the InfiniBand specification: C9-88: For an HCA responder @@ -843,6 +797,110 @@ static void srpt_zerolength_write_done(struct ib_cq *cq, struct ib_wc *wc) } } +static int srpt_alloc_rw_ctxs(struct srpt_send_ioctx *ioctx, + struct srp_direct_buf *db, int nbufs, struct scatterlist **sg, + unsigned *sg_cnt) +{ + enum dma_data_direction dir = target_reverse_dma_direction(&ioctx->cmd); + struct srpt_rdma_ch *ch = ioctx->ch; + struct scatterlist *prev = NULL; + unsigned prev_nents; + int ret, i; + + if (nbufs == 1) { + ioctx->rw_ctxs = &ioctx->s_rw_ctx; + } else { + ioctx->rw_ctxs = kmalloc_array(nbufs, sizeof(*ioctx->rw_ctxs), + GFP_KERNEL); + if (!ioctx->rw_ctxs) + return -ENOMEM; + } + + for (i = ioctx->n_rw_ctx; i < nbufs; i++, db++) { + struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i]; + u64 remote_addr = be64_to_cpu(db->va); + u32 size = be32_to_cpu(db->len); + u32 rkey = be32_to_cpu(db->key); + + ret = target_alloc_sgl(&ctx->sg, &ctx->nents, size, false, + i < nbufs - 1); + if (ret) + goto unwind; + + ret = rdma_rw_ctx_init(&ctx->rw, ch->qp, ch->sport->port, + ctx->sg, ctx->nents, 0, remote_addr, rkey, dir); + if (ret < 0) { + target_free_sgl(ctx->sg, ctx->nents); + goto unwind; + } + + ioctx->n_rdma += ret; + ioctx->n_rw_ctx++; + + if (prev) { + sg_unmark_end(&prev[prev_nents - 1]); + sg_chain(prev, prev_nents + 1, ctx->sg); + } else { + *sg = ctx->sg; + } + + prev = ctx->sg; + prev_nents = ctx->nents; + + *sg_cnt += ctx->nents; + } + + return 0; + +unwind: + while (--i >= 0) { + struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i]; + + rdma_rw_ctx_destroy(&ctx->rw, ch->qp, ch->sport->port, + ctx->sg, ctx->nents, dir); + target_free_sgl(ctx->sg, ctx->nents); + } + if (ioctx->rw_ctxs != &ioctx->s_rw_ctx) + kfree(ioctx->rw_ctxs); + return ret; +} + +static void srpt_free_rw_ctxs(struct srpt_rdma_ch *ch, + struct srpt_send_ioctx *ioctx) +{ + enum dma_data_direction dir = target_reverse_dma_direction(&ioctx->cmd); + int i; + + for (i = 0; i < ioctx->n_rw_ctx; i++) { + struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i]; + + rdma_rw_ctx_destroy(&ctx->rw, ch->qp, ch->sport->port, + ctx->sg, ctx->nents, dir); + target_free_sgl(ctx->sg, ctx->nents); + } + + if (ioctx->rw_ctxs != &ioctx->s_rw_ctx) + kfree(ioctx->rw_ctxs); +} + +static inline void *srpt_get_desc_buf(struct srp_cmd *srp_cmd) +{ + /* + * The pointer computations below will only be compiled correctly + * if srp_cmd::add_data is declared as s8*, u8*, s8[] or u8[], so check + * whether srp_cmd::add_data has been declared as a byte pointer. + */ + BUILD_BUG_ON(!__same_type(srp_cmd->add_data[0], (s8)0) && + !__same_type(srp_cmd->add_data[0], (u8)0)); + + /* + * According to the SRP spec, the lower two bits of the 'ADDITIONAL + * CDB LENGTH' field are reserved and the size in bytes of this field + * is four times the value specified in bits 3..7. Hence the "& ~3". + */ + return srp_cmd->add_data + (srp_cmd->add_cdb_len & ~3); +} + /** * srpt_get_desc_tbl() - Parse the data descriptors of an SRP_CMD request. * @ioctx: Pointer to the I/O context associated with the request. @@ -858,94 +916,59 @@ static void srpt_zerolength_write_done(struct ib_cq *cq, struct ib_wc *wc) * -ENOMEM when memory allocation fails and zero upon success. 
*/ static int srpt_get_desc_tbl(struct srpt_send_ioctx *ioctx, - struct srp_cmd *srp_cmd, - enum dma_data_direction *dir, u64 *data_len) + struct srp_cmd *srp_cmd, enum dma_data_direction *dir, + struct scatterlist **sg, unsigned *sg_cnt, u64 *data_len) { - struct srp_indirect_buf *idb; - struct srp_direct_buf *db; - unsigned add_cdb_offset; - int ret; - - /* - * The pointer computations below will only be compiled correctly - * if srp_cmd::add_data is declared as s8*, u8*, s8[] or u8[], so check - * whether srp_cmd::add_data has been declared as a byte pointer. - */ - BUILD_BUG_ON(!__same_type(srp_cmd->add_data[0], (s8)0) - && !__same_type(srp_cmd->add_data[0], (u8)0)); - BUG_ON(!dir); BUG_ON(!data_len); - ret = 0; - *data_len = 0; - /* * The lower four bits of the buffer format field contain the DATA-IN * buffer descriptor format, and the highest four bits contain the * DATA-OUT buffer descriptor format. */ - *dir = DMA_NONE; if (srp_cmd->buf_fmt & 0xf) /* DATA-IN: transfer data from target to initiator (read). */ *dir = DMA_FROM_DEVICE; else if (srp_cmd->buf_fmt >> 4) /* DATA-OUT: transfer data from initiator to target (write). */ *dir = DMA_TO_DEVICE; + else + *dir = DMA_NONE; + + /* initialize data_direction early as srpt_alloc_rw_ctxs needs it */ + ioctx->cmd.data_direction = *dir; - /* - * According to the SRP spec, the lower two bits of the 'ADDITIONAL - * CDB LENGTH' field are reserved and the size in bytes of this field - * is four times the value specified in bits 3..7. Hence the "& ~3". - */ - add_cdb_offset = srp_cmd->add_cdb_len & ~3; if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_DIRECT) || ((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_DIRECT)) { - ioctx->n_rbuf = 1; - ioctx->rbufs = &ioctx->single_rbuf; + struct srp_direct_buf *db = srpt_get_desc_buf(srp_cmd); - db = (struct srp_direct_buf *)(srp_cmd->add_data - + add_cdb_offset); - memcpy(ioctx->rbufs, db, sizeof(*db)); *data_len = be32_to_cpu(db->len); + return srpt_alloc_rw_ctxs(ioctx, db, 1, sg, sg_cnt); } else if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_INDIRECT) || ((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_INDIRECT)) { - idb = (struct srp_indirect_buf *)(srp_cmd->add_data - + add_cdb_offset); + struct srp_indirect_buf *idb = srpt_get_desc_buf(srp_cmd); + int nbufs = be32_to_cpu(idb->table_desc.len) / + sizeof(struct srp_direct_buf); - ioctx->n_rbuf = be32_to_cpu(idb->table_desc.len) / sizeof(*db); - - if (ioctx->n_rbuf > + if (nbufs > (srp_cmd->data_out_desc_cnt + srp_cmd->data_in_desc_cnt)) { pr_err("received unsupported SRP_CMD request" " type (%u out + %u in != %u / %zu)\n", srp_cmd->data_out_desc_cnt, srp_cmd->data_in_desc_cnt, be32_to_cpu(idb->table_desc.len), - sizeof(*db)); - ioctx->n_rbuf = 0; - ret = -EINVAL; - goto out; - } - - if (ioctx->n_rbuf == 1) - ioctx->rbufs = &ioctx->single_rbuf; - else { - ioctx->rbufs = - kmalloc(ioctx->n_rbuf * sizeof(*db), GFP_ATOMIC); - if (!ioctx->rbufs) { - ioctx->n_rbuf = 0; - ret = -ENOMEM; - goto out; - } + sizeof(struct srp_direct_buf)); + return -EINVAL; } - db = idb->desc_list; - memcpy(ioctx->rbufs, db, ioctx->n_rbuf * sizeof(*db)); *data_len = be32_to_cpu(idb->len); + return srpt_alloc_rw_ctxs(ioctx, idb->desc_list, nbufs, + sg, sg_cnt); + } else { + *data_len = 0; + return 0; } -out: - return ret; } /** @@ -1049,217 +1072,6 @@ static int srpt_ch_qp_err(struct srpt_rdma_ch *ch) } /** - * srpt_unmap_sg_to_ib_sge() - Unmap an IB SGE list. 
- */ -static void srpt_unmap_sg_to_ib_sge(struct srpt_rdma_ch *ch, - struct srpt_send_ioctx *ioctx) -{ - struct scatterlist *sg; - enum dma_data_direction dir; - - BUG_ON(!ch); - BUG_ON(!ioctx); - BUG_ON(ioctx->n_rdma && !ioctx->rdma_wrs); - - while (ioctx->n_rdma) - kfree(ioctx->rdma_wrs[--ioctx->n_rdma].wr.sg_list); - - kfree(ioctx->rdma_wrs); - ioctx->rdma_wrs = NULL; - - if (ioctx->mapped_sg_count) { - sg = ioctx->sg; - WARN_ON(!sg); - dir = ioctx->cmd.data_direction; - BUG_ON(dir == DMA_NONE); - ib_dma_unmap_sg(ch->sport->sdev->device, sg, ioctx->sg_cnt, - target_reverse_dma_direction(&ioctx->cmd)); - ioctx->mapped_sg_count = 0; - } -} - -/** - * srpt_map_sg_to_ib_sge() - Map an SG list to an IB SGE list. - */ -static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, - struct srpt_send_ioctx *ioctx) -{ - struct ib_device *dev = ch->sport->sdev->device; - struct se_cmd *cmd; - struct scatterlist *sg, *sg_orig; - int sg_cnt; - enum dma_data_direction dir; - struct ib_rdma_wr *riu; - struct srp_direct_buf *db; - dma_addr_t dma_addr; - struct ib_sge *sge; - u64 raddr; - u32 rsize; - u32 tsize; - u32 dma_len; - int count, nrdma; - int i, j, k; - - BUG_ON(!ch); - BUG_ON(!ioctx); - cmd = &ioctx->cmd; - dir = cmd->data_direction; - BUG_ON(dir == DMA_NONE); - - ioctx->sg = sg = sg_orig = cmd->t_data_sg; - ioctx->sg_cnt = sg_cnt = cmd->t_data_nents; - - count = ib_dma_map_sg(ch->sport->sdev->device, sg, sg_cnt, - target_reverse_dma_direction(cmd)); - if (unlikely(!count)) - return -EAGAIN; - - ioctx->mapped_sg_count = count; - - if (ioctx->rdma_wrs && ioctx->n_rdma_wrs) - nrdma = ioctx->n_rdma_wrs; - else { - nrdma = (count + SRPT_DEF_SG_PER_WQE - 1) / SRPT_DEF_SG_PER_WQE - + ioctx->n_rbuf; - - ioctx->rdma_wrs = kcalloc(nrdma, sizeof(*ioctx->rdma_wrs), - GFP_KERNEL); - if (!ioctx->rdma_wrs) - goto free_mem; - - ioctx->n_rdma_wrs = nrdma; - } - - db = ioctx->rbufs; - tsize = cmd->data_length; - dma_len = ib_sg_dma_len(dev, &sg[0]); - riu = ioctx->rdma_wrs; - - /* - * For each remote desc - calculate the #ib_sge. 
- * If #ib_sge < SRPT_DEF_SG_PER_WQE per rdma operation then - * each remote desc rdma_iu is required a rdma wr; - * else - * we need to allocate extra rdma_iu to carry extra #ib_sge in - * another rdma wr - */ - for (i = 0, j = 0; - j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) { - rsize = be32_to_cpu(db->len); - raddr = be64_to_cpu(db->va); - riu->remote_addr = raddr; - riu->rkey = be32_to_cpu(db->key); - riu->wr.num_sge = 0; - - /* calculate how many sge required for this remote_buf */ - while (rsize > 0 && tsize > 0) { - - if (rsize >= dma_len) { - tsize -= dma_len; - rsize -= dma_len; - raddr += dma_len; - - if (tsize > 0) { - ++j; - if (j < count) { - sg = sg_next(sg); - dma_len = ib_sg_dma_len( - dev, sg); - } - } - } else { - tsize -= rsize; - dma_len -= rsize; - rsize = 0; - } - - ++riu->wr.num_sge; - - if (rsize > 0 && - riu->wr.num_sge == SRPT_DEF_SG_PER_WQE) { - ++ioctx->n_rdma; - riu->wr.sg_list = kmalloc_array(riu->wr.num_sge, - sizeof(*riu->wr.sg_list), - GFP_KERNEL); - if (!riu->wr.sg_list) - goto free_mem; - - ++riu; - riu->wr.num_sge = 0; - riu->remote_addr = raddr; - riu->rkey = be32_to_cpu(db->key); - } - } - - ++ioctx->n_rdma; - riu->wr.sg_list = kmalloc_array(riu->wr.num_sge, - sizeof(*riu->wr.sg_list), - GFP_KERNEL); - if (!riu->wr.sg_list) - goto free_mem; - } - - db = ioctx->rbufs; - tsize = cmd->data_length; - riu = ioctx->rdma_wrs; - sg = sg_orig; - dma_len = ib_sg_dma_len(dev, &sg[0]); - dma_addr = ib_sg_dma_address(dev, &sg[0]); - - /* this second loop is really mapped sg_addres to rdma_iu->ib_sge */ - for (i = 0, j = 0; - j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) { - rsize = be32_to_cpu(db->len); - sge = riu->wr.sg_list; - k = 0; - - while (rsize > 0 && tsize > 0) { - sge->addr = dma_addr; - sge->lkey = ch->sport->sdev->pd->local_dma_lkey; - - if (rsize >= dma_len) { - sge->length = - (tsize < dma_len) ? tsize : dma_len; - tsize -= dma_len; - rsize -= dma_len; - - if (tsize > 0) { - ++j; - if (j < count) { - sg = sg_next(sg); - dma_len = ib_sg_dma_len( - dev, sg); - dma_addr = ib_sg_dma_address( - dev, sg); - } - } - } else { - sge->length = (tsize < rsize) ? tsize : rsize; - tsize -= rsize; - dma_len -= rsize; - dma_addr += rsize; - rsize = 0; - } - - ++k; - if (k == riu->wr.num_sge && rsize > 0 && tsize > 0) { - ++riu; - sge = riu->wr.sg_list; - k = 0; - } else if (rsize > 0 && tsize > 0) - ++sge; - } - } - - return 0; - -free_mem: - srpt_unmap_sg_to_ib_sge(ch, ioctx); - - return -ENOMEM; -} - -/** * srpt_get_send_ioctx() - Obtain an I/O context for sending to the initiator. */ static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch) @@ -1284,12 +1096,8 @@ static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch) BUG_ON(ioctx->ch != ch); spin_lock_init(&ioctx->spinlock); ioctx->state = SRPT_STATE_NEW; - ioctx->n_rbuf = 0; - ioctx->rbufs = NULL; ioctx->n_rdma = 0; - ioctx->n_rdma_wrs = 0; - ioctx->rdma_wrs = NULL; - ioctx->mapped_sg_count = 0; + ioctx->n_rw_ctx = 0; init_completion(&ioctx->tx_done); ioctx->queue_status_only = false; /* @@ -1359,7 +1167,6 @@ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) * SRP_RSP sending failed or the SRP_RSP send completion has * not been received in time. 
*/ - srpt_unmap_sg_to_ib_sge(ioctx->ch, ioctx); transport_generic_free_cmd(&ioctx->cmd, 0); break; case SRPT_STATE_MGMT_RSP_SENT: @@ -1387,6 +1194,7 @@ static void srpt_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc) WARN_ON(ioctx->n_rdma <= 0); atomic_add(ioctx->n_rdma, &ch->sq_wr_avail); + ioctx->n_rdma = 0; if (unlikely(wc->status != IB_WC_SUCCESS)) { pr_info("RDMA_READ for ioctx 0x%p failed with status %d\n", @@ -1403,23 +1211,6 @@ static void srpt_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc) __LINE__, srpt_get_cmd_state(ioctx)); } -static void srpt_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) -{ - struct srpt_send_ioctx *ioctx = - container_of(wc->wr_cqe, struct srpt_send_ioctx, rdma_cqe); - - if (unlikely(wc->status != IB_WC_SUCCESS)) { - /* - * Note: if an RDMA write error completion is received that - * means that a SEND also has been posted. Defer further - * processing of the associated command until the send error - * completion has been received. - */ - pr_info("RDMA_WRITE for ioctx 0x%p failed with status %d\n", - ioctx, wc->status); - } -} - /** * srpt_build_cmd_rsp() - Build an SRP_RSP response. * @ch: RDMA channel through which the request has been received. @@ -1537,6 +1328,8 @@ static void srpt_handle_cmd(struct srpt_rdma_ch *ch, { struct se_cmd *cmd; struct srp_cmd *srp_cmd; + struct scatterlist *sg = NULL; + unsigned sg_cnt = 0; u64 data_len; enum dma_data_direction dir; int rc; @@ -1563,16 +1356,21 @@ static void srpt_handle_cmd(struct srpt_rdma_ch *ch, break; } - if (srpt_get_desc_tbl(send_ioctx, srp_cmd, &dir, &data_len)) { - pr_err("0x%llx: parsing SRP descriptor table failed.\n", - srp_cmd->tag); + rc = srpt_get_desc_tbl(send_ioctx, srp_cmd, &dir, &sg, &sg_cnt, + &data_len); + if (rc) { + if (rc != -EAGAIN) { + pr_err("0x%llx: parsing SRP descriptor table failed.\n", + srp_cmd->tag); + } goto release_ioctx; } - rc = target_submit_cmd(cmd, ch->sess, srp_cmd->cdb, + rc = target_submit_cmd_map_sgls(cmd, ch->sess, srp_cmd->cdb, &send_ioctx->sense_data[0], scsilun_to_int(&srp_cmd->lun), data_len, - TCM_SIMPLE_TAG, dir, TARGET_SCF_ACK_KREF); + TCM_SIMPLE_TAG, dir, TARGET_SCF_ACK_KREF, + sg, sg_cnt, NULL, 0, NULL, 0); if (rc != 0) { pr_debug("target_submit_cmd() returned %d for tag %#llx\n", rc, srp_cmd->tag); @@ -1664,23 +1462,21 @@ static void srpt_handle_new_iu(struct srpt_rdma_ch *ch, recv_ioctx->ioctx.dma, srp_max_req_size, DMA_FROM_DEVICE); - if (unlikely(ch->state == CH_CONNECTING)) { - list_add_tail(&recv_ioctx->wait_list, &ch->cmd_wait_list); - goto out; - } + if (unlikely(ch->state == CH_CONNECTING)) + goto out_wait; if (unlikely(ch->state != CH_LIVE)) - goto out; + return; srp_cmd = recv_ioctx->ioctx.buf; if (srp_cmd->opcode == SRP_CMD || srp_cmd->opcode == SRP_TSK_MGMT) { - if (!send_ioctx) + if (!send_ioctx) { + if (!list_empty(&ch->cmd_wait_list)) + goto out_wait; send_ioctx = srpt_get_send_ioctx(ch); - if (unlikely(!send_ioctx)) { - list_add_tail(&recv_ioctx->wait_list, - &ch->cmd_wait_list); - goto out; } + if (unlikely(!send_ioctx)) + goto out_wait; } switch (srp_cmd->opcode) { @@ -1709,8 +1505,10 @@ static void srpt_handle_new_iu(struct srpt_rdma_ch *ch, } srpt_post_recv(ch->sport->sdev, recv_ioctx); -out: return; + +out_wait: + list_add_tail(&recv_ioctx->wait_list, &ch->cmd_wait_list); } static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc) @@ -1779,14 +1577,13 @@ static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc) WARN_ON(state != SRPT_STATE_CMD_RSP_SENT && state != SRPT_STATE_MGMT_RSP_SENT); - 
atomic_inc(&ch->sq_wr_avail); + atomic_add(1 + ioctx->n_rdma, &ch->sq_wr_avail); if (wc->status != IB_WC_SUCCESS) pr_info("sending response for ioctx 0x%p failed" " with status %d\n", ioctx, wc->status); if (state != SRPT_STATE_DONE) { - srpt_unmap_sg_to_ib_sge(ch, ioctx); transport_generic_free_cmd(&ioctx->cmd, 0); } else { pr_err("IB completion has been received too late for" @@ -1832,8 +1629,18 @@ retry: qp_init->srq = sdev->srq; qp_init->sq_sig_type = IB_SIGNAL_REQ_WR; qp_init->qp_type = IB_QPT_RC; - qp_init->cap.max_send_wr = srp_sq_size; - qp_init->cap.max_send_sge = SRPT_DEF_SG_PER_WQE; + /* + * We divide up our send queue size into half SEND WRs to send the + * completions, and half R/W contexts to actually do the RDMA + * READ/WRITE transfers. Note that we need to allocate CQ slots for + * both, as RDMA contexts will also post completions for the + * RDMA READ case. + */ + qp_init->cap.max_send_wr = srp_sq_size / 2; + qp_init->cap.max_rdma_ctxs = srp_sq_size / 2; + qp_init->cap.max_send_sge = max(sdev->device->attrs.max_sge_rd, + sdev->device->attrs.max_sge); + qp_init->port_num = ch->sport->port; ch->qp = ib_create_qp(sdev->pd, qp_init); if (IS_ERR(ch->qp)) { @@ -2386,95 +2193,6 @@ static int srpt_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) return ret; } -/** - * srpt_perform_rdmas() - Perform IB RDMA. - * - * Returns zero upon success or a negative number upon failure. - */ -static int srpt_perform_rdmas(struct srpt_rdma_ch *ch, - struct srpt_send_ioctx *ioctx) -{ - struct ib_send_wr *bad_wr; - int sq_wr_avail, ret, i; - enum dma_data_direction dir; - const int n_rdma = ioctx->n_rdma; - - dir = ioctx->cmd.data_direction; - if (dir == DMA_TO_DEVICE) { - /* write */ - ret = -ENOMEM; - sq_wr_avail = atomic_sub_return(n_rdma, &ch->sq_wr_avail); - if (sq_wr_avail < 0) { - pr_warn("IB send queue full (needed %d)\n", - n_rdma); - goto out; - } - } - - for (i = 0; i < n_rdma; i++) { - struct ib_send_wr *wr = &ioctx->rdma_wrs[i].wr; - - wr->opcode = (dir == DMA_FROM_DEVICE) ? - IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; - - if (i == n_rdma - 1) { - /* only get completion event for the last rdma read */ - if (dir == DMA_TO_DEVICE) { - wr->send_flags = IB_SEND_SIGNALED; - ioctx->rdma_cqe.done = srpt_rdma_read_done; - } else { - ioctx->rdma_cqe.done = srpt_rdma_write_done; - } - wr->wr_cqe = &ioctx->rdma_cqe; - wr->next = NULL; - } else { - wr->wr_cqe = NULL; - wr->next = &ioctx->rdma_wrs[i + 1].wr; - } - } - - ret = ib_post_send(ch->qp, &ioctx->rdma_wrs->wr, &bad_wr); - if (ret) - pr_err("%s[%d]: ib_post_send() returned %d for %d/%d\n", - __func__, __LINE__, ret, i, n_rdma); -out: - if (unlikely(dir == DMA_TO_DEVICE && ret < 0)) - atomic_add(n_rdma, &ch->sq_wr_avail); - return ret; -} - -/** - * srpt_xfer_data() - Start data transfer from initiator to target.
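The hand-rolled mapping and posting paths removed in these hunks are exactly what the new generic rdma_rw API replaces. A condensed sketch of the calling pattern the converted srpt code follows (error handling and driver bookkeeping trimmed; all parameters are assumed to be prepared as in the driver):

#include <rdma/rw.h>

static int rdma_rw_example(struct ib_qp *qp, u8 port_num,
			   struct scatterlist *sg, u32 sg_cnt,
			   u64 remote_addr, u32 rkey,
			   enum dma_data_direction dir, struct ib_cqe *cqe)
{
	struct rdma_rw_ctx ctx;
	struct ib_send_wr *first_wr, *bad_wr;
	int nr_wrs, ret;

	/* Map the S/G list and build the RDMA READ/WRITE work requests;
	 * the return value is the number of WRs consumed on success. */
	nr_wrs = rdma_rw_ctx_init(&ctx, qp, port_num, sg, sg_cnt, 0,
				  remote_addr, rkey, dir);
	if (nr_wrs < 0)
		return nr_wrs;

	/* With a cqe and no chained WR, the last WR is signaled and
	 * completes through cqe->done. */
	first_wr = rdma_rw_ctx_wrs(&ctx, qp, port_num, cqe, NULL);

	ret = ib_post_send(qp, first_wr, &bad_wr);
	if (ret)
		rdma_rw_ctx_destroy(&ctx, qp, port_num, sg, sg_cnt, dir);
	/* On success, rdma_rw_ctx_destroy() runs from the completion path. */
	return ret;
}

srpt_write_pending() and srpt_queue_response() below follow this shape, except that they chain one context's WRs onto the next (and, for responses, onto the final SEND) through the chain_wr argument instead of passing NULL.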
- */ -static int srpt_xfer_data(struct srpt_rdma_ch *ch, - struct srpt_send_ioctx *ioctx) -{ - int ret; - - ret = srpt_map_sg_to_ib_sge(ch, ioctx); - if (ret) { - pr_err("%s[%d] ret=%d\n", __func__, __LINE__, ret); - goto out; - } - - ret = srpt_perform_rdmas(ch, ioctx); - if (ret) { - if (ret == -EAGAIN || ret == -ENOMEM) - pr_info("%s[%d] queue full -- ret=%d\n", - __func__, __LINE__, ret); - else - pr_err("%s[%d] fatal error -- ret=%d\n", - __func__, __LINE__, ret); - goto out_unmap; - } - -out: - return ret; -out_unmap: - srpt_unmap_sg_to_ib_sge(ch, ioctx); - goto out; -} - static int srpt_write_pending_status(struct se_cmd *se_cmd) { struct srpt_send_ioctx *ioctx; @@ -2491,11 +2209,42 @@ static int srpt_write_pending(struct se_cmd *se_cmd) struct srpt_send_ioctx *ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd); struct srpt_rdma_ch *ch = ioctx->ch; + struct ib_send_wr *first_wr = NULL, *bad_wr; + struct ib_cqe *cqe = &ioctx->rdma_cqe; enum srpt_command_state new_state; + int ret, i; new_state = srpt_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA); WARN_ON(new_state == SRPT_STATE_DONE); - return srpt_xfer_data(ch, ioctx); + + if (atomic_sub_return(ioctx->n_rdma, &ch->sq_wr_avail) < 0) { + pr_warn("%s: IB send queue full (needed %d)\n", + __func__, ioctx->n_rdma); + ret = -ENOMEM; + goto out_undo; + } + + cqe->done = srpt_rdma_read_done; + for (i = ioctx->n_rw_ctx - 1; i >= 0; i--) { + struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i]; + + first_wr = rdma_rw_ctx_wrs(&ctx->rw, ch->qp, ch->sport->port, + cqe, first_wr); + cqe = NULL; + } + + ret = ib_post_send(ch->qp, first_wr, &bad_wr); + if (ret) { + pr_err("%s: ib_post_send() returned %d for %d (avail: %d)\n", + __func__, ret, ioctx->n_rdma, + atomic_read(&ch->sq_wr_avail)); + goto out_undo; + } + + return 0; +out_undo: + atomic_add(ioctx->n_rdma, &ch->sq_wr_avail); + return ret; } static u8 tcm_to_srp_tsk_mgmt_status(const int tcm_mgmt_status) @@ -2517,17 +2266,17 @@ static u8 tcm_to_srp_tsk_mgmt_status(const int tcm_mgmt_status) */ static void srpt_queue_response(struct se_cmd *cmd) { - struct srpt_rdma_ch *ch; - struct srpt_send_ioctx *ioctx; + struct srpt_send_ioctx *ioctx = + container_of(cmd, struct srpt_send_ioctx, cmd); + struct srpt_rdma_ch *ch = ioctx->ch; + struct srpt_device *sdev = ch->sport->sdev; + struct ib_send_wr send_wr, *first_wr = NULL, *bad_wr; + struct ib_sge sge; enum srpt_command_state state; unsigned long flags; - int ret; - enum dma_data_direction dir; - int resp_len; + int resp_len, ret, i; u8 srp_tm_status; - ioctx = container_of(cmd, struct srpt_send_ioctx, cmd); - ch = ioctx->ch; BUG_ON(!ch); spin_lock_irqsave(&ioctx->spinlock, flags); @@ -2554,17 +2303,19 @@ static void srpt_queue_response(struct se_cmd *cmd) return; } - dir = ioctx->cmd.data_direction; - /* For read commands, transfer the data to the initiator. */ - if (dir == DMA_FROM_DEVICE && ioctx->cmd.data_length && + if (ioctx->cmd.data_direction == DMA_FROM_DEVICE && + ioctx->cmd.data_length && !ioctx->queue_status_only) { - ret = srpt_xfer_data(ch, ioctx); - if (ret) { - pr_err("xfer_data failed for tag %llu\n", - ioctx->cmd.tag); - return; + for (i = ioctx->n_rw_ctx - 1; i >= 0; i--) { + struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i]; + + first_wr = rdma_rw_ctx_wrs(&ctx->rw, ch->qp, + ch->sport->port, NULL, + first_wr ? 
first_wr : &send_wr); } + } else { + first_wr = &send_wr; } if (state != SRPT_STATE_MGMT) @@ -2576,14 +2327,46 @@ static void srpt_queue_response(struct se_cmd *cmd) resp_len = srpt_build_tskmgmt_rsp(ch, ioctx, srp_tm_status, ioctx->cmd.tag); } - ret = srpt_post_send(ch, ioctx, resp_len); - if (ret) { - pr_err("sending cmd response failed for tag %llu\n", - ioctx->cmd.tag); - srpt_unmap_sg_to_ib_sge(ch, ioctx); - srpt_set_cmd_state(ioctx, SRPT_STATE_DONE); - target_put_sess_cmd(&ioctx->cmd); + + atomic_inc(&ch->req_lim); + + if (unlikely(atomic_sub_return(1 + ioctx->n_rdma, + &ch->sq_wr_avail) < 0)) { + pr_warn("%s: IB send queue full (needed %d)\n", + __func__, ioctx->n_rdma); + ret = -ENOMEM; + goto out; + } + + ib_dma_sync_single_for_device(sdev->device, ioctx->ioctx.dma, resp_len, + DMA_TO_DEVICE); + + sge.addr = ioctx->ioctx.dma; + sge.length = resp_len; + sge.lkey = sdev->pd->local_dma_lkey; + + ioctx->ioctx.cqe.done = srpt_send_done; + send_wr.next = NULL; + send_wr.wr_cqe = &ioctx->ioctx.cqe; + send_wr.sg_list = &sge; + send_wr.num_sge = 1; + send_wr.opcode = IB_WR_SEND; + send_wr.send_flags = IB_SEND_SIGNALED; + + ret = ib_post_send(ch->qp, first_wr, &bad_wr); + if (ret < 0) { + pr_err("%s: sending cmd response failed for tag %llu (%d)\n", + __func__, ioctx->cmd.tag, ret); + goto out; } + + return; + +out: + atomic_add(1 + ioctx->n_rdma, &ch->sq_wr_avail); + atomic_dec(&ch->req_lim); + srpt_set_cmd_state(ioctx, SRPT_STATE_DONE); + target_put_sess_cmd(&ioctx->cmd); } static int srpt_queue_data_in(struct se_cmd *cmd) @@ -2599,10 +2382,6 @@ static void srpt_queue_tm_rsp(struct se_cmd *cmd) static void srpt_aborted_task(struct se_cmd *cmd) { - struct srpt_send_ioctx *ioctx = container_of(cmd, - struct srpt_send_ioctx, cmd); - - srpt_unmap_sg_to_ib_sge(ioctx->ch, ioctx); } static int srpt_queue_status(struct se_cmd *cmd) @@ -2903,12 +2682,10 @@ static void srpt_release_cmd(struct se_cmd *se_cmd) unsigned long flags; WARN_ON(ioctx->state != SRPT_STATE_DONE); - WARN_ON(ioctx->mapped_sg_count != 0); - if (ioctx->n_rbuf > 1) { - kfree(ioctx->rbufs); - ioctx->rbufs = NULL; - ioctx->n_rbuf = 0; + if (ioctx->n_rw_ctx) { + srpt_free_rw_ctxs(ch, ioctx); + ioctx->n_rw_ctx = 0; } spin_lock_irqsave(&ch->spinlock, flags); diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h index af9b8b527340..fee6bfd7ca21 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.h +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -42,6 +42,7 @@ #include <rdma/ib_verbs.h> #include <rdma/ib_sa.h> #include <rdma/ib_cm.h> +#include <rdma/rw.h> #include <scsi/srp.h> @@ -105,7 +106,6 @@ enum { SRP_LOGIN_RSP_MULTICHAN_MAINTAINED = 0x2, SRPT_DEF_SG_TABLESIZE = 128, - SRPT_DEF_SG_PER_WQE = 16, MIN_SRPT_SQ_SIZE = 16, DEF_SRPT_SQ_SIZE = 4096, @@ -174,21 +174,17 @@ struct srpt_recv_ioctx { struct srpt_ioctx ioctx; struct list_head wait_list; }; + +struct srpt_rw_ctx { + struct rdma_rw_ctx rw; + struct scatterlist *sg; + unsigned int nents; +}; /** * struct srpt_send_ioctx - SRPT send I/O context. * @ioctx: See above. * @ch: Channel pointer. - * @free_list: Node in srpt_rdma_ch.free_list. - * @n_rbuf: Number of data buffers in the received SRP command. - * @rbufs: Pointer to SRP data buffer array. - * @single_rbuf: SRP data buffer if the command has only a single buffer. - * @sg: Pointer to sg-list associated with this I/O context. - * @sg_cnt: SG-list size. - * @mapped_sg_count: ib_dma_map_sg() return value. - * @n_rdma_wrs: Number of elements in the rdma_wrs array. 
- * @rdma_wrs: Array with information about the RDMA mapping. - * @tag: Tag of the received SRP information unit. * @spinlock: Protects 'state'. * @state: I/O context state. * @cmd: Target core command data structure. @@ -197,21 +193,18 @@ struct srpt_recv_ioctx { struct srpt_send_ioctx { struct srpt_ioctx ioctx; struct srpt_rdma_ch *ch; - struct ib_rdma_wr *rdma_wrs; + + struct srpt_rw_ctx s_rw_ctx; + struct srpt_rw_ctx *rw_ctxs; + struct ib_cqe rdma_cqe; - struct srp_direct_buf *rbufs; - struct srp_direct_buf single_rbuf; - struct scatterlist *sg; struct list_head free_list; spinlock_t spinlock; enum srpt_command_state state; struct se_cmd cmd; struct completion tx_done; - int sg_cnt; - int mapped_sg_count; - u16 n_rdma_wrs; u8 n_rdma; - u8 n_rbuf; + u8 n_rw_ctx; bool queue_status_only; u8 sense_data[TRANSPORT_SENSE_BUFFER]; }; diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h index 80417fc564d4..4705e2dea423 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h @@ -1392,6 +1392,10 @@ struct ulp_mem_io { #define T5_ULP_MEMIO_ORDER_V(x) ((x) << T5_ULP_MEMIO_ORDER_S) #define T5_ULP_MEMIO_ORDER_F T5_ULP_MEMIO_ORDER_V(1U) +#define T5_ULP_MEMIO_FID_S 4 +#define T5_ULP_MEMIO_FID_M 0x7ff +#define T5_ULP_MEMIO_FID_V(x) ((x) << T5_ULP_MEMIO_FID_S) + /* ulp_mem_io.lock_addr fields */ #define ULP_MEMIO_ADDR_S 0 #define ULP_MEMIO_ADDR_V(x) ((x) << ULP_MEMIO_ADDR_S) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c index b51e42d6fbec..873a631ad155 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c @@ -39,6 +39,53 @@ #include <linux/mlx5/cq.h> #include "mlx5_core.h" +#define TASKLET_MAX_TIME 2 +#define TASKLET_MAX_TIME_JIFFIES msecs_to_jiffies(TASKLET_MAX_TIME) + +void mlx5_cq_tasklet_cb(unsigned long data) +{ + unsigned long flags; + unsigned long end = jiffies + TASKLET_MAX_TIME_JIFFIES; + struct mlx5_eq_tasklet *ctx = (struct mlx5_eq_tasklet *)data; + struct mlx5_core_cq *mcq; + struct mlx5_core_cq *temp; + + spin_lock_irqsave(&ctx->lock, flags); + list_splice_tail_init(&ctx->list, &ctx->process_list); + spin_unlock_irqrestore(&ctx->lock, flags); + + list_for_each_entry_safe(mcq, temp, &ctx->process_list, + tasklet_ctx.list) { + list_del_init(&mcq->tasklet_ctx.list); + mcq->tasklet_ctx.comp(mcq); + if (atomic_dec_and_test(&mcq->refcount)) + complete(&mcq->free); + if (time_after(jiffies, end)) + break; + } + + if (!list_empty(&ctx->process_list)) + tasklet_schedule(&ctx->task); +} + +static void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq) +{ + unsigned long flags; + struct mlx5_eq_tasklet *tasklet_ctx = cq->tasklet_ctx.priv; + + spin_lock_irqsave(&tasklet_ctx->lock, flags); + /* When migration of CQs between EQs is implemented, please note + * that you need to sync this point. It is possible that + * while migrating a CQ, completions on the old EQs could + * still arrive.
+ */ + if (list_empty_careful(&cq->tasklet_ctx.list)) { + atomic_inc(&cq->refcount); + list_add_tail(&cq->tasklet_ctx.list, &tasklet_ctx->list); + } + spin_unlock_irqrestore(&tasklet_ctx->lock, flags); +} + void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn) { struct mlx5_core_cq *cq; @@ -96,6 +143,13 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, struct mlx5_create_cq_mbox_out out; struct mlx5_destroy_cq_mbox_in din; struct mlx5_destroy_cq_mbox_out dout; + int eqn = MLX5_GET(cqc, MLX5_ADDR_OF(create_cq_in, in, cq_context), + c_eqn); + struct mlx5_eq *eq; + + eq = mlx5_eqn2eq(dev, eqn); + if (IS_ERR(eq)) + return PTR_ERR(eq); in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_CQ); memset(&out, 0, sizeof(out)); @@ -111,6 +165,11 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, cq->arm_sn = 0; atomic_set(&cq->refcount, 1); init_completion(&cq->free); + if (!cq->comp) + cq->comp = mlx5_add_cq_to_tasklet; + /* assuming CQ will be deleted before the EQ */ + cq->tasklet_ctx.priv = &eq->tasklet_ctx; + INIT_LIST_HEAD(&cq->tasklet_ctx.list); spin_lock_irq(&table->lock); err = radix_tree_insert(&table->tree, cq->cqn, cq); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 18fccec72c5d..0e30602ef76d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -202,7 +202,7 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq) struct mlx5_eqe *eqe; int eqes_found = 0; int set_ci = 0; - u32 cqn; + u32 cqn = -1; u32 rsn; u8 port; @@ -320,6 +320,9 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq) eq_update_ci(eq, 1); + if (cqn != -1) + tasklet_schedule(&eq->tasklet_ctx.task); + return eqes_found; } @@ -403,6 +406,12 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, if (err) goto err_irq; + INIT_LIST_HEAD(&eq->tasklet_ctx.list); + INIT_LIST_HEAD(&eq->tasklet_ctx.process_list); + spin_lock_init(&eq->tasklet_ctx.lock); + tasklet_init(&eq->tasklet_ctx.task, mlx5_cq_tasklet_cb, + (unsigned long)&eq->tasklet_ctx); + /* EQs are created in ARMED state */ eq_update_ci(eq, 1); @@ -436,6 +445,7 @@ int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq) mlx5_core_warn(dev, "failed to destroy a previously created eq: eqn %d\n", eq->eqn); synchronize_irq(eq->irqn); + tasklet_disable(&eq->tasklet_ctx.task); mlx5_buf_free(dev, &eq->buf); return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 6feef7fb9d6a..a19b59348dd6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -663,6 +663,23 @@ int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, } EXPORT_SYMBOL(mlx5_vector2eqn); +struct mlx5_eq *mlx5_eqn2eq(struct mlx5_core_dev *dev, int eqn) +{ + struct mlx5_eq_table *table = &dev->priv.eq_table; + struct mlx5_eq *eq; + + spin_lock(&table->lock); + list_for_each_entry(eq, &table->comp_eqs_list, list) + if (eq->eqn == eqn) { + spin_unlock(&table->lock); + return eq; + } + + spin_unlock(&table->lock); + + return ERR_PTR(-ENOENT); +} + static void free_comp_eqs(struct mlx5_core_dev *dev) { struct mlx5_eq_table *table = &dev->priv.eq_table; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index 482604bd051c..2f86ec6fcf25 100644 --- 
a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -102,6 +102,8 @@ int mlx5_core_disable_hca(struct mlx5_core_dev *dev, u16 func_id); int mlx5_wait_for_vf_pages(struct mlx5_core_dev *dev); cycle_t mlx5_read_internal_timer(struct mlx5_core_dev *dev); u32 mlx5_get_msix_vec(struct mlx5_core_dev *dev, int vecidx); +struct mlx5_eq *mlx5_eqn2eq(struct mlx5_core_dev *dev, int eqn); +void mlx5_cq_tasklet_cb(unsigned long data); void mlx5e_init(void); void mlx5e_cleanup(void); diff --git a/drivers/staging/rdma/hfi1/affinity.c b/drivers/staging/rdma/hfi1/affinity.c index 2cb8ca77f876..6e7050ab9e16 100644 --- a/drivers/staging/rdma/hfi1/affinity.c +++ b/drivers/staging/rdma/hfi1/affinity.c @@ -53,20 +53,6 @@ #include "sdma.h" #include "trace.h" -struct cpu_mask_set { - struct cpumask mask; - struct cpumask used; - uint gen; -}; - -struct hfi1_affinity { - struct cpu_mask_set def_intr; - struct cpu_mask_set rcv_intr; - struct cpu_mask_set proc; - /* spin lock to protect affinity struct */ - spinlock_t lock; -}; - /* Name of IRQ types, indexed by enum irq_type */ static const char * const irq_type_names[] = { "SDMA", @@ -82,6 +68,48 @@ static inline void init_cpu_mask_set(struct cpu_mask_set *set) set->gen = 0; } +/* Initialize non-HT cpu cores mask */ +int init_real_cpu_mask(struct hfi1_devdata *dd) +{ + struct hfi1_affinity *info; + int possible, curr_cpu, i, ht; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + cpumask_clear(&info->real_cpu_mask); + + /* Start with cpu online mask as the real cpu mask */ + cpumask_copy(&info->real_cpu_mask, cpu_online_mask); + + /* + * Remove HT cores from the real cpu mask. Do this in two steps below. + */ + possible = cpumask_weight(&info->real_cpu_mask); + ht = cpumask_weight(topology_sibling_cpumask( + cpumask_first(&info->real_cpu_mask))); + /* + * Step 1. Skip over the first N HT siblings and use them as the + * "real" cores. Assumes that HT cores are not enumerated in + * succession (except in the single core case). + */ + curr_cpu = cpumask_first(&info->real_cpu_mask); + for (i = 0; i < possible / ht; i++) + curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask); + /* + * Step 2. Remove the remaining HT siblings. Use cpumask_next() to + * skip any gaps. + */ + for (; i < possible; i++) { + cpumask_clear_cpu(curr_cpu, &info->real_cpu_mask); + curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask); + } + + dd->affinity = info; + return 0; +} + /* * Interrupt affinity. * @@ -93,20 +121,17 @@ static inline void init_cpu_mask_set(struct cpu_mask_set *set) * to the node relative 1 as necessary. * */ -int hfi1_dev_affinity_init(struct hfi1_devdata *dd) +void hfi1_dev_affinity_init(struct hfi1_devdata *dd) { int node = pcibus_to_node(dd->pcidev->bus); - struct hfi1_affinity *info; + struct hfi1_affinity *info = dd->affinity; const struct cpumask *local_mask; - int curr_cpu, possible, i, ht; + int curr_cpu, possible, i; if (node < 0) node = numa_node_id(); dd->node = node; - info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) - return -ENOMEM; spin_lock_init(&info->lock); init_cpu_mask_set(&info->def_intr); @@ -116,30 +141,8 @@ int hfi1_dev_affinity_init(struct hfi1_devdata *dd) local_mask = cpumask_of_node(dd->node); if (cpumask_first(local_mask) >= nr_cpu_ids) local_mask = topology_core_cpumask(0); - /* use local mask as default */ - cpumask_copy(&info->def_intr.mask, local_mask); - /* - * Remove HT cores from the default mask. Do this in two steps below. 
- */ - possible = cpumask_weight(&info->def_intr.mask); - ht = cpumask_weight(topology_sibling_cpumask( - cpumask_first(&info->def_intr.mask))); - /* - * Step 1. Skip over the first N HT siblings and use them as the - * "real" cores. Assumes that HT cores are not enumerated in - * succession (except in the single core case). - */ - curr_cpu = cpumask_first(&info->def_intr.mask); - for (i = 0; i < possible / ht; i++) - curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask); - /* - * Step 2. Remove the remaining HT siblings. Use cpumask_next() to - * skip any gaps. - */ - for (; i < possible; i++) { - cpumask_clear_cpu(curr_cpu, &info->def_intr.mask); - curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask); - } + /* Use the "real" cpu mask of this node as the default */ + cpumask_and(&info->def_intr.mask, &info->real_cpu_mask, local_mask); /* fill in the receive list */ possible = cpumask_weight(&info->def_intr.mask); @@ -167,8 +170,6 @@ int hfi1_dev_affinity_init(struct hfi1_devdata *dd) } cpumask_copy(&info->proc.mask, cpu_online_mask); - dd->affinity = info; - return 0; } void hfi1_dev_affinity_free(struct hfi1_devdata *dd) diff --git a/drivers/staging/rdma/hfi1/affinity.h b/drivers/staging/rdma/hfi1/affinity.h index b287e4963024..20f52fe74091 100644 --- a/drivers/staging/rdma/hfi1/affinity.h +++ b/drivers/staging/rdma/hfi1/affinity.h @@ -64,10 +64,27 @@ enum affinity_flags { AFF_IRQ_LOCAL }; +struct cpu_mask_set { + struct cpumask mask; + struct cpumask used; + uint gen; +}; + +struct hfi1_affinity { + struct cpu_mask_set def_intr; + struct cpu_mask_set rcv_intr; + struct cpu_mask_set proc; + struct cpumask real_cpu_mask; + /* spin lock to protect affinity struct */ + spinlock_t lock; +}; + struct hfi1_msix_entry; +/* Initialize non-HT cpu cores mask */ +int init_real_cpu_mask(struct hfi1_devdata *); /* Initialize driver affinity data */ -int hfi1_dev_affinity_init(struct hfi1_devdata *); +void hfi1_dev_affinity_init(struct hfi1_devdata *); /* Free driver affinity data */ void hfi1_dev_affinity_free(struct hfi1_devdata *); /* diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c index 16eb653903e0..dcae8e723f98 100644 --- a/drivers/staging/rdma/hfi1/chip.c +++ b/drivers/staging/rdma/hfi1/chip.c @@ -123,6 +123,8 @@ struct flag_table { #define MIN_KERNEL_KCTXTS 2 #define FIRST_KERNEL_KCTXT 1 +/* sizes for both the QP and RSM map tables */ +#define NUM_MAP_ENTRIES 256 #define NUM_MAP_REGS 32 /* Bit offset into the GUID which carries HFI id information */ @@ -1029,9 +1031,12 @@ static int thermal_init(struct hfi1_devdata *dd); static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state, int msecs); static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc); +static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr); static void handle_temp_err(struct hfi1_devdata *); static void dc_shutdown(struct hfi1_devdata *); static void dc_start(struct hfi1_devdata *); +static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp, + unsigned int *np); /* * Error interrupt table entry. 
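Stepping back to the hyperthread pruning that moved into init_real_cpu_mask() above: given P possible CPUs enumerated with all real cores first and H siblings per core, the first P/H mask bits survive and the rest are cleared. A standalone analogue over a plain array (illustrative only; the driver walks a struct cpumask with cpumask_first()/cpumask_next()):

#include <stdio.h>

#define NCPU 8

int main(void)
{
	/* Assume 4 cores x 2 HT siblings, enumerated 0-3 then 4-7. */
	int mask[NCPU] = { 1, 1, 1, 1, 1, 1, 1, 1 };
	int possible = NCPU, ht = 2, i, cpu = 0;

	/* Step 1: skip over the first possible/ht CPUs, the "real" cores. */
	for (i = 0; i < possible / ht; i++)
		cpu++;
	/* Step 2: clear the remaining HT siblings. */
	for (; i < possible; i++)
		mask[cpu++] = 0;

	for (i = 0; i < NCPU; i++)
		printf("cpu%d: %s\n", i, mask[i] ? "real" : "HT sibling");
	return 0;
}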
This is used as input to the interrupt @@ -5661,7 +5666,7 @@ static int sc_to_vl(struct hfi1_devdata *dd, int sw_index) sci = &dd->send_contexts[sw_index]; /* there is no information for user (PSM) and ack contexts */ - if (sci->type != SC_KERNEL) + if ((sci->type != SC_KERNEL) && (sci->type != SC_VL15)) return -1; sc = sci->sc; @@ -6199,18 +6204,13 @@ static void hreq_response(struct hfi1_devdata *dd, u8 return_code, u16 rsp_data) /* * Handle host requests from the 8051. - * - * This is a work-queue function outside of the interrupt. */ -void handle_8051_request(struct work_struct *work) +static void handle_8051_request(struct hfi1_pportdata *ppd) { - struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, - dc_host_req_work); struct hfi1_devdata *dd = ppd->dd; u64 reg; u16 data = 0; - u8 type, i, lanes, *cache = ppd->qsfp_info.cache; - u8 cdr_ctrl_byte = cache[QSFP_CDR_CTRL_BYTE_OFFS]; + u8 type; reg = read_csr(dd, DC_DC8051_CFG_EXT_DEV_1); if ((reg & DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK) == 0) @@ -6231,46 +6231,11 @@ case HREQ_READ_CONFIG: case HREQ_SET_TX_EQ_ABS: case HREQ_SET_TX_EQ_REL: + case HREQ_ENABLE: dd_dev_info(dd, "8051 request: request 0x%x not supported\n", type); hreq_response(dd, HREQ_NOT_SUPPORTED, 0); break; - - case HREQ_ENABLE: - lanes = data & 0xF; - for (i = 0; lanes; lanes >>= 1, i++) { - if (!(lanes & 1)) - continue; - if (data & 0x200) { - /* enable TX CDR */ - if (cache[QSFP_MOD_PWR_OFFS] & 0x8 && - cache[QSFP_CDR_INFO_OFFS] & 0x80) - cdr_ctrl_byte |= (1 << (i + 4)); - } else { - /* disable TX CDR */ - if (cache[QSFP_MOD_PWR_OFFS] & 0x8 && - cache[QSFP_CDR_INFO_OFFS] & 0x80) - cdr_ctrl_byte &= ~(1 << (i + 4)); - } - - if (data & 0x800) { - /* enable RX CDR */ - if (cache[QSFP_MOD_PWR_OFFS] & 0x4 && - cache[QSFP_CDR_INFO_OFFS] & 0x40) - cdr_ctrl_byte |= (1 << i); - } else { - /* disable RX CDR */ - if (cache[QSFP_MOD_PWR_OFFS] & 0x4 && - cache[QSFP_CDR_INFO_OFFS] & 0x40) - cdr_ctrl_byte &= ~(1 << i); - } - } - one_qsfp_write(ppd, dd->hfi1_id, QSFP_CDR_CTRL_BYTE_OFFS, - &cdr_ctrl_byte, 1); - hreq_response(dd, HREQ_SUCCESS, data); - refresh_qsfp_cache(ppd, &ppd->qsfp_info); - break; - case HREQ_CONFIG_DONE: hreq_response(dd, HREQ_SUCCESS, 0); break; @@ -6278,7 +6243,6 @@ case HREQ_INTERFACE_TEST: hreq_response(dd, HREQ_SUCCESS, data); break; - default: dd_dev_err(dd, "8051 request: unknown request 0x%x\n", type); hreq_response(dd, HREQ_NOT_SUPPORTED, 0); @@ -6849,6 +6813,75 @@ static void reset_neighbor_info(struct hfi1_pportdata *ppd) ppd->neighbor_fm_security = 0; } +static const char * const link_down_reason_strs[] = { + [OPA_LINKDOWN_REASON_NONE] = "None", + [OPA_LINKDOWN_REASON_RCV_ERROR_0] = "Receive error 0", + [OPA_LINKDOWN_REASON_BAD_PKT_LEN] = "Bad packet length", + [OPA_LINKDOWN_REASON_PKT_TOO_LONG] = "Packet too long", + [OPA_LINKDOWN_REASON_PKT_TOO_SHORT] = "Packet too short", + [OPA_LINKDOWN_REASON_BAD_SLID] = "Bad SLID", + [OPA_LINKDOWN_REASON_BAD_DLID] = "Bad DLID", + [OPA_LINKDOWN_REASON_BAD_L2] = "Bad L2", + [OPA_LINKDOWN_REASON_BAD_SC] = "Bad SC", + [OPA_LINKDOWN_REASON_RCV_ERROR_8] = "Receive error 8", + [OPA_LINKDOWN_REASON_BAD_MID_TAIL] = "Bad mid tail", + [OPA_LINKDOWN_REASON_RCV_ERROR_10] = "Receive error 10", + [OPA_LINKDOWN_REASON_PREEMPT_ERROR] = "Preempt error", + [OPA_LINKDOWN_REASON_PREEMPT_VL15] = "Preempt vl15", + [OPA_LINKDOWN_REASON_BAD_VL_MARKER] = "Bad VL marker", + [OPA_LINKDOWN_REASON_RCV_ERROR_14] = "Receive 
error 14", + [OPA_LINKDOWN_REASON_RCV_ERROR_15] = "Receive error 15", + [OPA_LINKDOWN_REASON_BAD_HEAD_DIST] = "Bad head distance", + [OPA_LINKDOWN_REASON_BAD_TAIL_DIST] = "Bad tail distance", + [OPA_LINKDOWN_REASON_BAD_CTRL_DIST] = "Bad control distance", + [OPA_LINKDOWN_REASON_BAD_CREDIT_ACK] = "Bad credit ack", + [OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER] = "Unsupported VL marker", + [OPA_LINKDOWN_REASON_BAD_PREEMPT] = "Bad preempt", + [OPA_LINKDOWN_REASON_BAD_CONTROL_FLIT] = "Bad control flit", + [OPA_LINKDOWN_REASON_EXCEED_MULTICAST_LIMIT] = "Exceed multicast limit", + [OPA_LINKDOWN_REASON_RCV_ERROR_24] = "Receive error 24", + [OPA_LINKDOWN_REASON_RCV_ERROR_25] = "Receive error 25", + [OPA_LINKDOWN_REASON_RCV_ERROR_26] = "Receive error 26", + [OPA_LINKDOWN_REASON_RCV_ERROR_27] = "Receive error 27", + [OPA_LINKDOWN_REASON_RCV_ERROR_28] = "Receive error 28", + [OPA_LINKDOWN_REASON_RCV_ERROR_29] = "Receive error 29", + [OPA_LINKDOWN_REASON_RCV_ERROR_30] = "Receive error 30", + [OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN] = + "Excessive buffer overrun", + [OPA_LINKDOWN_REASON_UNKNOWN] = "Unknown", + [OPA_LINKDOWN_REASON_REBOOT] = "Reboot", + [OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN] = "Neighbor unknown", + [OPA_LINKDOWN_REASON_FM_BOUNCE] = "FM bounce", + [OPA_LINKDOWN_REASON_SPEED_POLICY] = "Speed policy", + [OPA_LINKDOWN_REASON_WIDTH_POLICY] = "Width policy", + [OPA_LINKDOWN_REASON_DISCONNECTED] = "Disconnected", + [OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED] = + "Local media not installed", + [OPA_LINKDOWN_REASON_NOT_INSTALLED] = "Not installed", + [OPA_LINKDOWN_REASON_CHASSIS_CONFIG] = "Chassis config", + [OPA_LINKDOWN_REASON_END_TO_END_NOT_INSTALLED] = + "End to end not installed", + [OPA_LINKDOWN_REASON_POWER_POLICY] = "Power policy", + [OPA_LINKDOWN_REASON_LINKSPEED_POLICY] = "Link speed policy", + [OPA_LINKDOWN_REASON_LINKWIDTH_POLICY] = "Link width policy", + [OPA_LINKDOWN_REASON_SWITCH_MGMT] = "Switch management", + [OPA_LINKDOWN_REASON_SMA_DISABLED] = "SMA disabled", + [OPA_LINKDOWN_REASON_TRANSIENT] = "Transient" +}; + +/* return the neighbor link down reason string */ +static const char *link_down_reason_str(u8 reason) +{ + const char *str = NULL; + + if (reason < ARRAY_SIZE(link_down_reason_strs)) + str = link_down_reason_strs[reason]; + if (!str) + str = "(invalid)"; + + return str; +} + /* * Handle a link down interrupt from the 8051. 
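A side note on link_down_reason_str() above: link_down_reason_strs[] uses sparse designated initializers, so an in-range but unlisted reason code yields a NULL entry. That is why the helper checks both the array bound and the pointer before falling back to "(invalid)". The same idiom in miniature, with made-up reason codes:

#include <stdio.h>

static const char * const reason_strs[] = {
	[0] = "None",
	[2] = "Bad packet length",	/* index 1 deliberately left unset */
};

static const char *reason_str(unsigned int reason)
{
	const char *str = NULL;

	if (reason < sizeof(reason_strs) / sizeof(reason_strs[0]))
		str = reason_strs[reason];
	return str ? str : "(invalid)";
}

int main(void)
{
	printf("%s\n", reason_str(2));	/* Bad packet length */
	printf("%s\n", reason_str(1));	/* (invalid): gap in the table */
	printf("%s\n", reason_str(99));	/* (invalid): out of range */
	return 0;
}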
* @@ -6857,8 +6890,11 @@ static void reset_neighbor_info(struct hfi1_pportdata *ppd) void handle_link_down(struct work_struct *work) { u8 lcl_reason, neigh_reason = 0; + u8 link_down_reason; struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, - link_down_work); + link_down_work); + int was_up; + static const char ldr_str[] = "Link down reason: "; if ((ppd->host_link_state & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) && @@ -6867,20 +6903,63 @@ void handle_link_down(struct work_struct *work) HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NOT_INSTALLED); /* Go offline first, then deal with reading/writing through 8051 */ + was_up = !!(ppd->host_link_state & HLS_UP); set_link_state(ppd, HLS_DN_OFFLINE); - lcl_reason = 0; - read_planned_down_reason_code(ppd->dd, &neigh_reason); + if (was_up) { + lcl_reason = 0; + /* link down reason is only valid if the link was up */ + read_link_down_reason(ppd->dd, &link_down_reason); + switch (link_down_reason) { + case LDR_LINK_TRANSFER_ACTIVE_LOW: + /* the link went down, no idle message reason */ + dd_dev_info(ppd->dd, "%sUnexpected link down\n", + ldr_str); + break; + case LDR_RECEIVED_LINKDOWN_IDLE_MSG: + /* + * The neighbor reason is only valid if an idle message + * was received for it. + */ + read_planned_down_reason_code(ppd->dd, &neigh_reason); + dd_dev_info(ppd->dd, + "%sNeighbor link down message %d, %s\n", + ldr_str, neigh_reason, + link_down_reason_str(neigh_reason)); + break; + case LDR_RECEIVED_HOST_OFFLINE_REQ: + dd_dev_info(ppd->dd, + "%sHost requested link to go offline\n", + ldr_str); + break; + default: + dd_dev_info(ppd->dd, "%sUnknown reason 0x%x\n", + ldr_str, link_down_reason); + break; + } - /* - * If no reason, assume peer-initiated but missed - * LinkGoingDown idle flits. - */ - if (neigh_reason == 0) - lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN; + /* + * If no reason, assume peer-initiated but missed + * LinkGoingDown idle flits. + */ + if (neigh_reason == 0) + lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN; + } else { + /* went down while polling or going up */ + lcl_reason = OPA_LINKDOWN_REASON_TRANSIENT; + } set_link_down_reason(ppd, lcl_reason, neigh_reason, 0); + /* inform the SMA when the link transitions from up to down */ + if (was_up && ppd->local_link_down_reason.sma == 0 && + ppd->neigh_link_down_reason.sma == 0) { + ppd->local_link_down_reason.sma = + ppd->local_link_down_reason.latest; + ppd->neigh_link_down_reason.sma = + ppd->neigh_link_down_reason.latest; + } + reset_neighbor_info(ppd); /* disable the port */ @@ -6890,7 +6969,7 @@ void handle_link_down(struct work_struct *work) * If there is no cable attached, turn the DC off. Otherwise, * start the link bring up. 
*/ - if (!qsfp_mod_present(ppd)) { + if (ppd->port_type == PORT_TYPE_QSFP && !qsfp_mod_present(ppd)) { dc_shutdown(ppd->dd); } else { tune_serdes(ppd); @@ -7373,7 +7452,11 @@ retry: ppd->link_width_downgrade_rx_active = rx; } - if (lwde == 0) { + if (ppd->link_width_downgrade_tx_active == 0 || + ppd->link_width_downgrade_rx_active == 0) { + /* the 8051 reported a dead link as a downgrade */ + dd_dev_err(ppd->dd, "Link downgrade is really a link down, ignoring\n"); + } else if (lwde == 0) { /* downgrade is disabled */ /* bounce if not at starting active width */ @@ -7534,7 +7617,7 @@ static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg) host_msg &= ~(u64)LINKUP_ACHIEVED; } if (host_msg & EXT_DEVICE_CFG_REQ) { - queue_work(ppd->hfi1_wq, &ppd->dc_host_req_work); + handle_8051_request(ppd); host_msg &= ~(u64)EXT_DEVICE_CFG_REQ; } if (host_msg & VERIFY_CAP_FRAME) { @@ -8660,6 +8743,14 @@ static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc) *pdrrc = (frame >> DOWN_REMOTE_REASON_SHIFT) & DOWN_REMOTE_REASON_MASK; } +static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr) +{ + u32 frame; + + read_8051_config(dd, LINK_DOWN_REASON, GENERAL_CONFIG, &frame); + *ldr = (frame & 0xff); +} + static int read_tx_settings(struct hfi1_devdata *dd, u8 *enable_lane_tx, u8 *tx_polarity_inversion, @@ -9049,9 +9140,9 @@ set_local_link_attributes_fail: } /* - * Call this to start the link. Schedule a retry if the cable is not - * present or if unable to start polling. Do not do anything if the - * link is disabled. Returns 0 if link is disabled or moved to polling + * Call this to start the link. + * Do not do anything if the link is disabled. + * Returns 0 if link is disabled, moved to polling, or the driver is not ready. 
*/ int start_link(struct hfi1_pportdata *ppd) { @@ -9068,15 +9159,7 @@ int start_link(struct hfi1_pportdata *ppd) return 0; } - if (qsfp_mod_present(ppd) || loopback == LOOPBACK_SERDES || - loopback == LOOPBACK_LCB || - ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR) - return set_link_state(ppd, HLS_DN_POLL); - - dd_dev_info(ppd->dd, - "%s: stopping link start because no cable is present\n", - __func__); - return -EAGAIN; + return set_link_state(ppd, HLS_DN_POLL); } static void wait_for_qsfp_init(struct hfi1_pportdata *ppd) @@ -9247,7 +9330,7 @@ static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd, return 0; } -/* This routine will only be scheduled if the QSFP module is present */ +/* This routine will only be scheduled if the QSFP module present is asserted */ void qsfp_event(struct work_struct *work) { struct qsfp_data *qd; @@ -9676,6 +9759,7 @@ static void set_send_length(struct hfi1_pportdata *ppd) & SEND_LEN_CHECK1_LEN_VL15_MASK) << SEND_LEN_CHECK1_LEN_VL15_SHIFT; int i; + u32 thres; for (i = 0; i < ppd->vls_supported; i++) { if (dd->vld[i].mtu > maxvlmtu) @@ -9694,16 +9778,17 @@ static void set_send_length(struct hfi1_pportdata *ppd) /* adjust kernel credit return thresholds based on new MTUs */ /* all kernel receive contexts have the same hdrqentsize */ for (i = 0; i < ppd->vls_supported; i++) { - sc_set_cr_threshold(dd->vld[i].sc, - sc_mtu_to_threshold(dd->vld[i].sc, - dd->vld[i].mtu, - dd->rcd[0]-> - rcvhdrqentsize)); - } - sc_set_cr_threshold(dd->vld[15].sc, - sc_mtu_to_threshold(dd->vld[15].sc, - dd->vld[15].mtu, + thres = min(sc_percent_to_threshold(dd->vld[i].sc, 50), + sc_mtu_to_threshold(dd->vld[i].sc, + dd->vld[i].mtu, dd->rcd[0]->rcvhdrqentsize)); + sc_set_cr_threshold(dd->vld[i].sc, thres); + } + thres = min(sc_percent_to_threshold(dd->vld[15].sc, 50), + sc_mtu_to_threshold(dd->vld[15].sc, + dd->vld[15].mtu, + dd->rcd[0]->rcvhdrqentsize)); + sc_set_cr_threshold(dd->vld[15].sc, thres); /* Adjust maximum MTU for the port in DC */ dcmtu = maxvlmtu == 10240 ? DCC_CFG_PORT_MTU_CAP_10240 : @@ -10030,7 +10115,6 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) struct hfi1_devdata *dd = ppd->dd; struct ib_event event = {.device = NULL}; int ret1, ret = 0; - int was_up, is_down; int orig_new_state, poll_bounce; mutex_lock(&ppd->hls_lock); @@ -10049,8 +10133,6 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) poll_bounce ? 
"(bounce) " : "", link_state_reason_name(ppd, state)); - was_up = !!(ppd->host_link_state & HLS_UP); - /* * If we're going to a (HLS_*) link state that implies the logical * link state is neither of (IB_PORT_ARMED, IB_PORT_ACTIVE), then @@ -10261,17 +10343,6 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) break; } - is_down = !!(ppd->host_link_state & (HLS_DN_POLL | - HLS_DN_DISABLE | HLS_DN_OFFLINE)); - - if (was_up && is_down && ppd->local_link_down_reason.sma == 0 && - ppd->neigh_link_down_reason.sma == 0) { - ppd->local_link_down_reason.sma = - ppd->local_link_down_reason.latest; - ppd->neigh_link_down_reason.sma = - ppd->neigh_link_down_reason.latest; - } - goto done; unexpected: @@ -12673,22 +12744,24 @@ static int set_up_context_variables(struct hfi1_devdata *dd) int total_contexts; int ret; unsigned ngroups; + int qos_rmt_count; + int user_rmt_reduced; /* - * Kernel contexts: (to be fixed later): - * - min or 2 or 1 context/numa + * Kernel receive contexts: + * - min of 2 or 1 context/numa (excluding control context) * - Context 0 - control context (VL15/multicast/error) - * - Context 1 - default context + * - Context 1 - first kernel context + * - Context 2 - second kernel context + * ... */ if (n_krcvqs) /* - * Don't count context 0 in n_krcvqs since - * is isn't used for normal verbs traffic. - * - * krcvqs will reflect number of kernel - * receive contexts above 0. + * n_krcvqs is the sum of module parameter kernel receive + * contexts, krcvqs[]. It does not include the control + * context, so add that. */ - num_kernel_contexts = n_krcvqs + MIN_KERNEL_KCTXTS - 1; + num_kernel_contexts = n_krcvqs + 1; else num_kernel_contexts = num_online_nodes() + 1; num_kernel_contexts = @@ -12705,12 +12778,13 @@ static int set_up_context_variables(struct hfi1_devdata *dd) num_kernel_contexts = dd->chip_send_contexts - num_vls - 1; } /* - * User contexts: (to be fixed later) - * - default to 1 user context per CPU if num_user_contexts is - * negative + * User contexts: + * - default to 1 user context per real (non-HT) CPU core if + * num_user_contexts is negative */ if (num_user_contexts < 0) - num_user_contexts = num_online_cpus(); + num_user_contexts = + cpumask_weight(&dd->affinity->real_cpu_mask); total_contexts = num_kernel_contexts + num_user_contexts; @@ -12727,6 +12801,19 @@ static int set_up_context_variables(struct hfi1_devdata *dd) total_contexts = num_kernel_contexts + num_user_contexts; } + /* each user context requires an entry in the RMT */ + qos_rmt_count = qos_rmt_entries(dd, NULL, NULL); + if (qos_rmt_count + num_user_contexts > NUM_MAP_ENTRIES) { + user_rmt_reduced = NUM_MAP_ENTRIES - qos_rmt_count; + dd_dev_err(dd, + "RMT size is reducing the number of user receive contexts from %d to %d\n", + (int)num_user_contexts, + user_rmt_reduced); + /* recalculate */ + num_user_contexts = user_rmt_reduced; + total_contexts = num_kernel_contexts + num_user_contexts; + } + /* the first N are kernel contexts, the rest are user contexts */ dd->num_rcv_contexts = total_contexts; dd->n_krcv_queues = num_kernel_contexts; @@ -12776,12 +12863,13 @@ static int set_up_context_variables(struct hfi1_devdata *dd) dd->num_send_contexts = ret; dd_dev_info( dd, - "send contexts: chip %d, used %d (kernel %d, ack %d, user %d)\n", + "send contexts: chip %d, used %d (kernel %d, ack %d, user %d, vl15 %d)\n", dd->chip_send_contexts, dd->num_send_contexts, dd->sc_sizes[SC_KERNEL].count, dd->sc_sizes[SC_ACK].count, - dd->sc_sizes[SC_USER].count); + dd->sc_sizes[SC_USER].count, + 
dd->sc_sizes[SC_VL15].count); ret = 0; /* success */ } @@ -13451,122 +13539,224 @@ static void init_qpmap_table(struct hfi1_devdata *dd, int i; u64 ctxt = first_ctxt; - for (i = 0; i < 256;) { + for (i = 0; i < 256; i++) { reg |= ctxt << (8 * (i % 8)); - i++; ctxt++; if (ctxt > last_ctxt) ctxt = first_ctxt; - if (i % 8 == 0) { + if (i % 8 == 7) { write_csr(dd, regno, reg); reg = 0; regno += 8; } } - if (i % 8) - write_csr(dd, regno, reg); add_rcvctrl(dd, RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK | RCV_CTRL_RCV_BYPASS_ENABLE_SMASK); } -/** - * init_qos - init RX qos - * @dd - device data - * @first_context - * - * This routine initializes Rule 0 and the - * RSM map table to implement qos. - * - * If all of the limit tests succeed, - * qos is applied based on the array - * interpretation of krcvqs where - * entry 0 is VL0. - * - * The number of vl bits (n) and the number of qpn - * bits (m) are computed to feed both the RSM map table - * and the single rule. - * +struct rsm_map_table { + u64 map[NUM_MAP_REGS]; + unsigned int used; +}; + +struct rsm_rule_data { + u8 offset; + u8 pkt_type; + u32 field1_off; + u32 field2_off; + u32 index1_off; + u32 index1_width; + u32 index2_off; + u32 index2_width; + u32 mask1; + u32 value1; + u32 mask2; + u32 value2; +}; + +/* + * Return an initialized RMT map table for users to fill in. OK if it + * returns NULL, indicating no table. */ -static void init_qos(struct hfi1_devdata *dd, u32 first_ctxt) +static struct rsm_map_table *alloc_rsm_map_table(struct hfi1_devdata *dd) { + struct rsm_map_table *rmt; + u8 rxcontext = is_ax(dd) ? 0 : 0xff; /* 0 is default if a0 ver. */ + + rmt = kmalloc(sizeof(*rmt), GFP_KERNEL); + if (rmt) { + memset(rmt->map, rxcontext, sizeof(rmt->map)); + rmt->used = 0; + } + + return rmt; +} + +/* + * Write the final RMT map table to the chip and free the table. OK if + * table is NULL. + */ +static void complete_rsm_map_table(struct hfi1_devdata *dd, + struct rsm_map_table *rmt) +{ + int i; + + if (rmt) { + /* write table to chip */ + for (i = 0; i < NUM_MAP_REGS; i++) + write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rmt->map[i]); + + /* enable RSM */ + add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK); + } +} + +/* + * Add a receive side mapping rule. + */ +static void add_rsm_rule(struct hfi1_devdata *dd, u8 rule_index, + struct rsm_rule_data *rrd) +{ + write_csr(dd, RCV_RSM_CFG + (8 * rule_index), + (u64)rrd->offset << RCV_RSM_CFG_OFFSET_SHIFT | + 1ull << rule_index | /* enable bit */ + (u64)rrd->pkt_type << RCV_RSM_CFG_PACKET_TYPE_SHIFT); + write_csr(dd, RCV_RSM_SELECT + (8 * rule_index), + (u64)rrd->field1_off << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT | + (u64)rrd->field2_off << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT | + (u64)rrd->index1_off << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT | + (u64)rrd->index1_width << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT | + (u64)rrd->index2_off << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT | + (u64)rrd->index2_width << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT); + write_csr(dd, RCV_RSM_MATCH + (8 * rule_index), + (u64)rrd->mask1 << RCV_RSM_MATCH_MASK1_SHIFT | + (u64)rrd->value1 << RCV_RSM_MATCH_VALUE1_SHIFT | + (u64)rrd->mask2 << RCV_RSM_MATCH_MASK2_SHIFT | + (u64)rrd->value2 << RCV_RSM_MATCH_VALUE2_SHIFT); +} + +/* return the number of RSM map table entries that will be used for QOS */ +static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp, + unsigned int *np) +{ + int i; + unsigned int m, n; u8 max_by_vl = 0; - unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m; - u64 *rsmmap; - u64 reg; - u8 rxcontext = is_ax(dd) ? 
0 : 0xff; /* 0 is default if a0 ver. */ - /* validate */ + /* is QOS active at all? */ if (dd->n_krcv_queues <= MIN_KERNEL_KCTXTS || num_vls == 1 || krcvqsset <= 1) - goto bail; - for (i = 0; i < min_t(unsigned, num_vls, krcvqsset); i++) + goto no_qos; + + /* determine bits for qpn */ + for (i = 0; i < min_t(unsigned int, num_vls, krcvqsset); i++) if (krcvqs[i] > max_by_vl) max_by_vl = krcvqs[i]; if (max_by_vl > 32) - goto bail; - qpns_per_vl = __roundup_pow_of_two(max_by_vl); - /* determine bits vl */ - n = ilog2(num_vls); - /* determine bits for qpn */ - m = ilog2(qpns_per_vl); + goto no_qos; + m = ilog2(__roundup_pow_of_two(max_by_vl)); + + /* determine bits for vl */ + n = ilog2(__roundup_pow_of_two(num_vls)); + + /* reject if too much is used */ if ((m + n) > 7) + goto no_qos; + + if (mp) + *mp = m; + if (np) + *np = n; + + return 1 << (m + n); + +no_qos: + if (mp) + *mp = 0; + if (np) + *np = 0; + return 0; +} + +/** + * init_qos - init RX qos + * @dd - device data + * @rmt - RSM map table + * + * This routine initializes Rule 0 and the RSM map table to implement + * quality of service (qos). + * + * If all of the limit tests succeed, qos is applied based on the array + * interpretation of krcvqs where entry 0 is VL0. + * + * The number of vl bits (n) and the number of qpn bits (m) are computed to + * feed both the RSM map table and the single rule. + */ +static void init_qos(struct hfi1_devdata *dd, struct rsm_map_table *rmt) +{ + struct rsm_rule_data rrd; + unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m; + unsigned int rmt_entries; + u64 reg; + + if (!rmt) goto bail; - if (num_vls * qpns_per_vl > dd->chip_rcv_contexts) + rmt_entries = qos_rmt_entries(dd, &m, &n); + if (rmt_entries == 0) goto bail; - rsmmap = kmalloc_array(NUM_MAP_REGS, sizeof(u64), GFP_KERNEL); - if (!rsmmap) + qpns_per_vl = 1 << m; + + /* enough room in the map table? 
*/
+	rmt_entries = 1 << (m + n);
+	if (rmt->used + rmt_entries >= NUM_MAP_ENTRIES)
 		goto bail;
-	memset(rsmmap, rxcontext, NUM_MAP_REGS * sizeof(u64));
-	/* init the local copy of the table */
-	for (i = 0, ctxt = first_ctxt; i < num_vls; i++) {
+
+	/* add qos entries to the RSM map table */
+	for (i = 0, ctxt = FIRST_KERNEL_KCTXT; i < num_vls; i++) {
 		unsigned tctxt;
 
 		for (qpn = 0, tctxt = ctxt; krcvqs[i] && qpn < qpns_per_vl; qpn++) {
 			unsigned idx, regoff, regidx;
 
-			/* generate index <= 128 */
-			idx = (qpn << n) ^ i;
+			/* generate the index the hardware will produce */
+			idx = rmt->used + ((qpn << n) ^ i);
 			regoff = (idx % 8) * 8;
 			regidx = idx / 8;
-			reg = rsmmap[regidx];
-			/* replace 0xff with context number */
+			/* replace default with context number */
+			reg = rmt->map[regidx];
 			reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK << regoff);
 			reg |= (u64)(tctxt++) << regoff;
-			rsmmap[regidx] = reg;
+			rmt->map[regidx] = reg;
 			if (tctxt == ctxt + krcvqs[i])
 				tctxt = ctxt;
 		}
 		ctxt += krcvqs[i];
 	}
-	/* flush cached copies to chip */
-	for (i = 0; i < NUM_MAP_REGS; i++)
-		write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rsmmap[i]);
-	/* add rule0 */
-	write_csr(dd, RCV_RSM_CFG /* + (8 * 0) */,
-		  RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK <<
-		  RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT |
-		  2ull << RCV_RSM_CFG_PACKET_TYPE_SHIFT);
-	write_csr(dd, RCV_RSM_SELECT /* + (8 * 0) */,
-		  LRH_BTH_MATCH_OFFSET << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT |
-		  LRH_SC_MATCH_OFFSET << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT |
-		  LRH_SC_SELECT_OFFSET << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT |
-		  ((u64)n) << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT |
-		  QPN_SELECT_OFFSET << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT |
-		  ((u64)m + (u64)n) << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT);
-	write_csr(dd, RCV_RSM_MATCH /* + (8 * 0) */,
-		  LRH_BTH_MASK << RCV_RSM_MATCH_MASK1_SHIFT |
-		  LRH_BTH_VALUE << RCV_RSM_MATCH_VALUE1_SHIFT |
-		  LRH_SC_MASK << RCV_RSM_MATCH_MASK2_SHIFT |
-		  LRH_SC_VALUE << RCV_RSM_MATCH_VALUE2_SHIFT);
-	/* Enable RSM */
-	add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK);
-	kfree(rsmmap);
-	/* map everything else to first context */
-	init_qpmap_table(dd, FIRST_KERNEL_KCTXT, MIN_KERNEL_KCTXTS - 1);
+
+	rrd.offset = rmt->used;
+	rrd.pkt_type = 2;
+	rrd.field1_off = LRH_BTH_MATCH_OFFSET;
+	rrd.field2_off = LRH_SC_MATCH_OFFSET;
+	rrd.index1_off = LRH_SC_SELECT_OFFSET;
+	rrd.index1_width = n;
+	rrd.index2_off = QPN_SELECT_OFFSET;
+	rrd.index2_width = m + n;
+	rrd.mask1 = LRH_BTH_MASK;
+	rrd.value1 = LRH_BTH_VALUE;
+	rrd.mask2 = LRH_SC_MASK;
+	rrd.value2 = LRH_SC_VALUE;
+
+	/* add rule 0 */
+	add_rsm_rule(dd, 0, &rrd);
+
+	/* mark RSM map entries as used */
+	rmt->used += rmt_entries;
+	/* map everything else to the mcast/err/vl15 context */
+	init_qpmap_table(dd, HFI1_CTRL_CTXT, HFI1_CTRL_CTXT);
 	dd->qos_shift = n + 1;
 	return;
 bail:
@@ -13574,13 +13764,86 @@ bail:
 	init_qpmap_table(dd, FIRST_KERNEL_KCTXT, dd->n_krcv_queues - 1);
 }
 
+static void init_user_fecn_handling(struct hfi1_devdata *dd,
+				    struct rsm_map_table *rmt)
+{
+	struct rsm_rule_data rrd;
+	u64 reg;
+	int i, idx, regoff, regidx;
+	u8 offset;
+
+	/* there needs to be enough room in the map table */
+	if (rmt->used + dd->num_user_contexts >= NUM_MAP_ENTRIES) {
+		dd_dev_err(dd, "User FECN handling disabled - too many user contexts allocated\n");
+		return;
+	}
+
+	/*
+	 * RSM will extract the destination context as an index into the
+	 * map table. The destination contexts are a sequential block
+	 * in the range first_user_ctxt...num_rcv_contexts-1 (inclusive).
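The map-table arithmetic in init_qos() above packs eight 8-bit receive-context numbers into each 64-bit RSM map register (RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK is 0xFF per the chip_registers.h hunk below): entry idx lives in register idx / 8 at byte lane idx % 8. A minimal standalone sketch of that packing, using userspace types; rmt_set_entry is a hypothetical helper, not a driver function:

#include <stdint.h>

/* place an 8-bit receive context number at map entry idx */
static void rmt_set_entry(uint64_t *map, unsigned int idx, uint8_t ctxt)
{
	unsigned int regidx = idx / 8;        /* which 64-bit register */
	unsigned int regoff = (idx % 8) * 8;  /* bit offset of the byte lane */

	map[regidx] &= ~(0xffull << regoff);  /* clear the old entry */
	map[regidx] |= (uint64_t)ctxt << regoff;
}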
+ * Map entries are accessed as offset + extracted value. Adjust + * the added offset so this sequence can be placed anywhere in + * the table - as long as the entries themselves do not wrap. + * There are only enough bits in offset for the table size, so + * start with that to allow for a "negative" offset. + */ + offset = (u8)(NUM_MAP_ENTRIES + (int)rmt->used - + (int)dd->first_user_ctxt); + + for (i = dd->first_user_ctxt, idx = rmt->used; + i < dd->num_rcv_contexts; i++, idx++) { + /* replace with identity mapping */ + regoff = (idx % 8) * 8; + regidx = idx / 8; + reg = rmt->map[regidx]; + reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK << regoff); + reg |= (u64)i << regoff; + rmt->map[regidx] = reg; + } + + /* + * For RSM intercept of Expected FECN packets: + * o packet type 0 - expected + * o match on F (bit 95), using select/match 1, and + * o match on SH (bit 133), using select/match 2. + * + * Use index 1 to extract the 8-bit receive context from DestQP + * (start at bit 64). Use that as the RSM map table index. + */ + rrd.offset = offset; + rrd.pkt_type = 0; + rrd.field1_off = 95; + rrd.field2_off = 133; + rrd.index1_off = 64; + rrd.index1_width = 8; + rrd.index2_off = 0; + rrd.index2_width = 0; + rrd.mask1 = 1; + rrd.value1 = 1; + rrd.mask2 = 1; + rrd.value2 = 1; + + /* add rule 1 */ + add_rsm_rule(dd, 1, &rrd); + + rmt->used += dd->num_user_contexts; +} + static void init_rxe(struct hfi1_devdata *dd) { + struct rsm_map_table *rmt; + /* enable all receive errors */ write_csr(dd, RCV_ERR_MASK, ~0ull); - /* setup QPN map table - start where VL15 context leaves off */ - init_qos(dd, dd->n_krcv_queues > MIN_KERNEL_KCTXTS ? - MIN_KERNEL_KCTXTS : 0); + + rmt = alloc_rsm_map_table(dd); + /* set up QOS, including the QPN map table */ + init_qos(dd, rmt); + init_user_fecn_handling(dd, rmt); + complete_rsm_map_table(dd, rmt); + kfree(rmt); + /* * make sure RcvCtrl.RcvWcb <= PCIe Device Control * Register Max_Payload_Size (PCI_EXP_DEVCTL in Linux PCIe config @@ -13762,6 +14025,7 @@ int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey) write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, reg); reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE); reg |= SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK; + reg &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK; write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg); done: return ret; @@ -14148,6 +14412,19 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, (dd->revision >> CCE_REVISION_SW_SHIFT) & CCE_REVISION_SW_MASK); + /* + * The real cpu mask is part of the affinity struct but has to be + * initialized earlier than the rest of the affinity struct because it + * is needed to calculate the number of user contexts in + * set_up_context_variables(). However, hfi1_dev_affinity_init(), + * which initializes the rest of the affinity struct members, + * depends on set_up_context_variables() for the number of kernel + * contexts, so it cannot be called before set_up_context_variables(). 
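The "negative" offset above relies on 8-bit wraparound: assuming NUM_MAP_ENTRIES is 256 (32 registers of 8 entries each, consistent with the u8 offset field), adding offset = (u8)(NUM_MAP_ENTRIES + rmt->used - first_user_ctxt) to an extracted context number lands exactly on the identity entries placed starting at rmt->used. A quick userspace check of that arithmetic; all concrete values here are made-up examples:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	unsigned int num_map_entries = 256;  /* assumed: 32 regs x 8 entries */
	unsigned int used = 20;              /* map entries already consumed */
	unsigned int first_user_ctxt = 9;    /* start of the user context block */
	uint8_t offset = (uint8_t)(num_map_entries + used - first_user_ctxt);

	/* every user context must select its own identity entry */
	for (unsigned int ctxt = first_user_ctxt; ctxt < 40; ctxt++)
		assert((uint8_t)(offset + ctxt) == used + (ctxt - first_user_ctxt));
	return 0;
}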
+ */ + ret = init_real_cpu_mask(dd); + if (ret) + goto bail_cleanup; + ret = set_up_context_variables(dd); if (ret) goto bail_cleanup; @@ -14161,9 +14438,7 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, /* set up KDETH QP prefix in both RX and TX CSRs */ init_kdeth_qp(dd); - ret = hfi1_dev_affinity_init(dd); - if (ret) - goto bail_cleanup; + hfi1_dev_affinity_init(dd); /* send contexts must be set up before receive contexts */ ret = init_send_contexts(dd); diff --git a/drivers/staging/rdma/hfi1/chip.h b/drivers/staging/rdma/hfi1/chip.h index 4f3b878e43eb..1948706fff1a 100644 --- a/drivers/staging/rdma/hfi1/chip.h +++ b/drivers/staging/rdma/hfi1/chip.h @@ -389,6 +389,7 @@ #define LAST_REMOTE_STATE_COMPLETE 0x13 #define LINK_QUALITY_INFO 0x14 #define REMOTE_DEVICE_ID 0x15 +#define LINK_DOWN_REASON 0x16 /* 8051 lane specific register field IDs */ #define TX_EQ_SETTINGS 0x00 @@ -497,6 +498,11 @@ #define PWRM_BER_CONTROL 0x1 #define PWRM_BANDWIDTH_CONTROL 0x2 +/* 8051 link down reasons */ +#define LDR_LINK_TRANSFER_ACTIVE_LOW 0xa +#define LDR_RECEIVED_LINKDOWN_IDLE_MSG 0xb +#define LDR_RECEIVED_HOST_OFFLINE_REQ 0xc + /* verify capability fabric CRC size bits */ enum { CAP_CRC_14B = (1 << 0), /* 14b CRC */ @@ -691,7 +697,6 @@ void handle_verify_cap(struct work_struct *work); void handle_freeze(struct work_struct *work); void handle_link_up(struct work_struct *work); void handle_link_down(struct work_struct *work); -void handle_8051_request(struct work_struct *work); void handle_link_downgrade(struct work_struct *work); void handle_link_bounce(struct work_struct *work); void handle_sma_message(struct work_struct *work); diff --git a/drivers/staging/rdma/hfi1/chip_registers.h b/drivers/staging/rdma/hfi1/chip_registers.h index 770f05c9b8de..8744de6667c2 100644 --- a/drivers/staging/rdma/hfi1/chip_registers.h +++ b/drivers/staging/rdma/hfi1/chip_registers.h @@ -771,6 +771,7 @@ #define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK 0x1ull #define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT 0 #define RCV_RSM_CFG_PACKET_TYPE_SHIFT 60 +#define RCV_RSM_CFG_OFFSET_SHIFT 32 #define RCV_RSM_MAP_TABLE (RXE + 0x000000000900) #define RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK 0xFFull #define RCV_RSM_MATCH (RXE + 0x000000000800) diff --git a/drivers/staging/rdma/hfi1/diag.c b/drivers/staging/rdma/hfi1/diag.c index c5b520bf610e..bb2409ad891a 100644 --- a/drivers/staging/rdma/hfi1/diag.c +++ b/drivers/staging/rdma/hfi1/diag.c @@ -413,7 +413,8 @@ static ssize_t diagpkt_send(struct diag_pkt *dp) goto bail; } /* can only use kernel contexts */ - if (dd->send_contexts[dp->sw_index].type != SC_KERNEL) { + if (dd->send_contexts[dp->sw_index].type != SC_KERNEL && + dd->send_contexts[dp->sw_index].type != SC_VL15) { ret = -EINVAL; goto bail; } diff --git a/drivers/staging/rdma/hfi1/driver.c b/drivers/staging/rdma/hfi1/driver.c index 34511e5df1d5..700c6fa3a633 100644 --- a/drivers/staging/rdma/hfi1/driver.c +++ b/drivers/staging/rdma/hfi1/driver.c @@ -75,7 +75,8 @@ DEFINE_MUTEX(hfi1_mutex); /* general driver use */ unsigned int hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU; module_param_named(max_mtu, hfi1_max_mtu, uint, S_IRUGO); -MODULE_PARM_DESC(max_mtu, "Set max MTU bytes, default is 8192"); +MODULE_PARM_DESC(max_mtu, "Set max MTU bytes, default is " __stringify( + HFI1_DEFAULT_MAX_MTU)); unsigned int hfi1_cu = 1; module_param_named(cu, hfi1_cu, uint, S_IRUGO); diff --git a/drivers/staging/rdma/hfi1/firmware.c b/drivers/staging/rdma/hfi1/firmware.c index 3040162cb326..ed680fda611d 100644 --- a/drivers/staging/rdma/hfi1/firmware.c +++ 
b/drivers/staging/rdma/hfi1/firmware.c @@ -1413,8 +1413,15 @@ static int __acquire_chip_resource(struct hfi1_devdata *dd, u32 resource) if (resource & CR_DYN_MASK) { /* a dynamic resource is in use if either HFI has set the bit */ - all_bits = resource_mask(0, resource) | + if (dd->pcidev->device == PCI_DEVICE_ID_INTEL0 && + (resource & (CR_I2C1 | CR_I2C2))) { + /* discrete devices must serialize across both chains */ + all_bits = resource_mask(0, CR_I2C1 | CR_I2C2) | + resource_mask(1, CR_I2C1 | CR_I2C2); + } else { + all_bits = resource_mask(0, resource) | resource_mask(1, resource); + } my_bit = resource_mask(dd->hfi1_id, resource); } else { /* non-dynamic resources are not split between HFIs */ diff --git a/drivers/staging/rdma/hfi1/hfi.h b/drivers/staging/rdma/hfi1/hfi.h index 16cbdc4073e0..7b78d56de7f5 100644 --- a/drivers/staging/rdma/hfi1/hfi.h +++ b/drivers/staging/rdma/hfi1/hfi.h @@ -455,9 +455,9 @@ struct rvt_sge_state; #define HLS_UP (HLS_UP_INIT | HLS_UP_ARMED | HLS_UP_ACTIVE) /* use this MTU size if none other is given */ -#define HFI1_DEFAULT_ACTIVE_MTU 8192 +#define HFI1_DEFAULT_ACTIVE_MTU 10240 /* use this MTU size as the default maximum */ -#define HFI1_DEFAULT_MAX_MTU 8192 +#define HFI1_DEFAULT_MAX_MTU 10240 /* default partition key */ #define DEFAULT_PKEY 0xffff @@ -606,7 +606,6 @@ struct hfi1_pportdata { struct work_struct link_vc_work; struct work_struct link_up_work; struct work_struct link_down_work; - struct work_struct dc_host_req_work; struct work_struct sma_message_work; struct work_struct freeze_work; struct work_struct link_downgrade_work; @@ -1258,7 +1257,7 @@ void receive_interrupt_work(struct work_struct *work); static inline int hdr2sc(struct hfi1_message_header *hdr, u64 rhf) { return ((be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf) | - ((!!(rhf & RHF_DC_INFO_MASK)) << 4); + ((!!(rhf & RHF_DC_INFO_SMASK)) << 4); } static inline u16 generate_jkey(kuid_t uid) @@ -1333,6 +1332,9 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn, void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn, u32 pkey, u32 slid, u32 dlid, u8 sc5, const struct ib_grh *old_grh); +#define PKEY_CHECK_INVALID -1 +int egress_pkey_check(struct hfi1_pportdata *ppd, __be16 *lrh, __be32 *bth, + u8 sc5, int8_t s_pkey_index); #define PACKET_EGRESS_TIMEOUT 350 static inline void pause_for_credit_return(struct hfi1_devdata *dd) @@ -1776,6 +1778,7 @@ extern struct mutex hfi1_mutex; #define HFI1_PKT_USER_SC_INTEGRITY \ (SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK \ + | SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK \ | SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK \ | SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK) diff --git a/drivers/staging/rdma/hfi1/init.c b/drivers/staging/rdma/hfi1/init.c index cfcdc16b41c3..502b7cf4647d 100644 --- a/drivers/staging/rdma/hfi1/init.c +++ b/drivers/staging/rdma/hfi1/init.c @@ -422,9 +422,10 @@ static enum hrtimer_restart cca_timer_fn(struct hrtimer *t) struct cca_timer *cca_timer; struct hfi1_pportdata *ppd; int sl; - u16 ccti, ccti_timer, ccti_min; + u16 ccti_timer, ccti_min; struct cc_state *cc_state; unsigned long flags; + enum hrtimer_restart ret = HRTIMER_NORESTART; cca_timer = container_of(t, struct cca_timer, hrtimer); ppd = cca_timer->ppd; @@ -450,24 +451,21 @@ static enum hrtimer_restart cca_timer_fn(struct hrtimer *t) spin_lock_irqsave(&ppd->cca_timer_lock, flags); - ccti = cca_timer->ccti; - - if (ccti > ccti_min) { + if (cca_timer->ccti > ccti_min) { cca_timer->ccti--; set_link_ipg(ppd); } - 
spin_unlock_irqrestore(&ppd->cca_timer_lock, flags); - - rcu_read_unlock(); - - if (ccti > ccti_min) { + if (cca_timer->ccti > ccti_min) { unsigned long nsec = 1024 * ccti_timer; /* ccti_timer is in units of 1.024 usec */ hrtimer_forward_now(t, ns_to_ktime(nsec)); - return HRTIMER_RESTART; + ret = HRTIMER_RESTART; } - return HRTIMER_NORESTART; + + spin_unlock_irqrestore(&ppd->cca_timer_lock, flags); + rcu_read_unlock(); + return ret; } /* @@ -496,7 +494,6 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, INIT_WORK(&ppd->link_vc_work, handle_verify_cap); INIT_WORK(&ppd->link_up_work, handle_link_up); INIT_WORK(&ppd->link_down_work, handle_link_down); - INIT_WORK(&ppd->dc_host_req_work, handle_8051_request); INIT_WORK(&ppd->freeze_work, handle_freeze); INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade); INIT_WORK(&ppd->sma_message_work, handle_sma_message); @@ -1007,7 +1004,7 @@ void hfi1_free_devdata(struct hfi1_devdata *dd) free_percpu(dd->rcv_limit); hfi1_dev_affinity_free(dd); free_percpu(dd->send_schedule); - ib_dealloc_device(&dd->verbs_dev.rdi.ibdev); + rvt_dealloc_device(&dd->verbs_dev.rdi); } /* @@ -1110,7 +1107,7 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) bail: if (!list_empty(&dd->list)) list_del_init(&dd->list); - ib_dealloc_device(&dd->verbs_dev.rdi.ibdev); + rvt_dealloc_device(&dd->verbs_dev.rdi); return ERR_PTR(ret); } diff --git a/drivers/staging/rdma/hfi1/mad.c b/drivers/staging/rdma/hfi1/mad.c index d1e7f4d7cf6f..ed58cf21e790 100644 --- a/drivers/staging/rdma/hfi1/mad.c +++ b/drivers/staging/rdma/hfi1/mad.c @@ -999,7 +999,21 @@ static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp, break; } - set_link_state(ppd, link_state); + if ((link_state == HLS_DN_POLL || + link_state == HLS_DN_DOWNDEF)) { + /* + * Going to poll. No matter what the current state, + * always move offline first, then tune and start the + * link. This correctly handles a FM link bounce and + * a link enable. Going offline is a no-op if already + * offline. + */ + set_link_state(ppd, HLS_DN_OFFLINE); + tune_serdes(ppd); + start_link(ppd); + } else { + set_link_state(ppd, link_state); + } if (link_state == HLS_DN_DISABLE && (ppd->offline_disabled_reason > HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED) || diff --git a/drivers/staging/rdma/hfi1/mmu_rb.c b/drivers/staging/rdma/hfi1/mmu_rb.c index b3f0682a36c9..2b0e91d3093d 100644 --- a/drivers/staging/rdma/hfi1/mmu_rb.c +++ b/drivers/staging/rdma/hfi1/mmu_rb.c @@ -91,7 +91,7 @@ static unsigned long mmu_node_start(struct mmu_rb_node *node) static unsigned long mmu_node_last(struct mmu_rb_node *node) { - return PAGE_ALIGN((node->addr & PAGE_MASK) + node->len) - 1; + return PAGE_ALIGN(node->addr + node->len) - 1; } int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops) @@ -126,10 +126,15 @@ void hfi1_mmu_rb_unregister(struct rb_root *root) if (!handler) return; + /* Unregister first so we don't get any more notifications. 
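The mmu_node_last() change in the mmu_rb.c hunk above fixes the interval end for unaligned registrations: masking addr down to a page boundary before adding len could compute a last byte one page short of where the buffer actually ends. A userspace illustration, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
	unsigned long addr = 0x1000f00;  /* not page aligned */
	unsigned long len = 0x200;       /* spills into the next page */

	unsigned long end = addr + len - 1;                           /* 0x10010ff */
	unsigned long old = PAGE_ALIGN((addr & PAGE_MASK) + len) - 1; /* 0x1000fff */
	unsigned long new = PAGE_ALIGN(addr + len) - 1;               /* 0x1001fff */

	/* the old formula excludes the buffer's final page */
	printf("end=%#lx old=%#lx new=%#lx\n", end, old, new);
	return 0;
}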
*/ + if (current->mm) + mmu_notifier_unregister(&handler->mn, current->mm); + spin_lock_irqsave(&mmu_rb_lock, flags); list_del(&handler->list); spin_unlock_irqrestore(&mmu_rb_lock, flags); + spin_lock_irqsave(&handler->lock, flags); if (!RB_EMPTY_ROOT(root)) { struct rb_node *node; struct mmu_rb_node *rbnode; @@ -141,9 +146,8 @@ void hfi1_mmu_rb_unregister(struct rb_root *root) handler->ops->remove(root, rbnode, NULL); } } + spin_unlock_irqrestore(&handler->lock, flags); - if (current->mm) - mmu_notifier_unregister(&handler->mn, current->mm); kfree(handler); } @@ -235,6 +239,25 @@ struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *root, unsigned long addr, return node; } +struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *root, + unsigned long addr, unsigned long len) +{ + struct mmu_rb_handler *handler = find_mmu_handler(root); + struct mmu_rb_node *node; + unsigned long flags; + + if (!handler) + return ERR_PTR(-EINVAL); + + spin_lock_irqsave(&handler->lock, flags); + node = __mmu_rb_search(handler, addr, len); + if (node) + __mmu_int_rb_remove(node, handler->root); + spin_unlock_irqrestore(&handler->lock, flags); + + return node; +} + void hfi1_mmu_rb_remove(struct rb_root *root, struct mmu_rb_node *node) { struct mmu_rb_handler *handler = find_mmu_handler(root); @@ -293,9 +316,9 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn, hfi1_cdbg(MMU, "Invalidating node addr 0x%llx, len %u", node->addr, node->len); if (handler->ops->invalidate(root, node)) { - spin_unlock_irqrestore(&handler->lock, flags); - __mmu_rb_remove(handler, node, mm); - spin_lock_irqsave(&handler->lock, flags); + __mmu_int_rb_remove(node, root); + if (handler->ops->remove) + handler->ops->remove(root, node, mm); } } spin_unlock_irqrestore(&handler->lock, flags); diff --git a/drivers/staging/rdma/hfi1/mmu_rb.h b/drivers/staging/rdma/hfi1/mmu_rb.h index 19a306e83c7d..7a57b9c49d27 100644 --- a/drivers/staging/rdma/hfi1/mmu_rb.h +++ b/drivers/staging/rdma/hfi1/mmu_rb.h @@ -70,5 +70,7 @@ int hfi1_mmu_rb_insert(struct rb_root *, struct mmu_rb_node *); void hfi1_mmu_rb_remove(struct rb_root *, struct mmu_rb_node *); struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *, unsigned long, unsigned long); +struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *, unsigned long, + unsigned long); #endif /* _HFI1_MMU_RB_H */ diff --git a/drivers/staging/rdma/hfi1/pio.c b/drivers/staging/rdma/hfi1/pio.c index c6849ce9e5eb..c67b9ad3fcf4 100644 --- a/drivers/staging/rdma/hfi1/pio.c +++ b/drivers/staging/rdma/hfi1/pio.c @@ -139,23 +139,30 @@ void pio_send_control(struct hfi1_devdata *dd, int op) /* Send Context Size (SCS) wildcards */ #define SCS_POOL_0 -1 #define SCS_POOL_1 -2 + /* Send Context Count (SCC) wildcards */ #define SCC_PER_VL -1 #define SCC_PER_CPU -2 - #define SCC_PER_KRCVQ -3 -#define SCC_ACK_CREDITS 32 + +/* Send Context Size (SCS) constants */ +#define SCS_ACK_CREDITS 32 +#define SCS_VL15_CREDITS 102 /* 3 pkts of 2048B data + 128B header */ + +#define PIO_THRESHOLD_CEILING 4096 #define PIO_WAIT_BATCH_SIZE 5 /* default send context sizes */ static struct sc_config_sizes sc_config_sizes[SC_MAX] = { [SC_KERNEL] = { .size = SCS_POOL_0, /* even divide, pool 0 */ - .count = SCC_PER_VL },/* one per NUMA */ - [SC_ACK] = { .size = SCC_ACK_CREDITS, + .count = SCC_PER_VL }, /* one per NUMA */ + [SC_ACK] = { .size = SCS_ACK_CREDITS, .count = SCC_PER_KRCVQ }, [SC_USER] = { .size = SCS_POOL_0, /* even divide, pool 0 */ .count = SCC_PER_CPU }, /* one per CPU */ + [SC_VL15] = { .size = SCS_VL15_CREDITS, + 
.count = 1 }, }; @@ -202,7 +209,8 @@ static int wildcard_to_pool(int wc) static const char *sc_type_names[SC_MAX] = { "kernel", "ack", - "user" + "user", + "vl15" }; static const char *sc_type_name(int index) @@ -231,6 +239,22 @@ int init_sc_pools_and_sizes(struct hfi1_devdata *dd) int i; /* + * When SDMA is enabled, kernel context pio packet size is capped by + * "piothreshold". Reduce pio buffer allocation for kernel context by + * setting it to a fixed size. The allocation allows 3-deep buffering + * of the largest pio packets plus up to 128 bytes header, sufficient + * to maintain verbs performance. + * + * When SDMA is disabled, keep the default pooling allocation. + */ + if (HFI1_CAP_IS_KSET(SDMA)) { + u16 max_pkt_size = (piothreshold < PIO_THRESHOLD_CEILING) ? + piothreshold : PIO_THRESHOLD_CEILING; + sc_config_sizes[SC_KERNEL].size = + 3 * (max_pkt_size + 128) / PIO_BLOCK_SIZE; + } + + /* * Step 0: * - copy the centipercents/absolute sizes from the pool config * - sanity check these values @@ -311,7 +335,7 @@ int init_sc_pools_and_sizes(struct hfi1_devdata *dd) if (i == SC_ACK) { count = dd->n_krcv_queues; } else if (i == SC_KERNEL) { - count = (INIT_SC_PER_VL * num_vls) + 1 /* VL15 */; + count = INIT_SC_PER_VL * num_vls; } else if (count == SCC_PER_CPU) { count = dd->num_rcv_contexts - dd->n_krcv_queues; } else if (count < 0) { @@ -596,7 +620,7 @@ u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize) * Return value is what to write into the CSR: trigger return when * unreturned credits pass this count. */ -static u32 sc_percent_to_threshold(struct send_context *sc, u32 percent) +u32 sc_percent_to_threshold(struct send_context *sc, u32 percent) { return (sc->credits * percent) / 100; } @@ -790,7 +814,10 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type, * For Ack contexts, set a threshold for half the credits. * For User contexts use the given percentage. This has been * sanitized on driver start-up. - * For Kernel contexts, use the default MTU plus a header. + * For Kernel contexts, use the default MTU plus a header + * or half the credits, whichever is smaller. This should + * work for both the 3-deep buffering allocation and the + * pooling allocation. 
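The sizing comment above can be sanity-checked against the numbers in this hunk. Credits are 64-byte PIO blocks (the SCS_VL15_CREDITS value works out to 3 x (2048 + 128) / 64 = 102), so with SDMA enabled and piothreshold at or above the 4096-byte ceiling, a kernel send context gets 3 x (4096 + 128) / 64 = 198 blocks. A sketch of the arithmetic, with the module-parameter value chosen as an example:

#include <stdio.h>

int main(void)
{
	unsigned int block = 64;          /* PIO block size in bytes */
	unsigned int piothreshold = 8192; /* example module-parameter value */
	unsigned int ceiling = 4096;      /* PIO_THRESHOLD_CEILING */
	unsigned int max_pkt = piothreshold < ceiling ? piothreshold : ceiling;

	/* 3-deep buffering of the largest PIO packet plus a 128B header */
	printf("kernel SC size: %u blocks\n", 3 * (max_pkt + 128) / block);
	printf("vl15 SC size: %u blocks\n", 3 * (2048 + 128) / block);
	return 0;
}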
*/ if (type == SC_ACK) { thresh = sc_percent_to_threshold(sc, 50); @@ -798,7 +825,9 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type, thresh = sc_percent_to_threshold(sc, user_credit_return_threshold); } else { /* kernel */ - thresh = sc_mtu_to_threshold(sc, hfi1_max_mtu, hdrqentsize); + thresh = min(sc_percent_to_threshold(sc, 50), + sc_mtu_to_threshold(sc, hfi1_max_mtu, + hdrqentsize)); } reg = thresh << SC(CREDIT_CTRL_THRESHOLD_SHIFT); /* add in early return */ @@ -1531,7 +1560,8 @@ static void sc_piobufavail(struct send_context *sc) unsigned long flags; unsigned i, n = 0; - if (dd->send_contexts[sc->sw_index].type != SC_KERNEL) + if (dd->send_contexts[sc->sw_index].type != SC_KERNEL && + dd->send_contexts[sc->sw_index].type != SC_VL15) return; list = &sc->piowait; /* @@ -1900,7 +1930,7 @@ int init_pervl_scs(struct hfi1_devdata *dd) u32 ctxt; struct hfi1_pportdata *ppd = dd->pport; - dd->vld[15].sc = sc_alloc(dd, SC_KERNEL, + dd->vld[15].sc = sc_alloc(dd, SC_VL15, dd->rcd[0]->rcvhdrqentsize, dd->node); if (!dd->vld[15].sc) goto nomem; diff --git a/drivers/staging/rdma/hfi1/pio.h b/drivers/staging/rdma/hfi1/pio.h index 0026976ce4f6..53a08edb7f64 100644 --- a/drivers/staging/rdma/hfi1/pio.h +++ b/drivers/staging/rdma/hfi1/pio.h @@ -51,7 +51,8 @@ #define SC_KERNEL 0 #define SC_ACK 1 #define SC_USER 2 -#define SC_MAX 3 +#define SC_VL15 3 +#define SC_MAX 4 /* invalid send context index */ #define INVALID_SCI 0xff @@ -293,6 +294,7 @@ void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context); void sc_add_credit_return_intr(struct send_context *sc); void sc_del_credit_return_intr(struct send_context *sc); void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold); +u32 sc_percent_to_threshold(struct send_context *sc, u32 percent); u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize); void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint); void sc_wait(struct hfi1_devdata *dd); diff --git a/drivers/staging/rdma/hfi1/platform.c b/drivers/staging/rdma/hfi1/platform.c index 0a1d074583e4..8fe8a205b5bb 100644 --- a/drivers/staging/rdma/hfi1/platform.c +++ b/drivers/staging/rdma/hfi1/platform.c @@ -114,21 +114,11 @@ static int qual_power(struct hfi1_pportdata *ppd) if (ret) return ret; - if (QSFP_HIGH_PWR(cache[QSFP_MOD_PWR_OFFS]) != 4) - cable_power_class = QSFP_HIGH_PWR(cache[QSFP_MOD_PWR_OFFS]); - else - cable_power_class = QSFP_PWR(cache[QSFP_MOD_PWR_OFFS]); + cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]); - if (cable_power_class <= 3 && cable_power_class > (power_class_max - 1)) - ppd->offline_disabled_reason = - HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY); - else if (cable_power_class > 4 && cable_power_class > (power_class_max)) + if (cable_power_class > power_class_max) ppd->offline_disabled_reason = HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY); - /* - * cable_power_class will never have value 4 as this simply - * means the high power settings are unused - */ if (ppd->offline_disabled_reason == HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY)) { @@ -173,12 +163,9 @@ static int set_qsfp_high_power(struct hfi1_pportdata *ppd) u8 *cache = ppd->qsfp_info.cache; int ret; - if (QSFP_HIGH_PWR(cache[QSFP_MOD_PWR_OFFS]) != 4) - cable_power_class = QSFP_HIGH_PWR(cache[QSFP_MOD_PWR_OFFS]); - else - cable_power_class = QSFP_PWR(cache[QSFP_MOD_PWR_OFFS]); + cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]); - if (cable_power_class) { + if (cable_power_class > QSFP_POWER_CLASS_1) { 
power_ctrl_byte = cache[QSFP_PWR_CTRL_BYTE_OFFS]; power_ctrl_byte |= 1; @@ -190,8 +177,7 @@ static int set_qsfp_high_power(struct hfi1_pportdata *ppd) if (ret != 1) return -EIO; - if (cable_power_class > 3) { - /* > power class 4*/ + if (cable_power_class > QSFP_POWER_CLASS_4) { power_ctrl_byte |= (1 << 2); ret = qsfp_write(ppd, ppd->dd->hfi1_id, QSFP_PWR_CTRL_BYTE_OFFS, @@ -212,12 +198,21 @@ static void apply_rx_cdr(struct hfi1_pportdata *ppd, { u32 rx_preset; u8 *cache = ppd->qsfp_info.cache; + int cable_power_class; if (!((cache[QSFP_MOD_PWR_OFFS] & 0x4) && (cache[QSFP_CDR_INFO_OFFS] & 0x40))) return; - /* rx_preset preset to zero to catch error */ + /* RX CDR present, bypass supported */ + cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]); + + if (cable_power_class <= QSFP_POWER_CLASS_3) { + /* Power class <= 3, ignore config & turn RX CDR on */ + *cdr_ctrl_byte |= 0xF; + return; + } + get_platform_config_field( ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE, rx_preset_index, RX_PRESET_TABLE_QSFP_RX_CDR_APPLY, @@ -250,15 +245,25 @@ static void apply_rx_cdr(struct hfi1_pportdata *ppd, static void apply_tx_cdr(struct hfi1_pportdata *ppd, u32 tx_preset_index, - u8 *ctr_ctrl_byte) + u8 *cdr_ctrl_byte) { u32 tx_preset; u8 *cache = ppd->qsfp_info.cache; + int cable_power_class; if (!((cache[QSFP_MOD_PWR_OFFS] & 0x8) && (cache[QSFP_CDR_INFO_OFFS] & 0x80))) return; + /* TX CDR present, bypass supported */ + cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]); + + if (cable_power_class <= QSFP_POWER_CLASS_3) { + /* Power class <= 3, ignore config & turn TX CDR on */ + *cdr_ctrl_byte |= 0xF0; + return; + } + get_platform_config_field( ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE, tx_preset_index, @@ -282,10 +287,10 @@ static void apply_tx_cdr(struct hfi1_pportdata *ppd, (tx_preset << 2) | (tx_preset << 3)); if (tx_preset) - *ctr_ctrl_byte |= (tx_preset << 4); + *cdr_ctrl_byte |= (tx_preset << 4); else /* Preserve current/determined RX CDR status */ - *ctr_ctrl_byte &= ((tx_preset << 4) | 0xF); + *cdr_ctrl_byte &= ((tx_preset << 4) | 0xF); } static void apply_cdr_settings( @@ -598,6 +603,7 @@ static void apply_tunings( "Applying TX settings"); } +/* Must be holding the QSFP i2c resource */ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset, u32 *ptr_rx_preset, u32 *ptr_total_atten) { @@ -605,26 +611,19 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset, u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled; u8 *cache = ppd->qsfp_info.cache; - ret = acquire_chip_resource(ppd->dd, qsfp_resource(ppd->dd), QSFP_WAIT); - if (ret) { - dd_dev_err(ppd->dd, "%s: hfi%d: cannot lock i2c chain\n", - __func__, (int)ppd->dd->hfi1_id); - return ret; - } - ppd->qsfp_info.limiting_active = 1; ret = set_qsfp_tx(ppd, 0); if (ret) - goto bail_unlock; + return ret; ret = qual_power(ppd); if (ret) - goto bail_unlock; + return ret; ret = qual_bitrate(ppd); if (ret) - goto bail_unlock; + return ret; if (ppd->qsfp_info.reset_needed) { reset_qsfp(ppd); @@ -636,7 +635,7 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset, ret = set_qsfp_high_power(ppd); if (ret) - goto bail_unlock; + return ret; if (cache[QSFP_EQ_INFO_OFFS] & 0x4) { ret = get_platform_config_field( @@ -646,7 +645,7 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset, ptr_tx_preset, 4); if (ret) { *ptr_tx_preset = OPA_INVALID_INDEX; - goto bail_unlock; + return ret; } } else { ret = get_platform_config_field( @@ -656,7 
+655,7 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset, ptr_tx_preset, 4); if (ret) { *ptr_tx_preset = OPA_INVALID_INDEX; - goto bail_unlock; + return ret; } } @@ -665,7 +664,7 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset, PORT_TABLE_RX_PRESET_IDX, ptr_rx_preset, 4); if (ret) { *ptr_rx_preset = OPA_INVALID_INDEX; - goto bail_unlock; + return ret; } if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G)) @@ -685,8 +684,6 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset, ret = set_qsfp_tx(ppd, 1); -bail_unlock: - release_chip_resource(ppd->dd, qsfp_resource(ppd->dd)); return ret; } @@ -833,12 +830,22 @@ void tune_serdes(struct hfi1_pportdata *ppd) total_atten = platform_atten + remote_atten; tuning_method = OPA_PASSIVE_TUNING; - } else + } else { ppd->offline_disabled_reason = HFI1_ODR_MASK(OPA_LINKDOWN_REASON_CHASSIS_CONFIG); + goto bail; + } break; case PORT_TYPE_QSFP: if (qsfp_mod_present(ppd)) { + ret = acquire_chip_resource(ppd->dd, + qsfp_resource(ppd->dd), + QSFP_WAIT); + if (ret) { + dd_dev_err(ppd->dd, "%s: hfi%d: cannot lock i2c chain\n", + __func__, (int)ppd->dd->hfi1_id); + goto bail; + } refresh_qsfp_cache(ppd, &ppd->qsfp_info); if (ppd->qsfp_info.cache_valid) { @@ -853,21 +860,23 @@ void tune_serdes(struct hfi1_pportdata *ppd) * update the cache to reflect the changes */ refresh_qsfp_cache(ppd, &ppd->qsfp_info); - if (ret) - goto bail; - limiting_active = ppd->qsfp_info.limiting_active; } else { dd_dev_err(dd, "%s: Reading QSFP memory failed\n", __func__); - goto bail; + ret = -EINVAL; /* a fail indication */ } - } else + release_chip_resource(ppd->dd, qsfp_resource(ppd->dd)); + if (ret) + goto bail; + } else { ppd->offline_disabled_reason = HFI1_ODR_MASK( OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED); + goto bail; + } break; default: dd_dev_info(ppd->dd, "%s: Unknown port type\n", __func__); diff --git a/drivers/staging/rdma/hfi1/qp.c b/drivers/staging/rdma/hfi1/qp.c index dc9119e1b458..91eb42316df9 100644 --- a/drivers/staging/rdma/hfi1/qp.c +++ b/drivers/staging/rdma/hfi1/qp.c @@ -167,8 +167,12 @@ static inline int opa_mtu_enum_to_int(int mtu) */ static inline int verbs_mtu_enum_to_int(struct ib_device *dev, enum ib_mtu mtu) { - int val = opa_mtu_enum_to_int((int)mtu); + int val; + /* Constraining 10KB packets to 8KB packets */ + if (mtu == (enum ib_mtu)OPA_MTU_10240) + mtu = OPA_MTU_8192; + val = opa_mtu_enum_to_int((int)mtu); if (val > 0) return val; return ib_mtu_enum_to_int(mtu); diff --git a/drivers/staging/rdma/hfi1/qsfp.c b/drivers/staging/rdma/hfi1/qsfp.c index 9ed1963010fe..2441669f0817 100644 --- a/drivers/staging/rdma/hfi1/qsfp.c +++ b/drivers/staging/rdma/hfi1/qsfp.c @@ -96,7 +96,7 @@ int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset, { int ret; - if (!check_chip_resource(ppd->dd, qsfp_resource(ppd->dd), __func__)) + if (!check_chip_resource(ppd->dd, i2c_target(target), __func__)) return -EACCES; /* make sure the TWSI bus is in a sane state */ @@ -162,7 +162,7 @@ int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset, { int ret; - if (!check_chip_resource(ppd->dd, qsfp_resource(ppd->dd), __func__)) + if (!check_chip_resource(ppd->dd, i2c_target(target), __func__)) return -EACCES; /* make sure the TWSI bus is in a sane state */ @@ -192,7 +192,7 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, int ret; u8 page; - if (!check_chip_resource(ppd->dd, qsfp_resource(ppd->dd), __func__)) + if 
(!check_chip_resource(ppd->dd, i2c_target(target), __func__)) return -EACCES; /* make sure the TWSI bus is in a sane state */ @@ -276,7 +276,7 @@ int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, int ret; u8 page; - if (!check_chip_resource(ppd->dd, qsfp_resource(ppd->dd), __func__)) + if (!check_chip_resource(ppd->dd, i2c_target(target), __func__)) return -EACCES; /* make sure the TWSI bus is in a sane state */ @@ -355,6 +355,8 @@ int one_qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, * The calls to qsfp_{read,write} in this function correctly handle the * address map difference between this mapping and the mapping implemented * by those functions + * + * The caller must be holding the QSFP i2c chain resource. */ int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp) { @@ -371,13 +373,9 @@ int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp) if (!qsfp_mod_present(ppd)) { ret = -ENODEV; - goto bail_no_release; + goto bail; } - ret = acquire_chip_resource(ppd->dd, qsfp_resource(ppd->dd), QSFP_WAIT); - if (ret) - goto bail_no_release; - ret = qsfp_read(ppd, target, 0, cache, QSFP_PAGESIZE); if (ret != QSFP_PAGESIZE) { dd_dev_info(ppd->dd, @@ -440,8 +438,6 @@ int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp) } } - release_chip_resource(ppd->dd, qsfp_resource(ppd->dd)); - spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); ppd->qsfp_info.cache_valid = 1; ppd->qsfp_info.cache_refresh_required = 0; @@ -450,8 +446,6 @@ int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp) return 0; bail: - release_chip_resource(ppd->dd, qsfp_resource(ppd->dd)); -bail_no_release: memset(cache, 0, (QSFP_MAX_NUM_PAGES * 128)); return ret; } @@ -466,7 +460,28 @@ const char * const hfi1_qsfp_devtech[16] = { #define QSFP_DUMP_CHUNK 16 /* Holds longest string */ #define QSFP_DEFAULT_HDR_CNT 224 -static const char *pwr_codes = "1.5W2.0W2.5W3.5W"; +#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3) +#define QSFP_HIGH_PWR(pbyte) ((pbyte) & 3) +/* For use with QSFP_HIGH_PWR macro */ +#define QSFP_HIGH_PWR_UNUSED 0 /* Bits [1:0] = 00 implies low power module */ + +/* + * Takes power class byte [Page 00 Byte 129] in SFF 8636 + * Returns power class as integer (1 through 7, per SFF 8636 rev 2.4) + */ +int get_qsfp_power_class(u8 power_byte) +{ + if (QSFP_HIGH_PWR(power_byte) == QSFP_HIGH_PWR_UNUSED) + /* power classes count from 1, their bit encodings from 0 */ + return (QSFP_PWR(power_byte) + 1); + /* + * 00 in the high power classes stands for unused, bringing + * balance to the off-by-1 offset above, we add 4 here to + * account for the difference between the low and high power + * groups + */ + return (QSFP_HIGH_PWR(power_byte) + 4); +} int qsfp_mod_present(struct hfi1_pportdata *ppd) { @@ -537,6 +552,16 @@ set_zeroes: return ret; } +static const char *pwr_codes[8] = {"N/AW", + "1.5W", + "2.0W", + "2.5W", + "3.5W", + "4.0W", + "4.5W", + "5.0W" + }; + int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len) { u8 *cache = &ppd->qsfp_info.cache[0]; @@ -546,6 +571,7 @@ int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len) int bidx = 0; u8 *atten = &cache[QSFP_ATTEN_OFFS]; u8 *vendor_oui = &cache[QSFP_VOUI_OFFS]; + u8 power_byte = 0; sofar = 0; lenstr[0] = ' '; @@ -555,9 +581,9 @@ int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len) if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS])) sprintf(lenstr, "%dM ", cache[QSFP_MOD_LEN_OFFS]); + power_byte = cache[QSFP_MOD_PWR_OFFS]; sofar += 
scnprintf(buf + sofar, len - sofar, "PWR:%.3sW\n", - pwr_codes + - (QSFP_PWR(cache[QSFP_MOD_PWR_OFFS]) * 4)); + pwr_codes[get_qsfp_power_class(power_byte)]); sofar += scnprintf(buf + sofar, len - sofar, "TECH:%s%s\n", lenstr, diff --git a/drivers/staging/rdma/hfi1/qsfp.h b/drivers/staging/rdma/hfi1/qsfp.h index 831fe4cf1345..dadc66c442b9 100644 --- a/drivers/staging/rdma/hfi1/qsfp.h +++ b/drivers/staging/rdma/hfi1/qsfp.h @@ -82,8 +82,9 @@ /* Byte 128 is Identifier: must be 0x0c for QSFP, or 0x0d for QSFP+ */ #define QSFP_MOD_ID_OFFS 128 /* - * Byte 129 is "Extended Identifier". We only care about D7,D6: Power class - * 0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W + * Byte 129 is "Extended Identifier". + * For bits [7:6]: 0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W + * For bits [1:0]: 0:Unused, 1:4W, 2:4.5W, 3:5W */ #define QSFP_MOD_PWR_OFFS 129 /* Byte 130 is Connector type. Not Intel req'd */ @@ -190,6 +191,9 @@ extern const char *const hfi1_qsfp_devtech[16]; #define QSFP_HIGH_BIAS_WARNING 0x22 #define QSFP_LOW_BIAS_WARNING 0x11 +#define QSFP_ATTEN_SDR(attenarray) (attenarray[0]) +#define QSFP_ATTEN_DDR(attenarray) (attenarray[1]) + /* * struct qsfp_data encapsulates state of QSFP device for one port. * it will be part of port-specific data if a board supports QSFP. @@ -201,12 +205,6 @@ extern const char *const hfi1_qsfp_devtech[16]; * and let the qsfp_lock arbitrate access to common resources. * */ - -#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3) -#define QSFP_HIGH_PWR(pbyte) (((pbyte) & 3) | 4) -#define QSFP_ATTEN_SDR(attenarray) (attenarray[0]) -#define QSFP_ATTEN_DDR(attenarray) (attenarray[1]) - struct qsfp_data { /* Helps to find our way */ struct hfi1_pportdata *ppd; @@ -223,6 +221,7 @@ struct qsfp_data { int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp); +int get_qsfp_power_class(u8 power_byte); int qsfp_mod_present(struct hfi1_pportdata *ppd); int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr, u32 len, u8 *data); diff --git a/drivers/staging/rdma/hfi1/rc.c b/drivers/staging/rdma/hfi1/rc.c index 0d7e1017f3cb..792f15eb8efe 100644 --- a/drivers/staging/rdma/hfi1/rc.c +++ b/drivers/staging/rdma/hfi1/rc.c @@ -1497,7 +1497,7 @@ reserved: /* Ignore reserved NAK codes. 
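A worked decode of the get_qsfp_power_class() helper added in the qsfp.c hunk above: per SFF-8636, bits [7:6] of byte 129 encode power classes 1 through 4, and bits [1:0], when non-zero, encode classes 5 through 7. The expected values below follow directly from the function body in this patch:

#include <assert.h>
#include <stdint.h>

#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3)
#define QSFP_HIGH_PWR(pbyte) ((pbyte) & 3)

static int get_qsfp_power_class(uint8_t power_byte)
{
	if (QSFP_HIGH_PWR(power_byte) == 0)       /* high power group unused */
		return QSFP_PWR(power_byte) + 1;  /* classes 1..4 */
	return QSFP_HIGH_PWR(power_byte) + 4;     /* classes 5..7 */
}

int main(void)
{
	assert(get_qsfp_power_class(0x00) == 1);  /* 1.5 W */
	assert(get_qsfp_power_class(0xc0) == 4);  /* 3.5 W */
	assert(get_qsfp_power_class(0x01) == 5);  /* 4.0 W */
	assert(get_qsfp_power_class(0x03) == 7);  /* 5.0 W */
	return 0;
}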
*/ goto bail_stop; } - return ret; + /* cannot be reached */ bail_stop: hfi1_stop_rc_timers(qp); return ret; @@ -2021,8 +2021,6 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn, if (sl >= OPA_MAX_SLS) return; - cca_timer = &ppd->cca_timer[sl]; - cc_state = get_cc_state(ppd); if (!cc_state) @@ -2041,6 +2039,7 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn, spin_lock_irqsave(&ppd->cca_timer_lock, flags); + cca_timer = &ppd->cca_timer[sl]; if (cca_timer->ccti < ccti_limit) { if (cca_timer->ccti + ccti_incr <= ccti_limit) cca_timer->ccti += ccti_incr; @@ -2049,8 +2048,6 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn, set_link_ipg(ppd); } - spin_unlock_irqrestore(&ppd->cca_timer_lock, flags); - ccti = cca_timer->ccti; if (!hrtimer_active(&cca_timer->hrtimer)) { @@ -2061,6 +2058,8 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn, HRTIMER_MODE_REL); } + spin_unlock_irqrestore(&ppd->cca_timer_lock, flags); + if ((trigger_threshold != 0) && (ccti >= trigger_threshold)) log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type); } diff --git a/drivers/staging/rdma/hfi1/ruc.c b/drivers/staging/rdma/hfi1/ruc.c index 08813cdbd475..a659aec3c3c6 100644 --- a/drivers/staging/rdma/hfi1/ruc.c +++ b/drivers/staging/rdma/hfi1/ruc.c @@ -831,7 +831,6 @@ void hfi1_do_send(struct rvt_qp *qp) struct hfi1_pkt_state ps; struct hfi1_qp_priv *priv = qp->priv; int (*make_req)(struct rvt_qp *qp, struct hfi1_pkt_state *ps); - unsigned long flags; unsigned long timeout; unsigned long timeout_int; int cpu; @@ -866,11 +865,11 @@ void hfi1_do_send(struct rvt_qp *qp) timeout_int = SEND_RESCHED_TIMEOUT; } - spin_lock_irqsave(&qp->s_lock, flags); + spin_lock_irqsave(&qp->s_lock, ps.flags); /* Return if we are already busy processing a work request. */ if (!hfi1_send_ok(qp)) { - spin_unlock_irqrestore(&qp->s_lock, flags); + spin_unlock_irqrestore(&qp->s_lock, ps.flags); return; } @@ -884,7 +883,7 @@ void hfi1_do_send(struct rvt_qp *qp) do { /* Check for a constructed packet to be sent. */ if (qp->s_hdrwords != 0) { - spin_unlock_irqrestore(&qp->s_lock, flags); + spin_unlock_irqrestore(&qp->s_lock, ps.flags); /* * If the packet cannot be sent now, return and * the send tasklet will be woken up later. 
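The ruc.c conversion above (continued in the next hunk) moves the saved IRQ flags into struct hfi1_pkt_state so a helper entered with qp->s_lock held can drop and retake the lock without a second, shadowing local; the old ud.c code even did a spurious local_irq_save() into a fresh variable before unlocking. Schematically, the pattern is as follows (hypothetical names, kernel-idiom C, not the driver's exact code):

/* caller has already done spin_lock_irqsave(lock, ps->flags) */
static void locked_helper(spinlock_t *lock, struct hfi1_pkt_state *ps)
{
	unsigned long tflags = ps->flags;

	spin_unlock_irqrestore(lock, tflags);  /* run unlocked work here */
	spin_lock_irqsave(lock, tflags);
	ps->flags = tflags;                    /* hand updated flags back */
}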
@@ -897,11 +896,14 @@ void hfi1_do_send(struct rvt_qp *qp) if (unlikely(time_after(jiffies, timeout))) { if (workqueue_congested(cpu, ps.ppd->hfi1_wq)) { - spin_lock_irqsave(&qp->s_lock, flags); + spin_lock_irqsave( + &qp->s_lock, + ps.flags); qp->s_flags &= ~RVT_S_BUSY; hfi1_schedule_send(qp); - spin_unlock_irqrestore(&qp->s_lock, - flags); + spin_unlock_irqrestore( + &qp->s_lock, + ps.flags); this_cpu_inc( *ps.ppd->dd->send_schedule); return; @@ -913,11 +915,11 @@ void hfi1_do_send(struct rvt_qp *qp) } timeout = jiffies + (timeout_int) / 8; } - spin_lock_irqsave(&qp->s_lock, flags); + spin_lock_irqsave(&qp->s_lock, ps.flags); } } while (make_req(qp, &ps)); - spin_unlock_irqrestore(&qp->s_lock, flags); + spin_unlock_irqrestore(&qp->s_lock, ps.flags); } /* diff --git a/drivers/staging/rdma/hfi1/sysfs.c b/drivers/staging/rdma/hfi1/sysfs.c index c7f1271190af..8cd6df8634ad 100644 --- a/drivers/staging/rdma/hfi1/sysfs.c +++ b/drivers/staging/rdma/hfi1/sysfs.c @@ -84,7 +84,7 @@ static ssize_t read_cc_table_bin(struct file *filp, struct kobject *kobj, rcu_read_unlock(); return -EINVAL; } - memcpy(buf, &cc_state->cct, count); + memcpy(buf, (void *)&cc_state->cct + pos, count); rcu_read_unlock(); return count; @@ -131,7 +131,7 @@ static ssize_t read_cc_setting_bin(struct file *filp, struct kobject *kobj, rcu_read_unlock(); return -EINVAL; } - memcpy(buf, &cc_state->cong_setting, count); + memcpy(buf, (void *)&cc_state->cong_setting + pos, count); rcu_read_unlock(); return count; diff --git a/drivers/staging/rdma/hfi1/ud.c b/drivers/staging/rdma/hfi1/ud.c index ae8a70f703eb..1e503ad0bebb 100644 --- a/drivers/staging/rdma/hfi1/ud.c +++ b/drivers/staging/rdma/hfi1/ud.c @@ -322,7 +322,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) (lid == ppd->lid || (lid == be16_to_cpu(IB_LID_PERMISSIVE) && qp->ibqp.qp_type == IB_QPT_GSI)))) { - unsigned long flags; + unsigned long tflags = ps->flags; /* * If DMAs are in progress, we can't generate * a completion for the loopback packet since @@ -335,10 +335,10 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) goto bail; } qp->s_cur = next_cur; - local_irq_save(flags); - spin_unlock_irqrestore(&qp->s_lock, flags); + spin_unlock_irqrestore(&qp->s_lock, tflags); ud_loopback(qp, wqe); - spin_lock_irqsave(&qp->s_lock, flags); + spin_lock_irqsave(&qp->s_lock, tflags); + ps->flags = tflags; hfi1_send_complete(qp, wqe, IB_WC_SUCCESS); goto done_free_tx; } diff --git a/drivers/staging/rdma/hfi1/user_exp_rcv.c b/drivers/staging/rdma/hfi1/user_exp_rcv.c index 8bd56d5c783d..1b640a35b3fe 100644 --- a/drivers/staging/rdma/hfi1/user_exp_rcv.c +++ b/drivers/staging/rdma/hfi1/user_exp_rcv.c @@ -399,8 +399,11 @@ int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo) * pages, accept the amount pinned so far and program only that. * User space knows how to deal with partially programmed buffers. 
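The sysfs.c hunks above fix a real bug: both binary congestion-control attributes copied from the start of the structure regardless of the requested file offset, so any read not beginning at pos 0 returned the wrong bytes. A minimal userspace model of the corrected pattern; bin_attr_read is a hypothetical helper, not the driver's function:

#include <string.h>
#include <sys/types.h>

static ssize_t bin_attr_read(const void *src, size_t src_len,
			     char *buf, size_t pos, size_t count)
{
	if (pos >= src_len)
		return 0;
	if (count > src_len - pos)
		count = src_len - pos;
	memcpy(buf, (const char *)src + pos, count);  /* honor the offset */
	return (ssize_t)count;
}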
*/
-	if (!hfi1_can_pin_pages(dd, fd->tid_n_pinned, npages))
-		return -ENOMEM;
+	if (!hfi1_can_pin_pages(dd, fd->tid_n_pinned, npages)) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
 	pinned = hfi1_acquire_user_pages(vaddr, npages, true, pages);
 	if (pinned <= 0) {
 		ret = pinned;
diff --git a/drivers/staging/rdma/hfi1/user_sdma.c b/drivers/staging/rdma/hfi1/user_sdma.c
index d53a659548e0..0014c9c0e967 100644
--- a/drivers/staging/rdma/hfi1/user_sdma.c
+++ b/drivers/staging/rdma/hfi1/user_sdma.c
@@ -180,6 +180,8 @@ struct user_sdma_iovec {
 	u64 offset;
 };
 
+#define SDMA_CACHE_NODE_EVICT BIT(0)
+
 struct sdma_mmu_node {
 	struct mmu_rb_node rb;
 	struct list_head list;
@@ -187,6 +189,7 @@ struct sdma_mmu_node {
 	atomic_t refcount;
 	struct page **pages;
 	unsigned npages;
+	unsigned long flags;
 };
 
 struct user_sdma_request {
@@ -597,6 +600,13 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
 		goto free_req;
 	}
 
+	/* Checking P_KEY for requests from user-space */
+	if (egress_pkey_check(dd->pport, req->hdr.lrh, req->hdr.bth, sc,
+			      PKEY_CHECK_INVALID)) {
+		ret = -EINVAL;
+		goto free_req;
+	}
+
 	/*
 	 * Also should check the BTH.lnh. If it says the next header is GRH then
 	 * the RXE parsing will be off and will land in the middle of the KDETH
@@ -1030,27 +1040,29 @@ static inline int num_user_pages(const struct iovec *iov)
 	return 1 + ((epage - spage) >> PAGE_SHIFT);
 }
 
-/* Caller must hold pq->evict_lock */
 static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
 {
 	u32 cleared = 0;
 	struct sdma_mmu_node *node, *ptr;
+	struct list_head to_evict = LIST_HEAD_INIT(to_evict);
 
+	spin_lock(&pq->evict_lock);
 	list_for_each_entry_safe_reverse(node, ptr, &pq->evict, list) {
 		/* Make sure that no one is still using the node. */
 		if (!atomic_read(&node->refcount)) {
-			/*
-			 * Need to use the page count now as the remove callback
-			 * will free the node.
-			 */
+			set_bit(SDMA_CACHE_NODE_EVICT, &node->flags);
+			list_del_init(&node->list);
+			list_add(&node->list, &to_evict);
 			cleared += node->npages;
-			spin_unlock(&pq->evict_lock);
-			hfi1_mmu_rb_remove(&pq->sdma_rb_root, &node->rb);
-			spin_lock(&pq->evict_lock);
 			if (cleared >= npages)
 				break;
 		}
 	}
+	spin_unlock(&pq->evict_lock);
+
+	list_for_each_entry_safe(node, ptr, &to_evict, list)
+		hfi1_mmu_rb_remove(&pq->sdma_rb_root, &node->rb);
+
 	return cleared;
 }
 
@@ -1062,9 +1074,9 @@ static int pin_vector_pages(struct user_sdma_request *req,
 	struct sdma_mmu_node *node = NULL;
 	struct mmu_rb_node *rb_node;
 
-	rb_node = hfi1_mmu_rb_search(&pq->sdma_rb_root,
-				     (unsigned long)iovec->iov.iov_base,
-				     iovec->iov.iov_len);
+	rb_node = hfi1_mmu_rb_extract(&pq->sdma_rb_root,
+				      (unsigned long)iovec->iov.iov_base,
+				      iovec->iov.iov_len);
 	if (rb_node && !IS_ERR(rb_node))
 		node = container_of(rb_node, struct sdma_mmu_node, rb);
 	else
@@ -1076,7 +1088,6 @@ static int pin_vector_pages(struct user_sdma_request *req,
 			return -ENOMEM;
 
 		node->rb.addr = (unsigned long)iovec->iov.iov_base;
-		node->rb.len = iovec->iov.iov_len;
 		node->pq = pq;
 		atomic_set(&node->refcount, 0);
 		INIT_LIST_HEAD(&node->list);
@@ -1093,11 +1104,25 @@ static int pin_vector_pages(struct user_sdma_request *req,
 		memcpy(pages, node->pages, node->npages * sizeof(*pages));
 		npages -= node->npages;
+
+		/*
+		 * If rb_node is NULL, it means that this is a brand new
+		 * node and, therefore, not on the eviction list.
+		 * If, however, rb_node is non-NULL, it means that the
+		 * node is already in the RB tree and, therefore, on the
+		 * eviction list (nodes are unconditionally inserted in
+		 * the eviction list). In that case, we have to remove
+		 * the node prior to calling the eviction function in
+		 * order to prevent it from freeing this node.
+		 */
+		if (rb_node) {
+			spin_lock(&pq->evict_lock);
+			list_del_init(&node->list);
+			spin_unlock(&pq->evict_lock);
+		}
 retry:
 	if (!hfi1_can_pin_pages(pq->dd, pq->n_locked, npages)) {
-		spin_lock(&pq->evict_lock);
 		cleared = sdma_cache_evict(pq, npages);
-		spin_unlock(&pq->evict_lock);
 		if (cleared >= npages)
 			goto retry;
 	}
@@ -1117,37 +1142,32 @@ retry:
 			goto bail;
 		}
 		kfree(node->pages);
+		node->rb.len = iovec->iov.iov_len;
 		node->pages = pages;
 		node->npages += pinned;
 		npages = node->npages;
 		spin_lock(&pq->evict_lock);
-		if (!rb_node)
-			list_add(&node->list, &pq->evict);
-		else
-			list_move(&node->list, &pq->evict);
+		list_add(&node->list, &pq->evict);
 		pq->n_locked += pinned;
 		spin_unlock(&pq->evict_lock);
 	}
 	iovec->pages = node->pages;
 	iovec->npages = npages;
-	if (!rb_node) {
-		ret = hfi1_mmu_rb_insert(&req->pq->sdma_rb_root, &node->rb);
-		if (ret) {
-			spin_lock(&pq->evict_lock);
+	ret = hfi1_mmu_rb_insert(&req->pq->sdma_rb_root, &node->rb);
+	if (ret) {
+		spin_lock(&pq->evict_lock);
+		if (!list_empty(&node->list))
 			list_del(&node->list);
-			pq->n_locked -= node->npages;
-			spin_unlock(&pq->evict_lock);
-			ret = 0;
-			goto bail;
-		}
-	} else {
-		atomic_inc(&node->refcount);
+		pq->n_locked -= node->npages;
+		spin_unlock(&pq->evict_lock);
+		goto bail;
 	}
 	return 0;
 bail:
-	if (!rb_node)
-		kfree(node);
+	if (rb_node)
+		unpin_vector_pages(current->mm, node->pages, 0, node->npages);
+	kfree(node);
 	return ret;
 }
 
@@ -1558,7 +1578,20 @@ static void sdma_rb_remove(struct rb_root *root, struct mmu_rb_node *mnode,
 			container_of(mnode, struct sdma_mmu_node, rb);
 
 	spin_lock(&node->pq->evict_lock);
-	list_del(&node->list);
+	/*
+	 * We've been called by the MMU notifier but this node has been
+	 * scheduled for eviction. The eviction function will take care
+	 * of freeing this node.
+	 * We have to take the above lock first because we are racing
+	 * against the setting of the bit in the eviction function.
+	 */
+	if (mm && test_bit(SDMA_CACHE_NODE_EVICT, &node->flags)) {
+		spin_unlock(&node->pq->evict_lock);
+		return;
+	}
+
+	if (!list_empty(&node->list))
+		list_del(&node->list);
 	node->pq->n_locked -= node->npages;
 	spin_unlock(&node->pq->evict_lock);
 
diff --git a/drivers/staging/rdma/hfi1/verbs.c b/drivers/staging/rdma/hfi1/verbs.c
index 89f2aad45c1b..9cdc85fa366f 100644
--- a/drivers/staging/rdma/hfi1/verbs.c
+++ b/drivers/staging/rdma/hfi1/verbs.c
@@ -545,7 +545,7 @@ static inline int qp_ok(int opcode, struct hfi1_packet *packet)
 
 	if (!(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK))
 		goto dropit;
-	if (((opcode & OPCODE_QP_MASK) == packet->qp->allowed_ops) ||
+	if (((opcode & RVT_OPCODE_QP_MASK) == packet->qp->allowed_ops) ||
 	    (opcode == IB_OPCODE_CNP))
 		return 1;
 dropit:
@@ -1089,16 +1089,16 @@ bail:
 
 /*
  * egress_pkey_matches_entry - return 1 if the pkey matches ent (ent
- * being an entry from the ingress partition key table), return 0
+ * being an entry from the partition key table), return 0
 * otherwise. Use the matching criteria for egress partition keys
 * specified in the OPAv1 spec., section 9.11.7.
*/ static inline int egress_pkey_matches_entry(u16 pkey, u16 ent) { u16 mkey = pkey & PKEY_LOW_15_MASK; - u16 ment = ent & PKEY_LOW_15_MASK; + u16 mentry = ent & PKEY_LOW_15_MASK; - if (mkey == ment) { + if (mkey == mentry) { /* * If pkey[15] is set (full partition member), * is bit 15 in the corresponding table element @@ -1111,32 +1111,32 @@ static inline int egress_pkey_matches_entry(u16 pkey, u16 ent) return 0; } -/* - * egress_pkey_check - return 0 if hdr's pkey matches according to the - * criteria in the OPAv1 spec., section 9.11.7. +/** + * egress_pkey_check - check P_KEY of a packet + * @ppd: Physical IB port data + * @lrh: Local route header + * @bth: Base transport header + * @sc5: SC for packet + * @s_pkey_index: It will be used for look up optimization for kernel contexts + * only. If it is negative value, then it means user contexts is calling this + * function. + * + * It checks if hdr's pkey is valid. + * + * Return: 0 on success, otherwise, 1 */ -static inline int egress_pkey_check(struct hfi1_pportdata *ppd, - struct hfi1_ib_header *hdr, - struct rvt_qp *qp) +int egress_pkey_check(struct hfi1_pportdata *ppd, __be16 *lrh, __be32 *bth, + u8 sc5, int8_t s_pkey_index) { - struct hfi1_qp_priv *priv = qp->priv; - struct hfi1_other_headers *ohdr; struct hfi1_devdata *dd; - int i = 0; + int i; u16 pkey; - u8 lnh, sc5 = priv->s_sc; + int is_user_ctxt_mechanism = (s_pkey_index < 0); if (!(ppd->part_enforce & HFI1_PART_ENFORCE_OUT)) return 0; - /* locate the pkey within the headers */ - lnh = be16_to_cpu(hdr->lrh[0]) & 3; - if (lnh == HFI1_LRH_GRH) - ohdr = &hdr->u.l.oth; - else - ohdr = &hdr->u.oth; - - pkey = (u16)be32_to_cpu(ohdr->bth[0]); + pkey = (u16)be32_to_cpu(bth[0]); /* If SC15, pkey[0:14] must be 0x7fff */ if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK)) @@ -1146,28 +1146,37 @@ static inline int egress_pkey_check(struct hfi1_pportdata *ppd, if ((pkey & PKEY_LOW_15_MASK) == 0) goto bad; - /* The most likely matching pkey has index qp->s_pkey_index */ - if (unlikely(!egress_pkey_matches_entry(pkey, - ppd->pkeys - [qp->s_pkey_index]))) { - /* no match - try the entire table */ - for (; i < MAX_PKEY_VALUES; i++) { - if (egress_pkey_matches_entry(pkey, ppd->pkeys[i])) - break; - } + /* + * For the kernel contexts only, if a qp is passed into the function, + * the most likely matching pkey has index qp->s_pkey_index + */ + if (!is_user_ctxt_mechanism && + egress_pkey_matches_entry(pkey, ppd->pkeys[s_pkey_index])) { + return 0; } - if (i < MAX_PKEY_VALUES) - return 0; + for (i = 0; i < MAX_PKEY_VALUES; i++) { + if (egress_pkey_matches_entry(pkey, ppd->pkeys[i])) + return 0; + } bad: - incr_cntr64(&ppd->port_xmit_constraint_errors); - dd = ppd->dd; - if (!(dd->err_info_xmit_constraint.status & OPA_EI_STATUS_SMASK)) { - u16 slid = be16_to_cpu(hdr->lrh[3]); - - dd->err_info_xmit_constraint.status |= OPA_EI_STATUS_SMASK; - dd->err_info_xmit_constraint.slid = slid; - dd->err_info_xmit_constraint.pkey = pkey; + /* + * For the user-context mechanism, the P_KEY check would only happen + * once per SDMA request, not once per packet. Therefore, there's no + * need to increment the counter for the user-context mechanism. 
+ */ + if (!is_user_ctxt_mechanism) { + incr_cntr64(&ppd->port_xmit_constraint_errors); + dd = ppd->dd; + if (!(dd->err_info_xmit_constraint.status & + OPA_EI_STATUS_SMASK)) { + u16 slid = be16_to_cpu(lrh[3]); + + dd->err_info_xmit_constraint.status |= + OPA_EI_STATUS_SMASK; + dd->err_info_xmit_constraint.slid = slid; + dd->err_info_xmit_constraint.pkey = pkey; + } } return 1; } @@ -1227,11 +1236,26 @@ int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps) { struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_other_headers *ohdr; + struct hfi1_ib_header *hdr; send_routine sr; int ret; + u8 lnh; + + hdr = &ps->s_txreq->phdr.hdr; + /* locate the pkey within the headers */ + lnh = be16_to_cpu(hdr->lrh[0]) & 3; + if (lnh == HFI1_LRH_GRH) + ohdr = &hdr->u.l.oth; + else + ohdr = &hdr->u.oth; sr = get_send_routine(qp, ps->s_txreq); - ret = egress_pkey_check(dd->pport, &ps->s_txreq->phdr.hdr, qp); + ret = egress_pkey_check(dd->pport, + hdr->lrh, + ohdr->bth, + priv->s_sc, + qp->s_pkey_index); if (unlikely(ret)) { /* * The value we are returning here does not get propagated to diff --git a/drivers/staging/rdma/hfi1/verbs.h b/drivers/staging/rdma/hfi1/verbs.h index 6c4670fffdbb..3ee223983b20 100644 --- a/drivers/staging/rdma/hfi1/verbs.h +++ b/drivers/staging/rdma/hfi1/verbs.h @@ -215,6 +215,7 @@ struct hfi1_pkt_state { struct hfi1_ibport *ibp; struct hfi1_pportdata *ppd; struct verbs_txreq *s_txreq; + unsigned long flags; }; #define HFI1_PSN_CREDIT 16 @@ -334,9 +335,6 @@ int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port, #endif #define PSN_MODIFY_MASK 0xFFFFFF -/* Number of bits to pay attention to in the opcode for checking qp type */ -#define OPCODE_QP_MASK 0xE0 - /* * Compare the lower 24 bits of the msn values. * Returns an integer <, ==, or > than zero. 
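For reference, the egress P_KEY matching rule that egress_pkey_check() walks the table with (OPAv1 spec, section 9.11.7) can be restated as a standalone sketch. This is illustrative only, not part of the patch; it assumes PKEY_LOW_15_MASK is 0x7fff and the membership bit is bit 15 (mask 0x8000), as in the hfi1 driver:

static inline int pkey_matches_entry(u16 pkey, u16 ent)
{
	/* bits 0-14 carry the base P_KEY value; they must be equal */
	if ((pkey & 0x7fff) != (ent & 0x7fff))
		return 0;
	/* a full member (bit 15 set) matches only a full-member entry */
	if (pkey & 0x8000)
		return !!(ent & 0x8000);
	/* a limited member matches an entry of either membership type */
	return 1;
}

Note that SC15 is special-cased before the table walk: a packet on the management service class must carry the full default P_KEY, i.e. pkey[0:14] == 0x7fff.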
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index ab2bf12975e1..590384a2bf8b 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -2195,7 +2195,7 @@ queue_full: transport_handle_queue_full(cmd, cmd->se_dev); } -static inline void transport_free_sgl(struct scatterlist *sgl, int nents) +void target_free_sgl(struct scatterlist *sgl, int nents) { struct scatterlist *sg; int count; @@ -2205,6 +2205,7 @@ static inline void transport_free_sgl(struct scatterlist *sgl, int nents) kfree(sgl); } +EXPORT_SYMBOL(target_free_sgl); static inline void transport_reset_sgl_orig(struct se_cmd *cmd) { @@ -2225,7 +2226,7 @@ static inline void transport_reset_sgl_orig(struct se_cmd *cmd) static inline void transport_free_pages(struct se_cmd *cmd) { if (!(cmd->se_cmd_flags & SCF_PASSTHROUGH_PROT_SG_TO_MEM_NOALLOC)) { - transport_free_sgl(cmd->t_prot_sg, cmd->t_prot_nents); + target_free_sgl(cmd->t_prot_sg, cmd->t_prot_nents); cmd->t_prot_sg = NULL; cmd->t_prot_nents = 0; } @@ -2236,7 +2237,7 @@ static inline void transport_free_pages(struct se_cmd *cmd) * SG_TO_MEM_NOALLOC to function with COMPARE_AND_WRITE */ if (cmd->se_cmd_flags & SCF_COMPARE_AND_WRITE) { - transport_free_sgl(cmd->t_bidi_data_sg, + target_free_sgl(cmd->t_bidi_data_sg, cmd->t_bidi_data_nents); cmd->t_bidi_data_sg = NULL; cmd->t_bidi_data_nents = 0; @@ -2246,11 +2247,11 @@ static inline void transport_free_pages(struct se_cmd *cmd) } transport_reset_sgl_orig(cmd); - transport_free_sgl(cmd->t_data_sg, cmd->t_data_nents); + target_free_sgl(cmd->t_data_sg, cmd->t_data_nents); cmd->t_data_sg = NULL; cmd->t_data_nents = 0; - transport_free_sgl(cmd->t_bidi_data_sg, cmd->t_bidi_data_nents); + target_free_sgl(cmd->t_bidi_data_sg, cmd->t_bidi_data_nents); cmd->t_bidi_data_sg = NULL; cmd->t_bidi_data_nents = 0; } @@ -2324,20 +2325,22 @@ EXPORT_SYMBOL(transport_kunmap_data_sg); int target_alloc_sgl(struct scatterlist **sgl, unsigned int *nents, u32 length, - bool zero_page) + bool zero_page, bool chainable) { struct scatterlist *sg; struct page *page; gfp_t zero_flag = (zero_page) ? __GFP_ZERO : 0; - unsigned int nent; + unsigned int nalloc, nent; int i = 0; - nent = DIV_ROUND_UP(length, PAGE_SIZE); - sg = kmalloc(sizeof(struct scatterlist) * nent, GFP_KERNEL); + nalloc = nent = DIV_ROUND_UP(length, PAGE_SIZE); + if (chainable) + nalloc++; + sg = kmalloc_array(nalloc, sizeof(struct scatterlist), GFP_KERNEL); if (!sg) return -ENOMEM; - sg_init_table(sg, nent); + sg_init_table(sg, nalloc); while (length) { u32 page_len = min_t(u32, length, PAGE_SIZE); @@ -2361,6 +2364,7 @@ out: kfree(sg); return -ENOMEM; } +EXPORT_SYMBOL(target_alloc_sgl); /* * Allocate any required resources to execute the command. 
For writes we @@ -2376,7 +2380,7 @@ transport_generic_new_cmd(struct se_cmd *cmd) if (cmd->prot_op != TARGET_PROT_NORMAL && !(cmd->se_cmd_flags & SCF_PASSTHROUGH_PROT_SG_TO_MEM_NOALLOC)) { ret = target_alloc_sgl(&cmd->t_prot_sg, &cmd->t_prot_nents, - cmd->prot_length, true); + cmd->prot_length, true, false); if (ret < 0) return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; } @@ -2401,13 +2405,13 @@ transport_generic_new_cmd(struct se_cmd *cmd) ret = target_alloc_sgl(&cmd->t_bidi_data_sg, &cmd->t_bidi_data_nents, - bidi_length, zero_flag); + bidi_length, zero_flag, false); if (ret < 0) return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; } ret = target_alloc_sgl(&cmd->t_data_sg, &cmd->t_data_nents, - cmd->data_length, zero_flag); + cmd->data_length, zero_flag, false); if (ret < 0) return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; } else if ((cmd->se_cmd_flags & SCF_COMPARE_AND_WRITE) && @@ -2421,7 +2425,7 @@ transport_generic_new_cmd(struct se_cmd *cmd) ret = target_alloc_sgl(&cmd->t_bidi_data_sg, &cmd->t_bidi_data_nents, - caw_length, zero_flag); + caw_length, zero_flag, false); if (ret < 0) return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; } diff --git a/drivers/target/target_core_xcopy.c b/drivers/target/target_core_xcopy.c index 47fe94ee10b8..75cd85426ae3 100644 --- a/drivers/target/target_core_xcopy.c +++ b/drivers/target/target_core_xcopy.c @@ -563,7 +563,7 @@ static int target_xcopy_setup_pt_cmd( if (alloc_mem) { rc = target_alloc_sgl(&cmd->t_data_sg, &cmd->t_data_nents, - cmd->data_length, false); + cmd->data_length, false, false); if (rc < 0) { ret = rc; goto out; diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h index b2c9fada8eac..2be976dd4966 100644 --- a/include/linux/mlx5/cq.h +++ b/include/linux/mlx5/cq.h @@ -53,6 +53,11 @@ struct mlx5_core_cq { unsigned arm_sn; struct mlx5_rsc_debug *dbg; int pid; + struct { + struct list_head list; + void (*comp)(struct mlx5_core_cq *); + void *priv; + } tasklet_ctx; }; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 07b504f7eb84..80776d0c52dc 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -42,6 +42,7 @@ #include <linux/vmalloc.h> #include <linux/radix-tree.h> #include <linux/workqueue.h> +#include <linux/interrupt.h> #include <linux/mlx5/device.h> #include <linux/mlx5/doorbell.h> @@ -312,6 +313,14 @@ struct mlx5_buf { u8 page_shift; }; +struct mlx5_eq_tasklet { + struct list_head list; + struct list_head process_list; + struct tasklet_struct task; + /* lock on completion tasklet list */ + spinlock_t lock; +}; + struct mlx5_eq { struct mlx5_core_dev *dev; __be32 __iomem *doorbell; @@ -325,6 +334,7 @@ struct mlx5_eq { struct list_head list; int index; struct mlx5_rsc_debug *dbg; + struct mlx5_eq_tasklet tasklet_ctx; }; struct mlx5_core_psv { diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index fb2cef4e9747..fc0320c004a3 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -220,6 +220,7 @@ enum ib_device_cap_flags { IB_DEVICE_ON_DEMAND_PAGING = (1 << 31), IB_DEVICE_SG_GAPS_REG = (1ULL << 32), IB_DEVICE_VIRTUAL_FUNCTION = ((u64)1 << 33), + IB_DEVICE_RAW_SCATTER_FCS = ((u64)1 << 34), }; enum ib_signature_prot_cap { @@ -931,6 +932,13 @@ struct ib_qp_cap { u32 max_send_sge; u32 max_recv_sge; u32 max_inline_data; + + /* + * Maximum number of rdma_rw_ctx structures in flight at a time. + * ib_create_qp() will calculate the right amount of needed WRs + * and MRs based on this. 
+ */ + u32 max_rdma_ctxs; }; enum ib_sig_type { @@ -981,6 +989,7 @@ enum ib_qp_create_flags { IB_QP_CREATE_NETIF_QP = 1 << 5, IB_QP_CREATE_SIGNATURE_EN = 1 << 6, IB_QP_CREATE_USE_GFP_NOIO = 1 << 7, + IB_QP_CREATE_SCATTER_FCS = 1 << 8, /* reserve bits 26-31 for low level drivers' internal use */ IB_QP_CREATE_RESERVED_START = 1 << 26, IB_QP_CREATE_RESERVED_END = 1 << 31, @@ -1002,7 +1011,11 @@ struct ib_qp_init_attr { enum ib_sig_type sq_sig_type; enum ib_qp_type qp_type; enum ib_qp_create_flags create_flags; - u8 port_num; /* special QP types only */ + + /* + * Only needed for special QP types, or when using the RW API. + */ + u8 port_num; }; struct ib_qp_open_attr { @@ -1421,9 +1434,14 @@ struct ib_qp { struct ib_pd *pd; struct ib_cq *send_cq; struct ib_cq *recv_cq; + spinlock_t mr_lock; + int mrs_used; + struct list_head rdma_mrs; + struct list_head sig_mrs; struct ib_srq *srq; struct ib_xrcd *xrcd; /* XRC TGT QPs only */ struct list_head xrcd_list; + /* count times opened, mcast attaches, flow attaches */ atomic_t usecnt; struct list_head open_list; @@ -1438,12 +1456,16 @@ struct ib_qp { struct ib_mr { struct ib_device *device; struct ib_pd *pd; - struct ib_uobject *uobject; u32 lkey; u32 rkey; u64 iova; u32 length; unsigned int page_size; + bool need_inval; + union { + struct ib_uobject *uobject; /* user */ + struct list_head qp_entry; /* FR */ + }; }; struct ib_mw { @@ -1827,7 +1849,8 @@ struct ib_device { u32 max_num_sg); int (*map_mr_sg)(struct ib_mr *mr, struct scatterlist *sg, - int sg_nents); + int sg_nents, + unsigned int *sg_offset); struct ib_mw * (*alloc_mw)(struct ib_pd *pd, enum ib_mw_type type, struct ib_udata *udata); @@ -2317,6 +2340,18 @@ static inline bool rdma_cap_roce_gid_table(const struct ib_device *device, device->add_gid && device->del_gid; } +/* + * Check if the device supports READ W/ INVALIDATE. + */ +static inline bool rdma_cap_read_inv(struct ib_device *dev, u32 port_num) +{ + /* + * iWarp drivers must support READ W/ INVALIDATE. No other protocol + * has support for it yet. 
+ */ + return rdma_protocol_iwarp(dev, port_num); +} + int ib_query_gid(struct ib_device *device, u8 port_num, int index, union ib_gid *gid, struct ib_gid_attr *attr); @@ -3111,29 +3146,23 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr); -int ib_map_mr_sg(struct ib_mr *mr, - struct scatterlist *sg, - int sg_nents, - unsigned int page_size); +int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset, unsigned int page_size); static inline int -ib_map_mr_sg_zbva(struct ib_mr *mr, - struct scatterlist *sg, - int sg_nents, - unsigned int page_size) +ib_map_mr_sg_zbva(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset, unsigned int page_size) { int n; - n = ib_map_mr_sg(mr, sg, sg_nents, page_size); + n = ib_map_mr_sg(mr, sg, sg_nents, sg_offset, page_size); mr->iova = 0; return n; } -int ib_sg_to_pages(struct ib_mr *mr, - struct scatterlist *sgl, - int sg_nents, - int (*set_page)(struct ib_mr *, u64)); +int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents, + unsigned int *sg_offset, int (*set_page)(struct ib_mr *, u64)); void ib_drain_rq(struct ib_qp *qp); void ib_drain_sq(struct ib_qp *qp); diff --git a/include/rdma/mr_pool.h b/include/rdma/mr_pool.h new file mode 100644 index 000000000000..986010b812eb --- /dev/null +++ b/include/rdma/mr_pool.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#ifndef _RDMA_MR_POOL_H +#define _RDMA_MR_POOL_H 1 + +#include <rdma/ib_verbs.h> + +struct ib_mr *ib_mr_pool_get(struct ib_qp *qp, struct list_head *list); +void ib_mr_pool_put(struct ib_qp *qp, struct list_head *list, struct ib_mr *mr); + +int ib_mr_pool_init(struct ib_qp *qp, struct list_head *list, int nr, + enum ib_mr_type type, u32 max_num_sg); +void ib_mr_pool_destroy(struct ib_qp *qp, struct list_head *list); + +#endif /* _RDMA_MR_POOL_H */ diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index a8696551abb1..d57ceee90d26 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -467,6 +467,7 @@ static inline struct rvt_qp *rvt_lookup_qpn(struct rvt_dev_info *rdi, } struct rvt_dev_info *rvt_alloc_device(size_t size, int nports); +void rvt_dealloc_device(struct rvt_dev_info *rdi); int rvt_register_device(struct rvt_dev_info *rvd); void rvt_unregister_device(struct rvt_dev_info *rvd); int rvt_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr); diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 497e59065c2c..0e1ff2abfe92 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -117,8 +117,9 @@ /* * Wait flags that would prevent any packet type from being sent. 
*/ -#define RVT_S_ANY_WAIT_IO (RVT_S_WAIT_PIO | RVT_S_WAIT_TX | \ - RVT_S_WAIT_DMA_DESC | RVT_S_WAIT_KMEM) +#define RVT_S_ANY_WAIT_IO \ + (RVT_S_WAIT_PIO | RVT_S_WAIT_PIO_DRAIN | RVT_S_WAIT_TX | \ + RVT_S_WAIT_DMA_DESC | RVT_S_WAIT_KMEM) /* * Wait flags that would prevent send work requests from making progress. diff --git a/include/rdma/rw.h b/include/rdma/rw.h new file mode 100644 index 000000000000..377d865e506d --- /dev/null +++ b/include/rdma/rw.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#ifndef _RDMA_RW_H +#define _RDMA_RW_H + +#include <linux/dma-mapping.h> +#include <linux/scatterlist.h> +#include <rdma/ib_verbs.h> +#include <rdma/rdma_cm.h> +#include <rdma/mr_pool.h> + +struct rdma_rw_ctx { + /* number of RDMA READ/WRITE WRs (not counting MR WRs) */ + u32 nr_ops; + + /* tag for the union below: */ + u8 type; + + union { + /* for mapping a single SGE: */ + struct { + struct ib_sge sge; + struct ib_rdma_wr wr; + } single; + + /* for mapping of multiple SGEs: */ + struct { + struct ib_sge *sges; + struct ib_rdma_wr *wrs; + } map; + + /* for registering multiple WRs: */ + struct rdma_rw_reg_ctx { + struct ib_sge sge; + struct ib_rdma_wr wr; + struct ib_reg_wr reg_wr; + struct ib_send_wr inv_wr; + struct ib_mr *mr; + } *reg; + + struct { + struct rdma_rw_reg_ctx data; + struct rdma_rw_reg_ctx prot; + struct ib_send_wr sig_inv_wr; + struct ib_mr *sig_mr; + struct ib_sge sig_sge; + struct ib_sig_handover_wr sig_wr; + } *sig; + }; +}; + +int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, + struct scatterlist *sg, u32 sg_cnt, u32 sg_offset, + u64 remote_addr, u32 rkey, enum dma_data_direction dir); +void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, + struct scatterlist *sg, u32 sg_cnt, + enum dma_data_direction dir); + +int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u8 port_num, struct scatterlist *sg, u32 sg_cnt, + struct scatterlist *prot_sg, u32 prot_sg_cnt, + struct ib_sig_attrs *sig_attrs, u64 remote_addr, u32 rkey, + enum dma_data_direction dir); +void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u8 port_num, struct scatterlist *sg, u32 sg_cnt, + struct scatterlist *prot_sg, u32 prot_sg_cnt, + enum dma_data_direction dir); + +struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u8 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr); +int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, + struct ib_cqe *cqe, struct ib_send_wr *chain_wr); + +void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr); +int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr); +void rdma_rw_cleanup_mrs(struct ib_qp *qp); + +#endif /* _RDMA_RW_H */ diff --git a/include/target/target_core_backend.h b/include/target/target_core_backend.h index 28ee5c2e6bcd..d8ab5101fad5 100644 --- a/include/target/target_core_backend.h +++ b/include/target/target_core_backend.h @@ -85,7 +85,6 @@ extern struct configfs_attribute 
*passthrough_attrib_attrs[]; void *transport_kmap_data_sg(struct se_cmd *); void transport_kunmap_data_sg(struct se_cmd *); /* core helpers also used by xcopy during internal command setup */ -int target_alloc_sgl(struct scatterlist **, unsigned int *, u32, bool); sense_reason_t transport_generic_map_mem_to_cmd(struct se_cmd *, struct scatterlist *, u32, struct scatterlist *, u32); diff --git a/include/target/target_core_fabric.h b/include/target/target_core_fabric.h index 8ff6d40a294f..78d88f03b296 100644 --- a/include/target/target_core_fabric.h +++ b/include/target/target_core_fabric.h @@ -185,6 +185,10 @@ int core_tpg_set_initiator_node_tag(struct se_portal_group *, int core_tpg_register(struct se_wwn *, struct se_portal_group *, int); int core_tpg_deregister(struct se_portal_group *); +int target_alloc_sgl(struct scatterlist **sgl, unsigned int *nents, + u32 length, bool zero_page, bool chainable); +void target_free_sgl(struct scatterlist *sgl, int nents); + /* * The LIO target core uses DMA_TO_DEVICE to mean that data is going * to the target (eg handling a WRITE) and DMA_FROM_DEVICE to mean diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 8126c143a519..b6543d73d20a 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -226,6 +226,7 @@ struct ib_uverbs_ex_query_device_resp { struct ib_uverbs_odp_caps odp_caps; __u64 timestamp_mask; __u64 hca_core_clock; /* in KHZ */ + __u64 device_cap_flags_ex; }; struct ib_uverbs_query_port { diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c index 93ff038ea9d1..d921adc62765 100644 --- a/net/rds/ib_frmr.c +++ b/net/rds/ib_frmr.c @@ -111,7 +111,7 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) cpu_relax(); } - ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, PAGE_SIZE); + ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, 0, PAGE_SIZE); if (unlikely(ret != ibmr->sg_len)) return ret < 0 ? ret : -EINVAL; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index c250924a9fd3..94c3fa910b85 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -421,7 +421,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, return -ENOMEM; } - n = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, PAGE_SIZE); + n = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, NULL, PAGE_SIZE); if (unlikely(n != frmr->sg_nents)) { pr_err("RPC: %s: failed to map mr %p (%u/%u)\n", __func__, frmr->fr_mr, n, frmr->sg_nents); diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 3b24a646eb46..fbe7444e7de6 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -281,7 +281,7 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, } atomic_inc(&xprt->sc_dma_used); - n = ib_map_mr_sg(frmr->mr, frmr->sg, frmr->sg_nents, PAGE_SIZE); + n = ib_map_mr_sg(frmr->mr, frmr->sg, frmr->sg_nents, NULL, PAGE_SIZE); if (unlikely(n != frmr->sg_nents)) { pr_err("svcrdma: failed to map mr %p (%d/%d elements)\n", frmr->mr, n, frmr->sg_nents); |
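Taken together, the include/rdma/rw.h declarations above give upper-layer drivers a single entry point for RDMA READ/WRITE transfers. A hypothetical consumer (a minimal sketch, not code from this series) would create its QP with ib_qp_cap.max_rdma_ctxs set so the core can size the WR and MR pools, then drive each transfer roughly as follows; remote_addr, rkey, the scatterlist and the completion plumbing are assumed to come from the protocol layer:

#include <rdma/rw.h>

static int example_rdma_write(struct ib_qp *qp, u8 port_num,
			      struct scatterlist *sg, u32 sg_cnt,
			      u64 remote_addr, u32 rkey,
			      struct ib_cqe *done_cqe)
{
	struct rdma_rw_ctx ctx;
	int ret;

	/* map the local SG list; the core picks the MR or plain SGE path */
	ret = rdma_rw_ctx_init(&ctx, qp, port_num, sg, sg_cnt, 0,
			       remote_addr, rkey, DMA_TO_DEVICE);
	if (ret < 0)
		return ret;

	/* post the whole WR chain (registration + RDMA WRITE + invalidate) */
	ret = rdma_rw_ctx_post(&ctx, qp, port_num, done_cqe, NULL);
	if (ret)
		rdma_rw_ctx_destroy(&ctx, qp, port_num, sg, sg_cnt,
				    DMA_TO_DEVICE);

	/*
	 * On success, the completion handler behind done_cqe is expected
	 * to call rdma_rw_ctx_destroy() with the same arguments once the
	 * transfer finishes.
	 */
	return ret;
}

rdma_rw_ctx_init() reports the number of WRs it consumed on success, which is why only negative return values are treated as errors here.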