From fc968aee5e984c9cc2417147b9b5cd1da7ab43f3 Mon Sep 17 00:00:00 2001
From: Corentin Labbe
Date: Tue, 6 Feb 2018 20:37:31 +0000
Subject: [PATCH 001/199] IB/cxgb3: remove cxio_dbg.c

cxio_dbg.c has not been compiled since commit 2b540355cd2f ("RDMA/cxgb3:
cleanups"). Ten years later, we can remove it.

Signed-off-by: Corentin Labbe
Acked-by: Steve Wise
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/cxgb3/Kconfig    |   9 --
 drivers/infiniband/hw/cxgb3/Makefile   |   2 -
 drivers/infiniband/hw/cxgb3/cxio_dbg.c | 206 -------------------------
 drivers/infiniband/hw/cxgb3/cxio_hal.h |   9 --
 drivers/infiniband/hw/cxgb3/iwch_cq.c  |   6 -
 5 files changed, 232 deletions(-)
 delete mode 100644 drivers/infiniband/hw/cxgb3/cxio_dbg.c

diff --git a/drivers/infiniband/hw/cxgb3/Kconfig b/drivers/infiniband/hw/cxgb3/Kconfig
index 431be733fbbe..a7b77cb3d5d5 100644
--- a/drivers/infiniband/hw/cxgb3/Kconfig
+++ b/drivers/infiniband/hw/cxgb3/Kconfig
@@ -16,12 +16,3 @@ config INFINIBAND_CXGB3
 	  To compile this driver as a module, choose M here: the module
 	  will be called iw_cxgb3.
-
-config INFINIBAND_CXGB3_DEBUG
-	bool "Verbose debugging output"
-	depends on INFINIBAND_CXGB3
-	default n
-	---help---
-	  This option causes the Chelsio RDMA driver to produce copious
-	  amounts of debug messages.  Select this if you are developing
-	  the driver or trying to diagnose a problem.
diff --git a/drivers/infiniband/hw/cxgb3/Makefile b/drivers/infiniband/hw/cxgb3/Makefile
index 2c66d35d19bd..66fe0917aba0 100644
--- a/drivers/infiniband/hw/cxgb3/Makefile
+++ b/drivers/infiniband/hw/cxgb3/Makefile
@@ -5,5 +5,3 @@ obj-$(CONFIG_INFINIBAND_CXGB3) += iw_cxgb3.o
 iw_cxgb3-y := iwch_cm.o iwch_ev.o iwch_cq.o iwch_qp.o iwch_mem.o \
 	      iwch_provider.o iwch.o cxio_hal.o cxio_resource.o
-
-ccflags-$(CONFIG_INFINIBAND_CXGB3_DEBUG) += -DDEBUG
diff --git a/drivers/infiniband/hw/cxgb3/cxio_dbg.c b/drivers/infiniband/hw/cxgb3/cxio_dbg.c
deleted file mode 100644
index 97dbe728520a..000000000000
--- a/drivers/infiniband/hw/cxgb3/cxio_dbg.c
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */ -#ifdef DEBUG -#include -#include -#include "common.h" -#include "cxgb3_ioctl.h" -#include "cxio_hal.h" -#include "cxio_wr.h" - -void cxio_dump_tpt(struct cxio_rdev *rdev, u32 stag) -{ - struct ch_mem_range *m; - u64 *data; - int rc; - int size = 32; - - m = kmalloc(sizeof(*m) + size, GFP_ATOMIC); - if (!m) - return; - - m->mem_id = MEM_PMRX; - m->addr = (stag>>8) * 32 + rdev->rnic_info.tpt_base; - m->len = size; - pr_debug("%s TPT addr 0x%x len %d\n", __func__, m->addr, m->len); - rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); - if (rc) { - pr_debug("%s toectl returned error %d\n", __func__, rc); - kfree(m); - return; - } - - data = (u64 *)m->buf; - while (size > 0) { - pr_debug("TPT %08x: %016llx\n", - m->addr, (unsigned long long)*data); - size -= 8; - data++; - m->addr += 8; - } - kfree(m); -} - -void cxio_dump_pbl(struct cxio_rdev *rdev, u32 pbl_addr, uint len, u8 shift) -{ - struct ch_mem_range *m; - u64 *data; - int rc; - int size, npages; - - shift += 12; - npages = (len + (1ULL << shift) - 1) >> shift; - size = npages * sizeof(u64); - - m = kmalloc(sizeof(*m) + size, GFP_ATOMIC); - if (!m) - return; - - m->mem_id = MEM_PMRX; - m->addr = pbl_addr; - m->len = size; - pr_debug("%s PBL addr 0x%x len %d depth %d\n", - __func__, m->addr, m->len, npages); - rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); - if (rc) { - pr_debug("%s toectl returned error %d\n", __func__, rc); - kfree(m); - return; - } - - data = (u64 *)m->buf; - while (size > 0) { - pr_debug("PBL %08x: %016llx\n", - m->addr, (unsigned long long)*data); - size -= 8; - data++; - m->addr += 8; - } - kfree(m); -} - -void cxio_dump_wqe(union t3_wr *wqe) -{ - __be64 *data = (__be64 *)wqe; - uint size = (uint)(be64_to_cpu(*data) & 0xff); - - if (size == 0) - size = 8; - while (size > 0) { - pr_debug("WQE %p: %016llx\n", - data, (unsigned long long)be64_to_cpu(*data)); - size--; - data++; - } -} - -void cxio_dump_wce(struct t3_cqe *wce) -{ - __be64 *data = (__be64 *)wce; - int size = sizeof(*wce); - - while (size > 0) { - pr_debug("WCE %p: %016llx\n", - data, (unsigned long long)be64_to_cpu(*data)); - size -= 8; - data++; - } -} - -void cxio_dump_rqt(struct cxio_rdev *rdev, u32 hwtid, int nents) -{ - struct ch_mem_range *m; - int size = nents * 64; - u64 *data; - int rc; - - m = kmalloc(sizeof(*m) + size, GFP_ATOMIC); - if (!m) - return; - - m->mem_id = MEM_PMRX; - m->addr = ((hwtid)<<10) + rdev->rnic_info.rqt_base; - m->len = size; - pr_debug("%s RQT addr 0x%x len %d\n", __func__, m->addr, m->len); - rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); - if (rc) { - pr_debug("%s toectl returned error %d\n", __func__, rc); - kfree(m); - return; - } - - data = (u64 *)m->buf; - while (size > 0) { - pr_debug("RQT %08x: %016llx\n", - m->addr, (unsigned long long)*data); - size -= 8; - data++; - m->addr += 8; - } - kfree(m); -} - -void cxio_dump_tcb(struct cxio_rdev *rdev, u32 hwtid) -{ - struct ch_mem_range *m; - int size = TCB_SIZE; - u32 *data; - int rc; - - m = kmalloc(sizeof(*m) + size, GFP_ATOMIC); - if (!m) - return; - - m->mem_id = MEM_CM; - m->addr = hwtid * size; - m->len = size; - pr_debug("%s TCB %d len %d\n", __func__, m->addr, m->len); - rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); - if (rc) { - pr_debug("%s toectl returned error %d\n", __func__, rc); - kfree(m); - return; - } - - data = (u32 *)m->buf; - while (size > 0) { - printk("%2u: %08x %08x %08x %08x %08x %08x %08x %08x\n", - m->addr, - *(data+2), *(data+3), *(data),*(data+1), - *(data+6), *(data+7), *(data+4), 
*(data+5));
-		size -= 32;
-		data += 8;
-		m->addr += 32;
-	}
-	kfree(m);
-}
-#endif
diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.h b/drivers/infiniband/hw/cxgb3/cxio_hal.h
index 7e70c5492262..c64e50b5a548 100644
--- a/drivers/infiniband/hw/cxgb3/cxio_hal.h
+++ b/drivers/infiniband/hw/cxgb3/cxio_hal.h
@@ -202,13 +202,4 @@ int iwch_cxgb3_ofld_send(struct t3cdev *tdev, struct sk_buff *skb);
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
-#ifdef DEBUG
-void cxio_dump_tpt(struct cxio_rdev *rev, u32 stag);
-void cxio_dump_pbl(struct cxio_rdev *rev, u32 pbl_addr, uint len, u8 shift);
-void cxio_dump_wqe(union t3_wr *wqe);
-void cxio_dump_wce(struct t3_cqe *wce);
-void cxio_dump_rqt(struct cxio_rdev *rdev, u32 hwtid, int nents);
-void cxio_dump_tcb(struct cxio_rdev *rdev, u32 hwtid);
-#endif
-
 #endif
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cq.c b/drivers/infiniband/hw/cxgb3/iwch_cq.c
index dd5348e48806..0a8542c20804 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_cq.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_cq.c
@@ -200,9 +200,6 @@ int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
 	spin_lock_irqsave(&chp->lock, flags);
 
 	for (npolled = 0; npolled < num_entries; ++npolled) {
-#ifdef DEBUG
-		int i=0;
-#endif
 
 		/*
 		 * Because T3 can post CQEs that are _not_ associated
@@ -211,9 +208,6 @@ int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
 		 */
 		do {
 			err = iwch_poll_cq_one(rhp, chp, wc + npolled);
-#ifdef DEBUG
-			BUG_ON(++i > 1000);
-#endif
 		} while (err == -EAGAIN);
 		if (err <= 0)
 			break;

From 173bc6be96c733ff7e6418eece9d64d03e7465b3 Mon Sep 17 00:00:00 2001
From: oulijun
Date: Wed, 7 Feb 2018 17:49:28 +0800
Subject: [PATCH 002/199] RDMA/hns: Fix a bug with modifying mac address

When the MAC address is modified, hns_roce_del_gid() is triggered, but
it cannot delete the default GID that matches the index because the GID
attribute passed to set_gid() is NULL.

Signed-off-by: Lijun Ou
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/hns/hns_roce_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index eb9a69fc7bec..8255bb9021b0 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -100,6 +100,7 @@ static int hns_roce_del_gid(struct ib_device *device, u8 port_num,
 			    unsigned int index, void **context)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(device);
+	struct ib_gid_attr zattr = { };
 	union ib_gid zgid = { {0} };
 	u8 port = port_num - 1;
 	unsigned long flags;
@@ -110,7 +111,7 @@ static int hns_roce_del_gid(struct ib_device *device, u8 port_num,
 
 	spin_lock_irqsave(&hr_dev->iboe.lock, flags);
 
-	ret = hr_dev->hw->set_gid(hr_dev, port, index, &zgid, NULL);
+	ret = hr_dev->hw->set_gid(hr_dev, port, index, &zgid, &zattr);
 
 	spin_unlock_irqrestore(&hr_dev->iboe.lock, flags);

From ced07769dc8484a9221aa616b3e935e748e8db03 Mon Sep 17 00:00:00 2001
From: Yixian Liu
Date: Wed, 7 Feb 2018 17:49:29 +0800
Subject: [PATCH 003/199] RDMA/hns: Fix QP state judgement before receiving
 work requests

The QP can accept receive work requests only when it is in a state that
allows them to be submitted. This patch updates the QP state check to
follow the specification.
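As an illustration of the rule (a sketch based on the IB specification,
not code from this patch): receive work requests may be posted in every
QP state except RESET, and in the ERR state they are accepted and later
completed with a flush error, so only RESET needs to be rejected:

	/* Hypothetical helper naming the check this patch implements. */
	static bool hns_rq_may_post(enum ib_qp_state state)
	{
		return state != IB_QPS_RESET;
	}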
Signed-off-by: Yixian Liu
Signed-off-by: Lijun Ou
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index db2ff352d75f..0aa748304ab5 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -509,7 +509,7 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
 	spin_lock_irqsave(&hr_qp->rq.lock, flags);
 	ind = hr_qp->rq.head & (hr_qp->rq.wqe_cnt - 1);
 
-	if (hr_qp->state == IB_QPS_RESET || hr_qp->state == IB_QPS_ERR) {
+	if (hr_qp->state == IB_QPS_RESET) {
 		spin_unlock_irqrestore(&hr_qp->rq.lock, flags);
 		*bad_wr = wr;
 		return -EINVAL;

From d480bb50d294e23a6773c507b8017e6bc45efc67 Mon Sep 17 00:00:00 2001
From: oulijun
Date: Wed, 7 Feb 2018 17:49:30 +0800
Subject: [PATCH 004/199] RDMA/hns: Use free_pages function instead of
 free_page

We need to use the free_pages() function to free memory that was
allocated with __get_free_pages().

Signed-off-by: Lijun Ou
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/hns/hns_roce_mr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index da86a8117bd5..f7256d88d38f 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -933,7 +933,7 @@ int hns_roce_ib_umem_write_mtt(struct hns_roce_dev *hr_dev,
 		ret = hns_roce_write_mtt(hr_dev, mtt, n, i, pages);
 
 out:
-	free_page((unsigned long) pages);
+	free_pages((unsigned long) pages, order);
 	return ret;
 }

From 5229f87efcc5a0c800e7f3b49264af984ea4aba9 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe
Date: Wed, 7 Feb 2018 16:45:51 -0700
Subject: [PATCH 005/199] RDMA: Do not use __packed in uapi headers

__packed is not available in linux/types.h, so we cannot use it in the
uapi headers.

The construction struct ABC {} __packed; may still compile even if
__packed is not defined, however it simply creates a variable called
__packed, and doesn't set the alignment.

All these uses of packed are on structs that already have aligned
members. While use in hfi may indicate the struct itself is unaligned,
the use in ocrdma is on a UHW struct which should never be unaligned,
so just delete it there.

Signed-off-by: Jason Gunthorpe
---
 include/uapi/rdma/hfi/hfi1_user.h | 6 +++---
 include/uapi/rdma/ocrdma-abi.h    | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/uapi/rdma/hfi/hfi1_user.h b/include/uapi/rdma/hfi/hfi1_user.h
index 791bea2f8297..43b46bf6f8bb 100644
--- a/include/uapi/rdma/hfi/hfi1_user.h
+++ b/include/uapi/rdma/hfi/hfi1_user.h
@@ -219,7 +219,7 @@ struct sdma_req_info {
 	 * in charge of managing its own ring.
 	 */
 	__u16 comp_idx;
-} __packed;
+} __attribute__((__packed__));
 
 /*
  * SW KDETH header.
@@ -230,7 +230,7 @@ struct hfi1_kdeth_header {
 	__le16 jkey;
 	__le16 hcrc;
 	__le32 swdata[7];
-} __packed;
+} __attribute__((__packed__));
 
 /*
  * Structure describing the headers that User space uses.
The @@ -241,7 +241,7 @@ struct hfi1_pkt_header { __be16 lrh[4]; __be32 bth[3]; struct hfi1_kdeth_header kdeth; -} __packed; +} __attribute__((__packed__)); /* diff --git a/include/uapi/rdma/ocrdma-abi.h b/include/uapi/rdma/ocrdma-abi.h index ad64a3cea1cd..e0475d59cdf0 100644 --- a/include/uapi/rdma/ocrdma-abi.h +++ b/include/uapi/rdma/ocrdma-abi.h @@ -127,7 +127,7 @@ struct ocrdma_create_qp_uresp { __u32 db_rq_offset; __u32 db_shift; __u64 rsvd[11]; -} __packed; +}; struct ocrdma_create_srq_uresp { __u16 rq_dbid; From 7061f28d8a2faf8131ac3a8ceb1af9850313e22c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 7 Feb 2018 16:49:10 -0700 Subject: [PATCH 006/199] rxe: Do not use 'struct sockaddr' in a uapi header Linux has two 'linux/socket.h' files - and only the one in the kernel defines struct sockaddr - the user space one does not. Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_av.c | 5 +++-- include/uapi/rdma/rdma_user_rxe.h | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_av.c b/drivers/infiniband/sw/rxe/rxe_av.c index 7522d1af3ae2..7f1ae364088a 100644 --- a/drivers/infiniband/sw/rxe/rxe_av.c +++ b/drivers/infiniband/sw/rxe/rxe_av.c @@ -74,8 +74,9 @@ void rxe_av_fill_ip_info(struct rxe_av *av, struct ib_gid_attr *sgid_attr, union ib_gid *sgid) { - rdma_gid2ip(&av->sgid_addr._sockaddr, sgid); - rdma_gid2ip(&av->dgid_addr._sockaddr, &rdma_ah_read_grh(attr)->dgid); + rdma_gid2ip((struct sockaddr *)&av->sgid_addr, sgid); + rdma_gid2ip((struct sockaddr *)&av->dgid_addr, + &rdma_ah_read_grh(attr)->dgid); av->network_type = ib_gid_to_network_type(sgid_attr->gid_type, sgid); } diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h index bdeea948b2f3..e3e6852b58eb 100644 --- a/include/uapi/rdma/rdma_user_rxe.h +++ b/include/uapi/rdma/rdma_user_rxe.h @@ -35,6 +35,9 @@ #define RDMA_USER_RXE_H #include +#include +#include +#include union rxe_gid { __u8 raw[16]; @@ -57,7 +60,6 @@ struct rxe_av { __u8 network_type; struct rxe_global_route grh; union { - struct sockaddr _sockaddr; struct sockaddr_in _sockaddr_in; struct sockaddr_in6 _sockaddr_in6; } sgid_addr, dgid_addr; From 71591d1280e5ef02c2af2ffb9801d0c842973be9 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 14 Feb 2018 20:11:17 +0200 Subject: [PATCH 007/199] RDMA/hns: Replace __raw_write*(cpu_to_le*()) with LE write*() There is no need to repeat the semantics of writel() and similar. Moreover sparse complains about this: drivers/infiniband/hw/hns/hns_roce_hw_v1.c:1690:22: expected unsigned long long val drivers/infiniband/hw/hns/hns_roce_hw_v1.c:1690:22: got restricted __le64 Fixing this by replacing __raw_write*(cpu_to_le*()) calls by plain write*() ones. Note, write*() accessors are little endian by definition. 
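As an illustration (a sketch, not taken from the patch), the two helpers
below perform the same little-endian store on every architecture; only
the second one type-checks cleanly under sparse:

	static void write_token_old(void __iomem *hcr, u32 val)
	{
		/* manual conversion plus raw accessor */
		__raw_writel((__force u32)cpu_to_le32(val), hcr);
	}

	static void write_token_new(void __iomem *hcr, u32 val)
	{
		/* writel() byte-swaps on big-endian CPUs by itself */
		writel(val, hcr);
	}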
Reported-by: kbuild test robot Signed-off-by: Andy Shevchenko Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 8 ++++---- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index da13bd7c3ca9..47e1b6ac1e1a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -1687,13 +1687,13 @@ static int hns_roce_v1_post_mbox(struct hns_roce_dev *hr_dev, u64 in_param, roce_set_field(val, ROCEE_MB6_ROCEE_MB_TOKEN_M, ROCEE_MB6_ROCEE_MB_TOKEN_S, token); - __raw_writeq(cpu_to_le64(in_param), hcr + 0); - __raw_writeq(cpu_to_le64(out_param), hcr + 2); - __raw_writel(cpu_to_le32(in_modifier), hcr + 4); + writeq(in_param, hcr + 0); + writeq(out_param, hcr + 2); + writel(in_modifier, hcr + 4); /* Memory barrier */ wmb(); - __raw_writel(cpu_to_le32(val), hcr + 5); + writel(val, hcr + 5); mmiowb(); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 0aa748304ab5..016bca1923ec 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1228,14 +1228,14 @@ static int hns_roce_v2_post_mbox(struct hns_roce_dev *hr_dev, u64 in_param, roce_set_field(val1, HNS_ROCE_VF_MB5_TOKEN_MASK, HNS_ROCE_VF_MB5_TOKEN_SHIFT, token); - __raw_writeq(cpu_to_le64(in_param), hcr + 0); - __raw_writeq(cpu_to_le64(out_param), hcr + 2); + writeq(in_param, hcr + 0); + writeq(out_param, hcr + 2); /* Memory barrier */ wmb(); - __raw_writel(cpu_to_le32(val0), hcr + 4); - __raw_writel(cpu_to_le32(val1), hcr + 5); + writel(val0, hcr + 4); + writel(val1, hcr + 5); mmiowb(); From 3a148896b24adf8688dc0c59af54531931677a40 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 12 Feb 2018 09:50:25 -0800 Subject: [PATCH 008/199] IB/srp: Fix completion vector assignment algorithm Ensure that cv_end is equal to ibdev->num_comp_vectors for the NUMA node with the highest index. This patch improves spreading of RDMA channels over completion vectors and thereby improves performance, especially on systems with only a single NUMA node. This patch drops support for the comp_vector login parameter by ignoring the value of that parameter since I have not found a good way to combine support for that parameter and automatic spreading of RDMA channels over completion vectors. 
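A worked example with assumed numbers (2 online NUMA nodes and
ibdev->num_comp_vectors == 8): node 0 gets the vector range [0, 4) and
node 1 gets [4, 8), so cv_end of the highest-index node equals
num_comp_vectors, as intended:

	/* Sketch of the new per-node completion vector range computation. */
	static void comp_vector_range(int node_idx, int nodes, int vectors,
				      int *cv_start, int *cv_end)
	{
		*cv_start = node_idx * vectors / nodes;
		*cv_end = (node_idx + 1) * vectors / nodes;
	}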
Fixes: d92c0da71a35 ("IB/srp: Add multichannel support")
Reported-by: Alexander Schmid
Signed-off-by: Bart Van Assche
Cc: Alexander Schmid
Cc: stable@vger.kernel.org
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/ulp/srp/ib_srp.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index b48843833d69..241c0e72dce3 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -3871,12 +3871,10 @@ static ssize_t srp_create_target(struct device *dev,
 				 num_online_nodes());
 		const int ch_end = ((node_idx + 1) * target->ch_count /
 				 num_online_nodes());
-		const int cv_start = (node_idx * ibdev->num_comp_vectors /
-				 num_online_nodes() + target->comp_vector)
-				 % ibdev->num_comp_vectors;
-		const int cv_end = ((node_idx + 1) * ibdev->num_comp_vectors /
-				 num_online_nodes() + target->comp_vector)
-				 % ibdev->num_comp_vectors;
+		const int cv_start = node_idx * ibdev->num_comp_vectors /
+				 num_online_nodes();
+		const int cv_end = (node_idx + 1) * ibdev->num_comp_vectors /
+				 num_online_nodes();
 		int cpu_idx = 0;
 
 		for_each_online_cpu(cpu) {

From b5bc59818614a9d6d1fa2347b14dd52c4774fb70 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky
Date: Wed, 21 Feb 2018 18:12:31 +0200
Subject: [PATCH 009/199] RDMA/uverbs: Convert command mask validity check
 function to be bool

The function verify_command_mask() returns only two results: success or
failure, so convert it to return bool instead of 0 and -1.

Reported-by: Noa Osherovich
Reviewed-by: Matan Barak
Reviewed-by: Dennis Dalessandro
Signed-off-by: Leon Romanovsky
Signed-off-by: Doug Ledford
---
 drivers/infiniband/core/uverbs_main.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index b1ca223aa380..7cadd4b5a0c0 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -635,7 +635,7 @@ err_put_refs:
 	return filp;
 }
 
-static int verify_command_mask(struct ib_device *ib_dev, __u32 command)
+static bool verify_command_mask(struct ib_device *ib_dev, __u32 command)
 {
 	u64 mask;
 
@@ -645,9 +645,9 @@ static int verify_command_mask(struct ib_device *ib_dev, __u32 command)
 		mask = ib_dev->uverbs_ex_cmd_mask;
 
 	if (mask & ((u64)1 << command))
-		return 0;
+		return true;
 
-	return -1;
+	return false;
 }
 
 static bool verify_command_idx(u32 command, bool extended)
@@ -706,7 +706,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
 		goto out;
 	}
 
-	if (verify_command_mask(ib_dev, command)) {
+	if (!verify_command_mask(ib_dev, command)) {
 		ret = -EOPNOTSUPP;
 		goto out;
 	}

From 08f0e161636d940ff6c8f78863ce62361291e74b Mon Sep 17 00:00:00 2001
From: Leon Romanovsky
Date: Wed, 21 Feb 2018 18:12:32 +0200
Subject: [PATCH 010/199] RDMA/uverbs: Update sizeof users

Update sizeof() users to be consistent with coding style.
Reviewed-by: Dennis Dalessandro
Signed-off-by: Leon Romanovsky
Signed-off-by: Doug Ledford
---
 drivers/infiniband/core/uverbs_main.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 7cadd4b5a0c0..66b2f92bd47b 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -468,7 +468,7 @@ void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context)
 		return;
 	}
 
-	entry = kmalloc(sizeof *entry, GFP_ATOMIC);
+	entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
 	if (!entry) {
 		spin_unlock_irqrestore(&ev_queue->lock, flags);
 		return;
@@ -501,7 +501,7 @@ static void ib_uverbs_async_handler(struct ib_uverbs_file *file,
 		return;
 	}
 
-	entry = kmalloc(sizeof *entry, GFP_ATOMIC);
+	entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
 	if (!entry) {
 		spin_unlock_irqrestore(&file->async_file->ev_queue.lock, flags);
 		return;
@@ -676,10 +676,10 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
 		return -EACCES;
 	}
 
-	if (count < sizeof hdr)
+	if (count < sizeof(hdr))
 		return -EINVAL;
 
-	if (copy_from_user(&hdr, buf, sizeof hdr))
+	if (copy_from_user(&hdr, buf, sizeof(hdr)))
 		return -EFAULT;
 
 	srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
@@ -1032,7 +1032,7 @@ static void ib_uverbs_add_one(struct ib_device *device)
 	if (!device->alloc_ucontext)
 		return;
 
-	uverbs_dev = kzalloc(sizeof *uverbs_dev, GFP_KERNEL);
+	uverbs_dev = kzalloc(sizeof(*uverbs_dev), GFP_KERNEL);
 	if (!uverbs_dev)
 		return;

From a6c4a66ae923eb780ef693a6841cbff7a025cbf9 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky
Date: Wed, 21 Feb 2018 18:12:33 +0200
Subject: [PATCH 011/199] RDMA/uverbs: Refactor flags checks and update return
 value

Since commit f21519b23c1b ("IB/core: extended command: an improved
infrastructure for uverbs commands"), the uverbs layer supports extra
flags as an input to the command interface. In practice, however, only
one flag is defined and used, so it is better to refactor the code so
that flag resolution and error reporting happen as early as possible.

As part of this change, the return value for the failure case is changed
from -ENOSYS to -EINVAL, consistent with the rest of the flag checks.
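For reference, a sketch of the command word layout being validated
(constants from include/uapi/rdma/ib_user_verbs.h, shown later in this
series): bits 0-7 of hdr.command carry the command index, the top byte
carries the flags, and IB_USER_VERBS_CMD_FLAG_EXTENDED is the only flag
defined:

	/* Illustrative early check: fail on any unknown flag. */
	static int check_command_flags(u32 command)
	{
		u32 flags = (command & IB_USER_VERBS_CMD_FLAGS_MASK) >>
			    IB_USER_VERBS_CMD_FLAGS_SHIFT;

		if (flags & ~IB_USER_VERBS_CMD_FLAG_EXTENDED)
			return -EINVAL;
		return 0;
	}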
Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_main.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 66b2f92bd47b..d28b6f1543c7 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -701,6 +701,11 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT; extended_command = flags & IB_USER_VERBS_CMD_FLAG_EXTENDED; + if (flags & ~IB_USER_VERBS_CMD_FLAG_EXTENDED) { + ret = -EINVAL; + goto out; + } + if (!verify_command_idx(command, extended_command)) { ret = -EINVAL; goto out; @@ -732,8 +737,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, buf + sizeof(hdr), hdr.in_words * 4, hdr.out_words * 4); - - } else if (flags == IB_USER_VERBS_CMD_FLAG_EXTENDED) { + } else { struct ib_uverbs_ex_cmd_hdr ex_hdr; struct ib_udata ucore; struct ib_udata uhw; @@ -804,8 +808,6 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, ret = uverbs_ex_cmd_table[command](file, ib_dev, &ucore, &uhw); if (!ret) ret = written_count; - } else { - ret = -ENOSYS; } out: From 43ae95130db8fb70010f09b734c7c606eb9e61ce Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 21 Feb 2018 18:12:34 +0200 Subject: [PATCH 012/199] RDMA/uverbs: Fail as early as possible if not enough header data was provided Fail as early as possible if not enough header data was provided. Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_main.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index d28b6f1543c7..2189a26bbe64 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -662,6 +662,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, size_t count, loff_t *pos) { struct ib_uverbs_file *file = filp->private_data; + struct ib_uverbs_ex_cmd_hdr ex_hdr; struct ib_device *ib_dev; struct ib_uverbs_cmd_hdr hdr; bool extended_command; @@ -706,6 +707,12 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, goto out; } + if (extended_command && + count < (sizeof(hdr) + sizeof(ex_hdr))) { + ret = -EINVAL; + goto out; + } + if (!verify_command_idx(command, extended_command)) { ret = -EINVAL; goto out; @@ -738,7 +745,6 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, hdr.in_words * 4, hdr.out_words * 4); } else { - struct ib_uverbs_ex_cmd_hdr ex_hdr; struct ib_udata ucore; struct ib_udata uhw; size_t written_count = count; @@ -753,11 +759,6 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, goto out; } - if (count < (sizeof(hdr) + sizeof(ex_hdr))) { - ret = -EINVAL; - goto out; - } - if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) { ret = -EFAULT; goto out; From a9ed5b38aad704bbc250f9df2e9a29ee54388829 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 21 Feb 2018 18:12:35 +0200 Subject: [PATCH 013/199] RDMA/uverbs: Return not supported error code for unsupported commands Command that doesn't exist means that it is not supported, so update code to return -EOPNOTSUPP in case of failure. 
Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 2189a26bbe64..8d1547f5dc8e 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -714,7 +714,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, } if (!verify_command_idx(command, extended_command)) { - ret = -EINVAL; + ret = -EOPNOTSUPP; goto out; } From f2630ce2fb93eab5f50aa00dac7ae24a3b4ce17f Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 21 Feb 2018 18:12:36 +0200 Subject: [PATCH 014/199] RDMA/uverbs: Unify return values of not supported command The non-existing command is supposed to return -EOPNOTSUPP, but the current code returns different errors for different flows for the same failure. This patch unifies those flows. Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_main.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 8d1547f5dc8e..20797a1c77f3 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -653,9 +653,11 @@ static bool verify_command_mask(struct ib_device *ib_dev, __u32 command) static bool verify_command_idx(u32 command, bool extended) { if (extended) - return command < ARRAY_SIZE(uverbs_ex_cmd_table); + return command < ARRAY_SIZE(uverbs_ex_cmd_table) && + uverbs_ex_cmd_table[command]; - return command < ARRAY_SIZE(uverbs_cmd_table); + return command < ARRAY_SIZE(uverbs_cmd_table) && + uverbs_cmd_table[command]; } static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, @@ -730,11 +732,6 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, } if (!flags) { - if (!uverbs_cmd_table[command]) { - ret = -EINVAL; - goto out; - } - if (hdr.in_words * 4 != count) { ret = -EINVAL; goto out; @@ -749,11 +746,6 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, struct ib_udata uhw; size_t written_count = count; - if (!uverbs_ex_cmd_table[command]) { - ret = -ENOSYS; - goto out; - } - if (!file->ucontext) { ret = -EINVAL; goto out; From 77833b8a48084cb17e4bd631360b0093dd245a31 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 21 Feb 2018 18:12:37 +0200 Subject: [PATCH 015/199] RDMA/uverbs: Refactor command header processing Move all command header processing into separate function and perform those checks before acquiring SRCU read lock. 
Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_main.c | 62 ++++++++++++++------------- 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 20797a1c77f3..906fcceb9ea4 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -660,6 +660,29 @@ static bool verify_command_idx(u32 command, bool extended) uverbs_cmd_table[command]; } +static ssize_t process_hdr(struct ib_uverbs_cmd_hdr *hdr, + __u32 *command, bool *extended) +{ + __u32 flags; + + if (hdr->command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | + IB_USER_VERBS_CMD_COMMAND_MASK)) + return -EINVAL; + + *command = hdr->command & IB_USER_VERBS_CMD_COMMAND_MASK; + flags = (hdr->command & + IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT; + + *extended = flags & IB_USER_VERBS_CMD_FLAG_EXTENDED; + if (flags & ~IB_USER_VERBS_CMD_FLAG_EXTENDED) + return -EINVAL; + + if (!verify_command_idx(*command, *extended)) + return -EOPNOTSUPP; + + return 0; +} + static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, size_t count, loff_t *pos) { @@ -667,9 +690,8 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, struct ib_uverbs_ex_cmd_hdr ex_hdr; struct ib_device *ib_dev; struct ib_uverbs_cmd_hdr hdr; - bool extended_command; + bool extended; __u32 command; - __u32 flags; int srcu_key; ssize_t ret; @@ -685,6 +707,13 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, if (copy_from_user(&hdr, buf, sizeof(hdr))) return -EFAULT; + ret = process_hdr(&hdr, &command, &extended); + if (ret) + return ret; + + if (extended && count < (sizeof(hdr) + sizeof(ex_hdr))) + return -EINVAL; + srcu_key = srcu_read_lock(&file->device->disassociate_srcu); ib_dev = srcu_dereference(file->device->ib_dev, &file->device->disassociate_srcu); @@ -693,33 +722,6 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, goto out; } - if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | - IB_USER_VERBS_CMD_COMMAND_MASK)) { - ret = -EINVAL; - goto out; - } - - command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK; - flags = (hdr.command & - IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT; - - extended_command = flags & IB_USER_VERBS_CMD_FLAG_EXTENDED; - if (flags & ~IB_USER_VERBS_CMD_FLAG_EXTENDED) { - ret = -EINVAL; - goto out; - } - - if (extended_command && - count < (sizeof(hdr) + sizeof(ex_hdr))) { - ret = -EINVAL; - goto out; - } - - if (!verify_command_idx(command, extended_command)) { - ret = -EOPNOTSUPP; - goto out; - } - if (!verify_command_mask(ib_dev, command)) { ret = -EOPNOTSUPP; goto out; @@ -731,7 +733,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, goto out; } - if (!flags) { + if (!extended) { if (hdr.in_words * 4 != count) { ret = -EINVAL; goto out; From eb455e329bf65c75372da65a6a268c519f489183 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 21 Feb 2018 18:12:38 +0200 Subject: [PATCH 016/199] RDMA/uverbs: Properly check command supported mask The check based on index is not sufficient because IB_USER_VERBS_EX_CMD_CREATE_CQ = IB_USER_VERBS_CMD_CREATE_CQ and IB_USER_VERBS_CMD_CREATE_CQ <= IB_USER_VERBS_CMD_OPEN_QP, so if we execute IB_USER_VERBS_EX_CMD_CREATE_CQ this code checks ib_dev->uverbs_cmd_mask not ib_dev->uverbs_ex_cmd_mask. 
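A sketch of the fix: since the legacy and extended indices overlap
numerically, the caller's "extended" flag, not the index value, must
select the capability mask:

	/* Equivalent to the reworked verify_command_mask() below. */
	static bool command_supported(struct ib_device *dev, u32 command,
				      bool extended)
	{
		u64 mask = extended ? dev->uverbs_ex_cmd_mask :
				      dev->uverbs_cmd_mask;

		return mask & BIT_ULL(command);
	}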
Signed-off-by: Leon Romanovsky
Signed-off-by: Doug Ledford
---
 drivers/infiniband/core/uverbs_main.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 906fcceb9ea4..1f2fd839953f 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -635,19 +635,13 @@ err_put_refs:
 	return filp;
 }
 
-static bool verify_command_mask(struct ib_device *ib_dev, __u32 command)
+static bool verify_command_mask(struct ib_device *ib_dev,
+				__u32 command, bool extended)
 {
-	u64 mask;
+	if (!extended)
+		return ib_dev->uverbs_cmd_mask & BIT_ULL(command);
 
-	if (command <= IB_USER_VERBS_CMD_OPEN_QP)
-		mask = ib_dev->uverbs_cmd_mask;
-	else
-		mask = ib_dev->uverbs_ex_cmd_mask;
-
-	if (mask & ((u64)1 << command))
-		return true;
-
-	return false;
+	return ib_dev->uverbs_ex_cmd_mask & BIT_ULL(command);
 }
 
 static bool verify_command_idx(u32 command, bool extended)
@@ -722,7 +716,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
 		goto out;
 	}
 
-	if (!verify_command_mask(ib_dev, command)) {
+	if (!verify_command_mask(ib_dev, command, extended)) {
 		ret = -EOPNOTSUPP;
 		goto out;
 	}

From 491d5c6a3023e303cc693b91d9b3a4d54471c944 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky
Date: Wed, 21 Feb 2018 18:12:39 +0200
Subject: [PATCH 017/199] RDMA/uverbs: Move ucontext check before SRCU read
 lock

There is no need to take the SRCU lock before checking file->ucontext,
so perform that check before taking the lock.

Signed-off-by: Leon Romanovsky
Signed-off-by: Doug Ledford
---
 drivers/infiniband/core/uverbs_main.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 1f2fd839953f..ff70e1ead1ba 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -705,6 +705,10 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
 	if (ret)
 		return ret;
 
+	if (!file->ucontext &&
+	    (command != IB_USER_VERBS_CMD_GET_CONTEXT || extended))
+		return -EINVAL;
+
 	if (extended && count < (sizeof(hdr) + sizeof(ex_hdr)))
 		return -EINVAL;
 
@@ -721,12 +725,6 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
 		goto out;
 	}
 
-	if (!file->ucontext &&
-	    command != IB_USER_VERBS_CMD_GET_CONTEXT) {
-		ret = -EINVAL;
-		goto out;
-	}
-
 	if (!extended) {
 		if (hdr.in_words * 4 != count) {
 			ret = -EINVAL;
 			goto out;

From e21719fbbd1786c09132e0483c72c136a61b79ed Mon Sep 17 00:00:00 2001
From: Leon Romanovsky
Date: Wed, 21 Feb 2018 18:12:40 +0200
Subject: [PATCH 018/199] RDMA/uverbs: Copy ex_hdr outside of SRCU read lock

The SRCU read lock protects the IB device pointer and doesn't need to be
held while copying the user-provided header.
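For context, a condensed sketch of the SRCU pattern used in this file:
only the dereference of the hot-unplug-protected device pointer needs
the read-side section, so copying and validating the header can safely
run before the lock is taken:

	/* Caller must srcu_read_unlock(&file->device->disassociate_srcu,
	 * *key) when done with the returned pointer (which may be NULL). */
	static struct ib_device *uverbs_get_dev(struct ib_uverbs_file *file,
						int *key)
	{
		*key = srcu_read_lock(&file->device->disassociate_srcu);
		return srcu_dereference(file->device->ib_dev,
					&file->device->disassociate_srcu);
	}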
Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_main.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index ff70e1ead1ba..a23cf9e33b98 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -709,8 +709,12 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, (command != IB_USER_VERBS_CMD_GET_CONTEXT || extended)) return -EINVAL; - if (extended && count < (sizeof(hdr) + sizeof(ex_hdr))) - return -EINVAL; + if (extended) { + if (count < (sizeof(hdr) + sizeof(ex_hdr))) + return -EINVAL; + if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) + return -EFAULT; + } srcu_key = srcu_read_lock(&file->device->disassociate_srcu); ib_dev = srcu_dereference(file->device->ib_dev, @@ -740,11 +744,6 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, struct ib_udata uhw; size_t written_count = count; - if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) { - ret = -EFAULT; - goto out; - } - count -= sizeof(hdr) + sizeof(ex_hdr); buf += sizeof(hdr) + sizeof(ex_hdr); From 6284380a97a60cbe827904dc454a835bc309c248 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 21 Feb 2018 18:12:41 +0200 Subject: [PATCH 019/199] RDMA/uverbs: Refactor the header validation logic Move all header validation logic to be performed before SRCU read lock. Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_main.c | 90 ++++++++++++++------------- 1 file changed, 47 insertions(+), 43 deletions(-) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index a23cf9e33b98..12d6e5631164 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -677,6 +677,42 @@ static ssize_t process_hdr(struct ib_uverbs_cmd_hdr *hdr, return 0; } +static ssize_t verify_hdr(struct ib_uverbs_cmd_hdr *hdr, + struct ib_uverbs_ex_cmd_hdr *ex_hdr, + size_t count, bool extended) +{ + if (extended) { + count -= sizeof(*hdr) + sizeof(*ex_hdr); + + if ((hdr->in_words + ex_hdr->provider_in_words) * 8 != count) + return -EINVAL; + + if (ex_hdr->cmd_hdr_reserved) + return -EINVAL; + + if (ex_hdr->response) { + if (!hdr->out_words && !ex_hdr->provider_out_words) + return -EINVAL; + + if (!access_ok(VERIFY_WRITE, + u64_to_user_ptr(ex_hdr->response), + (hdr->out_words + ex_hdr->provider_out_words) * 8)) + return -EFAULT; + } else { + if (hdr->out_words || ex_hdr->provider_out_words) + return -EINVAL; + } + + return 0; + } + + /* not extended command */ + if (hdr->in_words * 4 != count) + return -EINVAL; + + return 0; +} + static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, size_t count, loff_t *pos) { @@ -716,6 +752,10 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, return -EFAULT; } + ret = verify_hdr(&hdr, &ex_hdr, count, extended); + if (ret) + return ret; + srcu_key = srcu_read_lock(&file->device->disassociate_srcu); ib_dev = srcu_dereference(file->device->ib_dev, &file->device->disassociate_srcu); @@ -729,52 +769,17 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, goto out; } - if (!extended) { - if (hdr.in_words * 4 != count) { - ret = -EINVAL; - goto out; - } + buf += sizeof(hdr); - ret = uverbs_cmd_table[command](file, ib_dev, - buf + sizeof(hdr), - hdr.in_words * 4, - hdr.out_words * 4); + if 
(!extended) {
+		ret = uverbs_cmd_table[command](file, ib_dev, buf,
+						hdr.in_words * 4,
+						hdr.out_words * 4);
 	} else {
 		struct ib_udata ucore;
 		struct ib_udata uhw;
-		size_t written_count = count;
 
-		count -= sizeof(hdr) + sizeof(ex_hdr);
-		buf += sizeof(hdr) + sizeof(ex_hdr);
-
-		if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count) {
-			ret = -EINVAL;
-			goto out;
-		}
-
-		if (ex_hdr.cmd_hdr_reserved) {
-			ret = -EINVAL;
-			goto out;
-		}
-
-		if (ex_hdr.response) {
-			if (!hdr.out_words && !ex_hdr.provider_out_words) {
-				ret = -EINVAL;
-				goto out;
-			}
-
-			if (!access_ok(VERIFY_WRITE,
-				       u64_to_user_ptr(ex_hdr.response),
-				       (hdr.out_words + ex_hdr.provider_out_words) * 8)) {
-				ret = -EFAULT;
-				goto out;
-			}
-		} else {
-			if (hdr.out_words || ex_hdr.provider_out_words) {
-				ret = -EINVAL;
-				goto out;
-			}
-		}
+		buf += sizeof(ex_hdr);
 
 		ib_uverbs_init_udata_buf_or_null(&ucore, buf,
 					u64_to_user_ptr(ex_hdr.response),
 					hdr.in_words * 8,
 					hdr.out_words * 8);
 
 		ib_uverbs_init_udata_buf_or_null(&uhw,
 					buf + ucore.inlen,
 					u64_to_user_ptr(ex_hdr.response) + ucore.outlen,
 					ex_hdr.provider_in_words * 8,
 					ex_hdr.provider_out_words * 8);
 
 		ret = uverbs_ex_cmd_table[command](file, ib_dev, &ucore, &uhw);
-		if (!ret)
-			ret = written_count;
+		ret = (ret) ? : count;
 	}
 
 out:

From cd35cf4b40f351fc9e53c2e2877a56da87dcd46d Mon Sep 17 00:00:00 2001
From: Leon Romanovsky
Date: Wed, 21 Feb 2018 18:12:42 +0200
Subject: [PATCH 020/199] RDMA/uverbs: Replace user's types with kernel's
 types

Kernel-internal variable declarations don't need to be declared with
user types. This patch converts such occurrences that appear in
ib_uverbs_write().

Signed-off-by: Leon Romanovsky
Signed-off-by: Doug Ledford
---
 drivers/infiniband/core/uverbs_main.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 12d6e5631164..2a6deecf6f76 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -636,7 +636,7 @@ err_put_refs:
 }
 
 static bool verify_command_mask(struct ib_device *ib_dev,
-				__u32 command, bool extended)
+				u32 command, bool extended)
 {
 	if (!extended)
 		return ib_dev->uverbs_cmd_mask & BIT_ULL(command);
@@ -655,11 +655,11 @@ static bool verify_command_idx(u32 command, bool extended)
 }
 
 static ssize_t process_hdr(struct ib_uverbs_cmd_hdr *hdr,
-			   __u32 *command, bool *extended)
+			   u32 *command, bool *extended)
 {
-	__u32 flags;
+	u32 flags;
 
-	if (hdr->command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
+	if (hdr->command & ~(u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
 				  IB_USER_VERBS_CMD_COMMAND_MASK))
 		return -EINVAL;
 
@@ -721,8 +721,8 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
 	struct ib_device *ib_dev;
 	struct ib_uverbs_cmd_hdr hdr;
 	bool extended;
-	__u32 command;
 	int srcu_key;
+	u32 command;
 	ssize_t ret;
 
 	if (!ib_safe_file_access(filp)) {

From 372e15c5db5f3f15423a2e6e6a71b77b39026ecf Mon Sep 17 00:00:00 2001
From: Leon Romanovsky
Date: Wed, 21 Feb 2018 18:12:43 +0200
Subject: [PATCH 021/199] RDMA/uverbs: Reduce number of command header flags
 checks

Simplify the code by directly checking the availability of the extended
command flag instead of doing multiple shift operations.
Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_main.c | 11 ++--------- include/uapi/rdma/ib_user_verbs.h | 5 +---- 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 2a6deecf6f76..fbba831f879e 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -657,19 +657,12 @@ static bool verify_command_idx(u32 command, bool extended) static ssize_t process_hdr(struct ib_uverbs_cmd_hdr *hdr, u32 *command, bool *extended) { - u32 flags; - - if (hdr->command & ~(u32)(IB_USER_VERBS_CMD_FLAGS_MASK | + if (hdr->command & ~(u32)(IB_USER_VERBS_CMD_FLAG_EXTENDED | IB_USER_VERBS_CMD_COMMAND_MASK)) return -EINVAL; *command = hdr->command & IB_USER_VERBS_CMD_COMMAND_MASK; - flags = (hdr->command & - IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT; - - *extended = flags & IB_USER_VERBS_CMD_FLAG_EXTENDED; - if (flags & ~IB_USER_VERBS_CMD_FLAG_EXTENDED) - return -EINVAL; + *extended = hdr->command & IB_USER_VERBS_CMD_FLAG_EXTENDED; if (!verify_command_idx(*command, *extended)) return -EOPNOTSUPP; diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 04d0e67b1312..d56fba09dc8a 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -141,10 +141,7 @@ struct ib_uverbs_cq_moderation_caps { */ #define IB_USER_VERBS_CMD_COMMAND_MASK 0xff -#define IB_USER_VERBS_CMD_FLAGS_MASK 0xff000000u -#define IB_USER_VERBS_CMD_FLAGS_SHIFT 24 - -#define IB_USER_VERBS_CMD_FLAG_EXTENDED 0x80 +#define IB_USER_VERBS_CMD_FLAG_EXTENDED 0x80000000u struct ib_uverbs_cmd_hdr { __u32 command; From 87915bf82eeaaf8c103e61dffe267ff18ffd4021 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 21 Feb 2018 18:12:44 +0200 Subject: [PATCH 022/199] RDMA/verbs: Return proper error code for not supported system call The proper return error is -EOPNOTSUPP and not -ENOSYS, so update all places in verbs.c to match this semantics. Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/verbs.c | 46 ++++++++++++++++----------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 93025d2009b8..2c7b0ceb46e6 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -655,7 +655,7 @@ int rdma_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr) return ah->device->modify_ah ? ah->device->modify_ah(ah, ah_attr) : - -ENOSYS; + -EOPNOTSUPP; } EXPORT_SYMBOL(rdma_modify_ah); @@ -663,7 +663,7 @@ int rdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr) { return ah->device->query_ah ? ah->device->query_ah(ah, ah_attr) : - -ENOSYS; + -EOPNOTSUPP; } EXPORT_SYMBOL(rdma_query_ah); @@ -689,7 +689,7 @@ struct ib_srq *ib_create_srq(struct ib_pd *pd, struct ib_srq *srq; if (!pd->device->create_srq) - return ERR_PTR(-ENOSYS); + return ERR_PTR(-EOPNOTSUPP); srq = pd->device->create_srq(pd, srq_init_attr, NULL); @@ -722,7 +722,7 @@ int ib_modify_srq(struct ib_srq *srq, { return srq->device->modify_srq ? srq->device->modify_srq(srq, srq_attr, srq_attr_mask, NULL) : - -ENOSYS; + -EOPNOTSUPP; } EXPORT_SYMBOL(ib_modify_srq); @@ -730,7 +730,7 @@ int ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr) { return srq->device->query_srq ? 
- srq->device->query_srq(srq, srq_attr) : -ENOSYS; + srq->device->query_srq(srq, srq_attr) : -EOPNOTSUPP; } EXPORT_SYMBOL(ib_query_srq); @@ -1457,7 +1457,7 @@ int ib_query_qp(struct ib_qp *qp, { return qp->device->query_qp ? qp->device->query_qp(qp->real_qp, qp_attr, qp_attr_mask, qp_init_attr) : - -ENOSYS; + -EOPNOTSUPP; } EXPORT_SYMBOL(ib_query_qp); @@ -1594,7 +1594,7 @@ EXPORT_SYMBOL(ib_create_cq); int rdma_set_cq_moderation(struct ib_cq *cq, u16 cq_count, u16 cq_period) { return cq->device->modify_cq ? - cq->device->modify_cq(cq, cq_count, cq_period) : -ENOSYS; + cq->device->modify_cq(cq, cq_count, cq_period) : -EOPNOTSUPP; } EXPORT_SYMBOL(rdma_set_cq_moderation); @@ -1611,7 +1611,7 @@ EXPORT_SYMBOL(ib_destroy_cq); int ib_resize_cq(struct ib_cq *cq, int cqe) { return cq->device->resize_cq ? - cq->device->resize_cq(cq, cqe, NULL) : -ENOSYS; + cq->device->resize_cq(cq, cqe, NULL) : -EOPNOTSUPP; } EXPORT_SYMBOL(ib_resize_cq); @@ -1649,7 +1649,7 @@ struct ib_mr *ib_alloc_mr(struct ib_pd *pd, struct ib_mr *mr; if (!pd->device->alloc_mr) - return ERR_PTR(-ENOSYS); + return ERR_PTR(-EOPNOTSUPP); mr = pd->device->alloc_mr(pd, mr_type, max_num_sg); if (!IS_ERR(mr)) { @@ -1673,7 +1673,7 @@ struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd, struct ib_fmr *fmr; if (!pd->device->alloc_fmr) - return ERR_PTR(-ENOSYS); + return ERR_PTR(-EOPNOTSUPP); fmr = pd->device->alloc_fmr(pd, mr_access_flags, fmr_attr); if (!IS_ERR(fmr)) { @@ -1757,7 +1757,7 @@ int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) int ret; if (!qp->device->attach_mcast) - return -ENOSYS; + return -EOPNOTSUPP; if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) || qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid)) @@ -1775,7 +1775,7 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) int ret; if (!qp->device->detach_mcast) - return -ENOSYS; + return -EOPNOTSUPP; if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) || qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid)) @@ -1793,7 +1793,7 @@ struct ib_xrcd *__ib_alloc_xrcd(struct ib_device *device, const char *caller) struct ib_xrcd *xrcd; if (!device->alloc_xrcd) - return ERR_PTR(-ENOSYS); + return ERR_PTR(-EOPNOTSUPP); xrcd = device->alloc_xrcd(device, NULL, NULL); if (!IS_ERR(xrcd)) { @@ -1847,7 +1847,7 @@ struct ib_wq *ib_create_wq(struct ib_pd *pd, struct ib_wq *wq; if (!pd->device->create_wq) - return ERR_PTR(-ENOSYS); + return ERR_PTR(-EOPNOTSUPP); wq = pd->device->create_wq(pd, wq_attr, NULL); if (!IS_ERR(wq)) { @@ -1902,7 +1902,7 @@ int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, int err; if (!wq->device->modify_wq) - return -ENOSYS; + return -EOPNOTSUPP; err = wq->device->modify_wq(wq, wq_attr, wq_attr_mask, NULL); return err; @@ -1927,7 +1927,7 @@ struct ib_rwq_ind_table *ib_create_rwq_ind_table(struct ib_device *device, u32 table_size; if (!device->create_rwq_ind_table) - return ERR_PTR(-ENOSYS); + return ERR_PTR(-EOPNOTSUPP); table_size = (1 << init_attr->log_ind_tbl_size); rwq_ind_table = device->create_rwq_ind_table(device, @@ -1977,7 +1977,7 @@ struct ib_flow *ib_create_flow(struct ib_qp *qp, { struct ib_flow *flow_id; if (!qp->device->create_flow) - return ERR_PTR(-ENOSYS); + return ERR_PTR(-EOPNOTSUPP); flow_id = qp->device->create_flow(qp, flow_attr, domain); if (!IS_ERR(flow_id)) { @@ -2004,7 +2004,7 @@ int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status) { return mr->device->check_mr_status ? 
- mr->device->check_mr_status(mr, check_mask, mr_status) : -ENOSYS; + mr->device->check_mr_status(mr, check_mask, mr_status) : -EOPNOTSUPP; } EXPORT_SYMBOL(ib_check_mr_status); @@ -2012,7 +2012,7 @@ int ib_set_vf_link_state(struct ib_device *device, int vf, u8 port, int state) { if (!device->set_vf_link_state) - return -ENOSYS; + return -EOPNOTSUPP; return device->set_vf_link_state(device, vf, port, state); } @@ -2022,7 +2022,7 @@ int ib_get_vf_config(struct ib_device *device, int vf, u8 port, struct ifla_vf_info *info) { if (!device->get_vf_config) - return -ENOSYS; + return -EOPNOTSUPP; return device->get_vf_config(device, vf, port, info); } @@ -2032,7 +2032,7 @@ int ib_get_vf_stats(struct ib_device *device, int vf, u8 port, struct ifla_vf_stats *stats) { if (!device->get_vf_stats) - return -ENOSYS; + return -EOPNOTSUPP; return device->get_vf_stats(device, vf, port, stats); } @@ -2042,7 +2042,7 @@ int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid, int type) { if (!device->set_vf_guid) - return -ENOSYS; + return -EOPNOTSUPP; return device->set_vf_guid(device, vf, port, guid, type); } @@ -2077,7 +2077,7 @@ int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset, unsigned int page_size) { if (unlikely(!mr->device->map_mr_sg)) - return -ENOSYS; + return -EOPNOTSUPP; mr->page_size = page_size; From 8efe991e8bd07c415ffe1174853d81c14812a42b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 13 Feb 2018 12:18:39 +0200 Subject: [PATCH 023/199] IB/uverbs: Tidy uverbs_uobject_add Maintaining the uobjects list is mandatory, hoist it into the common rdma_alloc_commit_uobject() function and inline it as there is now only one caller. Signed-off-by: Leon Romanovsky Reviewed-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/rdma_core.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index d8eead5d106d..a6e904973ba8 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -350,13 +350,6 @@ struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_obj_type *type, return type->type_class->alloc_begin(type, ucontext); } -static void uverbs_uobject_add(struct ib_uobject *uobject) -{ - mutex_lock(&uobject->context->uobjects_lock); - list_add(&uobject->list, &uobject->context->uobjects); - mutex_unlock(&uobject->context->uobjects_lock); -} - static int __must_check remove_commit_idr_uobject(struct ib_uobject *uobj, enum rdma_remove_reason why) { @@ -502,7 +495,6 @@ out: static void alloc_commit_idr_uobject(struct ib_uobject *uobj) { - uverbs_uobject_add(uobj); spin_lock(&uobj->context->ufile->idr_lock); /* * We already allocated this IDR with a NULL object, so @@ -518,7 +510,6 @@ static void alloc_commit_fd_uobject(struct ib_uobject *uobj) struct ib_uobject_file *uobj_file = container_of(uobj, struct ib_uobject_file, uobj); - uverbs_uobject_add(&uobj_file->uobj); fd_install(uobj_file->uobj.id, uobj->object); /* This shouldn't be used anymore. 
Use the file object instead */ uobj_file->uobj.id = 0; @@ -545,6 +536,10 @@ int rdma_alloc_commit_uobject(struct ib_uobject *uobj) assert_uverbs_usecnt(uobj, true); atomic_set(&uobj->usecnt, 0); + mutex_lock(&uobj->context->uobjects_lock); + list_add(&uobj->list, &uobj->context->uobjects); + mutex_unlock(&uobj->context->uobjects_lock); + uobj->type->type_class->alloc_commit(uobj); up_read(&uobj->context->cleanup_rwsem); From 74630e8204f3f00671b9b3cdf4101b8deadae42b Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 28 Feb 2018 08:29:56 +0200 Subject: [PATCH 024/199] mailmap: Map Leon Romanovsky's emails Update .mailmap file to point to my primary open-source related e-mail address. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- .mailmap | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index e18cab73e209..e9dbed5e136d 100644 --- a/.mailmap +++ b/.mailmap @@ -100,6 +100,8 @@ Koushik Krzysztof Kozlowski Krzysztof Kozlowski Kuninori Morimoto +Leon Romanovsky +Leon Romanovsky Leonid I Ananiev Linas Vepstas Linus Lüssing From 55de9a77daf3064e4f49296260d975885b574663 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 25 Feb 2018 13:39:52 +0200 Subject: [PATCH 025/199] RDMA/mlx5: Refactor QP type check to be as early as possible Perform QP type check in one place and fail as early as possible. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 5663530ea5fd..476ec4e8305c 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2948,18 +2948,16 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, u16 op; u8 tx_affinity = 0; + mlx5_st = to_mlx5_st(ibqp->qp_type == IB_QPT_DRIVER ? + qp->qp_sub_type : ibqp->qp_type); + if (mlx5_st < 0) + return -EINVAL; + context = kzalloc(sizeof(*context), GFP_KERNEL); if (!context) return -ENOMEM; - err = to_mlx5_st(ibqp->qp_type == IB_QPT_DRIVER ? - qp->qp_sub_type : ibqp->qp_type); - if (err < 0) { - mlx5_ib_dbg(dev, "unsupported qp type %d\n", ibqp->qp_type); - goto out; - } - - context->flags = cpu_to_be32(err << 16); + context->flags = cpu_to_be32(mlx5_st << 16); if (!(attr_mask & IB_QP_PATH_MIG_STATE)) { context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11); @@ -3113,10 +3111,6 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, mlx5_cur = to_mlx5_state(cur_state); mlx5_new = to_mlx5_state(new_state); - mlx5_st = to_mlx5_st(ibqp->qp_type == IB_QPT_DRIVER ? - qp->qp_sub_type : ibqp->qp_type); - if (mlx5_st < 0) - goto out; if (mlx5_cur >= MLX5_QP_NUM_STATE || mlx5_new >= MLX5_QP_NUM_STATE || !optab[mlx5_cur][mlx5_new]) From b7c5bc73680dfcb1bdc3a1c77c0344f045ac017d Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sat, 27 Jan 2018 20:06:59 +0100 Subject: [PATCH 026/199] IB/usnic: Delete an error message for a failed memory allocation in usnic_transport_init() Omit an extra message for a memory allocation failure in this function. This issue was detected by using the Coccinelle software. 
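The shape of the cleanup in this patch and the previous one (a sketch;
struct foo and the function are placeholders): the slab allocator
already emits its own warning with a backtrace when an allocation
fails, so a driver-private error print adds no information:

	struct foo { int x; };	/* placeholder type for the sketch */

	static int alloc_foo(struct foo **out)
	{
		*out = kzalloc(sizeof(**out), GFP_KERNEL);
		if (!*out)
			return -ENOMEM;	/* no extra pr_err() needed */
		return 0;
	}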
Signed-off-by: Markus Elfring Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/usnic/usnic_transport.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/usnic/usnic_transport.c b/drivers/infiniband/hw/usnic/usnic_transport.c index de318389a301..2e9bba52a686 100644 --- a/drivers/infiniband/hw/usnic/usnic_transport.c +++ b/drivers/infiniband/hw/usnic/usnic_transport.c @@ -201,10 +201,8 @@ int usnic_transport_sock_get_addr(struct socket *sock, int *proto, int usnic_transport_init(void) { roce_bitmap = kzalloc(ROCE_BITMAP_SZ, GFP_KERNEL); - if (!roce_bitmap) { - usnic_err("Failed to allocate bit map"); + if (!roce_bitmap) return -ENOMEM; - } /* Do not ever allocate bit 0, hence set it here */ bitmap_set(roce_bitmap, 0, 1); From c7ec83772a08d84e5e6a774a7c49ec1d6dc2dd88 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sat, 27 Jan 2018 21:48:01 +0100 Subject: [PATCH 027/199] RDMA/iwpm: Delete an error message for a failed memory allocation in iwpm_create_nlmsg() Omit an extra message for a memory allocation failure in this function. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/iwpm_util.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c index 81528f64061a..9821ae900f6d 100644 --- a/drivers/infiniband/core/iwpm_util.c +++ b/drivers/infiniband/core/iwpm_util.c @@ -439,10 +439,9 @@ struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh, struct sk_buff *skb = NULL; skb = dev_alloc_skb(IWPM_MSG_SIZE); - if (!skb) { - pr_err("%s Unable to allocate skb\n", __func__); + if (!skb) goto create_nlmsg_exit; - } + if (!(ibnl_put_msg(skb, nlh, 0, 0, nl_client, nl_op, NLM_F_REQUEST))) { pr_warn("%s: Unable to put the nlmsg header\n", __func__); From e5d6574ded7c3a15d02341253929780bc4ee1408 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 20 Feb 2018 21:56:26 +0100 Subject: [PATCH 028/199] infiniband: qplib_fp: fix pointer cast Building for a 32-bit target results in a couple of warnings from casting between a 32-bit pointer and a 64-bit integer: drivers/infiniband/hw/bnxt_re/qplib_fp.c: In function 'bnxt_qplib_service_nq': drivers/infiniband/hw/bnxt_re/qplib_fp.c:333:23: error: cast to pointer from integer of different size [-Werror=int-to-pointer-cast] bnxt_qplib_arm_srq((struct bnxt_qplib_srq *)q_handle, ^ drivers/infiniband/hw/bnxt_re/qplib_fp.c:336:12: error: cast to pointer from integer of different size [-Werror=int-to-pointer-cast] (struct bnxt_qplib_srq *)q_handle, ^ In file included from include/linux/byteorder/little_endian.h:5, from arch/arm/include/uapi/asm/byteorder.h:22, from include/asm-generic/bitops/le.h:6, from arch/arm/include/asm/bitops.h:342, from include/linux/bitops.h:38, from include/linux/kernel.h:11, from include/linux/interrupt.h:6, from drivers/infiniband/hw/bnxt_re/qplib_fp.c:39: drivers/infiniband/hw/bnxt_re/qplib_fp.c: In function 'bnxt_qplib_create_srq': include/uapi/linux/byteorder/little_endian.h:31:43: error: cast from pointer to integer of different size [-Werror=pointer-to-int-cast] #define __cpu_to_le64(x) ((__force __le64)(__u64)(x)) ^ include/linux/byteorder/generic.h:86:21: note: in expansion of macro '__cpu_to_le64' #define cpu_to_le64 __cpu_to_le64 ^~~~~~~~~~~~~ drivers/infiniband/hw/bnxt_re/qplib_fp.c:569:19: note: in expansion of macro 'cpu_to_le64' req.srq_handle = cpu_to_le64(srq); Using a uintptr_t 
as an intermediate works on all architectures. Fixes: 37cb11acf1f7 ("RDMA/bnxt_re: Add SRQ support for Broadcom adapters") Signed-off-by: Arnd Bergmann Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 3ea5b9624f6b..d85bf7be577d 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -283,7 +283,7 @@ static void bnxt_qplib_service_nq(unsigned long data) u32 sw_cons, raw_cons; u16 type; int budget = nq->budget; - u64 q_handle; + uintptr_t q_handle; /* Service the NQ until empty */ raw_cons = hwq->cons; @@ -566,7 +566,7 @@ int bnxt_qplib_create_srq(struct bnxt_qplib_res *res, /* Configure the request */ req.dpi = cpu_to_le32(srq->dpi->dpi); - req.srq_handle = cpu_to_le64(srq); + req.srq_handle = cpu_to_le64((uintptr_t)srq); req.srq_size = cpu_to_le16((u16)srq->hwq.max_elements); pbl = &srq->hwq.pbl[PBL_LVL_0]; From a8ed748708ba9c9976fb0d7b4844f4c2fa5ecb34 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 20 Feb 2018 21:56:27 +0100 Subject: [PATCH 029/199] infiniband: bnxt_re: use BIT_ULL() for 64-bit bit masks On 32-bit targets, we otherwise get a warning about an impossible constant integer expression: In file included from include/linux/kernel.h:11, from include/linux/interrupt.h:6, from drivers/infiniband/hw/bnxt_re/ib_verbs.c:39: drivers/infiniband/hw/bnxt_re/ib_verbs.c: In function 'bnxt_re_query_device': include/linux/bitops.h:7:24: error: left shift count >= width of type [-Werror=shift-count-overflow] #define BIT(nr) (1UL << (nr)) ^~ drivers/infiniband/hw/bnxt_re/bnxt_re.h:61:34: note: in expansion of macro 'BIT' #define BNXT_RE_MAX_MR_SIZE_HIGH BIT(39) ^~~ drivers/infiniband/hw/bnxt_re/bnxt_re.h:62:30: note: in expansion of macro 'BNXT_RE_MAX_MR_SIZE_HIGH' #define BNXT_RE_MAX_MR_SIZE BNXT_RE_MAX_MR_SIZE_HIGH ^~~~~~~~~~~~~~~~~~~~~~~~ drivers/infiniband/hw/bnxt_re/ib_verbs.c:149:25: note: in expansion of macro 'BNXT_RE_MAX_MR_SIZE' ib_attr->max_mr_size = BNXT_RE_MAX_MR_SIZE; ^~~~~~~~~~~~~~~~~~~ Fixes: 872f3578241d ("RDMA/bnxt_re: Add support for MRs with Huge pages") Signed-off-by: Arnd Bergmann Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 4 ++-- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 3eb7a8387116..96f76896488d 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -57,8 +57,8 @@ #define BNXT_RE_PAGE_SIZE_8M BIT(BNXT_RE_PAGE_SHIFT_8M) #define BNXT_RE_PAGE_SIZE_1G BIT(BNXT_RE_PAGE_SHIFT_1G) -#define BNXT_RE_MAX_MR_SIZE_LOW BIT(BNXT_RE_PAGE_SHIFT_1G) -#define BNXT_RE_MAX_MR_SIZE_HIGH BIT(39) +#define BNXT_RE_MAX_MR_SIZE_LOW BIT_ULL(BNXT_RE_PAGE_SHIFT_1G) +#define BNXT_RE_MAX_MR_SIZE_HIGH BIT_ULL(39) #define BNXT_RE_MAX_MR_SIZE BNXT_RE_MAX_MR_SIZE_HIGH #define BNXT_RE_MAX_QPC_COUNT (64 * 1024) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 643174d949a8..6593d4cad26a 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -3586,7 +3586,7 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length, int umem_pgs, page_shift, rc; if (length > BNXT_RE_MAX_MR_SIZE) { - dev_err(rdev_to_dev(rdev), "MR 
Size: %lld > Max supported:%ld\n", + dev_err(rdev_to_dev(rdev), "MR Size: %lld > Max supported:%lld\n", length, BNXT_RE_MAX_MR_SIZE); return ERR_PTR(-ENOMEM); } From e68088e78d82920632eba112b968e49d588d02a2 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 23 Feb 2018 14:09:24 -0800 Subject: [PATCH 030/199] IB/srp: Fix srp_abort() Before commit e494f6a72839 ("[SCSI] improved eh timeout handler") it did not really matter whether or not abort handlers like srp_abort() called .scsi_done() when returning another value than SUCCESS. Since that commit however this matters. Hence only call .scsi_done() when returning SUCCESS. Signed-off-by: Bart Van Assche Cc: stable@vger.kernel.org Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/srp/ib_srp.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 241c0e72dce3..4a1a489ce8bb 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -2974,9 +2974,11 @@ static int srp_abort(struct scsi_cmnd *scmnd) ret = FAST_IO_FAIL; else ret = FAILED; - srp_free_req(ch, req, scmnd, 0); - scmnd->result = DID_ABORT << 16; - scmnd->scsi_done(scmnd); + if (ret == SUCCESS) { + srp_free_req(ch, req, scmnd, 0); + scmnd->result = DID_ABORT << 16; + scmnd->scsi_done(scmnd); + } return ret; } From c74ff7501e8dda9e9542a1fcabb2233776c1d19d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 23 Feb 2018 14:09:25 -0800 Subject: [PATCH 031/199] Revert "IB/srp: Avoid that a cable pull can trigger a kernel crash" The caller of srp_ib_lookup_path() is responsible for holding a reference on the SCSI host. That means that commit 8a0d18c62121 was not necessary. Hence revert it. Signed-off-by: Bart Van Assche Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/srp/ib_srp.c | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 4a1a489ce8bb..4021d608fe85 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -765,19 +765,12 @@ static void srp_path_rec_completion(int status, static int srp_ib_lookup_path(struct srp_rdma_ch *ch) { struct srp_target_port *target = ch->target; - int ret = -ENODEV; + int ret; ch->ib_cm.path.numb_path = 1; init_completion(&ch->done); - /* - * Avoid that the SCSI host can be removed by srp_remove_target() - * before srp_path_rec_completion() is called. 
- */ - if (!scsi_host_get(target->scsi_host)) - goto out; - ch->ib_cm.path_query_id = ib_sa_path_rec_get(&srp_sa_client, target->srp_host->srp_dev->dev, target->srp_host->port, @@ -791,27 +784,21 @@ static int srp_ib_lookup_path(struct srp_rdma_ch *ch) GFP_KERNEL, srp_path_rec_completion, ch, &ch->ib_cm.path_query); - ret = ch->ib_cm.path_query_id; - if (ret < 0) - goto put; + if (ch->ib_cm.path_query_id < 0) + return ch->ib_cm.path_query_id; ret = wait_for_completion_interruptible(&ch->done); if (ret < 0) - goto put; + return ret; - ret = ch->status; - if (ret < 0) + if (ch->status < 0) shost_printk(KERN_WARNING, target->scsi_host, PFX "Path record query failed: sgid %pI6, dgid %pI6, pkey %#04x, service_id %#16llx\n", ch->ib_cm.path.sgid.raw, ch->ib_cm.path.dgid.raw, be16_to_cpu(target->ib_cm.pkey), be64_to_cpu(target->ib_cm.service_id)); -put: - scsi_host_put(target->scsi_host); - -out: - return ret; + return ch->status; } static int srp_rdma_lookup_path(struct srp_rdma_ch *ch) From 7da09af91d51561f373bedcd7a9d521ac79ee695 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 23 Feb 2018 14:09:26 -0800 Subject: [PATCH 032/199] IB/srp: Use %pIS instead of inet_ntop() Except for a minor log message change, this patch does not change any functionality. For the introduction of %pIS, see also commit 1067964305df ("lib: vsprintf: add IPv4/v6 generic %p[Ii]S[pfs] format specifier"). Signed-off-by: Bart Van Assche Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/srp/ib_srp.c | 52 ++++++----------------------- 1 file changed, 10 insertions(+), 42 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 4021d608fe85..d61f48a86508 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -327,29 +327,10 @@ static int srp_new_ib_cm_id(struct srp_rdma_ch *ch) return 0; } -static const char *inet_ntop(const void *sa, char *dst, unsigned int size) -{ - switch (((struct sockaddr *)sa)->sa_family) { - case AF_INET: - snprintf(dst, size, "%pI4", - &((struct sockaddr_in *)sa)->sin_addr); - break; - case AF_INET6: - snprintf(dst, size, "%pI6", - &((struct sockaddr_in6 *)sa)->sin6_addr); - break; - default: - snprintf(dst, size, "???"); - break; - } - return dst; -} - static int srp_new_rdma_cm_id(struct srp_rdma_ch *ch) { struct srp_target_port *target = ch->target; struct rdma_cm_id *new_cm_id; - char src_addr[64], dst_addr[64]; int ret; new_cm_id = rdma_create_id(target->net, srp_rdma_cm_handler, ch, @@ -366,13 +347,8 @@ static int srp_new_rdma_cm_id(struct srp_rdma_ch *ch) (struct sockaddr *)&target->rdma_cm.dst, SRP_PATH_REC_TIMEOUT_MS); if (ret) { - pr_err("No route available from %s to %s (%d)\n", - target->rdma_cm.src_specified ? 
- inet_ntop(&target->rdma_cm.src, src_addr, - sizeof(src_addr)) : "(any)", - inet_ntop(&target->rdma_cm.dst, dst_addr, - sizeof(dst_addr)), - ret); + pr_err("No route available from %pIS to %pIS (%d)\n", + &target->rdma_cm.src, &target->rdma_cm.dst, ret); goto out; } ret = wait_for_completion_interruptible(&ch->done); @@ -381,10 +357,8 @@ static int srp_new_rdma_cm_id(struct srp_rdma_ch *ch) ret = ch->status; if (ret) { - pr_err("Resolving address %s failed (%d)\n", - inet_ntop(&target->rdma_cm.dst, dst_addr, - sizeof(dst_addr)), - ret); + pr_err("Resolving address %pIS failed (%d)\n", + &target->rdma_cm.dst, ret); goto out; } @@ -3778,14 +3752,11 @@ static ssize_t srp_create_target(struct device *dev, if (!srp_conn_unique(target->srp_host, target)) { if (target->using_rdma_cm) { - char dst_addr[64]; - shost_printk(KERN_INFO, target->scsi_host, - PFX "Already connected to target port with id_ext=%016llx;ioc_guid=%016llx;dest=%s\n", + PFX "Already connected to target port with id_ext=%016llx;ioc_guid=%016llx;dest=%pIS\n", be64_to_cpu(target->id_ext), be64_to_cpu(target->ioc_guid), - inet_ntop(&target->rdma_cm.dst, dst_addr, - sizeof(dst_addr))); + &target->rdma_cm.dst); } else { shost_printk(KERN_INFO, target->scsi_host, PFX "Already connected to target port with id_ext=%016llx;ioc_guid=%016llx;initiator_ext=%016llx\n", @@ -3894,8 +3865,8 @@ static ssize_t srp_create_target(struct device *dev, char dst[64]; if (target->using_rdma_cm) - inet_ntop(&target->rdma_cm.dst, dst, - sizeof(dst)); + snprintf(dst, sizeof(dst), "%pIS", + &target->rdma_cm.dst); else snprintf(dst, sizeof(dst), "%pI6", target->ib_cm.orig_dgid.raw); @@ -3928,14 +3899,11 @@ connected: if (target->state != SRP_TARGET_REMOVED) { if (target->using_rdma_cm) { - char dst[64]; - - inet_ntop(&target->rdma_cm.dst, dst, sizeof(dst)); shost_printk(KERN_DEBUG, target->scsi_host, PFX - "new target: id_ext %016llx ioc_guid %016llx sgid %pI6 dest %s\n", + "new target: id_ext %016llx ioc_guid %016llx sgid %pI6 dest %pIS\n", be64_to_cpu(target->id_ext), be64_to_cpu(target->ioc_guid), - target->sgid.raw, dst); + target->sgid.raw, &target->rdma_cm.dst); } else { shost_printk(KERN_DEBUG, target->scsi_host, PFX "new target: id_ext %016llx ioc_guid %016llx pkey %04x service_id %016llx sgid %pI6 dgid %pI6\n", From 4f1d58343210cf52f2cb96c2499c81e2ae8d62c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hern=C3=A1n=20Gonzalez?= Date: Tue, 27 Feb 2018 19:05:42 -0300 Subject: [PATCH 033/199] IB/qib: Remove unused variable (char *qib_sdma_event_names[]) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Note: This is compile only tested as I have no access to the hw. This variable was not used anywhere in the code. Removing it saves 88 bytes. 
add/remove: 0/1 grow/shrink: 0/0 up/down: 0/-88 (-88) Function old new delta qib_sdma_event_names 88 - -88 Total: Before=2874565, After=2874477, chg -0.00% Signed-off-by: Hernán Gonzalez Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qib/qib.h | 1 - drivers/infiniband/hw/qib/qib_sdma.c | 14 -------------- 2 files changed, 15 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h index 0235f76bbc72..b709ebd1fb2e 100644 --- a/drivers/infiniband/hw/qib/qib.h +++ b/drivers/infiniband/hw/qib/qib.h @@ -473,7 +473,6 @@ enum qib_sdma_events { }; extern char *qib_sdma_state_names[]; -extern char *qib_sdma_event_names[]; struct sdma_set_state_action { unsigned op_enable:1; diff --git a/drivers/infiniband/hw/qib/qib_sdma.c b/drivers/infiniband/hw/qib/qib_sdma.c index c3690bd51582..1f2d7a054d90 100644 --- a/drivers/infiniband/hw/qib/qib_sdma.c +++ b/drivers/infiniband/hw/qib/qib_sdma.c @@ -64,20 +64,6 @@ char *qib_sdma_state_names[] = { [qib_sdma_state_s99_running] = "s99_Running", }; -char *qib_sdma_event_names[] = { - [qib_sdma_event_e00_go_hw_down] = "e00_GoHwDown", - [qib_sdma_event_e10_go_hw_start] = "e10_GoHwStart", - [qib_sdma_event_e20_hw_started] = "e20_HwStarted", - [qib_sdma_event_e30_go_running] = "e30_GoRunning", - [qib_sdma_event_e40_sw_cleaned] = "e40_SwCleaned", - [qib_sdma_event_e50_hw_cleaned] = "e50_HwCleaned", - [qib_sdma_event_e60_hw_halted] = "e60_HwHalted", - [qib_sdma_event_e70_go_idle] = "e70_GoIdle", - [qib_sdma_event_e7220_err_halted] = "e7220_ErrHalted", - [qib_sdma_event_e7322_err_halted] = "e7322_ErrHalted", - [qib_sdma_event_e90_timer_tick] = "e90_TimerTick", -}; - /* declare all statics here rather than keep sorting */ static int alloc_sdma(struct qib_pportdata *); static void sdma_complete(struct kref *); From 7f566a91b126075953949dbfe28a7221e9703aff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hern=C3=A1n=20Gonzalez?= Date: Tue, 27 Feb 2018 19:05:43 -0300 Subject: [PATCH 034/199] IB/qib: Move char *qib_sdma_state_names[] and constify while there. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Note: This is compile only tested as I have no access to the hw. This variable was not used in qib_sdma.c but in qib_iba7322.c. Declaring it there, as static, saves 56 bytes. 
add/remove: 0/2 grow/shrink: 0/0 up/down: 0/-144 (-144) Function old new delta qib_sdma_state_names 56 - -56 qib_sdma_event_names 88 - -88 Total: Before=2874565, After=2874421, chg -0.01% Signed-off-by: Hernán Gonzalez Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qib/qib.h | 2 -- drivers/infiniband/hw/qib/qib_iba7322.c | 10 ++++++++++ drivers/infiniband/hw/qib/qib_sdma.c | 10 ---------- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h index b709ebd1fb2e..46072455130c 100644 --- a/drivers/infiniband/hw/qib/qib.h +++ b/drivers/infiniband/hw/qib/qib.h @@ -472,8 +472,6 @@ enum qib_sdma_events { qib_sdma_event_e90_timer_tick, }; -extern char *qib_sdma_state_names[]; - struct sdma_set_state_action { unsigned op_enable:1; unsigned op_intenable:1; diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index 6265dac415fc..8414ae44a518 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -463,6 +463,16 @@ static u8 ib_rate_to_delay[IB_RATE_120_GBPS + 1] = { [IB_RATE_40_GBPS] = 1 }; +static const char * const qib_sdma_state_names[] = { + [qib_sdma_state_s00_hw_down] = "s00_HwDown", + [qib_sdma_state_s10_hw_start_up_wait] = "s10_HwStartUpWait", + [qib_sdma_state_s20_idle] = "s20_Idle", + [qib_sdma_state_s30_sw_clean_up_wait] = "s30_SwCleanUpWait", + [qib_sdma_state_s40_hw_clean_up_wait] = "s40_HwCleanUpWait", + [qib_sdma_state_s50_hw_halt_wait] = "s50_HwHaltWait", + [qib_sdma_state_s99_running] = "s99_Running", +}; + #define IBA7322_LINKSPEED_SHIFT SYM_LSB(IBCStatusA_0, LinkSpeedActive) #define IBA7322_LINKWIDTH_SHIFT SYM_LSB(IBCStatusA_0, LinkWidthActive) diff --git a/drivers/infiniband/hw/qib/qib_sdma.c b/drivers/infiniband/hw/qib/qib_sdma.c index 1f2d7a054d90..d0723d4aef5c 100644 --- a/drivers/infiniband/hw/qib/qib_sdma.c +++ b/drivers/infiniband/hw/qib/qib_sdma.c @@ -54,16 +54,6 @@ MODULE_PARM_DESC(sdma_descq_cnt, "Number of SDMA descq entries"); #define SDMA_DESC_COUNT_LSB 16 #define SDMA_DESC_GEN_LSB 30 -char *qib_sdma_state_names[] = { - [qib_sdma_state_s00_hw_down] = "s00_HwDown", - [qib_sdma_state_s10_hw_start_up_wait] = "s10_HwStartUpWait", - [qib_sdma_state_s20_idle] = "s20_Idle", - [qib_sdma_state_s30_sw_clean_up_wait] = "s30_SwCleanUpWait", - [qib_sdma_state_s40_hw_clean_up_wait] = "s40_HwCleanUpWait", - [qib_sdma_state_s50_hw_halt_wait] = "s50_HwHaltWait", - [qib_sdma_state_s99_running] = "s99_Running", -}; - /* declare all statics here rather than keep sorting */ static int alloc_sdma(struct qib_pportdata *); static void sdma_complete(struct kref *); From c33bab622d47b7db55b387096c1a5c8e02f5bf37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hern=C3=A1n=20Gonzalez?= Date: Tue, 27 Feb 2018 19:07:58 -0300 Subject: [PATCH 035/199] IB/rxe: Remove unused variable (char *rxe_qp_state_name[]) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Note: This is compile only tested as I have no access to the hw. This variable was not used anywhere in the code. Removing it saves 24 bytes. 
add/remove: 0/1 grow/shrink: 0/0 up/down: 0/-24 (-24) Function old new delta rxe_qp_state_name 24 - -24 Total: Before=3348732, After=3348708, chg -0.00% Signed-off-by: Hernán Gonzalez Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_qp.c | 9 --------- drivers/infiniband/sw/rxe/rxe_verbs.h | 2 -- 2 files changed, 11 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 2fcf1cab7678..98a7a19146a8 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -40,15 +40,6 @@ #include "rxe_queue.h" #include "rxe_task.h" -char *rxe_qp_state_name[] = { - [QP_STATE_RESET] = "RESET", - [QP_STATE_INIT] = "INIT", - [QP_STATE_READY] = "READY", - [QP_STATE_DRAIN] = "DRAIN", - [QP_STATE_DRAINED] = "DRAINED", - [QP_STATE_ERROR] = "ERROR", -}; - static int rxe_qp_chk_cap(struct rxe_dev *rxe, struct ib_qp_cap *cap, int has_srq) { diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 1019f5e7dbdd..af1470d29391 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -139,8 +139,6 @@ enum rxe_qp_state { QP_STATE_ERROR }; -extern char *rxe_qp_state_name[]; - struct rxe_req_info { enum rxe_qp_state state; int wqe_index; From 042932f7a3274126ee3bfb2516e8d8260c545b11 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 1 Mar 2018 16:23:54 +0000 Subject: [PATCH 036/199] infiniband: remove redundant assignment to pointer 'rdi' The pointer rdi is being initialized with a value that is never read and re-assigned immediately after, hence the initialization is redundant and can be removed. Cleans up clang warning: drivers/infiniband/sw/rdmavt/vt.c:94:23: warning: Value stored to 'rdi' during its initialization is never read Signed-off-by: Colin Ian King Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rdmavt/vt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index a4553b2b3696..a67b0ddc2230 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -91,7 +91,7 @@ module_exit(rvt_cleanup); */ struct rvt_dev_info *rvt_alloc_device(size_t size, int nports) { - struct rvt_dev_info *rdi = ERR_PTR(-ENOMEM); + struct rvt_dev_info *rdi; rdi = (struct rvt_dev_info *)ib_alloc_device(size); if (!rdi) From a1ae7d0345edd593d6725d3218434d903a0af95d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 1 Mar 2018 14:00:28 -0800 Subject: [PATCH 037/199] RDMA/core: Avoid that ib_drain_qp() triggers an out-of-bounds stack access This patch fixes the following KASAN complaint: ================================================================== BUG: KASAN: stack-out-of-bounds in rxe_post_send+0x77d/0x9b0 [rdma_rxe] Read of size 8 at addr ffff880061aef860 by task 01/1080 CPU: 2 PID: 1080 Comm: 01 Not tainted 4.16.0-rc3-dbg+ #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014 Call Trace: dump_stack+0x85/0xc7 print_address_description+0x65/0x270 kasan_report+0x231/0x350 rxe_post_send+0x77d/0x9b0 [rdma_rxe] __ib_drain_sq+0x1ad/0x250 [ib_core] ib_drain_qp+0x9/0x30 [ib_core] srp_destroy_qp+0x51/0x70 [ib_srp] srp_free_ch_ib+0xfc/0x380 [ib_srp] srp_create_target+0x1071/0x19e0 [ib_srp] kernfs_fop_write+0x180/0x210 __vfs_write+0xb1/0x2e0 vfs_write+0xf6/0x250 SyS_write+0x99/0x110 do_syscall_64+0xee/0x2b0 entry_SYSCALL_64_after_hwframe+0x42/0xb7 The buggy address belongs to the page: 
page:ffffea000186bbc0 count:0 mapcount:0 mapping:0000000000000000 index:0x0 flags: 0x4000000000000000() raw: 4000000000000000 0000000000000000 0000000000000000 00000000ffffffff raw: 0000000000000000 ffffea000186bbe0 0000000000000000 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff880061aef700: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffff880061aef780: 00 00 00 00 00 00 00 00 00 00 00 f1 f1 f1 f1 00 >ffff880061aef800: f2 f2 f2 f2 f2 f2 f2 00 00 00 00 00 f2 f2 f2 f2 ^ ffff880061aef880: f2 f2 f2 00 00 00 00 00 00 00 00 00 00 00 f2 f2 ffff880061aef900: f2 f2 f2 00 00 00 00 00 00 00 00 00 00 00 00 00 ================================================================== Fixes: 765d67748bcf ("IB: new common API for draining queues") Signed-off-by: Bart Van Assche Cc: Steve Wise Cc: Sagi Grimberg Cc: stable@vger.kernel.org Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/verbs.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 2c7b0ceb46e6..4e2b231b03f7 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -2194,7 +2194,13 @@ static void __ib_drain_sq(struct ib_qp *qp) struct ib_cq *cq = qp->send_cq; struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; struct ib_drain_cqe sdrain; - struct ib_send_wr swr = {}, *bad_swr; + struct ib_send_wr *bad_swr; + struct ib_rdma_wr swr = { + .wr = { + .opcode = IB_WR_RDMA_WRITE, + .wr_cqe = &sdrain.cqe, + }, + }; int ret; ret = ib_modify_qp(qp, &attr, IB_QP_STATE); @@ -2203,11 +2209,10 @@ static void __ib_drain_sq(struct ib_qp *qp) return; } - swr.wr_cqe = &sdrain.cqe; sdrain.cqe.done = ib_drain_qp_done; init_completion(&sdrain.done); - ret = ib_post_send(qp, &swr, &bad_swr); + ret = ib_post_send(qp, &swr.wr, &bad_swr); if (ret) { WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); return; From a6544a624c3ff92a64e4aca3931fa064607bd3da Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 1 Mar 2018 14:00:29 -0800 Subject: [PATCH 038/199] RDMA/rxe: Fix an out-of-bounds read This patch avoids that KASAN reports the following when the SRP initiator calls srp_post_send(): ================================================================== BUG: KASAN: stack-out-of-bounds in rxe_post_send+0x5c4/0x980 [rdma_rxe] Read of size 8 at addr ffff880066606e30 by task 02-mq/1074 CPU: 2 PID: 1074 Comm: 02-mq Not tainted 4.16.0-rc3-dbg+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014 Call Trace: dump_stack+0x85/0xc7 print_address_description+0x65/0x270 kasan_report+0x231/0x350 rxe_post_send+0x5c4/0x980 [rdma_rxe] srp_post_send.isra.16+0x149/0x190 [ib_srp] srp_queuecommand+0x94d/0x1670 [ib_srp] scsi_dispatch_cmd+0x1c2/0x550 [scsi_mod] scsi_queue_rq+0x843/0xa70 [scsi_mod] blk_mq_dispatch_rq_list+0x143/0xac0 blk_mq_do_dispatch_ctx+0x1c5/0x260 blk_mq_sched_dispatch_requests+0x2bf/0x2f0 __blk_mq_run_hw_queue+0xdb/0x160 __blk_mq_delay_run_hw_queue+0xba/0x100 blk_mq_run_hw_queue+0xf2/0x190 blk_mq_sched_insert_request+0x163/0x2f0 blk_execute_rq+0xb0/0x130 scsi_execute+0x14e/0x260 [scsi_mod] scsi_probe_and_add_lun+0x366/0x13d0 [scsi_mod] __scsi_scan_target+0x18a/0x810 [scsi_mod] scsi_scan_target+0x11e/0x130 [scsi_mod] srp_create_target+0x1522/0x19e0 [ib_srp] kernfs_fop_write+0x180/0x210 __vfs_write+0xb1/0x2e0 vfs_write+0xf6/0x250 SyS_write+0x99/0x110 do_syscall_64+0xee/0x2b0 entry_SYSCALL_64_after_hwframe+0x42/0xb7 The 
buggy address belongs to the page: page:ffffea0001998180 count:0 mapcount:0 mapping:0000000000000000 index:0x0 flags: 0x4000000000000000() raw: 4000000000000000 0000000000000000 0000000000000000 00000000ffffffff raw: dead000000000100 dead000000000200 0000000000000000 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff880066606d00: 00 00 00 00 00 00 00 00 00 00 00 00 00 f1 f1 f1 ffff880066606d80: f1 00 f2 f2 f2 f2 f2 f2 f2 00 00 f2 f2 f2 f2 f2 >ffff880066606e00: f2 00 00 00 00 00 f2 f2 f2 f3 f3 f3 f3 00 00 00 ^ ffff880066606e80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffff880066606f00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ================================================================== Fixes: 8700e3e7c485 ("Soft RoCE driver") Signed-off-by: Bart Van Assche Cc: Moni Shoua Cc: stable@vger.kernel.org Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_verbs.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index f4bab2cd0ec2..45594091353c 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -711,9 +711,8 @@ static int init_send_wqe(struct rxe_qp *qp, struct ib_send_wr *ibwr, memcpy(wqe->dma.sge, ibwr->sg_list, num_sge * sizeof(struct ib_sge)); - wqe->iova = (mask & WR_ATOMIC_MASK) ? - atomic_wr(ibwr)->remote_addr : - rdma_wr(ibwr)->remote_addr; + wqe->iova = mask & WR_ATOMIC_MASK ? atomic_wr(ibwr)->remote_addr : + mask & WR_READ_OR_WRITE_MASK ? rdma_wr(ibwr)->remote_addr : 0; wqe->mask = mask; wqe->dma.length = length; wqe->dma.resid = length; From 2a78cb4db487372152bed2055c038f9634d595e8 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 1 Mar 2018 14:00:30 -0800 Subject: [PATCH 039/199] IB/srpt: Fix an out-of-bounds stack access in srpt_zerolength_write() Avoid triggering an out-of-bounds stack access by changing the type of 'wr' from ib_send_wr into ib_rdma_wr. 
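The mechanism is the same one behind the two preceding fixes: for RDMA opcodes a provider upcasts the posted work request to its container structure and reads fields (remote_addr, rkey) that a bare, stack-allocated ib_send_wr does not have. The upcast helper in include/rdma/ib_verbs.h of this era is essentially the sketch below, so allocating a struct ib_rdma_wr and posting its embedded .wr member gives the provider the full container it expects.

#include <rdma/ib_verbs.h>

/* Simplified restatement of the in-tree rdma_wr() helper. */
static inline struct ib_rdma_wr *example_rdma_wr(struct ib_send_wr *wr)
{
	/* For IB_WR_RDMA_* opcodes the provider assumes 'wr' is embedded
	 * in a struct ib_rdma_wr and reads past a bare ib_send_wr. */
	return container_of(wr, struct ib_rdma_wr, wr);
}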
This patch fixes the following KASAN bug report: BUG: KASAN: stack-out-of-bounds in rxe_post_send+0x7a9/0x9a0 [rdma_rxe] Read of size 8 at addr ffff880068197a48 by task kworker/2:1/44 Workqueue: ib_cm cm_work_handler [ib_cm] Call Trace: dump_stack+0x8e/0xcd print_address_description+0x6f/0x280 kasan_report+0x25a/0x380 __asan_load8+0x54/0x90 rxe_post_send+0x7a9/0x9a0 [rdma_rxe] srpt_zerolength_write+0xf0/0x180 [ib_srpt] srpt_cm_rtu_recv+0x68/0x110 [ib_srpt] srpt_rdma_cm_handler+0xbb/0x15b [ib_srpt] cma_ib_handler+0x1aa/0x4a0 [rdma_cm] cm_process_work+0x30/0x100 [ib_cm] cm_work_handler+0xa86/0x351b [ib_cm] process_one_work+0x475/0x9f0 worker_thread+0x69/0x690 kthread+0x1ad/0x1d0 ret_from_fork+0x3a/0x50 Fixes: aaf45bd83eba ("IB/srpt: Detect session shutdown reliably") Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: stable@vger.kernel.org Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/srpt/ib_srpt.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 0373b7c40902..33454309a98b 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -838,16 +838,19 @@ static int srpt_post_recv(struct srpt_device *sdev, struct srpt_rdma_ch *ch, */ static int srpt_zerolength_write(struct srpt_rdma_ch *ch) { - struct ib_send_wr wr, *bad_wr; + struct ib_send_wr *bad_wr; + struct ib_rdma_wr wr = { + .wr = { + .opcode = IB_WR_RDMA_WRITE, + .wr_cqe = &ch->zw_cqe, + .send_flags = IB_SEND_SIGNALED, + } + }; pr_debug("%s-%d: queued zerolength write\n", ch->sess_name, ch->qp->qp_num); - memset(&wr, 0, sizeof(wr)); - wr.opcode = IB_WR_RDMA_WRITE; - wr.wr_cqe = &ch->zw_cqe; - wr.send_flags = IB_SEND_SIGNALED; - return ib_post_send(ch->qp, &wr, &bad_wr); + return ib_post_send(ch->qp, &wr.wr, &bad_wr); } static void srpt_zerolength_write_done(struct ib_cq *cq, struct ib_wc *wc) From 6b0c549fc616a2024178fce276df80fd138f3c31 Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Fri, 2 Mar 2018 15:17:12 -0600 Subject: [PATCH 040/199] i40iw: Refactor handling of txpend list Currently the TX pending lists for IEQ and ILQ are handled separately. The handling of both can be consolidated in i40iw_poll_completion. 
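Restated from the hunk below as a compact sketch (i40iw naming, illustrative only), the consolidated drain point reduces to: return the send credit on a puda TX completion and, if buffers remain on the resource's own txpend list, kick the next send from this single place instead of from separate ILQ and IEQ paths.

#include "i40iw_puda.h"

static void example_tx_complete(struct i40iw_puda_rsrc *rsrc)
{
	unsigned long flags;

	spin_lock_irqsave(&rsrc->bufpool_lock, flags);
	rsrc->tx_wqe_avail_cnt++;	/* return the send credit */
	spin_unlock_irqrestore(&rsrc->bufpool_lock, flags);

	/* One drain point for both ILQ and IEQ pending buffers. */
	if (!list_empty(&rsrc->txpend))
		i40iw_puda_send_buf(rsrc, NULL);
}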
Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_puda.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_puda.c b/drivers/infiniband/hw/i40iw/i40iw_puda.c index 4c21197830b3..d9c7ae6a7030 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_puda.c +++ b/drivers/infiniband/hw/i40iw/i40iw_puda.c @@ -348,8 +348,8 @@ enum i40iw_status_code i40iw_puda_poll_completion(struct i40iw_sc_dev *dev, spin_lock_irqsave(&rsrc->bufpool_lock, flags); rsrc->tx_wqe_avail_cnt++; spin_unlock_irqrestore(&rsrc->bufpool_lock, flags); - if (!list_empty(&rsrc->vsi->ilq->txpend)) - i40iw_puda_send_buf(rsrc->vsi->ilq, NULL); + if (!list_empty(&rsrc->txpend)) + i40iw_puda_send_buf(rsrc, NULL); } done: @@ -1471,10 +1471,6 @@ static void i40iw_ieq_tx_compl(struct i40iw_sc_vsi *vsi, void *sqwrid) struct i40iw_puda_buf *buf = (struct i40iw_puda_buf *)sqwrid; i40iw_puda_ret_bufpool(ieq, buf); - if (!list_empty(&ieq->txpend)) { - buf = i40iw_puda_get_listbuf(&ieq->txpend); - i40iw_puda_send_buf(ieq, buf); - } } /** From 7de8b3576ab88bf8b3307eac2972d3b58dc28708 Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Fri, 2 Mar 2018 15:17:13 -0600 Subject: [PATCH 041/199] i40iw: Improve CM node lookup time on connection setup Currently all CM nodes involved in a connection are maintained in a connected_node list per dev. During connection setup, we need to search this every time we receive a packet on the iWARP LAN Queue (ILQ) and this can be pretty inefficient for large number of connections. Fix this by organizing the CM nodes in two lists - accelerated list and non-accelerated list. The search on ILQ receive would be limited to only non accelerated nodes. When a node moves to RTS, it is added to the accelerated list. Benchmarking ucmatose 16k connections shows a 20% improvement in test completion time. 
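The hot-path effect can be sketched as follows (illustrative only: the real i40iw_find_node also matches the local and remote IP addresses and takes an accelerated_list flag): because nodes that reach RTS migrate to the accelerated list, the per-packet ILQ lookup now walks only connections still in setup.

#include <linux/list.h>
#include <linux/spinlock.h>
#include "i40iw_cm.h"

static struct i40iw_cm_node *find_pending_node(struct i40iw_cm_core *cm_core,
					       u16 loc_port, u16 rem_port)
{
	struct i40iw_cm_node *node;
	unsigned long flags;

	spin_lock_irqsave(&cm_core->ht_lock, flags);
	list_for_each_entry(node, &cm_core->non_accelerated_list, list) {
		if (node->loc_port == loc_port &&
		    node->rem_port == rem_port) {
			atomic_inc(&node->ref_count);
			spin_unlock_irqrestore(&cm_core->ht_lock, flags);
			return node;
		}
	}
	spin_unlock_irqrestore(&cm_core->ht_lock, flags);
	return NULL;
}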
Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw.h | 3 +- drivers/infiniband/hw/i40iw/i40iw_cm.c | 127 ++++++++++++++++------ drivers/infiniband/hw/i40iw/i40iw_cm.h | 5 +- drivers/infiniband/hw/i40iw/i40iw_utils.c | 2 +- 4 files changed, 100 insertions(+), 37 deletions(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw.h b/drivers/infiniband/hw/i40iw/i40iw.h index bcddd7061fc0..a20650f060ce 100644 --- a/drivers/infiniband/hw/i40iw/i40iw.h +++ b/drivers/infiniband/hw/i40iw/i40iw.h @@ -564,7 +564,8 @@ struct i40iw_cm_node *i40iw_find_node(struct i40iw_cm_core *cm_core, u32 *rem_addr, u16 loc_port, u32 *loc_addr, - bool add_refcnt); + bool add_refcnt, + bool accelerated_list); enum i40iw_status_code i40iw_hw_flush_wqes(struct i40iw_device *iwdev, struct i40iw_sc_qp *qp, diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c index abf4cd897849..d4780d3887ca 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c @@ -1182,6 +1182,26 @@ static void i40iw_handle_close_entry(struct i40iw_cm_node *cm_node, u32 rem_node cm_node->close_entry = NULL; } +/** + * i40iw_build_timer_list - Add cm_nodes to timer list + * @timer_list: ptr to timer list + * @hte: ptr to accelerated or non-accelerated list + */ +static void i40iw_build_timer_list(struct list_head *timer_list, + struct list_head *hte) +{ + struct i40iw_cm_node *cm_node; + struct list_head *list_core_temp, *list_node; + + list_for_each_safe(list_node, list_core_temp, hte) { + cm_node = container_of(list_node, struct i40iw_cm_node, list); + if (cm_node->close_entry || cm_node->send_entry) { + atomic_inc(&cm_node->ref_count); + list_add(&cm_node->timer_entry, timer_list); + } + } +} + /** * i40iw_cm_timer_tick - system's timer expired callback * @pass: Pointing to cm_core @@ -1202,15 +1222,10 @@ static void i40iw_cm_timer_tick(struct timer_list *t) struct list_head timer_list; INIT_LIST_HEAD(&timer_list); - spin_lock_irqsave(&cm_core->ht_lock, flags); - list_for_each_safe(list_node, list_core_temp, &cm_core->connected_nodes) { - cm_node = container_of(list_node, struct i40iw_cm_node, list); - if (cm_node->close_entry || cm_node->send_entry) { - atomic_inc(&cm_node->ref_count); - list_add(&cm_node->timer_entry, &timer_list); - } - } + spin_lock_irqsave(&cm_core->ht_lock, flags); + i40iw_build_timer_list(&timer_list, &cm_core->non_accelerated_list); + i40iw_build_timer_list(&timer_list, &cm_core->accelerated_list); spin_unlock_irqrestore(&cm_core->ht_lock, flags); list_for_each_safe(list_node, list_core_temp, &timer_list) { @@ -1406,19 +1421,22 @@ static int i40iw_send_fin(struct i40iw_cm_node *cm_node) * @loc_port: local tcp port num * @loc_addr: loc ip addr * @add_refcnt: flag to increment refcount of cm_node + * @accelerated_list: flag for accelerated vs non-accelerated list to search */ struct i40iw_cm_node *i40iw_find_node(struct i40iw_cm_core *cm_core, u16 rem_port, u32 *rem_addr, u16 loc_port, u32 *loc_addr, - bool add_refcnt) + bool add_refcnt, + bool accelerated_list) { struct list_head *hte; struct i40iw_cm_node *cm_node; unsigned long flags; - hte = &cm_core->connected_nodes; + hte = accelerated_list ? 
+ &cm_core->accelerated_list : &cm_core->non_accelerated_list; /* walk list and find cm_node associated with this session ID */ spin_lock_irqsave(&cm_core->ht_lock, flags); @@ -1487,21 +1505,39 @@ static struct i40iw_cm_listener *i40iw_find_listener( static void i40iw_add_hte_node(struct i40iw_cm_core *cm_core, struct i40iw_cm_node *cm_node) { - struct list_head *hte; unsigned long flags; if (!cm_node || !cm_core) { i40iw_pr_err("cm_node or cm_core == NULL\n"); return; } - spin_lock_irqsave(&cm_core->ht_lock, flags); - /* get a handle on the hash table element (list head for this slot) */ - hte = &cm_core->connected_nodes; - list_add_tail(&cm_node->list, hte); + spin_lock_irqsave(&cm_core->ht_lock, flags); + list_add_tail(&cm_node->list, &cm_core->non_accelerated_list); spin_unlock_irqrestore(&cm_core->ht_lock, flags); } +/** + * i40iw_find_port - find port that matches reference port + * @port: port number + * @accelerated_list: flag for accelerated vs non-accelerated list + */ +static bool i40iw_find_port(struct i40iw_cm_core *cm_core, u16 port, + bool accelerated_list) +{ + struct list_head *hte; + struct i40iw_cm_node *cm_node; + + hte = accelerated_list ? + &cm_core->accelerated_list : &cm_core->non_accelerated_list; + + list_for_each_entry(cm_node, hte, list) { + if (cm_node->loc_port == port) + return true; + } + return false; +} + /** * i40iw_port_in_use - determine if port is in use * @port: port number @@ -1510,19 +1546,14 @@ static void i40iw_add_hte_node(struct i40iw_cm_core *cm_core, static bool i40iw_port_in_use(struct i40iw_cm_core *cm_core, u16 port, bool active_side) { struct i40iw_cm_listener *listen_node; - struct i40iw_cm_node *cm_node; unsigned long flags; bool ret = false; if (active_side) { - /* search connected node list */ spin_lock_irqsave(&cm_core->ht_lock, flags); - list_for_each_entry(cm_node, &cm_core->connected_nodes, list) { - if (cm_node->loc_port == port) { - ret = true; - break; - } - } + ret = i40iw_find_port(cm_core, port, true); + if (!ret) + ret = i40iw_find_port(cm_core, port, false); if (!ret) clear_bit(port, cm_core->active_side_ports); spin_unlock_irqrestore(&cm_core->ht_lock, flags); @@ -1829,9 +1860,11 @@ static int i40iw_dec_refcnt_listen(struct i40iw_cm_core *cm_core, INIT_LIST_HEAD(&reset_list); if (free_hanging_nodes) { spin_lock_irqsave(&cm_core->ht_lock, flags); - list_for_each_safe(list_pos, list_temp, &cm_core->connected_nodes) { + list_for_each_safe(list_pos, + list_temp, &cm_core->non_accelerated_list) { cm_node = container_of(list_pos, struct i40iw_cm_node, list); - if ((cm_node->listener == listener) && !cm_node->accelerated) { + if ((cm_node->listener == listener) && + !cm_node->accelerated) { atomic_inc(&cm_node->ref_count); list_add(&cm_node->reset_entry, &reset_list); } @@ -3144,7 +3177,8 @@ void i40iw_receive_ilq(struct i40iw_sc_vsi *vsi, struct i40iw_puda_buf *rbuf) cm_info.rem_addr, cm_info.loc_port, cm_info.loc_addr, - true); + true, + false); if (!cm_node) { /* Only type of packet accepted are for */ @@ -3202,7 +3236,8 @@ void i40iw_setup_cm_core(struct i40iw_device *iwdev) cm_core->iwdev = iwdev; cm_core->dev = &iwdev->sc_dev; - INIT_LIST_HEAD(&cm_core->connected_nodes); + INIT_LIST_HEAD(&cm_core->accelerated_list); + INIT_LIST_HEAD(&cm_core->non_accelerated_list); INIT_LIST_HEAD(&cm_core->listen_nodes); timer_setup(&cm_core->tcp_timer, i40iw_cm_timer_tick, 0); @@ -3585,6 +3620,7 @@ int i40iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) struct i40iw_qp *iwqp; struct i40iw_device *iwdev; struct 
i40iw_sc_dev *dev; + struct i40iw_cm_core *cm_core; struct i40iw_cm_node *cm_node; struct ib_qp_attr attr; int passive_state; @@ -3594,6 +3630,7 @@ int i40iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) struct i40iw_kmem_info accept; enum i40iw_status_code status; u64 tagged_offset; + unsigned long flags; memset(&attr, 0, sizeof(attr)); ibqp = i40iw_get_qp(cm_id->device, conn_param->qpn); @@ -3603,6 +3640,7 @@ int i40iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) iwqp = to_iwqp(ibqp); iwdev = iwqp->iwdev; dev = &iwdev->sc_dev; + cm_core = &iwdev->cm_core; cm_node = (struct i40iw_cm_node *)cm_id->provider_data; if (((struct sockaddr_in *)&cm_id->local_addr)->sin_family == AF_INET) { @@ -3697,6 +3735,10 @@ int i40iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) i40iw_modify_qp(&iwqp->ibqp, &attr, IB_QP_STATE, NULL); cm_node->accelerated = true; + spin_lock_irqsave(&cm_core->ht_lock, flags); + list_move_tail(&cm_node->list, &cm_core->accelerated_list); + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + status = i40iw_send_cm_event(cm_node, cm_id, IW_CM_EVENT_ESTABLISHED, 0); if (status) @@ -4026,10 +4068,12 @@ static void i40iw_cm_event_connected(struct i40iw_cm_event *event) { struct i40iw_qp *iwqp; struct i40iw_device *iwdev; + struct i40iw_cm_core *cm_core; struct i40iw_cm_node *cm_node; struct i40iw_sc_dev *dev; struct ib_qp_attr attr; struct iw_cm_id *cm_id; + unsigned long flags; int status; bool read0; @@ -4038,6 +4082,7 @@ static void i40iw_cm_event_connected(struct i40iw_cm_event *event) iwqp = (struct i40iw_qp *)cm_id->provider_data; iwdev = to_iwdev(iwqp->ibqp.device); dev = &iwdev->sc_dev; + cm_core = &iwdev->cm_core; if (iwqp->destroyed) { status = -ETIMEDOUT; @@ -4057,6 +4102,9 @@ static void i40iw_cm_event_connected(struct i40iw_cm_event *event) i40iw_modify_qp(&iwqp->ibqp, &attr, IB_QP_STATE, NULL); cm_node->accelerated = true; + spin_lock_irqsave(&cm_core->ht_lock, flags); + list_move_tail(&cm_node->list, &cm_core->accelerated_list); + spin_unlock_irqrestore(&cm_core->ht_lock, flags); status = i40iw_send_cm_event(cm_node, cm_id, IW_CM_EVENT_CONNECT_REPLY, 0); if (status) @@ -4256,25 +4304,38 @@ void i40iw_cm_teardown_connections(struct i40iw_device *iwdev, u32 *ipaddr, struct list_head *list_node; struct i40iw_cm_node *cm_node; unsigned long flags; - struct list_head connected_list; + struct list_head teardown_list; struct ib_qp_attr attr; - INIT_LIST_HEAD(&connected_list); + INIT_LIST_HEAD(&teardown_list); spin_lock_irqsave(&cm_core->ht_lock, flags); - list_for_each_safe(list_node, list_core_temp, &cm_core->connected_nodes) { + list_for_each_safe(list_node, list_core_temp, + &cm_core->accelerated_list) { cm_node = container_of(list_node, struct i40iw_cm_node, list); if (disconnect_all || (nfo->vlan_id == cm_node->vlan_id && (!memcmp(cm_node->loc_addr, ipaddr, nfo->ipv4 ? 4 : 16) || !memcmp(cm_node->rem_addr, ipaddr, nfo->ipv4 ? 4 : 16)))) { atomic_inc(&cm_node->ref_count); - list_add(&cm_node->connected_entry, &connected_list); + list_add(&cm_node->teardown_entry, &teardown_list); + } + } + list_for_each_safe(list_node, list_core_temp, + &cm_core->non_accelerated_list) { + cm_node = container_of(list_node, struct i40iw_cm_node, list); + if (disconnect_all || + (nfo->vlan_id == cm_node->vlan_id && + (!memcmp(cm_node->loc_addr, ipaddr, nfo->ipv4 ? 4 : 16) || + !memcmp(cm_node->rem_addr, ipaddr, nfo->ipv4 ? 
4 : 16)))) { + atomic_inc(&cm_node->ref_count); + list_add(&cm_node->teardown_entry, &teardown_list); } } spin_unlock_irqrestore(&cm_core->ht_lock, flags); - list_for_each_safe(list_node, list_core_temp, &connected_list) { - cm_node = container_of(list_node, struct i40iw_cm_node, connected_entry); + list_for_each_safe(list_node, list_core_temp, &teardown_list) { + cm_node = container_of(list_node, struct i40iw_cm_node, + teardown_entry); attr.qp_state = IB_QPS_ERR; i40iw_modify_qp(&cm_node->iwqp->ibqp, &attr, IB_QP_STATE, NULL); if (iwdev->reset) diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.h b/drivers/infiniband/hw/i40iw/i40iw_cm.h index cf60c451e071..78ba36ae2bbe 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.h +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.h @@ -341,7 +341,7 @@ struct i40iw_cm_node { int accept_pend; struct list_head timer_entry; struct list_head reset_entry; - struct list_head connected_entry; + struct list_head teardown_entry; atomic_t passive_state; bool qhash_set; u8 user_pri; @@ -403,7 +403,8 @@ struct i40iw_cm_core { struct i40iw_sc_dev *dev; struct list_head listen_nodes; - struct list_head connected_nodes; + struct list_head accelerated_list; + struct list_head non_accelerated_list; struct timer_list tcp_timer; diff --git a/drivers/infiniband/hw/i40iw/i40iw_utils.c b/drivers/infiniband/hw/i40iw/i40iw_utils.c index ddc1056b0b4e..8cad4e8772bc 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_utils.c +++ b/drivers/infiniband/hw/i40iw/i40iw_utils.c @@ -1407,7 +1407,7 @@ struct i40iw_sc_qp *i40iw_ieq_get_qp(struct i40iw_sc_dev *dev, rem_port = ntohs(tcph->source); cm_node = i40iw_find_node(&iwdev->cm_core, rem_port, rem_addr, loc_port, - loc_addr, false); + loc_addr, false, true); if (!cm_node) return NULL; iwqp = cm_node->iwqp; From 7e952b19eb638ffa2d511796e35c62a48ec1aef0 Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Fri, 2 Mar 2018 15:17:14 -0600 Subject: [PATCH 042/199] i40iw: Implement get_vector_affinity API Storage ULPs (like NVMEoF) benefit from exposing affinity mapping per completion vector to find the optimal multi-queue affinity assignments. The ULPs call the verbs API ib_get_vector_affinity introduced in commit c66cd353bbe ("RDMA/core: expose affinity mappings per completion vector") to get the underlying devices affinity mappings. Add support in driver to expose the affinity masks per MSI-X completion vector. 
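On the consumer side, a ULP reads these mappings through the existing ib_get_vector_affinity() verbs wrapper; a rough sketch with illustrative function and variable names:

#include <rdma/ib_verbs.h>

static void example_dump_affinity(struct ib_device *dev, int nr_vectors)
{
	int v;

	for (v = 0; v < nr_vectors; v++) {
		const struct cpumask *mask =
			ib_get_vector_affinity(dev, v);

		if (mask)
			pr_info("comp vector %d -> CPUs %*pbl\n",
				v, cpumask_pr_args(mask));
	}
}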
Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 70024e8e2692..a51798578f27 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -2728,6 +2728,25 @@ static int i40iw_destroy_ah(struct ib_ah *ah) return -ENOSYS; } +/** + * i40iw_get_vector_affinity - report IRQ affinity mask + * @ibdev: IB device + * @comp_vector: completion vector index + */ +static const struct cpumask *i40iw_get_vector_affinity(struct ib_device *ibdev, + int comp_vector) +{ + struct i40iw_device *iwdev = to_iwdev(ibdev); + struct i40iw_msix_vector *msix_vec; + + if (iwdev->msix_shared) + msix_vec = &iwdev->iw_msixtbl[comp_vector]; + else + msix_vec = &iwdev->iw_msixtbl[comp_vector + 1]; + + return irq_get_affinity_mask(msix_vec->irq); +} + /** * i40iw_init_rdma_device - initialization of iwarp device * @iwdev: iwarp device @@ -2824,6 +2843,7 @@ static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev iwibdev->ibdev.req_notify_cq = i40iw_req_notify_cq; iwibdev->ibdev.post_send = i40iw_post_send; iwibdev->ibdev.post_recv = i40iw_post_recv; + iwibdev->ibdev.get_vector_affinity = i40iw_get_vector_affinity; return iwibdev; } From 666fe24bbeb699c100b396095963a62c6f078b38 Mon Sep 17 00:00:00 2001 From: Arushi Date: Sat, 3 Mar 2018 21:54:57 +0530 Subject: [PATCH 043/199] infiniband: hw: Drop unnecessary continue Continue at the bottom of a loop are removed. Issue found using drop_continue.cocci Coccinelle script. Signed-off-by: Arushi Singhal Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qib/qib_init.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index 3990f386aa32..6c68f8a97018 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -678,11 +678,9 @@ int qib_init(struct qib_devdata *dd, int reinit) lastfail = qib_create_rcvhdrq(dd, rcd); if (!lastfail) lastfail = qib_setup_eagerbufs(rcd); - if (lastfail) { + if (lastfail) qib_dev_err(dd, "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n"); - continue; - } } for (pidx = 0; pidx < dd->num_pports; ++pidx) { From 41904439479e94dd61ac499312e8d8266b13f81d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 5 Mar 2018 09:58:33 -0800 Subject: [PATCH 044/199] IB/hfi1: Add a missing rcu_read_unlock() This patch avoids that sparse reports the following: drivers/infiniband/hw/hfi1/driver.c:251:13: warning: context imbalance in 'rcv_hdrerr' - different lock contexts for basic block Signed-off-by: Bart Van Assche Cc: Mike Marciniszyn Cc: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/driver.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index addc68e83606..46d1475b2154 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -390,6 +390,7 @@ static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd, svc_type = IB_CC_SVCTYPE_UC; break; default: + rcu_read_unlock(); goto drop; } From fbd36818eea88462be4176c9fb73bb7728971ff5 Mon Sep 17 00:00:00 2001 From: Sergey Gorenko Date: Mon, 5 Mar 2018 20:15:56 +0200 Subject: [PATCH 045/199] IB/srp: Use the IB_DEVICE_SG_GAPS_REG HCA 
feature if supported If a HCA supports the SG_GAPS_REG feature then fewer memory regions are required per command. This patch reduces the number of memory regions that is allocated per SRP session. Signed-off-by: Sergey Gorenko Reviewed-by: Max Gurtovoy Tested-by: Laurence Oberman Signed-off-by: Leon Romanovsky Acked-by: Bart Van Assche Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/srp/ib_srp.c | 56 ++++++++++++++++++----------- 1 file changed, 36 insertions(+), 20 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index d61f48a86508..9a5ea6251450 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -431,6 +431,7 @@ static struct srp_fr_pool *srp_create_fr_pool(struct ib_device *device, struct srp_fr_desc *d; struct ib_mr *mr; int i, ret = -EINVAL; + enum ib_mr_type mr_type; if (pool_size <= 0) goto err; @@ -444,9 +445,13 @@ static struct srp_fr_pool *srp_create_fr_pool(struct ib_device *device, spin_lock_init(&pool->lock); INIT_LIST_HEAD(&pool->free_list); + if (device->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG) + mr_type = IB_MR_TYPE_SG_GAPS; + else + mr_type = IB_MR_TYPE_MEM_REG; + for (i = 0, d = &pool->desc[0]; i < pool->size; i++, d++) { - mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, - max_page_list_len); + mr = ib_alloc_mr(pd, mr_type, max_page_list_len); if (IS_ERR(mr)) { ret = PTR_ERR(mr); if (ret == -ENOMEM) @@ -2996,8 +3001,9 @@ static int srp_slave_alloc(struct scsi_device *sdev) struct Scsi_Host *shost = sdev->host; struct srp_target_port *target = host_to_target(shost); struct srp_device *srp_dev = target->srp_host->srp_dev; + struct ib_device *ibdev = srp_dev->dev; - if (true) + if (!(ibdev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)) blk_queue_virt_boundary(sdev->request_queue, ~srp_dev->mr_page_mask); @@ -3775,26 +3781,36 @@ static ssize_t srp_create_target(struct device *dev, } if (srp_dev->use_fast_reg || srp_dev->use_fmr) { - /* - * FR and FMR can only map one HCA page per entry. If the - * start address is not aligned on a HCA page boundary two - * entries will be used for the head and the tail although - * these two entries combined contain at most one HCA page of - * data. Hence the "+ 1" in the calculation below. - * - * The indirect data buffer descriptor is contiguous so the - * memory for that buffer will only be registered if - * register_always is true. Hence add one to mr_per_cmd if - * register_always has been set. - */ + bool gaps_reg = (ibdev->attrs.device_cap_flags & + IB_DEVICE_SG_GAPS_REG); + max_sectors_per_mr = srp_dev->max_pages_per_mr << (ilog2(srp_dev->mr_page_size) - 9); - mr_per_cmd = register_always + - (target->scsi_host->max_sectors + 1 + - max_sectors_per_mr - 1) / max_sectors_per_mr; + if (!gaps_reg) { + /* + * FR and FMR can only map one HCA page per entry. If + * the start address is not aligned on a HCA page + * boundary two entries will be used for the head and + * the tail although these two entries combined + * contain at most one HCA page of data. Hence the "+ + * 1" in the calculation below. + * + * The indirect data buffer descriptor is contiguous + * so the memory for that buffer will only be + * registered if register_always is true. Hence add + * one to mr_per_cmd if register_always has been set. 
+ */ + mr_per_cmd = register_always + + (target->scsi_host->max_sectors + 1 + + max_sectors_per_mr - 1) / max_sectors_per_mr; + } else { + mr_per_cmd = register_always + + (target->sg_tablesize + + srp_dev->max_pages_per_mr - 1) / + srp_dev->max_pages_per_mr; + } pr_debug("max_sectors = %u; max_pages_per_mr = %u; mr_page_size = %u; max_sectors_per_mr = %u; mr_per_cmd = %u\n", - target->scsi_host->max_sectors, - srp_dev->max_pages_per_mr, srp_dev->mr_page_size, + target->scsi_host->max_sectors, srp_dev->max_pages_per_mr, srp_dev->mr_page_size, max_sectors_per_mr, mr_per_cmd); } From 63231585a6167840172cf7e9045b43ced8a0b6c2 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 5 Mar 2018 17:36:47 -0600 Subject: [PATCH 046/199] RDMA/bnxt_re/qplib_sp: Use true and false for boolean values Assign true or false to boolean variables instead of an integer value. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Acked-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/qplib_sp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 03057983341f..cf0539e1d31f 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -153,7 +153,7 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, attr->tqm_alloc_reqs[i * 4 + 3] = *(++tqm_alloc); } - attr->is_atomic = 0; + attr->is_atomic = false; bail: bnxt_qplib_rcfw_free_sbuf(rcfw, sbuf); return rc; From 63cf1a902c9dd6b0761861ea87fce3663f59403b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 2 Mar 2018 13:14:15 -0800 Subject: [PATCH 047/199] IB/srpt: Add RDMA/CM support Add a parameter for configuring the port on which the ib_srpt driver listens for incoming RDMA/CM connections, namely /sys/kernel/config/target/srpt/discovery_auth/rdma_cm_port. The default value for this parameter is 0 which means "do not listen for incoming RDMA/CM connections". Add RDMA/CM support to all code that handles connection state changes. Modify srpt_init_nodeacl() such that ACLs can be configured for IPv4 and IPv6 addresses. Note: incoming connection requests are only accepted for ports that have been enabled. See also the "if (!sport->enabled)" code in the connection request handler. See also the following configfs attribute: /sys/kernel/config/target/srpt/$port/$port/enable. Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 380 +++++++++++++++++++++----- drivers/infiniband/ulp/srpt/ib_srpt.h | 8 +- 2 files changed, 325 insertions(+), 63 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 33454309a98b..8956d4621273 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -92,6 +93,11 @@ MODULE_PARM_DESC(srpt_service_guid, " instead of using the node_guid of the first HCA."); static struct ib_client srpt_client; +/* Protects both rdma_cm_port and rdma_cm_id. */ +static DEFINE_MUTEX(rdma_cm_mutex); +/* Port number RDMA/CM will bind to. 
*/ +static u16 rdma_cm_port; +static struct rdma_cm_id *rdma_cm_id; static void srpt_release_cmd(struct se_cmd *se_cmd); static void srpt_free_ch(struct kref *kref); static int srpt_queue_status(struct se_cmd *cmd); @@ -220,7 +226,10 @@ static void srpt_qp_event(struct ib_event *event, struct srpt_rdma_ch *ch) switch (event->event) { case IB_EVENT_COMM_EST: - ib_cm_notify(ch->ib_cm.cm_id, event->event); + if (ch->using_rdma_cm) + rdma_notify(ch->rdma_cm.cm_id, event->event); + else + ib_cm_notify(ch->ib_cm.cm_id, event->event); break; case IB_EVENT_QP_LAST_WQE_REACHED: pr_debug("%s-%d, state %s: received Last WQE event.\n", @@ -1060,6 +1069,8 @@ static int srpt_init_ch_qp(struct srpt_rdma_ch *ch, struct ib_qp *qp) struct ib_qp_attr *attr; int ret; + WARN_ON_ONCE(ch->using_rdma_cm); + attr = kzalloc(sizeof(*attr), GFP_KERNEL); if (!attr) return -ENOMEM; @@ -1099,6 +1110,8 @@ static int srpt_ch_qp_rtr(struct srpt_rdma_ch *ch, struct ib_qp *qp) int attr_mask; int ret; + WARN_ON_ONCE(ch->using_rdma_cm); + qp_attr.qp_state = IB_QPS_RTR; ret = ib_cm_init_qp_attr(ch->ib_cm.cm_id, &qp_attr, &attr_mask); if (ret) @@ -1749,18 +1762,33 @@ retry: qp_init->cap.max_recv_sge = qp_init->cap.max_send_sge; } - ch->qp = ib_create_qp(sdev->pd, qp_init); - if (IS_ERR(ch->qp)) { - ret = PTR_ERR(ch->qp); - if (ret == -ENOMEM) { - sq_size /= 2; - if (sq_size >= MIN_SRPT_SQ_SIZE) { - ib_destroy_cq(ch->cq); - goto retry; - } + if (ch->using_rdma_cm) { + ret = rdma_create_qp(ch->rdma_cm.cm_id, sdev->pd, qp_init); + ch->qp = ch->rdma_cm.cm_id->qp; + } else { + ch->qp = ib_create_qp(sdev->pd, qp_init); + if (!IS_ERR(ch->qp)) { + ret = srpt_init_ch_qp(ch, ch->qp); + if (ret) + ib_destroy_qp(ch->qp); + } else { + ret = PTR_ERR(ch->qp); + } + } + if (ret) { + bool retry = sq_size > MIN_SRPT_SQ_SIZE; + + if (retry) { + pr_debug("failed to create queue pair with sq_size = %d (%d) - retrying\n", + sq_size, ret); + ib_free_cq(ch->cq); + sq_size = max(sq_size / 2, MIN_SRPT_SQ_SIZE); + goto retry; + } else { + pr_err("failed to create queue pair with sq_size = %d (%d)\n", + sq_size, ret); + goto err_destroy_cq; } - pr_err("failed to create_qp ret= %d\n", ret); - goto err_destroy_cq; } atomic_set(&ch->sq_wr_avail, qp_init->cap.max_send_wr); @@ -1769,10 +1797,6 @@ retry: __func__, ch->cq->cqe, qp_init->cap.max_send_sge, qp_init->cap.max_send_wr, ch); - ret = srpt_init_ch_qp(ch, ch->qp); - if (ret) - goto err_destroy_qp; - if (!sdev->use_srq) for (i = 0; i < ch->rq_size; i++) srpt_post_recv(sdev, ch, ch->ioctx_recv_ring[i]); @@ -1781,9 +1805,8 @@ out: kfree(qp_init); return ret; -err_destroy_qp: - ib_destroy_qp(ch->qp); err_destroy_cq: + ch->qp = NULL; ib_free_cq(ch->cq); goto out; } @@ -1852,9 +1875,13 @@ static int srpt_disconnect_ch(struct srpt_rdma_ch *ch) if (!srpt_set_ch_state(ch, CH_DISCONNECTING)) return -ENOTCONN; - ret = ib_send_cm_dreq(ch->ib_cm.cm_id, NULL, 0); - if (ret < 0) - ret = ib_send_cm_drep(ch->ib_cm.cm_id, NULL, 0); + if (ch->using_rdma_cm) { + ret = rdma_disconnect(ch->rdma_cm.cm_id); + } else { + ret = ib_send_cm_dreq(ch->ib_cm.cm_id, NULL, 0); + if (ret < 0) + ret = ib_send_cm_drep(ch->ib_cm.cm_id, NULL, 0); + } if (ret < 0 && srpt_close_ch(ch)) ret = 0; @@ -2005,7 +2032,10 @@ static void srpt_release_channel_work(struct work_struct *w) transport_deregister_session(se_sess); ch->sess = NULL; - ib_destroy_cm_id(ch->ib_cm.cm_id); + if (ch->using_rdma_cm) + rdma_destroy_id(ch->rdma_cm.cm_id); + else + ib_destroy_cm_id(ch->ib_cm.cm_id); srpt_destroy_ch_ib(ch); @@ -2029,26 +2059,33 @@ static void 
srpt_release_channel_work(struct work_struct *w) /** * srpt_cm_req_recv - process the event IB_CM_REQ_RECEIVED - * @cm_id: IB/CM connection identifier. - * @port_num: Port through which the IB/CM REQ message was received. + * @sdev: HCA through which the login request was received. + * @ib_cm_id: IB/CM connection identifier in case of IB/CM. + * @rdma_cm_id: RDMA/CM connection identifier in case of RDMA/CM. + * @port_num: Port through which the REQ message was received. * @pkey: P_Key of the incoming connection. * @req: SRP login request. - * @src_addr: GID of the port that submitted the login request. + * @src_addr: GID (IB/CM) or IP address (RDMA/CM) of the port that submitted + * the login request. * * Ownership of the cm_id is transferred to the target session if this - * functions returns zero. Otherwise the caller remains the owner of cm_id. + * function returns zero. Otherwise the caller remains the owner of cm_id. */ -static int srpt_cm_req_recv(struct ib_cm_id *cm_id, +static int srpt_cm_req_recv(struct srpt_device *const sdev, + struct ib_cm_id *ib_cm_id, + struct rdma_cm_id *rdma_cm_id, u8 port_num, __be16 pkey, const struct srp_login_req *req, const char *src_addr) { - struct srpt_device *sdev = cm_id->context; struct srpt_port *sport = &sdev->port[port_num - 1]; struct srpt_nexus *nexus; struct srp_login_rsp *rsp = NULL; struct srp_login_rej *rej = NULL; - struct ib_cm_rep_param *rep_param = NULL; + union { + struct rdma_conn_param rdma_cm; + struct ib_cm_rep_param ib_cm; + } *rep_param = NULL; struct srpt_rdma_ch *ch; char i_port_id[36]; u32 it_iu_len; @@ -2118,8 +2155,14 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, ch->zw_cqe.done = srpt_zerolength_write_done; INIT_WORK(&ch->release_work, srpt_release_channel_work); ch->sport = sport; - ch->ib_cm.cm_id = cm_id; - cm_id->context = ch; + if (ib_cm_id) { + ch->ib_cm.cm_id = ib_cm_id; + ib_cm_id->context = ch; + } else { + ch->using_rdma_cm = true; + ch->rdma_cm.cm_id = rdma_cm_id; + rdma_cm_id->context = ch; + } /* * ch->rq_size should be at least as large as the initiator queue * depth to avoid that the initiator driver has to report QUEUE_FULL @@ -2230,7 +2273,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, mutex_unlock(&sport->mutex); - ret = srpt_ch_qp_rtr(ch, ch->qp); + ret = ch->using_rdma_cm ? 
0 : srpt_ch_qp_rtr(ch, ch->qp); if (ret) { rej->reason = cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); pr_err("rejected SRP_LOGIN_REQ because enabling RTR failed (error code = %d)\n", @@ -2254,25 +2297,38 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, atomic_set(&ch->req_lim_delta, 0); /* create cm reply */ - rep_param->qp_num = ch->qp->qp_num; - rep_param->private_data = (void *)rsp; - rep_param->private_data_len = sizeof(*rsp); - rep_param->rnr_retry_count = 7; - rep_param->flow_control = 1; - rep_param->failover_accepted = 0; - rep_param->srq = 1; - rep_param->responder_resources = 4; - rep_param->initiator_depth = 4; + if (ch->using_rdma_cm) { + rep_param->rdma_cm.private_data = (void *)rsp; + rep_param->rdma_cm.private_data_len = sizeof(*rsp); + rep_param->rdma_cm.rnr_retry_count = 7; + rep_param->rdma_cm.flow_control = 1; + rep_param->rdma_cm.responder_resources = 4; + rep_param->rdma_cm.initiator_depth = 4; + } else { + rep_param->ib_cm.qp_num = ch->qp->qp_num; + rep_param->ib_cm.private_data = (void *)rsp; + rep_param->ib_cm.private_data_len = sizeof(*rsp); + rep_param->ib_cm.rnr_retry_count = 7; + rep_param->ib_cm.flow_control = 1; + rep_param->ib_cm.failover_accepted = 0; + rep_param->ib_cm.srq = 1; + rep_param->ib_cm.responder_resources = 4; + rep_param->ib_cm.initiator_depth = 4; + } /* * Hold the sport mutex while accepting a connection to avoid that * srpt_disconnect_ch() is invoked concurrently with this code. */ mutex_lock(&sport->mutex); - if (sport->enabled && ch->state == CH_CONNECTING) - ret = ib_send_cm_rep(cm_id, rep_param); - else + if (sport->enabled && ch->state == CH_CONNECTING) { + if (ch->using_rdma_cm) + ret = rdma_accept(rdma_cm_id, &rep_param->rdma_cm); + else + ret = ib_send_cm_rep(ib_cm_id, &rep_param->ib_cm); + } else { ret = -EINVAL; + } mutex_unlock(&sport->mutex); switch (ret) { @@ -2302,7 +2358,8 @@ free_ring: ch->sport->sdev, ch->rq_size, ch->max_rsp_size, DMA_TO_DEVICE); free_ch: - cm_id->context = NULL; + if (ib_cm_id) + ib_cm_id->context = NULL; kfree(ch); ch = NULL; @@ -2315,8 +2372,11 @@ reject: rej->buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT | SRP_BUF_FORMAT_INDIRECT); - ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, - (void *)rej, sizeof(*rej)); + if (rdma_cm_id) + rdma_reject(rdma_cm_id, rej, sizeof(*rej)); + else + ib_send_cm_rej(ib_cm_id, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, + rej, sizeof(*rej)); out: kfree(rep_param); @@ -2335,10 +2395,44 @@ static int srpt_ib_cm_req_recv(struct ib_cm_id *cm_id, srpt_format_guid(sguid, sizeof(sguid), ¶m->primary_path->dgid.global.interface_id); - return srpt_cm_req_recv(cm_id, param->port, param->primary_path->pkey, + return srpt_cm_req_recv(cm_id->context, cm_id, NULL, param->port, + param->primary_path->pkey, private_data, sguid); } +static int srpt_rdma_cm_req_recv(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct srpt_device *sdev; + struct srp_login_req req; + const struct srp_login_req_rdma *req_rdma; + char src_addr[40]; + + sdev = ib_get_client_data(cm_id->device, &srpt_client); + if (!sdev) + return -ECONNREFUSED; + + if (event->param.conn.private_data_len < sizeof(*req_rdma)) + return -EINVAL; + + /* Transform srp_login_req_rdma into srp_login_req. 
*/ + req_rdma = event->param.conn.private_data; + memset(&req, 0, sizeof(req)); + req.opcode = req_rdma->opcode; + req.tag = req_rdma->tag; + req.req_it_iu_len = req_rdma->req_it_iu_len; + req.req_buf_fmt = req_rdma->req_buf_fmt; + req.req_flags = req_rdma->req_flags; + memcpy(req.initiator_port_id, req_rdma->initiator_port_id, 16); + memcpy(req.target_port_id, req_rdma->target_port_id, 16); + + snprintf(src_addr, sizeof(src_addr), "%pIS", + &cm_id->route.addr.src_addr); + + return srpt_cm_req_recv(sdev, NULL, cm_id, cm_id->port_num, + cm_id->route.path_rec->pkey, &req, src_addr); +} + static void srpt_cm_rej_recv(struct srpt_rdma_ch *ch, enum ib_cm_rej_reason reason, const u8 *private_data, @@ -2362,14 +2456,14 @@ static void srpt_cm_rej_recv(struct srpt_rdma_ch *ch, * srpt_cm_rtu_recv - process an IB_CM_RTU_RECEIVED or USER_ESTABLISHED event * @ch: SRPT RDMA channel. * - * An IB_CM_RTU_RECEIVED message indicates that the connection is established - * and that the recipient may begin transmitting (RTU = ready to use). + * An RTU (ready to use) message indicates that the connection has been + * established and that the recipient may begin transmitting. */ static void srpt_cm_rtu_recv(struct srpt_rdma_ch *ch) { int ret; - ret = srpt_ch_qp_rts(ch, ch->qp); + ret = ch->using_rdma_cm ? 0 : srpt_ch_qp_rts(ch, ch->qp); if (ret < 0) { pr_err("%s-%d: QP transition to RTS failed\n", ch->sess_name, ch->qp->qp_num); @@ -2456,6 +2550,49 @@ static int srpt_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) return ret; } +static int srpt_rdma_cm_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct srpt_rdma_ch *ch = cm_id->context; + int ret = 0; + + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + ret = srpt_rdma_cm_req_recv(cm_id, event); + break; + case RDMA_CM_EVENT_REJECTED: + srpt_cm_rej_recv(ch, event->status, + event->param.conn.private_data, + event->param.conn.private_data_len); + break; + case RDMA_CM_EVENT_ESTABLISHED: + srpt_cm_rtu_recv(ch); + break; + case RDMA_CM_EVENT_DISCONNECTED: + if (ch->state < CH_DISCONNECTING) + srpt_disconnect_ch(ch); + else + srpt_close_ch(ch); + break; + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + srpt_close_ch(ch); + break; + case RDMA_CM_EVENT_UNREACHABLE: + pr_info("Received CM REP error for ch %s-%d.\n", ch->sess_name, + ch->qp->qp_num); + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + case RDMA_CM_EVENT_ADDR_CHANGE: + break; + default: + pr_err("received unrecognized RDMA CM event %d\n", + event->event); + break; + } + + return ret; +} + static int srpt_write_pending_status(struct se_cmd *se_cmd) { struct srpt_send_ioctx *ioctx; @@ -2827,7 +2964,7 @@ static void srpt_add_one(struct ib_device *device) { struct srpt_device *sdev; struct srpt_port *sport; - int i; + int i, ret; pr_debug("device = %p\n", device); @@ -2851,9 +2988,15 @@ static void srpt_add_one(struct ib_device *device) if (!srpt_service_guid) srpt_service_guid = be64_to_cpu(device->node_guid); - sdev->cm_id = ib_create_cm_id(device, srpt_cm_handler, sdev); - if (IS_ERR(sdev->cm_id)) - goto err_ring; + if (rdma_port_get_link_layer(device, 1) == IB_LINK_LAYER_INFINIBAND) + sdev->cm_id = ib_create_cm_id(device, srpt_cm_handler, sdev); + if (IS_ERR(sdev->cm_id)) { + pr_info("ib_create_cm_id() failed: %ld\n", + PTR_ERR(sdev->cm_id)); + sdev->cm_id = NULL; + if (!rdma_cm_id) + goto err_ring; + } /* print out target login information */ pr_debug("Target login info: id_ext=%016llx,ioc_guid=%016llx," @@ -2866,8 +3009,14 @@ static void srpt_add_one(struct 
ib_device *device) * in the system as service_id; therefore, the target_id will change * if this HCA is gone bad and replaced by different HCA */ - if (ib_cm_listen(sdev->cm_id, cpu_to_be64(srpt_service_guid), 0)) + ret = sdev->cm_id ? + ib_cm_listen(sdev->cm_id, cpu_to_be64(srpt_service_guid), 0) : + 0; + if (ret < 0) { + pr_err("ib_cm_listen() failed: %d (cm_id state = %d)\n", ret, + sdev->cm_id->state); goto err_cm; + } INIT_IB_EVENT_HANDLER(&sdev->event_handler, sdev->device, srpt_event_handler); @@ -2907,7 +3056,8 @@ out: err_event: ib_unregister_event_handler(&sdev->event_handler); err_cm: - ib_destroy_cm_id(sdev->cm_id); + if (sdev->cm_id) + ib_destroy_cm_id(sdev->cm_id); err_ring: srpt_free_srq(sdev); ib_dealloc_pd(sdev->pd); @@ -2942,7 +3092,10 @@ static void srpt_remove_one(struct ib_device *device, void *client_data) for (i = 0; i < sdev->device->phys_port_cnt; i++) cancel_work_sync(&sdev->port[i].work); - ib_destroy_cm_id(sdev->cm_id); + if (sdev->cm_id) + ib_destroy_cm_id(sdev->cm_id); + + ib_set_client_data(device, &srpt_client, NULL); /* * Unregistering a target must happen after destroying sdev->cm_id @@ -3106,18 +3259,26 @@ static int srpt_parse_i_port_id(u8 i_port_id[16], const char *name) leading_zero_bytes = 16 - count; memset(i_port_id, 0, leading_zero_bytes); ret = hex2bin(i_port_id + leading_zero_bytes, p, count); - if (ret < 0) - pr_debug("hex2bin failed for srpt_parse_i_port_id: %d\n", ret); + out: return ret; } /* - * configfs callback function invoked for - * mkdir /sys/kernel/config/target/$driver/$port/$tpg/acls/$i_port_id + * configfs callback function invoked for mkdir + * /sys/kernel/config/target/$driver/$port/$tpg/acls/$i_port_id + * + * i_port_id must be an initiator port GUID, GID or IP address. See also the + * target_alloc_session() calls in this driver. 
Examples of valid initiator + * port IDs: + * 0x0000000000000000505400fffe4a0b7b + * 0000000000000000505400fffe4a0b7b + * 5054:00ff:fe4a:0b7b + * 192.168.122.76 */ static int srpt_init_nodeacl(struct se_node_acl *se_nacl, const char *name) { + struct sockaddr_storage sa; u64 guid; u8 i_port_id[16]; int ret; @@ -3125,6 +3286,9 @@ static int srpt_init_nodeacl(struct se_node_acl *se_nacl, const char *name) ret = srpt_parse_guid(&guid, name); if (ret < 0) ret = srpt_parse_i_port_id(i_port_id, name); + if (ret < 0) + ret = inet_pton_with_scope(&init_net, AF_UNSPEC, name, NULL, + &sa); if (ret < 0) pr_err("invalid initiator port ID %s\n", name); return ret; @@ -3299,6 +3463,95 @@ static struct configfs_attribute *srpt_tpg_attrib_attrs[] = { NULL, }; +static struct rdma_cm_id *srpt_create_rdma_id(struct sockaddr *listen_addr) +{ + struct rdma_cm_id *rdma_cm_id; + int ret; + + rdma_cm_id = rdma_create_id(&init_net, srpt_rdma_cm_handler, + NULL, RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(rdma_cm_id)) { + pr_err("RDMA/CM ID creation failed: %ld\n", + PTR_ERR(rdma_cm_id)); + goto out; + } + + ret = rdma_bind_addr(rdma_cm_id, listen_addr); + if (ret) { + char addr_str[64]; + + snprintf(addr_str, sizeof(addr_str), "%pISp", listen_addr); + pr_err("Binding RDMA/CM ID to address %s failed: %d\n", + addr_str, ret); + rdma_destroy_id(rdma_cm_id); + rdma_cm_id = ERR_PTR(ret); + goto out; + } + + ret = rdma_listen(rdma_cm_id, 128); + if (ret) { + pr_err("rdma_listen() failed: %d\n", ret); + rdma_destroy_id(rdma_cm_id); + rdma_cm_id = ERR_PTR(ret); + } + +out: + return rdma_cm_id; +} + +static ssize_t srpt_rdma_cm_port_show(struct config_item *item, char *page) +{ + return sprintf(page, "%d\n", rdma_cm_port); +} + +static ssize_t srpt_rdma_cm_port_store(struct config_item *item, + const char *page, size_t count) +{ + struct sockaddr_in addr4 = { .sin_family = AF_INET }; + struct sockaddr_in6 addr6 = { .sin6_family = AF_INET6 }; + struct rdma_cm_id *new_id = NULL; + u16 val; + int ret; + + ret = kstrtou16(page, 0, &val); + if (ret < 0) + return ret; + ret = count; + if (rdma_cm_port == val) + goto out; + + if (val) { + addr6.sin6_port = cpu_to_be16(val); + new_id = srpt_create_rdma_id((struct sockaddr *)&addr6); + if (IS_ERR(new_id)) { + addr4.sin_port = cpu_to_be16(val); + new_id = srpt_create_rdma_id((struct sockaddr *)&addr4); + if (IS_ERR(new_id)) { + ret = PTR_ERR(new_id); + goto out; + } + } + } + + mutex_lock(&rdma_cm_mutex); + rdma_cm_port = val; + swap(rdma_cm_id, new_id); + mutex_unlock(&rdma_cm_mutex); + + if (new_id) + rdma_destroy_id(new_id); + ret = count; +out: + return ret; +} + +CONFIGFS_ATTR(srpt_, rdma_cm_port); + +static struct configfs_attribute *srpt_da_attrs[] = { + &srpt_attr_rdma_cm_port, + NULL, +}; + static ssize_t srpt_tpg_enable_show(struct config_item *item, char *page) { struct se_portal_group *se_tpg = to_tpg(item); @@ -3444,6 +3697,7 @@ static const struct target_core_fabric_ops srpt_template = { .fabric_drop_tpg = srpt_drop_tpg, .fabric_init_nodeacl = srpt_init_nodeacl, + .tfc_discovery_attrs = srpt_da_attrs, .tfc_wwn_attrs = srpt_wwn_attrs, .tfc_tpg_base_attrs = srpt_tpg_attrs, .tfc_tpg_attrib_attrs = srpt_tpg_attrib_attrs, @@ -3497,6 +3751,8 @@ out: static void __exit srpt_cleanup_module(void) { + if (rdma_cm_id) + rdma_destroy_id(rdma_cm_id); ib_unregister_client(&srpt_client); target_unregister_template(&srpt_template); } diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h index 4d9199fd00dc..2361483476a0 100644 --- 
a/drivers/infiniband/ulp/srpt/ib_srpt.h +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -261,6 +262,7 @@ enum rdma_ch_state { * @spinlock: Protects free_list and state. * @free_list: Head of list with free send I/O contexts. * @state: channel state. See also enum rdma_ch_state. + * @using_rdma_cm: Whether the RDMA/CM or IB/CM is used for this channel. * @processing_wait_list: Whether or not cmd_wait_list is being processed. * @ioctx_ring: Send ring. * @ioctx_recv_ring: Receive I/O context ring. @@ -280,6 +282,9 @@ struct srpt_rdma_ch { struct { struct ib_cm_id *cm_id; } ib_cm; + struct { + struct rdma_cm_id *cm_id; + } rdma_cm; }; struct ib_cq *cq; struct ib_cqe zw_cqe; @@ -300,9 +305,10 @@ struct srpt_rdma_ch { struct list_head list; struct list_head cmd_wait_list; uint16_t pkey; + bool using_rdma_cm; bool processing_wait_list; struct se_session *sess; - u8 sess_name[24]; + u8 sess_name[40]; struct work_struct release_work; }; From 86af617641512f4aeb78fd25dcec7e0f4bb1d5e5 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Tue, 27 Feb 2018 06:04:32 -0500 Subject: [PATCH 048/199] IB/rxe: remove unnecessary skb_clone In the send_atomic_ack() function it is not necessary to clone the skb. To gain better performance (higher throughput and lower latency), this skb_clone is removed. The following tests were run. server client --------- --------- |1.1.1.1|<----rxe-channel--->|1.1.1.2| --------- --------- On server: rping -s -a 1.1.1.1 -v -C 1000 -S 512 On client: rping -c -a 1.1.1.1 -v -C 1000 -S 512 The kernel config CONFIG_DEBUG_KMEMLEAK is enabled on both server and client. This test ran for several hours; no memory leak was observed and the whole system worked well. Based on the above network, the following tests were run. Server: ibv_rc_pingpong -d rxe0 -g 1 Client: ibv_rc_pingpong -d rxe0 -g 1 1.1.1.1 The test results on the server (10 tests were made). Before: Throughput is 137.07 Mbit/sec Latency is 517.76 usec/iter After: Throughput is 148.85 Mbit/sec Latency is 476.64 usec/iter Throughput is improved and latency is reduced.
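The core of the change, condensed from the hunk below (a restatement of the diff, not additional code): rather than cloning the skb so one copy can be parked in the response resources while the other is transmitted, the same skb serves both purposes, with one extra reference taken so that the kfree_skb() in the transmit path merely drops that reference:

	/* Park the skb in the atomic response resource and transmit the
	 * same buffer; the extra reference keeps it alive for a possible
	 * retransmit of the atomic ack.
	 */
	refcount_inc(&skb->users);
	res->type = RXE_ATOMIC_MASK;
	res->atomic.skb = skb;
	...
	rc = rxe_xmit_packet(rxe, qp, &ack_pkt, skb);
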
CC: Srinivas Eeda CC: Junxiao Bi Signed-off-by: Zhu Yanjun Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rxe/rxe_resp.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index d37bb9b97569..a65c9969f7fc 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -969,7 +969,6 @@ static int send_atomic_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt, int rc = 0; struct rxe_pkt_info ack_pkt; struct sk_buff *skb; - struct sk_buff *skb_copy; struct rxe_dev *rxe = to_rdev(qp->ibqp.device); struct resp_res *res; @@ -981,14 +980,7 @@ static int send_atomic_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt, goto out; } - skb_copy = skb_clone(skb, GFP_ATOMIC); - if (skb_copy) - rxe_add_ref(qp); /* for the new SKB */ - else { - pr_warn("Could not clone atomic response\n"); - rc = -ENOMEM; - goto out; - } + rxe_add_ref(qp); res = &qp->resp.resources[qp->resp.res_head]; free_rd_atomic_resource(qp, res); @@ -998,19 +990,18 @@ static int send_atomic_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt, memset((unsigned char *)SKB_TO_PKT(skb) + sizeof(ack_pkt), 0, sizeof(skb->cb) - sizeof(ack_pkt)); + refcount_inc(&skb->users); res->type = RXE_ATOMIC_MASK; res->atomic.skb = skb; res->first_psn = ack_pkt.psn; res->last_psn = ack_pkt.psn; res->cur_psn = ack_pkt.psn; - rc = rxe_xmit_packet(rxe, qp, &ack_pkt, skb_copy); + rc = rxe_xmit_packet(rxe, qp, &ack_pkt, skb); if (rc) { pr_err_ratelimited("Failed sending ack\n"); rxe_drop_ref(qp); - kfree_skb(skb_copy); } - out: return rc; } From 31f1bd14cbfe4f7a4ea1ada2d4e0dc802a258f5d Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Tue, 27 Feb 2018 06:04:33 -0500 Subject: [PATCH 049/199] IB/rxe: remove unnecessary rxe in rxe_send In the function rxe_send(), the variable rxe is unused, so remove it.
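Should the device ever be needed inside rxe_send() again, it is still reachable through the packet metadata rather than an extra parameter (a sketch, assuming the rxe back-pointer carried by struct rxe_pkt_info):

	/* Hypothetical: recover the owning device from the packet info. */
	struct rxe_dev *rxe = pkt->rxe;
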
CC: Srinivas Eeda CC: Junxiao Bi Signed-off-by: Zhu Yanjun Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rxe/rxe_loc.h | 5 ++--- drivers/infiniband/sw/rxe/rxe_net.c | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index 4ef75d5b729b..e8150ab7df58 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -143,8 +143,7 @@ int advance_dma_data(struct rxe_dma_info *dma, unsigned int length); /* rxe_net.c */ int rxe_loopback(struct sk_buff *skb); -int rxe_send(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, - struct sk_buff *skb); +int rxe_send(struct rxe_pkt_info *pkt, struct sk_buff *skb); struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av, int paylen, struct rxe_pkt_info *pkt); int rxe_prepare(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, @@ -268,7 +267,7 @@ static inline int rxe_xmit_packet(struct rxe_dev *rxe, struct rxe_qp *qp, memcpy(SKB_TO_PKT(skb), pkt, sizeof(*pkt)); err = rxe_loopback(skb); } else { - err = rxe_send(rxe, pkt, skb); + err = rxe_send(pkt, skb); } if (err) { diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 159246b03867..a7753dc3261e 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -450,7 +450,7 @@ static void rxe_skb_tx_dtor(struct sk_buff *skb) rxe_drop_ref(qp); } -int rxe_send(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, struct sk_buff *skb) +int rxe_send(struct rxe_pkt_info *pkt, struct sk_buff *skb) { struct rxe_av *av; int err; From befd8d98f230d911b0db308f19663ec03572e0c9 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Wed, 7 Mar 2018 00:47:57 -0500 Subject: [PATCH 050/199] IB/rxe: change the function rxe_init_device_param type The function rxe_init_device_param() always returns 0, so change its return type to void. CC: Srinivas Eeda CC: Junxiao Bi Signed-off-by: Zhu Yanjun Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rxe/rxe.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index b7debb6f2eac..e493fdbd61c6 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -78,7 +78,7 @@ void rxe_release(struct kref *kref) } /* initialize rxe device parameters */ -static int rxe_init_device_param(struct rxe_dev *rxe) +static void rxe_init_device_param(struct rxe_dev *rxe) { rxe->max_inline_data = RXE_MAX_INLINE_DATA; @@ -122,8 +122,6 @@ static int rxe_init_device_param(struct rxe_dev *rxe) rxe->attr.local_ca_ack_delay = RXE_LOCAL_CA_ACK_DELAY; rxe->max_ucontext = RXE_MAX_UCONTEXT; - - return 0; } /* initialize port attributes */ From d50a8a96ee663909254e2f1db9aed2414e9f45ba Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Mon, 26 Feb 2018 15:02:21 +0200 Subject: [PATCH 051/199] IB/mlx4: Move mlx4_uverbs_ex_query_device_resp to include/uapi/ This struct is involved in the user API for mlx4 and should not be hidden inside a driver header file.
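With the definitions exported, userspace can consume them from the installed uapi header instead of duplicating the layout. A minimal sketch (the include path assumes the usual sanitized-header install; print_rss_caps() is a made-up helper, not part of any library):

	#include <stdio.h>
	#include <rdma/mlx4-abi.h>

	/* Hypothetical consumer of the now-public response layout. */
	static void print_rss_caps(const struct mlx4_uverbs_ex_query_device_resp *resp)
	{
		/* rx_hash_fields_mask holds enum mlx4_ib_rx_hash_fields bits */
		printf("rss fields mask: 0x%llx\n",
		       (unsigned long long)resp->rss_caps.rx_hash_fields_mask);
	}
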
Fixes: 09d208b258a2 ("IB/mlx4: Add report for RSS capabilities by vendor channel") Reviewed-by: Mark Bloch Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/mlx4_ib.h | 14 -------------- include/uapi/rdma/mlx4-abi.h | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index e14919c15b06..d0640bd79679 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -645,20 +645,6 @@ enum query_device_resp_mask { QUERY_DEVICE_RESP_MASK_TIMESTAMP = 1UL << 0, }; -struct mlx4_ib_rss_caps { - __u64 rx_hash_fields_mask; /* enum mlx4_rx_hash_fields */ - __u8 rx_hash_function; /* enum mlx4_rx_hash_function_flags */ - __u8 reserved[7]; -}; - -struct mlx4_uverbs_ex_query_device_resp { - __u32 comp_mask; - __u32 response_length; - __u64 hca_core_clock_offset; - __u32 max_inl_recv_sz; - struct mlx4_ib_rss_caps rss_caps; -}; - static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev) { return container_of(ibdev, struct mlx4_ib_dev, ib_dev); diff --git a/include/uapi/rdma/mlx4-abi.h b/include/uapi/rdma/mlx4-abi.h index 7f9c37346613..d84616adff32 100644 --- a/include/uapi/rdma/mlx4-abi.h +++ b/include/uapi/rdma/mlx4-abi.h @@ -156,4 +156,18 @@ enum mlx4_ib_rx_hash_fields { MLX4_IB_RX_HASH_INNER = 1ULL << 31, }; +struct mlx4_ib_rss_caps { + __u64 rx_hash_fields_mask; /* enum mlx4_ib_rx_hash_fields */ + __u8 rx_hash_function; /* enum mlx4_ib_rx_hash_function_flags */ + __u8 reserved[7]; +}; + +struct mlx4_uverbs_ex_query_device_resp { + __u32 comp_mask; + __u32 response_length; + __u64 hca_core_clock_offset; + __u32 max_inl_recv_sz; + struct mlx4_ib_rss_caps rss_caps; +}; + #endif /* MLX4_ABI_USER_H */ From 88831a2cfe2245822200ecf4bd7ff77abdf1499a Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 1 Mar 2018 13:57:22 -0800 Subject: [PATCH 052/199] RDMA/restrack: clean up res_to_dev() Simplify res_to_dev() to make it easier to read/maintain. 
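For context, res_to_dev() is what lets the tracking core file an entry under its owning device; the surrounding call pattern in rdma_restrack_add() looks roughly like this (a sketch, assuming the rwsem-protected hash that the dumpit code elsewhere in this series reads):

	struct ib_device *dev = res_to_dev(res);

	if (!dev)
		return;		/* unknown type; res_to_dev() already WARNed */

	down_write(&dev->res.rwsem);
	hash_add(dev->res.hash, &res->node, res->type);
	up_write(&dev->res.rwsem);
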
Reviewed-by: Leon Romanovsky Signed-off-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/core/restrack.c | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 3dbc4e4cca41..41a780085e6d 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -60,31 +60,17 @@ static void set_kern_name(struct rdma_restrack_entry *res) static struct ib_device *res_to_dev(struct rdma_restrack_entry *res) { - enum rdma_restrack_type type = res->type; - struct ib_device *dev; - struct ib_pd *pd; - struct ib_cq *cq; - struct ib_qp *qp; - - switch (type) { + switch (res->type) { case RDMA_RESTRACK_PD: - pd = container_of(res, struct ib_pd, res); - dev = pd->device; - break; + return container_of(res, struct ib_pd, res)->device; case RDMA_RESTRACK_CQ: - cq = container_of(res, struct ib_cq, res); - dev = cq->device; - break; + return container_of(res, struct ib_cq, res)->device; case RDMA_RESTRACK_QP: - qp = container_of(res, struct ib_qp, res); - dev = qp->device; - break; + return container_of(res, struct ib_qp, res)->device; default: - WARN_ONCE(true, "Wrong resource tracking type %u\n", type); + WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type); return NULL; } - - return dev; } static bool res_is_user(struct rdma_restrack_entry *res) From d12ff624828073f94628d49f6579e9c20acc56cd Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 1 Mar 2018 13:57:29 -0800 Subject: [PATCH 053/199] RDMA/nldev: common resource dumpit function Create a common dumpit function that can be used by all common resource types. This reduces code replication and simplifies the code as we add more resource types. Signed-off-by: Steve Wise Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/nldev.c | 61 +++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 18 deletions(-) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 5326a684555f..f38c6838bb31 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -212,10 +212,10 @@ err: return ret; } -static int fill_res_qp_entry(struct sk_buff *msg, - struct ib_qp *qp, uint32_t port) +static int fill_res_qp_entry(struct sk_buff *msg, struct netlink_callback *cb, + struct rdma_restrack_entry *res, uint32_t port) { - struct rdma_restrack_entry *res = &qp->res; + struct ib_qp *qp = container_of(res, struct ib_qp, res); struct ib_qp_init_attr qp_init_attr; struct nlattr *entry_attr; struct ib_qp_attr qp_attr; @@ -558,23 +558,40 @@ static int nldev_res_get_dumpit(struct sk_buff *skb, return ib_enum_all_devs(_nldev_res_get_dumpit, skb, cb); } -static int nldev_res_get_qp_dumpit(struct sk_buff *skb, - struct netlink_callback *cb) +struct nldev_fill_res_entry { + int (*fill_res_func)(struct sk_buff *msg, struct netlink_callback *cb, + struct rdma_restrack_entry *res, u32 port); + enum rdma_nldev_attr nldev_attr; + enum rdma_nldev_command nldev_cmd; +}; + +static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { + [RDMA_RESTRACK_QP] = { + .fill_res_func = fill_res_qp_entry, + .nldev_cmd = RDMA_NLDEV_CMD_RES_QP_GET, + .nldev_attr = RDMA_NLDEV_ATTR_RES_QP, + }, +}; + +static int res_get_common_dumpit(struct sk_buff *skb, + struct netlink_callback *cb, + enum rdma_restrack_type res_type) { + const struct nldev_fill_res_entry *fe = &fill_entries[res_type]; struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct 
rdma_restrack_entry *res; int err, ret = 0, idx = 0; struct nlattr *table_attr; struct ib_device *device; int start = cb->args[0]; - struct ib_qp *qp = NULL; struct nlmsghdr *nlh; u32 index, port = 0; + bool filled = false; err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NULL); /* - * Right now, we are expecting the device index to get QP information, + * Right now, we are expecting the device index to get res information, * but it is possible to extend this code to return all devices in * one shot by checking the existence of RDMA_NLDEV_ATTR_DEV_INDEX. * if it doesn't exist, we will iterate over all devices. @@ -601,7 +618,7 @@ static int nldev_res_get_qp_dumpit(struct sk_buff *skb, } nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_QP_GET), + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, fe->nldev_cmd), 0, NLM_F_MULTI); if (fill_nldev_handle(skb, device)) { @@ -609,24 +626,26 @@ static int nldev_res_get_qp_dumpit(struct sk_buff *skb, goto err; } - table_attr = nla_nest_start(skb, RDMA_NLDEV_ATTR_RES_QP); + table_attr = nla_nest_start(skb, fe->nldev_attr); if (!table_attr) { ret = -EMSGSIZE; goto err; } down_read(&device->res.rwsem); - hash_for_each_possible(device->res.hash, res, node, RDMA_RESTRACK_QP) { + hash_for_each_possible(device->res.hash, res, node, res_type) { if (idx < start) goto next; if ((rdma_is_kernel_res(res) && task_active_pid_ns(current) != &init_pid_ns) || - (!rdma_is_kernel_res(res) && - task_active_pid_ns(current) != task_active_pid_ns(res->task))) + (!rdma_is_kernel_res(res) && task_active_pid_ns(current) != + task_active_pid_ns(res->task))) /* - * 1. Kernel QPs should be visible in init namspace only - * 2. Present only QPs visible in the current namespace + * 1. Kern resources should be visible in init + * namspace only + * 2. Present only resources visible in the current + * namespace */ goto next; @@ -638,10 +657,10 @@ static int nldev_res_get_qp_dumpit(struct sk_buff *skb, */ goto next; - qp = container_of(res, struct ib_qp, res); + filled = true; up_read(&device->res.rwsem); - ret = fill_res_qp_entry(skb, qp, port); + ret = fe->fill_res_func(skb, cb, res, port); down_read(&device->res.rwsem); /* * Return resource back, but it won't be released till @@ -667,10 +686,10 @@ next: idx++; cb->args[0] = idx; /* - * No more QPs to fill, cancel the message and + * No more entries to fill, cancel the message and * return 0 to mark end of dumpit. */ - if (!qp) + if (!filled) goto err; put_device(&device->dev); @@ -688,6 +707,12 @@ err_index: return ret; } +static int nldev_res_get_qp_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_QP); +} + static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, From a3b641af72ba899991ed847556951128ef41d52f Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 1 Mar 2018 13:57:36 -0800 Subject: [PATCH 054/199] RDMA/CM: move rdma_id_private to cma_priv.h Move struct rdma_id_private to a new header cma_priv.h so the resource tracking services in core/nldev.c can read useful information about cm_ids. 
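Once the definition is shared, core code can map a tracked resource back to its owning cm_id; the next patch in this series does exactly this:

	/* Pattern enabled by exposing struct rdma_id_private; the res
	 * member is added by the following patch.
	 */
	struct rdma_id_private *id_priv =
		container_of(res, struct rdma_id_private, res);
	struct rdma_cm_id *cm_id = &id_priv->id;
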
Signed-off-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/core/cma.c | 41 +--------------- drivers/infiniband/core/cma_priv.h | 79 ++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 40 deletions(-) create mode 100644 drivers/infiniband/core/cma_priv.h diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index e66963ca58bd..203519eb0048 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -62,6 +62,7 @@ #include #include "core_priv.h" +#include "cma_priv.h" MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("Generic RDMA CM Agent"); @@ -327,46 +328,6 @@ struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev) * We do this by disabling removal notification while a callback is in process, * and reporting it after the callback completes. */ -struct rdma_id_private { - struct rdma_cm_id id; - - struct rdma_bind_list *bind_list; - struct hlist_node node; - struct list_head list; /* listen_any_list or cma_device.list */ - struct list_head listen_list; /* per device listens */ - struct cma_device *cma_dev; - struct list_head mc_list; - - int internal_id; - enum rdma_cm_state state; - spinlock_t lock; - struct mutex qp_mutex; - - struct completion comp; - atomic_t refcount; - struct mutex handler_mutex; - - int backlog; - int timeout_ms; - struct ib_sa_query *query; - int query_id; - union { - struct ib_cm_id *ib; - struct iw_cm_id *iw; - } cm_id; - - u32 seq_num; - u32 qkey; - u32 qp_num; - pid_t owner; - u32 options; - u8 srq; - u8 tos; - bool tos_set; - u8 reuseaddr; - u8 afonly; - enum ib_gid_type gid_type; -}; struct cma_multicast { struct rdma_id_private *id_priv; diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h new file mode 100644 index 000000000000..11a41bef32ed --- /dev/null +++ b/drivers/infiniband/core/cma_priv.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2005 Voltaire Inc. All rights reserved. + * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. + * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. + * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _CMA_PRIV_H +#define _CMA_PRIV_H + +struct rdma_id_private { + struct rdma_cm_id id; + + struct rdma_bind_list *bind_list; + struct hlist_node node; + struct list_head list; /* listen_any_list or cma_device.list */ + struct list_head listen_list; /* per device listens */ + struct cma_device *cma_dev; + struct list_head mc_list; + + int internal_id; + enum rdma_cm_state state; + spinlock_t lock; + struct mutex qp_mutex; + + struct completion comp; + atomic_t refcount; + struct mutex handler_mutex; + + int backlog; + int timeout_ms; + struct ib_sa_query *query; + int query_id; + union { + struct ib_cm_id *ib; + struct iw_cm_id *iw; + } cm_id; + + u32 seq_num; + u32 qkey; + u32 qp_num; + pid_t owner; + u32 options; + u8 srq; + u8 tos; + bool tos_set; + u8 reuseaddr; + u8 afonly; + enum ib_gid_type gid_type; +}; +#endif /* _CMA_PRIV_H */ From 00313983cda6f37f747058e58c1cb8fba02bc134 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 1 Mar 2018 13:57:44 -0800 Subject: [PATCH 055/199] RDMA/nldev: provide detailed CM_ID information Implement the RDMA nldev netlink interface for getting detailed CM_ID information. Because cm_id's are attached to rdma devices in various work queue contexts, the pid and task information at restrack_add() time is sometimes not useful. For example, an nvme/f host connection cm_id ends up being bound to a device in a work queue context and the resulting pid at attach time no longer exists after connection setup. So instead we mark all cm_id's created via the rdma_ucm as "user", and all others as "kernel". This required tweaking the restrack code a little. It also required wrapping some rdma_cm functions to allow passing the module name string. Signed-off-by: Steve Wise Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/cma.c | 61 ++++++++++------ drivers/infiniband/core/cma_priv.h | 6 +- drivers/infiniband/core/nldev.c | 107 ++++++++++++++++++++++++++--- drivers/infiniband/core/restrack.c | 14 +++- drivers/infiniband/core/ucma.c | 8 +-- include/rdma/rdma_cm.h | 18 +++-- include/rdma/restrack.h | 20 ++++++ include/uapi/rdma/rdma_netlink.h | 14 ++++ 8 files changed, 205 insertions(+), 43 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 203519eb0048..f1c64b4909d9 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -466,6 +466,8 @@ static void _cma_attach_to_dev(struct rdma_id_private *id_priv, id_priv->id.route.addr.dev_addr.transport = rdma_node_get_transport(cma_dev->device->node_type); list_add_tail(&id_priv->list, &cma_dev->id_list); + id_priv->res.type = RDMA_RESTRACK_CM_ID; + rdma_restrack_add(&id_priv->res); } static void cma_attach_to_dev(struct rdma_id_private *id_priv, @@ -738,10 +740,10 @@ static void cma_deref_id(struct rdma_id_private *id_priv) complete(&id_priv->comp); } -struct rdma_cm_id *rdma_create_id(struct net *net, - rdma_cm_event_handler event_handler, - void *context, enum rdma_port_space ps, - enum ib_qp_type qp_type) +struct rdma_cm_id *__rdma_create_id(struct net *net, + rdma_cm_event_handler event_handler, + void *context, enum rdma_port_space ps, + enum ib_qp_type qp_type, const char *caller) { struct rdma_id_private *id_priv; @@ -749,7 +751,10 @@ struct rdma_cm_id *rdma_create_id(struct net *net, if (!id_priv) return ERR_PTR(-ENOMEM); - id_priv->owner = task_pid_nr(current); + if (caller) + id_priv->res.kern_name = caller; + else + rdma_restrack_set_task(&id_priv->res, current); id_priv->state = RDMA_CM_IDLE; id_priv->id.context
= context; id_priv->id.event_handler = event_handler; @@ -769,7 +774,7 @@ struct rdma_cm_id *rdma_create_id(struct net *net, return &id_priv->id; } -EXPORT_SYMBOL(rdma_create_id); +EXPORT_SYMBOL(__rdma_create_id); static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { @@ -1629,6 +1634,7 @@ void rdma_destroy_id(struct rdma_cm_id *id) mutex_unlock(&id_priv->handler_mutex); if (id_priv->cma_dev) { + rdma_restrack_del(&id_priv->res); if (rdma_cap_ib_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.ib) ib_destroy_cm_id(id_priv->cm_id.ib); @@ -1778,6 +1784,7 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, struct ib_cm_event *ib_event, struct net_device *net_dev) { + struct rdma_id_private *listen_id_priv; struct rdma_id_private *id_priv; struct rdma_cm_id *id; struct rdma_route *rt; @@ -1787,9 +1794,11 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, ib_event->param.req_rcvd.primary_path->service_id; int ret; - id = rdma_create_id(listen_id->route.addr.dev_addr.net, + listen_id_priv = container_of(listen_id, struct rdma_id_private, id); + id = __rdma_create_id(listen_id->route.addr.dev_addr.net, listen_id->event_handler, listen_id->context, - listen_id->ps, ib_event->param.req_rcvd.qp_type); + listen_id->ps, ib_event->param.req_rcvd.qp_type, + listen_id_priv->res.kern_name); if (IS_ERR(id)) return NULL; @@ -1838,14 +1847,17 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id, struct ib_cm_event *ib_event, struct net_device *net_dev) { + struct rdma_id_private *listen_id_priv; struct rdma_id_private *id_priv; struct rdma_cm_id *id; const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; struct net *net = listen_id->route.addr.dev_addr.net; int ret; - id = rdma_create_id(net, listen_id->event_handler, listen_id->context, - listen_id->ps, IB_QPT_UD); + listen_id_priv = container_of(listen_id, struct rdma_id_private, id); + id = __rdma_create_id(net, listen_id->event_handler, listen_id->context, + listen_id->ps, IB_QPT_UD, + listen_id_priv->res.kern_name); if (IS_ERR(id)) return NULL; @@ -2111,10 +2123,11 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, goto out; /* Create a new RDMA id for the new IW CM ID */ - new_cm_id = rdma_create_id(listen_id->id.route.addr.dev_addr.net, - listen_id->id.event_handler, - listen_id->id.context, - RDMA_PS_TCP, IB_QPT_RC); + new_cm_id = __rdma_create_id(listen_id->id.route.addr.dev_addr.net, + listen_id->id.event_handler, + listen_id->id.context, + RDMA_PS_TCP, IB_QPT_RC, + listen_id->res.kern_name); if (IS_ERR(new_cm_id)) { ret = -ENOMEM; goto out; @@ -2239,8 +2252,8 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1)) return; - id = rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps, - id_priv->id.qp_type); + id = __rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps, + id_priv->id.qp_type, id_priv->res.kern_name); if (IS_ERR(id)) return; @@ -3348,8 +3361,10 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) return 0; err2: - if (id_priv->cma_dev) + if (id_priv->cma_dev) { + rdma_restrack_del(&id_priv->res); cma_release_dev(id_priv); + } err1: cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE); return ret; @@ -3732,14 +3747,18 @@ static int cma_send_sidr_rep(struct rdma_id_private *id_priv, return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep); } -int rdma_accept(struct rdma_cm_id *id, struct 
rdma_conn_param *conn_param) +int __rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, + const char *caller) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); - id_priv->owner = task_pid_nr(current); + if (caller) + id_priv->res.kern_name = caller; + else + rdma_restrack_set_task(&id_priv->res, current); if (!cma_comp(id_priv, RDMA_CM_CONNECT)) return -EINVAL; @@ -3779,7 +3798,7 @@ reject: rdma_reject(id, NULL, 0); return ret; } -EXPORT_SYMBOL(rdma_accept); +EXPORT_SYMBOL(__rdma_accept); int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) { @@ -4457,7 +4476,7 @@ static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb) RDMA_NL_RDMA_CM_ATTR_DST_ADDR)) goto out; - id_stats->pid = id_priv->owner; + id_stats->pid = task_pid_vnr(id_priv->res.task); id_stats->port_space = id->ps; id_stats->cm_state = id_priv->state; id_stats->qp_num = id_priv->qp_num; diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h index 11a41bef32ed..56f52b70c346 100644 --- a/drivers/infiniband/core/cma_priv.h +++ b/drivers/infiniband/core/cma_priv.h @@ -67,7 +67,6 @@ struct rdma_id_private { u32 seq_num; u32 qkey; u32 qp_num; - pid_t owner; u32 options; u8 srq; u8 tos; @@ -75,5 +74,10 @@ struct rdma_id_private { u8 reuseaddr; u8 afonly; enum ib_gid_type gid_type; + + /* + * Internal to RDMA/core, don't use in the drivers + */ + struct rdma_restrack_entry res; }; #endif /* _CMA_PRIV_H */ diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index f38c6838bb31..3fd3f9e99e11 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -34,9 +34,11 @@ #include #include #include +#include #include #include "core_priv.h" +#include "cma_priv.h" static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 }, @@ -71,6 +73,13 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_RES_PID] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_KERN_NAME] = { .type = NLA_NUL_STRING, .len = TASK_COMM_LEN }, + [RDMA_NLDEV_ATTR_RES_CM_ID] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_PS] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_SRC_ADDR] = { + .len = sizeof(struct __kernel_sockaddr_storage) }, + [RDMA_NLDEV_ATTR_RES_DST_ADDR] = { + .len = sizeof(struct __kernel_sockaddr_storage) }, }; static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device) @@ -182,6 +191,7 @@ static int fill_res_info(struct sk_buff *msg, struct ib_device *device) [RDMA_RESTRACK_PD] = "pd", [RDMA_RESTRACK_CQ] = "cq", [RDMA_RESTRACK_QP] = "qp", + [RDMA_RESTRACK_CM_ID] = "cm_id", }; struct rdma_restrack_root *res = &device->res; @@ -212,6 +222,25 @@ err: return ret; } +static int fill_res_name_pid(struct sk_buff *msg, + struct rdma_restrack_entry *res) +{ + /* + * For user resources, the user should read /proc/PID/comm to get the + * name of the task file.
+ */ + if (rdma_is_kernel_res(res)) { + if (nla_put_string(msg, RDMA_NLDEV_ATTR_RES_KERN_NAME, + res->kern_name)) + return -EMSGSIZE; + } else { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PID, + task_pid_vnr(res->task))) + return -EMSGSIZE; + } + return 0; +} + static int fill_res_qp_entry(struct sk_buff *msg, struct netlink_callback *cb, struct rdma_restrack_entry *res, uint32_t port) { @@ -262,19 +291,65 @@ static int fill_res_qp_entry(struct sk_buff *msg, struct netlink_callback *cb, if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, qp_attr.qp_state)) goto err; - /* - * Existence of task means that it is user QP and netlink - * user is invited to go and read /proc/PID/comm to get name - * of the task file and res->task_com should be NULL. - */ - if (rdma_is_kernel_res(res)) { - if (nla_put_string(msg, RDMA_NLDEV_ATTR_RES_KERN_NAME, res->kern_name)) + if (fill_res_name_pid(msg, res)) + goto err; + + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); +out: + return -EMSGSIZE; +} + +static int fill_res_cm_id_entry(struct sk_buff *msg, + struct netlink_callback *cb, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct rdma_id_private *id_priv = + container_of(res, struct rdma_id_private, res); + struct rdma_cm_id *cm_id = &id_priv->id; + struct nlattr *entry_attr; + + if (port && port != cm_id->port_num) + return 0; + + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY); + if (!entry_attr) + goto out; + + if (cm_id->port_num && + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, cm_id->port_num)) + goto err; + + if (id_priv->qp_num) { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, id_priv->qp_num)) goto err; - } else { - if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PID, task_pid_vnr(res->task))) + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, cm_id->qp_type)) goto err; } + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PS, cm_id->ps)) + goto err; + + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, id_priv->state)) + goto err; + + if (cm_id->route.addr.src_addr.ss_family && + nla_put(msg, RDMA_NLDEV_ATTR_RES_SRC_ADDR, + sizeof(cm_id->route.addr.src_addr), + &cm_id->route.addr.src_addr)) + goto err; + if (cm_id->route.addr.dst_addr.ss_family && + nla_put(msg, RDMA_NLDEV_ATTR_RES_DST_ADDR, + sizeof(cm_id->route.addr.dst_addr), + &cm_id->route.addr.dst_addr)) + goto err; + + if (fill_res_name_pid(msg, res)) + goto err; + nla_nest_end(msg, entry_attr); return 0; @@ -571,6 +646,11 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { .nldev_cmd = RDMA_NLDEV_CMD_RES_QP_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_QP, }, + [RDMA_RESTRACK_CM_ID] = { + .fill_res_func = fill_res_cm_id_entry, + .nldev_cmd = RDMA_NLDEV_CMD_RES_CM_ID_GET, + .nldev_attr = RDMA_NLDEV_ATTR_RES_CM_ID, + }, }; static int res_get_common_dumpit(struct sk_buff *skb, @@ -713,6 +793,12 @@ static int nldev_res_get_qp_dumpit(struct sk_buff *skb, return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_QP); } +static int nldev_res_get_cm_id_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_CM_ID); +} + static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, @@ -739,6 +825,9 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { * too. 
*/ }, + [RDMA_NLDEV_CMD_RES_CM_ID_GET] = { + .dump = nldev_res_get_cm_id_dumpit, + }, }; void __init nldev_init(void) diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 41a780085e6d..6da949e7a50b 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -3,12 +3,15 @@ * Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved. */ +#include #include #include #include #include #include +#include "cma_priv.h" + void rdma_restrack_init(struct rdma_restrack_root *res) { init_rwsem(&res->rwsem); @@ -44,7 +47,7 @@ static void set_kern_name(struct rdma_restrack_entry *res) struct ib_qp *qp; if (type != RDMA_RESTRACK_QP) - /* PD and CQ types already have this name embedded in */ + /* Other types already have this name embedded in */ return; qp = container_of(res, struct ib_qp, res); @@ -67,6 +70,9 @@ static struct ib_device *res_to_dev(struct rdma_restrack_entry *res) return container_of(res, struct ib_cq, res)->device; case RDMA_RESTRACK_QP: return container_of(res, struct ib_qp, res)->device; + case RDMA_RESTRACK_CM_ID: + return container_of(res, struct rdma_id_private, + res)->id.device; default: WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type); return NULL; @@ -82,6 +88,8 @@ static bool res_is_user(struct rdma_restrack_entry *res) return container_of(res, struct ib_cq, res)->uobject; case RDMA_RESTRACK_QP: return container_of(res, struct ib_qp, res)->uobject; + case RDMA_RESTRACK_CM_ID: + return !res->kern_name; default: WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type); return false; @@ -96,8 +104,8 @@ void rdma_restrack_add(struct rdma_restrack_entry *res) return; if (res_is_user(res)) { - get_task_struct(current); - res->task = current; + if (!res->task) + rdma_restrack_set_task(res, current); res->kern_name = NULL; } else { set_kern_name(res); diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index f015f1bf88c9..476462639ea8 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -476,8 +476,8 @@ static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf, return -ENOMEM; ctx->uid = cmd.uid; - ctx->cm_id = rdma_create_id(current->nsproxy->net_ns, - ucma_event_handler, ctx, cmd.ps, qp_type); + ctx->cm_id = __rdma_create_id(current->nsproxy->net_ns, + ucma_event_handler, ctx, cmd.ps, qp_type, NULL); if (IS_ERR(ctx->cm_id)) { ret = PTR_ERR(ctx->cm_id); goto err1; @@ -1084,12 +1084,12 @@ static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf, if (cmd.conn_param.valid) { ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); mutex_lock(&file->mut); - ret = rdma_accept(ctx->cm_id, &conn_param); + ret = __rdma_accept(ctx->cm_id, &conn_param, NULL); if (!ret) ctx->uid = cmd.uid; mutex_unlock(&file->mut); } else - ret = rdma_accept(ctx->cm_id, NULL); + ret = __rdma_accept(ctx->cm_id, NULL, NULL); ucma_put_ctx(ctx); return ret; diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 6538a5cc27b6..62caae818173 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -157,6 +157,11 @@ struct rdma_cm_id { u8 port_num; }; +struct rdma_cm_id *__rdma_create_id(struct net *net, + rdma_cm_event_handler event_handler, + void *context, enum rdma_port_space ps, + enum ib_qp_type qp_type, const char *caller); + /** * rdma_create_id - Create an RDMA identifier. * @@ -169,10 +174,9 @@ struct rdma_cm_id { * * The id holds a reference on the network namespace until it is destroyed. 
*/ -struct rdma_cm_id *rdma_create_id(struct net *net, - rdma_cm_event_handler event_handler, - void *context, enum rdma_port_space ps, - enum ib_qp_type qp_type); +#define rdma_create_id(net, event_handler, context, ps, qp_type) \ + __rdma_create_id((net), (event_handler), (context), (ps), (qp_type), \ + KBUILD_MODNAME) /** * rdma_destroy_id - Destroys an RDMA identifier. @@ -284,6 +288,9 @@ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param); */ int rdma_listen(struct rdma_cm_id *id, int backlog); +int __rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, + const char *caller); + /** * rdma_accept - Called to accept a connection request or response. * @id: Connection identifier associated with the request. @@ -299,7 +306,8 @@ int rdma_listen(struct rdma_cm_id *id, int backlog); * state of the qp associated with the id is modified to error, such that any * previously posted receive buffers would be flushed. */ -int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param); +#define rdma_accept(id, conn_param) \ + __rdma_accept((id), (conn_param), KBUILD_MODNAME) /** * rdma_notify - Notifies the RDMA CM of an asynchronous event that has diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index 2cdf8dcf4bdc..af886670af85 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -11,6 +11,7 @@ #include #include #include +#include /** * enum rdma_restrack_type - HW objects to track @@ -28,6 +29,10 @@ enum rdma_restrack_type { * @RDMA_RESTRACK_QP: Queue pair (QP) */ RDMA_RESTRACK_QP, + /** + * @RDMA_RESTRACK_CM_ID: Connection Manager ID (CM_ID) + */ + RDMA_RESTRACK_CM_ID, /** * @RDMA_RESTRACK_MAX: Last entry, used for array dclarations */ @@ -150,4 +155,19 @@ int __must_check rdma_restrack_get(struct rdma_restrack_entry *res); * @res: resource entry */ int rdma_restrack_put(struct rdma_restrack_entry *res); + +/** + * rdma_restrack_set_task() - set the task for this resource + * @res: resource entry + * @task: task struct + */ +static inline void rdma_restrack_set_task(struct rdma_restrack_entry *res, + struct task_struct *task) +{ + if (res->task) + put_task_struct(res->task); + get_task_struct(task); + res->task = task; +} + #endif /* _RDMA_RESTRACK_H_ */ diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 4c77e2a7b07e..0399aed06548 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -238,6 +238,8 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_RES_QP_GET, /* can dump */ + RDMA_NLDEV_CMD_RES_CM_ID_GET, /* can dump */ + RDMA_NLDEV_NUM_OPS }; @@ -350,6 +352,18 @@ enum rdma_nldev_attr { */ RDMA_NLDEV_ATTR_RES_KERN_NAME, /* string */ + RDMA_NLDEV_ATTR_RES_CM_ID, /* nested table */ + RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY, /* nested table */ + /* + * rdma_cm_id port space. + */ + RDMA_NLDEV_ATTR_RES_PS, /* u32 */ + /* + * Source and destination socket addresses + */ + RDMA_NLDEV_ATTR_RES_SRC_ADDR, /* __kernel_sockaddr_storage */ + RDMA_NLDEV_ATTR_RES_DST_ADDR, /* __kernel_sockaddr_storage */ + RDMA_NLDEV_ATTR_MAX }; #endif /* _UAPI_RDMA_NETLINK_H */ From a34fc0893eef691863b5c118df8ff8e6c9fbc7b7 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 1 Mar 2018 13:57:51 -0800 Subject: [PATCH 056/199] RDMA/nldev: provide detailed CQ information Implement the RDMA nldev netlink interface for dumping detailed CQ information. 
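The dump nests one entry per tracked CQ under a per-device table; schematically (a summary of the attributes added below, not extra code):

	RDMA_NLDEV_ATTR_RES_CQ			/* nested table per device */
	  RDMA_NLDEV_ATTR_RES_CQ_ENTRY		/* nested, one per CQ */
	    RDMA_NLDEV_ATTR_RES_CQE		/* u32, CQ depth */
	    RDMA_NLDEV_ATTR_RES_USECNT		/* u64, reference count */
	    RDMA_NLDEV_ATTR_RES_POLL_CTX	/* u8, kernel CQs only */
	    RDMA_NLDEV_ATTR_RES_PID or RDMA_NLDEV_ATTR_RES_KERN_NAME
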
Reviewed-by: Leon Romanovsky Signed-off-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/core/nldev.c | 52 ++++++++++++++++++++++++++++++++ include/uapi/rdma/rdma_netlink.h | 8 +++++ 2 files changed, 60 insertions(+) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 3fd3f9e99e11..83e43926c957 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -80,6 +80,11 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { .len = sizeof(struct __kernel_sockaddr_storage) }, [RDMA_NLDEV_ATTR_RES_DST_ADDR] = { .len = sizeof(struct __kernel_sockaddr_storage) }, + [RDMA_NLDEV_ATTR_RES_CQ] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_CQ_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_CQE] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_USECNT] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_RES_POLL_CTX] = { .type = NLA_U8 }, }; static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device) @@ -359,6 +364,39 @@ out: return -EMSGSIZE; } +static int fill_res_cq_entry(struct sk_buff *msg, struct netlink_callback *cb, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_cq *cq = container_of(res, struct ib_cq, res); + struct nlattr *entry_attr; + + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_CQ_ENTRY); + if (!entry_attr) + goto out; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQE, cq->cqe)) + goto err; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_USECNT, + atomic_read(&cq->usecnt), 0)) + goto err; + + /* Poll context is only valid for kernel CQs */ + if (rdma_is_kernel_res(res) && + nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_POLL_CTX, cq->poll_ctx)) + goto err; + + if (fill_res_name_pid(msg, res)) + goto err; + + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); +out: + return -EMSGSIZE; +} + static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -651,6 +689,11 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { .nldev_cmd = RDMA_NLDEV_CMD_RES_CM_ID_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_CM_ID, }, + [RDMA_RESTRACK_CQ] = { + .fill_res_func = fill_res_cq_entry, + .nldev_cmd = RDMA_NLDEV_CMD_RES_CQ_GET, + .nldev_attr = RDMA_NLDEV_ATTR_RES_CQ, + }, }; static int res_get_common_dumpit(struct sk_buff *skb, @@ -799,6 +842,12 @@ static int nldev_res_get_cm_id_dumpit(struct sk_buff *skb, return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_CM_ID); } +static int nldev_res_get_cq_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_CQ); +} + static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, @@ -828,6 +877,9 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_RES_CM_ID_GET] = { .dump = nldev_res_get_cm_id_dumpit, }, + [RDMA_NLDEV_CMD_RES_CQ_GET] = { + .dump = nldev_res_get_cq_dumpit, + }, }; void __init nldev_init(void) diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 0399aed06548..36cf1f0025fd 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -240,6 +240,8 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_RES_CM_ID_GET, /* can dump */ + RDMA_NLDEV_CMD_RES_CQ_GET, /* can dump */ + RDMA_NLDEV_NUM_OPS }; @@ -364,6 +366,12 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_RES_SRC_ADDR, /* __kernel_sockaddr_storage */ RDMA_NLDEV_ATTR_RES_DST_ADDR, /* 
__kernel_sockaddr_storage */ + RDMA_NLDEV_ATTR_RES_CQ, /* nested table */ + RDMA_NLDEV_ATTR_RES_CQ_ENTRY, /* nested table */ + RDMA_NLDEV_ATTR_RES_CQE, /* u32 */ + RDMA_NLDEV_ATTR_RES_USECNT, /* u64 */ + RDMA_NLDEV_ATTR_RES_POLL_CTX, /* u8 */ + RDMA_NLDEV_ATTR_MAX }; #endif /* _UAPI_RDMA_NETLINK_H */ From 750fb1656ab7781d5d0f1cd38ca4f1f958f02f45 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 1 Mar 2018 13:57:58 -0800 Subject: [PATCH 057/199] iw_cxgb4: initialize ib_mr fields for user mrs Some of the struct ib_mr fields weren't getting initialized. This was benign, but will cause problems when dumping the mr resource via nldev/restrack. Signed-off-by: Steve Wise Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb4/mem.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index 7e0eb201cc26..e90f2fd8dc16 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -391,6 +391,9 @@ static int finish_mem_reg(struct c4iw_mr *mhp, u32 stag) mhp->attr.stag = stag; mmid = stag >> 8; mhp->ibmr.rkey = mhp->ibmr.lkey = stag; + mhp->ibmr.length = mhp->attr.len; + mhp->ibmr.iova = mhp->attr.va_fbo; + mhp->ibmr.page_size = 1U << (mhp->attr.page_size + 12); pr_debug("mmid 0x%x mhp %p\n", mmid, mhp); return insert_handle(mhp->rhp, &mhp->rhp->mmidr, mhp, mmid); } From e6f0330106f4aa51e377b183c759758242ccab6c Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 1 Mar 2018 13:58:06 -0800 Subject: [PATCH 058/199] mlx4_ib: set user mr attributes in struct ib_mr Setting iova, length, and page_size allows this information to be seen via NLDEV netlink queries, which can aid in user rdma debugging. Signed-off-by: Steve Wise Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/mr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index 4975f3e6596e..17f4f151a97f 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -407,6 +407,9 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, goto err_mr; mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key; + mr->ibmr.length = length; + mr->ibmr.iova = virt_addr; + mr->ibmr.page_size = 1U << shift; return &mr->ibmr; From fccec5b89ac61ebe2f353feecd08a16621f2418b Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 1 Mar 2018 13:58:13 -0800 Subject: [PATCH 059/199] RDMA/nldev: provide detailed MR information Implement the RDMA nldev netlink interface for dumping detailed MR information. 
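Note that the security-sensitive attributes (rkey, lkey, iova) are only emitted when the netlink requester has CAP_NET_ADMIN; unprivileged dumpers still get the length and the owner. Schematically (a summary of the code below):

	RDMA_NLDEV_ATTR_RES_MR			/* nested table per device */
	  RDMA_NLDEV_ATTR_RES_MR_ENTRY		/* nested, one per MR */
	    RDMA_NLDEV_ATTR_RES_RKEY		/* u32, CAP_NET_ADMIN only */
	    RDMA_NLDEV_ATTR_RES_LKEY		/* u32, CAP_NET_ADMIN only */
	    RDMA_NLDEV_ATTR_RES_IOVA		/* u64, CAP_NET_ADMIN only */
	    RDMA_NLDEV_ATTR_RES_MRLEN		/* u64 */
	    RDMA_NLDEV_ATTR_RES_PID or RDMA_NLDEV_ATTR_RES_KERN_NAME
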
Signed-off-by: Steve Wise Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/nldev.c | 56 ++++++++++++++++++++++++++++ drivers/infiniband/core/restrack.c | 36 +++++++++++------- drivers/infiniband/core/uverbs_cmd.c | 2 + drivers/infiniband/core/verbs.c | 3 ++ include/rdma/ib_verbs.h | 5 +++ include/rdma/restrack.h | 4 ++ include/uapi/rdma/rdma_netlink.h | 9 +++++ 7 files changed, 102 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 83e43926c957..4c6626ecdb99 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -85,6 +85,12 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_RES_CQE] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_USECNT] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_RES_POLL_CTX] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_MR] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_MR_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_RKEY] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_LKEY] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_IOVA] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_RES_MRLEN] = { .type = NLA_U64 }, }; static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device) @@ -197,6 +203,7 @@ static int fill_res_info(struct sk_buff *msg, struct ib_device *device) [RDMA_RESTRACK_CQ] = "cq", [RDMA_RESTRACK_QP] = "qp", [RDMA_RESTRACK_CM_ID] = "cm_id", + [RDMA_RESTRACK_MR] = "mr", }; struct rdma_restrack_root *res = &device->res; @@ -397,6 +404,41 @@ out: return -EMSGSIZE; } +static int fill_res_mr_entry(struct sk_buff *msg, struct netlink_callback *cb, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_mr *mr = container_of(res, struct ib_mr, res); + struct nlattr *entry_attr; + + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_MR_ENTRY); + if (!entry_attr) + goto out; + + if (netlink_capable(cb->skb, CAP_NET_ADMIN)) { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RKEY, mr->rkey)) + goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LKEY, mr->lkey)) + goto err; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_IOVA, + mr->iova, 0)) + goto err; + } + + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_MRLEN, mr->length, 0)) + goto err; + + if (fill_res_name_pid(msg, res)) + goto err; + + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); +out: + return -EMSGSIZE; +} + static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -694,6 +736,11 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { .nldev_cmd = RDMA_NLDEV_CMD_RES_CQ_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_CQ, }, + [RDMA_RESTRACK_MR] = { + .fill_res_func = fill_res_mr_entry, + .nldev_cmd = RDMA_NLDEV_CMD_RES_MR_GET, + .nldev_attr = RDMA_NLDEV_ATTR_RES_MR, + }, }; static int res_get_common_dumpit(struct sk_buff *skb, @@ -848,6 +895,12 @@ static int nldev_res_get_cq_dumpit(struct sk_buff *skb, return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_CQ); } +static int nldev_res_get_mr_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_MR); +} + static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, @@ -880,6 +933,9 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_RES_CQ_GET] = { .dump = nldev_res_get_cq_dumpit, }, + [RDMA_NLDEV_CMD_RES_MR_GET] = { + .dump = 
nldev_res_get_mr_dumpit, + }, }; void __init nldev_init(void) diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 6da949e7a50b..e1d9934d6e81 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -43,22 +43,28 @@ EXPORT_SYMBOL(rdma_restrack_count); static void set_kern_name(struct rdma_restrack_entry *res) { - enum rdma_restrack_type type = res->type; - struct ib_qp *qp; + struct ib_pd *pd; - if (type != RDMA_RESTRACK_QP) - /* Other types already have this name embedded in */ - return; - - qp = container_of(res, struct ib_qp, res); - if (!qp->pd) { - WARN_ONCE(true, "XRC QPs are not supported\n"); - /* Survive, despite the programmer's error */ - res->kern_name = " "; - return; + switch (res->type) { + case RDMA_RESTRACK_QP: + pd = container_of(res, struct ib_qp, res)->pd; + if (!pd) { + WARN_ONCE(true, "XRC QPs are not supported\n"); + /* Survive, despite the programmer's error */ + res->kern_name = " "; + } + break; + case RDMA_RESTRACK_MR: + pd = container_of(res, struct ib_mr, res)->pd; + break; + default: + /* Other types set kern_name directly */ + pd = NULL; + break; } - res->kern_name = qp->pd->res.kern_name; + if (pd) + res->kern_name = pd->res.kern_name; } static struct ib_device *res_to_dev(struct rdma_restrack_entry *res) @@ -73,6 +79,8 @@ static struct ib_device *res_to_dev(struct rdma_restrack_entry *res) case RDMA_RESTRACK_CM_ID: return container_of(res, struct rdma_id_private, res)->id.device; + case RDMA_RESTRACK_MR: + return container_of(res, struct ib_mr, res)->device; default: WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type); return NULL; @@ -90,6 +98,8 @@ static bool res_is_user(struct rdma_restrack_entry *res) return container_of(res, struct ib_qp, res)->uobject; case RDMA_RESTRACK_CM_ID: return !res->kern_name; + case RDMA_RESTRACK_MR: + return container_of(res, struct ib_mr, res)->pd->uobject; default: WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type); return false; diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index a148de35df8d..9f9fc14523db 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -693,6 +693,8 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, mr->pd = pd; mr->uobject = uobj; atomic_inc(&pd->usecnt); + mr->res.type = RDMA_RESTRACK_MR; + rdma_restrack_add(&mr->res); uobj->object = mr; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 4e2b231b03f7..873b7aa9e8dd 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1622,6 +1622,7 @@ int ib_dereg_mr(struct ib_mr *mr) struct ib_pd *pd = mr->pd; int ret; + rdma_restrack_del(&mr->res); ret = mr->device->dereg_mr(mr); if (!ret) atomic_dec(&pd->usecnt); @@ -1658,6 +1659,8 @@ struct ib_mr *ib_alloc_mr(struct ib_pd *pd, mr->uobject = NULL; atomic_inc(&pd->usecnt); mr->need_inval = false; + mr->res.type = RDMA_RESTRACK_MR; + rdma_restrack_add(&mr->res); } return mr; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 73b2387e3f74..7df3274818f9 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1772,6 +1772,11 @@ struct ib_mr { struct ib_uobject *uobject; /* user */ struct list_head qp_entry; /* FR */ }; + + /* + * Implementation details of the RDMA core, don't use in drivers: + */ + struct rdma_restrack_entry res; }; struct ib_mw { diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index 
af886670af85..a56f4f200277 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -33,6 +33,10 @@ enum rdma_restrack_type { * @RDMA_RESTRACK_CM_ID: Connection Manager ID (CM_ID) */ RDMA_RESTRACK_CM_ID, + /** + * @RDMA_RESTRACK_MR: Memory Region (MR) + */ + RDMA_RESTRACK_MR, /** * @RDMA_RESTRACK_MAX: Last entry, used for array dclarations */ diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 36cf1f0025fd..6d9ec38e3af0 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -242,6 +242,8 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_RES_CQ_GET, /* can dump */ + RDMA_NLDEV_CMD_RES_MR_GET, /* can dump */ + RDMA_NLDEV_NUM_OPS }; @@ -372,6 +374,13 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_RES_USECNT, /* u64 */ RDMA_NLDEV_ATTR_RES_POLL_CTX, /* u8 */ + RDMA_NLDEV_ATTR_RES_MR, /* nested table */ + RDMA_NLDEV_ATTR_RES_MR_ENTRY, /* nested table */ + RDMA_NLDEV_ATTR_RES_RKEY, /* u32 */ + RDMA_NLDEV_ATTR_RES_LKEY, /* u32 */ + RDMA_NLDEV_ATTR_RES_IOVA, /* u64 */ + RDMA_NLDEV_ATTR_RES_MRLEN, /* u64 */ + RDMA_NLDEV_ATTR_MAX }; #endif /* _UAPI_RDMA_NETLINK_H */ From 5292443431fff5ba20f1b48985d382dd824016eb Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 1 Mar 2018 13:58:20 -0800 Subject: [PATCH 060/199] mlx4_ib: zero out struct ib_pd when allocating Zero out the fields of the struct ib_pd for user mode pds so that users querying pds via nldev will not get garbage. For simplicity, use kzalloc() to allocate the mlx4_ib_pd struct. Signed-off-by: Steve Wise Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 8d2ee9322f2e..2e3789fffcab 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1327,7 +1327,7 @@ static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev, struct mlx4_ib_pd *pd; int err; - pd = kmalloc(sizeof *pd, GFP_KERNEL); + pd = kzalloc(sizeof(*pd), GFP_KERNEL); if (!pd) return ERR_PTR(-ENOMEM); @@ -1343,7 +1343,6 @@ static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev, kfree(pd); return ERR_PTR(-EFAULT); } - return &pd->ibpd; } From 29cf1351d450f95957eb0ef2e8cc0c7765fc5785 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 1 Mar 2018 13:58:28 -0800 Subject: [PATCH 061/199] RDMA/nldev: provide detailed PD information Implement the RDMA nldev netlink interface for dumping detailed PD information. 
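The intended consumer of these tables is iproute2's rdma tool (rdma resource show pd in recent versions). For readers who want to drive the interface directly, a minimal dump request could look like the sketch below. This is a hedged illustration, not code from the patch: it assumes the UAPI additions in the hunks below are installed as <rdma/rdma_netlink.h>, and attribute parsing plus most error handling are elided.

    #include <linux/netlink.h>
    #include <rdma/rdma_netlink.h>
    #include <sys/socket.h>
    #include <unistd.h>

    /* Illustrative sketch: dump all tracked PDs with one
     * RDMA_NLDEV_CMD_RES_PD_GET request carrying NLM_F_DUMP; replies
     * contain RDMA_NLDEV_ATTR_RES_PD nested tables. */
    static int dump_pds(void)
    {
            struct sockaddr_nl dst = { .nl_family = AF_NETLINK };
            struct nlmsghdr req = {
                    .nlmsg_len   = NLMSG_LENGTH(0),
                    .nlmsg_type  = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
                                                    RDMA_NLDEV_CMD_RES_PD_GET),
                    .nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
            };
            char buf[8192];
            int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_RDMA);

            if (fd < 0)
                    return -1;
            if (sendto(fd, &req, req.nlmsg_len, 0,
                       (struct sockaddr *)&dst, sizeof(dst)) < 0)
                    goto err;
            for (;;) {
                    ssize_t n = recv(fd, buf, sizeof(buf), 0);
                    struct nlmsghdr *h = (struct nlmsghdr *)buf;

                    if (n <= 0)
                            goto err;
                    /* Walk the messages with NLMSG_OK()/NLMSG_NEXT() and
                     * parse the nested RDMA_NLDEV_ATTR_RES_PD_ENTRY
                     * attributes here. */
                    if (h->nlmsg_type == NLMSG_DONE)
                            break;
            }
            close(fd);
            return 0;
    err:
            close(fd);
            return -1;
    }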
Reviewed-by: Leon Romanovsky Signed-off-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/core/nldev.c | 57 ++++++++++++++++++++++++++++++++ include/uapi/rdma/rdma_netlink.h | 7 ++++ 2 files changed, 64 insertions(+) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 4c6626ecdb99..192084c78352 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -91,6 +91,10 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_RES_LKEY] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_IOVA] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_RES_MRLEN] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_RES_PD] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_PD_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY] = { .type = NLA_U32 }, }; static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device) @@ -439,6 +443,45 @@ out: return -EMSGSIZE; } +static int fill_res_pd_entry(struct sk_buff *msg, struct netlink_callback *cb, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_pd *pd = container_of(res, struct ib_pd, res); + struct nlattr *entry_attr; + + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_PD_ENTRY); + if (!entry_attr) + goto out; + + if (netlink_capable(cb->skb, CAP_NET_ADMIN)) { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY, + pd->local_dma_lkey)) + goto err; + if ((pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY, + pd->unsafe_global_rkey)) + goto err; + } + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_USECNT, + atomic_read(&pd->usecnt), 0)) + goto err; + if ((pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY, + pd->unsafe_global_rkey)) + goto err; + + if (fill_res_name_pid(msg, res)) + goto err; + + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); +out: + return -EMSGSIZE; +} + static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -741,6 +784,11 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { .nldev_cmd = RDMA_NLDEV_CMD_RES_MR_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_MR, }, + [RDMA_RESTRACK_PD] = { + .fill_res_func = fill_res_pd_entry, + .nldev_cmd = RDMA_NLDEV_CMD_RES_PD_GET, + .nldev_attr = RDMA_NLDEV_ATTR_RES_PD, + }, }; static int res_get_common_dumpit(struct sk_buff *skb, @@ -901,6 +949,12 @@ static int nldev_res_get_mr_dumpit(struct sk_buff *skb, return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_MR); } +static int nldev_res_get_pd_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_PD); +} + static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, @@ -936,6 +990,9 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_RES_MR_GET] = { .dump = nldev_res_get_mr_dumpit, }, + [RDMA_NLDEV_CMD_RES_PD_GET] = { + .dump = nldev_res_get_pd_dumpit, + }, }; void __init nldev_init(void) diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 6d9ec38e3af0..351139c7e2e7 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -244,6 +244,8 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_RES_MR_GET, /* can dump */ + RDMA_NLDEV_CMD_RES_PD_GET, /* can 
dump */ + RDMA_NLDEV_NUM_OPS }; @@ -381,6 +383,11 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_RES_IOVA, /* u64 */ RDMA_NLDEV_ATTR_RES_MRLEN, /* u64 */ + RDMA_NLDEV_ATTR_RES_PD, /* nested table */ + RDMA_NLDEV_ATTR_RES_PD_ENTRY, /* nested table */ + RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY, /* u32 */ + RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY, /* u32 */ + RDMA_NLDEV_ATTR_MAX }; #endif /* _UAPI_RDMA_NETLINK_H */ From 8932ff803d72804316ea85fe6705e0867f827d65 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 7 Mar 2018 16:56:03 -0800 Subject: [PATCH 062/199] IB/hfi1: Fix a kernel-doc warning Avoid that building with W=1 causes the following warning to appear: drivers/infiniband/hw/hfi1/qp.c:484: warning: Cannot understand * on line 484 - I thought it was a doc line Signed-off-by: Bart Van Assche Cc: Mike Marciniszyn Cc: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/qp.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index d30dd1a5b0a6..1697d96151bd 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -481,7 +481,6 @@ static void iowait_sdma_drained(struct iowait *wait) } /** - * * qp_to_sdma_engine - map a qp to a send engine * @qp: the QP * @sc5: the 5 bit sc From 036ef0a1a867511ff1b4ecb3e5b185267216dab0 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 7 Mar 2018 16:57:56 -0800 Subject: [PATCH 063/199] RDMA/bnxt_re: Remove an unused variable This patch does not change any functionality. Signed-off-by: Bart Van Assche Cc: Selvin Xavier Cc: Devesh Sharma Cc: Somnath Kotur Cc: Sriharsha Basavapatna Signed-off-by: Doug Ledford --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 6593d4cad26a..23be00afaa6f 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -1540,14 +1540,13 @@ int bnxt_re_post_srq_recv(struct ib_srq *ib_srq, struct ib_recv_wr *wr, ib_srq); struct bnxt_qplib_swqe wqe; unsigned long flags; - int rc = 0, payload_sz = 0; + int rc = 0; spin_lock_irqsave(&srq->lock, flags); while (wr) { /* Transcribe each ib_recv_wr to qplib_swqe */ wqe.num_sge = wr->num_sge; - payload_sz = bnxt_re_build_sgl(wr->sg_list, wqe.sg_list, - wr->num_sge); + bnxt_re_build_sgl(wr->sg_list, wqe.sg_list, wr->num_sge); wqe.wr_id = wr->wr_id; wqe.type = BNXT_QPLIB_SWQE_TYPE_RECV; From e088a685eae94a0607b8f7b99949a0e14d748813 Mon Sep 17 00:00:00 2001 From: Yixian Liu Date: Fri, 9 Mar 2018 18:36:29 +0800 Subject: [PATCH 064/199] RDMA/hns: Support rq record doorbell for the user space This patch adds interfaces and definitions to support the rq record doorbell for the user space. 
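Since the kernel side pins whatever page the user supplies (hns_roce_db_map_user() added below wraps ib_umem_get()), the userspace provider only needs to allocate a 4-byte record and pass its address through the vendor command channel. A sketch of that side, with the allocation strategy invented for illustration (a real provider carves records out of shared pages; db_addr is the field the kernel reads as ucmd.db_addr in the hunks below):

    #include <stdint.h>
    #include <stdlib.h>
    #include <unistd.h>
    #include <rdma/hns-abi.h>

    /* Illustrative only: allocate one rq doorbell record and point the
     * create-qp command at it; the kernel pins the containing page. */
    static int prepare_rq_db(struct hns_roce_ib_create_qp *cmd)
    {
            uint32_t *db_rec;

            if (posix_memalign((void **)&db_rec, sysconf(_SC_PAGESIZE),
                               sizeof(*db_rec)))
                    return -1;
            *db_rec = 0;                     /* rq head starts at zero */
            cmd->db_addr = (uintptr_t)db_rec;
            return 0;
    }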
Signed-off-by: Yixian Liu Signed-off-by: Lijun Ou Signed-off-by: Wei Hu (Xavier) Signed-off-by: Shaobo Xu Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hns/Makefile | 2 +- drivers/infiniband/hw/hns/hns_roce_db.c | 68 +++++++++++++++++++++ drivers/infiniband/hw/hns/hns_roce_device.h | 46 +++++++++++++- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 26 +++++++- drivers/infiniband/hw/hns/hns_roce_main.c | 5 ++ drivers/infiniband/hw/hns/hns_roce_qp.c | 53 +++++++++++++++- include/uapi/rdma/hns-abi.h | 5 ++ 7 files changed, 200 insertions(+), 5 deletions(-) create mode 100644 drivers/infiniband/hw/hns/hns_roce_db.c diff --git a/drivers/infiniband/hw/hns/Makefile b/drivers/infiniband/hw/hns/Makefile index 97bf2cd1cacb..cf03404b9d58 100644 --- a/drivers/infiniband/hw/hns/Makefile +++ b/drivers/infiniband/hw/hns/Makefile @@ -7,7 +7,7 @@ ccflags-y := -Idrivers/net/ethernet/hisilicon/hns3 obj-$(CONFIG_INFINIBAND_HNS) += hns-roce.o hns-roce-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \ hns_roce_ah.o hns_roce_hem.o hns_roce_mr.o hns_roce_qp.o \ - hns_roce_cq.o hns_roce_alloc.o + hns_roce_cq.o hns_roce_alloc.o hns_roce_db.o obj-$(CONFIG_INFINIBAND_HNS_HIP06) += hns-roce-hw-v1.o hns-roce-hw-v1-objs := hns_roce_hw_v1.o obj-$(CONFIG_INFINIBAND_HNS_HIP08) += hns-roce-hw-v2.o diff --git a/drivers/infiniband/hw/hns/hns_roce_db.c b/drivers/infiniband/hw/hns/hns_roce_db.c new file mode 100644 index 000000000000..987f2811d2c4 --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_db.c @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) */ +/* + * Copyright (c) 2017 Hisilicon Limited. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + */ + +#include +#include +#include "hns_roce_device.h" + +int hns_roce_db_map_user(struct hns_roce_ucontext *context, unsigned long virt, + struct hns_roce_db *db) +{ + struct hns_roce_user_db_page *page; + int ret = 0; + + mutex_lock(&context->page_mutex); + + list_for_each_entry(page, &context->page_list, list) + if (page->user_virt == (virt & PAGE_MASK)) + goto found; + + page = kmalloc(sizeof(*page), GFP_KERNEL); + if (!page) { + ret = -ENOMEM; + goto out; + } + + refcount_set(&page->refcount, 1); + page->user_virt = (virt & PAGE_MASK); + page->umem = ib_umem_get(&context->ibucontext, virt & PAGE_MASK, + PAGE_SIZE, 0, 0); + if (IS_ERR(page->umem)) { + ret = PTR_ERR(page->umem); + kfree(page); + goto out; + } + + list_add(&page->list, &context->page_list); + +found: + db->dma = sg_dma_address(page->umem->sg_head.sgl) + + (virt & ~PAGE_MASK); + db->u.user_page = page; + refcount_inc(&page->refcount); + +out: + mutex_unlock(&context->page_mutex); + + return ret; +} +EXPORT_SYMBOL(hns_roce_db_map_user); + +void hns_roce_db_unmap_user(struct hns_roce_ucontext *context, + struct hns_roce_db *db) +{ + mutex_lock(&context->page_mutex); + + refcount_dec(&db->u.user_page->refcount); + if (refcount_dec_if_one(&db->u.user_page->refcount)) { + list_del(&db->u.user_page->list); + ib_umem_release(db->u.user_page->umem); + kfree(db->u.user_page); + } + + mutex_unlock(&context->page_mutex); +} +EXPORT_SYMBOL(hns_roce_db_unmap_user); diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 165a09b314f6..aa5cc78244ba 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -105,6 +105,10 @@ #define PAGES_SHIFT_24 24 #define PAGES_SHIFT_32 32 +enum { + HNS_ROCE_SUPPORT_RQ_RECORD_DB = 1 << 0, +}; + enum hns_roce_qp_state { 
HNS_ROCE_QP_STATE_RST, HNS_ROCE_QP_STATE_INIT, @@ -178,7 +182,8 @@ enum { enum { HNS_ROCE_CAP_FLAG_REREG_MR = BIT(0), HNS_ROCE_CAP_FLAG_ROCE_V1_V2 = BIT(1), - HNS_ROCE_CAP_FLAG_RQ_INLINE = BIT(2) + HNS_ROCE_CAP_FLAG_RQ_INLINE = BIT(2), + HNS_ROCE_CAP_FLAG_RECORD_DB = BIT(3) }; enum hns_roce_mtt_type { @@ -186,6 +191,10 @@ enum hns_roce_mtt_type { MTT_TYPE_CQE, }; +enum { + HNS_ROCE_DB_PER_PAGE = PAGE_SIZE / 4 +}; + #define HNS_ROCE_CMD_SUCCESS 1 #define HNS_ROCE_PORT_DOWN 0 @@ -203,6 +212,8 @@ struct hns_roce_uar { struct hns_roce_ucontext { struct ib_ucontext ibucontext; struct hns_roce_uar uar; + struct list_head page_list; + struct mutex page_mutex; }; struct hns_roce_pd { @@ -335,6 +346,33 @@ struct hns_roce_buf { int page_shift; }; +struct hns_roce_db_pgdir { + struct list_head list; + DECLARE_BITMAP(order0, HNS_ROCE_DB_PER_PAGE); + DECLARE_BITMAP(order1, HNS_ROCE_DB_PER_PAGE / 2); + unsigned long *bits[2]; + u32 *page; + dma_addr_t db_dma; +}; + +struct hns_roce_user_db_page { + struct list_head list; + struct ib_umem *umem; + unsigned long user_virt; + refcount_t refcount; +}; + +struct hns_roce_db { + u32 *db_record; + union { + struct hns_roce_db_pgdir *pgdir; + struct hns_roce_user_db_page *user_page; + } u; + dma_addr_t dma; + int index; + int order; +}; + struct hns_roce_cq_buf { struct hns_roce_buf hr_buf; struct hns_roce_mtt hr_mtt; @@ -466,6 +504,8 @@ struct hns_roce_qp { struct ib_qp ibqp; struct hns_roce_buf hr_buf; struct hns_roce_wq rq; + struct hns_roce_db rdb; + u8 rdb_en; u32 doorbell_qpn; __le32 sq_signal_bits; u32 sq_next_wqe; @@ -930,6 +970,10 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev, int hns_roce_ib_destroy_cq(struct ib_cq *ib_cq); void hns_roce_free_cq(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq); +int hns_roce_db_map_user(struct hns_roce_ucontext *context, unsigned long virt, + struct hns_roce_db *db); +void hns_roce_db_unmap_user(struct hns_roce_ucontext *context, + struct hns_roce_db *db); void hns_roce_cq_completion(struct hns_roce_dev *hr_dev, u32 cqn); void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type); void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 016bca1923ec..21575912f739 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1168,7 +1168,8 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev) caps->flags = HNS_ROCE_CAP_FLAG_REREG_MR | HNS_ROCE_CAP_FLAG_ROCE_V1_V2 | - HNS_ROCE_CAP_FLAG_RQ_INLINE; + HNS_ROCE_CAP_FLAG_RQ_INLINE | + HNS_ROCE_CAP_FLAG_RECORD_DB; caps->pkey_table_len[0] = 1; caps->gid_table_len[0] = HNS_ROCE_V2_GID_INDEX_NUM; caps->ceqe_depth = HNS_ROCE_V2_COMP_EQE_NUM; @@ -2274,6 +2275,23 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, hr_qp->qkey = attr->qkey; } + if (hr_qp->rdb_en) { + roce_set_bit(context->byte_68_rq_db, + V2_QPC_BYTE_68_RQ_RECORD_EN_S, 1); + roce_set_bit(qpc_mask->byte_68_rq_db, + V2_QPC_BYTE_68_RQ_RECORD_EN_S, 0); + } + + roce_set_field(context->byte_68_rq_db, + V2_QPC_BYTE_68_RQ_DB_RECORD_ADDR_M, + V2_QPC_BYTE_68_RQ_DB_RECORD_ADDR_S, + ((u32)hr_qp->rdb.dma) >> 1); + roce_set_field(qpc_mask->byte_68_rq_db, + V2_QPC_BYTE_68_RQ_DB_RECORD_ADDR_M, + V2_QPC_BYTE_68_RQ_DB_RECORD_ADDR_S, 0); + context->rq_db_record_addr = hr_qp->rdb.dma >> 32; + qpc_mask->rq_db_record_addr = 0; + roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RQIE_S, 1); 
roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_RQIE_S, 0); @@ -3211,6 +3229,8 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, hr_qp->sq.tail = 0; hr_qp->sq_next_wqe = 0; hr_qp->next_sge = 0; + if (hr_qp->rq.wqe_cnt) + *hr_qp->rdb.db_record = 0; } out: @@ -3437,6 +3457,10 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev, hns_roce_mtt_cleanup(hr_dev, &hr_qp->mtt); if (is_user) { + if (hr_qp->rq.wqe_cnt && (hr_qp->rdb_en == 1)) + hns_roce_db_unmap_user( + to_hr_ucontext(hr_qp->ibqp.uobject->context), + &hr_qp->rdb); ib_umem_release(hr_qp->umem); } else { kfree(hr_qp->sq.wrid); diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 8255bb9021b0..d6c9c578dba1 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -351,6 +351,11 @@ static struct ib_ucontext *hns_roce_alloc_ucontext(struct ib_device *ib_dev, if (ret) goto error_fail_uar_alloc; + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) { + INIT_LIST_HEAD(&context->page_list); + mutex_init(&context->page_mutex); + } + ret = ib_copy_to_udata(udata, &resp, sizeof(resp)); if (ret) goto error_fail_copy_to_udata; diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 088973a05882..92597e280a63 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -489,6 +489,15 @@ static int hns_roce_set_kernel_sq_size(struct hns_roce_dev *hr_dev, return 0; } +static int hns_roce_qp_has_rq(struct ib_qp_init_attr *attr) +{ + if (attr->qp_type == IB_QPT_XRC_INI || + attr->qp_type == IB_QPT_XRC_TGT || attr->srq) + return 0; + + return 1; +} + static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, struct ib_pd *ib_pd, struct ib_qp_init_attr *init_attr, @@ -497,6 +506,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, { struct device *dev = hr_dev->dev; struct hns_roce_ib_create_qp ucmd; + struct hns_roce_ib_create_qp_resp resp; unsigned long qpn = 0; int ret = 0; u32 page_shift; @@ -602,6 +612,18 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, dev_err(dev, "hns_roce_ib_umem_write_mtt error for create qp\n"); goto err_mtt; } + + if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) && + (udata->outlen == sizeof(resp)) && + hns_roce_qp_has_rq(init_attr)) { + ret = hns_roce_db_map_user( + to_hr_ucontext(ib_pd->uobject->context), + ucmd.db_addr, &hr_qp->rdb); + if (ret) { + dev_err(dev, "rp record doorbell map failed!\n"); + goto err_mtt; + } + } } else { if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) { @@ -698,17 +720,44 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, else hr_qp->doorbell_qpn = cpu_to_le64(hr_qp->qpn); + if (ib_pd->uobject && (udata->outlen == sizeof(resp)) && + (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB)) { + + /* indicate kernel supports record db */ + resp.cap_flags |= HNS_ROCE_SUPPORT_RQ_RECORD_DB; + ret = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (ret) + goto err_qp; + + hr_qp->rdb_en = 1; + } hr_qp->event = hns_roce_ib_qp_event; return 0; +err_qp: + if (init_attr->qp_type == IB_QPT_GSI && + hr_dev->hw_rev == HNS_ROCE_HW_VER1) + hns_roce_qp_remove(hr_dev, hr_qp); + else + hns_roce_qp_free(hr_dev, hr_qp); + err_qpn: if (!sqpn) hns_roce_release_range_qp(hr_dev, qpn, 1); err_wrid: - kfree(hr_qp->sq.wrid); - kfree(hr_qp->rq.wrid); + if (ib_pd->uobject) { + if ((hr_dev->caps.flags & 
HNS_ROCE_CAP_FLAG_RECORD_DB) && + (udata->outlen == sizeof(resp)) && + hns_roce_qp_has_rq(init_attr)) + hns_roce_db_unmap_user( + to_hr_ucontext(ib_pd->uobject->context), + &hr_qp->rdb); + } else { + kfree(hr_qp->sq.wrid); + kfree(hr_qp->rq.wrid); + } err_mtt: hns_roce_mtt_cleanup(hr_dev, &hr_qp->mtt); diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h index a9c03b0eed57..6150c1941eca 100644 --- a/include/uapi/rdma/hns-abi.h +++ b/include/uapi/rdma/hns-abi.h @@ -49,7 +49,12 @@ struct hns_roce_ib_create_qp { __u8 reserved[5]; }; +struct hns_roce_ib_create_qp_resp { + __u64 cap_flags; +}; + struct hns_roce_ib_alloc_ucontext_resp { __u32 qp_tab_size; + __u32 reserved; }; #endif /* HNS_ABI_USER_H */ From 9b44703d0a21980441cb120ffe4c6880dd453191 Mon Sep 17 00:00:00 2001 From: Yixian Liu Date: Fri, 9 Mar 2018 18:36:30 +0800 Subject: [PATCH 065/199] RDMA/hns: Support cq record doorbell for the user space This patch updates to support cq record doorbell for the user space. Signed-off-by: Yixian Liu Signed-off-by: Lijun Ou Signed-off-by: Wei Hu (Xavier) Signed-off-by: Shaobo Xu Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hns/hns_roce_cq.c | 42 ++++++++++++++++++--- drivers/infiniband/hw/hns/hns_roce_device.h | 6 +++ drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 10 +++++ drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 3 ++ include/uapi/rdma/hns-abi.h | 7 ++++ 5 files changed, 62 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index bccc9b54c9ce..8226f19fcdd6 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -315,6 +315,7 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev, struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev); struct device *dev = hr_dev->dev; struct hns_roce_ib_create_cq ucmd; + struct hns_roce_ib_create_cq_resp resp; struct hns_roce_cq *hr_cq = NULL; struct hns_roce_uar *uar = NULL; int vector = attr->comp_vector; @@ -378,6 +379,16 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev, goto err_mtt; } + if (context && (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) && + (udata->outlen == sizeof(resp))) { + ret = hns_roce_db_map_user(to_hr_ucontext(context), + ucmd.db_addr, &hr_cq->db); + if (ret) { + dev_err(dev, "cq record doorbell map failed!\n"); + goto err_cqc; + } + } + /* * For the QP created by kernel space, tptr value should be initialized * to zero; For the QP created by user space, it will cause synchronous @@ -393,14 +404,27 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev, hr_cq->cq_depth = cq_entries; if (context) { - if (ib_copy_to_udata(udata, &hr_cq->cqn, sizeof(u64))) { - ret = -EFAULT; - goto err_cqc; - } + if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) && + (udata->outlen == sizeof(resp))) { + hr_cq->db_en = 1; + resp.cqn = hr_cq->cqn; + resp.cap_flags |= HNS_ROCE_SUPPORT_CQ_RECORD_DB; + ret = ib_copy_to_udata(udata, &resp, sizeof(resp)); + } else + ret = ib_copy_to_udata(udata, &hr_cq->cqn, sizeof(u64)); + + if (ret) + goto err_dbmap; } return &hr_cq->ib_cq; +err_dbmap: + if (context && (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) && + (udata->outlen == sizeof(resp))) + hns_roce_db_unmap_user(to_hr_ucontext(context), + &hr_cq->db); + err_cqc: hns_roce_free_cq(hr_dev, hr_cq); @@ -430,12 +454,18 @@ int hns_roce_ib_destroy_cq(struct ib_cq *ib_cq) hns_roce_free_cq(hr_dev, hr_cq); hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt); - if (ib_cq->uobject) + if 
(ib_cq->uobject) { ib_umem_release(hr_cq->umem); - else + + if (hr_cq->db_en == 1) + hns_roce_db_unmap_user( + to_hr_ucontext(ib_cq->uobject->context), + &hr_cq->db); + } else { /* Free the buff of stored cq */ hns_roce_ib_free_cq_buf(hr_dev, &hr_cq->hr_buf, ib_cq->cqe); + } kfree(hr_cq); } diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index aa5cc78244ba..aacbf18849fc 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -109,6 +109,10 @@ enum { HNS_ROCE_SUPPORT_RQ_RECORD_DB = 1 << 0, }; +enum { + HNS_ROCE_SUPPORT_CQ_RECORD_DB = 1 << 0, +}; + enum hns_roce_qp_state { HNS_ROCE_QP_STATE_RST, HNS_ROCE_QP_STATE_INIT, @@ -381,6 +385,8 @@ struct hns_roce_cq_buf { struct hns_roce_cq { struct ib_cq ib_cq; struct hns_roce_cq_buf hr_buf; + struct hns_roce_db db; + u8 db_en; spinlock_t lock; struct ib_umem *umem; void (*comp)(struct hns_roce_cq *cq); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 21575912f739..bc0a2b7afea9 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1638,6 +1638,16 @@ static void hns_roce_v2_write_cqc(struct hns_roce_dev *hr_dev, roce_set_field(cq_context->byte_40_cqe_ba, V2_CQC_BYTE_40_CQE_BA_M, V2_CQC_BYTE_40_CQE_BA_S, (dma_handle >> (32 + 3))); + if (hr_cq->db_en) + roce_set_bit(cq_context->byte_44_db_record, + V2_CQC_BYTE_44_DB_RECORD_EN_S, 1); + + roce_set_field(cq_context->byte_44_db_record, + V2_CQC_BYTE_44_DB_RECORD_ADDR_M, + V2_CQC_BYTE_44_DB_RECORD_ADDR_S, + ((u32)hr_cq->db.dma) >> 1); + cq_context->db_record_addr = hr_cq->db.dma >> 32; + roce_set_field(cq_context->byte_56_cqe_period_maxcnt, V2_CQC_BYTE_56_CQ_MAX_CNT_M, V2_CQC_BYTE_56_CQ_MAX_CNT_S, diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 2bf8a47e3de3..182b6726f783 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -299,6 +299,9 @@ struct hns_roce_v2_cq_context { #define V2_CQC_BYTE_44_DB_RECORD_EN_S 0 +#define V2_CQC_BYTE_44_DB_RECORD_ADDR_S 1 +#define V2_CQC_BYTE_44_DB_RECORD_ADDR_M GENMASK(31, 1) + #define V2_CQC_BYTE_52_CQE_CNT_S 0 #define V2_CQC_BYTE_52_CQE_CNT_M GENMASK(23, 0) diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h index 6150c1941eca..38e8f192bf72 100644 --- a/include/uapi/rdma/hns-abi.h +++ b/include/uapi/rdma/hns-abi.h @@ -38,6 +38,13 @@ struct hns_roce_ib_create_cq { __u64 buf_addr; + __u64 db_addr; +}; + +struct hns_roce_ib_create_cq_resp { + __u32 cqn; + __u32 reserved; + __u64 cap_flags; }; struct hns_roce_ib_create_qp { From 472bc0fbd47cb89f72607328b6b09b4a962ec200 Mon Sep 17 00:00:00 2001 From: Yixian Liu Date: Fri, 9 Mar 2018 18:36:31 +0800 Subject: [PATCH 066/199] RDMA/hns: Support rq record doorbell for kernel space This patch updates to support rq record doorbell for the kernel space. 
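The kernel-space path cannot pin a user page, so the hunks below add a small allocator that hands out 4-byte records from DMA-coherent pages, managed as a two-level buddy system (order 1 covers an aligned pair of records, order 0 a single record). A worked example of the bitmap manipulation, written against hns_roce_alloc_db_from_pgdir() and hns_roce_free_db() as added below:

    /*
     * A fresh pgdir starts with every order-1 bit set and order-0 empty.
     *
     * Allocating at order 0 scans order 0 first, then splits a pair:
     *   find_first_bit() hits free order-1 bit i -> clear_bit(i, order1);
     *   i <<= 1;                  record i is handed out, and
     *   set_bit(i ^ 1, order0);   its buddy i + 1 stays free.
     *
     * Freeing record i at order 0 re-coalesces when the buddy is free:
     *   if (test_bit(i ^ 1, order0)) { clear_bit(i ^ 1, order0); o = 1; }
     *   set_bit(i >> o, bits[o]);
     * and once the order-1 bitmap is full again, the whole DMA page is
     * returned with dma_free_coherent().
     */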
Signed-off-by: Yixian Liu Signed-off-by: Lijun Ou Signed-off-by: Wei Hu (Xavier) Signed-off-by: Shaobo Xu Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hns/hns_roce_db.c | 112 ++++++++++++++++++++ drivers/infiniband/hw/hns/hns_roce_device.h | 6 ++ drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 15 +-- drivers/infiniband/hw/hns/hns_roce_main.c | 5 + drivers/infiniband/hw/hns/hns_roce_qp.c | 17 ++- 5 files changed, 142 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_db.c b/drivers/infiniband/hw/hns/hns_roce_db.c index 987f2811d2c4..ebee2782a573 100644 --- a/drivers/infiniband/hw/hns/hns_roce_db.c +++ b/drivers/infiniband/hw/hns/hns_roce_db.c @@ -66,3 +66,115 @@ void hns_roce_db_unmap_user(struct hns_roce_ucontext *context, mutex_unlock(&context->page_mutex); } EXPORT_SYMBOL(hns_roce_db_unmap_user); + +static struct hns_roce_db_pgdir *hns_roce_alloc_db_pgdir( + struct device *dma_device) +{ + struct hns_roce_db_pgdir *pgdir; + + pgdir = kzalloc(sizeof(*pgdir), GFP_KERNEL); + if (!pgdir) + return NULL; + + bitmap_fill(pgdir->order1, HNS_ROCE_DB_PER_PAGE / 2); + pgdir->bits[0] = pgdir->order0; + pgdir->bits[1] = pgdir->order1; + pgdir->page = dma_alloc_coherent(dma_device, PAGE_SIZE, + &pgdir->db_dma, GFP_KERNEL); + if (!pgdir->page) { + kfree(pgdir); + return NULL; + } + + return pgdir; +} + +static int hns_roce_alloc_db_from_pgdir(struct hns_roce_db_pgdir *pgdir, + struct hns_roce_db *db, int order) +{ + int o; + int i; + + for (o = order; o <= 1; ++o) { + i = find_first_bit(pgdir->bits[o], HNS_ROCE_DB_PER_PAGE >> o); + if (i < HNS_ROCE_DB_PER_PAGE >> o) + goto found; + } + + return -ENOMEM; + +found: + clear_bit(i, pgdir->bits[o]); + + i <<= o; + + if (o > order) + set_bit(i ^ 1, pgdir->bits[order]); + + db->u.pgdir = pgdir; + db->index = i; + db->db_record = pgdir->page + db->index; + db->dma = pgdir->db_dma + db->index * 4; + db->order = order; + + return 0; +} + +int hns_roce_alloc_db(struct hns_roce_dev *hr_dev, struct hns_roce_db *db, + int order) +{ + struct hns_roce_db_pgdir *pgdir; + int ret = 0; + + mutex_lock(&hr_dev->pgdir_mutex); + + list_for_each_entry(pgdir, &hr_dev->pgdir_list, list) + if (!hns_roce_alloc_db_from_pgdir(pgdir, db, order)) + goto out; + + pgdir = hns_roce_alloc_db_pgdir(hr_dev->dev); + if (!pgdir) { + ret = -ENOMEM; + goto out; + } + + list_add(&pgdir->list, &hr_dev->pgdir_list); + + /* This should never fail -- we just allocated an empty page: */ + WARN_ON(hns_roce_alloc_db_from_pgdir(pgdir, db, order)); + +out: + mutex_unlock(&hr_dev->pgdir_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(hns_roce_alloc_db); + +void hns_roce_free_db(struct hns_roce_dev *hr_dev, struct hns_roce_db *db) +{ + int o; + int i; + + mutex_lock(&hr_dev->pgdir_mutex); + + o = db->order; + i = db->index; + + if (db->order == 0 && test_bit(i ^ 1, db->u.pgdir->order0)) { + clear_bit(i ^ 1, db->u.pgdir->order0); + ++o; + } + + i >>= o; + set_bit(i, db->u.pgdir->bits[o]); + + if (bitmap_full(db->u.pgdir->order1, HNS_ROCE_DB_PER_PAGE / 2)) { + dma_free_coherent(hr_dev->dev, PAGE_SIZE, db->u.pgdir->page, + db->u.pgdir->db_dma); + list_del(&db->u.pgdir->list); + kfree(db->u.pgdir); + } + + mutex_unlock(&hr_dev->pgdir_mutex); +} +EXPORT_SYMBOL_GPL(hns_roce_free_db); diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index aacbf18849fc..56e73516d802 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -771,6 +771,8 @@ struct hns_roce_dev { spinlock_t 
bt_cmd_lock; struct hns_roce_ib_iboe iboe; + struct list_head pgdir_list; + struct mutex pgdir_mutex; int irq[HNS_ROCE_MAX_IRQ_NUM]; u8 __iomem *reg_base; struct hns_roce_caps caps; @@ -980,6 +982,10 @@ int hns_roce_db_map_user(struct hns_roce_ucontext *context, unsigned long virt, struct hns_roce_db *db); void hns_roce_db_unmap_user(struct hns_roce_ucontext *context, struct hns_roce_db *db); +int hns_roce_alloc_db(struct hns_roce_dev *hr_dev, struct hns_roce_db *db, + int order); +void hns_roce_free_db(struct hns_roce_dev *hr_dev, struct hns_roce_db *db); + void hns_roce_cq_completion(struct hns_roce_dev *hr_dev, u32 cqn); void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type); void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index bc0a2b7afea9..ca978520462a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -498,7 +498,6 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, struct hns_roce_v2_wqe_data_seg *dseg; struct hns_roce_rinl_sge *sge_list; struct device *dev = hr_dev->dev; - struct hns_roce_v2_db rq_db; unsigned long flags; void *wqe = NULL; int ret = 0; @@ -564,17 +563,7 @@ out: /* Memory barrier */ wmb(); - rq_db.byte_4 = 0; - rq_db.parameter = 0; - - roce_set_field(rq_db.byte_4, V2_DB_BYTE_4_TAG_M, - V2_DB_BYTE_4_TAG_S, hr_qp->qpn); - roce_set_field(rq_db.byte_4, V2_DB_BYTE_4_CMD_M, - V2_DB_BYTE_4_CMD_S, HNS_ROCE_V2_RQ_DB); - roce_set_field(rq_db.parameter, V2_DB_PARAMETER_CONS_IDX_M, - V2_DB_PARAMETER_CONS_IDX_S, hr_qp->rq.head); - - hns_roce_write64_k((__le32 *)&rq_db, hr_qp->rq.db_reg_l); + *hr_qp->rdb.db_record = hr_qp->rq.head & 0xffff; } spin_unlock_irqrestore(&hr_qp->rq.lock, flags); @@ -3476,6 +3465,8 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev, kfree(hr_qp->sq.wrid); kfree(hr_qp->rq.wrid); hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf); + if (hr_qp->rq.wqe_cnt) + hns_roce_free_db(hr_dev, &hr_qp->rdb); } if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) { diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index d6c9c578dba1..e1ee6666f790 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -665,6 +665,11 @@ static int hns_roce_setup_hca(struct hns_roce_dev *hr_dev) spin_lock_init(&hr_dev->sm_lock); spin_lock_init(&hr_dev->bt_cmd_lock); + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) { + INIT_LIST_HEAD(&hr_dev->pgdir_list); + mutex_init(&hr_dev->pgdir_mutex); + } + ret = hns_roce_init_uar_table(hr_dev); if (ret) { dev_err(dev, "Failed to initialize uar table. 
aborting\n"); diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 92597e280a63..f0ad455ad62b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -652,6 +652,16 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, hr_qp->rq.db_reg_l = hr_dev->reg_base + hr_dev->odb_offset + DB_REG_OFFSET * hr_dev->priv_uar.index; + if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) && + hns_roce_qp_has_rq(init_attr)) { + ret = hns_roce_alloc_db(hr_dev, &hr_qp->rdb, 0); + if (ret) { + dev_err(dev, "rq record doorbell alloc failed!\n"); + goto err_rq_sge_list; + } + *hr_qp->rdb.db_record = 0; + } + /* Allocate QP buf */ page_shift = PAGE_SHIFT + hr_dev->caps.mtt_buf_pg_sz; if (hns_roce_buf_alloc(hr_dev, hr_qp->buff_size, @@ -659,7 +669,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, &hr_qp->hr_buf, page_shift)) { dev_err(dev, "hns_roce_buf_alloc error!\n"); ret = -ENOMEM; - goto err_rq_sge_list; + goto err_db; } hr_qp->mtt.mtt_type = MTT_TYPE_WQE; @@ -768,6 +778,11 @@ err_buf: else hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf); +err_db: + if (!ib_pd->uobject && hns_roce_qp_has_rq(init_attr) && + (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB)) + hns_roce_free_db(hr_dev, &hr_qp->rdb); + err_rq_sge_list: if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) kfree(hr_qp->rq_inl_buf.wqe_list[0].sg_list); From 86188a8810ed0b73ce6653daa3c4bc29e8a79a7c Mon Sep 17 00:00:00 2001 From: Yixian Liu Date: Fri, 9 Mar 2018 18:36:32 +0800 Subject: [PATCH 067/199] RDMA/hns: Support cq record doorbell for kernel space This patch updates to support cq record doorbell for the kernel space. Signed-off-by: Yixian Liu Signed-off-by: Lijun Ou Signed-off-by: Wei Hu (Xavier) Signed-off-by: Shaobo Xu Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hns/hns_roce_cq.c | 17 ++++++++++++++++- drivers/infiniband/hw/hns/hns_roce_device.h | 1 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 19 +------------------ 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 8226f19fcdd6..462b644bbbd7 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -358,12 +358,21 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev, /* Get user space parameters */ uar = &to_hr_ucontext(context)->uar; } else { + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) { + ret = hns_roce_alloc_db(hr_dev, &hr_cq->db, 1); + if (ret) + goto err_cq; + + hr_cq->set_ci_db = hr_cq->db.db_record; + *hr_cq->set_ci_db = 0; + } + /* Init mmt table and write buff address to mtt table */ ret = hns_roce_ib_alloc_cq_buf(hr_dev, &hr_cq->hr_buf, cq_entries); if (ret) { dev_err(dev, "Failed to alloc_cq_buf.\n"); - goto err_cq; + goto err_db; } uar = &hr_dev->priv_uar; @@ -436,6 +445,10 @@ err_mtt: hns_roce_ib_free_cq_buf(hr_dev, &hr_cq->hr_buf, hr_cq->ib_cq.cqe); +err_db: + if (!context && (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB)) + hns_roce_free_db(hr_dev, &hr_cq->db); + err_cq: kfree(hr_cq); return ERR_PTR(ret); @@ -465,6 +478,8 @@ int hns_roce_ib_destroy_cq(struct ib_cq *ib_cq) /* Free the buff of stored cq */ hns_roce_ib_free_cq_buf(hr_dev, &hr_cq->hr_buf, ib_cq->cqe); + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) + hns_roce_free_db(hr_dev, &hr_cq->db); } kfree(hr_cq); diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h 
b/drivers/infiniband/hw/hns/hns_roce_device.h index 56e73516d802..fb305b7f99a8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -395,6 +395,7 @@ struct hns_roce_cq { struct hns_roce_uar *uar; u32 cq_depth; u32 cons_index; + u32 *set_ci_db; void __iomem *cq_db_l; u16 *tptr_addr; int arm_sn; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index ca978520462a..684c2d1a0ed0 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1497,24 +1497,7 @@ static struct hns_roce_v2_cqe *next_cqe_sw_v2(struct hns_roce_cq *hr_cq) static void hns_roce_v2_cq_set_ci(struct hns_roce_cq *hr_cq, u32 cons_index) { - struct hns_roce_v2_cq_db cq_db; - - cq_db.byte_4 = 0; - cq_db.parameter = 0; - - roce_set_field(cq_db.byte_4, V2_CQ_DB_BYTE_4_TAG_M, - V2_CQ_DB_BYTE_4_TAG_S, hr_cq->cqn); - roce_set_field(cq_db.byte_4, V2_CQ_DB_BYTE_4_CMD_M, - V2_CQ_DB_BYTE_4_CMD_S, HNS_ROCE_V2_CQ_DB_PTR); - - roce_set_field(cq_db.parameter, V2_CQ_DB_PARAMETER_CONS_IDX_M, - V2_CQ_DB_PARAMETER_CONS_IDX_S, - cons_index & ((hr_cq->cq_depth << 1) - 1)); - roce_set_field(cq_db.parameter, V2_CQ_DB_PARAMETER_CMD_SN_M, - V2_CQ_DB_PARAMETER_CMD_SN_S, 1); - - hns_roce_write64_k((__be32 *)&cq_db, hr_cq->cq_db_l); - + *hr_cq->set_ci_db = cons_index & 0xffffff; } static void __hns_roce_v2_cq_clean(struct hns_roce_cq *hr_cq, u32 qpn, From 8a18e911d0da6192735e15bf2aebcc99655cf8c6 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Sun, 11 Mar 2018 23:30:28 -0400 Subject: [PATCH 068/199] IB: remove duplicate header files In hfi.h, the header file opa_addr.h is included twice. In vt.h, the header file mmap.h is included twice. Signed-off-by: Zhu Yanjun Acked-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/hfi.h | 1 - drivers/infiniband/sw/rdmavt/vt.h | 1 - 2 files changed, 2 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 90bc8c76d2ca..32c48265405e 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -70,7 +70,6 @@ #include #include #include -#include #include "chip_registers.h" #include "common.h" diff --git a/drivers/infiniband/sw/rdmavt/vt.h b/drivers/infiniband/sw/rdmavt/vt.h index 8823b2e7aac6..0675ea6c3872 100644 --- a/drivers/infiniband/sw/rdmavt/vt.h +++ b/drivers/infiniband/sw/rdmavt/vt.h @@ -59,7 +59,6 @@ #include "mmap.h" #include "cq.h" #include "mad.h" -#include "mmap.h" #define rvt_pr_info(rdi, fmt, ...) \ __rvt_pr_info(rdi->driver_f.get_pci_dev(rdi), \ From 75a4598209cbe45540baa316c3b51d9db222e96e Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 11 Mar 2018 13:51:32 +0200 Subject: [PATCH 069/199] RDMA/mlx5: Fix NULL dereference while accessing XRC_TGT QPs mlx5 modify_qp() relies on the FW to throw an error when a wrong state is supplied. The missing check in the FW causes the following crash when using XRC_TGT QPs.
[ 14.769632] BUG: unable to handle kernel NULL pointer dereference at (null) [ 14.771085] IP: mlx5_ib_modify_qp+0xf60/0x13f0 [ 14.771894] PGD 800000001472e067 P4D 800000001472e067 PUD 14529067 PMD 0 [ 14.773126] Oops: 0002 [#1] SMP PTI [ 14.773763] CPU: 0 PID: 365 Comm: ubsan Not tainted 4.16.0-rc1-00038-g8151138c0793 #119 [ 14.775192] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014 [ 14.777522] RIP: 0010:mlx5_ib_modify_qp+0xf60/0x13f0 [ 14.778417] RSP: 0018:ffffbf48001c7bd8 EFLAGS: 00010246 [ 14.779346] RAX: 0000000000000000 RBX: ffff9a8f9447d400 RCX: 0000000000000000 [ 14.780643] RDX: 0000000000000000 RSI: 000000000000000a RDI: 0000000000000000 [ 14.781930] RBP: 0000000000000000 R08: 00000000000217b0 R09: ffffffffbc9c1504 [ 14.783214] R10: fffff4a180519480 R11: ffff9a8f94523600 R12: ffff9a8f9493e240 [ 14.784507] R13: ffff9a8f9447d738 R14: 000000000000050a R15: 0000000000000000 [ 14.785800] FS: 00007f545b466700(0000) GS:ffff9a8f9fc00000(0000) knlGS:0000000000000000 [ 14.787073] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 14.787792] CR2: 0000000000000000 CR3: 00000000144be000 CR4: 00000000000006b0 [ 14.788689] Call Trace: [ 14.789007] _ib_modify_qp+0x71/0x120 [ 14.789475] modify_qp.isra.20+0x207/0x2f0 [ 14.790010] ib_uverbs_modify_qp+0x90/0xe0 [ 14.790532] ib_uverbs_write+0x1d2/0x3c0 [ 14.791049] ? __handle_mm_fault+0x93c/0xe40 [ 14.791644] __vfs_write+0x36/0x180 [ 14.792096] ? handle_mm_fault+0xc1/0x210 [ 14.792601] vfs_write+0xad/0x1e0 [ 14.793018] SyS_write+0x52/0xc0 [ 14.793422] do_syscall_64+0x75/0x180 [ 14.793888] entry_SYSCALL_64_after_hwframe+0x21/0x86 [ 14.794527] RIP: 0033:0x7f545ad76099 [ 14.794975] RSP: 002b:00007ffd78787468 EFLAGS: 00000287 ORIG_RAX: 0000000000000001 [ 14.795958] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f545ad76099 [ 14.797075] RDX: 0000000000000078 RSI: 0000000020009000 RDI: 0000000000000003 [ 14.798140] RBP: 00007ffd78787470 R08: 00007ffd78787480 R09: 00007ffd78787480 [ 14.799207] R10: 00007ffd78787480 R11: 0000000000000287 R12: 00005599ada98760 [ 14.800277] R13: 00007ffd78787560 R14: 0000000000000000 R15: 0000000000000000 [ 14.801341] Code: 4c 8b 1c 24 48 8b 83 70 02 00 00 48 c7 83 cc 02 00 00 00 00 00 00 48 c7 83 24 03 00 00 00 00 00 00 c7 83 2c 03 00 00 00 00 00 00 00 00 00 00 00 48 8b 83 70 02 00 00 c7 40 04 00 00 00 00 4c [ 14.804012] RIP: mlx5_ib_modify_qp+0xf60/0x13f0 RSP: ffffbf48001c7bd8 [ 14.804838] CR2: 0000000000000000 [ 14.805288] ---[ end trace 3f1da0df5c8b7c37 ]--- Cc: syzkaller Reported-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/qp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 3e04d8bd6b3c..b8cbf00e3ef1 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3156,7 +3156,8 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, * If we moved a kernel QP to RESET, clean up all old CQ * entries and reinitialize the QP. */ - if (new_state == IB_QPS_RESET && !ibqp->uobject) { + if (new_state == IB_QPS_RESET && + !ibqp->uobject && ibqp->qp_type != IB_QPT_XRC_TGT) { mlx5_ib_cq_clean(recv_cq, base->mqp.qpn, ibqp->srq ? 
to_msrq(ibqp->srq) : NULL); if (send_cq != recv_cq) From 88de869bbe4fb669ae92f19225a90b07d8173ccb Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 11 Mar 2018 13:51:33 +0200 Subject: [PATCH 070/199] RDMA/uverbs: Ensure validity of current QP state value The QP state is an internal enum that is checked at the driver level by calling ib_modify_qp_is_ok(). Move this check closer to the user and leave kernel users to be checked by the compiler. Signed-off-by: Leon Romanovsky Reviewed-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_cmd.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 9f9fc14523db..33c7f1290adb 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1991,6 +1991,13 @@ static int modify_qp(struct ib_uverbs_file *file, goto release_qp; } + if ((cmd->base.attr_mask & IB_QP_CUR_STATE && + cmd->base.cur_qp_state > IB_QPS_ERR) || + cmd->base.qp_state > IB_QPS_ERR) { + ret = -EINVAL; + goto release_qp; + } + attr->qp_state = cmd->base.qp_state; attr->cur_qp_state = cmd->base.cur_qp_state; attr->path_mtu = cmd->base.path_mtu; From fbf1795c969ae81a0c292ca0ee0baa944da4ede3 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 11 Mar 2018 13:51:34 +0200 Subject: [PATCH 071/199] RDMA/pvrdma: Properly annotate QP states QP states provided by the core layer are converted to enum ib_qp_state, so it is better to store them in a variable of that type instead of an int. Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c index 7bf518bdbf21..eb5b1065ec08 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c @@ -489,7 +489,7 @@ int pvrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, union pvrdma_cmd_req req; union pvrdma_cmd_resp rsp; struct pvrdma_cmd_modify_qp *cmd = &req.modify_qp; - int cur_state, next_state; + enum ib_qp_state cur_state, next_state; int ret; /* Sanity checking. Should need lock here */ From 19b1f54099b6ee334acbfbcfbdffd1d1f057216d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 11 Mar 2018 13:51:35 +0200 Subject: [PATCH 072/199] RDMA/verbs: Simplify modify QP check All callers to ib_modify_qp_is_ok() provide enum ib_qp_state values, which makes the out-of-range state checks redundant. Let's remove them and update the function signature to return a boolean result.
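With the boolean signature, a call site reads as a plain predicate. A sketch of typical driver-side use (cur_state, new_state, attr_mask and ll are computed by the caller exactly as before):

    if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
                            attr_mask, ll)) {
            ret = -EINVAL;
            goto out;
    }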
Signed-off-by: Leon Romanovsky Reviewed-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/core/verbs.c | 20 ++++++++------------ include/rdma/ib_verbs.h | 6 +++--- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 873b7aa9e8dd..f7de886da430 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1263,34 +1263,30 @@ static const struct { } }; -int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, - enum ib_qp_type type, enum ib_qp_attr_mask mask, - enum rdma_link_layer ll) +bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, + enum ib_qp_type type, enum ib_qp_attr_mask mask, + enum rdma_link_layer ll) { enum ib_qp_attr_mask req_param, opt_param; - if (cur_state < 0 || cur_state > IB_QPS_ERR || - next_state < 0 || next_state > IB_QPS_ERR) - return 0; - if (mask & IB_QP_CUR_STATE && cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS && cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE) - return 0; + return false; if (!qp_state_table[cur_state][next_state].valid) - return 0; + return false; req_param = qp_state_table[cur_state][next_state].req_param[type]; opt_param = qp_state_table[cur_state][next_state].opt_param[type]; if ((mask & req_param) != req_param) - return 0; + return false; if (mask & ~(req_param | opt_param | IB_QP_STATE)) - return 0; + return false; - return 1; + return true; } EXPORT_SYMBOL(ib_modify_qp_is_ok); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 7df3274818f9..5eb10c2470f0 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2480,9 +2480,9 @@ static inline bool ib_is_udata_cleared(struct ib_udata *udata, * transition from cur_state to next_state is allowed by the IB spec, * and that the attribute mask supplied is allowed for the transition. */ -int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, - enum ib_qp_type type, enum ib_qp_attr_mask mask, - enum rdma_link_layer ll); +bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, + enum ib_qp_type type, enum ib_qp_attr_mask mask, + enum rdma_link_layer ll); void ib_register_event_handler(struct ib_event_handler *event_handler); void ib_unregister_event_handler(struct ib_event_handler *event_handler); From c62adb7def71d7e0b4ba44f8da81a448eb53d6d5 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 12 Mar 2018 13:55:55 -0700 Subject: [PATCH 073/199] IB/srp: Fix IPv6 address parsing Split IPv6 addresses at the colon that separates the IPv6 address and the port number instead of at a colon in the middle of the IPv6 address. Check whether the IPv6 address is surrounded with square brackets. Fixes: 19f313438c77 ("IB/srp: Add RDMA/CM support") Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srp/ib_srp.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 9a5ea6251450..4c52ca922f0b 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -3414,18 +3414,37 @@ static const match_table_t srp_opt_tokens = { { SRP_OPT_ERR, NULL } }; +/** + * srp_parse_in - parse an IP address and port number combination + * + * Parse the following address formats: + * - IPv4: :, e.g. 1.2.3.4:5. + * - IPv6: \[\]:, e.g. [1::2:3%4]:5. 
+ */ static int srp_parse_in(struct net *net, struct sockaddr_storage *sa, const char *addr_port_str) { - char *addr = kstrdup(addr_port_str, GFP_KERNEL); - char *port_str = addr; + char *addr_end, *addr = kstrdup(addr_port_str, GFP_KERNEL); + char *port_str; int ret; if (!addr) return -ENOMEM; - strsep(&port_str, ":"); - ret = inet_pton_with_scope(net, AF_UNSPEC, addr, port_str, sa); + port_str = strrchr(addr, ':'); + if (!port_str) + return -EINVAL; + *port_str++ = '\0'; + ret = inet_pton_with_scope(net, AF_INET, addr, port_str, sa); + if (ret && addr[0]) { + addr_end = addr + strlen(addr) - 1; + if (addr[0] == '[' && *addr_end == ']') { + *addr_end = '\0'; + ret = inet_pton_with_scope(net, AF_INET6, addr + 1, + port_str, sa); + } + } kfree(addr); + pr_debug("%s -> %pISpfsc\n", addr_port_str, sa); return ret; } From 72f7cc09b143cf972c8c7571fc95d1017ba76c3d Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 13 Mar 2018 15:18:46 +0200 Subject: [PATCH 074/199] IB/mlx5: Expose more priorities for bypass namespace BYPASS namespace is used by the RDMA side to insert flow rules into the vport RX flow tables. Currently only 8 priorities are exposed; increase this to 16 to allow more flexibility. This change will also cause the BYPASS namespace to use 32 levels (as opposed to 16 today) of flow tables, 16 levels for regular rules and 16 for don't trap rules. Reviewed-by: Maor Gottlieb Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/linux/mlx5/device.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index e5258ee4e38b..413df3c11a46 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1204,8 +1204,8 @@ static inline u16 mlx5_to_sw_pkey_sz(int pkey_sz) return MLX5_MIN_PKEY_TABLE_SIZE << pkey_sz; } -#define MLX5_BY_PASS_NUM_REGULAR_PRIOS 8 -#define MLX5_BY_PASS_NUM_DONT_TRAP_PRIOS 8 +#define MLX5_BY_PASS_NUM_REGULAR_PRIOS 16 +#define MLX5_BY_PASS_NUM_DONT_TRAP_PRIOS 16 #define MLX5_BY_PASS_NUM_MULTICAST_PRIOS 1 #define MLX5_BY_PASS_NUM_PRIOS (MLX5_BY_PASS_NUM_REGULAR_PRIOS +\ MLX5_BY_PASS_NUM_DONT_TRAP_PRIOS +\ From 65edd0e758b8a215825fb3ee685c8eaf6a9cc0d0 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Tue, 13 Mar 2018 15:18:47 +0200 Subject: [PATCH 075/199] IB/mlx5: Only synchronize RCU once when removing mkeys Instead of synchronizing RCU in a loop while removing mkeys in a batch, do it once at the end, before freeing them. The result is waiting for only one RCU grace period instead of many serially.
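A single grace period is enough because every SRCU reader that could still observe the unlinked mkeys started before the last unlink; one synchronize_srcu() after the whole batch therefore covers all of them. The shape of the change in outline (a sketch of the pattern, condensed from the diff that follows):

    LIST_HEAD(del_list);

    spin_lock_irq(&ent->lock);          /* per victim: unlink under the lock */
    list_move(&mr->list, &del_list);
    spin_unlock_irq(&ent->lock);
    mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);

    synchronize_srcu(&dev->mr_srcu);    /* once, for the whole batch */

    list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
            list_del(&mr->list);
            kfree(mr);
    }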
Signed-off-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/mr.c | 44 +++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index a5fad3e87ff7..820f93439b0c 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -220,26 +220,32 @@ static void remove_keys(struct mlx5_ib_dev *dev, int c, int num) { struct mlx5_mr_cache *cache = &dev->cache; struct mlx5_cache_ent *ent = &cache->ent[c]; + struct mlx5_ib_mr *tmp_mr; struct mlx5_ib_mr *mr; - int err; + LIST_HEAD(del_list); int i; for (i = 0; i < num; i++) { spin_lock_irq(&ent->lock); if (list_empty(&ent->head)) { spin_unlock_irq(&ent->lock); - return; + break; } mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); - list_del(&mr->list); + list_move(&mr->list, &del_list); ent->cur--; ent->size--; spin_unlock_irq(&ent->lock); - err = destroy_mkey(dev, mr); - if (err) - mlx5_ib_warn(dev, "failed destroy mkey\n"); - else - kfree(mr); + mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey); + } + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + synchronize_srcu(&dev->mr_srcu); +#endif + + list_for_each_entry_safe(mr, tmp_mr, &del_list, list) { + list_del(&mr->list); + kfree(mr); } } @@ -562,26 +568,32 @@ static void clean_keys(struct mlx5_ib_dev *dev, int c) { struct mlx5_mr_cache *cache = &dev->cache; struct mlx5_cache_ent *ent = &cache->ent[c]; + struct mlx5_ib_mr *tmp_mr; struct mlx5_ib_mr *mr; - int err; + LIST_HEAD(del_list); cancel_delayed_work(&ent->dwork); while (1) { spin_lock_irq(&ent->lock); if (list_empty(&ent->head)) { spin_unlock_irq(&ent->lock); - return; + break; } mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); - list_del(&mr->list); + list_move(&mr->list, &del_list); ent->cur--; ent->size--; spin_unlock_irq(&ent->lock); - err = destroy_mkey(dev, mr); - if (err) - mlx5_ib_warn(dev, "failed destroy mkey\n"); - else - kfree(mr); + mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey); + } + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + synchronize_srcu(&dev->mr_srcu); +#endif + + list_for_each_entry_safe(mr, tmp_mr, &del_list, list) { + list_del(&mr->list); + kfree(mr); } } From c44ef998f25eaddcd78924f98e5baed602d933e6 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Tue, 13 Mar 2018 15:18:48 +0200 Subject: [PATCH 076/199] IB/mlx5: Maintain a single emergency page The mlx5 driver needs to be able to issue invalidation to ODP MRs even if it cannot allocate memory. To this end it preallocates emergency pages to use when the situation arises. This flow should be rare enough that we don't need to worry about contention, and therefore a single emergency page is good enough.
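The scheme reduces to a page allocated once at module init and handed out under a mutex. A hedged sketch with illustrative names (the real symbols live in main.c below):

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mutex.h>

static unsigned long emergency_page;
static DEFINE_MUTEX(emergency_page_mutex);

static int emergency_page_init(void)
{
	emergency_page = __get_free_page(GFP_KERNEL);
	return emergency_page ? 0 : -ENOMEM;
}

/* The mutex stays held for as long as the caller uses the page. */
static unsigned long emergency_page_get(void)
{
	mutex_lock(&emergency_page_mutex);
	return emergency_page;
}

static void emergency_page_put(void)
{
	mutex_unlock(&emergency_page_mutex);
}

Holding the mutex across use serializes all emergency invalidations, which is acceptable only because the path is expected to be rare.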
Signed-off-by: Ilya Lesokhin Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 44 +++++++++++++++++++--------- drivers/infiniband/hw/mlx5/mlx5_ib.h | 6 ++-- drivers/infiniband/hw/mlx5/mr.c | 11 ++++--- 3 files changed, 38 insertions(+), 23 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index d9474b95d8e5..bd565af30d66 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -92,6 +92,12 @@ static LIST_HEAD(mlx5_ib_dev_list); */ static DEFINE_MUTEX(mlx5_ib_multiport_mutex); +/* We can't use an array for xlt_emergency_page because dma_map_single + * doesn't work on kernel modules memory + */ +static unsigned long xlt_emergency_page; +static struct mutex xlt_emergency_page_mutex; + struct mlx5_ib_dev *mlx5_ib_get_ibdev_from_mpi(struct mlx5_ib_multiport_info *mpi) { struct mlx5_ib_dev *dev; @@ -1698,17 +1704,10 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range; #endif - context->upd_xlt_page = __get_free_page(GFP_KERNEL); - if (!context->upd_xlt_page) { - err = -ENOMEM; - goto out_uars; - } - mutex_init(&context->upd_xlt_page_mutex); - if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) { err = mlx5_ib_alloc_transport_domain(dev, &context->tdn); if (err) - goto out_page; + goto out_uars; } INIT_LIST_HEAD(&context->vma_private_list); @@ -1785,9 +1784,6 @@ out_td: if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) mlx5_ib_dealloc_transport_domain(dev, context->tdn); -out_page: - free_page(context->upd_xlt_page); - out_uars: deallocate_uars(dev, context); @@ -1813,7 +1809,6 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) mlx5_ib_dealloc_transport_domain(dev, context->tdn); - free_page(context->upd_xlt_page); deallocate_uars(dev, context); kfree(bfregi->sys_pages); kfree(bfregi->count); @@ -5292,14 +5287,33 @@ static struct mlx5_interface mlx5_ib_interface = { .protocol = MLX5_INTERFACE_PROTOCOL_IB, }; +unsigned long mlx5_ib_get_xlt_emergency_page(void) +{ + mutex_lock(&xlt_emergency_page_mutex); + return xlt_emergency_page; +} + +void mlx5_ib_put_xlt_emergency_page(void) +{ + mutex_unlock(&xlt_emergency_page_mutex); +} + static int __init mlx5_ib_init(void) { int err; - mlx5_ib_event_wq = alloc_ordered_workqueue("mlx5_ib_event_wq", 0); - if (!mlx5_ib_event_wq) + xlt_emergency_page = __get_free_page(GFP_KERNEL); + if (!xlt_emergency_page) return -ENOMEM; + mutex_init(&xlt_emergency_page_mutex); + + mlx5_ib_event_wq = alloc_ordered_workqueue("mlx5_ib_event_wq", 0); + if (!mlx5_ib_event_wq) { + free_page(xlt_emergency_page); + return -ENOMEM; + } + mlx5_ib_odp_init(); err = mlx5_register_interface(&mlx5_ib_interface); @@ -5311,6 +5325,8 @@ static void __exit mlx5_ib_cleanup(void) { mlx5_unregister_interface(&mlx5_ib_interface); destroy_workqueue(mlx5_ib_event_wq); + mutex_destroy(&xlt_emergency_page_mutex); + free_page(xlt_emergency_page); } module_init(mlx5_ib_init); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index e0bad28e0f09..d88c240c52ce 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -130,9 +130,6 @@ struct mlx5_ib_ucontext { /* protect vma_private_list add/del */ struct mutex vma_private_list_mutex; - unsigned long upd_xlt_page; - /* protect ODP/KSM */ - struct mutex upd_xlt_page_mutex; u64 
lib_caps; }; @@ -1220,4 +1217,7 @@ static inline int get_num_static_uars(struct mlx5_ib_dev *dev, return get_uars_per_sys_page(dev, bfregi->lib_uar_4k) * bfregi->num_static_sys_pages; } +unsigned long mlx5_ib_get_xlt_emergency_page(void); +void mlx5_ib_put_xlt_emergency_page(void); + #endif /* MLX5_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 820f93439b0c..87e8b3339ddd 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -985,7 +985,6 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, { struct mlx5_ib_dev *dev = mr->dev; struct device *ddev = dev->ib_dev.dev.parent; - struct mlx5_ib_ucontext *uctx = NULL; int size; void *xlt; dma_addr_t dma; @@ -1001,6 +1000,7 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, size_t pages_to_map = 0; size_t pages_iter = 0; gfp_t gfp; + bool use_emergency_page = false; /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes, * so we need to align the offset and length accordingly @@ -1027,12 +1027,11 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, } if (!xlt) { - uctx = to_mucontext(mr->ibmr.pd->uobject->context); mlx5_ib_warn(dev, "Using XLT emergency buffer\n"); + xlt = (void *)mlx5_ib_get_xlt_emergency_page(); size = PAGE_SIZE; - xlt = (void *)uctx->upd_xlt_page; - mutex_lock(&uctx->upd_xlt_page_mutex); memset(xlt, 0, size); + use_emergency_page = true; } pages_iter = size / desc_size; dma = dma_map_single(ddev, xlt, size, DMA_TO_DEVICE); @@ -1096,8 +1095,8 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); free_xlt: - if (uctx) - mutex_unlock(&uctx->upd_xlt_page_mutex); + if (use_emergency_page) + mlx5_ib_put_xlt_emergency_page(); else free_pages((unsigned long)xlt, get_order(size)); From baa00fcde447d420fb3eba434f5ee7e04df90234 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 13 Mar 2018 13:06:20 +0100 Subject: [PATCH 077/199] RDMA/i40iw: include linux/irq.h We get a build failure on ARM unless the header is included explicitly: drivers/infiniband/hw/i40iw/i40iw_verbs.c: In function 'i40iw_get_vector_affinity': drivers/infiniband/hw/i40iw/i40iw_verbs.c:2747:9: error: implicit declaration of function 'irq_get_affinity_mask'; did you mean 'irq_create_affinity_masks'? [-Werror=implicit-function-declaration] return irq_get_affinity_mask(msix_vec->irq); ^~~~~~~~~~~~~~~~~~~~~ irq_create_affinity_masks drivers/infiniband/hw/i40iw/i40iw_verbs.c:2747:9: error: returning 'int' from a function with return type 'const struct cpumask *' makes pointer from integer without a cast [-Werror=int-conversion] return irq_get_affinity_mask(msix_vec->irq); Fixes: 7e952b19eb63 ("i40iw: Implement get_vector_affinity API") Signed-off-by: Arnd Bergmann Signed-off-by: Doug Ledford --- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index a51798578f27..f3af952402e9 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include From 43c9fc509fa59d602f9c303d02b33db015022881 Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Wed, 14 Feb 2018 21:45:43 +0100 Subject: [PATCH 078/199] rdma_rxe: make rxe work over 802.1q VLAN devices This patch fixes RDMA/rxe over 802.1q VLAN devices. 
Without it, I observed the following behavior: a) adding a VLAN device to RXE via rxe_net_add() creates a non-functional RDMA device. This is caused by the logic in enum_all_gids_of_dev_cb() / is_eth_port_of_netdev(), which only considers networks connected to "upper devices" of the configured network device, resulting in an empty set of gids for a VLAN interface that is an "upper device" itself. Later attempts to connect via this rdma device fail in cma_acquire_dev() because no gids can be resolved. b) adding the master device of the VLAN device instead seems to work initially: target addresses via VLAN devices are resolved successfully. But the connection times out because no 802.1q VLAN headers are inserted in the ethernet packets, which are therefore never received. This happens because the RXE layer sends the packets via the master device rather than the VLAN device. The problem could be solved by changing either a) or b). My thinking was that the logic in a) was created deliberately, thus I decided to work on b). It turns out that the information about the VLAN interface for the gid at hand is available in the AV information. My patch converts the RXE code to use this netdev instead of rxe->ndev. With this change, RXE over vlan works on my test system. Signed-off-by: Martin Wilck Reviewed-by: Moni Shoua Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rxe/rxe_net.c | 54 ++++++++++++++++++++++++---- drivers/infiniband/sw/rxe/rxe_recv.c | 2 +- 2 files changed, 49 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index a7753dc3261e..9da6e37fb70c 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -182,11 +182,39 @@ static struct dst_entry *rxe_find_route6(struct net_device *ndev, #endif +/* + * Derive the net_device from the av. + * For physical devices, this will just return rxe->ndev. + * But for VLAN devices, it will return the vlan dev. + * Caller should dev_put() the returned net_device.
+ */ +static struct net_device *rxe_netdev_from_av(struct rxe_dev *rxe, + int port_num, + struct rxe_av *av) +{ + union ib_gid gid; + struct ib_gid_attr attr; + struct net_device *ndev = rxe->ndev; + + if (ib_get_cached_gid(&rxe->ib_dev, port_num, av->grh.sgid_index, + &gid, &attr) == 0 && + attr.ndev && attr.ndev != ndev) + ndev = attr.ndev; + else + /* Only to ensure that caller may call dev_put() */ + dev_hold(ndev); + + return ndev; +} + static struct dst_entry *rxe_find_route(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_av *av) { struct dst_entry *dst = NULL; + struct net_device *ndev; + + ndev = rxe_netdev_from_av(rxe, qp->attr.port_num, av); if (qp_type(qp) == IB_QPT_RC) dst = sk_dst_get(qp->sk->sk); @@ -201,14 +229,14 @@ static struct dst_entry *rxe_find_route(struct rxe_dev *rxe, saddr = &av->sgid_addr._sockaddr_in.sin_addr; daddr = &av->dgid_addr._sockaddr_in.sin_addr; - dst = rxe_find_route4(rxe->ndev, saddr, daddr); + dst = rxe_find_route4(ndev, saddr, daddr); } else if (av->network_type == RDMA_NETWORK_IPV6) { struct in6_addr *saddr6; struct in6_addr *daddr6; saddr6 = &av->sgid_addr._sockaddr_in6.sin6_addr; daddr6 = &av->dgid_addr._sockaddr_in6.sin6_addr; - dst = rxe_find_route6(rxe->ndev, saddr6, daddr6); + dst = rxe_find_route6(ndev, saddr6, daddr6); #if IS_ENABLED(CONFIG_IPV6) if (dst) qp->dst_cookie = @@ -217,6 +245,7 @@ static struct dst_entry *rxe_find_route(struct rxe_dev *rxe, } } + dev_put(ndev); return dst; } @@ -224,9 +253,14 @@ static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { struct udphdr *udph; struct net_device *ndev = skb->dev; + struct net_device *rdev = ndev; struct rxe_dev *rxe = net_to_rxe(ndev); struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); + if (!rxe && is_vlan_dev(rdev)) { + rdev = vlan_dev_real_dev(ndev); + rxe = net_to_rxe(rdev); + } if (!rxe) goto drop; @@ -498,6 +532,10 @@ struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av, { unsigned int hdr_len; struct sk_buff *skb; + struct net_device *ndev; + const int port_num = 1; + + ndev = rxe_netdev_from_av(rxe, port_num, av); if (av->network_type == RDMA_NETWORK_IPV4) hdr_len = ETH_HLEN + sizeof(struct udphdr) + @@ -506,26 +544,30 @@ struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av, hdr_len = ETH_HLEN + sizeof(struct udphdr) + sizeof(struct ipv6hdr); - skb = alloc_skb(paylen + hdr_len + LL_RESERVED_SPACE(rxe->ndev), + skb = alloc_skb(paylen + hdr_len + LL_RESERVED_SPACE(ndev), GFP_ATOMIC); - if (unlikely(!skb)) + + if (unlikely(!skb)) { + dev_put(ndev); return NULL; + } skb_reserve(skb, hdr_len + LL_RESERVED_SPACE(rxe->ndev)); - skb->dev = rxe->ndev; + skb->dev = ndev; if (av->network_type == RDMA_NETWORK_IPV4) skb->protocol = htons(ETH_P_IP); else skb->protocol = htons(ETH_P_IPV6); pkt->rxe = rxe; - pkt->port_num = 1; + pkt->port_num = port_num; pkt->hdr = skb_put(skb, paylen); pkt->mask |= RXE_GRH_MASK; memset(pkt->hdr, 0, paylen); + dev_put(ndev); return skb; } diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c index 4c3f899241d4..08ad9dc72205 100644 --- a/drivers/infiniband/sw/rxe/rxe_recv.c +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -347,7 +347,7 @@ static int rxe_match_dgid(struct rxe_dev *rxe, struct sk_buff *skb) return ib_find_cached_gid_by_port(&rxe->ib_dev, pdgid, IB_GID_TYPE_ROCE_UDP_ENCAP, - 1, rxe->ndev, NULL); + 1, skb->dev, NULL); } /* rxe_rcv is called from the interface driver */ From 6ee687735e745eafae9e6b93d1ea70bc52e7ad07 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 13 
Mar 2018 14:51:57 -0700 Subject: [PATCH 079/199] drivers/infiniband/core/verbs.c: fix build with gcc-4.4.4 gcc-4.4.4 has issues with initialization of anonymous unions. drivers/infiniband/core/verbs.c: In function '__ib_drain_sq': drivers/infiniband/core/verbs.c:2204: error: unknown field 'wr_cqe' specified in initializer drivers/infiniband/core/verbs.c:2204: warning: initialization makes integer from pointer without a cast Work around this. Fixes: a1ae7d0345edd5 ("RDMA/core: Avoid that ib_drain_qp() triggers an out-of-bounds stack access") Cc: Bart Van Assche Cc: Steve Wise Cc: Sagi Grimberg Cc: Jason Gunthorpe Cc: Signed-off-by: Andrew Morton Signed-off-by: Doug Ledford --- drivers/infiniband/core/verbs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index f7de886da430..95e3b307c93a 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -2196,8 +2196,9 @@ static void __ib_drain_sq(struct ib_qp *qp) struct ib_send_wr *bad_swr; struct ib_rdma_wr swr = { .wr = { + .next = NULL, + { .wr_cqe = &sdrain.cqe, }, .opcode = IB_WR_RDMA_WRITE, - .wr_cqe = &sdrain.cqe, }, }; int ret; From 06892cc190550807d332c95a0114c7e175584012 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 13 Mar 2018 15:06:45 -0700 Subject: [PATCH 080/199] drivers/infiniband/ulp/srpt/ib_srpt.c: fix build with gcc-4.4.4 gcc-4.4.4 has issues with initialization of anonymous unions: drivers/infiniband/ulp/srpt/ib_srpt.c: In function 'srpt_zerolength_write': drivers/infiniband/ulp/srpt/ib_srpt.c:854: error: unknown field 'wr_cqe' specified in initializer drivers/infiniband/ulp/srpt/ib_srpt.c:854: warning: initialization makes integer from pointer without a cast Work around this. Fixes: 2a78cb4db487 ("IB/srpt: Fix an out-of-bounds stack access in srpt_zerolength_write()") Cc: Bart Van Assche Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Signed-off-by: Andrew Morton Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 8956d4621273..dfec0e1fac29 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -850,8 +850,9 @@ static int srpt_zerolength_write(struct srpt_rdma_ch *ch) struct ib_send_wr *bad_wr; struct ib_rdma_wr wr = { .wr = { + .next = NULL, + { .wr_cqe = &ch->zw_cqe, }, .opcode = IB_WR_RDMA_WRITE, - .wr_cqe = &ch->zw_cqe, .send_flags = IB_SEND_SIGNALED, } }; From 4289861d88d6c7b5e4c8cc7fe2ad6cdf0cdfc366 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 13 Mar 2018 15:29:24 +0200 Subject: [PATCH 081/199] RDMA/mlx5: Protect from NULL pointer dereference The mlx5_ib_alloc_implicit_mr() can fail to acquire pages and the returned mr pointer won't be valid. Ensure that it is not an error prior to access.
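The fix is the standard kernel ERR_PTR idiom: allocation functions that return pointers encode errno values in the pointer itself, so callers must test with IS_ERR() before dereferencing, and may forward the error with ERR_CAST() when the caller's return type differs. A generic sketch (foo is an illustrative type, not an mlx5 one):

#include <linux/err.h>
#include <linux/slab.h>

struct foo {
	int x;
};

static struct foo *foo_alloc(void)
{
	struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

	if (!f)
		return ERR_PTR(-ENOMEM);	/* errno encoded in the pointer */
	return f;
}

static void *foo_get(void)
{
	struct foo *f = foo_alloc();

	/* Dereferencing f without this check is the bug being fixed. */
	if (IS_ERR(f))
		return ERR_CAST(f);		/* propagate under another pointer type */
	return &f->x;
}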
Cc: # 4.10 Fixes: 81713d3788d2 ("IB/mlx5: Add implicit MR support") Reported-by: Noa Osherovich Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/mr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 45d9044a1ba9..c7a8ece05bd2 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1232,6 +1232,8 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, return ERR_PTR(-EINVAL); mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags); + if (IS_ERR(mr)) + return ERR_CAST(mr); return &mr->ibmr; } #endif From ea30f013765865f50fbfb6460aea2b18df19c978 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 13 Mar 2018 15:29:25 +0200 Subject: [PATCH 082/199] RDMA/mlx5: Return proper value for not-supported command Return -EOPNOTSUPP value to the user for unsupported reg_user_mr. Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/mr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index c7a8ece05bd2..9a8018cd320a 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1220,7 +1220,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, bool use_umr = true; if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) - return ERR_PTR(-EINVAL); + return ERR_PTR(-EOPNOTSUPP); mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n", start, virt_addr, length, access_flags); From 4638a3b2428a66db6e2d934ff200e254763f78a7 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 13 Mar 2018 15:29:26 +0200 Subject: [PATCH 083/199] RDMA/mlx5: Unify error flows in rereg MR failure paths According to the IBTA spec 1.3, the driver failure in MR reregister shall release old and new MRs. C11-20: If the CI returns any other error, the CI shall invalidate both "old" and "new" registrations, and release any associated resources. 
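The conversion below funnels every failure path through one label so that both the old and the new resources are released in a single place. A minimal sketch of that shape (illustrative resource, not the MR fields):

#include <linux/slab.h>

struct rereg_ctx {
	void *res;
};

static int do_rereg(struct rereg_ctx *c)
{
	int err;

	c->res = kmalloc(64, GFP_KERNEL);
	if (!c->res) {
		err = -ENOMEM;
		goto err;	/* every failure funnels through one label */
	}
	/* ... later steps also 'goto err' on failure ... */

	return 0;

err:
	kfree(c->res);		/* release old and new state alike */
	c->res = NULL;
	return err;
}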
Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/mr.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 9a8018cd320a..413f90af1bcb 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1375,10 +1375,8 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, ib_umem_release(mr->umem); err = mr_umem_get(pd, addr, len, access_flags, &mr->umem, &npages, &page_shift, &ncont, &order); - if (err < 0) { - clean_mr(dev, mr); - return err; - } + if (err) + goto err; } if (flags & IB_MR_REREG_TRANS && !use_umr_mtt_update(mr, addr, len)) { @@ -1395,13 +1393,16 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, mlx5_ib_warn(dev, "Failed to destroy MKey\n"); } if (err) - return err; + goto err; mr = reg_create(ib_mr, pd, addr, len, mr->umem, ncont, page_shift, access_flags, true); - if (IS_ERR(mr)) - return PTR_ERR(mr); + if (IS_ERR(mr)) { + err = PTR_ERR(mr); + mr = to_mmr(ib_mr); + goto err; + } mr->allocated_from_cache = 0; mr->live = 1; @@ -1427,13 +1428,8 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, err = rereg_umr(pd, mr, access_flags, flags); } - if (err) { - mlx5_ib_warn(dev, "Failed to rereg UMR\n"); - ib_umem_release(mr->umem); - mr->umem = NULL; - clean_mr(dev, mr); - return err; - } + if (err) + goto err; } set_mr_fileds(dev, mr, npages, len, access_flags); @@ -1442,6 +1438,14 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, update_odp_mr(mr); #endif return 0; + +err: + if (mr->umem) { + ib_umem_release(mr->umem); + mr->umem = NULL; + } + clean_mr(dev, mr); + return err; } static int From c985bd0ed732c8ce7a2e3d91988a09c5a9c1c0c9 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 13 Mar 2018 15:29:27 +0200 Subject: [PATCH 084/199] RDMA/mlx5: Guard ODP specific assignments with specific CONFIG "live" is needed for ODP only and is better to be guarded by appropriate CONFIG. Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/mr.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 413f90af1bcb..e520b941fc4d 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1296,7 +1296,9 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, } } +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING mr->live = 1; +#endif return &mr->ibmr; error: ib_umem_release(umem); @@ -1405,7 +1407,9 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, } mr->allocated_from_cache = 0; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING mr->live = 1; +#endif } else { /* * Send a UMR WQE From eeea6953c42f85d19e565a4d3a49c1530f602e22 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 13 Mar 2018 15:29:28 +0200 Subject: [PATCH 085/199] RDMA/mlx5: Simplify clean and destroy MR calls The failure to destroy the MRs is printed on mlx5_core layer as error and it makes warning prints useless. 
Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/mr.c | 40 +++++++++------------------ 1 file changed, 10 insertions(+), 30 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index e520b941fc4d..bcf5e22cf743 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -47,8 +47,8 @@ enum { #define MLX5_UMR_ALIGN 2048 -static int clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); -static int dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); +static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); +static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); static int mr_cache_max_order(struct mlx5_ib_dev *dev); static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); @@ -1385,15 +1385,10 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, /* * UMR can't be used - MKey needs to be replaced. */ - if (mr->allocated_from_cache) { + if (mr->allocated_from_cache) err = unreg_umr(dev, mr); - if (err) - mlx5_ib_warn(dev, "Failed to unregister MR\n"); - } else { + else err = destroy_mkey(dev, mr); - if (err) - mlx5_ib_warn(dev, "Failed to destroy MKey\n"); - } if (err) goto err; @@ -1498,10 +1493,9 @@ mlx5_free_priv_descs(struct mlx5_ib_mr *mr) } } -static int clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { int allocated_from_cache = mr->allocated_from_cache; - int err; if (mr->sig) { if (mlx5_core_destroy_psv(dev->mdev, @@ -1518,21 +1512,11 @@ static int clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) mlx5_free_priv_descs(mr); - if (!allocated_from_cache) { - u32 key = mr->mmkey.key; - - err = destroy_mkey(dev, mr); - if (err) { - mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", - key, err); - return err; - } - } - - return 0; + if (!allocated_from_cache) + destroy_mkey(dev, mr); } -static int dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { int npages = mr->npages; struct ib_umem *umem = mr->umem; @@ -1573,16 +1557,12 @@ static int dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) kfree(mr); else mlx5_mr_cache_free(dev, mr); - - return 0; } int mlx5_ib_dereg_mr(struct ib_mr *ibmr) { - struct mlx5_ib_dev *dev = to_mdev(ibmr->device); - struct mlx5_ib_mr *mr = to_mmr(ibmr); - - return dereg_mr(dev, mr); + dereg_mr(to_mdev(ibmr->device), to_mmr(ibmr)); + return 0; } struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, From 6612b4983f7e8d295a7503452719b113464b395f Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 13 Mar 2018 16:06:11 +0200 Subject: [PATCH 086/199] IB/core: Fix comments of GID query functions Exported symbol's comments should be with the function definition and not in the header file. Therefore comments of the ib_find_cached_gid() and ib_find_cached_gid_by_port() functions are moved closer to their definitions. The function name in the comment is different from the actual function name; fix it to be the same as ib_cache_gid_find_by_filter(). Also, the current comment section of ib_find_cached_gid_by_port() contains the description of ib_find_cached_gid(); fix that as well.
Reviewed-by: Daniel Jurgens Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cache.c | 31 +++++++++++++++++++++++++++++-- include/rdma/ib_cache.h | 29 ----------------------------- 2 files changed, 29 insertions(+), 31 deletions(-) diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index e9a409d7f4e2..31def0f2ac49 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -492,6 +492,19 @@ static int ib_cache_gid_find(struct ib_device *ib_dev, mask, port, index); } +/** + * ib_find_cached_gid_by_port - Returns the GID table index where a specified + * GID value occurs. It searches for the specified GID value in the local + * software cache. + * @device: The device to query. + * @gid: The GID value to search for. + * @gid_type: The GID type to search for. + * @port_num: The port number of the device where the GID value should be + * searched. + * @ndev: In RoCE, the net device of the device. Null means ignore. + * @index: The index into the cached GID table where the GID was found. This + * parameter may be NULL. + */ int ib_find_cached_gid_by_port(struct ib_device *ib_dev, const union ib_gid *gid, enum ib_gid_type gid_type, @@ -528,7 +541,7 @@ int ib_find_cached_gid_by_port(struct ib_device *ib_dev, EXPORT_SYMBOL(ib_find_cached_gid_by_port); /** - * ib_find_gid_by_filter - Returns the GID table index where a specified + * ib_cache_gid_find_by_filter - Returns the GID table index where a specified * GID value occurs * @device: The device to query. * @gid: The GID value to search for. @@ -539,7 +552,7 @@ EXPORT_SYMBOL(ib_find_cached_gid_by_port); * otherwise, we continue searching the GID table. It's guaranteed that * while filter is executed, ndev field is valid and the structure won't * change. filter is executed in an atomic context. filter must not be NULL. - * @index: The index into the cached GID table where the GID was found. This + * @index: The index into the cached GID table where the GID was found. This * parameter may be NULL. * * ib_cache_gid_find_by_filter() searches for the specified GID value @@ -848,6 +861,20 @@ int ib_get_cached_gid(struct ib_device *device, } EXPORT_SYMBOL(ib_get_cached_gid); +/** + * ib_find_cached_gid - Returns the port number and GID table index where + * a specified GID value occurs. + * @device: The device to query. + * @gid: The GID value to search for. + * @gid_type: The GID type to search for. + * @ndev: In RoCE, the net device of the device. NULL means ignore. + * @port_num: The port number of the device where the GID value was found. + * @index: The index into the cached GID table where the GID was found. This + * parameter may be NULL. + * + * ib_find_cached_gid() searches for the specified GID value in + * the local software cache. + */ int ib_find_cached_gid(struct ib_device *device, const union ib_gid *gid, enum ib_gid_type gid_type, diff --git a/include/rdma/ib_cache.h b/include/rdma/ib_cache.h index 385ec88ee9e5..eb49cc8d1f95 100644 --- a/include/rdma/ib_cache.h +++ b/include/rdma/ib_cache.h @@ -55,20 +55,6 @@ int ib_get_cached_gid(struct ib_device *device, union ib_gid *gid, struct ib_gid_attr *attr); -/** - * ib_find_cached_gid - Returns the port number and GID table index where - * a specified GID value occurs. - * @device: The device to query. - * @gid: The GID value to search for. - * @gid_type: The GID type to search for. - * @ndev: In RoCE, the net device of the device. NULL means ignore. 
- * @port_num: The port number of the device where the GID value was found. - * @index: The index into the cached GID table where the GID was found. This - * parameter may be NULL. - * - * ib_find_cached_gid() searches for the specified GID value in - * the local software cache. - */ int ib_find_cached_gid(struct ib_device *device, const union ib_gid *gid, enum ib_gid_type gid_type, @@ -76,21 +62,6 @@ int ib_find_cached_gid(struct ib_device *device, u8 *port_num, u16 *index); -/** - * ib_find_cached_gid_by_port - Returns the GID table index where a specified - * GID value occurs - * @device: The device to query. - * @gid: The GID value to search for. - * @gid_type: The GID type to search for. - * @port_num: The port number of the device where the GID value sould be - * searched. - * @ndev: In RoCE, the net device of the device. Null means ignore. - * @index: The index into the cached GID table where the GID was found. This - * parameter may be NULL. - * - * ib_find_cached_gid() searches for the specified GID value in - * the local software cache. - */ int ib_find_cached_gid_by_port(struct ib_device *device, const union ib_gid *gid, enum ib_gid_type gid_type, From b26c4a1138dff34cff507bafa4c87e365f4145a6 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 13 Mar 2018 16:06:12 +0200 Subject: [PATCH 087/199] IB/{core, ipoib}: Simplify ib_find_gid() for unused ndev ib_find_gid() is only used by IPoIB driver. For IB link layer, GID table entries are not based on netdevice. Netdevice parameter is unused here. Therefore, it is removed. Reviewed-by: Daniel Jurgens Reviewed-by: Mark Bloch Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Reviewed-by: Yuval Shaia Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 3 +-- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 2 +- include/rdma/ib_verbs.h | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index bb065c9449be..0ab99e62cc5c 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1050,13 +1050,12 @@ EXPORT_SYMBOL(ib_modify_port); * a specified GID value occurs. Its searches only for IB link layer. * @device: The device to query. * @gid: The GID value to search for. - * @ndev: The ndev related to the GID to search for. * @port_num: The port number of the device where the GID value was found. * @index: The index into the GID table where the GID was found. This * parameter may be NULL. 
*/ int ib_find_gid(struct ib_device *device, union ib_gid *gid, - struct net_device *ndev, u8 *port_num, u16 *index) + u8 *port_num, u16 *index) { union ib_gid tmp_gid; int ret, port, i; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 10384ea50bed..f47f9ace1f48 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -1085,7 +1085,7 @@ static bool ipoib_dev_addr_changed_valid(struct ipoib_dev_priv *priv) netif_addr_unlock_bh(priv->dev); - err = ib_find_gid(priv->ca, &search_gid, priv->dev, &port, &index); + err = ib_find_gid(priv->ca, &search_gid, &port, &index); netif_addr_lock_bh(priv->dev); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 5eb10c2470f0..ac3791e056cf 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2857,7 +2857,7 @@ int ib_modify_port(struct ib_device *device, struct ib_port_modify *port_modify); int ib_find_gid(struct ib_device *device, union ib_gid *gid, - struct net_device *ndev, u8 *port_num, u16 *index); + u8 *port_num, u16 *index); int ib_find_pkey(struct ib_device *device, u8 port_num, u16 pkey, u16 *index); From 6d337179f28cc50ddd7e224f677b4cda70b275fc Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 13 Mar 2018 16:06:13 +0200 Subject: [PATCH 088/199] IB/core: Honor return status of ib_init_ah_from_mcmember() The return status of ib_init_ah_from_mcmember() is ignored by cma_ib_mc_handler(). Honor it and return error event if ah attribute initialization failed. Reviewed-by: Daniel Jurgens Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Reviewed-by: Yuval Shaia Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 4a57869daf83..720ef15a5ec8 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -3918,10 +3918,14 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) rdma_start_port(id_priv->cma_dev->device)]; event.event = RDMA_CM_EVENT_MULTICAST_JOIN; - ib_init_ah_from_mcmember(id_priv->id.device, - id_priv->id.port_num, &multicast->rec, - ndev, gid_type, - &event.param.ud.ah_attr); + ret = ib_init_ah_from_mcmember(id_priv->id.device, + id_priv->id.port_num, + &multicast->rec, + ndev, gid_type, + &event.param.ud.ah_attr); + if (ret) + event.event = RDMA_CM_EVENT_MULTICAST_ERROR; + event.param.ud.qp_num = 0xFFFFFF; event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey); if (ndev) From 563c4ba3bd2b8b0b21c65669ec2226b1cfa1138b Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 13 Mar 2018 16:06:14 +0200 Subject: [PATCH 089/199] IB/core: Honor port_num while resolving GID for IB link layer ah_attr contains the port number to which cm_id is bound. However, while searching for GID table for matching GID entry, the port number is ignored. This could cause the wrong GID to be used when the ah_attr is converted to an AH. 
Reviewed-by: Daniel Jurgens Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/multicast.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index 45f2f095f793..4eb72ff539fc 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -724,21 +724,19 @@ int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, { int ret; u16 gid_index; - u8 p; - if (rdma_protocol_roce(device, port_num)) { - ret = ib_find_cached_gid_by_port(device, &rec->port_gid, - gid_type, port_num, - ndev, - &gid_index); - } else if (rdma_protocol_ib(device, port_num)) { - ret = ib_find_cached_gid(device, &rec->port_gid, - IB_GID_TYPE_IB, NULL, &p, + /* GID table is not based on the netdevice for IB link layer, + * so ignore ndev during search. + */ + if (rdma_protocol_ib(device, port_num)) + ndev = NULL; + else if (!rdma_protocol_roce(device, port_num)) + return -EINVAL; + + ret = ib_find_cached_gid_by_port(device, &rec->port_gid, + gid_type, port_num, + ndev, &gid_index); - } else { - ret = -EINVAL; - } - if (ret) return ret; From 5ac08a341303dd2105d7b5dc26b38b0d85ac726b Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 13 Mar 2018 16:06:15 +0200 Subject: [PATCH 090/199] IB/cma: Use rdma_protocol_roce() and remove cma_protocol_roce_dev_port() rdma_protocol_roce() API from the ib_core already provides a way to detect whether a given device+port is RoCE or not. Therefore, make use of it and avoid implementing it again in rdmacm module. Reviewed-by: Daniel Jurgens Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 720ef15a5ec8..34fa0507ed4f 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1407,21 +1407,12 @@ static bool cma_match_private_data(struct rdma_id_private *id_priv, return true; } -static bool cma_protocol_roce_dev_port(struct ib_device *device, int port_num) -{ - enum rdma_link_layer ll = rdma_port_get_link_layer(device, port_num); - enum rdma_transport_type transport = - rdma_node_get_transport(device->node_type); - - return ll == IB_LINK_LAYER_ETHERNET && transport == RDMA_TRANSPORT_IB; -} - static bool cma_protocol_roce(const struct rdma_cm_id *id) { struct ib_device *device = id->device; const int port_num = id->port_num ?: rdma_start_port(device); - return cma_protocol_roce_dev_port(device, port_num); + return rdma_protocol_roce(device, port_num); } static bool cma_match_net_dev(const struct rdma_cm_id *id, @@ -1434,7 +1425,7 @@ static bool cma_match_net_dev(const struct rdma_cm_id *id, /* This request is an AF_IB request or a RoCE request */ return (!id->port_num || id->port_num == port_num) && (addr->src_addr.ss_family == AF_IB || - cma_protocol_roce_dev_port(id->device, port_num)); + rdma_protocol_roce(id->device, port_num)); return !addr->dev_addr.bound_dev_if || (net_eq(dev_net(net_dev), addr->dev_addr.net) && @@ -1489,7 +1480,7 @@ static struct rdma_id_private *cma_id_from_event(struct ib_cm_id *cm_id, if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) { /* Assuming the protocol is AF_IB */ *net_dev = NULL; - } else if (cma_protocol_roce_dev_port(req.device, req.port)) { + } else if (rdma_protocol_roce(req.device, req.port)) { /* 
TODO find the net dev matching the request parameters * through the RoCE GID table */ *net_dev = NULL; From a9c06aeba9977e71b81ef3e107cb588e00dae150 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 13 Mar 2018 16:06:16 +0200 Subject: [PATCH 091/199] IB/core: Remove rdma_resolve_ip_route() as exported symbol rdma_resolve_ip_route() is used only by ib_core module. Therefore it is removed as an exported symbol. Reviewed-by: Daniel Jurgens Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 1 - drivers/infiniband/core/core_priv.h | 6 ++++++ include/rdma/ib_addr.h | 4 ---- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 9183d148d644..b0a52c996208 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -711,7 +711,6 @@ int rdma_resolve_ip_route(struct sockaddr *src_addr, return addr_resolve(src_in, dst_addr, addr, false, 0); } -EXPORT_SYMBOL(rdma_resolve_ip_route); void rdma_addr_cancel(struct rdma_dev_addr *addr) { diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 25bb178f6074..52b2b401e1f4 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -333,4 +333,10 @@ static inline struct ib_qp *_ib_create_qp(struct ib_device *dev, return qp; } + +struct rdma_dev_addr; +int rdma_resolve_ip_route(struct sockaddr *src_addr, + const struct sockaddr *dst_addr, + struct rdma_dev_addr *addr); + #endif /* _CORE_PRIV_H */ diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index d656809f1217..494eacdf5260 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -119,10 +119,6 @@ int rdma_resolve_ip(struct rdma_addr_client *client, struct rdma_dev_addr *addr, void *context), void *context); -int rdma_resolve_ip_route(struct sockaddr *src_addr, - const struct sockaddr *dst_addr, - struct rdma_dev_addr *addr); - void rdma_addr_cancel(struct rdma_dev_addr *addr); void rdma_copy_addr(struct rdma_dev_addr *dev_addr, From 115b68aa6ea4bb7dca1cbf66cb99cefc274180cb Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 13 Mar 2018 16:06:17 +0200 Subject: [PATCH 092/199] IB/ocrdma: Removed GID add/del null routines add_gid() and del_gid() are optional callback routines. ib_core ignores invoking them while updating GID table entries if they are not implemented by provider drivers. Therefore remove them. 
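The reason empty stubs are removable is that the core dispatches these callbacks conditionally; roughly (a sketch of the idea, not the ib_core code itself):

struct gid_ops {
	int (*add_gid)(void *ctx);
};

static int core_add_gid(const struct gid_ops *ops, void *ctx)
{
	if (!ops->add_gid)	/* provider opted out: treated as success */
		return 0;
	return ops->add_gid(ctx);
}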
Reviewed-by: Daniel Jurgens Reviewed-by: Mark Bloch Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 2 -- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 16 ---------------- drivers/infiniband/hw/ocrdma/ocrdma_verbs.h | 10 ---------- 3 files changed, 28 deletions(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index fbfbd9e96147..42dc0de54cb8 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -160,8 +160,6 @@ static int ocrdma_register_device(struct ocrdma_dev *dev) dev->ibdev.modify_port = ocrdma_modify_port; dev->ibdev.query_gid = ocrdma_query_gid; dev->ibdev.get_netdev = ocrdma_get_netdev; - dev->ibdev.add_gid = ocrdma_add_gid; - dev->ibdev.del_gid = ocrdma_del_gid; dev->ibdev.get_link_layer = ocrdma_link_layer; dev->ibdev.alloc_pd = ocrdma_alloc_pd; dev->ibdev.dealloc_pd = ocrdma_dealloc_pd; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 8009bdad4e5b..1e3dc92bc37b 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -80,22 +80,6 @@ int ocrdma_query_gid(struct ib_device *ibdev, u8 port, return ret; } -int ocrdma_add_gid(struct ib_device *device, - u8 port_num, - unsigned int index, - const union ib_gid *gid, - const struct ib_gid_attr *attr, - void **context) { - return 0; -} - -int ocrdma_del_gid(struct ib_device *device, - u8 port_num, - unsigned int index, - void **context) { - return 0; -} - int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr, struct ib_udata *uhw) { diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h index 704ef1e9271b..a48eab35861f 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h @@ -64,16 +64,6 @@ void ocrdma_get_guid(struct ocrdma_dev *, u8 *guid); int ocrdma_query_gid(struct ib_device *, u8 port, int index, union ib_gid *gid); struct net_device *ocrdma_get_netdev(struct ib_device *device, u8 port_num); -int ocrdma_add_gid(struct ib_device *device, - u8 port_num, - unsigned int index, - const union ib_gid *gid, - const struct ib_gid_attr *attr, - void **context); -int ocrdma_del_gid(struct ib_device *device, - u8 port_num, - unsigned int index, - void **context); int ocrdma_query_pkey(struct ib_device *, u8 port, u16 index, u16 *pkey); struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *, From a22af59ea9a5f9496c37bc4e4654da45a4e0ca2a Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 13 Mar 2018 16:06:18 +0200 Subject: [PATCH 093/199] IB/cm: Add and use a helper function to add cm_id's to the port list Add and use helper function add_cm_id_to_port_list() to attach cm_id to port list. 
Reviewed-by: Daniel Jurgens Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index e6749157fd86..c5cd1b3ffa54 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -462,6 +462,26 @@ static int cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc, grh, &av->ah_attr); } +static int add_cm_id_to_port_list(struct cm_id_private *cm_id_priv, + struct cm_av *av, + struct cm_port *port) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&cm.lock, flags); + + if (&cm_id_priv->av == av) + list_add_tail(&cm_id_priv->prim_list, &port->cm_priv_prim_list); + else if (&cm_id_priv->alt_av == av) + list_add_tail(&cm_id_priv->altr_list, &port->cm_priv_altr_list); + else + ret = -EINVAL; + + spin_unlock_irqrestore(&cm.lock, flags); + return ret; +} + static int cm_init_av_by_path(struct sa_path_rec *path, struct cm_av *av, struct cm_id_private *cm_id_priv) { @@ -502,16 +522,7 @@ static int cm_init_av_by_path(struct sa_path_rec *path, struct cm_av *av, av->timeout = path->packet_life_time + 1; - spin_lock_irqsave(&cm.lock, flags); - if (&cm_id_priv->av == av) - list_add_tail(&cm_id_priv->prim_list, &port->cm_priv_prim_list); - else if (&cm_id_priv->alt_av == av) - list_add_tail(&cm_id_priv->altr_list, &port->cm_priv_altr_list); - else - ret = -EINVAL; - - spin_unlock_irqrestore(&cm.lock, flags); - + ret = add_cm_id_to_port_list(cm_id_priv, av, port); return ret; } From 0a5141593567fca3e1d64da756b8d1b490f6c600 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 13 Mar 2018 16:06:20 +0200 Subject: [PATCH 094/199] IB/core: Refactor ib_init_ah_attr_from_path() for RoCE Resolving route for RoCE for a path record is needed only for the received CM requests. Therefore, (a) ib_init_ah_attr_from_path() is refactored first to isolate the code of resolving route. (b) Setting dlid, path bits is not needed for RoCE. Additionally ah attribute initialization is done from the path record entry, so it is better to refer to path record entry type for different link layer instead of ah attribute type while initializing ah attribute itself. Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/sa_query.c | 199 +++++++++++++++-------------- include/rdma/ib_sa.h | 5 + 2 files changed, 108 insertions(+), 96 deletions(-) diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 9f029a1ca5ea..1cfec68c7911 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1227,118 +1227,125 @@ static u8 get_src_path_mask(struct ib_device *device, u8 port_num) return src_path_mask; } +static int +roce_resolve_route_from_path(struct ib_device *device, u8 port_num, + struct sa_path_rec *rec) +{ + struct net_device *resolved_dev; + struct net_device *ndev; + struct net_device *idev; + struct rdma_dev_addr dev_addr = { + .bound_dev_if = ((sa_path_get_ifindex(rec) >= 0) ? + sa_path_get_ifindex(rec) : 0), + .net = sa_path_get_ndev(rec) ? 
+ sa_path_get_ndev(rec) : + &init_net + }; + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid_addr, dgid_addr; + int ret; + + if (!device->get_netdev) + return -EOPNOTSUPP; + + rdma_gid2ip(&sgid_addr._sockaddr, &rec->sgid); + rdma_gid2ip(&dgid_addr._sockaddr, &rec->dgid); + + /* validate the route */ + ret = rdma_resolve_ip_route(&sgid_addr._sockaddr, + &dgid_addr._sockaddr, &dev_addr); + if (ret) + return ret; + + if ((dev_addr.network == RDMA_NETWORK_IPV4 || + dev_addr.network == RDMA_NETWORK_IPV6) && + rec->rec_type != SA_PATH_REC_TYPE_ROCE_V2) + return -EINVAL; + + idev = device->get_netdev(device, port_num); + if (!idev) + return -ENODEV; + + resolved_dev = dev_get_by_index(dev_addr.net, + dev_addr.bound_dev_if); + if (!resolved_dev) { + ret = -ENODEV; + goto done; + } + ndev = ib_get_ndev_from_path(rec); + rcu_read_lock(); + if ((ndev && ndev != resolved_dev) || + (resolved_dev != idev && + !rdma_is_upper_dev_rcu(idev, resolved_dev))) + ret = -EHOSTUNREACH; + rcu_read_unlock(); + dev_put(resolved_dev); + if (ndev) + dev_put(ndev); +done: + dev_put(idev); + return ret; +} + +static int init_ah_attr_grh_fields(struct ib_device *device, u8 port_num, + struct sa_path_rec *rec, + struct rdma_ah_attr *ah_attr) +{ + enum ib_gid_type type = sa_conv_pathrec_to_gid_type(rec); + struct net_device *ndev; + u16 gid_index; + int ret; + + ndev = ib_get_ndev_from_path(rec); + ret = ib_find_cached_gid_by_port(device, &rec->sgid, type, + port_num, ndev, &gid_index); + if (ndev) + dev_put(ndev); + if (ret) + return ret; + + rdma_ah_set_grh(ah_attr, &rec->dgid, + be32_to_cpu(rec->flow_label), + gid_index, rec->hop_limit, + rec->traffic_class); + return 0; +} + int ib_init_ah_attr_from_path(struct ib_device *device, u8 port_num, struct sa_path_rec *rec, struct rdma_ah_attr *ah_attr) { - int ret; - u16 gid_index; - int use_roce; - struct net_device *ndev = NULL; + int ret = 0; - memset(ah_attr, 0, sizeof *ah_attr); + memset(ah_attr, 0, sizeof(*ah_attr)); ah_attr->type = rdma_ah_find_type(device, port_num); - - rdma_ah_set_dlid(ah_attr, be32_to_cpu(sa_path_get_dlid(rec))); - - if ((ah_attr->type == RDMA_AH_ATTR_TYPE_OPA) && - (rdma_ah_get_dlid(ah_attr) == be16_to_cpu(IB_LID_PERMISSIVE))) - rdma_ah_set_make_grd(ah_attr, true); - rdma_ah_set_sl(ah_attr, rec->sl); - rdma_ah_set_path_bits(ah_attr, be32_to_cpu(sa_path_get_slid(rec)) & - get_src_path_mask(device, port_num)); rdma_ah_set_port_num(ah_attr, port_num); rdma_ah_set_static_rate(ah_attr, rec->rate); - use_roce = rdma_cap_eth_ah(device, port_num); - if (use_roce) { - struct net_device *idev; - struct net_device *resolved_dev; - struct rdma_dev_addr dev_addr = { - .bound_dev_if = ((sa_path_get_ifindex(rec) >= 0) ? - sa_path_get_ifindex(rec) : 0), - .net = sa_path_get_ndev(rec) ? 
- sa_path_get_ndev(rec) : - &init_net - }; - union { - struct sockaddr _sockaddr; - struct sockaddr_in _sockaddr_in; - struct sockaddr_in6 _sockaddr_in6; - } sgid_addr, dgid_addr; - - if (!device->get_netdev) - return -EOPNOTSUPP; - - rdma_gid2ip(&sgid_addr._sockaddr, &rec->sgid); - rdma_gid2ip(&dgid_addr._sockaddr, &rec->dgid); - - /* validate the route */ - ret = rdma_resolve_ip_route(&sgid_addr._sockaddr, - &dgid_addr._sockaddr, &dev_addr); + if (sa_path_is_roce(rec)) { + ret = roce_resolve_route_from_path(device, port_num, rec); if (ret) return ret; - if ((dev_addr.network == RDMA_NETWORK_IPV4 || - dev_addr.network == RDMA_NETWORK_IPV6) && - rec->rec_type != SA_PATH_REC_TYPE_ROCE_V2) - return -EINVAL; + memcpy(ah_attr->roce.dmac, sa_path_get_dmac(rec), ETH_ALEN); + } else { + rdma_ah_set_dlid(ah_attr, be32_to_cpu(sa_path_get_dlid(rec))); + if (sa_path_is_opa(rec) && + rdma_ah_get_dlid(ah_attr) == be16_to_cpu(IB_LID_PERMISSIVE)) + rdma_ah_set_make_grd(ah_attr, true); - idev = device->get_netdev(device, port_num); - if (!idev) - return -ENODEV; - - resolved_dev = dev_get_by_index(dev_addr.net, - dev_addr.bound_dev_if); - if (!resolved_dev) { - dev_put(idev); - return -ENODEV; - } - ndev = ib_get_ndev_from_path(rec); - rcu_read_lock(); - if ((ndev && ndev != resolved_dev) || - (resolved_dev != idev && - !rdma_is_upper_dev_rcu(idev, resolved_dev))) - ret = -EHOSTUNREACH; - rcu_read_unlock(); - dev_put(idev); - dev_put(resolved_dev); - if (ret) { - if (ndev) - dev_put(ndev); - return ret; - } + rdma_ah_set_path_bits(ah_attr, + be32_to_cpu(sa_path_get_slid(rec)) & + get_src_path_mask(device, port_num)); } - if (rec->hop_limit > 0 || use_roce) { - enum ib_gid_type type = sa_conv_pathrec_to_gid_type(rec); - - ret = ib_find_cached_gid_by_port(device, &rec->sgid, type, - port_num, ndev, &gid_index); - if (ret) { - if (ndev) - dev_put(ndev); - return ret; - } - - rdma_ah_set_grh(ah_attr, &rec->dgid, - be32_to_cpu(rec->flow_label), - gid_index, rec->hop_limit, - rec->traffic_class); - if (ndev) - dev_put(ndev); - } - - if (use_roce) { - u8 *dmac = sa_path_get_dmac(rec); - - if (!dmac) - return -EINVAL; - memcpy(ah_attr->roce.dmac, dmac, ETH_ALEN); - } - - return 0; + if (rec->hop_limit > 0 || sa_path_is_roce(rec)) + ret = init_ah_attr_grh_fields(device, port_num, rec, ah_attr); + return ret; } EXPORT_SYMBOL(ib_init_ah_attr_from_path); diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h index 811cfcfcbe3d..82b8e59af14a 100644 --- a/include/rdma/ib_sa.h +++ b/include/rdma/ib_sa.h @@ -590,6 +590,11 @@ static inline bool sa_path_is_roce(struct sa_path_rec *rec) (rec->rec_type == SA_PATH_REC_TYPE_ROCE_V2)); } +static inline bool sa_path_is_opa(struct sa_path_rec *rec) +{ + return (rec->rec_type == SA_PATH_REC_TYPE_OPA); +} + static inline void sa_path_set_slid(struct sa_path_rec *rec, u32 slid) { if (rec->rec_type == SA_PATH_REC_TYPE_IB) From cb12a8e2fa5fc6381aa8ae542276099a64dee6e9 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 13 Mar 2018 16:06:22 +0200 Subject: [PATCH 095/199] IB/cm: Introduce and use helper function to get cm_port from path Introduce and use helper function get_cm_port_from_path() to get cm_port based on the path record entry.
Reviewed-by: Daniel Jurgens Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index c5cd1b3ffa54..4cc0fe6a29ff 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -482,13 +482,11 @@ static int add_cm_id_to_port_list(struct cm_id_private *cm_id_priv, return ret; } -static int cm_init_av_by_path(struct sa_path_rec *path, struct cm_av *av, - struct cm_id_private *cm_id_priv) +static struct cm_port *get_cm_port_from_path(struct sa_path_rec *path) { struct cm_device *cm_dev; struct cm_port *port = NULL; unsigned long flags; - int ret; u8 p; struct net_device *ndev = ib_get_ndev_from_path(path); @@ -497,7 +495,7 @@ static int cm_init_av_by_path(struct sa_path_rec *path, struct cm_av *av, if (!ib_find_cached_gid(cm_dev->ib_device, &path->sgid, sa_conv_pathrec_to_gid_type(path), ndev, &p, NULL)) { - port = cm_dev->port[p-1]; + port = cm_dev->port[p - 1]; break; } } @@ -505,9 +503,20 @@ static int cm_init_av_by_path(struct sa_path_rec *path, struct cm_av *av, if (ndev) dev_put(ndev); + return port; +} +static int cm_init_av_by_path(struct sa_path_rec *path, struct cm_av *av, + struct cm_id_private *cm_id_priv) +{ + struct cm_device *cm_dev; + struct cm_port *port; + int ret; + + port = get_cm_port_from_path(path); if (!port) return -EINVAL; + cm_dev = port->cm_dev; ret = ib_find_cached_pkey(cm_dev->ib_device, port->port_num, be16_to_cpu(path->pkey), &av->pkey_index); From e41a7c41947d33dbca16d1695460c121342a4601 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 13 Mar 2018 16:06:23 +0200 Subject: [PATCH 096/199] IB/core: Move rdma_addr_find_l2_eth_by_grh to core_priv.h Before commit [1], rdma_addr_find_l2_eth_by_grh() was an exported function and therefore its declaration in include/rdma/ib_addr.h was fine. But now that its scope is limited to the ib_core module, it's better to have it in core_priv.h.
[1] commit 1060f8653414 ("IB/{core/cm}: Fix generating a return AH for RoCEE") Reviewed-by: Daniel Jurgens Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/core_priv.h | 5 +++++ include/rdma/ib_addr.h | 5 ----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 52b2b401e1f4..54163a6e4067 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -339,4 +339,9 @@ int rdma_resolve_ip_route(struct sockaddr *src_addr, const struct sockaddr *dst_addr, struct rdma_dev_addr *addr); +int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, + const union ib_gid *dgid, + u8 *dmac, const struct net_device *ndev, + int *hoplimit); + #endif /* _CORE_PRIV_H */ diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index 494eacdf5260..e8860a46754a 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -127,11 +127,6 @@ void rdma_copy_addr(struct rdma_dev_addr *dev_addr, int rdma_addr_size(struct sockaddr *addr); -int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, - const union ib_gid *dgid, - u8 *dmac, const struct net_device *ndev, - int *hoplimit); - static inline u16 ib_addr_get_pkey(struct rdma_dev_addr *dev_addr) { return ((u16)dev_addr->broadcast[8] << 8) | (u16)dev_addr->broadcast[9]; From 7b48221cf41a90cf4bfc36e6d699b7fa4169c970 Mon Sep 17 00:00:00 2001 From: Yixian Liu Date: Thu, 15 Mar 2018 15:23:14 +0800 Subject: [PATCH 097/199] RDMA/hns: Fix cqn type and init resp This patch changes the type of cqn from u32 to u64 to keep userspace and kernel consistent, initializes resp to zeros for both cq and qp, and also changes the outlen check from '==' to '>=' so that future extensions of the response structure remain compatible.
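The '>=' form is the usual forward-compatibility rule for response buffers: newer userspace may pass a buffer larger than what this kernel knows about, and the kernel should still fill the prefix it understands. A sketch of the rule (copy_resp stands in for ib_copy_to_udata(); the struct layout is illustrative, not the hns ABI):

#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>

struct resp_v1 {
	__u64 cqn;
	__u64 cap_flags;
};

static int copy_resp(void *udata_out, size_t outlen, const struct resp_v1 *r)
{
	/* '==' would reject any userspace built against a larger struct */
	if (outlen < sizeof(*r))
		return -EINVAL;
	memcpy(udata_out, r, sizeof(*r));
	return 0;
}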
Suggested-by: Jason Gunthorpe Fixes: e088a685eae9 (hns: Support rq record doorbell for the user space) Fixes: 9b44703d0a21 (hns: Support cq record doorbell for the user space) Signed-off-by: Yixian Liu Signed-off-by: Lijun Ou Signed-off-by: Wei Hu (Xavier) Signed-off-by: Shaobo Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_cq.c | 15 +++++++-------- drivers/infiniband/hw/hns/hns_roce_qp.c | 8 ++++---- include/uapi/rdma/hns-abi.h | 3 +-- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 462b644bbbd7..095a9100717d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -315,7 +315,7 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev, struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev); struct device *dev = hr_dev->dev; struct hns_roce_ib_create_cq ucmd; - struct hns_roce_ib_create_cq_resp resp; + struct hns_roce_ib_create_cq_resp resp = {}; struct hns_roce_cq *hr_cq = NULL; struct hns_roce_uar *uar = NULL; int vector = attr->comp_vector; @@ -389,7 +389,7 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev, } if (context && (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) && - (udata->outlen == sizeof(resp))) { + (udata->outlen >= sizeof(resp))) { ret = hns_roce_db_map_user(to_hr_ucontext(context), ucmd.db_addr, &hr_cq->db); if (ret) { @@ -413,15 +413,14 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev, hr_cq->cq_depth = cq_entries; if (context) { + resp.cqn = hr_cq->cqn; if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) && - (udata->outlen == sizeof(resp))) { + (udata->outlen >= sizeof(resp))) { hr_cq->db_en = 1; - resp.cqn = hr_cq->cqn; resp.cap_flags |= HNS_ROCE_SUPPORT_CQ_RECORD_DB; - ret = ib_copy_to_udata(udata, &resp, sizeof(resp)); - } else - ret = ib_copy_to_udata(udata, &hr_cq->cqn, sizeof(u64)); + } + ret = ib_copy_to_udata(udata, &resp, sizeof(resp)); if (ret) goto err_dbmap; } @@ -430,7 +429,7 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev, err_dbmap: if (context && (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) && - (udata->outlen == sizeof(resp))) + (udata->outlen >= sizeof(resp))) hns_roce_db_unmap_user(to_hr_ucontext(context), &hr_cq->db); diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index f0ad455ad62b..e289a924e789 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -506,7 +506,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, { struct device *dev = hr_dev->dev; struct hns_roce_ib_create_qp ucmd; - struct hns_roce_ib_create_qp_resp resp; + struct hns_roce_ib_create_qp_resp resp = {}; unsigned long qpn = 0; int ret = 0; u32 page_shift; @@ -614,7 +614,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, } if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) && - (udata->outlen == sizeof(resp)) && + (udata->outlen >= sizeof(resp)) && hns_roce_qp_has_rq(init_attr)) { ret = hns_roce_db_map_user( to_hr_ucontext(ib_pd->uobject->context), @@ -730,7 +730,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, else hr_qp->doorbell_qpn = cpu_to_le64(hr_qp->qpn); - if (ib_pd->uobject && (udata->outlen == sizeof(resp)) && + if (ib_pd->uobject && (udata->outlen >= sizeof(resp)) && (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB)) { /* indicate kernel supports record db */ @@ -759,7 +759,7 @@ err_qpn: 
err_wrid: if (ib_pd->uobject) { if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) && - (udata->outlen == sizeof(resp)) && + (udata->outlen >= sizeof(resp)) && hns_roce_qp_has_rq(init_attr)) hns_roce_db_unmap_user( to_hr_ucontext(ib_pd->uobject->context), diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h index 38e8f192bf72..f7af7e59a5e4 100644 --- a/include/uapi/rdma/hns-abi.h +++ b/include/uapi/rdma/hns-abi.h @@ -42,8 +42,7 @@ struct hns_roce_ib_create_cq { }; struct hns_roce_ib_create_cq_resp { - __u32 cqn; - __u32 reserved; + __u64 cqn; /* Only 32 bits used, 64 for compat */ __u64 cap_flags; }; From 561e5d48968be22fc71af6a0e13af6edae595dbe Mon Sep 17 00:00:00 2001 From: Sinan Kaya Date: Tue, 13 Mar 2018 23:20:24 -0400 Subject: [PATCH 098/199] RDMA/qedr: eliminate duplicate barriers on weakly-ordered archs Code includes wmb() followed by writel() in multiple places. writel() already has a barrier on some architectures like arm64, which ends up with the CPU observing two barriers back to back before executing the register write. Since the code already has an explicit barrier call, change writel() to writel_relaxed(). Signed-off-by: Sinan Kaya Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qedr/verbs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 875b17272d65..b61a395f89de 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -1870,7 +1870,7 @@ static int qedr_update_qp_state(struct qedr_dev *dev, if (rdma_protocol_roce(&dev->ibdev, 1)) { wmb(); - writel(qp->rq.db_data.raw, qp->rq.db); + writel_relaxed(qp->rq.db_data.raw, qp->rq.db); /* Make sure write takes effect */ mmiowb(); } @@ -3257,7 +3257,7 @@ int qedr_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, * redundant doorbell. */ wmb(); - writel(qp->sq.db_data.raw, qp->sq.db); + writel_relaxed(qp->sq.db_data.raw, qp->sq.db); /* Make sure write sticks */ mmiowb(); From 95da6e96f1e74e6ddfb347f7a8310f419bb47bf7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Mar 2018 12:45:10 -0700 Subject: [PATCH 099/199] RDMAVT: Fix synchronization around percpu_ref rvt_mregion uses percpu_ref for reference counting and RCU to protect accesses from lkey_table. When a rvt_mregion needs to be freed, it first gets unregistered from lkey_table and then rvt_check_refs() is called to wait for in-flight usages before the rvt_mregion is freed. rvt_check_refs() has a couple of issues. * It has a fast exit path which tests percpu_ref_is_zero(). However, a percpu_ref reading zero doesn't mean that the object can be released. In fact, the ->release() callback might not even have started executing yet. Proceeding with freeing can lead to use-after-free. * lkey_table is RCU protected but there is no RCU grace period in the free path. percpu_ref uses RCU internally but it's sched-RCU whose grace periods are different from regular RCU. Also, it generally isn't a good idea to depend on internal behaviors like this. To address the above issues, this patch removes the fast exit and adds an explicit synchronize_rcu().
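The resulting teardown order, sketched in simplified form (names per the hunk below):

    /* Readers find @mr in lkey_table under rcu_read_lock(), so the free
     * path must unhook it and then wait out a regular RCU grace period;
     * percpu_ref_is_zero() is no substitute, since ->release() may not
     * have run yet when the count reads zero.
     */
    if (mr->lkey) {                 /* skip for the dma mr */
            rvt_dereg_clean_qps(mr);        /* unhook from lkey_table */
            synchronize_rcu();              /* wait for existing RCU readers */
    }
    timeout = wait_for_completion_timeout(&mr->comp, 5 * HZ);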
Signed-off-by: Tejun Heo Acked-by: Dennis Dalessandro Cc: Mike Marciniszyn Cc: linux-rdma@vger.kernel.org Cc: Linus Torvalds Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rdmavt/mr.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index 1b2e5362a3ff..cc429b567d0a 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -489,11 +489,13 @@ static int rvt_check_refs(struct rvt_mregion *mr, const char *t) unsigned long timeout; struct rvt_dev_info *rdi = ib_to_rvt(mr->pd->device); - if (percpu_ref_is_zero(&mr->refcount)) - return 0; - /* avoid dma mr */ - if (mr->lkey) + if (mr->lkey) { + /* avoid dma mr */ rvt_dereg_clean_qps(mr); + /* @mr was indexed on rcu protected @lkey_table */ + synchronize_rcu(); + } + timeout = wait_for_completion_timeout(&mr->comp, 5 * HZ); if (!timeout) { rvt_pr_err(rdi, From b92ec0fe3224dbce7d50fb6cbfaf4eaf4a6f0359 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 13 Mar 2018 16:33:17 -0600 Subject: [PATCH 100/199] RDMA/rxe: Get rid of confusing udata parameter to rxe_cq_chk_attr It isn't used and it couldn't possibly ever be used correctly. Tested-by: Yuval Shaia Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_cq.c | 2 +- drivers/infiniband/sw/rxe/rxe_loc.h | 2 +- drivers/infiniband/sw/rxe/rxe_verbs.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c index c4aabf78dc90..c9593e472753 100644 --- a/drivers/infiniband/sw/rxe/rxe_cq.c +++ b/drivers/infiniband/sw/rxe/rxe_cq.c @@ -36,7 +36,7 @@ #include "rxe_queue.h" int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq, - int cqe, int comp_vector, struct ib_udata *udata) + int cqe, int comp_vector) { int count; diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index e8150ab7df58..31070a696f36 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -52,7 +52,7 @@ struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt); /* rxe_cq.c */ int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq, - int cqe, int comp_vector, struct ib_udata *udata); + int cqe, int comp_vector); int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, int comp_vector, struct ib_ucontext *context, diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 45594091353c..34539c3242a8 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -892,7 +892,7 @@ static struct ib_cq *rxe_create_cq(struct ib_device *dev, if (attr->flags) return ERR_PTR(-EINVAL); - err = rxe_cq_chk_attr(rxe, NULL, attr->cqe, attr->comp_vector, udata); + err = rxe_cq_chk_attr(rxe, NULL, attr->cqe, attr->comp_vector); if (err) goto err1; @@ -931,7 +931,7 @@ static int rxe_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) struct rxe_cq *cq = to_rcq(ibcq); struct rxe_dev *rxe = to_rdev(ibcq->device); - err = rxe_cq_chk_attr(rxe, cq, cqe, 0, udata); + err = rxe_cq_chk_attr(rxe, cq, cqe, 0); if (err) goto err1; From 0c43ab371bcb07d9ed9c95ea116e6d1d703b56ca Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 13 Mar 2018 16:33:18 -0600 Subject: [PATCH 101/199] RDMA/rxe: Use structs to describe the uABI instead of opencoding Open coding pointer math is not acceptable for describing the uABI in RDMA. Provide structs for all the cases. 
The udata is cast to the struct as close to the verbs entry point as possible for maximum clarity. Function signatures and so forth are revised to allow for this. Tested-by: Yuval Shaia Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_cq.c | 13 ++++---- drivers/infiniband/sw/rxe/rxe_loc.h | 13 +++++--- drivers/infiniband/sw/rxe/rxe_qp.c | 26 ++++++++------- drivers/infiniband/sw/rxe/rxe_queue.c | 24 +++----------- drivers/infiniband/sw/rxe/rxe_queue.h | 5 ++- drivers/infiniband/sw/rxe/rxe_srq.c | 44 +++++++++--------------- drivers/infiniband/sw/rxe/rxe_verbs.c | 48 ++++++++++++++++++++++++--- include/uapi/rdma/rdma_user_rxe.h | 22 ++++++++++++ 8 files changed, 116 insertions(+), 79 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c index c9593e472753..2ee4b08b00ea 100644 --- a/drivers/infiniband/sw/rxe/rxe_cq.c +++ b/drivers/infiniband/sw/rxe/rxe_cq.c @@ -83,7 +83,7 @@ static void rxe_send_complete(unsigned long data) int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, int comp_vector, struct ib_ucontext *context, - struct ib_udata *udata) + struct rxe_create_cq_resp __user *uresp) { int err; @@ -94,15 +94,15 @@ int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, return -ENOMEM; } - err = do_mmap_info(rxe, udata, false, context, cq->queue->buf, - cq->queue->buf_size, &cq->queue->ip); + err = do_mmap_info(rxe, uresp ? &uresp->mi : NULL, context, + cq->queue->buf, cq->queue->buf_size, &cq->queue->ip); if (err) { kvfree(cq->queue->buf); kfree(cq->queue); return err; } - if (udata) + if (uresp) cq->is_user = 1; cq->is_dying = false; @@ -114,14 +114,15 @@ int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, return 0; } -int rxe_cq_resize_queue(struct rxe_cq *cq, int cqe, struct ib_udata *udata) +int rxe_cq_resize_queue(struct rxe_cq *cq, int cqe, + struct rxe_resize_cq_resp __user *uresp) { int err; err = rxe_queue_resize(cq->queue, (unsigned int *)&cqe, sizeof(struct rxe_cqe), cq->queue->ip ? cq->queue->ip->context : NULL, - udata, NULL, &cq->cq_lock); + uresp ?
&uresp->mi : NULL, NULL, &cq->cq_lock); if (!err) cq->ibcq.cqe = cqe; diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index 31070a696f36..b71023c1c58b 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -56,9 +56,10 @@ int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq, int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, int comp_vector, struct ib_ucontext *context, - struct ib_udata *udata); + struct rxe_create_cq_resp __user *uresp); -int rxe_cq_resize_queue(struct rxe_cq *cq, int new_cqe, struct ib_udata *udata); +int rxe_cq_resize_queue(struct rxe_cq *cq, int new_cqe, + struct rxe_resize_cq_resp __user *uresp); int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited); @@ -158,7 +159,8 @@ int rxe_mcast_delete(struct rxe_dev *rxe, union ib_gid *mgid); int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init); int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd, - struct ib_qp_init_attr *init, struct ib_udata *udata, + struct ib_qp_init_attr *init, + struct rxe_create_qp_resp __user *uresp, struct ib_pd *ibpd); int rxe_qp_to_init(struct rxe_qp *qp, struct ib_qp_init_attr *init); @@ -226,11 +228,12 @@ int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq, int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq, struct ib_srq_init_attr *init, - struct ib_ucontext *context, struct ib_udata *udata); + struct ib_ucontext *context, + struct rxe_create_srq_resp __user *uresp); int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq, struct ib_srq_attr *attr, enum ib_srq_attr_mask mask, - struct ib_udata *udata); + struct rxe_modify_srq_cmd *ucmd); void rxe_release(struct kref *kref); diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 98a7a19146a8..b9f7aa1114b2 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -216,7 +216,8 @@ static void rxe_qp_init_misc(struct rxe_dev *rxe, struct rxe_qp *qp, static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, struct ib_qp_init_attr *init, - struct ib_ucontext *context, struct ib_udata *udata) + struct ib_ucontext *context, + struct rxe_create_qp_resp __user *uresp) { int err; int wqe_size; @@ -241,9 +242,9 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, if (!qp->sq.queue) return -ENOMEM; - err = do_mmap_info(rxe, udata, true, - context, qp->sq.queue->buf, - qp->sq.queue->buf_size, &qp->sq.queue->ip); + err = do_mmap_info(rxe, uresp ? &uresp->sq_mi : NULL, context, + qp->sq.queue->buf, qp->sq.queue->buf_size, + &qp->sq.queue->ip); if (err) { kvfree(qp->sq.queue->buf); @@ -274,7 +275,8 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp, struct ib_qp_init_attr *init, - struct ib_ucontext *context, struct ib_udata *udata) + struct ib_ucontext *context, + struct rxe_create_qp_resp __user *uresp) { int err; int wqe_size; @@ -294,9 +296,8 @@ static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp, if (!qp->rq.queue) return -ENOMEM; - err = do_mmap_info(rxe, udata, false, context, - qp->rq.queue->buf, - qp->rq.queue->buf_size, + err = do_mmap_info(rxe, uresp ? 
&uresp->rq_mi : NULL, context, + qp->rq.queue->buf, qp->rq.queue->buf_size, &qp->rq.queue->ip); if (err) { kvfree(qp->rq.queue->buf); @@ -322,14 +323,15 @@ static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp, /* called by the create qp verb */ int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd, - struct ib_qp_init_attr *init, struct ib_udata *udata, + struct ib_qp_init_attr *init, + struct rxe_create_qp_resp __user *uresp, struct ib_pd *ibpd) { int err; struct rxe_cq *rcq = to_rcq(init->recv_cq); struct rxe_cq *scq = to_rcq(init->send_cq); struct rxe_srq *srq = init->srq ? to_rsrq(init->srq) : NULL; - struct ib_ucontext *context = udata ? ibpd->uobject->context : NULL; + struct ib_ucontext *context = ibpd->uobject ? ibpd->uobject->context : NULL; rxe_add_ref(pd); rxe_add_ref(rcq); @@ -344,11 +346,11 @@ int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd, rxe_qp_init_misc(rxe, qp, init); - err = rxe_qp_init_req(rxe, qp, init, context, udata); + err = rxe_qp_init_req(rxe, qp, init, context, uresp); if (err) goto err1; - err = rxe_qp_init_resp(rxe, qp, init, context, udata); + err = rxe_qp_init_resp(rxe, qp, init, context, uresp); if (err) goto err2; diff --git a/drivers/infiniband/sw/rxe/rxe_queue.c b/drivers/infiniband/sw/rxe/rxe_queue.c index d14bf496d62d..f84ab4469261 100644 --- a/drivers/infiniband/sw/rxe/rxe_queue.c +++ b/drivers/infiniband/sw/rxe/rxe_queue.c @@ -37,35 +37,21 @@ #include "rxe_queue.h" int do_mmap_info(struct rxe_dev *rxe, - struct ib_udata *udata, - bool is_req, + struct mminfo __user *outbuf, struct ib_ucontext *context, struct rxe_queue_buf *buf, size_t buf_size, struct rxe_mmap_info **ip_p) { int err; - u32 len, offset; struct rxe_mmap_info *ip = NULL; - if (udata) { - if (is_req) { - len = udata->outlen - sizeof(struct mminfo); - offset = sizeof(struct mminfo); - } else { - len = udata->outlen; - offset = 0; - } - - if (len < sizeof(ip->info)) - goto err1; - + if (outbuf) { ip = rxe_create_mmap_info(rxe, buf_size, context, buf); if (!ip) goto err1; - err = copy_to_user(udata->outbuf + offset, &ip->info, - sizeof(ip->info)); + err = copy_to_user(outbuf, &ip->info, sizeof(ip->info)); if (err) goto err2; @@ -171,7 +157,7 @@ int rxe_queue_resize(struct rxe_queue *q, unsigned int *num_elem_p, unsigned int elem_size, struct ib_ucontext *context, - struct ib_udata *udata, + struct mminfo __user *outbuf, spinlock_t *producer_lock, spinlock_t *consumer_lock) { @@ -184,7 +170,7 @@ int rxe_queue_resize(struct rxe_queue *q, if (!new_q) return -ENOMEM; - err = do_mmap_info(new_q->rxe, udata, false, context, new_q->buf, + err = do_mmap_info(new_q->rxe, outbuf, context, new_q->buf, new_q->buf_size, &new_q->ip); if (err) { vfree(new_q->buf); diff --git a/drivers/infiniband/sw/rxe/rxe_queue.h b/drivers/infiniband/sw/rxe/rxe_queue.h index 8c8641c87817..79ba4b320054 100644 --- a/drivers/infiniband/sw/rxe/rxe_queue.h +++ b/drivers/infiniband/sw/rxe/rxe_queue.h @@ -77,8 +77,7 @@ struct rxe_queue { }; int do_mmap_info(struct rxe_dev *rxe, - struct ib_udata *udata, - bool is_req, + struct mminfo __user *outbuf, struct ib_ucontext *context, struct rxe_queue_buf *buf, size_t buf_size, @@ -94,7 +93,7 @@ int rxe_queue_resize(struct rxe_queue *q, unsigned int *num_elem_p, unsigned int elem_size, struct ib_ucontext *context, - struct ib_udata *udata, + struct mminfo __user *outbuf, /* Protect producers while resizing queue */ spinlock_t *producer_lock, /* Protect consumers while resizing queue */ diff --git 
a/drivers/infiniband/sw/rxe/rxe_srq.c b/drivers/infiniband/sw/rxe/rxe_srq.c index efc832a2d7c6..0d6c04ba7fc3 100644 --- a/drivers/infiniband/sw/rxe/rxe_srq.c +++ b/drivers/infiniband/sw/rxe/rxe_srq.c @@ -99,7 +99,8 @@ err1: int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq, struct ib_srq_init_attr *init, - struct ib_ucontext *context, struct ib_udata *udata) + struct ib_ucontext *context, + struct rxe_create_srq_resp __user *uresp) { int err; int srq_wqe_size; @@ -126,55 +127,41 @@ int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq, srq->rq.queue = q; - err = do_mmap_info(rxe, udata, false, context, q->buf, + err = do_mmap_info(rxe, uresp ? &uresp->mi : NULL, context, q->buf, q->buf_size, &q->ip); if (err) return err; - if (udata && udata->outlen >= sizeof(struct mminfo) + sizeof(u32)) { - if (copy_to_user(udata->outbuf + sizeof(struct mminfo), - &srq->srq_num, sizeof(u32))) + if (uresp) { + if (copy_to_user(&uresp->srq_num, &srq->srq_num, + sizeof(uresp->srq_num))) return -EFAULT; } + return 0; } int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq, struct ib_srq_attr *attr, enum ib_srq_attr_mask mask, - struct ib_udata *udata) + struct rxe_modify_srq_cmd *ucmd) { int err; struct rxe_queue *q = srq->rq.queue; - struct mminfo mi = { .offset = 1, .size = 0}; + struct mminfo __user *mi = NULL; if (mask & IB_SRQ_MAX_WR) { - /* Check that we can write the mminfo struct to user space */ - if (udata && udata->inlen >= sizeof(__u64)) { - __u64 mi_addr; - - /* Get address of user space mminfo struct */ - err = ib_copy_from_udata(&mi_addr, udata, - sizeof(mi_addr)); - if (err) - goto err1; - - udata->outbuf = (void __user *)(unsigned long)mi_addr; - udata->outlen = sizeof(mi); - - if (!access_ok(VERIFY_WRITE, - (void __user *)udata->outbuf, - udata->outlen)) { - err = -EFAULT; - goto err1; - } - } + /* + * This is completely screwed up, the response is supposed to + * be in the outbuf not like this. + */ + mi = u64_to_user_ptr(ucmd->mmap_info_addr); err = rxe_queue_resize(q, &attr->max_wr, rcv_wqe_size(srq->rq.max_sge), srq->rq.queue->ip ? srq->rq.queue->ip->context : NULL, - udata, &srq->rq.producer_lock, + mi, &srq->rq.producer_lock, &srq->rq.consumer_lock); if (err) goto err2; @@ -188,6 +175,5 @@ int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq, err2: rxe_queue_cleanup(q); srq->rq.queue = NULL; -err1: return err; } diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 34539c3242a8..ced79e49234b 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -407,6 +407,13 @@ static struct ib_srq *rxe_create_srq(struct ib_pd *ibpd, struct rxe_pd *pd = to_rpd(ibpd); struct rxe_srq *srq; struct ib_ucontext *context = udata ? 
ibpd->uobject->context : NULL; + struct rxe_create_srq_resp __user *uresp = NULL; + + if (udata) { + if (udata->outlen < sizeof(*uresp)) + return ERR_PTR(-EINVAL); + uresp = udata->outbuf; + } err = rxe_srq_chk_attr(rxe, NULL, &init->attr, IB_SRQ_INIT_MASK); if (err) @@ -422,7 +429,7 @@ static struct ib_srq *rxe_create_srq(struct ib_pd *ibpd, rxe_add_ref(pd); srq->pd = pd; - err = rxe_srq_from_init(rxe, srq, init, context, udata); + err = rxe_srq_from_init(rxe, srq, init, context, uresp); if (err) goto err2; @@ -443,12 +450,22 @@ static int rxe_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, int err; struct rxe_srq *srq = to_rsrq(ibsrq); struct rxe_dev *rxe = to_rdev(ibsrq->device); + struct rxe_modify_srq_cmd ucmd = {}; + + if (udata) { + if (udata->inlen < sizeof(ucmd)) + return -EINVAL; + + err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); + if (err) + return err; + } err = rxe_srq_chk_attr(rxe, srq, attr, mask); if (err) goto err1; - err = rxe_srq_from_attr(rxe, srq, attr, mask, udata); + err = rxe_srq_from_attr(rxe, srq, attr, mask, &ucmd); if (err) goto err1; @@ -517,6 +534,13 @@ static struct ib_qp *rxe_create_qp(struct ib_pd *ibpd, struct rxe_dev *rxe = to_rdev(ibpd->device); struct rxe_pd *pd = to_rpd(ibpd); struct rxe_qp *qp; + struct rxe_create_qp_resp __user *uresp = NULL; + + if (udata) { + if (udata->outlen < sizeof(*uresp)) + return ERR_PTR(-EINVAL); + uresp = udata->outbuf; + } err = rxe_qp_chk_init(rxe, init); if (err) @@ -538,7 +562,7 @@ static struct ib_qp *rxe_create_qp(struct ib_pd *ibpd, rxe_add_index(qp); - err = rxe_qp_from_init(rxe, qp, pd, init, udata, ibpd); + err = rxe_qp_from_init(rxe, qp, pd, init, uresp, ibpd); if (err) goto err3; @@ -888,6 +912,13 @@ static struct ib_cq *rxe_create_cq(struct ib_device *dev, int err; struct rxe_dev *rxe = to_rdev(dev); struct rxe_cq *cq; + struct rxe_create_cq_resp __user *uresp = NULL; + + if (udata) { + if (udata->outlen < sizeof(*uresp)) + return ERR_PTR(-EINVAL); + uresp = udata->outbuf; + } if (attr->flags) return ERR_PTR(-EINVAL); @@ -903,7 +934,7 @@ static struct ib_cq *rxe_create_cq(struct ib_device *dev, } err = rxe_cq_from_init(rxe, cq, attr->cqe, attr->comp_vector, - context, udata); + context, uresp); if (err) goto err2; @@ -930,12 +961,19 @@ static int rxe_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) int err; struct rxe_cq *cq = to_rcq(ibcq); struct rxe_dev *rxe = to_rdev(ibcq->device); + struct rxe_resize_cq_resp __user *uresp = NULL; + + if (udata) { + if (udata->outlen < sizeof(*uresp)) + return -EINVAL; + uresp = udata->outbuf; + } err = rxe_cq_chk_attr(rxe, cq, cqe, 0); if (err) goto err1; - err = rxe_cq_resize_queue(cq, cqe, udata); + err = rxe_cq_resize_queue(cq, cqe, uresp); if (err) goto err1; diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h index e3e6852b58eb..b3b1bfc8fa21 100644 --- a/include/uapi/rdma/rdma_user_rxe.h +++ b/include/uapi/rdma/rdma_user_rxe.h @@ -144,4 +144,26 @@ struct rxe_recv_wqe { struct rxe_dma_info dma; }; +struct rxe_create_cq_resp { + struct mminfo mi; +}; + +struct rxe_resize_cq_resp { + struct mminfo mi; +}; + +struct rxe_create_qp_resp { + struct mminfo rq_mi; + struct mminfo sq_mi; +}; + +struct rxe_create_srq_resp { + struct mminfo mi; + __u32 srq_num; +}; + +struct rxe_modify_srq_cmd { + __u64 mmap_info_addr; +}; + #endif /* RDMA_USER_RXE_H */ From 48962f5c6fffcb676dd6ebd70f7869cfc6cc8356 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 13 Mar 2018 16:26:46 -0600 Subject: [PATCH 102/199] RDMA/mlx4: 
Move flag constants to uapi header MLX4_USER_DEV_CAP_LARGE_CQE (via mlx4_ib_alloc_ucontext_resp.dev_caps) and MLX4_IB_QUERY_DEV_RESP_MASK_CORE_CLOCK_OFFSET (via mlx4_uverbs_ex_query_device_resp.comp_mask) are copied directly to userspace and form part of the uAPI. Move them to the uapi header where they belong. Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/main.c | 2 +- drivers/infiniband/hw/mlx4/mlx4_ib.h | 4 ---- drivers/net/ethernet/mellanox/mlx4/fw.c | 1 + drivers/net/ethernet/mellanox/mlx4/main.c | 1 + include/linux/mlx4/device.h | 4 ---- include/uapi/rdma/mlx4-abi.h | 8 ++++++++ 6 files changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index c9eaaa216891..f57229b85536 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -575,7 +575,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, if (uhw->outlen >= resp.response_length + sizeof(resp.hca_core_clock_offset)) { resp.response_length += sizeof(resp.hca_core_clock_offset); if (!err && !mlx4_is_slave(dev->dev)) { - resp.comp_mask |= QUERY_DEVICE_RESP_MASK_TIMESTAMP; + resp.comp_mask |= MLX4_IB_QUERY_DEV_RESP_MASK_CORE_CLOCK_OFFSET; resp.hca_core_clock_offset = clock_params.offset % PAGE_SIZE; } } diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index d0640bd79679..87c47b1dd870 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -641,10 +641,6 @@ struct mlx4_uverbs_ex_query_device { __u32 reserved; }; -enum query_device_resp_mask { - QUERY_DEVICE_RESP_MASK_TIMESTAMP = 1UL << 0, -}; - static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev) { return container_of(ibdev, struct mlx4_ib_dev, ib_dev); diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index 634f603f941c..de6b3d416148 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "fw.h" #include "icm.h" diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 4d84cab77105..958619ff24ae 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -46,6 +46,7 @@ #include #include +#include #include #include diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index a9b5fed8f7c6..81d0799b6091 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -256,10 +256,6 @@ enum { MLX4_DEV_CAP_EQE_STRIDE_ENABLED = 1LL << 3 }; -enum { - MLX4_USER_DEV_CAP_LARGE_CQE = 1L << 0 -}; - enum { MLX4_FUNC_CAP_64B_EQE_CQE = 1L << 0, MLX4_FUNC_CAP_EQE_CQE_STRIDE = 1L << 1, diff --git a/include/uapi/rdma/mlx4-abi.h b/include/uapi/rdma/mlx4-abi.h index d84616adff32..be58594cec87 100644 --- a/include/uapi/rdma/mlx4-abi.h +++ b/include/uapi/rdma/mlx4-abi.h @@ -59,6 +59,10 @@ struct mlx4_ib_alloc_ucontext_resp_v3 { __u16 bf_regs_per_page; }; +enum { + MLX4_USER_DEV_CAP_LARGE_CQE = 1L << 0, +}; + struct mlx4_ib_alloc_ucontext_resp { __u32 dev_caps; __u32 qp_tab_size; @@ -162,6 +166,10 @@ struct mlx4_ib_rss_caps { __u8 reserved[7]; }; +enum query_device_resp_mask { + MLX4_IB_QUERY_DEV_RESP_MASK_CORE_CLOCK_OFFSET = 1UL << 0, +}; + struct mlx4_uverbs_ex_query_device_resp { __u32 comp_mask; __u32 response_length; From 9a657b4c4a9073037121331bb54663bf11f08342 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: 
Tue, 13 Mar 2018 22:01:32 -0600 Subject: [PATCH 103/199] RDMA/i40iw: Move uapi header to include/uapi All of these defines are part of the uABI for the driver, this header duplicates providers/i40iw/i40iw-abi.h in rdma-core. Acked-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- MAINTAINERS | 1 + drivers/infiniband/hw/i40iw/i40iw.h | 2 +- .../i40iw/i40iw_ucontext.h => include/uapi/rdma/i40iw-abi.h | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) rename drivers/infiniband/hw/i40iw/i40iw_ucontext.h => include/uapi/rdma/i40iw-abi.h (98%) diff --git a/MAINTAINERS b/MAINTAINERS index 3bdc260e36b7..556672eea6d5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7226,6 +7226,7 @@ M: Shiraz Saleem L: linux-rdma@vger.kernel.org S: Supported F: drivers/infiniband/hw/i40iw/ +F: include/uapi/rdma/i40iw-abi.h INTEL TELEMETRY DRIVER M: Souvik Kumar Chakravarty diff --git a/drivers/infiniband/hw/i40iw/i40iw.h b/drivers/infiniband/hw/i40iw/i40iw.h index a20650f060ce..50dc50e83918 100644 --- a/drivers/infiniband/hw/i40iw/i40iw.h +++ b/drivers/infiniband/hw/i40iw/i40iw.h @@ -60,7 +60,7 @@ #include #include "i40iw_type.h" #include "i40iw_p.h" -#include "i40iw_ucontext.h" +#include #include "i40iw_pble.h" #include "i40iw_verbs.h" #include "i40iw_cm.h" diff --git a/drivers/infiniband/hw/i40iw/i40iw_ucontext.h b/include/uapi/rdma/i40iw-abi.h similarity index 98% rename from drivers/infiniband/hw/i40iw/i40iw_ucontext.h rename to include/uapi/rdma/i40iw-abi.h index 57d3f1d11ff1..bfc3aaf2e56a 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_ucontext.h +++ b/include/uapi/rdma/i40iw-abi.h @@ -34,8 +34,8 @@ * */ -#ifndef I40IW_USER_CONTEXT_H -#define I40IW_USER_CONTEXT_H +#ifndef I40IW_ABI_H +#define I40IW_ABI_H #include From 633fb4d9fdaa613308c136293107f28e08e85d25 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 14 Mar 2018 14:39:42 -0600 Subject: [PATCH 104/199] RDMA/hns: Use structs to describe the uABI instead of opencoding Open coding a loose value is not acceptable for describing the uABI in RDMA. Provide the missing struct. 
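The shape of the conversion, sketched for a hypothetical driver "foo" (the real hunks below apply it to hns_roce):

    /* Before: an opencoded loose value; the uABI layout is implicit. */
    if (ib_copy_to_udata(udata, &pd->pdn, sizeof(u64)))
            goto err;

    /* After: the uapi header names the layout explicitly. */
    struct foo_alloc_pd_resp {
            __u32 pdn;
    };

    struct foo_alloc_pd_resp uresp = { .pdn = pd->pdn };
    if (ib_copy_to_udata(udata, &uresp, sizeof(uresp)))
            goto err;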
Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_pd.c | 5 ++++- include/uapi/rdma/hns-abi.h | 5 +++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_pd.c b/drivers/infiniband/hw/hns/hns_roce_pd.c index bdab2188c04a..4b41e041799c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_pd.c +++ b/drivers/infiniband/hw/hns/hns_roce_pd.c @@ -32,6 +32,7 @@ #include #include +#include #include "hns_roce_device.h" static int hns_roce_pd_alloc(struct hns_roce_dev *hr_dev, unsigned long *pdn) @@ -77,7 +78,9 @@ struct ib_pd *hns_roce_alloc_pd(struct ib_device *ib_dev, } if (context) { - if (ib_copy_to_udata(udata, &pd->pdn, sizeof(u64))) { + struct hns_roce_ib_alloc_pd_resp uresp = {.pdn = pd->pdn}; + + if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) { hns_roce_pd_free(to_hr_dev(ib_dev), pd->pdn); dev_err(dev, "[alloc_pd]ib_copy_to_udata failed!\n"); kfree(pd); diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h index f7af7e59a5e4..aa774985a0c7 100644 --- a/include/uapi/rdma/hns-abi.h +++ b/include/uapi/rdma/hns-abi.h @@ -63,4 +63,9 @@ struct hns_roce_ib_alloc_ucontext_resp { __u32 qp_tab_size; __u32 reserved; }; + +struct hns_roce_ib_alloc_pd_resp { + __u32 pdn; +}; + #endif /* HNS_ABI_USER_H */ From 7f86260b5f44d93ab20d3e9afda0e3f48d005ffe Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 14 Mar 2018 16:01:50 -0600 Subject: [PATCH 105/199] RDMA/cxgb4: Use structs to describe the uABI instead of opencoding Open coding a loose value is not acceptable for describing the uABI in RDMA. Provide the missing struct. Reviewed-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/provider.c | 4 +++- include/uapi/rdma/cxgb4-abi.h | 5 +++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 1b5c6cd2ac4d..42568a4df3f8 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -281,7 +281,9 @@ static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev, php->pdid = pdid; php->rhp = rhp; if (context) { - if (ib_copy_to_udata(udata, &php->pdid, sizeof(u32))) { + struct c4iw_alloc_pd_resp uresp = {.pdid = php->pdid}; + + if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) { c4iw_deallocate_pd(&php->ibpd); return ERR_PTR(-EFAULT); } diff --git a/include/uapi/rdma/cxgb4-abi.h b/include/uapi/rdma/cxgb4-abi.h index 05f71f1bc119..c398a1ee8d00 100644 --- a/include/uapi/rdma/cxgb4-abi.h +++ b/include/uapi/rdma/cxgb4-abi.h @@ -79,4 +79,9 @@ struct c4iw_alloc_ucontext_resp { __u32 status_page_size; __u32 reserved; /* explicit padding (optional for i386) */ }; + +struct c4iw_alloc_pd_resp { + __u32 pdid; +}; + #endif /* CXGB4_ABI_USER_H */ From a8b9234b1272de7a2af87e076fad51ba096a3c30 Mon Sep 17 00:00:00 2001 From: Henry Orosco Date: Wed, 14 Mar 2018 14:45:22 -0500 Subject: [PATCH 106/199] i40iw: Refactor of driver generated AEs The flush CQP OP can be used to optionally generate Asynchronous Events (AEs) in addition to QP flush. Consolidate all HW AE generation code under a new function, i40iw_gen_ae, which uses the flush CQP OP to generate only AEs.
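The caller-side pattern this produces, in simplified form (see the i40iw_utils.c hunk below):

    struct i40iw_gen_ae_info info = {
            .ae_code   = I40IW_AE_LLP_RECEIVED_MPA_CRC_ERROR,
            .ae_source = I40IW_AE_SOURCE_RQ,
    };

    /* Generate only the AE; no WQE flush is piggybacked on the CQP OP. */
    i40iw_gen_ae(iwdev, qp, &info, false);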
Signed-off-by: Henry Orosco Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw.h | 5 ++ drivers/infiniband/hw/i40iw/i40iw_ctrl.c | 56 +++++++++++++++++++++-- drivers/infiniband/hw/i40iw/i40iw_d.h | 5 +- drivers/infiniband/hw/i40iw/i40iw_hw.c | 33 +++++++++++++ drivers/infiniband/hw/i40iw/i40iw_type.h | 11 +++++ drivers/infiniband/hw/i40iw/i40iw_utils.c | 8 ++-- 6 files changed, 108 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw.h b/drivers/infiniband/hw/i40iw/i40iw.h index 50dc50e83918..45e97dd323ad 100644 --- a/drivers/infiniband/hw/i40iw/i40iw.h +++ b/drivers/infiniband/hw/i40iw/i40iw.h @@ -572,6 +572,11 @@ enum i40iw_status_code i40iw_hw_flush_wqes(struct i40iw_device *iwdev, struct i40iw_qp_flush_info *info, bool wait); +void i40iw_gen_ae(struct i40iw_device *iwdev, + struct i40iw_sc_qp *qp, + struct i40iw_gen_ae_info *info, + bool wait); + void i40iw_copy_ip_ntohl(u32 *dst, __be32 *src); struct ib_mr *i40iw_reg_phys_mr(struct ib_pd *ib_pd, u64 addr, diff --git a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c index c74fd3309b93..4d841a3c68f3 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c +++ b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c @@ -2614,10 +2614,8 @@ static enum i40iw_status_code i40iw_sc_qp_flush_wqes( qp->flush_sq |= flush_sq; qp->flush_rq |= flush_rq; - if (!flush_sq && !flush_rq) { - if (info->ae_code != I40IW_AE_LLP_RECEIVED_MPA_CRC_ERROR) - return 0; - } + if (!flush_sq && !flush_rq) + return 0; cqp = qp->pd->dev->cqp; wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); @@ -2658,6 +2656,49 @@ static enum i40iw_status_code i40iw_sc_qp_flush_wqes( return 0; } +/** + * i40iw_sc_gen_ae - generate AE, currently uses flush WQE CQP OP + * @qp: sc qp + * @info: gen ae information + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_gen_ae( + struct i40iw_sc_qp *qp, + struct i40iw_gen_ae_info *info, + u64 scratch, + bool post_sq) +{ + u64 temp; + u64 *wqe; + struct i40iw_sc_cqp *cqp; + u64 header; + + cqp = qp->pd->dev->cqp; + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + + temp = info->ae_code | + LS_64(info->ae_source, I40IW_CQPSQ_FWQE_AESOURCE); + + set_64bit_val(wqe, 8, temp); + + header = qp->qp_uk.qp_id | + LS_64(I40IW_CQP_OP_GEN_AE, I40IW_CQPSQ_OPCODE) | + LS_64(1, I40IW_CQPSQ_FWQE_GENERATE_AE) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "GEN_AE WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + /** * i40iw_sc_qp_upload_context - upload qp's context * @dev: sc device struct @@ -4148,6 +4189,13 @@ static enum i40iw_status_code i40iw_exec_cqp_cmd(struct i40iw_sc_dev *dev, pcmdinfo->in.u.qp_flush_wqes. 
scratch, pcmdinfo->post_sq); break; + case OP_GEN_AE: + status = i40iw_sc_gen_ae( + pcmdinfo->in.u.gen_ae.qp, + &pcmdinfo->in.u.gen_ae.info, + pcmdinfo->in.u.gen_ae.scratch, + pcmdinfo->post_sq); + break; case OP_ADD_ARP_CACHE_ENTRY: status = i40iw_sc_add_arp_cache_entry( pcmdinfo->in.u.add_arp_cache_entry.cqp, diff --git a/drivers/infiniband/hw/i40iw/i40iw_d.h b/drivers/infiniband/hw/i40iw/i40iw_d.h index 4b65e4140bd7..6ddaeec87d2f 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_d.h +++ b/drivers/infiniband/hw/i40iw/i40iw_d.h @@ -418,6 +418,8 @@ #define I40IW_CQP_OP_QUERY_FPM_VALUES 0x20 #define I40IW_CQP_OP_COMMIT_FPM_VALUES 0x21 #define I40IW_CQP_OP_FLUSH_WQES 0x22 +/* I40IW_CQP_OP_GEN_AE is the same value as I40IW_CQP_OP_FLUSH_WQES */ +#define I40IW_CQP_OP_GEN_AE 0x22 #define I40IW_CQP_OP_MANAGE_APBVT 0x23 #define I40IW_CQP_OP_NOP 0x24 #define I40IW_CQP_OP_MANAGE_QUAD_HASH_TABLE_ENTRY 0x25 @@ -1729,6 +1731,7 @@ enum i40iw_alignment { #define OP_COMMIT_FPM_VALUES 30 #define OP_REQUESTED_COMMANDS 31 #define OP_COMPLETED_COMMANDS 32 -#define OP_SIZE_CQP_STAT_ARRAY 33 +#define OP_GEN_AE 33 +#define OP_SIZE_CQP_STAT_ARRAY 34 #endif diff --git a/drivers/infiniband/hw/i40iw/i40iw_hw.c b/drivers/infiniband/hw/i40iw/i40iw_hw.c index 61540e14e4b9..d7af9a25bf1a 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_hw.c +++ b/drivers/infiniband/hw/i40iw/i40iw_hw.c @@ -667,6 +667,39 @@ enum i40iw_status_code i40iw_hw_flush_wqes(struct i40iw_device *iwdev, return 0; } +/** + * i40iw_gen_ae - generate AE + * @iwdev: iwarp device + * @qp: qp associated with AE + * @info: info for ae + * @wait: wait for completion + */ +void i40iw_gen_ae(struct i40iw_device *iwdev, + struct i40iw_sc_qp *qp, + struct i40iw_gen_ae_info *info, + bool wait) +{ + struct i40iw_gen_ae_info *ae_info; + struct i40iw_cqp_request *cqp_request; + struct cqp_commands_info *cqp_info; + + cqp_request = i40iw_get_cqp_request(&iwdev->cqp, wait); + if (!cqp_request) + return; + + cqp_info = &cqp_request->info; + ae_info = &cqp_request->info.in.u.gen_ae.info; + memcpy(ae_info, info, sizeof(*ae_info)); + + cqp_info->cqp_cmd = OP_GEN_AE; + cqp_info->post_sq = 1; + cqp_info->in.u.gen_ae.qp = qp; + cqp_info->in.u.gen_ae.scratch = (uintptr_t)cqp_request; + if (i40iw_handle_cqp_op(iwdev, cqp_request)) + i40iw_pr_err("CQP OP failed attempting to generate ae_code=0x%x\n", + info->ae_code); +} + /** * i40iw_hw_manage_vf_pble_bp - manage vf pbles * @iwdev: iwarp device diff --git a/drivers/infiniband/hw/i40iw/i40iw_type.h b/drivers/infiniband/hw/i40iw/i40iw_type.h index a27d392c92a2..adc8d2ec523d 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_type.h +++ b/drivers/infiniband/hw/i40iw/i40iw_type.h @@ -1004,6 +1004,11 @@ struct i40iw_cqp_query_fpm_values { u32 pbl_max; }; +struct i40iw_gen_ae_info { + u16 ae_code; + u8 ae_source; +}; + struct i40iw_cqp_ops { enum i40iw_status_code (*cqp_init)(struct i40iw_sc_cqp *, struct i40iw_cqp_init_info *); @@ -1290,6 +1295,12 @@ struct cqp_info { u64 scratch; } qp_flush_wqes; + struct { + struct i40iw_sc_qp *qp; + struct i40iw_gen_ae_info info; + u64 scratch; + } gen_ae; + struct { struct i40iw_sc_cqp *cqp; void *fpm_values_va; diff --git a/drivers/infiniband/hw/i40iw/i40iw_utils.c b/drivers/infiniband/hw/i40iw/i40iw_utils.c index 8cad4e8772bc..a9ea966877f2 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_utils.c +++ b/drivers/infiniband/hw/i40iw/i40iw_utils.c @@ -1284,15 +1284,13 @@ void i40iw_cqp_qp_destroy_cmd(struct i40iw_sc_dev *dev, struct i40iw_sc_qp *qp) */ void i40iw_ieq_mpa_crc_ae(struct i40iw_sc_dev *dev, 
struct i40iw_sc_qp *qp) { - struct i40iw_qp_flush_info info; + struct i40iw_gen_ae_info info; struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev; i40iw_debug(dev, I40IW_DEBUG_AEQ, "%s entered\n", __func__); - memset(&info, 0, sizeof(info)); info.ae_code = I40IW_AE_LLP_RECEIVED_MPA_CRC_ERROR; - info.generate_ae = true; - info.ae_source = 0x3; - (void)i40iw_hw_flush_wqes(iwdev, qp, &info, false); + info.ae_source = I40IW_AE_SOURCE_RQ; + i40iw_gen_ae(iwdev, qp, &info, false); } /** From 546b1452fdcccdcc98962b324cab6d74fc976fe9 Mon Sep 17 00:00:00 2001 From: Henry Orosco Date: Wed, 14 Mar 2018 14:45:23 -0500 Subject: [PATCH 107/199] i40iw: Tear-down connection after CQP Modify QP failure There is no explicit tear-down sequence initiated on connections if the Control QP OP, Modify QP to close, fails. Fix this by triggering a driver-generated Asynchronous Event (AE) on Modify QP failures and tearing down the connection on receipt of the AE. This fix can be generalized to other Modify QP failures (e.g. RTS->TERM, IDLE->RTS, etc.) as any modify failure will require a connection tear-down. Fixes: d37498417947 ("i40iw: add files for iwarp interface") Signed-off-by: Henry Orosco Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw.h | 1 + drivers/infiniband/hw/i40iw/i40iw_cm.c | 2 +- drivers/infiniband/hw/i40iw/i40iw_hw.c | 2 ++ drivers/infiniband/hw/i40iw/i40iw_verbs.c | 35 +++++++++++++++++------ 4 files changed, 30 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw.h b/drivers/infiniband/hw/i40iw/i40iw.h index 45e97dd323ad..d5d8c1be345a 100644 --- a/drivers/infiniband/hw/i40iw/i40iw.h +++ b/drivers/infiniband/hw/i40iw/i40iw.h @@ -559,6 +559,7 @@ void i40iw_next_iw_state(struct i40iw_qp *iwqp, u8 state, u8 del_hash, u8 term, u8 term_len); int i40iw_send_syn(struct i40iw_cm_node *cm_node, u32 sendack); +int i40iw_send_reset(struct i40iw_cm_node *cm_node); struct i40iw_cm_node *i40iw_find_node(struct i40iw_cm_core *cm_core, u16 rem_port, u32 *rem_addr, diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c index d4780d3887ca..4cfa8f4647e2 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c @@ -539,7 +539,7 @@ static struct i40iw_puda_buf *i40iw_form_cm_frame(struct i40iw_cm_node *cm_node, * i40iw_send_reset - Send RST packet * @cm_node: connection's node */ -static int i40iw_send_reset(struct i40iw_cm_node *cm_node) +int i40iw_send_reset(struct i40iw_cm_node *cm_node) { struct i40iw_puda_buf *sqbuf; int flags = SET_RST | SET_ACK; diff --git a/drivers/infiniband/hw/i40iw/i40iw_hw.c b/drivers/infiniband/hw/i40iw/i40iw_hw.c index d7af9a25bf1a..6139836fb533 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_hw.c +++ b/drivers/infiniband/hw/i40iw/i40iw_hw.c @@ -352,6 +352,8 @@ void i40iw_process_aeq(struct i40iw_device *iwdev) else i40iw_cm_disconn(iwqp); break; + case I40IW_AE_BAD_CLOSE: + /* fall through */ case I40IW_AE_RESET_SENT: i40iw_next_iw_state(iwqp, I40IW_QP_STATE_ERROR, 1, 0, 0); i40iw_cm_disconn(iwqp); break; diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index f3af952402e9..60e004d2100e 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -831,10 +831,10 @@ static int i40iw_query_qp(struct ib_qp *ibqp, void i40iw_hw_modify_qp(struct i40iw_device *iwdev, struct i40iw_qp *iwqp, struct i40iw_modify_qp_info *info, bool wait) { - enum
i40iw_status_code status; struct i40iw_cqp_request *cqp_request; struct cqp_commands_info *cqp_info; struct i40iw_modify_qp_info *m_info; + struct i40iw_gen_ae_info ae_info; cqp_request = i40iw_get_cqp_request(&iwdev->cqp, wait); if (!cqp_request) @@ -847,9 +847,25 @@ void i40iw_hw_modify_qp(struct i40iw_device *iwdev, struct i40iw_qp *iwqp, cqp_info->post_sq = 1; cqp_info->in.u.qp_modify.qp = &iwqp->sc_qp; cqp_info->in.u.qp_modify.scratch = (uintptr_t)cqp_request; - status = i40iw_handle_cqp_op(iwdev, cqp_request); - if (status) - i40iw_pr_err("CQP-OP Modify QP fail"); + if (!i40iw_handle_cqp_op(iwdev, cqp_request)) + return; + + switch (m_info->next_iwarp_state) { + case I40IW_QP_STATE_RTS: + if (iwqp->iwarp_state == I40IW_QP_STATE_IDLE) + i40iw_send_reset(iwqp->cm_node); + /* fall through */ + case I40IW_QP_STATE_IDLE: + case I40IW_QP_STATE_TERMINATE: + case I40IW_QP_STATE_CLOSING: + ae_info.ae_code = I40IW_AE_BAD_CLOSE; + ae_info.ae_source = 0; + i40iw_gen_ae(iwdev, &iwqp->sc_qp, &ae_info, false); + break; + case I40IW_QP_STATE_ERROR: + default: + break; + } } /** @@ -962,10 +978,6 @@ int i40iw_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, iwqp->ibqp_state = attr->qp_state; - if (issue_modify_qp) - iwqp->iwarp_state = info.next_iwarp_state; - else - info.next_iwarp_state = iwqp->iwarp_state; } if (attr_mask & IB_QP_ACCESS_FLAGS) { ctx_info->iwarp_info_valid = true; @@ -1003,9 +1015,14 @@ int i40iw_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, spin_unlock_irqrestore(&iwqp->lock, flags); - if (issue_modify_qp) + if (issue_modify_qp) { i40iw_hw_modify_qp(iwdev, iwqp, &info, true); + spin_lock_irqsave(&iwqp->lock, flags); + iwqp->iwarp_state = info.next_iwarp_state; + spin_unlock_irqrestore(&iwqp->lock, flags); + } + if (issue_modify_qp && (iwqp->ibqp_state > IB_QPS_RTS)) { if (dont_wait) { if (iwqp->cm_id && iwqp->hw_tcp_state) { From 9c71172c4a2f6695fdfb89780da160f579a002c2 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 15 Mar 2018 16:56:39 +0200 Subject: [PATCH 108/199] IB/mlx4: Report TSO capabilities Report the TSO device capabilities to the user area; they include the max_tso size and the QP types that support it. TSO is applicable only when one of the ports is Ethernet and the device supports it. uresp logic around rss_caps is updated to fix a till-now harmless bug in computing the length of the structure to copy: the code did not handle the implicit padding before rss_caps correctly. This is necessary to copy tso_caps successfully.
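The padding bug and its fix condense to the following sketch (the field_avail() macro is introduced by the patch below):

    #define field_avail(type, fld, sz) (offsetof(type, fld) + \
            sizeof(((type *)0)->fld) <= (sz))

    /* Broken: "response_length + sizeof(field)" ignores any implicit
     * padding inserted before the field.  Fixed: derive availability and
     * length from the field's real offset within the response struct.
     */
    if (field_avail(typeof(resp), tso_caps, uhw->outlen)) {
            /* fill resp.tso_caps ... */
            resp.response_length = offsetof(typeof(resp), tso_caps) +
                                   sizeof(resp.tso_caps);
    }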
Reviewed-by: Mark Bloch Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/main.c | 22 ++++++++++++++++++++-- include/uapi/rdma/mlx4-abi.h | 9 +++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index f57229b85536..88b0aef37bc4 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -429,6 +429,9 @@ int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev, return real_index; } +#define field_avail(type, fld, sz) (offsetof(type, fld) + \ + sizeof(((type *)0)->fld) <= (sz)) + static int mlx4_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props, struct ib_udata *uhw) @@ -587,8 +590,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, sizeof(struct mlx4_wqe_data_seg); } - if (uhw->outlen >= resp.response_length + sizeof(resp.rss_caps)) { - resp.response_length += sizeof(resp.rss_caps); + if (field_avail(typeof(resp), rss_caps, uhw->outlen)) { if (props->rss_caps.supported_qpts) { resp.rss_caps.rx_hash_function = MLX4_IB_RX_HASH_FUNC_TOEPLITZ; @@ -608,6 +610,22 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, resp.rss_caps.rx_hash_fields_mask |= MLX4_IB_RX_HASH_INNER; } + resp.response_length = offsetof(typeof(resp), rss_caps) + + sizeof(resp.rss_caps); + } + + if (field_avail(typeof(resp), tso_caps, uhw->outlen)) { + if (dev->dev->caps.max_gso_sz && + ((mlx4_ib_port_link_layer(ibdev, 1) == + IB_LINK_LAYER_ETHERNET) || + (mlx4_ib_port_link_layer(ibdev, 2) == + IB_LINK_LAYER_ETHERNET))) { + resp.tso_caps.max_tso = dev->dev->caps.max_gso_sz; + resp.tso_caps.supported_qpts |= + 1 << IB_QPT_RAW_PACKET; + } + resp.response_length = offsetof(typeof(resp), tso_caps) + + sizeof(resp.tso_caps); } if (uhw->outlen) { diff --git a/include/uapi/rdma/mlx4-abi.h b/include/uapi/rdma/mlx4-abi.h index be58594cec87..a448abd07052 100644 --- a/include/uapi/rdma/mlx4-abi.h +++ b/include/uapi/rdma/mlx4-abi.h @@ -170,12 +170,21 @@ enum query_device_resp_mask { MLX4_IB_QUERY_DEV_RESP_MASK_CORE_CLOCK_OFFSET = 1UL << 0, }; +struct mlx4_ib_tso_caps { + __u32 max_tso; /* Maximum tso payload size in bytes */ + /* Corresponding bit will be set if qp type from + * 'enum ib_qp_type' is supported. + */ + __u32 supported_qpts; +}; + struct mlx4_uverbs_ex_query_device_resp { __u32 comp_mask; __u32 response_length; __u64 hca_core_clock_offset; __u32 max_inl_recv_sz; struct mlx4_ib_rss_caps rss_caps; + struct mlx4_ib_tso_caps tso_caps; }; #endif /* MLX4_ABI_USER_H */ From 6d06c9aa3816c26144f54d18c5b63ba9ff026fb9 Mon Sep 17 00:00:00 2001 From: Guy Levi Date: Thu, 15 Mar 2018 16:56:40 +0200 Subject: [PATCH 109/199] IB/mlx4: Add Scatter FCS support over WQ creation By default, for Ethernet packets, the device scatters only the payload of ingress packets. The scatter FCS feature lets the user get the FCS (Ethernet's frame check sequence) in the received WR's buffer as a 4-byte trailer following the packet's payload.
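From the verbs API side the feature is requested per WQ; a minimal sketch (flag name per the patch below, other attributes elided):

    struct ib_wq_init_attr wq_attr = {
            /* ... wq_type, max_wr, max_sge, cq ... */
            .create_flags = IB_WQ_FLAGS_SCATTER_FCS,
    };

    /* Receive buffer layout with the flag set:
     *   [ Ethernet payload ........ ][ FCS, 4 bytes ]
     */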
Reviewed-by: Yishai Hadas Signed-off-by: Guy Levi Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/main.c | 21 +++++++++++++-------- drivers/infiniband/hw/mlx4/mlx4_ib.h | 1 + drivers/infiniband/hw/mlx4/qp.c | 19 ++++++++++++++++++- 3 files changed, 32 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 88b0aef37bc4..b9befda1eb27 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -559,14 +559,19 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->timestamp_mask = 0xFFFFFFFFFFFFULL; props->max_ah = INT_MAX; - if ((dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) && - (mlx4_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET || - mlx4_ib_port_link_layer(ibdev, 2) == IB_LINK_LAYER_ETHERNET)) { - props->rss_caps.max_rwq_indirection_tables = props->max_qp; - props->rss_caps.max_rwq_indirection_table_size = - dev->dev->caps.max_rss_tbl_sz; - props->rss_caps.supported_qpts = 1 << IB_QPT_RAW_PACKET; - props->max_wq_type_rq = props->max_qp; + if (mlx4_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET || + mlx4_ib_port_link_layer(ibdev, 2) == IB_LINK_LAYER_ETHERNET) { + if (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) { + props->rss_caps.max_rwq_indirection_tables = + props->max_qp; + props->rss_caps.max_rwq_indirection_table_size = + dev->dev->caps.max_rss_tbl_sz; + props->rss_caps.supported_qpts = 1 << IB_QPT_RAW_PACKET; + props->max_wq_type_rq = props->max_qp; + } + + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_FCS_KEEP) + props->raw_packet_caps |= IB_RAW_PACKET_CAP_SCATTER_FCS; } props->cq_caps.max_cq_moderation_count = MLX4_MAX_CQ_COUNT; diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 87c47b1dd870..7b1429917aba 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -189,6 +189,7 @@ enum mlx4_ib_qp_flags { MLX4_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO, MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, MLX4_IB_QP_NETIF = IB_QP_CREATE_NETIF_QP, + MLX4_IB_QP_SCATTER_FCS = IB_QP_CREATE_SCATTER_FCS, /* Mellanox specific flags start from IB_QP_CREATE_RESERVED_START */ MLX4_IB_ROCE_V2_GSI_QP = MLX4_IB_QP_CREATE_ROCE_V2_GSI, diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index f045491f2c14..04efc05fb531 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -1096,6 +1096,17 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, qp->inl_recv_sz = ucmd.qp.inl_recv_sz; } + if (init_attr->create_flags & IB_QP_CREATE_SCATTER_FCS) { + if (!(dev->dev->caps.flags & + MLX4_DEV_CAP_FLAG_FCS_KEEP)) { + pr_debug("scatter FCS is unsupported\n"); + err = -EOPNOTSUPP; + goto err; + } + + qp->flags |= MLX4_IB_QP_SCATTER_FCS; + } + err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, qp_has_rq(init_attr), qp, qp->inl_recv_sz); if (err) @@ -2234,6 +2245,9 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, if (qp->inl_recv_sz) context->param3 |= cpu_to_be32(1 << 25); + if (qp->flags & MLX4_IB_QP_SCATTER_FCS) + context->param3 |= cpu_to_be32(1 << 29); + if (qp_type == IB_QPT_GSI || qp_type == IB_QPT_SMI) context->mtu_msgmax = (IB_MTU_4096 << 5) | 11; else if (qp_type == IB_QPT_RAW_PACKET) @@ -4204,7 +4218,7 @@ struct ib_wq *mlx4_ib_create_wq(struct ib_pd *pd, return ERR_PTR(-EOPNOTSUPP); } - if (init_attr->create_flags) { + if 
(init_attr->create_flags & ~IB_WQ_FLAGS_SCATTER_FCS) { pr_debug("unsupported create_flags %u\n", init_attr->create_flags); return ERR_PTR(-EOPNOTSUPP); @@ -4225,6 +4239,9 @@ struct ib_wq *mlx4_ib_create_wq(struct ib_pd *pd, ib_qp_init_attr.recv_cq = init_attr->cq; ib_qp_init_attr.send_cq = ib_qp_init_attr.recv_cq; /* Dummy CQ */ + if (init_attr->create_flags & IB_WQ_FLAGS_SCATTER_FCS) + ib_qp_init_attr.create_flags |= IB_QP_CREATE_SCATTER_FCS; + err = create_qp_common(dev, pd, MLX4_IB_RWQ_SRC, &ib_qp_init_attr, udata, 0, &qp); if (err) { From 7d9a935e169b7e51a5f84caf8dfb02aad6206902 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 15 Mar 2018 11:10:42 +0200 Subject: [PATCH 110/199] RDMA/restrack: Don't rely on uninitialized variable in restrack_add flow The restrack code relies on the fact that object structures are zeroed at the allocation stage, the mlx4 CQ wasn't allocated with kzalloc and it caused to the following crash. [ 137.392209] general protection fault: 0000 [#1] SMP KASAN PTI [ 137.392972] CPU: 0 PID: 622 Comm: ibv_rc_pingpong Tainted: G W 4.16.0-rc1-00099-g00313983cda6 #11 [ 137.395079] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-2.fc27 04/01/2014 [ 137.396866] RIP: 0010:rdma_restrack_del+0xc8/0xf0 [ 137.397762] RSP: 0018:ffff8801b54e7968 EFLAGS: 00010206 [ 137.399008] RAX: 0000000000000000 RBX: ffff8801d8bcbae8 RCX: ffffffffb82314df [ 137.400055] RDX: dffffc0000000000 RSI: dffffc0000000000 RDI: 70696b533d454741 [ 137.401103] RBP: ffff8801d90c07a0 R08: ffff8801d8bcbb00 R09: 0000000000000000 [ 137.402470] R10: 0000000000000001 R11: ffffed0036a9cf52 R12: ffff8801d90c0ad0 [ 137.403318] R13: ffff8801d853fb20 R14: ffff8801d8bcbb28 R15: 0000000000000014 [ 137.404736] FS: 00007fb415d43740(0000) GS:ffff8801e5c00000(0000) knlGS:0000000000000000 [ 137.406074] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 137.407101] CR2: 00007fb41557df20 CR3: 00000001b580c001 CR4: 00000000003606b0 [ 137.408308] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 137.409352] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 137.410385] Call Trace: [ 137.411058] ib_destroy_cq+0x23/0x60 [ 137.411460] uverbs_free_cq+0x37/0xa0 [ 137.412040] remove_commit_idr_uobject+0x38/0xf0 [ 137.413042] _rdma_remove_commit_uobject+0x5c/0x160 [ 137.413782] ? lookup_get_idr_uobject+0x39/0x50 [ 137.414737] rdma_remove_commit_uobject+0x3b/0x70 [ 137.415742] ib_uverbs_destroy_cq+0x114/0x1d0 [ 137.416260] ? ib_uverbs_req_notify_cq+0x160/0x160 [ 137.417073] ? kernel_text_address+0x5c/0x90 [ 137.417805] ? __kernel_text_address+0xe/0x30 [ 137.418766] ? unwind_get_return_address+0x2f/0x50 [ 137.419558] ib_uverbs_write+0x453/0x6a0 [ 137.420220] ? show_ibdev+0x90/0x90 [ 137.420653] ? __kasan_slab_free+0x136/0x180 [ 137.421155] ? kmem_cache_free+0x78/0x1e0 [ 137.422192] ? remove_vma+0x83/0x90 [ 137.422614] ? do_munmap+0x447/0x6c0 [ 137.423045] ? vm_munmap+0xb0/0x100 [ 137.423481] ? SyS_munmap+0x1d/0x30 [ 137.424120] ? do_syscall_64+0xeb/0x250 [ 137.424984] ? entry_SYSCALL_64_after_hwframe+0x21/0x86 [ 137.425611] ? lru_add_drain_all+0x270/0x270 [ 137.426116] ? lru_add_drain_cpu+0xa3/0x170 [ 137.426616] ? lru_add_drain+0x11/0x20 [ 137.427058] ? free_pages_and_swap_cache+0xa6/0x120 [ 137.427672] ? tlb_flush_mmu_free+0x78/0x90 [ 137.428168] ? arch_tlb_finish_mmu+0x6d/0xb0 [ 137.428680] __vfs_write+0xc4/0x350 [ 137.430917] ? kernel_read+0xa0/0xa0 [ 137.432758] ? remove_vma+0x90/0x90 [ 137.434781] ? __kasan_slab_free+0x14b/0x180 [ 137.437486] ? 
remove_vma+0x83/0x90 [ 137.439836] ? kmem_cache_free+0x78/0x1e0 [ 137.442195] ? percpu_counter_add_batch+0x1d/0x90 [ 137.444389] vfs_write+0xf7/0x280 [ 137.446030] SyS_write+0xa1/0x120 [ 137.447867] ? SyS_read+0x120/0x120 [ 137.449670] ? mm_fault_error+0x180/0x180 [ 137.451539] ? _cond_resched+0x16/0x50 [ 137.453697] ? SyS_read+0x120/0x120 [ 137.455883] do_syscall_64+0xeb/0x250 [ 137.457686] entry_SYSCALL_64_after_hwframe+0x21/0x86 [ 137.459595] RIP: 0033:0x7fb415637b94 [ 137.461315] RSP: 002b:00007ffdebea7d88 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 137.463879] RAX: ffffffffffffffda RBX: 00005565022d1bd0 RCX: 00007fb415637b94 [ 137.466519] RDX: 0000000000000018 RSI: 00007ffdebea7da0 RDI: 0000000000000003 [ 137.469543] RBP: 00007ffdebea7d98 R08: 0000000000000000 R09: 00005565022d40c0 [ 137.472479] R10: 00000000000009cf R11: 0000000000000246 R12: 00005565022d2520 [ 137.475125] R13: 00000000000003e8 R14: 0000000000000000 R15: 00007ffdebea7fd0 [ 137.477760] Code: f7 e8 dd 0d 0b ff 48 c7 43 40 00 00 00 00 48 89 df e8 0d 0b 0b ff 48 8d 7b 28 c6 03 00 e8 41 0d 0b ff 48 8b 7b 28 48 85 ff 74 06 ff 4f 48 74 10 5b 48 89 ef 5d 41 5c 41 5d 41 5e e9 32 b0 ee [ 137.483375] RIP: rdma_restrack_del+0xc8/0xf0 RSP: ffff8801b54e7968 [ 137.486436] ---[ end trace 81835a1ea6722eed ]--- [ 137.488566] Kernel panic - not syncing: Fatal exception [ 137.491162] Kernel Offset: 0x36000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) Fixes: 00313983cda6 ("RDMA/nldev: provide detailed CM_ID information") Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/restrack.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index e1d9934d6e81..4cad0cd9aa0c 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -113,13 +113,15 @@ void rdma_restrack_add(struct rdma_restrack_entry *res) if (!dev) return; + if (res->type != RDMA_RESTRACK_CM_ID || !res_is_user(res)) + res->task = NULL; + if (res_is_user(res)) { if (!res->task) rdma_restrack_set_task(res, current); res->kern_name = NULL; } else { set_kern_name(res); - res->task = NULL; } kref_init(&res->kref); From 311d0da97480d19d4ecd57f3ee264e3c232d78e5 Mon Sep 17 00:00:00 2001 From: Honggang Li Date: Thu, 15 Mar 2018 17:02:13 +0800 Subject: [PATCH 111/199] IB/core: Set speed string to SDR for invalid active rates Before commit f1b65df5a232 ("IB/mlx5: Add support for active_width and active_speed in RoCE"), the mlx5_ib driver set default active_width and active_speed to IB_WIDTH_4X and IB_SPEED_QDR. Now, the active_width and active_speed are zeros if the RoCE port is in DOWN state. The speed string should be set to " SDR" instead of a blank string when active_speed is zero. 
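As an illustrative aside, here is a minimal, self-contained user-space sketch of the mapping that rate_show() performs. The helper name, the reduced case list and the hard-coded 4X width are simplifying assumptions for the example; only the SDR fallback mirrors the actual one-line sysfs fix below.

  #include <stdio.h>

  /*
   * Sketch only, not the kernel source. Enum values follow IB_SPEED_*
   * (SDR=1, DDR=2, QDR=4); the rate is per lane in units of 0.1 Gb/s.
   * The point of the fix: a zero/invalid active_speed now yields " SDR"
   * instead of leaving the speed string empty.
   */
  static const char *speed_str(int active_speed, int *rate)
  {
          switch (active_speed) {
          case 2: *rate = 50;  return " DDR";
          case 4: *rate = 100; return " QDR";
          case 1: /* IB_SPEED_SDR */
          default: /* default to SDR for invalid rates */
                  *rate = 25;  return " SDR";
          }
  }

  int main(void)
  {
          int rate, width = 4;                 /* pretend active_width is 4X */
          const char *s = speed_str(0, &rate); /* 0: e.g. a DOWN RoCE port */

          rate = rate * width / 10;
          printf("%d%s Gb/sec\n", rate, s);    /* prints "10 SDR Gb/sec" */
          return 0;
  }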
Signed-off-by: Honggang Li
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/core/sysfs.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index 8ae1308eecc7..cf36ff1f0068 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -273,6 +273,7 @@ static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused,
 		break;
 	case IB_SPEED_SDR:
 	default:		/* default to SDR for invalid rates */
+		speed = " SDR";
 		rate = 25;
 		break;
 	}

From 7672ed33c4c15dbe9d56880683baaba4227cf940 Mon Sep 17 00:00:00 2001
From: Honggang Li
Date: Fri, 16 Mar 2018 10:37:13 +0800
Subject: [PATCH 112/199] IB/mlx5: Set the default active rate and width to
 QDR and 4X

Before commit f1b65df5a232 ("IB/mlx5: Add support for active_width and
active_speed in RoCE"), the mlx5_ib driver set the default active_width
and active_speed to IB_WIDTH_4X and IB_SPEED_QDR.

When the RoCE port is down, the RoCE port does not negotiate the active
width with the remote side, causing the active width to be zero. When
running userspace ibstat to view the port status, ibstat will panic as
it reads an invalid width from the sysfs file.

This patch restores the original behavior.

Fixes: f1b65df5a232 ("IB/mlx5: Add support for active_width and active_speed in RoCE")
Signed-off-by: Honggang Li
Reviewed-by: Hal Rosenstock
Reviewed-by: Noa Osherovich
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/mlx5/main.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index a5a3f0b25608..3408bede0ee5 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -405,6 +405,9 @@ static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
 	if (err)
 		goto out;
 
+	props->active_width = IB_WIDTH_4X;
+	props->active_speed = IB_SPEED_QDR;
+
 	translate_eth_proto_oper(eth_prot_oper, &props->active_speed,
 				 &props->active_width);

From 958d2c1ba37680b765a089dc374cc199fb61619b Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe
Date: Thu, 15 Mar 2018 21:18:14 -0600
Subject: [PATCH 113/199] RDMA/bnxt: Fix structure layout for bnxt_re_pd_resp

What is going on here is a bit subtle: in the kernel there is no
problem, because the struct is copied using copy_from_user, so it can
safely have an 8 byte alignment. In userspace, however, it must be
constructed by concatenation with the ib_uverbs_alloc_pd_resp struct.
This is due to the required memory layout to execute the command.

Since ib_uverbs_alloc_pd_resp is only 4 bytes long, this causes
misalignment, and user space will experience unexpected padding.
Currently it works around this via pointer maths.

Make everything more robust by having the compiler reduce the alignment
of the struct to 4. The userspace has assertions to ensure this works
properly in all situations.

Signed-off-by: Jason Gunthorpe
---
 include/uapi/rdma/bnxt_re-abi.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/uapi/rdma/bnxt_re-abi.h b/include/uapi/rdma/bnxt_re-abi.h
index db54115be044..2d3c9aac661a 100644
--- a/include/uapi/rdma/bnxt_re-abi.h
+++ b/include/uapi/rdma/bnxt_re-abi.h
@@ -53,11 +53,16 @@ struct bnxt_re_uctx_resp {
 	__u32 rsvd;
 };
 
+/*
+ * This struct is placed after the ib_uverbs_alloc_pd_resp struct, which is
+ * not 8 byte aligned. To avoid undesired padding in various cases we have to
+ * set this struct to packed.
+ */
 struct bnxt_re_pd_resp {
 	__u32 pdid;
 	__u32 dpi;
 	__u64 dbr;
-};
+} __attribute__((packed, aligned(4)));
 
 struct bnxt_re_cq_req {
 	__u64 cq_va;

From 6d5b2047fe62ed83a90f0ecdf1cc9b4ae6fcc974 Mon Sep 17 00:00:00 2001
From: Parav Pandit
Date: Mon, 19 Mar 2018 07:59:59 +0200
Subject: [PATCH 114/199] IB/core: Use rdma_is_port_valid()

Use rdma_is_port_valid(), which performs the port validity check,
instead of open-coding the same check.

Signed-off-by: Parav Pandit
Signed-off-by: Leon Romanovsky
Reviewed-by: Yuval Shaia
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/core/cache.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 31def0f2ac49..5b9416af825b 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -937,8 +937,7 @@ int ib_get_cached_subnet_prefix(struct ib_device *device,
 	unsigned long flags;
 	int p;
 
-	if (port_num < rdma_start_port(device) ||
-	    port_num > rdma_end_port(device))
+	if (!rdma_is_port_valid(device, port_num))
 		return -EINVAL;
 
 	p = port_num - rdma_start_port(device);
@@ -1048,7 +1047,7 @@ int ib_get_cached_port_state(struct ib_device *device,
 	unsigned long flags;
 	int ret = 0;
 
-	if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
+	if (!rdma_is_port_valid(device, port_num))
 		return -EINVAL;
 
 	read_lock_irqsave(&device->cache.lock, flags);

From b19744e965abed7ad0167c25097f405b88ce5d13 Mon Sep 17 00:00:00 2001
From: Parav Pandit
Date: Mon, 19 Mar 2018 08:07:14 +0200
Subject: [PATCH 115/199] IB/core: Remove unimplemented ib_peek_cq

The ib_peek_cq() verb doesn't seem to be implemented in the current
code. There is a past reference at [1] to it being unimplemented. A lot
of user documentation created out of kdoc refers to this unimplemented
API. Therefore, remove the unimplemented API.

[1] http://lists.openfabrics.org/pipermail/ofw/2008-May/002465.html

Signed-off-by: Parav Pandit
Signed-off-by: Leon Romanovsky
Signed-off-by: Jason Gunthorpe
---
 include/rdma/ib_verbs.h | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index ac3791e056cf..3cc48f34e3e4 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -3225,18 +3225,6 @@ static inline int ib_poll_cq(struct ib_cq *cq, int num_entries,
 	return cq->device->poll_cq(cq, num_entries, wc);
 }
 
-/**
- * ib_peek_cq - Returns the number of unreaped completions currently
- * on the specified CQ.
- * @cq: The CQ to peek.
- * @wc_cnt: A minimum number of unreaped completions to check for.
- *
- * If the number of unreaped completions is greater than or equal to wc_cnt,
- * this function returns wc_cnt, otherwise, it returns the actual number of
- * unreaped completions.
- */
-int ib_peek_cq(struct ib_cq *cq, int wc_cnt);
-
 /**
  * ib_req_notify_cq - Request completion notification on a CQ.
  * @cq: The CQ to generate an event for.

From df7e40425813c50cd252e6f5e348a81ef1acae56 Mon Sep 17 00:00:00 2001
From: Yixian Liu
Date: Mon, 19 Mar 2018 21:36:07 +0800
Subject: [PATCH 116/199] RDMA/hns: Fix init resp when alloc ucontext

The data in resp will be copied from kernel to userspace, thus it needs
to be initialized to zeros to avoid copying uninitialized stack memory.
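To make the bug class concrete, a small self-contained sketch follows; the struct and field names are illustrative stand-ins, not the hns_roce definitions. Every byte of resp reaches user space, so any field the handler never writes would otherwise leak whatever was on the kernel stack.

  #include <stdio.h>
  #include <string.h>

  struct demo_resp {
          unsigned int qp_tab_size;
          unsigned int reserved;  /* the handler never writes this field */
  };

  /* Stand-in for ib_copy_to_udata(): every byte of *resp is exposed. */
  static void copy_to_user_standin(const struct demo_resp *resp)
  {
          printf("qp_tab_size=%u reserved=%u\n", resp->qp_tab_size,
                 resp->reserved);
  }

  int main(void)
  {
          struct demo_resp resp = {};     /* the fix: zero-init, like "= {}" */

          resp.qp_tab_size = 128;         /* only this field is filled in */
          copy_to_user_standin(&resp);    /* reserved is now provably 0 */
          return 0;
  }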
Reported-by: Dan Carpenter
Fixes: e088a685eae9 ("RDMA/hns: Support rq record doorbell for the user space")
Signed-off-by: Yixian Liu
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/hns/hns_roce_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index e1ee6666f790..6e48b1f507cf 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -338,7 +338,7 @@ static struct ib_ucontext *hns_roce_alloc_ucontext(struct ib_device *ib_dev,
 {
 	int ret = 0;
 	struct hns_roce_ucontext *context;
-	struct hns_roce_ib_alloc_ucontext_resp resp;
+	struct hns_roce_ib_alloc_ucontext_resp resp = {};
 	struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev);
 
 	resp.qp_tab_size = hr_dev->caps.num_qps;

From 05d3ac978ed25b753bfe34fe76c50c31ee506a82 Mon Sep 17 00:00:00 2001
From: Bodong Wang
Date: Mon, 19 Mar 2018 15:10:29 +0200
Subject: [PATCH 117/199] net/mlx5: Packet pacing enhancement

Add two new parameters, max_burst_sz and typical_pkt_size (both in
bytes), to rate limit configurations.

max_burst_sz: The device will schedule bursts of packets for an SQ
connected to this rate, smaller than or equal to this value. Value 0x0
indicates packet bursts will be limited to the device defaults. This
field should be used if bursts of packets must be strictly kept under a
certain value.

typical_pkt_size: When the rate limit is intended for a stream of
similar packets, stating the typical packet size can improve the
accuracy of the rate limiter. The expected packet size will be the same
for all SQs associated with the same rate limit index.

The Ethernet driver is updated according to this change, but these two
parameters will be kept as 0 because there is currently no proper way
to get the configuration from user space; that would require changing
the ndo_set_tx_maxrate interface.
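For reference, a short hedged sketch of how a kernel-side consumer drives the reworked API; the signatures match the ones added in this patch, while the demo function name and the rate value are arbitrary examples, and error handling is trimmed.

  #include <linux/mlx5/driver.h>

  /* Sketch only: exercises the new struct mlx5_rate_limit based API.
   * Rates are in Kbps; leaving max_burst_sz/typical_pkt_sz at 0 keeps
   * the pre-patch device-default behavior.
   */
  static int demo_rate_limit(struct mlx5_core_dev *mdev)
  {
          struct mlx5_rate_limit rl = {
                  .rate = 100000,         /* 100 Mbps, arbitrary example */
          };
          u16 index;
          int err;

          err = mlx5_rl_add_rate(mdev, &index, &rl); /* index precedes rl now */
          if (err)
                  return err;

          pr_debug("allocated rate limit index %u\n", index);
          /* ... program "index" into an SQ context here ... */

          mlx5_rl_remove_rate(mdev, &rl); /* refcounted; freed when it hits 0 */
          return 0;
  }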
Signed-off-by: Bodong Wang Reviewed-by: Daniel Jurgens Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- .../net/ethernet/mellanox/mlx5/core/en_main.c | 19 ++++-- drivers/net/ethernet/mellanox/mlx5/core/rl.c | 63 ++++++++++++------- include/linux/mlx5/driver.h | 15 ++++- include/linux/mlx5/mlx5_ifc.h | 12 +++- 4 files changed, 76 insertions(+), 33 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 47bab842c5ee..2ee4ffbddd5f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1195,10 +1195,13 @@ static void mlx5e_close_txqsq(struct mlx5e_txqsq *sq) { struct mlx5e_channel *c = sq->channel; struct mlx5_core_dev *mdev = c->mdev; + struct mlx5_rate_limit rl = {0}; mlx5e_destroy_sq(mdev, sq->sqn); - if (sq->rate_limit) - mlx5_rl_remove_rate(mdev, sq->rate_limit); + if (sq->rate_limit) { + rl.rate = sq->rate_limit; + mlx5_rl_remove_rate(mdev, &rl); + } mlx5e_free_txqsq_descs(sq); mlx5e_free_txqsq(sq); } @@ -1528,6 +1531,7 @@ static int mlx5e_set_sq_maxrate(struct net_device *dev, struct mlx5e_priv *priv = netdev_priv(dev); struct mlx5_core_dev *mdev = priv->mdev; struct mlx5e_modify_sq_param msp = {0}; + struct mlx5_rate_limit rl = {0}; u16 rl_index = 0; int err; @@ -1535,14 +1539,17 @@ static int mlx5e_set_sq_maxrate(struct net_device *dev, /* nothing to do */ return 0; - if (sq->rate_limit) + if (sq->rate_limit) { + rl.rate = sq->rate_limit; /* remove current rl index to free space to next ones */ - mlx5_rl_remove_rate(mdev, sq->rate_limit); + mlx5_rl_remove_rate(mdev, &rl); + } sq->rate_limit = 0; if (rate) { - err = mlx5_rl_add_rate(mdev, rate, &rl_index); + rl.rate = rate; + err = mlx5_rl_add_rate(mdev, &rl_index, &rl); if (err) { netdev_err(dev, "Failed configuring rate %u: %d\n", rate, err); @@ -1560,7 +1567,7 @@ static int mlx5e_set_sq_maxrate(struct net_device *dev, rate, err); /* remove the rate from the table */ if (rate) - mlx5_rl_remove_rate(mdev, rate); + mlx5_rl_remove_rate(mdev, &rl); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rl.c b/drivers/net/ethernet/mellanox/mlx5/core/rl.c index d3c33e9eea72..bc86dffdc43c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/rl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/rl.c @@ -107,16 +107,16 @@ int mlx5_destroy_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy, * If the table is full, return NULL */ static struct mlx5_rl_entry *find_rl_entry(struct mlx5_rl_table *table, - u32 rate) + struct mlx5_rate_limit *rl) { struct mlx5_rl_entry *ret_entry = NULL; bool empty_found = false; int i; for (i = 0; i < table->max_size; i++) { - if (table->rl_entry[i].rate == rate) + if (mlx5_rl_are_equal(&table->rl_entry[i].rl, rl)) return &table->rl_entry[i]; - if (!empty_found && !table->rl_entry[i].rate) { + if (!empty_found && !table->rl_entry[i].rl.rate) { empty_found = true; ret_entry = &table->rl_entry[i]; } @@ -126,7 +126,8 @@ static struct mlx5_rl_entry *find_rl_entry(struct mlx5_rl_table *table, } static int mlx5_set_pp_rate_limit_cmd(struct mlx5_core_dev *dev, - u32 rate, u16 index) + u16 index, + struct mlx5_rate_limit *rl) { u32 in[MLX5_ST_SZ_DW(set_pp_rate_limit_in)] = {0}; u32 out[MLX5_ST_SZ_DW(set_pp_rate_limit_out)] = {0}; @@ -134,7 +135,9 @@ static int mlx5_set_pp_rate_limit_cmd(struct mlx5_core_dev *dev, MLX5_SET(set_pp_rate_limit_in, in, opcode, MLX5_CMD_OP_SET_PP_RATE_LIMIT); MLX5_SET(set_pp_rate_limit_in, 
in, rate_limit_index, index); - MLX5_SET(set_pp_rate_limit_in, in, rate_limit, rate); + MLX5_SET(set_pp_rate_limit_in, in, rate_limit, rl->rate); + MLX5_SET(set_pp_rate_limit_in, in, burst_upper_bound, rl->max_burst_sz); + MLX5_SET(set_pp_rate_limit_in, in, typical_packet_size, rl->typical_pkt_sz); return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } @@ -146,7 +149,17 @@ bool mlx5_rl_is_in_range(struct mlx5_core_dev *dev, u32 rate) } EXPORT_SYMBOL(mlx5_rl_is_in_range); -int mlx5_rl_add_rate(struct mlx5_core_dev *dev, u32 rate, u16 *index) +bool mlx5_rl_are_equal(struct mlx5_rate_limit *rl_0, + struct mlx5_rate_limit *rl_1) +{ + return ((rl_0->rate == rl_1->rate) && + (rl_0->max_burst_sz == rl_1->max_burst_sz) && + (rl_0->typical_pkt_sz == rl_1->typical_pkt_sz)); +} +EXPORT_SYMBOL(mlx5_rl_are_equal); + +int mlx5_rl_add_rate(struct mlx5_core_dev *dev, u16 *index, + struct mlx5_rate_limit *rl) { struct mlx5_rl_table *table = &dev->priv.rl_table; struct mlx5_rl_entry *entry; @@ -154,14 +167,14 @@ int mlx5_rl_add_rate(struct mlx5_core_dev *dev, u32 rate, u16 *index) mutex_lock(&table->rl_lock); - if (!rate || !mlx5_rl_is_in_range(dev, rate)) { + if (!rl->rate || !mlx5_rl_is_in_range(dev, rl->rate)) { mlx5_core_err(dev, "Invalid rate: %u, should be %u to %u\n", - rate, table->min_rate, table->max_rate); + rl->rate, table->min_rate, table->max_rate); err = -EINVAL; goto out; } - entry = find_rl_entry(table, rate); + entry = find_rl_entry(table, rl); if (!entry) { mlx5_core_err(dev, "Max number of %u rates reached\n", table->max_size); @@ -173,13 +186,15 @@ int mlx5_rl_add_rate(struct mlx5_core_dev *dev, u32 rate, u16 *index) entry->refcount++; } else { /* new rate limit */ - err = mlx5_set_pp_rate_limit_cmd(dev, rate, entry->index); + err = mlx5_set_pp_rate_limit_cmd(dev, entry->index, rl); if (err) { - mlx5_core_err(dev, "Failed configuring rate: %u (%d)\n", - rate, err); + mlx5_core_err(dev, "Failed configuring rate limit(err %d): \ + rate %u, max_burst_sz %u, typical_pkt_sz %u\n", + err, rl->rate, rl->max_burst_sz, + rl->typical_pkt_sz); goto out; } - entry->rate = rate; + entry->rl = *rl; entry->refcount = 1; } *index = entry->index; @@ -190,27 +205,30 @@ out: } EXPORT_SYMBOL(mlx5_rl_add_rate); -void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, u32 rate) +void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, struct mlx5_rate_limit *rl) { struct mlx5_rl_table *table = &dev->priv.rl_table; struct mlx5_rl_entry *entry = NULL; + struct mlx5_rate_limit reset_rl = {0}; /* 0 is a reserved value for unlimited rate */ - if (rate == 0) + if (rl->rate == 0) return; mutex_lock(&table->rl_lock); - entry = find_rl_entry(table, rate); + entry = find_rl_entry(table, rl); if (!entry || !entry->refcount) { - mlx5_core_warn(dev, "Rate %u is not configured\n", rate); + mlx5_core_warn(dev, "Rate %u, max_burst_sz %u typical_pkt_sz %u \ + are not configured\n", + rl->rate, rl->max_burst_sz, rl->typical_pkt_sz); goto out; } entry->refcount--; if (!entry->refcount) { /* need to remove rate */ - mlx5_set_pp_rate_limit_cmd(dev, 0, entry->index); - entry->rate = 0; + mlx5_set_pp_rate_limit_cmd(dev, entry->index, &reset_rl); + entry->rl = reset_rl; } out: @@ -257,13 +275,14 @@ int mlx5_init_rl_table(struct mlx5_core_dev *dev) void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev) { struct mlx5_rl_table *table = &dev->priv.rl_table; + struct mlx5_rate_limit rl = {0}; int i; /* Clear all configured rates */ for (i = 0; i < table->max_size; i++) - if (table->rl_entry[i].rate) - mlx5_set_pp_rate_limit_cmd(dev, 0, - 
table->rl_entry[i].index);
+		if (table->rl_entry[i].rl.rate)
+			mlx5_set_pp_rate_limit_cmd(dev, table->rl_entry[i].index,
+						   &rl);
 
 	kfree(dev->priv.rl_table.rl_entry);
 }

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index cded85ab6fe4..767d193c269a 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -591,8 +591,14 @@ struct mlx5_eswitch;
 struct mlx5_lag;
 struct mlx5_pagefault;
 
+struct mlx5_rate_limit {
+	u32 rate;
+	u32 max_burst_sz;
+	u16 typical_pkt_sz;
+};
+
 struct mlx5_rl_entry {
-	u32 rate;
+	struct mlx5_rate_limit rl;
 	u16 index;
 	u16 refcount;
 };
@@ -1107,9 +1113,12 @@ int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 token,
 
 int mlx5_init_rl_table(struct mlx5_core_dev *dev);
 void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev);
-int mlx5_rl_add_rate(struct mlx5_core_dev *dev, u32 rate, u16 *index);
-void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, u32 rate);
+int mlx5_rl_add_rate(struct mlx5_core_dev *dev, u16 *index,
+		     struct mlx5_rate_limit *rl);
+void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, struct mlx5_rate_limit *rl);
 bool mlx5_rl_is_in_range(struct mlx5_core_dev *dev, u32 rate);
+bool mlx5_rl_are_equal(struct mlx5_rate_limit *rl_0,
+		       struct mlx5_rate_limit *rl_1);
 int mlx5_alloc_bfreg(struct mlx5_core_dev *mdev, struct mlx5_sq_bfreg *bfreg,
 		     bool map_wc, bool fast_path);
 void mlx5_free_bfreg(struct mlx5_core_dev *mdev, struct mlx5_sq_bfreg *bfreg);

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 14ad84afe8ba..c63bbdc35503 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -571,7 +571,10 @@ struct mlx5_ifc_qos_cap_bits {
 	u8         esw_scheduling[0x1];
 	u8         esw_bw_share[0x1];
 	u8         esw_rate_limit[0x1];
-	u8         reserved_at_4[0x1c];
+	u8         reserved_at_4[0x1];
+	u8         packet_pacing_burst_bound[0x1];
+	u8         packet_pacing_typical_size[0x1];
+	u8         reserved_at_7[0x19];
 
 	u8         reserved_at_20[0x20];
 
@@ -7313,7 +7316,12 @@ struct mlx5_ifc_set_pp_rate_limit_in_bits {
 
 	u8         rate_limit[0x20];
 
-	u8         reserved_at_a0[0x160];
+	u8         burst_upper_bound[0x20];
+
+	u8         reserved_at_c0[0x10];
+	u8         typical_packet_size[0x10];
+
+	u8         reserved_at_e0[0x120];
 };
 
 struct mlx5_ifc_access_register_out_bits {

From 61147f391a8b3bdde4c0a631dd132d85d00b90a0 Mon Sep 17 00:00:00 2001
From: Bodong Wang
Date: Mon, 19 Mar 2018 15:10:30 +0200
Subject: [PATCH 118/199] IB/mlx5: Packet packing enhancement for RAW QP

Enable RAW QP to be able to configure burst control through modify_qp.
By using burst control together with rate limiting, users can achieve
the best performance and accuracy. The burst control information is
passed by the user through udata.

This patch also reports the burst control capability for mlx5 hardware;
burst control is only marked as supported when both
packet_pacing_burst_bound and packet_pacing_typical_size are supported.
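As a hedged sketch of the user-space side, the snippet below mirrors the new ABI structs added further down in this patch; local stand-in definitions are used so it stays self-contained, and the byte values are arbitrary examples. The kernel accepts the burst fields only together with a nonzero IB_QP_RATE_LIMIT value and the matching packet_pacing_* capability bits.

  #include <stdio.h>
  #include <stdint.h>
  #include <string.h>

  /* Stand-ins mirroring mlx5_ib_burst_info / mlx5_ib_modify_qp. */
  struct demo_burst_info {
          uint32_t max_burst_sz;
          uint16_t typical_pkt_sz;
          uint16_t reserved;
  };

  struct demo_modify_qp {
          uint32_t comp_mask;
          struct demo_burst_info burst_info;
          uint32_t reserved;
  };

  static void fill_modify_qp_udata(struct demo_modify_qp *cmd)
  {
          memset(cmd, 0, sizeof(*cmd));           /* reserved fields must be 0 */
          cmd->burst_info.max_burst_sz = 4096;    /* bytes, arbitrary example */
          cmd->burst_info.typical_pkt_sz = 1500;  /* bytes, arbitrary example */
  }

  int main(void)
  {
          struct demo_modify_qp cmd;

          fill_modify_qp_udata(&cmd);
          printf("udata size %zu bytes\n", sizeof(cmd)); /* expect 16 */
          return 0;
  }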
Signed-off-by: Bodong Wang Reviewed-by: Daniel Jurgens Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 4 ++ drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 +- drivers/infiniband/hw/mlx5/qp.c | 94 ++++++++++++++++++++++------ include/uapi/rdma/mlx5-abi.h | 19 +++++- 4 files changed, 98 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 3408bede0ee5..d06aae9aa600 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -989,6 +989,10 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, MLX5_CAP_QOS(mdev, packet_pacing_min_rate); resp.packet_pacing_caps.supported_qpts |= 1 << IB_QPT_RAW_PACKET; + if (MLX5_CAP_QOS(mdev, packet_pacing_burst_bound) && + MLX5_CAP_QOS(mdev, packet_pacing_typical_size)) + resp.packet_pacing_caps.cap_flags |= + MLX5_IB_PP_SUPPORT_BURST; } resp.response_length += sizeof(resp.packet_pacing_caps); } diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index f9ba1ea94f0f..aeea74357cbe 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -403,7 +403,7 @@ struct mlx5_ib_qp { struct list_head qps_list; struct list_head cq_recv_list; struct list_head cq_send_list; - u32 rate_limit; + struct mlx5_rate_limit rl; u32 underlay_qpn; bool tunnel_offload_en; /* storage for qp sub type when core qp type is IB_QPT_DRIVER */ diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 6c7b4c2bfaa4..2fb3d9a400d3 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -86,7 +86,9 @@ struct mlx5_modify_raw_qp_param { u16 operation; u32 set_mask; /* raw_qp_set_mask_map */ - u32 rate_limit; + + struct mlx5_rate_limit rl; + u8 rq_q_ctr_id; }; @@ -2774,8 +2776,9 @@ static int modify_raw_packet_qp_sq(struct mlx5_core_dev *dev, const struct mlx5_modify_raw_qp_param *raw_qp_param) { struct mlx5_ib_qp *ibqp = sq->base.container_mibqp; - u32 old_rate = ibqp->rate_limit; - u32 new_rate = old_rate; + struct mlx5_rate_limit old_rl = ibqp->rl; + struct mlx5_rate_limit new_rl = old_rl; + bool new_rate_added = false; u16 rl_index = 0; void *in; void *sqc; @@ -2797,39 +2800,43 @@ static int modify_raw_packet_qp_sq(struct mlx5_core_dev *dev, pr_warn("%s: Rate limit can only be changed when SQ is moving to RDY\n", __func__); else - new_rate = raw_qp_param->rate_limit; + new_rl = raw_qp_param->rl; } - if (old_rate != new_rate) { - if (new_rate) { - err = mlx5_rl_add_rate(dev, new_rate, &rl_index); + if (!mlx5_rl_are_equal(&old_rl, &new_rl)) { + if (new_rl.rate) { + err = mlx5_rl_add_rate(dev, &rl_index, &new_rl); if (err) { - pr_err("Failed configuring rate %u: %d\n", - new_rate, err); + pr_err("Failed configuring rate limit(err %d): \ + rate %u, max_burst_sz %u, typical_pkt_sz %u\n", + err, new_rl.rate, new_rl.max_burst_sz, + new_rl.typical_pkt_sz); + goto out; } + new_rate_added = true; } MLX5_SET64(modify_sq_in, in, modify_bitmask, 1); + /* index 0 means no limit */ MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, rl_index); } err = mlx5_core_modify_sq(dev, sq->base.mqp.qpn, in, inlen); if (err) { /* Remove new rate from table if failed */ - if (new_rate && - old_rate != new_rate) - mlx5_rl_remove_rate(dev, new_rate); + if (new_rate_added) + mlx5_rl_remove_rate(dev, &new_rl); goto out; } /* Only remove the old rate after new rate was set */ - if ((old_rate && - (old_rate != new_rate)) || + 
if ((old_rl.rate && + !mlx5_rl_are_equal(&old_rl, &new_rl)) || (new_state != MLX5_SQC_STATE_RDY)) - mlx5_rl_remove_rate(dev, old_rate); + mlx5_rl_remove_rate(dev, &old_rl); - ibqp->rate_limit = new_rate; + ibqp->rl = new_rl; sq->state = new_state; out: @@ -2906,7 +2913,8 @@ static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, - enum ib_qp_state cur_state, enum ib_qp_state new_state) + enum ib_qp_state cur_state, enum ib_qp_state new_state, + const struct mlx5_ib_modify_qp *ucmd) { static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = { [MLX5_QP_STATE_RST] = { @@ -3144,7 +3152,30 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, } if (attr_mask & IB_QP_RATE_LIMIT) { - raw_qp_param.rate_limit = attr->rate_limit; + raw_qp_param.rl.rate = attr->rate_limit; + + if (ucmd->burst_info.max_burst_sz) { + if (attr->rate_limit && + MLX5_CAP_QOS(dev->mdev, packet_pacing_burst_bound)) { + raw_qp_param.rl.max_burst_sz = + ucmd->burst_info.max_burst_sz; + } else { + err = -EINVAL; + goto out; + } + } + + if (ucmd->burst_info.typical_pkt_sz) { + if (attr->rate_limit && + MLX5_CAP_QOS(dev->mdev, packet_pacing_typical_size)) { + raw_qp_param.rl.typical_pkt_sz = + ucmd->burst_info.typical_pkt_sz; + } else { + err = -EINVAL; + goto out; + } + } + raw_qp_param.set_mask |= MLX5_RAW_QP_RATE_LIMIT; } @@ -3332,8 +3363,10 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_modify_qp ucmd = {}; enum ib_qp_type qp_type; enum ib_qp_state cur_state, new_state; + size_t required_cmd_sz; int err = -EINVAL; int port; enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED; @@ -3341,6 +3374,28 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (ibqp->rwq_ind_tbl) return -ENOSYS; + if (udata && udata->inlen) { + required_cmd_sz = offsetof(typeof(ucmd), reserved) + + sizeof(ucmd.reserved); + if (udata->inlen < required_cmd_sz) + return -EINVAL; + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) + return -EOPNOTSUPP; + + if (ib_copy_from_udata(&ucmd, udata, + min(udata->inlen, sizeof(ucmd)))) + return -EFAULT; + + if (ucmd.comp_mask || + memchr_inv(&ucmd.reserved, 0, sizeof(ucmd.reserved)) || + memchr_inv(&ucmd.burst_info.reserved, 0, + sizeof(ucmd.burst_info.reserved))) + return -EOPNOTSUPP; + } + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) return mlx5_ib_gsi_modify_qp(ibqp, attr, attr_mask); @@ -3421,7 +3476,8 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, goto out; } - err = __mlx5_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state); + err = __mlx5_ib_modify_qp(ibqp, attr, attr_mask, cur_state, + new_state, &ucmd); out: mutex_unlock(&qp->mutex); diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index 1111aa4e7c1e..d2e0d234704f 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -163,6 +163,10 @@ struct mlx5_ib_cqe_comp_caps { __u32 supported_format; /* enum mlx5_ib_cqe_comp_res_format */ }; +enum mlx5_ib_packet_pacing_cap_flags { + MLX5_IB_PP_SUPPORT_BURST = 1 << 0, +}; + struct mlx5_packet_pacing_caps { __u32 qp_rate_limit_min; __u32 qp_rate_limit_max; /* In kpbs */ @@ -172,7 +176,8 @@ struct mlx5_packet_pacing_caps { * supported_qpts |= 1 << IB_QPT_RAW_PACKET */ __u32 supported_qpts; - __u32 reserved; + __u8 
cap_flags; /* enum mlx5_ib_packet_pacing_cap_flags */
+	__u8  reserved[3];
 };
 
 enum mlx5_ib_mpw_caps {
@@ -362,6 +367,18 @@ struct mlx5_ib_create_ah_resp {
 	__u8	reserved[6];
 };
 
+struct mlx5_ib_burst_info {
+	__u32       max_burst_sz;
+	__u16       typical_pkt_sz;
+	__u16       reserved;
+};
+
+struct mlx5_ib_modify_qp {
+	__u32			   comp_mask;
+	struct mlx5_ib_burst_info burst_info;
+	__u32			   reserved;
+};
+
 struct mlx5_ib_modify_qp_resp {
 	__u32	response_length;
 	__u32	dctn;

From b470c154c600e427592df5237596ce0f33ce7d9f Mon Sep 17 00:00:00 2001
From: Bart Van Assche
Date: Fri, 16 Mar 2018 10:55:57 -0700
Subject: [PATCH 119/199] IB/srp: Disallow duplicate RDMA/CM connections

According to the SRP standard the INITIATOR and TARGET PORT IDENTIFIER
fields from the login request specify the I_T nexus. Whether or not an
SRP target closes an existing connection for an I_T nexus when a login
request is received depends on the value of the MULTICHANNEL field in
the login request. The SRP initiator derives the value of the INITIATOR
and TARGET PORT IDENTIFIER fields from the .id_ext, .ioc_guid,
.initiator_ext and .sgid members of the srp_target_port structure. This
means that the .rdma_cm.dst check must be removed from
srp_conn_unique().

This patch avoids that, for target ports that have multiple addresses
(e.g. an IPv4 and an IPv6 address) with a connection established to
both target port addresses, the initiator alternately logs in to the
other target port address every 10 seconds. This is because an SRP
target must terminate all but one connection for a given I_T nexus if
the MULTICHANNEL field has not been set in the login request.

Fixes: 19f313438c77 ("IB/srp: Add RDMA/CM support")
Signed-off-by: Bart Van Assche
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/ulp/srp/ib_srp.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 4c52ca922f0b..c35d2cd37d70 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -3334,9 +3334,6 @@ static bool srp_conn_unique(struct srp_host *host,
 		if (t != target &&
 		    target->id_ext == t->id_ext &&
 		    target->ioc_guid == t->ioc_guid &&
-		    (!target->using_rdma_cm ||
-		     memcmp(&target->rdma_cm.dst, &t->rdma_cm.dst,
-			    sizeof(target->rdma_cm.dst)) == 0) &&
 		    target->initiator_ext == t->initiator_ext) {
 			ret = false;
 			break;

From 1f7ff9d5d36ae11356012a136f2d495cca910a5f Mon Sep 17 00:00:00 2001
From: Matan Barak
Date: Mon, 19 Mar 2018 15:02:33 +0200
Subject: [PATCH 120/199] IB/uverbs: Move to new headers and make naming
 consistent

Use macros to make names consistent in the ioctl() uAPI: the ioctl()
uAPI works with an object-method hierarchy. The method part also states
which handler should be executed when this method is called from
user-space. Therefore, we need to tie together the method, the method's
id, the method's handler and the object owning this method. Previously,
this was done through explicitly chosen developer names, which made
grepping the code harder. Change the method's name, the method's
handler and the object's name to be automatically generated based on
the ids.

The headers are split in a way so they can be included and used by
user-space. One header strictly contains structures that are used
directly by user-space applications, while another header is used by
the internal library (i.e. libibverbs) to form the ioctl() commands. A
third header simply contains the required general command structure.
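To see why grepping gets easier, a toy, self-contained demonstration of the token pasting follows; the macros are copied from the new rdma/uverbs_named_ioctl.h, while the method id and the trivial handler body are made up for the example.

  #include <stdio.h>

  #define UVERBS_MODULE_NAME	ib_uverbs

  #define _UVERBS_PASTE(x, y)	x ## y
  #define _UVERBS_NAME(x, y)	_UVERBS_PASTE(x, y)
  #define UVERBS_METHOD(id)	_UVERBS_NAME(UVERBS_MODULE_NAME, _method_##id)
  #define UVERBS_HANDLER(id)	_UVERBS_NAME(UVERBS_MODULE_NAME, _handler_##id)

  /* Expands to: static int ib_uverbs_handler_METHOD_CQ_CREATE(void),
   * i.e. the symbol is derived purely from the module name and the id,
   * so the declaration and the method-table wiring can never drift apart.
   */
  static int UVERBS_HANDLER(METHOD_CQ_CREATE)(void)
  {
          return 0;
  }

  int main(void)
  {
          printf("handler returned %d\n", UVERBS_HANDLER(METHOD_CQ_CREATE)());
          return 0;
  }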
Reviewed-by: Yishai Hadas Signed-off-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs.h | 3 + drivers/infiniband/core/uverbs_cmd.c | 118 +++++++------- drivers/infiniband/core/uverbs_std_types.c | 176 +++++++++++---------- include/rdma/uverbs_ioctl.h | 1 + include/rdma/uverbs_named_ioctl.h | 59 +++++++ include/rdma/uverbs_std_types.h | 41 ++--- include/uapi/rdma/ib_user_ioctl_cmds.h | 83 ++++++++++ include/uapi/rdma/ib_user_ioctl_verbs.h | 53 +------ include/uapi/rdma/rdma_user_ioctl.h | 38 +---- include/uapi/rdma/rdma_user_ioctl_cmds.h | 71 +++++++++ 10 files changed, 396 insertions(+), 247 deletions(-) create mode 100644 include/rdma/uverbs_named_ioctl.h create mode 100644 include/uapi/rdma/ib_user_ioctl_cmds.h create mode 100644 include/uapi/rdma/rdma_user_ioctl_cmds.h diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index deccefb71a6b..0551e724c431 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -47,6 +47,9 @@ #include #include +#define UVERBS_MODULE_NAME ib_uverbs +#include + static inline void ib_uverbs_init_udata(struct ib_udata *udata, const void __user *ibuf, diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 33c7f1290adb..bb29146c3823 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -50,7 +50,7 @@ static struct ib_uverbs_completion_event_file * ib_uverbs_lookup_comp_file(int fd, struct ib_ucontext *context) { - struct ib_uobject *uobj = uobj_get_read(uobj_get_type(comp_channel), + struct ib_uobject *uobj = uobj_get_read(UVERBS_OBJECT_COMP_CHANNEL, fd, context); struct ib_uobject_file *uobj_file; @@ -322,7 +322,7 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), out_len - sizeof(resp)); - uobj = uobj_alloc(uobj_get_type(pd), file->ucontext); + uobj = uobj_alloc(UVERBS_OBJECT_PD, file->ucontext); if (IS_ERR(uobj)) return PTR_ERR(uobj); @@ -372,7 +372,7 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - uobj = uobj_get_write(uobj_get_type(pd), cmd.pd_handle, + uobj = uobj_get_write(UVERBS_OBJECT_PD, cmd.pd_handle, file->ucontext); if (IS_ERR(uobj)) return PTR_ERR(uobj); @@ -517,7 +517,7 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file, } } - obj = (struct ib_uxrcd_object *)uobj_alloc(uobj_get_type(xrcd), + obj = (struct ib_uxrcd_object *)uobj_alloc(UVERBS_OBJECT_XRCD, file->ucontext); if (IS_ERR(obj)) { ret = PTR_ERR(obj); @@ -602,7 +602,7 @@ ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - uobj = uobj_get_write(uobj_get_type(xrcd), cmd.xrcd_handle, + uobj = uobj_get_write(UVERBS_OBJECT_XRCD, cmd.xrcd_handle, file->ucontext); if (IS_ERR(uobj)) return PTR_ERR(uobj); @@ -663,11 +663,11 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, if (ret) return ret; - uobj = uobj_alloc(uobj_get_type(mr), file->ucontext); + uobj = uobj_alloc(UVERBS_OBJECT_MR, file->ucontext); if (IS_ERR(uobj)) return PTR_ERR(uobj); - pd = uobj_get_obj_read(pd, cmd.pd_handle, file->ucontext); + pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file->ucontext); if (!pd) { ret = -EINVAL; goto err_free; @@ -758,7 +758,7 @@ ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file, (cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))) return -EINVAL; - uobj = 
uobj_get_write(uobj_get_type(mr), cmd.mr_handle, + uobj = uobj_get_write(UVERBS_OBJECT_MR, cmd.mr_handle, file->ucontext); if (IS_ERR(uobj)) return PTR_ERR(uobj); @@ -772,7 +772,7 @@ ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file, } if (cmd.flags & IB_MR_REREG_PD) { - pd = uobj_get_obj_read(pd, cmd.pd_handle, file->ucontext); + pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file->ucontext); if (!pd) { ret = -EINVAL; goto put_uobjs; @@ -824,7 +824,7 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - uobj = uobj_get_write(uobj_get_type(mr), cmd.mr_handle, + uobj = uobj_get_write(UVERBS_OBJECT_MR, cmd.mr_handle, file->ucontext); if (IS_ERR(uobj)) return PTR_ERR(uobj); @@ -853,11 +853,11 @@ ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof(cmd))) return -EFAULT; - uobj = uobj_alloc(uobj_get_type(mw), file->ucontext); + uobj = uobj_alloc(UVERBS_OBJECT_MW, file->ucontext); if (IS_ERR(uobj)) return PTR_ERR(uobj); - pd = uobj_get_obj_read(pd, cmd.pd_handle, file->ucontext); + pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file->ucontext); if (!pd) { ret = -EINVAL; goto err_free; @@ -916,7 +916,7 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof(cmd))) return -EFAULT; - uobj = uobj_get_write(uobj_get_type(mw), cmd.mw_handle, + uobj = uobj_get_write(UVERBS_OBJECT_MW, cmd.mw_handle, file->ucontext); if (IS_ERR(uobj)) return PTR_ERR(uobj); @@ -941,7 +941,7 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - uobj = uobj_alloc(uobj_get_type(comp_channel), file->ucontext); + uobj = uobj_alloc(UVERBS_OBJECT_COMP_CHANNEL, file->ucontext); if (IS_ERR(uobj)) return PTR_ERR(uobj); @@ -986,7 +986,7 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file, if (cmd->comp_vector >= file->device->num_comp_vectors) return ERR_PTR(-EINVAL); - obj = (struct ib_ucq_object *)uobj_alloc(uobj_get_type(cq), + obj = (struct ib_ucq_object *)uobj_alloc(UVERBS_OBJECT_CQ, file->ucontext); if (IS_ERR(obj)) return obj; @@ -1175,7 +1175,7 @@ ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file, in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), out_len - sizeof(resp)); - cq = uobj_get_obj_read(cq, cmd.cq_handle, file->ucontext); + cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file->ucontext); if (!cq) return -EINVAL; @@ -1240,7 +1240,7 @@ ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - cq = uobj_get_obj_read(cq, cmd.cq_handle, file->ucontext); + cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file->ucontext); if (!cq) return -EINVAL; @@ -1287,7 +1287,7 @@ ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - cq = uobj_get_obj_read(cq, cmd.cq_handle, file->ucontext); + cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file->ucontext); if (!cq) return -EINVAL; @@ -1314,7 +1314,7 @@ ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - uobj = uobj_get_write(uobj_get_type(cq), cmd.cq_handle, + uobj = uobj_get_write(UVERBS_OBJECT_CQ, cmd.cq_handle, file->ucontext); if (IS_ERR(uobj)) return PTR_ERR(uobj); @@ -1373,7 +1373,7 @@ static int create_qp(struct ib_uverbs_file *file, if (cmd->qp_type == IB_QPT_RAW_PACKET 
&& !capable(CAP_NET_RAW)) return -EPERM; - obj = (struct ib_uqp_object *)uobj_alloc(uobj_get_type(qp), + obj = (struct ib_uqp_object *)uobj_alloc(UVERBS_OBJECT_QP, file->ucontext); if (IS_ERR(obj)) return PTR_ERR(obj); @@ -1384,7 +1384,7 @@ static int create_qp(struct ib_uverbs_file *file, if (cmd_sz >= offsetof(typeof(*cmd), rwq_ind_tbl_handle) + sizeof(cmd->rwq_ind_tbl_handle) && (cmd->comp_mask & IB_UVERBS_CREATE_QP_MASK_IND_TABLE)) { - ind_tbl = uobj_get_obj_read(rwq_ind_table, + ind_tbl = uobj_get_obj_read(rwq_ind_table, UVERBS_OBJECT_RWQ_IND_TBL, cmd->rwq_ind_tbl_handle, file->ucontext); if (!ind_tbl) { @@ -1411,7 +1411,7 @@ static int create_qp(struct ib_uverbs_file *file, has_sq = false; if (cmd->qp_type == IB_QPT_XRC_TGT) { - xrcd_uobj = uobj_get_read(uobj_get_type(xrcd), cmd->pd_handle, + xrcd_uobj = uobj_get_read(UVERBS_OBJECT_XRCD, cmd->pd_handle, file->ucontext); if (IS_ERR(xrcd_uobj)) { @@ -1431,7 +1431,7 @@ static int create_qp(struct ib_uverbs_file *file, cmd->max_recv_sge = 0; } else { if (cmd->is_srq) { - srq = uobj_get_obj_read(srq, cmd->srq_handle, + srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd->srq_handle, file->ucontext); if (!srq || srq->srq_type == IB_SRQT_XRC) { ret = -EINVAL; @@ -1441,7 +1441,7 @@ static int create_qp(struct ib_uverbs_file *file, if (!ind_tbl) { if (cmd->recv_cq_handle != cmd->send_cq_handle) { - rcq = uobj_get_obj_read(cq, cmd->recv_cq_handle, + rcq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd->recv_cq_handle, file->ucontext); if (!rcq) { ret = -EINVAL; @@ -1452,11 +1452,11 @@ static int create_qp(struct ib_uverbs_file *file, } if (has_sq) - scq = uobj_get_obj_read(cq, cmd->send_cq_handle, + scq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd->send_cq_handle, file->ucontext); if (!ind_tbl) rcq = rcq ?: scq; - pd = uobj_get_obj_read(pd, cmd->pd_handle, file->ucontext); + pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle, file->ucontext); if (!pd || (!scq && has_sq)) { ret = -EINVAL; goto err_put; @@ -1753,12 +1753,12 @@ ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file, in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), out_len - sizeof(resp)); - obj = (struct ib_uqp_object *)uobj_alloc(uobj_get_type(qp), + obj = (struct ib_uqp_object *)uobj_alloc(UVERBS_OBJECT_QP, file->ucontext); if (IS_ERR(obj)) return PTR_ERR(obj); - xrcd_uobj = uobj_get_read(uobj_get_type(xrcd), cmd.pd_handle, + xrcd_uobj = uobj_get_read(UVERBS_OBJECT_XRCD, cmd.pd_handle, file->ucontext); if (IS_ERR(xrcd_uobj)) { ret = -EINVAL; @@ -1861,7 +1861,7 @@ ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file, goto out; } - qp = uobj_get_obj_read(qp, cmd.qp_handle, file->ucontext); + qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file->ucontext); if (!qp) { ret = -EINVAL; goto out; @@ -1966,7 +1966,7 @@ static int modify_qp(struct ib_uverbs_file *file, if (!attr) return -ENOMEM; - qp = uobj_get_obj_read(qp, cmd->base.qp_handle, file->ucontext); + qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd->base.qp_handle, file->ucontext); if (!qp) { ret = -EINVAL; goto out; @@ -2121,7 +2121,7 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file, memset(&resp, 0, sizeof resp); - uobj = uobj_get_write(uobj_get_type(qp), cmd.qp_handle, + uobj = uobj_get_write(UVERBS_OBJECT_QP, cmd.qp_handle, file->ucontext); if (IS_ERR(uobj)) return PTR_ERR(uobj); @@ -2187,7 +2187,7 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, if (!user_wr) return -ENOMEM; - qp = uobj_get_obj_read(qp, cmd.qp_handle, file->ucontext); + qp = uobj_get_obj_read(qp, 
UVERBS_OBJECT_QP, cmd.qp_handle, file->ucontext); if (!qp) goto out; @@ -2223,7 +2223,7 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, goto out_put; } - ud->ah = uobj_get_obj_read(ah, user_wr->wr.ud.ah, + ud->ah = uobj_get_obj_read(ah, UVERBS_OBJECT_AH, user_wr->wr.ud.ah, file->ucontext); if (!ud->ah) { kfree(ud); @@ -2458,7 +2458,7 @@ ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file, if (IS_ERR(wr)) return PTR_ERR(wr); - qp = uobj_get_obj_read(qp, cmd.qp_handle, file->ucontext); + qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file->ucontext); if (!qp) goto out; @@ -2507,7 +2507,7 @@ ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file, if (IS_ERR(wr)) return PTR_ERR(wr); - srq = uobj_get_obj_read(srq, cmd.srq_handle, file->ucontext); + srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, file->ucontext); if (!srq) goto out; @@ -2564,11 +2564,11 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), out_len - sizeof(resp)); - uobj = uobj_alloc(uobj_get_type(ah), file->ucontext); + uobj = uobj_alloc(UVERBS_OBJECT_AH, file->ucontext); if (IS_ERR(uobj)) return PTR_ERR(uobj); - pd = uobj_get_obj_read(pd, cmd.pd_handle, file->ucontext); + pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file->ucontext); if (!pd) { ret = -EINVAL; goto err; @@ -2636,7 +2636,7 @@ ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - uobj = uobj_get_write(uobj_get_type(ah), cmd.ah_handle, + uobj = uobj_get_write(UVERBS_OBJECT_AH, cmd.ah_handle, file->ucontext); if (IS_ERR(uobj)) return PTR_ERR(uobj); @@ -2659,7 +2659,7 @@ ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - qp = uobj_get_obj_read(qp, cmd.qp_handle, file->ucontext); + qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file->ucontext); if (!qp) return -EINVAL; @@ -2710,7 +2710,7 @@ ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - qp = uobj_get_obj_read(qp, cmd.qp_handle, file->ucontext); + qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file->ucontext); if (!qp) return -EINVAL; @@ -2934,18 +2934,18 @@ int ib_uverbs_ex_create_wq(struct ib_uverbs_file *file, if (cmd.comp_mask) return -EOPNOTSUPP; - obj = (struct ib_uwq_object *)uobj_alloc(uobj_get_type(wq), + obj = (struct ib_uwq_object *)uobj_alloc(UVERBS_OBJECT_WQ, file->ucontext); if (IS_ERR(obj)) return PTR_ERR(obj); - pd = uobj_get_obj_read(pd, cmd.pd_handle, file->ucontext); + pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file->ucontext); if (!pd) { err = -EINVAL; goto err_uobj; } - cq = uobj_get_obj_read(cq, cmd.cq_handle, file->ucontext); + cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file->ucontext); if (!cq) { err = -EINVAL; goto err_put_pd; @@ -3049,7 +3049,7 @@ int ib_uverbs_ex_destroy_wq(struct ib_uverbs_file *file, return -EOPNOTSUPP; resp.response_length = required_resp_len; - uobj = uobj_get_write(uobj_get_type(wq), cmd.wq_handle, + uobj = uobj_get_write(UVERBS_OBJECT_WQ, cmd.wq_handle, file->ucontext); if (IS_ERR(uobj)) return PTR_ERR(uobj); @@ -3100,7 +3100,7 @@ int ib_uverbs_ex_modify_wq(struct ib_uverbs_file *file, if (cmd.attr_mask > (IB_WQ_STATE | IB_WQ_CUR_STATE | IB_WQ_FLAGS)) return -EINVAL; - wq = uobj_get_obj_read(wq, cmd.wq_handle, file->ucontext); + wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ, 
cmd.wq_handle, file->ucontext); if (!wq) return -EINVAL; @@ -3194,7 +3194,7 @@ int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file, for (num_read_wqs = 0; num_read_wqs < num_wq_handles; num_read_wqs++) { - wq = uobj_get_obj_read(wq, wqs_handles[num_read_wqs], + wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ, wqs_handles[num_read_wqs], file->ucontext); if (!wq) { err = -EINVAL; @@ -3204,7 +3204,7 @@ int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file, wqs[num_read_wqs] = wq; } - uobj = uobj_alloc(uobj_get_type(rwq_ind_table), file->ucontext); + uobj = uobj_alloc(UVERBS_OBJECT_RWQ_IND_TBL, file->ucontext); if (IS_ERR(uobj)) { err = PTR_ERR(uobj); goto put_wqs; @@ -3291,7 +3291,7 @@ int ib_uverbs_ex_destroy_rwq_ind_table(struct ib_uverbs_file *file, if (cmd.comp_mask) return -EOPNOTSUPP; - uobj = uobj_get_write(uobj_get_type(rwq_ind_table), cmd.ind_tbl_handle, + uobj = uobj_get_write(UVERBS_OBJECT_RWQ_IND_TBL, cmd.ind_tbl_handle, file->ucontext); if (IS_ERR(uobj)) return PTR_ERR(uobj); @@ -3370,13 +3370,13 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, kern_flow_attr = &cmd.flow_attr; } - uobj = uobj_alloc(uobj_get_type(flow), file->ucontext); + uobj = uobj_alloc(UVERBS_OBJECT_FLOW, file->ucontext); if (IS_ERR(uobj)) { err = PTR_ERR(uobj); goto err_free_attr; } - qp = uobj_get_obj_read(qp, cmd.qp_handle, file->ucontext); + qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file->ucontext); if (!qp) { err = -EINVAL; goto err_uobj; @@ -3472,7 +3472,7 @@ int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file, if (cmd.comp_mask) return -EINVAL; - uobj = uobj_get_write(uobj_get_type(flow), cmd.flow_handle, + uobj = uobj_get_write(UVERBS_OBJECT_FLOW, cmd.flow_handle, file->ucontext); if (IS_ERR(uobj)) return PTR_ERR(uobj); @@ -3494,7 +3494,7 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, struct ib_srq_init_attr attr; int ret; - obj = (struct ib_usrq_object *)uobj_alloc(uobj_get_type(srq), + obj = (struct ib_usrq_object *)uobj_alloc(UVERBS_OBJECT_SRQ, file->ucontext); if (IS_ERR(obj)) return PTR_ERR(obj); @@ -3503,7 +3503,7 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, attr.ext.tag_matching.max_num_tags = cmd->max_num_tags; if (cmd->srq_type == IB_SRQT_XRC) { - xrcd_uobj = uobj_get_read(uobj_get_type(xrcd), cmd->xrcd_handle, + xrcd_uobj = uobj_get_read(UVERBS_OBJECT_XRCD, cmd->xrcd_handle, file->ucontext); if (IS_ERR(xrcd_uobj)) { ret = -EINVAL; @@ -3521,7 +3521,7 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, } if (ib_srq_has_cq(cmd->srq_type)) { - attr.ext.cq = uobj_get_obj_read(cq, cmd->cq_handle, + attr.ext.cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd->cq_handle, file->ucontext); if (!attr.ext.cq) { ret = -EINVAL; @@ -3529,7 +3529,7 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, } } - pd = uobj_get_obj_read(pd, cmd->pd_handle, file->ucontext); + pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle, file->ucontext); if (!pd) { ret = -EINVAL; goto err_put_cq; @@ -3701,7 +3701,7 @@ ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file, ib_uverbs_init_udata(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd, out_len); - srq = uobj_get_obj_read(srq, cmd.srq_handle, file->ucontext); + srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, file->ucontext); if (!srq) return -EINVAL; @@ -3732,7 +3732,7 @@ ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - srq = uobj_get_obj_read(srq, 
cmd.srq_handle, file->ucontext); + srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, file->ucontext); if (!srq) return -EINVAL; @@ -3769,7 +3769,7 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - uobj = uobj_get_write(uobj_get_type(srq), cmd.srq_handle, + uobj = uobj_get_write(UVERBS_OBJECT_SRQ, cmd.srq_handle, file->ucontext); if (IS_ERR(uobj)) return PTR_ERR(uobj); @@ -3942,7 +3942,7 @@ int ib_uverbs_ex_modify_cq(struct ib_uverbs_file *file, if (cmd.attr_mask > IB_CQ_MODERATE) return -EOPNOTSUPP; - cq = uobj_get_obj_read(cq, cmd.cq_handle, file->ucontext); + cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file->ucontext); if (!cq) return -EINVAL; diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index df1360e6774f..e4a4b184a6bc 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -216,9 +216,9 @@ static int uverbs_hot_unplug_completion_event_file(struct ib_uobject_file *uobj_ * spec. */ static const struct uverbs_attr_def uverbs_uhw_compat_in = - UVERBS_ATTR_PTR_IN_SZ(UVERBS_UHW_IN, 0, UA_FLAGS(UVERBS_ATTR_SPEC_F_MIN_SZ)); + UVERBS_ATTR_PTR_IN_SZ(UVERBS_ATTR_UHW_IN, 0, UA_FLAGS(UVERBS_ATTR_SPEC_F_MIN_SZ)); static const struct uverbs_attr_def uverbs_uhw_compat_out = - UVERBS_ATTR_PTR_OUT_SZ(UVERBS_UHW_OUT, 0, UA_FLAGS(UVERBS_ATTR_SPEC_F_MIN_SZ)); + UVERBS_ATTR_PTR_OUT_SZ(UVERBS_ATTR_UHW_OUT, 0, UA_FLAGS(UVERBS_ATTR_SPEC_F_MIN_SZ)); static void create_udata(struct uverbs_attr_bundle *ctx, struct ib_udata *udata) @@ -229,9 +229,9 @@ static void create_udata(struct uverbs_attr_bundle *ctx, * Assume attr == 0 is input and attr == 1 is output. */ const struct uverbs_attr *uhw_in = - uverbs_attr_get(ctx, UVERBS_UHW_IN); + uverbs_attr_get(ctx, UVERBS_ATTR_UHW_IN); const struct uverbs_attr *uhw_out = - uverbs_attr_get(ctx, UVERBS_UHW_OUT); + uverbs_attr_get(ctx, UVERBS_ATTR_UHW_OUT); if (!IS_ERR(uhw_in)) { udata->inlen = uhw_in->ptr_attr.len; @@ -253,9 +253,9 @@ static void create_udata(struct uverbs_attr_bundle *ctx, } } -static int uverbs_create_cq_handler(struct ib_device *ib_dev, - struct ib_uverbs_file *file, - struct uverbs_attr_bundle *attrs) +static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) { struct ib_ucontext *ucontext = file->ucontext; struct ib_ucq_object *obj; @@ -271,19 +271,23 @@ static int uverbs_create_cq_handler(struct ib_device *ib_dev, if (!(ib_dev->uverbs_cmd_mask & 1ULL << IB_USER_VERBS_CMD_CREATE_CQ)) return -EOPNOTSUPP; - ret = uverbs_copy_from(&attr.comp_vector, attrs, CREATE_CQ_COMP_VECTOR); + ret = uverbs_copy_from(&attr.comp_vector, attrs, + UVERBS_ATTR_CREATE_CQ_COMP_VECTOR); if (!ret) - ret = uverbs_copy_from(&attr.cqe, attrs, CREATE_CQ_CQE); + ret = uverbs_copy_from(&attr.cqe, attrs, + UVERBS_ATTR_CREATE_CQ_CQE); if (!ret) - ret = uverbs_copy_from(&user_handle, attrs, CREATE_CQ_USER_HANDLE); + ret = uverbs_copy_from(&user_handle, attrs, + UVERBS_ATTR_CREATE_CQ_USER_HANDLE); if (ret) return ret; /* Optional param, if it doesn't exist, we get -ENOENT and skip it */ - if (uverbs_copy_from(&attr.flags, attrs, CREATE_CQ_FLAGS) == -EFAULT) + if (uverbs_copy_from(&attr.flags, attrs, + UVERBS_ATTR_CREATE_CQ_FLAGS) == -EFAULT) return -EFAULT; - ev_file_attr = uverbs_attr_get(attrs, CREATE_CQ_COMP_CHANNEL); + ev_file_attr = uverbs_attr_get(attrs, 
UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL); if (!IS_ERR(ev_file_attr)) { ev_file_uobj = ev_file_attr->obj_attr.uobject; @@ -298,7 +302,8 @@ static int uverbs_create_cq_handler(struct ib_device *ib_dev, goto err_event_file; } - obj = container_of(uverbs_attr_get(attrs, CREATE_CQ_HANDLE)->obj_attr.uobject, + obj = container_of(uverbs_attr_get(attrs, + UVERBS_ATTR_CREATE_CQ_HANDLE)->obj_attr.uobject, typeof(*obj), uobject); obj->uverbs_file = ucontext->ufile; obj->comp_events_reported = 0; @@ -326,7 +331,7 @@ static int uverbs_create_cq_handler(struct ib_device *ib_dev, cq->res.type = RDMA_RESTRACK_CQ; rdma_restrack_add(&cq->res); - ret = uverbs_copy_to(attrs, CREATE_CQ_RESP_CQE, &cq->cqe, + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_CQ_RESP_CQE, &cq->cqe, sizeof(cq->cqe)); if (ret) goto err_cq; @@ -341,30 +346,31 @@ err_event_file: return ret; }; -static DECLARE_UVERBS_METHOD( - uverbs_method_cq_create, UVERBS_CQ_CREATE, uverbs_create_cq_handler, - &UVERBS_ATTR_IDR(CREATE_CQ_HANDLE, UVERBS_OBJECT_CQ, UVERBS_ACCESS_NEW, +static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_CQ_CREATE, + &UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_CQ_HANDLE, UVERBS_OBJECT_CQ, + UVERBS_ACCESS_NEW, UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_PTR_IN(CREATE_CQ_CQE, u32, + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_CQE, u32, UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_PTR_IN(CREATE_CQ_USER_HANDLE, u64, + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_USER_HANDLE, u64, UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_FD(CREATE_CQ_COMP_CHANNEL, UVERBS_OBJECT_COMP_CHANNEL, + &UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL, + UVERBS_OBJECT_COMP_CHANNEL, UVERBS_ACCESS_READ), - &UVERBS_ATTR_PTR_IN(CREATE_CQ_COMP_VECTOR, u32, + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_COMP_VECTOR, u32, UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_PTR_IN(CREATE_CQ_FLAGS, u32), - &UVERBS_ATTR_PTR_OUT(CREATE_CQ_RESP_CQE, u32, + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_FLAGS, u32), + &UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_CQ_RESP_CQE, u32, UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), &uverbs_uhw_compat_in, &uverbs_uhw_compat_out); -static int uverbs_destroy_cq_handler(struct ib_device *ib_dev, - struct ib_uverbs_file *file, - struct uverbs_attr_bundle *attrs) +static int UVERBS_HANDLER(UVERBS_METHOD_CQ_DESTROY)(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) { struct ib_uverbs_destroy_cq_resp resp; struct ib_uobject *uobj = - uverbs_attr_get(attrs, DESTROY_CQ_HANDLE)->obj_attr.uobject; + uverbs_attr_get(attrs, UVERBS_ATTR_DESTROY_CQ_HANDLE)->obj_attr.uobject; struct ib_ucq_object *obj = container_of(uobj, struct ib_ucq_object, uobject); int ret; @@ -379,81 +385,81 @@ static int uverbs_destroy_cq_handler(struct ib_device *ib_dev, resp.comp_events_reported = obj->comp_events_reported; resp.async_events_reported = obj->async_events_reported; - return uverbs_copy_to(attrs, DESTROY_CQ_RESP, &resp, sizeof(resp)); + return uverbs_copy_to(attrs, UVERBS_ATTR_DESTROY_CQ_RESP, &resp, + sizeof(resp)); } -static DECLARE_UVERBS_METHOD( - uverbs_method_cq_destroy, UVERBS_CQ_DESTROY, uverbs_destroy_cq_handler, - &UVERBS_ATTR_IDR(DESTROY_CQ_HANDLE, UVERBS_OBJECT_CQ, +static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_CQ_DESTROY, + &UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_CQ_HANDLE, UVERBS_OBJECT_CQ, UVERBS_ACCESS_DESTROY, UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_PTR_OUT(DESTROY_CQ_RESP, struct ib_uverbs_destroy_cq_resp, + &UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_DESTROY_CQ_RESP, + struct 
ib_uverbs_destroy_cq_resp, UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); -DECLARE_UVERBS_OBJECT(uverbs_object_comp_channel, - UVERBS_OBJECT_COMP_CHANNEL, - &UVERBS_TYPE_ALLOC_FD(0, - sizeof(struct ib_uverbs_completion_event_file), - uverbs_hot_unplug_completion_event_file, - &uverbs_event_fops, - "[infinibandevent]", O_RDONLY)); +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_COMP_CHANNEL, + &UVERBS_TYPE_ALLOC_FD(0, + sizeof(struct ib_uverbs_completion_event_file), + uverbs_hot_unplug_completion_event_file, + &uverbs_event_fops, + "[infinibandevent]", O_RDONLY)); -DECLARE_UVERBS_OBJECT(uverbs_object_cq, UVERBS_OBJECT_CQ, - &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), 0, - uverbs_free_cq), - &uverbs_method_cq_create, - &uverbs_method_cq_destroy); +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_CQ, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), 0, + uverbs_free_cq), + &UVERBS_METHOD(UVERBS_METHOD_CQ_CREATE), + &UVERBS_METHOD(UVERBS_METHOD_CQ_DESTROY) + ); -DECLARE_UVERBS_OBJECT(uverbs_object_qp, UVERBS_OBJECT_QP, - &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), 0, - uverbs_free_qp)); +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_QP, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), 0, + uverbs_free_qp)); -DECLARE_UVERBS_OBJECT(uverbs_object_mw, UVERBS_OBJECT_MW, - &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_mw)); +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_MW, + &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_mw)); -DECLARE_UVERBS_OBJECT(uverbs_object_mr, UVERBS_OBJECT_MR, - /* 1 is used in order to free the MR after all the MWs */ - &UVERBS_TYPE_ALLOC_IDR(1, uverbs_free_mr)); +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_MR, + /* 1 is used in order to free the MR after all the MWs */ + &UVERBS_TYPE_ALLOC_IDR(1, uverbs_free_mr)); -DECLARE_UVERBS_OBJECT(uverbs_object_srq, UVERBS_OBJECT_SRQ, - &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object), 0, - uverbs_free_srq)); +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_SRQ, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object), 0, + uverbs_free_srq)); -DECLARE_UVERBS_OBJECT(uverbs_object_ah, UVERBS_OBJECT_AH, - &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_ah)); +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_AH, + &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_ah)); -DECLARE_UVERBS_OBJECT(uverbs_object_flow, UVERBS_OBJECT_FLOW, - &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_flow)); +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_FLOW, + &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_flow)); -DECLARE_UVERBS_OBJECT(uverbs_object_wq, UVERBS_OBJECT_WQ, - &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), 0, - uverbs_free_wq)); +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_WQ, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), 0, + uverbs_free_wq)); -DECLARE_UVERBS_OBJECT(uverbs_object_rwq_ind_table, - UVERBS_OBJECT_RWQ_IND_TBL, - &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_rwq_ind_tbl)); +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL, + &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_rwq_ind_tbl)); -DECLARE_UVERBS_OBJECT(uverbs_object_xrcd, UVERBS_OBJECT_XRCD, - &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uxrcd_object), 0, - uverbs_free_xrcd)); +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_XRCD, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uxrcd_object), 0, + uverbs_free_xrcd)); -DECLARE_UVERBS_OBJECT(uverbs_object_pd, UVERBS_OBJECT_PD, - /* 2 is used in order to free the PD after MRs */ - &UVERBS_TYPE_ALLOC_IDR(2, uverbs_free_pd)); +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_PD, + /* 2 is used in order to free the PD after MRs */ + &UVERBS_TYPE_ALLOC_IDR(2, uverbs_free_pd)); 
-DECLARE_UVERBS_OBJECT(uverbs_object_device, UVERBS_OBJECT_DEVICE, NULL); +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_DEVICE, NULL); DECLARE_UVERBS_OBJECT_TREE(uverbs_default_objects, - &uverbs_object_device, - &uverbs_object_pd, - &uverbs_object_mr, - &uverbs_object_comp_channel, - &uverbs_object_cq, - &uverbs_object_qp, - &uverbs_object_ah, - &uverbs_object_mw, - &uverbs_object_srq, - &uverbs_object_flow, - &uverbs_object_wq, - &uverbs_object_rwq_ind_table, - &uverbs_object_xrcd); + &UVERBS_OBJECT(UVERBS_OBJECT_DEVICE), + &UVERBS_OBJECT(UVERBS_OBJECT_PD), + &UVERBS_OBJECT(UVERBS_OBJECT_MR), + &UVERBS_OBJECT(UVERBS_OBJECT_COMP_CHANNEL), + &UVERBS_OBJECT(UVERBS_OBJECT_CQ), + &UVERBS_OBJECT(UVERBS_OBJECT_QP), + &UVERBS_OBJECT(UVERBS_OBJECT_AH), + &UVERBS_OBJECT(UVERBS_OBJECT_MW), + &UVERBS_OBJECT(UVERBS_OBJECT_SRQ), + &UVERBS_OBJECT(UVERBS_OBJECT_FLOW), + &UVERBS_OBJECT(UVERBS_OBJECT_WQ), + &UVERBS_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL), + &UVERBS_OBJECT(UVERBS_OBJECT_XRCD)); diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index 38287d9d23a1..c0be2b5f6a1e 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -37,6 +37,7 @@ #include #include #include +#include /* * ======================================= diff --git a/include/rdma/uverbs_named_ioctl.h b/include/rdma/uverbs_named_ioctl.h new file mode 100644 index 000000000000..a7f0565ca784 --- /dev/null +++ b/include/rdma/uverbs_named_ioctl.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _UVERBS_NAMED_IOCTL_ +#define _UVERBS_NAMED_IOCTL_ + +#include + +#ifndef UVERBS_MODULE_NAME +#error "Please #define UVERBS_MODULE_NAME before including rdma/uverbs_named_ioctl.h" +#endif + +#define _UVERBS_PASTE(x, y) x ## y +#define _UVERBS_NAME(x, y) _UVERBS_PASTE(x, y) +#define UVERBS_METHOD(id) _UVERBS_NAME(UVERBS_MODULE_NAME, _method_##id) +#define UVERBS_HANDLER(id) _UVERBS_NAME(UVERBS_MODULE_NAME, _handler_##id) + +#define DECLARE_UVERBS_NAMED_METHOD(id, ...) \ + DECLARE_UVERBS_METHOD(UVERBS_METHOD(id), id, UVERBS_HANDLER(id), ##__VA_ARGS__) + +#define DECLARE_UVERBS_NAMED_METHOD_WITH_HANDLER(id, handler, ...) 
\ + DECLARE_UVERBS_METHOD(UVERBS_METHOD(id), id, handler, ##__VA_ARGS__) + +#define DECLARE_UVERBS_NAMED_METHOD_NO_OVERRIDE(id, handler, ...) \ + DECLARE_UVERBS_METHOD(UVERBS_METHOD(id), id, NULL, ##__VA_ARGS__) + +#define DECLARE_UVERBS_NAMED_OBJECT(id, ...) \ + DECLARE_UVERBS_OBJECT(UVERBS_OBJECT(id), id, ##__VA_ARGS__) + +#endif diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h index 5f8e20bbd67c..45ee7d1bfa32 100644 --- a/include/rdma/uverbs_std_types.h +++ b/include/rdma/uverbs_std_types.h @@ -38,19 +38,22 @@ #include #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) -extern const struct uverbs_object_def uverbs_object_comp_channel; -extern const struct uverbs_object_def uverbs_object_cq; -extern const struct uverbs_object_def uverbs_object_qp; -extern const struct uverbs_object_def uverbs_object_rwq_ind_table; -extern const struct uverbs_object_def uverbs_object_wq; -extern const struct uverbs_object_def uverbs_object_srq; -extern const struct uverbs_object_def uverbs_object_ah; -extern const struct uverbs_object_def uverbs_object_flow; -extern const struct uverbs_object_def uverbs_object_mr; -extern const struct uverbs_object_def uverbs_object_mw; -extern const struct uverbs_object_def uverbs_object_pd; -extern const struct uverbs_object_def uverbs_object_xrcd; -extern const struct uverbs_object_def uverbs_object_device; + +#define UVERBS_OBJECT(id) uverbs_object_##id + +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_DEVICE); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_PD); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_MR); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_COMP_CHANNEL); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_CQ); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_QP); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_AH); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_MW); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_SRQ); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_FLOW); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_WQ); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_XRCD); extern const struct uverbs_object_tree_def uverbs_default_objects; static inline const struct uverbs_object_tree_def *uverbs_default_get_objects(void) @@ -72,22 +75,22 @@ static inline struct ib_uobject *__uobj_get(const struct uverbs_obj_type *type, return rdma_lookup_get_uobject(type, ucontext, id, write); } -#define uobj_get_type(_object) uverbs_object_##_object.type_attrs +#define uobj_get_type(_object) UVERBS_OBJECT(_object).type_attrs #define uobj_get_read(_type, _id, _ucontext) \ - __uobj_get(_type, false, _ucontext, _id) + __uobj_get(uobj_get_type(_type), false, _ucontext, _id) -#define uobj_get_obj_read(_object, _id, _ucontext) \ +#define uobj_get_obj_read(_object, _type, _id, _ucontext) \ ({ \ struct ib_uobject *__uobj = \ - __uobj_get(uverbs_object_##_object.type_attrs, \ + __uobj_get(uobj_get_type(_type), \ false, _ucontext, _id); \ \ (struct ib_##_object *)(IS_ERR(__uobj) ? 
NULL : __uobj->object);\ }) #define uobj_get_write(_type, _id, _ucontext) \ - __uobj_get(_type, true, _ucontext, _id) + __uobj_get(uobj_get_type(_type), true, _ucontext, _id) static inline void uobj_put_read(struct ib_uobject *uobj) { @@ -124,7 +127,7 @@ static inline struct ib_uobject *__uobj_alloc(const struct uverbs_obj_type *type } #define uobj_alloc(_type, ucontext) \ - __uobj_alloc(_type, ucontext) + __uobj_alloc(uobj_get_type(_type), ucontext) #endif diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h new file mode 100644 index 000000000000..77bbbed17ed5 --- /dev/null +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef IB_USER_IOCTL_CMDS_H +#define IB_USER_IOCTL_CMDS_H + +#define UVERBS_ID_NS_MASK 0xF000 +#define UVERBS_ID_NS_SHIFT 12 + +#define UVERBS_UDATA_DRIVER_DATA_NS 1 +#define UVERBS_UDATA_DRIVER_DATA_FLAG (1UL << UVERBS_ID_NS_SHIFT) + +enum uverbs_default_objects { + UVERBS_OBJECT_DEVICE, /* No instances of DEVICE are allowed */ + UVERBS_OBJECT_PD, + UVERBS_OBJECT_COMP_CHANNEL, + UVERBS_OBJECT_CQ, + UVERBS_OBJECT_QP, + UVERBS_OBJECT_SRQ, + UVERBS_OBJECT_AH, + UVERBS_OBJECT_MR, + UVERBS_OBJECT_MW, + UVERBS_OBJECT_FLOW, + UVERBS_OBJECT_XRCD, + UVERBS_OBJECT_RWQ_IND_TBL, + UVERBS_OBJECT_WQ, +}; + +enum { + UVERBS_ATTR_UHW_IN = UVERBS_UDATA_DRIVER_DATA_FLAG, + UVERBS_ATTR_UHW_OUT, +}; + +enum uverbs_attrs_create_cq_cmd_attr_ids { + UVERBS_ATTR_CREATE_CQ_HANDLE, + UVERBS_ATTR_CREATE_CQ_CQE, + UVERBS_ATTR_CREATE_CQ_USER_HANDLE, + UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL, + UVERBS_ATTR_CREATE_CQ_COMP_VECTOR, + UVERBS_ATTR_CREATE_CQ_FLAGS, + UVERBS_ATTR_CREATE_CQ_RESP_CQE, +}; + +enum uverbs_attrs_destroy_cq_cmd_attr_ids { + UVERBS_ATTR_DESTROY_CQ_HANDLE, + UVERBS_ATTR_DESTROY_CQ_RESP, +}; + +enum uverbs_methods_cq { + UVERBS_METHOD_CQ_CREATE, + UVERBS_METHOD_CQ_DESTROY, +}; + +#endif diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h index 842792eae383..3d3a2f017abc 100644 --- a/include/uapi/rdma/ib_user_ioctl_verbs.h +++ b/include/uapi/rdma/ib_user_ioctl_verbs.h @@ -1,5 +1,6 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ /* - * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * Copyright (c) 2017-2018, Mellanox Technologies inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -33,52 +34,10 @@ #ifndef IB_USER_IOCTL_VERBS_H #define IB_USER_IOCTL_VERBS_H -#include - -#define UVERBS_UDATA_DRIVER_DATA_NS 1 -#define UVERBS_UDATA_DRIVER_DATA_FLAG (1UL << UVERBS_ID_NS_SHIFT) - -enum uverbs_default_objects { - UVERBS_OBJECT_DEVICE, /* No instances of DEVICE are allowed */ - UVERBS_OBJECT_PD, - UVERBS_OBJECT_COMP_CHANNEL, - UVERBS_OBJECT_CQ, - UVERBS_OBJECT_QP, - UVERBS_OBJECT_SRQ, - UVERBS_OBJECT_AH, - UVERBS_OBJECT_MR, - UVERBS_OBJECT_MW, - UVERBS_OBJECT_FLOW, - UVERBS_OBJECT_XRCD, - UVERBS_OBJECT_RWQ_IND_TBL, - UVERBS_OBJECT_WQ, - UVERBS_OBJECT_LAST, -}; - -enum { - UVERBS_UHW_IN = UVERBS_UDATA_DRIVER_DATA_FLAG, - UVERBS_UHW_OUT, -}; - -enum uverbs_create_cq_cmd_attr_ids { - CREATE_CQ_HANDLE, - CREATE_CQ_CQE, - CREATE_CQ_USER_HANDLE, - CREATE_CQ_COMP_CHANNEL, - CREATE_CQ_COMP_VECTOR, - CREATE_CQ_FLAGS, - CREATE_CQ_RESP_CQE, -}; - -enum uverbs_destroy_cq_cmd_attr_ids { - DESTROY_CQ_HANDLE, - DESTROY_CQ_RESP, -}; - -enum uverbs_actions_cq_ops { - UVERBS_CQ_CREATE, - UVERBS_CQ_DESTROY, -}; +#include +#ifndef RDMA_UAPI_PTR +#define RDMA_UAPI_PTR(_type, _name) _type __attribute__((aligned(8))) _name #endif +#endif diff --git a/include/uapi/rdma/rdma_user_ioctl.h b/include/uapi/rdma/rdma_user_ioctl.h index 46de0885e800..d223f4164a0f 100644 --- a/include/uapi/rdma/rdma_user_ioctl.h +++ b/include/uapi/rdma/rdma_user_ioctl.h @@ -34,49 +34,13 @@ #ifndef RDMA_USER_IOCTL_H #define RDMA_USER_IOCTL_H -#include -#include #include #include +#include -/* Documentation/ioctl/ioctl-number.txt */ -#define RDMA_IOCTL_MAGIC 0x1b /* Legacy name, for user space application which already use it */ #define IB_IOCTL_MAGIC RDMA_IOCTL_MAGIC -#define RDMA_VERBS_IOCTL \ - _IOWR(RDMA_IOCTL_MAGIC, 1, struct 
ib_uverbs_ioctl_hdr) - -#define UVERBS_ID_NS_MASK 0xF000 -#define UVERBS_ID_NS_SHIFT 12 - -enum { - /* User input */ - UVERBS_ATTR_F_MANDATORY = 1U << 0, - /* - * Valid output bit should be ignored and considered set in - * mandatory fields. This bit is kernel output. - */ - UVERBS_ATTR_F_VALID_OUTPUT = 1U << 1, -}; - -struct ib_uverbs_attr { - __u16 attr_id; /* command specific type attribute */ - __u16 len; /* only for pointers */ - __u16 flags; /* combination of UVERBS_ATTR_F_XXXX */ - __u16 reserved; - __aligned_u64 data; /* ptr to command, inline data or idr/fd */ -}; - -struct ib_uverbs_ioctl_hdr { - __u16 length; - __u16 object_id; - __u16 method_id; - __u16 num_attrs; - __aligned_u64 reserved; - struct ib_uverbs_attr attrs[0]; -}; - /* * General blocks assignments * It is closed on purpose do not expose it it user space diff --git a/include/uapi/rdma/rdma_user_ioctl_cmds.h b/include/uapi/rdma/rdma_user_ioctl_cmds.h new file mode 100644 index 000000000000..aa1fffe3620b --- /dev/null +++ b/include/uapi/rdma/rdma_user_ioctl_cmds.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RDMA_USER_IOCTL_CMDS_H +#define RDMA_USER_IOCTL_CMDS_H + +#include +#include + +/* Documentation/ioctl/ioctl-number.txt */ +#define RDMA_IOCTL_MAGIC 0x1b +#define RDMA_VERBS_IOCTL \ + _IOWR(RDMA_IOCTL_MAGIC, 1, struct ib_uverbs_ioctl_hdr) + +enum { + /* User input */ + UVERBS_ATTR_F_MANDATORY = 1U << 0, + /* + * Valid output bit should be ignored and considered set in + * mandatory fields. This bit is kernel output. 
+ */ + UVERBS_ATTR_F_VALID_OUTPUT = 1U << 1, +}; + +struct ib_uverbs_attr { + __u16 attr_id; /* command specific type attribute */ + __u16 len; /* only for pointers */ + __u16 flags; /* combination of UVERBS_ATTR_F_XXXX */ + __u16 reserved; + __aligned_u64 data; /* ptr to command, inline data or idr/fd */ +}; + +struct ib_uverbs_ioctl_hdr { + __u16 length; + __u16 object_id; + __u16 method_id; + __u16 num_attrs; + __aligned_u64 reserved; + struct ib_uverbs_attr attrs[0]; +}; + +#endif From 0ede73bc012c98fba244b33efbc42e48dd23ee9a Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Mon, 19 Mar 2018 15:02:34 +0200 Subject: [PATCH 121/199] IB/uverbs: Extend uverbs_ioctl header with driver_id Extending uverbs_ioctl header with driver_id and another reserved field. driver_id should be used in order to identify the driver. Since every driver could have its own parsing tree, this is necessary for strace support. Downstream patches take off the EXPERIMENTAL flag from the ioctl() IB support and thus we add some reserved fields for future usage. Reviewed-by: Yishai Hadas Signed-off-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_ioctl.c | 5 +++- drivers/infiniband/hw/bnxt_re/main.c | 1 + drivers/infiniband/hw/cxgb3/iwch_provider.c | 1 + drivers/infiniband/hw/cxgb4/provider.c | 1 + drivers/infiniband/hw/hfi1/verbs.c | 2 +- drivers/infiniband/hw/hns/hns_roce_main.c | 1 + drivers/infiniband/hw/i40iw/i40iw_verbs.c | 1 + drivers/infiniband/hw/mlx4/main.c | 1 + drivers/infiniband/hw/mlx5/main.c | 1 + drivers/infiniband/hw/mthca/mthca_provider.c | 1 + drivers/infiniband/hw/nes/nes_verbs.c | 1 + drivers/infiniband/hw/ocrdma/ocrdma_main.c | 1 + drivers/infiniband/hw/qedr/main.c | 1 + drivers/infiniband/hw/qib/qib_verbs.c | 2 +- drivers/infiniband/hw/usnic/usnic_ib_main.c | 1 + .../infiniband/hw/vmw_pvrdma/pvrdma_main.c | 1 + drivers/infiniband/sw/rdmavt/vt.c | 3 ++- drivers/infiniband/sw/rxe/rxe_verbs.c | 1 + include/rdma/ib_verbs.h | 2 ++ include/rdma/rdma_vt.h | 2 +- include/uapi/rdma/rdma_user_ioctl_cmds.h | 24 ++++++++++++++++++- 21 files changed, 48 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 339b85145044..7016e729f139 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -246,6 +246,9 @@ static long ib_uverbs_cmd_verbs(struct ib_device *ib_dev, size_t ctx_size; uintptr_t data[UVERBS_OPTIMIZE_USING_STACK_SZ / sizeof(uintptr_t)]; + if (hdr->driver_id != ib_dev->driver_id) + return -EINVAL; + object_spec = uverbs_get_object(ib_dev, hdr->object_id); if (!object_spec) return -EPROTONOSUPPORT; @@ -350,7 +353,7 @@ long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) goto out; } - if (hdr.reserved) { + if (hdr.reserved1 || hdr.reserved2) { err = -EPROTONOSUPPORT; goto out; } diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index f6e361750466..abe0be8b5ddc 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -619,6 +619,7 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) ibdev->get_hw_stats = bnxt_re_ib_get_hw_stats; ibdev->alloc_hw_stats = bnxt_re_ib_alloc_hw_stats; + ibdev->driver_id = RDMA_DRIVER_BNXT_RE; return ib_register_device(ibdev, NULL); } diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index a578ca559e11..1804b6c4a6ec 100644 --- 
a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -1439,6 +1439,7 @@ int iwch_register_device(struct iwch_dev *dev) memcpy(dev->ibdev.iwcm->ifname, dev->rdev.t3cdev_p->lldev->name, sizeof(dev->ibdev.iwcm->ifname)); + dev->ibdev.driver_id = RDMA_DRIVER_CXGB3; ret = ib_register_device(&dev->ibdev, NULL); if (ret) goto bail1; diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 42568a4df3f8..dc4eabd85f54 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -629,6 +629,7 @@ void c4iw_register_device(struct work_struct *work) memcpy(dev->ibdev.iwcm->ifname, dev->rdev.lldi.ports[0]->name, sizeof(dev->ibdev.iwcm->ifname)); + dev->ibdev.driver_id = RDMA_DRIVER_CXGB4; ret = ib_register_device(&dev->ibdev, NULL); if (ret) goto err_kfree_iwcm; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 471d55c50066..c8cf4d4984d3 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1960,7 +1960,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) i, ppd->pkeys); - ret = rvt_register_device(&dd->verbs_dev.rdi); + ret = rvt_register_device(&dd->verbs_dev.rdi, RDMA_DRIVER_HFI1); if (ret) goto err_verbs_txreq; diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 6e48b1f507cf..83e21f696bbc 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -526,6 +526,7 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) /* OTHERS */ ib_dev->get_port_immutable = hns_roce_port_immutable; + ib_dev->driver_id = RDMA_DRIVER_HNS; ret = ib_register_device(ib_dev, NULL); if (ret) { dev_err(dev, "ib_register_device failed!\n"); diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 60e004d2100e..40e4f5ab2b46 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -2927,6 +2927,7 @@ int i40iw_register_rdma_device(struct i40iw_device *iwdev) return -ENOMEM; iwibdev = iwdev->iwibdev; + iwibdev->ibdev.driver_id = RDMA_DRIVER_I40IW; ret = ib_register_device(&iwibdev->ibdev, NULL); if (ret) goto error; diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index b9befda1eb27..d1be3231f4f0 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2955,6 +2955,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) if (mlx4_ib_alloc_diag_counters(ibdev)) goto err_steer_free_bitmap; + ibdev->ib_dev.driver_id = RDMA_DRIVER_MLX4; if (ib_register_device(&ibdev->ib_dev, NULL)) goto err_diag_counters; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index d06aae9aa600..6b50711df786 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4779,6 +4779,7 @@ int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) dev->ib_dev.uverbs_ex_cmd_mask |= (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); + dev->ib_dev.driver_id = RDMA_DRIVER_MLX5; err = init_node_data(dev); if (err) diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 6fee7795d1c8..541f237965c7 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -1295,6 +1295,7 @@ int 
mthca_register_device(struct mthca_dev *dev) mutex_init(&dev->cap_mask_mutex); + dev->ib_dev.driver_id = RDMA_DRIVER_MTHCA; ret = ib_register_device(&dev->ib_dev, NULL); if (ret) return ret; diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 162475aeeedd..1040a6e34230 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -3854,6 +3854,7 @@ int nes_register_ofa_device(struct nes_ib_device *nesibdev) struct nes_adapter *nesadapter = nesdev->nesadapter; int i, ret; + nesvnic->nesibdev->ibdev.driver_id = RDMA_DRIVER_NES; ret = ib_register_device(&nesvnic->nesibdev->ibdev, NULL); if (ret) { return ret; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 42dc0de54cb8..4547aa28d4ae 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -215,6 +215,7 @@ static int ocrdma_register_device(struct ocrdma_dev *dev) dev->ibdev.destroy_srq = ocrdma_destroy_srq; dev->ibdev.post_srq_recv = ocrdma_post_srq_recv; } + dev->ibdev.driver_id = RDMA_DRIVER_OCRDMA; return ib_register_device(&dev->ibdev, NULL); } diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index db4bf97c0e15..f865c0991ad9 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -257,6 +257,7 @@ static int qedr_register_device(struct qedr_dev *dev) dev->ibdev.get_link_layer = qedr_link_layer; dev->ibdev.get_dev_fw_str = qedr_get_dev_fw_str; + dev->ibdev.driver_id = RDMA_DRIVER_QEDR; return ib_register_device(&dev->ibdev, NULL); } diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index fabee760407e..3977abbc83ad 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1646,7 +1646,7 @@ int qib_register_ib_device(struct qib_devdata *dd) dd->rcd[ctxt]->pkeys); } - ret = rvt_register_device(&dd->verbs_dev.rdi); + ret = rvt_register_device(&dd->verbs_dev.rdi, RDMA_DRIVER_QIB); if (ret) goto err_tx; diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index f45e99a938e0..aed1ca390e30 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -433,6 +433,7 @@ static void *usnic_ib_device_add(struct pci_dev *dev) us_ibdev->ib_dev.get_dev_fw_str = usnic_get_dev_fw_str; + us_ibdev->ib_dev.driver_id = RDMA_DRIVER_USNIC; if (ib_register_device(&us_ibdev->ib_dev, NULL)) goto err_fwd_dealloc; diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index d650a9fcde24..4834460e2a0b 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -276,6 +276,7 @@ static int pvrdma_register_device(struct pvrdma_dev *dev) if (!dev->srq_tbl) goto err_qp_free; } + dev->ib_dev.driver_id = RDMA_DRIVER_VMW_PVRDMA; spin_lock_init(&dev->srq_tbl_lock); ret = ib_register_device(&dev->ib_dev, NULL); diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index a67b0ddc2230..434199d0bc96 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -730,7 +730,7 @@ static noinline int check_support(struct rvt_dev_info *rdi, int verb) * * Return: 0 on success otherwise an errno. 
*/ -int rvt_register_device(struct rvt_dev_info *rdi) +int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) { int ret = 0, i; @@ -831,6 +831,7 @@ int rvt_register_device(struct rvt_dev_info *rdi) rdi->ibdev.node_type = RDMA_NODE_IB_CA; rdi->ibdev.num_comp_vectors = 1; + rdi->ibdev.driver_id = driver_id; /* We are now good to announce we exist */ ret = ib_register_device(&rdi->ibdev, rdi->driver_f.port_callback); if (ret) { diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index ced79e49234b..5ef8c3333e43 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1335,6 +1335,7 @@ int rxe_register_device(struct rxe_dev *rxe) } rxe->tfm = tfm; + dev->driver_id = RDMA_DRIVER_RXE; err = ib_register_device(dev, NULL); if (err) { pr_warn("%s failed with error %d\n", __func__, err); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 3cc48f34e3e4..2357f2b29610 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -64,6 +64,7 @@ #include #include #include +#include #define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN @@ -2385,6 +2386,7 @@ struct ib_device { int comp_vector); struct uverbs_root_spec *specs_root; + enum rdma_driver_id driver_id; }; struct ib_client { diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 4118324a0310..3f4c187e435d 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -538,7 +538,7 @@ static inline void rvt_mod_retry_timer(struct rvt_qp *qp) struct rvt_dev_info *rvt_alloc_device(size_t size, int nports); void rvt_dealloc_device(struct rvt_dev_info *rdi); -int rvt_register_device(struct rvt_dev_info *rvd); +int rvt_register_device(struct rvt_dev_info *rvd, u32 driver_id); void rvt_unregister_device(struct rvt_dev_info *rvd); int rvt_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr); int rvt_init_port(struct rvt_dev_info *rdi, struct rvt_ibport *port, diff --git a/include/uapi/rdma/rdma_user_ioctl_cmds.h b/include/uapi/rdma/rdma_user_ioctl_cmds.h index aa1fffe3620b..40063cf970aa 100644 --- a/include/uapi/rdma/rdma_user_ioctl_cmds.h +++ b/include/uapi/rdma/rdma_user_ioctl_cmds.h @@ -64,8 +64,30 @@ struct ib_uverbs_ioctl_hdr { __u16 object_id; __u16 method_id; __u16 num_attrs; - __aligned_u64 reserved; + __aligned_u64 reserved1; + __u32 driver_id; + __u32 reserved2; struct ib_uverbs_attr attrs[0]; }; +enum rdma_driver_id { + RDMA_DRIVER_UNKNOWN, + RDMA_DRIVER_MLX5, + RDMA_DRIVER_MLX4, + RDMA_DRIVER_CXGB3, + RDMA_DRIVER_CXGB4, + RDMA_DRIVER_MTHCA, + RDMA_DRIVER_BNXT_RE, + RDMA_DRIVER_OCRDMA, + RDMA_DRIVER_NES, + RDMA_DRIVER_I40IW, + RDMA_DRIVER_VMW_PVRDMA, + RDMA_DRIVER_QEDR, + RDMA_DRIVER_HNS, + RDMA_DRIVER_USNIC, + RDMA_DRIVER_RXE, + RDMA_DRIVER_HFI1, + RDMA_DRIVER_QIB, +}; + #endif From 1f07e08fab2e895c68d4eb5a519c36be75a12078 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Mon, 19 Mar 2018 15:02:35 +0200 Subject: [PATCH 122/199] IB/uverbs: Enable compact representation of uverbs_attr_spec Downstream patches extend uverbs_attr_spec with new fields. In order to save space, we move the type and flags fields to the various attribute flavors contained in the union. 
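As illustration, a minimal userspace sketch of the resulting layout (the names here are made up, not the kernel's): repeating the type/flags header in every union member lets generic code read it through whichever flavor is active (C's common-initial-sequence rule), while the flavor-specific fields overlap in the same storage.

#include <stdint.h>
#include <stdio.h>

enum attr_type { ATTR_PTR_IN, ATTR_PTR_OUT, ATTR_IDR };

struct attr_spec {
	union {
		/* header shared by all union members - to reduce space */
		struct {
			uint8_t type;
			uint8_t flags;
		};
		struct {
			uint8_t type;
			uint8_t flags;
			uint16_t len;		/* pointer attributes only */
		} ptr;
		struct {
			uint8_t type;
			uint8_t flags;
			uint16_t obj_type;	/* object attributes only */
			uint8_t access;
		} obj;
	};
};

int main(void)
{
	struct attr_spec spec = { .ptr = { ATTR_PTR_IN, 0, 16 } };

	/* generic code reads the shared header through any member */
	printf("type=%u flags=%u\n", (unsigned)spec.type, (unsigned)spec.flags);
	/* flavor-specific code uses the rest of the overlay */
	printf("ptr.len=%u size=%zu\n", (unsigned)spec.ptr.len, sizeof(spec));
	return 0;
}

Because the per-flavor structs overlap, fields that only the ptr flavor needs (such as the min_len a later patch in this series adds) cost no space in the obj flavor.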
Reviewed-by: Yishai Hadas Signed-off-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_ioctl.c | 4 +-- include/rdma/uverbs_ioctl.h | 34 +++++++++++++++++--------- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 7016e729f139..82a1775ba657 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -69,9 +69,9 @@ static int uverbs_process_attr(struct ib_device *ibdev, switch (spec->type) { case UVERBS_ATTR_TYPE_PTR_IN: case UVERBS_ATTR_TYPE_PTR_OUT: - if (uattr->len < spec->len || + if (uattr->len < spec->ptr.len || (!(spec->flags & UVERBS_ATTR_SPEC_F_MIN_SZ) && - uattr->len > spec->len)) + uattr->len > spec->ptr.len)) return -EINVAL; e->ptr_attr.data = uattr->data; diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index c0be2b5f6a1e..cd7c3e40c6cc 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -66,11 +66,25 @@ enum { UVERBS_ATTR_SPEC_F_MIN_SZ = 1U << 1, }; +/* Specification of a single attribute inside the ioctl message */ struct uverbs_attr_spec { - enum uverbs_attr_type type; union { - u16 len; + /* Header shared by all following union members - to reduce space. */ struct { + enum uverbs_attr_type type; + /* Combination of bits from enum UVERBS_ATTR_SPEC_F_XXXX */ + u8 flags; + }; + struct { + enum uverbs_attr_type type; + /* Combination of bits from enum UVERBS_ATTR_SPEC_F_XXXX */ + u8 flags; + u16 len; + } ptr; + struct { + enum uverbs_attr_type type; + /* Combination of bits from enum UVERBS_ATTR_SPEC_F_XXXX */ + u8 flags; /* * higher bits mean the namespace and lower bits mean * the type id within the namespace. @@ -79,8 +93,6 @@ struct uverbs_attr_spec { u8 access; } obj; }; - /* Combination of bits from enum UVERBS_ATTR_SPEC_F_XXXX */ - u8 flags; }; struct uverbs_attr_spec_hash { @@ -167,10 +179,10 @@ struct uverbs_object_tree_def { #define UA_FLAGS(_flags) .flags = _flags #define __UVERBS_ATTR0(_id, _len, _type, ...) \ ((const struct uverbs_attr_def) \ - {.id = _id, .attr = {.type = _type, {.len = _len}, .flags = 0, } }) + {.id = _id, .attr = {{.ptr = {.type = _type, .len = _len, .flags = 0, } }, } }) #define __UVERBS_ATTR1(_id, _len, _type, _flags) \ ((const struct uverbs_attr_def) \ - {.id = _id, .attr = {.type = _type, {.len = _len}, _flags, } }) + {.id = _id, .attr = {{.ptr = {.type = _type, .len = _len, _flags } },} }) #define __UVERBS_ATTR(_id, _len, _type, _flags, _n, ...) \ __UVERBS_ATTR##_n(_id, _len, _type, _flags) /* @@ -203,15 +215,13 @@ struct uverbs_object_tree_def { #define ___UVERBS_ATTR_OBJ0(_id, _obj_class, _obj_type, _access, ...)\ ((const struct uverbs_attr_def) \ {.id = _id, \ - .attr = {.type = _obj_class, \ - {.obj = {.obj_type = _obj_type, .access = _access } },\ - .flags = 0} }) + .attr = { {.obj = {.type = _obj_class, .obj_type = _obj_type, \ + .access = _access, .flags = 0 } }, } }) #define ___UVERBS_ATTR_OBJ1(_id, _obj_class, _obj_type, _access, _flags)\ ((const struct uverbs_attr_def) \ {.id = _id, \ - .attr = {.type = _obj_class, \ - {.obj = {.obj_type = _obj_type, .access = _access} }, \ - _flags} }) + .attr = { {.obj = {.type = _obj_class, .obj_type = _obj_type, \ + .access = _access, _flags} }, } }) #define ___UVERBS_ATTR_OBJ(_id, _obj_class, _obj_type, _access, _flags, \ _n, ...) 
\ ___UVERBS_ATTR_OBJ##_n(_id, _obj_class, _obj_type, _access, _flags) From c66db31113948ba61682f55265df8d032e793fcc Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Mon, 19 Mar 2018 15:02:36 +0200 Subject: [PATCH 123/199] IB/uverbs: Safely extend existing attributes Previously, we used UVERBS_ATTR_SPEC_F_MIN_SZ for extending existing attributes. The behavior of this flag was that the kernel accepted anything bigger than the minimum size it specified. This is unsafe, since in order to safely extend an attribute, we need to make sure the unknown size is zeroed. Replace UVERBS_ATTR_SPEC_F_MIN_SZ with UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO, which essentially checks that the unknown size is zero. In addition, attributes are now decorated with UVERBS_ATTR_TYPE and UVERBS_ATTR_STRUCT, so we can provide the minimum and known length. Users of this flag need to use the copy_from_or_zero functions/macros. Reviewed-by: Yishai Hadas Signed-off-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_ioctl.c | 26 ++++++- drivers/infiniband/core/uverbs_ioctl_merge.c | 2 +- drivers/infiniband/core/uverbs_std_types.c | 20 +++--- include/rdma/ib_verbs.h | 13 ++-- include/rdma/uverbs_ioctl.h | 74 +++++++++++++++----- 5 files changed, 104 insertions(+), 31 deletions(-) diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 82a1775ba657..1e6bf2488584 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -35,6 +35,17 @@ #include "rdma_core.h" #include "uverbs.h" +static bool uverbs_is_attr_cleared(const struct ib_uverbs_attr *uattr, + u16 len) +{ + if (uattr->len > sizeof(((struct ib_uverbs_attr *)0)->data)) + return ib_is_buffer_cleared(u64_to_user_ptr(uattr->data) + len, + uattr->len - len); + + return !memchr_inv((const void *)&uattr->data + len, + 0, uattr->len - len); +} + static int uverbs_process_attr(struct ib_device *ibdev, struct ib_ucontext *ucontext, const struct ib_uverbs_attr *uattr, @@ -68,9 +79,20 @@ static int uverbs_process_attr(struct ib_device *ibdev, switch (spec->type) { case UVERBS_ATTR_TYPE_PTR_IN: + /* Ensure that any data provided by userspace beyond the known + * struct is zero. Userspace that knows how to use some future + * longer struct will fail here if used with an old kernel and + * non-zero content, making ABI compat/discovery simpler. 
+ */ + if (uattr->len > spec->ptr.len && + spec->flags & UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO && + !uverbs_is_attr_cleared(uattr, spec->ptr.len)) + return -EOPNOTSUPP; + + /* fall through */ case UVERBS_ATTR_TYPE_PTR_OUT: - if (uattr->len < spec->ptr.len || - (!(spec->flags & UVERBS_ATTR_SPEC_F_MIN_SZ) && + if (uattr->len < spec->ptr.min_len || + (!(spec->flags & UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO) && uattr->len > spec->ptr.len)) return -EINVAL; diff --git a/drivers/infiniband/core/uverbs_ioctl_merge.c b/drivers/infiniband/core/uverbs_ioctl_merge.c index 62e1eb1d2a28..0f88a1919d51 100644 --- a/drivers/infiniband/core/uverbs_ioctl_merge.c +++ b/drivers/infiniband/core/uverbs_ioctl_merge.c @@ -379,7 +379,7 @@ static struct uverbs_method_spec *build_method_with_attrs(const struct uverbs_me "ib_uverbs: Tried to merge attr (%d) but it's an object with new/destroy access but isn't mandatory\n", min_id) || WARN(IS_ATTR_OBJECT(attr) && - attr->flags & UVERBS_ATTR_SPEC_F_MIN_SZ, + attr->flags & UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO, "ib_uverbs: Tried to merge attr (%d) but it's an object with min_sz flag\n", min_id)) { res = -EINVAL; diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index e4a4b184a6bc..0a2d8532de21 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -216,9 +216,11 @@ static int uverbs_hot_unplug_completion_event_file(struct ib_uobject_file *uobj_ * spec. */ static const struct uverbs_attr_def uverbs_uhw_compat_in = - UVERBS_ATTR_PTR_IN_SZ(UVERBS_ATTR_UHW_IN, 0, UA_FLAGS(UVERBS_ATTR_SPEC_F_MIN_SZ)); + UVERBS_ATTR_PTR_IN_SZ(UVERBS_ATTR_UHW_IN, UVERBS_ATTR_SIZE(0, USHRT_MAX), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO)); static const struct uverbs_attr_def uverbs_uhw_compat_out = - UVERBS_ATTR_PTR_OUT_SZ(UVERBS_ATTR_UHW_OUT, 0, UA_FLAGS(UVERBS_ATTR_SPEC_F_MIN_SZ)); + UVERBS_ATTR_PTR_OUT_SZ(UVERBS_ATTR_UHW_OUT, UVERBS_ATTR_SIZE(0, USHRT_MAX), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO)); static void create_udata(struct uverbs_attr_bundle *ctx, struct ib_udata *udata) @@ -350,17 +352,19 @@ static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_CQ_CREATE, &UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_CQ_HANDLE, UVERBS_OBJECT_CQ, UVERBS_ACCESS_NEW, UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_CQE, u32, + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_CQE, + UVERBS_ATTR_TYPE(u32), UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_USER_HANDLE, u64, + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_USER_HANDLE, + UVERBS_ATTR_TYPE(u64), UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), &UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL, UVERBS_OBJECT_COMP_CHANNEL, UVERBS_ACCESS_READ), - &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_COMP_VECTOR, u32, + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_COMP_VECTOR, UVERBS_ATTR_TYPE(u32), UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_FLAGS, u32), - &UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_CQ_RESP_CQE, u32, + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_FLAGS, UVERBS_ATTR_TYPE(u32)), + &UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_CQ_RESP_CQE, UVERBS_ATTR_TYPE(u32), UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), &uverbs_uhw_compat_in, &uverbs_uhw_compat_out); @@ -394,7 +398,7 @@ static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_CQ_DESTROY, UVERBS_ACCESS_DESTROY, UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), &UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_DESTROY_CQ_RESP, - struct ib_uverbs_destroy_cq_resp, + 
UVERBS_ATTR_TYPE(struct ib_uverbs_destroy_cq_resp), UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_COMP_CHANNEL, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 2357f2b29610..e9288d0f627e 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2446,11 +2446,9 @@ static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0; } -static inline bool ib_is_udata_cleared(struct ib_udata *udata, - size_t offset, - size_t len) +static inline bool ib_is_buffer_cleared(const void __user *p, + size_t len) { - const void __user *p = udata->inbuf + offset; bool ret; u8 *buf; @@ -2466,6 +2464,13 @@ static inline bool ib_is_udata_cleared(struct ib_udata *udata, return ret; } +static inline bool ib_is_udata_cleared(struct ib_udata *udata, + size_t offset, + size_t len) +{ + return ib_is_buffer_cleared(udata->inbuf + offset, len); +} + /** * ib_modify_qp_is_ok - Check that the supplied attribute mask * contains all required attributes and no attributes not allowed for diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index cd7c3e40c6cc..c4ee65b20bb7 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -62,8 +62,8 @@ enum uverbs_obj_access { enum { UVERBS_ATTR_SPEC_F_MANDATORY = 1U << 0, - /* Support extending attributes by length */ - UVERBS_ATTR_SPEC_F_MIN_SZ = 1U << 1, + /* Support extending attributes by length, validate all unknown size == zero */ + UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO = 1U << 1, }; /* Specification of a single attribute inside the ioctl message */ @@ -79,7 +79,10 @@ struct uverbs_attr_spec { enum uverbs_attr_type type; /* Combination of bits from enum UVERBS_ATTR_SPEC_F_XXXX */ u8 flags; + /* Current known size to kernel */ u16 len; + /* User isn't allowed to provide something < min_len */ + u16 min_len; } ptr; struct { enum uverbs_attr_type type; @@ -177,30 +180,41 @@ struct uverbs_object_tree_def { }; #define UA_FLAGS(_flags) .flags = _flags -#define __UVERBS_ATTR0(_id, _len, _type, ...) \ +#define __UVERBS_ATTR0(_id, _type, _fld, _attr, ...) \ ((const struct uverbs_attr_def) \ - {.id = _id, .attr = {{.ptr = {.type = _type, .len = _len, .flags = 0, } }, } }) -#define __UVERBS_ATTR1(_id, _len, _type, _flags) \ + {.id = _id, .attr = {{._fld = {.type = _type, _attr, .flags = 0, } }, } }) +#define __UVERBS_ATTR1(_id, _type, _fld, _attr, _extra1, ...) \ ((const struct uverbs_attr_def) \ - {.id = _id, .attr = {{.ptr = {.type = _type, .len = _len, _flags } },} }) -#define __UVERBS_ATTR(_id, _len, _type, _flags, _n, ...) \ - __UVERBS_ATTR##_n(_id, _len, _type, _flags) + {.id = _id, .attr = {{._fld = {.type = _type, _attr, _extra1 } },} }) +#define __UVERBS_ATTR2(_id, _type, _fld, _attr, _extra1, _extra2) \ + ((const struct uverbs_attr_def) \ + {.id = _id, .attr = {{._fld = {.type = _type, _attr, _extra1, _extra2 } },} }) +#define __UVERBS_ATTR(_id, _type, _fld, _attr, _extra1, _extra2, _n, ...) 
\ + __UVERBS_ATTR##_n(_id, _type, _fld, _attr, _extra1, _extra2) + +#define UVERBS_ATTR_TYPE(_type) \ + .min_len = sizeof(_type), .len = sizeof(_type) +#define UVERBS_ATTR_STRUCT(_type, _last) \ + .min_len = ((uintptr_t)(&((_type *)0)->_last + 1)), .len = sizeof(_type) +#define UVERBS_ATTR_SIZE(_min_len, _len) \ + .min_len = _min_len, .len = _len + /* * In new compiler, UVERBS_ATTR could be simplified by declaring it as * [_id] = {.type = _type, .len = _len, ##__VA_ARGS__} * But since we support older compilers too, we need the more complex code. */ -#define UVERBS_ATTR(_id, _len, _type, ...) \ - __UVERBS_ATTR(_id, _len, _type, ##__VA_ARGS__, 1, 0) +#define UVERBS_ATTR(_id, _type, _fld, _attr, ...) \ + __UVERBS_ATTR(_id, _type, _fld, _attr, ##__VA_ARGS__, 2, 1, 0) #define UVERBS_ATTR_PTR_IN_SZ(_id, _len, ...) \ - UVERBS_ATTR(_id, _len, UVERBS_ATTR_TYPE_PTR_IN, ##__VA_ARGS__) + UVERBS_ATTR(_id, UVERBS_ATTR_TYPE_PTR_IN, ptr, _len, ##__VA_ARGS__) /* If sizeof(_type) <= sizeof(u64), this will be inlined rather than a pointer */ #define UVERBS_ATTR_PTR_IN(_id, _type, ...) \ - UVERBS_ATTR_PTR_IN_SZ(_id, sizeof(_type), ##__VA_ARGS__) + UVERBS_ATTR_PTR_IN_SZ(_id, _type, ##__VA_ARGS__) #define UVERBS_ATTR_PTR_OUT_SZ(_id, _len, ...) \ - UVERBS_ATTR(_id, _len, UVERBS_ATTR_TYPE_PTR_OUT, ##__VA_ARGS__) + UVERBS_ATTR(_id, UVERBS_ATTR_TYPE_PTR_OUT, ptr, _len, ##__VA_ARGS__) #define UVERBS_ATTR_PTR_OUT(_id, _type, ...) \ - UVERBS_ATTR_PTR_OUT_SZ(_id, sizeof(_type), ##__VA_ARGS__) + UVERBS_ATTR_PTR_OUT_SZ(_id, _type, ##__VA_ARGS__) /* * In new compiler, UVERBS_ATTR_IDR (and FD) could be simplified by declaring @@ -396,8 +410,8 @@ static inline int _uverbs_copy_from(void *to, /* * Validation ensures attr->ptr_attr.len >= size. If the caller is - * using UVERBS_ATTR_SPEC_F_MIN_SZ then it must call copy_from with - * the right size. + * using UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO then it must call + * uverbs_copy_from_or_zero. */ if (unlikely(size < attr->ptr_attr.len)) return -EINVAL; @@ -411,9 +425,37 @@ static inline int _uverbs_copy_from(void *to, return 0; } +static inline int _uverbs_copy_from_or_zero(void *to, + const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, + size_t size) +{ + const struct uverbs_attr *attr = uverbs_attr_get(attrs_bundle, idx); + size_t min_size; + + if (IS_ERR(attr)) + return PTR_ERR(attr); + + min_size = min_t(size_t, size, attr->ptr_attr.len); + + if (uverbs_attr_ptr_is_inline(attr)) + memcpy(to, &attr->ptr_attr.data, min_size); + else if (copy_from_user(to, u64_to_user_ptr(attr->ptr_attr.data), + min_size)) + return -EFAULT; + + if (size > min_size) + memset(to + min_size, 0, size - min_size); + + return 0; +} + #define uverbs_copy_from(to, attrs_bundle, idx) \ _uverbs_copy_from(to, attrs_bundle, idx, sizeof(*to)) +#define uverbs_copy_from_or_zero(to, attrs_bundle, idx) \ + _uverbs_copy_from_or_zero(to, attrs_bundle, idx, sizeof(*to)) + /* ================================================= * Definitions -> Specs infrastructure * ================================================= From dfb1395573c8726353f8cca1c123b46292d18822 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Mon, 19 Mar 2018 15:02:37 +0200 Subject: [PATCH 124/199] IB/uverbs: Expose parsing tree of all common objects to providers The ioctl() based uverbs is based on merging feature trees. This teaches the generic parser how to parse methods according to the provider's support. In order to support merging with the common objects, exporting the common-object-tree to the provider drivers. 
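A hedged sketch of the provider-side usage this enables; it assumes the uverbs_alloc_spec_tree() merge entry point from uverbs_ioctl_merge.c, and the function name and driver tree below are illustrative rather than a verbatim driver excerpt.

#include <rdma/ib_verbs.h>
#include <rdma/uverbs_ioctl.h>
#include <rdma/uverbs_std_types.h>

/*
 * A provider hands the common object tree plus its own additions to
 * the merge code and hangs the result on ib_dev->specs_root before
 * calling ib_register_device().
 */
static int example_populate_specs_root(struct ib_device *ibdev,
				       const struct uverbs_object_tree_def *drv_tree)
{
	const struct uverbs_object_tree_def *trees[2];
	unsigned int num_trees = 0;

	trees[num_trees++] = uverbs_default_get_objects(); /* common objects */
	if (drv_tree)
		trees[num_trees++] = drv_tree; /* provider-specific methods */

	ibdev->specs_root = uverbs_alloc_spec_tree(num_trees, trees);
	return PTR_ERR_OR_ZERO(ibdev->specs_root);
}

The merge is what lets a provider attach its own methods and attributes to the common objects instead of redeclaring them.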
Reviewed-by: Yishai Hadas Signed-off-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs.h | 15 ++++++++++ drivers/infiniband/core/uverbs_std_types.c | 34 +++++++++++++--------- include/rdma/uverbs_std_types.h | 23 ++------------- 3 files changed, 37 insertions(+), 35 deletions(-) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 0551e724c431..340fc23dc315 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -46,6 +46,7 @@ #include #include #include +#include #define UVERBS_MODULE_NAME ib_uverbs #include @@ -250,6 +251,20 @@ struct ib_uverbs_flow_spec { }; }; +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_DEVICE); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_PD); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_MR); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_COMP_CHANNEL); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_CQ); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_QP); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_AH); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_MW); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_SRQ); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_FLOW); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_WQ); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_XRCD); + #define IB_UVERBS_DECLARE_CMD(name) \ ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \ struct ib_device *ib_dev, \ diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index 0a2d8532de21..9d4a0bc904dd 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -453,17 +453,23 @@ DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_PD, DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_DEVICE, NULL); -DECLARE_UVERBS_OBJECT_TREE(uverbs_default_objects, - &UVERBS_OBJECT(UVERBS_OBJECT_DEVICE), - &UVERBS_OBJECT(UVERBS_OBJECT_PD), - &UVERBS_OBJECT(UVERBS_OBJECT_MR), - &UVERBS_OBJECT(UVERBS_OBJECT_COMP_CHANNEL), - &UVERBS_OBJECT(UVERBS_OBJECT_CQ), - &UVERBS_OBJECT(UVERBS_OBJECT_QP), - &UVERBS_OBJECT(UVERBS_OBJECT_AH), - &UVERBS_OBJECT(UVERBS_OBJECT_MW), - &UVERBS_OBJECT(UVERBS_OBJECT_SRQ), - &UVERBS_OBJECT(UVERBS_OBJECT_FLOW), - &UVERBS_OBJECT(UVERBS_OBJECT_WQ), - &UVERBS_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL), - &UVERBS_OBJECT(UVERBS_OBJECT_XRCD)); +static DECLARE_UVERBS_OBJECT_TREE(uverbs_default_objects, + &UVERBS_OBJECT(UVERBS_OBJECT_DEVICE), + &UVERBS_OBJECT(UVERBS_OBJECT_PD), + &UVERBS_OBJECT(UVERBS_OBJECT_MR), + &UVERBS_OBJECT(UVERBS_OBJECT_COMP_CHANNEL), + &UVERBS_OBJECT(UVERBS_OBJECT_CQ), + &UVERBS_OBJECT(UVERBS_OBJECT_QP), + &UVERBS_OBJECT(UVERBS_OBJECT_AH), + &UVERBS_OBJECT(UVERBS_OBJECT_MW), + &UVERBS_OBJECT(UVERBS_OBJECT_SRQ), + &UVERBS_OBJECT(UVERBS_OBJECT_FLOW), + &UVERBS_OBJECT(UVERBS_OBJECT_WQ), + &UVERBS_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL), + &UVERBS_OBJECT(UVERBS_OBJECT_XRCD)); + +const struct uverbs_object_tree_def *uverbs_default_get_objects(void) +{ + return &uverbs_default_objects; +} +EXPORT_SYMBOL_GPL(uverbs_default_get_objects); diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h index 45ee7d1bfa32..9d56cdb84655 100644 --- 
a/include/rdma/uverbs_std_types.h +++ b/include/rdma/uverbs_std_types.h @@ -37,29 +37,10 @@ #include #include -#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) - #define UVERBS_OBJECT(id) uverbs_object_##id -extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_DEVICE); -extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_PD); -extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_MR); -extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_COMP_CHANNEL); -extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_CQ); -extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_QP); -extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_AH); -extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_MW); -extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_SRQ); -extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_FLOW); -extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_WQ); -extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL); -extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_XRCD); - -extern const struct uverbs_object_tree_def uverbs_default_objects; -static inline const struct uverbs_object_tree_def *uverbs_default_get_objects(void) -{ - return &uverbs_default_objects; -} +#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) +const struct uverbs_object_tree_def *uverbs_default_get_objects(void); #else static inline const struct uverbs_object_tree_def *uverbs_default_get_objects(void) { From 41b2a71fc848e200e023b7ccd502c3b96714248d Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Mon, 19 Mar 2018 15:02:38 +0200 Subject: [PATCH 125/199] IB/uverbs: Move ioctl path of create_cq and destroy_cq to a new file Currently, all objects are declared in uverbs_std_types. This could lead to a huge file once we implement all objects, methods and handlers. Moving each object to its own file to keep the files smaller and more readable. uverbs_std_types.c will only contain the parsing tree definition and objects without any methods. 
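A rough skeleton of the per-object file layout this establishes (uverbs_std_types_<obj>.c), sketched with a hypothetical FOO object; the IDs, handler body and free callback below are placeholders, and real IDs would be added to include/uapi/rdma/ib_user_ioctl_cmds.h.

#include <rdma/uverbs_std_types.h>
#include "rdma_core.h"
#include "uverbs.h"	/* defines UVERBS_MODULE_NAME for the NAMED macros */

/* Placeholder IDs; the real ones belong in the uapi enums. */
enum { UVERBS_OBJECT_FOO = UVERBS_OBJECT_WQ + 1 };
enum { UVERBS_METHOD_FOO_CREATE };
enum { UVERBS_ATTR_CREATE_FOO_HANDLE };

static int uverbs_free_foo(struct ib_uobject *uobject,
			   enum rdma_remove_reason why)
{
	/* tear down the hw object hanging off uobject->object */
	return 0;
}

static int UVERBS_HANDLER(UVERBS_METHOD_FOO_CREATE)(struct ib_device *ib_dev,
						    struct ib_uverbs_file *file,
						    struct uverbs_attr_bundle *attrs)
{
	/* parse attrs, create the object, fill the response attrs */
	return 0;
}

static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_FOO_CREATE,
	&UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_FOO_HANDLE, UVERBS_OBJECT_FOO,
			 UVERBS_ACCESS_NEW,
			 UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)));

DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_FOO,
	&UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_foo),
	&UVERBS_METHOD(UVERBS_METHOD_FOO_CREATE));

The object's extern declaration already sits in uverbs.h (added by the previous patch), so a new file only needs to be added to ib_uverbs-y in the Makefile, as the diff below does for the CQ object.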
Signed-off-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/Makefile | 2 +- drivers/infiniband/core/uverbs.h | 3 + drivers/infiniband/core/uverbs_std_types.c | 179 +-------------- drivers/infiniband/core/uverbs_std_types_cq.c | 208 ++++++++++++++++++ include/rdma/uverbs_ioctl.h | 2 + 5 files changed, 217 insertions(+), 177 deletions(-) create mode 100644 drivers/infiniband/core/uverbs_std_types_cq.c diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index f69833db0a32..4d6260fd2f52 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -34,4 +34,4 @@ ib_ucm-y := ucm.o ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ rdma_core.o uverbs_std_types.o uverbs_ioctl.o \ - uverbs_ioctl_merge.o + uverbs_ioctl_merge.o uverbs_std_types_cq.o diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 340fc23dc315..d20828afa05c 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -230,6 +230,9 @@ int uverbs_dealloc_mw(struct ib_mw *mw); void ib_uverbs_detach_umcast(struct ib_qp *qp, struct ib_uqp_object *uobj); +void create_udata(struct uverbs_attr_bundle *ctx, struct ib_udata *udata); +extern const struct uverbs_attr_def uverbs_uhw_compat_in; +extern const struct uverbs_attr_def uverbs_uhw_compat_out; long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); struct ib_uverbs_flow_spec { diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index 9d4a0bc904dd..2ed8d9203f3b 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -135,25 +135,6 @@ static int uverbs_free_srq(struct ib_uobject *uobject, return ret; } -static int uverbs_free_cq(struct ib_uobject *uobject, - enum rdma_remove_reason why) -{ - struct ib_cq *cq = uobject->object; - struct ib_uverbs_event_queue *ev_queue = cq->cq_context; - struct ib_ucq_object *ucq = - container_of(uobject, struct ib_ucq_object, uobject); - int ret; - - ret = ib_destroy_cq(cq); - if (!ret || why != RDMA_REMOVE_DESTROY) - ib_uverbs_release_ucq(uobject->context->ufile, ev_queue ? - container_of(ev_queue, - struct ib_uverbs_completion_event_file, - ev_queue) : NULL, - ucq); - return ret; -} - static int uverbs_free_mr(struct ib_uobject *uobject, enum rdma_remove_reason why) { @@ -215,15 +196,14 @@ static int uverbs_hot_unplug_completion_event_file(struct ib_uobject_file *uobj_ * legacy way. Every verb that could get driver specific data should get this * spec. */ -static const struct uverbs_attr_def uverbs_uhw_compat_in = +const struct uverbs_attr_def uverbs_uhw_compat_in = UVERBS_ATTR_PTR_IN_SZ(UVERBS_ATTR_UHW_IN, UVERBS_ATTR_SIZE(0, USHRT_MAX), UA_FLAGS(UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO)); -static const struct uverbs_attr_def uverbs_uhw_compat_out = +const struct uverbs_attr_def uverbs_uhw_compat_out = UVERBS_ATTR_PTR_OUT_SZ(UVERBS_ATTR_UHW_OUT, UVERBS_ATTR_SIZE(0, USHRT_MAX), UA_FLAGS(UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO)); -static void create_udata(struct uverbs_attr_bundle *ctx, - struct ib_udata *udata) +void create_udata(struct uverbs_attr_bundle *ctx, struct ib_udata *udata) { /* * This is for ease of conversion. 
The purpose is to convert all drivers @@ -255,152 +235,6 @@ static void create_udata(struct uverbs_attr_bundle *ctx, } } -static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(struct ib_device *ib_dev, - struct ib_uverbs_file *file, - struct uverbs_attr_bundle *attrs) -{ - struct ib_ucontext *ucontext = file->ucontext; - struct ib_ucq_object *obj; - struct ib_udata uhw; - int ret; - u64 user_handle; - struct ib_cq_init_attr attr = {}; - struct ib_cq *cq; - struct ib_uverbs_completion_event_file *ev_file = NULL; - const struct uverbs_attr *ev_file_attr; - struct ib_uobject *ev_file_uobj; - - if (!(ib_dev->uverbs_cmd_mask & 1ULL << IB_USER_VERBS_CMD_CREATE_CQ)) - return -EOPNOTSUPP; - - ret = uverbs_copy_from(&attr.comp_vector, attrs, - UVERBS_ATTR_CREATE_CQ_COMP_VECTOR); - if (!ret) - ret = uverbs_copy_from(&attr.cqe, attrs, - UVERBS_ATTR_CREATE_CQ_CQE); - if (!ret) - ret = uverbs_copy_from(&user_handle, attrs, - UVERBS_ATTR_CREATE_CQ_USER_HANDLE); - if (ret) - return ret; - - /* Optional param, if it doesn't exist, we get -ENOENT and skip it */ - if (uverbs_copy_from(&attr.flags, attrs, - UVERBS_ATTR_CREATE_CQ_FLAGS) == -EFAULT) - return -EFAULT; - - ev_file_attr = uverbs_attr_get(attrs, UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL); - if (!IS_ERR(ev_file_attr)) { - ev_file_uobj = ev_file_attr->obj_attr.uobject; - - ev_file = container_of(ev_file_uobj, - struct ib_uverbs_completion_event_file, - uobj_file.uobj); - uverbs_uobject_get(ev_file_uobj); - } - - if (attr.comp_vector >= ucontext->ufile->device->num_comp_vectors) { - ret = -EINVAL; - goto err_event_file; - } - - obj = container_of(uverbs_attr_get(attrs, - UVERBS_ATTR_CREATE_CQ_HANDLE)->obj_attr.uobject, - typeof(*obj), uobject); - obj->uverbs_file = ucontext->ufile; - obj->comp_events_reported = 0; - obj->async_events_reported = 0; - INIT_LIST_HEAD(&obj->comp_list); - INIT_LIST_HEAD(&obj->async_list); - - /* Temporary, only until drivers get the new uverbs_attr_bundle */ - create_udata(attrs, &uhw); - - cq = ib_dev->create_cq(ib_dev, &attr, ucontext, &uhw); - if (IS_ERR(cq)) { - ret = PTR_ERR(cq); - goto err_event_file; - } - - cq->device = ib_dev; - cq->uobject = &obj->uobject; - cq->comp_handler = ib_uverbs_comp_handler; - cq->event_handler = ib_uverbs_cq_event_handler; - cq->cq_context = ev_file ? 
&ev_file->ev_queue : NULL; - obj->uobject.object = cq; - obj->uobject.user_handle = user_handle; - atomic_set(&cq->usecnt, 0); - cq->res.type = RDMA_RESTRACK_CQ; - rdma_restrack_add(&cq->res); - - ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_CQ_RESP_CQE, &cq->cqe, - sizeof(cq->cqe)); - if (ret) - goto err_cq; - - return 0; -err_cq: - ib_destroy_cq(cq); - -err_event_file: - if (ev_file) - uverbs_uobject_put(ev_file_uobj); - return ret; -}; - -static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_CQ_CREATE, - &UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_CQ_HANDLE, UVERBS_OBJECT_CQ, - UVERBS_ACCESS_NEW, - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_CQE, - UVERBS_ATTR_TYPE(u32), - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_USER_HANDLE, - UVERBS_ATTR_TYPE(u64), - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL, - UVERBS_OBJECT_COMP_CHANNEL, - UVERBS_ACCESS_READ), - &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_COMP_VECTOR, UVERBS_ATTR_TYPE(u32), - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_FLAGS, UVERBS_ATTR_TYPE(u32)), - &UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_CQ_RESP_CQE, UVERBS_ATTR_TYPE(u32), - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &uverbs_uhw_compat_in, &uverbs_uhw_compat_out); - -static int UVERBS_HANDLER(UVERBS_METHOD_CQ_DESTROY)(struct ib_device *ib_dev, - struct ib_uverbs_file *file, - struct uverbs_attr_bundle *attrs) -{ - struct ib_uverbs_destroy_cq_resp resp; - struct ib_uobject *uobj = - uverbs_attr_get(attrs, UVERBS_ATTR_DESTROY_CQ_HANDLE)->obj_attr.uobject; - struct ib_ucq_object *obj = container_of(uobj, struct ib_ucq_object, - uobject); - int ret; - - if (!(ib_dev->uverbs_cmd_mask & 1ULL << IB_USER_VERBS_CMD_DESTROY_CQ)) - return -EOPNOTSUPP; - - ret = rdma_explicit_destroy(uobj); - if (ret) - return ret; - - resp.comp_events_reported = obj->comp_events_reported; - resp.async_events_reported = obj->async_events_reported; - - return uverbs_copy_to(attrs, UVERBS_ATTR_DESTROY_CQ_RESP, &resp, - sizeof(resp)); -} - -static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_CQ_DESTROY, - &UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_CQ_HANDLE, UVERBS_OBJECT_CQ, - UVERBS_ACCESS_DESTROY, - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_DESTROY_CQ_RESP, - UVERBS_ATTR_TYPE(struct ib_uverbs_destroy_cq_resp), - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); - DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_COMP_CHANNEL, &UVERBS_TYPE_ALLOC_FD(0, sizeof(struct ib_uverbs_completion_event_file), @@ -408,13 +242,6 @@ DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_COMP_CHANNEL, &uverbs_event_fops, "[infinibandevent]", O_RDONLY)); -DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_CQ, - &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), 0, - uverbs_free_cq), - &UVERBS_METHOD(UVERBS_METHOD_CQ_CREATE), - &UVERBS_METHOD(UVERBS_METHOD_CQ_DESTROY) - ); - DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_QP, &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), 0, uverbs_free_qp)); diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c new file mode 100644 index 000000000000..b061b4e15d8b --- /dev/null +++ b/drivers/infiniband/core/uverbs_std_types_cq.c @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "rdma_core.h" +#include "uverbs.h" + +static int uverbs_free_cq(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + struct ib_cq *cq = uobject->object; + struct ib_uverbs_event_queue *ev_queue = cq->cq_context; + struct ib_ucq_object *ucq = + container_of(uobject, struct ib_ucq_object, uobject); + int ret; + + ret = ib_destroy_cq(cq); + if (!ret || why != RDMA_REMOVE_DESTROY) + ib_uverbs_release_ucq(uobject->context->ufile, ev_queue ? + container_of(ev_queue, + struct ib_uverbs_completion_event_file, + ev_queue) : NULL, + ucq); + return ret; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + struct ib_ucontext *ucontext = file->ucontext; + struct ib_ucq_object *obj; + struct ib_udata uhw; + int ret; + u64 user_handle; + struct ib_cq_init_attr attr = {}; + struct ib_cq *cq; + struct ib_uverbs_completion_event_file *ev_file = NULL; + const struct uverbs_attr *ev_file_attr; + struct ib_uobject *ev_file_uobj; + + if (!(ib_dev->uverbs_cmd_mask & 1ULL << IB_USER_VERBS_CMD_CREATE_CQ)) + return -EOPNOTSUPP; + + ret = uverbs_copy_from(&attr.comp_vector, attrs, + UVERBS_ATTR_CREATE_CQ_COMP_VECTOR); + if (!ret) + ret = uverbs_copy_from(&attr.cqe, attrs, + UVERBS_ATTR_CREATE_CQ_CQE); + if (!ret) + ret = uverbs_copy_from(&user_handle, attrs, + UVERBS_ATTR_CREATE_CQ_USER_HANDLE); + if (ret) + return ret; + + /* Optional param, if it doesn't exist, we get -ENOENT and skip it */ + if (IS_UVERBS_COPY_ERR(uverbs_copy_from(&attr.flags, attrs, + UVERBS_ATTR_CREATE_CQ_FLAGS))) + return -EFAULT; + + ev_file_attr = uverbs_attr_get(attrs, UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL); + if (!IS_ERR(ev_file_attr)) { + ev_file_uobj = ev_file_attr->obj_attr.uobject; + + ev_file = container_of(ev_file_uobj, + struct ib_uverbs_completion_event_file, + uobj_file.uobj); + uverbs_uobject_get(ev_file_uobj); + } + + if (attr.comp_vector >= ucontext->ufile->device->num_comp_vectors) { + ret = -EINVAL; + goto err_event_file; + } + + obj = container_of(uverbs_attr_get(attrs, + UVERBS_ATTR_CREATE_CQ_HANDLE)->obj_attr.uobject, + typeof(*obj), uobject); + obj->uverbs_file = ucontext->ufile; + obj->comp_events_reported = 0; + obj->async_events_reported = 0; + 
INIT_LIST_HEAD(&obj->comp_list); + INIT_LIST_HEAD(&obj->async_list); + + /* Temporary, only until drivers get the new uverbs_attr_bundle */ + create_udata(attrs, &uhw); + + cq = ib_dev->create_cq(ib_dev, &attr, ucontext, &uhw); + if (IS_ERR(cq)) { + ret = PTR_ERR(cq); + goto err_event_file; + } + + cq->device = ib_dev; + cq->uobject = &obj->uobject; + cq->comp_handler = ib_uverbs_comp_handler; + cq->event_handler = ib_uverbs_cq_event_handler; + cq->cq_context = ev_file ? &ev_file->ev_queue : NULL; + obj->uobject.object = cq; + obj->uobject.user_handle = user_handle; + atomic_set(&cq->usecnt, 0); + cq->res.type = RDMA_RESTRACK_CQ; + rdma_restrack_add(&cq->res); + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_CQ_RESP_CQE, &cq->cqe, + sizeof(cq->cqe)); + if (ret) + goto err_cq; + + return 0; +err_cq: + ib_destroy_cq(cq); + +err_event_file: + if (ev_file) + uverbs_uobject_put(ev_file_uobj); + return ret; +}; + +static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_CQ_CREATE, + &UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_CQ_HANDLE, UVERBS_OBJECT_CQ, + UVERBS_ACCESS_NEW, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_CQE, + UVERBS_ATTR_TYPE(u32), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_USER_HANDLE, + UVERBS_ATTR_TYPE(u64), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL, + UVERBS_OBJECT_COMP_CHANNEL, + UVERBS_ACCESS_READ), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_COMP_VECTOR, UVERBS_ATTR_TYPE(u32), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_FLAGS, UVERBS_ATTR_TYPE(u32)), + &UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_CQ_RESP_CQE, UVERBS_ATTR_TYPE(u32), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &uverbs_uhw_compat_in, &uverbs_uhw_compat_out); + +static int UVERBS_HANDLER(UVERBS_METHOD_CQ_DESTROY)(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_destroy_cq_resp resp; + struct ib_uobject *uobj = + uverbs_attr_get(attrs, UVERBS_ATTR_DESTROY_CQ_HANDLE)->obj_attr.uobject; + struct ib_ucq_object *obj = container_of(uobj, struct ib_ucq_object, + uobject); + int ret; + + if (!(ib_dev->uverbs_cmd_mask & 1ULL << IB_USER_VERBS_CMD_DESTROY_CQ)) + return -EOPNOTSUPP; + + ret = rdma_explicit_destroy(uobj); + if (ret) + return ret; + + resp.comp_events_reported = obj->comp_events_reported; + resp.async_events_reported = obj->async_events_reported; + + return uverbs_copy_to(attrs, UVERBS_ATTR_DESTROY_CQ_RESP, &resp, + sizeof(resp)); +} + +static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_CQ_DESTROY, + &UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_CQ_HANDLE, UVERBS_OBJECT_CQ, + UVERBS_ACCESS_DESTROY, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_DESTROY_CQ_RESP, + UVERBS_ATTR_TYPE(struct ib_uverbs_destroy_cq_resp), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_CQ, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), 0, + uverbs_free_cq), + &UVERBS_METHOD(UVERBS_METHOD_CQ_CREATE), + &UVERBS_METHOD(UVERBS_METHOD_CQ_DESTROY) + ); + diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index c4ee65b20bb7..faaaec7be36a 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -361,6 +361,8 @@ static inline bool uverbs_attr_is_valid(const struct uverbs_attr_bundle *attrs_b idx & ~UVERBS_ID_NS_MASK); } +#define IS_UVERBS_COPY_ERR(_ret) ((_ret) && (_ret) != -ENOENT) + static inline 
const struct uverbs_attr *uverbs_attr_get(const struct uverbs_attr_bundle *attrs_bundle, u16 idx) { From 3d64addd435997a445d201fcbbde2fa753709971 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Mon, 19 Mar 2018 15:02:39 +0200 Subject: [PATCH 126/199] IB/uverbs: Add macros to simplify adding driver specific attributes Previously, adding driver specific attributes required drivers to declare all the hierarchy - object tree, object, methods and the attributes themselves. A common use case is adding a few attributes to an existing common method. In order to simplify the driver's code, we add some macros to do all these declarations automatically. Signed-off-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/uverbs_named_ioctl.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/include/rdma/uverbs_named_ioctl.h b/include/rdma/uverbs_named_ioctl.h index a7f0565ca784..c5bb4ebdb0b0 100644 --- a/include/rdma/uverbs_named_ioctl.h +++ b/include/rdma/uverbs_named_ioctl.h @@ -56,4 +56,35 @@ #define DECLARE_UVERBS_NAMED_OBJECT(id, ...) \ DECLARE_UVERBS_OBJECT(UVERBS_OBJECT(id), id, ##__VA_ARGS__) +#define _UVERBS_COMP_NAME(x, y, z) _UVERBS_NAME(_UVERBS_NAME(x, y), z) + +#define UVERBS_NO_OVERRIDE NULL + +/* This declares a parsing tree with one object and one method. This is usually + * used for merging driver attributes to the common attributes. The driver has + * a chance to override the handler and type attrs of the original object. + * The __VA_ARGS__ just contains a list of attributes. + */ +#define ADD_UVERBS_ATTRIBUTES(_name, _object, _method, _type_attrs, _handler, ...) \ +static DECLARE_UVERBS_METHOD(_UVERBS_COMP_NAME(UVERBS_MODULE_NAME, \ + _method_, _name), \ + _method, _handler, ##__VA_ARGS__); \ + \ +static DECLARE_UVERBS_OBJECT(_UVERBS_COMP_NAME(UVERBS_MODULE_NAME, \ + _object_, _name), \ + _object, _type_attrs, \ + &_UVERBS_COMP_NAME(UVERBS_MODULE_NAME, \ + _method_, _name)); \ + \ +static DECLARE_UVERBS_OBJECT_TREE(_name, \ + &_UVERBS_COMP_NAME(UVERBS_MODULE_NAME, \ + _object_, _name)) + +/* A very common use case is that the driver doesn't override the handler and + * type_attrs. Therefore, we provide a simplified macro for this common case. + */ +#define ADD_UVERBS_ATTRIBUTES_SIMPLE(_name, _object, _method, ...) \ + ADD_UVERBS_ATTRIBUTES(_name, _object, _method, UVERBS_NO_OVERRIDE, \ + UVERBS_NO_OVERRIDE, ##__VA_ARGS__) + #endif From 185899ee8d00460a305e07ed2df178dbf7455227 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Mon, 19 Mar 2018 15:02:40 +0200 Subject: [PATCH 127/199] IB/uverbs: Enable ioctl() uAPI by default for new verbs Enable the ioctl() uAPI for IB by default if the standard write() uAPI (INFINIBAND_USER_ACCESS) is enabled. Verbs that are also available under the old write() uAPI are put inside a new INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI Kconfig. Reviewed-by: Yishai Hadas Signed-off-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/Kconfig | 11 +++++------ drivers/infiniband/core/uverbs_main.c | 4 ---- drivers/infiniband/core/uverbs_std_types_cq.c | 2 ++ 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index 8517d6ea91a6..ee270e065ba9 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -35,14 +35,13 @@ config INFINIBAND_USER_ACCESS libibverbs, libibcm and a hardware driver library from rdma-core . 
-config INFINIBAND_EXP_USER_ACCESS - bool "Enable the full uverbs ioctl interface (EXPERIMENTAL)" +config INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI + bool "Allow experimental legacy verbs in new ioctl uAPI (EXPERIMENTAL)" depends on INFINIBAND_USER_ACCESS ---help--- - IOCTL based ABI support for Infiniband. This allows userspace - to invoke the experimental IOCTL based ABI. - These commands are parsed via per-device parsing tree and - enables per-device features. + IOCTL based uAPI support for Infiniband is enabled by default for + new verbs only. This allows userspace to invoke the IOCTL based uAPI + for current legacy verbs too. config INFINIBAND_USER_MEM bool
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index fbba831f879e..4445d8ee9314 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -933,10 +933,8 @@ static const struct file_operations uverbs_fops = { .open = ib_uverbs_open, .release = ib_uverbs_close, .llseek = no_llseek, -#if IS_ENABLED(CONFIG_INFINIBAND_EXP_USER_ACCESS) .unlocked_ioctl = ib_uverbs_ioctl, .compat_ioctl = ib_uverbs_ioctl, -#endif }; static const struct file_operations uverbs_mmap_fops = { @@ -946,10 +944,8 @@ static const struct file_operations uverbs_mmap_fops = { .open = ib_uverbs_open, .release = ib_uverbs_close, .llseek = no_llseek, -#if IS_ENABLED(CONFIG_INFINIBAND_EXP_USER_ACCESS) .unlocked_ioctl = ib_uverbs_ioctl, .compat_ioctl = ib_uverbs_ioctl, -#endif }; static struct ib_client uverbs_client = {
diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c index b061b4e15d8b..b0dbae9dd0d7 100644 --- a/drivers/infiniband/core/uverbs_std_types_cq.c +++ b/drivers/infiniband/core/uverbs_std_types_cq.c @@ -202,7 +202,9 @@ static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_CQ_DESTROY, DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_CQ, &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), 0, uverbs_free_cq), +#if IS_ENABLED(CONFIG_INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI) &UVERBS_METHOD(UVERBS_METHOD_CQ_CREATE), &UVERBS_METHOD(UVERBS_METHOD_CQ_DESTROY) +#endif );
From 97d82a48d7a7eca6f20b100ae19811134509406e Mon Sep 17 00:00:00 2001 From: Sinan Kaya Date: Mon, 19 Mar 2018 22:47:44 -0400 Subject: [PATCH 128/199] IB/mlx4: Eliminate duplicate barriers on weakly-ordered archs
Code includes wmb() followed by writel(). writel() already has a barrier on some architectures like arm64. This ends up with the CPU observing two barriers back to back before executing the register write. Since the code already has an explicit barrier call, change writel() to writel_relaxed().
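In rough terms (a sketch of the idea only, not the literal definition in the architecture headers):

/*
 * On barrier-including architectures, writel() behaves approximately
 * like a write barrier followed by the relaxed store:
 *
 *     #define writel(v, c)   ({ __iowmb(); writel_relaxed((v), (c)); })
 *
 * The caller below already issues wmb() to order the WQE writes
 * against the doorbell, so writel() would execute two barriers back
 * to back; writel_relaxed() keeps only the store.
 */
wmb();                                 /* order descriptor writes */
writel_relaxed(doorbell_val, db_reg);  /* doorbell_val, db_reg: placeholder names */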
Signed-off-by: Sinan Kaya Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/qp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 04efc05fb531..523028e944ed 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -3894,8 +3894,8 @@ out: */ wmb(); - writel(qp->doorbell_qpn, - to_mdev(ibqp->device)->uar_map + MLX4_SEND_DOORBELL); + writel_relaxed(qp->doorbell_qpn, + to_mdev(ibqp->device)->uar_map + MLX4_SEND_DOORBELL); /* * Make sure doorbells don't leak out of SQ spinlock
From 761fc376c999df9febaa491bffae2f6722f423ff Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 20 Mar 2018 13:59:50 -0600 Subject: [PATCH 129/199] RDMA/cxgb3: Use structs to describe the uABI instead of opencoding
Open coding a loose value is not acceptable for describing the uABI in RDMA. Provide the missing struct.
Acked-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb3/iwch_provider.c | 4 +++- include/uapi/rdma/cxgb3-abi.h | 5 +++++ 2 files changed, 8 insertions(+), 1 deletion(-)
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 1804b6c4a6ec..be097c6723c0 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -440,7 +440,9 @@ static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev, php->pdid = pdid; php->rhp = rhp; if (context) { - if (ib_copy_to_udata(udata, &php->pdid, sizeof (__u32))) { + struct iwch_alloc_pd_resp resp = {.pdid = php->pdid}; + + if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { iwch_deallocate_pd(&php->ibpd); return ERR_PTR(-EFAULT); }
diff --git a/include/uapi/rdma/cxgb3-abi.h b/include/uapi/rdma/cxgb3-abi.h index d5745e43ae85..17116c1c7925 100644 --- a/include/uapi/rdma/cxgb3-abi.h +++ b/include/uapi/rdma/cxgb3-abi.h @@ -74,4 +74,9 @@ struct iwch_create_qp_resp { struct iwch_reg_user_mr_resp { __u32 pbl_addr; }; + +struct iwch_alloc_pd_resp { + __u32 pdid; +}; + #endif /* CXGB3_ABI_USER_H */
From e95955773d4357a0b09a43128352047afce8f35b Mon Sep 17 00:00:00 2001 From: Yixian Liu Date: Wed, 21 Mar 2018 14:11:18 +0800 Subject: [PATCH 130/199] RDMA/hns: Fix cq record doorbell enable in kernel
Upon detecting that both the kernel and user space support the record doorbell, the kernel needs to enable this capability in hardware via db_en, and this must happen before the cq context is configured in hns_roce_cq_alloc. Currently, db_en is configured after cq alloc, and db_map_user has a similar problem.
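In outline, the fixed ordering looks like this (a simplified sketch of hns_roce_ib_create_cq(); record_db_wanted is shorthand for the HNS_ROCE_CAP_FLAG_RECORD_DB and udata->outlen checks, and error handling is omitted):

if (context && record_db_wanted) {
        /* map the user doorbell page and flag it first... */
        ret = hns_roce_db_map_user(to_hr_ucontext(context),
                                   ucmd.db_addr, &hr_cq->db);
        hr_cq->db_en = 1;
        resp.cap_flags |= HNS_ROCE_SUPPORT_CQ_RECORD_DB;
}
/* ...so the cq context configured below already sees db_en set */
ret = hns_roce_cq_alloc(...);   /* arguments as in the hunk below */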
Reported-by: Xiping Zhang Fixes: 9b44703d0a21 ("RDMA/hns: Support cq record doorbell for the user space") Signed-off-by: Yixian Liu Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_cq.c | 38 +++++++++++-------------- 1 file changed, 17 insertions(+), 21 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 095a9100717d..14734d0d0b76 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -355,6 +355,18 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev, goto err_cq; } + if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) && + (udata->outlen >= sizeof(resp))) { + ret = hns_roce_db_map_user(to_hr_ucontext(context), + ucmd.db_addr, &hr_cq->db); + if (ret) { + dev_err(dev, "cq record doorbell map failed!\n"); + goto err_mtt; + } + hr_cq->db_en = 1; + resp.cap_flags |= HNS_ROCE_SUPPORT_CQ_RECORD_DB; + } + /* Get user space parameters */ uar = &to_hr_ucontext(context)->uar; } else { @@ -385,17 +397,7 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev, hr_cq, vector); if (ret) { dev_err(dev, "Creat CQ .Failed to cq_alloc.\n"); - goto err_mtt; - } - - if (context && (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) && - (udata->outlen >= sizeof(resp))) { - ret = hns_roce_db_map_user(to_hr_ucontext(context), - ucmd.db_addr, &hr_cq->db); - if (ret) { - dev_err(dev, "cq record doorbell map failed!\n"); - goto err_cqc; - } + goto err_dbmap; } /* @@ -414,28 +416,22 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev, if (context) { resp.cqn = hr_cq->cqn; - if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) && - (udata->outlen >= sizeof(resp))) { - hr_cq->db_en = 1; - resp.cap_flags |= HNS_ROCE_SUPPORT_CQ_RECORD_DB; - } - ret = ib_copy_to_udata(udata, &resp, sizeof(resp)); if (ret) - goto err_dbmap; + goto err_cqc; } return &hr_cq->ib_cq; +err_cqc: + hns_roce_free_cq(hr_dev, hr_cq); + err_dbmap: if (context && (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) && (udata->outlen >= sizeof(resp))) hns_roce_db_unmap_user(to_hr_ucontext(context), &hr_cq->db); -err_cqc: - hns_roce_free_cq(hr_dev, hr_cq); - err_mtt: hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt); if (context)
From 03286030ac0420c759fa25f5b976e40293bccaaf Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 21 Mar 2018 17:12:42 +0200 Subject: [PATCH 131/199] RDMA/restrack: Remove ambiguity in resource track clean logic
The restrack clean routine had a simple but powerful WARN_ON check to see whether all resources were cleared prior to releasing the device. The WARN_ON check performed very well, but the lack of information about which device caused the resource leak, the object type, and the origin made debugging fun and challenging at the same time. The fact that all dumps looked the same, because restrack_clean() is called from dealloc(), didn't help either. So let's fix the spelling error and convert the WARN_ON into something more debug friendly.
The dmesg cut below gives an example of how the output will look for the case fixed in patch [1]
[ 438.421372] restrack: ------------[ cut here ]------------ [ 438.423448] restrack: BUG: RESTRACK detected leak of resources on mlx5_2 [ 438.425600] restrack: Kernel PD object allocated by mlx5_ib is not freed [ 438.427753] restrack: Kernel CQ object allocated by mlx5_ib is not freed [ 438.429660] restrack: ------------[ cut here ]------------
[1] https://patchwork.kernel.org/patch/10298695/
Cc: Michal Kalderon Cc: Chuck Lever Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/restrack.c | 45 +++++++++++++++++++++++++++++- include/rdma/restrack.h | 2 +- 2 files changed, 45 insertions(+), 2 deletions(-)
diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 4cad0cd9aa0c..efddd13e3edb 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -17,9 +17,52 @@ void rdma_restrack_init(struct rdma_restrack_root *res) init_rwsem(&res->rwsem); } +static const char *type2str(enum rdma_restrack_type type) +{ + static const char * const names[RDMA_RESTRACK_MAX] = { + [RDMA_RESTRACK_PD] = "PD", + [RDMA_RESTRACK_CQ] = "CQ", + [RDMA_RESTRACK_QP] = "QP", + [RDMA_RESTRACK_CM_ID] = "CM_ID", + [RDMA_RESTRACK_MR] = "MR", + }; + + return names[type]; +}; + void rdma_restrack_clean(struct rdma_restrack_root *res) { - WARN_ON_ONCE(!hash_empty(res->hash)); + struct rdma_restrack_entry *e; + char buf[TASK_COMM_LEN]; + struct ib_device *dev; + const char *owner; + int bkt; + + if (hash_empty(res->hash)) + return; + + dev = container_of(res, struct ib_device, res); + pr_err("restrack: %s", CUT_HERE); + pr_err("restrack: BUG: RESTRACK detected leak of resources on %s\n", + dev->name); + hash_for_each(res->hash, bkt, e, node) { + if (rdma_is_kernel_res(e)) { + owner = e->kern_name; + } else { + /* + * There is no need to call get_task_struct here, + * because we can be here only if there are more + * get_task_struct() call than put_task_struct(). + */ + get_task_comm(buf, e->task); + owner = buf; + } + + pr_err("restrack: %s %s object allocated by %s is not freed\n", + rdma_is_kernel_res(e) ? "Kernel" : "User", + type2str(e->type), owner); + } + pr_err("restrack: %s", CUT_HERE); } int rdma_restrack_count(struct rdma_restrack_root *res,
diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index a56f4f200277..f3b3e3576f6a 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -155,7 +155,7 @@ static inline bool rdma_is_kernel_res(struct rdma_restrack_entry *res) int __must_check rdma_restrack_get(struct rdma_restrack_entry *res); /** - * rdma_restrack_put() - relase resource + * rdma_restrack_put() - release resource * @res: resource entry */ int rdma_restrack_put(struct rdma_restrack_entry *res);
From 98f1f4e0ed26c97a697f1e007416acbc18f4a8a9 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 21 Mar 2018 17:16:36 +0200 Subject: [PATCH 132/199] IB/core: Refer to RoCE port property instead of GID table property
ib_query_gid() in commit [1] refers to the RoCE GID table capability of the HCA using rdma_cap_roce_gid_table(). ib_core maintains the GID table cache regardless of the HCA provider driver's capability to maintain a RoCE GID table. Therefore, whether to return a GID table entry from the software cache or from the HCA should be decided based on whether the port is RoCE or not.
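In code terms the change amounts to the dispatch below (only the affected lines of ib_query_gid(); the rest of the function is untouched):

/* Old test: does the provider driver keep its own RoCE GID table?
 * New test: is the port itself running RoCE? ib_core maintains the
 * cache for every RoCE port, so the cache is authoritative there. */
if (rdma_protocol_roce(device, port_num))   /* was rdma_cap_roce_gid_table() */
        return ib_get_cached_gid(device, port_num, index, gid, attr);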
[1] commit 03db3a2d81e6 ("IB/core: Add RoCE GID table management")
Reviewed-by: Mark Bloch Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 0ab99e62cc5c..ba0e34b09648 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -878,7 +878,7 @@ int ib_query_gid(struct ib_device *device, u8 port_num, int index, union ib_gid *gid, struct ib_gid_attr *attr) { - if (rdma_cap_roce_gid_table(device, port_num)) + if (rdma_protocol_roce(device, port_num)) return ib_get_cached_gid(device, port_num, index, gid, attr); if (attr)
From 114cc9c4b18232452f7dcc8bb3e5749f8d9a6837 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 21 Mar 2018 17:16:35 +0200 Subject: [PATCH 133/199] IB/cma: Resolve route only while receiving CM requests
Currently, a CM request for RoCE follows this flow:

rdma_create_id()
rdma_resolve_addr()
rdma_resolve_route()

For RC QPs:
rdma_connect()
->cma_connect_ib()
->ib_send_cm_req()
->cm_init_av_by_path()
->ib_init_ah_attr_from_path()

For UD QPs:
rdma_connect()
->cma_resolve_ib_udp()
->ib_send_cm_sidr_req()
->cm_init_av_by_path()
->ib_init_ah_attr_from_path()

In both flows, the route is already resolved before the CM request is sent. Therefore, the code is refactored to avoid resolving the route a second time in the ib_cm layer. ib_init_ah_attr_from_path() is extended to resolve the route when it is not yet resolved for the RoCE link layer. This is achieved by the caller setting the route_resolved field in the path record whenever it has already resolved the route.
Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 5 +++++ drivers/infiniband/core/cma.c | 1 + drivers/infiniband/core/sa_query.c | 5 +++++ include/rdma/ib_sa.h | 8 ++++++++ 4 files changed, 19 insertions(+)
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 4cc0fe6a29ff..38d79bc1bf78 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1543,6 +1543,8 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg, cm_req_get_primary_local_ack_timeout(req_msg); primary_path->packet_life_time -= (primary_path->packet_life_time > 0); primary_path->service_id = req_msg->service_id; + if (sa_path_is_roce(primary_path)) + primary_path->roce.route_resolved = false; if (cm_req_has_alt_path(req_msg)) { alt_path->dgid = req_msg->alt_local_gid; @@ -1562,6 +1564,9 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg, cm_req_get_alt_local_ack_timeout(req_msg); alt_path->packet_life_time -= (alt_path->packet_life_time > 0); alt_path->service_id = req_msg->service_id; + + if (sa_path_is_roce(alt_path)) + alt_path->roce.route_resolved = false; } cm_format_path_lid_from_req(req_msg, primary_path, alt_path); }
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 34fa0507ed4f..8512f633efd6 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2506,6 +2506,7 @@ cma_iboe_set_path_rec_l2_fields(struct rdma_id_private *id_priv) gid_type = ib_network_to_gid_type(addr->dev_addr.network); route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type); + route->path_rec->roce.route_resolved = true; sa_path_set_ndev(route->path_rec, addr->dev_addr.net); sa_path_set_ifindex(route->path_rec, ndev->ifindex); sa_path_set_dmac(route->path_rec,
addr->dev_addr.dst_dev_addr);
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 1cfec68c7911..a61ec7e33613 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1248,6 +1248,9 @@ roce_resolve_route_from_path(struct ib_device *device, u8 port_num, } sgid_addr, dgid_addr; int ret; + if (rec->roce.route_resolved) + return 0; + if (!device->get_netdev) return -EOPNOTSUPP; @@ -1287,6 +1290,8 @@ roce_resolve_route_from_path(struct ib_device *device, u8 port_num, dev_put(ndev); done: dev_put(idev); + if (!ret) + rec->roce.route_resolved = true; return ret; }
diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h index 82b8e59af14a..bacb144f7780 100644 --- a/include/rdma/ib_sa.h +++ b/include/rdma/ib_sa.h @@ -163,7 +163,15 @@ struct sa_path_rec_ib { u8 raw_traffic; }; +/** + * struct sa_path_rec_roce - RoCE specific portion of the path record entry + * @route_resolved: When set, it indicates that this route is already + * resolved for this path record entry. + * @dmac: Destination mac address for the given DGID entry + * of the path record entry. + */ struct sa_path_rec_roce { + bool route_resolved; u8 dmac[ETH_ALEN]; /* ignored in IB */ int ifindex;
From f215a3d2448ae77253f0b93dcc37114779f51778 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 22 Mar 2018 12:53:35 -0700 Subject: [PATCH 134/199] iw_cxgb4: Add ib_device->get_netdev support
This is useful to rdma ULPs.
Signed-off-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/provider.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+)
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index dc4eabd85f54..0b9cc73c3ded 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -533,6 +533,24 @@ static void get_dev_fw_str(struct ib_device *dev, char *str) FW_HDR_FW_VER_BUILD_G(c4iw_dev->rdev.lldi.fw_vers)); } +static struct net_device *get_netdev(struct ib_device *dev, u8 port) +{ + struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, ibdev); + struct c4iw_rdev *rdev = &c4iw_dev->rdev; + struct net_device *ndev; + + if (!port || port > rdev->lldi.nports) + return NULL; + + rcu_read_lock(); + ndev = rdev->lldi.ports[port - 1]; + if (ndev) + dev_hold(ndev); + rcu_read_unlock(); + + return ndev; +} + void c4iw_register_device(struct work_struct *work) { int ret; @@ -611,6 +629,7 @@ void c4iw_register_device(struct work_struct *work) dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION; dev->ibdev.get_port_immutable = c4iw_port_immutable; dev->ibdev.get_dev_fw_str = get_dev_fw_str; + dev->ibdev.get_netdev = get_netdev; dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL); if (!dev->ibdev.iwcm) {
From f64705b8715a090cd5526a2c082eeb199a51e8b2 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 19 Mar 2018 11:30:43 -0600 Subject: [PATCH 135/199] RDMA/ocrdma: Fix structure layout for ocrdma_alloc_pd
The udata structs for alloc_pd cannot contain u64s due to alignment constraints. Switch the two never-used u64s to arrays of u32 to reduce the required struct alignment to 4 bytes. These reserved fields are totally unnecessary, never written and never read.
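A minimal standalone illustration of the constraint (hypothetical structs, not the ocrdma ABI itself):

/* A __u64 member raises the struct's required alignment to 8 bytes on
 * x86-64 but only 4 on i386, so one definition yields two layouts: */
struct with_u64  { __u32 a; __u64 rsvd;    }; /* 16 bytes on x86-64, 12 on i386 */
struct with_u32s { __u32 a; __u32 rsvd[2]; }; /* 12 bytes everywhere */

With __u32 arrays the alignment is 4 bytes on every architecture, so 32-bit userspace and a 64-bit kernel agree on the layout.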
Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/ocrdma-abi.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/rdma/ocrdma-abi.h b/include/uapi/rdma/ocrdma-abi.h index e0475d59cdf0..32ef8670583a 100644 --- a/include/uapi/rdma/ocrdma-abi.h +++ b/include/uapi/rdma/ocrdma-abi.h @@ -65,7 +65,7 @@ struct ocrdma_alloc_ucontext_resp { }; struct ocrdma_alloc_pd_ureq { - __u64 rsvd1; + __u32 rsvd[2]; }; struct ocrdma_alloc_pd_uresp { @@ -73,7 +73,7 @@ struct ocrdma_alloc_pd_uresp { __u32 dpp_enabled; __u32 dpp_page_addr_hi; __u32 dpp_page_addr_lo; - __u64 rsvd1; + __u32 rsvd[2]; }; struct ocrdma_create_cq_ureq { From 38b48808b9af55f02cb226a1f09b7a5e67104569 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 20 Mar 2018 14:19:46 -0600 Subject: [PATCH 136/199] RDMA: Remove minor pahole differences between 32/64 To help automatic detection we want pahole to report the same struct layouts for 32 and 64 bit compiles. These cases are all implicit padding added at the end of embedded structs as part of a union. The added reserved fields have no impact on the ABI. Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/rdma_user_rxe.h | 2 ++ include/uapi/rdma/vmw_pvrdma-abi.h | 1 + 2 files changed, 3 insertions(+) diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h index b3b1bfc8fa21..231190b841c8 100644 --- a/include/uapi/rdma/rdma_user_rxe.h +++ b/include/uapi/rdma/rdma_user_rxe.h @@ -78,12 +78,14 @@ struct rxe_send_wr { struct { __u64 remote_addr; __u32 rkey; + __u32 reserved; } rdma; struct { __u64 remote_addr; __u64 compare_add; __u64 swap; __u32 rkey; + __u32 reserved; } atomic; struct { __u32 remote_qpn; diff --git a/include/uapi/rdma/vmw_pvrdma-abi.h b/include/uapi/rdma/vmw_pvrdma-abi.h index 02ca0d0f1eb7..edf5c7224901 100644 --- a/include/uapi/rdma/vmw_pvrdma-abi.h +++ b/include/uapi/rdma/vmw_pvrdma-abi.h @@ -262,6 +262,7 @@ struct pvrdma_sq_wqe_hdr { __u32 length; __u32 access_flags; __u32 rkey; + __u32 reserved; } fast_reg; struct { __u32 remote_qpn; From 611cb92b082ad16b2fe1258e51d5aca7de540dfb Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 20 Mar 2018 14:19:47 -0600 Subject: [PATCH 137/199] RDMA/ucma: Fix uABI structure layouts for 32/64 compat The rdma_ucm_event_resp is a different length on 32 and 64 bit compiles. The kernel requires it to be the expected length or longer so 32 bit builds running on a 64 bit kernel will not work. Retain full compat by having all kernels accept a struct with or without the trailing reserved field. Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/ucma.c | 9 +++++++-- include/uapi/rdma/rdma_user_cm.h | 5 +++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 4bb5bed596c9..db4190b2ed27 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -382,7 +382,11 @@ static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf, struct ucma_event *uevent; int ret = 0; - if (out_len < sizeof uevent->resp) + /* + * Old 32 bit user space does not send the 4 byte padding in the + * reserved field. We don't care, allow it to keep working. 
+ */ + if (out_len < sizeof(uevent->resp) - sizeof(uevent->resp.reserved)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) @@ -417,7 +421,8 @@ static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf, } if (copy_to_user((void __user *)(unsigned long)cmd.response, - &uevent->resp, sizeof uevent->resp)) { + &uevent->resp, + min_t(size_t, out_len, sizeof(uevent->resp)))) { ret = -EFAULT; goto done; } diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h index c83ef0026079..65399c837762 100644 --- a/include/uapi/rdma/rdma_user_cm.h +++ b/include/uapi/rdma/rdma_user_cm.h @@ -270,10 +270,15 @@ struct rdma_ucm_event_resp { __u32 id; __u32 event; __u32 status; + /* + * NOTE: This union is not aligned to 8 bytes so none of the union + * members may contain a u64 or anything with higher alignment than 4. + */ union { struct rdma_ucm_conn_param conn; struct rdma_ucm_ud_param ud; } param; + __u32 reserved; }; /* Option levels */ From 71e80a4781afbc4b1130b88109ddd8850201c78a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 20 Mar 2018 14:19:48 -0600 Subject: [PATCH 138/199] RDMA/qedr: Fix uABI structure layouts for 32/64 compat struct qedr_alloc_ucontext_resp is a different length in 32 and 64 bit compiles due to implicit compiler padding. The structs alloc_pd_uresp, create_cq_uresp and create_qp_uresp are not padded by the compiler, but in user space the compiler pads them due to the way the core and driver structs are concatenated. Make this padding explicit and consistent for future sanity. The kernel driver can already handle the user buffer being smaller than required and copies correctly, so no compat or ABI break happens from introducing the explicit padding. Acked-by: Michal Kalderon Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/qedr-abi.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/uapi/rdma/qedr-abi.h b/include/uapi/rdma/qedr-abi.h index 261c6db4623e..396656062931 100644 --- a/include/uapi/rdma/qedr-abi.h +++ b/include/uapi/rdma/qedr-abi.h @@ -53,6 +53,7 @@ struct qedr_alloc_ucontext_resp { __u8 dpm_enabled; __u8 wids_enabled; __u16 wid_count; + __u32 reserved; }; struct qedr_alloc_pd_ureq { @@ -61,6 +62,7 @@ struct qedr_alloc_pd_ureq { struct qedr_alloc_pd_uresp { __u32 pd_id; + __u32 reserved; }; struct qedr_create_cq_ureq { @@ -71,6 +73,7 @@ struct qedr_create_cq_ureq { struct qedr_create_cq_uresp { __u32 db_offset; __u16 icid; + __u16 reserved; }; struct qedr_create_qp_ureq { @@ -105,6 +108,7 @@ struct qedr_create_qp_uresp { __u16 rq_icid; __u32 rq_db2_offset; + __u32 reserved; }; #endif /* __QEDR_USER_H__ */ From 366380a0c835b742da64ae2f800c65fa87692683 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 20 Mar 2018 14:19:49 -0600 Subject: [PATCH 139/199] RDMA/mlx4: Fix uABI structure layouts for 32/64 compat rss_caps in struct mlx4_uverbs_ex_query_device_resp is misaligned on 32 bit compared to 64 bit, add explicit padding. The rss caps were introduced recently and are very rarely used in user space, mainly for DPDK. We don't expect there to be a real 32 bit user, so this change is done without compat considerations. 
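The misalignment is easiest to see from the member offsets (a sketch with offsets worked out by hand, assuming the usual x86 ABI alignment rules):

/* struct mlx4_uverbs_ex_query_device_resp, leading members:
 *   __u32 comp_mask;                    offset  0
 *   __u32 response_length;              offset  4
 *   __u64 hca_core_clock_offset;        offset  8
 *   __u32 max_inl_recv_sz;              offset 16
 *   struct mlx4_ib_rss_caps rss_caps;   contains __u64 members
 *
 * On 64-bit, rss_caps needs 8-byte alignment, so the compiler inserts
 * 4 hidden bytes after max_inl_recv_sz (rss_caps lands at offset 24);
 * on 32-bit there is no such padding (rss_caps at offset 20). The
 * explicit "__u32 reserved;" added below makes those 4 bytes part of
 * the ABI on both.
 */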
Fixes: 09d208b258a2 ("IB/mlx4: Add report for RSS capabilities by vendor channel") Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/mlx4-abi.h | 1 + 1 file changed, 1 insertion(+)
diff --git a/include/uapi/rdma/mlx4-abi.h b/include/uapi/rdma/mlx4-abi.h index a448abd07052..50a56aeb1f41 100644 --- a/include/uapi/rdma/mlx4-abi.h +++ b/include/uapi/rdma/mlx4-abi.h @@ -183,6 +183,7 @@ struct mlx4_uverbs_ex_query_device_resp { __u32 response_length; __u64 hca_core_clock_offset; __u32 max_inl_recv_sz; + __u32 reserved; struct mlx4_ib_rss_caps rss_caps; struct mlx4_ib_tso_caps tso_caps; };
From f2e9bfac13c904e5cfe58612002acde6f058dc83 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 20 Mar 2018 14:19:50 -0600 Subject: [PATCH 140/199] RDMA/rxe: Fix uABI structure layouts for 32/64 compat
With 32 bit compilation several of the fields become misaligned here. Fixing this is an ABI break for 32 bit rxe, and it is in well-used portions of the rxe ABI. To handle this we bump the ABI version, as expected. However, the user space driver doesn't handle it properly today, so all existing user space continues to work. Updated userspace will start to require the necessary kernel version.
We don't expect there to be any 32 bit users of rxe. The most likely cases, such as 32 bit ARM, already generally don't work because rxe does not handle the CPU cache properly on its pages shared with userspace.
Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe.h | 6 +++++- include/uapi/rdma/rdma_user_rxe.h | 12 ++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-)
diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h index 7d232611303f..561ad307c6ec 100644 --- a/drivers/infiniband/sw/rxe/rxe.h +++ b/drivers/infiniband/sw/rxe/rxe.h @@ -59,7 +59,11 @@ #include "rxe_verbs.h" #include "rxe_loc.h" -#define RXE_UVERBS_ABI_VERSION (1) +/* + * Version 1 and Version 2 are identical on 64 bit machines, but on 32 bit + * machines Version 2 has a different struct layout. + */ +#define RXE_UVERBS_ABI_VERSION 2 #define IB_PHYS_STATE_LINK_UP (5) #define IB_PHYS_STATE_LINK_DOWN (3)
diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h index 231190b841c8..af8f8218aed5 100644 --- a/include/uapi/rdma/rdma_user_rxe.h +++ b/include/uapi/rdma/rdma_user_rxe.h @@ -58,6 +58,8 @@ struct rxe_global_route { struct rxe_av { __u8 port_num; __u8 network_type; + __u16 reserved1; + __u32 reserved2; struct rxe_global_route grh; union { struct sockaddr_in _sockaddr_in; @@ -92,10 +94,14 @@ struct rxe_send_wr { __u32 remote_qkey; __u16 pkey_index; } ud; + /* reg is only used by the kernel and is not part of the uapi */ struct { - struct ib_mr *mr; + union { + struct ib_mr *mr; + __u64 reserved; + }; __u32 key; - int access; + __u32 access; } reg; } wr; }; @@ -118,6 +124,7 @@ struct rxe_dma_info { __u32 cur_sge; __u32 num_sge; __u32 sge_offset; + __u32 reserved; union { __u8 inline_data[0]; struct rxe_sge sge[0]; @@ -162,6 +169,7 @@ struct rxe_create_qp_resp { struct rxe_create_srq_resp { struct mminfo mi; __u32 srq_num; + __u32 reserved; }; struct rxe_modify_srq_cmd {
From 26b9906612c3553189d7d1673ee116ffac474d53 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 20 Mar 2018 14:19:51 -0600 Subject: [PATCH 141/199] RDMA: Change all uapi headers to use __aligned_u64 instead of __u64
The new auditing standard for the subsystem will be to only use __aligned_u64 in uapi headers to try and prevent 32/64 compat bugs from existing in the future.
Changing all existing usage will help ensure new developers copy the right idea. The before/after of this patch was tested using pahole on 32 and 64 bit compiles to confirm it has no change in the structure layout, so this patch is a NOP. Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/bnxt_re-abi.h | 14 +-- include/uapi/rdma/cxgb3-abi.h | 12 +-- include/uapi/rdma/cxgb4-abi.h | 24 ++--- include/uapi/rdma/hfi/hfi1_ioctl.h | 32 +++--- include/uapi/rdma/hfi/hfi1_user.h | 4 +- include/uapi/rdma/hns-abi.h | 14 +-- include/uapi/rdma/i40iw-abi.h | 12 +-- include/uapi/rdma/ib_user_cm.h | 48 ++++----- include/uapi/rdma/ib_user_mad.h | 4 +- include/uapi/rdma/ib_user_verbs.h | 158 ++++++++++++++--------------- include/uapi/rdma/mlx4-abi.h | 24 ++--- include/uapi/rdma/mlx5-abi.h | 40 ++++---- include/uapi/rdma/mthca-abi.h | 10 +- include/uapi/rdma/nes-abi.h | 6 +- include/uapi/rdma/ocrdma-abi.h | 30 +++--- include/uapi/rdma/qedr-abi.h | 16 +-- include/uapi/rdma/rdma_user_cm.h | 34 +++---- include/uapi/rdma/rdma_user_rxe.h | 22 ++-- include/uapi/rdma/vmw_pvrdma-abi.h | 48 ++++----- 19 files changed, 276 insertions(+), 276 deletions(-) diff --git a/include/uapi/rdma/bnxt_re-abi.h b/include/uapi/rdma/bnxt_re-abi.h index 2d3c9aac661a..a7a6111e50c7 100644 --- a/include/uapi/rdma/bnxt_re-abi.h +++ b/include/uapi/rdma/bnxt_re-abi.h @@ -65,8 +65,8 @@ struct bnxt_re_pd_resp { } __attribute__((packed, aligned(4))); struct bnxt_re_cq_req { - __u64 cq_va; - __u64 cq_handle; + __aligned_u64 cq_va; + __aligned_u64 cq_handle; }; struct bnxt_re_cq_resp { @@ -77,9 +77,9 @@ struct bnxt_re_cq_resp { }; struct bnxt_re_qp_req { - __u64 qpsva; - __u64 qprva; - __u64 qp_handle; + __aligned_u64 qpsva; + __aligned_u64 qprva; + __aligned_u64 qp_handle; }; struct bnxt_re_qp_resp { @@ -88,8 +88,8 @@ struct bnxt_re_qp_resp { }; struct bnxt_re_srq_req { - __u64 srqva; - __u64 srq_handle; + __aligned_u64 srqva; + __aligned_u64 srq_handle; }; struct bnxt_re_srq_resp { diff --git a/include/uapi/rdma/cxgb3-abi.h b/include/uapi/rdma/cxgb3-abi.h index 17116c1c7925..9acb4b7a6246 100644 --- a/include/uapi/rdma/cxgb3-abi.h +++ b/include/uapi/rdma/cxgb3-abi.h @@ -41,21 +41,21 @@ * Make sure that all structs defined in this file remain laid out so * that they pack the same way on 32-bit and 64-bit architectures (to * avoid incompatibility between 32-bit userspace and 64-bit kernels). - * In particular do not use pointer types -- pass pointers in __u64 + * In particular do not use pointer types -- pass pointers in __aligned_u64 * instead. */ struct iwch_create_cq_req { - __u64 user_rptr_addr; + __aligned_u64 user_rptr_addr; }; struct iwch_create_cq_resp_v0 { - __u64 key; + __aligned_u64 key; __u32 cqid; __u32 size_log2; }; struct iwch_create_cq_resp { - __u64 key; + __aligned_u64 key; __u32 cqid; __u32 size_log2; __u32 memsize; @@ -63,8 +63,8 @@ struct iwch_create_cq_resp { }; struct iwch_create_qp_resp { - __u64 key; - __u64 db_key; + __aligned_u64 key; + __aligned_u64 db_key; __u32 qpid; __u32 size_log2; __u32 sq_size_log2; diff --git a/include/uapi/rdma/cxgb4-abi.h b/include/uapi/rdma/cxgb4-abi.h index c398a1ee8d00..1fefd0140c26 100644 --- a/include/uapi/rdma/cxgb4-abi.h +++ b/include/uapi/rdma/cxgb4-abi.h @@ -41,13 +41,13 @@ * Make sure that all structs defined in this file remain laid out so * that they pack the same way on 32-bit and 64-bit architectures (to * avoid incompatibility between 32-bit userspace and 64-bit kernels). 
- * In particular do not use pointer types -- pass pointers in __u64 + * In particular do not use pointer types -- pass pointers in __aligned_u64 * instead. */ struct c4iw_create_cq_resp { - __u64 key; - __u64 gts_key; - __u64 memsize; + __aligned_u64 key; + __aligned_u64 gts_key; + __aligned_u64 memsize; __u32 cqid; __u32 size; __u32 qid_mask; @@ -59,13 +59,13 @@ enum { }; struct c4iw_create_qp_resp { - __u64 ma_sync_key; - __u64 sq_key; - __u64 rq_key; - __u64 sq_db_gts_key; - __u64 rq_db_gts_key; - __u64 sq_memsize; - __u64 rq_memsize; + __aligned_u64 ma_sync_key; + __aligned_u64 sq_key; + __aligned_u64 rq_key; + __aligned_u64 sq_db_gts_key; + __aligned_u64 rq_db_gts_key; + __aligned_u64 sq_memsize; + __aligned_u64 rq_memsize; __u32 sqid; __u32 rqid; __u32 sq_size; @@ -75,7 +75,7 @@ struct c4iw_create_qp_resp { }; struct c4iw_alloc_ucontext_resp { - __u64 status_page_key; + __aligned_u64 status_page_key; __u32 status_page_size; __u32 reserved; /* explicit padding (optional for i386) */ }; diff --git a/include/uapi/rdma/hfi/hfi1_ioctl.h b/include/uapi/rdma/hfi/hfi1_ioctl.h index 9de78c5ee913..8f3d9fe7b141 100644 --- a/include/uapi/rdma/hfi/hfi1_ioctl.h +++ b/include/uapi/rdma/hfi/hfi1_ioctl.h @@ -79,7 +79,7 @@ struct hfi1_user_info { }; struct hfi1_ctxt_info { - __u64 runtime_flags; /* chip/drv runtime flags (HFI1_CAP_*) */ + __aligned_u64 runtime_flags; /* chip/drv runtime flags (HFI1_CAP_*) */ __u32 rcvegr_size; /* size of each eager buffer */ __u16 num_active; /* number of active units */ __u16 unit; /* unit (chip) assigned to caller */ @@ -98,9 +98,9 @@ struct hfi1_ctxt_info { struct hfi1_tid_info { /* virtual address of first page in transfer */ - __u64 vaddr; + __aligned_u64 vaddr; /* pointer to tid array. this array is big enough */ - __u64 tidlist; + __aligned_u64 tidlist; /* number of tids programmed by this request */ __u32 tidcnt; /* length of transfer buffer programmed by this request */ @@ -131,23 +131,23 @@ struct hfi1_base_info { */ __u32 bthqp; /* PIO credit return address, */ - __u64 sc_credits_addr; + __aligned_u64 sc_credits_addr; /* * Base address of write-only pio buffers for this process. * Each buffer has sendpio_credits*64 bytes. */ - __u64 pio_bufbase_sop; + __aligned_u64 pio_bufbase_sop; /* * Base address of write-only pio buffers for this process. * Each buffer has sendpio_credits*64 bytes. */ - __u64 pio_bufbase; + __aligned_u64 pio_bufbase; /* address where receive buffer queue is mapped into */ - __u64 rcvhdr_bufbase; + __aligned_u64 rcvhdr_bufbase; /* base address of Eager receive buffers. */ - __u64 rcvegr_bufbase; + __aligned_u64 rcvegr_bufbase; /* base address of SDMA completion ring */ - __u64 sdma_comp_bufbase; + __aligned_u64 sdma_comp_bufbase; /* * User register base for init code, not to be used directly by * protocol or applications. Always maps real chip register space. @@ -155,20 +155,20 @@ struct hfi1_base_info { * ur_rcvhdrhead, ur_rcvhdrtail, ur_rcvegrhead, ur_rcvegrtail, * ur_rcvtidflow */ - __u64 user_regbase; + __aligned_u64 user_regbase; /* notification events */ - __u64 events_bufbase; + __aligned_u64 events_bufbase; /* status page */ - __u64 status_bufbase; + __aligned_u64 status_bufbase; /* rcvhdrtail update */ - __u64 rcvhdrtail_base; + __aligned_u64 rcvhdrtail_base; /* * shared memory pages for subctxts if ctxt is shared; these cover * all the processes in the group sharing a single context. * all have enough space for the num_subcontexts value on this job. 
*/ - __u64 subctxt_uregbase; - __u64 subctxt_rcvegrbuf; - __u64 subctxt_rcvhdrbuf; + __aligned_u64 subctxt_uregbase; + __aligned_u64 subctxt_rcvegrbuf; + __aligned_u64 subctxt_rcvhdrbuf; }; #endif /* _LINIUX__HFI1_IOCTL_H */ diff --git a/include/uapi/rdma/hfi/hfi1_user.h b/include/uapi/rdma/hfi/hfi1_user.h index 43b46bf6f8bb..c6a984c0c881 100644 --- a/include/uapi/rdma/hfi/hfi1_user.h +++ b/include/uapi/rdma/hfi/hfi1_user.h @@ -177,8 +177,8 @@ struct hfi1_sdma_comp_entry { * Device status and notifications from driver to user-space. */ struct hfi1_status { - __u64 dev; /* device/hw status bits */ - __u64 port; /* port state and status bits */ + __aligned_u64 dev; /* device/hw status bits */ + __aligned_u64 port; /* port state and status bits */ char freezemsg[0]; }; diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h index aa774985a0c7..7092c8de4bd8 100644 --- a/include/uapi/rdma/hns-abi.h +++ b/include/uapi/rdma/hns-abi.h @@ -37,18 +37,18 @@ #include struct hns_roce_ib_create_cq { - __u64 buf_addr; - __u64 db_addr; + __aligned_u64 buf_addr; + __aligned_u64 db_addr; }; struct hns_roce_ib_create_cq_resp { - __u64 cqn; /* Only 32 bits used, 64 for compat */ - __u64 cap_flags; + __aligned_u64 cqn; /* Only 32 bits used, 64 for compat */ + __aligned_u64 cap_flags; }; struct hns_roce_ib_create_qp { - __u64 buf_addr; - __u64 db_addr; + __aligned_u64 buf_addr; + __aligned_u64 db_addr; __u8 log_sq_bb_count; __u8 log_sq_stride; __u8 sq_no_prefetch; @@ -56,7 +56,7 @@ struct hns_roce_ib_create_qp { }; struct hns_roce_ib_create_qp_resp { - __u64 cap_flags; + __aligned_u64 cap_flags; }; struct hns_roce_ib_alloc_ucontext_resp { diff --git a/include/uapi/rdma/i40iw-abi.h b/include/uapi/rdma/i40iw-abi.h index bfc3aaf2e56a..79890baa6fdb 100644 --- a/include/uapi/rdma/i40iw-abi.h +++ b/include/uapi/rdma/i40iw-abi.h @@ -61,17 +61,17 @@ struct i40iw_alloc_pd_resp { }; struct i40iw_create_cq_req { - __u64 user_cq_buffer; - __u64 user_shadow_area; + __aligned_u64 user_cq_buffer; + __aligned_u64 user_shadow_area; }; struct i40iw_create_qp_req { - __u64 user_wqe_buffers; - __u64 user_compl_ctx; + __aligned_u64 user_wqe_buffers; + __aligned_u64 user_compl_ctx; /* UDA QP PHB */ - __u64 user_sq_phb; /* place for VA of the sq phb buff */ - __u64 user_rq_phb; /* place for VA of the rq phb buff */ + __aligned_u64 user_sq_phb; /* place for VA of the sq phb buff */ + __aligned_u64 user_rq_phb; /* place for VA of the rq phb buff */ }; enum i40iw_memreg_type { diff --git a/include/uapi/rdma/ib_user_cm.h b/include/uapi/rdma/ib_user_cm.h index f4041bdc4d08..4a8f9562f7cd 100644 --- a/include/uapi/rdma/ib_user_cm.h +++ b/include/uapi/rdma/ib_user_cm.h @@ -73,8 +73,8 @@ struct ib_ucm_cmd_hdr { }; struct ib_ucm_create_id { - __u64 uid; - __u64 response; + __aligned_u64 uid; + __aligned_u64 response; }; struct ib_ucm_create_id_resp { @@ -82,7 +82,7 @@ struct ib_ucm_create_id_resp { }; struct ib_ucm_destroy_id { - __u64 response; + __aligned_u64 response; __u32 id; __u32 reserved; }; @@ -92,7 +92,7 @@ struct ib_ucm_destroy_id_resp { }; struct ib_ucm_attr_id { - __u64 response; + __aligned_u64 response; __u32 id; __u32 reserved; }; @@ -105,7 +105,7 @@ struct ib_ucm_attr_id_resp { }; struct ib_ucm_init_qp_attr { - __u64 response; + __aligned_u64 response; __u32 id; __u32 qp_state; }; @@ -123,7 +123,7 @@ struct ib_ucm_notify { }; struct ib_ucm_private_data { - __u64 data; + __aligned_u64 data; __u32 id; __u8 len; __u8 reserved[3]; @@ -135,9 +135,9 @@ struct ib_ucm_req { __u32 qp_type; __u32 psn; __be64 sid; - __u64 
data; - __u64 primary_path; - __u64 alternate_path; + __aligned_u64 data; + __aligned_u64 primary_path; + __aligned_u64 alternate_path; __u8 len; __u8 peer_to_peer; __u8 responder_resources; @@ -153,8 +153,8 @@ struct ib_ucm_req { }; struct ib_ucm_rep { - __u64 uid; - __u64 data; + __aligned_u64 uid; + __aligned_u64 data; __u32 id; __u32 qpn; __u32 psn; @@ -172,15 +172,15 @@ struct ib_ucm_rep { struct ib_ucm_info { __u32 id; __u32 status; - __u64 info; - __u64 data; + __aligned_u64 info; + __aligned_u64 data; __u8 info_len; __u8 data_len; __u8 reserved[6]; }; struct ib_ucm_mra { - __u64 data; + __aligned_u64 data; __u32 id; __u8 len; __u8 timeout; @@ -188,8 +188,8 @@ struct ib_ucm_mra { }; struct ib_ucm_lap { - __u64 path; - __u64 data; + __aligned_u64 path; + __aligned_u64 data; __u32 id; __u8 len; __u8 reserved[3]; @@ -199,8 +199,8 @@ struct ib_ucm_sidr_req { __u32 id; __u32 timeout; __be64 sid; - __u64 data; - __u64 path; + __aligned_u64 data; + __aligned_u64 path; __u16 reserved_pkey; __u8 len; __u8 max_cm_retries; @@ -212,8 +212,8 @@ struct ib_ucm_sidr_rep { __u32 qpn; __u32 qkey; __u32 status; - __u64 info; - __u64 data; + __aligned_u64 info; + __aligned_u64 data; __u8 info_len; __u8 data_len; __u8 reserved[6]; @@ -222,9 +222,9 @@ struct ib_ucm_sidr_rep { * event notification ABI structures. */ struct ib_ucm_event_get { - __u64 response; - __u64 data; - __u64 info; + __aligned_u64 response; + __aligned_u64 data; + __aligned_u64 info; __u8 data_len; __u8 info_len; __u8 reserved[6]; @@ -303,7 +303,7 @@ struct ib_ucm_sidr_rep_event_resp { #define IB_UCM_PRES_ALTERNATE 0x08 struct ib_ucm_event_resp { - __u64 uid; + __aligned_u64 uid; __u32 id; __u32 event; __u32 present; diff --git a/include/uapi/rdma/ib_user_mad.h b/include/uapi/rdma/ib_user_mad.h index 330a3c5f1aa8..ef92118dad97 100644 --- a/include/uapi/rdma/ib_user_mad.h +++ b/include/uapi/rdma/ib_user_mad.h @@ -143,7 +143,7 @@ struct ib_user_mad_hdr { */ struct ib_user_mad { struct ib_user_mad_hdr hdr; - __u64 data[0]; + __aligned_u64 data[0]; }; /* @@ -225,7 +225,7 @@ struct ib_user_mad_reg_req2 { __u8 mgmt_class_version; __u16 res; __u32 flags; - __u64 method_mask[2]; + __aligned_u64 method_mask[2]; __u32 oui; __u8 rmpp_version; __u8 reserved[3]; diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index d56fba09dc8a..aa0615105563 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -117,13 +117,13 @@ enum { */ struct ib_uverbs_async_event_desc { - __u64 element; + __aligned_u64 element; __u32 event_type; /* enum ib_event_type */ __u32 reserved; }; struct ib_uverbs_comp_event_desc { - __u64 cq_handle; + __aligned_u64 cq_handle; }; struct ib_uverbs_cq_moderation_caps { @@ -150,15 +150,15 @@ struct ib_uverbs_cmd_hdr { }; struct ib_uverbs_ex_cmd_hdr { - __u64 response; + __aligned_u64 response; __u16 provider_in_words; __u16 provider_out_words; __u32 cmd_hdr_reserved; }; struct ib_uverbs_get_context { - __u64 response; - __u64 driver_data[0]; + __aligned_u64 response; + __aligned_u64 driver_data[0]; }; struct ib_uverbs_get_context_resp { @@ -167,16 +167,16 @@ struct ib_uverbs_get_context_resp { }; struct ib_uverbs_query_device { - __u64 response; - __u64 driver_data[0]; + __aligned_u64 response; + __aligned_u64 driver_data[0]; }; struct ib_uverbs_query_device_resp { - __u64 fw_ver; + __aligned_u64 fw_ver; __be64 node_guid; __be64 sys_image_guid; - __u64 max_mr_size; - __u64 page_size_cap; + __aligned_u64 max_mr_size; + __aligned_u64 page_size_cap; __u32 vendor_id; 
__u32 vendor_part_id; __u32 hw_ver; @@ -221,7 +221,7 @@ struct ib_uverbs_ex_query_device { }; struct ib_uverbs_odp_caps { - __u64 general_caps; + __aligned_u64 general_caps; struct { __u32 rc_odp_caps; __u32 uc_odp_caps; @@ -260,9 +260,9 @@ struct ib_uverbs_ex_query_device_resp { __u32 comp_mask; __u32 response_length; struct ib_uverbs_odp_caps odp_caps; - __u64 timestamp_mask; - __u64 hca_core_clock; /* in KHZ */ - __u64 device_cap_flags_ex; + __aligned_u64 timestamp_mask; + __aligned_u64 hca_core_clock; /* in KHZ */ + __aligned_u64 device_cap_flags_ex; struct ib_uverbs_rss_caps rss_caps; __u32 max_wq_type_rq; __u32 raw_packet_caps; @@ -271,10 +271,10 @@ struct ib_uverbs_ex_query_device_resp { }; struct ib_uverbs_query_port { - __u64 response; + __aligned_u64 response; __u8 port_num; __u8 reserved[7]; - __u64 driver_data[0]; + __aligned_u64 driver_data[0]; }; struct ib_uverbs_query_port_resp { @@ -302,8 +302,8 @@ struct ib_uverbs_query_port_resp { }; struct ib_uverbs_alloc_pd { - __u64 response; - __u64 driver_data[0]; + __aligned_u64 response; + __aligned_u64 driver_data[0]; }; struct ib_uverbs_alloc_pd_resp { @@ -315,10 +315,10 @@ struct ib_uverbs_dealloc_pd { }; struct ib_uverbs_open_xrcd { - __u64 response; + __aligned_u64 response; __u32 fd; __u32 oflags; - __u64 driver_data[0]; + __aligned_u64 driver_data[0]; }; struct ib_uverbs_open_xrcd_resp { @@ -330,13 +330,13 @@ struct ib_uverbs_close_xrcd { }; struct ib_uverbs_reg_mr { - __u64 response; - __u64 start; - __u64 length; - __u64 hca_va; + __aligned_u64 response; + __aligned_u64 start; + __aligned_u64 length; + __aligned_u64 hca_va; __u32 pd_handle; __u32 access_flags; - __u64 driver_data[0]; + __aligned_u64 driver_data[0]; }; struct ib_uverbs_reg_mr_resp { @@ -346,12 +346,12 @@ struct ib_uverbs_reg_mr_resp { }; struct ib_uverbs_rereg_mr { - __u64 response; + __aligned_u64 response; __u32 mr_handle; __u32 flags; - __u64 start; - __u64 length; - __u64 hca_va; + __aligned_u64 start; + __aligned_u64 length; + __aligned_u64 hca_va; __u32 pd_handle; __u32 access_flags; }; @@ -366,7 +366,7 @@ struct ib_uverbs_dereg_mr { }; struct ib_uverbs_alloc_mw { - __u64 response; + __aligned_u64 response; __u32 pd_handle; __u8 mw_type; __u8 reserved[3]; @@ -382,7 +382,7 @@ struct ib_uverbs_dealloc_mw { }; struct ib_uverbs_create_comp_channel { - __u64 response; + __aligned_u64 response; }; struct ib_uverbs_create_comp_channel_resp { @@ -390,13 +390,13 @@ struct ib_uverbs_create_comp_channel_resp { }; struct ib_uverbs_create_cq { - __u64 response; - __u64 user_handle; + __aligned_u64 response; + __aligned_u64 user_handle; __u32 cqe; __u32 comp_vector; __s32 comp_channel; __u32 reserved; - __u64 driver_data[0]; + __aligned_u64 driver_data[0]; }; enum ib_uverbs_ex_create_cq_flags { @@ -405,7 +405,7 @@ enum ib_uverbs_ex_create_cq_flags { }; struct ib_uverbs_ex_create_cq { - __u64 user_handle; + __aligned_u64 user_handle; __u32 cqe; __u32 comp_vector; __s32 comp_channel; @@ -426,26 +426,26 @@ struct ib_uverbs_ex_create_cq_resp { }; struct ib_uverbs_resize_cq { - __u64 response; + __aligned_u64 response; __u32 cq_handle; __u32 cqe; - __u64 driver_data[0]; + __aligned_u64 driver_data[0]; }; struct ib_uverbs_resize_cq_resp { __u32 cqe; __u32 reserved; - __u64 driver_data[0]; + __aligned_u64 driver_data[0]; }; struct ib_uverbs_poll_cq { - __u64 response; + __aligned_u64 response; __u32 cq_handle; __u32 ne; }; struct ib_uverbs_wc { - __u64 wr_id; + __aligned_u64 wr_id; __u32 status; __u32 opcode; __u32 vendor_err; @@ -477,7 +477,7 @@ struct 
ib_uverbs_req_notify_cq { }; struct ib_uverbs_destroy_cq { - __u64 response; + __aligned_u64 response; __u32 cq_handle; __u32 reserved; }; @@ -546,8 +546,8 @@ struct ib_uverbs_qp_attr { }; struct ib_uverbs_create_qp { - __u64 response; - __u64 user_handle; + __aligned_u64 response; + __aligned_u64 user_handle; __u32 pd_handle; __u32 send_cq_handle; __u32 recv_cq_handle; @@ -561,7 +561,7 @@ struct ib_uverbs_create_qp { __u8 qp_type; __u8 is_srq; __u8 reserved; - __u64 driver_data[0]; + __aligned_u64 driver_data[0]; }; enum ib_uverbs_create_qp_mask { @@ -587,7 +587,7 @@ enum { }; struct ib_uverbs_ex_create_qp { - __u64 user_handle; + __aligned_u64 user_handle; __u32 pd_handle; __u32 send_cq_handle; __u32 recv_cq_handle; @@ -608,13 +608,13 @@ struct ib_uverbs_ex_create_qp { }; struct ib_uverbs_open_qp { - __u64 response; - __u64 user_handle; + __aligned_u64 response; + __aligned_u64 user_handle; __u32 pd_handle; __u32 qpn; __u8 qp_type; __u8 reserved[7]; - __u64 driver_data[0]; + __aligned_u64 driver_data[0]; }; /* also used for open response */ @@ -655,10 +655,10 @@ struct ib_uverbs_qp_dest { }; struct ib_uverbs_query_qp { - __u64 response; + __aligned_u64 response; __u32 qp_handle; __u32 attr_mask; - __u64 driver_data[0]; + __aligned_u64 driver_data[0]; }; struct ib_uverbs_query_qp_resp { @@ -692,7 +692,7 @@ struct ib_uverbs_query_qp_resp { __u8 alt_timeout; __u8 sq_sig_all; __u8 reserved[5]; - __u64 driver_data[0]; + __aligned_u64 driver_data[0]; }; struct ib_uverbs_modify_qp { @@ -722,7 +722,7 @@ struct ib_uverbs_modify_qp { __u8 alt_port_num; __u8 alt_timeout; __u8 reserved[2]; - __u64 driver_data[0]; + __aligned_u64 driver_data[0]; }; struct ib_uverbs_ex_modify_qp { @@ -740,7 +740,7 @@ struct ib_uverbs_ex_modify_qp_resp { }; struct ib_uverbs_destroy_qp { - __u64 response; + __aligned_u64 response; __u32 qp_handle; __u32 reserved; }; @@ -756,13 +756,13 @@ struct ib_uverbs_destroy_qp_resp { * document the ABI. 
*/ struct ib_uverbs_sge { - __u64 addr; + __aligned_u64 addr; __u32 length; __u32 lkey; }; struct ib_uverbs_send_wr { - __u64 wr_id; + __aligned_u64 wr_id; __u32 num_sge; __u32 opcode; __u32 send_flags; @@ -772,14 +772,14 @@ struct ib_uverbs_send_wr { } ex; union { struct { - __u64 remote_addr; + __aligned_u64 remote_addr; __u32 rkey; __u32 reserved; } rdma; struct { - __u64 remote_addr; - __u64 compare_add; - __u64 swap; + __aligned_u64 remote_addr; + __aligned_u64 compare_add; + __aligned_u64 swap; __u32 rkey; __u32 reserved; } atomic; @@ -793,7 +793,7 @@ struct ib_uverbs_send_wr { }; struct ib_uverbs_post_send { - __u64 response; + __aligned_u64 response; __u32 qp_handle; __u32 wr_count; __u32 sge_count; @@ -806,13 +806,13 @@ struct ib_uverbs_post_send_resp { }; struct ib_uverbs_recv_wr { - __u64 wr_id; + __aligned_u64 wr_id; __u32 num_sge; __u32 reserved; }; struct ib_uverbs_post_recv { - __u64 response; + __aligned_u64 response; __u32 qp_handle; __u32 wr_count; __u32 sge_count; @@ -825,7 +825,7 @@ struct ib_uverbs_post_recv_resp { }; struct ib_uverbs_post_srq_recv { - __u64 response; + __aligned_u64 response; __u32 srq_handle; __u32 wr_count; __u32 sge_count; @@ -838,8 +838,8 @@ struct ib_uverbs_post_srq_recv_resp { }; struct ib_uverbs_create_ah { - __u64 response; - __u64 user_handle; + __aligned_u64 response; + __aligned_u64 user_handle; __u32 pd_handle; __u32 reserved; struct ib_uverbs_ah_attr attr; @@ -858,7 +858,7 @@ struct ib_uverbs_attach_mcast { __u32 qp_handle; __u16 mlid; __u16 reserved; - __u64 driver_data[0]; + __aligned_u64 driver_data[0]; }; struct ib_uverbs_detach_mcast { @@ -866,7 +866,7 @@ struct ib_uverbs_detach_mcast { __u32 qp_handle; __u16 mlid; __u16 reserved; - __u64 driver_data[0]; + __aligned_u64 driver_data[0]; }; struct ib_uverbs_flow_spec_hdr { @@ -874,7 +874,7 @@ struct ib_uverbs_flow_spec_hdr { __u16 size; __u16 reserved; /* followed by flow_spec */ - __u64 flow_spec_data[0]; + __aligned_u64 flow_spec_data[0]; }; struct ib_uverbs_flow_eth_filter { @@ -1033,18 +1033,18 @@ struct ib_uverbs_destroy_flow { }; struct ib_uverbs_create_srq { - __u64 response; - __u64 user_handle; + __aligned_u64 response; + __aligned_u64 user_handle; __u32 pd_handle; __u32 max_wr; __u32 max_sge; __u32 srq_limit; - __u64 driver_data[0]; + __aligned_u64 driver_data[0]; }; struct ib_uverbs_create_xsrq { - __u64 response; - __u64 user_handle; + __aligned_u64 response; + __aligned_u64 user_handle; __u32 srq_type; __u32 pd_handle; __u32 max_wr; @@ -1053,7 +1053,7 @@ struct ib_uverbs_create_xsrq { __u32 max_num_tags; __u32 xrcd_handle; __u32 cq_handle; - __u64 driver_data[0]; + __aligned_u64 driver_data[0]; }; struct ib_uverbs_create_srq_resp { @@ -1068,14 +1068,14 @@ struct ib_uverbs_modify_srq { __u32 attr_mask; __u32 max_wr; __u32 srq_limit; - __u64 driver_data[0]; + __aligned_u64 driver_data[0]; }; struct ib_uverbs_query_srq { - __u64 response; + __aligned_u64 response; __u32 srq_handle; __u32 reserved; - __u64 driver_data[0]; + __aligned_u64 driver_data[0]; }; struct ib_uverbs_query_srq_resp { @@ -1086,7 +1086,7 @@ struct ib_uverbs_query_srq_resp { }; struct ib_uverbs_destroy_srq { - __u64 response; + __aligned_u64 response; __u32 srq_handle; __u32 reserved; }; @@ -1098,7 +1098,7 @@ struct ib_uverbs_destroy_srq_resp { struct ib_uverbs_ex_create_wq { __u32 comp_mask; __u32 wq_type; - __u64 user_handle; + __aligned_u64 user_handle; __u32 pd_handle; __u32 cq_handle; __u32 max_wr; diff --git a/include/uapi/rdma/mlx4-abi.h b/include/uapi/rdma/mlx4-abi.h index 50a56aeb1f41..04f64bc4045f 
100644 --- a/include/uapi/rdma/mlx4-abi.h +++ b/include/uapi/rdma/mlx4-abi.h @@ -77,8 +77,8 @@ struct mlx4_ib_alloc_pd_resp { }; struct mlx4_ib_create_cq { - __u64 buf_addr; - __u64 db_addr; + __aligned_u64 buf_addr; + __aligned_u64 db_addr; }; struct mlx4_ib_create_cq_resp { @@ -87,12 +87,12 @@ struct mlx4_ib_create_cq_resp { }; struct mlx4_ib_resize_cq { - __u64 buf_addr; + __aligned_u64 buf_addr; }; struct mlx4_ib_create_srq { - __u64 buf_addr; - __u64 db_addr; + __aligned_u64 buf_addr; + __aligned_u64 db_addr; }; struct mlx4_ib_create_srq_resp { @@ -101,7 +101,7 @@ struct mlx4_ib_create_srq_resp { }; struct mlx4_ib_create_qp_rss { - __u64 rx_hash_fields_mask; /* Use enum mlx4_ib_rx_hash_fields */ + __aligned_u64 rx_hash_fields_mask; /* Use enum mlx4_ib_rx_hash_fields */ __u8 rx_hash_function; /* Use enum mlx4_ib_rx_hash_function_flags */ __u8 reserved[7]; __u8 rx_hash_key[40]; @@ -110,8 +110,8 @@ struct mlx4_ib_create_qp_rss { }; struct mlx4_ib_create_qp { - __u64 buf_addr; - __u64 db_addr; + __aligned_u64 buf_addr; + __aligned_u64 db_addr; __u8 log_sq_bb_count; __u8 log_sq_stride; __u8 sq_no_prefetch; @@ -120,8 +120,8 @@ struct mlx4_ib_create_qp { }; struct mlx4_ib_create_wq { - __u64 buf_addr; - __u64 db_addr; + __aligned_u64 buf_addr; + __aligned_u64 db_addr; __u8 log_range_size; __u8 reserved[3]; __u32 comp_mask; @@ -161,7 +161,7 @@ enum mlx4_ib_rx_hash_fields { }; struct mlx4_ib_rss_caps { - __u64 rx_hash_fields_mask; /* enum mlx4_ib_rx_hash_fields */ + __aligned_u64 rx_hash_fields_mask; /* enum mlx4_ib_rx_hash_fields */ __u8 rx_hash_function; /* enum mlx4_ib_rx_hash_function_flags */ __u8 reserved[7]; }; @@ -181,7 +181,7 @@ struct mlx4_ib_tso_caps { struct mlx4_uverbs_ex_query_device_resp { __u32 comp_mask; __u32 response_length; - __u64 hca_core_clock_offset; + __aligned_u64 hca_core_clock_offset; __u32 max_inl_recv_sz; __u32 reserved; struct mlx4_ib_rss_caps rss_caps; diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index d2e0d234704f..09c50f390a3c 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -84,7 +84,7 @@ struct mlx5_ib_alloc_ucontext_req_v2 { __u8 reserved0; __u16 reserved1; __u32 reserved2; - __u64 lib_caps; + __aligned_u64 lib_caps; }; enum mlx5_ib_alloc_ucontext_resp_mask { @@ -125,7 +125,7 @@ struct mlx5_ib_alloc_ucontext_resp { __u8 cmds_supp_uhw; __u8 eth_min_inline; __u8 clock_info_versions; - __u64 hca_core_clock_offset; + __aligned_u64 hca_core_clock_offset; __u32 log_uar_size; __u32 num_uars_per_page; __u32 num_dyn_bfregs; @@ -147,7 +147,7 @@ struct mlx5_ib_tso_caps { }; struct mlx5_ib_rss_caps { - __u64 rx_hash_fields_mask; /* enum mlx5_rx_hash_fields */ + __aligned_u64 rx_hash_fields_mask; /* enum mlx5_rx_hash_fields */ __u8 rx_hash_function; /* enum mlx5_rx_hash_function_flags */ __u8 reserved[7]; }; @@ -248,8 +248,8 @@ enum mlx5_ib_create_cq_flags { }; struct mlx5_ib_create_cq { - __u64 buf_addr; - __u64 db_addr; + __aligned_u64 buf_addr; + __aligned_u64 db_addr; __u32 cqe_size; __u8 cqe_comp_en; __u8 cqe_comp_res_format; @@ -262,15 +262,15 @@ struct mlx5_ib_create_cq_resp { }; struct mlx5_ib_resize_cq { - __u64 buf_addr; + __aligned_u64 buf_addr; __u16 cqe_size; __u16 reserved0; __u32 reserved1; }; struct mlx5_ib_create_srq { - __u64 buf_addr; - __u64 db_addr; + __aligned_u64 buf_addr; + __aligned_u64 db_addr; __u32 flags; __u32 reserved0; /* explicit padding (optional on i386) */ __u32 uidx; @@ -283,8 +283,8 @@ struct mlx5_ib_create_srq_resp { }; struct mlx5_ib_create_qp { - __u64 buf_addr; - __u64 
db_addr; + __aligned_u64 buf_addr; + __aligned_u64 db_addr; __u32 sq_wqe_count; __u32 rq_wqe_count; __u32 rq_wqe_shift; @@ -292,8 +292,8 @@ struct mlx5_ib_create_qp { __u32 uidx; __u32 bfreg_index; union { - __u64 sq_buf_addr; - __u64 access_key; + __aligned_u64 sq_buf_addr; + __aligned_u64 access_key; }; }; @@ -324,7 +324,7 @@ enum mlx5_rx_hash_fields { }; struct mlx5_ib_create_qp_rss { - __u64 rx_hash_fields_mask; /* enum mlx5_rx_hash_fields */ + __aligned_u64 rx_hash_fields_mask; /* enum mlx5_rx_hash_fields */ __u8 rx_hash_function; /* enum mlx5_rx_hash_function_flags */ __u8 rx_key_len; /* valid only for Toeplitz */ __u8 reserved[6]; @@ -349,8 +349,8 @@ enum mlx5_ib_create_wq_mask { }; struct mlx5_ib_create_wq { - __u64 buf_addr; - __u64 db_addr; + __aligned_u64 buf_addr; + __aligned_u64 db_addr; __u32 rq_wqe_count; __u32 rq_wqe_shift; __u32 user_index; @@ -402,13 +402,13 @@ struct mlx5_ib_modify_wq { struct mlx5_ib_clock_info { __u32 sign; __u32 resv; - __u64 nsec; - __u64 cycles; - __u64 frac; + __aligned_u64 nsec; + __aligned_u64 cycles; + __aligned_u64 frac; __u32 mult; __u32 shift; - __u64 mask; - __u64 overflow_period; + __aligned_u64 mask; + __aligned_u64 overflow_period; }; enum mlx5_ib_mmap_cmd { diff --git a/include/uapi/rdma/mthca-abi.h b/include/uapi/rdma/mthca-abi.h index 3020d8a907a7..ac756cd9e807 100644 --- a/include/uapi/rdma/mthca-abi.h +++ b/include/uapi/rdma/mthca-abi.h @@ -74,8 +74,8 @@ struct mthca_reg_mr { struct mthca_create_cq { __u32 lkey; __u32 pdn; - __u64 arm_db_page; - __u64 set_db_page; + __aligned_u64 arm_db_page; + __aligned_u64 set_db_page; __u32 arm_db_index; __u32 set_db_index; }; @@ -93,7 +93,7 @@ struct mthca_resize_cq { struct mthca_create_srq { __u32 lkey; __u32 db_index; - __u64 db_page; + __aligned_u64 db_page; }; struct mthca_create_srq_resp { @@ -104,8 +104,8 @@ struct mthca_create_srq_resp { struct mthca_create_qp { __u32 lkey; __u32 reserved; - __u64 sq_db_page; - __u64 rq_db_page; + __aligned_u64 sq_db_page; + __aligned_u64 rq_db_page; __u32 sq_db_index; __u32 rq_db_index; }; diff --git a/include/uapi/rdma/nes-abi.h b/include/uapi/rdma/nes-abi.h index f5b2437aab28..35bfd4015d07 100644 --- a/include/uapi/rdma/nes-abi.h +++ b/include/uapi/rdma/nes-abi.h @@ -72,14 +72,14 @@ struct nes_alloc_pd_resp { }; struct nes_create_cq_req { - __u64 user_cq_buffer; + __aligned_u64 user_cq_buffer; __u32 mcrqf; __u8 reserved[4]; }; struct nes_create_qp_req { - __u64 user_wqe_buffers; - __u64 user_qp_buffer; + __aligned_u64 user_wqe_buffers; + __aligned_u64 user_qp_buffer; }; enum iwnes_memreg_type { diff --git a/include/uapi/rdma/ocrdma-abi.h b/include/uapi/rdma/ocrdma-abi.h index 32ef8670583a..284d47b41f6e 100644 --- a/include/uapi/rdma/ocrdma-abi.h +++ b/include/uapi/rdma/ocrdma-abi.h @@ -55,13 +55,13 @@ struct ocrdma_alloc_ucontext_resp { __u32 wqe_size; __u32 max_inline_data; __u32 dpp_wqe_size; - __u64 ah_tbl_page; + __aligned_u64 ah_tbl_page; __u32 ah_tbl_len; __u32 rqe_size; __u8 fw_ver[32]; /* for future use/new features in progress */ - __u64 rsvd1; - __u64 rsvd2; + __aligned_u64 rsvd1; + __aligned_u64 rsvd2; }; struct ocrdma_alloc_pd_ureq { @@ -87,13 +87,13 @@ struct ocrdma_create_cq_uresp { __u32 page_size; __u32 num_pages; __u32 max_hw_cqe; - __u64 page_addr[MAX_CQ_PAGES]; - __u64 db_page_addr; + __aligned_u64 page_addr[MAX_CQ_PAGES]; + __aligned_u64 db_page_addr; __u32 db_page_size; __u32 phase_change; /* for future use/new features in progress */ - __u64 rsvd1; - __u64 rsvd2; + __aligned_u64 rsvd1; + __aligned_u64 rsvd2; }; #define 
MAX_QP_PAGES 8 @@ -115,9 +115,9 @@ struct ocrdma_create_qp_uresp { __u32 rq_page_size; __u32 num_sq_pages; __u32 num_rq_pages; - __u64 sq_page_addr[MAX_QP_PAGES]; - __u64 rq_page_addr[MAX_QP_PAGES]; - __u64 db_page_addr; + __aligned_u64 sq_page_addr[MAX_QP_PAGES]; + __aligned_u64 rq_page_addr[MAX_QP_PAGES]; + __aligned_u64 db_page_addr; __u32 db_page_size; __u32 dpp_credit; __u32 dpp_offset; @@ -126,7 +126,7 @@ struct ocrdma_create_qp_uresp { __u32 db_sq_offset; __u32 db_rq_offset; __u32 db_shift; - __u64 rsvd[11]; + __aligned_u64 rsvd[11]; }; struct ocrdma_create_srq_uresp { @@ -137,16 +137,16 @@ struct ocrdma_create_srq_uresp { __u32 rq_page_size; __u32 num_rq_pages; - __u64 rq_page_addr[MAX_QP_PAGES]; - __u64 db_page_addr; + __aligned_u64 rq_page_addr[MAX_QP_PAGES]; + __aligned_u64 db_page_addr; __u32 db_page_size; __u32 num_rqe_allocated; __u32 db_rq_offset; __u32 db_shift; - __u64 rsvd2; - __u64 rsvd3; + __aligned_u64 rsvd2; + __aligned_u64 rsvd3; }; #endif /* OCRDMA_ABI_USER_H */ diff --git a/include/uapi/rdma/qedr-abi.h b/include/uapi/rdma/qedr-abi.h index 396656062931..8ba098900e9a 100644 --- a/include/uapi/rdma/qedr-abi.h +++ b/include/uapi/rdma/qedr-abi.h @@ -40,7 +40,7 @@ /* user kernel communication data structures. */ struct qedr_alloc_ucontext_resp { - __u64 db_pa; + __aligned_u64 db_pa; __u32 db_size; __u32 max_send_wr; @@ -57,7 +57,7 @@ struct qedr_alloc_ucontext_resp { }; struct qedr_alloc_pd_ureq { - __u64 rsvd1; + __aligned_u64 rsvd1; }; struct qedr_alloc_pd_uresp { @@ -66,8 +66,8 @@ struct qedr_alloc_pd_uresp { }; struct qedr_create_cq_ureq { - __u64 addr; - __u64 len; + __aligned_u64 addr; + __aligned_u64 len; }; struct qedr_create_cq_uresp { @@ -82,17 +82,17 @@ struct qedr_create_qp_ureq { /* SQ */ /* user space virtual address of SQ buffer */ - __u64 sq_addr; + __aligned_u64 sq_addr; /* length of SQ buffer */ - __u64 sq_len; + __aligned_u64 sq_len; /* RQ */ /* user space virtual address of RQ buffer */ - __u64 rq_addr; + __aligned_u64 rq_addr; /* length of RQ buffer */ - __u64 rq_len; + __aligned_u64 rq_len; }; struct qedr_create_qp_uresp { diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h index 65399c837762..c4f28cb92214 100644 --- a/include/uapi/rdma/rdma_user_cm.h +++ b/include/uapi/rdma/rdma_user_cm.h @@ -80,8 +80,8 @@ struct rdma_ucm_cmd_hdr { }; struct rdma_ucm_create_id { - __u64 uid; - __u64 response; + __aligned_u64 uid; + __aligned_u64 response; __u16 ps; __u8 qp_type; __u8 reserved[5]; @@ -92,7 +92,7 @@ struct rdma_ucm_create_id_resp { }; struct rdma_ucm_destroy_id { - __u64 response; + __aligned_u64 response; __u32 id; __u32 reserved; }; @@ -102,7 +102,7 @@ struct rdma_ucm_destroy_id_resp { }; struct rdma_ucm_bind_ip { - __u64 response; + __aligned_u64 response; struct sockaddr_in6 addr; __u32 id; }; @@ -143,13 +143,13 @@ enum { }; struct rdma_ucm_query { - __u64 response; + __aligned_u64 response; __u32 id; __u32 option; }; struct rdma_ucm_query_route_resp { - __u64 node_guid; + __aligned_u64 node_guid; struct ib_user_path_rec ib_route[2]; struct sockaddr_in6 src_addr; struct sockaddr_in6 dst_addr; @@ -159,7 +159,7 @@ struct rdma_ucm_query_route_resp { }; struct rdma_ucm_query_addr_resp { - __u64 node_guid; + __aligned_u64 node_guid; __u8 port_num; __u8 reserved; __u16 pkey; @@ -210,7 +210,7 @@ struct rdma_ucm_listen { }; struct rdma_ucm_accept { - __u64 uid; + __aligned_u64 uid; struct rdma_ucm_conn_param conn_param; __u32 id; __u32 reserved; @@ -228,7 +228,7 @@ struct rdma_ucm_disconnect { }; struct rdma_ucm_init_qp_attr 
{ - __u64 response; + __aligned_u64 response; __u32 id; __u32 qp_state; }; @@ -239,8 +239,8 @@ struct rdma_ucm_notify { }; struct rdma_ucm_join_ip_mcast { - __u64 response; /* rdma_ucm_create_id_resp */ - __u64 uid; + __aligned_u64 response; /* rdma_ucm_create_id_resp */ + __aligned_u64 uid; struct sockaddr_in6 addr; __u32 id; }; @@ -253,8 +253,8 @@ enum { }; struct rdma_ucm_join_mcast { - __u64 response; /* rdma_ucma_create_id_resp */ - __u64 uid; + __aligned_u64 response; /* rdma_ucma_create_id_resp */ + __aligned_u64 uid; __u32 id; __u16 addr_size; __u16 join_flags; @@ -262,11 +262,11 @@ struct rdma_ucm_join_mcast { }; struct rdma_ucm_get_event { - __u64 response; + __aligned_u64 response; }; struct rdma_ucm_event_resp { - __u64 uid; + __aligned_u64 uid; __u32 id; __u32 event; __u32 status; @@ -296,7 +296,7 @@ enum { }; struct rdma_ucm_set_option { - __u64 optval; + __aligned_u64 optval; __u32 id; __u32 level; __u32 optname; @@ -304,7 +304,7 @@ struct rdma_ucm_set_option { }; struct rdma_ucm_migrate_id { - __u64 response; + __aligned_u64 response; __u32 id; __u32 fd; }; diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h index af8f8218aed5..1f8a9e7daea4 100644 --- a/include/uapi/rdma/rdma_user_rxe.h +++ b/include/uapi/rdma/rdma_user_rxe.h @@ -68,7 +68,7 @@ struct rxe_av { }; struct rxe_send_wr { - __u64 wr_id; + __aligned_u64 wr_id; __u32 num_sge; __u32 opcode; __u32 send_flags; @@ -78,14 +78,14 @@ struct rxe_send_wr { } ex; union { struct { - __u64 remote_addr; + __aligned_u64 remote_addr; __u32 rkey; __u32 reserved; } rdma; struct { - __u64 remote_addr; - __u64 compare_add; - __u64 swap; + __aligned_u64 remote_addr; + __aligned_u64 compare_add; + __aligned_u64 swap; __u32 rkey; __u32 reserved; } atomic; @@ -98,7 +98,7 @@ struct rxe_send_wr { struct { union { struct ib_mr *mr; - __u64 reserved; + __aligned_u64 reserved; }; __u32 key; __u32 access; @@ -107,13 +107,13 @@ struct rxe_send_wr { }; struct rxe_sge { - __u64 addr; + __aligned_u64 addr; __u32 length; __u32 lkey; }; struct mminfo { - __u64 offset; + __aligned_u64 offset; __u32 size; __u32 pad; }; @@ -136,7 +136,7 @@ struct rxe_send_wqe { struct rxe_av av; __u32 status; __u32 state; - __u64 iova; + __aligned_u64 iova; __u32 mask; __u32 first_psn; __u32 last_psn; @@ -147,7 +147,7 @@ struct rxe_send_wqe { }; struct rxe_recv_wqe { - __u64 wr_id; + __aligned_u64 wr_id; __u32 num_sge; __u32 padding; struct rxe_dma_info dma; @@ -173,7 +173,7 @@ struct rxe_create_srq_resp { }; struct rxe_modify_srq_cmd { - __u64 mmap_info_addr; + __aligned_u64 mmap_info_addr; }; #endif /* RDMA_USER_RXE_H */ diff --git a/include/uapi/rdma/vmw_pvrdma-abi.h b/include/uapi/rdma/vmw_pvrdma-abi.h index edf5c7224901..d13fd490b66d 100644 --- a/include/uapi/rdma/vmw_pvrdma-abi.h +++ b/include/uapi/rdma/vmw_pvrdma-abi.h @@ -143,7 +143,7 @@ struct pvrdma_alloc_pd_resp { }; struct pvrdma_create_cq { - __u64 buf_addr; + __aligned_u64 buf_addr; __u32 buf_size; __u32 reserved; }; @@ -154,13 +154,13 @@ struct pvrdma_create_cq_resp { }; struct pvrdma_resize_cq { - __u64 buf_addr; + __aligned_u64 buf_addr; __u32 buf_size; __u32 reserved; }; struct pvrdma_create_srq { - __u64 buf_addr; + __aligned_u64 buf_addr; __u32 buf_size; __u32 reserved; }; @@ -171,25 +171,25 @@ struct pvrdma_create_srq_resp { }; struct pvrdma_create_qp { - __u64 rbuf_addr; - __u64 sbuf_addr; + __aligned_u64 rbuf_addr; + __aligned_u64 sbuf_addr; __u32 rbuf_size; __u32 sbuf_size; - __u64 qp_addr; + __aligned_u64 qp_addr; }; /* PVRDMA masked atomic compare and swap */ 
struct pvrdma_ex_cmp_swap { - __u64 swap_val; - __u64 compare_val; - __u64 swap_mask; - __u64 compare_mask; + __aligned_u64 swap_val; + __aligned_u64 compare_val; + __aligned_u64 swap_mask; + __aligned_u64 compare_mask; }; /* PVRDMA masked atomic fetch and add */ struct pvrdma_ex_fetch_add { - __u64 add_val; - __u64 field_boundary; + __aligned_u64 add_val; + __aligned_u64 field_boundary; }; /* PVRDMA address vector. */ @@ -207,14 +207,14 @@ struct pvrdma_av { /* PVRDMA scatter/gather entry */ struct pvrdma_sge { - __u64 addr; + __aligned_u64 addr; __u32 length; __u32 lkey; }; /* PVRDMA receive queue work request */ struct pvrdma_rq_wqe_hdr { - __u64 wr_id; /* wr id */ + __aligned_u64 wr_id; /* wr id */ __u32 num_sge; /* size of s/g array */ __u32 total_len; /* reserved */ }; @@ -222,7 +222,7 @@ struct pvrdma_rq_wqe_hdr { /* PVRDMA send queue work request */ struct pvrdma_sq_wqe_hdr { - __u64 wr_id; /* wr id */ + __aligned_u64 wr_id; /* wr id */ __u32 num_sge; /* size of s/g array */ __u32 total_len; /* reserved */ __u32 opcode; /* operation type */ @@ -234,19 +234,19 @@ struct pvrdma_sq_wqe_hdr { __u32 reserved; union { struct { - __u64 remote_addr; + __aligned_u64 remote_addr; __u32 rkey; __u8 reserved[4]; } rdma; struct { - __u64 remote_addr; - __u64 compare_add; - __u64 swap; + __aligned_u64 remote_addr; + __aligned_u64 compare_add; + __aligned_u64 swap; __u32 rkey; __u32 reserved; } atomic; struct { - __u64 remote_addr; + __aligned_u64 remote_addr; __u32 log_arg_sz; __u32 rkey; union { @@ -255,8 +255,8 @@ struct pvrdma_sq_wqe_hdr { } wr_data; } masked_atomics; struct { - __u64 iova_start; - __u64 pl_pdir_dma; + __aligned_u64 iova_start; + __aligned_u64 pl_pdir_dma; __u32 page_shift; __u32 page_list_len; __u32 length; @@ -275,8 +275,8 @@ struct pvrdma_sq_wqe_hdr { /* Completion queue element. */ struct pvrdma_cqe { - __u64 wr_id; - __u64 qp; + __aligned_u64 wr_id; + __aligned_u64 qp; __u32 opcode; __u32 status; __u32 byte_len; From be23fb9a2c1d33037c1499a04e93bb0c03cf73d6 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 22 Mar 2018 11:52:02 +0200 Subject: [PATCH 142/199] IB/uverbs: UAPI pointers should use __aligned_u64 type The ioctl() UAPIs are meant to be used by both user-space and kernel ioctl() handlers. Mostly, these UAPI structs tend to consist of simple types, but sometimes user-space pointers may be passed between user-space and kernel. We would like to avoid dereferencing a user-space pointer in the kernel, thus - we always define RDMA_UAPI_PTR as a __aligned_u64 type. Fixes: 1f7ff9d5d36a ('IB/uverbs: Move to new headers and make naming consistent') Signed-off-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/ib_user_ioctl_verbs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h index 3d3a2f017abc..173629ecc09b 100644 --- a/include/uapi/rdma/ib_user_ioctl_verbs.h +++ b/include/uapi/rdma/ib_user_ioctl_verbs.h @@ -37,7 +37,7 @@ #include #ifndef RDMA_UAPI_PTR -#define RDMA_UAPI_PTR(_type, _name) _type __attribute__((aligned(8))) _name +#define RDMA_UAPI_PTR(_type, _name) __aligned_u64 _name #endif #endif From ea8af0d2f2b5b16da4553205ddaf225e0a057e03 Mon Sep 17 00:00:00 2001 From: Majd Dibbiny Date: Thu, 22 Mar 2018 15:34:03 +0200 Subject: [PATCH 143/199] IB/mlx5: Enable ECN capable bits for UD RoCE v2 QPs When working with RC QPs, the FW sets the ECN capable bits for all the RoCE v2 packets. 
On the other hand, for UD QPs, the driver needs to set the ECN capable bits in the Address Handler since the HW generates each packet according to the Address Handler and not the QP context. If ECN is not enabled in the NIC or switch, these bits are ignored. Fixes: 2811ba51b049 ("IB/mlx5: Add RoCE fields to Address Vector") Reviewed-by: Mark Bloch Signed-off-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/ah.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c index fe269f680103..e6bde32a83f3 100644 --- a/drivers/infiniband/hw/mlx5/ah.c +++ b/drivers/infiniband/hw/mlx5/ah.c @@ -36,6 +36,9 @@ static struct ib_ah *create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah, struct rdma_ah_attr *ah_attr) { + enum ib_gid_type gid_type; + int err; + if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); @@ -50,6 +53,12 @@ static struct ib_ah *create_ib_ah(struct mlx5_ib_dev *dev, ah->av.stat_rate_sl = (rdma_ah_get_static_rate(ah_attr) << 4); if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) { + err = mlx5_get_roce_gid_type(dev, ah_attr->port_num, + ah_attr->grh.sgid_index, + &gid_type); + if (err) + return ERR_PTR(err); + memcpy(ah->av.rmac, ah_attr->roce.dmac, sizeof(ah_attr->roce.dmac)); ah->av.udp_sport = @@ -57,6 +66,9 @@ static struct ib_ah *create_ib_ah(struct mlx5_ib_dev *dev, rdma_ah_get_port_num(ah_attr), rdma_ah_read_grh(ah_attr)->sgid_index); ah->av.stat_rate_sl |= (rdma_ah_get_sl(ah_attr) & 0x7) << 1; + if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) +#define MLX5_ECN_ENABLED BIT(1) + ah->av.tclass |= MLX5_ECN_ENABLED; } else { ah->av.rlid = cpu_to_be16(rdma_ah_get_dlid(ah_attr)); ah->av.fl_mlid = rdma_ah_get_path_bits(ah_attr) & 0x7f; From c8d75a980fab886a9c716567e6b47cc414ad84ee Mon Sep 17 00:00:00 2001 From: Majd Dibbiny Date: Thu, 22 Mar 2018 15:34:04 +0200 Subject: [PATCH 144/199] IB/mlx5: Respect new UMR capabilities In some firmware configurations, UMR usage from Virtual Functions is restricted. This information is published to the driver using new capability bits. Avoid using UMRs in these cases and use the Firmware slow-path flow to create mkeys and populate them with Virtual to Physical address translation. Older drivers that do not have this patch will end up using memory keys that aren't populated with Virtual to Physical address translation that is done as part of the UMR work.
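For illustration, the policy introduced here boils down to the following minimal sketch; it reuses the capability macro and cache helper that appear in the diff below, while the wrapper name itself is hypothetical:

static bool can_use_umr_fast_path(struct mlx5_ib_dev *dev, int order)
{
	/* FW may forbid VFs from modifying the mkey entity size via
	 * UMR; registration then falls back to the FW slow path.
	 */
	return order <= mr_cache_max_order(dev) &&
	       !MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled);
}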
Reviewed-by: Mark Bloch Signed-off-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Tested-by: Laurence Oberman Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mr.c | 35 ++++++++++++++++++++++++++++----- drivers/infiniband/hw/mlx5/qp.c | 21 +++++++++++++++++--- include/linux/mlx5/mlx5_ifc.h | 6 +++++- 3 files changed, 53 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index bcf5e22cf743..60683090d138 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -51,6 +51,21 @@ static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); static int mr_cache_max_order(struct mlx5_ib_dev *dev); static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); +static bool umr_can_modify_entity_size(struct mlx5_ib_dev *dev) +{ + return !MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled); +} + +static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev) +{ + return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled); +} + +static bool use_umr(struct mlx5_ib_dev *dev, int order) +{ + return order <= mr_cache_max_order(dev) && + umr_can_modify_entity_size(dev); +} static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { @@ -956,7 +971,10 @@ static inline int populate_xlt(struct mlx5_ib_mr *mr, int idx, int npages, { struct mlx5_ib_dev *dev = mr->dev; struct ib_umem *umem = mr->umem; + if (flags & MLX5_IB_UPD_XLT_INDIRECT) { + if (!umr_can_use_indirect_mkey(dev)) + return -EPERM; mlx5_odp_populate_klm(xlt, idx, npages, mr, flags); return npages; } @@ -1003,6 +1021,10 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, gfp_t gfp; bool use_emergency_page = false; + if ((flags & MLX5_IB_UPD_XLT_INDIRECT) && + !umr_can_use_indirect_mkey(dev)) + return -EPERM; + /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes, * so we need to align the offset and length accordingly */ @@ -1211,13 +1233,13 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_mr *mr = NULL; + bool populate_mtts = false; struct ib_umem *umem; int page_shift; int npages; int ncont; int order; int err; - bool use_umr = true; if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) return ERR_PTR(-EOPNOTSUPP); @@ -1244,26 +1266,29 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (err < 0) return ERR_PTR(err); - if (order <= mr_cache_max_order(dev)) { + if (use_umr(dev, order)) { mr = alloc_mr_from_cache(pd, umem, virt_addr, length, ncont, page_shift, order, access_flags); if (PTR_ERR(mr) == -EAGAIN) { mlx5_ib_dbg(dev, "cache empty for order %d\n", order); mr = NULL; } + populate_mtts = false; } else if (!MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) { if (access_flags & IB_ACCESS_ON_DEMAND) { err = -EINVAL; pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB\n"); goto error; } - use_umr = false; + populate_mtts = true; } if (!mr) { + if (!umr_can_modify_entity_size(dev)) + populate_mtts = true; mutex_lock(&dev->slow_path_mutex); mr = reg_create(NULL, pd, virt_addr, length, umem, ncont, - page_shift, access_flags, !use_umr); + page_shift, access_flags, populate_mtts); mutex_unlock(&dev->slow_path_mutex); } @@ -1281,7 +1306,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, update_odp_mr(mr); #endif - if (use_umr) { + if (!populate_mtts) 
{ int update_xlt_flags = MLX5_IB_UPD_XLT_ENABLE; if (access_flags & IB_ACCESS_ON_DEMAND) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 2fb3d9a400d3..c152c6f35101 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3697,8 +3697,19 @@ static __be64 get_umr_update_pd_mask(void) return cpu_to_be64(result); } -static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, - struct ib_send_wr *wr, int atomic) +static int umr_check_mkey_mask(struct mlx5_ib_dev *dev, u64 mask) +{ + if ((mask & MLX5_MKEY_MASK_PAGE_SIZE && + MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled)) || + (mask & MLX5_MKEY_MASK_A && + MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))) + return -EPERM; + return 0; +} + +static int set_reg_umr_segment(struct mlx5_ib_dev *dev, + struct mlx5_wqe_umr_ctrl_seg *umr, + struct ib_send_wr *wr, int atomic) { struct mlx5_umr_wr *umrwr = umr_wr(wr); @@ -3730,6 +3741,8 @@ static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, if (!wr->num_sge) umr->flags |= MLX5_UMR_INLINE; + + return umr_check_mkey_mask(dev, be64_to_cpu(umr->mkey_mask)); } static u8 get_umr_flags(int acc) @@ -4552,7 +4565,9 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, } qp->sq.wr_data[idx] = MLX5_IB_WR_UMR; ctrl->imm = cpu_to_be32(umr_wr(wr)->mkey); - set_reg_umr_segment(seg, wr, !!(MLX5_CAP_GEN(mdev, atomic))); + err = set_reg_umr_segment(dev, seg, wr, !!(MLX5_CAP_GEN(mdev, atomic))); + if (unlikely(err)) + goto out; seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; if (unlikely((seg == qend))) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index c63bbdc35503..64963fd2cd9b 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -916,7 +916,11 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_202[0x1]; u8 ipoib_enhanced_offloads[0x1]; u8 ipoib_basic_offloads[0x1]; - u8 reserved_at_205[0x5]; + u8 reserved_at_205[0x1]; + u8 repeated_block_disabled[0x1]; + u8 umr_modify_entity_size_disabled[0x1]; + u8 umr_modify_atomic_disabled[0x1]; + u8 umr_indirect_mkey_disabled[0x1]; u8 umr_fence[0x2]; u8 reserved_at_20c[0x3]; u8 drain_sigerr[0x1]; From e945130b52bea65d15f9bdf54949d4cb7a88db7f Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 27 Mar 2018 15:51:05 +0300 Subject: [PATCH 145/199] IB/core: Protect against concurrent access to hardware stats Currently, access to the hardware stats buffer isn't protected; this can result in multiple writes and reads to the same memory location at the same time, which can lead to providing an incorrect value to the user. Add a mutex to protect against it.
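To see why the mutex matters: on a 32-bit kernel a 64-bit counter store compiles to two 32-bit stores, so an unsynchronized reader can observe a half-written (torn) value. A minimal sketch of the pattern the diff below applies, where read_hw_counter() is a hypothetical refresh helper:

	mutex_lock(&stats->lock);
	stats->value[i] = read_hw_counter(dev, i);	/* hypothetical refresh */
	ret = sprintf(buf, "%llu\n", (unsigned long long)stats->value[i]);
	mutex_unlock(&stats->lock);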
Fixes: b40f4757daa1 ("IB/core: Make device counter infrastructure dynamic") Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/sysfs.c | 34 +++++++++++++++++++++++++++------ include/rdma/ib_verbs.h | 4 ++++ 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index cf36ff1f0068..9b0fbab41dc6 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -811,10 +811,15 @@ static ssize_t show_hw_stats(struct kobject *kobj, struct attribute *attr, dev = port->ibdev; stats = port->hw_stats; } + mutex_lock(&stats->lock); ret = update_hw_stats(dev, stats, hsa->port_num, hsa->index); if (ret) - return ret; - return print_hw_stat(stats, hsa->index, buf); + goto unlock; + ret = print_hw_stat(stats, hsa->index, buf); +unlock: + mutex_unlock(&stats->lock); + + return ret; } static ssize_t show_stats_lifespan(struct kobject *kobj, @@ -822,17 +827,25 @@ static ssize_t show_stats_lifespan(struct kobject *kobj, char *buf) { struct hw_stats_attribute *hsa; + struct rdma_hw_stats *stats; int msecs; hsa = container_of(attr, struct hw_stats_attribute, attr); if (!hsa->port_num) { struct ib_device *dev = container_of((struct device *)kobj, struct ib_device, dev); - msecs = jiffies_to_msecs(dev->hw_stats->lifespan); + + stats = dev->hw_stats; } else { struct ib_port *p = container_of(kobj, struct ib_port, kobj); - msecs = jiffies_to_msecs(p->hw_stats->lifespan); + + stats = p->hw_stats; } + + mutex_lock(&stats->lock); + msecs = jiffies_to_msecs(stats->lifespan); + mutex_unlock(&stats->lock); + return sprintf(buf, "%d\n", msecs); } @@ -841,6 +854,7 @@ static ssize_t set_stats_lifespan(struct kobject *kobj, const char *buf, size_t count) { struct hw_stats_attribute *hsa; + struct rdma_hw_stats *stats; int msecs; int jiffies; int ret; @@ -855,11 +869,18 @@ static ssize_t set_stats_lifespan(struct kobject *kobj, if (!hsa->port_num) { struct ib_device *dev = container_of((struct device *)kobj, struct ib_device, dev); - dev->hw_stats->lifespan = jiffies; + + stats = dev->hw_stats; } else { struct ib_port *p = container_of(kobj, struct ib_port, kobj); - p->hw_stats->lifespan = jiffies; + + stats = p->hw_stats; } + + mutex_lock(&stats->lock); + stats->lifespan = jiffies; + mutex_unlock(&stats->lock); + return count; } @@ -952,6 +973,7 @@ static void setup_hw_stats(struct ib_device *device, struct ib_port *port, sysfs_attr_init(hsag->attrs[i]); } + mutex_init(&stats->lock); /* treat an error here as non-fatal */ hsag->attrs[i] = alloc_hsa_lifespan("lifespan", port_num); if (hsag->attrs[i]) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index e9288d0f627e..48f416fabe0c 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -470,6 +470,9 @@ enum ib_port_speed { /** * struct rdma_hw_stats + * @lock - Mutex to protect parallel write access to lifespan and values + * of counters, which are 64 bits and not guaranteed to be written + * atomically on 32-bit systems. * @timestamp - Used by the core code to track when the last update was * @lifespan - Used by the core code to determine how old the counters * should be before being updated again.
Stored in jiffies, defaults @@ -485,6 +488,7 @@ enum ib_port_speed { * filled in by the drivers get_stats routine */ struct rdma_hw_stats { + struct mutex lock; /* Protect lifespan and values[] */ unsigned long timestamp; unsigned long lifespan; const char * const *names; From 97c45c2c28cd291e06778d9d36a0f60ee74726bc Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 25 Mar 2018 13:40:19 +0300 Subject: [PATCH 146/199] IB/cm: Block processing alternate path handling RoCE Rx cm messages For the reasons below, it is better not to support alternate path receive messages for RoCE in the near term. 1. Alternate path for RoCE is not supported at the rdmacm layer. 2. It is not supported in the uverbs/core layer for RoCE. 3. Alternate path for an IPv6 link-local address cannot resolve the route deterministically without a valid incoming interface id, a use case that makes sense only in dual port mode. 4. init_av_from_path, while processing LAP messages for IB and RoCE, can add a duplicate AV entry to the port list, which leads to list corruption. 5. rdma-core, a well-known userspace implementation, has removed support for libucm, which used the ucm.ko module, the only module that can trigger alternate path related messages. 6. Removal of the ucm kernel module from the IB core has been requested in patch [1]. [1] https://patchwork.kernel.org/patch/10268503/ Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 38d79bc1bf78..a92e1a5c202b 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -3175,6 +3175,13 @@ static int cm_lap_handler(struct cm_work *work) struct ib_mad_send_buf *msg = NULL; int ret; + /* Currently Alternate path messages are not supported for + * RoCE link layer. + */ + if (rdma_protocol_roce(work->port->cm_dev->ib_device, + work->port->port_num)) + return -EINVAL; + /* todo: verify LAP request and send reject APR if invalid. */ lap_msg = (struct cm_lap_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id(lap_msg->remote_comm_id, @@ -3324,6 +3331,13 @@ static int cm_apr_handler(struct cm_work *work) struct cm_apr_msg *apr_msg; int ret; + /* Currently Alternate path messages are not supported for + * RoCE link layer. + */ + if (rdma_protocol_roce(work->port->cm_dev->ib_device, + work->port->port_num)) + return -EINVAL; + apr_msg = (struct cm_apr_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id(apr_msg->remote_comm_id, apr_msg->local_comm_id); From 3401857ea347e86a51adb844c5e9207dcdc0139b Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 25 Mar 2018 13:40:20 +0300 Subject: [PATCH 147/199] IB/core: Generate GID change event regardless of RoCE GID table property For the following reasons, the GID change event is generated regardless of the GID table property. 1. The GID table cache is maintained at the ib core layer regardless of link layer. 2. The GID change event has no relation to the IB link layer. 3. The GID change event also doesn't depend on whether the HCA supports a GID table.
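For reference, kernel consumers observe this event through the standard verbs event API; a minimal sketch (the callback body and registration context are illustrative only):

static void gid_event_cb(struct ib_event_handler *h, struct ib_event *ev)
{
	if (ev->event == IB_EVENT_GID_CHANGE)
		pr_info("GID table changed on port %u\n",
			ev->element.port_num);
}

	/* at init time, e.g. in an ib_client add() callback: */
	INIT_IB_EVENT_HANDLER(&handler, device, gid_event_cb);
	ib_register_event_handler(&handler);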
Fixes: f3906bd36087 ("IB/core: Refactor GID cache's ib_dispatch_event") Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cache.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 5b9416af825b..4f7704342410 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -116,15 +116,13 @@ struct ib_gid_table { static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port) { - if (rdma_cap_roce_gid_table(ib_dev, port)) { - struct ib_event event; + struct ib_event event; - event.device = ib_dev; - event.element.port_num = port; - event.event = IB_EVENT_GID_CHANGE; + event.device = ib_dev; + event.element.port_num = port; + event.event = IB_EVENT_GID_CHANGE; - ib_dispatch_event(&event); - } + ib_dispatch_event(&event); } static const char * const gid_type_str[] = { From 4ab7cb4bf362dc3b85fd8ddac3b16949404be96b Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 25 Mar 2018 13:40:21 +0300 Subject: [PATCH 148/199] IB/core: Refer to RoCE port property instead of GID table property ib_find_gid_by_filter() searches for a GID with a filter only on RoCE link layers, regardless of the HCA's support for a GID table. Therefore, the right way to perform the lookup is to check the RoCE port property, not the GID table property. Fixes: 99b27e3b5da0 ("IB/cache: Add ib_find_gid_by_filter cache API") Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 4f7704342410..b94f8d1b262f 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -893,7 +893,7 @@ int ib_find_gid_by_filter(struct ib_device *device, void *context, u16 *index) { /* Only RoCE GID table supports filter function */ - if (!rdma_cap_roce_gid_table(device, port_num) && filter) + if (!rdma_protocol_roce(device, port_num) && filter) return -EPROTONOSUPPORT; return ib_cache_gid_find_by_filter(device, gid, From 22d24f75a19399bd8c4822541c60e853a16a1956 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 25 Mar 2018 13:40:22 +0300 Subject: [PATCH 149/199] IB/core: Search GID only for IB link layer Even though this API is only used by the IPoIB driver, it is incorrect to refer to the RoCE GID table property when searching for a GID. Search for the GID only on ports with an IB link layer.
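These fixes all rest on one distinction: rdma_protocol_*() describes which link protocol a port runs, while rdma_cap_roce_gid_table() only says whether the HCA manages its own RoCE GID table. A hedged sketch of the rule (the helper name is hypothetical):

static bool port_matches_lookup(struct ib_device *dev, u8 port, bool want_ib)
{
	/* Key GID lookups off the link protocol, not off whether the
	 * HCA happens to manage its own RoCE GID table.
	 */
	return want_ib ? rdma_protocol_ib(dev, port)
		       : rdma_protocol_roce(dev, port);
}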
Fixes: dbb12562f7c2 ("IB/{core, ipoib}: Simplify ib_find_gid to search only for IB link layer") Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index ba0e34b09648..dbe984faed65 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1061,7 +1061,7 @@ int ib_find_gid(struct ib_device *device, union ib_gid *gid, int ret, port, i; for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) { - if (rdma_cap_roce_gid_table(device, port)) + if (!rdma_protocol_ib(device, port)) continue; for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) { From 190fb9c4d130f23b64d73e9921afb9a502340455 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 25 Mar 2018 13:40:23 +0300 Subject: [PATCH 150/199] IB/core: Refer to RoCE port property to decide building cache IB core maintains the GID cache entries for the GID table. This cache table has to be maintained regardless of the HCA's support for a GID table. For IB and iWarp ports, the cache is created by querying the HCA. For RoCE, the cache is created based on netdev events. Therefore, just refer to the RoCE port property of the {device, port} to decide whether to build the cache by querying the HCA or from netdev events. There is no need to check whether the HCA supports a GID table. ib_cache_update() referred to the RoCE attribute before validating the port. Though the port is valid in all current callers, it is incorrect to query the RoCE port property before validating the port. Therefore, the rdma_protocol_roce() check is done after rdma_is_port_valid() verifies that the port is valid. Fixes: 115b68aa6ea4 ("IB/ocrdma: Removed GID add/del null routines") Reviewed-by: Daniel Jurgens Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cache.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index b94f8d1b262f..552f3c8dc246 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -1070,12 +1070,12 @@ static void ib_cache_update(struct ib_device *device, int i; int ret; struct ib_gid_table *table; - bool use_roce_gid_table = - rdma_cap_roce_gid_table(device, port); + bool use_roce_gid_table; if (!rdma_is_port_valid(device, port)) return; + use_roce_gid_table = rdma_protocol_roce(device, port); table = device->cache.ports[port - rdma_start_port(device)].gid; tprops = kmalloc(sizeof *tprops, GFP_KERNEL); From 1b90d3002e3ee39b22de5604497b20edeeee558e Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Tue, 27 Mar 2018 10:34:38 -0700 Subject: [PATCH 151/199] RDMA/CMA: remove RDMA_PS_SDP This is no longer supported, so remove it.
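For context on these port space values (relevant to this patch and the next one): a port space is encoded in bytes 4 and 5 of an IB service ID, as rdma_ps_from_service_id() in the following patch shows. An illustrative extraction, using a made-up service ID:

	__be64 service_id = cpu_to_be64(RDMA_IB_IP_PS_TCP | 4791);	/* sample value */
	u16 ps = (be64_to_cpu(service_id) >> 16) & 0xffff;	/* 0x0106 == RDMA_PS_TCP */
	u16 port = be64_to_cpu(service_id) & 0xffff;		/* 4791 */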
Signed-off-by: Steve Wise Signed-off-by: Jason Gunthorpe --- include/rdma/rdma_cm.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 62caae818173..7652efc35eb9 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -65,7 +65,6 @@ enum rdma_cm_event_type { const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event); enum rdma_port_space { - RDMA_PS_SDP = 0x0001, RDMA_PS_IPOIB = 0x0002, RDMA_PS_IB = 0x013F, RDMA_PS_TCP = 0x0106, From 2253fc0caa800ba7c1e380446eb3fb7958a85b93 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Tue, 27 Mar 2018 08:38:07 -0700 Subject: [PATCH 152/199] RDMA/CMA: Add rdma_port_space to UAPI Since the rdma_port_space enum is being passed between user and kernel for user cm_id setup, we need it in a UAPI header. So add it to rdma_user_cm.h. This also fixes the cm_id restrack changes which pass up the port space value via the RDMA_NLDEV_ATTR_RES_PS attribute. Fixes: 00313983cda6 ("RDMA/nldev: provide detailed CM_ID information") Signed-off-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 32 +++++++++++++++++--------------- include/rdma/rdma_cm.h | 17 ++++++----------- include/uapi/rdma/rdma_user_cm.h | 10 +++++++++- 3 files changed, 32 insertions(+), 27 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 8512f633efd6..b3574d4eeea9 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -175,7 +175,7 @@ static struct cma_pernet *cma_pernet(struct net *net) return net_generic(net, cma_pernet_id); } -static struct idr *cma_pernet_idr(struct net *net, enum rdma_port_space ps) +static struct idr *cma_pernet_idr(struct net *net, enum rdma_ucm_port_space ps) { struct cma_pernet *pernet = cma_pernet(net); @@ -204,7 +204,7 @@ struct cma_device { }; struct rdma_bind_list { - enum rdma_port_space ps; + enum rdma_ucm_port_space ps; struct hlist_head owners; unsigned short port; }; @@ -217,7 +217,7 @@ struct class_port_info_context { u8 port_num; }; -static int cma_ps_alloc(struct net *net, enum rdma_port_space ps, +static int cma_ps_alloc(struct net *net, enum rdma_ucm_port_space ps, struct rdma_bind_list *bind_list, int snum) { struct idr *idr = cma_pernet_idr(net, ps); @@ -226,14 +226,15 @@ static int cma_ps_alloc(struct net *net, enum rdma_port_space ps, } static struct rdma_bind_list *cma_ps_find(struct net *net, - enum rdma_port_space ps, int snum) + enum rdma_ucm_port_space ps, int snum) { struct idr *idr = cma_pernet_idr(net, ps); return idr_find(idr, snum); } -static void cma_ps_remove(struct net *net, enum rdma_port_space ps, int snum) +static void cma_ps_remove(struct net *net, enum rdma_ucm_port_space ps, + int snum) { struct idr *idr = cma_pernet_idr(net, ps); @@ -742,7 +743,7 @@ static void cma_deref_id(struct rdma_id_private *id_priv) struct rdma_cm_id *__rdma_create_id(struct net *net, rdma_cm_event_handler event_handler, - void *context, enum rdma_port_space ps, + void *context, enum rdma_ucm_port_space ps, enum ib_qp_type qp_type, const char *caller) { struct rdma_id_private *id_priv; @@ -1366,7 +1367,7 @@ static struct net_device *cma_get_net_dev(struct ib_cm_event *ib_event, return net_dev; } -static enum rdma_port_space rdma_ps_from_service_id(__be64 service_id) +static enum rdma_ucm_port_space rdma_ps_from_service_id(__be64 service_id) { return (be64_to_cpu(service_id) >> 16) & 0xffff; } @@ -2994,7 +2995,7 @@ static void cma_bind_port(struct rdma_bind_list *bind_list, 
hlist_add_head(&id_priv->node, &bind_list->owners); } -static int cma_alloc_port(enum rdma_port_space ps, +static int cma_alloc_port(enum rdma_ucm_port_space ps, struct rdma_id_private *id_priv, unsigned short snum) { struct rdma_bind_list *bind_list; @@ -3057,7 +3058,7 @@ static int cma_port_is_unique(struct rdma_bind_list *bind_list, return 0; } -static int cma_alloc_any_port(enum rdma_port_space ps, +static int cma_alloc_any_port(enum rdma_ucm_port_space ps, struct rdma_id_private *id_priv) { static unsigned int last_used_port; @@ -3135,7 +3136,7 @@ static int cma_check_port(struct rdma_bind_list *bind_list, return 0; } -static int cma_use_port(enum rdma_port_space ps, +static int cma_use_port(enum rdma_ucm_port_space ps, struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list; @@ -3169,8 +3170,8 @@ static int cma_bind_listen(struct rdma_id_private *id_priv) return ret; } -static enum rdma_port_space cma_select_inet_ps( - struct rdma_id_private *id_priv) +static enum rdma_ucm_port_space +cma_select_inet_ps(struct rdma_id_private *id_priv) { switch (id_priv->id.ps) { case RDMA_PS_TCP: @@ -3184,9 +3185,10 @@ static enum rdma_port_space cma_select_inet_ps( } } -static enum rdma_port_space cma_select_ib_ps(struct rdma_id_private *id_priv) +static enum rdma_ucm_port_space +cma_select_ib_ps(struct rdma_id_private *id_priv) { - enum rdma_port_space ps = 0; + enum rdma_ucm_port_space ps = 0; struct sockaddr_ib *sib; u64 sid_ps, mask, sid; @@ -3217,7 +3219,7 @@ static enum rdma_port_space cma_select_ib_ps(struct rdma_id_private *id_priv) static int cma_get_port(struct rdma_id_private *id_priv) { - enum rdma_port_space ps; + enum rdma_ucm_port_space ps; int ret; if (cma_family(id_priv) != AF_IB) diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 7652efc35eb9..4480e636b934 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -38,6 +38,7 @@ #include #include #include +#include /* * Upon receiving a device removal event, users must destroy the associated @@ -64,13 +65,6 @@ enum rdma_cm_event_type { const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event); -enum rdma_port_space { - RDMA_PS_IPOIB = 0x0002, - RDMA_PS_IB = 0x013F, - RDMA_PS_TCP = 0x0106, - RDMA_PS_UDP = 0x0111, -}; - #define RDMA_IB_IP_PS_MASK 0xFFFFFFFFFFFF0000ULL #define RDMA_IB_IP_PS_TCP 0x0000000001060000ULL #define RDMA_IB_IP_PS_UDP 0x0000000001110000ULL @@ -151,15 +145,16 @@ struct rdma_cm_id { struct ib_qp *qp; rdma_cm_event_handler event_handler; struct rdma_route route; - enum rdma_port_space ps; + enum rdma_ucm_port_space ps; enum ib_qp_type qp_type; u8 port_num; }; struct rdma_cm_id *__rdma_create_id(struct net *net, - rdma_cm_event_handler event_handler, - void *context, enum rdma_port_space ps, - enum ib_qp_type qp_type, const char *caller); + rdma_cm_event_handler event_handler, + void *context, enum rdma_ucm_port_space ps, + enum ib_qp_type qp_type, + const char *caller); /** * rdma_create_id - Create an RDMA identifier. diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h index c4f28cb92214..e1269024af47 100644 --- a/include/uapi/rdma/rdma_user_cm.h +++ b/include/uapi/rdma/rdma_user_cm.h @@ -70,6 +70,14 @@ enum { RDMA_USER_CM_CMD_JOIN_MCAST }; +/* See IBTA Annex A11, service ID bytes 4 & 5 */ +enum rdma_ucm_port_space { + RDMA_PS_IPOIB = 0x0002, + RDMA_PS_IB = 0x013F, + RDMA_PS_TCP = 0x0106, + RDMA_PS_UDP = 0x0111, +}; /* * command ABI structures.
*/ @@ -82,7 +90,7 @@ struct rdma_ucm_cmd_hdr { struct rdma_ucm_create_id { __aligned_u64 uid; __aligned_u64 response; - __u16 ps; + __u16 ps; /* use enum rdma_ucm_port_space */ __u8 qp_type; __u8 reserved[5]; }; From a343e3f89e365a598ab4061fd2bc9ed5daf1905d Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 29 Mar 2018 13:11:07 +0100 Subject: [PATCH 153/199] qedr: Fix spelling mistake: "hanlde" -> "handle" Trivial fix to a spelling mistake in DP_ERR message text. Signed-off-by: Colin Ian King Reviewed-by: Shamir Rabinovitch Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qedr/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index f865c0991ad9..2274d12a4f75 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -708,7 +708,7 @@ static void qedr_affiliated_event(void *context, u8 e_code, void *fw_handle) "Error: CQ event with NULL pointer ibcq. Handle=%llx\n", roce_handle64); } - DP_ERR(dev, "CQ event %d on hanlde %p\n", e_code, cq); + DP_ERR(dev, "CQ event %d on handle %p\n", e_code, cq); break; case EVENT_TYPE_QP: qp = (struct qedr_qp *)(uintptr_t)roce_handle64; @@ -724,7 +724,7 @@ static void qedr_affiliated_event(void *context, u8 e_code, void *fw_handle) "Error: QP event with NULL pointer ibqp. Handle=%llx\n", roce_handle64); } - DP_ERR(dev, "QP event %d on hanlde %p\n", e_code, qp); + DP_ERR(dev, "QP event %d on handle %p\n", e_code, qp); break; default: break; From 99dae690255e90f5cbefcc76ad92b35cdf87d14d Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Wed, 21 Mar 2018 04:08:37 -0400 Subject: [PATCH 154/199] IB/rxe: optimize mcast recv process In the mcast recv process, the function skb_clone() is used. In fact, increasing the skb's refcount can replace cloning a new skb, since the original skb will not be modified before it is freed. This improves performance and saves memory. CC: Srinivas Eeda CC: Junxiao Bi Signed-off-by: Zhu Yanjun Reviewed-by: Yuval Shaia Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_recv.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c index 08ad9dc72205..dd80c7d9074a 100644 --- a/drivers/infiniband/sw/rxe/rxe_recv.c +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -276,7 +276,6 @@ static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb) { struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); struct rxe_mc_grp *mcg; - struct sk_buff *skb_copy; struct rxe_mc_elem *mce; struct rxe_qp *qp; union ib_gid dgid; @@ -309,18 +308,14 @@ static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb) continue; /* if *not* the last qp in the list - * make a copy of the skb to post to the next qp + * increase the users of the skb then post to the next qp */ - skb_copy = (mce->qp_list.next != &mcg->qp_list) ? - skb_clone(skb, GFP_ATOMIC) : NULL; + if (mce->qp_list.next != &mcg->qp_list) + refcount_inc(&skb->users); pkt->qp = qp; rxe_add_ref(qp); rxe_rcv_pkt(rxe, pkt, skb); - - skb = skb_copy; - if (!skb) - break; } spin_unlock_bh(&mcg->mcg_lock); @@ -328,8 +323,7 @@ static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb) rxe_drop_ref(mcg); /* drop ref from rxe_pool_get_key.
*/ err1: - if (skb) - kfree_skb(skb); + kfree_skb(skb); } static int rxe_match_dgid(struct rxe_dev *rxe, struct sk_buff *skb) From 5b2cc79de8782ea98ef22cddb26fcd566c565094 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 27 Mar 2018 20:40:49 +0300 Subject: [PATCH 155/199] RDMA/nldev: Provide netdevice name and index Export the net device name and index to make it easy to find the connection between IB devices and the relevant net devices. We also update the comment regarding devices without FW. Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/nldev.c | 31 ++++++++++++++++++++++++++----- include/uapi/rdma/rdma_netlink.h | 13 +++++++++++++ 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 192084c78352..eb567765f45c 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -95,6 +95,9 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_RES_PD_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_NDEV_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_NDEV_NAME] = { .type = NLA_NUL_STRING, + .len = IFNAMSIZ }, }; static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device) @@ -123,7 +126,7 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) return -EMSGSIZE; ib_get_device_fw_str(device, fw); - /* Device without FW has strlen(fw) */ + /* Device without FW has strlen(fw) = 0 */ if (strlen(fw) && nla_put_string(msg, RDMA_NLDEV_ATTR_FW_VERSION, fw)) return -EMSGSIZE; @@ -139,8 +142,10 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) } static int fill_port_info(struct sk_buff *msg, - struct ib_device *device, u32 port) + struct ib_device *device, u32 port, + const struct net *net) { + struct net_device *netdev = NULL; struct ib_port_attr attr; int ret; @@ -174,7 +179,23 @@ static int fill_port_info(struct sk_buff *msg, return -EMSGSIZE; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_PHYS_STATE, attr.phys_state)) return -EMSGSIZE; - return 0; + + if (device->get_netdev) + netdev = device->get_netdev(device, port); + + if (netdev && net_eq(dev_net(netdev), net)) { + ret = nla_put_u32(msg, + RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex); + if (ret) + goto out; + ret = nla_put_string(msg, + RDMA_NLDEV_ATTR_NDEV_NAME, netdev->name); + } + +out: + if (netdev) + dev_put(netdev); + return ret; } static int fill_res_info_entry(struct sk_buff *msg, @@ -603,7 +624,7 @@ static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), 0, 0); - err = fill_port_info(msg, device, port); + err = fill_port_info(msg, device, port, sock_net(skb->sk)); if (err) goto err_free; @@ -663,7 +684,7 @@ static int nldev_port_get_dumpit(struct sk_buff *skb, RDMA_NLDEV_CMD_PORT_GET), 0, NLM_F_MULTI); - if (fill_port_info(skb, device, p)) { + if (fill_port_info(skb, device, p, sock_net(skb->sk))) { nlmsg_cancel(skb, nlh); goto out; } diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 351139c7e2e7..0ce0943fc808 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -388,6 +388,19 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY, /* u32 */ RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY, /* u32 */ + /* * Provides
logical name and index of the netdevice which is + * connected to the physical port. This information is relevant + * for RoCE and iWARP. + * + * The netdevices which are associated with containers are + * supposed to be exported together with the GID table once + * it is exposed through netlink, because the associated + * netdevices are properties of GIDs. + */ + RDMA_NLDEV_ATTR_NDEV_INDEX, /* u32 */ + RDMA_NLDEV_ATTR_NDEV_NAME, /* string */ + RDMA_NLDEV_ATTR_MAX }; #endif /* _UAPI_RDMA_NETLINK_H */ From 6f57c933a49afb30d987f0ba7db4f52452f7d638 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 27 Mar 2018 14:18:47 -0600 Subject: [PATCH 156/199] RDMA: Use u64_to_user_ptr everywhere This is already used in many places; convert the rest of them too, to make the code a bit clearer and simpler. Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/ucm.c | 18 +++++++++--------- drivers/infiniband/core/ucma.c | 20 ++++++++++---------- drivers/infiniband/core/uverbs_cmd.c | 2 +- drivers/infiniband/hw/hfi1/user_exp_rcv.c | 4 ++-- drivers/infiniband/hw/qib/qib_diag.c | 2 +- drivers/infiniband/hw/qib/qib_file_ops.c | 8 ++++---- 6 files changed, 27 insertions(+), 27 deletions(-) diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index 01702265c1e1..9eef96dacbd7 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -430,7 +430,7 @@ static ssize_t ib_ucm_event(struct ib_ucm_file *file, uevent->resp.id = ctx->id; } - if (copy_to_user((void __user *)(unsigned long)cmd.response, + if (copy_to_user(u64_to_user_ptr(cmd.response), &uevent->resp, sizeof(uevent->resp))) { result = -EFAULT; goto done; @@ -441,7 +441,7 @@ static ssize_t ib_ucm_event(struct ib_ucm_file *file, result = -ENOMEM; goto done; } - if (copy_to_user((void __user *)(unsigned long)cmd.data, + if (copy_to_user(u64_to_user_ptr(cmd.data), uevent->data, uevent->data_len)) { result = -EFAULT; goto done; @@ -453,7 +453,7 @@ static ssize_t ib_ucm_event(struct ib_ucm_file *file, result = -ENOMEM; goto done; } - if (copy_to_user((void __user *)(unsigned long)cmd.info, + if (copy_to_user(u64_to_user_ptr(cmd.info), uevent->info, uevent->info_len)) { result = -EFAULT; goto done; @@ -502,7 +502,7 @@ static ssize_t ib_ucm_create_id(struct ib_ucm_file *file, } resp.id = ctx->id; - if (copy_to_user((void __user *)(unsigned long)cmd.response, + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) { result = -EFAULT; goto err2; @@ -556,7 +556,7 @@ static ssize_t ib_ucm_destroy_id(struct ib_ucm_file *file, ib_ucm_cleanup_events(ctx); resp.events_reported = ctx->events_reported; - if (copy_to_user((void __user *)(unsigned long)cmd.response, + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) result = -EFAULT; @@ -588,7 +588,7 @@ static ssize_t ib_ucm_attr_id(struct ib_ucm_file *file, resp.local_id = ctx->cm_id->local_id; resp.remote_id = ctx->cm_id->remote_id; - if (copy_to_user((void __user *)(unsigned long)cmd.response, + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) result = -EFAULT; @@ -625,7 +625,7 @@ static ssize_t ib_ucm_init_qp_attr(struct ib_ucm_file *file, ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr); - if (copy_to_user((void __user *)(unsigned long)cmd.response, + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) result = -EFAULT; @@ -699,7 +699,7 @@ static int ib_ucm_alloc_data(const void **dest, u64 src, u32 len) if (!len) return 0; - data = memdup_user((void __user *)(unsigned long)src,
len); + data = memdup_user(u64_to_user_ptr(src), len); if (IS_ERR(data)) return PTR_ERR(data); @@ -721,7 +721,7 @@ static int ib_ucm_path_get(struct sa_path_rec **path, u64 src) if (!sa_path) return -ENOMEM; - if (copy_from_user(&upath, (void __user *)(unsigned long)src, + if (copy_from_user(&upath, u64_to_user_ptr(src), sizeof(upath))) { kfree(sa_path); diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index db4190b2ed27..2abb70c26559 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -420,7 +420,7 @@ static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf, uevent->resp.id = ctx->id; } - if (copy_to_user((void __user *)(unsigned long)cmd.response, + if (copy_to_user(u64_to_user_ptr(cmd.response), &uevent->resp, min_t(size_t, out_len, sizeof(uevent->resp)))) { ret = -EFAULT; @@ -489,7 +489,7 @@ static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf, } resp.id = ctx->id; - if (copy_to_user((void __user *)(unsigned long)cmd.response, + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) { ret = -EFAULT; goto err2; @@ -614,7 +614,7 @@ static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf, } resp.events_reported = ucma_free_ctx(ctx); - if (copy_to_user((void __user *)(unsigned long)cmd.response, + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) ret = -EFAULT; @@ -840,7 +840,7 @@ static ssize_t ucma_query_route(struct ucma_file *file, ucma_copy_iw_route(&resp, &ctx->cm_id->route); out: - if (copy_to_user((void __user *)(unsigned long)cmd.response, + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) ret = -EFAULT; @@ -986,7 +986,7 @@ static ssize_t ucma_query(struct ucma_file *file, if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; - response = (void __user *)(unsigned long) cmd.response; + response = u64_to_user_ptr(cmd.response); ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); @@ -1169,7 +1169,7 @@ static ssize_t ucma_init_qp_attr(struct ucma_file *file, goto out; ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr); - if (copy_to_user((void __user *)(unsigned long)cmd.response, + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) ret = -EFAULT; @@ -1305,7 +1305,7 @@ static ssize_t ucma_set_option(struct ucma_file *file, const char __user *inbuf, if (unlikely(cmd.optval > KMALLOC_MAX_SIZE)) return -EINVAL; - optval = memdup_user((void __user *) (unsigned long) cmd.optval, + optval = memdup_user(u64_to_user_ptr(cmd.optval), cmd.optlen); if (IS_ERR(optval)) { ret = PTR_ERR(optval); @@ -1383,7 +1383,7 @@ static ssize_t ucma_process_join(struct ucma_file *file, goto err2; resp.id = mc->id; - if (copy_to_user((void __user *)(unsigned long) cmd->response, + if (copy_to_user(u64_to_user_ptr(cmd->response), &resp, sizeof(resp))) { ret = -EFAULT; goto err3; @@ -1488,7 +1488,7 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file, resp.events_reported = mc->events_reported; kfree(mc); - if (copy_to_user((void __user *)(unsigned long)cmd.response, + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) ret = -EFAULT; out: @@ -1575,7 +1575,7 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file, ucma_unlock_files(cur_file, new_file); response: - if (copy_to_user((void __user *)(unsigned long)cmd.response, + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) ret = -EFAULT; diff --git 
a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index bb29146c3823..536d78baacd3 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -3581,7 +3581,7 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
 	if (cmd->srq_type == IB_SRQT_XRC)
 		resp.srqn = srq->ext.xrc.srq_num;

-	if (copy_to_user((void __user *) (unsigned long) cmd->response,
+	if (copy_to_user(u64_to_user_ptr(cmd->response),
 			 &resp, sizeof resp)) {
 		ret = -EFAULT;
 		goto err_copy;
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
index c1c596adcd01..0d5330b7353d 100644
--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
@@ -473,7 +473,7 @@ nomem:
 	tinfo->tidcnt = tididx;
 	tinfo->length = mapped_pages * PAGE_SIZE;

-	if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist,
+	if (copy_to_user(u64_to_user_ptr(tinfo->tidlist),
 			 tidlist, sizeof(tidlist[0]) * tididx)) {
 		/*
 		 * On failure to copy to the user level, we need to undo
@@ -513,7 +513,7 @@ int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
 	if (unlikely(tinfo->tidcnt > fd->tid_used))
 		return -EINVAL;

-	tidinfo = memdup_user((void __user *)(unsigned long)tinfo->tidlist,
+	tidinfo = memdup_user(u64_to_user_ptr(tinfo->tidlist),
 			      sizeof(tidinfo[0]) * tinfo->tidcnt);
 	if (IS_ERR(tidinfo))
 		return PTR_ERR(tidinfo);
diff --git a/drivers/infiniband/hw/qib/qib_diag.c b/drivers/infiniband/hw/qib/qib_diag.c
index a9377eee8734..11da796dd1b7 100644
--- a/drivers/infiniband/hw/qib/qib_diag.c
+++ b/drivers/infiniband/hw/qib/qib_diag.c
@@ -614,7 +614,7 @@ static ssize_t qib_diagpkt_write(struct file *fp,
 	}

 	if (copy_from_user(tmpbuf,
-			   (const void __user *) (unsigned long) dp.data,
+			   u64_to_user_ptr(dp.data),
 			   dp.len)) {
 		ret = -EFAULT;
 		goto bail;
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
index 52c29db3a2f4..6a8800b65047 100644
--- a/drivers/infiniband/hw/qib/qib_file_ops.c
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -443,7 +443,7 @@ cleanup:
 			ret = -EFAULT;
 			goto cleanup;
 		}
-		if (copy_to_user((void __user *) (unsigned long) ti->tidmap,
+		if (copy_to_user(u64_to_user_ptr(ti->tidmap),
 				 tidmap, sizeof(tidmap))) {
 			ret = -EFAULT;
 			goto cleanup;
@@ -490,7 +490,7 @@ static int qib_tid_free(struct qib_ctxtdata *rcd, unsigned subctxt,
 		goto done;
 	}

-	if (copy_from_user(tidmap, (void __user *)(unsigned long)ti->tidmap,
+	if (copy_from_user(tidmap, u64_to_user_ptr(ti->tidmap),
 			   sizeof(tidmap))) {
 		ret = -EFAULT;
 		goto done;
@@ -2168,8 +2168,8 @@ static ssize_t qib_write(struct file *fp, const char __user *data,
 		ret = qib_do_user_init(fp, &cmd.cmd.user_info);
 		if (ret)
 			goto bail;
-		ret = qib_get_base_info(fp, (void __user *) (unsigned long)
-					cmd.cmd.user_info.spu_base_info,
+		ret = qib_get_base_info(fp, u64_to_user_ptr(
+					cmd.cmd.user_info.spu_base_info),
 					cmd.cmd.user_info.spu_base_info_size);
 		break;

From fd59015d68ee5bb0397a13cc02dbce9525f7c593 Mon Sep 17 00:00:00 2001
From: Parav Pandit
Date: Thu, 29 Mar 2018 13:26:32 +0300
Subject: [PATCH 157/199] IB/addr: Constify dst_entry pointer

Make the dst_entry pointer a const struct dst_entry * to improve code
readability and to make sure that the dst structure fields are not
modified by the various functions that use it.
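To illustrate what the constification buys, here is a minimal standalone
sketch (not from the kernel tree; the struct and field are simplified
stand-ins for struct dst_entry):

	/* Simplified stand-in for struct dst_entry. */
	struct example_dst {
		unsigned long obsolete;
	};

	/* Taking a const pointer documents and enforces read-only use:
	 * the compiler rejects any accidental store through 'dst'.
	 */
	static int example_inspect(const struct example_dst *dst)
	{
		/* Reads compile fine. */
		return dst->obsolete != 0;
		/* dst->obsolete = 0; would fail to compile with
		 * "assignment of member ... in read-only object".
		 */
	}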
Signed-off-by: Parav Pandit
Signed-off-by: Leon Romanovsky
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/core/addr.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index b0a52c996208..e314db5bcae3 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -313,7 +313,8 @@ static void queue_req(struct addr_req *req)
 	mutex_unlock(&lock);
 }

-static int ib_nl_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
+static int ib_nl_fetch_ha(const struct dst_entry *dst,
+			  struct rdma_dev_addr *dev_addr,
 			  const void *daddr, u32 seq, u16 family)
 {
 	if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS))
@@ -324,7 +325,8 @@ static int ib_nl_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
 	return ib_nl_ip_send_msg(dev_addr, daddr, seq, family);
 }

-static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
+static int dst_fetch_ha(const struct dst_entry *dst,
+			struct rdma_dev_addr *dev_addr,
 			const void *daddr)
 {
 	struct neighbour *n;
@@ -348,7 +350,7 @@ static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
 	return ret;
 }

-static bool has_gateway(struct dst_entry *dst, sa_family_t family)
+static bool has_gateway(const struct dst_entry *dst, sa_family_t family)
 {
 	struct rtable *rt;
 	struct rt6_info *rt6;
@@ -362,7 +364,7 @@ static bool has_gateway(struct dst_entry *dst, sa_family_t family)
 	return rt6->rt6i_flags & RTF_GATEWAY;
 }

-static int fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
+static int fetch_ha(const struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
 		    const struct sockaddr *dst_in, u32 seq)
 {
 	const struct sockaddr_in *dst_in4 =
@@ -466,7 +468,7 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
 }
 #endif

-static int addr_resolve_neigh(struct dst_entry *dst,
+static int addr_resolve_neigh(const struct dst_entry *dst,
 			      const struct sockaddr *dst_in,
 			      struct rdma_dev_addr *addr,
 			      u32 seq)

From 218b9e3eb8b53785a98dfa2e4b7c700103085d33 Mon Sep 17 00:00:00 2001
From: Parav Pandit
Date: Thu, 29 Mar 2018 13:26:33 +0300
Subject: [PATCH 158/199] RDMA/cma: Move rdma_cm_state to cma_priv.h

The rdma_cm_state enum is internal to the rdma_cm kernel module. It is
not required to expose the state enums to ULP modules, so let's keep
its scope limited to the rdma_cm module, in the cma_priv.h file.
Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma_priv.h | 14 ++++++++++++++ include/rdma/rdma_cm.h | 14 -------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h index 56f52b70c346..194cfe78c447 100644 --- a/drivers/infiniband/core/cma_priv.h +++ b/drivers/infiniband/core/cma_priv.h @@ -36,6 +36,20 @@ #ifndef _CMA_PRIV_H #define _CMA_PRIV_H +enum rdma_cm_state { + RDMA_CM_IDLE, + RDMA_CM_ADDR_QUERY, + RDMA_CM_ADDR_RESOLVED, + RDMA_CM_ROUTE_QUERY, + RDMA_CM_ROUTE_RESOLVED, + RDMA_CM_CONNECT, + RDMA_CM_DISCONNECT, + RDMA_CM_ADDR_BOUND, + RDMA_CM_LISTEN, + RDMA_CM_DEVICE_REMOVAL, + RDMA_CM_DESTROYING +}; + struct rdma_id_private { struct rdma_cm_id id; diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 4480e636b934..690934733ba7 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -113,20 +113,6 @@ struct rdma_cm_event { } param; }; -enum rdma_cm_state { - RDMA_CM_IDLE, - RDMA_CM_ADDR_QUERY, - RDMA_CM_ADDR_RESOLVED, - RDMA_CM_ROUTE_QUERY, - RDMA_CM_ROUTE_RESOLVED, - RDMA_CM_CONNECT, - RDMA_CM_DISCONNECT, - RDMA_CM_ADDR_BOUND, - RDMA_CM_LISTEN, - RDMA_CM_DEVICE_REMOVAL, - RDMA_CM_DESTROYING -}; - struct rdma_cm_id; /** From 44016b3466407ff2766aeccb0ef2adca677c5106 Mon Sep 17 00:00:00 2001 From: Bharat Potnuri Date: Thu, 29 Mar 2018 17:10:13 +0530 Subject: [PATCH 159/199] iw_cxgb4: print mapped ports correctly c4iw_ep_common structure holds the mapped addresses, so while printing them, use appropriate pointers. Fixes: bab572f1d ("iw_cxgb4: Guard against null cm_id in dump_ep/qp") Signed-off-by: Potnuri Bharat Teja Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/device.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index 7a9d0de89d6a..5ce3cd1884b5 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -220,14 +220,14 @@ static void set_ep_sin_addrs(struct c4iw_ep *ep, { struct iw_cm_id *id = ep->com.cm_id; - *lsin = (struct sockaddr_in *)&ep->com.local_addr; - *rsin = (struct sockaddr_in *)&ep->com.remote_addr; + *m_lsin = (struct sockaddr_in *)&ep->com.local_addr; + *m_rsin = (struct sockaddr_in *)&ep->com.remote_addr; if (id) { - *m_lsin = (struct sockaddr_in *)&id->m_local_addr; - *m_rsin = (struct sockaddr_in *)&id->m_remote_addr; + *lsin = (struct sockaddr_in *)&id->local_addr; + *rsin = (struct sockaddr_in *)&id->remote_addr; } else { - *m_lsin = &zero_sin; - *m_rsin = &zero_sin; + *lsin = &zero_sin; + *rsin = &zero_sin; } } @@ -239,14 +239,14 @@ static void set_ep_sin6_addrs(struct c4iw_ep *ep, { struct iw_cm_id *id = ep->com.cm_id; - *lsin6 = (struct sockaddr_in6 *)&ep->com.local_addr; - *rsin6 = (struct sockaddr_in6 *)&ep->com.remote_addr; + *m_lsin6 = (struct sockaddr_in6 *)&ep->com.local_addr; + *m_rsin6 = (struct sockaddr_in6 *)&ep->com.remote_addr; if (id) { - *m_lsin6 = (struct sockaddr_in6 *)&id->m_local_addr; - *m_rsin6 = (struct sockaddr_in6 *)&id->m_remote_addr; + *lsin6 = (struct sockaddr_in6 *)&id->local_addr; + *rsin6 = (struct sockaddr_in6 *)&id->remote_addr; } else { - *m_lsin6 = &zero_sin6; - *m_rsin6 = &zero_sin6; + *lsin6 = &zero_sin6; + *rsin6 = &zero_sin6; } } From 12ed56bad92265c4430712afd8fa37090dd7888a Mon Sep 17 00:00:00 2001 From: Yuval Shaia Date: Sun, 1 Apr 2018 09:22:18 +0300 Subject: 
[PATCH 160/199] IB/ipoib: Delete unused struct

This structure has not been needed since the introduction of commit
'c42687784b9a ("IB/ipoib: Scatter-Gather support in connected mode")'.

Signed-off-by: Yuval Shaia
Reviewed-by: Leon Romanovsky
Reviewed-by: Zhu Yanjun
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/ulp/ipoib/ipoib.h | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 8033a006277f..308e0ce49289 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -193,11 +193,6 @@ struct ipoib_tx_buf {
 	u64		mapping[MAX_SKB_FRAGS + 1];
 };

-struct ipoib_cm_tx_buf {
-	struct sk_buff *skb;
-	u64		mapping;
-};
-
 struct ib_cm_id;

 struct ipoib_cm_data {

From 689a8e31938d41ce75955a81936de4d62c2fc677 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva"
Date: Fri, 30 Mar 2018 10:38:58 -0500
Subject: [PATCH 161/199] IB/ocrdma_hw: Remove redundant checks and goto labels

The checks on the return value and the mbx_err goto labels are
unnecessary.

Addresses-Coverity-ID: 1271151 ("Identical code for different branches")
Addresses-Coverity-ID: 1268788 ("Identical code for different branches")
Signed-off-by: Gustavo A. R. Silva
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
index 9904918589a4..90cf77223771 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
@@ -2014,7 +2014,7 @@ static int ocrdma_mbx_reg_mr_cont(struct ocrdma_dev *dev,
 				  struct ocrdma_hw_mr *hwmr, u32 pbl_cnt,
 				  u32 pbl_offset, u32 last)
 {
-	int status = -ENOMEM;
+	int status;
 	int i;
 	struct ocrdma_reg_nsmr_cont *cmd;

@@ -2033,9 +2033,7 @@ static int ocrdma_mbx_reg_mr_cont(struct ocrdma_dev *dev,
 			upper_32_bits(hwmr->pbl_table[i + pbl_offset].pa);
 	}
 	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
-	if (status)
-		goto mbx_err;
-mbx_err:
+
 	kfree(cmd);
 	return status;
 }
@@ -3133,12 +3131,12 @@ done:
 static int ocrdma_mbx_modify_eqd(struct ocrdma_dev *dev, struct ocrdma_eq *eq,
 				 int num)
 {
-	int i, status = -ENOMEM;
+	int i, status;
 	struct ocrdma_modify_eqd_req *cmd;

 	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_MODIFY_EQ_DELAY, sizeof(*cmd));
 	if (!cmd)
-		return status;
+		return -ENOMEM;

 	ocrdma_init_mch(&cmd->cmd.req, OCRDMA_CMD_MODIFY_EQ_DELAY,
 			OCRDMA_SUBSYS_COMMON, sizeof(*cmd));
@@ -3151,9 +3149,7 @@ static int ocrdma_mbx_modify_eqd(struct ocrdma_dev *dev, struct ocrdma_eq *eq,
 			(eq[i].aic_obj.prev_eqd * 65)/100;
 	}
 	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
-	if (status)
-		goto mbx_err;
-mbx_err:
+
 	kfree(cmd);
 	return status;
 }

From 41d902cb7c326d711674977763c4b30df87611bc Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe
Date: Tue, 3 Apr 2018 10:00:53 +0300
Subject: [PATCH 162/199] RDMA/mlx5: Fix definition of mlx5_ib_create_qp_resp

This structure is pushed down both the ex and the non-ex path, so it
needs to be aligned to 8 bytes to go through ex without implicit
padding. Old user space will provide 4 bytes of resp on !ex and 8 bytes
on ex, so take the approach of just copying the minimum length. New
user space will consistently provide 8 bytes in both cases.
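The padding problem is easy to reproduce in plain user-space C. Here is
a hedged sketch (illustrative only; it mirrors the uapi struct but is
not the kernel header itself):

	#include <stdint.h>
	#include <stdio.h>

	/* Old layout: 4 bytes. Embedded in the extended command path,
	 * which moves data in 8-byte units, the trailing 4 bytes become
	 * implicit padding that kernel and user space may treat
	 * differently.
	 */
	struct resp_old {
		uint32_t bfreg_index;
	};

	/* New layout: explicitly padded to 8 bytes, so it is identical
	 * on the ex and non-ex paths.
	 */
	struct resp_new {
		uint32_t bfreg_index;
		uint32_t reserved;
	};

	int main(void)
	{
		/* Prints "old=4 new=8" on common ABIs. */
		printf("old=%zu new=%zu\n", sizeof(struct resp_old),
		       sizeof(struct resp_new));
		return 0;
	}

The kernel-side half of the fix is visible in the qp.c hunks that
follow: copying min(udata->outlen, sizeof(resp)) bytes means an old
binary that only supplies 4 bytes of response buffer on the non-ex path
keeps working.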
Signed-off-by: Leon Romanovsky
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/mlx5/qp.c | 4 ++--
 include/uapi/rdma/mlx5-abi.h    | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index c152c6f35101..c8f01f32ebb4 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -880,7 +880,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 		goto err_free;
 	}

-	err = ib_copy_to_udata(udata, resp, sizeof(*resp));
+	err = ib_copy_to_udata(udata, resp, min(udata->outlen, sizeof(*resp)));
 	if (err) {
 		mlx5_ib_dbg(dev, "copy failed\n");
 		goto err_unmap;
@@ -1468,7 +1468,7 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
 		return -EOPNOTSUPP;
 	}

-	err = ib_copy_to_udata(udata, &resp, min_resp_len);
+	err = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp)));
 	if (err) {
 		mlx5_ib_dbg(dev, "copy failed\n");
 		return -EINVAL;
diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h
index 09c50f390a3c..f7d18fb01771 100644
--- a/include/uapi/rdma/mlx5-abi.h
+++ b/include/uapi/rdma/mlx5-abi.h
@@ -335,6 +335,7 @@ struct mlx5_ib_create_qp_rss {

 struct mlx5_ib_create_qp_resp {
 	__u32	bfreg_index;
+	__u32	reserved;
 };

 struct mlx5_ib_alloc_mw {

From 3e64f8d6f514c31b6856bfb97737232dd4afcccb Mon Sep 17 00:00:00 2001
From: Shiraz Saleem
Date: Tue, 3 Apr 2018 10:32:28 -0500
Subject: [PATCH 163/199] i40iw: Remove pre-production workaround for resource profile 1

Support for resource profile 1 is currently deprecated due to a
pre-production erratum. Remove this workaround as it is no longer
needed.

Signed-off-by: Shiraz Saleem
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/i40iw/i40iw_main.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c
index b08862978de8..9cd0d3ef9057 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_main.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_main.c
@@ -1560,8 +1560,6 @@ static enum i40iw_status_code i40iw_setup_init_state(struct i40iw_handler *hdl,
 	enum i40iw_status_code status;

 	memcpy(&hdl->ldev, ldev, sizeof(*ldev));
-	if (resource_profile == 1)
-		resource_profile = 2;

 	iwdev->mpa_version = mpa_version;
 	iwdev->resource_profile = (resource_profile < I40IW_HMC_PROFILE_EQUAL) ?

From ca486a3b338ea0858104bab80d86475de3575966 Mon Sep 17 00:00:00 2001
From: Parav Pandit
Date: Sun, 1 Apr 2018 13:51:28 -0500
Subject: [PATCH 164/199] IB/qedr: Remove GID add/del dummy routines

The qedr driver's add_gid() and del_gid() callbacks perform simple
checks which are already done by the ib core before invoking these
callback routines. Therefore, simplify the code by not implementing the
add_gid() and del_gid() callback functions.
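The pattern behind this cleanup: validation belongs in the shared core,
not in each provider. A hedged sketch of the idea (the wrapper below is
hypothetical, not the actual ib_core code, though the types and
rdma_is_port_valid() are real); the removed qedr routines that follow
show exactly such redundant re-checks:

	/* Hypothetical core-side wrapper: by the time the provider
	 * callback runs, the port and context have been validated once,
	 * centrally, so provider-side re-checks are dead code.
	 */
	static int core_add_gid(struct ib_device *dev, u8 port_num,
				unsigned int index, const union ib_gid *gid,
				const struct ib_gid_attr *attr, void **context)
	{
		if (!rdma_is_port_valid(dev, port_num))
			return -EINVAL;
		if (!context)
			return -EINVAL;
		return dev->add_gid(dev, port_num, index, gid, attr, context);
	}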
Signed-off-by: Parav Pandit Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qedr/main.c | 3 --- drivers/infiniband/hw/qedr/verbs.c | 31 ------------------------------ drivers/infiniband/hw/qedr/verbs.h | 5 ----- 3 files changed, 39 deletions(-) diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index 2274d12a4f75..d3ed711b8f92 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -164,9 +164,6 @@ static void qedr_roce_register_device(struct qedr_dev *dev) dev->ibdev.node_type = RDMA_NODE_IB_CA; dev->ibdev.query_gid = qedr_query_gid; - dev->ibdev.add_gid = qedr_add_gid; - dev->ibdev.del_gid = qedr_del_gid; - dev->ibdev.get_port_immutable = qedr_roce_port_immutable; } diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index b61a395f89de..8587a6840c10 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -105,37 +105,6 @@ int qedr_query_gid(struct ib_device *ibdev, u8 port, int index, return rc; } -int qedr_add_gid(struct ib_device *device, u8 port_num, - unsigned int index, const union ib_gid *gid, - const struct ib_gid_attr *attr, void **context) -{ - if (!rdma_cap_roce_gid_table(device, port_num)) - return -EINVAL; - - if (port_num > QEDR_MAX_PORT) - return -EINVAL; - - if (!context) - return -EINVAL; - - return 0; -} - -int qedr_del_gid(struct ib_device *device, u8 port_num, - unsigned int index, void **context) -{ - if (!rdma_cap_roce_gid_table(device, port_num)) - return -EINVAL; - - if (port_num > QEDR_MAX_PORT) - return -EINVAL; - - if (!context) - return -EINVAL; - - return 0; -} - int qedr_query_device(struct ib_device *ibdev, struct ib_device_attr *attr, struct ib_udata *udata) { diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h index 1a94425dea33..b5330495bf7c 100644 --- a/drivers/infiniband/hw/qedr/verbs.h +++ b/drivers/infiniband/hw/qedr/verbs.h @@ -48,11 +48,6 @@ struct ib_ucontext *qedr_alloc_ucontext(struct ib_device *, struct ib_udata *); int qedr_dealloc_ucontext(struct ib_ucontext *); int qedr_mmap(struct ib_ucontext *, struct vm_area_struct *vma); -int qedr_del_gid(struct ib_device *device, u8 port_num, - unsigned int index, void **context); -int qedr_add_gid(struct ib_device *device, u8 port_num, - unsigned int index, const union ib_gid *gid, - const struct ib_gid_attr *attr, void **context); struct ib_pd *qedr_alloc_pd(struct ib_device *, struct ib_ucontext *, struct ib_udata *); int qedr_dealloc_pd(struct ib_pd *pd); From 8435168d50e66fa5eae01852769d20a36f9e5e83 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 3 Apr 2018 15:33:01 -0700 Subject: [PATCH 165/199] RDMA/ucma: Don't allow setting RDMA_OPTION_IB_PATH without an RDMA device Check to make sure that ctx->cm_id->device is set before we use it. Otherwise userspace can trigger a NULL dereference by doing RDMA_USER_CM_CMD_SET_OPTION on an ID that is not bound to a device. 
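For readers unfamiliar with the bug class: an rdma_cm ID created from
user space has no ib_device attached until it is bound or resolved, so
any command handler that dereferences ctx->cm_id->device must check it
first. A minimal self-contained sketch (simplified types, not the real
ucma structures):

	struct fake_device { int dummy; };
	struct fake_cm_id  { struct fake_device *device; /* NULL until bound */ };
	struct fake_ctx    { struct fake_cm_id *cm_id; };

	static int set_ib_path(struct fake_ctx *ctx)
	{
		/* The added guard: reject IDs not yet bound to a device
		 * instead of crashing on the NULL pointer below.
		 */
		if (!ctx->cm_id->device)
			return -1;	/* -EINVAL in the kernel */

		/* Safe: only reached once the ID is bound to a device. */
		return ctx->cm_id->device->dummy;
	}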
Cc:
Reported-by:
Signed-off-by: Roland Dreier
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/core/ucma.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 2abb70c26559..5e4f2cce7383 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -1231,6 +1231,9 @@ static int ucma_set_ib_path(struct ucma_context *ctx,
 	if (!optlen)
 		return -EINVAL;

+	if (!ctx->cm_id->device)
+		return -EINVAL;
+
 	memset(&sa_path, 0, sizeof(sa_path));

 	sa_path.rec_type = SA_PATH_REC_TYPE_IB;

From 72e1ff0fb7e09c34956e4b3f619481da4d9787c1 Mon Sep 17 00:00:00 2001
From: Parav Pandit
Date: Sun, 1 Apr 2018 15:08:18 +0300
Subject: [PATCH 166/199] RDMA/core: Update query_gid documentation for HCA drivers

query_gid() should return the right GID value for the iWARP and IB link
layers. It is a no-op for the RoCE link layer. Update the documentation
to reflect this.

Signed-off-by: Parav Pandit
Signed-off-by: Leon Romanovsky
Signed-off-by: Jason Gunthorpe
---
 include/rdma/ib_verbs.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 48f416fabe0c..a2f658125795 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2141,6 +2141,10 @@ struct ib_device {
 	 */
 	struct net_device	  *(*get_netdev)(struct ib_device *device,
 						 u8 port_num);
+	/* query_gid should be return GID value for @device, when @port_num
+	 * link layer is either IB or iWarp. It is no-op if @port_num port
+	 * is RoCE link layer.
+	 */
 	int		           (*query_gid)(struct ib_device *device,
 						u8 port_num, int index,
 						union ib_gid *gid);

From 0e1f9b924471c132dcf314476916e3c4bd4956b2 Mon Sep 17 00:00:00 2001
From: Parav Pandit
Date: Sun, 1 Apr 2018 15:08:19 +0300
Subject: [PATCH 167/199] RDMA/providers: Simplify query_gid callback of RoCE providers

ib_query_gid() fetches the GID from the software cache maintained in
ib_core for RoCE ports. Therefore, simplify the provider drivers for
RoCE to treat the query_gid() callback as never called for RoCE, and
only require non-RoCE devices to implement it.
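The resulting contract for providers can be summarized in a short
hedged sketch (illustrative registration code, not from any specific
driver): IB and iWARP devices supply query_gid(), RoCE devices simply
leave the pointer NULL, and the core's ib_query_gid() returns
-EOPNOTSUPP if the callback is absent, as the device.c hunk that
follows shows.

	/* Illustrative only: how a provider might wire this up. */
	static int my_query_gid(struct ib_device *ibdev, u8 port, int index,
				union ib_gid *gid)
	{
		/* An IB/iWARP driver reads the GID from its hardware
		 * table here; this sketch just returns a zeroed GID.
		 */
		memset(gid, 0, sizeof(*gid));
		return 0;
	}

	static void example_register(struct ib_device *ibdev, bool is_roce)
	{
		/* RoCE ports are served from the core GID cache, so no
		 * provider callback is needed or called.
		 */
		ibdev->query_gid = is_roce ? NULL : my_query_gid;
	}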
Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 4 +++- drivers/infiniband/hw/bnxt_re/main.c | 1 - drivers/infiniband/hw/hns/hns_roce_main.c | 7 ------- drivers/infiniband/hw/mlx4/main.c | 17 +---------------- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 1 - drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 18 ------------------ drivers/infiniband/hw/ocrdma/ocrdma_verbs.h | 2 -- drivers/infiniband/hw/qedr/main.c | 1 - drivers/infiniband/hw/qedr/verbs.c | 21 --------------------- drivers/infiniband/hw/qedr/verbs.h | 1 - drivers/infiniband/sw/rxe/rxe_verbs.c | 18 ------------------ 11 files changed, 4 insertions(+), 87 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index dbe984faed65..5d79e8de31f5 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -103,7 +103,6 @@ static int ib_device_check_mandatory(struct ib_device *device) IB_MANDATORY_FUNC(query_device), IB_MANDATORY_FUNC(query_port), IB_MANDATORY_FUNC(query_pkey), - IB_MANDATORY_FUNC(query_gid), IB_MANDATORY_FUNC(alloc_pd), IB_MANDATORY_FUNC(dealloc_pd), IB_MANDATORY_FUNC(create_ah), @@ -884,6 +883,9 @@ int ib_query_gid(struct ib_device *device, if (attr) return -EINVAL; + if (!device->query_gid) + return -EOPNOTSUPP; + return device->query_gid(device, port_num, index, gid); } EXPORT_SYMBOL(ib_query_gid); diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index abe0be8b5ddc..f6c739ec8b62 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -574,7 +574,6 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) ibdev->get_port_immutable = bnxt_re_get_port_immutable; ibdev->get_dev_fw_str = bnxt_re_query_fw_str; ibdev->query_pkey = bnxt_re_query_pkey; - ibdev->query_gid = bnxt_re_query_gid; ibdev->get_netdev = bnxt_re_get_netdev; ibdev->add_gid = bnxt_re_add_gid; ibdev->del_gid = bnxt_re_del_gid; diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 83e21f696bbc..76e2e5b41895 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -296,12 +296,6 @@ static enum rdma_link_layer hns_roce_get_link_layer(struct ib_device *device, return IB_LINK_LAYER_ETHERNET; } -static int hns_roce_query_gid(struct ib_device *ib_dev, u8 port_num, int index, - union ib_gid *gid) -{ - return 0; -} - static int hns_roce_query_pkey(struct ib_device *ib_dev, u8 port, u16 index, u16 *pkey) { @@ -482,7 +476,6 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) ib_dev->modify_port = hns_roce_modify_port; ib_dev->get_link_layer = hns_roce_get_link_layer; ib_dev->get_netdev = hns_roce_get_netdev; - ib_dev->query_gid = hns_roce_query_gid; ib_dev->add_gid = hns_roce_add_gid; ib_dev->del_gid = hns_roce_del_gid; ib_dev->query_pkey = hns_roce_query_pkey; diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index d1be3231f4f0..d9422a44549e 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -888,24 +888,9 @@ out: static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid) { - int ret; - if (rdma_protocol_ib(ibdev, port)) return __mlx4_ib_query_gid(ibdev, port, index, gid, 0); - - if (!rdma_protocol_roce(ibdev, port)) - return -ENODEV; - - if (!rdma_cap_roce_gid_table(ibdev, port)) - return -ENODEV; - - ret = 
ib_get_cached_gid(ibdev, port, index, gid, NULL); - if (ret == -EAGAIN) { - memcpy(gid, &zgid, sizeof(*gid)); - return 0; - } - - return ret; + return 0; } static int mlx4_ib_query_sl2vl(struct ib_device *ibdev, u8 port, u64 *sl2vl_tbl) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 4547aa28d4ae..eb8b6a935016 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -158,7 +158,6 @@ static int ocrdma_register_device(struct ocrdma_dev *dev) dev->ibdev.query_device = ocrdma_query_device; dev->ibdev.query_port = ocrdma_query_port; dev->ibdev.modify_port = ocrdma_modify_port; - dev->ibdev.query_gid = ocrdma_query_gid; dev->ibdev.get_netdev = ocrdma_get_netdev; dev->ibdev.get_link_layer = ocrdma_link_layer; dev->ibdev.alloc_pd = ocrdma_alloc_pd; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 1e3dc92bc37b..784ed6b09a46 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -62,24 +62,6 @@ int ocrdma_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) return 0; } -int ocrdma_query_gid(struct ib_device *ibdev, u8 port, - int index, union ib_gid *sgid) -{ - int ret; - - memset(sgid, 0, sizeof(*sgid)); - if (index >= OCRDMA_MAX_SGID) - return -EINVAL; - - ret = ib_get_cached_gid(ibdev, port, index, sgid, NULL); - if (ret == -EAGAIN) { - memcpy(sgid, &zgid, sizeof(*sgid)); - return 0; - } - - return ret; -} - int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr, struct ib_udata *uhw) { diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h index a48eab35861f..9a9971708646 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h @@ -61,8 +61,6 @@ enum rdma_protocol_type ocrdma_query_protocol(struct ib_device *device, u8 port_num); void ocrdma_get_guid(struct ocrdma_dev *, u8 *guid); -int ocrdma_query_gid(struct ib_device *, u8 port, - int index, union ib_gid *gid); struct net_device *ocrdma_get_netdev(struct ib_device *device, u8 port_num); int ocrdma_query_pkey(struct ib_device *, u8 port, u16 index, u16 *pkey); diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index d3ed711b8f92..e3bd2ca8968d 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -162,7 +162,6 @@ static int qedr_iw_register_device(struct qedr_dev *dev) static void qedr_roce_register_device(struct qedr_dev *dev) { dev->ibdev.node_type = RDMA_NODE_IB_CA; - dev->ibdev.query_gid = qedr_query_gid; dev->ibdev.get_port_immutable = qedr_roce_port_immutable; } diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 8587a6840c10..ccc09a8e4195 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -84,27 +84,6 @@ int qedr_iw_query_gid(struct ib_device *ibdev, u8 port, return 0; } -int qedr_query_gid(struct ib_device *ibdev, u8 port, int index, - union ib_gid *sgid) -{ - struct qedr_dev *dev = get_qedr_dev(ibdev); - int rc = 0; - - if (!rdma_cap_roce_gid_table(ibdev, port)) - return -ENODEV; - - rc = ib_get_cached_gid(ibdev, port, index, sgid, NULL); - if (rc == -EAGAIN) { - memcpy(sgid, &zgid, sizeof(*sgid)); - return 0; - } - - DP_DEBUG(dev, QEDR_MSG_INIT, "query gid: index=%d %llx:%llx\n", index, - sgid->global.interface_id, 
sgid->global.subnet_prefix);
-
-	return rc;
-}
-
 int qedr_query_device(struct ib_device *ibdev,
 		      struct ib_device_attr *attr, struct ib_udata *udata)
 {
diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h
index b5330495bf7c..2c57e4c592a6 100644
--- a/drivers/infiniband/hw/qedr/verbs.h
+++ b/drivers/infiniband/hw/qedr/verbs.h
@@ -38,7 +38,6 @@ int qedr_query_port(struct ib_device *, u8 port, struct ib_port_attr *props);
 int qedr_modify_port(struct ib_device *, u8 port, int mask,
 		     struct ib_port_modify *props);

-int qedr_query_gid(struct ib_device *, u8 port, int index, union ib_gid *gid);
 int qedr_iw_query_gid(struct ib_device *ibdev, u8 port,
 		      int index, union ib_gid *gid);

diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 5ef8c3333e43..f83bbf550ec0 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -77,23 +77,6 @@ out:
 	return rc;
 }

-static int rxe_query_gid(struct ib_device *device,
-			 u8 port_num, int index, union ib_gid *gid)
-{
-	int ret;
-
-	if (index > RXE_PORT_GID_TBL_LEN)
-		return -EINVAL;
-
-	ret = ib_get_cached_gid(device, port_num, index, gid, NULL);
-	if (ret == -EAGAIN) {
-		memcpy(gid, &zgid, sizeof(*gid));
-		return 0;
-	}
-
-	return ret;
-}
-
 static int rxe_add_gid(struct ib_device *device, u8 port_num,
 		       unsigned int index, const union ib_gid *gid,
 		       const struct ib_gid_attr *attr, void **context)
@@ -1285,7 +1268,6 @@ int rxe_register_device(struct rxe_dev *rxe)
 	dev->query_port = rxe_query_port;
 	dev->modify_port = rxe_modify_port;
 	dev->get_link_layer = rxe_get_link_layer;
-	dev->query_gid = rxe_query_gid;
 	dev->get_netdev = rxe_get_netdev;
 	dev->add_gid = rxe_add_gid;
 	dev->del_gid = rxe_del_gid;

From f35faa4ba9568138eea1c58abb92e8ef415dce41 Mon Sep 17 00:00:00 2001
From: Parav Pandit
Date: Sun, 1 Apr 2018 15:08:20 +0300
Subject: [PATCH 168/199] IB/core: Simplify ib_query_gid to always refer to cache

Currently, the following inconsistencies exist.

1. ib_query_gid() returns the GID from the software cache for a RoCE
port and returns the GID from the HCA for an IB port. This is
incorrect, because the software GID cache is maintained regardless of
the HCA port type.

2. The GID is queried from the HCA via ib_query_gid() and updated in
the software cache for the IB link layer, and the two might not be in
sync.

ULPs such as the SRP initiator, the SRP target and the IPoIB driver
have historically used the ib_query_gid() API to query the GID.
However, the CM used the cached version during CM processing; when the
software cache was introduced, this inconsistency remained.

In order to simplify the code, improve readability and avoid the
link-layer-specific inconsistencies above, this patch makes the
following changes.

1. ib_query_gid() always refers to the cache layer, regardless of the
link layer.

2. The cache module, which reads the GID entry from the HCA and builds
the cache, directly invokes the HCA provider's query_gid() verb
callback function.

3. ib_query_port() is called at an early stage, while reading the port
immutable properties, when the GID cache is not yet built. Therefore it
needs to read the default GID from the HCA for the IB link layer in
order to publish the subnet prefix.
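The shape of the change is easiest to see side by side. This is a
hedged restatement of the ib_query_gid() hunks that follow (simplified;
the _old/_new suffixes are editorial, not kernel names):

	/* Before (simplified): the source of truth depended on the
	 * link layer.
	 */
	int ib_query_gid_old(struct ib_device *device, u8 port_num, int index,
			     union ib_gid *gid, struct ib_gid_attr *attr)
	{
		if (rdma_protocol_roce(device, port_num))
			return ib_get_cached_gid(device, port_num, index,
						 gid, attr);
		return device->query_gid(device, port_num, index, gid);
	}

	/* After: one source of truth. The cache module itself now calls
	 * device->query_gid() to populate the cache for non-RoCE ports.
	 */
	int ib_query_gid_new(struct ib_device *device, u8 port_num, int index,
			     union ib_gid *gid, struct ib_gid_attr *attr)
	{
		return ib_get_cached_gid(device, port_num, index, gid, attr);
	}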
Signed-off-by: Parav Pandit
Signed-off-by: Leon Romanovsky
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/core/cache.c  |  4 ++--
 drivers/infiniband/core/device.c | 15 +++------------
 2 files changed, 5 insertions(+), 14 deletions(-)

diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 552f3c8dc246..e03eaf0c7527 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -1116,8 +1116,8 @@ static void ib_cache_update(struct ib_device *device,

 	if (!use_roce_gid_table) {
 		for (i = 0; i < gid_cache->table_len; ++i) {
-			ret = ib_query_gid(device, port, i,
-					   gid_cache->table + i, NULL);
+			ret = device->query_gid(device, port, i,
+						gid_cache->table + i);
 			if (ret) {
 				pr_warn("ib_query_gid failed (%d) for %s (index %d)\n",
 					ret, device->name, i);
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 5d79e8de31f5..601ff782e5f3 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -853,7 +853,7 @@ int ib_query_port(struct ib_device *device,
 	if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND)
 		return 0;

-	err = ib_query_gid(device, port_num, 0, &gid, NULL);
+	err = device->query_gid(device, port_num, 0, &gid);
 	if (err)
 		return err;

@@ -871,22 +871,13 @@ EXPORT_SYMBOL(ib_query_port);
 * @attr: Returned GID attributes related to this GID index (only in RoCE).
 *   NULL means ignore.
 *
- * ib_query_gid() fetches the specified GID table entry.
+ * ib_query_gid() fetches the specified GID table entry from the cache.
 */
 int ib_query_gid(struct ib_device *device,
 		 u8 port_num, int index, union ib_gid *gid,
 		 struct ib_gid_attr *attr)
 {
-	if (rdma_protocol_roce(device, port_num))
-		return ib_get_cached_gid(device, port_num, index, gid, attr);
-
-	if (attr)
-		return -EINVAL;
-
-	if (!device->query_gid)
-		return -EOPNOTSUPP;
-
-	return device->query_gid(device, port_num, index, gid);
+	return ib_get_cached_gid(device, port_num, index, gid, attr);
 }
 EXPORT_SYMBOL(ib_query_gid);

From 598ff6bae689453aa894bc38f3f1bb78eb131a61 Mon Sep 17 00:00:00 2001
From: Parav Pandit
Date: Sun, 1 Apr 2018 15:08:21 +0300
Subject: [PATCH 169/199] IB/core: Refactor GID modify code for RoCE

The code is refactored to prepare separate functions for RoCE, which
can do more complex operations related to reference counting, while
still maintaining code readability. This includes

(a) Simplification to not perform netdevice checks and modifications
for the IB link layer.
(b) Do not add a RoCE GID entry which has a NULL netdevice; instead
return an error.
(c) If GID addition fails at the provider level add_gid(), do not add
the entry in the cache and keep the entry marked as INVALID.
(d) Simplify and reuse the ib_cache_gid_add()/del() routines so that
they can be used even for modifying default GIDs. This avoids some
code duplication in modifying default GIDs.
(e) The find_gid() routine refers to the data entry flags to qualify a
GID as valid or invalid rather than depending on the attributes and
zeroness of the GID content.
(f) gid_table_reserve_default() sets the GID default attribute at the
beginning while setting up the GID table. There is no need to use the
default_gid flag in low level functions such as write_gid(), add_gid()
and del_gid(), as they never need to update the DEFAULT property of
the GID entry during a GID table update.

As a result of this refactor, the reserved GID 0:0:0:0:0:0:0:0 is no
longer searchable, as described below.
A unicast GID entry of 0:0:0:0:0:0:0:0 is the Reserved GID as per the
IB spec version 1.3 section 4.1.1, point (6), a snippet of which is
below.

"The unicast GID address 0:0:0:0:0:0:0:0 is reserved - referred to as
the Reserved GID. It shall never be assigned to any endport. It shall
not be used as a destination address or in a global routing header
(GRH)."

The GID table cache now only stores valid GID entries. Before this
patch, the Reserved GID 0:0:0:0:0:0:0:0 was searchable in the GID
table using ib_find_cached_gid_by_port() and other similar find
routines. The zero GID is no longer searchable, as it shall not be
present in a GRH or path record entry, as described in IB spec version
1.3 section 4.1.1, point (6), section 12.7.10 and section 12.7.20.

ib_cache_update() is simplified to check the link layer once, use a
unified locking scheme for all link layers, and remove the temporary
gid table allocation/free logic.

Additionally,
(a) Expand ib_gid_attr to store port and index so that GID query
routines can get the port and index information from the attribute
structure.
(b) Expand ib_gid_attr to store the device as well, so that in future
code, when GID reference counting is done, the device can be used to
reach back to the GID table entry.

Signed-off-by: Parav Pandit
Signed-off-by: Leon Romanovsky
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/core/cache.c | 486 +++++++++++++++++---------------
 drivers/infiniband/core/sysfs.c |  18 +-
 include/rdma/ib_verbs.h         |   5 +-
 3 files changed, 273 insertions(+), 236 deletions(-)

diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index e03eaf0c7527..045ca11fa135 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -59,8 +59,6 @@ struct ib_update_work {
 union ib_gid zgid;
 EXPORT_SYMBOL(zgid);

-static const struct ib_gid_attr zattr;
-
 enum gid_attr_find_mask {
 	GID_ATTR_FIND_MASK_GID		= 1UL << 0,
 	GID_ATTR_FIND_MASK_NETDEV	= 1UL << 1,
@@ -73,15 +71,6 @@ enum gid_table_entry_props {
 	GID_TABLE_ENTRY_DEFAULT		= 1UL << 1,
 };

-enum gid_table_write_action {
-	GID_TABLE_WRITE_ACTION_ADD,
-	GID_TABLE_WRITE_ACTION_DEL,
-	/* MODIFY only updates the GID table. Currently only used by
-	 * ib_cache_update.
-	 */
-	GID_TABLE_WRITE_ACTION_MODIFY
-};
-
 struct ib_gid_table_entry {
 	unsigned long	    props;
 	union ib_gid        gid;
@@ -100,16 +89,13 @@ struct ib_gid_table {
 	 *	(a) Find the GID
 	 *	(b) Delete it.
 	 *
-	 * Add/delete should be carried out atomically.
-	 * This is done by locking this mutex from multiple
-	 * writers. We don't need this lock for IB, as the MAD
-	 * layer replaces all entries. All data_vec entries
-	 * are locked by this lock.
 	 **/
-	struct mutex         lock;
-	/* This lock protects the table entries from being
-	 * read and written simultaneously.
+	/* Any writer to data_vec must hold this lock and the write side of
+	 * rwlock. readers must hold only rwlock. All writers must be in a
+	 * sleepable context.
 	 */
+	struct mutex         lock;
+	/* rwlock protects data_vec[ix]->props. */
 	rwlock_t	     rwlock;
 	struct ib_gid_table_entry *data_vec;
 };
@@ -163,94 +149,128 @@ int ib_cache_gid_parse_type_str(const char *buf)
 }
 EXPORT_SYMBOL(ib_cache_gid_parse_type_str);

-/* This function expects that rwlock will be write locked in all
- * scenarios and that lock will be locked in sleep-able (RoCE)
- * scenarios.
- */ -static int write_gid(struct ib_device *ib_dev, u8 port, - struct ib_gid_table *table, int ix, - const union ib_gid *gid, - const struct ib_gid_attr *attr, - enum gid_table_write_action action, - bool default_gid) - __releases(&table->rwlock) __acquires(&table->rwlock) +static void del_roce_gid(struct ib_device *device, u8 port_num, + struct ib_gid_table *table, int ix) { + pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__, + device->name, port_num, ix, + table->data_vec[ix].gid.raw); + + if (rdma_cap_roce_gid_table(device, port_num)) + device->del_gid(device, port_num, ix, + &table->data_vec[ix].context); + dev_put(table->data_vec[ix].attr.ndev); +} + +static int add_roce_gid(struct ib_gid_table *table, + const union ib_gid *gid, + const struct ib_gid_attr *attr) +{ + struct ib_gid_table_entry *entry; + int ix = attr->index; int ret = 0; - struct net_device *old_net_dev; - enum ib_gid_type old_gid_type; - /* in rdma_cap_roce_gid_table, this funciton should be protected by a - * sleep-able lock. - */ - - if (rdma_cap_roce_gid_table(ib_dev, port)) { - table->data_vec[ix].props |= GID_TABLE_ENTRY_INVALID; - write_unlock_irq(&table->rwlock); - /* GID_TABLE_WRITE_ACTION_MODIFY currently isn't supported by - * RoCE providers and thus only updates the cache. - */ - if (action == GID_TABLE_WRITE_ACTION_ADD) - ret = ib_dev->add_gid(ib_dev, port, ix, gid, attr, - &table->data_vec[ix].context); - else if (action == GID_TABLE_WRITE_ACTION_DEL) - ret = ib_dev->del_gid(ib_dev, port, ix, - &table->data_vec[ix].context); - write_lock_irq(&table->rwlock); + if (!attr->ndev) { + pr_err("%s NULL netdev device=%s port=%d index=%d\n", + __func__, attr->device->name, attr->port_num, + attr->index); + return -EINVAL; } - old_net_dev = table->data_vec[ix].attr.ndev; - old_gid_type = table->data_vec[ix].attr.gid_type; - if (old_net_dev && old_net_dev != attr->ndev) - dev_put(old_net_dev); - /* if modify_gid failed, just delete the old gid */ - if (ret || action == GID_TABLE_WRITE_ACTION_DEL) { - gid = &zgid; - attr = &zattr; - table->data_vec[ix].context = NULL; + entry = &table->data_vec[ix]; + if ((entry->props & GID_TABLE_ENTRY_INVALID) == 0) { + WARN(1, "GID table corruption device=%s port=%d index=%d\n", + attr->device->name, attr->port_num, + attr->index); + return -EINVAL; } - memcpy(&table->data_vec[ix].gid, gid, sizeof(*gid)); - memcpy(&table->data_vec[ix].attr, attr, sizeof(*attr)); - if (default_gid) { - table->data_vec[ix].props |= GID_TABLE_ENTRY_DEFAULT; - if (action == GID_TABLE_WRITE_ACTION_DEL) - table->data_vec[ix].attr.gid_type = old_gid_type; + if (rdma_cap_roce_gid_table(attr->device, attr->port_num)) { + ret = attr->device->add_gid(attr->device, attr->port_num, + ix, gid, attr, &entry->context); + if (ret) { + pr_err("%s GID add failed device=%s port=%d index=%d\n", + __func__, attr->device->name, attr->port_num, + attr->index); + goto add_err; + } } - if (table->data_vec[ix].attr.ndev && - table->data_vec[ix].attr.ndev != old_net_dev) - dev_hold(table->data_vec[ix].attr.ndev); - - table->data_vec[ix].props &= ~GID_TABLE_ENTRY_INVALID; + dev_hold(attr->ndev); +add_err: + if (!ret) + pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__, + attr->device->name, attr->port_num, ix, gid->raw); return ret; } -static int add_gid(struct ib_device *ib_dev, u8 port, - struct ib_gid_table *table, int ix, - const union ib_gid *gid, - const struct ib_gid_attr *attr, - bool default_gid) { - return write_gid(ib_dev, port, table, ix, gid, attr, - GID_TABLE_WRITE_ACTION_ADD, 
default_gid); +/** + * add_modify_gid - Add or modify GID table entry + * + * @table: GID table in which GID to be added or modified + * @gid: GID content + * @attr: Attributes of the GID + * + * Returns 0 on success or appropriate error code. It accepts zero + * GID addition for non RoCE ports for HCA's who report them as valid + * GID. However such zero GIDs are not added to the cache. + */ +static int add_modify_gid(struct ib_gid_table *table, + const union ib_gid *gid, + const struct ib_gid_attr *attr) +{ + int ret; + + if (rdma_protocol_roce(attr->device, attr->port_num)) { + ret = add_roce_gid(table, gid, attr); + if (ret) + return ret; + } else { + /* + * Some HCA's report multiple GID entries with only one + * valid GID, but remaining as zero GID. + * So ignore such behavior for IB link layer and don't + * fail the call, but don't add such entry to GID cache. + */ + if (!memcmp(gid, &zgid, sizeof(*gid))) + return 0; + } + + lockdep_assert_held(&table->lock); + memcpy(&table->data_vec[attr->index].gid, gid, sizeof(*gid)); + memcpy(&table->data_vec[attr->index].attr, attr, sizeof(*attr)); + + write_lock_irq(&table->rwlock); + table->data_vec[attr->index].props &= ~GID_TABLE_ENTRY_INVALID; + write_unlock_irq(&table->rwlock); + return 0; } -static int modify_gid(struct ib_device *ib_dev, u8 port, - struct ib_gid_table *table, int ix, - const union ib_gid *gid, - const struct ib_gid_attr *attr, - bool default_gid) { - return write_gid(ib_dev, port, table, ix, gid, attr, - GID_TABLE_WRITE_ACTION_MODIFY, default_gid); +/** + * del_gid - Delete GID table entry + * + * @ib_dev: IB device whose GID entry to be deleted + * @port: Port number of the IB device + * @table: GID table of the IB device for a port + * @ix: GID entry index to delete + * + */ +static void del_gid(struct ib_device *ib_dev, u8 port, + struct ib_gid_table *table, int ix) +{ + lockdep_assert_held(&table->lock); + write_lock_irq(&table->rwlock); + table->data_vec[ix].props |= GID_TABLE_ENTRY_INVALID; + write_unlock_irq(&table->rwlock); + + if (rdma_protocol_roce(ib_dev, port)) + del_roce_gid(ib_dev, port, table, ix); + memcpy(&table->data_vec[ix].gid, &zgid, sizeof(zgid)); + memset(&table->data_vec[ix].attr, 0, sizeof(table->data_vec[ix].attr)); + table->data_vec[ix].context = NULL; } -static int del_gid(struct ib_device *ib_dev, u8 port, - struct ib_gid_table *table, int ix, - bool default_gid) { - return write_gid(ib_dev, port, table, ix, &zgid, &zattr, - GID_TABLE_WRITE_ACTION_DEL, default_gid); -} - -/* rwlock should be read locked */ +/* rwlock should be read locked, or lock should be held */ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid, const struct ib_gid_attr *val, bool default_gid, unsigned long mask, int *pempty) @@ -266,15 +286,32 @@ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid, i++; + /* find_gid() is used during GID addition where it is expected + * to return a free entry slot which is not duplicate. + * Free entry slot is requested and returned if pempty is set, + * so lookup free slot only if requested. + */ + if (pempty && empty < 0) { + if (data->props & GID_TABLE_ENTRY_INVALID) { + /* Found an invalid (free) entry; allocate it */ + if (data->props & GID_TABLE_ENTRY_DEFAULT) { + if (default_gid) + empty = curr_index; + } else { + empty = curr_index; + } + } + } + + /* + * Additionally find_gid() is used to find valid entry during + * lookup operation, where validity needs to be checked. 
So + * find the empty entry first to continue to search for a free + * slot and ignore its INVALID flag. + */ if (data->props & GID_TABLE_ENTRY_INVALID) continue; - if (empty < 0) - if (!memcmp(&data->gid, &zgid, sizeof(*gid)) && - !memcmp(attr, &zattr, sizeof(*attr)) && - !data->props) - empty = curr_index; - if (found >= 0) continue; @@ -310,19 +347,55 @@ static void make_default_gid(struct net_device *dev, union ib_gid *gid) addrconf_ifid_eui48(&gid->raw[8], dev); } -int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, - union ib_gid *gid, struct ib_gid_attr *attr) +static int __ib_cache_gid_add(struct ib_device *ib_dev, u8 port, + union ib_gid *gid, struct ib_gid_attr *attr, + unsigned long mask, bool default_gid) { struct ib_gid_table *table; - int ix; int ret = 0; - struct net_device *idev; int empty; + int ix; + + /* Do not allow adding zero GID in support of + * IB spec version 1.3 section 4.1.1 point (6) and + * section 12.7.10 and section 12.7.20 + */ + if (!memcmp(gid, &zgid, sizeof(*gid))) + return -EINVAL; table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; - if (!memcmp(gid, &zgid, sizeof(*gid))) - return -EINVAL; + mutex_lock(&table->lock); + + ix = find_gid(table, gid, attr, default_gid, mask, &empty); + if (ix >= 0) + goto out_unlock; + + if (empty < 0) { + ret = -ENOSPC; + goto out_unlock; + } + attr->device = ib_dev; + attr->index = empty; + attr->port_num = port; + ret = add_modify_gid(table, gid, attr); + if (!ret) + dispatch_gid_change_event(ib_dev, port); + +out_unlock: + mutex_unlock(&table->lock); + if (ret) + pr_warn("%s: unable to add gid %pI6 error=%d\n", + __func__, gid->raw, ret); + return ret; +} + +int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, + union ib_gid *gid, struct ib_gid_attr *attr) +{ + struct net_device *idev; + unsigned long mask; + int ret; if (ib_dev->get_netdev) { idev = ib_dev->get_netdev(ib_dev, port); @@ -340,27 +413,11 @@ int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, dev_put(idev); } - mutex_lock(&table->lock); - write_lock_irq(&table->rwlock); + mask = GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE | + GID_ATTR_FIND_MASK_NETDEV; - ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID | - GID_ATTR_FIND_MASK_GID_TYPE | - GID_ATTR_FIND_MASK_NETDEV, &empty); - if (ix >= 0) - goto out_unlock; - - if (empty < 0) { - ret = -ENOSPC; - goto out_unlock; - } - - ret = add_gid(ib_dev, port, table, empty, gid, attr, false); - if (!ret) - dispatch_gid_change_event(ib_dev, port); - -out_unlock: - write_unlock_irq(&table->rwlock); - mutex_unlock(&table->lock); + ret = __ib_cache_gid_add(ib_dev, port, gid, attr, mask, false); return ret; } @@ -368,29 +425,32 @@ int ib_cache_gid_del(struct ib_device *ib_dev, u8 port, union ib_gid *gid, struct ib_gid_attr *attr) { struct ib_gid_table *table; + int ret = 0; int ix; table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; mutex_lock(&table->lock); - write_lock_irq(&table->rwlock); ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE | - GID_ATTR_FIND_MASK_NETDEV | - GID_ATTR_FIND_MASK_DEFAULT, + GID_ATTR_FIND_MASK_NETDEV, NULL); - if (ix < 0) + if (ix < 0) { + ret = -EINVAL; goto out_unlock; + } - if (!del_gid(ib_dev, port, table, ix, false)) - dispatch_gid_change_event(ib_dev, port); + del_gid(ib_dev, port, table, ix); + dispatch_gid_change_event(ib_dev, port); out_unlock: - write_unlock_irq(&table->rwlock); mutex_unlock(&table->lock); - return 0; + if (ret) + pr_debug("%s: can't delete gid %pI6 error=%d\n", + 
__func__, gid->raw, ret); + return ret; } int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, @@ -403,16 +463,14 @@ int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; mutex_lock(&table->lock); - write_lock_irq(&table->rwlock); - for (ix = 0; ix < table->sz; ix++) - if (table->data_vec[ix].attr.ndev == ndev) - if (!del_gid(ib_dev, port, table, ix, - !!(table->data_vec[ix].props & - GID_TABLE_ENTRY_DEFAULT))) - deleted = true; + for (ix = 0; ix < table->sz; ix++) { + if (table->data_vec[ix].attr.ndev == ndev) { + del_gid(ib_dev, port, table, ix); + deleted = true; + } + } - write_unlock_irq(&table->rwlock); mutex_unlock(&table->lock); if (deleted) @@ -609,6 +667,7 @@ static struct ib_gid_table *alloc_gid_table(int sz) { struct ib_gid_table *table = kzalloc(sizeof(struct ib_gid_table), GFP_KERNEL); + int i; if (!table) return NULL; @@ -622,6 +681,11 @@ static struct ib_gid_table *alloc_gid_table(int sz) table->sz = sz; rwlock_init(&table->rwlock); + /* Mark all entries as invalid so that allocator can allocate + * one of the invalid (free) entry. + */ + for (i = 0; i < sz; i++) + table->data_vec[i].props |= GID_TABLE_ENTRY_INVALID; return table; err_free_table: @@ -646,16 +710,15 @@ static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port, if (!table) return; - write_lock_irq(&table->rwlock); + mutex_lock(&table->lock); for (i = 0; i < table->sz; ++i) { if (memcmp(&table->data_vec[i].gid, &zgid, - sizeof(table->data_vec[i].gid))) - if (!del_gid(ib_dev, port, table, i, - table->data_vec[i].props & - GID_ATTR_FIND_MASK_DEFAULT)) - deleted = true; + sizeof(table->data_vec[i].gid))) { + del_gid(ib_dev, port, table, i); + deleted = true; + } } - write_unlock_irq(&table->rwlock); + mutex_unlock(&table->lock); if (deleted) dispatch_gid_change_event(ib_dev, port); @@ -668,9 +731,9 @@ void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, { union ib_gid gid; struct ib_gid_attr gid_attr; - struct ib_gid_attr zattr_type = zattr; struct ib_gid_table *table; unsigned int gid_type; + unsigned long mask; table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; @@ -679,60 +742,19 @@ void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, gid_attr.ndev = ndev; for (gid_type = 0; gid_type < IB_GID_TYPE_SIZE; ++gid_type) { - int ix; - union ib_gid current_gid; - struct ib_gid_attr current_gid_attr = {}; - if (1UL << gid_type & ~gid_type_mask) continue; gid_attr.gid_type = gid_type; - mutex_lock(&table->lock); - write_lock_irq(&table->rwlock); - ix = find_gid(table, NULL, &gid_attr, true, - GID_ATTR_FIND_MASK_GID_TYPE | - GID_ATTR_FIND_MASK_DEFAULT, - NULL); - - /* Coudn't find default GID location */ - if (WARN_ON(ix < 0)) - goto release; - - zattr_type.gid_type = gid_type; - - if (!__ib_cache_gid_get(ib_dev, port, ix, - ¤t_gid, ¤t_gid_attr) && - mode == IB_CACHE_GID_DEFAULT_MODE_SET && - !memcmp(&gid, ¤t_gid, sizeof(gid)) && - !memcmp(&gid_attr, ¤t_gid_attr, sizeof(gid_attr))) - goto release; - - if (memcmp(¤t_gid, &zgid, sizeof(current_gid)) || - memcmp(¤t_gid_attr, &zattr_type, - sizeof(current_gid_attr))) { - if (del_gid(ib_dev, port, table, ix, true)) { - pr_warn("ib_cache_gid: can't delete index %d for default gid %pI6\n", - ix, gid.raw); - goto release; - } else { - dispatch_gid_change_event(ib_dev, port); - } - } - if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) { - if (add_gid(ib_dev, port, table, ix, &gid, &gid_attr, true)) - pr_warn("ib_cache_gid: unable 
to add default gid %pI6\n", - gid.raw); - else - dispatch_gid_change_event(ib_dev, port); + mask = GID_ATTR_FIND_MASK_GID_TYPE | + GID_ATTR_FIND_MASK_DEFAULT; + __ib_cache_gid_add(ib_dev, port, &gid, + &gid_attr, mask, true); + } else if (mode == IB_CACHE_GID_DEFAULT_MODE_DELETE) { + ib_cache_gid_del(ib_dev, port, &gid, &gid_attr); } - -release: - if (current_gid_attr.ndev) - dev_put(current_gid_attr.ndev); - write_unlock_irq(&table->rwlock); - mutex_unlock(&table->lock); } } @@ -1057,25 +1079,50 @@ int ib_get_cached_port_state(struct ib_device *device, } EXPORT_SYMBOL(ib_get_cached_port_state); +static int config_non_roce_gid_cache(struct ib_device *device, + u8 port, int gid_tbl_len) +{ + struct ib_gid_attr gid_attr = {}; + struct ib_gid_table *table; + union ib_gid gid; + int ret = 0; + int i; + + gid_attr.device = device; + gid_attr.port_num = port; + table = device->cache.ports[port - rdma_start_port(device)].gid; + + mutex_lock(&table->lock); + for (i = 0; i < gid_tbl_len; ++i) { + if (!device->query_gid) + continue; + ret = device->query_gid(device, port, i, &gid); + if (ret) { + pr_warn("query_gid failed (%d) for %s (index %d)\n", + ret, device->name, i); + goto err; + } + gid_attr.index = i; + add_modify_gid(table, &gid, &gid_attr); + } +err: + mutex_unlock(&table->lock); + return ret; +} + static void ib_cache_update(struct ib_device *device, u8 port, bool enforce_security) { struct ib_port_attr *tprops = NULL; struct ib_pkey_cache *pkey_cache = NULL, *old_pkey_cache; - struct ib_gid_cache { - int table_len; - union ib_gid table[0]; - } *gid_cache = NULL; int i; int ret; struct ib_gid_table *table; - bool use_roce_gid_table; if (!rdma_is_port_valid(device, port)) return; - use_roce_gid_table = rdma_protocol_roce(device, port); table = device->cache.ports[port - rdma_start_port(device)].gid; tprops = kmalloc(sizeof *tprops, GFP_KERNEL); @@ -1089,6 +1136,13 @@ static void ib_cache_update(struct ib_device *device, goto err; } + if (!rdma_protocol_roce(device, port)) { + ret = config_non_roce_gid_cache(device, port, + tprops->gid_tbl_len); + if (ret) + goto err; + } + pkey_cache = kmalloc(sizeof *pkey_cache + tprops->pkey_tbl_len * sizeof *pkey_cache->table, GFP_KERNEL); if (!pkey_cache) @@ -1096,15 +1150,6 @@ static void ib_cache_update(struct ib_device *device, pkey_cache->table_len = tprops->pkey_tbl_len; - if (!use_roce_gid_table) { - gid_cache = kmalloc(sizeof(*gid_cache) + tprops->gid_tbl_len * - sizeof(*gid_cache->table), GFP_KERNEL); - if (!gid_cache) - goto err; - - gid_cache->table_len = tprops->gid_tbl_len; - } - for (i = 0; i < pkey_cache->table_len; ++i) { ret = ib_query_pkey(device, port, i, pkey_cache->table + i); if (ret) { @@ -1114,33 +1159,12 @@ static void ib_cache_update(struct ib_device *device, } } - if (!use_roce_gid_table) { - for (i = 0; i < gid_cache->table_len; ++i) { - ret = device->query_gid(device, port, i, - gid_cache->table + i); - if (ret) { - pr_warn("ib_query_gid failed (%d) for %s (index %d)\n", - ret, device->name, i); - goto err; - } - } - } - write_lock_irq(&device->cache.lock); old_pkey_cache = device->cache.ports[port - rdma_start_port(device)].pkey; device->cache.ports[port - rdma_start_port(device)].pkey = pkey_cache; - if (!use_roce_gid_table) { - write_lock(&table->rwlock); - for (i = 0; i < gid_cache->table_len; i++) { - modify_gid(device, port, table, i, gid_cache->table + i, - &zattr, false); - } - write_unlock(&table->rwlock); - } - device->cache.ports[port - rdma_start_port(device)].lmc = tprops->lmc; device->cache.ports[port - 
rdma_start_port(device)].port_state = tprops->state; @@ -1154,14 +1178,12 @@ static void ib_cache_update(struct ib_device *device, port, tprops->subnet_prefix); - kfree(gid_cache); kfree(old_pkey_cache); kfree(tprops); return; err: kfree(pkey_cache); - kfree(gid_cache); kfree(tprops); } diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 9b0fbab41dc6..31c7efaf8e7a 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -389,14 +389,26 @@ static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr, { struct port_table_attribute *tab_attr = container_of(attr, struct port_table_attribute, attr); + union ib_gid *pgid; union ib_gid gid; ssize_t ret; ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid, NULL); - if (ret) - return ret; - return sprintf(buf, "%pI6\n", gid.raw); + /* If reading the GID fails, it is likely because the GID entry is + * empty (invalid) or reserved in the table. + * User space expects to read GID table entries as long as the given + * index is within the GID table size. + * An administrative/debugging tool would otherwise fail to query the + * rest of the GID entries once it hits an error on one index. + * To avoid user space treating a failed GID read as an error, return + * the zero GID as before. This maintains backward compatibility. + */ + if (ret) + pgid = &zgid; + else + pgid = &gid; + return sprintf(buf, "%pI6\n", pgid->raw); } static ssize_t show_port_gid_attr_ndev(struct ib_port *p, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index a2f658125795..dc2541f13d7f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -91,8 +91,11 @@ enum ib_gid_type { #define ROCE_V2_UDP_DPORT 4791 struct ib_gid_attr { - enum ib_gid_type gid_type; struct net_device *ndev; + struct ib_device *device; + enum ib_gid_type gid_type; + u16 index; + u8 port_num; }; enum rdma_node_type { From 14169e333e712e3640a6e4b1a81239ce952e3fcf Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 1 Apr 2018 15:08:22 +0300 Subject: [PATCH 170/199] IB/providers: Avoid zero GID check for RoCE Now that the IB core GID cache ensures that a zero GID doesn't exist in the GID table, remove the zero GID checks from the provider drivers for clarity.
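For reference, the now-redundant provider-side pattern removed by the hunks below has this shape (a minimal sketch; the surrounding context differs per driver):

	union ib_gid gid;	/* filled in by ib_get_cached_gid() or similar */

	/* Dead code once the core cache never hands out a zero GID */
	if (!memcmp(&gid, &zgid, sizeof(gid)))
		return -EINVAL;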
Reviewed-by: Mark Bloch Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/main.c | 3 --- drivers/infiniband/hw/mlx4/qp.c | 2 -- drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 6 +----- drivers/infiniband/hw/qedr/qedr_roce_cm.c | 6 ------ drivers/infiniband/hw/qedr/verbs.c | 3 --- 5 files changed, 1 insertion(+), 19 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index d9422a44549e..a31a3edfbf28 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -411,9 +411,6 @@ int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev, if (attr.ndev) dev_put(attr.ndev); - if (!memcmp(&gid, &zgid, sizeof(gid))) - return -EINVAL; - spin_lock_irqsave(&iboe->lock, flags); port_gid_table = &iboe->gids[port_num - 1]; diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 523028e944ed..726a6ae90de0 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -2370,8 +2370,6 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, status = ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &gid_attr); - if (!status && !memcmp(&gid, &zgid, sizeof(gid))) - status = -ENOENT; if (!status && gid_attr.ndev) { vlan = rdma_vlan_dev_vlan_id(gid_attr.ndev); memcpy(smac, gid_attr.ndev->dev_addr, ETH_ALEN); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index 90cf77223771..12783262eb75 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -2494,7 +2494,7 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp, { int status; struct rdma_ah_attr *ah_attr = &attrs->ah_attr; - union ib_gid sgid, zgid; + union ib_gid sgid; struct ib_gid_attr sgid_attr; u32 vlan_id = 0xFFFF; u8 mac_addr[6], hdr_type; @@ -2533,10 +2533,6 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp, dev_put(sgid_attr.ndev); } - memset(&zgid, 0, sizeof(zgid)); - if (!memcmp(&sgid, &zgid, sizeof(zgid))) - return -EINVAL; - qp->sgid_idx = grh->sgid_index; memcpy(&cmd->params.sgid[0], &sgid.raw[0], sizeof(cmd->params.sgid)); status = ocrdma_resolve_dmac(dev, ah_attr, &mac_addr[0]); diff --git a/drivers/infiniband/hw/qedr/qedr_roce_cm.c b/drivers/infiniband/hw/qedr/qedr_roce_cm.c index 2bdbb12bfc69..eedb937d6fa9 100644 --- a/drivers/infiniband/hw/qedr/qedr_roce_cm.c +++ b/drivers/infiniband/hw/qedr/qedr_roce_cm.c @@ -420,12 +420,6 @@ static inline int qedr_gsi_build_header(struct qedr_dev *dev, dev_put(sgid_attr.ndev); } - if (!memcmp(&sgid, &zgid, sizeof(sgid))) { - DP_ERR(dev, "gsi post send: GID not found GID index %d\n", - grh->sgid_index); - return -ENOENT; - } - has_udp = (sgid_attr.gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP); if (!has_udp) { /* RoCE v1 */ diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index ccc09a8e4195..a523d6f5fef3 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -1093,9 +1093,6 @@ static inline int get_gid_info_from_table(struct ib_qp *ibqp, if (rc) return rc; - if (!memcmp(&gid, &zgid, sizeof(gid))) - return -ENOENT; - if (gid_attr.ndev) { qp_params->vlan_id = rdma_vlan_dev_vlan_id(gid_attr.ndev); From 3e44e0ee0893cbea257e585dbd4c7d2ff00f1a6b Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 1 Apr 2018 15:08:23 +0300 Subject: [PATCH 171/199] IB/providers: Avoid null netdev check for RoCE Now that IB core GID cache 
ensures that all RoCE entries have an associated netdev remove null checks from the provider drivers for clarity. Reviewed-by: Mark Bloch Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 5 +- drivers/infiniband/hw/hns/hns_roce_ah.c | 8 +-- drivers/infiniband/hw/mlx4/ah.c | 10 ++-- drivers/infiniband/hw/mlx4/qp.c | 2 +- drivers/infiniband/hw/mlx5/main.c | 6 -- drivers/infiniband/hw/ocrdma/ocrdma_ah.c | 8 +-- drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 2 +- drivers/infiniband/hw/qedr/qedr_roce_cm.c | 10 ++-- drivers/infiniband/hw/qedr/verbs.c | 70 +++++++++++------------ drivers/infiniband/sw/rxe/rxe_verbs.c | 4 +- 10 files changed, 53 insertions(+), 72 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 944eb0acbbac..63a0e08dd6fe 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -718,8 +718,7 @@ struct ib_ah *bnxt_re_create_ah(struct ib_pd *ib_pd, grh->sgid_index); goto fail; } - if (sgid_attr.ndev) - dev_put(sgid_attr.ndev); + dev_put(sgid_attr.ndev); /* Get network header type for this GID */ nw_type = ib_gid_to_network_type(sgid_attr.gid_type, &sgid); switch (nw_type) { @@ -1697,7 +1696,7 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, status = ib_get_cached_gid(&rdev->ibdev, 1, grh->sgid_index, &sgid, &sgid_attr); - if (!status && sgid_attr.ndev) { + if (!status) { memcpy(qp->qplib_qp.smac, sgid_attr.ndev->dev_addr, ETH_ALEN); dev_put(sgid_attr.ndev); diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c index 7dd6a66ea244..d74928621559 100644 --- a/drivers/infiniband/hw/hns/hns_roce_ah.c +++ b/drivers/infiniband/hw/hns/hns_roce_ah.c @@ -68,11 +68,9 @@ struct ib_ah *hns_roce_create_ah(struct ib_pd *ibpd, return ERR_PTR(ret); } - if (gid_attr.ndev) { - if (is_vlan_dev(gid_attr.ndev)) - vlan_tag = vlan_dev_vlan_id(gid_attr.ndev); - dev_put(gid_attr.ndev); - } + if (is_vlan_dev(gid_attr.ndev)) + vlan_tag = vlan_dev_vlan_id(gid_attr.ndev); + dev_put(gid_attr.ndev); if (vlan_tag < 0x1000) vlan_tag |= (rdma_ah_get_sl(ah_attr) & diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c index 6dee4fdc5d67..9345d5b546d1 100644 --- a/drivers/infiniband/hw/mlx4/ah.c +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -101,12 +101,10 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, if (ret) return ERR_PTR(ret); eth_zero_addr(ah->av.eth.s_mac); - if (gid_attr.ndev) { - if (is_vlan_dev(gid_attr.ndev)) - vlan_tag = vlan_dev_vlan_id(gid_attr.ndev); - memcpy(ah->av.eth.s_mac, gid_attr.ndev->dev_addr, ETH_ALEN); - dev_put(gid_attr.ndev); - } + if (is_vlan_dev(gid_attr.ndev)) + vlan_tag = vlan_dev_vlan_id(gid_attr.ndev); + memcpy(ah->av.eth.s_mac, gid_attr.ndev->dev_addr, ETH_ALEN); + dev_put(gid_attr.ndev); if (vlan_tag < 0x1000) vlan_tag |= (rdma_ah_get_sl(ah_attr) & 7) << 13; ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 726a6ae90de0..50af8915e7ec 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -2370,7 +2370,7 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, status = ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &gid_attr); - if (!status && gid_attr.ndev) { + if (!status) { vlan = rdma_vlan_dev_vlan_id(gid_attr.ndev); memcpy(smac, 
gid_attr.ndev->dev_addr, ETH_ALEN); dev_put(gid_attr.ndev); diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 6b50711df786..9e6780eadd1e 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -525,9 +525,6 @@ __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, if (ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr)) return 0; - if (!attr.ndev) - return 0; - dev_put(attr.ndev); if (attr.gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP) @@ -547,9 +544,6 @@ int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port_num, if (ret) return ret; - if (!attr.ndev) - return -ENODEV; - dev_put(attr.ndev); *gid_type = attr.gid_type; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c index dec650930ca6..3897b64532e1 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c @@ -193,11 +193,9 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct rdma_ah_attr *attr, __func__, status); goto av_conf_err; } - if (sgid_attr.ndev) { - if (is_vlan_dev(sgid_attr.ndev)) - vlan_tag = vlan_dev_vlan_id(sgid_attr.ndev); - dev_put(sgid_attr.ndev); - } + if (is_vlan_dev(sgid_attr.ndev)) + vlan_tag = vlan_dev_vlan_id(sgid_attr.ndev); + dev_put(sgid_attr.ndev); /* Get network header type for this GID */ ah->hdr_type = ib_gid_to_network_type(sgid_attr.gid_type, &sgid); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index 12783262eb75..2c260e1c29d1 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -2527,7 +2527,7 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp, status = ib_get_cached_gid(&dev->ibdev, 1, grh->sgid_index, &sgid, &sgid_attr); - if (!status && sgid_attr.ndev) { + if (!status) { vlan_id = rdma_vlan_dev_vlan_id(sgid_attr.ndev); memcpy(mac_addr, sgid_attr.ndev->dev_addr, ETH_ALEN); dev_put(sgid_attr.ndev); diff --git a/drivers/infiniband/hw/qedr/qedr_roce_cm.c b/drivers/infiniband/hw/qedr/qedr_roce_cm.c index eedb937d6fa9..0f14e687bb91 100644 --- a/drivers/infiniband/hw/qedr/qedr_roce_cm.c +++ b/drivers/infiniband/hw/qedr/qedr_roce_cm.c @@ -412,13 +412,11 @@ static inline int qedr_gsi_build_header(struct qedr_dev *dev, return rc; } - if (sgid_attr.ndev) { - vlan_id = rdma_vlan_dev_vlan_id(sgid_attr.ndev); - if (vlan_id < VLAN_CFI_MASK) - has_vlan = true; + vlan_id = rdma_vlan_dev_vlan_id(sgid_attr.ndev); + if (vlan_id < VLAN_CFI_MASK) + has_vlan = true; - dev_put(sgid_attr.ndev); - } + dev_put(sgid_attr.ndev); has_udp = (sgid_attr.gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP); if (!has_udp) { diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index a523d6f5fef3..a9f494fb892a 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -1093,43 +1093,41 @@ static inline int get_gid_info_from_table(struct ib_qp *ibqp, if (rc) return rc; - if (gid_attr.ndev) { - qp_params->vlan_id = rdma_vlan_dev_vlan_id(gid_attr.ndev); + qp_params->vlan_id = rdma_vlan_dev_vlan_id(gid_attr.ndev); - dev_put(gid_attr.ndev); - nw_type = ib_gid_to_network_type(gid_attr.gid_type, &gid); - switch (nw_type) { - case RDMA_NETWORK_IPV6: - memcpy(&qp_params->sgid.bytes[0], &gid.raw[0], - sizeof(qp_params->sgid)); - memcpy(&qp_params->dgid.bytes[0], - &grh->dgid, - sizeof(qp_params->dgid)); - qp_params->roce_mode = ROCE_V2_IPV6; - SET_FIELD(qp_params->modify_flags, - 
QED_ROCE_MODIFY_QP_VALID_ROCE_MODE, 1); - break; - case RDMA_NETWORK_IB: - memcpy(&qp_params->sgid.bytes[0], &gid.raw[0], - sizeof(qp_params->sgid)); - memcpy(&qp_params->dgid.bytes[0], - &grh->dgid, - sizeof(qp_params->dgid)); - qp_params->roce_mode = ROCE_V1; - break; - case RDMA_NETWORK_IPV4: - memset(&qp_params->sgid, 0, sizeof(qp_params->sgid)); - memset(&qp_params->dgid, 0, sizeof(qp_params->dgid)); - ipv4_addr = qedr_get_ipv4_from_gid(gid.raw); - qp_params->sgid.ipv4_addr = ipv4_addr; - ipv4_addr = - qedr_get_ipv4_from_gid(grh->dgid.raw); - qp_params->dgid.ipv4_addr = ipv4_addr; - SET_FIELD(qp_params->modify_flags, - QED_ROCE_MODIFY_QP_VALID_ROCE_MODE, 1); - qp_params->roce_mode = ROCE_V2_IPV4; - break; - } + dev_put(gid_attr.ndev); + nw_type = ib_gid_to_network_type(gid_attr.gid_type, &gid); + switch (nw_type) { + case RDMA_NETWORK_IPV6: + memcpy(&qp_params->sgid.bytes[0], &gid.raw[0], + sizeof(qp_params->sgid)); + memcpy(&qp_params->dgid.bytes[0], + &grh->dgid, + sizeof(qp_params->dgid)); + qp_params->roce_mode = ROCE_V2_IPV6; + SET_FIELD(qp_params->modify_flags, + QED_ROCE_MODIFY_QP_VALID_ROCE_MODE, 1); + break; + case RDMA_NETWORK_IB: + memcpy(&qp_params->sgid.bytes[0], &gid.raw[0], + sizeof(qp_params->sgid)); + memcpy(&qp_params->dgid.bytes[0], + &grh->dgid, + sizeof(qp_params->dgid)); + qp_params->roce_mode = ROCE_V1; + break; + case RDMA_NETWORK_IPV4: + memset(&qp_params->sgid, 0, sizeof(qp_params->sgid)); + memset(&qp_params->dgid, 0, sizeof(qp_params->dgid)); + ipv4_addr = qedr_get_ipv4_from_gid(gid.raw); + qp_params->sgid.ipv4_addr = ipv4_addr; + ipv4_addr = + qedr_get_ipv4_from_gid(grh->dgid.raw); + qp_params->dgid.ipv4_addr = ipv4_addr; + SET_FIELD(qp_params->modify_flags, + QED_ROCE_MODIFY_QP_VALID_ROCE_MODE, 1); + qp_params->roce_mode = ROCE_V2_IPV4; + break; } for (i = 0; i < 4; i++) { diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index f83bbf550ec0..0661c2783b14 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -256,9 +256,7 @@ static int rxe_init_av(struct rxe_dev *rxe, struct rdma_ah_attr *attr, rxe_av_from_attr(rdma_ah_get_port_num(attr), av, attr); rxe_av_fill_ip_info(av, attr, &sgid_attr, &sgid); - - if (sgid_attr.ndev) - dev_put(sgid_attr.ndev); + dev_put(sgid_attr.ndev); return 0; } From 414448d249d82c9be93b35e61e0303e84ef2f959 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 1 Apr 2018 15:08:24 +0300 Subject: [PATCH 172/199] RDMA: Use ib_gid_attr during GID modification Now that ib_gid_attr contains device, port and index, simplify the provider APIs add_gid() and del_gid() to use device, port and index fields from the ib_gid_attr attributes structure. 
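For reference, the callback signatures before and after this change, as declared in struct ib_device (taken from the include/rdma/ib_verbs.h hunk below):

	/* Before: device, port and index were passed explicitly */
	int (*add_gid)(struct ib_device *device, u8 port_num,
		       unsigned int index, const union ib_gid *gid,
		       const struct ib_gid_attr *attr, void **context);
	int (*del_gid)(struct ib_device *device, u8 port_num,
		       unsigned int index, void **context);

	/* After: device, port_num and index are all carried by @attr */
	int (*add_gid)(const union ib_gid *gid,
		       const struct ib_gid_attr *attr, void **context);
	int (*del_gid)(const struct ib_gid_attr *attr, void **context);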
Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cache.c | 5 ++- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 10 +++--- drivers/infiniband/hw/bnxt_re/ib_verbs.h | 6 ++-- drivers/infiniband/hw/hns/hns_roce_main.c | 20 +++++------ drivers/infiniband/hw/mlx4/main.c | 30 +++++++---------- drivers/infiniband/hw/mlx5/main.c | 13 ++++---- .../infiniband/hw/vmw_pvrdma/pvrdma_main.c | 31 +++++------------ drivers/infiniband/sw/rxe/rxe_verbs.c | 10 +++--- include/rdma/ib_verbs.h | 33 ++++++++----------- 9 files changed, 63 insertions(+), 95 deletions(-) diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 045ca11fa135..e337b08de2ff 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -157,7 +157,7 @@ static void del_roce_gid(struct ib_device *device, u8 port_num, table->data_vec[ix].gid.raw); if (rdma_cap_roce_gid_table(device, port_num)) - device->del_gid(device, port_num, ix, + device->del_gid(&table->data_vec[ix].attr, &table->data_vec[ix].context); dev_put(table->data_vec[ix].attr.ndev); } @@ -186,8 +186,7 @@ static int add_roce_gid(struct ib_gid_table *table, } if (rdma_cap_roce_gid_table(attr->device, attr->port_num)) { - ret = attr->device->add_gid(attr->device, attr->port_num, - ix, gid, attr, &entry->context); + ret = attr->device->add_gid(gid, attr, &entry->context); if (ret) { pr_err("%s GID add failed device=%s port=%d index=%d\n", __func__, attr->device->name, attr->port_num, diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 63a0e08dd6fe..a76e206704d4 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -314,12 +314,11 @@ int bnxt_re_query_gid(struct ib_device *ibdev, u8 port_num, return rc; } -int bnxt_re_del_gid(struct ib_device *ibdev, u8 port_num, - unsigned int index, void **context) +int bnxt_re_del_gid(const struct ib_gid_attr *attr, void **context) { int rc = 0; struct bnxt_re_gid_ctx *ctx, **ctx_tbl; - struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + struct bnxt_re_dev *rdev = to_bnxt_re_dev(attr->device, ibdev); struct bnxt_qplib_sgid_tbl *sgid_tbl = &rdev->qplib_res.sgid_tbl; struct bnxt_qplib_gid *gid_to_del; @@ -365,15 +364,14 @@ int bnxt_re_del_gid(struct ib_device *ibdev, u8 port_num, return rc; } -int bnxt_re_add_gid(struct ib_device *ibdev, u8 port_num, - unsigned int index, const union ib_gid *gid, +int bnxt_re_add_gid(const union ib_gid *gid, const struct ib_gid_attr *attr, void **context) { int rc; u32 tbl_idx = 0; u16 vlan_id = 0xFFFF; struct bnxt_re_gid_ctx *ctx, **ctx_tbl; - struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + struct bnxt_re_dev *rdev = to_bnxt_re_dev(attr->device, ibdev); struct bnxt_qplib_sgid_tbl *sgid_tbl = &rdev->qplib_res.sgid_tbl; if ((attr->ndev) && is_vlan_dev(attr->ndev)) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index e62b7c2c7da6..5c6414cad4af 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -157,10 +157,8 @@ int bnxt_re_get_port_immutable(struct ib_device *ibdev, u8 port_num, void bnxt_re_query_fw_str(struct ib_device *ibdev, char *str); int bnxt_re_query_pkey(struct ib_device *ibdev, u8 port_num, u16 index, u16 *pkey); -int bnxt_re_del_gid(struct ib_device *ibdev, u8 port_num, - unsigned int index, void **context); -int bnxt_re_add_gid(struct ib_device *ibdev, u8 port_num, - 
unsigned int index, const union ib_gid *gid, +int bnxt_re_del_gid(const struct ib_gid_attr *attr, void **context); +int bnxt_re_add_gid(const union ib_gid *gid, const struct ib_gid_attr *attr, void **context); int bnxt_re_query_gid(struct ib_device *ibdev, u8 port_num, int index, union ib_gid *gid); diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 76e2e5b41895..9d48bc07a9e6 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -74,12 +74,11 @@ static int hns_roce_set_mac(struct hns_roce_dev *hr_dev, u8 port, u8 *addr) return hr_dev->hw->set_mac(hr_dev, phy_port, addr); } -static int hns_roce_add_gid(struct ib_device *device, u8 port_num, - unsigned int index, const union ib_gid *gid, +static int hns_roce_add_gid(const union ib_gid *gid, const struct ib_gid_attr *attr, void **context) { - struct hns_roce_dev *hr_dev = to_hr_dev(device); - u8 port = port_num - 1; + struct hns_roce_dev *hr_dev = to_hr_dev(attr->device); + u8 port = attr->port_num - 1; unsigned long flags; int ret; @@ -88,21 +87,20 @@ static int hns_roce_add_gid(struct ib_device *device, u8 port_num, spin_lock_irqsave(&hr_dev->iboe.lock, flags); - ret = hr_dev->hw->set_gid(hr_dev, port, index, (union ib_gid *)gid, - attr); + ret = hr_dev->hw->set_gid(hr_dev, port, attr->index, + (union ib_gid *)gid, attr); spin_unlock_irqrestore(&hr_dev->iboe.lock, flags); return ret; } -static int hns_roce_del_gid(struct ib_device *device, u8 port_num, - unsigned int index, void **context) +static int hns_roce_del_gid(const struct ib_gid_attr *attr, void **context) { - struct hns_roce_dev *hr_dev = to_hr_dev(device); + struct hns_roce_dev *hr_dev = to_hr_dev(attr->device); struct ib_gid_attr zattr = { }; union ib_gid zgid = { {0} }; - u8 port = port_num - 1; + u8 port = attr->port_num - 1; unsigned long flags; int ret; @@ -111,7 +109,7 @@ static int hns_roce_del_gid(struct ib_device *device, u8 port_num, spin_lock_irqsave(&hr_dev->iboe.lock, flags); - ret = hr_dev->hw->set_gid(hr_dev, port, index, &zgid, &zattr); + ret = hr_dev->hw->set_gid(hr_dev, port, attr->index, &zgid, &zattr); spin_unlock_irqrestore(&hr_dev->iboe.lock, flags); diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index a31a3edfbf28..8eca09b53fe8 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -246,14 +246,11 @@ static int mlx4_ib_update_gids(struct gid_entry *gids, return mlx4_ib_update_gids_v1(gids, ibdev, port_num); } -static int mlx4_ib_add_gid(struct ib_device *device, - u8 port_num, - unsigned int index, - const union ib_gid *gid, +static int mlx4_ib_add_gid(const union ib_gid *gid, const struct ib_gid_attr *attr, void **context) { - struct mlx4_ib_dev *ibdev = to_mdev(device); + struct mlx4_ib_dev *ibdev = to_mdev(attr->device); struct mlx4_ib_iboe *iboe = &ibdev->iboe; struct mlx4_port_gid_table *port_gid_table; int free = -1, found = -1; @@ -262,16 +259,16 @@ static int mlx4_ib_add_gid(struct ib_device *device, int i; struct gid_entry *gids = NULL; - if (!rdma_cap_roce_gid_table(device, port_num)) + if (!rdma_cap_roce_gid_table(attr->device, attr->port_num)) return -EINVAL; - if (port_num > MLX4_MAX_PORTS) + if (attr->port_num > MLX4_MAX_PORTS) return -EINVAL; if (!context) return -EINVAL; - port_gid_table = &iboe->gids[port_num - 1]; + port_gid_table = &iboe->gids[attr->port_num - 1]; spin_lock_bh(&iboe->lock); for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) { if 
(!memcmp(&port_gid_table->gids[i].gid, gid, sizeof(*gid)) && @@ -318,33 +315,30 @@ static int mlx4_ib_add_gid(struct ib_device *device, spin_unlock_bh(&iboe->lock); if (!ret && hw_update) { - ret = mlx4_ib_update_gids(gids, ibdev, port_num); + ret = mlx4_ib_update_gids(gids, ibdev, attr->port_num); kfree(gids); } return ret; } -static int mlx4_ib_del_gid(struct ib_device *device, - u8 port_num, - unsigned int index, - void **context) +static int mlx4_ib_del_gid(const struct ib_gid_attr *attr, void **context) { struct gid_cache_context *ctx = *context; - struct mlx4_ib_dev *ibdev = to_mdev(device); + struct mlx4_ib_dev *ibdev = to_mdev(attr->device); struct mlx4_ib_iboe *iboe = &ibdev->iboe; struct mlx4_port_gid_table *port_gid_table; int ret = 0; int hw_update = 0; struct gid_entry *gids = NULL; - if (!rdma_cap_roce_gid_table(device, port_num)) + if (!rdma_cap_roce_gid_table(attr->device, attr->port_num)) return -EINVAL; - if (port_num > MLX4_MAX_PORTS) + if (attr->port_num > MLX4_MAX_PORTS) return -EINVAL; - port_gid_table = &iboe->gids[port_num - 1]; + port_gid_table = &iboe->gids[attr->port_num - 1]; spin_lock_bh(&iboe->lock); if (ctx) { ctx->refcount--; @@ -376,7 +370,7 @@ static int mlx4_ib_del_gid(struct ib_device *device, spin_unlock_bh(&iboe->lock); if (!ret && hw_update) { - ret = mlx4_ib_update_gids(gids, ibdev, port_num); + ret = mlx4_ib_update_gids(gids, ibdev, attr->port_num); kfree(gids); } return ret; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 9e6780eadd1e..bc9eabd95948 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -502,18 +502,19 @@ static int set_roce_addr(struct mlx5_ib_dev *dev, u8 port_num, vlan_id, port_num); } -static int mlx5_ib_add_gid(struct ib_device *device, u8 port_num, - unsigned int index, const union ib_gid *gid, +static int mlx5_ib_add_gid(const union ib_gid *gid, const struct ib_gid_attr *attr, __always_unused void **context) { - return set_roce_addr(to_mdev(device), port_num, index, gid, attr); + return set_roce_addr(to_mdev(attr->device), attr->port_num, + attr->index, gid, attr); } -static int mlx5_ib_del_gid(struct ib_device *device, u8 port_num, - unsigned int index, __always_unused void **context) +static int mlx5_ib_del_gid(const struct ib_gid_attr *attr, + __always_unused void **context) { - return set_roce_addr(to_mdev(device), port_num, index, NULL, NULL); + return set_roce_addr(to_mdev(attr->device), attr->port_num, + attr->index, NULL, NULL); } __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index 4834460e2a0b..0be33a81bbe6 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -62,17 +62,10 @@ static DEFINE_MUTEX(pvrdma_device_list_lock); static LIST_HEAD(pvrdma_device_list); static struct workqueue_struct *event_wq; -static int pvrdma_add_gid(struct ib_device *ibdev, - u8 port_num, - unsigned int index, - const union ib_gid *gid, +static int pvrdma_add_gid(const union ib_gid *gid, const struct ib_gid_attr *attr, void **context); -static int pvrdma_del_gid(struct ib_device *ibdev, - u8 port_num, - unsigned int index, - void **context); - +static int pvrdma_del_gid(const struct ib_gid_attr *attr, void **context); static ssize_t show_hca(struct device *device, struct device_attribute *attr, char *buf) @@ -657,18 +650,15 @@ static int pvrdma_add_gid_at_index(struct 
pvrdma_dev *dev, return 0; } -static int pvrdma_add_gid(struct ib_device *ibdev, - u8 port_num, - unsigned int index, - const union ib_gid *gid, +static int pvrdma_add_gid(const union ib_gid *gid, const struct ib_gid_attr *attr, void **context) { - struct pvrdma_dev *dev = to_vdev(ibdev); + struct pvrdma_dev *dev = to_vdev(attr->device); return pvrdma_add_gid_at_index(dev, gid, ib_gid_type_to_pvrdma(attr->gid_type), - index); + attr->index); } static int pvrdma_del_gid_at_index(struct pvrdma_dev *dev, int index) @@ -698,17 +688,14 @@ static int pvrdma_del_gid_at_index(struct pvrdma_dev *dev, int index) return 0; } -static int pvrdma_del_gid(struct ib_device *ibdev, - u8 port_num, - unsigned int index, - void **context) +static int pvrdma_del_gid(const struct ib_gid_attr *attr, void **context) { - struct pvrdma_dev *dev = to_vdev(ibdev); + struct pvrdma_dev *dev = to_vdev(attr->device); dev_dbg(&dev->pdev->dev, "removing gid at index %u from %s", - index, dev->netdev->name); + attr->index, dev->netdev->name); - return pvrdma_del_gid_at_index(dev, index); + return pvrdma_del_gid_at_index(dev, attr->index); } static void pvrdma_netdevice_event_handle(struct pvrdma_dev *dev, diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 0661c2783b14..08f3e0618b81 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -77,19 +77,17 @@ out: return rc; } -static int rxe_add_gid(struct ib_device *device, u8 port_num, unsigned int - index, const union ib_gid *gid, +static int rxe_add_gid(const union ib_gid *gid, const struct ib_gid_attr *attr, void **context) { - if (index >= RXE_PORT_GID_TBL_LEN) + if (attr->index >= RXE_PORT_GID_TBL_LEN) return -EINVAL; return 0; } -static int rxe_del_gid(struct ib_device *device, u8 port_num, unsigned int - index, void **context) +static int rxe_del_gid(const struct ib_gid_attr *attr, void **context) { - if (index >= RXE_PORT_GID_TBL_LEN) + if (attr->index >= RXE_PORT_GID_TBL_LEN) return -EINVAL; return 0; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index dc2541f13d7f..1e3059ce73b6 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2151,34 +2151,29 @@ struct ib_device { int (*query_gid)(struct ib_device *device, u8 port_num, int index, union ib_gid *gid); - /* When calling add_gid, the HW vendor's driver should - * add the gid of device @device at gid index @index of - * port @port_num to be @gid. Meta-info of that gid (for example, - * the network device related to this gid is available - * at @attr. @context allows the HW vendor driver to store extra - * information together with a GID entry. The HW vendor may allocate - * memory to contain this information and store it in @context when a - * new GID entry is written to. Params are consistent until the next - * call of add_gid or delete_gid. The function should return 0 on + /* When calling add_gid, the HW vendor's driver should add the gid + * of the device, at the port and gid index available in @attr. Meta-info of + * that gid (for example, the network device related to this gid) is + * available at @attr. @context allows the HW vendor driver to store + * extra information together with a GID entry. The HW vendor driver may + * allocate memory to contain this information and store it in @context + * when a new GID entry is written to. Params are consistent until the + * next call of add_gid or delete_gid. The function should return 0 on * success or error otherwise.
The function could be called - concurrently for different ports. This function is only called - when roce_gid_table is used. + concurrently for different ports. This function is only called when + roce_gid_table is used. */ - int (*add_gid)(struct ib_device *device, - u8 port_num, - unsigned int index, - const union ib_gid *gid, + int (*add_gid)(const union ib_gid *gid, const struct ib_gid_attr *attr, void **context); /* When calling del_gid, the HW vendor's driver should delete the - * gid of device @device at gid index @index of port @port_num. + * gid of the device at the gid index and port + * available in @attr. * Upon the deletion of a GID entry, the HW vendor must free any * allocated memory. The caller will clear @context afterwards. * This function is only called when roce_gid_table is used. */ - int (*del_gid)(struct ib_device *device, - u8 port_num, - unsigned int index, + int (*del_gid)(const struct ib_gid_attr *attr, void **context); int (*query_pkey)(struct ib_device *device, u8 port_num, u16 index, u16 *pkey); From 8c84660bb437fe8692e6a2b4e85023ccb874a520 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Wed, 28 Mar 2018 09:27:41 +0300 Subject: [PATCH 173/199] IB/mlx5: Initialize the parsing tree root without the help of uverbs In order to have a custom parsing tree, a provider driver needs to assign its parsing tree to the ib_device specs_root field. Otherwise, the uverbs client assigns a common default parsing tree for it. In downstream patches, the mlx5_ib driver gains a custom parsing tree, which contains both the common objects and a new flags field for the UVERBS_FLOW_ACTION_ESP_CREATE command. This patch makes mlx5_ib assign its own tree to specs_root, which later on will be extended. Reviewed-by: Yishai Hadas Signed-off-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 38 ++++++++++++++++++++++++++++ drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 + 2 files changed, 39 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index bc9eabd95948..82ad0faf8007 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -60,6 +60,10 @@ #include "ib_rep.h" #include "cmd.h" #include +#include + +#define UVERBS_MODULE_NAME mlx5_ib +#include #define DRIVER_NAME "mlx5_ib" #define DRIVER_VERSION "5.0-0" @@ -4544,6 +4548,24 @@ static void mlx5_ib_cleanup_multiport_master(struct mlx5_ib_dev *dev) mlx5_nic_vport_disable_roce(dev->mdev); } +#define NUM_TREES 0 +static int populate_specs_root(struct mlx5_ib_dev *dev) +{ + const struct uverbs_object_tree_def *default_root[NUM_TREES + 1] = { + uverbs_default_get_objects()}; + size_t num_trees = 1; + + dev->ib_dev.specs_root = + uverbs_alloc_spec_tree(num_trees, default_root); + + return PTR_ERR_OR_ZERO(dev->ib_dev.specs_root); +} + +static void depopulate_specs_root(struct mlx5_ib_dev *dev) +{ + uverbs_free_spec_tree(dev->ib_dev.specs_root); +} + void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) { mlx5_ib_cleanup_multiport_master(dev); @@ -4989,11 +5011,21 @@ void mlx5_ib_stage_bfrag_cleanup(struct mlx5_ib_dev *dev) mlx5_free_bfreg(dev->mdev, &dev->bfreg); } +static int mlx5_ib_stage_populate_specs(struct mlx5_ib_dev *dev) +{ + return populate_specs_root(dev); +} + int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev) { return ib_register_device(&dev->ib_dev, NULL); } +static void mlx5_ib_stage_depopulate_specs(struct mlx5_ib_dev *dev) +{ + depopulate_specs_root(dev); +} + 
void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev) { destroy_umrc_res(dev); @@ -5128,6 +5160,9 @@ static const struct mlx5_ib_profile pf_profile = { STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR, NULL, mlx5_ib_stage_pre_ib_reg_umr_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_SPECS, + mlx5_ib_stage_populate_specs, + mlx5_ib_stage_depopulate_specs), STAGE_CREATE(MLX5_IB_STAGE_IB_REG, mlx5_ib_stage_ib_reg_init, mlx5_ib_stage_ib_reg_cleanup), @@ -5173,6 +5208,9 @@ static const struct mlx5_ib_profile nic_rep_profile = { STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR, NULL, mlx5_ib_stage_pre_ib_reg_umr_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_SPECS, + mlx5_ib_stage_populate_specs, + mlx5_ib_stage_depopulate_specs), STAGE_CREATE(MLX5_IB_STAGE_IB_REG, mlx5_ib_stage_ib_reg_init, mlx5_ib_stage_ib_reg_cleanup), diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index aeea74357cbe..0eda960ab8e0 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -740,6 +740,7 @@ enum mlx5_ib_stages { MLX5_IB_STAGE_UAR, MLX5_IB_STAGE_BFREG, MLX5_IB_STAGE_PRE_IB_REG_UMR, + MLX5_IB_STAGE_SPECS, MLX5_IB_STAGE_IB_REG, MLX5_IB_STAGE_POST_IB_REG_UMR, MLX5_IB_STAGE_DELAY_DROP, From 494c5580aa6721874a6d9d62dac1c94e83e79302 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Wed, 28 Mar 2018 09:27:42 +0300 Subject: [PATCH 174/199] IB/uverbs: Add enum attribute type to ioctl() interface Methods sometimes need to get one attribute out of a group of pre-defined attributes. This is an enum-like behavior. Since this is a common requirement, we add a new ENUM attribute to the generic uverbs ioctl() layer. This attribute is embedded in methods, like any other attribute we currently have. ENUM attributes point to an array of standard UVERBS_ATTR_PTR_IN. User space encodes the enum's attribute id in the id field and the internal PTR_IN attr id in the enum_data.elem_id field. This ENUM attribute could be shared by several attributes and it can get the UVERBS_ATTR_SPEC_F_MANDATORY flag, stating this attribute must be supported by the kernel, like any other attribute.
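As a sketch of the user-space encoding, using the ESP keymat enum added later in this series (error handling and the rest of the ioctl marshalling are omitted):

	struct ib_uverbs_flow_action_esp_keymat_aes_gcm keymat = {};
	struct ib_uverbs_attr attr = {};

	/* id of the ENUM attribute itself */
	attr.attr_id = UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT;
	/* selects which PTR_IN spec in the ids[] array carries the payload */
	attr.attr_data.enum_data.elem_id = IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM;
	attr.len = sizeof(keymat);
	attr.data = (__u64)(uintptr_t)&keymat;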
Reviewed-by: Yishai Hadas Signed-off-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_ioctl.c | 39 ++++++++++++++++++------ include/rdma/uverbs_ioctl.h | 34 +++++++++++++++++++++ include/uapi/rdma/rdma_user_ioctl_cmds.h | 8 ++++- 3 files changed, 71 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 1e6bf2488584..8c93970dc8f1 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -55,14 +55,12 @@ static int uverbs_process_attr(struct ib_device *ibdev, struct ib_uverbs_attr __user *uattr_ptr) { const struct uverbs_attr_spec *spec; + const struct uverbs_attr_spec *val_spec; struct uverbs_attr *e; const struct uverbs_object_spec *object; struct uverbs_obj_attr *o_attr; struct uverbs_attr *elements = attr_bundle_h->attrs; - if (uattr->reserved) - return -EINVAL; - if (attr_id >= attr_spec_bucket->num_attrs) { if (uattr->flags & UVERBS_ATTR_F_MANDATORY) return -EINVAL; @@ -74,26 +72,46 @@ static int uverbs_process_attr(struct ib_device *ibdev, return -EINVAL; spec = &attr_spec_bucket->attrs[attr_id]; + val_spec = spec; e = &elements[attr_id]; e->uattr = uattr_ptr; switch (spec->type) { + case UVERBS_ATTR_TYPE_ENUM_IN: + if (uattr->attr_data.enum_data.elem_id >= spec->enum_def.num_elems) + return -EOPNOTSUPP; + + if (uattr->attr_data.enum_data.reserved) + return -EINVAL; + + val_spec = &spec->enum_def.ids[uattr->attr_data.enum_data.elem_id]; + + /* Currently we only support PTR_IN based enums */ + if (val_spec->type != UVERBS_ATTR_TYPE_PTR_IN) + return -EOPNOTSUPP; + + e->ptr_attr.enum_id = uattr->attr_data.enum_data.elem_id; + /* fall through */ case UVERBS_ATTR_TYPE_PTR_IN: /* Ensure that any data provided by userspace beyond the known * struct is zero. Userspace that knows how to use some future * longer struct will fail here if used with an old kernel and * non-zero content, making ABI compat/discovery simpler. 
*/ - if (uattr->len > spec->ptr.len && - spec->flags & UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO && - !uverbs_is_attr_cleared(uattr, spec->ptr.len)) + if (uattr->len > val_spec->ptr.len && + val_spec->flags & UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO && + !uverbs_is_attr_cleared(uattr, val_spec->ptr.len)) return -EOPNOTSUPP; /* fall through */ case UVERBS_ATTR_TYPE_PTR_OUT: - if (uattr->len < spec->ptr.min_len || - (!(spec->flags & UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO) && - uattr->len > spec->ptr.len)) + if (uattr->len < val_spec->ptr.min_len || + (!(val_spec->flags & UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO) && + uattr->len > val_spec->ptr.len)) + return -EINVAL; + + if (spec->type != UVERBS_ATTR_TYPE_ENUM_IN && + uattr->attr_data.reserved) return -EINVAL; e->ptr_attr.data = uattr->data; @@ -106,6 +124,9 @@ static int uverbs_process_attr(struct ib_device *ibdev, return -EINVAL; /* fall through */ case UVERBS_ATTR_TYPE_FD: + if (uattr->attr_data.reserved) + return -EINVAL; + if (uattr->len != 0 || !ucontext || uattr->data > INT_MAX) return -EINVAL; diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index faaaec7be36a..3d6ac684b8f0 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -51,6 +51,7 @@ enum uverbs_attr_type { UVERBS_ATTR_TYPE_PTR_OUT, UVERBS_ATTR_TYPE_IDR, UVERBS_ATTR_TYPE_FD, + UVERBS_ATTR_TYPE_ENUM_IN, }; enum uverbs_obj_access { @@ -95,6 +96,18 @@ struct uverbs_attr_spec { u16 obj_type; u8 access; } obj; + struct { + enum uverbs_attr_type type; + /* Combination of bits from enum UVERBS_ATTR_SPEC_F_XXXX */ + u8 flags; + u8 num_elems; + /* + * The enum attribute can select one of the attributes + * contained in the ids array. Currently only PTR_IN + * attributes are supported in the ids array. + */ + const struct uverbs_attr_spec *ids; + } enum_def; }; }; @@ -215,6 +228,10 @@ struct uverbs_object_tree_def { UVERBS_ATTR(_id, UVERBS_ATTR_TYPE_PTR_OUT, ptr, _len, ##__VA_ARGS__) #define UVERBS_ATTR_PTR_OUT(_id, _type, ...) \ UVERBS_ATTR_PTR_OUT_SZ(_id, _type, ##__VA_ARGS__) +#define UVERBS_ATTR_ENUM_IN(_id, _enum_arr, ...) \ + UVERBS_ATTR(_id, UVERBS_ATTR_TYPE_ENUM_IN, enum_def, \ + .ids = (_enum_arr), \ + .num_elems = ARRAY_SIZE(_enum_arr), ##__VA_ARGS__) /* * In new compiler, UVERBS_ATTR_IDR (and FD) could be simplified by declaring @@ -254,6 +271,11 @@ struct uverbs_object_tree_def { #define DECLARE_UVERBS_ATTR_SPEC(_name, ...) \ const struct uverbs_attr_def _name = __VA_ARGS__ +#define DECLARE_UVERBS_ENUM(_name, ...) \ + const struct uverbs_enum_spec _name = { \ + .len = ARRAY_SIZE(((struct uverbs_attr_spec[]){__VA_ARGS__})),\ + .ids = {__VA_ARGS__}, \ + } #define _UVERBS_METHOD_ATTRS_SZ(...) 
\ (sizeof((const struct uverbs_attr_def * const []){__VA_ARGS__}) /\ sizeof(const struct uverbs_attr_def *)) @@ -305,6 +327,7 @@ struct uverbs_ptr_attr { u16 len; /* Combination of bits from enum UVERBS_ATTR_F_XXXX */ u16 flags; + u8 enum_id; }; struct uverbs_obj_attr { @@ -374,6 +397,17 @@ static inline const struct uverbs_attr *uverbs_attr_get(const struct uverbs_attr return &attrs_bundle->hash[idx_bucket].attrs[idx & ~UVERBS_ID_NS_MASK]; } +static inline int uverbs_attr_get_enum_id(const struct uverbs_attr_bundle *attrs_bundle, + u16 idx) +{ + const struct uverbs_attr *attr = uverbs_attr_get(attrs_bundle, idx); + + if (IS_ERR(attr)) + return PTR_ERR(attr); + + return attr->ptr_attr.enum_id; +} + static inline int uverbs_copy_to(const struct uverbs_attr_bundle *attrs_bundle, size_t idx, const void *from, size_t size) { diff --git a/include/uapi/rdma/rdma_user_ioctl_cmds.h b/include/uapi/rdma/rdma_user_ioctl_cmds.h index 40063cf970aa..1da5a1e1f3a8 100644 --- a/include/uapi/rdma/rdma_user_ioctl_cmds.h +++ b/include/uapi/rdma/rdma_user_ioctl_cmds.h @@ -55,7 +55,13 @@ struct ib_uverbs_attr { __u16 attr_id; /* command specific type attribute */ __u16 len; /* only for pointers */ __u16 flags; /* combination of UVERBS_ATTR_F_XXXX */ - __u16 reserved; + union { + struct { + __u8 elem_id; + __u8 reserved; + } enum_data; + __u16 reserved; + } attr_data; __aligned_u64 data; /* ptr to command, inline data or idr/fd */ }; From 8510020d2916b9f7d8e406aa2fb2d35fbe692f4d Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Wed, 28 Mar 2018 09:27:43 +0300 Subject: [PATCH 175/199] IB/mlx4: Check for egress flow steering ConnectX3 doesn't support egress flow steering. Return an EOPNOTSUPP error when such a flow is being created. Signed-off-by: Boris Pismenny Reviewed-by: Aviad Yehezkel Reviewed-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 8eca09b53fe8..5b70744f414a 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1858,6 +1858,9 @@ static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, if (flow_attr->port < 1 || flow_attr->port > qp->device->phys_port_cnt) return ERR_PTR(-EINVAL); + if (flow_attr->flags & ~IB_FLOW_ATTR_FLAGS_DONT_TRAP) + return ERR_PTR(-EOPNOTSUPP); + if ((flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) && (flow_attr->type != IB_FLOW_ATTR_NORMAL)) return ERR_PTR(-EOPNOTSUPP); From 766d8551ada05326f0cafc5fc0bd32a666cebeed Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Wed, 28 Mar 2018 09:27:44 +0300 Subject: [PATCH 176/199] IB/uverbs: Refactor kern_spec_to_ib_spec_filter The current implementation of kern_spec_to_ib_spec_filter, which takes a uAPI-based flow steering specification and creates the respective kernel API flow steering structure, gets an ib_uverbs_flow_spec structure. The new flow_action uAPI gets a match mask and filter from user space, which aren't encoded in the flow steering's ib_uverbs_flow_spec structure. Export the logic out of kern_spec_to_ib_spec_filter so that it takes user-space blobs rather than an ib_uverbs_flow_spec structure.
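With the filter translation exported, a caller can feed raw user-space blobs straight into it; a minimal sketch (the mask/val buffers and their copy-in are assumed to exist in the caller):

	union ib_flow_spec ib_spec;
	int ret;

	/* mask and val each point at kern_filter_sz bytes already copied
	 * in from user space; no ib_uverbs_flow_spec wrapper is needed.
	 */
	ret = ib_uverbs_kern_spec_to_ib_spec_filter(IB_FLOW_SPEC_IPV4,
						    mask, val,
						    kern_filter_sz, &ib_spec);
	if (ret)
		return ret;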
Reviewed-by: Yishai Hadas Signed-off-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs.h | 6 ++++ drivers/infiniband/core/uverbs_cmd.c | 47 ++++++++++++++++++---------- 2 files changed, 37 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index d20828afa05c..0fabedf446fb 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -254,6 +254,12 @@ struct ib_uverbs_flow_spec { }; }; +int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type, + const void *kern_spec_mask, + const void *kern_spec_val, + size_t kern_filter_sz, + union ib_flow_spec *ib_spec); + extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_DEVICE); extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_PD); extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_MR); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 536d78baacd3..f6ffe18df679 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2765,13 +2765,13 @@ static int kern_spec_to_ib_spec_action(struct ib_uverbs_flow_spec *kern_spec, return 0; } -static size_t kern_spec_filter_sz(struct ib_uverbs_flow_spec_hdr *spec) +static size_t kern_spec_filter_sz(const struct ib_uverbs_flow_spec_hdr *spec) { /* Returns user space filter size, includes padding */ return (spec->size - sizeof(struct ib_uverbs_flow_spec_hdr)) / 2; } -static ssize_t spec_filter_size(void *kern_spec_filter, u16 kern_filter_size, +static ssize_t spec_filter_size(const void *kern_spec_filter, u16 kern_filter_size, u16 ib_real_filter_sz) { /* @@ -2789,28 +2789,21 @@ static ssize_t spec_filter_size(void *kern_spec_filter, u16 kern_filter_size, return kern_filter_size; } -static int kern_spec_to_ib_spec_filter(struct ib_uverbs_flow_spec *kern_spec, - union ib_flow_spec *ib_spec) +int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type, + const void *kern_spec_mask, + const void *kern_spec_val, + size_t kern_filter_sz, + union ib_flow_spec *ib_spec) { ssize_t actual_filter_sz; - ssize_t kern_filter_sz; ssize_t ib_filter_sz; - void *kern_spec_mask; - void *kern_spec_val; - if (kern_spec->reserved) - return -EINVAL; - - ib_spec->type = kern_spec->type; - - kern_filter_sz = kern_spec_filter_sz(&kern_spec->hdr); /* User flow spec size must be aligned to 4 bytes */ if (kern_filter_sz != ALIGN(kern_filter_sz, 4)) return -EINVAL; - kern_spec_val = (void *)kern_spec + - sizeof(struct ib_uverbs_flow_spec_hdr); - kern_spec_mask = kern_spec_val + kern_filter_sz; + ib_spec->type = type; + if (ib_spec->type == (IB_FLOW_SPEC_INNER | IB_FLOW_SPEC_VXLAN_TUNNEL)) return -EINVAL; @@ -2885,6 +2878,28 @@ static int kern_spec_to_ib_spec_filter(struct ib_uverbs_flow_spec *kern_spec, return 0; } +static int kern_spec_to_ib_spec_filter(struct ib_uverbs_flow_spec *kern_spec, + union ib_flow_spec *ib_spec) +{ + ssize_t kern_filter_sz; + void *kern_spec_mask; + void *kern_spec_val; + + if (kern_spec->reserved) + return -EINVAL; + + kern_filter_sz = kern_spec_filter_sz(&kern_spec->hdr); + + kern_spec_val = (void *)kern_spec + + sizeof(struct ib_uverbs_flow_spec_hdr); + kern_spec_mask = kern_spec_val + kern_filter_sz; + + return ib_uverbs_kern_spec_to_ib_spec_filter(kern_spec->type, + kern_spec_mask, + kern_spec_val, + kern_filter_sz, ib_spec); +} + static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, union 
ib_flow_spec *ib_spec) { From 2eb9beaee5d73130d28c54e91eecb8a186581e08 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Wed, 28 Mar 2018 09:27:45 +0300 Subject: [PATCH 177/199] IB/uverbs: Add flow_action create and destroy verbs A verbs application may receive and transmit packets using a data path pipeline. Sometimes, the first stage in the receive pipeline or the last stage in the transmit pipeline involves transforming a packet, either in order to make it easier for later stages to process it or to prepare it for transmission over the wire. Such transformation could be stripping/encapsulating the packet (e.g. vxlan), decrypting/encrypting it (e.g. ipsec), altering headers, doing some complex FPGA changes, etc. Some hardware could do such transformations without software data path intervention at all. The flow steering API supports steering a packet (either to a QP or dropping it) and some simple packet immutable actions (e.g. tagging a packet). Complex actions that may change the packet could bloat the flow steering API extensively. Sometimes the same action should be applied to several flows. In this case, it's easier to bind several flows to the same action and modify it than to change all matching flows. Introduce a new flow_action object that abstracts any packet transformation (out of a standard and well-defined set of actions). This flow_action object could be tied to a flow steering rule via a new specification. Currently, we support an esp flow_action, which encrypts or decrypts a packet according to the given parameters. However, we present a flexible schema that could be used for other transformation actions tied to flow rules. Reviewed-by: Yishai Hadas Signed-off-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/Makefile | 3 +- drivers/infiniband/core/uverbs.h | 4 + drivers/infiniband/core/uverbs_std_types.c | 10 +- .../core/uverbs_std_types_flow_action.c | 348 ++++++++++++++++++ include/rdma/ib_verbs.h | 65 ++++ include/uapi/rdma/ib_user_ioctl_cmds.h | 19 + include/uapi/rdma/ib_user_ioctl_verbs.h | 59 +++ 7 files changed, 506 insertions(+), 2 deletions(-) create mode 100644 drivers/infiniband/core/uverbs_std_types_flow_action.c diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 4d6260fd2f52..445c5504f605 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -34,4 +34,5 @@ ib_ucm-y := ucm.o ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ rdma_core.o uverbs_std_types.o uverbs_ioctl.o \ - uverbs_ioctl_merge.o uverbs_std_types_cq.o + uverbs_ioctl_merge.o uverbs_std_types_cq.o \ + uverbs_std_types_flow_action.o diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 0fabedf446fb..a94b5e7ee02a 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -234,6 +234,9 @@ void create_udata(struct uverbs_attr_bundle *ctx, struct ib_udata *udata); extern const struct uverbs_attr_def uverbs_uhw_compat_in; extern const struct uverbs_attr_def uverbs_uhw_compat_out; long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +int uverbs_destroy_def_handler(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs); struct ib_uverbs_flow_spec { union { @@ -273,6 +276,7 @@ extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_FLOW); extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_WQ); extern const struct 
uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL); extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_XRCD); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_FLOW_ACTION); #define IB_UVERBS_DECLARE_CMD(name) \ ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \ diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index 2ed8d9203f3b..47b9a85f3854 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -191,6 +191,13 @@ static int uverbs_hot_unplug_completion_event_file(struct ib_uobject_file *uobj_ return 0; }; +int uverbs_destroy_def_handler(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + return 0; +} + /* * This spec is used in order to pass information to the hardware driver in a * legacy way. Every verb that could get driver specific data should get this @@ -293,7 +300,8 @@ static DECLARE_UVERBS_OBJECT_TREE(uverbs_default_objects, &UVERBS_OBJECT(UVERBS_OBJECT_FLOW), &UVERBS_OBJECT(UVERBS_OBJECT_WQ), &UVERBS_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL), - &UVERBS_OBJECT(UVERBS_OBJECT_XRCD)); + &UVERBS_OBJECT(UVERBS_OBJECT_XRCD), + &UVERBS_OBJECT(UVERBS_OBJECT_FLOW_ACTION)); const struct uverbs_object_tree_def *uverbs_default_get_objects(void) { diff --git a/drivers/infiniband/core/uverbs_std_types_flow_action.c b/drivers/infiniband/core/uverbs_std_types_flow_action.c new file mode 100644 index 000000000000..3d4c7b8dff48 --- /dev/null +++ b/drivers/infiniband/core/uverbs_std_types_flow_action.c @@ -0,0 +1,348 @@ +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "uverbs.h" +#include + +static int uverbs_free_flow_action(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + struct ib_flow_action *action = uobject->object; + + if (why == RDMA_REMOVE_DESTROY && + atomic_read(&action->usecnt)) + return -EBUSY; + + return action->device->destroy_flow_action(action); +} + +static u64 esp_flags_uverbs_to_verbs(struct uverbs_attr_bundle *attrs, + u32 flags) +{ + u64 verbs_flags = flags; + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_ESN)) + verbs_flags |= IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED; + + return verbs_flags; +}; + +static int validate_flow_action_esp_keymat_aes_gcm(struct ib_flow_action_attrs_esp_keymats *keymat) +{ + struct ib_uverbs_flow_action_esp_keymat_aes_gcm *aes_gcm = + &keymat->keymat.aes_gcm; + + if (aes_gcm->iv_algo > IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ) + return -EOPNOTSUPP; + + if (aes_gcm->key_len != 32 && + aes_gcm->key_len != 24 && + aes_gcm->key_len != 16) + return -EINVAL; + + if (aes_gcm->icv_len != 16 && + aes_gcm->icv_len != 8 && + aes_gcm->icv_len != 12) + return -EINVAL; + + return 0; +} + +static int (* const flow_action_esp_keymat_validate[])(struct ib_flow_action_attrs_esp_keymats *keymat) = { + [IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM] = validate_flow_action_esp_keymat_aes_gcm, +}; + +static int parse_esp_ip(enum ib_flow_spec_type proto, + const void __user *val_ptr, + size_t len, union ib_flow_spec *out) +{ + int ret; + const struct ib_uverbs_flow_ipv4_filter ipv4 = { + .src_ip = cpu_to_be32(0xffffffffUL), + .dst_ip = cpu_to_be32(0xffffffffUL), + .proto = 0xff, + .tos = 0xff, + .ttl = 0xff, + .flags = 0xff, + }; + const struct ib_uverbs_flow_ipv6_filter ipv6 = { + .src_ip = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + .dst_ip = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + .flow_label = cpu_to_be32(0xffffffffUL), + .next_hdr = 0xff, + .traffic_class = 0xff, + .hop_limit = 0xff, + }; + union { + struct ib_uverbs_flow_ipv4_filter ipv4; + struct ib_uverbs_flow_ipv6_filter ipv6; + } user_val = {}; + const void *user_pmask; + size_t val_len; + + /* If the flow IPv4/IPv6 flow specifications are extended, the mask + * should be changed as well. 
+ */ + BUILD_BUG_ON(offsetof(struct ib_uverbs_flow_ipv4_filter, flags) + + sizeof(ipv4.flags) != sizeof(ipv4)); + BUILD_BUG_ON(offsetof(struct ib_uverbs_flow_ipv6_filter, reserved) + + sizeof(ipv6.reserved) != sizeof(ipv6)); + + switch (proto) { + case IB_FLOW_SPEC_IPV4: + if (len > sizeof(user_val.ipv4) && + !ib_is_buffer_cleared(val_ptr + sizeof(user_val.ipv4), + len - sizeof(user_val.ipv4))) + return -EOPNOTSUPP; + + val_len = min_t(size_t, len, sizeof(user_val.ipv4)); + ret = copy_from_user(&user_val.ipv4, val_ptr, + val_len); + if (ret) + return -EFAULT; + + user_pmask = &ipv4; + break; + case IB_FLOW_SPEC_IPV6: + if (len > sizeof(user_val.ipv6) && + !ib_is_buffer_cleared(val_ptr + sizeof(user_val.ipv6), + len - sizeof(user_val.ipv6))) + return -EOPNOTSUPP; + + val_len = min_t(size_t, len, sizeof(user_val.ipv6)); + ret = copy_from_user(&user_val.ipv6, val_ptr, + val_len); + if (ret) + return -EFAULT; + + user_pmask = &ipv6; + break; + default: + return -EOPNOTSUPP; + } + + return ib_uverbs_kern_spec_to_ib_spec_filter(proto, user_pmask, + &user_val, + val_len, out); +} + +static int flow_action_esp_get_encap(struct ib_flow_spec_list *out, + struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_flow_action_esp_encap uverbs_encap; + int ret; + + ret = uverbs_copy_from(&uverbs_encap, attrs, + UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP); + if (ret) + return ret; + + /* We currently support only one encap */ + if (uverbs_encap.next_ptr) + return -EOPNOTSUPP; + + if (uverbs_encap.type != IB_FLOW_SPEC_IPV4 && + uverbs_encap.type != IB_FLOW_SPEC_IPV6) + return -EOPNOTSUPP; + + return parse_esp_ip(uverbs_encap.type, + u64_to_user_ptr(uverbs_encap.val_ptr), + uverbs_encap.len, + &out->spec); +} + +struct ib_flow_action_esp_attr { + struct ib_flow_action_attrs_esp hdr; + struct ib_flow_action_attrs_esp_keymats keymat; + struct ib_flow_action_attrs_esp_replays replay; + /* We currently support only one spec */ + struct ib_flow_spec_list encap; +}; + +#define ESP_LAST_SUPPORTED_FLAG IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW +static int parse_flow_action_esp(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs, + struct ib_flow_action_esp_attr *esp_attr) +{ + struct ib_uverbs_flow_action_esp uverbs_esp = {}; + int ret; + + /* Optional param, if it doesn't exist, we get -ENOENT and skip it */ + ret = uverbs_copy_from(&esp_attr->hdr.esn, attrs, + UVERBS_ATTR_FLOW_ACTION_ESP_ESN); + if (IS_UVERBS_COPY_ERR(ret)) + return ret; + + /* This can be called from FLOW_ACTION_ESP_MODIFY where + * UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS is optional + */ + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS)) { + ret = uverbs_copy_from_or_zero(&uverbs_esp, attrs, + UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS); + if (ret) + return ret; + + if (uverbs_esp.flags & ~((ESP_LAST_SUPPORTED_FLAG << 1) - 1)) + return -EOPNOTSUPP; + + esp_attr->hdr.spi = uverbs_esp.spi; + esp_attr->hdr.seq = uverbs_esp.seq; + esp_attr->hdr.tfc_pad = uverbs_esp.tfc_pad; + esp_attr->hdr.hard_limit_pkts = uverbs_esp.hard_limit_pkts; + } + esp_attr->hdr.flags = esp_flags_uverbs_to_verbs(attrs, uverbs_esp.flags); + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT)) { + esp_attr->keymat.protocol = + uverbs_attr_get_enum_id(attrs, + UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT); + ret = uverbs_copy_from_or_zero(&esp_attr->keymat.keymat, + attrs, + UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT); + if (ret) + return ret; + + ret = flow_action_esp_keymat_validate[esp_attr->keymat.protocol](&esp_attr->keymat); + if 
(ret) + return ret; + + esp_attr->hdr.keymat = &esp_attr->keymat; + } + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY)) { + esp_attr->replay.protocol = + uverbs_attr_get_enum_id(attrs, + UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY); + + ret = uverbs_copy_from_or_zero(&esp_attr->replay.replay, + attrs, + UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY); + if (ret) + return ret; + + esp_attr->hdr.replay = &esp_attr->replay; + } + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP)) { + ret = flow_action_esp_get_encap(&esp_attr->encap, attrs); + if (ret) + return ret; + + esp_attr->hdr.encap = &esp_attr->encap; + } + + return 0; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + int ret; + struct ib_uobject *uobj; + struct ib_flow_action *action; + struct ib_flow_action_esp_attr esp_attr = {}; + + if (!ib_dev->create_flow_action_esp) + return -EOPNOTSUPP; + + ret = parse_flow_action_esp(ib_dev, file, attrs, &esp_attr); + if (ret) + return ret; + + /* No need to check as this attribute is marked as MANDATORY */ + uobj = uverbs_attr_get(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_HANDLE)->obj_attr.uobject; + action = ib_dev->create_flow_action_esp(ib_dev, &esp_attr.hdr, attrs); + if (IS_ERR(action)) + return PTR_ERR(action); + + atomic_set(&action->usecnt, 0); + action->device = ib_dev; + action->type = IB_FLOW_ACTION_ESP; + action->uobject = uobj; + uobj->object = action; + + return 0; +} + +static const struct uverbs_attr_spec uverbs_flow_action_esp_keymat[] = { + [IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM] = { + .ptr = { + .type = UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_TYPE(struct ib_uverbs_flow_action_esp_keymat_aes_gcm), + .flags = UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO, + }, + }, +}; + +static const struct uverbs_attr_spec uverbs_flow_action_esp_replay[] = { + [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_BMP] = { + .ptr = { + .type = UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp_replay_bmp, size), + .flags = UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO, + } + }, +}; + +static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE, + &UVERBS_ATTR_IDR(UVERBS_ATTR_FLOW_ACTION_ESP_HANDLE, UVERBS_OBJECT_FLOW_ACTION, + UVERBS_ACCESS_NEW, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS, + UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp, hard_limit_pkts), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY | + UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO)), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ESN, UVERBS_ATTR_TYPE(__u32)), + &UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT, + uverbs_flow_action_esp_keymat, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY, + uverbs_flow_action_esp_replay), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP, + UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp_encap, type))); + +static DECLARE_UVERBS_NAMED_METHOD_WITH_HANDLER(UVERBS_METHOD_FLOW_ACTION_DESTROY, + uverbs_destroy_def_handler, + &UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_FLOW_ACTION_HANDLE, + UVERBS_OBJECT_FLOW_ACTION, + UVERBS_ACCESS_DESTROY, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_FLOW_ACTION, + &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_flow_action), + &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE), + &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_DESTROY)); + diff --git a/include/rdma/ib_verbs.h 
b/include/rdma/ib_verbs.h index 1e3059ce73b6..49da92143341 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -65,6 +65,7 @@ #include #include #include +#include #define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN @@ -2001,6 +2002,63 @@ struct ib_flow { struct ib_uobject *uobject; }; +enum ib_flow_action_type { + IB_FLOW_ACTION_UNSPECIFIED, + IB_FLOW_ACTION_ESP = 1, +}; + +struct ib_flow_action_attrs_esp_keymats { + enum ib_uverbs_flow_action_esp_keymat protocol; + union { + struct ib_uverbs_flow_action_esp_keymat_aes_gcm aes_gcm; + } keymat; +}; + +struct ib_flow_action_attrs_esp_replays { + enum ib_uverbs_flow_action_esp_replay protocol; + union { + struct ib_uverbs_flow_action_esp_replay_bmp bmp; + } replay; +}; + +enum ib_flow_action_attrs_esp_flags { + /* All user-space flags at the top: Use enum ib_uverbs_flow_action_esp_flags + * This is done in order to share the same flags between user-space and + * kernel and spare an unnecessary translation. + */ + + /* Kernel flags */ + IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED = 1ULL << 32, +}; + +struct ib_flow_spec_list { + struct ib_flow_spec_list *next; + union ib_flow_spec spec; +}; + +struct ib_flow_action_attrs_esp { + struct ib_flow_action_attrs_esp_keymats *keymat; + struct ib_flow_action_attrs_esp_replays *replay; + struct ib_flow_spec_list *encap; + /* Used only if IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED is enabled. + * Value of 0 is a valid value. + */ + u32 esn; + u32 spi; + u32 seq; + u32 tfc_pad; + /* Use enum ib_flow_action_attrs_esp_flags */ + u64 flags; + u64 hard_limit_pkts; +}; + +struct ib_flow_action { + struct ib_device *device; + struct ib_uobject *uobject; + enum ib_flow_action_type type; + atomic_t usecnt; +}; + struct ib_mad_hdr; struct ib_grh; @@ -2077,6 +2135,8 @@ struct ib_port_pkey_list { struct list_head pkey_list; }; +struct uverbs_attr_bundle; + struct ib_device { /* Do not access @dma_device directly from ULP nor from HW drivers. 
*/ struct device *dma_device; @@ -2331,6 +2391,11 @@ struct ib_device { struct ib_rwq_ind_table_init_attr *init_attr, struct ib_udata *udata); int (*destroy_rwq_ind_table)(struct ib_rwq_ind_table *wq_ind_table); + struct ib_flow_action * (*create_flow_action_esp)(struct ib_device *device, + const struct ib_flow_action_attrs_esp *attr, + struct uverbs_attr_bundle *attrs); + int (*destroy_flow_action)(struct ib_flow_action *action); + /** * rdma netdev operation * diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h index 77bbbed17ed5..d3a2c03ea4a8 100644 --- a/include/uapi/rdma/ib_user_ioctl_cmds.h +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -53,6 +53,7 @@ enum uverbs_default_objects { UVERBS_OBJECT_XRCD, UVERBS_OBJECT_RWQ_IND_TBL, UVERBS_OBJECT_WQ, + UVERBS_OBJECT_FLOW_ACTION, }; enum { @@ -75,9 +76,27 @@ enum uverbs_attrs_destroy_cq_cmd_attr_ids { UVERBS_ATTR_DESTROY_CQ_RESP, }; +enum uverbs_attrs_create_flow_action_esp { + UVERBS_ATTR_FLOW_ACTION_ESP_HANDLE, + UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS, + UVERBS_ATTR_FLOW_ACTION_ESP_ESN, + UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT, + UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY, + UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP, +}; + +enum uverbs_attrs_destroy_flow_action_esp { + UVERBS_ATTR_DESTROY_FLOW_ACTION_HANDLE, +}; + enum uverbs_methods_cq { UVERBS_METHOD_CQ_CREATE, UVERBS_METHOD_CQ_DESTROY, }; +enum uverbs_methods_actions_flow_action_ops { + UVERBS_METHOD_FLOW_ACTION_ESP_CREATE, + UVERBS_METHOD_FLOW_ACTION_DESTROY, +}; + #endif diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h index 173629ecc09b..04e46ea517d3 100644 --- a/include/uapi/rdma/ib_user_ioctl_verbs.h +++ b/include/uapi/rdma/ib_user_ioctl_verbs.h @@ -40,4 +40,63 @@ #define RDMA_UAPI_PTR(_type, _name) __aligned_u64 _name #endif +enum ib_uverbs_flow_action_esp_keymat { + IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM, +}; + +enum ib_uverbs_flow_action_esp_keymat_aes_gcm_iv_algo { + IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ, +}; + +struct ib_uverbs_flow_action_esp_keymat_aes_gcm { + __aligned_u64 iv; + __u32 iv_algo; /* Use enum ib_uverbs_flow_action_esp_keymat_aes_gcm_iv_algo */ + + __u32 salt; + __u32 icv_len; + + __u32 key_len; + __u32 aes_key[256 / 32]; +}; + +enum ib_uverbs_flow_action_esp_replay { + IB_UVERBS_FLOW_ACTION_ESP_REPLAY_NONE, + IB_UVERBS_FLOW_ACTION_ESP_REPLAY_BMP, +}; + +struct ib_uverbs_flow_action_esp_replay_bmp { + __u32 size; +}; + +enum ib_uverbs_flow_action_esp_flags { + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_INLINE_CRYPTO = 0UL << 0, /* Default */ + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD = 1UL << 0, + + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_TUNNEL = 0UL << 1, /* Default */ + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_TRANSPORT = 1UL << 1, + + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_DECRYPT = 0UL << 2, /* Default */ + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT = 1UL << 2, + + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW = 1UL << 3, +}; + +struct ib_uverbs_flow_action_esp_encap { + /* This struct represents a list of pointers to flow_xxxx_filter that + * encapsulates the payload in ESP tunnel mode. 
+ */ + RDMA_UAPI_PTR(void *, val_ptr); /* pointer to a flow_xxxx_filter */ + RDMA_UAPI_PTR(struct ib_uverbs_flow_action_esp_encap *, next_ptr); + __u16 len; /* Len of the filter struct val_ptr points to */ + __u16 type; /* Use flow_spec_type enum */ +}; + +struct ib_uverbs_flow_action_esp { + __u32 spi; + __u32 seq; + __u32 tfc_pad; + __u32 flags; + __aligned_u64 hard_limit_pkts; +}; + #endif From 9b828441976ef719f1008a9855fff95a45e474b8 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Wed, 28 Mar 2018 09:27:46 +0300 Subject: [PATCH 178/199] IB/uverbs: Add action_handle flow steering specification Binding a flow_action to a flow steering rule requires a new specification. Therefore, adding such an IB_FLOW_SPEC_ACTION_HANDLE flow specification. Flow steering rules may use flow_action(s), and therefore we need to avoid deleting a flow_action as long as it is being used. Moreover, when the attached rules are deleted, the action_handle reference count should be decremented. Introducing a new mechanism of flow resources to keep track of the attached action_handle(s). Later on, this mechanism should be extended to other attached flow steering resources like flow counters. Reviewed-by: Yishai Hadas Signed-off-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs.h | 8 ++ drivers/infiniband/core/uverbs_cmd.c | 86 ++++++++++++++++++++-- drivers/infiniband/core/uverbs_std_types.c | 14 +++- include/rdma/ib_verbs.h | 8 ++ include/uapi/rdma/ib_user_verbs.h | 13 ++++ 5 files changed, 121 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index a94b5e7ee02a..1bac0b51686a 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -203,11 +203,18 @@ struct ib_ucq_object { u32 async_events_reported; }; +struct ib_uflow_resources; +struct ib_uflow_object { + struct ib_uobject uobject; + struct ib_uflow_resources *resources; +}; + extern const struct file_operations uverbs_event_fops; void ib_uverbs_init_event_queue(struct ib_uverbs_event_queue *ev_queue); struct file *ib_uverbs_alloc_async_event_file(struct ib_uverbs_file *uverbs_file, struct ib_device *ib_dev); void ib_uverbs_free_async_event_file(struct ib_uverbs_file *uverbs_file); +void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res); void ib_uverbs_release_ucq(struct ib_uverbs_file *file, struct ib_uverbs_completion_event_file *ev_file, @@ -254,6 +261,7 @@ struct ib_uverbs_flow_spec { struct ib_uverbs_flow_spec_ipv6 ipv6; struct ib_uverbs_flow_spec_action_tag flow_tag; struct ib_uverbs_flow_spec_action_drop drop; + struct ib_uverbs_flow_spec_action_handle action; }; }; diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index f6ffe18df679..69050dd77421 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2739,8 +2739,52 @@ out_put: return ret ?
ret : in_len; } -static int kern_spec_to_ib_spec_action(struct ib_uverbs_flow_spec *kern_spec, - union ib_flow_spec *ib_spec) +struct ib_uflow_resources { + size_t max; + size_t num; + struct ib_flow_action *collection[0]; +}; + +static struct ib_uflow_resources *flow_resources_alloc(size_t num_specs) +{ + struct ib_uflow_resources *resources; + + resources = + kmalloc(sizeof(*resources) + + num_specs * sizeof(*resources->collection), GFP_KERNEL); + + if (!resources) + return NULL; + + resources->num = 0; + resources->max = num_specs; + + return resources; +} + +void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res) +{ + unsigned int i; + + for (i = 0; i < uflow_res->num; i++) + atomic_dec(&uflow_res->collection[i]->usecnt); + + kfree(uflow_res); +} + +static void flow_resources_add(struct ib_uflow_resources *uflow_res, + struct ib_flow_action *action) +{ + WARN_ON(uflow_res->num >= uflow_res->max); + + atomic_inc(&action->usecnt); + uflow_res->collection[uflow_res->num++] = action; +} + +static int kern_spec_to_ib_spec_action(struct ib_ucontext *ucontext, + struct ib_uverbs_flow_spec *kern_spec, + union ib_flow_spec *ib_spec, + struct ib_uflow_resources *uflow_res) { ib_spec->type = kern_spec->type; switch (ib_spec->type) { @@ -2759,6 +2803,21 @@ static int kern_spec_to_ib_spec_action(struct ib_uverbs_flow_spec *kern_spec, ib_spec->drop.size = sizeof(struct ib_flow_spec_action_drop); break; + case IB_FLOW_SPEC_ACTION_HANDLE: + if (kern_spec->action.size != + sizeof(struct ib_uverbs_flow_spec_action_handle)) + return -EOPNOTSUPP; + ib_spec->action.act = uobj_get_obj_read(flow_action, + UVERBS_OBJECT_FLOW_ACTION, + kern_spec->action.handle, + ucontext); + if (!ib_spec->action.act) + return -EINVAL; + ib_spec->action.size = + sizeof(struct ib_flow_spec_action_handle); + flow_resources_add(uflow_res, ib_spec->action.act); + uobj_put_obj_read(ib_spec->action.act); + break; default: return -EINVAL; } @@ -2900,14 +2959,17 @@ static int kern_spec_to_ib_spec_filter(struct ib_uverbs_flow_spec *kern_spec, kern_filter_sz, ib_spec); } -static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, - union ib_flow_spec *ib_spec) +static int kern_spec_to_ib_spec(struct ib_ucontext *ucontext, + struct ib_uverbs_flow_spec *kern_spec, + union ib_flow_spec *ib_spec, + struct ib_uflow_resources *uflow_res) { if (kern_spec->reserved) return -EINVAL; if (kern_spec->type >= IB_FLOW_SPEC_ACTION_TAG) - return kern_spec_to_ib_spec_action(kern_spec, ib_spec); + return kern_spec_to_ib_spec_action(ucontext, kern_spec, ib_spec, + uflow_res); else return kern_spec_to_ib_spec_filter(kern_spec, ib_spec); } @@ -3322,10 +3384,12 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, struct ib_uverbs_create_flow cmd; struct ib_uverbs_create_flow_resp resp; struct ib_uobject *uobj; + struct ib_uflow_object *uflow; struct ib_flow *flow_id; struct ib_uverbs_flow_attr *kern_flow_attr; struct ib_flow_attr *flow_attr; struct ib_qp *qp; + struct ib_uflow_resources *uflow_res; int err = 0; void *kern_spec; void *ib_spec; @@ -3403,6 +3467,11 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, err = -ENOMEM; goto err_put; } + uflow_res = flow_resources_alloc(cmd.flow_attr.num_of_specs); + if (!uflow_res) { + err = -ENOMEM; + goto err_free_flow_attr; + } flow_attr->type = kern_flow_attr->type; flow_attr->priority = kern_flow_attr->priority; @@ -3417,7 +3486,8 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, cmd.flow_attr.size > offsetof(struct ib_uverbs_flow_spec, reserved) && 
cmd.flow_attr.size >= ((struct ib_uverbs_flow_spec *)kern_spec)->size; i++) { - err = kern_spec_to_ib_spec(kern_spec, ib_spec); + err = kern_spec_to_ib_spec(file->ucontext, kern_spec, ib_spec, + uflow_res); if (err) goto err_free; flow_attr->size += @@ -3439,6 +3509,8 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, } flow_id->uobject = uobj; uobj->object = flow_id; + uflow = container_of(uobj, typeof(*uflow), uobject); + uflow->resources = uflow_res; memset(&resp, 0, sizeof(resp)); resp.flow_handle = uobj->id; @@ -3457,6 +3529,8 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, err_copy: ib_destroy_flow(flow_id); err_free: + ib_uverbs_flow_resources_free(uflow_res); +err_free_flow_attr: kfree(flow_attr); err_put: uobj_put_obj_read(qp); diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index 47b9a85f3854..173eab8d3482 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -48,7 +48,16 @@ static int uverbs_free_ah(struct ib_uobject *uobject, static int uverbs_free_flow(struct ib_uobject *uobject, enum rdma_remove_reason why) { - return ib_destroy_flow((struct ib_flow *)uobject->object); + int ret; + struct ib_flow *flow = (struct ib_flow *)uobject->object; + struct ib_uflow_object *uflow = + container_of(uobject, struct ib_uflow_object, uobject); + + ret = ib_destroy_flow(flow); + if (!ret) + ib_uverbs_flow_resources_free(uflow->resources); + + return ret; } static int uverbs_free_mw(struct ib_uobject *uobject, @@ -268,7 +277,8 @@ DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_AH, &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_ah)); DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_FLOW, - &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_flow)); + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uflow_object), + 0, uverbs_free_flow)); DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_WQ, &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), 0, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 49da92143341..c1b9cba79710 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1836,6 +1836,7 @@ enum ib_flow_spec_type { /* Actions */ IB_FLOW_SPEC_ACTION_TAG = 0x1000, IB_FLOW_SPEC_ACTION_DROP = 0x1001, + IB_FLOW_SPEC_ACTION_HANDLE = 0x1002, }; #define IB_FLOW_SPEC_LAYER_MASK 0xF0 #define IB_FLOW_SPEC_SUPPORT_LAYERS 8 @@ -1969,6 +1970,12 @@ struct ib_flow_spec_action_drop { u16 size; }; +struct ib_flow_spec_action_handle { + enum ib_flow_spec_type type; + u16 size; + struct ib_flow_action *act; +}; + union ib_flow_spec { struct { u32 type; @@ -1982,6 +1989,7 @@ union ib_flow_spec { struct ib_flow_spec_tunnel tunnel; struct ib_flow_spec_action_tag flow_tag; struct ib_flow_spec_action_drop drop; + struct ib_flow_spec_action_handle action; }; struct ib_flow_attr { diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index aa0615105563..ac41ce234186 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -984,6 +984,19 @@ struct ib_uverbs_flow_spec_action_drop { }; }; +struct ib_uverbs_flow_spec_action_handle { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + __u32 handle; + __u32 reserved1; +}; + struct ib_uverbs_flow_tunnel_filter { __be32 tunnel_id; }; From 21e82d3e1dcf9ce61ae387ca1a507cf53665336a Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Wed, 28 Mar 2018 09:27:47 +0300 Subject: [PATCH 179/199] IB/uverbs: Introduce egress flow steering The 
egress flag indicates that this flow steering rule is for egress traffic. The scope of an egress rule is port-wide, meaning that all packets originating from that port which match the steering rule specification will be affected by this rule's action. Reviewed-by: Yishai Hadas Signed-off-by: Boris Pismenny Reviewed-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index c1b9cba79710..c674fc1e596b 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1854,7 +1854,8 @@ enum ib_flow_domain { enum ib_flow_flags { IB_FLOW_ATTR_FLAGS_DONT_TRAP = 1UL << 1, /* Continue match, no steal */ - IB_FLOW_ATTR_FLAGS_RESERVED = 1UL << 2 /* Must be last */ + IB_FLOW_ATTR_FLAGS_EGRESS = 1UL << 2, /* Egress flow */ + IB_FLOW_ATTR_FLAGS_RESERVED = 1UL << 3 /* Must be last */ }; struct ib_flow_eth_filter { From 7d12f8d5a1645275dd452138bf1fe478be736704 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Wed, 28 Mar 2018 09:27:48 +0300 Subject: [PATCH 180/199] IB/uverbs: Add modify ESP flow_action flow_actions of ESP type can be modified at runtime. This is common, for example, when the ESN should be changed. Adding a new UVERBS_FLOW_ACTION_ESP_MODIFY method for changing the ESP parameters of an existing ESP flow_action. The new method uses the UVERBS_FLOW_ACTION_ESP_CREATE attributes, but adds a new IB_FLOW_ACTION_ESP_FLAGS_MOD_ESP_ATTRS flag which means ESP_ATTRS should be changed. In addition, we add a new FLOW_ACTION_ESP_REPLAY_NONE replay type that can be used when one wants to disable replay protection on a specific flow_action. Reviewed-by: Yishai Hadas Signed-off-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- .../core/uverbs_std_types_flow_action.c | 97 ++++++++++++++++++- include/rdma/ib_verbs.h | 4 + include/uapi/rdma/ib_user_ioctl_cmds.h | 1 + 3 files changed, 97 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/core/uverbs_std_types_flow_action.c b/drivers/infiniband/core/uverbs_std_types_flow_action.c index 3d4c7b8dff48..cbcec3da12f6 100644 --- a/drivers/infiniband/core/uverbs_std_types_flow_action.c +++ b/drivers/infiniband/core/uverbs_std_types_flow_action.c @@ -46,13 +46,17 @@ static int uverbs_free_flow_action(struct ib_uobject *uobject, } static u64 esp_flags_uverbs_to_verbs(struct uverbs_attr_bundle *attrs, - u32 flags) + u32 flags, bool is_modify) { u64 verbs_flags = flags; if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_ESN)) verbs_flags |= IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED; + if (is_modify && uverbs_attr_is_valid(attrs, + UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS)) + verbs_flags |= IB_FLOW_ACTION_ESP_FLAGS_MOD_ESP_ATTRS; + return verbs_flags; } @@ -81,6 +85,32 @@ static int (* const flow_action_esp_keymat_validate[])(struct ib_flow_action_att [IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM] = validate_flow_action_esp_keymat_aes_gcm, }; +static int flow_action_esp_replay_none(struct ib_flow_action_attrs_esp_replays *replay, + bool is_modify) +{ + /* This is used to modify an esp flow action with an enabled + * replay protection to a disabled one. This is only supported via + * modify, as in the create verb we can simply drop the REPLAY attribute + * and achieve the same thing. + */ + return is_modify ?
0 : -EINVAL; +} + +static int flow_action_esp_replay_def_ok(struct ib_flow_action_attrs_esp_replays *replay, + bool is_modify) +{ + /* Some replay protections could always be enabled without validating + * anything. + */ + return 0; +} + +static int (* const flow_action_esp_replay_validate[])(struct ib_flow_action_attrs_esp_replays *replay, + bool is_modify) = { + [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_NONE] = flow_action_esp_replay_none, + [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_BMP] = flow_action_esp_replay_def_ok, +}; + static int parse_esp_ip(enum ib_flow_spec_type proto, const void __user *val_ptr, size_t len, union ib_flow_spec *out) @@ -194,7 +224,8 @@ struct ib_flow_action_esp_attr { static int parse_flow_action_esp(struct ib_device *ib_dev, struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs, - struct ib_flow_action_esp_attr *esp_attr) + struct ib_flow_action_esp_attr *esp_attr, + bool is_modify) { struct ib_uverbs_flow_action_esp uverbs_esp = {}; int ret; @@ -222,7 +253,8 @@ static int parse_flow_action_esp(struct ib_device *ib_dev, esp_attr->hdr.tfc_pad = uverbs_esp.tfc_pad; esp_attr->hdr.hard_limit_pkts = uverbs_esp.hard_limit_pkts; } - esp_attr->hdr.flags = esp_flags_uverbs_to_verbs(attrs, uverbs_esp.flags); + esp_attr->hdr.flags = esp_flags_uverbs_to_verbs(attrs, uverbs_esp.flags, + is_modify); if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT)) { esp_attr->keymat.protocol = @@ -252,6 +284,11 @@ static int parse_flow_action_esp(struct ib_device *ib_dev, if (ret) return ret; + ret = flow_action_esp_replay_validate[esp_attr->replay.protocol](&esp_attr->replay, + is_modify); + if (ret) + return ret; + esp_attr->hdr.replay = &esp_attr->replay; } @@ -278,7 +315,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)(struct ib_device if (!ib_dev->create_flow_action_esp) return -EOPNOTSUPP; - ret = parse_flow_action_esp(ib_dev, file, attrs, &esp_attr); + ret = parse_flow_action_esp(ib_dev, file, attrs, &esp_attr, false); if (ret) return ret; @@ -297,6 +334,33 @@ static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)(struct ib_device return 0; } +static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY)(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + int ret; + struct ib_uobject *uobj; + struct ib_flow_action *action; + struct ib_flow_action_esp_attr esp_attr = {}; + + if (!ib_dev->modify_flow_action_esp) + return -EOPNOTSUPP; + + ret = parse_flow_action_esp(ib_dev, file, attrs, &esp_attr, true); + if (ret) + return ret; + + uobj = uverbs_attr_get(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_HANDLE)->obj_attr.uobject; + action = uobj->object; + + if (action->type != IB_FLOW_ACTION_ESP) + return -EINVAL; + + return ib_dev->modify_flow_action_esp(action, + &esp_attr.hdr, + attrs); +} + static const struct uverbs_attr_spec uverbs_flow_action_esp_keymat[] = { [IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM] = { .ptr = { @@ -308,6 +372,13 @@ static const struct uverbs_attr_spec uverbs_flow_action_esp_keymat[] = { }; static const struct uverbs_attr_spec uverbs_flow_action_esp_replay[] = { + [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_NONE] = { + .ptr = { + .type = UVERBS_ATTR_TYPE_PTR_IN, + /* No need to specify any data */ + .len = 0, + } + }, [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_BMP] = { .ptr = { .type = UVERBS_ATTR_TYPE_PTR_IN, @@ -334,6 +405,21 @@ static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE, &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP, UVERBS_ATTR_STRUCT(struct 
ib_uverbs_flow_action_esp_encap, type))); +static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY, + &UVERBS_ATTR_IDR(UVERBS_ATTR_FLOW_ACTION_ESP_HANDLE, UVERBS_OBJECT_FLOW_ACTION, + UVERBS_ACCESS_WRITE, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS, + UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp, hard_limit_pkts), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO)), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ESN, UVERBS_ATTR_TYPE(__u32)), + &UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT, + uverbs_flow_action_esp_keymat), + &UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY, + uverbs_flow_action_esp_replay), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP, + UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp_encap, type))); + static DECLARE_UVERBS_NAMED_METHOD_WITH_HANDLER(UVERBS_METHOD_FLOW_ACTION_DESTROY, uverbs_destroy_def_handler, &UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_FLOW_ACTION_HANDLE, @@ -344,5 +430,6 @@ static DECLARE_UVERBS_NAMED_METHOD_WITH_HANDLER(UVERBS_METHOD_FLOW_ACTION_DESTRO DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_FLOW_ACTION, &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_flow_action), &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE), - &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_DESTROY)); + &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_DESTROY), + &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY)); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index c674fc1e596b..8c3ca073016d 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2038,6 +2038,7 @@ enum ib_flow_action_attrs_esp_flags { /* Kernel flags */ IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED = 1ULL << 32, + IB_FLOW_ACTION_ESP_FLAGS_MOD_ESP_ATTRS = 1ULL << 33, }; struct ib_flow_spec_list { @@ -2404,6 +2405,9 @@ struct ib_device { const struct ib_flow_action_attrs_esp *attr, struct uverbs_attr_bundle *attrs); int (*destroy_flow_action)(struct ib_flow_action *action); + int (*modify_flow_action_esp)(struct ib_flow_action *action, + const struct ib_flow_action_attrs_esp *attr, + struct uverbs_attr_bundle *attrs); /** * rdma netdev operation diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h index d3a2c03ea4a8..500b64a444ad 100644 --- a/include/uapi/rdma/ib_user_ioctl_cmds.h +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -97,6 +97,7 @@ enum uverbs_methods_cq { enum uverbs_methods_actions_flow_action_ops { UVERBS_METHOD_FLOW_ACTION_ESP_CREATE, UVERBS_METHOD_FLOW_ACTION_DESTROY, + UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY, }; #endif From 56ab0b38b80e5771920e163cc9bd52504b03f539 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Wed, 28 Mar 2018 09:27:49 +0300 Subject: [PATCH 181/199] IB/uverbs: Introduce ESP steering match filter Adding a new ESP steering match filter that can match against the spi and seq fields used by the IPsec protocol.
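For illustration, a minimal userspace-style sketch of filling the new spec; the struct below mirrors the ib_uverbs_flow_spec_esp layout added by this patch (IB_FLOW_SPEC_ESP is 0x34 per the ib_verbs.h hunk), and the surrounding create-flow plumbing is elided:

#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

/* Mirrors struct ib_uverbs_flow_spec_esp{,_filter} from this patch. */
struct esp_filter_example {
	uint32_t spi;
	uint32_t seq;
};

struct esp_spec_example {
	uint32_t type;			/* IB_FLOW_SPEC_ESP == 0x34 */
	uint16_t size;
	uint16_t reserved;
	struct esp_filter_example val;
	struct esp_filter_example mask;
};

static void build_esp_spec_example(struct esp_spec_example *spec, uint32_t spi)
{
	memset(spec, 0, sizeof(*spec));
	spec->type = 0x34;			/* IB_FLOW_SPEC_ESP */
	spec->size = sizeof(*spec);
	spec->val.spi = htonl(spi);		/* SPI is carried big-endian */
	spec->mask.spi = htonl(0xffffffff);	/* exact match on the SPI */
	/* seq mask stays zero: the sequence number is not matched */
}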
Reviewed-by: Yishai Hadas Signed-off-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs.h | 1 + drivers/infiniband/core/uverbs_cmd.c | 11 +++++++++++ include/rdma/ib_verbs.h | 16 ++++++++++++++++ include/uapi/rdma/ib_user_verbs.h | 18 ++++++++++++++++++ 4 files changed, 46 insertions(+) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 1bac0b51686a..3229e87d03cb 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -257,6 +257,7 @@ struct ib_uverbs_flow_spec { }; struct ib_uverbs_flow_spec_eth eth; struct ib_uverbs_flow_spec_ipv4 ipv4; + struct ib_uverbs_flow_spec_esp esp; struct ib_uverbs_flow_spec_tcp_udp tcp_udp; struct ib_uverbs_flow_spec_ipv6 ipv6; struct ib_uverbs_flow_spec_action_tag flow_tag; diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 69050dd77421..f38600490fd1 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2931,6 +2931,17 @@ int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type, (ntohl(ib_spec->tunnel.val.tunnel_id)) >= BIT(24)) return -EINVAL; break; + case IB_FLOW_SPEC_ESP: + ib_filter_sz = offsetof(struct ib_flow_esp_filter, real_sz); + actual_filter_sz = spec_filter_size(kern_spec_mask, + kern_filter_sz, + ib_filter_sz); + if (actual_filter_sz <= 0) + return -EINVAL; + ib_spec->esp.size = sizeof(struct ib_flow_spec_esp); + memcpy(&ib_spec->esp.val, kern_spec_val, actual_filter_sz); + memcpy(&ib_spec->esp.mask, kern_spec_mask, actual_filter_sz); + break; default: return -EINVAL; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 8c3ca073016d..a6dba77c1b28 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1828,6 +1828,7 @@ enum ib_flow_spec_type { /* L3 header*/ IB_FLOW_SPEC_IPV4 = 0x30, IB_FLOW_SPEC_IPV6 = 0x31, + IB_FLOW_SPEC_ESP = 0x34, /* L4 headers*/ IB_FLOW_SPEC_TCP = 0x40, IB_FLOW_SPEC_UDP = 0x41, @@ -1960,6 +1961,20 @@ struct ib_flow_spec_tunnel { struct ib_flow_tunnel_filter mask; }; +struct ib_flow_esp_filter { + __be32 spi; + __be32 seq; + /* Must be last */ + u8 real_sz[0]; +}; + +struct ib_flow_spec_esp { + u32 type; + u16 size; + struct ib_flow_esp_filter val; + struct ib_flow_esp_filter mask; +}; + struct ib_flow_spec_action_tag { enum ib_flow_spec_type type; u16 size; @@ -1988,6 +2003,7 @@ union ib_flow_spec { struct ib_flow_spec_tcp_udp tcp_udp; struct ib_flow_spec_ipv6 ipv6; struct ib_flow_spec_tunnel tunnel; + struct ib_flow_spec_esp esp; struct ib_flow_spec_action_tag flow_tag; struct ib_flow_spec_action_drop drop; struct ib_flow_spec_action_handle action; diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index ac41ce234186..df5d339952fe 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -1014,6 +1014,24 @@ struct ib_uverbs_flow_spec_tunnel { struct ib_uverbs_flow_tunnel_filter mask; }; +struct ib_uverbs_flow_spec_esp_filter { + __u32 spi; + __u32 seq; +}; + +struct ib_uverbs_flow_spec_esp { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_spec_esp_filter val; + struct ib_uverbs_flow_spec_esp_filter mask; +}; + struct ib_uverbs_flow_attr { __u32 type; __u16 size; From c6475a0bca30fc2f9e5e4c48935f08973c2780ef Mon Sep 17 00:00:00 2001 From: Aviad Yehezkel Date: Wed, 28 Mar 2018 09:27:50 +0300 Subject: [PATCH 
182/199] IB/mlx5: Add implementation for create and destroy action_xfrm Adding an implementation in the mlx5 driver to create and destroy action_xfrm objects. This merely calls the accel layer. A user may pass the MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA flag, which states that [s]he expects a metadata header to be added to the payload. This header represents information regarding the transformation's state. Reviewed-by: Yishai Hadas Signed-off-by: Matan Barak Signed-off-by: Aviad Yehezkel Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 133 +++++++++++++++++++++- drivers/infiniband/hw/mlx5/mlx5_ib.h | 16 +++ include/uapi/rdma/mlx5_user_ioctl_cmds.h | 44 +++++++ include/uapi/rdma/mlx5_user_ioctl_verbs.h | 43 +++++++ 4 files changed, 235 insertions(+), 1 deletion(-) create mode 100644 include/uapi/rdma/mlx5_user_ioctl_cmds.h create mode 100644 include/uapi/rdma/mlx5_user_ioctl_verbs.h diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 82ad0faf8007..8b16a42f2086 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -60,7 +60,10 @@ #include "ib_rep.h" #include "cmd.h" #include +#include #include +#include +#include #define UVERBS_MODULE_NAME mlx5_ib #include @@ -3103,6 +3106,122 @@ unlock: return ERR_PTR(err); } +static u32 mlx5_ib_flow_action_flags_to_accel_xfrm_flags(u32 mlx5_flags) +{ + u32 flags = 0; + + if (mlx5_flags & MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA) + flags |= MLX5_ACCEL_XFRM_FLAG_REQUIRE_METADATA; + + return flags; +} + +#define MLX5_FLOW_ACTION_ESP_CREATE_LAST_SUPPORTED MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA +static struct ib_flow_action * +mlx5_ib_create_flow_action_esp(struct ib_device *device, + const struct ib_flow_action_attrs_esp *attr, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_dev *mdev = to_mdev(device); + struct ib_uverbs_flow_action_esp_keymat_aes_gcm *aes_gcm; + struct mlx5_accel_esp_xfrm_attrs accel_attrs = {}; + struct mlx5_ib_flow_action *action; + u64 action_flags; + u64 flags; + int err = 0; + + if (IS_UVERBS_COPY_ERR(uverbs_copy_from(&action_flags, attrs, + MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS))) + return ERR_PTR(-EFAULT); + + if (action_flags >= (MLX5_FLOW_ACTION_ESP_CREATE_LAST_SUPPORTED << 1)) + return ERR_PTR(-EOPNOTSUPP); + + flags = mlx5_ib_flow_action_flags_to_accel_xfrm_flags(action_flags); + + /* We currently only support a subset of the standard features. Only a + * keymat of type AES_GCM, with icv_len == 16, iv_algo == SEQ and esn + * (with overlap). Full offload mode isn't supported.
+ */ + if (!attr->keymat || attr->replay || attr->encap || + attr->spi || attr->seq || attr->tfc_pad || + attr->hard_limit_pkts || + (attr->flags & ~(IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED | + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT))) + return ERR_PTR(-EOPNOTSUPP); + + if (attr->keymat->protocol != + IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM) + return ERR_PTR(-EOPNOTSUPP); + + aes_gcm = &attr->keymat->keymat.aes_gcm; + + if (aes_gcm->icv_len != 16 || + aes_gcm->iv_algo != IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ) + return ERR_PTR(-EOPNOTSUPP); + + action = kmalloc(sizeof(*action), GFP_KERNEL); + if (!action) + return ERR_PTR(-ENOMEM); + + action->esp_aes_gcm.ib_flags = attr->flags; + memcpy(&accel_attrs.keymat.aes_gcm.aes_key, &aes_gcm->aes_key, + sizeof(accel_attrs.keymat.aes_gcm.aes_key)); + accel_attrs.keymat.aes_gcm.key_len = aes_gcm->key_len * 8; + memcpy(&accel_attrs.keymat.aes_gcm.salt, &aes_gcm->salt, + sizeof(accel_attrs.keymat.aes_gcm.salt)); + memcpy(&accel_attrs.keymat.aes_gcm.seq_iv, &aes_gcm->iv, + sizeof(accel_attrs.keymat.aes_gcm.seq_iv)); + accel_attrs.keymat.aes_gcm.icv_len = aes_gcm->icv_len * 8; + accel_attrs.keymat.aes_gcm.iv_algo = MLX5_ACCEL_ESP_AES_GCM_IV_ALGO_SEQ; + accel_attrs.keymat_type = MLX5_ACCEL_ESP_KEYMAT_AES_GCM; + + accel_attrs.esn = attr->esn; + if (attr->flags & IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED) + accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED; + if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW) + accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP; + + if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT) + accel_attrs.action |= MLX5_ACCEL_ESP_ACTION_ENCRYPT; + + action->esp_aes_gcm.ctx = + mlx5_accel_esp_create_xfrm(mdev->mdev, &accel_attrs, flags); + if (IS_ERR(action->esp_aes_gcm.ctx)) { + err = PTR_ERR(action->esp_aes_gcm.ctx); + goto err_parse; + } + + action->esp_aes_gcm.ib_flags = attr->flags; + + return &action->ib_action; + +err_parse: + kfree(action); + return ERR_PTR(err); +} + +static int mlx5_ib_destroy_flow_action(struct ib_flow_action *action) +{ + struct mlx5_ib_flow_action *maction = to_mflow_act(action); + + switch (action->type) { + case IB_FLOW_ACTION_ESP: + /* + * We only support aes_gcm for now, so we implicitly know this is + * the underlying crypto.
+ */ + mlx5_accel_esp_destroy_xfrm(maction->esp_aes_gcm.ctx); + break; + default: + WARN_ON(true); + break; + } + + kfree(maction); + return 0; +} + static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); @@ -4548,13 +4667,23 @@ static void mlx5_ib_cleanup_multiport_master(struct mlx5_ib_dev *dev) mlx5_nic_vport_disable_roce(dev->mdev); } -#define NUM_TREES 0 +ADD_UVERBS_ATTRIBUTES_SIMPLE(mlx5_ib_flow_action, UVERBS_OBJECT_FLOW_ACTION, + UVERBS_METHOD_FLOW_ACTION_ESP_CREATE, + &UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS, + UVERBS_ATTR_TYPE(u64), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); + +#define NUM_TREES 1 static int populate_specs_root(struct mlx5_ib_dev *dev) { const struct uverbs_object_tree_def *default_root[NUM_TREES + 1] = { uverbs_default_get_objects()}; size_t num_trees = 1; + if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_DEVICE && + !WARN_ON(num_trees >= ARRAY_SIZE(default_root))) + default_root[num_trees++] = &mlx5_ib_flow_action; + dev->ib_dev.specs_root = uverbs_alloc_spec_tree(num_trees, default_root); @@ -4796,6 +4925,8 @@ int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) dev->ib_dev.uverbs_ex_cmd_mask |= (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); + dev->ib_dev.create_flow_action_esp = mlx5_ib_create_flow_action_esp; + dev->ib_dev.destroy_flow_action = mlx5_ib_destroy_flow_action; dev->ib_dev.driver_id = RDMA_DRIVER_MLX5; err = init_node_data(dev); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 0eda960ab8e0..5fe73971425e 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -772,6 +772,16 @@ struct mlx5_ib_multiport_info { bool unaffiliate; }; +struct mlx5_ib_flow_action { + struct ib_flow_action ib_action; + union { + struct { + u64 ib_flags; + struct mlx5_accel_esp_xfrm *ctx; + } esp_aes_gcm; + }; +}; + struct mlx5_ib_dev { struct ib_device ib_dev; struct mlx5_core_dev *mdev; @@ -895,6 +905,12 @@ static inline struct mlx5_ib_mw *to_mmw(struct ib_mw *ibmw) return container_of(ibmw, struct mlx5_ib_mw, ibmw); } +static inline struct mlx5_ib_flow_action * +to_mflow_act(struct ib_flow_action *ibact) +{ + return container_of(ibact, struct mlx5_ib_flow_action, ib_action); +} + int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, struct mlx5_db *db); void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db); diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h new file mode 100644 index 000000000000..521813d5348c --- /dev/null +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_USER_IOCTL_CMDS_H +#define MLX5_USER_IOCTL_CMDS_H + +#include + +enum mlx5_ib_create_flow_action_attrs { + /* This attribute belongs to the driver namespace */ + MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS = (1U << UVERBS_ID_NS_SHIFT), +}; + +#endif + diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h new file mode 100644 index 000000000000..8a2fb33f3ed4 --- /dev/null +++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_USER_IOCTL_VERBS_H +#define MLX5_USER_IOCTL_VERBS_H + +#include + +enum mlx5_ib_uapi_flow_action_flags { + MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA = 1 << 0, +}; + +#endif + From 349705c1936269aa6e5f923a0bc663e1addf288e Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Wed, 28 Mar 2018 09:27:51 +0300 Subject: [PATCH 183/199] IB/mlx5: Add modify_flow_action_esp verb Adding an implementation in the mlx5 driver to modify an action_xfrm object. This merely calls the accel layer. Currently, a user can modify only the ESN parameters.
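To make the ESN handling concrete, here is a small hedged sketch of the bookkeeping the modify path performs (see the diff below): the new ESN is taken as-is, while the NEW_WINDOW bit toggles the accel layer's overlap flag. The _example names and flag values are illustrative stand-ins, not part of the kernel API:

#include <stdint.h>

#define EXAMPLE_ESN_NEW_WINDOW	(1u << 3)  /* IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW */
#define EXAMPLE_ESN_OVERLAP	(1u << 0)  /* stands in for MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP */

struct esn_state_example {
	uint32_t esn;
	uint32_t flags;
};

static void esn_modify_example(struct esn_state_example *st,
			       uint32_t new_esn, uint32_t ib_flags)
{
	st->esn = new_esn;			/* the ESN value is taken as-is */
	if (ib_flags & EXAMPLE_ESN_NEW_WINDOW)
		st->flags |= EXAMPLE_ESN_OVERLAP;   /* windows overlap during rollover */
	else
		st->flags &= ~EXAMPLE_ESN_OVERLAP;  /* steady state */
}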
Reviewed-by: Yishai Hadas Signed-off-by: Matan Barak Signed-off-by: Aviad Yehezkel Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 49 +++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 8b16a42f2086..bc46589a904d 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3201,6 +3201,54 @@ err_parse: return ERR_PTR(err); } +static int +mlx5_ib_modify_flow_action_esp(struct ib_flow_action *action, + const struct ib_flow_action_attrs_esp *attr, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_flow_action *maction = to_mflow_act(action); + struct mlx5_accel_esp_xfrm_attrs accel_attrs; + int err = 0; + + if (attr->keymat || attr->replay || attr->encap || + attr->spi || attr->seq || attr->tfc_pad || + attr->hard_limit_pkts || + (attr->flags & ~(IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED | + IB_FLOW_ACTION_ESP_FLAGS_MOD_ESP_ATTRS | + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW))) + return -EOPNOTSUPP; + + /* Only the ESN value or the MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP can + * be modified. + */ + if (!(maction->esp_aes_gcm.ib_flags & + IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED) && + attr->flags & (IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED | + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW)) + return -EINVAL; + + memcpy(&accel_attrs, &maction->esp_aes_gcm.ctx->attrs, + sizeof(accel_attrs)); + + accel_attrs.esn = attr->esn; + if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW) + accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP; + else + accel_attrs.flags &= ~MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP; + + err = mlx5_accel_esp_modify_xfrm(maction->esp_aes_gcm.ctx, + &accel_attrs); + if (err) + return err; + + maction->esp_aes_gcm.ib_flags &= + ~IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW; + maction->esp_aes_gcm.ib_flags |= + attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW; + + return 0; +} + static int mlx5_ib_destroy_flow_action(struct ib_flow_action *action) { struct mlx5_ib_flow_action *maction = to_mflow_act(action); @@ -4927,6 +4975,7 @@ int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); dev->ib_dev.create_flow_action_esp = mlx5_ib_create_flow_action_esp; dev->ib_dev.destroy_flow_action = mlx5_ib_destroy_flow_action; + dev->ib_dev.modify_flow_action_esp = mlx5_ib_modify_flow_action_esp; dev->ib_dev.driver_id = RDMA_DRIVER_MLX5; err = init_node_data(dev); From 363c5a570d4a386fa1bf8d3833de817d7c4fcda2 Mon Sep 17 00:00:00 2001 From: Aviad Yehezkel Date: Wed, 28 Mar 2018 09:27:52 +0300 Subject: [PATCH 184/199] {net,IB}/mlx5: Add ipsec helper Simple wrapper to understand if we are dealing with IPsec flow. 
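As a rough model of what the helper tests: a flow counts as IPsec exactly when the match criteria carry a non-zero mask on the outer ESP SPI. A self-contained stand-in (the struct below is illustrative; the real fte_match_param layout is generated from the device interface files):

#include <stdbool.h>
#include <stdint.h>

struct misc_params_example {
	uint32_t outer_esp_spi;		/* mask bits for the outer ESP SPI */
};

static bool fs_is_ipsec_flow_example(const struct misc_params_example *match_c)
{
	/* any non-zero SPI mask marks the rule as an IPsec flow */
	return match_c->outer_esp_spi != 0;
}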
Signed-off-by: Aviad Yehezkel Signed-off-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/linux/mlx5/fs_helpers.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/linux/mlx5/fs_helpers.h b/include/linux/mlx5/fs_helpers.h index 7b476bbae731..9db21cd0e92c 100644 --- a/include/linux/mlx5/fs_helpers.h +++ b/include/linux/mlx5/fs_helpers.h @@ -38,6 +38,14 @@ #define MLX5_FS_IPV4_VERSION 4 #define MLX5_FS_IPV6_VERSION 6 +static inline bool mlx5_fs_is_ipsec_flow(const u32 *match_c) +{ + void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c, + misc_parameters); + + return MLX5_GET(fte_match_set_misc, misc_params_c, outer_esp_spi); +} + static inline bool _mlx5_fs_is_outer_ipproto_flow(const u32 *match_c, const u32 *match_v, u8 match) { From 802c2125689d1ceedd9671a8f728e85eacdac077 Mon Sep 17 00:00:00 2001 From: Aviad Yehezkel Date: Wed, 28 Mar 2018 09:27:53 +0300 Subject: [PATCH 185/199] IB/mlx5: Add IPsec support for egress and ingress This commit introduces support for the esp_aes_gcm flow specification for the Innova device. To that end we add support for egress steering and some validations that an IPsec rule is indeed valid. Signed-off-by: Matan Barak Signed-off-by: Aviad Yehezkel Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 127 ++++++++++++++++++++++++--- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 + 2 files changed, 117 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index bc46589a904d..119c4c165970 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include @@ -2321,8 +2322,28 @@ static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val) offsetof(typeof(filter), field) -\ sizeof(filter.field)) +static int parse_flow_flow_action(const union ib_flow_spec *ib_spec, + const struct ib_flow_attr *flow_attr, + struct mlx5_flow_act *action) +{ + struct mlx5_ib_flow_action *maction = to_mflow_act(ib_spec->action.act); + + switch (maction->ib_action.type) { + case IB_FLOW_ACTION_ESP: + /* Currently only AES_GCM keymat is supported by the driver */ + action->esp_id = (uintptr_t)maction->esp_aes_gcm.ctx; + action->action |= flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS ? 
+ MLX5_FLOW_CONTEXT_ACTION_ENCRYPT : + MLX5_FLOW_CONTEXT_ACTION_DECRYPT; + return 0; + default: + return -EOPNOTSUPP; + } +} + static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c, u32 *match_v, const union ib_flow_spec *ib_spec, + const struct ib_flow_attr *flow_attr, struct mlx5_flow_act *action) { void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c, @@ -2332,6 +2353,7 @@ static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c, void *headers_c; void *headers_v; int match_ipv; + int ret; if (ib_spec->type & IB_FLOW_SPEC_INNER) { headers_c = MLX5_ADDR_OF(fte_match_param, match_c, @@ -2482,7 +2504,15 @@ static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c, ntohl(ib_spec->ipv6.mask.flow_label), ntohl(ib_spec->ipv6.val.flow_label), ib_spec->type & IB_FLOW_SPEC_INNER); + break; + case IB_FLOW_SPEC_ESP: + if (ib_spec->esp.mask.seq) + return -EOPNOTSUPP; + MLX5_SET(fte_match_set_misc, misc_params_c, outer_esp_spi, + ntohl(ib_spec->esp.mask.spi)); + MLX5_SET(fte_match_set_misc, misc_params_v, outer_esp_spi, + ntohl(ib_spec->esp.val.spi)); break; case IB_FLOW_SPEC_TCP: if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask, @@ -2550,6 +2580,11 @@ static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c, return -EOPNOTSUPP; action->action |= MLX5_FLOW_CONTEXT_ACTION_DROP; break; + case IB_FLOW_SPEC_ACTION_HANDLE: + ret = parse_flow_flow_action(ib_spec, flow_attr, action); + if (ret) + return ret; + break; default: return -EINVAL; } @@ -2591,6 +2626,46 @@ static bool flow_is_multicast_only(const struct ib_flow_attr *ib_attr) return false; } +enum valid_spec { + VALID_SPEC_INVALID, + VALID_SPEC_VALID, + VALID_SPEC_NA, +}; + +static enum valid_spec +is_valid_esp_aes_gcm(struct mlx5_core_dev *mdev, + const struct mlx5_flow_spec *spec, + const struct mlx5_flow_act *flow_act, + bool egress) +{ + const u32 *match_c = spec->match_criteria; + bool is_crypto = + (flow_act->action & (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT | + MLX5_FLOW_CONTEXT_ACTION_DECRYPT)); + bool is_ipsec = mlx5_fs_is_ipsec_flow(match_c); + bool is_drop = flow_act->action & MLX5_FLOW_CONTEXT_ACTION_DROP; + + /* + * Currently only crypto is supported in egress. When regular egress + * rules are supported, always return VALID_SPEC_NA. + */ + if (!is_crypto) + return egress ? VALID_SPEC_INVALID : VALID_SPEC_NA; + + return is_crypto && is_ipsec && + (!egress || (!is_drop && !flow_act->has_flow_tag)) ? + VALID_SPEC_VALID : VALID_SPEC_INVALID; +} + +static bool is_valid_spec(struct mlx5_core_dev *mdev, + const struct mlx5_flow_spec *spec, + const struct mlx5_flow_act *flow_act, + bool egress) +{ + /* We currently only support ipsec egress flow */ + return is_valid_esp_aes_gcm(mdev, spec, flow_act, egress) != VALID_SPEC_INVALID; +} + static bool is_valid_ethertype(struct mlx5_core_dev *mdev, const struct ib_flow_attr *flow_attr, bool check_inner) @@ -2715,13 +2790,17 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, log_max_ft_size)); if (flow_attr->type == IB_FLOW_ATTR_NORMAL) { - if (flow_is_multicast_only(flow_attr) && - !dont_trap) + if (ft_type == MLX5_IB_FT_TX) + priority = 0; + else if (flow_is_multicast_only(flow_attr) && + !dont_trap) priority = MLX5_IB_FLOW_MCAST_PRIO; else priority = ib_prio_to_core_prio(flow_attr->priority, dont_trap); ns = mlx5_get_flow_namespace(dev->mdev, + ft_type == MLX5_IB_FT_TX ?
+ MLX5_FLOW_NAMESPACE_EGRESS : MLX5_FLOW_NAMESPACE_BYPASS); num_entries = MLX5_FS_MAX_ENTRIES; num_groups = MLX5_FS_MAX_TYPES; @@ -2808,6 +2887,7 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, unsigned int spec_index; int err = 0; int dest_num = 1; + bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS; if (!is_valid_attr(dev->mdev, flow_attr)) return ERR_PTR(-EINVAL); @@ -2824,7 +2904,7 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) { err = parse_flow_attr(dev->mdev, spec->match_criteria, spec->match_value, - ib_flow, &flow_act); + ib_flow, flow_attr, &flow_act); if (err < 0) goto free; @@ -2847,12 +2927,23 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, } spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria); + + if (is_egress && + !is_valid_spec(dev->mdev, spec, &flow_act, is_egress)) { + err = -EINVAL; + goto free; + } + if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DROP) { rule_dst = NULL; dest_num = 0; } else { - flow_act.action = dst ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST : - MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO; + if (is_egress) + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW; + else + flow_act.action |= + dst ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST : + MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO; } if (flow_act.has_flow_tag && @@ -3026,6 +3117,7 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, struct mlx5_flow_destination *dst = NULL; struct mlx5_ib_flow_prio *ft_prio_tx = NULL; struct mlx5_ib_flow_prio *ft_prio; + bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS; int err; int underlay_qpn; @@ -3034,7 +3126,13 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, if (domain != IB_FLOW_DOMAIN_USER || flow_attr->port > dev->num_ports || - (flow_attr->flags & ~IB_FLOW_ATTR_FLAGS_DONT_TRAP)) + (flow_attr->flags & ~(IB_FLOW_ATTR_FLAGS_DONT_TRAP | + IB_FLOW_ATTR_FLAGS_EGRESS))) + return ERR_PTR(-EINVAL); + + if (is_egress && + (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || + flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) return ERR_PTR(-EINVAL); dst = kzalloc(sizeof(*dst), GFP_KERNEL); @@ -3043,7 +3141,8 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, mutex_lock(&dev->flow_db->lock); - ft_prio = get_flow_table(dev, flow_attr, MLX5_IB_FT_RX); + ft_prio = get_flow_table(dev, flow_attr, + is_egress ? 
MLX5_IB_FT_TX : MLX5_IB_FT_RX); if (IS_ERR(ft_prio)) { err = PTR_ERR(ft_prio); goto unlock; @@ -3057,11 +3156,15 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, } } - dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR; - if (mqp->flags & MLX5_IB_QP_RSS) - dst->tir_num = mqp->rss_qp.tirn; - else - dst->tir_num = mqp->raw_packet_qp.rq.tirn; + if (is_egress) { + dst->type = MLX5_FLOW_DESTINATION_TYPE_PORT; + } else { + dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR; + if (mqp->flags & MLX5_IB_QP_RSS) + dst->tir_num = mqp->rss_qp.tirn; + else + dst->tir_num = mqp->raw_packet_qp.rq.tirn; + } if (flow_attr->type == IB_FLOW_ATTR_NORMAL) { if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) { diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 5fe73971425e..2b27ddafc354 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -152,6 +152,7 @@ struct mlx5_ib_pd { #define MLX5_IB_NUM_FLOW_FT (MLX5_IB_FLOW_LEFTOVERS_PRIO + 1) #define MLX5_IB_NUM_SNIFFER_FTS 2 +#define MLX5_IB_NUM_EGRESS_FTS 1 struct mlx5_ib_flow_prio { struct mlx5_flow_table *flow_table; unsigned int refcount; @@ -167,6 +168,7 @@ struct mlx5_ib_flow_handler { struct mlx5_ib_flow_db { struct mlx5_ib_flow_prio prios[MLX5_IB_NUM_FLOW_FT]; struct mlx5_ib_flow_prio sniffer[MLX5_IB_NUM_SNIFFER_FTS]; + struct mlx5_ib_flow_prio egress[MLX5_IB_NUM_EGRESS_FTS]; struct mlx5_flow_table *lag_demux_ft; /* Protect flow steering bypass flow tables * when add/del flow rules. From c03faa562d0279e463900c92d4a54c0dc3c806b0 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Wed, 28 Mar 2018 09:27:54 +0300 Subject: [PATCH 186/199] IB/mlx5: Add information for querying IPsec capabilities Users should be able to query for IPSec support. Adding a few capabilities bits as part of the driver specific part in alloc_ucontext: MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_REQ_METADATA Payload's header is returned with metadata representing the IPSec decryption state. MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_RX Support ESP_AES_GCM in ingress path. MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_TX Support ESP_AES_GCM in egress path. MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_SPI_RSS_ONLY Hardware doesn't support matching SPI in flow steering rules but just hashing and spreading the traffic accordingly. 
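A hedged userspace sketch of consuming the new field: read flow_action_flags from the alloc_ucontext response and decide what ESP offload is usable. The short EX_* names below stand in for the full MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM* values added in the mlx5-abi.h hunk:

#include <stdint.h>
#include <stdio.h>

#define EX_ESP_AES_GCM			(1u << 0)
#define EX_ESP_AES_GCM_REQ_METADATA	(1u << 1)
#define EX_ESP_AES_GCM_SPI_STEERING	(1u << 2)
#define EX_ESP_AES_GCM_TX_IV_IS_ESN	(1u << 4)

static void report_ipsec_caps_example(uint16_t flow_action_flags)
{
	if (!(flow_action_flags & EX_ESP_AES_GCM)) {
		puts("no ESP AES-GCM offload");
		return;
	}
	printf("ESP AES-GCM offload%s%s\n",
	       (flow_action_flags & EX_ESP_AES_GCM_REQ_METADATA) ?
			", metadata header required" : "",
	       (flow_action_flags & EX_ESP_AES_GCM_SPI_STEERING) ?
			", SPI steering supported" : ", SPI hashing/RSS only");
}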
Signed-off-by: Aviad Yehezkel Signed-off-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 12 ++++++++++++ include/uapi/rdma/mlx5-abi.h | 10 +++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 119c4c165970..25e70ae0b484 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1681,6 +1681,18 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, resp.response_length = min(offsetof(typeof(resp), response_length) + sizeof(resp.response_length), udata->outlen); + if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_DEVICE) { + if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_EGRESS)) + resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM; + if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_REQUIRED_METADATA) + resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_REQ_METADATA; + if (MLX5_CAP_FLOWTABLE(dev->mdev, flow_table_properties_nic_receive.ft_field_support.outer_esp_spi)) + resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_SPI_STEERING; + if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_TX_IV_IS_ESN) + resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_TX_IV_IS_ESN; + /* MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_FULL_OFFLOAD is currently always 0 */ + } + context = kzalloc(sizeof(*context), GFP_KERNEL); if (!context) return ERR_PTR(-ENOMEM); diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index f7d18fb01771..f60d2659cdb7 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -107,6 +107,14 @@ enum mlx5_user_inline_mode { MLX5_USER_INLINE_MODE_TCP_UDP, }; +enum { + MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM = 1 << 0, + MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_REQ_METADATA = 1 << 1, + MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_SPI_STEERING = 1 << 2, + MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_FULL_OFFLOAD = 1 << 3, + MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_TX_IV_IS_ESN = 1 << 4, +}; + struct mlx5_ib_alloc_ucontext_resp { __u32 qp_tab_size; __u32 bf_reg_size; @@ -118,7 +126,7 @@ struct mlx5_ib_alloc_ucontext_resp { __u32 max_recv_wr; __u32 max_srq_recv_wr; __u16 num_ports; - __u16 reserved1; + __u16 flow_action_flags; __u32 comp_mask; __u32 response_length; __u8 cqe_version; From 2d93fc856959bb940b3cf1e7cbad38721d6bea75 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Wed, 28 Mar 2018 09:27:55 +0300 Subject: [PATCH 187/199] IB/mlx5: Add ability to hash by IPSEC_SPI when creating a TIR When a Raw Ethernet QP is created, we actually create a few objects. One of these objects is a TIR. Currently, a TIR could hash (and spread the traffic) by IP or port only. Adding a hashing by IPSec SPI to TIR creation with the required UAPI bit. 
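As sketched below, requesting this hashing from user space only means setting the new bit in the existing RSS QP creation command; the struct is the uapi one, the particular field combination is illustrative:

	struct mlx5_ib_create_qp_rss ucmd = {};

	/* hash ESP flows by IP addresses and SPI; the driver rejects
	 * mixing SPI hashing with TCP/UDP port hashing, since only one
	 * L4 selection may be set */
	ucmd.rx_hash_fields_mask = MLX5_RX_HASH_SRC_IPV4 |
				   MLX5_RX_HASH_DST_IPV4 |
				   MLX5_RX_HASH_IPSEC_SPI;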
Signed-off-by: Matan Barak Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 4 ++++ drivers/infiniband/hw/mlx5/qp.c | 16 ++++++++++++---- include/uapi/rdma/mlx5-abi.h | 1 + 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 25e70ae0b484..31295e39896c 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -856,6 +856,10 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, MLX5_RX_HASH_SRC_PORT_UDP | MLX5_RX_HASH_DST_PORT_UDP | MLX5_RX_HASH_INNER; + if (mlx5_accel_ipsec_device_caps(dev->mdev) & + MLX5_ACCEL_IPSEC_CAP_DEVICE) + resp.rss_caps.rx_hash_fields_mask |= + MLX5_RX_HASH_IPSEC_SPI; resp.response_length += sizeof(resp.rss_caps); } } else { diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index c8f01f32ebb4..0a0524f60924 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1413,6 +1413,7 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, void *tirc; void *hfso; u32 selected_fields = 0; + u32 outer_l4; size_t min_resp_len; u32 tdn = mucontext->tdn; struct mlx5_ib_create_qp_rss ucmd = {}; @@ -1543,10 +1544,14 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, MLX5_L3_PROT_TYPE_IPV6); - if (((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) || - (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP)) && - ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP) || - (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP))) { + outer_l4 = ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP)) << 0 | + ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP)) << 1 | + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_IPSEC_SPI) << 2; + + /* Check that only one l4 protocol is set */ + if (outer_l4 & (outer_l4 - 1)) { err = -EINVAL; goto err; } @@ -1577,6 +1582,9 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP)) selected_fields |= MLX5_HASH_FIELD_SEL_L4_DPORT; + if (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_IPSEC_SPI) + selected_fields |= MLX5_HASH_FIELD_SEL_IPSEC_SPI; + MLX5_SET(rx_hash_field_select, hfso, selected_fields, selected_fields); create_tir: diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index f60d2659cdb7..d86a65b993f8 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -327,6 +327,7 @@ enum mlx5_rx_hash_fields { MLX5_RX_HASH_DST_PORT_TCP = 1 << 5, MLX5_RX_HASH_SRC_PORT_UDP = 1 << 6, MLX5_RX_HASH_DST_PORT_UDP = 1 << 7, + MLX5_RX_HASH_IPSEC_SPI = 1 << 8, /* Save bits for future fields */ MLX5_RX_HASH_INNER = (1UL << 31), }; From 57939021e8f882d13a5263a9d682c64ae00c578d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 4 Apr 2018 20:58:13 -0600 Subject: [PATCH 188/199] RDMA/qedr: Zero stack memory before copying to user space The fact this struct was not init'd like all the others was missed when the padding reserved field was added. 
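The hazard being fixed, reduced to a standalone sketch (the types here are illustrative, not from qedr): a partially assigned on-stack response leaks old stack bytes through its unwritten members when the whole struct is copied to user space, whereas a designated initializer zeroes everything not named:

	struct uresp {
		u32 id;
		u32 reserved;	/* padding field for 32/64 compat */
	};

	struct uresp bad;
	bad.id = id;		/* bad.reserved still holds stack garbage */

	struct uresp good = {
		.id = id,	/* good.reserved reads as zero */
	};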
Reported-by: Dan Carpenter Fixes: 71e80a4781af ("RDMA/qedr: Fix uABI structure layouts for 32/64 compat") Acked-by: Michal Kalderon Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qedr/verbs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index a9f494fb892a..1835dc9eb3e3 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -473,9 +473,9 @@ struct ib_pd *qedr_alloc_pd(struct ib_device *ibdev, pd->pd_id = pd_id; if (udata && context) { - struct qedr_alloc_pd_uresp uresp; - - uresp.pd_id = pd_id; + struct qedr_alloc_pd_uresp uresp = { + .pd_id = pd_id, + }; rc = qedr_ib_copy_to_udata(udata, &uresp, sizeof(uresp)); if (rc) { From 39e00b6cf65831469a57333a929b8ca986892798 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 3 Apr 2018 23:06:18 -0500 Subject: [PATCH 189/199] IB/rxe: Remove GID add/del dummy routines The rxe driver's add_gid() and del_gid() callbacks perform simple checks that the ib core already does before invoking these callback routines. Therefore, the code is simplified by not implementing the add_gid() and del_gid() callbacks at all; ib_core only invokes them if they are implemented. Signed-off-by: Parav Pandit Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_verbs.c | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 08f3e0618b81..a65550d6a849 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -77,21 +77,6 @@ out: return rc; } -static int rxe_add_gid(const union ib_gid *gid, - const struct ib_gid_attr *attr, void **context) -{ - if (attr->index >= RXE_PORT_GID_TBL_LEN) - return -EINVAL; - return 0; -} - -static int rxe_del_gid(const struct ib_gid_attr *attr, void **context) -{ - if (attr->index >= RXE_PORT_GID_TBL_LEN) - return -EINVAL; - return 0; -} - static struct net_device *rxe_get_netdev(struct ib_device *device, u8 port_num) { @@ -1265,8 +1250,6 @@ int rxe_register_device(struct rxe_dev *rxe) dev->modify_port = rxe_modify_port; dev->get_link_layer = rxe_get_link_layer; dev->get_netdev = rxe_get_netdev; - dev->add_gid = rxe_add_gid; - dev->del_gid = rxe_del_gid; dev->query_pkey = rxe_query_pkey; dev->alloc_ucontext = rxe_alloc_ucontext; dev->dealloc_ucontext = rxe_dealloc_ucontext; From 09c4854fde84421726a90249b4c20df2492f425f Mon Sep 17 00:00:00 2001 From: "Kalderon, Michal" Date: Thu, 5 Apr 2018 09:59:29 +0300 Subject: [PATCH 190/199] RDMA/qedr: Fix wmb usage in qedr This patch comes as a result of Sinan Kaya's work and the decision that writel() must be a strong enough barrier for DMA. wmb() usages in the qedr driver have either been removed, where they existed only to order DMA accesses, or replaced with smp_wmb() and a comment in the places where the barrier was needed for SMP reasons.
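The resulting pairing, sketched abstractly (field names are illustrative; qedr's real ones appear in the diff below): the producer orders its software-state update against the polling CPU with smp_wmb() before ringing the doorbell, and relies on writel() itself to order the preceding DMA-visible writes:

	/* producer (post path) */
	qp->wr_id_tbl[prod] = wr->wr_id;	/* read later by poll_cq */
	smp_wmb();		/* order vs. the CPU polling the CQ */
	writel(db.raw, db_addr);	/* writel() orders prior DMA writes */

	/* consumer (poll path) */
	if (cqe_is_valid(cqe)) {
		rmb();	/* read CQE and SW state only after validity check */
		wr_id = qp->wr_id_tbl[cons];
	}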
Fixes: 561e5d48968b ("RDMA/qedr: eliminate duplicate barriers on weakly-ordered archs") Signed-off-by: Michal Kalderon Signed-off-by: Ariel Elior Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qedr/verbs.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 1835dc9eb3e3..26a698927d2e 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -804,8 +804,6 @@ static inline void qedr_init_cq_params(struct qedr_cq *cq, static void doorbell_cq(struct qedr_cq *cq, u32 cons, u8 flags) { - /* Flush data before signalling doorbell */ - wmb(); cq->db.data.agg_flags = flags; cq->db.data.value = cpu_to_le32(cons); writeq(cq->db.raw, cq->db_addr); @@ -1812,8 +1810,7 @@ static int qedr_update_qp_state(struct qedr_dev *dev, */ if (rdma_protocol_roce(&dev->ibdev, 1)) { - wmb(); - writel_relaxed(qp->rq.db_data.raw, qp->rq.db); + writel(qp->rq.db_data.raw, qp->rq.db); /* Make sure write takes effect */ mmiowb(); } @@ -3198,9 +3195,16 @@ int qedr_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, * vane. However this is not harmful (as long as the producer value is * unchanged). For performance reasons we avoid checking for this * redundant doorbell. + * + * qp->wqe_wr_id is accessed during qedr_poll_cq, as + * soon as we give the doorbell, we could get a completion + * for this wr, therefore we need to make sure that the + * memory is updated before giving the doorbell. + * During qedr_poll_cq, rmb is called before accessing the + * cqe. This covers for the smp_rmb as well. */ - wmb(); - writel_relaxed(qp->sq.db_data.raw, qp->sq.db); + smp_wmb(); + writel(qp->sq.db_data.raw, qp->sq.db); /* Make sure write sticks */ mmiowb(); @@ -3286,8 +3290,14 @@ int qedr_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, qedr_inc_sw_prod(&qp->rq); - /* Flush all the writes before signalling doorbell */ - wmb(); + /* qp->rqe_wr_id is accessed during qedr_poll_cq, as + * soon as we give the doorbell, we could get a completion + * for this wr, therefore we need to make sure that the + * memory is updated before giving the doorbell. + * During qedr_poll_cq, rmb is called before accessing the + * cqe. This covers for the smp_rmb as well. + */ + smp_wmb(); qp->rq.db_data.data.value++; From d41c1208955394198398f8a2ab8e0e25ad592e2b Mon Sep 17 00:00:00 2001 From: Ariel Levkovich Date: Thu, 5 Apr 2018 18:53:22 +0300 Subject: [PATCH 191/199] IB/uverbs: Expose device memory capabilities to user Adding a new capability field under ib_uverbs_ex_query_device_resp - max_dm_size - which reflects the maximum amount of device memory that is available for allocation on a device in bytes.
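A hedged user-space sketch of reading the new field (the response struct is the uapi one from this patch; the length check follows the usual convention for extensible responses, and older kernels simply never write the field):

	struct ib_uverbs_ex_query_device_resp resp;

	/* ... issue the extended QUERY_DEVICE command into resp ... */
	if (resp.response_length >=
	    offsetof(struct ib_uverbs_ex_query_device_resp, max_dm_size) +
	    sizeof(resp.max_dm_size) &&
	    resp.max_dm_size)
		dev_mem_bytes = resp.max_dm_size; /* 0 means no device memory */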
Signed-off-by: Ariel Levkovich Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/ib_user_verbs.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index df5d339952fe..9be07394fdbe 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -268,6 +268,7 @@ struct ib_uverbs_ex_query_device_resp { __u32 raw_packet_caps; struct ib_uverbs_tm_caps tm_caps; struct ib_uverbs_cq_moderation_caps cq_moderation_caps; + __aligned_u64 max_dm_size; }; struct ib_uverbs_query_port { From 1d8eeb9f6a6e0d8ac43a54fd95126044bf8d6695 Mon Sep 17 00:00:00 2001 From: Ariel Levkovich Date: Thu, 5 Apr 2018 18:53:23 +0300 Subject: [PATCH 192/199] IB/uverbs: Add device memory capabilities reporting This change allows vendors to report device memory capability max_dm_size - to user via uverbs command. Signed-off-by: Ariel Levkovich Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 6 ++++++ include/rdma/ib_verbs.h | 1 + 2 files changed, 7 insertions(+) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index f38600490fd1..13cb5e4deb86 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -4006,6 +4006,12 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, resp.cq_moderation_caps.max_cq_moderation_period = attr.cq_caps.max_cq_moderation_period; resp.response_length += sizeof(resp.cq_moderation_caps); + + if (ucore->outlen < resp.response_length + sizeof(resp.max_dm_size)) + goto end; + + resp.max_dm_size = attr.max_dm_size; + resp.response_length += sizeof(resp.max_dm_size); end: err = ib_copy_to_udata(ucore, &resp, resp.response_length); return err; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index a6dba77c1b28..ed425627efd8 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -372,6 +372,7 @@ struct ib_device_attr { u32 raw_packet_caps; /* Use ib_raw_packet_caps enum */ struct ib_tm_caps tm_caps; struct ib_cq_caps cq_caps; + u64 max_dm_size; }; enum ib_mtu { From bee76d7ab5d270919e80e4764df7cd7e4f06ed24 Mon Sep 17 00:00:00 2001 From: Ariel Levkovich Date: Thu, 5 Apr 2018 18:53:24 +0300 Subject: [PATCH 193/199] IB/uverbs: Add alloc/free dm uverbs ioctl support This change adds uverbs support for allocation/freeing of device memory commands. A new uverbs object is defined of type idr to represent and track the new resource type allocation per context. The API requires provider driver to implement 2 new ib_device callbacks - one for allocation and one for deallocation which return and accept (respectively) the ib_dm object which represents the allocated memory on the device. The support is added via the ioctl command infrastructure only. 
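For illustration, a provider opting in would implement the two callbacks roughly as below; the foo_* names and the foo_dm layout are hypothetical, only the callback signatures come from this patch:

	struct foo_dm {
		struct ib_dm ibdm;
		u64 dev_addr;
	};

	static struct ib_dm *foo_alloc_dm(struct ib_device *ibdev,
					  struct ib_ucontext *context,
					  struct ib_dm_alloc_attr *attr,
					  struct uverbs_attr_bundle *attrs)
	{
		struct foo_dm *dm = kzalloc(sizeof(*dm), GFP_KERNEL);

		if (!dm)
			return ERR_PTR(-ENOMEM);
		/* carve attr->length bytes from the device, honoring
		 * attr->alignment, and record where the range landed
		 * in dm->dev_addr */
		return &dm->ibdm; /* uverbs fills device/length/uobject/usecnt */
	}

	static int foo_dealloc_dm(struct ib_dm *ibdm)
	{
		/* return the carved-out range to the device pool, then free */
		kfree(container_of(ibdm, struct foo_dm, ibdm));
		return 0;
	}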
Signed-off-by: Ariel Levkovich Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/Makefile | 2 +- drivers/infiniband/core/uverbs.h | 1 + drivers/infiniband/core/uverbs_std_types.c | 3 +- drivers/infiniband/core/uverbs_std_types_dm.c | 108 ++++++++++++++++++ include/rdma/ib_verbs.h | 20 +++- include/uapi/rdma/ib_user_ioctl_cmds.h | 15 +++ 6 files changed, 146 insertions(+), 3 deletions(-) create mode 100644 drivers/infiniband/core/uverbs_std_types_dm.c diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 445c5504f605..636da34f8308 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -35,4 +35,4 @@ ib_ucm-y := ucm.o ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ rdma_core.o uverbs_std_types.o uverbs_ioctl.o \ uverbs_ioctl_merge.o uverbs_std_types_cq.o \ - uverbs_std_types_flow_action.o + uverbs_std_types_flow_action.o uverbs_std_types_dm.o diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 3229e87d03cb..cfb51618ab7a 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -286,6 +286,7 @@ extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_WQ); extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL); extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_XRCD); extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_FLOW_ACTION); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_DM); #define IB_UVERBS_DECLARE_CMD(name) \ ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \ diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index 173eab8d3482..4fedf59ec396 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -311,7 +311,8 @@ static DECLARE_UVERBS_OBJECT_TREE(uverbs_default_objects, &UVERBS_OBJECT(UVERBS_OBJECT_WQ), &UVERBS_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL), &UVERBS_OBJECT(UVERBS_OBJECT_XRCD), - &UVERBS_OBJECT(UVERBS_OBJECT_FLOW_ACTION)); + &UVERBS_OBJECT(UVERBS_OBJECT_FLOW_ACTION), + &UVERBS_OBJECT(UVERBS_OBJECT_DM)); const struct uverbs_object_tree_def *uverbs_default_get_objects(void) { diff --git a/drivers/infiniband/core/uverbs_std_types_dm.c b/drivers/infiniband/core/uverbs_std_types_dm.c new file mode 100644 index 000000000000..8b681575b615 --- /dev/null +++ b/drivers/infiniband/core/uverbs_std_types_dm.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "uverbs.h" +#include + +static int uverbs_free_dm(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + struct ib_dm *dm = uobject->object; + + if (why == RDMA_REMOVE_DESTROY && atomic_read(&dm->usecnt)) + return -EBUSY; + + return dm->device->dealloc_dm(dm); +} + +static int UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + struct ib_ucontext *ucontext = file->ucontext; + struct ib_dm_alloc_attr attr = {}; + struct ib_uobject *uobj; + struct ib_dm *dm; + int ret; + + if (!ib_dev->alloc_dm) + return -EOPNOTSUPP; + + ret = uverbs_copy_from(&attr.length, attrs, + UVERBS_ATTR_ALLOC_DM_LENGTH); + if (ret) + return ret; + + ret = uverbs_copy_from(&attr.alignment, attrs, + UVERBS_ATTR_ALLOC_DM_ALIGNMENT); + if (ret) + return ret; + + uobj = uverbs_attr_get(attrs, UVERBS_ATTR_ALLOC_DM_HANDLE)->obj_attr.uobject; + + dm = ib_dev->alloc_dm(ib_dev, ucontext, &attr, attrs); + if (IS_ERR(dm)) + return PTR_ERR(dm); + + dm->device = ib_dev; + dm->length = attr.length; + dm->uobject = uobj; + atomic_set(&dm->usecnt, 0); + + uobj->object = dm; + + return 0; +} + +static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_DM_ALLOC, + &UVERBS_ATTR_IDR(UVERBS_ATTR_ALLOC_DM_HANDLE, UVERBS_OBJECT_DM, + UVERBS_ACCESS_NEW, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DM_LENGTH, + UVERBS_ATTR_TYPE(u64), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DM_ALIGNMENT, + UVERBS_ATTR_TYPE(u32), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); + +static DECLARE_UVERBS_NAMED_METHOD_WITH_HANDLER(UVERBS_METHOD_DM_FREE, + uverbs_destroy_def_handler, + &UVERBS_ATTR_IDR(UVERBS_ATTR_FREE_DM_HANDLE, + UVERBS_OBJECT_DM, + UVERBS_ACCESS_DESTROY, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_DM, + /* 1 is used in order to free the DM after MRs */ + &UVERBS_TYPE_ALLOC_IDR(1, uverbs_free_dm), + &UVERBS_METHOD(UVERBS_METHOD_DM_ALLOC), + &UVERBS_METHOD(UVERBS_METHOD_DM_FREE)); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index ed425627efd8..6806c4f5657a 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -321,6 +321,12 @@ struct ib_cq_caps { u16 max_cq_moderation_period; }; +struct ib_dm_alloc_attr { + u64 length; + u32 alignment; + u32 flags; +}; + struct ib_device_attr { u64 fw_ver; __be64 sys_image_guid; @@ -1769,6 +1775,14 @@ struct ib_qp { struct rdma_restrack_entry res; }; +struct ib_dm { + struct ib_device *device; + u32 length; + u32 flags; + struct ib_uobject *uobject; + atomic_t usecnt; +}; + struct ib_mr { struct ib_device *device; struct ib_pd *pd; @@ -2425,7 +2439,11 @@ struct ib_device { int (*modify_flow_action_esp)(struct ib_flow_action *action, const struct ib_flow_action_attrs_esp *attr, struct uverbs_attr_bundle *attrs); - + struct ib_dm * (*alloc_dm)(struct ib_device *device, + struct ib_ucontext *context, + struct ib_dm_alloc_attr *attr, + struct uverbs_attr_bundle *attrs); + int 
(*dealloc_dm)(struct ib_dm *dm); /** * rdma netdev operation * diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h index 500b64a444ad..6034df2625c6 100644 --- a/include/uapi/rdma/ib_user_ioctl_cmds.h +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -54,6 +54,7 @@ enum uverbs_default_objects { UVERBS_OBJECT_RWQ_IND_TBL, UVERBS_OBJECT_WQ, UVERBS_OBJECT_FLOW_ACTION, + UVERBS_OBJECT_DM, }; enum { @@ -100,4 +101,18 @@ enum uverbs_methods_actions_flow_action_ops { UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY, }; +enum uverbs_attrs_alloc_dm_cmd_attr_ids { + UVERBS_ATTR_ALLOC_DM_HANDLE, + UVERBS_ATTR_ALLOC_DM_LENGTH, + UVERBS_ATTR_ALLOC_DM_ALIGNMENT, +}; + +enum uverbs_attrs_free_dm_cmd_attr_ids { + UVERBS_ATTR_FREE_DM_HANDLE, +}; + +enum uverbs_methods_dm { + UVERBS_METHOD_DM_ALLOC, + UVERBS_METHOD_DM_FREE, +}; #endif From be934cca9e987e73eb20e3c80731a9580d5acc79 Mon Sep 17 00:00:00 2001 From: Ariel Levkovich Date: Thu, 5 Apr 2018 18:53:25 +0300 Subject: [PATCH 194/199] IB/uverbs: Add device memory registration ioctl support Adding new ioctl method for the MR object - REG_DM_MR. This command can be used by users to register an allocated device memory buffer as an MR and receive lkey and rkey to be used within work requests. It is added as a new method under the MR object and using a new ib_device callback - reg_dm_mr. The command creates a standard ib_mr object which represents the registered memory. Signed-off-by: Ariel Levkovich Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/Makefile | 3 +- drivers/infiniband/core/uverbs_std_types.c | 10 -- drivers/infiniband/core/uverbs_std_types_mr.c | 147 ++++++++++++++++++ drivers/infiniband/core/verbs.c | 6 +- include/rdma/ib_verbs.h | 11 ++ include/rdma/uverbs_ioctl.h | 12 ++ include/uapi/rdma/ib_user_ioctl_cmds.h | 16 ++ 7 files changed, 193 insertions(+), 12 deletions(-) create mode 100644 drivers/infiniband/core/uverbs_std_types_mr.c diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 636da34f8308..dda9e856e3fa 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -35,4 +35,5 @@ ib_ucm-y := ucm.o ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ rdma_core.o uverbs_std_types.o uverbs_ioctl.o \ uverbs_ioctl_merge.o uverbs_std_types_cq.o \ - uverbs_std_types_flow_action.o uverbs_std_types_dm.o + uverbs_std_types_flow_action.o uverbs_std_types_dm.o \ + uverbs_std_types_mr.o diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index 4fedf59ec396..569f48bd821e 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -144,12 +144,6 @@ static int uverbs_free_srq(struct ib_uobject *uobject, return ret; } -static int uverbs_free_mr(struct ib_uobject *uobject, - enum rdma_remove_reason why) -{ - return ib_dereg_mr((struct ib_mr *)uobject->object); -} - static int uverbs_free_xrcd(struct ib_uobject *uobject, enum rdma_remove_reason why) { @@ -265,10 +259,6 @@ DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_QP, DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_MW, &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_mw)); -DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_MR, - /* 1 is used in order to free the MR after all the MWs */ - &UVERBS_TYPE_ALLOC_IDR(1, uverbs_free_mr)); - DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_SRQ, &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object), 0, uverbs_free_srq)); diff --git 
a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c new file mode 100644 index 000000000000..68f7cadf088f --- /dev/null +++ b/drivers/infiniband/core/uverbs_std_types_mr.c @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "uverbs.h" +#include + +static int uverbs_free_mr(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + return ib_dereg_mr((struct ib_mr *)uobject->object); +} + +static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + struct ib_dm_mr_attr attr = {}; + struct ib_uobject *uobj; + struct ib_dm *dm; + struct ib_pd *pd; + struct ib_mr *mr; + int ret; + + if (!ib_dev->reg_dm_mr) + return -EOPNOTSUPP; + + ret = uverbs_copy_from(&attr.offset, attrs, UVERBS_ATTR_REG_DM_MR_OFFSET); + if (ret) + return ret; + + ret = uverbs_copy_from(&attr.length, attrs, + UVERBS_ATTR_REG_DM_MR_LENGTH); + if (ret) + return ret; + + ret = uverbs_copy_from(&attr.access_flags, attrs, + UVERBS_ATTR_REG_DM_MR_ACCESS_FLAGS); + if (ret) + return ret; + + if (!(attr.access_flags & IB_ZERO_BASED)) + return -EINVAL; + + ret = ib_check_mr_access(attr.access_flags); + if (ret) + return ret; + + pd = uverbs_attr_get_obj(attrs, UVERBS_ATTR_REG_DM_MR_PD_HANDLE); + + dm = uverbs_attr_get_obj(attrs, UVERBS_ATTR_REG_DM_MR_DM_HANDLE); + + uobj = uverbs_attr_get(attrs, UVERBS_ATTR_REG_DM_MR_HANDLE)->obj_attr.uobject; + + if (attr.offset > dm->length || attr.length > dm->length || + attr.length > dm->length - attr.offset) + return -EINVAL; + + mr = pd->device->reg_dm_mr(pd, dm, &attr, attrs); + if (IS_ERR(mr)) + return PTR_ERR(mr); + + mr->device = pd->device; + mr->pd = pd; + mr->dm = dm; + mr->uobject = uobj; + atomic_inc(&pd->usecnt); + atomic_inc(&dm->usecnt); + + uobj->object = mr; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_DM_MR_RESP_LKEY, &mr->lkey, + sizeof(mr->lkey)); + if (ret) + goto err_dereg; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_DM_MR_RESP_RKEY, + &mr->rkey, sizeof(mr->rkey)); + if (ret) + goto err_dereg; + + return 0; + +err_dereg: + ib_dereg_mr(mr); + + return ret; +} + +static 
DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_DM_MR_REG, + &UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DM_MR_HANDLE, UVERBS_OBJECT_MR, + UVERBS_ACCESS_NEW, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DM_MR_OFFSET, + UVERBS_ATTR_TYPE(u64), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DM_MR_LENGTH, + UVERBS_ATTR_TYPE(u64), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DM_MR_PD_HANDLE, UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DM_MR_ACCESS_FLAGS, + UVERBS_ATTR_TYPE(u32), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DM_MR_DM_HANDLE, UVERBS_OBJECT_DM, + UVERBS_ACCESS_READ, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_DM_MR_RESP_LKEY, + UVERBS_ATTR_TYPE(u32), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_DM_MR_RESP_RKEY, + UVERBS_ATTR_TYPE(u32), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_MR, + /* 1 is used in order to free the MR after all the MWs */ + &UVERBS_TYPE_ALLOC_IDR(1, uverbs_free_mr), + &UVERBS_METHOD(UVERBS_METHOD_DM_MR_REG)); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 95e3b307c93a..7eff3aeffe01 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1616,12 +1616,16 @@ EXPORT_SYMBOL(ib_resize_cq); int ib_dereg_mr(struct ib_mr *mr) { struct ib_pd *pd = mr->pd; + struct ib_dm *dm = mr->dm; int ret; rdma_restrack_del(&mr->res); ret = mr->device->dereg_mr(mr); - if (!ret) + if (!ret) { atomic_dec(&pd->usecnt); + if (dm) + atomic_dec(&dm->usecnt); + } return ret; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 6806c4f5657a..4bd24c48b1ad 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -321,6 +321,12 @@ struct ib_cq_caps { u16 max_cq_moderation_period; }; +struct ib_dm_mr_attr { + u64 length; + u64 offset; + u32 access_flags; +}; + struct ib_dm_alloc_attr { u64 length; u32 alignment; @@ -1797,6 +1803,8 @@ struct ib_mr { struct list_head qp_entry; /* FR */ }; + struct ib_dm *dm; + /* * Implementation details of the RDMA core, don't use in drivers: */ @@ -2444,6 +2452,9 @@ struct ib_device { struct ib_dm_alloc_attr *attr, struct uverbs_attr_bundle *attrs); int (*dealloc_dm)(struct ib_dm *dm); + struct ib_mr * (*reg_dm_mr)(struct ib_pd *pd, struct ib_dm *dm, + struct ib_dm_mr_attr *attr, + struct uverbs_attr_bundle *attrs); /** * rdma netdev operation * diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index 3d6ac684b8f0..4a4201d997a7 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -408,6 +408,18 @@ static inline int uverbs_attr_get_enum_id(const struct uverbs_attr_bundle *attrs return attr->ptr_attr.enum_id; } +static inline void *uverbs_attr_get_obj(const struct uverbs_attr_bundle *attrs_bundle, + u16 idx) +{ + struct ib_uobject *uobj = + uverbs_attr_get(attrs_bundle, idx)->obj_attr.uobject; + + if (IS_ERR(uobj)) + return uobj; + + return uobj->object; +} + static inline int uverbs_copy_to(const struct uverbs_attr_bundle *attrs_bundle, size_t idx, const void *from, size_t size) { diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h index 6034df2625c6..83e3890eef20 100644 --- a/include/uapi/rdma/ib_user_ioctl_cmds.h +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -115,4 
+115,20 @@ enum uverbs_methods_dm { UVERBS_METHOD_DM_ALLOC, UVERBS_METHOD_DM_FREE, }; + +enum uverbs_attrs_reg_dm_mr_cmd_attr_ids { + UVERBS_ATTR_REG_DM_MR_HANDLE, + UVERBS_ATTR_REG_DM_MR_OFFSET, + UVERBS_ATTR_REG_DM_MR_LENGTH, + UVERBS_ATTR_REG_DM_MR_PD_HANDLE, + UVERBS_ATTR_REG_DM_MR_ACCESS_FLAGS, + UVERBS_ATTR_REG_DM_MR_DM_HANDLE, + UVERBS_ATTR_REG_DM_MR_RESP_LKEY, + UVERBS_ATTR_REG_DM_MR_RESP_RKEY, +}; + +enum uverbs_methods_mr { + UVERBS_METHOD_DM_MR_REG, +}; + #endif From e72bd817aee2bd867a90aac68aca07d99addcb55 Mon Sep 17 00:00:00 2001 From: Ariel Levkovich Date: Thu, 5 Apr 2018 18:53:26 +0300 Subject: [PATCH 195/199] net/mlx5: Query device memory capabilities This patch adds querying of device memory capabilities by the mlx5_core driver during initialization. Device memory capabilities is a new capability type and structure which contains the necessary data that is needed for future device memory allocation. The presence of this new capabilities struct is indicated in the general capabilities struct which is queried first by the driver. If the presence bit is set, the driver will also query the new capabilities struct and save it in the device context. Signed-off-by: Ariel Levkovich Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/net/ethernet/mellanox/mlx5/core/fw.c | 6 ++++++ include/linux/mlx5/device.h | 9 +++++++++ include/linux/mlx5/mlx5_ifc.h | 20 +++++++++++++++++++- 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index 9d11e92fb541..17ec55874714 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -192,6 +192,12 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev) if (MLX5_CAP_GEN(dev, qcam_reg)) mlx5_get_qcam_reg(dev); + if (MLX5_CAP_GEN(dev, device_memory)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_DEV_MEM); + if (err) + return err; + } + return 0; } diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 413df3c11a46..2651691c05fb 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1013,6 +1013,9 @@ enum mlx5_cap_type { MLX5_CAP_RESERVED, MLX5_CAP_VECTOR_CALC, MLX5_CAP_QOS, + MLX5_CAP_DEBUG, + MLX5_CAP_RESERVED_14, + MLX5_CAP_DEV_MEM, /* NUM OF CAP Types */ MLX5_CAP_NUM }; @@ -1161,6 +1164,12 @@ enum mlx5_qcam_feature_groups { #define MLX5_CAP64_FPGA(mdev, cap) \ MLX5_GET64(fpga_cap, (mdev)->caps.fpga, cap) +#define MLX5_CAP_DEV_MEM(mdev, cap)\ + MLX5_GET(device_mem_cap, mdev->caps.hca_cur[MLX5_CAP_DEV_MEM], cap) + +#define MLX5_CAP64_DEV_MEM(mdev, cap)\ + MLX5_GET64(device_mem_cap, mdev->caps.hca_cur[MLX5_CAP_DEV_MEM], cap) + enum { MLX5_CMD_STAT_OK = 0x0, MLX5_CMD_STAT_INT_ERR = 0x1, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 64963fd2cd9b..13c3bf25753b 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -658,6 +658,24 @@ struct mlx5_ifc_roce_cap_bits { u8 reserved_at_100[0x700]; }; +struct mlx5_ifc_device_mem_cap_bits { + u8 memic[0x1]; + u8 reserved_at_1[0x1f]; + + u8 reserved_at_20[0xb]; + u8 log_min_memic_alloc_size[0x5]; + u8 reserved_at_30[0x8]; + u8 log_max_memic_addr_alignment[0x8]; + + u8 memic_bar_start_addr[0x40]; + + u8 memic_bar_size[0x20]; + + u8 max_memic_size[0x20]; + + u8 reserved_at_c0[0x740]; +}; + enum { MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_1_BYTE = 0x0, MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_2_BYTES = 0x2, @@ -872,7 +890,7 @@ struct mlx5_ifc_cmd_hca_cap_bits 
{ u8 ets[0x1]; u8 nic_flow_table[0x1]; u8 eswitch_flow_table[0x1]; - u8 early_vf_enable[0x1]; + u8 device_memory[0x1]; u8 mcam_reg[0x1]; u8 pcam_reg[0x1]; u8 local_ca_ack_delay[0x5]; From 24da00164f7a9c247d2224a54494d0e955199630 Mon Sep 17 00:00:00 2001 From: Ariel Levkovich Date: Thu, 5 Apr 2018 18:53:27 +0300 Subject: [PATCH 196/199] IB/mlx5: Device memory support in mlx5_ib This patch adds the mlx5_ib driver implementation for the device memory allocation API. It implements the ib_device callbacks for allocation and deallocation operations as well as a new mmap command support which allows mapping an allocated device memory to a VMA. The change also adds reporting of device memory maximum size and alignment parameters reported in device capabilities. The allocation/deallocation operations are using new firmware commands to allocate MEMIC memory on the device. Signed-off-by: Ariel Levkovich Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/cmd.c | 106 +++++++++++++++++ drivers/infiniband/hw/mlx5/cmd.h | 4 + drivers/infiniband/hw/mlx5/main.c | 143 ++++++++++++++++++++++- drivers/infiniband/hw/mlx5/mlx5_ib.h | 35 +++++- include/linux/mlx5/mlx5_ifc.h | 55 +++++++++ include/uapi/rdma/mlx5-abi.h | 1 + include/uapi/rdma/mlx5_user_ioctl_cmds.h | 6 +- 7 files changed, 347 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c index 6f6712f87a73..55a227cc8609 100644 --- a/drivers/infiniband/hw/mlx5/cmd.c +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -66,3 +66,109 @@ int mlx5_cmd_modify_cong_params(struct mlx5_core_dev *dev, return mlx5_cmd_exec(dev, in, in_size, out, sizeof(out)); } + +int mlx5_cmd_alloc_memic(struct mlx5_memic *memic, phys_addr_t *addr, + u64 length, u32 alignment) +{ + struct mlx5_core_dev *dev = memic->dev; + u64 num_memic_hw_pages = MLX5_CAP_DEV_MEM(dev, memic_bar_size) + >> PAGE_SHIFT; + u64 hw_start_addr = MLX5_CAP64_DEV_MEM(dev, memic_bar_start_addr); + u32 max_alignment = MLX5_CAP_DEV_MEM(dev, log_max_memic_addr_alignment); + u32 num_pages = DIV_ROUND_UP(length, PAGE_SIZE); + u32 out[MLX5_ST_SZ_DW(alloc_memic_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_memic_in)] = {}; + u32 mlx5_alignment; + u64 page_idx = 0; + int ret = 0; + + if (!length || (length & MLX5_MEMIC_ALLOC_SIZE_MASK)) + return -EINVAL; + + /* mlx5 device sets alignment as 64*2^driver_value + * so normalizing is needed. + */ + mlx5_alignment = (alignment < MLX5_MEMIC_BASE_ALIGN) ? 
0 : + alignment - MLX5_MEMIC_BASE_ALIGN; + if (mlx5_alignment > max_alignment) + return -EINVAL; + + MLX5_SET(alloc_memic_in, in, opcode, MLX5_CMD_OP_ALLOC_MEMIC); + MLX5_SET(alloc_memic_in, in, range_size, num_pages * PAGE_SIZE); + MLX5_SET(alloc_memic_in, in, memic_size, length); + MLX5_SET(alloc_memic_in, in, log_memic_addr_alignment, + mlx5_alignment); + + do { + spin_lock(&memic->memic_lock); + page_idx = bitmap_find_next_zero_area(memic->memic_alloc_pages, + num_memic_hw_pages, + page_idx, + num_pages, 0); + + if (page_idx + num_pages <= num_memic_hw_pages) + bitmap_set(memic->memic_alloc_pages, + page_idx, num_pages); + else + ret = -ENOMEM; + + spin_unlock(&memic->memic_lock); + + if (ret) + return ret; + + MLX5_SET64(alloc_memic_in, in, range_start_addr, + hw_start_addr + (page_idx * PAGE_SIZE)); + + ret = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (ret) { + spin_lock(&memic->memic_lock); + bitmap_clear(memic->memic_alloc_pages, + page_idx, num_pages); + spin_unlock(&memic->memic_lock); + + if (ret == -EAGAIN) { + page_idx++; + continue; + } + + return ret; + } + + *addr = pci_resource_start(dev->pdev, 0) + + MLX5_GET64(alloc_memic_out, out, memic_start_addr); + + return ret; + } while (page_idx < num_memic_hw_pages); + + return ret; +} + +int mlx5_cmd_dealloc_memic(struct mlx5_memic *memic, u64 addr, u64 length) +{ + struct mlx5_core_dev *dev = memic->dev; + u64 hw_start_addr = MLX5_CAP64_DEV_MEM(dev, memic_bar_start_addr); + u32 num_pages = DIV_ROUND_UP(length, PAGE_SIZE); + u32 out[MLX5_ST_SZ_DW(dealloc_memic_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(dealloc_memic_in)] = {0}; + u64 start_page_idx; + int err; + + addr -= pci_resource_start(dev->pdev, 0); + start_page_idx = (addr - hw_start_addr) >> PAGE_SHIFT; + + MLX5_SET(dealloc_memic_in, in, opcode, MLX5_CMD_OP_DEALLOC_MEMIC); + MLX5_SET64(dealloc_memic_in, in, memic_start_addr, addr); + MLX5_SET(dealloc_memic_in, in, memic_size, length); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + + if (!err) { + spin_lock(&memic->memic_lock); + bitmap_clear(memic->memic_alloc_pages, + start_page_idx, num_pages); + spin_unlock(&memic->memic_lock); + } + + return err; +} diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h index 78ffded7cc2c..e7206c8a8011 100644 --- a/drivers/infiniband/hw/mlx5/cmd.h +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -33,6 +33,7 @@ #ifndef MLX5_IB_CMD_H #define MLX5_IB_CMD_H +#include "mlx5_ib.h" #include #include @@ -41,4 +42,7 @@ int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point, void *out, int out_size); int mlx5_cmd_modify_cong_params(struct mlx5_core_dev *mdev, void *in, int in_size); +int mlx5_cmd_alloc_memic(struct mlx5_memic *memic, phys_addr_t *addr, + u64 length, u32 alignment); +int mlx5_cmd_dealloc_memic(struct mlx5_memic *memic, u64 addr, u64 length); #endif /* MLX5_IB_CMD_H */ diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 31295e39896c..e17eac32394c 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -38,6 +38,7 @@ #include #include #include +#include #if defined(CONFIG_X86) #include #endif @@ -891,6 +892,11 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->raw_packet_caps |= IB_RAW_PACKET_CAP_SCATTER_FCS; } + if (MLX5_CAP_DEV_MEM(mdev, memic)) { + props->max_dm_size = + MLX5_CAP_DEV_MEM(mdev, max_memic_size); + } + if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS)) props->device_cap_flags |= 
IB_DEVICE_MANAGED_FLOW_STEERING; @@ -2014,6 +2020,8 @@ static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd) return "best effort WC"; case MLX5_IB_MMAP_NC_PAGE: return "NC"; + case MLX5_IB_MMAP_DEVICE_MEM: + return "Device Memory"; default: return NULL; } @@ -2172,6 +2180,34 @@ free_bfreg: return err; } +static int dm_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + struct mlx5_ib_ucontext *mctx = to_mucontext(context); + struct mlx5_ib_dev *dev = to_mdev(context->device); + u16 page_idx = get_extended_index(vma->vm_pgoff); + size_t map_size = vma->vm_end - vma->vm_start; + u32 npages = map_size >> PAGE_SHIFT; + phys_addr_t pfn; + pgprot_t prot; + + if (find_next_zero_bit(mctx->dm_pages, page_idx + npages, page_idx) != + page_idx + npages) + return -EINVAL; + + pfn = ((pci_resource_start(dev->mdev->pdev, 0) + + MLX5_CAP64_DEV_MEM(dev->mdev, memic_bar_start_addr)) >> + PAGE_SHIFT) + + page_idx; + prot = pgprot_writecombine(vma->vm_page_prot); + vma->vm_page_prot = prot; + + if (io_remap_pfn_range(vma, vma->vm_start, pfn, map_size, + vma->vm_page_prot)) + return -EAGAIN; + + return mlx5_ib_set_vma_data(vma, mctx); +} + static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) { struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); @@ -2216,6 +2252,9 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm case MLX5_IB_MMAP_CLOCK_INFO: return mlx5_ib_mmap_clock_info_page(dev, vma, context); + case MLX5_IB_MMAP_DEVICE_MEM: + return dm_mmap(ibcontext, vma); + default: return -EINVAL; } @@ -2223,6 +2262,87 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm return 0; } +struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_dm_alloc_attr *attr, + struct uverbs_attr_bundle *attrs) +{ + u64 act_size = roundup(attr->length, MLX5_MEMIC_BASE_SIZE); + struct mlx5_memic *memic = &to_mdev(ibdev)->memic; + phys_addr_t memic_addr; + struct mlx5_ib_dm *dm; + u64 start_offset; + u32 page_idx; + int err; + + dm = kzalloc(sizeof(*dm), GFP_KERNEL); + if (!dm) + return ERR_PTR(-ENOMEM); + + mlx5_ib_dbg(to_mdev(ibdev), "alloc_memic req: user_length=0x%llx act_length=0x%llx log_alignment=%d\n", + attr->length, act_size, attr->alignment); + + err = mlx5_cmd_alloc_memic(memic, &memic_addr, + act_size, attr->alignment); + if (err) + goto err_free; + + start_offset = memic_addr & ~PAGE_MASK; + page_idx = (memic_addr - pci_resource_start(memic->dev->pdev, 0) - + MLX5_CAP64_DEV_MEM(memic->dev, memic_bar_start_addr)) >> + PAGE_SHIFT; + + err = uverbs_copy_to(attrs, + MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET, + &start_offset, sizeof(start_offset)); + if (err) + goto err_dealloc; + + err = uverbs_copy_to(attrs, + MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX, + &page_idx, sizeof(page_idx)); + if (err) + goto err_dealloc; + + bitmap_set(to_mucontext(context)->dm_pages, page_idx, + DIV_ROUND_UP(act_size, PAGE_SIZE)); + + dm->dev_addr = memic_addr; + + return &dm->ibdm; + +err_dealloc: + mlx5_cmd_dealloc_memic(memic, memic_addr, + act_size); +err_free: + kfree(dm); + return ERR_PTR(err); +} + +int mlx5_ib_dealloc_dm(struct ib_dm *ibdm) +{ + struct mlx5_memic *memic = &to_mdev(ibdm->device)->memic; + struct mlx5_ib_dm *dm = to_mdm(ibdm); + u64 act_size = roundup(dm->ibdm.length, MLX5_MEMIC_BASE_SIZE); + u32 page_idx; + int ret; + + ret = mlx5_cmd_dealloc_memic(memic, dm->dev_addr, act_size); + if (ret) + return ret; + + page_idx = (dm->dev_addr - 
pci_resource_start(memic->dev->pdev, 0) - + MLX5_CAP64_DEV_MEM(memic->dev, memic_bar_start_addr)) >> + PAGE_SHIFT; + bitmap_clear(to_mucontext(ibdm->uobject->context)->dm_pages, + page_idx, + DIV_ROUND_UP(act_size, PAGE_SIZE)); + + kfree(dm); + + return 0; +} + static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) @@ -4834,13 +4954,22 @@ static void mlx5_ib_cleanup_multiport_master(struct mlx5_ib_dev *dev) mlx5_nic_vport_disable_roce(dev->mdev); } +ADD_UVERBS_ATTRIBUTES_SIMPLE(mlx5_ib_dm, UVERBS_OBJECT_DM, + UVERBS_METHOD_DM_ALLOC, + &UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET, + UVERBS_ATTR_TYPE(u64), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX, + UVERBS_ATTR_TYPE(u16), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); + ADD_UVERBS_ATTRIBUTES_SIMPLE(mlx5_ib_flow_action, UVERBS_OBJECT_FLOW_ACTION, UVERBS_METHOD_FLOW_ACTION_ESP_CREATE, &UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS, UVERBS_ATTR_TYPE(u64), UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); -#define NUM_TREES 1 +#define NUM_TREES 2 static int populate_specs_root(struct mlx5_ib_dev *dev) { const struct uverbs_object_tree_def *default_root[NUM_TREES + 1] = { @@ -4851,6 +4980,10 @@ static int populate_specs_root(struct mlx5_ib_dev *dev) !WARN_ON(num_trees >= ARRAY_SIZE(default_root))) default_root[num_trees++] = &mlx5_ib_flow_action; + if (MLX5_CAP_DEV_MEM(dev->mdev, memic) && + !WARN_ON(num_trees >= ARRAY_SIZE(default_root))) + default_root[num_trees++] = &mlx5_ib_dm; + dev->ib_dev.specs_root = uverbs_alloc_spec_tree(num_trees, default_root); @@ -4925,6 +5058,9 @@ int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) INIT_LIST_HEAD(&dev->qp_list); spin_lock_init(&dev->reset_flow_resource_lock); + spin_lock_init(&dev->memic.memic_lock); + dev->memic.dev = mdev; + #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING err = init_srcu_struct(&dev->mr_srcu); if (err) @@ -5087,6 +5223,11 @@ int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); } + if (MLX5_CAP_DEV_MEM(mdev, memic)) { + dev->ib_dev.alloc_dm = mlx5_ib_alloc_dm; + dev->ib_dev.dealloc_dm = mlx5_ib_dealloc_dm; + } + dev->ib_dev.create_flow = mlx5_ib_create_flow; dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow; dev->ib_dev.uverbs_ex_cmd_mask |= diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 2b27ddafc354..3e9b6548a96b 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -45,6 +45,7 @@ #include #include #include +#include #define mlx5_ib_dbg(dev, format, arg...) 
\ pr_debug("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ @@ -108,6 +109,16 @@ enum { MLX5_IB_INVALID_BFREG = BIT(31), }; +enum { + MLX5_MAX_MEMIC_PAGES = 0x100, + MLX5_MEMIC_ALLOC_SIZE_MASK = 0x3f, +}; + +enum { + MLX5_MEMIC_BASE_ALIGN = 6, + MLX5_MEMIC_BASE_SIZE = 1 << MLX5_MEMIC_BASE_ALIGN, +}; + struct mlx5_ib_vma_private_data { struct list_head list; struct vm_area_struct *vma; @@ -131,6 +142,7 @@ struct mlx5_ib_ucontext { struct mutex vma_private_list_mutex; u64 lib_caps; + DECLARE_BITMAP(dm_pages, MLX5_MAX_MEMIC_PAGES); }; static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext) @@ -521,6 +533,11 @@ enum mlx5_ib_mtt_access_flags { MLX5_IB_MTT_WRITE = (1 << 1), }; +struct mlx5_ib_dm { + struct ib_dm ibdm; + phys_addr_t dev_addr; +}; + #define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE) struct mlx5_ib_mr { @@ -784,6 +801,12 @@ struct mlx5_ib_flow_action { }; }; +struct mlx5_memic { + struct mlx5_core_dev *dev; + spinlock_t memic_lock; + DECLARE_BITMAP(memic_alloc_pages, MLX5_MAX_MEMIC_PAGES); +}; + struct mlx5_ib_dev { struct ib_device ib_dev; struct mlx5_core_dev *mdev; @@ -830,6 +853,7 @@ struct mlx5_ib_dev { u8 umr_fence; struct list_head ib_dev_list; u64 sys_image_guid; + struct mlx5_memic memic; }; static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) @@ -897,6 +921,11 @@ static inline struct mlx5_ib_srq *to_mibsrq(struct mlx5_core_srq *msrq) return container_of(msrq, struct mlx5_ib_srq, msrq); } +static inline struct mlx5_ib_dm *to_mdm(struct ib_dm *ibdm) +{ + return container_of(ibdm, struct mlx5_ib_dm, ibdm); +} + static inline struct mlx5_ib_mr *to_mmr(struct ib_mr *ibmr) { return container_of(ibmr, struct mlx5_ib_mr, ibmr); @@ -1041,7 +1070,11 @@ struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device, struct ib_udata *udata); int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); bool mlx5_ib_dc_atomic_is_supported(struct mlx5_ib_dev *dev); - +struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_dm_alloc_attr *attr, + struct uverbs_attr_bundle *attrs); +int mlx5_ib_dealloc_dm(struct ib_dm *ibdm); #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 13c3bf25753b..a64e59b65a33 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -92,6 +92,8 @@ enum { MLX5_CMD_OP_DESTROY_MKEY = 0x202, MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS = 0x203, MLX5_CMD_OP_PAGE_FAULT_RESUME = 0x204, + MLX5_CMD_OP_ALLOC_MEMIC = 0x205, + MLX5_CMD_OP_DEALLOC_MEMIC = 0x206, MLX5_CMD_OP_CREATE_EQ = 0x301, MLX5_CMD_OP_DESTROY_EQ = 0x302, MLX5_CMD_OP_QUERY_EQ = 0x303, @@ -8886,4 +8888,57 @@ struct mlx5_ifc_destroy_vport_lag_in_bits { u8 reserved_at_40[0x40]; }; +struct mlx5_ifc_alloc_memic_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_30[0x20]; + + u8 reserved_at_40[0x18]; + u8 log_memic_addr_alignment[0x8]; + + u8 range_start_addr[0x40]; + + u8 range_size[0x20]; + + u8 memic_size[0x20]; +}; + +struct mlx5_ifc_alloc_memic_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 memic_start_addr[0x40]; +}; + +struct mlx5_ifc_dealloc_memic_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; + + u8 memic_start_addr[0x40]; + 
+ u8 memic_size[0x20]; + + u8 reserved_at_e0[0x20]; +}; + +struct mlx5_ifc_dealloc_memic_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + #endif /* MLX5_IFC_H */ diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index d86a65b993f8..cb4a02c4a1ce 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -430,6 +430,7 @@ enum mlx5_ib_mmap_cmd { MLX5_IB_MMAP_CORE_CLOCK = 5, MLX5_IB_MMAP_ALLOC_WC = 6, MLX5_IB_MMAP_CLOCK_INFO = 7, + MLX5_IB_MMAP_DEVICE_MEM = 8, }; enum { diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 521813d5348c..f7d685ef2d1f 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -40,5 +40,9 @@ enum mlx5_ib_create_flow_action_attrs { MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS = (1U << UVERBS_ID_NS_SHIFT), }; -#endif +enum mlx5_ib_alloc_dm_attrs { + MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX, +}; +#endif From cdbd0d2bae14566cf875595180b91527b4431df8 Mon Sep 17 00:00:00 2001 From: Ariel Levkovich Date: Thu, 5 Apr 2018 18:53:28 +0300 Subject: [PATCH 197/199] net/mlx5: Mkey creation command adjustments This change updates the mlx5 interface to create mkey on the device. The updates in the command mailbox include increasing the access mode type field to 5 bits in order to support additional types such as MLX5_MKC_ACCESS_MODE_MEMIC which represents device memory access type and will be used when registering MR on allocated device memory. All the places that use the old access mode format are adjusted as well. Signed-off-by: Ariel Levkovich Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/cmd.c | 16 +++++++--------- drivers/infiniband/hw/mlx5/mr.c | 13 ++++++++----- .../net/ethernet/mellanox/mlx5/core/en_common.c | 2 +- .../net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- .../net/ethernet/mellanox/mlx5/core/fpga/conn.c | 2 +- include/linux/mlx5/mlx5_ifc.h | 9 +++++++-- 6 files changed, 25 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c index 55a227cc8609..188512bf46e6 100644 --- a/drivers/infiniband/hw/mlx5/cmd.c +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -99,23 +99,21 @@ int mlx5_cmd_alloc_memic(struct mlx5_memic *memic, phys_addr_t *addr, MLX5_SET(alloc_memic_in, in, log_memic_addr_alignment, mlx5_alignment); - do { + while (page_idx < num_memic_hw_pages) { spin_lock(&memic->memic_lock); page_idx = bitmap_find_next_zero_area(memic->memic_alloc_pages, num_memic_hw_pages, page_idx, num_pages, 0); - if (page_idx + num_pages <= num_memic_hw_pages) + if (page_idx < num_memic_hw_pages) bitmap_set(memic->memic_alloc_pages, page_idx, num_pages); - else - ret = -ENOMEM; spin_unlock(&memic->memic_lock); - if (ret) - return ret; + if (page_idx >= num_memic_hw_pages) + break; MLX5_SET64(alloc_memic_in, in, range_start_addr, hw_start_addr + (page_idx * PAGE_SIZE)); @@ -138,10 +136,10 @@ int mlx5_cmd_alloc_memic(struct mlx5_memic *memic, phys_addr_t *addr, *addr = pci_resource_start(dev->pdev, 0) + MLX5_GET64(alloc_memic_out, out, memic_start_addr); - return ret; - } while (page_idx < num_memic_hw_pages); + return 0; + } - return ret; + return -ENOMEM; } int mlx5_cmd_dealloc_memic(struct mlx5_memic *memic, u64 addr, u64 length) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 
60683090d138..d3f7ce97c3a5 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -204,7 +204,9 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num) MLX5_SET(mkc, mkc, free, 1); MLX5_SET(mkc, mkc, umr_en, 1); - MLX5_SET(mkc, mkc, access_mode, ent->access_mode); + MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3); + MLX5_SET(mkc, mkc, access_mode_4_2, + (ent->access_mode >> 2) & 0x7); MLX5_SET(mkc, mkc, qpn, 0xffffff); MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt); @@ -804,7 +806,7 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); - MLX5_SET(mkc, mkc, access_mode, MLX5_MKC_ACCESS_MODE_PA); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA); MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC)); MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE)); MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ)); @@ -1171,7 +1173,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd, mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); MLX5_SET(mkc, mkc, free, !populate); - MLX5_SET(mkc, mkc, access_mode, MLX5_MKC_ACCESS_MODE_MTT); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); MLX5_SET(mkc, mkc, a, !!(access_flags & IB_ACCESS_REMOTE_ATOMIC)); MLX5_SET(mkc, mkc, rw, !!(access_flags & IB_ACCESS_REMOTE_WRITE)); MLX5_SET(mkc, mkc, rr, !!(access_flags & IB_ACCESS_REMOTE_READ)); @@ -1668,7 +1670,8 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, goto err_free_in; } - MLX5_SET(mkc, mkc, access_mode, mr->access_mode); + MLX5_SET(mkc, mkc, access_mode_1_0, mr->access_mode & 0x3); + MLX5_SET(mkc, mkc, access_mode_4_2, (mr->access_mode >> 2) & 0x7); MLX5_SET(mkc, mkc, umr_en, 1); mr->ibmr.device = pd->device; @@ -1749,7 +1752,7 @@ struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); MLX5_SET(mkc, mkc, umr_en, 1); MLX5_SET(mkc, mkc, lr, 1); - MLX5_SET(mkc, mkc, access_mode, MLX5_MKC_ACCESS_MODE_KLMS); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS); MLX5_SET(mkc, mkc, en_rinval, !!((type == IB_MW_TYPE_2))); MLX5_SET(mkc, mkc, qpn, 0xffffff); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c index 784e282803db..db3278cc052b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c @@ -70,7 +70,7 @@ static int mlx5e_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, return -ENOMEM; mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); - MLX5_SET(mkc, mkc, access_mode, MLX5_MKC_ACCESS_MODE_PA); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA); MLX5_SET(mkc, mkc, lw, 1); MLX5_SET(mkc, mkc, lr, 1); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 2ee4ffbddd5f..7bafa78a6c37 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -360,7 +360,7 @@ static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev, MLX5_SET(mkc, mkc, umr_en, 1); MLX5_SET(mkc, mkc, lw, 1); MLX5_SET(mkc, mkc, lr, 1); - MLX5_SET(mkc, mkc, access_mode, MLX5_MKC_ACCESS_MODE_MTT); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); MLX5_SET(mkc, mkc, qpn, 0xffffff); MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.pdn); diff --git 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
index e6175f8ac0e4..de7fe087d6fe 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
@@ -232,7 +232,7 @@ static int mlx5_fpga_conn_create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
 		return -ENOMEM;
 
 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
-	MLX5_SET(mkc, mkc, access_mode, MLX5_MKC_ACCESS_MODE_PA);
+	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
 	MLX5_SET(mkc, mkc, lw, 1);
 	MLX5_SET(mkc, mkc, lr, 1);
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index a64e59b65a33..fa6f134c85d7 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -2720,12 +2720,17 @@ enum {
 	MLX5_MKC_ACCESS_MODE_MTT   = 0x1,
 	MLX5_MKC_ACCESS_MODE_KLMS  = 0x2,
 	MLX5_MKC_ACCESS_MODE_KSM   = 0x3,
+	MLX5_MKC_ACCESS_MODE_MEMIC = 0x5,
 };
 
 struct mlx5_ifc_mkc_bits {
 	u8         reserved_at_0[0x1];
 	u8         free[0x1];
-	u8         reserved_at_2[0xd];
+	u8         reserved_at_2[0x1];
+	u8         access_mode_4_2[0x3];
+	u8         reserved_at_6[0x7];
+	u8         relaxed_ordering_write[0x1];
+	u8         reserved_at_e[0x1];
 	u8         small_fence_on_rdma_read_response[0x1];
 	u8         umr_en[0x1];
 	u8         a[0x1];
@@ -2733,7 +2738,7 @@ struct mlx5_ifc_mkc_bits {
 	u8         rr[0x1];
 	u8         lw[0x1];
 	u8         lr[0x1];
-	u8         access_mode[0x2];
+	u8         access_mode_1_0[0x2];
 
 	u8         reserved_at_18[0x8];
 	u8         qpn[0x18];

From 6c29f57ea4751c4887627521027cd72aba831a97 Mon Sep 17 00:00:00 2001
From: Ariel Levkovich
Date: Thu, 5 Apr 2018 18:53:29 +0300
Subject: [PATCH 198/199] IB/mlx5: Device memory mr registration support

Add the mlx5_ib driver implementation of the reg_dm_mr callback, which
allows registering device memory (DM) as an MR for local and remote
access.
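As an aside (editor's sketch, not part of the patch): the DM object
stores the absolute device address of its allocation inside BAR 0,
while the mkey context takes an address relative to the start of the
BAR. The arithmetic with made-up offsets; only the final subtraction
mirrors the code added below:

	/* Illustrative values only. */
	u64 bar0       = pci_resource_start(dev->mdev->pdev, 0);
	u64 dev_addr   = bar0 + 0x1000;           /* saved at DM allocation time */
	u64 memic_addr = dev_addr + attr->offset; /* start of the region to map */

	/* The mkey start_addr is the region's offset within BAR 0. */
	MLX5_SET64(mkc, mkc, start_addr, memic_addr - bar0);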
Signed-off-by: Ariel Levkovich
Signed-off-by: Leon Romanovsky
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/mlx5/main.c    |  1 +
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  9 ++++
 drivers/infiniband/hw/mlx5/mr.c      | 74 ++++++++++++++++++++++++++++
 3 files changed, 84 insertions(+)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index e17eac32394c..4ead79513e3a 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -5226,6 +5226,7 @@ int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
 	if (MLX5_CAP_DEV_MEM(mdev, memic)) {
 		dev->ib_dev.alloc_dm = mlx5_ib_alloc_dm;
 		dev->ib_dev.dealloc_dm = mlx5_ib_dealloc_dm;
+		dev->ib_dev.reg_dm_mr = mlx5_ib_reg_dm_mr;
 	}
 
 	dev->ib_dev.create_flow	= mlx5_ib_create_flow;
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 3e9b6548a96b..49a1aa0ff429 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -540,6 +540,12 @@ struct mlx5_ib_dm {
 
 #define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)
 
+#define MLX5_IB_DM_ALLOWED_ACCESS (IB_ACCESS_LOCAL_WRITE   |\
+				   IB_ACCESS_REMOTE_WRITE  |\
+				   IB_ACCESS_REMOTE_READ   |\
+				   IB_ACCESS_REMOTE_ATOMIC |\
+				   IB_ZERO_BASED)
+
 struct mlx5_ib_mr {
 	struct ib_mr		ibmr;
 	void			*descs;
@@ -1075,6 +1081,9 @@ struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev,
 			       struct ib_dm_alloc_attr *attr,
 			       struct uverbs_attr_bundle *attrs);
 int mlx5_ib_dealloc_dm(struct ib_dm *ibdm);
+struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
+				struct ib_dm_mr_attr *attr,
+				struct uverbs_attr_bundle *attrs);
 
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev);
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index d3f7ce97c3a5..d6350e7a2a47 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1229,6 +1229,80 @@ static void set_mr_fileds(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
 	mr->access_flags = access_flags;
 }
 
+static struct ib_mr *mlx5_ib_get_memic_mr(struct ib_pd *pd, u64 memic_addr,
+					  u64 length, int acc)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
+	struct mlx5_core_dev *mdev = dev->mdev;
+	struct mlx5_ib_mr *mr;
+	void *mkc;
+	u32 *in;
+	int err;
+
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	in = kzalloc(inlen, GFP_KERNEL);
+	if (!in) {
+		err = -ENOMEM;
+		goto err_free;
+	}
+
+	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+
+	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MEMIC & 0x3);
+	MLX5_SET(mkc, mkc, access_mode_4_2,
+		 (MLX5_MKC_ACCESS_MODE_MEMIC >> 2) & 0x7);
+	MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
+	MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
+	MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
+	MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
+	MLX5_SET(mkc, mkc, lr, 1);
+
+	MLX5_SET64(mkc, mkc, len, length);
+	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
+	MLX5_SET(mkc, mkc, qpn, 0xffffff);
+	MLX5_SET64(mkc, mkc, start_addr,
+		   memic_addr - pci_resource_start(dev->mdev->pdev, 0));
+
+	err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, inlen);
+	if (err)
+		goto err_in;
+
+	kfree(in);
+
+	mr->umem = NULL;
+	set_mr_fileds(dev, mr, 0, length, acc);
+
+	return &mr->ibmr;
+
+err_in:
+	kfree(in);
+
+err_free:
+	kfree(mr);
+
+	return ERR_PTR(err);
+}
+
+struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
+				struct ib_dm_mr_attr *attr,
+				struct uverbs_attr_bundle *attrs)
+{
+	struct mlx5_ib_dm *mdm = to_mdm(dm);
+	u64 memic_addr;
+
+	if (attr->access_flags & ~MLX5_IB_DM_ALLOWED_ACCESS)
+		return ERR_PTR(-EINVAL);
+
+	memic_addr = mdm->dev_addr + attr->offset;
+
+	return mlx5_ib_get_memic_mr(pd, memic_addr, attr->length,
+				    attr->access_flags);
+}
+
 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				  u64 virt_addr, int access_flags,
 				  struct ib_udata *udata)

From efc365e7290d040fbd43f60b0e97653489a739d4 Mon Sep 17 00:00:00 2001
From: Mikhail Malygin
Date: Mon, 2 Apr 2018 12:26:59 +0300
Subject: [PATCH 199/199] IB/rxe: Fix for oops in rxe_register_device on ppc64le arch

On the ppc64le arch, the rxe_add command causes an oops in the kernel log:

[ 92.495140] Oops: Kernel access of bad area, sig: 11 [#1]
[ 92.499710] SMP NR_CPUS=2048 NUMA pSeries
[ 92.499792] Modules linked in: ipt_MASQUERADE(E) nf_nat_masquerade_ipv4(E) nf_conntrack_netlink(E) nfnetlink(E) xfrm_user(E) iptable_nat(E) nf_conntrack_ipv4(E) nf_defrag_ipv4(E) nf_nat_ipv4(E) xt_addrtype(E) iptable_filter(E) ip_tables(E) xt_conntrack(E) x_tables(E) nf_nat(E) nf_conntrack(E) br_netfilter(E) bridge(E) stp(E) llc(E) overlay(E) af_packet(E) rpcrdma(E) ib_isert(E) iscsi_target_mod(E) ib_iser(E) libiscsi(E) ib_srpt(E) target_core_mod(E) ib_srp(E) ib_ipoib(E) rdma_ucm(E) ib_ucm(E) ib_uverbs(E) ib_umad(E) bochs_drm(E) ttm(E) drm_kms_helper(E) syscopyarea(E) sysfillrect(E) sysimgblt(E) fb_sys_fops(E) drm(E) agpgart(E) virtio_rng(E) virtio_console(E) rtc_generic(E) dm_ec(OEN) ttln_rdma(OEN) rdma_cm(E) configfs(E) iw_cm(E) ib_cm(E) rdma_rxe(E) ip6_udp_tunnel(E) udp_tunnel(E) ib_core(E) qla2xxx(E)
[ 92.499832] scsi_transport_fc(E) nvme_fc(E) nvme_fabrics(E) nvme_core(E) ipmi_watchdog(E) ipmi_ssif(E) ipmi_poweroff(E) ipmi_powernv(EX) ipmi_devintf(E) ipmi_msghandler(E) dummy(E) ext4(E) crc16(E) jbd2(E) mbcache(E) dm_service_time(E) scsi_transport_iscsi(E) sd_mod(E) sr_mod(E) cdrom(E) hid_generic(E) usbhid(E) virtio_blk(E) virtio_scsi(E) virtio_net(E) ibmvscsi(EX) scsi_transport_srp(E) xhci_pci(E) xhci_hcd(E) usbcore(E) usb_common(E) virtio_pci(E) virtio_ring(E) virtio(E) sunrpc(E) dm_mirror(E) dm_region_hash(E) dm_log(E) sg(E) dm_multipath(E) dm_mod(E) scsi_dh_rdac(E) scsi_dh_emc(E) scsi_dh_alua(E) scsi_mod(E) autofs4(E)
[ 92.499834] Supported: No, Unsupported modules are loaded
[ 92.499839] CPU: 3 PID: 5576 Comm: sh Tainted: G OE NX 4.4.120-ttln.17-default #1
[ 92.499841] task: c0000000afe8a490 ti: c0000000beba8000 task.ti: c0000000beba8000
[ 92.499842] NIP: c00000000008ba3c LR: c000000000027644 CTR: c00000000008ba10
[ 92.499844] REGS: c0000000bebab750 TRAP: 0300 Tainted: G OE NX (4.4.120-ttln.17-default)
[ 92.499850] MSR: 8000000000009033 CR: 28424428 XER: 20000000
[ 92.499871] CFAR: 0000000000002424 DAR: 0000000000000208 DSISR: 40000000 SOFTE: 1
GPR00: c000000000027644 c0000000bebab9d0 c000000000f09700 0000000000000000
GPR04: d0000000043d7192 0000000000000002 000000000000001a fffffffffffffffe
GPR08: 000000000000009c c00000000008ba10 d0000000043e5848 d0000000043d3828
GPR12: c00000000008ba10 c000000007a02400 0000000010062e38 0000010020388860
GPR16: 0000000000000000 0000000000000000 00000100203885f0 00000000100f6c98
GPR20: c0000000b3f1fcc0 c0000000b3f1fc48 c0000000b3f1fbd0 c0000000b3f1fb58
GPR24: c0000000b3f1fae0 c0000000b3f1fa68 00000000000005dc c0000000b3f1f9f0
GPR28: d0000000043e5848 c0000000b3f1f900 c0000000b3f1f320 c0000000b3f1f000
[ 92.499881] NIP [c00000000008ba3c] dma_get_required_mask_pSeriesLP+0x2c/0x1a0
[ 92.499885] LR [c000000000027644] dma_get_required_mask+0x44/0xac
[ 92.499886] Call Trace:
[ 92.499891] [c0000000bebab9d0] [c0000000bebaba30] 0xc0000000bebaba30 (unreliable)
[ 92.499894] [c0000000bebaba10] [c000000000027644] dma_get_required_mask+0x44/0xac
[ 92.499904] [c0000000bebaba30] [d0000000043cb4b4] rxe_register_device+0xc4/0x430 [rdma_rxe]
[ 92.499910] [c0000000bebabab0] [d0000000043c06c8] rxe_add+0x448/0x4e0 [rdma_rxe]
[ 92.499915] [c0000000bebabb30] [d0000000043d28dc] rxe_net_add+0x4c/0xf0 [rdma_rxe]
[ 92.499921] [c0000000bebabb60] [d0000000043d305c] rxe_param_set_add+0x6c/0x1ac [rdma_rxe]
[ 92.499924] [c0000000bebabbf0] [c0000000000e78c0] param_attr_store+0xa0/0x180
[ 92.499927] [c0000000bebabc70] [c0000000000e6448] module_attr_store+0x48/0x70
[ 92.499932] [c0000000bebabc90] [c000000000391f60] sysfs_kf_write+0x70/0xb0
[ 92.499935] [c0000000bebabcb0] [c000000000390f1c] kernfs_fop_write+0x18c/0x1e0
[ 92.499939] [c0000000bebabd00] [c0000000002e22ac] __vfs_write+0x4c/0x1d0
[ 92.499942] [c0000000bebabd90] [c0000000002e2f94] vfs_write+0xc4/0x200
[ 92.499945] [c0000000bebabde0] [c0000000002e488c] SyS_write+0x6c/0x110
[ 92.499948] [c0000000bebabe30] [c000000000009384] system_call+0x38/0xe4
[ 92.499949] Instruction dump:
[ 92.499954] 4e800020 3c4c00e8 3842dcf0 7c0802a6 f8010010 60000000 7c0802a6 fba1ffe8
[ 92.499958] fbc1fff0 fbe1fff8 f8010010 f821ffc1 7c7e1b78 2fa90000 419e0078
[ 92.499962] ---[ end trace bed077e15eb420cf ]---

It fails in dma_get_required_mask(), which has a ppc-specific
implementation that oopses when the provided device argument is NULL.
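rxe is a software device stacked on a netdev, so the ib_device's
embedded struct device has no parent: dev->dev.parent is NULL, and the
pSeries implementation dereferences whatever device it is handed. In
outline (the comments are the editor's; the calls are taken from the
diff below):

	/* Before: rxe never sets a parent device, so NULL was passed
	 * into the arch-specific hook, which dereferenced it.
	 */
	dma_get_required_mask(dev->dev.parent);	/* parent == NULL here */

	/* After: the ib_device's own struct device is always valid (and
	 * uses dma_virt_ops), so querying its required mask is safe.
	 */
	dma_get_required_mask(&dev->dev);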
Signed-off-by: Mikhail Malygin
Reviewed-by: Yonatan Cohen
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/sw/rxe/rxe_verbs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index a65550d6a849..2cb52fd48cf1 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -1208,7 +1208,7 @@ int rxe_register_device(struct rxe_dev *rxe)
 			    rxe->ndev->dev_addr);
 	dev->dev.dma_ops = &dma_virt_ops;
 	dma_coerce_mask_and_coherent(&dev->dev,
-				     dma_get_required_mask(dev->dev.parent));
+				     dma_get_required_mask(&dev->dev));
 
 	dev->uverbs_abi_ver = RXE_UVERBS_ABI_VERSION;
 	dev->uverbs_cmd_mask = BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT)