From 26312973bfbc1db24b157797776ee2b5b48f5c50 Mon Sep 17 00:00:00 2001 From: Deming Wang Date: Thu, 6 Oct 2022 12:14:56 -0400 Subject: [PATCH 001/122] IB/uverbs: fix the typo of optional Fix the typo of optional in the function of UVERBS_HANDLER. Signed-off-by: Deming Wang Link: https://lore.kernel.org/r/20221006161456.2998-1-wangdeming@inspur.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/uverbs_std_types_qp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/uverbs_std_types_qp.c b/drivers/infiniband/core/uverbs_std_types_qp.c index dd1075466f61..7b4773fa4bc0 100644 --- a/drivers/infiniband/core/uverbs_std_types_qp.c +++ b/drivers/infiniband/core/uverbs_std_types_qp.c @@ -163,7 +163,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_QP_CREATE)( UVERBS_ATTR_CREATE_QP_SRQ_HANDLE)) return -EINVAL; - /* send_cq is optinal */ + /* send_cq is optional */ if (cap.max_send_wr) { send_cq = uverbs_attr_get_obj(attrs, UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE); From 53c2d5b14a82f6e7f0f8089083972df20e66a354 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Sat, 1 Oct 2022 10:00:45 +0800 Subject: [PATCH 002/122] RDMA/core: return -EOPNOSUPP for ODP unsupported device ib_reg_mr(3) which is used to register a MR with specific access flags for specific HCA will set errno when something go wrong. So, here we should return the specific -EOPNOTSUPP when the being requested ODP access flag is unsupported by the HCA(such as RXE). Signed-off-by: Li Zhijian Link: https://lore.kernel.org/r/20221001020045.8324-1-lizhijian@fujitsu.com Reviewed-by: Zhu Yanjun Signed-off-by: Leon Romanovsky --- include/rdma/ib_verbs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 975d6e9efbcb..a1f4d53a4bb6 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4334,7 +4334,7 @@ static inline int ib_check_mr_access(struct ib_device *ib_dev, if (flags & IB_ACCESS_ON_DEMAND && !(ib_dev->attrs.kernel_cap_flags & IBK_ON_DEMAND_PAGING)) - return -EINVAL; + return -EOPNOTSUPP; return 0; } From 7ac7bfe746d8faddbd79abed526ee67f46d8867c Mon Sep 17 00:00:00 2001 From: Jiangshan Yi Date: Sun, 9 Oct 2022 16:10:47 +0800 Subject: [PATCH 003/122] RDMA/opa_vnic: fix spelling typo in comment Fix spelling typo in comment. Reported-by: k2ci Signed-off-by: Jiangshan Yi Link: https://lore.kernel.org/r/20221009081047.2643471-1-13667453960@163.com Signed-off-by: Leon Romanovsky --- include/rdma/opa_vnic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rdma/opa_vnic.h b/include/rdma/opa_vnic.h index f3d5377b217a..d297f084001a 100644 --- a/include/rdma/opa_vnic.h +++ b/include/rdma/opa_vnic.h @@ -51,7 +51,7 @@ static inline void *opa_vnic_dev_priv(const struct net_device *dev) return oparn->dev_priv; } -/* opa_vnic skb meta data structrue */ +/* opa_vnic skb meta data structure */ struct opa_vnic_skb_mdata { u8 vl; u8 entropy; From acc7d94ab431fb6f34e41588f2784410c2d9008e Mon Sep 17 00:00:00 2001 From: Sergey Gorenko Date: Sun, 16 Oct 2022 12:38:31 +0300 Subject: [PATCH 004/122] IB/iser: open code iser_conn_state_comp_exch There is a single caller to iser_conn_state_comp_exch. Open code its logic and remove it. Acked-by: Max Gurtovoy Signed-off-by: Sergey Gorenko Link: https://lore.kernel.org/r/20221016093833.12537-2-mgurtovoy@nvidia.com Reviewed-by: Sagi Grimberg Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/iser/iser_verbs.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index a00ca117303a..a73c30230ff9 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -347,22 +347,6 @@ static void iser_device_try_release(struct iser_device *device) mutex_unlock(&ig.device_list_mutex); } -/* - * Called with state mutex held - */ -static int iser_conn_state_comp_exch(struct iser_conn *iser_conn, - enum iser_conn_state comp, - enum iser_conn_state exch) -{ - int ret; - - ret = (iser_conn->state == comp); - if (ret) - iser_conn->state = exch; - - return ret; -} - void iser_release_work(struct work_struct *work) { struct iser_conn *iser_conn; @@ -465,10 +449,10 @@ int iser_conn_terminate(struct iser_conn *iser_conn) int err = 0; /* terminate the iser conn only if the conn state is UP */ - if (!iser_conn_state_comp_exch(iser_conn, ISER_CONN_UP, - ISER_CONN_TERMINATING)) + if (iser_conn->state != ISER_CONN_UP) return 0; + iser_conn->state = ISER_CONN_TERMINATING; iser_info("iser_conn %p state %d\n", iser_conn, iser_conn->state); /* suspend queuing of new iscsi commands */ From a75243ae08d23272235cb26117464843538936b4 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 16 Oct 2022 12:38:32 +0300 Subject: [PATCH 005/122] IB/iser: add safety checks for state_mutex lock In some cases, we need to make sure that state_mutex is taken. Use lockdep_assert_held to warn us in case it doesn't while it should. Signed-off-by: Max Gurtovoy Link: https://lore.kernel.org/r/20221016093833.12537-3-mgurtovoy@nvidia.com Reviewed-by: Sergey Gorenko Reviewed-by: Sagi Grimberg Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/iser/iser_verbs.c | 26 ++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index a73c30230ff9..f33e3a7f605d 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -448,6 +448,8 @@ int iser_conn_terminate(struct iser_conn *iser_conn) struct ib_conn *ib_conn = &iser_conn->ib_conn; int err = 0; + lockdep_assert_held(&iser_conn->state_mutex); + /* terminate the iser conn only if the conn state is UP */ if (iser_conn->state != ISER_CONN_UP) return 0; @@ -482,9 +484,10 @@ int iser_conn_terminate(struct iser_conn *iser_conn) */ static void iser_connect_error(struct rdma_cm_id *cma_id) { - struct iser_conn *iser_conn; + struct iser_conn *iser_conn = cma_id->context; + + lockdep_assert_held(&iser_conn->state_mutex); - iser_conn = cma_id->context; iser_conn->state = ISER_CONN_TERMINATING; } @@ -526,12 +529,13 @@ static void iser_calc_scsi_params(struct iser_conn *iser_conn, */ static void iser_addr_handler(struct rdma_cm_id *cma_id) { + struct iser_conn *iser_conn = cma_id->context; struct iser_device *device; - struct iser_conn *iser_conn; struct ib_conn *ib_conn; int ret; - iser_conn = cma_id->context; + lockdep_assert_held(&iser_conn->state_mutex); + if (iser_conn->state != ISER_CONN_PENDING) /* bailout */ return; @@ -581,6 +585,8 @@ static void iser_route_handler(struct rdma_cm_id *cma_id) struct ib_conn *ib_conn = &iser_conn->ib_conn; struct ib_device *ib_dev = ib_conn->device->ib_device; + lockdep_assert_held(&iser_conn->state_mutex); + if (iser_conn->state != ISER_CONN_PENDING) /* bailout */ return; @@ -613,14 +619,18 @@ failure: iser_connect_error(cma_id); } +/* + * Called with state mutex held + */ static void iser_connected_handler(struct rdma_cm_id *cma_id, const void *private_data) { - struct iser_conn *iser_conn; + struct iser_conn *iser_conn = cma_id->context; struct ib_qp_attr attr; struct ib_qp_init_attr init_attr; - iser_conn = cma_id->context; + lockdep_assert_held(&iser_conn->state_mutex); + if (iser_conn->state != ISER_CONN_PENDING) /* bailout */ return; @@ -654,11 +664,15 @@ static void iser_disconnected_handler(struct rdma_cm_id *cma_id) } } +/* + * Called with state mutex held + */ static void iser_cleanup_handler(struct rdma_cm_id *cma_id, bool destroy) { struct iser_conn *iser_conn = cma_id->context; + lockdep_assert_held(&iser_conn->state_mutex); /* * We are not guaranteed that we visited disconnected_handler * by now, call it here to be safe that we handle CM drep From c1842f34fceef47d6285e558004f8e2d6ed91b91 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 16 Oct 2022 12:38:33 +0300 Subject: [PATCH 006/122] IB/iser: open code iser_disconnected_handler There is a single caller to iser_disconnected_handler. Open code its logic and remove it. Signed-off-by: Max Gurtovoy Link: https://lore.kernel.org/r/20221016093833.12537-4-mgurtovoy@nvidia.com Reviewed-by: Sergey Gorenko Reviewed-by: Sagi Grimberg Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/iser/iser_verbs.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index f33e3a7f605d..1b8eda0dae4e 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -651,19 +651,6 @@ static void iser_connected_handler(struct rdma_cm_id *cma_id, complete(&iser_conn->up_completion); } -static void iser_disconnected_handler(struct rdma_cm_id *cma_id) -{ - struct iser_conn *iser_conn = cma_id->context; - - if (iser_conn_terminate(iser_conn)) { - if (iser_conn->iscsi_conn) - iscsi_conn_failure(iser_conn->iscsi_conn, - ISCSI_ERR_CONN_FAILED); - else - iser_err("iscsi_iser connection isn't bound\n"); - } -} - /* * Called with state mutex held */ @@ -678,7 +665,13 @@ static void iser_cleanup_handler(struct rdma_cm_id *cma_id, * by now, call it here to be safe that we handle CM drep * and flush errors. */ - iser_disconnected_handler(cma_id); + if (iser_conn_terminate(iser_conn)) { + if (iser_conn->iscsi_conn) + iscsi_conn_failure(iser_conn->iscsi_conn, + ISCSI_ERR_CONN_FAILED); + else + iser_err("iscsi_iser connection isn't bound\n"); + } iser_free_ib_conn_res(iser_conn, destroy); complete(&iser_conn->ib_completion); } From d0b9f28f0da2808b339bb290b539518e69e48ac9 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 21 Oct 2022 18:26:11 +0100 Subject: [PATCH 007/122] RDMA/qib: Remove not-used variable n The variable n being incremented but it is never referenced, it is redundant and can be removed. Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20221021172611.26763-1-colin.i.king@gmail.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/qib/qib_tx.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_tx.c b/drivers/infiniband/hw/qib/qib_tx.c index 6a8148851f21..1325110237cd 100644 --- a/drivers/infiniband/hw/qib/qib_tx.c +++ b/drivers/infiniband/hw/qib/qib_tx.c @@ -82,7 +82,6 @@ int qib_disarm_piobufs_ifneeded(struct qib_ctxtdata *rcd) struct qib_devdata *dd = rcd->dd; unsigned i; unsigned last; - unsigned n = 0; last = rcd->pio_base + rcd->piocnt; /* @@ -102,10 +101,8 @@ int qib_disarm_piobufs_ifneeded(struct qib_ctxtdata *rcd) } spin_lock_irq(&dd->pioavail_lock); for (i = rcd->pio_base; i < last; i++) { - if (__test_and_clear_bit(i, dd->pio_need_disarm)) { - n++; + if (__test_and_clear_bit(i, dd->pio_need_disarm)) dd->f_sendctrl(rcd->ppd, QIB_SENDCTRL_DISARM_BUF(i)); - } } spin_unlock_irq(&dd->pioavail_lock); return 0; From 5dc1b37d75e7136588480712e7173169b3d4164f Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 21 Oct 2022 18:35:04 +0100 Subject: [PATCH 008/122] RDMA/qib: Remove not-used variable freeze_cnt The variable freeze_cnt being incremented but it is never referenced, it is redundant and can be removed. Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20221021173504.27546-1-colin.i.king@gmail.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/qib/qib_iba6120.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_iba6120.c b/drivers/infiniband/hw/qib/qib_iba6120.c index aea571943768..23a81edf3f7a 100644 --- a/drivers/infiniband/hw/qib/qib_iba6120.c +++ b/drivers/infiniband/hw/qib/qib_iba6120.c @@ -799,12 +799,9 @@ static void qib_handle_6120_hwerrors(struct qib_devdata *dd, char *msg, hwerrs &= ~TXE_PIO_PARITY; } - if (!hwerrs) { - static u32 freeze_cnt; - - freeze_cnt++; + if (!hwerrs) qib_6120_clear_freeze(dd); - } else + else isfatal = 1; } From 2d5206c4629dfe74a51bb9c54758095139f5d01d Mon Sep 17 00:00:00 2001 From: wangjianli Date: Sat, 22 Oct 2022 13:59:05 +0800 Subject: [PATCH 009/122] RDMA/qib: fix repeated words in comments Delete the redundant word 'the'. Signed-off-by: wangjianli Link: https://lore.kernel.org/r/20221022055905.49176-1-wangjianli@cdjrlc.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/qib/qib_user_sdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c index bf2f30d67949..9fe03d6ffac1 100644 --- a/drivers/infiniband/hw/qib/qib_user_sdma.c +++ b/drivers/infiniband/hw/qib/qib_user_sdma.c @@ -851,7 +851,7 @@ static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd, } /* - * This assignment is a bit strange. it's because the + * This assignment is a bit strange. it's because * the pbc counts the number of 32 bit words in the full * packet _except_ the first word of the pbc itself... */ From c4bb733234b0ffd939030bb592b691ac19519455 Mon Sep 17 00:00:00 2001 From: wangjianli Date: Sat, 22 Oct 2022 14:00:30 +0800 Subject: [PATCH 010/122] RDMA/core: fix repeated words in comments Delete the redundant word 'the'. Signed-off-by: wangjianli Link: https://lore.kernel.org/r/20221022060030.50900-1-wangjianli@cdjrlc.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 4084d05a4510..2e91d8879326 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -1422,7 +1422,7 @@ int rdma_read_gid_l2_fields(const struct ib_gid_attr *attr, *vlan_id = vlan_dev_vlan_id(ndev); } else { /* If the netdev is upper device and if it's lower - * device is vlan device, consider vlan id of the + * device is vlan device, consider vlan id of * the lower vlan device for this gid entry. */ netdev_walk_all_lower_dev_rcu(attr->ndev, From 65bf03427cee48258a227431577dc56bf70461f3 Mon Sep 17 00:00:00 2001 From: wangjianli Date: Sat, 22 Oct 2022 13:52:57 +0800 Subject: [PATCH 011/122] RDMA/qedr: fix repeated words in comments Delete the redundant word 'the'. Signed-off-by: wangjianli Link: https://lore.kernel.org/r/20221022055257.42905-1-wangjianli@cdjrlc.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/qedr/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index 5152f10d2e6d..5e7069b76d46 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -472,7 +472,7 @@ static irqreturn_t qedr_irq_handler(int irq, void *handle) /* The CQ's CNQ notification counter is checked before * destroying the CQ in a busy-wait loop that waits for all of * the CQ's CNQ interrupts to be processed. It is increased - * here, only after the completion handler, to ensure that the + * here, only after the completion handler, to ensure that * the handler is not running when the CQ is destroyed. */ cq->cnq_notif++; From 71d236399160ad9beaae7267b93d2d487e8f19a0 Mon Sep 17 00:00:00 2001 From: "yangx.jy@fujitsu.com" Date: Fri, 21 Oct 2022 13:45:17 +0000 Subject: [PATCH 012/122] RDMA/rxe: Remove the member 'type' of struct rxe_mr The member 'type' is included in both struct rxe_mr and struct ib_mr so remove the duplicate one of struct rxe_mr. Signed-off-by: Xiao Yang Link: https://lore.kernel.org/r/20221021134513.17730-1-yangx.jy@fujitsu.com Reviewed-by: Bob Pearson Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_mr.c | 16 ++++++++-------- drivers/infiniband/sw/rxe/rxe_verbs.h | 1 - 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index 502e9ada99b3..d4f10c2d1aa7 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -26,7 +26,7 @@ int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length) { - switch (mr->type) { + switch (mr->ibmr.type) { case IB_MR_TYPE_DMA: return 0; @@ -39,7 +39,7 @@ int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length) default: pr_warn("%s: mr type (%d) not supported\n", - __func__, mr->type); + __func__, mr->ibmr.type); return -EFAULT; } } @@ -109,7 +109,7 @@ void rxe_mr_init_dma(int access, struct rxe_mr *mr) mr->access = access; mr->state = RXE_MR_STATE_VALID; - mr->type = IB_MR_TYPE_DMA; + mr->ibmr.type = IB_MR_TYPE_DMA; } int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, @@ -178,7 +178,7 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, mr->access = access; mr->offset = ib_umem_offset(umem); mr->state = RXE_MR_STATE_VALID; - mr->type = IB_MR_TYPE_USER; + mr->ibmr.type = IB_MR_TYPE_USER; return 0; @@ -205,7 +205,7 @@ int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr) mr->max_buf = max_pages; mr->state = RXE_MR_STATE_FREE; - mr->type = IB_MR_TYPE_MEM_REG; + mr->ibmr.type = IB_MR_TYPE_MEM_REG; return 0; @@ -304,7 +304,7 @@ int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length, if (length == 0) return 0; - if (mr->type == IB_MR_TYPE_DMA) { + if (mr->ibmr.type == IB_MR_TYPE_DMA) { u8 *src, *dest; src = (dir == RXE_TO_MR_OBJ) ? addr : ((void *)(uintptr_t)iova); @@ -547,8 +547,8 @@ int rxe_invalidate_mr(struct rxe_qp *qp, u32 key) goto err_drop_ref; } - if (unlikely(mr->type != IB_MR_TYPE_MEM_REG)) { - pr_warn("%s: mr->type (%d) is wrong type\n", __func__, mr->type); + if (unlikely(mr->ibmr.type != IB_MR_TYPE_MEM_REG)) { + pr_warn("%s: mr type (%d) is wrong\n", __func__, mr->ibmr.type); ret = -EINVAL; goto err_drop_ref; } diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 5f5cbfcb3569..22a299b0a9f0 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -304,7 +304,6 @@ struct rxe_mr { u32 lkey; u32 rkey; enum rxe_mr_state state; - enum ib_mr_type type; u32 offset; int access; From 5ebc548f4f54fe971d741d80cd108f1a45c9e88d Mon Sep 17 00:00:00 2001 From: Daisuke Matsuda Date: Thu, 13 Oct 2022 10:47:23 +0900 Subject: [PATCH 013/122] RDMA/rxe: Make responder handle RDMA Read failures Currently, responder can reply packets with invalid payloads if it fails to copy messages to the packets. Add an error handling in read_reply() to inform a requesting node of the failure. Link: https://lore.kernel.org/r/20221013014724.3786212-1-matsuda-daisuke@fujitsu.com Suggested-by: Li Zhijian Signed-off-by: Daisuke Matsuda Reviewed-by: Li Zhijian Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_resp.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index ed5a09e86417..82b74e926e09 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -809,10 +809,14 @@ static enum resp_states read_reply(struct rxe_qp *qp, if (!skb) return RESPST_ERR_RNR; - rxe_mr_copy(mr, res->read.va, payload_addr(&ack_pkt), - payload, RXE_FROM_MR_OBJ); + err = rxe_mr_copy(mr, res->read.va, payload_addr(&ack_pkt), + payload, RXE_FROM_MR_OBJ); if (mr) rxe_put(mr); + if (err) { + kfree_skb(skb); + return RESPST_ERR_RKEY_VIOLATION; + } if (bth_pad(&ack_pkt)) { u8 *pad = payload_addr(&ack_pkt) + payload; From 5ac814e02ece516761d2e244cef93843df911ae0 Mon Sep 17 00:00:00 2001 From: Daisuke Matsuda Date: Thu, 13 Oct 2022 10:47:24 +0900 Subject: [PATCH 014/122] RDMA/rxe: Handle remote errors in the midst of a Read reply sequence Requesting nodes do not handle a reported error correctly if it is generated in the middle of multi-packet Read responses, and the node tries to resend the request endlessly. Let completer terminate the connection in that case. Link: https://lore.kernel.org/r/20221013014724.3786212-2-matsuda-daisuke@fujitsu.com Signed-off-by: Daisuke Matsuda Reviewed-by: Li Zhijian Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_comp.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index fb0c008af78c..c9170dd99f3a 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -200,6 +200,10 @@ static inline enum comp_state check_psn(struct rxe_qp *qp, */ if (pkt->psn == wqe->last_psn) return COMPST_COMP_ACK; + else if (pkt->opcode == IB_OPCODE_RC_ACKNOWLEDGE && + (qp->comp.opcode == IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST || + qp->comp.opcode == IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE)) + return COMPST_CHECK_ACK; else return COMPST_DONE; } else if ((diff > 0) && (wqe->mask & WR_ATOMIC_OR_READ_MASK)) { @@ -228,6 +232,10 @@ static inline enum comp_state check_ack(struct rxe_qp *qp, case IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST: case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE: + /* Check NAK code to handle a remote error */ + if (pkt->opcode == IB_OPCODE_RC_ACKNOWLEDGE) + break; + if (pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE && pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST) { /* read retries of partial data may restart from From 686d348476ee8006087cfcbef591e28f4f91bd8b Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Mon, 24 Oct 2022 03:31:54 +0000 Subject: [PATCH 015/122] RDMA/rxe: Remove unnecessary mr testing Before the testing, we already passed it to rxe_mr_copy() where mr could be dereferenced. so this checking is not needed. The only way that mr is NULL is when it reaches below line 780 with 'qp->resp.mr = NULL', which is not possible in Bob's explanation[1]. 778 if (res->state == rdatm_res_state_new) { 779 if (!res->replay) { 780 mr = qp->resp.mr; 781 qp->resp.mr = NULL; 782 } else { [1] https://lore.kernel.org/lkml/30ff25c4-ce66-eac4-eaa2-64c0db203a19@gmail.com/ Link: https://lore.kernel.org/r/1666582315-2-1-git-send-email-lizhijian@fujitsu.com CC: Bob Pearson Signed-off-by: Li Zhijian Reviewed-by: Bob Pearson Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_resp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 82b74e926e09..95d372db934d 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -811,8 +811,7 @@ static enum resp_states read_reply(struct rxe_qp *qp, err = rxe_mr_copy(mr, res->read.va, payload_addr(&ack_pkt), payload, RXE_FROM_MR_OBJ); - if (mr) - rxe_put(mr); + rxe_put(mr); if (err) { kfree_skb(skb); return RESPST_ERR_RKEY_VIOLATION; From 4508d32ccced24c972bc4592104513e1ff8439b5 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 25 Oct 2022 10:37:13 +0300 Subject: [PATCH 016/122] RDMA/core: Fix order of nldev_exit call Create symmetrical exit flow by calling to nldev_exit() after call to rdma_nl_unregister(RDMA_NL_LS). Fixes: 6c80b41abe22 ("RDMA/netlink: Add nldev initialization flows") Signed-off-by: Leon Romanovsky Link: https://lore.kernel.org/r/64e676774a53a406f4cde265d5a4cfd6b8e97df9.1666683334.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index ae60c73babcc..3409c55ea88b 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2843,8 +2843,8 @@ err: static void __exit ib_core_cleanup(void) { roce_gid_mgmt_cleanup(); - nldev_exit(); rdma_nl_unregister(RDMA_NL_LS); + nldev_exit(); unregister_pernet_device(&rdma_dev_net_ops); unregister_blocking_lsm_notifier(&ibdev_lsm_nb); ib_sa_cleanup(); From 05e88ebb9ecfe9631ccc6483a79b0eabf554da60 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Fri, 21 Oct 2022 15:01:02 -0500 Subject: [PATCH 017/122] RDMA/rxe: Remove redundant header files Remove unneeded include files. Link: https://lore.kernel.org/r/20221021200118.2163-2-rpearsonhpe@gmail.com Signed-off-by: Ian Ziemba Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_task.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c index ec2b7de1c497..3fbaba9eec39 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.c +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -4,10 +4,6 @@ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. */ -#include -#include -#include - #include "rxe.h" int __rxe_do_task(struct rxe_task *task) From 98a54f170617746b5d09b18b23b295efc7a42a5e Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Fri, 21 Oct 2022 15:01:03 -0500 Subject: [PATCH 018/122] RDMA/rxe: Remove init of task locks from rxe_qp.c The calls to spin_lock_init() for the tasklet spinlocks in rxe_qp_init_misc() are redundant since they are intiialized in rxe_init_task(). This patch removes them. Link: https://lore.kernel.org/r/20221021200118.2163-3-rpearsonhpe@gmail.com Signed-off-by: Ian Ziemba Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_qp.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index a62bab88415c..57c3f05ad15b 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -172,10 +172,6 @@ static void rxe_qp_init_misc(struct rxe_dev *rxe, struct rxe_qp *qp, spin_lock_init(&qp->state_lock); - spin_lock_init(&qp->req.task.state_lock); - spin_lock_init(&qp->resp.task.state_lock); - spin_lock_init(&qp->comp.task.state_lock); - spin_lock_init(&qp->sq.sq_lock); spin_lock_init(&qp->rq.producer_lock); spin_lock_init(&qp->rq.consumer_lock); From de669ae8af49ceed0eed44f5b3d51dc62affc5e4 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Fri, 21 Oct 2022 15:01:04 -0500 Subject: [PATCH 019/122] RDMA/rxe: Removed unused name from rxe_task struct The name field in struct rxe_task is never used. This patch removes it. Link: https://lore.kernel.org/r/20221021200118.2163-4-rpearsonhpe@gmail.com Signed-off-by: Ian Ziemba Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_qp.c | 9 +++------ drivers/infiniband/sw/rxe/rxe_task.c | 4 +--- drivers/infiniband/sw/rxe/rxe_task.h | 4 +--- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 57c3f05ad15b..03bd9f3e9956 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -238,10 +238,8 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, skb_queue_head_init(&qp->req_pkts); - rxe_init_task(&qp->req.task, qp, - rxe_requester, "req"); - rxe_init_task(&qp->comp.task, qp, - rxe_completer, "comp"); + rxe_init_task(&qp->req.task, qp, rxe_requester); + rxe_init_task(&qp->comp.task, qp, rxe_completer); qp->qp_timeout_jiffies = 0; /* Can't be set for UD/UC in modify_qp */ if (init->qp_type == IB_QPT_RC) { @@ -288,8 +286,7 @@ static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp, skb_queue_head_init(&qp->resp_pkts); - rxe_init_task(&qp->resp.task, qp, - rxe_responder, "resp"); + rxe_init_task(&qp->resp.task, qp, rxe_responder); qp->resp.opcode = OPCODE_NONE; qp->resp.msn = 0; diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c index 3fbaba9eec39..0cbba455fefd 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.c +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -90,12 +90,10 @@ void rxe_do_task(struct tasklet_struct *t) task->ret = ret; } -int rxe_init_task(struct rxe_task *task, - void *arg, int (*func)(void *), char *name) +int rxe_init_task(struct rxe_task *task, void *arg, int (*func)(void *)) { task->arg = arg; task->func = func; - snprintf(task->name, sizeof(task->name), "%s", name); task->destroyed = false; tasklet_setup(&task->tasklet, rxe_do_task); diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h index 7f612a1c68a7..b3dfd970d1dc 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.h +++ b/drivers/infiniband/sw/rxe/rxe_task.h @@ -25,7 +25,6 @@ struct rxe_task { void *arg; int (*func)(void *arg); int ret; - char name[16]; bool destroyed; }; @@ -34,8 +33,7 @@ struct rxe_task { * arg => parameter to pass to fcn * func => function to call until it returns != 0 */ -int rxe_init_task(struct rxe_task *task, - void *arg, int (*func)(void *), char *name); +int rxe_init_task(struct rxe_task *task, void *arg, int (*func)(void *)); /* cleanup task */ void rxe_cleanup_task(struct rxe_task *task); From dccb23f6c312e4480fe32ccbc2afac1a5cac7e5e Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Fri, 21 Oct 2022 15:01:05 -0500 Subject: [PATCH 020/122] RDMA/rxe: Split rxe_run_task() into two subroutines Split rxe_run_task(task, sched) into rxe_run_task(task) and rxe_sched_task(task). Link: https://lore.kernel.org/r/20221021200118.2163-5-rpearsonhpe@gmail.com Signed-off-by: Ian Ziemba Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_comp.c | 19 +++++++++++-------- drivers/infiniband/sw/rxe/rxe_net.c | 4 ++-- drivers/infiniband/sw/rxe/rxe_qp.c | 10 +++++----- drivers/infiniband/sw/rxe/rxe_req.c | 10 +++++----- drivers/infiniband/sw/rxe/rxe_resp.c | 5 ++++- drivers/infiniband/sw/rxe/rxe_task.c | 15 ++++++++++----- drivers/infiniband/sw/rxe/rxe_task.h | 7 +++---- drivers/infiniband/sw/rxe/rxe_verbs.c | 8 ++++---- 8 files changed, 44 insertions(+), 34 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index c9170dd99f3a..66f392810c86 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -118,7 +118,7 @@ void retransmit_timer(struct timer_list *t) if (qp->valid) { qp->comp.timeout = 1; - rxe_run_task(&qp->comp.task, 1); + rxe_sched_task(&qp->comp.task); } } @@ -132,7 +132,10 @@ void rxe_comp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb) if (must_sched != 0) rxe_counter_inc(SKB_TO_PKT(skb)->rxe, RXE_CNT_COMPLETER_SCHED); - rxe_run_task(&qp->comp.task, must_sched); + if (must_sched) + rxe_sched_task(&qp->comp.task); + else + rxe_run_task(&qp->comp.task); } static inline enum comp_state get_wqe(struct rxe_qp *qp, @@ -313,7 +316,7 @@ static inline enum comp_state check_ack(struct rxe_qp *qp, qp->comp.psn = pkt->psn; if (qp->req.wait_psn) { qp->req.wait_psn = 0; - rxe_run_task(&qp->req.task, 0); + rxe_run_task(&qp->req.task); } } return COMPST_ERROR_RETRY; @@ -460,7 +463,7 @@ static void do_complete(struct rxe_qp *qp, struct rxe_send_wqe *wqe) */ if (qp->req.wait_fence) { qp->req.wait_fence = 0; - rxe_run_task(&qp->req.task, 0); + rxe_run_task(&qp->req.task); } } @@ -474,7 +477,7 @@ static inline enum comp_state complete_ack(struct rxe_qp *qp, if (qp->req.need_rd_atomic) { qp->comp.timeout_retry = 0; qp->req.need_rd_atomic = 0; - rxe_run_task(&qp->req.task, 0); + rxe_run_task(&qp->req.task); } } @@ -520,7 +523,7 @@ static inline enum comp_state complete_wqe(struct rxe_qp *qp, if (qp->req.wait_psn) { qp->req.wait_psn = 0; - rxe_run_task(&qp->req.task, 1); + rxe_sched_task(&qp->req.task); } } @@ -654,7 +657,7 @@ int rxe_completer(void *arg) if (qp->req.wait_psn) { qp->req.wait_psn = 0; - rxe_run_task(&qp->req.task, 1); + rxe_sched_task(&qp->req.task); } state = COMPST_DONE; @@ -722,7 +725,7 @@ int rxe_completer(void *arg) RXE_CNT_COMP_RETRY); qp->req.need_retry = 1; qp->comp.started_retry = 1; - rxe_run_task(&qp->req.task, 0); + rxe_run_task(&qp->req.task); } goto done; diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 35f327b9d4b8..c36cad9c7a66 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -345,7 +345,7 @@ static void rxe_skb_tx_dtor(struct sk_buff *skb) if (unlikely(qp->need_req_skb && skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW)) - rxe_run_task(&qp->req.task, 1); + rxe_sched_task(&qp->req.task); rxe_put(qp); } @@ -429,7 +429,7 @@ int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt, if ((qp_type(qp) != IB_QPT_RC) && (pkt->mask & RXE_END_MASK)) { pkt->wqe->state = wqe_state_done; - rxe_run_task(&qp->comp.task, 1); + rxe_sched_task(&qp->comp.task); } rxe_counter_inc(rxe, RXE_CNT_SENT_PKTS); diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 03bd9f3e9956..3f6d62a80bea 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -536,10 +536,10 @@ static void rxe_qp_drain(struct rxe_qp *qp) if (qp->req.state != QP_STATE_DRAINED) { qp->req.state = QP_STATE_DRAIN; if (qp_type(qp) == IB_QPT_RC) - rxe_run_task(&qp->comp.task, 1); + rxe_sched_task(&qp->comp.task); else __rxe_do_task(&qp->comp.task); - rxe_run_task(&qp->req.task, 1); + rxe_sched_task(&qp->req.task); } } } @@ -553,13 +553,13 @@ void rxe_qp_error(struct rxe_qp *qp) qp->attr.qp_state = IB_QPS_ERR; /* drain work and packet queues */ - rxe_run_task(&qp->resp.task, 1); + rxe_sched_task(&qp->resp.task); if (qp_type(qp) == IB_QPT_RC) - rxe_run_task(&qp->comp.task, 1); + rxe_sched_task(&qp->comp.task); else __rxe_do_task(&qp->comp.task); - rxe_run_task(&qp->req.task, 1); + rxe_sched_task(&qp->req.task); } /* called by the modify qp verb */ diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index f63771207970..41f1d84f0acb 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -105,7 +105,7 @@ void rnr_nak_timer(struct timer_list *t) /* request a send queue retry */ qp->req.need_retry = 1; qp->req.wait_for_rnr_timer = 0; - rxe_run_task(&qp->req.task, 1); + rxe_sched_task(&qp->req.task); } static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp) @@ -608,7 +608,7 @@ static int rxe_do_local_ops(struct rxe_qp *qp, struct rxe_send_wqe *wqe) * which can lead to a deadlock. So go ahead and complete * it now. */ - rxe_run_task(&qp->comp.task, 1); + rxe_sched_task(&qp->comp.task); return 0; } @@ -733,7 +733,7 @@ int rxe_requester(void *arg) qp->req.wqe_index); wqe->state = wqe_state_done; wqe->status = IB_WC_SUCCESS; - rxe_run_task(&qp->comp.task, 0); + rxe_run_task(&qp->comp.task); goto done; } payload = mtu; @@ -795,7 +795,7 @@ int rxe_requester(void *arg) rollback_state(wqe, qp, &rollback_wqe, rollback_psn); if (err == -EAGAIN) { - rxe_run_task(&qp->req.task, 1); + rxe_sched_task(&qp->req.task); goto exit; } @@ -817,7 +817,7 @@ err: qp->req.wqe_index = queue_next_index(qp->sq.queue, qp->req.wqe_index); wqe->state = wqe_state_error; qp->req.state = QP_STATE_ERROR; - rxe_run_task(&qp->comp.task, 0); + rxe_run_task(&qp->comp.task); exit: ret = -EAGAIN; out: diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 95d372db934d..c32bc12cc82f 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -91,7 +91,10 @@ void rxe_resp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb) must_sched = (pkt->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST) || (skb_queue_len(&qp->req_pkts) > 1); - rxe_run_task(&qp->resp.task, must_sched); + if (must_sched) + rxe_sched_task(&qp->resp.task); + else + rxe_run_task(&qp->resp.task); } static inline enum resp_states get_req(struct rxe_qp *qp, diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c index 0cbba455fefd..442b7348acdc 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.c +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -123,15 +123,20 @@ void rxe_cleanup_task(struct rxe_task *task) tasklet_kill(&task->tasklet); } -void rxe_run_task(struct rxe_task *task, int sched) +void rxe_run_task(struct rxe_task *task) { if (task->destroyed) return; - if (sched) - tasklet_schedule(&task->tasklet); - else - rxe_do_task(&task->tasklet); + rxe_do_task(&task->tasklet); +} + +void rxe_sched_task(struct rxe_task *task) +{ + if (task->destroyed) + return; + + tasklet_schedule(&task->tasklet); } void rxe_disable_task(struct rxe_task *task) diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h index b3dfd970d1dc..590b1c1d7e7c 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.h +++ b/drivers/infiniband/sw/rxe/rxe_task.h @@ -52,10 +52,9 @@ int __rxe_do_task(struct rxe_task *task); */ void rxe_do_task(struct tasklet_struct *t); -/* run a task, else schedule it to run as a tasklet, The decision - * to run or schedule tasklet is based on the parameter sched. - */ -void rxe_run_task(struct rxe_task *task, int sched); +void rxe_run_task(struct rxe_task *task); + +void rxe_sched_task(struct rxe_task *task); /* keep a task from scheduling */ void rxe_disable_task(struct rxe_task *task); diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 88825edc7dce..f2f82efbaf6d 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -695,9 +695,9 @@ static int rxe_post_send_kernel(struct rxe_qp *qp, const struct ib_send_wr *wr, wr = next; } - rxe_run_task(&qp->req.task, 1); + rxe_sched_task(&qp->req.task); if (unlikely(qp->req.state == QP_STATE_ERROR)) - rxe_run_task(&qp->comp.task, 1); + rxe_sched_task(&qp->comp.task); return err; } @@ -719,7 +719,7 @@ static int rxe_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, if (qp->is_user) { /* Utilize process context to do protocol processing */ - rxe_run_task(&qp->req.task, 0); + rxe_run_task(&qp->req.task); return 0; } else return rxe_post_send_kernel(qp, wr, bad_wr); @@ -759,7 +759,7 @@ static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, spin_unlock_irqrestore(&rq->producer_lock, flags); if (qp->resp.state == QP_STATE_ERROR) - rxe_run_task(&qp->resp.task, 1); + rxe_sched_task(&qp->resp.task); err1: return err; From dcef28528cce82a82134abd393aa0f38f2edf77e Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Fri, 21 Oct 2022 15:01:06 -0500 Subject: [PATCH 021/122] RDMA/rxe: Make rxe_do_task static The subroutine rxe_do_task() is only called in rxe_task.c. This patch makes it static and renames it do_task(). Link: https://lore.kernel.org/r/20221021200118.2163-6-rpearsonhpe@gmail.com Signed-off-by: Ian Ziemba Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_task.c | 6 +++--- drivers/infiniband/sw/rxe/rxe_task.h | 8 -------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c index 442b7348acdc..fb953f5195b8 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.c +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -24,7 +24,7 @@ int __rxe_do_task(struct rxe_task *task) * a second caller finds the task already running * but looks just after the last call to func */ -void rxe_do_task(struct tasklet_struct *t) +static void do_task(struct tasklet_struct *t) { int cont; int ret; @@ -96,7 +96,7 @@ int rxe_init_task(struct rxe_task *task, void *arg, int (*func)(void *)) task->func = func; task->destroyed = false; - tasklet_setup(&task->tasklet, rxe_do_task); + tasklet_setup(&task->tasklet, do_task); task->state = TASK_STATE_START; spin_lock_init(&task->state_lock); @@ -128,7 +128,7 @@ void rxe_run_task(struct rxe_task *task) if (task->destroyed) return; - rxe_do_task(&task->tasklet); + do_task(&task->tasklet); } void rxe_sched_task(struct rxe_task *task) diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h index 590b1c1d7e7c..99e0173e5c46 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.h +++ b/drivers/infiniband/sw/rxe/rxe_task.h @@ -44,14 +44,6 @@ void rxe_cleanup_task(struct rxe_task *task); */ int __rxe_do_task(struct rxe_task *task); -/* - * common function called by any of the main tasklets - * If there is any chance that there is additional - * work to do someone must reschedule the task before - * leaving - */ -void rxe_do_task(struct tasklet_struct *t); - void rxe_run_task(struct rxe_task *task); void rxe_sched_task(struct rxe_task *task); From 63a18baef2653f59a7c5b990283628bd54d062fd Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Fri, 21 Oct 2022 15:01:07 -0500 Subject: [PATCH 022/122] RDMA/rxe: Rename task->state_lock to task->lock Rename task-state_lock to task->lock Link: https://lore.kernel.org/r/20221021200118.2163-7-rpearsonhpe@gmail.com Signed-off-by: Ian Ziemba Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_task.c | 18 +++++++++--------- drivers/infiniband/sw/rxe/rxe_task.h | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c index fb953f5195b8..0208d833a41b 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.c +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -31,22 +31,22 @@ static void do_task(struct tasklet_struct *t) struct rxe_task *task = from_tasklet(task, t, tasklet); unsigned int iterations = RXE_MAX_ITERATIONS; - spin_lock_bh(&task->state_lock); + spin_lock_bh(&task->lock); switch (task->state) { case TASK_STATE_START: task->state = TASK_STATE_BUSY; - spin_unlock_bh(&task->state_lock); + spin_unlock_bh(&task->lock); break; case TASK_STATE_BUSY: task->state = TASK_STATE_ARMED; fallthrough; case TASK_STATE_ARMED: - spin_unlock_bh(&task->state_lock); + spin_unlock_bh(&task->lock); return; default: - spin_unlock_bh(&task->state_lock); + spin_unlock_bh(&task->lock); pr_warn("%s failed with bad state %d\n", __func__, task->state); return; } @@ -55,7 +55,7 @@ static void do_task(struct tasklet_struct *t) cont = 0; ret = task->func(task->arg); - spin_lock_bh(&task->state_lock); + spin_lock_bh(&task->lock); switch (task->state) { case TASK_STATE_BUSY: if (ret) { @@ -84,7 +84,7 @@ static void do_task(struct tasklet_struct *t) pr_warn("%s failed with bad state %d\n", __func__, task->state); } - spin_unlock_bh(&task->state_lock); + spin_unlock_bh(&task->lock); } while (cont); task->ret = ret; @@ -99,7 +99,7 @@ int rxe_init_task(struct rxe_task *task, void *arg, int (*func)(void *)) tasklet_setup(&task->tasklet, do_task); task->state = TASK_STATE_START; - spin_lock_init(&task->state_lock); + spin_lock_init(&task->lock); return 0; } @@ -115,9 +115,9 @@ void rxe_cleanup_task(struct rxe_task *task) task->destroyed = true; do { - spin_lock_bh(&task->state_lock); + spin_lock_bh(&task->lock); idle = (task->state == TASK_STATE_START); - spin_unlock_bh(&task->state_lock); + spin_unlock_bh(&task->lock); } while (!idle); tasklet_kill(&task->tasklet); diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h index 99e0173e5c46..7b88129702ac 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.h +++ b/drivers/infiniband/sw/rxe/rxe_task.h @@ -21,7 +21,7 @@ enum { struct rxe_task { struct tasklet_struct tasklet; int state; - spinlock_t state_lock; /* spinlock for task state */ + spinlock_t lock; void *arg; int (*func)(void *arg); int ret; From 875ab4a8d9a7e559c4aaad28f5886d39923301b7 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 27 Sep 2022 13:53:27 +0800 Subject: [PATCH 023/122] RDMA/rxe: Make sure requested access is a subset of {mr,mw}->access We should reject the requests with access flags that is not registered by MR/MW. For example, lookup_mr() should return NULL when requested access is 0x03 and mr->access is 0x01. Link: https://lore.kernel.org/r/20220927055337.22630-2-lizhijian@fujitsu.com Signed-off-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mr.c | 2 +- drivers/infiniband/sw/rxe/rxe_mw.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index d4f10c2d1aa7..014c27bba049 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -511,7 +511,7 @@ struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key, if (unlikely((type == RXE_LOOKUP_LOCAL && mr->lkey != key) || (type == RXE_LOOKUP_REMOTE && mr->rkey != key) || - mr_pd(mr) != pd || (access && !(access & mr->access)) || + mr_pd(mr) != pd || ((access & mr->access) != access) || mr->state != RXE_MR_STATE_VALID)) { rxe_put(mr); mr = NULL; diff --git a/drivers/infiniband/sw/rxe/rxe_mw.c b/drivers/infiniband/sw/rxe/rxe_mw.c index 902b7df7aaed..8df1c9066ed8 100644 --- a/drivers/infiniband/sw/rxe/rxe_mw.c +++ b/drivers/infiniband/sw/rxe/rxe_mw.c @@ -293,8 +293,7 @@ struct rxe_mw *rxe_lookup_mw(struct rxe_qp *qp, int access, u32 rkey) if (unlikely((mw->rkey != rkey) || rxe_mw_pd(mw) != pd || (mw->ibmw.type == IB_MW_TYPE_2 && mw->qp != qp) || - (mw->length == 0) || - (access && !(access & mw->access)) || + (mw->length == 0) || ((access & mw->access) != access) || mw->state != RXE_MW_STATE_VALID)) { rxe_put(mw); return NULL; From b071850ef62e36b2fc2ec81863f07be857151409 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Thu, 27 Oct 2022 07:31:33 +0000 Subject: [PATCH 024/122] RDMA/rxe: Remove the duplicate assignment of mr->map_shift mr->map_shift is set to ilog2(RXE_BUF_PER_MAP) in both rxe_mr_init() and rxe_mr_alloc() so remove the duplicate one in rxe_mr_init(). Link: https://lore.kernel.org/r/1666855893-145-1-git-send-email-yangx.jy@fujitsu.com Signed-off-by: Xiao Yang Reviewed-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mr.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index 014c27bba049..bc081002bddc 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -62,7 +62,6 @@ static void rxe_mr_init(int access, struct rxe_mr *mr) mr->rkey = mr->ibmr.rkey = rkey; mr->state = RXE_MR_STATE_INVALID; - mr->map_shift = ilog2(RXE_BUF_PER_MAP); } static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf) From 692373d186205dfb1b56f35f22702412d94d9420 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Fri, 28 Oct 2022 15:50:53 +0800 Subject: [PATCH 025/122] RDMA/rxe: cleanup some error handling in rxe_verbs.c Instead of 'goto and return', just return directly to simplify the error handling, and avoid some unnecessary return value check. Link: https://lore.kernel.org/r/20221028075053.3990467-1-xuhaoyue1@hisilicon.com Signed-off-by: Yunsheng Lin Signed-off-by: Haoyue Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_verbs.c | 80 ++++++++------------------- 1 file changed, 23 insertions(+), 57 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index f2f82efbaf6d..bcdfdadaebbc 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -238,7 +238,6 @@ static int rxe_destroy_ah(struct ib_ah *ibah, u32 flags) static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr) { - int err; int i; u32 length; struct rxe_recv_wqe *recv_wqe; @@ -246,15 +245,11 @@ static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr) int full; full = queue_full(rq->queue, QUEUE_TYPE_TO_DRIVER); - if (unlikely(full)) { - err = -ENOMEM; - goto err1; - } + if (unlikely(full)) + return -ENOMEM; - if (unlikely(num_sge > rq->max_sge)) { - err = -EINVAL; - goto err1; - } + if (unlikely(num_sge > rq->max_sge)) + return -EINVAL; length = 0; for (i = 0; i < num_sge; i++) @@ -275,9 +270,6 @@ static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr) queue_advance_producer(rq->queue, QUEUE_TYPE_TO_DRIVER); return 0; - -err1: - return err; } static int rxe_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init, @@ -343,10 +335,7 @@ static int rxe_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, if (err) return err; - err = rxe_srq_from_attr(rxe, srq, attr, mask, &ucmd, udata); - if (err) - return err; - return 0; + return rxe_srq_from_attr(rxe, srq, attr, mask, &ucmd, udata); } static int rxe_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) @@ -453,11 +442,11 @@ static int rxe_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, err = rxe_qp_chk_attr(rxe, qp, attr, mask); if (err) - goto err1; + return err; err = rxe_qp_from_attr(qp, attr, mask, udata); if (err) - goto err1; + return err; if ((mask & IB_QP_AV) && (attr->ah_attr.ah_flags & IB_AH_GRH)) qp->src_port = rdma_get_udp_sport(attr->ah_attr.grh.flow_label, @@ -465,9 +454,6 @@ static int rxe_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, qp->attr.dest_qp_num); return 0; - -err1: - return err; } static int rxe_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, @@ -501,24 +487,21 @@ static int validate_send_wr(struct rxe_qp *qp, const struct ib_send_wr *ibwr, struct rxe_sq *sq = &qp->sq; if (unlikely(num_sge > sq->max_sge)) - goto err1; + return -EINVAL; if (unlikely(mask & WR_ATOMIC_MASK)) { if (length < 8) - goto err1; + return -EINVAL; if (atomic_wr(ibwr)->remote_addr & 0x7) - goto err1; + return -EINVAL; } if (unlikely((ibwr->send_flags & IB_SEND_INLINE) && (length > sq->max_inline))) - goto err1; + return -EINVAL; return 0; - -err1: - return -EINVAL; } static void init_send_wr(struct rxe_qp *qp, struct rxe_send_wr *wr, @@ -735,14 +718,12 @@ static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, if (unlikely((qp_state(qp) < IB_QPS_INIT) || !qp->valid)) { *bad_wr = wr; - err = -EINVAL; - goto err1; + return -EINVAL; } if (unlikely(qp->srq)) { *bad_wr = wr; - err = -EINVAL; - goto err1; + return -EINVAL; } spin_lock_irqsave(&rq->producer_lock, flags); @@ -761,7 +742,6 @@ static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, if (qp->resp.state == QP_STATE_ERROR) rxe_sched_task(&qp->resp.task); -err1: return err; } @@ -826,16 +806,9 @@ static int rxe_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) err = rxe_cq_chk_attr(rxe, cq, cqe, 0); if (err) - goto err1; + return err; - err = rxe_cq_resize_queue(cq, cqe, uresp, udata); - if (err) - goto err1; - - return 0; - -err1: - return err; + return rxe_cq_resize_queue(cq, cqe, uresp, udata); } static int rxe_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) @@ -921,26 +894,22 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, struct rxe_mr *mr; mr = rxe_alloc(&rxe->mr_pool); - if (!mr) { - err = -ENOMEM; - goto err2; - } - + if (!mr) + return ERR_PTR(-ENOMEM); rxe_get(pd); mr->ibmr.pd = ibpd; err = rxe_mr_init_user(rxe, start, length, iova, access, mr); if (err) - goto err3; + goto err1; rxe_finalize(mr); return &mr->ibmr; -err3: +err1: rxe_cleanup(mr); -err2: return ERR_PTR(err); } @@ -956,25 +925,22 @@ static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, return ERR_PTR(-EINVAL); mr = rxe_alloc(&rxe->mr_pool); - if (!mr) { - err = -ENOMEM; - goto err1; - } + if (!mr) + return ERR_PTR(-ENOMEM); rxe_get(pd); mr->ibmr.pd = ibpd; err = rxe_mr_init_fast(max_num_sg, mr); if (err) - goto err2; + goto err1; rxe_finalize(mr); return &mr->ibmr; -err2: - rxe_cleanup(mr); err1: + rxe_cleanup(mr); return ERR_PTR(err); } From abef378c434e6f5abd46fd536e9972374fb74e98 Mon Sep 17 00:00:00 2001 From: Arumugam Kolappan Date: Tue, 1 Nov 2022 00:27:44 -0700 Subject: [PATCH 026/122] RDMA/mlx5: Change debug log level for remote access error syndromes The mlx5 driver dumps the entire CQE buffer by default for few syndromes. Some syndromes are expected due to the application behavior [ex: MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR, MLX5_CQE_SYNDROME_REMOTE_OP_ERR and MLX5_CQE_SYNDROME_LOCAL_PROT_ERR]. Hence, for these syndromes, the patch converts the log level from KERN_WARNING to KERN_DEBUG. This enables the application to get the CQE buffer dump by changing to KERN_DEBUG level as and when needed. Suggested-by: Leon Romanovsky Signed-off-by: Arumugam Kolappan Link: https://lore.kernel.org/r/1667287664-19377-1-git-send-email-aru.kolappan@oracle.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/cq.c | 27 +++++++++++++++------------ drivers/infiniband/hw/mlx5/mlx5_ib.h | 4 ++++ 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index be189e0525de..efc9e4a6df04 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -267,17 +267,20 @@ static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, wc->wc_flags |= IB_WC_WITH_NETWORK_HDR_TYPE; } -static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe) +static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe, + struct ib_wc *wc, const char *level) { - mlx5_ib_warn(dev, "dump error cqe\n"); - mlx5_dump_err_cqe(dev->mdev, cqe); + mlx5_ib_log(level, dev, "WC error: %d, Message: %s\n", wc->status, + ib_wc_status_msg(wc->status)); + print_hex_dump(level, "cqe_dump: ", DUMP_PREFIX_OFFSET, 16, 1, + cqe, sizeof(*cqe), false); } static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe, struct ib_wc *wc) { - int dump = 1; + const char *dump = KERN_WARNING; switch (cqe->syndrome) { case MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR: @@ -287,10 +290,11 @@ static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev, wc->status = IB_WC_LOC_QP_OP_ERR; break; case MLX5_CQE_SYNDROME_LOCAL_PROT_ERR: + dump = KERN_DEBUG; wc->status = IB_WC_LOC_PROT_ERR; break; case MLX5_CQE_SYNDROME_WR_FLUSH_ERR: - dump = 0; + dump = NULL; wc->status = IB_WC_WR_FLUSH_ERR; break; case MLX5_CQE_SYNDROME_MW_BIND_ERR: @@ -306,18 +310,20 @@ static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev, wc->status = IB_WC_REM_INV_REQ_ERR; break; case MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR: + dump = KERN_DEBUG; wc->status = IB_WC_REM_ACCESS_ERR; break; case MLX5_CQE_SYNDROME_REMOTE_OP_ERR: + dump = KERN_DEBUG; wc->status = IB_WC_REM_OP_ERR; break; case MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR: + dump = NULL; wc->status = IB_WC_RETRY_EXC_ERR; - dump = 0; break; case MLX5_CQE_SYNDROME_RNR_RETRY_EXC_ERR: + dump = NULL; wc->status = IB_WC_RNR_RETRY_EXC_ERR; - dump = 0; break; case MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR: wc->status = IB_WC_REM_ABORT_ERR; @@ -328,11 +334,8 @@ static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev, } wc->vendor_err = cqe->vendor_err_synd; - if (dump) { - mlx5_ib_warn(dev, "WC error: %d, Message: %s\n", wc->status, - ib_wc_status_msg(wc->status)); - dump_cqe(dev, cqe); - } + if (dump) + dump_cqe(dev, cqe, wc, dump); } static void handle_atomics(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64, diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 4a7f7064bd0e..8b91babdd4c0 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -38,6 +38,10 @@ dev_warn(&(_dev)->ib_dev.dev, "%s:%d:(pid %d): " format, __func__, \ __LINE__, current->pid, ##arg) +#define mlx5_ib_log(lvl, _dev, format, arg...) \ + dev_printk(lvl, &(_dev)->ib_dev.dev, "%s:%d:(pid %d): " format, \ + __func__, __LINE__, current->pid, ##arg) + #define MLX5_IB_DEFAULT_UIDX 0xffffff #define MLX5_USER_ASSIGNED_UIDX_MASK __mlx5_mask(qpc, user_index) From 4eace75e0853273755b878ffa9cce6de84df975a Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Fri, 4 Nov 2022 18:49:57 -0500 Subject: [PATCH 027/122] RDMA/irdma: Report the correct link speed The active link speed is currently hard-coded in irdma_query_port due to which the port rate in ibstatus does reflect the active link speed. Call ib_get_eth_speed in irdma_query_port to get the active link speed. Fixes: b48c24c2d710 ("RDMA/irdma: Implement device supported verb APIs") Reported-by: Kamal Heib Signed-off-by: Shiraz Saleem Link: https://lore.kernel.org/r/20221104234957.1135-1-shiraz.saleem@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/verbs.c | 35 +++-------------------------- 1 file changed, 3 insertions(+), 32 deletions(-) diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index a22afbb25bc5..434241789f12 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -63,36 +63,6 @@ static int irdma_query_device(struct ib_device *ibdev, return 0; } -/** - * irdma_get_eth_speed_and_width - Get IB port speed and width from netdev speed - * @link_speed: netdev phy link speed - * @active_speed: IB port speed - * @active_width: IB port width - */ -static void irdma_get_eth_speed_and_width(u32 link_speed, u16 *active_speed, - u8 *active_width) -{ - if (link_speed <= SPEED_1000) { - *active_width = IB_WIDTH_1X; - *active_speed = IB_SPEED_SDR; - } else if (link_speed <= SPEED_10000) { - *active_width = IB_WIDTH_1X; - *active_speed = IB_SPEED_FDR10; - } else if (link_speed <= SPEED_20000) { - *active_width = IB_WIDTH_4X; - *active_speed = IB_SPEED_DDR; - } else if (link_speed <= SPEED_25000) { - *active_width = IB_WIDTH_1X; - *active_speed = IB_SPEED_EDR; - } else if (link_speed <= SPEED_40000) { - *active_width = IB_WIDTH_4X; - *active_speed = IB_SPEED_FDR10; - } else { - *active_width = IB_WIDTH_4X; - *active_speed = IB_SPEED_EDR; - } -} - /** * irdma_query_port - get port attributes * @ibdev: device pointer from stack @@ -120,8 +90,9 @@ static int irdma_query_port(struct ib_device *ibdev, u32 port, props->state = IB_PORT_DOWN; props->phys_state = IB_PORT_PHYS_STATE_DISABLED; } - irdma_get_eth_speed_and_width(SPEED_100000, &props->active_speed, - &props->active_width); + + ib_get_eth_speed(ibdev, port, &props->active_speed, + &props->active_width); if (rdma_protocol_roce(ibdev, 1)) { props->gid_tbl_len = 32; From ece43fad220ba03c529cc0f6f302d796044e8476 Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Mon, 7 Nov 2022 10:18:43 +0800 Subject: [PATCH 028/122] RDMA/erdma: Extend access right field of FRMR and REG MR to support atomic To support atomic operations, IB_ACCESS_REMOTE_ATOMIC right should be passed to hardware for permission check. Since "access mode" field in FRMR SQE and RegMr command is never used by hw, we remove the "access mode" field, so that we can then have enough space to extend access fields. Signed-off-by: Cheng Xu Link: https://lore.kernel.org/r/20221107021845.44598-2-chengyou@linux.alibaba.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma_hw.h | 6 ++---- drivers/infiniband/hw/erdma/erdma_qp.c | 3 +-- drivers/infiniband/hw/erdma/erdma_verbs.c | 3 +-- drivers/infiniband/hw/erdma/erdma_verbs.h | 12 +++++++----- 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h index e788887732e1..2a9a4c73d52c 100644 --- a/drivers/infiniband/hw/erdma/erdma_hw.h +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -224,8 +224,7 @@ struct erdma_cmdq_create_cq_req { /* regmr cfg1 */ #define ERDMA_CMD_REGMR_PD_MASK GENMASK(31, 12) #define ERDMA_CMD_REGMR_TYPE_MASK GENMASK(7, 6) -#define ERDMA_CMD_REGMR_RIGHT_MASK GENMASK(5, 2) -#define ERDMA_CMD_REGMR_ACC_MODE_MASK GENMASK(1, 0) +#define ERDMA_CMD_REGMR_RIGHT_MASK GENMASK(5, 1) /* regmr cfg2 */ #define ERDMA_CMD_REGMR_PAGESIZE_MASK GENMASK(31, 27) @@ -370,8 +369,7 @@ struct erdma_rqe { #define ERDMA_SQE_HDR_WQEBB_INDEX_MASK GENMASK_ULL(15, 0) /* REG MR attrs */ -#define ERDMA_SQE_MR_MODE_MASK GENMASK(1, 0) -#define ERDMA_SQE_MR_ACCESS_MASK GENMASK(5, 2) +#define ERDMA_SQE_MR_ACCESS_MASK GENMASK(5, 1) #define ERDMA_SQE_MR_MTT_TYPE_MASK GENMASK(7, 6) #define ERDMA_SQE_MR_MTT_CNT_MASK GENMASK(31, 12) diff --git a/drivers/infiniband/hw/erdma/erdma_qp.c b/drivers/infiniband/hw/erdma/erdma_qp.c index 5fe1a339a435..c7f343173cb9 100644 --- a/drivers/infiniband/hw/erdma/erdma_qp.c +++ b/drivers/infiniband/hw/erdma/erdma_qp.c @@ -397,8 +397,7 @@ static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi, regmr_sge->addr = cpu_to_le64(mr->ibmr.iova); regmr_sge->length = cpu_to_le32(mr->ibmr.length); regmr_sge->stag = cpu_to_le32(reg_wr(send_wr)->key); - attrs = FIELD_PREP(ERDMA_SQE_MR_MODE_MASK, 0) | - FIELD_PREP(ERDMA_SQE_MR_ACCESS_MASK, mr->access) | + attrs = FIELD_PREP(ERDMA_SQE_MR_ACCESS_MASK, mr->access) | FIELD_PREP(ERDMA_SQE_MR_MTT_CNT_MASK, mr->mem.mtt_nents); diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index 62be98e2b941..f3bf87f17527 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -118,8 +118,7 @@ static int regmr_cmd(struct erdma_dev *dev, struct erdma_mr *mr) FIELD_PREP(ERDMA_CMD_MR_MPT_IDX_MASK, mr->ibmr.lkey >> 8); req.cfg1 = FIELD_PREP(ERDMA_CMD_REGMR_PD_MASK, pd->pdn) | FIELD_PREP(ERDMA_CMD_REGMR_TYPE_MASK, mr->type) | - FIELD_PREP(ERDMA_CMD_REGMR_RIGHT_MASK, mr->access) | - FIELD_PREP(ERDMA_CMD_REGMR_ACC_MODE_MASK, 0); + FIELD_PREP(ERDMA_CMD_REGMR_RIGHT_MASK, mr->access); req.cfg2 = FIELD_PREP(ERDMA_CMD_REGMR_PAGESIZE_MASK, ilog2(mr->mem.page_size)) | FIELD_PREP(ERDMA_CMD_REGMR_MTT_TYPE_MASK, mr->mem.mtt_type) | diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h index ab6380635e9e..a5574f0252bb 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.h +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -71,16 +71,18 @@ struct erdma_pd { #define ERDMA_MR_INLINE_MTT 0 #define ERDMA_MR_INDIRECT_MTT 1 -#define ERDMA_MR_ACC_LR BIT(0) -#define ERDMA_MR_ACC_LW BIT(1) -#define ERDMA_MR_ACC_RR BIT(2) -#define ERDMA_MR_ACC_RW BIT(3) +#define ERDMA_MR_ACC_RA BIT(0) +#define ERDMA_MR_ACC_LR BIT(1) +#define ERDMA_MR_ACC_LW BIT(2) +#define ERDMA_MR_ACC_RR BIT(3) +#define ERDMA_MR_ACC_RW BIT(4) static inline u8 to_erdma_access_flags(int access) { return (access & IB_ACCESS_REMOTE_READ ? ERDMA_MR_ACC_RR : 0) | (access & IB_ACCESS_LOCAL_WRITE ? ERDMA_MR_ACC_LW : 0) | - (access & IB_ACCESS_REMOTE_WRITE ? ERDMA_MR_ACC_RW : 0); + (access & IB_ACCESS_REMOTE_WRITE ? ERDMA_MR_ACC_RW : 0) | + (access & IB_ACCESS_REMOTE_ATOMIC ? ERDMA_MR_ACC_RA : 0); } struct erdma_mem { From 71c6925f280ae8cb52eafee2404ae75c176c28ba Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Mon, 7 Nov 2022 10:18:44 +0800 Subject: [PATCH 029/122] RDMA/erdma: Report atomic capacity when hardware supports atomic feature Introduce "capacity flags" field at where hardware put all zeros originally in "query device" response. Using this field, hardware can report atomic feature if supports. Signed-off-by: Cheng Xu Link: https://lore.kernel.org/r/20221107021845.44598-3-chengyou@linux.alibaba.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma.h | 1 + drivers/infiniband/hw/erdma/erdma_hw.h | 5 +++++ drivers/infiniband/hw/erdma/erdma_main.c | 1 + drivers/infiniband/hw/erdma/erdma_verbs.c | 4 ++++ 4 files changed, 11 insertions(+) diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h index 730783fbc894..bb23d897c710 100644 --- a/drivers/infiniband/hw/erdma/erdma.h +++ b/drivers/infiniband/hw/erdma/erdma.h @@ -124,6 +124,7 @@ struct erdma_devattr { u32 fw_version; unsigned char peer_addr[ETH_ALEN]; + unsigned long cap_flags; int numa_node; enum erdma_cc_alg cc; diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h index 2a9a4c73d52c..808e7ee56d93 100644 --- a/drivers/infiniband/hw/erdma/erdma_hw.h +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -303,6 +303,7 @@ struct erdma_cmdq_destroy_qp_req { /* cap qword 0 definition */ #define ERDMA_CMD_DEV_CAP_MAX_CQE_MASK GENMASK_ULL(47, 40) +#define ERDMA_CMD_DEV_CAP_FLAGS_MASK GENMASK_ULL(31, 24) #define ERDMA_CMD_DEV_CAP_MAX_RECV_WR_MASK GENMASK_ULL(23, 16) #define ERDMA_CMD_DEV_CAP_MAX_MR_SIZE_MASK GENMASK_ULL(7, 0) @@ -314,6 +315,10 @@ struct erdma_cmdq_destroy_qp_req { #define ERDMA_NQP_PER_QBLOCK 1024 +enum { + ERDMA_DEV_CAP_FLAGS_ATOMIC = 1 << 7, +}; + #define ERDMA_CMD_INFO0_FW_VER_MASK GENMASK_ULL(31, 0) /* CQE hdr */ diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c index 49778bb294ae..e44b06fea595 100644 --- a/drivers/infiniband/hw/erdma/erdma_main.c +++ b/drivers/infiniband/hw/erdma/erdma_main.c @@ -374,6 +374,7 @@ static int erdma_dev_attrs_init(struct erdma_dev *dev) dev->attrs.max_qp = ERDMA_NQP_PER_QBLOCK * ERDMA_GET_CAP(QBLOCK, cap1); dev->attrs.max_mr = dev->attrs.max_qp << 1; dev->attrs.max_cq = dev->attrs.max_qp << 1; + dev->attrs.cap_flags = ERDMA_GET_CAP(FLAGS, cap0); dev->attrs.max_send_wr = ERDMA_MAX_SEND_WR; dev->attrs.max_ord = ERDMA_MAX_ORD; diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index f3bf87f17527..d843ce1f35f3 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -288,6 +288,10 @@ int erdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr, attr->max_mw = dev->attrs.max_mw; attr->max_fast_reg_page_list_len = ERDMA_MAX_FRMR_PA; attr->page_size_cap = ERDMA_PAGE_SIZE_SUPPORT; + + if (dev->attrs.cap_flags & ERDMA_DEV_CAP_FLAGS_ATOMIC) + attr->atomic_cap = IB_ATOMIC_GLOB; + attr->fw_ver = dev->attrs.fw_version; if (dev->netdev) From 0ca9c2e2844aa285c3656a29d4803839cfa8bca9 Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Mon, 7 Nov 2022 10:18:45 +0800 Subject: [PATCH 030/122] RDMA/erdma: Implement atomic operations support Add atomic operations support in post_send and poll_cq implementation. Also, rename 'laddr' and 'lkey' in struct erdma_sge to 'addr' and 'key', because this structure is used for both local and remote SGEs. Signed-off-by: Cheng Xu Link: https://lore.kernel.org/r/20221107021845.44598-4-chengyou@linux.alibaba.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma_cq.c | 2 ++ drivers/infiniband/hw/erdma/erdma_hw.h | 18 ++++++++++-- drivers/infiniband/hw/erdma/erdma_qp.c | 40 ++++++++++++++++++++++---- 3 files changed, 52 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/erdma/erdma_cq.c b/drivers/infiniband/hw/erdma/erdma_cq.c index 58e0dc5c75d1..cabd8678b355 100644 --- a/drivers/infiniband/hw/erdma/erdma_cq.c +++ b/drivers/infiniband/hw/erdma/erdma_cq.c @@ -64,6 +64,8 @@ static const enum ib_wc_opcode wc_mapping_table[ERDMA_NUM_OPCODES] = { [ERDMA_OP_REG_MR] = IB_WC_REG_MR, [ERDMA_OP_LOCAL_INV] = IB_WC_LOCAL_INV, [ERDMA_OP_READ_WITH_INV] = IB_WC_RDMA_READ, + [ERDMA_OP_ATOMIC_CAS] = IB_WC_COMP_SWAP, + [ERDMA_OP_ATOMIC_FAD] = IB_WC_FETCH_ADD, }; static const struct { diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h index 808e7ee56d93..1b2e2b70678f 100644 --- a/drivers/infiniband/hw/erdma/erdma_hw.h +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -344,9 +344,9 @@ struct erdma_cqe { }; struct erdma_sge { - __aligned_le64 laddr; + __aligned_le64 addr; __le32 length; - __le32 lkey; + __le32 key; }; /* Receive Queue Element */ @@ -413,6 +413,16 @@ struct erdma_readreq_sqe { __le32 rsvd; }; +struct erdma_atomic_sqe { + __le64 hdr; + __le64 rsvd; + __le64 fetchadd_swap_data; + __le64 cmp_data; + + struct erdma_sge remote; + struct erdma_sge sgl; +}; + struct erdma_reg_mr_sqe { __le64 hdr; __le64 addr; @@ -472,7 +482,9 @@ enum erdma_opcode { ERDMA_OP_REG_MR = 14, ERDMA_OP_LOCAL_INV = 15, ERDMA_OP_READ_WITH_INV = 16, - ERDMA_NUM_OPCODES = 17, + ERDMA_OP_ATOMIC_CAS = 17, + ERDMA_OP_ATOMIC_FAD = 18, + ERDMA_NUM_OPCODES = 19, ERDMA_OP_INVALID = ERDMA_NUM_OPCODES + 1 }; diff --git a/drivers/infiniband/hw/erdma/erdma_qp.c b/drivers/infiniband/hw/erdma/erdma_qp.c index c7f343173cb9..521e97258de7 100644 --- a/drivers/infiniband/hw/erdma/erdma_qp.c +++ b/drivers/infiniband/hw/erdma/erdma_qp.c @@ -285,15 +285,16 @@ static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi, u32 wqe_size, wqebb_cnt, hw_op, flags, sgl_offset; u32 idx = *pi & (qp->attrs.sq_size - 1); enum ib_wr_opcode op = send_wr->opcode; + struct erdma_atomic_sqe *atomic_sqe; struct erdma_readreq_sqe *read_sqe; struct erdma_reg_mr_sqe *regmr_sge; struct erdma_write_sqe *write_sqe; struct erdma_send_sqe *send_sqe; struct ib_rdma_wr *rdma_wr; - struct erdma_mr *mr; + struct erdma_sge *sge; __le32 *length_field; + struct erdma_mr *mr; u64 wqe_hdr, *entry; - struct ib_sge *sge; u32 attrs; int ret; @@ -360,9 +361,9 @@ static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi, sge = get_queue_entry(qp->kern_qp.sq_buf, idx + 1, qp->attrs.sq_size, SQEBB_SHIFT); - sge->addr = rdma_wr->remote_addr; - sge->lkey = rdma_wr->rkey; - sge->length = send_wr->sg_list[0].length; + sge->addr = cpu_to_le64(rdma_wr->remote_addr); + sge->key = cpu_to_le32(rdma_wr->rkey); + sge->length = cpu_to_le32(send_wr->sg_list[0].length); wqe_size = sizeof(struct erdma_readreq_sqe) + send_wr->num_sge * sizeof(struct ib_sge); @@ -423,6 +424,35 @@ static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi, regmr_sge->stag = cpu_to_le32(send_wr->ex.invalidate_rkey); wqe_size = sizeof(struct erdma_reg_mr_sqe); goto out; + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + atomic_sqe = (struct erdma_atomic_sqe *)entry; + if (op == IB_WR_ATOMIC_CMP_AND_SWP) { + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, + ERDMA_OP_ATOMIC_CAS); + atomic_sqe->fetchadd_swap_data = + cpu_to_le64(atomic_wr(send_wr)->swap); + atomic_sqe->cmp_data = + cpu_to_le64(atomic_wr(send_wr)->compare_add); + } else { + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, + ERDMA_OP_ATOMIC_FAD); + atomic_sqe->fetchadd_swap_data = + cpu_to_le64(atomic_wr(send_wr)->compare_add); + } + + sge = get_queue_entry(qp->kern_qp.sq_buf, idx + 1, + qp->attrs.sq_size, SQEBB_SHIFT); + sge->addr = cpu_to_le64(atomic_wr(send_wr)->remote_addr); + sge->key = cpu_to_le32(atomic_wr(send_wr)->rkey); + sge++; + + sge->addr = cpu_to_le64(send_wr->sg_list[0].addr); + sge->key = cpu_to_le32(send_wr->sg_list[0].lkey); + sge->length = cpu_to_le32(send_wr->sg_list[0].length); + + wqe_size = sizeof(*atomic_sqe); + goto out; default: return -EOPNOTSUPP; } From bdf1da5df9da680589a7f74448dd0a94dd3e1446 Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Mon, 7 Nov 2022 15:50:57 +0100 Subject: [PATCH 031/122] RDMA/siw: Fix immediate work request flush to completion queue Correctly set send queue element opcode during immediate work request flushing in post sendqueue operation, if the QP is in ERROR state. An undefined ocode value results in out-of-bounds access to an array for mapping the opcode between siw internal and RDMA core representation in work completion generation. It resulted in a KASAN BUG report of type 'global-out-of-bounds' during NFSoRDMA testing. This patch further fixes a potential case of a malicious user which may write undefined values for completion queue elements status or opcode, if the CQ is memory mapped to user land. It avoids the same out-of-bounds access to arrays for status and opcode mapping as described above. Fixes: 303ae1cdfdf7 ("rdma/siw: application interface") Fixes: b0fff7317bb4 ("rdma/siw: completion queue methods") Reported-by: Olga Kornievskaia Reviewed-by: Tom Talpey Signed-off-by: Bernard Metzler Link: https://lore.kernel.org/r/20221107145057.895747-1-bmt@zurich.ibm.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/siw/siw_cq.c | 24 ++++++++++++++-- drivers/infiniband/sw/siw/siw_verbs.c | 40 ++++++++++++++++++++++++--- 2 files changed, 58 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/sw/siw/siw_cq.c b/drivers/infiniband/sw/siw/siw_cq.c index d68e37859e73..acc7bcd538b5 100644 --- a/drivers/infiniband/sw/siw/siw_cq.c +++ b/drivers/infiniband/sw/siw/siw_cq.c @@ -56,8 +56,6 @@ int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc) if (READ_ONCE(cqe->flags) & SIW_WQE_VALID) { memset(wc, 0, sizeof(*wc)); wc->wr_id = cqe->id; - wc->status = map_cqe_status[cqe->status].ib; - wc->opcode = map_wc_opcode[cqe->opcode]; wc->byte_len = cqe->bytes; /* @@ -71,10 +69,32 @@ int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc) wc->wc_flags = IB_WC_WITH_INVALIDATE; } wc->qp = cqe->base_qp; + wc->opcode = map_wc_opcode[cqe->opcode]; + wc->status = map_cqe_status[cqe->status].ib; siw_dbg_cq(cq, "idx %u, type %d, flags %2x, id 0x%pK\n", cq->cq_get % cq->num_cqe, cqe->opcode, cqe->flags, (void *)(uintptr_t)cqe->id); + } else { + /* + * A malicious user may set invalid opcode or + * status in the user mmapped CQE array. + * Sanity check and correct values in that case + * to avoid out-of-bounds access to global arrays + * for opcode and status mapping. + */ + u8 opcode = cqe->opcode; + u16 status = cqe->status; + + if (opcode >= SIW_NUM_OPCODES) { + opcode = 0; + status = IB_WC_GENERAL_ERR; + } else if (status >= SIW_NUM_WC_STATUS) { + status = IB_WC_GENERAL_ERR; + } + wc->opcode = map_wc_opcode[opcode]; + wc->status = map_cqe_status[status].ib; + } WRITE_ONCE(cqe->flags, 0); cq->cq_get++; diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c index 3e814cfb298c..906fde1a2a0d 100644 --- a/drivers/infiniband/sw/siw/siw_verbs.c +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -676,13 +676,45 @@ static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr, static int siw_sq_flush_wr(struct siw_qp *qp, const struct ib_send_wr *wr, const struct ib_send_wr **bad_wr) { - struct siw_sqe sqe = {}; int rv = 0; while (wr) { - sqe.id = wr->wr_id; - sqe.opcode = wr->opcode; - rv = siw_sqe_complete(qp, &sqe, 0, SIW_WC_WR_FLUSH_ERR); + struct siw_sqe sqe = {}; + + switch (wr->opcode) { + case IB_WR_RDMA_WRITE: + sqe.opcode = SIW_OP_WRITE; + break; + case IB_WR_RDMA_READ: + sqe.opcode = SIW_OP_READ; + break; + case IB_WR_RDMA_READ_WITH_INV: + sqe.opcode = SIW_OP_READ_LOCAL_INV; + break; + case IB_WR_SEND: + sqe.opcode = SIW_OP_SEND; + break; + case IB_WR_SEND_WITH_IMM: + sqe.opcode = SIW_OP_SEND_WITH_IMM; + break; + case IB_WR_SEND_WITH_INV: + sqe.opcode = SIW_OP_SEND_REMOTE_INV; + break; + case IB_WR_LOCAL_INV: + sqe.opcode = SIW_OP_INVAL_STAG; + break; + case IB_WR_REG_MR: + sqe.opcode = SIW_OP_REG_MR; + break; + default: + rv = -EINVAL; + break; + } + if (!rv) { + sqe.id = wr->wr_id; + rv = siw_sqe_complete(qp, &sqe, 0, + SIW_WC_WR_FLUSH_ERR); + } if (rv) { if (bad_wr) *bad_wr = wr; From 837a55847ead27362aac80aa1cf402459a9757f7 Mon Sep 17 00:00:00 2001 From: Daisuke Matsuda Date: Mon, 7 Nov 2022 14:53:38 +0900 Subject: [PATCH 032/122] RDMA/rxe: Implement packet length validation on responder The function check_length() is supposed to check the length of inbound packets on responder, but it actually has been a stub since the driver was born. Let it check the payload length and the DMA length. Signed-off-by: Daisuke Matsuda Link: https://lore.kernel.org/r/20221107055338.357184-1-matsuda-daisuke@fujitsu.com Reviewed-by: Li Zhijian Acked-by: Zhu Yanjun Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_resp.c | 34 ++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index c32bc12cc82f..382d2053db43 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -393,16 +393,36 @@ static enum resp_states check_resource(struct rxe_qp *qp, static enum resp_states check_length(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { - switch (qp_type(qp)) { - case IB_QPT_RC: - return RESPST_CHK_RKEY; + int mtu = qp->mtu; + u32 payload = payload_size(pkt); + u32 dmalen = reth_len(pkt); - case IB_QPT_UC: - return RESPST_CHK_RKEY; + /* RoCEv2 packets do not have LRH. + * Let's skip checking it. + */ - default: - return RESPST_CHK_RKEY; + if ((pkt->opcode & RXE_START_MASK) && + (pkt->opcode & RXE_END_MASK)) { + /* "only" packets */ + if (payload > mtu) + return RESPST_ERR_LENGTH; + } else if ((pkt->opcode & RXE_START_MASK) || + (pkt->opcode & RXE_MIDDLE_MASK)) { + /* "first" or "middle" packets */ + if (payload != mtu) + return RESPST_ERR_LENGTH; + } else if (pkt->opcode & RXE_END_MASK) { + /* "last" packets */ + if ((payload == 0) || (payload > mtu)) + return RESPST_ERR_LENGTH; } + + if (pkt->opcode & (RXE_WRITE_MASK | RXE_READ_MASK)) { + if (dmalen > (1 << 31)) + return RESPST_ERR_LENGTH; + } + + return RESPST_CHK_RKEY; } static enum resp_states check_rkey(struct rxe_qp *qp, From 5c20311d76cbaeb7ed2ecf9c8b8322f8fc4a7ae3 Mon Sep 17 00:00:00 2001 From: Leonid Ravich Date: Wed, 9 Nov 2022 11:57:17 +0200 Subject: [PATCH 033/122] IB/mad: Don't call to function that might sleep while in atomic context Tracepoints are not allowed to sleep, as such the following splat is generated due to call to ib_query_pkey() in atomic context. WARNING: CPU: 0 PID: 1888000 at kernel/trace/ring_buffer.c:2492 rb_commit+0xc1/0x220 CPU: 0 PID: 1888000 Comm: kworker/u9:0 Kdump: loaded Tainted: G OE --------- - - 4.18.0-305.3.1.el8.x86_64 #1 Hardware name: Red Hat KVM, BIOS 1.13.0-2.module_el8.3.0+555+a55c8938 04/01/2014 Workqueue: ib-comp-unb-wq ib_cq_poll_work [ib_core] RIP: 0010:rb_commit+0xc1/0x220 RSP: 0000:ffffa8ac80f9bca0 EFLAGS: 00010202 RAX: ffff8951c7c01300 RBX: ffff8951c7c14a00 RCX: 0000000000000246 RDX: ffff8951c707c000 RSI: ffff8951c707c57c RDI: ffff8951c7c14a00 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: ffff8951c7c01300 R11: 0000000000000001 R12: 0000000000000246 R13: 0000000000000000 R14: ffffffff964c70c0 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff8951fbc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f20e8f39010 CR3: 000000002ca10005 CR4: 0000000000170ef0 Call Trace: ring_buffer_unlock_commit+0x1d/0xa0 trace_buffer_unlock_commit_regs+0x3b/0x1b0 trace_event_buffer_commit+0x67/0x1d0 trace_event_raw_event_ib_mad_recv_done_handler+0x11c/0x160 [ib_core] ib_mad_recv_done+0x48b/0xc10 [ib_core] ? trace_event_raw_event_cq_poll+0x6f/0xb0 [ib_core] __ib_process_cq+0x91/0x1c0 [ib_core] ib_cq_poll_work+0x26/0x80 [ib_core] process_one_work+0x1a7/0x360 ? create_worker+0x1a0/0x1a0 worker_thread+0x30/0x390 ? create_worker+0x1a0/0x1a0 kthread+0x116/0x130 ? kthread_flush_work_fn+0x10/0x10 ret_from_fork+0x35/0x40 ---[ end trace 78ba8509d3830a16 ]--- Fixes: 821bf1de45a1 ("IB/MAD: Add recv path trace point") Signed-off-by: Leonid Ravich Link: https://lore.kernel.org/r/Y2t5feomyznrVj7V@leonid-Inspiron-3421 Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/mad.c | 5 ----- include/trace/events/ib_mad.h | 13 ++++--------- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 1893aa613ad7..674344eb8e2f 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -59,9 +59,6 @@ static void create_mad_addr_info(struct ib_mad_send_wr_private *mad_send_wr, struct ib_mad_qp_info *qp_info, struct trace_event_raw_ib_mad_send_template *entry) { - u16 pkey; - struct ib_device *dev = qp_info->port_priv->device; - u32 pnum = qp_info->port_priv->port_num; struct ib_ud_wr *wr = &mad_send_wr->send_wr; struct rdma_ah_attr attr = {}; @@ -69,8 +66,6 @@ static void create_mad_addr_info(struct ib_mad_send_wr_private *mad_send_wr, /* These are common */ entry->sl = attr.sl; - ib_query_pkey(dev, pnum, wr->pkey_index, &pkey); - entry->pkey = pkey; entry->rqpn = wr->remote_qpn; entry->rqkey = wr->remote_qkey; entry->dlid = rdma_ah_get_dlid(&attr); diff --git a/include/trace/events/ib_mad.h b/include/trace/events/ib_mad.h index 59363a083ecb..d92691c78cff 100644 --- a/include/trace/events/ib_mad.h +++ b/include/trace/events/ib_mad.h @@ -49,7 +49,6 @@ DECLARE_EVENT_CLASS(ib_mad_send_template, __field(int, retries_left) __field(int, max_retries) __field(int, retry) - __field(u16, pkey) ), TP_fast_assign( @@ -89,7 +88,7 @@ DECLARE_EVENT_CLASS(ib_mad_send_template, "hdr : base_ver 0x%x class 0x%x class_ver 0x%x " \ "method 0x%x status 0x%x class_specific 0x%x tid 0x%llx " \ "attr_id 0x%x attr_mod 0x%x => dlid 0x%08x sl %d "\ - "pkey 0x%x rpqn 0x%x rqpkey 0x%x", + "rpqn 0x%x rqpkey 0x%x", __entry->dev_index, __entry->port_num, __entry->qp_num, __entry->agent_priv, be64_to_cpu(__entry->wrtid), __entry->retries_left, __entry->max_retries, @@ -100,7 +99,7 @@ DECLARE_EVENT_CLASS(ib_mad_send_template, be16_to_cpu(__entry->class_specific), be64_to_cpu(__entry->tid), be16_to_cpu(__entry->attr_id), be32_to_cpu(__entry->attr_mod), - be32_to_cpu(__entry->dlid), __entry->sl, __entry->pkey, + be32_to_cpu(__entry->dlid), __entry->sl, __entry->rqpn, __entry->rqkey ) ); @@ -204,7 +203,6 @@ TRACE_EVENT(ib_mad_recv_done_handler, __field(u16, wc_status) __field(u32, slid) __field(u32, dev_index) - __field(u16, pkey) ), TP_fast_assign( @@ -224,9 +222,6 @@ TRACE_EVENT(ib_mad_recv_done_handler, __entry->slid = wc->slid; __entry->src_qp = wc->src_qp; __entry->sl = wc->sl; - ib_query_pkey(qp_info->port_priv->device, - qp_info->port_priv->port_num, - wc->pkey_index, &__entry->pkey); __entry->wc_status = wc->status; ), @@ -234,7 +229,7 @@ TRACE_EVENT(ib_mad_recv_done_handler, "base_ver 0x%02x class 0x%02x class_ver 0x%02x " \ "method 0x%02x status 0x%04x class_specific 0x%04x " \ "tid 0x%016llx attr_id 0x%04x attr_mod 0x%08x " \ - "slid 0x%08x src QP%d, sl %d pkey 0x%04x", + "slid 0x%08x src QP%d, sl %d", __entry->dev_index, __entry->port_num, __entry->qp_num, __entry->wc_status, __entry->length, @@ -244,7 +239,7 @@ TRACE_EVENT(ib_mad_recv_done_handler, be16_to_cpu(__entry->class_specific), be64_to_cpu(__entry->tid), be16_to_cpu(__entry->attr_id), be32_to_cpu(__entry->attr_mod), - __entry->slid, __entry->src_qp, __entry->sl, __entry->pkey + __entry->slid, __entry->src_qp, __entry->sl ) ); From 4554bac48a8c464ff00136a64efe8847e4da4ea8 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:09:59 -0500 Subject: [PATCH 034/122] RDMA/rxe: Add ibdev_dbg macros for rxe Add macros borrowed from siw to call dynamic debug macro ibdev_dbg. Link: https://lore.kernel.org/r/20221103171013.20659-2-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h index 30fbdf3bc76a..ab334900fcc3 100644 --- a/drivers/infiniband/sw/rxe/rxe.h +++ b/drivers/infiniband/sw/rxe/rxe.h @@ -38,6 +38,25 @@ #define RXE_ROCE_V2_SPORT (0xc000) +#define rxe_dbg(rxe, fmt, ...) ibdev_dbg(&(rxe)->ib_dev, \ + "%s: " fmt, __func__, ##__VA_ARGS__) +#define rxe_dbg_uc(uc, fmt, ...) ibdev_dbg((uc)->ibuc.device, \ + "uc#%d %s: " fmt, (uc)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_dbg_pd(pd, fmt, ...) ibdev_dbg((pd)->ibpd.device, \ + "pd#%d %s: " fmt, (pd)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_dbg_ah(ah, fmt, ...) ibdev_dbg((ah)->ibah.device, \ + "ah#%d %s: " fmt, (ah)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_dbg_srq(srq, fmt, ...) ibdev_dbg((srq)->ibsrq.device, \ + "srq#%d %s: " fmt, (srq)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_dbg_qp(qp, fmt, ...) ibdev_dbg((qp)->ibqp.device, \ + "qp#%d %s: " fmt, (qp)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_dbg_cq(cq, fmt, ...) ibdev_dbg((cq)->ibcq.device, \ + "cq#%d %s: " fmt, (cq)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_dbg_mr(mr, fmt, ...) ibdev_dbg((mr)->ibmr.device, \ + "mr#%d %s: " fmt, (mr)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_dbg_mw(mw, fmt, ...) ibdev_dbg((mw)->ibmw.device, \ + "mw#%d %s: " fmt, (mw)->elem.index, __func__, ##__VA_ARGS__) + void rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu); int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name); From 27c4c520bd3908c095401e93c71e7d3696ae8bdd Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:00 -0500 Subject: [PATCH 035/122] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_comp.c Replace calls to pr_xxx() in rxe_comp.c with rxe_dbg_xxx(). Link: https://lore.kernel.org/r/20221103171013.20659-3-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_comp.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index 66f392810c86..4dca4f8bbb5a 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -114,7 +114,7 @@ void retransmit_timer(struct timer_list *t) { struct rxe_qp *qp = from_timer(qp, t, retrans_timer); - pr_debug("%s: fired for qp#%d\n", __func__, qp->elem.index); + rxe_dbg_qp(qp, "retransmit timer fired\n"); if (qp->valid) { qp->comp.timeout = 1; @@ -334,7 +334,7 @@ static inline enum comp_state check_ack(struct rxe_qp *qp, return COMPST_ERROR; default: - pr_warn("unexpected nak %x\n", syn); + rxe_dbg_qp(qp, "unexpected nak %x\n", syn); wqe->status = IB_WC_REM_OP_ERR; return COMPST_ERROR; } @@ -345,7 +345,7 @@ static inline enum comp_state check_ack(struct rxe_qp *qp, break; default: - pr_warn("unexpected opcode\n"); + rxe_dbg_qp(qp, "unexpected opcode\n"); } return COMPST_ERROR; @@ -598,8 +598,7 @@ int rxe_completer(void *arg) state = COMPST_GET_ACK; while (1) { - pr_debug("qp#%d state = %s\n", qp_num(qp), - comp_state_name[state]); + rxe_dbg_qp(qp, "state = %s\n", comp_state_name[state]); switch (state) { case COMPST_GET_ACK: skb = skb_dequeue(&qp->resp_pkts); @@ -746,8 +745,7 @@ int rxe_completer(void *arg) * rnr timer has fired */ qp->req.wait_for_rnr_timer = 1; - pr_debug("qp#%d set rnr nak timer\n", - qp_num(qp)); + rxe_dbg_qp(qp, "set rnr nak timer\n"); mod_timer(&qp->rnr_nak_timer, jiffies + rnrnak_jiffies(aeth_syn(pkt) & ~AETH_TYPE_MASK)); From 52920f537ab08237bd8a3d30ccbb06a9d57717cf Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:01 -0500 Subject: [PATCH 036/122] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_cq.c Replace calls to pr_xxx() in rxe_cq.c with rxe_dbg_xxx(). Link: https://lore.kernel.org/r/20221103171013.20659-4-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_cq.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c index b1a0ab3cd4bd..1df186534639 100644 --- a/drivers/infiniband/sw/rxe/rxe_cq.c +++ b/drivers/infiniband/sw/rxe/rxe_cq.c @@ -14,12 +14,12 @@ int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq, int count; if (cqe <= 0) { - pr_warn("cqe(%d) <= 0\n", cqe); + rxe_dbg(rxe, "cqe(%d) <= 0\n", cqe); goto err1; } if (cqe > rxe->attr.max_cqe) { - pr_debug("cqe(%d) > max_cqe(%d)\n", + rxe_dbg(rxe, "cqe(%d) > max_cqe(%d)\n", cqe, rxe->attr.max_cqe); goto err1; } @@ -27,7 +27,7 @@ int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq, if (cq) { count = queue_count(cq->queue, QUEUE_TYPE_TO_CLIENT); if (cqe < count) { - pr_debug("cqe(%d) < current # elements in queue (%d)", + rxe_dbg_cq(cq, "cqe(%d) < current # elements in queue (%d)", cqe, count); goto err1; } @@ -65,7 +65,7 @@ int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, cq->queue = rxe_queue_init(rxe, &cqe, sizeof(struct rxe_cqe), type); if (!cq->queue) { - pr_warn("unable to create cq\n"); + rxe_dbg(rxe, "unable to create cq\n"); return -ENOMEM; } From 2778b72b1df0c8ef61e0b3b3ef1c8c62eb03fa75 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:02 -0500 Subject: [PATCH 037/122] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_mr.c Replace calls to pr_xxx() in rxe_mr.c by rxe_dbg_mr(). Link: https://lore.kernel.org/r/20221103171013.20659-5-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mr.c | 40 ++++++++++++--------------- drivers/infiniband/sw/rxe/rxe_verbs.c | 3 ++ 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index bc081002bddc..cd846cf82a84 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -38,8 +38,7 @@ int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length) return 0; default: - pr_warn("%s: mr type (%d) not supported\n", - __func__, mr->ibmr.type); + rxe_dbg_mr(mr, "type (%d) not supported\n", mr->ibmr.type); return -EFAULT; } } @@ -125,8 +124,8 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, umem = ib_umem_get(&rxe->ib_dev, start, length, access); if (IS_ERR(umem)) { - pr_warn("%s: Unable to pin memory region err = %d\n", - __func__, (int)PTR_ERR(umem)); + rxe_dbg_mr(mr, "Unable to pin memory region err = %d\n", + (int)PTR_ERR(umem)); err = PTR_ERR(umem); goto err_out; } @@ -137,8 +136,7 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, err = rxe_mr_alloc(mr, num_buf); if (err) { - pr_warn("%s: Unable to allocate memory for map\n", - __func__); + rxe_dbg_mr(mr, "Unable to allocate memory for map\n"); goto err_release_umem; } @@ -159,8 +157,7 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, vaddr = page_address(sg_page_iter_page(&sg_iter)); if (!vaddr) { - pr_warn("%s: Unable to get virtual address\n", - __func__); + rxe_dbg_mr(mr, "Unable to get virtual address\n"); err = -ENOMEM; goto err_cleanup_map; } @@ -255,7 +252,7 @@ void *iova_to_vaddr(struct rxe_mr *mr, u64 iova, int length) void *addr; if (mr->state != RXE_MR_STATE_VALID) { - pr_warn("mr not in valid state\n"); + rxe_dbg_mr(mr, "Not in valid state\n"); addr = NULL; goto out; } @@ -266,7 +263,7 @@ void *iova_to_vaddr(struct rxe_mr *mr, u64 iova, int length) } if (mr_check_range(mr, iova, length)) { - pr_warn("range violation\n"); + rxe_dbg_mr(mr, "Range violation\n"); addr = NULL; goto out; } @@ -274,7 +271,7 @@ void *iova_to_vaddr(struct rxe_mr *mr, u64 iova, int length) lookup_iova(mr, iova, &m, &n, &offset); if (offset + length > mr->map[m]->buf[n].size) { - pr_warn("crosses page boundary\n"); + rxe_dbg_mr(mr, "Crosses page boundary\n"); addr = NULL; goto out; } @@ -527,27 +524,26 @@ int rxe_invalidate_mr(struct rxe_qp *qp, u32 key) mr = rxe_pool_get_index(&rxe->mr_pool, key >> 8); if (!mr) { - pr_err("%s: No MR for key %#x\n", __func__, key); + rxe_dbg_mr(mr, "No MR for key %#x\n", key); ret = -EINVAL; goto err; } if (mr->rkey ? (key != mr->rkey) : (key != mr->lkey)) { - pr_err("%s: wr key (%#x) doesn't match mr key (%#x)\n", - __func__, key, (mr->rkey ? mr->rkey : mr->lkey)); + rxe_dbg_mr(mr, "wr key (%#x) doesn't match mr key (%#x)\n", + key, (mr->rkey ? mr->rkey : mr->lkey)); ret = -EINVAL; goto err_drop_ref; } if (atomic_read(&mr->num_mw) > 0) { - pr_warn("%s: Attempt to invalidate an MR while bound to MWs\n", - __func__); + rxe_dbg_mr(mr, "Attempt to invalidate an MR while bound to MWs\n"); ret = -EINVAL; goto err_drop_ref; } if (unlikely(mr->ibmr.type != IB_MR_TYPE_MEM_REG)) { - pr_warn("%s: mr type (%d) is wrong\n", __func__, mr->ibmr.type); + rxe_dbg_mr(mr, "Type (%d) is wrong\n", mr->ibmr.type); ret = -EINVAL; goto err_drop_ref; } @@ -576,22 +572,20 @@ int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe) /* user can only register MR in free state */ if (unlikely(mr->state != RXE_MR_STATE_FREE)) { - pr_warn("%s: mr->lkey = 0x%x not free\n", - __func__, mr->lkey); + rxe_dbg_mr(mr, "mr->lkey = 0x%x not free\n", mr->lkey); return -EINVAL; } /* user can only register mr with qp in same protection domain */ if (unlikely(qp->ibqp.pd != mr->ibmr.pd)) { - pr_warn("%s: qp->pd and mr->pd don't match\n", - __func__); + rxe_dbg_mr(mr, "qp->pd and mr->pd don't match\n"); return -EINVAL; } /* user is only allowed to change key portion of l/rkey */ if (unlikely((mr->lkey & ~0xff) != (key & ~0xff))) { - pr_warn("%s: key = 0x%x has wrong index mr->lkey = 0x%x\n", - __func__, key, mr->lkey); + rxe_dbg_mr(mr, "key = 0x%x has wrong index mr->lkey = 0x%x\n", + key, mr->lkey); return -EINVAL; } diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index bcdfdadaebbc..510ae471ac7a 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -875,6 +875,7 @@ static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access) rxe_get(pd); mr->ibmr.pd = ibpd; + mr->ibmr.device = ibpd->device; rxe_mr_init_dma(access, mr); rxe_finalize(mr); @@ -899,6 +900,7 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, rxe_get(pd); mr->ibmr.pd = ibpd; + mr->ibmr.device = ibpd->device; err = rxe_mr_init_user(rxe, start, length, iova, access, mr); if (err) @@ -930,6 +932,7 @@ static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, rxe_get(pd); mr->ibmr.pd = ibpd; + mr->ibmr.device = ibpd->device; err = rxe_mr_init_fast(max_num_sg, mr); if (err) From e8a87efdf87455454d0a14fd486c679769bfeee2 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:03 -0500 Subject: [PATCH 038/122] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_mw.c Replace calls to pr_xxx() int rxe_mw.c with rxe_dbg_xxx(). Link: https://lore.kernel.org/r/20221103171013.20659-6-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mw.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mw.c b/drivers/infiniband/sw/rxe/rxe_mw.c index 8df1c9066ed8..afa5ce1a7116 100644 --- a/drivers/infiniband/sw/rxe/rxe_mw.c +++ b/drivers/infiniband/sw/rxe/rxe_mw.c @@ -52,14 +52,14 @@ static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, { if (mw->ibmw.type == IB_MW_TYPE_1) { if (unlikely(mw->state != RXE_MW_STATE_VALID)) { - pr_err_once( + rxe_dbg_mw(mw, "attempt to bind a type 1 MW not in the valid state\n"); return -EINVAL; } /* o10-36.2.2 */ if (unlikely((mw->access & IB_ZERO_BASED))) { - pr_err_once("attempt to bind a zero based type 1 MW\n"); + rxe_dbg_mw(mw, "attempt to bind a zero based type 1 MW\n"); return -EINVAL; } } @@ -67,21 +67,21 @@ static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, if (mw->ibmw.type == IB_MW_TYPE_2) { /* o10-37.2.30 */ if (unlikely(mw->state != RXE_MW_STATE_FREE)) { - pr_err_once( + rxe_dbg_mw(mw, "attempt to bind a type 2 MW not in the free state\n"); return -EINVAL; } /* C10-72 */ if (unlikely(qp->pd != to_rpd(mw->ibmw.pd))) { - pr_err_once( + rxe_dbg_mw(mw, "attempt to bind type 2 MW with qp with different PD\n"); return -EINVAL; } /* o10-37.2.40 */ if (unlikely(!mr || wqe->wr.wr.mw.length == 0)) { - pr_err_once( + rxe_dbg_mw(mw, "attempt to invalidate type 2 MW by binding with NULL or zero length MR\n"); return -EINVAL; } @@ -92,13 +92,13 @@ static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, return 0; if (unlikely(mr->access & IB_ZERO_BASED)) { - pr_err_once("attempt to bind MW to zero based MR\n"); + rxe_dbg_mw(mw, "attempt to bind MW to zero based MR\n"); return -EINVAL; } /* C10-73 */ if (unlikely(!(mr->access & IB_ACCESS_MW_BIND))) { - pr_err_once( + rxe_dbg_mw(mw, "attempt to bind an MW to an MR without bind access\n"); return -EINVAL; } @@ -107,7 +107,7 @@ static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, if (unlikely((mw->access & (IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC)) && !(mr->access & IB_ACCESS_LOCAL_WRITE))) { - pr_err_once( + rxe_dbg_mw(mw, "attempt to bind an Writable MW to an MR without local write access\n"); return -EINVAL; } @@ -115,7 +115,7 @@ static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, /* C10-75 */ if (mw->access & IB_ZERO_BASED) { if (unlikely(wqe->wr.wr.mw.length > mr->ibmr.length)) { - pr_err_once( + rxe_dbg_mw(mw, "attempt to bind a ZB MW outside of the MR\n"); return -EINVAL; } @@ -123,7 +123,7 @@ static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, if (unlikely((wqe->wr.wr.mw.addr < mr->ibmr.iova) || ((wqe->wr.wr.mw.addr + wqe->wr.wr.mw.length) > (mr->ibmr.iova + mr->ibmr.length)))) { - pr_err_once( + rxe_dbg_mw(mw, "attempt to bind a VA MW outside of the MR\n"); return -EINVAL; } From 34549e88e0a3088416177023abf1232fe40e721c Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:04 -0500 Subject: [PATCH 039/122] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_net.c Replace (some) calls to pr_xxx() in rxe_net.c with rxe_dbg_xxx(). Calls with a rxe device not yet in scope are left as is. Link: https://lore.kernel.org/r/20221103171013.20659-7-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_net.c | 38 ++++++++++++++++------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index c36cad9c7a66..e02e1624bcf4 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -20,9 +20,10 @@ static struct rxe_recv_sockets recv_sockets; -static struct dst_entry *rxe_find_route4(struct net_device *ndev, - struct in_addr *saddr, - struct in_addr *daddr) +static struct dst_entry *rxe_find_route4(struct rxe_qp *qp, + struct net_device *ndev, + struct in_addr *saddr, + struct in_addr *daddr) { struct rtable *rt; struct flowi4 fl = { { 0 } }; @@ -35,7 +36,7 @@ static struct dst_entry *rxe_find_route4(struct net_device *ndev, rt = ip_route_output_key(&init_net, &fl); if (IS_ERR(rt)) { - pr_err_ratelimited("no route to %pI4\n", &daddr->s_addr); + rxe_dbg_qp(qp, "no route to %pI4\n", &daddr->s_addr); return NULL; } @@ -43,7 +44,8 @@ static struct dst_entry *rxe_find_route4(struct net_device *ndev, } #if IS_ENABLED(CONFIG_IPV6) -static struct dst_entry *rxe_find_route6(struct net_device *ndev, +static struct dst_entry *rxe_find_route6(struct rxe_qp *qp, + struct net_device *ndev, struct in6_addr *saddr, struct in6_addr *daddr) { @@ -60,12 +62,12 @@ static struct dst_entry *rxe_find_route6(struct net_device *ndev, recv_sockets.sk6->sk, &fl6, NULL); if (IS_ERR(ndst)) { - pr_err_ratelimited("no route to %pI6\n", daddr); + rxe_dbg_qp(qp, "no route to %pI6\n", daddr); return NULL; } if (unlikely(ndst->error)) { - pr_err("no route to %pI6\n", daddr); + rxe_dbg_qp(qp, "no route to %pI6\n", daddr); goto put; } @@ -77,7 +79,8 @@ put: #else -static struct dst_entry *rxe_find_route6(struct net_device *ndev, +static struct dst_entry *rxe_find_route6(struct rxe_qp *qp, + struct net_device *ndev, struct in6_addr *saddr, struct in6_addr *daddr) { @@ -105,14 +108,14 @@ static struct dst_entry *rxe_find_route(struct net_device *ndev, saddr = &av->sgid_addr._sockaddr_in.sin_addr; daddr = &av->dgid_addr._sockaddr_in.sin_addr; - dst = rxe_find_route4(ndev, saddr, daddr); + dst = rxe_find_route4(qp, ndev, saddr, daddr); } else if (av->network_type == RXE_NETWORK_TYPE_IPV6) { struct in6_addr *saddr6; struct in6_addr *daddr6; saddr6 = &av->sgid_addr._sockaddr_in6.sin6_addr; daddr6 = &av->dgid_addr._sockaddr_in6.sin6_addr; - dst = rxe_find_route6(ndev, saddr6, daddr6); + dst = rxe_find_route6(qp, ndev, saddr6, daddr6); #if IS_ENABLED(CONFIG_IPV6) if (dst) qp->dst_cookie = @@ -282,7 +285,7 @@ static int prepare4(struct rxe_av *av, struct rxe_pkt_info *pkt, dst = rxe_find_route(skb->dev, qp, av); if (!dst) { - pr_err("Host not reachable\n"); + rxe_dbg_qp(qp, "Host not reachable\n"); return -EHOSTUNREACH; } @@ -306,7 +309,7 @@ static int prepare6(struct rxe_av *av, struct rxe_pkt_info *pkt, dst = rxe_find_route(skb->dev, qp, av); if (!dst) { - pr_err("Host not reachable\n"); + rxe_dbg_qp(qp, "Host not reachable\n"); return -EHOSTUNREACH; } @@ -365,7 +368,8 @@ static int rxe_send(struct sk_buff *skb, struct rxe_pkt_info *pkt) } else if (skb->protocol == htons(ETH_P_IPV6)) { err = ip6_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); } else { - pr_err("Unknown layer 3 protocol: %d\n", skb->protocol); + rxe_dbg_qp(pkt->qp, "Unknown layer 3 protocol: %d\n", + skb->protocol); atomic_dec(&pkt->qp->skb_out); rxe_put(pkt->qp); kfree_skb(skb); @@ -373,7 +377,7 @@ static int rxe_send(struct sk_buff *skb, struct rxe_pkt_info *pkt) } if (unlikely(net_xmit_eval(err))) { - pr_debug("error sending packet: %d\n", err); + rxe_dbg_qp(pkt->qp, "error sending packet: %d\n", err); return -EAGAIN; } @@ -411,7 +415,7 @@ int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt, if ((is_request && (qp->req.state != QP_STATE_READY)) || (!is_request && (qp->resp.state != QP_STATE_READY))) { - pr_info("Packet dropped. QP is not in ready state\n"); + rxe_dbg_qp(qp, "Packet dropped. QP is not in ready state\n"); goto drop; } @@ -592,7 +596,7 @@ static int rxe_notify(struct notifier_block *not_blk, rxe_port_down(rxe); break; case NETDEV_CHANGEMTU: - pr_info("%s changed mtu to %d\n", ndev->name, ndev->mtu); + rxe_dbg(rxe, "%s changed mtu to %d\n", ndev->name, ndev->mtu); rxe_set_mtu(rxe, ndev->mtu); break; case NETDEV_CHANGE: @@ -604,7 +608,7 @@ static int rxe_notify(struct notifier_block *not_blk, case NETDEV_CHANGENAME: case NETDEV_FEAT_CHANGE: default: - pr_info("ignoring netdev event = %ld for %s\n", + rxe_dbg(rxe, "ignoring netdev event = %ld for %s\n", event, ndev->name); break; } From 6af70060d2e561d6479b33fe94cc2f38582e830b Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:05 -0500 Subject: [PATCH 040/122] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_qp.c Replace calls to pr_xxx() in rxe_qp.c with rxe_dbg_xxx(). Link: https://lore.kernel.org/r/20221103171013.20659-8-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_qp.c | 65 ++++++++++++++---------------- 1 file changed, 30 insertions(+), 35 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 3f6d62a80bea..bcbfe6068b8b 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -19,33 +19,33 @@ static int rxe_qp_chk_cap(struct rxe_dev *rxe, struct ib_qp_cap *cap, int has_srq) { if (cap->max_send_wr > rxe->attr.max_qp_wr) { - pr_debug("invalid send wr = %u > %d\n", + rxe_dbg(rxe, "invalid send wr = %u > %d\n", cap->max_send_wr, rxe->attr.max_qp_wr); goto err1; } if (cap->max_send_sge > rxe->attr.max_send_sge) { - pr_debug("invalid send sge = %u > %d\n", + rxe_dbg(rxe, "invalid send sge = %u > %d\n", cap->max_send_sge, rxe->attr.max_send_sge); goto err1; } if (!has_srq) { if (cap->max_recv_wr > rxe->attr.max_qp_wr) { - pr_debug("invalid recv wr = %u > %d\n", + rxe_dbg(rxe, "invalid recv wr = %u > %d\n", cap->max_recv_wr, rxe->attr.max_qp_wr); goto err1; } if (cap->max_recv_sge > rxe->attr.max_recv_sge) { - pr_debug("invalid recv sge = %u > %d\n", + rxe_dbg(rxe, "invalid recv sge = %u > %d\n", cap->max_recv_sge, rxe->attr.max_recv_sge); goto err1; } } if (cap->max_inline_data > rxe->max_inline_data) { - pr_debug("invalid max inline data = %u > %d\n", + rxe_dbg(rxe, "invalid max inline data = %u > %d\n", cap->max_inline_data, rxe->max_inline_data); goto err1; } @@ -73,7 +73,7 @@ int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init) } if (!init->recv_cq || !init->send_cq) { - pr_debug("missing cq\n"); + rxe_dbg(rxe, "missing cq\n"); goto err1; } @@ -82,14 +82,14 @@ int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init) if (init->qp_type == IB_QPT_GSI) { if (!rdma_is_port_valid(&rxe->ib_dev, port_num)) { - pr_debug("invalid port = %d\n", port_num); + rxe_dbg(rxe, "invalid port = %d\n", port_num); goto err1; } port = &rxe->port; if (init->qp_type == IB_QPT_GSI && port->qp_gsi_index) { - pr_debug("GSI QP exists for port %d\n", port_num); + rxe_dbg(rxe, "GSI QP exists for port %d\n", port_num); goto err1; } } @@ -264,9 +264,6 @@ static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp, wqe_size = rcv_wqe_size(qp->rq.max_sge); - pr_debug("qp#%d max_wr = %d, max_sge = %d, wqe_size = %d\n", - qp_num(qp), qp->rq.max_wr, qp->rq.max_sge, wqe_size); - type = QUEUE_TYPE_FROM_CLIENT; qp->rq.queue = rxe_queue_init(rxe, &qp->rq.max_wr, wqe_size, type); @@ -395,7 +392,7 @@ int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, attr->qp_state : cur_state; if (!ib_modify_qp_is_ok(cur_state, new_state, qp_type(qp), mask)) { - pr_debug("invalid mask or state for qp\n"); + rxe_dbg_qp(qp, "invalid mask or state\n"); goto err1; } @@ -409,7 +406,7 @@ int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, if (mask & IB_QP_PORT) { if (!rdma_is_port_valid(&rxe->ib_dev, attr->port_num)) { - pr_debug("invalid port %d\n", attr->port_num); + rxe_dbg_qp(qp, "invalid port %d\n", attr->port_num); goto err1; } } @@ -424,11 +421,11 @@ int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, if (rxe_av_chk_attr(rxe, &attr->alt_ah_attr)) goto err1; if (!rdma_is_port_valid(&rxe->ib_dev, attr->alt_port_num)) { - pr_debug("invalid alt port %d\n", attr->alt_port_num); + rxe_dbg_qp(qp, "invalid alt port %d\n", attr->alt_port_num); goto err1; } if (attr->alt_timeout > 31) { - pr_debug("invalid QP alt timeout %d > 31\n", + rxe_dbg_qp(qp, "invalid alt timeout %d > 31\n", attr->alt_timeout); goto err1; } @@ -441,7 +438,7 @@ int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, enum ib_mtu mtu = attr->path_mtu; if (mtu > max_mtu) { - pr_debug("invalid mtu (%d) > (%d)\n", + rxe_dbg_qp(qp, "invalid mtu (%d) > (%d)\n", ib_mtu_enum_to_int(mtu), ib_mtu_enum_to_int(max_mtu)); goto err1; @@ -450,7 +447,7 @@ int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, if (mask & IB_QP_MAX_QP_RD_ATOMIC) { if (attr->max_rd_atomic > rxe->attr.max_qp_rd_atom) { - pr_debug("invalid max_rd_atomic %d > %d\n", + rxe_dbg_qp(qp, "invalid max_rd_atomic %d > %d\n", attr->max_rd_atomic, rxe->attr.max_qp_rd_atom); goto err1; @@ -459,7 +456,8 @@ int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, if (mask & IB_QP_TIMEOUT) { if (attr->timeout > 31) { - pr_debug("invalid QP timeout %d > 31\n", attr->timeout); + rxe_dbg_qp(qp, "invalid timeout %d > 31\n", + attr->timeout); goto err1; } } @@ -637,27 +635,24 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, if (mask & IB_QP_RETRY_CNT) { qp->attr.retry_cnt = attr->retry_cnt; qp->comp.retry_cnt = attr->retry_cnt; - pr_debug("qp#%d set retry count = %d\n", qp_num(qp), - attr->retry_cnt); + rxe_dbg_qp(qp, "set retry count = %d\n", attr->retry_cnt); } if (mask & IB_QP_RNR_RETRY) { qp->attr.rnr_retry = attr->rnr_retry; qp->comp.rnr_retry = attr->rnr_retry; - pr_debug("qp#%d set rnr retry count = %d\n", qp_num(qp), - attr->rnr_retry); + rxe_dbg_qp(qp, "set rnr retry count = %d\n", attr->rnr_retry); } if (mask & IB_QP_RQ_PSN) { qp->attr.rq_psn = (attr->rq_psn & BTH_PSN_MASK); qp->resp.psn = qp->attr.rq_psn; - pr_debug("qp#%d set resp psn = 0x%x\n", qp_num(qp), - qp->resp.psn); + rxe_dbg_qp(qp, "set resp psn = 0x%x\n", qp->resp.psn); } if (mask & IB_QP_MIN_RNR_TIMER) { qp->attr.min_rnr_timer = attr->min_rnr_timer; - pr_debug("qp#%d set min rnr timer = 0x%x\n", qp_num(qp), + rxe_dbg_qp(qp, "set min rnr timer = 0x%x\n", attr->min_rnr_timer); } @@ -665,7 +660,7 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, qp->attr.sq_psn = (attr->sq_psn & BTH_PSN_MASK); qp->req.psn = qp->attr.sq_psn; qp->comp.psn = qp->attr.sq_psn; - pr_debug("qp#%d set req psn = 0x%x\n", qp_num(qp), qp->req.psn); + rxe_dbg_qp(qp, "set req psn = 0x%x\n", qp->req.psn); } if (mask & IB_QP_PATH_MIG_STATE) @@ -679,40 +674,40 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, switch (attr->qp_state) { case IB_QPS_RESET: - pr_debug("qp#%d state -> RESET\n", qp_num(qp)); + rxe_dbg_qp(qp, "state -> RESET\n"); rxe_qp_reset(qp); break; case IB_QPS_INIT: - pr_debug("qp#%d state -> INIT\n", qp_num(qp)); + rxe_dbg_qp(qp, "state -> INIT\n"); qp->req.state = QP_STATE_INIT; qp->resp.state = QP_STATE_INIT; qp->comp.state = QP_STATE_INIT; break; case IB_QPS_RTR: - pr_debug("qp#%d state -> RTR\n", qp_num(qp)); + rxe_dbg_qp(qp, "state -> RTR\n"); qp->resp.state = QP_STATE_READY; break; case IB_QPS_RTS: - pr_debug("qp#%d state -> RTS\n", qp_num(qp)); + rxe_dbg_qp(qp, "state -> RTS\n"); qp->req.state = QP_STATE_READY; qp->comp.state = QP_STATE_READY; break; case IB_QPS_SQD: - pr_debug("qp#%d state -> SQD\n", qp_num(qp)); + rxe_dbg_qp(qp, "state -> SQD\n"); rxe_qp_drain(qp); break; case IB_QPS_SQE: - pr_warn("qp#%d state -> SQE !!?\n", qp_num(qp)); + rxe_dbg_qp(qp, "state -> SQE !!?\n"); /* Not possible from modify_qp. */ break; case IB_QPS_ERR: - pr_debug("qp#%d state -> ERR\n", qp_num(qp)); + rxe_dbg_qp(qp, "state -> ERR\n"); rxe_qp_error(qp); break; } @@ -752,7 +747,7 @@ int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) attr->sq_draining = 0; } - pr_debug("attr->sq_draining = %d\n", attr->sq_draining); + rxe_dbg_qp(qp, "attr->sq_draining = %d\n", attr->sq_draining); return 0; } @@ -764,7 +759,7 @@ int rxe_qp_chk_destroy(struct rxe_qp *qp) * will fail immediately. */ if (atomic_read(&qp->mcg_num)) { - pr_debug("Attempt to destroy QP while attached to multicast group\n"); + rxe_dbg_qp(qp, "Attempt to destroy while attached to multicast group\n"); return -EBUSY; } From 0edfb15e30a53e8bd039c35a17f61e49cba9f4dd Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:06 -0500 Subject: [PATCH 041/122] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_req.c Replace calls to pr_xxx() in rxe_req.c with rxe_dbg_xxx(). Link: https://lore.kernel.org/r/20221103171013.20659-9-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_req.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 41f1d84f0acb..4d45f508392f 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -100,7 +100,7 @@ void rnr_nak_timer(struct timer_list *t) { struct rxe_qp *qp = from_timer(qp, t, rnr_nak_timer); - pr_debug("%s: fired for qp#%d\n", __func__, qp_num(qp)); + rxe_dbg_qp(qp, "nak timer fired\n"); /* request a send queue retry */ qp->req.need_retry = 1; @@ -595,7 +595,7 @@ static int rxe_do_local_ops(struct rxe_qp *qp, struct rxe_send_wqe *wqe) } break; default: - pr_err("Unexpected send wqe opcode %d\n", opcode); + rxe_dbg_qp(qp, "Unexpected send wqe opcode %d\n", opcode); wqe->status = IB_WC_LOC_QP_OP_ERR; return -EINVAL; } @@ -748,14 +748,14 @@ int rxe_requester(void *arg) av = rxe_get_av(&pkt, &ah); if (unlikely(!av)) { - pr_err("qp#%d Failed no address vector\n", qp_num(qp)); + rxe_dbg_qp(qp, "Failed no address vector\n"); wqe->status = IB_WC_LOC_QP_OP_ERR; goto err; } skb = init_req_packet(qp, av, wqe, opcode, payload, &pkt); if (unlikely(!skb)) { - pr_err("qp#%d Failed allocating skb\n", qp_num(qp)); + rxe_dbg_qp(qp, "Failed allocating skb\n"); wqe->status = IB_WC_LOC_QP_OP_ERR; if (ah) rxe_put(ah); @@ -764,7 +764,7 @@ int rxe_requester(void *arg) err = finish_packet(qp, av, wqe, &pkt, skb, payload); if (unlikely(err)) { - pr_debug("qp#%d Error during finish packet\n", qp_num(qp)); + rxe_dbg_qp(qp, "Error during finish packet\n"); if (err == -EFAULT) wqe->status = IB_WC_LOC_PROT_ERR; else From 74ddf7233c571d51bcb802bb192a9f7d77cd8830 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:07 -0500 Subject: [PATCH 042/122] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_resp.c Replace calls to pr_xxx() in rxe_resp.c with rxe_dbg_xxx(). Link: https://lore.kernel.org/r/20221103171013.20659-10-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_resp.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 382d2053db43..6761bcd1d4d8 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -317,7 +317,7 @@ static enum resp_states get_srq_wqe(struct rxe_qp *qp) /* don't trust user space data */ if (unlikely(wqe->dma.num_sge > srq->rq.max_sge)) { spin_unlock_irqrestore(&srq->rq.consumer_lock, flags); - pr_warn("%s: invalid num_sge in SRQ entry\n", __func__); + rxe_dbg_qp(qp, "invalid num_sge in SRQ entry\n"); return RESPST_ERR_MALFORMED_WQE; } size = sizeof(*wqe) + wqe->dma.num_sge*sizeof(struct rxe_sge); @@ -473,15 +473,14 @@ static enum resp_states check_rkey(struct rxe_qp *qp, if (rkey_is_mw(rkey)) { mw = rxe_lookup_mw(qp, access, rkey); if (!mw) { - pr_debug("%s: no MW matches rkey %#x\n", - __func__, rkey); + rxe_dbg_qp(qp, "no MW matches rkey %#x\n", rkey); state = RESPST_ERR_RKEY_VIOLATION; goto err; } mr = mw->mr; if (!mr) { - pr_err("%s: MW doesn't have an MR\n", __func__); + rxe_dbg_qp(qp, "MW doesn't have an MR\n"); state = RESPST_ERR_RKEY_VIOLATION; goto err; } @@ -494,8 +493,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp, } else { mr = lookup_mr(qp->pd, access, rkey, RXE_LOOKUP_REMOTE); if (!mr) { - pr_debug("%s: no MR matches rkey %#x\n", - __func__, rkey); + rxe_dbg_qp(qp, "no MR matches rkey %#x\n", rkey); state = RESPST_ERR_RKEY_VIOLATION; goto err; } @@ -1064,7 +1062,7 @@ static int send_common_ack(struct rxe_qp *qp, u8 syndrome, u32 psn, err = rxe_xmit_packet(qp, &ack_pkt, skb); if (err) - pr_err_ratelimited("Failed sending %s\n", msg); + rxe_dbg_qp(qp, "Failed sending %s\n", msg); return err; } @@ -1310,8 +1308,7 @@ int rxe_responder(void *arg) } while (1) { - pr_debug("qp#%d state = %s\n", qp_num(qp), - resp_state_name[state]); + rxe_dbg_qp(qp, "state = %s\n", resp_state_name[state]); switch (state) { case RESPST_GET_REQ: state = get_req(qp, &pkt); @@ -1468,7 +1465,7 @@ int rxe_responder(void *arg) case RESPST_ERROR: qp->resp.goto_error = 0; - pr_debug("qp#%d moved to error state\n", qp_num(qp)); + rxe_dbg_qp(qp, "moved to error state\n"); rxe_qp_error(qp); goto exit; From 0e6090024b3ebf8d162f82c542eba9632a9c85fc Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:08 -0500 Subject: [PATCH 043/122] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_srq.c Replace calls to pr_xxx() in rxe_srq.c with rxe_dbg_xxx(). Link: https://lore.kernel.org/r/20221103171013.20659-11-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_srq.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_srq.c b/drivers/infiniband/sw/rxe/rxe_srq.c index 02b39498c370..82e37a41ced4 100644 --- a/drivers/infiniband/sw/rxe/rxe_srq.c +++ b/drivers/infiniband/sw/rxe/rxe_srq.c @@ -13,13 +13,13 @@ int rxe_srq_chk_init(struct rxe_dev *rxe, struct ib_srq_init_attr *init) struct ib_srq_attr *attr = &init->attr; if (attr->max_wr > rxe->attr.max_srq_wr) { - pr_warn("max_wr(%d) > max_srq_wr(%d)\n", + rxe_dbg(rxe, "max_wr(%d) > max_srq_wr(%d)\n", attr->max_wr, rxe->attr.max_srq_wr); goto err1; } if (attr->max_wr <= 0) { - pr_warn("max_wr(%d) <= 0\n", attr->max_wr); + rxe_dbg(rxe, "max_wr(%d) <= 0\n", attr->max_wr); goto err1; } @@ -27,7 +27,7 @@ int rxe_srq_chk_init(struct rxe_dev *rxe, struct ib_srq_init_attr *init) attr->max_wr = RXE_MIN_SRQ_WR; if (attr->max_sge > rxe->attr.max_srq_sge) { - pr_warn("max_sge(%d) > max_srq_sge(%d)\n", + rxe_dbg(rxe, "max_sge(%d) > max_srq_sge(%d)\n", attr->max_sge, rxe->attr.max_srq_sge); goto err1; } @@ -65,7 +65,7 @@ int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq, type = QUEUE_TYPE_FROM_CLIENT; q = rxe_queue_init(rxe, &srq->rq.max_wr, srq_wqe_size, type); if (!q) { - pr_warn("unable to allocate queue for srq\n"); + rxe_dbg_srq(srq, "Unable to allocate queue\n"); return -ENOMEM; } @@ -94,24 +94,24 @@ int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq, struct ib_srq_attr *attr, enum ib_srq_attr_mask mask) { if (srq->error) { - pr_warn("srq in error state\n"); + rxe_dbg_srq(srq, "in error state\n"); goto err1; } if (mask & IB_SRQ_MAX_WR) { if (attr->max_wr > rxe->attr.max_srq_wr) { - pr_warn("max_wr(%d) > max_srq_wr(%d)\n", + rxe_dbg_srq(srq, "max_wr(%d) > max_srq_wr(%d)\n", attr->max_wr, rxe->attr.max_srq_wr); goto err1; } if (attr->max_wr <= 0) { - pr_warn("max_wr(%d) <= 0\n", attr->max_wr); + rxe_dbg_srq(srq, "max_wr(%d) <= 0\n", attr->max_wr); goto err1; } if (srq->limit && (attr->max_wr < srq->limit)) { - pr_warn("max_wr (%d) < srq->limit (%d)\n", + rxe_dbg_srq(srq, "max_wr (%d) < srq->limit (%d)\n", attr->max_wr, srq->limit); goto err1; } @@ -122,13 +122,13 @@ int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq, if (mask & IB_SRQ_LIMIT) { if (attr->srq_limit > rxe->attr.max_srq_wr) { - pr_warn("srq_limit(%d) > max_srq_wr(%d)\n", + rxe_dbg_srq(srq, "srq_limit(%d) > max_srq_wr(%d)\n", attr->srq_limit, rxe->attr.max_srq_wr); goto err1; } if (attr->srq_limit > srq->rq.queue->buf->index_mask) { - pr_warn("srq_limit (%d) > cur limit(%d)\n", + rxe_dbg_srq(srq, "srq_limit (%d) > cur limit(%d)\n", attr->srq_limit, srq->rq.queue->buf->index_mask); goto err1; From 14e501fdb0de3b45b8ee8a2a031c3c64aaa01817 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:09 -0500 Subject: [PATCH 044/122] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_verbs.c Replace calls to pr_xxx() in rxe_verbs.c with rxe_dbg_xxx(). Link: https://lore.kernel.org/r/20221103171013.20659-12-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 510ae471ac7a..e6eca21c54e6 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1103,7 +1103,7 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name) err = ib_register_device(dev, ibdev_name, NULL); if (err) - pr_warn("%s failed with error %d\n", __func__, err); + rxe_dbg(rxe, "failed with error %d\n", err); /* * Note that rxe may be invalid at this point if another thread From 25fd735a4c9e38c65904eea2769ed92f6f8586d0 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:10 -0500 Subject: [PATCH 045/122] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_av.c Replace calls to pr_xxx() in rxe_av.c with rxe_dbg_xxx(). Link: https://lore.kernel.org/r/20221103171013.20659-13-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_av.c | 43 ++++++++++++++++++++++----- drivers/infiniband/sw/rxe/rxe_loc.h | 8 ++--- drivers/infiniband/sw/rxe/rxe_qp.c | 4 +-- drivers/infiniband/sw/rxe/rxe_verbs.c | 13 ++++---- 4 files changed, 47 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_av.c b/drivers/infiniband/sw/rxe/rxe_av.c index 3b05314ca739..889d7adbd455 100644 --- a/drivers/infiniband/sw/rxe/rxe_av.c +++ b/drivers/infiniband/sw/rxe/rxe_av.c @@ -14,26 +14,45 @@ void rxe_init_av(struct rdma_ah_attr *attr, struct rxe_av *av) memcpy(av->dmac, attr->roce.dmac, ETH_ALEN); } -int rxe_av_chk_attr(struct rxe_dev *rxe, struct rdma_ah_attr *attr) +static int chk_attr(void *obj, struct rdma_ah_attr *attr, bool obj_is_ah) { const struct ib_global_route *grh = rdma_ah_read_grh(attr); struct rxe_port *port; + struct rxe_dev *rxe; + struct rxe_qp *qp; + struct rxe_ah *ah; int type; + if (obj_is_ah) { + ah = obj; + rxe = to_rdev(ah->ibah.device); + } else { + qp = obj; + rxe = to_rdev(qp->ibqp.device); + } + port = &rxe->port; if (rdma_ah_get_ah_flags(attr) & IB_AH_GRH) { if (grh->sgid_index > port->attr.gid_tbl_len) { - pr_warn("invalid sgid index = %d\n", - grh->sgid_index); + if (obj_is_ah) + rxe_dbg_ah(ah, "invalid sgid index = %d\n", + grh->sgid_index); + else + rxe_dbg_qp(qp, "invalid sgid index = %d\n", + grh->sgid_index); return -EINVAL; } type = rdma_gid_attr_network_type(grh->sgid_attr); if (type < RDMA_NETWORK_IPV4 || type > RDMA_NETWORK_IPV6) { - pr_warn("invalid network type for rdma_rxe = %d\n", - type); + if (obj_is_ah) + rxe_dbg_ah(ah, "invalid network type for rdma_rxe = %d\n", + type); + else + rxe_dbg_qp(qp, "invalid network type for rdma_rxe = %d\n", + type); return -EINVAL; } } @@ -41,6 +60,16 @@ int rxe_av_chk_attr(struct rxe_dev *rxe, struct rdma_ah_attr *attr) return 0; } +int rxe_av_chk_attr(struct rxe_qp *qp, struct rdma_ah_attr *attr) +{ + return chk_attr(qp, attr, false); +} + +int rxe_ah_chk_attr(struct rxe_ah *ah, struct rdma_ah_attr *attr) +{ + return chk_attr(ah, attr, true); +} + void rxe_av_from_attr(u8 port_num, struct rxe_av *av, struct rdma_ah_attr *attr) { @@ -121,12 +150,12 @@ struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt, struct rxe_ah **ahp) /* only new user provider or kernel client */ ah = rxe_pool_get_index(&pkt->rxe->ah_pool, ah_num); if (!ah) { - pr_warn("Unable to find AH matching ah_num\n"); + rxe_dbg_qp(pkt->qp, "Unable to find AH matching ah_num\n"); return NULL; } if (rxe_ah_pd(ah) != pkt->qp->pd) { - pr_warn("PDs don't match for AH and QP\n"); + rxe_dbg_qp(pkt->qp, "PDs don't match for AH and QP\n"); rxe_put(ah); return NULL; } diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index c2a5c8814a48..a22476d27b38 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -9,16 +9,12 @@ /* rxe_av.c */ void rxe_init_av(struct rdma_ah_attr *attr, struct rxe_av *av); - -int rxe_av_chk_attr(struct rxe_dev *rxe, struct rdma_ah_attr *attr); - +int rxe_av_chk_attr(struct rxe_qp *qp, struct rdma_ah_attr *attr); +int rxe_ah_chk_attr(struct rxe_ah *ah, struct rdma_ah_attr *attr); void rxe_av_from_attr(u8 port_num, struct rxe_av *av, struct rdma_ah_attr *attr); - void rxe_av_to_attr(struct rxe_av *av, struct rdma_ah_attr *attr); - void rxe_av_fill_ip_info(struct rxe_av *av, struct rdma_ah_attr *attr); - struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt, struct rxe_ah **ahp); /* rxe_cq.c */ diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index bcbfe6068b8b..46f6c74ce00e 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -414,11 +414,11 @@ int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, if (mask & IB_QP_CAP && rxe_qp_chk_cap(rxe, &attr->cap, !!qp->srq)) goto err1; - if (mask & IB_QP_AV && rxe_av_chk_attr(rxe, &attr->ah_attr)) + if (mask & IB_QP_AV && rxe_av_chk_attr(qp, &attr->ah_attr)) goto err1; if (mask & IB_QP_ALT_PATH) { - if (rxe_av_chk_attr(rxe, &attr->alt_ah_attr)) + if (rxe_av_chk_attr(qp, &attr->alt_ah_attr)) goto err1; if (!rdma_is_port_valid(&rxe->ib_dev, attr->alt_port_num)) { rxe_dbg_qp(qp, "invalid alt port %d\n", attr->alt_port_num); diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index e6eca21c54e6..025b35bf014e 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -172,10 +172,6 @@ static int rxe_create_ah(struct ib_ah *ibah, ah->is_user = false; } - err = rxe_av_chk_attr(rxe, init_attr->ah_attr); - if (err) - return err; - err = rxe_add_to_pool_ah(&rxe->ah_pool, ah, init_attr->flags & RDMA_CREATE_AH_SLEEPABLE); if (err) @@ -184,6 +180,12 @@ static int rxe_create_ah(struct ib_ah *ibah, /* create index > 0 */ ah->ah_num = ah->elem.index; + err = rxe_ah_chk_attr(ah, init_attr->ah_attr); + if (err) { + rxe_cleanup(ah); + return err; + } + if (uresp) { /* only if new user provider */ err = copy_to_user(&uresp->ah_num, &ah->ah_num, @@ -206,10 +208,9 @@ static int rxe_create_ah(struct ib_ah *ibah, static int rxe_modify_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr) { int err; - struct rxe_dev *rxe = to_rdev(ibah->device); struct rxe_ah *ah = to_rah(ibah); - err = rxe_av_chk_attr(rxe, attr); + err = rxe_ah_chk_attr(ah, attr); if (err) return err; From fc50597934411170ed149d3368b0b733bc5119f1 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:11 -0500 Subject: [PATCH 046/122] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_task.c Replace calls to pr_xxx() in rxe_task.c with rxe_dbg_xxx(). Link: https://lore.kernel.org/r/20221103171013.20659-14-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_task.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c index 0208d833a41b..60b90e33a884 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.c +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -29,6 +29,7 @@ static void do_task(struct tasklet_struct *t) int cont; int ret; struct rxe_task *task = from_tasklet(task, t, tasklet); + struct rxe_qp *qp = (struct rxe_qp *)task->arg; unsigned int iterations = RXE_MAX_ITERATIONS; spin_lock_bh(&task->lock); @@ -47,7 +48,7 @@ static void do_task(struct tasklet_struct *t) default: spin_unlock_bh(&task->lock); - pr_warn("%s failed with bad state %d\n", __func__, task->state); + rxe_dbg_qp(qp, "failed with bad state %d\n", task->state); return; } @@ -81,8 +82,8 @@ static void do_task(struct tasklet_struct *t) break; default: - pr_warn("%s failed with bad state %d\n", __func__, - task->state); + rxe_dbg_qp(qp, "failed with bad state %d\n", + task->state); } spin_unlock_bh(&task->lock); } while (cont); From c6aba5ea00550017629c56972b635f33bb6a6903 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:12 -0500 Subject: [PATCH 047/122] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe.c Replace calls to pr_xxx() in rxe.c with rxe_dbg_xxx(). Calls with a rxe device not yet in scope are left as is. Link: https://lore.kernel.org/r/20221103171013.20659-15-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index 51daac5c4feb..136c2efe3466 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -187,14 +187,14 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev) exists = rxe_get_dev_from_net(ndev); if (exists) { ib_device_put(&exists->ib_dev); - pr_err("already configured on %s\n", ndev->name); + rxe_dbg(exists, "already configured on %s\n", ndev->name); err = -EEXIST; goto err; } err = rxe_net_add(ibdev_name, ndev); if (err) { - pr_err("failed to add %s\n", ndev->name); + rxe_dbg(exists, "failed to add %s\n", ndev->name); goto err; } err: From 813728043b79a11f7029ba196decfc7f576ae487 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:13 -0500 Subject: [PATCH 048/122] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_icrc.c Replace calls to pr_xxx() in rxe_icrc.c with rxe_dbg_xxx(). Link: https://lore.kernel.org/r/20221103171013.20659-16-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_icrc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_icrc.c b/drivers/infiniband/sw/rxe/rxe_icrc.c index 46bb07c5c4df..71bc2c189588 100644 --- a/drivers/infiniband/sw/rxe/rxe_icrc.c +++ b/drivers/infiniband/sw/rxe/rxe_icrc.c @@ -21,7 +21,7 @@ int rxe_icrc_init(struct rxe_dev *rxe) tfm = crypto_alloc_shash("crc32", 0, 0); if (IS_ERR(tfm)) { - pr_warn("failed to init crc32 algorithm err:%ld\n", + rxe_dbg(rxe, "failed to init crc32 algorithm err: %ld\n", PTR_ERR(tfm)); return PTR_ERR(tfm); } @@ -51,7 +51,7 @@ static __be32 rxe_crc32(struct rxe_dev *rxe, __be32 crc, void *next, size_t len) *(__be32 *)shash_desc_ctx(shash) = crc; err = crypto_shash_update(shash, next, len); if (unlikely(err)) { - pr_warn_ratelimited("failed crc calculation, err: %d\n", err); + rxe_dbg(rxe, "failed crc calculation, err: %d\n", err); return (__force __be32)crc32_le((__force u32)crc, next, len); } From 5de087250f1d8f7b81abaf94110884994793f073 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Nov 2022 12:10:14 -0500 Subject: [PATCH 049/122] RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_mmap.c Replace calls to pr_xxx() in rxe_mmap.c with rxe_dbg_xxx(). Link: https://lore.kernel.org/r/20221103171013.20659-17-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mmap.c b/drivers/infiniband/sw/rxe/rxe_mmap.c index 9149b6095429..a47d72dbc537 100644 --- a/drivers/infiniband/sw/rxe/rxe_mmap.c +++ b/drivers/infiniband/sw/rxe/rxe_mmap.c @@ -79,7 +79,7 @@ int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) /* Don't allow a mmap larger than the object. */ if (size > ip->info.size) { - pr_err("mmap region is larger than the object!\n"); + rxe_dbg(rxe, "mmap region is larger than the object!\n"); spin_unlock_bh(&rxe->pending_lock); ret = -EINVAL; goto done; @@ -87,7 +87,7 @@ int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) goto found_it; } - pr_warn("unable to find pending mmap info\n"); + rxe_dbg(rxe, "unable to find pending mmap info\n"); spin_unlock_bh(&rxe->pending_lock); ret = -EINVAL; goto done; @@ -98,7 +98,7 @@ found_it: ret = remap_vmalloc_range(vma, ip->obj, 0); if (ret) { - pr_err("err %d from remap_vmalloc_range\n", ret); + rxe_dbg(rxe, "err %d from remap_vmalloc_range\n", ret); goto done; } From 0266a177631d4c6b963b5b12dd986a8c5abdbf06 Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 3 Nov 2022 12:16:30 -0700 Subject: [PATCH 050/122] RDMA/mana_ib: Add a driver for Microsoft Azure Network Adapter Add a RDMA VF driver for Microsoft Azure Network Adapter (MANA). Co-developed-by: Ajay Sharma Signed-off-by: Ajay Sharma Reviewed-by: Dexuan Cui Signed-off-by: Long Li Link: https://lore.kernel.org/r/1667502990-2559-13-git-send-email-longli@linuxonhyperv.com Signed-off-by: Leon Romanovsky --- MAINTAINERS | 9 + drivers/infiniband/Kconfig | 1 + drivers/infiniband/hw/Makefile | 1 + drivers/infiniband/hw/mana/Kconfig | 10 + drivers/infiniband/hw/mana/Makefile | 4 + drivers/infiniband/hw/mana/cq.c | 79 ++++ drivers/infiniband/hw/mana/device.c | 117 ++++++ drivers/infiniband/hw/mana/main.c | 521 ++++++++++++++++++++++++ drivers/infiniband/hw/mana/mana_ib.h | 162 ++++++++ drivers/infiniband/hw/mana/mr.c | 198 +++++++++ drivers/infiniband/hw/mana/qp.c | 506 +++++++++++++++++++++++ drivers/infiniband/hw/mana/wq.c | 115 ++++++ include/net/mana/mana.h | 3 + include/uapi/rdma/ib_user_ioctl_verbs.h | 1 + include/uapi/rdma/mana-abi.h | 66 +++ 15 files changed, 1793 insertions(+) create mode 100644 drivers/infiniband/hw/mana/Kconfig create mode 100644 drivers/infiniband/hw/mana/Makefile create mode 100644 drivers/infiniband/hw/mana/cq.c create mode 100644 drivers/infiniband/hw/mana/device.c create mode 100644 drivers/infiniband/hw/mana/main.c create mode 100644 drivers/infiniband/hw/mana/mana_ib.h create mode 100644 drivers/infiniband/hw/mana/mr.c create mode 100644 drivers/infiniband/hw/mana/qp.c create mode 100644 drivers/infiniband/hw/mana/wq.c create mode 100644 include/uapi/rdma/mana-abi.h diff --git a/MAINTAINERS b/MAINTAINERS index 441a65d41eb4..4db8e4e02c05 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13669,6 +13669,15 @@ F: drivers/scsi/smartpqi/smartpqi*.[ch] F: include/linux/cciss*.h F: include/uapi/linux/cciss*.h +MICROSOFT MANA RDMA DRIVER +M: Long Li +M: Ajay Sharma +L: linux-rdma@vger.kernel.org +S: Supported +F: drivers/infiniband/hw/mana/ +F: include/net/mana +F: include/uapi/rdma/mana-abi.h + MICROSOFT SURFACE AGGREGATOR TABLET-MODE SWITCH M: Maximilian Luz L: platform-driver-x86@vger.kernel.org diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index aa36ac618e72..ccc874478f0b 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -85,6 +85,7 @@ source "drivers/infiniband/hw/erdma/Kconfig" source "drivers/infiniband/hw/hfi1/Kconfig" source "drivers/infiniband/hw/hns/Kconfig" source "drivers/infiniband/hw/irdma/Kconfig" +source "drivers/infiniband/hw/mana/Kconfig" source "drivers/infiniband/hw/mlx4/Kconfig" source "drivers/infiniband/hw/mlx5/Kconfig" source "drivers/infiniband/hw/mthca/Kconfig" diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile index 6b3a88046125..1211f4317a9f 100644 --- a/drivers/infiniband/hw/Makefile +++ b/drivers/infiniband/hw/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_INFINIBAND_QIB) += qib/ obj-$(CONFIG_INFINIBAND_CXGB4) += cxgb4/ obj-$(CONFIG_INFINIBAND_EFA) += efa/ obj-$(CONFIG_INFINIBAND_IRDMA) += irdma/ +obj-$(CONFIG_MANA_INFINIBAND) += mana/ obj-$(CONFIG_MLX4_INFINIBAND) += mlx4/ obj-$(CONFIG_MLX5_INFINIBAND) += mlx5/ obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma/ diff --git a/drivers/infiniband/hw/mana/Kconfig b/drivers/infiniband/hw/mana/Kconfig new file mode 100644 index 000000000000..546640657bac --- /dev/null +++ b/drivers/infiniband/hw/mana/Kconfig @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0-only +config MANA_INFINIBAND + tristate "Microsoft Azure Network Adapter support" + depends on NETDEVICES && ETHERNET && PCI && MICROSOFT_MANA + help + This driver provides low-level RDMA support for Microsoft Azure + Network Adapter (MANA). MANA supports RDMA features that can be used + for workloads (e.g. DPDK, MPI etc) that uses RDMA verbs to directly + access hardware from user-mode processes in Microsoft Azure cloud + environment. diff --git a/drivers/infiniband/hw/mana/Makefile b/drivers/infiniband/hw/mana/Makefile new file mode 100644 index 000000000000..88655fe5e398 --- /dev/null +++ b/drivers/infiniband/hw/mana/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_MANA_INFINIBAND) += mana_ib.o + +mana_ib-y := device.o main.o wq.o qp.o cq.o mr.o diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c new file mode 100644 index 000000000000..d141cab8a1e6 --- /dev/null +++ b/drivers/infiniband/hw/mana/cq.c @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. + */ + +#include "mana_ib.h" + +int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) +{ + struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq); + struct ib_device *ibdev = ibcq->device; + struct mana_ib_create_cq ucmd = {}; + struct mana_ib_dev *mdev; + int err; + + mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); + + if (udata->inlen < sizeof(ucmd)) + return -EINVAL; + + err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)); + if (err) { + ibdev_dbg(ibdev, + "Failed to copy from udata for create cq, %d\n", err); + return err; + } + + if (attr->cqe > MAX_SEND_BUFFERS_PER_QUEUE) { + ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe); + return -EINVAL; + } + + cq->cqe = attr->cqe; + cq->umem = ib_umem_get(ibdev, ucmd.buf_addr, cq->cqe * COMP_ENTRY_SIZE, + IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(cq->umem)) { + err = PTR_ERR(cq->umem); + ibdev_dbg(ibdev, "Failed to get umem for create cq, err %d\n", + err); + return err; + } + + err = mana_ib_gd_create_dma_region(mdev, cq->umem, &cq->gdma_region); + if (err) { + ibdev_dbg(ibdev, + "Failed to create dma region for create cq, %d\n", + err); + goto err_release_umem; + } + + ibdev_dbg(ibdev, + "mana_ib_gd_create_dma_region ret %d gdma_region 0x%llx\n", + err, cq->gdma_region); + + /* + * The CQ ID is not known at this time. The ID is generated at create_qp + */ + + return 0; + +err_release_umem: + ib_umem_release(cq->umem); + return err; +} + +int mana_ib_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +{ + struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq); + struct ib_device *ibdev = ibcq->device; + struct mana_ib_dev *mdev; + + mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); + + mana_ib_gd_destroy_dma_region(mdev, cq->gdma_region); + ib_umem_release(cq->umem); + + return 0; +} diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c new file mode 100644 index 000000000000..d4541b8707e4 --- /dev/null +++ b/drivers/infiniband/hw/mana/device.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. + */ + +#include "mana_ib.h" +#include + +MODULE_DESCRIPTION("Microsoft Azure Network Adapter IB driver"); +MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(NET_MANA); + +static const struct ib_device_ops mana_ib_dev_ops = { + .owner = THIS_MODULE, + .driver_id = RDMA_DRIVER_MANA, + .uverbs_abi_ver = MANA_IB_UVERBS_ABI_VERSION, + + .alloc_pd = mana_ib_alloc_pd, + .alloc_ucontext = mana_ib_alloc_ucontext, + .create_cq = mana_ib_create_cq, + .create_qp = mana_ib_create_qp, + .create_rwq_ind_table = mana_ib_create_rwq_ind_table, + .create_wq = mana_ib_create_wq, + .dealloc_pd = mana_ib_dealloc_pd, + .dealloc_ucontext = mana_ib_dealloc_ucontext, + .dereg_mr = mana_ib_dereg_mr, + .destroy_cq = mana_ib_destroy_cq, + .destroy_qp = mana_ib_destroy_qp, + .destroy_rwq_ind_table = mana_ib_destroy_rwq_ind_table, + .destroy_wq = mana_ib_destroy_wq, + .disassociate_ucontext = mana_ib_disassociate_ucontext, + .get_port_immutable = mana_ib_get_port_immutable, + .mmap = mana_ib_mmap, + .modify_qp = mana_ib_modify_qp, + .modify_wq = mana_ib_modify_wq, + .query_device = mana_ib_query_device, + .query_gid = mana_ib_query_gid, + .query_port = mana_ib_query_port, + .reg_user_mr = mana_ib_reg_user_mr, + + INIT_RDMA_OBJ_SIZE(ib_cq, mana_ib_cq, ibcq), + INIT_RDMA_OBJ_SIZE(ib_pd, mana_ib_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_qp, mana_ib_qp, ibqp), + INIT_RDMA_OBJ_SIZE(ib_ucontext, mana_ib_ucontext, ibucontext), + INIT_RDMA_OBJ_SIZE(ib_rwq_ind_table, mana_ib_rwq_ind_table, + ib_ind_table), +}; + +static int mana_ib_probe(struct auxiliary_device *adev, + const struct auxiliary_device_id *id) +{ + struct mana_adev *madev = container_of(adev, struct mana_adev, adev); + struct gdma_dev *mdev = madev->mdev; + struct mana_context *mc; + struct mana_ib_dev *dev; + int ret; + + mc = mdev->driver_data; + + dev = ib_alloc_device(mana_ib_dev, ib_dev); + if (!dev) + return -ENOMEM; + + ib_set_device_ops(&dev->ib_dev, &mana_ib_dev_ops); + + dev->ib_dev.phys_port_cnt = mc->num_ports; + + ibdev_dbg(&dev->ib_dev, "mdev=%p id=%d num_ports=%d\n", mdev, + mdev->dev_id.as_uint32, dev->ib_dev.phys_port_cnt); + + dev->gdma_dev = mdev; + dev->ib_dev.node_type = RDMA_NODE_IB_CA; + + /* + * num_comp_vectors needs to set to the max MSIX index + * when interrupts and event queues are implemented + */ + dev->ib_dev.num_comp_vectors = 1; + dev->ib_dev.dev.parent = mdev->gdma_context->dev; + + ret = ib_register_device(&dev->ib_dev, "mana_%d", + mdev->gdma_context->dev); + if (ret) { + ib_dealloc_device(&dev->ib_dev); + return ret; + } + + dev_set_drvdata(&adev->dev, dev); + + return 0; +} + +static void mana_ib_remove(struct auxiliary_device *adev) +{ + struct mana_ib_dev *dev = dev_get_drvdata(&adev->dev); + + ib_unregister_device(&dev->ib_dev); + ib_dealloc_device(&dev->ib_dev); +} + +static const struct auxiliary_device_id mana_id_table[] = { + { + .name = "mana.rdma", + }, + {}, +}; + +MODULE_DEVICE_TABLE(auxiliary, mana_id_table); + +static struct auxiliary_driver mana_driver = { + .name = "rdma", + .probe = mana_ib_probe, + .remove = mana_ib_remove, + .id_table = mana_id_table, +}; + +module_auxiliary_driver(mana_driver); diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c new file mode 100644 index 000000000000..8b3bc302d6f3 --- /dev/null +++ b/drivers/infiniband/hw/mana/main.c @@ -0,0 +1,521 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. + */ + +#include "mana_ib.h" + +void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd, + u32 port) +{ + struct gdma_dev *gd = dev->gdma_dev; + struct mana_port_context *mpc; + struct net_device *ndev; + struct mana_context *mc; + + mc = gd->driver_data; + ndev = mc->ports[port]; + mpc = netdev_priv(ndev); + + mutex_lock(&pd->vport_mutex); + + pd->vport_use_count--; + WARN_ON(pd->vport_use_count < 0); + + if (!pd->vport_use_count) + mana_uncfg_vport(mpc); + + mutex_unlock(&pd->vport_mutex); +} + +int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port, struct mana_ib_pd *pd, + u32 doorbell_id) +{ + struct gdma_dev *mdev = dev->gdma_dev; + struct mana_port_context *mpc; + struct mana_context *mc; + struct net_device *ndev; + int err; + + mc = mdev->driver_data; + ndev = mc->ports[port]; + mpc = netdev_priv(ndev); + + mutex_lock(&pd->vport_mutex); + + pd->vport_use_count++; + if (pd->vport_use_count > 1) { + ibdev_dbg(&dev->ib_dev, + "Skip as this PD is already configured vport\n"); + mutex_unlock(&pd->vport_mutex); + return 0; + } + + err = mana_cfg_vport(mpc, pd->pdn, doorbell_id); + if (err) { + pd->vport_use_count--; + mutex_unlock(&pd->vport_mutex); + + ibdev_dbg(&dev->ib_dev, "Failed to configure vPort %d\n", err); + return err; + } + + mutex_unlock(&pd->vport_mutex); + + pd->tx_shortform_allowed = mpc->tx_shortform_allowed; + pd->tx_vp_offset = mpc->tx_vp_offset; + + ibdev_dbg(&dev->ib_dev, "vport handle %llx pdid %x doorbell_id %x\n", + mpc->port_handle, pd->pdn, doorbell_id); + + return 0; +} + +int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +{ + struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); + struct ib_device *ibdev = ibpd->device; + struct gdma_create_pd_resp resp = {}; + struct gdma_create_pd_req req = {}; + enum gdma_pd_flags flags = 0; + struct mana_ib_dev *dev; + struct gdma_dev *mdev; + int err; + + dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + mdev = dev->gdma_dev; + + mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_PD, sizeof(req), + sizeof(resp)); + + req.flags = flags; + err = mana_gd_send_request(mdev->gdma_context, sizeof(req), &req, + sizeof(resp), &resp); + + if (err || resp.hdr.status) { + ibdev_dbg(&dev->ib_dev, + "Failed to get pd_id err %d status %u\n", err, + resp.hdr.status); + if (!err) + err = -EPROTO; + + return err; + } + + pd->pd_handle = resp.pd_handle; + pd->pdn = resp.pd_id; + ibdev_dbg(&dev->ib_dev, "pd_handle 0x%llx pd_id %d\n", + pd->pd_handle, pd->pdn); + + mutex_init(&pd->vport_mutex); + pd->vport_use_count = 0; + return 0; +} + +int mana_ib_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +{ + struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); + struct ib_device *ibdev = ibpd->device; + struct gdma_destory_pd_resp resp = {}; + struct gdma_destroy_pd_req req = {}; + struct mana_ib_dev *dev; + struct gdma_dev *mdev; + int err; + + dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + mdev = dev->gdma_dev; + + mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_PD, sizeof(req), + sizeof(resp)); + + req.pd_handle = pd->pd_handle; + err = mana_gd_send_request(mdev->gdma_context, sizeof(req), &req, + sizeof(resp), &resp); + + if (err || resp.hdr.status) { + ibdev_dbg(&dev->ib_dev, + "Failed to destroy pd_handle 0x%llx err %d status %u", + pd->pd_handle, err, resp.hdr.status); + if (!err) + err = -EPROTO; + } + + return err; +} + +static int mana_gd_destroy_doorbell_page(struct gdma_context *gc, + int doorbell_page) +{ + struct gdma_destroy_resource_range_req req = {}; + struct gdma_resp_hdr resp = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_RESOURCE_RANGE, + sizeof(req), sizeof(resp)); + + req.resource_type = GDMA_RESOURCE_DOORBELL_PAGE; + req.num_resources = 1; + req.allocated_resources = doorbell_page; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.status) { + dev_err(gc->dev, + "Failed to destroy doorbell page: ret %d, 0x%x\n", + err, resp.status); + return err ?: -EPROTO; + } + + return 0; +} + +static int mana_gd_allocate_doorbell_page(struct gdma_context *gc, + int *doorbell_page) +{ + struct gdma_allocate_resource_range_req req = {}; + struct gdma_allocate_resource_range_resp resp = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, GDMA_ALLOCATE_RESOURCE_RANGE, + sizeof(req), sizeof(resp)); + + req.resource_type = GDMA_RESOURCE_DOORBELL_PAGE; + req.num_resources = 1; + req.alignment = 1; + + /* Have GDMA start searching from 0 */ + req.allocated_resources = 0; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.hdr.status) { + dev_err(gc->dev, + "Failed to allocate doorbell page: ret %d, 0x%x\n", + err, resp.hdr.status); + return err ?: -EPROTO; + } + + *doorbell_page = resp.allocated_resources; + + return 0; +} + +int mana_ib_alloc_ucontext(struct ib_ucontext *ibcontext, + struct ib_udata *udata) +{ + struct mana_ib_ucontext *ucontext = + container_of(ibcontext, struct mana_ib_ucontext, ibucontext); + struct ib_device *ibdev = ibcontext->device; + struct mana_ib_dev *mdev; + struct gdma_context *gc; + struct gdma_dev *dev; + int doorbell_page; + int ret; + + mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); + dev = mdev->gdma_dev; + gc = dev->gdma_context; + + /* Allocate a doorbell page index */ + ret = mana_gd_allocate_doorbell_page(gc, &doorbell_page); + if (ret) { + ibdev_dbg(ibdev, "Failed to allocate doorbell page %d\n", ret); + return ret; + } + + ibdev_dbg(ibdev, "Doorbell page allocated %d\n", doorbell_page); + + ucontext->doorbell = doorbell_page; + + return 0; +} + +void mana_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) +{ + struct mana_ib_ucontext *mana_ucontext = + container_of(ibcontext, struct mana_ib_ucontext, ibucontext); + struct ib_device *ibdev = ibcontext->device; + struct mana_ib_dev *mdev; + struct gdma_context *gc; + int ret; + + mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); + gc = mdev->gdma_dev->gdma_context; + + ret = mana_gd_destroy_doorbell_page(gc, mana_ucontext->doorbell); + if (ret) + ibdev_dbg(ibdev, "Failed to destroy doorbell page %d\n", ret); +} + +static int +mana_ib_gd_first_dma_region(struct mana_ib_dev *dev, + struct gdma_context *gc, + struct gdma_create_dma_region_req *create_req, + size_t num_pages, mana_handle_t *gdma_region) +{ + struct gdma_create_dma_region_resp create_resp = {}; + unsigned int create_req_msg_size; + int err; + + create_req_msg_size = + struct_size(create_req, page_addr_list, num_pages); + create_req->page_addr_list_len = num_pages; + + err = mana_gd_send_request(gc, create_req_msg_size, create_req, + sizeof(create_resp), &create_resp); + if (err || create_resp.hdr.status) { + ibdev_dbg(&dev->ib_dev, + "Failed to create DMA region: %d, 0x%x\n", + err, create_resp.hdr.status); + if (!err) + err = -EPROTO; + + return err; + } + + *gdma_region = create_resp.dma_region_handle; + ibdev_dbg(&dev->ib_dev, "Created DMA region handle 0x%llx\n", + *gdma_region); + + return 0; +} + +static int +mana_ib_gd_add_dma_region(struct mana_ib_dev *dev, struct gdma_context *gc, + struct gdma_dma_region_add_pages_req *add_req, + unsigned int num_pages, u32 expected_status) +{ + unsigned int add_req_msg_size = + struct_size(add_req, page_addr_list, num_pages); + struct gdma_general_resp add_resp = {}; + int err; + + mana_gd_init_req_hdr(&add_req->hdr, GDMA_DMA_REGION_ADD_PAGES, + add_req_msg_size, sizeof(add_resp)); + add_req->page_addr_list_len = num_pages; + + err = mana_gd_send_request(gc, add_req_msg_size, add_req, + sizeof(add_resp), &add_resp); + if (err || add_resp.hdr.status != expected_status) { + ibdev_dbg(&dev->ib_dev, + "Failed to create DMA region: %d, 0x%x\n", + err, add_resp.hdr.status); + + if (!err) + err = -EPROTO; + + return err; + } + + return 0; +} + +int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, + mana_handle_t *gdma_region) +{ + struct gdma_dma_region_add_pages_req *add_req = NULL; + size_t num_pages_processed = 0, num_pages_to_handle; + struct gdma_create_dma_region_req *create_req; + unsigned int create_req_msg_size; + struct hw_channel_context *hwc; + struct ib_block_iter biter; + size_t max_pgs_add_cmd = 0; + size_t max_pgs_create_cmd; + struct gdma_context *gc; + size_t num_pages_total; + struct gdma_dev *mdev; + unsigned long page_sz; + unsigned int tail = 0; + u64 *page_addr_list; + void *request_buf; + int err; + + mdev = dev->gdma_dev; + gc = mdev->gdma_context; + hwc = gc->hwc.driver_data; + + /* Hardware requires dma region to align to chosen page size */ + page_sz = ib_umem_find_best_pgsz(umem, PAGE_SZ_BM, 0); + if (!page_sz) { + ibdev_dbg(&dev->ib_dev, "failed to find page size.\n"); + return -ENOMEM; + } + num_pages_total = ib_umem_num_dma_blocks(umem, page_sz); + + max_pgs_create_cmd = + (hwc->max_req_msg_size - sizeof(*create_req)) / sizeof(u64); + num_pages_to_handle = + min_t(size_t, num_pages_total, max_pgs_create_cmd); + create_req_msg_size = + struct_size(create_req, page_addr_list, num_pages_to_handle); + + request_buf = kzalloc(hwc->max_req_msg_size, GFP_KERNEL); + if (!request_buf) + return -ENOMEM; + + create_req = request_buf; + mana_gd_init_req_hdr(&create_req->hdr, GDMA_CREATE_DMA_REGION, + create_req_msg_size, + sizeof(struct gdma_create_dma_region_resp)); + + create_req->length = umem->length; + create_req->offset_in_page = umem->address & (page_sz - 1); + create_req->gdma_page_type = order_base_2(page_sz) - PAGE_SHIFT; + create_req->page_count = num_pages_total; + + ibdev_dbg(&dev->ib_dev, "size_dma_region %lu num_pages_total %lu\n", + umem->length, num_pages_total); + + ibdev_dbg(&dev->ib_dev, "page_sz %lu offset_in_page %u\n", + page_sz, create_req->offset_in_page); + + ibdev_dbg(&dev->ib_dev, "num_pages_to_handle %lu, gdma_page_type %u", + num_pages_to_handle, create_req->gdma_page_type); + + page_addr_list = create_req->page_addr_list; + rdma_umem_for_each_dma_block(umem, &biter, page_sz) { + page_addr_list[tail++] = rdma_block_iter_dma_address(&biter); + if (tail < num_pages_to_handle) + continue; + + if (!num_pages_processed) { + /* First create message */ + err = mana_ib_gd_first_dma_region(dev, gc, create_req, + tail, gdma_region); + if (err) + goto out; + + max_pgs_add_cmd = (hwc->max_req_msg_size - + sizeof(*add_req)) / sizeof(u64); + + add_req = request_buf; + add_req->dma_region_handle = *gdma_region; + add_req->reserved3 = 0; + page_addr_list = add_req->page_addr_list; + } else { + /* Subsequent create messages */ + u32 expected_s = 0; + + if (num_pages_processed + num_pages_to_handle < + num_pages_total) + expected_s = GDMA_STATUS_MORE_ENTRIES; + + err = mana_ib_gd_add_dma_region(dev, gc, add_req, tail, + expected_s); + if (err) + break; + } + + num_pages_processed += tail; + tail = 0; + + /* The remaining pages to create */ + num_pages_to_handle = + min_t(size_t, + num_pages_total - num_pages_processed, + max_pgs_add_cmd); + } + + if (err) + mana_ib_gd_destroy_dma_region(dev, *gdma_region); + +out: + kfree(request_buf); + return err; +} + +int mana_ib_gd_destroy_dma_region(struct mana_ib_dev *dev, u64 gdma_region) +{ + struct gdma_dev *mdev = dev->gdma_dev; + struct gdma_context *gc; + + gc = mdev->gdma_context; + ibdev_dbg(&dev->ib_dev, "destroy dma region 0x%llx\n", gdma_region); + + return mana_gd_destroy_dma_region(gc, gdma_region); +} + +int mana_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) +{ + struct mana_ib_ucontext *mana_ucontext = + container_of(ibcontext, struct mana_ib_ucontext, ibucontext); + struct ib_device *ibdev = ibcontext->device; + struct mana_ib_dev *mdev; + struct gdma_context *gc; + phys_addr_t pfn; + pgprot_t prot; + int ret; + + mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); + gc = mdev->gdma_dev->gdma_context; + + if (vma->vm_pgoff != 0) { + ibdev_dbg(ibdev, "Unexpected vm_pgoff %lu\n", vma->vm_pgoff); + return -EINVAL; + } + + /* Map to the page indexed by ucontext->doorbell */ + pfn = (gc->phys_db_page_base + + gc->db_page_size * mana_ucontext->doorbell) >> + PAGE_SHIFT; + prot = pgprot_writecombine(vma->vm_page_prot); + + ret = rdma_user_mmap_io(ibcontext, vma, pfn, gc->db_page_size, prot, + NULL); + if (ret) + ibdev_dbg(ibdev, "can't rdma_user_mmap_io ret %d\n", ret); + else + ibdev_dbg(ibdev, "mapped I/O pfn 0x%llx page_size %u, ret %d\n", + pfn, gc->db_page_size, ret); + + return ret; +} + +int mana_ib_get_port_immutable(struct ib_device *ibdev, u32 port_num, + struct ib_port_immutable *immutable) +{ + /* + * This version only support RAW_PACKET + * other values need to be filled for other types + */ + immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET; + + return 0; +} + +int mana_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props, + struct ib_udata *uhw) +{ + props->max_qp = MANA_MAX_NUM_QUEUES; + props->max_qp_wr = MAX_SEND_BUFFERS_PER_QUEUE; + + /* + * max_cqe could be potentially much bigger. + * As this version of driver only support RAW QP, set it to the same + * value as max_qp_wr + */ + props->max_cqe = MAX_SEND_BUFFERS_PER_QUEUE; + + props->max_mr_size = MANA_IB_MAX_MR_SIZE; + props->max_mr = MANA_IB_MAX_MR; + props->max_send_sge = MAX_TX_WQE_SGL_ENTRIES; + props->max_recv_sge = MAX_RX_WQE_SGL_ENTRIES; + + return 0; +} + +int mana_ib_query_port(struct ib_device *ibdev, u32 port, + struct ib_port_attr *props) +{ + /* This version doesn't return port properties */ + return 0; +} + +int mana_ib_query_gid(struct ib_device *ibdev, u32 port, int index, + union ib_gid *gid) +{ + /* This version doesn't return GID properties */ + return 0; +} + +void mana_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) +{ +} diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h new file mode 100644 index 000000000000..502cc8672eef --- /dev/null +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2022 Microsoft Corporation. All rights reserved. + */ + +#ifndef _MANA_IB_H_ +#define _MANA_IB_H_ + +#include +#include +#include +#include +#include + +#include + +#define PAGE_SZ_BM \ + (SZ_4K | SZ_8K | SZ_16K | SZ_32K | SZ_64K | SZ_128K | SZ_256K | \ + SZ_512K | SZ_1M | SZ_2M) + +/* MANA doesn't have any limit for MR size */ +#define MANA_IB_MAX_MR_SIZE U64_MAX + +/* + * The hardware limit of number of MRs is greater than maximum number of MRs + * that can possibly represent in 24 bits + */ +#define MANA_IB_MAX_MR 0xFFFFFFu + +struct mana_ib_dev { + struct ib_device ib_dev; + struct gdma_dev *gdma_dev; +}; + +struct mana_ib_wq { + struct ib_wq ibwq; + struct ib_umem *umem; + int wqe; + u32 wq_buf_size; + u64 gdma_region; + u64 id; + mana_handle_t rx_object; +}; + +struct mana_ib_pd { + struct ib_pd ibpd; + u32 pdn; + mana_handle_t pd_handle; + + /* Mutex for sharing access to vport_use_count */ + struct mutex vport_mutex; + int vport_use_count; + + bool tx_shortform_allowed; + u32 tx_vp_offset; +}; + +struct mana_ib_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + mana_handle_t mr_handle; +}; + +struct mana_ib_cq { + struct ib_cq ibcq; + struct ib_umem *umem; + int cqe; + u64 gdma_region; + u64 id; +}; + +struct mana_ib_qp { + struct ib_qp ibqp; + + /* Work queue info */ + struct ib_umem *sq_umem; + int sqe; + u64 sq_gdma_region; + u64 sq_id; + mana_handle_t tx_object; + + /* The port on the IB device, starting with 1 */ + u32 port; +}; + +struct mana_ib_ucontext { + struct ib_ucontext ibucontext; + u32 doorbell; +}; + +struct mana_ib_rwq_ind_table { + struct ib_rwq_ind_table ib_ind_table; +}; + +int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, + mana_handle_t *gdma_region); + +int mana_ib_gd_destroy_dma_region(struct mana_ib_dev *dev, + mana_handle_t gdma_region); + +struct ib_wq *mana_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata); + +int mana_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask, struct ib_udata *udata); + +int mana_ib_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata); + +int mana_ib_create_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_table, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata); + +int mana_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl); + +struct ib_mr *mana_ib_get_dma_mr(struct ib_pd *ibpd, int access_flags); + +struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 iova, int access_flags, + struct ib_udata *udata); + +int mana_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); + +int mana_ib_create_qp(struct ib_qp *qp, struct ib_qp_init_attr *qp_init_attr, + struct ib_udata *udata); + +int mana_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); + +int mana_ib_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); + +int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port_id, + struct mana_ib_pd *pd, u32 doorbell_id); +void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd, + u32 port); + +int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); + +int mana_ib_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); + +int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); +int mana_ib_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); + +int mana_ib_alloc_ucontext(struct ib_ucontext *ibcontext, + struct ib_udata *udata); +void mana_ib_dealloc_ucontext(struct ib_ucontext *ibcontext); + +int mana_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma); + +int mana_ib_get_port_immutable(struct ib_device *ibdev, u32 port_num, + struct ib_port_immutable *immutable); +int mana_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props, + struct ib_udata *uhw); +int mana_ib_query_port(struct ib_device *ibdev, u32 port, + struct ib_port_attr *props); +int mana_ib_query_gid(struct ib_device *ibdev, u32 port, int index, + union ib_gid *gid); + +void mana_ib_disassociate_ucontext(struct ib_ucontext *ibcontext); + +#endif diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c new file mode 100644 index 000000000000..a56236cdd9ee --- /dev/null +++ b/drivers/infiniband/hw/mana/mr.c @@ -0,0 +1,198 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. + */ + +#include "mana_ib.h" + +#define VALID_MR_FLAGS \ + (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ) + +static enum gdma_mr_access_flags +mana_ib_verbs_to_gdma_access_flags(int access_flags) +{ + enum gdma_mr_access_flags flags = GDMA_ACCESS_FLAG_LOCAL_READ; + + if (access_flags & IB_ACCESS_LOCAL_WRITE) + flags |= GDMA_ACCESS_FLAG_LOCAL_WRITE; + + if (access_flags & IB_ACCESS_REMOTE_WRITE) + flags |= GDMA_ACCESS_FLAG_REMOTE_WRITE; + + if (access_flags & IB_ACCESS_REMOTE_READ) + flags |= GDMA_ACCESS_FLAG_REMOTE_READ; + + return flags; +} + +static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct mana_ib_mr *mr, + struct gdma_create_mr_params *mr_params) +{ + struct gdma_create_mr_response resp = {}; + struct gdma_create_mr_request req = {}; + struct gdma_dev *mdev = dev->gdma_dev; + struct gdma_context *gc; + int err; + + gc = mdev->gdma_context; + + mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_MR, sizeof(req), + sizeof(resp)); + req.pd_handle = mr_params->pd_handle; + req.mr_type = mr_params->mr_type; + + switch (mr_params->mr_type) { + case GDMA_MR_TYPE_GVA: + req.gva.dma_region_handle = mr_params->gva.dma_region_handle; + req.gva.virtual_address = mr_params->gva.virtual_address; + req.gva.access_flags = mr_params->gva.access_flags; + break; + + default: + ibdev_dbg(&dev->ib_dev, + "invalid param (GDMA_MR_TYPE) passed, type %d\n", + req.mr_type); + return -EINVAL; + } + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + + if (err || resp.hdr.status) { + ibdev_dbg(&dev->ib_dev, "Failed to create mr %d, %u", err, + resp.hdr.status); + if (!err) + err = -EPROTO; + + return err; + } + + mr->ibmr.lkey = resp.lkey; + mr->ibmr.rkey = resp.rkey; + mr->mr_handle = resp.mr_handle; + + return 0; +} + +static int mana_ib_gd_destroy_mr(struct mana_ib_dev *dev, + gdma_obj_handle_t mr_handle) +{ + struct gdma_destroy_mr_response resp = {}; + struct gdma_destroy_mr_request req = {}; + struct gdma_dev *mdev = dev->gdma_dev; + struct gdma_context *gc; + int err; + + gc = mdev->gdma_context; + + mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_MR, sizeof(req), + sizeof(resp)); + + req.mr_handle = mr_handle; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.hdr.status) { + dev_err(gc->dev, "Failed to destroy MR: %d, 0x%x\n", err, + resp.hdr.status); + if (!err) + err = -EPROTO; + return err; + } + + return 0; +} + +struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, + u64 iova, int access_flags, + struct ib_udata *udata) +{ + struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); + struct gdma_create_mr_params mr_params = {}; + struct ib_device *ibdev = ibpd->device; + gdma_obj_handle_t dma_region_handle; + struct mana_ib_dev *dev; + struct mana_ib_mr *mr; + int err; + + dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + + ibdev_dbg(ibdev, + "start 0x%llx, iova 0x%llx length 0x%llx access_flags 0x%x", + start, iova, length, access_flags); + + if (access_flags & ~VALID_MR_FLAGS) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr->umem = ib_umem_get(ibdev, start, length, access_flags); + if (IS_ERR(mr->umem)) { + err = PTR_ERR(mr->umem); + ibdev_dbg(ibdev, + "Failed to get umem for register user-mr, %d\n", err); + goto err_free; + } + + err = mana_ib_gd_create_dma_region(dev, mr->umem, &dma_region_handle); + if (err) { + ibdev_dbg(ibdev, "Failed create dma region for user-mr, %d\n", + err); + goto err_umem; + } + + ibdev_dbg(ibdev, + "mana_ib_gd_create_dma_region ret %d gdma_region %llx\n", err, + dma_region_handle); + + mr_params.pd_handle = pd->pd_handle; + mr_params.mr_type = GDMA_MR_TYPE_GVA; + mr_params.gva.dma_region_handle = dma_region_handle; + mr_params.gva.virtual_address = iova; + mr_params.gva.access_flags = + mana_ib_verbs_to_gdma_access_flags(access_flags); + + err = mana_ib_gd_create_mr(dev, mr, &mr_params); + if (err) + goto err_dma_region; + + /* + * There is no need to keep track of dma_region_handle after MR is + * successfully created. The dma_region_handle is tracked in the PF + * as part of the lifecycle of this MR. + */ + + return &mr->ibmr; + +err_dma_region: + mana_gd_destroy_dma_region(dev->gdma_dev->gdma_context, + dma_region_handle); + +err_umem: + ib_umem_release(mr->umem); + +err_free: + kfree(mr); + return ERR_PTR(err); +} + +int mana_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +{ + struct mana_ib_mr *mr = container_of(ibmr, struct mana_ib_mr, ibmr); + struct ib_device *ibdev = ibmr->device; + struct mana_ib_dev *dev; + int err; + + dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + + err = mana_ib_gd_destroy_mr(dev, mr->mr_handle); + if (err) + return err; + + if (mr->umem) + ib_umem_release(mr->umem); + + kfree(mr); + + return 0; +} diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c new file mode 100644 index 000000000000..ea15ec77e321 --- /dev/null +++ b/drivers/infiniband/hw/mana/qp.c @@ -0,0 +1,506 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. + */ + +#include "mana_ib.h" + +static int mana_ib_cfg_vport_steering(struct mana_ib_dev *dev, + struct net_device *ndev, + mana_handle_t default_rxobj, + mana_handle_t ind_table[], + u32 log_ind_tbl_size, u32 rx_hash_key_len, + u8 *rx_hash_key) +{ + struct mana_port_context *mpc = netdev_priv(ndev); + struct mana_cfg_rx_steer_req *req = NULL; + struct mana_cfg_rx_steer_resp resp = {}; + mana_handle_t *req_indir_tab; + struct gdma_context *gc; + struct gdma_dev *mdev; + u32 req_buf_size; + int i, err; + + mdev = dev->gdma_dev; + gc = mdev->gdma_context; + + req_buf_size = + sizeof(*req) + sizeof(mana_handle_t) * MANA_INDIRECT_TABLE_SIZE; + req = kzalloc(req_buf_size, GFP_KERNEL); + if (!req) + return -ENOMEM; + + mana_gd_init_req_hdr(&req->hdr, MANA_CONFIG_VPORT_RX, req_buf_size, + sizeof(resp)); + + req->vport = mpc->port_handle; + req->rx_enable = 1; + req->update_default_rxobj = 1; + req->default_rxobj = default_rxobj; + req->hdr.dev_id = mdev->dev_id; + + /* If there are more than 1 entries in indirection table, enable RSS */ + if (log_ind_tbl_size) + req->rss_enable = true; + + req->num_indir_entries = MANA_INDIRECT_TABLE_SIZE; + req->indir_tab_offset = sizeof(*req); + req->update_indir_tab = true; + + req_indir_tab = (mana_handle_t *)(req + 1); + /* The ind table passed to the hardware must have + * MANA_INDIRECT_TABLE_SIZE entries. Adjust the verb + * ind_table to MANA_INDIRECT_TABLE_SIZE if required + */ + ibdev_dbg(&dev->ib_dev, "ind table size %u\n", 1 << log_ind_tbl_size); + for (i = 0; i < MANA_INDIRECT_TABLE_SIZE; i++) { + req_indir_tab[i] = ind_table[i % (1 << log_ind_tbl_size)]; + ibdev_dbg(&dev->ib_dev, "index %u handle 0x%llx\n", i, + req_indir_tab[i]); + } + + req->update_hashkey = true; + if (rx_hash_key_len) + memcpy(req->hashkey, rx_hash_key, rx_hash_key_len); + else + netdev_rss_key_fill(req->hashkey, MANA_HASH_KEY_SIZE); + + ibdev_dbg(&dev->ib_dev, "vport handle %llu default_rxobj 0x%llx\n", + req->vport, default_rxobj); + + err = mana_gd_send_request(gc, req_buf_size, req, sizeof(resp), &resp); + if (err) { + netdev_err(ndev, "Failed to configure vPort RX: %d\n", err); + goto out; + } + + if (resp.hdr.status) { + netdev_err(ndev, "vPort RX configuration failed: 0x%x\n", + resp.hdr.status); + err = -EPROTO; + goto out; + } + + netdev_info(ndev, "Configured steering vPort %llu log_entries %u\n", + mpc->port_handle, log_ind_tbl_size); + +out: + kfree(req); + return err; +} + +static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, + struct ib_qp_init_attr *attr, + struct ib_udata *udata) +{ + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); + struct mana_ib_dev *mdev = + container_of(pd->device, struct mana_ib_dev, ib_dev); + struct ib_rwq_ind_table *ind_tbl = attr->rwq_ind_tbl; + struct mana_ib_create_qp_rss_resp resp = {}; + struct mana_ib_create_qp_rss ucmd = {}; + struct gdma_dev *gd = mdev->gdma_dev; + mana_handle_t *mana_ind_table; + struct mana_port_context *mpc; + struct mana_context *mc; + struct net_device *ndev; + struct mana_ib_cq *cq; + struct mana_ib_wq *wq; + unsigned int ind_tbl_size; + struct ib_cq *ibcq; + struct ib_wq *ibwq; + int i = 0; + u32 port; + int ret; + + mc = gd->driver_data; + + if (!udata || udata->inlen < sizeof(ucmd)) + return -EINVAL; + + ret = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)); + if (ret) { + ibdev_dbg(&mdev->ib_dev, + "Failed copy from udata for create rss-qp, err %d\n", + ret); + return ret; + } + + if (attr->cap.max_recv_wr > MAX_SEND_BUFFERS_PER_QUEUE) { + ibdev_dbg(&mdev->ib_dev, + "Requested max_recv_wr %d exceeding limit\n", + attr->cap.max_recv_wr); + return -EINVAL; + } + + if (attr->cap.max_recv_sge > MAX_RX_WQE_SGL_ENTRIES) { + ibdev_dbg(&mdev->ib_dev, + "Requested max_recv_sge %d exceeding limit\n", + attr->cap.max_recv_sge); + return -EINVAL; + } + + ind_tbl_size = 1 << ind_tbl->log_ind_tbl_size; + if (ind_tbl_size > MANA_INDIRECT_TABLE_SIZE) { + ibdev_dbg(&mdev->ib_dev, + "Indirect table size %d exceeding limit\n", + ind_tbl_size); + return -EINVAL; + } + + if (ucmd.rx_hash_function != MANA_IB_RX_HASH_FUNC_TOEPLITZ) { + ibdev_dbg(&mdev->ib_dev, + "RX Hash function is not supported, %d\n", + ucmd.rx_hash_function); + return -EINVAL; + } + + /* IB ports start with 1, MANA start with 0 */ + port = ucmd.port; + if (port < 1 || port > mc->num_ports) { + ibdev_dbg(&mdev->ib_dev, "Invalid port %u in creating qp\n", + port); + return -EINVAL; + } + ndev = mc->ports[port - 1]; + mpc = netdev_priv(ndev); + + ibdev_dbg(&mdev->ib_dev, "rx_hash_function %d port %d\n", + ucmd.rx_hash_function, port); + + mana_ind_table = kcalloc(ind_tbl_size, sizeof(mana_handle_t), + GFP_KERNEL); + if (!mana_ind_table) { + ret = -ENOMEM; + goto fail; + } + + qp->port = port; + + for (i = 0; i < ind_tbl_size; i++) { + struct mana_obj_spec wq_spec = {}; + struct mana_obj_spec cq_spec = {}; + + ibwq = ind_tbl->ind_tbl[i]; + wq = container_of(ibwq, struct mana_ib_wq, ibwq); + + ibcq = ibwq->cq; + cq = container_of(ibcq, struct mana_ib_cq, ibcq); + + wq_spec.gdma_region = wq->gdma_region; + wq_spec.queue_size = wq->wq_buf_size; + + cq_spec.gdma_region = cq->gdma_region; + cq_spec.queue_size = cq->cqe * COMP_ENTRY_SIZE; + cq_spec.modr_ctx_id = 0; + cq_spec.attached_eq = GDMA_CQ_NO_EQ; + + ret = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_RQ, + &wq_spec, &cq_spec, &wq->rx_object); + if (ret) + goto fail; + + /* The GDMA regions are now owned by the WQ object */ + wq->gdma_region = GDMA_INVALID_DMA_REGION; + cq->gdma_region = GDMA_INVALID_DMA_REGION; + + wq->id = wq_spec.queue_index; + cq->id = cq_spec.queue_index; + + ibdev_dbg(&mdev->ib_dev, + "ret %d rx_object 0x%llx wq id %llu cq id %llu\n", + ret, wq->rx_object, wq->id, cq->id); + + resp.entries[i].cqid = cq->id; + resp.entries[i].wqid = wq->id; + + mana_ind_table[i] = wq->rx_object; + } + resp.num_entries = i; + + ret = mana_ib_cfg_vport_steering(mdev, ndev, wq->rx_object, + mana_ind_table, + ind_tbl->log_ind_tbl_size, + ucmd.rx_hash_key_len, + ucmd.rx_hash_key); + if (ret) + goto fail; + + ret = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (ret) { + ibdev_dbg(&mdev->ib_dev, + "Failed to copy to udata create rss-qp, %d\n", + ret); + goto fail; + } + + kfree(mana_ind_table); + + return 0; + +fail: + while (i-- > 0) { + ibwq = ind_tbl->ind_tbl[i]; + wq = container_of(ibwq, struct mana_ib_wq, ibwq); + mana_destroy_wq_obj(mpc, GDMA_RQ, wq->rx_object); + } + + kfree(mana_ind_table); + + return ret; +} + +static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, + struct ib_qp_init_attr *attr, + struct ib_udata *udata) +{ + struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); + struct mana_ib_dev *mdev = + container_of(ibpd->device, struct mana_ib_dev, ib_dev); + struct mana_ib_cq *send_cq = + container_of(attr->send_cq, struct mana_ib_cq, ibcq); + struct mana_ib_ucontext *mana_ucontext = + rdma_udata_to_drv_context(udata, struct mana_ib_ucontext, + ibucontext); + struct mana_ib_create_qp_resp resp = {}; + struct gdma_dev *gd = mdev->gdma_dev; + struct mana_ib_create_qp ucmd = {}; + struct mana_obj_spec wq_spec = {}; + struct mana_obj_spec cq_spec = {}; + struct mana_port_context *mpc; + struct mana_context *mc; + struct net_device *ndev; + struct ib_umem *umem; + int err; + u32 port; + + mc = gd->driver_data; + + if (!mana_ucontext || udata->inlen < sizeof(ucmd)) + return -EINVAL; + + err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)); + if (err) { + ibdev_dbg(&mdev->ib_dev, + "Failed to copy from udata create qp-raw, %d\n", err); + return err; + } + + /* IB ports start with 1, MANA Ethernet ports start with 0 */ + port = ucmd.port; + if (ucmd.port > mc->num_ports) + return -EINVAL; + + if (attr->cap.max_send_wr > MAX_SEND_BUFFERS_PER_QUEUE) { + ibdev_dbg(&mdev->ib_dev, + "Requested max_send_wr %d exceeding limit\n", + attr->cap.max_send_wr); + return -EINVAL; + } + + if (attr->cap.max_send_sge > MAX_TX_WQE_SGL_ENTRIES) { + ibdev_dbg(&mdev->ib_dev, + "Requested max_send_sge %d exceeding limit\n", + attr->cap.max_send_sge); + return -EINVAL; + } + + ndev = mc->ports[port - 1]; + mpc = netdev_priv(ndev); + ibdev_dbg(&mdev->ib_dev, "port %u ndev %p mpc %p\n", port, ndev, mpc); + + err = mana_ib_cfg_vport(mdev, port - 1, pd, mana_ucontext->doorbell); + if (err) + return -ENODEV; + + qp->port = port; + + ibdev_dbg(&mdev->ib_dev, "ucmd sq_buf_addr 0x%llx port %u\n", + ucmd.sq_buf_addr, ucmd.port); + + umem = ib_umem_get(ibpd->device, ucmd.sq_buf_addr, ucmd.sq_buf_size, + IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(umem)) { + err = PTR_ERR(umem); + ibdev_dbg(&mdev->ib_dev, + "Failed to get umem for create qp-raw, err %d\n", + err); + goto err_free_vport; + } + qp->sq_umem = umem; + + err = mana_ib_gd_create_dma_region(mdev, qp->sq_umem, + &qp->sq_gdma_region); + if (err) { + ibdev_dbg(&mdev->ib_dev, + "Failed to create dma region for create qp-raw, %d\n", + err); + goto err_release_umem; + } + + ibdev_dbg(&mdev->ib_dev, + "mana_ib_gd_create_dma_region ret %d gdma_region 0x%llx\n", + err, qp->sq_gdma_region); + + /* Create a WQ on the same port handle used by the Ethernet */ + wq_spec.gdma_region = qp->sq_gdma_region; + wq_spec.queue_size = ucmd.sq_buf_size; + + cq_spec.gdma_region = send_cq->gdma_region; + cq_spec.queue_size = send_cq->cqe * COMP_ENTRY_SIZE; + cq_spec.modr_ctx_id = 0; + cq_spec.attached_eq = GDMA_CQ_NO_EQ; + + err = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_SQ, &wq_spec, + &cq_spec, &qp->tx_object); + if (err) { + ibdev_dbg(&mdev->ib_dev, + "Failed to create wq for create raw-qp, err %d\n", + err); + goto err_destroy_dma_region; + } + + /* The GDMA regions are now owned by the WQ object */ + qp->sq_gdma_region = GDMA_INVALID_DMA_REGION; + send_cq->gdma_region = GDMA_INVALID_DMA_REGION; + + qp->sq_id = wq_spec.queue_index; + send_cq->id = cq_spec.queue_index; + + ibdev_dbg(&mdev->ib_dev, + "ret %d qp->tx_object 0x%llx sq id %llu cq id %llu\n", err, + qp->tx_object, qp->sq_id, send_cq->id); + + resp.sqid = qp->sq_id; + resp.cqid = send_cq->id; + resp.tx_vp_offset = pd->tx_vp_offset; + + err = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (err) { + ibdev_dbg(&mdev->ib_dev, + "Failed copy udata for create qp-raw, %d\n", + err); + goto err_destroy_wq_obj; + } + + return 0; + +err_destroy_wq_obj: + mana_destroy_wq_obj(mpc, GDMA_SQ, qp->tx_object); + +err_destroy_dma_region: + mana_ib_gd_destroy_dma_region(mdev, qp->sq_gdma_region); + +err_release_umem: + ib_umem_release(umem); + +err_free_vport: + mana_ib_uncfg_vport(mdev, pd, port - 1); + + return err; +} + +int mana_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, + struct ib_udata *udata) +{ + switch (attr->qp_type) { + case IB_QPT_RAW_PACKET: + /* When rwq_ind_tbl is used, it's for creating WQs for RSS */ + if (attr->rwq_ind_tbl) + return mana_ib_create_qp_rss(ibqp, ibqp->pd, attr, + udata); + + return mana_ib_create_qp_raw(ibqp, ibqp->pd, attr, udata); + default: + /* Creating QP other than IB_QPT_RAW_PACKET is not supported */ + ibdev_dbg(ibqp->device, "Creating QP type %u not supported\n", + attr->qp_type); + } + + return -EINVAL; +} + +int mana_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + /* modify_qp is not supported by this version of the driver */ + return -EOPNOTSUPP; +} + +static int mana_ib_destroy_qp_rss(struct mana_ib_qp *qp, + struct ib_rwq_ind_table *ind_tbl, + struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = + container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev); + struct gdma_dev *gd = mdev->gdma_dev; + struct mana_port_context *mpc; + struct mana_context *mc; + struct net_device *ndev; + struct mana_ib_wq *wq; + struct ib_wq *ibwq; + int i; + + mc = gd->driver_data; + ndev = mc->ports[qp->port - 1]; + mpc = netdev_priv(ndev); + + for (i = 0; i < (1 << ind_tbl->log_ind_tbl_size); i++) { + ibwq = ind_tbl->ind_tbl[i]; + wq = container_of(ibwq, struct mana_ib_wq, ibwq); + ibdev_dbg(&mdev->ib_dev, "destroying wq->rx_object %llu\n", + wq->rx_object); + mana_destroy_wq_obj(mpc, GDMA_RQ, wq->rx_object); + } + + return 0; +} + +static int mana_ib_destroy_qp_raw(struct mana_ib_qp *qp, struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = + container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev); + struct gdma_dev *gd = mdev->gdma_dev; + struct ib_pd *ibpd = qp->ibqp.pd; + struct mana_port_context *mpc; + struct mana_context *mc; + struct net_device *ndev; + struct mana_ib_pd *pd; + + mc = gd->driver_data; + ndev = mc->ports[qp->port - 1]; + mpc = netdev_priv(ndev); + pd = container_of(ibpd, struct mana_ib_pd, ibpd); + + mana_destroy_wq_obj(mpc, GDMA_SQ, qp->tx_object); + + if (qp->sq_umem) { + mana_ib_gd_destroy_dma_region(mdev, qp->sq_gdma_region); + ib_umem_release(qp->sq_umem); + } + + mana_ib_uncfg_vport(mdev, pd, qp->port - 1); + + return 0; +} + +int mana_ib_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) +{ + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); + + switch (ibqp->qp_type) { + case IB_QPT_RAW_PACKET: + if (ibqp->rwq_ind_tbl) + return mana_ib_destroy_qp_rss(qp, ibqp->rwq_ind_tbl, + udata); + + return mana_ib_destroy_qp_raw(qp, udata); + + default: + ibdev_dbg(ibqp->device, "Unexpected QP type %u\n", + ibqp->qp_type); + } + + return -ENOENT; +} diff --git a/drivers/infiniband/hw/mana/wq.c b/drivers/infiniband/hw/mana/wq.c new file mode 100644 index 000000000000..372d361510e0 --- /dev/null +++ b/drivers/infiniband/hw/mana/wq.c @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. + */ + +#include "mana_ib.h" + +struct ib_wq *mana_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = + container_of(pd->device, struct mana_ib_dev, ib_dev); + struct mana_ib_create_wq ucmd = {}; + struct mana_ib_wq *wq; + struct ib_umem *umem; + int err; + + if (udata->inlen < sizeof(ucmd)) + return ERR_PTR(-EINVAL); + + err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)); + if (err) { + ibdev_dbg(&mdev->ib_dev, + "Failed to copy from udata for create wq, %d\n", err); + return ERR_PTR(err); + } + + wq = kzalloc(sizeof(*wq), GFP_KERNEL); + if (!wq) + return ERR_PTR(-ENOMEM); + + ibdev_dbg(&mdev->ib_dev, "ucmd wq_buf_addr 0x%llx\n", ucmd.wq_buf_addr); + + umem = ib_umem_get(pd->device, ucmd.wq_buf_addr, ucmd.wq_buf_size, + IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(umem)) { + err = PTR_ERR(umem); + ibdev_dbg(&mdev->ib_dev, + "Failed to get umem for create wq, err %d\n", err); + goto err_free_wq; + } + + wq->umem = umem; + wq->wqe = init_attr->max_wr; + wq->wq_buf_size = ucmd.wq_buf_size; + wq->rx_object = INVALID_MANA_HANDLE; + + err = mana_ib_gd_create_dma_region(mdev, wq->umem, &wq->gdma_region); + if (err) { + ibdev_dbg(&mdev->ib_dev, + "Failed to create dma region for create wq, %d\n", + err); + goto err_release_umem; + } + + ibdev_dbg(&mdev->ib_dev, + "mana_ib_gd_create_dma_region ret %d gdma_region 0x%llx\n", + err, wq->gdma_region); + + /* WQ ID is returned at wq_create time, doesn't know the value yet */ + + return &wq->ibwq; + +err_release_umem: + ib_umem_release(umem); + +err_free_wq: + kfree(wq); + + return ERR_PTR(err); +} + +int mana_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask, struct ib_udata *udata) +{ + /* modify_wq is not supported by this version of the driver */ + return -EOPNOTSUPP; +} + +int mana_ib_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata) +{ + struct mana_ib_wq *wq = container_of(ibwq, struct mana_ib_wq, ibwq); + struct ib_device *ib_dev = ibwq->device; + struct mana_ib_dev *mdev; + + mdev = container_of(ib_dev, struct mana_ib_dev, ib_dev); + + mana_ib_gd_destroy_dma_region(mdev, wq->gdma_region); + ib_umem_release(wq->umem); + + kfree(wq); + + return 0; +} + +int mana_ib_create_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_table, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata) +{ + /* + * There is no additional data in ind_table to be maintained by this + * driver, do nothing + */ + return 0; +} + +int mana_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl) +{ + /* + * There is no additional data in ind_table to be maintained by this + * driver, do nothing + */ + return 0; +} diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 713a8f8cca9a..20212ffeefb9 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -412,6 +412,9 @@ int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf); extern const struct ethtool_ops mana_ethtool_ops; +/* A CQ can be created not associated with any EQ */ +#define GDMA_CQ_NO_EQ 0xffff + struct mana_obj_spec { u32 queue_index; u64 gdma_region; diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h index 7dd56210226f..e0c25537fd2e 100644 --- a/include/uapi/rdma/ib_user_ioctl_verbs.h +++ b/include/uapi/rdma/ib_user_ioctl_verbs.h @@ -251,6 +251,7 @@ enum rdma_driver_id { RDMA_DRIVER_EFA, RDMA_DRIVER_SIW, RDMA_DRIVER_ERDMA, + RDMA_DRIVER_MANA, }; enum ib_uverbs_gid_type { diff --git a/include/uapi/rdma/mana-abi.h b/include/uapi/rdma/mana-abi.h new file mode 100644 index 000000000000..5fcb31b37fb9 --- /dev/null +++ b/include/uapi/rdma/mana-abi.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) */ +/* + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. + */ + +#ifndef MANA_ABI_USER_H +#define MANA_ABI_USER_H + +#include +#include + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ + +#define MANA_IB_UVERBS_ABI_VERSION 1 + +struct mana_ib_create_cq { + __aligned_u64 buf_addr; +}; + +struct mana_ib_create_qp { + __aligned_u64 sq_buf_addr; + __u32 sq_buf_size; + __u32 port; +}; + +struct mana_ib_create_qp_resp { + __u32 sqid; + __u32 cqid; + __u32 tx_vp_offset; + __u32 reserved; +}; + +struct mana_ib_create_wq { + __aligned_u64 wq_buf_addr; + __u32 wq_buf_size; + __u32 reserved; +}; + +/* RX Hash function flags */ +enum mana_ib_rx_hash_function_flags { + MANA_IB_RX_HASH_FUNC_TOEPLITZ = 1 << 0, +}; + +struct mana_ib_create_qp_rss { + __aligned_u64 rx_hash_fields_mask; + __u8 rx_hash_function; + __u8 reserved[7]; + __u32 rx_hash_key_len; + __u8 rx_hash_key[40]; + __u32 port; +}; + +struct rss_resp_entry { + __u32 cqid; + __u32 wqid; +}; + +struct mana_ib_create_qp_rss_resp { + __aligned_u64 num_entries; + struct rss_resp_entry entries[64]; +}; + +#endif From 3574cfdca28543e2e8db649297cd6659ea8e4bb8 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 11 Nov 2022 11:55:29 +0200 Subject: [PATCH 051/122] RDMA/mana: Remove redefinition of basic u64 type gdma_obj_handle_t is no more than redefinition of basic u64 type. Remove such obfuscation. Link: https://lore.kernel.org/r/3c1e821279e6a165d058655d2343722d6650e776.1668160486.git.leonro@nvidia.com Acked-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/mr.c | 5 ++- .../net/ethernet/microsoft/mana/gdma_main.c | 3 +- include/net/mana/gdma.h | 31 +++++++++---------- 3 files changed, 17 insertions(+), 22 deletions(-) diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c index a56236cdd9ee..351207c60eb6 100644 --- a/drivers/infiniband/hw/mana/mr.c +++ b/drivers/infiniband/hw/mana/mr.c @@ -73,8 +73,7 @@ static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct mana_ib_mr *mr, return 0; } -static int mana_ib_gd_destroy_mr(struct mana_ib_dev *dev, - gdma_obj_handle_t mr_handle) +static int mana_ib_gd_destroy_mr(struct mana_ib_dev *dev, u64 mr_handle) { struct gdma_destroy_mr_response resp = {}; struct gdma_destroy_mr_request req = {}; @@ -108,9 +107,9 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); struct gdma_create_mr_params mr_params = {}; struct ib_device *ibdev = ibpd->device; - gdma_obj_handle_t dma_region_handle; struct mana_ib_dev *dev; struct mana_ib_mr *mr; + u64 dma_region_handle; int err; dev = container_of(ibdev, struct mana_ib_dev, ib_dev); diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 46a7d1e6ece9..69224ff8efb6 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -671,8 +671,7 @@ free_q: return err; } -int mana_gd_destroy_dma_region(struct gdma_context *gc, - gdma_obj_handle_t dma_region_handle) +int mana_gd_destroy_dma_region(struct gdma_context *gc, u64 dma_region_handle) { struct gdma_destroy_dma_region_req req = {}; struct gdma_general_resp resp = {}; diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 221adc96340c..a9fdae14d24c 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -65,8 +65,6 @@ enum { GDMA_DEVICE_MANA = 2, }; -typedef u64 gdma_obj_handle_t; - struct gdma_resource { /* Protect the bitmap */ spinlock_t lock; @@ -200,7 +198,7 @@ struct gdma_mem_info { u64 length; /* Allocated by the PF driver */ - gdma_obj_handle_t dma_region_handle; + u64 dma_region_handle; }; #define REGISTER_ATB_MST_MKEY_LOWER_SIZE 8 @@ -624,7 +622,7 @@ struct gdma_create_queue_req { u32 reserved1; u32 pdid; u32 doolbell_id; - gdma_obj_handle_t gdma_region; + u64 gdma_region; u32 reserved2; u32 queue_size; u32 log2_throttle_limit; @@ -699,14 +697,14 @@ struct gdma_create_dma_region_req { struct gdma_create_dma_region_resp { struct gdma_resp_hdr hdr; - gdma_obj_handle_t dma_region_handle; + u64 dma_region_handle; }; /* HW DATA */ /* GDMA_DMA_REGION_ADD_PAGES */ struct gdma_dma_region_add_pages_req { struct gdma_req_hdr hdr; - gdma_obj_handle_t dma_region_handle; + u64 dma_region_handle; u32 page_addr_list_len; u32 reserved3; @@ -718,7 +716,7 @@ struct gdma_dma_region_add_pages_req { struct gdma_destroy_dma_region_req { struct gdma_req_hdr hdr; - gdma_obj_handle_t dma_region_handle; + u64 dma_region_handle; }; /* HW DATA */ enum gdma_pd_flags { @@ -733,14 +731,14 @@ struct gdma_create_pd_req { struct gdma_create_pd_resp { struct gdma_resp_hdr hdr; - gdma_obj_handle_t pd_handle; + u64 pd_handle; u32 pd_id; u32 reserved; };/* HW DATA */ struct gdma_destroy_pd_req { struct gdma_req_hdr hdr; - gdma_obj_handle_t pd_handle; + u64 pd_handle; };/* HW DATA */ struct gdma_destory_pd_resp { @@ -756,11 +754,11 @@ enum gdma_mr_type { }; struct gdma_create_mr_params { - gdma_obj_handle_t pd_handle; + u64 pd_handle; enum gdma_mr_type mr_type; union { struct { - gdma_obj_handle_t dma_region_handle; + u64 dma_region_handle; u64 virtual_address; enum gdma_mr_access_flags access_flags; } gva; @@ -769,13 +767,13 @@ struct gdma_create_mr_params { struct gdma_create_mr_request { struct gdma_req_hdr hdr; - gdma_obj_handle_t pd_handle; + u64 pd_handle; enum gdma_mr_type mr_type; u32 reserved_1; union { struct { - gdma_obj_handle_t dma_region_handle; + u64 dma_region_handle; u64 virtual_address; enum gdma_mr_access_flags access_flags; } gva; @@ -786,14 +784,14 @@ struct gdma_create_mr_request { struct gdma_create_mr_response { struct gdma_resp_hdr hdr; - gdma_obj_handle_t mr_handle; + u64 mr_handle; u32 lkey; u32 rkey; };/* HW DATA */ struct gdma_destroy_mr_request { struct gdma_req_hdr hdr; - gdma_obj_handle_t mr_handle; + u64 mr_handle; };/* HW DATA */ struct gdma_destroy_mr_response { @@ -827,7 +825,6 @@ void mana_gd_free_memory(struct gdma_mem_info *gmi); int mana_gd_send_request(struct gdma_context *gc, u32 req_len, const void *req, u32 resp_len, void *resp); -int mana_gd_destroy_dma_region(struct gdma_context *gc, - gdma_obj_handle_t dma_region_handle); +int mana_gd_destroy_dma_region(struct gdma_context *gc, u64 dma_region_handle); #endif /* _GDMA_H */ From dac153f2802db1ad46207283cb9b2aae3d707a45 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Mon, 7 Nov 2022 10:51:34 +0200 Subject: [PATCH 052/122] RDMA/restrack: Release MR restrack when delete The MR restrack also needs to be released when delete it, otherwise it cause memory leak as the task struct won't be released. Fixes: 13ef5539def7 ("RDMA/restrack: Count references to the verbs objects") Signed-off-by: Mark Zhang Reviewed-by: Michael Guralnik Link: https://lore.kernel.org/r/703db18e8d4ef628691fb93980a709be673e62e3.1667810736.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/restrack.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 1f935d9f6178..01a499a8b88d 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -343,8 +343,6 @@ void rdma_restrack_del(struct rdma_restrack_entry *res) rt = &dev->res[res->type]; old = xa_erase(&rt->xa, res->id); - if (res->type == RDMA_RESTRACK_MR) - return; WARN_ON(old != res); out: From 5e15ff29b156bbbdeadae230c8ecd5ecd8ca2477 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Mon, 7 Nov 2022 10:51:35 +0200 Subject: [PATCH 053/122] RDMA/core: Make sure "ib_port" is valid when access sysfs node The "ib_port" structure must be set before adding the sysfs kobject, and reset after removing it, otherwise it may crash when accessing the sysfs node: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000050 Mem abort info: ESR = 0x96000006 Exception class = DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 Data abort info: ISV = 0, ISS = 0x00000006 CM = 0, WnR = 0 user pgtable: 4k pages, 48-bit VAs, pgdp = 00000000e85f5ba5 [0000000000000050] pgd=0000000848fd9003, pud=000000085b387003, pmd=0000000000000000 Internal error: Oops: 96000006 [#2] PREEMPT SMP Modules linked in: ib_umad(O) mlx5_ib(O) nfnetlink_cttimeout(E) nfnetlink(E) act_gact(E) cls_flower(E) sch_ingress(E) openvswitch(E) nsh(E) nf_nat_ipv6(E) nf_nat_ipv4(E) nf_conncount(E) nf_nat(E) nf_conntrack(E) nf_defrag_ipv6(E) nf_defrag_ipv4(E) mst_pciconf(O) ipmi_devintf(E) ipmi_msghandler(E) ipmb_dev_int(OE) mlx5_core(O) mlxfw(O) mlxdevm(O) auxiliary(O) ib_uverbs(O) ib_core(O) mlx_compat(O) psample(E) sbsa_gwdt(E) uio_pdrv_genirq(E) uio(E) mlxbf_pmc(OE) mlxbf_gige(OE) mlxbf_tmfifo(OE) gpio_mlxbf2(OE) pwr_mlxbf(OE) mlx_trio(OE) i2c_mlxbf(OE) mlx_bootctl(OE) bluefield_edac(OE) knem(O) ip_tables(E) ipv6(E) crc_ccitt(E) [last unloaded: mst_pci] Process grep (pid: 3372, stack limit = 0x0000000022055c92) CPU: 5 PID: 3372 Comm: grep Tainted: G D OE 4.19.161-mlnx.47.gadcd9e3 #1 Hardware name: https://www.mellanox.com BlueField SoC/BlueField SoC, BIOS BlueField:3.9.2-15-ga2403ab Sep 8 2022 pstate: 40000005 (nZcv daif -PAN -UAO) pc : hw_stat_port_show+0x4c/0x80 [ib_core] lr : port_attr_show+0x40/0x58 [ib_core] sp : ffff000029f43b50 x29: ffff000029f43b50 x28: 0000000019375000 x27: ffff8007b821a540 x26: ffff000029f43e30 x25: 0000000000008000 x24: ffff000000eaa958 x23: 0000000000001000 x22: ffff8007a4ce3000 x21: ffff8007baff8000 x20: ffff8007b9066ac0 x19: ffff8007bae97578 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000 x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000 x11: 0000000000000000 x10: 0000000000000000 x9 : 0000000000000000 x8 : ffff8007a4ce4000 x7 : 0000000000000000 x6 : 000000000000003f x5 : ffff000000e6a280 x4 : ffff8007a4ce3000 x3 : 0000000000000000 x2 : aaaaaaaaaaaaaaab x1 : ffff8007b9066a10 x0 : ffff8007baff8000 Call trace: hw_stat_port_show+0x4c/0x80 [ib_core] port_attr_show+0x40/0x58 [ib_core] sysfs_kf_seq_show+0x8c/0x150 kernfs_seq_show+0x44/0x50 seq_read+0x1b4/0x45c kernfs_fop_read+0x148/0x1d8 __vfs_read+0x58/0x180 vfs_read+0x94/0x154 ksys_read+0x68/0xd8 __arm64_sys_read+0x28/0x34 el0_svc_common+0x88/0x18c el0_svc_handler+0x78/0x94 el0_svc+0x8/0xe8 Code: f2955562 aa1603e4 aa1503e0 f9405683 (f9402861) Fixes: d8a5883814b9 ("RDMA/core: Replace the ib_port_data hw_stats pointers with a ib_port pointer") Signed-off-by: Mark Zhang Reviewed-by: Michael Guralnik Link: https://lore.kernel.org/r/88867e705c42c1cd2011e45201c25eecdb9fef94.1667810736.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/sysfs.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 84c53bd2a52d..ee59d7391568 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1213,6 +1213,9 @@ static struct ib_port *setup_port(struct ib_core_device *coredev, int port_num, p->port_num = port_num; kobject_init(&p->kobj, &port_type); + if (device->port_data && is_full_dev) + device->port_data[port_num].sysfs = p; + cur_group = p->groups_list; ret = alloc_port_table_group("gids", &p->groups[0], p->attrs_list, attr->gid_tbl_len, show_port_gid); @@ -1258,9 +1261,6 @@ static struct ib_port *setup_port(struct ib_core_device *coredev, int port_num, } list_add_tail(&p->kobj.entry, &coredev->port_list); - if (device->port_data && is_full_dev) - device->port_data[port_num].sysfs = p; - return p; err_groups: @@ -1268,6 +1268,8 @@ err_groups: err_del: kobject_del(&p->kobj); err_put: + if (device->port_data && is_full_dev) + device->port_data[port_num].sysfs = NULL; kobject_put(&p->kobj); return ERR_PTR(ret); } @@ -1276,14 +1278,17 @@ static void destroy_port(struct ib_core_device *coredev, struct ib_port *port) { bool is_full_dev = &port->ibdev->coredev == coredev; - if (port->ibdev->port_data && - port->ibdev->port_data[port->port_num].sysfs == port) - port->ibdev->port_data[port->port_num].sysfs = NULL; list_del(&port->kobj.entry); if (is_full_dev) sysfs_remove_groups(&port->kobj, port->ibdev->ops.port_groups); + sysfs_remove_groups(&port->kobj, port->groups_list); kobject_del(&port->kobj); + + if (port->ibdev->port_data && + port->ibdev->port_data[port->port_num].sysfs == port) + port->ibdev->port_data[port->port_num].sysfs = NULL; + kobject_put(&port->kobj); } From ecacb3751f254572af0009b9501e2cdc83a30b6a Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Mon, 7 Nov 2022 10:51:36 +0200 Subject: [PATCH 054/122] RDMA/nldev: Return "-EAGAIN" if the cm_id isn't from expected port When filling a cm_id entry, return "-EAGAIN" instead of 0 if the cm_id doesn'the have the same port as requested, otherwise an incomplete entry may be returned, which causes "rdam res show cm_id" to return an error. For example on a machine with two rdma devices with "rping -C 1 -v -s" running background, the "rdma" command fails: $ rdma -V rdma utility, iproute2-5.19.0 $ rdma res show cm_id link mlx5_0/- cm-idn 0 state LISTEN ps TCP pid 28056 comm rping src-addr 0.0.0.0:7174 error: Protocol not available While with this fix it succeeds: $ rdma res show cm_id link mlx5_0/- cm-idn 0 state LISTEN ps TCP pid 26395 comm rping src-addr 0.0.0.0:7174 link mlx5_1/- cm-idn 0 state LISTEN ps TCP pid 26395 comm rping src-addr 0.0.0.0:7174 Fixes: 00313983cda6 ("RDMA/nldev: provide detailed CM_ID information") Signed-off-by: Mark Zhang Link: https://lore.kernel.org/r/a08e898cdac5e28428eb749a99d9d981571b8ea7.1667810736.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index b92358f606d0..2be76a3fdd87 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -552,7 +552,7 @@ static int fill_res_cm_id_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_cm_id *cm_id = &id_priv->id; if (port && port != cm_id->port_num) - return 0; + return -EAGAIN; if (cm_id->port_num && nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, cm_id->port_num)) From 60da2d11fcbc043304910e4d2ca82f9bab953e63 Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Tue, 15 Nov 2022 18:07:47 +0100 Subject: [PATCH 055/122] RDMA/siw: Set defined status for work completion with undefined status A malicious user may write undefined values into memory mapped completion queue elements status or opcode. Undefined status or opcode values will result in out-of-bounds access to an array mapping siw internal representation of opcode and status to RDMA core representation when reaping CQ elements. While siw detects those undefined values, it did not correctly set completion status to a defined value, thus defeating the whole purpose of the check. This bug leads to the following Smatch static checker warning: drivers/infiniband/sw/siw/siw_cq.c:96 siw_reap_cqe() error: buffer overflow 'map_cqe_status' 10 <= 21 Fixes: bdf1da5df9da ("RDMA/siw: Fix immediate work request flush to completion queue") Link: https://lore.kernel.org/r/20221115170747.1263298-1-bmt@zurich.ibm.com Reported-by: Dan Carpenter Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_cq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/siw/siw_cq.c b/drivers/infiniband/sw/siw/siw_cq.c index acc7bcd538b5..403029de6b92 100644 --- a/drivers/infiniband/sw/siw/siw_cq.c +++ b/drivers/infiniband/sw/siw/siw_cq.c @@ -88,9 +88,9 @@ int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc) if (opcode >= SIW_NUM_OPCODES) { opcode = 0; - status = IB_WC_GENERAL_ERR; + status = SIW_WC_GENERAL_ERR; } else if (status >= SIW_NUM_WC_STATUS) { - status = IB_WC_GENERAL_ERR; + status = SIW_WC_GENERAL_ERR; } wc->opcode = map_wc_opcode[opcode]; wc->status = map_cqe_status[status].ib; From 4f44e519b6a945068755708119cca5b74d01d1f6 Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Mon, 14 Nov 2022 19:16:59 -0600 Subject: [PATCH 056/122] RDMA/irdma: Fix inline for multiple SGE's Currently, inline send and inline write assume a single SGE and only copy data from the first one. Add support for multiple SGE's. Fixes: b48c24c2d710 ("RDMA/irdma: Implement device supported verb APIs") Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Link: https://lore.kernel.org/r/20221115011701.1379-2-shiraz.saleem@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/uk.c | 147 ++++++++++++++++++---------- drivers/infiniband/hw/irdma/user.h | 19 +--- drivers/infiniband/hw/irdma/verbs.c | 57 ++++------- 3 files changed, 119 insertions(+), 104 deletions(-) diff --git a/drivers/infiniband/hw/irdma/uk.c b/drivers/infiniband/hw/irdma/uk.c index a6e5d350a94c..1a57ed9d77ff 100644 --- a/drivers/infiniband/hw/irdma/uk.c +++ b/drivers/infiniband/hw/irdma/uk.c @@ -566,21 +566,37 @@ static void irdma_set_mw_bind_wqe_gen_1(__le64 *wqe, /** * irdma_copy_inline_data_gen_1 - Copy inline data to wqe - * @dest: pointer to wqe - * @src: pointer to inline data - * @len: length of inline data to copy + * @wqe: pointer to wqe + * @sge_list: table of pointers to inline data + * @num_sges: Total inline data length * @polarity: compatibility parameter */ -static void irdma_copy_inline_data_gen_1(u8 *dest, u8 *src, u32 len, - u8 polarity) +static void irdma_copy_inline_data_gen_1(u8 *wqe, struct ib_sge *sge_list, + u32 num_sges, u8 polarity) { - if (len <= 16) { - memcpy(dest, src, len); - } else { - memcpy(dest, src, 16); - src += 16; - dest = dest + 32; - memcpy(dest, src, len - 16); + u32 quanta_bytes_remaining = 16; + int i; + + for (i = 0; i < num_sges; i++) { + u8 *cur_sge = (u8 *)(uintptr_t)sge_list[i].addr; + u32 sge_len = sge_list[i].length; + + while (sge_len) { + u32 bytes_copied; + + bytes_copied = min(sge_len, quanta_bytes_remaining); + memcpy(wqe, cur_sge, bytes_copied); + wqe += bytes_copied; + cur_sge += bytes_copied; + quanta_bytes_remaining -= bytes_copied; + sge_len -= bytes_copied; + + if (!quanta_bytes_remaining) { + /* Remaining inline bytes reside after hdr */ + wqe += 16; + quanta_bytes_remaining = 32; + } + } } } @@ -612,35 +628,51 @@ static void irdma_set_mw_bind_wqe(__le64 *wqe, /** * irdma_copy_inline_data - Copy inline data to wqe - * @dest: pointer to wqe - * @src: pointer to inline data - * @len: length of inline data to copy + * @wqe: pointer to wqe + * @sge_list: table of pointers to inline data + * @num_sges: number of SGE's * @polarity: polarity of wqe valid bit */ -static void irdma_copy_inline_data(u8 *dest, u8 *src, u32 len, u8 polarity) +static void irdma_copy_inline_data(u8 *wqe, struct ib_sge *sge_list, + u32 num_sges, u8 polarity) { u8 inline_valid = polarity << IRDMA_INLINE_VALID_S; - u32 copy_size; + u32 quanta_bytes_remaining = 8; + bool first_quanta = true; + int i; - dest += 8; - if (len <= 8) { - memcpy(dest, src, len); - return; - } - - *((u64 *)dest) = *((u64 *)src); - len -= 8; - src += 8; - dest += 24; /* point to additional 32 byte quanta */ - - while (len) { - copy_size = len < 31 ? len : 31; - memcpy(dest, src, copy_size); - *(dest + 31) = inline_valid; - len -= copy_size; - dest += 32; - src += copy_size; + wqe += 8; + + for (i = 0; i < num_sges; i++) { + u8 *cur_sge = (u8 *)(uintptr_t)sge_list[i].addr; + u32 sge_len = sge_list[i].length; + + while (sge_len) { + u32 bytes_copied; + + bytes_copied = min(sge_len, quanta_bytes_remaining); + memcpy(wqe, cur_sge, bytes_copied); + wqe += bytes_copied; + cur_sge += bytes_copied; + quanta_bytes_remaining -= bytes_copied; + sge_len -= bytes_copied; + + if (!quanta_bytes_remaining) { + quanta_bytes_remaining = 31; + + /* Remaining inline bytes reside after hdr */ + if (first_quanta) { + first_quanta = false; + wqe += 16; + } else { + *wqe = inline_valid; + wqe++; + } + } + } } + if (!first_quanta && quanta_bytes_remaining < 31) + *(wqe + quanta_bytes_remaining) = inline_valid; } /** @@ -679,20 +711,27 @@ int irdma_uk_inline_rdma_write(struct irdma_qp_uk *qp, struct irdma_post_sq_info *info, bool post_sq) { __le64 *wqe; - struct irdma_inline_rdma_write *op_info; + struct irdma_rdma_write *op_info; u64 hdr = 0; u32 wqe_idx; bool read_fence = false; + u32 i, total_size = 0; u16 quanta; info->push_wqe = qp->push_db ? true : false; - op_info = &info->op.inline_rdma_write; + op_info = &info->op.rdma_write; - if (op_info->len > qp->max_inline_data) + if (unlikely(qp->max_sq_frag_cnt < op_info->num_lo_sges)) return -EINVAL; - quanta = qp->wqe_ops.iw_inline_data_size_to_quanta(op_info->len); - wqe = irdma_qp_get_next_send_wqe(qp, &wqe_idx, quanta, op_info->len, + for (i = 0; i < op_info->num_lo_sges; i++) + total_size += op_info->lo_sg_list[i].length; + + if (unlikely(total_size > qp->max_inline_data)) + return -EINVAL; + + quanta = qp->wqe_ops.iw_inline_data_size_to_quanta(total_size); + wqe = irdma_qp_get_next_send_wqe(qp, &wqe_idx, quanta, total_size, info); if (!wqe) return -ENOMEM; @@ -705,7 +744,7 @@ int irdma_uk_inline_rdma_write(struct irdma_qp_uk *qp, hdr = FIELD_PREP(IRDMAQPSQ_REMSTAG, op_info->rem_addr.lkey) | FIELD_PREP(IRDMAQPSQ_OPCODE, info->op_type) | - FIELD_PREP(IRDMAQPSQ_INLINEDATALEN, op_info->len) | + FIELD_PREP(IRDMAQPSQ_INLINEDATALEN, total_size) | FIELD_PREP(IRDMAQPSQ_REPORTRTT, info->report_rtt ? 1 : 0) | FIELD_PREP(IRDMAQPSQ_INLINEDATAFLAG, 1) | FIELD_PREP(IRDMAQPSQ_IMMDATAFLAG, info->imm_data_valid ? 1 : 0) | @@ -719,7 +758,8 @@ int irdma_uk_inline_rdma_write(struct irdma_qp_uk *qp, set_64bit_val(wqe, 0, FIELD_PREP(IRDMAQPSQ_IMMDATA, info->imm_data)); - qp->wqe_ops.iw_copy_inline_data((u8 *)wqe, op_info->data, op_info->len, + qp->wqe_ops.iw_copy_inline_data((u8 *)wqe, op_info->lo_sg_list, + op_info->num_lo_sges, qp->swqe_polarity); dma_wmb(); /* make sure WQE is populated before valid bit is set */ @@ -745,20 +785,27 @@ int irdma_uk_inline_send(struct irdma_qp_uk *qp, struct irdma_post_sq_info *info, bool post_sq) { __le64 *wqe; - struct irdma_post_inline_send *op_info; + struct irdma_post_send *op_info; u64 hdr; u32 wqe_idx; bool read_fence = false; + u32 i, total_size = 0; u16 quanta; info->push_wqe = qp->push_db ? true : false; - op_info = &info->op.inline_send; + op_info = &info->op.send; - if (op_info->len > qp->max_inline_data) + if (unlikely(qp->max_sq_frag_cnt < op_info->num_sges)) return -EINVAL; - quanta = qp->wqe_ops.iw_inline_data_size_to_quanta(op_info->len); - wqe = irdma_qp_get_next_send_wqe(qp, &wqe_idx, quanta, op_info->len, + for (i = 0; i < op_info->num_sges; i++) + total_size += op_info->sg_list[i].length; + + if (unlikely(total_size > qp->max_inline_data)) + return -EINVAL; + + quanta = qp->wqe_ops.iw_inline_data_size_to_quanta(total_size); + wqe = irdma_qp_get_next_send_wqe(qp, &wqe_idx, quanta, total_size, info); if (!wqe) return -ENOMEM; @@ -773,7 +820,7 @@ int irdma_uk_inline_send(struct irdma_qp_uk *qp, hdr = FIELD_PREP(IRDMAQPSQ_REMSTAG, info->stag_to_inv) | FIELD_PREP(IRDMAQPSQ_AHID, op_info->ah_id) | FIELD_PREP(IRDMAQPSQ_OPCODE, info->op_type) | - FIELD_PREP(IRDMAQPSQ_INLINEDATALEN, op_info->len) | + FIELD_PREP(IRDMAQPSQ_INLINEDATALEN, total_size) | FIELD_PREP(IRDMAQPSQ_IMMDATAFLAG, (info->imm_data_valid ? 1 : 0)) | FIELD_PREP(IRDMAQPSQ_REPORTRTT, (info->report_rtt ? 1 : 0)) | @@ -789,8 +836,8 @@ int irdma_uk_inline_send(struct irdma_qp_uk *qp, if (info->imm_data_valid) set_64bit_val(wqe, 0, FIELD_PREP(IRDMAQPSQ_IMMDATA, info->imm_data)); - qp->wqe_ops.iw_copy_inline_data((u8 *)wqe, op_info->data, op_info->len, - qp->swqe_polarity); + qp->wqe_ops.iw_copy_inline_data((u8 *)wqe, op_info->sg_list, + op_info->num_sges, qp->swqe_polarity); dma_wmb(); /* make sure WQE is populated before valid bit is set */ diff --git a/drivers/infiniband/hw/irdma/user.h b/drivers/infiniband/hw/irdma/user.h index 2ef61923c926..424d4aa8cdcd 100644 --- a/drivers/infiniband/hw/irdma/user.h +++ b/drivers/infiniband/hw/irdma/user.h @@ -173,14 +173,6 @@ struct irdma_post_send { u32 ah_id; }; -struct irdma_post_inline_send { - void *data; - u32 len; - u32 qkey; - u32 dest_qp; - u32 ah_id; -}; - struct irdma_post_rq_info { u64 wr_id; struct ib_sge *sg_list; @@ -193,12 +185,6 @@ struct irdma_rdma_write { struct ib_sge rem_addr; }; -struct irdma_inline_rdma_write { - void *data; - u32 len; - struct ib_sge rem_addr; -}; - struct irdma_rdma_read { struct ib_sge *lo_sg_list; u32 num_lo_sges; @@ -241,8 +227,6 @@ struct irdma_post_sq_info { struct irdma_rdma_read rdma_read; struct irdma_bind_window bind_window; struct irdma_inv_local_stag inv_local_stag; - struct irdma_inline_rdma_write inline_rdma_write; - struct irdma_post_inline_send inline_send; } op; }; @@ -291,7 +275,8 @@ int irdma_uk_stag_local_invalidate(struct irdma_qp_uk *qp, bool post_sq); struct irdma_wqe_uk_ops { - void (*iw_copy_inline_data)(u8 *dest, u8 *src, u32 len, u8 polarity); + void (*iw_copy_inline_data)(u8 *dest, struct ib_sge *sge_list, + u32 num_sges, u8 polarity); u16 (*iw_inline_data_size_to_quanta)(u32 data_size); void (*iw_set_fragment)(__le64 *wqe, u32 offset, struct ib_sge *sge, u8 valid); diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 434241789f12..e252f431e2ac 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -3136,30 +3136,20 @@ static int irdma_post_send(struct ib_qp *ibqp, info.stag_to_inv = ib_wr->ex.invalidate_rkey; } - if (ib_wr->send_flags & IB_SEND_INLINE) { - info.op.inline_send.data = (void *)(unsigned long) - ib_wr->sg_list[0].addr; - info.op.inline_send.len = ib_wr->sg_list[0].length; - if (iwqp->ibqp.qp_type == IB_QPT_UD || - iwqp->ibqp.qp_type == IB_QPT_GSI) { - ah = to_iwah(ud_wr(ib_wr)->ah); - info.op.inline_send.ah_id = ah->sc_ah.ah_info.ah_idx; - info.op.inline_send.qkey = ud_wr(ib_wr)->remote_qkey; - info.op.inline_send.dest_qp = ud_wr(ib_wr)->remote_qpn; - } - err = irdma_uk_inline_send(ukqp, &info, false); - } else { - info.op.send.num_sges = ib_wr->num_sge; - info.op.send.sg_list = ib_wr->sg_list; - if (iwqp->ibqp.qp_type == IB_QPT_UD || - iwqp->ibqp.qp_type == IB_QPT_GSI) { - ah = to_iwah(ud_wr(ib_wr)->ah); - info.op.send.ah_id = ah->sc_ah.ah_info.ah_idx; - info.op.send.qkey = ud_wr(ib_wr)->remote_qkey; - info.op.send.dest_qp = ud_wr(ib_wr)->remote_qpn; - } - err = irdma_uk_send(ukqp, &info, false); + info.op.send.num_sges = ib_wr->num_sge; + info.op.send.sg_list = ib_wr->sg_list; + if (iwqp->ibqp.qp_type == IB_QPT_UD || + iwqp->ibqp.qp_type == IB_QPT_GSI) { + ah = to_iwah(ud_wr(ib_wr)->ah); + info.op.send.ah_id = ah->sc_ah.ah_info.ah_idx; + info.op.send.qkey = ud_wr(ib_wr)->remote_qkey; + info.op.send.dest_qp = ud_wr(ib_wr)->remote_qpn; } + + if (ib_wr->send_flags & IB_SEND_INLINE) + err = irdma_uk_inline_send(ukqp, &info, false); + else + err = irdma_uk_send(ukqp, &info, false); break; case IB_WR_RDMA_WRITE_WITH_IMM: if (ukqp->qp_caps & IRDMA_WRITE_WITH_IMM) { @@ -3176,22 +3166,15 @@ static int irdma_post_send(struct ib_qp *ibqp, else info.op_type = IRDMA_OP_TYPE_RDMA_WRITE; - if (ib_wr->send_flags & IB_SEND_INLINE) { - info.op.inline_rdma_write.data = (void *)(uintptr_t)ib_wr->sg_list[0].addr; - info.op.inline_rdma_write.len = - ib_wr->sg_list[0].length; - info.op.inline_rdma_write.rem_addr.addr = - rdma_wr(ib_wr)->remote_addr; - info.op.inline_rdma_write.rem_addr.lkey = - rdma_wr(ib_wr)->rkey; + info.op.rdma_write.num_lo_sges = ib_wr->num_sge; + info.op.rdma_write.lo_sg_list = ib_wr->sg_list; + info.op.rdma_write.rem_addr.addr = + rdma_wr(ib_wr)->remote_addr; + info.op.rdma_write.rem_addr.lkey = rdma_wr(ib_wr)->rkey; + if (ib_wr->send_flags & IB_SEND_INLINE) err = irdma_uk_inline_rdma_write(ukqp, &info, false); - } else { - info.op.rdma_write.lo_sg_list = (void *)ib_wr->sg_list; - info.op.rdma_write.num_lo_sges = ib_wr->num_sge; - info.op.rdma_write.rem_addr.addr = rdma_wr(ib_wr)->remote_addr; - info.op.rdma_write.rem_addr.lkey = rdma_wr(ib_wr)->rkey; + else err = irdma_uk_rdma_write(ukqp, &info, false); - } break; case IB_WR_RDMA_READ_WITH_INV: inv_stag = true; From 24419777e9431137d5923a747f546facb1e49b1f Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Mon, 14 Nov 2022 19:17:00 -0600 Subject: [PATCH 057/122] RDMA/irdma: Fix RQ completion opcode The opcode written by HW, in the RQ CQE, is the RoCEv2/iWARP protocol opcode from the received packet and not the SW opcode as currently assumed. Fix this by returning the raw operation type and queue type in the CQE to irdma_process_cqe and add 2 helpers set_ib_wc_op_sq set_ib_wc_op_rq to map IRDMA HW op types to IB op types. Note that for iWARP, only Write with Immediate is supported so the opcode can only be IB_WC_RECV_RDMA_WITH_IMM when there is immediate data present. Fixes: b48c24c2d710 ("RDMA/irdma: Implement device supported verb APIs") Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Link: https://lore.kernel.org/r/20221115011701.1379-3-shiraz.saleem@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/uk.c | 21 +++++------- drivers/infiniband/hw/irdma/user.h | 1 + drivers/infiniband/hw/irdma/utils.c | 2 ++ drivers/infiniband/hw/irdma/verbs.c | 39 ++++----------------- drivers/infiniband/hw/irdma/verbs.h | 53 +++++++++++++++++++++++++++++ 5 files changed, 71 insertions(+), 45 deletions(-) diff --git a/drivers/infiniband/hw/irdma/uk.c b/drivers/infiniband/hw/irdma/uk.c index 1a57ed9d77ff..16183e894da7 100644 --- a/drivers/infiniband/hw/irdma/uk.c +++ b/drivers/infiniband/hw/irdma/uk.c @@ -1049,11 +1049,10 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, __le64 *cqe; struct irdma_qp_uk *qp; struct irdma_ring *pring = NULL; - u32 wqe_idx, q_type; + u32 wqe_idx; int ret_code; bool move_cq_head = true; u8 polarity; - u8 op_type; bool ext_valid; __le64 *ext_cqe; @@ -1121,7 +1120,7 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, info->ud_vlan_valid = false; } - q_type = (u8)FIELD_GET(IRDMA_CQ_SQ, qword3); + info->q_type = (u8)FIELD_GET(IRDMA_CQ_SQ, qword3); info->error = (bool)FIELD_GET(IRDMA_CQ_ERROR, qword3); info->push_dropped = (bool)FIELD_GET(IRDMACQ_PSHDROP, qword3); info->ipv4 = (bool)FIELD_GET(IRDMACQ_IPV4, qword3); @@ -1160,8 +1159,9 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, } wqe_idx = (u32)FIELD_GET(IRDMA_CQ_WQEIDX, qword3); info->qp_handle = (irdma_qp_handle)(unsigned long)qp; + info->op_type = (u8)FIELD_GET(IRDMA_CQ_SQ, qword3); - if (q_type == IRDMA_CQE_QTYPE_RQ) { + if (info->q_type == IRDMA_CQE_QTYPE_RQ) { u32 array_idx; array_idx = wqe_idx / qp->rq_wqe_size_multiplier; @@ -1181,10 +1181,6 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, info->bytes_xfered = (u32)FIELD_GET(IRDMACQ_PAYLDLEN, qword0); - if (info->imm_valid) - info->op_type = IRDMA_OP_TYPE_REC_IMM; - else - info->op_type = IRDMA_OP_TYPE_REC; if (qword3 & IRDMACQ_STAG) { info->stag_invalid_set = true; info->inv_stag = (u32)FIELD_GET(IRDMACQ_INVSTAG, qword2); @@ -1242,17 +1238,18 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, sw_wqe = qp->sq_base[tail].elem; get_64bit_val(sw_wqe, 24, &wqe_qword); - op_type = (u8)FIELD_GET(IRDMAQPSQ_OPCODE, wqe_qword); - info->op_type = op_type; + info->op_type = (u8)FIELD_GET(IRDMAQPSQ_OPCODE, + wqe_qword); IRDMA_RING_SET_TAIL(qp->sq_ring, tail + qp->sq_wrtrk_array[tail].quanta); - if (op_type != IRDMAQP_OP_NOP) { + if (info->op_type != IRDMAQP_OP_NOP) { info->wr_id = qp->sq_wrtrk_array[tail].wrid; info->bytes_xfered = qp->sq_wrtrk_array[tail].wr_len; break; } } while (1); - if (op_type == IRDMA_OP_TYPE_BIND_MW && info->minor_err == FLUSH_PROT_ERR) + if (info->op_type == IRDMA_OP_TYPE_BIND_MW && + info->minor_err == FLUSH_PROT_ERR) info->minor_err = FLUSH_MW_BIND_ERR; qp->sq_flush_seen = true; if (!IRDMA_RING_MORE_WORK(qp->sq_ring)) diff --git a/drivers/infiniband/hw/irdma/user.h b/drivers/infiniband/hw/irdma/user.h index 424d4aa8cdcd..d0cdf609f5e0 100644 --- a/drivers/infiniband/hw/irdma/user.h +++ b/drivers/infiniband/hw/irdma/user.h @@ -245,6 +245,7 @@ struct irdma_cq_poll_info { u16 ud_vlan; u8 ud_smac[6]; u8 op_type; + u8 q_type; bool stag_invalid_set:1; /* or L_R_Key set */ bool push_dropped:1; bool error:1; diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index 8dfc9e154d73..445e69e86409 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -2591,6 +2591,7 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp) sw_wqe = qp->sq_base[wqe_idx].elem; get_64bit_val(sw_wqe, 24, &wqe_qword); cmpl->cpi.op_type = (u8)FIELD_GET(IRDMAQPSQ_OPCODE, IRDMAQPSQ_OPCODE); + cmpl->cpi.q_type = IRDMA_CQE_QTYPE_SQ; /* remove the SQ WR by moving SQ tail*/ IRDMA_RING_SET_TAIL(*sq_ring, sq_ring->tail + qp->sq_wrtrk_array[sq_ring->tail].quanta); @@ -2629,6 +2630,7 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp) cmpl->cpi.wr_id = qp->rq_wrid_array[wqe_idx]; cmpl->cpi.op_type = IRDMA_OP_TYPE_REC; + cmpl->cpi.q_type = IRDMA_CQE_QTYPE_RQ; /* remove the RQ WR by moving RQ tail */ IRDMA_RING_SET_TAIL(*rq_ring, rq_ring->tail + 1); ibdev_dbg(iwqp->iwrcq->ibcq.device, diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index e252f431e2ac..01d0dc4b5649 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -3334,7 +3334,6 @@ static enum ib_wc_status irdma_flush_err_to_ib_wc_status(enum irdma_flush_opcode static void irdma_process_cqe(struct ib_wc *entry, struct irdma_cq_poll_info *cq_poll_info) { - struct irdma_qp *iwqp; struct irdma_sc_qp *qp; entry->wc_flags = 0; @@ -3342,7 +3341,6 @@ static void irdma_process_cqe(struct ib_wc *entry, entry->wr_id = cq_poll_info->wr_id; qp = cq_poll_info->qp_handle; - iwqp = qp->qp_uk.back_qp; entry->qp = qp->qp_uk.back_qp; if (cq_poll_info->error) { @@ -3375,42 +3373,17 @@ static void irdma_process_cqe(struct ib_wc *entry, } } - switch (cq_poll_info->op_type) { - case IRDMA_OP_TYPE_RDMA_WRITE: - case IRDMA_OP_TYPE_RDMA_WRITE_SOL: - entry->opcode = IB_WC_RDMA_WRITE; - break; - case IRDMA_OP_TYPE_RDMA_READ_INV_STAG: - case IRDMA_OP_TYPE_RDMA_READ: - entry->opcode = IB_WC_RDMA_READ; - break; - case IRDMA_OP_TYPE_SEND_INV: - case IRDMA_OP_TYPE_SEND_SOL: - case IRDMA_OP_TYPE_SEND_SOL_INV: - case IRDMA_OP_TYPE_SEND: - entry->opcode = IB_WC_SEND; - break; - case IRDMA_OP_TYPE_FAST_REG_NSMR: - entry->opcode = IB_WC_REG_MR; - break; - case IRDMA_OP_TYPE_INV_STAG: - entry->opcode = IB_WC_LOCAL_INV; - break; - case IRDMA_OP_TYPE_REC_IMM: - case IRDMA_OP_TYPE_REC: - entry->opcode = cq_poll_info->op_type == IRDMA_OP_TYPE_REC_IMM ? - IB_WC_RECV_RDMA_WITH_IMM : IB_WC_RECV; + if (cq_poll_info->q_type == IRDMA_CQE_QTYPE_SQ) { + set_ib_wc_op_sq(cq_poll_info, entry); + } else { + set_ib_wc_op_rq(cq_poll_info, entry, + qp->qp_uk.qp_caps & IRDMA_SEND_WITH_IMM ? + true : false); if (qp->qp_uk.qp_type != IRDMA_QP_TYPE_ROCE_UD && cq_poll_info->stag_invalid_set) { entry->ex.invalidate_rkey = cq_poll_info->inv_stag; entry->wc_flags |= IB_WC_WITH_INVALIDATE; } - break; - default: - ibdev_err(&iwqp->iwdev->ibdev, - "Invalid opcode = %d in CQE\n", cq_poll_info->op_type); - entry->status = IB_WC_GENERAL_ERR; - return; } if (qp->qp_uk.qp_type == IRDMA_QP_TYPE_ROCE_UD) { diff --git a/drivers/infiniband/hw/irdma/verbs.h b/drivers/infiniband/hw/irdma/verbs.h index 4309b7159f42..a536e9fa85eb 100644 --- a/drivers/infiniband/hw/irdma/verbs.h +++ b/drivers/infiniband/hw/irdma/verbs.h @@ -232,6 +232,59 @@ static inline u16 irdma_fw_minor_ver(struct irdma_sc_dev *dev) return (u16)FIELD_GET(IRDMA_FW_VER_MINOR, dev->feature_info[IRDMA_FEATURE_FW_INFO]); } +static inline void set_ib_wc_op_sq(struct irdma_cq_poll_info *cq_poll_info, + struct ib_wc *entry) +{ + switch (cq_poll_info->op_type) { + case IRDMA_OP_TYPE_RDMA_WRITE: + case IRDMA_OP_TYPE_RDMA_WRITE_SOL: + entry->opcode = IB_WC_RDMA_WRITE; + break; + case IRDMA_OP_TYPE_RDMA_READ_INV_STAG: + case IRDMA_OP_TYPE_RDMA_READ: + entry->opcode = IB_WC_RDMA_READ; + break; + case IRDMA_OP_TYPE_SEND_SOL: + case IRDMA_OP_TYPE_SEND_SOL_INV: + case IRDMA_OP_TYPE_SEND_INV: + case IRDMA_OP_TYPE_SEND: + entry->opcode = IB_WC_SEND; + break; + case IRDMA_OP_TYPE_FAST_REG_NSMR: + entry->opcode = IB_WC_REG_MR; + break; + case IRDMA_OP_TYPE_INV_STAG: + entry->opcode = IB_WC_LOCAL_INV; + break; + default: + entry->status = IB_WC_GENERAL_ERR; + } +} + +static inline void set_ib_wc_op_rq(struct irdma_cq_poll_info *cq_poll_info, + struct ib_wc *entry, bool send_imm_support) +{ + /** + * iWARP does not support sendImm, so the presence of Imm data + * must be WriteImm. + */ + if (!send_imm_support) { + entry->opcode = cq_poll_info->imm_valid ? + IB_WC_RECV_RDMA_WITH_IMM : + IB_WC_RECV; + return; + } + + switch (cq_poll_info->op_type) { + case IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE: + case IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE: + entry->opcode = IB_WC_RECV_RDMA_WITH_IMM; + break; + default: + entry->opcode = IB_WC_RECV; + } +} + void irdma_mcast_mac(u32 *ip_addr, u8 *mac, bool ipv4); int irdma_ib_register_device(struct irdma_device *iwdev); void irdma_ib_unregister_device(struct irdma_device *iwdev); From 8f7e2daa6336f9f4b6f8a4715a809674606df16b Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Mon, 14 Nov 2022 19:17:01 -0600 Subject: [PATCH 058/122] RDMA/irdma: Do not request 2-level PBLEs for CQ alloc When allocating PBLE's for a large CQ, it is possible that a 2-level PBLE is returned which would cause the CQ allocation to fail since 1-level is assumed and checked for. Fix this by requesting a level one PBLE only. Fixes: b48c24c2d710 ("RDMA/irdma: Implement device supported verb APIs") Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Link: https://lore.kernel.org/r/20221115011701.1379-4-shiraz.saleem@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/verbs.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 01d0dc4b5649..dc3f5f3fee90 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -2329,9 +2329,10 @@ static bool irdma_check_mr_contiguous(struct irdma_pble_alloc *palloc, * @rf: RDMA PCI function * @iwmr: mr pointer for this memory registration * @use_pbles: flag if to use pble's + * @lvl_1_only: request only level 1 pble if true */ static int irdma_setup_pbles(struct irdma_pci_f *rf, struct irdma_mr *iwmr, - bool use_pbles) + bool use_pbles, bool lvl_1_only) { struct irdma_pbl *iwpbl = &iwmr->iwpbl; struct irdma_pble_alloc *palloc = &iwpbl->pble_alloc; @@ -2342,7 +2343,7 @@ static int irdma_setup_pbles(struct irdma_pci_f *rf, struct irdma_mr *iwmr, if (use_pbles) { status = irdma_get_pble(rf->pble_rsrc, palloc, iwmr->page_cnt, - false); + lvl_1_only); if (status) return status; @@ -2385,16 +2386,10 @@ static int irdma_handle_q_mem(struct irdma_device *iwdev, bool ret = true; pg_size = iwmr->page_size; - err = irdma_setup_pbles(iwdev->rf, iwmr, use_pbles); + err = irdma_setup_pbles(iwdev->rf, iwmr, use_pbles, true); if (err) return err; - if (use_pbles && palloc->level != PBLE_LEVEL_1) { - irdma_free_pble(iwdev->rf->pble_rsrc, palloc); - iwpbl->pbl_allocated = false; - return -ENOMEM; - } - if (use_pbles) arr = palloc->level1.addr; @@ -2870,7 +2865,7 @@ static struct ib_mr *irdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 len, case IRDMA_MEMREG_TYPE_MEM: use_pbles = (iwmr->page_cnt != 1); - err = irdma_setup_pbles(iwdev->rf, iwmr, use_pbles); + err = irdma_setup_pbles(iwdev->rf, iwmr, use_pbles, false); if (err) goto error; From d7115727e32e94c77a5441892e74ae0339afa1a5 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 17 Nov 2022 18:19:38 +0800 Subject: [PATCH 059/122] RDMA/rtrs-srv: Refactor rtrs_srv_rdma_cm_handler The RDMA_CM_EVENT_CONNECT_REQUEST is quite different to other types, let's check it separately at the beginning of routine, then we can avoid the indentation accordingly. Acked-by: Jack Wang Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20221117101945.6317-2-guoqing.jiang@linux.dev Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-srv.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index 22d7ba05e9fe..5fe3699cb8ff 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -1950,22 +1950,21 @@ static int rtrs_srv_rdma_cm_handler(struct rdma_cm_id *cm_id, { struct rtrs_srv_path *srv_path = NULL; struct rtrs_path *s = NULL; + struct rtrs_con *c = NULL; - if (ev->event != RDMA_CM_EVENT_CONNECT_REQUEST) { - struct rtrs_con *c = cm_id->context; - - s = c->path; - srv_path = to_srv_path(s); - } - - switch (ev->event) { - case RDMA_CM_EVENT_CONNECT_REQUEST: + if (ev->event == RDMA_CM_EVENT_CONNECT_REQUEST) /* * In case of error cma.c will destroy cm_id, * see cma_process_remove() */ return rtrs_rdma_connect(cm_id, ev->param.conn.private_data, ev->param.conn.private_data_len); + + c = cm_id->context; + s = c->path; + srv_path = to_srv_path(s); + + switch (ev->event) { case RDMA_CM_EVENT_ESTABLISHED: /* Nothing here */ break; From 0f597ac618d04beb9de997fda59a29c9d3818fb2 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 17 Nov 2022 18:19:39 +0800 Subject: [PATCH 060/122] RDMA/rtrs-srv: Refactor the handling of failure case in map_cont_bufs Let's call unmap_cont_bufs when failure happens, and also only update mrs_num after everything is settled which means we can remove 'mri'. Acked-by: Md Haris Iqbal Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20221117101945.6317-3-guoqing.jiang@linux.dev Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-srv.c | 47 +++++++++++--------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index 5fe3699cb8ff..b877dd57b6b9 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -561,9 +561,11 @@ static int map_cont_bufs(struct rtrs_srv_path *srv_path) { struct rtrs_srv_sess *srv = srv_path->srv; struct rtrs_path *ss = &srv_path->s; - int i, mri, err, mrs_num; + int i, err, mrs_num; unsigned int chunk_bits; int chunks_per_mr = 1; + struct ib_mr *mr; + struct sg_table *sgt; /* * Here we map queue_depth chunks to MR. Firstly we have to @@ -586,16 +588,14 @@ static int map_cont_bufs(struct rtrs_srv_path *srv_path) if (!srv_path->mrs) return -ENOMEM; - srv_path->mrs_num = mrs_num; - - for (mri = 0; mri < mrs_num; mri++) { - struct rtrs_srv_mr *srv_mr = &srv_path->mrs[mri]; - struct sg_table *sgt = &srv_mr->sgt; + for (srv_path->mrs_num = 0; srv_path->mrs_num < mrs_num; + srv_path->mrs_num++) { + struct rtrs_srv_mr *srv_mr = &srv_path->mrs[srv_path->mrs_num]; struct scatterlist *s; - struct ib_mr *mr; int nr, nr_sgt, chunks; - chunks = chunks_per_mr * mri; + sgt = &srv_mr->sgt; + chunks = chunks_per_mr * srv_path->mrs_num; if (!always_invalidate) chunks_per_mr = min_t(int, chunks_per_mr, srv->queue_depth - chunks); @@ -644,31 +644,24 @@ static int map_cont_bufs(struct rtrs_srv_path *srv_path) ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey)); srv_mr->mr = mr; - - continue; -err: - while (mri--) { - srv_mr = &srv_path->mrs[mri]; - sgt = &srv_mr->sgt; - mr = srv_mr->mr; - rtrs_iu_free(srv_mr->iu, srv_path->s.dev->ib_dev, 1); -dereg_mr: - ib_dereg_mr(mr); -unmap_sg: - ib_dma_unmap_sg(srv_path->s.dev->ib_dev, sgt->sgl, - sgt->nents, DMA_BIDIRECTIONAL); -free_sg: - sg_free_table(sgt); - } - kfree(srv_path->mrs); - - return err; } chunk_bits = ilog2(srv->queue_depth - 1) + 1; srv_path->mem_bits = (MAX_IMM_PAYL_BITS - chunk_bits); return 0; + +dereg_mr: + ib_dereg_mr(mr); +unmap_sg: + ib_dma_unmap_sg(srv_path->s.dev->ib_dev, sgt->sgl, + sgt->nents, DMA_BIDIRECTIONAL); +free_sg: + sg_free_table(sgt); +err: + unmap_cont_bufs(srv_path); + + return err; } static void rtrs_srv_hb_err_handler(struct rtrs_con *c) From 102d2f70ec0999a5cde181f1ccbe8a81cba45b10 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 17 Nov 2022 18:19:40 +0800 Subject: [PATCH 061/122] RDMA/rtrs-srv: Correct the checking of ib_map_mr_sg We should check with nr_sgt, also the only successful case is that all sg elements are mapped, so make it explicitly. Acked-by: Jack Wang Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20221117101945.6317-4-guoqing.jiang@linux.dev Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-srv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index b877dd57b6b9..581c850e71d6 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -622,7 +622,7 @@ static int map_cont_bufs(struct rtrs_srv_path *srv_path) } nr = ib_map_mr_sg(mr, sgt->sgl, nr_sgt, NULL, max_chunk_size); - if (nr < 0 || nr < sgt->nents) { + if (nr != nr_sgt) { err = nr < 0 ? nr : -EINVAL; goto dereg_mr; } From f5708e6699c230f64736107c90b63a53bdc0a613 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 17 Nov 2022 18:19:41 +0800 Subject: [PATCH 062/122] RDMA/rtrs-clt: Correct the checking of ib_map_mr_sg We should check with count, also the only successful case is that all sg elements are mapped, so make it explicitly. Acked-by: Jack Wang Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20221117101945.6317-5-guoqing.jiang@linux.dev Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index 8546b8816524..be7c8480f947 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -1064,10 +1064,8 @@ static int rtrs_map_sg_fr(struct rtrs_clt_io_req *req, size_t count) /* Align the MR to a 4K page size to match the block virt boundary */ nr = ib_map_mr_sg(req->mr, req->sglist, count, NULL, SZ_4K); - if (nr < 0) - return nr; - if (nr < req->sg_cnt) - return -EINVAL; + if (nr != count) + return nr < 0 ? nr : -EINVAL; ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey)); return nr; From a4399563356c86eafeadcdc155d5d93320713b6e Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 17 Nov 2022 18:19:42 +0800 Subject: [PATCH 063/122] RDMA/rtrs-srv: Remove outdated comments from create_con Remove the orphan comments. Acked-by: Md Haris Iqbal Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20221117101945.6317-6-guoqing.jiang@linux.dev Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-srv.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index 581c850e71d6..d1703e2c0b82 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -1671,12 +1671,6 @@ static int create_con(struct rtrs_srv_path *srv_path, srv->queue_depth * (1 + 2) + 1); max_recv_wr = srv->queue_depth + 1; - /* - * If we have all receive requests posted and - * all write requests posted and each read request - * requires an invalidate request + drain - * and qp gets into error state. - */ } cq_num = max_send_wr + max_recv_wr; atomic_set(&con->c.sq_wr_avail, max_send_wr); From 7526198f27107278c600a3aee41f1a77ed84dd78 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 17 Nov 2022 18:19:43 +0800 Subject: [PATCH 064/122] RDMA/rtrs: Clean up rtrs_rdma_dev_pd_ops Let's remove them since the three members are not used. Acked-by: Md Haris Iqbal Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20221117101945.6317-7-guoqing.jiang@linux.dev Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-pri.h | 3 --- drivers/infiniband/ulp/rtrs/rtrs.c | 22 ++++------------------ 2 files changed, 4 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-pri.h b/drivers/infiniband/ulp/rtrs/rtrs-pri.h index a2420eecaf5a..ab25619261d2 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-pri.h +++ b/drivers/infiniband/ulp/rtrs/rtrs-pri.h @@ -68,10 +68,7 @@ enum { struct rtrs_ib_dev; struct rtrs_rdma_dev_pd_ops { - struct rtrs_ib_dev *(*alloc)(void); - void (*free)(struct rtrs_ib_dev *dev); int (*init)(struct rtrs_ib_dev *dev); - void (*deinit)(struct rtrs_ib_dev *dev); }; struct rtrs_rdma_dev_pd { diff --git a/drivers/infiniband/ulp/rtrs/rtrs.c b/drivers/infiniband/ulp/rtrs/rtrs.c index ed324b47d93a..4bf9d868cc52 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs.c +++ b/drivers/infiniband/ulp/rtrs/rtrs.c @@ -557,7 +557,6 @@ EXPORT_SYMBOL(rtrs_addr_to_sockaddr); void rtrs_rdma_dev_pd_init(enum ib_pd_flags pd_flags, struct rtrs_rdma_dev_pd *pool) { - WARN_ON(pool->ops && (!pool->ops->alloc ^ !pool->ops->free)); INIT_LIST_HEAD(&pool->list); mutex_init(&pool->mutex); pool->pd_flags = pd_flags; @@ -583,15 +582,8 @@ static void dev_free(struct kref *ref) list_del(&dev->entry); mutex_unlock(&pool->mutex); - if (pool->ops && pool->ops->deinit) - pool->ops->deinit(dev); - ib_dealloc_pd(dev->ib_pd); - - if (pool->ops && pool->ops->free) - pool->ops->free(dev); - else - kfree(dev); + kfree(dev); } int rtrs_ib_dev_put(struct rtrs_ib_dev *dev) @@ -618,11 +610,8 @@ rtrs_ib_dev_find_or_add(struct ib_device *ib_dev, goto out_unlock; } mutex_unlock(&pool->mutex); - if (pool->ops && pool->ops->alloc) - dev = pool->ops->alloc(); - else - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (IS_ERR_OR_NULL(dev)) + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) goto out_err; kref_init(&dev->ref); @@ -644,10 +633,7 @@ out_unlock: out_free_pd: ib_dealloc_pd(dev->ib_pd); out_free_dev: - if (pool->ops && pool->ops->free) - pool->ops->free(dev); - else - kfree(dev); + kfree(dev); out_err: return NULL; } From 6af4609c18b3aa69209d022b9c00e6db78c57ae5 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 17 Nov 2022 18:19:44 +0800 Subject: [PATCH 065/122] RDMA/rtrs-srv: Fix several issues in rtrs_srv_destroy_path_files There are several issues in the function which is supposed to be paired with rtrs_srv_create_path_files. 1. rtrs_srv_stats_attr_group is not removed though it is created in rtrs_srv_create_stats_files. 2. it makes more sense to check kobj_stats.state_in_sysfs before destroy kobj_stats instead of rely on kobj.state_in_sysfs. 3. kobject_init_and_add is used for both kobjs (srv_path->kobj and srv_path->stats->kobj_stats), however we missed to call kobject_del for srv_path->kobj which was called in free_path. 4. rtrs_srv_destroy_once_sysfs_root_folders is independent of either kobj or kobj_stats. Acked-by: Md Haris Iqbal Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20221117101945.6317-8-guoqing.jiang@linux.dev Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c index 2a3c9ac64a42..da8e205ce331 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c @@ -304,12 +304,18 @@ destroy_root: void rtrs_srv_destroy_path_files(struct rtrs_srv_path *srv_path) { - if (srv_path->kobj.state_in_sysfs) { + if (srv_path->stats->kobj_stats.state_in_sysfs) { + sysfs_remove_group(&srv_path->stats->kobj_stats, + &rtrs_srv_stats_attr_group); kobject_del(&srv_path->stats->kobj_stats); kobject_put(&srv_path->stats->kobj_stats); - sysfs_remove_group(&srv_path->kobj, &rtrs_srv_path_attr_group); - kobject_put(&srv_path->kobj); - - rtrs_srv_destroy_once_sysfs_root_folders(srv_path); } + + if (srv_path->kobj.state_in_sysfs) { + sysfs_remove_group(&srv_path->kobj, &rtrs_srv_path_attr_group); + kobject_del(&srv_path->kobj); + kobject_put(&srv_path->kobj); + } + + rtrs_srv_destroy_once_sysfs_root_folders(srv_path); } From 34a046f08b62fb855ac590c1f4dfb1934f1fdb64 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 17 Nov 2022 18:19:45 +0800 Subject: [PATCH 066/122] RDMA/rtrs-srv: Remove kobject_del from rtrs_srv_destroy_once_sysfs_root_folders The kobj_paths which is created dynamically by kobject_create_and_add, and per the comment above kobject_create_and_add, we only need to call kobject_put which is not same as other kobjs such as stats->kobj_stats and srv_path->kobj. Acked-by: Md Haris Iqbal Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20221117101945.6317-9-guoqing.jiang@linux.dev Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c index da8e205ce331..c76ba29da1e2 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c @@ -203,7 +203,6 @@ rtrs_srv_destroy_once_sysfs_root_folders(struct rtrs_srv_path *srv_path) mutex_lock(&srv->paths_mutex); if (!--srv->dev_ref) { - kobject_del(srv->kobj_paths); kobject_put(srv->kobj_paths); mutex_unlock(&srv->paths_mutex); device_del(&srv->dev); From 8f649b57856b855f8a28724321e7ae72e31d513a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 18 Nov 2022 13:58:51 -0800 Subject: [PATCH 067/122] IB/hfi1: Replace 1-element array with singleton Zero-length arrays are deprecated[1] and are being replaced with flexible array members in support of the ongoing efforts to tighten the FORTIFY_SOURCE routines on memcpy(), correctly instrument array indexing with UBSAN_BOUNDS, and to globally enable -fstrict-flex-arrays=3. Replace zero-length array with flexible-array member "lvs" in struct opa_port_data_counters_msg and struct opa_port_error_counters64_msg. Additionally, the "port" member of several structs is defined as a single-element, but is only ever accessed at index 0. Replace it with a singleton so that flexible array usage is sane. This results in no differences in binary output. [1] https://github.com/KSPP/linux/issues/78 Link: https://lore.kernel.org/r/20221118215847.never.416-kees@kernel.org Signed-off-by: Kees Cook Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/mad.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index 4146a2113a95..e5e783c45810 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -2437,9 +2437,9 @@ struct opa_port_data_counters_msg { __be64 port_vl_xmit_wait_data; __be64 port_vl_rcv_bubble; __be64 port_vl_mark_fecn; - } vls[0]; + } vls[]; /* array size defined by #bits set in vl_select_mask*/ - } port[1]; /* array size defined by #ports in attribute modifier */ + } port; }; struct opa_port_error_counters64_msg { @@ -2470,9 +2470,9 @@ struct opa_port_error_counters64_msg { u8 reserved3[7]; struct _vls_ectrs { __be64 port_vl_xmit_discards; - } vls[0]; + } vls[]; /* array size defined by #bits set in vl_select_mask */ - } port[1]; /* array size defined by #ports in attribute modifier */ + } port; }; struct opa_port_error_info_msg { @@ -2543,7 +2543,7 @@ struct opa_port_error_info_msg { u8 error_info; } __packed fm_config_ei; __u32 reserved9; - } port[1]; /* actual array size defined by #ports in attr modifier */ + } port; }; /* opa_port_error_info_msg error_info_select_mask bit definitions */ @@ -2966,7 +2966,7 @@ static int pma_get_opa_datacounters(struct opa_pma_mad *pmp, } /* Sanity check */ - response_data_size = struct_size(req, port[0].vls, num_vls); + response_data_size = struct_size(req, port.vls, num_vls); if (response_data_size > sizeof(pmp->data)) { pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; @@ -2986,7 +2986,7 @@ static int pma_get_opa_datacounters(struct opa_pma_mad *pmp, return reply((struct ib_mad_hdr *)pmp); } - rsp = &req->port[0]; + rsp = &req->port; memset(rsp, 0, sizeof(*rsp)); rsp->port_number = port; @@ -3182,7 +3182,7 @@ static int pma_get_opa_porterrors(struct opa_pma_mad *pmp, return reply((struct ib_mad_hdr *)pmp); } - response_data_size = struct_size(req, port[0].vls, num_vls); + response_data_size = struct_size(req, port.vls, num_vls); if (response_data_size > sizeof(pmp->data)) { pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; @@ -3201,7 +3201,7 @@ static int pma_get_opa_porterrors(struct opa_pma_mad *pmp, return reply((struct ib_mad_hdr *)pmp); } - rsp = &req->port[0]; + rsp = &req->port; ibp = to_iport(ibdev, port_num); ppd = ppd_from_ibp(ibp); @@ -3340,7 +3340,7 @@ static int pma_get_opa_errorinfo(struct opa_pma_mad *pmp, u64 reg; req = (struct opa_port_error_info_msg *)pmp->data; - rsp = &req->port[0]; + rsp = &req->port; num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod)); num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3])); @@ -3590,7 +3590,7 @@ static int pma_set_opa_errorinfo(struct opa_pma_mad *pmp, u32 error_info_select; req = (struct opa_port_error_info_msg *)pmp->data; - rsp = &req->port[0]; + rsp = &req->port; num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod)); num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3])); From 8e1a76493be9868fef34f977296a69769dcdfa6f Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Fri, 11 Nov 2022 21:35:37 -0500 Subject: [PATCH 068/122] RDMA/rxe: Remove reliable datagram support The rdma_rxe driver does not actually support the reliable datagram transport but contains a variable with RD opcodes in driver code. And this variable is never used. So remove it. Link: https://lore.kernel.org/r/20221112023537.432912-1-yanjun.zhu@intel.com Signed-off-by: Zhu Yanjun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_hdr.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_hdr.h b/drivers/infiniband/sw/rxe/rxe_hdr.h index e432f9e37795..804594b76040 100644 --- a/drivers/infiniband/sw/rxe/rxe_hdr.h +++ b/drivers/infiniband/sw/rxe/rxe_hdr.h @@ -742,7 +742,6 @@ enum aeth_syndrome { AETH_NAK_INVALID_REQ = 0x61, AETH_NAK_REM_ACC_ERR = 0x62, AETH_NAK_REM_OP_ERR = 0x63, - AETH_NAK_INV_RD_REQ = 0x64, }; static inline u8 __aeth_syn(void *arg) From 7d984dac8f6bf4ebd3398af82b357e1d181ecaac Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Sun, 30 Oct 2022 03:04:33 +0000 Subject: [PATCH 069/122] RDMA/rxe: Fix mr->map double free rxe_mr_cleanup() which tries to free mr->map again will be called when rxe_mr_init_user() fails: CPU: 0 PID: 4917 Comm: rdma_flush_serv Kdump: loaded Not tainted 6.1.0-rc1-roce-flush+ #25 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x45/0x5d panic+0x19e/0x349 end_report.part.0+0x54/0x7c kasan_report.cold+0xa/0xf rxe_mr_cleanup+0x9d/0xf0 [rdma_rxe] __rxe_cleanup+0x10a/0x1e0 [rdma_rxe] rxe_reg_user_mr+0xb7/0xd0 [rdma_rxe] ib_uverbs_reg_mr+0x26a/0x480 [ib_uverbs] ib_uverbs_handler_UVERBS_METHOD_INVOKE_WRITE+0x1a2/0x250 [ib_uverbs] ib_uverbs_cmd_verbs+0x1397/0x15a0 [ib_uverbs] This issue was firstly exposed since commit b18c7da63fcb ("RDMA/rxe: Fix memory leak in error path code") and then we fixed it in commit 8ff5f5d9d8cf ("RDMA/rxe: Prevent double freeing rxe_map_set()") but this fix was reverted together at last by commit 1e75550648da (Revert "RDMA/rxe: Create duplicate mapping tables for FMRs") Simply let rxe_mr_cleanup() always handle freeing the mr->map once it is successfully allocated. Fixes: 1e75550648da ("Revert "RDMA/rxe: Create duplicate mapping tables for FMRs"") Link: https://lore.kernel.org/r/1667099073-2-1-git-send-email-lizhijian@fujitsu.com Signed-off-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mr.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index cd846cf82a84..b1423000e4bc 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -97,6 +97,7 @@ err2: kfree(mr->map[i]); kfree(mr->map); + mr->map = NULL; err1: return -ENOMEM; } @@ -120,7 +121,6 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, int num_buf; void *vaddr; int err; - int i; umem = ib_umem_get(&rxe->ib_dev, start, length, access); if (IS_ERR(umem)) { @@ -159,9 +159,8 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, if (!vaddr) { rxe_dbg_mr(mr, "Unable to get virtual address\n"); err = -ENOMEM; - goto err_cleanup_map; + goto err_release_umem; } - buf->addr = (uintptr_t)vaddr; buf->size = PAGE_SIZE; num_buf++; @@ -178,10 +177,6 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, return 0; -err_cleanup_map: - for (i = 0; i < mr->num_map; i++) - kfree(mr->map[i]); - kfree(mr->map); err_release_umem: ib_umem_release(umem); err_out: From 8eaa6f7d569b4a22bfc1b0a3fdfeeb401feb65a4 Mon Sep 17 00:00:00 2001 From: Luoyouming Date: Tue, 8 Nov 2022 21:38:46 +0800 Subject: [PATCH 070/122] RDMA/hns: Fix ext_sge num error when post send In the HNS ROCE driver, The sge is divided into standard sge and extended sge. There are 2 standard sge in RC/XRC, and the UD standard sge is 0. In the scenario of RC SQ inline, if the data does not exceed 32bytes, the standard sge will be used. If it exceeds, only the extended sge will be used to fill the data. Currently, when filling the extended sge, max_gs is directly used as the number of the extended sge, which did not subtract the number of standard sge. There is a logical error. The new algorithm subtracts the number of standard sge from max_gs to get the actual number of extended sge. Fixes: 30b707886aeb ("RDMA/hns: Support inline data in extented sge space for RC") Link: https://lore.kernel.org/r/20221108133847.2304539-2-xuhaoyue1@hisilicon.com Signed-off-by: Luoyouming Signed-off-by: Haoyue Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 1ead35fb031b..dcb59c05edfd 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -188,20 +188,29 @@ static void set_atomic_seg(const struct ib_send_wr *wr, hr_reg_write(rc_sq_wqe, RC_SEND_WQE_SGE_NUM, valid_num_sge); } +static unsigned int get_std_sge_num(struct hns_roce_qp *qp) +{ + if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_UD) + return 0; + + return HNS_ROCE_SGE_IN_WQE; +} + static int fill_ext_sge_inl_data(struct hns_roce_qp *qp, const struct ib_send_wr *wr, unsigned int *sge_idx, u32 msg_len) { struct ib_device *ibdev = &(to_hr_dev(qp->ibqp.device))->ib_dev; - unsigned int ext_sge_sz = qp->sq.max_gs * HNS_ROCE_SGE_SIZE; unsigned int left_len_in_pg; unsigned int idx = *sge_idx; + unsigned int std_sge_num; unsigned int i = 0; unsigned int len; void *addr; void *dseg; - if (msg_len > ext_sge_sz) { + std_sge_num = get_std_sge_num(qp); + if (msg_len > (qp->sq.max_gs - std_sge_num) * HNS_ROCE_SGE_SIZE) { ibdev_err(ibdev, "no enough extended sge space for inline data.\n"); return -EINVAL; From 0c5e259b06a8efc69f929ad777ea49281bb58e37 Mon Sep 17 00:00:00 2001 From: Luoyouming Date: Tue, 8 Nov 2022 21:38:47 +0800 Subject: [PATCH 071/122] RDMA/hns: Fix incorrect sge nums calculation The user usually configures the number of sge through the max_send_sge parameter when creating qp, and configures the maximum size of inline data that can be sent through max_inline_data. Inline uses sge to fill data to send. Expect the following: 1) When the sge space cannot hold inline data, the sge space needs to be expanded to accommodate all inline data 2) When the sge space is enough to accommodate inline data, the upper limit of inline data can be increased so that users can send larger inline data Currently case one is not implemented. When the inline data is larger than the sge space, an error of insufficient sge space occurs. This part of the code needs to be reimplemented according to the expected rules. The calculation method of sge num is modified to take the maximum value of max_send_sge and the sge for max_inline_data to solve this problem. Fixes: 05201e01be93 ("RDMA/hns: Refactor process of setting extended sge") Fixes: 30b707886aeb ("RDMA/hns: Support inline data in extented sge space for RC") Link: https://lore.kernel.org/r/20221108133847.2304539-3-xuhaoyue1@hisilicon.com Signed-off-by: Luoyouming Signed-off-by: Haoyue Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 3 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 12 +-- drivers/infiniband/hw/hns/hns_roce_main.c | 18 +++- drivers/infiniband/hw/hns/hns_roce_qp.c | 107 ++++++++++++++++---- include/uapi/rdma/hns-abi.h | 15 +++ 5 files changed, 125 insertions(+), 30 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 723e55a7de8d..f701cc86896b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -202,6 +202,7 @@ struct hns_roce_ucontext { struct list_head page_list; struct mutex page_mutex; struct hns_user_mmap_entry *db_mmap_entry; + u32 config; }; struct hns_roce_pd { @@ -334,6 +335,7 @@ struct hns_roce_wq { u32 head; u32 tail; void __iomem *db_reg; + u32 ext_sge_cnt; }; struct hns_roce_sge { @@ -635,6 +637,7 @@ struct hns_roce_qp { struct list_head rq_node; /* all recv qps are on a list */ struct list_head sq_node; /* all send qps are on a list */ struct hns_user_mmap_entry *dwqe_mmap_entry; + u32 config; }; struct hns_roce_ib_iboe { diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index dcb59c05edfd..939811867249 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -188,14 +188,6 @@ static void set_atomic_seg(const struct ib_send_wr *wr, hr_reg_write(rc_sq_wqe, RC_SEND_WQE_SGE_NUM, valid_num_sge); } -static unsigned int get_std_sge_num(struct hns_roce_qp *qp) -{ - if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_UD) - return 0; - - return HNS_ROCE_SGE_IN_WQE; -} - static int fill_ext_sge_inl_data(struct hns_roce_qp *qp, const struct ib_send_wr *wr, unsigned int *sge_idx, u32 msg_len) @@ -203,14 +195,12 @@ static int fill_ext_sge_inl_data(struct hns_roce_qp *qp, struct ib_device *ibdev = &(to_hr_dev(qp->ibqp.device))->ib_dev; unsigned int left_len_in_pg; unsigned int idx = *sge_idx; - unsigned int std_sge_num; unsigned int i = 0; unsigned int len; void *addr; void *dseg; - std_sge_num = get_std_sge_num(qp); - if (msg_len > (qp->sq.max_gs - std_sge_num) * HNS_ROCE_SGE_SIZE) { + if (msg_len > qp->sq.ext_sge_cnt * HNS_ROCE_SGE_SIZE) { ibdev_err(ibdev, "no enough extended sge space for inline data.\n"); return -EINVAL; diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index dcf89689a4c6..8ba68ac12388 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -354,10 +354,11 @@ static int hns_roce_alloc_uar_entry(struct ib_ucontext *uctx) static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) { - int ret; struct hns_roce_ucontext *context = to_hr_ucontext(uctx); - struct hns_roce_ib_alloc_ucontext_resp resp = {}; struct hns_roce_dev *hr_dev = to_hr_dev(uctx->device); + struct hns_roce_ib_alloc_ucontext_resp resp = {}; + struct hns_roce_ib_alloc_ucontext ucmd = {}; + int ret; if (!hr_dev->active) return -EAGAIN; @@ -365,6 +366,19 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, resp.qp_tab_size = hr_dev->caps.num_qps; resp.srq_tab_size = hr_dev->caps.num_srqs; + ret = ib_copy_from_udata(&ucmd, udata, + min(udata->inlen, sizeof(ucmd))); + if (ret) + return ret; + + if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) + context->config = ucmd.config & HNS_ROCE_EXSGE_FLAGS; + + if (context->config & HNS_ROCE_EXSGE_FLAGS) { + resp.config |= HNS_ROCE_RSP_EXSGE_FLAGS; + resp.max_inline_data = hr_dev->caps.max_sq_inline; + } + ret = hns_roce_uar_alloc(hr_dev, &context->uar); if (ret) goto error_fail_uar_alloc; diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index f0bd82a18069..0ae335fb205c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -476,38 +476,109 @@ static int set_rq_size(struct hns_roce_dev *hr_dev, struct ib_qp_cap *cap, return 0; } -static u32 get_wqe_ext_sge_cnt(struct hns_roce_qp *qp) +static u32 get_max_inline_data(struct hns_roce_dev *hr_dev, + struct ib_qp_cap *cap) { - /* GSI/UD QP only has extended sge */ - if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_UD) - return qp->sq.max_gs; - - if (qp->sq.max_gs > HNS_ROCE_SGE_IN_WQE) - return qp->sq.max_gs - HNS_ROCE_SGE_IN_WQE; + if (cap->max_inline_data) { + cap->max_inline_data = roundup_pow_of_two(cap->max_inline_data); + return min(cap->max_inline_data, + hr_dev->caps.max_sq_inline); + } return 0; } +static void update_inline_data(struct hns_roce_qp *hr_qp, + struct ib_qp_cap *cap) +{ + u32 sge_num = hr_qp->sq.ext_sge_cnt; + + if (hr_qp->config & HNS_ROCE_EXSGE_FLAGS) { + if (!(hr_qp->ibqp.qp_type == IB_QPT_GSI || + hr_qp->ibqp.qp_type == IB_QPT_UD)) + sge_num = max((u32)HNS_ROCE_SGE_IN_WQE, sge_num); + + cap->max_inline_data = max(cap->max_inline_data, + sge_num * HNS_ROCE_SGE_SIZE); + } + + hr_qp->max_inline_data = cap->max_inline_data; +} + +static u32 get_sge_num_from_max_send_sge(bool is_ud_or_gsi, + u32 max_send_sge) +{ + unsigned int std_sge_num; + unsigned int min_sge; + + std_sge_num = is_ud_or_gsi ? 0 : HNS_ROCE_SGE_IN_WQE; + min_sge = is_ud_or_gsi ? 1 : 0; + return max_send_sge > std_sge_num ? (max_send_sge - std_sge_num) : + min_sge; +} + +static unsigned int get_sge_num_from_max_inl_data(bool is_ud_or_gsi, + u32 max_inline_data) +{ + unsigned int inline_sge; + + inline_sge = roundup_pow_of_two(max_inline_data) / HNS_ROCE_SGE_SIZE; + + /* + * if max_inline_data less than + * HNS_ROCE_SGE_IN_WQE * HNS_ROCE_SGE_SIZE, + * In addition to ud's mode, no need to extend sge. + */ + if (!is_ud_or_gsi && inline_sge <= HNS_ROCE_SGE_IN_WQE) + inline_sge = 0; + + return inline_sge; +} + static void set_ext_sge_param(struct hns_roce_dev *hr_dev, u32 sq_wqe_cnt, struct hns_roce_qp *hr_qp, struct ib_qp_cap *cap) { + bool is_ud_or_gsi = (hr_qp->ibqp.qp_type == IB_QPT_GSI || + hr_qp->ibqp.qp_type == IB_QPT_UD); + unsigned int std_sge_num; + u32 inline_ext_sge = 0; + u32 ext_wqe_sge_cnt; u32 total_sge_cnt; - u32 wqe_sge_cnt; + + cap->max_inline_data = get_max_inline_data(hr_dev, cap); hr_qp->sge.sge_shift = HNS_ROCE_SGE_SHIFT; + std_sge_num = is_ud_or_gsi ? 0 : HNS_ROCE_SGE_IN_WQE; + ext_wqe_sge_cnt = get_sge_num_from_max_send_sge(is_ud_or_gsi, + cap->max_send_sge); - hr_qp->sq.max_gs = max(1U, cap->max_send_sge); + if (hr_qp->config & HNS_ROCE_EXSGE_FLAGS) { + inline_ext_sge = max(ext_wqe_sge_cnt, + get_sge_num_from_max_inl_data(is_ud_or_gsi, + cap->max_inline_data)); + hr_qp->sq.ext_sge_cnt = inline_ext_sge ? + roundup_pow_of_two(inline_ext_sge) : 0; - wqe_sge_cnt = get_wqe_ext_sge_cnt(hr_qp); + hr_qp->sq.max_gs = max(1U, (hr_qp->sq.ext_sge_cnt + std_sge_num)); + hr_qp->sq.max_gs = min(hr_qp->sq.max_gs, hr_dev->caps.max_sq_sg); + + ext_wqe_sge_cnt = hr_qp->sq.ext_sge_cnt; + } else { + hr_qp->sq.max_gs = max(1U, cap->max_send_sge); + hr_qp->sq.max_gs = min(hr_qp->sq.max_gs, hr_dev->caps.max_sq_sg); + hr_qp->sq.ext_sge_cnt = hr_qp->sq.max_gs; + } /* If the number of extended sge is not zero, they MUST use the * space of HNS_HW_PAGE_SIZE at least. */ - if (wqe_sge_cnt) { - total_sge_cnt = roundup_pow_of_two(sq_wqe_cnt * wqe_sge_cnt); + if (ext_wqe_sge_cnt) { + total_sge_cnt = roundup_pow_of_two(sq_wqe_cnt * ext_wqe_sge_cnt); hr_qp->sge.sge_cnt = max(total_sge_cnt, (u32)HNS_HW_PAGE_SIZE / HNS_ROCE_SGE_SIZE); } + + update_inline_data(hr_qp, cap); } static int check_sq_size_with_integrity(struct hns_roce_dev *hr_dev, @@ -556,6 +627,7 @@ static int set_user_sq_size(struct hns_roce_dev *hr_dev, hr_qp->sq.wqe_shift = ucmd->log_sq_stride; hr_qp->sq.wqe_cnt = cnt; + cap->max_send_sge = hr_qp->sq.max_gs; return 0; } @@ -986,13 +1058,9 @@ static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, struct hns_roce_ib_create_qp *ucmd) { struct ib_device *ibdev = &hr_dev->ib_dev; + struct hns_roce_ucontext *uctx; int ret; - if (init_attr->cap.max_inline_data > hr_dev->caps.max_sq_inline) - init_attr->cap.max_inline_data = hr_dev->caps.max_sq_inline; - - hr_qp->max_inline_data = init_attr->cap.max_inline_data; - if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) hr_qp->sq_signal_bits = IB_SIGNAL_ALL_WR; else @@ -1015,12 +1083,17 @@ static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, return ret; } + uctx = rdma_udata_to_drv_context(udata, struct hns_roce_ucontext, + ibucontext); + hr_qp->config = uctx->config; ret = set_user_sq_size(hr_dev, &init_attr->cap, hr_qp, ucmd); if (ret) ibdev_err(ibdev, "failed to set user SQ size, ret = %d.\n", ret); } else { + if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) + hr_qp->config = HNS_ROCE_EXSGE_FLAGS; ret = set_kernel_sq_size(hr_dev, &init_attr->cap, hr_qp); if (ret) ibdev_err(ibdev, diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h index f6fde06db4b4..745790ce3c26 100644 --- a/include/uapi/rdma/hns-abi.h +++ b/include/uapi/rdma/hns-abi.h @@ -85,11 +85,26 @@ struct hns_roce_ib_create_qp_resp { __aligned_u64 dwqe_mmap_key; }; +enum { + HNS_ROCE_EXSGE_FLAGS = 1 << 0, +}; + +enum { + HNS_ROCE_RSP_EXSGE_FLAGS = 1 << 0, +}; + struct hns_roce_ib_alloc_ucontext_resp { __u32 qp_tab_size; __u32 cqe_size; __u32 srq_tab_size; __u32 reserved; + __u32 config; + __u32 max_inline_data; +}; + +struct hns_roce_ib_alloc_ucontext { + __u32 config; + __u32 reserved; }; struct hns_roce_ib_alloc_pd_resp { From 2a402120a8d413238999a67ebff5b7dca0e5d14c Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Wed, 16 Nov 2022 10:45:35 +0100 Subject: [PATCH 072/122] IB/isert: use the ISCSI_LOGIN_CURRENT_STAGE macro Use the proper macro to get the current_stage value. Link: https://lore.kernel.org/r/20221116094535.138298-1-mlombard@redhat.com Signed-off-by: Maurizio Lombardi Reviewed-by: Mike Christie Acked-by: Sagi Grimberg Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/isert/ib_isert.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index b360a1527cd1..75404885cf98 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -993,9 +993,8 @@ isert_rx_login_req(struct isert_conn *isert_conn) * login request PDU. */ login->leading_connection = (!login_req->tsih) ? 1 : 0; - login->current_stage = - (login_req->flags & ISCSI_FLAG_LOGIN_CURRENT_STAGE_MASK) - >> 2; + login->current_stage = ISCSI_LOGIN_CURRENT_STAGE( + login_req->flags); login->version_min = login_req->min_version; login->version_max = login_req->max_version; memcpy(login->isid, login_req->isid, 6); From 9b51d072da1d27e1193e84708201c48e385ad912 Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Thu, 17 Nov 2022 21:15:46 +0800 Subject: [PATCH 073/122] RDMA/hfi: Decrease PCI device reference count in error path pci_get_device() will increase the reference count for the returned pci_dev, and also decrease the reference count for the input parameter *from* if it is not NULL. If we break out the loop in node_affinity_init() with 'dev' not NULL, we need to call pci_dev_put() to decrease the reference count. Add missing pci_dev_put() in error path. Fixes: c513de490f80 ("IB/hfi1: Invalid NUMA node information can cause a divide by zero") Signed-off-by: Xiongfeng Wang Link: https://lore.kernel.org/r/20221117131546.113280-1-wangxiongfeng2@huawei.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hfi1/affinity.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c index 877f8e84a672..77ee77d4000f 100644 --- a/drivers/infiniband/hw/hfi1/affinity.c +++ b/drivers/infiniband/hw/hfi1/affinity.c @@ -177,6 +177,8 @@ out: for (node = 0; node < node_affinity.num_possible_nodes; node++) hfi1_per_node_cntr[node] = 1; + pci_dev_put(dev); + return 0; } From 9907526d25c4ad8a6e3006487a544140776ba005 Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Mon, 21 Nov 2022 18:44:10 -0600 Subject: [PATCH 074/122] RDMA/irdma: Initialize net_type before checking it The av->net_type is not initialized before it is checked in irdma_modify_qp_roce. This leads to an incorrect update to the ARP cache and QP context. RoCEv2 connections might fail as result. Set the net_type using rdma_gid_attr_network_type. Fixes: 80005c43d4c8 ("RDMA/irdma: Use net_type to check network type") Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Link: https://lore.kernel.org/r/20221122004410.1471-1-shiraz.saleem@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/verbs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index dc3f5f3fee90..f6973ea55eda 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -1213,6 +1213,7 @@ int irdma_modify_qp_roce(struct ib_qp *ibqp, struct ib_qp_attr *attr, av->attrs = attr->ah_attr; rdma_gid2ip((struct sockaddr *)&av->sgid_addr, &sgid_attr->gid); rdma_gid2ip((struct sockaddr *)&av->dgid_addr, &attr->ah_attr.grh.dgid); + av->net_type = rdma_gid_attr_network_type(sgid_attr); if (av->net_type == RDMA_NETWORK_IPV6) { __be32 *daddr = av->dgid_addr.saddr_in6.sin6_addr.in6_u.u6_addr32; From a115aa00b18f7b8982b8f458149632caf64a862a Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Sat, 19 Nov 2022 15:08:34 +0800 Subject: [PATCH 075/122] RDMA/hns: fix memory leak in hns_roce_alloc_mr() When hns_roce_mr_enable() failed in hns_roce_alloc_mr(), mr_key is not released. Compiled test only. Fixes: 9b2cf76c9f05 ("RDMA/hns: Optimize PBL buffer allocation process") Signed-off-by: Zhengchao Shao Link: https://lore.kernel.org/r/20221119070834.48502-1-shaozhengchao@huawei.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_mr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 845ac7d3831f..37a5cf62f88b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -392,10 +392,10 @@ struct ib_mr *hns_roce_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, return &mr->ibmr; -err_key: - free_mr_key(hr_dev, mr); err_pbl: free_mr_pbl(hr_dev, mr); +err_key: + free_mr_key(hr_dev, mr); err_free: kfree(mr); return ERR_PTR(ret); From cb6562c380832a930ffd1722ac9d479b454aed4e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 22 Nov 2022 15:37:39 -0400 Subject: [PATCH 076/122] RDMA/rxe: Do not NULL deref on debugging failure path Correct the mistake, mr is obviously NULL in this code path. Fixes: 2778b72b1df0 ("RDMA/rxe: Replace pr_xxx by rxe_dbg_xxx in rxe_mr.c") Link: https://lore.kernel.org/r/Y3eeJW0AdyJYhYyQ@kili Reported-by: Dan Carpenter Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index b1423000e4bc..b7c9ff1ddf0e 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -519,7 +519,7 @@ int rxe_invalidate_mr(struct rxe_qp *qp, u32 key) mr = rxe_pool_get_index(&rxe->mr_pool, key >> 8); if (!mr) { - rxe_dbg_mr(mr, "No MR for key %#x\n", key); + rxe_dbg_qp(qp, "No MR for key %#x\n", key); ret = -EINVAL; goto err; } From f67376d801499f4fa0838c18c1efcad8840e550d Mon Sep 17 00:00:00 2001 From: Zhang Xiaoxu Date: Tue, 22 Nov 2022 23:14:37 +0800 Subject: [PATCH 077/122] RDMA/rxe: Fix NULL-ptr-deref in rxe_qp_do_cleanup() when socket create failed There is a null-ptr-deref when mount.cifs over rdma: BUG: KASAN: null-ptr-deref in rxe_qp_do_cleanup+0x2f3/0x360 [rdma_rxe] Read of size 8 at addr 0000000000000018 by task mount.cifs/3046 CPU: 2 PID: 3046 Comm: mount.cifs Not tainted 6.1.0-rc5+ #62 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1.fc3 Call Trace: dump_stack_lvl+0x34/0x44 kasan_report+0xad/0x130 rxe_qp_do_cleanup+0x2f3/0x360 [rdma_rxe] execute_in_process_context+0x25/0x90 __rxe_cleanup+0x101/0x1d0 [rdma_rxe] rxe_create_qp+0x16a/0x180 [rdma_rxe] create_qp.part.0+0x27d/0x340 ib_create_qp_kernel+0x73/0x160 rdma_create_qp+0x100/0x230 _smbd_get_connection+0x752/0x20f0 smbd_get_connection+0x21/0x40 cifs_get_tcp_session+0x8ef/0xda0 mount_get_conns+0x60/0x750 cifs_mount+0x103/0xd00 cifs_smb3_do_mount+0x1dd/0xcb0 smb3_get_tree+0x1d5/0x300 vfs_get_tree+0x41/0xf0 path_mount+0x9b3/0xdd0 __x64_sys_mount+0x190/0x1d0 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 The root cause of the issue is the socket create failed in rxe_qp_init_req(). So move the reset rxe_qp_do_cleanup() after the NULL ptr check. Fixes: 8700e3e7c485 ("Soft RoCE driver") Link: https://lore.kernel.org/r/20221122151437.1057671-1-zhangxiaoxu5@huawei.com Signed-off-by: Zhang Xiaoxu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_qp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 46f6c74ce00e..ab72db68b58f 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -817,12 +817,12 @@ static void rxe_qp_do_cleanup(struct work_struct *work) if (qp->resp.mr) rxe_put(qp->resp.mr); - if (qp_type(qp) == IB_QPT_RC) - sk_dst_reset(qp->sk->sk); - free_rd_atomic_resources(qp); if (qp->sk) { + if (qp_type(qp) == IB_QPT_RC) + sk_dst_reset(qp->sk->sk); + kernel_sock_shutdown(qp->sk, SHUT_RDWR); sock_release(qp->sk); } From b4d46c57d2fb0fa2611fa2ffbaf715925989f83f Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Thu, 24 Nov 2022 19:49:33 +0800 Subject: [PATCH 078/122] RDMA/erdma: Fix a typo in annotation A non-ASCII character was wrongly put in a comment, use the ACSII version. Fixes: bee85e0e31ec ("RDMA/erdma: Add main include file") Link: https://lore.kernel.org/r/20221124114933.77250-1-chengyou@linux.alibaba.com Signed-off-by: Cheng Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/erdma/erdma.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h index bb23d897c710..35726f25a989 100644 --- a/drivers/infiniband/hw/erdma/erdma.h +++ b/drivers/infiniband/hw/erdma/erdma.h @@ -219,7 +219,7 @@ struct erdma_dev { DECLARE_BITMAP(sdb_page, ERDMA_DWQE_TYPE0_CNT); /* * We provide max 496 uContexts that each has one SQ normal Db, - * and one directWQE db。 + * and one directWQE db. */ DECLARE_BITMAP(sdb_entry, ERDMA_DWQE_TYPE1_CNT); From 35765dccaf3485575a4420da529c72484c980345 Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Wed, 16 Nov 2022 10:31:05 +0800 Subject: [PATCH 079/122] RDMA/erdma: Add a workqueue for WRs reflushing ERDMA driver use a workqueue for asynchronous reflush command posting. Implement the lifecycle of this workqueue. Link: https://lore.kernel.org/r/20221116023107.82835-2-chengyou@linux.alibaba.com Signed-off-by: Cheng Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/erdma/erdma.h | 1 + drivers/infiniband/hw/erdma/erdma_main.c | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h index 35726f25a989..3d8c11aa23a2 100644 --- a/drivers/infiniband/hw/erdma/erdma.h +++ b/drivers/infiniband/hw/erdma/erdma.h @@ -190,6 +190,7 @@ struct erdma_dev { struct net_device *netdev; struct pci_dev *pdev; struct notifier_block netdev_nb; + struct workqueue_struct *reflush_wq; resource_size_t func_bar_addr; resource_size_t func_bar_len; diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c index e44b06fea595..5dc31e5df5cb 100644 --- a/drivers/infiniband/hw/erdma/erdma_main.c +++ b/drivers/infiniband/hw/erdma/erdma_main.c @@ -521,13 +521,22 @@ static int erdma_ib_device_add(struct pci_dev *pdev) u64_to_ether_addr(mac, dev->attrs.peer_addr); + dev->reflush_wq = alloc_workqueue("erdma-reflush-wq", WQ_UNBOUND, + WQ_UNBOUND_MAX_ACTIVE); + if (!dev->reflush_wq) { + ret = -ENOMEM; + goto err_alloc_workqueue; + } + ret = erdma_device_register(dev); if (ret) - goto err_out; + goto err_register; return 0; -err_out: +err_register: + destroy_workqueue(dev->reflush_wq); +err_alloc_workqueue: xa_destroy(&dev->qp_xa); xa_destroy(&dev->cq_xa); @@ -543,6 +552,7 @@ static void erdma_ib_device_remove(struct pci_dev *pdev) unregister_netdevice_notifier(&dev->netdev_nb); ib_unregister_device(&dev->ibdev); + destroy_workqueue(dev->reflush_wq); erdma_res_cb_free(dev); xa_destroy(&dev->qp_xa); xa_destroy(&dev->cq_xa); From 54d8fffc2a500953ba90ff9462ae06bb05ca2354 Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Wed, 16 Nov 2022 10:31:06 +0800 Subject: [PATCH 080/122] RDMA/erdma: Implement the lifecycle of reflushing work for each QP Each QP has a work for reflushing purpose. In the work, driver will report the latest pi to hardware. Link: https://lore.kernel.org/r/20221116023107.82835-3-chengyou@linux.alibaba.com Signed-off-by: Cheng Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/erdma/erdma_hw.h | 8 ++++++++ drivers/infiniband/hw/erdma/erdma_verbs.c | 18 ++++++++++++++++++ drivers/infiniband/hw/erdma/erdma_verbs.h | 2 ++ 3 files changed, 28 insertions(+) diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h index 1b2e2b70678f..ab371fec610c 100644 --- a/drivers/infiniband/hw/erdma/erdma_hw.h +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -145,6 +145,7 @@ enum CMDQ_RDMA_OPCODE { CMDQ_OPCODE_MODIFY_QP = 3, CMDQ_OPCODE_CREATE_CQ = 4, CMDQ_OPCODE_DESTROY_CQ = 5, + CMDQ_OPCODE_REFLUSH = 6, CMDQ_OPCODE_REG_MR = 8, CMDQ_OPCODE_DEREG_MR = 9 }; @@ -301,6 +302,13 @@ struct erdma_cmdq_destroy_qp_req { u32 qpn; }; +struct erdma_cmdq_reflush_req { + u64 hdr; + u32 qpn; + u32 sq_pi; + u32 rq_pi; +}; + /* cap qword 0 definition */ #define ERDMA_CMD_DEV_CAP_MAX_CQE_MASK GENMASK_ULL(47, 40) #define ERDMA_CMD_DEV_CAP_FLAGS_MASK GENMASK_ULL(31, 24) diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index d843ce1f35f3..5dab1e87975b 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -379,6 +379,21 @@ int erdma_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) return 0; } +static void erdma_flush_worker(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct erdma_qp *qp = + container_of(dwork, struct erdma_qp, reflush_dwork); + struct erdma_cmdq_reflush_req req; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_REFLUSH); + req.qpn = QP_ID(qp); + req.sq_pi = qp->kern_qp.sq_pi; + req.rq_pi = qp->kern_qp.rq_pi; + erdma_post_cmd_wait(&qp->dev->cmdq, &req, sizeof(req), NULL, NULL); +} + static int erdma_qp_validate_cap(struct erdma_dev *dev, struct ib_qp_init_attr *attrs) { @@ -735,6 +750,7 @@ int erdma_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs, qp->attrs.max_send_sge = attrs->cap.max_send_sge; qp->attrs.max_recv_sge = attrs->cap.max_recv_sge; qp->attrs.state = ERDMA_QP_STATE_IDLE; + INIT_DELAYED_WORK(&qp->reflush_dwork, erdma_flush_worker); ret = create_qp_cmd(dev, qp); if (ret) @@ -1028,6 +1044,8 @@ int erdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) erdma_modify_qp_internal(qp, &qp_attrs, ERDMA_QP_ATTR_STATE); up_write(&qp->state_lock); + cancel_delayed_work_sync(&qp->reflush_dwork); + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, CMDQ_OPCODE_DESTROY_QP); req.qpn = QP_ID(qp); diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h index a5574f0252bb..9f341d032069 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.h +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -197,6 +197,8 @@ struct erdma_qp { struct erdma_cep *cep; struct rw_semaphore state_lock; + struct delayed_work reflush_dwork; + union { struct erdma_kqp kern_qp; struct erdma_uqp user_qp; From 0edf42cbcc8690ef349d4432fea74d7791e3c645 Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Wed, 16 Nov 2022 10:31:07 +0800 Subject: [PATCH 081/122] RDMA/erdma: Notify the latest PI to FW for reflushing when necessary Firmware is responsible for flushing WRs in HW, and it's a little difficult for firmware to get the latest PI of QPs, especially for RQs after QP state being changed to ERROR. So we introduce a new CMDQ command, by which driver can notify to latest PI to FW, and then FW can flush all posted WRs. Link: https://lore.kernel.org/r/20221116023107.82835-4-chengyou@linux.alibaba.com Signed-off-by: Cheng Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/erdma/erdma_qp.c | 30 ++++++++++++++++------- drivers/infiniband/hw/erdma/erdma_verbs.h | 5 ++++ 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/erdma/erdma_qp.c b/drivers/infiniband/hw/erdma/erdma_qp.c index 521e97258de7..d088d6bef431 100644 --- a/drivers/infiniband/hw/erdma/erdma_qp.c +++ b/drivers/infiniband/hw/erdma/erdma_qp.c @@ -120,6 +120,7 @@ static int erdma_modify_qp_state_to_stop(struct erdma_qp *qp, int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, enum erdma_qp_attr_mask mask) { + bool need_reflush = false; int drop_conn, ret = 0; if (!mask) @@ -135,6 +136,7 @@ int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, ret = erdma_modify_qp_state_to_rts(qp, attrs, mask); } else if (attrs->state == ERDMA_QP_STATE_ERROR) { qp->attrs.state = ERDMA_QP_STATE_ERROR; + need_reflush = true; if (qp->cep) { erdma_cep_put(qp->cep); qp->cep = NULL; @@ -145,17 +147,12 @@ int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, case ERDMA_QP_STATE_RTS: drop_conn = 0; - if (attrs->state == ERDMA_QP_STATE_CLOSING) { + if (attrs->state == ERDMA_QP_STATE_CLOSING || + attrs->state == ERDMA_QP_STATE_TERMINATE || + attrs->state == ERDMA_QP_STATE_ERROR) { ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); drop_conn = 1; - } else if (attrs->state == ERDMA_QP_STATE_TERMINATE) { - qp->attrs.state = ERDMA_QP_STATE_TERMINATE; - ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); - drop_conn = 1; - } else if (attrs->state == ERDMA_QP_STATE_ERROR) { - ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); - qp->attrs.state = ERDMA_QP_STATE_ERROR; - drop_conn = 1; + need_reflush = true; } if (drop_conn) @@ -180,6 +177,12 @@ int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, break; } + if (need_reflush && !ret && rdma_is_kernel_res(&qp->ibqp.res)) { + qp->flags |= ERDMA_QP_IN_FLUSHING; + mod_delayed_work(qp->dev->reflush_wq, &qp->reflush_dwork, + usecs_to_jiffies(100)); + } + return ret; } @@ -527,6 +530,10 @@ int erdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *send_wr, } spin_unlock_irqrestore(&qp->lock, flags); + if (unlikely(qp->flags & ERDMA_QP_IN_FLUSHING)) + mod_delayed_work(qp->dev->reflush_wq, &qp->reflush_dwork, + usecs_to_jiffies(100)); + return ret; } @@ -580,5 +587,10 @@ int erdma_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *recv_wr, } spin_unlock_irqrestore(&qp->lock, flags); + + if (unlikely(qp->flags & ERDMA_QP_IN_FLUSHING)) + mod_delayed_work(qp->dev->reflush_wq, &qp->reflush_dwork, + usecs_to_jiffies(100)); + return ret; } diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h index 9f341d032069..e0a993bc032a 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.h +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -173,6 +173,10 @@ enum erdma_qp_attr_mask { ERDMA_QP_ATTR_MPA = (1 << 7) }; +enum erdma_qp_flags { + ERDMA_QP_IN_FLUSHING = (1 << 0), +}; + struct erdma_qp_attrs { enum erdma_qp_state state; enum erdma_cc_alg cc; /* Congestion control algorithm */ @@ -197,6 +201,7 @@ struct erdma_qp { struct erdma_cep *cep; struct rw_semaphore state_lock; + unsigned long flags; struct delayed_work reflush_dwork; union { From 09f530f0c6d6689eee5e690c6d98f495fcc3a0f9 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 23 Nov 2022 20:27:14 -0400 Subject: [PATCH 082/122] RDMA: Add netdevice_tracker to ib_device_set_netdev() This will cause an informative backtrace to print if the user of ib_device_set_netdev() isn't careful about tearing down the ibdevice before its the netdevice parent is destroyed. Such as like this: unregister_netdevice: waiting for vlan0 to become free. Usage count = 2 leaked reference. ib_device_set_netdev+0x266/0x730 siw_newlink+0x4e0/0xfd0 nldev_newlink+0x35c/0x5c0 rdma_nl_rcv_msg+0x36d/0x690 rdma_nl_rcv+0x2ee/0x430 netlink_unicast+0x543/0x7f0 netlink_sendmsg+0x918/0xe20 sock_sendmsg+0xcf/0x120 ____sys_sendmsg+0x70d/0x8b0 ___sys_sendmsg+0x11d/0x1b0 __sys_sendmsg+0xfa/0x1d0 do_syscall_64+0x35/0xb0 entry_SYSCALL_64_after_hwframe+0x63/0xcd This will help debug the issues syzkaller is seeing. Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/0-v1-a7c81b3842ce+e5-netdev_tracker_jgg@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/device.c | 6 ++++-- include/rdma/ib_verbs.h | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 3409c55ea88b..ff35cebb25e2 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2159,14 +2159,16 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, return 0; } + if (old_ndev) + netdev_tracker_free(ndev, &pdata->netdev_tracker); if (ndev) - dev_hold(ndev); + netdev_hold(ndev, &pdata->netdev_tracker, GFP_ATOMIC); rcu_assign_pointer(pdata->netdev, ndev); spin_unlock_irqrestore(&pdata->netdev_lock, flags); add_ndev_hash(pdata); if (old_ndev) - dev_put(old_ndev); + __dev_put(old_ndev); return 0; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index a1f4d53a4bb6..77dd9148815b 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2203,6 +2203,7 @@ struct ib_port_data { struct ib_port_cache cache; struct net_device __rcu *netdev; + netdevice_tracker netdev_tracker; struct hlist_node ndev_hash_link; struct rdma_port_counter port_counter; struct ib_port *sysfs; From ea5ef136e215fdef35f14010bc51fcd6686e6922 Mon Sep 17 00:00:00 2001 From: Yuan Can Date: Sat, 26 Nov 2022 04:34:10 +0000 Subject: [PATCH 083/122] RDMA/nldev: Add checks for nla_nest_start() in fill_stat_counter_qps() As the nla_nest_start() may fail with NULL returned, the return value needs to be checked. Fixes: c4ffee7c9bdb ("RDMA/netlink: Implement counter dumpit calback") Signed-off-by: Yuan Can Link: https://lore.kernel.org/r/20221126043410.85632-1-yuancan@huawei.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 2be76a3fdd87..ca0ed7d14326 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -894,6 +894,8 @@ static int fill_stat_counter_qps(struct sk_buff *msg, int ret = 0; table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP); + if (!table_attr) + return -EMSGSIZE; rt = &counter->device->res[RDMA_RESTRACK_QP]; xa_lock(&rt->xa); From 2d6c66f5253e7d168a76048d18e1209c52f98a2b Mon Sep 17 00:00:00 2001 From: zhang songyi Date: Tue, 29 Nov 2022 15:54:07 +0800 Subject: [PATCH 084/122] RDMA/mlx4: Remove NULL check before dev_{put, hold} The call netdev_{put, hold} of dev_{put, hold} will check NULL, so there is no need to check before using dev_{put, hold}. Fix the following coccicheck warnings: /drivers/infiniband/hw/mlx4/main.c:1311:2-10: WARNING: WARNING NULL check before dev_{put, hold} functions is not needed. /drivers/infiniband/hw/mlx4/main.c:148:2-10: WARNING: WARNING NULL check before dev_{put, hold} functions is not needed. /drivers/infiniband/hw/mlx4/main.c:1959:3-11: WARNING: WARNING NULL check before dev_{put, hold} functions is not needed. /drivers/infiniband/hw/mlx4/main.c:1962:3-10: WARNING: WARNING NULL check before dev_{put, hold} functions is not needed. Signed-off-by: zhang songyi Link: https://lore.kernel.org/r/202211291554079687539@zte.com.cn Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx4/main.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index ba47874f90d3..dceebcd885bb 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -144,8 +144,7 @@ static struct net_device *mlx4_ib_get_netdev(struct ib_device *device, } } } - if (dev) - dev_hold(dev); + dev_hold(dev); rcu_read_unlock(); return dev; @@ -1307,8 +1306,7 @@ int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, spin_lock_bh(&mdev->iboe.lock); ndev = mdev->iboe.netdevs[mqp->port - 1]; - if (ndev) - dev_hold(ndev); + dev_hold(ndev); spin_unlock_bh(&mdev->iboe.lock); if (ndev) { @@ -1955,11 +1953,9 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) if (ge) { spin_lock_bh(&mdev->iboe.lock); ndev = ge->added ? mdev->iboe.netdevs[ge->port - 1] : NULL; - if (ndev) - dev_hold(ndev); + dev_hold(ndev); spin_unlock_bh(&mdev->iboe.lock); - if (ndev) - dev_put(ndev); + dev_put(ndev); list_del(&ge->list); kfree(ge); } else From 67e6272d53386f9708f91c4d0015c4a1c470eef5 Mon Sep 17 00:00:00 2001 From: Or Har-Toov Date: Mon, 28 Nov 2022 13:52:45 +0200 Subject: [PATCH 085/122] RDMA/nldev: Add NULL check to silence false warnings Using nlmsg_put causes static analysis tools to many false positives of not checking the return value of nlmsg_put. In all uses in nldev.c, payload parameter is 0 so NULL will never be returned. So let's add useless checks to silence the warnings. Signed-off-by: Or Har-Toov Reviewed-by: Michael Guralnik Link: https://lore.kernel.org/r/bd924da89d5b4f5291a4a01d9b5ae47c0a9b6a3f.1669636336.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 44 +++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index ca0ed7d14326..b4716cda65d8 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -1043,6 +1043,10 @@ static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), 0, 0); + if (!nlh) { + err = -EMSGSIZE; + goto err_free; + } err = fill_dev_info(msg, device); if (err) @@ -1128,7 +1132,7 @@ static int _nldev_get_dumpit(struct ib_device *device, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), 0, NLM_F_MULTI); - if (fill_dev_info(skb, device)) { + if (!nlh || fill_dev_info(skb, device)) { nlmsg_cancel(skb, nlh); goto out; } @@ -1187,6 +1191,10 @@ static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), 0, 0); + if (!nlh) { + err = -EMSGSIZE; + goto err_free; + } err = fill_port_info(msg, device, port, sock_net(skb->sk)); if (err) @@ -1248,7 +1256,7 @@ static int nldev_port_get_dumpit(struct sk_buff *skb, RDMA_NLDEV_CMD_PORT_GET), 0, NLM_F_MULTI); - if (fill_port_info(skb, device, p, sock_net(skb->sk))) { + if (!nlh || fill_port_info(skb, device, p, sock_net(skb->sk))) { nlmsg_cancel(skb, nlh); goto out; } @@ -1290,6 +1298,10 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET), 0, 0); + if (!nlh) { + ret = -EMSGSIZE; + goto err_free; + } ret = fill_res_info(msg, device); if (ret) @@ -1321,7 +1333,7 @@ static int _nldev_res_get_dumpit(struct ib_device *device, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET), 0, NLM_F_MULTI); - if (fill_res_info(skb, device)) { + if (!nlh || fill_res_info(skb, device)) { nlmsg_cancel(skb, nlh); goto out; } @@ -1456,7 +1468,7 @@ static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, RDMA_NL_GET_OP(nlh->nlmsg_type)), 0, 0); - if (fill_nldev_handle(msg, device)) { + if (!nlh || fill_nldev_handle(msg, device)) { ret = -EMSGSIZE; goto err_free; } @@ -1535,7 +1547,7 @@ static int res_get_common_dumpit(struct sk_buff *skb, RDMA_NL_GET_OP(cb->nlh->nlmsg_type)), 0, NLM_F_MULTI); - if (fill_nldev_handle(skb, device)) { + if (!nlh || fill_nldev_handle(skb, device)) { ret = -EMSGSIZE; goto err; } @@ -1797,6 +1809,10 @@ static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET_CHARDEV), 0, 0); + if (!nlh) { + err = -EMSGSIZE; + goto out_nlmsg; + } data.nl_msg = msg; err = ib_get_client_nl_info(ibdev, client_name, &data); @@ -1854,6 +1870,10 @@ static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_SYS_GET), 0, 0); + if (!nlh) { + nlmsg_free(msg); + return -EMSGSIZE; + } err = nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_NETNS_MODE, (u8)ib_devices_shared_netns); @@ -2034,7 +2054,7 @@ static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_STAT_SET), 0, 0); - if (fill_nldev_handle(msg, device) || + if (!nlh || fill_nldev_handle(msg, device) || nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) { ret = -EMSGSIZE; goto err_free_msg; @@ -2103,6 +2123,10 @@ static int nldev_stat_del_doit(struct sk_buff *skb, struct nlmsghdr *nlh, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_STAT_SET), 0, 0); + if (!nlh) { + ret = -EMSGSIZE; + goto err_fill; + } cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]); qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]); @@ -2173,7 +2197,7 @@ static int stat_get_doit_default_counter(struct sk_buff *skb, RDMA_NLDEV_CMD_STAT_GET), 0, 0); - if (fill_nldev_handle(msg, device) || + if (!nlh || fill_nldev_handle(msg, device) || nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) { ret = -EMSGSIZE; goto err_msg; @@ -2261,6 +2285,10 @@ static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_STAT_GET), 0, 0); + if (!nlh) { + ret = -EMSGSIZE; + goto err_msg; + } ret = rdma_counter_get_mode(device, port, &mode, &mask); if (ret) @@ -2393,7 +2421,7 @@ static int nldev_stat_get_counter_status_doit(struct sk_buff *skb, 0, 0); ret = -EMSGSIZE; - if (fill_nldev_handle(msg, device) || + if (!nlh || fill_nldev_handle(msg, device) || nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) goto err_msg; From fc8f93ad3e5485d45c992233c96acd902992dfc4 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Mon, 28 Nov 2022 13:52:46 +0200 Subject: [PATCH 086/122] RDMA/nldev: Fix failure to send large messages Return "-EMSGSIZE" instead of "-EINVAL" when filling a QP entry, so that new SKBs will be allocated if there's not enough room in current SKB. Fixes: 65959522f806 ("RDMA: Add support to dump resource tracker in RAW format") Signed-off-by: Mark Zhang Reviewed-by: Patrisious Haddad Link: https://lore.kernel.org/r/b5e9c62f6b8369acab5648b661bf539cbceeffdc.1669636336.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index b4716cda65d8..a981ac2f0975 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -513,7 +513,7 @@ static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin, /* In create_qp() port is not set yet */ if (qp->port && nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, qp->port)) - return -EINVAL; + return -EMSGSIZE; ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qp->qp_num); if (ret) From 10aa7cd398a9ead7464a7f8b49d4e4c843806813 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Wed, 30 Nov 2022 17:44:37 +0800 Subject: [PATCH 087/122] IB/hfi1: Switch to netif_napi_add() There is no need to use netif_napi_add_weight() when the weight argument is 64. See "net: drop the weight argument from netif_napi_add". Signed-off-by: Yang Yang Link: https://lore.kernel.org/r/202211301744378304494@zte.com.cn Reviewed-by: xu xin Reviewed-by: Zhang Yunkai Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hfi1/netdev_rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/netdev_rx.c b/drivers/infiniband/hw/hfi1/netdev_rx.c index 3dfa5aff2512..720d4c85c9c9 100644 --- a/drivers/infiniband/hw/hfi1/netdev_rx.c +++ b/drivers/infiniband/hw/hfi1/netdev_rx.c @@ -216,7 +216,7 @@ static int hfi1_netdev_rxq_init(struct hfi1_netdev_rx *rx) * right now. */ set_bit(NAPI_STATE_NO_BUSY_POLL, &rxq->napi.state); - netif_napi_add_weight(dev, &rxq->napi, hfi1_netdev_rx_napi, 64); + netif_napi_add(dev, &rxq->napi, hfi1_netdev_rx_napi); rc = msix_netdev_request_rcd_irq(rxq->rcd); if (rc) goto bail_context_irq_failure; From efa2afc3969e166702fd2ae3cfb1a7a195ef3533 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Thu, 1 Dec 2022 14:37:05 +0000 Subject: [PATCH 088/122] RDMA: Extend RDMA user ABI to support atomic write 1) Define new atomic write request/completion in userspace. 2) Define new atomic write capability in userspace. Link: https://lore.kernel.org/r/1669905432-14-2-git-send-email-yangx.jy@fujitsu.com Signed-off-by: Xiao Yang Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/ib_user_verbs.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 43672cb1fd57..237814815544 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -466,6 +466,7 @@ enum ib_uverbs_wc_opcode { IB_UVERBS_WC_BIND_MW = 5, IB_UVERBS_WC_LOCAL_INV = 6, IB_UVERBS_WC_TSO = 7, + IB_UVERBS_WC_ATOMIC_WRITE = 9, }; struct ib_uverbs_wc { @@ -784,6 +785,7 @@ enum ib_uverbs_wr_opcode { IB_UVERBS_WR_RDMA_READ_WITH_INV = 11, IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP = 12, IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD = 13, + IB_UVERBS_WR_ATOMIC_WRITE = 15, /* Review enum ib_wr_opcode before modifying this */ }; @@ -1331,6 +1333,8 @@ enum ib_uverbs_device_cap_flags { /* Deprecated. Please use IB_UVERBS_RAW_PACKET_CAP_SCATTER_FCS. */ IB_UVERBS_DEVICE_RAW_SCATTER_FCS = 1ULL << 34, IB_UVERBS_DEVICE_PCI_WRITE_END_PADDING = 1ULL << 36, + /* Atomic write attributes */ + IB_UVERBS_DEVICE_ATOMIC_WRITE = 1ULL << 40, }; enum ib_uverbs_raw_packet_caps { From 3ff81e827b8d5cea36ff374a11c200b4306f45d2 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Thu, 1 Dec 2022 14:37:06 +0000 Subject: [PATCH 089/122] RDMA: Extend RDMA kernel ABI to support atomic write 1) Define new atomic write request/completion in kernel. 2) Define new atomic write capability in kernel. 3) Define new atomic write opcode for RC service in packet. Link: https://lore.kernel.org/r/1669905432-14-3-git-send-email-yangx.jy@fujitsu.com Signed-off-by: Xiao Yang Signed-off-by: Jason Gunthorpe --- include/rdma/ib_pack.h | 2 ++ include/rdma/ib_verbs.h | 3 +++ 2 files changed, 5 insertions(+) diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h index a9162f25beaf..f932d164af63 100644 --- a/include/rdma/ib_pack.h +++ b/include/rdma/ib_pack.h @@ -84,6 +84,7 @@ enum { /* opcode 0x15 is reserved */ IB_OPCODE_SEND_LAST_WITH_INVALIDATE = 0x16, IB_OPCODE_SEND_ONLY_WITH_INVALIDATE = 0x17, + IB_OPCODE_ATOMIC_WRITE = 0x1D, /* real constants follow -- see comment about above IB_OPCODE() macro for more details */ @@ -112,6 +113,7 @@ enum { IB_OPCODE(RC, FETCH_ADD), IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE), IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE), + IB_OPCODE(RC, ATOMIC_WRITE), /* UC */ IB_OPCODE(UC, SEND_FIRST), diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 77dd9148815b..df6bb26ba0be 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -270,6 +270,7 @@ enum ib_device_cap_flags { /* The device supports padding incoming writes to cacheline. */ IB_DEVICE_PCI_WRITE_END_PADDING = IB_UVERBS_DEVICE_PCI_WRITE_END_PADDING, + IB_DEVICE_ATOMIC_WRITE = IB_UVERBS_DEVICE_ATOMIC_WRITE, }; enum ib_kernel_cap_flags { @@ -982,6 +983,7 @@ enum ib_wc_opcode { IB_WC_BIND_MW = IB_UVERBS_WC_BIND_MW, IB_WC_LOCAL_INV = IB_UVERBS_WC_LOCAL_INV, IB_WC_LSO = IB_UVERBS_WC_TSO, + IB_WC_ATOMIC_WRITE = IB_UVERBS_WC_ATOMIC_WRITE, IB_WC_REG_MR, IB_WC_MASKED_COMP_SWAP, IB_WC_MASKED_FETCH_ADD, @@ -1325,6 +1327,7 @@ enum ib_wr_opcode { IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP, IB_WR_MASKED_ATOMIC_FETCH_AND_ADD = IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD, + IB_WR_ATOMIC_WRITE = IB_UVERBS_WR_ATOMIC_WRITE, /* These are kernel only and can not be issued by userspace */ IB_WR_REG_MR = 0x20, From c2d939002934fa9d7b802f196b069963b46da194 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Thu, 1 Dec 2022 14:37:07 +0000 Subject: [PATCH 090/122] RDMA/rxe: Extend rxe user ABI to support atomic write Define an atomic_wr array to store 8-byte value. Link: https://lore.kernel.org/r/1669905432-14-4-git-send-email-yangx.jy@fujitsu.com Signed-off-by: Xiao Yang Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/rdma_user_rxe.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h index 73f679dfd2df..d20d1ecf046f 100644 --- a/include/uapi/rdma/rdma_user_rxe.h +++ b/include/uapi/rdma/rdma_user_rxe.h @@ -146,6 +146,7 @@ struct rxe_dma_info { __u32 reserved; union { __DECLARE_FLEX_ARRAY(__u8, inline_data); + __DECLARE_FLEX_ARRAY(__u8, atomic_wr); __DECLARE_FLEX_ARRAY(struct rxe_sge, sge); }; }; From 5c7af6c7938466aa2f3c52057f4dd28b4a1e9e42 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Thu, 1 Dec 2022 14:37:08 +0000 Subject: [PATCH 091/122] RDMA/rxe: Extend rxe packet format to support atomic write Extend rxe_wr_opcode_info[] and rxe_opcode[] for new atomic write opcode. Link: https://lore.kernel.org/r/1669905432-14-5-git-send-email-yangx.jy@fujitsu.com Signed-off-by: Xiao Yang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_opcode.c | 18 ++++++++++++++++++ drivers/infiniband/sw/rxe/rxe_opcode.h | 3 +++ 2 files changed, 21 insertions(+) diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.c b/drivers/infiniband/sw/rxe/rxe_opcode.c index d4ba4d506f17..fb196029048e 100644 --- a/drivers/infiniband/sw/rxe/rxe_opcode.c +++ b/drivers/infiniband/sw/rxe/rxe_opcode.c @@ -101,6 +101,12 @@ struct rxe_wr_opcode_info rxe_wr_opcode_info[] = { [IB_QPT_UC] = WR_LOCAL_OP_MASK, }, }, + [IB_WR_ATOMIC_WRITE] = { + .name = "IB_WR_ATOMIC_WRITE", + .mask = { + [IB_QPT_RC] = WR_ATOMIC_WRITE_MASK, + }, + }, }; struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE] = { @@ -378,6 +384,18 @@ struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE] = { RXE_IETH_BYTES, } }, + [IB_OPCODE_RC_ATOMIC_WRITE] = { + .name = "IB_OPCODE_RC_ATOMIC_WRITE", + .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK | + RXE_ATOMIC_WRITE_MASK | RXE_START_MASK | + RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + RXE_RETH_BYTES, + } + }, /* UC */ [IB_OPCODE_UC_SEND_FIRST] = { diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.h b/drivers/infiniband/sw/rxe/rxe_opcode.h index 8f9aaaf260f2..a470e9b0b884 100644 --- a/drivers/infiniband/sw/rxe/rxe_opcode.h +++ b/drivers/infiniband/sw/rxe/rxe_opcode.h @@ -20,6 +20,7 @@ enum rxe_wr_mask { WR_READ_MASK = BIT(3), WR_WRITE_MASK = BIT(4), WR_LOCAL_OP_MASK = BIT(5), + WR_ATOMIC_WRITE_MASK = BIT(7), WR_READ_OR_WRITE_MASK = WR_READ_MASK | WR_WRITE_MASK, WR_WRITE_OR_SEND_MASK = WR_WRITE_MASK | WR_SEND_MASK, @@ -81,6 +82,8 @@ enum rxe_hdr_mask { RXE_LOOPBACK_MASK = BIT(NUM_HDR_TYPES + 12), + RXE_ATOMIC_WRITE_MASK = BIT(NUM_HDR_TYPES + 14), + RXE_READ_OR_ATOMIC_MASK = (RXE_READ_MASK | RXE_ATOMIC_MASK), RXE_WRITE_OR_SEND_MASK = (RXE_WRITE_MASK | RXE_SEND_MASK), RXE_READ_OR_WRITE_MASK = (RXE_READ_MASK | RXE_WRITE_MASK), From abb633cf28049e6a7c37c44f83a8584f7dbded7d Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Thu, 1 Dec 2022 14:39:25 +0000 Subject: [PATCH 092/122] RDMA/rxe: Make requester support atomic write on RC service Make requester process and send an atomic write request on RC service. Link: https://lore.kernel.org/r/1669905568-62-1-git-send-email-yangx.jy@fujitsu.com Signed-off-by: Xiao Yang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_req.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 4d45f508392f..2713e9058922 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -258,6 +258,10 @@ static int next_opcode_rc(struct rxe_qp *qp, u32 opcode, int fits) else return fits ? IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE : IB_OPCODE_RC_SEND_FIRST; + + case IB_WR_ATOMIC_WRITE: + return IB_OPCODE_RC_ATOMIC_WRITE; + case IB_WR_REG_MR: case IB_WR_LOCAL_INV: return opcode; @@ -486,6 +490,11 @@ static int finish_packet(struct rxe_qp *qp, struct rxe_av *av, } } + if (pkt->mask & RXE_ATOMIC_WRITE_MASK) { + memcpy(payload_addr(pkt), wqe->dma.atomic_wr, payload); + wqe->dma.resid -= payload; + } + return 0; } @@ -709,13 +718,15 @@ int rxe_requester(void *arg) } mask = rxe_opcode[opcode].mask; - if (unlikely(mask & RXE_READ_OR_ATOMIC_MASK)) { + if (unlikely(mask & (RXE_READ_OR_ATOMIC_MASK | + RXE_ATOMIC_WRITE_MASK))) { if (check_init_depth(qp, wqe)) goto exit; } mtu = get_mtu(qp); - payload = (mask & RXE_WRITE_OR_SEND_MASK) ? wqe->dma.resid : 0; + payload = (mask & (RXE_WRITE_OR_SEND_MASK | RXE_ATOMIC_WRITE_MASK)) ? + wqe->dma.resid : 0; if (payload > mtu) { if (qp_type(qp) == IB_QPT_UD) { /* C10-93.1.1: If the total sum of all the buffer lengths specified for a From 034e285f8b99062a0cf29112e1232154a6a44aa5 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Thu, 1 Dec 2022 14:39:26 +0000 Subject: [PATCH 093/122] RDMA/rxe: Make responder support atomic write on RC service Make responder process an atomic write request and send a read response on RC service. Link: https://lore.kernel.org/r/1669905568-62-2-git-send-email-yangx.jy@fujitsu.com Signed-off-by: Xiao Yang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_resp.c | 84 ++++++++++++++++++++++++++-- 1 file changed, 79 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 6761bcd1d4d8..6ac544477f3f 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -22,6 +22,7 @@ enum resp_states { RESPST_EXECUTE, RESPST_READ_REPLY, RESPST_ATOMIC_REPLY, + RESPST_ATOMIC_WRITE_REPLY, RESPST_COMPLETE, RESPST_ACKNOWLEDGE, RESPST_CLEANUP, @@ -57,6 +58,7 @@ static char *resp_state_name[] = { [RESPST_EXECUTE] = "EXECUTE", [RESPST_READ_REPLY] = "READ_REPLY", [RESPST_ATOMIC_REPLY] = "ATOMIC_REPLY", + [RESPST_ATOMIC_WRITE_REPLY] = "ATOMIC_WRITE_REPLY", [RESPST_COMPLETE] = "COMPLETE", [RESPST_ACKNOWLEDGE] = "ACKNOWLEDGE", [RESPST_CLEANUP] = "CLEANUP", @@ -263,7 +265,7 @@ static enum resp_states check_op_valid(struct rxe_qp *qp, case IB_QPT_RC: if (((pkt->mask & RXE_READ_MASK) && !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_READ)) || - ((pkt->mask & RXE_WRITE_MASK) && + ((pkt->mask & (RXE_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) && !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE)) || ((pkt->mask & RXE_ATOMIC_MASK) && !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) { @@ -367,7 +369,7 @@ static enum resp_states check_resource(struct rxe_qp *qp, } } - if (pkt->mask & RXE_READ_OR_ATOMIC_MASK) { + if (pkt->mask & (RXE_READ_OR_ATOMIC_MASK | RXE_ATOMIC_WRITE_MASK)) { /* it is the requesters job to not send * too many read/atomic ops, we just * recycle the responder resource queue @@ -438,7 +440,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp, enum resp_states state; int access; - if (pkt->mask & RXE_READ_OR_WRITE_MASK) { + if (pkt->mask & (RXE_READ_OR_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) { if (pkt->mask & RXE_RETH_MASK) { qp->resp.va = reth_va(pkt); qp->resp.offset = 0; @@ -504,7 +506,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp, goto err; } - if (pkt->mask & RXE_WRITE_MASK) { + if (pkt->mask & (RXE_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) { if (resid > mtu) { if (pktlen != mtu || bth_pad(pkt)) { state = RESPST_ERR_LENGTH; @@ -604,6 +606,7 @@ static struct resp_res *rxe_prepare_res(struct rxe_qp *qp, res->state = rdatm_res_state_new; break; case RXE_ATOMIC_MASK: + case RXE_ATOMIC_WRITE_MASK: res->first_psn = pkt->psn; res->last_psn = pkt->psn; res->cur_psn = pkt->psn; @@ -673,6 +676,55 @@ out: return ret; } +static enum resp_states atomic_write_reply(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + u64 src, *dst; + struct resp_res *res = qp->resp.res; + struct rxe_mr *mr = qp->resp.mr; + int payload = payload_size(pkt); + + if (!res) { + res = rxe_prepare_res(qp, pkt, RXE_ATOMIC_WRITE_MASK); + qp->resp.res = res; + } + + if (!res->replay) { +#ifdef CONFIG_64BIT + if (mr->state != RXE_MR_STATE_VALID) + return RESPST_ERR_RKEY_VIOLATION; + + memcpy(&src, payload_addr(pkt), payload); + + dst = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset, payload); + /* check vaddr is 8 bytes aligned. */ + if (!dst || (uintptr_t)dst & 7) + return RESPST_ERR_MISALIGNED_ATOMIC; + + /* Do atomic write after all prior operations have completed */ + smp_store_release(dst, src); + + /* decrease resp.resid to zero */ + qp->resp.resid -= sizeof(payload); + + qp->resp.msn++; + + /* next expected psn, read handles this separately */ + qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK; + qp->resp.ack_psn = qp->resp.psn; + + qp->resp.opcode = pkt->opcode; + qp->resp.status = IB_WC_SUCCESS; + + return RESPST_ACKNOWLEDGE; +#else + return RESPST_ERR_UNSUPPORTED_OPCODE; +#endif /* CONFIG_64BIT */ + } + + return RESPST_ACKNOWLEDGE; +} + static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp, struct rxe_pkt_info *ack, int opcode, @@ -912,6 +964,8 @@ static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt) return RESPST_READ_REPLY; } else if (pkt->mask & RXE_ATOMIC_MASK) { return RESPST_ATOMIC_REPLY; + } else if (pkt->mask & RXE_ATOMIC_WRITE_MASK) { + return RESPST_ATOMIC_WRITE_REPLY; } else { /* Unreachable */ WARN_ON_ONCE(1); @@ -1085,6 +1139,19 @@ static int send_atomic_ack(struct rxe_qp *qp, u8 syndrome, u32 psn) return ret; } +static int send_read_response_ack(struct rxe_qp *qp, u8 syndrome, u32 psn) +{ + int ret = send_common_ack(qp, syndrome, psn, + IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY, + "RDMA READ response of length zero ACK"); + + /* have to clear this since it is used to trigger + * long read replies + */ + qp->resp.res = NULL; + return ret; +} + static enum resp_states acknowledge(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { @@ -1095,6 +1162,8 @@ static enum resp_states acknowledge(struct rxe_qp *qp, send_ack(qp, qp->resp.aeth_syndrome, pkt->psn); else if (pkt->mask & RXE_ATOMIC_MASK) send_atomic_ack(qp, AETH_ACK_UNLIMITED, pkt->psn); + else if (pkt->mask & RXE_ATOMIC_WRITE_MASK) + send_read_response_ack(qp, AETH_ACK_UNLIMITED, pkt->psn); else if (bth_ack(pkt)) send_ack(qp, AETH_ACK_UNLIMITED, pkt->psn); @@ -1206,7 +1275,9 @@ static enum resp_states duplicate_request(struct rxe_qp *qp, res->replay = 1; res->cur_psn = pkt->psn; qp->resp.res = res; - rc = RESPST_ATOMIC_REPLY; + rc = pkt->mask & RXE_ATOMIC_MASK ? + RESPST_ATOMIC_REPLY : + RESPST_ATOMIC_WRITE_REPLY; goto out; } @@ -1343,6 +1414,9 @@ int rxe_responder(void *arg) case RESPST_ATOMIC_REPLY: state = atomic_reply(qp, pkt); break; + case RESPST_ATOMIC_WRITE_REPLY: + state = atomic_write_reply(qp, pkt); + break; case RESPST_ACKNOWLEDGE: state = acknowledge(qp, pkt); break; From 3aec427bb1499bd3325d2e251edb729a5a8643df Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Thu, 1 Dec 2022 14:39:27 +0000 Subject: [PATCH 094/122] RDMA/rxe: Implement atomic write completion Generate an atomic write completion when the atomic write request has been finished. Link: https://lore.kernel.org/r/1669905568-62-3-git-send-email-yangx.jy@fujitsu.com Signed-off-by: Xiao Yang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_comp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index 4dca4f8bbb5a..1c525325e271 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -104,6 +104,7 @@ static enum ib_wc_opcode wr_to_wc_opcode(enum ib_wr_opcode opcode) case IB_WR_LOCAL_INV: return IB_WC_LOCAL_INV; case IB_WR_REG_MR: return IB_WC_REG_MR; case IB_WR_BIND_MW: return IB_WC_BIND_MW; + case IB_WR_ATOMIC_WRITE: return IB_WC_ATOMIC_WRITE; default: return 0xff; @@ -269,6 +270,9 @@ static inline enum comp_state check_ack(struct rxe_qp *qp, if ((syn & AETH_TYPE_MASK) != AETH_ACK) return COMPST_ERROR; + if (wqe->wr.opcode == IB_WR_ATOMIC_WRITE) + return COMPST_WRITE_SEND; + fallthrough; /* (IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE doesn't have an AETH) */ From 4cd9f1d320f905e7bc60f030566d15003745ba91 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Thu, 1 Dec 2022 14:39:28 +0000 Subject: [PATCH 095/122] RDMA/rxe: Enable atomic write capability for rxe device The capability shows that rxe device supports atomic write operation. Link: https://lore.kernel.org/r/1669905568-62-4-git-send-email-yangx.jy@fujitsu.com Signed-off-by: Xiao Yang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_param.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h index 86c7a8bf3cbb..bbc88cd71d95 100644 --- a/drivers/infiniband/sw/rxe/rxe_param.h +++ b/drivers/infiniband/sw/rxe/rxe_param.h @@ -51,7 +51,12 @@ enum rxe_device_param { | IB_DEVICE_SRQ_RESIZE | IB_DEVICE_MEM_MGT_EXTENSIONS | IB_DEVICE_MEM_WINDOW +#ifdef CONFIG_64BIT + | IB_DEVICE_MEM_WINDOW_TYPE_2B + | IB_DEVICE_ATOMIC_WRITE, +#else | IB_DEVICE_MEM_WINDOW_TYPE_2B, +#endif /* CONFIG_64BIT */ RXE_MAX_SGE = 32, RXE_MAX_WQE_SIZE = sizeof(struct rxe_send_wqe) + sizeof(struct ib_sge) * RXE_MAX_SGE, From 323a74fc20f53c0d0e13a16aee703a30d9751235 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 2 Dec 2022 13:19:40 -0800 Subject: [PATCH 096/122] RDMA: Disable IB HW for UML Disable all of drivers/infiniband/hw/ and rdmavt for UML builds until someone needs it and provides patches to support it. This prevents build errors in hw/qib/qib_wc_x86_64.c. Fixes: 68f5d3f3b654 ("um: add PCI over virtio emulation driver") Signed-off-by: Randy Dunlap Cc: Jason Gunthorpe Cc: Dennis Dalessandro Cc: Christoph Hellwig Cc: linux-rdma@vger.kernel.org Cc: Jeff Dike Cc: Richard Weinberger Cc: Anton Ivanov Cc: Johannes Berg Cc: linux-um@lists.infradead.org Link: https://lore.kernel.org/r/20221202211940.29111-1-rdunlap@infradead.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index ccc874478f0b..a5827d11e934 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -78,6 +78,7 @@ config INFINIBAND_VIRT_DMA def_bool !HIGHMEM if INFINIBAND_USER_ACCESS || !INFINIBAND_USER_ACCESS +if !UML source "drivers/infiniband/hw/bnxt_re/Kconfig" source "drivers/infiniband/hw/cxgb4/Kconfig" source "drivers/infiniband/hw/efa/Kconfig" @@ -95,6 +96,7 @@ source "drivers/infiniband/hw/qib/Kconfig" source "drivers/infiniband/hw/usnic/Kconfig" source "drivers/infiniband/hw/vmw_pvrdma/Kconfig" source "drivers/infiniband/sw/rdmavt/Kconfig" +endif # !UML source "drivers/infiniband/sw/rxe/Kconfig" source "drivers/infiniband/sw/siw/Kconfig" endif From 725349f8ba1e78a146c6ff8f3ee5e2712e517106 Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Fri, 2 Dec 2022 12:00:37 +0800 Subject: [PATCH 097/122] RDMA/hfi1: Fix error return code in parse_platform_config() In the previous iteration of the while loop, the "ret" may have been assigned a value of 0, so the error return code -EINVAL may have been incorrectly set to 0. To fix set valid return code before calling to goto. Fixes: 97167e813415 ("staging/rdma/hfi1: Tune for unknown channel if configuration file is absent") Signed-off-by: Wang Yufen Link: https://lore.kernel.org/r/1669953638-11747-1-git-send-email-wangyufen@huawei.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hfi1/firmware.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/infiniband/hw/hfi1/firmware.c b/drivers/infiniband/hw/hfi1/firmware.c index 1d77514ebbee..0c0cef5b1e0e 100644 --- a/drivers/infiniband/hw/hfi1/firmware.c +++ b/drivers/infiniband/hw/hfi1/firmware.c @@ -1743,6 +1743,7 @@ int parse_platform_config(struct hfi1_devdata *dd) if (!dd->platform_config.data) { dd_dev_err(dd, "%s: Missing config file\n", __func__); + ret = -EINVAL; goto bail; } ptr = (u32 *)dd->platform_config.data; @@ -1751,6 +1752,7 @@ int parse_platform_config(struct hfi1_devdata *dd) ptr++; if (magic_num != PLATFORM_CONFIG_MAGIC_NUM) { dd_dev_err(dd, "%s: Bad config file\n", __func__); + ret = -EINVAL; goto bail; } @@ -1774,6 +1776,7 @@ int parse_platform_config(struct hfi1_devdata *dd) if (file_length > dd->platform_config.size) { dd_dev_info(dd, "%s:File claims to be larger than read size\n", __func__); + ret = -EINVAL; goto bail; } else if (file_length < dd->platform_config.size) { dd_dev_info(dd, @@ -1794,6 +1797,7 @@ int parse_platform_config(struct hfi1_devdata *dd) dd_dev_err(dd, "%s: Failed validation at offset %ld\n", __func__, (ptr - (u32 *) dd->platform_config.data)); + ret = -EINVAL; goto bail; } @@ -1837,6 +1841,7 @@ int parse_platform_config(struct hfi1_devdata *dd) __func__, table_type, (ptr - (u32 *) dd->platform_config.data)); + ret = -EINVAL; goto bail; /* We don't trust this file now */ } pcfgcache->config_tables[table_type].table = ptr; @@ -1856,6 +1861,7 @@ int parse_platform_config(struct hfi1_devdata *dd) __func__, table_type, (ptr - (u32 *)dd->platform_config.data)); + ret = -EINVAL; goto bail; /* We don't trust this file now */ } pcfgcache->config_tables[table_type].table_metadata = From ed461b30b22c8fa85c25189c14cb89f29595cd14 Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Fri, 2 Dec 2022 12:00:38 +0800 Subject: [PATCH 098/122] RDMA/srp: Fix error return code in srp_parse_options() In the previous iteration of the while loop, the "ret" may have been assigned a value of 0, so the error return code -EINVAL may have been incorrectly set to 0. To fix set valid return code before calling to goto. Also investigate each case separately as Andy suggessted. Fixes: e711f968c49c ("IB/srp: replace custom implementation of hex2bin()") Fixes: 2a174df0c602 ("IB/srp: Use kstrtoull() instead of simple_strtoull()") Fixes: 19f313438c77 ("IB/srp: Add RDMA/CM support") Signed-off-by: Wang Yufen Link: https://lore.kernel.org/r/1669953638-11747-2-git-send-email-wangyufen@huawei.com Reviewed-by: Bart Van Assche Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/srp/ib_srp.c | 96 ++++++++++++++++++++++++----- 1 file changed, 82 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 1075c2ac8fe2..b4d6a4a5ae81 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -3410,7 +3410,8 @@ static int srp_parse_options(struct net *net, const char *buf, break; case SRP_OPT_PKEY: - if (match_hex(args, &token)) { + ret = match_hex(args, &token); + if (ret) { pr_warn("bad P_Key parameter '%s'\n", p); goto out; } @@ -3470,7 +3471,8 @@ static int srp_parse_options(struct net *net, const char *buf, break; case SRP_OPT_MAX_SECT: - if (match_int(args, &token)) { + ret = match_int(args, &token); + if (ret) { pr_warn("bad max sect parameter '%s'\n", p); goto out; } @@ -3478,8 +3480,15 @@ static int srp_parse_options(struct net *net, const char *buf, break; case SRP_OPT_QUEUE_SIZE: - if (match_int(args, &token) || token < 1) { + ret = match_int(args, &token); + if (ret) { + pr_warn("match_int() failed for queue_size parameter '%s', Error %d\n", + p, ret); + goto out; + } + if (token < 1) { pr_warn("bad queue_size parameter '%s'\n", p); + ret = -EINVAL; goto out; } target->scsi_host->can_queue = token; @@ -3490,25 +3499,40 @@ static int srp_parse_options(struct net *net, const char *buf, break; case SRP_OPT_MAX_CMD_PER_LUN: - if (match_int(args, &token) || token < 1) { + ret = match_int(args, &token); + if (ret) { + pr_warn("match_int() failed for max cmd_per_lun parameter '%s', Error %d\n", + p, ret); + goto out; + } + if (token < 1) { pr_warn("bad max cmd_per_lun parameter '%s'\n", p); + ret = -EINVAL; goto out; } target->scsi_host->cmd_per_lun = token; break; case SRP_OPT_TARGET_CAN_QUEUE: - if (match_int(args, &token) || token < 1) { + ret = match_int(args, &token); + if (ret) { + pr_warn("match_int() failed for max target_can_queue parameter '%s', Error %d\n", + p, ret); + goto out; + } + if (token < 1) { pr_warn("bad max target_can_queue parameter '%s'\n", p); + ret = -EINVAL; goto out; } target->target_can_queue = token; break; case SRP_OPT_IO_CLASS: - if (match_hex(args, &token)) { + ret = match_hex(args, &token); + if (ret) { pr_warn("bad IO class parameter '%s'\n", p); goto out; } @@ -3517,6 +3541,7 @@ static int srp_parse_options(struct net *net, const char *buf, pr_warn("unknown IO class parameter value %x specified (use %x or %x).\n", token, SRP_REV10_IB_IO_CLASS, SRP_REV16A_IB_IO_CLASS); + ret = -EINVAL; goto out; } target->io_class = token; @@ -3539,16 +3564,24 @@ static int srp_parse_options(struct net *net, const char *buf, break; case SRP_OPT_CMD_SG_ENTRIES: - if (match_int(args, &token) || token < 1 || token > 255) { + ret = match_int(args, &token); + if (ret) { + pr_warn("match_int() failed for max cmd_sg_entries parameter '%s', Error %d\n", + p, ret); + goto out; + } + if (token < 1 || token > 255) { pr_warn("bad max cmd_sg_entries parameter '%s'\n", p); + ret = -EINVAL; goto out; } target->cmd_sg_cnt = token; break; case SRP_OPT_ALLOW_EXT_SG: - if (match_int(args, &token)) { + ret = match_int(args, &token); + if (ret) { pr_warn("bad allow_ext_sg parameter '%s'\n", p); goto out; } @@ -3556,43 +3589,77 @@ static int srp_parse_options(struct net *net, const char *buf, break; case SRP_OPT_SG_TABLESIZE: - if (match_int(args, &token) || token < 1 || - token > SG_MAX_SEGMENTS) { + ret = match_int(args, &token); + if (ret) { + pr_warn("match_int() failed for max sg_tablesize parameter '%s', Error %d\n", + p, ret); + goto out; + } + if (token < 1 || token > SG_MAX_SEGMENTS) { pr_warn("bad max sg_tablesize parameter '%s'\n", p); + ret = -EINVAL; goto out; } target->sg_tablesize = token; break; case SRP_OPT_COMP_VECTOR: - if (match_int(args, &token) || token < 0) { + ret = match_int(args, &token); + if (ret) { + pr_warn("match_int() failed for comp_vector parameter '%s', Error %d\n", + p, ret); + goto out; + } + if (token < 0) { pr_warn("bad comp_vector parameter '%s'\n", p); + ret = -EINVAL; goto out; } target->comp_vector = token; break; case SRP_OPT_TL_RETRY_COUNT: - if (match_int(args, &token) || token < 2 || token > 7) { + ret = match_int(args, &token); + if (ret) { + pr_warn("match_int() failed for tl_retry_count parameter '%s', Error %d\n", + p, ret); + goto out; + } + if (token < 2 || token > 7) { pr_warn("bad tl_retry_count parameter '%s' (must be a number between 2 and 7)\n", p); + ret = -EINVAL; goto out; } target->tl_retry_count = token; break; case SRP_OPT_MAX_IT_IU_SIZE: - if (match_int(args, &token) || token < 0) { + ret = match_int(args, &token); + if (ret) { + pr_warn("match_int() failed for max it_iu_size parameter '%s', Error %d\n", + p, ret); + goto out; + } + if (token < 0) { pr_warn("bad maximum initiator to target IU size '%s'\n", p); + ret = -EINVAL; goto out; } target->max_it_iu_size = token; break; case SRP_OPT_CH_COUNT: - if (match_int(args, &token) || token < 1) { + ret = match_int(args, &token); + if (ret) { + pr_warn("match_int() failed for channel count parameter '%s', Error %d\n", + p, ret); + goto out; + } + if (token < 1) { pr_warn("bad channel count %s\n", p); + ret = -EINVAL; goto out; } target->ch_count = token; @@ -3601,6 +3668,7 @@ static int srp_parse_options(struct net *net, const char *buf, default: pr_warn("unknown parameter or missing value '%s' in target creation request\n", p); + ret = -EINVAL; goto out; } } From 6978837ce42f8bea85041fc08c854f4e28852b3e Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Sat, 3 Dec 2022 11:37:14 +0800 Subject: [PATCH 099/122] RDMA/mlx5: no need to kfree NULL pointer Goto label 'free' where it will kfree the 'in' is not needed though it's safe to kfree NULL. Return err code directly to simplify the code. 1973 free: 1974 kfree(in); 1975 return err; Signed-off-by: Li Zhijian Link: https://lore.kernel.org/r/20221203033714.25870-1-lizhijian@fujitsu.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mr.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 410cc5fd2523..053fe946e45a 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1929,10 +1929,8 @@ int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata) ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4); in = kzalloc(inlen, GFP_KERNEL); - if (!in) { - err = -ENOMEM; - goto free; - } + if (!in) + return -ENOMEM; mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); From d074f0aebde5649f7a9f1807551efc019b8e81c4 Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Wed, 7 Dec 2022 16:32:18 +0800 Subject: [PATCH 100/122] RDMA/hfi1: use sysfs_emit() to instead of scnprintf() Follow the advice of the Documentation/filesystems/sysfs.rst and show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. Signed-off-by: ye xingchen Link: https://lore.kernel.org/r/202212071632188074249@zte.com.cn Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hfi1/driver.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index 8e71bef9d982..bcc6bc0540f0 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -112,7 +112,7 @@ static int hfi1_caps_get(char *buffer, const struct kernel_param *kp) cap_mask &= ~HFI1_CAP_LOCKED_SMASK; cap_mask |= ((cap_mask & HFI1_CAP_K2U) << HFI1_CAP_USER_SHIFT); - return scnprintf(buffer, PAGE_SIZE, "0x%lx", cap_mask); + return sysfs_emit(buffer, "0x%lx\n", cap_mask); } struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi) From fb4907f487254375830f135dcfe5dd7e6f8b705f Mon Sep 17 00:00:00 2001 From: Chao Leng Date: Fri, 25 Nov 2022 09:00:26 +0800 Subject: [PATCH 101/122] RDMA/cma: Change RoCE packet life time from 18 to 16 The ack timeout retransmission time is affected by the following two factors: one is packet life time, another is the HCA processing time. Now the default packet lifetime(CMA_IBOE_PACKET_LIFETIME) is 18. That means the minimum ack timeout is 2 seconds (2^(18+1)*4us=2.097seconds). The packet lifetime means the maximum transmission time of packets on the network, 2 seconds is too long. Assume the network is a clos topology with three layers, every packet will pass through five hops of switches. Assume the buffer of every switch is 128MB and the port transmission rate is 25 Gbit/s, the maximum transmission time of the packet is 200ms(128MB*5/25Gbit/s). Add double redundancy, it is less than 500ms. So change the CMA_IBOE_PACKET_LIFETIME to 16, the maximum transmission time of the packet will be about 500+ms, it is long enough. Link: https://lore.kernel.org/r/20221125010026.755-1-lengchao@huawei.com Signed-off-by: Chao Leng Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index cc2222b85c88..2f5b5e6f3d11 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -47,7 +47,7 @@ MODULE_LICENSE("Dual BSD/GPL"); #define CMA_CM_RESPONSE_TIMEOUT 20 #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) -#define CMA_IBOE_PACKET_LIFETIME 18 +#define CMA_IBOE_PACKET_LIFETIME 16 #define CMA_PREFERRED_ROCE_GID_TYPE IB_GID_TYPE_ROCE_UDP_ENCAP static const char * const cma_events[] = { From 487d65090a3dce1ae54946aded55d0f8ac87cbab Mon Sep 17 00:00:00 2001 From: Yixing Liu Date: Sat, 26 Nov 2022 18:29:06 +0800 Subject: [PATCH 102/122] RDMA/hns: Fix the gid problem caused by free mr After the hns roce driver is loaded, if you modify the mac address of the network port, the following error will appear: __ib_cache_gid_add: unable to add gid fe80:0000:0000:0000:4600:4dff:fe22:abb5 error=-28 hns3 0000:7d:00.0 hns_0: attr path_mtu(1) invalid while modify qp The reason for the error is that the gid being occupied will cause the failure to modify the gid. The gid is occupied by the loopback QP used by free mr. When the mac address is modified, the gid will change. If there is a busy QP at this time, the gid will not be released and the modification will fail. The QP of free mr is created using the ib interface. The ib interface will add a reference count to the gid, resulting in this error scenario. Considering that free mr is solving a bug in HIP08, not an actual business, it is not necessary to use ib interfaces. Fixes: 70f92521584f ("RDMA/hns: Use the reserved loopback QPs to free MR before destroying MPT") Link: https://lore.kernel.org/r/20221126102911.2921820-2-xuhaoyue1@hisilicon.com Signed-off-by: Yixing Liu Signed-off-by: Haoyue Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 175 ++++++++++++++++----- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 8 +- 2 files changed, 137 insertions(+), 46 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 939811867249..93c677239686 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2634,31 +2634,124 @@ static void free_dip_list(struct hns_roce_dev *hr_dev) spin_unlock_irqrestore(&hr_dev->dip_list_lock, flags); } +static struct ib_pd *free_mr_init_pd(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hns_roce_v2_free_mr *free_mr = &priv->free_mr; + struct ib_device *ibdev = &hr_dev->ib_dev; + struct hns_roce_pd *hr_pd; + struct ib_pd *pd; + + hr_pd = kzalloc(sizeof(*hr_pd), GFP_KERNEL); + if (ZERO_OR_NULL_PTR(hr_pd)) + return NULL; + pd = &hr_pd->ibpd; + pd->device = ibdev; + + if (hns_roce_alloc_pd(pd, NULL)) { + ibdev_err(ibdev, "failed to create pd for free mr.\n"); + kfree(hr_pd); + return NULL; + } + free_mr->rsv_pd = to_hr_pd(pd); + free_mr->rsv_pd->ibpd.device = &hr_dev->ib_dev; + free_mr->rsv_pd->ibpd.uobject = NULL; + free_mr->rsv_pd->ibpd.__internal_mr = NULL; + atomic_set(&free_mr->rsv_pd->ibpd.usecnt, 0); + + return pd; +} + +static struct ib_cq *free_mr_init_cq(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hns_roce_v2_free_mr *free_mr = &priv->free_mr; + struct ib_device *ibdev = &hr_dev->ib_dev; + struct ib_cq_init_attr cq_init_attr = {}; + struct hns_roce_cq *hr_cq; + struct ib_cq *cq; + + cq_init_attr.cqe = HNS_ROCE_FREE_MR_USED_CQE_NUM; + + hr_cq = kzalloc(sizeof(*hr_cq), GFP_KERNEL); + if (ZERO_OR_NULL_PTR(hr_cq)) + return NULL; + + cq = &hr_cq->ib_cq; + cq->device = ibdev; + + if (hns_roce_create_cq(cq, &cq_init_attr, NULL)) { + ibdev_err(ibdev, "failed to create cq for free mr.\n"); + kfree(hr_cq); + return NULL; + } + free_mr->rsv_cq = to_hr_cq(cq); + free_mr->rsv_cq->ib_cq.device = &hr_dev->ib_dev; + free_mr->rsv_cq->ib_cq.uobject = NULL; + free_mr->rsv_cq->ib_cq.comp_handler = NULL; + free_mr->rsv_cq->ib_cq.event_handler = NULL; + free_mr->rsv_cq->ib_cq.cq_context = NULL; + atomic_set(&free_mr->rsv_cq->ib_cq.usecnt, 0); + + return cq; +} + +static int free_mr_init_qp(struct hns_roce_dev *hr_dev, struct ib_cq *cq, + struct ib_qp_init_attr *init_attr, int i) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hns_roce_v2_free_mr *free_mr = &priv->free_mr; + struct ib_device *ibdev = &hr_dev->ib_dev; + struct hns_roce_qp *hr_qp; + struct ib_qp *qp; + int ret; + + hr_qp = kzalloc(sizeof(*hr_qp), GFP_KERNEL); + if (ZERO_OR_NULL_PTR(hr_qp)) + return -ENOMEM; + + qp = &hr_qp->ibqp; + qp->device = ibdev; + + ret = hns_roce_create_qp(qp, init_attr, NULL); + if (ret) { + ibdev_err(ibdev, "failed to create qp for free mr.\n"); + kfree(hr_qp); + return ret; + } + + free_mr->rsv_qp[i] = hr_qp; + free_mr->rsv_qp[i]->ibqp.recv_cq = cq; + free_mr->rsv_qp[i]->ibqp.send_cq = cq; + + return 0; +} + static void free_mr_exit(struct hns_roce_dev *hr_dev) { struct hns_roce_v2_priv *priv = hr_dev->priv; struct hns_roce_v2_free_mr *free_mr = &priv->free_mr; - int ret; + struct ib_qp *qp; int i; for (i = 0; i < ARRAY_SIZE(free_mr->rsv_qp); i++) { if (free_mr->rsv_qp[i]) { - ret = ib_destroy_qp(free_mr->rsv_qp[i]); - if (ret) - ibdev_err(&hr_dev->ib_dev, - "failed to destroy qp in free mr.\n"); - + qp = &free_mr->rsv_qp[i]->ibqp; + hns_roce_v2_destroy_qp(qp, NULL); + kfree(free_mr->rsv_qp[i]); free_mr->rsv_qp[i] = NULL; } } if (free_mr->rsv_cq) { - ib_destroy_cq(free_mr->rsv_cq); + hns_roce_destroy_cq(&free_mr->rsv_cq->ib_cq, NULL); + kfree(free_mr->rsv_cq); free_mr->rsv_cq = NULL; } if (free_mr->rsv_pd) { - ib_dealloc_pd(free_mr->rsv_pd); + hns_roce_dealloc_pd(&free_mr->rsv_pd->ibpd, NULL); + kfree(free_mr->rsv_pd); free_mr->rsv_pd = NULL; } } @@ -2667,55 +2760,46 @@ static int free_mr_alloc_res(struct hns_roce_dev *hr_dev) { struct hns_roce_v2_priv *priv = hr_dev->priv; struct hns_roce_v2_free_mr *free_mr = &priv->free_mr; - struct ib_device *ibdev = &hr_dev->ib_dev; - struct ib_cq_init_attr cq_init_attr = {}; struct ib_qp_init_attr qp_init_attr = {}; struct ib_pd *pd; struct ib_cq *cq; - struct ib_qp *qp; int ret; int i; - pd = ib_alloc_pd(ibdev, 0); - if (IS_ERR(pd)) { - ibdev_err(ibdev, "failed to create pd for free mr.\n"); - return PTR_ERR(pd); - } - free_mr->rsv_pd = pd; + pd = free_mr_init_pd(hr_dev); + if (!pd) + return -ENOMEM; - cq_init_attr.cqe = HNS_ROCE_FREE_MR_USED_CQE_NUM; - cq = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_init_attr); - if (IS_ERR(cq)) { - ibdev_err(ibdev, "failed to create cq for free mr.\n"); - ret = PTR_ERR(cq); - goto create_failed; + cq = free_mr_init_cq(hr_dev); + if (!cq) { + ret = -ENOMEM; + goto create_failed_cq; } - free_mr->rsv_cq = cq; qp_init_attr.qp_type = IB_QPT_RC; qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; - qp_init_attr.send_cq = free_mr->rsv_cq; - qp_init_attr.recv_cq = free_mr->rsv_cq; + qp_init_attr.send_cq = cq; + qp_init_attr.recv_cq = cq; for (i = 0; i < ARRAY_SIZE(free_mr->rsv_qp); i++) { qp_init_attr.cap.max_send_wr = HNS_ROCE_FREE_MR_USED_SQWQE_NUM; qp_init_attr.cap.max_send_sge = HNS_ROCE_FREE_MR_USED_SQSGE_NUM; qp_init_attr.cap.max_recv_wr = HNS_ROCE_FREE_MR_USED_RQWQE_NUM; qp_init_attr.cap.max_recv_sge = HNS_ROCE_FREE_MR_USED_RQSGE_NUM; - qp = ib_create_qp(free_mr->rsv_pd, &qp_init_attr); - if (IS_ERR(qp)) { - ibdev_err(ibdev, "failed to create qp for free mr.\n"); - ret = PTR_ERR(qp); - goto create_failed; - } - - free_mr->rsv_qp[i] = qp; + ret = free_mr_init_qp(hr_dev, cq, &qp_init_attr, i); + if (ret) + goto create_failed_qp; } return 0; -create_failed: - free_mr_exit(hr_dev); +create_failed_qp: + hns_roce_destroy_cq(cq, NULL); + kfree(cq); + +create_failed_cq: + hns_roce_dealloc_pd(pd, NULL); + kfree(pd); return ret; } @@ -2731,14 +2815,17 @@ static int free_mr_modify_rsv_qp(struct hns_roce_dev *hr_dev, int mask; int ret; - hr_qp = to_hr_qp(free_mr->rsv_qp[sl_num]); + hr_qp = to_hr_qp(&free_mr->rsv_qp[sl_num]->ibqp); hr_qp->free_mr_en = 1; + hr_qp->ibqp.device = ibdev; + hr_qp->ibqp.qp_type = IB_QPT_RC; mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS; attr->qp_state = IB_QPS_INIT; attr->port_num = 1; attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE; - ret = ib_modify_qp(&hr_qp->ibqp, attr, mask); + ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, attr, mask, IB_QPS_INIT, + IB_QPS_INIT); if (ret) { ibdev_err(ibdev, "failed to modify qp to init, ret = %d.\n", ret); @@ -2759,7 +2846,8 @@ static int free_mr_modify_rsv_qp(struct hns_roce_dev *hr_dev, rdma_ah_set_sl(&attr->ah_attr, (u8)sl_num); - ret = ib_modify_qp(&hr_qp->ibqp, attr, mask); + ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, attr, mask, IB_QPS_INIT, + IB_QPS_RTR); hr_dev->loop_idc = loopback; if (ret) { ibdev_err(ibdev, "failed to modify qp to rtr, ret = %d.\n", @@ -2773,7 +2861,8 @@ static int free_mr_modify_rsv_qp(struct hns_roce_dev *hr_dev, attr->sq_psn = HNS_ROCE_FREE_MR_USED_PSN; attr->retry_cnt = HNS_ROCE_FREE_MR_USED_QP_RETRY_CNT; attr->timeout = HNS_ROCE_FREE_MR_USED_QP_TIMEOUT; - ret = ib_modify_qp(&hr_qp->ibqp, attr, mask); + ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, attr, mask, IB_QPS_RTR, + IB_QPS_RTS); if (ret) ibdev_err(ibdev, "failed to modify qp to rts, ret = %d.\n", ret); @@ -3416,7 +3505,7 @@ static void free_mr_send_cmd_to_hw(struct hns_roce_dev *hr_dev) mutex_lock(&free_mr->mutex); for (i = 0; i < ARRAY_SIZE(free_mr->rsv_qp); i++) { - hr_qp = to_hr_qp(free_mr->rsv_qp[i]); + hr_qp = free_mr->rsv_qp[i]; ret = free_mr_post_send_lp_wqe(hr_qp); if (ret) { @@ -3431,7 +3520,7 @@ static void free_mr_send_cmd_to_hw(struct hns_roce_dev *hr_dev) end = msecs_to_jiffies(HNS_ROCE_V2_FREE_MR_TIMEOUT) + jiffies; while (cqe_cnt) { - npolled = hns_roce_v2_poll_cq(free_mr->rsv_cq, cqe_cnt, wc); + npolled = hns_roce_v2_poll_cq(&free_mr->rsv_cq->ib_cq, cqe_cnt, wc); if (npolled < 0) { ibdev_err(ibdev, "failed to poll cqe for free mr, remain %d cqe.\n", @@ -5474,7 +5563,7 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev, return ret; } -static int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) +int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) { struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index b11579027e82..017462e52843 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -1329,9 +1329,9 @@ struct hns_roce_link_table { #define HNS_ROCE_EXT_LLM_MIN_PAGES(que_num) ((que_num) * 4 + 2) struct hns_roce_v2_free_mr { - struct ib_qp *rsv_qp[HNS_ROCE_FREE_MR_USED_QP_NUM]; - struct ib_cq *rsv_cq; - struct ib_pd *rsv_pd; + struct hns_roce_qp *rsv_qp[HNS_ROCE_FREE_MR_USED_QP_NUM]; + struct hns_roce_cq *rsv_cq; + struct hns_roce_pd *rsv_pd; struct mutex mutex; }; @@ -1461,6 +1461,8 @@ struct hns_roce_sccc_clr_done { __le32 rsv[5]; }; +int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); + static inline void hns_roce_write64(struct hns_roce_dev *hr_dev, __le32 val[2], void __iomem *dest) { From bc34c04f7b97c3794dec5a6d6d27ffd5f0e4f5c8 Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Sat, 26 Nov 2022 18:29:07 +0800 Subject: [PATCH 103/122] RDMA/hns: Fix AH attr queried by query_qp The queried AH attr is invalid. This patch fix it. This problem is found by rdma-core test test_mr_rereg_pd ERROR: test_mr_rereg_pd (tests.test_mr.MRTest) Test that cover rereg MR's PD with this flow: ---------------------------------------------------------------------- Traceback (most recent call last): File "./tests/test_mr.py", line 157, in test_mr_rereg_pd self.restate_qps() File "./tests/test_mr.py", line 113, in restate_qps self.server.qp.to_rts(self.server_qp_attr) File "qp.pyx", line 1137, in pyverbs.qp.QP.to_rts File "qp.pyx", line 1123, in pyverbs.qp.QP.to_rtr pyverbs.pyverbs_error.PyverbsRDMAError: Failed to modify QP state to RTR. Errno: 22, Invalid argument Fixes: 926a01dc000d ("RDMA/hns: Add QP operations support for hip08 SoC") Link: https://lore.kernel.org/r/20221126102911.2921820-3-xuhaoyue1@hisilicon.com Signed-off-by: Chengchang Tang Signed-off-by: Haoyue Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 93c677239686..a8f8c790d31d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -5470,6 +5470,8 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, rdma_ah_set_sl(&qp_attr->ah_attr, hr_reg_read(&context, QPC_SL)); + rdma_ah_set_port_num(&qp_attr->ah_attr, hr_qp->port + 1); + rdma_ah_set_ah_flags(&qp_attr->ah_attr, IB_AH_GRH); grh->flow_label = hr_reg_read(&context, QPC_FL); grh->sgid_index = hr_reg_read(&context, QPC_GMV_IDX); grh->hop_limit = hr_reg_read(&context, QPC_HOPLIMIT); From 9fb39ef2ff3e18f1740625ba04093dfbef086d2b Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Sat, 26 Nov 2022 18:29:08 +0800 Subject: [PATCH 104/122] RDMA/hns: Fix PBL page MTR find Now, The address of the first two pages in the MR will be searched, which use to speed up the lookup of the pbl table for hardware. An exception will occur when there is only one page in this MR. This patch fix the number of page to search. Fixes: 9b2cf76c9f05 ("RDMA/hns: Optimize PBL buffer allocation process") Link: https://lore.kernel.org/r/20221126102911.2921820-4-xuhaoyue1@hisilicon.com Signed-off-by: Chengchang Tang Signed-off-by: Haoyue Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index a8f8c790d31d..41835cb05983 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -3274,7 +3274,8 @@ static int set_mtpt_pbl(struct hns_roce_dev *hr_dev, int i, count; count = hns_roce_mtr_find(hr_dev, &mr->pbl_mtr, 0, pages, - ARRAY_SIZE(pages), &pbl_ba); + min_t(int, ARRAY_SIZE(pages), mr->npages), + &pbl_ba); if (count < 1) { ibdev_err(ibdev, "failed to find PBL mtr, count = %d.\n", count); From 99dc5a0712883d5d13b620d25b3759d429577bc8 Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Sat, 26 Nov 2022 18:29:09 +0800 Subject: [PATCH 105/122] RDMA/hns: Fix page size cap from firmware Add verification to make sure the roce page size cap is supported by the system page size. Fixes: ba6bb7e97421 ("RDMA/hns: Add interfaces to get pf capabilities from firmware") Link: https://lore.kernel.org/r/20221126102911.2921820-5-xuhaoyue1@hisilicon.com Signed-off-by: Chengchang Tang Signed-off-by: Haoyue Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 41835cb05983..6e74735bbcf8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2345,6 +2345,9 @@ static int hns_roce_query_pf_caps(struct hns_roce_dev *hr_dev) caps->wqe_sge_hop_num = hr_reg_read(resp_d, PF_CAPS_D_EX_SGE_HOP_NUM); caps->wqe_rq_hop_num = hr_reg_read(resp_d, PF_CAPS_D_RQWQE_HOP_NUM); + if (!(caps->page_size_cap & PAGE_SIZE)) + caps->page_size_cap = HNS_ROCE_V2_PAGE_SIZE_SUPPORTED; + return 0; } From 667d6164b84884c64de3fc18670cd5a98b0b10cf Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Sat, 26 Nov 2022 18:29:10 +0800 Subject: [PATCH 106/122] RDMA/hns: Fix error code of CMD The error code is fixed to EIO when CMD fails to excute. This patch converts the error status reported by firmware to linux errno. Fixes: a04ff739f2a9 ("RDMA/hns: Add command queue support for hip08 RoCE driver") Link: https://lore.kernel.org/r/20221126102911.2921820-6-xuhaoyue1@hisilicon.com Signed-off-by: Chengchang Tang Signed-off-by: Haoyue Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 26 +++++++++++++++++++++- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 5 +++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 6e74735bbcf8..f32100c6f1d9 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1277,6 +1277,30 @@ static void update_cmdq_status(struct hns_roce_dev *hr_dev) hr_dev->cmd.state = HNS_ROCE_CMDQ_STATE_FATAL_ERR; } +static int hns_roce_cmd_err_convert_errno(u16 desc_ret) +{ + struct hns_roce_cmd_errcode errcode_table[] = { + {CMD_EXEC_SUCCESS, 0}, + {CMD_NO_AUTH, -EPERM}, + {CMD_NOT_EXIST, -EOPNOTSUPP}, + {CMD_CRQ_FULL, -EXFULL}, + {CMD_NEXT_ERR, -ENOSR}, + {CMD_NOT_EXEC, -ENOTBLK}, + {CMD_PARA_ERR, -EINVAL}, + {CMD_RESULT_ERR, -ERANGE}, + {CMD_TIMEOUT, -ETIME}, + {CMD_HILINK_ERR, -ENOLINK}, + {CMD_INFO_ILLEGAL, -ENXIO}, + {CMD_INVALID, -EBADR}, + }; + u16 i; + + for (i = 0; i < ARRAY_SIZE(errcode_table); i++) + if (desc_ret == errcode_table[i].return_status) + return errcode_table[i].errno; + return -EIO; +} + static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev, struct hns_roce_cmq_desc *desc, int num) { @@ -1322,7 +1346,7 @@ static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev, dev_err_ratelimited(hr_dev->dev, "Cmdq IO error, opcode = 0x%x, return = 0x%x.\n", desc->opcode, desc_ret); - ret = -EIO; + ret = hns_roce_cmd_err_convert_errno(desc_ret); } } else { /* FW/HW reset or incorrect number of desc */ diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 017462e52843..47fad456839d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -273,6 +273,11 @@ enum hns_roce_cmd_return_status { CMD_OTHER_ERR = 0xff }; +struct hns_roce_cmd_errcode { + enum hns_roce_cmd_return_status return_status; + int errno; +}; + enum hns_roce_sgid_type { GID_TYPE_FLAG_ROCE_V1 = 0, GID_TYPE_FLAG_ROCE_V2_IPV4, From 682c0722addae4b4a1440c9db9d8c86cb8e09ce5 Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Sat, 26 Nov 2022 18:29:11 +0800 Subject: [PATCH 107/122] RDMA/hns: Fix XRC caps on HIP08 XRC caps has been set by default. But in fact, XRC is not supported in HIP08. Fixes: 32548870d438 ("RDMA/hns: Add support for XRC on HIP09") Link: https://lore.kernel.org/r/20221126102911.2921820-7-xuhaoyue1@hisilicon.com Signed-off-by: Chengchang Tang Signed-off-by: Haoyue Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index f32100c6f1d9..2716852f5e92 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2051,13 +2051,14 @@ static void set_default_caps(struct hns_roce_dev *hr_dev) caps->flags |= HNS_ROCE_CAP_FLAG_ATOMIC | HNS_ROCE_CAP_FLAG_MW | HNS_ROCE_CAP_FLAG_SRQ | HNS_ROCE_CAP_FLAG_FRMR | - HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL | HNS_ROCE_CAP_FLAG_XRC; + HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL; caps->gid_table_len[0] = HNS_ROCE_V2_GID_INDEX_NUM; if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) { caps->flags |= HNS_ROCE_CAP_FLAG_STASH | - HNS_ROCE_CAP_FLAG_DIRECT_WQE; + HNS_ROCE_CAP_FLAG_DIRECT_WQE | + HNS_ROCE_CAP_FLAG_XRC; caps->max_sq_inline = HNS_ROCE_V3_MAX_SQ_INLINE; } else { caps->max_sq_inline = HNS_ROCE_V2_MAX_SQ_INLINE; From 6cfe7bd0dfd33033683639039b5608d6534c19eb Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Thu, 8 Dec 2022 05:19:54 -0500 Subject: [PATCH 108/122] RDMA/mlx5: Remove not-used IB_FLOW_SPEC_IB define IB_FLOW_SPEC_IB is not used in mlx5 and can be deleted. Signed-off-by: Zhu Yanjun Link: https://lore.kernel.org/r/20221208101954.687960-1-yanjun.zhu@intel.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/fs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c index 490ec308e309..3008632a6c20 100644 --- a/drivers/infiniband/hw/mlx5/fs.c +++ b/drivers/infiniband/hw/mlx5/fs.c @@ -127,7 +127,6 @@ static int check_mpls_supp_fields(u32 field_support, const __be32 *set_mask) } #define LAST_ETH_FIELD vlan_tag -#define LAST_IB_FIELD sl #define LAST_IPV4_FIELD tos #define LAST_IPV6_FIELD traffic_class #define LAST_TCP_UDP_FIELD src_port From 3282a549cf9b300e2d1b007925ed007ab24e4131 Mon Sep 17 00:00:00 2001 From: Daisuke Matsuda Date: Fri, 9 Dec 2022 13:59:26 +0900 Subject: [PATCH 109/122] RDMA/rxe: Fix oops with zero length reads The commit 686d348476ee ("RDMA/rxe: Remove unnecessary mr testing") causes a kernel crash. If responder get a zero-byte RDMA Read request, qp->resp.mr is not set in check_rkey() (see IBA C9-88). The mr is NULL in this case, and a NULL pointer dereference occurs as shown below. BUG: kernel NULL pointer dereference, address: 0000000000000010 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 0 P4D 0 Oops: 0002 [#1] PREEMPT SMP PTI CPU: 2 PID: 3622 Comm: python3 Kdump: loaded Not tainted 6.1.0-rc3+ #34 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 RIP: 0010:__rxe_put+0xc/0x60 [rdma_rxe] Code: cc cc cc 31 f6 e8 64 36 1b d3 41 b8 01 00 00 00 44 89 c0 c3 cc cc cc cc 41 89 c0 eb c1 90 0f 1f 44 00 00 41 54 b8 ff ff ff ff 0f c1 47 10 83 f8 01 74 11 45 31 e4 85 c0 7e 20 44 89 e0 41 5c RSP: 0018:ffffb27bc012ce78 EFLAGS: 00010246 RAX: 00000000ffffffff RBX: ffff9790857b0580 RCX: 0000000000000000 RDX: ffff979080fe145a RSI: 000055560e3e0000 RDI: 0000000000000000 RBP: ffff97909c7dd800 R08: 0000000000000001 R09: e7ce43d97f7bed0f R10: ffff97908b29c300 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: ffff97908b29c300 R15: 0000000000000000 FS: 00007f276f7bd740(0000) GS:ffff9792b5c80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000010 CR3: 0000000114230002 CR4: 0000000000060ee0 Call Trace: read_reply+0xda/0x310 [rdma_rxe] rxe_responder+0x82d/0xe50 [rdma_rxe] do_task+0x84/0x170 [rdma_rxe] tasklet_action_common.constprop.0+0xa7/0x120 __do_softirq+0xcb/0x2ac do_softirq+0x63/0x90 Support a NULL mr during read_reply() Fixes: 686d348476ee ("RDMA/rxe: Remove unnecessary mr testing") Fixes: b5f9a01fae42 ("RDMA/rxe: Fix mr leak in RESPST_ERR_RNR") Link: https://lore.kernel.org/r/20221209045926.531689-1-matsuda-daisuke@fujitsu.com Link: https://lore.kernel.org/r/20221202145713.13152-1-lizhijian@fujitsu.com Signed-off-by: Daisuke Matsuda Signed-off-by: Li Zhijian Reviewed-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_resp.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 12eb85e8d415..0c8bec281937 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -460,7 +460,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp, return RESPST_EXECUTE; } - /* A zero-byte op is not required to set an addr or rkey. */ + /* A zero-byte op is not required to set an addr or rkey. See C9-88 */ if ((pkt->mask & RXE_READ_OR_WRITE_MASK) && (pkt->mask & RXE_RETH_MASK) && reth_len(pkt) == 0) { @@ -880,13 +880,15 @@ static enum resp_states read_reply(struct rxe_qp *qp, skb = prepare_ack_packet(qp, &ack_pkt, opcode, payload, res->cur_psn, AETH_ACK_UNLIMITED); if (!skb) { - rxe_put(mr); + if (mr) + rxe_put(mr); return RESPST_ERR_RNR; } err = rxe_mr_copy(mr, res->read.va, payload_addr(&ack_pkt), payload, RXE_FROM_MR_OBJ); - rxe_put(mr); + if (mr) + rxe_put(mr); if (err) { kfree_skb(skb); return RESPST_ERR_RKEY_VIOLATION; From 689c5421bfe0eac65526bd97a466b9590a6aad3c Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 8 Dec 2022 15:09:46 -0600 Subject: [PATCH 110/122] RDMA/rxe: Fix incorrect responder length checking The code in rxe_resp.c at check_length() is incorrect as it compares pkt->opcode an 8 bit value against various mask bits which are all higher than 256 so nothing is ever reported. This patch rewrites this to compare against pkt->mask which is correct. However this now exposes another error. For UD send packets the value of the pmtu cannot be determined from qp->mtu. All that is required here is to later check if the payload fits into the posted receive buffer in that case. Fixes: 837a55847ead ("RDMA/rxe: Implement packet length validation on responder") Link: https://lore.kernel.org/r/20221208210945.28607-1-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Reviewed-by: Daisuke Matsuda Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_resp.c | 60 ++++++++++++++++------------ 1 file changed, 35 insertions(+), 25 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 0c8bec281937..abbaa41017e8 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -392,36 +392,46 @@ static enum resp_states check_resource(struct rxe_qp *qp, return RESPST_CHK_LENGTH; } -static enum resp_states check_length(struct rxe_qp *qp, - struct rxe_pkt_info *pkt) +static enum resp_states rxe_resp_check_length(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) { - int mtu = qp->mtu; - u32 payload = payload_size(pkt); - u32 dmalen = reth_len(pkt); - - /* RoCEv2 packets do not have LRH. - * Let's skip checking it. + /* + * See IBA C9-92 + * For UD QPs we only check if the packet will fit in the + * receive buffer later. For rmda operations additional + * length checks are performed in check_rkey. */ + if (pkt->mask & RXE_PAYLOAD_MASK && ((qp_type(qp) == IB_QPT_RC) || + (qp_type(qp) == IB_QPT_UC))) { + unsigned int mtu = qp->mtu; + unsigned int payload = payload_size(pkt); - if ((pkt->opcode & RXE_START_MASK) && - (pkt->opcode & RXE_END_MASK)) { - /* "only" packets */ - if (payload > mtu) - return RESPST_ERR_LENGTH; - } else if ((pkt->opcode & RXE_START_MASK) || - (pkt->opcode & RXE_MIDDLE_MASK)) { - /* "first" or "middle" packets */ - if (payload != mtu) - return RESPST_ERR_LENGTH; - } else if (pkt->opcode & RXE_END_MASK) { - /* "last" packets */ - if ((payload == 0) || (payload > mtu)) - return RESPST_ERR_LENGTH; + if ((pkt->mask & RXE_START_MASK) && + (pkt->mask & RXE_END_MASK)) { + if (unlikely(payload > mtu)) { + rxe_dbg_qp(qp, "only packet too long"); + return RESPST_ERR_LENGTH; + } + } else if ((pkt->mask & RXE_START_MASK) || + (pkt->mask & RXE_MIDDLE_MASK)) { + if (unlikely(payload != mtu)) { + rxe_dbg_qp(qp, "first or middle packet not mtu"); + return RESPST_ERR_LENGTH; + } + } else if (pkt->mask & RXE_END_MASK) { + if (unlikely((payload == 0) || (payload > mtu))) { + rxe_dbg_qp(qp, "last packet zero or too long"); + return RESPST_ERR_LENGTH; + } + } } - if (pkt->opcode & (RXE_WRITE_MASK | RXE_READ_MASK)) { - if (dmalen > (1 << 31)) + /* See IBA C9-94 */ + if (pkt->mask & RXE_RETH_MASK) { + if (reth_len(pkt) > (1U << 31)) { + rxe_dbg_qp(qp, "dma length too long"); return RESPST_ERR_LENGTH; + } } return RESPST_CHK_RKEY; @@ -1401,7 +1411,7 @@ int rxe_responder(void *arg) state = check_resource(qp, pkt); break; case RESPST_CHK_LENGTH: - state = check_length(qp, pkt); + state = rxe_resp_check_length(qp, pkt); break; case RESPST_CHK_RKEY: state = check_rkey(qp, pkt); From 0c17da492dc6c33cc5b99633adb4bd7b2587153c Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 6 Dec 2022 21:01:52 +0800 Subject: [PATCH 111/122] RDMA: Extend RDMA user ABI to support flush This commit extends the RDMA user ABI to support the flush operation defined in IBA A19.4.1. These changes are backward compatible with the existing RDMA user ABI. Link: https://lore.kernel.org/r/20221206130201.30986-2-lizhijian@fujitsu.com Reviewed-by: Zhu Yanjun Signed-off-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/ib_user_ioctl_verbs.h | 2 ++ include/uapi/rdma/ib_user_verbs.h | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h index e0c25537fd2e..d7c5aaa32744 100644 --- a/include/uapi/rdma/ib_user_ioctl_verbs.h +++ b/include/uapi/rdma/ib_user_ioctl_verbs.h @@ -57,6 +57,8 @@ enum ib_uverbs_access_flags { IB_UVERBS_ACCESS_ZERO_BASED = 1 << 5, IB_UVERBS_ACCESS_ON_DEMAND = 1 << 6, IB_UVERBS_ACCESS_HUGETLB = 1 << 7, + IB_UVERBS_ACCESS_FLUSH_GLOBAL = 1 << 8, + IB_UVERBS_ACCESS_FLUSH_PERSISTENT = 1 << 9, IB_UVERBS_ACCESS_RELAXED_ORDERING = IB_UVERBS_ACCESS_OPTIONAL_FIRST, IB_UVERBS_ACCESS_OPTIONAL_RANGE = diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 237814815544..e16650f0c85d 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -105,6 +105,18 @@ enum { IB_USER_VERBS_EX_CMD_MODIFY_CQ }; +/* see IBA A19.4.1.1 Placement Types */ +enum ib_placement_type { + IB_FLUSH_GLOBAL = 1U << 0, + IB_FLUSH_PERSISTENT = 1U << 1, +}; + +/* see IBA A19.4.1.2 Selectivity Level */ +enum ib_selectivity_level { + IB_FLUSH_RANGE = 0, + IB_FLUSH_MR, +}; + /* * Make sure that all structs defined in this file remain laid out so * that they pack the same way on 32-bit and 64-bit architectures (to @@ -466,6 +478,7 @@ enum ib_uverbs_wc_opcode { IB_UVERBS_WC_BIND_MW = 5, IB_UVERBS_WC_LOCAL_INV = 6, IB_UVERBS_WC_TSO = 7, + IB_UVERBS_WC_FLUSH = 8, IB_UVERBS_WC_ATOMIC_WRITE = 9, }; @@ -785,6 +798,7 @@ enum ib_uverbs_wr_opcode { IB_UVERBS_WR_RDMA_READ_WITH_INV = 11, IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP = 12, IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD = 13, + IB_UVERBS_WR_FLUSH = 14, IB_UVERBS_WR_ATOMIC_WRITE = 15, /* Review enum ib_wr_opcode before modifying this */ }; @@ -1333,6 +1347,9 @@ enum ib_uverbs_device_cap_flags { /* Deprecated. Please use IB_UVERBS_RAW_PACKET_CAP_SCATTER_FCS. */ IB_UVERBS_DEVICE_RAW_SCATTER_FCS = 1ULL << 34, IB_UVERBS_DEVICE_PCI_WRITE_END_PADDING = 1ULL << 36, + /* Flush placement types */ + IB_UVERBS_DEVICE_FLUSH_GLOBAL = 1ULL << 38, + IB_UVERBS_DEVICE_FLUSH_PERSISTENT = 1ULL << 39, /* Atomic write attributes */ IB_UVERBS_DEVICE_ATOMIC_WRITE = 1ULL << 40, }; From 208e3a134b50d95ea3962d7a37b4d8a8f5368376 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 6 Dec 2022 21:01:53 +0800 Subject: [PATCH 112/122] RDMA: Extend RDMA kernel verbs ABI to support flush This commit extends the RDMA kernel verbs ABI to support the flush operation defined in IBA A19.4.1. These changes are backward compatible with the existing RDMA kernel verbs ABI. It makes device/HCA support new FLUSH attributes/capabilities, and it also makes memory region support new FLUSH access flags. Users can use ibv_reg_mr(3) to register flush access flags. Only the access flags also supported by device's capabilities can be registered successfully. Once registered successfully, it means the MR is flushable. Similarly, A flushable MR should also have one or both of GLOBAL_VISIBILITY and PERSISTENT attributes/capabilities like device/HCA. Link: https://lore.kernel.org/r/20221206130201.30986-3-lizhijian@fujitsu.com Reviewed-by: Zhu Yanjun Signed-off-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- include/rdma/ib_pack.h | 3 +++ include/rdma/ib_verbs.h | 18 +++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h index f932d164af63..b8c56d7dc35d 100644 --- a/include/rdma/ib_pack.h +++ b/include/rdma/ib_pack.h @@ -84,6 +84,7 @@ enum { /* opcode 0x15 is reserved */ IB_OPCODE_SEND_LAST_WITH_INVALIDATE = 0x16, IB_OPCODE_SEND_ONLY_WITH_INVALIDATE = 0x17, + IB_OPCODE_FLUSH = 0x1C, IB_OPCODE_ATOMIC_WRITE = 0x1D, /* real constants follow -- see comment about above IB_OPCODE() @@ -113,6 +114,7 @@ enum { IB_OPCODE(RC, FETCH_ADD), IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE), IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE), + IB_OPCODE(RC, FLUSH), IB_OPCODE(RC, ATOMIC_WRITE), /* UC */ @@ -151,6 +153,7 @@ enum { IB_OPCODE(RD, ATOMIC_ACKNOWLEDGE), IB_OPCODE(RD, COMPARE_SWAP), IB_OPCODE(RD, FETCH_ADD), + IB_OPCODE(RD, FLUSH), /* UD */ IB_OPCODE(UD, SEND_ONLY), diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index df6bb26ba0be..a9a429172c0a 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -270,6 +270,9 @@ enum ib_device_cap_flags { /* The device supports padding incoming writes to cacheline. */ IB_DEVICE_PCI_WRITE_END_PADDING = IB_UVERBS_DEVICE_PCI_WRITE_END_PADDING, + /* Placement type attributes */ + IB_DEVICE_FLUSH_GLOBAL = IB_UVERBS_DEVICE_FLUSH_GLOBAL, + IB_DEVICE_FLUSH_PERSISTENT = IB_UVERBS_DEVICE_FLUSH_PERSISTENT, IB_DEVICE_ATOMIC_WRITE = IB_UVERBS_DEVICE_ATOMIC_WRITE, }; @@ -987,6 +990,7 @@ enum ib_wc_opcode { IB_WC_REG_MR, IB_WC_MASKED_COMP_SWAP, IB_WC_MASKED_FETCH_ADD, + IB_WC_FLUSH = IB_UVERBS_WC_FLUSH, /* * Set value of IB_WC_RECV so consumers can test if a completion is a * receive by testing (opcode & IB_WC_RECV). @@ -1327,6 +1331,7 @@ enum ib_wr_opcode { IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP, IB_WR_MASKED_ATOMIC_FETCH_AND_ADD = IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD, + IB_WR_FLUSH = IB_UVERBS_WR_FLUSH, IB_WR_ATOMIC_WRITE = IB_UVERBS_WR_ATOMIC_WRITE, /* These are kernel only and can not be issued by userspace */ @@ -1461,10 +1466,12 @@ enum ib_access_flags { IB_ACCESS_ON_DEMAND = IB_UVERBS_ACCESS_ON_DEMAND, IB_ACCESS_HUGETLB = IB_UVERBS_ACCESS_HUGETLB, IB_ACCESS_RELAXED_ORDERING = IB_UVERBS_ACCESS_RELAXED_ORDERING, + IB_ACCESS_FLUSH_GLOBAL = IB_UVERBS_ACCESS_FLUSH_GLOBAL, + IB_ACCESS_FLUSH_PERSISTENT = IB_UVERBS_ACCESS_FLUSH_PERSISTENT, IB_ACCESS_OPTIONAL = IB_UVERBS_ACCESS_OPTIONAL_RANGE, IB_ACCESS_SUPPORTED = - ((IB_ACCESS_HUGETLB << 1) - 1) | IB_ACCESS_OPTIONAL, + ((IB_ACCESS_FLUSH_PERSISTENT << 1) - 1) | IB_ACCESS_OPTIONAL, }; /* @@ -4325,6 +4332,8 @@ int ib_dealloc_xrcd_user(struct ib_xrcd *xrcd, struct ib_udata *udata); static inline int ib_check_mr_access(struct ib_device *ib_dev, unsigned int flags) { + u64 device_cap = ib_dev->attrs.device_cap_flags; + /* * Local write permission is required if remote write or * remote atomic permission is also requested. @@ -4339,6 +4348,13 @@ static inline int ib_check_mr_access(struct ib_device *ib_dev, if (flags & IB_ACCESS_ON_DEMAND && !(ib_dev->attrs.kernel_cap_flags & IBK_ON_DEMAND_PAGING)) return -EOPNOTSUPP; + + if ((flags & IB_ACCESS_FLUSH_GLOBAL && + !(device_cap & IB_DEVICE_FLUSH_GLOBAL)) || + (flags & IB_ACCESS_FLUSH_PERSISTENT && + !(device_cap & IB_DEVICE_FLUSH_PERSISTENT))) + return -EOPNOTSUPP; + return 0; } From 668ce52d5eef477c0def757610768a1a3ccc9785 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 6 Dec 2022 21:01:54 +0800 Subject: [PATCH 113/122] RDMA/rxe: Extend rxe user ABI to support flush This commit extends the rxe user ABI to support the flush operation defined in IBA A19.4.1. These changes are backward compatible with the existing rxe user ABI. The user API request a flush by filling this structure. Link: https://lore.kernel.org/r/20221206130201.30986-4-lizhijian@fujitsu.com Reviewed-by: Zhu Yanjun Signed-off-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/rdma_user_rxe.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h index d20d1ecf046f..bb092fccb813 100644 --- a/include/uapi/rdma/rdma_user_rxe.h +++ b/include/uapi/rdma/rdma_user_rxe.h @@ -82,6 +82,13 @@ struct rxe_send_wr { __u32 invalidate_rkey; } ex; union { + struct { + __aligned_u64 remote_addr; + __u32 length; + __u32 rkey; + __u8 type; + __u8 level; + } flush; struct { __aligned_u64 remote_addr; __u32 rkey; From 02ea0a511558c907bde0e01fdebcd4536924d996 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 6 Dec 2022 21:01:55 +0800 Subject: [PATCH 114/122] RDMA/rxe: Allow registering persistent flag for pmem MR only Memory region could support at most 2 flush access flags: IB_ACCESS_FLUSH_PERSISTENT and IB_ACCESS_FLUSH_GLOBAL But we only allow user to register persistent flush flags to the pmem MR where it has the ability of persisting data across power cycles. So registering a persistent access flag to a non-pmem MR will be rejected. Link: https://lore.kernel.org/r/20221206130201.30986-5-lizhijian@fujitsu.com CC: Dan Williams Signed-off-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mr.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index b7c9ff1ddf0e..81a438e5010a 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -111,6 +111,15 @@ void rxe_mr_init_dma(int access, struct rxe_mr *mr) mr->ibmr.type = IB_MR_TYPE_DMA; } +static bool is_pmem_page(struct page *pg) +{ + unsigned long paddr = page_to_phys(pg); + + return REGION_INTERSECTS == + region_intersects(paddr, PAGE_SIZE, IORESOURCE_MEM, + IORES_DESC_PERSISTENT_MEMORY); +} + int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, int access, struct rxe_mr *mr) { @@ -146,16 +155,25 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, num_buf = 0; map = mr->map; if (length > 0) { - buf = map[0]->buf; + bool persistent_access = access & IB_ACCESS_FLUSH_PERSISTENT; + buf = map[0]->buf; for_each_sgtable_page (&umem->sgt_append.sgt, &sg_iter, 0) { + struct page *pg = sg_page_iter_page(&sg_iter); + + if (persistent_access && !is_pmem_page(pg)) { + rxe_dbg_mr(mr, "Unable to register persistent access to non-pmem device\n"); + err = -EINVAL; + goto err_release_umem; + } + if (num_buf >= RXE_BUF_PER_MAP) { map++; buf = map[0]->buf; num_buf = 0; } - vaddr = page_address(sg_page_iter_page(&sg_iter)); + vaddr = page_address(pg); if (!vaddr) { rxe_dbg_mr(mr, "Unable to get virtual address\n"); err = -ENOMEM; From 02e9a31c897d17981508ceaac4430b93ff56ffc7 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 6 Dec 2022 21:01:56 +0800 Subject: [PATCH 115/122] RDMA/rxe: Extend rxe packet format to support flush Extend rxe opcode tables, headers, helper and constants to support flush operations. Refer to the IBA A19.4.1 for more FETH definition details Link: https://lore.kernel.org/r/20221206130201.30986-6-lizhijian@fujitsu.com Reviewed-by: Zhu Yanjun Signed-off-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_hdr.h | 47 ++++++++++++++++++++++++++ drivers/infiniband/sw/rxe/rxe_opcode.c | 17 ++++++++++ drivers/infiniband/sw/rxe/rxe_opcode.h | 14 +++++--- 3 files changed, 73 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_hdr.h b/drivers/infiniband/sw/rxe/rxe_hdr.h index 804594b76040..46f82b27fcd2 100644 --- a/drivers/infiniband/sw/rxe/rxe_hdr.h +++ b/drivers/infiniband/sw/rxe/rxe_hdr.h @@ -607,6 +607,52 @@ static inline void reth_set_len(struct rxe_pkt_info *pkt, u32 len) rxe_opcode[pkt->opcode].offset[RXE_RETH], len); } +/****************************************************************************** + * FLUSH Extended Transport Header + ******************************************************************************/ + +struct rxe_feth { + __be32 bits; +}; + +#define FETH_PLT_MASK (0x0000000f) /* bits 3-0 */ +#define FETH_SEL_MASK (0x00000030) /* bits 5-4 */ +#define FETH_SEL_SHIFT (4U) + +static inline u32 __feth_plt(void *arg) +{ + struct rxe_feth *feth = arg; + + return be32_to_cpu(feth->bits) & FETH_PLT_MASK; +} + +static inline u32 __feth_sel(void *arg) +{ + struct rxe_feth *feth = arg; + + return (be32_to_cpu(feth->bits) & FETH_SEL_MASK) >> FETH_SEL_SHIFT; +} + +static inline u32 feth_plt(struct rxe_pkt_info *pkt) +{ + return __feth_plt(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_FETH]); +} + +static inline u32 feth_sel(struct rxe_pkt_info *pkt) +{ + return __feth_sel(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_FETH]); +} + +static inline void feth_init(struct rxe_pkt_info *pkt, u8 type, u8 level) +{ + struct rxe_feth *feth = (struct rxe_feth *) + (pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_FETH]); + u32 bits = ((level << FETH_SEL_SHIFT) & FETH_SEL_MASK) | + (type & FETH_PLT_MASK); + + feth->bits = cpu_to_be32(bits); +} + /****************************************************************************** * Atomic Extended Transport Header ******************************************************************************/ @@ -909,6 +955,7 @@ enum rxe_hdr_length { RXE_ATMETH_BYTES = sizeof(struct rxe_atmeth), RXE_IETH_BYTES = sizeof(struct rxe_ieth), RXE_RDETH_BYTES = sizeof(struct rxe_rdeth), + RXE_FETH_BYTES = sizeof(struct rxe_feth), }; static inline size_t header_size(struct rxe_pkt_info *pkt) diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.c b/drivers/infiniband/sw/rxe/rxe_opcode.c index fb196029048e..5c0d5c6ffda4 100644 --- a/drivers/infiniband/sw/rxe/rxe_opcode.c +++ b/drivers/infiniband/sw/rxe/rxe_opcode.c @@ -101,6 +101,12 @@ struct rxe_wr_opcode_info rxe_wr_opcode_info[] = { [IB_QPT_UC] = WR_LOCAL_OP_MASK, }, }, + [IB_WR_FLUSH] = { + .name = "IB_WR_FLUSH", + .mask = { + [IB_QPT_RC] = WR_FLUSH_MASK, + }, + }, [IB_WR_ATOMIC_WRITE] = { .name = "IB_WR_ATOMIC_WRITE", .mask = { @@ -384,6 +390,17 @@ struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE] = { RXE_IETH_BYTES, } }, + [IB_OPCODE_RC_FLUSH] = { + .name = "IB_OPCODE_RC_FLUSH", + .mask = RXE_FETH_MASK | RXE_RETH_MASK | RXE_FLUSH_MASK | + RXE_START_MASK | RXE_END_MASK | RXE_REQ_MASK, + .length = RXE_BTH_BYTES + RXE_FETH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_FETH] = RXE_BTH_BYTES, + [RXE_RETH] = RXE_BTH_BYTES + RXE_FETH_BYTES, + } + }, [IB_OPCODE_RC_ATOMIC_WRITE] = { .name = "IB_OPCODE_RC_ATOMIC_WRITE", .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK | diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.h b/drivers/infiniband/sw/rxe/rxe_opcode.h index a470e9b0b884..cea4e0a63919 100644 --- a/drivers/infiniband/sw/rxe/rxe_opcode.h +++ b/drivers/infiniband/sw/rxe/rxe_opcode.h @@ -20,6 +20,7 @@ enum rxe_wr_mask { WR_READ_MASK = BIT(3), WR_WRITE_MASK = BIT(4), WR_LOCAL_OP_MASK = BIT(5), + WR_FLUSH_MASK = BIT(6), WR_ATOMIC_WRITE_MASK = BIT(7), WR_READ_OR_WRITE_MASK = WR_READ_MASK | WR_WRITE_MASK, @@ -48,6 +49,7 @@ enum rxe_hdr_type { RXE_RDETH, RXE_DETH, RXE_IMMDT, + RXE_FETH, RXE_PAYLOAD, NUM_HDR_TYPES }; @@ -64,6 +66,7 @@ enum rxe_hdr_mask { RXE_IETH_MASK = BIT(RXE_IETH), RXE_RDETH_MASK = BIT(RXE_RDETH), RXE_DETH_MASK = BIT(RXE_DETH), + RXE_FETH_MASK = BIT(RXE_FETH), RXE_PAYLOAD_MASK = BIT(RXE_PAYLOAD), RXE_REQ_MASK = BIT(NUM_HDR_TYPES + 0), @@ -72,13 +75,14 @@ enum rxe_hdr_mask { RXE_WRITE_MASK = BIT(NUM_HDR_TYPES + 3), RXE_READ_MASK = BIT(NUM_HDR_TYPES + 4), RXE_ATOMIC_MASK = BIT(NUM_HDR_TYPES + 5), + RXE_FLUSH_MASK = BIT(NUM_HDR_TYPES + 6), - RXE_RWR_MASK = BIT(NUM_HDR_TYPES + 6), - RXE_COMP_MASK = BIT(NUM_HDR_TYPES + 7), + RXE_RWR_MASK = BIT(NUM_HDR_TYPES + 7), + RXE_COMP_MASK = BIT(NUM_HDR_TYPES + 8), - RXE_START_MASK = BIT(NUM_HDR_TYPES + 8), - RXE_MIDDLE_MASK = BIT(NUM_HDR_TYPES + 9), - RXE_END_MASK = BIT(NUM_HDR_TYPES + 10), + RXE_START_MASK = BIT(NUM_HDR_TYPES + 9), + RXE_MIDDLE_MASK = BIT(NUM_HDR_TYPES + 10), + RXE_END_MASK = BIT(NUM_HDR_TYPES + 11), RXE_LOOPBACK_MASK = BIT(NUM_HDR_TYPES + 12), From fa1fd682ad3ef35b0e532c3bb14140786d17527c Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 6 Dec 2022 21:01:57 +0800 Subject: [PATCH 116/122] RDMA/rxe: Implement RC RDMA FLUSH service in requester side Implement FLUSH request operation in the requester. Link: https://lore.kernel.org/r/20221206130201.30986-7-lizhijian@fujitsu.com Reviewed-by: Zhu Yanjun Signed-off-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_req.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 2713e9058922..899c8779f800 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -241,6 +241,9 @@ static int next_opcode_rc(struct rxe_qp *qp, u32 opcode, int fits) IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE : IB_OPCODE_RC_SEND_FIRST; + case IB_WR_FLUSH: + return IB_OPCODE_RC_FLUSH; + case IB_WR_RDMA_READ: return IB_OPCODE_RC_RDMA_READ_REQUEST; @@ -425,11 +428,18 @@ static struct sk_buff *init_req_packet(struct rxe_qp *qp, /* init optional headers */ if (pkt->mask & RXE_RETH_MASK) { - reth_set_rkey(pkt, ibwr->wr.rdma.rkey); + if (pkt->mask & RXE_FETH_MASK) + reth_set_rkey(pkt, ibwr->wr.flush.rkey); + else + reth_set_rkey(pkt, ibwr->wr.rdma.rkey); reth_set_va(pkt, wqe->iova); reth_set_len(pkt, wqe->dma.resid); } + /* Fill Flush Extension Transport Header */ + if (pkt->mask & RXE_FETH_MASK) + feth_init(pkt, ibwr->wr.flush.type, ibwr->wr.flush.level); + if (pkt->mask & RXE_IMMDT_MASK) immdt_set_imm(pkt, ibwr->ex.imm_data); @@ -488,6 +498,9 @@ static int finish_packet(struct rxe_qp *qp, struct rxe_av *av, memset(pad, 0, bth_pad(pkt)); } + } else if (pkt->mask & RXE_FLUSH_MASK) { + /* oA19-2: shall have no payload. */ + wqe->dma.resid = 0; } if (pkt->mask & RXE_ATOMIC_WRITE_MASK) { From ea1bb00ee9a5527b032a6efebe4a879db4cb42bb Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 6 Dec 2022 21:01:58 +0800 Subject: [PATCH 117/122] RDMA/rxe: Implement flush execution in responder side Only the requested placement types that also registered in the destination memory region are acceptable. Otherwise, responder will also reply NAK "Remote Access Error" if it found a placement type violation. We will persist data via arch_wb_cache_pmem(), which could be architecture specific. This commit also adds 2 helpers to update qp.resp from the incoming packet. Link: https://lore.kernel.org/r/20221206130201.30986-8-lizhijian@fujitsu.com Reviewed-by: Zhu Yanjun Signed-off-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_loc.h | 1 + drivers/infiniband/sw/rxe/rxe_mr.c | 36 ++++++ drivers/infiniband/sw/rxe/rxe_resp.c | 160 ++++++++++++++++++++++---- drivers/infiniband/sw/rxe/rxe_verbs.h | 6 + 4 files changed, 183 insertions(+), 20 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index a22476d27b38..948ce4902b10 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -64,6 +64,7 @@ void rxe_mr_init_dma(int access, struct rxe_mr *mr); int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, int access, struct rxe_mr *mr); int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr); +int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, int length); int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length, enum rxe_mr_copy_dir dir); int copy_data(struct rxe_pd *pd, int access, struct rxe_dma_info *dma, diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index 81a438e5010a..072eac4b65d2 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -4,6 +4,8 @@ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. */ +#include + #include "rxe.h" #include "rxe_loc.h" @@ -192,6 +194,7 @@ int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, mr->offset = ib_umem_offset(umem); mr->state = RXE_MR_STATE_VALID; mr->ibmr.type = IB_MR_TYPE_USER; + mr->ibmr.page_size = PAGE_SIZE; return 0; @@ -295,6 +298,39 @@ out: return addr; } +int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, int length) +{ + size_t offset; + + if (length == 0) + return 0; + + if (mr->ibmr.type == IB_MR_TYPE_DMA) + return -EFAULT; + + offset = (iova - mr->ibmr.iova + mr->offset) & mr->page_mask; + while (length > 0) { + u8 *va; + int bytes; + + bytes = mr->ibmr.page_size - offset; + if (bytes > length) + bytes = length; + + va = iova_to_vaddr(mr, iova, length); + if (!va) + return -EFAULT; + + arch_wb_cache_pmem(va, bytes); + + length -= bytes; + iova += bytes; + offset = 0; + } + + return 0; +} + /* copy data from a range (vaddr, vaddr+length-1) to or from * a mr object starting at iova. */ diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index abbaa41017e8..7a60c7709da0 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -23,6 +23,7 @@ enum resp_states { RESPST_READ_REPLY, RESPST_ATOMIC_REPLY, RESPST_ATOMIC_WRITE_REPLY, + RESPST_PROCESS_FLUSH, RESPST_COMPLETE, RESPST_ACKNOWLEDGE, RESPST_CLEANUP, @@ -59,6 +60,7 @@ static char *resp_state_name[] = { [RESPST_READ_REPLY] = "READ_REPLY", [RESPST_ATOMIC_REPLY] = "ATOMIC_REPLY", [RESPST_ATOMIC_WRITE_REPLY] = "ATOMIC_WRITE_REPLY", + [RESPST_PROCESS_FLUSH] = "PROCESS_FLUSH", [RESPST_COMPLETE] = "COMPLETE", [RESPST_ACKNOWLEDGE] = "ACKNOWLEDGE", [RESPST_CLEANUP] = "CLEANUP", @@ -258,19 +260,37 @@ static enum resp_states check_op_seq(struct rxe_qp *qp, } } +static bool check_qp_attr_access(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + if (((pkt->mask & RXE_READ_MASK) && + !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_READ)) || + ((pkt->mask & (RXE_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) && + !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE)) || + ((pkt->mask & RXE_ATOMIC_MASK) && + !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) + return false; + + if (pkt->mask & RXE_FLUSH_MASK) { + u32 flush_type = feth_plt(pkt); + + if ((flush_type & IB_FLUSH_GLOBAL && + !(qp->attr.qp_access_flags & IB_ACCESS_FLUSH_GLOBAL)) || + (flush_type & IB_FLUSH_PERSISTENT && + !(qp->attr.qp_access_flags & IB_ACCESS_FLUSH_PERSISTENT))) + return false; + } + + return true; +} + static enum resp_states check_op_valid(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { switch (qp_type(qp)) { case IB_QPT_RC: - if (((pkt->mask & RXE_READ_MASK) && - !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_READ)) || - ((pkt->mask & (RXE_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) && - !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE)) || - ((pkt->mask & RXE_ATOMIC_MASK) && - !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) { + if (!check_qp_attr_access(qp, pkt)) return RESPST_ERR_UNSUPPORTED_OPCODE; - } break; @@ -437,6 +457,23 @@ static enum resp_states rxe_resp_check_length(struct rxe_qp *qp, return RESPST_CHK_RKEY; } +static void qp_resp_from_reth(struct rxe_qp *qp, struct rxe_pkt_info *pkt) +{ + qp->resp.va = reth_va(pkt); + qp->resp.offset = 0; + qp->resp.rkey = reth_rkey(pkt); + qp->resp.resid = reth_len(pkt); + qp->resp.length = reth_len(pkt); +} + +static void qp_resp_from_atmeth(struct rxe_qp *qp, struct rxe_pkt_info *pkt) +{ + qp->resp.va = atmeth_va(pkt); + qp->resp.offset = 0; + qp->resp.rkey = atmeth_rkey(pkt); + qp->resp.resid = sizeof(u64); +} + static enum resp_states check_rkey(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { @@ -448,23 +485,26 @@ static enum resp_states check_rkey(struct rxe_qp *qp, u32 pktlen; int mtu = qp->mtu; enum resp_states state; - int access; + int access = 0; if (pkt->mask & (RXE_READ_OR_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) { - if (pkt->mask & RXE_RETH_MASK) { - qp->resp.va = reth_va(pkt); - qp->resp.offset = 0; - qp->resp.rkey = reth_rkey(pkt); - qp->resp.resid = reth_len(pkt); - qp->resp.length = reth_len(pkt); - } + if (pkt->mask & RXE_RETH_MASK) + qp_resp_from_reth(qp, pkt); + access = (pkt->mask & RXE_READ_MASK) ? IB_ACCESS_REMOTE_READ : IB_ACCESS_REMOTE_WRITE; + } else if (pkt->mask & RXE_FLUSH_MASK) { + u32 flush_type = feth_plt(pkt); + + if (pkt->mask & RXE_RETH_MASK) + qp_resp_from_reth(qp, pkt); + + if (flush_type & IB_FLUSH_GLOBAL) + access |= IB_ACCESS_FLUSH_GLOBAL; + if (flush_type & IB_FLUSH_PERSISTENT) + access |= IB_ACCESS_FLUSH_PERSISTENT; } else if (pkt->mask & RXE_ATOMIC_MASK) { - qp->resp.va = atmeth_va(pkt); - qp->resp.offset = 0; - qp->resp.rkey = atmeth_rkey(pkt); - qp->resp.resid = sizeof(u64); + qp_resp_from_atmeth(qp, pkt); access = IB_ACCESS_REMOTE_ATOMIC; } else { return RESPST_EXECUTE; @@ -511,11 +551,20 @@ static enum resp_states check_rkey(struct rxe_qp *qp, } } + if (pkt->mask & RXE_FLUSH_MASK) { + /* FLUSH MR may not set va or resid + * no need to check range since we will flush whole mr + */ + if (feth_sel(pkt) == IB_FLUSH_MR) + goto skip_check_range; + } + if (mr_check_range(mr, va + qp->resp.offset, resid)) { state = RESPST_ERR_RKEY_VIOLATION; goto err; } +skip_check_range: if (pkt->mask & (RXE_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) { if (resid > mtu) { if (pktlen != mtu || bth_pad(pkt)) { @@ -621,11 +670,61 @@ static struct resp_res *rxe_prepare_res(struct rxe_qp *qp, res->last_psn = pkt->psn; res->cur_psn = pkt->psn; break; + case RXE_FLUSH_MASK: + res->flush.va = qp->resp.va + qp->resp.offset; + res->flush.length = qp->resp.length; + res->flush.type = feth_plt(pkt); + res->flush.level = feth_sel(pkt); } return res; } +static enum resp_states process_flush(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + u64 length, start; + struct rxe_mr *mr = qp->resp.mr; + struct resp_res *res = qp->resp.res; + + /* oA19-14, oA19-15 */ + if (res && res->replay) + return RESPST_ACKNOWLEDGE; + else if (!res) { + res = rxe_prepare_res(qp, pkt, RXE_FLUSH_MASK); + qp->resp.res = res; + } + + if (res->flush.level == IB_FLUSH_RANGE) { + start = res->flush.va; + length = res->flush.length; + } else { /* level == IB_FLUSH_MR */ + start = mr->ibmr.iova; + length = mr->ibmr.length; + } + + if (res->flush.type & IB_FLUSH_PERSISTENT) { + if (rxe_flush_pmem_iova(mr, start, length)) + return RESPST_ERR_RKEY_VIOLATION; + /* Make data persistent. */ + wmb(); + } else if (res->flush.type & IB_FLUSH_GLOBAL) { + /* Make data global visibility. */ + wmb(); + } + + qp->resp.msn++; + + /* next expected psn, read handles this separately */ + qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK; + qp->resp.ack_psn = qp->resp.psn; + + qp->resp.opcode = pkt->opcode; + qp->resp.status = IB_WC_SUCCESS; + + return RESPST_ACKNOWLEDGE; +} + /* Guarantee atomicity of atomic operations at the machine level. */ static DEFINE_SPINLOCK(atomic_ops_lock); @@ -980,6 +1079,8 @@ static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt) return RESPST_ATOMIC_REPLY; } else if (pkt->mask & RXE_ATOMIC_WRITE_MASK) { return RESPST_ATOMIC_WRITE_REPLY; + } else if (pkt->mask & RXE_FLUSH_MASK) { + return RESPST_PROCESS_FLUSH; } else { /* Unreachable */ WARN_ON_ONCE(1); @@ -1176,7 +1277,7 @@ static enum resp_states acknowledge(struct rxe_qp *qp, send_ack(qp, qp->resp.aeth_syndrome, pkt->psn); else if (pkt->mask & RXE_ATOMIC_MASK) send_atomic_ack(qp, AETH_ACK_UNLIMITED, pkt->psn); - else if (pkt->mask & RXE_ATOMIC_WRITE_MASK) + else if (pkt->mask & (RXE_FLUSH_MASK | RXE_ATOMIC_WRITE_MASK)) send_read_response_ack(qp, AETH_ACK_UNLIMITED, pkt->psn); else if (bth_ack(pkt)) send_ack(qp, AETH_ACK_UNLIMITED, pkt->psn); @@ -1234,6 +1335,22 @@ static enum resp_states duplicate_request(struct rxe_qp *qp, /* SEND. Ack again and cleanup. C9-105. */ send_ack(qp, AETH_ACK_UNLIMITED, prev_psn); return RESPST_CLEANUP; + } else if (pkt->mask & RXE_FLUSH_MASK) { + struct resp_res *res; + + /* Find the operation in our list of responder resources. */ + res = find_resource(qp, pkt->psn); + if (res) { + res->replay = 1; + res->cur_psn = pkt->psn; + qp->resp.res = res; + rc = RESPST_PROCESS_FLUSH; + goto out; + } + + /* Resource not found. Class D error. Drop the request. */ + rc = RESPST_CLEANUP; + goto out; } else if (pkt->mask & RXE_READ_MASK) { struct resp_res *res; @@ -1431,6 +1548,9 @@ int rxe_responder(void *arg) case RESPST_ATOMIC_WRITE_REPLY: state = atomic_write_reply(qp, pkt); break; + case RESPST_PROCESS_FLUSH: + state = process_flush(qp, pkt); + break; case RESPST_ACKNOWLEDGE: state = acknowledge(qp, pkt); break; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 22a299b0a9f0..19ddfa890480 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -165,6 +165,12 @@ struct resp_res { u64 va; u32 resid; } read; + struct { + u32 length; + u64 va; + u8 type; + u8 level; + } flush; }; }; From 70aad902ce8aeb094dd3ef14988a652f24cce7c8 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 6 Dec 2022 21:01:59 +0800 Subject: [PATCH 118/122] RDMA/rxe: Implement flush completion Per IBA SPEC, FLUSH will ack in rdma read response with 0 length. Use IB_WC_FLUSH (aka IB_UVERBS_WC_FLUSH) code to tell userspace a FLUSH completion. Link: https://lore.kernel.org/r/20221206130201.30986-9-lizhijian@fujitsu.com Reviewed-by: Zhu Yanjun Signed-off-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_comp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index 1c525325e271..20737fec392b 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -105,6 +105,7 @@ static enum ib_wc_opcode wr_to_wc_opcode(enum ib_wr_opcode opcode) case IB_WR_REG_MR: return IB_WC_REG_MR; case IB_WR_BIND_MW: return IB_WC_BIND_MW; case IB_WR_ATOMIC_WRITE: return IB_WC_ATOMIC_WRITE; + case IB_WR_FLUSH: return IB_WC_FLUSH; default: return 0xff; @@ -278,7 +279,8 @@ static inline enum comp_state check_ack(struct rxe_qp *qp, */ case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE: if (wqe->wr.opcode != IB_WR_RDMA_READ && - wqe->wr.opcode != IB_WR_RDMA_READ_WITH_INV) { + wqe->wr.opcode != IB_WR_RDMA_READ_WITH_INV && + wqe->wr.opcode != IB_WR_FLUSH) { wqe->status = IB_WC_FATAL_ERR; return COMPST_ERROR; } From 8b4d379b399d19f4c803e565bfe13f07b66b5ad7 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 6 Dec 2022 21:02:00 +0800 Subject: [PATCH 119/122] RDMA/cm: Make QP FLUSHABLE for supported device Similar to RDMA and Atomic qp attributes enabled by default in CM, enable FLUSH attribute for supported device. That makes applications that are built with rdma_create_ep, rdma_accept APIs have FLUSH qp attribute natively so that user is able to request FLUSH operation simpler. Note that, a FLUSH operation requires FLUSH are supported by both device(HCA) and memory region(MR) and QP at the same time, so it's safe to enable FLUSH qp attribute by default here. FLUSH attribute can be disable by modify_qp() interface. Link: https://lore.kernel.org/r/20221206130201.30986-10-lizhijian@fujitsu.com Signed-off-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 1f9938a2c475..603c0aecc361 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -4094,9 +4094,18 @@ static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv, *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT; qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE; - if (cm_id_priv->responder_resources) + if (cm_id_priv->responder_resources) { + struct ib_device *ib_dev = cm_id_priv->id.device; + u64 support_flush = ib_dev->attrs.device_cap_flags & + (IB_DEVICE_FLUSH_GLOBAL | IB_DEVICE_FLUSH_PERSISTENT); + u32 flushable = support_flush ? + (IB_ACCESS_FLUSH_GLOBAL | + IB_ACCESS_FLUSH_PERSISTENT) : 0; + qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_READ | - IB_ACCESS_REMOTE_ATOMIC; + IB_ACCESS_REMOTE_ATOMIC | + flushable; + } qp_attr->pkey_index = cm_id_priv->av.pkey_index; if (cm_id_priv->av.port) qp_attr->port_num = cm_id_priv->av.port->port_num; From 124011e6e933bead5852c3f69b32dec43919fe1a Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 6 Dec 2022 21:02:01 +0800 Subject: [PATCH 120/122] RDMA/rxe: Enable RDMA FLUSH capability for rxe device Now we are ready to enable RDMA FLUSH capability for RXE. It can support Global Visibility and Persistence placement types. Link: https://lore.kernel.org/r/20221206130201.30986-11-lizhijian@fujitsu.com Reviewed-by: Zhu Yanjun Signed-off-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_param.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h index bbc88cd71d95..a754fc902e3d 100644 --- a/drivers/infiniband/sw/rxe/rxe_param.h +++ b/drivers/infiniband/sw/rxe/rxe_param.h @@ -51,6 +51,8 @@ enum rxe_device_param { | IB_DEVICE_SRQ_RESIZE | IB_DEVICE_MEM_MGT_EXTENSIONS | IB_DEVICE_MEM_WINDOW + | IB_DEVICE_FLUSH_GLOBAL + | IB_DEVICE_FLUSH_PERSISTENT #ifdef CONFIG_64BIT | IB_DEVICE_MEM_WINDOW_TYPE_2B | IB_DEVICE_ATOMIC_WRITE, From e42f9c2e6aad583986e91979bf2fce47aaced1c2 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 9 Dec 2022 10:21:56 -0400 Subject: [PATCH 121/122] RDMA: Add missed netdev_put() for the netdevice_tracker The netdev core will detect if any untracked puts are done on tracked pointers and throw refcount warnings: refcount_t: decrement hit 0; leaking memory. WARNING: CPU: 1 PID: 33 at lib/refcount.c:31 refcount_warn_saturate+0x1d7/0x1f0 lib/refcount.c:31 Modules linked in: CPU: 1 PID: 33 Comm: kworker/u4:2 Not tainted 6.1.0-rc8-next-20221207-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022 Workqueue: ib-unreg-wq ib_unregister_work RIP: 0010:refcount_warn_saturate+0x1d7/0x1f0 lib/refcount.c:31 Code: 05 5a 60 51 0a 01 e8 35 0a b5 05 0f 0b e9 d3 fe ff ff e8 6c 9b 75 fd 48 c7 c7 c0 6d a6 8a c6 05 37 60 51 0a 01 e8 16 0a b5 05 <0f> 0b e9 b4 fe +ff ff 48 89 ef e8 5a b5 c3 fd e9 5c fe ff ff 0f 1f RSP: 0018:ffffc90000aa7b30 EFLAGS: 00010082 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: ffff8880172f9d40 RSI: ffffffff8166b1dc RDI: fffff52000154f58 RBP: ffff88807906c600 R08: 0000000000000005 R09: 0000000000000000 R10: 0000000080000001 R11: 0000000000000000 R12: 1ffff92000154f6b R13: 0000000000000000 R14: ffff88807906c600 R15: ffff888046894000 FS: 0000000000000000(0000) GS:ffff8880b9900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffe350a8ff8 CR3: 000000007a9e7000 CR4: 00000000003526e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __refcount_dec include/linux/refcount.h:344 [inline] refcount_dec include/linux/refcount.h:359 [inline] ref_tracker_free+0x539/0x6b0 lib/ref_tracker.c:118 netdev_tracker_free include/linux/netdevice.h:4039 [inline] netdev_put include/linux/netdevice.h:4056 [inline] dev_put include/linux/netdevice.h:4082 [inline] free_netdevs+0x1f8/0x470 drivers/infiniband/core/device.c:2204 __ib_unregister_device+0xa0/0x1a0 drivers/infiniband/core/device.c:1478 ib_unregister_work+0x19/0x30 drivers/infiniband/core/device.c:1586 process_one_work+0x9bf/0x1710 kernel/workqueue.c:2289 worker_thread+0x669/0x1090 kernel/workqueue.c:2436 kthread+0x2e8/0x3a0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308 So change the missed dev_put for pdata->netdev to also follow the tracker. Fixes: 09f530f0c6d6 ("RDMA: Add netdevice_tracker to ib_device_set_netdev()") Reported-by: syzbot+3fd8326d9a0812d19218@syzkaller.appspotmail.com Reported-by: syzbot+a1ed8ffe3121380cd5dd@syzkaller.appspotmail.com Reported-by: syzbot+8d0a099c8a6d1e4e601c@syzkaller.appspotmail.com Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/0-v1-e99919867b8d+1e2-netdev_tracker2_jgg@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 9211d2794aac..894c06846224 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2201,7 +2201,7 @@ static void free_netdevs(struct ib_device *ib_dev) * comparisons after the put */ rcu_assign_pointer(pdata->netdev, NULL); - dev_put(ndev); + netdev_put(ndev, &pdata->netdev_tracker); } spin_unlock_irqrestore(&pdata->netdev_lock, flags); } From dbc94a0fb81771a38733c0e8f2ea8c4fa6934dc1 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Thu, 8 Dec 2022 09:52:54 +0200 Subject: [PATCH 122/122] IB/IPoIB: Fix queue count inconsistency for PKEY child interfaces There are 2 ways to create IPoIB PKEY child interfaces: 1) Writing a PKEY to /sys/class/net//create_child. 2) Using netlink with iproute. While with sysfs the child interface has the same number of tx and rx queues as the parent, with netlink there will always be 1 tx and 1 rx queue for the child interface. That's because the get_num_tx/rx_queues() netlink ops are missing and the default value of 1 is taken for the number of queues (in rtnl_create_link()). This change adds the get_num_tx/rx_queues() ops which allows for interfaces with multiple queues to be created over netlink. This constant only represents the max number of tx and rx queues on that net device. Fixes: 9baa0b036410 ("IB/ipoib: Add rtnl_link_ops support") Signed-off-by: Dragos Tatulea Link: https://lore.kernel.org/r/f4a42c8aa43c02d5ae5559a60c3e5e0f18c82531.1670485816.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/ipoib/ipoib_netlink.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c index ea16ba5d8da6..9ad8d9856275 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c @@ -41,6 +41,11 @@ static const struct nla_policy ipoib_policy[IFLA_IPOIB_MAX + 1] = { [IFLA_IPOIB_UMCAST] = { .type = NLA_U16 }, }; +static unsigned int ipoib_get_max_num_queues(void) +{ + return min_t(unsigned int, num_possible_cpus(), 128); +} + static int ipoib_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); @@ -172,6 +177,8 @@ static struct rtnl_link_ops ipoib_link_ops __read_mostly = { .changelink = ipoib_changelink, .get_size = ipoib_get_size, .fill_info = ipoib_fill_info, + .get_num_rx_queues = ipoib_get_max_num_queues, + .get_num_tx_queues = ipoib_get_max_num_queues, }; struct rtnl_link_ops *ipoib_get_link_ops(void)