From 2be6053318aa4f2787b7bc5cf5160017d7d1586b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 6 Oct 2011 09:32:24 -0700 Subject: [PATCH 01/62] RDMA/amso1100: Use '%pM' format option to print MAC Signed-off-by: Andy Shevchenko Acked-by: Steve Wise Signed-off-by: Roland Dreier --- drivers/infiniband/hw/amso1100/c2_provider.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c b/drivers/infiniband/hw/amso1100/c2_provider.c index f101bb73be63..12f923d64e42 100644 --- a/drivers/infiniband/hw/amso1100/c2_provider.c +++ b/drivers/infiniband/hw/amso1100/c2_provider.c @@ -753,10 +753,7 @@ static struct net_device *c2_pseudo_netdev_init(struct c2_dev *c2dev) memcpy_fromio(netdev->dev_addr, c2dev->kva + C2_REGS_RDMA_ENADDR, 6); /* Print out the MAC address */ - pr_debug("%s: MAC %02X:%02X:%02X:%02X:%02X:%02X\n", - netdev->name, - netdev->dev_addr[0], netdev->dev_addr[1], netdev->dev_addr[2], - netdev->dev_addr[3], netdev->dev_addr[4], netdev->dev_addr[5]); + pr_debug("%s: MAC %pM\n", netdev->name, netdev->dev_addr); #if 0 /* Disable network packets */ From 9595480c5dd1f01e477e8c993d6b24fa484eca3f Mon Sep 17 00:00:00 2001 From: "Hefty, Sean" Date: Thu, 6 Oct 2011 09:32:33 -0700 Subject: [PATCH 02/62] RDMA/cma: Fix crash in cma_req_handler The RDMA CM uses the local qp_type to determine how to process an incoming request. This can result in an incoming REQ being treated as a SIDR REQ and vice versa. Fix this by switching off the event type instead, and for good measure verify that the listener supports the incoming connection request. This problem showed up when a user space application mismatched the QP types between a client and server app. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/cma.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index ca4c5dcd7133..31d958e2c9ec 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1179,6 +1179,15 @@ static void cma_set_req_event_data(struct rdma_cm_event *event, event->param.conn.qp_num = req_data->remote_qpn; } +static int cma_check_req_qp_type(struct rdma_cm_id *id, struct ib_cm_event *ib_event) +{ + return (((ib_event->event == IB_CM_REQ_RECEIVED) || + (ib_event->param.req_rcvd.qp_type == id->qp_type)) || + ((ib_event->event == IB_CM_SIDR_REQ_RECEIVED) && + (id->qp_type == IB_QPT_UD)) || + (!id->qp_type)); +} + static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) { struct rdma_id_private *listen_id, *conn_id; @@ -1186,13 +1195,16 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) int offset, ret; listen_id = cm_id->context; + if (!cma_check_req_qp_type(&listen_id->id, ib_event)) + return -EINVAL; + if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) return -ECONNABORTED; memset(&event, 0, sizeof event); offset = cma_user_data_offset(listen_id->id.ps); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; - if (listen_id->id.qp_type == IB_QPT_UD) { + if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) { conn_id = cma_new_udp_id(&listen_id->id, ib_event); event.param.ud.private_data = ib_event->private_data + offset; event.param.ud.private_data_len = From 9efe10a1e1a1ab1dba0af0f520e0697f6e81ebf1 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 6 Oct 2011 09:32:44 -0700 Subject: [PATCH 03/62] RDMA/cxgb4: Fail RDMA initialization for unsupported cards The iw_cxgb4 module 
crashes at init time if the T4 card does not support RDMA. So clean up the init logic to correctly deal with non-RDMA cards. - If any RDMA resources are not available, then fail the initialization logging an info message. - Clean up properly on initialization failures. Signed-off-by: Steve Wise Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb4/device.c | 41 ++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index 40a13cc633a3..6d0df6ec161b 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -376,10 +376,8 @@ struct uld_ctx { struct c4iw_dev *dev; }; -static void c4iw_remove(struct uld_ctx *ctx) +static void c4iw_dealloc(struct uld_ctx *ctx) { - PDBG("%s c4iw_dev %p\n", __func__, ctx->dev); - c4iw_unregister_device(ctx->dev); c4iw_rdev_close(&ctx->dev->rdev); idr_destroy(&ctx->dev->cqidr); idr_destroy(&ctx->dev->qpidr); @@ -389,11 +387,30 @@ static void c4iw_remove(struct uld_ctx *ctx) ctx->dev = NULL; } +static void c4iw_remove(struct uld_ctx *ctx) +{ + PDBG("%s c4iw_dev %p\n", __func__, ctx->dev); + c4iw_unregister_device(ctx->dev); + c4iw_dealloc(ctx); +} + +static int rdma_supported(const struct cxgb4_lld_info *infop) +{ + return infop->vr->stag.size > 0 && infop->vr->pbl.size > 0 && + infop->vr->rq.size > 0 && infop->vr->qp.size > 0 && + infop->vr->cq.size > 0 && infop->vr->ocq.size > 0; +} + static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop) { struct c4iw_dev *devp; int ret; + if (!rdma_supported(infop)) { + printk(KERN_INFO MOD "%s: RDMA not supported on this device.\n", + pci_name(infop->pdev)); + return ERR_PTR(-ENOSYS); + } devp = (struct c4iw_dev *)ib_alloc_device(sizeof(*devp)); if (!devp) { printk(KERN_ERR MOD "Cannot allocate ib device\n"); @@ -414,7 +431,6 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop) ret = c4iw_rdev_open(&devp->rdev); if (ret) { - mutex_unlock(&dev_mutex); printk(KERN_ERR MOD "Unable to open CXIO rdev err %d\n", ret); ib_dealloc_device(&devp->ibdev); return ERR_PTR(ret); @@ -519,15 +535,24 @@ static int c4iw_uld_state_change(void *handle, enum cxgb4_state new_state) case CXGB4_STATE_UP: printk(KERN_INFO MOD "%s: Up\n", pci_name(ctx->lldi.pdev)); if (!ctx->dev) { - int ret = 0; + int ret; ctx->dev = c4iw_alloc(&ctx->lldi); - if (!IS_ERR(ctx->dev)) - ret = c4iw_register_device(ctx->dev); - if (IS_ERR(ctx->dev) || ret) + if (IS_ERR(ctx->dev)) { + printk(KERN_ERR MOD + "%s: initialization failed: %ld\n", + pci_name(ctx->lldi.pdev), + PTR_ERR(ctx->dev)); + ctx->dev = NULL; + break; + } + ret = c4iw_register_device(ctx->dev); + if (ret) { printk(KERN_ERR MOD "%s: RDMA registration failed: %d\n", pci_name(ctx->lldi.pdev), ret); + c4iw_dealloc(ctx); + } } break; case CXGB4_STATE_DOWN: From 10889a36437ffe92c9b81041525ac9661a8f2c92 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Thu, 6 Oct 2011 09:33:04 -0700 Subject: [PATCH 04/62] IB/ehca: Remove IRQF_DISABLED, since it's a no-op Since commit e58aa3d2d0cc ("genirq: Run irq handlers with interrupts disabled"), we run all interrupt handlers with interrupts disabled and we even check and yell when an interrupt handler returns with interrupts enabled -- cf commit b738a50a2026 ("genirq: Warn when handler enables interrupts"). So now this flag is a no-op and can be removed. 
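For illustration only, and not part of this patch: a minimal, hypothetical driver sketch (all demo_* names are made up) showing that a flags value of 0 in request_irq() now gives exactly what IRQF_DISABLED used to ask for, because the genirq core always runs handlers with local interrupts off.

#include <linux/interrupt.h>

static irqreturn_t demo_isr(int irq, void *dev_id)
{
	/* Runs with local interrupts disabled by the genirq core. */
	return IRQ_HANDLED;
}

static int demo_setup_irq(unsigned int irq, void *dev_id)
{
	/* Flags of 0 are sufficient; IRQF_DISABLED added nothing. */
	return request_irq(irq, demo_isr, 0, "demo", dev_id);
}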
Signed-off-by: Yong Zhang Acked-by: Hoang-Nam Nguyen Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ehca/ehca_eq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/ehca/ehca_eq.c b/drivers/infiniband/hw/ehca/ehca_eq.c index d9b1bb40f480..818d721fc448 100644 --- a/drivers/infiniband/hw/ehca/ehca_eq.c +++ b/drivers/infiniband/hw/ehca/ehca_eq.c @@ -125,7 +125,7 @@ int ehca_create_eq(struct ehca_shca *shca, tasklet_init(&eq->interrupt_task, ehca_tasklet_eq, (long)shca); ret = ibmebus_request_irq(eq->ist, ehca_interrupt_eq, - IRQF_DISABLED, "ehca_eq", + 0, "ehca_eq", (void *)shca); if (ret < 0) ehca_err(ib_dev, "Can't map interrupt handler."); @@ -133,7 +133,7 @@ int ehca_create_eq(struct ehca_shca *shca, tasklet_init(&eq->interrupt_task, ehca_tasklet_neq, (long)shca); ret = ibmebus_request_irq(eq->ist, ehca_interrupt_neq, - IRQF_DISABLED, "ehca_neq", + 0, "ehca_neq", (void *)shca); if (ret < 0) ehca_err(ib_dev, "Can't map interrupt handler."); From f45ee80eb0dda1fbf32bf63189627a9e1e157a95 Mon Sep 17 00:00:00 2001 From: "Hefty, Sean" Date: Thu, 6 Oct 2011 09:33:04 -0700 Subject: [PATCH 05/62] RDMA/cma: Check for NULL conn_param in rdma_accept Check that conn_param is not null before dereferencing it when processing rdma_accept(). This is necessary to prevent a possible system crash, which can be caused by user space. Problem found by code inspection. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/cma.c | 38 +++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index ca4c5dcd7133..79b16028e898 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2616,14 +2616,16 @@ static int cma_connect_iw(struct rdma_id_private *id_priv, if (ret) goto out; - iw_param.ord = conn_param->initiator_depth; - iw_param.ird = conn_param->responder_resources; - iw_param.private_data = conn_param->private_data; - iw_param.private_data_len = conn_param->private_data_len; - if (id_priv->id.qp) + if (conn_param) { + iw_param.ord = conn_param->initiator_depth; + iw_param.ird = conn_param->responder_resources; + iw_param.private_data = conn_param->private_data; + iw_param.private_data_len = conn_param->private_data_len; + iw_param.qpn = id_priv->id.qp ? 
id_priv->qp_num : conn_param->qp_num; + } else { + memset(&iw_param, 0, sizeof iw_param); iw_param.qpn = id_priv->qp_num; - else - iw_param.qpn = conn_param->qp_num; + } ret = iw_cm_connect(cm_id, &iw_param); out: if (ret) { @@ -2765,14 +2767,20 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: - if (id->qp_type == IB_QPT_UD) - ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, - conn_param->private_data, - conn_param->private_data_len); - else if (conn_param) - ret = cma_accept_ib(id_priv, conn_param); - else - ret = cma_rep_recv(id_priv); + if (id->qp_type == IB_QPT_UD) { + if (conn_param) + ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, + conn_param->private_data, + conn_param->private_data_len); + else + ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, + NULL, 0); + } else { + if (conn_param) + ret = cma_accept_ib(id_priv, conn_param); + else + ret = cma_rep_recv(id_priv); + } break; case RDMA_TRANSPORT_IWARP: ret = cma_accept_iw(id_priv, conn_param); From b7ab0b19a85fffaa04ad0b59471d3a607eef0a56 Mon Sep 17 00:00:00 2001 From: "Hefty, Sean" Date: Thu, 6 Oct 2011 09:33:05 -0700 Subject: [PATCH 06/62] IB/mad: Verify mgmt class in received MADs If a received MAD contains an invalid or reserved mgmt class, we will attempt to access method_table outside of its range. Add a check to ensure that mgmt class falls within the handled range. Found by code inspection. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/mad.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index b4d8672a3e4e..056389229ea7 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -1596,6 +1596,9 @@ find_mad_agent(struct ib_mad_port_private *port_priv, mad->mad_hdr.class_version].class; if (!class) goto out; + if (convert_mgmt_class(mad->mad_hdr.mgmt_class) >= + IB_MGMT_MAX_METHODS) + goto out; method = class->method_table[convert_mgmt_class( mad->mad_hdr.mgmt_class)]; if (method) From caf6e3f221ddc12ccabd1cd0ba149561db0090d4 Mon Sep 17 00:00:00 2001 From: "Hefty, Sean" Date: Thu, 6 Oct 2011 09:33:05 -0700 Subject: [PATCH 07/62] RDMA/ucm: Removed checks for unsigned value < 0 cmd is unsigned, no need to check for < 0. Found by code inspection. 
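As a small illustration, not part of this patch (demo_* names are hypothetical): because the command index is unsigned, a "cmd < 0" test can never be true, so only the upper-bound and table-entry checks carry any meaning.

#include <linux/errno.h>

static int demo_dispatch(unsigned int cmd, int (*const table[])(void),
			 unsigned int nr_cmds)
{
	if (cmd >= nr_cmds || !table[cmd])
		return -EINVAL;
	return table[cmd]();
}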
Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/ucm.c | 2 +- drivers/infiniband/core/ucma.c | 2 +- drivers/infiniband/core/user_mad.c | 5 ++--- drivers/infiniband/core/uverbs_main.c | 3 +-- 4 files changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index 08f948df8fa9..b8a0b4a7811b 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -1122,7 +1122,7 @@ static ssize_t ib_ucm_write(struct file *filp, const char __user *buf, if (copy_from_user(&hdr, buf, sizeof(hdr))) return -EFAULT; - if (hdr.cmd < 0 || hdr.cmd >= ARRAY_SIZE(ucm_cmd_table)) + if (hdr.cmd >= ARRAY_SIZE(ucm_cmd_table)) return -EINVAL; if (hdr.in + sizeof(hdr) > len) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 71be5eebd683..b3f3dddd528d 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -1270,7 +1270,7 @@ static ssize_t ucma_write(struct file *filp, const char __user *buf, if (copy_from_user(&hdr, buf, sizeof(hdr))) return -EFAULT; - if (hdr.cmd < 0 || hdr.cmd >= ARRAY_SIZE(ucma_cmd_table)) + if (hdr.cmd >= ARRAY_SIZE(ucma_cmd_table)) return -EINVAL; if (hdr.in + sizeof(hdr) > len) diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 8d261b6ea5fe..07db22997e97 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -458,8 +458,7 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, goto err; } - if (packet->mad.hdr.id < 0 || - packet->mad.hdr.id >= IB_UMAD_MAX_AGENTS) { + if (packet->mad.hdr.id >= IB_UMAD_MAX_AGENTS) { ret = -EINVAL; goto err; } @@ -703,7 +702,7 @@ static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg) mutex_lock(&file->port->file_mutex); mutex_lock(&file->mutex); - if (id < 0 || id >= IB_UMAD_MAX_AGENTS || !__get_agent(file, id)) { + if (id >= IB_UMAD_MAX_AGENTS || !__get_agent(file, id)) { ret = -EINVAL; goto out; } diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 56898b6578a4..7c30e1e12c90 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -557,8 +557,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, if (hdr.in_words * 4 != count) return -EINVAL; - if (hdr.command < 0 || - hdr.command >= ARRAY_SIZE(uverbs_cmd_table) || + if (hdr.command >= ARRAY_SIZE(uverbs_cmd_table) || !uverbs_cmd_table[hdr.command]) return -EINVAL; From 0f6740c7c455693f719580f34bb8afa8a298ea36 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Thu, 6 Oct 2011 09:33:11 -0700 Subject: [PATCH 08/62] mlx4_core: Clean up error flow in mlx4_register_mac() Fix a leak of entry if radix_tree_insert() fails. 
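A condensed, hypothetical sketch of the corrected error flow (demo_* names are illustrative, not the driver's): every failure path undoes exactly what has already been set up, including freeing the entry that used to leak when radix_tree_insert() failed.

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/radix-tree.h>

struct demo_entry {
	u64 mac;
};

static int demo_register(struct radix_tree_root *tree, unsigned long qpn, u64 mac)
{
	struct demo_entry *entry;
	int err;

	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry)
		return -ENOMEM;
	entry->mac = mac;

	err = radix_tree_insert(tree, qpn, entry);
	if (err) {
		kfree(entry);	/* the leak this patch fixes */
		return err;
	}
	return 0;
}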
Also, reduce the indentation and make the flow easier to read by sticking to the conventional err = do_something(); if (err) return err; err = do_another(); if (err) return err; rather than mixing the direction of the test as err = do_something(); if (!err) { err = do_another(); if (err) return err; } else return err; Signed-off-by: Roland Dreier --- drivers/net/mlx4/port.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/drivers/net/mlx4/port.c b/drivers/net/mlx4/port.c index 609e0ec14cee..ef10113da356 100644 --- a/drivers/net/mlx4/port.c +++ b/drivers/net/mlx4/port.c @@ -148,22 +148,26 @@ int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *qpn, u8 wrap) if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER) { err = mlx4_uc_steer_add(dev, port, mac, qpn, 1); - if (!err) { - entry = kmalloc(sizeof *entry, GFP_KERNEL); - if (!entry) { - mlx4_uc_steer_release(dev, port, mac, *qpn, 1); - return -ENOMEM; - } - entry->mac = mac; - err = radix_tree_insert(&info->mac_tree, *qpn, entry); - if (err) { - mlx4_uc_steer_release(dev, port, mac, *qpn, 1); - return err; - } - } else + if (err) return err; + + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) { + mlx4_uc_steer_release(dev, port, mac, *qpn, 1); + return -ENOMEM; + } + + entry->mac = mac; + err = radix_tree_insert(&info->mac_tree, *qpn, entry); + if (err) { + kfree(entry); + mlx4_uc_steer_release(dev, port, mac, *qpn, 1); + return err; + } } + mlx4_dbg(dev, "Registering MAC: 0x%llx\n", (unsigned long long) mac); + mutex_lock(&table->mutex); for (i = 0; i < MLX4_MAX_MAC_NUM - 1; i++) { if (free < 0 && !table->refs[i]) { From a8dc0dffae651befa24dc518ac1715b8d3173c34 Mon Sep 17 00:00:00 2001 From: Dotan Barak Date: Thu, 6 Oct 2011 09:33:12 -0700 Subject: [PATCH 09/62] mlx4_core: Use the right function to free eq->page_list entries Fix the memory release function to be consistent with the memory allocation one to prevent problems if the implementation of pci_free_consistent() and dma_free_coherent() are different. Signed-off-by: Dotan Barak Reviewed-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/net/mlx4/eq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/mlx4/eq.c b/drivers/net/mlx4/eq.c index 1ad1f6029af8..869a2c220a7b 100644 --- a/drivers/net/mlx4/eq.c +++ b/drivers/net/mlx4/eq.c @@ -484,7 +484,7 @@ static void mlx4_free_eq(struct mlx4_dev *dev, mlx4_mtt_cleanup(dev, &eq->mtt); for (i = 0; i < npages; ++i) - pci_free_consistent(dev->pdev, PAGE_SIZE, + dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, eq->page_list[i].buf, eq->page_list[i].map); From a8312755db7ac800c83ec036058c3109892e0949 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 6 Oct 2011 09:33:12 -0700 Subject: [PATCH 10/62] mlx4_core: Fix buddy->num_free allocation size The num_free field of mlx4_buddy has a type of array of unsigned int while it was allocated as an array of pointers. On 64-bit platforms this allocates twice more than required. Fix this by allocating the correct size for the type. Signed-off-by: Eli Cohen [ Convert to kcalloc(). 
- Roland ] Signed-off-by: Roland Dreier --- drivers/net/mlx4/mr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/mlx4/mr.c b/drivers/net/mlx4/mr.c index 9c188bdd7f4f..ab639cfef78e 100644 --- a/drivers/net/mlx4/mr.c +++ b/drivers/net/mlx4/mr.c @@ -139,7 +139,7 @@ static int mlx4_buddy_init(struct mlx4_buddy *buddy, int max_order) buddy->bits = kzalloc((buddy->max_order + 1) * sizeof (long *), GFP_KERNEL); - buddy->num_free = kzalloc((buddy->max_order + 1) * sizeof (int *), + buddy->num_free = kcalloc((buddy->max_order + 1), sizeof *buddy->num_free, GFP_KERNEL); if (!buddy->bits || !buddy->num_free) goto err_out; From e2e435f290795e409a649423f19574ba77942854 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 6 Oct 2011 09:33:24 -0700 Subject: [PATCH 11/62] RDMA/nes: Add missing calls to ib_umem_release() Add calls to ib_umem_release(), as in the other error-handling code in nes_reg_user_mr(). Signed-off-by: Julia Lawall Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_verbs.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 9f2f7d4b1197..394d0e7e4a5c 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -2338,8 +2338,10 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, skip_pages = ((u32)region->offset) >> 12; - if (ib_copy_from_udata(&req, udata, sizeof(req))) + if (ib_copy_from_udata(&req, udata, sizeof(req))) { + ib_umem_release(region); return ERR_PTR(-EFAULT); + } nes_debug(NES_DBG_MR, "Memory Registration type = %08X.\n", req.reg_type); switch (req.reg_type) { @@ -2631,6 +2633,7 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, return &nesmr->ibmr; } + ib_umem_release(region); return ERR_PTR(-ENOSYS); } From 53ab1c6498371723c31b18400fab10a902a15a63 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Thu, 6 Oct 2011 09:33:35 -0700 Subject: [PATCH 12/62] IB/qib: Correct nfreectxts for multiple HCAs The code that was recently introduced to report the number of free contexts is flawed for multiple HCAs: /* Return the number of free user ports (contexts) available. */ return scnprintf(buf, PAGE_SIZE, "%u\n", dd->cfgctxts - dd->first_user_ctxt - (u32)qib_stats.sps_ctxts); The qib_stats is global to the module, not per HCA, so the code is broken for multiple HCAs. This patch adds a qib_devdata field, freectxts, that reflects the free contexts for this HCA. Signed-off-by: Mike Marciniszyn Reviewed-by: Ram Vepa Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib.h | 4 ++++ drivers/infiniband/hw/qib/qib_file_ops.c | 2 ++ drivers/infiniband/hw/qib/qib_init.c | 1 + drivers/infiniband/hw/qib/qib_sysfs.c | 3 +-- 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h index c9624ea87209..ee993e725d38 100644 --- a/drivers/infiniband/hw/qib/qib.h +++ b/drivers/infiniband/hw/qib/qib.h @@ -807,6 +807,10 @@ struct qib_devdata { * supports, less gives more pio bufs/ctxt, etc. 
*/ u32 cfgctxts; + /* + * number of ctxts available for PSM open + */ + u32 freectxts; /* * hint that we should update pioavailshadow before diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 26253039d2c7..77633666f81c 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -1284,6 +1284,7 @@ static int setup_ctxt(struct qib_pportdata *ppd, int ctxt, strlcpy(rcd->comm, current->comm, sizeof(rcd->comm)); ctxt_fp(fp) = rcd; qib_stats.sps_ctxts++; + dd->freectxts++; ret = 0; goto bail; @@ -1792,6 +1793,7 @@ static int qib_close(struct inode *in, struct file *fp) if (dd->pageshadow) unlock_expected_tids(rcd); qib_stats.sps_ctxts--; + dd->freectxts--; } mutex_unlock(&qib_mutex); diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index a01f3fce8eb3..021636dbeae6 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -398,6 +398,7 @@ static void enable_chip(struct qib_devdata *dd) if (rcd) dd->f_rcvctrl(rcd->ppd, rcvmask, i); } + dd->freectxts = dd->cfgctxts - dd->first_user_ctxt; } static void verify_interrupt(unsigned long opaque) diff --git a/drivers/infiniband/hw/qib/qib_sysfs.c b/drivers/infiniband/hw/qib/qib_sysfs.c index 14d129de4320..78fbd56879d4 100644 --- a/drivers/infiniband/hw/qib/qib_sysfs.c +++ b/drivers/infiniband/hw/qib/qib_sysfs.c @@ -515,8 +515,7 @@ static ssize_t show_nfreectxts(struct device *device, struct qib_devdata *dd = dd_from_dev(dev); /* Return the number of free user ports (contexts) available. */ - return scnprintf(buf, PAGE_SIZE, "%u\n", dd->cfgctxts - - dd->first_user_ctxt - (u32)qib_stats.sps_ctxts); + return scnprintf(buf, PAGE_SIZE, "%u\n", dd->freectxts); } static ssize_t show_serial(struct device *device, From 3ebeebc38b4b13384aba97f2e4acd6b48d47a65c Mon Sep 17 00:00:00 2001 From: Kumar Sanghvi Date: Sun, 25 Sep 2011 20:17:43 +0530 Subject: [PATCH 13/62] RDMA/iwcm: Propagate ird/ord values upwards Update struct iw_cm_event to support propagating the ird/ord values upwards to the application. 
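For illustration only, not part of this patch: a hypothetical iw_cm event-handler sketch that consumes the propagated values; the only assumption beyond standard iw_cm is the ird/ord members this patch adds to struct iw_cm_event.

#include <linux/printk.h>
#include <rdma/iw_cm.h>

static int demo_iw_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event)
{
	if (event->event == IW_CM_EVENT_CONNECT_REQUEST)
		pr_info("peer offers ird=%u ord=%u\n", event->ird, event->ord);
	return 0;
}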
Signed-off-by: Kumar Sanghvi Reviewed-by: Steve Wise Signed-off-by: Roland Dreier --- drivers/infiniband/core/cma.c | 8 ++++++-- include/rdma/iw_cm.h | 4 +++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 31d958e2c9ec..3e7104554597 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1340,6 +1340,8 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) switch (iw_event->status) { case 0: event.event = RDMA_CM_EVENT_ESTABLISHED; + event.param.conn.initiator_depth = iw_event->ird; + event.param.conn.responder_resources = iw_event->ord; break; case -ECONNRESET: case -ECONNREFUSED: @@ -1355,6 +1357,8 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) break; case IW_CM_EVENT_ESTABLISHED: event.event = RDMA_CM_EVENT_ESTABLISHED; + event.param.conn.initiator_depth = iw_event->ird; + event.param.conn.responder_resources = iw_event->ord; break; default: BUG_ON(1); @@ -1445,8 +1449,8 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, event.event = RDMA_CM_EVENT_CONNECT_REQUEST; event.param.conn.private_data = iw_event->private_data; event.param.conn.private_data_len = iw_event->private_data_len; - event.param.conn.initiator_depth = attr.max_qp_init_rd_atom; - event.param.conn.responder_resources = attr.max_qp_rd_atom; + event.param.conn.initiator_depth = iw_event->ird; + event.param.conn.responder_resources = iw_event->ord; /* * Protect against the user destroying conn_id from another thread diff --git a/include/rdma/iw_cm.h b/include/rdma/iw_cm.h index 2d0191c90f9e..1a046b1595cc 100644 --- a/include/rdma/iw_cm.h +++ b/include/rdma/iw_cm.h @@ -52,8 +52,10 @@ struct iw_cm_event { struct sockaddr_in local_addr; struct sockaddr_in remote_addr; void *private_data; - u8 private_data_len; void *provider_data; + u8 private_data_len; + u8 ord; + u8 ird; }; /** From 56da00fc92e6f227874bba36f127ffc8847ee1f8 Mon Sep 17 00:00:00 2001 From: Kumar Sanghvi Date: Sun, 25 Sep 2011 20:17:45 +0530 Subject: [PATCH 14/62] RDMA/{amso1100,cxgb3}: Minimal MPAv2 support As part of MPAv2 Enhanced RDMA Negotiation, pass max supported ird/ord values upwards for the time being in iw_cxgb3 and amso1100. 
Signed-off-by: Kumar Sanghvi Reviewed-by: Steve Wise Signed-off-by: Roland Dreier --- drivers/infiniband/hw/amso1100/c2_ae.c | 5 +++++ drivers/infiniband/hw/amso1100/c2_intr.c | 5 +++++ drivers/infiniband/hw/cxgb3/iwch_cm.c | 10 ++++++++++ 3 files changed, 20 insertions(+) diff --git a/drivers/infiniband/hw/amso1100/c2_ae.c b/drivers/infiniband/hw/amso1100/c2_ae.c index 24f9e3a90e8e..32d34e88d5cf 100644 --- a/drivers/infiniband/hw/amso1100/c2_ae.c +++ b/drivers/infiniband/hw/amso1100/c2_ae.c @@ -288,6 +288,11 @@ void c2_ae_event(struct c2_dev *c2dev, u32 mq_index) cm_event.private_data_len = be32_to_cpu(req->private_data_length); cm_event.private_data = req->private_data; + /* + * Until ird/ord negotiation via MPAv2 support is added, send + * max supported values + */ + cm_event.ird = cm_event.ord = 128; if (cm_id->event_handler) cm_id->event_handler(cm_id, &cm_event); diff --git a/drivers/infiniband/hw/amso1100/c2_intr.c b/drivers/infiniband/hw/amso1100/c2_intr.c index 0ebe4e806b86..8951db4ae29d 100644 --- a/drivers/infiniband/hw/amso1100/c2_intr.c +++ b/drivers/infiniband/hw/amso1100/c2_intr.c @@ -183,6 +183,11 @@ static void handle_vq(struct c2_dev *c2dev, u32 mq_index) case IW_CM_EVENT_ESTABLISHED: c2_set_qp_state(req->qp, C2_QP_STATE_RTS); + /* + * Until ird/ord negotiation via MPAv2 support is added, send + * max supported values + */ + cm_event.ird = cm_event.ord = 128; case IW_CM_EVENT_CLOSE: /* diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c index 6cd642aaa4de..de6d0774e609 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cm.c +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c @@ -753,6 +753,11 @@ static void connect_request_upcall(struct iwch_ep *ep) event.private_data_len = ep->plen; event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); event.provider_data = ep; + /* + * Until ird/ord negotiation via MPAv2 support is added, send max + * supported values + */ + event.ird = event.ord = 8; if (state_read(&ep->parent_ep->com) != DEAD) { get_ep(&ep->com); ep->parent_ep->com.cm_id->event_handler( @@ -770,6 +775,11 @@ static void established_upcall(struct iwch_ep *ep) PDBG("%s ep %p\n", __func__, ep); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_ESTABLISHED; + /* + * Until ird/ord negotiation via MPAv2 support is added, send max + * supported values + */ + event.ird = event.ord = 8; if (ep->com.cm_id) { PDBG("%s ep %p tid %d\n", __func__, ep, ep->hwtid); ep->com.cm_id->event_handler(ep->com.cm_id, &event); From d2fe99e86bb2ccbb87df20b0136d5983b6a4cc09 Mon Sep 17 00:00:00 2001 From: Kumar Sanghvi Date: Sun, 25 Sep 2011 20:17:44 +0530 Subject: [PATCH 15/62] RDMA/cxgb4: Add support for MPAv2 Enhanced RDMA Negotiation This patch adds support for Enhanced RDMA Connection Establishment (draft-ietf-storm-mpa-peer-connect-06), aka MPAv2. 
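As a minimal illustration of the encoding this patch implements (a sketch only; the DEMO_* constants mirror the MPA_V2_* defines this patch adds to iw_cxgb4.h): each 16-bit MPAv2 field carries the ird/ord value in its low 14 bits, with the top bits reserved for the peer-to-peer and RTR control flags.

#include <linux/types.h>
#include <asm/byteorder.h>

#define DEMO_MPA_V2_RDMA_WRITE_RTR	0x8000
#define DEMO_MPA_V2_IRD_ORD_MASK	0x3FFF

static __be16 demo_pack_ord(u16 ord, bool write_rtr)
{
	u16 val = ord & DEMO_MPA_V2_IRD_ORD_MASK;

	if (write_rtr)
		val |= DEMO_MPA_V2_RDMA_WRITE_RTR;
	return cpu_to_be16(val);
}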
Details of draft can be obtained from: The patch updates the following functions for initiator perspective: - send_mpa_request - process_mpa_reply - post_terminate for TERM error codes - destroy_qp for TERM related change - adds layer/etype/ecode to c4iw_qp_attrs for sending with TERM - peer_abort for retrying connection attempt with MPA_v1 message - added c4iw_reconnect function The patch updates the following functions for responder perspective: - process_mpa_request - send_mpa_reply - c4iw_accept_cr - passes ird/ord to upper layers Signed-off-by: Kumar Sanghvi Reviewed-by: Steve Wise Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb4/cm.c | 469 +++++++++++++++++++++++-- drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 22 +- drivers/infiniband/hw/cxgb4/qp.c | 14 +- 3 files changed, 466 insertions(+), 39 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 77f769d9227d..b36cdac9c558 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -103,7 +103,8 @@ MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout " static int mpa_rev = 1; module_param(mpa_rev, int, 0644); MODULE_PARM_DESC(mpa_rev, "MPA Revision, 0 supports amso1100, " - "1 is spec compliant. (default=1)"); + "1 is RFC0544 spec compliant, 2 is IETF MPA Peer Connect Draft" + " compliant (default=1)"); static int markers_enabled; module_param(markers_enabled, int, 0644); @@ -497,17 +498,21 @@ static int send_connect(struct c4iw_ep *ep) return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); } -static void send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb) +static void send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb, + u8 mpa_rev_to_use) { int mpalen, wrlen; struct fw_ofld_tx_data_wr *req; struct mpa_message *mpa; + struct mpa_v2_conn_params mpa_v2_params; PDBG("%s ep %p tid %u pd_len %d\n", __func__, ep, ep->hwtid, ep->plen); BUG_ON(skb_cloned(skb)); mpalen = sizeof(*mpa) + ep->plen; + if (mpa_rev_to_use == 2) + mpalen += sizeof(struct mpa_v2_conn_params); wrlen = roundup(mpalen + sizeof *req, 16); skb = get_skb(skb, wrlen, GFP_KERNEL); if (!skb) { @@ -533,12 +538,39 @@ static void send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb) mpa = (struct mpa_message *)(req + 1); memcpy(mpa->key, MPA_KEY_REQ, sizeof(mpa->key)); mpa->flags = (crc_enabled ? MPA_CRC : 0) | - (markers_enabled ? MPA_MARKERS : 0); + (markers_enabled ? MPA_MARKERS : 0) | + (mpa_rev_to_use == 2 ? MPA_ENHANCED_RDMA_CONN : 0); mpa->private_data_size = htons(ep->plen); - mpa->revision = mpa_rev; + mpa->revision = mpa_rev_to_use; + if (mpa_rev_to_use == 1) + ep->tried_with_mpa_v1 = 1; - if (ep->plen) - memcpy(mpa->private_data, ep->mpa_pkt + sizeof(*mpa), ep->plen); + if (mpa_rev_to_use == 2) { + mpa->private_data_size += + htons(sizeof(struct mpa_v2_conn_params)); + mpa_v2_params.ird = htons((u16)ep->ird); + mpa_v2_params.ord = htons((u16)ep->ord); + + if (peer2peer) { + mpa_v2_params.ird |= htons(MPA_V2_PEER2PEER_MODEL); + if (p2p_type == FW_RI_INIT_P2PTYPE_RDMA_WRITE) + mpa_v2_params.ord |= + htons(MPA_V2_RDMA_WRITE_RTR); + else if (p2p_type == FW_RI_INIT_P2PTYPE_READ_REQ) + mpa_v2_params.ord |= + htons(MPA_V2_RDMA_READ_RTR); + } + memcpy(mpa->private_data, &mpa_v2_params, + sizeof(struct mpa_v2_conn_params)); + + if (ep->plen) + memcpy(mpa->private_data + + sizeof(struct mpa_v2_conn_params), + ep->mpa_pkt + sizeof(*mpa), ep->plen); + } else + if (ep->plen) + memcpy(mpa->private_data, + ep->mpa_pkt + sizeof(*mpa), ep->plen); /* * Reference the mpa skb. 
This ensures the data area @@ -562,10 +594,13 @@ static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen) struct fw_ofld_tx_data_wr *req; struct mpa_message *mpa; struct sk_buff *skb; + struct mpa_v2_conn_params mpa_v2_params; PDBG("%s ep %p tid %u pd_len %d\n", __func__, ep, ep->hwtid, ep->plen); mpalen = sizeof(*mpa) + plen; + if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) + mpalen += sizeof(struct mpa_v2_conn_params); wrlen = roundup(mpalen + sizeof *req, 16); skb = get_skb(NULL, wrlen, GFP_KERNEL); @@ -595,8 +630,29 @@ static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen) mpa->flags = MPA_REJECT; mpa->revision = mpa_rev; mpa->private_data_size = htons(plen); - if (plen) - memcpy(mpa->private_data, pdata, plen); + + if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { + mpa->flags |= MPA_ENHANCED_RDMA_CONN; + mpa->private_data_size += + htons(sizeof(struct mpa_v2_conn_params)); + mpa_v2_params.ird = htons(((u16)ep->ird) | + (peer2peer ? MPA_V2_PEER2PEER_MODEL : + 0)); + mpa_v2_params.ord = htons(((u16)ep->ord) | (peer2peer ? + (p2p_type == + FW_RI_INIT_P2PTYPE_RDMA_WRITE ? + MPA_V2_RDMA_WRITE_RTR : p2p_type == + FW_RI_INIT_P2PTYPE_READ_REQ ? + MPA_V2_RDMA_READ_RTR : 0) : 0)); + memcpy(mpa->private_data, &mpa_v2_params, + sizeof(struct mpa_v2_conn_params)); + + if (ep->plen) + memcpy(mpa->private_data + + sizeof(struct mpa_v2_conn_params), pdata, plen); + } else + if (plen) + memcpy(mpa->private_data, pdata, plen); /* * Reference the mpa skb again. This ensures the data area @@ -617,10 +673,13 @@ static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen) struct fw_ofld_tx_data_wr *req; struct mpa_message *mpa; struct sk_buff *skb; + struct mpa_v2_conn_params mpa_v2_params; PDBG("%s ep %p tid %u pd_len %d\n", __func__, ep, ep->hwtid, ep->plen); mpalen = sizeof(*mpa) + plen; + if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) + mpalen += sizeof(struct mpa_v2_conn_params); wrlen = roundup(mpalen + sizeof *req, 16); skb = get_skb(NULL, wrlen, GFP_KERNEL); @@ -649,10 +708,36 @@ static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen) memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); mpa->flags = (ep->mpa_attr.crc_enabled ? MPA_CRC : 0) | (markers_enabled ? MPA_MARKERS : 0); - mpa->revision = mpa_rev; + mpa->revision = ep->mpa_attr.version; mpa->private_data_size = htons(plen); - if (plen) - memcpy(mpa->private_data, pdata, plen); + + if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { + mpa->flags |= MPA_ENHANCED_RDMA_CONN; + mpa->private_data_size += + htons(sizeof(struct mpa_v2_conn_params)); + mpa_v2_params.ird = htons((u16)ep->ird); + mpa_v2_params.ord = htons((u16)ep->ord); + if (peer2peer && (ep->mpa_attr.p2p_type != + FW_RI_INIT_P2PTYPE_DISABLED)) { + mpa_v2_params.ird |= htons(MPA_V2_PEER2PEER_MODEL); + + if (p2p_type == FW_RI_INIT_P2PTYPE_RDMA_WRITE) + mpa_v2_params.ord |= + htons(MPA_V2_RDMA_WRITE_RTR); + else if (p2p_type == FW_RI_INIT_P2PTYPE_READ_REQ) + mpa_v2_params.ord |= + htons(MPA_V2_RDMA_READ_RTR); + } + + memcpy(mpa->private_data, &mpa_v2_params, + sizeof(struct mpa_v2_conn_params)); + + if (ep->plen) + memcpy(mpa->private_data + + sizeof(struct mpa_v2_conn_params), pdata, plen); + } else + if (plen) + memcpy(mpa->private_data, pdata, plen); /* * Reference the mpa skb. 
This ensures the data area @@ -695,7 +780,10 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb) /* start MPA negotiation */ send_flowc(ep, NULL); - send_mpa_req(ep, skb); + if (ep->retry_with_mpa_v1) + send_mpa_req(ep, skb, 1); + else + send_mpa_req(ep, skb, mpa_rev); return 0; } @@ -769,8 +857,19 @@ static void connect_reply_upcall(struct c4iw_ep *ep, int status) event.remote_addr = ep->com.remote_addr; if ((status == 0) || (status == -ECONNREFUSED)) { - event.private_data_len = ep->plen; - event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); + if (!ep->tried_with_mpa_v1) { + /* this means MPA_v2 is used */ + event.private_data_len = ep->plen - + sizeof(struct mpa_v2_conn_params); + event.private_data = ep->mpa_pkt + + sizeof(struct mpa_message) + + sizeof(struct mpa_v2_conn_params); + } else { + /* this means MPA_v1 is used */ + event.private_data_len = ep->plen; + event.private_data = ep->mpa_pkt + + sizeof(struct mpa_message); + } } PDBG("%s ep %p tid %u status %d\n", __func__, ep, @@ -793,9 +892,22 @@ static void connect_request_upcall(struct c4iw_ep *ep) event.event = IW_CM_EVENT_CONNECT_REQUEST; event.local_addr = ep->com.local_addr; event.remote_addr = ep->com.remote_addr; - event.private_data_len = ep->plen; - event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); event.provider_data = ep; + if (!ep->tried_with_mpa_v1) { + /* this means MPA_v2 is used */ + event.ord = ep->ord; + event.ird = ep->ird; + event.private_data_len = ep->plen - + sizeof(struct mpa_v2_conn_params); + event.private_data = ep->mpa_pkt + sizeof(struct mpa_message) + + sizeof(struct mpa_v2_conn_params); + } else { + /* this means MPA_v1 is used. Send max supported */ + event.ord = c4iw_max_read_depth; + event.ird = c4iw_max_read_depth; + event.private_data_len = ep->plen; + event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); + } if (state_read(&ep->parent_ep->com) != DEAD) { c4iw_get_ep(&ep->com); ep->parent_ep->com.cm_id->event_handler( @@ -813,6 +925,8 @@ static void established_upcall(struct c4iw_ep *ep) PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_ESTABLISHED; + event.ird = ep->ird; + event.ord = ep->ord; if (ep->com.cm_id) { PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); ep->com.cm_id->event_handler(ep->com.cm_id, &event); @@ -848,7 +962,10 @@ static int update_rx_credits(struct c4iw_ep *ep, u32 credits) static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) { struct mpa_message *mpa; + struct mpa_v2_conn_params *mpa_v2_params; u16 plen; + u16 resp_ird, resp_ord; + u8 rtr_mismatch = 0, insuff_ird = 0; struct c4iw_qp_attributes attrs; enum c4iw_qp_attr_mask mask; int err; @@ -888,7 +1005,9 @@ static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) mpa = (struct mpa_message *) ep->mpa_pkt; /* Validate MPA header. */ - if (mpa->revision != mpa_rev) { + if (mpa->revision > mpa_rev) { + printk(KERN_ERR MOD "%s MPA version mismatch. Local = %d," + " Received = %d\n", __func__, mpa_rev, mpa->revision); err = -EPROTO; goto err; } @@ -938,13 +1057,66 @@ static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; ep->mpa_attr.recv_marker_enabled = markers_enabled; ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; - ep->mpa_attr.version = mpa_rev; - ep->mpa_attr.p2p_type = peer2peer ? 
p2p_type : - FW_RI_INIT_P2PTYPE_DISABLED; + ep->mpa_attr.version = mpa->revision; + ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED; + + if (mpa->revision == 2) { + ep->mpa_attr.enhanced_rdma_conn = + mpa->flags & MPA_ENHANCED_RDMA_CONN ? 1 : 0; + if (ep->mpa_attr.enhanced_rdma_conn) { + mpa_v2_params = (struct mpa_v2_conn_params *) + (ep->mpa_pkt + sizeof(*mpa)); + resp_ird = ntohs(mpa_v2_params->ird) & + MPA_V2_IRD_ORD_MASK; + resp_ord = ntohs(mpa_v2_params->ord) & + MPA_V2_IRD_ORD_MASK; + + /* + * This is a double-check. Ideally, below checks are + * not required since ird/ord stuff has been taken + * care of in c4iw_accept_cr + */ + if ((ep->ird < resp_ord) || (ep->ord > resp_ird)) { + err = -ENOMEM; + ep->ird = resp_ord; + ep->ord = resp_ird; + insuff_ird = 1; + } + + if (ntohs(mpa_v2_params->ird) & + MPA_V2_PEER2PEER_MODEL) { + if (ntohs(mpa_v2_params->ord) & + MPA_V2_RDMA_WRITE_RTR) + ep->mpa_attr.p2p_type = + FW_RI_INIT_P2PTYPE_RDMA_WRITE; + else if (ntohs(mpa_v2_params->ord) & + MPA_V2_RDMA_READ_RTR) + ep->mpa_attr.p2p_type = + FW_RI_INIT_P2PTYPE_READ_REQ; + } + } + } else if (mpa->revision == 1) + if (peer2peer) + ep->mpa_attr.p2p_type = p2p_type; + PDBG("%s - crc_enabled=%d, recv_marker_enabled=%d, " - "xmit_marker_enabled=%d, version=%d\n", __func__, - ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled, - ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version); + "xmit_marker_enabled=%d, version=%d p2p_type=%d local-p2p_type = " + "%d\n", __func__, ep->mpa_attr.crc_enabled, + ep->mpa_attr.recv_marker_enabled, + ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version, + ep->mpa_attr.p2p_type, p2p_type); + + /* + * If responder's RTR does not match with that of initiator, assign + * FW_RI_INIT_P2PTYPE_DISABLED in mpa attributes so that RTR is not + * generated when moving QP to RTS state. + * A TERM message will be sent after QP has moved to RTS state + */ + if ((ep->mpa_attr.version == 2) && + (ep->mpa_attr.p2p_type != p2p_type)) { + ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED; + rtr_mismatch = 1; + } attrs.mpa_attr = ep->mpa_attr; attrs.max_ird = ep->ird; @@ -961,6 +1133,39 @@ static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) ep->com.qp, mask, &attrs, 1); if (err) goto err; + + /* + * If responder's RTR requirement did not match with what initiator + * supports, generate TERM message + */ + if (rtr_mismatch) { + printk(KERN_ERR "%s: RTR mismatch, sending TERM\n", __func__); + attrs.layer_etype = LAYER_MPA | DDP_LLP; + attrs.ecode = MPA_NOMATCH_RTR; + attrs.next_state = C4IW_QP_STATE_TERMINATE; + err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, &attrs, 0); + err = -ENOMEM; + goto out; + } + + /* + * Generate TERM if initiator IRD is not sufficient for responder + * provided ORD. Currently, we do the same behaviour even when + * responder provided IRD is also not sufficient as regards to + * initiator ORD. 
+ */ + if (insuff_ird) { + printk(KERN_ERR "%s: Insufficient IRD, sending TERM\n", + __func__); + attrs.layer_etype = LAYER_MPA | DDP_LLP; + attrs.ecode = MPA_INSUFF_IRD; + attrs.next_state = C4IW_QP_STATE_TERMINATE; + err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, &attrs, 0); + err = -ENOMEM; + goto out; + } goto out; err: state_set(&ep->com, ABORTING); @@ -973,6 +1178,7 @@ out: static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) { struct mpa_message *mpa; + struct mpa_v2_conn_params *mpa_v2_params; u16 plen; PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); @@ -1013,7 +1219,9 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) /* * Validate MPA Header. */ - if (mpa->revision != mpa_rev) { + if (mpa->revision > mpa_rev) { + printk(KERN_ERR MOD "%s MPA version mismatch. Local = %d," + " Received = %d\n", __func__, mpa_rev, mpa->revision); abort_connection(ep, skb, GFP_KERNEL); return; } @@ -1056,9 +1264,37 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; ep->mpa_attr.recv_marker_enabled = markers_enabled; ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; - ep->mpa_attr.version = mpa_rev; - ep->mpa_attr.p2p_type = peer2peer ? p2p_type : - FW_RI_INIT_P2PTYPE_DISABLED; + ep->mpa_attr.version = mpa->revision; + if (mpa->revision == 1) + ep->tried_with_mpa_v1 = 1; + ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED; + + if (mpa->revision == 2) { + ep->mpa_attr.enhanced_rdma_conn = + mpa->flags & MPA_ENHANCED_RDMA_CONN ? 1 : 0; + if (ep->mpa_attr.enhanced_rdma_conn) { + mpa_v2_params = (struct mpa_v2_conn_params *) + (ep->mpa_pkt + sizeof(*mpa)); + ep->ird = ntohs(mpa_v2_params->ird) & + MPA_V2_IRD_ORD_MASK; + ep->ord = ntohs(mpa_v2_params->ord) & + MPA_V2_IRD_ORD_MASK; + if (ntohs(mpa_v2_params->ird) & MPA_V2_PEER2PEER_MODEL) + if (peer2peer) { + if (ntohs(mpa_v2_params->ord) & + MPA_V2_RDMA_WRITE_RTR) + ep->mpa_attr.p2p_type = + FW_RI_INIT_P2PTYPE_RDMA_WRITE; + else if (ntohs(mpa_v2_params->ord) & + MPA_V2_RDMA_READ_RTR) + ep->mpa_attr.p2p_type = + FW_RI_INIT_P2PTYPE_READ_REQ; + } + } + } else if (mpa->revision == 1) + if (peer2peer) + ep->mpa_attr.p2p_type = p2p_type; + PDBG("%s - crc_enabled=%d, recv_marker_enabled=%d, " "xmit_marker_enabled=%d, version=%d p2p_type=%d\n", __func__, ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled, @@ -1550,6 +1786,112 @@ static int is_neg_adv_abort(unsigned int status) status == CPL_ERR_PERSIST_NEG_ADVICE; } +static int c4iw_reconnect(struct c4iw_ep *ep) +{ + int err = 0; + struct rtable *rt; + struct net_device *pdev; + struct neighbour *neigh; + int step; + + PDBG("%s qp %p cm_id %p\n", __func__, ep->com.qp, ep->com.cm_id); + init_timer(&ep->timer); + + /* + * Allocate an active TID to initiate a TCP connection. 
+ */ + ep->atid = cxgb4_alloc_atid(ep->com.dev->rdev.lldi.tids, ep); + if (ep->atid == -1) { + printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __func__); + err = -ENOMEM; + goto fail2; + } + + /* find a route */ + rt = find_route(ep->com.dev, + ep->com.cm_id->local_addr.sin_addr.s_addr, + ep->com.cm_id->remote_addr.sin_addr.s_addr, + ep->com.cm_id->local_addr.sin_port, + ep->com.cm_id->remote_addr.sin_port, 0); + if (!rt) { + printk(KERN_ERR MOD "%s - cannot find route.\n", __func__); + err = -EHOSTUNREACH; + goto fail3; + } + ep->dst = &rt->dst; + + neigh = dst_get_neighbour(ep->dst); + + /* get a l2t entry */ + if (neigh->dev->flags & IFF_LOOPBACK) { + PDBG("%s LOOPBACK\n", __func__); + pdev = ip_dev_find(&init_net, + ep->com.cm_id->remote_addr.sin_addr.s_addr); + ep->l2t = cxgb4_l2t_get(ep->com.dev->rdev.lldi.l2t, + neigh, pdev, 0); + ep->mtu = pdev->mtu; + ep->tx_chan = cxgb4_port_chan(pdev); + ep->smac_idx = (cxgb4_port_viid(pdev) & 0x7F) << 1; + step = ep->com.dev->rdev.lldi.ntxq / + ep->com.dev->rdev.lldi.nchan; + ep->txq_idx = cxgb4_port_idx(pdev) * step; + step = ep->com.dev->rdev.lldi.nrxq / + ep->com.dev->rdev.lldi.nchan; + ep->ctrlq_idx = cxgb4_port_idx(pdev); + ep->rss_qid = ep->com.dev->rdev.lldi.rxq_ids[ + cxgb4_port_idx(pdev) * step]; + dev_put(pdev); + } else { + ep->l2t = cxgb4_l2t_get(ep->com.dev->rdev.lldi.l2t, + neigh, neigh->dev, 0); + ep->mtu = dst_mtu(ep->dst); + ep->tx_chan = cxgb4_port_chan(neigh->dev); + ep->smac_idx = (cxgb4_port_viid(neigh->dev) & 0x7F) << 1; + step = ep->com.dev->rdev.lldi.ntxq / + ep->com.dev->rdev.lldi.nchan; + ep->txq_idx = cxgb4_port_idx(neigh->dev) * step; + ep->ctrlq_idx = cxgb4_port_idx(neigh->dev); + step = ep->com.dev->rdev.lldi.nrxq / + ep->com.dev->rdev.lldi.nchan; + ep->rss_qid = ep->com.dev->rdev.lldi.rxq_ids[ + cxgb4_port_idx(neigh->dev) * step]; + } + if (!ep->l2t) { + printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __func__); + err = -ENOMEM; + goto fail4; + } + + PDBG("%s txq_idx %u tx_chan %u smac_idx %u rss_qid %u l2t_idx %u\n", + __func__, ep->txq_idx, ep->tx_chan, ep->smac_idx, ep->rss_qid, + ep->l2t->idx); + + state_set(&ep->com, CONNECTING); + ep->tos = 0; + + /* send connect request to rnic */ + err = send_connect(ep); + if (!err) + goto out; + + cxgb4_l2t_release(ep->l2t); +fail4: + dst_release(ep->dst); +fail3: + cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid); +fail2: + /* + * remember to send notification to upper layer. + * We are in here so the upper layer is not aware that this is + * re-connect attempt and so, upper layer is still waiting for + * response of 1st connect request. + */ + connect_reply_upcall(ep, -ECONNRESET); + c4iw_put_ep(&ep->com); +out: + return err; +} + static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) { struct cpl_abort_req_rss *req = cplhdr(skb); @@ -1573,8 +1915,11 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) /* * Wake up any threads in rdma_init() or rdma_fini(). 
+ * However, this is not needed if com state is just + * MPA_REQ_SENT */ - c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); + if (ep->com.state != MPA_REQ_SENT) + c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); mutex_lock(&ep->com.mutex); switch (ep->com.state) { @@ -1585,7 +1930,21 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) break; case MPA_REQ_SENT: stop_ep_timer(ep); - connect_reply_upcall(ep, -ECONNRESET); + if (mpa_rev == 2 && ep->tried_with_mpa_v1) + connect_reply_upcall(ep, -ECONNRESET); + else { + /* + * we just don't send notification upwards because we + * want to retry with mpa_v1 without upper layers even + * knowing it. + * + * do some housekeeping so as to re-initiate the + * connection + */ + PDBG("%s: mpa_rev=%d. Retrying with mpav1\n", __func__, + mpa_rev); + ep->retry_with_mpa_v1 = 1; + } break; case MPA_REP_SENT: break; @@ -1621,7 +1980,9 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) dst_confirm(ep->dst); if (ep->com.state != ABORTING) { __state_set(&ep->com, DEAD); - release = 1; + /* we don't release if we want to retry with mpa_v1 */ + if (!ep->retry_with_mpa_v1) + release = 1; } mutex_unlock(&ep->com.mutex); @@ -1641,6 +2002,15 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) out: if (release) release_ep_resources(ep); + + /* retry with mpa-v1 */ + if (ep && ep->retry_with_mpa_v1) { + cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, ep->hwtid); + dst_release(ep->dst); + cxgb4_l2t_release(ep->l2t); + c4iw_reconnect(ep); + } + return 0; } @@ -1792,18 +2162,40 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) goto err; } - cm_id->add_ref(cm_id); - ep->com.cm_id = cm_id; - ep->com.qp = qp; + if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { + if (conn_param->ord > ep->ird) { + ep->ird = conn_param->ird; + ep->ord = conn_param->ord; + send_mpa_reject(ep, conn_param->private_data, + conn_param->private_data_len); + abort_connection(ep, NULL, GFP_KERNEL); + err = -ENOMEM; + goto err; + } + if (conn_param->ird > ep->ord) { + if (!ep->ord) + conn_param->ird = 1; + else { + abort_connection(ep, NULL, GFP_KERNEL); + err = -ENOMEM; + goto err; + } + } + } ep->ird = conn_param->ird; ep->ord = conn_param->ord; - if (peer2peer && ep->ird == 0) - ep->ird = 1; + if (ep->mpa_attr.version != 2) + if (peer2peer && ep->ird == 0) + ep->ird = 1; PDBG("%s %d ird %d ord %d\n", __func__, __LINE__, ep->ird, ep->ord); + cm_id->add_ref(cm_id); + ep->com.cm_id = cm_id; + ep->com.qp = qp; + /* bind QP to EP and move to RTS */ attrs.mpa_attr = ep->mpa_attr; attrs.max_ird = ep->ird; @@ -1944,6 +2336,8 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) ep->com.dev->rdev.lldi.nchan; ep->rss_qid = ep->com.dev->rdev.lldi.rxq_ids[ cxgb4_port_idx(neigh->dev) * step]; + ep->retry_with_mpa_v1 = 0; + ep->tried_with_mpa_v1 = 0; } if (!ep->l2t) { printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __func__); @@ -2323,8 +2717,11 @@ static int peer_abort_intr(struct c4iw_dev *dev, struct sk_buff *skb) /* * Wake up any threads in rdma_init() or rdma_fini(). 
+ * However, this is not needed if com state is just + * MPA_REQ_SENT */ - c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); + if (ep->com.state != MPA_REQ_SENT) + c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); sched(dev, skb); return 0; } diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index 4f045375c8e2..62cea0e2b158 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -323,6 +323,7 @@ struct c4iw_mpa_attributes { u8 recv_marker_enabled; u8 xmit_marker_enabled; u8 crc_enabled; + u8 enhanced_rdma_conn; u8 version; u8 p2p_type; }; @@ -349,6 +350,8 @@ struct c4iw_qp_attributes { u8 is_terminate_local; struct c4iw_mpa_attributes mpa_attr; struct c4iw_ep *llp_stream_handle; + u8 layer_etype; + u8 ecode; }; struct c4iw_qp { @@ -501,11 +504,18 @@ enum c4iw_mmid_state { #define MPA_KEY_REP "MPA ID Rep Frame" #define MPA_MAX_PRIVATE_DATA 256 +#define MPA_ENHANCED_RDMA_CONN 0x10 #define MPA_REJECT 0x20 #define MPA_CRC 0x40 #define MPA_MARKERS 0x80 #define MPA_FLAGS_MASK 0xE0 +#define MPA_V2_PEER2PEER_MODEL 0x8000 +#define MPA_V2_ZERO_LEN_FPDU_RTR 0x4000 +#define MPA_V2_RDMA_WRITE_RTR 0x8000 +#define MPA_V2_RDMA_READ_RTR 0x4000 +#define MPA_V2_IRD_ORD_MASK 0x3FFF + #define c4iw_put_ep(ep) { \ PDBG("put_ep (via %s:%u) ep %p refcnt %d\n", __func__, __LINE__, \ ep, atomic_read(&((ep)->kref.refcount))); \ @@ -528,6 +538,11 @@ struct mpa_message { u8 private_data[0]; }; +struct mpa_v2_conn_params { + __be16 ird; + __be16 ord; +}; + struct terminate_message { u8 layer_etype; u8 ecode; @@ -580,7 +595,10 @@ enum c4iw_ddp_ecodes { enum c4iw_mpa_ecodes { MPA_CRC_ERR = 0x02, - MPA_MARKER_ERR = 0x03 + MPA_MARKER_ERR = 0x03, + MPA_LOCAL_CATA = 0x05, + MPA_INSUFF_IRD = 0x06, + MPA_NOMATCH_RTR = 0x07, }; enum c4iw_ep_state { @@ -651,6 +669,8 @@ struct c4iw_ep { u16 txq_idx; u16 ctrlq_idx; u8 tos; + u8 retry_with_mpa_v1; + u8 tried_with_mpa_v1; }; static inline struct c4iw_ep *to_ep(struct iw_cm_id *cm_id) diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index a41578e48c7b..ec3ce675fdff 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -917,7 +917,11 @@ static void post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe, wqe->u.terminate.type = FW_RI_TYPE_TERMINATE; wqe->u.terminate.immdlen = cpu_to_be32(sizeof *term); term = (struct terminate_message *)wqe->u.terminate.termmsg; - build_term_codes(err_cqe, &term->layer_etype, &term->ecode); + if (qhp->attr.layer_etype == (LAYER_MPA|DDP_LLP)) { + term->layer_etype = qhp->attr.layer_etype; + term->ecode = qhp->attr.ecode; + } else + build_term_codes(err_cqe, &term->layer_etype, &term->ecode); c4iw_ofld_send(&qhp->rhp->rdev, skb); } @@ -1012,6 +1016,7 @@ out: static void build_rtr_msg(u8 p2p_type, struct fw_ri_init *init) { + PDBG("%s p2p_type = %d\n", __func__, p2p_type); memset(&init->u, 0, sizeof init->u); switch (p2p_type) { case FW_RI_INIT_P2PTYPE_RDMA_WRITE: @@ -1212,6 +1217,8 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, break; case C4IW_QP_STATE_TERMINATE: set_state(qhp, C4IW_QP_STATE_TERMINATE); + qhp->attr.layer_etype = attrs->layer_etype; + qhp->attr.ecode = attrs->ecode; if (qhp->ibqp.uobject) t4_set_wq_in_error(&qhp->wq); ep = qhp->ep; @@ -1334,7 +1341,10 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp) rhp = qhp->rhp; attrs.next_state = C4IW_QP_STATE_ERROR; - c4iw_modify_qp(rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 0); + if (qhp->attr.state == C4IW_QP_STATE_TERMINATE) + 
c4iw_modify_qp(rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); + else + c4iw_modify_qp(rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 0); wait_event(qhp->wait, !qhp->ep); remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid); From 615eb715ae10cbaa8079ab8cacf8f4596be4087a Mon Sep 17 00:00:00 2001 From: Tatyana Nikolova Date: Sun, 25 Sep 2011 20:17:46 +0530 Subject: [PATCH 16/62] RDMA/nes: Add support for MPAv2 Enhanced RDMA Negotiation This patch adds support for Enhanced RDMA Connection Establishment (draft-ietf-storm-mpa-peer-connect-06), aka MPAv2. Details of draft can be obtained from: For backwards compatibility, the MPAv2 enabled driver reverts to MPAv1 if the remote node doesn't support MPAv2. Signed-off-by: Tatyana Nikolova Signed-off-by: Faisal Latif Reviewed-by: Steve Wise Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.c | 1102 +++++++++++++++---------- drivers/infiniband/hw/nes/nes_cm.h | 73 +- drivers/infiniband/hw/nes/nes_verbs.h | 3 +- 3 files changed, 703 insertions(+), 475 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index c118663e4437..7dc43ea7d4e7 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -77,26 +77,19 @@ atomic_t cm_nodes_destroyed; atomic_t cm_accel_dropped_pkts; atomic_t cm_resets_recvd; -static inline int mini_cm_accelerated(struct nes_cm_core *, - struct nes_cm_node *); -static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *, - struct nes_vnic *, struct nes_cm_info *); +static inline int mini_cm_accelerated(struct nes_cm_core *, struct nes_cm_node *); +static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *, struct nes_vnic *, struct nes_cm_info *); static int mini_cm_del_listen(struct nes_cm_core *, struct nes_cm_listener *); -static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *, - struct nes_vnic *, u16, void *, struct nes_cm_info *); +static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *, struct nes_vnic *, u16, void *, struct nes_cm_info *); static int mini_cm_close(struct nes_cm_core *, struct nes_cm_node *); -static int mini_cm_accept(struct nes_cm_core *, struct ietf_mpa_frame *, - struct nes_cm_node *); -static int mini_cm_reject(struct nes_cm_core *, struct ietf_mpa_frame *, - struct nes_cm_node *); -static int mini_cm_recv_pkt(struct nes_cm_core *, struct nes_vnic *, - struct sk_buff *); +static int mini_cm_accept(struct nes_cm_core *, struct nes_cm_node *); +static int mini_cm_reject(struct nes_cm_core *, struct nes_cm_node *); +static int mini_cm_recv_pkt(struct nes_cm_core *, struct nes_vnic *, struct sk_buff *); static int mini_cm_dealloc_core(struct nes_cm_core *); static int mini_cm_get(struct nes_cm_core *); static int mini_cm_set(struct nes_cm_core *, u32, u32); -static void form_cm_frame(struct sk_buff *, struct nes_cm_node *, - void *, u32, void *, u32, u8); +static void form_cm_frame(struct sk_buff *, struct nes_cm_node *, void *, u32, void *, u32, u8); static int add_ref_cm_node(struct nes_cm_node *); static int rem_ref_cm_node(struct nes_cm_core *, struct nes_cm_node *); @@ -111,16 +104,14 @@ static int send_syn(struct nes_cm_node *, u32, struct sk_buff *); static int send_reset(struct nes_cm_node *, struct sk_buff *); static int send_ack(struct nes_cm_node *cm_node, struct sk_buff *skb); static int send_fin(struct nes_cm_node *cm_node, struct sk_buff *skb); -static void process_packet(struct nes_cm_node *, struct sk_buff *, - struct nes_cm_core *); +static void process_packet(struct 
nes_cm_node *, struct sk_buff *, struct nes_cm_core *); static void active_open_err(struct nes_cm_node *, struct sk_buff *, int); static void passive_open_err(struct nes_cm_node *, struct sk_buff *, int); static void cleanup_retrans_entry(struct nes_cm_node *); static void handle_rcv_mpa(struct nes_cm_node *, struct sk_buff *); static void free_retrans_entry(struct nes_cm_node *cm_node); -static int handle_tcp_options(struct nes_cm_node *cm_node, struct tcphdr *tcph, - struct sk_buff *skb, int optionsize, int passive); +static int handle_tcp_options(struct nes_cm_node *cm_node, struct tcphdr *tcph, struct sk_buff *skb, int optionsize, int passive); /* CM event handler functions */ static void cm_event_connected(struct nes_cm_event *); @@ -130,6 +121,12 @@ static void cm_event_mpa_req(struct nes_cm_event *); static void cm_event_mpa_reject(struct nes_cm_event *); static void handle_recv_entry(struct nes_cm_node *cm_node, u32 rem_node); +/* MPA build functions */ +static int cm_build_mpa_frame(struct nes_cm_node *, u8 **, u16 *, u8 *, u8); +static void build_mpa_v2(struct nes_cm_node *, void *, u8); +static void build_mpa_v1(struct nes_cm_node *, void *, u8); +static void build_rdma0_msg(struct nes_cm_node *, struct nes_qp **); + static void print_core(struct nes_cm_core *core); /* External CM API Interface */ @@ -163,8 +160,8 @@ atomic_t cm_rejects; /** * create_event */ -static struct nes_cm_event *create_event(struct nes_cm_node *cm_node, - enum nes_cm_event_type type) +static struct nes_cm_event *create_event(struct nes_cm_node * cm_node, + enum nes_cm_event_type type) { struct nes_cm_event *event; @@ -186,10 +183,10 @@ static struct nes_cm_event *create_event(struct nes_cm_node *cm_node, event->cm_info.cm_id = cm_node->cm_id; nes_debug(NES_DBG_CM, "cm_node=%p Created event=%p, type=%u, " - "dst_addr=%08x[%x], src_addr=%08x[%x]\n", - cm_node, event, type, event->cm_info.loc_addr, - event->cm_info.loc_port, event->cm_info.rem_addr, - event->cm_info.rem_port); + "dst_addr=%08x[%x], src_addr=%08x[%x]\n", + cm_node, event, type, event->cm_info.loc_addr, + event->cm_info.loc_port, event->cm_info.rem_addr, + event->cm_info.rem_port); nes_cm_post_event(event); return event; @@ -201,14 +198,19 @@ static struct nes_cm_event *create_event(struct nes_cm_node *cm_node, */ static int send_mpa_request(struct nes_cm_node *cm_node, struct sk_buff *skb) { + u8 start_addr = 0; + u8 *start_ptr = &start_addr; + u8 **start_buff = &start_ptr; + u16 buff_len = 0; + if (!skb) { nes_debug(NES_DBG_CM, "skb set to NULL\n"); return -1; } /* send an MPA Request frame */ - form_cm_frame(skb, cm_node, NULL, 0, &cm_node->mpa_frame, - cm_node->mpa_frame_size, SET_ACK); + cm_build_mpa_frame(cm_node, start_buff, &buff_len, NULL, MPA_KEY_REQUEST); + form_cm_frame(skb, cm_node, NULL, 0, *start_buff, buff_len, SET_ACK); return schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0); } @@ -217,7 +219,11 @@ static int send_mpa_request(struct nes_cm_node *cm_node, struct sk_buff *skb) static int send_mpa_reject(struct nes_cm_node *cm_node) { - struct sk_buff *skb = NULL; + struct sk_buff *skb = NULL; + u8 start_addr = 0; + u8 *start_ptr = &start_addr; + u8 **start_buff = &start_ptr; + u16 buff_len = 0; skb = dev_alloc_skb(MAX_CM_BUFFER); if (!skb) { @@ -226,8 +232,8 @@ static int send_mpa_reject(struct nes_cm_node *cm_node) } /* send an MPA reject frame */ - form_cm_frame(skb, cm_node, NULL, 0, &cm_node->mpa_frame, - cm_node->mpa_frame_size, SET_ACK | SET_FIN); + cm_build_mpa_frame(cm_node, start_buff, &buff_len, NULL, 
MPA_KEY_REPLY); + form_cm_frame(skb, cm_node, NULL, 0, *start_buff, buff_len, SET_ACK | SET_FIN); cm_node->state = NES_CM_STATE_FIN_WAIT1; return schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0); @@ -239,24 +245,31 @@ static int send_mpa_reject(struct nes_cm_node *cm_node) * IETF MPA frame */ static int parse_mpa(struct nes_cm_node *cm_node, u8 *buffer, u32 *type, - u32 len) + u32 len) { - struct ietf_mpa_frame *mpa_frame; + struct ietf_mpa_v1 *mpa_frame; + struct ietf_mpa_v2 *mpa_v2_frame; + struct ietf_rtr_msg *rtr_msg; + int mpa_hdr_len; + int priv_data_len; *type = NES_MPA_REQUEST_ACCEPT; /* assume req frame is in tcp data payload */ - if (len < sizeof(struct ietf_mpa_frame)) { + if (len < sizeof(struct ietf_mpa_v1)) { nes_debug(NES_DBG_CM, "The received ietf buffer was too small (%x)\n", len); return -EINVAL; } - mpa_frame = (struct ietf_mpa_frame *)buffer; - cm_node->mpa_frame_size = ntohs(mpa_frame->priv_data_len); + /* points to the beginning of the frame, which could be MPA V1 or V2 */ + mpa_frame = (struct ietf_mpa_v1 *)buffer; + mpa_hdr_len = sizeof(struct ietf_mpa_v1); + priv_data_len = ntohs(mpa_frame->priv_data_len); + /* make sure mpa private data len is less than 512 bytes */ - if (cm_node->mpa_frame_size > IETF_MAX_PRIV_DATA_LEN) { + if (priv_data_len > IETF_MAX_PRIV_DATA_LEN) { nes_debug(NES_DBG_CM, "The received Length of Private" - " Data field exceeds 512 octets\n"); + " Data field exceeds 512 octets\n"); return -EINVAL; } /* @@ -264,11 +277,22 @@ static int parse_mpa(struct nes_cm_node *cm_node, u8 *buffer, u32 *type, * received MPA version and MPA key information * */ - if (mpa_frame->rev != mpa_version) { + if (mpa_frame->rev != IETF_MPA_V1 && mpa_frame->rev != IETF_MPA_V2) { nes_debug(NES_DBG_CM, "The received mpa version" - " can not be interoperated\n"); + " is not supported\n"); return -EINVAL; } + /* + * backwards compatibility only + */ + if (mpa_frame->rev > cm_node->mpa_frame_rev) { + nes_debug(NES_DBG_CM, "The received mpa version" + " can not be interoperated\n"); + return -EINVAL; + } else { + cm_node->mpa_frame_rev = mpa_frame->rev; + } + if (cm_node->state != NES_CM_STATE_MPAREQ_SENT) { if (memcmp(mpa_frame->key, IEFT_MPA_KEY_REQ, IETF_MPA_KEY_SIZE)) { nes_debug(NES_DBG_CM, "Unexpected MPA Key received \n"); @@ -281,25 +305,75 @@ static int parse_mpa(struct nes_cm_node *cm_node, u8 *buffer, u32 *type, } } - if (cm_node->mpa_frame_size + sizeof(struct ietf_mpa_frame) != len) { + + if (priv_data_len + mpa_hdr_len != len) { nes_debug(NES_DBG_CM, "The received ietf buffer was not right" - " complete (%x + %x != %x)\n", - cm_node->mpa_frame_size, - (u32)sizeof(struct ietf_mpa_frame), len); + " complete (%x + %x != %x)\n", + priv_data_len, mpa_hdr_len, len); return -EINVAL; } /* make sure it does not exceed the max size */ if (len > MAX_CM_BUFFER) { nes_debug(NES_DBG_CM, "The received ietf buffer was too large" - " (%x + %x != %x)\n", - cm_node->mpa_frame_size, - (u32)sizeof(struct ietf_mpa_frame), len); + " (%x + %x != %x)\n", + priv_data_len, mpa_hdr_len, len); return -EINVAL; } + cm_node->mpa_frame_size = priv_data_len; + + switch (mpa_frame->rev) { + case IETF_MPA_V2: { + u16 ird_size; + u16 ord_size; + mpa_v2_frame = (struct ietf_mpa_v2 *)buffer; + mpa_hdr_len += IETF_RTR_MSG_SIZE; + cm_node->mpa_frame_size -= IETF_RTR_MSG_SIZE; + rtr_msg = &mpa_v2_frame->rtr_msg; + + /* parse rtr message */ + rtr_msg->ctrl_ird = ntohs(rtr_msg->ctrl_ird); + rtr_msg->ctrl_ord = ntohs(rtr_msg->ctrl_ord); + ird_size = rtr_msg->ctrl_ird & IETF_NO_IRD_ORD; + 
ord_size = rtr_msg->ctrl_ord & IETF_NO_IRD_ORD; + + if (!(rtr_msg->ctrl_ird & IETF_PEER_TO_PEER)) { + /* send reset */ + return -EINVAL; + } + + if (cm_node->state != NES_CM_STATE_MPAREQ_SENT) { + /* responder */ + if (cm_node->ord_size > ird_size) + cm_node->ord_size = ird_size; + } else { + /* initiator */ + if (cm_node->ord_size > ird_size) + cm_node->ord_size = ird_size; + + if (cm_node->ird_size < ord_size) { + /* no resources available */ + /* send terminate message */ + return -EINVAL; + } + } + + if (rtr_msg->ctrl_ord & IETF_RDMA0_READ) { + cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO; + } else if (rtr_msg->ctrl_ord & IETF_RDMA0_WRITE) { + cm_node->send_rdma0_op = SEND_RDMA_WRITE_ZERO; + } else { /* Not supported RDMA0 operation */ + return -EINVAL; + } + break; + } + case IETF_MPA_V1: + default: + break; + } + /* copy entire MPA frame to our cm_node's frame */ - memcpy(cm_node->mpa_frame_buf, buffer + sizeof(struct ietf_mpa_frame), - cm_node->mpa_frame_size); + memcpy(cm_node->mpa_frame_buf, buffer + mpa_hdr_len, cm_node->mpa_frame_size); if (mpa_frame->flags & IETF_MPA_FLAGS_REJECT) *type = NES_MPA_REQUEST_REJECT; @@ -312,8 +386,8 @@ static int parse_mpa(struct nes_cm_node *cm_node, u8 *buffer, u32 *type, * node info to build. */ static void form_cm_frame(struct sk_buff *skb, - struct nes_cm_node *cm_node, void *options, u32 optionsize, - void *data, u32 datasize, u8 flags) + struct nes_cm_node *cm_node, void *options, u32 optionsize, + void *data, u32 datasize, u8 flags) { struct tcphdr *tcph; struct iphdr *iph; @@ -322,14 +396,14 @@ static void form_cm_frame(struct sk_buff *skb, u16 packetsize = sizeof(*iph); packetsize += sizeof(*tcph); - packetsize += optionsize + datasize; + packetsize += optionsize + datasize; + skb_trim(skb, 0); memset(skb->data, 0x00, ETH_HLEN + sizeof(*iph) + sizeof(*tcph)); - skb->len = 0; buf = skb_put(skb, packetsize + ETH_HLEN); - ethh = (struct ethhdr *) buf; + ethh = (struct ethhdr *)buf; buf += ETH_HLEN; iph = (struct iphdr *)buf; @@ -337,7 +411,7 @@ static void form_cm_frame(struct sk_buff *skb, tcph = (struct tcphdr *)buf; skb_reset_mac_header(skb); skb_set_network_header(skb, ETH_HLEN); - skb_set_transport_header(skb, ETH_HLEN+sizeof(*iph)); + skb_set_transport_header(skb, ETH_HLEN + sizeof(*iph)); buf += sizeof(*tcph); skb->ip_summed = CHECKSUM_PARTIAL; @@ -350,14 +424,14 @@ static void form_cm_frame(struct sk_buff *skb, ethh->h_proto = htons(0x0800); iph->version = IPVERSION; - iph->ihl = 5; /* 5 * 4Byte words, IP headr len */ + iph->ihl = 5; /* 5 * 4Byte words, IP headr len */ iph->tos = 0; iph->tot_len = htons(packetsize); iph->id = htons(++cm_node->tcp_cntxt.loc_id); iph->frag_off = htons(0x4000); iph->ttl = 0x40; - iph->protocol = 0x06; /* IPPROTO_TCP */ + iph->protocol = 0x06; /* IPPROTO_TCP */ iph->saddr = htonl(cm_node->loc_addr); iph->daddr = htonl(cm_node->rem_addr); @@ -370,14 +444,16 @@ static void form_cm_frame(struct sk_buff *skb, cm_node->tcp_cntxt.loc_ack_num = cm_node->tcp_cntxt.rcv_nxt; tcph->ack_seq = htonl(cm_node->tcp_cntxt.loc_ack_num); tcph->ack = 1; - } else + } else { tcph->ack_seq = 0; + } if (flags & SET_SYN) { cm_node->tcp_cntxt.loc_seq_num++; tcph->syn = 1; - } else + } else { cm_node->tcp_cntxt.loc_seq_num += datasize; + } if (flags & SET_FIN) { cm_node->tcp_cntxt.loc_seq_num++; @@ -398,10 +474,8 @@ static void form_cm_frame(struct sk_buff *skb, skb_shinfo(skb)->nr_frags = 0; cm_packets_created++; - } - /** * print_core - dump a cm core */ @@ -413,7 +487,7 @@ static void print_core(struct nes_cm_core *core) 
return; nes_debug(NES_DBG_CM, "---------------------------------------------\n"); - nes_debug(NES_DBG_CM, "State : %u \n", core->state); + nes_debug(NES_DBG_CM, "State : %u \n", core->state); nes_debug(NES_DBG_CM, "Listen Nodes : %u \n", atomic_read(&core->listen_node_cnt)); nes_debug(NES_DBG_CM, "Active Nodes : %u \n", atomic_read(&core->node_cnt)); @@ -423,6 +497,147 @@ static void print_core(struct nes_cm_core *core) nes_debug(NES_DBG_CM, "-------------- end core ---------------\n"); } +/** + * cm_build_mpa_frame - build a MPA V1 frame or MPA V2 frame + */ +static int cm_build_mpa_frame(struct nes_cm_node *cm_node, u8 **start_buff, + u16 *buff_len, u8 *pci_mem, u8 mpa_key) +{ + int ret = 0; + + *start_buff = (pci_mem) ? pci_mem : &cm_node->mpa_frame_buf[0]; + + switch (cm_node->mpa_frame_rev) { + case IETF_MPA_V1: + *start_buff = (u8 *)*start_buff + sizeof(struct ietf_rtr_msg); + *buff_len = sizeof(struct ietf_mpa_v1) + cm_node->mpa_frame_size; + build_mpa_v1(cm_node, *start_buff, mpa_key); + break; + case IETF_MPA_V2: + *buff_len = sizeof(struct ietf_mpa_v2) + cm_node->mpa_frame_size; + build_mpa_v2(cm_node, *start_buff, mpa_key); + break; + default: + ret = -EINVAL; + } + return ret; +} + +/** + * build_mpa_v2 - build a MPA V2 frame + */ +static void build_mpa_v2(struct nes_cm_node *cm_node, + void *start_addr, u8 mpa_key) +{ + struct ietf_mpa_v2 *mpa_frame = (struct ietf_mpa_v2 *)start_addr; + struct ietf_rtr_msg *rtr_msg = &mpa_frame->rtr_msg; + + /* initialize the upper 5 bytes of the frame */ + build_mpa_v1(cm_node, start_addr, mpa_key); + mpa_frame->flags |= IETF_MPA_V2_FLAG; /* set a bit to indicate MPA V2 */ + mpa_frame->priv_data_len += htons(IETF_RTR_MSG_SIZE); + + /* initialize RTR msg */ + rtr_msg->ctrl_ird = (cm_node->ird_size > IETF_NO_IRD_ORD) ? + IETF_NO_IRD_ORD : cm_node->ird_size; + rtr_msg->ctrl_ord = (cm_node->ord_size > IETF_NO_IRD_ORD) ? 
+ IETF_NO_IRD_ORD : cm_node->ord_size; + + rtr_msg->ctrl_ird |= IETF_PEER_TO_PEER; + rtr_msg->ctrl_ird |= IETF_FLPDU_ZERO_LEN; + + switch (mpa_key) { + case MPA_KEY_REQUEST: + rtr_msg->ctrl_ord |= IETF_RDMA0_WRITE; + rtr_msg->ctrl_ord |= IETF_RDMA0_READ; + break; + case MPA_KEY_REPLY: + switch (cm_node->send_rdma0_op) { + case SEND_RDMA_WRITE_ZERO: + rtr_msg->ctrl_ord |= IETF_RDMA0_WRITE; + break; + case SEND_RDMA_READ_ZERO: + rtr_msg->ctrl_ord |= IETF_RDMA0_READ; + break; + } + } + rtr_msg->ctrl_ird = htons(rtr_msg->ctrl_ird); + rtr_msg->ctrl_ord = htons(rtr_msg->ctrl_ord); +} + +/** + * build_mpa_v1 - build a MPA V1 frame + */ +static void build_mpa_v1(struct nes_cm_node *cm_node, void *start_addr, u8 mpa_key) +{ + struct ietf_mpa_v1 *mpa_frame = (struct ietf_mpa_v1 *)start_addr; + + switch (mpa_key) { + case MPA_KEY_REQUEST: + memcpy(mpa_frame->key, IEFT_MPA_KEY_REQ, IETF_MPA_KEY_SIZE); + break; + case MPA_KEY_REPLY: + memcpy(mpa_frame->key, IEFT_MPA_KEY_REP, IETF_MPA_KEY_SIZE); + break; + } + mpa_frame->flags = IETF_MPA_FLAGS_CRC; + mpa_frame->rev = cm_node->mpa_frame_rev; + mpa_frame->priv_data_len = htons(cm_node->mpa_frame_size); +} + +static void build_rdma0_msg(struct nes_cm_node *cm_node, struct nes_qp **nesqp_addr) +{ + u64 u64temp; + struct nes_qp *nesqp = *nesqp_addr; + struct nes_hw_qp_wqe *wqe = &nesqp->hwqp.sq_vbase[0]; + + u64temp = (unsigned long)nesqp; + u64temp |= NES_SW_CONTEXT_ALIGN >> 1; + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, u64temp); + + wqe->wqe_words[NES_IWARP_SQ_WQE_FRAG0_LOW_IDX] = 0; + wqe->wqe_words[NES_IWARP_SQ_WQE_FRAG0_HIGH_IDX] = 0; + + switch (cm_node->send_rdma0_op) { + case SEND_RDMA_WRITE_ZERO: + nes_debug(NES_DBG_CM, "Sending first write.\n"); + wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = + cpu_to_le32(NES_IWARP_SQ_OP_RDMAW); + wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX] = 0; + wqe->wqe_words[NES_IWARP_SQ_WQE_LENGTH0_IDX] = 0; + wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = 0; + break; + + case SEND_RDMA_READ_ZERO: + default: + if (cm_node->send_rdma0_op != SEND_RDMA_READ_ZERO) { + printk(KERN_ERR "%s[%u]: Unsupported RDMA0 len operation=%u\n", + __func__, __LINE__, cm_node->send_rdma0_op); + WARN_ON(1); + } + nes_debug(NES_DBG_CM, "Sending first rdma operation.\n"); + wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = + cpu_to_le32(NES_IWARP_SQ_OP_RDMAR); + wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX] = 1; + wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_TO_HIGH_IDX] = 0; + wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX] = 0; + wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_STAG_IDX] = 1; + wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = 1; + break; + } + + if (nesqp->sq_kmapped) { + nesqp->sq_kmapped = 0; + kunmap(nesqp->page); + } + + /*use the reserved spot on the WQ for the extra first WQE*/ + nesqp->nesqp_context->ird_ord_sizes &= cpu_to_le32(~(NES_QPCONTEXT_ORDIRD_LSMM_PRESENT | + NES_QPCONTEXT_ORDIRD_WRPDU | + NES_QPCONTEXT_ORDIRD_ALSMM)); + nesqp->skip_lsmm = 1; + nesqp->hwqp.sq_tail = 0; +} /** * schedule_nes_timer @@ -430,10 +645,10 @@ static void print_core(struct nes_cm_core *core) * rem_ref_cm_node(cm_core, cm_node);add_ref_cm_node(cm_node); */ int schedule_nes_timer(struct nes_cm_node *cm_node, struct sk_buff *skb, - enum nes_timer_type type, int send_retrans, - int close_when_complete) + enum nes_timer_type type, int send_retrans, + int close_when_complete) { - unsigned long flags; + unsigned long flags; struct nes_cm_core *cm_core = cm_node->cm_core; struct nes_timer_entry *new_send; int ret = 0; @@ -454,7 +669,7 @@ 
int schedule_nes_timer(struct nes_cm_node *cm_node, struct sk_buff *skb, new_send->close_when_complete = close_when_complete; if (type == NES_TIMER_TYPE_CLOSE) { - new_send->timetosend += (HZ/10); + new_send->timetosend += (HZ / 10); if (cm_node->recv_entry) { kfree(new_send); WARN_ON(1); @@ -475,7 +690,7 @@ int schedule_nes_timer(struct nes_cm_node *cm_node, struct sk_buff *skb, ret = nes_nic_cm_xmit(new_send->skb, cm_node->netdev); if (ret != NETDEV_TX_OK) { nes_debug(NES_DBG_CM, "Error sending packet %p " - "(jiffies = %lu)\n", new_send, jiffies); + "(jiffies = %lu)\n", new_send, jiffies); new_send->timetosend = jiffies; ret = NETDEV_TX_OK; } else { @@ -504,6 +719,7 @@ static void nes_retrans_expired(struct nes_cm_node *cm_node) struct iw_cm_id *cm_id = cm_node->cm_id; enum nes_cm_node_state state = cm_node->state; cm_node->state = NES_CM_STATE_CLOSED; + switch (state) { case NES_CM_STATE_SYN_RCVD: case NES_CM_STATE_CLOSING: @@ -536,10 +752,10 @@ static void handle_recv_entry(struct nes_cm_node *cm_node, u32 rem_node) spin_lock_irqsave(&nesqp->lock, qplockflags); if (nesqp->cm_id) { nes_debug(NES_DBG_CM, "QP%u: cm_id = %p, " - "refcount = %d: HIT A " - "NES_TIMER_TYPE_CLOSE with something " - "to do!!!\n", nesqp->hwqp.qp_id, cm_id, - atomic_read(&nesqp->refcount)); + "refcount = %d: HIT A " + "NES_TIMER_TYPE_CLOSE with something " + "to do!!!\n", nesqp->hwqp.qp_id, cm_id, + atomic_read(&nesqp->refcount)); nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED; nesqp->last_aeq = NES_AEQE_AEID_RESET_SENT; nesqp->ibqp_state = IB_QPS_ERR; @@ -548,10 +764,10 @@ static void handle_recv_entry(struct nes_cm_node *cm_node, u32 rem_node) } else { spin_unlock_irqrestore(&nesqp->lock, qplockflags); nes_debug(NES_DBG_CM, "QP%u: cm_id = %p, " - "refcount = %d: HIT A " - "NES_TIMER_TYPE_CLOSE with nothing " - "to do!!!\n", nesqp->hwqp.qp_id, cm_id, - atomic_read(&nesqp->refcount)); + "refcount = %d: HIT A " + "NES_TIMER_TYPE_CLOSE with nothing " + "to do!!!\n", nesqp->hwqp.qp_id, cm_id, + atomic_read(&nesqp->refcount)); } } else if (rem_node) { /* TIME_WAIT state */ @@ -580,11 +796,12 @@ static void nes_cm_timer_tick(unsigned long pass) int ret = NETDEV_TX_OK; struct list_head timer_list; + INIT_LIST_HEAD(&timer_list); spin_lock_irqsave(&cm_core->ht_lock, flags); list_for_each_safe(list_node, list_core_temp, - &cm_core->connected_nodes) { + &cm_core->connected_nodes) { cm_node = container_of(list_node, struct nes_cm_node, list); if ((cm_node->recv_entry) || (cm_node->send_entry)) { add_ref_cm_node(cm_node); @@ -595,18 +812,19 @@ static void nes_cm_timer_tick(unsigned long pass) list_for_each_safe(list_node, list_core_temp, &timer_list) { cm_node = container_of(list_node, struct nes_cm_node, - timer_entry); + timer_entry); recv_entry = cm_node->recv_entry; if (recv_entry) { if (time_after(recv_entry->timetosend, jiffies)) { if (nexttimeout > recv_entry->timetosend || - !settimer) { + !settimer) { nexttimeout = recv_entry->timetosend; settimer = 1; } - } else + } else { handle_recv_entry(cm_node, 1); + } } spin_lock_irqsave(&cm_node->retrans_list_lock, flags); @@ -617,8 +835,8 @@ static void nes_cm_timer_tick(unsigned long pass) if (time_after(send_entry->timetosend, jiffies)) { if (cm_node->state != NES_CM_STATE_TSA) { if ((nexttimeout > - send_entry->timetosend) || - !settimer) { + send_entry->timetosend) || + !settimer) { nexttimeout = send_entry->timetosend; settimer = 1; @@ -630,13 +848,13 @@ static void nes_cm_timer_tick(unsigned long pass) } if ((cm_node->state == NES_CM_STATE_TSA) || - (cm_node->state 
== NES_CM_STATE_CLOSED)) { + (cm_node->state == NES_CM_STATE_CLOSED)) { free_retrans_entry(cm_node); break; } if (!send_entry->retranscount || - !send_entry->retrycount) { + !send_entry->retrycount) { cm_packets_dropped++; free_retrans_entry(cm_node); @@ -645,28 +863,28 @@ static void nes_cm_timer_tick(unsigned long pass) nes_retrans_expired(cm_node); cm_node->state = NES_CM_STATE_CLOSED; spin_lock_irqsave(&cm_node->retrans_list_lock, - flags); + flags); break; } atomic_inc(&send_entry->skb->users); cm_packets_retrans++; nes_debug(NES_DBG_CM, "Retransmitting send_entry %p " - "for node %p, jiffies = %lu, time to send = " - "%lu, retranscount = %u, send_entry->seq_num = " - "0x%08X, cm_node->tcp_cntxt.rem_ack_num = " - "0x%08X\n", send_entry, cm_node, jiffies, - send_entry->timetosend, - send_entry->retranscount, - send_entry->seq_num, - cm_node->tcp_cntxt.rem_ack_num); + "for node %p, jiffies = %lu, time to send = " + "%lu, retranscount = %u, send_entry->seq_num = " + "0x%08X, cm_node->tcp_cntxt.rem_ack_num = " + "0x%08X\n", send_entry, cm_node, jiffies, + send_entry->timetosend, + send_entry->retranscount, + send_entry->seq_num, + cm_node->tcp_cntxt.rem_ack_num); spin_unlock_irqrestore(&cm_node->retrans_list_lock, - flags); + flags); ret = nes_nic_cm_xmit(send_entry->skb, cm_node->netdev); spin_lock_irqsave(&cm_node->retrans_list_lock, flags); if (ret != NETDEV_TX_OK) { nes_debug(NES_DBG_CM, "rexmit failed for " - "node=%p\n", cm_node); + "node=%p\n", cm_node); cm_packets_bounced++; send_entry->retrycount--; nexttimeout = jiffies + NES_SHORT_TIME; @@ -676,18 +894,18 @@ static void nes_cm_timer_tick(unsigned long pass) cm_packets_sent++; } nes_debug(NES_DBG_CM, "Packet Sent: retrans count = " - "%u, retry count = %u.\n", - send_entry->retranscount, - send_entry->retrycount); + "%u, retry count = %u.\n", + send_entry->retranscount, + send_entry->retrycount); if (send_entry->send_retrans) { send_entry->retranscount--; timetosend = (NES_RETRY_TIMEOUT << - (NES_DEFAULT_RETRANS - send_entry->retranscount)); + (NES_DEFAULT_RETRANS - send_entry->retranscount)); send_entry->timetosend = jiffies + - min(timetosend, NES_MAX_TIMEOUT); + min(timetosend, NES_MAX_TIMEOUT); if (nexttimeout > send_entry->timetosend || - !settimer) { + !settimer) { nexttimeout = send_entry->timetosend; settimer = 1; } @@ -696,11 +914,11 @@ static void nes_cm_timer_tick(unsigned long pass) close_when_complete = send_entry->close_when_complete; nes_debug(NES_DBG_CM, "cm_node=%p state=%d\n", - cm_node, cm_node->state); + cm_node, cm_node->state); free_retrans_entry(cm_node); if (close_when_complete) rem_ref_cm_node(cm_node->cm_core, - cm_node); + cm_node); } } while (0); @@ -710,7 +928,7 @@ static void nes_cm_timer_tick(unsigned long pass) if (settimer) { if (!timer_pending(&cm_core->tcp_timer)) { - cm_core->tcp_timer.expires = nexttimeout; + cm_core->tcp_timer.expires = nexttimeout; add_timer(&cm_core->tcp_timer); } } @@ -721,13 +939,13 @@ static void nes_cm_timer_tick(unsigned long pass) * send_syn */ static int send_syn(struct nes_cm_node *cm_node, u32 sendack, - struct sk_buff *skb) + struct sk_buff *skb) { int ret; int flags = SET_SYN; char optionsbuffer[sizeof(struct option_mss) + - sizeof(struct option_windowscale) + sizeof(struct option_base) + - TCP_OPTIONS_PADDING]; + sizeof(struct option_windowscale) + sizeof(struct option_base) + + TCP_OPTIONS_PADDING]; int optionssize = 0; /* Sending MSS option */ @@ -854,7 +1072,7 @@ static int send_fin(struct nes_cm_node *cm_node, struct sk_buff *skb) * find_node - find a cm 
node that matches the reference cm node */ static struct nes_cm_node *find_node(struct nes_cm_core *cm_core, - u16 rem_port, nes_addr_t rem_addr, u16 loc_port, nes_addr_t loc_addr) + u16 rem_port, nes_addr_t rem_addr, u16 loc_port, nes_addr_t loc_addr) { unsigned long flags; struct list_head *hte; @@ -868,12 +1086,12 @@ static struct nes_cm_node *find_node(struct nes_cm_core *cm_core, list_for_each_entry(cm_node, hte, list) { /* compare quad, return node handle if a match */ nes_debug(NES_DBG_CM, "finding node %x:%x =? %x:%x ^ %x:%x =? %x:%x\n", - cm_node->loc_addr, cm_node->loc_port, - loc_addr, loc_port, - cm_node->rem_addr, cm_node->rem_port, - rem_addr, rem_port); + cm_node->loc_addr, cm_node->loc_port, + loc_addr, loc_port, + cm_node->rem_addr, cm_node->rem_port, + rem_addr, rem_port); if ((cm_node->loc_addr == loc_addr) && (cm_node->loc_port == loc_port) && - (cm_node->rem_addr == rem_addr) && (cm_node->rem_port == rem_port)) { + (cm_node->rem_addr == rem_addr) && (cm_node->rem_port == rem_port)) { add_ref_cm_node(cm_node); spin_unlock_irqrestore(&cm_core->ht_lock, flags); return cm_node; @@ -890,7 +1108,7 @@ static struct nes_cm_node *find_node(struct nes_cm_core *cm_core, * find_listener - find a cm node listening on this addr-port pair */ static struct nes_cm_listener *find_listener(struct nes_cm_core *cm_core, - nes_addr_t dst_addr, u16 dst_port, enum nes_cm_listener_state listener_state) + nes_addr_t dst_addr, u16 dst_port, enum nes_cm_listener_state listener_state) { unsigned long flags; struct nes_cm_listener *listen_node; @@ -900,9 +1118,9 @@ static struct nes_cm_listener *find_listener(struct nes_cm_core *cm_core, list_for_each_entry(listen_node, &cm_core->listen_list.list, list) { /* compare node pair, return node handle if a match */ if (((listen_node->loc_addr == dst_addr) || - listen_node->loc_addr == 0x00000000) && - (listen_node->loc_port == dst_port) && - (listener_state & listen_node->listener_state)) { + listen_node->loc_addr == 0x00000000) && + (listen_node->loc_port == dst_port) && + (listener_state & listen_node->listener_state)) { atomic_inc(&listen_node->ref_count); spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); return listen_node; @@ -927,7 +1145,7 @@ static int add_hte_node(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node return -EINVAL; nes_debug(NES_DBG_CM, "Adding Node %p to Active Connection HT\n", - cm_node); + cm_node); spin_lock_irqsave(&cm_core->ht_lock, flags); @@ -946,7 +1164,7 @@ static int add_hte_node(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node * mini_cm_dec_refcnt_listen */ static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core, - struct nes_cm_listener *listener, int free_hanging_nodes) + struct nes_cm_listener *listener, int free_hanging_nodes) { int ret = -EINVAL; int err = 0; @@ -957,8 +1175,8 @@ static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core, struct list_head reset_list; nes_debug(NES_DBG_CM, "attempting listener= %p free_nodes= %d, " - "refcnt=%d\n", listener, free_hanging_nodes, - atomic_read(&listener->ref_count)); + "refcnt=%d\n", listener, free_hanging_nodes, + atomic_read(&listener->ref_count)); /* free non-accelerated child nodes for this listener */ INIT_LIST_HEAD(&reset_list); if (free_hanging_nodes) { @@ -966,7 +1184,7 @@ static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core, list_for_each_safe(list_pos, list_temp, &g_cm_core->connected_nodes) { cm_node = container_of(list_pos, struct nes_cm_node, - list); + list); if ((cm_node->listener == listener) && 
(!cm_node->accelerated)) { add_ref_cm_node(cm_node); @@ -978,7 +1196,7 @@ static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core, list_for_each_safe(list_pos, list_temp, &reset_list) { cm_node = container_of(list_pos, struct nes_cm_node, - reset_entry); + reset_entry); { struct nes_cm_node *loopback = cm_node->loopbackpartner; enum nes_cm_node_state old_state; @@ -990,7 +1208,7 @@ static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core, err = send_reset(cm_node, NULL); if (err) { cm_node->state = - NES_CM_STATE_CLOSED; + NES_CM_STATE_CLOSED; WARN_ON(1); } else { old_state = cm_node->state; @@ -1035,10 +1253,9 @@ static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core, spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); - if (listener->nesvnic) { + if (listener->nesvnic) nes_manage_apbvt(listener->nesvnic, listener->loc_port, - PCI_FUNC(listener->nesvnic->nesdev->pcidev->devfn), NES_MANAGE_APBVT_DEL); - } + PCI_FUNC(listener->nesvnic->nesdev->pcidev->devfn), NES_MANAGE_APBVT_DEL); nes_debug(NES_DBG_CM, "destroying listener (%p)\n", listener); @@ -1052,8 +1269,8 @@ static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core, if (listener) { if (atomic_read(&listener->pend_accepts_cnt) > 0) nes_debug(NES_DBG_CM, "destroying listener (%p)" - " with non-zero pending accepts=%u\n", - listener, atomic_read(&listener->pend_accepts_cnt)); + " with non-zero pending accepts=%u\n", + listener, atomic_read(&listener->pend_accepts_cnt)); } return ret; @@ -1064,7 +1281,7 @@ static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core, * mini_cm_del_listen */ static int mini_cm_del_listen(struct nes_cm_core *cm_core, - struct nes_cm_listener *listener) + struct nes_cm_listener *listener) { listener->listener_state = NES_CM_LISTENER_PASSIVE_STATE; listener->cm_id = NULL; /* going to be destroyed pretty soon */ @@ -1076,9 +1293,10 @@ static int mini_cm_del_listen(struct nes_cm_core *cm_core, * mini_cm_accelerated */ static inline int mini_cm_accelerated(struct nes_cm_core *cm_core, - struct nes_cm_node *cm_node) + struct nes_cm_node *cm_node) { u32 was_timer_set; + cm_node->accelerated = 1; if (cm_node->accept_pend) { @@ -1112,7 +1330,7 @@ static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip, int arpi rt = ip_route_output(&init_net, htonl(dst_ip), 0, 0, 0); if (IS_ERR(rt)) { printk(KERN_ERR "%s: ip_route_output_key failed for 0x%08X\n", - __func__, dst_ip); + __func__, dst_ip); return rc; } @@ -1130,7 +1348,7 @@ static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip, int arpi if (arpindex >= 0) { if (!memcmp(nesadapter->arp_table[arpindex].mac_addr, - neigh->ha, ETH_ALEN)){ + neigh->ha, ETH_ALEN)) { /* Mac address same as in nes_arp_table */ neigh_release(neigh); ip_rt_put(rt); @@ -1138,8 +1356,8 @@ static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip, int arpi } nes_manage_arp_cache(nesvnic->netdev, - nesadapter->arp_table[arpindex].mac_addr, - dst_ip, NES_ARP_DELETE); + nesadapter->arp_table[arpindex].mac_addr, + dst_ip, NES_ARP_DELETE); } nes_manage_arp_cache(nesvnic->netdev, neigh->ha, @@ -1161,8 +1379,8 @@ static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip, int arpi * make_cm_node - create a new instance of a cm node */ static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core, - struct nes_vnic *nesvnic, struct nes_cm_info *cm_info, - struct nes_cm_listener *listener) + struct nes_vnic *nesvnic, struct nes_cm_info *cm_info, + struct nes_cm_listener *listener) { struct 
nes_cm_node *cm_node; struct timespec ts; @@ -1181,7 +1399,12 @@ static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core, cm_node->rem_addr = cm_info->rem_addr; cm_node->loc_port = cm_info->loc_port; cm_node->rem_port = cm_info->rem_port; - cm_node->send_write0 = send_first; + + cm_node->mpa_frame_rev = mpa_version; + cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO; + cm_node->ird_size = IETF_NO_IRD_ORD; + cm_node->ord_size = IETF_NO_IRD_ORD; + nes_debug(NES_DBG_CM, "Make node addresses : loc = %pI4:%x, rem = %pI4:%x\n", &cm_node->loc_addr, cm_node->loc_port, &cm_node->rem_addr, cm_node->rem_port); @@ -1191,7 +1414,7 @@ static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core, memcpy(cm_node->loc_mac, nesvnic->netdev->dev_addr, ETH_ALEN); nes_debug(NES_DBG_CM, "listener=%p, cm_id=%p\n", cm_node->listener, - cm_node->cm_id); + cm_node->cm_id); spin_lock_init(&cm_node->retrans_list_lock); @@ -1202,11 +1425,11 @@ static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core, cm_node->tcp_cntxt.loc_id = NES_CM_DEF_LOCAL_ID; cm_node->tcp_cntxt.rcv_wscale = NES_CM_DEFAULT_RCV_WND_SCALE; cm_node->tcp_cntxt.rcv_wnd = NES_CM_DEFAULT_RCV_WND_SCALED >> - NES_CM_DEFAULT_RCV_WND_SCALE; + NES_CM_DEFAULT_RCV_WND_SCALE; ts = current_kernel_time(); cm_node->tcp_cntxt.loc_seq_num = htonl(ts.tv_nsec); cm_node->tcp_cntxt.mss = nesvnic->max_frame_size - sizeof(struct iphdr) - - sizeof(struct tcphdr) - ETH_HLEN - VLAN_HLEN; + sizeof(struct tcphdr) - ETH_HLEN - VLAN_HLEN; cm_node->tcp_cntxt.rcv_nxt = 0; /* get a unique session ID , add thread_id to an upcounter to handle race */ atomic_inc(&cm_core->node_cnt); @@ -1222,12 +1445,11 @@ static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core, cm_node->loopbackpartner = NULL; /* get the mac addr for the remote node */ - if (ipv4_is_loopback(htonl(cm_node->rem_addr))) + if (ipv4_is_loopback(htonl(cm_node->rem_addr))) { arpindex = nes_arp_table(nesdev, ntohl(nesvnic->local_ipaddr), NULL, NES_ARP_RESOLVE); - else { + } else { oldarpindex = nes_arp_table(nesdev, cm_node->rem_addr, NULL, NES_ARP_RESOLVE); arpindex = nes_addr_resolve_neigh(nesvnic, cm_info->rem_addr, oldarpindex); - } if (arpindex < 0) { kfree(cm_node); @@ -1260,7 +1482,7 @@ static int add_ref_cm_node(struct nes_cm_node *cm_node) * rem_ref_cm_node - destroy an instance of a cm node */ static int rem_ref_cm_node(struct nes_cm_core *cm_core, - struct nes_cm_node *cm_node) + struct nes_cm_node *cm_node) { unsigned long flags; struct nes_qp *nesqp; @@ -1291,9 +1513,9 @@ static int rem_ref_cm_node(struct nes_cm_core *cm_core, } else { if (cm_node->apbvt_set && cm_node->nesvnic) { nes_manage_apbvt(cm_node->nesvnic, cm_node->loc_port, - PCI_FUNC( - cm_node->nesvnic->nesdev->pcidev->devfn), - NES_MANAGE_APBVT_DEL); + PCI_FUNC( + cm_node->nesvnic->nesdev->pcidev->devfn), + NES_MANAGE_APBVT_DEL); } } @@ -1314,7 +1536,7 @@ static int rem_ref_cm_node(struct nes_cm_core *cm_core, * process_options */ static int process_options(struct nes_cm_node *cm_node, u8 *optionsloc, - u32 optionsize, u32 syn_packet) + u32 optionsize, u32 syn_packet) { u32 tmp; u32 offset = 0; @@ -1332,15 +1554,15 @@ static int process_options(struct nes_cm_node *cm_node, u8 *optionsloc, continue; case OPTION_NUMBER_MSS: nes_debug(NES_DBG_CM, "%s: MSS Length: %d Offset: %d " - "Size: %d\n", __func__, - all_options->as_mss.length, offset, optionsize); + "Size: %d\n", __func__, + all_options->as_mss.length, offset, optionsize); got_mss_option = 1; if (all_options->as_mss.length != 4) { return 1; } else { tmp = 
ntohs(all_options->as_mss.mss); if (tmp > 0 && tmp < - cm_node->tcp_cntxt.mss) + cm_node->tcp_cntxt.mss) cm_node->tcp_cntxt.mss = tmp; } break; @@ -1348,12 +1570,9 @@ static int process_options(struct nes_cm_node *cm_node, u8 *optionsloc, cm_node->tcp_cntxt.snd_wscale = all_options->as_windowscale.shiftcount; break; - case OPTION_NUMBER_WRITE0: - cm_node->send_write0 = 1; - break; default: nes_debug(NES_DBG_CM, "TCP Option not understood: %x\n", - all_options->as_base.optionnum); + all_options->as_base.optionnum); break; } offset += all_options->as_base.length; @@ -1372,8 +1591,8 @@ static void drop_packet(struct sk_buff *skb) static void handle_fin_pkt(struct nes_cm_node *cm_node) { nes_debug(NES_DBG_CM, "Received FIN, cm_node = %p, state = %u. " - "refcnt=%d\n", cm_node, cm_node->state, - atomic_read(&cm_node->ref_count)); + "refcnt=%d\n", cm_node, cm_node->state, + atomic_read(&cm_node->ref_count)); switch (cm_node->state) { case NES_CM_STATE_SYN_RCVD: case NES_CM_STATE_SYN_SENT: @@ -1439,7 +1658,20 @@ static void handle_rst_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, nes_debug(NES_DBG_CM, "%s[%u] create abort for cm_node=%p " "listener=%p state=%d\n", __func__, __LINE__, cm_node, cm_node->listener, cm_node->state); - active_open_err(cm_node, skb, reset); + switch (cm_node->mpa_frame_rev) { + case IETF_MPA_V2: + cm_node->mpa_frame_rev = IETF_MPA_V1; + /* send a syn and goto syn sent state */ + cm_node->state = NES_CM_STATE_SYN_SENT; + if (send_syn(cm_node, 0, NULL)) { + active_open_err(cm_node, skb, reset); + } + break; + case IETF_MPA_V1: + default: + active_open_err(cm_node, skb, reset); + break; + } break; case NES_CM_STATE_MPAREQ_RCVD: atomic_inc(&cm_node->passive_state); @@ -1475,21 +1707,21 @@ static void handle_rst_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, static void handle_rcv_mpa(struct nes_cm_node *cm_node, struct sk_buff *skb) { - - int ret = 0; + int ret = 0; int datasize = skb->len; u8 *dataloc = skb->data; enum nes_cm_event_type type = NES_CM_EVENT_UNKNOWN; - u32 res_type; + u32 res_type; + ret = parse_mpa(cm_node, dataloc, &res_type, datasize); if (ret) { nes_debug(NES_DBG_CM, "didn't like MPA Request\n"); if (cm_node->state == NES_CM_STATE_MPAREQ_SENT) { nes_debug(NES_DBG_CM, "%s[%u] create abort for " - "cm_node=%p listener=%p state=%d\n", __func__, - __LINE__, cm_node, cm_node->listener, - cm_node->state); + "cm_node=%p listener=%p state=%d\n", __func__, + __LINE__, cm_node, cm_node->listener, + cm_node->state); active_open_err(cm_node, skb, 1); } else { passive_open_err(cm_node, skb, 1); @@ -1499,16 +1731,15 @@ static void handle_rcv_mpa(struct nes_cm_node *cm_node, struct sk_buff *skb) switch (cm_node->state) { case NES_CM_STATE_ESTABLISHED: - if (res_type == NES_MPA_REQUEST_REJECT) { + if (res_type == NES_MPA_REQUEST_REJECT) /*BIG problem as we are receiving the MPA.. So should - * not be REJECT.. This is Passive Open.. We can - * only receive it Reject for Active Open...*/ + * not be REJECT.. This is Passive Open.. 
We can + * only receive it Reject for Active Open...*/ WARN_ON(1); - } cm_node->state = NES_CM_STATE_MPAREQ_RCVD; type = NES_CM_EVENT_MPA_REQ; atomic_set(&cm_node->passive_state, - NES_PASSIVE_STATE_INDICATED); + NES_PASSIVE_STATE_INDICATED); break; case NES_CM_STATE_MPAREQ_SENT: cleanup_retrans_entry(cm_node); @@ -1535,8 +1766,8 @@ static void indicate_pkt_err(struct nes_cm_node *cm_node, struct sk_buff *skb) case NES_CM_STATE_SYN_SENT: case NES_CM_STATE_MPAREQ_SENT: nes_debug(NES_DBG_CM, "%s[%u] create abort for cm_node=%p " - "listener=%p state=%d\n", __func__, __LINE__, cm_node, - cm_node->listener, cm_node->state); + "listener=%p state=%d\n", __func__, __LINE__, cm_node, + cm_node->listener, cm_node->state); active_open_err(cm_node, skb, 1); break; case NES_CM_STATE_ESTABLISHED: @@ -1550,11 +1781,11 @@ static void indicate_pkt_err(struct nes_cm_node *cm_node, struct sk_buff *skb) } static int check_syn(struct nes_cm_node *cm_node, struct tcphdr *tcph, - struct sk_buff *skb) + struct sk_buff *skb) { int err; - err = ((ntohl(tcph->ack_seq) == cm_node->tcp_cntxt.loc_seq_num))? 0 : 1; + err = ((ntohl(tcph->ack_seq) == cm_node->tcp_cntxt.loc_seq_num)) ? 0 : 1; if (err) active_open_err(cm_node, skb, 1); @@ -1562,7 +1793,7 @@ static int check_syn(struct nes_cm_node *cm_node, struct tcphdr *tcph, } static int check_seq(struct nes_cm_node *cm_node, struct tcphdr *tcph, - struct sk_buff *skb) + struct sk_buff *skb) { int err = 0; u32 seq; @@ -1570,21 +1801,22 @@ static int check_seq(struct nes_cm_node *cm_node, struct tcphdr *tcph, u32 loc_seq_num = cm_node->tcp_cntxt.loc_seq_num; u32 rcv_nxt = cm_node->tcp_cntxt.rcv_nxt; u32 rcv_wnd; + seq = ntohl(tcph->seq); ack_seq = ntohl(tcph->ack_seq); rcv_wnd = cm_node->tcp_cntxt.rcv_wnd; if (ack_seq != loc_seq_num) err = 1; - else if (!between(seq, rcv_nxt, (rcv_nxt+rcv_wnd))) + else if (!between(seq, rcv_nxt, (rcv_nxt + rcv_wnd))) err = 1; if (err) { nes_debug(NES_DBG_CM, "%s[%u] create abort for cm_node=%p " - "listener=%p state=%d\n", __func__, __LINE__, cm_node, - cm_node->listener, cm_node->state); + "listener=%p state=%d\n", __func__, __LINE__, cm_node, + cm_node->listener, cm_node->state); indicate_pkt_err(cm_node, skb); nes_debug(NES_DBG_CM, "seq ERROR cm_node =%p seq=0x%08X " - "rcv_nxt=0x%08X rcv_wnd=0x%x\n", cm_node, seq, rcv_nxt, - rcv_wnd); + "rcv_nxt=0x%08X rcv_wnd=0x%x\n", cm_node, seq, rcv_nxt, + rcv_wnd); } return err; } @@ -1594,9 +1826,8 @@ static int check_seq(struct nes_cm_node *cm_node, struct tcphdr *tcph, * is created with a listener or it may comein as rexmitted packet which in * that case will be just dropped. 
*/ - static void handle_syn_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, - struct tcphdr *tcph) + struct tcphdr *tcph) { int ret; u32 inc_sequence; @@ -1615,15 +1846,15 @@ static void handle_syn_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, case NES_CM_STATE_LISTENING: /* Passive OPEN */ if (atomic_read(&cm_node->listener->pend_accepts_cnt) > - cm_node->listener->backlog) { + cm_node->listener->backlog) { nes_debug(NES_DBG_CM, "drop syn due to backlog " - "pressure \n"); + "pressure \n"); cm_backlog_drops++; passive_open_err(cm_node, skb, 0); break; } ret = handle_tcp_options(cm_node, tcph, skb, optionsize, - 1); + 1); if (ret) { passive_open_err(cm_node, skb, 0); /* drop pkt */ @@ -1657,9 +1888,8 @@ static void handle_syn_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, } static void handle_synack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, - struct tcphdr *tcph) + struct tcphdr *tcph) { - int ret; u32 inc_sequence; int optionsize; @@ -1678,7 +1908,7 @@ static void handle_synack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, ret = handle_tcp_options(cm_node, tcph, skb, optionsize, 0); if (ret) { nes_debug(NES_DBG_CM, "cm_node=%p tcp_options failed\n", - cm_node); + cm_node); break; } cleanup_retrans_entry(cm_node); @@ -1717,12 +1947,13 @@ static void handle_synack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, } static int handle_ack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, - struct tcphdr *tcph) + struct tcphdr *tcph) { int datasize = 0; u32 inc_sequence; int ret = 0; int optionsize; + optionsize = (tcph->doff << 2) - sizeof(struct tcphdr); if (check_seq(cm_node, tcph, skb)) @@ -1743,8 +1974,9 @@ static int handle_ack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, if (datasize) { cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize; handle_rcv_mpa(cm_node, skb); - } else /* rcvd ACK only */ + } else { /* rcvd ACK only */ dev_kfree_skb_any(skb); + } break; case NES_CM_STATE_ESTABLISHED: /* Passive OPEN */ @@ -1752,16 +1984,18 @@ static int handle_ack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, if (datasize) { cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize; handle_rcv_mpa(cm_node, skb); - } else + } else { drop_packet(skb); + } break; case NES_CM_STATE_MPAREQ_SENT: cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq); if (datasize) { cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize; handle_rcv_mpa(cm_node, skb); - } else /* Could be just an ack pkt.. */ + } else { /* Could be just an ack pkt.. 
*/ dev_kfree_skb_any(skb); + } break; case NES_CM_STATE_LISTENING: cleanup_retrans_entry(cm_node); @@ -1802,14 +2036,15 @@ static int handle_ack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, static int handle_tcp_options(struct nes_cm_node *cm_node, struct tcphdr *tcph, - struct sk_buff *skb, int optionsize, int passive) + struct sk_buff *skb, int optionsize, int passive) { u8 *optionsloc = (u8 *)&tcph[1]; + if (optionsize) { if (process_options(cm_node, optionsloc, optionsize, - (u32)tcph->syn)) { + (u32)tcph->syn)) { nes_debug(NES_DBG_CM, "%s: Node %p, Sending RESET\n", - __func__, cm_node); + __func__, cm_node); if (passive) passive_open_err(cm_node, skb, 1); else @@ -1819,7 +2054,7 @@ static int handle_tcp_options(struct nes_cm_node *cm_node, struct tcphdr *tcph, } cm_node->tcp_cntxt.snd_wnd = ntohs(tcph->window) << - cm_node->tcp_cntxt.snd_wscale; + cm_node->tcp_cntxt.snd_wscale; if (cm_node->tcp_cntxt.snd_wnd > cm_node->tcp_cntxt.max_snd_wnd) cm_node->tcp_cntxt.max_snd_wnd = cm_node->tcp_cntxt.snd_wnd; @@ -1830,18 +2065,18 @@ static int handle_tcp_options(struct nes_cm_node *cm_node, struct tcphdr *tcph, * active_open_err() will send reset() if flag set.. * It will also send ABORT event. */ - static void active_open_err(struct nes_cm_node *cm_node, struct sk_buff *skb, - int reset) + int reset) { cleanup_retrans_entry(cm_node); if (reset) { nes_debug(NES_DBG_CM, "ERROR active err called for cm_node=%p, " - "state=%d\n", cm_node, cm_node->state); + "state=%d\n", cm_node, cm_node->state); add_ref_cm_node(cm_node); send_reset(cm_node, skb); - } else + } else { dev_kfree_skb_any(skb); + } cm_node->state = NES_CM_STATE_CLOSED; create_event(cm_node, NES_CM_EVENT_ABORTED); @@ -1851,15 +2086,14 @@ static void active_open_err(struct nes_cm_node *cm_node, struct sk_buff *skb, * passive_open_err() will either do a reset() or will free up the skb and * remove the cm_node. */ - static void passive_open_err(struct nes_cm_node *cm_node, struct sk_buff *skb, - int reset) + int reset) { cleanup_retrans_entry(cm_node); cm_node->state = NES_CM_STATE_CLOSED; if (reset) { nes_debug(NES_DBG_CM, "passive_open_err sending RST for " - "cm_node=%p state =%d\n", cm_node, cm_node->state); + "cm_node=%p state =%d\n", cm_node, cm_node->state); send_reset(cm_node, skb); } else { dev_kfree_skb_any(skb); @@ -1874,6 +2108,7 @@ static void passive_open_err(struct nes_cm_node *cm_node, struct sk_buff *skb, static void free_retrans_entry(struct nes_cm_node *cm_node) { struct nes_timer_entry *send_entry; + send_entry = cm_node->send_entry; if (send_entry) { cm_node->send_entry = NULL; @@ -1897,26 +2132,28 @@ static void cleanup_retrans_entry(struct nes_cm_node *cm_node) * Returns skb if to be freed, else it will return NULL if already used.. 
*/ static void process_packet(struct nes_cm_node *cm_node, struct sk_buff *skb, - struct nes_cm_core *cm_core) + struct nes_cm_core *cm_core) { - enum nes_tcpip_pkt_type pkt_type = NES_PKT_TYPE_UNKNOWN; + enum nes_tcpip_pkt_type pkt_type = NES_PKT_TYPE_UNKNOWN; struct tcphdr *tcph = tcp_hdr(skb); - u32 fin_set = 0; + u32 fin_set = 0; int ret = 0; + skb_pull(skb, ip_hdr(skb)->ihl << 2); nes_debug(NES_DBG_CM, "process_packet: cm_node=%p state =%d syn=%d " - "ack=%d rst=%d fin=%d\n", cm_node, cm_node->state, tcph->syn, - tcph->ack, tcph->rst, tcph->fin); + "ack=%d rst=%d fin=%d\n", cm_node, cm_node->state, tcph->syn, + tcph->ack, tcph->rst, tcph->fin); - if (tcph->rst) + if (tcph->rst) { pkt_type = NES_PKT_TYPE_RST; - else if (tcph->syn) { + } else if (tcph->syn) { pkt_type = NES_PKT_TYPE_SYN; if (tcph->ack) pkt_type = NES_PKT_TYPE_SYNACK; - } else if (tcph->ack) + } else if (tcph->ack) { pkt_type = NES_PKT_TYPE_ACK; + } if (tcph->fin) fin_set = 1; @@ -1947,17 +2184,17 @@ static void process_packet(struct nes_cm_node *cm_node, struct sk_buff *skb, * mini_cm_listen - create a listen node with params */ static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core, - struct nes_vnic *nesvnic, struct nes_cm_info *cm_info) + struct nes_vnic *nesvnic, struct nes_cm_info *cm_info) { struct nes_cm_listener *listener; unsigned long flags; nes_debug(NES_DBG_CM, "Search for 0x%08x : 0x%04x\n", - cm_info->loc_addr, cm_info->loc_port); + cm_info->loc_addr, cm_info->loc_port); /* cannot have multiple matching listeners */ listener = find_listener(cm_core, htonl(cm_info->loc_addr), - htons(cm_info->loc_port), NES_CM_LISTENER_EITHER_STATE); + htons(cm_info->loc_port), NES_CM_LISTENER_EITHER_STATE); if (listener && listener->listener_state == NES_CM_LISTENER_ACTIVE_STATE) { /* find automatically incs ref count ??? 
*/ atomic_dec(&listener->ref_count); @@ -2003,9 +2240,9 @@ static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core, } nes_debug(NES_DBG_CM, "Api - listen(): addr=0x%08X, port=0x%04x," - " listener = %p, backlog = %d, cm_id = %p.\n", - cm_info->loc_addr, cm_info->loc_port, - listener, listener->backlog, listener->cm_id); + " listener = %p, backlog = %d, cm_id = %p.\n", + cm_info->loc_addr, cm_info->loc_port, + listener, listener->backlog, listener->cm_id); return listener; } @@ -2015,26 +2252,20 @@ static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core, * mini_cm_connect - make a connection node with params */ static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *cm_core, - struct nes_vnic *nesvnic, u16 private_data_len, - void *private_data, struct nes_cm_info *cm_info) + struct nes_vnic *nesvnic, u16 private_data_len, + void *private_data, struct nes_cm_info *cm_info) { int ret = 0; struct nes_cm_node *cm_node; struct nes_cm_listener *loopbackremotelistener; struct nes_cm_node *loopbackremotenode; struct nes_cm_info loopback_cm_info; - u16 mpa_frame_size = sizeof(struct ietf_mpa_frame) + private_data_len; - struct ietf_mpa_frame *mpa_frame = NULL; + u8 *start_buff; /* create a CM connection node */ cm_node = make_cm_node(cm_core, nesvnic, cm_info, NULL); if (!cm_node) return NULL; - mpa_frame = &cm_node->mpa_frame; - memcpy(mpa_frame->key, IEFT_MPA_KEY_REQ, IETF_MPA_KEY_SIZE); - mpa_frame->flags = IETF_MPA_FLAGS_CRC; - mpa_frame->rev = IETF_MPA_VERSION; - mpa_frame->priv_data_len = htons(private_data_len); /* set our node side to client (active) side */ cm_node->tcp_cntxt.client = 1; @@ -2042,8 +2273,8 @@ static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *cm_core, if (cm_info->loc_addr == cm_info->rem_addr) { loopbackremotelistener = find_listener(cm_core, - ntohl(nesvnic->local_ipaddr), cm_node->rem_port, - NES_CM_LISTENER_ACTIVE_STATE); + ntohl(nesvnic->local_ipaddr), cm_node->rem_port, + NES_CM_LISTENER_ACTIVE_STATE); if (loopbackremotelistener == NULL) { create_event(cm_node, NES_CM_EVENT_ABORTED); } else { @@ -2052,7 +2283,7 @@ static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *cm_core, loopback_cm_info.rem_port = cm_info->loc_port; loopback_cm_info.cm_id = loopbackremotelistener->cm_id; loopbackremotenode = make_cm_node(cm_core, nesvnic, - &loopback_cm_info, loopbackremotelistener); + &loopback_cm_info, loopbackremotelistener); if (!loopbackremotenode) { rem_ref_cm_node(cm_node->cm_core, cm_node); return NULL; @@ -2063,7 +2294,7 @@ static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *cm_core, NES_CM_DEFAULT_RCV_WND_SCALE; cm_node->loopbackpartner = loopbackremotenode; memcpy(loopbackremotenode->mpa_frame_buf, private_data, - private_data_len); + private_data_len); loopbackremotenode->mpa_frame_size = private_data_len; /* we are done handling this state. 
*/ @@ -2091,12 +2322,10 @@ static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *cm_core, return cm_node; } - /* set our node side to client (active) side */ - cm_node->tcp_cntxt.client = 1; - /* init our MPA frame ptr */ - memcpy(mpa_frame->priv_data, private_data, private_data_len); + start_buff = &cm_node->mpa_frame_buf[0] + sizeof(struct ietf_mpa_v2); + cm_node->mpa_frame_size = private_data_len; - cm_node->mpa_frame_size = mpa_frame_size; + memcpy(start_buff, private_data, private_data_len); /* send a syn and goto syn sent state */ cm_node->state = NES_CM_STATE_SYN_SENT; @@ -2105,18 +2334,19 @@ static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *cm_core, if (ret) { /* error in sending the syn free up the cm_node struct */ nes_debug(NES_DBG_CM, "Api - connect() FAILED: dest " - "addr=0x%08X, port=0x%04x, cm_node=%p, cm_id = %p.\n", - cm_node->rem_addr, cm_node->rem_port, cm_node, - cm_node->cm_id); + "addr=0x%08X, port=0x%04x, cm_node=%p, cm_id = %p.\n", + cm_node->rem_addr, cm_node->rem_port, cm_node, + cm_node->cm_id); rem_ref_cm_node(cm_node->cm_core, cm_node); cm_node = NULL; } - if (cm_node) + if (cm_node) { nes_debug(NES_DBG_CM, "Api - connect(): dest addr=0x%08X," - "port=0x%04x, cm_node=%p, cm_id = %p.\n", - cm_node->rem_addr, cm_node->rem_port, cm_node, - cm_node->cm_id); + "port=0x%04x, cm_node=%p, cm_id = %p.\n", + cm_node->rem_addr, cm_node->rem_port, cm_node, + cm_node->cm_id); + } return cm_node; } @@ -2126,8 +2356,7 @@ static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *cm_core, * mini_cm_accept - accept a connection * This function is never called */ -static int mini_cm_accept(struct nes_cm_core *cm_core, - struct ietf_mpa_frame *mpa_frame, struct nes_cm_node *cm_node) +static int mini_cm_accept(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node) { return 0; } @@ -2136,8 +2365,7 @@ static int mini_cm_accept(struct nes_cm_core *cm_core, /** * mini_cm_reject - reject and teardown a connection */ -static int mini_cm_reject(struct nes_cm_core *cm_core, - struct ietf_mpa_frame *mpa_frame, struct nes_cm_node *cm_node) +static int mini_cm_reject(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node) { int ret = 0; int err = 0; @@ -2147,7 +2375,7 @@ static int mini_cm_reject(struct nes_cm_core *cm_core, struct nes_cm_node *loopback = cm_node->loopbackpartner; nes_debug(NES_DBG_CM, "%s cm_node=%p type=%d state=%d\n", - __func__, cm_node, cm_node->tcp_cntxt.client, cm_node->state); + __func__, cm_node, cm_node->tcp_cntxt.client, cm_node->state); if (cm_node->tcp_cntxt.client) return ret; @@ -2168,8 +2396,9 @@ static int mini_cm_reject(struct nes_cm_core *cm_core, err = send_reset(cm_node, NULL); if (err) WARN_ON(1); - } else + } else { cm_id->add_ref(cm_id); + } } } } else { @@ -2244,7 +2473,7 @@ static int mini_cm_close(struct nes_cm_core *cm_core, struct nes_cm_node *cm_nod case NES_CM_STATE_TSA: if (cm_node->send_entry) printk(KERN_ERR "ERROR Close got called from STATE_TSA " - "send_entry=%p\n", cm_node->send_entry); + "send_entry=%p\n", cm_node->send_entry); ret = rem_ref_cm_node(cm_core, cm_node); break; } @@ -2257,7 +2486,7 @@ static int mini_cm_close(struct nes_cm_core *cm_core, struct nes_cm_node *cm_nod * node state machine */ static int mini_cm_recv_pkt(struct nes_cm_core *cm_core, - struct nes_vnic *nesvnic, struct sk_buff *skb) + struct nes_vnic *nesvnic, struct sk_buff *skb) { struct nes_cm_node *cm_node = NULL; struct nes_cm_listener *listener = NULL; @@ -2269,9 +2498,8 @@ static int mini_cm_recv_pkt(struct nes_cm_core 
*cm_core, if (!skb) return 0; - if (skb->len < sizeof(struct iphdr) + sizeof(struct tcphdr)) { + if (skb->len < sizeof(struct iphdr) + sizeof(struct tcphdr)) return 0; - } iph = (struct iphdr *)skb->data; tcph = (struct tcphdr *)(skb->data + sizeof(struct iphdr)); @@ -2289,8 +2517,8 @@ static int mini_cm_recv_pkt(struct nes_cm_core *cm_core, do { cm_node = find_node(cm_core, - nfo.rem_port, nfo.rem_addr, - nfo.loc_port, nfo.loc_addr); + nfo.rem_port, nfo.rem_addr, + nfo.loc_port, nfo.loc_addr); if (!cm_node) { /* Only type of packet accepted are for */ @@ -2300,8 +2528,8 @@ static int mini_cm_recv_pkt(struct nes_cm_core *cm_core, break; } listener = find_listener(cm_core, nfo.loc_addr, - nfo.loc_port, - NES_CM_LISTENER_ACTIVE_STATE); + nfo.loc_port, + NES_CM_LISTENER_ACTIVE_STATE); if (!listener) { nfo.cm_id = NULL; nfo.conn_type = 0; @@ -2312,10 +2540,10 @@ static int mini_cm_recv_pkt(struct nes_cm_core *cm_core, nfo.cm_id = listener->cm_id; nfo.conn_type = listener->conn_type; cm_node = make_cm_node(cm_core, nesvnic, &nfo, - listener); + listener); if (!cm_node) { nes_debug(NES_DBG_CM, "Unable to allocate " - "node\n"); + "node\n"); cm_packets_dropped++; atomic_dec(&listener->ref_count); dev_kfree_skb_any(skb); @@ -2363,7 +2591,7 @@ static struct nes_cm_core *nes_cm_alloc_core(void) init_timer(&cm_core->tcp_timer); cm_core->tcp_timer.function = nes_cm_timer_tick; - cm_core->mtu = NES_CM_DEFAULT_MTU; + cm_core->mtu = NES_CM_DEFAULT_MTU; cm_core->state = NES_CM_STATE_INITED; cm_core->free_tx_pkt_max = NES_CM_DEFAULT_FREE_PKTS; @@ -2401,9 +2629,8 @@ static int mini_cm_dealloc_core(struct nes_cm_core *cm_core) barrier(); - if (timer_pending(&cm_core->tcp_timer)) { + if (timer_pending(&cm_core->tcp_timer)) del_timer(&cm_core->tcp_timer); - } destroy_workqueue(cm_core->event_wq); destroy_workqueue(cm_core->disconn_wq); @@ -2458,8 +2685,8 @@ static int nes_cm_init_tsa_conn(struct nes_qp *nesqp, struct nes_cm_node *cm_nod return -EINVAL; nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_IPV4 | - NES_QPCONTEXT_MISC_NO_NAGLE | NES_QPCONTEXT_MISC_DO_NOT_FRAG | - NES_QPCONTEXT_MISC_DROS); + NES_QPCONTEXT_MISC_NO_NAGLE | NES_QPCONTEXT_MISC_DO_NOT_FRAG | + NES_QPCONTEXT_MISC_DROS); if (cm_node->tcp_cntxt.snd_wscale || cm_node->tcp_cntxt.rcv_wscale) nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_WSCALE); @@ -2469,15 +2696,15 @@ static int nes_cm_init_tsa_conn(struct nes_qp *nesqp, struct nes_cm_node *cm_nod nesqp->nesqp_context->mss |= cpu_to_le32(((u32)cm_node->tcp_cntxt.mss) << 16); nesqp->nesqp_context->tcp_state_flow_label |= cpu_to_le32( - (u32)NES_QPCONTEXT_TCPSTATE_EST << NES_QPCONTEXT_TCPFLOW_TCP_STATE_SHIFT); + (u32)NES_QPCONTEXT_TCPSTATE_EST << NES_QPCONTEXT_TCPFLOW_TCP_STATE_SHIFT); nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32( - (cm_node->tcp_cntxt.snd_wscale << NES_QPCONTEXT_PDWSCALE_SND_WSCALE_SHIFT) & - NES_QPCONTEXT_PDWSCALE_SND_WSCALE_MASK); + (cm_node->tcp_cntxt.snd_wscale << NES_QPCONTEXT_PDWSCALE_SND_WSCALE_SHIFT) & + NES_QPCONTEXT_PDWSCALE_SND_WSCALE_MASK); nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32( - (cm_node->tcp_cntxt.rcv_wscale << NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_SHIFT) & - NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_MASK); + (cm_node->tcp_cntxt.rcv_wscale << NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_SHIFT) & + NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_MASK); nesqp->nesqp_context->keepalive = cpu_to_le32(0x80); nesqp->nesqp_context->ts_recent = 0; @@ -2486,24 +2713,24 @@ static int nes_cm_init_tsa_conn(struct nes_qp *nesqp, struct nes_cm_node *cm_nod 
nesqp->nesqp_context->snd_wnd = cpu_to_le32(cm_node->tcp_cntxt.snd_wnd); nesqp->nesqp_context->rcv_nxt = cpu_to_le32(cm_node->tcp_cntxt.rcv_nxt); nesqp->nesqp_context->rcv_wnd = cpu_to_le32(cm_node->tcp_cntxt.rcv_wnd << - cm_node->tcp_cntxt.rcv_wscale); + cm_node->tcp_cntxt.rcv_wscale); nesqp->nesqp_context->snd_max = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num); nesqp->nesqp_context->snd_una = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num); nesqp->nesqp_context->srtt = 0; nesqp->nesqp_context->rttvar = cpu_to_le32(0x6); nesqp->nesqp_context->ssthresh = cpu_to_le32(0x3FFFC000); - nesqp->nesqp_context->cwnd = cpu_to_le32(2*cm_node->tcp_cntxt.mss); + nesqp->nesqp_context->cwnd = cpu_to_le32(2 * cm_node->tcp_cntxt.mss); nesqp->nesqp_context->snd_wl1 = cpu_to_le32(cm_node->tcp_cntxt.rcv_nxt); nesqp->nesqp_context->snd_wl2 = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num); nesqp->nesqp_context->max_snd_wnd = cpu_to_le32(cm_node->tcp_cntxt.max_snd_wnd); nes_debug(NES_DBG_CM, "QP%u: rcv_nxt = 0x%08X, snd_nxt = 0x%08X," - " Setting MSS to %u, PDWscale = 0x%08X, rcv_wnd = %u, context misc = 0x%08X.\n", - nesqp->hwqp.qp_id, le32_to_cpu(nesqp->nesqp_context->rcv_nxt), - le32_to_cpu(nesqp->nesqp_context->snd_nxt), - cm_node->tcp_cntxt.mss, le32_to_cpu(nesqp->nesqp_context->pd_index_wscale), - le32_to_cpu(nesqp->nesqp_context->rcv_wnd), - le32_to_cpu(nesqp->nesqp_context->misc)); + " Setting MSS to %u, PDWscale = 0x%08X, rcv_wnd = %u, context misc = 0x%08X.\n", + nesqp->hwqp.qp_id, le32_to_cpu(nesqp->nesqp_context->rcv_nxt), + le32_to_cpu(nesqp->nesqp_context->snd_nxt), + cm_node->tcp_cntxt.mss, le32_to_cpu(nesqp->nesqp_context->pd_index_wscale), + le32_to_cpu(nesqp->nesqp_context->rcv_wnd), + le32_to_cpu(nesqp->nesqp_context->misc)); nes_debug(NES_DBG_CM, " snd_wnd = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->snd_wnd)); nes_debug(NES_DBG_CM, " snd_cwnd = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->cwnd)); nes_debug(NES_DBG_CM, " max_swnd = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->max_snd_wnd)); @@ -2524,7 +2751,7 @@ int nes_cm_disconn(struct nes_qp *nesqp) work = kzalloc(sizeof *work, GFP_ATOMIC); if (!work) - return -ENOMEM; /* Timer will clean up */ + return -ENOMEM; /* Timer will clean up */ nes_add_ref(&nesqp->ibqp); work->nesqp = nesqp; @@ -2544,7 +2771,7 @@ static void nes_disconnect_worker(struct work_struct *work) kfree(dwork); nes_debug(NES_DBG_CM, "processing AEQE id 0x%04X for QP%u.\n", - nesqp->last_aeq, nesqp->hwqp.qp_id); + nesqp->last_aeq, nesqp->hwqp.qp_id); nes_cm_disconn_true(nesqp); nes_rem_ref(&nesqp->ibqp); } @@ -2580,7 +2807,7 @@ static int nes_cm_disconn_true(struct nes_qp *nesqp) /* make sure we havent already closed this connection */ if (!cm_id) { nes_debug(NES_DBG_CM, "QP%u disconnect_worker cmid is NULL\n", - nesqp->hwqp.qp_id); + nesqp->hwqp.qp_id); spin_unlock_irqrestore(&nesqp->lock, flags); return -1; } @@ -2589,7 +2816,7 @@ static int nes_cm_disconn_true(struct nes_qp *nesqp) nes_debug(NES_DBG_CM, "Disconnecting QP%u\n", nesqp->hwqp.qp_id); original_hw_tcp_state = nesqp->hw_tcp_state; - original_ibqp_state = nesqp->ibqp_state; + original_ibqp_state = nesqp->ibqp_state; last_ae = nesqp->last_aeq; if (nesqp->term_flags) { @@ -2647,16 +2874,16 @@ static int nes_cm_disconn_true(struct nes_qp *nesqp) cm_event.private_data_len = 0; nes_debug(NES_DBG_CM, "Generating a CM Disconnect Event" - " for QP%u, SQ Head = %u, SQ Tail = %u. 
" - "cm_id = %p, refcount = %u.\n", - nesqp->hwqp.qp_id, nesqp->hwqp.sq_head, - nesqp->hwqp.sq_tail, cm_id, - atomic_read(&nesqp->refcount)); + " for QP%u, SQ Head = %u, SQ Tail = %u. " + "cm_id = %p, refcount = %u.\n", + nesqp->hwqp.qp_id, nesqp->hwqp.sq_head, + nesqp->hwqp.sq_tail, cm_id, + atomic_read(&nesqp->refcount)); ret = cm_id->event_handler(cm_id, &cm_event); if (ret) nes_debug(NES_DBG_CM, "OFA CM event_handler " - "returned, ret=%d\n", ret); + "returned, ret=%d\n", ret); } if (issue_close) { @@ -2674,9 +2901,8 @@ static int nes_cm_disconn_true(struct nes_qp *nesqp) cm_event.private_data_len = 0; ret = cm_id->event_handler(cm_id, &cm_event); - if (ret) { + if (ret) nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); - } cm_id->rem_ref(cm_id); } @@ -2716,8 +2942,8 @@ static int nes_disconnect(struct nes_qp *nesqp, int abrupt) if (nesqp->lsmm_mr) nesibdev->ibdev.dereg_mr(nesqp->lsmm_mr); pci_free_consistent(nesdev->pcidev, - nesqp->private_data_len+sizeof(struct ietf_mpa_frame), - nesqp->ietf_frame, nesqp->ietf_frame_pbase); + nesqp->private_data_len + nesqp->ietf_frame_size, + nesqp->ietf_frame, nesqp->ietf_frame_pbase); } } @@ -2756,6 +2982,12 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) struct ib_phys_buf ibphysbuf; struct nes_pd *nespd; u64 tagged_offset; + u8 mpa_frame_offset = 0; + struct ietf_mpa_v2 *mpa_v2_frame; + u8 start_addr = 0; + u8 *start_ptr = &start_addr; + u8 **start_buff = &start_ptr; + u16 buff_len = 0; ibqp = nes_get_qp(cm_id->device, conn_param->qpn); if (!ibqp) @@ -2796,53 +3028,49 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n", netdev_refcnt_read(nesvnic->netdev)); + nesqp->ietf_frame_size = sizeof(struct ietf_mpa_v2); /* allocate the ietf frame and space for private data */ nesqp->ietf_frame = pci_alloc_consistent(nesdev->pcidev, - sizeof(struct ietf_mpa_frame) + conn_param->private_data_len, - &nesqp->ietf_frame_pbase); + nesqp->ietf_frame_size + conn_param->private_data_len, + &nesqp->ietf_frame_pbase); if (!nesqp->ietf_frame) { - nes_debug(NES_DBG_CM, "Unable to allocate memory for private " - "data\n"); + nes_debug(NES_DBG_CM, "Unable to allocate memory for private data\n"); return -ENOMEM; } + mpa_v2_frame = (struct ietf_mpa_v2 *)nesqp->ietf_frame; + if (cm_node->mpa_frame_rev == IETF_MPA_V1) + mpa_frame_offset = 4; - /* setup the MPA frame */ + memcpy(mpa_v2_frame->priv_data, conn_param->private_data, + conn_param->private_data_len); + + cm_build_mpa_frame(cm_node, start_buff, &buff_len, nesqp->ietf_frame, MPA_KEY_REPLY); nesqp->private_data_len = conn_param->private_data_len; - memcpy(nesqp->ietf_frame->key, IEFT_MPA_KEY_REP, IETF_MPA_KEY_SIZE); - - memcpy(nesqp->ietf_frame->priv_data, conn_param->private_data, - conn_param->private_data_len); - - nesqp->ietf_frame->priv_data_len = - cpu_to_be16(conn_param->private_data_len); - nesqp->ietf_frame->rev = mpa_version; - nesqp->ietf_frame->flags = IETF_MPA_FLAGS_CRC; /* setup our first outgoing iWarp send WQE (the IETF frame response) */ wqe = &nesqp->hwqp.sq_vbase[0]; if (cm_id->remote_addr.sin_addr.s_addr != - cm_id->local_addr.sin_addr.s_addr) { + cm_id->local_addr.sin_addr.s_addr) { u64temp = (unsigned long)nesqp; nesibdev = nesvnic->nesibdev; nespd = nesqp->nespd; - ibphysbuf.addr = nesqp->ietf_frame_pbase; - ibphysbuf.size = conn_param->private_data_len + - sizeof(struct ietf_mpa_frame); - tagged_offset = (u64)(unsigned long)nesqp->ietf_frame; + ibphysbuf.addr = 
nesqp->ietf_frame_pbase + mpa_frame_offset; + ibphysbuf.size = buff_len; + tagged_offset = (u64)(unsigned long)*start_buff; ibmr = nesibdev->ibdev.reg_phys_mr((struct ib_pd *)nespd, - &ibphysbuf, 1, - IB_ACCESS_LOCAL_WRITE, - &tagged_offset); + &ibphysbuf, 1, + IB_ACCESS_LOCAL_WRITE, + &tagged_offset); if (!ibmr) { nes_debug(NES_DBG_CM, "Unable to register memory region" - "for lSMM for cm_node = %p \n", - cm_node); + "for lSMM for cm_node = %p \n", + cm_node); pci_free_consistent(nesdev->pcidev, - nesqp->private_data_len+sizeof(struct ietf_mpa_frame), - nesqp->ietf_frame, nesqp->ietf_frame_pbase); + nesqp->private_data_len + nesqp->ietf_frame_size, + nesqp->ietf_frame, nesqp->ietf_frame_pbase); return -ENOMEM; } @@ -2850,22 +3078,20 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) ibmr->device = nespd->ibpd.device; nesqp->lsmm_mr = ibmr; - u64temp |= NES_SW_CONTEXT_ALIGN>>1; + u64temp |= NES_SW_CONTEXT_ALIGN >> 1; set_wqe_64bit_value(wqe->wqe_words, - NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, - u64temp); + NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, + u64temp); wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = cpu_to_le32(NES_IWARP_SQ_WQE_STREAMING | - NES_IWARP_SQ_WQE_WRPDU); + NES_IWARP_SQ_WQE_WRPDU); wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX] = - cpu_to_le32(conn_param->private_data_len + - sizeof(struct ietf_mpa_frame)); + cpu_to_le32(buff_len); set_wqe_64bit_value(wqe->wqe_words, - NES_IWARP_SQ_WQE_FRAG0_LOW_IDX, - (u64)(unsigned long)nesqp->ietf_frame); + NES_IWARP_SQ_WQE_FRAG0_LOW_IDX, + (u64)(unsigned long)(*start_buff)); wqe->wqe_words[NES_IWARP_SQ_WQE_LENGTH0_IDX] = - cpu_to_le32(conn_param->private_data_len + - sizeof(struct ietf_mpa_frame)); + cpu_to_le32(buff_len); wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = ibmr->lkey; if (nesqp->sq_kmapped) { nesqp->sq_kmapped = 0; @@ -2874,7 +3100,7 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32(NES_QPCONTEXT_ORDIRD_LSMM_PRESENT | - NES_QPCONTEXT_ORDIRD_WRPDU); + NES_QPCONTEXT_ORDIRD_WRPDU); } else { nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32(NES_QPCONTEXT_ORDIRD_WRPDU); @@ -2888,11 +3114,11 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) /* nesqp->cm_node = (void *)cm_id->provider_data; */ cm_id->provider_data = nesqp; - nesqp->active_conn = 0; + nesqp->active_conn = 0; if (cm_node->state == NES_CM_STATE_TSA) nes_debug(NES_DBG_CM, "Already state = TSA for cm_node=%p\n", - cm_node); + cm_node); nes_cm_init_tsa_conn(nesqp, cm_node); @@ -2909,13 +3135,13 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) cpu_to_le32(ntohl(cm_id->remote_addr.sin_addr.s_addr)); nesqp->nesqp_context->misc2 |= cpu_to_le32( - (u32)PCI_FUNC(nesdev->pcidev->devfn) << - NES_QPCONTEXT_MISC2_SRC_IP_SHIFT); + (u32)PCI_FUNC(nesdev->pcidev->devfn) << + NES_QPCONTEXT_MISC2_SRC_IP_SHIFT); nesqp->nesqp_context->arp_index_vlan |= cpu_to_le32(nes_arp_table(nesdev, - le32_to_cpu(nesqp->nesqp_context->ip0), NULL, - NES_ARP_RESOLVE) << 16); + le32_to_cpu(nesqp->nesqp_context->ip0), NULL, + NES_ARP_RESOLVE) << 16); nesqp->nesqp_context->ts_val_delta = cpu_to_le32( jiffies - nes_read_indexed(nesdev, NES_IDX_TCP_NOW)); @@ -2941,7 +3167,7 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) crc_value = get_crc_value(&nes_quad); nesqp->hte_index = cpu_to_be32(crc_value ^ 0xffffffff); nes_debug(NES_DBG_CM, "HTE Index = 0x%08X, CRC = 0x%08X\n", - nesqp->hte_index, nesqp->hte_index & 
adapter->hte_index_mask); + nesqp->hte_index, nesqp->hte_index & adapter->hte_index_mask); nesqp->hte_index &= adapter->hte_index_mask; nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index); @@ -2949,17 +3175,15 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) cm_node->cm_core->api->accelerated(cm_node->cm_core, cm_node); nes_debug(NES_DBG_CM, "QP%u, Destination IP = 0x%08X:0x%04X, local = " - "0x%08X:0x%04X, rcv_nxt=0x%08X, snd_nxt=0x%08X, mpa + " - "private data length=%zu.\n", nesqp->hwqp.qp_id, - ntohl(cm_id->remote_addr.sin_addr.s_addr), - ntohs(cm_id->remote_addr.sin_port), - ntohl(cm_id->local_addr.sin_addr.s_addr), - ntohs(cm_id->local_addr.sin_port), - le32_to_cpu(nesqp->nesqp_context->rcv_nxt), - le32_to_cpu(nesqp->nesqp_context->snd_nxt), - conn_param->private_data_len + - sizeof(struct ietf_mpa_frame)); - + "0x%08X:0x%04X, rcv_nxt=0x%08X, snd_nxt=0x%08X, mpa + " + "private data length=%u.\n", nesqp->hwqp.qp_id, + ntohl(cm_id->remote_addr.sin_addr.s_addr), + ntohs(cm_id->remote_addr.sin_port), + ntohl(cm_id->local_addr.sin_addr.s_addr), + ntohs(cm_id->local_addr.sin_port), + le32_to_cpu(nesqp->nesqp_context->rcv_nxt), + le32_to_cpu(nesqp->nesqp_context->snd_nxt), + buff_len); /* notify OF layer that accept event was successful */ cm_id->add_ref(cm_id); @@ -2980,12 +3204,12 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) nesqp->private_data_len; /* copy entire MPA frame to our cm_node's frame */ memcpy(cm_node->loopbackpartner->mpa_frame_buf, - nesqp->ietf_frame->priv_data, nesqp->private_data_len); + conn_param->private_data, conn_param->private_data_len); create_event(cm_node->loopbackpartner, NES_CM_EVENT_CONNECTED); } if (ret) printk(KERN_ERR "%s[%u] OFA CM event_handler returned, " - "ret=%d\n", __func__, __LINE__, ret); + "ret=%d\n", __func__, __LINE__, ret); return 0; } @@ -2998,34 +3222,28 @@ int nes_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) { struct nes_cm_node *cm_node; struct nes_cm_node *loopback; - struct nes_cm_core *cm_core; + u8 *start_buff; atomic_inc(&cm_rejects); - cm_node = (struct nes_cm_node *) cm_id->provider_data; + cm_node = (struct nes_cm_node *)cm_id->provider_data; loopback = cm_node->loopbackpartner; cm_core = cm_node->cm_core; cm_node->cm_id = cm_id; - cm_node->mpa_frame_size = sizeof(struct ietf_mpa_frame) + pdata_len; - if (cm_node->mpa_frame_size > MAX_CM_BUFFER) + if (pdata_len + sizeof(struct ietf_mpa_v2) > MAX_CM_BUFFER) return -EINVAL; - memcpy(&cm_node->mpa_frame.key[0], IEFT_MPA_KEY_REP, IETF_MPA_KEY_SIZE); if (loopback) { memcpy(&loopback->mpa_frame.priv_data, pdata, pdata_len); loopback->mpa_frame.priv_data_len = pdata_len; - loopback->mpa_frame_size = sizeof(struct ietf_mpa_frame) + - pdata_len; + loopback->mpa_frame_size = pdata_len; } else { - memcpy(&cm_node->mpa_frame.priv_data, pdata, pdata_len); - cm_node->mpa_frame.priv_data_len = cpu_to_be16(pdata_len); + start_buff = &cm_node->mpa_frame_buf[0] + sizeof(struct ietf_mpa_v2); + cm_node->mpa_frame_size = pdata_len; + memcpy(start_buff, pdata, pdata_len); } - - cm_node->mpa_frame.rev = mpa_version; - cm_node->mpa_frame.flags = IETF_MPA_FLAGS_CRC | IETF_MPA_FLAGS_REJECT; - - return cm_core->api->reject(cm_core, &cm_node->mpa_frame, cm_node); + return cm_core->api->reject(cm_core, cm_node); } @@ -3052,7 +3270,7 @@ int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) nesvnic = to_nesvnic(nesqp->ibqp.device); if (!nesvnic) return -EINVAL; - nesdev = nesvnic->nesdev; + 
nesdev = nesvnic->nesdev; if (!nesdev) return -EINVAL; @@ -3060,12 +3278,12 @@ int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) return -EINVAL; nes_debug(NES_DBG_CM, "QP%u, current IP = 0x%08X, Destination IP = " - "0x%08X:0x%04X, local = 0x%08X:0x%04X.\n", nesqp->hwqp.qp_id, - ntohl(nesvnic->local_ipaddr), - ntohl(cm_id->remote_addr.sin_addr.s_addr), - ntohs(cm_id->remote_addr.sin_port), - ntohl(cm_id->local_addr.sin_addr.s_addr), - ntohs(cm_id->local_addr.sin_port)); + "0x%08X:0x%04X, local = 0x%08X:0x%04X.\n", nesqp->hwqp.qp_id, + ntohl(nesvnic->local_ipaddr), + ntohl(cm_id->remote_addr.sin_addr.s_addr), + ntohs(cm_id->remote_addr.sin_port), + ntohl(cm_id->local_addr.sin_addr.s_addr), + ntohs(cm_id->local_addr.sin_port)); atomic_inc(&cm_connects); nesqp->active_conn = 1; @@ -3079,12 +3297,12 @@ int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32((u32)conn_param->ord); nes_debug(NES_DBG_CM, "requested ord = 0x%08X.\n", (u32)conn_param->ord); nes_debug(NES_DBG_CM, "mpa private data len =%u\n", - conn_param->private_data_len); + conn_param->private_data_len); if (cm_id->local_addr.sin_addr.s_addr != - cm_id->remote_addr.sin_addr.s_addr) { + cm_id->remote_addr.sin_addr.s_addr) { nes_manage_apbvt(nesvnic, ntohs(cm_id->local_addr.sin_port), - PCI_FUNC(nesdev->pcidev->devfn), NES_MANAGE_APBVT_ADD); + PCI_FUNC(nesdev->pcidev->devfn), NES_MANAGE_APBVT_ADD); apbvt_set = 1; } @@ -3100,13 +3318,13 @@ int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) /* create a connect CM node connection */ cm_node = g_cm_core->api->connect(g_cm_core, nesvnic, - conn_param->private_data_len, (void *)conn_param->private_data, - &cm_info); + conn_param->private_data_len, (void *)conn_param->private_data, + &cm_info); if (!cm_node) { if (apbvt_set) nes_manage_apbvt(nesvnic, ntohs(cm_id->local_addr.sin_port), - PCI_FUNC(nesdev->pcidev->devfn), - NES_MANAGE_APBVT_DEL); + PCI_FUNC(nesdev->pcidev->devfn), + NES_MANAGE_APBVT_DEL); cm_id->rem_ref(cm_id); return -ENOMEM; @@ -3156,7 +3374,7 @@ int nes_create_listen(struct iw_cm_id *cm_id, int backlog) cm_node = g_cm_core->api->listen(g_cm_core, nesvnic, &cm_info); if (!cm_node) { printk(KERN_ERR "%s[%u] Error returned from listen API call\n", - __func__, __LINE__); + __func__, __LINE__); return -ENOMEM; } @@ -3164,12 +3382,12 @@ int nes_create_listen(struct iw_cm_id *cm_id, int backlog) if (!cm_node->reused_node) { err = nes_manage_apbvt(nesvnic, - ntohs(cm_id->local_addr.sin_port), - PCI_FUNC(nesvnic->nesdev->pcidev->devfn), - NES_MANAGE_APBVT_ADD); + ntohs(cm_id->local_addr.sin_port), + PCI_FUNC(nesvnic->nesdev->pcidev->devfn), + NES_MANAGE_APBVT_ADD); if (err) { printk(KERN_ERR "nes_manage_apbvt call returned %d.\n", - err); + err); g_cm_core->api->stop_listener(g_cm_core, (void *)cm_node); return err; } @@ -3206,13 +3424,13 @@ int nes_destroy_listen(struct iw_cm_id *cm_id) int nes_cm_recv(struct sk_buff *skb, struct net_device *netdevice) { int rc = 0; + cm_packets_received++; - if ((g_cm_core) && (g_cm_core->api)) { + if ((g_cm_core) && (g_cm_core->api)) rc = g_cm_core->api->recv_pkt(g_cm_core, netdev_priv(netdevice), skb); - } else { + else nes_debug(NES_DBG_CM, "Unable to process packet for CM," - " cm is not setup properly.\n"); - } + " cm is not setup properly.\n"); return rc; } @@ -3227,11 +3445,10 @@ int nes_cm_start(void) nes_debug(NES_DBG_CM, "\n"); /* create the primary CM core, pass this handle to subsequent core inits */ g_cm_core 
= nes_cm_alloc_core(); - if (g_cm_core) { + if (g_cm_core) return 0; - } else { + else return -ENOMEM; - } } @@ -3252,7 +3469,6 @@ int nes_cm_stop(void) */ static void cm_event_connected(struct nes_cm_event *event) { - u64 u64temp; struct nes_qp *nesqp; struct nes_vnic *nesvnic; struct nes_device *nesdev; @@ -3261,7 +3477,6 @@ static void cm_event_connected(struct nes_cm_event *event) struct ib_qp_attr attr; struct iw_cm_id *cm_id; struct iw_cm_event cm_event; - struct nes_hw_qp_wqe *wqe; struct nes_v4_quad nes_quad; u32 crc_value; int ret; @@ -3275,17 +3490,16 @@ static void cm_event_connected(struct nes_cm_event *event) nesdev = nesvnic->nesdev; nesadapter = nesdev->nesadapter; - if (nesqp->destroyed) { + if (nesqp->destroyed) return; - } atomic_inc(&cm_connecteds); nes_debug(NES_DBG_CM, "QP%u attempting to connect to 0x%08X:0x%04X on" - " local port 0x%04X. jiffies = %lu.\n", - nesqp->hwqp.qp_id, - ntohl(cm_id->remote_addr.sin_addr.s_addr), - ntohs(cm_id->remote_addr.sin_port), - ntohs(cm_id->local_addr.sin_port), - jiffies); + " local port 0x%04X. jiffies = %lu.\n", + nesqp->hwqp.qp_id, + ntohl(cm_id->remote_addr.sin_addr.s_addr), + ntohs(cm_id->remote_addr.sin_port), + ntohs(cm_id->local_addr.sin_port), + jiffies); nes_cm_init_tsa_conn(nesqp, cm_node); @@ -3316,40 +3530,12 @@ static void cm_event_connected(struct nes_cm_event *event) NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT); /* Adjust tail for not having a LSMM */ - nesqp->hwqp.sq_tail = 1; + /*nesqp->hwqp.sq_tail = 1;*/ -#if defined(NES_SEND_FIRST_WRITE) - if (cm_node->send_write0) { - nes_debug(NES_DBG_CM, "Sending first write.\n"); - wqe = &nesqp->hwqp.sq_vbase[0]; - u64temp = (unsigned long)nesqp; - u64temp |= NES_SW_CONTEXT_ALIGN>>1; - set_wqe_64bit_value(wqe->wqe_words, - NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, u64temp); - wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = - cpu_to_le32(NES_IWARP_SQ_OP_RDMAW); - wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX] = 0; - wqe->wqe_words[NES_IWARP_SQ_WQE_FRAG0_LOW_IDX] = 0; - wqe->wqe_words[NES_IWARP_SQ_WQE_FRAG0_HIGH_IDX] = 0; - wqe->wqe_words[NES_IWARP_SQ_WQE_LENGTH0_IDX] = 0; - wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = 0; + build_rdma0_msg(cm_node, &nesqp); - if (nesqp->sq_kmapped) { - nesqp->sq_kmapped = 0; - kunmap(nesqp->page); - } - - /* use the reserved spot on the WQ for the extra first WQE */ - nesqp->nesqp_context->ird_ord_sizes &= - cpu_to_le32(~(NES_QPCONTEXT_ORDIRD_LSMM_PRESENT | - NES_QPCONTEXT_ORDIRD_WRPDU | - NES_QPCONTEXT_ORDIRD_ALSMM)); - nesqp->skip_lsmm = 1; - nesqp->hwqp.sq_tail = 0; - nes_write32(nesdev->regs + NES_WQE_ALLOC, - (1 << 24) | 0x00800000 | nesqp->hwqp.qp_id); - } -#endif + nes_write32(nesdev->regs + NES_WQE_ALLOC, + (1 << 24) | 0x00800000 | nesqp->hwqp.qp_id); memset(&nes_quad, 0, sizeof(nes_quad)); @@ -3366,13 +3552,13 @@ static void cm_event_connected(struct nes_cm_event *event) crc_value = get_crc_value(&nes_quad); nesqp->hte_index = cpu_to_be32(crc_value ^ 0xffffffff); nes_debug(NES_DBG_CM, "HTE Index = 0x%08X, After CRC = 0x%08X\n", - nesqp->hte_index, nesqp->hte_index & nesadapter->hte_index_mask); + nesqp->hte_index, nesqp->hte_index & nesadapter->hte_index_mask); nesqp->hte_index &= nesadapter->hte_index_mask; nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index); nesqp->ietf_frame = &cm_node->mpa_frame; - nesqp->private_data_len = (u8) cm_node->mpa_frame_size; + nesqp->private_data_len = (u8)cm_node->mpa_frame_size; cm_node->cm_core->api->accelerated(cm_node->cm_core, cm_node); /* notify OF layer we successfully created the requested 
connection */ @@ -3384,7 +3570,9 @@ static void cm_event_connected(struct nes_cm_event *event) cm_event.remote_addr = cm_id->remote_addr; cm_event.private_data = (void *)event->cm_node->mpa_frame_buf; - cm_event.private_data_len = (u8) event->cm_node->mpa_frame_size; + cm_event.private_data_len = (u8)event->cm_node->mpa_frame_size; + cm_event.ird = cm_node->ird_size; + cm_event.ord = cm_node->ord_size; cm_event.local_addr.sin_addr.s_addr = event->cm_info.rem_addr; ret = cm_id->event_handler(cm_id, &cm_event); @@ -3392,12 +3580,12 @@ static void cm_event_connected(struct nes_cm_event *event) if (ret) printk(KERN_ERR "%s[%u] OFA CM event_handler returned, " - "ret=%d\n", __func__, __LINE__, ret); + "ret=%d\n", __func__, __LINE__, ret); attr.qp_state = IB_QPS_RTS; nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL); nes_debug(NES_DBG_CM, "Exiting connect thread for QP%u. jiffies = " - "%lu\n", nesqp->hwqp.qp_id, jiffies); + "%lu\n", nesqp->hwqp.qp_id, jiffies); return; } @@ -3418,16 +3606,14 @@ static void cm_event_connect_error(struct nes_cm_event *event) return; cm_id = event->cm_node->cm_id; - if (!cm_id) { + if (!cm_id) return; - } nes_debug(NES_DBG_CM, "cm_node=%p, cm_id=%p\n", event->cm_node, cm_id); nesqp = cm_id->provider_data; - if (!nesqp) { + if (!nesqp) return; - } /* notify OF layer about this connection error event */ /* cm_id->rem_ref(cm_id); */ @@ -3442,14 +3628,14 @@ static void cm_event_connect_error(struct nes_cm_event *event) cm_event.private_data_len = 0; nes_debug(NES_DBG_CM, "call CM_EVENT REJECTED, local_addr=%08x, " - "remove_addr=%08x\n", cm_event.local_addr.sin_addr.s_addr, - cm_event.remote_addr.sin_addr.s_addr); + "remove_addr=%08x\n", cm_event.local_addr.sin_addr.s_addr, + cm_event.remote_addr.sin_addr.s_addr); ret = cm_id->event_handler(cm_id, &cm_event); nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); if (ret) printk(KERN_ERR "%s[%u] OFA CM event_handler returned, " - "ret=%d\n", __func__, __LINE__, ret); + "ret=%d\n", __func__, __LINE__, ret); cm_id->rem_ref(cm_id); rem_ref_cm_node(event->cm_node->cm_core, event->cm_node); @@ -3519,7 +3705,7 @@ static void cm_event_reset(struct nes_cm_event *event) */ static void cm_event_mpa_req(struct nes_cm_event *event) { - struct iw_cm_id *cm_id; + struct iw_cm_id *cm_id; struct iw_cm_event cm_event; int ret; struct nes_cm_node *cm_node; @@ -3531,7 +3717,7 @@ static void cm_event_mpa_req(struct nes_cm_event *event) atomic_inc(&cm_connect_reqs); nes_debug(NES_DBG_CM, "cm_node = %p - cm_id = %p, jiffies = %lu\n", - cm_node, cm_id, jiffies); + cm_node, cm_id, jiffies); cm_event.event = IW_CM_EVENT_CONNECT_REQUEST; cm_event.status = 0; @@ -3545,19 +3731,21 @@ static void cm_event_mpa_req(struct nes_cm_event *event) cm_event.remote_addr.sin_port = htons(event->cm_info.rem_port); cm_event.remote_addr.sin_addr.s_addr = htonl(event->cm_info.rem_addr); cm_event.private_data = cm_node->mpa_frame_buf; - cm_event.private_data_len = (u8) cm_node->mpa_frame_size; + cm_event.private_data_len = (u8)cm_node->mpa_frame_size; + cm_event.ird = cm_node->ird_size; + cm_event.ord = cm_node->ord_size; ret = cm_id->event_handler(cm_id, &cm_event); if (ret) printk(KERN_ERR "%s[%u] OFA CM event_handler returned, ret=%d\n", - __func__, __LINE__, ret); + __func__, __LINE__, ret); return; } static void cm_event_mpa_reject(struct nes_cm_event *event) { - struct iw_cm_id *cm_id; + struct iw_cm_id *cm_id; struct iw_cm_event cm_event; struct nes_cm_node *cm_node; int ret; @@ -3569,7 +3757,7 @@ static void 
cm_event_mpa_reject(struct nes_cm_event *event) atomic_inc(&cm_connect_reqs); nes_debug(NES_DBG_CM, "cm_node = %p - cm_id = %p, jiffies = %lu\n", - cm_node, cm_id, jiffies); + cm_node, cm_id, jiffies); cm_event.event = IW_CM_EVENT_CONNECT_REPLY; cm_event.status = -ECONNREFUSED; @@ -3584,17 +3772,17 @@ static void cm_event_mpa_reject(struct nes_cm_event *event) cm_event.remote_addr.sin_addr.s_addr = htonl(event->cm_info.rem_addr); cm_event.private_data = cm_node->mpa_frame_buf; - cm_event.private_data_len = (u8) cm_node->mpa_frame_size; + cm_event.private_data_len = (u8)cm_node->mpa_frame_size; nes_debug(NES_DBG_CM, "call CM_EVENT_MPA_REJECTED, local_addr=%08x, " - "remove_addr=%08x\n", - cm_event.local_addr.sin_addr.s_addr, - cm_event.remote_addr.sin_addr.s_addr); + "remove_addr=%08x\n", + cm_event.local_addr.sin_addr.s_addr, + cm_event.remote_addr.sin_addr.s_addr); ret = cm_id->event_handler(cm_id, &cm_event); if (ret) printk(KERN_ERR "%s[%u] OFA CM event_handler returned, ret=%d\n", - __func__, __LINE__, ret); + __func__, __LINE__, ret); return; } @@ -3613,7 +3801,7 @@ static int nes_cm_post_event(struct nes_cm_event *event) event->cm_info.cm_id->add_ref(event->cm_info.cm_id); INIT_WORK(&event->event_work, nes_cm_event_handler); nes_debug(NES_DBG_CM, "cm_node=%p queue_work, event=%p\n", - event->cm_node, event); + event->cm_node, event); queue_work(event->cm_node->cm_core->event_wq, &event->event_work); @@ -3630,7 +3818,7 @@ static int nes_cm_post_event(struct nes_cm_event *event) static void nes_cm_event_handler(struct work_struct *work) { struct nes_cm_event *event = container_of(work, struct nes_cm_event, - event_work); + event_work); struct nes_cm_core *cm_core; if ((!event) || (!event->cm_node) || (!event->cm_node->cm_core)) @@ -3638,29 +3826,29 @@ static void nes_cm_event_handler(struct work_struct *work) cm_core = event->cm_node->cm_core; nes_debug(NES_DBG_CM, "event=%p, event->type=%u, events posted=%u\n", - event, event->type, atomic_read(&cm_core->events_posted)); + event, event->type, atomic_read(&cm_core->events_posted)); switch (event->type) { case NES_CM_EVENT_MPA_REQ: cm_event_mpa_req(event); nes_debug(NES_DBG_CM, "cm_node=%p CM Event: MPA REQUEST\n", - event->cm_node); + event->cm_node); break; case NES_CM_EVENT_RESET: nes_debug(NES_DBG_CM, "cm_node = %p CM Event: RESET\n", - event->cm_node); + event->cm_node); cm_event_reset(event); break; case NES_CM_EVENT_CONNECTED: if ((!event->cm_node->cm_id) || - (event->cm_node->state != NES_CM_STATE_TSA)) + (event->cm_node->state != NES_CM_STATE_TSA)) break; cm_event_connected(event); nes_debug(NES_DBG_CM, "CM Event: CONNECTED\n"); break; case NES_CM_EVENT_MPA_REJECT: if ((!event->cm_node->cm_id) || - (event->cm_node->state == NES_CM_STATE_TSA)) + (event->cm_node->state == NES_CM_STATE_TSA)) break; cm_event_mpa_reject(event); nes_debug(NES_DBG_CM, "CM Event: REJECT\n"); @@ -3668,7 +3856,7 @@ static void nes_cm_event_handler(struct work_struct *work) case NES_CM_EVENT_ABORTED: if ((!event->cm_node->cm_id) || - (event->cm_node->state == NES_CM_STATE_TSA)) + (event->cm_node->state == NES_CM_STATE_TSA)) break; cm_event_connect_error(event); nes_debug(NES_DBG_CM, "CM Event: ABORTED\n"); diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h index d9825fda70a1..85f53d9521b0 100644 --- a/drivers/infiniband/hw/nes/nes_cm.h +++ b/drivers/infiniband/hw/nes/nes_cm.h @@ -48,7 +48,16 @@ #define IETF_MPA_KEY_SIZE 16 #define IETF_MPA_VERSION 1 #define IETF_MAX_PRIV_DATA_LEN 512 -#define IETF_MPA_FRAME_SIZE 20 
+#define IETF_MPA_FRAME_SIZE 20 +#define IETF_RTR_MSG_SIZE 4 +#define IETF_MPA_V2_FLAG 0x10 + +/* IETF RTR MSG Fields */ +#define IETF_PEER_TO_PEER 0x8000 +#define IETF_FLPDU_ZERO_LEN 0x4000 +#define IETF_RDMA0_WRITE 0x8000 +#define IETF_RDMA0_READ 0x4000 +#define IETF_NO_IRD_ORD 0x3FFF enum ietf_mpa_flags { IETF_MPA_FLAGS_MARKERS = 0x80, /* receive Markers */ @@ -56,7 +65,7 @@ enum ietf_mpa_flags { IETF_MPA_FLAGS_REJECT = 0x20, /* Reject */ }; -struct ietf_mpa_frame { +struct ietf_mpa_v1 { u8 key[IETF_MPA_KEY_SIZE]; u8 flags; u8 rev; @@ -66,6 +75,20 @@ struct ietf_mpa_frame { #define ietf_mpa_req_resp_frame ietf_mpa_frame +struct ietf_rtr_msg { + __be16 ctrl_ird; + __be16 ctrl_ord; +}; + +struct ietf_mpa_v2 { + u8 key[IETF_MPA_KEY_SIZE]; + u8 flags; + u8 rev; + __be16 priv_data_len; + struct ietf_rtr_msg rtr_msg; + u8 priv_data[0]; +}; + struct nes_v4_quad { u32 rsvd0; __le32 DstIpAdrIndex; /* Only most significant 5 bits are valid */ @@ -171,8 +194,7 @@ struct nes_timer_entry { #define NES_CM_DEF_SEQ2 0x18ed5740 #define NES_CM_DEF_LOCAL_ID2 0xb807 -#define MAX_CM_BUFFER (IETF_MPA_FRAME_SIZE + IETF_MAX_PRIV_DATA_LEN) - +#define MAX_CM_BUFFER (IETF_MPA_FRAME_SIZE + IETF_RTR_MSG_SIZE + IETF_MAX_PRIV_DATA_LEN) typedef u32 nes_addr_t; @@ -204,6 +226,21 @@ enum nes_cm_node_state { NES_CM_STATE_CLOSED }; +enum mpa_frame_version { + IETF_MPA_V1 = 1, + IETF_MPA_V2 = 2 +}; + +enum mpa_frame_key { + MPA_KEY_REQUEST, + MPA_KEY_REPLY +}; + +enum send_rdma0 { + SEND_RDMA_READ_ZERO = 1, + SEND_RDMA_WRITE_ZERO = 2 +}; + enum nes_tcpip_pkt_type { NES_PKT_TYPE_UNKNOWN, NES_PKT_TYPE_SYN, @@ -245,9 +282,9 @@ struct nes_cm_tcp_context { enum nes_cm_listener_state { - NES_CM_LISTENER_PASSIVE_STATE=1, - NES_CM_LISTENER_ACTIVE_STATE=2, - NES_CM_LISTENER_EITHER_STATE=3 + NES_CM_LISTENER_PASSIVE_STATE = 1, + NES_CM_LISTENER_ACTIVE_STATE = 2, + NES_CM_LISTENER_EITHER_STATE = 3 }; struct nes_cm_listener { @@ -283,16 +320,20 @@ struct nes_cm_node { struct nes_cm_node *loopbackpartner; - struct nes_timer_entry *send_entry; - + struct nes_timer_entry *send_entry; + struct nes_timer_entry *recv_entry; spinlock_t retrans_list_lock; - struct nes_timer_entry *recv_entry; + enum send_rdma0 send_rdma0_op; - int send_write0; union { - struct ietf_mpa_frame mpa_frame; - u8 mpa_frame_buf[MAX_CM_BUFFER]; + struct ietf_mpa_v1 mpa_frame; + struct ietf_mpa_v2 mpa_v2_frame; + u8 mpa_frame_buf[MAX_CM_BUFFER]; }; + enum mpa_frame_version mpa_frame_rev; + u16 ird_size; + u16 ord_size; + u16 mpa_frame_size; struct iw_cm_id *cm_id; struct list_head list; @@ -399,10 +440,8 @@ struct nes_cm_ops { struct nes_vnic *, u16, void *, struct nes_cm_info *); int (*close)(struct nes_cm_core *, struct nes_cm_node *); - int (*accept)(struct nes_cm_core *, struct ietf_mpa_frame *, - struct nes_cm_node *); - int (*reject)(struct nes_cm_core *, struct ietf_mpa_frame *, - struct nes_cm_node *); + int (*accept)(struct nes_cm_core *, struct nes_cm_node *); + int (*reject)(struct nes_cm_core *, struct nes_cm_node *); int (*recv_pkt)(struct nes_cm_core *, struct nes_vnic *, struct sk_buff *); int (*destroy_cm_core)(struct nes_cm_core *); diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h index 2df9993e0cac..854316d6694f 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.h +++ b/drivers/infiniband/hw/nes/nes_verbs.h @@ -139,7 +139,8 @@ struct nes_qp { struct nes_cq *nesrcq; struct nes_pd *nespd; void *cm_node; /* handle of the node this QP is associated with */ - struct ietf_mpa_frame *ietf_frame; + void *ietf_frame; + 
u8 ietf_frame_size; dma_addr_t ietf_frame_pbase; struct ib_mr *lsmm_mr; struct nes_hw_qp hwqp; From bab3a9f43f211554626e189519b91ab9ef952f2d Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Sun, 25 Sep 2011 20:27:52 -0500 Subject: [PATCH 17/62] RDMA/nes: Fix terminate connection Fixes a crash that occurs during close when an error async event is received. The terminate message is not sent to the remote node if a close is already being processed. Signed-off-by: Tatyana Nikolova Signed-off-by: Faisal Latif Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_hw.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index be36cbeae630..43bebe805d2e 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -3553,9 +3553,9 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, aeqe_cq_id = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]); if (aeq_info & NES_AEQE_QP) { - if ((!nes_is_resource_allocated(nesadapter, nesadapter->allocated_qps, - aeqe_cq_id)) || - (atomic_read(&nesqp->close_timer_started))) + if (!nes_is_resource_allocated(nesadapter, + nesadapter->allocated_qps, + aeqe_cq_id)) return; } @@ -3566,8 +3566,7 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, if (atomic_inc_return(&nesqp->close_timer_started) == 1) { if ((tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) && - (nesqp->ibqp_state == IB_QPS_RTS) && - ((nesadapter->eeprom_version >> 16) != NES_A0)) { + (nesqp->ibqp_state == IB_QPS_RTS)) { spin_lock_irqsave(&nesqp->lock, flags); nesqp->hw_iwarp_state = iwarp_state; nesqp->hw_tcp_state = tcp_state; @@ -3594,9 +3593,10 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, return; } spin_lock_irqsave(&nesqp->lock, flags); - nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING; + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; spin_unlock_irqrestore(&nesqp->lock, flags); - nes_hw_modify_qp(nesdev, nesqp, NES_CQP_QP_IWARP_STATE_CLOSING, 0, 0); nes_cm_disconn(nesqp); break; @@ -3694,7 +3694,8 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, case NES_AEQE_AEID_ROE_INVALID_RDMA_WRITE_OR_READ_RESP: printk(KERN_ERR PFX "QP[%u] async_event_id=0x%04X IB_EVENT_QP_FATAL\n", nesqp->hwqp.qp_id, async_event_id); - nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL); + if (!atomic_read(&nesqp->close_timer_started)) + nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL); break; case NES_AEQE_AEID_CQ_OPERATION_ERROR: From 6224c7eeff586e9dbf51b872d7e0bae291fa00ed Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Sun, 25 Sep 2011 20:33:44 -0500 Subject: [PATCH 18/62] RDMA/nes: Print IP address for critical errors Print the IP address of the remote host when a critical asynchronous event is received.
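For background, the helper added below relies on the kernel's %pI4 printk format extension, which takes a pointer to a 4-byte IPv4 address and prints it in dotted-quad form, so no manual byte-by-byte formatting is needed. A minimal sketch of the idea, assuming the usual kernel headers; the structure and helper names are illustrative only, not the driver's actual definitions:

	/* Illustrative only: print an IPv4 address stored in a 32-bit field. */
	struct example_conn {
		u32 rem_addr;	/* remote peer's IPv4 address */
	};

	static void example_print_remote_ip(struct example_conn *conn)
	{
		if (conn)
			printk(KERN_ERR "Remote IP addr: %pI4\n", &conn->rem_addr);
	}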
Signed-off-by: Tatyana Nikolova Signed-off-by: Faisal Latif Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_hw.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index 43bebe805d2e..f85ccbad8fa2 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -110,6 +110,14 @@ static unsigned char *nes_tcp_state_str[] = { }; #endif +static inline void print_ip(struct nes_cm_node *cm_node) +{ + unsigned char *rem_addr; + if (cm_node) { + rem_addr = (unsigned char *)&cm_node->rem_addr; + printk(KERN_ERR PFX "Remote IP addr: %pI4\n", rem_addr); + } +} /** * nes_nic_init_timer_defaults @@ -3694,6 +3702,7 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, case NES_AEQE_AEID_ROE_INVALID_RDMA_WRITE_OR_READ_RESP: printk(KERN_ERR PFX "QP[%u] async_event_id=0x%04X IB_EVENT_QP_FATAL\n", nesqp->hwqp.qp_id, async_event_id); + print_ip(nesqp->cm_node); if (!atomic_read(&nesqp->close_timer_started)) nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL); break; From 0f0bee8bbc2b3e49baa703118041f99db9ef41a1 Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Sun, 25 Sep 2011 20:34:00 -0500 Subject: [PATCH 19/62] RDMA/nes: Support for Packed And Unaligned fpdus Support for Packed and Unaligned (PAU) FPDUs is needed for interoperability between NES and non-NES nodes. When the NES hardware detects a PAU frame, it will pass it to the driver to process the frame. NES driver creates a new frame for each FPDU and forwards it to the hardware to be sent to its associated qp. Signed-off-by: Tatyana Nikolova Signed-off-by: Faisal Latif Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/Makefile | 2 +- drivers/infiniband/hw/nes/nes.c | 8 +- drivers/infiniband/hw/nes/nes.h | 17 +- drivers/infiniband/hw/nes/nes_cm.c | 19 +- drivers/infiniband/hw/nes/nes_cm.h | 2 + drivers/infiniband/hw/nes/nes_hw.c | 73 +- drivers/infiniband/hw/nes/nes_hw.h | 35 +- drivers/infiniband/hw/nes/nes_mgt.c | 1162 +++++++++++++++++++++++++ drivers/infiniband/hw/nes/nes_mgt.h | 97 +++ drivers/infiniband/hw/nes/nes_nic.c | 4 + drivers/infiniband/hw/nes/nes_utils.c | 53 +- drivers/infiniband/hw/nes/nes_verbs.c | 3 +- drivers/infiniband/hw/nes/nes_verbs.h | 9 +- 13 files changed, 1434 insertions(+), 50 deletions(-) create mode 100644 drivers/infiniband/hw/nes/nes_mgt.c create mode 100644 drivers/infiniband/hw/nes/nes_mgt.h diff --git a/drivers/infiniband/hw/nes/Makefile b/drivers/infiniband/hw/nes/Makefile index 35148513c47e..97820c23ecef 100644 --- a/drivers/infiniband/hw/nes/Makefile +++ b/drivers/infiniband/hw/nes/Makefile @@ -1,3 +1,3 @@ obj-$(CONFIG_INFINIBAND_NES) += iw_nes.o -iw_nes-objs := nes.o nes_hw.o nes_nic.o nes_utils.o nes_verbs.o nes_cm.o +iw_nes-objs := nes.o nes_hw.o nes_nic.o nes_utils.o nes_verbs.o nes_cm.o nes_mgt.o diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c index 2d668c69f6d9..5965b3df8f2f 100644 --- a/drivers/infiniband/hw/nes/nes.c +++ b/drivers/infiniband/hw/nes/nes.c @@ -84,7 +84,7 @@ module_param(send_first, int, 0644); MODULE_PARM_DESC(send_first, "Send RDMA Message First on Active Connection"); -unsigned int nes_drv_opt = 0; +unsigned int nes_drv_opt = NES_DRV_OPT_DISABLE_INT_MOD | NES_DRV_OPT_ENABLE_PAU; module_param(nes_drv_opt, int, 0644); MODULE_PARM_DESC(nes_drv_opt, "Driver option parameters"); @@ -130,9 +130,6 @@ static struct notifier_block nes_net_notifier = { .notifier_call = nes_net_event }; - - - /** * 
nes_inetaddr_event */ @@ -321,6 +318,9 @@ void nes_rem_ref(struct ib_qp *ibqp) } if (atomic_dec_and_test(&nesqp->refcount)) { + if (nesqp->pau_mode) + nes_destroy_pau_qp(nesdev, nesqp); + /* Destroy the QP */ cqp_request = nes_get_cqp_request(nesdev); if (cqp_request == NULL) { diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h index 6fe79876009e..568b4f11380a 100644 --- a/drivers/infiniband/hw/nes/nes.h +++ b/drivers/infiniband/hw/nes/nes.h @@ -102,6 +102,7 @@ #define NES_DRV_OPT_NO_INLINE_DATA 0x00000080 #define NES_DRV_OPT_DISABLE_INT_MOD 0x00000100 #define NES_DRV_OPT_DISABLE_VIRT_WQ 0x00000200 +#define NES_DRV_OPT_ENABLE_PAU 0x00000400 #define NES_AEQ_EVENT_TIMEOUT 2500 #define NES_DISCONNECT_EVENT_TIMEOUT 2000 @@ -128,6 +129,7 @@ #define NES_DBG_IW_RX 0x00020000 #define NES_DBG_IW_TX 0x00040000 #define NES_DBG_SHUTDOWN 0x00080000 +#define NES_DBG_PAU 0x00100000 #define NES_DBG_RSVD1 0x10000000 #define NES_DBG_RSVD2 0x20000000 #define NES_DBG_RSVD3 0x40000000 @@ -162,6 +164,7 @@ do { \ #include "nes_context.h" #include "nes_user.h" #include "nes_cm.h" +#include "nes_mgt.h" extern int max_mtu; #define max_frame_len (max_mtu+ETH_HLEN) @@ -202,6 +205,8 @@ extern atomic_t cm_nodes_created; extern atomic_t cm_nodes_destroyed; extern atomic_t cm_accel_dropped_pkts; extern atomic_t cm_resets_recvd; +extern atomic_t pau_qps_created; +extern atomic_t pau_qps_destroyed; extern u32 int_mod_timer_init; extern u32 int_mod_cq_depth_256; @@ -273,6 +278,14 @@ struct nes_device { u8 link_recheck; }; +/* Receive skb private area - must fit in skb->cb area */ +struct nes_rskb_cb { + u64 busaddr; + u32 maplen; + u32 seqnum; + u8 *data_start; + struct nes_qp *nesqp; +}; static inline __le32 get_crc_value(struct nes_v4_quad *nes_quad) { @@ -305,8 +318,8 @@ set_wqe_32bit_value(__le32 *wqe_words, u32 index, u32 value) static inline void nes_fill_init_cqp_wqe(struct nes_hw_cqp_wqe *cqp_wqe, struct nes_device *nesdev) { - set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_COMP_CTX_LOW_IDX, - (u64)((unsigned long) &nesdev->cqp)); + cqp_wqe->wqe_words[NES_CQP_WQE_COMP_CTX_LOW_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_WQE_COMP_CTX_HIGH_IDX] = 0; cqp_wqe->wqe_words[NES_CQP_WQE_COMP_SCRATCH_LOW_IDX] = 0; cqp_wqe->wqe_words[NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX] = 0; cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX] = 0; diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index c118663e4437..401b7bb828d0 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -159,6 +159,15 @@ atomic_t cm_connecteds; atomic_t cm_connect_reqs; atomic_t cm_rejects; +int nes_add_ref_cm_node(struct nes_cm_node *cm_node) +{ + return add_ref_cm_node(cm_node); +} + +int nes_rem_ref_cm_node(struct nes_cm_node *cm_node) +{ + return rem_ref_cm_node(cm_node->cm_core, cm_node); +} /** * create_event @@ -2331,9 +2340,13 @@ static int mini_cm_recv_pkt(struct nes_cm_core *cm_core, } add_ref_cm_node(cm_node); } else if (cm_node->state == NES_CM_STATE_TSA) { - rem_ref_cm_node(cm_core, cm_node); - atomic_inc(&cm_accel_dropped_pkts); - dev_kfree_skb_any(skb); + if (cm_node->nesqp->pau_mode) + nes_queue_mgt_skbs(skb, nesvnic, cm_node->nesqp); + else { + rem_ref_cm_node(cm_core, cm_node); + atomic_inc(&cm_accel_dropped_pkts); + dev_kfree_skb_any(skb); + } break; } skb_reset_network_header(skb); diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h index d9825fda70a1..130c185cde0d 100644 --- a/drivers/infiniband/hw/nes/nes_cm.h 
+++ b/drivers/infiniband/hw/nes/nes_cm.h @@ -422,5 +422,7 @@ int nes_destroy_listen(struct iw_cm_id *); int nes_cm_recv(struct sk_buff *, struct net_device *); int nes_cm_start(void); int nes_cm_stop(void); +int nes_add_ref_cm_node(struct nes_cm_node *cm_node); +int nes_rem_ref_cm_node(struct nes_cm_node *cm_node); #endif /* NES_CM_H */ diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index f85ccbad8fa2..7c0ff19ce382 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -1563,6 +1563,7 @@ static void nes_replenish_nic_rq(struct nes_vnic *nesvnic) struct nes_hw_nic_rq_wqe *nic_rqe; struct nes_hw_nic *nesnic; struct nes_device *nesdev; + struct nes_rskb_cb *cb; u32 rx_wqes_posted = 0; nesnic = &nesvnic->nic; @@ -1588,6 +1589,9 @@ static void nes_replenish_nic_rq(struct nes_vnic *nesvnic) bus_address = pci_map_single(nesdev->pcidev, skb->data, nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); + cb = (struct nes_rskb_cb *)&skb->cb[0]; + cb->busaddr = bus_address; + cb->maplen = nesvnic->max_frame_size; nic_rqe = &nesnic->rq_vbase[nesvnic->nic.rq_head]; nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = @@ -1677,6 +1681,7 @@ int nes_init_nic_qp(struct nes_device *nesdev, struct net_device *netdev) u32 cqp_head; u32 counter; u32 wqe_count; + struct nes_rskb_cb *cb; u8 jumbomode=0; /* Allocate fragment, SQ, RQ, and CQ; Reuse CEQ based on the PCI function */ @@ -1853,6 +1858,9 @@ int nes_init_nic_qp(struct nes_device *nesdev, struct net_device *netdev) pmem = pci_map_single(nesdev->pcidev, skb->data, nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); + cb = (struct nes_rskb_cb *)&skb->cb[0]; + cb->busaddr = pmem; + cb->maplen = nesvnic->max_frame_size; nic_rqe = &nesvnic->nic.rq_vbase[counter]; nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = cpu_to_le32(nesvnic->max_frame_size); @@ -1881,6 +1889,13 @@ int nes_init_nic_qp(struct nes_device *nesdev, struct net_device *netdev) jumbomode = 1; nes_nic_init_timer_defaults(nesdev, jumbomode); } + if ((nesdev->nesadapter->allow_unaligned_fpdus) && + (nes_init_mgt_qp(nesdev, netdev, nesvnic))) { + nes_debug(NES_DBG_INIT, "%s: Out of memory for pau nic\n", netdev->name); + nes_destroy_nic_qp(nesvnic); + return -ENOMEM; + } + nesvnic->lro_mgr.max_aggr = nes_lro_max_aggr; nesvnic->lro_mgr.max_desc = NES_MAX_LRO_DESCRIPTORS; nesvnic->lro_mgr.lro_arr = nesvnic->lro_desc; @@ -1903,28 +1918,29 @@ void nes_destroy_nic_qp(struct nes_vnic *nesvnic) struct nes_device *nesdev = nesvnic->nesdev; struct nes_hw_cqp_wqe *cqp_wqe; struct nes_hw_nic_sq_wqe *nic_sqe; - struct nes_hw_nic_rq_wqe *nic_rqe; __le16 *wqe_fragment_length; u16 wqe_fragment_index; - u64 wqe_frag; u32 cqp_head; u32 wqm_cfg0; unsigned long flags; + struct sk_buff *rx_skb; + struct nes_rskb_cb *cb; int ret; + if (nesdev->nesadapter->allow_unaligned_fpdus) + nes_destroy_mgt(nesvnic); + /* clear wqe stall before destroying NIC QP */ wqm_cfg0 = nes_read_indexed(nesdev, NES_IDX_WQM_CONFIG0); nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG0, wqm_cfg0 & 0xFFFF7FFF); /* Free remaining NIC receive buffers */ while (nesvnic->nic.rq_head != nesvnic->nic.rq_tail) { - nic_rqe = &nesvnic->nic.rq_vbase[nesvnic->nic.rq_tail]; - wqe_frag = (u64)le32_to_cpu( - nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX]); - wqe_frag |= ((u64)le32_to_cpu( - nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX]))<<32; - pci_unmap_single(nesdev->pcidev, (dma_addr_t)wqe_frag, - nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); + rx_skb = 
nesvnic->nic.rx_skb[nesvnic->nic.rq_tail]; + cb = (struct nes_rskb_cb *)&rx_skb->cb[0]; + pci_unmap_single(nesdev->pcidev, cb->busaddr, cb->maplen, + PCI_DMA_FROMDEVICE); + dev_kfree_skb(nesvnic->nic.rx_skb[nesvnic->nic.rq_tail++]); nesvnic->nic.rq_tail &= (nesvnic->nic.rq_size - 1); } @@ -2783,6 +2799,7 @@ void nes_nic_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq) struct nes_hw_nic_sq_wqe *nic_sqe; struct sk_buff *skb; struct sk_buff *rx_skb; + struct nes_rskb_cb *cb; __le16 *wqe_fragment_length; u32 head; u32 cq_size; @@ -2867,6 +2884,8 @@ void nes_nic_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq) bus_address += ((u64)le32_to_cpu(nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX])) << 32; pci_unmap_single(nesdev->pcidev, bus_address, nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); + cb = (struct nes_rskb_cb *)&rx_skb->cb[0]; + cb->busaddr = 0; /* rx_skb->tail = rx_skb->data + rx_pkt_size; */ /* rx_skb->len = rx_pkt_size; */ rx_skb->len = 0; /* TODO: see if this is necessary */ @@ -2991,6 +3010,7 @@ skip_rx_indicate0: } + /** * nes_cqp_ce_handler */ @@ -3005,6 +3025,8 @@ static void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq) u32 cq_size; u32 cqe_count=0; u32 error_code; + u32 opcode; + u32 ctx_index; /* u32 counter; */ head = cq->cq_head; @@ -3015,12 +3037,9 @@ static void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq) /* nes_debug(NES_DBG_CQP, "head=%u cqe_words=%08X\n", head, le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX])); */ - if (le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX]) & NES_CQE_VALID) { - u64temp = (((u64)(le32_to_cpu(cq->cq_vbase[head]. - cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX]))) << 32) | - ((u64)(le32_to_cpu(cq->cq_vbase[head]. - cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]))); - cqp = *((struct nes_hw_cqp **)&u64temp); + opcode = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX]); + if (opcode & NES_CQE_VALID) { + cqp = &nesdev->cqp; error_code = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_ERROR_CODE_IDX]); if (error_code) { @@ -3029,15 +3048,14 @@ static void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq) le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX])&0x3f, (u16)(error_code >> 16), (u16)error_code); - nes_debug(NES_DBG_CQP, "cqp: qp_id=%u, sq_head=%u, sq_tail=%u\n", - cqp->qp_id, cqp->sq_head, cqp->sq_tail); } - u64temp = (((u64)(le32_to_cpu(nesdev->cqp.sq_vbase[cqp->sq_tail]. - wqe_words[NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX]))) << 32) | - ((u64)(le32_to_cpu(nesdev->cqp.sq_vbase[cqp->sq_tail]. - wqe_words[NES_CQP_WQE_COMP_SCRATCH_LOW_IDX]))); - cqp_request = *((struct nes_cqp_request **)&u64temp); + u64temp = (((u64)(le32_to_cpu(cq->cq_vbase[head]. + cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX]))) << 32) | + ((u64)(le32_to_cpu(cq->cq_vbase[head]. 
+ cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]))); + + cqp_request = (struct nes_cqp_request *)(unsigned long)u64temp; if (cqp_request) { if (cqp_request->waiting) { /* nes_debug(NES_DBG_CQP, "%s: Waking up requestor\n"); */ @@ -3083,9 +3101,15 @@ static void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq) cqp_wqe = &nesdev->cqp.sq_vbase[head]; memcpy(cqp_wqe, &cqp_request->cqp_wqe, sizeof(*cqp_wqe)); barrier(); - cqp_wqe->wqe_words[NES_CQP_WQE_COMP_SCRATCH_LOW_IDX] = + + opcode = cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX]; + if ((opcode & NES_CQP_OPCODE_MASK) == NES_CQP_DOWNLOAD_SEGMENT) + ctx_index = NES_CQP_WQE_DL_COMP_CTX_LOW_IDX; + else + ctx_index = NES_CQP_WQE_COMP_CTX_LOW_IDX; + cqp_wqe->wqe_words[ctx_index] = cpu_to_le32((u32)((unsigned long)cqp_request)); - cqp_wqe->wqe_words[NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX] = + cqp_wqe->wqe_words[ctx_index + 1] = cpu_to_le32((u32)(upper_32_bits((unsigned long)cqp_request))); nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) put on CQPs SQ wqe%u.\n", cqp_request, le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f, head); @@ -3101,7 +3125,6 @@ static void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq) nes_read32(nesdev->regs+NES_CQE_ALLOC); } - static u8 *locate_mpa(u8 *pkt, u32 aeq_info) { if (aeq_info & NES_AEQE_Q2_DATA_ETHERNET) { diff --git a/drivers/infiniband/hw/nes/nes_hw.h b/drivers/infiniband/hw/nes/nes_hw.h index c3241479ec0e..0b590e152c6a 100644 --- a/drivers/infiniband/hw/nes/nes_hw.h +++ b/drivers/infiniband/hw/nes/nes_hw.h @@ -47,6 +47,11 @@ #define NES_MULTICAST_PF_MAX 8 #define NES_A0 3 +#define NES_ENABLE_PAU 0x07000001 +#define NES_DISABLE_PAU 0x07000000 +#define NES_PAU_COUNTER 10 +#define NES_CQP_OPCODE_MASK 0x3f + enum pci_regs { NES_INT_STAT = 0x0000, NES_INT_MASK = 0x0004, @@ -73,8 +78,10 @@ enum indexed_regs { NES_IDX_QP_CONTROL = 0x0040, NES_IDX_FLM_CONTROL = 0x0080, NES_IDX_INT_CPU_STATUS = 0x00a0, + NES_IDX_GPR_TRIGGER = 0x00bc, NES_IDX_GPIO_CONTROL = 0x00f0, NES_IDX_GPIO_DATA = 0x00f4, + NES_IDX_GPR2 = 0x010c, NES_IDX_TCP_CONFIG0 = 0x01e4, NES_IDX_TCP_TIMER_CONFIG = 0x01ec, NES_IDX_TCP_NOW = 0x01f0, @@ -202,6 +209,7 @@ enum nes_cqp_opcodes { NES_CQP_REGISTER_SHARED_STAG = 0x0c, NES_CQP_DEALLOCATE_STAG = 0x0d, NES_CQP_MANAGE_ARP_CACHE = 0x0f, + NES_CQP_DOWNLOAD_SEGMENT = 0x10, NES_CQP_SUSPEND_QPS = 0x11, NES_CQP_UPLOAD_CONTEXT = 0x13, NES_CQP_CREATE_CEQ = 0x16, @@ -210,7 +218,8 @@ enum nes_cqp_opcodes { NES_CQP_DESTROY_AEQ = 0x1b, NES_CQP_LMI_ACCESS = 0x20, NES_CQP_FLUSH_WQES = 0x22, - NES_CQP_MANAGE_APBVT = 0x23 + NES_CQP_MANAGE_APBVT = 0x23, + NES_CQP_MANAGE_QUAD_HASH = 0x25 }; enum nes_cqp_wqe_word_idx { @@ -222,6 +231,14 @@ enum nes_cqp_wqe_word_idx { NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX = 5, }; +enum nes_cqp_wqe_word_download_idx { /* format differs from other cqp ops */ + NES_CQP_WQE_DL_OPCODE_IDX = 0, + NES_CQP_WQE_DL_COMP_CTX_LOW_IDX = 1, + NES_CQP_WQE_DL_COMP_CTX_HIGH_IDX = 2, + NES_CQP_WQE_DL_LENGTH_0_TOTAL_IDX = 3 + /* For index values 4-15 use NES_NIC_SQ_WQE_ values */ +}; + enum nes_cqp_cq_wqeword_idx { NES_CQP_CQ_WQE_PBL_LOW_IDX = 6, NES_CQP_CQ_WQE_PBL_HIGH_IDX = 7, @@ -242,6 +259,7 @@ enum nes_cqp_stag_wqeword_idx { NES_CQP_STAG_WQE_PBL_LEN_IDX = 14 }; +#define NES_CQP_OP_LOGICAL_PORT_SHIFT 26 #define NES_CQP_OP_IWARP_STATE_SHIFT 28 #define NES_CQP_OP_TERMLEN_SHIFT 28 @@ -599,6 +617,7 @@ enum nes_nic_sq_wqe_bits { enum nes_nic_cqe_word_idx { NES_NIC_CQE_ACCQP_ID_IDX = 0, + NES_NIC_CQE_HASH_RCVNXT = 1, NES_NIC_CQE_TAG_PKT_TYPE_IDX = 2, NES_NIC_CQE_MISC_IDX 
= 3, }; @@ -1005,6 +1024,11 @@ struct nes_arp_entry { #define NES_NIC_CQ_DOWNWARD_TREND 16 #define NES_PFT_SIZE 48 +#define NES_MGT_WQ_COUNT 32 +#define NES_MGT_CTX_SIZE ((NES_NIC_CTX_RQ_SIZE_32) | (NES_NIC_CTX_SQ_SIZE_32)) +#define NES_MGT_QP_OFFSET 36 +#define NES_MGT_QP_COUNT 4 + struct nes_hw_tune_timer { /* u16 cq_count; */ u16 threshold_low; @@ -1118,6 +1142,7 @@ struct nes_adapter { u32 et_rate_sample_interval; u32 timer_int_limit; u32 wqm_quanta; + u8 allow_unaligned_fpdus; /* Adapter base MAC address */ u32 mac_addr_low; @@ -1251,6 +1276,14 @@ struct nes_vnic { enum ib_event_type delayed_event; enum ib_event_type last_dispatched_event; spinlock_t port_ibevent_lock; + u32 mgt_mem_size; + void *mgt_vbase; + dma_addr_t mgt_pbase; + struct nes_vnic_mgt *mgtvnic[NES_MGT_QP_COUNT]; + struct task_struct *mgt_thread; + wait_queue_head_t mgt_wait_queue; + struct sk_buff_head mgt_skb_list; + }; struct nes_ib_device { diff --git a/drivers/infiniband/hw/nes/nes_mgt.c b/drivers/infiniband/hw/nes/nes_mgt.c new file mode 100644 index 000000000000..b3b2a240c6e9 --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_mgt.c @@ -0,0 +1,1162 @@ +/* + * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include "nes.h" +#include "nes_mgt.h" + +atomic_t pau_qps_created; +atomic_t pau_qps_destroyed; + +static void nes_replenish_mgt_rq(struct nes_vnic_mgt *mgtvnic) +{ + unsigned long flags; + dma_addr_t bus_address; + struct sk_buff *skb; + struct nes_hw_nic_rq_wqe *nic_rqe; + struct nes_hw_mgt *nesmgt; + struct nes_device *nesdev; + struct nes_rskb_cb *cb; + u32 rx_wqes_posted = 0; + + nesmgt = &mgtvnic->mgt; + nesdev = mgtvnic->nesvnic->nesdev; + spin_lock_irqsave(&nesmgt->rq_lock, flags); + if (nesmgt->replenishing_rq != 0) { + if (((nesmgt->rq_size - 1) == atomic_read(&mgtvnic->rx_skbs_needed)) && + (atomic_read(&mgtvnic->rx_skb_timer_running) == 0)) { + atomic_set(&mgtvnic->rx_skb_timer_running, 1); + spin_unlock_irqrestore(&nesmgt->rq_lock, flags); + mgtvnic->rq_wqes_timer.expires = jiffies + (HZ / 2); /* 1/2 second */ + add_timer(&mgtvnic->rq_wqes_timer); + } else { + spin_unlock_irqrestore(&nesmgt->rq_lock, flags); + } + return; + } + nesmgt->replenishing_rq = 1; + spin_unlock_irqrestore(&nesmgt->rq_lock, flags); + do { + skb = dev_alloc_skb(mgtvnic->nesvnic->max_frame_size); + if (skb) { + skb->dev = mgtvnic->nesvnic->netdev; + + bus_address = pci_map_single(nesdev->pcidev, + skb->data, mgtvnic->nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); + cb = (struct nes_rskb_cb *)&skb->cb[0]; + cb->busaddr = bus_address; + cb->maplen = mgtvnic->nesvnic->max_frame_size; + + nic_rqe = &nesmgt->rq_vbase[mgtvnic->mgt.rq_head]; + nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = + cpu_to_le32(mgtvnic->nesvnic->max_frame_size); + nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0; + nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] = + cpu_to_le32((u32)bus_address); + nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = + cpu_to_le32((u32)((u64)bus_address >> 32)); + nesmgt->rx_skb[nesmgt->rq_head] = skb; + nesmgt->rq_head++; + nesmgt->rq_head &= nesmgt->rq_size - 1; + atomic_dec(&mgtvnic->rx_skbs_needed); + barrier(); + if (++rx_wqes_posted == 255) { + nes_write32(nesdev->regs + NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesmgt->qp_id); + rx_wqes_posted = 0; + } + } else { + spin_lock_irqsave(&nesmgt->rq_lock, flags); + if (((nesmgt->rq_size - 1) == atomic_read(&mgtvnic->rx_skbs_needed)) && + (atomic_read(&mgtvnic->rx_skb_timer_running) == 0)) { + atomic_set(&mgtvnic->rx_skb_timer_running, 1); + spin_unlock_irqrestore(&nesmgt->rq_lock, flags); + mgtvnic->rq_wqes_timer.expires = jiffies + (HZ / 2); /* 1/2 second */ + add_timer(&mgtvnic->rq_wqes_timer); + } else { + spin_unlock_irqrestore(&nesmgt->rq_lock, flags); + } + break; + } + } while (atomic_read(&mgtvnic->rx_skbs_needed)); + barrier(); + if (rx_wqes_posted) + nes_write32(nesdev->regs + NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesmgt->qp_id); + nesmgt->replenishing_rq = 0; +} + +/** + * nes_mgt_rq_wqes_timeout + */ +static void nes_mgt_rq_wqes_timeout(unsigned long parm) +{ + struct nes_vnic_mgt *mgtvnic = (struct nes_vnic_mgt *)parm; + + atomic_set(&mgtvnic->rx_skb_timer_running, 0); + if (atomic_read(&mgtvnic->rx_skbs_needed)) + nes_replenish_mgt_rq(mgtvnic); +} + +/** + * nes_mgt_free_skb - unmap and free skb + */ +static void nes_mgt_free_skb(struct nes_device *nesdev, struct sk_buff *skb, u32 dir) +{ + struct nes_rskb_cb *cb; + + cb = (struct nes_rskb_cb *)&skb->cb[0]; + pci_unmap_single(nesdev->pcidev, cb->busaddr, cb->maplen, dir); + cb->busaddr = 0; + dev_kfree_skb_any(skb); +} + +/** + * nes_download_callback - handle download completions + */ +static void 
nes_download_callback(struct nes_device *nesdev, struct nes_cqp_request *cqp_request) +{ + struct pau_fpdu_info *fpdu_info = cqp_request->cqp_callback_pointer; + struct nes_qp *nesqp = fpdu_info->nesqp; + struct sk_buff *skb; + int i; + + for (i = 0; i < fpdu_info->frag_cnt; i++) { + skb = fpdu_info->frags[i].skb; + if (fpdu_info->frags[i].cmplt) { + nes_mgt_free_skb(nesdev, skb, PCI_DMA_TODEVICE); + nes_rem_ref_cm_node(nesqp->cm_node); + } + } + + if (fpdu_info->hdr_vbase) + pci_free_consistent(nesdev->pcidev, fpdu_info->hdr_len, + fpdu_info->hdr_vbase, fpdu_info->hdr_pbase); + kfree(fpdu_info); +} + +/** + * nes_get_seq - Get the seq, ack_seq and window from the packet + */ +static u32 nes_get_seq(struct sk_buff *skb, u32 *ack, u16 *wnd, u32 *fin_rcvd, u32 *rst_rcvd) +{ + struct nes_rskb_cb *cb = (struct nes_rskb_cb *)&skb->cb[0]; + struct iphdr *iph = (struct iphdr *)(cb->data_start + ETH_HLEN); + struct tcphdr *tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl)); + + *ack = be32_to_cpu(tcph->ack_seq); + *wnd = be16_to_cpu(tcph->window); + *fin_rcvd = tcph->fin; + *rst_rcvd = tcph->rst; + return be32_to_cpu(tcph->seq); +} + +/** + * nes_get_next_skb - Get the next skb based on where current skb is in the queue + */ +static struct sk_buff *nes_get_next_skb(struct nes_device *nesdev, struct nes_qp *nesqp, + struct sk_buff *skb, u32 nextseq, u32 *ack, + u16 *wnd, u32 *fin_rcvd, u32 *rst_rcvd) +{ + u32 seq; + bool processacks; + struct sk_buff *old_skb; + + if (skb) { + /* Continue processing fpdu */ + if (skb->next == (struct sk_buff *)&nesqp->pau_list) + goto out; + skb = skb->next; + processacks = false; + } else { + /* Starting a new one */ + if (skb_queue_empty(&nesqp->pau_list)) + goto out; + skb = skb_peek(&nesqp->pau_list); + processacks = true; + } + + while (1) { + seq = nes_get_seq(skb, ack, wnd, fin_rcvd, rst_rcvd); + if (seq == nextseq) { + if (skb->len || processacks) + break; + } else if (after(seq, nextseq)) { + goto out; + } + + if (skb->next == (struct sk_buff *)&nesqp->pau_list) + goto out; + + old_skb = skb; + skb = skb->next; + skb_unlink(old_skb, &nesqp->pau_list); + nes_mgt_free_skb(nesdev, old_skb, PCI_DMA_TODEVICE); + nes_rem_ref_cm_node(nesqp->cm_node); + } + return skb; + +out: + return NULL; +} + +/** + * get_fpdu_info - Find the next complete fpdu and return its fragments. 
+ */ +static int get_fpdu_info(struct nes_device *nesdev, struct nes_qp *nesqp, + struct pau_fpdu_info **pau_fpdu_info) +{ + struct sk_buff *skb; + struct iphdr *iph; + struct tcphdr *tcph; + struct nes_rskb_cb *cb; + struct pau_fpdu_info *fpdu_info = NULL; + struct pau_fpdu_frag frags[MAX_FPDU_FRAGS]; + unsigned long flags; + u32 fpdu_len = 0; + u32 tmp_len; + int frag_cnt = 0; + u32 tot_len; + u32 frag_tot; + u32 ack; + u32 fin_rcvd; + u32 rst_rcvd; + u16 wnd; + int i; + int rc = 0; + + *pau_fpdu_info = NULL; + + spin_lock_irqsave(&nesqp->pau_lock, flags); + skb = nes_get_next_skb(nesdev, nesqp, NULL, nesqp->pau_rcv_nxt, &ack, &wnd, &fin_rcvd, &rst_rcvd); + if (!skb) { + spin_unlock_irqrestore(&nesqp->pau_lock, flags); + goto out; + } + cb = (struct nes_rskb_cb *)&skb->cb[0]; + if (skb->len) { + fpdu_len = be16_to_cpu(*(__be16 *) skb->data) + MPA_FRAMING; + fpdu_len = (fpdu_len + 3) & 0xfffffffc; + tmp_len = fpdu_len; + + /* See if we have all of the fpdu */ + frag_tot = 0; + memset(&frags, 0, sizeof frags); + for (i = 0; i < MAX_FPDU_FRAGS; i++) { + frags[i].physaddr = cb->busaddr; + frags[i].physaddr += skb->data - cb->data_start; + frags[i].frag_len = min(tmp_len, skb->len); + frags[i].skb = skb; + frags[i].cmplt = (skb->len == frags[i].frag_len); + frag_tot += frags[i].frag_len; + frag_cnt++; + + tmp_len -= frags[i].frag_len; + if (tmp_len == 0) + break; + + skb = nes_get_next_skb(nesdev, nesqp, skb, + nesqp->pau_rcv_nxt + frag_tot, &ack, &wnd, &fin_rcvd, &rst_rcvd); + if (!skb) { + spin_unlock_irqrestore(&nesqp->pau_lock, flags); + goto out; + } else if (rst_rcvd) { + /* rst received in the middle of fpdu */ + for (; i >= 0; i--) { + skb_unlink(frags[i].skb, &nesqp->pau_list); + nes_mgt_free_skb(nesdev, frags[i].skb, PCI_DMA_TODEVICE); + } + cb = (struct nes_rskb_cb *)&skb->cb[0]; + frags[0].physaddr = cb->busaddr; + frags[0].physaddr += skb->data - cb->data_start; + frags[0].frag_len = skb->len; + frags[0].skb = skb; + frags[0].cmplt = true; + frag_cnt = 1; + break; + } + + cb = (struct nes_rskb_cb *)&skb->cb[0]; + } + } else { + /* no data */ + frags[0].physaddr = cb->busaddr; + frags[0].frag_len = 0; + frags[0].skb = skb; + frags[0].cmplt = true; + frag_cnt = 1; + } + + spin_unlock_irqrestore(&nesqp->pau_lock, flags); + + /* Found one */ + fpdu_info = kzalloc(sizeof(*fpdu_info), GFP_ATOMIC); + if (fpdu_info == NULL) { + nes_debug(NES_DBG_PAU, "Failed to alloc a fpdu_info.\n"); + rc = -ENOMEM; + goto out; + } + + fpdu_info->cqp_request = nes_get_cqp_request(nesdev); + if (fpdu_info->cqp_request == NULL) { + nes_debug(NES_DBG_PAU, "Failed to get a cqp_request.\n"); + rc = -ENOMEM; + goto out; + } + + cb = (struct nes_rskb_cb *)&frags[0].skb->cb[0]; + iph = (struct iphdr *)(cb->data_start + ETH_HLEN); + tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl)); + fpdu_info->hdr_len = (((unsigned char *)tcph) + 4 * (tcph->doff)) - cb->data_start; + fpdu_info->data_len = fpdu_len; + tot_len = fpdu_info->hdr_len + fpdu_len - ETH_HLEN; + + if (frags[0].cmplt) { + fpdu_info->hdr_pbase = cb->busaddr; + fpdu_info->hdr_vbase = NULL; + } else { + fpdu_info->hdr_vbase = pci_alloc_consistent(nesdev->pcidev, + fpdu_info->hdr_len, &fpdu_info->hdr_pbase); + if (!fpdu_info->hdr_vbase) { + nes_debug(NES_DBG_PAU, "Unable to allocate memory for pau first frag\n"); + rc = -ENOMEM; + goto out; + } + + /* Copy hdrs, adjusting len and seqnum */ + memcpy(fpdu_info->hdr_vbase, cb->data_start, fpdu_info->hdr_len); + iph = (struct iphdr *)(fpdu_info->hdr_vbase + ETH_HLEN); + tcph = (struct tcphdr *)(((char 
*)iph) + (4 * iph->ihl)); + } + + iph->tot_len = cpu_to_be16(tot_len); + iph->saddr = cpu_to_be32(0x7f000001); + + tcph->seq = cpu_to_be32(nesqp->pau_rcv_nxt); + tcph->ack_seq = cpu_to_be32(ack); + tcph->window = cpu_to_be16(wnd); + + nesqp->pau_rcv_nxt += fpdu_len + fin_rcvd; + + memcpy(fpdu_info->frags, frags, sizeof(fpdu_info->frags)); + fpdu_info->frag_cnt = frag_cnt; + fpdu_info->nesqp = nesqp; + *pau_fpdu_info = fpdu_info; + + /* Update skb's for next pass */ + for (i = 0; i < frag_cnt; i++) { + cb = (struct nes_rskb_cb *)&frags[i].skb->cb[0]; + skb_pull(frags[i].skb, frags[i].frag_len); + + if (frags[i].skb->len == 0) { + /* Pull skb off the list - it will be freed in the callback */ + spin_lock_irqsave(&nesqp->pau_lock, flags); + skb_unlink(frags[i].skb, &nesqp->pau_list); + spin_unlock_irqrestore(&nesqp->pau_lock, flags); + } else { + /* Last skb still has data so update the seq */ + iph = (struct iphdr *)(cb->data_start + ETH_HLEN); + tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl)); + tcph->seq = cpu_to_be32(nesqp->pau_rcv_nxt); + } + } + +out: + if (rc) { + if (fpdu_info) { + if (fpdu_info->cqp_request) + nes_put_cqp_request(nesdev, fpdu_info->cqp_request); + kfree(fpdu_info); + } + } + return rc; +} + +/** + * forward_fpdu - send complete fpdus, one at a time + */ +static int forward_fpdus(struct nes_vnic *nesvnic, struct nes_qp *nesqp) +{ + struct nes_device *nesdev = nesvnic->nesdev; + struct pau_fpdu_info *fpdu_info; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + u64 u64tmp; + u32 u32tmp; + int rc; + + while (1) { + rc = get_fpdu_info(nesdev, nesqp, &fpdu_info); + if (fpdu_info == NULL) + return rc; + + cqp_request = fpdu_info->cqp_request; + cqp_wqe = &cqp_request->cqp_wqe; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_DL_OPCODE_IDX, + NES_CQP_DOWNLOAD_SEGMENT | + (((u32)nesvnic->logical_port) << NES_CQP_OP_LOGICAL_PORT_SHIFT)); + + u32tmp = fpdu_info->hdr_len << 16; + u32tmp |= fpdu_info->hdr_len + (u32)fpdu_info->data_len; + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_DL_LENGTH_0_TOTAL_IDX, + u32tmp); + + u32tmp = (fpdu_info->frags[1].frag_len << 16) | fpdu_info->frags[0].frag_len; + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_LENGTH_2_1_IDX, + u32tmp); + + u32tmp = (fpdu_info->frags[3].frag_len << 16) | fpdu_info->frags[2].frag_len; + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_LENGTH_4_3_IDX, + u32tmp); + + u64tmp = (u64)fpdu_info->hdr_pbase; + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX, + lower_32_bits(u64tmp)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_HIGH_IDX, + upper_32_bits(u64tmp >> 32)); + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX, + lower_32_bits(fpdu_info->frags[0].physaddr)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_HIGH_IDX, + upper_32_bits(fpdu_info->frags[0].physaddr)); + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG2_LOW_IDX, + lower_32_bits(fpdu_info->frags[1].physaddr)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG2_HIGH_IDX, + upper_32_bits(fpdu_info->frags[1].physaddr)); + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG3_LOW_IDX, + lower_32_bits(fpdu_info->frags[2].physaddr)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG3_HIGH_IDX, + upper_32_bits(fpdu_info->frags[2].physaddr)); + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG4_LOW_IDX, + 
lower_32_bits(fpdu_info->frags[3].physaddr)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG4_HIGH_IDX, + upper_32_bits(fpdu_info->frags[3].physaddr)); + + cqp_request->cqp_callback_pointer = fpdu_info; + cqp_request->callback = 1; + cqp_request->cqp_callback = nes_download_callback; + + atomic_set(&cqp_request->refcount, 1); + nes_post_cqp_request(nesdev, cqp_request); + } + + return 0; +} + +static void process_fpdus(struct nes_vnic *nesvnic, struct nes_qp *nesqp) +{ + int again = 1; + unsigned long flags; + + do { + /* Ignore rc - if it failed, tcp retries will cause it to try again */ + forward_fpdus(nesvnic, nesqp); + + spin_lock_irqsave(&nesqp->pau_lock, flags); + if (nesqp->pau_pending) { + nesqp->pau_pending = 0; + } else { + nesqp->pau_busy = 0; + again = 0; + } + + spin_unlock_irqrestore(&nesqp->pau_lock, flags); + } while (again); +} + +/** + * queue_fpdus - Handle fpdu's that hw passed up to sw + */ +static void queue_fpdus(struct sk_buff *skb, struct nes_vnic *nesvnic, struct nes_qp *nesqp) +{ + struct sk_buff *tmpskb; + struct nes_rskb_cb *cb; + struct iphdr *iph; + struct tcphdr *tcph; + unsigned char *tcph_end; + u32 rcv_nxt; + u32 rcv_wnd; + u32 seqnum; + u32 len; + bool process_it = false; + unsigned long flags; + + /* Move data ptr to after tcp header */ + iph = (struct iphdr *)skb->data; + tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl)); + seqnum = be32_to_cpu(tcph->seq); + tcph_end = (((char *)tcph) + (4 * tcph->doff)); + + len = be16_to_cpu(iph->tot_len); + if (skb->len > len) + skb_trim(skb, len); + skb_pull(skb, tcph_end - skb->data); + + /* Initialize tracking values */ + cb = (struct nes_rskb_cb *)&skb->cb[0]; + cb->seqnum = seqnum; + + /* Make sure data is in the receive window */ + rcv_nxt = nesqp->pau_rcv_nxt; + rcv_wnd = le32_to_cpu(nesqp->nesqp_context->rcv_wnd); + if (!between(seqnum, rcv_nxt, (rcv_nxt + rcv_wnd))) { + nes_mgt_free_skb(nesvnic->nesdev, skb, PCI_DMA_TODEVICE); + nes_rem_ref_cm_node(nesqp->cm_node); + return; + } + + spin_lock_irqsave(&nesqp->pau_lock, flags); + + if (nesqp->pau_busy) + nesqp->pau_pending = 1; + else + nesqp->pau_busy = 1; + + /* Queue skb by sequence number */ + if (skb_queue_len(&nesqp->pau_list) == 0) { + skb_queue_head(&nesqp->pau_list, skb); + } else { + tmpskb = nesqp->pau_list.next; + while (tmpskb != (struct sk_buff *)&nesqp->pau_list) { + cb = (struct nes_rskb_cb *)&tmpskb->cb[0]; + if (before(seqnum, cb->seqnum)) + break; + tmpskb = tmpskb->next; + } + skb_insert(tmpskb, skb, &nesqp->pau_list); + } + if (nesqp->pau_state == PAU_READY) + process_it = true; + spin_unlock_irqrestore(&nesqp->pau_lock, flags); + + if (process_it) + process_fpdus(nesvnic, nesqp); + + return; +} + +/** + * mgt_thread - Handle mgt skbs in a safe context + */ +static int mgt_thread(void *context) +{ + struct nes_vnic *nesvnic = context; + struct sk_buff *skb; + struct nes_rskb_cb *cb; + + while (!kthread_should_stop()) { + wait_event_interruptible(nesvnic->mgt_wait_queue, + skb_queue_len(&nesvnic->mgt_skb_list) || kthread_should_stop()); + while ((skb_queue_len(&nesvnic->mgt_skb_list)) && !kthread_should_stop()) { + skb = skb_dequeue(&nesvnic->mgt_skb_list); + cb = (struct nes_rskb_cb *)&skb->cb[0]; + cb->data_start = skb->data - ETH_HLEN; + cb->busaddr = pci_map_single(nesvnic->nesdev->pcidev, cb->data_start, + nesvnic->max_frame_size, PCI_DMA_TODEVICE); + queue_fpdus(skb, nesvnic, cb->nesqp); + } + } + + /* Closing down so delete any entries on the queue */ + while (skb_queue_len(&nesvnic->mgt_skb_list)) { + skb = 
skb_dequeue(&nesvnic->mgt_skb_list); + cb = (struct nes_rskb_cb *)&skb->cb[0]; + nes_rem_ref_cm_node(cb->nesqp->cm_node); + dev_kfree_skb_any(skb); + } + return 0; +} + +/** + * nes_queue_skbs - Queue skb so it can be handled in a thread context + */ +void nes_queue_mgt_skbs(struct sk_buff *skb, struct nes_vnic *nesvnic, struct nes_qp *nesqp) +{ + struct nes_rskb_cb *cb; + + cb = (struct nes_rskb_cb *)&skb->cb[0]; + cb->nesqp = nesqp; + skb_queue_tail(&nesvnic->mgt_skb_list, skb); + wake_up_interruptible(&nesvnic->mgt_wait_queue); +} + +void nes_destroy_pau_qp(struct nes_device *nesdev, struct nes_qp *nesqp) +{ + struct sk_buff *skb; + unsigned long flags; + atomic_inc(&pau_qps_destroyed); + + /* Free packets that have not yet been forwarded */ + /* Lock is acquired by skb_dequeue when removing the skb */ + spin_lock_irqsave(&nesqp->pau_lock, flags); + while (skb_queue_len(&nesqp->pau_list)) { + skb = skb_dequeue(&nesqp->pau_list); + nes_mgt_free_skb(nesdev, skb, PCI_DMA_TODEVICE); + nes_rem_ref_cm_node(nesqp->cm_node); + } + spin_unlock_irqrestore(&nesqp->pau_lock, flags); +} + +static void nes_chg_qh_handler(struct nes_device *nesdev, struct nes_cqp_request *cqp_request) +{ + struct pau_qh_chg *qh_chg = cqp_request->cqp_callback_pointer; + struct nes_cqp_request *new_request; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_adapter *nesadapter; + struct nes_qp *nesqp; + struct nes_v4_quad nes_quad; + u32 crc_value; + u64 u64temp; + + nesadapter = nesdev->nesadapter; + nesqp = qh_chg->nesqp; + + /* Should we handle the bad completion */ + if (cqp_request->major_code) { + printk(KERN_ERR PFX "Invalid cqp_request major_code=0x%x\n", + cqp_request->major_code); + WARN_ON(1); + } + + switch (nesqp->pau_state) { + case PAU_DEL_QH: + /* Old hash code deleted, now set the new one */ + nesqp->pau_state = PAU_ADD_LB_QH; + new_request = nes_get_cqp_request(nesdev); + if (new_request == NULL) { + nes_debug(NES_DBG_PAU, "Failed to get a new_request.\n"); + WARN_ON(1); + return; + } + + memset(&nes_quad, 0, sizeof(nes_quad)); + nes_quad.DstIpAdrIndex = + cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24); + nes_quad.SrcIpadr = cpu_to_be32(0x7f000001); + nes_quad.TcpPorts[0] = swab16(nesqp->nesqp_context->tcpPorts[1]); + nes_quad.TcpPorts[1] = swab16(nesqp->nesqp_context->tcpPorts[0]); + + /* Produce hash key */ + crc_value = get_crc_value(&nes_quad); + nesqp->hte_index = cpu_to_be32(crc_value ^ 0xffffffff); + nes_debug(NES_DBG_PAU, "new HTE Index = 0x%08X, CRC = 0x%08X\n", + nesqp->hte_index, nesqp->hte_index & nesadapter->hte_index_mask); + + nesqp->hte_index &= nesadapter->hte_index_mask; + nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index); + nesqp->nesqp_context->ip0 = cpu_to_le32(0x7f000001); + nesqp->nesqp_context->rcv_nxt = cpu_to_le32(nesqp->pau_rcv_nxt); + + cqp_wqe = &new_request->cqp_wqe; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, + NES_CQP_WQE_OPCODE_IDX, NES_CQP_MANAGE_QUAD_HASH | + NES_CQP_QP_TYPE_IWARP | NES_CQP_QP_CONTEXT_VALID | NES_CQP_QP_IWARP_STATE_RTS); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id); + u64temp = (u64)nesqp->nesqp_context_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); + + nes_debug(NES_DBG_PAU, "Waiting for CQP completion for adding the quad hash.\n"); + + new_request->cqp_callback_pointer = qh_chg; + new_request->callback = 1; + new_request->cqp_callback = nes_chg_qh_handler; + atomic_set(&new_request->refcount, 1); + 
nes_post_cqp_request(nesdev, new_request); + break; + + case PAU_ADD_LB_QH: + /* Start processing the queued fpdu's */ + nesqp->pau_state = PAU_READY; + process_fpdus(qh_chg->nesvnic, qh_chg->nesqp); + kfree(qh_chg); + break; + } +} + +/** + * nes_change_quad_hash + */ +static int nes_change_quad_hash(struct nes_device *nesdev, + struct nes_vnic *nesvnic, struct nes_qp *nesqp) +{ + struct nes_cqp_request *cqp_request = NULL; + struct pau_qh_chg *qh_chg = NULL; + u64 u64temp; + struct nes_hw_cqp_wqe *cqp_wqe; + int ret = 0; + + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_PAU, "Failed to get a cqp_request.\n"); + ret = -ENOMEM; + goto chg_qh_err; + } + + qh_chg = kmalloc(sizeof *qh_chg, GFP_ATOMIC); + if (qh_chg == NULL) { + nes_debug(NES_DBG_PAU, "Failed to get a cqp_request.\n"); + ret = -ENOMEM; + goto chg_qh_err; + } + qh_chg->nesdev = nesdev; + qh_chg->nesvnic = nesvnic; + qh_chg->nesqp = nesqp; + nesqp->pau_state = PAU_DEL_QH; + + cqp_wqe = &cqp_request->cqp_wqe; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, + NES_CQP_WQE_OPCODE_IDX, NES_CQP_MANAGE_QUAD_HASH | NES_CQP_QP_DEL_HTE | + NES_CQP_QP_TYPE_IWARP | NES_CQP_QP_CONTEXT_VALID | NES_CQP_QP_IWARP_STATE_RTS); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id); + u64temp = (u64)nesqp->nesqp_context_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); + + nes_debug(NES_DBG_PAU, "Waiting for CQP completion for deleting the quad hash.\n"); + + cqp_request->cqp_callback_pointer = qh_chg; + cqp_request->callback = 1; + cqp_request->cqp_callback = nes_chg_qh_handler; + atomic_set(&cqp_request->refcount, 1); + nes_post_cqp_request(nesdev, cqp_request); + + return ret; + +chg_qh_err: + kfree(qh_chg); + if (cqp_request) + nes_put_cqp_request(nesdev, cqp_request); + return ret; +} + +/** + * nes_mgt_ce_handler + * This management code deals with any packed and unaligned (pau) fpdu's + * that the hardware cannot handle. 
+ */ +static void nes_mgt_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq) +{ + struct nes_vnic_mgt *mgtvnic = container_of(cq, struct nes_vnic_mgt, mgt_cq); + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 head; + u32 cq_size; + u32 cqe_count = 0; + u32 cqe_misc; + u32 qp_id = 0; + u32 skbs_needed; + unsigned long context; + struct nes_qp *nesqp; + struct sk_buff *rx_skb; + struct nes_rskb_cb *cb; + + head = cq->cq_head; + cq_size = cq->cq_size; + + while (1) { + cqe_misc = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]); + if (!(cqe_misc & NES_NIC_CQE_VALID)) + break; + + nesqp = NULL; + if (cqe_misc & NES_NIC_CQE_ACCQP_VALID) { + qp_id = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_ACCQP_ID_IDX]); + qp_id &= 0x001fffff; + if (qp_id < nesadapter->max_qp) { + context = (unsigned long)nesadapter->qp_table[qp_id - NES_FIRST_QPN]; + nesqp = (struct nes_qp *)context; + } + } + + if (nesqp) { + if (nesqp->pau_mode == false) { + nesqp->pau_mode = true; /* First time for this qp */ + nesqp->pau_rcv_nxt = le32_to_cpu( + cq->cq_vbase[head].cqe_words[NES_NIC_CQE_HASH_RCVNXT]); + skb_queue_head_init(&nesqp->pau_list); + spin_lock_init(&nesqp->pau_lock); + atomic_inc(&pau_qps_created); + nes_change_quad_hash(nesdev, mgtvnic->nesvnic, nesqp); + } + + rx_skb = mgtvnic->mgt.rx_skb[mgtvnic->mgt.rq_tail]; + rx_skb->len = 0; + skb_put(rx_skb, cqe_misc & 0x0000ffff); + rx_skb->protocol = eth_type_trans(rx_skb, mgtvnic->nesvnic->netdev); + cb = (struct nes_rskb_cb *)&rx_skb->cb[0]; + pci_unmap_single(nesdev->pcidev, cb->busaddr, cb->maplen, PCI_DMA_FROMDEVICE); + cb->busaddr = 0; + mgtvnic->mgt.rq_tail++; + mgtvnic->mgt.rq_tail &= mgtvnic->mgt.rq_size - 1; + + nes_add_ref_cm_node(nesqp->cm_node); + nes_queue_mgt_skbs(rx_skb, mgtvnic->nesvnic, nesqp); + } else { + printk(KERN_ERR PFX "Invalid QP %d for packed/unaligned handling\n", qp_id); + } + + cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX] = 0; + cqe_count++; + if (++head >= cq_size) + head = 0; + + if (cqe_count == 255) { + /* Replenish mgt CQ */ + nes_write32(nesdev->regs + NES_CQE_ALLOC, cq->cq_number | (cqe_count << 16)); + nesdev->currcq_count += cqe_count; + cqe_count = 0; + } + + skbs_needed = atomic_inc_return(&mgtvnic->rx_skbs_needed); + if (skbs_needed > (mgtvnic->mgt.rq_size >> 1)) + nes_replenish_mgt_rq(mgtvnic); + } + + cq->cq_head = head; + nes_write32(nesdev->regs + NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | + cq->cq_number | (cqe_count << 16)); + nes_read32(nesdev->regs + NES_CQE_ALLOC); + nesdev->currcq_count += cqe_count; +} + +/** + * nes_init_mgt_qp + */ +int nes_init_mgt_qp(struct nes_device *nesdev, struct net_device *netdev, struct nes_vnic *nesvnic) +{ + struct nes_vnic_mgt *mgtvnic; + u32 counter; + void *vmem; + dma_addr_t pmem; + struct nes_hw_cqp_wqe *cqp_wqe; + u32 cqp_head; + unsigned long flags; + struct nes_hw_nic_qp_context *mgt_context; + u64 u64temp; + struct nes_hw_nic_rq_wqe *mgt_rqe; + struct sk_buff *skb; + u32 wqe_count; + struct nes_rskb_cb *cb; + u32 mgt_mem_size; + void *mgt_vbase; + dma_addr_t mgt_pbase; + int i; + int ret; + + /* Allocate space the all mgt QPs once */ + mgtvnic = kzalloc(NES_MGT_QP_COUNT * sizeof(struct nes_vnic_mgt), GFP_KERNEL); + if (mgtvnic == NULL) { + nes_debug(NES_DBG_INIT, "Unable to allocate memory for mgt structure\n"); + return -ENOMEM; + } + + /* Allocate fragment, RQ, and CQ; Reuse CEQ based on the PCI function */ + /* We are not sending from this NIC so sq is not allocated */ + mgt_mem_size = 256 + + (NES_MGT_WQ_COUNT * 
sizeof(struct nes_hw_nic_rq_wqe)) + + (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_cqe)) + + sizeof(struct nes_hw_nic_qp_context); + mgt_mem_size = (mgt_mem_size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1); + mgt_vbase = pci_alloc_consistent(nesdev->pcidev, NES_MGT_QP_COUNT * mgt_mem_size, &mgt_pbase); + if (!mgt_vbase) { + kfree(mgtvnic); + nes_debug(NES_DBG_INIT, "Unable to allocate memory for mgt host descriptor rings\n"); + return -ENOMEM; + } + + nesvnic->mgt_mem_size = NES_MGT_QP_COUNT * mgt_mem_size; + nesvnic->mgt_vbase = mgt_vbase; + nesvnic->mgt_pbase = mgt_pbase; + + skb_queue_head_init(&nesvnic->mgt_skb_list); + init_waitqueue_head(&nesvnic->mgt_wait_queue); + nesvnic->mgt_thread = kthread_run(mgt_thread, nesvnic, "nes_mgt_thread"); + + for (i = 0; i < NES_MGT_QP_COUNT; i++) { + mgtvnic->nesvnic = nesvnic; + mgtvnic->mgt.qp_id = nesdev->mac_index + NES_MGT_QP_OFFSET + i; + memset(mgt_vbase, 0, mgt_mem_size); + nes_debug(NES_DBG_INIT, "Allocated mgt QP structures at %p (phys = %016lX), size = %u.\n", + mgt_vbase, (unsigned long)mgt_pbase, mgt_mem_size); + + vmem = (void *)(((unsigned long)mgt_vbase + (256 - 1)) & + ~(unsigned long)(256 - 1)); + pmem = (dma_addr_t)(((unsigned long long)mgt_pbase + (256 - 1)) & + ~(unsigned long long)(256 - 1)); + + spin_lock_init(&mgtvnic->mgt.rq_lock); + + /* setup the RQ */ + mgtvnic->mgt.rq_vbase = vmem; + mgtvnic->mgt.rq_pbase = pmem; + mgtvnic->mgt.rq_head = 0; + mgtvnic->mgt.rq_tail = 0; + mgtvnic->mgt.rq_size = NES_MGT_WQ_COUNT; + + /* setup the CQ */ + vmem += (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_rq_wqe)); + pmem += (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_rq_wqe)); + + mgtvnic->mgt_cq.cq_number = mgtvnic->mgt.qp_id; + mgtvnic->mgt_cq.cq_vbase = vmem; + mgtvnic->mgt_cq.cq_pbase = pmem; + mgtvnic->mgt_cq.cq_head = 0; + mgtvnic->mgt_cq.cq_size = NES_MGT_WQ_COUNT; + + mgtvnic->mgt_cq.ce_handler = nes_mgt_ce_handler; + + /* Send CreateCQ request to CQP */ + spin_lock_irqsave(&nesdev->cqp.lock, flags); + cqp_head = nesdev->cqp.sq_head; + + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32( + NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID | + ((u32)mgtvnic->mgt_cq.cq_size << 16)); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32( + mgtvnic->mgt_cq.cq_number | ((u32)nesdev->ceq_index << 16)); + u64temp = (u64)mgtvnic->mgt_cq.cq_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0; + u64temp = (unsigned long)&mgtvnic->mgt_cq; + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] = cpu_to_le32((u32)(u64temp >> 1)); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = + cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0; + + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + /* Send CreateQP request to CQP */ + mgt_context = (void *)(&mgtvnic->mgt_cq.cq_vbase[mgtvnic->mgt_cq.cq_size]); + mgt_context->context_words[NES_NIC_CTX_MISC_IDX] = + cpu_to_le32((u32)NES_MGT_CTX_SIZE | + ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 12)); + nes_debug(NES_DBG_INIT, "RX_WINDOW_BUFFER_PAGE_TABLE_SIZE = 0x%08X, RX_WINDOW_BUFFER_SIZE = 0x%08X\n", + nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_PAGE_TABLE_SIZE), + nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE)); + if (nes_read_indexed(nesdev, 
NES_IDX_RX_WINDOW_BUFFER_SIZE) != 0) + mgt_context->context_words[NES_NIC_CTX_MISC_IDX] |= cpu_to_le32(NES_NIC_BACK_STORE); + + u64temp = (u64)mgtvnic->mgt.rq_pbase; + mgt_context->context_words[NES_NIC_CTX_SQ_LOW_IDX] = cpu_to_le32((u32)u64temp); + mgt_context->context_words[NES_NIC_CTX_SQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32)); + u64temp = (u64)mgtvnic->mgt.rq_pbase; + mgt_context->context_words[NES_NIC_CTX_RQ_LOW_IDX] = cpu_to_le32((u32)u64temp); + mgt_context->context_words[NES_NIC_CTX_RQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32)); + + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_CREATE_QP | + NES_CQP_QP_TYPE_NIC); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(mgtvnic->mgt.qp_id); + u64temp = (u64)mgtvnic->mgt_cq.cq_pbase + + (mgtvnic->mgt_cq.cq_size * sizeof(struct nes_hw_nic_cqe)); + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); + + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + nesdev->cqp.sq_head = cqp_head; + + barrier(); + + /* Ring doorbell (2 WQEs) */ + nes_write32(nesdev->regs + NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id); + + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + nes_debug(NES_DBG_INIT, "Waiting for create MGT QP%u to complete.\n", + mgtvnic->mgt.qp_id); + + ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_INIT, "Create MGT QP%u completed, wait_event_timeout ret = %u.\n", + mgtvnic->mgt.qp_id, ret); + if (!ret) { + nes_debug(NES_DBG_INIT, "MGT QP%u create timeout expired\n", mgtvnic->mgt.qp_id); + if (i == 0) { + pci_free_consistent(nesdev->pcidev, nesvnic->mgt_mem_size, nesvnic->mgt_vbase, + nesvnic->mgt_pbase); + kfree(mgtvnic); + } else { + nes_destroy_mgt(nesvnic); + } + return -EIO; + } + + /* Populate the RQ */ + for (counter = 0; counter < (NES_MGT_WQ_COUNT - 1); counter++) { + skb = dev_alloc_skb(nesvnic->max_frame_size); + if (!skb) { + nes_debug(NES_DBG_INIT, "%s: out of memory for receive skb\n", netdev->name); + return -ENOMEM; + } + + skb->dev = netdev; + + pmem = pci_map_single(nesdev->pcidev, skb->data, + nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); + cb = (struct nes_rskb_cb *)&skb->cb[0]; + cb->busaddr = pmem; + cb->maplen = nesvnic->max_frame_size; + + mgt_rqe = &mgtvnic->mgt.rq_vbase[counter]; + mgt_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = cpu_to_le32((u32)nesvnic->max_frame_size); + mgt_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0; + mgt_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] = cpu_to_le32((u32)pmem); + mgt_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = cpu_to_le32((u32)((u64)pmem >> 32)); + mgtvnic->mgt.rx_skb[counter] = skb; + } + + init_timer(&mgtvnic->rq_wqes_timer); + mgtvnic->rq_wqes_timer.function = nes_mgt_rq_wqes_timeout; + mgtvnic->rq_wqes_timer.data = (unsigned long)mgtvnic; + + wqe_count = NES_MGT_WQ_COUNT - 1; + mgtvnic->mgt.rq_head = wqe_count; + barrier(); + do { + counter = min(wqe_count, ((u32)255)); + wqe_count -= counter; + nes_write32(nesdev->regs + NES_WQE_ALLOC, (counter << 24) | mgtvnic->mgt.qp_id); + } while (wqe_count); + + nes_write32(nesdev->regs + NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | + mgtvnic->mgt_cq.cq_number); + nes_read32(nesdev->regs + NES_CQE_ALLOC); + + mgt_vbase += mgt_mem_size; + mgt_pbase += mgt_mem_size; + nesvnic->mgtvnic[i] = mgtvnic++; + } + return 0; +} + + +void nes_destroy_mgt(struct nes_vnic *nesvnic) +{ + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_vnic_mgt *mgtvnic; + struct nes_vnic_mgt 
*first_mgtvnic; + unsigned long flags; + struct nes_hw_cqp_wqe *cqp_wqe; + u32 cqp_head; + struct sk_buff *rx_skb; + int i; + int ret; + + kthread_stop(nesvnic->mgt_thread); + + /* Free remaining NIC receive buffers */ + first_mgtvnic = nesvnic->mgtvnic[0]; + for (i = 0; i < NES_MGT_QP_COUNT; i++) { + mgtvnic = nesvnic->mgtvnic[i]; + if (mgtvnic == NULL) + continue; + + while (mgtvnic->mgt.rq_head != mgtvnic->mgt.rq_tail) { + rx_skb = mgtvnic->mgt.rx_skb[mgtvnic->mgt.rq_tail]; + nes_mgt_free_skb(nesdev, rx_skb, PCI_DMA_FROMDEVICE); + mgtvnic->mgt.rq_tail++; + mgtvnic->mgt.rq_tail &= (mgtvnic->mgt.rq_size - 1); + } + + spin_lock_irqsave(&nesdev->cqp.lock, flags); + + /* Destroy NIC QP */ + cqp_head = nesdev->cqp.sq_head; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_DESTROY_QP | NES_CQP_QP_TYPE_NIC)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, + mgtvnic->mgt.qp_id); + + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + + /* Destroy NIC CQ */ + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_DESTROY_CQ | ((u32)mgtvnic->mgt_cq.cq_size << 16))); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, + (mgtvnic->mgt_cq.cq_number | ((u32)nesdev->ceq_index << 16))); + + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + + nesdev->cqp.sq_head = cqp_head; + barrier(); + + /* Ring doorbell (2 WQEs) */ + nes_write32(nesdev->regs + NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id); + + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + nes_debug(NES_DBG_SHUTDOWN, "Waiting for CQP, cqp_head=%u, cqp.sq_head=%u," + " cqp.sq_tail=%u, cqp.sq_size=%u\n", + cqp_head, nesdev->cqp.sq_head, + nesdev->cqp.sq_tail, nesdev->cqp.sq_size); + + ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head), + NES_EVENT_TIMEOUT); + + nes_debug(NES_DBG_SHUTDOWN, "Destroy MGT QP returned, wait_event_timeout ret = %u, cqp_head=%u," + " cqp.sq_head=%u, cqp.sq_tail=%u\n", + ret, cqp_head, nesdev->cqp.sq_head, nesdev->cqp.sq_tail); + if (!ret) + nes_debug(NES_DBG_SHUTDOWN, "MGT QP%u destroy timeout expired\n", + mgtvnic->mgt.qp_id); + + nesvnic->mgtvnic[i] = NULL; + } + + if (nesvnic->mgt_vbase) { + pci_free_consistent(nesdev->pcidev, nesvnic->mgt_mem_size, nesvnic->mgt_vbase, + nesvnic->mgt_pbase); + nesvnic->mgt_vbase = NULL; + nesvnic->mgt_pbase = 0; + } + + kfree(first_mgtvnic); +} diff --git a/drivers/infiniband/hw/nes/nes_mgt.h b/drivers/infiniband/hw/nes/nes_mgt.h new file mode 100644 index 000000000000..8c8af254555a --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_mgt.h @@ -0,0 +1,97 @@ +/* +* Copyright (c) 2010 Intel-NE, Inc. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenIB.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. 
+* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ + +#ifndef __NES_MGT_H +#define __NES_MGT_H + +#define MPA_FRAMING 6 /* length is 2 bytes, crc is 4 bytes */ + +int nes_init_mgt_qp(struct nes_device *nesdev, struct net_device *netdev, struct nes_vnic *nesvnic); +void nes_queue_mgt_skbs(struct sk_buff *skb, struct nes_vnic *nesvnic, struct nes_qp *nesqp); +void nes_destroy_mgt(struct nes_vnic *nesvnic); +void nes_destroy_pau_qp(struct nes_device *nesdev, struct nes_qp *nesqp); + +struct nes_hw_mgt { + struct nes_hw_nic_rq_wqe *rq_vbase; /* virtual address of rq */ + dma_addr_t rq_pbase; /* PCI memory for host rings */ + struct sk_buff *rx_skb[NES_NIC_WQ_SIZE]; + u16 qp_id; + u16 sq_head; + u16 rq_head; + u16 rq_tail; + u16 rq_size; + u8 replenishing_rq; + u8 reserved; + spinlock_t rq_lock; +}; + +struct nes_vnic_mgt { + struct nes_vnic *nesvnic; + struct nes_hw_mgt mgt; + struct nes_hw_nic_cq mgt_cq; + atomic_t rx_skbs_needed; + struct timer_list rq_wqes_timer; + atomic_t rx_skb_timer_running; +}; + +#define MAX_FPDU_FRAGS 4 +struct pau_fpdu_frag { + struct sk_buff *skb; + u64 physaddr; + u32 frag_len; + bool cmplt; +}; + +struct pau_fpdu_info { + struct nes_qp *nesqp; + struct nes_cqp_request *cqp_request; + void *hdr_vbase; + dma_addr_t hdr_pbase; + int hdr_len; + u16 data_len; + u16 frag_cnt; + struct pau_fpdu_frag frags[MAX_FPDU_FRAGS]; +}; + +enum pau_qh_state { + PAU_DEL_QH, + PAU_ADD_LB_QH, + PAU_READY +}; + +struct pau_qh_chg { + struct nes_device *nesdev; + struct nes_vnic *nesvnic; + struct nes_qp *nesqp; +}; + +#endif /* __NES_MGT_H */ diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c index 9d7ffebff213..64f91d840366 100644 --- a/drivers/infiniband/hw/nes/nes_nic.c +++ b/drivers/infiniband/hw/nes/nes_nic.c @@ -1090,6 +1090,8 @@ static const char nes_ethtool_stringset[][ETH_GSTRING_LEN] = { "LRO aggregated", "LRO flushed", "LRO no_desc", + "PAU CreateQPs", + "PAU DestroyQPs", }; #define NES_ETHTOOL_STAT_COUNT ARRAY_SIZE(nes_ethtool_stringset) @@ -1305,6 +1307,8 @@ static void nes_netdev_get_ethtool_stats(struct net_device *netdev, target_stat_values[++index] = nesvnic->lro_mgr.stats.aggregated; target_stat_values[++index] = nesvnic->lro_mgr.stats.flushed; target_stat_values[++index] = nesvnic->lro_mgr.stats.no_desc; + target_stat_values[++index] = atomic_read(&pau_qps_created); + target_stat_values[++index] = atomic_read(&pau_qps_destroyed); } /** diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c index f9c417c6b3b3..cd10968bfa22 100644 --- a/drivers/infiniband/hw/nes/nes_utils.c +++ b/drivers/infiniband/hw/nes/nes_utils.c @@ -51,13 +51,34 @@ #include "nes.h" - - static u16 nes_read16_eeprom(void __iomem *addr, u16 offset); u32 mh_detected; u32 mh_pauses_sent; +u32 nes_set_pau(struct nes_device *nesdev) +{ + u32 ret = 0; + u32 counter; + + 
nes_write_indexed(nesdev, NES_IDX_GPR2, NES_ENABLE_PAU); + nes_write_indexed(nesdev, NES_IDX_GPR_TRIGGER, 1); + + for (counter = 0; counter < NES_PAU_COUNTER; counter++) { + udelay(30); + if (!nes_read_indexed(nesdev, NES_IDX_GPR2)) { + printk(KERN_INFO PFX "PAU is supported.\n"); + break; + } + nes_write_indexed(nesdev, NES_IDX_GPR_TRIGGER, 1); + } + if (counter == NES_PAU_COUNTER) { + printk(KERN_INFO PFX "PAU is not supported.\n"); + return -EPERM; + } + return ret; +} + /** * nes_read_eeprom_values - */ @@ -187,6 +208,11 @@ int nes_read_eeprom_values(struct nes_device *nesdev, struct nes_adapter *nesada if (((major_ver == 3) && (minor_ver >= 16)) || (major_ver > 3)) nesadapter->send_term_ok = 1; + if (nes_drv_opt & NES_DRV_OPT_ENABLE_PAU) { + if (!nes_set_pau(nesdev)) + nesadapter->allow_unaligned_fpdus = 1; + } + nesadapter->firmware_version = (((u32)(u8)(eeprom_data>>8)) << 16) + (u32)((u8)eeprom_data); @@ -594,6 +620,7 @@ void nes_put_cqp_request(struct nes_device *nesdev, nes_free_cqp_request(nesdev, cqp_request); } + /** * nes_post_cqp_request */ @@ -604,6 +631,8 @@ void nes_post_cqp_request(struct nes_device *nesdev, unsigned long flags; u32 cqp_head; u64 u64temp; + u32 opcode; + int ctx_index = NES_CQP_WQE_COMP_CTX_LOW_IDX; spin_lock_irqsave(&nesdev->cqp.lock, flags); @@ -614,17 +643,20 @@ void nes_post_cqp_request(struct nes_device *nesdev, nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; memcpy(cqp_wqe, &cqp_request->cqp_wqe, sizeof(*cqp_wqe)); + opcode = le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX]); + if ((opcode & NES_CQP_OPCODE_MASK) == NES_CQP_DOWNLOAD_SEGMENT) + ctx_index = NES_CQP_WQE_DL_COMP_CTX_LOW_IDX; barrier(); u64temp = (unsigned long)cqp_request; - set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_COMP_SCRATCH_LOW_IDX, - u64temp); + set_wqe_64bit_value(cqp_wqe->wqe_words, ctx_index, u64temp); nes_debug(NES_DBG_CQP, "CQP request (opcode 0x%02X), line 1 = 0x%08X put on CQPs SQ," - " request = %p, cqp_head = %u, cqp_tail = %u, cqp_size = %u," - " waiting = %d, refcount = %d.\n", - le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f, - le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX]), cqp_request, - nesdev->cqp.sq_head, nesdev->cqp.sq_tail, nesdev->cqp.sq_size, - cqp_request->waiting, atomic_read(&cqp_request->refcount)); + " request = %p, cqp_head = %u, cqp_tail = %u, cqp_size = %u," + " waiting = %d, refcount = %d.\n", + opcode & NES_CQP_OPCODE_MASK, + le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX]), cqp_request, + nesdev->cqp.sq_head, nesdev->cqp.sq_tail, nesdev->cqp.sq_size, + cqp_request->waiting, atomic_read(&cqp_request->refcount)); + barrier(); /* Ring doorbell (1 WQEs) */ @@ -645,7 +677,6 @@ void nes_post_cqp_request(struct nes_device *nesdev, return; } - /** * nes_arp_table */ diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 394d0e7e4a5c..5095bc41c6cc 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -1458,7 +1458,7 @@ static int nes_destroy_qp(struct ib_qp *ibqp) struct ib_qp_attr attr; struct iw_cm_id *cm_id; struct iw_cm_event cm_event; - int ret; + int ret = 0; atomic_inc(&sw_qps_destroyed); nesqp->destroyed = 1; @@ -1511,7 +1511,6 @@ static int nes_destroy_qp(struct ib_qp *ibqp) if ((nesqp->nesrcq) && (nesqp->nesrcq != nesqp->nesscq)) nes_clean_cq(nesqp, nesqp->nesrcq); } - nes_rem_ref(&nesqp->ibqp); return 0; } diff --git a/drivers/infiniband/hw/nes/nes_verbs.h 
b/drivers/infiniband/hw/nes/nes_verbs.h index 2df9993e0cac..280057353343 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.h +++ b/drivers/infiniband/hw/nes/nes_verbs.h @@ -154,6 +154,7 @@ struct nes_qp { u32 mmap_sq_db_index; u32 mmap_rq_db_index; spinlock_t lock; + spinlock_t pau_lock; struct nes_qp_context *nesqp_context; dma_addr_t nesqp_context_pbase; void *pbl_vbase; @@ -161,6 +162,8 @@ struct nes_qp { struct page *page; struct timer_list terminate_timer; enum ib_event_type terminate_eventtype; + struct sk_buff_head pau_list; + u32 pau_rcv_nxt; u16 active_conn:1; u16 skip_lsmm:1; u16 user_mode:1; @@ -168,7 +171,8 @@ struct nes_qp { u16 flush_issued:1; u16 destroyed:1; u16 sig_all:1; - u16 rsvd:9; + u16 pau_mode:1; + u16 rsvd:8; u16 private_data_len; u16 term_sq_flush_code; u16 term_rq_flush_code; @@ -176,5 +180,8 @@ struct nes_qp { u8 hw_tcp_state; u8 term_flags; u8 sq_kmapped; + u8 pau_busy; + u8 pau_pending; + u8 pau_state; }; #endif /* NES_VERBS_H */ From 3e60a77ea218d1915bdc9127f88ab866fd2ac643 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 7 Oct 2011 13:59:41 -0700 Subject: [PATCH 20/62] IB/ipath: Add missing in ipath_chip_init.c Fix build errors: drivers/infiniband/hw/ipath/ipath_init_chip.c:54:1: error: 'S_IRUGO' undeclared here (not in a function) drivers/infiniband/hw/ipath/ipath_init_chip.c:54:1: error: bit-field '' width not an integer constant drivers/infiniband/hw/ipath/ipath_init_chip.c:67:1: error: 'S_IWUSR' undeclared here (not in a function) drivers/infiniband/hw/ipath/ipath_init_chip.c:67:1: error: bit-field '' width not an integer constant Signed-off-by: Randy Dunlap Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_init_chip.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/ipath/ipath_init_chip.c b/drivers/infiniband/hw/ipath/ipath_init_chip.c index 7c1eebe8c7c9..824a4d508836 100644 --- a/drivers/infiniband/hw/ipath/ipath_init_chip.c +++ b/drivers/infiniband/hw/ipath/ipath_init_chip.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include "ipath_kernel.h" From 71eeba161d7611238ecb6f525a82325aa35339f0 Mon Sep 17 00:00:00 2001 From: Marcel Apfelbaum Date: Wed, 5 Oct 2011 14:21:47 +0300 Subject: [PATCH 21/62] IB: Add new InfiniBand link speeds Introduce support for the following extended speeds: FDR-10: a Mellanox proprietary link speed which is 10.3125 Gbps with 64b/66b encoding rather than 8b/10b encoding. FDR: IBA extended speed 14.0625 Gbps. EDR: IBA extended speed 25.78125 Gbps. 
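For reference, the per-lane speed codes used by the sysfs change below are 1 = SDR (2.5 Gbps), 2 = DDR, 4 = QDR, 8 = FDR-10, 16 = FDR and 32 = EDR. The stand-alone sketch that follows is not part of the patch; it only mirrors how the updated rate_show() turns the active_speed code and link width into the printed rate, including the truncation to whole Gb/sec that the patch applies to the extended speeds. The struct, helper function and example width are illustrative only.

/*
 * Minimal user-space sketch (not kernel code) of the speed-to-rate
 * mapping used by the rate_show() hunk below.  The speed codes and
 * the whole-number per-lane rates come from the patch; everything
 * else is illustrative.
 */
#include <stdio.h>

struct lane_rate {
	int gbps;		/* truncated to whole Gb/sec, as rate_show() prints it */
	const char *name;
};

static struct lane_rate speed_to_lane_rate(int active_speed)
{
	switch (active_speed) {
	case 1:  return (struct lane_rate){ 2,  "SDR" };	/* 2.5 Gbps, printed as "2.5" */
	case 2:  return (struct lane_rate){ 5,  "DDR" };
	case 4:  return (struct lane_rate){ 10, "QDR" };
	case 8:  return (struct lane_rate){ 10, "FDR10" };	/* 10.3125 Gbps, 64b/66b encoding */
	case 16: return (struct lane_rate){ 14, "FDR" };	/* 14.0625 Gbps */
	case 32: return (struct lane_rate){ 25, "EDR" };	/* 25.78125 Gbps */
	default: return (struct lane_rate){ 0,  "unknown" };
	}
}

int main(void)
{
	int width = 4;					/* a 4X link, for illustration */
	struct lane_rate r = speed_to_lane_rate(16);	/* FDR */

	printf("%d Gb/sec (%dX %s)\n", r.gbps * width, width, r.name);
	return 0;
}

For an FDR 4X link this prints "56 Gb/sec (4X FDR)", matching the 56 Gbps rate added to enum ib_rate by this patch.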
Signed-off-by: Marcel Apfelbaum Reviewed-by: Hal Rosenstock Reviewed-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/sysfs.c | 26 ++++++++++++++++++++++---- drivers/infiniband/core/verbs.c | 25 +++++++++++++++++++++++++ include/rdma/ib_verbs.h | 18 +++++++++++++++++- 3 files changed, 64 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 9ab5df72df7b..2b59b72b57f9 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -185,17 +185,35 @@ static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused, if (ret) return ret; + rate = (25 * attr.active_speed) / 10; + switch (attr.active_speed) { - case 2: speed = " DDR"; break; - case 4: speed = " QDR"; break; + case 2: + speed = " DDR"; + break; + case 4: + speed = " QDR"; + break; + case 8: + speed = " FDR10"; + rate = 10; + break; + case 16: + speed = " FDR"; + rate = 14; + break; + case 32: + speed = " EDR"; + rate = 25; + break; } - rate = 25 * ib_width_enum_to_int(attr.active_width) * attr.active_speed; + rate *= ib_width_enum_to_int(attr.active_width); if (rate < 0) return -EINVAL; return sprintf(buf, "%d%s Gb/sec (%dX%s)\n", - rate / 10, rate % 10 ? ".5" : "", + rate, (attr.active_speed == 1) ? ".5" : "", ib_width_enum_to_int(attr.active_width), speed); } diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index af7a8b08b2e9..2abed810d8d5 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -77,6 +77,31 @@ enum ib_rate mult_to_ib_rate(int mult) } EXPORT_SYMBOL(mult_to_ib_rate); +int ib_rate_to_mbps(enum ib_rate rate) +{ + switch (rate) { + case IB_RATE_2_5_GBPS: return 2500; + case IB_RATE_5_GBPS: return 5000; + case IB_RATE_10_GBPS: return 10000; + case IB_RATE_20_GBPS: return 20000; + case IB_RATE_30_GBPS: return 30000; + case IB_RATE_40_GBPS: return 40000; + case IB_RATE_60_GBPS: return 60000; + case IB_RATE_80_GBPS: return 80000; + case IB_RATE_120_GBPS: return 120000; + case IB_RATE_14_GBPS: return 14062; + case IB_RATE_56_GBPS: return 56250; + case IB_RATE_112_GBPS: return 112500; + case IB_RATE_168_GBPS: return 168750; + case IB_RATE_25_GBPS: return 25781; + case IB_RATE_100_GBPS: return 103125; + case IB_RATE_200_GBPS: return 206250; + case IB_RATE_300_GBPS: return 309375; + default: return -1; + } +} +EXPORT_SYMBOL(ib_rate_to_mbps); + enum rdma_transport_type rdma_node_get_transport(enum rdma_node_type node_type) { diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 228be3e220d9..97c98c0d89b1 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -207,6 +207,7 @@ enum ib_port_cap_flags { IB_PORT_SM_DISABLED = 1 << 10, IB_PORT_SYS_IMAGE_GUID_SUP = 1 << 11, IB_PORT_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12, + IB_PORT_EXTENDED_SPEEDS_SUP = 1 << 14, IB_PORT_CM_SUP = 1 << 16, IB_PORT_SNMP_TUNNEL_SUP = 1 << 17, IB_PORT_REINIT_SUP = 1 << 18, @@ -415,7 +416,15 @@ enum ib_rate { IB_RATE_40_GBPS = 7, IB_RATE_60_GBPS = 8, IB_RATE_80_GBPS = 9, - IB_RATE_120_GBPS = 10 + IB_RATE_120_GBPS = 10, + IB_RATE_14_GBPS = 11, + IB_RATE_56_GBPS = 12, + IB_RATE_112_GBPS = 13, + IB_RATE_168_GBPS = 14, + IB_RATE_25_GBPS = 15, + IB_RATE_100_GBPS = 16, + IB_RATE_200_GBPS = 17, + IB_RATE_300_GBPS = 18 }; /** @@ -426,6 +435,13 @@ enum ib_rate { */ int ib_rate_to_mult(enum ib_rate rate) __attribute_const__; +/** + * ib_rate_to_mbps - Convert the IB rate enum to Mbps. + * For example, IB_RATE_2_5_GBPS will be converted to 2500. 
+ * @rate: rate to convert. + */ +int ib_rate_to_mbps(enum ib_rate rate) __attribute_const__; + /** * mult_to_ib_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate * enum. From e36fb88a9a0fb8ac4b87c8ac709214a408de6d97 Mon Sep 17 00:00:00 2001 From: Marcel Apfelbaum Date: Tue, 4 Oct 2011 15:28:23 +0300 Subject: [PATCH 22/62] IPoIB: Handle extended rates in debugfs Use new function ib_rate_to_mbps() to handle printing rate in debugfs, so that we handle extended rates. Signed-off-by: Marcel Apfelbaum Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_fs.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_fs.c b/drivers/infiniband/ulp/ipoib/ipoib_fs.c index 86eae229dc49..0e2fe4631ba8 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_fs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_fs.c @@ -212,16 +212,15 @@ static int ipoib_path_seq_show(struct seq_file *file, void *iter_ptr) gid_buf, path.pathrec.dlid ? "yes" : "no"); if (path.pathrec.dlid) { - rate = ib_rate_to_mult(path.pathrec.rate) * 25; + rate = ib_rate_to_mbps(path.pathrec.rate); seq_printf(file, " DLID: 0x%04x\n" " SL: %12d\n" - " rate: %*d%s Gb/sec\n", + " rate: %8d.%d Gb/sec\n", be16_to_cpu(path.pathrec.dlid), path.pathrec.sl, - 10 - ((rate % 10) ? 2 : 0), - rate / 10, rate % 10 ? ".5" : ""); + rate / 1000, rate % 1000); } seq_putc(file, '\n'); From 59991f94eb32e954aa767f659eb642461e9e8b37 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Mon, 23 May 2011 17:52:46 -0700 Subject: [PATCH 23/62] RDMA/core: Add XRC domain support XRC ("eXtended reliable connected") is an IB transport that provides better scalability by allowing senders to specify which shared receive queue (SRQ) should be used to receive a message, which essentially allows one transport context (QP connection) to serve multiple destinations (as long as they share an adapter, of course). A few new concepts are introduced to support this. This patch adds: - A new device capability flag, IB_DEVICE_XRC, which low-level drivers set to indicate that a device supports XRC. - A new object type, XRC domains (struct ib_xrcd), and new verbs ib_alloc_xrcd()/ib_dealloc_xrcd(). XRCDs are used to limit which XRC SRQs an incoming message can target. This patch is derived from work by Jack Morgenstein . 
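Since the new verbs follow the usual IB verbs conventions, a minimal in-kernel usage sketch looks like the following. Only ib_alloc_xrcd() and ib_dealloc_xrcd() come from this patch; the surrounding function and the device pointer are hypothetical.

/*
 * Hypothetical consumer of the XRCD verbs added below.  "dev" stands
 * in for an ib_device the caller already holds.
 */
#include <linux/err.h>
#include <rdma/ib_verbs.h>

static int example_xrcd_user(struct ib_device *dev)
{
	struct ib_xrcd *xrcd;

	xrcd = ib_alloc_xrcd(dev);
	if (IS_ERR(xrcd))
		return PTR_ERR(xrcd);	/* -ENOSYS if the driver has no alloc_xrcd method */

	/*
	 * ... XRC SRQs and QPs created against this domain (added by
	 * later patches in this series) each bump xrcd->usecnt ...
	 */

	return ib_dealloc_xrcd(xrcd);	/* fails with -EBUSY while usecnt is non-zero */
}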
Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/verbs.c | 26 ++++++++++++++++++++++++++ include/rdma/ib_verbs.h | 22 ++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index af7a8b08b2e9..5e03ab78fc8e 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -920,3 +920,29 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) return qp->device->detach_mcast(qp, gid, lid); } EXPORT_SYMBOL(ib_detach_mcast); + +struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device) +{ + struct ib_xrcd *xrcd; + + if (!device->alloc_xrcd) + return ERR_PTR(-ENOSYS); + + xrcd = device->alloc_xrcd(device, NULL, NULL); + if (!IS_ERR(xrcd)) { + xrcd->device = device; + atomic_set(&xrcd->usecnt, 0); + } + + return xrcd; +} +EXPORT_SYMBOL(ib_alloc_xrcd); + +int ib_dealloc_xrcd(struct ib_xrcd *xrcd) +{ + if (atomic_read(&xrcd->usecnt)) + return -EBUSY; + + return xrcd->device->dealloc_xrcd(xrcd); +} +EXPORT_SYMBOL(ib_dealloc_xrcd); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 228be3e220d9..d2a5c9b991d1 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -112,6 +112,7 @@ enum ib_device_cap_flags { */ IB_DEVICE_UD_IP_CSUM = (1<<18), IB_DEVICE_UD_TSO = (1<<19), + IB_DEVICE_XRC = (1<<20), IB_DEVICE_MEM_MGT_EXTENSIONS = (1<<21), IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1<<22), }; @@ -858,6 +859,11 @@ struct ib_pd { atomic_t usecnt; /* count all resources */ }; +struct ib_xrcd { + struct ib_device *device; + atomic_t usecnt; /* count all resources */ +}; + struct ib_ah { struct ib_device *device; struct ib_pd *pd; @@ -1149,6 +1155,10 @@ struct ib_device { struct ib_grh *in_grh, struct ib_mad *in_mad, struct ib_mad *out_mad); + struct ib_xrcd * (*alloc_xrcd)(struct ib_device *device, + struct ib_ucontext *ucontext, + struct ib_udata *udata); + int (*dealloc_xrcd)(struct ib_xrcd *xrcd); struct ib_dma_mapping_ops *dma_ops; @@ -2060,4 +2070,16 @@ int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); */ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); +/** + * ib_alloc_xrcd - Allocates an XRC domain. + * @device: The device on which to allocate the XRC domain. + */ +struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device); + +/** + * ib_dealloc_xrcd - Deallocates an XRC domain. + * @xrcd: The XRC domain to deallocate. + */ +int ib_dealloc_xrcd(struct ib_xrcd *xrcd); + #endif /* IB_VERBS_H */ From 96104eda01695a26da2c8f7423ec0ba3509c8c97 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Mon, 23 May 2011 16:31:36 -0700 Subject: [PATCH 24/62] RDMA/core: Add SRQ type field Currently, there is only a single ("basic") type of SRQ, but with XRC support we will add a second. Prepare for this by defining an SRQ type and setting all current users to IB_SRQT_BASIC. 
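For existing consumers the conversion is mechanical, as the ipoib_cm hunk below shows. A minimal sketch of a basic SRQ user after this patch follows; the event handler, context and queue sizes are placeholders, not values taken from the patch.

/*
 * Sketch of an SRQ consumer after this patch: the only new requirement
 * is the explicit .srq_type = IB_SRQT_BASIC.
 */
#include <rdma/ib_verbs.h>

static struct ib_srq *example_create_basic_srq(struct ib_pd *pd)
{
	struct ib_srq_init_attr init_attr = {
		.event_handler	= NULL,			/* placeholder */
		.srq_context	= NULL,			/* placeholder */
		.srq_type	= IB_SRQT_BASIC,
		.attr = {
			.max_wr		= 256,		/* placeholder sizes */
			.max_sge	= 1,
			.srq_limit	= 0,
		},
	};

	/* Drivers that only support basic SRQs now reject any other srq_type. */
	return ib_create_srq(pd, &init_attr);
}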
Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs_cmd.c | 1 + drivers/infiniband/core/verbs.c | 1 + drivers/infiniband/hw/ehca/ehca_qp.c | 3 +++ drivers/infiniband/hw/ipath/ipath_srq.c | 5 +++++ drivers/infiniband/hw/mlx4/srq.c | 3 +++ drivers/infiniband/hw/mthca/mthca_provider.c | 3 +++ drivers/infiniband/hw/qib/qib_srq.c | 5 +++++ drivers/infiniband/ulp/ipoib/ipoib_cm.c | 1 + include/rdma/ib_verbs.h | 6 ++++++ 9 files changed, 28 insertions(+) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index c42699285f8e..48d21d09afe7 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2013,6 +2013,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, attr.event_handler = ib_uverbs_srq_event_handler; attr.srq_context = file; + attr.srq_type = IB_SRQT_BASIC; attr.attr.max_wr = cmd.max_wr; attr.attr.max_sge = cmd.max_sge; attr.attr.srq_limit = cmd.srq_limit; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 5e03ab78fc8e..cbe85ddaf898 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -250,6 +250,7 @@ struct ib_srq *ib_create_srq(struct ib_pd *pd, srq->uobject = NULL; srq->event_handler = srq_init_attr->event_handler; srq->srq_context = srq_init_attr->srq_context; + srq->srq_type = srq_init_attr->srq_type; atomic_inc(&pd->usecnt); atomic_set(&srq->usecnt, 0); } diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c index 32fb34201aba..964f85520798 100644 --- a/drivers/infiniband/hw/ehca/ehca_qp.c +++ b/drivers/infiniband/hw/ehca/ehca_qp.c @@ -977,6 +977,9 @@ struct ib_srq *ehca_create_srq(struct ib_pd *pd, struct hcp_modify_qp_control_block *mqpcb; u64 hret, update_mask; + if (srq_init_attr->srq_type != IB_SRQT_BASIC) + return ERR_PTR(-ENOSYS); + /* For common attributes, internal_create_qp() takes its info * out of qp_init_attr, so copy all common attrs there. 
*/ diff --git a/drivers/infiniband/hw/ipath/ipath_srq.c b/drivers/infiniband/hw/ipath/ipath_srq.c index 386e2c717c53..26271984b717 100644 --- a/drivers/infiniband/hw/ipath/ipath_srq.c +++ b/drivers/infiniband/hw/ipath/ipath_srq.c @@ -107,6 +107,11 @@ struct ib_srq *ipath_create_srq(struct ib_pd *ibpd, u32 sz; struct ib_srq *ret; + if (srq_init_attr->srq_type != IB_SRQT_BASIC) { + ret = ERR_PTR(-ENOSYS); + goto done; + } + if (srq_init_attr->attr.max_wr == 0) { ret = ERR_PTR(-EINVAL); goto done; diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index 818b7ecace5e..4f7f7600d27c 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -81,6 +81,9 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, int err; int i; + if (init_attr->srq_type != IB_SRQT_BASIC) + return ERR_PTR(-ENOSYS); + /* Sanity check SRQ size before proceeding */ if (init_attr->attr.max_wr >= dev->dev->caps.max_srq_wqes || init_attr->attr.max_sge > dev->dev->caps.max_srq_sge) diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 365fe0e14192..cb9a0b976804 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -438,6 +438,9 @@ static struct ib_srq *mthca_create_srq(struct ib_pd *pd, struct mthca_srq *srq; int err; + if (init_attr->srq_type != IB_SRQT_BASIC) + return ERR_PTR(-ENOSYS); + srq = kmalloc(sizeof *srq, GFP_KERNEL); if (!srq) return ERR_PTR(-ENOMEM); diff --git a/drivers/infiniband/hw/qib/qib_srq.c b/drivers/infiniband/hw/qib/qib_srq.c index c3ec8efc2ed8..d6235931a1ba 100644 --- a/drivers/infiniband/hw/qib/qib_srq.c +++ b/drivers/infiniband/hw/qib/qib_srq.c @@ -107,6 +107,11 @@ struct ib_srq *qib_create_srq(struct ib_pd *ibpd, u32 sz; struct ib_srq *ret; + if (srq_init_attr->srq_type != IB_SRQT_BASIC) { + ret = ERR_PTR(-ENOSYS); + goto done; + } + if (srq_init_attr->attr.max_sge == 0 || srq_init_attr->attr.max_sge > ib_qib_max_srq_sges || srq_init_attr->attr.max_wr == 0 || diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 39913a065f99..ef003cc54a43 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -1496,6 +1496,7 @@ static void ipoib_cm_create_srq(struct net_device *dev, int max_sge) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ib_srq_init_attr srq_init_attr = { + .srq_type = IB_SRQT_BASIC, .attr = { .max_wr = ipoib_recvq_size, .max_sge = max_sge diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index d2a5c9b991d1..d0c2dc034054 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -523,6 +523,10 @@ enum ib_cq_notify_flags { IB_CQ_REPORT_MISSED_EVENTS = 1 << 2, }; +enum ib_srq_type { + IB_SRQT_BASIC +}; + enum ib_srq_attr_mask { IB_SRQ_MAX_WR = 1 << 0, IB_SRQ_LIMIT = 1 << 1, @@ -538,6 +542,7 @@ struct ib_srq_init_attr { void (*event_handler)(struct ib_event *, void *); void *srq_context; struct ib_srq_attr attr; + enum ib_srq_type srq_type; }; struct ib_qp_cap { @@ -888,6 +893,7 @@ struct ib_srq { struct ib_uobject *uobject; void (*event_handler)(struct ib_event *, void *); void *srq_context; + enum ib_srq_type srq_type; atomic_t usecnt; }; From 418d51307d102e72e745031adb4f5ba0ddb646e2 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Mon, 23 May 2011 19:42:29 -0700 Subject: [PATCH 25/62] RDMA/core: Add XRC SRQ type XRC ("eXtended reliable connected") is an IB transport that provides better scalability 
by allowing senders to specify which shared receive queue (SRQ) should be used to receive a message, which essentially allows one transport context (QP connection) to serve multiple destinations (as long as they share an adapter, of course). XRC defines SRQs that are specifically used by XRC connections. Expand the SRQ code to support XRC SRQs. An XRC SRQ is currently restricted to only XRC use according to the IB XRC Annex. Portions of this patch were derived from work by Jack Morgenstein . Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/verbs.c | 21 ++++++++++++++++++++- include/rdma/ib_verbs.h | 18 +++++++++++++++++- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index cbe85ddaf898..25c1c1ea4460 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -251,6 +251,12 @@ struct ib_srq *ib_create_srq(struct ib_pd *pd, srq->event_handler = srq_init_attr->event_handler; srq->srq_context = srq_init_attr->srq_context; srq->srq_type = srq_init_attr->srq_type; + if (srq->srq_type == IB_SRQT_XRC) { + srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd; + srq->ext.xrc.cq = srq_init_attr->ext.xrc.cq; + atomic_inc(&srq->ext.xrc.xrcd->usecnt); + atomic_inc(&srq->ext.xrc.cq->usecnt); + } atomic_inc(&pd->usecnt); atomic_set(&srq->usecnt, 0); } @@ -280,16 +286,29 @@ EXPORT_SYMBOL(ib_query_srq); int ib_destroy_srq(struct ib_srq *srq) { struct ib_pd *pd; + enum ib_srq_type srq_type; + struct ib_xrcd *uninitialized_var(xrcd); + struct ib_cq *uninitialized_var(cq); int ret; if (atomic_read(&srq->usecnt)) return -EBUSY; pd = srq->pd; + srq_type = srq->srq_type; + if (srq_type == IB_SRQT_XRC) { + xrcd = srq->ext.xrc.xrcd; + cq = srq->ext.xrc.cq; + } ret = srq->device->destroy_srq(srq); - if (!ret) + if (!ret) { atomic_dec(&pd->usecnt); + if (srq_type == IB_SRQT_XRC) { + atomic_dec(&xrcd->usecnt); + atomic_dec(&cq->usecnt); + } + } return ret; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index d0c2dc034054..516647a22135 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -524,7 +524,8 @@ enum ib_cq_notify_flags { }; enum ib_srq_type { - IB_SRQT_BASIC + IB_SRQT_BASIC, + IB_SRQT_XRC }; enum ib_srq_attr_mask { @@ -543,6 +544,13 @@ struct ib_srq_init_attr { void *srq_context; struct ib_srq_attr attr; enum ib_srq_type srq_type; + + union { + struct { + struct ib_xrcd *xrcd; + struct ib_cq *cq; + } xrc; + } ext; }; struct ib_qp_cap { @@ -895,6 +903,14 @@ struct ib_srq { void *srq_context; enum ib_srq_type srq_type; atomic_t usecnt; + + union { + struct { + struct ib_xrcd *xrcd; + struct ib_cq *cq; + u32 srq_num; + } xrc; + } ext; }; struct ib_qp { From b42b63cf0dde2af6eec462b2d6cca7d938702a28 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Mon, 23 May 2011 19:59:25 -0700 Subject: [PATCH 26/62] RDMA/core: Add XRC QPs XRC ("eXtended reliable connected") is an IB transport that provides better scalability by allowing senders to specify which shared receive queue (SRQ) should be used to receive a message, which essentially allows one transport context (QP connection) to serve multiple destinations (as long as they share an adapter, of course). XRC communication is between an initiator (INI) QP and a target (TGT) QP. Target QPs are associated with SRQs through an XRCD. An XRC TGT QP behaves like a receive-only RD QP. 
XRC INI QPs behave similarly to RC QPs, except that work requests posted to an XRC INI QP must specify the remote SRQ that is the target of the work request. We define two new QP types for XRC, to distinguish between INI and TGT QPs, and update the core layer to support XRC QPs. This patch is derived from work by Jack Morgenstein Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/verbs.c | 150 +++++++++++++++++++++++++++----- include/rdma/ib_verbs.h | 9 +- 2 files changed, 138 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 25c1c1ea4460..89277e5129be 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -320,24 +320,44 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr) { struct ib_qp *qp; + struct ib_device *device; - qp = pd->device->create_qp(pd, qp_init_attr, NULL); + device = pd ? pd->device : qp_init_attr->xrcd->device; + qp = device->create_qp(pd, qp_init_attr, NULL); if (!IS_ERR(qp)) { - qp->device = pd->device; - qp->pd = pd; - qp->send_cq = qp_init_attr->send_cq; - qp->recv_cq = qp_init_attr->recv_cq; - qp->srq = qp_init_attr->srq; + qp->device = device; + + if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) { + qp->pd = NULL; + qp->send_cq = qp->recv_cq = NULL; + qp->srq = NULL; + qp->xrcd = qp_init_attr->xrcd; + atomic_inc(&qp_init_attr->xrcd->usecnt); + } else { + if (qp_init_attr->qp_type == IB_QPT_XRC_INI) { + qp->recv_cq = NULL; + qp->srq = NULL; + } else { + qp->recv_cq = qp_init_attr->recv_cq; + atomic_inc(&qp_init_attr->recv_cq->usecnt); + qp->srq = qp_init_attr->srq; + if (qp->srq) + atomic_inc(&qp_init_attr->srq->usecnt); + } + + qp->pd = pd; + qp->send_cq = qp_init_attr->send_cq; + qp->xrcd = NULL; + + atomic_inc(&pd->usecnt); + atomic_inc(&qp_init_attr->send_cq->usecnt); + } + qp->uobject = NULL; qp->event_handler = qp_init_attr->event_handler; qp->qp_context = qp_init_attr->qp_context; qp->qp_type = qp_init_attr->qp_type; - atomic_inc(&pd->usecnt); - atomic_inc(&qp_init_attr->send_cq->usecnt); - atomic_inc(&qp_init_attr->recv_cq->usecnt); - if (qp_init_attr->srq) - atomic_inc(&qp_init_attr->srq->usecnt); } return qp; @@ -346,8 +366,8 @@ EXPORT_SYMBOL(ib_create_qp); static const struct { int valid; - enum ib_qp_attr_mask req_param[IB_QPT_RAW_ETHERTYPE + 1]; - enum ib_qp_attr_mask opt_param[IB_QPT_RAW_ETHERTYPE + 1]; + enum ib_qp_attr_mask req_param[IB_QPT_MAX]; + enum ib_qp_attr_mask opt_param[IB_QPT_MAX]; } qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { [IB_QPS_RESET] = { [IB_QPS_RESET] = { .valid = 1 }, @@ -363,6 +383,12 @@ static const struct { [IB_QPT_RC] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), + [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | @@ -385,6 +411,12 @@ static const struct { [IB_QPT_RC] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), + [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | @@ -404,6 +436,16 @@ static const struct { IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER), + [IB_QPT_XRC_INI] = (IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_DEST_QPN | + IB_QP_RQ_PSN), + [IB_QPT_XRC_TGT] 
= (IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_DEST_QPN | + IB_QP_RQ_PSN | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_MIN_RNR_TIMER), }, .opt_param = { [IB_QPT_UD] = (IB_QP_PKEY_INDEX | @@ -414,6 +456,12 @@ static const struct { [IB_QPT_RC] = (IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX), + [IB_QPT_XRC_INI] = (IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX), + [IB_QPT_XRC_TGT] = (IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX), [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | @@ -434,6 +482,13 @@ static const struct { IB_QP_RNR_RETRY | IB_QP_SQ_PSN | IB_QP_MAX_QP_RD_ATOMIC), + [IB_QPT_XRC_INI] = (IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_SQ_PSN | + IB_QP_MAX_QP_RD_ATOMIC), + [IB_QPT_XRC_TGT] = (IB_QP_TIMEOUT | + IB_QP_SQ_PSN), [IB_QPT_SMI] = IB_QP_SQ_PSN, [IB_QPT_GSI] = IB_QP_SQ_PSN, }, @@ -449,6 +504,15 @@ static const struct { IB_QP_ACCESS_FLAGS | IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_CUR_STATE | @@ -473,6 +537,15 @@ static const struct { IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE | IB_QP_MIN_RNR_TIMER), + [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_ALT_PATH | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_ALT_PATH | + IB_QP_PATH_MIG_STATE | + IB_QP_MIN_RNR_TIMER), [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_CUR_STATE | @@ -485,6 +558,8 @@ static const struct { [IB_QPT_UD] = IB_QP_EN_SQD_ASYNC_NOTIFY, [IB_QPT_UC] = IB_QP_EN_SQD_ASYNC_NOTIFY, [IB_QPT_RC] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_XRC_INI] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_XRC_TGT] = IB_QP_EN_SQD_ASYNC_NOTIFY, /* ??? 
*/ [IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY, [IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY } @@ -507,6 +582,15 @@ static const struct { IB_QP_ACCESS_FLAGS | IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_CUR_STATE | @@ -535,6 +619,25 @@ static const struct { IB_QP_PKEY_INDEX | IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_INI] = (IB_QP_PORT | + IB_QP_AV | + IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_MAX_QP_RD_ATOMIC | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_TGT] = (IB_QP_PORT | + IB_QP_AV | + IB_QP_TIMEOUT | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | @@ -619,20 +722,27 @@ int ib_destroy_qp(struct ib_qp *qp) struct ib_pd *pd; struct ib_cq *scq, *rcq; struct ib_srq *srq; + struct ib_xrcd *xrcd; int ret; - pd = qp->pd; - scq = qp->send_cq; - rcq = qp->recv_cq; - srq = qp->srq; + pd = qp->pd; + scq = qp->send_cq; + rcq = qp->recv_cq; + srq = qp->srq; + xrcd = qp->xrcd; ret = qp->device->destroy_qp(qp); if (!ret) { - atomic_dec(&pd->usecnt); - atomic_dec(&scq->usecnt); - atomic_dec(&rcq->usecnt); + if (pd) + atomic_dec(&pd->usecnt); + if (scq) + atomic_dec(&scq->usecnt); + if (rcq) + atomic_dec(&rcq->usecnt); if (srq) atomic_dec(&srq->usecnt); + if (xrcd) + atomic_dec(&xrcd->usecnt); } return ret; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 516647a22135..c3d7602b5e90 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -579,7 +579,11 @@ enum ib_qp_type { IB_QPT_UC, IB_QPT_UD, IB_QPT_RAW_IPV6, - IB_QPT_RAW_ETHERTYPE + IB_QPT_RAW_ETHERTYPE, + /* Save 8 for RAW_PACKET */ + IB_QPT_XRC_INI = 9, + IB_QPT_XRC_TGT, + IB_QPT_MAX }; enum ib_qp_create_flags { @@ -593,6 +597,7 @@ struct ib_qp_init_attr { struct ib_cq *send_cq; struct ib_cq *recv_cq; struct ib_srq *srq; + struct ib_xrcd *xrcd; /* XRC TGT QPs only */ struct ib_qp_cap cap; enum ib_sig_type sq_sig_type; enum ib_qp_type qp_type; @@ -784,6 +789,7 @@ struct ib_send_wr { u32 rkey; } fast_reg; } wr; + u32 xrc_remote_srq_num; /* XRC TGT QPs only */ }; struct ib_recv_wr { @@ -919,6 +925,7 @@ struct ib_qp { struct ib_cq *send_cq; struct ib_cq *recv_cq; struct ib_srq *srq; + struct ib_xrcd *xrcd; /* XRC TGT QPs only */ struct ib_uobject *uobject; void (*event_handler)(struct ib_event *, void *); void *qp_context; From d3d72d909e783d048ee39046aa7b4fa798a4dda8 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 26 May 2011 23:06:44 -0700 Subject: [PATCH 27/62] RDMA/verbs: Cleanup XRC TGT QPs when destroying XRCD XRC TGT QPs are intended to be shared among multiple users and processes. Allow the destruction of an XRC TGT QP to be done explicitly through ib_destroy_qp() or when the XRCD is destroyed. To support destroying an XRC TGT QP, we need to track TGT QPs with the XRCD. When the XRCD is destroyed, all tracked XRC TGT QPs are also cleaned up. To avoid stale reference issues, if a user is holding a reference on a TGT QP, we increment a reference count on the QP. The user releases the reference by calling ib_release_qp. 
This releases any access to the QP from a user above verbs, but allows the QP to continue to exist until destroyed by the XRCD. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/verbs.c | 47 +++++++++++++++++++++++++++++++++ include/rdma/ib_verbs.h | 17 +++++++++++- 2 files changed, 63 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 89277e5129be..8c6da5bda4c6 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -316,6 +316,20 @@ EXPORT_SYMBOL(ib_destroy_srq); /* Queue pairs */ +static void __ib_insert_xrcd_qp(struct ib_xrcd *xrcd, struct ib_qp *qp) +{ + mutex_lock(&xrcd->tgt_qp_mutex); + list_add(&qp->xrcd_list, &xrcd->tgt_qp_list); + mutex_unlock(&xrcd->tgt_qp_mutex); +} + +static void __ib_remove_xrcd_qp(struct ib_xrcd *xrcd, struct ib_qp *qp) +{ + mutex_lock(&xrcd->tgt_qp_mutex); + list_del(&qp->xrcd_list); + mutex_unlock(&xrcd->tgt_qp_mutex); +} + struct ib_qp *ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr) { @@ -334,6 +348,7 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, qp->srq = NULL; qp->xrcd = qp_init_attr->xrcd; atomic_inc(&qp_init_attr->xrcd->usecnt); + __ib_insert_xrcd_qp(qp_init_attr->xrcd, qp); } else { if (qp_init_attr->qp_type == IB_QPT_XRC_INI) { qp->recv_cq = NULL; @@ -730,6 +745,8 @@ int ib_destroy_qp(struct ib_qp *qp) rcq = qp->recv_cq; srq = qp->srq; xrcd = qp->xrcd; + if (xrcd) + __ib_remove_xrcd_qp(xrcd, qp); ret = qp->device->destroy_qp(qp); if (!ret) { @@ -743,12 +760,30 @@ int ib_destroy_qp(struct ib_qp *qp) atomic_dec(&srq->usecnt); if (xrcd) atomic_dec(&xrcd->usecnt); + } else if (xrcd) { + __ib_insert_xrcd_qp(xrcd, qp); } return ret; } EXPORT_SYMBOL(ib_destroy_qp); +int ib_release_qp(struct ib_qp *qp) +{ + unsigned long flags; + + if (qp->qp_type != IB_QPT_XRC_TGT) + return -EINVAL; + + spin_lock_irqsave(&qp->device->event_handler_lock, flags); + qp->event_handler = NULL; + spin_unlock_irqrestore(&qp->device->event_handler_lock, flags); + + atomic_dec(&qp->xrcd->usecnt); + return 0; +} +EXPORT_SYMBOL(ib_release_qp); + /* Completion queues */ struct ib_cq *ib_create_cq(struct ib_device *device, @@ -1062,6 +1097,8 @@ struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device) if (!IS_ERR(xrcd)) { xrcd->device = device; atomic_set(&xrcd->usecnt, 0); + mutex_init(&xrcd->tgt_qp_mutex); + INIT_LIST_HEAD(&xrcd->tgt_qp_list); } return xrcd; @@ -1070,9 +1107,19 @@ EXPORT_SYMBOL(ib_alloc_xrcd); int ib_dealloc_xrcd(struct ib_xrcd *xrcd) { + struct ib_qp *qp; + int ret; + if (atomic_read(&xrcd->usecnt)) return -EBUSY; + while (!list_empty(&xrcd->tgt_qp_list)) { + qp = list_entry(xrcd->tgt_qp_list.next, struct ib_qp, xrcd_list); + ret = ib_destroy_qp(qp); + if (ret) + return ret; + } + return xrcd->device->dealloc_xrcd(xrcd); } EXPORT_SYMBOL(ib_dealloc_xrcd); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index c3d7602b5e90..ac46dcf04358 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -880,7 +880,10 @@ struct ib_pd { struct ib_xrcd { struct ib_device *device; - atomic_t usecnt; /* count all resources */ + atomic_t usecnt; /* count all exposed resources */ + + struct mutex tgt_qp_mutex; + struct list_head tgt_qp_list; }; struct ib_ah { @@ -926,6 +929,7 @@ struct ib_qp { struct ib_cq *recv_cq; struct ib_srq *srq; struct ib_xrcd *xrcd; /* XRC TGT QPs only */ + struct list_head xrcd_list; struct ib_uobject *uobject; void (*event_handler)(struct ib_event *, void *); void *qp_context; @@ 
-1481,6 +1485,17 @@ int ib_query_qp(struct ib_qp *qp, */ int ib_destroy_qp(struct ib_qp *qp); +/** + * ib_release_qp - Release an external reference to a QP. + * @qp: The QP handle to release + * + * The specified QP handle is released by the caller. If the QP is + * referenced internally, it is not destroyed until all internal + * references are released. After releasing the qp, the caller + * can no longer access it and all events on the QP are discarded. + */ +int ib_release_qp(struct ib_qp *qp); + /** * ib_post_send - Posts a list of work requests to the send queue of * the specified QP. From 53d0bd1e7ff2fc626321f337c609fb76ae5d12c9 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Tue, 24 May 2011 08:33:46 -0700 Subject: [PATCH 28/62] RDMA/uverbs: Export XRC domains to user space Allow user space to create XRC domains. Because XRCDs are expected to be shared among multiple processes, we use inodes to identify an XRCD. Based on patches by Jack Morgenstein Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs.h | 11 + drivers/infiniband/core/uverbs_cmd.c | 318 ++++++++++++++++++++++++++ drivers/infiniband/core/uverbs_main.c | 17 ++ drivers/infiniband/core/verbs.c | 1 + include/rdma/ib_user_verbs.h | 19 +- include/rdma/ib_verbs.h | 2 + 6 files changed, 367 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index a078e5624d22..461e2f357c1e 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -76,6 +76,8 @@ struct ib_uverbs_device { struct ib_device *ib_dev; int devnum; struct cdev cdev; + struct rb_root xrcd_tree; + struct mutex xrcd_tree_mutex; }; struct ib_uverbs_event_file { @@ -120,6 +122,11 @@ struct ib_uevent_object { u32 events_reported; }; +struct ib_uxrcd_object { + struct ib_uobject uobject; + atomic_t refcnt; +}; + struct ib_uqp_object { struct ib_uevent_object uevent; struct list_head mcast_list; @@ -142,6 +149,7 @@ extern struct idr ib_uverbs_ah_idr; extern struct idr ib_uverbs_cq_idr; extern struct idr ib_uverbs_qp_idr; extern struct idr ib_uverbs_srq_idr; +extern struct idr ib_uverbs_xrcd_idr; void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj); @@ -161,6 +169,7 @@ void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr); void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr); void ib_uverbs_event_handler(struct ib_event_handler *handler, struct ib_event *event); +void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd); #define IB_UVERBS_DECLARE_CMD(name) \ ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \ @@ -195,5 +204,7 @@ IB_UVERBS_DECLARE_CMD(create_srq); IB_UVERBS_DECLARE_CMD(modify_srq); IB_UVERBS_DECLARE_CMD(query_srq); IB_UVERBS_DECLARE_CMD(destroy_srq); +IB_UVERBS_DECLARE_CMD(open_xrcd); +IB_UVERBS_DECLARE_CMD(close_xrcd); #endif /* UVERBS_H */ diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 48d21d09afe7..c8b2a843fa00 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -47,6 +47,7 @@ static struct lock_class_key cq_lock_key; static struct lock_class_key qp_lock_key; static struct lock_class_key ah_lock_key; static struct lock_class_key srq_lock_key; +static struct lock_class_key xrcd_lock_key; #define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \ do { \ @@ -255,6 +256,18 @@ static void put_srq_read(struct ib_srq *srq) put_uobj_read(srq->uobject); } +static struct ib_xrcd 
*idr_read_xrcd(int xrcd_handle, struct ib_ucontext *context, + struct ib_uobject **uobj) +{ + *uobj = idr_read_uobj(&ib_uverbs_xrcd_idr, xrcd_handle, context, 0); + return *uobj ? (*uobj)->object : NULL; +} + +static void put_xrcd_read(struct ib_uobject *uobj) +{ + put_uobj_read(uobj); +} + ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) @@ -298,6 +311,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, INIT_LIST_HEAD(&ucontext->qp_list); INIT_LIST_HEAD(&ucontext->srq_list); INIT_LIST_HEAD(&ucontext->ah_list); + INIT_LIST_HEAD(&ucontext->xrcd_list); ucontext->closing = 0; resp.num_comp_vectors = file->device->num_comp_vectors; @@ -579,6 +593,310 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file, return in_len; } +struct xrcd_table_entry { + struct rb_node node; + struct ib_xrcd *xrcd; + struct inode *inode; +}; + +static int xrcd_table_insert(struct ib_uverbs_device *dev, + struct inode *inode, + struct ib_xrcd *xrcd) +{ + struct xrcd_table_entry *entry, *scan; + struct rb_node **p = &dev->xrcd_tree.rb_node; + struct rb_node *parent = NULL; + + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + return -ENOMEM; + + entry->xrcd = xrcd; + entry->inode = inode; + + while (*p) { + parent = *p; + scan = rb_entry(parent, struct xrcd_table_entry, node); + + if (inode < scan->inode) { + p = &(*p)->rb_left; + } else if (inode > scan->inode) { + p = &(*p)->rb_right; + } else { + kfree(entry); + return -EEXIST; + } + } + + rb_link_node(&entry->node, parent, p); + rb_insert_color(&entry->node, &dev->xrcd_tree); + igrab(inode); + return 0; +} + +static struct xrcd_table_entry *xrcd_table_search(struct ib_uverbs_device *dev, + struct inode *inode) +{ + struct xrcd_table_entry *entry; + struct rb_node *p = dev->xrcd_tree.rb_node; + + while (p) { + entry = rb_entry(p, struct xrcd_table_entry, node); + + if (inode < entry->inode) + p = p->rb_left; + else if (inode > entry->inode) + p = p->rb_right; + else + return entry; + } + + return NULL; +} + +static struct ib_xrcd *find_xrcd(struct ib_uverbs_device *dev, struct inode *inode) +{ + struct xrcd_table_entry *entry; + + entry = xrcd_table_search(dev, inode); + if (!entry) + return NULL; + + return entry->xrcd; +} + +static void xrcd_table_delete(struct ib_uverbs_device *dev, + struct inode *inode) +{ + struct xrcd_table_entry *entry; + + entry = xrcd_table_search(dev, inode); + if (entry) { + iput(inode); + rb_erase(&entry->node, &dev->xrcd_tree); + kfree(entry); + } +} + +ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_open_xrcd cmd; + struct ib_uverbs_open_xrcd_resp resp; + struct ib_udata udata; + struct ib_uxrcd_object *obj; + struct ib_xrcd *xrcd = NULL; + struct file *f = NULL; + struct inode *inode = NULL; + int ret = 0; + int new_xrcd = 0; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + INIT_UDATA(&udata, buf + sizeof cmd, + (unsigned long) cmd.response + sizeof resp, + in_len - sizeof cmd, out_len - sizeof resp); + + mutex_lock(&file->device->xrcd_tree_mutex); + + if (cmd.fd != -1) { + /* search for file descriptor */ + f = fget(cmd.fd); + if (!f) { + ret = -EBADF; + goto err_tree_mutex_unlock; + } + + inode = f->f_dentry->d_inode; + if (!inode) { + ret = -EBADF; + goto err_tree_mutex_unlock; + } + + xrcd = find_xrcd(file->device, inode); + if (!xrcd && !(cmd.oflags & O_CREAT)) { + /* no file descriptor. 
Need CREATE flag */ + ret = -EAGAIN; + goto err_tree_mutex_unlock; + } + + if (xrcd && cmd.oflags & O_EXCL) { + ret = -EINVAL; + goto err_tree_mutex_unlock; + } + } + + obj = kmalloc(sizeof *obj, GFP_KERNEL); + if (!obj) { + ret = -ENOMEM; + goto err_tree_mutex_unlock; + } + + init_uobj(&obj->uobject, 0, file->ucontext, &xrcd_lock_key); + + down_write(&obj->uobject.mutex); + + if (!xrcd) { + xrcd = file->device->ib_dev->alloc_xrcd(file->device->ib_dev, + file->ucontext, &udata); + if (IS_ERR(xrcd)) { + ret = PTR_ERR(xrcd); + goto err; + } + + xrcd->inode = inode; + xrcd->device = file->device->ib_dev; + atomic_set(&xrcd->usecnt, 0); + mutex_init(&xrcd->tgt_qp_mutex); + INIT_LIST_HEAD(&xrcd->tgt_qp_list); + new_xrcd = 1; + } + + atomic_set(&obj->refcnt, 0); + obj->uobject.object = xrcd; + ret = idr_add_uobj(&ib_uverbs_xrcd_idr, &obj->uobject); + if (ret) + goto err_idr; + + memset(&resp, 0, sizeof resp); + resp.xrcd_handle = obj->uobject.id; + + if (inode) { + if (new_xrcd) { + /* create new inode/xrcd table entry */ + ret = xrcd_table_insert(file->device, inode, xrcd); + if (ret) + goto err_insert_xrcd; + } + atomic_inc(&xrcd->usecnt); + } + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) { + ret = -EFAULT; + goto err_copy; + } + + if (f) + fput(f); + + mutex_lock(&file->mutex); + list_add_tail(&obj->uobject.list, &file->ucontext->xrcd_list); + mutex_unlock(&file->mutex); + + obj->uobject.live = 1; + up_write(&obj->uobject.mutex); + + mutex_unlock(&file->device->xrcd_tree_mutex); + return in_len; + +err_copy: + if (inode) { + if (new_xrcd) + xrcd_table_delete(file->device, inode); + atomic_dec(&xrcd->usecnt); + } + +err_insert_xrcd: + idr_remove_uobj(&ib_uverbs_xrcd_idr, &obj->uobject); + +err_idr: + ib_dealloc_xrcd(xrcd); + +err: + put_uobj_write(&obj->uobject); + +err_tree_mutex_unlock: + if (f) + fput(f); + + mutex_unlock(&file->device->xrcd_tree_mutex); + + return ret; +} + +ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_close_xrcd cmd; + struct ib_uobject *uobj; + struct ib_xrcd *xrcd = NULL; + struct inode *inode = NULL; + struct ib_uxrcd_object *obj; + int live; + int ret = 0; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + mutex_lock(&file->device->xrcd_tree_mutex); + uobj = idr_write_uobj(&ib_uverbs_xrcd_idr, cmd.xrcd_handle, file->ucontext); + if (!uobj) { + ret = -EINVAL; + goto out; + } + + xrcd = uobj->object; + inode = xrcd->inode; + obj = container_of(uobj, struct ib_uxrcd_object, uobject); + if (atomic_read(&obj->refcnt)) { + put_uobj_write(uobj); + ret = -EBUSY; + goto out; + } + + if (!inode || atomic_dec_and_test(&xrcd->usecnt)) { + ret = ib_dealloc_xrcd(uobj->object); + if (!ret) + uobj->live = 0; + } + + live = uobj->live; + if (inode && ret) + atomic_inc(&xrcd->usecnt); + + put_uobj_write(uobj); + + if (ret) + goto out; + + if (inode && !live) + xrcd_table_delete(file->device, inode); + + idr_remove_uobj(&ib_uverbs_xrcd_idr, uobj); + mutex_lock(&file->mutex); + list_del(&uobj->list); + mutex_unlock(&file->mutex); + + put_uobj(uobj); + ret = in_len; + +out: + mutex_unlock(&file->device->xrcd_tree_mutex); + return ret; +} + +void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, + struct ib_xrcd *xrcd) +{ + struct inode *inode; + + inode = xrcd->inode; + if (inode && !atomic_dec_and_test(&xrcd->usecnt)) + return; + + ib_dealloc_xrcd(xrcd); + + if (inode) + xrcd_table_delete(dev, inode); +} + ssize_t ib_uverbs_reg_mr(struct 
ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 56898b6578a4..bb9dcea8fede 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -72,6 +72,7 @@ DEFINE_IDR(ib_uverbs_ah_idr); DEFINE_IDR(ib_uverbs_cq_idr); DEFINE_IDR(ib_uverbs_qp_idr); DEFINE_IDR(ib_uverbs_srq_idr); +DEFINE_IDR(ib_uverbs_xrcd_idr); static DEFINE_SPINLOCK(map_lock); static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES); @@ -107,6 +108,8 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, [IB_USER_VERBS_CMD_MODIFY_SRQ] = ib_uverbs_modify_srq, [IB_USER_VERBS_CMD_QUERY_SRQ] = ib_uverbs_query_srq, [IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq, + [IB_USER_VERBS_CMD_OPEN_XRCD] = ib_uverbs_open_xrcd, + [IB_USER_VERBS_CMD_CLOSE_XRCD] = ib_uverbs_close_xrcd, }; static void ib_uverbs_add_one(struct ib_device *device); @@ -241,6 +244,18 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, kfree(uobj); } + mutex_lock(&file->device->xrcd_tree_mutex); + list_for_each_entry_safe(uobj, tmp, &context->xrcd_list, list) { + struct ib_xrcd *xrcd = uobj->object; + struct ib_uxrcd_object *uxrcd = + container_of(uobj, struct ib_uxrcd_object, uobject); + + idr_remove_uobj(&ib_uverbs_xrcd_idr, uobj); + ib_uverbs_dealloc_xrcd(file->device, xrcd); + kfree(uxrcd); + } + mutex_unlock(&file->device->xrcd_tree_mutex); + list_for_each_entry_safe(uobj, tmp, &context->pd_list, list) { struct ib_pd *pd = uobj->object; @@ -741,6 +756,8 @@ static void ib_uverbs_add_one(struct ib_device *device) kref_init(&uverbs_dev->ref); init_completion(&uverbs_dev->comp); + uverbs_dev->xrcd_tree = RB_ROOT; + mutex_init(&uverbs_dev->xrcd_tree_mutex); spin_lock(&map_lock); devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 8c6da5bda4c6..a6d95e635699 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1096,6 +1096,7 @@ struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device) xrcd = device->alloc_xrcd(device, NULL, NULL); if (!IS_ERR(xrcd)) { xrcd->device = device; + xrcd->inode = NULL; atomic_set(&xrcd->usecnt, 0); mutex_init(&xrcd->tgt_qp_mutex); INIT_LIST_HEAD(&xrcd->tgt_qp_list); diff --git a/include/rdma/ib_user_verbs.h b/include/rdma/ib_user_verbs.h index fe5b05177a2c..b5cb63f41f0f 100644 --- a/include/rdma/ib_user_verbs.h +++ b/include/rdma/ib_user_verbs.h @@ -81,7 +81,9 @@ enum { IB_USER_VERBS_CMD_MODIFY_SRQ, IB_USER_VERBS_CMD_QUERY_SRQ, IB_USER_VERBS_CMD_DESTROY_SRQ, - IB_USER_VERBS_CMD_POST_SRQ_RECV + IB_USER_VERBS_CMD_POST_SRQ_RECV, + IB_USER_VERBS_CMD_OPEN_XRCD, + IB_USER_VERBS_CMD_CLOSE_XRCD }; /* @@ -222,6 +224,21 @@ struct ib_uverbs_dealloc_pd { __u32 pd_handle; }; +struct ib_uverbs_open_xrcd { + __u64 response; + __u32 fd; + __u32 oflags; + __u64 driver_data[0]; +}; + +struct ib_uverbs_open_xrcd_resp { + __u32 xrcd_handle; +}; + +struct ib_uverbs_close_xrcd { + __u32 xrcd_handle; +}; + struct ib_uverbs_reg_mr { __u64 response; __u64 start; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index ac46dcf04358..dfd9b87b7ffd 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -851,6 +851,7 @@ struct ib_ucontext { struct list_head qp_list; struct list_head srq_list; struct list_head ah_list; + struct list_head xrcd_list; int closing; }; @@ -881,6 +882,7 @@ struct ib_pd { struct ib_xrcd { struct ib_device 
*device; atomic_t usecnt; /* count all exposed resources */ + struct inode *inode; struct mutex tgt_qp_mutex; struct list_head tgt_qp_list; From 8541f8de0583f562c652008a4e7a65e537842a7e Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Wed, 25 May 2011 17:08:38 -0700 Subject: [PATCH 29/62] RDMA/uverbs: Export XRC SRQs to user space We require additional information to create XRC SRQs than we can exchange using the existing create SRQ ABI. Provide an enhanced create ABI for extended SRQ types. Based on patches by Jack Morgenstein and Roland Dreier Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs.h | 6 + drivers/infiniband/core/uverbs_cmd.c | 250 ++++++++++++++++++-------- drivers/infiniband/core/uverbs_main.c | 1 + include/rdma/ib_user_verbs.h | 19 +- 4 files changed, 195 insertions(+), 81 deletions(-) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 461e2f357c1e..00ce032fcead 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -127,6 +127,11 @@ struct ib_uxrcd_object { atomic_t refcnt; }; +struct ib_usrq_object { + struct ib_uevent_object uevent; + struct ib_uxrcd_object *uxrcd; +}; + struct ib_uqp_object { struct ib_uevent_object uevent; struct list_head mcast_list; @@ -204,6 +209,7 @@ IB_UVERBS_DECLARE_CMD(create_srq); IB_UVERBS_DECLARE_CMD(modify_srq); IB_UVERBS_DECLARE_CMD(query_srq); IB_UVERBS_DECLARE_CMD(destroy_srq); +IB_UVERBS_DECLARE_CMD(create_xsrq); IB_UVERBS_DECLARE_CMD(open_xrcd); IB_UVERBS_DECLARE_CMD(close_xrcd); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index c8b2a843fa00..2e66b14acd43 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1400,7 +1400,8 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, rcq = cmd.recv_cq_handle == cmd.send_cq_handle ? scq : idr_read_cq(cmd.recv_cq_handle, file->ucontext, 1); - if (!pd || !scq || !rcq || (cmd.is_srq && !srq)) { + if (!pd || !scq || !rcq || + (cmd.is_srq && (!srq || srq->srq_type != IB_SRQT_BASIC))) { ret = -EINVAL; goto err_put; } @@ -2293,17 +2294,180 @@ out_put: return ret ? 
ret : in_len; } +int __uverbs_create_xsrq(struct ib_uverbs_file *file, + struct ib_uverbs_create_xsrq *cmd, + struct ib_udata *udata) +{ + struct ib_uverbs_create_srq_resp resp; + struct ib_usrq_object *obj; + struct ib_pd *pd; + struct ib_srq *srq; + struct ib_uobject *uninitialized_var(xrcd_uobj); + struct ib_srq_init_attr attr; + int ret; + + obj = kmalloc(sizeof *obj, GFP_KERNEL); + if (!obj) + return -ENOMEM; + + init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, &srq_lock_key); + down_write(&obj->uevent.uobject.mutex); + + pd = idr_read_pd(cmd->pd_handle, file->ucontext); + if (!pd) { + ret = -EINVAL; + goto err; + } + + if (cmd->srq_type == IB_SRQT_XRC) { + attr.ext.xrc.cq = idr_read_cq(cmd->cq_handle, file->ucontext, 0); + if (!attr.ext.xrc.cq) { + ret = -EINVAL; + goto err_put_pd; + } + + attr.ext.xrc.xrcd = idr_read_xrcd(cmd->xrcd_handle, file->ucontext, &xrcd_uobj); + if (!attr.ext.xrc.xrcd) { + ret = -EINVAL; + goto err_put_cq; + } + + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); + atomic_inc(&obj->uxrcd->refcnt); + } + + attr.event_handler = ib_uverbs_srq_event_handler; + attr.srq_context = file; + attr.srq_type = cmd->srq_type; + attr.attr.max_wr = cmd->max_wr; + attr.attr.max_sge = cmd->max_sge; + attr.attr.srq_limit = cmd->srq_limit; + + obj->uevent.events_reported = 0; + INIT_LIST_HEAD(&obj->uevent.event_list); + + srq = pd->device->create_srq(pd, &attr, udata); + if (IS_ERR(srq)) { + ret = PTR_ERR(srq); + goto err_put; + } + + srq->device = pd->device; + srq->pd = pd; + srq->srq_type = cmd->srq_type; + srq->uobject = &obj->uevent.uobject; + srq->event_handler = attr.event_handler; + srq->srq_context = attr.srq_context; + + if (cmd->srq_type == IB_SRQT_XRC) { + srq->ext.xrc.cq = attr.ext.xrc.cq; + srq->ext.xrc.xrcd = attr.ext.xrc.xrcd; + atomic_inc(&attr.ext.xrc.cq->usecnt); + atomic_inc(&attr.ext.xrc.xrcd->usecnt); + } + + atomic_inc(&pd->usecnt); + atomic_set(&srq->usecnt, 0); + + obj->uevent.uobject.object = srq; + ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject); + if (ret) + goto err_destroy; + + memset(&resp, 0, sizeof resp); + resp.srq_handle = obj->uevent.uobject.id; + resp.max_wr = attr.attr.max_wr; + resp.max_sge = attr.attr.max_sge; + if (cmd->srq_type == IB_SRQT_XRC) + resp.srqn = srq->ext.xrc.srq_num; + + if (copy_to_user((void __user *) (unsigned long) cmd->response, + &resp, sizeof resp)) { + ret = -EFAULT; + goto err_copy; + } + + if (cmd->srq_type == IB_SRQT_XRC) { + put_uobj_read(xrcd_uobj); + put_cq_read(attr.ext.xrc.cq); + } + put_pd_read(pd); + + mutex_lock(&file->mutex); + list_add_tail(&obj->uevent.uobject.list, &file->ucontext->srq_list); + mutex_unlock(&file->mutex); + + obj->uevent.uobject.live = 1; + + up_write(&obj->uevent.uobject.mutex); + + return 0; + +err_copy: + idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject); + +err_destroy: + ib_destroy_srq(srq); + +err_put: + if (cmd->srq_type == IB_SRQT_XRC) { + atomic_dec(&obj->uxrcd->refcnt); + put_uobj_read(xrcd_uobj); + } + +err_put_cq: + if (cmd->srq_type == IB_SRQT_XRC) + put_cq_read(attr.ext.xrc.cq); + +err_put_pd: + put_pd_read(pd); + +err: + put_uobj_write(&obj->uevent.uobject); + return ret; +} + ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_create_srq cmd; + struct ib_uverbs_create_xsrq xcmd; + struct ib_uverbs_create_srq_resp resp; + struct ib_udata udata; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, 
sizeof cmd)) + return -EFAULT; + + xcmd.response = cmd.response; + xcmd.user_handle = cmd.user_handle; + xcmd.srq_type = IB_SRQT_BASIC; + xcmd.pd_handle = cmd.pd_handle; + xcmd.max_wr = cmd.max_wr; + xcmd.max_sge = cmd.max_sge; + xcmd.srq_limit = cmd.srq_limit; + + INIT_UDATA(&udata, buf + sizeof cmd, + (unsigned long) cmd.response + sizeof resp, + in_len - sizeof cmd, out_len - sizeof resp); + + ret = __uverbs_create_xsrq(file, &xcmd, &udata); + if (ret) + return ret; + + return in_len; +} + +ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file, + const char __user *buf, int in_len, int out_len) +{ + struct ib_uverbs_create_xsrq cmd; struct ib_uverbs_create_srq_resp resp; struct ib_udata udata; - struct ib_uevent_object *obj; - struct ib_pd *pd; - struct ib_srq *srq; - struct ib_srq_init_attr attr; int ret; if (out_len < sizeof resp) @@ -2316,83 +2480,11 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, (unsigned long) cmd.response + sizeof resp, in_len - sizeof cmd, out_len - sizeof resp); - obj = kmalloc(sizeof *obj, GFP_KERNEL); - if (!obj) - return -ENOMEM; - - init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, &srq_lock_key); - down_write(&obj->uobject.mutex); - - pd = idr_read_pd(cmd.pd_handle, file->ucontext); - if (!pd) { - ret = -EINVAL; - goto err; - } - - attr.event_handler = ib_uverbs_srq_event_handler; - attr.srq_context = file; - attr.srq_type = IB_SRQT_BASIC; - attr.attr.max_wr = cmd.max_wr; - attr.attr.max_sge = cmd.max_sge; - attr.attr.srq_limit = cmd.srq_limit; - - obj->events_reported = 0; - INIT_LIST_HEAD(&obj->event_list); - - srq = pd->device->create_srq(pd, &attr, &udata); - if (IS_ERR(srq)) { - ret = PTR_ERR(srq); - goto err_put; - } - - srq->device = pd->device; - srq->pd = pd; - srq->uobject = &obj->uobject; - srq->event_handler = attr.event_handler; - srq->srq_context = attr.srq_context; - atomic_inc(&pd->usecnt); - atomic_set(&srq->usecnt, 0); - - obj->uobject.object = srq; - ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uobject); + ret = __uverbs_create_xsrq(file, &cmd, &udata); if (ret) - goto err_destroy; - - memset(&resp, 0, sizeof resp); - resp.srq_handle = obj->uobject.id; - resp.max_wr = attr.attr.max_wr; - resp.max_sge = attr.attr.max_sge; - - if (copy_to_user((void __user *) (unsigned long) cmd.response, - &resp, sizeof resp)) { - ret = -EFAULT; - goto err_copy; - } - - put_pd_read(pd); - - mutex_lock(&file->mutex); - list_add_tail(&obj->uobject.list, &file->ucontext->srq_list); - mutex_unlock(&file->mutex); - - obj->uobject.live = 1; - - up_write(&obj->uobject.mutex); + return ret; return in_len; - -err_copy: - idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uobject); - -err_destroy: - ib_destroy_srq(srq); - -err_put: - put_pd_read(pd); - -err: - put_uobj_write(&obj->uobject); - return ret; } ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file, diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index bb9dcea8fede..6ad221b87158 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -110,6 +110,7 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, [IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq, [IB_USER_VERBS_CMD_OPEN_XRCD] = ib_uverbs_open_xrcd, [IB_USER_VERBS_CMD_CLOSE_XRCD] = ib_uverbs_close_xrcd, + [IB_USER_VERBS_CMD_CREATE_XSRQ] = ib_uverbs_create_xsrq }; static void ib_uverbs_add_one(struct ib_device *device); diff --git a/include/rdma/ib_user_verbs.h b/include/rdma/ib_user_verbs.h index b5cb63f41f0f..100abbbd6e6e 
100644 --- a/include/rdma/ib_user_verbs.h +++ b/include/rdma/ib_user_verbs.h @@ -83,7 +83,8 @@ enum { IB_USER_VERBS_CMD_DESTROY_SRQ, IB_USER_VERBS_CMD_POST_SRQ_RECV, IB_USER_VERBS_CMD_OPEN_XRCD, - IB_USER_VERBS_CMD_CLOSE_XRCD + IB_USER_VERBS_CMD_CLOSE_XRCD, + IB_USER_VERBS_CMD_CREATE_XSRQ }; /* @@ -665,11 +666,25 @@ struct ib_uverbs_create_srq { __u64 driver_data[0]; }; +struct ib_uverbs_create_xsrq { + __u64 response; + __u64 user_handle; + __u32 srq_type; + __u32 pd_handle; + __u32 max_wr; + __u32 max_sge; + __u32 srq_limit; + __u32 reserved; + __u32 xrcd_handle; + __u32 cq_handle; + __u64 driver_data[0]; +}; + struct ib_uverbs_create_srq_resp { __u32 srq_handle; __u32 max_wr; __u32 max_sge; - __u32 reserved; + __u32 srqn; }; struct ib_uverbs_modify_srq { From 9977f4f64bfeb5d907a793a6880aab2d43b0bed2 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 26 May 2011 08:17:04 -0700 Subject: [PATCH 30/62] RDMA/uverbs: Export XRC INI QPs to userspace XRC INI QPs are similar to send only RC QPs. Allow user space to create INI QPs. Note that INI QPs do not require receive CQs. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs_cmd.c | 48 +++++++++++++++++++++------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 2e66b14acd43..3eba64c51b0b 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1371,8 +1371,8 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, struct ib_udata udata; struct ib_uqp_object *obj; struct ib_pd *pd; - struct ib_cq *scq, *rcq; - struct ib_srq *srq; + struct ib_cq *scq, *rcq = NULL; + struct ib_srq *srq = NULL; struct ib_qp *qp; struct ib_qp_init_attr attr; int ret; @@ -1394,18 +1394,31 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &qp_lock_key); down_write(&obj->uevent.uobject.mutex); - srq = cmd.is_srq ? idr_read_srq(cmd.srq_handle, file->ucontext) : NULL; pd = idr_read_pd(cmd.pd_handle, file->ucontext); scq = idr_read_cq(cmd.send_cq_handle, file->ucontext, 0); - rcq = cmd.recv_cq_handle == cmd.send_cq_handle ? - scq : idr_read_cq(cmd.recv_cq_handle, file->ucontext, 1); - - if (!pd || !scq || !rcq || - (cmd.is_srq && (!srq || srq->srq_type != IB_SRQT_BASIC))) { + if (!pd || !scq) { ret = -EINVAL; goto err_put; } + if (cmd.qp_type == IB_QPT_XRC_INI) { + cmd.max_recv_wr = cmd.max_recv_sge = 0; + } else { + if (cmd.is_srq) { + srq = idr_read_srq(cmd.srq_handle, file->ucontext); + if (!srq || srq->srq_type != IB_SRQT_BASIC) { + ret = -EINVAL; + goto err_put; + } + } + rcq = (cmd.recv_cq_handle == cmd.send_cq_handle) ? + scq : idr_read_cq(cmd.recv_cq_handle, file->ucontext, 1); + if (!rcq) { + ret = -EINVAL; + goto err_put; + } + } + attr.event_handler = ib_uverbs_qp_event_handler; attr.qp_context = file; attr.send_cq = scq; @@ -1442,7 +1455,8 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, qp->qp_type = attr.qp_type; atomic_inc(&pd->usecnt); atomic_inc(&attr.send_cq->usecnt); - atomic_inc(&attr.recv_cq->usecnt); + if (attr.recv_cq) + atomic_inc(&attr.recv_cq->usecnt); if (attr.srq) atomic_inc(&attr.srq->usecnt); @@ -1468,7 +1482,7 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, put_pd_read(pd); put_cq_read(scq); - if (rcq != scq) + if (rcq && rcq != scq) put_cq_read(rcq); if (srq) put_srq_read(srq); @@ -1603,6 +1617,17 @@ out: return ret ? 
ret : in_len; } +/* Remove ignored fields set in the attribute mask */ +static int modify_qp_mask(enum ib_qp_type qp_type, int mask) +{ + switch (qp_type) { + case IB_QPT_XRC_INI: + return mask & ~(IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER); + default: + return mask; + } +} + ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) @@ -1675,7 +1700,8 @@ ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, attr->alt_ah_attr.ah_flags = cmd.alt_dest.is_global ? IB_AH_GRH : 0; attr->alt_ah_attr.port_num = cmd.alt_dest.port_num; - ret = qp->device->modify_qp(qp, attr, cmd.attr_mask, &udata); + ret = qp->device->modify_qp(qp, attr, + modify_qp_mask(qp->qp_type, cmd.attr_mask), &udata); put_qp_read(qp); From b93f3c18727634a2e847f067e549762d096921cf Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Fri, 27 May 2011 00:00:12 -0700 Subject: [PATCH 31/62] RDMA/uverbs: Export XRC TGT QPs to user space Allow user space to operate on XRC TGT QPs the same way as other types of QPs, with one notable exception: since XRC TGT QPs may be shared among multiple processes, the XRC TGT QP is allowed to exist beyond the lifetime of the creating process. The process that creates the QP is allowed to destroy it, but if the process exits without destroying the QP, then the QP will be left bound to the lifetime of the XRCD. TGT QPs are not associated with CQs or a PD. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs_cmd.c | 105 ++++++++++++++++---------- drivers/infiniband/core/uverbs_main.c | 8 +- 2 files changed, 73 insertions(+), 40 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 3eba64c51b0b..9058e38ca4cd 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1370,8 +1370,11 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, struct ib_uverbs_create_qp_resp resp; struct ib_udata udata; struct ib_uqp_object *obj; - struct ib_pd *pd; - struct ib_cq *scq, *rcq = NULL; + struct ib_device *device; + struct ib_pd *pd = NULL; + struct ib_xrcd *xrcd = NULL; + struct ib_uobject *uninitialized_var(xrcd_uobj); + struct ib_cq *scq = NULL, *rcq = NULL; struct ib_srq *srq = NULL; struct ib_qp *qp; struct ib_qp_init_attr attr; @@ -1394,29 +1397,39 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &qp_lock_key); down_write(&obj->uevent.uobject.mutex); - pd = idr_read_pd(cmd.pd_handle, file->ucontext); - scq = idr_read_cq(cmd.send_cq_handle, file->ucontext, 0); - if (!pd || !scq) { - ret = -EINVAL; - goto err_put; - } - - if (cmd.qp_type == IB_QPT_XRC_INI) { - cmd.max_recv_wr = cmd.max_recv_sge = 0; + if (cmd.qp_type == IB_QPT_XRC_TGT) { + xrcd = idr_read_xrcd(cmd.pd_handle, file->ucontext, &xrcd_uobj); + if (!xrcd) { + ret = -EINVAL; + goto err_put; + } + device = xrcd->device; } else { - if (cmd.is_srq) { - srq = idr_read_srq(cmd.srq_handle, file->ucontext); - if (!srq || srq->srq_type != IB_SRQT_BASIC) { + pd = idr_read_pd(cmd.pd_handle, file->ucontext); + scq = idr_read_cq(cmd.send_cq_handle, file->ucontext, 0); + if (!pd || !scq) { + ret = -EINVAL; + goto err_put; + } + + if (cmd.qp_type == IB_QPT_XRC_INI) { + cmd.max_recv_wr = cmd.max_recv_sge = 0; + } else { + if (cmd.is_srq) { + srq = idr_read_srq(cmd.srq_handle, file->ucontext); + if (!srq || srq->srq_type != IB_SRQT_BASIC) { + ret = -EINVAL; + goto err_put; + } + } + rcq = (cmd.recv_cq_handle 
== cmd.send_cq_handle) ? + scq : idr_read_cq(cmd.recv_cq_handle, file->ucontext, 1); + if (!rcq) { ret = -EINVAL; goto err_put; } } - rcq = (cmd.recv_cq_handle == cmd.send_cq_handle) ? - scq : idr_read_cq(cmd.recv_cq_handle, file->ucontext, 1); - if (!rcq) { - ret = -EINVAL; - goto err_put; - } + device = pd->device; } attr.event_handler = ib_uverbs_qp_event_handler; @@ -1424,6 +1437,7 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, attr.send_cq = scq; attr.recv_cq = rcq; attr.srq = srq; + attr.xrcd = xrcd; attr.sq_sig_type = cmd.sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; attr.qp_type = cmd.qp_type; attr.create_flags = 0; @@ -1438,27 +1452,33 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, INIT_LIST_HEAD(&obj->uevent.event_list); INIT_LIST_HEAD(&obj->mcast_list); - qp = pd->device->create_qp(pd, &attr, &udata); + if (cmd.qp_type == IB_QPT_XRC_TGT) + qp = ib_create_qp(pd, &attr); + else + qp = device->create_qp(pd, &attr, &udata); + if (IS_ERR(qp)) { ret = PTR_ERR(qp); goto err_put; } - qp->device = pd->device; - qp->pd = pd; - qp->send_cq = attr.send_cq; - qp->recv_cq = attr.recv_cq; - qp->srq = attr.srq; - qp->uobject = &obj->uevent.uobject; - qp->event_handler = attr.event_handler; - qp->qp_context = attr.qp_context; - qp->qp_type = attr.qp_type; - atomic_inc(&pd->usecnt); - atomic_inc(&attr.send_cq->usecnt); - if (attr.recv_cq) - atomic_inc(&attr.recv_cq->usecnt); - if (attr.srq) - atomic_inc(&attr.srq->usecnt); + if (cmd.qp_type != IB_QPT_XRC_TGT) { + qp->device = device; + qp->pd = pd; + qp->send_cq = attr.send_cq; + qp->recv_cq = attr.recv_cq; + qp->srq = attr.srq; + qp->event_handler = attr.event_handler; + qp->qp_context = attr.qp_context; + qp->qp_type = attr.qp_type; + atomic_inc(&pd->usecnt); + atomic_inc(&attr.send_cq->usecnt); + if (attr.recv_cq) + atomic_inc(&attr.recv_cq->usecnt); + if (attr.srq) + atomic_inc(&attr.srq->usecnt); + } + qp->uobject = &obj->uevent.uobject; obj->uevent.uobject.object = qp; ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); @@ -1480,8 +1500,12 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, goto err_copy; } - put_pd_read(pd); - put_cq_read(scq); + if (xrcd) + put_xrcd_read(xrcd_uobj); + if (pd) + put_pd_read(pd); + if (scq) + put_cq_read(scq); if (rcq && rcq != scq) put_cq_read(rcq); if (srq) @@ -1504,6 +1528,8 @@ err_destroy: ib_destroy_qp(qp); err_put: + if (xrcd) + put_xrcd_read(xrcd_uobj); if (pd) put_pd_read(pd); if (scq) @@ -1623,6 +1649,9 @@ static int modify_qp_mask(enum ib_qp_type qp_type, int mask) switch (qp_type) { case IB_QPT_XRC_INI: return mask & ~(IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER); + case IB_QPT_XRC_TGT: + return mask & ~(IB_QP_MAX_QP_RD_ATOMIC | IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY); default: return mask; } diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 6ad221b87158..0cb69e039f75 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -206,8 +206,12 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, container_of(uobj, struct ib_uqp_object, uevent.uobject); idr_remove_uobj(&ib_uverbs_qp_idr, uobj); - ib_uverbs_detach_umcast(qp, uqp); - ib_destroy_qp(qp); + if (qp->qp_type == IB_QPT_XRC_TGT) { + ib_release_qp(qp); + } else { + ib_uverbs_detach_umcast(qp, uqp); + ib_destroy_qp(qp); + } ib_uverbs_release_uevent(file, &uqp->uevent); kfree(uqp); } From d26a360b776d527429cf13300837711b0b2fde20 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Fri, 13 May 2011 
10:46:20 -0700 Subject: [PATCH 32/62] IB/cm: Update protocol to support XRC Update the REQ and REP messages to support XRC connection setup according to the XRC Annex. Several existing fields must be set to 0 or 1 when connecting XRC QPs, and a reserved field is changed to an extended transport type. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/cm.c | 46 +++++++++++++++++++++---------- drivers/infiniband/core/cm_msgs.h | 13 ++++++++- 2 files changed, 43 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index fc0f2bd9ca82..d2e1cfb206b0 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1008,7 +1008,6 @@ static void cm_format_req(struct cm_req_msg *req_msg, req_msg->service_id = param->service_id; req_msg->local_ca_guid = cm_id_priv->id.device->node_guid; cm_req_set_local_qpn(req_msg, cpu_to_be32(param->qp_num)); - cm_req_set_resp_res(req_msg, param->responder_resources); cm_req_set_init_depth(req_msg, param->initiator_depth); cm_req_set_remote_resp_timeout(req_msg, param->remote_cm_response_timeout); @@ -1017,12 +1016,16 @@ static void cm_format_req(struct cm_req_msg *req_msg, cm_req_set_starting_psn(req_msg, cpu_to_be32(param->starting_psn)); cm_req_set_local_resp_timeout(req_msg, param->local_cm_response_timeout); - cm_req_set_retry_count(req_msg, param->retry_count); req_msg->pkey = param->primary_path->pkey; cm_req_set_path_mtu(req_msg, param->primary_path->mtu); - cm_req_set_rnr_retry_count(req_msg, param->rnr_retry_count); cm_req_set_max_cm_retries(req_msg, param->max_cm_retries); - cm_req_set_srq(req_msg, param->srq); + + if (param->qp_type != IB_QPT_XRC_INI) { + cm_req_set_resp_res(req_msg, param->responder_resources); + cm_req_set_retry_count(req_msg, param->retry_count); + cm_req_set_rnr_retry_count(req_msg, param->rnr_retry_count); + cm_req_set_srq(req_msg, param->srq); + } if (pri_path->hop_limit <= 1) { req_msg->primary_local_lid = pri_path->slid; @@ -1080,7 +1083,8 @@ static int cm_validate_req_param(struct ib_cm_req_param *param) if (!param->primary_path) return -EINVAL; - if (param->qp_type != IB_QPT_RC && param->qp_type != IB_QPT_UC) + if (param->qp_type != IB_QPT_RC && param->qp_type != IB_QPT_UC && + param->qp_type != IB_QPT_XRC_INI) return -EINVAL; if (param->private_data && @@ -1604,15 +1608,20 @@ static void cm_format_rep(struct cm_rep_msg *rep_msg, cm_rep_set_local_qpn(rep_msg, cpu_to_be32(param->qp_num)); cm_rep_set_starting_psn(rep_msg, cpu_to_be32(param->starting_psn)); rep_msg->resp_resources = param->responder_resources; - rep_msg->initiator_depth = param->initiator_depth; cm_rep_set_target_ack_delay(rep_msg, cm_id_priv->av.port->cm_dev->ack_delay); cm_rep_set_failover(rep_msg, param->failover_accepted); - cm_rep_set_flow_ctrl(rep_msg, param->flow_control); cm_rep_set_rnr_retry_count(rep_msg, param->rnr_retry_count); - cm_rep_set_srq(rep_msg, param->srq); rep_msg->local_ca_guid = cm_id_priv->id.device->node_guid; + if (cm_id_priv->qp_type != IB_QPT_XRC_TGT) { + rep_msg->initiator_depth = param->initiator_depth; + cm_rep_set_flow_ctrl(rep_msg, param->flow_control); + cm_rep_set_srq(rep_msg, param->srq); + } else { + cm_rep_set_srq(rep_msg, 1); + } + if (param->private_data && param->private_data_len) memcpy(rep_msg->private_data, param->private_data, param->private_data_len); @@ -3492,7 +3501,8 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv, qp_attr->path_mtu = cm_id_priv->path_mtu; qp_attr->dest_qp_num = 
be32_to_cpu(cm_id_priv->remote_qpn); qp_attr->rq_psn = be32_to_cpu(cm_id_priv->rq_psn); - if (cm_id_priv->qp_type == IB_QPT_RC) { + if (cm_id_priv->qp_type == IB_QPT_RC || + cm_id_priv->qp_type == IB_QPT_XRC_TGT) { *qp_attr_mask |= IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER; qp_attr->max_dest_rd_atomic = @@ -3537,15 +3547,21 @@ static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv, if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT) { *qp_attr_mask = IB_QP_STATE | IB_QP_SQ_PSN; qp_attr->sq_psn = be32_to_cpu(cm_id_priv->sq_psn); - if (cm_id_priv->qp_type == IB_QPT_RC) { - *qp_attr_mask |= IB_QP_TIMEOUT | IB_QP_RETRY_CNT | - IB_QP_RNR_RETRY | + switch (cm_id_priv->qp_type) { + case IB_QPT_RC: + case IB_QPT_XRC_INI: + *qp_attr_mask |= IB_QP_RETRY_CNT | IB_QP_RNR_RETRY | IB_QP_MAX_QP_RD_ATOMIC; - qp_attr->timeout = cm_id_priv->av.timeout; qp_attr->retry_cnt = cm_id_priv->retry_count; qp_attr->rnr_retry = cm_id_priv->rnr_retry_count; - qp_attr->max_rd_atomic = - cm_id_priv->initiator_depth; + qp_attr->max_rd_atomic = cm_id_priv->initiator_depth; + /* fall through */ + case IB_QPT_XRC_TGT: + *qp_attr_mask |= IB_QP_TIMEOUT; + qp_attr->timeout = cm_id_priv->av.timeout; + break; + default: + break; } if (cm_id_priv->alt_av.ah_attr.dlid) { *qp_attr_mask |= IB_QP_PATH_MIG_STATE; diff --git a/drivers/infiniband/core/cm_msgs.h b/drivers/infiniband/core/cm_msgs.h index 7e63c08f697c..3ade32025974 100644 --- a/drivers/infiniband/core/cm_msgs.h +++ b/drivers/infiniband/core/cm_msgs.h @@ -86,7 +86,7 @@ struct cm_req_msg { __be16 pkey; /* path MTU:4, RDC exists:1, RNR retry count:3. */ u8 offset50; - /* max CM Retries:4, SRQ:1, rsvd:3 */ + /* max CM Retries:4, SRQ:1, extended transport type:3 */ u8 offset51; __be16 primary_local_lid; @@ -175,6 +175,11 @@ static inline enum ib_qp_type cm_req_get_qp_type(struct cm_req_msg *req_msg) switch(transport_type) { case 0: return IB_QPT_RC; case 1: return IB_QPT_UC; + case 3: + switch (req_msg->offset51 & 0x7) { + case 1: return IB_QPT_XRC_TGT; + default: return 0; + } default: return 0; } } @@ -188,6 +193,12 @@ static inline void cm_req_set_qp_type(struct cm_req_msg *req_msg, req_msg->offset40) & 0xFFFFFFF9) | 0x2); break; + case IB_QPT_XRC_INI: + req_msg->offset40 = cpu_to_be32((be32_to_cpu( + req_msg->offset40) & + 0xFFFFFFF9) | 0x6); + req_msg->offset51 = (req_msg->offset51 & 0xF8) | 1; + break; default: req_msg->offset40 = cpu_to_be32(be32_to_cpu( req_msg->offset40) & From ef70044647b260cb6b7863f392384a06670d0b2a Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Tue, 2 Aug 2011 11:08:22 -0700 Subject: [PATCH 33/62] IB/cm: Update XRC support based on XRC annex errata The XRC annex was updated to have XRC behave more like RD. Specifically, the XRC TGT QPN moves from the local QPN to local EECN field. Lookup of SRQN is done using the REQ/REP protocol. 
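(Illustrative sketch, not part of the patch: how a passive-side ib_cm consumer might accept an XRC connection after this errata update. The request handler and the xrc_tgt_qp pointer are assumptions; the point is that the TGT QP number handed to ib_send_cm_rep() is now carried in the REP's local EECN field rather than the local QPN field.)

#include <rdma/ib_cm.h>

/* Assumed to point at an IB_QPT_XRC_TGT QP created elsewhere. */
static struct ib_qp *xrc_tgt_qp;

static int xrc_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
{
        struct ib_cm_rep_param rep = {};

        rep.qp_num              = xrc_tgt_qp->qp_num; /* encoded as local EECN */
        rep.starting_psn        = 0x100;
        rep.responder_resources = 4;
        rep.rnr_retry_count     = 7;

        /* For an XRC TGT QP the CM forces SRQ=1 in the REP (see cm_format_rep). */
        return ib_send_cm_rep(cm_id, &rep);
}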
Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/cm.c | 15 ++++++++------- drivers/infiniband/core/cm_msgs.h | 19 ++++++++++++++++++- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index d2e1cfb206b0..42a7a9bae44e 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1605,7 +1605,6 @@ static void cm_format_rep(struct cm_rep_msg *rep_msg, cm_format_mad_hdr(&rep_msg->hdr, CM_REP_ATTR_ID, cm_id_priv->tid); rep_msg->local_comm_id = cm_id_priv->id.local_id; rep_msg->remote_comm_id = cm_id_priv->id.remote_id; - cm_rep_set_local_qpn(rep_msg, cpu_to_be32(param->qp_num)); cm_rep_set_starting_psn(rep_msg, cpu_to_be32(param->starting_psn)); rep_msg->resp_resources = param->responder_resources; cm_rep_set_target_ack_delay(rep_msg, @@ -1618,8 +1617,10 @@ static void cm_format_rep(struct cm_rep_msg *rep_msg, rep_msg->initiator_depth = param->initiator_depth; cm_rep_set_flow_ctrl(rep_msg, param->flow_control); cm_rep_set_srq(rep_msg, param->srq); + cm_rep_set_local_qpn(rep_msg, cpu_to_be32(param->qp_num)); } else { cm_rep_set_srq(rep_msg, 1); + cm_rep_set_local_eecn(rep_msg, cpu_to_be32(param->qp_num)); } if (param->private_data && param->private_data_len) @@ -1669,7 +1670,7 @@ int ib_send_cm_rep(struct ib_cm_id *cm_id, cm_id_priv->initiator_depth = param->initiator_depth; cm_id_priv->responder_resources = param->responder_resources; cm_id_priv->rq_psn = cm_rep_get_starting_psn(rep_msg); - cm_id_priv->local_qpn = cm_rep_get_local_qpn(rep_msg); + cm_id_priv->local_qpn = cpu_to_be32(param->qp_num & 0xFFFFFF); out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; @@ -1740,7 +1741,7 @@ error: spin_unlock_irqrestore(&cm_id_priv->lock, flags); } EXPORT_SYMBOL(ib_send_cm_rtu); -static void cm_format_rep_event(struct cm_work *work) +static void cm_format_rep_event(struct cm_work *work, enum ib_qp_type qp_type) { struct cm_rep_msg *rep_msg; struct ib_cm_rep_event_param *param; @@ -1749,7 +1750,7 @@ static void cm_format_rep_event(struct cm_work *work) param = &work->cm_event.param.rep_rcvd; param->remote_ca_guid = rep_msg->local_ca_guid; param->remote_qkey = be32_to_cpu(rep_msg->local_qkey); - param->remote_qpn = be32_to_cpu(cm_rep_get_local_qpn(rep_msg)); + param->remote_qpn = be32_to_cpu(cm_rep_get_qpn(rep_msg, qp_type)); param->starting_psn = be32_to_cpu(cm_rep_get_starting_psn(rep_msg)); param->responder_resources = rep_msg->initiator_depth; param->initiator_depth = rep_msg->resp_resources; @@ -1817,7 +1818,7 @@ static int cm_rep_handler(struct cm_work *work) return -EINVAL; } - cm_format_rep_event(work); + cm_format_rep_event(work, cm_id_priv->qp_type); spin_lock_irq(&cm_id_priv->lock); switch (cm_id_priv->id.state) { @@ -1832,7 +1833,7 @@ static int cm_rep_handler(struct cm_work *work) cm_id_priv->timewait_info->work.remote_id = rep_msg->local_comm_id; cm_id_priv->timewait_info->remote_ca_guid = rep_msg->local_ca_guid; - cm_id_priv->timewait_info->remote_qpn = cm_rep_get_local_qpn(rep_msg); + cm_id_priv->timewait_info->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type); spin_lock(&cm.lock); /* Check for duplicate REP. 
*/ @@ -1859,7 +1860,7 @@ static int cm_rep_handler(struct cm_work *work) cm_id_priv->id.state = IB_CM_REP_RCVD; cm_id_priv->id.remote_id = rep_msg->local_comm_id; - cm_id_priv->remote_qpn = cm_rep_get_local_qpn(rep_msg); + cm_id_priv->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type); cm_id_priv->initiator_depth = rep_msg->resp_resources; cm_id_priv->responder_resources = rep_msg->initiator_depth; cm_id_priv->sq_psn = cm_rep_get_starting_psn(rep_msg); diff --git a/drivers/infiniband/core/cm_msgs.h b/drivers/infiniband/core/cm_msgs.h index 3ade32025974..505db2a59e7f 100644 --- a/drivers/infiniband/core/cm_msgs.h +++ b/drivers/infiniband/core/cm_msgs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004 Intel Corporation. All rights reserved. + * Copyright (c) 2004, 2011 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004 Voltaire Corporation. All rights reserved. * @@ -538,6 +538,23 @@ static inline void cm_rep_set_local_qpn(struct cm_rep_msg *rep_msg, __be32 qpn) (be32_to_cpu(rep_msg->offset12) & 0x000000FF)); } +static inline __be32 cm_rep_get_local_eecn(struct cm_rep_msg *rep_msg) +{ + return cpu_to_be32(be32_to_cpu(rep_msg->offset16) >> 8); +} + +static inline void cm_rep_set_local_eecn(struct cm_rep_msg *rep_msg, __be32 eecn) +{ + rep_msg->offset16 = cpu_to_be32((be32_to_cpu(eecn) << 8) | + (be32_to_cpu(rep_msg->offset16) & 0x000000FF)); +} + +static inline __be32 cm_rep_get_qpn(struct cm_rep_msg *rep_msg, enum ib_qp_type qp_type) +{ + return (qp_type == IB_QPT_XRC_INI) ? + cm_rep_get_local_eecn(rep_msg) : cm_rep_get_local_qpn(rep_msg); +} + static inline __be32 cm_rep_get_starting_psn(struct cm_rep_msg *rep_msg) { return cpu_to_be32(be32_to_cpu(rep_msg->offset20) >> 8); From 2d2e94152928209de13dea0535242c0e457bdcbb Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Sat, 28 May 2011 21:56:39 -0700 Subject: [PATCH 34/62] RDMA/cm: Define new RDMA port space specific to IB Add RDMA_PS_IB. XRC QP types will use the IB port space when operating over the RDMA CM. For the 'IP protocol' field value, we select 0x3F, which is listed as being for 'any local network'. 
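(Illustrative sketch, not part of the patch: a kernel consumer picking the new IB port space. The event handler and the choice of IB_QPT_XRC_INI are assumptions; because RDMA_PS_IB does not imply a single QP type, the type is stated explicitly when the id is created.)

#include <rdma/rdma_cm.h>

static int xrc_cma_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
        return 0;       /* placeholder event handler */
}

static struct rdma_cm_id *create_ib_ps_id(void *context)
{
        /* Port space and QP type are now independent choices. */
        return rdma_create_id(xrc_cma_handler, context, RDMA_PS_IB,
                              IB_QPT_XRC_INI);
}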
Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/cma.c | 5 +++++ include/rdma/rdma_cm.h | 1 + 2 files changed, 6 insertions(+) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index ca4c5dcd7133..8801ea91b0a6 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -81,6 +81,7 @@ static DEFINE_IDR(sdp_ps); static DEFINE_IDR(tcp_ps); static DEFINE_IDR(udp_ps); static DEFINE_IDR(ipoib_ps); +static DEFINE_IDR(ib_ps); struct cma_device { struct list_head list; @@ -2234,6 +2235,9 @@ static int cma_get_port(struct rdma_id_private *id_priv) case RDMA_PS_IPOIB: ps = &ipoib_ps; break; + case RDMA_PS_IB: + ps = &ib_ps; + break; default: return -EPROTONOSUPPORT; } @@ -3460,6 +3464,7 @@ static void __exit cma_cleanup(void) idr_destroy(&tcp_ps); idr_destroy(&udp_ps); idr_destroy(&ipoib_ps); + idr_destroy(&ib_ps); } module_init(cma_init); diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 26977c149c41..51988f808181 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -65,6 +65,7 @@ enum rdma_cm_event_type { enum rdma_port_space { RDMA_PS_SDP = 0x0001, RDMA_PS_IPOIB = 0x0002, + RDMA_PS_IB = 0x013F, RDMA_PS_TCP = 0x0106, RDMA_PS_UDP = 0x0111, }; From 638ef7a6c6a599b2e4c00ebf35d281721d2a42f6 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Tue, 14 Jun 2011 16:31:53 -0700 Subject: [PATCH 35/62] RDMA/ucm: Allow user to specify QP type when creating id Allow the user to indicate the QP type separately from the port space when allocating an rdma_cm_id. With RDMA_PS_IB, there is no longer a 1:1 relationship between the QP type and port space, so we need to switch on the QP type to select between UD and connected QPs. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/ucma.c | 5 ++++- include/rdma/rdma_user_cm.h | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 71be5eebd683..eb0a8e8a6943 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -276,7 +276,7 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id, ucma_set_event_context(ctx, event, uevent); uevent->resp.event = event->event; uevent->resp.status = event->status; - if (cm_id->ps == RDMA_PS_UDP || cm_id->ps == RDMA_PS_IPOIB) + if (cm_id->qp_type == IB_QPT_UD) ucma_copy_ud_event(&uevent->resp.param.ud, &event->param.ud); else ucma_copy_conn_event(&uevent->resp.param.conn, @@ -377,6 +377,9 @@ static int ucma_get_qp_type(struct rdma_ucm_create_id *cmd, enum ib_qp_type *qp_ case RDMA_PS_IPOIB: *qp_type = IB_QPT_UD; return 0; + case RDMA_PS_IB: + *qp_type = cmd->qp_type; + return 0; default: return -EINVAL; } diff --git a/include/rdma/rdma_user_cm.h b/include/rdma/rdma_user_cm.h index fc82c1896f75..5348a000c8f3 100644 --- a/include/rdma/rdma_user_cm.h +++ b/include/rdma/rdma_user_cm.h @@ -77,7 +77,8 @@ struct rdma_ucm_create_id { __u64 uid; __u64 response; __u16 ps; - __u8 reserved[6]; + __u8 qp_type; + __u8 reserved[5]; }; struct rdma_ucm_create_id_resp { From 18c441a6c3741991bfb87a3c6c541d30f0eb9c7c Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Sat, 28 May 2011 23:26:06 -0700 Subject: [PATCH 36/62] RDMA/cma: Support XRC QPs Allow users to connect XRC QPs through the rdma_cm. 
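(Illustrative sketch, not part of the patch: an active-side connect of an XRC INI QP over the rdma_cm. The resolved id and the ini_qp pointer are assumptions; since the XRC INI QP is typically not bound to the cm_id, its number is passed in the connection parameters.)

#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>

/* "id" is assumed to have completed address and route resolution. */
static int connect_xrc_ini(struct rdma_cm_id *id, struct ib_qp *ini_qp)
{
        struct rdma_conn_param param = {};

        param.initiator_depth = 4;      /* outstanding RDMA reads we initiate */
        param.retry_count     = 7;
        param.rnr_retry_count = 7;
        param.qp_num          = ini_qp->qp_num; /* QP not bound to the cm_id */

        return rdma_connect(id, &param);
}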
Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/cma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 8801ea91b0a6..b1fd805c9d6c 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2573,7 +2573,7 @@ static int cma_connect_ib(struct rdma_id_private *id_priv, req.service_id = cma_get_service_id(id_priv->id.ps, (struct sockaddr *) &route->addr.dst_addr); req.qp_num = id_priv->qp_num; - req.qp_type = IB_QPT_RC; + req.qp_type = id_priv->id.qp_type; req.starting_psn = id_priv->seq_num; req.responder_resources = conn_param->responder_resources; req.initiator_depth = conn_param->initiator_depth; From 2622e18ef407a8e8e3ddc3d6f0c77b756c493798 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Mon, 30 May 2011 22:30:46 -0700 Subject: [PATCH 37/62] IB/cm: Do not automatically disconnect XRC TGT QPs Because an XRC TGT QP can end up being shared among multiple processes, don't have the ib_cm automatically send a DREQ when the userspace process that owns the ib_cm_id exits. Disconnect can be initiated by the user directly; otherwise, the owner of the XRC INI QP controls the connection. Note that as a result of the process exiting, the ib_cm will stop tracking the XRC connection on the target side. For the purposes of disconnecting, this isn't a big deal. The ib_cm will respond to the DREQ appropriately. For other messages, mainly LAP, the CM will reject the request, since there's no one available to route the request to. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/cm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 42a7a9bae44e..4104ea2427c2 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -889,6 +889,8 @@ retest: break; case IB_CM_ESTABLISHED: spin_unlock_irq(&cm_id_priv->lock); + if (cm_id_priv->qp_type == IB_QPT_XRC_TGT) + break; ib_send_cm_dreq(cm_id, NULL, 0); goto retest; case IB_CM_DREQ_SENT: From 012a8ff577f95211c6ffd3b77a94c34ebae009b6 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 2 Jun 2011 09:01:33 -0700 Subject: [PATCH 38/62] IB/mlx4: Add support for XRC domains Support creating and destroying XRC domains. Any sharing of the XRCD is managed above the low-level driver. 
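For context, a hedged consumer-side sketch of what the driver entry points added below end up servicing (illustrative helper name; it assumes the ib_alloc_xrcd()/ib_dealloc_xrcd() verbs introduced earlier in this series):

#include <rdma/ib_verbs.h>

/* Illustrative sketch only: allocate an XRC domain and release it again. */
static int example_xrcd_roundtrip(struct ib_device *device)
{
	struct ib_xrcd *xrcd;

	xrcd = ib_alloc_xrcd(device);
	if (IS_ERR(xrcd))
		return PTR_ERR(xrcd);	/* e.g. -ENOSYS without XRC support */

	/* ... create XRC SRQs and TGT QPs against the domain here ... */

	return ib_dealloc_xrcd(xrcd);
}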
Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 59 ++++++++++++++++++++++++++++ drivers/infiniband/hw/mlx4/mlx4_ib.h | 12 ++++++ drivers/net/mlx4/fw.c | 6 +++ drivers/net/mlx4/fw.h | 2 + drivers/net/mlx4/main.c | 18 ++++++++- drivers/net/mlx4/mlx4.h | 3 ++ drivers/net/mlx4/pd.c | 30 ++++++++++++++ include/linux/mlx4/device.h | 5 +++ 8 files changed, 134 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index fa643f4f4e28..23e45df9ae36 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -566,6 +566,57 @@ static int mlx4_ib_dealloc_pd(struct ib_pd *pd) return 0; } +static struct ib_xrcd *mlx4_ib_alloc_xrcd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx4_ib_xrcd *xrcd; + int err; + + if (!(to_mdev(ibdev)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) + return ERR_PTR(-ENOSYS); + + xrcd = kmalloc(sizeof *xrcd, GFP_KERNEL); + if (!xrcd) + return ERR_PTR(-ENOMEM); + + err = mlx4_xrcd_alloc(to_mdev(ibdev)->dev, &xrcd->xrcdn); + if (err) + goto err1; + + xrcd->pd = ib_alloc_pd(ibdev); + if (IS_ERR(xrcd->pd)) { + err = PTR_ERR(xrcd->pd); + goto err2; + } + + xrcd->cq = ib_create_cq(ibdev, NULL, NULL, xrcd, 1, 0); + if (IS_ERR(xrcd->cq)) { + err = PTR_ERR(xrcd->cq); + goto err3; + } + + return &xrcd->ibxrcd; + +err3: + ib_dealloc_pd(xrcd->pd); +err2: + mlx4_xrcd_free(to_mdev(ibdev)->dev, xrcd->xrcdn); +err1: + kfree(xrcd); + return ERR_PTR(err); +} + +static int mlx4_ib_dealloc_xrcd(struct ib_xrcd *xrcd) +{ + ib_destroy_cq(to_mxrcd(xrcd)->cq); + ib_dealloc_pd(to_mxrcd(xrcd)->pd); + mlx4_xrcd_free(to_mdev(xrcd->device)->dev, to_mxrcd(xrcd)->xrcdn); + kfree(xrcd); + + return 0; +} + static int add_gid_entry(struct ib_qp *ibqp, union ib_gid *gid) { struct mlx4_ib_qp *mqp = to_mqp(ibqp); @@ -1093,6 +1144,14 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.unmap_fmr = mlx4_ib_unmap_fmr; ibdev->ib_dev.dealloc_fmr = mlx4_ib_fmr_dealloc; + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) { + ibdev->ib_dev.alloc_xrcd = mlx4_ib_alloc_xrcd; + ibdev->ib_dev.dealloc_xrcd = mlx4_ib_dealloc_xrcd; + ibdev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) | + (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); + } + spin_lock_init(&iboe->lock); if (init_node_data(ibdev)) diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index e4bf2cff8662..ce150b0e2cc8 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -56,6 +56,13 @@ struct mlx4_ib_pd { u32 pdn; }; +struct mlx4_ib_xrcd { + struct ib_xrcd ibxrcd; + u32 xrcdn; + struct ib_pd *pd; + struct ib_cq *cq; +}; + struct mlx4_ib_cq_buf { struct mlx4_buf buf; struct mlx4_mtt mtt; @@ -211,6 +218,11 @@ static inline struct mlx4_ib_pd *to_mpd(struct ib_pd *ibpd) return container_of(ibpd, struct mlx4_ib_pd, ibpd); } +static inline struct mlx4_ib_xrcd *to_mxrcd(struct ib_xrcd *ibxrcd) +{ + return container_of(ibxrcd, struct mlx4_ib_xrcd, ibxrcd); +} + static inline struct mlx4_ib_cq *to_mcq(struct ib_cq *ibcq) { return container_of(ibcq, struct mlx4_ib_cq, ibcq); diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c index 7eb8ba822e97..875838b8799c 100644 --- a/drivers/net/mlx4/fw.c +++ b/drivers/net/mlx4/fw.c @@ -204,6 +204,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_DEV_CAP_MAX_MCG_OFFSET 0x63 #define QUERY_DEV_CAP_RSVD_PD_OFFSET 0x64 
#define QUERY_DEV_CAP_MAX_PD_OFFSET 0x65 +#define QUERY_DEV_CAP_RSVD_XRC_OFFSET 0x66 +#define QUERY_DEV_CAP_MAX_XRC_OFFSET 0x67 #define QUERY_DEV_CAP_MAX_COUNTERS_OFFSET 0x68 #define QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET 0x80 #define QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET 0x82 @@ -318,6 +320,10 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev_cap->reserved_pds = field >> 4; MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_PD_OFFSET); dev_cap->max_pds = 1 << (field & 0x3f); + MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_XRC_OFFSET); + dev_cap->reserved_xrcds = field >> 4; + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_PD_OFFSET); + dev_cap->max_xrcds = 1 << (field & 0x1f); MLX4_GET(size, outbox, QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET); dev_cap->rdmarc_entry_sz = size; diff --git a/drivers/net/mlx4/fw.h b/drivers/net/mlx4/fw.h index 1e8ecc3708e2..bf5ec2286528 100644 --- a/drivers/net/mlx4/fw.h +++ b/drivers/net/mlx4/fw.h @@ -93,6 +93,8 @@ struct mlx4_dev_cap { int max_mcgs; int reserved_pds; int max_pds; + int reserved_xrcds; + int max_xrcds; int qpc_entry_sz; int rdmarc_entry_sz; int altc_entry_sz; diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index f0ee35df4dd7..660b691ede76 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -220,6 +220,10 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.reserved_mrws = dev_cap->reserved_mrws; dev->caps.reserved_uars = dev_cap->reserved_uars; dev->caps.reserved_pds = dev_cap->reserved_pds; + dev->caps.reserved_xrcds = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ? + dev_cap->reserved_xrcds : 0; + dev->caps.max_xrcds = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ? + dev_cap->max_xrcds : 0; dev->caps.mtt_entry_sz = dev->caps.mtts_per_seg * dev_cap->mtt_entry_sz; dev->caps.max_msg_sz = dev_cap->max_msg_sz; dev->caps.page_size_cap = ~(u32) (dev_cap->min_page_sz - 1); @@ -912,11 +916,18 @@ static int mlx4_setup_hca(struct mlx4_dev *dev) goto err_kar_unmap; } + err = mlx4_init_xrcd_table(dev); + if (err) { + mlx4_err(dev, "Failed to initialize " + "reliable connection domain table, aborting.\n"); + goto err_pd_table_free; + } + err = mlx4_init_mr_table(dev); if (err) { mlx4_err(dev, "Failed to initialize " "memory region table, aborting.\n"); - goto err_pd_table_free; + goto err_xrcd_table_free; } err = mlx4_init_eq_table(dev); @@ -1033,6 +1044,9 @@ err_eq_table_free: err_mr_table_free: mlx4_cleanup_mr_table(dev); +err_xrcd_table_free: + mlx4_cleanup_xrcd_table(dev); + err_pd_table_free: mlx4_cleanup_pd_table(dev); @@ -1355,6 +1369,7 @@ err_port: mlx4_cmd_use_polling(dev); mlx4_cleanup_eq_table(dev); mlx4_cleanup_mr_table(dev); + mlx4_cleanup_xrcd_table(dev); mlx4_cleanup_pd_table(dev); mlx4_cleanup_uar_table(dev); @@ -1416,6 +1431,7 @@ static void mlx4_remove_one(struct pci_dev *pdev) mlx4_cmd_use_polling(dev); mlx4_cleanup_eq_table(dev); mlx4_cleanup_mr_table(dev); + mlx4_cleanup_xrcd_table(dev); mlx4_cleanup_pd_table(dev); iounmap(priv->kar); diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index a2fcd8402d37..fee2d05aa1dc 100644 --- a/drivers/net/mlx4/mlx4.h +++ b/drivers/net/mlx4/mlx4.h @@ -335,6 +335,7 @@ struct mlx4_priv { struct mlx4_cmd cmd; struct mlx4_bitmap pd_bitmap; + struct mlx4_bitmap xrcd_bitmap; struct mlx4_uar_table uar_table; struct mlx4_mr_table mr_table; struct mlx4_cq_table cq_table; @@ -384,6 +385,7 @@ int mlx4_alloc_eq_table(struct mlx4_dev *dev); void mlx4_free_eq_table(struct mlx4_dev *dev); int mlx4_init_pd_table(struct mlx4_dev *dev); 
+int mlx4_init_xrcd_table(struct mlx4_dev *dev); int mlx4_init_uar_table(struct mlx4_dev *dev); int mlx4_init_mr_table(struct mlx4_dev *dev); int mlx4_init_eq_table(struct mlx4_dev *dev); @@ -393,6 +395,7 @@ int mlx4_init_srq_table(struct mlx4_dev *dev); int mlx4_init_mcg_table(struct mlx4_dev *dev); void mlx4_cleanup_pd_table(struct mlx4_dev *dev); +void mlx4_cleanup_xrcd_table(struct mlx4_dev *dev); void mlx4_cleanup_uar_table(struct mlx4_dev *dev); void mlx4_cleanup_mr_table(struct mlx4_dev *dev); void mlx4_cleanup_eq_table(struct mlx4_dev *dev); diff --git a/drivers/net/mlx4/pd.c b/drivers/net/mlx4/pd.c index 1286b886dcea..3736163e30e9 100644 --- a/drivers/net/mlx4/pd.c +++ b/drivers/net/mlx4/pd.c @@ -61,6 +61,24 @@ void mlx4_pd_free(struct mlx4_dev *dev, u32 pdn) } EXPORT_SYMBOL_GPL(mlx4_pd_free); +int mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + *xrcdn = mlx4_bitmap_alloc(&priv->xrcd_bitmap); + if (*xrcdn == -1) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_xrcd_alloc); + +void mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn) +{ + mlx4_bitmap_free(&mlx4_priv(dev)->xrcd_bitmap, xrcdn); +} +EXPORT_SYMBOL_GPL(mlx4_xrcd_free); + int mlx4_init_pd_table(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -74,6 +92,18 @@ void mlx4_cleanup_pd_table(struct mlx4_dev *dev) mlx4_bitmap_cleanup(&mlx4_priv(dev)->pd_bitmap); } +int mlx4_init_xrcd_table(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + return mlx4_bitmap_init(&priv->xrcd_bitmap, (1 << 16), + (1 << 16) - 1, dev->caps.reserved_xrcds + 1, 0); +} + +void mlx4_cleanup_xrcd_table(struct mlx4_dev *dev) +{ + mlx4_bitmap_cleanup(&mlx4_priv(dev)->xrcd_bitmap); +} int mlx4_uar_alloc(struct mlx4_dev *dev, struct mlx4_uar *uar) { diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 53ef894bfa05..6a3478d0b330 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -61,6 +61,7 @@ enum { MLX4_DEV_CAP_FLAG_RC = 1LL << 0, MLX4_DEV_CAP_FLAG_UC = 1LL << 1, MLX4_DEV_CAP_FLAG_UD = 1LL << 2, + MLX4_DEV_CAP_FLAG_XRC = 1LL << 3, MLX4_DEV_CAP_FLAG_SRQ = 1LL << 6, MLX4_DEV_CAP_FLAG_IPOIB_CSUM = 1LL << 7, MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR = 1LL << 8, @@ -256,6 +257,8 @@ struct mlx4_caps { int num_qp_per_mgm; int num_pds; int reserved_pds; + int max_xrcds; + int reserved_xrcds; int mtt_entry_sz; u32 max_msg_sz; u32 page_size_cap; @@ -499,6 +502,8 @@ static inline void *mlx4_buf_offset(struct mlx4_buf *buf, int offset) int mlx4_pd_alloc(struct mlx4_dev *dev, u32 *pdn); void mlx4_pd_free(struct mlx4_dev *dev, u32 pdn); +int mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn); +void mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn); int mlx4_uar_alloc(struct mlx4_dev *dev, struct mlx4_uar *uar); void mlx4_uar_free(struct mlx4_dev *dev, struct mlx4_uar *uar); From 18abd5ea571608a7c726fc56e21d3e31f9febfd0 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 2 Jun 2011 10:43:26 -0700 Subject: [PATCH 39/62] IB/mlx4: Add support for XRC SRQs Allow the user to create XRC SRQs. This patch is based on a patch from Jack Morgenstrein . 
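A hedged sketch of the consumer-visible side of this change (illustrative helper name and sizes; the ib_srq_init_attr fields are the ones the hunks below consume): an XRC SRQ is requested by setting srq_type and the xrc extension of the init attributes.

#include <rdma/ib_verbs.h>

/* Illustrative sketch only: request an XRC SRQ bound to an xrcd and a CQ. */
static struct ib_srq *example_create_xrc_srq(struct ib_pd *pd,
					     struct ib_xrcd *xrcd,
					     struct ib_cq *cq)
{
	struct ib_srq_init_attr attr = {
		.srq_type = IB_SRQT_XRC,
		.attr = {
			.max_wr	 = 128,	/* arbitrary example sizes */
			.max_sge = 1,
		},
	};

	attr.ext.xrc.xrcd = xrcd;
	attr.ext.xrc.cq = cq;

	return ib_create_srq(pd, &attr);
}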
Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 3 ++- drivers/infiniband/hw/mlx4/srq.c | 13 +++++++++---- drivers/net/mlx4/srq.c | 20 +++++++++++--------- include/linux/mlx4/device.h | 4 ++-- 4 files changed, 24 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 23e45df9ae36..42a538e5df36 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1095,7 +1095,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | - (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ); + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ); ibdev->ib_dev.query_device = mlx4_ib_query_device; ibdev->ib_dev.query_port = mlx4_ib_query_port; diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index 4f7f7600d27c..39542f3703b8 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -76,14 +76,13 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, struct mlx4_ib_srq *srq; struct mlx4_wqe_srq_next_seg *next; struct mlx4_wqe_data_seg *scatter; + u32 cqn; + u16 xrcdn; int desc_size; int buf_size; int err; int i; - if (init_attr->srq_type != IB_SRQT_BASIC) - return ERR_PTR(-ENOSYS); - /* Sanity check SRQ size before proceeding */ if (init_attr->attr.max_wr >= dev->dev->caps.max_srq_wqes || init_attr->attr.max_sge > dev->dev->caps.max_srq_sge) @@ -177,12 +176,18 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, } } - err = mlx4_srq_alloc(dev->dev, to_mpd(pd)->pdn, &srq->mtt, + cqn = (init_attr->srq_type == IB_SRQT_XRC) ? + to_mcq(init_attr->ext.xrc.cq)->mcq.cqn : 0; + xrcdn = (init_attr->srq_type == IB_SRQT_XRC) ? 
+ to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn : + (u16) dev->dev->caps.reserved_xrcds; + err = mlx4_srq_alloc(dev->dev, to_mpd(pd)->pdn, cqn, xrcdn, &srq->mtt, srq->db.dma, &srq->msrq); if (err) goto err_wrid; srq->msrq.event = mlx4_ib_srq_event; + srq->ibsrq.ext.xrc.srq_num = srq->msrq.srqn; if (pd->uobject) if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof (__u32))) { diff --git a/drivers/net/mlx4/srq.c b/drivers/net/mlx4/srq.c index 3b07b80a0456..a20b141dbb5c 100644 --- a/drivers/net/mlx4/srq.c +++ b/drivers/net/mlx4/srq.c @@ -40,20 +40,20 @@ struct mlx4_srq_context { __be32 state_logsize_srqn; u8 logstride; - u8 reserved1[3]; - u8 pg_offset; - u8 reserved2[3]; - u32 reserved3; + u8 reserved1; + __be16 xrcd; + __be32 pg_offset_cqn; + u32 reserved2; u8 log_page_size; - u8 reserved4[2]; + u8 reserved3[2]; u8 mtt_base_addr_h; __be32 mtt_base_addr_l; __be32 pd; __be16 limit_watermark; __be16 wqe_cnt; - u16 reserved5; + u16 reserved4; __be16 wqe_counter; - u32 reserved6; + u32 reserved5; __be64 db_rec_addr; }; @@ -109,8 +109,8 @@ static int mlx4_QUERY_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox MLX4_CMD_TIME_CLASS_A); } -int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, struct mlx4_mtt *mtt, - u64 db_rec, struct mlx4_srq *srq) +int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcd, + struct mlx4_mtt *mtt, u64 db_rec, struct mlx4_srq *srq) { struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; struct mlx4_cmd_mailbox *mailbox; @@ -148,6 +148,8 @@ int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, struct mlx4_mtt *mtt, srq_context->state_logsize_srqn = cpu_to_be32((ilog2(srq->max) << 24) | srq->srqn); srq_context->logstride = srq->wqe_shift - 4; + srq_context->xrcd = cpu_to_be16(xrcd); + srq_context->pg_offset_cqn = cpu_to_be32(cqn & 0xffffff); srq_context->log_page_size = mtt->page_shift - MLX4_ICM_PAGE_SHIFT; mtt_addr = mlx4_mtt_addr(dev, mtt); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 6a3478d0b330..9ab0f6a80d60 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -543,8 +543,8 @@ void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt); int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp); void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp); -int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, struct mlx4_mtt *mtt, - u64 db_rec, struct mlx4_srq *srq); +int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcdn, + struct mlx4_mtt *mtt, u64 db_rec, struct mlx4_srq *srq); void mlx4_srq_free(struct mlx4_dev *dev, struct mlx4_srq *srq); int mlx4_srq_arm(struct mlx4_dev *dev, struct mlx4_srq *srq, int limit_watermark); int mlx4_srq_query(struct mlx4_dev *dev, struct mlx4_srq *srq, int *limit_watermark); From 0a1405da9952a72dd587829a3321695adde7dca1 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 2 Jun 2011 11:32:15 -0700 Subject: [PATCH 40/62] IB/mlx4: Add support for XRC QPs Support the creation of XRC INI and TGT QPs. To handle the case where a CQ or PD is not provided, we allocate them internally with the xrcd. 
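To make the "no CQ or PD provided" case concrete, a hedged consumer-side sketch (illustrative helper name; the qp_init_attr fields are the ones the code below consumes): an XRC TGT QP is created against the xrcd alone, with a NULL PD, and the driver falls back to the PD and CQ it allocated together with the domain.

#include <rdma/ib_verbs.h>

/* Illustrative sketch only: create an XRC TGT QP; PD and CQ come from the xrcd. */
static struct ib_qp *example_create_xrc_tgt_qp(struct ib_xrcd *xrcd,
					       void (*handler)(struct ib_event *, void *),
					       void *context)
{
	struct ib_qp_init_attr attr = {
		.event_handler	= handler,
		.qp_context	= context,
		.qp_type	= IB_QPT_XRC_TGT,
		.xrcd		= xrcd,
	};

	return ib_create_qp(NULL, &attr);
}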
Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 2 + drivers/infiniband/hw/mlx4/mlx4_ib.h | 1 + drivers/infiniband/hw/mlx4/qp.c | 120 +++++++++++++++++++-------- drivers/net/mlx4/qp.c | 3 + include/linux/mlx4/qp.h | 3 +- 5 files changed, 95 insertions(+), 34 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 42a538e5df36..aec76ad77872 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -128,6 +128,8 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_REMOTE_INV) && (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_FAST_REG_WR)) props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) + props->device_cap_flags |= IB_DEVICE_XRC; props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) & 0xffffff; diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index ce150b0e2cc8..ed80345c99ae 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -145,6 +145,7 @@ struct mlx4_ib_qp { struct mlx4_mtt mtt; int buf_size; struct mutex mutex; + u16 xrcdn; u32 flags; u8 port; u8 alt_port; diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 3a91d9d8dc51..1bbe64103674 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -302,15 +302,14 @@ static int send_wqe_overhead(enum ib_qp_type type, u32 flags) } static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, - int is_user, int has_srq, struct mlx4_ib_qp *qp) + int is_user, int has_rq, struct mlx4_ib_qp *qp) { /* Sanity check RQ size before proceeding */ if (cap->max_recv_wr > dev->dev->caps.max_wqes || cap->max_recv_sge > dev->dev->caps.max_rq_sg) return -EINVAL; - if (has_srq) { - /* QPs attached to an SRQ should have no RQ */ + if (!has_rq) { if (cap->max_recv_wr) return -EINVAL; @@ -463,6 +462,14 @@ static int set_user_sq_size(struct mlx4_ib_dev *dev, return 0; } +static int qp_has_rq(struct ib_qp_init_attr *attr) +{ + if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT) + return 0; + + return !attr->srq; +} + static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata, int sqpn, struct mlx4_ib_qp *qp) @@ -479,7 +486,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); - err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, !!init_attr->srq, qp); + err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, qp_has_rq(init_attr), qp); if (err) goto err; @@ -513,7 +520,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (err) goto err_mtt; - if (!init_attr->srq) { + if (qp_has_rq(init_attr)) { err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context), ucmd.db_addr, &qp->db); if (err) @@ -532,7 +539,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (err) goto err; - if (!init_attr->srq) { + if (qp_has_rq(init_attr)) { err = mlx4_db_alloc(dev->dev, &qp->db, 0); if (err) goto err; @@ -575,6 +582,9 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (err) goto err_qpn; + if (init_attr->qp_type == IB_QPT_XRC_TGT) + qp->mqp.qpn |= (1 << 23); + /* * Hardware wants QPN written in 
big-endian order (after * shifting) for send doorbell. Precompute this value to save @@ -592,9 +602,8 @@ err_qpn: err_wrid: if (pd->uobject) { - if (!init_attr->srq) - mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), - &qp->db); + if (qp_has_rq(init_attr)) + mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &qp->db); } else { kfree(qp->sq.wrid); kfree(qp->rq.wrid); @@ -610,7 +619,7 @@ err_buf: mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); err_db: - if (!pd->uobject && !init_attr->srq) + if (!pd->uobject && qp_has_rq(init_attr)) mlx4_db_free(dev->dev, &qp->db); err: @@ -671,6 +680,33 @@ static void del_gid_entries(struct mlx4_ib_qp *qp) } } +static struct mlx4_ib_pd *get_pd(struct mlx4_ib_qp *qp) +{ + if (qp->ibqp.qp_type == IB_QPT_XRC_TGT) + return to_mpd(to_mxrcd(qp->ibqp.xrcd)->pd); + else + return to_mpd(qp->ibqp.pd); +} + +static void get_cqs(struct mlx4_ib_qp *qp, + struct mlx4_ib_cq **send_cq, struct mlx4_ib_cq **recv_cq) +{ + switch (qp->ibqp.qp_type) { + case IB_QPT_XRC_TGT: + *send_cq = to_mcq(to_mxrcd(qp->ibqp.xrcd)->cq); + *recv_cq = *send_cq; + break; + case IB_QPT_XRC_INI: + *send_cq = to_mcq(qp->ibqp.send_cq); + *recv_cq = *send_cq; + break; + default: + *send_cq = to_mcq(qp->ibqp.send_cq); + *recv_cq = to_mcq(qp->ibqp.recv_cq); + break; + } +} + static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, int is_user) { @@ -682,8 +718,7 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, printk(KERN_WARNING "mlx4_ib: modify QP %06x to RESET failed.\n", qp->mqp.qpn); - send_cq = to_mcq(qp->ibqp.send_cq); - recv_cq = to_mcq(qp->ibqp.recv_cq); + get_cqs(qp, &send_cq, &recv_cq); mlx4_ib_lock_cqs(send_cq, recv_cq); @@ -706,7 +741,7 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, mlx4_mtt_cleanup(dev->dev, &qp->mtt); if (is_user) { - if (!qp->ibqp.srq) + if (qp->rq.wqe_cnt) mlx4_ib_db_unmap_user(to_mucontext(qp->ibqp.uobject->context), &qp->db); ib_umem_release(qp->umem); @@ -714,7 +749,7 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, kfree(qp->sq.wrid); kfree(qp->rq.wrid); mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); - if (!qp->ibqp.srq) + if (qp->rq.wqe_cnt) mlx4_db_free(dev->dev, &qp->db); } @@ -725,10 +760,10 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata) { - struct mlx4_ib_dev *dev = to_mdev(pd->device); struct mlx4_ib_sqp *sqp; struct mlx4_ib_qp *qp; int err; + u16 xrcdn = 0; /* * We only support LSO and multicast loopback blocking, and @@ -739,10 +774,20 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, return ERR_PTR(-EINVAL); if (init_attr->create_flags && - (pd->uobject || init_attr->qp_type != IB_QPT_UD)) + (udata || init_attr->qp_type != IB_QPT_UD)) return ERR_PTR(-EINVAL); switch (init_attr->qp_type) { + case IB_QPT_XRC_TGT: + pd = to_mxrcd(init_attr->xrcd)->pd; + xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn; + init_attr->send_cq = to_mxrcd(init_attr->xrcd)->cq; + /* fall through */ + case IB_QPT_XRC_INI: + if (!(to_mdev(pd->device)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) + return ERR_PTR(-ENOSYS); + init_attr->recv_cq = init_attr->send_cq; + /* fall through */ case IB_QPT_RC: case IB_QPT_UC: case IB_QPT_UD: @@ -751,13 +796,14 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, if (!qp) return ERR_PTR(-ENOMEM); - err = create_qp_common(dev, pd, init_attr, udata, 0, qp); + err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata, 0, qp); if (err) { kfree(qp); 
return ERR_PTR(err); } qp->ibqp.qp_num = qp->mqp.qpn; + qp->xrcdn = xrcdn; break; } @@ -765,7 +811,7 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, case IB_QPT_GSI: { /* Userspace is not allowed to create special QPs: */ - if (pd->uobject) + if (udata) return ERR_PTR(-EINVAL); sqp = kzalloc(sizeof *sqp, GFP_KERNEL); @@ -774,8 +820,8 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, qp = &sqp->qp; - err = create_qp_common(dev, pd, init_attr, udata, - dev->dev->caps.sqp_start + + err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata, + to_mdev(pd->device)->dev->caps.sqp_start + (init_attr->qp_type == IB_QPT_SMI ? 0 : 2) + init_attr->port_num - 1, qp); @@ -801,11 +847,13 @@ int mlx4_ib_destroy_qp(struct ib_qp *qp) { struct mlx4_ib_dev *dev = to_mdev(qp->device); struct mlx4_ib_qp *mqp = to_mqp(qp); + struct mlx4_ib_pd *pd; if (is_qp0(dev, mqp)) mlx4_CLOSE_PORT(dev->dev, mqp->port); - destroy_qp_common(dev, mqp, !!qp->pd->uobject); + pd = get_pd(mqp); + destroy_qp_common(dev, mqp, !!pd->ibpd.uobject); if (is_sqp(dev, mqp)) kfree(to_msqp(mqp)); @@ -821,6 +869,8 @@ static int to_mlx4_st(enum ib_qp_type type) case IB_QPT_RC: return MLX4_QP_ST_RC; case IB_QPT_UC: return MLX4_QP_ST_UC; case IB_QPT_UD: return MLX4_QP_ST_UD; + case IB_QPT_XRC_INI: + case IB_QPT_XRC_TGT: return MLX4_QP_ST_XRC; case IB_QPT_SMI: case IB_QPT_GSI: return MLX4_QP_ST_MLX; default: return -1; @@ -959,6 +1009,8 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, { struct mlx4_ib_dev *dev = to_mdev(ibqp->device); struct mlx4_ib_qp *qp = to_mqp(ibqp); + struct mlx4_ib_pd *pd; + struct mlx4_ib_cq *send_cq, *recv_cq; struct mlx4_qp_context *context; enum mlx4_qp_optpar optpar = 0; int sqd_event; @@ -1014,8 +1066,10 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, context->sq_size_stride = ilog2(qp->sq.wqe_cnt) << 3; context->sq_size_stride |= qp->sq.wqe_shift - 4; - if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { context->sq_size_stride |= !!qp->sq_no_prefetch << 7; + context->xrcd = cpu_to_be32((u32) qp->xrcdn); + } if (qp->ibqp.uobject) context->usr_page = cpu_to_be32(to_mucontext(ibqp->uobject->context)->uar.index); @@ -1079,8 +1133,12 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH; } - context->pd = cpu_to_be32(to_mpd(ibqp->pd)->pdn); - context->params1 = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28); + pd = get_pd(qp); + get_cqs(qp, &send_cq, &recv_cq); + context->pd = cpu_to_be32(pd->pdn); + context->cqn_send = cpu_to_be32(send_cq->mcq.cqn); + context->cqn_recv = cpu_to_be32(recv_cq->mcq.cqn); + context->params1 = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28); /* Set "fast registration enabled" for all kernel QPs */ if (!qp->ibqp.uobject) @@ -1106,8 +1164,6 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, if (attr_mask & IB_QP_SQ_PSN) context->next_send_psn = cpu_to_be32(attr->sq_psn); - context->cqn_send = cpu_to_be32(to_mcq(ibqp->send_cq)->mcq.cqn); - if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { if (attr->max_dest_rd_atomic) context->params2 |= @@ -1130,8 +1186,6 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, if (attr_mask & IB_QP_RQ_PSN) context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn); - context->cqn_recv = cpu_to_be32(to_mcq(ibqp->recv_cq)->mcq.cqn); - if (attr_mask & IB_QP_QKEY) { context->qkey = cpu_to_be32(attr->qkey); optpar |= MLX4_QP_OPTPAR_Q_KEY; @@ -1140,7 +1194,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, if (ibqp->srq) context->srqn = cpu_to_be32(1 
<< 24 | to_msrq(ibqp->srq)->msrq.srqn); - if (!ibqp->srq && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) context->db_rec_addr = cpu_to_be64(qp->db.dma); if (cur_state == IB_QPS_INIT && @@ -1225,17 +1279,17 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, * entries and reinitialize the QP. */ if (new_state == IB_QPS_RESET && !ibqp->uobject) { - mlx4_ib_cq_clean(to_mcq(ibqp->recv_cq), qp->mqp.qpn, + mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, ibqp->srq ? to_msrq(ibqp->srq): NULL); - if (ibqp->send_cq != ibqp->recv_cq) - mlx4_ib_cq_clean(to_mcq(ibqp->send_cq), qp->mqp.qpn, NULL); + if (send_cq != recv_cq) + mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); qp->rq.head = 0; qp->rq.tail = 0; qp->sq.head = 0; qp->sq.tail = 0; qp->sq_next_wqe = 0; - if (!ibqp->srq) + if (qp->rq.wqe_cnt) *qp->db.db = 0; } diff --git a/drivers/net/mlx4/qp.c b/drivers/net/mlx4/qp.c index ec9350e5f21a..51c53898c35f 100644 --- a/drivers/net/mlx4/qp.c +++ b/drivers/net/mlx4/qp.c @@ -280,6 +280,9 @@ int mlx4_init_qp_table(struct mlx4_dev *dev) * We reserve 2 extra QPs per port for the special QPs. The * block of special QPs must be aligned to a multiple of 8, so * round up. + * + * We also reserve the MSB of the 24-bit QP number to indicate + * that a QP is an XRC QP. */ dev->caps.sqp_start = ALIGN(dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], 8); diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h index 4001c8249dbb..48cc4cb97858 100644 --- a/include/linux/mlx4/qp.h +++ b/include/linux/mlx4/qp.h @@ -75,6 +75,7 @@ enum { MLX4_QP_ST_UC = 0x1, MLX4_QP_ST_RD = 0x2, MLX4_QP_ST_UD = 0x3, + MLX4_QP_ST_XRC = 0x6, MLX4_QP_ST_MLX = 0x7 }; @@ -137,7 +138,7 @@ struct mlx4_qp_context { __be32 ssn; __be32 params2; __be32 rnr_nextrecvpsn; - __be32 srcd; + __be32 xrcd; __be32 cqn_recv; __be64 db_rec_addr; __be32 qkey; From 0e0ec7e0638ef48e0c661873dfcc8caccab984c6 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Mon, 8 Aug 2011 15:31:51 -0700 Subject: [PATCH 41/62] RDMA/core: Export ib_open_qp() to share XRC TGT QPs XRC TGT QPs are shared resources among multiple processes. Since the creating process may exit, allow other processes which share the same XRC domain to open an existing QP. This allows us to transfer ownership of an XRC TGT QP to another process. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs_cmd.c | 13 +- drivers/infiniband/core/uverbs_main.c | 4 +- drivers/infiniband/core/verbs.c | 167 ++++++++++++++++++++------ include/rdma/ib_verbs.h | 30 ++++- 4 files changed, 164 insertions(+), 50 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 9058e38ca4cd..c4c308cd2034 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1463,6 +1463,7 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, } if (cmd.qp_type != IB_QPT_XRC_TGT) { + qp->real_qp = qp; qp->device = device; qp->pd = pd; qp->send_cq = attr.send_cq; @@ -1729,8 +1730,12 @@ ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, attr->alt_ah_attr.ah_flags = cmd.alt_dest.is_global ? 
IB_AH_GRH : 0; attr->alt_ah_attr.port_num = cmd.alt_dest.port_num; - ret = qp->device->modify_qp(qp, attr, - modify_qp_mask(qp->qp_type, cmd.attr_mask), &udata); + if (qp->real_qp == qp) { + ret = qp->device->modify_qp(qp, attr, + modify_qp_mask(qp->qp_type, cmd.attr_mask), &udata); + } else { + ret = ib_modify_qp(qp, attr, modify_qp_mask(qp->qp_type, cmd.attr_mask)); + } put_qp_read(qp); @@ -1927,7 +1932,7 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, } resp.bad_wr = 0; - ret = qp->device->post_send(qp, wr, &bad_wr); + ret = qp->device->post_send(qp->real_qp, wr, &bad_wr); if (ret) for (next = wr; next; next = next->next) { ++resp.bad_wr; @@ -2065,7 +2070,7 @@ ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file, goto out; resp.bad_wr = 0; - ret = qp->device->post_recv(qp, wr, &bad_wr); + ret = qp->device->post_recv(qp->real_qp, wr, &bad_wr); put_qp_read(qp); diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 0cb69e039f75..9c877e24eb60 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -206,8 +206,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, container_of(uobj, struct ib_uqp_object, uevent.uobject); idr_remove_uobj(&ib_uverbs_qp_idr, uobj); - if (qp->qp_type == IB_QPT_XRC_TGT) { - ib_release_qp(qp); + if (qp != qp->real_qp) { + ib_close_qp(qp); } else { ib_uverbs_detach_umcast(qp, uqp); ib_destroy_qp(qp); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index a6d95e635699..e02898bcc991 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -316,6 +317,14 @@ EXPORT_SYMBOL(ib_destroy_srq); /* Queue pairs */ +static void __ib_shared_qp_event_handler(struct ib_event *event, void *context) +{ + struct ib_qp *qp = context; + + list_for_each_entry(event->element.qp, &qp->open_list, open_list) + event->element.qp->event_handler(event, event->element.qp->qp_context); +} + static void __ib_insert_xrcd_qp(struct ib_xrcd *xrcd, struct ib_qp *qp) { mutex_lock(&xrcd->tgt_qp_mutex); @@ -323,33 +332,90 @@ static void __ib_insert_xrcd_qp(struct ib_xrcd *xrcd, struct ib_qp *qp) mutex_unlock(&xrcd->tgt_qp_mutex); } -static void __ib_remove_xrcd_qp(struct ib_xrcd *xrcd, struct ib_qp *qp) +static struct ib_qp *__ib_open_qp(struct ib_qp *real_qp, + void (*event_handler)(struct ib_event *, void *), + void *qp_context) { - mutex_lock(&xrcd->tgt_qp_mutex); - list_del(&qp->xrcd_list); - mutex_unlock(&xrcd->tgt_qp_mutex); + struct ib_qp *qp; + unsigned long flags; + + qp = kzalloc(sizeof *qp, GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->real_qp = real_qp; + atomic_inc(&real_qp->usecnt); + qp->device = real_qp->device; + qp->event_handler = event_handler; + qp->qp_context = qp_context; + qp->qp_num = real_qp->qp_num; + qp->qp_type = real_qp->qp_type; + + spin_lock_irqsave(&real_qp->device->event_handler_lock, flags); + list_add(&qp->open_list, &real_qp->open_list); + spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags); + + return qp; } +struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, + struct ib_qp_open_attr *qp_open_attr) +{ + struct ib_qp *qp, *real_qp; + + if (qp_open_attr->qp_type != IB_QPT_XRC_TGT) + return ERR_PTR(-EINVAL); + + qp = ERR_PTR(-EINVAL); + mutex_lock(&xrcd->tgt_qp_mutex); + list_for_each_entry(real_qp, &xrcd->tgt_qp_list, xrcd_list) { + if (real_qp->qp_num == qp_open_attr->qp_num) { + qp = 
__ib_open_qp(real_qp, qp_open_attr->event_handler, + qp_open_attr->qp_context); + break; + } + } + mutex_unlock(&xrcd->tgt_qp_mutex); + return qp; +} +EXPORT_SYMBOL(ib_open_qp); + struct ib_qp *ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr) { - struct ib_qp *qp; + struct ib_qp *qp, *real_qp; struct ib_device *device; device = pd ? pd->device : qp_init_attr->xrcd->device; qp = device->create_qp(pd, qp_init_attr, NULL); if (!IS_ERR(qp)) { - qp->device = device; + qp->device = device; + qp->real_qp = qp; + qp->uobject = NULL; + qp->qp_type = qp_init_attr->qp_type; if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) { + qp->event_handler = __ib_shared_qp_event_handler; + qp->qp_context = qp; qp->pd = NULL; qp->send_cq = qp->recv_cq = NULL; qp->srq = NULL; qp->xrcd = qp_init_attr->xrcd; atomic_inc(&qp_init_attr->xrcd->usecnt); - __ib_insert_xrcd_qp(qp_init_attr->xrcd, qp); + INIT_LIST_HEAD(&qp->open_list); + atomic_set(&qp->usecnt, 0); + + real_qp = qp; + qp = __ib_open_qp(real_qp, qp_init_attr->event_handler, + qp_init_attr->qp_context); + if (!IS_ERR(qp)) + __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp); + else + real_qp->device->destroy_qp(real_qp); } else { + qp->event_handler = qp_init_attr->event_handler; + qp->qp_context = qp_init_attr->qp_context; if (qp_init_attr->qp_type == IB_QPT_XRC_INI) { qp->recv_cq = NULL; qp->srq = NULL; @@ -368,11 +434,6 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, atomic_inc(&pd->usecnt); atomic_inc(&qp_init_attr->send_cq->usecnt); } - - qp->uobject = NULL; - qp->event_handler = qp_init_attr->event_handler; - qp->qp_context = qp_init_attr->qp_context; - qp->qp_type = qp_init_attr->qp_type; } return qp; @@ -717,7 +778,7 @@ int ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask) { - return qp->device->modify_qp(qp, qp_attr, qp_attr_mask, NULL); + return qp->device->modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL); } EXPORT_SYMBOL(ib_modify_qp); @@ -727,26 +788,76 @@ int ib_query_qp(struct ib_qp *qp, struct ib_qp_init_attr *qp_init_attr) { return qp->device->query_qp ? 
- qp->device->query_qp(qp, qp_attr, qp_attr_mask, qp_init_attr) : + qp->device->query_qp(qp->real_qp, qp_attr, qp_attr_mask, qp_init_attr) : -ENOSYS; } EXPORT_SYMBOL(ib_query_qp); +int ib_close_qp(struct ib_qp *qp) +{ + struct ib_qp *real_qp; + unsigned long flags; + + real_qp = qp->real_qp; + if (real_qp == qp) + return -EINVAL; + + spin_lock_irqsave(&real_qp->device->event_handler_lock, flags); + list_del(&qp->open_list); + spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags); + + atomic_dec(&real_qp->usecnt); + kfree(qp); + + return 0; +} +EXPORT_SYMBOL(ib_close_qp); + +static int __ib_destroy_shared_qp(struct ib_qp *qp) +{ + struct ib_xrcd *xrcd; + struct ib_qp *real_qp; + int ret; + + real_qp = qp->real_qp; + xrcd = real_qp->xrcd; + + mutex_lock(&xrcd->tgt_qp_mutex); + ib_close_qp(qp); + if (atomic_read(&real_qp->usecnt) == 0) + list_del(&real_qp->xrcd_list); + else + real_qp = NULL; + mutex_unlock(&xrcd->tgt_qp_mutex); + + if (real_qp) { + ret = ib_destroy_qp(real_qp); + if (!ret) + atomic_dec(&xrcd->usecnt); + else + __ib_insert_xrcd_qp(xrcd, real_qp); + } + + return 0; +} + int ib_destroy_qp(struct ib_qp *qp) { struct ib_pd *pd; struct ib_cq *scq, *rcq; struct ib_srq *srq; - struct ib_xrcd *xrcd; int ret; + if (atomic_read(&qp->usecnt)) + return -EBUSY; + + if (qp->real_qp != qp) + return __ib_destroy_shared_qp(qp); + pd = qp->pd; scq = qp->send_cq; rcq = qp->recv_cq; srq = qp->srq; - xrcd = qp->xrcd; - if (xrcd) - __ib_remove_xrcd_qp(xrcd, qp); ret = qp->device->destroy_qp(qp); if (!ret) { @@ -758,32 +869,12 @@ int ib_destroy_qp(struct ib_qp *qp) atomic_dec(&rcq->usecnt); if (srq) atomic_dec(&srq->usecnt); - if (xrcd) - atomic_dec(&xrcd->usecnt); - } else if (xrcd) { - __ib_insert_xrcd_qp(xrcd, qp); } return ret; } EXPORT_SYMBOL(ib_destroy_qp); -int ib_release_qp(struct ib_qp *qp) -{ - unsigned long flags; - - if (qp->qp_type != IB_QPT_XRC_TGT) - return -EINVAL; - - spin_lock_irqsave(&qp->device->event_handler_lock, flags); - qp->event_handler = NULL; - spin_unlock_irqrestore(&qp->device->event_handler_lock, flags); - - atomic_dec(&qp->xrcd->usecnt); - return 0; -} -EXPORT_SYMBOL(ib_release_qp); - /* Completion queues */ struct ib_cq *ib_create_cq(struct ib_device *device, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index dfd9b87b7ffd..8705539bce75 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -605,6 +605,13 @@ struct ib_qp_init_attr { u8 port_num; /* special QP types only */ }; +struct ib_qp_open_attr { + void (*event_handler)(struct ib_event *, void *); + void *qp_context; + u32 qp_num; + enum ib_qp_type qp_type; +}; + enum ib_rnr_timeout { IB_RNR_TIMER_655_36 = 0, IB_RNR_TIMER_000_01 = 1, @@ -932,6 +939,9 @@ struct ib_qp { struct ib_srq *srq; struct ib_xrcd *xrcd; /* XRC TGT QPs only */ struct list_head xrcd_list; + atomic_t usecnt; /* count times opened */ + struct list_head open_list; + struct ib_qp *real_qp; struct ib_uobject *uobject; void (*event_handler)(struct ib_event *, void *); void *qp_context; @@ -1488,15 +1498,23 @@ int ib_query_qp(struct ib_qp *qp, int ib_destroy_qp(struct ib_qp *qp); /** - * ib_release_qp - Release an external reference to a QP. + * ib_open_qp - Obtain a reference to an existing sharable QP. + * @xrcd - XRC domain + * @qp_open_attr: Attributes identifying the QP to open. + * + * Returns a reference to a sharable QP. + */ +struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, + struct ib_qp_open_attr *qp_open_attr); + +/** + * ib_close_qp - Release an external reference to a QP. 
* @qp: The QP handle to release * - * The specified QP handle is released by the caller. If the QP is - * referenced internally, it is not destroyed until all internal - * references are released. After releasing the qp, the caller - * can no longer access it and all events on the QP are discarded. + * The opened QP handle is released by the caller. The underlying + * shared QP is not destroyed until all internal references are released. */ -int ib_release_qp(struct ib_qp *qp); +int ib_close_qp(struct ib_qp *qp); /** * ib_post_send - Posts a list of work requests to the send queue of From 42849b2697c36abdafa6aef64186b15055392046 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 11 Aug 2011 13:57:43 -0700 Subject: [PATCH 42/62] RDMA/uverbs: Export ib_open_qp() capability to user space Allow processes that share the same XRC domain to open an existing shareable QP. This permits those processes to receive events on the shared QP and transfer ownership, so that any process may modify the QP. The latter allows the creating process to exit, while a remaining process can still transition it for path migration purposes. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs.h | 1 + drivers/infiniband/core/uverbs_cmd.c | 92 +++++++++++++++++++++++++++ drivers/infiniband/core/uverbs_main.c | 3 +- drivers/infiniband/hw/mlx4/main.c | 3 +- include/rdma/ib_user_verbs.h | 14 +++- 5 files changed, 110 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 00ce032fcead..5bcb2afd3dcb 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -195,6 +195,7 @@ IB_UVERBS_DECLARE_CMD(poll_cq); IB_UVERBS_DECLARE_CMD(req_notify_cq); IB_UVERBS_DECLARE_CMD(destroy_cq); IB_UVERBS_DECLARE_CMD(create_qp); +IB_UVERBS_DECLARE_CMD(open_qp); IB_UVERBS_DECLARE_CMD(query_qp); IB_UVERBS_DECLARE_CMD(modify_qp); IB_UVERBS_DECLARE_CMD(destroy_qp); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index c4c308cd2034..254f1649c734 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1544,6 +1544,98 @@ err_put: return ret; } +ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file, + const char __user *buf, int in_len, int out_len) +{ + struct ib_uverbs_open_qp cmd; + struct ib_uverbs_create_qp_resp resp; + struct ib_udata udata; + struct ib_uqp_object *obj; + struct ib_xrcd *xrcd; + struct ib_uobject *uninitialized_var(xrcd_uobj); + struct ib_qp *qp; + struct ib_qp_open_attr attr; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + INIT_UDATA(&udata, buf + sizeof cmd, + (unsigned long) cmd.response + sizeof resp, + in_len - sizeof cmd, out_len - sizeof resp); + + obj = kmalloc(sizeof *obj, GFP_KERNEL); + if (!obj) + return -ENOMEM; + + init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &qp_lock_key); + down_write(&obj->uevent.uobject.mutex); + + xrcd = idr_read_xrcd(cmd.pd_handle, file->ucontext, &xrcd_uobj); + if (!xrcd) { + ret = -EINVAL; + goto err_put; + } + + attr.event_handler = ib_uverbs_qp_event_handler; + attr.qp_context = file; + attr.qp_num = cmd.qpn; + attr.qp_type = cmd.qp_type; + + obj->uevent.events_reported = 0; + INIT_LIST_HEAD(&obj->uevent.event_list); + INIT_LIST_HEAD(&obj->mcast_list); + + qp = ib_open_qp(xrcd, &attr); + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); + goto err_put; + } + + qp->uobject = 
&obj->uevent.uobject; + + obj->uevent.uobject.object = qp; + ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); + if (ret) + goto err_destroy; + + memset(&resp, 0, sizeof resp); + resp.qpn = qp->qp_num; + resp.qp_handle = obj->uevent.uobject.id; + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) { + ret = -EFAULT; + goto err_remove; + } + + put_xrcd_read(xrcd_uobj); + + mutex_lock(&file->mutex); + list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list); + mutex_unlock(&file->mutex); + + obj->uevent.uobject.live = 1; + + up_write(&obj->uevent.uobject.mutex); + + return in_len; + +err_remove: + idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); + +err_destroy: + ib_destroy_qp(qp); + +err_put: + put_xrcd_read(xrcd_uobj); + put_uobj_write(&obj->uevent.uobject); + return ret; +} + ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 9c877e24eb60..485ea6c60d8b 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -110,7 +110,8 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, [IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq, [IB_USER_VERBS_CMD_OPEN_XRCD] = ib_uverbs_open_xrcd, [IB_USER_VERBS_CMD_CLOSE_XRCD] = ib_uverbs_close_xrcd, - [IB_USER_VERBS_CMD_CREATE_XSRQ] = ib_uverbs_create_xsrq + [IB_USER_VERBS_CMD_CREATE_XSRQ] = ib_uverbs_create_xsrq, + [IB_USER_VERBS_CMD_OPEN_QP] = ib_uverbs_open_qp }; static void ib_uverbs_add_one(struct ib_device *device); diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index aec76ad77872..8a9ae720b56a 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1098,7 +1098,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | - (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ); + (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | + (1ull << IB_USER_VERBS_CMD_OPEN_QP); ibdev->ib_dev.query_device = mlx4_ib_query_device; ibdev->ib_dev.query_port = mlx4_ib_query_port; diff --git a/include/rdma/ib_user_verbs.h b/include/rdma/ib_user_verbs.h index 100abbbd6e6e..81aba3a73aa3 100644 --- a/include/rdma/ib_user_verbs.h +++ b/include/rdma/ib_user_verbs.h @@ -84,7 +84,8 @@ enum { IB_USER_VERBS_CMD_POST_SRQ_RECV, IB_USER_VERBS_CMD_OPEN_XRCD, IB_USER_VERBS_CMD_CLOSE_XRCD, - IB_USER_VERBS_CMD_CREATE_XSRQ + IB_USER_VERBS_CMD_CREATE_XSRQ, + IB_USER_VERBS_CMD_OPEN_QP }; /* @@ -422,6 +423,17 @@ struct ib_uverbs_create_qp { __u64 driver_data[0]; }; +struct ib_uverbs_open_qp { + __u64 response; + __u64 user_handle; + __u32 pd_handle; + __u32 qpn; + __u8 qp_type; + __u8 reserved[7]; + __u64 driver_data[0]; +}; + +/* also used for open response */ struct ib_uverbs_create_qp_resp { __u32 qp_handle; __u32 qpn; From 01e7da6ba53ca4d6189a1eae45607c0331c871f2 Mon Sep 17 00:00:00 2001 From: Kumar Sanghvi Date: Thu, 13 Oct 2011 13:51:30 +0530 Subject: [PATCH 43/62] RDMA/cxgb4: Make sure flush CQ entries are collected on connection close At the time when a peer closes the connection, iw_cxgb4 will not send a cq event if ibqp.uobject exists. In that case, its possible for a user application to get blocked in ibv_get_cq_event(). To resolve this, call the cq's comp_handler to unblock any read from ibv_get_cq_event(). 
This will trigger userspace to poll the cq and collect flush status completions for any pending work requests. Signed-off-by: Kumar Sanghvi Acked-by: Steve Wise Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb4/qp.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index a41578e48c7b..892fa7c6d310 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -966,8 +966,12 @@ static void flush_qp(struct c4iw_qp *qhp) if (qhp->ibqp.uobject) { t4_set_wq_in_error(&qhp->wq); t4_set_cq_in_error(&rchp->cq); - if (schp != rchp) + (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); + if (schp != rchp) { t4_set_cq_in_error(&schp->cq); + (*schp->ibcq.comp_handler)(&schp->ibcq, + schp->ibcq.cq_context); + } return; } __flush_qp(qhp, rchp, schp); From e14d62c05c0b8eff61c6fd46b4a78fb27c8cf38b Mon Sep 17 00:00:00 2001 From: Jonathan Lallinger Date: Thu, 13 Oct 2011 13:56:59 -0500 Subject: [PATCH 44/62] RDMA/cxgb4: Use correct QID in insert_recv_cqe() When creating flushed receive CQEs, set the QPID field in the t4_cqe to the SQ QID and not the RQ QID. Otherwise the poll code will not find the correct QP context. Signed-off by: Jonathan Lallinger Signed-off by: Steve Wise Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb4/cq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index 1720dc790d13..901c5fbf71a4 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -185,7 +185,7 @@ static void insert_recv_cqe(struct t4_wq *wq, struct t4_cq *cq) V_CQE_OPCODE(FW_RI_SEND) | V_CQE_TYPE(0) | V_CQE_SWCQE(1) | - V_CQE_QPID(wq->rq.qid)); + V_CQE_QPID(wq->sq.qid)); cqe.bits_type_ts = cpu_to_be64(V_CQE_GENBIT((u64)cq->gen)); cq->sw_queue[cq->sw_pidx] = cqe; t4_swcq_produce(cq); From 787adb9d6ad9afb498a1580a7d8ad05f779c488a Mon Sep 17 00:00:00 2001 From: Dotan Barak Date: Tue, 18 Oct 2011 15:22:14 +0200 Subject: [PATCH 45/62] IPoIB: Use the right function to do DMA unmap pages Pages that were mapped using ib_dma_map_page() should be unmapped using ib_dma_unmap_page(). 
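The pairing rule being enforced, as a hedged standalone sketch (illustrative helper names; the ib_dma_* calls are the standard verbs DMA wrappers):

#include <rdma/ib_verbs.h>

/* Illustrative sketch only: map a receive page, then unmap it with the matching call. */
static u64 example_map_rx_page(struct ib_device *ca, struct page *page)
{
	u64 addr = ib_dma_map_page(ca, page, 0, PAGE_SIZE, DMA_FROM_DEVICE);

	return ib_dma_mapping_error(ca, addr) ? 0 : addr;
}

static void example_unmap_rx_page(struct ib_device *ca, u64 addr)
{
	/* must mirror ib_dma_map_page(), not ib_dma_unmap_single() */
	ib_dma_unmap_page(ca, addr, PAGE_SIZE, DMA_FROM_DEVICE);
}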
Signed-off-by: Dotan Barak Reviewed-by: Jack Morgenstein Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 39913a065f99..483b098f41fe 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -84,7 +84,7 @@ static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags, ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE); for (i = 0; i < frags; ++i) - ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE); + ib_dma_unmap_page(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE); } static int ipoib_cm_post_receive_srq(struct net_device *dev, int id) @@ -183,7 +183,7 @@ partial_error: ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE); for (; i > 0; --i) - ib_dma_unmap_single(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE); + ib_dma_unmap_page(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE); dev_kfree_skb_any(skb); return NULL; From 2fc109c890f7d9620cfed8d439be071a8b2a8bbd Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Fri, 23 Sep 2011 13:16:29 -0400 Subject: [PATCH 46/62] IB/qib: Optimize RC/UC code by IB operation The memset for zeroing work completions had been unconditional. This patch removes the memset and moves the zeroing into the work completion with a more explicit field by field set. With this patch, non-ONLY/non-LAST packets will avoid the overhead since they will not generate a completion. Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_rc.c | 19 +++++++++++++------ drivers/infiniband/hw/qib/qib_uc.c | 14 +++++++++++--- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index eca0c41f1226..5c1c49b299f1 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -1955,8 +1955,6 @@ void qib_rc_rcv(struct qib_ctxtdata *rcd, struct qib_ib_header *hdr, break; } - memset(&wc, 0, sizeof wc); - if (qp->state == IB_QPS_RTR && !(qp->r_flags & QIB_R_COMM_EST)) { qp->r_flags |= QIB_R_COMM_EST; if (qp->ibqp.event_handler) { @@ -2009,16 +2007,19 @@ send_middle: goto rnr_nak; qp->r_rcv_len = 0; if (opcode == OP(SEND_ONLY)) - goto send_last; - /* FALLTHROUGH */ + goto no_immediate_data; + /* FALLTHROUGH for SEND_ONLY_WITH_IMMEDIATE */ case OP(SEND_LAST_WITH_IMMEDIATE): send_last_imm: wc.ex.imm_data = ohdr->u.imm_data; hdrsize += 4; wc.wc_flags = IB_WC_WITH_IMM; - /* FALLTHROUGH */ + goto send_last; case OP(SEND_LAST): case OP(RDMA_WRITE_LAST): +no_immediate_data: + wc.wc_flags = 0; + wc.ex.imm_data = 0; send_last: /* Get the number of bytes the message was padded by. */ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; @@ -2051,6 +2052,12 @@ send_last: wc.src_qp = qp->remote_qpn; wc.slid = qp->remote_ah_attr.dlid; wc.sl = qp->remote_ah_attr.sl; + /* zero fields that are N/A */ + wc.vendor_err = 0; + wc.pkey_index = 0; + wc.dlid_path_bits = 0; + wc.port_num = 0; + wc.csum_ok = 0; /* Signal completion event if the solicited bit is set. 
*/ qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, (ohdr->bth[0] & @@ -2089,7 +2096,7 @@ send_last: if (opcode == OP(RDMA_WRITE_FIRST)) goto send_middle; else if (opcode == OP(RDMA_WRITE_ONLY)) - goto send_last; + goto no_immediate_data; ret = qib_get_rwqe(qp, 1); if (ret < 0) goto nack_op_err; diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c index 32ccf3c824ca..d31f33a99887 100644 --- a/drivers/infiniband/hw/qib/qib_uc.c +++ b/drivers/infiniband/hw/qib/qib_uc.c @@ -270,7 +270,6 @@ void qib_uc_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, psn = be32_to_cpu(ohdr->bth[2]); opcode >>= 24; - memset(&wc, 0, sizeof wc); /* Compare the PSN verses the expected PSN. */ if (unlikely(qib_cmp24(psn, qp->r_psn) != 0)) { @@ -370,7 +369,7 @@ send_first: } qp->r_rcv_len = 0; if (opcode == OP(SEND_ONLY)) - goto send_last; + goto no_immediate_data; else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE)) goto send_last_imm; /* FALLTHROUGH */ @@ -389,8 +388,11 @@ send_last_imm: wc.ex.imm_data = ohdr->u.imm_data; hdrsize += 4; wc.wc_flags = IB_WC_WITH_IMM; - /* FALLTHROUGH */ + goto send_last; case OP(SEND_LAST): +no_immediate_data: + wc.ex.imm_data = 0; + wc.wc_flags = 0; send_last: /* Get the number of bytes the message was padded by. */ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; @@ -418,6 +420,12 @@ last_imm: wc.src_qp = qp->remote_qpn; wc.slid = qp->remote_ah_attr.dlid; wc.sl = qp->remote_ah_attr.sl; + /* zero fields that are N/A */ + wc.vendor_err = 0; + wc.pkey_index = 0; + wc.dlid_path_bits = 0; + wc.port_num = 0; + wc.csum_ok = 0; /* Signal completion event if the solicited bit is set. */ qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, (ohdr->bth[0] & From cc6ea1385b43487f6ef03bdc91416c8366d28311 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Fri, 23 Sep 2011 13:16:34 -0400 Subject: [PATCH 47/62] IB/qib: Decode path MTU optimization Store both the encoded and decoded MTU in the QP structure as a minor optimization for UC/RC receive routines. Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_qp.c | 4 +++- drivers/infiniband/hw/qib/qib_rc.c | 6 +++--- drivers/infiniband/hw/qib/qib_uc.c | 4 ++-- drivers/infiniband/hw/qib/qib_verbs.h | 1 + 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c index e16751f8639e..9d094f910360 100644 --- a/drivers/infiniband/hw/qib/qib_qp.c +++ b/drivers/infiniband/hw/qib/qib_qp.c @@ -765,8 +765,10 @@ int qib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, } } - if (attr_mask & IB_QP_PATH_MTU) + if (attr_mask & IB_QP_PATH_MTU) { qp->path_mtu = pmtu; + qp->pmtu = ib_mtu_enum_to_int(pmtu); + } if (attr_mask & IB_QP_RETRY_CNT) { qp->s_retry_cnt = attr->retry_cnt; diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index 5c1c49b299f1..5d77cb8aa052 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -239,7 +239,7 @@ int qib_make_rc_req(struct qib_qp *qp) u32 len; u32 bth0; u32 bth2; - u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); + u32 pmtu = qp->pmtu; char newreq; unsigned long flags; int ret = 0; @@ -1732,7 +1732,7 @@ static int qib_rc_rcv_error(struct qib_other_headers *ohdr, * same request. 
*/ offset = ((psn - e->psn) & QIB_PSN_MASK) * - ib_mtu_enum_to_int(qp->path_mtu); + qp->pmtu; len = be32_to_cpu(reth->length); if (unlikely(offset + len != e->rdma_sge.sge_length)) goto unlock_done; @@ -1876,7 +1876,7 @@ void qib_rc_rcv(struct qib_ctxtdata *rcd, struct qib_ib_header *hdr, u32 psn; u32 pad; struct ib_wc wc; - u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); + u32 pmtu = qp->pmtu; int diff; struct ib_reth *reth; unsigned long flags; diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c index d31f33a99887..9a4630f8276f 100644 --- a/drivers/infiniband/hw/qib/qib_uc.c +++ b/drivers/infiniband/hw/qib/qib_uc.c @@ -51,7 +51,7 @@ int qib_make_uc_req(struct qib_qp *qp) u32 hwords; u32 bth0; u32 len; - u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); + u32 pmtu = qp->pmtu; int ret = 0; spin_lock_irqsave(&qp->s_lock, flags); @@ -249,7 +249,7 @@ void qib_uc_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, u32 psn; u32 pad; struct ib_wc wc; - u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); + u32 pmtu = qp->pmtu; struct ib_reth *reth; int ret; diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index 95e5b47223b3..ec3711f743c5 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -485,6 +485,7 @@ struct qib_qp { u8 alt_timeout; /* Alternate path timeout for this QP */ u8 port_num; enum ib_mtu path_mtu; + u32 pmtu; /* decoded from path_mtu */ u32 remote_qpn; u32 qkey; /* QKEY for this QP (for UD or RD) */ u32 s_size; /* send work queue size */ From 9e1c0e43257b6df1ef012dd37c3f0f93b1ee47af Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Fri, 23 Sep 2011 13:16:39 -0400 Subject: [PATCH 48/62] IB/qib: Eliminate divide/mod in converting idx to egr buf pointer The context init now saves a shift from rcvegrbufs_perchunk rcvegrbufs_perchunk_shift using ilog2. A BUG_ON() protects the power of 2 assumption. 
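The arithmetic behind the change, as a hedged sketch (illustrative helper; the real conversion is qib_get_egrbuf() in the hunk below): with a power-of-two bufs-per-chunk count, the divide and modulo collapse into a shift and a mask.

/* Illustrative sketch only: index -> (chunk, offset) without divide/modulo. */
static inline void example_egr_index(u32 etail, u32 perchunk_shift,
				     u32 *chunk, u32 *idx)
{
	*chunk = etail >> perchunk_shift;		/* was etail / perchunk */
	*idx = etail & ((1U << perchunk_shift) - 1);	/* was etail % perchunk */
}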
Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib.h | 8 ++++++-- drivers/infiniband/hw/qib/qib_driver.c | 6 +++--- drivers/infiniband/hw/qib/qib_iba6120.c | 2 ++ drivers/infiniband/hw/qib/qib_iba7220.c | 2 ++ drivers/infiniband/hw/qib/qib_iba7322.c | 2 ++ drivers/infiniband/hw/qib/qib_init.c | 3 +++ 6 files changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h index ee993e725d38..97e623383e1a 100644 --- a/drivers/infiniband/hw/qib/qib.h +++ b/drivers/infiniband/hw/qib/qib.h @@ -171,7 +171,9 @@ struct qib_ctxtdata { /* how many alloc_pages() chunks in rcvegrbuf_pages */ u32 rcvegrbuf_chunks; /* how many egrbufs per chunk */ - u32 rcvegrbufs_perchunk; + u16 rcvegrbufs_perchunk; + /* ilog2 of above */ + u16 rcvegrbufs_perchunk_shift; /* order for rcvegrbuf_pages */ size_t rcvegrbuf_size; /* rcvhdrq size (for freeing) */ @@ -940,7 +942,9 @@ struct qib_devdata { /* chip address space used by 4k pio buffers */ u32 align4k; /* size of each rcvegrbuffer */ - u32 rcvegrbufsize; + u16 rcvegrbufsize; + /* log2 of above */ + u16 rcvegrbufsize_shift; /* localbus width (1, 2,4,8,16,32) from config space */ u32 lbus_width; /* localbus speed in MHz */ diff --git a/drivers/infiniband/hw/qib/qib_driver.c b/drivers/infiniband/hw/qib/qib_driver.c index 23e584f4c36c..89264ffc7ee9 100644 --- a/drivers/infiniband/hw/qib/qib_driver.c +++ b/drivers/infiniband/hw/qib/qib_driver.c @@ -279,10 +279,10 @@ bail: */ static inline void *qib_get_egrbuf(const struct qib_ctxtdata *rcd, u32 etail) { - const u32 chunk = etail / rcd->rcvegrbufs_perchunk; - const u32 idx = etail % rcd->rcvegrbufs_perchunk; + const u32 chunk = etail >> rcd->rcvegrbufs_perchunk_shift; + const u32 idx = etail & ((u32)rcd->rcvegrbufs_perchunk - 1); - return rcd->rcvegrbuf[chunk] + idx * rcd->dd->rcvegrbufsize; + return rcd->rcvegrbuf[chunk] + (idx << rcd->dd->rcvegrbufsize_shift); } /* diff --git a/drivers/infiniband/hw/qib/qib_iba6120.c b/drivers/infiniband/hw/qib/qib_iba6120.c index d8ca0a0b970d..781a802a321f 100644 --- a/drivers/infiniband/hw/qib/qib_iba6120.c +++ b/drivers/infiniband/hw/qib/qib_iba6120.c @@ -3273,6 +3273,8 @@ static int init_6120_variables(struct qib_devdata *dd) /* we always allocate at least 2048 bytes for eager buffers */ ret = ib_mtu_enum_to_int(qib_ibmtu); dd->rcvegrbufsize = ret != -1 ? max(ret, 2048) : QIB_DEFAULT_MTU; + BUG_ON(!is_power_of_2(dd->rcvegrbufsize)); + dd->rcvegrbufsize_shift = ilog2(dd->rcvegrbufsize); qib_6120_tidtemplate(dd); diff --git a/drivers/infiniband/hw/qib/qib_iba7220.c b/drivers/infiniband/hw/qib/qib_iba7220.c index e1f947446c2a..3f1d562ba898 100644 --- a/drivers/infiniband/hw/qib/qib_iba7220.c +++ b/drivers/infiniband/hw/qib/qib_iba7220.c @@ -4085,6 +4085,8 @@ static int qib_init_7220_variables(struct qib_devdata *dd) /* we always allocate at least 2048 bytes for eager buffers */ ret = ib_mtu_enum_to_int(qib_ibmtu); dd->rcvegrbufsize = ret != -1 ? 
max(ret, 2048) : QIB_DEFAULT_MTU; + BUG_ON(!is_power_of_2(dd->rcvegrbufsize)); + dd->rcvegrbufsize_shift = ilog2(dd->rcvegrbufsize); qib_7220_tidtemplate(dd); diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index 5ea9ece23b33..f3f4b55262c2 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -6205,6 +6205,8 @@ static int qib_init_7322_variables(struct qib_devdata *dd) /* we always allocate at least 2048 bytes for eager buffers */ dd->rcvegrbufsize = max(mtu, 2048); + BUG_ON(!is_power_of_2(dd->rcvegrbufsize)); + dd->rcvegrbufsize_shift = ilog2(dd->rcvegrbufsize); qib_7322_tidtemplate(dd); diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index 021636dbeae6..21ffa7c0915f 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -183,6 +183,9 @@ struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *ppd, u32 ctxt) rcd->rcvegrbuf_chunks = (rcd->rcvegrcnt + rcd->rcvegrbufs_perchunk - 1) / rcd->rcvegrbufs_perchunk; + BUG_ON(!is_power_of_2(rcd->rcvegrbufs_perchunk)); + rcd->rcvegrbufs_perchunk_shift = + ilog2(rcd->rcvegrbufs_perchunk); } return rcd; } From af061a644a0e4d4778fe6cd2246479c1962e153b Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Fri, 23 Sep 2011 13:16:44 -0400 Subject: [PATCH 49/62] IB/qib: Use RCU for qpn lookup The heavy weight spinlock in qib_lookup_qpn() is replaced with RCU. The hash list itself is now accessed via jhash functions instead of mod. The changes should benefit multiple receive contexts in different processors by not contending for the lock just to read the hash structures. The patch also adds a lookaside_qp (pointer) and a lookaside_qpn in the context. The interrupt handler will test the current packet's qpn against lookaside_qpn if the lookaside_qp pointer is non-NULL. The pointer is NULL'ed when the interrupt handler exits. Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib.h | 3 + drivers/infiniband/hw/qib/qib_driver.c | 9 +++ drivers/infiniband/hw/qib/qib_qp.c | 87 ++++++++++++++++---------- drivers/infiniband/hw/qib/qib_verbs.c | 36 +++++++---- drivers/infiniband/hw/qib/qib_verbs.h | 3 +- 5 files changed, 92 insertions(+), 46 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h index 97e623383e1a..b881bdc401f5 100644 --- a/drivers/infiniband/hw/qib/qib.h +++ b/drivers/infiniband/hw/qib/qib.h @@ -223,6 +223,9 @@ struct qib_ctxtdata { /* ctxt rcvhdrq head offset */ u32 head; u32 pkt_count; + /* lookaside fields */ + struct qib_qp *lookaside_qp; + u32 lookaside_qpn; /* QPs waiting for context processing */ struct list_head qp_wait_list; }; diff --git a/drivers/infiniband/hw/qib/qib_driver.c b/drivers/infiniband/hw/qib/qib_driver.c index 89264ffc7ee9..d35c9d38ceee 100644 --- a/drivers/infiniband/hw/qib/qib_driver.c +++ b/drivers/infiniband/hw/qib/qib_driver.c @@ -547,6 +547,15 @@ move_along: updegr = 0; } } + /* + * Notify qib_destroy_qp() if it is waiting + * for lookaside_qp to finish. 
+ */ + if (rcd->lookaside_qp) { + if (atomic_dec_and_test(&rcd->lookaside_qp->refcount)) + wake_up(&rcd->lookaside_qp->wait); + rcd->lookaside_qp = NULL; + } rcd->head = l; rcd->pkt_count += i; diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c index 9d094f910360..500981bce9c0 100644 --- a/drivers/infiniband/hw/qib/qib_qp.c +++ b/drivers/infiniband/hw/qib/qib_qp.c @@ -34,6 +34,7 @@ #include #include +#include #include "qib.h" @@ -204,6 +205,13 @@ static void free_qpn(struct qib_qpn_table *qpt, u32 qpn) clear_bit(qpn & BITS_PER_PAGE_MASK, map->page); } +static inline unsigned qpn_hash(struct qib_ibdev *dev, u32 qpn) +{ + return jhash_1word(qpn, dev->qp_rnd) & + (dev->qp_table_size - 1); +} + + /* * Put the QP into the hash table. * The hash table holds a reference to the QP. @@ -211,22 +219,23 @@ static void free_qpn(struct qib_qpn_table *qpt, u32 qpn) static void insert_qp(struct qib_ibdev *dev, struct qib_qp *qp) { struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); - unsigned n = qp->ibqp.qp_num % dev->qp_table_size; unsigned long flags; + unsigned n = qpn_hash(dev, qp->ibqp.qp_num); spin_lock_irqsave(&dev->qpt_lock, flags); - - if (qp->ibqp.qp_num == 0) - ibp->qp0 = qp; - else if (qp->ibqp.qp_num == 1) - ibp->qp1 = qp; - else { - qp->next = dev->qp_table[n]; - dev->qp_table[n] = qp; - } atomic_inc(&qp->refcount); + if (qp->ibqp.qp_num == 0) + rcu_assign_pointer(ibp->qp0, qp); + else if (qp->ibqp.qp_num == 1) + rcu_assign_pointer(ibp->qp1, qp); + else { + qp->next = dev->qp_table[n]; + rcu_assign_pointer(dev->qp_table[n], qp); + } + spin_unlock_irqrestore(&dev->qpt_lock, flags); + synchronize_rcu(); } /* @@ -236,29 +245,32 @@ static void insert_qp(struct qib_ibdev *dev, struct qib_qp *qp) static void remove_qp(struct qib_ibdev *dev, struct qib_qp *qp) { struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); - struct qib_qp *q, **qpp; + unsigned n = qpn_hash(dev, qp->ibqp.qp_num); unsigned long flags; - qpp = &dev->qp_table[qp->ibqp.qp_num % dev->qp_table_size]; - spin_lock_irqsave(&dev->qpt_lock, flags); if (ibp->qp0 == qp) { - ibp->qp0 = NULL; atomic_dec(&qp->refcount); + rcu_assign_pointer(ibp->qp0, NULL); } else if (ibp->qp1 == qp) { - ibp->qp1 = NULL; atomic_dec(&qp->refcount); - } else + rcu_assign_pointer(ibp->qp1, NULL); + } else { + struct qib_qp *q, **qpp; + + qpp = &dev->qp_table[n]; for (; (q = *qpp) != NULL; qpp = &q->next) if (q == qp) { - *qpp = qp->next; - qp->next = NULL; atomic_dec(&qp->refcount); + rcu_assign_pointer(*qpp, qp->next); + qp->next = NULL; break; } + } spin_unlock_irqrestore(&dev->qpt_lock, flags); + synchronize_rcu(); } /** @@ -280,21 +292,24 @@ unsigned qib_free_all_qps(struct qib_devdata *dd) if (!qib_mcast_tree_empty(ibp)) qp_inuse++; - if (ibp->qp0) + rcu_read_lock(); + if (rcu_dereference(ibp->qp0)) qp_inuse++; - if (ibp->qp1) + if (rcu_dereference(ibp->qp1)) qp_inuse++; + rcu_read_unlock(); } spin_lock_irqsave(&dev->qpt_lock, flags); for (n = 0; n < dev->qp_table_size; n++) { qp = dev->qp_table[n]; - dev->qp_table[n] = NULL; + rcu_assign_pointer(dev->qp_table[n], NULL); for (; qp; qp = qp->next) qp_inuse++; } spin_unlock_irqrestore(&dev->qpt_lock, flags); + synchronize_rcu(); return qp_inuse; } @@ -309,25 +324,28 @@ unsigned qib_free_all_qps(struct qib_devdata *dd) */ struct qib_qp *qib_lookup_qpn(struct qib_ibport *ibp, u32 qpn) { - struct qib_ibdev *dev = &ppd_from_ibp(ibp)->dd->verbs_dev; - unsigned long flags; - struct qib_qp *qp; + struct qib_qp *qp = NULL; - 
spin_lock_irqsave(&dev->qpt_lock, flags); + if (unlikely(qpn <= 1)) { + rcu_read_lock(); + if (qpn == 0) + qp = rcu_dereference(ibp->qp0); + else + qp = rcu_dereference(ibp->qp1); + } else { + struct qib_ibdev *dev = &ppd_from_ibp(ibp)->dd->verbs_dev; + unsigned n = qpn_hash(dev, qpn); - if (qpn == 0) - qp = ibp->qp0; - else if (qpn == 1) - qp = ibp->qp1; - else - for (qp = dev->qp_table[qpn % dev->qp_table_size]; qp; - qp = qp->next) + rcu_read_lock(); + for (qp = dev->qp_table[n]; rcu_dereference(qp); qp = qp->next) if (qp->ibqp.qp_num == qpn) break; + } if (qp) - atomic_inc(&qp->refcount); + if (unlikely(!atomic_inc_not_zero(&qp->refcount))) + qp = NULL; - spin_unlock_irqrestore(&dev->qpt_lock, flags); + rcu_read_unlock(); return qp; } @@ -1015,6 +1033,7 @@ struct ib_qp *qib_create_qp(struct ib_pd *ibpd, ret = ERR_PTR(-ENOMEM); goto bail_swq; } + RCU_INIT_POINTER(qp->next, NULL); if (init_attr->srq) sz = 0; else { diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index 9fab40488850..9627cb737125 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -38,11 +38,12 @@ #include #include #include +#include #include "qib.h" #include "qib_common.h" -static unsigned int ib_qib_qp_table_size = 251; +static unsigned int ib_qib_qp_table_size = 256; module_param_named(qp_table_size, ib_qib_qp_table_size, uint, S_IRUGO); MODULE_PARM_DESC(qp_table_size, "QP table size"); @@ -659,17 +660,25 @@ void qib_ib_rcv(struct qib_ctxtdata *rcd, void *rhdr, void *data, u32 tlen) if (atomic_dec_return(&mcast->refcount) <= 1) wake_up(&mcast->wait); } else { - qp = qib_lookup_qpn(ibp, qp_num); - if (!qp) - goto drop; + if (rcd->lookaside_qp) { + if (rcd->lookaside_qpn != qp_num) { + if (atomic_dec_and_test( + &rcd->lookaside_qp->refcount)) + wake_up( + &rcd->lookaside_qp->wait); + rcd->lookaside_qp = NULL; + } + } + if (!rcd->lookaside_qp) { + qp = qib_lookup_qpn(ibp, qp_num); + if (!qp) + goto drop; + rcd->lookaside_qp = qp; + rcd->lookaside_qpn = qp_num; + } else + qp = rcd->lookaside_qp; ibp->n_unicast_rcv++; qib_qp_rcv(rcd, hdr, lnh == QIB_LRH_GRH, data, tlen, qp); - /* - * Notify qib_destroy_qp() if it is waiting - * for us to finish. - */ - if (atomic_dec_and_test(&qp->refcount)) - wake_up(&qp->wait); } return; @@ -1974,6 +1983,8 @@ static void init_ibport(struct qib_pportdata *ppd) ibp->z_excessive_buffer_overrun_errors = cntrs.excessive_buffer_overrun_errors; ibp->z_vl15_dropped = cntrs.vl15_dropped; + RCU_INIT_POINTER(ibp->qp0, NULL); + RCU_INIT_POINTER(ibp->qp1, NULL); } /** @@ -1990,12 +2001,15 @@ int qib_register_ib_device(struct qib_devdata *dd) int ret; dev->qp_table_size = ib_qib_qp_table_size; - dev->qp_table = kzalloc(dev->qp_table_size * sizeof *dev->qp_table, + get_random_bytes(&dev->qp_rnd, sizeof(dev->qp_rnd)); + dev->qp_table = kmalloc(dev->qp_table_size * sizeof *dev->qp_table, GFP_KERNEL); if (!dev->qp_table) { ret = -ENOMEM; goto err_qpt; } + for (i = 0; i < dev->qp_table_size; i++) + RCU_INIT_POINTER(dev->qp_table[i], NULL); for (i = 0; i < dd->num_pports; i++) init_ibport(ppd + i); diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index ec3711f743c5..d7b6109528a4 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -724,7 +724,8 @@ struct qib_ibdev { dma_addr_t pio_hdrs_phys; /* list of QPs waiting for RNR timer */ spinlock_t pending_lock; /* protect wait lists, PMA counters, etc. 
*/ - unsigned qp_table_size; /* size of the hash table */ + u32 qp_table_size; /* size of the hash table */ + u32 qp_rnd; /* random bytes for hash */ spinlock_t qpt_lock; u32 n_piowait; From d0f2faf72d51dacf5c5e8dec7dca22d0395896e2 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Fri, 23 Sep 2011 13:16:49 -0400 Subject: [PATCH 50/62] IB/qib: Precompute timeout jiffies to optimize latency A new field is added to qib_qp called timeout_jiffies. It is initialized upon create and modify. The field is now used instead of a computation based on qp->timeout. Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_qp.c | 9 ++++++++- drivers/infiniband/hw/qib/qib_rc.c | 7 ++----- drivers/infiniband/hw/qib/qib_verbs.h | 1 + 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c index 500981bce9c0..7e7e16fbee99 100644 --- a/drivers/infiniband/hw/qib/qib_qp.c +++ b/drivers/infiniband/hw/qib/qib_qp.c @@ -801,8 +801,12 @@ int qib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (attr_mask & IB_QP_MIN_RNR_TIMER) qp->r_min_rnr_timer = attr->min_rnr_timer; - if (attr_mask & IB_QP_TIMEOUT) + if (attr_mask & IB_QP_TIMEOUT) { qp->timeout = attr->timeout; + qp->timeout_jiffies = + usecs_to_jiffies((4096UL * (1UL << qp->timeout)) / + 1000UL); + } if (attr_mask & IB_QP_QKEY) qp->qkey = attr->qkey; @@ -1034,6 +1038,9 @@ struct ib_qp *qib_create_qp(struct ib_pd *ibpd, goto bail_swq; } RCU_INIT_POINTER(qp->next, NULL); + qp->timeout_jiffies = + usecs_to_jiffies((4096UL * (1UL << qp->timeout)) / + 1000UL); if (init_attr->srq) sz = 0; else { diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index 5d77cb8aa052..ecfa087ad45e 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -59,8 +59,7 @@ static void start_timer(struct qib_qp *qp) qp->s_flags |= QIB_S_TIMER; qp->s_timer.function = rc_timeout; /* 4.096 usec. * (1 << qp->timeout) */ - qp->s_timer.expires = jiffies + - usecs_to_jiffies((4096UL * (1UL << qp->timeout)) / 1000UL); + qp->s_timer.expires = jiffies + qp->timeout_jiffies; add_timer(&qp->s_timer); } @@ -1519,9 +1518,7 @@ read_middle: * 4.096 usec. * (1 << qp->timeout) */ qp->s_flags |= QIB_S_TIMER; - mod_timer(&qp->s_timer, jiffies + - usecs_to_jiffies((4096UL * (1UL << qp->timeout)) / - 1000UL)); + mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies); if (qp->s_flags & QIB_S_WAIT_ACK) { qp->s_flags &= ~QIB_S_WAIT_ACK; qib_schedule_send(qp); diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index d7b6109528a4..0c19ef0c4123 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -496,6 +496,7 @@ struct qib_qp { u32 s_last; /* last completed entry */ u32 s_ssn; /* SSN of tail entry */ u32 s_lsn; /* limit sequence number (credit) */ + unsigned long timeout_jiffies; /* computed from timeout */ struct qib_swqe *s_wq; /* send work queue */ struct qib_swqe *s_wqe; struct qib_rq r_rq; /* receive work queue */ From 9fd5473deb421eb7e5575a5f9d7e43ca67c04fe9 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Fri, 23 Sep 2011 13:17:00 -0400 Subject: [PATCH 51/62] IB/qib: Remove s_lock around header validation Review of qib_ruc_check_hdr() shows that the s_lock is not required in the normal case. The r_lock is held in all cases, and protects the qp fields that are read. 
The s_lock will be needed to around the call to qib_migrate_qp() to insure that the send engine sees a consistent set of fields. Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_driver.c | 5 ----- drivers/infiniband/hw/qib/qib_rc.c | 4 +--- drivers/infiniband/hw/qib/qib_ruc.c | 7 ++++++- drivers/infiniband/hw/qib/qib_uc.c | 7 +------ 4 files changed, 8 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_driver.c b/drivers/infiniband/hw/qib/qib_driver.c index d35c9d38ceee..9a9047f385ae 100644 --- a/drivers/infiniband/hw/qib/qib_driver.c +++ b/drivers/infiniband/hw/qib/qib_driver.c @@ -310,7 +310,6 @@ static u32 qib_rcv_hdrerr(struct qib_ctxtdata *rcd, struct qib_pportdata *ppd, u32 opcode; u32 psn; int diff; - unsigned long flags; /* Sanity check packet */ if (tlen < 24) @@ -365,7 +364,6 @@ static u32 qib_rcv_hdrerr(struct qib_ctxtdata *rcd, struct qib_pportdata *ppd, switch (qp->ibqp.qp_type) { case IB_QPT_RC: - spin_lock_irqsave(&qp->s_lock, flags); ruc_res = qib_ruc_check_hdr( ibp, hdr, @@ -373,11 +371,8 @@ static u32 qib_rcv_hdrerr(struct qib_ctxtdata *rcd, struct qib_pportdata *ppd, qp, be32_to_cpu(ohdr->bth[0])); if (ruc_res) { - spin_unlock_irqrestore(&qp->s_lock, - flags); goto unlock; } - spin_unlock_irqrestore(&qp->s_lock, flags); /* Only deal with RDMA Writes for now */ if (opcode < diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index ecfa087ad45e..afaf4ac79f42 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -1889,10 +1889,8 @@ void qib_rc_rcv(struct qib_ctxtdata *rcd, struct qib_ib_header *hdr, } opcode = be32_to_cpu(ohdr->bth[0]); - spin_lock_irqsave(&qp->s_lock, flags); if (qib_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode)) - goto sunlock; - spin_unlock_irqrestore(&qp->s_lock, flags); + return; psn = be32_to_cpu(ohdr->bth[2]); opcode >>= 24; diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c index eb78d9367f06..b4b37e47321a 100644 --- a/drivers/infiniband/hw/qib/qib_ruc.c +++ b/drivers/infiniband/hw/qib/qib_ruc.c @@ -260,12 +260,15 @@ static int gid_ok(union ib_gid *gid, __be64 gid_prefix, __be64 id) /* * - * This should be called with the QP s_lock held. + * This should be called with the QP r_lock held. + * + * The s_lock will be acquired around the qib_migrate_qp() call. 
*/ int qib_ruc_check_hdr(struct qib_ibport *ibp, struct qib_ib_header *hdr, int has_grh, struct qib_qp *qp, u32 bth0) { __be64 guid; + unsigned long flags; if (qp->s_mig_state == IB_MIG_ARMED && (bth0 & IB_BTH_MIG_REQ)) { if (!has_grh) { @@ -295,7 +298,9 @@ int qib_ruc_check_hdr(struct qib_ibport *ibp, struct qib_ib_header *hdr, if (be16_to_cpu(hdr->lrh[3]) != qp->alt_ah_attr.dlid || ppd_from_ibp(ibp)->port != qp->alt_ah_attr.port_num) goto err; + spin_lock_irqsave(&qp->s_lock, flags); qib_migrate_qp(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); } else { if (!has_grh) { if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c index 9a4630f8276f..847e7afdfd94 100644 --- a/drivers/infiniband/hw/qib/qib_uc.c +++ b/drivers/infiniband/hw/qib/qib_uc.c @@ -243,7 +243,6 @@ void qib_uc_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, int has_grh, void *data, u32 tlen, struct qib_qp *qp) { struct qib_other_headers *ohdr; - unsigned long flags; u32 opcode; u32 hdrsize; u32 psn; @@ -263,10 +262,8 @@ void qib_uc_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, } opcode = be32_to_cpu(ohdr->bth[0]); - spin_lock_irqsave(&qp->s_lock, flags); if (qib_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode)) - goto sunlock; - spin_unlock_irqrestore(&qp->s_lock, flags); + return; psn = be32_to_cpu(ohdr->bth[2]); opcode >>= 24; @@ -554,6 +551,4 @@ op_err: qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR); return; -sunlock: - spin_unlock_irqrestore(&qp->s_lock, flags); } From 44d75d3d92304a1df8131f48b38de08df9011fa2 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Wed, 19 Oct 2011 16:42:23 -0400 Subject: [PATCH 52/62] IB/qib: Clean up checkpatch issue This was probably present from initial submission. Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_iba7322.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index f3f4b55262c2..708f4fea6afa 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -2853,9 +2853,8 @@ static irqreturn_t qib_7322intr(int irq, void *data) for (i = 0; i < dd->first_user_ctxt; i++) { if (ctxtrbits & rmask) { ctxtrbits &= ~rmask; - if (dd->rcd[i]) { + if (dd->rcd[i]) qib_kreceive(dd->rcd[i], NULL, &npkts); - } } rmask <<= 1; } From dde05cbdf8b1c404344c370fe6e18ff160d6da6a Mon Sep 17 00:00:00 2001 From: Mitko Haralanov Date: Wed, 19 Oct 2011 18:46:40 -0400 Subject: [PATCH 53/62] IB/qib: Hold links until tuning data is available Hold the link state machine until the tuning data is read from the QSFP EEPROM so correct tuning settings are applied before the state machine attempts to bring the link up. Link is also held on cable unplug in case a different cable is used. 
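The underlying pattern is to keep the link state machine disabled and defer bring-up to a work item that first reads and applies the QSFP tuning data. A rough sketch of that pattern, using hypothetical port and link helpers rather than the qib_iba7322 routines:

#include <linux/types.h>
#include <linux/workqueue.h>

struct qsfp_port;                                            /* hypothetical */
extern bool qsfp_module_present(struct qsfp_port *p);        /* hypothetical */
extern void qsfp_read_eeprom_and_tune(struct qsfp_port *p);  /* hypothetical */
extern void port_link_disable(struct qsfp_port *p);          /* hypothetical */
extern void port_link_enable(struct qsfp_port *p);           /* hypothetical */

struct qsfp_state {
        struct work_struct work;
        struct qsfp_port *port;
};

static void qsfp_refresh_work(struct work_struct *work)
{
        struct qsfp_state *qs = container_of(work, struct qsfp_state, work);

        if (!qsfp_module_present(qs->port)) {
                port_link_disable(qs->port);     /* no cable: keep link held */
                return;
        }
        qsfp_read_eeprom_and_tune(qs->port);     /* apply tuning data first */
        port_link_enable(qs->port);              /* only now let the link train */
}

static void qsfp_state_init(struct qsfp_state *qs, struct qsfp_port *port)
{
        qs->port = port;
        INIT_WORK(&qs->work, qsfp_refresh_work);
}

/* On probe or a cable-plug interrupt: hold the link and defer to the work. */
static void qsfp_plug_event(struct qsfp_state *qs)
{
        port_link_disable(qs->port);
        schedule_work(&qs->work);
}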
Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_iba7322.c | 120 +++++++++++++++--------- drivers/infiniband/hw/qib/qib_init.c | 4 - drivers/infiniband/hw/qib/qib_qsfp.c | 25 +++-- drivers/infiniband/hw/qib/qib_qsfp.h | 2 + 4 files changed, 94 insertions(+), 57 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index 708f4fea6afa..86575fbcaef7 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -2381,17 +2381,17 @@ static int qib_7322_bringup_serdes(struct qib_pportdata *ppd) ppd->cpspec->ibcctrl_a |= SYM_MASK(IBCCtrlA_0, IBLinkEn); set_vls(ppd); + /* Hold the link state machine for mezz boards */ + qib_set_ib_7322_lstate(ppd, 0, + QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); + + /* be paranoid against later code motion, etc. */ spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags); ppd->p_rcvctrl |= SYM_MASK(RcvCtrl_0, RcvIBPortEnable); qib_write_kreg_port(ppd, krp_rcvctrl, ppd->p_rcvctrl); spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags); - /* Hold the link state machine for mezz boards */ - if (IS_QMH(dd) || IS_QME(dd)) - qib_set_ib_7322_lstate(ppd, 0, - QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); - /* Also enable IBSTATUSCHG interrupt. */ val = qib_read_kreg_port(ppd, krp_errmask); qib_write_kreg_port(ppd, krp_errmask, @@ -5229,6 +5229,8 @@ static int qib_7322_ib_updown(struct qib_pportdata *ppd, int ibup, u64 ibcs) QIBL_IB_AUTONEG_INPROG))) set_7322_ibspeed_fast(ppd, ppd->link_speed_enabled); if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) { + struct qib_qsfp_data *qd = + &ppd->cpspec->qsfp_data; /* unlock the Tx settings, speed may change */ qib_write_kreg_port(ppd, krp_tx_deemph_override, SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, @@ -5236,6 +5238,12 @@ static int qib_7322_ib_updown(struct qib_pportdata *ppd, int ibup, u64 ibcs) qib_cancel_sends(ppd); /* on link down, ensure sane pcs state */ qib_7322_mini_pcs_reset(ppd); + /* schedule the qsfp refresh which should turn the link + off */ + if (ppd->dd->flags & QIB_HAS_QSFP) { + qd->t_insert = get_jiffies_64(); + schedule_work(&qd->work); + } spin_lock_irqsave(&ppd->sdma_lock, flags); if (__qib_sdma_running(ppd)) __qib_sdma_process_event(ppd, @@ -5591,38 +5599,62 @@ static void qsfp_7322_event(struct work_struct *work) qd = container_of(work, struct qib_qsfp_data, work); ppd = qd->ppd; - pwrup = qd->t_insert + msecs_to_jiffies(QSFP_PWR_LAG_MSEC); + pwrup = qd->t_insert + + msecs_to_jiffies(QSFP_PWR_LAG_MSEC - QSFP_MODPRS_LAG_MSEC); - /* - * Some QSFP's not only do not respond until the full power-up - * time, but may behave badly if we try. So hold off responding - * to insertion. - */ - while (1) { - u64 now = get_jiffies_64(); - if (time_after64(now, pwrup)) - break; - msleep(20); - } - ret = qib_refresh_qsfp_cache(ppd, &qd->cache); - /* - * Need to change LE2 back to defaults if we couldn't - * read the cable type (to handle cable swaps), so do this - * even on failure to read cable information. We don't - * get here for QME, so IS_QME check not needed here. 
- */ - if (!ret && !ppd->dd->cspec->r1) { - if (QSFP_IS_ACTIVE_FAR(qd->cache.tech)) - le2 = LE2_QME; - else if (qd->cache.atten[1] >= qib_long_atten && - QSFP_IS_CU(qd->cache.tech)) - le2 = LE2_5m; - else + /* Delay for 20 msecs to allow ModPrs resistor to setup */ + mdelay(QSFP_MODPRS_LAG_MSEC); + + if (!qib_qsfp_mod_present(ppd)) + /* Set the physical link to disabled */ + qib_set_ib_7322_lstate(ppd, 0, + QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); + else { + /* + * Some QSFP's not only do not respond until the full power-up + * time, but may behave badly if we try. So hold off responding + * to insertion. + */ + while (1) { + u64 now = get_jiffies_64(); + if (time_after64(now, pwrup)) + break; + msleep(20); + } + + ret = qib_refresh_qsfp_cache(ppd, &qd->cache); + + /* + * Need to change LE2 back to defaults if we couldn't + * read the cable type (to handle cable swaps), so do this + * even on failure to read cable information. We don't + * get here for QME, so IS_QME check not needed here. + */ + if (!ret && !ppd->dd->cspec->r1) { + if (QSFP_IS_ACTIVE_FAR(qd->cache.tech)) + le2 = LE2_QME; + else if (qd->cache.atten[1] >= qib_long_atten && + QSFP_IS_CU(qd->cache.tech)) + le2 = LE2_5m; + else + le2 = LE2_DEFAULT; + } else le2 = LE2_DEFAULT; - } else - le2 = LE2_DEFAULT; - ibsd_wr_allchans(ppd, 13, (le2 << 7), BMASK(9, 7)); - init_txdds_table(ppd, 0); + ibsd_wr_allchans(ppd, 13, (le2 << 7), BMASK(9, 7)); + /* + * We always change parameteters, since we can choose + * values for cables without eeproms, and the cable may have + * changed from a cable with full or partial eeprom content + * to one with partial or no content. + */ + init_txdds_table(ppd, 0); + /* The physical link is being re-enabled only when the + previous state was DISABLED. This should only happen when + the cable has been physically pulled. */ + if (ppd->lflags & QIBL_IB_LINK_DISABLED) + qib_set_ib_7322_lstate(ppd, 0, + QLOGIC_IB_IBCC_LINKINITCMD_SLEEP); + } } /* @@ -5726,7 +5758,8 @@ static void set_no_qsfp_atten(struct qib_devdata *dd, int change) /* now change the IBC and serdes, overriding generic */ init_txdds_table(ppd, 1); /* Re-enable the physical state machine on mezz boards - * now that the correct settings have been set. */ + * now that the correct settings have been set. + * QSFP boards are handles by the QSFP event handler */ if (IS_QMH(dd) || IS_QME(dd)) qib_set_ib_7322_lstate(ppd, 0, QLOGIC_IB_IBCC_LINKINITCMD_SLEEP); @@ -7148,7 +7181,8 @@ static void find_best_ent(struct qib_pportdata *ppd, } } - /* Lookup serdes setting by cable type and attenuation */ + /* Active cables don't have attenuation so we only set SERDES + * settings to account for the attenuation of the board traces. */ if (!override && QSFP_IS_ACTIVE(qd->tech)) { *sdr_dds = txdds_sdr + ppd->dd->board_atten; *ddr_dds = txdds_ddr + ppd->dd->board_atten; @@ -7465,12 +7499,6 @@ static int serdes_7322_init_new(struct qib_pportdata *ppd) u32 le_val, rxcaldone; int chan, chan_done = (1 << SERDES_CHANS) - 1; - /* - * Initialize the Tx DDS tables. Also done every QSFP event, - * for adapters with QSFP - */ - init_txdds_table(ppd, 0); - /* Clear cmode-override, may be set from older driver */ ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 0 << 14, 1 << 14); @@ -7656,6 +7684,12 @@ static int serdes_7322_init_new(struct qib_pportdata *ppd) /* VGA output common mode */ ibsd_wr_allchans(ppd, 12, (3 << 2), BMASK(3, 2)); + /* + * Initialize the Tx DDS tables. 
Also done every QSFP event, + * for adapters with QSFP + */ + init_txdds_table(ppd, 0); + return 0; } diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index 21ffa7c0915f..b093a0b53b2f 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -585,10 +585,6 @@ int qib_init(struct qib_devdata *dd, int reinit) continue; } - /* let link come up, and enable IBC */ - spin_lock_irqsave(&ppd->lflags_lock, flags); - ppd->lflags &= ~QIBL_IB_LINK_DISABLED; - spin_unlock_irqrestore(&ppd->lflags_lock, flags); portok++; } diff --git a/drivers/infiniband/hw/qib/qib_qsfp.c b/drivers/infiniband/hw/qib/qib_qsfp.c index 3374a52232c1..e06c4ed383f1 100644 --- a/drivers/infiniband/hw/qib/qib_qsfp.c +++ b/drivers/infiniband/hw/qib/qib_qsfp.c @@ -273,18 +273,12 @@ int qib_refresh_qsfp_cache(struct qib_pportdata *ppd, struct qib_qsfp_cache *cp) int ret; int idx; u16 cks; - u32 mask; u8 peek[4]; /* ensure sane contents on invalid reads, for cable swaps */ memset(cp, 0, sizeof(*cp)); - mask = QSFP_GPIO_MOD_PRS_N; - if (ppd->hw_pidx) - mask <<= QSFP_GPIO_PORT2_SHIFT; - - ret = ppd->dd->f_gpio_mod(ppd->dd, 0, 0, 0); - if (ret & mask) { + if (!qib_qsfp_mod_present(ppd)) { ret = -ENODEV; goto bail; } @@ -444,6 +438,19 @@ const char * const qib_qsfp_devtech[16] = { static const char *pwr_codes = "1.5W2.0W2.5W3.5W"; +int qib_qsfp_mod_present(struct qib_pportdata *ppd) +{ + u32 mask; + int ret; + + mask = QSFP_GPIO_MOD_PRS_N << + (ppd->hw_pidx * QSFP_GPIO_PORT2_SHIFT); + ret = ppd->dd->f_gpio_mod(ppd->dd, 0, 0, 0); + + return !((ret & mask) >> + ((ppd->hw_pidx * QSFP_GPIO_PORT2_SHIFT) + 3)); +} + /* * Initialize structures that control access to QSFP. Called once per port * on cards that support QSFP. @@ -452,7 +459,6 @@ void qib_qsfp_init(struct qib_qsfp_data *qd, void (*fevent)(struct work_struct *)) { u32 mask, highs; - int pins; struct qib_devdata *dd = qd->ppd->dd; @@ -480,8 +486,7 @@ void qib_qsfp_init(struct qib_qsfp_data *qd, mask <<= QSFP_GPIO_PORT2_SHIFT; /* Do not try to wait here. Better to let event handle it */ - pins = dd->f_gpio_mod(dd, 0, 0, 0); - if (pins & mask) + if (!qib_qsfp_mod_present(qd->ppd)) goto bail; /* We see a module, but it may be unwise to look yet. Just schedule */ qd->t_insert = get_jiffies_64(); diff --git a/drivers/infiniband/hw/qib/qib_qsfp.h b/drivers/infiniband/hw/qib/qib_qsfp.h index c109bbdc90ac..786a92a25c25 100644 --- a/drivers/infiniband/hw/qib/qib_qsfp.h +++ b/drivers/infiniband/hw/qib/qib_qsfp.h @@ -34,6 +34,7 @@ #define QSFP_DEV 0xA0 #define QSFP_PWR_LAG_MSEC 2000 +#define QSFP_MODPRS_LAG_MSEC 20 /* * Below are masks for various QSFP signals, for Port 1. @@ -181,6 +182,7 @@ struct qib_qsfp_data { extern int qib_refresh_qsfp_cache(struct qib_pportdata *ppd, struct qib_qsfp_cache *cp); +extern int qib_qsfp_mod_present(struct qib_pportdata *ppd); extern void qib_qsfp_init(struct qib_qsfp_data *qd, void (*fevent)(struct work_struct *)); extern void qib_qsfp_deinit(struct qib_qsfp_data *qd); From 97285b78174423e5576b2e06aa45f64df009da5b Mon Sep 17 00:00:00 2001 From: Marcel Apfelbaum Date: Mon, 24 Oct 2011 11:02:34 +0200 Subject: [PATCH 54/62] mlx4_core: Add extended port capabilities support An Extended Port Info packet is sent to each hw port during HCA init. If it returns without error, we assume the port supports extended port capabilities. 
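The pattern is a probe-and-cache of a capability bit at init time, with any failure treated as "not supported" rather than fatal; later code (for example the FDR-10 check added to mlx4_ib in the next patch) only has to test the cached bit. A minimal sketch with illustrative names; the real query goes through the MAD_IFC command as in the diff below:

#include <linux/types.h>

#define MAX_PORTS 2   /* illustrative */

enum {
        EXT_PORT_CAP_EXTENDED_PORT_INFO = 1 << 0,
};

struct hca_dev {
        u8 ext_port_cap[MAX_PORTS + 1];   /* one capability byte per port */
};

/* Hypothetical wrapper around the ExtendedPortInfo query. */
extern int query_extended_port_info(struct hca_dev *dev, u8 port);

static void probe_ext_port_cap(struct hca_dev *dev, u8 port)
{
        int err = query_extended_port_info(dev, port);

        /* A failed query only means the capability is absent. */
        dev->ext_port_cap[port] = err ? 0 : EXT_PORT_CAP_EXTENDED_PORT_INFO;
}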
Signed-off-by: Marcel Apfelbaum Reviewed-by: Jack Morgenstein Signed-off-by: Roland Dreier --- drivers/net/mlx4/main.c | 7 +++++++ drivers/net/mlx4/mlx4.h | 1 + drivers/net/mlx4/port.c | 42 +++++++++++++++++++++++++++++++++++++ include/linux/mlx4/device.h | 7 +++++++ 4 files changed, 57 insertions(+) diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index f0ee35df4dd7..017616a722d7 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -998,6 +998,13 @@ static int mlx4_setup_hca(struct mlx4_dev *dev) "ib capabilities (%d). Continuing with " "caps = 0\n", port, err); dev->caps.ib_port_def_cap[port] = ib_port_default_caps; + + err = mlx4_check_ext_port_caps(dev, port); + if (err) + mlx4_warn(dev, "failed to get port %d extended " + "port capabilities support info (%d)." + " Assuming not supported\n", port, err); + err = mlx4_SET_PORT(dev, port); if (err) { mlx4_err(dev, "Failed to set port %d, aborting\n", diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index a2fcd8402d37..9ba9c565b1a0 100644 --- a/drivers/net/mlx4/mlx4.h +++ b/drivers/net/mlx4/mlx4.h @@ -450,6 +450,7 @@ void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table); int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port); int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps); +int mlx4_check_ext_port_caps(struct mlx4_dev *dev, u8 port); int mlx4_qp_detach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], enum mlx4_protocol prot, enum mlx4_steer_type steer); diff --git a/drivers/net/mlx4/port.c b/drivers/net/mlx4/port.c index 609e0ec14cee..7b2a2dafbaa4 100644 --- a/drivers/net/mlx4/port.c +++ b/drivers/net/mlx4/port.c @@ -464,6 +464,48 @@ int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps) return err; } +int mlx4_check_ext_port_caps(struct mlx4_dev *dev, u8 port) +{ + struct mlx4_cmd_mailbox *inmailbox, *outmailbox; + u8 *inbuf, *outbuf; + int err, packet_error; + + inmailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(inmailbox)) + return PTR_ERR(inmailbox); + + outmailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(outmailbox)) { + mlx4_free_cmd_mailbox(dev, inmailbox); + return PTR_ERR(outmailbox); + } + + inbuf = inmailbox->buf; + outbuf = outmailbox->buf; + memset(inbuf, 0, 256); + memset(outbuf, 0, 256); + inbuf[0] = 1; + inbuf[1] = 1; + inbuf[2] = 1; + inbuf[3] = 1; + + *(__be16 *) (&inbuf[16]) = MLX4_ATTR_EXTENDED_PORT_INFO; + *(__be32 *) (&inbuf[20]) = cpu_to_be32(port); + + err = mlx4_cmd_box(dev, inmailbox->dma, outmailbox->dma, port, 3, + MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C); + + packet_error = be16_to_cpu(*(__be16 *) (outbuf + 4)); + + dev->caps.ext_port_cap[port] = (!err && !packet_error) ? 
+ MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO + : 0; + + mlx4_free_cmd_mailbox(dev, inmailbox); + mlx4_free_cmd_mailbox(dev, outmailbox); + return err; +} + int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port) { struct mlx4_cmd_mailbox *mailbox; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 53ef894bfa05..ce9ef491addf 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -82,6 +82,12 @@ enum { MLX4_DEV_CAP_FLAG_COUNTERS = 1LL << 48 }; +#define MLX4_ATTR_EXTENDED_PORT_INFO cpu_to_be16(0xff90) + +enum { + MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO = 1 << 0 +}; + enum { MLX4_BMME_FLAG_LOCAL_INV = 1 << 6, MLX4_BMME_FLAG_REMOTE_INV = 1 << 7, @@ -276,6 +282,7 @@ struct mlx4_caps { u32 port_mask; enum mlx4_port_type possible_type[MLX4_MAX_PORTS + 1]; u32 max_counters; + u8 ext_port_cap[MLX4_MAX_PORTS + 1]; }; struct mlx4_buf_list { From a5e12dff757b562bbecd6a2359fdc4c43d4d97de Mon Sep 17 00:00:00 2001 From: Marcel Apfelbaum Date: Mon, 3 Oct 2011 19:04:20 +0300 Subject: [PATCH 55/62] IB/mlx4: Configure extended active speeds Set the extended active speeds based on the hardware configuration. Signed-off-by: Marcel Apfelbaum Reviewed-by: Hal Rosenstock [ Move FDR-10 handling into ib_link_query_port(). - Roland ] Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 39 ++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index fa643f4f4e28..4e5b654e6375 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -181,8 +181,12 @@ mlx4_ib_port_link_layer(struct ib_device *device, u8 port_num) static int ib_link_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props, + struct ib_smp *in_mad, struct ib_smp *out_mad) { + int ext_active_speed; + int err; + props->lid = be16_to_cpup((__be16 *) (out_mad->data + 16)); props->lmc = out_mad->data[34] & 0x7; props->sm_lid = be16_to_cpup((__be16 *) (out_mad->data + 18)); @@ -203,6 +207,39 @@ static int ib_link_query_port(struct ib_device *ibdev, u8 port, props->max_vl_num = out_mad->data[37] >> 4; props->init_type_reply = out_mad->data[41] >> 4; + /* Check if extended speeds (EDR/FDR/...) are supported */ + if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) { + ext_active_speed = out_mad->data[62] >> 4; + + switch (ext_active_speed) { + case 1: + props->active_speed = 16; /* FDR */ + break; + case 2: + props->active_speed = 32; /* EDR */ + break; + } + } + + /* If reported active speed is QDR, check if is FDR-10 */ + if (props->active_speed == 4) { + if (to_mdev(ibdev)->dev->caps.ext_port_cap[port] & + MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO) { + init_query_mad(in_mad); + in_mad->attr_id = MLX4_ATTR_EXTENDED_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, + NULL, NULL, in_mad, out_mad); + if (err) + return err; + + /* Checking LinkSpeedActive for FDR-10 */ + if (out_mad->data[15] & 0x1) + props->active_speed = 8; + } + } + return 0; } @@ -274,7 +311,7 @@ static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port, goto out; err = mlx4_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND ? 
- ib_link_query_port(ibdev, port, props, out_mad) : + ib_link_query_port(ibdev, port, props, in_mad, out_mad) : eth_link_query_port(ibdev, port, props, out_mad); out: From 16d99812d58b8af2df29cd337a74cd965b53da04 Mon Sep 17 00:00:00 2001 From: Mitko Haralanov Date: Wed, 19 Oct 2011 18:46:47 -0400 Subject: [PATCH 56/62] IB/qib: Fix issue with link states and QSFP cables Fix an issue where the link would come up after replugging a cable even if it has been DISABLED manually. Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_iba7322.c | 30 ++++++++++++++++--------- drivers/infiniband/hw/qib/qib_qsfp.h | 1 + 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index 86575fbcaef7..efd0a110091f 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -2310,12 +2310,15 @@ static int qib_7322_bringup_serdes(struct qib_pportdata *ppd) val = ppd->cpspec->ibcctrl_a | (QLOGIC_IB_IBCC_LINKINITCMD_DISABLE << QLOGIC_IB_IBCC_LINKINITCMD_SHIFT); + ppd->cpspec->ibcctrl_a = val; /* * Reset the PCS interface to the serdes (and also ibc, which is still * in reset from above). Writes new value of ibcctrl_a as last step. */ qib_7322_mini_pcs_reset(ppd); qib_write_kreg(dd, kr_scratch, 0ULL); + /* clear the linkinit cmds */ + ppd->cpspec->ibcctrl_a &= ~SYM_MASK(IBCCtrlA_0, LinkInitCmd); if (!ppd->cpspec->ibcctrl_b) { unsigned lse = ppd->link_speed_enabled; @@ -2381,11 +2384,6 @@ static int qib_7322_bringup_serdes(struct qib_pportdata *ppd) ppd->cpspec->ibcctrl_a |= SYM_MASK(IBCCtrlA_0, IBLinkEn); set_vls(ppd); - /* Hold the link state machine for mezz boards */ - qib_set_ib_7322_lstate(ppd, 0, - QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); - - /* be paranoid against later code motion, etc. */ spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags); ppd->p_rcvctrl |= SYM_MASK(RcvCtrl_0, RcvIBPortEnable); @@ -5594,6 +5592,7 @@ static void qsfp_7322_event(struct work_struct *work) struct qib_qsfp_data *qd; struct qib_pportdata *ppd; u64 pwrup; + unsigned long flags; int ret; u32 le2; @@ -5605,11 +5604,15 @@ static void qsfp_7322_event(struct work_struct *work) /* Delay for 20 msecs to allow ModPrs resistor to setup */ mdelay(QSFP_MODPRS_LAG_MSEC); - if (!qib_qsfp_mod_present(ppd)) + if (!qib_qsfp_mod_present(ppd)) { + ppd->cpspec->qsfp_data.modpresent = 0; /* Set the physical link to disabled */ qib_set_ib_7322_lstate(ppd, 0, QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); - else { + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_LINKV; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } else { /* * Some QSFP's not only do not respond until the full power-up * time, but may behave badly if we try. So hold off responding @@ -5649,11 +5652,18 @@ static void qsfp_7322_event(struct work_struct *work) */ init_txdds_table(ppd, 0); /* The physical link is being re-enabled only when the - previous state was DISABLED. This should only happen when - the cable has been physically pulled. */ - if (ppd->lflags & QIBL_IB_LINK_DISABLED) + * previous state was DISABLED and the VALID bit is not + * set. This should only happen when the cable has been + * physically pulled. 
*/ + if (!ppd->cpspec->qsfp_data.modpresent && + (ppd->lflags & (QIBL_LINKV | QIBL_IB_LINK_DISABLED))) { + ppd->cpspec->qsfp_data.modpresent = 1; qib_set_ib_7322_lstate(ppd, 0, QLOGIC_IB_IBCC_LINKINITCMD_SLEEP); + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_LINKV; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } } } diff --git a/drivers/infiniband/hw/qib/qib_qsfp.h b/drivers/infiniband/hw/qib/qib_qsfp.h index 786a92a25c25..46002a9417c0 100644 --- a/drivers/infiniband/hw/qib/qib_qsfp.h +++ b/drivers/infiniband/hw/qib/qib_qsfp.h @@ -178,6 +178,7 @@ struct qib_qsfp_data { struct work_struct work; struct qib_qsfp_cache cache; u64 t_insert; + u8 modpresent; }; extern int qib_refresh_qsfp_cache(struct qib_pportdata *ppd, From f7cc25d018f1e9af6767ee7774bbe83452e9fdf4 Mon Sep 17 00:00:00 2001 From: Kumar Sanghvi Date: Mon, 24 Oct 2011 21:20:22 +0530 Subject: [PATCH 57/62] RDMA/cxgb3: Serialize calls to CQ's comp_handler iw_cxgb3 has a potential problem where a CQ's comp_handler can get called simultaneously from different places in iw_cxgb3 driver. This does not comply with Documentation/infiniband/core_locking.txt, which states that at a given point of time, there should be only one callback per CQ should be active. Such problem was reported by Parav Pandit for iw_cxgb4 driver. Based on discussion between Parav Pandit and Steve Wise, this patch fixes the above problem by serializing the calls to a CQ's comp_handler using a spin_lock. Signed-off-by: Kumar Sanghvi Acked-by: Steve Wise Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb3/iwch_ev.c | 6 ++++++ drivers/infiniband/hw/cxgb3/iwch_provider.c | 1 + drivers/infiniband/hw/cxgb3/iwch_provider.h | 1 + drivers/infiniband/hw/cxgb3/iwch_qp.c | 14 ++++++++++++-- 4 files changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/cxgb3/iwch_ev.c b/drivers/infiniband/hw/cxgb3/iwch_ev.c index 71e0d845da3d..abcc9e76962b 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_ev.c +++ b/drivers/infiniband/hw/cxgb3/iwch_ev.c @@ -46,6 +46,7 @@ static void post_qp_event(struct iwch_dev *rnicp, struct iwch_cq *chp, struct ib_event event; struct iwch_qp_attributes attrs; struct iwch_qp *qhp; + unsigned long flag; spin_lock(&rnicp->lock); qhp = get_qhp(rnicp, CQE_QPID(rsp_msg->cqe)); @@ -94,7 +95,9 @@ static void post_qp_event(struct iwch_dev *rnicp, struct iwch_cq *chp, if (qhp->ibqp.event_handler) (*qhp->ibqp.event_handler)(&event, qhp->ibqp.qp_context); + spin_lock_irqsave(&chp->comp_handler_lock, flag); (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); + spin_unlock_irqrestore(&chp->comp_handler_lock, flag); if (atomic_dec_and_test(&qhp->refcnt)) wake_up(&qhp->wait); @@ -107,6 +110,7 @@ void iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct sk_buff *skb) struct iwch_cq *chp; struct iwch_qp *qhp; u32 cqid = RSPQ_CQID(rsp_msg); + unsigned long flag; rnicp = (struct iwch_dev *) rdev_p->ulp; spin_lock(&rnicp->lock); @@ -170,7 +174,9 @@ void iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct sk_buff *skb) */ if (qhp->ep && SQ_TYPE(rsp_msg->cqe)) dst_confirm(qhp->ep->dst); + spin_lock_irqsave(&chp->comp_handler_lock, flag); (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); + spin_unlock_irqrestore(&chp->comp_handler_lock, flag); break; case TPT_ERR_STAG: diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index c7d9411f2954..37c224fc3ad9 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ 
-190,6 +190,7 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, int entries, int ve chp->rhp = rhp; chp->ibcq.cqe = 1 << chp->cq.size_log2; spin_lock_init(&chp->lock); + spin_lock_init(&chp->comp_handler_lock); atomic_set(&chp->refcnt, 1); init_waitqueue_head(&chp->wait); if (insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid)) { diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.h b/drivers/infiniband/hw/cxgb3/iwch_provider.h index 9a342c9b220d..87c14b0c5ac0 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.h +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.h @@ -103,6 +103,7 @@ struct iwch_cq { struct iwch_dev *rhp; struct t3_cq cq; spinlock_t lock; + spinlock_t comp_handler_lock; atomic_t refcnt; wait_queue_head_t wait; u32 __user *user_rptr_addr; diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c index ecd313f359a4..bea5839d89ee 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_qp.c +++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c @@ -822,8 +822,11 @@ static void __flush_qp(struct iwch_qp *qhp, struct iwch_cq *rchp, flushed = cxio_flush_rq(&qhp->wq, &rchp->cq, count); spin_unlock(&qhp->lock); spin_unlock_irqrestore(&rchp->lock, *flag); - if (flushed) + if (flushed) { + spin_lock_irqsave(&rchp->comp_handler_lock, *flag); (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); + spin_unlock_irqrestore(&rchp->comp_handler_lock, *flag); + } /* locking hierarchy: cq lock first, then qp lock. */ spin_lock_irqsave(&schp->lock, *flag); @@ -833,8 +836,11 @@ static void __flush_qp(struct iwch_qp *qhp, struct iwch_cq *rchp, flushed = cxio_flush_sq(&qhp->wq, &schp->cq, count); spin_unlock(&qhp->lock); spin_unlock_irqrestore(&schp->lock, *flag); - if (flushed) + if (flushed) { + spin_lock_irqsave(&schp->comp_handler_lock, *flag); (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context); + spin_unlock_irqrestore(&schp->comp_handler_lock, *flag); + } /* deref */ if (atomic_dec_and_test(&qhp->refcnt)) @@ -853,11 +859,15 @@ static void flush_qp(struct iwch_qp *qhp, unsigned long *flag) if (qhp->ibqp.uobject) { cxio_set_wq_in_error(&qhp->wq); cxio_set_cq_in_error(&rchp->cq); + spin_lock_irqsave(&rchp->comp_handler_lock, *flag); (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); + spin_unlock_irqrestore(&rchp->comp_handler_lock, *flag); if (schp != rchp) { cxio_set_cq_in_error(&schp->cq); + spin_lock_irqsave(&schp->comp_handler_lock, *flag); (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context); + spin_unlock_irqrestore(&schp->comp_handler_lock, *flag); } return; } From 581bbe2cd0694a935e0c3ccd7f011e10094f1df6 Mon Sep 17 00:00:00 2001 From: Kumar Sanghvi Date: Mon, 24 Oct 2011 21:20:21 +0530 Subject: [PATCH 58/62] RDMA/cxgb4: Serialize calls to CQ's comp_handler Commit 01e7da6ba53c ("RDMA/cxgb4: Make sure flush CQ entries are collected on connection close") introduced a potential problem where a CQ's comp_handler can get called simultaneously from different places in the iw_cxgb4 driver. This does not comply with Documentation/infiniband/core_locking.txt, which states that at a given point of time, there should be only one callback per CQ should be active. This problem was reported by Parav Pandit . Based on discussion between Parav Pandit and Steve Wise, this patch fixes the above problem by serializing the calls to a CQ's comp_handler using a spin_lock. 
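The serialization pattern is the same as in the cxgb3 patch: every path that invokes the CQ's comp_handler takes a per-CQ spinlock around the upcall, so at most one callback per CQ is ever active, as core_locking.txt requires. A minimal sketch of the pattern (the wrapper function is illustrative, not an iw_cxgb4 symbol):

#include <linux/spinlock.h>
#include <rdma/ib_verbs.h>

struct my_cq {
        struct ib_cq ibcq;
        spinlock_t comp_handler_lock;   /* serializes comp_handler upcalls */
};

/* Called at CQ creation: spin_lock_init(&chp->comp_handler_lock); */

static void my_cq_notify(struct my_cq *chp)
{
        unsigned long flags;

        spin_lock_irqsave(&chp->comp_handler_lock, flags);
        (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context);
        spin_unlock_irqrestore(&chp->comp_handler_lock, flags);
}

Using the irqsave variant matters because the upcall is made both from interrupt-driven CQ event dispatch and from process-context flush paths.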
Reported-by: Parav Pandit Signed-off-by: Kumar Sanghvi Acked-by: Steve Wise Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb4/cq.c | 1 + drivers/infiniband/hw/cxgb4/ev.c | 10 ++++++++-- drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 1 + drivers/infiniband/hw/cxgb4/qp.c | 15 +++++++++++++-- 4 files changed, 23 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index 901c5fbf71a4..f35a935267e7 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -818,6 +818,7 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int entries, chp->cq.size--; /* status page */ chp->ibcq.cqe = entries - 2; spin_lock_init(&chp->lock); + spin_lock_init(&chp->comp_handler_lock); atomic_set(&chp->refcnt, 1); init_waitqueue_head(&chp->wait); ret = insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid); diff --git a/drivers/infiniband/hw/cxgb4/ev.c b/drivers/infiniband/hw/cxgb4/ev.c index c13041a0aeba..397cb36cf103 100644 --- a/drivers/infiniband/hw/cxgb4/ev.c +++ b/drivers/infiniband/hw/cxgb4/ev.c @@ -42,6 +42,7 @@ static void post_qp_event(struct c4iw_dev *dev, struct c4iw_cq *chp, { struct ib_event event; struct c4iw_qp_attributes attrs; + unsigned long flag; if ((qhp->attr.state == C4IW_QP_STATE_ERROR) || (qhp->attr.state == C4IW_QP_STATE_TERMINATE)) { @@ -72,7 +73,9 @@ static void post_qp_event(struct c4iw_dev *dev, struct c4iw_cq *chp, if (qhp->ibqp.event_handler) (*qhp->ibqp.event_handler)(&event, qhp->ibqp.qp_context); + spin_lock_irqsave(&chp->comp_handler_lock, flag); (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); + spin_unlock_irqrestore(&chp->comp_handler_lock, flag); } void c4iw_ev_dispatch(struct c4iw_dev *dev, struct t4_cqe *err_cqe) @@ -183,11 +186,14 @@ out: int c4iw_ev_handler(struct c4iw_dev *dev, u32 qid) { struct c4iw_cq *chp; + unsigned long flag; chp = get_chp(dev, qid); - if (chp) + if (chp) { + spin_lock_irqsave(&chp->comp_handler_lock, flag); (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); - else + spin_unlock_irqrestore(&chp->comp_handler_lock, flag); + } else PDBG("%s unknown cqid 0x%x\n", __func__, qid); return 0; } diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index 4f045375c8e2..02f015fc3ed2 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -309,6 +309,7 @@ struct c4iw_cq { struct c4iw_dev *rhp; struct t4_cq cq; spinlock_t lock; + spinlock_t comp_handler_lock; atomic_t refcnt; wait_queue_head_t wait; }; diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 892fa7c6d310..62c7262a9eb3 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -941,8 +941,11 @@ static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp, flushed = c4iw_flush_rq(&qhp->wq, &rchp->cq, count); spin_unlock(&qhp->lock); spin_unlock_irqrestore(&rchp->lock, flag); - if (flushed) + if (flushed) { + spin_lock_irqsave(&rchp->comp_handler_lock, flag); (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); + spin_unlock_irqrestore(&rchp->comp_handler_lock, flag); + } /* locking hierarchy: cq lock first, then qp lock. 
*/ spin_lock_irqsave(&schp->lock, flag); @@ -952,13 +955,17 @@ static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp, flushed = c4iw_flush_sq(&qhp->wq, &schp->cq, count); spin_unlock(&qhp->lock); spin_unlock_irqrestore(&schp->lock, flag); - if (flushed) + if (flushed) { + spin_lock_irqsave(&schp->comp_handler_lock, flag); (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context); + spin_unlock_irqrestore(&schp->comp_handler_lock, flag); + } } static void flush_qp(struct c4iw_qp *qhp) { struct c4iw_cq *rchp, *schp; + unsigned long flag; rchp = get_chp(qhp->rhp, qhp->attr.rcq); schp = get_chp(qhp->rhp, qhp->attr.scq); @@ -966,11 +973,15 @@ static void flush_qp(struct c4iw_qp *qhp) if (qhp->ibqp.uobject) { t4_set_wq_in_error(&qhp->wq); t4_set_cq_in_error(&rchp->cq); + spin_lock_irqsave(&rchp->comp_handler_lock, flag); (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); + spin_unlock_irqrestore(&rchp->comp_handler_lock, flag); if (schp != rchp) { t4_set_cq_in_error(&schp->cq); + spin_lock_irqsave(&schp->comp_handler_lock, flag); (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context); + spin_unlock_irqrestore(&schp->comp_handler_lock, flag); } return; } From d32ae393dbf0daf778f9e33b0bc6591cd102391e Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Tue, 25 Oct 2011 16:38:30 +0530 Subject: [PATCH 59/62] RDMA/cxgb4: Mark QP in error before disabling the queue in firmware QPs need to be moved to error before telling the firwmare to shutdown the queue. Otherwise, the application can submit WRs that will never get fetched by the hardware and never flushed by the driver. Signed-off-by: Kumar Sanghvi Acked-by: Steve Wise Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb4/qp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 62c7262a9eb3..2466cfcc9ffc 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -1221,6 +1221,8 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, disconnect = 1; c4iw_get_ep(&qhp->ep->com); } + if (qhp->ibqp.uobject) + t4_set_wq_in_error(&qhp->wq); ret = rdma_fini(rhp, qhp, ep); if (ret) goto err; @@ -1237,6 +1239,8 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, break; case C4IW_QP_STATE_ERROR: set_state(qhp, C4IW_QP_STATE_ERROR); + if (qhp->ibqp.uobject) + t4_set_wq_in_error(&qhp->wq); if (!internal) { abort = 1; disconnect = 1; From bcacb897569f0e7aab7643b22567d8de22ef9dfc Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Mon, 10 Oct 2011 10:53:41 +0200 Subject: [PATCH 60/62] IB/mlx4: Enable 4K mtu for IBoE The IBoE port MTU is derived from the corresponding Ethernet netdevice MTU, which can support jumbo frames of 9K, and hence surely supports the max IB mtu of 4K. 
Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index fa643f4f4e28..e953cf9ffac6 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -227,7 +227,7 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port, props->pkey_tbl_len = 1; props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data + 46)); props->qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad->data + 48)); - props->max_mtu = IB_MTU_2048; + props->max_mtu = IB_MTU_4096; props->subnet_timeout = 0; props->max_vl_num = out_mad->data[37] >> 4; props->init_type_reply = 0; From 80a2dcd8d05c11d42f4e606d7a5f3eaa2794ab34 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Mon, 10 Oct 2011 10:54:42 +0200 Subject: [PATCH 61/62] IB/mlx4: Don't set VLAN in IBoE WQEs' control segment There's no need to set the vlan-related fields in an IBoE send WQE control segment: - the vlan to be used by a UD QP is set in the datagram segment. - for GSI (CM) QP, all the headers down to 8021q and MAC are built by the software anyway. Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/qp.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 3a91d9d8dc51..819b1a535b55 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -1547,14 +1547,13 @@ static void set_masked_atomic_seg(struct mlx4_wqe_masked_atomic_seg *aseg, } static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, - struct ib_send_wr *wr, __be16 *vlan) + struct ib_send_wr *wr) { memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av)); dseg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn); dseg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey); dseg->vlan = to_mah(wr->wr.ud.ah)->av.eth.vlan; memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->av.eth.mac, 6); - *vlan = dseg->vlan; } static void set_mlx_icrc_seg(void *dseg) @@ -1657,7 +1656,6 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, __be32 uninitialized_var(lso_hdr_sz); __be32 blh; int i; - __be16 vlan = cpu_to_be16(0xffff); spin_lock_irqsave(&qp->sq.lock, flags); @@ -1761,7 +1759,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, break; case IB_QPT_UD: - set_datagram_seg(wqe, wr, &vlan); + set_datagram_seg(wqe, wr); wqe += sizeof (struct mlx4_wqe_datagram_seg); size += sizeof (struct mlx4_wqe_datagram_seg) / 16; @@ -1824,11 +1822,6 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ? MLX4_WQE_CTRL_FENCE : 0) | size; - if (be16_to_cpu(vlan) < 0x1000) { - ctrl->ins_vlan = 1 << 6; - ctrl->vlan_tag = vlan; - } - /* * Make sure descriptor is fully written before * setting ownership bit (because HW can start From cb29688aaa4caa4d54df2976118fe99a839bb433 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Sun, 16 Oct 2011 10:26:21 +0200 Subject: [PATCH 62/62] mlx4_core: Deprecate log_num_vlan module param Enable the maximum size (128) supported by the device for the shadow vlans table, ignoring the module parameter that overrides it. This table is only used by the IBoE control plane for setting a vlan index into an RC/UC QP context or UD Address Handle. 
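The deprecation pattern keeps the module parameter so existing module option lines do not break module load, but ignores its value, warns when it is set, and always uses the device maximum. A condensed sketch of that pattern (the caps structure and field names are illustrative):

#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/printk.h>

static int log_num_vlan;
module_param_named(log_num_vlan, log_num_vlan, int, 0444);
MODULE_PARM_DESC(log_num_vlan, "Obsolete, ignored");

#define LOG_NUM_VLANS 7   /* 2^7 = 128 vlan table entries, the device max */

struct dev_caps {
        int log_num_vlans;
};

static void apply_vlan_param(struct dev_caps *caps)
{
        if (log_num_vlan != 0)
                pr_warn("log_num_vlan is obsolete, using %d\n", LOG_NUM_VLANS);
        caps->log_num_vlans = LOG_NUM_VLANS;
}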
Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/net/mlx4/main.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index f0ee35df4dd7..0c0c7d95133b 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -96,6 +96,8 @@ MODULE_PARM_DESC(log_num_mac, "Log2 max number of MACs per ETH port (1-7)"); static int log_num_vlan; module_param_named(log_num_vlan, log_num_vlan, int, 0444); MODULE_PARM_DESC(log_num_vlan, "Log2 max number of VLANs per ETH port (0-7)"); +/* Log2 max number of VLANs per ETH port (0-7) */ +#define MLX4_LOG_NUM_VLANS 7 static int use_prio; module_param_named(use_prio, use_prio, bool, 0444); @@ -230,7 +232,7 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.max_gso_sz = dev_cap->max_gso_sz; dev->caps.log_num_macs = log_num_mac; - dev->caps.log_num_vlans = log_num_vlan; + dev->caps.log_num_vlans = MLX4_LOG_NUM_VLANS; dev->caps.log_num_prios = use_prio ? 3 : 0; for (i = 1; i <= dev->caps.num_ports; ++i) { @@ -1489,10 +1491,9 @@ static int __init mlx4_verify_params(void) return -1; } - if ((log_num_vlan < 0) || (log_num_vlan > 7)) { - pr_warning("mlx4_core: bad num_vlan: %d\n", log_num_vlan); - return -1; - } + if (log_num_vlan != 0) + pr_warning("mlx4_core: log_num_vlan - obsolete module param, using %d\n", + MLX4_LOG_NUM_VLANS); if ((log_mtts_per_seg < 1) || (log_mtts_per_seg > 7)) { pr_warning("mlx4_core: bad log_mtts_per_seg: %d\n", log_mtts_per_seg);