From 75c21ae9aa75b0452318d05f737ea838672137f5 Mon Sep 17 00:00:00 2001 From: Joachim Fenkes Date: Mon, 1 Dec 2008 20:58:57 -0800 Subject: [PATCH 01/26] IB/ehca: Fix locking for shca_list_lock shca_list_lock is taken from softirq context in ehca_poll_eqs, so we need to lock IRQ safe elsewhere. Found by lockdep. Signed-off-by: Michael Ellerman Signed-off-by: Joachim Fenkes Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ehca/ehca_main.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c index bec7e0249358..3b77b674cbf6 100644 --- a/drivers/infiniband/hw/ehca/ehca_main.c +++ b/drivers/infiniband/hw/ehca/ehca_main.c @@ -717,6 +717,7 @@ static int __devinit ehca_probe(struct of_device *dev, const u64 *handle; struct ib_pd *ibpd; int ret, i, eq_size; + unsigned long flags; handle = of_get_property(dev->node, "ibm,hca-handle", NULL); if (!handle) { @@ -830,9 +831,9 @@ static int __devinit ehca_probe(struct of_device *dev, ehca_err(&shca->ib_device, "Cannot create device attributes ret=%d", ret); - spin_lock(&shca_list_lock); + spin_lock_irqsave(&shca_list_lock, flags); list_add(&shca->shca_list, &shca_list); - spin_unlock(&shca_list_lock); + spin_unlock_irqrestore(&shca_list_lock, flags); return 0; @@ -878,6 +879,7 @@ probe1: static int __devexit ehca_remove(struct of_device *dev) { struct ehca_shca *shca = dev->dev.driver_data; + unsigned long flags; int ret; sysfs_remove_group(&dev->dev.kobj, &ehca_dev_attr_grp); @@ -915,9 +917,9 @@ static int __devexit ehca_remove(struct of_device *dev) ib_dealloc_device(&shca->ib_device); - spin_lock(&shca_list_lock); + spin_lock_irqsave(&shca_list_lock, flags); list_del(&shca->shca_list); - spin_unlock(&shca_list_lock); + spin_unlock_irqrestore(&shca_list_lock, flags); return ret; } @@ -975,6 +977,7 @@ static int ehca_mem_notifier(struct notifier_block *nb, unsigned long action, void *data) { static unsigned long ehca_dmem_warn_time; + unsigned long flags; switch (action) { case MEM_CANCEL_OFFLINE: @@ -985,12 +988,12 @@ static int ehca_mem_notifier(struct notifier_block *nb, case MEM_GOING_ONLINE: case MEM_GOING_OFFLINE: /* only ok if no hca is attached to the lpar */ - spin_lock(&shca_list_lock); + spin_lock_irqsave(&shca_list_lock, flags); if (list_empty(&shca_list)) { - spin_unlock(&shca_list_lock); + spin_unlock_irqrestore(&shca_list_lock, flags); return NOTIFY_OK; } else { - spin_unlock(&shca_list_lock); + spin_unlock_irqrestore(&shca_list_lock, flags); if (printk_timed_ratelimit(&ehca_dmem_warn_time, 30 * 1000)) ehca_gen_err("DMEM operations are not allowed" From 64f22fa17c1a531e682ebc882566856ea5718495 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 1 Dec 2008 20:59:07 -0800 Subject: [PATCH 02/26] IB/ipath: Fix pointer-to-pointer thinko in ipath_fs.c The return from lookup_one_len() is assigned to *dentry, so that's what we should be checking with IS_ERR(). Signed-off-by: Michael Ellerman Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c index 8bb5170b4e41..53912c327bfe 100644 --- a/drivers/infiniband/hw/ipath/ipath_fs.c +++ b/drivers/infiniband/hw/ipath/ipath_fs.c @@ -86,7 +86,7 @@ static int create_file(const char *name, mode_t mode, *dentry = NULL; mutex_lock(&parent->d_inode->i_mutex); *dentry = lookup_one_len(name, parent, strlen(name)); - if (!IS_ERR(dentry)) + if (!IS_ERR(*dentry)) error = ipathfs_mknod(parent->d_inode, *dentry, mode, fops, data); else From 7c37d74474c8ee8ddcd5a2d2a9571d4a1290c844 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Mon, 1 Dec 2008 20:59:08 -0800 Subject: [PATCH 03/26] IB/ipath: Improve UD loopback performance by allocating temp array only once Receive work queue entries are checked for L_Key validity, and pointers to the memory region structure are saved in an allocated structure. For UD loopback packets, this structure is allocated and freed for each packet. This patch changes that to allocate/free during QP creation and destruction. Signed-off-by: Ralph Campbell Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_qp.c | 32 +++++++++++++++++------ drivers/infiniband/hw/ipath/ipath_ud.c | 19 +------------- drivers/infiniband/hw/ipath/ipath_verbs.h | 1 + 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/drivers/infiniband/hw/ipath/ipath_qp.c b/drivers/infiniband/hw/ipath/ipath_qp.c index 4715911101e4..3a5a89b609c4 100644 --- a/drivers/infiniband/hw/ipath/ipath_qp.c +++ b/drivers/infiniband/hw/ipath/ipath_qp.c @@ -745,6 +745,7 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, struct ipath_swqe *swq = NULL; struct ipath_ibdev *dev; size_t sz; + size_t sg_list_sz; struct ib_qp *ret; if (init_attr->create_flags) { @@ -789,19 +790,31 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, goto bail; } sz = sizeof(*qp); + sg_list_sz = 0; if (init_attr->srq) { struct ipath_srq *srq = to_isrq(init_attr->srq); - sz += sizeof(*qp->r_sg_list) * - srq->rq.max_sge; - } else - sz += sizeof(*qp->r_sg_list) * - init_attr->cap.max_recv_sge; - qp = kmalloc(sz, GFP_KERNEL); + if (srq->rq.max_sge > 1) + sg_list_sz = sizeof(*qp->r_sg_list) * + (srq->rq.max_sge - 1); + } else if (init_attr->cap.max_recv_sge > 1) + sg_list_sz = sizeof(*qp->r_sg_list) * + (init_attr->cap.max_recv_sge - 1); + qp = kmalloc(sz + sg_list_sz, GFP_KERNEL); if (!qp) { ret = ERR_PTR(-ENOMEM); goto bail_swq; } + if (sg_list_sz && (init_attr->qp_type == IB_QPT_UD || + init_attr->qp_type == IB_QPT_SMI || + init_attr->qp_type == IB_QPT_GSI)) { + qp->r_ud_sg_list = kmalloc(sg_list_sz, GFP_KERNEL); + if (!qp->r_ud_sg_list) { + ret = ERR_PTR(-ENOMEM); + goto bail_qp; + } + } else + qp->r_ud_sg_list = NULL; if (init_attr->srq) { sz = 0; qp->r_rq.size = 0; @@ -818,7 +831,7 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, qp->r_rq.size * sz); if (!qp->r_rq.wq) { ret = ERR_PTR(-ENOMEM); - goto bail_qp; + goto bail_sg_list; } } @@ -848,7 +861,7 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, if (err) { ret = ERR_PTR(err); vfree(qp->r_rq.wq); - goto bail_qp; + goto bail_sg_list; } qp->ip = NULL; qp->s_tx = NULL; @@ -925,6 +938,8 @@ bail_ip: vfree(qp->r_rq.wq); ipath_free_qp(&dev->qp_table, qp); free_qpn(&dev->qp_table, qp->ibqp.qp_num); +bail_sg_list: + kfree(qp->r_ud_sg_list); bail_qp: kfree(qp); bail_swq: @@ -989,6 +1004,7 @@ int ipath_destroy_qp(struct ib_qp *ibqp) kref_put(&qp->ip->ref, ipath_release_mmap_info); else vfree(qp->r_rq.wq); + kfree(qp->r_ud_sg_list); vfree(qp->s_wq); kfree(qp); return 0; diff --git a/drivers/infiniband/hw/ipath/ipath_ud.c b/drivers/infiniband/hw/ipath/ipath_ud.c index 729446f56aab..91c74cc797ae 100644 --- a/drivers/infiniband/hw/ipath/ipath_ud.c +++ b/drivers/infiniband/hw/ipath/ipath_ud.c @@ -70,8 +70,6 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe) goto done; } - rsge.sg_list = NULL; - /* * Check that the qkey matches (except for QP0, see 9.6.1.4.1). * Qkeys with the high order bit set mean use the @@ -115,21 +113,6 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe) rq = &qp->r_rq; } - if (rq->max_sge > 1) { - /* - * XXX We could use GFP_KERNEL if ipath_do_send() - * was always called from the tasklet instead of - * from ipath_post_send(). - */ - rsge.sg_list = kmalloc((rq->max_sge - 1) * - sizeof(struct ipath_sge), - GFP_ATOMIC); - if (!rsge.sg_list) { - dev->n_pkt_drops++; - goto drop; - } - } - /* * Get the next work request entry to find where to put the data. * Note that it is safe to drop the lock after changing rq->tail @@ -147,6 +130,7 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe) goto drop; } wqe = get_rwqe_ptr(rq, tail); + rsge.sg_list = qp->r_ud_sg_list; if (!ipath_init_sge(qp, wqe, &rlen, &rsge)) { spin_unlock_irqrestore(&rq->lock, flags); dev->n_pkt_drops++; @@ -242,7 +226,6 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe) ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, swqe->wr.send_flags & IB_SEND_SOLICITED); drop: - kfree(rsge.sg_list); if (atomic_dec_and_test(&qp->refcount)) wake_up(&qp->wait); done:; diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.h b/drivers/infiniband/hw/ipath/ipath_verbs.h index 9d12ae8a778e..11e3f613df93 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.h +++ b/drivers/infiniband/hw/ipath/ipath_verbs.h @@ -431,6 +431,7 @@ struct ipath_qp { u32 s_lsn; /* limit sequence number (credit) */ struct ipath_swqe *s_wq; /* send work queue */ struct ipath_swqe *s_wqe; + struct ipath_sge *r_ud_sg_list; struct ipath_rq r_rq; /* receive work queue */ struct ipath_sge r_sg_list[0]; /* verified SGEs */ }; From c5d321e5c924384cf5b35f6288d69e9237490565 Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Fri, 21 Nov 2008 20:50:38 -0600 Subject: [PATCH 04/26] RDMA/nes: Cleanup cqp_request list usage Use nes_free_cqp_request() instead of open coding. Change some continue to break in nes_cm_timer_tick, because send_entry used to be a list processed in a loop (so continue went to the next item). Now it is a single item, so using break is correct. Signed-off-by: Faisal Latif Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.c | 12 +++---- drivers/infiniband/hw/nes/nes_verbs.c | 45 ++++++--------------------- 2 files changed, 16 insertions(+), 41 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 2caf9da81ad5..2a1d6c7f8d32 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -519,7 +519,7 @@ static void nes_cm_timer_tick(unsigned long pass) do { send_entry = cm_node->send_entry; if (!send_entry) - continue; + break; if (time_after(send_entry->timetosend, jiffies)) { if (cm_node->state != NES_CM_STATE_TSA) { if ((nexttimeout > @@ -528,18 +528,18 @@ static void nes_cm_timer_tick(unsigned long pass) nexttimeout = send_entry->timetosend; settimer = 1; - continue; + break; } } else { free_retrans_entry(cm_node); - continue; + break; } } if ((cm_node->state == NES_CM_STATE_TSA) || (cm_node->state == NES_CM_STATE_CLOSED)) { free_retrans_entry(cm_node); - continue; + break; } if (!send_entry->retranscount || @@ -557,7 +557,7 @@ static void nes_cm_timer_tick(unsigned long pass) NES_CM_EVENT_ABORTED); spin_lock_irqsave(&cm_node->retrans_list_lock, flags); - continue; + break; } atomic_inc(&send_entry->skb->users); cm_packets_retrans++; @@ -583,7 +583,7 @@ static void nes_cm_timer_tick(unsigned long pass) send_entry->retrycount--; nexttimeout = jiffies + NES_SHORT_TIME; settimer = 1; - continue; + break; } else { cm_packets_sent++; } diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index d36c9a0bf1bb..4fdb72454f94 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -1695,13 +1695,8 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, int entries, /* use 4k pbl */ nes_debug(NES_DBG_CQ, "pbl_entries=%u, use a 4k PBL\n", pbl_entries); if (nesadapter->free_4kpbl == 0) { - if (cqp_request->dynamic) { - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - kfree(cqp_request); - } else { - list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs); - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - } + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + nes_free_cqp_request(nesdev, cqp_request); if (!context) pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem, nescq->hw_cq.cq_pbase); @@ -1717,13 +1712,8 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, int entries, /* use 256 byte pbl */ nes_debug(NES_DBG_CQ, "pbl_entries=%u, use a 256 byte PBL\n", pbl_entries); if (nesadapter->free_256pbl == 0) { - if (cqp_request->dynamic) { - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - kfree(cqp_request); - } else { - list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs); - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - } + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + nes_free_cqp_request(nesdev, cqp_request); if (!context) pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem, nescq->hw_cq.cq_pbase); @@ -1928,13 +1918,8 @@ static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd, /* Two level PBL */ if ((pbl_count+1) > nesadapter->free_4kpbl) { nes_debug(NES_DBG_MR, "Out of 4KB Pbls for two level request.\n"); - if (cqp_request->dynamic) { - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - kfree(cqp_request); - } else { - list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs); - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - } + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + nes_free_cqp_request(nesdev, cqp_request); return -ENOMEM; } else { nesadapter->free_4kpbl -= pbl_count+1; @@ -1942,13 +1927,8 @@ static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd, } else if (residual_page_count > 32) { if (pbl_count > nesadapter->free_4kpbl) { nes_debug(NES_DBG_MR, "Out of 4KB Pbls.\n"); - if (cqp_request->dynamic) { - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - kfree(cqp_request); - } else { - list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs); - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - } + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + nes_free_cqp_request(nesdev, cqp_request); return -ENOMEM; } else { nesadapter->free_4kpbl -= pbl_count; @@ -1956,13 +1936,8 @@ static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd, } else { if (pbl_count > nesadapter->free_256pbl) { nes_debug(NES_DBG_MR, "Out of 256B Pbls.\n"); - if (cqp_request->dynamic) { - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - kfree(cqp_request); - } else { - list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs); - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - } + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + nes_free_cqp_request(nesdev, cqp_request); return -ENOMEM; } else { nesadapter->free_256pbl -= pbl_count; From 879e5bd5a1a0a317fb67fa4dc550db092a7bdcb0 Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Fri, 21 Nov 2008 20:50:41 -0600 Subject: [PATCH 05/26] RDMA/nes: Lock down connected_nodes list while processing it While processing connected_nodes list, we would release the lock when we need to send reset to remote partner. That created a window where the list can be modified. Change this into a two step process: place nodes that need processing on a local list then process the local list. Signed-off-by: Faisal Latif Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.c | 40 +++++++++++++++++++++--------- drivers/infiniband/hw/nes/nes_cm.h | 2 ++ 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 2a1d6c7f8d32..257d994ec7b5 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -459,13 +459,23 @@ static void nes_cm_timer_tick(unsigned long pass) int ret = NETDEV_TX_OK; enum nes_cm_node_state last_state; + struct list_head timer_list; + INIT_LIST_HEAD(&timer_list); spin_lock_irqsave(&cm_core->ht_lock, flags); list_for_each_safe(list_node, list_core_temp, - &cm_core->connected_nodes) { + &cm_core->connected_nodes) { cm_node = container_of(list_node, struct nes_cm_node, list); - add_ref_cm_node(cm_node); - spin_unlock_irqrestore(&cm_core->ht_lock, flags); + if (!list_empty(&cm_node->recv_list) || (cm_node->send_entry)) { + add_ref_cm_node(cm_node); + list_add(&cm_node->timer_entry, &timer_list); + } + } + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + + list_for_each_safe(list_node, list_core_temp, &timer_list) { + cm_node = container_of(list_node, struct nes_cm_node, + timer_entry); spin_lock_irqsave(&cm_node->recv_list_lock, flags); list_for_each_safe(list_core, list_node_temp, &cm_node->recv_list) { @@ -615,14 +625,12 @@ static void nes_cm_timer_tick(unsigned long pass) spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); rem_ref_cm_node(cm_node->cm_core, cm_node); - spin_lock_irqsave(&cm_core->ht_lock, flags); if (ret != NETDEV_TX_OK) { nes_debug(NES_DBG_CM, "rexmit failed for cm_node=%p\n", cm_node); break; } } - spin_unlock_irqrestore(&cm_core->ht_lock, flags); if (settimer) { if (!timer_pending(&cm_core->tcp_timer)) { @@ -925,28 +933,36 @@ static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core, struct list_head *list_pos = NULL; struct list_head *list_temp = NULL; struct nes_cm_node *cm_node = NULL; + struct list_head reset_list; nes_debug(NES_DBG_CM, "attempting listener= %p free_nodes= %d, " "refcnt=%d\n", listener, free_hanging_nodes, atomic_read(&listener->ref_count)); /* free non-accelerated child nodes for this listener */ + INIT_LIST_HEAD(&reset_list); if (free_hanging_nodes) { spin_lock_irqsave(&cm_core->ht_lock, flags); list_for_each_safe(list_pos, list_temp, - &g_cm_core->connected_nodes) { + &g_cm_core->connected_nodes) { cm_node = container_of(list_pos, struct nes_cm_node, list); if ((cm_node->listener == listener) && - (!cm_node->accelerated)) { - cleanup_retrans_entry(cm_node); - spin_unlock_irqrestore(&cm_core->ht_lock, - flags); - send_reset(cm_node, NULL); - spin_lock_irqsave(&cm_core->ht_lock, flags); + (!cm_node->accelerated)) { + add_ref_cm_node(cm_node); + list_add(&cm_node->reset_entry, &reset_list); } } spin_unlock_irqrestore(&cm_core->ht_lock, flags); } + + list_for_each_safe(list_pos, list_temp, &reset_list) { + cm_node = container_of(list_pos, struct nes_cm_node, + reset_entry); + cleanup_retrans_entry(cm_node); + send_reset(cm_node, NULL); + rem_ref_cm_node(cm_node->cm_core, cm_node); + } + spin_lock_irqsave(&cm_core->listen_list_lock, flags); if (!atomic_dec_return(&listener->ref_count)) { list_del(&listener->list); diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h index 367b3d290140..282a9cbe508f 100644 --- a/drivers/infiniband/hw/nes/nes_cm.h +++ b/drivers/infiniband/hw/nes/nes_cm.h @@ -292,6 +292,8 @@ struct nes_cm_node { int apbvt_set; int accept_pend; int freed; + struct list_head timer_entry; + struct list_head reset_entry; struct nes_qp *nesqp; }; From 183ecfa3091cd4cdda329a7fe89d9544088f517d Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Fri, 21 Nov 2008 20:50:46 -0600 Subject: [PATCH 06/26] RDMA/nes: Avoid race between MPA request and reset event to rdma_cm In passive open, after indicating MPA request to rdma_cm, an incoming RST would fire a reset event to rdma_cm causing it to crash, since the current state is not connected. The solution is to wait for nes_accept() or nes_reject() before firing the reset event. If nes_accept() or nes_reject() is already done, then the reset event will be fired when RST is processed. Signed-off-by: Faisal Latif Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.c | 50 ++++++++++++++++++++++++++---- drivers/infiniband/hw/nes/nes_cm.h | 5 +++ 2 files changed, 49 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 257d994ec7b5..c259ddc8dd88 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -1360,6 +1360,7 @@ static void handle_rst_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, { int reset = 0; /* whether to send reset in case of err.. */ + int passive_state; atomic_inc(&cm_resets_recvd); nes_debug(NES_DBG_CM, "Received Reset, cm_node = %p, state = %u." " refcnt=%d\n", cm_node, cm_node->state, @@ -1373,7 +1374,14 @@ static void handle_rst_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, cm_node->listener, cm_node->state); active_open_err(cm_node, skb, reset); break; - /* For PASSIVE open states, remove the cm_node event */ + case NES_CM_STATE_MPAREQ_RCVD: + passive_state = atomic_add_return(1, &cm_node->passive_state); + if (passive_state == NES_SEND_RESET_EVENT) + create_event(cm_node, NES_CM_EVENT_RESET); + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_CLOSED; + dev_kfree_skb_any(skb); + break; case NES_CM_STATE_ESTABLISHED: case NES_CM_STATE_SYN_RCVD: case NES_CM_STATE_LISTENING: @@ -1381,7 +1389,14 @@ static void handle_rst_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, passive_open_err(cm_node, skb, reset); break; case NES_CM_STATE_TSA: + active_open_err(cm_node, skb, reset); + break; + case NES_CM_STATE_CLOSED: + cleanup_retrans_entry(cm_node); + drop_packet(skb); + break; default: + drop_packet(skb); break; } } @@ -1410,6 +1425,9 @@ static void handle_rcv_mpa(struct nes_cm_node *cm_node, struct sk_buff *skb, dev_kfree_skb_any(skb); if (type == NES_CM_EVENT_CONNECTED) cm_node->state = NES_CM_STATE_TSA; + else + atomic_set(&cm_node->passive_state, + NES_PASSIVE_STATE_INDICATED); create_event(cm_node, type); } @@ -1986,6 +2004,7 @@ static int mini_cm_reject(struct nes_cm_core *cm_core, struct ietf_mpa_frame *mpa_frame, struct nes_cm_node *cm_node) { int ret = 0; + int passive_state; nes_debug(NES_DBG_CM, "%s cm_node=%p type=%d state=%d\n", __func__, cm_node, cm_node->tcp_cntxt.client, cm_node->state); @@ -1993,9 +2012,13 @@ static int mini_cm_reject(struct nes_cm_core *cm_core, if (cm_node->tcp_cntxt.client) return ret; cleanup_retrans_entry(cm_node); - cm_node->state = NES_CM_STATE_CLOSED; - ret = send_reset(cm_node, NULL); + passive_state = atomic_add_return(1, &cm_node->passive_state); + cm_node->state = NES_CM_STATE_CLOSED; + if (passive_state == NES_SEND_RESET_EVENT) + rem_ref_cm_node(cm_core, cm_node); + else + ret = send_reset(cm_node, NULL); return ret; } @@ -2413,7 +2436,6 @@ static int nes_cm_disconn_true(struct nes_qp *nesqp) atomic_inc(&cm_disconnects); cm_event.event = IW_CM_EVENT_DISCONNECT; if (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET) { - issued_disconnect_reset = 1; cm_event.status = IW_CM_EVENT_STATUS_RESET; nes_debug(NES_DBG_CM, "Generating a CM " "Disconnect Event (status reset) for " @@ -2563,6 +2585,7 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) struct nes_v4_quad nes_quad; u32 crc_value; int ret; + int passive_state; ibqp = nes_get_qp(cm_id->device, conn_param->qpn); if (!ibqp) @@ -2730,8 +2753,6 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) conn_param->private_data_len + sizeof(struct ietf_mpa_frame)); - attr.qp_state = IB_QPS_RTS; - nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL); /* notify OF layer that accept event was successfull */ cm_id->add_ref(cm_id); @@ -2744,6 +2765,8 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) cm_event.private_data = NULL; cm_event.private_data_len = 0; ret = cm_id->event_handler(cm_id, &cm_event); + attr.qp_state = IB_QPS_RTS; + nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL); if (cm_node->loopbackpartner) { cm_node->loopbackpartner->mpa_frame_size = nesqp->private_data_len; @@ -2756,6 +2779,9 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) printk(KERN_ERR "%s[%u] OFA CM event_handler returned, " "ret=%d\n", __func__, __LINE__, ret); + passive_state = atomic_add_return(1, &cm_node->passive_state); + if (passive_state == NES_SEND_RESET_EVENT) + create_event(cm_node, NES_CM_EVENT_RESET); return 0; } @@ -3238,6 +3264,18 @@ static void cm_event_reset(struct nes_cm_event *event) cm_event.private_data_len = 0; ret = cm_id->event_handler(cm_id, &cm_event); + cm_id->add_ref(cm_id); + atomic_inc(&cm_closes); + cm_event.event = IW_CM_EVENT_CLOSE; + cm_event.status = IW_CM_EVENT_STATUS_OK; + cm_event.provider_data = cm_id->provider_data; + cm_event.local_addr = cm_id->local_addr; + cm_event.remote_addr = cm_id->remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + nes_debug(NES_DBG_CM, "NODE %p Generating CLOSE\n", event->cm_node); + ret = cm_id->event_handler(cm_id, &cm_event); + nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h index 282a9cbe508f..25e2493abc6a 100644 --- a/drivers/infiniband/hw/nes/nes_cm.h +++ b/drivers/infiniband/hw/nes/nes_cm.h @@ -76,6 +76,10 @@ enum nes_timer_type { NES_TIMER_TYPE_CLOSE, }; +#define NES_PASSIVE_STATE_INDICATED 0 +#define NES_DO_NOT_SEND_RESET_EVENT 1 +#define NES_SEND_RESET_EVENT 2 + #define MAX_NES_IFS 4 #define SET_ACK 1 @@ -295,6 +299,7 @@ struct nes_cm_node { struct list_head timer_entry; struct list_head reset_entry; struct nes_qp *nesqp; + atomic_t passive_state; }; /* structure for client or CM to fill when making CM api calls. */ From 4a14f6a79f5110c6033f0c61d77d07c449c2d083 Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Fri, 21 Nov 2008 20:50:49 -0600 Subject: [PATCH 07/26] RDMA/nes: Forward packets for a new connection with stale APBVT entry Under heavy traffic, there is a small windows when an APBVT entry is not yet removed and a new connection is established. Packets for the new connection are dropped until APBVT entry is removed. This patch will forward the packets instead of dropping them. Signed-off-by: Faisal Latif Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.c | 44 ++++++++++++++---------------- drivers/infiniband/hw/nes/nes_cm.h | 2 +- drivers/infiniband/hw/nes/nes_hw.c | 42 ++++++++++++++++------------ 3 files changed, 45 insertions(+), 43 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index c259ddc8dd88..f34fa6f77a81 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -86,7 +86,7 @@ static int mini_cm_accept(struct nes_cm_core *, struct ietf_mpa_frame *, struct nes_cm_node *); static int mini_cm_reject(struct nes_cm_core *, struct ietf_mpa_frame *, struct nes_cm_node *); -static void mini_cm_recv_pkt(struct nes_cm_core *, struct nes_vnic *, +static int mini_cm_recv_pkt(struct nes_cm_core *, struct nes_vnic *, struct sk_buff *); static int mini_cm_dealloc_core(struct nes_cm_core *); static int mini_cm_get(struct nes_cm_core *); @@ -2076,7 +2076,7 @@ static int mini_cm_close(struct nes_cm_core *cm_core, struct nes_cm_node *cm_nod * recv_pkt - recv an ETHERNET packet, and process it through CM * node state machine */ -static void mini_cm_recv_pkt(struct nes_cm_core *cm_core, +static int mini_cm_recv_pkt(struct nes_cm_core *cm_core, struct nes_vnic *nesvnic, struct sk_buff *skb) { struct nes_cm_node *cm_node = NULL; @@ -2084,23 +2084,16 @@ static void mini_cm_recv_pkt(struct nes_cm_core *cm_core, struct iphdr *iph; struct tcphdr *tcph; struct nes_cm_info nfo; + int skb_handled = 1; if (!skb) - return; + return 0; if (skb->len < sizeof(struct iphdr) + sizeof(struct tcphdr)) { - dev_kfree_skb_any(skb); - return; + return 0; } iph = (struct iphdr *)skb->data; tcph = (struct tcphdr *)(skb->data + sizeof(struct iphdr)); - skb_reset_network_header(skb); - skb_set_transport_header(skb, sizeof(*tcph)); - if (!tcph) { - dev_kfree_skb_any(skb); - return; - } - skb->len = ntohs(iph->tot_len); nfo.loc_addr = ntohl(iph->daddr); nfo.loc_port = ntohs(tcph->dest); @@ -2121,23 +2114,21 @@ static void mini_cm_recv_pkt(struct nes_cm_core *cm_core, /* Only type of packet accepted are for */ /* the PASSIVE open (syn only) */ if ((!tcph->syn) || (tcph->ack)) { - cm_packets_dropped++; + skb_handled = 0; break; } listener = find_listener(cm_core, nfo.loc_addr, nfo.loc_port, NES_CM_LISTENER_ACTIVE_STATE); - if (listener) { - nfo.cm_id = listener->cm_id; - nfo.conn_type = listener->conn_type; - } else { - nes_debug(NES_DBG_CM, "Unable to find listener " - "for the pkt\n"); - cm_packets_dropped++; - dev_kfree_skb_any(skb); + if (!listener) { + nfo.cm_id = NULL; + nfo.conn_type = 0; + nes_debug(NES_DBG_CM, "Unable to find listener for the pkt\n"); + skb_handled = 0; break; } - + nfo.cm_id = listener->cm_id; + nfo.conn_type = listener->conn_type; cm_node = make_cm_node(cm_core, nesvnic, &nfo, listener); if (!cm_node) { @@ -2163,9 +2154,13 @@ static void mini_cm_recv_pkt(struct nes_cm_core *cm_core, dev_kfree_skb_any(skb); break; } + skb_reset_network_header(skb); + skb_set_transport_header(skb, sizeof(*tcph)); + skb->len = ntohs(iph->tot_len); process_packet(cm_node, skb, cm_core); rem_ref_cm_node(cm_core, cm_node); } while (0); + return skb_handled; } @@ -2985,15 +2980,16 @@ int nes_destroy_listen(struct iw_cm_id *cm_id) */ int nes_cm_recv(struct sk_buff *skb, struct net_device *netdevice) { + int rc = 0; cm_packets_received++; if ((g_cm_core) && (g_cm_core->api)) { - g_cm_core->api->recv_pkt(g_cm_core, netdev_priv(netdevice), skb); + rc = g_cm_core->api->recv_pkt(g_cm_core, netdev_priv(netdevice), skb); } else { nes_debug(NES_DBG_CM, "Unable to process packet for CM," " cm is not setup properly.\n"); } - return 0; + return rc; } diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h index 25e2493abc6a..3a20a7883976 100644 --- a/drivers/infiniband/hw/nes/nes_cm.h +++ b/drivers/infiniband/hw/nes/nes_cm.h @@ -397,7 +397,7 @@ struct nes_cm_ops { struct nes_cm_node *); int (*reject)(struct nes_cm_core *, struct ietf_mpa_frame *, struct nes_cm_node *); - void (*recv_pkt)(struct nes_cm_core *, struct nes_vnic *, + int (*recv_pkt)(struct nes_cm_core *, struct nes_vnic *, struct sk_buff *); int (*destroy_cm_core)(struct nes_cm_core *); int (*get)(struct nes_cm_core *); diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index 7c49cc882d75..8f70ff2dcc58 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -2700,27 +2700,33 @@ void nes_nic_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq) pkt_type, (pkt_type & NES_PKT_TYPE_APBVT_MASK)); */ if ((pkt_type & NES_PKT_TYPE_APBVT_MASK) == NES_PKT_TYPE_APBVT_BITS) { - nes_cm_recv(rx_skb, nesvnic->netdev); + if (nes_cm_recv(rx_skb, nesvnic->netdev)) + rx_skb = NULL; + } + if (rx_skb == NULL) + goto skip_rx_indicate0; + + + if ((cqe_misc & NES_NIC_CQE_TAG_VALID) && + (nesvnic->vlan_grp != NULL)) { + vlan_tag = (u16)(le32_to_cpu( + cq->cq_vbase[head].cqe_words[NES_NIC_CQE_TAG_PKT_TYPE_IDX]) + >> 16); + nes_debug(NES_DBG_CQ, "%s: Reporting stripped VLAN packet. Tag = 0x%04X\n", + nesvnic->netdev->name, vlan_tag); + if (nes_use_lro) + lro_vlan_hwaccel_receive_skb(&nesvnic->lro_mgr, rx_skb, + nesvnic->vlan_grp, vlan_tag, NULL); + else + nes_vlan_rx(rx_skb, nesvnic->vlan_grp, vlan_tag); } else { - if ((cqe_misc & NES_NIC_CQE_TAG_VALID) && (nesvnic->vlan_grp != NULL)) { - vlan_tag = (u16)(le32_to_cpu( - cq->cq_vbase[head].cqe_words[NES_NIC_CQE_TAG_PKT_TYPE_IDX]) - >> 16); - nes_debug(NES_DBG_CQ, "%s: Reporting stripped VLAN packet. Tag = 0x%04X\n", - nesvnic->netdev->name, vlan_tag); - if (nes_use_lro) - lro_vlan_hwaccel_receive_skb(&nesvnic->lro_mgr, rx_skb, - nesvnic->vlan_grp, vlan_tag, NULL); - else - nes_vlan_rx(rx_skb, nesvnic->vlan_grp, vlan_tag); - } else { - if (nes_use_lro) - lro_receive_skb(&nesvnic->lro_mgr, rx_skb, NULL); - else - nes_netif_rx(rx_skb); - } + if (nes_use_lro) + lro_receive_skb(&nesvnic->lro_mgr, rx_skb, NULL); + else + nes_netif_rx(rx_skb); } +skip_rx_indicate0: nesvnic->netdev->last_rx = jiffies; /* nesvnic->netstats.rx_packets++; */ /* nesvnic->netstats.rx_bytes += rx_pkt_size; */ From abb77256761bc3ee7a21cc28f6f12a938964e83f Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Fri, 21 Nov 2008 20:50:52 -0600 Subject: [PATCH 08/26] RDMA/nes: Fix TCP compliance test failures ANVL testing showed we are not handling all cm_node states during connection establishment. Add missing state handlers and fix sequence number send reset in handle_tcp_options(). Signed-off-by: Faisal Latif Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.c | 44 +++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index f34fa6f77a81..0997c7b8cd9b 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -1508,7 +1508,7 @@ static void handle_syn_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, int optionsize; optionsize = (tcph->doff << 2) - sizeof(struct tcphdr); - skb_pull(skb, tcph->doff << 2); + skb_trim(skb, 0); inc_sequence = ntohl(tcph->seq); switch (cm_node->state) { @@ -1541,6 +1541,10 @@ static void handle_syn_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, cm_node->state = NES_CM_STATE_SYN_RCVD; send_syn(cm_node, 1, skb); break; + case NES_CM_STATE_CLOSED: + cleanup_retrans_entry(cm_node); + send_reset(cm_node, skb); + break; case NES_CM_STATE_TSA: case NES_CM_STATE_ESTABLISHED: case NES_CM_STATE_FIN_WAIT1: @@ -1549,7 +1553,6 @@ static void handle_syn_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, case NES_CM_STATE_LAST_ACK: case NES_CM_STATE_CLOSING: case NES_CM_STATE_UNKNOWN: - case NES_CM_STATE_CLOSED: default: drop_packet(skb); break; @@ -1565,7 +1568,7 @@ static void handle_synack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, int optionsize; optionsize = (tcph->doff << 2) - sizeof(struct tcphdr); - skb_pull(skb, tcph->doff << 2); + skb_trim(skb, 0); inc_sequence = ntohl(tcph->seq); switch (cm_node->state) { case NES_CM_STATE_SYN_SENT: @@ -1589,6 +1592,12 @@ static void handle_synack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, /* passive open, so should not be here */ passive_open_err(cm_node, skb, 1); break; + case NES_CM_STATE_LISTENING: + case NES_CM_STATE_CLOSED: + cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq); + cleanup_retrans_entry(cm_node); + send_reset(cm_node, skb); + break; case NES_CM_STATE_ESTABLISHED: case NES_CM_STATE_FIN_WAIT1: case NES_CM_STATE_FIN_WAIT2: @@ -1596,7 +1605,6 @@ static void handle_synack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, case NES_CM_STATE_TSA: case NES_CM_STATE_CLOSING: case NES_CM_STATE_UNKNOWN: - case NES_CM_STATE_CLOSED: case NES_CM_STATE_MPAREQ_SENT: default: drop_packet(skb); @@ -1611,6 +1619,13 @@ static void handle_ack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, u32 inc_sequence; u32 rem_seq_ack; u32 rem_seq; + int ret; + int optionsize; + u32 temp_seq = cm_node->tcp_cntxt.loc_seq_num; + + optionsize = (tcph->doff << 2) - sizeof(struct tcphdr); + cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq); + if (check_seq(cm_node, tcph, skb)) return; @@ -1623,7 +1638,18 @@ static void handle_ack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, switch (cm_node->state) { case NES_CM_STATE_SYN_RCVD: /* Passive OPEN */ + ret = handle_tcp_options(cm_node, tcph, skb, optionsize, 1); + if (ret) + break; cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq); + cm_node->tcp_cntxt.loc_seq_num = temp_seq; + if (cm_node->tcp_cntxt.rem_ack_num != + cm_node->tcp_cntxt.loc_seq_num) { + nes_debug(NES_DBG_CM, "rem_ack_num != loc_seq_num\n"); + cleanup_retrans_entry(cm_node); + send_reset(cm_node, skb); + return; + } cm_node->state = NES_CM_STATE_ESTABLISHED; if (datasize) { cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize; @@ -1655,11 +1681,15 @@ static void handle_ack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, dev_kfree_skb_any(skb); } break; + case NES_CM_STATE_LISTENING: + case NES_CM_STATE_CLOSED: + cleanup_retrans_entry(cm_node); + send_reset(cm_node, skb); + break; case NES_CM_STATE_FIN_WAIT1: case NES_CM_STATE_SYN_SENT: case NES_CM_STATE_FIN_WAIT2: case NES_CM_STATE_TSA: - case NES_CM_STATE_CLOSED: case NES_CM_STATE_MPAREQ_RCVD: case NES_CM_STATE_LAST_ACK: case NES_CM_STATE_CLOSING: @@ -1682,9 +1712,9 @@ static int handle_tcp_options(struct nes_cm_node *cm_node, struct tcphdr *tcph, nes_debug(NES_DBG_CM, "%s: Node %p, Sending RESET\n", __func__, cm_node); if (passive) - passive_open_err(cm_node, skb, 0); + passive_open_err(cm_node, skb, 1); else - active_open_err(cm_node, skb, 0); + active_open_err(cm_node, skb, 1); return 1; } } From f3181a10e13ac55e18958d7c48cba6f925c71483 Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Fri, 21 Nov 2008 20:50:55 -0600 Subject: [PATCH 09/26] RDMA/nes: Check cqp_avail_reqs is empty after locking the list Between the first empty list check and locking the list, the list can change. Check it again after it is locked to make sure the list is still not empty. Signed-off-by: Faisal Latif Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_utils.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c index fb8cbd71a2ef..5611a73d5831 100644 --- a/drivers/infiniband/hw/nes/nes_utils.c +++ b/drivers/infiniband/hw/nes/nes_utils.c @@ -540,11 +540,14 @@ struct nes_cqp_request *nes_get_cqp_request(struct nes_device *nesdev) if (!list_empty(&nesdev->cqp_avail_reqs)) { spin_lock_irqsave(&nesdev->cqp.lock, flags); - cqp_request = list_entry(nesdev->cqp_avail_reqs.next, + if (!list_empty(&nesdev->cqp_avail_reqs)) { + cqp_request = list_entry(nesdev->cqp_avail_reqs.next, struct nes_cqp_request, list); - list_del_init(&cqp_request->list); + list_del_init(&cqp_request->list); + } spin_unlock_irqrestore(&nesdev->cqp.lock, flags); - } else { + } + if (cqp_request == NULL) { cqp_request = kzalloc(sizeof(struct nes_cqp_request), GFP_KERNEL); if (cqp_request) { cqp_request->dynamic = 1; From 1ee86555b2ad4d16a3c18253b7e4d70d34eb94f3 Mon Sep 17 00:00:00 2001 From: Chien Tung Date: Fri, 21 Nov 2008 20:51:04 -0600 Subject: [PATCH 10/26] RDMA/nes: Add loopback check to make_cm_node() Check for loopback connection in make_cm_node(). Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 0997c7b8cd9b..f241ca104d16 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -1142,7 +1142,10 @@ static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core, cm_node->loopbackpartner = NULL; /* get the mac addr for the remote node */ - arpindex = nes_arp_table(nesdev, cm_node->rem_addr, NULL, NES_ARP_RESOLVE); + if (ipv4_is_loopback(htonl(cm_node->rem_addr))) + arpindex = nes_arp_table(nesdev, ntohl(nesvnic->local_ipaddr), NULL, NES_ARP_RESOLVE); + else + arpindex = nes_arp_table(nesdev, cm_node->rem_addr, NULL, NES_ARP_RESOLVE); if (arpindex < 0) { arpindex = nes_addr_resolve_neigh(nesvnic, cm_info->rem_addr); if (arpindex < 0) { From 6098d107499e1335f899bfcb558253fb7ee4f73f Mon Sep 17 00:00:00 2001 From: Chien Tung Date: Fri, 21 Nov 2008 20:51:01 -0600 Subject: [PATCH 11/26] RDMA/nes: Cleanup warnings Wrap NES_DEBUG and assert macros with do while (0) to avoid ambiguous else. No one is using sk_buff * returned from form_cm_frame(), so drop the return. drop_packet() should not be incrementing reset counter on receiving a FIN. Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes.h | 16 ++++++++++------ drivers/infiniband/hw/nes/nes_cm.c | 8 ++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h index 1595dc7bba9d..13a5bb1a7bcf 100644 --- a/drivers/infiniband/hw/nes/nes.h +++ b/drivers/infiniband/hw/nes/nes.h @@ -137,14 +137,18 @@ #ifdef CONFIG_INFINIBAND_NES_DEBUG #define nes_debug(level, fmt, args...) \ +do { \ if (level & nes_debug_level) \ - printk(KERN_ERR PFX "%s[%u]: " fmt, __func__, __LINE__, ##args) + printk(KERN_ERR PFX "%s[%u]: " fmt, __func__, __LINE__, ##args); \ +} while (0) -#define assert(expr) \ -if (!(expr)) { \ - printk(KERN_ERR PFX "Assertion failed! %s, %s, %s, line %d\n", \ - #expr, __FILE__, __func__, __LINE__); \ -} +#define assert(expr) \ +do { \ + if (!(expr)) { \ + printk(KERN_ERR PFX "Assertion failed! %s, %s, %s, line %d\n", \ + #expr, __FILE__, __func__, __LINE__); \ + } \ +} while (0) #define NES_EVENT_TIMEOUT 1200000 #else diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index f241ca104d16..aa373c5f8d84 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -92,7 +92,7 @@ static int mini_cm_dealloc_core(struct nes_cm_core *); static int mini_cm_get(struct nes_cm_core *); static int mini_cm_set(struct nes_cm_core *, u32, u32); -static struct sk_buff *form_cm_frame(struct sk_buff *, struct nes_cm_node *, +static void form_cm_frame(struct sk_buff *, struct nes_cm_node *, void *, u32, void *, u32, u8); static struct sk_buff *get_free_pkt(struct nes_cm_node *cm_node); static int add_ref_cm_node(struct nes_cm_node *); @@ -251,7 +251,7 @@ static int parse_mpa(struct nes_cm_node *cm_node, u8 *buffer, u32 len) * form_cm_frame - get a free packet and build empty frame Use * node info to build. */ -static struct sk_buff *form_cm_frame(struct sk_buff *skb, +static void form_cm_frame(struct sk_buff *skb, struct nes_cm_node *cm_node, void *options, u32 optionsize, void *data, u32 datasize, u8 flags) { @@ -339,7 +339,6 @@ static struct sk_buff *form_cm_frame(struct sk_buff *skb, skb_shinfo(skb)->nr_frags = 0; cm_packets_created++; - return skb; } @@ -381,8 +380,6 @@ int schedule_nes_timer(struct nes_cm_node *cm_node, struct sk_buff *skb, int ret = 0; u32 was_timer_set; - if (!cm_node) - return -EINVAL; new_send = kzalloc(sizeof(*new_send), GFP_ATOMIC); if (!new_send) return -1; @@ -1325,7 +1322,6 @@ static void drop_packet(struct sk_buff *skb) static void handle_fin_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, struct tcphdr *tcph) { - atomic_inc(&cm_resets_recvd); nes_debug(NES_DBG_CM, "Received FIN, cm_node = %p, state = %u. " "refcnt=%d\n", cm_node, cm_node->state, atomic_read(&cm_node->ref_count)); From fab01fc56063dafcc083f481ac0f9e6b5a576dd6 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Fri, 5 Dec 2008 11:13:18 -0800 Subject: [PATCH 12/26] IB/ipath: Fix PSN of send WQEs after an RDMA read resend The PSN of the first packet after an RDMA read is based on the size of the RDMA read request. This is calculated correctly for the WQE sent after the first request message but not on subsequent requests if the RDMA read is resent. Signed-off-by: Ralph Campbell Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_rc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/ipath/ipath_rc.c b/drivers/infiniband/hw/ipath/ipath_rc.c index 7b93cda1a4bd..9170710b950d 100644 --- a/drivers/infiniband/hw/ipath/ipath_rc.c +++ b/drivers/infiniband/hw/ipath/ipath_rc.c @@ -573,9 +573,8 @@ int ipath_make_rc_req(struct ipath_qp *qp) ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len); qp->s_state = OP(RDMA_READ_REQUEST); hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); - bth2 = qp->s_psn++ & IPATH_PSN_MASK; - if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0) - qp->s_next_psn = qp->s_psn; + bth2 = qp->s_psn & IPATH_PSN_MASK; + qp->s_psn = wqe->lpsn + 1; ss = NULL; len = 0; qp->s_cur++; From 890fccb2427d53b48ab9d009fd87d55bcb173f62 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Fri, 5 Dec 2008 11:13:18 -0800 Subject: [PATCH 13/26] IB/ipath: Check return value of dma_map_single() This fixes an obvious oversight where the return value is not checked for error. Signed-off-by: Ralph Campbell Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_sdma.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/ipath/ipath_sdma.c b/drivers/infiniband/hw/ipath/ipath_sdma.c index 284c9bca517e..8e255adf5d9b 100644 --- a/drivers/infiniband/hw/ipath/ipath_sdma.c +++ b/drivers/infiniband/hw/ipath/ipath_sdma.c @@ -698,10 +698,8 @@ retry: addr = dma_map_single(&dd->pcidev->dev, tx->txreq.map_addr, tx->map_len, DMA_TO_DEVICE); - if (dma_mapping_error(&dd->pcidev->dev, addr)) { - ret = -EIO; - goto unlock; - } + if (dma_mapping_error(&dd->pcidev->dev, addr)) + goto ioerr; dwoffset = tx->map_len >> 2; make_sdma_desc(dd, sdmadesc, (u64) addr, dwoffset, 0); @@ -741,6 +739,8 @@ retry: dw = (len + 3) >> 2; addr = dma_map_single(&dd->pcidev->dev, sge->vaddr, dw << 2, DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, addr)) + goto unmap; make_sdma_desc(dd, sdmadesc, (u64) addr, dw, dwoffset); /* SDmaUseLargeBuf has to be set in every descriptor */ if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_USELARGEBUF) @@ -798,7 +798,18 @@ retry: list_add_tail(&tx->txreq.list, &dd->ipath_sdma_activelist); if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_VL15) vl15_watchdog_enq(dd); + goto unlock; +unmap: + while (tail != dd->ipath_sdma_descq_tail) { + if (!tail) + tail = dd->ipath_sdma_descq_cnt - 1; + else + tail--; + unmap_desc(dd, tail); + } +ioerr: + ret = -EIO; unlock: spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); fail: From 60e845035a066e81af1a29047530088d59150d8b Mon Sep 17 00:00:00 2001 From: Dave Olson Date: Fri, 5 Dec 2008 11:13:19 -0800 Subject: [PATCH 14/26] IB/ipath: Don't count IB symbol and link errors unless link is UP Implement the ignoring of ibsymbol errors and linkrecover errors while the link is at less than INIT (long needed), to get accurate counts. Particularly an issue when doing non-IBTA DDR negotiation with chips from vendors that do not support IBTA mode negotiation. If the driver is unloaded, and there is a delta, the adjusted counters are written back to the chip, so they stay adjusted across driver reload. Signed-off-by: Dave Olson Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_iba6120.c | 61 +++++++++++++++++ drivers/infiniband/hw/ipath/ipath_iba7220.c | 76 ++++++++++++++++++++- drivers/infiniband/hw/ipath/ipath_kernel.h | 13 ++++ drivers/infiniband/hw/ipath/ipath_stats.c | 8 +++ 4 files changed, 155 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/ipath/ipath_iba6120.c b/drivers/infiniband/hw/ipath/ipath_iba6120.c index 421cc2af891f..fbf8c5379ea8 100644 --- a/drivers/infiniband/hw/ipath/ipath_iba6120.c +++ b/drivers/infiniband/hw/ipath/ipath_iba6120.c @@ -721,6 +721,12 @@ static int ipath_pe_bringup_serdes(struct ipath_devdata *dd) INFINIPATH_HWE_SERDESPLLFAILED); } + dd->ibdeltainprog = 1; + dd->ibsymsnap = + ipath_read_creg32(dd, dd->ipath_cregs->cr_ibsymbolerrcnt); + dd->iblnkerrsnap = + ipath_read_creg32(dd, dd->ipath_cregs->cr_iblinkerrrecovcnt); + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0); config1 = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig1); @@ -810,6 +816,36 @@ static void ipath_pe_quiet_serdes(struct ipath_devdata *dd) { u64 val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0); + if (dd->ibsymdelta || dd->iblnkerrdelta || + dd->ibdeltainprog) { + u64 diagc; + /* enable counter writes */ + diagc = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwdiagctrl); + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwdiagctrl, + diagc | INFINIPATH_DC_COUNTERWREN); + + if (dd->ibsymdelta || dd->ibdeltainprog) { + val = ipath_read_creg32(dd, + dd->ipath_cregs->cr_ibsymbolerrcnt); + if (dd->ibdeltainprog) + val -= val - dd->ibsymsnap; + val -= dd->ibsymdelta; + ipath_write_creg(dd, + dd->ipath_cregs->cr_ibsymbolerrcnt, val); + } + if (dd->iblnkerrdelta || dd->ibdeltainprog) { + val = ipath_read_creg32(dd, + dd->ipath_cregs->cr_iblinkerrrecovcnt); + if (dd->ibdeltainprog) + val -= val - dd->iblnkerrsnap; + val -= dd->iblnkerrdelta; + ipath_write_creg(dd, + dd->ipath_cregs->cr_iblinkerrrecovcnt, val); + } + + /* and disable counter writes */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwdiagctrl, diagc); + } val |= INFINIPATH_SERDC0_TXIDLE; ipath_dbg("Setting TxIdleEn on serdes (config0 = %llx)\n", (unsigned long long) val); @@ -1749,6 +1785,31 @@ static void ipath_pe_config_jint(struct ipath_devdata *dd, u16 a, u16 b) static int ipath_pe_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs) { + if (ibup) { + if (dd->ibdeltainprog) { + dd->ibdeltainprog = 0; + dd->ibsymdelta += + ipath_read_creg32(dd, + dd->ipath_cregs->cr_ibsymbolerrcnt) - + dd->ibsymsnap; + dd->iblnkerrdelta += + ipath_read_creg32(dd, + dd->ipath_cregs->cr_iblinkerrrecovcnt) - + dd->iblnkerrsnap; + } + } else { + dd->ipath_lli_counter = 0; + if (!dd->ibdeltainprog) { + dd->ibdeltainprog = 1; + dd->ibsymsnap = + ipath_read_creg32(dd, + dd->ipath_cregs->cr_ibsymbolerrcnt); + dd->iblnkerrsnap = + ipath_read_creg32(dd, + dd->ipath_cregs->cr_iblinkerrrecovcnt); + } + } + ipath_setup_pe_setextled(dd, ipath_ib_linkstate(dd, ibcs), ipath_ib_linktrstate(dd, ibcs)); return 0; diff --git a/drivers/infiniband/hw/ipath/ipath_iba7220.c b/drivers/infiniband/hw/ipath/ipath_iba7220.c index 9839e20119bc..3b38bc9a331d 100644 --- a/drivers/infiniband/hw/ipath/ipath_iba7220.c +++ b/drivers/infiniband/hw/ipath/ipath_iba7220.c @@ -951,6 +951,12 @@ static int ipath_7220_bringup_serdes(struct ipath_devdata *dd) INFINIPATH_HWE_SERDESPLLFAILED); } + dd->ibdeltainprog = 1; + dd->ibsymsnap = + ipath_read_creg32(dd, dd->ipath_cregs->cr_ibsymbolerrcnt); + dd->iblnkerrsnap = + ipath_read_creg32(dd, dd->ipath_cregs->cr_iblinkerrrecovcnt); + if (!dd->ipath_ibcddrctrl) { /* not on re-init after reset */ dd->ipath_ibcddrctrl = @@ -1084,6 +1090,37 @@ static void ipath_7220_config_jint(struct ipath_devdata *dd, static void ipath_7220_quiet_serdes(struct ipath_devdata *dd) { u64 val; + if (dd->ibsymdelta || dd->iblnkerrdelta || + dd->ibdeltainprog) { + u64 diagc; + /* enable counter writes */ + diagc = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwdiagctrl); + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwdiagctrl, + diagc | INFINIPATH_DC_COUNTERWREN); + + if (dd->ibsymdelta || dd->ibdeltainprog) { + val = ipath_read_creg32(dd, + dd->ipath_cregs->cr_ibsymbolerrcnt); + if (dd->ibdeltainprog) + val -= val - dd->ibsymsnap; + val -= dd->ibsymdelta; + ipath_write_creg(dd, + dd->ipath_cregs->cr_ibsymbolerrcnt, val); + } + if (dd->iblnkerrdelta || dd->ibdeltainprog) { + val = ipath_read_creg32(dd, + dd->ipath_cregs->cr_iblinkerrrecovcnt); + if (dd->ibdeltainprog) + val -= val - dd->iblnkerrsnap; + val -= dd->iblnkerrdelta; + ipath_write_creg(dd, + dd->ipath_cregs->cr_iblinkerrrecovcnt, val); + } + + /* and disable counter writes */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwdiagctrl, diagc); + } + dd->ipath_flags &= ~IPATH_IB_AUTONEG_INPROG; wake_up(&dd->ipath_autoneg_wait); cancel_delayed_work(&dd->ipath_autoneg_work); @@ -2325,7 +2362,7 @@ static void try_auto_neg(struct ipath_devdata *dd) static int ipath_7220_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs) { - int ret = 0; + int ret = 0, symadj = 0; u32 ltstate = ipath_ib_linkstate(dd, ibcs); dd->ipath_link_width_active = @@ -2368,6 +2405,13 @@ static int ipath_7220_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs) ipath_dbg("DDR negotiation try, %u/%u\n", dd->ipath_autoneg_tries, IPATH_AUTONEG_TRIES); + if (!dd->ibdeltainprog) { + dd->ibdeltainprog = 1; + dd->ibsymsnap = ipath_read_creg32(dd, + dd->ipath_cregs->cr_ibsymbolerrcnt); + dd->iblnkerrsnap = ipath_read_creg32(dd, + dd->ipath_cregs->cr_iblinkerrrecovcnt); + } try_auto_neg(dd); ret = 1; /* no other IB status change processing */ } else if ((dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) @@ -2388,6 +2432,7 @@ static int ipath_7220_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs) set_speed_fast(dd, dd->ipath_link_speed_enabled); wake_up(&dd->ipath_autoneg_wait); + symadj = 1; } else if (dd->ipath_flags & IPATH_IB_AUTONEG_FAILED) { /* * clear autoneg failure flag, and do setup @@ -2403,6 +2448,7 @@ static int ipath_7220_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs) IBA7220_IBC_IBTA_1_2_MASK; ipath_write_kreg(dd, IPATH_KREG_OFFSET(IBNCModeCtrl), 0); + symadj = 1; } } /* @@ -2416,9 +2462,13 @@ static int ipath_7220_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs) IB_WIDTH_4X)) == (IB_WIDTH_1X | IB_WIDTH_4X) && dd->ipath_link_width_active == IB_WIDTH_1X && dd->ipath_x1_fix_tries < 3) { - if (++dd->ipath_x1_fix_tries == 3) + if (++dd->ipath_x1_fix_tries == 3) { dev_info(&dd->pcidev->dev, "IB link is in 1X mode\n"); + if (!(dd->ipath_flags & + IPATH_IB_AUTONEG_INPROG)) + symadj = 1; + } else { ipath_cdbg(VERBOSE, "IB 1X in " "auto-width, try %u to be " @@ -2429,7 +2479,8 @@ static int ipath_7220_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs) dd->ipath_f_xgxs_reset(dd); ret = 1; /* skip other processing */ } - } + } else if (!(dd->ipath_flags & IPATH_IB_AUTONEG_INPROG)) + symadj = 1; if (!ret) { dd->delay_mult = rate_to_delay @@ -2440,6 +2491,25 @@ static int ipath_7220_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs) } } + if (symadj) { + if (dd->ibdeltainprog) { + dd->ibdeltainprog = 0; + dd->ibsymdelta += ipath_read_creg32(dd, + dd->ipath_cregs->cr_ibsymbolerrcnt) - + dd->ibsymsnap; + dd->iblnkerrdelta += ipath_read_creg32(dd, + dd->ipath_cregs->cr_iblinkerrrecovcnt) - + dd->iblnkerrsnap; + } + } else if (!ibup && !dd->ibdeltainprog + && !(dd->ipath_flags & IPATH_IB_AUTONEG_INPROG)) { + dd->ibdeltainprog = 1; + dd->ibsymsnap = ipath_read_creg32(dd, + dd->ipath_cregs->cr_ibsymbolerrcnt); + dd->iblnkerrsnap = ipath_read_creg32(dd, + dd->ipath_cregs->cr_iblinkerrrecovcnt); + } + if (!ret) ipath_setup_7220_setextled(dd, ipath_ib_linkstate(dd, ibcs), ltstate); diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/infiniband/hw/ipath/ipath_kernel.h index 0bd8bcb184a1..aa84153b731b 100644 --- a/drivers/infiniband/hw/ipath/ipath_kernel.h +++ b/drivers/infiniband/hw/ipath/ipath_kernel.h @@ -355,6 +355,19 @@ struct ipath_devdata { /* errors masked because they occur too fast */ ipath_err_t ipath_maskederrs; u64 ipath_lastlinkrecov; /* link recoveries at last ACTIVE */ + /* these 5 fields are used to establish deltas for IB Symbol + * errors and linkrecovery errors. They can be reported on + * some chips during link negotiation prior to INIT, and with + * DDR when faking DDR negotiations with non-IBTA switches. + * The chip counters are adjusted at driver unload if there is + * a non-zero delta. + */ + u64 ibdeltainprog; + u64 ibsymdelta; + u64 ibsymsnap; + u64 iblnkerrdelta; + u64 iblnkerrsnap; + /* time in jiffies at which to re-enable maskederrs */ unsigned long ipath_unmasktime; /* count of egrfull errors, combined for all ports */ diff --git a/drivers/infiniband/hw/ipath/ipath_stats.c b/drivers/infiniband/hw/ipath/ipath_stats.c index c8e3d65f0de8..f63e143e3292 100644 --- a/drivers/infiniband/hw/ipath/ipath_stats.c +++ b/drivers/infiniband/hw/ipath/ipath_stats.c @@ -112,6 +112,14 @@ u64 ipath_snap_cntr(struct ipath_devdata *dd, ipath_creg creg) dd->ipath_lastrpkts = val; } val64 = dd->ipath_rpkts; + } else if (creg == dd->ipath_cregs->cr_ibsymbolerrcnt) { + if (dd->ibdeltainprog) + val64 -= val64 - dd->ibsymsnap; + val64 -= dd->ibsymdelta; + } else if (creg == dd->ipath_cregs->cr_iblinkerrrecovcnt) { + if (dd->ibdeltainprog) + val64 -= val64 - dd->iblnkerrsnap; + val64 -= dd->iblnkerrdelta; } else val64 = (u64) val; From 6114d4cd313acbb6e9935c2bee77e368d10c4f04 Mon Sep 17 00:00:00 2001 From: Dave Olson Date: Fri, 5 Dec 2008 11:13:19 -0800 Subject: [PATCH 15/26] IB/ipath: Only do 1X workaround on rev1 chips Signed-off-by: Dave Olson Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_iba7220.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/ipath/ipath_iba7220.c b/drivers/infiniband/hw/ipath/ipath_iba7220.c index 3b38bc9a331d..b2a9d4c155d1 100644 --- a/drivers/infiniband/hw/ipath/ipath_iba7220.c +++ b/drivers/infiniband/hw/ipath/ipath_iba7220.c @@ -2452,13 +2452,14 @@ static int ipath_7220_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs) } } /* - * if we are in 1X, and are in autoneg width, it - * could be due to an xgxs problem, so if we haven't + * if we are in 1X on rev1 only, and are in autoneg width, + * it could be due to an xgxs problem, so if we haven't * already tried, try twice to get to 4X; if we * tried, and couldn't, report it, since it will * probably not be what is desired. */ - if ((dd->ipath_link_width_enabled & (IB_WIDTH_1X | + if (dd->ipath_minrev == 1 && + (dd->ipath_link_width_enabled & (IB_WIDTH_1X | IB_WIDTH_4X)) == (IB_WIDTH_1X | IB_WIDTH_4X) && dd->ipath_link_width_active == IB_WIDTH_1X && dd->ipath_x1_fix_tries < 3) { From 1bf7724e093cf3071d943d53bfa4de8b8e50426b Mon Sep 17 00:00:00 2001 From: Dave Olson Date: Fri, 5 Dec 2008 11:13:19 -0800 Subject: [PATCH 16/26] IB/ipath: Fix spi_pioindex value ipath_piobufbase was a single value offset, but is multiple values on newer chips, so use only the 32 bits for the 2K buffers (4K buffers are currently used only by the driver). Signed-off-by: Dave Olson Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_file_ops.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c index 1af1f3a907c6..ceab52c09cdb 100644 --- a/drivers/infiniband/hw/ipath/ipath_file_ops.c +++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c @@ -223,8 +223,13 @@ static int ipath_get_base_info(struct file *fp, (unsigned long long) kinfo->spi_subport_rcvhdr_base); } - kinfo->spi_pioindex = (kinfo->spi_piobufbase - dd->ipath_piobufbase) / - dd->ipath_palign; + /* + * All user buffers are 2KB buffers. If we ever support + * giving 4KB buffers to user processes, this will need some + * work. + */ + kinfo->spi_pioindex = (kinfo->spi_piobufbase - + (dd->ipath_piobufbase & 0xffffffff)) / dd->ipath_palign; kinfo->spi_pioalign = dd->ipath_palign; kinfo->spi_qpair = IPATH_KD_QP; From 3d0890985ac4dff781b7feba19fedda547314749 Mon Sep 17 00:00:00 2001 From: Dave Olson Date: Fri, 5 Dec 2008 11:14:38 -0800 Subject: [PATCH 17/26] IB/ipath: Add locking for interrupt use of ipath_pd contexts vs free Fixes timing race resulting in panic. Not a performance sensitive path. Signed-off-by: Dave Olson Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_driver.c | 49 ++++++++++++------- drivers/infiniband/hw/ipath/ipath_file_ops.c | 21 ++++---- drivers/infiniband/hw/ipath/ipath_init_chip.c | 1 + drivers/infiniband/hw/ipath/ipath_kernel.h | 2 + drivers/infiniband/hw/ipath/ipath_keys.c | 2 + drivers/infiniband/hw/ipath/ipath_mad.c | 2 + drivers/infiniband/hw/ipath/ipath_verbs.c | 3 +- 7 files changed, 49 insertions(+), 31 deletions(-) diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index ad0aab60b051..69c0ce321b4e 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -661,6 +661,8 @@ bail: static void __devexit cleanup_device(struct ipath_devdata *dd) { int port; + struct ipath_portdata **tmp; + unsigned long flags; if (*dd->ipath_statusp & IPATH_STATUS_CHIP_PRESENT) { /* can't do anything more with chip; needs re-init */ @@ -742,20 +744,21 @@ static void __devexit cleanup_device(struct ipath_devdata *dd) /* * free any resources still in use (usually just kernel ports) - * at unload; we do for portcnt, not cfgports, because cfgports - * could have changed while we were loaded. + * at unload; we do for portcnt, because that's what we allocate. + * We acquire lock to be really paranoid that ipath_pd isn't being + * accessed from some interrupt-related code (that should not happen, + * but best to be sure). */ + spin_lock_irqsave(&dd->ipath_uctxt_lock, flags); + tmp = dd->ipath_pd; + dd->ipath_pd = NULL; + spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags); for (port = 0; port < dd->ipath_portcnt; port++) { - struct ipath_portdata *pd = dd->ipath_pd[port]; - dd->ipath_pd[port] = NULL; + struct ipath_portdata *pd = tmp[port]; + tmp[port] = NULL; /* debugging paranoia */ ipath_free_pddata(dd, pd); } - kfree(dd->ipath_pd); - /* - * debuggability, in case some cleanup path tries to use it - * after this - */ - dd->ipath_pd = NULL; + kfree(tmp); } static void __devexit ipath_remove_one(struct pci_dev *pdev) @@ -2586,6 +2589,7 @@ int ipath_reset_device(int unit) { int ret, i; struct ipath_devdata *dd = ipath_lookup(unit); + unsigned long flags; if (!dd) { ret = -ENODEV; @@ -2611,18 +2615,21 @@ int ipath_reset_device(int unit) goto bail; } + spin_lock_irqsave(&dd->ipath_uctxt_lock, flags); if (dd->ipath_pd) for (i = 1; i < dd->ipath_cfgports; i++) { - if (dd->ipath_pd[i] && dd->ipath_pd[i]->port_cnt) { - ipath_dbg("unit %u port %d is in use " - "(PID %u cmd %s), can't reset\n", - unit, i, - pid_nr(dd->ipath_pd[i]->port_pid), - dd->ipath_pd[i]->port_comm); - ret = -EBUSY; - goto bail; - } + if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt) + continue; + spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags); + ipath_dbg("unit %u port %d is in use " + "(PID %u cmd %s), can't reset\n", + unit, i, + pid_nr(dd->ipath_pd[i]->port_pid), + dd->ipath_pd[i]->port_comm); + ret = -EBUSY; + goto bail; } + spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags); if (dd->ipath_flags & IPATH_HAS_SEND_DMA) teardown_sdma(dd); @@ -2656,9 +2663,12 @@ static int ipath_signal_procs(struct ipath_devdata *dd, int sig) { int i, sub, any = 0; struct pid *pid; + unsigned long flags; if (!dd->ipath_pd) return 0; + + spin_lock_irqsave(&dd->ipath_uctxt_lock, flags); for (i = 1; i < dd->ipath_cfgports; i++) { if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt) continue; @@ -2682,6 +2692,7 @@ static int ipath_signal_procs(struct ipath_devdata *dd, int sig) any++; } } + spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags); return any; } diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c index ceab52c09cdb..239d4e8068ac 100644 --- a/drivers/infiniband/hw/ipath/ipath_file_ops.c +++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c @@ -2046,7 +2046,9 @@ static int ipath_close(struct inode *in, struct file *fp) struct ipath_filedata *fd; struct ipath_portdata *pd; struct ipath_devdata *dd; + unsigned long flags; unsigned port; + struct pid *pid; ipath_cdbg(VERBOSE, "close on dev %lx, private data %p\n", (long)in->i_rdev, fp->private_data); @@ -2079,14 +2081,13 @@ static int ipath_close(struct inode *in, struct file *fp) mutex_unlock(&ipath_mutex); goto bail; } + /* early; no interrupt users after this */ + spin_lock_irqsave(&dd->ipath_uctxt_lock, flags); port = pd->port_port; - - if (pd->port_hdrqfull) { - ipath_cdbg(PROC, "%s[%u] had %u rcvhdrqfull errors " - "during run\n", pd->port_comm, pid_nr(pd->port_pid), - pd->port_hdrqfull); - pd->port_hdrqfull = 0; - } + dd->ipath_pd[port] = NULL; + pid = pd->port_pid; + pd->port_pid = NULL; + spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags); if (pd->port_rcvwait_to || pd->port_piowait_to || pd->port_rcvnowait || pd->port_pionowait) { @@ -2143,13 +2144,11 @@ static int ipath_close(struct inode *in, struct file *fp) unlock_expected_tids(pd); ipath_stats.sps_ports--; ipath_cdbg(PROC, "%s[%u] closed port %u:%u\n", - pd->port_comm, pid_nr(pd->port_pid), + pd->port_comm, pid_nr(pid), dd->ipath_unit, port); } - put_pid(pd->port_pid); - pd->port_pid = NULL; - dd->ipath_pd[pd->port_port] = NULL; /* before releasing mutex */ + put_pid(pid); mutex_unlock(&ipath_mutex); ipath_free_pddata(dd, pd); /* after releasing the mutex */ diff --git a/drivers/infiniband/hw/ipath/ipath_init_chip.c b/drivers/infiniband/hw/ipath/ipath_init_chip.c index 3e5baa43fc82..64aeefbd2a5d 100644 --- a/drivers/infiniband/hw/ipath/ipath_init_chip.c +++ b/drivers/infiniband/hw/ipath/ipath_init_chip.c @@ -229,6 +229,7 @@ static int init_chip_first(struct ipath_devdata *dd) spin_lock_init(&dd->ipath_kernel_tid_lock); spin_lock_init(&dd->ipath_user_tid_lock); spin_lock_init(&dd->ipath_sendctrl_lock); + spin_lock_init(&dd->ipath_uctxt_lock); spin_lock_init(&dd->ipath_sdma_lock); spin_lock_init(&dd->ipath_gpio_lock); spin_lock_init(&dd->ipath_eep_st_lock); diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/infiniband/hw/ipath/ipath_kernel.h index aa84153b731b..6ba4861dd6ac 100644 --- a/drivers/infiniband/hw/ipath/ipath_kernel.h +++ b/drivers/infiniband/hw/ipath/ipath_kernel.h @@ -477,6 +477,8 @@ struct ipath_devdata { spinlock_t ipath_kernel_tid_lock; spinlock_t ipath_user_tid_lock; spinlock_t ipath_sendctrl_lock; + /* around ipath_pd and (user ports) port_cnt use (intr vs free) */ + spinlock_t ipath_uctxt_lock; /* * IPATH_STATUS_*, diff --git a/drivers/infiniband/hw/ipath/ipath_keys.c b/drivers/infiniband/hw/ipath/ipath_keys.c index 8f32b17a5eed..c0e933fec218 100644 --- a/drivers/infiniband/hw/ipath/ipath_keys.c +++ b/drivers/infiniband/hw/ipath/ipath_keys.c @@ -132,6 +132,7 @@ int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge, * (see ipath_get_dma_mr and ipath_dma.c). */ if (sge->lkey == 0) { + /* always a kernel port, no locking needed */ struct ipath_pd *pd = to_ipd(qp->ibqp.pd); if (pd->user) { @@ -211,6 +212,7 @@ int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss, * (see ipath_get_dma_mr and ipath_dma.c). */ if (rkey == 0) { + /* always a kernel port, no locking needed */ struct ipath_pd *pd = to_ipd(qp->ibqp.pd); if (pd->user) { diff --git a/drivers/infiniband/hw/ipath/ipath_mad.c b/drivers/infiniband/hw/ipath/ipath_mad.c index be4fc9ada8e7..17a123197477 100644 --- a/drivers/infiniband/hw/ipath/ipath_mad.c +++ b/drivers/infiniband/hw/ipath/ipath_mad.c @@ -348,6 +348,7 @@ bail: */ static int get_pkeys(struct ipath_devdata *dd, u16 * pkeys) { + /* always a kernel port, no locking needed */ struct ipath_portdata *pd = dd->ipath_pd[0]; memcpy(pkeys, pd->port_pkeys, sizeof(pd->port_pkeys)); @@ -730,6 +731,7 @@ static int set_pkeys(struct ipath_devdata *dd, u16 *pkeys) int i; int changed = 0; + /* always a kernel port, no locking needed */ pd = dd->ipath_pd[0]; for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) { diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c index eabc4247860b..cdf0e6abd34d 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs.c @@ -1852,7 +1852,7 @@ unsigned ipath_get_npkeys(struct ipath_devdata *dd) } /** - * ipath_get_pkey - return the indexed PKEY from the port 0 PKEY table + * ipath_get_pkey - return the indexed PKEY from the port PKEY table * @dd: the infinipath device * @index: the PKEY index */ @@ -1860,6 +1860,7 @@ unsigned ipath_get_pkey(struct ipath_devdata *dd, unsigned index) { unsigned ret; + /* always a kernel port, no locking needed */ if (index >= ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys)) ret = 0; else From 1c721940ddd6496508f1f2fde5167b1c898b419b Mon Sep 17 00:00:00 2001 From: Stefan Roscher Date: Fri, 5 Dec 2008 11:25:38 -0800 Subject: [PATCH 18/26] IB/ehca: Replace modulus operations in flush error completion path With the latest flush error completion patch we introduced modulus operation to calculate the next index within a qmap. Based on comments from other mailing lists we decided to optimize this operation by using an addition and an if-statement instead of modulus, even though this is on the error path. Signed-off-by: Stefan Roscher Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ehca/ehca_classes.h | 7 +++++++ drivers/infiniband/hw/ehca/ehca_qp.c | 12 ++++++------ drivers/infiniband/hw/ehca/ehca_reqs.c | 13 ++++++------- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h index 7fc35cf0cddf..c825142a2fb7 100644 --- a/drivers/infiniband/hw/ehca/ehca_classes.h +++ b/drivers/infiniband/hw/ehca/ehca_classes.h @@ -175,6 +175,13 @@ struct ehca_queue_map { unsigned int next_wqe_idx; /* Idx to first wqe to be flushed */ }; +/* function to calculate the next index for the qmap */ +static inline unsigned int next_index(unsigned int cur_index, unsigned int limit) +{ + unsigned int temp = cur_index + 1; + return (temp == limit) ? 0 : temp; +} + struct ehca_qp { union { struct ib_qp ib_qp; diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c index cadbf0cdd910..f161cf173dbe 100644 --- a/drivers/infiniband/hw/ehca/ehca_qp.c +++ b/drivers/infiniband/hw/ehca/ehca_qp.c @@ -1138,14 +1138,14 @@ static int calc_left_cqes(u64 wqe_p, struct ipz_queue *ipz_queue, return -EFAULT; } - tail_idx = (qmap->tail + 1) % qmap->entries; + tail_idx = next_index(qmap->tail, qmap->entries); wqe_idx = q_ofs / ipz_queue->qe_size; /* check all processed wqes, whether a cqe is requested or not */ while (tail_idx != wqe_idx) { if (qmap->map[tail_idx].cqe_req) qmap->left_to_poll++; - tail_idx = (tail_idx + 1) % qmap->entries; + tail_idx = next_index(tail_idx, qmap->entries); } /* save index in queue, where we have to start flushing */ qmap->next_wqe_idx = wqe_idx; @@ -1195,14 +1195,14 @@ static int check_for_left_cqes(struct ehca_qp *my_qp, struct ehca_shca *shca) } else { spin_lock_irqsave(&my_qp->send_cq->spinlock, flags); my_qp->sq_map.left_to_poll = 0; - my_qp->sq_map.next_wqe_idx = (my_qp->sq_map.tail + 1) % - my_qp->sq_map.entries; + my_qp->sq_map.next_wqe_idx = next_index(my_qp->sq_map.tail, + my_qp->sq_map.entries); spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags); spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags); my_qp->rq_map.left_to_poll = 0; - my_qp->rq_map.next_wqe_idx = (my_qp->rq_map.tail + 1) % - my_qp->rq_map.entries; + my_qp->rq_map.next_wqe_idx = next_index(my_qp->rq_map.tail, + my_qp->rq_map.entries); spin_unlock_irqrestore(&my_qp->recv_cq->spinlock, flags); } diff --git a/drivers/infiniband/hw/ehca/ehca_reqs.c b/drivers/infiniband/hw/ehca/ehca_reqs.c index 00a648f4316c..c7112686782f 100644 --- a/drivers/infiniband/hw/ehca/ehca_reqs.c +++ b/drivers/infiniband/hw/ehca/ehca_reqs.c @@ -726,13 +726,13 @@ repoll: * set left_to_poll to 0 because in error state, we will not * get any additional CQEs */ - my_qp->sq_map.next_wqe_idx = (my_qp->sq_map.tail + 1) % - my_qp->sq_map.entries; + my_qp->sq_map.next_wqe_idx = next_index(my_qp->sq_map.tail, + my_qp->sq_map.entries); my_qp->sq_map.left_to_poll = 0; ehca_add_to_err_list(my_qp, 1); - my_qp->rq_map.next_wqe_idx = (my_qp->rq_map.tail + 1) % - my_qp->rq_map.entries; + my_qp->rq_map.next_wqe_idx = next_index(my_qp->rq_map.tail, + my_qp->rq_map.entries); my_qp->rq_map.left_to_poll = 0; if (HAS_RQ(my_qp)) ehca_add_to_err_list(my_qp, 0); @@ -860,9 +860,8 @@ static int generate_flush_cqes(struct ehca_qp *my_qp, struct ib_cq *cq, /* mark as reported and advance next_wqe pointer */ qmap_entry->reported = 1; - qmap->next_wqe_idx++; - if (qmap->next_wqe_idx == qmap->entries) - qmap->next_wqe_idx = 0; + qmap->next_wqe_idx = next_index(qmap->next_wqe_idx, + qmap->entries); qmap_entry = &qmap->map[qmap->next_wqe_idx]; wc++; nr++; From 139cdab0a2af6f5eaee47cc0144608e53b65279d Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 21 Dec 2008 13:29:13 -0800 Subject: [PATCH 19/26] IB/ehca: Remove redundant test of vpage vpage is checked not to be NULL just after it is initialized at the beginning of each loop iteration. A simplified version of the semantic patch that makes this change is as follows: (http://www.emn.fr/x-info/coccinelle/) // @r exists@ local idexpression x; expression E; position p1,p2; @@ if (x@p1 == NULL || ...) { ... when forall return ...; } ... when != \(x=E\|x--\|x++\|--x\|++x\|x-=E\|x+=E\|x|=E\|x&=E\|&x\) ( x@p2 == NULL | x@p2 != NULL ) // another path to the test that is not through p1? @s exists@ local idexpression r.x; position r.p1,r.p2; @@ ... when != x@p1 ( x@p2 == NULL | x@p2 != NULL ) @fix depends on !s@ position r.p1,r.p2; expression x,E; statement S1,S2; @@ ( - if ((x@p2 != NULL) || ...) S1 | - if ((x@p2 == NULL) && ...) S1 | - BUG_ON(x@p2 == NULL); ) // Signed-off-by: Julia Lawall Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ehca/ehca_eq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/ehca/ehca_eq.c b/drivers/infiniband/hw/ehca/ehca_eq.c index 49660dfa1867..523e733c630e 100644 --- a/drivers/infiniband/hw/ehca/ehca_eq.c +++ b/drivers/infiniband/hw/ehca/ehca_eq.c @@ -113,7 +113,7 @@ int ehca_create_eq(struct ehca_shca *shca, if (h_ret != H_SUCCESS || vpage) goto create_eq_exit2; } else { - if (h_ret != H_PAGE_REGISTERED || !vpage) + if (h_ret != H_PAGE_REGISTERED) goto create_eq_exit2; } } From bba7ebba3b17f4fe8c5907a32e16d9bd3fcf5192 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Sun, 21 Dec 2008 13:56:50 -0800 Subject: [PATCH 20/26] IB/iser: Avoid recv buffer exhaustion caused by unexpected PDUs iSCSI/iSER targets may send PDUs without a prior request from the initiator. RFC 5046 refers to these PDUs as "unexpected". NOP-In PDUs with itt=RESERVED and Asynchronous Message PDUs occupy this category. The amount of active "unexpected" PDU's an iSER target may have at any time is governed by the MaxOutstandingUnexpectedPDUs key, which is not yet supported. Currently when an iSER target sends an "unexpected" PDU, the initiators recv buffer consumed by the PDU is not replaced. If over initial_post_recv_bufs_num "unexpected" PDUs are received then the receive queue will run out of receive work requests entirely. This patch ensures recv buffers consumed by "unexpected" PDUs are replaced in the next iser_post_receive_control() call. Signed-off-by: David Disseldorp Signed-off-by: Ken Sandars Acked-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/iser/iscsi_iser.h | 3 + drivers/infiniband/ulp/iser/iser_initiator.c | 146 +++++++++++++------ drivers/infiniband/ulp/iser/iser_verbs.c | 1 + 3 files changed, 102 insertions(+), 48 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index 81a82628a5f1..861119593f2b 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -252,6 +252,9 @@ struct iser_conn { wait_queue_head_t wait; /* waitq for conn/disconn */ atomic_t post_recv_buf_count; /* posted rx count */ atomic_t post_send_buf_count; /* posted tx count */ + atomic_t unexpected_pdu_count;/* count of received * + * unexpected pdus * + * not yet retired */ char name[ISER_OBJECT_NAME_SIZE]; struct iser_page_vec *page_vec; /* represents SG to fmr maps* * maps serialized as tx is*/ diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index cdd283189047..ed1aff21b7ea 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -183,14 +183,8 @@ static int iser_post_receive_control(struct iscsi_conn *conn) struct iser_regd_buf *regd_data; struct iser_dto *recv_dto = NULL; struct iser_device *device = iser_conn->ib_conn->device; - int rx_data_size, err = 0; - - rx_desc = kmem_cache_alloc(ig.desc_cache, GFP_NOIO); - if (rx_desc == NULL) { - iser_err("Failed to alloc desc for post recv\n"); - return -ENOMEM; - } - rx_desc->type = ISCSI_RX; + int rx_data_size, err; + int posts, outstanding_unexp_pdus; /* for the login sequence we must support rx of upto 8K; login is done * after conn create/bind (connect) and conn stop/bind (reconnect), @@ -201,46 +195,80 @@ static int iser_post_receive_control(struct iscsi_conn *conn) else /* FIXME till user space sets conn->max_recv_dlength correctly */ rx_data_size = 128; - rx_desc->data = kmalloc(rx_data_size, GFP_NOIO); - if (rx_desc->data == NULL) { - iser_err("Failed to alloc data buf for post recv\n"); - err = -ENOMEM; - goto post_rx_kmalloc_failure; + outstanding_unexp_pdus = + atomic_xchg(&iser_conn->ib_conn->unexpected_pdu_count, 0); + + /* + * in addition to the response buffer, replace those consumed by + * unexpected pdus. + */ + for (posts = 0; posts < 1 + outstanding_unexp_pdus; posts++) { + rx_desc = kmem_cache_alloc(ig.desc_cache, GFP_NOIO); + if (rx_desc == NULL) { + iser_err("Failed to alloc desc for post recv %d\n", + posts); + err = -ENOMEM; + goto post_rx_cache_alloc_failure; + } + rx_desc->type = ISCSI_RX; + rx_desc->data = kmalloc(rx_data_size, GFP_NOIO); + if (rx_desc->data == NULL) { + iser_err("Failed to alloc data buf for post recv %d\n", + posts); + err = -ENOMEM; + goto post_rx_kmalloc_failure; + } + + recv_dto = &rx_desc->dto; + recv_dto->ib_conn = iser_conn->ib_conn; + recv_dto->regd_vector_len = 0; + + regd_hdr = &rx_desc->hdr_regd_buf; + memset(regd_hdr, 0, sizeof(struct iser_regd_buf)); + regd_hdr->device = device; + regd_hdr->virt_addr = rx_desc; /* == &rx_desc->iser_header */ + regd_hdr->data_size = ISER_TOTAL_HEADERS_LEN; + + iser_reg_single(device, regd_hdr, DMA_FROM_DEVICE); + + iser_dto_add_regd_buff(recv_dto, regd_hdr, 0, 0); + + regd_data = &rx_desc->data_regd_buf; + memset(regd_data, 0, sizeof(struct iser_regd_buf)); + regd_data->device = device; + regd_data->virt_addr = rx_desc->data; + regd_data->data_size = rx_data_size; + + iser_reg_single(device, regd_data, DMA_FROM_DEVICE); + + iser_dto_add_regd_buff(recv_dto, regd_data, 0, 0); + + err = iser_post_recv(rx_desc); + if (err) { + iser_err("Failed iser_post_recv for post %d\n", posts); + goto post_rx_post_recv_failure; + } } + /* all posts successful */ + return 0; - recv_dto = &rx_desc->dto; - recv_dto->ib_conn = iser_conn->ib_conn; - recv_dto->regd_vector_len = 0; - - regd_hdr = &rx_desc->hdr_regd_buf; - memset(regd_hdr, 0, sizeof(struct iser_regd_buf)); - regd_hdr->device = device; - regd_hdr->virt_addr = rx_desc; /* == &rx_desc->iser_header */ - regd_hdr->data_size = ISER_TOTAL_HEADERS_LEN; - - iser_reg_single(device, regd_hdr, DMA_FROM_DEVICE); - - iser_dto_add_regd_buff(recv_dto, regd_hdr, 0, 0); - - regd_data = &rx_desc->data_regd_buf; - memset(regd_data, 0, sizeof(struct iser_regd_buf)); - regd_data->device = device; - regd_data->virt_addr = rx_desc->data; - regd_data->data_size = rx_data_size; - - iser_reg_single(device, regd_data, DMA_FROM_DEVICE); - - iser_dto_add_regd_buff(recv_dto, regd_data, 0, 0); - - err = iser_post_recv(rx_desc); - if (!err) - return 0; - - /* iser_post_recv failed */ +post_rx_post_recv_failure: iser_dto_buffs_release(recv_dto); kfree(rx_desc->data); post_rx_kmalloc_failure: kmem_cache_free(ig.desc_cache, rx_desc); +post_rx_cache_alloc_failure: + if (posts > 0) { + /* + * response buffer posted, but did not replace all unexpected + * pdu recv bufs. Ignore error, retry occurs next send + */ + outstanding_unexp_pdus -= (posts - 1); + err = 0; + } + atomic_add(outstanding_unexp_pdus, + &iser_conn->ib_conn->unexpected_pdu_count); + return err; } @@ -274,8 +302,10 @@ int iser_conn_set_full_featured_mode(struct iscsi_conn *conn) struct iscsi_iser_conn *iser_conn = conn->dd_data; int i; - /* no need to keep it in a var, we are after login so if this should - * be negotiated, by now the result should be available here */ + /* + * FIXME this value should be declared to the target during login with + * the MaxOutstandingUnexpectedPDUs key when supported + */ int initial_post_recv_bufs_num = ISER_MAX_RX_MISC_PDUS; iser_dbg("Initially post: %d\n", initial_post_recv_bufs_num); @@ -478,6 +508,7 @@ int iser_send_control(struct iscsi_conn *conn, int err = 0; struct iser_regd_buf *regd_buf; struct iser_device *device; + unsigned char opcode; if (!iser_conn_state_comp(iser_conn->ib_conn, ISER_CONN_UP)) { iser_err("Failed to send, conn: 0x%p is not up\n", iser_conn->ib_conn); @@ -512,10 +543,15 @@ int iser_send_control(struct iscsi_conn *conn, data_seg_len); } - if (iser_post_receive_control(conn) != 0) { - iser_err("post_rcv_buff failed!\n"); - err = -ENOMEM; - goto send_control_error; + opcode = task->hdr->opcode & ISCSI_OPCODE_MASK; + + /* post recv buffer for response if one is expected */ + if (!(opcode == ISCSI_OP_NOOP_OUT && task->hdr->itt == RESERVED_ITT)) { + if (iser_post_receive_control(conn) != 0) { + iser_err("post_rcv_buff failed!\n"); + err = -ENOMEM; + goto send_control_error; + } } err = iser_post_send(mdesc); @@ -586,6 +622,20 @@ void iser_rcv_completion(struct iser_desc *rx_desc, * parallel to the execution of iser_conn_term. So the code that waits * * for the posted rx bufs refcount to become zero handles everything */ atomic_dec(&conn->ib_conn->post_recv_buf_count); + + /* + * if an unexpected PDU was received then the recv wr consumed must + * be replaced, this is done in the next send of a control-type PDU + */ + if (opcode == ISCSI_OP_NOOP_IN && hdr->itt == RESERVED_ITT) { + /* nop-in with itt = 0xffffffff */ + atomic_inc(&conn->ib_conn->unexpected_pdu_count); + } + else if (opcode == ISCSI_OP_ASYNC_EVENT) { + /* asyncronous message */ + atomic_inc(&conn->ib_conn->unexpected_pdu_count); + } + /* a reject PDU consumes the recv buf posted for the response */ } void iser_snd_completion(struct iser_desc *tx_desc) diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 26ff6214a81f..6dc6b174cdd4 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -498,6 +498,7 @@ void iser_conn_init(struct iser_conn *ib_conn) init_waitqueue_head(&ib_conn->wait); atomic_set(&ib_conn->post_recv_buf_count, 0); atomic_set(&ib_conn->post_send_buf_count, 0); + atomic_set(&ib_conn->unexpected_pdu_count, 0); atomic_set(&ib_conn->refcount, 1); INIT_LIST_HEAD(&ib_conn->conn_list); spin_lock_init(&ib_conn->lock); From b8dd786f9417e5885929bfe33a235c76a9c1c569 Mon Sep 17 00:00:00 2001 From: Yevgeny Petrilin Date: Mon, 22 Dec 2008 07:15:03 -0800 Subject: [PATCH 21/26] mlx4_core: Add support for multiple completion event vectors When using MSI-X mode, create a completion event queue for each CPU. Report the number of completion EQs in a new struct mlx4_caps member, num_comp_vectors, and extend the mlx4_cq_alloc() interface with a vector parameter so that consumers can specify which completion EQ should be used to report events for the CQ being created. Signed-off-by: Yevgeny Petrilin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/cq.c | 2 +- drivers/infiniband/hw/mlx4/main.c | 2 +- drivers/net/mlx4/cq.c | 11 ++- drivers/net/mlx4/en_cq.c | 9 ++- drivers/net/mlx4/en_main.c | 4 +- drivers/net/mlx4/eq.c | 117 ++++++++++++++++++++++-------- drivers/net/mlx4/main.c | 53 ++++++++++---- drivers/net/mlx4/mlx4.h | 14 ++-- drivers/net/mlx4/profile.c | 4 +- include/linux/mlx4/device.h | 4 +- 10 files changed, 157 insertions(+), 63 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index 18308494a195..2198753bf13d 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -222,7 +222,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector } err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar, - cq->db.dma, &cq->mcq, 0); + cq->db.dma, &cq->mcq, vector, 0); if (err) goto err_dbmap; diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 2e80f8f47b02..dcefe1fceb5c 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -578,7 +578,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) ibdev->num_ports++; ibdev->ib_dev.phys_port_cnt = ibdev->num_ports; - ibdev->ib_dev.num_comp_vectors = 1; + ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; ibdev->ib_dev.dma_device = &dev->pdev->dev; ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION; diff --git a/drivers/net/mlx4/cq.c b/drivers/net/mlx4/cq.c index b7ad2829d67e..ac57b6a42c6e 100644 --- a/drivers/net/mlx4/cq.c +++ b/drivers/net/mlx4/cq.c @@ -189,7 +189,7 @@ EXPORT_SYMBOL_GPL(mlx4_cq_resize); int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt, struct mlx4_uar *uar, u64 db_rec, struct mlx4_cq *cq, - int collapsed) + unsigned vector, int collapsed) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_cq_table *cq_table = &priv->cq_table; @@ -198,6 +198,11 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt, u64 mtt_addr; int err; + if (vector >= dev->caps.num_comp_vectors) + return -EINVAL; + + cq->vector = vector; + cq->cqn = mlx4_bitmap_alloc(&cq_table->bitmap); if (cq->cqn == -1) return -ENOMEM; @@ -227,7 +232,7 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt, cq_context->flags = cpu_to_be32(!!collapsed << 18); cq_context->logsize_usrpage = cpu_to_be32((ilog2(nent) << 24) | uar->index); - cq_context->comp_eqn = priv->eq_table.eq[MLX4_EQ_COMP].eqn; + cq_context->comp_eqn = priv->eq_table.eq[vector].eqn; cq_context->log_page_size = mtt->page_shift - MLX4_ICM_PAGE_SHIFT; mtt_addr = mlx4_mtt_addr(dev, mtt); @@ -276,7 +281,7 @@ void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq) if (err) mlx4_warn(dev, "HW2SW_CQ failed (%d) for CQN %06x\n", err, cq->cqn); - synchronize_irq(priv->eq_table.eq[MLX4_EQ_COMP].irq); + synchronize_irq(priv->eq_table.eq[cq->vector].irq); spin_lock_irq(&cq_table->lock); radix_tree_delete(&cq_table->tree, cq->cqn); diff --git a/drivers/net/mlx4/en_cq.c b/drivers/net/mlx4/en_cq.c index 1368a8010af4..674f836e225b 100644 --- a/drivers/net/mlx4/en_cq.c +++ b/drivers/net/mlx4/en_cq.c @@ -51,10 +51,13 @@ int mlx4_en_create_cq(struct mlx4_en_priv *priv, int err; cq->size = entries; - if (mode == RX) + if (mode == RX) { cq->buf_size = cq->size * sizeof(struct mlx4_cqe); - else + cq->vector = ring % mdev->dev->caps.num_comp_vectors; + } else { cq->buf_size = sizeof(struct mlx4_cqe); + cq->vector = 0; + } cq->ring = ring; cq->is_tx = mode; @@ -86,7 +89,7 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq) memset(cq->buf, 0, cq->buf_size); err = mlx4_cq_alloc(mdev->dev, cq->size, &cq->wqres.mtt, &mdev->priv_uar, - cq->wqres.db.dma, &cq->mcq, cq->is_tx); + cq->wqres.db.dma, &cq->mcq, cq->vector, cq->is_tx); if (err) return err; diff --git a/drivers/net/mlx4/en_main.c b/drivers/net/mlx4/en_main.c index 4b9794e97a79..c1c05852a95e 100644 --- a/drivers/net/mlx4/en_main.c +++ b/drivers/net/mlx4/en_main.c @@ -170,9 +170,9 @@ static void *mlx4_en_add(struct mlx4_dev *dev) mlx4_info(mdev, "Using %d tx rings for port:%d\n", mdev->profile.prof[i].tx_ring_num, i); if (!mdev->profile.prof[i].rx_ring_num) { - mdev->profile.prof[i].rx_ring_num = 1; + mdev->profile.prof[i].rx_ring_num = dev->caps.num_comp_vectors; mlx4_info(mdev, "Defaulting to %d rx rings for port:%d\n", - 1, i); + mdev->profile.prof[i].rx_ring_num, i); } else mlx4_info(mdev, "Using %d rx rings for port:%d\n", mdev->profile.prof[i].rx_ring_num, i); diff --git a/drivers/net/mlx4/eq.c b/drivers/net/mlx4/eq.c index de169338cd90..5d867ebe6a4d 100644 --- a/drivers/net/mlx4/eq.c +++ b/drivers/net/mlx4/eq.c @@ -266,7 +266,7 @@ static irqreturn_t mlx4_interrupt(int irq, void *dev_ptr) writel(priv->eq_table.clr_mask, priv->eq_table.clr_int); - for (i = 0; i < MLX4_NUM_EQ; ++i) + for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) work |= mlx4_eq_int(dev, &priv->eq_table.eq[i]); return IRQ_RETVAL(work); @@ -304,6 +304,17 @@ static int mlx4_HW2SW_EQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, MLX4_CMD_TIME_CLASS_A); } +static int mlx4_num_eq_uar(struct mlx4_dev *dev) +{ + /* + * Each UAR holds 4 EQ doorbells. To figure out how many UARs + * we need to map, take the difference of highest index and + * the lowest index we'll use and add 1. + */ + return (dev->caps.num_comp_vectors + 1 + dev->caps.reserved_eqs) / 4 - + dev->caps.reserved_eqs / 4 + 1; +} + static void __iomem *mlx4_get_eq_uar(struct mlx4_dev *dev, struct mlx4_eq *eq) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -483,9 +494,11 @@ static void mlx4_free_irqs(struct mlx4_dev *dev) if (eq_table->have_irq) free_irq(dev->pdev->irq, dev); - for (i = 0; i < MLX4_NUM_EQ; ++i) + for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) if (eq_table->eq[i].have_irq) free_irq(eq_table->eq[i].irq, eq_table->eq + i); + + kfree(eq_table->irq_names); } static int mlx4_map_clr_int(struct mlx4_dev *dev) @@ -551,57 +564,93 @@ void mlx4_unmap_eq_icm(struct mlx4_dev *dev) __free_page(priv->eq_table.icm_page); } +int mlx4_alloc_eq_table(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + priv->eq_table.eq = kcalloc(dev->caps.num_eqs - dev->caps.reserved_eqs, + sizeof *priv->eq_table.eq, GFP_KERNEL); + if (!priv->eq_table.eq) + return -ENOMEM; + + return 0; +} + +void mlx4_free_eq_table(struct mlx4_dev *dev) +{ + kfree(mlx4_priv(dev)->eq_table.eq); +} + int mlx4_init_eq_table(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); int err; int i; + priv->eq_table.uar_map = kcalloc(sizeof *priv->eq_table.uar_map, + mlx4_num_eq_uar(dev), GFP_KERNEL); + if (!priv->eq_table.uar_map) { + err = -ENOMEM; + goto err_out_free; + } + err = mlx4_bitmap_init(&priv->eq_table.bitmap, dev->caps.num_eqs, dev->caps.num_eqs - 1, dev->caps.reserved_eqs, 0); if (err) - return err; + goto err_out_free; - for (i = 0; i < ARRAY_SIZE(priv->eq_table.uar_map); ++i) + for (i = 0; i < mlx4_num_eq_uar(dev); ++i) priv->eq_table.uar_map[i] = NULL; err = mlx4_map_clr_int(dev); if (err) - goto err_out_free; + goto err_out_bitmap; priv->eq_table.clr_mask = swab32(1 << (priv->eq_table.inta_pin & 31)); priv->eq_table.clr_int = priv->clr_base + (priv->eq_table.inta_pin < 32 ? 4 : 0); - err = mlx4_create_eq(dev, dev->caps.num_cqs + MLX4_NUM_SPARE_EQE, - (dev->flags & MLX4_FLAG_MSI_X) ? MLX4_EQ_COMP : 0, - &priv->eq_table.eq[MLX4_EQ_COMP]); - if (err) - goto err_out_unmap; + priv->eq_table.irq_names = kmalloc(16 * dev->caps.num_comp_vectors, GFP_KERNEL); + if (!priv->eq_table.irq_names) { + err = -ENOMEM; + goto err_out_bitmap; + } + + for (i = 0; i < dev->caps.num_comp_vectors; ++i) { + err = mlx4_create_eq(dev, dev->caps.num_cqs + MLX4_NUM_SPARE_EQE, + (dev->flags & MLX4_FLAG_MSI_X) ? i : 0, + &priv->eq_table.eq[i]); + if (err) + goto err_out_unmap; + } err = mlx4_create_eq(dev, MLX4_NUM_ASYNC_EQE + MLX4_NUM_SPARE_EQE, - (dev->flags & MLX4_FLAG_MSI_X) ? MLX4_EQ_ASYNC : 0, - &priv->eq_table.eq[MLX4_EQ_ASYNC]); + (dev->flags & MLX4_FLAG_MSI_X) ? dev->caps.num_comp_vectors : 0, + &priv->eq_table.eq[dev->caps.num_comp_vectors]); if (err) goto err_out_comp; if (dev->flags & MLX4_FLAG_MSI_X) { - static const char *eq_name[] = { - [MLX4_EQ_COMP] = DRV_NAME " (comp)", - [MLX4_EQ_ASYNC] = DRV_NAME " (async)" - }; + static const char async_eq_name[] = "mlx4-async"; + const char *eq_name; + + for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) { + if (i < dev->caps.num_comp_vectors) { + snprintf(priv->eq_table.irq_names + i * 16, 16, + "mlx4-comp-%d", i); + eq_name = priv->eq_table.irq_names + i * 16; + } else + eq_name = async_eq_name; - for (i = 0; i < MLX4_NUM_EQ; ++i) { err = request_irq(priv->eq_table.eq[i].irq, - mlx4_msi_x_interrupt, - 0, eq_name[i], priv->eq_table.eq + i); + mlx4_msi_x_interrupt, 0, eq_name, + priv->eq_table.eq + i); if (err) goto err_out_async; priv->eq_table.eq[i].have_irq = 1; } - } else { err = request_irq(dev->pdev->irq, mlx4_interrupt, IRQF_SHARED, DRV_NAME, dev); @@ -612,28 +661,36 @@ int mlx4_init_eq_table(struct mlx4_dev *dev) } err = mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 0, - priv->eq_table.eq[MLX4_EQ_ASYNC].eqn); + priv->eq_table.eq[dev->caps.num_comp_vectors].eqn); if (err) mlx4_warn(dev, "MAP_EQ for async EQ %d failed (%d)\n", - priv->eq_table.eq[MLX4_EQ_ASYNC].eqn, err); + priv->eq_table.eq[dev->caps.num_comp_vectors].eqn, err); - for (i = 0; i < MLX4_NUM_EQ; ++i) + for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) eq_set_ci(&priv->eq_table.eq[i], 1); return 0; err_out_async: - mlx4_free_eq(dev, &priv->eq_table.eq[MLX4_EQ_ASYNC]); + mlx4_free_eq(dev, &priv->eq_table.eq[dev->caps.num_comp_vectors]); err_out_comp: - mlx4_free_eq(dev, &priv->eq_table.eq[MLX4_EQ_COMP]); + i = dev->caps.num_comp_vectors - 1; err_out_unmap: + while (i >= 0) { + mlx4_free_eq(dev, &priv->eq_table.eq[i]); + --i; + } mlx4_unmap_clr_int(dev); mlx4_free_irqs(dev); -err_out_free: +err_out_bitmap: mlx4_bitmap_cleanup(&priv->eq_table.bitmap); + +err_out_free: + kfree(priv->eq_table.uar_map); + return err; } @@ -643,18 +700,20 @@ void mlx4_cleanup_eq_table(struct mlx4_dev *dev) int i; mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 1, - priv->eq_table.eq[MLX4_EQ_ASYNC].eqn); + priv->eq_table.eq[dev->caps.num_comp_vectors].eqn); mlx4_free_irqs(dev); - for (i = 0; i < MLX4_NUM_EQ; ++i) + for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) mlx4_free_eq(dev, &priv->eq_table.eq[i]); mlx4_unmap_clr_int(dev); - for (i = 0; i < ARRAY_SIZE(priv->eq_table.uar_map); ++i) + for (i = 0; i < mlx4_num_eq_uar(dev); ++i) if (priv->eq_table.uar_map[i]) iounmap(priv->eq_table.uar_map[i]); mlx4_bitmap_cleanup(&priv->eq_table.bitmap); + + kfree(priv->eq_table.uar_map); } diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index 90a0281d15ea..710c79e7a2db 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -421,9 +421,7 @@ static int mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base, ((u64) (MLX4_CMPT_TYPE_EQ * cmpt_entry_sz) << MLX4_CMPT_SHIFT), cmpt_entry_sz, - roundup_pow_of_two(MLX4_NUM_EQ + - dev->caps.reserved_eqs), - MLX4_NUM_EQ + dev->caps.reserved_eqs, 0, 0); + dev->caps.num_eqs, dev->caps.num_eqs, 0, 0); if (err) goto err_cq; @@ -810,12 +808,12 @@ static int mlx4_setup_hca(struct mlx4_dev *dev) if (dev->flags & MLX4_FLAG_MSI_X) { mlx4_warn(dev, "NOP command failed to generate MSI-X " "interrupt IRQ %d).\n", - priv->eq_table.eq[MLX4_EQ_ASYNC].irq); + priv->eq_table.eq[dev->caps.num_comp_vectors].irq); mlx4_warn(dev, "Trying again without MSI-X.\n"); } else { mlx4_err(dev, "NOP command failed to generate interrupt " "(IRQ %d), aborting.\n", - priv->eq_table.eq[MLX4_EQ_ASYNC].irq); + priv->eq_table.eq[dev->caps.num_comp_vectors].irq); mlx4_err(dev, "BIOS or ACPI interrupt routing problem?\n"); } @@ -908,31 +906,50 @@ err_uar_table_free: static void mlx4_enable_msi_x(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); - struct msix_entry entries[MLX4_NUM_EQ]; + struct msix_entry *entries; + int nreq; int err; int i; if (msi_x) { - for (i = 0; i < MLX4_NUM_EQ; ++i) + nreq = min(dev->caps.num_eqs - dev->caps.reserved_eqs, + num_possible_cpus() + 1); + entries = kcalloc(nreq, sizeof *entries, GFP_KERNEL); + if (!entries) + goto no_msi; + + for (i = 0; i < nreq; ++i) entries[i].entry = i; - err = pci_enable_msix(dev->pdev, entries, ARRAY_SIZE(entries)); + retry: + err = pci_enable_msix(dev->pdev, entries, nreq); if (err) { - if (err > 0) - mlx4_info(dev, "Only %d MSI-X vectors available, " - "not using MSI-X\n", err); + /* Try again if at least 2 vectors are available */ + if (err > 1) { + mlx4_info(dev, "Requested %d vectors, " + "but only %d MSI-X vectors available, " + "trying again\n", nreq, err); + nreq = err; + goto retry; + } + goto no_msi; } - for (i = 0; i < MLX4_NUM_EQ; ++i) + dev->caps.num_comp_vectors = nreq - 1; + for (i = 0; i < nreq; ++i) priv->eq_table.eq[i].irq = entries[i].vector; dev->flags |= MLX4_FLAG_MSI_X; + + kfree(entries); return; } no_msi: - for (i = 0; i < MLX4_NUM_EQ; ++i) + dev->caps.num_comp_vectors = 1; + + for (i = 0; i < 2; ++i) priv->eq_table.eq[i].irq = dev->pdev->irq; } @@ -1074,6 +1091,10 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) if (err) goto err_cmd; + err = mlx4_alloc_eq_table(dev); + if (err) + goto err_close; + mlx4_enable_msi_x(dev); err = mlx4_setup_hca(dev); @@ -1084,7 +1105,7 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) } if (err) - goto err_close; + goto err_free_eq; for (port = 1; port <= dev->caps.num_ports; port++) { err = mlx4_init_port_info(dev, port); @@ -1114,6 +1135,9 @@ err_port: mlx4_cleanup_pd_table(dev); mlx4_cleanup_uar_table(dev); +err_free_eq: + mlx4_free_eq_table(dev); + err_close: if (dev->flags & MLX4_FLAG_MSI_X) pci_disable_msix(pdev); @@ -1177,6 +1201,7 @@ static void mlx4_remove_one(struct pci_dev *pdev) iounmap(priv->kar); mlx4_uar_free(dev, &priv->driver_uar); mlx4_cleanup_uar_table(dev); + mlx4_free_eq_table(dev); mlx4_close_hca(dev); mlx4_cmd_cleanup(dev); diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index 34c909deaff3..e0213bad61c7 100644 --- a/drivers/net/mlx4/mlx4.h +++ b/drivers/net/mlx4/mlx4.h @@ -62,12 +62,6 @@ enum { MLX4_MTT_ENTRY_PER_SEG = 8 }; -enum { - MLX4_EQ_ASYNC, - MLX4_EQ_COMP, - MLX4_NUM_EQ -}; - enum { MLX4_NUM_PDS = 1 << 15 }; @@ -205,10 +199,11 @@ struct mlx4_cq_table { struct mlx4_eq_table { struct mlx4_bitmap bitmap; + char *irq_names; void __iomem *clr_int; - void __iomem *uar_map[(MLX4_NUM_EQ + 6) / 4]; + void __iomem **uar_map; u32 clr_mask; - struct mlx4_eq eq[MLX4_NUM_EQ]; + struct mlx4_eq *eq; u64 icm_virt; struct page *icm_page; dma_addr_t icm_dma; @@ -328,6 +323,9 @@ void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap); int mlx4_reset(struct mlx4_dev *dev); +int mlx4_alloc_eq_table(struct mlx4_dev *dev); +void mlx4_free_eq_table(struct mlx4_dev *dev); + int mlx4_init_pd_table(struct mlx4_dev *dev); int mlx4_init_uar_table(struct mlx4_dev *dev); int mlx4_init_mr_table(struct mlx4_dev *dev); diff --git a/drivers/net/mlx4/profile.c b/drivers/net/mlx4/profile.c index 9ca42b213d54..919fb9eb1b62 100644 --- a/drivers/net/mlx4/profile.c +++ b/drivers/net/mlx4/profile.c @@ -107,7 +107,9 @@ u64 mlx4_make_profile(struct mlx4_dev *dev, profile[MLX4_RES_AUXC].num = request->num_qp; profile[MLX4_RES_SRQ].num = request->num_srq; profile[MLX4_RES_CQ].num = request->num_cq; - profile[MLX4_RES_EQ].num = MLX4_NUM_EQ + dev_cap->reserved_eqs; + profile[MLX4_RES_EQ].num = min(dev_cap->max_eqs, + dev_cap->reserved_eqs + + num_possible_cpus() + 1); profile[MLX4_RES_DMPT].num = request->num_mpt; profile[MLX4_RES_CMPT].num = MLX4_NUM_CMPTS; profile[MLX4_RES_MTT].num = request->num_mtt; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 371086fd946f..8f659cc29960 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -206,6 +206,7 @@ struct mlx4_caps { int reserved_cqs; int num_eqs; int reserved_eqs; + int num_comp_vectors; int num_mpts; int num_mtt_segs; int fmr_reserved_mtts; @@ -328,6 +329,7 @@ struct mlx4_cq { int arm_sn; int cqn; + unsigned vector; atomic_t refcount; struct completion free; @@ -437,7 +439,7 @@ void mlx4_free_hwq_res(struct mlx4_dev *mdev, struct mlx4_hwq_resources *wqres, int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt, struct mlx4_uar *uar, u64 db_rec, struct mlx4_cq *cq, - int collapsed); + unsigned vector, int collapsed); void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq); int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base); From f5eda57f9bb17b6f09f7888dfc2c47db7aea45d4 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 22 Dec 2008 07:15:05 -0800 Subject: [PATCH 22/26] mlx4_core: Delete incorrect comment The comment about a "Conditional on hca_type" was cut-and-pasted from the mthca driver, and doesn't apply to mlx4 (since only one type of HCA is handled by mlx4). So just delete it. Signed-off-by: Roland Dreier --- drivers/net/mlx4/eq.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/mlx4/eq.c b/drivers/net/mlx4/eq.c index 5d867ebe6a4d..2c19bff7cbab 100644 --- a/drivers/net/mlx4/eq.c +++ b/drivers/net/mlx4/eq.c @@ -243,10 +243,6 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) * least that often. */ if (unlikely(set_ci >= MLX4_NUM_SPARE_EQE)) { - /* - * Conditional on hca_type is OK here because - * this is a rare case, not the fast path. - */ eq_set_ci(eq, 0); set_ci = 0; } From 38617c64bf9a10bf20e41d95b69bb81e8560fe9d Mon Sep 17 00:00:00 2001 From: Aleksey Senin Date: Wed, 24 Dec 2008 10:16:37 -0800 Subject: [PATCH 23/26] RDMA/addr: Add support for translating IPv6 addresses Add support for translating AF_INET6 addresses to the IB address translation service. This requires using struct sockaddr_storage instead of struct sockaddr wherever an IPv6 address might be stored, and adding cases to handle IPv6 in addition to IPv4 to the various translation functions. Signed-off-by: Aleksey Senin Signed-off-by: Roland Dreier --- drivers/infiniband/core/addr.c | 194 +++++++++++++++++++++++++-------- 1 file changed, 150 insertions(+), 44 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 09a2bec7fd32..d98b05b28262 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -41,6 +41,8 @@ #include #include #include +#include +#include #include MODULE_AUTHOR("Sean Hefty"); @@ -49,8 +51,8 @@ MODULE_LICENSE("Dual BSD/GPL"); struct addr_req { struct list_head list; - struct sockaddr src_addr; - struct sockaddr dst_addr; + struct sockaddr_storage src_addr; + struct sockaddr_storage dst_addr; struct rdma_dev_addr *addr; struct rdma_addr_client *client; void *context; @@ -113,15 +115,32 @@ EXPORT_SYMBOL(rdma_copy_addr); int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) { struct net_device *dev; - __be32 ip = ((struct sockaddr_in *) addr)->sin_addr.s_addr; - int ret; + int ret = -EADDRNOTAVAIL; - dev = ip_dev_find(&init_net, ip); - if (!dev) - return -EADDRNOTAVAIL; + switch (addr->sa_family) { + case AF_INET: + dev = ip_dev_find(&init_net, + ((struct sockaddr_in *) addr)->sin_addr.s_addr); - ret = rdma_copy_addr(dev_addr, dev, NULL); - dev_put(dev); + if (!dev) + return ret; + + ret = rdma_copy_addr(dev_addr, dev, NULL); + dev_put(dev); + break; + case AF_INET6: + for_each_netdev(&init_net, dev) { + if (ipv6_chk_addr(&init_net, + &((struct sockaddr_in6 *) addr)->sin6_addr, + dev, 1)) { + ret = rdma_copy_addr(dev_addr, dev, NULL); + break; + } + } + break; + default: + break; + } return ret; } EXPORT_SYMBOL(rdma_translate_ip); @@ -156,22 +175,37 @@ static void queue_req(struct addr_req *req) mutex_unlock(&lock); } -static void addr_send_arp(struct sockaddr_in *dst_in) +static void addr_send_arp(struct sockaddr *dst_in) { struct rtable *rt; struct flowi fl; - __be32 dst_ip = dst_in->sin_addr.s_addr; + struct dst_entry *dst; memset(&fl, 0, sizeof fl); - fl.nl_u.ip4_u.daddr = dst_ip; - if (ip_route_output_key(&init_net, &rt, &fl)) - return; + if (dst_in->sa_family == AF_INET) { + fl.nl_u.ip4_u.daddr = + ((struct sockaddr_in *) dst_in)->sin_addr.s_addr; - neigh_event_send(rt->u.dst.neighbour, NULL); - ip_rt_put(rt); + if (ip_route_output_key(&init_net, &rt, &fl)) + return; + + neigh_event_send(rt->u.dst.neighbour, NULL); + ip_rt_put(rt); + + } else { + fl.nl_u.ip6_u.daddr = + ((struct sockaddr_in6 *) dst_in)->sin6_addr; + + dst = ip6_route_output(&init_net, NULL, &fl); + if (!dst) + return; + + neigh_event_send(dst->neighbour, NULL); + dst_release(dst); + } } -static int addr_resolve_remote(struct sockaddr_in *src_in, +static int addr4_resolve_remote(struct sockaddr_in *src_in, struct sockaddr_in *dst_in, struct rdma_dev_addr *addr) { @@ -220,10 +254,51 @@ out: return ret; } +static int addr6_resolve_remote(struct sockaddr_in6 *src_in, + struct sockaddr_in6 *dst_in, + struct rdma_dev_addr *addr) +{ + struct flowi fl; + struct neighbour *neigh; + struct dst_entry *dst; + int ret = -ENODATA; + + memset(&fl, 0, sizeof fl); + fl.nl_u.ip6_u.daddr = dst_in->sin6_addr; + fl.nl_u.ip6_u.saddr = src_in->sin6_addr; + + dst = ip6_route_output(&init_net, NULL, &fl); + if (!dst) + return ret; + + if (dst->dev->flags & IFF_NOARP) { + ret = rdma_copy_addr(addr, dst->dev, NULL); + } else { + neigh = dst->neighbour; + if (neigh && (neigh->nud_state & NUD_VALID)) + ret = rdma_copy_addr(addr, neigh->dev, neigh->ha); + } + + dst_release(dst); + return ret; +} + +static int addr_resolve_remote(struct sockaddr *src_in, + struct sockaddr *dst_in, + struct rdma_dev_addr *addr) +{ + if (src_in->sa_family == AF_INET) { + return addr4_resolve_remote((struct sockaddr_in *) src_in, + (struct sockaddr_in *) dst_in, addr); + } else + return addr6_resolve_remote((struct sockaddr_in6 *) src_in, + (struct sockaddr_in6 *) dst_in, addr); +} + static void process_req(struct work_struct *work) { struct addr_req *req, *temp_req; - struct sockaddr_in *src_in, *dst_in; + struct sockaddr *src_in, *dst_in; struct list_head done_list; INIT_LIST_HEAD(&done_list); @@ -231,8 +306,8 @@ static void process_req(struct work_struct *work) mutex_lock(&lock); list_for_each_entry_safe(req, temp_req, &req_list, list) { if (req->status == -ENODATA) { - src_in = (struct sockaddr_in *) &req->src_addr; - dst_in = (struct sockaddr_in *) &req->dst_addr; + src_in = (struct sockaddr *) &req->src_addr; + dst_in = (struct sockaddr *) &req->dst_addr; req->status = addr_resolve_remote(src_in, dst_in, req->addr); if (req->status && time_after_eq(jiffies, req->timeout)) @@ -251,41 +326,72 @@ static void process_req(struct work_struct *work) list_for_each_entry_safe(req, temp_req, &done_list, list) { list_del(&req->list); - req->callback(req->status, &req->src_addr, req->addr, - req->context); + req->callback(req->status, (struct sockaddr *) &req->src_addr, + req->addr, req->context); put_client(req->client); kfree(req); } } -static int addr_resolve_local(struct sockaddr_in *src_in, - struct sockaddr_in *dst_in, +static int addr_resolve_local(struct sockaddr *src_in, + struct sockaddr *dst_in, struct rdma_dev_addr *addr) { struct net_device *dev; - __be32 src_ip = src_in->sin_addr.s_addr; - __be32 dst_ip = dst_in->sin_addr.s_addr; int ret; - dev = ip_dev_find(&init_net, dst_ip); - if (!dev) - return -EADDRNOTAVAIL; + if (dst_in->sa_family == AF_INET) { + __be32 src_ip = ((struct sockaddr_in *) src_in)->sin_addr.s_addr; + __be32 dst_ip = ((struct sockaddr_in *) dst_in)->sin_addr.s_addr; - if (ipv4_is_zeronet(src_ip)) { - src_in->sin_family = dst_in->sin_family; - src_in->sin_addr.s_addr = dst_ip; - ret = rdma_copy_addr(addr, dev, dev->dev_addr); - } else if (ipv4_is_loopback(src_ip)) { - ret = rdma_translate_ip((struct sockaddr *)dst_in, addr); - if (!ret) - memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN); + dev = ip_dev_find(&init_net, dst_ip); + if (!dev) + return -EADDRNOTAVAIL; + + if (ipv4_is_zeronet(src_ip)) { + src_in->sa_family = dst_in->sa_family; + ((struct sockaddr_in *) src_in)->sin_addr.s_addr = dst_ip; + ret = rdma_copy_addr(addr, dev, dev->dev_addr); + } else if (ipv4_is_loopback(src_ip)) { + ret = rdma_translate_ip(dst_in, addr); + if (!ret) + memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN); + } else { + ret = rdma_translate_ip(src_in, addr); + if (!ret) + memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN); + } + dev_put(dev); } else { - ret = rdma_translate_ip((struct sockaddr *)src_in, addr); - if (!ret) - memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN); + struct in6_addr *a; + + for_each_netdev(&init_net, dev) + if (ipv6_chk_addr(&init_net, + &((struct sockaddr_in6 *) addr)->sin6_addr, + dev, 1)) + break; + + if (!dev) + return -EADDRNOTAVAIL; + + a = &((struct sockaddr_in6 *) src_in)->sin6_addr; + + if (ipv6_addr_any(a)) { + src_in->sa_family = dst_in->sa_family; + ((struct sockaddr_in6 *) src_in)->sin6_addr = + ((struct sockaddr_in6 *) dst_in)->sin6_addr; + ret = rdma_copy_addr(addr, dev, dev->dev_addr); + } else if (ipv6_addr_loopback(a)) { + ret = rdma_translate_ip(dst_in, addr); + if (!ret) + memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN); + } else { + ret = rdma_translate_ip(src_in, addr); + if (!ret) + memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN); + } } - dev_put(dev); return ret; } @@ -296,7 +402,7 @@ int rdma_resolve_ip(struct rdma_addr_client *client, struct rdma_dev_addr *addr, void *context), void *context) { - struct sockaddr_in *src_in, *dst_in; + struct sockaddr *src_in, *dst_in; struct addr_req *req; int ret = 0; @@ -313,8 +419,8 @@ int rdma_resolve_ip(struct rdma_addr_client *client, req->client = client; atomic_inc(&client->refcount); - src_in = (struct sockaddr_in *) &req->src_addr; - dst_in = (struct sockaddr_in *) &req->dst_addr; + src_in = (struct sockaddr *) &req->src_addr; + dst_in = (struct sockaddr *) &req->dst_addr; req->status = addr_resolve_local(src_in, dst_in, addr); if (req->status == -EADDRNOTAVAIL) From 1f5175adeaa1d161f603ef351785a19814dfe900 Mon Sep 17 00:00:00 2001 From: Aleksey Senin Date: Wed, 24 Dec 2008 10:16:45 -0800 Subject: [PATCH 24/26] RDMA/cma: Add IPv6 support Handle AF_INET6 cases where required, and use struct sockaddr_storage wherever an IPv6 address might be stored. Signed-off-by: Aleksey Senin Signed-off-by: Roland Dreier --- drivers/infiniband/core/cma.c | 84 ++++++++++++++++++++++++----------- 1 file changed, 59 insertions(+), 25 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index d951896ff7fc..2a2e50871b40 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -42,6 +42,7 @@ #include #include +#include #include #include @@ -636,7 +637,12 @@ static inline int cma_zero_addr(struct sockaddr *addr) static inline int cma_loopback_addr(struct sockaddr *addr) { - return ipv4_is_loopback(((struct sockaddr_in *) addr)->sin_addr.s_addr); + if (addr->sa_family == AF_INET) + return ipv4_is_loopback( + ((struct sockaddr_in *) addr)->sin_addr.s_addr); + else + return ipv6_addr_loopback( + &((struct sockaddr_in6 *) addr)->sin6_addr); } static inline int cma_any_addr(struct sockaddr *addr) @@ -1467,10 +1473,10 @@ static void cma_listen_on_all(struct rdma_id_private *id_priv) static int cma_bind_any(struct rdma_cm_id *id, sa_family_t af) { - struct sockaddr_in addr_in; + struct sockaddr_storage addr_in; memset(&addr_in, 0, sizeof addr_in); - addr_in.sin_family = af; + addr_in.ss_family = af; return rdma_bind_addr(id, (struct sockaddr *) &addr_in); } @@ -2073,7 +2079,7 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) struct rdma_id_private *id_priv; int ret; - if (addr->sa_family != AF_INET) + if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6) return -EAFNOSUPPORT; id_priv = container_of(id, struct rdma_id_private, id); @@ -2113,31 +2119,59 @@ EXPORT_SYMBOL(rdma_bind_addr); static int cma_format_hdr(void *hdr, enum rdma_port_space ps, struct rdma_route *route) { - struct sockaddr_in *src4, *dst4; struct cma_hdr *cma_hdr; struct sdp_hh *sdp_hdr; - src4 = (struct sockaddr_in *) &route->addr.src_addr; - dst4 = (struct sockaddr_in *) &route->addr.dst_addr; + if (route->addr.src_addr.ss_family == AF_INET) { + struct sockaddr_in *src4, *dst4; - switch (ps) { - case RDMA_PS_SDP: - sdp_hdr = hdr; - if (sdp_get_majv(sdp_hdr->sdp_version) != SDP_MAJ_VERSION) - return -EINVAL; - sdp_set_ip_ver(sdp_hdr, 4); - sdp_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; - sdp_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; - sdp_hdr->port = src4->sin_port; - break; - default: - cma_hdr = hdr; - cma_hdr->cma_version = CMA_VERSION; - cma_set_ip_ver(cma_hdr, 4); - cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; - cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; - cma_hdr->port = src4->sin_port; - break; + src4 = (struct sockaddr_in *) &route->addr.src_addr; + dst4 = (struct sockaddr_in *) &route->addr.dst_addr; + + switch (ps) { + case RDMA_PS_SDP: + sdp_hdr = hdr; + if (sdp_get_majv(sdp_hdr->sdp_version) != SDP_MAJ_VERSION) + return -EINVAL; + sdp_set_ip_ver(sdp_hdr, 4); + sdp_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; + sdp_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; + sdp_hdr->port = src4->sin_port; + break; + default: + cma_hdr = hdr; + cma_hdr->cma_version = CMA_VERSION; + cma_set_ip_ver(cma_hdr, 4); + cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; + cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; + cma_hdr->port = src4->sin_port; + break; + } + } else { + struct sockaddr_in6 *src6, *dst6; + + src6 = (struct sockaddr_in6 *) &route->addr.src_addr; + dst6 = (struct sockaddr_in6 *) &route->addr.dst_addr; + + switch (ps) { + case RDMA_PS_SDP: + sdp_hdr = hdr; + if (sdp_get_majv(sdp_hdr->sdp_version) != SDP_MAJ_VERSION) + return -EINVAL; + sdp_set_ip_ver(sdp_hdr, 6); + sdp_hdr->src_addr.ip6 = src6->sin6_addr; + sdp_hdr->dst_addr.ip6 = dst6->sin6_addr; + sdp_hdr->port = src6->sin6_port; + break; + default: + cma_hdr = hdr; + cma_hdr->cma_version = CMA_VERSION; + cma_set_ip_ver(cma_hdr, 6); + cma_hdr->src_addr.ip6 = src6->sin6_addr; + cma_hdr->dst_addr.ip6 = dst6->sin6_addr; + cma_hdr->port = src6->sin6_port; + break; + } } return 0; } From e189062a8ca55b0a1843f0346c3fae1a47297c34 Mon Sep 17 00:00:00 2001 From: Faisal Latif Date: Wed, 24 Dec 2008 20:30:04 -0800 Subject: [PATCH 25/26] RDMA/nes: Remove tx_free_list There is no lock protecting tx_free_list thus causing a system crash when skb_dequeue() is called and the list is empty. Since it did not give any performance boost under heavy load, remove it to simplify the code. Replace get_free_pkt() with dev_alloc_skb() to allocate MAX_CM_BUFFER skb for connection establishment/teardown as well as MPA request/response. Signed-off-by: Faisal Latif Signed-off-by: Chien Tung Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.c | 76 ++---------------------------- drivers/infiniband/hw/nes/nes_cm.h | 5 +- 2 files changed, 6 insertions(+), 75 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index aa373c5f8d84..cb48041bed69 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -94,7 +94,6 @@ static int mini_cm_set(struct nes_cm_core *, u32, u32); static void form_cm_frame(struct sk_buff *, struct nes_cm_node *, void *, u32, void *, u32, u8); -static struct sk_buff *get_free_pkt(struct nes_cm_node *cm_node); static int add_ref_cm_node(struct nes_cm_node *); static int rem_ref_cm_node(struct nes_cm_core *, struct nes_cm_node *); @@ -355,7 +354,6 @@ static void print_core(struct nes_cm_core *core) nes_debug(NES_DBG_CM, "State : %u \n", core->state); - nes_debug(NES_DBG_CM, "Tx Free cnt : %u \n", skb_queue_len(&core->tx_free_list)); nes_debug(NES_DBG_CM, "Listen Nodes : %u \n", atomic_read(&core->listen_node_cnt)); nes_debug(NES_DBG_CM, "Active Nodes : %u \n", atomic_read(&core->node_cnt)); @@ -688,7 +686,7 @@ static int send_syn(struct nes_cm_node *cm_node, u32 sendack, optionssize += 1; if (!skb) - skb = get_free_pkt(cm_node); + skb = dev_alloc_skb(MAX_CM_BUFFER); if (!skb) { nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n"); return -1; @@ -713,7 +711,7 @@ static int send_reset(struct nes_cm_node *cm_node, struct sk_buff *skb) int flags = SET_RST | SET_ACK; if (!skb) - skb = get_free_pkt(cm_node); + skb = dev_alloc_skb(MAX_CM_BUFFER); if (!skb) { nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n"); return -1; @@ -734,7 +732,7 @@ static int send_ack(struct nes_cm_node *cm_node, struct sk_buff *skb) int ret; if (!skb) - skb = get_free_pkt(cm_node); + skb = dev_alloc_skb(MAX_CM_BUFFER); if (!skb) { nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n"); @@ -757,7 +755,7 @@ static int send_fin(struct nes_cm_node *cm_node, struct sk_buff *skb) /* if we didn't get a frame get one */ if (!skb) - skb = get_free_pkt(cm_node); + skb = dev_alloc_skb(MAX_CM_BUFFER); if (!skb) { nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n"); @@ -771,46 +769,6 @@ static int send_fin(struct nes_cm_node *cm_node, struct sk_buff *skb) } -/** - * get_free_pkt - */ -static struct sk_buff *get_free_pkt(struct nes_cm_node *cm_node) -{ - struct sk_buff *skb, *new_skb; - - /* check to see if we need to repopulate the free tx pkt queue */ - if (skb_queue_len(&cm_node->cm_core->tx_free_list) < NES_CM_FREE_PKT_LO_WATERMARK) { - while (skb_queue_len(&cm_node->cm_core->tx_free_list) < - cm_node->cm_core->free_tx_pkt_max) { - /* replace the frame we took, we won't get it back */ - new_skb = dev_alloc_skb(cm_node->cm_core->mtu); - BUG_ON(!new_skb); - /* add a replacement frame to the free tx list head */ - skb_queue_head(&cm_node->cm_core->tx_free_list, new_skb); - } - } - - skb = skb_dequeue(&cm_node->cm_core->tx_free_list); - - return skb; -} - - -/** - * make_hashkey - generate hash key from node tuple - */ -static inline int make_hashkey(u16 loc_port, nes_addr_t loc_addr, u16 rem_port, - nes_addr_t rem_addr) -{ - u32 hashkey = 0; - - hashkey = loc_addr + rem_addr + loc_port + rem_port; - hashkey = (hashkey % NES_CM_HASHTABLE_SIZE); - - return hashkey; -} - - /** * find_node - find a cm node that matches the reference cm node */ @@ -818,13 +776,9 @@ static struct nes_cm_node *find_node(struct nes_cm_core *cm_core, u16 rem_port, nes_addr_t rem_addr, u16 loc_port, nes_addr_t loc_addr) { unsigned long flags; - u32 hashkey; struct list_head *hte; struct nes_cm_node *cm_node; - /* make a hash index key for this packet */ - hashkey = make_hashkey(loc_port, loc_addr, rem_port, rem_addr); - /* get a handle on the hte */ hte = &cm_core->connected_nodes; @@ -892,7 +846,6 @@ static struct nes_cm_listener *find_listener(struct nes_cm_core *cm_core, static int add_hte_node(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node) { unsigned long flags; - u32 hashkey; struct list_head *hte; if (!cm_node || !cm_core) @@ -901,11 +854,6 @@ static int add_hte_node(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node nes_debug(NES_DBG_CM, "Adding Node %p to Active Connection HT\n", cm_node); - /* first, make an index into our hash table */ - hashkey = make_hashkey(cm_node->loc_port, cm_node->loc_addr, - cm_node->rem_port, cm_node->rem_addr); - cm_node->hashkey = hashkey; - spin_lock_irqsave(&cm_core->ht_lock, flags); /* get a handle on the hash table element (list head for this slot) */ @@ -2198,10 +2146,7 @@ static int mini_cm_recv_pkt(struct nes_cm_core *cm_core, */ static struct nes_cm_core *nes_cm_alloc_core(void) { - int i; - struct nes_cm_core *cm_core; - struct sk_buff *skb = NULL; /* setup the CM core */ /* alloc top level core control structure */ @@ -2219,19 +2164,6 @@ static struct nes_cm_core *nes_cm_alloc_core(void) atomic_set(&cm_core->events_posted, 0); - /* init the packet lists */ - skb_queue_head_init(&cm_core->tx_free_list); - - for (i = 0; i < NES_CM_DEFAULT_FRAME_CNT; i++) { - skb = dev_alloc_skb(cm_core->mtu); - if (!skb) { - kfree(cm_core); - return NULL; - } - /* add 'raw' skb to free frame list */ - skb_queue_head(&cm_core->tx_free_list, skb); - } - cm_core->api = &nes_cm_api; spin_lock_init(&cm_core->ht_lock); diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h index 3a20a7883976..fafa35042ebd 100644 --- a/drivers/infiniband/hw/nes/nes_cm.h +++ b/drivers/infiniband/hw/nes/nes_cm.h @@ -165,6 +165,8 @@ struct nes_timer_entry { #define NES_CM_DEF_SEQ2 0x18ed5740 #define NES_CM_DEF_LOCAL_ID2 0xb807 +#define MAX_CM_BUFFER 512 + typedef u32 nes_addr_t; @@ -258,8 +260,6 @@ struct nes_cm_listener { /* per connection node and node state information */ struct nes_cm_node { - u32 hashkey; - nes_addr_t loc_addr, rem_addr; u16 loc_port, rem_port; @@ -357,7 +357,6 @@ struct nes_cm_core { u32 mtu; u32 free_tx_pkt_max; u32 rx_pkt_posted; - struct sk_buff_head tx_free_list; atomic_t ht_node_cnt; struct list_head connected_nodes; /* struct list_head hashtable[NES_CM_HASHTABLE_SIZE]; */ From 7798dbf40ad9d295aa3a02eca700168e4327239a Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Wed, 24 Dec 2008 20:32:42 -0800 Subject: [PATCH 26/26] IB/mlx4: Set ownership bit correctly when copying CQEs during CQ resize When resizing a CQ, when copying over unpolled CQEs from the old CQE buffer to the new buffer, the ownership bit must be set appropriately for the new buffer, or the ownership bit in the new buffer gets corrupted. Signed-off-by: Jack Morgenstein Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/cq.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index 2198753bf13d..8415ecce5c4c 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -325,15 +325,17 @@ static int mlx4_ib_get_outstanding_cqes(struct mlx4_ib_cq *cq) static void mlx4_ib_cq_resize_copy_cqes(struct mlx4_ib_cq *cq) { - struct mlx4_cqe *cqe; + struct mlx4_cqe *cqe, *new_cqe; int i; i = cq->mcq.cons_index; cqe = get_cqe(cq, i & cq->ibcq.cqe); while ((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != MLX4_CQE_OPCODE_RESIZE) { - memcpy(get_cqe_from_buf(&cq->resize_buf->buf, - (i + 1) & cq->resize_buf->cqe), - get_cqe(cq, i & cq->ibcq.cqe), sizeof(struct mlx4_cqe)); + new_cqe = get_cqe_from_buf(&cq->resize_buf->buf, + (i + 1) & cq->resize_buf->cqe); + memcpy(new_cqe, get_cqe(cq, i & cq->ibcq.cqe), sizeof(struct mlx4_cqe)); + new_cqe->owner_sr_opcode = (cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK) | + (((i + 1) & (cq->resize_buf->cqe + 1)) ? MLX4_CQE_OWNER_MASK : 0); cqe = get_cqe(cq, ++i & cq->ibcq.cqe); } ++cq->mcq.cons_index;