linux/drivers/infiniband/core/cm.c

4537 lines
129 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
* Copyright (c) 2004-2007 Intel Corporation. All rights reserved.
* Copyright (c) 2004 Topspin Corporation. All rights reserved.
* Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved.
* Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2019, Mellanox Technologies inc. All rights reserved.
*/
#include <linux/completion.h>
#include <linux/dma-mapping.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/idr.h>
#include <linux/interrupt.h>
#include <linux/random.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 16:04:11 +08:00
#include <linux/slab.h>
#include <linux/sysfs.h>
#include <linux/workqueue.h>
#include <linux/kdev_t.h>
IB/core: Ethernet L2 attributes in verbs/cm structures This patch add the support for Ethernet L2 attributes in the verbs/cm/cma structures. When dealing with L2 Ethernet, we should use smac, dmac, vlan ID and priority in a similar manner that the IB L2 (and the L4 PKEY) attributes are used. Thus, those attributes were added to the following structures: * ib_ah_attr - added dmac * ib_qp_attr - added smac and vlan_id, (sl remains vlan priority) * ib_wc - added smac, vlan_id * ib_sa_path_rec - added smac, dmac, vlan_id * cm_av - added smac and vlan_id For the path record structure, extra care was taken to avoid the new fields when packing it into wire format, so we don't break the IB CM and SA wire protocol. On the active side, the CM fills. its internal structures from the path provided by the ULP. We add there taking the ETH L2 attributes and placing them into the CM Address Handle (struct cm_av). On the passive side, the CM fills its internal structures from the WC associated with the REQ message. We add there taking the ETH L2 attributes from the WC. When the HW driver provides the required ETH L2 attributes in the WC, they set the IB_WC_WITH_SMAC and IB_WC_WITH_VLAN flags. The IB core code checks for the presence of these flags, and in their absence does address resolution from the ib_init_ah_from_wc() helper function. ib_modify_qp_is_ok is also updated to consider the link layer. Some parameters are mandatory for Ethernet link layer, while they are irrelevant for IB. Vendor drivers are modified to support the new function signature. Signed-off-by: Matan Barak <matanb@mellanox.com> Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com> Signed-off-by: Roland Dreier <roland@purestorage.com>
2013-12-13 00:03:11 +08:00
#include <linux/etherdevice.h>
#include <rdma/ib_cache.h>
#include <rdma/ib_cm.h>
#include <rdma/ib_sysfs.h>
#include "cm_msgs.h"
RDMA/cm: Move debug counters to be under relevant IB device The sysfs layout is created by CM incorrectly presented RDMA devices with InfiniBand link layer. Layout of such devices represents device tree of connections. By moving CM statistics to be under relevant port of IB device, we will fix the following issues: * Symlink name - It used device name instead of specific identifier. * Target location - It was supposed to point to PCI-ID/infiniband_cm/ instead of PCI-ID/infiniband/ * Target name - It created extra device file under already existing device folder, e.g. mlx5_0/mlx5_0 * Crash during boot with RDMA persistent naming patches. sysfs: cannot create duplicate filename '/class/infiniband_cm/mlx5_0' CPU: 29 PID: 433 Comm: modprobe Not tainted 5.0.0-rc5+ #178 Call Trace: dump_stack+0xcc/0x180 sysfs_warn_dup.cold.3+0x17/0x2d sysfs_do_create_link_sd.isra.2+0xd0/0xf0 device_add+0x7cb/0x1450 device_create_groups_vargs+0x1ae/0x220 device_create+0x93/0xc0 cm_add_one+0x38f/0xf60 [ib_cm] add_client_context+0x167/0x210 [ib_core] enable_device_and_get+0x230/0x3f0 [ib_core] ib_register_device+0x823/0xbf0 [ib_core] __mlx5_ib_add+0x45/0x150 [mlx5_ib] mlx5_ib_add+0x1b3/0x5e0 [mlx5_ib] mlx5_add_device+0x130/0x3a0 [mlx5_core] mlx5_register_interface+0x1a9/0x270 [mlx5_core] do_one_initcall+0x14f/0x5de do_init_module+0x247/0x7c0 load_module+0x4c2f/0x60d0 entry_SYSCALL_64_after_hwframe+0x49/0xbe After this change: [leonro@server ~]$ ls -al /sys/class/infiniband/ibp0s12f0/ports/1/ drwxr-xr-x 2 root root 0 Mar 11 11:17 cm_rx_duplicates drwxr-xr-x 2 root root 0 Mar 11 11:17 cm_rx_msgs drwxr-xr-x 2 root root 0 Mar 11 11:17 cm_tx_msgs drwxr-xr-x 2 root root 0 Mar 11 11:17 cm_tx_retries Fixes: 110cf374a809 ("infiniband: make cm_device use a struct device and not a kobject.") Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2019-03-11 20:40:31 +08:00
#include "core_priv.h"
#include "cm_trace.h"
MODULE_AUTHOR("Sean Hefty");
MODULE_DESCRIPTION("InfiniBand CM");
MODULE_LICENSE("Dual BSD/GPL");
static const char * const ibcm_rej_reason_strs[] = {
[IB_CM_REJ_NO_QP] = "no QP",
[IB_CM_REJ_NO_EEC] = "no EEC",
[IB_CM_REJ_NO_RESOURCES] = "no resources",
[IB_CM_REJ_TIMEOUT] = "timeout",
[IB_CM_REJ_UNSUPPORTED] = "unsupported",
[IB_CM_REJ_INVALID_COMM_ID] = "invalid comm ID",
[IB_CM_REJ_INVALID_COMM_INSTANCE] = "invalid comm instance",
[IB_CM_REJ_INVALID_SERVICE_ID] = "invalid service ID",
[IB_CM_REJ_INVALID_TRANSPORT_TYPE] = "invalid transport type",
[IB_CM_REJ_STALE_CONN] = "stale conn",
[IB_CM_REJ_RDC_NOT_EXIST] = "RDC not exist",
[IB_CM_REJ_INVALID_GID] = "invalid GID",
[IB_CM_REJ_INVALID_LID] = "invalid LID",
[IB_CM_REJ_INVALID_SL] = "invalid SL",
[IB_CM_REJ_INVALID_TRAFFIC_CLASS] = "invalid traffic class",
[IB_CM_REJ_INVALID_HOP_LIMIT] = "invalid hop limit",
[IB_CM_REJ_INVALID_PACKET_RATE] = "invalid packet rate",
[IB_CM_REJ_INVALID_ALT_GID] = "invalid alt GID",
[IB_CM_REJ_INVALID_ALT_LID] = "invalid alt LID",
[IB_CM_REJ_INVALID_ALT_SL] = "invalid alt SL",
[IB_CM_REJ_INVALID_ALT_TRAFFIC_CLASS] = "invalid alt traffic class",
[IB_CM_REJ_INVALID_ALT_HOP_LIMIT] = "invalid alt hop limit",
[IB_CM_REJ_INVALID_ALT_PACKET_RATE] = "invalid alt packet rate",
[IB_CM_REJ_PORT_CM_REDIRECT] = "port CM redirect",
[IB_CM_REJ_PORT_REDIRECT] = "port redirect",
[IB_CM_REJ_INVALID_MTU] = "invalid MTU",
[IB_CM_REJ_INSUFFICIENT_RESP_RESOURCES] = "insufficient resp resources",
[IB_CM_REJ_CONSUMER_DEFINED] = "consumer defined",
[IB_CM_REJ_INVALID_RNR_RETRY] = "invalid RNR retry",
[IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID] = "duplicate local comm ID",
[IB_CM_REJ_INVALID_CLASS_VERSION] = "invalid class version",
[IB_CM_REJ_INVALID_FLOW_LABEL] = "invalid flow label",
[IB_CM_REJ_INVALID_ALT_FLOW_LABEL] = "invalid alt flow label",
[IB_CM_REJ_VENDOR_OPTION_NOT_SUPPORTED] =
"vendor option is not supported",
};
const char *__attribute_const__ ibcm_reject_msg(int reason)
{
size_t index = reason;
if (index < ARRAY_SIZE(ibcm_rej_reason_strs) &&
ibcm_rej_reason_strs[index])
return ibcm_rej_reason_strs[index];
else
return "unrecognized reason";
}
EXPORT_SYMBOL(ibcm_reject_msg);
struct cm_id_private;
struct cm_work;
static int cm_add_one(struct ib_device *device);
static void cm_remove_one(struct ib_device *device, void *client_data);
static void cm_process_work(struct cm_id_private *cm_id_priv,
struct cm_work *work);
static int cm_send_sidr_rep_locked(struct cm_id_private *cm_id_priv,
struct ib_cm_sidr_rep_param *param);
static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv,
const void *private_data, u8 private_data_len);
static int cm_send_drep_locked(struct cm_id_private *cm_id_priv,
void *private_data, u8 private_data_len);
static int cm_send_rej_locked(struct cm_id_private *cm_id_priv,
enum ib_cm_rej_reason reason, void *ari,
u8 ari_length, const void *private_data,
u8 private_data_len);
static struct ib_client cm_client = {
.name = "cm",
.add = cm_add_one,
.remove = cm_remove_one
};
static struct ib_cm {
spinlock_t lock;
struct list_head device_list;
rwlock_t device_lock;
struct rb_root listen_service_table;
u64 listen_service_id;
/* struct rb_root peer_service_table; todo: fix peer to peer */
struct rb_root remote_qp_table;
struct rb_root remote_id_table;
struct rb_root remote_sidr_table;
struct xarray local_id_table;
u32 local_id_next;
__be32 random_id_operand;
struct list_head timewait_list;
struct workqueue_struct *wq;
} cm;
/* Counter indexes ordered by attribute ID */
enum {
CM_REQ_COUNTER,
CM_MRA_COUNTER,
CM_REJ_COUNTER,
CM_REP_COUNTER,
CM_RTU_COUNTER,
CM_DREQ_COUNTER,
CM_DREP_COUNTER,
CM_SIDR_REQ_COUNTER,
CM_SIDR_REP_COUNTER,
CM_LAP_COUNTER,
CM_APR_COUNTER,
CM_ATTR_COUNT,
CM_ATTR_ID_OFFSET = 0x0010,
};
enum {
CM_XMIT,
CM_XMIT_RETRIES,
CM_RECV,
CM_RECV_DUPLICATES,
CM_COUNTER_GROUPS
};
struct cm_counter_attribute {
struct ib_port_attribute attr;
unsigned short group;
unsigned short index;
};
struct cm_port {
struct cm_device *cm_dev;
struct ib_mad_agent *mad_agent;
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 15:04:20 +08:00
u32 port_num;
atomic_long_t counters[CM_COUNTER_GROUPS][CM_ATTR_COUNT];
};
struct cm_device {
struct kref kref;
struct list_head list;
spinlock_t mad_agent_lock;
struct ib_device *ib_device;
u8 ack_delay;
int going_down;
struct cm_port *port[];
};
struct cm_av {
struct cm_port *port;
struct rdma_ah_attr ah_attr;
u16 dlid_datapath;
u16 pkey_index;
u8 timeout;
};
struct cm_work {
struct delayed_work work;
struct list_head list;
struct cm_port *port;
struct ib_mad_recv_wc *mad_recv_wc; /* Received MADs */
__be32 local_id; /* Established / timewait */
__be32 remote_id;
struct ib_cm_event cm_event;
struct sa_path_rec path[];
};
struct cm_timewait_info {
struct cm_work work;
struct list_head list;
struct rb_node remote_qp_node;
struct rb_node remote_id_node;
__be64 remote_ca_guid;
__be32 remote_qpn;
u8 inserted_remote_qp;
u8 inserted_remote_id;
};
struct cm_id_private {
struct ib_cm_id id;
struct rb_node service_node;
struct rb_node sidr_id_node;
u32 sidr_slid;
spinlock_t lock; /* Do not acquire inside cm.lock */
struct completion comp;
refcount_t refcount;
/* Number of clients sharing this ib_cm_id. Only valid for listeners.
* Protected by the cm.lock spinlock.
*/
int listen_sharecount;
struct rcu_head rcu;
struct ib_mad_send_buf *msg;
struct cm_timewait_info *timewait_info;
/* todo: use alternate port on send failure */
struct cm_av av;
struct cm_av alt_av;
void *private_data;
__be64 tid;
__be32 local_qpn;
__be32 remote_qpn;
enum ib_qp_type qp_type;
__be32 sq_psn;
__be32 rq_psn;
int timeout_ms;
enum ib_mtu path_mtu;
__be16 pkey;
u8 private_data_len;
u8 max_cm_retries;
u8 responder_resources;
u8 initiator_depth;
u8 retry_count;
u8 rnr_retry_count;
u8 service_timeout;
u8 target_ack_delay;
struct list_head work_list;
atomic_t work_count;
struct rdma_ucm_ece ece;
};
static void cm_dev_release(struct kref *kref)
{
struct cm_device *cm_dev = container_of(kref, struct cm_device, kref);
u32 i;
rdma_for_each_port(cm_dev->ib_device, i)
kfree(cm_dev->port[i - 1]);
kfree(cm_dev);
}
static void cm_device_put(struct cm_device *cm_dev)
{
kref_put(&cm_dev->kref, cm_dev_release);
}
static void cm_work_handler(struct work_struct *work);
static inline void cm_deref_id(struct cm_id_private *cm_id_priv)
{
if (refcount_dec_and_test(&cm_id_priv->refcount))
complete(&cm_id_priv->comp);
}
static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
{
struct ib_mad_agent *mad_agent;
struct ib_mad_send_buf *m;
struct ib_ah *ah;
lockdep_assert_held(&cm_id_priv->lock);
if (!cm_id_priv->av.port)
return ERR_PTR(-EINVAL);
spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
mad_agent = cm_id_priv->av.port->mad_agent;
if (!mad_agent) {
m = ERR_PTR(-EINVAL);
goto out;
}
ah = rdma_create_ah(mad_agent->qp->pd, &cm_id_priv->av.ah_attr, 0);
if (IS_ERR(ah)) {
m = ERR_CAST(ah);
goto out;
}
m = ib_create_send_mad(mad_agent, cm_id_priv->id.remote_cm_qpn,
cm_id_priv->av.pkey_index,
0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
GFP_ATOMIC,
IB_MGMT_BASE_VERSION);
if (IS_ERR(m)) {
rdma_destroy_ah(ah, 0);
goto out;
}
/* Timeout set by caller if response is expected. */
m->ah = ah;
m->retries = cm_id_priv->max_cm_retries;
refcount_inc(&cm_id_priv->refcount);
m->context[0] = cm_id_priv;
out:
spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
return m;
}
static void cm_free_msg(struct ib_mad_send_buf *msg)
{
struct cm_id_private *cm_id_priv = msg->context[0];
if (msg->ah)
rdma_destroy_ah(msg->ah, 0);
cm_deref_id(cm_id_priv);
ib_free_send_mad(msg);
}
static struct ib_mad_send_buf *
cm_alloc_priv_msg(struct cm_id_private *cm_id_priv)
{
struct ib_mad_send_buf *msg;
lockdep_assert_held(&cm_id_priv->lock);
msg = cm_alloc_msg(cm_id_priv);
if (IS_ERR(msg))
return msg;
cm_id_priv->msg = msg;
return msg;
}
static void cm_free_priv_msg(struct ib_mad_send_buf *msg)
{
struct cm_id_private *cm_id_priv = msg->context[0];
lockdep_assert_held(&cm_id_priv->lock);
if (!WARN_ON(cm_id_priv->msg != msg))
cm_id_priv->msg = NULL;
if (msg->ah)
rdma_destroy_ah(msg->ah, 0);
cm_deref_id(cm_id_priv);
ib_free_send_mad(msg);
}
static struct ib_mad_send_buf *cm_alloc_response_msg_no_ah(struct cm_port *port,
struct ib_mad_recv_wc *mad_recv_wc)
{
return ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index,
0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
GFP_ATOMIC,
IB_MGMT_BASE_VERSION);
}
static int cm_create_response_msg_ah(struct cm_port *port,
struct ib_mad_recv_wc *mad_recv_wc,
struct ib_mad_send_buf *msg)
{
struct ib_ah *ah;
ah = ib_create_ah_from_wc(port->mad_agent->qp->pd, mad_recv_wc->wc,
mad_recv_wc->recv_buf.grh, port->port_num);
if (IS_ERR(ah))
return PTR_ERR(ah);
msg->ah = ah;
return 0;
}
static int cm_alloc_response_msg(struct cm_port *port,
struct ib_mad_recv_wc *mad_recv_wc,
struct ib_mad_send_buf **msg)
{
struct ib_mad_send_buf *m;
int ret;
m = cm_alloc_response_msg_no_ah(port, mad_recv_wc);
if (IS_ERR(m))
return PTR_ERR(m);
ret = cm_create_response_msg_ah(port, mad_recv_wc, m);
if (ret) {
ib_free_send_mad(m);
return ret;
}
*msg = m;
return 0;
}
static void cm_free_response_msg(struct ib_mad_send_buf *msg)
{
if (msg->ah)
rdma_destroy_ah(msg->ah, 0);
ib_free_send_mad(msg);
}
static void *cm_copy_private_data(const void *private_data, u8 private_data_len)
{
void *data;
if (!private_data || !private_data_len)
return NULL;
data = kmemdup(private_data, private_data_len, GFP_KERNEL);
if (!data)
return ERR_PTR(-ENOMEM);
return data;
}
static void cm_set_private_data(struct cm_id_private *cm_id_priv,
void *private_data, u8 private_data_len)
{
if (cm_id_priv->private_data && cm_id_priv->private_data_len)
kfree(cm_id_priv->private_data);
cm_id_priv->private_data = private_data;
cm_id_priv->private_data_len = private_data_len;
}
static void cm_set_av_port(struct cm_av *av, struct cm_port *port)
{
struct cm_port *old_port = av->port;
if (old_port == port)
return;
av->port = port;
if (old_port)
cm_device_put(old_port->cm_dev);
if (port)
kref_get(&port->cm_dev->kref);
}
static void cm_init_av_for_lap(struct cm_port *port, struct ib_wc *wc,
struct rdma_ah_attr *ah_attr, struct cm_av *av)
{
cm_set_av_port(av, port);
av->pkey_index = wc->pkey_index;
rdma_move_ah_attr(&av->ah_attr, ah_attr);
}
static int cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc,
struct ib_grh *grh, struct cm_av *av)
{
cm_set_av_port(av, port);
av->pkey_index = wc->pkey_index;
return ib_init_ah_attr_from_wc(port->cm_dev->ib_device,
port->port_num, wc,
grh, &av->ah_attr);
}
static struct cm_port *
get_cm_port_from_path(struct sa_path_rec *path, const struct ib_gid_attr *attr)
{
struct cm_device *cm_dev;
struct cm_port *port = NULL;
unsigned long flags;
if (attr) {
read_lock_irqsave(&cm.device_lock, flags);
list_for_each_entry(cm_dev, &cm.device_list, list) {
if (cm_dev->ib_device == attr->device) {
port = cm_dev->port[attr->port_num - 1];
break;
}
}
read_unlock_irqrestore(&cm.device_lock, flags);
} else {
/* SGID attribute can be NULL in following
* conditions.
* (a) Alternative path
* (b) IB link layer without GRH
* (c) LAP send messages
*/
read_lock_irqsave(&cm.device_lock, flags);
list_for_each_entry(cm_dev, &cm.device_list, list) {
attr = rdma_find_gid(cm_dev->ib_device,
&path->sgid,
sa_conv_pathrec_to_gid_type(path),
NULL);
if (!IS_ERR(attr)) {
port = cm_dev->port[attr->port_num - 1];
break;
}
}
read_unlock_irqrestore(&cm.device_lock, flags);
if (port)
rdma_put_gid_attr(attr);
}
return port;
}
static int cm_init_av_by_path(struct sa_path_rec *path,
const struct ib_gid_attr *sgid_attr,
struct cm_av *av)
{
struct rdma_ah_attr new_ah_attr;
struct cm_device *cm_dev;
struct cm_port *port;
int ret;
port = get_cm_port_from_path(path, sgid_attr);
if (!port)
return -EINVAL;
cm_dev = port->cm_dev;
ret = ib_find_cached_pkey(cm_dev->ib_device, port->port_num,
be16_to_cpu(path->pkey), &av->pkey_index);
if (ret)
return ret;
cm_set_av_port(av, port);
/*
* av->ah_attr might be initialized based on wc or during
* request processing time which might have reference to sgid_attr.
* So initialize a new ah_attr on stack.
* If initialization fails, old ah_attr is used for sending any
* responses. If initialization is successful, than new ah_attr
* is used by overwriting the old one. So that right ah_attr
* can be used to return an error response.
*/
ret = ib_init_ah_attr_from_path(cm_dev->ib_device, port->port_num, path,
&new_ah_attr, sgid_attr);
if (ret)
return ret;
av->timeout = path->packet_life_time + 1;
rdma_move_ah_attr(&av->ah_attr, &new_ah_attr);
return 0;
}
/* Move av created by cm_init_av_by_path(), so av.dgid is not moved */
static void cm_move_av_from_path(struct cm_av *dest, struct cm_av *src)
{
cm_set_av_port(dest, src->port);
cm_set_av_port(src, NULL);
dest->pkey_index = src->pkey_index;
rdma_move_ah_attr(&dest->ah_attr, &src->ah_attr);
dest->timeout = src->timeout;
}
static void cm_destroy_av(struct cm_av *av)
{
rdma_destroy_ah_attr(&av->ah_attr);
cm_set_av_port(av, NULL);
}
static u32 cm_local_id(__be32 local_id)
{
return (__force u32) (local_id ^ cm.random_id_operand);
}
static struct cm_id_private *cm_acquire_id(__be32 local_id, __be32 remote_id)
{
struct cm_id_private *cm_id_priv;
rcu_read_lock();
cm_id_priv = xa_load(&cm.local_id_table, cm_local_id(local_id));
if (!cm_id_priv || cm_id_priv->id.remote_id != remote_id ||
!refcount_inc_not_zero(&cm_id_priv->refcount))
cm_id_priv = NULL;
rcu_read_unlock();
return cm_id_priv;
}
/*
* Trivial helpers to strip endian annotation and compare; the
* endianness doesn't actually matter since we just need a stable
* order for the RB tree.
*/
static int be32_lt(__be32 a, __be32 b)
{
return (__force u32) a < (__force u32) b;
}
static int be32_gt(__be32 a, __be32 b)
{
return (__force u32) a > (__force u32) b;
}
static int be64_lt(__be64 a, __be64 b)
{
return (__force u64) a < (__force u64) b;
}
static int be64_gt(__be64 a, __be64 b)
{
return (__force u64) a > (__force u64) b;
}
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
/*
* Inserts a new cm_id_priv into the listen_service_table. Returns cm_id_priv
* if the new ID was inserted, NULL if it could not be inserted due to a
* collision, or the existing cm_id_priv ready for shared usage.
*/
static struct cm_id_private *cm_insert_listen(struct cm_id_private *cm_id_priv,
ib_cm_handler shared_handler)
{
struct rb_node **link = &cm.listen_service_table.rb_node;
struct rb_node *parent = NULL;
struct cm_id_private *cur_cm_id_priv;
__be64 service_id = cm_id_priv->id.service_id;
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
unsigned long flags;
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
spin_lock_irqsave(&cm.lock, flags);
while (*link) {
parent = *link;
cur_cm_id_priv = rb_entry(parent, struct cm_id_private,
service_node);
if (cm_id_priv->id.device < cur_cm_id_priv->id.device)
link = &(*link)->rb_left;
else if (cm_id_priv->id.device > cur_cm_id_priv->id.device)
link = &(*link)->rb_right;
else if (be64_lt(service_id, cur_cm_id_priv->id.service_id))
link = &(*link)->rb_left;
else if (be64_gt(service_id, cur_cm_id_priv->id.service_id))
link = &(*link)->rb_right;
else {
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
/*
* Sharing an ib_cm_id with different handlers is not
* supported
*/
if (cur_cm_id_priv->id.cm_handler != shared_handler ||
cur_cm_id_priv->id.context ||
WARN_ON(!cur_cm_id_priv->id.cm_handler)) {
spin_unlock_irqrestore(&cm.lock, flags);
return NULL;
}
refcount_inc(&cur_cm_id_priv->refcount);
cur_cm_id_priv->listen_sharecount++;
spin_unlock_irqrestore(&cm.lock, flags);
return cur_cm_id_priv;
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
}
}
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
cm_id_priv->listen_sharecount++;
rb_link_node(&cm_id_priv->service_node, parent, link);
rb_insert_color(&cm_id_priv->service_node, &cm.listen_service_table);
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
spin_unlock_irqrestore(&cm.lock, flags);
return cm_id_priv;
}
static struct cm_id_private *cm_find_listen(struct ib_device *device,
__be64 service_id)
{
struct rb_node *node = cm.listen_service_table.rb_node;
struct cm_id_private *cm_id_priv;
while (node) {
cm_id_priv = rb_entry(node, struct cm_id_private, service_node);
if (device < cm_id_priv->id.device)
node = node->rb_left;
else if (device > cm_id_priv->id.device)
node = node->rb_right;
else if (be64_lt(service_id, cm_id_priv->id.service_id))
node = node->rb_left;
else if (be64_gt(service_id, cm_id_priv->id.service_id))
node = node->rb_right;
else {
refcount_inc(&cm_id_priv->refcount);
return cm_id_priv;
}
}
return NULL;
}
static struct cm_timewait_info *
cm_insert_remote_id(struct cm_timewait_info *timewait_info)
{
struct rb_node **link = &cm.remote_id_table.rb_node;
struct rb_node *parent = NULL;
struct cm_timewait_info *cur_timewait_info;
__be64 remote_ca_guid = timewait_info->remote_ca_guid;
__be32 remote_id = timewait_info->work.remote_id;
while (*link) {
parent = *link;
cur_timewait_info = rb_entry(parent, struct cm_timewait_info,
remote_id_node);
if (be32_lt(remote_id, cur_timewait_info->work.remote_id))
link = &(*link)->rb_left;
else if (be32_gt(remote_id, cur_timewait_info->work.remote_id))
link = &(*link)->rb_right;
else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
link = &(*link)->rb_left;
else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
link = &(*link)->rb_right;
else
return cur_timewait_info;
}
timewait_info->inserted_remote_id = 1;
rb_link_node(&timewait_info->remote_id_node, parent, link);
rb_insert_color(&timewait_info->remote_id_node, &cm.remote_id_table);
return NULL;
}
static struct cm_id_private *cm_find_remote_id(__be64 remote_ca_guid,
__be32 remote_id)
{
struct rb_node *node = cm.remote_id_table.rb_node;
struct cm_timewait_info *timewait_info;
struct cm_id_private *res = NULL;
spin_lock_irq(&cm.lock);
while (node) {
timewait_info = rb_entry(node, struct cm_timewait_info,
remote_id_node);
if (be32_lt(remote_id, timewait_info->work.remote_id))
node = node->rb_left;
else if (be32_gt(remote_id, timewait_info->work.remote_id))
node = node->rb_right;
else if (be64_lt(remote_ca_guid, timewait_info->remote_ca_guid))
node = node->rb_left;
else if (be64_gt(remote_ca_guid, timewait_info->remote_ca_guid))
node = node->rb_right;
else {
res = cm_acquire_id(timewait_info->work.local_id,
timewait_info->work.remote_id);
break;
}
}
spin_unlock_irq(&cm.lock);
return res;
}
static struct cm_timewait_info *
cm_insert_remote_qpn(struct cm_timewait_info *timewait_info)
{
struct rb_node **link = &cm.remote_qp_table.rb_node;
struct rb_node *parent = NULL;
struct cm_timewait_info *cur_timewait_info;
__be64 remote_ca_guid = timewait_info->remote_ca_guid;
__be32 remote_qpn = timewait_info->remote_qpn;
while (*link) {
parent = *link;
cur_timewait_info = rb_entry(parent, struct cm_timewait_info,
remote_qp_node);
if (be32_lt(remote_qpn, cur_timewait_info->remote_qpn))
link = &(*link)->rb_left;
else if (be32_gt(remote_qpn, cur_timewait_info->remote_qpn))
link = &(*link)->rb_right;
else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
link = &(*link)->rb_left;
else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
link = &(*link)->rb_right;
else
return cur_timewait_info;
}
timewait_info->inserted_remote_qp = 1;
rb_link_node(&timewait_info->remote_qp_node, parent, link);
rb_insert_color(&timewait_info->remote_qp_node, &cm.remote_qp_table);
return NULL;
}
static struct cm_id_private *
cm_insert_remote_sidr(struct cm_id_private *cm_id_priv)
{
struct rb_node **link = &cm.remote_sidr_table.rb_node;
struct rb_node *parent = NULL;
struct cm_id_private *cur_cm_id_priv;
__be32 remote_id = cm_id_priv->id.remote_id;
while (*link) {
parent = *link;
cur_cm_id_priv = rb_entry(parent, struct cm_id_private,
sidr_id_node);
if (be32_lt(remote_id, cur_cm_id_priv->id.remote_id))
link = &(*link)->rb_left;
else if (be32_gt(remote_id, cur_cm_id_priv->id.remote_id))
link = &(*link)->rb_right;
else {
if (cur_cm_id_priv->sidr_slid < cm_id_priv->sidr_slid)
link = &(*link)->rb_left;
else if (cur_cm_id_priv->sidr_slid > cm_id_priv->sidr_slid)
link = &(*link)->rb_right;
else
return cur_cm_id_priv;
}
}
rb_link_node(&cm_id_priv->sidr_id_node, parent, link);
rb_insert_color(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
return NULL;
}
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
static struct cm_id_private *cm_alloc_id_priv(struct ib_device *device,
ib_cm_handler cm_handler,
void *context)
{
struct cm_id_private *cm_id_priv;
u32 id;
int ret;
cm_id_priv = kzalloc(sizeof *cm_id_priv, GFP_KERNEL);
if (!cm_id_priv)
return ERR_PTR(-ENOMEM);
cm_id_priv->id.state = IB_CM_IDLE;
cm_id_priv->id.device = device;
cm_id_priv->id.cm_handler = cm_handler;
cm_id_priv->id.context = context;
cm_id_priv->id.remote_cm_qpn = 1;
RB_CLEAR_NODE(&cm_id_priv->service_node);
RB_CLEAR_NODE(&cm_id_priv->sidr_id_node);
spin_lock_init(&cm_id_priv->lock);
init_completion(&cm_id_priv->comp);
INIT_LIST_HEAD(&cm_id_priv->work_list);
atomic_set(&cm_id_priv->work_count, -1);
refcount_set(&cm_id_priv->refcount, 1);
ret = xa_alloc_cyclic(&cm.local_id_table, &id, NULL, xa_limit_32b,
&cm.local_id_next, GFP_KERNEL);
if (ret < 0)
goto error;
cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand;
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
return cm_id_priv;
error:
kfree(cm_id_priv);
return ERR_PTR(ret);
}
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
/*
* Make the ID visible to the MAD handlers and other threads that use the
* xarray.
*/
static void cm_finalize_id(struct cm_id_private *cm_id_priv)
{
xa_store(&cm.local_id_table, cm_local_id(cm_id_priv->id.local_id),
cm_id_priv, GFP_ATOMIC);
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
}
struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
ib_cm_handler cm_handler,
void *context)
{
struct cm_id_private *cm_id_priv;
cm_id_priv = cm_alloc_id_priv(device, cm_handler, context);
if (IS_ERR(cm_id_priv))
return ERR_CAST(cm_id_priv);
cm_finalize_id(cm_id_priv);
return &cm_id_priv->id;
}
EXPORT_SYMBOL(ib_create_cm_id);
static struct cm_work *cm_dequeue_work(struct cm_id_private *cm_id_priv)
{
struct cm_work *work;
if (list_empty(&cm_id_priv->work_list))
return NULL;
work = list_entry(cm_id_priv->work_list.next, struct cm_work, list);
list_del(&work->list);
return work;
}
static void cm_free_work(struct cm_work *work)
{
if (work->mad_recv_wc)
ib_free_recv_mad(work->mad_recv_wc);
kfree(work);
}
static void cm_queue_work_unlock(struct cm_id_private *cm_id_priv,
struct cm_work *work)
__releases(&cm_id_priv->lock)
{
bool immediate;
/*
* To deliver the event to the user callback we have the drop the
* spinlock, however, we need to ensure that the user callback is single
* threaded and receives events in the temporal order. If there are
* already events being processed then thread new events onto a list,
* the thread currently processing will pick them up.
*/
immediate = atomic_inc_and_test(&cm_id_priv->work_count);
if (!immediate) {
list_add_tail(&work->list, &cm_id_priv->work_list);
/*
* This routine always consumes incoming reference. Once queued
* to the work_list then a reference is held by the thread
* currently running cm_process_work() and this reference is not
* needed.
*/
cm_deref_id(cm_id_priv);
}
spin_unlock_irq(&cm_id_priv->lock);
if (immediate)
cm_process_work(cm_id_priv, work);
}
static inline int cm_convert_to_ms(int iba_time)
{
/* approximate conversion to ms from 4.096us x 2^iba_time */
return 1 << max(iba_time - 8, 0);
}
/*
* calculate: 4.096x2^ack_timeout = 4.096x2^ack_delay + 2x4.096x2^life_time
* Because of how ack_timeout is stored, adding one doubles the timeout.
* To avoid large timeouts, select the max(ack_delay, life_time + 1), and
* increment it (round up) only if the other is within 50%.
*/
static u8 cm_ack_timeout(u8 ca_ack_delay, u8 packet_life_time)
{
int ack_timeout = packet_life_time + 1;
if (ack_timeout >= ca_ack_delay)
ack_timeout += (ca_ack_delay >= (ack_timeout - 1));
else
ack_timeout = ca_ack_delay +
(ack_timeout >= (ca_ack_delay - 1));
return min(31, ack_timeout);
}
static void cm_remove_remote(struct cm_id_private *cm_id_priv)
{
struct cm_timewait_info *timewait_info = cm_id_priv->timewait_info;
if (timewait_info->inserted_remote_id) {
rb_erase(&timewait_info->remote_id_node, &cm.remote_id_table);
timewait_info->inserted_remote_id = 0;
}
if (timewait_info->inserted_remote_qp) {
rb_erase(&timewait_info->remote_qp_node, &cm.remote_qp_table);
timewait_info->inserted_remote_qp = 0;
}
}
static struct cm_timewait_info *cm_create_timewait_info(__be32 local_id)
{
struct cm_timewait_info *timewait_info;
timewait_info = kzalloc(sizeof *timewait_info, GFP_KERNEL);
if (!timewait_info)
return ERR_PTR(-ENOMEM);
timewait_info->work.local_id = local_id;
INIT_DELAYED_WORK(&timewait_info->work.work, cm_work_handler);
timewait_info->work.cm_event.event = IB_CM_TIMEWAIT_EXIT;
return timewait_info;
}
static void cm_enter_timewait(struct cm_id_private *cm_id_priv)
{
int wait_time;
unsigned long flags;
struct cm_device *cm_dev;
lockdep_assert_held(&cm_id_priv->lock);
cm_dev = ib_get_client_data(cm_id_priv->id.device, &cm_client);
if (!cm_dev)
return;
spin_lock_irqsave(&cm.lock, flags);
cm_remove_remote(cm_id_priv);
list_add_tail(&cm_id_priv->timewait_info->list, &cm.timewait_list);
spin_unlock_irqrestore(&cm.lock, flags);
/*
* The cm_id could be destroyed by the user before we exit timewait.
* To protect against this, we search for the cm_id after exiting
* timewait before notifying the user that we've exited timewait.
*/
cm_id_priv->id.state = IB_CM_TIMEWAIT;
wait_time = cm_convert_to_ms(cm_id_priv->av.timeout);
/* Check if the device started its remove_one */
IB/cm: Fix a recently introduced deadlock ib_send_cm_drep() calls cm_enter_timewait() while holding a spinlock that can be locked from inside an interrupt handler. Hence do not enable interrupts inside cm_enter_timewait() if called with interrupts disabled. This patch fixes e.g. the following deadlock: Acked-by: Erez Shitrit <erezsh@mellanox.com> ================================= [ INFO: inconsistent lock state ] 4.4.0-rc7+ #1 Tainted: G E --------------------------------- inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage. swapper/8/0 [HC1[1]:SC0[0]:HE0:SE1] takes: (&(&cm_id_priv->lock)->rlock){?.+...}, at: [<ffffffffa036eec4>] cm_establish+0x 74/0x1b0 [ib_cm] {HARDIRQ-ON-W} state was registered at: [<ffffffff810a3c11>] mark_held_locks+0x71/0x90 [<ffffffff810a3e87>] trace_hardirqs_on_caller+0xa7/0x1c0 [<ffffffff810a3fad>] trace_hardirqs_on+0xd/0x10 [<ffffffff8151c40b>] _raw_spin_unlock_irq+0x2b/0x40 [<ffffffffa036ea8e>] cm_enter_timewait+0xae/0x100 [ib_cm] [<ffffffffa036ff76>] ib_send_cm_drep+0xb6/0x190 [ib_cm] [<ffffffffa052ed08>] srp_cm_handler+0x128/0x1a0 [ib_srp] [<ffffffffa0370340>] cm_process_work+0x20/0xf0 [ib_cm] [<ffffffffa0371335>] cm_dreq_handler+0x135/0x2c0 [ib_cm] [<ffffffffa03733c5>] cm_work_handler+0x75/0xd0 [ib_cm] [<ffffffff8107184d>] process_one_work+0x1bd/0x460 [<ffffffff81073148>] worker_thread+0x118/0x420 [<ffffffff81078454>] kthread+0xe4/0x100 [<ffffffff8151cbbf>] ret_from_fork+0x3f/0x70 irq event stamp: 1672286 hardirqs last enabled at (1672283): [<ffffffff81408ec0>] poll_idle+0x10/0x80 hardirqs last disabled at (1672284): [<ffffffff8151d304>] common_interrupt+0x84/0x89 softirqs last enabled at (1672286): [<ffffffff8105b4dc>] _local_bh_enable+0x1c/0x50 softirqs last disabled at (1672285): [<ffffffff8105b697>] irq_enter+0x47/0x70 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&(&cm_id_priv->lock)->rlock); <Interrupt> lock(&(&cm_id_priv->lock)->rlock); *** DEADLOCK *** no locks held by swapper/8/0. stack backtrace: CPU: 8 PID: 0 Comm: swapper/8 Tainted: G E 4.4.0-rc7+ #1 Hardware name: Dell Inc. PowerEdge R430/03XKDV, BIOS 1.0.2 11/17/2014 ffff88045af5e950 ffff88046e503a88 ffffffff81251c1b 0000000000000007 0000000000000006 0000000000000003 ffff88045af5ddc0 ffff88046e503ad8 ffffffff810a32f4 0000000000000000 0000000000000000 0000000000000001 Call Trace: <IRQ> [<ffffffff81251c1b>] dump_stack+0x4f/0x74 [<ffffffff810a32f4>] print_usage_bug+0x184/0x190 [<ffffffff810a36e2>] mark_lock_irq+0xf2/0x290 [<ffffffff810a3995>] mark_lock+0x115/0x1b0 [<ffffffff810a3b8c>] mark_irqflags+0x15c/0x170 [<ffffffff810a4fef>] __lock_acquire+0x1ef/0x560 [<ffffffff810a53c2>] lock_acquire+0x62/0x80 [<ffffffff8151bd33>] _raw_spin_lock_irqsave+0x43/0x60 [<ffffffffa036eec4>] cm_establish+0x74/0x1b0 [ib_cm] [<ffffffffa036f031>] ib_cm_notify+0x31/0x100 [ib_cm] [<ffffffffa0637f24>] srpt_qp_event+0x54/0xd0 [ib_srpt] [<ffffffffa0196052>] mlx4_ib_qp_event+0x72/0xc0 [mlx4_ib] [<ffffffffa00775b9>] mlx4_qp_event+0x69/0xd0 [mlx4_core] [<ffffffffa006000e>] mlx4_eq_int+0x51e/0xd50 [mlx4_core] [<ffffffffa006084f>] mlx4_msi_x_interrupt+0xf/0x20 [mlx4_core] [<ffffffff810b67b0>] handle_irq_event_percpu+0x40/0x110 [<ffffffff810b68bf>] handle_irq_event+0x3f/0x70 [<ffffffff810ba7f9>] handle_edge_irq+0x79/0x120 [<ffffffff81007f3d>] handle_irq+0x5d/0x130 [<ffffffff810071fd>] do_IRQ+0x6d/0x130 [<ffffffff8151d309>] common_interrupt+0x89/0x89 <EOI> [<ffffffff8140895f>] cpuidle_enter_state+0xcf/0x200 [<ffffffff81408aa2>] cpuidle_enter+0x12/0x20 [<ffffffff810990d6>] call_cpuidle+0x36/0x60 [<ffffffff81099163>] cpuidle_idle_call+0x63/0x110 [<ffffffff8109930a>] cpu_idle_loop+0xfa/0x130 [<ffffffff8109934e>] cpu_startup_entry+0xe/0x10 [<ffffffff8103c443>] start_secondary+0x83/0x90 Fixes: commit be4b499323bf ("IB/cm: Do not queue work to a device that's going away") Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com> Cc: Erez Shitrit <erezsh@mellanox.com> Cc: stable <stable@vger.kernel.org> Signed-off-by: Doug Ledford <dledford@redhat.com>
2016-01-01 20:17:46 +08:00
spin_lock_irqsave(&cm.lock, flags);
if (!cm_dev->going_down)
queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work,
msecs_to_jiffies(wait_time));
IB/cm: Fix a recently introduced deadlock ib_send_cm_drep() calls cm_enter_timewait() while holding a spinlock that can be locked from inside an interrupt handler. Hence do not enable interrupts inside cm_enter_timewait() if called with interrupts disabled. This patch fixes e.g. the following deadlock: Acked-by: Erez Shitrit <erezsh@mellanox.com> ================================= [ INFO: inconsistent lock state ] 4.4.0-rc7+ #1 Tainted: G E --------------------------------- inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage. swapper/8/0 [HC1[1]:SC0[0]:HE0:SE1] takes: (&(&cm_id_priv->lock)->rlock){?.+...}, at: [<ffffffffa036eec4>] cm_establish+0x 74/0x1b0 [ib_cm] {HARDIRQ-ON-W} state was registered at: [<ffffffff810a3c11>] mark_held_locks+0x71/0x90 [<ffffffff810a3e87>] trace_hardirqs_on_caller+0xa7/0x1c0 [<ffffffff810a3fad>] trace_hardirqs_on+0xd/0x10 [<ffffffff8151c40b>] _raw_spin_unlock_irq+0x2b/0x40 [<ffffffffa036ea8e>] cm_enter_timewait+0xae/0x100 [ib_cm] [<ffffffffa036ff76>] ib_send_cm_drep+0xb6/0x190 [ib_cm] [<ffffffffa052ed08>] srp_cm_handler+0x128/0x1a0 [ib_srp] [<ffffffffa0370340>] cm_process_work+0x20/0xf0 [ib_cm] [<ffffffffa0371335>] cm_dreq_handler+0x135/0x2c0 [ib_cm] [<ffffffffa03733c5>] cm_work_handler+0x75/0xd0 [ib_cm] [<ffffffff8107184d>] process_one_work+0x1bd/0x460 [<ffffffff81073148>] worker_thread+0x118/0x420 [<ffffffff81078454>] kthread+0xe4/0x100 [<ffffffff8151cbbf>] ret_from_fork+0x3f/0x70 irq event stamp: 1672286 hardirqs last enabled at (1672283): [<ffffffff81408ec0>] poll_idle+0x10/0x80 hardirqs last disabled at (1672284): [<ffffffff8151d304>] common_interrupt+0x84/0x89 softirqs last enabled at (1672286): [<ffffffff8105b4dc>] _local_bh_enable+0x1c/0x50 softirqs last disabled at (1672285): [<ffffffff8105b697>] irq_enter+0x47/0x70 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&(&cm_id_priv->lock)->rlock); <Interrupt> lock(&(&cm_id_priv->lock)->rlock); *** DEADLOCK *** no locks held by swapper/8/0. stack backtrace: CPU: 8 PID: 0 Comm: swapper/8 Tainted: G E 4.4.0-rc7+ #1 Hardware name: Dell Inc. PowerEdge R430/03XKDV, BIOS 1.0.2 11/17/2014 ffff88045af5e950 ffff88046e503a88 ffffffff81251c1b 0000000000000007 0000000000000006 0000000000000003 ffff88045af5ddc0 ffff88046e503ad8 ffffffff810a32f4 0000000000000000 0000000000000000 0000000000000001 Call Trace: <IRQ> [<ffffffff81251c1b>] dump_stack+0x4f/0x74 [<ffffffff810a32f4>] print_usage_bug+0x184/0x190 [<ffffffff810a36e2>] mark_lock_irq+0xf2/0x290 [<ffffffff810a3995>] mark_lock+0x115/0x1b0 [<ffffffff810a3b8c>] mark_irqflags+0x15c/0x170 [<ffffffff810a4fef>] __lock_acquire+0x1ef/0x560 [<ffffffff810a53c2>] lock_acquire+0x62/0x80 [<ffffffff8151bd33>] _raw_spin_lock_irqsave+0x43/0x60 [<ffffffffa036eec4>] cm_establish+0x74/0x1b0 [ib_cm] [<ffffffffa036f031>] ib_cm_notify+0x31/0x100 [ib_cm] [<ffffffffa0637f24>] srpt_qp_event+0x54/0xd0 [ib_srpt] [<ffffffffa0196052>] mlx4_ib_qp_event+0x72/0xc0 [mlx4_ib] [<ffffffffa00775b9>] mlx4_qp_event+0x69/0xd0 [mlx4_core] [<ffffffffa006000e>] mlx4_eq_int+0x51e/0xd50 [mlx4_core] [<ffffffffa006084f>] mlx4_msi_x_interrupt+0xf/0x20 [mlx4_core] [<ffffffff810b67b0>] handle_irq_event_percpu+0x40/0x110 [<ffffffff810b68bf>] handle_irq_event+0x3f/0x70 [<ffffffff810ba7f9>] handle_edge_irq+0x79/0x120 [<ffffffff81007f3d>] handle_irq+0x5d/0x130 [<ffffffff810071fd>] do_IRQ+0x6d/0x130 [<ffffffff8151d309>] common_interrupt+0x89/0x89 <EOI> [<ffffffff8140895f>] cpuidle_enter_state+0xcf/0x200 [<ffffffff81408aa2>] cpuidle_enter+0x12/0x20 [<ffffffff810990d6>] call_cpuidle+0x36/0x60 [<ffffffff81099163>] cpuidle_idle_call+0x63/0x110 [<ffffffff8109930a>] cpu_idle_loop+0xfa/0x130 [<ffffffff8109934e>] cpu_startup_entry+0xe/0x10 [<ffffffff8103c443>] start_secondary+0x83/0x90 Fixes: commit be4b499323bf ("IB/cm: Do not queue work to a device that's going away") Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com> Cc: Erez Shitrit <erezsh@mellanox.com> Cc: stable <stable@vger.kernel.org> Signed-off-by: Doug Ledford <dledford@redhat.com>
2016-01-01 20:17:46 +08:00
spin_unlock_irqrestore(&cm.lock, flags);
/*
* The timewait_info is converted into a work and gets freed during
* cm_free_work() in cm_timewait_handler().
*/
BUILD_BUG_ON(offsetof(struct cm_timewait_info, work) != 0);
cm_id_priv->timewait_info = NULL;
}
static void cm_reset_to_idle(struct cm_id_private *cm_id_priv)
{
unsigned long flags;
lockdep_assert_held(&cm_id_priv->lock);
cm_id_priv->id.state = IB_CM_IDLE;
if (cm_id_priv->timewait_info) {
spin_lock_irqsave(&cm.lock, flags);
cm_remove_remote(cm_id_priv);
spin_unlock_irqrestore(&cm.lock, flags);
kfree(cm_id_priv->timewait_info);
cm_id_priv->timewait_info = NULL;
}
}
static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
{
struct cm_id_private *cm_id_priv;
struct cm_work *work;
cm_id_priv = container_of(cm_id, struct cm_id_private, id);
spin_lock_irq(&cm_id_priv->lock);
retest:
switch (cm_id->state) {
case IB_CM_LISTEN:
spin_lock(&cm.lock);
if (--cm_id_priv->listen_sharecount > 0) {
/* The id is still shared. */
WARN_ON(refcount_read(&cm_id_priv->refcount) == 1);
spin_unlock(&cm.lock);
spin_unlock_irq(&cm_id_priv->lock);
cm_deref_id(cm_id_priv);
return;
}
cm_id->state = IB_CM_IDLE;
rb_erase(&cm_id_priv->service_node, &cm.listen_service_table);
RB_CLEAR_NODE(&cm_id_priv->service_node);
spin_unlock(&cm.lock);
break;
case IB_CM_SIDR_REQ_SENT:
cm_id->state = IB_CM_IDLE;
ib_cancel_mad(cm_id_priv->msg);
break;
case IB_CM_SIDR_REQ_RCVD:
cm_send_sidr_rep_locked(cm_id_priv,
&(struct ib_cm_sidr_rep_param){
.status = IB_SIDR_REJECT });
/* cm_send_sidr_rep_locked will not move to IDLE if it fails */
cm_id->state = IB_CM_IDLE;
break;
case IB_CM_REQ_SENT:
ib/cm: Change reject message type when destroying cm_id Problem reported by: Ted Kim <ted.h.kim@oracle.com>: We have a case where a Linux system and a non-Linux system are trying to interoperate. The Linux host is the active side and starts the connection establishment, but later decides to not go through with the connection setup and does rdma_destroy_id(). The rdma_destroy_id() eventually works its way down to cm_destroy_id() in core/cm.c, where a REJ is sent. The non-Linux system has some trouble recognizing the REJ because of: A. CM states which can't receive the REJ B. Some issues about REJ formatting (missing comm ID) ISSUE A: That part of the spec says, a Consumer Reject REJ can be sent for a connection abort, but it goes further and says: can send a REJ message with a "Consumer Reject" Reason code if they are in a CM state (i.e. REP Rcvd, MRA(REP) Sent, REQ Rcvd, MRA Sent) that allows a REJ to be sent (lines 35-38). Of the states listed there in that sentence, it would seem to limit the active side to using the Consumer Reject (for the abort case) in just the REP-Rcvd and MRA-REP-Sent states. That is basically only after the active side sees a REP (or alternatively goes down the state transitions to timeout in which case a Timeout REJ is sent). As a fix, in cm-destroy-id() move the IB-CM-MRA-REQ-RCVD case to the same as REQ-SENT. Essentially, make a REJ sent after getting an MRA on active side a timeout rather than Consumer- Reject, which is arguably more correct with the CM state diagrams previous to getting a REP. Signed-off-by: Ted Kim <ted.h.kim@oracle.com> Signed-off-by: Sean Hefty <sean.hefty@intel.com>
2015-05-15 03:49:01 +08:00
case IB_CM_MRA_REQ_RCVD:
ib_cancel_mad(cm_id_priv->msg);
cm_send_rej_locked(cm_id_priv, IB_CM_REJ_TIMEOUT,
&cm_id_priv->id.device->node_guid,
sizeof(cm_id_priv->id.device->node_guid),
NULL, 0);
break;
case IB_CM_REQ_RCVD:
if (err == -ENOMEM) {
/* Do not reject to allow future retries. */
cm_reset_to_idle(cm_id_priv);
} else {
cm_send_rej_locked(cm_id_priv,
IB_CM_REJ_CONSUMER_DEFINED, NULL, 0,
NULL, 0);
}
break;
case IB_CM_REP_SENT:
case IB_CM_MRA_REP_RCVD:
ib_cancel_mad(cm_id_priv->msg);
cm_send_rej_locked(cm_id_priv, IB_CM_REJ_CONSUMER_DEFINED, NULL,
0, NULL, 0);
goto retest;
case IB_CM_MRA_REQ_SENT:
case IB_CM_REP_RCVD:
case IB_CM_MRA_REP_SENT:
cm_send_rej_locked(cm_id_priv, IB_CM_REJ_CONSUMER_DEFINED, NULL,
0, NULL, 0);
break;
case IB_CM_ESTABLISHED:
if (cm_id_priv->qp_type == IB_QPT_XRC_TGT) {
cm_id->state = IB_CM_IDLE;
break;
}
cm_send_dreq_locked(cm_id_priv, NULL, 0);
goto retest;
case IB_CM_DREQ_SENT:
ib_cancel_mad(cm_id_priv->msg);
cm_enter_timewait(cm_id_priv);
goto retest;
case IB_CM_DREQ_RCVD:
cm_send_drep_locked(cm_id_priv, NULL, 0);
WARN_ON(cm_id->state != IB_CM_TIMEWAIT);
goto retest;
case IB_CM_TIMEWAIT:
/*
* The cm_acquire_id in cm_timewait_handler will stop working
* once we do xa_erase below, so just move to idle here for
* consistency.
*/
cm_id->state = IB_CM_IDLE;
break;
case IB_CM_IDLE:
break;
}
WARN_ON(cm_id->state != IB_CM_IDLE);
spin_lock(&cm.lock);
/* Required for cleanup paths related cm_req_handler() */
if (cm_id_priv->timewait_info) {
cm_remove_remote(cm_id_priv);
kfree(cm_id_priv->timewait_info);
cm_id_priv->timewait_info = NULL;
}
WARN_ON(cm_id_priv->listen_sharecount);
WARN_ON(!RB_EMPTY_NODE(&cm_id_priv->service_node));
if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node))
rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
spin_unlock(&cm.lock);
spin_unlock_irq(&cm_id_priv->lock);
IB/cm: Mark stale CM id's whenever the mad agent was unregistered When there is a CM id object that has port assigned to it, it means that the cm-id asked for the specific port that it should go by it, but if that port was removed (hot-unplug event) the cm-id was not updated. In order to fix that the port keeps a list of all the cm-id's that are planning to go by it, whenever the port is removed it marks all of them as invalid. This commit fixes a kernel panic which happens when running traffic between guests and we force reboot a guest mid traffic, it triggers a kernel panic: Call Trace: [<ffffffff815271fa>] ? panic+0xa7/0x16f [<ffffffff8152b534>] ? oops_end+0xe4/0x100 [<ffffffff8104a00b>] ? no_context+0xfb/0x260 [<ffffffff81084db2>] ? del_timer_sync+0x22/0x30 [<ffffffff8104a295>] ? __bad_area_nosemaphore+0x125/0x1e0 [<ffffffff81084240>] ? process_timeout+0x0/0x10 [<ffffffff8104a363>] ? bad_area_nosemaphore+0x13/0x20 [<ffffffff8104aabf>] ? __do_page_fault+0x31f/0x480 [<ffffffff81065df0>] ? default_wake_function+0x0/0x20 [<ffffffffa0752675>] ? free_msg+0x55/0x70 [mlx5_core] [<ffffffffa0753434>] ? cmd_exec+0x124/0x840 [mlx5_core] [<ffffffff8105a924>] ? find_busiest_group+0x244/0x9f0 [<ffffffff8152d45e>] ? do_page_fault+0x3e/0xa0 [<ffffffff8152a815>] ? page_fault+0x25/0x30 [<ffffffffa024da25>] ? cm_alloc_msg+0x35/0xc0 [ib_cm] [<ffffffffa024e821>] ? ib_send_cm_dreq+0xb1/0x1e0 [ib_cm] [<ffffffffa024f836>] ? cm_destroy_id+0x176/0x320 [ib_cm] [<ffffffffa024fb00>] ? ib_destroy_cm_id+0x10/0x20 [ib_cm] [<ffffffffa034f527>] ? ipoib_cm_free_rx_reap_list+0xa7/0x110 [ib_ipoib] [<ffffffffa034f590>] ? ipoib_cm_rx_reap+0x0/0x20 [ib_ipoib] [<ffffffffa034f5a5>] ? ipoib_cm_rx_reap+0x15/0x20 [ib_ipoib] [<ffffffff81094d20>] ? worker_thread+0x170/0x2a0 [<ffffffff8109b2a0>] ? autoremove_wake_function+0x0/0x40 [<ffffffff81094bb0>] ? worker_thread+0x0/0x2a0 [<ffffffff8109aef6>] ? kthread+0x96/0xa0 [<ffffffff8100c20a>] ? child_rip+0xa/0x20 [<ffffffff8109ae60>] ? kthread+0x0/0xa0 [<ffffffff8100c200>] ? child_rip+0x0/0x20 Fixes: a977049dacde ("[PATCH] IB: Add the kernel CM implementation") Signed-off-by: Mark Bloch <markb@mellanox.com> Signed-off-by: Erez Shitrit <erezsh@mellanox.com> Reviewed-by: Maor Gottlieb <maorg@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Doug Ledford <dledford@redhat.com>
2016-10-27 21:36:27 +08:00
xa_erase(&cm.local_id_table, cm_local_id(cm_id->local_id));
cm_deref_id(cm_id_priv);
wait_for_completion(&cm_id_priv->comp);
while ((work = cm_dequeue_work(cm_id_priv)) != NULL)
cm_free_work(work);
cm_destroy_av(&cm_id_priv->av);
cm_destroy_av(&cm_id_priv->alt_av);
kfree(cm_id_priv->private_data);
kfree_rcu(cm_id_priv, rcu);
}
void ib_destroy_cm_id(struct ib_cm_id *cm_id)
{
cm_destroy_id(cm_id, 0);
}
EXPORT_SYMBOL(ib_destroy_cm_id);
static int cm_init_listen(struct cm_id_private *cm_id_priv, __be64 service_id)
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
{
if ((service_id & IB_SERVICE_ID_AGN_MASK) == IB_CM_ASSIGN_SERVICE_ID &&
(service_id != IB_CM_ASSIGN_SERVICE_ID))
return -EINVAL;
if (service_id == IB_CM_ASSIGN_SERVICE_ID)
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
cm_id_priv->id.service_id = cpu_to_be64(cm.listen_service_id++);
else
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
cm_id_priv->id.service_id = service_id;
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
return 0;
}
/**
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
* ib_cm_listen - Initiates listening on the specified service ID for
* connection and service ID resolution requests.
* @cm_id: Connection identifier associated with the listen request.
* @service_id: Service identifier matched against incoming connection
* and service ID resolution requests. The service ID should be specified
* network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
* assign a service ID to the caller.
*/
int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id)
{
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
struct cm_id_private *cm_id_priv =
container_of(cm_id, struct cm_id_private, id);
unsigned long flags;
int ret;
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
spin_lock_irqsave(&cm_id_priv->lock, flags);
if (cm_id_priv->id.state != IB_CM_IDLE) {
ret = -EINVAL;
goto out;
}
ret = cm_init_listen(cm_id_priv, service_id);
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
if (ret)
goto out;
if (!cm_insert_listen(cm_id_priv, NULL)) {
ret = -EBUSY;
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
goto out;
}
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
cm_id_priv->id.state = IB_CM_LISTEN;
ret = 0;
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
out:
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
return ret;
}
EXPORT_SYMBOL(ib_cm_listen);
/**
* ib_cm_insert_listen - Create a new listening ib_cm_id and listen on
* the given service ID.
*
* If there's an existing ID listening on that same device and service ID,
* return it.
*
* @device: Device associated with the cm_id. All related communication will
* be associated with the specified device.
* @cm_handler: Callback invoked to notify the user of CM events.
* @service_id: Service identifier matched against incoming connection
* and service ID resolution requests. The service ID should be specified
* network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
* assign a service ID to the caller.
*
* Callers should call ib_destroy_cm_id when done with the listener ID.
*/
struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device,
ib_cm_handler cm_handler,
__be64 service_id)
{
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
struct cm_id_private *listen_id_priv;
struct cm_id_private *cm_id_priv;
int err = 0;
/* Create an ID in advance, since the creation may sleep */
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
cm_id_priv = cm_alloc_id_priv(device, cm_handler, NULL);
if (IS_ERR(cm_id_priv))
return ERR_CAST(cm_id_priv);
err = cm_init_listen(cm_id_priv, service_id);
if (err) {
ib_destroy_cm_id(&cm_id_priv->id);
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
return ERR_PTR(err);
}
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
spin_lock_irq(&cm_id_priv->lock);
listen_id_priv = cm_insert_listen(cm_id_priv, cm_handler);
if (listen_id_priv != cm_id_priv) {
spin_unlock_irq(&cm_id_priv->lock);
ib_destroy_cm_id(&cm_id_priv->id);
if (!listen_id_priv)
return ERR_PTR(-EINVAL);
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
return &listen_id_priv->id;
}
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
cm_id_priv->id.state = IB_CM_LISTEN;
spin_unlock_irq(&cm_id_priv->lock);
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
/*
* A listen ID does not need to be in the xarray since it does not
* receive mads, is not placed in the remote_id or remote_qpn rbtree,
* and does not enter timewait.
*/
RDMA/cm: Simplify establishing a listen cm_id Any manipulation of cm_id->state must be done under the cm_id_priv->lock, the two routines that added listens did not follow this rule, because they never participate in any concurrent access around the state. However, since this exception makes the code hard to understand, simplify the flow so that it can be fully locked: - Move manipulation of listen_sharecount into cm_insert_listen() so it is trivially under the cm.lock without having to expose the cm.lock to the caller. - Push the cm.lock down into cm_insert_listen() and have the function increment the reference count before returning an existing pointer. - Split ib_cm_listen() into an cm_init_listen() and do not call ib_cm_listen() from ib_cm_insert_listen() - Make both ib_cm_listen() and ib_cm_insert_listen() directly call cm_insert_listen() under their cm_id_priv->lock which does both a collision detect and, if needed, the insert (atomically) - Enclose all state manipulation within the cm_id_priv->lock, notice this set can be done safely after cm_insert_listen() as no reader is allowed to read the state without holding the lock. - Do not set the listen cm_id in the xarray, as it is never correct to look it up. This makes the concurrency simpler to understand. Many needless error unwinds are removed in the process. Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-03-10 17:25:35 +08:00
return &cm_id_priv->id;
}
EXPORT_SYMBOL(ib_cm_insert_listen);
static __be64 cm_form_tid(struct cm_id_private *cm_id_priv)
{
u64 hi_tid = 0, low_tid;
lockdep_assert_held(&cm_id_priv->lock);
low_tid = (u64)cm_id_priv->id.local_id;
if (!cm_id_priv->av.port)
return cpu_to_be64(low_tid);
spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
if (cm_id_priv->av.port->mad_agent)
hi_tid = ((u64)cm_id_priv->av.port->mad_agent->hi_tid) << 32;
spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
return cpu_to_be64(hi_tid | low_tid);
}
static void cm_format_mad_hdr(struct ib_mad_hdr *hdr,
__be16 attr_id, __be64 tid)
{
hdr->base_version = IB_MGMT_BASE_VERSION;
hdr->mgmt_class = IB_MGMT_CLASS_CM;
hdr->class_version = IB_CM_CLASS_VERSION;
hdr->method = IB_MGMT_METHOD_SEND;
hdr->attr_id = attr_id;
hdr->tid = tid;
}
static void cm_format_mad_ece_hdr(struct ib_mad_hdr *hdr, __be16 attr_id,
__be64 tid, u32 attr_mod)
{
cm_format_mad_hdr(hdr, attr_id, tid);
hdr->attr_mod = cpu_to_be32(attr_mod);
}
static void cm_format_req(struct cm_req_msg *req_msg,
struct cm_id_private *cm_id_priv,
struct ib_cm_req_param *param)
{
struct sa_path_rec *pri_path = param->primary_path;
struct sa_path_rec *alt_path = param->alternate_path;
bool pri_ext = false;
__be16 lid;
if (pri_path->rec_type == SA_PATH_REC_TYPE_OPA)
pri_ext = opa_is_extended_lid(pri_path->opa.dlid,
pri_path->opa.slid);
cm_format_mad_ece_hdr(&req_msg->hdr, CM_REQ_ATTR_ID,
cm_form_tid(cm_id_priv), param->ece.attr_mod);
IBA_SET(CM_REQ_LOCAL_COMM_ID, req_msg,
be32_to_cpu(cm_id_priv->id.local_id));
IBA_SET(CM_REQ_SERVICE_ID, req_msg, be64_to_cpu(param->service_id));
IBA_SET(CM_REQ_LOCAL_CA_GUID, req_msg,
be64_to_cpu(cm_id_priv->id.device->node_guid));
IBA_SET(CM_REQ_LOCAL_QPN, req_msg, param->qp_num);
IBA_SET(CM_REQ_INITIATOR_DEPTH, req_msg, param->initiator_depth);
IBA_SET(CM_REQ_REMOTE_CM_RESPONSE_TIMEOUT, req_msg,
param->remote_cm_response_timeout);
cm_req_set_qp_type(req_msg, param->qp_type);
IBA_SET(CM_REQ_END_TO_END_FLOW_CONTROL, req_msg, param->flow_control);
IBA_SET(CM_REQ_STARTING_PSN, req_msg, param->starting_psn);
IBA_SET(CM_REQ_LOCAL_CM_RESPONSE_TIMEOUT, req_msg,
param->local_cm_response_timeout);
IBA_SET(CM_REQ_PARTITION_KEY, req_msg,
be16_to_cpu(param->primary_path->pkey));
IBA_SET(CM_REQ_PATH_PACKET_PAYLOAD_MTU, req_msg,
param->primary_path->mtu);
IBA_SET(CM_REQ_MAX_CM_RETRIES, req_msg, param->max_cm_retries);
if (param->qp_type != IB_QPT_XRC_INI) {
IBA_SET(CM_REQ_RESPONDER_RESOURCES, req_msg,
param->responder_resources);
IBA_SET(CM_REQ_RETRY_COUNT, req_msg, param->retry_count);
IBA_SET(CM_REQ_RNR_RETRY_COUNT, req_msg,
param->rnr_retry_count);
IBA_SET(CM_REQ_SRQ, req_msg, param->srq);
}
*IBA_GET_MEM_PTR(CM_REQ_PRIMARY_LOCAL_PORT_GID, req_msg) =
pri_path->sgid;
*IBA_GET_MEM_PTR(CM_REQ_PRIMARY_REMOTE_PORT_GID, req_msg) =
pri_path->dgid;
if (pri_ext) {
IBA_GET_MEM_PTR(CM_REQ_PRIMARY_LOCAL_PORT_GID, req_msg)
->global.interface_id =
OPA_MAKE_ID(be32_to_cpu(pri_path->opa.slid));
IBA_GET_MEM_PTR(CM_REQ_PRIMARY_REMOTE_PORT_GID, req_msg)
->global.interface_id =
OPA_MAKE_ID(be32_to_cpu(pri_path->opa.dlid));
}
if (pri_path->hop_limit <= 1) {
IBA_SET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg,
be16_to_cpu(pri_ext ? 0 :
htons(ntohl(sa_path_get_slid(
pri_path)))));
IBA_SET(CM_REQ_PRIMARY_REMOTE_PORT_LID, req_msg,
be16_to_cpu(pri_ext ? 0 :
htons(ntohl(sa_path_get_dlid(
pri_path)))));
} else {
if (param->primary_path_inbound) {
lid = param->primary_path_inbound->ib.dlid;
IBA_SET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg,
be16_to_cpu(lid));
} else
IBA_SET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg,
be16_to_cpu(IB_LID_PERMISSIVE));
/* Work-around until there's a way to obtain remote LID info */
IBA_SET(CM_REQ_PRIMARY_REMOTE_PORT_LID, req_msg,
be16_to_cpu(IB_LID_PERMISSIVE));
}
IBA_SET(CM_REQ_PRIMARY_FLOW_LABEL, req_msg,
be32_to_cpu(pri_path->flow_label));
IBA_SET(CM_REQ_PRIMARY_PACKET_RATE, req_msg, pri_path->rate);
IBA_SET(CM_REQ_PRIMARY_TRAFFIC_CLASS, req_msg, pri_path->traffic_class);
IBA_SET(CM_REQ_PRIMARY_HOP_LIMIT, req_msg, pri_path->hop_limit);
IBA_SET(CM_REQ_PRIMARY_SL, req_msg, pri_path->sl);
IBA_SET(CM_REQ_PRIMARY_SUBNET_LOCAL, req_msg,
(pri_path->hop_limit <= 1));
IBA_SET(CM_REQ_PRIMARY_LOCAL_ACK_TIMEOUT, req_msg,
cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
pri_path->packet_life_time));
if (alt_path) {
bool alt_ext = false;
if (alt_path->rec_type == SA_PATH_REC_TYPE_OPA)
alt_ext = opa_is_extended_lid(alt_path->opa.dlid,
alt_path->opa.slid);
*IBA_GET_MEM_PTR(CM_REQ_ALTERNATE_LOCAL_PORT_GID, req_msg) =
alt_path->sgid;
*IBA_GET_MEM_PTR(CM_REQ_ALTERNATE_REMOTE_PORT_GID, req_msg) =
alt_path->dgid;
if (alt_ext) {
IBA_GET_MEM_PTR(CM_REQ_ALTERNATE_LOCAL_PORT_GID,
req_msg)
->global.interface_id =
OPA_MAKE_ID(be32_to_cpu(alt_path->opa.slid));
IBA_GET_MEM_PTR(CM_REQ_ALTERNATE_REMOTE_PORT_GID,
req_msg)
->global.interface_id =
OPA_MAKE_ID(be32_to_cpu(alt_path->opa.dlid));
}
if (alt_path->hop_limit <= 1) {
IBA_SET(CM_REQ_ALTERNATE_LOCAL_PORT_LID, req_msg,
be16_to_cpu(
alt_ext ? 0 :
htons(ntohl(sa_path_get_slid(
alt_path)))));
IBA_SET(CM_REQ_ALTERNATE_REMOTE_PORT_LID, req_msg,
be16_to_cpu(
alt_ext ? 0 :
htons(ntohl(sa_path_get_dlid(
alt_path)))));
} else {
IBA_SET(CM_REQ_ALTERNATE_LOCAL_PORT_LID, req_msg,
be16_to_cpu(IB_LID_PERMISSIVE));
IBA_SET(CM_REQ_ALTERNATE_REMOTE_PORT_LID, req_msg,
be16_to_cpu(IB_LID_PERMISSIVE));
}
IBA_SET(CM_REQ_ALTERNATE_FLOW_LABEL, req_msg,
be32_to_cpu(alt_path->flow_label));
IBA_SET(CM_REQ_ALTERNATE_PACKET_RATE, req_msg, alt_path->rate);
IBA_SET(CM_REQ_ALTERNATE_TRAFFIC_CLASS, req_msg,
alt_path->traffic_class);
IBA_SET(CM_REQ_ALTERNATE_HOP_LIMIT, req_msg,
alt_path->hop_limit);
IBA_SET(CM_REQ_ALTERNATE_SL, req_msg, alt_path->sl);
IBA_SET(CM_REQ_ALTERNATE_SUBNET_LOCAL, req_msg,
(alt_path->hop_limit <= 1));
IBA_SET(CM_REQ_ALTERNATE_LOCAL_ACK_TIMEOUT, req_msg,
cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
alt_path->packet_life_time));
}
IBA_SET(CM_REQ_VENDOR_ID, req_msg, param->ece.vendor_id);
if (param->private_data && param->private_data_len)
IBA_SET_MEM(CM_REQ_PRIVATE_DATA, req_msg, param->private_data,
param->private_data_len);
}
static int cm_validate_req_param(struct ib_cm_req_param *param)
{
if (!param->primary_path)
return -EINVAL;
if (param->qp_type != IB_QPT_RC && param->qp_type != IB_QPT_UC &&
param->qp_type != IB_QPT_XRC_INI)
return -EINVAL;
if (param->private_data &&
param->private_data_len > IB_CM_REQ_PRIVATE_DATA_SIZE)
return -EINVAL;
if (param->alternate_path &&
(param->alternate_path->pkey != param->primary_path->pkey ||
param->alternate_path->mtu != param->primary_path->mtu))
return -EINVAL;
return 0;
}
int ib_send_cm_req(struct ib_cm_id *cm_id,
struct ib_cm_req_param *param)
{
struct cm_av av = {}, alt_av = {};
struct cm_id_private *cm_id_priv;
struct ib_mad_send_buf *msg;
struct cm_req_msg *req_msg;
unsigned long flags;
int ret;
ret = cm_validate_req_param(param);
if (ret)
return ret;
/* Verify that we're not in timewait. */
cm_id_priv = container_of(cm_id, struct cm_id_private, id);
spin_lock_irqsave(&cm_id_priv->lock, flags);
if (cm_id->state != IB_CM_IDLE || WARN_ON(cm_id_priv->timewait_info)) {
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
return -EINVAL;
}
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv->
id.local_id);
if (IS_ERR(cm_id_priv->timewait_info)) {
ret = PTR_ERR(cm_id_priv->timewait_info);
RDMA/cm: Fix an attempt to use non-valid pointer when cleaning timewait If cm_create_timewait_info() fails, the timewait_info pointer will contain an error value and will be used in cm_remove_remote() later. general protection fault, probably for non-canonical address 0xdffffc0000000024: 0000 [#1] SMP KASAN PTI KASAN: null-ptr-deref in range [0×0000000000000120-0×0000000000000127] CPU: 2 PID: 12446 Comm: syz-executor.3 Not tainted 5.10.0-rc5-5d4c0742a60e #27 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:cm_remove_remote.isra.0+0x24/0×170 drivers/infiniband/core/cm.c:978 Code: 84 00 00 00 00 00 41 54 55 53 48 89 fb 48 8d ab 2d 01 00 00 e8 7d bf 4b fe 48 89 ea 48 b8 00 00 00 00 00 fc ff df 48 c1 ea 03 <0f> b6 04 02 48 89 ea 83 e2 07 38 d0 7f 08 84 c0 0f 85 fc 00 00 00 RSP: 0018:ffff888013127918 EFLAGS: 00010006 RAX: dffffc0000000000 RBX: fffffffffffffff4 RCX: ffffc9000a18b000 RDX: 0000000000000024 RSI: ffffffff82edc573 RDI: fffffffffffffff4 RBP: 0000000000000121 R08: 0000000000000001 R09: ffffed1002624f1d R10: 0000000000000003 R11: ffffed1002624f1c R12: ffff888107760c70 R13: ffff888107760c40 R14: fffffffffffffff4 R15: ffff888107760c9c FS: 00007fe1ffcc1700(0000) GS:ffff88811a600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b2ff21000 CR3: 000000010f504001 CR4: 0000000000370ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: cm_destroy_id+0x189/0×15b0 drivers/infiniband/core/cm.c:1155 cma_connect_ib drivers/infiniband/core/cma.c:4029 [inline] rdma_connect_locked+0x1100/0×17c0 drivers/infiniband/core/cma.c:4107 rdma_connect+0x2a/0×40 drivers/infiniband/core/cma.c:4140 ucma_connect+0x277/0×340 drivers/infiniband/core/ucma.c:1069 ucma_write+0x236/0×2f0 drivers/infiniband/core/ucma.c:1724 vfs_write+0x220/0×830 fs/read_write.c:603 ksys_write+0x1df/0×240 fs/read_write.c:658 do_syscall_64+0x33/0×40 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: a977049dacde ("[PATCH] IB: Add the kernel CM implementation") Link: https://lore.kernel.org/r/20201204064205.145795-1-leon@kernel.org Reviewed-by: Maor Gottlieb <maorg@nvidia.com> Reported-by: Amit Matityahu <mitm@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2020-12-04 14:42:05 +08:00
cm_id_priv->timewait_info = NULL;
return ret;
}
ret = cm_init_av_by_path(param->primary_path,
param->ppath_sgid_attr, &av);
if (ret)
return ret;
if (param->alternate_path) {
ret = cm_init_av_by_path(param->alternate_path, NULL,
&alt_av);
if (ret) {
cm_destroy_av(&av);
return ret;
}
}
cm_id->service_id = param->service_id;
cm_id_priv->timeout_ms = cm_convert_to_ms(
param->primary_path->packet_life_time) * 2 +
cm_convert_to_ms(
param->remote_cm_response_timeout);
cm_id_priv->max_cm_retries = param->max_cm_retries;
cm_id_priv->initiator_depth = param->initiator_depth;
cm_id_priv->responder_resources = param->responder_resources;
cm_id_priv->retry_count = param->retry_count;
cm_id_priv->path_mtu = param->primary_path->mtu;
cm_id_priv->pkey = param->primary_path->pkey;
cm_id_priv->qp_type = param->qp_type;
spin_lock_irqsave(&cm_id_priv->lock, flags);
cm_move_av_from_path(&cm_id_priv->av, &av);
if (param->primary_path_outbound)
cm_id_priv->av.dlid_datapath =
be16_to_cpu(param->primary_path_outbound->ib.dlid);
if (param->alternate_path)
cm_move_av_from_path(&cm_id_priv->alt_av, &alt_av);
msg = cm_alloc_priv_msg(cm_id_priv);
if (IS_ERR(msg)) {
ret = PTR_ERR(msg);
goto out_unlock;
}
req_msg = (struct cm_req_msg *)msg->mad;
cm_format_req(req_msg, cm_id_priv, param);
cm_id_priv->tid = req_msg->hdr.tid;
msg->timeout_ms = cm_id_priv->timeout_ms;
msg->context[1] = (void *)(unsigned long)IB_CM_REQ_SENT;
cm_id_priv->local_qpn = cpu_to_be32(IBA_GET(CM_REQ_LOCAL_QPN, req_msg));
cm_id_priv->rq_psn = cpu_to_be32(IBA_GET(CM_REQ_STARTING_PSN, req_msg));
trace_icm_send_req(&cm_id_priv->id);
ret = ib_post_send_mad(msg, NULL);
if (ret)
goto out_free;
BUG_ON(cm_id->state != IB_CM_IDLE);
cm_id->state = IB_CM_REQ_SENT;
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
return 0;
out_free:
cm_free_priv_msg(msg);
out_unlock:
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
return ret;
}
EXPORT_SYMBOL(ib_send_cm_req);
static int cm_issue_rej(struct cm_port *port,
struct ib_mad_recv_wc *mad_recv_wc,
enum ib_cm_rej_reason reason,
enum cm_msg_response msg_rejected,
void *ari, u8 ari_length)
{
struct ib_mad_send_buf *msg = NULL;
struct cm_rej_msg *rej_msg, *rcv_msg;
int ret;
ret = cm_alloc_response_msg(port, mad_recv_wc, &msg);
if (ret)
return ret;
/* We just need common CM header information. Cast to any message. */
rcv_msg = (struct cm_rej_msg *) mad_recv_wc->recv_buf.mad;
rej_msg = (struct cm_rej_msg *) msg->mad;
cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, rcv_msg->hdr.tid);
IBA_SET(CM_REJ_REMOTE_COMM_ID, rej_msg,
IBA_GET(CM_REJ_LOCAL_COMM_ID, rcv_msg));
IBA_SET(CM_REJ_LOCAL_COMM_ID, rej_msg,
IBA_GET(CM_REJ_REMOTE_COMM_ID, rcv_msg));
IBA_SET(CM_REJ_MESSAGE_REJECTED, rej_msg, msg_rejected);
IBA_SET(CM_REJ_REASON, rej_msg, reason);
if (ari && ari_length) {
IBA_SET(CM_REJ_REJECTED_INFO_LENGTH, rej_msg, ari_length);
IBA_SET_MEM(CM_REJ_ARI, rej_msg, ari, ari_length);
}
trace_icm_issue_rej(
IBA_GET(CM_REJ_LOCAL_COMM_ID, rcv_msg),
IBA_GET(CM_REJ_REMOTE_COMM_ID, rcv_msg));
ret = ib_post_send_mad(msg, NULL);
if (ret)
cm_free_response_msg(msg);
return ret;
}
static bool cm_req_has_alt_path(struct cm_req_msg *req_msg)
{
return ((cpu_to_be16(
IBA_GET(CM_REQ_ALTERNATE_LOCAL_PORT_LID, req_msg))) ||
(ib_is_opa_gid(IBA_GET_MEM_PTR(CM_REQ_ALTERNATE_LOCAL_PORT_GID,
req_msg))));
}
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 15:04:20 +08:00
static void cm_path_set_rec_type(struct ib_device *ib_device, u32 port_num,
struct sa_path_rec *path, union ib_gid *gid)
{
if (ib_is_opa_gid(gid) && rdma_cap_opa_ah(ib_device, port_num))
path->rec_type = SA_PATH_REC_TYPE_OPA;
else
path->rec_type = SA_PATH_REC_TYPE_IB;
}
static void cm_format_path_lid_from_req(struct cm_req_msg *req_msg,
struct sa_path_rec *primary_path,
struct sa_path_rec *alt_path,
struct ib_wc *wc)
{
u32 lid;
if (primary_path->rec_type != SA_PATH_REC_TYPE_OPA) {
sa_path_set_dlid(primary_path, wc->slid);
sa_path_set_slid(primary_path,
IBA_GET(CM_REQ_PRIMARY_REMOTE_PORT_LID,
req_msg));
} else {
lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR(
CM_REQ_PRIMARY_LOCAL_PORT_GID, req_msg));
sa_path_set_dlid(primary_path, lid);
lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR(
CM_REQ_PRIMARY_REMOTE_PORT_GID, req_msg));
sa_path_set_slid(primary_path, lid);
}
if (!cm_req_has_alt_path(req_msg))
return;
if (alt_path->rec_type != SA_PATH_REC_TYPE_OPA) {
sa_path_set_dlid(alt_path,
IBA_GET(CM_REQ_ALTERNATE_LOCAL_PORT_LID,
req_msg));
sa_path_set_slid(alt_path,
IBA_GET(CM_REQ_ALTERNATE_REMOTE_PORT_LID,
req_msg));
} else {
lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR(
CM_REQ_ALTERNATE_LOCAL_PORT_GID, req_msg));
sa_path_set_dlid(alt_path, lid);
lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR(
CM_REQ_ALTERNATE_REMOTE_PORT_GID, req_msg));
sa_path_set_slid(alt_path, lid);
}
}
static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
struct sa_path_rec *primary_path,
struct sa_path_rec *alt_path,
struct ib_wc *wc)
{
primary_path->dgid =
*IBA_GET_MEM_PTR(CM_REQ_PRIMARY_LOCAL_PORT_GID, req_msg);
primary_path->sgid =
*IBA_GET_MEM_PTR(CM_REQ_PRIMARY_REMOTE_PORT_GID, req_msg);
primary_path->flow_label =
cpu_to_be32(IBA_GET(CM_REQ_PRIMARY_FLOW_LABEL, req_msg));
primary_path->hop_limit = IBA_GET(CM_REQ_PRIMARY_HOP_LIMIT, req_msg);
primary_path->traffic_class =
IBA_GET(CM_REQ_PRIMARY_TRAFFIC_CLASS, req_msg);
primary_path->reversible = 1;
primary_path->pkey =
cpu_to_be16(IBA_GET(CM_REQ_PARTITION_KEY, req_msg));
primary_path->sl = IBA_GET(CM_REQ_PRIMARY_SL, req_msg);
primary_path->mtu_selector = IB_SA_EQ;
primary_path->mtu = IBA_GET(CM_REQ_PATH_PACKET_PAYLOAD_MTU, req_msg);
primary_path->rate_selector = IB_SA_EQ;
primary_path->rate = IBA_GET(CM_REQ_PRIMARY_PACKET_RATE, req_msg);
primary_path->packet_life_time_selector = IB_SA_EQ;
primary_path->packet_life_time =
IBA_GET(CM_REQ_PRIMARY_LOCAL_ACK_TIMEOUT, req_msg);
primary_path->packet_life_time -= (primary_path->packet_life_time > 0);
primary_path->service_id =
cpu_to_be64(IBA_GET(CM_REQ_SERVICE_ID, req_msg));
if (sa_path_is_roce(primary_path))
primary_path->roce.route_resolved = false;
if (cm_req_has_alt_path(req_msg)) {
alt_path->dgid = *IBA_GET_MEM_PTR(
CM_REQ_ALTERNATE_LOCAL_PORT_GID, req_msg);
alt_path->sgid = *IBA_GET_MEM_PTR(
CM_REQ_ALTERNATE_REMOTE_PORT_GID, req_msg);
alt_path->flow_label = cpu_to_be32(
IBA_GET(CM_REQ_ALTERNATE_FLOW_LABEL, req_msg));
alt_path->hop_limit =
IBA_GET(CM_REQ_ALTERNATE_HOP_LIMIT, req_msg);
alt_path->traffic_class =
IBA_GET(CM_REQ_ALTERNATE_TRAFFIC_CLASS, req_msg);
alt_path->reversible = 1;
alt_path->pkey =
cpu_to_be16(IBA_GET(CM_REQ_PARTITION_KEY, req_msg));
alt_path->sl = IBA_GET(CM_REQ_ALTERNATE_SL, req_msg);
alt_path->mtu_selector = IB_SA_EQ;
alt_path->mtu =
IBA_GET(CM_REQ_PATH_PACKET_PAYLOAD_MTU, req_msg);
alt_path->rate_selector = IB_SA_EQ;
alt_path->rate = IBA_GET(CM_REQ_ALTERNATE_PACKET_RATE, req_msg);
alt_path->packet_life_time_selector = IB_SA_EQ;
alt_path->packet_life_time =
IBA_GET(CM_REQ_ALTERNATE_LOCAL_ACK_TIMEOUT, req_msg);
alt_path->packet_life_time -= (alt_path->packet_life_time > 0);
alt_path->service_id =
cpu_to_be64(IBA_GET(CM_REQ_SERVICE_ID, req_msg));
if (sa_path_is_roce(alt_path))
alt_path->roce.route_resolved = false;
}
cm_format_path_lid_from_req(req_msg, primary_path, alt_path, wc);
}
static u16 cm_get_bth_pkey(struct cm_work *work)
{
struct ib_device *ib_dev = work->port->cm_dev->ib_device;
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 15:04:20 +08:00
u32 port_num = work->port->port_num;
u16 pkey_index = work->mad_recv_wc->wc->pkey_index;
u16 pkey;
int ret;
ret = ib_get_cached_pkey(ib_dev, port_num, pkey_index, &pkey);
if (ret) {
dev_warn_ratelimited(&ib_dev->dev, "ib_cm: Couldn't retrieve pkey for incoming request (port %u, pkey index %u). %d\n",
port_num, pkey_index, ret);
return 0;
}
return pkey;
}
/**
* cm_opa_to_ib_sgid - Convert OPA SGID to IB SGID
* ULPs (such as IPoIB) do not understand OPA GIDs and will
* reject them as the local_gid will not match the sgid. Therefore,
* change the pathrec's SGID to an IB SGID.
*
* @work: Work completion
* @path: Path record
*/
static void cm_opa_to_ib_sgid(struct cm_work *work,
struct sa_path_rec *path)
{
struct ib_device *dev = work->port->cm_dev->ib_device;
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 15:04:20 +08:00
u32 port_num = work->port->port_num;
if (rdma_cap_opa_ah(dev, port_num) &&
(ib_is_opa_gid(&path->sgid))) {
union ib_gid sgid;
if (rdma_query_gid(dev, port_num, 0, &sgid)) {
dev_warn(&dev->dev,
"Error updating sgid in CM request\n");
return;
}
path->sgid = sgid;
}
}
static void cm_format_req_event(struct cm_work *work,
struct cm_id_private *cm_id_priv,
struct ib_cm_id *listen_id)
{
struct cm_req_msg *req_msg;
struct ib_cm_req_event_param *param;
req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
param = &work->cm_event.param.req_rcvd;
param->listen_id = listen_id;
param->bth_pkey = cm_get_bth_pkey(work);
param->port = cm_id_priv->av.port->port_num;
param->primary_path = &work->path[0];
cm_opa_to_ib_sgid(work, param->primary_path);
if (cm_req_has_alt_path(req_msg)) {
param->alternate_path = &work->path[1];
cm_opa_to_ib_sgid(work, param->alternate_path);
} else {
param->alternate_path = NULL;
}
param->remote_ca_guid =
cpu_to_be64(IBA_GET(CM_REQ_LOCAL_CA_GUID, req_msg));
param->remote_qkey = IBA_GET(CM_REQ_LOCAL_Q_KEY, req_msg);
param->remote_qpn = IBA_GET(CM_REQ_LOCAL_QPN, req_msg);
param->qp_type = cm_req_get_qp_type(req_msg);
param->starting_psn = IBA_GET(CM_REQ_STARTING_PSN, req_msg);
param->responder_resources = IBA_GET(CM_REQ_INITIATOR_DEPTH, req_msg);
param->initiator_depth = IBA_GET(CM_REQ_RESPONDER_RESOURCES, req_msg);
param->local_cm_response_timeout =
IBA_GET(CM_REQ_REMOTE_CM_RESPONSE_TIMEOUT, req_msg);
param->flow_control = IBA_GET(CM_REQ_END_TO_END_FLOW_CONTROL, req_msg);
param->remote_cm_response_timeout =
IBA_GET(CM_REQ_LOCAL_CM_RESPONSE_TIMEOUT, req_msg);
param->retry_count = IBA_GET(CM_REQ_RETRY_COUNT, req_msg);
param->rnr_retry_count = IBA_GET(CM_REQ_RNR_RETRY_COUNT, req_msg);
param->srq = IBA_GET(CM_REQ_SRQ, req_msg);
param->ppath_sgid_attr = cm_id_priv->av.ah_attr.grh.sgid_attr;
param->ece.vendor_id = IBA_GET(CM_REQ_VENDOR_ID, req_msg);
param->ece.attr_mod = be32_to_cpu(req_msg->hdr.attr_mod);
work->cm_event.private_data =
IBA_GET_MEM_PTR(CM_REQ_PRIVATE_DATA, req_msg);
}
static void cm_process_work(struct cm_id_private *cm_id_priv,
struct cm_work *work)
{
int ret;
/* We will typically only have the current event to report. */
ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->cm_event);
cm_free_work(work);
while (!ret && !atomic_add_negative(-1, &cm_id_priv->work_count)) {
spin_lock_irq(&cm_id_priv->lock);
work = cm_dequeue_work(cm_id_priv);
spin_unlock_irq(&cm_id_priv->lock);
if (!work)
return;
ret = cm_id_priv->id.cm_handler(&cm_id_priv->id,
&work->cm_event);
cm_free_work(work);
}
cm_deref_id(cm_id_priv);
if (ret)
cm_destroy_id(&cm_id_priv->id, ret);
}
static void cm_format_mra(struct cm_mra_msg *mra_msg,
struct cm_id_private *cm_id_priv,
enum cm_msg_response msg_mraed, u8 service_timeout,
const void *private_data, u8 private_data_len)
{
cm_format_mad_hdr(&mra_msg->hdr, CM_MRA_ATTR_ID, cm_id_priv->tid);
IBA_SET(CM_MRA_MESSAGE_MRAED, mra_msg, msg_mraed);
IBA_SET(CM_MRA_LOCAL_COMM_ID, mra_msg,
be32_to_cpu(cm_id_priv->id.local_id));
IBA_SET(CM_MRA_REMOTE_COMM_ID, mra_msg,
be32_to_cpu(cm_id_priv->id.remote_id));
IBA_SET(CM_MRA_SERVICE_TIMEOUT, mra_msg, service_timeout);
if (private_data && private_data_len)
IBA_SET_MEM(CM_MRA_PRIVATE_DATA, mra_msg, private_data,
private_data_len);
}
static void cm_format_rej(struct cm_rej_msg *rej_msg,
struct cm_id_private *cm_id_priv,
enum ib_cm_rej_reason reason, void *ari,
u8 ari_length, const void *private_data,
u8 private_data_len, enum ib_cm_state state)
{
lockdep_assert_held(&cm_id_priv->lock);
cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, cm_id_priv->tid);
IBA_SET(CM_REJ_REMOTE_COMM_ID, rej_msg,
be32_to_cpu(cm_id_priv->id.remote_id));
switch (state) {
case IB_CM_REQ_RCVD:
IBA_SET(CM_REJ_LOCAL_COMM_ID, rej_msg, be32_to_cpu(0));
IBA_SET(CM_REJ_MESSAGE_REJECTED, rej_msg, CM_MSG_RESPONSE_REQ);
break;
case IB_CM_MRA_REQ_SENT:
IBA_SET(CM_REJ_LOCAL_COMM_ID, rej_msg,
be32_to_cpu(cm_id_priv->id.local_id));
IBA_SET(CM_REJ_MESSAGE_REJECTED, rej_msg, CM_MSG_RESPONSE_REQ);
break;
case IB_CM_REP_RCVD:
case IB_CM_MRA_REP_SENT:
IBA_SET(CM_REJ_LOCAL_COMM_ID, rej_msg,
be32_to_cpu(cm_id_priv->id.local_id));
IBA_SET(CM_REJ_MESSAGE_REJECTED, rej_msg, CM_MSG_RESPONSE_REP);
break;
default:
IBA_SET(CM_REJ_LOCAL_COMM_ID, rej_msg,
be32_to_cpu(cm_id_priv->id.local_id));
IBA_SET(CM_REJ_MESSAGE_REJECTED, rej_msg,
CM_MSG_RESPONSE_OTHER);
break;
}
IBA_SET(CM_REJ_REASON, rej_msg, reason);
if (ari && ari_length) {
IBA_SET(CM_REJ_REJECTED_INFO_LENGTH, rej_msg, ari_length);
IBA_SET_MEM(CM_REJ_ARI, rej_msg, ari, ari_length);
}
if (private_data && private_data_len)
IBA_SET_MEM(CM_REJ_PRIVATE_DATA, rej_msg, private_data,
private_data_len);
}
static void cm_dup_req_handler(struct cm_work *work,
struct cm_id_private *cm_id_priv)
{
struct ib_mad_send_buf *msg = NULL;
int ret;
atomic_long_inc(
&work->port->counters[CM_RECV_DUPLICATES][CM_REQ_COUNTER]);
/* Quick state check to discard duplicate REQs. */
spin_lock_irq(&cm_id_priv->lock);
if (cm_id_priv->id.state == IB_CM_REQ_RCVD) {
spin_unlock_irq(&cm_id_priv->lock);
return;
}
spin_unlock_irq(&cm_id_priv->lock);
ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg);
if (ret)
return;
spin_lock_irq(&cm_id_priv->lock);
switch (cm_id_priv->id.state) {
case IB_CM_MRA_REQ_SENT:
cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
CM_MSG_RESPONSE_REQ, cm_id_priv->service_timeout,
cm_id_priv->private_data,
cm_id_priv->private_data_len);
break;
case IB_CM_TIMEWAIT:
cm_format_rej((struct cm_rej_msg *)msg->mad, cm_id_priv,
IB_CM_REJ_STALE_CONN, NULL, 0, NULL, 0,
IB_CM_TIMEWAIT);
break;
default:
goto unlock;
}
spin_unlock_irq(&cm_id_priv->lock);
trace_icm_send_dup_req(&cm_id_priv->id);
ret = ib_post_send_mad(msg, NULL);
if (ret)
goto free;
return;
unlock: spin_unlock_irq(&cm_id_priv->lock);
free: cm_free_response_msg(msg);
}
static struct cm_id_private *cm_match_req(struct cm_work *work,
struct cm_id_private *cm_id_priv)
{
struct cm_id_private *listen_cm_id_priv, *cur_cm_id_priv;
struct cm_timewait_info *timewait_info;
struct cm_req_msg *req_msg;
req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
/* Check for possible duplicate REQ. */
spin_lock_irq(&cm.lock);
timewait_info = cm_insert_remote_id(cm_id_priv->timewait_info);
if (timewait_info) {
cur_cm_id_priv = cm_acquire_id(timewait_info->work.local_id,
timewait_info->work.remote_id);
spin_unlock_irq(&cm.lock);
if (cur_cm_id_priv) {
cm_dup_req_handler(work, cur_cm_id_priv);
cm_deref_id(cur_cm_id_priv);
}
return NULL;
}
/* Check for stale connections. */
timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info);
if (timewait_info) {
cm_remove_remote(cm_id_priv);
cur_cm_id_priv = cm_acquire_id(timewait_info->work.local_id,
timewait_info->work.remote_id);
spin_unlock_irq(&cm.lock);
cm_issue_rej(work->port, work->mad_recv_wc,
IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REQ,
NULL, 0);
if (cur_cm_id_priv) {
ib_send_cm_dreq(&cur_cm_id_priv->id, NULL, 0);
cm_deref_id(cur_cm_id_priv);
}
return NULL;
}
/* Find matching listen request. */
listen_cm_id_priv = cm_find_listen(
cm_id_priv->id.device,
cpu_to_be64(IBA_GET(CM_REQ_SERVICE_ID, req_msg)));
if (!listen_cm_id_priv) {
cm_remove_remote(cm_id_priv);
spin_unlock_irq(&cm.lock);
cm_issue_rej(work->port, work->mad_recv_wc,
IB_CM_REJ_INVALID_SERVICE_ID, CM_MSG_RESPONSE_REQ,
NULL, 0);
return NULL;
}
spin_unlock_irq(&cm.lock);
return listen_cm_id_priv;
}
/*
* Work-around for inter-subnet connections. If the LIDs are permissive,
* we need to override the LID/SL data in the REQ with the LID information
* in the work completion.
*/
static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc)
{
if (!IBA_GET(CM_REQ_PRIMARY_SUBNET_LOCAL, req_msg)) {
if (cpu_to_be16(IBA_GET(CM_REQ_PRIMARY_LOCAL_PORT_LID,
req_msg)) == IB_LID_PERMISSIVE) {
IBA_SET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg,
be16_to_cpu(ib_lid_be16(wc->slid)));
IBA_SET(CM_REQ_PRIMARY_SL, req_msg, wc->sl);
}
if (cpu_to_be16(IBA_GET(CM_REQ_PRIMARY_REMOTE_PORT_LID,
req_msg)) == IB_LID_PERMISSIVE)
IBA_SET(CM_REQ_PRIMARY_REMOTE_PORT_LID, req_msg,
wc->dlid_path_bits);
}
if (!IBA_GET(CM_REQ_ALTERNATE_SUBNET_LOCAL, req_msg)) {
if (cpu_to_be16(IBA_GET(CM_REQ_ALTERNATE_LOCAL_PORT_LID,
req_msg)) == IB_LID_PERMISSIVE) {
IBA_SET(CM_REQ_ALTERNATE_LOCAL_PORT_LID, req_msg,
be16_to_cpu(ib_lid_be16(wc->slid)));
IBA_SET(CM_REQ_ALTERNATE_SL, req_msg, wc->sl);
}
if (cpu_to_be16(IBA_GET(CM_REQ_ALTERNATE_REMOTE_PORT_LID,
req_msg)) == IB_LID_PERMISSIVE)
IBA_SET(CM_REQ_ALTERNATE_REMOTE_PORT_LID, req_msg,
wc->dlid_path_bits);
}
}
static int cm_req_handler(struct cm_work *work)
{
struct cm_id_private *cm_id_priv, *listen_cm_id_priv;
struct cm_req_msg *req_msg;
const struct ib_global_route *grh;
const struct ib_gid_attr *gid_attr;
int ret;
req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
cm_id_priv =
cm_alloc_id_priv(work->port->cm_dev->ib_device, NULL, NULL);
if (IS_ERR(cm_id_priv))
return PTR_ERR(cm_id_priv);
cm_id_priv->id.remote_id =
cpu_to_be32(IBA_GET(CM_REQ_LOCAL_COMM_ID, req_msg));
cm_id_priv->id.service_id =
cpu_to_be64(IBA_GET(CM_REQ_SERVICE_ID, req_msg));
cm_id_priv->tid = req_msg->hdr.tid;
cm_id_priv->timeout_ms = cm_convert_to_ms(
IBA_GET(CM_REQ_LOCAL_CM_RESPONSE_TIMEOUT, req_msg));
cm_id_priv->max_cm_retries = IBA_GET(CM_REQ_MAX_CM_RETRIES, req_msg);
cm_id_priv->remote_qpn =
cpu_to_be32(IBA_GET(CM_REQ_LOCAL_QPN, req_msg));
cm_id_priv->initiator_depth =
IBA_GET(CM_REQ_RESPONDER_RESOURCES, req_msg);
cm_id_priv->responder_resources =
IBA_GET(CM_REQ_INITIATOR_DEPTH, req_msg);
cm_id_priv->path_mtu = IBA_GET(CM_REQ_PATH_PACKET_PAYLOAD_MTU, req_msg);
cm_id_priv->pkey = cpu_to_be16(IBA_GET(CM_REQ_PARTITION_KEY, req_msg));
cm_id_priv->sq_psn = cpu_to_be32(IBA_GET(CM_REQ_STARTING_PSN, req_msg));
cm_id_priv->retry_count = IBA_GET(CM_REQ_RETRY_COUNT, req_msg);
cm_id_priv->rnr_retry_count = IBA_GET(CM_REQ_RNR_RETRY_COUNT, req_msg);
cm_id_priv->qp_type = cm_req_get_qp_type(req_msg);
ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
work->mad_recv_wc->recv_buf.grh,
&cm_id_priv->av);
if (ret)
goto destroy;
cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv->
id.local_id);
if (IS_ERR(cm_id_priv->timewait_info)) {
ret = PTR_ERR(cm_id_priv->timewait_info);
RDMA/cm: Fix an attempt to use non-valid pointer when cleaning timewait If cm_create_timewait_info() fails, the timewait_info pointer will contain an error value and will be used in cm_remove_remote() later. general protection fault, probably for non-canonical address 0xdffffc0000000024: 0000 [#1] SMP KASAN PTI KASAN: null-ptr-deref in range [0×0000000000000120-0×0000000000000127] CPU: 2 PID: 12446 Comm: syz-executor.3 Not tainted 5.10.0-rc5-5d4c0742a60e #27 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:cm_remove_remote.isra.0+0x24/0×170 drivers/infiniband/core/cm.c:978 Code: 84 00 00 00 00 00 41 54 55 53 48 89 fb 48 8d ab 2d 01 00 00 e8 7d bf 4b fe 48 89 ea 48 b8 00 00 00 00 00 fc ff df 48 c1 ea 03 <0f> b6 04 02 48 89 ea 83 e2 07 38 d0 7f 08 84 c0 0f 85 fc 00 00 00 RSP: 0018:ffff888013127918 EFLAGS: 00010006 RAX: dffffc0000000000 RBX: fffffffffffffff4 RCX: ffffc9000a18b000 RDX: 0000000000000024 RSI: ffffffff82edc573 RDI: fffffffffffffff4 RBP: 0000000000000121 R08: 0000000000000001 R09: ffffed1002624f1d R10: 0000000000000003 R11: ffffed1002624f1c R12: ffff888107760c70 R13: ffff888107760c40 R14: fffffffffffffff4 R15: ffff888107760c9c FS: 00007fe1ffcc1700(0000) GS:ffff88811a600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b2ff21000 CR3: 000000010f504001 CR4: 0000000000370ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: cm_destroy_id+0x189/0×15b0 drivers/infiniband/core/cm.c:1155 cma_connect_ib drivers/infiniband/core/cma.c:4029 [inline] rdma_connect_locked+0x1100/0×17c0 drivers/infiniband/core/cma.c:4107 rdma_connect+0x2a/0×40 drivers/infiniband/core/cma.c:4140 ucma_connect+0x277/0×340 drivers/infiniband/core/ucma.c:1069 ucma_write+0x236/0×2f0 drivers/infiniband/core/ucma.c:1724 vfs_write+0x220/0×830 fs/read_write.c:603 ksys_write+0x1df/0×240 fs/read_write.c:658 do_syscall_64+0x33/0×40 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: a977049dacde ("[PATCH] IB: Add the kernel CM implementation") Link: https://lore.kernel.org/r/20201204064205.145795-1-leon@kernel.org Reviewed-by: Maor Gottlieb <maorg@nvidia.com> Reported-by: Amit Matityahu <mitm@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2020-12-04 14:42:05 +08:00
cm_id_priv->timewait_info = NULL;
goto destroy;
}
cm_id_priv->timewait_info->work.remote_id = cm_id_priv->id.remote_id;
cm_id_priv->timewait_info->remote_ca_guid =
cpu_to_be64(IBA_GET(CM_REQ_LOCAL_CA_GUID, req_msg));
cm_id_priv->timewait_info->remote_qpn = cm_id_priv->remote_qpn;
/*
* Note that the ID pointer is not in the xarray at this point,
* so this set is only visible to the local thread.
*/
cm_id_priv->id.state = IB_CM_REQ_RCVD;
listen_cm_id_priv = cm_match_req(work, cm_id_priv);
if (!listen_cm_id_priv) {
trace_icm_no_listener_err(&cm_id_priv->id);
cm_id_priv->id.state = IB_CM_IDLE;
ret = -EINVAL;
goto destroy;
}
memset(&work->path[0], 0, sizeof(work->path[0]));
if (cm_req_has_alt_path(req_msg))
memset(&work->path[1], 0, sizeof(work->path[1]));
grh = rdma_ah_read_grh(&cm_id_priv->av.ah_attr);
gid_attr = grh->sgid_attr;
if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE) {
work->path[0].rec_type =
sa_conv_gid_to_pathrec_type(gid_attr->gid_type);
} else {
cm_process_routed_req(req_msg, work->mad_recv_wc->wc);
cm_path_set_rec_type(
work->port->cm_dev->ib_device, work->port->port_num,
&work->path[0],
IBA_GET_MEM_PTR(CM_REQ_PRIMARY_LOCAL_PORT_GID,
req_msg));
}
if (cm_req_has_alt_path(req_msg))
work->path[1].rec_type = work->path[0].rec_type;
cm_format_paths_from_req(req_msg, &work->path[0],
&work->path[1], work->mad_recv_wc->wc);
if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE)
sa_path_set_dmac(&work->path[0],
cm_id_priv->av.ah_attr.roce.dmac);
work->path[0].hop_limit = grh->hop_limit;
/* This destroy call is needed to pair with cm_init_av_for_response */
cm_destroy_av(&cm_id_priv->av);
ret = cm_init_av_by_path(&work->path[0], gid_attr, &cm_id_priv->av);
if (ret) {
int err;
err = rdma_query_gid(work->port->cm_dev->ib_device,
work->port->port_num, 0,
&work->path[0].sgid);
if (err)
ib_send_cm_rej(&cm_id_priv->id, IB_CM_REJ_INVALID_GID,
NULL, 0, NULL, 0);
else
ib_send_cm_rej(&cm_id_priv->id, IB_CM_REJ_INVALID_GID,
&work->path[0].sgid,
sizeof(work->path[0].sgid),
NULL, 0);
goto rejected;
}
if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_IB)
cm_id_priv->av.dlid_datapath =
IBA_GET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg);
if (cm_req_has_alt_path(req_msg)) {
ret = cm_init_av_by_path(&work->path[1], NULL,
&cm_id_priv->alt_av);
if (ret) {
ib_send_cm_rej(&cm_id_priv->id,
IB_CM_REJ_INVALID_ALT_GID,
&work->path[0].sgid,
sizeof(work->path[0].sgid), NULL, 0);
goto rejected;
}
}
cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler;
cm_id_priv->id.context = listen_cm_id_priv->id.context;
cm_format_req_event(work, cm_id_priv, &listen_cm_id_priv->id);
/* Now MAD handlers can see the new ID */
spin_lock_irq(&cm_id_priv->lock);
cm_finalize_id(cm_id_priv);
/* Refcount belongs to the event, pairs with cm_process_work() */
refcount_inc(&cm_id_priv->refcount);
cm_queue_work_unlock(cm_id_priv, work);
/*
* Since this ID was just created and was not made visible to other MAD
* handlers until the cm_finalize_id() above we know that the
* cm_process_work() will deliver the event and the listen_cm_id
* embedded in the event can be derefed here.
*/
cm_deref_id(listen_cm_id_priv);
return 0;
rejected:
cm_deref_id(listen_cm_id_priv);
destroy:
ib_destroy_cm_id(&cm_id_priv->id);
return ret;
}
static void cm_format_rep(struct cm_rep_msg *rep_msg,
struct cm_id_private *cm_id_priv,
struct ib_cm_rep_param *param)
{
cm_format_mad_ece_hdr(&rep_msg->hdr, CM_REP_ATTR_ID, cm_id_priv->tid,
param->ece.attr_mod);
IBA_SET(CM_REP_LOCAL_COMM_ID, rep_msg,
be32_to_cpu(cm_id_priv->id.local_id));
IBA_SET(CM_REP_REMOTE_COMM_ID, rep_msg,
be32_to_cpu(cm_id_priv->id.remote_id));
IBA_SET(CM_REP_STARTING_PSN, rep_msg, param->starting_psn);
IBA_SET(CM_REP_RESPONDER_RESOURCES, rep_msg,
param->responder_resources);
IBA_SET(CM_REP_TARGET_ACK_DELAY, rep_msg,
cm_id_priv->av.port->cm_dev->ack_delay);
IBA_SET(CM_REP_FAILOVER_ACCEPTED, rep_msg, param->failover_accepted);
IBA_SET(CM_REP_RNR_RETRY_COUNT, rep_msg, param->rnr_retry_count);
IBA_SET(CM_REP_LOCAL_CA_GUID, rep_msg,
be64_to_cpu(cm_id_priv->id.device->node_guid));
if (cm_id_priv->qp_type != IB_QPT_XRC_TGT) {
IBA_SET(CM_REP_INITIATOR_DEPTH, rep_msg,
param->initiator_depth);
IBA_SET(CM_REP_END_TO_END_FLOW_CONTROL, rep_msg,
param->flow_control);
IBA_SET(CM_REP_SRQ, rep_msg, param->srq);
IBA_SET(CM_REP_LOCAL_QPN, rep_msg, param->qp_num);
} else {
IBA_SET(CM_REP_SRQ, rep_msg, 1);
IBA_SET(CM_REP_LOCAL_EE_CONTEXT_NUMBER, rep_msg, param->qp_num);
}
IBA_SET(CM_REP_VENDOR_ID_L, rep_msg, param->ece.vendor_id);
IBA_SET(CM_REP_VENDOR_ID_M, rep_msg, param->ece.vendor_id >> 8);
IBA_SET(CM_REP_VENDOR_ID_H, rep_msg, param->ece.vendor_id >> 16);
if (param->private_data && param->private_data_len)
IBA_SET_MEM(CM_REP_PRIVATE_DATA, rep_msg, param->private_data,
param->private_data_len);
}
int ib_send_cm_rep(struct ib_cm_id *cm_id,
struct ib_cm_rep_param *param)
{
struct cm_id_private *cm_id_priv;
struct ib_mad_send_buf *msg;
struct cm_rep_msg *rep_msg;
unsigned long flags;
int ret;
if (param->private_data &&
param->private_data_len > IB_CM_REP_PRIVATE_DATA_SIZE)
return -EINVAL;
cm_id_priv = container_of(cm_id, struct cm_id_private, id);
spin_lock_irqsave(&cm_id_priv->lock, flags);
if (cm_id->state != IB_CM_REQ_RCVD &&
cm_id->state != IB_CM_MRA_REQ_SENT) {
trace_icm_send_rep_err(cm_id_priv->id.local_id, cm_id->state);
ret = -EINVAL;
goto out;
}
msg = cm_alloc_priv_msg(cm_id_priv);
if (IS_ERR(msg)) {
ret = PTR_ERR(msg);
goto out;
}
rep_msg = (struct cm_rep_msg *) msg->mad;
cm_format_rep(rep_msg, cm_id_priv, param);
msg->timeout_ms = cm_id_priv->timeout_ms;
msg->context[1] = (void *) (unsigned long) IB_CM_REP_SENT;
trace_icm_send_rep(cm_id);
ret = ib_post_send_mad(msg, NULL);
if (ret)
goto out_free;
cm_id->state = IB_CM_REP_SENT;
cm_id_priv->initiator_depth = param->initiator_depth;
cm_id_priv->responder_resources = param->responder_resources;
cm_id_priv->rq_psn = cpu_to_be32(IBA_GET(CM_REP_STARTING_PSN, rep_msg));
WARN_ONCE(param->qp_num & 0xFF000000,
"IBTA declares QPN to be 24 bits, but it is 0x%X\n",
param->qp_num);
cm_id_priv->local_qpn = cpu_to_be32(param->qp_num & 0xFFFFFF);
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
return 0;
out_free:
cm_free_priv_msg(msg);
out:
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
return ret;
}
EXPORT_SYMBOL(ib_send_cm_rep);
static void cm_format_rtu(struct cm_rtu_msg *rtu_msg,
struct cm_id_private *cm_id_priv,
const void *private_data,
u8 private_data_len)
{
cm_format_mad_hdr(&rtu_msg->hdr, CM_RTU_ATTR_ID, cm_id_priv->tid);
IBA_SET(CM_RTU_LOCAL_COMM_ID, rtu_msg,
be32_to_cpu(cm_id_priv->id.local_id));
IBA_SET(CM_RTU_REMOTE_COMM_ID, rtu_msg,
be32_to_cpu(cm_id_priv->id.remote_id));
if (private_data && private_data_len)
IBA_SET_MEM(CM_RTU_PRIVATE_DATA, rtu_msg, private_data,
private_data_len);
}
int ib_send_cm_rtu(struct ib_cm_id *cm_id,
const void *private_data,
u8 private_data_len)
{
struct cm_id_private *cm_id_priv;
struct ib_mad_send_buf *msg;
unsigned long flags;
void *data;
int ret;
if (private_data && private_data_len > IB_CM_RTU_PRIVATE_DATA_SIZE)
return -EINVAL;
data = cm_copy_private_data(private_data, private_data_len);
if (IS_ERR(data))
return PTR_ERR(data);
cm_id_priv = container_of(cm_id, struct cm_id_private, id);
spin_lock_irqsave(&cm_id_priv->lock, flags);
if (cm_id->state != IB_CM_REP_RCVD &&
cm_id->state != IB_CM_MRA_REP_SENT) {
trace_icm_send_cm_rtu_err(cm_id);
ret = -EINVAL;
goto error;
}
msg = cm_alloc_msg(cm_id_priv);
if (IS_ERR(msg)) {
ret = PTR_ERR(msg);
goto error;
}
cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv,
private_data, private_data_len);
trace_icm_send_rtu(cm_id);
ret = ib_post_send_mad(msg, NULL);
if (ret) {
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
cm_free_msg(msg);
kfree(data);
return ret;
}
cm_id->state = IB_CM_ESTABLISHED;
cm_set_private_data(cm_id_priv, data, private_data_len);
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
return 0;
error: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
kfree(data);
return ret;
}
EXPORT_SYMBOL(ib_send_cm_rtu);
static void cm_format_rep_event(struct cm_work *work, enum ib_qp_type qp_type)
{
struct cm_rep_msg *rep_msg;
struct ib_cm_rep_event_param *param;
rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad;
param = &work->cm_event.param.rep_rcvd;
param->remote_ca_guid =
cpu_to_be64(IBA_GET(CM_REP_LOCAL_CA_GUID, rep_msg));
param->remote_qkey = IBA_GET(CM_REP_LOCAL_Q_KEY, rep_msg);
param->remote_qpn = be32_to_cpu(cm_rep_get_qpn(rep_msg, qp_type));
param->starting_psn = IBA_GET(CM_REP_STARTING_PSN, rep_msg);
param->responder_resources = IBA_GET(CM_REP_INITIATOR_DEPTH, rep_msg);
param->initiator_depth = IBA_GET(CM_REP_RESPONDER_RESOURCES, rep_msg);
param->target_ack_delay = IBA_GET(CM_REP_TARGET_ACK_DELAY, rep_msg);
param->failover_accepted = IBA_GET(CM_REP_FAILOVER_ACCEPTED, rep_msg);
param->flow_control = IBA_GET(CM_REP_END_TO_END_FLOW_CONTROL, rep_msg);
param->rnr_retry_count = IBA_GET(CM_REP_RNR_RETRY_COUNT, rep_msg);
param->srq = IBA_GET(CM_REP_SRQ, rep_msg);
param->ece.vendor_id = IBA_GET(CM_REP_VENDOR_ID_H, rep_msg) << 16;
param->ece.vendor_id |= IBA_GET(CM_REP_VENDOR_ID_M, rep_msg) << 8;
param->ece.vendor_id |= IBA_GET(CM_REP_VENDOR_ID_L, rep_msg);
param->ece.attr_mod = be32_to_cpu(rep_msg->hdr.attr_mod);
work->cm_event.private_data =
IBA_GET_MEM_PTR(CM_REP_PRIVATE_DATA, rep_msg);
}
static void cm_dup_rep_handler(struct cm_work *work)
{
struct cm_id_private *cm_id_priv;
struct cm_rep_msg *rep_msg;
struct ib_mad_send_buf *msg = NULL;
int ret;
rep_msg = (struct cm_rep_msg *) work->mad_recv_wc->recv_buf.mad;
cm_id_priv = cm_acquire_id(
cpu_to_be32(IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)),
cpu_to_be32(IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg)));
if (!cm_id_priv)
return;
atomic_long_inc(
&work->port->counters[CM_RECV_DUPLICATES][CM_REP_COUNTER]);
ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg);
if (ret)
goto deref;
spin_lock_irq(&cm_id_priv->lock);
if (cm_id_priv->id.state == IB_CM_ESTABLISHED)
cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv,
cm_id_priv->private_data,
cm_id_priv->private_data_len);
else if (cm_id_priv->id.state == IB_CM_MRA_REP_SENT)
cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
CM_MSG_RESPONSE_REP, cm_id_priv->service_timeout,
cm_id_priv->private_data,
cm_id_priv->private_data_len);
else
goto unlock;
spin_unlock_irq(&cm_id_priv->lock);
trace_icm_send_dup_rep(&cm_id_priv->id);
ret = ib_post_send_mad(msg, NULL);
if (ret)
goto free;
goto deref;
unlock: spin_unlock_irq(&cm_id_priv->lock);
free: cm_free_response_msg(msg);
deref: cm_deref_id(cm_id_priv);
}
static int cm_rep_handler(struct cm_work *work)
{
struct cm_id_private *cm_id_priv;
struct cm_rep_msg *rep_msg;
int ret;
struct cm_id_private *cur_cm_id_priv;
struct cm_timewait_info *timewait_info;
rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad;
cm_id_priv = cm_acquire_id(
cpu_to_be32(IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)), 0);
if (!cm_id_priv) {
cm_dup_rep_handler(work);
trace_icm_remote_no_priv_err(
IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg));
return -EINVAL;
}
cm_format_rep_event(work, cm_id_priv->qp_type);
spin_lock_irq(&cm_id_priv->lock);
switch (cm_id_priv->id.state) {
case IB_CM_REQ_SENT:
case IB_CM_MRA_REQ_RCVD:
break;
default:
ret = -EINVAL;
trace_icm_rep_unknown_err(
IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg),
IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg),
cm_id_priv->id.state);
spin_unlock_irq(&cm_id_priv->lock);
goto error;
}
cm_id_priv->timewait_info->work.remote_id =
cpu_to_be32(IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg));
cm_id_priv->timewait_info->remote_ca_guid =
cpu_to_be64(IBA_GET(CM_REP_LOCAL_CA_GUID, rep_msg));
cm_id_priv->timewait_info->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type);
spin_lock(&cm.lock);
/* Check for duplicate REP. */
if (cm_insert_remote_id(cm_id_priv->timewait_info)) {
spin_unlock(&cm.lock);
spin_unlock_irq(&cm_id_priv->lock);
ret = -EINVAL;
trace_icm_insert_failed_err(
IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg));
goto error;
}
/* Check for a stale connection. */
timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info);
if (timewait_info) {
cm_remove_remote(cm_id_priv);
cur_cm_id_priv = cm_acquire_id(timewait_info->work.local_id,
timewait_info->work.remote_id);
spin_unlock(&cm.lock);
spin_unlock_irq(&cm_id_priv->lock);
cm_issue_rej(work->port, work->mad_recv_wc,
IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP,
NULL, 0);
ret = -EINVAL;
trace_icm_staleconn_err(
IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg),
IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg));
if (cur_cm_id_priv) {
ib_send_cm_dreq(&cur_cm_id_priv->id, NULL, 0);
cm_deref_id(cur_cm_id_priv);
}
goto error;
}
spin_unlock(&cm.lock);
cm_id_priv->id.state = IB_CM_REP_RCVD;
cm_id_priv->id.remote_id =
cpu_to_be32(IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg));
cm_id_priv->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type);
cm_id_priv->initiator_depth =
IBA_GET(CM_REP_RESPONDER_RESOURCES, rep_msg);
cm_id_priv->responder_resources =
IBA_GET(CM_REP_INITIATOR_DEPTH, rep_msg);
cm_id_priv->sq_psn = cpu_to_be32(IBA_GET(CM_REP_STARTING_PSN, rep_msg));
cm_id_priv->rnr_retry_count = IBA_GET(CM_REP_RNR_RETRY_COUNT, rep_msg);
cm_id_priv->target_ack_delay =
IBA_GET(CM_REP_TARGET_ACK_DELAY, rep_msg);
cm_id_priv->av.timeout =
cm_ack_timeout(cm_id_priv->target_ack_delay,
cm_id_priv->av.timeout - 1);
cm_id_priv->alt_av.timeout =
cm_ack_timeout(cm_id_priv->target_ack_delay,
cm_id_priv->alt_av.timeout - 1);
ib_cancel_mad(cm_id_priv->msg);
cm_queue_work_unlock(cm_id_priv, work);
return 0;
error:
cm_deref_id(cm_id_priv);
return ret;
}
static int cm_establish_handler(struct cm_work *work)
{
struct cm_id_private *cm_id_priv;
/* See comment in cm_establish about lookup. */
cm_id_priv = cm_acquire_id(work->local_id, work->remote_id);
if (!cm_id_priv)
return -EINVAL;
spin_lock_irq(&cm_id_priv->lock);
if (cm_id_priv->id.state != IB_CM_ESTABLISHED) {
spin_unlock_irq(&cm_id_priv->lock);
goto out;
}
ib_cancel_mad(cm_id_priv->msg);
cm_queue_work_unlock(cm_id_priv, work);
return 0;
out:
cm_deref_id(cm_id_priv);
return -EINVAL;
}
static int cm_rtu_handler(struct cm_work *work)
{
struct cm_id_private *cm_id_priv;
struct cm_rtu_msg *rtu_msg;
rtu_msg = (struct cm_rtu_msg *)work->mad_recv_wc->recv_buf.mad;
cm_id_priv = cm_acquire_id(
cpu_to_be32(IBA_GET(CM_RTU_REMOTE_COMM_ID, rtu_msg)),
cpu_to_be32(IBA_GET(CM_RTU_LOCAL_COMM_ID, rtu_msg)));
if (!cm_id_priv)
return -EINVAL;
work->cm_event.private_data =
IBA_GET_MEM_PTR(CM_RTU_PRIVATE_DATA, rtu_msg);
spin_lock_irq(&cm_id_priv->lock);
if (cm_id_priv->id.state != IB_CM_REP_SENT &&
cm_id_priv->id.state != IB_CM_MRA_REP_RCVD) {
spin_unlock_irq(&cm_id_priv->lock);
atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES]
[CM_RTU_COUNTER]);
goto out;
}
cm_id_priv->id.state = IB_CM_ESTABLISHED;
ib_cancel_mad(cm_id_priv->msg);
cm_queue_work_unlock(cm_id_priv, work);
return 0;
out:
cm_deref_id(cm_id_priv);
return -EINVAL;
}
static void cm_format_dreq(struct cm_dreq_msg *dreq_msg,
struct cm_id_private *cm_id_priv,
const void *private_data,
u8 private_data_len)
{
cm_format_mad_hdr(&dreq_msg->hdr, CM_DREQ_ATTR_ID,
cm_form_tid(cm_id_priv));
IBA_SET(CM_DREQ_LOCAL_COMM_ID, dreq_msg,
be32_to_cpu(cm_id_priv->id.local_id));
IBA_SET(CM_DREQ_REMOTE_COMM_ID, dreq_msg,
be32_to_cpu(cm_id_priv->id.remote_id));
IBA_SET(CM_DREQ_REMOTE_QPN_EECN, dreq_msg,
be32_to_cpu(cm_id_priv->remote_qpn));
if (private_data && private_data_len)
IBA_SET_MEM(CM_DREQ_PRIVATE_DATA, dreq_msg, private_data,
private_data_len);
}
static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv,
const void *private_data, u8 private_data_len)
{
struct ib_mad_send_buf *msg;
int ret;
lockdep_assert_held(&cm_id_priv->lock);
if (private_data && private_data_len > IB_CM_DREQ_PRIVATE_DATA_SIZE)
return -EINVAL;
if (cm_id_priv->id.state != IB_CM_ESTABLISHED) {
trace_icm_dreq_skipped(&cm_id_priv->id);
return -EINVAL;
}
if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT ||
cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD)
ib_cancel_mad(cm_id_priv->msg);
msg = cm_alloc_priv_msg(cm_id_priv);
if (IS_ERR(msg)) {
cm_enter_timewait(cm_id_priv);
return PTR_ERR(msg);
}
cm_format_dreq((struct cm_dreq_msg *) msg->mad, cm_id_priv,
private_data, private_data_len);
msg->timeout_ms = cm_id_priv->timeout_ms;
msg->context[1] = (void *) (unsigned long) IB_CM_DREQ_SENT;
trace_icm_send_dreq(&cm_id_priv->id);
ret = ib_post_send_mad(msg, NULL);
if (ret) {
cm_enter_timewait(cm_id_priv);
cm_free_priv_msg(msg);
return ret;
}
cm_id_priv->id.state = IB_CM_DREQ_SENT;
return 0;
}
int ib_send_cm_dreq(struct ib_cm_id *cm_id, const void *private_data,
u8 private_data_len)
{
struct cm_id_private *cm_id_priv =
container_of(cm_id, struct cm_id_private, id);
unsigned long flags;
int ret;
spin_lock_irqsave(&cm_id_priv->lock, flags);
ret = cm_send_dreq_locked(cm_id_priv, private_data, private_data_len);
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
return ret;
}
EXPORT_SYMBOL(ib_send_cm_dreq);
static void cm_format_drep(struct cm_drep_msg *drep_msg,
struct cm_id_private *cm_id_priv,
const void *private_data,
u8 private_data_len)
{
cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, cm_id_priv->tid);
IBA_SET(CM_DREP_LOCAL_COMM_ID, drep_msg,
be32_to_cpu(cm_id_priv->id.local_id));
IBA_SET(CM_DREP_REMOTE_COMM_ID, drep_msg,
be32_to_cpu(cm_id_priv->id.remote_id));
if (private_data && private_data_len)
IBA_SET_MEM(CM_DREP_PRIVATE_DATA, drep_msg, private_data,
private_data_len);
}
static int cm_send_drep_locked(struct cm_id_private *cm_id_priv,
void *private_data, u8 private_data_len)
{
struct ib_mad_send_buf *msg;
int ret;
lockdep_assert_held(&cm_id_priv->lock);
if (private_data && private_data_len > IB_CM_DREP_PRIVATE_DATA_SIZE)
return -EINVAL;
if (cm_id_priv->id.state != IB_CM_DREQ_RCVD) {
trace_icm_send_drep_err(&cm_id_priv->id);
kfree(private_data);
return -EINVAL;
}
cm_set_private_data(cm_id_priv, private_data, private_data_len);
cm_enter_timewait(cm_id_priv);
msg = cm_alloc_msg(cm_id_priv);
if (IS_ERR(msg))
return PTR_ERR(msg);
cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv,
private_data, private_data_len);
trace_icm_send_drep(&cm_id_priv->id);
ret = ib_post_send_mad(msg, NULL);
if (ret) {
cm_free_msg(msg);
return ret;
}
return 0;
}
int ib_send_cm_drep(struct ib_cm_id *cm_id, const void *private_data,
u8 private_data_len)
{
struct cm_id_private *cm_id_priv =
container_of(cm_id, struct cm_id_private, id);
unsigned long flags;
void *data;
int ret;
data = cm_copy_private_data(private_data, private_data_len);
if (IS_ERR(data))
return PTR_ERR(data);
spin_lock_irqsave(&cm_id_priv->lock, flags);
ret = cm_send_drep_locked(cm_id_priv, data, private_data_len);
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
return ret;
}
EXPORT_SYMBOL(ib_send_cm_drep);
static int cm_issue_drep(struct cm_port *port,
struct ib_mad_recv_wc *mad_recv_wc)
{
struct ib_mad_send_buf *msg = NULL;
struct cm_dreq_msg *dreq_msg;
struct cm_drep_msg *drep_msg;
int ret;
ret = cm_alloc_response_msg(port, mad_recv_wc, &msg);
if (ret)
return ret;
dreq_msg = (struct cm_dreq_msg *) mad_recv_wc->recv_buf.mad;
drep_msg = (struct cm_drep_msg *) msg->mad;
cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, dreq_msg->hdr.tid);
IBA_SET(CM_DREP_REMOTE_COMM_ID, drep_msg,
IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg));
IBA_SET(CM_DREP_LOCAL_COMM_ID, drep_msg,
IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg));
trace_icm_issue_drep(
IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg),
IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg));
ret = ib_post_send_mad(msg, NULL);
if (ret)
cm_free_response_msg(msg);
return ret;
}
static int cm_dreq_handler(struct cm_work *work)
{
struct cm_id_private *cm_id_priv;
struct cm_dreq_msg *dreq_msg;
struct ib_mad_send_buf *msg = NULL;
dreq_msg = (struct cm_dreq_msg *)work->mad_recv_wc->recv_buf.mad;
cm_id_priv = cm_acquire_id(
cpu_to_be32(IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg)),
cpu_to_be32(IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg)));
if (!cm_id_priv) {
atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES]
[CM_DREQ_COUNTER]);
cm_issue_drep(work->port, work->mad_recv_wc);
trace_icm_no_priv_err(
IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg),
IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg));
return -EINVAL;
}
work->cm_event.private_data =
IBA_GET_MEM_PTR(CM_DREQ_PRIVATE_DATA, dreq_msg);
spin_lock_irq(&cm_id_priv->lock);
if (cm_id_priv->local_qpn !=
cpu_to_be32(IBA_GET(CM_DREQ_REMOTE_QPN_EECN, dreq_msg)))
goto unlock;
switch (cm_id_priv->id.state) {
case IB_CM_REP_SENT:
case IB_CM_DREQ_SENT:
case IB_CM_MRA_REP_RCVD:
ib_cancel_mad(cm_id_priv->msg);
break;
case IB_CM_ESTABLISHED:
if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT ||
cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD)
ib_cancel_mad(cm_id_priv->msg);
break;
case IB_CM_TIMEWAIT:
atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES]
[CM_DREQ_COUNTER]);
msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc);
if (IS_ERR(msg))
goto unlock;
cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv,
cm_id_priv->private_data,
cm_id_priv->private_data_len);
spin_unlock_irq(&cm_id_priv->lock);
if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) ||
ib_post_send_mad(msg, NULL))
cm_free_response_msg(msg);
goto deref;
case IB_CM_DREQ_RCVD:
atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES]
[CM_DREQ_COUNTER]);
goto unlock;
default:
trace_icm_dreq_unknown_err(&cm_id_priv->id);
goto unlock;
}
cm_id_priv->id.state = IB_CM_DREQ_RCVD;
cm_id_priv->tid = dreq_msg->hdr.tid;
cm_queue_work_unlock(cm_id_priv, work);
return 0;
unlock: spin_unlock_irq(&cm_id_priv->lock);
deref: cm_deref_id(cm_id_priv);
return -EINVAL;
}
static int cm_drep_handler(struct cm_work *work)
{
struct cm_id_private *cm_id_priv;
struct cm_drep_msg *drep_msg;
drep_msg = (struct cm_drep_msg *)work->mad_recv_wc->recv_buf.mad;
cm_id_priv = cm_acquire_id(
cpu_to_be32(IBA_GET(CM_DREP_REMOTE_COMM_ID, drep_msg)),
cpu_to_be32(IBA_GET(CM_DREP_LOCAL_COMM_ID, drep_msg)));
if (!cm_id_priv)
return -EINVAL;
work->cm_event.private_data =
IBA_GET_MEM_PTR(CM_DREP_PRIVATE_DATA, drep_msg);
spin_lock_irq(&cm_id_priv->lock);
if (cm_id_priv->id.state != IB_CM_DREQ_SENT &&
cm_id_priv->id.state != IB_CM_DREQ_RCVD) {
spin_unlock_irq(&cm_id_priv->lock);
goto out;
}
cm_enter_timewait(cm_id_priv);
ib_cancel_mad(cm_id_priv->msg);
cm_queue_work_unlock(cm_id_priv, work);
return 0;
out:
cm_deref_id(cm_id_priv);
return -EINVAL;
}
static int cm_send_rej_locked(struct cm_id_private *cm_id_priv,
enum ib_cm_rej_reason reason, void *ari,
u8 ari_length, const void *private_data,
u8 private_data_len)
{
enum ib_cm_state state = cm_id_priv->id.state;
struct ib_mad_send_buf *msg;
int ret;
lockdep_assert_held(&cm_id_priv->lock);
if ((private_data && private_data_len > IB_CM_REJ_PRIVATE_DATA_SIZE) ||
(ari && ari_length > IB_CM_REJ_ARI_LENGTH))
return -EINVAL;
switch (state) {
case IB_CM_REQ_SENT:
case IB_CM_MRA_REQ_RCVD:
case IB_CM_REQ_RCVD:
case IB_CM_MRA_REQ_SENT:
case IB_CM_REP_RCVD:
case IB_CM_MRA_REP_SENT:
cm_reset_to_idle(cm_id_priv);
msg = cm_alloc_msg(cm_id_priv);
if (IS_ERR(msg))
return PTR_ERR(msg);
cm_format_rej((struct cm_rej_msg *)msg->mad, cm_id_priv, reason,
ari, ari_length, private_data, private_data_len,
state);
break;
case IB_CM_REP_SENT:
case IB_CM_MRA_REP_RCVD:
cm_enter_timewait(cm_id_priv);
msg = cm_alloc_msg(cm_id_priv);
if (IS_ERR(msg))
return PTR_ERR(msg);
cm_format_rej((struct cm_rej_msg *)msg->mad, cm_id_priv, reason,
ari, ari_length, private_data, private_data_len,
state);
break;
default:
trace_icm_send_unknown_rej_err(&cm_id_priv->id);
return -EINVAL;
}
trace_icm_send_rej(&cm_id_priv->id, reason);
ret = ib_post_send_mad(msg, NULL);
if (ret) {
cm_free_msg(msg);
return ret;
}
return 0;
}
int ib_send_cm_rej(struct ib_cm_id *cm_id, enum ib_cm_rej_reason reason,
void *ari, u8 ari_length, const void *private_data,
u8 private_data_len)
{
struct cm_id_private *cm_id_priv =
container_of(cm_id, struct cm_id_private, id);
unsigned long flags;
int ret;
spin_lock_irqsave(&cm_id_priv->lock, flags);
ret = cm_send_rej_locked(cm_id_priv, reason, ari, ari_length,
private_data, private_data_len);
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
return ret;
}
EXPORT_SYMBOL(ib_send_cm_rej);
static void cm_format_rej_event(struct cm_work *work)
{
struct cm_rej_msg *rej_msg;
struct ib_cm_rej_event_param *param;
rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad;
param = &work->cm_event.param.rej_rcvd;
param->ari = IBA_GET_MEM_PTR(CM_REJ_ARI, rej_msg);
param->ari_length = IBA_GET(CM_REJ_REJECTED_INFO_LENGTH, rej_msg);
param->reason = IBA_GET(CM_REJ_REASON, rej_msg);
work->cm_event.private_data =
IBA_GET_MEM_PTR(CM_REJ_PRIVATE_DATA, rej_msg);
}
static struct cm_id_private *cm_acquire_rejected_id(struct cm_rej_msg *rej_msg)
{
struct cm_id_private *cm_id_priv;
__be32 remote_id;
remote_id = cpu_to_be32(IBA_GET(CM_REJ_LOCAL_COMM_ID, rej_msg));
if (IBA_GET(CM_REJ_REASON, rej_msg) == IB_CM_REJ_TIMEOUT) {
cm_id_priv = cm_find_remote_id(
*((__be64 *)IBA_GET_MEM_PTR(CM_REJ_ARI, rej_msg)),
remote_id);
} else if (IBA_GET(CM_REJ_MESSAGE_REJECTED, rej_msg) ==
CM_MSG_RESPONSE_REQ)
cm_id_priv = cm_acquire_id(
cpu_to_be32(IBA_GET(CM_REJ_REMOTE_COMM_ID, rej_msg)),
0);
else
cm_id_priv = cm_acquire_id(
cpu_to_be32(IBA_GET(CM_REJ_REMOTE_COMM_ID, rej_msg)),
remote_id);
return cm_id_priv;
}
static int cm_rej_handler(struct cm_work *work)
{
struct cm_id_private *cm_id_priv;
struct cm_rej_msg *rej_msg;
rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad;
cm_id_priv = cm_acquire_rejected_id(rej_msg);
if (!cm_id_priv)
return -EINVAL;
cm_format_rej_event(work);
spin_lock_irq(&cm_id_priv->lock);
switch (cm_id_priv->id.state) {
case IB_CM_REQ_SENT:
case IB_CM_MRA_REQ_RCVD:
case IB_CM_REP_SENT:
case IB_CM_MRA_REP_RCVD:
ib_cancel_mad(cm_id_priv->msg);
fallthrough;
case IB_CM_REQ_RCVD:
case IB_CM_MRA_REQ_SENT:
if (IBA_GET(CM_REJ_REASON, rej_msg) == IB_CM_REJ_STALE_CONN)
cm_enter_timewait(cm_id_priv);
else
cm_reset_to_idle(cm_id_priv);
break;
case IB_CM_DREQ_SENT:
ib_cancel_mad(cm_id_priv->msg);
fallthrough;
case IB_CM_REP_RCVD:
case IB_CM_MRA_REP_SENT:
cm_enter_timewait(cm_id_priv);
break;
case IB_CM_ESTABLISHED:
if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT ||
cm_id_priv->id.lap_state == IB_CM_LAP_SENT) {
if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT)
ib_cancel_mad(cm_id_priv->msg);
cm_enter_timewait(cm_id_priv);
break;
}
fallthrough;
default:
trace_icm_rej_unknown_err(&cm_id_priv->id);
spin_unlock_irq(&cm_id_priv->lock);
goto out;
}
cm_queue_work_unlock(cm_id_priv, work);
return 0;
out:
cm_deref_id(cm_id_priv);
return -EINVAL;
}
int ib_send_cm_mra(struct ib_cm_id *cm_id,
u8 service_timeout,
const void *private_data,
u8 private_data_len)
{
struct cm_id_private *cm_id_priv;
struct ib_mad_send_buf *msg;
enum ib_cm_state cm_state;
enum ib_cm_lap_state lap_state;
enum cm_msg_response msg_response;
void *data;
unsigned long flags;
int ret;
if (private_data && private_data_len > IB_CM_MRA_PRIVATE_DATA_SIZE)
return -EINVAL;
data = cm_copy_private_data(private_data, private_data_len);
if (IS_ERR(data))
return PTR_ERR(data);
cm_id_priv = container_of(cm_id, struct cm_id_private, id);
spin_lock_irqsave(&cm_id_priv->lock, flags);
switch (cm_id_priv->id.state) {
case IB_CM_REQ_RCVD:
cm_state = IB_CM_MRA_REQ_SENT;
lap_state = cm_id->lap_state;
msg_response = CM_MSG_RESPONSE_REQ;
break;
case IB_CM_REP_RCVD:
cm_state = IB_CM_MRA_REP_SENT;
lap_state = cm_id->lap_state;
msg_response = CM_MSG_RESPONSE_REP;
break;
case IB_CM_ESTABLISHED:
if (cm_id->lap_state == IB_CM_LAP_RCVD) {
cm_state = cm_id->state;
lap_state = IB_CM_MRA_LAP_SENT;
msg_response = CM_MSG_RESPONSE_OTHER;
break;
}
fallthrough;
default:
trace_icm_send_mra_unknown_err(&cm_id_priv->id);
ret = -EINVAL;
goto error_unlock;
}
if (!(service_timeout & IB_CM_MRA_FLAG_DELAY)) {
msg = cm_alloc_msg(cm_id_priv);
if (IS_ERR(msg)) {
ret = PTR_ERR(msg);
goto error_unlock;
}
cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
msg_response, service_timeout,
private_data, private_data_len);
trace_icm_send_mra(cm_id);
ret = ib_post_send_mad(msg, NULL);
if (ret)
goto error_free_msg;
}
cm_id->state = cm_state;
cm_id->lap_state = lap_state;
cm_id_priv->service_timeout = service_timeout;
cm_set_private_data(cm_id_priv, data, private_data_len);
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
return 0;
error_free_msg:
cm_free_msg(msg);
error_unlock:
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
kfree(data);
return ret;
}
EXPORT_SYMBOL(ib_send_cm_mra);
static struct cm_id_private *cm_acquire_mraed_id(struct cm_mra_msg *mra_msg)
{
switch (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg)) {
case CM_MSG_RESPONSE_REQ:
return cm_acquire_id(
cpu_to_be32(IBA_GET(CM_MRA_REMOTE_COMM_ID, mra_msg)),
0);
case CM_MSG_RESPONSE_REP:
case CM_MSG_RESPONSE_OTHER:
return cm_acquire_id(
cpu_to_be32(IBA_GET(CM_MRA_REMOTE_COMM_ID, mra_msg)),
cpu_to_be32(IBA_GET(CM_MRA_LOCAL_COMM_ID, mra_msg)));
default:
return NULL;
}
}
static int cm_mra_handler(struct cm_work *work)
{
struct cm_id_private *cm_id_priv;
struct cm_mra_msg *mra_msg;
int timeout;
mra_msg = (struct cm_mra_msg *)work->mad_recv_wc->recv_buf.mad;
cm_id_priv = cm_acquire_mraed_id(mra_msg);
if (!cm_id_priv)
return -EINVAL;
work->cm_event.private_data =
IBA_GET_MEM_PTR(CM_MRA_PRIVATE_DATA, mra_msg);
work->cm_event.param.mra_rcvd.service_timeout =
IBA_GET(CM_MRA_SERVICE_TIMEOUT, mra_msg);
timeout = cm_convert_to_ms(IBA_GET(CM_MRA_SERVICE_TIMEOUT, mra_msg)) +
cm_convert_to_ms(cm_id_priv->av.timeout);
spin_lock_irq(&cm_id_priv->lock);
switch (cm_id_priv->id.state) {
case IB_CM_REQ_SENT:
if (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg) !=
CM_MSG_RESPONSE_REQ ||
ib_modify_mad(cm_id_priv->msg, timeout))
goto out;
cm_id_priv->id.state = IB_CM_MRA_REQ_RCVD;
break;
case IB_CM_REP_SENT:
if (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg) !=
CM_MSG_RESPONSE_REP ||
ib_modify_mad(cm_id_priv->msg, timeout))
goto out;
cm_id_priv->id.state = IB_CM_MRA_REP_RCVD;
break;
case IB_CM_ESTABLISHED:
if (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg) !=
CM_MSG_RESPONSE_OTHER ||
cm_id_priv->id.lap_state != IB_CM_LAP_SENT ||
ib_modify_mad(cm_id_priv->msg, timeout)) {
if (cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD)
atomic_long_inc(
&work->port->counters[CM_RECV_DUPLICATES]
[CM_MRA_COUNTER]);
goto out;
}
cm_id_priv->id.lap_state = IB_CM_MRA_LAP_RCVD;
break;
case IB_CM_MRA_REQ_RCVD:
case IB_CM_MRA_REP_RCVD:
atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES]
[CM_MRA_COUNTER]);
fallthrough;
default:
trace_icm_mra_unknown_err(&cm_id_priv->id);
goto out;
}
cm_id_priv->msg->context[1] = (void *) (unsigned long)
cm_id_priv->id.state;
cm_queue_work_unlock(cm_id_priv, work);
return 0;
out:
spin_unlock_irq(&cm_id_priv->lock);
cm_deref_id(cm_id_priv);
return -EINVAL;
}
static void cm_format_path_lid_from_lap(struct cm_lap_msg *lap_msg,
struct sa_path_rec *path)
{
u32 lid;
if (path->rec_type != SA_PATH_REC_TYPE_OPA) {
sa_path_set_dlid(path, IBA_GET(CM_LAP_ALTERNATE_LOCAL_PORT_LID,
lap_msg));
sa_path_set_slid(path, IBA_GET(CM_LAP_ALTERNATE_REMOTE_PORT_LID,
lap_msg));
} else {
lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR(
CM_LAP_ALTERNATE_LOCAL_PORT_GID, lap_msg));
sa_path_set_dlid(path, lid);
lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR(
CM_LAP_ALTERNATE_REMOTE_PORT_GID, lap_msg));
sa_path_set_slid(path, lid);
}
}
static void cm_format_path_from_lap(struct cm_id_private *cm_id_priv,
struct sa_path_rec *path,
struct cm_lap_msg *lap_msg)
{
path->dgid = *IBA_GET_MEM_PTR(CM_LAP_ALTERNATE_LOCAL_PORT_GID, lap_msg);
path->sgid =
*IBA_GET_MEM_PTR(CM_LAP_ALTERNATE_REMOTE_PORT_GID, lap_msg);
path->flow_label =
cpu_to_be32(IBA_GET(CM_LAP_ALTERNATE_FLOW_LABEL, lap_msg));
path->hop_limit = IBA_GET(CM_LAP_ALTERNATE_HOP_LIMIT, lap_msg);
path->traffic_class = IBA_GET(CM_LAP_ALTERNATE_TRAFFIC_CLASS, lap_msg);
path->reversible = 1;
path->pkey = cm_id_priv->pkey;
path->sl = IBA_GET(CM_LAP_ALTERNATE_SL, lap_msg);
path->mtu_selector = IB_SA_EQ;
path->mtu = cm_id_priv->path_mtu;
path->rate_selector = IB_SA_EQ;
path->rate = IBA_GET(CM_LAP_ALTERNATE_PACKET_RATE, lap_msg);
path->packet_life_time_selector = IB_SA_EQ;
path->packet_life_time =
IBA_GET(CM_LAP_ALTERNATE_LOCAL_ACK_TIMEOUT, lap_msg);
path->packet_life_time -= (path->packet_life_time > 0);
cm_format_path_lid_from_lap(lap_msg, path);
}
static int cm_lap_handler(struct cm_work *work)
{
struct cm_id_private *cm_id_priv;
struct cm_lap_msg *lap_msg;
struct ib_cm_lap_event_param *param;
struct ib_mad_send_buf *msg = NULL;
struct rdma_ah_attr ah_attr;
struct cm_av alt_av = {};
int ret;
/* Currently Alternate path messages are not supported for
* RoCE link layer.
*/
if (rdma_protocol_roce(work->port->cm_dev->ib_device,
work->port->port_num))
return -EINVAL;
/* todo: verify LAP request and send reject APR if invalid. */
lap_msg = (struct cm_lap_msg *)work->mad_recv_wc->recv_buf.mad;
cm_id_priv = cm_acquire_id(
cpu_to_be32(IBA_GET(CM_LAP_REMOTE_COMM_ID, lap_msg)),
cpu_to_be32(IBA_GET(CM_LAP_LOCAL_COMM_ID, lap_msg)));
if (!cm_id_priv)
return -EINVAL;
param = &work->cm_event.param.lap_rcvd;
memset(&work->path[0], 0, sizeof(work->path[1]));
cm_path_set_rec_type(work->port->cm_dev->ib_device,
work->port->port_num, &work->path[0],
IBA_GET_MEM_PTR(CM_LAP_ALTERNATE_LOCAL_PORT_GID,
lap_msg));
param->alternate_path = &work->path[0];
cm_format_path_from_lap(cm_id_priv, param->alternate_path, lap_msg);
work->cm_event.private_data =
IBA_GET_MEM_PTR(CM_LAP_PRIVATE_DATA, lap_msg);
ret = ib_init_ah_attr_from_wc(work->port->cm_dev->ib_device,
work->port->port_num,
work->mad_recv_wc->wc,
work->mad_recv_wc->recv_buf.grh,
&ah_attr);
if (ret)
goto deref;
ret = cm_init_av_by_path(param->alternate_path, NULL, &alt_av);
if (ret) {
rdma_destroy_ah_attr(&ah_attr);
goto deref;
}
spin_lock_irq(&cm_id_priv->lock);
cm_init_av_for_lap(work->port, work->mad_recv_wc->wc,
&ah_attr, &cm_id_priv->av);
cm_move_av_from_path(&cm_id_priv->alt_av, &alt_av);
if (cm_id_priv->id.state != IB_CM_ESTABLISHED)
goto unlock;
switch (cm_id_priv->id.lap_state) {
case IB_CM_LAP_UNINIT:
case IB_CM_LAP_IDLE:
break;
case IB_CM_MRA_LAP_SENT:
atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES]
[CM_LAP_COUNTER]);
msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc);
if (IS_ERR(msg))
goto unlock;
cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
CM_MSG_RESPONSE_OTHER,
cm_id_priv->service_timeout,
cm_id_priv->private_data,
cm_id_priv->private_data_len);
spin_unlock_irq(&cm_id_priv->lock);
if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) ||
ib_post_send_mad(msg, NULL))
cm_free_response_msg(msg);
goto deref;
case IB_CM_LAP_RCVD:
atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES]
[CM_LAP_COUNTER]);
goto unlock;
default:
goto unlock;
}
cm_id_priv->id.lap_state = IB_CM_LAP_RCVD;
cm_id_priv->tid = lap_msg->hdr.tid;
cm_queue_work_unlock(cm_id_priv, work);
return 0;
unlock: spin_unlock_irq(&cm_id_priv->lock);
deref: cm_deref_id(cm_id_priv);
return -EINVAL;
}
static int cm_apr_handler(struct cm_work *work)
{
struct cm_id_private *cm_id_priv;
struct cm_apr_msg *apr_msg;
/* Currently Alternate path messages are not supported for
* RoCE link layer.
*/
if (rdma_protocol_roce(work->port->cm_dev->ib_device,
work->port->port_num))
return -EINVAL;
apr_msg = (struct cm_apr_msg *)work->mad_recv_wc->recv_buf.mad;
cm_id_priv = cm_acquire_id(
cpu_to_be32(IBA_GET(CM_APR_REMOTE_COMM_ID, apr_msg)),
cpu_to_be32(IBA_GET(CM_APR_LOCAL_COMM_ID, apr_msg)));
if (!cm_id_priv)
return -EINVAL; /* Unmatched reply. */
work->cm_event.param.apr_rcvd.ap_status =
IBA_GET(CM_APR_AR_STATUS, apr_msg);
work->cm_event.param.apr_rcvd.apr_info =
IBA_GET_MEM_PTR(CM_APR_ADDITIONAL_INFORMATION, apr_msg);
work->cm_event.param.apr_rcvd.info_len =
IBA_GET(CM_APR_ADDITIONAL_INFORMATION_LENGTH, apr_msg);
work->cm_event.private_data =
IBA_GET_MEM_PTR(CM_APR_PRIVATE_DATA, apr_msg);
spin_lock_irq(&cm_id_priv->lock);
if (cm_id_priv->id.state != IB_CM_ESTABLISHED ||
(cm_id_priv->id.lap_state != IB_CM_LAP_SENT &&
cm_id_priv->id.lap_state != IB_CM_MRA_LAP_RCVD)) {
spin_unlock_irq(&cm_id_priv->lock);
goto out;
}
cm_id_priv->id.lap_state = IB_CM_LAP_IDLE;
ib_cancel_mad(cm_id_priv->msg);
cm_queue_work_unlock(cm_id_priv, work);
return 0;
out:
cm_deref_id(cm_id_priv);
return -EINVAL;
}
static int cm_timewait_handler(struct cm_work *work)
{
struct cm_timewait_info *timewait_info;
struct cm_id_private *cm_id_priv;
timewait_info = container_of(work, struct cm_timewait_info, work);
spin_lock_irq(&cm.lock);
list_del(&timewait_info->list);
spin_unlock_irq(&cm.lock);
cm_id_priv = cm_acquire_id(timewait_info->work.local_id,
timewait_info->work.remote_id);
if (!cm_id_priv)
return -EINVAL;
spin_lock_irq(&cm_id_priv->lock);
if (cm_id_priv->id.state != IB_CM_TIMEWAIT ||
cm_id_priv->remote_qpn != timewait_info->remote_qpn) {
spin_unlock_irq(&cm_id_priv->lock);
goto out;
}
cm_id_priv->id.state = IB_CM_IDLE;
cm_queue_work_unlock(cm_id_priv, work);
return 0;
out:
cm_deref_id(cm_id_priv);
return -EINVAL;
}
static void cm_format_sidr_req(struct cm_sidr_req_msg *sidr_req_msg,
struct cm_id_private *cm_id_priv,
struct ib_cm_sidr_req_param *param)
{
cm_format_mad_hdr(&sidr_req_msg->hdr, CM_SIDR_REQ_ATTR_ID,
cm_form_tid(cm_id_priv));
IBA_SET(CM_SIDR_REQ_REQUESTID, sidr_req_msg,
be32_to_cpu(cm_id_priv->id.local_id));
IBA_SET(CM_SIDR_REQ_PARTITION_KEY, sidr_req_msg,
be16_to_cpu(param->path->pkey));
IBA_SET(CM_SIDR_REQ_SERVICEID, sidr_req_msg,
be64_to_cpu(param->service_id));
if (param->private_data && param->private_data_len)
IBA_SET_MEM(CM_SIDR_REQ_PRIVATE_DATA, sidr_req_msg,
param->private_data, param->private_data_len);
}
int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
struct ib_cm_sidr_req_param *param)
{
struct cm_id_private *cm_id_priv;
struct ib_mad_send_buf *msg;
struct cm_av av = {};
unsigned long flags;
int ret;
if (!param->path || (param->private_data &&
param->private_data_len > IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE))
return -EINVAL;
cm_id_priv = container_of(cm_id, struct cm_id_private, id);
ret = cm_init_av_by_path(param->path, param->sgid_attr, &av);
if (ret)
return ret;
spin_lock_irqsave(&cm_id_priv->lock, flags);
cm_move_av_from_path(&cm_id_priv->av, &av);
cm_id->service_id = param->service_id;
cm_id_priv->timeout_ms = param->timeout_ms;
cm_id_priv->max_cm_retries = param->max_cm_retries;
if (cm_id->state != IB_CM_IDLE) {
ret = -EINVAL;
goto out_unlock;
}
msg = cm_alloc_priv_msg(cm_id_priv);
if (IS_ERR(msg)) {
ret = PTR_ERR(msg);
goto out_unlock;
}
cm_format_sidr_req((struct cm_sidr_req_msg *)msg->mad, cm_id_priv,
param);
msg->timeout_ms = cm_id_priv->timeout_ms;
msg->context[1] = (void *)(unsigned long)IB_CM_SIDR_REQ_SENT;
trace_icm_send_sidr_req(&cm_id_priv->id);
ret = ib_post_send_mad(msg, NULL);
if (ret)
goto out_free;
cm_id->state = IB_CM_SIDR_REQ_SENT;
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
return 0;
out_free:
cm_free_priv_msg(msg);
out_unlock:
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
return ret;
}
EXPORT_SYMBOL(ib_send_cm_sidr_req);
static void cm_format_sidr_req_event(struct cm_work *work,
const struct cm_id_private *rx_cm_id,
struct ib_cm_id *listen_id)
{
struct cm_sidr_req_msg *sidr_req_msg;
struct ib_cm_sidr_req_event_param *param;
sidr_req_msg = (struct cm_sidr_req_msg *)
work->mad_recv_wc->recv_buf.mad;
param = &work->cm_event.param.sidr_req_rcvd;
param->pkey = IBA_GET(CM_SIDR_REQ_PARTITION_KEY, sidr_req_msg);
param->listen_id = listen_id;
param->service_id =
cpu_to_be64(IBA_GET(CM_SIDR_REQ_SERVICEID, sidr_req_msg));
param->bth_pkey = cm_get_bth_pkey(work);
param->port = work->port->port_num;
param->sgid_attr = rx_cm_id->av.ah_attr.grh.sgid_attr;
work->cm_event.private_data =
IBA_GET_MEM_PTR(CM_SIDR_REQ_PRIVATE_DATA, sidr_req_msg);
}
static int cm_sidr_req_handler(struct cm_work *work)
{
struct cm_id_private *cm_id_priv, *listen_cm_id_priv;
struct cm_sidr_req_msg *sidr_req_msg;
struct ib_wc *wc;
int ret;
cm_id_priv =
cm_alloc_id_priv(work->port->cm_dev->ib_device, NULL, NULL);
if (IS_ERR(cm_id_priv))
return PTR_ERR(cm_id_priv);
/* Record SGID/SLID and request ID for lookup. */
sidr_req_msg = (struct cm_sidr_req_msg *)
work->mad_recv_wc->recv_buf.mad;
cm_id_priv->id.remote_id =
cpu_to_be32(IBA_GET(CM_SIDR_REQ_REQUESTID, sidr_req_msg));
cm_id_priv->id.service_id =
cpu_to_be64(IBA_GET(CM_SIDR_REQ_SERVICEID, sidr_req_msg));
cm_id_priv->tid = sidr_req_msg->hdr.tid;
wc = work->mad_recv_wc->wc;
cm_id_priv->sidr_slid = wc->slid;
ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
work->mad_recv_wc->recv_buf.grh,
&cm_id_priv->av);
if (ret)
goto out;
spin_lock_irq(&cm.lock);
listen_cm_id_priv = cm_insert_remote_sidr(cm_id_priv);
if (listen_cm_id_priv) {
spin_unlock_irq(&cm.lock);
atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES]
[CM_SIDR_REQ_COUNTER]);
goto out; /* Duplicate message. */
}
cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD;
listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device,
cm_id_priv->id.service_id);
if (!listen_cm_id_priv) {
spin_unlock_irq(&cm.lock);
ib_send_cm_sidr_rep(&cm_id_priv->id,
&(struct ib_cm_sidr_rep_param){
.status = IB_SIDR_UNSUPPORTED });
goto out; /* No match. */
}
spin_unlock_irq(&cm.lock);
cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler;
cm_id_priv->id.context = listen_cm_id_priv->id.context;
/*
* A SIDR ID does not need to be in the xarray since it does not receive
* mads, is not placed in the remote_id or remote_qpn rbtree, and does
* not enter timewait.
*/
cm_format_sidr_req_event(work, cm_id_priv, &listen_cm_id_priv->id);
ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->cm_event);
cm_free_work(work);
/*
* A pointer to the listen_cm_id is held in the event, so this deref
* must be after the event is delivered above.
*/
cm_deref_id(listen_cm_id_priv);
if (ret)
cm_destroy_id(&cm_id_priv->id, ret);
return 0;
out:
ib_destroy_cm_id(&cm_id_priv->id);
return -EINVAL;
}
static void cm_format_sidr_rep(struct cm_sidr_rep_msg *sidr_rep_msg,
struct cm_id_private *cm_id_priv,
struct ib_cm_sidr_rep_param *param)
{
cm_format_mad_ece_hdr(&sidr_rep_msg->hdr, CM_SIDR_REP_ATTR_ID,
cm_id_priv->tid, param->ece.attr_mod);
IBA_SET(CM_SIDR_REP_REQUESTID, sidr_rep_msg,
be32_to_cpu(cm_id_priv->id.remote_id));
IBA_SET(CM_SIDR_REP_STATUS, sidr_rep_msg, param->status);
IBA_SET(CM_SIDR_REP_QPN, sidr_rep_msg, param->qp_num);
IBA_SET(CM_SIDR_REP_SERVICEID, sidr_rep_msg,
be64_to_cpu(cm_id_priv->id.service_id));
IBA_SET(CM_SIDR_REP_Q_KEY, sidr_rep_msg, param->qkey);
IBA_SET(CM_SIDR_REP_VENDOR_ID_L, sidr_rep_msg,
param->ece.vendor_id & 0xFF);
IBA_SET(CM_SIDR_REP_VENDOR_ID_H, sidr_rep_msg,
(param->ece.vendor_id >> 8) & 0xFF);
if (param->info && param->info_length)
IBA_SET_MEM(CM_SIDR_REP_ADDITIONAL_INFORMATION, sidr_rep_msg,
param->info, param->info_length);
if (param->private_data && param->private_data_len)
IBA_SET_MEM(CM_SIDR_REP_PRIVATE_DATA, sidr_rep_msg,
param->private_data, param->private_data_len);
}
static int cm_send_sidr_rep_locked(struct cm_id_private *cm_id_priv,
struct ib_cm_sidr_rep_param *param)
{
struct ib_mad_send_buf *msg;
unsigned long flags;
int ret;
lockdep_assert_held(&cm_id_priv->lock);
if ((param->info && param->info_length > IB_CM_SIDR_REP_INFO_LENGTH) ||
(param->private_data &&
param->private_data_len > IB_CM_SIDR_REP_PRIVATE_DATA_SIZE))
return -EINVAL;
if (cm_id_priv->id.state != IB_CM_SIDR_REQ_RCVD)
return -EINVAL;
msg = cm_alloc_msg(cm_id_priv);
if (IS_ERR(msg))
return PTR_ERR(msg);
cm_format_sidr_rep((struct cm_sidr_rep_msg *) msg->mad, cm_id_priv,
param);
trace_icm_send_sidr_rep(&cm_id_priv->id);
ret = ib_post_send_mad(msg, NULL);
if (ret) {
cm_free_msg(msg);
return ret;
}
cm_id_priv->id.state = IB_CM_IDLE;
spin_lock_irqsave(&cm.lock, flags);
if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) {
rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
RB_CLEAR_NODE(&cm_id_priv->sidr_id_node);
}
spin_unlock_irqrestore(&cm.lock, flags);
return 0;
}
int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id,
struct ib_cm_sidr_rep_param *param)
{
struct cm_id_private *cm_id_priv =
container_of(cm_id, struct cm_id_private, id);
unsigned long flags;
int ret;
spin_lock_irqsave(&cm_id_priv->lock, flags);
ret = cm_send_sidr_rep_locked(cm_id_priv, param);
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
return ret;
}
EXPORT_SYMBOL(ib_send_cm_sidr_rep);
static void cm_format_sidr_rep_event(struct cm_work *work,
const struct cm_id_private *cm_id_priv)
{
struct cm_sidr_rep_msg *sidr_rep_msg;
struct ib_cm_sidr_rep_event_param *param;
sidr_rep_msg = (struct cm_sidr_rep_msg *)
work->mad_recv_wc->recv_buf.mad;
param = &work->cm_event.param.sidr_rep_rcvd;
param->status = IBA_GET(CM_SIDR_REP_STATUS, sidr_rep_msg);
param->qkey = IBA_GET(CM_SIDR_REP_Q_KEY, sidr_rep_msg);
param->qpn = IBA_GET(CM_SIDR_REP_QPN, sidr_rep_msg);
param->info = IBA_GET_MEM_PTR(CM_SIDR_REP_ADDITIONAL_INFORMATION,
sidr_rep_msg);
param->info_len = IBA_GET(CM_SIDR_REP_ADDITIONAL_INFORMATION_LENGTH,
sidr_rep_msg);
param->sgid_attr = cm_id_priv->av.ah_attr.grh.sgid_attr;
work->cm_event.private_data =
IBA_GET_MEM_PTR(CM_SIDR_REP_PRIVATE_DATA, sidr_rep_msg);
}
static int cm_sidr_rep_handler(struct cm_work *work)
{
struct cm_sidr_rep_msg *sidr_rep_msg;
struct cm_id_private *cm_id_priv;
sidr_rep_msg = (struct cm_sidr_rep_msg *)
work->mad_recv_wc->recv_buf.mad;
cm_id_priv = cm_acquire_id(
cpu_to_be32(IBA_GET(CM_SIDR_REP_REQUESTID, sidr_rep_msg)), 0);
if (!cm_id_priv)
return -EINVAL; /* Unmatched reply. */
spin_lock_irq(&cm_id_priv->lock);
if (cm_id_priv->id.state != IB_CM_SIDR_REQ_SENT) {
spin_unlock_irq(&cm_id_priv->lock);
goto out;
}
cm_id_priv->id.state = IB_CM_IDLE;
ib_cancel_mad(cm_id_priv->msg);
spin_unlock_irq(&cm_id_priv->lock);
cm_format_sidr_rep_event(work, cm_id_priv);
cm_process_work(cm_id_priv, work);
return 0;
out:
cm_deref_id(cm_id_priv);
return -EINVAL;
}
static void cm_process_send_error(struct cm_id_private *cm_id_priv,
struct ib_mad_send_buf *msg,
enum ib_cm_state state,
enum ib_wc_status wc_status)
{
struct ib_cm_event cm_event = {};
int ret;
/* Discard old sends or ones without a response. */
spin_lock_irq(&cm_id_priv->lock);
if (msg != cm_id_priv->msg) {
spin_unlock_irq(&cm_id_priv->lock);
cm_free_msg(msg);
return;
}
cm_free_priv_msg(msg);
if (state != cm_id_priv->id.state || wc_status == IB_WC_SUCCESS ||
wc_status == IB_WC_WR_FLUSH_ERR)
goto out_unlock;
trace_icm_mad_send_err(state, wc_status);
switch (state) {
case IB_CM_REQ_SENT:
case IB_CM_MRA_REQ_RCVD:
cm_reset_to_idle(cm_id_priv);
cm_event.event = IB_CM_REQ_ERROR;
break;
case IB_CM_REP_SENT:
case IB_CM_MRA_REP_RCVD:
cm_reset_to_idle(cm_id_priv);
cm_event.event = IB_CM_REP_ERROR;
break;
case IB_CM_DREQ_SENT:
cm_enter_timewait(cm_id_priv);
cm_event.event = IB_CM_DREQ_ERROR;
break;
case IB_CM_SIDR_REQ_SENT:
cm_id_priv->id.state = IB_CM_IDLE;
cm_event.event = IB_CM_SIDR_REQ_ERROR;
break;
default:
goto out_unlock;
}
spin_unlock_irq(&cm_id_priv->lock);
cm_event.param.send_status = wc_status;
/* No other events can occur on the cm_id at this point. */
ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &cm_event);
if (ret)
ib_destroy_cm_id(&cm_id_priv->id);
return;
out_unlock:
spin_unlock_irq(&cm_id_priv->lock);
}
static void cm_send_handler(struct ib_mad_agent *mad_agent,
struct ib_mad_send_wc *mad_send_wc)
{
struct ib_mad_send_buf *msg = mad_send_wc->send_buf;
struct cm_id_private *cm_id_priv = msg->context[0];
enum ib_cm_state state =
(enum ib_cm_state)(unsigned long)msg->context[1];
struct cm_port *port;
u16 attr_index;
port = mad_agent->context;
attr_index = be16_to_cpu(((struct ib_mad_hdr *)
msg->mad)->attr_id) - CM_ATTR_ID_OFFSET;
/*
* If the send was in response to a received message (context[0] is not
* set to a cm_id), and is not a REJ, then it is a send that was
* manually retried.
*/
if (!cm_id_priv && (attr_index != CM_REJ_COUNTER))
msg->retries = 1;
atomic_long_add(1 + msg->retries, &port->counters[CM_XMIT][attr_index]);
if (msg->retries)
atomic_long_add(msg->retries,
&port->counters[CM_XMIT_RETRIES][attr_index]);
if (cm_id_priv)
cm_process_send_error(cm_id_priv, msg, state,
mad_send_wc->status);
else
cm_free_response_msg(msg);
}
static void cm_work_handler(struct work_struct *_work)
{
struct cm_work *work = container_of(_work, struct cm_work, work.work);
int ret;
switch (work->cm_event.event) {
case IB_CM_REQ_RECEIVED:
ret = cm_req_handler(work);
break;
case IB_CM_MRA_RECEIVED:
ret = cm_mra_handler(work);
break;
case IB_CM_REJ_RECEIVED:
ret = cm_rej_handler(work);
break;
case IB_CM_REP_RECEIVED:
ret = cm_rep_handler(work);
break;
case IB_CM_RTU_RECEIVED:
ret = cm_rtu_handler(work);
break;
case IB_CM_USER_ESTABLISHED:
ret = cm_establish_handler(work);
break;
case IB_CM_DREQ_RECEIVED:
ret = cm_dreq_handler(work);
break;
case IB_CM_DREP_RECEIVED:
ret = cm_drep_handler(work);
break;
case IB_CM_SIDR_REQ_RECEIVED:
ret = cm_sidr_req_handler(work);
break;
case IB_CM_SIDR_REP_RECEIVED:
ret = cm_sidr_rep_handler(work);
break;
case IB_CM_LAP_RECEIVED:
ret = cm_lap_handler(work);
break;
case IB_CM_APR_RECEIVED:
ret = cm_apr_handler(work);
break;
case IB_CM_TIMEWAIT_EXIT:
ret = cm_timewait_handler(work);
break;
default:
trace_icm_handler_err(work->cm_event.event);
ret = -EINVAL;
break;
}
if (ret)
cm_free_work(work);
}
static int cm_establish(struct ib_cm_id *cm_id)
{
struct cm_id_private *cm_id_priv;
struct cm_work *work;
unsigned long flags;
int ret = 0;
struct cm_device *cm_dev;
cm_dev = ib_get_client_data(cm_id->device, &cm_client);
if (!cm_dev)
return -ENODEV;
work = kmalloc(sizeof *work, GFP_ATOMIC);
if (!work)
return -ENOMEM;
cm_id_priv = container_of(cm_id, struct cm_id_private, id);
spin_lock_irqsave(&cm_id_priv->lock, flags);
switch (cm_id->state) {
case IB_CM_REP_SENT:
case IB_CM_MRA_REP_RCVD:
cm_id->state = IB_CM_ESTABLISHED;
break;
case IB_CM_ESTABLISHED:
ret = -EISCONN;
break;
default:
trace_icm_establish_err(cm_id);
ret = -EINVAL;
break;
}
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
if (ret) {
kfree(work);
goto out;
}
/*
* The CM worker thread may try to destroy the cm_id before it
* can execute this work item. To prevent potential deadlock,
* we need to find the cm_id once we're in the context of the
* worker thread, rather than holding a reference on it.
*/
INIT_DELAYED_WORK(&work->work, cm_work_handler);
work->local_id = cm_id->local_id;
work->remote_id = cm_id->remote_id;
work->mad_recv_wc = NULL;
work->cm_event.event = IB_CM_USER_ESTABLISHED;
/* Check if the device started its remove_one */
IB/cm: Fix a recently introduced locking bug ib_cm_notify() can be called from interrupt context. Hence do not reenable interrupts unconditionally in cm_establish(). This patch avoids that lockdep reports the following warning: WARNING: CPU: 0 PID: 23317 at kernel/locking/lockdep.c:2624 trace _hardirqs_on_caller+0x112/0x1b0 DEBUG_LOCKS_WARN_ON(current->hardirq_context) Call Trace: <IRQ> [<ffffffff812bd0e5>] dump_stack+0x67/0x92 [<ffffffff81056f21>] __warn+0xc1/0xe0 [<ffffffff81056f8a>] warn_slowpath_fmt+0x4a/0x50 [<ffffffff810a5932>] trace_hardirqs_on_caller+0x112/0x1b0 [<ffffffff810a59dd>] trace_hardirqs_on+0xd/0x10 [<ffffffff815992c7>] _raw_spin_unlock_irq+0x27/0x40 [<ffffffffa0382e9c>] ib_cm_notify+0x25c/0x290 [ib_cm] [<ffffffffa068fbc1>] srpt_qp_event+0xa1/0xf0 [ib_srpt] [<ffffffffa04efb97>] mlx4_ib_qp_event+0x67/0xd0 [mlx4_ib] [<ffffffffa034ec0a>] mlx4_qp_event+0x5a/0xc0 [mlx4_core] [<ffffffffa03365f8>] mlx4_eq_int+0x3d8/0xcf0 [mlx4_core] [<ffffffffa0336f9c>] mlx4_msi_x_interrupt+0xc/0x20 [mlx4_core] [<ffffffff810b0914>] handle_irq_event_percpu+0x64/0x100 [<ffffffff810b09e4>] handle_irq_event+0x34/0x60 [<ffffffff810b3a6a>] handle_edge_irq+0x6a/0x150 [<ffffffff8101ad05>] handle_irq+0x15/0x20 [<ffffffff8101a66c>] do_IRQ+0x5c/0x110 [<ffffffff8159a2c9>] common_interrupt+0x89/0x89 [<ffffffff81297a17>] blk_run_queue_async+0x37/0x40 [<ffffffffa0163e53>] rq_completed+0x43/0x70 [dm_mod] [<ffffffffa0164896>] dm_softirq_done+0x176/0x280 [dm_mod] [<ffffffff812a26c2>] blk_done_softirq+0x52/0x90 [<ffffffff8105bc1f>] __do_softirq+0x10f/0x230 [<ffffffff8105bec8>] irq_exit+0xa8/0xb0 [<ffffffff8103653e>] smp_trace_call_function_single_interrupt+0x2e/0x30 [<ffffffff81036549>] smp_call_function_single_interrupt+0x9/0x10 [<ffffffff8159a959>] call_function_single_interrupt+0x89/0x90 <EOI> Fixes: commit be4b499323bf (IB/cm: Do not queue work to a device that's going away) Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com> Cc: Erez Shitrit <erezsh@mellanox.com> Cc: Sean Hefty <sean.hefty@intel.com> Cc: Nikolay Borisov <kernel@kyup.com> Cc: stable <stable@vger.kernel.org> # v4.2+ Acked-by: Erez Shitrit <erezsh@mellanox.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2016-03-25 23:33:16 +08:00
spin_lock_irqsave(&cm.lock, flags);
if (!cm_dev->going_down) {
queue_delayed_work(cm.wq, &work->work, 0);
} else {
kfree(work);
ret = -ENODEV;
}
IB/cm: Fix a recently introduced locking bug ib_cm_notify() can be called from interrupt context. Hence do not reenable interrupts unconditionally in cm_establish(). This patch avoids that lockdep reports the following warning: WARNING: CPU: 0 PID: 23317 at kernel/locking/lockdep.c:2624 trace _hardirqs_on_caller+0x112/0x1b0 DEBUG_LOCKS_WARN_ON(current->hardirq_context) Call Trace: <IRQ> [<ffffffff812bd0e5>] dump_stack+0x67/0x92 [<ffffffff81056f21>] __warn+0xc1/0xe0 [<ffffffff81056f8a>] warn_slowpath_fmt+0x4a/0x50 [<ffffffff810a5932>] trace_hardirqs_on_caller+0x112/0x1b0 [<ffffffff810a59dd>] trace_hardirqs_on+0xd/0x10 [<ffffffff815992c7>] _raw_spin_unlock_irq+0x27/0x40 [<ffffffffa0382e9c>] ib_cm_notify+0x25c/0x290 [ib_cm] [<ffffffffa068fbc1>] srpt_qp_event+0xa1/0xf0 [ib_srpt] [<ffffffffa04efb97>] mlx4_ib_qp_event+0x67/0xd0 [mlx4_ib] [<ffffffffa034ec0a>] mlx4_qp_event+0x5a/0xc0 [mlx4_core] [<ffffffffa03365f8>] mlx4_eq_int+0x3d8/0xcf0 [mlx4_core] [<ffffffffa0336f9c>] mlx4_msi_x_interrupt+0xc/0x20 [mlx4_core] [<ffffffff810b0914>] handle_irq_event_percpu+0x64/0x100 [<ffffffff810b09e4>] handle_irq_event+0x34/0x60 [<ffffffff810b3a6a>] handle_edge_irq+0x6a/0x150 [<ffffffff8101ad05>] handle_irq+0x15/0x20 [<ffffffff8101a66c>] do_IRQ+0x5c/0x110 [<ffffffff8159a2c9>] common_interrupt+0x89/0x89 [<ffffffff81297a17>] blk_run_queue_async+0x37/0x40 [<ffffffffa0163e53>] rq_completed+0x43/0x70 [dm_mod] [<ffffffffa0164896>] dm_softirq_done+0x176/0x280 [dm_mod] [<ffffffff812a26c2>] blk_done_softirq+0x52/0x90 [<ffffffff8105bc1f>] __do_softirq+0x10f/0x230 [<ffffffff8105bec8>] irq_exit+0xa8/0xb0 [<ffffffff8103653e>] smp_trace_call_function_single_interrupt+0x2e/0x30 [<ffffffff81036549>] smp_call_function_single_interrupt+0x9/0x10 [<ffffffff8159a959>] call_function_single_interrupt+0x89/0x90 <EOI> Fixes: commit be4b499323bf (IB/cm: Do not queue work to a device that's going away) Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com> Cc: Erez Shitrit <erezsh@mellanox.com> Cc: Sean Hefty <sean.hefty@intel.com> Cc: Nikolay Borisov <kernel@kyup.com> Cc: stable <stable@vger.kernel.org> # v4.2+ Acked-by: Erez Shitrit <erezsh@mellanox.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2016-03-25 23:33:16 +08:00
spin_unlock_irqrestore(&cm.lock, flags);
out:
return ret;
}
static int cm_migrate(struct ib_cm_id *cm_id)
{
struct cm_id_private *cm_id_priv;
unsigned long flags;
int ret = 0;
cm_id_priv = container_of(cm_id, struct cm_id_private, id);
spin_lock_irqsave(&cm_id_priv->lock, flags);
if (cm_id->state == IB_CM_ESTABLISHED &&
(cm_id->lap_state == IB_CM_LAP_UNINIT ||
cm_id->lap_state == IB_CM_LAP_IDLE)) {
cm_id->lap_state = IB_CM_LAP_IDLE;
cm_id_priv->av = cm_id_priv->alt_av;
} else
ret = -EINVAL;
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
return ret;
}
int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event)
{
int ret;
switch (event) {
case IB_EVENT_COMM_EST:
ret = cm_establish(cm_id);
break;
case IB_EVENT_PATH_MIG:
ret = cm_migrate(cm_id);
break;
default:
ret = -EINVAL;
}
return ret;
}
EXPORT_SYMBOL(ib_cm_notify);
static void cm_recv_handler(struct ib_mad_agent *mad_agent,
struct ib_mad_send_buf *send_buf,
struct ib_mad_recv_wc *mad_recv_wc)
{
struct cm_port *port = mad_agent->context;
struct cm_work *work;
enum ib_cm_event_type event;
bool alt_path = false;
u16 attr_id;
int paths = 0;
int going_down = 0;
switch (mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) {
case CM_REQ_ATTR_ID:
alt_path = cm_req_has_alt_path((struct cm_req_msg *)
mad_recv_wc->recv_buf.mad);
paths = 1 + (alt_path != 0);
event = IB_CM_REQ_RECEIVED;
break;
case CM_MRA_ATTR_ID:
event = IB_CM_MRA_RECEIVED;
break;
case CM_REJ_ATTR_ID:
event = IB_CM_REJ_RECEIVED;
break;
case CM_REP_ATTR_ID:
event = IB_CM_REP_RECEIVED;
break;
case CM_RTU_ATTR_ID:
event = IB_CM_RTU_RECEIVED;
break;
case CM_DREQ_ATTR_ID:
event = IB_CM_DREQ_RECEIVED;
break;
case CM_DREP_ATTR_ID:
event = IB_CM_DREP_RECEIVED;
break;
case CM_SIDR_REQ_ATTR_ID:
event = IB_CM_SIDR_REQ_RECEIVED;
break;
case CM_SIDR_REP_ATTR_ID:
event = IB_CM_SIDR_REP_RECEIVED;
break;
case CM_LAP_ATTR_ID:
paths = 1;
event = IB_CM_LAP_RECEIVED;
break;
case CM_APR_ATTR_ID:
event = IB_CM_APR_RECEIVED;
break;
default:
ib_free_recv_mad(mad_recv_wc);
return;
}
attr_id = be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.attr_id);
atomic_long_inc(&port->counters[CM_RECV][attr_id - CM_ATTR_ID_OFFSET]);
work = kmalloc(struct_size(work, path, paths), GFP_KERNEL);
if (!work) {
ib_free_recv_mad(mad_recv_wc);
return;
}
INIT_DELAYED_WORK(&work->work, cm_work_handler);
work->cm_event.event = event;
work->mad_recv_wc = mad_recv_wc;
work->port = port;
/* Check if the device started its remove_one */
spin_lock_irq(&cm.lock);
if (!port->cm_dev->going_down)
queue_delayed_work(cm.wq, &work->work, 0);
else
going_down = 1;
spin_unlock_irq(&cm.lock);
if (going_down) {
kfree(work);
ib_free_recv_mad(mad_recv_wc);
}
}
static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv,
struct ib_qp_attr *qp_attr,
int *qp_attr_mask)
{
unsigned long flags;
int ret;
spin_lock_irqsave(&cm_id_priv->lock, flags);
switch (cm_id_priv->id.state) {
case IB_CM_REQ_SENT:
case IB_CM_MRA_REQ_RCVD:
case IB_CM_REQ_RCVD:
case IB_CM_MRA_REQ_SENT:
case IB_CM_REP_RCVD:
case IB_CM_MRA_REP_SENT:
case IB_CM_REP_SENT:
case IB_CM_MRA_REP_RCVD:
case IB_CM_ESTABLISHED:
*qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS |
IB_QP_PKEY_INDEX | IB_QP_PORT;
qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE;
if (cm_id_priv->responder_resources) {
struct ib_device *ib_dev = cm_id_priv->id.device;
u64 support_flush = ib_dev->attrs.device_cap_flags &
(IB_DEVICE_FLUSH_GLOBAL | IB_DEVICE_FLUSH_PERSISTENT);
u32 flushable = support_flush ?
(IB_ACCESS_FLUSH_GLOBAL |
IB_ACCESS_FLUSH_PERSISTENT) : 0;
qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_READ |
IB_ACCESS_REMOTE_ATOMIC |
flushable;
}
qp_attr->pkey_index = cm_id_priv->av.pkey_index;
if (cm_id_priv->av.port)
qp_attr->port_num = cm_id_priv->av.port->port_num;
ret = 0;
break;
default:
trace_icm_qp_init_err(&cm_id_priv->id);
ret = -EINVAL;
break;
}
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
return ret;
}
static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv,
struct ib_qp_attr *qp_attr,
int *qp_attr_mask)
{
unsigned long flags;
int ret;
spin_lock_irqsave(&cm_id_priv->lock, flags);
switch (cm_id_priv->id.state) {
case IB_CM_REQ_RCVD:
case IB_CM_MRA_REQ_SENT:
case IB_CM_REP_RCVD:
case IB_CM_MRA_REP_SENT:
case IB_CM_REP_SENT:
case IB_CM_MRA_REP_RCVD:
case IB_CM_ESTABLISHED:
*qp_attr_mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU |
IB_QP_DEST_QPN | IB_QP_RQ_PSN;
qp_attr->ah_attr = cm_id_priv->av.ah_attr;
if ((qp_attr->ah_attr.type == RDMA_AH_ATTR_TYPE_IB) &&
cm_id_priv->av.dlid_datapath &&
(cm_id_priv->av.dlid_datapath != 0xffff))
qp_attr->ah_attr.ib.dlid = cm_id_priv->av.dlid_datapath;
qp_attr->path_mtu = cm_id_priv->path_mtu;
qp_attr->dest_qp_num = be32_to_cpu(cm_id_priv->remote_qpn);
qp_attr->rq_psn = be32_to_cpu(cm_id_priv->rq_psn);
if (cm_id_priv->qp_type == IB_QPT_RC ||
cm_id_priv->qp_type == IB_QPT_XRC_TGT) {
*qp_attr_mask |= IB_QP_MAX_DEST_RD_ATOMIC |
IB_QP_MIN_RNR_TIMER;
qp_attr->max_dest_rd_atomic =
cm_id_priv->responder_resources;
qp_attr->min_rnr_timer = 0;
}
if (rdma_ah_get_dlid(&cm_id_priv->alt_av.ah_attr) &&
cm_id_priv->alt_av.port) {
*qp_attr_mask |= IB_QP_ALT_PATH;
qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num;
qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index;
qp_attr->alt_timeout = cm_id_priv->alt_av.timeout;
qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr;
}
ret = 0;
break;
default:
trace_icm_qp_rtr_err(&cm_id_priv->id);
ret = -EINVAL;
break;
}
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
return ret;
}
static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv,
struct ib_qp_attr *qp_attr,
int *qp_attr_mask)
{
unsigned long flags;
int ret;
spin_lock_irqsave(&cm_id_priv->lock, flags);
switch (cm_id_priv->id.state) {
/* Allow transition to RTS before sending REP */
case IB_CM_REQ_RCVD:
case IB_CM_MRA_REQ_SENT:
case IB_CM_REP_RCVD:
case IB_CM_MRA_REP_SENT:
case IB_CM_REP_SENT:
case IB_CM_MRA_REP_RCVD:
case IB_CM_ESTABLISHED:
if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT) {
*qp_attr_mask = IB_QP_STATE | IB_QP_SQ_PSN;
qp_attr->sq_psn = be32_to_cpu(cm_id_priv->sq_psn);
switch (cm_id_priv->qp_type) {
case IB_QPT_RC:
case IB_QPT_XRC_INI:
*qp_attr_mask |= IB_QP_RETRY_CNT | IB_QP_RNR_RETRY |
IB_QP_MAX_QP_RD_ATOMIC;
qp_attr->retry_cnt = cm_id_priv->retry_count;
qp_attr->rnr_retry = cm_id_priv->rnr_retry_count;
qp_attr->max_rd_atomic = cm_id_priv->initiator_depth;
fallthrough;
case IB_QPT_XRC_TGT:
*qp_attr_mask |= IB_QP_TIMEOUT;
qp_attr->timeout = cm_id_priv->av.timeout;
break;
default:
break;
}
if (rdma_ah_get_dlid(&cm_id_priv->alt_av.ah_attr)) {
*qp_attr_mask |= IB_QP_PATH_MIG_STATE;
qp_attr->path_mig_state = IB_MIG_REARM;
}
} else {
*qp_attr_mask = IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE;
if (cm_id_priv->alt_av.port)
qp_attr->alt_port_num =
cm_id_priv->alt_av.port->port_num;
qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index;
qp_attr->alt_timeout = cm_id_priv->alt_av.timeout;
qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr;
qp_attr->path_mig_state = IB_MIG_REARM;
}
ret = 0;
break;
default:
trace_icm_qp_rts_err(&cm_id_priv->id);
ret = -EINVAL;
break;
}
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
return ret;
}
int ib_cm_init_qp_attr(struct ib_cm_id *cm_id,
struct ib_qp_attr *qp_attr,
int *qp_attr_mask)
{
struct cm_id_private *cm_id_priv;
int ret;
cm_id_priv = container_of(cm_id, struct cm_id_private, id);
switch (qp_attr->qp_state) {
case IB_QPS_INIT:
ret = cm_init_qp_init_attr(cm_id_priv, qp_attr, qp_attr_mask);
break;
case IB_QPS_RTR:
ret = cm_init_qp_rtr_attr(cm_id_priv, qp_attr, qp_attr_mask);
break;
case IB_QPS_RTS:
ret = cm_init_qp_rts_attr(cm_id_priv, qp_attr, qp_attr_mask);
break;
default:
ret = -EINVAL;
break;
}
return ret;
}
EXPORT_SYMBOL(ib_cm_init_qp_attr);
static ssize_t cm_show_counter(struct ib_device *ibdev, u32 port_num,
struct ib_port_attribute *attr, char *buf)
{
struct cm_counter_attribute *cm_attr =
container_of(attr, struct cm_counter_attribute, attr);
struct cm_device *cm_dev = ib_get_client_data(ibdev, &cm_client);
if (WARN_ON(!cm_dev))
return -EINVAL;
return sysfs_emit(
buf, "%ld\n",
atomic_long_read(
&cm_dev->port[port_num - 1]
->counters[cm_attr->group][cm_attr->index]));
}
#define CM_COUNTER_ATTR(_name, _group, _index) \
{ \
.attr = __ATTR(_name, 0444, cm_show_counter, NULL), \
.group = _group, .index = _index \
}
#define CM_COUNTER_GROUP(_group, _name) \
static struct cm_counter_attribute cm_counter_attr_##_group[] = { \
CM_COUNTER_ATTR(req, _group, CM_REQ_COUNTER), \
CM_COUNTER_ATTR(mra, _group, CM_MRA_COUNTER), \
CM_COUNTER_ATTR(rej, _group, CM_REJ_COUNTER), \
CM_COUNTER_ATTR(rep, _group, CM_REP_COUNTER), \
CM_COUNTER_ATTR(rtu, _group, CM_RTU_COUNTER), \
CM_COUNTER_ATTR(dreq, _group, CM_DREQ_COUNTER), \
CM_COUNTER_ATTR(drep, _group, CM_DREP_COUNTER), \
CM_COUNTER_ATTR(sidr_req, _group, CM_SIDR_REQ_COUNTER), \
CM_COUNTER_ATTR(sidr_rep, _group, CM_SIDR_REP_COUNTER), \
CM_COUNTER_ATTR(lap, _group, CM_LAP_COUNTER), \
CM_COUNTER_ATTR(apr, _group, CM_APR_COUNTER), \
}; \
static struct attribute *cm_counter_attrs_##_group[] = { \
&cm_counter_attr_##_group[0].attr.attr, \
&cm_counter_attr_##_group[1].attr.attr, \
&cm_counter_attr_##_group[2].attr.attr, \
&cm_counter_attr_##_group[3].attr.attr, \
&cm_counter_attr_##_group[4].attr.attr, \
&cm_counter_attr_##_group[5].attr.attr, \
&cm_counter_attr_##_group[6].attr.attr, \
&cm_counter_attr_##_group[7].attr.attr, \
&cm_counter_attr_##_group[8].attr.attr, \
&cm_counter_attr_##_group[9].attr.attr, \
&cm_counter_attr_##_group[10].attr.attr, \
NULL, \
}; \
static const struct attribute_group cm_counter_group_##_group = { \
.name = _name, \
.attrs = cm_counter_attrs_##_group, \
};
CM_COUNTER_GROUP(CM_XMIT, "cm_tx_msgs")
CM_COUNTER_GROUP(CM_XMIT_RETRIES, "cm_tx_retries")
CM_COUNTER_GROUP(CM_RECV, "cm_rx_msgs")
CM_COUNTER_GROUP(CM_RECV_DUPLICATES, "cm_rx_duplicates")
static const struct attribute_group *cm_counter_groups[] = {
&cm_counter_group_CM_XMIT,
&cm_counter_group_CM_XMIT_RETRIES,
&cm_counter_group_CM_RECV,
&cm_counter_group_CM_RECV_DUPLICATES,
NULL,
};
static int cm_add_one(struct ib_device *ib_device)
{
struct cm_device *cm_dev;
struct cm_port *port;
struct ib_mad_reg_req reg_req = {
.mgmt_class = IB_MGMT_CLASS_CM,
.mgmt_class_version = IB_CM_CLASS_VERSION,
};
struct ib_port_modify port_modify = {
.set_port_cap_mask = IB_PORT_CM_SUP
};
unsigned long flags;
int ret;
int count = 0;
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 15:04:20 +08:00
u32 i;
treewide: Use struct_size() for kmalloc()-family One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct foo { int stuff; void *entry[]; }; instance = kmalloc(sizeof(struct foo) + sizeof(void *) * count, GFP_KERNEL); Instead of leaving these open-coded and prone to type mistakes, we can now use the new struct_size() helper: instance = kmalloc(struct_size(instance, entry, count), GFP_KERNEL); This patch makes the changes for kmalloc()-family (and kvmalloc()-family) uses. It was done via automatic conversion with manual review for the "CHECKME" non-standard cases noted below, using the following Coccinelle script: // pkey_cache = kmalloc(sizeof *pkey_cache + tprops->pkey_tbl_len * // sizeof *pkey_cache->table, GFP_KERNEL); @@ identifier alloc =~ "kmalloc|kzalloc|kvmalloc|kvzalloc"; expression GFP; identifier VAR, ELEMENT; expression COUNT; @@ - alloc(sizeof(*VAR) + COUNT * sizeof(*VAR->ELEMENT), GFP) + alloc(struct_size(VAR, ELEMENT, COUNT), GFP) // mr = kzalloc(sizeof(*mr) + m * sizeof(mr->map[0]), GFP_KERNEL); @@ identifier alloc =~ "kmalloc|kzalloc|kvmalloc|kvzalloc"; expression GFP; identifier VAR, ELEMENT; expression COUNT; @@ - alloc(sizeof(*VAR) + COUNT * sizeof(VAR->ELEMENT[0]), GFP) + alloc(struct_size(VAR, ELEMENT, COUNT), GFP) // Same pattern, but can't trivially locate the trailing element name, // or variable name. @@ identifier alloc =~ "kmalloc|kzalloc|kvmalloc|kvzalloc"; expression GFP; expression SOMETHING, COUNT, ELEMENT; @@ - alloc(sizeof(SOMETHING) + COUNT * sizeof(ELEMENT), GFP) + alloc(CHECKME_struct_size(&SOMETHING, ELEMENT, COUNT), GFP) Signed-off-by: Kees Cook <keescook@chromium.org>
2018-05-09 04:45:50 +08:00
cm_dev = kzalloc(struct_size(cm_dev, port, ib_device->phys_port_cnt),
GFP_KERNEL);
if (!cm_dev)
return -ENOMEM;
kref_init(&cm_dev->kref);
spin_lock_init(&cm_dev->mad_agent_lock);
cm_dev->ib_device = ib_device;
cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay;
cm_dev->going_down = 0;
ib_set_client_data(ib_device, &cm_client, cm_dev);
set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask);
rdma_for_each_port (ib_device, i) {
if (!rdma_cap_ib_cm(ib_device, i))
continue;
port = kzalloc(sizeof *port, GFP_KERNEL);
if (!port) {
ret = -ENOMEM;
goto error1;
}
cm_dev->port[i-1] = port;
port->cm_dev = cm_dev;
port->port_num = i;
ret = ib_port_register_client_groups(ib_device, i,
cm_counter_groups);
if (ret)
goto error1;
port->mad_agent = ib_register_mad_agent(ib_device, i,
IB_QPT_GSI,
&reg_req,
0,
cm_send_handler,
cm_recv_handler,
port,
0);
if (IS_ERR(port->mad_agent)) {
ret = PTR_ERR(port->mad_agent);
goto error2;
}
ret = ib_modify_port(ib_device, i, 0, &port_modify);
if (ret)
goto error3;
count++;
}
if (!count) {
ret = -EOPNOTSUPP;
goto free;
}
write_lock_irqsave(&cm.device_lock, flags);
list_add_tail(&cm_dev->list, &cm.device_list);
write_unlock_irqrestore(&cm.device_lock, flags);
return 0;
error3:
ib_unregister_mad_agent(port->mad_agent);
error2:
ib_port_unregister_client_groups(ib_device, i, cm_counter_groups);
error1:
port_modify.set_port_cap_mask = 0;
port_modify.clr_port_cap_mask = IB_PORT_CM_SUP;
while (--i) {
if (!rdma_cap_ib_cm(ib_device, i))
continue;
port = cm_dev->port[i-1];
ib_modify_port(ib_device, port->port_num, 0, &port_modify);
ib_unregister_mad_agent(port->mad_agent);
ib_port_unregister_client_groups(ib_device, i,
cm_counter_groups);
}
free:
cm_device_put(cm_dev);
return ret;
}
static void cm_remove_one(struct ib_device *ib_device, void *client_data)
{
struct cm_device *cm_dev = client_data;
struct cm_port *port;
struct ib_port_modify port_modify = {
.clr_port_cap_mask = IB_PORT_CM_SUP
};
unsigned long flags;
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 15:04:20 +08:00
u32 i;
write_lock_irqsave(&cm.device_lock, flags);
list_del(&cm_dev->list);
write_unlock_irqrestore(&cm.device_lock, flags);
spin_lock_irq(&cm.lock);
cm_dev->going_down = 1;
spin_unlock_irq(&cm.lock);
rdma_for_each_port (ib_device, i) {
struct ib_mad_agent *mad_agent;
if (!rdma_cap_ib_cm(ib_device, i))
continue;
port = cm_dev->port[i-1];
mad_agent = port->mad_agent;
ib_modify_port(ib_device, port->port_num, 0, &port_modify);
/*
* We flush the queue here after the going_down set, this
* verify that no new works will be queued in the recv handler,
* after that we can call the unregister_mad_agent
*/
flush_workqueue(cm.wq);
/*
* The above ensures no call paths from the work are running,
* the remaining paths all take the mad_agent_lock.
*/
spin_lock(&cm_dev->mad_agent_lock);
port->mad_agent = NULL;
spin_unlock(&cm_dev->mad_agent_lock);
ib_unregister_mad_agent(mad_agent);
ib_port_unregister_client_groups(ib_device, i,
cm_counter_groups);
}
IB/cm: Mark stale CM id's whenever the mad agent was unregistered When there is a CM id object that has port assigned to it, it means that the cm-id asked for the specific port that it should go by it, but if that port was removed (hot-unplug event) the cm-id was not updated. In order to fix that the port keeps a list of all the cm-id's that are planning to go by it, whenever the port is removed it marks all of them as invalid. This commit fixes a kernel panic which happens when running traffic between guests and we force reboot a guest mid traffic, it triggers a kernel panic: Call Trace: [<ffffffff815271fa>] ? panic+0xa7/0x16f [<ffffffff8152b534>] ? oops_end+0xe4/0x100 [<ffffffff8104a00b>] ? no_context+0xfb/0x260 [<ffffffff81084db2>] ? del_timer_sync+0x22/0x30 [<ffffffff8104a295>] ? __bad_area_nosemaphore+0x125/0x1e0 [<ffffffff81084240>] ? process_timeout+0x0/0x10 [<ffffffff8104a363>] ? bad_area_nosemaphore+0x13/0x20 [<ffffffff8104aabf>] ? __do_page_fault+0x31f/0x480 [<ffffffff81065df0>] ? default_wake_function+0x0/0x20 [<ffffffffa0752675>] ? free_msg+0x55/0x70 [mlx5_core] [<ffffffffa0753434>] ? cmd_exec+0x124/0x840 [mlx5_core] [<ffffffff8105a924>] ? find_busiest_group+0x244/0x9f0 [<ffffffff8152d45e>] ? do_page_fault+0x3e/0xa0 [<ffffffff8152a815>] ? page_fault+0x25/0x30 [<ffffffffa024da25>] ? cm_alloc_msg+0x35/0xc0 [ib_cm] [<ffffffffa024e821>] ? ib_send_cm_dreq+0xb1/0x1e0 [ib_cm] [<ffffffffa024f836>] ? cm_destroy_id+0x176/0x320 [ib_cm] [<ffffffffa024fb00>] ? ib_destroy_cm_id+0x10/0x20 [ib_cm] [<ffffffffa034f527>] ? ipoib_cm_free_rx_reap_list+0xa7/0x110 [ib_ipoib] [<ffffffffa034f590>] ? ipoib_cm_rx_reap+0x0/0x20 [ib_ipoib] [<ffffffffa034f5a5>] ? ipoib_cm_rx_reap+0x15/0x20 [ib_ipoib] [<ffffffff81094d20>] ? worker_thread+0x170/0x2a0 [<ffffffff8109b2a0>] ? autoremove_wake_function+0x0/0x40 [<ffffffff81094bb0>] ? worker_thread+0x0/0x2a0 [<ffffffff8109aef6>] ? kthread+0x96/0xa0 [<ffffffff8100c20a>] ? child_rip+0xa/0x20 [<ffffffff8109ae60>] ? kthread+0x0/0xa0 [<ffffffff8100c200>] ? child_rip+0x0/0x20 Fixes: a977049dacde ("[PATCH] IB: Add the kernel CM implementation") Signed-off-by: Mark Bloch <markb@mellanox.com> Signed-off-by: Erez Shitrit <erezsh@mellanox.com> Reviewed-by: Maor Gottlieb <maorg@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Doug Ledford <dledford@redhat.com>
2016-10-27 21:36:27 +08:00
cm_device_put(cm_dev);
}
static int __init ib_cm_init(void)
{
int ret;
INIT_LIST_HEAD(&cm.device_list);
rwlock_init(&cm.device_lock);
spin_lock_init(&cm.lock);
cm.listen_service_table = RB_ROOT;
cm.listen_service_id = be64_to_cpu(IB_CM_ASSIGN_SERVICE_ID);
cm.remote_id_table = RB_ROOT;
cm.remote_qp_table = RB_ROOT;
cm.remote_sidr_table = RB_ROOT;
xa_init_flags(&cm.local_id_table, XA_FLAGS_ALLOC);
get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand);
INIT_LIST_HEAD(&cm.timewait_list);
cm.wq = alloc_workqueue("ib_cm", 0, 1);
if (!cm.wq) {
ret = -ENOMEM;
goto error2;
}
ret = ib_register_client(&cm_client);
if (ret)
goto error3;
return 0;
error3:
destroy_workqueue(cm.wq);
error2:
return ret;
}
static void __exit ib_cm_cleanup(void)
{
struct cm_timewait_info *timewait_info, *tmp;
spin_lock_irq(&cm.lock);
list_for_each_entry(timewait_info, &cm.timewait_list, list)
cancel_delayed_work(&timewait_info->work.work);
spin_unlock_irq(&cm.lock);
ib_unregister_client(&cm_client);
destroy_workqueue(cm.wq);
list_for_each_entry_safe(timewait_info, tmp, &cm.timewait_list, list) {
list_del(&timewait_info->list);
kfree(timewait_info);
}
WARN_ON(!xa_empty(&cm.local_id_table));
}
module_init(ib_cm_init);
module_exit(ib_cm_cleanup);