2019-05-19 20:08:20 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2017-01-09 23:55:13 +08:00
|
|
|
/*
|
|
|
|
* Shared Memory Communications over RDMA (SMC-R) and RoCE
|
|
|
|
*
|
|
|
|
* AF_SMC protocol family socket handler keeping the AF_INET sock address type
|
|
|
|
* applies to SOCK_STREAM sockets only
|
|
|
|
* offers an alternative communication option for TCP-protocol sockets
|
|
|
|
* applicable with RoCE-cards only
|
|
|
|
*
|
2017-01-09 23:55:16 +08:00
|
|
|
* Initial restrictions:
|
|
|
|
* - support for alternate links postponed
|
|
|
|
*
|
2018-03-16 22:06:41 +08:00
|
|
|
* Copyright IBM Corp. 2016, 2018
|
2017-01-09 23:55:13 +08:00
|
|
|
*
|
|
|
|
* Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
|
|
|
|
* based on prototype from Frank Blaschka
|
|
|
|
*/
|
|
|
|
|
|
|
|
#define KMSG_COMPONENT "smc"
|
|
|
|
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
|
|
|
|
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/socket.h>
|
2017-01-09 23:55:16 +08:00
|
|
|
#include <linux/workqueue.h>
|
2017-01-09 23:55:22 +08:00
|
|
|
#include <linux/in.h>
|
2017-02-02 15:35:14 +08:00
|
|
|
#include <linux/sched/signal.h>
|
2018-06-29 01:05:11 +08:00
|
|
|
#include <linux/if_vlan.h>
|
2019-11-17 00:47:31 +08:00
|
|
|
#include <linux/rcupdate_wait.h>
|
2020-09-26 18:44:31 +08:00
|
|
|
#include <linux/ctype.h>
|
2023-01-26 15:14:21 +08:00
|
|
|
#include <linux/splice.h>
|
2017-02-02 15:35:14 +08:00
|
|
|
|
2017-01-09 23:55:13 +08:00
|
|
|
#include <net/sock.h>
|
2017-01-09 23:55:16 +08:00
|
|
|
#include <net/tcp.h>
|
2017-01-09 23:55:26 +08:00
|
|
|
#include <net/smc.h>
|
2018-05-02 22:56:46 +08:00
|
|
|
#include <asm/ioctls.h>
|
2017-01-09 23:55:13 +08:00
|
|
|
|
2019-02-21 20:01:02 +08:00
|
|
|
#include <net/net_namespace.h>
|
|
|
|
#include <net/netns/generic.h>
|
|
|
|
#include "smc_netns.h"
|
|
|
|
|
2017-01-09 23:55:13 +08:00
|
|
|
#include "smc.h"
|
2017-01-09 23:55:16 +08:00
|
|
|
#include "smc_clc.h"
|
2017-01-09 23:55:21 +08:00
|
|
|
#include "smc_llc.h"
|
2017-01-09 23:55:22 +08:00
|
|
|
#include "smc_cdc.h"
|
2017-01-09 23:55:17 +08:00
|
|
|
#include "smc_core.h"
|
2017-01-09 23:55:14 +08:00
|
|
|
#include "smc_ib.h"
|
2018-06-29 01:05:11 +08:00
|
|
|
#include "smc_ism.h"
|
2017-01-09 23:55:15 +08:00
|
|
|
#include "smc_pnet.h"
|
2020-12-02 03:20:43 +08:00
|
|
|
#include "smc_netlink.h"
|
2017-01-09 23:55:23 +08:00
|
|
|
#include "smc_tx.h"
|
2017-01-09 23:55:24 +08:00
|
|
|
#include "smc_rx.h"
|
2017-01-09 23:55:25 +08:00
|
|
|
#include "smc_close.h"
|
2021-06-16 22:52:55 +08:00
|
|
|
#include "smc_stats.h"
|
2021-11-01 15:39:12 +08:00
|
|
|
#include "smc_tracepoint.h"
|
2022-03-01 17:43:56 +08:00
|
|
|
#include "smc_sysctl.h"
|
2017-01-09 23:55:13 +08:00
|
|
|
|
2019-02-07 22:56:18 +08:00
|
|
|
/* serialize link group creation on server */
static DEFINE_MUTEX(smc_server_lgr_pending);

/* serialize link group creation on client */
static DEFINE_MUTEX(smc_client_lgr_pending);
|
|
|
|
|
2022-02-10 17:11:34 +08:00
|
|
|
/* wq for tcp listen work (file-local) */
static struct workqueue_struct *smc_tcp_ls_wq;

/* wq for handshake work */
struct workqueue_struct *smc_hs_wq;

/* wq for close work */
struct workqueue_struct *smc_close_wq;
|
|
|
|
|
2017-01-09 23:55:16 +08:00
|
|
|
/* Work handlers defined further down in this file; they are declared here
 * because smc_sock_alloc() hands them to INIT_WORK().
 */
static void smc_tcp_listen_work(struct work_struct *work);
static void smc_connect_work(struct work_struct *work);
|
2017-01-09 23:55:16 +08:00
|
|
|
|
2022-02-10 17:11:38 +08:00
|
|
|
int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb)
|
|
|
|
{
|
|
|
|
struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
|
|
|
|
void *hdr;
|
|
|
|
|
|
|
|
if (cb_ctx->pos[0])
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
|
|
|
|
&smc_gen_nl_family, NLM_F_MULTI,
|
|
|
|
SMC_NETLINK_DUMP_HS_LIMITATION);
|
|
|
|
if (!hdr)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
if (nla_put_u8(skb, SMC_NLA_HS_LIMITATION_ENABLED,
|
|
|
|
sock_net(skb->sk)->smc.limit_smc_hs))
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
genlmsg_end(skb, hdr);
|
|
|
|
cb_ctx->pos[0] = 1;
|
|
|
|
out:
|
|
|
|
return skb->len;
|
|
|
|
err:
|
|
|
|
genlmsg_cancel(skb, hdr);
|
|
|
|
return -EMSGSIZE;
|
|
|
|
}
|
|
|
|
|
|
|
|
int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info)
|
|
|
|
{
|
|
|
|
sock_net(skb->sk)->smc.limit_smc_hs = true;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info)
|
|
|
|
{
|
|
|
|
sock_net(skb->sk)->smc.limit_smc_hs = false;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-01-09 23:55:13 +08:00
|
|
|
static void smc_set_keepalive(struct sock *sk, int val)
|
|
|
|
{
|
|
|
|
struct smc_sock *smc = smc_sk(sk);
|
|
|
|
|
|
|
|
smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
|
|
|
|
}
|
|
|
|
|
net/smc: Limit backlog connections
The current implementation does not handle backlog semantics. One
potential risk is that the server will be flooded by an unlimited number
of connections, even if the client is SMC-incapable.
This patch puts a limit on backlog connections. Following the TCP
implementation, we divide SMC connections into two categories:
1. Half SMC connections, which include all connections where TCP is
established but SMC is not yet.
2. Full SMC connections, which include all SMC-established connections.
For half SMC connections, since every half SMC connection starts with TCP
being established, we can achieve our goal by putting a limit in place
before TCP is established. As in the TCP implementation, this limit is
based not only on the half SMC connections but also on the full
connections, which also constrains full SMC connections.
For full SMC connections, although we know exactly where they start, it is
quite hard to put a limit before that point. The easiest way is to block
and wait before receiving the SMC confirm CLC message, but that happens
under the protection of smc_server_lgr_pending, a global lock, which would
apply the limit to the entire host instead of a single listen socket.
Another way is to drop the full connections, but considering the cost of
SMC connections, we prefer to keep full SMC connections.
Even so, a limit on full SMC connections still exists; see the commits
about half SMC connections below.
After this patch, the limits on backlog connections are as follows:
For SMC:
1. A client with SMC capability can make 2 * backlog full SMC connections,
or 1 * backlog half SMC connections and 1 * backlog full SMC
connections, at most.
2. A client without SMC capability can only make 1 * backlog half TCP
connections and 1 * backlog full TCP connections.
Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-10 17:11:35 +08:00
|
|
|
static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk,
|
|
|
|
struct sk_buff *skb,
|
|
|
|
struct request_sock *req,
|
|
|
|
struct dst_entry *dst,
|
|
|
|
struct request_sock *req_unhash,
|
|
|
|
bool *own_req)
|
|
|
|
{
|
|
|
|
struct smc_sock *smc;
|
2022-04-08 23:10:35 +08:00
|
|
|
struct sock *child;
|
net/smc: Limit backlog connections
Current implementation does not handling backlog semantics, one
potential risk is that server will be flooded by infinite amount
connections, even if client was SMC-incapable.
This patch works to put a limit on backlog connections, referring to the
TCP implementation, we divides SMC connections into two categories:
1. Half SMC connection, which includes all TCP established while SMC not
connections.
2. Full SMC connection, which includes all SMC established connections.
For half SMC connection, since all half SMC connections starts with TCP
established, we can achieve our goal by put a limit before TCP
established. Refer to the implementation of TCP, this limits will based
on not only the half SMC connections but also the full connections,
which is also a constraint on full SMC connections.
For full SMC connections, although we know exactly where it starts, it's
quite hard to put a limit before it. The easiest way is to block wait
before receive SMC confirm CLC message, while it's under protection by
smc_server_lgr_pending, a global lock, which leads this limit to the
entire host instead of a single listen socket. Another way is to drop
the full connections, but considering the cast of SMC connections, we
prefer to keep full SMC connections.
Even so, the limits of full SMC connections still exists, see commits
about half SMC connection below.
After this patch, the limits of backend connection shows like:
For SMC:
1. Client with SMC-capability can makes 2 * backlog full SMC connections
or 1 * backlog half SMC connections and 1 * backlog full SMC
connections at most.
2. Client without SMC-capability can only makes 1 * backlog half TCP
connections and 1 * backlog full TCP connections.
Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-10 17:11:35 +08:00
|
|
|
|
|
|
|
smc = smc_clcsock_user_data(sk);
|
|
|
|
|
|
|
|
if (READ_ONCE(sk->sk_ack_backlog) + atomic_read(&smc->queued_smc_hs) >
|
|
|
|
sk->sk_max_ack_backlog)
|
|
|
|
goto drop;
|
|
|
|
|
|
|
|
if (sk_acceptq_is_full(&smc->sk)) {
|
|
|
|
NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
|
|
|
|
goto drop;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* passthrough to original syn recv sock fct */
|
2022-04-08 23:10:35 +08:00
|
|
|
child = smc->ori_af_ops->syn_recv_sock(sk, skb, req, dst, req_unhash,
|
|
|
|
own_req);
|
|
|
|
/* child must not inherit smc or its ops */
|
|
|
|
if (child) {
|
|
|
|
rcu_assign_sk_user_data(child, NULL);
|
|
|
|
|
|
|
|
/* v4-mapped sockets don't inherit parent ops. Don't restore. */
|
|
|
|
if (inet_csk(child)->icsk_af_ops == inet_csk(sk)->icsk_af_ops)
|
|
|
|
inet_csk(child)->icsk_af_ops = smc->ori_af_ops;
|
|
|
|
}
|
|
|
|
return child;
|
net/smc: Limit backlog connections
Current implementation does not handling backlog semantics, one
potential risk is that server will be flooded by infinite amount
connections, even if client was SMC-incapable.
This patch works to put a limit on backlog connections, referring to the
TCP implementation, we divides SMC connections into two categories:
1. Half SMC connection, which includes all TCP established while SMC not
connections.
2. Full SMC connection, which includes all SMC established connections.
For half SMC connection, since all half SMC connections starts with TCP
established, we can achieve our goal by put a limit before TCP
established. Refer to the implementation of TCP, this limits will based
on not only the half SMC connections but also the full connections,
which is also a constraint on full SMC connections.
For full SMC connections, although we know exactly where it starts, it's
quite hard to put a limit before it. The easiest way is to block wait
before receive SMC confirm CLC message, while it's under protection by
smc_server_lgr_pending, a global lock, which leads this limit to the
entire host instead of a single listen socket. Another way is to drop
the full connections, but considering the cast of SMC connections, we
prefer to keep full SMC connections.
Even so, the limits of full SMC connections still exists, see commits
about half SMC connection below.
After this patch, the limits of backend connection shows like:
For SMC:
1. Client with SMC-capability can makes 2 * backlog full SMC connections
or 1 * backlog half SMC connections and 1 * backlog full SMC
connections at most.
2. Client without SMC-capability can only makes 1 * backlog half TCP
connections and 1 * backlog full TCP connections.
Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-10 17:11:35 +08:00
|
|
|
|
|
|
|
drop:
|
|
|
|
dst_release(dst);
|
|
|
|
tcp_listendrop(sk);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
net/smc: Limit SMC visits when handshake workqueue congested
This patch provides a mechanism to constrain SMC connection attempts
according to the pressure on the SMC handshake process.
At present, frequent connection attempts cause incoming connections to be
backlogged in the SMC handshake queue, which increases the connection
establishment time. That is quite unacceptable for applications that rely
on short-lived connections.
There are two ways to implement this mechanism:
1. Apply the limitation after TCP is established.
2. Apply the limitation before TCP is established.
In the first way, we need to wait for and receive the CLC messages that the
client may send, and then actively reply with a decline message. In a
sense this is also a kind of SMC handshake, and it affects the connection
establishment time as well.
In the second way, the only problem is that we need to inject SMC logic
into TCP when it is about to reply to the incoming SYN. Since we already do
that, it no longer seems to be a problem. The advantage is obvious: few
additional steps are required to enforce the constraint.
This patch uses the second way. After this patch, connections beyond the
constraint will not be given any SMC indication, and SMC will not be
involved in any of their subsequent processing.
Link: https://lore.kernel.org/all/1641301961-59331-1-git-send-email-alibuda@linux.alibaba.com/
Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-10 17:11:36 +08:00
|
|
|
static bool smc_hs_congested(const struct sock *sk)
|
|
|
|
{
|
|
|
|
const struct smc_sock *smc;
|
|
|
|
|
|
|
|
smc = smc_clcsock_user_data(sk);
|
|
|
|
|
|
|
|
if (!smc)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
if (workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2017-01-09 23:55:26 +08:00
|
|
|
static struct smc_hashinfo smc_v4_hashinfo = {
|
|
|
|
.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
|
|
|
|
};
|
|
|
|
|
2018-03-16 22:06:41 +08:00
|
|
|
static struct smc_hashinfo smc_v6_hashinfo = {
|
|
|
|
.lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
|
|
|
|
};
|
|
|
|
|
2017-01-09 23:55:26 +08:00
|
|
|
int smc_hash_sk(struct sock *sk)
|
|
|
|
{
|
|
|
|
struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
|
|
|
|
struct hlist_head *head;
|
|
|
|
|
|
|
|
head = &h->ht;
|
|
|
|
|
|
|
|
write_lock_bh(&h->lock);
|
|
|
|
sk_add_node(sk, head);
|
|
|
|
write_unlock_bh(&h->lock);
|
2021-11-16 01:11:50 +08:00
|
|
|
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
|
2017-01-09 23:55:26 +08:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(smc_hash_sk);
|
|
|
|
|
|
|
|
void smc_unhash_sk(struct sock *sk)
|
|
|
|
{
|
|
|
|
struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
|
|
|
|
|
|
|
|
write_lock_bh(&h->lock);
|
|
|
|
if (sk_del_node_init(sk))
|
|
|
|
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
|
|
|
|
write_unlock_bh(&h->lock);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(smc_unhash_sk);
|
|
|
|
|
2022-03-01 17:44:02 +08:00
|
|
|
/* This will be called before user really release sock_lock. So do the
|
|
|
|
* work which we didn't do because of user hold the sock_lock in the
|
|
|
|
* BH context
|
|
|
|
*/
|
|
|
|
static void smc_release_cb(struct sock *sk)
|
|
|
|
{
|
|
|
|
struct smc_sock *smc = smc_sk(sk);
|
|
|
|
|
|
|
|
if (smc->conn.tx_in_release_sock) {
|
|
|
|
smc_tx_pending(&smc->conn);
|
|
|
|
smc->conn.tx_in_release_sock = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-01-09 23:55:26 +08:00
|
|
|
struct proto smc_proto = {
|
2017-01-09 23:55:13 +08:00
|
|
|
.name = "SMC",
|
|
|
|
.owner = THIS_MODULE,
|
|
|
|
.keepalive = smc_set_keepalive,
|
2017-01-09 23:55:26 +08:00
|
|
|
.hash = smc_hash_sk,
|
|
|
|
.unhash = smc_unhash_sk,
|
2022-03-01 17:44:02 +08:00
|
|
|
.release_cb = smc_release_cb,
|
2017-01-09 23:55:13 +08:00
|
|
|
.obj_size = sizeof(struct smc_sock),
|
2017-01-09 23:55:26 +08:00
|
|
|
.h.smc_hash = &smc_v4_hashinfo,
|
2017-01-18 18:53:44 +08:00
|
|
|
.slab_flags = SLAB_TYPESAFE_BY_RCU,
|
2017-01-09 23:55:13 +08:00
|
|
|
};
|
2017-01-09 23:55:26 +08:00
|
|
|
EXPORT_SYMBOL_GPL(smc_proto);
|
2017-01-09 23:55:13 +08:00
|
|
|
|
2018-03-16 22:06:41 +08:00
|
|
|
struct proto smc_proto6 = {
|
|
|
|
.name = "SMC6",
|
|
|
|
.owner = THIS_MODULE,
|
|
|
|
.keepalive = smc_set_keepalive,
|
|
|
|
.hash = smc_hash_sk,
|
|
|
|
.unhash = smc_unhash_sk,
|
2022-03-01 17:44:02 +08:00
|
|
|
.release_cb = smc_release_cb,
|
2018-03-16 22:06:41 +08:00
|
|
|
.obj_size = sizeof(struct smc_sock),
|
|
|
|
.h.smc_hash = &smc_v6_hashinfo,
|
|
|
|
.slab_flags = SLAB_TYPESAFE_BY_RCU,
|
|
|
|
};
|
|
|
|
EXPORT_SYMBOL_GPL(smc_proto6);
|
|
|
|
|
2022-04-22 15:56:19 +08:00
|
|
|
static void smc_fback_restore_callbacks(struct smc_sock *smc)
|
|
|
|
{
|
|
|
|
struct sock *clcsk = smc->clcsock->sk;
|
|
|
|
|
|
|
|
write_lock_bh(&clcsk->sk_callback_lock);
|
|
|
|
clcsk->sk_user_data = NULL;
|
|
|
|
|
|
|
|
smc_clcsock_restore_cb(&clcsk->sk_state_change, &smc->clcsk_state_change);
|
|
|
|
smc_clcsock_restore_cb(&clcsk->sk_data_ready, &smc->clcsk_data_ready);
|
|
|
|
smc_clcsock_restore_cb(&clcsk->sk_write_space, &smc->clcsk_write_space);
|
|
|
|
smc_clcsock_restore_cb(&clcsk->sk_error_report, &smc->clcsk_error_report);
|
|
|
|
|
|
|
|
write_unlock_bh(&clcsk->sk_callback_lock);
|
|
|
|
}
|
|
|
|
|
2019-10-23 21:44:05 +08:00
|
|
|
static void smc_restore_fallback_changes(struct smc_sock *smc)
|
|
|
|
{
|
2020-07-18 21:06:18 +08:00
|
|
|
if (smc->clcsock->file) { /* non-accepted sockets have no file yet */
|
|
|
|
smc->clcsock->file->private_data = smc->sk.sk_socket;
|
|
|
|
smc->clcsock->file = NULL;
|
2022-04-22 15:56:19 +08:00
|
|
|
smc_fback_restore_callbacks(smc);
|
2020-07-18 21:06:18 +08:00
|
|
|
}
|
2019-10-23 21:44:05 +08:00
|
|
|
}
|
|
|
|
|
2019-06-27 21:04:52 +08:00
|
|
|
static int __smc_release(struct smc_sock *smc)
|
2017-01-09 23:55:13 +08:00
|
|
|
{
|
2019-06-27 21:04:52 +08:00
|
|
|
struct sock *sk = &smc->sk;
|
2017-01-09 23:55:25 +08:00
|
|
|
int rc = 0;
|
2017-01-09 23:55:13 +08:00
|
|
|
|
2018-01-26 16:28:48 +08:00
|
|
|
if (!smc->use_fallback) {
|
2017-01-09 23:55:25 +08:00
|
|
|
rc = smc_close_active(smc);
|
2023-11-03 14:07:38 +08:00
|
|
|
smc_sock_set_flag(sk, SOCK_DEAD);
|
2017-01-09 23:55:25 +08:00
|
|
|
sk->sk_shutdown |= SHUTDOWN_MASK;
|
2019-02-07 22:56:15 +08:00
|
|
|
} else {
|
2021-11-10 15:02:34 +08:00
|
|
|
if (sk->sk_state != SMC_CLOSED) {
|
|
|
|
if (sk->sk_state != SMC_LISTEN &&
|
|
|
|
sk->sk_state != SMC_INIT)
|
|
|
|
sock_put(sk); /* passive closing */
|
|
|
|
if (sk->sk_state == SMC_LISTEN) {
|
|
|
|
/* wake up clcsock accept */
|
|
|
|
rc = kernel_sock_shutdown(smc->clcsock,
|
|
|
|
SHUT_RDWR);
|
|
|
|
}
|
|
|
|
sk->sk_state = SMC_CLOSED;
|
|
|
|
sk->sk_state_change(sk);
|
2018-12-19 01:02:25 +08:00
|
|
|
}
|
2019-10-23 21:44:05 +08:00
|
|
|
smc_restore_fallback_changes(smc);
|
2018-01-26 16:28:48 +08:00
|
|
|
}
|
2017-01-09 23:55:13 +08:00
|
|
|
|
2019-02-07 22:56:15 +08:00
|
|
|
sk->sk_prot->unhash(sk);
|
|
|
|
|
|
|
|
if (sk->sk_state == SMC_CLOSED) {
|
|
|
|
if (smc->clcsock) {
|
2019-04-11 17:17:30 +08:00
|
|
|
release_sock(sk);
|
|
|
|
smc_clcsock_release(smc);
|
|
|
|
lock_sock(sk);
|
2019-02-07 22:56:15 +08:00
|
|
|
}
|
|
|
|
if (!smc->use_fallback)
|
|
|
|
smc_conn_free(&smc->conn);
|
|
|
|
}
|
|
|
|
|
2019-06-27 21:04:52 +08:00
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int smc_release(struct socket *sock)
|
|
|
|
{
|
|
|
|
struct sock *sk = sock->sk;
|
|
|
|
struct smc_sock *smc;
|
net/smc: fix connection leak
There's a potential leak issue under following execution sequence :
smc_release smc_connect_work
if (sk->sk_state == SMC_INIT)
send_clc_confirim
tcp_abort();
...
sk.sk_state = SMC_ACTIVE
smc_close_active
switch(sk->sk_state) {
...
case SMC_ACTIVE:
smc_close_final()
// then wait peer closed
Unfortunately, tcp_abort() may discard CLC CONFIRM messages that are
still in the tcp send buffer, in which case our connection token cannot
be delivered to the server side, which means that we cannot get a
passive close message at all. Therefore, it is impossible for the to be
disconnected at all.
This patch tries a very simple way to avoid this issue, once the state
has changed to SMC_ACTIVE after tcp_abort(), we can actively abort the
smc connection, considering that the state is SMC_INIT before
tcp_abort(), abandoning the complete disconnection process should not
cause too much problem.
In fact, this problem may exist as long as the CLC CONFIRM message is
not received by the server. Whether a timer should be added after
smc_close_final() needs to be discussed in the future. But even so, this
patch provides a faster release for connection in above case, it should
also be valuable.
Fixes: 39f41f367b08 ("net/smc: common release code for non-accepted sockets")
Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
Acked-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-24 23:26:19 +08:00
|
|
|
int old_state, rc = 0;
|
2019-06-27 21:04:52 +08:00
|
|
|
|
|
|
|
if (!sk)
|
|
|
|
goto out;
|
|
|
|
|
2019-10-21 22:13:15 +08:00
|
|
|
sock_hold(sk); /* sock_put below */
|
2019-06-27 21:04:52 +08:00
|
|
|
smc = smc_sk(sk);
|
|
|
|
|
net/smc: fix connection leak
There's a potential leak issue under following execution sequence :
smc_release smc_connect_work
if (sk->sk_state == SMC_INIT)
send_clc_confirim
tcp_abort();
...
sk.sk_state = SMC_ACTIVE
smc_close_active
switch(sk->sk_state) {
...
case SMC_ACTIVE:
smc_close_final()
// then wait peer closed
Unfortunately, tcp_abort() may discard CLC CONFIRM messages that are
still in the tcp send buffer, in which case our connection token cannot
be delivered to the server side, which means that we cannot get a
passive close message at all. Therefore, it is impossible for the to be
disconnected at all.
This patch tries a very simple way to avoid this issue, once the state
has changed to SMC_ACTIVE after tcp_abort(), we can actively abort the
smc connection, considering that the state is SMC_INIT before
tcp_abort(), abandoning the complete disconnection process should not
cause too much problem.
In fact, this problem may exist as long as the CLC CONFIRM message is
not received by the server. Whether a timer should be added after
smc_close_final() needs to be discussed in the future. But even so, this
patch provides a faster release for connection in above case, it should
also be valuable.
Fixes: 39f41f367b08 ("net/smc: common release code for non-accepted sockets")
Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
Acked-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-24 23:26:19 +08:00
|
|
|
old_state = sk->sk_state;
|
|
|
|
|
2019-06-27 21:04:52 +08:00
|
|
|
/* cleanup for a dangling non-blocking connect */
|
net/smc: fix connection leak
There's a potential leak issue under following execution sequence :
smc_release smc_connect_work
if (sk->sk_state == SMC_INIT)
send_clc_confirim
tcp_abort();
...
sk.sk_state = SMC_ACTIVE
smc_close_active
switch(sk->sk_state) {
...
case SMC_ACTIVE:
smc_close_final()
// then wait peer closed
Unfortunately, tcp_abort() may discard CLC CONFIRM messages that are
still in the tcp send buffer, in which case our connection token cannot
be delivered to the server side, which means that we cannot get a
passive close message at all. Therefore, it is impossible for the to be
disconnected at all.
This patch tries a very simple way to avoid this issue, once the state
has changed to SMC_ACTIVE after tcp_abort(), we can actively abort the
smc connection, considering that the state is SMC_INIT before
tcp_abort(), abandoning the complete disconnection process should not
cause too much problem.
In fact, this problem may exist as long as the CLC CONFIRM message is
not received by the server. Whether a timer should be added after
smc_close_final() needs to be discussed in the future. But even so, this
patch provides a faster release for connection in above case, it should
also be valuable.
Fixes: 39f41f367b08 ("net/smc: common release code for non-accepted sockets")
Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
Acked-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-24 23:26:19 +08:00
|
|
|
if (smc->connect_nonblock && old_state == SMC_INIT)
|
2019-06-27 21:04:52 +08:00
|
|
|
tcp_abort(smc->clcsock->sk, ECONNABORTED);
|
net/smc: Prevent smc_release() from long blocking
In nginx/wrk benchmark, there's a hung problem with high probability
on case likes that: (client will last several minutes to exit)
server: smc_run nginx
client: smc_run wrk -c 10000 -t 1 http://server
Client hangs with the following backtrace:
0 [ffffa7ce8Of3bbf8] __schedule at ffffffff9f9eOd5f
1 [ffffa7ce8Of3bc88] schedule at ffffffff9f9eløe6
2 [ffffa7ce8Of3bcaO] schedule_timeout at ffffffff9f9e3f3c
3 [ffffa7ce8Of3bd2O] wait_for_common at ffffffff9f9el9de
4 [ffffa7ce8Of3bd8O] __flush_work at ffffffff9fOfeOl3
5 [ffffa7ce8øf3bdfO] smc_release at ffffffffcO697d24 [smc]
6 [ffffa7ce8Of3be2O] __sock_release at ffffffff9f8O2e2d
7 [ffffa7ce8Of3be4ø] sock_close at ffffffff9f8ø2ebl
8 [ffffa7ce8øf3be48] __fput at ffffffff9f334f93
9 [ffffa7ce8Of3be78] task_work_run at ffffffff9flOlff5
10 [ffffa7ce8Of3beaO] do_exit at ffffffff9fOe5Ol2
11 [ffffa7ce8Of3bflO] do_group_exit at ffffffff9fOe592a
12 [ffffa7ce8Of3bf38] __x64_sys_exit_group at ffffffff9fOe5994
13 [ffffa7ce8Of3bf4O] do_syscall_64 at ffffffff9f9d4373
14 [ffffa7ce8Of3bfsO] entry_SYSCALL_64_after_hwframe at ffffffff9fa0007c
This issue dues to flush_work(), which is used to wait for
smc_connect_work() to finish in smc_release(). Once lots of
smc_connect_work() was pending or all executing work dangling,
smc_release() has to block until one worker comes to free, which
is equivalent to wait another smc_connnect_work() to finish.
In order to fix this, There are two changes:
1. For those idle smc_connect_work(), cancel it from the workqueue; for
executing smc_connect_work(), waiting for it to finish. For that
purpose, replace flush_work() with cancel_work_sync().
2. Since smc_connect() hold a reference for passive closing, if
smc_connect_work() has been cancelled, release the reference.
Fixes: 24ac3a08e658 ("net/smc: rebuild nonblocking connect")
Reported-by: Tony Lu <tonylu@linux.alibaba.com>
Tested-by: Dust Li <dust.li@linux.alibaba.com>
Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
Acked-by: Karsten Graul <kgraul@linux.ibm.com>
Link: https://lore.kernel.org/r/1639571361-101128-1-git-send-email-alibuda@linux.alibaba.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-12-15 20:29:21 +08:00
|
|
|
|
|
|
|
if (cancel_work_sync(&smc->connect_work))
|
|
|
|
sock_put(&smc->sk); /* sock_hold in smc_connect for passive closing */
|
2019-06-27 21:04:52 +08:00
|
|
|
|
|
|
|
if (sk->sk_state == SMC_LISTEN)
|
|
|
|
/* smc_close_non_accepted() is called and acquires
|
|
|
|
* sock lock for child sockets again
|
|
|
|
*/
|
|
|
|
lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
|
|
|
|
else
|
|
|
|
lock_sock(sk);
|
|
|
|
|
net/smc: fix connection leak
There's a potential leak issue under following execution sequence :
smc_release smc_connect_work
if (sk->sk_state == SMC_INIT)
send_clc_confirim
tcp_abort();
...
sk.sk_state = SMC_ACTIVE
smc_close_active
switch(sk->sk_state) {
...
case SMC_ACTIVE:
smc_close_final()
// then wait peer closed
Unfortunately, tcp_abort() may discard CLC CONFIRM messages that are
still in the tcp send buffer, in which case our connection token cannot
be delivered to the server side, which means that we cannot get a
passive close message at all. Therefore, it is impossible for the to be
disconnected at all.
This patch tries a very simple way to avoid this issue, once the state
has changed to SMC_ACTIVE after tcp_abort(), we can actively abort the
smc connection, considering that the state is SMC_INIT before
tcp_abort(), abandoning the complete disconnection process should not
cause too much problem.
In fact, this problem may exist as long as the CLC CONFIRM message is
not received by the server. Whether a timer should be added after
smc_close_final() needs to be discussed in the future. But even so, this
patch provides a faster release for connection in above case, it should
also be valuable.
Fixes: 39f41f367b08 ("net/smc: common release code for non-accepted sockets")
Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
Acked-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-24 23:26:19 +08:00
|
|
|
if (old_state == SMC_INIT && sk->sk_state == SMC_ACTIVE &&
|
|
|
|
!smc->use_fallback)
|
|
|
|
smc_close_active_abort(smc);
|
|
|
|
|
2019-06-27 21:04:52 +08:00
|
|
|
rc = __smc_release(smc);
|
|
|
|
|
2017-01-09 23:55:13 +08:00
|
|
|
/* detach socket */
|
|
|
|
sock_orphan(sk);
|
|
|
|
sock->sk = NULL;
|
|
|
|
release_sock(sk);
|
|
|
|
|
2019-10-21 22:13:15 +08:00
|
|
|
sock_put(sk); /* sock_hold above */
|
2018-01-26 16:28:48 +08:00
|
|
|
sock_put(sk); /* final sock_put */
|
2017-01-09 23:55:13 +08:00
|
|
|
out:
|
2017-01-09 23:55:25 +08:00
|
|
|
return rc;
|
2017-01-09 23:55:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void smc_destruct(struct sock *sk)
|
|
|
|
{
|
|
|
|
if (sk->sk_state != SMC_CLOSED)
|
|
|
|
return;
|
|
|
|
if (!sock_flag(sk, SOCK_DEAD))
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2018-03-16 22:06:41 +08:00
|
|
|
static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
|
|
|
|
int protocol)
|
2017-01-09 23:55:13 +08:00
|
|
|
{
|
|
|
|
struct smc_sock *smc;
|
2018-03-16 22:06:41 +08:00
|
|
|
struct proto *prot;
|
2017-01-09 23:55:13 +08:00
|
|
|
struct sock *sk;
|
|
|
|
|
2018-03-16 22:06:41 +08:00
|
|
|
prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
|
|
|
|
sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
|
2017-01-09 23:55:13 +08:00
|
|
|
if (!sk)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
|
|
|
|
sk->sk_state = SMC_INIT;
|
|
|
|
sk->sk_destruct = smc_destruct;
|
2018-03-16 22:06:41 +08:00
|
|
|
sk->sk_protocol = protocol;
|
2023-08-05 01:06:23 +08:00
|
|
|
WRITE_ONCE(sk->sk_sndbuf, 2 * READ_ONCE(net->smc.sysctl_wmem));
|
|
|
|
WRITE_ONCE(sk->sk_rcvbuf, 2 * READ_ONCE(net->smc.sysctl_rmem));
|
2017-01-09 23:55:13 +08:00
|
|
|
smc = smc_sk(sk);
|
2017-01-09 23:55:16 +08:00
|
|
|
INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
|
2018-06-27 23:59:50 +08:00
|
|
|
INIT_WORK(&smc->connect_work, smc_connect_work);
|
net/smc: init conn.tx_work & conn.send_lock sooner
syzkaller found that following program crashes the host :
{
int fd = socket(AF_SMC, SOCK_STREAM, 0);
int val = 1;
listen(fd, 0);
shutdown(fd, SHUT_RDWR);
setsockopt(fd, 6, TCP_NODELAY, &val, 4);
}
Simply initialize conn.tx_work & conn.send_lock at socket creation,
rather than deeper in the stack.
ODEBUG: assert_init not available (active state 0) object type: timer_list hint: (null)
WARNING: CPU: 1 PID: 13988 at lib/debugobjects.c:329 debug_print_object+0x16a/0x210 lib/debugobjects.c:326
Kernel panic - not syncing: panic_on_warn set ...
CPU: 1 PID: 13988 Comm: syz-executor0 Not tainted 4.17.0-rc4+ #46
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x1b9/0x294 lib/dump_stack.c:113
panic+0x22f/0x4de kernel/panic.c:184
__warn.cold.8+0x163/0x1b3 kernel/panic.c:536
report_bug+0x252/0x2d0 lib/bug.c:186
fixup_bug arch/x86/kernel/traps.c:178 [inline]
do_error_trap+0x1de/0x490 arch/x86/kernel/traps.c:296
do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:315
invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:992
RIP: 0010:debug_print_object+0x16a/0x210 lib/debugobjects.c:326
RSP: 0018:ffff880197a37880 EFLAGS: 00010086
RAX: 0000000000000061 RBX: 0000000000000005 RCX: ffffc90001ed0000
RDX: 0000000000004aaf RSI: ffffffff8160f6f1 RDI: 0000000000000001
RBP: ffff880197a378c0 R08: ffff8801aa7a0080 R09: ffffed003b5e3eb2
R10: ffffed003b5e3eb2 R11: ffff8801daf1f597 R12: 0000000000000001
R13: ffffffff88d96980 R14: ffffffff87fa19a0 R15: ffffffff81666ec0
debug_object_assert_init+0x309/0x500 lib/debugobjects.c:692
debug_timer_assert_init kernel/time/timer.c:724 [inline]
debug_assert_init kernel/time/timer.c:776 [inline]
del_timer+0x74/0x140 kernel/time/timer.c:1198
try_to_grab_pending+0x439/0x9a0 kernel/workqueue.c:1223
mod_delayed_work_on+0x91/0x250 kernel/workqueue.c:1592
mod_delayed_work include/linux/workqueue.h:541 [inline]
smc_setsockopt+0x387/0x6d0 net/smc/af_smc.c:1367
__sys_setsockopt+0x1bd/0x390 net/socket.c:1903
__do_sys_setsockopt net/socket.c:1914 [inline]
__se_sys_setsockopt net/socket.c:1911 [inline]
__x64_sys_setsockopt+0xbe/0x150 net/socket.c:1911
do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287
entry_SYSCALL_64_after_hwframe+0x49/0xbe
Fixes: 01d2f7e2cdd3 ("net/smc: sockopts TCP_NODELAY and TCP_CORK")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ursula Braun <ubraun@linux.ibm.com>
Cc: linux-s390@vger.kernel.org
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-05-17 18:54:21 +08:00
|
|
|
INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
|
2017-01-09 23:55:16 +08:00
|
|
|
INIT_LIST_HEAD(&smc->accept_q);
|
|
|
|
spin_lock_init(&smc->accept_q_lock);
|
net/smc: init conn.tx_work & conn.send_lock sooner
syzkaller found that following program crashes the host :
{
int fd = socket(AF_SMC, SOCK_STREAM, 0);
int val = 1;
listen(fd, 0);
shutdown(fd, SHUT_RDWR);
setsockopt(fd, 6, TCP_NODELAY, &val, 4);
}
Simply initialize conn.tx_work & conn.send_lock at socket creation,
rather than deeper in the stack.
ODEBUG: assert_init not available (active state 0) object type: timer_list hint: (null)
WARNING: CPU: 1 PID: 13988 at lib/debugobjects.c:329 debug_print_object+0x16a/0x210 lib/debugobjects.c:326
Kernel panic - not syncing: panic_on_warn set ...
CPU: 1 PID: 13988 Comm: syz-executor0 Not tainted 4.17.0-rc4+ #46
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x1b9/0x294 lib/dump_stack.c:113
panic+0x22f/0x4de kernel/panic.c:184
__warn.cold.8+0x163/0x1b3 kernel/panic.c:536
report_bug+0x252/0x2d0 lib/bug.c:186
fixup_bug arch/x86/kernel/traps.c:178 [inline]
do_error_trap+0x1de/0x490 arch/x86/kernel/traps.c:296
do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:315
invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:992
RIP: 0010:debug_print_object+0x16a/0x210 lib/debugobjects.c:326
RSP: 0018:ffff880197a37880 EFLAGS: 00010086
RAX: 0000000000000061 RBX: 0000000000000005 RCX: ffffc90001ed0000
RDX: 0000000000004aaf RSI: ffffffff8160f6f1 RDI: 0000000000000001
RBP: ffff880197a378c0 R08: ffff8801aa7a0080 R09: ffffed003b5e3eb2
R10: ffffed003b5e3eb2 R11: ffff8801daf1f597 R12: 0000000000000001
R13: ffffffff88d96980 R14: ffffffff87fa19a0 R15: ffffffff81666ec0
debug_object_assert_init+0x309/0x500 lib/debugobjects.c:692
debug_timer_assert_init kernel/time/timer.c:724 [inline]
debug_assert_init kernel/time/timer.c:776 [inline]
del_timer+0x74/0x140 kernel/time/timer.c:1198
try_to_grab_pending+0x439/0x9a0 kernel/workqueue.c:1223
mod_delayed_work_on+0x91/0x250 kernel/workqueue.c:1592
mod_delayed_work include/linux/workqueue.h:541 [inline]
smc_setsockopt+0x387/0x6d0 net/smc/af_smc.c:1367
__sys_setsockopt+0x1bd/0x390 net/socket.c:1903
__do_sys_setsockopt net/socket.c:1914 [inline]
__se_sys_setsockopt net/socket.c:1911 [inline]
__x64_sys_setsockopt+0xbe/0x150 net/socket.c:1911
do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287
entry_SYSCALL_64_after_hwframe+0x49/0xbe
Fixes: 01d2f7e2cdd3 ("net/smc: sockopts TCP_NODELAY and TCP_CORK")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ursula Braun <ubraun@linux.ibm.com>
Cc: linux-s390@vger.kernel.org
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-05-17 18:54:21 +08:00
|
|
|
spin_lock_init(&smc->conn.send_lock);
|
2017-01-09 23:55:26 +08:00
|
|
|
sk->sk_prot->hash(sk);
|
2018-12-19 01:02:25 +08:00
|
|
|
mutex_init(&smc->clcsock_release_lock);
|
2022-04-22 15:56:18 +08:00
|
|
|
smc_init_saved_callbacks(smc);
|
2017-01-09 23:55:13 +08:00
|
|
|
|
|
|
|
return sk;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
|
|
|
|
int addr_len)
|
|
|
|
{
|
|
|
|
struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
|
|
|
|
struct sock *sk = sock->sk;
|
|
|
|
struct smc_sock *smc;
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
smc = smc_sk(sk);
|
|
|
|
|
|
|
|
/* replicate tests from inet_bind(), to be safe wrt. future changes */
|
|
|
|
rc = -EINVAL;
|
|
|
|
if (addr_len < sizeof(struct sockaddr_in))
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
rc = -EAFNOSUPPORT;
|
2018-03-16 22:06:41 +08:00
|
|
|
if (addr->sin_family != AF_INET &&
|
|
|
|
addr->sin_family != AF_INET6 &&
|
|
|
|
addr->sin_family != AF_UNSPEC)
|
|
|
|
goto out;
|
2017-01-09 23:55:13 +08:00
|
|
|
/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
|
2018-03-16 22:06:41 +08:00
|
|
|
if (addr->sin_family == AF_UNSPEC &&
|
|
|
|
addr->sin_addr.s_addr != htonl(INADDR_ANY))
|
2017-01-09 23:55:13 +08:00
|
|
|
goto out;
|
|
|
|
|
|
|
|
lock_sock(sk);
|
|
|
|
|
|
|
|
/* Check if socket is already active */
|
|
|
|
rc = -EINVAL;
|
2019-08-02 16:47:50 +08:00
|
|
|
if (sk->sk_state != SMC_INIT || smc->connect_nonblock)
|
2017-01-09 23:55:13 +08:00
|
|
|
goto out_rel;
|
|
|
|
|
|
|
|
smc->clcsock->sk->sk_reuse = sk->sk_reuse;
|
2022-09-22 20:19:07 +08:00
|
|
|
smc->clcsock->sk->sk_reuseport = sk->sk_reuseport;
|
2017-01-09 23:55:13 +08:00
|
|
|
rc = kernel_bind(smc->clcsock, uaddr, addr_len);
|
|
|
|
|
|
|
|
out_rel:
|
|
|
|
release_sock(sk);
|
|
|
|
out:
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2023-08-05 01:06:24 +08:00
|
|
|
/* SOL_SOCKET level flags that are propagated from the smc socket to the
 * clc socket (smc is not called for these options from net/core, so only
 * the relevant settings and flags are copied)
 */

#define SK_FLAGS_SMC_TO_CLC ( \
	/* general socket behaviour */ \
	(1UL << SOCK_URGINLINE) | (1UL << SOCK_KEEPOPEN) | \
	(1UL << SOCK_LINGER) | (1UL << SOCK_BROADCAST) | \
	(1UL << SOCK_DBG) | (1UL << SOCK_LOCALROUTE) | \
	(1UL << SOCK_RXQ_OVFL) | (1UL << SOCK_WIFI_STATUS) | \
	(1UL << SOCK_NOFCS) | (1UL << SOCK_FILTER_LOCKED) | \
	/* timestamping */ \
	(1UL << SOCK_TIMESTAMP) | (1UL << SOCK_RCVTSTAMP) | \
	(1UL << SOCK_RCVTSTAMPNS) | \
	(1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
	(1UL << SOCK_TSTAMP_NEW))
|
|
|
|
|
|
|
|
/* if set, use value set by setsockopt() - else use IPv4 or SMC sysctl value */
|
|
|
|
static void smc_adjust_sock_bufsizes(struct sock *nsk, struct sock *osk,
|
|
|
|
unsigned long mask)
|
|
|
|
{
|
|
|
|
struct net *nnet = sock_net(nsk);
|
|
|
|
|
|
|
|
nsk->sk_userlocks = osk->sk_userlocks;
|
|
|
|
if (osk->sk_userlocks & SOCK_SNDBUF_LOCK) {
|
|
|
|
nsk->sk_sndbuf = osk->sk_sndbuf;
|
|
|
|
} else {
|
|
|
|
if (mask == SK_FLAGS_SMC_TO_CLC)
|
|
|
|
WRITE_ONCE(nsk->sk_sndbuf,
|
|
|
|
READ_ONCE(nnet->ipv4.sysctl_tcp_wmem[1]));
|
|
|
|
else
|
|
|
|
WRITE_ONCE(nsk->sk_sndbuf,
|
|
|
|
2 * READ_ONCE(nnet->smc.sysctl_wmem));
|
|
|
|
}
|
|
|
|
if (osk->sk_userlocks & SOCK_RCVBUF_LOCK) {
|
|
|
|
nsk->sk_rcvbuf = osk->sk_rcvbuf;
|
|
|
|
} else {
|
|
|
|
if (mask == SK_FLAGS_SMC_TO_CLC)
|
|
|
|
WRITE_ONCE(nsk->sk_rcvbuf,
|
|
|
|
READ_ONCE(nnet->ipv4.sysctl_tcp_rmem[1]));
|
|
|
|
else
|
|
|
|
WRITE_ONCE(nsk->sk_rcvbuf,
|
|
|
|
2 * READ_ONCE(nnet->smc.sysctl_rmem));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-01-09 23:55:13 +08:00
|
|
|
static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
|
|
|
|
unsigned long mask)
|
|
|
|
{
|
|
|
|
/* options we don't get control via setsockopt for */
|
|
|
|
nsk->sk_type = osk->sk_type;
|
|
|
|
nsk->sk_sndtimeo = osk->sk_sndtimeo;
|
|
|
|
nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
|
2023-07-28 23:03:15 +08:00
|
|
|
nsk->sk_mark = READ_ONCE(osk->sk_mark);
|
2023-09-22 04:28:11 +08:00
|
|
|
nsk->sk_priority = READ_ONCE(osk->sk_priority);
|
2017-01-09 23:55:13 +08:00
|
|
|
nsk->sk_rcvlowat = osk->sk_rcvlowat;
|
|
|
|
nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
|
|
|
|
nsk->sk_err = osk->sk_err;
|
|
|
|
|
|
|
|
nsk->sk_flags &= ~mask;
|
|
|
|
nsk->sk_flags |= osk->sk_flags & mask;
|
2023-08-05 01:06:24 +08:00
|
|
|
|
|
|
|
smc_adjust_sock_bufsizes(nsk, osk, mask);
|
2017-01-09 23:55:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
|
|
|
|
{
|
|
|
|
smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* sk_flags bits taken over from the clc socket by the smc socket */
#define SK_FLAGS_CLC_TO_SMC ( \
	(1UL << SOCK_DBG) | \
	(1UL << SOCK_LINGER) | \
	(1UL << SOCK_KEEPOPEN) | \
	(1UL << SOCK_URGINLINE))
|
|
|
|
/* copy only settings and flags relevant for smc from clc to smc socket */
|
|
|
|
static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
|
|
|
|
{
|
|
|
|
smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
|
|
|
|
}
|
|
|
|
|
net/smc: Allow virtually contiguous sndbufs or RMBs for SMC-R
On long-running enterprise production servers, high-order contiguous
memory pages are usually very rare and in most cases we can only get
fragmented pages.
When replacing TCP with SMC-R in such production scenarios, attempting
to allocate high-order physically contiguous sndbufs and RMBs may result
in frequent memory compaction, which will cause unexpected hung issue
and further stability risks.
So this patch is aimed to allow SMC-R link group to use virtually
contiguous sndbufs and RMBs to avoid potential issues mentioned above.
Whether to use physically or virtually contiguous buffers can be set
by sysctl smcr_buf_type.
Note that using virtually contiguous buffers will bring an acceptable
performance regression, which can be mainly divided into two parts:
1) regression in data path, which is brought by additional address
translation of sndbuf by RNIC in Tx. But in general, translating
address through MTT is fast.
Taking 256KB sndbuf and RMB as an example, the comparisons in qperf
latency and bandwidth test with physically and virtually contiguous
buffers are as follows:
- client:
smc_run taskset -c <cpu> qperf <server> -oo msg_size:1:64K:*2\
-t 5 -vu tcp_{bw|lat}
- server:
smc_run taskset -c <cpu> qperf
[latency]
msgsize tcp smcr smcr-use-virt-buf
1 11.17 us 7.56 us 7.51 us (-0.67%)
2 10.65 us 7.74 us 7.56 us (-2.31%)
4 11.11 us 7.52 us 7.59 us ( 0.84%)
8 10.83 us 7.55 us 7.51 us (-0.48%)
16 11.21 us 7.46 us 7.51 us ( 0.71%)
32 10.65 us 7.53 us 7.58 us ( 0.61%)
64 10.95 us 7.74 us 7.80 us ( 0.76%)
128 11.14 us 7.83 us 7.87 us ( 0.47%)
256 10.97 us 7.94 us 7.92 us (-0.28%)
512 11.23 us 7.94 us 8.20 us ( 3.25%)
1024 11.60 us 8.12 us 8.20 us ( 0.96%)
2048 14.04 us 8.30 us 8.51 us ( 2.49%)
4096 16.88 us 9.13 us 9.07 us (-0.64%)
8192 22.50 us 10.56 us 11.22 us ( 6.26%)
16384 28.99 us 12.88 us 13.83 us ( 7.37%)
32768 40.13 us 16.76 us 16.95 us ( 1.16%)
65536 68.70 us 24.68 us 24.85 us ( 0.68%)
[bandwidth]
msgsize tcp smcr smcr-use-virt-buf
1 1.65 MB/s 1.59 MB/s 1.53 MB/s (-3.88%)
2 3.32 MB/s 3.17 MB/s 3.08 MB/s (-2.67%)
4 6.66 MB/s 6.33 MB/s 6.09 MB/s (-3.85%)
8 13.67 MB/s 13.45 MB/s 11.97 MB/s (-10.99%)
16 25.36 MB/s 27.15 MB/s 24.16 MB/s (-11.01%)
32 48.22 MB/s 54.24 MB/s 49.41 MB/s (-8.89%)
64 106.79 MB/s 107.32 MB/s 99.05 MB/s (-7.71%)
128 210.21 MB/s 202.46 MB/s 201.02 MB/s (-0.71%)
256 400.81 MB/s 416.81 MB/s 393.52 MB/s (-5.59%)
512 746.49 MB/s 834.12 MB/s 809.99 MB/s (-2.89%)
1024 1292.33 MB/s 1641.96 MB/s 1571.82 MB/s (-4.27%)
2048 2007.64 MB/s 2760.44 MB/s 2717.68 MB/s (-1.55%)
4096 2665.17 MB/s 4157.44 MB/s 4070.76 MB/s (-2.09%)
8192 3159.72 MB/s 4361.57 MB/s 4270.65 MB/s (-2.08%)
16384 4186.70 MB/s 4574.13 MB/s 4501.17 MB/s (-1.60%)
32768 4093.21 MB/s 4487.42 MB/s 4322.43 MB/s (-3.68%)
65536 4057.14 MB/s 4735.61 MB/s 4555.17 MB/s (-3.81%)
2) regression in buffer initialization and destruction path, which is
brought by additional MR operations of sndbufs. But thanks to link
group buffer reuse mechanism, the impact of this kind of regression
decreases as times of buffer reuse increases.
Taking 256KB sndbuf and RMB as an example, latency of some key SMC-R
buffer-related function obtained by bpftrace are as follows:
Function Phys-bufs Virt-bufs
smcr_new_buf_create() 67154 ns 79164 ns
smc_ib_buf_map_sg() 525 ns 928 ns
smc_ib_get_memory_region() 162294 ns 161191 ns
smc_wr_reg_send() 9957 ns 9635 ns
smc_ib_put_memory_region() 203548 ns 198374 ns
smc_ib_buf_unmap_sg() 508 ns 1158 ns
------------
Test environment notes:
1. The above tests were run on 2 VMs within the same host.
2. The NIC is a ConnectX-4Lx, using SR-IOV and passing through 2 VFs to
each VM respectively.
3. The VMs' vCPUs are bound to different physical CPUs, and the bound
physical CPUs are isolated by the `isolcpus=xxx` cmdline option.
4. The NICs' queue number is set to 1.
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-07-14 17:44:04 +08:00
|
|
|
/* register the new vzalloced sndbuf on all links */
|
|
|
|
static int smcr_lgr_reg_sndbufs(struct smc_link *link,
|
|
|
|
struct smc_buf_desc *snd_desc)
|
|
|
|
{
|
|
|
|
struct smc_link_group *lgr = link->lgr;
|
|
|
|
int i, rc = 0;
|
|
|
|
|
|
|
|
if (!snd_desc->is_vm)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
/* protect against parallel smcr_link_reg_buf() */
|
net/smc: llc_conf_mutex refactor, replace it with rw_semaphore
llc_conf_mutex was used to protect links and link related configurations
in the same link group, for example, add or delete links. However,
in most cases, the protected critical area has only read semantics and
with no write semantics at all, such as obtaining a usable link or an
available rmb_desc.
This patch do simply code refactoring, replace mutex with rw_semaphore,
replace mutex_lock with down_write and replace mutex_unlock with
up_write.
Theoretically, this replacement is equivalent, but after this patch,
we can distinguish lock granularity according to different semantics
of critical areas.
Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2023-02-02 16:26:39 +08:00
|
|
|
down_write(&lgr->llc_conf_mutex);
|
net/smc: Allow virtually contiguous sndbufs or RMBs for SMC-R
On long-running enterprise production servers, high-order contiguous
memory pages are usually very rare and in most cases we can only get
fragmented pages.
When replacing TCP with SMC-R in such production scenarios, attempting
to allocate high-order physically contiguous sndbufs and RMBs may result
in frequent memory compaction, which will cause unexpected hung issue
and further stability risks.
So this patch is aimed to allow SMC-R link group to use virtually
contiguous sndbufs and RMBs to avoid potential issues mentioned above.
Whether to use physically or virtually contiguous buffers can be set
by sysctl smcr_buf_type.
Note that using virtually contiguous buffers will bring an acceptable
performance regression, which can be mainly divided into two parts:
1) regression in data path, which is brought by additional address
translation of sndbuf by RNIC in Tx. But in general, translating
address through MTT is fast.
Taking 256KB sndbuf and RMB as an example, the comparisons in qperf
latency and bandwidth test with physically and virtually contiguous
buffers are as follows:
- client:
smc_run taskset -c <cpu> qperf <server> -oo msg_size:1:64K:*2\
-t 5 -vu tcp_{bw|lat}
- server:
smc_run taskset -c <cpu> qperf
[latency]
msgsize tcp smcr smcr-use-virt-buf
1 11.17 us 7.56 us 7.51 us (-0.67%)
2 10.65 us 7.74 us 7.56 us (-2.31%)
4 11.11 us 7.52 us 7.59 us ( 0.84%)
8 10.83 us 7.55 us 7.51 us (-0.48%)
16 11.21 us 7.46 us 7.51 us ( 0.71%)
32 10.65 us 7.53 us 7.58 us ( 0.61%)
64 10.95 us 7.74 us 7.80 us ( 0.76%)
128 11.14 us 7.83 us 7.87 us ( 0.47%)
256 10.97 us 7.94 us 7.92 us (-0.28%)
512 11.23 us 7.94 us 8.20 us ( 3.25%)
1024 11.60 us 8.12 us 8.20 us ( 0.96%)
2048 14.04 us 8.30 us 8.51 us ( 2.49%)
4096 16.88 us 9.13 us 9.07 us (-0.64%)
8192 22.50 us 10.56 us 11.22 us ( 6.26%)
16384 28.99 us 12.88 us 13.83 us ( 7.37%)
32768 40.13 us 16.76 us 16.95 us ( 1.16%)
65536 68.70 us 24.68 us 24.85 us ( 0.68%)
[bandwidth]
msgsize tcp smcr smcr-use-virt-buf
1 1.65 MB/s 1.59 MB/s 1.53 MB/s (-3.88%)
2 3.32 MB/s 3.17 MB/s 3.08 MB/s (-2.67%)
4 6.66 MB/s 6.33 MB/s 6.09 MB/s (-3.85%)
8 13.67 MB/s 13.45 MB/s 11.97 MB/s (-10.99%)
16 25.36 MB/s 27.15 MB/s 24.16 MB/s (-11.01%)
32 48.22 MB/s 54.24 MB/s 49.41 MB/s (-8.89%)
64 106.79 MB/s 107.32 MB/s 99.05 MB/s (-7.71%)
128 210.21 MB/s 202.46 MB/s 201.02 MB/s (-0.71%)
256 400.81 MB/s 416.81 MB/s 393.52 MB/s (-5.59%)
512 746.49 MB/s 834.12 MB/s 809.99 MB/s (-2.89%)
1024 1292.33 MB/s 1641.96 MB/s 1571.82 MB/s (-4.27%)
2048 2007.64 MB/s 2760.44 MB/s 2717.68 MB/s (-1.55%)
4096 2665.17 MB/s 4157.44 MB/s 4070.76 MB/s (-2.09%)
8192 3159.72 MB/s 4361.57 MB/s 4270.65 MB/s (-2.08%)
16384 4186.70 MB/s 4574.13 MB/s 4501.17 MB/s (-1.60%)
32768 4093.21 MB/s 4487.42 MB/s 4322.43 MB/s (-3.68%)
65536 4057.14 MB/s 4735.61 MB/s 4555.17 MB/s (-3.81%)
2) regression in buffer initialization and destruction path, which is
brought by additional MR operations of sndbufs. But thanks to link
group buffer reuse mechanism, the impact of this kind of regression
decreases as times of buffer reuse increases.
Taking 256KB sndbuf and RMB as an example, latency of some key SMC-R
buffer-related function obtained by bpftrace are as follows:
Function Phys-bufs Virt-bufs
smcr_new_buf_create() 67154 ns 79164 ns
smc_ib_buf_map_sg() 525 ns 928 ns
smc_ib_get_memory_region() 162294 ns 161191 ns
smc_wr_reg_send() 9957 ns 9635 ns
smc_ib_put_memory_region() 203548 ns 198374 ns
smc_ib_buf_unmap_sg() 508 ns 1158 ns
------------
Test environment notes:
1. Above tests run on 2 VMs within the same Host.
2. The NIC is ConnectX-4Lx, using SRIOV and passing through 2 VFs to
the each VM respectively.
3. VMs' vCPUs are binded to different physical CPUs, and the binded
physical CPUs are isolated by `isolcpus=xxx` cmdline.
4. NICs' queue number are set to 1.
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-07-14 17:44:04 +08:00
|
|
|
for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
|
|
|
|
if (!smc_link_active(&lgr->lnk[i]))
|
|
|
|
continue;
|
|
|
|
rc = smcr_link_reg_buf(&lgr->lnk[i], snd_desc);
|
|
|
|
if (rc)
|
|
|
|
break;
|
|
|
|
}
|
net/smc: llc_conf_mutex refactor, replace it with rw_semaphore
llc_conf_mutex was used to protect links and link related configurations
in the same link group, for example, add or delete links. However,
in most cases, the protected critical area has only read semantics and
with no write semantics at all, such as obtaining a usable link or an
available rmb_desc.
This patch do simply code refactoring, replace mutex with rw_semaphore,
replace mutex_lock with down_write and replace mutex_unlock with
up_write.
Theoretically, this replacement is equivalent, but after this patch,
we can distinguish lock granularity according to different semantics
of critical areas.
Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2023-02-02 16:26:39 +08:00
|
|
|
up_write(&lgr->llc_conf_mutex);
|
net/smc: Allow virtually contiguous sndbufs or RMBs for SMC-R
On long-running enterprise production servers, high-order contiguous
memory pages are usually very rare and in most cases we can only get
fragmented pages.
When replacing TCP with SMC-R in such production scenarios, attempting
to allocate high-order physically contiguous sndbufs and RMBs may result
in frequent memory compaction, which will cause unexpected hung issue
and further stability risks.
So this patch is aimed to allow SMC-R link group to use virtually
contiguous sndbufs and RMBs to avoid potential issues mentioned above.
Whether to use physically or virtually contiguous buffers can be set
by sysctl smcr_buf_type.
Note that using virtually contiguous buffers will bring an acceptable
performance regression, which can be mainly divided into two parts:
1) regression in data path, which is brought by additional address
translation of sndbuf by RNIC in Tx. But in general, translating
address through MTT is fast.
Taking 256KB sndbuf and RMB as an example, the comparisons in qperf
latency and bandwidth test with physically and virtually contiguous
buffers are as follows:
- client:
smc_run taskset -c <cpu> qperf <server> -oo msg_size:1:64K:*2\
-t 5 -vu tcp_{bw|lat}
- server:
smc_run taskset -c <cpu> qperf
[latency]
msgsize tcp smcr smcr-use-virt-buf
1 11.17 us 7.56 us 7.51 us (-0.67%)
2 10.65 us 7.74 us 7.56 us (-2.31%)
4 11.11 us 7.52 us 7.59 us ( 0.84%)
8 10.83 us 7.55 us 7.51 us (-0.48%)
16 11.21 us 7.46 us 7.51 us ( 0.71%)
32 10.65 us 7.53 us 7.58 us ( 0.61%)
64 10.95 us 7.74 us 7.80 us ( 0.76%)
128 11.14 us 7.83 us 7.87 us ( 0.47%)
256 10.97 us 7.94 us 7.92 us (-0.28%)
512 11.23 us 7.94 us 8.20 us ( 3.25%)
1024 11.60 us 8.12 us 8.20 us ( 0.96%)
2048 14.04 us 8.30 us 8.51 us ( 2.49%)
4096 16.88 us 9.13 us 9.07 us (-0.64%)
8192 22.50 us 10.56 us 11.22 us ( 6.26%)
16384 28.99 us 12.88 us 13.83 us ( 7.37%)
32768 40.13 us 16.76 us 16.95 us ( 1.16%)
65536 68.70 us 24.68 us 24.85 us ( 0.68%)
[bandwidth]
msgsize tcp smcr smcr-use-virt-buf
1 1.65 MB/s 1.59 MB/s 1.53 MB/s (-3.88%)
2 3.32 MB/s 3.17 MB/s 3.08 MB/s (-2.67%)
4 6.66 MB/s 6.33 MB/s 6.09 MB/s (-3.85%)
8 13.67 MB/s 13.45 MB/s 11.97 MB/s (-10.99%)
16 25.36 MB/s 27.15 MB/s 24.16 MB/s (-11.01%)
32 48.22 MB/s 54.24 MB/s 49.41 MB/s (-8.89%)
64 106.79 MB/s 107.32 MB/s 99.05 MB/s (-7.71%)
128 210.21 MB/s 202.46 MB/s 201.02 MB/s (-0.71%)
256 400.81 MB/s 416.81 MB/s 393.52 MB/s (-5.59%)
512 746.49 MB/s 834.12 MB/s 809.99 MB/s (-2.89%)
1024 1292.33 MB/s 1641.96 MB/s 1571.82 MB/s (-4.27%)
2048 2007.64 MB/s 2760.44 MB/s 2717.68 MB/s (-1.55%)
4096 2665.17 MB/s 4157.44 MB/s 4070.76 MB/s (-2.09%)
8192 3159.72 MB/s 4361.57 MB/s 4270.65 MB/s (-2.08%)
16384 4186.70 MB/s 4574.13 MB/s 4501.17 MB/s (-1.60%)
32768 4093.21 MB/s 4487.42 MB/s 4322.43 MB/s (-3.68%)
65536 4057.14 MB/s 4735.61 MB/s 4555.17 MB/s (-3.81%)
2) regression in buffer initialization and destruction path, which is
brought by additional MR operations of sndbufs. But thanks to link
group buffer reuse mechanism, the impact of this kind of regression
decreases as times of buffer reuse increases.
Taking 256KB sndbuf and RMB as an example, latency of some key SMC-R
buffer-related function obtained by bpftrace are as follows:
Function Phys-bufs Virt-bufs
smcr_new_buf_create() 67154 ns 79164 ns
smc_ib_buf_map_sg() 525 ns 928 ns
smc_ib_get_memory_region() 162294 ns 161191 ns
smc_wr_reg_send() 9957 ns 9635 ns
smc_ib_put_memory_region() 203548 ns 198374 ns
smc_ib_buf_unmap_sg() 508 ns 1158 ns
------------
Test environment notes:
1. Above tests run on 2 VMs within the same Host.
2. The NIC is ConnectX-4Lx, using SRIOV and passing through 2 VFs to
the each VM respectively.
3. VMs' vCPUs are binded to different physical CPUs, and the binded
physical CPUs are isolated by `isolcpus=xxx` cmdline.
4. NICs' queue number are set to 1.
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-07-14 17:44:04 +08:00
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2020-04-29 23:10:41 +08:00
|
|
|
/* register the new rmb on all links */
|
2020-05-01 18:48:01 +08:00
|
|
|
static int smcr_lgr_reg_rmbs(struct smc_link *link,
|
2020-04-29 23:10:41 +08:00
|
|
|
struct smc_buf_desc *rmb_desc)
|
|
|
|
{
|
2020-05-01 18:48:01 +08:00
|
|
|
struct smc_link_group *lgr = link->lgr;
|
2023-02-02 16:26:41 +08:00
|
|
|
bool do_slow = false;
|
2020-05-01 18:48:01 +08:00
|
|
|
int i, rc = 0;
|
2020-04-29 23:10:41 +08:00
|
|
|
|
2020-05-01 18:48:05 +08:00
|
|
|
rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY);
|
|
|
|
if (rc)
|
|
|
|
return rc;
|
2023-02-02 16:26:41 +08:00
|
|
|
|
|
|
|
down_read(&lgr->llc_conf_mutex);
|
|
|
|
for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
|
|
|
|
if (!smc_link_active(&lgr->lnk[i]))
|
|
|
|
continue;
|
|
|
|
if (!rmb_desc->is_reg_mr[link->link_idx]) {
|
|
|
|
up_read(&lgr->llc_conf_mutex);
|
|
|
|
goto slow_path;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* mr register already */
|
|
|
|
goto fast_path;
|
|
|
|
slow_path:
|
|
|
|
do_slow = true;
|
2020-05-01 18:48:05 +08:00
|
|
|
/* protect against parallel smc_llc_cli_rkey_exchange() and
|
net/smc: Allow virtually contiguous sndbufs or RMBs for SMC-R
On long-running enterprise production servers, high-order contiguous
memory pages are usually very rare and in most cases we can only get
fragmented pages.
When replacing TCP with SMC-R in such production scenarios, attempting
to allocate high-order physically contiguous sndbufs and RMBs may result
in frequent memory compaction, which will cause unexpected hung issue
and further stability risks.
So this patch is aimed to allow SMC-R link group to use virtually
contiguous sndbufs and RMBs to avoid potential issues mentioned above.
Whether to use physically or virtually contiguous buffers can be set
by sysctl smcr_buf_type.
Note that using virtually contiguous buffers will bring an acceptable
performance regression, which can be mainly divided into two parts:
1) regression in data path, which is brought by additional address
translation of sndbuf by RNIC in Tx. But in general, translating
address through MTT is fast.
Taking 256KB sndbuf and RMB as an example, the comparisons in qperf
latency and bandwidth test with physically and virtually contiguous
buffers are as follows:
- client:
smc_run taskset -c <cpu> qperf <server> -oo msg_size:1:64K:*2\
-t 5 -vu tcp_{bw|lat}
- server:
smc_run taskset -c <cpu> qperf
[latency]
msgsize tcp smcr smcr-use-virt-buf
1 11.17 us 7.56 us 7.51 us (-0.67%)
2 10.65 us 7.74 us 7.56 us (-2.31%)
4 11.11 us 7.52 us 7.59 us ( 0.84%)
8 10.83 us 7.55 us 7.51 us (-0.48%)
16 11.21 us 7.46 us 7.51 us ( 0.71%)
32 10.65 us 7.53 us 7.58 us ( 0.61%)
64 10.95 us 7.74 us 7.80 us ( 0.76%)
128 11.14 us 7.83 us 7.87 us ( 0.47%)
256 10.97 us 7.94 us 7.92 us (-0.28%)
512 11.23 us 7.94 us 8.20 us ( 3.25%)
1024 11.60 us 8.12 us 8.20 us ( 0.96%)
2048 14.04 us 8.30 us 8.51 us ( 2.49%)
4096 16.88 us 9.13 us 9.07 us (-0.64%)
8192 22.50 us 10.56 us 11.22 us ( 6.26%)
16384 28.99 us 12.88 us 13.83 us ( 7.37%)
32768 40.13 us 16.76 us 16.95 us ( 1.16%)
65536 68.70 us 24.68 us 24.85 us ( 0.68%)
[bandwidth]
msgsize tcp smcr smcr-use-virt-buf
1 1.65 MB/s 1.59 MB/s 1.53 MB/s (-3.88%)
2 3.32 MB/s 3.17 MB/s 3.08 MB/s (-2.67%)
4 6.66 MB/s 6.33 MB/s 6.09 MB/s (-3.85%)
8 13.67 MB/s 13.45 MB/s 11.97 MB/s (-10.99%)
16 25.36 MB/s 27.15 MB/s 24.16 MB/s (-11.01%)
32 48.22 MB/s 54.24 MB/s 49.41 MB/s (-8.89%)
64 106.79 MB/s 107.32 MB/s 99.05 MB/s (-7.71%)
128 210.21 MB/s 202.46 MB/s 201.02 MB/s (-0.71%)
256 400.81 MB/s 416.81 MB/s 393.52 MB/s (-5.59%)
512 746.49 MB/s 834.12 MB/s 809.99 MB/s (-2.89%)
1024 1292.33 MB/s 1641.96 MB/s 1571.82 MB/s (-4.27%)
2048 2007.64 MB/s 2760.44 MB/s 2717.68 MB/s (-1.55%)
4096 2665.17 MB/s 4157.44 MB/s 4070.76 MB/s (-2.09%)
8192 3159.72 MB/s 4361.57 MB/s 4270.65 MB/s (-2.08%)
16384 4186.70 MB/s 4574.13 MB/s 4501.17 MB/s (-1.60%)
32768 4093.21 MB/s 4487.42 MB/s 4322.43 MB/s (-3.68%)
65536 4057.14 MB/s 4735.61 MB/s 4555.17 MB/s (-3.81%)
2) regression in buffer initialization and destruction path, which is
brought by additional MR operations of sndbufs. But thanks to link
group buffer reuse mechanism, the impact of this kind of regression
decreases as times of buffer reuse increases.
Taking 256KB sndbuf and RMB as an example, latency of some key SMC-R
buffer-related function obtained by bpftrace are as follows:
Function Phys-bufs Virt-bufs
smcr_new_buf_create() 67154 ns 79164 ns
smc_ib_buf_map_sg() 525 ns 928 ns
smc_ib_get_memory_region() 162294 ns 161191 ns
smc_wr_reg_send() 9957 ns 9635 ns
smc_ib_put_memory_region() 203548 ns 198374 ns
smc_ib_buf_unmap_sg() 508 ns 1158 ns
------------
Test environment notes:
1. Above tests run on 2 VMs within the same Host.
2. The NIC is ConnectX-4Lx, using SRIOV and passing through 2 VFs to
the each VM respectively.
3. VMs' vCPUs are binded to different physical CPUs, and the binded
physical CPUs are isolated by `isolcpus=xxx` cmdline.
4. NICs' queue number are set to 1.
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-07-14 17:44:04 +08:00
|
|
|
* parallel smcr_link_reg_buf()
|
2020-05-01 18:48:05 +08:00
|
|
|
*/
|
net/smc: llc_conf_mutex refactor, replace it with rw_semaphore
llc_conf_mutex was used to protect links and link related configurations
in the same link group, for example, add or delete links. However,
in most cases, the protected critical area has only read semantics and
with no write semantics at all, such as obtaining a usable link or an
available rmb_desc.
This patch do simply code refactoring, replace mutex with rw_semaphore,
replace mutex_lock with down_write and replace mutex_unlock with
up_write.
Theoretically, this replacement is equivalent, but after this patch,
we can distinguish lock granularity according to different semantics
of critical areas.
Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2023-02-02 16:26:39 +08:00
|
|
|
down_write(&lgr->llc_conf_mutex);
|
2020-04-29 23:10:41 +08:00
|
|
|
for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
|
2020-07-18 21:06:16 +08:00
|
|
|
if (!smc_link_active(&lgr->lnk[i]))
|
2020-04-29 23:10:41 +08:00
|
|
|
continue;
|
net/smc: Allow virtually contiguous sndbufs or RMBs for SMC-R
On long-running enterprise production servers, high-order contiguous
memory pages are usually very rare and in most cases we can only get
fragmented pages.
When replacing TCP with SMC-R in such production scenarios, attempting
to allocate high-order physically contiguous sndbufs and RMBs may result
in frequent memory compaction, which will cause unexpected hung issue
and further stability risks.
So this patch is aimed to allow SMC-R link group to use virtually
contiguous sndbufs and RMBs to avoid potential issues mentioned above.
Whether to use physically or virtually contiguous buffers can be set
by sysctl smcr_buf_type.
Note that using virtually contiguous buffers will bring an acceptable
performance regression, which can be mainly divided into two parts:
1) regression in data path, which is brought by additional address
translation of sndbuf by RNIC in Tx. But in general, translating
address through MTT is fast.
Taking 256KB sndbuf and RMB as an example, the comparisons in qperf
latency and bandwidth test with physically and virtually contiguous
buffers are as follows:
- client:
smc_run taskset -c <cpu> qperf <server> -oo msg_size:1:64K:*2\
-t 5 -vu tcp_{bw|lat}
- server:
smc_run taskset -c <cpu> qperf
[latency]
msgsize tcp smcr smcr-use-virt-buf
1 11.17 us 7.56 us 7.51 us (-0.67%)
2 10.65 us 7.74 us 7.56 us (-2.31%)
4 11.11 us 7.52 us 7.59 us ( 0.84%)
8 10.83 us 7.55 us 7.51 us (-0.48%)
16 11.21 us 7.46 us 7.51 us ( 0.71%)
32 10.65 us 7.53 us 7.58 us ( 0.61%)
64 10.95 us 7.74 us 7.80 us ( 0.76%)
128 11.14 us 7.83 us 7.87 us ( 0.47%)
256 10.97 us 7.94 us 7.92 us (-0.28%)
512 11.23 us 7.94 us 8.20 us ( 3.25%)
1024 11.60 us 8.12 us 8.20 us ( 0.96%)
2048 14.04 us 8.30 us 8.51 us ( 2.49%)
4096 16.88 us 9.13 us 9.07 us (-0.64%)
8192 22.50 us 10.56 us 11.22 us ( 6.26%)
16384 28.99 us 12.88 us 13.83 us ( 7.37%)
32768 40.13 us 16.76 us 16.95 us ( 1.16%)
65536 68.70 us 24.68 us 24.85 us ( 0.68%)
[bandwidth]
msgsize tcp smcr smcr-use-virt-buf
1 1.65 MB/s 1.59 MB/s 1.53 MB/s (-3.88%)
2 3.32 MB/s 3.17 MB/s 3.08 MB/s (-2.67%)
4 6.66 MB/s 6.33 MB/s 6.09 MB/s (-3.85%)
8 13.67 MB/s 13.45 MB/s 11.97 MB/s (-10.99%)
16 25.36 MB/s 27.15 MB/s 24.16 MB/s (-11.01%)
32 48.22 MB/s 54.24 MB/s 49.41 MB/s (-8.89%)
64 106.79 MB/s 107.32 MB/s 99.05 MB/s (-7.71%)
128 210.21 MB/s 202.46 MB/s 201.02 MB/s (-0.71%)
256 400.81 MB/s 416.81 MB/s 393.52 MB/s (-5.59%)
512 746.49 MB/s 834.12 MB/s 809.99 MB/s (-2.89%)
1024 1292.33 MB/s 1641.96 MB/s 1571.82 MB/s (-4.27%)
2048 2007.64 MB/s 2760.44 MB/s 2717.68 MB/s (-1.55%)
4096 2665.17 MB/s 4157.44 MB/s 4070.76 MB/s (-2.09%)
8192 3159.72 MB/s 4361.57 MB/s 4270.65 MB/s (-2.08%)
16384 4186.70 MB/s 4574.13 MB/s 4501.17 MB/s (-1.60%)
32768 4093.21 MB/s 4487.42 MB/s 4322.43 MB/s (-3.68%)
65536 4057.14 MB/s 4735.61 MB/s 4555.17 MB/s (-3.81%)
2) regression in buffer initialization and destruction path, which is
brought by additional MR operations of sndbufs. But thanks to link
group buffer reuse mechanism, the impact of this kind of regression
decreases as times of buffer reuse increases.
Taking 256KB sndbuf and RMB as an example, latency of some key SMC-R
buffer-related function obtained by bpftrace are as follows:
Function Phys-bufs Virt-bufs
smcr_new_buf_create() 67154 ns 79164 ns
smc_ib_buf_map_sg() 525 ns 928 ns
smc_ib_get_memory_region() 162294 ns 161191 ns
smc_wr_reg_send() 9957 ns 9635 ns
smc_ib_put_memory_region() 203548 ns 198374 ns
smc_ib_buf_unmap_sg() 508 ns 1158 ns
------------
Test environment notes:
1. Above tests run on 2 VMs within the same Host.
2. The NIC is ConnectX-4Lx, using SRIOV and passing through 2 VFs to
the each VM respectively.
3. VMs' vCPUs are binded to different physical CPUs, and the binded
physical CPUs are isolated by `isolcpus=xxx` cmdline.
4. NICs' queue number are set to 1.
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-07-14 17:44:04 +08:00
|
|
|
rc = smcr_link_reg_buf(&lgr->lnk[i], rmb_desc);
|
2020-04-29 23:10:41 +08:00
|
|
|
if (rc)
|
2020-05-01 18:48:01 +08:00
|
|
|
goto out;
|
2018-05-15 23:04:55 +08:00
|
|
|
}
|
2023-02-02 16:26:41 +08:00
|
|
|
fast_path:
|
2020-05-01 18:48:01 +08:00
|
|
|
/* exchange confirm_rkey msg with peer */
|
|
|
|
rc = smc_llc_do_confirm_rkey(link, rmb_desc);
|
|
|
|
if (rc) {
|
|
|
|
rc = -EFAULT;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
rmb_desc->is_conf_rkey = true;
|
|
|
|
out:
|
2023-02-02 16:26:41 +08:00
|
|
|
do_slow ? up_write(&lgr->llc_conf_mutex) : up_read(&lgr->llc_conf_mutex);
|
2020-05-01 18:48:05 +08:00
|
|
|
smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
|
2020-05-01 18:48:01 +08:00
|
|
|
return rc;
|
2018-05-03 23:57:37 +08:00
|
|
|
}
|
|
|
|
|
2020-04-29 23:10:41 +08:00
|
|
|
static int smcr_clnt_conf_first_link(struct smc_sock *smc)
|
2017-01-09 23:55:21 +08:00
|
|
|
{
|
2020-04-29 23:10:40 +08:00
|
|
|
struct smc_link *link = smc->conn.lnk;
|
2020-04-30 21:55:43 +08:00
|
|
|
struct smc_llc_qentry *qentry;
|
2017-01-09 23:55:21 +08:00
|
|
|
int rc;
|
|
|
|
|
2023-11-22 10:37:05 +08:00
|
|
|
/* Receive CONFIRM LINK request from server over RoCE fabric.
|
|
|
|
* Increasing the client's timeout by twice as much as the server's
|
|
|
|
* timeout by default can temporarily avoid decline messages of
|
|
|
|
* both sides crossing or colliding
|
|
|
|
*/
|
|
|
|
qentry = smc_llc_wait(link->lgr, NULL, 2 * SMC_LLC_WAIT_TIME,
|
2020-04-30 21:55:43 +08:00
|
|
|
SMC_LLC_CONFIRM_LINK);
|
|
|
|
if (!qentry) {
|
2017-01-09 23:55:21 +08:00
|
|
|
struct smc_clc_msg_decline dclc;
|
|
|
|
|
|
|
|
rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
|
2018-11-22 17:26:39 +08:00
|
|
|
SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
|
2018-11-22 17:26:37 +08:00
|
|
|
return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
|
2017-01-09 23:55:21 +08:00
|
|
|
}
|
2020-05-04 20:18:48 +08:00
|
|
|
smc_llc_save_peer_uid(qentry);
|
2020-04-30 21:55:43 +08:00
|
|
|
rc = smc_llc_eval_conf_link(qentry, SMC_LLC_REQ);
|
|
|
|
smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl);
|
|
|
|
if (rc)
|
2018-03-01 20:51:31 +08:00
|
|
|
return SMC_CLC_DECL_RMBE_EC;
|
|
|
|
|
2017-01-09 23:55:21 +08:00
|
|
|
rc = smc_ib_modify_qp_rts(link);
|
|
|
|
if (rc)
|
2018-07-25 22:35:32 +08:00
|
|
|
return SMC_CLC_DECL_ERR_RDYLNK;
|
2017-01-09 23:55:21 +08:00
|
|
|
|
|
|
|
smc_wr_remember_qp_attr(link);
|
2017-07-28 19:56:17 +08:00
|
|
|
|
net/smc: Allow virtually contiguous sndbufs or RMBs for SMC-R
On long-running enterprise production servers, high-order contiguous
memory pages are usually very rare and in most cases we can only get
fragmented pages.
When replacing TCP with SMC-R in such production scenarios, attempting
to allocate high-order physically contiguous sndbufs and RMBs may result
in frequent memory compaction, which will cause unexpected hung issue
and further stability risks.
So this patch is aimed to allow SMC-R link group to use virtually
contiguous sndbufs and RMBs to avoid potential issues mentioned above.
Whether to use physically or virtually contiguous buffers can be set
by sysctl smcr_buf_type.
Note that using virtually contiguous buffers will bring an acceptable
performance regression, which can be mainly divided into two parts:
1) regression in data path, which is brought by additional address
translation of sndbuf by RNIC in Tx. But in general, translating
address through MTT is fast.
Taking 256KB sndbuf and RMB as an example, the comparisons in qperf
latency and bandwidth test with physically and virtually contiguous
buffers are as follows:
- client:
smc_run taskset -c <cpu> qperf <server> -oo msg_size:1:64K:*2\
-t 5 -vu tcp_{bw|lat}
- server:
smc_run taskset -c <cpu> qperf
[latency]
msgsize tcp smcr smcr-use-virt-buf
1 11.17 us 7.56 us 7.51 us (-0.67%)
2 10.65 us 7.74 us 7.56 us (-2.31%)
4 11.11 us 7.52 us 7.59 us ( 0.84%)
8 10.83 us 7.55 us 7.51 us (-0.48%)
16 11.21 us 7.46 us 7.51 us ( 0.71%)
32 10.65 us 7.53 us 7.58 us ( 0.61%)
64 10.95 us 7.74 us 7.80 us ( 0.76%)
128 11.14 us 7.83 us 7.87 us ( 0.47%)
256 10.97 us 7.94 us 7.92 us (-0.28%)
512 11.23 us 7.94 us 8.20 us ( 3.25%)
1024 11.60 us 8.12 us 8.20 us ( 0.96%)
2048 14.04 us 8.30 us 8.51 us ( 2.49%)
4096 16.88 us 9.13 us 9.07 us (-0.64%)
8192 22.50 us 10.56 us 11.22 us ( 6.26%)
16384 28.99 us 12.88 us 13.83 us ( 7.37%)
32768 40.13 us 16.76 us 16.95 us ( 1.16%)
65536 68.70 us 24.68 us 24.85 us ( 0.68%)
[bandwidth]
msgsize tcp smcr smcr-use-virt-buf
1 1.65 MB/s 1.59 MB/s 1.53 MB/s (-3.88%)
2 3.32 MB/s 3.17 MB/s 3.08 MB/s (-2.67%)
4 6.66 MB/s 6.33 MB/s 6.09 MB/s (-3.85%)
8 13.67 MB/s 13.45 MB/s 11.97 MB/s (-10.99%)
16 25.36 MB/s 27.15 MB/s 24.16 MB/s (-11.01%)
32 48.22 MB/s 54.24 MB/s 49.41 MB/s (-8.89%)
64 106.79 MB/s 107.32 MB/s 99.05 MB/s (-7.71%)
128 210.21 MB/s 202.46 MB/s 201.02 MB/s (-0.71%)
256 400.81 MB/s 416.81 MB/s 393.52 MB/s (-5.59%)
512 746.49 MB/s 834.12 MB/s 809.99 MB/s (-2.89%)
1024 1292.33 MB/s 1641.96 MB/s 1571.82 MB/s (-4.27%)
2048 2007.64 MB/s 2760.44 MB/s 2717.68 MB/s (-1.55%)
4096 2665.17 MB/s 4157.44 MB/s 4070.76 MB/s (-2.09%)
8192 3159.72 MB/s 4361.57 MB/s 4270.65 MB/s (-2.08%)
16384 4186.70 MB/s 4574.13 MB/s 4501.17 MB/s (-1.60%)
32768 4093.21 MB/s 4487.42 MB/s 4322.43 MB/s (-3.68%)
65536 4057.14 MB/s 4735.61 MB/s 4555.17 MB/s (-3.81%)
2) regression in buffer initialization and destruction path, which is
brought by additional MR operations of sndbufs. But thanks to link
group buffer reuse mechanism, the impact of this kind of regression
decreases as times of buffer reuse increases.
Taking 256KB sndbuf and RMB as an example, latency of some key SMC-R
buffer-related function obtained by bpftrace are as follows:
Function Phys-bufs Virt-bufs
smcr_new_buf_create() 67154 ns 79164 ns
smc_ib_buf_map_sg() 525 ns 928 ns
smc_ib_get_memory_region() 162294 ns 161191 ns
smc_wr_reg_send() 9957 ns 9635 ns
smc_ib_put_memory_region() 203548 ns 198374 ns
smc_ib_buf_unmap_sg() 508 ns 1158 ns
------------
Test environment notes:
1. Above tests run on 2 VMs within the same Host.
2. The NIC is ConnectX-4Lx, using SRIOV and passing through 2 VFs to
the each VM respectively.
3. VMs' vCPUs are binded to different physical CPUs, and the binded
physical CPUs are isolated by `isolcpus=xxx` cmdline.
4. NICs' queue number are set to 1.
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-07-14 17:44:04 +08:00
|
|
|
/* reg the sndbuf if it was vzalloced */
|
|
|
|
if (smc->conn.sndbuf_desc->is_vm) {
|
|
|
|
if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc))
|
|
|
|
return SMC_CLC_DECL_ERR_REGBUF;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* reg the rmb */
|
|
|
|
if (smcr_link_reg_buf(link, smc->conn.rmb_desc))
|
|
|
|
return SMC_CLC_DECL_ERR_REGBUF;
|
2017-07-28 19:56:17 +08:00
|
|
|
|
2020-04-30 21:55:43 +08:00
|
|
|
/* confirm_rkey is implicit on 1st contact */
|
|
|
|
smc->conn.rmb_desc->is_conf_rkey = true;
|
|
|
|
|
2017-01-09 23:55:21 +08:00
|
|
|
/* send CONFIRM LINK response over RoCE fabric */
|
2018-07-25 22:35:30 +08:00
|
|
|
rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
|
2017-01-09 23:55:21 +08:00
|
|
|
if (rc < 0)
|
2018-07-25 22:35:32 +08:00
|
|
|
return SMC_CLC_DECL_TIMEOUT_CL;
|
2017-01-09 23:55:21 +08:00
|
|
|
|
2020-04-30 21:55:43 +08:00
|
|
|
smc_llc_link_active(link);
|
2020-05-05 21:01:20 +08:00
|
|
|
smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE);
|
2020-04-30 21:55:43 +08:00
|
|
|
|
2023-08-17 21:20:31 +08:00
|
|
|
if (link->lgr->max_links > 1) {
|
|
|
|
/* optional 2nd link, receive ADD LINK request from server */
|
|
|
|
qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME,
|
|
|
|
SMC_LLC_ADD_LINK);
|
|
|
|
if (!qentry) {
|
|
|
|
struct smc_clc_msg_decline dclc;
|
|
|
|
|
|
|
|
rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
|
|
|
|
SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
|
|
|
|
if (rc == -EAGAIN)
|
|
|
|
rc = 0; /* no DECLINE received, go with one link */
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl);
|
|
|
|
smc_llc_cli_add_link(link, qentry);
|
2018-03-01 20:51:32 +08:00
|
|
|
}
|
2018-03-01 20:51:31 +08:00
|
|
|
return 0;
|
2017-01-09 23:55:21 +08:00
|
|
|
}
|
|
|
|
|
2021-10-16 17:37:45 +08:00
|
|
|
static bool smc_isascii(char *hostname)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < SMC_MAX_HOSTNAME_LEN; i++)
|
|
|
|
if (!isascii(hostname[i]))
|
|
|
|
return false;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void smc_conn_save_peer_info_fce(struct smc_sock *smc,
|
|
|
|
struct smc_clc_msg_accept_confirm *clc)
|
|
|
|
{
|
|
|
|
struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
|
|
|
|
(struct smc_clc_msg_accept_confirm_v2 *)clc;
|
|
|
|
struct smc_clc_first_contact_ext *fce;
|
|
|
|
int clc_v2_len;
|
|
|
|
|
|
|
|
if (clc->hdr.version == SMC_V1 ||
|
|
|
|
!(clc->hdr.typev2 & SMC_FIRST_CONTACT_MASK))
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (smc->conn.lgr->is_smcd) {
|
|
|
|
memcpy(smc->conn.lgr->negotiated_eid, clc_v2->d1.eid,
|
|
|
|
SMC_MAX_EID_LEN);
|
|
|
|
clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm_v2,
|
|
|
|
d1);
|
|
|
|
} else {
|
|
|
|
memcpy(smc->conn.lgr->negotiated_eid, clc_v2->r1.eid,
|
|
|
|
SMC_MAX_EID_LEN);
|
|
|
|
clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm_v2,
|
|
|
|
r1);
|
|
|
|
}
|
|
|
|
fce = (struct smc_clc_first_contact_ext *)(((u8 *)clc_v2) + clc_v2_len);
|
|
|
|
smc->conn.lgr->peer_os = fce->os_type;
|
|
|
|
smc->conn.lgr->peer_smc_release = fce->release;
|
|
|
|
if (smc_isascii(fce->hostname))
|
|
|
|
memcpy(smc->conn.lgr->peer_hostname, fce->hostname,
|
|
|
|
SMC_MAX_HOSTNAME_LEN);
|
|
|
|
}
|
|
|
|
|
2018-06-29 01:05:11 +08:00
|
|
|
static void smcr_conn_save_peer_info(struct smc_sock *smc,
|
|
|
|
struct smc_clc_msg_accept_confirm *clc)
|
2017-01-09 23:55:17 +08:00
|
|
|
{
|
2020-09-11 00:48:23 +08:00
|
|
|
int bufsize = smc_uncompress_bufsize(clc->r0.rmbe_size);
|
2018-05-18 15:34:13 +08:00
|
|
|
|
2020-09-11 00:48:23 +08:00
|
|
|
smc->conn.peer_rmbe_idx = clc->r0.rmbe_idx;
|
|
|
|
smc->conn.local_tx_ctrl.token = ntohl(clc->r0.rmbe_alert_token);
|
2018-05-18 15:34:13 +08:00
|
|
|
smc->conn.peer_rmbe_size = bufsize;
|
2017-01-09 23:55:18 +08:00
|
|
|
atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
|
2018-05-18 15:34:13 +08:00
|
|
|
smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
|
2017-01-09 23:55:17 +08:00
|
|
|
}
|
|
|
|
|
2018-06-29 01:05:11 +08:00
|
|
|
static void smcd_conn_save_peer_info(struct smc_sock *smc,
|
|
|
|
struct smc_clc_msg_accept_confirm *clc)
|
|
|
|
{
|
2020-09-11 00:48:23 +08:00
|
|
|
int bufsize = smc_uncompress_bufsize(clc->d0.dmbe_size);
|
2018-06-29 01:05:11 +08:00
|
|
|
|
2020-09-11 00:48:23 +08:00
|
|
|
smc->conn.peer_rmbe_idx = clc->d0.dmbe_idx;
|
2023-12-07 01:02:37 +08:00
|
|
|
smc->conn.peer_token = ntohll(clc->d0.token);
|
2018-06-29 01:05:11 +08:00
|
|
|
/* msg header takes up space in the buffer */
|
|
|
|
smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
|
|
|
|
atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
|
|
|
|
smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void smc_conn_save_peer_info(struct smc_sock *smc,
|
|
|
|
struct smc_clc_msg_accept_confirm *clc)
|
|
|
|
{
|
|
|
|
if (smc->conn.lgr->is_smcd)
|
|
|
|
smcd_conn_save_peer_info(smc, clc);
|
|
|
|
else
|
|
|
|
smcr_conn_save_peer_info(smc, clc);
|
2021-10-16 17:37:45 +08:00
|
|
|
smc_conn_save_peer_info_fce(smc, clc);
|
2018-06-29 01:05:11 +08:00
|
|
|
}
|
|
|
|
|
2017-01-09 23:55:17 +08:00
|
|
|
static void smc_link_save_peer_info(struct smc_link *link,
|
2021-10-16 17:37:45 +08:00
|
|
|
struct smc_clc_msg_accept_confirm *clc,
|
|
|
|
struct smc_init_info *ini)
|
2017-01-09 23:55:17 +08:00
|
|
|
{
|
2020-09-11 00:48:23 +08:00
|
|
|
link->peer_qpn = ntoh24(clc->r0.qpn);
|
2021-10-16 17:37:45 +08:00
|
|
|
memcpy(link->peer_gid, ini->peer_gid, SMC_GID_SIZE);
|
|
|
|
memcpy(link->peer_mac, ini->peer_mac, sizeof(link->peer_mac));
|
2020-09-11 00:48:23 +08:00
|
|
|
link->peer_psn = ntoh24(clc->r0.psn);
|
|
|
|
link->peer_mtu = clc->r0.qp_mtu;
|
2017-01-09 23:55:17 +08:00
|
|
|
}
|
|
|
|
|
2021-06-16 22:52:55 +08:00
|
|
|
static void smc_stat_inc_fback_rsn_cnt(struct smc_sock *smc,
|
|
|
|
struct smc_stats_fback *fback_arr)
|
|
|
|
{
|
|
|
|
int cnt;
|
|
|
|
|
|
|
|
for (cnt = 0; cnt < SMC_MAX_FBACK_RSN_CNT; cnt++) {
|
|
|
|
if (fback_arr[cnt].fback_code == smc->fallback_rsn) {
|
|
|
|
fback_arr[cnt].count++;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (!fback_arr[cnt].fback_code) {
|
|
|
|
fback_arr[cnt].fback_code = smc->fallback_rsn;
|
|
|
|
fback_arr[cnt].count++;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void smc_stat_fallback(struct smc_sock *smc)
|
|
|
|
{
|
2021-06-16 22:52:58 +08:00
|
|
|
struct net *net = sock_net(&smc->sk);
|
|
|
|
|
|
|
|
mutex_lock(&net->smc.mutex_fback_rsn);
|
2021-06-16 22:52:55 +08:00
|
|
|
if (smc->listen_smc) {
|
2021-06-16 22:52:58 +08:00
|
|
|
smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->srv);
|
|
|
|
net->smc.fback_rsn->srv_fback_cnt++;
|
2021-06-16 22:52:55 +08:00
|
|
|
} else {
|
2021-06-16 22:52:58 +08:00
|
|
|
smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->clnt);
|
|
|
|
net->smc.fback_rsn->clnt_fback_cnt++;
|
2021-06-16 22:52:55 +08:00
|
|
|
}
|
2021-06-16 22:52:58 +08:00
|
|
|
mutex_unlock(&net->smc.mutex_fback_rsn);
|
2021-06-16 22:52:55 +08:00
|
|
|
}
|
|
|
|
|
net/smc: Forward wakeup to smc socket waitqueue after fallback
When we replace TCP with SMC and a fallback occurs, there may be
some socket waitqueue entries remaining in smc socket->wq, such
as eppoll_entries inserted by userspace applications.
After the fallback, data flows over TCP/IP and only clcsocket->wq
will be woken up. Applications can't be notified by the entries
which were inserted in smc socket->wq before fallback. So we need
a mechanism to wake up smc socket->wq at the same time if some
entries remain in it.
The current workaround is to transfer the entries from smc socket->wq
to clcsock->wq during the fallback. But this may cause a crash
like this:
general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI
CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Tainted: G E 5.16.0+ #107
RIP: 0010:__wake_up_common+0x65/0x170
Call Trace:
<IRQ>
__wake_up_common_lock+0x7a/0xc0
sock_def_readable+0x3c/0x70
tcp_data_queue+0x4a7/0xc40
tcp_rcv_established+0x32f/0x660
? sk_filter_trim_cap+0xcb/0x2e0
tcp_v4_do_rcv+0x10b/0x260
tcp_v4_rcv+0xd2a/0xde0
ip_protocol_deliver_rcu+0x3b/0x1d0
ip_local_deliver_finish+0x54/0x60
ip_local_deliver+0x6a/0x110
? tcp_v4_early_demux+0xa2/0x140
? tcp_v4_early_demux+0x10d/0x140
ip_sublist_rcv_finish+0x49/0x60
ip_sublist_rcv+0x19d/0x230
ip_list_rcv+0x13e/0x170
__netif_receive_skb_list_core+0x1c2/0x240
netif_receive_skb_list_internal+0x1e6/0x320
napi_complete_done+0x11d/0x190
mlx5e_napi_poll+0x163/0x6b0 [mlx5_core]
__napi_poll+0x3c/0x1b0
net_rx_action+0x27c/0x300
__do_softirq+0x114/0x2d2
irq_exit_rcu+0xb4/0xe0
common_interrupt+0xba/0xe0
</IRQ>
<TASK>
The crash is caused by privately transferring waitqueue entries from
smc socket->wq to clcsock->wq. The owners of these entries, such as
epoll, have no idea that the entries have been transferred to a
different socket wait queue and still use original waitqueue spinlock
(smc socket->wq.wait.lock) to make the entries operation exclusive,
but it doesn't work. The operations to the entries, such as removing
from the waitqueue (now is clcsock->wq after fallback), may cause a
crash when clcsock waitqueue is being iterated over at the moment.
This patch tries to fix this by no longer transferring wait queue
entries privately, but introducing own implementations of clcsock's
callback functions in fallback situation. The callback functions will
forward the wakeup to smc socket->wq if clcsock->wq is actually woken
up and smc socket->wq has remaining entries.
Fixes: 2153bd1e3d3d ("net/smc: Transfer remaining wait queue entries during fallback")
Suggested-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Acked-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-01-26 23:33:04 +08:00
|
|
|
/* must be called under rcu read lock */
|
|
|
|
static void smc_fback_wakeup_waitqueue(struct smc_sock *smc, void *key)
|
|
|
|
{
|
|
|
|
struct socket_wq *wq;
|
|
|
|
__poll_t flags;
|
|
|
|
|
|
|
|
wq = rcu_dereference(smc->sk.sk_wq);
|
|
|
|
if (!skwq_has_sleeper(wq))
|
|
|
|
return;
|
|
|
|
|
|
|
|
/* wake up smc sk->sk_wq */
|
|
|
|
if (!key) {
|
|
|
|
/* sk_state_change */
|
|
|
|
wake_up_interruptible_all(&wq->wait);
|
|
|
|
} else {
|
|
|
|
flags = key_to_poll(key);
|
|
|
|
if (flags & (EPOLLIN | EPOLLOUT))
|
|
|
|
/* sk_data_ready or sk_write_space */
|
|
|
|
wake_up_interruptible_sync_poll(&wq->wait, flags);
|
|
|
|
else if (flags & EPOLLERR)
|
|
|
|
/* sk_error_report */
|
|
|
|
wake_up_interruptible_poll(&wq->wait, flags);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Wait queue entry function that only records a wakeup: the
 * struct smc_mark_woken embedding @wait gets its woken flag set and
 * remembers @key. @mode and @sync are not used. Always returns 0.
 */
static int smc_fback_mark_woken(wait_queue_entry_t *wait,
				unsigned int mode, int sync, void *key)
{
	struct smc_mark_woken *marker;

	marker = container_of(wait, struct smc_mark_woken, wait_entry);
	marker->woken = true;
	marker->key = key;

	return 0;
}
|
|
|
|
|
|
|
|
static void smc_fback_forward_wakeup(struct smc_sock *smc, struct sock *clcsk,
|
|
|
|
void (*clcsock_callback)(struct sock *sk))
|
|
|
|
{
|
|
|
|
struct smc_mark_woken mark = { .woken = false };
|
|
|
|
struct socket_wq *wq;
|
|
|
|
|
|
|
|
init_waitqueue_func_entry(&mark.wait_entry,
|
|
|
|
smc_fback_mark_woken);
|
|
|
|
rcu_read_lock();
|
|
|
|
wq = rcu_dereference(clcsk->sk_wq);
|
|
|
|
if (!wq)
|
|
|
|
goto out;
|
|
|
|
add_wait_queue(sk_sleep(clcsk), &mark.wait_entry);
|
|
|
|
clcsock_callback(clcsk);
|
|
|
|
remove_wait_queue(sk_sleep(clcsk), &mark.wait_entry);
|
|
|
|
|
|
|
|
if (mark.woken)
|
|
|
|
smc_fback_wakeup_waitqueue(smc, mark.key);
|
|
|
|
out:
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
|
|
|
|
static void smc_fback_state_change(struct sock *clcsk)
|
|
|
|
{
|
2022-04-22 15:56:19 +08:00
|
|
|
struct smc_sock *smc;
|
net/smc: Forward wakeup to smc socket waitqueue after fallback
When we replace TCP with SMC and a fallback occurs, there may be
some socket waitqueue entries remaining in smc socket->wq, such
as eppoll_entries inserted by userspace applications.
After the fallback, data flows over TCP/IP and only clcsocket->wq
will be woken up. Applications can't be notified by the entries
which were inserted in smc socket->wq before fallback. So we need
a mechanism to wake up smc socket->wq at the same time if some
entries remaining in it.
The current workaround is to transfer the entries from smc socket->wq
to clcsock->wq during the fallback. But this may cause a crash
like this:
general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI
CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Tainted: G E 5.16.0+ #107
RIP: 0010:__wake_up_common+0x65/0x170
Call Trace:
<IRQ>
__wake_up_common_lock+0x7a/0xc0
sock_def_readable+0x3c/0x70
tcp_data_queue+0x4a7/0xc40
tcp_rcv_established+0x32f/0x660
? sk_filter_trim_cap+0xcb/0x2e0
tcp_v4_do_rcv+0x10b/0x260
tcp_v4_rcv+0xd2a/0xde0
ip_protocol_deliver_rcu+0x3b/0x1d0
ip_local_deliver_finish+0x54/0x60
ip_local_deliver+0x6a/0x110
? tcp_v4_early_demux+0xa2/0x140
? tcp_v4_early_demux+0x10d/0x140
ip_sublist_rcv_finish+0x49/0x60
ip_sublist_rcv+0x19d/0x230
ip_list_rcv+0x13e/0x170
__netif_receive_skb_list_core+0x1c2/0x240
netif_receive_skb_list_internal+0x1e6/0x320
napi_complete_done+0x11d/0x190
mlx5e_napi_poll+0x163/0x6b0 [mlx5_core]
__napi_poll+0x3c/0x1b0
net_rx_action+0x27c/0x300
__do_softirq+0x114/0x2d2
irq_exit_rcu+0xb4/0xe0
common_interrupt+0xba/0xe0
</IRQ>
<TASK>
The crash is caused by privately transferring waitqueue entries from
smc socket->wq to clcsock->wq. The owners of these entries, such as
epoll, have no idea that the entries have been transferred to a
different socket wait queue and still use original waitqueue spinlock
(smc socket->wq.wait.lock) to make the entries operation exclusive,
but it doesn't work. The operations to the entries, such as removing
from the waitqueue (now is clcsock->wq after fallback), may cause a
crash when clcsock waitqueue is being iterated over at the moment.
This patch tries to fix this by no longer transferring wait queue
entries privately, but introducing own implementations of clcsock's
callback functions in fallback situation. The callback functions will
forward the wakeup to smc socket->wq if clcsock->wq is actually woken
up and smc socket->wq has remaining entries.
Fixes: 2153bd1e3d3d ("net/smc: Transfer remaining wait queue entries during fallback")
Suggested-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Acked-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-01-26 23:33:04 +08:00
|
|
|
|
2022-04-22 15:56:19 +08:00
|
|
|
read_lock_bh(&clcsk->sk_callback_lock);
|
|
|
|
smc = smc_clcsock_user_data(clcsk);
|
|
|
|
if (smc)
|
|
|
|
smc_fback_forward_wakeup(smc, clcsk,
|
|
|
|
smc->clcsk_state_change);
|
|
|
|
read_unlock_bh(&clcsk->sk_callback_lock);
|
net/smc: Forward wakeup to smc socket waitqueue after fallback
When we replace TCP with SMC and a fallback occurs, there may be
some socket waitqueue entries remaining in smc socket->wq, such
as eppoll_entries inserted by userspace applications.
After the fallback, data flows over TCP/IP and only clcsocket->wq
will be woken up. Applications can't be notified by the entries
which were inserted in smc socket->wq before fallback. So we need
a mechanism to wake up smc socket->wq at the same time if some
entries remaining in it.
The current workaround is to transfer the entries from smc socket->wq
to clcsock->wq during the fallback. But this may cause a crash
like this:
general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI
CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Tainted: G E 5.16.0+ #107
RIP: 0010:__wake_up_common+0x65/0x170
Call Trace:
<IRQ>
__wake_up_common_lock+0x7a/0xc0
sock_def_readable+0x3c/0x70
tcp_data_queue+0x4a7/0xc40
tcp_rcv_established+0x32f/0x660
? sk_filter_trim_cap+0xcb/0x2e0
tcp_v4_do_rcv+0x10b/0x260
tcp_v4_rcv+0xd2a/0xde0
ip_protocol_deliver_rcu+0x3b/0x1d0
ip_local_deliver_finish+0x54/0x60
ip_local_deliver+0x6a/0x110
? tcp_v4_early_demux+0xa2/0x140
? tcp_v4_early_demux+0x10d/0x140
ip_sublist_rcv_finish+0x49/0x60
ip_sublist_rcv+0x19d/0x230
ip_list_rcv+0x13e/0x170
__netif_receive_skb_list_core+0x1c2/0x240
netif_receive_skb_list_internal+0x1e6/0x320
napi_complete_done+0x11d/0x190
mlx5e_napi_poll+0x163/0x6b0 [mlx5_core]
__napi_poll+0x3c/0x1b0
net_rx_action+0x27c/0x300
__do_softirq+0x114/0x2d2
irq_exit_rcu+0xb4/0xe0
common_interrupt+0xba/0xe0
</IRQ>
<TASK>
The crash is caused by privately transferring waitqueue entries from
smc socket->wq to clcsock->wq. The owners of these entries, such as
epoll, have no idea that the entries have been transferred to a
different socket wait queue and still use original waitqueue spinlock
(smc socket->wq.wait.lock) to make the entries operation exclusive,
but it doesn't work. The operations to the entries, such as removing
from the waitqueue (now is clcsock->wq after fallback), may cause a
crash when clcsock waitqueue is being iterated over at the moment.
This patch tries to fix this by no longer transferring wait queue
entries privately, but introducing own implementations of clcsock's
callback functions in fallback situation. The callback functions will
forward the wakeup to smc socket->wq if clcsock->wq is actually woken
up and smc socket->wq has remaining entries.
Fixes: 2153bd1e3d3d ("net/smc: Transfer remaining wait queue entries during fallback")
Suggested-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Acked-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-01-26 23:33:04 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void smc_fback_data_ready(struct sock *clcsk)
|
|
|
|
{
|
2022-04-22 15:56:19 +08:00
|
|
|
struct smc_sock *smc;
|
net/smc: Forward wakeup to smc socket waitqueue after fallback
When we replace TCP with SMC and a fallback occurs, there may be
some socket waitqueue entries remaining in smc socket->wq, such
as eppoll_entries inserted by userspace applications.
After the fallback, data flows over TCP/IP and only clcsocket->wq
will be woken up. Applications can't be notified by the entries
which were inserted in smc socket->wq before fallback. So we need
a mechanism to wake up smc socket->wq at the same time if some
entries remaining in it.
The current workaround is to transfer the entries from smc socket->wq
to clcsock->wq during the fallback. But this may cause a crash
like this:
general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI
CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Tainted: G E 5.16.0+ #107
RIP: 0010:__wake_up_common+0x65/0x170
Call Trace:
<IRQ>
__wake_up_common_lock+0x7a/0xc0
sock_def_readable+0x3c/0x70
tcp_data_queue+0x4a7/0xc40
tcp_rcv_established+0x32f/0x660
? sk_filter_trim_cap+0xcb/0x2e0
tcp_v4_do_rcv+0x10b/0x260
tcp_v4_rcv+0xd2a/0xde0
ip_protocol_deliver_rcu+0x3b/0x1d0
ip_local_deliver_finish+0x54/0x60
ip_local_deliver+0x6a/0x110
? tcp_v4_early_demux+0xa2/0x140
? tcp_v4_early_demux+0x10d/0x140
ip_sublist_rcv_finish+0x49/0x60
ip_sublist_rcv+0x19d/0x230
ip_list_rcv+0x13e/0x170
__netif_receive_skb_list_core+0x1c2/0x240
netif_receive_skb_list_internal+0x1e6/0x320
napi_complete_done+0x11d/0x190
mlx5e_napi_poll+0x163/0x6b0 [mlx5_core]
__napi_poll+0x3c/0x1b0
net_rx_action+0x27c/0x300
__do_softirq+0x114/0x2d2
irq_exit_rcu+0xb4/0xe0
common_interrupt+0xba/0xe0
</IRQ>
<TASK>
The crash is caused by privately transferring waitqueue entries from
smc socket->wq to clcsock->wq. The owners of these entries, such as
epoll, have no idea that the entries have been transferred to a
different socket wait queue and still use original waitqueue spinlock
(smc socket->wq.wait.lock) to make the entries operation exclusive,
but it doesn't work. The operations to the entries, such as removing
from the waitqueue (now is clcsock->wq after fallback), may cause a
crash when clcsock waitqueue is being iterated over at the moment.
This patch tries to fix this by no longer transferring wait queue
entries privately, but introducing own implementations of clcsock's
callback functions in fallback situation. The callback functions will
forward the wakeup to smc socket->wq if clcsock->wq is actually woken
up and smc socket->wq has remaining entries.
Fixes: 2153bd1e3d3d ("net/smc: Transfer remaining wait queue entries during fallback")
Suggested-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Acked-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-01-26 23:33:04 +08:00
|
|
|
|
2022-04-22 15:56:19 +08:00
|
|
|
read_lock_bh(&clcsk->sk_callback_lock);
|
|
|
|
smc = smc_clcsock_user_data(clcsk);
|
|
|
|
if (smc)
|
|
|
|
smc_fback_forward_wakeup(smc, clcsk,
|
|
|
|
smc->clcsk_data_ready);
|
|
|
|
read_unlock_bh(&clcsk->sk_callback_lock);
|
net/smc: Forward wakeup to smc socket waitqueue after fallback
When we replace TCP with SMC and a fallback occurs, there may be
some socket waitqueue entries remaining in smc socket->wq, such
as eppoll_entries inserted by userspace applications.
After the fallback, data flows over TCP/IP and only clcsocket->wq
will be woken up. Applications can't be notified by the entries
which were inserted in smc socket->wq before fallback. So we need
a mechanism to wake up smc socket->wq at the same time if some
entries remaining in it.
The current workaround is to transfer the entries from smc socket->wq
to clcsock->wq during the fallback. But this may cause a crash
like this:
general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI
CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Tainted: G E 5.16.0+ #107
RIP: 0010:__wake_up_common+0x65/0x170
Call Trace:
<IRQ>
__wake_up_common_lock+0x7a/0xc0
sock_def_readable+0x3c/0x70
tcp_data_queue+0x4a7/0xc40
tcp_rcv_established+0x32f/0x660
? sk_filter_trim_cap+0xcb/0x2e0
tcp_v4_do_rcv+0x10b/0x260
tcp_v4_rcv+0xd2a/0xde0
ip_protocol_deliver_rcu+0x3b/0x1d0
ip_local_deliver_finish+0x54/0x60
ip_local_deliver+0x6a/0x110
? tcp_v4_early_demux+0xa2/0x140
? tcp_v4_early_demux+0x10d/0x140
ip_sublist_rcv_finish+0x49/0x60
ip_sublist_rcv+0x19d/0x230
ip_list_rcv+0x13e/0x170
__netif_receive_skb_list_core+0x1c2/0x240
netif_receive_skb_list_internal+0x1e6/0x320
napi_complete_done+0x11d/0x190
mlx5e_napi_poll+0x163/0x6b0 [mlx5_core]
__napi_poll+0x3c/0x1b0
net_rx_action+0x27c/0x300
__do_softirq+0x114/0x2d2
irq_exit_rcu+0xb4/0xe0
common_interrupt+0xba/0xe0
</IRQ>
<TASK>
The crash is caused by privately transferring waitqueue entries from
smc socket->wq to clcsock->wq. The owners of these entries, such as
epoll, have no idea that the entries have been transferred to a
different socket wait queue and still use original waitqueue spinlock
(smc socket->wq.wait.lock) to make the entries operation exclusive,
but it doesn't work. The operations to the entries, such as removing
from the waitqueue (now is clcsock->wq after fallback), may cause a
crash when clcsock waitqueue is being iterated over at the moment.
This patch tries to fix this by no longer transferring wait queue
entries privately, but introducing own implementations of clcsock's
callback functions in fallback situation. The callback functions will
forward the wakeup to smc socket->wq if clcsock->wq is actually woken
up and smc socket->wq has remaining entries.
Fixes: 2153bd1e3d3d ("net/smc: Transfer remaining wait queue entries during fallback")
Suggested-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Acked-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-01-26 23:33:04 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void smc_fback_write_space(struct sock *clcsk)
|
|
|
|
{
|
2022-04-22 15:56:19 +08:00
|
|
|
struct smc_sock *smc;
|
net/smc: Forward wakeup to smc socket waitqueue after fallback
When we replace TCP with SMC and a fallback occurs, there may be
some socket waitqueue entries remaining in smc socket->wq, such
as eppoll_entries inserted by userspace applications.
After the fallback, data flows over TCP/IP and only clcsocket->wq
will be woken up. Applications can't be notified by the entries
which were inserted in smc socket->wq before fallback. So we need
a mechanism to wake up smc socket->wq at the same time if some
entries remain in it.
The current workaround is to transfer the entries from smc socket->wq
to clcsock->wq during the fallback. But this may cause a crash
like this:
general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI
CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Tainted: G E 5.16.0+ #107
RIP: 0010:__wake_up_common+0x65/0x170
Call Trace:
<IRQ>
__wake_up_common_lock+0x7a/0xc0
sock_def_readable+0x3c/0x70
tcp_data_queue+0x4a7/0xc40
tcp_rcv_established+0x32f/0x660
? sk_filter_trim_cap+0xcb/0x2e0
tcp_v4_do_rcv+0x10b/0x260
tcp_v4_rcv+0xd2a/0xde0
ip_protocol_deliver_rcu+0x3b/0x1d0
ip_local_deliver_finish+0x54/0x60
ip_local_deliver+0x6a/0x110
? tcp_v4_early_demux+0xa2/0x140
? tcp_v4_early_demux+0x10d/0x140
ip_sublist_rcv_finish+0x49/0x60
ip_sublist_rcv+0x19d/0x230
ip_list_rcv+0x13e/0x170
__netif_receive_skb_list_core+0x1c2/0x240
netif_receive_skb_list_internal+0x1e6/0x320
napi_complete_done+0x11d/0x190
mlx5e_napi_poll+0x163/0x6b0 [mlx5_core]
__napi_poll+0x3c/0x1b0
net_rx_action+0x27c/0x300
__do_softirq+0x114/0x2d2
irq_exit_rcu+0xb4/0xe0
common_interrupt+0xba/0xe0
</IRQ>
<TASK>
The crash is caused by privately transferring waitqueue entries from
smc socket->wq to clcsock->wq. The owners of these entries, such as
epoll, have no idea that the entries have been transferred to a
different socket wait queue and still use the original waitqueue spinlock
(smc socket->wq.wait.lock) to serialize operations on the entries,
which no longer protects them. Operations on the entries, such as removing
them from the waitqueue (now clcsock->wq after fallback), may cause a
crash if the clcsock waitqueue is being iterated over at that moment.
This patch tries to fix this by no longer transferring wait queue
entries privately, but introducing own implementations of clcsock's
callback functions in fallback situation. The callback functions will
forward the wakeup to smc socket->wq if clcsock->wq is actually woken
up and smc socket->wq has remaining entries.
Fixes: 2153bd1e3d3d ("net/smc: Transfer remaining wait queue entries during fallback")
Suggested-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Acked-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-01-26 23:33:04 +08:00
|
|
|
|
2022-04-22 15:56:19 +08:00
|
|
|
read_lock_bh(&clcsk->sk_callback_lock);
|
|
|
|
smc = smc_clcsock_user_data(clcsk);
|
|
|
|
if (smc)
|
|
|
|
smc_fback_forward_wakeup(smc, clcsk,
|
|
|
|
smc->clcsk_write_space);
|
|
|
|
read_unlock_bh(&clcsk->sk_callback_lock);
|
net/smc: Forward wakeup to smc socket waitqueue after fallback
When we replace TCP with SMC and a fallback occurs, there may be
some socket waitqueue entries remaining in smc socket->wq, such
as eppoll_entries inserted by userspace applications.
After the fallback, data flows over TCP/IP and only clcsocket->wq
will be woken up. Applications can't be notified by the entries
which were inserted in smc socket->wq before fallback. So we need
a mechanism to wake up smc socket->wq at the same time if some
entries remaining in it.
The current workaround is to transfer the entries from smc socket->wq
to clcsock->wq during the fallback. But this may cause a crash
like this:
general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI
CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Tainted: G E 5.16.0+ #107
RIP: 0010:__wake_up_common+0x65/0x170
Call Trace:
<IRQ>
__wake_up_common_lock+0x7a/0xc0
sock_def_readable+0x3c/0x70
tcp_data_queue+0x4a7/0xc40
tcp_rcv_established+0x32f/0x660
? sk_filter_trim_cap+0xcb/0x2e0
tcp_v4_do_rcv+0x10b/0x260
tcp_v4_rcv+0xd2a/0xde0
ip_protocol_deliver_rcu+0x3b/0x1d0
ip_local_deliver_finish+0x54/0x60
ip_local_deliver+0x6a/0x110
? tcp_v4_early_demux+0xa2/0x140
? tcp_v4_early_demux+0x10d/0x140
ip_sublist_rcv_finish+0x49/0x60
ip_sublist_rcv+0x19d/0x230
ip_list_rcv+0x13e/0x170
__netif_receive_skb_list_core+0x1c2/0x240
netif_receive_skb_list_internal+0x1e6/0x320
napi_complete_done+0x11d/0x190
mlx5e_napi_poll+0x163/0x6b0 [mlx5_core]
__napi_poll+0x3c/0x1b0
net_rx_action+0x27c/0x300
__do_softirq+0x114/0x2d2
irq_exit_rcu+0xb4/0xe0
common_interrupt+0xba/0xe0
</IRQ>
<TASK>
The crash is caused by privately transferring waitqueue entries from
smc socket->wq to clcsock->wq. The owners of these entries, such as
epoll, have no idea that the entries have been transferred to a
different socket wait queue and still use original waitqueue spinlock
(smc socket->wq.wait.lock) to make the entries operation exclusive,
but it doesn't work. The operations to the entries, such as removing
from the waitqueue (now is clcsock->wq after fallback), may cause a
crash when clcsock waitqueue is being iterated over at the moment.
This patch tries to fix this by no longer transferring wait queue
entries privately, but introducing own implementations of clcsock's
callback functions in fallback situation. The callback functions will
forward the wakeup to smc socket->wq if clcsock->wq is actually woken
up and smc socket->wq has remaining entries.
Fixes: 2153bd1e3d3d ("net/smc: Transfer remaining wait queue entries during fallback")
Suggested-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Acked-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-01-26 23:33:04 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void smc_fback_error_report(struct sock *clcsk)
|
|
|
|
{
|
2022-04-22 15:56:19 +08:00
|
|
|
struct smc_sock *smc;
|
net/smc: Forward wakeup to smc socket waitqueue after fallback
When we replace TCP with SMC and a fallback occurs, there may be
some socket waitqueue entries remaining in smc socket->wq, such
as eppoll_entries inserted by userspace applications.
After the fallback, data flows over TCP/IP and only clcsocket->wq
will be woken up. Applications can't be notified by the entries
which were inserted in smc socket->wq before fallback. So we need
a mechanism to wake up smc socket->wq at the same time if some
entries remaining in it.
The current workaround is to transfer the entries from smc socket->wq
to clcsock->wq during the fallback. But this may cause a crash
like this:
general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI
CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Tainted: G E 5.16.0+ #107
RIP: 0010:__wake_up_common+0x65/0x170
Call Trace:
<IRQ>
__wake_up_common_lock+0x7a/0xc0
sock_def_readable+0x3c/0x70
tcp_data_queue+0x4a7/0xc40
tcp_rcv_established+0x32f/0x660
? sk_filter_trim_cap+0xcb/0x2e0
tcp_v4_do_rcv+0x10b/0x260
tcp_v4_rcv+0xd2a/0xde0
ip_protocol_deliver_rcu+0x3b/0x1d0
ip_local_deliver_finish+0x54/0x60
ip_local_deliver+0x6a/0x110
? tcp_v4_early_demux+0xa2/0x140
? tcp_v4_early_demux+0x10d/0x140
ip_sublist_rcv_finish+0x49/0x60
ip_sublist_rcv+0x19d/0x230
ip_list_rcv+0x13e/0x170
__netif_receive_skb_list_core+0x1c2/0x240
netif_receive_skb_list_internal+0x1e6/0x320
napi_complete_done+0x11d/0x190
mlx5e_napi_poll+0x163/0x6b0 [mlx5_core]
__napi_poll+0x3c/0x1b0
net_rx_action+0x27c/0x300
__do_softirq+0x114/0x2d2
irq_exit_rcu+0xb4/0xe0
common_interrupt+0xba/0xe0
</IRQ>
<TASK>
The crash is caused by privately transferring waitqueue entries from
smc socket->wq to clcsock->wq. The owners of these entries, such as
epoll, have no idea that the entries have been transferred to a
different socket wait queue and still use original waitqueue spinlock
(smc socket->wq.wait.lock) to make the entries operation exclusive,
but it doesn't work. The operations to the entries, such as removing
from the waitqueue (now is clcsock->wq after fallback), may cause a
crash when clcsock waitqueue is being iterated over at the moment.
This patch tries to fix this by no longer transferring wait queue
entries privately, but introducing own implementations of clcsock's
callback functions in fallback situation. The callback functions will
forward the wakeup to smc socket->wq if clcsock->wq is actually woken
up and smc socket->wq has remaining entries.
Fixes: 2153bd1e3d3d ("net/smc: Transfer remaining wait queue entries during fallback")
Suggested-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Acked-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-01-26 23:33:04 +08:00
|
|
|
|
2022-04-22 15:56:19 +08:00
|
|
|
read_lock_bh(&clcsk->sk_callback_lock);
|
|
|
|
smc = smc_clcsock_user_data(clcsk);
|
|
|
|
if (smc)
|
|
|
|
smc_fback_forward_wakeup(smc, clcsk,
|
|
|
|
smc->clcsk_error_report);
|
|
|
|
read_unlock_bh(&clcsk->sk_callback_lock);
|
net/smc: Forward wakeup to smc socket waitqueue after fallback
When we replace TCP with SMC and a fallback occurs, there may be
some socket waitqueue entries remaining in smc socket->wq, such
as eppoll_entries inserted by userspace applications.
After the fallback, data flows over TCP/IP and only clcsocket->wq
will be woken up. Applications can't be notified by the entries
which were inserted in smc socket->wq before fallback. So we need
a mechanism to wake up smc socket->wq at the same time if some
entries remaining in it.
The current workaround is to transfer the entries from smc socket->wq
to clcsock->wq during the fallback. But this may cause a crash
like this:
general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI
CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Tainted: G E 5.16.0+ #107
RIP: 0010:__wake_up_common+0x65/0x170
Call Trace:
<IRQ>
__wake_up_common_lock+0x7a/0xc0
sock_def_readable+0x3c/0x70
tcp_data_queue+0x4a7/0xc40
tcp_rcv_established+0x32f/0x660
? sk_filter_trim_cap+0xcb/0x2e0
tcp_v4_do_rcv+0x10b/0x260
tcp_v4_rcv+0xd2a/0xde0
ip_protocol_deliver_rcu+0x3b/0x1d0
ip_local_deliver_finish+0x54/0x60
ip_local_deliver+0x6a/0x110
? tcp_v4_early_demux+0xa2/0x140
? tcp_v4_early_demux+0x10d/0x140
ip_sublist_rcv_finish+0x49/0x60
ip_sublist_rcv+0x19d/0x230
ip_list_rcv+0x13e/0x170
__netif_receive_skb_list_core+0x1c2/0x240
netif_receive_skb_list_internal+0x1e6/0x320
napi_complete_done+0x11d/0x190
mlx5e_napi_poll+0x163/0x6b0 [mlx5_core]
__napi_poll+0x3c/0x1b0
net_rx_action+0x27c/0x300
__do_softirq+0x114/0x2d2
irq_exit_rcu+0xb4/0xe0
common_interrupt+0xba/0xe0
</IRQ>
<TASK>
The crash is caused by privately transferring waitqueue entries from
smc socket->wq to clcsock->wq. The owners of these entries, such as
epoll, have no idea that the entries have been transferred to a
different socket wait queue and still use original waitqueue spinlock
(smc socket->wq.wait.lock) to make the entries operation exclusive,
but it doesn't work. The operations to the entries, such as removing
from the waitqueue (now is clcsock->wq after fallback), may cause a
crash when clcsock waitqueue is being iterated over at the moment.
This patch tries to fix this by no longer transferring wait queue
entries privately, but introducing own implementations of clcsock's
callback functions in fallback situation. The callback functions will
forward the wakeup to smc socket->wq if clcsock->wq is actually woken
up and smc socket->wq has remaining entries.
Fixes: 2153bd1e3d3d ("net/smc: Transfer remaining wait queue entries during fallback")
Suggested-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Acked-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-01-26 23:33:04 +08:00
|
|
|
}
|
|
|
|
|
2022-04-22 15:56:18 +08:00
|
|
|
static void smc_fback_replace_callbacks(struct smc_sock *smc)
|
|
|
|
{
|
|
|
|
struct sock *clcsk = smc->clcsock->sk;
|
|
|
|
|
2022-04-22 15:56:19 +08:00
|
|
|
write_lock_bh(&clcsk->sk_callback_lock);
|
2022-04-22 15:56:18 +08:00
|
|
|
clcsk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY);
|
|
|
|
|
|
|
|
smc_clcsock_replace_cb(&clcsk->sk_state_change, smc_fback_state_change,
|
|
|
|
&smc->clcsk_state_change);
|
|
|
|
smc_clcsock_replace_cb(&clcsk->sk_data_ready, smc_fback_data_ready,
|
|
|
|
&smc->clcsk_data_ready);
|
|
|
|
smc_clcsock_replace_cb(&clcsk->sk_write_space, smc_fback_write_space,
|
|
|
|
&smc->clcsk_write_space);
|
|
|
|
smc_clcsock_replace_cb(&clcsk->sk_error_report, smc_fback_error_report,
|
|
|
|
&smc->clcsk_error_report);
|
2022-04-22 15:56:19 +08:00
|
|
|
|
|
|
|
write_unlock_bh(&clcsk->sk_callback_lock);
|
2022-04-22 15:56:18 +08:00
|
|
|
}
|
|
|
|
|
2022-01-22 17:43:09 +08:00
|
|
|
static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code)
|
2019-04-11 17:17:32 +08:00
|
|
|
{
|
2022-02-09 22:10:53 +08:00
|
|
|
int rc = 0;
|
2021-11-13 15:33:35 +08:00
|
|
|
|
2022-01-22 17:43:09 +08:00
|
|
|
mutex_lock(&smc->clcsock_release_lock);
|
|
|
|
if (!smc->clcsock) {
|
2022-02-09 22:10:53 +08:00
|
|
|
rc = -EBADF;
|
|
|
|
goto out;
|
2022-01-22 17:43:09 +08:00
|
|
|
}
|
net/smc: Forward wakeup to smc socket waitqueue after fallback
When we replace TCP with SMC and a fallback occurs, there may be
some socket waitqueue entries remaining in smc socket->wq, such
as eppoll_entries inserted by userspace applications.
After the fallback, data flows over TCP/IP and only clcsocket->wq
will be woken up. Applications can't be notified by the entries
which were inserted in smc socket->wq before fallback. So we need
a mechanism to wake up smc socket->wq at the same time if some
entries remaining in it.
The current workaround is to transfer the entries from smc socket->wq
to clcsock->wq during the fallback. But this may cause a crash
like this:
general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI
CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Tainted: G E 5.16.0+ #107
RIP: 0010:__wake_up_common+0x65/0x170
Call Trace:
<IRQ>
__wake_up_common_lock+0x7a/0xc0
sock_def_readable+0x3c/0x70
tcp_data_queue+0x4a7/0xc40
tcp_rcv_established+0x32f/0x660
? sk_filter_trim_cap+0xcb/0x2e0
tcp_v4_do_rcv+0x10b/0x260
tcp_v4_rcv+0xd2a/0xde0
ip_protocol_deliver_rcu+0x3b/0x1d0
ip_local_deliver_finish+0x54/0x60
ip_local_deliver+0x6a/0x110
? tcp_v4_early_demux+0xa2/0x140
? tcp_v4_early_demux+0x10d/0x140
ip_sublist_rcv_finish+0x49/0x60
ip_sublist_rcv+0x19d/0x230
ip_list_rcv+0x13e/0x170
__netif_receive_skb_list_core+0x1c2/0x240
netif_receive_skb_list_internal+0x1e6/0x320
napi_complete_done+0x11d/0x190
mlx5e_napi_poll+0x163/0x6b0 [mlx5_core]
__napi_poll+0x3c/0x1b0
net_rx_action+0x27c/0x300
__do_softirq+0x114/0x2d2
irq_exit_rcu+0xb4/0xe0
common_interrupt+0xba/0xe0
</IRQ>
<TASK>
The crash is caused by privately transferring waitqueue entries from
smc socket->wq to clcsock->wq. The owners of these entries, such as
epoll, have no idea that the entries have been transferred to a
different socket wait queue and still use original waitqueue spinlock
(smc socket->wq.wait.lock) to make the entries operation exclusive,
but it doesn't work. The operations to the entries, such as removing
from the waitqueue (now is clcsock->wq after fallback), may cause a
crash when clcsock waitqueue is being iterated over at the moment.
This patch tries to fix this by no longer transferring wait queue
entries privately, but introducing own implementations of clcsock's
callback functions in fallback situation. The callback functions will
forward the wakeup to smc socket->wq if clcsock->wq is actually woken
up and smc socket->wq has remaining entries.
Fixes: 2153bd1e3d3d ("net/smc: Transfer remaining wait queue entries during fallback")
Suggested-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Acked-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-01-26 23:33:04 +08:00
|
|
|
|
2019-04-11 17:17:32 +08:00
|
|
|
smc->use_fallback = true;
|
2021-06-16 22:52:55 +08:00
|
|
|
smc->fallback_rsn = reason_code;
|
|
|
|
smc_stat_fallback(smc);
|
2021-11-01 15:39:12 +08:00
|
|
|
trace_smc_switch_to_fallback(smc, reason_code);
|
2019-04-11 17:17:32 +08:00
|
|
|
if (smc->sk.sk_socket && smc->sk.sk_socket->file) {
|
|
|
|
smc->clcsock->file = smc->sk.sk_socket->file;
|
|
|
|
smc->clcsock->file->private_data = smc->clcsock;
|
2020-02-14 15:58:59 +08:00
|
|
|
smc->clcsock->wq.fasync_list =
|
|
|
|
smc->sk.sk_socket->wq.fasync_list;
|
2021-11-13 15:33:35 +08:00
|
|
|
|
net/smc: Forward wakeup to smc socket waitqueue after fallback
When we replace TCP with SMC and a fallback occurs, there may be
some socket waitqueue entries remaining in smc socket->wq, such
as eppoll_entries inserted by userspace applications.
After the fallback, data flows over TCP/IP and only clcsocket->wq
will be woken up. Applications can't be notified by the entries
which were inserted in smc socket->wq before fallback. So we need
a mechanism to wake up smc socket->wq at the same time if some
entries remaining in it.
The current workaround is to transfer the entries from smc socket->wq
to clcsock->wq during the fallback. But this may cause a crash
like this:
general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI
CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Tainted: G E 5.16.0+ #107
RIP: 0010:__wake_up_common+0x65/0x170
Call Trace:
<IRQ>
__wake_up_common_lock+0x7a/0xc0
sock_def_readable+0x3c/0x70
tcp_data_queue+0x4a7/0xc40
tcp_rcv_established+0x32f/0x660
? sk_filter_trim_cap+0xcb/0x2e0
tcp_v4_do_rcv+0x10b/0x260
tcp_v4_rcv+0xd2a/0xde0
ip_protocol_deliver_rcu+0x3b/0x1d0
ip_local_deliver_finish+0x54/0x60
ip_local_deliver+0x6a/0x110
? tcp_v4_early_demux+0xa2/0x140
? tcp_v4_early_demux+0x10d/0x140
ip_sublist_rcv_finish+0x49/0x60
ip_sublist_rcv+0x19d/0x230
ip_list_rcv+0x13e/0x170
__netif_receive_skb_list_core+0x1c2/0x240
netif_receive_skb_list_internal+0x1e6/0x320
napi_complete_done+0x11d/0x190
mlx5e_napi_poll+0x163/0x6b0 [mlx5_core]
__napi_poll+0x3c/0x1b0
net_rx_action+0x27c/0x300
__do_softirq+0x114/0x2d2
irq_exit_rcu+0xb4/0xe0
common_interrupt+0xba/0xe0
</IRQ>
<TASK>
The crash is caused by privately transferring waitqueue entries from
smc socket->wq to clcsock->wq. The owners of these entries, such as
epoll, have no idea that the entries have been transferred to a
different socket wait queue and still use original waitqueue spinlock
(smc socket->wq.wait.lock) to make the entries operation exclusive,
but it doesn't work. The operations to the entries, such as removing
from the waitqueue (now is clcsock->wq after fallback), may cause a
crash when clcsock waitqueue is being iterated over at the moment.
This patch tries to fix this by no longer transferring wait queue
entries privately, but introducing own implementations of clcsock's
callback functions in fallback situation. The callback functions will
forward the wakeup to smc socket->wq if clcsock->wq is actually woken
up and smc socket->wq has remaining entries.
Fixes: 2153bd1e3d3d ("net/smc: Transfer remaining wait queue entries during fallback")
Suggested-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Acked-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-01-26 23:33:04 +08:00
|
|
|
/* There might be some wait entries remaining
|
|
|
|
* in smc sk->sk_wq and they should be woken up
|
|
|
|
* as clcsock's wait queue is woken up.
|
2021-11-13 15:33:35 +08:00
|
|
|
*/
|
2022-04-22 15:56:18 +08:00
|
|
|
smc_fback_replace_callbacks(smc);
|
2019-04-11 17:17:32 +08:00
|
|
|
}
|
2022-02-09 22:10:53 +08:00
|
|
|
out:
|
2022-01-22 17:43:09 +08:00
|
|
|
mutex_unlock(&smc->clcsock_release_lock);
|
2022-02-09 22:10:53 +08:00
|
|
|
return rc;
|
2019-04-11 17:17:32 +08:00
|
|
|
}
|
|
|
|
|
2018-05-18 15:34:18 +08:00
|
|
|
/* fall back during connect */
|
2018-07-25 22:35:32 +08:00
|
|
|
static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
|
2017-01-09 23:55:16 +08:00
|
|
|
{
|
2022-01-22 17:43:09 +08:00
|
|
|
struct net *net = sock_net(&smc->sk);
|
|
|
|
int rc = 0;
|
|
|
|
|
|
|
|
rc = smc_switch_to_fallback(smc, reason_code);
|
|
|
|
if (rc) { /* fallback fails */
|
|
|
|
this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt);
|
|
|
|
if (smc->sk.sk_state == SMC_INIT)
|
|
|
|
sock_put(&smc->sk); /* passive closing */
|
|
|
|
return rc;
|
|
|
|
}
|
2018-05-18 15:34:18 +08:00
|
|
|
smc_copy_sock_settings_to_clc(smc);
|
2019-04-12 18:57:23 +08:00
|
|
|
smc->connect_nonblock = 0;
|
2018-05-18 15:34:18 +08:00
|
|
|
if (smc->sk.sk_state == SMC_INIT)
|
|
|
|
smc->sk.sk_state = SMC_ACTIVE;
|
|
|
|
return 0;
|
|
|
|
}
|
2018-01-26 16:28:48 +08:00
|
|
|
|
2018-05-18 15:34:18 +08:00
|
|
|
/* decline and fall back during connect */
|
2020-09-26 18:44:32 +08:00
|
|
|
static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code,
|
|
|
|
u8 version)
|
2018-05-18 15:34:18 +08:00
|
|
|
{
|
2021-06-16 22:52:58 +08:00
|
|
|
struct net *net = sock_net(&smc->sk);
|
2018-05-18 15:34:18 +08:00
|
|
|
int rc;
|
2018-04-26 23:18:21 +08:00
|
|
|
|
2018-07-05 22:15:30 +08:00
|
|
|
if (reason_code < 0) { /* error, fallback is not possible */
|
2021-06-16 22:52:58 +08:00
|
|
|
this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt);
|
2018-07-05 22:15:30 +08:00
|
|
|
if (smc->sk.sk_state == SMC_INIT)
|
|
|
|
sock_put(&smc->sk); /* passive closing */
|
2018-05-18 15:34:18 +08:00
|
|
|
return reason_code;
|
2018-07-05 22:15:30 +08:00
|
|
|
}
|
2018-07-25 22:35:32 +08:00
|
|
|
if (reason_code != SMC_CLC_DECL_PEERDECL) {
|
2020-09-26 18:44:32 +08:00
|
|
|
rc = smc_clc_send_decline(smc, reason_code, version);
|
2018-07-05 22:15:30 +08:00
|
|
|
if (rc < 0) {
|
2021-06-16 22:52:58 +08:00
|
|
|
this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt);
|
2018-07-05 22:15:30 +08:00
|
|
|
if (smc->sk.sk_state == SMC_INIT)
|
|
|
|
sock_put(&smc->sk); /* passive closing */
|
2018-05-18 15:34:18 +08:00
|
|
|
return rc;
|
2018-07-05 22:15:30 +08:00
|
|
|
}
|
2017-10-25 17:01:46 +08:00
|
|
|
}
|
2018-07-25 22:35:32 +08:00
|
|
|
return smc_connect_fallback(smc, reason_code);
|
2018-05-18 15:34:18 +08:00
|
|
|
}
|
2017-10-25 17:01:46 +08:00
|
|
|
|
2020-12-02 03:20:36 +08:00
|
|
|
static void smc_conn_abort(struct smc_sock *smc, int local_first)
|
2018-05-18 15:34:18 +08:00
|
|
|
{
|
2022-01-06 20:42:08 +08:00
|
|
|
struct smc_connection *conn = &smc->conn;
|
|
|
|
struct smc_link_group *lgr = conn->lgr;
|
2022-01-13 16:36:41 +08:00
|
|
|
bool lgr_valid = false;
|
|
|
|
|
|
|
|
if (smc_conn_lgr_valid(conn))
|
|
|
|
lgr_valid = true;
|
2022-01-06 20:42:08 +08:00
|
|
|
|
|
|
|
smc_conn_free(conn);
|
2022-01-13 16:36:41 +08:00
|
|
|
if (local_first && lgr_valid)
|
2022-01-06 20:42:08 +08:00
|
|
|
smc_lgr_cleanup_early(lgr);
|
2018-05-18 15:34:18 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* check if there is a rdma device available for this connection. */
|
|
|
|
/* called for connect and listen */
|
2019-04-12 18:57:28 +08:00
|
|
|
static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini)
|
2018-05-18 15:34:18 +08:00
|
|
|
{
|
2017-01-09 23:55:16 +08:00
|
|
|
/* PNET table look up: search active ib_device and port
|
|
|
|
* within same PNETID that also contains the ethernet device
|
|
|
|
* used for the internal TCP socket
|
|
|
|
*/
|
2019-04-12 18:57:26 +08:00
|
|
|
smc_pnet_find_roce_resource(smc->clcsock->sk, ini);
|
2021-10-16 17:37:44 +08:00
|
|
|
if (!ini->check_smcrv2 && !ini->ib_dev)
|
|
|
|
return SMC_CLC_DECL_NOSMCRDEV;
|
|
|
|
if (ini->check_smcrv2 && !ini->smcrv2.ib_dev_v2)
|
2019-04-12 18:57:29 +08:00
|
|
|
return SMC_CLC_DECL_NOSMCRDEV;
|
2019-04-12 18:57:26 +08:00
|
|
|
return 0;
|
2018-05-18 15:34:18 +08:00
|
|
|
}
|
|
|
|
|
2018-06-29 01:05:11 +08:00
|
|
|
/* check if there is an ISM device available for this connection. */
|
|
|
|
/* called for connect and listen */
|
2019-04-12 18:57:28 +08:00
|
|
|
static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini)
|
2018-06-29 01:05:11 +08:00
|
|
|
{
|
|
|
|
/* Find ISM device with same PNETID as connecting interface */
|
2019-04-12 18:57:26 +08:00
|
|
|
smc_pnet_find_ism_resource(smc->clcsock->sk, ini);
|
2020-09-26 18:44:23 +08:00
|
|
|
if (!ini->ism_dev[0])
|
2019-04-12 18:57:29 +08:00
|
|
|
return SMC_CLC_DECL_NOSMCDDEV;
|
2020-09-26 18:44:25 +08:00
|
|
|
else
|
|
|
|
ini->ism_chid[0] = smc_ism_get_chid(ini->ism_dev[0]);
|
2018-06-29 01:05:11 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-10-02 23:09:26 +08:00
|
|
|
/* is chid unique for the ism devices that are already determined? */
|
|
|
|
static bool smc_find_ism_v2_is_unique_chid(u16 chid, struct smc_init_info *ini,
|
|
|
|
int cnt)
|
|
|
|
{
|
|
|
|
int i = (!ini->ism_dev[0]) ? 1 : 0;
|
|
|
|
|
|
|
|
for (; i < cnt; i++)
|
|
|
|
if (ini->ism_chid[i] == chid)
|
|
|
|
return false;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2020-09-26 18:44:27 +08:00
|
|
|
/* determine possible V2 ISM devices (either without PNETID or with PNETID plus
|
|
|
|
* PNETID matching net_device)
|
|
|
|
*/
|
|
|
|
static int smc_find_ism_v2_device_clnt(struct smc_sock *smc,
|
|
|
|
struct smc_init_info *ini)
|
|
|
|
{
|
|
|
|
int rc = SMC_CLC_DECL_NOSMCDDEV;
|
|
|
|
struct smcd_dev *smcd;
|
|
|
|
int i = 1;
|
2020-10-02 23:09:26 +08:00
|
|
|
u16 chid;
|
2020-09-26 18:44:27 +08:00
|
|
|
|
|
|
|
if (smcd_indicated(ini->smc_type_v1))
|
|
|
|
rc = 0; /* already initialized for V1 */
|
|
|
|
mutex_lock(&smcd_dev_list.mutex);
|
|
|
|
list_for_each_entry(smcd, &smcd_dev_list.list, list) {
|
|
|
|
if (smcd->going_away || smcd == ini->ism_dev[0])
|
|
|
|
continue;
|
2020-10-02 23:09:26 +08:00
|
|
|
chid = smc_ism_get_chid(smcd);
|
|
|
|
if (!smc_find_ism_v2_is_unique_chid(chid, ini, i))
|
|
|
|
continue;
|
2020-09-26 18:44:27 +08:00
|
|
|
if (!smc_pnet_is_pnetid_set(smcd->pnetid) ||
|
|
|
|
smc_pnet_is_ndev_pnetid(sock_net(&smc->sk), smcd->pnetid)) {
|
|
|
|
ini->ism_dev[i] = smcd;
|
2020-10-02 23:09:26 +08:00
|
|
|
ini->ism_chid[i] = chid;
|
2020-09-26 18:44:27 +08:00
|
|
|
ini->is_smcd = true;
|
|
|
|
rc = 0;
|
|
|
|
i++;
|
|
|
|
if (i > SMC_MAX_ISM_DEVS)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
mutex_unlock(&smcd_dev_list.mutex);
|
|
|
|
ini->ism_offered_cnt = i - 1;
|
|
|
|
if (!ini->ism_dev[0] && !ini->ism_dev[1])
|
|
|
|
ini->smcd_version = 0;
|
|
|
|
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2018-06-29 01:05:11 +08:00
|
|
|
/* Check for VLAN ID and register it on ISM device just for CLC handshake */
|
|
|
|
static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
|
2019-04-12 18:57:26 +08:00
|
|
|
struct smc_init_info *ini)
|
2018-06-29 01:05:11 +08:00
|
|
|
{
|
2020-09-26 18:44:23 +08:00
|
|
|
if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev[0], ini->vlan_id))
|
2019-04-12 18:57:30 +08:00
|
|
|
return SMC_CLC_DECL_ISMVLANERR;
|
2018-06-29 01:05:11 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-09-26 18:44:27 +08:00
|
|
|
static int smc_find_proposal_devices(struct smc_sock *smc,
|
|
|
|
struct smc_init_info *ini)
|
|
|
|
{
|
|
|
|
int rc = 0;
|
|
|
|
|
|
|
|
/* check if there is an ism device available */
|
2021-10-16 17:37:44 +08:00
|
|
|
if (!(ini->smcd_version & SMC_V1) ||
|
|
|
|
smc_find_ism_device(smc, ini) ||
|
|
|
|
smc_connect_ism_vlan_setup(smc, ini))
|
|
|
|
ini->smcd_version &= ~SMC_V1;
|
|
|
|
/* else ISM V1 is supported for this connection */
|
|
|
|
|
|
|
|
/* check if there is an rdma device available */
|
|
|
|
if (!(ini->smcr_version & SMC_V1) ||
|
|
|
|
smc_find_rdma_device(smc, ini))
|
|
|
|
ini->smcr_version &= ~SMC_V1;
|
|
|
|
/* else RDMA is supported for this connection */
|
|
|
|
|
|
|
|
ini->smc_type_v1 = smc_indicated_type(ini->smcd_version & SMC_V1,
|
|
|
|
ini->smcr_version & SMC_V1);
|
|
|
|
|
|
|
|
/* check if there is an ism v2 device available */
|
|
|
|
if (!(ini->smcd_version & SMC_V2) ||
|
|
|
|
!smc_ism_is_v2_capable() ||
|
|
|
|
smc_find_ism_v2_device_clnt(smc, ini))
|
|
|
|
ini->smcd_version &= ~SMC_V2;
|
|
|
|
|
|
|
|
/* check if there is an rdma v2 device available */
|
|
|
|
ini->check_smcrv2 = true;
|
|
|
|
ini->smcrv2.saddr = smc->clcsock->sk->sk_rcv_saddr;
|
|
|
|
if (!(ini->smcr_version & SMC_V2) ||
|
|
|
|
smc->clcsock->sk->sk_family != AF_INET ||
|
|
|
|
!smc_clc_ueid_count() ||
|
|
|
|
smc_find_rdma_device(smc, ini))
|
|
|
|
ini->smcr_version &= ~SMC_V2;
|
|
|
|
ini->check_smcrv2 = false;
|
|
|
|
|
|
|
|
ini->smc_type_v2 = smc_indicated_type(ini->smcd_version & SMC_V2,
|
|
|
|
ini->smcr_version & SMC_V2);
|
2020-09-26 18:44:27 +08:00
|
|
|
|
|
|
|
/* if neither ISM nor RDMA are supported, fallback */
|
2021-10-16 17:37:44 +08:00
|
|
|
if (ini->smc_type_v1 == SMC_TYPE_N && ini->smc_type_v2 == SMC_TYPE_N)
|
2020-09-26 18:44:27 +08:00
|
|
|
rc = SMC_CLC_DECL_NOSMCDEV;
|
|
|
|
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2018-06-29 01:05:11 +08:00
|
|
|
/* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
|
|
|
|
* used, the VLAN ID will be registered again during the connection setup.
|
|
|
|
*/
|
2020-09-26 18:44:27 +08:00
|
|
|
static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc,
|
2019-04-12 18:57:26 +08:00
|
|
|
struct smc_init_info *ini)
|
2018-06-29 01:05:11 +08:00
|
|
|
{
|
2020-09-26 18:44:27 +08:00
|
|
|
if (!smcd_indicated(ini->smc_type_v1))
|
2018-06-29 01:05:11 +08:00
|
|
|
return 0;
|
2020-09-26 18:44:23 +08:00
|
|
|
if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev[0], ini->vlan_id))
|
2018-06-29 01:05:11 +08:00
|
|
|
return SMC_CLC_DECL_CNFERR;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-09-26 18:44:30 +08:00
|
|
|
/* Buffer length used when waiting for the CLC accept message: the v2
 * accept/confirm message plus the v2x first contact extension plus the
 * message trailer.
 */
#define SMC_CLC_MAX_ACCEPT_LEN					\
	(sizeof(struct smc_clc_msg_accept_confirm_v2) +	\
	 sizeof(struct smc_clc_first_contact_ext_v2x) +	\
	 sizeof(struct smc_clc_msg_trail))
|
|
|
|
|
2018-05-18 15:34:18 +08:00
|
|
|
/* CLC handshake during connect */
|
2020-09-26 18:44:27 +08:00
|
|
|
static int smc_connect_clc(struct smc_sock *smc,
|
2020-09-26 18:44:30 +08:00
|
|
|
struct smc_clc_msg_accept_confirm_v2 *aclc2,
|
2019-04-12 18:57:26 +08:00
|
|
|
struct smc_init_info *ini)
|
2018-05-18 15:34:18 +08:00
|
|
|
{
|
|
|
|
int rc = 0;
|
2017-01-09 23:55:16 +08:00
|
|
|
|
|
|
|
/* do inband token exchange */
|
2020-09-26 18:44:27 +08:00
|
|
|
rc = smc_clc_send_proposal(smc, ini);
|
2018-05-18 15:34:18 +08:00
|
|
|
if (rc)
|
|
|
|
return rc;
|
2017-01-09 23:55:16 +08:00
|
|
|
/* receive SMC Accept CLC message */
|
2020-09-26 18:44:30 +08:00
|
|
|
return smc_clc_wait_msg(smc, aclc2, SMC_CLC_MAX_ACCEPT_LEN,
|
|
|
|
SMC_CLC_ACCEPT, CLC_WAIT_TIME);
|
2018-05-18 15:34:18 +08:00
|
|
|
}
|
|
|
|
|
2021-10-16 17:37:50 +08:00
|
|
|
/* Fill @gidlist with @known_gid and, if one is found, the GID of an
 * alternate RoCE device.
 *
 * @gidlist is zeroed first, so @known_gid always ends up as entry 0. The
 * alternate lookup runs on a temporary smc_init_info; if that allocation
 * fails or smc_pnet_find_alt_roce() leaves smcrv2.ib_dev_v2 unset, the list
 * keeps just the one entry.
 */
void smc_fill_gid_list(struct smc_link_group *lgr,
		       struct smc_gidlist *gidlist,
		       struct smc_ib_device *known_dev, u8 *known_gid)
{
	struct smc_init_info *alt_ini;

	memset(gidlist, 0, sizeof(*gidlist));
	memcpy(gidlist->list[gidlist->len++], known_gid, SMC_GID_SIZE);

	alt_ini = kzalloc(sizeof(*alt_ini), GFP_KERNEL);
	if (!alt_ini)
		return;

	alt_ini->vlan_id = lgr->vlan_id;
	alt_ini->check_smcrv2 = true;
	alt_ini->smcrv2.saddr = lgr->saddr;
	smc_pnet_find_alt_roce(lgr, alt_ini, known_dev);

	if (alt_ini->smcrv2.ib_dev_v2)
		memcpy(gidlist->list[gidlist->len++],
		       alt_ini->smcrv2.ib_gid_v2, SMC_GID_SIZE);

	kfree(alt_ini);
}
|
|
|
|
|
|
|
|
static int smc_connect_rdma_v2_prepare(struct smc_sock *smc,
|
|
|
|
struct smc_clc_msg_accept_confirm *aclc,
|
|
|
|
struct smc_init_info *ini)
|
|
|
|
{
|
|
|
|
struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
|
|
|
|
(struct smc_clc_msg_accept_confirm_v2 *)aclc;
|
|
|
|
struct smc_clc_first_contact_ext *fce =
|
2023-08-17 21:20:27 +08:00
|
|
|
smc_get_clc_first_contact_ext(clc_v2, false);
|
2023-10-11 15:48:51 +08:00
|
|
|
struct net *net = sock_net(&smc->sk);
|
2023-08-17 21:20:29 +08:00
|
|
|
int rc;
|
2021-10-16 17:37:45 +08:00
|
|
|
|
|
|
|
if (!ini->first_contact_peer || aclc->hdr.version == SMC_V1)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (fce->v2_direct) {
|
|
|
|
memcpy(ini->smcrv2.nexthop_mac, &aclc->r0.lcl.mac, ETH_ALEN);
|
|
|
|
ini->smcrv2.uses_gateway = false;
|
|
|
|
} else {
|
2023-10-11 15:48:51 +08:00
|
|
|
if (smc_ib_find_route(net, smc->clcsock->sk->sk_rcv_saddr,
|
2021-10-16 17:37:45 +08:00
|
|
|
smc_ib_gid_to_ipv4(aclc->r0.lcl.gid),
|
|
|
|
ini->smcrv2.nexthop_mac,
|
|
|
|
&ini->smcrv2.uses_gateway))
|
|
|
|
return SMC_CLC_DECL_NOROUTE;
|
|
|
|
if (!ini->smcrv2.uses_gateway) {
|
|
|
|
/* mismatch: peer claims indirect, but its direct */
|
|
|
|
return SMC_CLC_DECL_NOINDIRECT;
|
|
|
|
}
|
|
|
|
}
|
2023-08-17 21:20:27 +08:00
|
|
|
|
|
|
|
ini->release_nr = fce->release;
|
2023-08-17 21:20:29 +08:00
|
|
|
rc = smc_clc_clnt_v2x_features_validate(fce, ini);
|
|
|
|
if (rc)
|
|
|
|
return rc;
|
2023-08-17 21:20:27 +08:00
|
|
|
|
2021-10-16 17:37:45 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-05-18 15:34:18 +08:00
|
|
|
/* setup for RDMA connection of client */
|
|
|
|
static int smc_connect_rdma(struct smc_sock *smc,
|
|
|
|
struct smc_clc_msg_accept_confirm *aclc,
|
2019-04-12 18:57:26 +08:00
|
|
|
struct smc_init_info *ini)
|
2018-05-18 15:34:18 +08:00
|
|
|
{
|
2020-04-30 21:55:43 +08:00
|
|
|
int i, reason_code = 0;
|
2018-05-18 15:34:18 +08:00
|
|
|
struct smc_link *link;
|
2021-10-16 17:37:45 +08:00
|
|
|
u8 *eid = NULL;
|
2017-01-09 23:55:16 +08:00
|
|
|
|
2019-04-12 18:57:26 +08:00
|
|
|
ini->is_smcd = false;
|
2020-09-11 00:48:23 +08:00
|
|
|
ini->ib_clcqpn = ntoh24(aclc->r0.qpn);
|
2020-09-26 18:44:20 +08:00
|
|
|
ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK;
|
2021-10-16 17:37:45 +08:00
|
|
|
memcpy(ini->peer_systemid, aclc->r0.lcl.id_for_peer, SMC_SYSTEMID_LEN);
|
|
|
|
memcpy(ini->peer_gid, aclc->r0.lcl.gid, SMC_GID_SIZE);
|
|
|
|
memcpy(ini->peer_mac, aclc->r0.lcl.mac, ETH_ALEN);
|
2023-08-17 21:20:30 +08:00
|
|
|
ini->max_conns = SMC_CONN_PER_LGR_MAX;
|
2023-08-17 21:20:31 +08:00
|
|
|
ini->max_links = SMC_LINKS_ADD_LNK_MAX;
|
2021-10-16 17:37:45 +08:00
|
|
|
|
|
|
|
reason_code = smc_connect_rdma_v2_prepare(smc, aclc, ini);
|
|
|
|
if (reason_code)
|
|
|
|
return reason_code;
|
2019-04-12 18:57:26 +08:00
|
|
|
|
2019-02-07 22:56:18 +08:00
|
|
|
mutex_lock(&smc_client_lgr_pending);
|
2019-04-12 18:57:30 +08:00
|
|
|
reason_code = smc_conn_create(smc, ini);
|
|
|
|
if (reason_code) {
|
2019-02-07 22:56:18 +08:00
|
|
|
mutex_unlock(&smc_client_lgr_pending);
|
|
|
|
return reason_code;
|
2017-01-09 23:55:17 +08:00
|
|
|
}
|
2017-01-09 23:55:16 +08:00
|
|
|
|
2018-05-18 15:34:18 +08:00
|
|
|
smc_conn_save_peer_info(smc, aclc);
|
2017-01-09 23:55:18 +08:00
|
|
|
|
2020-09-11 00:48:21 +08:00
|
|
|
if (ini->first_contact_local) {
|
2020-04-30 21:55:43 +08:00
|
|
|
link = smc->conn.lnk;
|
|
|
|
} else {
|
|
|
|
/* set link that was assigned by server */
|
|
|
|
link = NULL;
|
|
|
|
for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
|
|
|
|
struct smc_link *l = &smc->conn.lgr->lnk[i];
|
|
|
|
|
2020-09-11 00:48:23 +08:00
|
|
|
if (l->peer_qpn == ntoh24(aclc->r0.qpn) &&
|
|
|
|
!memcmp(l->peer_gid, &aclc->r0.lcl.gid,
|
|
|
|
SMC_GID_SIZE) &&
|
2021-10-16 17:37:45 +08:00
|
|
|
(aclc->hdr.version > SMC_V1 ||
|
|
|
|
!memcmp(l->peer_mac, &aclc->r0.lcl.mac,
|
|
|
|
sizeof(l->peer_mac)))) {
|
2020-04-30 21:55:43 +08:00
|
|
|
link = l;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2020-10-08 04:57:41 +08:00
|
|
|
if (!link) {
|
|
|
|
reason_code = SMC_CLC_DECL_NOSRVLINK;
|
|
|
|
goto connect_abort;
|
|
|
|
}
|
2021-08-09 17:05:57 +08:00
|
|
|
smc_switch_link_and_count(&smc->conn, link);
|
2020-04-30 21:55:43 +08:00
|
|
|
}
|
|
|
|
|
2017-07-28 19:56:20 +08:00
|
|
|
/* create send buffer and rmb */
|
2020-10-08 04:57:41 +08:00
|
|
|
if (smc_buf_create(smc, false)) {
|
|
|
|
reason_code = SMC_CLC_DECL_MEM;
|
|
|
|
goto connect_abort;
|
|
|
|
}
|
2017-01-09 23:55:18 +08:00
|
|
|
|
2020-09-11 00:48:21 +08:00
|
|
|
if (ini->first_contact_local)
|
2021-10-16 17:37:45 +08:00
|
|
|
smc_link_save_peer_info(link, aclc, ini);
|
2017-01-09 23:55:20 +08:00
|
|
|
|
2020-10-08 04:57:41 +08:00
|
|
|
if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) {
|
|
|
|
reason_code = SMC_CLC_DECL_ERR_RTOK;
|
|
|
|
goto connect_abort;
|
|
|
|
}
|
2017-01-09 23:55:20 +08:00
|
|
|
|
2017-04-10 20:58:01 +08:00
|
|
|
smc_close_init(smc);
|
|
|
|
smc_rx_init(smc);
|
|
|
|
|
2020-09-11 00:48:21 +08:00
|
|
|
if (ini->first_contact_local) {
|
2020-10-08 04:57:41 +08:00
|
|
|
if (smc_ib_ready_link(link)) {
|
|
|
|
reason_code = SMC_CLC_DECL_ERR_RDYLNK;
|
|
|
|
goto connect_abort;
|
|
|
|
}
|
2017-07-28 19:56:17 +08:00
|
|
|
} else {
|
net/smc: Allow virtually contiguous sndbufs or RMBs for SMC-R
On long-running enterprise production servers, high-order contiguous
memory pages are usually very rare and in most cases we can only get
fragmented pages.
When replacing TCP with SMC-R in such production scenarios, attempting
to allocate high-order physically contiguous sndbufs and RMBs may result
in frequent memory compaction, which will cause unexpected hung issue
and further stability risks.
So this patch is aimed to allow SMC-R link group to use virtually
contiguous sndbufs and RMBs to avoid potential issues mentioned above.
Whether to use physically or virtually contiguous buffers can be set
by sysctl smcr_buf_type.
Note that using virtually contiguous buffers will bring an acceptable
performance regression, which can be mainly divided into two parts:
1) regression in data path, which is brought by additional address
translation of sndbuf by RNIC in Tx. But in general, translating
address through MTT is fast.
Taking 256KB sndbuf and RMB as an example, the comparisons in qperf
latency and bandwidth test with physically and virtually contiguous
buffers are as follows:
- client:
smc_run taskset -c <cpu> qperf <server> -oo msg_size:1:64K:*2\
-t 5 -vu tcp_{bw|lat}
- server:
smc_run taskset -c <cpu> qperf
[latency]
msgsize tcp smcr smcr-use-virt-buf
1 11.17 us 7.56 us 7.51 us (-0.67%)
2 10.65 us 7.74 us 7.56 us (-2.31%)
4 11.11 us 7.52 us 7.59 us ( 0.84%)
8 10.83 us 7.55 us 7.51 us (-0.48%)
16 11.21 us 7.46 us 7.51 us ( 0.71%)
32 10.65 us 7.53 us 7.58 us ( 0.61%)
64 10.95 us 7.74 us 7.80 us ( 0.76%)
128 11.14 us 7.83 us 7.87 us ( 0.47%)
256 10.97 us 7.94 us 7.92 us (-0.28%)
512 11.23 us 7.94 us 8.20 us ( 3.25%)
1024 11.60 us 8.12 us 8.20 us ( 0.96%)
2048 14.04 us 8.30 us 8.51 us ( 2.49%)
4096 16.88 us 9.13 us 9.07 us (-0.64%)
8192 22.50 us 10.56 us 11.22 us ( 6.26%)
16384 28.99 us 12.88 us 13.83 us ( 7.37%)
32768 40.13 us 16.76 us 16.95 us ( 1.16%)
65536 68.70 us 24.68 us 24.85 us ( 0.68%)
[bandwidth]
msgsize tcp smcr smcr-use-virt-buf
1 1.65 MB/s 1.59 MB/s 1.53 MB/s (-3.88%)
2 3.32 MB/s 3.17 MB/s 3.08 MB/s (-2.67%)
4 6.66 MB/s 6.33 MB/s 6.09 MB/s (-3.85%)
8 13.67 MB/s 13.45 MB/s 11.97 MB/s (-10.99%)
16 25.36 MB/s 27.15 MB/s 24.16 MB/s (-11.01%)
32 48.22 MB/s 54.24 MB/s 49.41 MB/s (-8.89%)
64 106.79 MB/s 107.32 MB/s 99.05 MB/s (-7.71%)
128 210.21 MB/s 202.46 MB/s 201.02 MB/s (-0.71%)
256 400.81 MB/s 416.81 MB/s 393.52 MB/s (-5.59%)
512 746.49 MB/s 834.12 MB/s 809.99 MB/s (-2.89%)
1024 1292.33 MB/s 1641.96 MB/s 1571.82 MB/s (-4.27%)
2048 2007.64 MB/s 2760.44 MB/s 2717.68 MB/s (-1.55%)
4096 2665.17 MB/s 4157.44 MB/s 4070.76 MB/s (-2.09%)
8192 3159.72 MB/s 4361.57 MB/s 4270.65 MB/s (-2.08%)
16384 4186.70 MB/s 4574.13 MB/s 4501.17 MB/s (-1.60%)
32768 4093.21 MB/s 4487.42 MB/s 4322.43 MB/s (-3.68%)
65536 4057.14 MB/s 4735.61 MB/s 4555.17 MB/s (-3.81%)
2) regression in buffer initialization and destruction path, which is
brought by additional MR operations of sndbufs. But thanks to link
group buffer reuse mechanism, the impact of this kind of regression
decreases as times of buffer reuse increases.
Taking 256KB sndbuf and RMB as an example, latency of some key SMC-R
buffer-related function obtained by bpftrace are as follows:
Function Phys-bufs Virt-bufs
smcr_new_buf_create() 67154 ns 79164 ns
smc_ib_buf_map_sg() 525 ns 928 ns
smc_ib_get_memory_region() 162294 ns 161191 ns
smc_wr_reg_send() 9957 ns 9635 ns
smc_ib_put_memory_region() 203548 ns 198374 ns
smc_ib_buf_unmap_sg() 508 ns 1158 ns
------------
Test environment notes:
1. Above tests run on 2 VMs within the same Host.
2. The NIC is ConnectX-4Lx, using SRIOV and passing through 2 VFs to
the each VM respectively.
3. VMs' vCPUs are binded to different physical CPUs, and the binded
physical CPUs are isolated by `isolcpus=xxx` cmdline.
4. NICs' queue number are set to 1.
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-07-14 17:44:04 +08:00
|
|
|
/* reg sendbufs if they were vzalloced */
|
|
|
|
if (smc->conn.sndbuf_desc->is_vm) {
|
|
|
|
if (smcr_lgr_reg_sndbufs(link, smc->conn.sndbuf_desc)) {
|
|
|
|
reason_code = SMC_CLC_DECL_ERR_REGBUF;
|
|
|
|
goto connect_abort;
|
|
|
|
}
|
|
|
|
}
|
2020-10-08 04:57:41 +08:00
|
|
|
if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) {
|
net/smc: Allow virtually contiguous sndbufs or RMBs for SMC-R
On long-running enterprise production servers, high-order contiguous
memory pages are usually very rare and in most cases we can only get
fragmented pages.
When replacing TCP with SMC-R in such production scenarios, attempting
to allocate high-order physically contiguous sndbufs and RMBs may result
in frequent memory compaction, which will cause unexpected hung issue
and further stability risks.
So this patch is aimed to allow SMC-R link group to use virtually
contiguous sndbufs and RMBs to avoid potential issues mentioned above.
Whether to use physically or virtually contiguous buffers can be set
by sysctl smcr_buf_type.
Note that using virtually contiguous buffers will bring an acceptable
performance regression, which can be mainly divided into two parts:
1) regression in data path, which is brought by additional address
translation of sndbuf by RNIC in Tx. But in general, translating
address through MTT is fast.
Taking 256KB sndbuf and RMB as an example, the comparisons in qperf
latency and bandwidth test with physically and virtually contiguous
buffers are as follows:
- client:
smc_run taskset -c <cpu> qperf <server> -oo msg_size:1:64K:*2\
-t 5 -vu tcp_{bw|lat}
- server:
smc_run taskset -c <cpu> qperf
[latency]
msgsize tcp smcr smcr-use-virt-buf
1 11.17 us 7.56 us 7.51 us (-0.67%)
2 10.65 us 7.74 us 7.56 us (-2.31%)
4 11.11 us 7.52 us 7.59 us ( 0.84%)
8 10.83 us 7.55 us 7.51 us (-0.48%)
16 11.21 us 7.46 us 7.51 us ( 0.71%)
32 10.65 us 7.53 us 7.58 us ( 0.61%)
64 10.95 us 7.74 us 7.80 us ( 0.76%)
128 11.14 us 7.83 us 7.87 us ( 0.47%)
256 10.97 us 7.94 us 7.92 us (-0.28%)
512 11.23 us 7.94 us 8.20 us ( 3.25%)
1024 11.60 us 8.12 us 8.20 us ( 0.96%)
2048 14.04 us 8.30 us 8.51 us ( 2.49%)
4096 16.88 us 9.13 us 9.07 us (-0.64%)
8192 22.50 us 10.56 us 11.22 us ( 6.26%)
16384 28.99 us 12.88 us 13.83 us ( 7.37%)
32768 40.13 us 16.76 us 16.95 us ( 1.16%)
65536 68.70 us 24.68 us 24.85 us ( 0.68%)
[bandwidth]
msgsize tcp smcr smcr-use-virt-buf
1 1.65 MB/s 1.59 MB/s 1.53 MB/s (-3.88%)
2 3.32 MB/s 3.17 MB/s 3.08 MB/s (-2.67%)
4 6.66 MB/s 6.33 MB/s 6.09 MB/s (-3.85%)
8 13.67 MB/s 13.45 MB/s 11.97 MB/s (-10.99%)
16 25.36 MB/s 27.15 MB/s 24.16 MB/s (-11.01%)
32 48.22 MB/s 54.24 MB/s 49.41 MB/s (-8.89%)
64 106.79 MB/s 107.32 MB/s 99.05 MB/s (-7.71%)
128 210.21 MB/s 202.46 MB/s 201.02 MB/s (-0.71%)
256 400.81 MB/s 416.81 MB/s 393.52 MB/s (-5.59%)
512 746.49 MB/s 834.12 MB/s 809.99 MB/s (-2.89%)
1024 1292.33 MB/s 1641.96 MB/s 1571.82 MB/s (-4.27%)
2048 2007.64 MB/s 2760.44 MB/s 2717.68 MB/s (-1.55%)
4096 2665.17 MB/s 4157.44 MB/s 4070.76 MB/s (-2.09%)
8192 3159.72 MB/s 4361.57 MB/s 4270.65 MB/s (-2.08%)
16384 4186.70 MB/s 4574.13 MB/s 4501.17 MB/s (-1.60%)
32768 4093.21 MB/s 4487.42 MB/s 4322.43 MB/s (-3.68%)
65536 4057.14 MB/s 4735.61 MB/s 4555.17 MB/s (-3.81%)
2) regression in buffer initialization and destruction path, which is
brought by additional MR operations of sndbufs. But thanks to link
group buffer reuse mechanism, the impact of this kind of regression
decreases as times of buffer reuse increases.
Taking 256KB sndbuf and RMB as an example, latency of some key SMC-R
buffer-related function obtained by bpftrace are as follows:
Function Phys-bufs Virt-bufs
smcr_new_buf_create() 67154 ns 79164 ns
smc_ib_buf_map_sg() 525 ns 928 ns
smc_ib_get_memory_region() 162294 ns 161191 ns
smc_wr_reg_send() 9957 ns 9635 ns
smc_ib_put_memory_region() 203548 ns 198374 ns
smc_ib_buf_unmap_sg() 508 ns 1158 ns
------------
Test environment notes:
1. Above tests run on 2 VMs within the same Host.
2. The NIC is ConnectX-4Lx, using SRIOV and passing through 2 VFs to
the each VM respectively.
3. VMs' vCPUs are binded to different physical CPUs, and the binded
physical CPUs are isolated by `isolcpus=xxx` cmdline.
4. NICs' queue number are set to 1.
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-07-14 17:44:04 +08:00
|
|
|
reason_code = SMC_CLC_DECL_ERR_REGBUF;
|
2020-10-08 04:57:41 +08:00
|
|
|
goto connect_abort;
|
|
|
|
}
|
2017-01-09 23:55:20 +08:00
|
|
|
}
|
2017-01-09 23:55:16 +08:00
|
|
|
|
2021-10-16 17:37:45 +08:00
|
|
|
if (aclc->hdr.version > SMC_V1) {
|
|
|
|
struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
|
|
|
|
(struct smc_clc_msg_accept_confirm_v2 *)aclc;
|
|
|
|
|
|
|
|
eid = clc_v2->r1.eid;
|
|
|
|
if (ini->first_contact_local)
|
|
|
|
smc_fill_gid_list(link->lgr, &ini->smcrv2.gidlist,
|
|
|
|
link->smcibdev, link->gid);
|
|
|
|
}
|
|
|
|
|
2020-09-26 18:44:30 +08:00
|
|
|
reason_code = smc_clc_send_confirm(smc, ini->first_contact_local,
|
2021-10-16 17:37:45 +08:00
|
|
|
aclc->hdr.version, eid, ini);
|
2018-05-18 15:34:18 +08:00
|
|
|
if (reason_code)
|
2020-10-08 04:57:41 +08:00
|
|
|
goto connect_abort;
|
2018-05-18 15:34:18 +08:00
|
|
|
|
|
|
|
smc_tx_init(smc);
|
2017-01-09 23:55:16 +08:00
|
|
|
|
2020-09-11 00:48:21 +08:00
|
|
|
if (ini->first_contact_local) {
|
2017-01-09 23:55:21 +08:00
|
|
|
/* QP confirmation over RoCE fabric */
|
2020-04-30 21:55:43 +08:00
|
|
|
smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
|
2020-04-29 23:10:41 +08:00
|
|
|
reason_code = smcr_clnt_conf_first_link(smc);
|
2020-04-30 21:55:43 +08:00
|
|
|
smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
|
2018-05-18 15:34:18 +08:00
|
|
|
if (reason_code)
|
2020-10-08 04:57:41 +08:00
|
|
|
goto connect_abort;
|
2017-01-09 23:55:21 +08:00
|
|
|
}
|
2019-02-07 22:56:18 +08:00
|
|
|
mutex_unlock(&smc_client_lgr_pending);
|
2017-01-09 23:55:23 +08:00
|
|
|
|
2017-01-09 23:55:16 +08:00
|
|
|
smc_copy_sock_settings_to_clc(smc);
|
2019-04-12 18:57:23 +08:00
|
|
|
smc->connect_nonblock = 0;
|
2017-01-09 23:55:25 +08:00
|
|
|
if (smc->sk.sk_state == SMC_INIT)
|
|
|
|
smc->sk.sk_state = SMC_ACTIVE;
|
2017-01-09 23:55:16 +08:00
|
|
|
|
2018-05-18 15:34:18 +08:00
|
|
|
return 0;
|
2020-10-08 04:57:41 +08:00
|
|
|
connect_abort:
|
2020-12-02 03:20:36 +08:00
|
|
|
smc_conn_abort(smc, ini->first_contact_local);
|
2020-10-08 04:57:41 +08:00
|
|
|
mutex_unlock(&smc_client_lgr_pending);
|
|
|
|
smc->connect_nonblock = 0;
|
|
|
|
|
|
|
|
return reason_code;
|
2018-05-18 15:34:18 +08:00
|
|
|
}
|
2017-01-09 23:55:16 +08:00
|
|
|
|
2020-09-26 18:44:30 +08:00
|
|
|
/* The server has chosen one of the proposed ISM devices for the communication.
|
|
|
|
* Determine from the CHID of the received CLC ACCEPT the ISM device chosen.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm_v2 *aclc,
|
|
|
|
struct smc_init_info *ini)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < ini->ism_offered_cnt + 1; i++) {
|
2021-10-16 17:37:45 +08:00
|
|
|
if (ini->ism_chid[i] == ntohs(aclc->d1.chid)) {
|
2020-09-26 18:44:30 +08:00
|
|
|
ini->ism_selected = i;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return -EPROTO;
|
|
|
|
}
|
|
|
|
|
2018-06-29 01:05:11 +08:00
|
|
|
/* setup for ISM connection of client */
|
|
|
|
static int smc_connect_ism(struct smc_sock *smc,
|
|
|
|
struct smc_clc_msg_accept_confirm *aclc,
|
2019-04-12 18:57:26 +08:00
|
|
|
struct smc_init_info *ini)
|
2018-06-29 01:05:11 +08:00
|
|
|
{
|
2021-09-14 16:35:05 +08:00
|
|
|
u8 *eid = NULL;
|
2018-06-29 01:05:11 +08:00
|
|
|
int rc = 0;
|
|
|
|
|
2019-04-12 18:57:26 +08:00
|
|
|
ini->is_smcd = true;
|
2020-09-26 18:44:20 +08:00
|
|
|
ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK;
|
2019-04-12 18:57:26 +08:00
|
|
|
|
2020-09-26 18:44:30 +08:00
|
|
|
if (aclc->hdr.version == SMC_V2) {
|
|
|
|
struct smc_clc_msg_accept_confirm_v2 *aclc_v2 =
|
|
|
|
(struct smc_clc_msg_accept_confirm_v2 *)aclc;
|
|
|
|
|
2023-08-17 21:20:27 +08:00
|
|
|
if (ini->first_contact_peer) {
|
|
|
|
struct smc_clc_first_contact_ext *fce =
|
|
|
|
smc_get_clc_first_contact_ext(aclc_v2, true);
|
|
|
|
|
|
|
|
ini->release_nr = fce->release;
|
2023-08-17 21:20:29 +08:00
|
|
|
rc = smc_clc_clnt_v2x_features_validate(fce, ini);
|
|
|
|
if (rc)
|
|
|
|
return rc;
|
2023-08-17 21:20:27 +08:00
|
|
|
}
|
|
|
|
|
2020-09-26 18:44:30 +08:00
|
|
|
rc = smc_v2_determine_accepted_chid(aclc_v2, ini);
|
|
|
|
if (rc)
|
|
|
|
return rc;
|
|
|
|
}
|
2023-12-07 01:02:37 +08:00
|
|
|
ini->ism_peer_gid[ini->ism_selected] = ntohll(aclc->d0.gid);
|
2020-09-26 18:44:30 +08:00
|
|
|
|
2019-02-07 22:56:18 +08:00
|
|
|
/* there is only one lgr role for SMC-D; use server lock */
|
|
|
|
mutex_lock(&smc_server_lgr_pending);
|
2019-04-12 18:57:30 +08:00
|
|
|
rc = smc_conn_create(smc, ini);
|
|
|
|
if (rc) {
|
2019-02-07 22:56:18 +08:00
|
|
|
mutex_unlock(&smc_server_lgr_pending);
|
2019-04-12 18:57:30 +08:00
|
|
|
return rc;
|
2019-02-07 22:56:18 +08:00
|
|
|
}
|
2018-06-29 01:05:11 +08:00
|
|
|
|
|
|
|
/* Create send and receive buffers */
|
2020-07-27 02:34:28 +08:00
|
|
|
rc = smc_buf_create(smc, true);
|
2020-10-08 04:57:41 +08:00
|
|
|
if (rc) {
|
|
|
|
rc = (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB : SMC_CLC_DECL_MEM;
|
|
|
|
goto connect_abort;
|
|
|
|
}
|
2018-06-29 01:05:11 +08:00
|
|
|
|
|
|
|
smc_conn_save_peer_info(smc, aclc);
|
|
|
|
smc_close_init(smc);
|
|
|
|
smc_rx_init(smc);
|
|
|
|
smc_tx_init(smc);
|
|
|
|
|
2021-09-14 16:35:05 +08:00
|
|
|
if (aclc->hdr.version > SMC_V1) {
|
|
|
|
struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
|
|
|
|
(struct smc_clc_msg_accept_confirm_v2 *)aclc;
|
|
|
|
|
2021-10-16 17:37:45 +08:00
|
|
|
eid = clc_v2->d1.eid;
|
2021-09-14 16:35:05 +08:00
|
|
|
}
|
|
|
|
|
2020-09-26 18:44:30 +08:00
|
|
|
rc = smc_clc_send_confirm(smc, ini->first_contact_local,
|
2023-08-17 21:20:27 +08:00
|
|
|
aclc->hdr.version, eid, ini);
|
2018-06-29 01:05:11 +08:00
|
|
|
if (rc)
|
2020-10-08 04:57:41 +08:00
|
|
|
goto connect_abort;
|
2019-02-07 22:56:18 +08:00
|
|
|
mutex_unlock(&smc_server_lgr_pending);
|
2018-06-29 01:05:11 +08:00
|
|
|
|
|
|
|
smc_copy_sock_settings_to_clc(smc);
|
2019-04-12 18:57:23 +08:00
|
|
|
smc->connect_nonblock = 0;
|
2018-06-29 01:05:11 +08:00
|
|
|
if (smc->sk.sk_state == SMC_INIT)
|
|
|
|
smc->sk.sk_state = SMC_ACTIVE;
|
|
|
|
|
|
|
|
return 0;
|
2020-10-08 04:57:41 +08:00
|
|
|
connect_abort:
|
2020-12-02 03:20:36 +08:00
|
|
|
smc_conn_abort(smc, ini->first_contact_local);
|
2020-10-08 04:57:41 +08:00
|
|
|
mutex_unlock(&smc_server_lgr_pending);
|
|
|
|
smc->connect_nonblock = 0;
|
|
|
|
|
|
|
|
return rc;
|
2018-06-29 01:05:11 +08:00
|
|
|
}
|
|
|
|
|
2020-09-26 18:44:27 +08:00
|
|
|
/* check if received accept type and version matches a proposed one */
|
|
|
|
static int smc_connect_check_aclc(struct smc_init_info *ini,
|
|
|
|
struct smc_clc_msg_accept_confirm *aclc)
|
|
|
|
{
|
2021-10-16 17:37:44 +08:00
|
|
|
if (aclc->hdr.typev1 != SMC_TYPE_R &&
|
|
|
|
aclc->hdr.typev1 != SMC_TYPE_D)
|
2020-09-26 18:44:27 +08:00
|
|
|
return SMC_CLC_DECL_MODEUNSUPP;
|
|
|
|
|
2021-10-16 17:37:44 +08:00
|
|
|
if (aclc->hdr.version >= SMC_V2) {
|
|
|
|
if ((aclc->hdr.typev1 == SMC_TYPE_R &&
|
|
|
|
!smcr_indicated(ini->smc_type_v2)) ||
|
|
|
|
(aclc->hdr.typev1 == SMC_TYPE_D &&
|
|
|
|
!smcd_indicated(ini->smc_type_v2)))
|
|
|
|
return SMC_CLC_DECL_MODEUNSUPP;
|
|
|
|
} else {
|
|
|
|
if ((aclc->hdr.typev1 == SMC_TYPE_R &&
|
|
|
|
!smcr_indicated(ini->smc_type_v1)) ||
|
|
|
|
(aclc->hdr.typev1 == SMC_TYPE_D &&
|
|
|
|
!smcd_indicated(ini->smc_type_v1)))
|
|
|
|
return SMC_CLC_DECL_MODEUNSUPP;
|
|
|
|
}
|
|
|
|
|
2020-09-26 18:44:27 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-05-18 15:34:18 +08:00
|
|
|
/* perform steps before actually connecting */
|
|
|
|
static int __smc_connect(struct smc_sock *smc)
|
|
|
|
{
|
2020-12-02 03:20:42 +08:00
|
|
|
u8 version = smc_ism_is_v2_capable() ? SMC_V2 : SMC_V1;
|
2020-09-26 18:44:30 +08:00
|
|
|
struct smc_clc_msg_accept_confirm_v2 *aclc2;
|
|
|
|
struct smc_clc_msg_accept_confirm *aclc;
|
2020-09-26 18:44:23 +08:00
|
|
|
struct smc_init_info *ini = NULL;
|
2020-09-26 18:44:30 +08:00
|
|
|
u8 *buf = NULL;
|
2018-05-18 15:34:18 +08:00
|
|
|
int rc = 0;
|
2017-01-09 23:55:16 +08:00
|
|
|
|
2018-05-18 15:34:18 +08:00
|
|
|
if (smc->use_fallback)
|
2018-07-25 22:35:32 +08:00
|
|
|
return smc_connect_fallback(smc, smc->fallback_rsn);
|
2018-05-18 15:34:18 +08:00
|
|
|
|
|
|
|
/* if peer has not signalled SMC-capability, fall back */
|
|
|
|
if (!tcp_sk(smc->clcsock->sk)->syn_smc)
|
2018-07-25 22:35:32 +08:00
|
|
|
return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);
|
2018-05-18 15:34:18 +08:00
|
|
|
|
2020-09-26 18:44:32 +08:00
|
|
|
/* IPSec connections opt out of SMC optimizations */
|
2018-05-18 15:34:18 +08:00
|
|
|
if (using_ipsec(smc))
|
2020-09-26 18:44:32 +08:00
|
|
|
return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC,
|
|
|
|
version);
|
2018-05-18 15:34:18 +08:00
|
|
|
|
2020-09-26 18:44:23 +08:00
|
|
|
ini = kzalloc(sizeof(*ini), GFP_KERNEL);
|
|
|
|
if (!ini)
|
2020-09-26 18:44:32 +08:00
|
|
|
return smc_connect_decline_fallback(smc, SMC_CLC_DECL_MEM,
|
|
|
|
version);
|
2020-09-26 18:44:23 +08:00
|
|
|
|
2021-10-16 17:37:44 +08:00
|
|
|
ini->smcd_version = SMC_V1 | SMC_V2;
|
|
|
|
ini->smcr_version = SMC_V1 | SMC_V2;
|
2020-09-26 18:44:27 +08:00
|
|
|
ini->smc_type_v1 = SMC_TYPE_B;
|
2021-10-16 17:37:44 +08:00
|
|
|
ini->smc_type_v2 = SMC_TYPE_B;
|
2020-09-26 18:44:27 +08:00
|
|
|
|
2019-04-12 18:57:27 +08:00
|
|
|
/* get vlan id from IP device */
|
2020-09-26 18:44:23 +08:00
|
|
|
if (smc_vlan_by_tcpsk(smc->clcsock, ini)) {
|
2020-09-26 18:44:27 +08:00
|
|
|
ini->smcd_version &= ~SMC_V1;
|
2021-10-16 17:37:44 +08:00
|
|
|
ini->smcr_version = 0;
|
2020-09-26 18:44:27 +08:00
|
|
|
ini->smc_type_v1 = SMC_TYPE_N;
|
|
|
|
if (!ini->smcd_version) {
|
|
|
|
rc = SMC_CLC_DECL_GETVLANERR;
|
|
|
|
goto fallback;
|
|
|
|
}
|
2018-06-29 01:05:11 +08:00
|
|
|
}
|
|
|
|
|
2020-09-26 18:44:27 +08:00
|
|
|
rc = smc_find_proposal_devices(smc, ini);
|
|
|
|
if (rc)
|
|
|
|
goto fallback;
|
2018-05-18 15:34:18 +08:00
|
|
|
|
2020-09-26 18:44:30 +08:00
|
|
|
buf = kzalloc(SMC_CLC_MAX_ACCEPT_LEN, GFP_KERNEL);
|
|
|
|
if (!buf) {
|
|
|
|
rc = SMC_CLC_DECL_MEM;
|
|
|
|
goto fallback;
|
|
|
|
}
|
|
|
|
aclc2 = (struct smc_clc_msg_accept_confirm_v2 *)buf;
|
|
|
|
aclc = (struct smc_clc_msg_accept_confirm *)aclc2;
|
|
|
|
|
2018-05-18 15:34:18 +08:00
|
|
|
/* perform CLC handshake */
|
2020-09-26 18:44:30 +08:00
|
|
|
rc = smc_connect_clc(smc, aclc2, ini);
|
2022-02-15 16:24:50 +08:00
|
|
|
if (rc) {
|
|
|
|
/* -EAGAIN on timeout, see tcp_recvmsg() */
|
|
|
|
if (rc == -EAGAIN) {
|
|
|
|
rc = -ETIMEDOUT;
|
|
|
|
smc->sk.sk_err = ETIMEDOUT;
|
|
|
|
}
|
2020-09-26 18:44:27 +08:00
|
|
|
goto vlan_cleanup;
|
2022-02-15 16:24:50 +08:00
|
|
|
}
|
2020-09-26 18:44:27 +08:00
|
|
|
|
|
|
|
/* check if smc modes and versions of CLC proposal and accept match */
|
2020-09-26 18:44:30 +08:00
|
|
|
rc = smc_connect_check_aclc(ini, aclc);
|
2020-11-19 05:40:37 +08:00
|
|
|
version = aclc->hdr.version == SMC_V1 ? SMC_V1 : SMC_V2;
|
2020-09-26 18:44:27 +08:00
|
|
|
if (rc)
|
|
|
|
goto vlan_cleanup;
|
2018-05-18 15:34:18 +08:00
|
|
|
|
2018-06-29 01:05:11 +08:00
|
|
|
/* depending on previous steps, connect using rdma or ism */
|
2021-10-16 17:37:44 +08:00
|
|
|
if (aclc->hdr.typev1 == SMC_TYPE_R) {
|
|
|
|
ini->smcr_version = version;
|
2020-09-26 18:44:30 +08:00
|
|
|
rc = smc_connect_rdma(smc, aclc, ini);
|
2021-10-16 17:37:44 +08:00
|
|
|
} else if (aclc->hdr.typev1 == SMC_TYPE_D) {
|
|
|
|
ini->smcd_version = version;
|
2020-09-26 18:44:30 +08:00
|
|
|
rc = smc_connect_ism(smc, aclc, ini);
|
2021-10-16 17:37:44 +08:00
|
|
|
}
|
2020-09-26 18:44:27 +08:00
|
|
|
if (rc)
|
|
|
|
goto vlan_cleanup;
|
2018-05-18 15:34:18 +08:00
|
|
|
|
2021-06-16 22:52:58 +08:00
|
|
|
SMC_STAT_CLNT_SUCC_INC(sock_net(smc->clcsock->sk), aclc);
|
2020-09-26 18:44:27 +08:00
|
|
|
smc_connect_ism_vlan_cleanup(smc, ini);
|
2020-09-26 18:44:30 +08:00
|
|
|
kfree(buf);
|
2020-09-26 18:44:23 +08:00
|
|
|
kfree(ini);
|
2018-05-18 15:34:18 +08:00
|
|
|
return 0;
|
2020-09-26 18:44:27 +08:00
|
|
|
|
|
|
|
vlan_cleanup:
|
|
|
|
smc_connect_ism_vlan_cleanup(smc, ini);
|
2020-09-26 18:44:30 +08:00
|
|
|
kfree(buf);
|
2020-09-26 18:44:27 +08:00
|
|
|
fallback:
|
|
|
|
kfree(ini);
|
2020-09-26 18:44:32 +08:00
|
|
|
return smc_connect_decline_fallback(smc, rc, version);
|
2017-01-09 23:55:16 +08:00
|
|
|
}
|
|
|
|
|
2018-06-27 23:59:50 +08:00
|
|
|
/* Work function behind smc->connect_work: finish a connect in the background.
 *
 * Under the clcsock lock, waits for the TCP connect of the clcsock to
 * complete (bounded by the socket's send timeout, or without limit if that
 * is 0), or takes over an error already pending on the clcsock. Then, under
 * the smc socket lock, either closes the smc socket with a suitable sk_err
 * or runs __smc_connect(). Finally waiters are woken unless the socket is
 * already marked SOCK_DEAD.
 */
static void smc_connect_work(struct work_struct *work)
{
	struct smc_sock *smc = container_of(work, struct smc_sock,
					    connect_work);
	long timeo = smc->sk.sk_sndtimeo;
	int rc = 0;

	if (!timeo)
		timeo = MAX_SCHEDULE_TIMEOUT;

	lock_sock(smc->clcsock->sk);
	if (smc->clcsock->sk->sk_err) {
		smc->sk.sk_err = smc->clcsock->sk->sk_err;
	} else if ((1 << smc->clcsock->sk->sk_state) &
		   (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
		rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo);
		/* -EPIPE is ignored if the clcsock is connected after all */
		if (rc == -EPIPE &&
		    ((1 << smc->clcsock->sk->sk_state) &
		     (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))
			rc = 0;
	}
	release_sock(smc->clcsock->sk);

	lock_sock(&smc->sk);
	if (rc != 0 || smc->sk.sk_err) {
		smc->sk.sk_state = SMC_CLOSED;
		switch (rc) {
		case -EPIPE:
		case -EAGAIN:
			smc->sk.sk_err = EPIPE;
			break;
		case -ECONNREFUSED:
			smc->sk.sk_err = ECONNREFUSED;
			break;
		default:
			if (signal_pending(current))
				smc->sk.sk_err = -sock_intr_errno(timeo);
			break;
		}
		sock_put(&smc->sk); /* passive closing */
	} else {
		rc = __smc_connect(smc);
		if (rc < 0)
			smc->sk.sk_err = -rc;
	}

	if (sock_flag(&smc->sk, SOCK_DEAD))
		goto unlock;

	if (smc->sk.sk_err) {
		smc->sk.sk_state_change(&smc->sk);
	} else {
		/* allow polling before and after fallback decision */
		smc->clcsock->sk->sk_write_space(smc->clcsock->sk);
		smc->sk.sk_write_space(&smc->sk);
	}

unlock:
	release_sock(&smc->sk);
}
|
|
|
|
|
2017-01-09 23:55:13 +08:00
|
|
|
static int smc_connect(struct socket *sock, struct sockaddr *addr,
|
|
|
|
int alen, int flags)
|
|
|
|
{
|
|
|
|
struct sock *sk = sock->sk;
|
|
|
|
struct smc_sock *smc;
|
|
|
|
int rc = -EINVAL;
|
|
|
|
|
|
|
|
smc = smc_sk(sk);
|
|
|
|
|
|
|
|
/* separate smc parameter checking to be safe */
|
|
|
|
if (alen < sizeof(addr->sa_family))
|
|
|
|
goto out_err;
|
2018-03-16 22:06:41 +08:00
|
|
|
if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
|
2017-01-09 23:55:13 +08:00
|
|
|
goto out_err;
|
|
|
|
|
|
|
|
lock_sock(sk);
|
2022-05-13 10:24:53 +08:00
|
|
|
switch (sock->state) {
|
|
|
|
default:
|
|
|
|
rc = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
case SS_CONNECTED:
|
|
|
|
rc = sk->sk_state == SMC_ACTIVE ? -EISCONN : -EINVAL;
|
|
|
|
goto out;
|
|
|
|
case SS_CONNECTING:
|
|
|
|
if (sk->sk_state == SMC_ACTIVE)
|
|
|
|
goto connected;
|
|
|
|
break;
|
|
|
|
case SS_UNCONNECTED:
|
|
|
|
sock->state = SS_CONNECTING;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2017-01-09 23:55:13 +08:00
|
|
|
switch (sk->sk_state) {
|
|
|
|
default:
|
|
|
|
goto out;
|
2022-05-13 10:24:53 +08:00
|
|
|
case SMC_CLOSED:
|
|
|
|
rc = sock_error(sk) ? : -ECONNABORTED;
|
|
|
|
sock->state = SS_UNCONNECTED;
|
|
|
|
goto out;
|
2017-01-09 23:55:13 +08:00
|
|
|
case SMC_ACTIVE:
|
|
|
|
rc = -EISCONN;
|
|
|
|
goto out;
|
|
|
|
case SMC_INIT:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
smc_copy_sock_settings_to_clc(smc);
|
2017-10-25 17:01:46 +08:00
|
|
|
tcp_sk(smc->clcsock->sk)->syn_smc = 1;
|
2019-04-12 18:57:23 +08:00
|
|
|
if (smc->connect_nonblock) {
|
|
|
|
rc = -EALREADY;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
rc = kernel_connect(smc->clcsock, addr, alen, flags);
|
|
|
|
if (rc && rc != -EINPROGRESS)
|
|
|
|
goto out;
|
2019-10-29 19:41:26 +08:00
|
|
|
|
2022-05-13 10:24:53 +08:00
|
|
|
if (smc->use_fallback) {
|
|
|
|
sock->state = rc ? SS_CONNECTING : SS_CONNECTED;
|
2019-12-13 05:35:58 +08:00
|
|
|
goto out;
|
2022-05-13 10:24:53 +08:00
|
|
|
}
|
2022-05-23 12:57:07 +08:00
|
|
|
sock_hold(&smc->sk); /* sock put in passive closing */
|
2018-06-27 23:59:50 +08:00
|
|
|
if (flags & O_NONBLOCK) {
|
2020-09-11 00:48:29 +08:00
|
|
|
if (queue_work(smc_hs_wq, &smc->connect_work))
|
2019-04-12 18:57:23 +08:00
|
|
|
smc->connect_nonblock = 1;
|
2018-06-27 23:59:50 +08:00
|
|
|
rc = -EINPROGRESS;
|
2022-05-13 10:24:53 +08:00
|
|
|
goto out;
|
2018-06-27 23:59:50 +08:00
|
|
|
} else {
|
|
|
|
rc = __smc_connect(smc);
|
|
|
|
if (rc < 0)
|
|
|
|
goto out;
|
|
|
|
}
|
2017-01-09 23:55:13 +08:00
|
|
|
|
2022-05-13 10:24:53 +08:00
|
|
|
connected:
|
|
|
|
rc = 0;
|
|
|
|
sock->state = SS_CONNECTED;
|
2017-01-09 23:55:13 +08:00
|
|
|
out:
|
|
|
|
release_sock(sk);
|
|
|
|
out_err:
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
|
|
|
|
{
|
2018-01-24 17:28:12 +08:00
|
|
|
struct socket *new_clcsock = NULL;
|
|
|
|
struct sock *lsk = &lsmc->sk;
|
2017-01-09 23:55:13 +08:00
|
|
|
struct sock *new_sk;
|
2018-12-19 01:02:25 +08:00
|
|
|
int rc = -EINVAL;
|
2017-01-09 23:55:13 +08:00
|
|
|
|
2018-01-24 17:28:12 +08:00
|
|
|
release_sock(lsk);
|
2018-03-16 22:06:41 +08:00
|
|
|
new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
|
2017-01-09 23:55:13 +08:00
|
|
|
if (!new_sk) {
|
|
|
|
rc = -ENOMEM;
|
2018-01-24 17:28:12 +08:00
|
|
|
lsk->sk_err = ENOMEM;
|
2017-01-09 23:55:13 +08:00
|
|
|
*new_smc = NULL;
|
2018-01-24 17:28:12 +08:00
|
|
|
lock_sock(lsk);
|
2017-01-09 23:55:13 +08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
*new_smc = smc_sk(new_sk);
|
|
|
|
|
2018-12-19 01:02:25 +08:00
|
|
|
mutex_lock(&lsmc->clcsock_release_lock);
|
|
|
|
if (lsmc->clcsock)
|
2020-09-11 00:48:20 +08:00
|
|
|
rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK);
|
2018-12-19 01:02:25 +08:00
|
|
|
mutex_unlock(&lsmc->clcsock_release_lock);
|
2018-01-24 17:28:12 +08:00
|
|
|
lock_sock(lsk);
|
2020-09-11 00:48:20 +08:00
|
|
|
if (rc < 0 && rc != -EAGAIN)
|
2018-01-24 17:28:12 +08:00
|
|
|
lsk->sk_err = -rc;
|
2018-01-24 17:28:13 +08:00
|
|
|
if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
|
2019-04-11 17:17:34 +08:00
|
|
|
new_sk->sk_prot->unhash(new_sk);
|
2017-01-09 23:55:16 +08:00
|
|
|
if (new_clcsock)
|
|
|
|
sock_release(new_clcsock);
|
|
|
|
new_sk->sk_state = SMC_CLOSED;
|
2023-11-03 14:07:38 +08:00
|
|
|
smc_sock_set_flag(new_sk, SOCK_DEAD);
|
2018-01-26 16:28:48 +08:00
|
|
|
sock_put(new_sk); /* final */
|
2017-01-09 23:55:13 +08:00
|
|
|
*new_smc = NULL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2020-09-11 00:48:20 +08:00
|
|
|
/* new clcsock has inherited the smc listen-specific sk_data_ready
|
|
|
|
* function; switch it back to the original sk_data_ready function
|
|
|
|
*/
|
|
|
|
new_clcsock->sk->sk_data_ready = lsmc->clcsk_data_ready;
|
2022-04-22 15:56:18 +08:00
|
|
|
|
|
|
|
/* if new clcsock has also inherited the fallback-specific callback
|
|
|
|
* functions, switch them back to the original ones.
|
|
|
|
*/
|
|
|
|
if (lsmc->use_fallback) {
|
|
|
|
if (lsmc->clcsk_state_change)
|
|
|
|
new_clcsock->sk->sk_state_change = lsmc->clcsk_state_change;
|
|
|
|
if (lsmc->clcsk_write_space)
|
|
|
|
new_clcsock->sk->sk_write_space = lsmc->clcsk_write_space;
|
|
|
|
if (lsmc->clcsk_error_report)
|
|
|
|
new_clcsock->sk->sk_error_report = lsmc->clcsk_error_report;
|
|
|
|
}
|
|
|
|
|
2017-01-09 23:55:13 +08:00
|
|
|
(*new_smc)->clcsock = new_clcsock;
|
|
|
|
out:
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2017-01-09 23:55:16 +08:00
|
|
|
/* add a just created sock to the accept queue of the listen sock as
|
|
|
|
* candidate for a following socket accept call from user space
|
|
|
|
*/
|
|
|
|
static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
|
|
|
|
{
|
|
|
|
struct smc_sock *par = smc_sk(parent);
|
|
|
|
|
2018-01-26 16:28:48 +08:00
|
|
|
sock_hold(sk); /* sock_put in smc_accept_unlink () */
|
2017-01-09 23:55:16 +08:00
|
|
|
spin_lock(&par->accept_q_lock);
|
|
|
|
list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
|
|
|
|
spin_unlock(&par->accept_q_lock);
|
|
|
|
sk_acceptq_added(parent);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* remove a socket from the accept queue of its parental listening socket */
|
|
|
|
static void smc_accept_unlink(struct sock *sk)
|
|
|
|
{
|
|
|
|
struct smc_sock *par = smc_sk(sk)->listen_smc;
|
|
|
|
|
|
|
|
spin_lock(&par->accept_q_lock);
|
|
|
|
list_del_init(&smc_sk(sk)->accept_q);
|
|
|
|
spin_unlock(&par->accept_q_lock);
|
|
|
|
sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
|
2018-01-26 16:28:48 +08:00
|
|
|
sock_put(sk); /* sock_hold in smc_accept_enqueue */
|
2017-01-09 23:55:16 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* remove a sock from the accept queue to bind it to a new socket created
|
|
|
|
* for a socket accept call from user space
|
|
|
|
*/
|
2017-01-09 23:55:25 +08:00
|
|
|
struct sock *smc_accept_dequeue(struct sock *parent,
|
|
|
|
struct socket *new_sock)
|
2017-01-09 23:55:16 +08:00
|
|
|
{
|
|
|
|
struct smc_sock *isk, *n;
|
|
|
|
struct sock *new_sk;
|
|
|
|
|
|
|
|
list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
|
|
|
|
new_sk = (struct sock *)isk;
|
|
|
|
|
|
|
|
smc_accept_unlink(new_sk);
|
|
|
|
if (new_sk->sk_state == SMC_CLOSED) {
|
2019-04-11 17:17:34 +08:00
|
|
|
new_sk->sk_prot->unhash(new_sk);
|
2018-01-26 16:28:49 +08:00
|
|
|
if (isk->clcsock) {
|
|
|
|
sock_release(isk->clcsock);
|
|
|
|
isk->clcsock = NULL;
|
|
|
|
}
|
2018-01-26 16:28:48 +08:00
|
|
|
sock_put(new_sk); /* final */
|
2017-01-09 23:55:16 +08:00
|
|
|
continue;
|
|
|
|
}
|
2019-04-11 17:17:32 +08:00
|
|
|
if (new_sock) {
|
2017-01-09 23:55:16 +08:00
|
|
|
sock_graft(new_sk, new_sock);
|
2022-05-13 10:24:53 +08:00
|
|
|
new_sock->state = SS_CONNECTED;
|
2019-04-11 17:17:32 +08:00
|
|
|
if (isk->use_fallback) {
|
|
|
|
smc_sk(new_sk)->clcsock->file = new_sock->file;
|
|
|
|
isk->clcsock->file->private_data = isk->clcsock;
|
|
|
|
}
|
|
|
|
}
|
2017-01-09 23:55:16 +08:00
|
|
|
return new_sk;
|
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* clean up for a created but never accepted sock */
|
2017-01-09 23:55:25 +08:00
|
|
|
void smc_close_non_accepted(struct sock *sk)
|
2017-01-09 23:55:16 +08:00
|
|
|
{
|
|
|
|
struct smc_sock *smc = smc_sk(sk);
|
|
|
|
|
2019-10-21 22:13:15 +08:00
|
|
|
sock_hold(sk); /* sock_put below */
|
2017-01-09 23:55:25 +08:00
|
|
|
lock_sock(sk);
|
|
|
|
if (!sk->sk_lingertime)
|
|
|
|
/* wait for peer closing */
|
2023-08-19 12:06:46 +08:00
|
|
|
WRITE_ONCE(sk->sk_lingertime, SMC_MAX_STREAM_WAIT_TIMEOUT);
|
2019-06-27 21:04:52 +08:00
|
|
|
__smc_release(smc);
|
2017-01-09 23:55:25 +08:00
|
|
|
release_sock(sk);
|
2019-10-21 22:13:15 +08:00
|
|
|
sock_put(sk); /* sock_hold above */
|
2018-01-26 16:28:48 +08:00
|
|
|
sock_put(sk); /* final sock_put */
|
2017-01-09 23:55:16 +08:00
|
|
|
}
|
|
|
|
|
2020-04-29 23:10:41 +08:00
|
|
|
static int smcr_serv_conf_first_link(struct smc_sock *smc)
|
2017-01-09 23:55:21 +08:00
|
|
|
{
|
2020-04-29 23:10:40 +08:00
|
|
|
struct smc_link *link = smc->conn.lnk;
|
2020-04-30 21:55:42 +08:00
|
|
|
struct smc_llc_qentry *qentry;
|
2017-01-09 23:55:21 +08:00
|
|
|
int rc;
|
|
|
|
|
net/smc: Allow virtually contiguous sndbufs or RMBs for SMC-R
On long-running enterprise production servers, high-order contiguous
memory pages are usually very rare and in most cases we can only get
fragmented pages.
When replacing TCP with SMC-R in such production scenarios, attempting
to allocate high-order physically contiguous sndbufs and RMBs may result
in frequent memory compaction, which will cause unexpected hung issue
and further stability risks.
So this patch is aimed to allow SMC-R link group to use virtually
contiguous sndbufs and RMBs to avoid potential issues mentioned above.
Whether to use physically or virtually contiguous buffers can be set
by sysctl smcr_buf_type.
Note that using virtually contiguous buffers will bring an acceptable
performance regression, which can be mainly divided into two parts:
1) regression in data path, which is brought by additional address
translation of sndbuf by RNIC in Tx. But in general, translating
address through MTT is fast.
Taking 256KB sndbuf and RMB as an example, the comparisons in qperf
latency and bandwidth test with physically and virtually contiguous
buffers are as follows:
- client:
smc_run taskset -c <cpu> qperf <server> -oo msg_size:1:64K:*2\
-t 5 -vu tcp_{bw|lat}
- server:
smc_run taskset -c <cpu> qperf
[latency]
msgsize tcp smcr smcr-use-virt-buf
1 11.17 us 7.56 us 7.51 us (-0.67%)
2 10.65 us 7.74 us 7.56 us (-2.31%)
4 11.11 us 7.52 us 7.59 us ( 0.84%)
8 10.83 us 7.55 us 7.51 us (-0.48%)
16 11.21 us 7.46 us 7.51 us ( 0.71%)
32 10.65 us 7.53 us 7.58 us ( 0.61%)
64 10.95 us 7.74 us 7.80 us ( 0.76%)
128 11.14 us 7.83 us 7.87 us ( 0.47%)
256 10.97 us 7.94 us 7.92 us (-0.28%)
512 11.23 us 7.94 us 8.20 us ( 3.25%)
1024 11.60 us 8.12 us 8.20 us ( 0.96%)
2048 14.04 us 8.30 us 8.51 us ( 2.49%)
4096 16.88 us 9.13 us 9.07 us (-0.64%)
8192 22.50 us 10.56 us 11.22 us ( 6.26%)
16384 28.99 us 12.88 us 13.83 us ( 7.37%)
32768 40.13 us 16.76 us 16.95 us ( 1.16%)
65536 68.70 us 24.68 us 24.85 us ( 0.68%)
[bandwidth]
msgsize tcp smcr smcr-use-virt-buf
1 1.65 MB/s 1.59 MB/s 1.53 MB/s (-3.88%)
2 3.32 MB/s 3.17 MB/s 3.08 MB/s (-2.67%)
4 6.66 MB/s 6.33 MB/s 6.09 MB/s (-3.85%)
8 13.67 MB/s 13.45 MB/s 11.97 MB/s (-10.99%)
16 25.36 MB/s 27.15 MB/s 24.16 MB/s (-11.01%)
32 48.22 MB/s 54.24 MB/s 49.41 MB/s (-8.89%)
64 106.79 MB/s 107.32 MB/s 99.05 MB/s (-7.71%)
128 210.21 MB/s 202.46 MB/s 201.02 MB/s (-0.71%)
256 400.81 MB/s 416.81 MB/s 393.52 MB/s (-5.59%)
512 746.49 MB/s 834.12 MB/s 809.99 MB/s (-2.89%)
1024 1292.33 MB/s 1641.96 MB/s 1571.82 MB/s (-4.27%)
2048 2007.64 MB/s 2760.44 MB/s 2717.68 MB/s (-1.55%)
4096 2665.17 MB/s 4157.44 MB/s 4070.76 MB/s (-2.09%)
8192 3159.72 MB/s 4361.57 MB/s 4270.65 MB/s (-2.08%)
16384 4186.70 MB/s 4574.13 MB/s 4501.17 MB/s (-1.60%)
32768 4093.21 MB/s 4487.42 MB/s 4322.43 MB/s (-3.68%)
65536 4057.14 MB/s 4735.61 MB/s 4555.17 MB/s (-3.81%)
2) regression in buffer initialization and destruction path, which is
brought by additional MR operations of sndbufs. But thanks to link
group buffer reuse mechanism, the impact of this kind of regression
decreases as times of buffer reuse increases.
Taking 256KB sndbuf and RMB as an example, latency of some key SMC-R
buffer-related function obtained by bpftrace are as follows:
Function Phys-bufs Virt-bufs
smcr_new_buf_create() 67154 ns 79164 ns
smc_ib_buf_map_sg() 525 ns 928 ns
smc_ib_get_memory_region() 162294 ns 161191 ns
smc_wr_reg_send() 9957 ns 9635 ns
smc_ib_put_memory_region() 203548 ns 198374 ns
smc_ib_buf_unmap_sg() 508 ns 1158 ns
------------
Test environment notes:
1. Above tests run on 2 VMs within the same Host.
2. The NIC is ConnectX-4Lx, using SRIOV and passing through 2 VFs to
the each VM respectively.
3. VMs' vCPUs are binded to different physical CPUs, and the binded
physical CPUs are isolated by `isolcpus=xxx` cmdline.
4. NICs' queue number are set to 1.
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-07-14 17:44:04 +08:00
|
|
|
/* reg the sndbuf if it was vzalloced*/
|
|
|
|
if (smc->conn.sndbuf_desc->is_vm) {
|
|
|
|
if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc))
|
|
|
|
return SMC_CLC_DECL_ERR_REGBUF;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* reg the rmb */
|
|
|
|
if (smcr_link_reg_buf(link, smc->conn.rmb_desc))
|
|
|
|
return SMC_CLC_DECL_ERR_REGBUF;
|
2017-07-28 19:56:17 +08:00
|
|
|
|
2017-01-09 23:55:21 +08:00
|
|
|
/* send CONFIRM LINK request to client over the RoCE fabric */
|
2018-07-25 22:35:30 +08:00
|
|
|
rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
|
2017-01-09 23:55:21 +08:00
|
|
|
if (rc < 0)
|
2018-07-25 22:35:32 +08:00
|
|
|
return SMC_CLC_DECL_TIMEOUT_CL;
|
2017-01-09 23:55:21 +08:00
|
|
|
|
|
|
|
/* receive CONFIRM LINK response from client over the RoCE fabric */
|
2020-04-30 21:55:42 +08:00
|
|
|
qentry = smc_llc_wait(link->lgr, link, SMC_LLC_WAIT_TIME,
|
|
|
|
SMC_LLC_CONFIRM_LINK);
|
|
|
|
if (!qentry) {
|
2017-01-09 23:55:21 +08:00
|
|
|
struct smc_clc_msg_decline dclc;
|
|
|
|
|
|
|
|
rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
|
2018-11-22 17:26:39 +08:00
|
|
|
SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
|
2018-11-22 17:26:37 +08:00
|
|
|
return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
|
2017-01-09 23:55:21 +08:00
|
|
|
}
|
2020-05-04 20:18:48 +08:00
|
|
|
smc_llc_save_peer_uid(qentry);
|
2020-04-30 21:55:42 +08:00
|
|
|
rc = smc_llc_eval_conf_link(qentry, SMC_LLC_RESP);
|
|
|
|
smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl);
|
|
|
|
if (rc)
|
2018-03-01 20:51:31 +08:00
|
|
|
return SMC_CLC_DECL_RMBE_EC;
|
|
|
|
|
2020-04-30 21:55:42 +08:00
|
|
|
/* confirm_rkey is implicit on 1st contact */
|
|
|
|
smc->conn.rmb_desc->is_conf_rkey = true;
|
2018-03-01 20:51:32 +08:00
|
|
|
|
2020-04-29 23:10:49 +08:00
|
|
|
smc_llc_link_active(link);
|
2020-05-05 21:01:20 +08:00
|
|
|
smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE);
|
2018-03-01 20:51:32 +08:00
|
|
|
|
2023-08-17 21:20:31 +08:00
|
|
|
if (link->lgr->max_links > 1) {
|
|
|
|
down_write(&link->lgr->llc_conf_mutex);
|
|
|
|
/* initial contact - try to establish second link */
|
|
|
|
smc_llc_srv_add_link(link, NULL);
|
|
|
|
up_write(&link->lgr->llc_conf_mutex);
|
|
|
|
}
|
2018-03-01 20:51:31 +08:00
|
|
|
return 0;
|
2017-01-09 23:55:21 +08:00
|
|
|
}
|
|
|
|
|
2018-05-18 15:34:18 +08:00
|
|
|
/* listen worker: finish */
|
|
|
|
static void smc_listen_out(struct smc_sock *new_smc)
|
2017-01-09 23:55:16 +08:00
|
|
|
{
|
|
|
|
struct smc_sock *lsmc = new_smc->listen_smc;
|
|
|
|
struct sock *newsmcsk = &new_smc->sk;
|
|
|
|
|
net/smc: Limit backlog connections
Current implementation does not handling backlog semantics, one
potential risk is that server will be flooded by infinite amount
connections, even if client was SMC-incapable.
This patch works to put a limit on backlog connections, referring to the
TCP implementation, we divides SMC connections into two categories:
1. Half SMC connection, which includes all TCP established while SMC not
connections.
2. Full SMC connection, which includes all SMC established connections.
For half SMC connection, since all half SMC connections starts with TCP
established, we can achieve our goal by put a limit before TCP
established. Refer to the implementation of TCP, this limits will based
on not only the half SMC connections but also the full connections,
which is also a constraint on full SMC connections.
For full SMC connections, although we know exactly where it starts, it's
quite hard to put a limit before it. The easiest way is to block wait
before receive SMC confirm CLC message, while it's under protection by
smc_server_lgr_pending, a global lock, which leads this limit to the
entire host instead of a single listen socket. Another way is to drop
the full connections, but considering the cast of SMC connections, we
prefer to keep full SMC connections.
Even so, the limits of full SMC connections still exists, see commits
about half SMC connection below.
After this patch, the limits of backend connection shows like:
For SMC:
1. Client with SMC-capability can makes 2 * backlog full SMC connections
or 1 * backlog half SMC connections and 1 * backlog full SMC
connections at most.
2. Client without SMC-capability can only makes 1 * backlog half TCP
connections and 1 * backlog full TCP connections.
Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-10 17:11:35 +08:00
|
|
|
if (tcp_sk(new_smc->clcsock->sk)->syn_smc)
|
|
|
|
atomic_dec(&lsmc->queued_smc_hs);
|
|
|
|
|
2018-05-18 15:34:18 +08:00
|
|
|
if (lsmc->sk.sk_state == SMC_LISTEN) {
|
2019-04-11 17:17:30 +08:00
|
|
|
lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
|
2018-05-18 15:34:18 +08:00
|
|
|
smc_accept_enqueue(&lsmc->sk, newsmcsk);
|
2019-04-11 17:17:30 +08:00
|
|
|
release_sock(&lsmc->sk);
|
2018-05-18 15:34:18 +08:00
|
|
|
} else { /* no longer listening */
|
|
|
|
smc_close_non_accepted(newsmcsk);
|
2017-10-25 17:01:46 +08:00
|
|
|
}
|
|
|
|
|
2018-05-18 15:34:18 +08:00
|
|
|
/* Wake up accept */
|
|
|
|
lsmc->sk.sk_data_ready(&lsmc->sk);
|
|
|
|
sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
|
|
|
|
}
|
2017-01-09 23:55:16 +08:00
|
|
|
|
2018-05-18 15:34:18 +08:00
|
|
|
/* listen worker: finish in state connected */
|
|
|
|
static void smc_listen_out_connected(struct smc_sock *new_smc)
|
|
|
|
{
|
|
|
|
struct sock *newsmcsk = &new_smc->sk;
|
2017-01-09 23:55:16 +08:00
|
|
|
|
2018-05-18 15:34:18 +08:00
|
|
|
if (newsmcsk->sk_state == SMC_INIT)
|
|
|
|
newsmcsk->sk_state = SMC_ACTIVE;
|
|
|
|
|
|
|
|
smc_listen_out(new_smc);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* listen worker: finish in error state */
|
|
|
|
static void smc_listen_out_err(struct smc_sock *new_smc)
|
|
|
|
{
|
|
|
|
struct sock *newsmcsk = &new_smc->sk;
|
2021-06-16 22:52:58 +08:00
|
|
|
struct net *net = sock_net(newsmcsk);
|
2018-05-18 15:34:18 +08:00
|
|
|
|
2021-06-16 22:52:58 +08:00
|
|
|
this_cpu_inc(net->smc.smc_stats->srv_hshake_err_cnt);
|
2018-05-18 15:34:18 +08:00
|
|
|
if (newsmcsk->sk_state == SMC_INIT)
|
|
|
|
sock_put(&new_smc->sk); /* passive closing */
|
|
|
|
newsmcsk->sk_state = SMC_CLOSED;
|
|
|
|
|
|
|
|
smc_listen_out(new_smc);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* listen worker: decline and fall back if possible */
|
|
|
|
static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
|
2020-10-24 02:48:28 +08:00
|
|
|
int local_first, u8 version)
|
2018-05-18 15:34:18 +08:00
|
|
|
{
|
|
|
|
/* RDMA setup failed, switch back to TCP */
|
2020-12-02 03:20:36 +08:00
|
|
|
smc_conn_abort(new_smc, local_first);
|
2022-01-22 17:43:09 +08:00
|
|
|
if (reason_code < 0 ||
|
|
|
|
smc_switch_to_fallback(new_smc, reason_code)) {
|
|
|
|
/* error, no fallback possible */
|
2018-05-18 15:34:18 +08:00
|
|
|
smc_listen_out_err(new_smc);
|
|
|
|
return;
|
|
|
|
}
|
2018-07-25 22:35:32 +08:00
|
|
|
if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
|
2020-09-26 18:44:32 +08:00
|
|
|
if (smc_clc_send_decline(new_smc, reason_code, version) < 0) {
|
2018-05-18 15:34:18 +08:00
|
|
|
smc_listen_out_err(new_smc);
|
|
|
|
return;
|
|
|
|
}
|
2017-01-09 23:55:16 +08:00
|
|
|
}
|
2018-05-18 15:34:18 +08:00
|
|
|
smc_listen_out_connected(new_smc);
|
|
|
|
}
|
|
|
|
|
2020-09-26 18:44:29 +08:00
|
|
|
/* listen worker: version checking */
|
|
|
|
static int smc_listen_v2_check(struct smc_sock *new_smc,
|
|
|
|
struct smc_clc_msg_proposal *pclc,
|
|
|
|
struct smc_init_info *ini)
|
|
|
|
{
|
|
|
|
struct smc_clc_smcd_v2_extension *pclc_smcd_v2_ext;
|
|
|
|
struct smc_clc_v2_extension *pclc_v2_ext;
|
2020-11-01 02:19:38 +08:00
|
|
|
int rc = SMC_CLC_DECL_PEERNOSMC;
|
2020-09-26 18:44:29 +08:00
|
|
|
|
|
|
|
ini->smc_type_v1 = pclc->hdr.typev1;
|
|
|
|
ini->smc_type_v2 = pclc->hdr.typev2;
|
2021-10-16 17:37:46 +08:00
|
|
|
ini->smcd_version = smcd_indicated(ini->smc_type_v1) ? SMC_V1 : 0;
|
|
|
|
ini->smcr_version = smcr_indicated(ini->smc_type_v1) ? SMC_V1 : 0;
|
|
|
|
if (pclc->hdr.version > SMC_V1) {
|
|
|
|
if (smcd_indicated(ini->smc_type_v2))
|
|
|
|
ini->smcd_version |= SMC_V2;
|
|
|
|
if (smcr_indicated(ini->smc_type_v2))
|
|
|
|
ini->smcr_version |= SMC_V2;
|
2020-11-01 02:19:38 +08:00
|
|
|
}
|
2021-10-16 17:37:46 +08:00
|
|
|
if (!(ini->smcd_version & SMC_V2) && !(ini->smcr_version & SMC_V2)) {
|
|
|
|
rc = SMC_CLC_DECL_PEERNOSMC;
|
2020-09-26 18:44:29 +08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
pclc_v2_ext = smc_get_clc_v2_ext(pclc);
|
|
|
|
if (!pclc_v2_ext) {
|
|
|
|
ini->smcd_version &= ~SMC_V2;
|
2021-10-16 17:37:46 +08:00
|
|
|
ini->smcr_version &= ~SMC_V2;
|
2020-11-01 02:19:38 +08:00
|
|
|
rc = SMC_CLC_DECL_NOV2EXT;
|
2020-09-26 18:44:29 +08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
pclc_smcd_v2_ext = smc_get_clc_smcd_v2_ext(pclc_v2_ext);
|
2021-10-16 17:37:46 +08:00
|
|
|
if (ini->smcd_version & SMC_V2) {
|
|
|
|
if (!smc_ism_is_v2_capable()) {
|
|
|
|
ini->smcd_version &= ~SMC_V2;
|
|
|
|
rc = SMC_CLC_DECL_NOISM2SUPP;
|
|
|
|
} else if (!pclc_smcd_v2_ext) {
|
|
|
|
ini->smcd_version &= ~SMC_V2;
|
|
|
|
rc = SMC_CLC_DECL_NOV2DEXT;
|
|
|
|
} else if (!pclc_v2_ext->hdr.eid_cnt &&
|
|
|
|
!pclc_v2_ext->hdr.flag.seid) {
|
|
|
|
ini->smcd_version &= ~SMC_V2;
|
|
|
|
rc = SMC_CLC_DECL_NOUEID;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (ini->smcr_version & SMC_V2) {
|
|
|
|
if (!pclc_v2_ext->hdr.eid_cnt) {
|
|
|
|
ini->smcr_version &= ~SMC_V2;
|
|
|
|
rc = SMC_CLC_DECL_NOUEID;
|
|
|
|
}
|
2020-11-01 02:19:38 +08:00
|
|
|
}
|
2020-09-26 18:44:29 +08:00
|
|
|
|
2023-08-17 21:20:27 +08:00
|
|
|
ini->release_nr = pclc_v2_ext->hdr.flag.release;
|
|
|
|
if (pclc_v2_ext->hdr.flag.release > SMC_RELEASE)
|
|
|
|
ini->release_nr = SMC_RELEASE;
|
|
|
|
|
2020-09-26 18:44:29 +08:00
|
|
|
out:
|
2021-10-16 17:37:46 +08:00
|
|
|
if (!ini->smcd_version && !ini->smcr_version)
|
2020-11-01 02:19:38 +08:00
|
|
|
return rc;
|
2020-09-26 18:44:29 +08:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-05-18 15:34:18 +08:00
|
|
|
/* listen worker: check prefixes */
|
2019-04-12 18:57:25 +08:00
|
|
|
static int smc_listen_prfx_check(struct smc_sock *new_smc,
|
2018-05-18 15:34:18 +08:00
|
|
|
struct smc_clc_msg_proposal *pclc)
|
|
|
|
{
|
|
|
|
struct smc_clc_msg_proposal_prefix *pclc_prfx;
|
|
|
|
struct socket *newclcsock = new_smc->clcsock;
|
2017-01-09 23:55:16 +08:00
|
|
|
|
2020-09-26 18:44:29 +08:00
|
|
|
if (pclc->hdr.typev1 == SMC_TYPE_N)
|
|
|
|
return 0;
|
2017-12-07 20:38:49 +08:00
|
|
|
pclc_prfx = smc_clc_proposal_get_prefix(pclc);
|
2018-05-18 15:34:18 +08:00
|
|
|
if (smc_clc_prfx_match(newclcsock, pclc_prfx))
|
2019-04-12 18:57:25 +08:00
|
|
|
return SMC_CLC_DECL_DIFFPREFIX;
|
2018-03-16 22:06:39 +08:00
|
|
|
|
2018-05-18 15:34:18 +08:00
|
|
|
return 0;
|
|
|
|
}
|
2017-01-09 23:55:16 +08:00
|
|
|
|
2018-05-18 15:34:18 +08:00
|
|
|
/* listen worker: initialize connection and buffers */
|
|
|
|
static int smc_listen_rdma_init(struct smc_sock *new_smc,
|
2019-04-12 18:57:30 +08:00
|
|
|
struct smc_init_info *ini)
|
2018-05-18 15:34:18 +08:00
|
|
|
{
|
2019-04-12 18:57:30 +08:00
|
|
|
int rc;
|
|
|
|
|
2017-01-09 23:55:17 +08:00
|
|
|
/* allocate connection / link group */
|
2019-04-12 18:57:30 +08:00
|
|
|
rc = smc_conn_create(new_smc, ini);
|
|
|
|
if (rc)
|
|
|
|
return rc;
|
2017-01-09 23:55:16 +08:00
|
|
|
|
2017-07-28 19:56:20 +08:00
|
|
|
/* create send buffer and rmb */
|
net/smc: Reset connection when trying to use SMCRv2 fails.
We found a crash when using SMCRv2 with 2 Mellanox ConnectX-4. It
can be reproduced by:
- smc_run nginx
- smc_run wrk -t 32 -c 500 -d 30 http://<ip>:<port>
BUG: kernel NULL pointer dereference, address: 0000000000000014
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 8000000108713067 P4D 8000000108713067 PUD 151127067 PMD 0
Oops: 0000 [#1] PREEMPT SMP PTI
CPU: 4 PID: 2441 Comm: kworker/4:249 Kdump: loaded Tainted: G W E 6.4.0-rc1+ #42
Workqueue: smc_hs_wq smc_listen_work [smc]
RIP: 0010:smc_clc_send_confirm_accept+0x284/0x580 [smc]
RSP: 0018:ffffb8294b2d7c78 EFLAGS: 00010a06
RAX: ffff8f1873238880 RBX: ffffb8294b2d7dc8 RCX: 0000000000000000
RDX: 00000000000000b4 RSI: 0000000000000001 RDI: 0000000000b40c00
RBP: ffffb8294b2d7db8 R08: ffff8f1815c5860c R09: 0000000000000000
R10: 0000000000000400 R11: 0000000000000000 R12: ffff8f1846f56180
R13: ffff8f1815c5860c R14: 0000000000000001 R15: 0000000000000001
FS: 0000000000000000(0000) GS:ffff8f1aefd00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000014 CR3: 00000001027a0001 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
<TASK>
? mlx5_ib_map_mr_sg+0xa1/0xd0 [mlx5_ib]
? smcr_buf_map_link+0x24b/0x290 [smc]
? __smc_buf_create+0x4ee/0x9b0 [smc]
smc_clc_send_accept+0x4c/0xb0 [smc]
smc_listen_work+0x346/0x650 [smc]
? __schedule+0x279/0x820
process_one_work+0x1e5/0x3f0
worker_thread+0x4d/0x2f0
? __pfx_worker_thread+0x10/0x10
kthread+0xe5/0x120
? __pfx_kthread+0x10/0x10
ret_from_fork+0x2c/0x50
</TASK>
During the CLC handshake, server sequentially tries available SMCRv2
and SMCRv1 devices in smc_listen_work().
If an SMCRv2 device is found. SMCv2 based link group and link will be
assigned to the connection. Then assumed that some buffer assignment
errors happen later in the CLC handshake, such as RMB registration
failure, server will give up SMCRv2 and try SMCRv1 device instead. But
the resources assigned to the connection won't be reset.
When server tries SMCRv1 device, the connection creation process will
be executed again. Since conn->lnk has been assigned when trying SMCRv2,
it will not be set to the correct SMCRv1 link in
smcr_lgr_conn_assign_link(). So in such situation, conn->lgr points to
correct SMCRv1 link group but conn->lnk points to the SMCRv2 link
mistakenly.
Then in smc_clc_send_confirm_accept(), conn->rmb_desc->mr[link->link_idx]
will be accessed. Since the link->link_idx is not correct, the related
MR may not have been initialized, so crash happens.
| Try SMCRv2 device first
| |-> conn->lgr: assign existed SMCRv2 link group;
| |-> conn->link: assign existed SMCRv2 link (link_idx may be 1 in SMC_LGR_SYMMETRIC);
| |-> sndbuf & RMB creation fails, quit;
|
| Try SMCRv1 device then
| |-> conn->lgr: create SMCRv1 link group and assign;
| |-> conn->link: keep SMCRv2 link mistakenly;
| |-> sndbuf & RMB creation succeed, only RMB->mr[link_idx = 0]
| initialized.
|
| Then smc_clc_send_confirm_accept() accesses
| conn->rmb_desc->mr[conn->link->link_idx, which is 1], then crash.
v
This patch tries to fix this by cleaning conn->lnk before assigning
link. In addition, it is better to reset the connection and clean the
resources assigned if trying SMCRv2 failed in buffer creation or
registration.
Fixes: e49300a6bf62 ("net/smc: add listen processing for SMC-Rv2")
Link: https://lore.kernel.org/r/20220523055056.2078994-1-liuyacan@corp.netease.com/
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2023-05-18 13:14:55 +08:00
|
|
|
if (smc_buf_create(new_smc, false)) {
|
|
|
|
smc_conn_abort(new_smc, ini->first_contact_local);
|
2018-05-18 15:34:18 +08:00
|
|
|
return SMC_CLC_DECL_MEM;
|
net/smc: Reset connection when trying to use SMCRv2 fails.
We found a crash when using SMCRv2 with 2 Mellanox ConnectX-4. It
can be reproduced by:
- smc_run nginx
- smc_run wrk -t 32 -c 500 -d 30 http://<ip>:<port>
BUG: kernel NULL pointer dereference, address: 0000000000000014
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 8000000108713067 P4D 8000000108713067 PUD 151127067 PMD 0
Oops: 0000 [#1] PREEMPT SMP PTI
CPU: 4 PID: 2441 Comm: kworker/4:249 Kdump: loaded Tainted: G W E 6.4.0-rc1+ #42
Workqueue: smc_hs_wq smc_listen_work [smc]
RIP: 0010:smc_clc_send_confirm_accept+0x284/0x580 [smc]
RSP: 0018:ffffb8294b2d7c78 EFLAGS: 00010a06
RAX: ffff8f1873238880 RBX: ffffb8294b2d7dc8 RCX: 0000000000000000
RDX: 00000000000000b4 RSI: 0000000000000001 RDI: 0000000000b40c00
RBP: ffffb8294b2d7db8 R08: ffff8f1815c5860c R09: 0000000000000000
R10: 0000000000000400 R11: 0000000000000000 R12: ffff8f1846f56180
R13: ffff8f1815c5860c R14: 0000000000000001 R15: 0000000000000001
FS: 0000000000000000(0000) GS:ffff8f1aefd00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000014 CR3: 00000001027a0001 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
<TASK>
? mlx5_ib_map_mr_sg+0xa1/0xd0 [mlx5_ib]
? smcr_buf_map_link+0x24b/0x290 [smc]
? __smc_buf_create+0x4ee/0x9b0 [smc]
smc_clc_send_accept+0x4c/0xb0 [smc]
smc_listen_work+0x346/0x650 [smc]
? __schedule+0x279/0x820
process_one_work+0x1e5/0x3f0
worker_thread+0x4d/0x2f0
? __pfx_worker_thread+0x10/0x10
kthread+0xe5/0x120
? __pfx_kthread+0x10/0x10
ret_from_fork+0x2c/0x50
</TASK>
During the CLC handshake, server sequentially tries available SMCRv2
and SMCRv1 devices in smc_listen_work().
If an SMCRv2 device is found. SMCv2 based link group and link will be
assigned to the connection. Then assumed that some buffer assignment
errors happen later in the CLC handshake, such as RMB registration
failure, server will give up SMCRv2 and try SMCRv1 device instead. But
the resources assigned to the connection won't be reset.
When server tries SMCRv1 device, the connection creation process will
be executed again. Since conn->lnk has been assigned when trying SMCRv2,
it will not be set to the correct SMCRv1 link in
smcr_lgr_conn_assign_link(). So in such situation, conn->lgr points to
correct SMCRv1 link group but conn->lnk points to the SMCRv2 link
mistakenly.
Then in smc_clc_send_confirm_accept(), conn->rmb_desc->mr[link->link_idx]
will be accessed. Since the link->link_idx is not correct, the related
MR may not have been initialized, so crash happens.
| Try SMCRv2 device first
| |-> conn->lgr: assign existed SMCRv2 link group;
| |-> conn->link: assign existed SMCRv2 link (link_idx may be 1 in SMC_LGR_SYMMETRIC);
| |-> sndbuf & RMB creation fails, quit;
|
| Try SMCRv1 device then
| |-> conn->lgr: create SMCRv1 link group and assign;
| |-> conn->link: keep SMCRv2 link mistakenly;
| |-> sndbuf & RMB creation succeed, only RMB->mr[link_idx = 0]
| initialized.
|
| Then smc_clc_send_confirm_accept() accesses
| conn->rmb_desc->mr[conn->link->link_idx, which is 1], then crash.
v
This patch tries to fix this by cleaning conn->lnk before assigning
link. In addition, it is better to reset the connection and clean the
resources assigned if trying SMCRv2 failed in buffer creation or
registration.
Fixes: e49300a6bf62 ("net/smc: add listen processing for SMC-Rv2")
Link: https://lore.kernel.org/r/20220523055056.2078994-1-liuyacan@corp.netease.com/
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2023-05-18 13:14:55 +08:00
|
|
|
}
|
2017-01-09 23:55:16 +08:00
|
|
|
|
2018-05-18 15:34:18 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-06-29 01:05:11 +08:00
|
|
|
/* listen worker: initialize connection and buffers for SMC-D */
|
|
|
|
static int smc_listen_ism_init(struct smc_sock *new_smc,
|
2019-04-12 18:57:30 +08:00
|
|
|
struct smc_init_info *ini)
|
2018-06-29 01:05:11 +08:00
|
|
|
{
|
2019-04-12 18:57:30 +08:00
|
|
|
int rc;
|
2018-06-29 01:05:11 +08:00
|
|
|
|
2019-04-12 18:57:30 +08:00
|
|
|
rc = smc_conn_create(new_smc, ini);
|
|
|
|
if (rc)
|
|
|
|
return rc;
|
2018-06-29 01:05:11 +08:00
|
|
|
|
|
|
|
/* Create send and receive buffers */
|
2020-07-27 02:34:28 +08:00
|
|
|
rc = smc_buf_create(new_smc, true);
|
|
|
|
if (rc) {
|
2020-12-02 03:20:36 +08:00
|
|
|
smc_conn_abort(new_smc, ini->first_contact_local);
|
2020-07-27 02:34:28 +08:00
|
|
|
return (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB :
|
|
|
|
SMC_CLC_DECL_MEM;
|
2018-06-29 01:05:11 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-09-26 18:44:29 +08:00
|
|
|
static bool smc_is_already_selected(struct smcd_dev *smcd,
|
|
|
|
struct smc_init_info *ini,
|
|
|
|
int matches)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < matches; i++)
|
|
|
|
if (smcd == ini->ism_dev[i])
|
|
|
|
return true;
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* check for ISM devices matching proposed ISM devices */
|
|
|
|
static void smc_check_ism_v2_match(struct smc_init_info *ini,
|
|
|
|
u16 proposed_chid, u64 proposed_gid,
|
|
|
|
unsigned int *matches)
|
|
|
|
{
|
|
|
|
struct smcd_dev *smcd;
|
|
|
|
|
|
|
|
list_for_each_entry(smcd, &smcd_dev_list.list, list) {
|
|
|
|
if (smcd->going_away)
|
|
|
|
continue;
|
|
|
|
if (smc_is_already_selected(smcd, ini, *matches))
|
|
|
|
continue;
|
|
|
|
if (smc_ism_get_chid(smcd) == proposed_chid &&
|
|
|
|
!smc_ism_cantalk(proposed_gid, ISM_RESERVED_VLANID, smcd)) {
|
|
|
|
ini->ism_peer_gid[*matches] = proposed_gid;
|
|
|
|
ini->ism_dev[*matches] = smcd;
|
|
|
|
(*matches)++;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-11-01 02:19:38 +08:00
|
|
|
/* Remember a reason code in ini->rc, unless one is already stored there */
static void smc_find_ism_store_rc(u32 rc, struct smc_init_info *ini)
{
	if (ini->rc)
		return;
	ini->rc = rc;
}
|
|
|
|
|
2020-09-26 18:44:29 +08:00
|
|
|
static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc,
|
|
|
|
struct smc_clc_msg_proposal *pclc,
|
|
|
|
struct smc_init_info *ini)
|
|
|
|
{
|
|
|
|
struct smc_clc_smcd_v2_extension *smcd_v2_ext;
|
|
|
|
struct smc_clc_v2_extension *smc_v2_ext;
|
|
|
|
struct smc_clc_msg_smcd *pclc_smcd;
|
|
|
|
unsigned int matches = 0;
|
2020-10-08 04:57:43 +08:00
|
|
|
u8 smcd_version;
|
2020-09-26 18:44:29 +08:00
|
|
|
u8 *eid = NULL;
|
2020-11-01 02:19:38 +08:00
|
|
|
int i, rc;
|
2020-09-26 18:44:29 +08:00
|
|
|
|
|
|
|
if (!(ini->smcd_version & SMC_V2) || !smcd_indicated(ini->smc_type_v2))
|
2020-10-08 04:57:43 +08:00
|
|
|
goto not_found;
|
2020-09-26 18:44:29 +08:00
|
|
|
|
|
|
|
pclc_smcd = smc_get_clc_msg_smcd(pclc);
|
|
|
|
smc_v2_ext = smc_get_clc_v2_ext(pclc);
|
|
|
|
smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext);
|
|
|
|
|
|
|
|
mutex_lock(&smcd_dev_list.mutex);
|
|
|
|
if (pclc_smcd->ism.chid)
|
|
|
|
/* check for ISM device matching proposed native ISM device */
|
|
|
|
smc_check_ism_v2_match(ini, ntohs(pclc_smcd->ism.chid),
|
|
|
|
ntohll(pclc_smcd->ism.gid), &matches);
|
|
|
|
for (i = 1; i <= smc_v2_ext->hdr.ism_gid_cnt; i++) {
|
|
|
|
/* check for ISM devices matching proposed non-native ISM
|
|
|
|
* devices
|
|
|
|
*/
|
|
|
|
smc_check_ism_v2_match(ini,
|
|
|
|
ntohs(smcd_v2_ext->gidchid[i - 1].chid),
|
|
|
|
ntohll(smcd_v2_ext->gidchid[i - 1].gid),
|
|
|
|
&matches);
|
|
|
|
}
|
|
|
|
mutex_unlock(&smcd_dev_list.mutex);
|
|
|
|
|
2021-10-16 17:37:46 +08:00
|
|
|
if (!ini->ism_dev[0]) {
|
|
|
|
smc_find_ism_store_rc(SMC_CLC_DECL_NOSMCD2DEV, ini);
|
2021-09-14 16:35:05 +08:00
|
|
|
goto not_found;
|
2021-10-16 17:37:46 +08:00
|
|
|
}
|
2021-09-14 16:35:05 +08:00
|
|
|
|
2021-09-14 16:35:06 +08:00
|
|
|
smc_ism_get_system_eid(&eid);
|
2021-09-14 16:35:05 +08:00
|
|
|
if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext,
|
|
|
|
smcd_v2_ext->system_eid, eid))
|
2020-09-26 18:44:29 +08:00
|
|
|
goto not_found;
|
|
|
|
|
|
|
|
/* separate - outside the smcd_dev_list.lock */
|
2020-10-08 04:57:43 +08:00
|
|
|
smcd_version = ini->smcd_version;
|
2020-09-26 18:44:29 +08:00
|
|
|
for (i = 0; i < matches; i++) {
|
|
|
|
ini->smcd_version = SMC_V2;
|
|
|
|
ini->is_smcd = true;
|
|
|
|
ini->ism_selected = i;
|
2020-11-01 02:19:38 +08:00
|
|
|
rc = smc_listen_ism_init(new_smc, ini);
|
|
|
|
if (rc) {
|
|
|
|
smc_find_ism_store_rc(rc, ini);
|
2020-09-26 18:44:29 +08:00
|
|
|
/* try next active ISM device */
|
|
|
|
continue;
|
2020-11-01 02:19:38 +08:00
|
|
|
}
|
2020-09-26 18:44:29 +08:00
|
|
|
return; /* matching and usable V2 ISM device found */
|
|
|
|
}
|
2020-10-08 04:57:43 +08:00
|
|
|
/* no V2 ISM device could be initialized */
|
|
|
|
ini->smcd_version = smcd_version; /* restore original value */
|
2021-09-14 16:35:05 +08:00
|
|
|
ini->negotiated_eid[0] = 0;
|
2020-09-26 18:44:29 +08:00
|
|
|
|
|
|
|
not_found:
|
|
|
|
ini->smcd_version &= ~SMC_V2;
|
|
|
|
ini->ism_dev[0] = NULL;
|
|
|
|
ini->is_smcd = false;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc,
|
|
|
|
struct smc_clc_msg_proposal *pclc,
|
|
|
|
struct smc_init_info *ini)
|
2020-09-26 18:44:21 +08:00
|
|
|
{
|
|
|
|
struct smc_clc_msg_smcd *pclc_smcd = smc_get_clc_msg_smcd(pclc);
|
2020-11-01 02:19:38 +08:00
|
|
|
int rc = 0;
|
2020-09-26 18:44:21 +08:00
|
|
|
|
2020-09-26 18:44:29 +08:00
|
|
|
/* check if ISM V1 is available */
|
|
|
|
if (!(ini->smcd_version & SMC_V1) || !smcd_indicated(ini->smc_type_v1))
|
2020-09-26 18:44:21 +08:00
|
|
|
goto not_found;
|
|
|
|
ini->is_smcd = true; /* prepare ISM check */
|
2020-09-26 18:44:28 +08:00
|
|
|
ini->ism_peer_gid[0] = ntohll(pclc_smcd->ism.gid);
|
2020-11-01 02:19:38 +08:00
|
|
|
rc = smc_find_ism_device(new_smc, ini);
|
|
|
|
if (rc)
|
2020-09-26 18:44:21 +08:00
|
|
|
goto not_found;
|
2020-09-26 18:44:29 +08:00
|
|
|
ini->ism_selected = 0;
|
2020-11-01 02:19:38 +08:00
|
|
|
rc = smc_listen_ism_init(new_smc, ini);
|
|
|
|
if (!rc)
|
2020-09-26 18:44:29 +08:00
|
|
|
return; /* V1 ISM device found */
|
2020-09-26 18:44:21 +08:00
|
|
|
|
|
|
|
not_found:
|
2020-11-01 02:19:38 +08:00
|
|
|
smc_find_ism_store_rc(rc, ini);
|
2021-10-16 17:37:46 +08:00
|
|
|
ini->smcd_version &= ~SMC_V1;
|
2020-09-26 18:44:23 +08:00
|
|
|
ini->ism_dev[0] = NULL;
|
2020-09-26 18:44:21 +08:00
|
|
|
ini->is_smcd = false;
|
|
|
|
}
|
|
|
|
|
2018-05-18 15:34:18 +08:00
|
|
|
/* listen worker: register buffers */
|
2020-09-11 00:48:21 +08:00
|
|
|
static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first)
|
2018-05-18 15:34:18 +08:00
|
|
|
{
|
2020-04-29 23:10:41 +08:00
|
|
|
struct smc_connection *conn = &new_smc->conn;
|
2017-04-10 20:58:01 +08:00
|
|
|
|
2020-09-11 00:48:21 +08:00
|
|
|
if (!local_first) {
|
net/smc: Allow virtually contiguous sndbufs or RMBs for SMC-R
On long-running enterprise production servers, high-order contiguous
memory pages are usually very rare and in most cases we can only get
fragmented pages.
When replacing TCP with SMC-R in such production scenarios, attempting
to allocate high-order physically contiguous sndbufs and RMBs may result
in frequent memory compaction, which will cause unexpected hung issue
and further stability risks.
So this patch is aimed to allow SMC-R link group to use virtually
contiguous sndbufs and RMBs to avoid potential issues mentioned above.
Whether to use physically or virtually contiguous buffers can be set
by sysctl smcr_buf_type.
Note that using virtually contiguous buffers will bring an acceptable
performance regression, which can be mainly divided into two parts:
1) regression in data path, which is brought by additional address
translation of sndbuf by RNIC in Tx. But in general, translating
address through MTT is fast.
Taking 256KB sndbuf and RMB as an example, the comparisons in qperf
latency and bandwidth test with physically and virtually contiguous
buffers are as follows:
- client:
smc_run taskset -c <cpu> qperf <server> -oo msg_size:1:64K:*2\
-t 5 -vu tcp_{bw|lat}
- server:
smc_run taskset -c <cpu> qperf
[latency]
msgsize tcp smcr smcr-use-virt-buf
1 11.17 us 7.56 us 7.51 us (-0.67%)
2 10.65 us 7.74 us 7.56 us (-2.31%)
4 11.11 us 7.52 us 7.59 us ( 0.84%)
8 10.83 us 7.55 us 7.51 us (-0.48%)
16 11.21 us 7.46 us 7.51 us ( 0.71%)
32 10.65 us 7.53 us 7.58 us ( 0.61%)
64 10.95 us 7.74 us 7.80 us ( 0.76%)
128 11.14 us 7.83 us 7.87 us ( 0.47%)
256 10.97 us 7.94 us 7.92 us (-0.28%)
512 11.23 us 7.94 us 8.20 us ( 3.25%)
1024 11.60 us 8.12 us 8.20 us ( 0.96%)
2048 14.04 us 8.30 us 8.51 us ( 2.49%)
4096 16.88 us 9.13 us 9.07 us (-0.64%)
8192 22.50 us 10.56 us 11.22 us ( 6.26%)
16384 28.99 us 12.88 us 13.83 us ( 7.37%)
32768 40.13 us 16.76 us 16.95 us ( 1.16%)
65536 68.70 us 24.68 us 24.85 us ( 0.68%)
[bandwidth]
msgsize tcp smcr smcr-use-virt-buf
1 1.65 MB/s 1.59 MB/s 1.53 MB/s (-3.88%)
2 3.32 MB/s 3.17 MB/s 3.08 MB/s (-2.67%)
4 6.66 MB/s 6.33 MB/s 6.09 MB/s (-3.85%)
8 13.67 MB/s 13.45 MB/s 11.97 MB/s (-10.99%)
16 25.36 MB/s 27.15 MB/s 24.16 MB/s (-11.01%)
32 48.22 MB/s 54.24 MB/s 49.41 MB/s (-8.89%)
64 106.79 MB/s 107.32 MB/s 99.05 MB/s (-7.71%)
128 210.21 MB/s 202.46 MB/s 201.02 MB/s (-0.71%)
256 400.81 MB/s 416.81 MB/s 393.52 MB/s (-5.59%)
512 746.49 MB/s 834.12 MB/s 809.99 MB/s (-2.89%)
1024 1292.33 MB/s 1641.96 MB/s 1571.82 MB/s (-4.27%)
2048 2007.64 MB/s 2760.44 MB/s 2717.68 MB/s (-1.55%)
4096 2665.17 MB/s 4157.44 MB/s 4070.76 MB/s (-2.09%)
8192 3159.72 MB/s 4361.57 MB/s 4270.65 MB/s (-2.08%)
16384 4186.70 MB/s 4574.13 MB/s 4501.17 MB/s (-1.60%)
32768 4093.21 MB/s 4487.42 MB/s 4322.43 MB/s (-3.68%)
65536 4057.14 MB/s 4735.61 MB/s 4555.17 MB/s (-3.81%)
2) regression in buffer initialization and destruction path, which is
brought by additional MR operations of sndbufs. But thanks to link
group buffer reuse mechanism, the impact of this kind of regression
decreases as times of buffer reuse increases.
Taking 256KB sndbuf and RMB as an example, latency of some key SMC-R
buffer-related function obtained by bpftrace are as follows:
Function Phys-bufs Virt-bufs
smcr_new_buf_create() 67154 ns 79164 ns
smc_ib_buf_map_sg() 525 ns 928 ns
smc_ib_get_memory_region() 162294 ns 161191 ns
smc_wr_reg_send() 9957 ns 9635 ns
smc_ib_put_memory_region() 203548 ns 198374 ns
smc_ib_buf_unmap_sg() 508 ns 1158 ns
------------
Test environment notes:
1. Above tests run on 2 VMs within the same Host.
2. The NIC is ConnectX-4Lx, using SRIOV and passing through 2 VFs to
the each VM respectively.
3. VMs' vCPUs are binded to different physical CPUs, and the binded
physical CPUs are isolated by `isolcpus=xxx` cmdline.
4. NICs' queue number are set to 1.
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-07-14 17:44:04 +08:00
|
|
|
/* reg sendbufs if they were vzalloced */
|
|
|
|
if (conn->sndbuf_desc->is_vm) {
|
|
|
|
if (smcr_lgr_reg_sndbufs(conn->lnk,
|
|
|
|
conn->sndbuf_desc))
|
|
|
|
return SMC_CLC_DECL_ERR_REGBUF;
|
|
|
|
}
|
2020-05-01 18:48:01 +08:00
|
|
|
if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc))
|
net/smc: Allow virtually contiguous sndbufs or RMBs for SMC-R
On long-running enterprise production servers, high-order contiguous
memory pages are usually very rare and in most cases we can only get
fragmented pages.
When replacing TCP with SMC-R in such production scenarios, attempting
to allocate high-order physically contiguous sndbufs and RMBs may result
in frequent memory compaction, which will cause unexpected hung issue
and further stability risks.
So this patch is aimed to allow SMC-R link group to use virtually
contiguous sndbufs and RMBs to avoid potential issues mentioned above.
Whether to use physically or virtually contiguous buffers can be set
by sysctl smcr_buf_type.
Note that using virtually contiguous buffers will bring an acceptable
performance regression, which can be mainly divided into two parts:
1) regression in data path, which is brought by additional address
translation of sndbuf by RNIC in Tx. But in general, translating
address through MTT is fast.
Taking 256KB sndbuf and RMB as an example, the comparisons in qperf
latency and bandwidth test with physically and virtually contiguous
buffers are as follows:
- client:
smc_run taskset -c <cpu> qperf <server> -oo msg_size:1:64K:*2\
-t 5 -vu tcp_{bw|lat}
- server:
smc_run taskset -c <cpu> qperf
[latency]
msgsize tcp smcr smcr-use-virt-buf
1 11.17 us 7.56 us 7.51 us (-0.67%)
2 10.65 us 7.74 us 7.56 us (-2.31%)
4 11.11 us 7.52 us 7.59 us ( 0.84%)
8 10.83 us 7.55 us 7.51 us (-0.48%)
16 11.21 us 7.46 us 7.51 us ( 0.71%)
32 10.65 us 7.53 us 7.58 us ( 0.61%)
64 10.95 us 7.74 us 7.80 us ( 0.76%)
128 11.14 us 7.83 us 7.87 us ( 0.47%)
256 10.97 us 7.94 us 7.92 us (-0.28%)
512 11.23 us 7.94 us 8.20 us ( 3.25%)
1024 11.60 us 8.12 us 8.20 us ( 0.96%)
2048 14.04 us 8.30 us 8.51 us ( 2.49%)
4096 16.88 us 9.13 us 9.07 us (-0.64%)
8192 22.50 us 10.56 us 11.22 us ( 6.26%)
16384 28.99 us 12.88 us 13.83 us ( 7.37%)
32768 40.13 us 16.76 us 16.95 us ( 1.16%)
65536 68.70 us 24.68 us 24.85 us ( 0.68%)
[bandwidth]
msgsize tcp smcr smcr-use-virt-buf
1 1.65 MB/s 1.59 MB/s 1.53 MB/s (-3.88%)
2 3.32 MB/s 3.17 MB/s 3.08 MB/s (-2.67%)
4 6.66 MB/s 6.33 MB/s 6.09 MB/s (-3.85%)
8 13.67 MB/s 13.45 MB/s 11.97 MB/s (-10.99%)
16 25.36 MB/s 27.15 MB/s 24.16 MB/s (-11.01%)
32 48.22 MB/s 54.24 MB/s 49.41 MB/s (-8.89%)
64 106.79 MB/s 107.32 MB/s 99.05 MB/s (-7.71%)
128 210.21 MB/s 202.46 MB/s 201.02 MB/s (-0.71%)
256 400.81 MB/s 416.81 MB/s 393.52 MB/s (-5.59%)
512 746.49 MB/s 834.12 MB/s 809.99 MB/s (-2.89%)
1024 1292.33 MB/s 1641.96 MB/s 1571.82 MB/s (-4.27%)
2048 2007.64 MB/s 2760.44 MB/s 2717.68 MB/s (-1.55%)
4096 2665.17 MB/s 4157.44 MB/s 4070.76 MB/s (-2.09%)
8192 3159.72 MB/s 4361.57 MB/s 4270.65 MB/s (-2.08%)
16384 4186.70 MB/s 4574.13 MB/s 4501.17 MB/s (-1.60%)
32768 4093.21 MB/s 4487.42 MB/s 4322.43 MB/s (-3.68%)
65536 4057.14 MB/s 4735.61 MB/s 4555.17 MB/s (-3.81%)
2) regression in buffer initialization and destruction path, which is
brought by additional MR operations of sndbufs. But thanks to link
group buffer reuse mechanism, the impact of this kind of regression
decreases as times of buffer reuse increases.
Taking 256KB sndbuf and RMB as an example, latency of some key SMC-R
buffer-related function obtained by bpftrace are as follows:
Function Phys-bufs Virt-bufs
smcr_new_buf_create() 67154 ns 79164 ns
smc_ib_buf_map_sg() 525 ns 928 ns
smc_ib_get_memory_region() 162294 ns 161191 ns
smc_wr_reg_send() 9957 ns 9635 ns
smc_ib_put_memory_region() 203548 ns 198374 ns
smc_ib_buf_unmap_sg() 508 ns 1158 ns
------------
Test environment notes:
1. Above tests run on 2 VMs within the same Host.
2. The NIC is ConnectX-4Lx, using SRIOV and passing through 2 VFs to
the each VM respectively.
3. VMs' vCPUs are binded to different physical CPUs, and the binded
physical CPUs are isolated by `isolcpus=xxx` cmdline.
4. NICs' queue number are set to 1.
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-07-14 17:44:04 +08:00
|
|
|
return SMC_CLC_DECL_ERR_REGBUF;
|
2017-07-28 19:56:17 +08:00
|
|
|
}
|
|
|
|
|
2018-05-18 15:34:18 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2022-05-24 17:02:30 +08:00
|
|
|
static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc,
|
|
|
|
struct smc_clc_msg_proposal *pclc,
|
|
|
|
struct smc_init_info *ini)
|
2021-10-16 17:37:46 +08:00
|
|
|
{
|
|
|
|
struct smc_clc_v2_extension *smc_v2_ext;
|
|
|
|
u8 smcr_version;
|
2022-05-24 17:02:30 +08:00
|
|
|
int rc;
|
2021-10-16 17:37:46 +08:00
|
|
|
|
|
|
|
if (!(ini->smcr_version & SMC_V2) || !smcr_indicated(ini->smc_type_v2))
|
|
|
|
goto not_found;
|
|
|
|
|
|
|
|
smc_v2_ext = smc_get_clc_v2_ext(pclc);
|
|
|
|
if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, NULL, NULL))
|
|
|
|
goto not_found;
|
|
|
|
|
|
|
|
/* prepare RDMA check */
|
|
|
|
memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN);
|
|
|
|
memcpy(ini->peer_gid, smc_v2_ext->roce, SMC_GID_SIZE);
|
|
|
|
memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN);
|
|
|
|
ini->check_smcrv2 = true;
|
|
|
|
ini->smcrv2.clc_sk = new_smc->clcsock->sk;
|
|
|
|
ini->smcrv2.saddr = new_smc->clcsock->sk->sk_rcv_saddr;
|
|
|
|
ini->smcrv2.daddr = smc_ib_gid_to_ipv4(smc_v2_ext->roce);
|
|
|
|
rc = smc_find_rdma_device(new_smc, ini);
|
2022-05-24 17:02:30 +08:00
|
|
|
if (rc) {
|
|
|
|
smc_find_ism_store_rc(rc, ini);
|
2021-10-16 17:37:46 +08:00
|
|
|
goto not_found;
|
2022-05-24 17:02:30 +08:00
|
|
|
}
|
2021-10-16 17:37:46 +08:00
|
|
|
if (!ini->smcrv2.uses_gateway)
|
|
|
|
memcpy(ini->smcrv2.nexthop_mac, pclc->lcl.mac, ETH_ALEN);
|
|
|
|
|
|
|
|
smcr_version = ini->smcr_version;
|
|
|
|
ini->smcr_version = SMC_V2;
|
|
|
|
rc = smc_listen_rdma_init(new_smc, ini);
|
net/smc: Reset connection when trying to use SMCRv2 fails.
We found a crash when using SMCRv2 with 2 Mellanox ConnectX-4. It
can be reproduced by:
- smc_run nginx
- smc_run wrk -t 32 -c 500 -d 30 http://<ip>:<port>
BUG: kernel NULL pointer dereference, address: 0000000000000014
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 8000000108713067 P4D 8000000108713067 PUD 151127067 PMD 0
Oops: 0000 [#1] PREEMPT SMP PTI
CPU: 4 PID: 2441 Comm: kworker/4:249 Kdump: loaded Tainted: G W E 6.4.0-rc1+ #42
Workqueue: smc_hs_wq smc_listen_work [smc]
RIP: 0010:smc_clc_send_confirm_accept+0x284/0x580 [smc]
RSP: 0018:ffffb8294b2d7c78 EFLAGS: 00010a06
RAX: ffff8f1873238880 RBX: ffffb8294b2d7dc8 RCX: 0000000000000000
RDX: 00000000000000b4 RSI: 0000000000000001 RDI: 0000000000b40c00
RBP: ffffb8294b2d7db8 R08: ffff8f1815c5860c R09: 0000000000000000
R10: 0000000000000400 R11: 0000000000000000 R12: ffff8f1846f56180
R13: ffff8f1815c5860c R14: 0000000000000001 R15: 0000000000000001
FS: 0000000000000000(0000) GS:ffff8f1aefd00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000014 CR3: 00000001027a0001 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
<TASK>
? mlx5_ib_map_mr_sg+0xa1/0xd0 [mlx5_ib]
? smcr_buf_map_link+0x24b/0x290 [smc]
? __smc_buf_create+0x4ee/0x9b0 [smc]
smc_clc_send_accept+0x4c/0xb0 [smc]
smc_listen_work+0x346/0x650 [smc]
? __schedule+0x279/0x820
process_one_work+0x1e5/0x3f0
worker_thread+0x4d/0x2f0
? __pfx_worker_thread+0x10/0x10
kthread+0xe5/0x120
? __pfx_kthread+0x10/0x10
ret_from_fork+0x2c/0x50
</TASK>
During the CLC handshake, server sequentially tries available SMCRv2
and SMCRv1 devices in smc_listen_work().
If an SMCRv2 device is found. SMCv2 based link group and link will be
assigned to the connection. Then assumed that some buffer assignment
errors happen later in the CLC handshake, such as RMB registration
failure, server will give up SMCRv2 and try SMCRv1 device instead. But
the resources assigned to the connection won't be reset.
When server tries SMCRv1 device, the connection creation process will
be executed again. Since conn->lnk has been assigned when trying SMCRv2,
it will not be set to the correct SMCRv1 link in
smcr_lgr_conn_assign_link(). So in such situation, conn->lgr points to
correct SMCRv1 link group but conn->lnk points to the SMCRv2 link
mistakenly.
Then in smc_clc_send_confirm_accept(), conn->rmb_desc->mr[link->link_idx]
will be accessed. Since the link->link_idx is not correct, the related
MR may not have been initialized, so crash happens.
| Try SMCRv2 device first
| |-> conn->lgr: assign existed SMCRv2 link group;
| |-> conn->link: assign existed SMCRv2 link (link_idx may be 1 in SMC_LGR_SYMMETRIC);
| |-> sndbuf & RMB creation fails, quit;
|
| Try SMCRv1 device then
| |-> conn->lgr: create SMCRv1 link group and assign;
| |-> conn->link: keep SMCRv2 link mistakenly;
| |-> sndbuf & RMB creation succeed, only RMB->mr[link_idx = 0]
| initialized.
|
| Then smc_clc_send_confirm_accept() accesses
| conn->rmb_desc->mr[conn->link->link_idx, which is 1], then crash.
v
This patch tries to fix this by cleaning conn->lnk before assigning
link. In addition, it is better to reset the connection and clean the
resources assigned if trying SMCRv2 failed in buffer creation or
registration.
Fixes: e49300a6bf62 ("net/smc: add listen processing for SMC-Rv2")
Link: https://lore.kernel.org/r/20220523055056.2078994-1-liuyacan@corp.netease.com/
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2023-05-18 13:14:55 +08:00
|
|
|
if (!rc) {
|
2022-05-24 17:02:30 +08:00
|
|
|
rc = smc_listen_rdma_reg(new_smc, ini->first_contact_local);
|
net/smc: Reset connection when trying to use SMCRv2 fails.
We found a crash when using SMCRv2 with 2 Mellanox ConnectX-4. It
can be reproduced by:
- smc_run nginx
- smc_run wrk -t 32 -c 500 -d 30 http://<ip>:<port>
BUG: kernel NULL pointer dereference, address: 0000000000000014
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 8000000108713067 P4D 8000000108713067 PUD 151127067 PMD 0
Oops: 0000 [#1] PREEMPT SMP PTI
CPU: 4 PID: 2441 Comm: kworker/4:249 Kdump: loaded Tainted: G W E 6.4.0-rc1+ #42
Workqueue: smc_hs_wq smc_listen_work [smc]
RIP: 0010:smc_clc_send_confirm_accept+0x284/0x580 [smc]
RSP: 0018:ffffb8294b2d7c78 EFLAGS: 00010a06
RAX: ffff8f1873238880 RBX: ffffb8294b2d7dc8 RCX: 0000000000000000
RDX: 00000000000000b4 RSI: 0000000000000001 RDI: 0000000000b40c00
RBP: ffffb8294b2d7db8 R08: ffff8f1815c5860c R09: 0000000000000000
R10: 0000000000000400 R11: 0000000000000000 R12: ffff8f1846f56180
R13: ffff8f1815c5860c R14: 0000000000000001 R15: 0000000000000001
FS: 0000000000000000(0000) GS:ffff8f1aefd00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000014 CR3: 00000001027a0001 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
<TASK>
? mlx5_ib_map_mr_sg+0xa1/0xd0 [mlx5_ib]
? smcr_buf_map_link+0x24b/0x290 [smc]
? __smc_buf_create+0x4ee/0x9b0 [smc]
smc_clc_send_accept+0x4c/0xb0 [smc]
smc_listen_work+0x346/0x650 [smc]
? __schedule+0x279/0x820
process_one_work+0x1e5/0x3f0
worker_thread+0x4d/0x2f0
? __pfx_worker_thread+0x10/0x10
kthread+0xe5/0x120
? __pfx_kthread+0x10/0x10
ret_from_fork+0x2c/0x50
</TASK>
During the CLC handshake, server sequentially tries available SMCRv2
and SMCRv1 devices in smc_listen_work().
If an SMCRv2 device is found. SMCv2 based link group and link will be
assigned to the connection. Then assumed that some buffer assignment
errors happen later in the CLC handshake, such as RMB registration
failure, server will give up SMCRv2 and try SMCRv1 device instead. But
the resources assigned to the connection won't be reset.
When server tries SMCRv1 device, the connection creation process will
be executed again. Since conn->lnk has been assigned when trying SMCRv2,
it will not be set to the correct SMCRv1 link in
smcr_lgr_conn_assign_link(). So in such situation, conn->lgr points to
correct SMCRv1 link group but conn->lnk points to the SMCRv2 link
mistakenly.
Then in smc_clc_send_confirm_accept(), conn->rmb_desc->mr[link->link_idx]
will be accessed. Since the link->link_idx is not correct, the related
MR may not have been initialized, so crash happens.
| Try SMCRv2 device first
| |-> conn->lgr: assign existed SMCRv2 link group;
| |-> conn->link: assign existed SMCRv2 link (link_idx may be 1 in SMC_LGR_SYMMETRIC);
| |-> sndbuf & RMB creation fails, quit;
|
| Try SMCRv1 device then
| |-> conn->lgr: create SMCRv1 link group and assign;
| |-> conn->link: keep SMCRv2 link mistakenly;
| |-> sndbuf & RMB creation succeed, only RMB->mr[link_idx = 0]
| initialized.
|
| Then smc_clc_send_confirm_accept() accesses
| conn->rmb_desc->mr[conn->link->link_idx, which is 1], then crash.
v
This patch tries to fix this by cleaning conn->lnk before assigning
link. In addition, it is better to reset the connection and clean the
resources assigned if trying SMCRv2 failed in buffer creation or
registration.
Fixes: e49300a6bf62 ("net/smc: add listen processing for SMC-Rv2")
Link: https://lore.kernel.org/r/20220523055056.2078994-1-liuyacan@corp.netease.com/
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2023-05-18 13:14:55 +08:00
|
|
|
if (rc)
|
|
|
|
smc_conn_abort(new_smc, ini->first_contact_local);
|
|
|
|
}
|
2022-05-24 17:02:30 +08:00
|
|
|
if (!rc)
|
|
|
|
return;
|
|
|
|
ini->smcr_version = smcr_version;
|
|
|
|
smc_find_ism_store_rc(rc, ini);
|
2021-10-16 17:37:46 +08:00
|
|
|
|
|
|
|
not_found:
|
|
|
|
ini->smcr_version &= ~SMC_V2;
|
2022-05-25 16:54:08 +08:00
|
|
|
ini->smcrv2.ib_dev_v2 = NULL;
|
2021-10-16 17:37:46 +08:00
|
|
|
ini->check_smcrv2 = false;
|
|
|
|
}
|
|
|
|
|
2020-09-26 18:44:29 +08:00
|
|
|
static int smc_find_rdma_v1_device_serv(struct smc_sock *new_smc,
|
|
|
|
struct smc_clc_msg_proposal *pclc,
|
|
|
|
struct smc_init_info *ini)
|
2020-09-26 18:44:21 +08:00
|
|
|
{
|
|
|
|
int rc;
|
|
|
|
|
2021-10-16 17:37:46 +08:00
|
|
|
if (!(ini->smcr_version & SMC_V1) || !smcr_indicated(ini->smc_type_v1))
|
2020-09-26 18:44:21 +08:00
|
|
|
return SMC_CLC_DECL_NOSMCDEV;
|
|
|
|
|
|
|
|
/* prepare RDMA check */
|
2021-10-16 17:37:46 +08:00
|
|
|
memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN);
|
|
|
|
memcpy(ini->peer_gid, pclc->lcl.gid, SMC_GID_SIZE);
|
|
|
|
memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN);
|
2020-09-26 18:44:21 +08:00
|
|
|
rc = smc_find_rdma_device(new_smc, ini);
|
|
|
|
if (rc) {
|
|
|
|
/* no RDMA device found */
|
2021-10-16 17:37:46 +08:00
|
|
|
return SMC_CLC_DECL_NOSMCDEV;
|
2020-09-26 18:44:21 +08:00
|
|
|
}
|
|
|
|
rc = smc_listen_rdma_init(new_smc, ini);
|
|
|
|
if (rc)
|
|
|
|
return rc;
|
|
|
|
return smc_listen_rdma_reg(new_smc, ini->first_contact_local);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* determine the local device matching to proposal */
|
|
|
|
static int smc_listen_find_device(struct smc_sock *new_smc,
|
|
|
|
struct smc_clc_msg_proposal *pclc,
|
|
|
|
struct smc_init_info *ini)
|
|
|
|
{
|
2021-10-16 17:37:46 +08:00
|
|
|
int prfx_rc;
|
2020-09-26 18:44:29 +08:00
|
|
|
|
|
|
|
/* check for ISM device matching V2 proposed device */
|
|
|
|
smc_find_ism_v2_device_serv(new_smc, pclc, ini);
|
|
|
|
if (ini->ism_dev[0])
|
2020-09-26 18:44:21 +08:00
|
|
|
return 0;
|
2020-09-26 18:44:29 +08:00
|
|
|
|
2021-10-16 17:37:46 +08:00
|
|
|
/* check for matching IP prefix and subnet length (V1) */
|
|
|
|
prfx_rc = smc_listen_prfx_check(new_smc, pclc);
|
|
|
|
if (prfx_rc)
|
|
|
|
smc_find_ism_store_rc(prfx_rc, ini);
|
2020-09-26 18:44:29 +08:00
|
|
|
|
|
|
|
/* get vlan id from IP device */
|
|
|
|
if (smc_vlan_by_tcpsk(new_smc->clcsock, ini))
|
2020-11-01 02:19:38 +08:00
|
|
|
return ini->rc ?: SMC_CLC_DECL_GETVLANERR;
|
2020-09-26 18:44:29 +08:00
|
|
|
|
|
|
|
/* check for ISM device matching V1 proposed device */
|
2021-10-16 17:37:46 +08:00
|
|
|
if (!prfx_rc)
|
|
|
|
smc_find_ism_v1_device_serv(new_smc, pclc, ini);
|
2020-09-26 18:44:29 +08:00
|
|
|
if (ini->ism_dev[0])
|
|
|
|
return 0;
|
|
|
|
|
2021-10-16 17:37:46 +08:00
|
|
|
if (!smcr_indicated(pclc->hdr.typev1) &&
|
|
|
|
!smcr_indicated(pclc->hdr.typev2))
|
2020-11-01 02:19:38 +08:00
|
|
|
/* skip RDMA and decline */
|
|
|
|
return ini->rc ?: SMC_CLC_DECL_NOSMCDDEV;
|
2020-09-26 18:44:21 +08:00
|
|
|
|
2021-10-16 17:37:46 +08:00
|
|
|
/* check if RDMA V2 is available */
|
2022-05-24 17:02:30 +08:00
|
|
|
smc_find_rdma_v2_device_serv(new_smc, pclc, ini);
|
|
|
|
if (ini->smcrv2.ib_dev_v2)
|
2021-10-16 17:37:46 +08:00
|
|
|
return 0;
|
2020-11-01 02:19:38 +08:00
|
|
|
|
2021-10-16 17:37:46 +08:00
|
|
|
/* check if RDMA V1 is available */
|
|
|
|
if (!prfx_rc) {
|
2022-05-24 17:02:30 +08:00
|
|
|
int rc;
|
|
|
|
|
2021-10-16 17:37:46 +08:00
|
|
|
rc = smc_find_rdma_v1_device_serv(new_smc, pclc, ini);
|
|
|
|
smc_find_ism_store_rc(rc, ini);
|
|
|
|
return (!rc) ? 0 : ini->rc;
|
|
|
|
}
|
2023-10-12 20:37:29 +08:00
|
|
|
return prfx_rc;
|
2020-09-26 18:44:21 +08:00
|
|
|
}
|
|
|
|
|
2018-05-18 15:34:18 +08:00
|
|
|
/* listen worker: finish RDMA setup */
|
2018-09-18 21:46:35 +08:00
|
|
|
static int smc_listen_rdma_finish(struct smc_sock *new_smc,
|
|
|
|
struct smc_clc_msg_accept_confirm *cclc,
|
2021-10-16 17:37:46 +08:00
|
|
|
bool local_first,
|
|
|
|
struct smc_init_info *ini)
|
2018-05-18 15:34:18 +08:00
|
|
|
{
|
2020-04-29 23:10:40 +08:00
|
|
|
struct smc_link *link = new_smc->conn.lnk;
|
2018-05-18 15:34:18 +08:00
|
|
|
int reason_code = 0;
|
2017-01-09 23:55:16 +08:00
|
|
|
|
2020-09-11 00:48:21 +08:00
|
|
|
if (local_first)
|
2021-10-16 17:37:46 +08:00
|
|
|
smc_link_save_peer_info(link, cclc, ini);
|
2017-01-09 23:55:16 +08:00
|
|
|
|
2020-09-11 00:48:25 +08:00
|
|
|
if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc))
|
|
|
|
return SMC_CLC_DECL_ERR_RTOK;
|
2017-01-09 23:55:20 +08:00
|
|
|
|
2020-09-11 00:48:21 +08:00
|
|
|
if (local_first) {
|
2020-09-11 00:48:25 +08:00
|
|
|
if (smc_ib_ready_link(link))
|
|
|
|
return SMC_CLC_DECL_ERR_RDYLNK;
|
2017-01-09 23:55:21 +08:00
|
|
|
/* QP confirmation over RoCE fabric */
|
2020-04-30 21:55:42 +08:00
|
|
|
smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
|
2020-04-29 23:10:41 +08:00
|
|
|
reason_code = smcr_serv_conf_first_link(new_smc);
|
2020-04-30 21:55:42 +08:00
|
|
|
smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
|
2017-01-09 23:55:20 +08:00
|
|
|
}
|
2018-09-18 21:46:35 +08:00
|
|
|
return reason_code;
|
2018-05-18 15:34:18 +08:00
|
|
|
}
|
2017-01-09 23:55:23 +08:00
|
|
|
|
2020-09-26 18:44:21 +08:00
|
|
|
/* setup for connection of server */
|
2018-05-18 15:34:18 +08:00
|
|
|
static void smc_listen_work(struct work_struct *work)
|
|
|
|
{
|
|
|
|
struct smc_sock *new_smc = container_of(work, struct smc_sock,
|
|
|
|
smc_listen_work);
|
|
|
|
struct socket *newclcsock = new_smc->clcsock;
|
2020-09-26 18:44:30 +08:00
|
|
|
struct smc_clc_msg_accept_confirm *cclc;
|
2020-09-11 00:48:22 +08:00
|
|
|
struct smc_clc_msg_proposal_area *buf;
|
2018-05-18 15:34:18 +08:00
|
|
|
struct smc_clc_msg_proposal *pclc;
|
2020-09-26 18:44:23 +08:00
|
|
|
struct smc_init_info *ini = NULL;
|
2021-10-16 17:37:46 +08:00
|
|
|
u8 proposal_version = SMC_V1;
|
|
|
|
u8 accept_version;
|
2018-05-18 15:34:18 +08:00
|
|
|
int rc = 0;
|
|
|
|
|
2019-04-11 17:17:30 +08:00
|
|
|
if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN)
|
|
|
|
return smc_listen_out_err(new_smc);
|
|
|
|
|
2018-05-18 15:34:18 +08:00
|
|
|
if (new_smc->use_fallback) {
|
|
|
|
smc_listen_out_connected(new_smc);
|
|
|
|
return;
|
2017-01-09 23:55:16 +08:00
|
|
|
}
|
|
|
|
|
2018-05-18 15:34:18 +08:00
|
|
|
/* check if peer is smc capable */
|
|
|
|
if (!tcp_sk(newclcsock->sk)->syn_smc) {
|
2022-01-22 17:43:09 +08:00
|
|
|
rc = smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC);
|
|
|
|
if (rc)
|
|
|
|
smc_listen_out_err(new_smc);
|
|
|
|
else
|
|
|
|
smc_listen_out_connected(new_smc);
|
2018-05-18 15:34:18 +08:00
|
|
|
return;
|
|
|
|
}
|
2017-01-09 23:55:16 +08:00
|
|
|
|
2018-05-18 15:34:18 +08:00
|
|
|
/* do inband token exchange -
|
|
|
|
* wait for and receive SMC Proposal CLC message
|
|
|
|
*/
|
2020-09-11 00:48:22 +08:00
|
|
|
buf = kzalloc(sizeof(*buf), GFP_KERNEL);
|
|
|
|
if (!buf) {
|
|
|
|
rc = SMC_CLC_DECL_MEM;
|
|
|
|
goto out_decl;
|
|
|
|
}
|
|
|
|
pclc = (struct smc_clc_msg_proposal *)buf;
|
|
|
|
rc = smc_clc_wait_msg(new_smc, pclc, sizeof(*buf),
|
2019-04-12 18:57:28 +08:00
|
|
|
SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
|
2019-04-12 18:57:29 +08:00
|
|
|
if (rc)
|
|
|
|
goto out_decl;
|
2021-10-16 17:37:46 +08:00
|
|
|
|
|
|
|
if (pclc->hdr.version > SMC_V1)
|
|
|
|
proposal_version = SMC_V2;
|
2017-01-09 23:55:16 +08:00
|
|
|
|
2020-09-26 18:44:32 +08:00
|
|
|
/* IPSec connections opt out of SMC optimizations */
|
2018-05-18 15:34:18 +08:00
|
|
|
if (using_ipsec(new_smc)) {
|
2019-04-12 18:57:29 +08:00
|
|
|
rc = SMC_CLC_DECL_IPSEC;
|
|
|
|
goto out_decl;
|
2018-05-18 15:34:18 +08:00
|
|
|
}
|
|
|
|
|
2020-09-26 18:44:23 +08:00
|
|
|
ini = kzalloc(sizeof(*ini), GFP_KERNEL);
|
|
|
|
if (!ini) {
|
|
|
|
rc = SMC_CLC_DECL_MEM;
|
|
|
|
goto out_decl;
|
|
|
|
}
|
|
|
|
|
2020-09-26 18:44:29 +08:00
|
|
|
/* initial version checking */
|
|
|
|
rc = smc_listen_v2_check(new_smc, pclc, ini);
|
|
|
|
if (rc)
|
2019-04-12 18:57:29 +08:00
|
|
|
goto out_decl;
|
2019-04-12 18:57:27 +08:00
|
|
|
|
2023-08-17 21:20:29 +08:00
|
|
|
rc = smc_clc_srv_v2x_features_validate(pclc, ini);
|
|
|
|
if (rc)
|
|
|
|
goto out_decl;
|
|
|
|
|
2019-02-07 22:56:18 +08:00
|
|
|
mutex_lock(&smc_server_lgr_pending);
|
2018-05-18 15:34:18 +08:00
|
|
|
smc_close_init(new_smc);
|
|
|
|
smc_rx_init(new_smc);
|
|
|
|
smc_tx_init(new_smc);
|
|
|
|
|
2020-09-26 18:44:21 +08:00
|
|
|
/* determine ISM or RoCE device used for connection */
|
2020-09-26 18:44:23 +08:00
|
|
|
rc = smc_listen_find_device(new_smc, pclc, ini);
|
2020-09-26 18:44:21 +08:00
|
|
|
if (rc)
|
|
|
|
goto out_unlock;
|
2018-05-18 15:34:18 +08:00
|
|
|
|
|
|
|
/* send SMC Accept CLC message */
|
2021-10-16 17:37:46 +08:00
|
|
|
accept_version = ini->is_smcd ? ini->smcd_version : ini->smcr_version;
|
2020-09-26 18:44:30 +08:00
|
|
|
rc = smc_clc_send_accept(new_smc, ini->first_contact_local,
|
2023-08-17 21:20:27 +08:00
|
|
|
accept_version, ini->negotiated_eid, ini);
|
2019-04-12 18:57:29 +08:00
|
|
|
if (rc)
|
|
|
|
goto out_unlock;
|
2018-05-18 15:34:18 +08:00
|
|
|
|
2019-02-07 22:56:17 +08:00
|
|
|
/* SMC-D does not need this lock any more */
|
2020-09-26 18:44:23 +08:00
|
|
|
if (ini->is_smcd)
|
2019-02-07 22:56:18 +08:00
|
|
|
mutex_unlock(&smc_server_lgr_pending);
|
2019-02-07 22:56:17 +08:00
|
|
|
|
2018-05-18 15:34:18 +08:00
|
|
|
/* receive SMC Confirm CLC message */
|
2020-10-08 04:57:42 +08:00
|
|
|
memset(buf, 0, sizeof(*buf));
|
|
|
|
cclc = (struct smc_clc_msg_accept_confirm *)buf;
|
|
|
|
rc = smc_clc_wait_msg(new_smc, cclc, sizeof(*buf),
|
2019-04-12 18:57:28 +08:00
|
|
|
SMC_CLC_CONFIRM, CLC_WAIT_TIME);
|
|
|
|
if (rc) {
|
2020-09-26 18:44:23 +08:00
|
|
|
if (!ini->is_smcd)
|
2019-04-12 18:57:29 +08:00
|
|
|
goto out_unlock;
|
|
|
|
goto out_decl;
|
2018-05-18 15:34:18 +08:00
|
|
|
}
|
|
|
|
|
2023-08-17 21:20:29 +08:00
|
|
|
rc = smc_clc_v2x_features_confirm_check(cclc, ini);
|
|
|
|
if (rc) {
|
|
|
|
if (!ini->is_smcd)
|
|
|
|
goto out_unlock;
|
|
|
|
goto out_decl;
|
|
|
|
}
|
|
|
|
|
2023-08-17 21:20:31 +08:00
|
|
|
/* fce smc release version is needed in smc_listen_rdma_finish,
|
|
|
|
* so save fce info here.
|
|
|
|
*/
|
|
|
|
smc_conn_save_peer_info_fce(new_smc, cclc);
|
|
|
|
|
2018-05-18 15:34:18 +08:00
|
|
|
/* finish worker */
|
2020-09-26 18:44:23 +08:00
|
|
|
if (!ini->is_smcd) {
|
2020-09-26 18:44:30 +08:00
|
|
|
rc = smc_listen_rdma_finish(new_smc, cclc,
|
2021-10-16 17:37:46 +08:00
|
|
|
ini->first_contact_local, ini);
|
2019-02-07 22:56:17 +08:00
|
|
|
if (rc)
|
2020-09-11 00:48:25 +08:00
|
|
|
goto out_unlock;
|
|
|
|
mutex_unlock(&smc_server_lgr_pending);
|
2018-09-18 21:46:35 +08:00
|
|
|
}
|
2020-09-26 18:44:30 +08:00
|
|
|
smc_conn_save_peer_info(new_smc, cclc);
|
2018-05-18 15:34:18 +08:00
|
|
|
smc_listen_out_connected(new_smc);
|
2021-06-16 22:52:58 +08:00
|
|
|
SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini);
|
2020-09-18 04:46:02 +08:00
|
|
|
goto out_free;
|
2019-04-12 18:57:29 +08:00
|
|
|
|
|
|
|
out_unlock:
|
|
|
|
mutex_unlock(&smc_server_lgr_pending);
|
|
|
|
out_decl:
|
2020-10-24 02:48:28 +08:00
|
|
|
smc_listen_decline(new_smc, rc, ini ? ini->first_contact_local : 0,
|
2021-10-16 17:37:46 +08:00
|
|
|
proposal_version);
|
2020-09-18 04:46:02 +08:00
|
|
|
out_free:
|
2020-09-26 18:44:23 +08:00
|
|
|
kfree(ini);
|
2020-09-11 00:48:22 +08:00
|
|
|
kfree(buf);
|
2017-01-09 23:55:16 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void smc_tcp_listen_work(struct work_struct *work)
|
|
|
|
{
|
|
|
|
struct smc_sock *lsmc = container_of(work, struct smc_sock,
|
|
|
|
tcp_listen_work);
|
2018-01-24 17:28:12 +08:00
|
|
|
struct sock *lsk = &lsmc->sk;
|
2017-01-09 23:55:16 +08:00
|
|
|
struct smc_sock *new_smc;
|
|
|
|
int rc = 0;
|
|
|
|
|
2018-01-24 17:28:12 +08:00
|
|
|
lock_sock(lsk);
|
|
|
|
while (lsk->sk_state == SMC_LISTEN) {
|
2017-01-09 23:55:16 +08:00
|
|
|
rc = smc_clcsock_accept(lsmc, &new_smc);
|
2020-09-11 00:48:20 +08:00
|
|
|
if (rc) /* clcsock accept queue empty or error */
|
2017-01-09 23:55:16 +08:00
|
|
|
goto out;
|
|
|
|
if (!new_smc)
|
|
|
|
continue;
|
|
|
|
|
net/smc: Limit backlog connections
Current implementation does not handling backlog semantics, one
potential risk is that server will be flooded by infinite amount
connections, even if client was SMC-incapable.
This patch works to put a limit on backlog connections, referring to the
TCP implementation, we divides SMC connections into two categories:
1. Half SMC connection, which includes all TCP established while SMC not
connections.
2. Full SMC connection, which includes all SMC established connections.
For half SMC connection, since all half SMC connections starts with TCP
established, we can achieve our goal by put a limit before TCP
established. Refer to the implementation of TCP, this limits will based
on not only the half SMC connections but also the full connections,
which is also a constraint on full SMC connections.
For full SMC connections, although we know exactly where it starts, it's
quite hard to put a limit before it. The easiest way is to block wait
before receive SMC confirm CLC message, while it's under protection by
smc_server_lgr_pending, a global lock, which leads this limit to the
entire host instead of a single listen socket. Another way is to drop
the full connections, but considering the cast of SMC connections, we
prefer to keep full SMC connections.
Even so, the limits of full SMC connections still exists, see commits
about half SMC connection below.
After this patch, the limits of backend connection shows like:
For SMC:
1. Client with SMC-capability can makes 2 * backlog full SMC connections
or 1 * backlog half SMC connections and 1 * backlog full SMC
connections at most.
2. Client without SMC-capability can only makes 1 * backlog half TCP
connections and 1 * backlog full TCP connections.
Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-10 17:11:35 +08:00
|
|
|
if (tcp_sk(new_smc->clcsock->sk)->syn_smc)
|
|
|
|
atomic_inc(&lsmc->queued_smc_hs);
|
|
|
|
|
2017-01-09 23:55:16 +08:00
|
|
|
new_smc->listen_smc = lsmc;
|
2018-04-26 23:18:21 +08:00
|
|
|
new_smc->use_fallback = lsmc->use_fallback;
|
2018-07-25 22:35:32 +08:00
|
|
|
new_smc->fallback_rsn = lsmc->fallback_rsn;
|
2018-01-24 17:28:12 +08:00
|
|
|
sock_hold(lsk); /* sock_put in smc_listen_work */
|
2017-01-09 23:55:16 +08:00
|
|
|
INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
|
|
|
|
smc_copy_sock_settings_to_smc(new_smc);
|
2018-01-26 16:28:48 +08:00
|
|
|
sock_hold(&new_smc->sk); /* sock_put in passive closing */
|
2020-09-11 00:48:29 +08:00
|
|
|
if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work))
|
2018-01-26 16:28:48 +08:00
|
|
|
sock_put(&new_smc->sk);
|
2017-01-09 23:55:16 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
2018-01-24 17:28:12 +08:00
|
|
|
release_sock(lsk);
|
2020-09-11 00:48:20 +08:00
|
|
|
sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */
|
|
|
|
}
|
|
|
|
|
|
|
|
static void smc_clcsock_data_ready(struct sock *listen_clcsock)
|
|
|
|
{
|
2022-04-22 15:56:19 +08:00
|
|
|
struct smc_sock *lsmc;
|
2020-09-11 00:48:20 +08:00
|
|
|
|
2022-04-22 15:56:19 +08:00
|
|
|
read_lock_bh(&listen_clcsock->sk_callback_lock);
|
|
|
|
lsmc = smc_clcsock_user_data(listen_clcsock);
|
2020-09-11 00:48:20 +08:00
|
|
|
if (!lsmc)
|
2022-04-22 15:56:19 +08:00
|
|
|
goto out;
|
2020-09-11 00:48:20 +08:00
|
|
|
lsmc->clcsk_data_ready(listen_clcsock);
|
|
|
|
if (lsmc->sk.sk_state == SMC_LISTEN) {
|
|
|
|
sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */
|
2022-02-10 17:11:34 +08:00
|
|
|
if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_work))
|
2020-09-11 00:48:20 +08:00
|
|
|
sock_put(&lsmc->sk);
|
|
|
|
}
|
2022-04-22 15:56:19 +08:00
|
|
|
out:
|
|
|
|
read_unlock_bh(&listen_clcsock->sk_callback_lock);
|
2017-01-09 23:55:16 +08:00
|
|
|
}
|
|
|
|
|
2017-01-09 23:55:13 +08:00
|
|
|
static int smc_listen(struct socket *sock, int backlog)
|
|
|
|
{
|
|
|
|
struct sock *sk = sock->sk;
|
|
|
|
struct smc_sock *smc;
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
smc = smc_sk(sk);
|
|
|
|
lock_sock(sk);
|
|
|
|
|
|
|
|
rc = -EINVAL;
|
2019-08-02 16:47:50 +08:00
|
|
|
if ((sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) ||
|
2022-05-13 10:24:53 +08:00
|
|
|
smc->connect_nonblock || sock->state != SS_UNCONNECTED)
|
2017-01-09 23:55:13 +08:00
|
|
|
goto out;
|
|
|
|
|
|
|
|
rc = 0;
|
|
|
|
if (sk->sk_state == SMC_LISTEN) {
|
|
|
|
sk->sk_max_ack_backlog = backlog;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
/* some socket options are handled in core, so we could not apply
|
|
|
|
* them to the clc socket -- copy smc socket options to clc socket
|
|
|
|
*/
|
|
|
|
smc_copy_sock_settings_to_clc(smc);
|
2018-04-26 23:18:21 +08:00
|
|
|
if (!smc->use_fallback)
|
|
|
|
tcp_sk(smc->clcsock->sk)->syn_smc = 1;
|
2017-01-09 23:55:13 +08:00
|
|
|
|
2020-09-11 00:48:20 +08:00
|
|
|
/* save original sk_data_ready function and establish
|
|
|
|
* smc-specific sk_data_ready function
|
|
|
|
*/
|
2022-04-22 15:56:19 +08:00
|
|
|
write_lock_bh(&smc->clcsock->sk->sk_callback_lock);
|
2020-09-11 00:48:20 +08:00
|
|
|
smc->clcsock->sk->sk_user_data =
|
|
|
|
(void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY);
|
2022-04-22 15:56:18 +08:00
|
|
|
smc_clcsock_replace_cb(&smc->clcsock->sk->sk_data_ready,
|
|
|
|
smc_clcsock_data_ready, &smc->clcsk_data_ready);
|
2022-04-22 15:56:19 +08:00
|
|
|
write_unlock_bh(&smc->clcsock->sk->sk_callback_lock);
|
net/smc: Limit backlog connections
Current implementation does not handling backlog semantics, one
potential risk is that server will be flooded by infinite amount
connections, even if client was SMC-incapable.
This patch works to put a limit on backlog connections, referring to the
TCP implementation, we divides SMC connections into two categories:
1. Half SMC connection, which includes all TCP established while SMC not
connections.
2. Full SMC connection, which includes all SMC established connections.
For half SMC connection, since all half SMC connections starts with TCP
established, we can achieve our goal by put a limit before TCP
established. Refer to the implementation of TCP, this limits will based
on not only the half SMC connections but also the full connections,
which is also a constraint on full SMC connections.
For full SMC connections, although we know exactly where it starts, it's
quite hard to put a limit before it. The easiest way is to block wait
before receive SMC confirm CLC message, while it's under protection by
smc_server_lgr_pending, a global lock, which leads this limit to the
entire host instead of a single listen socket. Another way is to drop
the full connections, but considering the cast of SMC connections, we
prefer to keep full SMC connections.
Even so, the limits of full SMC connections still exists, see commits
about half SMC connection below.
After this patch, the limits of backend connection shows like:
For SMC:
1. Client with SMC-capability can makes 2 * backlog full SMC connections
or 1 * backlog half SMC connections and 1 * backlog full SMC
connections at most.
2. Client without SMC-capability can only makes 1 * backlog half TCP
connections and 1 * backlog full TCP connections.
Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-10 17:11:35 +08:00
|
|
|
|
|
|
|
/* save original ops */
|
|
|
|
smc->ori_af_ops = inet_csk(smc->clcsock->sk)->icsk_af_ops;
|
|
|
|
|
|
|
|
smc->af_ops = *smc->ori_af_ops;
|
|
|
|
smc->af_ops.syn_recv_sock = smc_tcp_syn_recv_sock;
|
|
|
|
|
|
|
|
inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops;
|
|
|
|
|
2022-02-10 17:11:37 +08:00
|
|
|
if (smc->limit_smc_hs)
|
|
|
|
tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested;
|
net/smc: Limit SMC visits when handshake workqueue congested
This patch intends to provide a mechanism to put constraint on SMC
connections visit according to the pressure of SMC handshake process.
At present, frequent visits will cause the incoming connections to be
backlogged in SMC handshake queue, raise the connections established
time. Which is quite unacceptable for those applications who base on
short lived connections.
There are two ways to implement this mechanism:
1. Put limitation after TCP established.
2. Put limitation before TCP established.
In the first way, we need to wait and receive CLC messages that the
client will potentially send, and then actively reply with a decline
message, in a sense, which is also a sort of SMC handshake, affect the
connections established time on its way.
In the second way, the only problem is that we need to inject SMC logic
into TCP when it is about to reply the incoming SYN, since we already do
that, it's seems not a problem anymore. And advantage is obvious, few
additional processes are required to complete the constraint.
This patch use the second way. After this patch, connections who beyond
constraint will not informed any SMC indication, and SMC will not be
involved in any of its subsequent processes.
Link: https://lore.kernel.org/all/1641301961-59331-1-git-send-email-alibuda@linux.alibaba.com/
Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-10 17:11:36 +08:00
|
|
|
|
2017-01-09 23:55:13 +08:00
|
|
|
rc = kernel_listen(smc->clcsock, backlog);
|
2021-11-24 20:32:38 +08:00
|
|
|
if (rc) {
|
2022-04-22 15:56:19 +08:00
|
|
|
write_lock_bh(&smc->clcsock->sk->sk_callback_lock);
|
2022-04-22 15:56:18 +08:00
|
|
|
smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready,
|
|
|
|
&smc->clcsk_data_ready);
|
|
|
|
smc->clcsock->sk->sk_user_data = NULL;
|
2022-04-22 15:56:19 +08:00
|
|
|
write_unlock_bh(&smc->clcsock->sk->sk_callback_lock);
|
2017-01-09 23:55:13 +08:00
|
|
|
goto out;
|
2021-11-24 20:32:38 +08:00
|
|
|
}
|
2017-01-09 23:55:13 +08:00
|
|
|
sk->sk_max_ack_backlog = backlog;
|
|
|
|
sk->sk_ack_backlog = 0;
|
|
|
|
sk->sk_state = SMC_LISTEN;
|
|
|
|
|
|
|
|
out:
|
|
|
|
release_sock(sk);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* accept() handler of the smc socket.
 *
 * @sock:     listening smc socket
 * @new_sock: socket handed to smc_accept_dequeue() for the new connection
 * @flags:    file flags; O_NONBLOCK selects a non-waiting accept
 * @kern:     not used by this function
 *
 * Returns 0 on success, -EINVAL if @sock is not in state SMC_LISTEN,
 * -EAGAIN if nothing could be dequeued before the receive timeout ran
 * out, the value of sock_intr_errno() if a signal is pending, or the
 * pending error of the dequeued socket.
 *
 * If TCP_DEFER_ACCEPT was set on the listen socket and the accept is
 * blocking, additionally waits up to that many seconds for data to
 * arrive on the new connection.
 */
static int smc_accept(struct socket *sock, struct socket *new_sock,
		      int flags, bool kern)
{
	struct sock *sk = sock->sk, *nsk;
	DECLARE_WAITQUEUE(wait, current);
	struct smc_sock *lsmc;
	long timeo;
	int rc = 0;

	lsmc = smc_sk(sk);
	sock_hold(sk); /* sock_put below */
	lock_sock(sk);

	if (lsmc->sk.sk_state != SMC_LISTEN) {
		rc = -EINVAL;
		release_sock(sk);
		goto out;
	}

	/* Wait for an incoming connection */
	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
	add_wait_queue_exclusive(sk_sleep(sk), &wait);
	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (!timeo) {
			rc = -EAGAIN;
			break;
		}
		/* sleep without holding the socket lock */
		release_sock(sk);
		timeo = schedule_timeout(timeo);
		/* wakeup by sk_data_ready in smc_listen_work() */
		sched_annotate_sleep();
		lock_sock(sk);
		if (signal_pending(current)) {
			rc = sock_intr_errno(timeo);
			break;
		}
	}
	set_current_state(TASK_RUNNING);
	remove_wait_queue(sk_sleep(sk), &wait);

	/* nsk is only valid (non-NULL) when the loop ended without error */
	if (!rc)
		rc = sock_error(nsk);
	release_sock(sk);
	if (rc)
		goto out;

	if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
		/* wait till data arrives on the socket */
		timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
								MSEC_PER_SEC);
		if (smc_sk(nsk)->use_fallback) {
			struct sock *clcsk = smc_sk(nsk)->clcsock->sk;

			lock_sock(clcsk);
			if (skb_queue_empty(&clcsk->sk_receive_queue))
				sk_wait_data(clcsk, &timeo, NULL);
			release_sock(clcsk);
		} else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
			lock_sock(nsk);
			smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
			release_sock(nsk);
		}
	}

out:
	sock_put(sk); /* sock_hold above */
	return rc;
}
|
|
|
|
|
|
|
|
static int smc_getname(struct socket *sock, struct sockaddr *addr,
|
2018-02-13 03:00:20 +08:00
|
|
|
int peer)
|
2017-01-09 23:55:13 +08:00
|
|
|
{
|
|
|
|
struct smc_sock *smc;
|
|
|
|
|
2017-01-09 23:55:25 +08:00
|
|
|
if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
|
|
|
|
(sock->sk->sk_state != SMC_APPCLOSEWAIT1))
|
2017-01-09 23:55:13 +08:00
|
|
|
return -ENOTCONN;
|
|
|
|
|
|
|
|
smc = smc_sk(sock->sk);
|
|
|
|
|
2018-02-13 03:00:20 +08:00
|
|
|
return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
|
2017-01-09 23:55:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* sendmsg() handler of the smc socket.
 *
 * @sock: smc socket
 * @msg:  message to send
 * @len:  number of bytes to send
 *
 * With MSG_FASTOPEN set, a socket still in state SMC_INIT without a
 * nonblocking connect in progress is switched to fallback; in any other
 * state -EINVAL is returned. Without MSG_FASTOPEN, -EPIPE is returned
 * unless the state is SMC_ACTIVE, SMC_APPCLOSEWAIT1 or SMC_INIT.
 *
 * Returns the result of the CLC socket's sendmsg (fallback) or of
 * smc_tx_sendmsg(), or one of the errors above.
 */
static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);
	lock_sock(sk);

	/* SMC does not support connect with fastopen */
	if (msg->msg_flags & MSG_FASTOPEN) {
		/* not connected yet, fallback */
		if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
			rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP);
			if (rc)
				goto out;
		} else {
			rc = -EINVAL;
			goto out;
		}
	} else if ((sk->sk_state != SMC_ACTIVE) &&
		   (sk->sk_state != SMC_APPCLOSEWAIT1) &&
		   (sk->sk_state != SMC_INIT)) {
		rc = -EPIPE;
		goto out;
	}

	if (smc->use_fallback) {
		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
	} else {
		rc = smc_tx_sendmsg(smc, msg, len);
		SMC_STAT_TX_PAYLOAD(smc, len, rc);
	}
out:
	release_sock(sk);
	return rc;
}
|
|
|
|
|
|
|
|
/* recvmsg() handler of the smc socket.
 *
 * @sock:  smc socket
 * @msg:   message header receiving the data
 * @len:   maximum number of bytes to receive
 * @flags: receive flags, passed through unchanged
 *
 * Returns 0 for a closed socket whose receive side was shut down and
 * for state SMC_PEERFINCLOSEWAIT, -ENOTCONN in states SMC_INIT,
 * SMC_LISTEN and (otherwise) SMC_CLOSED, else the result of the CLC
 * socket's recvmsg (fallback) or of smc_rx_recvmsg().
 */
static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
		       int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -ENOTCONN;

	smc = smc_sk(sk);
	lock_sock(sk);
	if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
		/* socket was connected before, no more data to read */
		rc = 0;
		goto out;
	}
	/* not (or no longer) connected: rc is still -ENOTCONN */
	if ((sk->sk_state == SMC_INIT) ||
	    (sk->sk_state == SMC_LISTEN) ||
	    (sk->sk_state == SMC_CLOSED))
		goto out;

	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
		rc = 0;
		goto out;
	}

	if (smc->use_fallback) {
		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
	} else {
		msg->msg_namelen = 0;
		rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
		SMC_STAT_RX_PAYLOAD(smc, rc, rc);
	}

out:
	release_sock(sk);
	return rc;
}
|
|
|
|
|
2017-07-03 12:01:49 +08:00
|
|
|
static __poll_t smc_accept_poll(struct sock *parent)
|
2017-01-09 23:55:16 +08:00
|
|
|
{
|
2018-01-26 16:28:47 +08:00
|
|
|
struct smc_sock *isk = smc_sk(parent);
|
2018-02-01 23:02:53 +08:00
|
|
|
__poll_t mask = 0;
|
2017-01-09 23:55:16 +08:00
|
|
|
|
2018-01-26 16:28:47 +08:00
|
|
|
spin_lock(&isk->accept_q_lock);
|
|
|
|
if (!list_empty(&isk->accept_q))
|
2018-02-12 06:34:03 +08:00
|
|
|
mask = EPOLLIN | EPOLLRDNORM;
|
2018-01-26 16:28:47 +08:00
|
|
|
spin_unlock(&isk->accept_q_lock);
|
2017-01-09 23:55:16 +08:00
|
|
|
|
2018-01-26 16:28:47 +08:00
|
|
|
return mask;
|
2017-01-09 23:55:16 +08:00
|
|
|
}
|
|
|
|
|
2018-06-29 00:43:44 +08:00
|
|
|
static __poll_t smc_poll(struct file *file, struct socket *sock,
|
|
|
|
poll_table *wait)
|
2017-01-09 23:55:13 +08:00
|
|
|
{
|
|
|
|
struct sock *sk = sock->sk;
|
|
|
|
struct smc_sock *smc;
|
2019-04-12 18:57:23 +08:00
|
|
|
__poll_t mask = 0;
|
2017-01-09 23:55:13 +08:00
|
|
|
|
2018-01-26 16:28:47 +08:00
|
|
|
if (!sk)
|
2018-02-12 06:34:03 +08:00
|
|
|
return EPOLLNVAL;
|
2018-01-26 16:28:47 +08:00
|
|
|
|
2017-01-09 23:55:13 +08:00
|
|
|
smc = smc_sk(sock->sk);
|
2018-09-18 21:46:34 +08:00
|
|
|
if (smc->use_fallback) {
|
2017-01-09 23:55:16 +08:00
|
|
|
/* delegate to CLC child sock */
|
2018-06-29 00:43:44 +08:00
|
|
|
mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
|
2018-05-02 22:53:56 +08:00
|
|
|
sk->sk_err = smc->clcsock->sk->sk_err;
|
2017-01-09 23:55:13 +08:00
|
|
|
} else {
|
2018-07-04 00:53:43 +08:00
|
|
|
if (sk->sk_state != SMC_CLOSED)
|
2018-10-23 19:40:39 +08:00
|
|
|
sock_poll_wait(file, sock, wait);
|
2017-01-09 23:55:16 +08:00
|
|
|
if (sk->sk_err)
|
2018-02-12 06:34:03 +08:00
|
|
|
mask |= EPOLLERR;
|
2017-01-09 23:55:25 +08:00
|
|
|
if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
|
|
|
|
(sk->sk_state == SMC_CLOSED))
|
2018-02-12 06:34:03 +08:00
|
|
|
mask |= EPOLLHUP;
|
2018-01-26 16:28:47 +08:00
|
|
|
if (sk->sk_state == SMC_LISTEN) {
|
|
|
|
/* woken up by sk_data_ready in smc_listen_work() */
|
2019-04-12 18:57:23 +08:00
|
|
|
mask |= smc_accept_poll(sk);
|
|
|
|
} else if (smc->use_fallback) { /* as result of connect_work()*/
|
|
|
|
mask |= smc->clcsock->ops->poll(file, smc->clcsock,
|
|
|
|
wait);
|
|
|
|
sk->sk_err = smc->clcsock->sk->sk_err;
|
2018-01-26 16:28:47 +08:00
|
|
|
} else {
|
2019-04-12 18:57:23 +08:00
|
|
|
if ((sk->sk_state != SMC_INIT &&
|
|
|
|
atomic_read(&smc->conn.sndbuf_space)) ||
|
2018-01-26 16:28:47 +08:00
|
|
|
sk->sk_shutdown & SEND_SHUTDOWN) {
|
2018-02-12 06:34:03 +08:00
|
|
|
mask |= EPOLLOUT | EPOLLWRNORM;
|
2018-01-26 16:28:47 +08:00
|
|
|
} else {
|
|
|
|
sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
|
|
|
|
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
|
|
|
|
}
|
|
|
|
if (atomic_read(&smc->conn.bytes_to_rcv))
|
2018-02-12 06:34:03 +08:00
|
|
|
mask |= EPOLLIN | EPOLLRDNORM;
|
2018-01-26 16:28:47 +08:00
|
|
|
if (sk->sk_shutdown & RCV_SHUTDOWN)
|
2018-02-12 06:34:03 +08:00
|
|
|
mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
|
2018-01-26 16:28:47 +08:00
|
|
|
if (sk->sk_state == SMC_APPCLOSEWAIT1)
|
2018-02-12 06:34:03 +08:00
|
|
|
mask |= EPOLLIN;
|
2018-09-18 21:46:37 +08:00
|
|
|
if (smc->conn.urg_state == SMC_URG_VALID)
|
|
|
|
mask |= EPOLLPRI;
|
2018-01-26 16:28:47 +08:00
|
|
|
}
|
2017-01-09 23:55:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return mask;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int smc_shutdown(struct socket *sock, int how)
|
|
|
|
{
|
|
|
|
struct sock *sk = sock->sk;
|
2021-11-26 10:41:35 +08:00
|
|
|
bool do_shutdown = true;
|
2017-01-09 23:55:13 +08:00
|
|
|
struct smc_sock *smc;
|
|
|
|
int rc = -EINVAL;
|
2021-11-26 10:41:35 +08:00
|
|
|
int old_state;
|
2017-01-09 23:55:25 +08:00
|
|
|
int rc1 = 0;
|
2017-01-09 23:55:13 +08:00
|
|
|
|
|
|
|
smc = smc_sk(sk);
|
|
|
|
|
|
|
|
if ((how < SHUT_RD) || (how > SHUT_RDWR))
|
2017-01-09 23:55:25 +08:00
|
|
|
return rc;
|
2017-01-09 23:55:13 +08:00
|
|
|
|
|
|
|
lock_sock(sk);
|
|
|
|
|
2022-05-13 10:24:53 +08:00
|
|
|
if (sock->state == SS_CONNECTING) {
|
|
|
|
if (sk->sk_state == SMC_ACTIVE)
|
|
|
|
sock->state = SS_CONNECTED;
|
|
|
|
else if (sk->sk_state == SMC_PEERCLOSEWAIT1 ||
|
|
|
|
sk->sk_state == SMC_PEERCLOSEWAIT2 ||
|
|
|
|
sk->sk_state == SMC_APPCLOSEWAIT1 ||
|
|
|
|
sk->sk_state == SMC_APPCLOSEWAIT2 ||
|
|
|
|
sk->sk_state == SMC_APPFINCLOSEWAIT)
|
|
|
|
sock->state = SS_DISCONNECTING;
|
|
|
|
}
|
|
|
|
|
2017-01-09 23:55:13 +08:00
|
|
|
rc = -ENOTCONN;
|
2018-08-08 20:13:19 +08:00
|
|
|
if ((sk->sk_state != SMC_ACTIVE) &&
|
2017-01-09 23:55:25 +08:00
|
|
|
(sk->sk_state != SMC_PEERCLOSEWAIT1) &&
|
|
|
|
(sk->sk_state != SMC_PEERCLOSEWAIT2) &&
|
|
|
|
(sk->sk_state != SMC_APPCLOSEWAIT1) &&
|
|
|
|
(sk->sk_state != SMC_APPCLOSEWAIT2) &&
|
|
|
|
(sk->sk_state != SMC_APPFINCLOSEWAIT))
|
2017-01-09 23:55:13 +08:00
|
|
|
goto out;
|
|
|
|
if (smc->use_fallback) {
|
|
|
|
rc = kernel_sock_shutdown(smc->clcsock, how);
|
|
|
|
sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
|
2022-04-14 15:51:03 +08:00
|
|
|
if (sk->sk_shutdown == SHUTDOWN_MASK) {
|
2017-01-09 23:55:13 +08:00
|
|
|
sk->sk_state = SMC_CLOSED;
|
2022-05-13 10:24:53 +08:00
|
|
|
sk->sk_socket->state = SS_UNCONNECTED;
|
2022-04-14 15:51:03 +08:00
|
|
|
sock_put(sk);
|
|
|
|
}
|
2017-01-09 23:55:25 +08:00
|
|
|
goto out;
|
2017-01-09 23:55:13 +08:00
|
|
|
}
|
2017-01-09 23:55:25 +08:00
|
|
|
switch (how) {
|
|
|
|
case SHUT_RDWR: /* shutdown in both directions */
|
2021-11-26 10:41:35 +08:00
|
|
|
old_state = sk->sk_state;
|
2017-01-09 23:55:25 +08:00
|
|
|
rc = smc_close_active(smc);
|
2021-11-26 10:41:35 +08:00
|
|
|
if (old_state == SMC_ACTIVE &&
|
|
|
|
sk->sk_state == SMC_PEERCLOSEWAIT1)
|
|
|
|
do_shutdown = false;
|
2017-01-09 23:55:25 +08:00
|
|
|
break;
|
|
|
|
case SHUT_WR:
|
|
|
|
rc = smc_close_shutdown_write(smc);
|
|
|
|
break;
|
|
|
|
case SHUT_RD:
|
2018-04-19 21:56:40 +08:00
|
|
|
rc = 0;
|
|
|
|
/* nothing more to do because peer is not involved */
|
2017-01-09 23:55:25 +08:00
|
|
|
break;
|
|
|
|
}
|
2021-11-26 10:41:35 +08:00
|
|
|
if (do_shutdown && smc->clcsock)
|
2018-04-19 21:56:40 +08:00
|
|
|
rc1 = kernel_sock_shutdown(smc->clcsock, how);
|
2017-01-09 23:55:25 +08:00
|
|
|
/* map sock_shutdown_cmd constants to sk_shutdown value range */
|
|
|
|
sk->sk_shutdown |= how + 1;
|
2017-01-09 23:55:13 +08:00
|
|
|
|
2022-05-13 10:24:53 +08:00
|
|
|
if (sk->sk_state == SMC_CLOSED)
|
|
|
|
sock->state = SS_UNCONNECTED;
|
|
|
|
else
|
|
|
|
sock->state = SS_DISCONNECTING;
|
2017-01-09 23:55:13 +08:00
|
|
|
out:
|
|
|
|
release_sock(sk);
|
2017-01-09 23:55:25 +08:00
|
|
|
return rc ? rc : rc1;
|
2017-01-09 23:55:13 +08:00
|
|
|
}
|
|
|
|
|
2022-02-10 17:11:37 +08:00
|
|
|
/* getsockopt() handler for options on level SOL_SMC.
 *
 * @sock:    smc socket
 * @level:   option level (not evaluated here)
 * @optname: option name; only SMC_LIMIT_HS is handled
 * @optval:  user buffer receiving the option value
 * @optlen:  user pointer: in = buffer size, out = bytes written
 *
 * Returns 0 on success, -EFAULT if user memory cannot be accessed,
 * -EINVAL for a negative length and -EOPNOTSUPP for an unknown option.
 */
static int __smc_getsockopt(struct socket *sock, int level, int optname,
			    char __user *optval, int __user *optlen)
{
	struct smc_sock *smc;
	int val, len;

	smc = smc_sk(sock->sk);

	if (get_user(len, optlen))
		return -EFAULT;

	/* never copy out more than the int-sized option value */
	len = min_t(int, len, sizeof(int));

	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case SMC_LIMIT_HS:
		val = smc->limit_smc_hs;
		break;
	default:
		return -EOPNOTSUPP;
	}

	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, &val, len))
		return -EFAULT;

	return 0;
}
|
|
|
|
|
|
|
|
static int __smc_setsockopt(struct socket *sock, int level, int optname,
|
|
|
|
sockptr_t optval, unsigned int optlen)
|
|
|
|
{
|
|
|
|
struct sock *sk = sock->sk;
|
|
|
|
struct smc_sock *smc;
|
|
|
|
int val, rc;
|
|
|
|
|
|
|
|
smc = smc_sk(sk);
|
|
|
|
|
|
|
|
lock_sock(sk);
|
|
|
|
switch (optname) {
|
|
|
|
case SMC_LIMIT_HS:
|
2022-02-18 23:32:59 +08:00
|
|
|
if (optlen < sizeof(int)) {
|
|
|
|
rc = -EINVAL;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (copy_from_sockptr(&val, optval, sizeof(int))) {
|
|
|
|
rc = -EFAULT;
|
|
|
|
break;
|
|
|
|
}
|
2022-02-10 17:11:37 +08:00
|
|
|
|
|
|
|
smc->limit_smc_hs = !!val;
|
|
|
|
rc = 0;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
rc = -EOPNOTSUPP;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
release_sock(sk);
|
|
|
|
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2017-01-09 23:55:13 +08:00
|
|
|
static int smc_setsockopt(struct socket *sock, int level, int optname,
|
2020-07-23 14:09:07 +08:00
|
|
|
sockptr_t optval, unsigned int optlen)
|
2017-01-09 23:55:13 +08:00
|
|
|
{
|
|
|
|
struct sock *sk = sock->sk;
|
|
|
|
struct smc_sock *smc;
|
2018-04-26 23:18:22 +08:00
|
|
|
int val, rc;
|
2017-01-09 23:55:13 +08:00
|
|
|
|
2021-05-06 03:40:48 +08:00
|
|
|
if (level == SOL_TCP && optname == TCP_ULP)
|
|
|
|
return -EOPNOTSUPP;
|
2022-02-10 17:11:37 +08:00
|
|
|
else if (level == SOL_SMC)
|
|
|
|
return __smc_setsockopt(sock, level, optname, optval, optlen);
|
2021-05-06 03:40:48 +08:00
|
|
|
|
2017-01-09 23:55:13 +08:00
|
|
|
smc = smc_sk(sk);
|
|
|
|
|
|
|
|
/* generic setsockopts reaching us here always apply to the
|
|
|
|
* CLC socket
|
|
|
|
*/
|
2022-01-22 17:43:09 +08:00
|
|
|
mutex_lock(&smc->clcsock_release_lock);
|
|
|
|
if (!smc->clcsock) {
|
|
|
|
mutex_unlock(&smc->clcsock_release_lock);
|
|
|
|
return -EBADF;
|
|
|
|
}
|
2020-07-17 14:23:31 +08:00
|
|
|
if (unlikely(!smc->clcsock->ops->setsockopt))
|
|
|
|
rc = -EOPNOTSUPP;
|
|
|
|
else
|
|
|
|
rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
|
|
|
|
optval, optlen);
|
2018-04-26 23:18:21 +08:00
|
|
|
if (smc->clcsock->sk->sk_err) {
|
|
|
|
sk->sk_err = smc->clcsock->sk->sk_err;
|
2021-06-28 06:48:21 +08:00
|
|
|
sk_error_report(sk);
|
2018-04-26 23:18:21 +08:00
|
|
|
}
|
2022-01-22 17:43:09 +08:00
|
|
|
mutex_unlock(&smc->clcsock_release_lock);
|
2018-04-26 23:18:21 +08:00
|
|
|
|
2018-04-26 23:18:22 +08:00
|
|
|
if (optlen < sizeof(int))
|
2018-05-31 10:31:22 +08:00
|
|
|
return -EINVAL;
|
2020-07-23 14:09:07 +08:00
|
|
|
if (copy_from_sockptr(&val, optval, sizeof(int)))
|
2018-07-18 21:22:50 +08:00
|
|
|
return -EFAULT;
|
2018-04-26 23:18:22 +08:00
|
|
|
|
2018-04-26 23:18:21 +08:00
|
|
|
lock_sock(sk);
|
2019-12-13 05:35:58 +08:00
|
|
|
if (rc || smc->use_fallback)
|
|
|
|
goto out;
|
2018-04-26 23:18:21 +08:00
|
|
|
switch (optname) {
|
|
|
|
case TCP_FASTOPEN:
|
|
|
|
case TCP_FASTOPEN_CONNECT:
|
|
|
|
case TCP_FASTOPEN_KEY:
|
|
|
|
case TCP_FASTOPEN_NO_COOKIE:
|
|
|
|
/* option not supported by SMC */
|
2019-11-15 19:39:30 +08:00
|
|
|
if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
|
2022-01-22 17:43:09 +08:00
|
|
|
rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP);
|
2018-04-26 23:18:21 +08:00
|
|
|
} else {
|
2019-12-13 05:35:58 +08:00
|
|
|
rc = -EINVAL;
|
2018-04-26 23:18:21 +08:00
|
|
|
}
|
|
|
|
break;
|
2018-04-26 23:18:22 +08:00
|
|
|
case TCP_NODELAY:
|
2019-08-02 16:16:38 +08:00
|
|
|
if (sk->sk_state != SMC_INIT &&
|
|
|
|
sk->sk_state != SMC_LISTEN &&
|
|
|
|
sk->sk_state != SMC_CLOSED) {
|
2021-06-16 22:52:55 +08:00
|
|
|
if (val) {
|
2021-06-16 22:52:58 +08:00
|
|
|
SMC_STAT_INC(smc, ndly_cnt);
|
2022-03-01 17:43:59 +08:00
|
|
|
smc_tx_pending(&smc->conn);
|
|
|
|
cancel_delayed_work(&smc->conn.tx_work);
|
2021-06-16 22:52:55 +08:00
|
|
|
}
|
2018-04-26 23:18:22 +08:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
case TCP_CORK:
|
2019-08-02 16:16:38 +08:00
|
|
|
if (sk->sk_state != SMC_INIT &&
|
|
|
|
sk->sk_state != SMC_LISTEN &&
|
|
|
|
sk->sk_state != SMC_CLOSED) {
|
2021-06-16 22:52:55 +08:00
|
|
|
if (!val) {
|
2021-06-16 22:52:58 +08:00
|
|
|
SMC_STAT_INC(smc, cork_cnt);
|
net/smc: Send directly when TCP_CORK is cleared
According to the man page of TCP_CORK [1], if set, don't send out
partial frames. All queued partial frames are sent when option is
cleared again.
When applications call setsockopt to disable TCP_CORK, this call is
protected by lock_sock(), and tries to mod_delayed_work() to 0, in order
to send pending data right now. However, the delayed work smc_tx_work is
also protected by lock_sock(). There introduces lock contention for
sending data.
To fix it, send pending data directly which acts like TCP, without
lock_sock() protected in the context of setsockopt (already lock_sock()ed),
and cancel unnecessary dealyed work, which is protected by lock.
[1] https://linux.die.net/man/7/tcp
Signed-off-by: Tony Lu <tonylu@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-01-31 02:02:55 +08:00
|
|
|
smc_tx_pending(&smc->conn);
|
|
|
|
cancel_delayed_work(&smc->conn.tx_work);
|
2021-06-16 22:52:55 +08:00
|
|
|
}
|
2018-04-26 23:18:22 +08:00
|
|
|
}
|
|
|
|
break;
|
2018-04-26 23:18:23 +08:00
|
|
|
case TCP_DEFER_ACCEPT:
|
|
|
|
smc->sockopt_defer_accept = val;
|
|
|
|
break;
|
2018-04-26 23:18:21 +08:00
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
2019-12-13 05:35:58 +08:00
|
|
|
out:
|
2018-04-26 23:18:21 +08:00
|
|
|
release_sock(sk);
|
|
|
|
|
|
|
|
return rc;
|
2017-01-09 23:55:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* getsockopt() handler of the smc socket.
 *
 * @sock:    smc socket
 * @level:   option level
 * @optname: option name
 * @optval:  user buffer receiving the option value
 * @optlen:  user pointer to the buffer length
 *
 * SOL_SMC options are handled by __smc_getsockopt(); everything else is
 * delegated to the CLC socket while holding clcsock_release_lock.
 *
 * Returns -EBADF if the CLC socket is gone, -EOPNOTSUPP if the CLC
 * socket has no getsockopt operation, otherwise the result of the
 * delegated call.
 */
static int smc_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct smc_sock *smc;
	int rc;

	if (level == SOL_SMC)
		return __smc_getsockopt(sock, level, optname, optval, optlen);

	smc = smc_sk(sock->sk);
	mutex_lock(&smc->clcsock_release_lock);
	if (!smc->clcsock) {
		mutex_unlock(&smc->clcsock_release_lock);
		return -EBADF;
	}
	/* socket options apply to the CLC socket */
	if (unlikely(!smc->clcsock->ops->getsockopt)) {
		mutex_unlock(&smc->clcsock_release_lock);
		return -EOPNOTSUPP;
	}
	rc = smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
					   optval, optlen);
	mutex_unlock(&smc->clcsock_release_lock);
	return rc;
}
|
|
|
|
|
|
|
|
static int smc_ioctl(struct socket *sock, unsigned int cmd,
|
|
|
|
unsigned long arg)
|
|
|
|
{
|
2018-05-23 22:38:11 +08:00
|
|
|
union smc_host_cursor cons, urg;
|
|
|
|
struct smc_connection *conn;
|
2017-01-09 23:55:13 +08:00
|
|
|
struct smc_sock *smc;
|
2018-05-02 22:56:46 +08:00
|
|
|
int answ;
|
2017-01-09 23:55:13 +08:00
|
|
|
|
|
|
|
smc = smc_sk(sock->sk);
|
2018-05-23 22:38:11 +08:00
|
|
|
conn = &smc->conn;
|
2018-08-08 20:13:21 +08:00
|
|
|
lock_sock(&smc->sk);
|
2018-05-02 22:56:46 +08:00
|
|
|
if (smc->use_fallback) {
|
2018-08-08 20:13:21 +08:00
|
|
|
if (!smc->clcsock) {
|
|
|
|
release_sock(&smc->sk);
|
2018-05-02 22:56:46 +08:00
|
|
|
return -EBADF;
|
2018-08-08 20:13:21 +08:00
|
|
|
}
|
|
|
|
answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
|
|
|
|
release_sock(&smc->sk);
|
|
|
|
return answ;
|
2018-05-02 22:56:46 +08:00
|
|
|
}
|
|
|
|
switch (cmd) {
|
|
|
|
case SIOCINQ: /* same as FIONREAD */
|
2018-07-16 19:56:52 +08:00
|
|
|
if (smc->sk.sk_state == SMC_LISTEN) {
|
|
|
|
release_sock(&smc->sk);
|
2018-05-02 22:56:46 +08:00
|
|
|
return -EINVAL;
|
2018-07-16 19:56:52 +08:00
|
|
|
}
|
2018-05-23 22:38:09 +08:00
|
|
|
if (smc->sk.sk_state == SMC_INIT ||
|
|
|
|
smc->sk.sk_state == SMC_CLOSED)
|
|
|
|
answ = 0;
|
|
|
|
else
|
|
|
|
answ = atomic_read(&smc->conn.bytes_to_rcv);
|
2018-05-02 22:56:46 +08:00
|
|
|
break;
|
|
|
|
case SIOCOUTQ:
|
|
|
|
/* output queue size (not send + not acked) */
|
2018-07-16 19:56:52 +08:00
|
|
|
if (smc->sk.sk_state == SMC_LISTEN) {
|
|
|
|
release_sock(&smc->sk);
|
2018-05-02 22:56:46 +08:00
|
|
|
return -EINVAL;
|
2018-07-16 19:56:52 +08:00
|
|
|
}
|
2018-05-23 22:38:09 +08:00
|
|
|
if (smc->sk.sk_state == SMC_INIT ||
|
|
|
|
smc->sk.sk_state == SMC_CLOSED)
|
|
|
|
answ = 0;
|
|
|
|
else
|
|
|
|
answ = smc->conn.sndbuf_desc->len -
|
2018-05-02 22:56:46 +08:00
|
|
|
atomic_read(&smc->conn.sndbuf_space);
|
|
|
|
break;
|
|
|
|
case SIOCOUTQNSD:
|
|
|
|
/* output queue size (not send only) */
|
2018-07-16 19:56:52 +08:00
|
|
|
if (smc->sk.sk_state == SMC_LISTEN) {
|
|
|
|
release_sock(&smc->sk);
|
2018-05-02 22:56:46 +08:00
|
|
|
return -EINVAL;
|
2018-07-16 19:56:52 +08:00
|
|
|
}
|
2018-05-23 22:38:09 +08:00
|
|
|
if (smc->sk.sk_state == SMC_INIT ||
|
|
|
|
smc->sk.sk_state == SMC_CLOSED)
|
|
|
|
answ = 0;
|
|
|
|
else
|
|
|
|
answ = smc_tx_prepared_sends(&smc->conn);
|
2018-05-02 22:56:46 +08:00
|
|
|
break;
|
2018-05-23 22:38:11 +08:00
|
|
|
case SIOCATMARK:
|
2018-07-16 19:56:52 +08:00
|
|
|
if (smc->sk.sk_state == SMC_LISTEN) {
|
|
|
|
release_sock(&smc->sk);
|
2018-05-23 22:38:11 +08:00
|
|
|
return -EINVAL;
|
2018-07-16 19:56:52 +08:00
|
|
|
}
|
2018-05-23 22:38:11 +08:00
|
|
|
if (smc->sk.sk_state == SMC_INIT ||
|
|
|
|
smc->sk.sk_state == SMC_CLOSED) {
|
|
|
|
answ = 0;
|
|
|
|
} else {
|
2018-07-23 19:53:09 +08:00
|
|
|
smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
|
|
|
|
smc_curs_copy(&urg, &conn->urg_curs, conn);
|
2018-05-23 22:38:11 +08:00
|
|
|
answ = smc_curs_diff(conn->rmb_desc->len,
|
|
|
|
&cons, &urg) == 1;
|
|
|
|
}
|
|
|
|
break;
|
2018-05-02 22:56:46 +08:00
|
|
|
default:
|
2018-07-16 19:56:52 +08:00
|
|
|
release_sock(&smc->sk);
|
2018-05-02 22:56:46 +08:00
|
|
|
return -ENOIOCTLCMD;
|
|
|
|
}
|
2018-07-16 19:56:52 +08:00
|
|
|
release_sock(&smc->sk);
|
2018-05-02 22:56:46 +08:00
|
|
|
|
|
|
|
return put_user(answ, (int __user *)arg);
|
2017-01-09 23:55:13 +08:00
|
|
|
}
|
|
|
|
|
2018-05-04 00:12:39 +08:00
|
|
|
/* Map the affected portions of the rmbe into an spd, note the number of bytes
|
|
|
|
* to splice in conn->splice_pending, and press 'go'. Delays consumer cursor
|
|
|
|
* updates till whenever a respective page has been fully processed.
|
|
|
|
* Note that subsequent recv() calls have to wait till all splice() processing
|
|
|
|
* completed.
|
|
|
|
*/
|
2017-01-09 23:55:13 +08:00
|
|
|
static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
|
|
|
|
struct pipe_inode_info *pipe, size_t len,
|
2018-05-04 00:12:39 +08:00
|
|
|
unsigned int flags)
|
2017-01-09 23:55:13 +08:00
|
|
|
{
|
|
|
|
struct sock *sk = sock->sk;
|
|
|
|
struct smc_sock *smc;
|
|
|
|
int rc = -ENOTCONN;
|
|
|
|
|
|
|
|
smc = smc_sk(sk);
|
|
|
|
lock_sock(sk);
|
2019-01-31 01:51:04 +08:00
|
|
|
if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
|
|
|
|
/* socket was connected before, no more data to read */
|
|
|
|
rc = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
2018-05-04 00:12:39 +08:00
|
|
|
if (sk->sk_state == SMC_INIT ||
|
|
|
|
sk->sk_state == SMC_LISTEN ||
|
|
|
|
sk->sk_state == SMC_CLOSED)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
|
|
|
|
rc = 0;
|
2017-01-09 23:55:13 +08:00
|
|
|
goto out;
|
2018-05-04 00:12:39 +08:00
|
|
|
}
|
|
|
|
|
2017-01-09 23:55:13 +08:00
|
|
|
if (smc->use_fallback) {
|
|
|
|
rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
|
|
|
|
pipe, len, flags);
|
|
|
|
} else {
|
2018-05-04 00:12:39 +08:00
|
|
|
if (*ppos) {
|
|
|
|
rc = -ESPIPE;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (flags & SPLICE_F_NONBLOCK)
|
|
|
|
flags = MSG_DONTWAIT;
|
|
|
|
else
|
|
|
|
flags = 0;
|
2021-06-16 22:52:58 +08:00
|
|
|
SMC_STAT_INC(smc, splice_cnt);
|
2018-05-04 00:12:39 +08:00
|
|
|
rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
|
2017-01-09 23:55:13 +08:00
|
|
|
}
|
|
|
|
out:
|
|
|
|
release_sock(sk);
|
2018-05-04 00:12:39 +08:00
|
|
|
|
2017-01-09 23:55:13 +08:00
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* must look like tcp */
|
|
|
|
static const struct proto_ops smc_sock_ops = {
|
|
|
|
.family = PF_SMC,
|
|
|
|
.owner = THIS_MODULE,
|
|
|
|
.release = smc_release,
|
|
|
|
.bind = smc_bind,
|
|
|
|
.connect = smc_connect,
|
|
|
|
.socketpair = sock_no_socketpair,
|
|
|
|
.accept = smc_accept,
|
|
|
|
.getname = smc_getname,
|
2018-06-29 00:43:44 +08:00
|
|
|
.poll = smc_poll,
|
2017-01-09 23:55:13 +08:00
|
|
|
.ioctl = smc_ioctl,
|
|
|
|
.listen = smc_listen,
|
|
|
|
.shutdown = smc_shutdown,
|
|
|
|
.setsockopt = smc_setsockopt,
|
|
|
|
.getsockopt = smc_getsockopt,
|
|
|
|
.sendmsg = smc_sendmsg,
|
|
|
|
.recvmsg = smc_recvmsg,
|
|
|
|
.mmap = sock_no_mmap,
|
|
|
|
.splice_read = smc_splice_read,
|
|
|
|
};
|
|
|
|
|
net/smc: Introduce TCP ULP support
This implements TCP ULP for SMC, helps applications to replace TCP with
SMC protocol in place. And we use it to implement transparent
replacement.
This replaces original TCP sockets with SMC, reuse TCP as clcsock when
calling setsockopt with TCP_ULP option, and without any overhead.
To replace TCP sockets with SMC, there are two approaches:
- use setsockopt() syscall with TCP_ULP option, if error, it would
fallback to TCP.
- use BPF prog with types BPF_CGROUP_INET_SOCK_CREATE or others to
replace transparently. BPF hooks some points in create socket, bind
and others, users can inject their BPF logics without modifying their
applications, and choose which connections should be replaced with SMC
by calling setsockopt() in BPF prog, based on rules, such as TCP tuples,
PID, cgroup, etc...
BPF doesn't support calling setsockopt with TCP_ULP now, I will send the
patches after this accepted.
Signed-off-by: Tony Lu <tonylu@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-12-28 21:44:36 +08:00
|
|
|
static int __smc_create(struct net *net, struct socket *sock, int protocol,
|
|
|
|
int kern, struct socket *clcsock)
|
2017-01-09 23:55:13 +08:00
|
|
|
{
|
2018-03-16 22:06:41 +08:00
|
|
|
int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
|
2017-01-09 23:55:13 +08:00
|
|
|
struct smc_sock *smc;
|
|
|
|
struct sock *sk;
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
rc = -ESOCKTNOSUPPORT;
|
|
|
|
if (sock->type != SOCK_STREAM)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
rc = -EPROTONOSUPPORT;
|
2018-03-16 22:06:41 +08:00
|
|
|
if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
|
2017-01-09 23:55:13 +08:00
|
|
|
goto out;
|
|
|
|
|
|
|
|
rc = -ENOBUFS;
|
|
|
|
sock->ops = &smc_sock_ops;
|
2022-05-13 10:24:53 +08:00
|
|
|
sock->state = SS_UNCONNECTED;
|
2018-03-16 22:06:41 +08:00
|
|
|
sk = smc_sock_alloc(net, sock, protocol);
|
2017-01-09 23:55:13 +08:00
|
|
|
if (!sk)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
/* create internal TCP socket for CLC handshake and fallback */
|
|
|
|
smc = smc_sk(sk);
|
2017-01-09 23:55:16 +08:00
|
|
|
smc->use_fallback = false; /* assume rdma capability first */
|
2018-07-25 22:35:32 +08:00
|
|
|
smc->fallback_rsn = 0;
|
net/smc: Introduce TCP ULP support
This implements TCP ULP for SMC, helps applications to replace TCP with
SMC protocol in place. And we use it to implement transparent
replacement.
This replaces original TCP sockets with SMC, reuse TCP as clcsock when
calling setsockopt with TCP_ULP option, and without any overhead.
To replace TCP sockets with SMC, there are two approaches:
- use setsockopt() syscall with TCP_ULP option, if error, it would
fallback to TCP.
- use BPF prog with types BPF_CGROUP_INET_SOCK_CREATE or others to
replace transparently. BPF hooks some points in create socket, bind
and others, users can inject their BPF logics without modifying their
applications, and choose which connections should be replaced with SMC
by calling setsockopt() in BPF prog, based on rules, such as TCP tuples,
PID, cgroup, etc...
BPF doesn't support calling setsockopt with TCP_ULP now, I will send the
patches after this accepted.
Signed-off-by: Tony Lu <tonylu@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-12-28 21:44:36 +08:00
|
|
|
|
2022-02-10 17:11:38 +08:00
|
|
|
/* default behavior from limit_smc_hs in every net namespace */
|
|
|
|
smc->limit_smc_hs = net->smc.limit_smc_hs;
|
|
|
|
|
net/smc: Introduce TCP ULP support
This implements TCP ULP for SMC, helps applications to replace TCP with
SMC protocol in place. And we use it to implement transparent
replacement.
This replaces original TCP sockets with SMC, reuse TCP as clcsock when
calling setsockopt with TCP_ULP option, and without any overhead.
To replace TCP sockets with SMC, there are two approaches:
- use setsockopt() syscall with TCP_ULP option, if error, it would
fallback to TCP.
- use BPF prog with types BPF_CGROUP_INET_SOCK_CREATE or others to
replace transparently. BPF hooks some points in create socket, bind
and others, users can inject their BPF logics without modifying their
applications, and choose which connections should be replaced with SMC
by calling setsockopt() in BPF prog, based on rules, such as TCP tuples,
PID, cgroup, etc...
BPF doesn't support calling setsockopt with TCP_ULP now, I will send the
patches after this accepted.
Signed-off-by: Tony Lu <tonylu@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-12-28 21:44:36 +08:00
|
|
|
rc = 0;
|
|
|
|
if (!clcsock) {
|
|
|
|
rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
|
|
|
|
&smc->clcsock);
|
|
|
|
if (rc) {
|
|
|
|
sk_common_release(sk);
|
|
|
|
goto out;
|
|
|
|
}
|
2023-04-09 02:49:43 +08:00
|
|
|
|
|
|
|
/* smc_clcsock_release() does not wait smc->clcsock->sk's
|
|
|
|
* destruction; its sk_state might not be TCP_CLOSE after
|
|
|
|
* smc->sk is close()d, and TCP timers can be fired later,
|
|
|
|
* which need net ref.
|
|
|
|
*/
|
|
|
|
sk = smc->clcsock->sk;
|
|
|
|
__netns_tracker_free(net, &sk->ns_tracker, false);
|
|
|
|
sk->sk_net_refcnt = 1;
|
|
|
|
get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
|
|
|
|
sock_inuse_add(net, 1);
|
net/smc: Introduce TCP ULP support
This implements TCP ULP for SMC, helps applications to replace TCP with
SMC protocol in place. And we use it to implement transparent
replacement.
This replaces original TCP sockets with SMC, reuse TCP as clcsock when
calling setsockopt with TCP_ULP option, and without any overhead.
To replace TCP sockets with SMC, there are two approaches:
- use setsockopt() syscall with TCP_ULP option, if error, it would
fallback to TCP.
- use BPF prog with types BPF_CGROUP_INET_SOCK_CREATE or others to
replace transparently. BPF hooks some points in create socket, bind
and others, users can inject their BPF logics without modifying their
applications, and choose which connections should be replaced with SMC
by calling setsockopt() in BPF prog, based on rules, such as TCP tuples,
PID, cgroup, etc...
BPF doesn't support calling setsockopt with TCP_ULP now, I will send the
patches after this accepted.
Signed-off-by: Tony Lu <tonylu@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-12-28 21:44:36 +08:00
|
|
|
} else {
|
|
|
|
smc->clcsock = clcsock;
|
2018-02-28 19:44:09 +08:00
|
|
|
}
|
net/smc: Introduce TCP ULP support
This implements TCP ULP for SMC, helps applications to replace TCP with
SMC protocol in place. And we use it to implement transparent
replacement.
This replaces original TCP sockets with SMC, reuse TCP as clcsock when
calling setsockopt with TCP_ULP option, and without any overhead.
To replace TCP sockets with SMC, there are two approaches:
- use setsockopt() syscall with TCP_ULP option, if error, it would
fallback to TCP.
- use BPF prog with types BPF_CGROUP_INET_SOCK_CREATE or others to
replace transparently. BPF hooks some points in create socket, bind
and others, users can inject their BPF logics without modifying their
applications, and choose which connections should be replaced with SMC
by calling setsockopt() in BPF prog, based on rules, such as TCP tuples,
PID, cgroup, etc...
BPF doesn't support calling setsockopt with TCP_ULP now, I will send the
patches after this accepted.
Signed-off-by: Tony Lu <tonylu@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-12-28 21:44:36 +08:00
|
|
|
|
2017-01-09 23:55:13 +08:00
|
|
|
out:
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
net/smc: Introduce TCP ULP support
This implements TCP ULP for SMC, helps applications to replace TCP with
SMC protocol in place. And we use it to implement transparent
replacement.
This replaces original TCP sockets with SMC, reuse TCP as clcsock when
calling setsockopt with TCP_ULP option, and without any overhead.
To replace TCP sockets with SMC, there are two approaches:
- use setsockopt() syscall with TCP_ULP option, if error, it would
fallback to TCP.
- use BPF prog with types BPF_CGROUP_INET_SOCK_CREATE or others to
replace transparently. BPF hooks some points in create socket, bind
and others, users can inject their BPF logics without modifying their
applications, and choose which connections should be replaced with SMC
by calling setsockopt() in BPF prog, based on rules, such as TCP tuples,
PID, cgroup, etc...
BPF doesn't support calling setsockopt with TCP_ULP now, I will send the
patches after this accepted.
Signed-off-by: Tony Lu <tonylu@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-12-28 21:44:36 +08:00
|
|
|
static int smc_create(struct net *net, struct socket *sock, int protocol,
|
|
|
|
int kern)
|
|
|
|
{
|
|
|
|
return __smc_create(net, sock, protocol, kern, NULL);
|
|
|
|
}
|
|
|
|
|
2017-01-09 23:55:13 +08:00
|
|
|
static const struct net_proto_family smc_sock_family_ops = {
|
|
|
|
.family = PF_SMC,
|
|
|
|
.owner = THIS_MODULE,
|
|
|
|
.create = smc_create,
|
|
|
|
};
|
|
|
|
|
net/smc: Introduce TCP ULP support
This implements TCP ULP for SMC, helps applications to replace TCP with
SMC protocol in place. And we use it to implement transparent
replacement.
This replaces original TCP sockets with SMC, reuse TCP as clcsock when
calling setsockopt with TCP_ULP option, and without any overhead.
To replace TCP sockets with SMC, there are two approaches:
- use setsockopt() syscall with TCP_ULP option, if error, it would
fallback to TCP.
- use BPF prog with types BPF_CGROUP_INET_SOCK_CREATE or others to
replace transparently. BPF hooks some points in create socket, bind
and others, users can inject their BPF logics without modifying their
applications, and choose which connections should be replaced with SMC
by calling setsockopt() in BPF prog, based on rules, such as TCP tuples,
PID, cgroup, etc...
BPF doesn't support calling setsockopt with TCP_ULP now, I will send the
patches after this accepted.
Signed-off-by: Tony Lu <tonylu@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-12-28 21:44:36 +08:00
|
|
|
static int smc_ulp_init(struct sock *sk)
|
|
|
|
{
|
|
|
|
struct socket *tcp = sk->sk_socket;
|
|
|
|
struct net *net = sock_net(sk);
|
|
|
|
struct socket *smcsock;
|
|
|
|
int protocol, ret;
|
|
|
|
|
|
|
|
/* only TCP can be replaced */
|
|
|
|
if (tcp->type != SOCK_STREAM || sk->sk_protocol != IPPROTO_TCP ||
|
|
|
|
(sk->sk_family != AF_INET && sk->sk_family != AF_INET6))
|
|
|
|
return -ESOCKTNOSUPPORT;
|
|
|
|
/* don't handle wq now */
|
|
|
|
if (tcp->state != SS_UNCONNECTED || !tcp->file || tcp->wq.fasync_list)
|
|
|
|
return -ENOTCONN;
|
|
|
|
|
|
|
|
if (sk->sk_family == AF_INET)
|
|
|
|
protocol = SMCPROTO_SMC;
|
|
|
|
else
|
|
|
|
protocol = SMCPROTO_SMC6;
|
|
|
|
|
|
|
|
smcsock = sock_alloc();
|
|
|
|
if (!smcsock)
|
|
|
|
return -ENFILE;
|
|
|
|
|
|
|
|
smcsock->type = SOCK_STREAM;
|
|
|
|
__module_get(THIS_MODULE); /* tried in __tcp_ulp_find_autoload */
|
|
|
|
ret = __smc_create(net, smcsock, protocol, 1, tcp);
|
|
|
|
if (ret) {
|
|
|
|
sock_release(smcsock); /* module_put() which ops won't be NULL */
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* replace tcp socket to smc */
|
|
|
|
smcsock->file = tcp->file;
|
|
|
|
smcsock->file->private_data = smcsock;
|
|
|
|
smcsock->file->f_inode = SOCK_INODE(smcsock); /* replace inode when sock_close */
|
|
|
|
smcsock->file->f_path.dentry->d_inode = SOCK_INODE(smcsock); /* dput() in __fput */
|
|
|
|
tcp->file = NULL;
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void smc_ulp_clone(const struct request_sock *req, struct sock *newsk,
|
|
|
|
const gfp_t priority)
|
|
|
|
{
|
|
|
|
struct inet_connection_sock *icsk = inet_csk(newsk);
|
|
|
|
|
|
|
|
/* don't inherit ulp ops to child when listen */
|
|
|
|
icsk->icsk_ulp_ops = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* TCP ULP named "smc"; registered and unregistered in smc_init()/smc_exit() */
static struct tcp_ulp_ops smc_ulp_ops __read_mostly = {
	.name	= "smc",
	.owner	= THIS_MODULE,
	.init	= smc_ulp_init,
	.clone	= smc_ulp_clone,
};
|
|
|
|
|
2019-02-21 20:01:02 +08:00
|
|
|
/* per-net id of the SMC pernet data; referenced by smc_net_ops.id */
unsigned int smc_net_id;
|
|
|
|
|
|
|
|
/* Per-namespace init: set up the SMC sysctls and then the pnet data of @net.
 *
 * Returns 0 on success or the error of the step that failed. If the pnet
 * step fails, the sysctl setup is undone again so that nothing is left
 * behind for a namespace whose init did not complete.
 */
static __net_init int smc_net_init(struct net *net)
{
	int rc;

	rc = smc_sysctl_net_init(net);
	if (rc)
		return rc;

	rc = smc_pnet_net_init(net);
	if (rc) {
		/* smc_net_exit() is presumably not run for a failed init,
		 * so unwind the sysctl step here
		 */
		smc_sysctl_net_exit(net);
	}
	return rc;
}
|
|
|
|
|
|
|
|
/* Per-namespace exit: tear down the SMC sysctls, then the pnet data of @net */
static void __net_exit smc_net_exit(struct net *net)
{
	smc_sysctl_net_exit(net);
	smc_pnet_net_exit(net);
}
|
|
|
|
|
2021-06-16 22:52:58 +08:00
|
|
|
/* Per-namespace init of the SMC statistics; returns what smc_stats_init()
 * returns.
 */
static __net_init int smc_net_stat_init(struct net *net)
{
	return smc_stats_init(net);
}
|
|
|
|
|
|
|
|
/* Per-namespace exit of the SMC statistics */
static void __net_exit smc_net_stat_exit(struct net *net)
{
	smc_stats_exit(net);
}
|
|
|
|
|
2019-02-21 20:01:02 +08:00
|
|
|
static struct pernet_operations smc_net_ops = {
|
|
|
|
.init = smc_net_init,
|
|
|
|
.exit = smc_net_exit,
|
|
|
|
.id = &smc_net_id,
|
|
|
|
.size = sizeof(struct smc_net),
|
|
|
|
};
|
|
|
|
|
2021-06-16 22:52:58 +08:00
|
|
|
static struct pernet_operations smc_net_stat_ops = {
|
|
|
|
.init = smc_net_stat_init,
|
|
|
|
.exit = smc_net_stat_exit,
|
|
|
|
};
|
|
|
|
|
2017-01-09 23:55:13 +08:00
|
|
|
/* Module init: bring up all SMC components in order. If a step fails, the
 * steps already completed are undone in reverse order through the label
 * ladder at the end, and the error of the failing step is returned.
 */
static int __init smc_init(void)
{
	int rc;

	/* per-namespace data and statistics */
	rc = register_pernet_subsys(&smc_net_ops);
	if (rc)
		return rc;

	rc = register_pernet_subsys(&smc_net_stat_ops);
	if (rc)
		goto out_pernet_subsys;

	rc = smc_ism_init();
	if (rc)
		goto out_pernet_subsys_stat;
	smc_clc_init();

	rc = smc_nl_init();
	if (rc)
		goto out_ism;

	rc = smc_pnet_init();
	if (rc)
		goto out_nl;

	/* workqueues; a failed allocation is reported as -ENOMEM */
	rc = -ENOMEM;

	smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", 0, 0);
	if (!smc_tcp_ls_wq)
		goto out_pnet;

	smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0);
	if (!smc_hs_wq)
		goto out_alloc_tcp_ls_wq;

	smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0);
	if (!smc_close_wq)
		goto out_alloc_hs_wq;

	/* core, LLC and CDC layers */
	rc = smc_core_init();
	if (rc) {
		pr_err("%s: smc_core_init fails with %d\n", __func__, rc);
		goto out_alloc_wqs;
	}

	rc = smc_llc_init();
	if (rc) {
		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
		goto out_core;
	}

	rc = smc_cdc_init();
	if (rc) {
		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
		goto out_core;
	}

	/* protocols and the socket family */
	rc = proto_register(&smc_proto, 1);
	if (rc) {
		pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
		goto out_core;
	}

	rc = proto_register(&smc_proto6, 1);
	if (rc) {
		pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
		goto out_proto;
	}

	rc = sock_register(&smc_sock_family_ops);
	if (rc) {
		pr_err("%s: sock_register fails with %d\n", __func__, rc);
		goto out_proto6;
	}
	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
	INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);

	rc = smc_ib_register_client();
	if (rc) {
		pr_err("%s: ib_register fails with %d\n", __func__, rc);
		goto out_sock;
	}

	rc = tcp_register_ulp(&smc_ulp_ops);
	if (rc) {
		pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc);
		goto out_ib;
	}

	static_branch_enable(&tcp_have_smc);
	return 0;

	/* error unwinding, in reverse order of the setup above */
out_ib:
	smc_ib_unregister_client();
out_sock:
	sock_unregister(PF_SMC);
out_proto6:
	proto_unregister(&smc_proto6);
out_proto:
	proto_unregister(&smc_proto);
out_core:
	smc_core_exit();
out_alloc_wqs:
	destroy_workqueue(smc_close_wq);
out_alloc_hs_wq:
	destroy_workqueue(smc_hs_wq);
out_alloc_tcp_ls_wq:
	destroy_workqueue(smc_tcp_ls_wq);
out_pnet:
	smc_pnet_exit();
out_nl:
	smc_nl_exit();
out_ism:
	smc_clc_exit();
	smc_ism_exit();
out_pernet_subsys_stat:
	unregister_pernet_subsys(&smc_net_stat_ops);
out_pernet_subsys:
	unregister_pernet_subsys(&smc_net_ops);

	return rc;
}
|
|
|
|
|
|
|
|
/* Module exit: shut down everything smc_init() has set up */
static void __exit smc_exit(void)
{
	/* stop offering SMC to users first */
	static_branch_disable(&tcp_have_smc);
	tcp_unregister_ulp(&smc_ulp_ops);
	sock_unregister(PF_SMC);

	/* core and devices */
	smc_core_exit();
	smc_ib_unregister_client();
	smc_ism_exit();

	/* workqueues */
	destroy_workqueue(smc_close_wq);
	destroy_workqueue(smc_tcp_ls_wq);
	destroy_workqueue(smc_hs_wq);

	/* protocols and remaining components */
	proto_unregister(&smc_proto6);
	proto_unregister(&smc_proto);
	smc_pnet_exit();
	smc_nl_exit();
	smc_clc_exit();

	/* per-namespace data */
	unregister_pernet_subsys(&smc_net_stat_ops);
	unregister_pernet_subsys(&smc_net_ops);
	rcu_barrier();
}
|
|
|
|
|
|
|
|
module_init(smc_init);
|
|
|
|
module_exit(smc_exit);
|
|
|
|
|
|
|
|
MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
|
|
|
|
MODULE_DESCRIPTION("smc socket address family");
|
|
|
|
MODULE_LICENSE("GPL");
|
|
|
|
MODULE_ALIAS_NETPROTO(PF_SMC);
|
net/smc: Introduce TCP ULP support
This implements TCP ULP for SMC, helps applications to replace TCP with
SMC protocol in place. And we use it to implement transparent
replacement.
This replaces original TCP sockets with SMC, reuse TCP as clcsock when
calling setsockopt with TCP_ULP option, and without any overhead.
To replace TCP sockets with SMC, there are two approaches:
- use setsockopt() syscall with TCP_ULP option, if error, it would
fallback to TCP.
- use BPF prog with types BPF_CGROUP_INET_SOCK_CREATE or others to
replace transparently. BPF hooks some points in create socket, bind
and others, users can inject their BPF logics without modifying their
applications, and choose which connections should be replaced with SMC
by calling setsockopt() in BPF prog, based on rules, such as TCP tuples,
PID, cgroup, etc...
BPF doesn't support calling setsockopt with TCP_ULP now, I will send the
patches after this accepted.
Signed-off-by: Tony Lu <tonylu@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-12-28 21:44:36 +08:00
|
|
|
MODULE_ALIAS_TCP_ULP("smc");
|
2022-07-25 22:10:00 +08:00
|
|
|
MODULE_ALIAS_GENL_FAMILY(SMC_GENL_FAMILY_NAME);
|