linux/net/mptcp/pm_netlink.c

2462 lines
62 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
*
* Copyright (c) 2020, Red Hat, Inc.
*/
#define pr_fmt(fmt) "MPTCP: " fmt
#include <linux/inet.h>
#include <linux/kernel.h>
#include <net/inet_common.h>
#include <net/netns/generic.h>
#include <net/mptcp.h>
#include "protocol.h"
#include "mib.h"
#include "mptcp_pm_gen.h"
static int pm_nl_pernet_id;
struct mptcp_pm_add_entry {
struct list_head list;
struct mptcp_addr_info addr;
u8 retrans_times;
struct timer_list add_timer;
struct mptcp_sock *sock;
};
struct pm_nl_pernet {
/* protects pernet updates */
spinlock_t lock;
struct list_head local_addr_list;
unsigned int addrs;
unsigned int stale_loss_cnt;
unsigned int add_addr_signal_max;
unsigned int add_addr_accept_max;
unsigned int local_addr_max;
unsigned int subflows_max;
unsigned int next_id;
DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
};
#define MPTCP_PM_ADDR_MAX 8
#define ADD_ADDR_RETRANS_MAX 3
static struct pm_nl_pernet *pm_nl_get_pernet(const struct net *net)
{
return net_generic(net, pm_nl_pernet_id);
}
static struct pm_nl_pernet *
pm_nl_get_pernet_from_msk(const struct mptcp_sock *msk)
{
return pm_nl_get_pernet(sock_net((struct sock *)msk));
}
bool mptcp_addresses_equal(const struct mptcp_addr_info *a,
const struct mptcp_addr_info *b, bool use_port)
{
bool addr_equals = false;
if (a->family == b->family) {
if (a->family == AF_INET)
addr_equals = a->addr.s_addr == b->addr.s_addr;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
else
addr_equals = !ipv6_addr_cmp(&a->addr6, &b->addr6);
} else if (a->family == AF_INET) {
if (ipv6_addr_v4mapped(&b->addr6))
addr_equals = a->addr.s_addr == b->addr6.s6_addr32[3];
} else if (b->family == AF_INET) {
if (ipv6_addr_v4mapped(&a->addr6))
addr_equals = a->addr6.s6_addr32[3] == b->addr.s_addr;
#endif
}
if (!addr_equals)
return false;
if (!use_port)
return true;
return a->port == b->port;
}
void mptcp_local_address(const struct sock_common *skc, struct mptcp_addr_info *addr)
{
addr->family = skc->skc_family;
addr->port = htons(skc->skc_num);
if (addr->family == AF_INET)
addr->addr.s_addr = skc->skc_rcv_saddr;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
else if (addr->family == AF_INET6)
addr->addr6 = skc->skc_v6_rcv_saddr;
#endif
}
static void remote_address(const struct sock_common *skc,
struct mptcp_addr_info *addr)
{
addr->family = skc->skc_family;
addr->port = skc->skc_dport;
if (addr->family == AF_INET)
addr->addr.s_addr = skc->skc_daddr;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
else if (addr->family == AF_INET6)
addr->addr6 = skc->skc_v6_daddr;
#endif
}
static bool lookup_subflow_by_saddr(const struct list_head *list,
const struct mptcp_addr_info *saddr)
{
struct mptcp_subflow_context *subflow;
struct mptcp_addr_info cur;
struct sock_common *skc;
list_for_each_entry(subflow, list, node) {
skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow);
mptcp_local_address(skc, &cur);
if (mptcp_addresses_equal(&cur, saddr, saddr->port))
return true;
}
return false;
}
static bool lookup_subflow_by_daddr(const struct list_head *list,
const struct mptcp_addr_info *daddr)
{
struct mptcp_subflow_context *subflow;
struct mptcp_addr_info cur;
list_for_each_entry(subflow, list, node) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
if (!((1 << inet_sk_state_load(ssk)) &
(TCPF_ESTABLISHED | TCPF_SYN_SENT | TCPF_SYN_RECV)))
continue;
remote_address((struct sock_common *)ssk, &cur);
if (mptcp_addresses_equal(&cur, daddr, daddr->port))
return true;
}
return false;
}
static bool
select_local_address(const struct pm_nl_pernet *pernet,
const struct mptcp_sock *msk,
struct mptcp_pm_local *new_local)
{
struct mptcp_pm_addr_entry *entry;
bool found = false;
msk_owned_by_me(msk);
rcu_read_lock();
list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW))
continue;
if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap))
continue;
new_local->addr = entry->addr;
new_local->flags = entry->flags;
new_local->ifindex = entry->ifindex;
found = true;
break;
}
rcu_read_unlock();
return found;
}
static bool
select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk,
struct mptcp_pm_local *new_local)
{
struct mptcp_pm_addr_entry *entry;
bool found = false;
rcu_read_lock();
/* do not keep any additional per socket state, just signal
* the address list in order.
* Note: removal from the local address list during the msk life-cycle
* can lead to additional addresses not being announced.
*/
list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap))
continue;
if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL))
continue;
new_local->addr = entry->addr;
new_local->flags = entry->flags;
new_local->ifindex = entry->ifindex;
found = true;
break;
}
rcu_read_unlock();
return found;
}
unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk)
{
const struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
return READ_ONCE(pernet->add_addr_signal_max);
}
EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_signal_max);
unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk)
{
struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
return READ_ONCE(pernet->add_addr_accept_max);
}
EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max);
unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk)
{
struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
return READ_ONCE(pernet->subflows_max);
}
EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max);
unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk)
{
struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
return READ_ONCE(pernet->local_addr_max);
}
EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max);
bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk)
{
struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
if (msk->pm.subflows == mptcp_pm_get_subflows_max(msk) ||
(find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap,
MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) {
WRITE_ONCE(msk->pm.work_pending, false);
return false;
}
return true;
}
struct mptcp_pm_add_entry *
mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk,
const struct mptcp_addr_info *addr)
{
struct mptcp_pm_add_entry *entry;
lockdep_assert_held(&msk->pm.lock);
list_for_each_entry(entry, &msk->pm.anno_list, list) {
if (mptcp_addresses_equal(&entry->addr, addr, true))
return entry;
}
return NULL;
}
bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk)
{
struct mptcp_pm_add_entry *entry;
struct mptcp_addr_info saddr;
bool ret = false;
mptcp_local_address((struct sock_common *)sk, &saddr);
spin_lock_bh(&msk->pm.lock);
list_for_each_entry(entry, &msk->pm.anno_list, list) {
if (mptcp_addresses_equal(&entry->addr, &saddr, true)) {
ret = true;
goto out;
}
}
out:
spin_unlock_bh(&msk->pm.lock);
return ret;
}
static void mptcp_pm_add_timer(struct timer_list *timer)
{
struct mptcp_pm_add_entry *entry = from_timer(entry, timer, add_timer);
struct mptcp_sock *msk = entry->sock;
struct sock *sk = (struct sock *)msk;
mptcp: pr_debug: add missing \n at the end pr_debug() have been added in various places in MPTCP code to help developers to debug some situations. With the dynamic debug feature, it is easy to enable all or some of them, and asks users to reproduce issues with extra debug. Many of these pr_debug() don't end with a new line, while no 'pr_cont()' are used in MPTCP code. So the goal was not to display multiple debug messages on one line: they were then not missing the '\n' on purpose. Not having the new line at the end causes these messages to be printed with a delay, when something else needs to be printed. This issue is not visible when many messages need to be printed, but it is annoying and confusing when only specific messages are expected, e.g. # echo "func mptcp_pm_add_addr_echoed +fmp" \ > /sys/kernel/debug/dynamic_debug/control # ./mptcp_join.sh "signal address"; \ echo "$(awk '{print $1}' /proc/uptime) - end"; \ sleep 5s; \ echo "$(awk '{print $1}' /proc/uptime) - restart"; \ ./mptcp_join.sh "signal address" 013 signal address (...) 10.75 - end 15.76 - restart 013 signal address [ 10.367935] mptcp:mptcp_pm_add_addr_echoed: MPTCP: msk=(...) (...) => a delay of 5 seconds: printed with a 10.36 ts, but after 'restart' which was printed at the 15.76 ts. The 'Fixes' tag here below points to the first pr_debug() used without '\n' in net/mptcp. This patch could be split in many small ones, with different Fixes tag, but it doesn't seem worth it, because it is easy to re-generate this patch with this simple 'sed' command: git grep -l pr_debug -- net/mptcp | xargs sed -i "s/\(pr_debug(\".*[^n]\)\(\"[,)]\)/\1\\\n\2/g" So in case of conflicts, simply drop the modifications, and launch this command. Fixes: f870fa0b5768 ("mptcp: Add MPTCP socket stubs") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang <geliang@kernel.org> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Link: https://patch.msgid.link/20240826-net-mptcp-close-extra-sf-fin-v1-4-905199fe1172@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-08-27 01:11:21 +08:00
pr_debug("msk=%p\n", msk);
if (!msk)
return;
if (inet_sk_state_load(sk) == TCP_CLOSE)
return;
if (!entry->addr.id)
return;
if (mptcp_pm_should_add_signal_addr(msk)) {
sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8);
goto out;
}
spin_lock_bh(&msk->pm.lock);
if (!mptcp_pm_should_add_signal_addr(msk)) {
mptcp: pr_debug: add missing \n at the end pr_debug() have been added in various places in MPTCP code to help developers to debug some situations. With the dynamic debug feature, it is easy to enable all or some of them, and asks users to reproduce issues with extra debug. Many of these pr_debug() don't end with a new line, while no 'pr_cont()' are used in MPTCP code. So the goal was not to display multiple debug messages on one line: they were then not missing the '\n' on purpose. Not having the new line at the end causes these messages to be printed with a delay, when something else needs to be printed. This issue is not visible when many messages need to be printed, but it is annoying and confusing when only specific messages are expected, e.g. # echo "func mptcp_pm_add_addr_echoed +fmp" \ > /sys/kernel/debug/dynamic_debug/control # ./mptcp_join.sh "signal address"; \ echo "$(awk '{print $1}' /proc/uptime) - end"; \ sleep 5s; \ echo "$(awk '{print $1}' /proc/uptime) - restart"; \ ./mptcp_join.sh "signal address" 013 signal address (...) 10.75 - end 15.76 - restart 013 signal address [ 10.367935] mptcp:mptcp_pm_add_addr_echoed: MPTCP: msk=(...) (...) => a delay of 5 seconds: printed with a 10.36 ts, but after 'restart' which was printed at the 15.76 ts. The 'Fixes' tag here below points to the first pr_debug() used without '\n' in net/mptcp. This patch could be split in many small ones, with different Fixes tag, but it doesn't seem worth it, because it is easy to re-generate this patch with this simple 'sed' command: git grep -l pr_debug -- net/mptcp | xargs sed -i "s/\(pr_debug(\".*[^n]\)\(\"[,)]\)/\1\\\n\2/g" So in case of conflicts, simply drop the modifications, and launch this command. Fixes: f870fa0b5768 ("mptcp: Add MPTCP socket stubs") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang <geliang@kernel.org> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Link: https://patch.msgid.link/20240826-net-mptcp-close-extra-sf-fin-v1-4-905199fe1172@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-08-27 01:11:21 +08:00
pr_debug("retransmit ADD_ADDR id=%d\n", entry->addr.id);
mptcp_pm_announce_addr(msk, &entry->addr, false);
mptcp_pm_add_addr_send_ack(msk);
entry->retrans_times++;
}
if (entry->retrans_times < ADD_ADDR_RETRANS_MAX)
sk_reset_timer(sk, timer,
jiffies + mptcp_get_add_addr_timeout(sock_net(sk)));
spin_unlock_bh(&msk->pm.lock);
if (entry->retrans_times == ADD_ADDR_RETRANS_MAX)
mptcp_pm_subflow_established(msk);
out:
__sock_put(sk);
}
struct mptcp_pm_add_entry *
mptcp_pm_del_add_timer(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr, bool check_id)
{
struct mptcp_pm_add_entry *entry;
struct sock *sk = (struct sock *)msk;
spin_lock_bh(&msk->pm.lock);
entry = mptcp_lookup_anno_list_by_saddr(msk, addr);
mptcp: validate 'id' when stopping the ADD_ADDR retransmit timer when Linux receives an echo-ed ADD_ADDR, it checks the IP address against the list of "announced" addresses. In case of a positive match, the timer that handles retransmissions is stopped regardless of the 'Address Id' in the received packet: this behaviour does not comply with RFC8684 3.4.1. Fix it by validating the 'Address Id' in received echo-ed ADD_ADDRs. Tested using packetdrill, with the following captured output: unpatched kernel: Out <...> Flags [.], ack 1, win 256, options [mptcp add-addr v1 id 1 198.51.100.2 hmac 0xfd2e62517888fe29,mptcp dss ack 3007449509], length 0 In <...> Flags [.], ack 1, win 257, options [mptcp add-addr v1-echo id 1 1.2.3.4,mptcp dss ack 3013740213], length 0 Out <...> Flags [.], ack 1, win 256, options [mptcp add-addr v1 id 1 198.51.100.2 hmac 0xfd2e62517888fe29,mptcp dss ack 3007449509], length 0 In <...> Flags [.], ack 1, win 257, options [mptcp add-addr v1-echo id 90 198.51.100.2,mptcp dss ack 3013740213], length 0 ^^^ retransmission is stopped here, but 'Address Id' is 90 patched kernel: Out <...> Flags [.], ack 1, win 256, options [mptcp add-addr v1 id 1 198.51.100.2 hmac 0x1cf372d59e05f4b8,mptcp dss ack 3007449509], length 0 In <...> Flags [.], ack 1, win 257, options [mptcp add-addr v1-echo id 1 1.2.3.4,mptcp dss ack 1672384568], length 0 Out <...> Flags [.], ack 1, win 256, options [mptcp add-addr v1 id 1 198.51.100.2 hmac 0x1cf372d59e05f4b8,mptcp dss ack 3007449509], length 0 In <...> Flags [.], ack 1, win 257, options [mptcp add-addr v1-echo id 90 198.51.100.2,mptcp dss ack 1672384568], length 0 Out <...> Flags [.], ack 1, win 256, options [mptcp add-addr v1 id 1 198.51.100.2 hmac 0x1cf372d59e05f4b8,mptcp dss ack 3007449509], length 0 In <...> Flags [.], ack 1, win 257, options [mptcp add-addr v1-echo id 1 198.51.100.2,mptcp dss ack 1672384568], length 0 ^^^ retransmission is stopped here, only when both 'Address Id' and 'IP Address' match Fixes: 00cfd77b9063 ("mptcp: retransmit ADD_ADDR when timeout") Signed-off-by: Davide Caratti <dcaratti@redhat.com> Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-05-26 05:23:13 +08:00
if (entry && (!check_id || entry->addr.id == addr->id))
entry->retrans_times = ADD_ADDR_RETRANS_MAX;
spin_unlock_bh(&msk->pm.lock);
mptcp: validate 'id' when stopping the ADD_ADDR retransmit timer when Linux receives an echo-ed ADD_ADDR, it checks the IP address against the list of "announced" addresses. In case of a positive match, the timer that handles retransmissions is stopped regardless of the 'Address Id' in the received packet: this behaviour does not comply with RFC8684 3.4.1. Fix it by validating the 'Address Id' in received echo-ed ADD_ADDRs. Tested using packetdrill, with the following captured output: unpatched kernel: Out <...> Flags [.], ack 1, win 256, options [mptcp add-addr v1 id 1 198.51.100.2 hmac 0xfd2e62517888fe29,mptcp dss ack 3007449509], length 0 In <...> Flags [.], ack 1, win 257, options [mptcp add-addr v1-echo id 1 1.2.3.4,mptcp dss ack 3013740213], length 0 Out <...> Flags [.], ack 1, win 256, options [mptcp add-addr v1 id 1 198.51.100.2 hmac 0xfd2e62517888fe29,mptcp dss ack 3007449509], length 0 In <...> Flags [.], ack 1, win 257, options [mptcp add-addr v1-echo id 90 198.51.100.2,mptcp dss ack 3013740213], length 0 ^^^ retransmission is stopped here, but 'Address Id' is 90 patched kernel: Out <...> Flags [.], ack 1, win 256, options [mptcp add-addr v1 id 1 198.51.100.2 hmac 0x1cf372d59e05f4b8,mptcp dss ack 3007449509], length 0 In <...> Flags [.], ack 1, win 257, options [mptcp add-addr v1-echo id 1 1.2.3.4,mptcp dss ack 1672384568], length 0 Out <...> Flags [.], ack 1, win 256, options [mptcp add-addr v1 id 1 198.51.100.2 hmac 0x1cf372d59e05f4b8,mptcp dss ack 3007449509], length 0 In <...> Flags [.], ack 1, win 257, options [mptcp add-addr v1-echo id 90 198.51.100.2,mptcp dss ack 1672384568], length 0 Out <...> Flags [.], ack 1, win 256, options [mptcp add-addr v1 id 1 198.51.100.2 hmac 0x1cf372d59e05f4b8,mptcp dss ack 3007449509], length 0 In <...> Flags [.], ack 1, win 257, options [mptcp add-addr v1-echo id 1 198.51.100.2,mptcp dss ack 1672384568], length 0 ^^^ retransmission is stopped here, only when both 'Address Id' and 'IP Address' match Fixes: 00cfd77b9063 ("mptcp: retransmit ADD_ADDR when timeout") Signed-off-by: Davide Caratti <dcaratti@redhat.com> Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-05-26 05:23:13 +08:00
if (entry && (!check_id || entry->addr.id == addr->id))
sk_stop_timer_sync(sk, &entry->add_timer);
return entry;
}
bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr)
{
struct mptcp_pm_add_entry *add_entry = NULL;
struct sock *sk = (struct sock *)msk;
struct net *net = sock_net(sk);
lockdep_assert_held(&msk->pm.lock);
add_entry = mptcp_lookup_anno_list_by_saddr(msk, addr);
if (add_entry) {
if (WARN_ON_ONCE(mptcp_pm_is_kernel(msk)))
return false;
sk_reset_timer(sk, &add_entry->add_timer,
jiffies + mptcp_get_add_addr_timeout(net));
return true;
}
add_entry = kmalloc(sizeof(*add_entry), GFP_ATOMIC);
if (!add_entry)
return false;
list_add(&add_entry->list, &msk->pm.anno_list);
add_entry->addr = *addr;
add_entry->sock = msk;
add_entry->retrans_times = 0;
timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0);
sk_reset_timer(sk, &add_entry->add_timer,
jiffies + mptcp_get_add_addr_timeout(net));
return true;
}
void mptcp_pm_free_anno_list(struct mptcp_sock *msk)
{
struct mptcp_pm_add_entry *entry, *tmp;
struct sock *sk = (struct sock *)msk;
LIST_HEAD(free_list);
mptcp: pr_debug: add missing \n at the end pr_debug() have been added in various places in MPTCP code to help developers to debug some situations. With the dynamic debug feature, it is easy to enable all or some of them, and asks users to reproduce issues with extra debug. Many of these pr_debug() don't end with a new line, while no 'pr_cont()' are used in MPTCP code. So the goal was not to display multiple debug messages on one line: they were then not missing the '\n' on purpose. Not having the new line at the end causes these messages to be printed with a delay, when something else needs to be printed. This issue is not visible when many messages need to be printed, but it is annoying and confusing when only specific messages are expected, e.g. # echo "func mptcp_pm_add_addr_echoed +fmp" \ > /sys/kernel/debug/dynamic_debug/control # ./mptcp_join.sh "signal address"; \ echo "$(awk '{print $1}' /proc/uptime) - end"; \ sleep 5s; \ echo "$(awk '{print $1}' /proc/uptime) - restart"; \ ./mptcp_join.sh "signal address" 013 signal address (...) 10.75 - end 15.76 - restart 013 signal address [ 10.367935] mptcp:mptcp_pm_add_addr_echoed: MPTCP: msk=(...) (...) => a delay of 5 seconds: printed with a 10.36 ts, but after 'restart' which was printed at the 15.76 ts. The 'Fixes' tag here below points to the first pr_debug() used without '\n' in net/mptcp. This patch could be split in many small ones, with different Fixes tag, but it doesn't seem worth it, because it is easy to re-generate this patch with this simple 'sed' command: git grep -l pr_debug -- net/mptcp | xargs sed -i "s/\(pr_debug(\".*[^n]\)\(\"[,)]\)/\1\\\n\2/g" So in case of conflicts, simply drop the modifications, and launch this command. Fixes: f870fa0b5768 ("mptcp: Add MPTCP socket stubs") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang <geliang@kernel.org> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Link: https://patch.msgid.link/20240826-net-mptcp-close-extra-sf-fin-v1-4-905199fe1172@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-08-27 01:11:21 +08:00
pr_debug("msk=%p\n", msk);
spin_lock_bh(&msk->pm.lock);
list_splice_init(&msk->pm.anno_list, &free_list);
spin_unlock_bh(&msk->pm.lock);
list_for_each_entry_safe(entry, tmp, &free_list, list) {
sk_stop_timer_sync(sk, &entry->add_timer);
kfree(entry);
}
}
/* Fill all the remote addresses into the array addrs[],
* and return the array size.
*/
static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk,
struct mptcp_addr_info *local,
bool fullmesh,
struct mptcp_addr_info *addrs)
{
bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0);
struct sock *sk = (struct sock *)msk, *ssk;
struct mptcp_subflow_context *subflow;
struct mptcp_addr_info remote = { 0 };
unsigned int subflows_max;
int i = 0;
subflows_max = mptcp_pm_get_subflows_max(msk);
remote_address((struct sock_common *)sk, &remote);
/* Non-fullmesh endpoint, fill in the single entry
* corresponding to the primary MPC subflow remote address
*/
if (!fullmesh) {
if (deny_id0)
return 0;
if (!mptcp_pm_addr_families_match(sk, local, &remote))
return 0;
msk->pm.subflows++;
addrs[i++] = remote;
} else {
DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
/* Forbid creation of new subflows matching existing
* ones, possibly already created by incoming ADD_ADDR
*/
bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
mptcp_for_each_subflow(msk, subflow)
if (READ_ONCE(subflow->local_id) == local->id)
__set_bit(subflow->remote_id, unavail_id);
mptcp_for_each_subflow(msk, subflow) {
ssk = mptcp_subflow_tcp_sock(subflow);
remote_address((struct sock_common *)ssk, &addrs[i]);
addrs[i].id = READ_ONCE(subflow->remote_id);
if (deny_id0 && !addrs[i].id)
continue;
if (test_bit(addrs[i].id, unavail_id))
continue;
if (!mptcp_pm_addr_families_match(sk, local, &addrs[i]))
continue;
if (msk->pm.subflows < subflows_max) {
/* forbid creating multiple address towards
* this id
*/
__set_bit(addrs[i].id, unavail_id);
msk->pm.subflows++;
i++;
}
}
}
return i;
}
static void __mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow,
bool prio, bool backup)
{
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
bool slow;
mptcp: pr_debug: add missing \n at the end pr_debug() have been added in various places in MPTCP code to help developers to debug some situations. With the dynamic debug feature, it is easy to enable all or some of them, and asks users to reproduce issues with extra debug. Many of these pr_debug() don't end with a new line, while no 'pr_cont()' are used in MPTCP code. So the goal was not to display multiple debug messages on one line: they were then not missing the '\n' on purpose. Not having the new line at the end causes these messages to be printed with a delay, when something else needs to be printed. This issue is not visible when many messages need to be printed, but it is annoying and confusing when only specific messages are expected, e.g. # echo "func mptcp_pm_add_addr_echoed +fmp" \ > /sys/kernel/debug/dynamic_debug/control # ./mptcp_join.sh "signal address"; \ echo "$(awk '{print $1}' /proc/uptime) - end"; \ sleep 5s; \ echo "$(awk '{print $1}' /proc/uptime) - restart"; \ ./mptcp_join.sh "signal address" 013 signal address (...) 10.75 - end 15.76 - restart 013 signal address [ 10.367935] mptcp:mptcp_pm_add_addr_echoed: MPTCP: msk=(...) (...) => a delay of 5 seconds: printed with a 10.36 ts, but after 'restart' which was printed at the 15.76 ts. The 'Fixes' tag here below points to the first pr_debug() used without '\n' in net/mptcp. This patch could be split in many small ones, with different Fixes tag, but it doesn't seem worth it, because it is easy to re-generate this patch with this simple 'sed' command: git grep -l pr_debug -- net/mptcp | xargs sed -i "s/\(pr_debug(\".*[^n]\)\(\"[,)]\)/\1\\\n\2/g" So in case of conflicts, simply drop the modifications, and launch this command. Fixes: f870fa0b5768 ("mptcp: Add MPTCP socket stubs") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang <geliang@kernel.org> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Link: https://patch.msgid.link/20240826-net-mptcp-close-extra-sf-fin-v1-4-905199fe1172@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-08-27 01:11:21 +08:00
pr_debug("send ack for %s\n",
prio ? "mp_prio" : (mptcp_pm_should_add_signal(msk) ? "add_addr" : "rm_addr"));
slow = lock_sock_fast(ssk);
if (prio) {
subflow->send_mp_prio = 1;
subflow->request_bkup = backup;
}
__mptcp_subflow_send_ack(ssk);
unlock_sock_fast(ssk, slow);
}
static void mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow,
bool prio, bool backup)
{
spin_unlock_bh(&msk->pm.lock);
__mptcp_pm_send_ack(msk, subflow, prio, backup);
spin_lock_bh(&msk->pm.lock);
}
static struct mptcp_pm_addr_entry *
__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id)
{
struct mptcp_pm_addr_entry *entry;
list_for_each_entry(entry, &pernet->local_addr_list, list) {
if (entry->addr.id == id)
return entry;
}
return NULL;
}
static struct mptcp_pm_addr_entry *
__lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info)
{
struct mptcp_pm_addr_entry *entry;
list_for_each_entry(entry, &pernet->local_addr_list, list) {
if (mptcp_addresses_equal(&entry->addr, info, entry->addr.port))
return entry;
}
return NULL;
}
static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
{
struct sock *sk = (struct sock *)msk;
unsigned int add_addr_signal_max;
bool signal_and_subflow = false;
unsigned int local_addr_max;
struct pm_nl_pernet *pernet;
struct mptcp_pm_local local;
unsigned int subflows_max;
pernet = pm_nl_get_pernet(sock_net(sk));
add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk);
local_addr_max = mptcp_pm_get_local_addr_max(msk);
subflows_max = mptcp_pm_get_subflows_max(msk);
/* do lazy endpoint usage accounting for the MPC subflows */
if (unlikely(!(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED))) && msk->first) {
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(msk->first);
struct mptcp_pm_addr_entry *entry;
struct mptcp_addr_info mpc_addr;
bool backup = false;
mptcp_local_address((struct sock_common *)msk->first, &mpc_addr);
rcu_read_lock();
entry = __lookup_addr(pernet, &mpc_addr);
if (entry) {
__clear_bit(entry->addr.id, msk->pm.id_avail_bitmap);
msk->mpc_endpoint_id = entry->addr.id;
backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
}
rcu_read_unlock();
if (backup)
mptcp_pm_send_ack(msk, subflow, true, backup);
msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED);
}
pr_debug("local %d:%d signal %d:%d subflows %d:%d\n",
msk->pm.local_addr_used, local_addr_max,
msk->pm.add_addr_signaled, add_addr_signal_max,
msk->pm.subflows, subflows_max);
/* check first for announce */
if (msk->pm.add_addr_signaled < add_addr_signal_max) {
/* due to racing events on both ends we can reach here while
* previous add address is still running: if we invoke now
* mptcp_pm_announce_addr(), that will fail and the
* corresponding id will be marked as used.
* Instead let the PM machinery reschedule us when the
* current address announce will be completed.
*/
if (msk->pm.addr_signal & BIT(MPTCP_ADD_ADDR_SIGNAL))
return;
if (!select_signal_address(pernet, msk, &local))
goto subflow;
/* If the alloc fails, we are on memory pressure, not worth
* continuing, and trying to create subflows.
*/
if (!mptcp_pm_alloc_anno_list(msk, &local.addr))
return;
__clear_bit(local.addr.id, msk->pm.id_avail_bitmap);
msk->pm.add_addr_signaled++;
/* Special case for ID0: set the correct ID */
if (local.addr.id == msk->mpc_endpoint_id)
local.addr.id = 0;
mptcp_pm_announce_addr(msk, &local.addr, false);
mptcp_pm_nl_addr_send_ack(msk);
mptcp: pm: do not ignore 'subflow' if 'signal' flag is also set Up to the 'Fixes' commit, having an endpoint with both the 'signal' and 'subflow' flags, resulted in the creation of a subflow and an address announcement using the address linked to this endpoint. After this commit, only the address announcement was done, ignoring the 'subflow' flag. That's because the same bitmap is used for the two flags. It is OK to keep this single bitmap, the already selected local endpoint simply have to be re-used, but not via select_local_address() not to look at the just modified bitmap. Note that it is unusual to set the two flags together: creating a new subflow using a new local address will implicitly advertise it to the other peer. So in theory, no need to advertise it explicitly as well. Maybe there are use-cases -- the subflow might not reach the other peer that way, we can ask the other peer to try initiating the new subflow without delay -- or very likely the user is confused, and put both flags "just to be sure at least the right one is set". Still, if it is allowed, the kernel should do what has been asked: using this endpoint to announce the address and to create a new subflow from it. An alternative is to forbid the use of the two flags together, but that's probably too late, there are maybe use-cases, and it was working before. This patch will avoid people complaining subflows are not created using the endpoint they added with the 'subflow' and 'signal' flag. Note that with the current patch, the subflow might not be created in some corner cases, e.g. if the 'subflows' limit was reached when sending the ADD_ADDR, but changed later on. It is probably not worth splitting id_avail_bitmap per target ('signal', 'subflow'), which will add another large field to the msk "just" to track (again) endpoints. Anyway, currently when the limits are changed, the kernel doesn't check if new subflows can be created or removed, because we would need to keep track of the received ADD_ADDR, and more. It sounds OK to assume that the limits should be properly configured before establishing new connections. Fixes: 86e39e04482b ("mptcp: keep track of local endpoint still available for each msk") Cc: stable@vger.kernel.org Suggested-by: Paolo Abeni <pabeni@redhat.com> Reviewed-by: Mat Martineau <martineau@kernel.org> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Link: https://patch.msgid.link/20240731-upstream-net-20240731-mptcp-endp-subflow-signal-v1-5-c8a9b036493b@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-07-31 19:05:57 +08:00
if (local.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
signal_and_subflow = true;
}
subflow:
/* check if should create a new subflow */
while (msk->pm.local_addr_used < local_addr_max &&
msk->pm.subflows < subflows_max) {
struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX];
bool fullmesh;
int i, nr;
if (signal_and_subflow)
signal_and_subflow = false;
else if (!select_local_address(pernet, msk, &local))
break;
fullmesh = !!(local.flags & MPTCP_PM_ADDR_FLAG_FULLMESH);
__clear_bit(local.addr.id, msk->pm.id_avail_bitmap);
/* Special case for ID0: set the correct ID */
if (local.addr.id == msk->mpc_endpoint_id)
local.addr.id = 0;
else /* local_addr_used is not decr for ID 0 */
msk->pm.local_addr_used++;
nr = fill_remote_addresses_vec(msk, &local.addr, fullmesh, addrs);
if (nr == 0)
continue;
spin_unlock_bh(&msk->pm.lock);
for (i = 0; i < nr; i++)
__mptcp_subflow_connect(sk, &local, &addrs[i]);
spin_lock_bh(&msk->pm.lock);
}
mptcp_pm_nl_check_work_pending(msk);
}
static void mptcp_pm_nl_fully_established(struct mptcp_sock *msk)
{
mptcp_pm_create_subflow_or_signal_addr(msk);
}
static void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk)
{
mptcp_pm_create_subflow_or_signal_addr(msk);
}
/* Fill all the local addresses into the array addrs[],
* and return the array size.
*/
static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk,
struct mptcp_addr_info *remote,
struct mptcp_pm_local *locals)
{
struct sock *sk = (struct sock *)msk;
struct mptcp_pm_addr_entry *entry;
struct mptcp_addr_info mpc_addr;
struct pm_nl_pernet *pernet;
unsigned int subflows_max;
int i = 0;
pernet = pm_nl_get_pernet_from_msk(msk);
subflows_max = mptcp_pm_get_subflows_max(msk);
mptcp_local_address((struct sock_common *)msk, &mpc_addr);
rcu_read_lock();
list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH))
continue;
if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote))
continue;
if (msk->pm.subflows < subflows_max) {
locals[i].addr = entry->addr;
locals[i].flags = entry->flags;
locals[i].ifindex = entry->ifindex;
/* Special case for ID0: set the correct ID */
if (mptcp_addresses_equal(&locals[i].addr, &mpc_addr, locals[i].addr.port))
locals[i].addr.id = 0;
msk->pm.subflows++;
i++;
}
}
rcu_read_unlock();
/* If the array is empty, fill in the single
* 'IPADDRANY' local address
*/
if (!i) {
memset(&locals[i], 0, sizeof(locals[i]));
locals[i].addr.family =
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
remote->family == AF_INET6 &&
ipv6_addr_v4mapped(&remote->addr6) ? AF_INET :
#endif
remote->family;
if (!mptcp_pm_addr_families_match(sk, &locals[i].addr, remote))
return 0;
msk->pm.subflows++;
i++;
}
return i;
}
static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
{
struct mptcp_pm_local locals[MPTCP_PM_ADDR_MAX];
struct sock *sk = (struct sock *)msk;
unsigned int add_addr_accept_max;
struct mptcp_addr_info remote;
unsigned int subflows_max;
mptcp: pm: update add_addr counters after connect The creation of new subflows can fail for different reasons. If no subflow have been created using the received ADD_ADDR, the related counters should not be updated, otherwise they will never be decremented for events related to this ID later on. For the moment, the number of accepted ADD_ADDR is only decremented upon the reception of a related RM_ADDR, and only if the remote address ID is currently being used by at least one subflow. In other words, if no subflow can be created with the received address, the counter will not be decremented. In this case, it is then important not to increment pm.add_addr_accepted counter, and not to modify pm.accept_addr bit. Note that this patch does not modify the behaviour in case of failures later on, e.g. if the MP Join is dropped or rejected. The "remove invalid addresses" MP Join subtest has been modified to validate this case. The broadcast IP address is added before the "valid" address that will be used to successfully create a subflow, and the limit is decreased by one: without this patch, it was not possible to create the last subflow, because: - the broadcast address would have been accepted even if it was not usable: the creation of a subflow to this address results in an error, - the limit of 2 accepted ADD_ADDR would have then been reached. Fixes: 01cacb00b35c ("mptcp: add netlink-based PM") Cc: stable@vger.kernel.org Co-developed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Signed-off-by: YonglongLi <liyonglong@chinatelecom.cn> Reviewed-by: Mat Martineau <martineau@kernel.org> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Link: https://lore.kernel.org/r/20240607-upstream-net-20240607-misc-fixes-v1-3-1ab9ddfa3d00@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-06-07 23:01:50 +08:00
bool sf_created = false;
int i, nr;
add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk);
subflows_max = mptcp_pm_get_subflows_max(msk);
mptcp: pr_debug: add missing \n at the end pr_debug() have been added in various places in MPTCP code to help developers to debug some situations. With the dynamic debug feature, it is easy to enable all or some of them, and asks users to reproduce issues with extra debug. Many of these pr_debug() don't end with a new line, while no 'pr_cont()' are used in MPTCP code. So the goal was not to display multiple debug messages on one line: they were then not missing the '\n' on purpose. Not having the new line at the end causes these messages to be printed with a delay, when something else needs to be printed. This issue is not visible when many messages need to be printed, but it is annoying and confusing when only specific messages are expected, e.g. # echo "func mptcp_pm_add_addr_echoed +fmp" \ > /sys/kernel/debug/dynamic_debug/control # ./mptcp_join.sh "signal address"; \ echo "$(awk '{print $1}' /proc/uptime) - end"; \ sleep 5s; \ echo "$(awk '{print $1}' /proc/uptime) - restart"; \ ./mptcp_join.sh "signal address" 013 signal address (...) 10.75 - end 15.76 - restart 013 signal address [ 10.367935] mptcp:mptcp_pm_add_addr_echoed: MPTCP: msk=(...) (...) => a delay of 5 seconds: printed with a 10.36 ts, but after 'restart' which was printed at the 15.76 ts. The 'Fixes' tag here below points to the first pr_debug() used without '\n' in net/mptcp. This patch could be split in many small ones, with different Fixes tag, but it doesn't seem worth it, because it is easy to re-generate this patch with this simple 'sed' command: git grep -l pr_debug -- net/mptcp | xargs sed -i "s/\(pr_debug(\".*[^n]\)\(\"[,)]\)/\1\\\n\2/g" So in case of conflicts, simply drop the modifications, and launch this command. Fixes: f870fa0b5768 ("mptcp: Add MPTCP socket stubs") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang <geliang@kernel.org> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Link: https://patch.msgid.link/20240826-net-mptcp-close-extra-sf-fin-v1-4-905199fe1172@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-08-27 01:11:21 +08:00
pr_debug("accepted %d:%d remote family %d\n",
msk->pm.add_addr_accepted, add_addr_accept_max,
msk->pm.remote.family);
remote = msk->pm.remote;
mptcp_pm_announce_addr(msk, &remote, true);
mptcp_pm_nl_addr_send_ack(msk);
if (lookup_subflow_by_daddr(&msk->conn_list, &remote))
return;
/* pick id 0 port, if none is provided the remote address */
if (!remote.port)
remote.port = sk->sk_dport;
/* connect to the specified remote address, using whatever
* local address the routing configuration will pick.
*/
nr = fill_local_addresses_vec(msk, &remote, locals);
if (nr == 0)
return;
spin_unlock_bh(&msk->pm.lock);
for (i = 0; i < nr; i++)
if (__mptcp_subflow_connect(sk, &locals[i], &remote) == 0)
mptcp: pm: update add_addr counters after connect The creation of new subflows can fail for different reasons. If no subflow have been created using the received ADD_ADDR, the related counters should not be updated, otherwise they will never be decremented for events related to this ID later on. For the moment, the number of accepted ADD_ADDR is only decremented upon the reception of a related RM_ADDR, and only if the remote address ID is currently being used by at least one subflow. In other words, if no subflow can be created with the received address, the counter will not be decremented. In this case, it is then important not to increment pm.add_addr_accepted counter, and not to modify pm.accept_addr bit. Note that this patch does not modify the behaviour in case of failures later on, e.g. if the MP Join is dropped or rejected. The "remove invalid addresses" MP Join subtest has been modified to validate this case. The broadcast IP address is added before the "valid" address that will be used to successfully create a subflow, and the limit is decreased by one: without this patch, it was not possible to create the last subflow, because: - the broadcast address would have been accepted even if it was not usable: the creation of a subflow to this address results in an error, - the limit of 2 accepted ADD_ADDR would have then been reached. Fixes: 01cacb00b35c ("mptcp: add netlink-based PM") Cc: stable@vger.kernel.org Co-developed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Signed-off-by: YonglongLi <liyonglong@chinatelecom.cn> Reviewed-by: Mat Martineau <martineau@kernel.org> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Link: https://lore.kernel.org/r/20240607-upstream-net-20240607-misc-fixes-v1-3-1ab9ddfa3d00@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-06-07 23:01:50 +08:00
sf_created = true;
spin_lock_bh(&msk->pm.lock);
mptcp: pm: update add_addr counters after connect The creation of new subflows can fail for different reasons. If no subflow have been created using the received ADD_ADDR, the related counters should not be updated, otherwise they will never be decremented for events related to this ID later on. For the moment, the number of accepted ADD_ADDR is only decremented upon the reception of a related RM_ADDR, and only if the remote address ID is currently being used by at least one subflow. In other words, if no subflow can be created with the received address, the counter will not be decremented. In this case, it is then important not to increment pm.add_addr_accepted counter, and not to modify pm.accept_addr bit. Note that this patch does not modify the behaviour in case of failures later on, e.g. if the MP Join is dropped or rejected. The "remove invalid addresses" MP Join subtest has been modified to validate this case. The broadcast IP address is added before the "valid" address that will be used to successfully create a subflow, and the limit is decreased by one: without this patch, it was not possible to create the last subflow, because: - the broadcast address would have been accepted even if it was not usable: the creation of a subflow to this address results in an error, - the limit of 2 accepted ADD_ADDR would have then been reached. Fixes: 01cacb00b35c ("mptcp: add netlink-based PM") Cc: stable@vger.kernel.org Co-developed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Signed-off-by: YonglongLi <liyonglong@chinatelecom.cn> Reviewed-by: Mat Martineau <martineau@kernel.org> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Link: https://lore.kernel.org/r/20240607-upstream-net-20240607-misc-fixes-v1-3-1ab9ddfa3d00@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-06-07 23:01:50 +08:00
if (sf_created) {
/* add_addr_accepted is not decr for ID 0 */
if (remote.id)
msk->pm.add_addr_accepted++;
mptcp: pm: update add_addr counters after connect The creation of new subflows can fail for different reasons. If no subflow have been created using the received ADD_ADDR, the related counters should not be updated, otherwise they will never be decremented for events related to this ID later on. For the moment, the number of accepted ADD_ADDR is only decremented upon the reception of a related RM_ADDR, and only if the remote address ID is currently being used by at least one subflow. In other words, if no subflow can be created with the received address, the counter will not be decremented. In this case, it is then important not to increment pm.add_addr_accepted counter, and not to modify pm.accept_addr bit. Note that this patch does not modify the behaviour in case of failures later on, e.g. if the MP Join is dropped or rejected. The "remove invalid addresses" MP Join subtest has been modified to validate this case. The broadcast IP address is added before the "valid" address that will be used to successfully create a subflow, and the limit is decreased by one: without this patch, it was not possible to create the last subflow, because: - the broadcast address would have been accepted even if it was not usable: the creation of a subflow to this address results in an error, - the limit of 2 accepted ADD_ADDR would have then been reached. Fixes: 01cacb00b35c ("mptcp: add netlink-based PM") Cc: stable@vger.kernel.org Co-developed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Signed-off-by: YonglongLi <liyonglong@chinatelecom.cn> Reviewed-by: Mat Martineau <martineau@kernel.org> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Link: https://lore.kernel.org/r/20240607-upstream-net-20240607-misc-fixes-v1-3-1ab9ddfa3d00@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-06-07 23:01:50 +08:00
if (msk->pm.add_addr_accepted >= add_addr_accept_max ||
msk->pm.subflows >= subflows_max)
WRITE_ONCE(msk->pm.accept_addr, false);
}
}
bool mptcp_pm_nl_is_init_remote_addr(struct mptcp_sock *msk,
const struct mptcp_addr_info *remote)
{
struct mptcp_addr_info mpc_remote;
remote_address((struct sock_common *)msk, &mpc_remote);
return mptcp_addresses_equal(&mpc_remote, remote, remote->port);
}
void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
msk_owned_by_me(msk);
lockdep_assert_held(&msk->pm.lock);
if (!mptcp_pm_should_add_signal(msk) &&
!mptcp_pm_should_rm_signal(msk))
return;
mptcp_for_each_subflow(msk, subflow) {
if (__mptcp_subflow_active(subflow)) {
mptcp_pm_send_ack(msk, subflow, false, false);
break;
}
}
}
int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk,
struct mptcp_addr_info *addr,
struct mptcp_addr_info *rem,
u8 bkup)
{
struct mptcp_subflow_context *subflow;
mptcp: pr_debug: add missing \n at the end pr_debug() have been added in various places in MPTCP code to help developers to debug some situations. With the dynamic debug feature, it is easy to enable all or some of them, and asks users to reproduce issues with extra debug. Many of these pr_debug() don't end with a new line, while no 'pr_cont()' are used in MPTCP code. So the goal was not to display multiple debug messages on one line: they were then not missing the '\n' on purpose. Not having the new line at the end causes these messages to be printed with a delay, when something else needs to be printed. This issue is not visible when many messages need to be printed, but it is annoying and confusing when only specific messages are expected, e.g. # echo "func mptcp_pm_add_addr_echoed +fmp" \ > /sys/kernel/debug/dynamic_debug/control # ./mptcp_join.sh "signal address"; \ echo "$(awk '{print $1}' /proc/uptime) - end"; \ sleep 5s; \ echo "$(awk '{print $1}' /proc/uptime) - restart"; \ ./mptcp_join.sh "signal address" 013 signal address (...) 10.75 - end 15.76 - restart 013 signal address [ 10.367935] mptcp:mptcp_pm_add_addr_echoed: MPTCP: msk=(...) (...) => a delay of 5 seconds: printed with a 10.36 ts, but after 'restart' which was printed at the 15.76 ts. The 'Fixes' tag here below points to the first pr_debug() used without '\n' in net/mptcp. This patch could be split in many small ones, with different Fixes tag, but it doesn't seem worth it, because it is easy to re-generate this patch with this simple 'sed' command: git grep -l pr_debug -- net/mptcp | xargs sed -i "s/\(pr_debug(\".*[^n]\)\(\"[,)]\)/\1\\\n\2/g" So in case of conflicts, simply drop the modifications, and launch this command. Fixes: f870fa0b5768 ("mptcp: Add MPTCP socket stubs") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang <geliang@kernel.org> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Link: https://patch.msgid.link/20240826-net-mptcp-close-extra-sf-fin-v1-4-905199fe1172@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-08-27 01:11:21 +08:00
pr_debug("bkup=%d\n", bkup);
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
struct mptcp_addr_info local, remote;
mptcp_local_address((struct sock_common *)ssk, &local);
if (!mptcp_addresses_equal(&local, addr, addr->port))
continue;
if (rem && rem->family != AF_UNSPEC) {
remote_address((struct sock_common *)ssk, &remote);
if (!mptcp_addresses_equal(&remote, rem, rem->port))
continue;
}
__mptcp_pm_send_ack(msk, subflow, true, bkup);
return 0;
}
return -EINVAL;
}
static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk,
const struct mptcp_rm_list *rm_list,
enum linux_mptcp_mib_field rm_type)
{
struct mptcp_subflow_context *subflow, *tmp;
struct sock *sk = (struct sock *)msk;
u8 i;
mptcp: pr_debug: add missing \n at the end pr_debug() have been added in various places in MPTCP code to help developers to debug some situations. With the dynamic debug feature, it is easy to enable all or some of them, and asks users to reproduce issues with extra debug. Many of these pr_debug() don't end with a new line, while no 'pr_cont()' are used in MPTCP code. So the goal was not to display multiple debug messages on one line: they were then not missing the '\n' on purpose. Not having the new line at the end causes these messages to be printed with a delay, when something else needs to be printed. This issue is not visible when many messages need to be printed, but it is annoying and confusing when only specific messages are expected, e.g. # echo "func mptcp_pm_add_addr_echoed +fmp" \ > /sys/kernel/debug/dynamic_debug/control # ./mptcp_join.sh "signal address"; \ echo "$(awk '{print $1}' /proc/uptime) - end"; \ sleep 5s; \ echo "$(awk '{print $1}' /proc/uptime) - restart"; \ ./mptcp_join.sh "signal address" 013 signal address (...) 10.75 - end 15.76 - restart 013 signal address [ 10.367935] mptcp:mptcp_pm_add_addr_echoed: MPTCP: msk=(...) (...) => a delay of 5 seconds: printed with a 10.36 ts, but after 'restart' which was printed at the 15.76 ts. The 'Fixes' tag here below points to the first pr_debug() used without '\n' in net/mptcp. This patch could be split in many small ones, with different Fixes tag, but it doesn't seem worth it, because it is easy to re-generate this patch with this simple 'sed' command: git grep -l pr_debug -- net/mptcp | xargs sed -i "s/\(pr_debug(\".*[^n]\)\(\"[,)]\)/\1\\\n\2/g" So in case of conflicts, simply drop the modifications, and launch this command. Fixes: f870fa0b5768 ("mptcp: Add MPTCP socket stubs") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang <geliang@kernel.org> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Link: https://patch.msgid.link/20240826-net-mptcp-close-extra-sf-fin-v1-4-905199fe1172@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-08-27 01:11:21 +08:00
pr_debug("%s rm_list_nr %d\n",
rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", rm_list->nr);
msk_owned_by_me(msk);
mptcp: never allow the PM to close a listener subflow Currently, when deleting an endpoint the netlink PM treverses all the local MPTCP sockets, regardless of their status. If an MPTCP listener socket is bound to the IP matching the delete endpoint, the listener TCP socket will be closed. That is unexpected, the PM should only affect data subflows. Additionally, syzbot was able to trigger a NULL ptr dereference due to the above: general protection fault, probably for non-canonical address 0xdffffc0000000003: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000018-0x000000000000001f] CPU: 1 PID: 6550 Comm: syz-executor122 Not tainted 5.16.0-rc4-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__lock_acquire+0xd7d/0x54a0 kernel/locking/lockdep.c:4897 Code: 0f 0e 41 be 01 00 00 00 0f 86 c8 00 00 00 89 05 69 cc 0f 0e e9 bd 00 00 00 48 b8 00 00 00 00 00 fc ff df 48 89 da 48 c1 ea 03 <80> 3c 02 00 0f 85 f3 2f 00 00 48 81 3b 20 75 17 8f 0f 84 52 f3 ff RSP: 0018:ffffc90001f2f818 EFLAGS: 00010016 RAX: dffffc0000000000 RBX: 0000000000000018 RCX: 0000000000000000 RDX: 0000000000000003 RSI: 0000000000000000 RDI: 0000000000000001 RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000001 R10: 0000000000000000 R11: 000000000000000a R12: 0000000000000000 R13: ffff88801b98d700 R14: 0000000000000000 R15: 0000000000000001 FS: 00007f177cd3d700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f177cd1b268 CR3: 000000001dd55000 CR4: 0000000000350ee0 Call Trace: <TASK> lock_acquire kernel/locking/lockdep.c:5637 [inline] lock_acquire+0x1ab/0x510 kernel/locking/lockdep.c:5602 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline] _raw_spin_lock_irqsave+0x39/0x50 kernel/locking/spinlock.c:162 finish_wait+0xc0/0x270 kernel/sched/wait.c:400 inet_csk_wait_for_connect net/ipv4/inet_connection_sock.c:464 [inline] inet_csk_accept+0x7de/0x9d0 net/ipv4/inet_connection_sock.c:497 mptcp_accept+0xe5/0x500 net/mptcp/protocol.c:2865 inet_accept+0xe4/0x7b0 net/ipv4/af_inet.c:739 mptcp_stream_accept+0x2e7/0x10e0 net/mptcp/protocol.c:3345 do_accept+0x382/0x510 net/socket.c:1773 __sys_accept4_file+0x7e/0xe0 net/socket.c:1816 __sys_accept4+0xb0/0x100 net/socket.c:1846 __do_sys_accept net/socket.c:1864 [inline] __se_sys_accept net/socket.c:1861 [inline] __x64_sys_accept+0x71/0xb0 net/socket.c:1861 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f177cd8b8e9 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 b1 14 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f177cd3d308 EFLAGS: 00000246 ORIG_RAX: 000000000000002b RAX: ffffffffffffffda RBX: 00007f177ce13408 RCX: 00007f177cd8b8e9 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000003 RBP: 00007f177ce13400 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007f177ce1340c R13: 00007f177cde1004 R14: 6d705f706374706d R15: 0000000000022000 </TASK> Fix the issue explicitly skipping MPTCP socket in TCP_LISTEN status. Reported-and-tested-by: syzbot+e4d843bb96a9431e6331@syzkaller.appspotmail.com Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com> Fixes: 740d798e8767 ("mptcp: remove id 0 address") Signed-off-by: Paolo Abeni <pabeni@redhat.com> Link: https://lore.kernel.org/r/ebc7594cdd420d241fb2172ddb8542ba64717657.1639238695.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-12-12 00:11:12 +08:00
if (sk->sk_state == TCP_LISTEN)
return;
if (!rm_list->nr)
return;
if (list_empty(&msk->conn_list))
return;
for (i = 0; i < rm_list->nr; i++) {
u8 rm_id = rm_list->ids[i];
bool removed = false;
mptcp_for_each_subflow_safe(msk, subflow, tmp) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
u8 remote_id = READ_ONCE(subflow->remote_id);
int how = RCV_SHUTDOWN | SEND_SHUTDOWN;
u8 id = subflow_get_local_id(subflow);
if (inet_sk_state_load(ssk) == TCP_CLOSE)
continue;
if (rm_type == MPTCP_MIB_RMADDR && remote_id != rm_id)
continue;
if (rm_type == MPTCP_MIB_RMSUBFLOW && id != rm_id)
continue;
mptcp: pr_debug: add missing \n at the end pr_debug() have been added in various places in MPTCP code to help developers to debug some situations. With the dynamic debug feature, it is easy to enable all or some of them, and asks users to reproduce issues with extra debug. Many of these pr_debug() don't end with a new line, while no 'pr_cont()' are used in MPTCP code. So the goal was not to display multiple debug messages on one line: they were then not missing the '\n' on purpose. Not having the new line at the end causes these messages to be printed with a delay, when something else needs to be printed. This issue is not visible when many messages need to be printed, but it is annoying and confusing when only specific messages are expected, e.g. # echo "func mptcp_pm_add_addr_echoed +fmp" \ > /sys/kernel/debug/dynamic_debug/control # ./mptcp_join.sh "signal address"; \ echo "$(awk '{print $1}' /proc/uptime) - end"; \ sleep 5s; \ echo "$(awk '{print $1}' /proc/uptime) - restart"; \ ./mptcp_join.sh "signal address" 013 signal address (...) 10.75 - end 15.76 - restart 013 signal address [ 10.367935] mptcp:mptcp_pm_add_addr_echoed: MPTCP: msk=(...) (...) => a delay of 5 seconds: printed with a 10.36 ts, but after 'restart' which was printed at the 15.76 ts. The 'Fixes' tag here below points to the first pr_debug() used without '\n' in net/mptcp. This patch could be split in many small ones, with different Fixes tag, but it doesn't seem worth it, because it is easy to re-generate this patch with this simple 'sed' command: git grep -l pr_debug -- net/mptcp | xargs sed -i "s/\(pr_debug(\".*[^n]\)\(\"[,)]\)/\1\\\n\2/g" So in case of conflicts, simply drop the modifications, and launch this command. Fixes: f870fa0b5768 ("mptcp: Add MPTCP socket stubs") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang <geliang@kernel.org> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Link: https://patch.msgid.link/20240826-net-mptcp-close-extra-sf-fin-v1-4-905199fe1172@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-08-27 01:11:21 +08:00
pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u mpc_id=%u\n",
rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow",
i, rm_id, id, remote_id, msk->mpc_endpoint_id);
spin_unlock_bh(&msk->pm.lock);
mptcp_subflow_shutdown(sk, ssk, how);
/* the following takes care of updating the subflows counter */
mptcp_close_ssk(sk, ssk, subflow);
spin_lock_bh(&msk->pm.lock);
removed |= subflow->request_join;
if (rm_type == MPTCP_MIB_RMSUBFLOW)
__MPTCP_INC_STATS(sock_net(sk), rm_type);
}
if (rm_type == MPTCP_MIB_RMADDR)
__MPTCP_INC_STATS(sock_net(sk), rm_type);
if (!removed)
continue;
if (!mptcp_pm_is_kernel(msk))
continue;
if (rm_type == MPTCP_MIB_RMADDR && rm_id &&
!WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) {
/* Note: if the subflow has been closed before, this
* add_addr_accepted counter will not be decremented.
*/
if (--msk->pm.add_addr_accepted < mptcp_pm_get_add_addr_accept_max(msk))
WRITE_ONCE(msk->pm.accept_addr, true);
}
}
}
static void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk)
{
mptcp_pm_nl_rm_addr_or_subflow(msk, &msk->pm.rm_list_rx, MPTCP_MIB_RMADDR);
}
static void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk,
const struct mptcp_rm_list *rm_list)
{
mptcp_pm_nl_rm_addr_or_subflow(msk, rm_list, MPTCP_MIB_RMSUBFLOW);
}
void mptcp_pm_nl_work(struct mptcp_sock *msk)
{
struct mptcp_pm_data *pm = &msk->pm;
msk_owned_by_me(msk);
if (!(pm->status & MPTCP_PM_WORK_MASK))
return;
spin_lock_bh(&msk->pm.lock);
mptcp: pr_debug: add missing \n at the end pr_debug() have been added in various places in MPTCP code to help developers to debug some situations. With the dynamic debug feature, it is easy to enable all or some of them, and asks users to reproduce issues with extra debug. Many of these pr_debug() don't end with a new line, while no 'pr_cont()' are used in MPTCP code. So the goal was not to display multiple debug messages on one line: they were then not missing the '\n' on purpose. Not having the new line at the end causes these messages to be printed with a delay, when something else needs to be printed. This issue is not visible when many messages need to be printed, but it is annoying and confusing when only specific messages are expected, e.g. # echo "func mptcp_pm_add_addr_echoed +fmp" \ > /sys/kernel/debug/dynamic_debug/control # ./mptcp_join.sh "signal address"; \ echo "$(awk '{print $1}' /proc/uptime) - end"; \ sleep 5s; \ echo "$(awk '{print $1}' /proc/uptime) - restart"; \ ./mptcp_join.sh "signal address" 013 signal address (...) 10.75 - end 15.76 - restart 013 signal address [ 10.367935] mptcp:mptcp_pm_add_addr_echoed: MPTCP: msk=(...) (...) => a delay of 5 seconds: printed with a 10.36 ts, but after 'restart' which was printed at the 15.76 ts. The 'Fixes' tag here below points to the first pr_debug() used without '\n' in net/mptcp. This patch could be split in many small ones, with different Fixes tag, but it doesn't seem worth it, because it is easy to re-generate this patch with this simple 'sed' command: git grep -l pr_debug -- net/mptcp | xargs sed -i "s/\(pr_debug(\".*[^n]\)\(\"[,)]\)/\1\\\n\2/g" So in case of conflicts, simply drop the modifications, and launch this command. Fixes: f870fa0b5768 ("mptcp: Add MPTCP socket stubs") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang <geliang@kernel.org> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Link: https://patch.msgid.link/20240826-net-mptcp-close-extra-sf-fin-v1-4-905199fe1172@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-08-27 01:11:21 +08:00
pr_debug("msk=%p status=%x\n", msk, pm->status);
if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) {
pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED);
mptcp_pm_nl_add_addr_received(msk);
}
if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) {
pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK);
mptcp_pm_nl_addr_send_ack(msk);
}
if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) {
pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED);
mptcp_pm_nl_rm_addr_received(msk);
}
if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) {
pm->status &= ~BIT(MPTCP_PM_ESTABLISHED);
mptcp_pm_nl_fully_established(msk);
}
if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) {
pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED);
mptcp_pm_nl_subflow_established(msk);
}
spin_unlock_bh(&msk->pm.lock);
}
static bool address_use_port(struct mptcp_pm_addr_entry *entry)
{
return (entry->flags &
(MPTCP_PM_ADDR_FLAG_SIGNAL | MPTCP_PM_ADDR_FLAG_SUBFLOW)) ==
MPTCP_PM_ADDR_FLAG_SIGNAL;
}
/* caller must ensure the RCU grace period is already elapsed */
static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry)
{
if (entry->lsk)
sock_release(entry->lsk);
kfree(entry);
}
static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
struct mptcp_pm_addr_entry *entry,
bool needs_id)
{
struct mptcp_pm_addr_entry *cur, *del_entry = NULL;
unsigned int addr_max;
int ret = -EINVAL;
spin_lock_bh(&pernet->lock);
/* to keep the code simple, don't do IDR-like allocation for address ID,
* just bail when we exceed limits
*/
if (pernet->next_id == MPTCP_PM_MAX_ADDR_ID)
pernet->next_id = 1;
if (pernet->addrs >= MPTCP_PM_ADDR_MAX) {
ret = -ERANGE;
goto out;
}
if (test_bit(entry->addr.id, pernet->id_bitmap)) {
ret = -EBUSY;
goto out;
}
/* do not insert duplicate address, differentiate on port only
* singled addresses
*/
if (!address_use_port(entry))
entry->addr.port = 0;
list_for_each_entry(cur, &pernet->local_addr_list, list) {
if (mptcp_addresses_equal(&cur->addr, &entry->addr,
cur->addr.port || entry->addr.port)) {
/* allow replacing the exiting endpoint only if such
* endpoint is an implicit one and the user-space
* did not provide an endpoint id
*/
if (!(cur->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)) {
ret = -EEXIST;
goto out;
}
if (entry->addr.id)
goto out;
pernet->addrs--;
entry->addr.id = cur->addr.id;
list_del_rcu(&cur->list);
del_entry = cur;
break;
}
}
if (!entry->addr.id && needs_id) {
find_next:
entry->addr.id = find_next_zero_bit(pernet->id_bitmap,
MPTCP_PM_MAX_ADDR_ID + 1,
pernet->next_id);
if (!entry->addr.id && pernet->next_id != 1) {
pernet->next_id = 1;
goto find_next;
}
}
if (!entry->addr.id && needs_id)
goto out;
__set_bit(entry->addr.id, pernet->id_bitmap);
if (entry->addr.id > pernet->next_id)
pernet->next_id = entry->addr.id;
if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) {
addr_max = pernet->add_addr_signal_max;
WRITE_ONCE(pernet->add_addr_signal_max, addr_max + 1);
}
if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
addr_max = pernet->local_addr_max;
WRITE_ONCE(pernet->local_addr_max, addr_max + 1);
}
pernet->addrs++;
if (!entry->addr.port)
list_add_tail_rcu(&entry->list, &pernet->local_addr_list);
else
list_add_rcu(&entry->list, &pernet->local_addr_list);
ret = entry->addr.id;
out:
spin_unlock_bh(&pernet->lock);
/* just replaced an existing entry, free it */
if (del_entry) {
synchronize_rcu();
__mptcp_pm_release_addr_entry(del_entry);
}
return ret;
}
static struct lock_class_key mptcp_slock_keys[2];
static struct lock_class_key mptcp_keys[2];
static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
struct mptcp_pm_addr_entry *entry)
{
bool is_ipv6 = sk->sk_family == AF_INET6;
int addrlen = sizeof(struct sockaddr_in);
struct sockaddr_storage addr;
struct sock *newsk, *ssk;
int backlog = 1024;
int err;
err = sock_create_kern(sock_net(sk), entry->addr.family,
SOCK_STREAM, IPPROTO_MPTCP, &entry->lsk);
if (err)
return err;
newsk = entry->lsk->sk;
if (!newsk)
return -EINVAL;
/* The subflow socket lock is acquired in a nested to the msk one
* in several places, even by the TCP stack, and this msk is a kernel
* socket: lockdep complains. Instead of propagating the _nested
* modifiers in several places, re-init the lock class for the msk
* socket to an mptcp specific one.
*/
sock_lock_init_class_and_name(newsk,
is_ipv6 ? "mlock-AF_INET6" : "mlock-AF_INET",
&mptcp_slock_keys[is_ipv6],
is_ipv6 ? "msk_lock-AF_INET6" : "msk_lock-AF_INET",
&mptcp_keys[is_ipv6]);
lock_sock(newsk);
ssk = __mptcp_nmpc_sk(mptcp_sk(newsk));
release_sock(newsk);
if (IS_ERR(ssk))
return PTR_ERR(ssk);
mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
if (entry->addr.family == AF_INET6)
addrlen = sizeof(struct sockaddr_in6);
#endif
if (ssk->sk_family == AF_INET)
err = inet_bind_sk(ssk, (struct sockaddr *)&addr, addrlen);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
else if (ssk->sk_family == AF_INET6)
err = inet6_bind_sk(ssk, (struct sockaddr *)&addr, addrlen);
#endif
if (err)
return err;
/* We don't use mptcp_set_state() here because it needs to be called
* under the msk socket lock. For the moment, that will not bring
* anything more than only calling inet_sk_state_store(), because the
* old status is known (TCP_CLOSE).
*/
inet_sk_state_store(newsk, TCP_LISTEN);
lock_sock(ssk);
err = __inet_listen_sk(ssk, backlog);
if (!err)
mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED);
release_sock(ssk);
return err;
}
int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc)
{
struct mptcp_pm_addr_entry *entry;
struct pm_nl_pernet *pernet;
int ret = -1;
pernet = pm_nl_get_pernet_from_msk(msk);
rcu_read_lock();
list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
if (mptcp_addresses_equal(&entry->addr, skc, entry->addr.port)) {
ret = entry->addr.id;
break;
}
}
rcu_read_unlock();
if (ret >= 0)
return ret;
/* address not found, add to local list */
mptcp: fix kmalloc flag in mptcp_pm_nl_get_local_id mptcp_pm_nl_get_local_id may be called in interrupt context, so we need to use GFP_ATOMIC flag to allocate memory to avoid sleeping in atomic context. [ 280.209809] BUG: sleeping function called from invalid context at mm/slab.h:498 [ 280.209812] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 1680, name: kworker/1:3 [ 280.209814] INFO: lockdep is turned off. [ 280.209816] CPU: 1 PID: 1680 Comm: kworker/1:3 Tainted: G W 5.9.0-rc3-mptcp+ #146 [ 280.209818] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 280.209820] Workqueue: events mptcp_worker [ 280.209822] Call Trace: [ 280.209824] <IRQ> [ 280.209826] dump_stack+0x77/0xa0 [ 280.209829] ___might_sleep.cold+0xa6/0xb6 [ 280.209832] kmem_cache_alloc_trace+0x1d1/0x290 [ 280.209835] mptcp_pm_nl_get_local_id+0x23c/0x410 [ 280.209840] subflow_init_req+0x1e9/0x2ea [ 280.209843] ? inet_reqsk_alloc+0x1c/0x120 [ 280.209845] ? kmem_cache_alloc+0x264/0x290 [ 280.209849] tcp_conn_request+0x303/0xae0 [ 280.209854] ? printk+0x53/0x6a [ 280.209857] ? tcp_rcv_state_process+0x28f/0x1374 [ 280.209859] tcp_rcv_state_process+0x28f/0x1374 [ 280.209864] ? tcp_v4_do_rcv+0xb3/0x1f0 [ 280.209866] tcp_v4_do_rcv+0xb3/0x1f0 [ 280.209869] tcp_v4_rcv+0xed6/0xfa0 [ 280.209873] ip_protocol_deliver_rcu+0x28/0x270 [ 280.209875] ip_local_deliver_finish+0x89/0x120 [ 280.209877] ip_local_deliver+0x180/0x220 [ 280.209881] ip_rcv+0x166/0x210 [ 280.209885] __netif_receive_skb_one_core+0x82/0x90 [ 280.209888] process_backlog+0xd6/0x230 [ 280.209891] net_rx_action+0x13a/0x410 [ 280.209895] __do_softirq+0xcf/0x468 [ 280.209899] asm_call_on_stack+0x12/0x20 [ 280.209901] </IRQ> [ 280.209903] ? ip_finish_output2+0x240/0x9a0 [ 280.209906] do_softirq_own_stack+0x4d/0x60 [ 280.209908] do_softirq.part.0+0x2b/0x60 [ 280.209911] __local_bh_enable_ip+0x9a/0xa0 [ 280.209913] ip_finish_output2+0x264/0x9a0 [ 280.209916] ? rcu_read_lock_held+0x4d/0x60 [ 280.209920] ? ip_output+0x7a/0x250 [ 280.209922] ip_output+0x7a/0x250 [ 280.209925] ? __ip_finish_output+0x330/0x330 [ 280.209928] __ip_queue_xmit+0x1dc/0x5a0 [ 280.209931] __tcp_transmit_skb+0xa0f/0xc70 [ 280.209937] tcp_connect+0xb03/0xff0 [ 280.209939] ? lockdep_hardirqs_on_prepare+0xe7/0x190 [ 280.209942] ? ktime_get_with_offset+0x125/0x150 [ 280.209944] ? trace_hardirqs_on+0x1c/0xe0 [ 280.209948] tcp_v4_connect+0x449/0x550 [ 280.209953] __inet_stream_connect+0xbb/0x320 [ 280.209955] ? mark_held_locks+0x49/0x70 [ 280.209958] ? lockdep_hardirqs_on_prepare+0xe7/0x190 [ 280.209960] ? __local_bh_enable_ip+0x6b/0xa0 [ 280.209963] inet_stream_connect+0x32/0x50 [ 280.209966] __mptcp_subflow_connect+0x1fd/0x242 [ 280.209972] mptcp_pm_create_subflow_or_signal_addr+0x2db/0x600 [ 280.209975] mptcp_worker+0x543/0x7a0 [ 280.209980] process_one_work+0x26d/0x5b0 [ 280.209984] ? process_one_work+0x5b0/0x5b0 [ 280.209987] worker_thread+0x48/0x3d0 [ 280.209990] ? process_one_work+0x5b0/0x5b0 [ 280.209993] kthread+0x117/0x150 [ 280.209996] ? kthread_park+0x80/0x80 [ 280.209998] ret_from_fork+0x22/0x30 Fixes: 01cacb00b35cb ("mptcp: add netlink-based PM") Signed-off-by: Geliang Tang <geliangtang@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-09-09 11:01:24 +08:00
entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
if (!entry)
return -ENOMEM;
entry->addr = *skc;
entry->addr.id = 0;
entry->addr.port = 0;
entry->ifindex = 0;
entry->flags = MPTCP_PM_ADDR_FLAG_IMPLICIT;
entry->lsk = NULL;
ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, true);
if (ret < 0)
kfree(entry);
return ret;
}
bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc)
{
struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
struct mptcp_pm_addr_entry *entry;
bool backup = false;
rcu_read_lock();
list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
if (mptcp_addresses_equal(&entry->addr, skc, entry->addr.port)) {
backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
break;
}
}
rcu_read_unlock();
return backup;
}
#define MPTCP_PM_CMD_GRP_OFFSET 0
#define MPTCP_PM_EV_GRP_OFFSET 1
static const struct genl_multicast_group mptcp_pm_mcgrps[] = {
[MPTCP_PM_CMD_GRP_OFFSET] = { .name = MPTCP_PM_CMD_GRP_NAME, },
[MPTCP_PM_EV_GRP_OFFSET] = { .name = MPTCP_PM_EV_GRP_NAME,
.flags = GENL_MCAST_CAP_NET_ADMIN,
},
};
void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk)
{
struct mptcp_subflow_context *iter, *subflow = mptcp_subflow_ctx(ssk);
struct sock *sk = (struct sock *)msk;
unsigned int active_max_loss_cnt;
struct net *net = sock_net(sk);
unsigned int stale_loss_cnt;
bool slow;
stale_loss_cnt = mptcp_stale_loss_cnt(net);
if (subflow->stale || !stale_loss_cnt || subflow->stale_count <= stale_loss_cnt)
return;
/* look for another available subflow not in loss state */
active_max_loss_cnt = max_t(int, stale_loss_cnt - 1, 1);
mptcp_for_each_subflow(msk, iter) {
if (iter != subflow && mptcp_subflow_active(iter) &&
iter->stale_count < active_max_loss_cnt) {
/* we have some alternatives, try to mark this subflow as idle ...*/
slow = lock_sock_fast(ssk);
if (!tcp_rtx_and_write_queues_empty(ssk)) {
subflow->stale = 1;
__mptcp_retransmit_pending_data(sk);
MPTCP_INC_STATS(net, MPTCP_MIB_SUBFLOWSTALE);
}
unlock_sock_fast(ssk, slow);
/* always try to push the pending data regardless of re-injections:
* we can possibly use backup subflows now, and subflow selection
* is cheap under the msk socket lock
*/
__mptcp_push_pending(sk, 0);
return;
}
}
}
static int mptcp_pm_family_to_addr(int family)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
if (family == AF_INET6)
return MPTCP_PM_ADDR_ATTR_ADDR6;
#endif
return MPTCP_PM_ADDR_ATTR_ADDR4;
}
static int mptcp_pm_parse_pm_addr_attr(struct nlattr *tb[],
const struct nlattr *attr,
struct genl_info *info,
struct mptcp_addr_info *addr,
bool require_family)
{
int err, addr_addr;
if (!attr) {
GENL_SET_ERR_MSG(info, "missing address info");
return -EINVAL;
}
/* no validation needed - was already done via nested policy */
err = nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr,
mptcp_pm_address_nl_policy, info->extack);
if (err)
return err;
if (tb[MPTCP_PM_ADDR_ATTR_ID])
addr->id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]);
if (!tb[MPTCP_PM_ADDR_ATTR_FAMILY]) {
if (!require_family)
return 0;
NL_SET_ERR_MSG_ATTR(info->extack, attr,
"missing family");
return -EINVAL;
}
addr->family = nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_FAMILY]);
if (addr->family != AF_INET
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
&& addr->family != AF_INET6
#endif
) {
NL_SET_ERR_MSG_ATTR(info->extack, attr,
"unknown address family");
return -EINVAL;
}
addr_addr = mptcp_pm_family_to_addr(addr->family);
if (!tb[addr_addr]) {
NL_SET_ERR_MSG_ATTR(info->extack, attr,
"missing address data");
return -EINVAL;
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
if (addr->family == AF_INET6)
addr->addr6 = nla_get_in6_addr(tb[addr_addr]);
else
#endif
addr->addr.s_addr = nla_get_in_addr(tb[addr_addr]);
if (tb[MPTCP_PM_ADDR_ATTR_PORT])
addr->port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT]));
return 0;
}
int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info,
struct mptcp_addr_info *addr)
{
struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1];
memset(addr, 0, sizeof(*addr));
return mptcp_pm_parse_pm_addr_attr(tb, attr, info, addr, true);
}
int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info,
bool require_family,
struct mptcp_pm_addr_entry *entry)
{
struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1];
int err;
memset(entry, 0, sizeof(*entry));
err = mptcp_pm_parse_pm_addr_attr(tb, attr, info, &entry->addr, require_family);
if (err)
return err;
if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX]) {
u32 val = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]);
entry->ifindex = val;
}
if (tb[MPTCP_PM_ADDR_ATTR_FLAGS])
entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]);
if (tb[MPTCP_PM_ADDR_ATTR_PORT])
entry->addr.port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT]));
return 0;
}
static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info)
{
return pm_nl_get_pernet(genl_info_net(info));
}
static int mptcp_nl_add_subflow_or_signal_addr(struct net *net,
struct mptcp_addr_info *addr)
{
struct mptcp_sock *msk;
long s_slot = 0, s_num = 0;
while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
struct sock *sk = (struct sock *)msk;
struct mptcp_addr_info mpc_addr;
if (!READ_ONCE(msk->fully_established) ||
mptcp_pm_is_userspace(msk))
goto next;
/* if the endp linked to the init sf is re-added with a != ID */
mptcp_local_address((struct sock_common *)msk, &mpc_addr);
lock_sock(sk);
spin_lock_bh(&msk->pm.lock);
if (mptcp_addresses_equal(addr, &mpc_addr, addr->port))
msk->mpc_endpoint_id = addr->id;
mptcp_pm_create_subflow_or_signal_addr(msk);
spin_unlock_bh(&msk->pm.lock);
release_sock(sk);
next:
sock_put(sk);
cond_resched();
}
return 0;
}
static bool mptcp_pm_has_addr_attr_id(const struct nlattr *attr,
struct genl_info *info)
{
struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1];
if (!nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr,
mptcp_pm_address_nl_policy, info->extack) &&
tb[MPTCP_PM_ADDR_ATTR_ID])
return true;
return false;
}
int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR];
struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
struct mptcp_pm_addr_entry addr, *entry;
int ret;
ret = mptcp_pm_parse_entry(attr, info, true, &addr);
if (ret < 0)
return ret;
if (addr.addr.port && !address_use_port(&addr)) {
GENL_SET_ERR_MSG(info, "flags must have signal and not subflow when using port");
return -EINVAL;
}
if (addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL &&
addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) {
GENL_SET_ERR_MSG(info, "flags mustn't have both signal and fullmesh");
return -EINVAL;
}
if (addr.flags & MPTCP_PM_ADDR_FLAG_IMPLICIT) {
GENL_SET_ERR_MSG(info, "can't create IMPLICIT endpoint");
return -EINVAL;
}
entry = kzalloc(sizeof(*entry), GFP_KERNEL_ACCOUNT);
if (!entry) {
GENL_SET_ERR_MSG(info, "can't allocate addr");
return -ENOMEM;
}
*entry = addr;
if (entry->addr.port) {
ret = mptcp_pm_nl_create_listen_socket(skb->sk, entry);
if (ret) {
GENL_SET_ERR_MSG_FMT(info, "create listen socket error: %d", ret);
goto out_free;
}
}
ret = mptcp_pm_nl_append_new_local_addr(pernet, entry,
!mptcp_pm_has_addr_attr_id(attr, info));
if (ret < 0) {
GENL_SET_ERR_MSG_FMT(info, "too many addresses or duplicate one: %d", ret);
goto out_free;
}
mptcp_nl_add_subflow_or_signal_addr(sock_net(skb->sk), &entry->addr);
return 0;
out_free:
__mptcp_pm_release_addr_entry(entry);
return ret;
}
static bool remove_anno_list_by_saddr(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr)
{
struct mptcp_pm_add_entry *entry;
mptcp: validate 'id' when stopping the ADD_ADDR retransmit timer when Linux receives an echo-ed ADD_ADDR, it checks the IP address against the list of "announced" addresses. In case of a positive match, the timer that handles retransmissions is stopped regardless of the 'Address Id' in the received packet: this behaviour does not comply with RFC8684 3.4.1. Fix it by validating the 'Address Id' in received echo-ed ADD_ADDRs. Tested using packetdrill, with the following captured output: unpatched kernel: Out <...> Flags [.], ack 1, win 256, options [mptcp add-addr v1 id 1 198.51.100.2 hmac 0xfd2e62517888fe29,mptcp dss ack 3007449509], length 0 In <...> Flags [.], ack 1, win 257, options [mptcp add-addr v1-echo id 1 1.2.3.4,mptcp dss ack 3013740213], length 0 Out <...> Flags [.], ack 1, win 256, options [mptcp add-addr v1 id 1 198.51.100.2 hmac 0xfd2e62517888fe29,mptcp dss ack 3007449509], length 0 In <...> Flags [.], ack 1, win 257, options [mptcp add-addr v1-echo id 90 198.51.100.2,mptcp dss ack 3013740213], length 0 ^^^ retransmission is stopped here, but 'Address Id' is 90 patched kernel: Out <...> Flags [.], ack 1, win 256, options [mptcp add-addr v1 id 1 198.51.100.2 hmac 0x1cf372d59e05f4b8,mptcp dss ack 3007449509], length 0 In <...> Flags [.], ack 1, win 257, options [mptcp add-addr v1-echo id 1 1.2.3.4,mptcp dss ack 1672384568], length 0 Out <...> Flags [.], ack 1, win 256, options [mptcp add-addr v1 id 1 198.51.100.2 hmac 0x1cf372d59e05f4b8,mptcp dss ack 3007449509], length 0 In <...> Flags [.], ack 1, win 257, options [mptcp add-addr v1-echo id 90 198.51.100.2,mptcp dss ack 1672384568], length 0 Out <...> Flags [.], ack 1, win 256, options [mptcp add-addr v1 id 1 198.51.100.2 hmac 0x1cf372d59e05f4b8,mptcp dss ack 3007449509], length 0 In <...> Flags [.], ack 1, win 257, options [mptcp add-addr v1-echo id 1 198.51.100.2,mptcp dss ack 1672384568], length 0 ^^^ retransmission is stopped here, only when both 'Address Id' and 'IP Address' match Fixes: 00cfd77b9063 ("mptcp: retransmit ADD_ADDR when timeout") Signed-off-by: Davide Caratti <dcaratti@redhat.com> Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-05-26 05:23:13 +08:00
entry = mptcp_pm_del_add_timer(msk, addr, false);
if (entry) {
list_del(&entry->list);
kfree(entry);
return true;
}
return false;
}
static u8 mptcp_endp_get_local_id(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr)
{
return msk->mpc_endpoint_id == addr->id ? 0 : addr->id;
}
static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr,
bool force)
{
struct mptcp_rm_list list = { .nr = 0 };
bool ret;
list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr);
ret = remove_anno_list_by_saddr(msk, addr);
if (ret || force) {
spin_lock_bh(&msk->pm.lock);
if (ret) {
__set_bit(addr->id, msk->pm.id_avail_bitmap);
msk->pm.add_addr_signaled--;
}
mptcp_pm_remove_addr(msk, &list);
spin_unlock_bh(&msk->pm.lock);
}
return ret;
}
static void __mark_subflow_endp_available(struct mptcp_sock *msk, u8 id)
{
/* If it was marked as used, and not ID 0, decrement local_addr_used */
if (!__test_and_set_bit(id ? : msk->mpc_endpoint_id, msk->pm.id_avail_bitmap) &&
id && !WARN_ON_ONCE(msk->pm.local_addr_used == 0))
msk->pm.local_addr_used--;
}
static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net,
const struct mptcp_pm_addr_entry *entry)
{
const struct mptcp_addr_info *addr = &entry->addr;
struct mptcp_rm_list list = { .nr = 1 };
long s_slot = 0, s_num = 0;
struct mptcp_sock *msk;
mptcp: pr_debug: add missing \n at the end pr_debug() have been added in various places in MPTCP code to help developers to debug some situations. With the dynamic debug feature, it is easy to enable all or some of them, and asks users to reproduce issues with extra debug. Many of these pr_debug() don't end with a new line, while no 'pr_cont()' are used in MPTCP code. So the goal was not to display multiple debug messages on one line: they were then not missing the '\n' on purpose. Not having the new line at the end causes these messages to be printed with a delay, when something else needs to be printed. This issue is not visible when many messages need to be printed, but it is annoying and confusing when only specific messages are expected, e.g. # echo "func mptcp_pm_add_addr_echoed +fmp" \ > /sys/kernel/debug/dynamic_debug/control # ./mptcp_join.sh "signal address"; \ echo "$(awk '{print $1}' /proc/uptime) - end"; \ sleep 5s; \ echo "$(awk '{print $1}' /proc/uptime) - restart"; \ ./mptcp_join.sh "signal address" 013 signal address (...) 10.75 - end 15.76 - restart 013 signal address [ 10.367935] mptcp:mptcp_pm_add_addr_echoed: MPTCP: msk=(...) (...) => a delay of 5 seconds: printed with a 10.36 ts, but after 'restart' which was printed at the 15.76 ts. The 'Fixes' tag here below points to the first pr_debug() used without '\n' in net/mptcp. This patch could be split in many small ones, with different Fixes tag, but it doesn't seem worth it, because it is easy to re-generate this patch with this simple 'sed' command: git grep -l pr_debug -- net/mptcp | xargs sed -i "s/\(pr_debug(\".*[^n]\)\(\"[,)]\)/\1\\\n\2/g" So in case of conflicts, simply drop the modifications, and launch this command. Fixes: f870fa0b5768 ("mptcp: Add MPTCP socket stubs") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang <geliang@kernel.org> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Link: https://patch.msgid.link/20240826-net-mptcp-close-extra-sf-fin-v1-4-905199fe1172@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-08-27 01:11:21 +08:00
pr_debug("remove_id=%d\n", addr->id);
while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
struct sock *sk = (struct sock *)msk;
bool remove_subflow;
if (mptcp_pm_is_userspace(msk))
goto next;
if (list_empty(&msk->conn_list)) {
mptcp_pm_remove_anno_addr(msk, addr, false);
goto next;
}
lock_sock(sk);
remove_subflow = lookup_subflow_by_saddr(&msk->conn_list, addr);
mptcp_pm_remove_anno_addr(msk, addr, remove_subflow &&
!(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT));
list.ids[0] = mptcp_endp_get_local_id(msk, addr);
if (remove_subflow) {
spin_lock_bh(&msk->pm.lock);
mptcp_pm_nl_rm_subflow_received(msk, &list);
spin_unlock_bh(&msk->pm.lock);
}
if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
spin_lock_bh(&msk->pm.lock);
__mark_subflow_endp_available(msk, list.ids[0]);
spin_unlock_bh(&msk->pm.lock);
}
if (msk->mpc_endpoint_id == entry->addr.id)
msk->mpc_endpoint_id = 0;
release_sock(sk);
next:
sock_put(sk);
cond_resched();
}
return 0;
}
static int mptcp_nl_remove_id_zero_address(struct net *net,
struct mptcp_addr_info *addr)
{
struct mptcp_rm_list list = { .nr = 0 };
long s_slot = 0, s_num = 0;
struct mptcp_sock *msk;
list.ids[list.nr++] = 0;
while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
struct sock *sk = (struct sock *)msk;
struct mptcp_addr_info msk_local;
if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk))
goto next;
mptcp_local_address((struct sock_common *)msk, &msk_local);
if (!mptcp_addresses_equal(&msk_local, addr, addr->port))
goto next;
lock_sock(sk);
spin_lock_bh(&msk->pm.lock);
mptcp_pm_remove_addr(msk, &list);
mptcp_pm_nl_rm_subflow_received(msk, &list);
__mark_subflow_endp_available(msk, 0);
spin_unlock_bh(&msk->pm.lock);
release_sock(sk);
next:
sock_put(sk);
cond_resched();
}
return 0;
}
int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR];
struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
struct mptcp_pm_addr_entry addr, *entry;
unsigned int addr_max;
int ret;
ret = mptcp_pm_parse_entry(attr, info, false, &addr);
if (ret < 0)
return ret;
/* the zero id address is special: the first address used by the msk
* always gets such an id, so different subflows can have different zero
* id addresses. Additionally zero id is not accounted for in id_bitmap.
* Let's use an 'mptcp_rm_list' instead of the common remove code.
*/
if (addr.addr.id == 0)
return mptcp_nl_remove_id_zero_address(sock_net(skb->sk), &addr.addr);
spin_lock_bh(&pernet->lock);
entry = __lookup_addr_by_id(pernet, addr.addr.id);
if (!entry) {
GENL_SET_ERR_MSG(info, "address not found");
spin_unlock_bh(&pernet->lock);
return -EINVAL;
}
if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) {
addr_max = pernet->add_addr_signal_max;
WRITE_ONCE(pernet->add_addr_signal_max, addr_max - 1);
}
if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
addr_max = pernet->local_addr_max;
WRITE_ONCE(pernet->local_addr_max, addr_max - 1);
}
pernet->addrs--;
list_del_rcu(&entry->list);
__clear_bit(entry->addr.id, pernet->id_bitmap);
spin_unlock_bh(&pernet->lock);
mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), entry);
mptcp: fix memory leak on address flush The endpoint cleanup path is prone to a memory leak, as reported by syzkaller: BUG: memory leak unreferenced object 0xffff88810680ea00 (size 64): comm "syz-executor.6", pid 6191, jiffies 4295756280 (age 24.138s) hex dump (first 32 bytes): 58 75 7d 3c 80 88 ff ff 22 01 00 00 00 00 ad de Xu}<...."....... 01 00 02 00 00 00 00 00 ac 1e 00 07 00 00 00 00 ................ backtrace: [<0000000072a9f72a>] kmalloc include/linux/slab.h:591 [inline] [<0000000072a9f72a>] mptcp_nl_cmd_add_addr+0x287/0x9f0 net/mptcp/pm_netlink.c:1170 [<00000000f6e931bf>] genl_family_rcv_msg_doit.isra.0+0x225/0x340 net/netlink/genetlink.c:731 [<00000000f1504a2c>] genl_family_rcv_msg net/netlink/genetlink.c:775 [inline] [<00000000f1504a2c>] genl_rcv_msg+0x341/0x5b0 net/netlink/genetlink.c:792 [<0000000097e76f6a>] netlink_rcv_skb+0x148/0x430 net/netlink/af_netlink.c:2504 [<00000000ceefa2b8>] genl_rcv+0x24/0x40 net/netlink/genetlink.c:803 [<000000008ff91aec>] netlink_unicast_kernel net/netlink/af_netlink.c:1314 [inline] [<000000008ff91aec>] netlink_unicast+0x537/0x750 net/netlink/af_netlink.c:1340 [<0000000041682c35>] netlink_sendmsg+0x846/0xd80 net/netlink/af_netlink.c:1929 [<00000000df3aa8e7>] sock_sendmsg_nosec net/socket.c:704 [inline] [<00000000df3aa8e7>] sock_sendmsg+0x14e/0x190 net/socket.c:724 [<000000002154c54c>] ____sys_sendmsg+0x709/0x870 net/socket.c:2403 [<000000001aab01d7>] ___sys_sendmsg+0xff/0x170 net/socket.c:2457 [<00000000fa3b1446>] __sys_sendmsg+0xe5/0x1b0 net/socket.c:2486 [<00000000db2ee9c7>] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [<00000000db2ee9c7>] do_syscall_64+0x38/0x90 arch/x86/entry/common.c:80 [<000000005873517d>] entry_SYSCALL_64_after_hwframe+0x44/0xae We should not require an allocation to cleanup stuff. Rework the code a bit so that the additional RCU work is no more needed. Fixes: 1729cf186d8a ("mptcp: create the listening socket for new port") Signed-off-by: Paolo Abeni <pabeni@redhat.com> Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-19 07:42:36 +08:00
synchronize_rcu();
__mptcp_pm_release_addr_entry(entry);
return ret;
}
/* Called from the userspace PM only */
void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list)
{
struct mptcp_rm_list alist = { .nr = 0 };
struct mptcp_pm_addr_entry *entry;
int anno_nr = 0;
list_for_each_entry(entry, rm_list, list) {
if (alist.nr >= MPTCP_RM_IDS_MAX)
break;
/* only delete if either announced or matching a subflow */
if (remove_anno_list_by_saddr(msk, &entry->addr))
anno_nr++;
else if (!lookup_subflow_by_saddr(&msk->conn_list,
&entry->addr))
continue;
alist.ids[alist.nr++] = entry->addr.id;
}
if (alist.nr) {
spin_lock_bh(&msk->pm.lock);
msk->pm.add_addr_signaled -= anno_nr;
mptcp_pm_remove_addr(msk, &alist);
spin_unlock_bh(&msk->pm.lock);
}
}
/* Called from the in-kernel PM only */
static void mptcp_pm_flush_addrs_and_subflows(struct mptcp_sock *msk,
struct list_head *rm_list)
{
struct mptcp_rm_list alist = { .nr = 0 }, slist = { .nr = 0 };
struct mptcp_pm_addr_entry *entry;
list_for_each_entry(entry, rm_list, list) {
if (slist.nr < MPTCP_RM_IDS_MAX &&
lookup_subflow_by_saddr(&msk->conn_list, &entry->addr))
slist.ids[slist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr);
if (alist.nr < MPTCP_RM_IDS_MAX &&
remove_anno_list_by_saddr(msk, &entry->addr))
alist.ids[alist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr);
}
spin_lock_bh(&msk->pm.lock);
if (alist.nr) {
msk->pm.add_addr_signaled -= alist.nr;
mptcp_pm_remove_addr(msk, &alist);
}
if (slist.nr)
mptcp_pm_nl_rm_subflow_received(msk, &slist);
/* Reset counters: maybe some subflows have been removed before */
bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
msk->pm.local_addr_used = 0;
spin_unlock_bh(&msk->pm.lock);
}
static void mptcp_nl_flush_addrs_list(struct net *net,
struct list_head *rm_list)
{
long s_slot = 0, s_num = 0;
struct mptcp_sock *msk;
if (list_empty(rm_list))
return;
while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
struct sock *sk = (struct sock *)msk;
if (!mptcp_pm_is_userspace(msk)) {
lock_sock(sk);
mptcp_pm_flush_addrs_and_subflows(msk, rm_list);
release_sock(sk);
}
sock_put(sk);
cond_resched();
}
}
mptcp: fix memory leak on address flush The endpoint cleanup path is prone to a memory leak, as reported by syzkaller: BUG: memory leak unreferenced object 0xffff88810680ea00 (size 64): comm "syz-executor.6", pid 6191, jiffies 4295756280 (age 24.138s) hex dump (first 32 bytes): 58 75 7d 3c 80 88 ff ff 22 01 00 00 00 00 ad de Xu}<...."....... 01 00 02 00 00 00 00 00 ac 1e 00 07 00 00 00 00 ................ backtrace: [<0000000072a9f72a>] kmalloc include/linux/slab.h:591 [inline] [<0000000072a9f72a>] mptcp_nl_cmd_add_addr+0x287/0x9f0 net/mptcp/pm_netlink.c:1170 [<00000000f6e931bf>] genl_family_rcv_msg_doit.isra.0+0x225/0x340 net/netlink/genetlink.c:731 [<00000000f1504a2c>] genl_family_rcv_msg net/netlink/genetlink.c:775 [inline] [<00000000f1504a2c>] genl_rcv_msg+0x341/0x5b0 net/netlink/genetlink.c:792 [<0000000097e76f6a>] netlink_rcv_skb+0x148/0x430 net/netlink/af_netlink.c:2504 [<00000000ceefa2b8>] genl_rcv+0x24/0x40 net/netlink/genetlink.c:803 [<000000008ff91aec>] netlink_unicast_kernel net/netlink/af_netlink.c:1314 [inline] [<000000008ff91aec>] netlink_unicast+0x537/0x750 net/netlink/af_netlink.c:1340 [<0000000041682c35>] netlink_sendmsg+0x846/0xd80 net/netlink/af_netlink.c:1929 [<00000000df3aa8e7>] sock_sendmsg_nosec net/socket.c:704 [inline] [<00000000df3aa8e7>] sock_sendmsg+0x14e/0x190 net/socket.c:724 [<000000002154c54c>] ____sys_sendmsg+0x709/0x870 net/socket.c:2403 [<000000001aab01d7>] ___sys_sendmsg+0xff/0x170 net/socket.c:2457 [<00000000fa3b1446>] __sys_sendmsg+0xe5/0x1b0 net/socket.c:2486 [<00000000db2ee9c7>] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [<00000000db2ee9c7>] do_syscall_64+0x38/0x90 arch/x86/entry/common.c:80 [<000000005873517d>] entry_SYSCALL_64_after_hwframe+0x44/0xae We should not require an allocation to cleanup stuff. Rework the code a bit so that the additional RCU work is no more needed. Fixes: 1729cf186d8a ("mptcp: create the listening socket for new port") Signed-off-by: Paolo Abeni <pabeni@redhat.com> Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-19 07:42:36 +08:00
/* caller must ensure the RCU grace period is already elapsed */
static void __flush_addrs(struct list_head *list)
{
while (!list_empty(list)) {
struct mptcp_pm_addr_entry *cur;
cur = list_entry(list->next,
struct mptcp_pm_addr_entry, list);
list_del_rcu(&cur->list);
mptcp: fix memory leak on address flush The endpoint cleanup path is prone to a memory leak, as reported by syzkaller: BUG: memory leak unreferenced object 0xffff88810680ea00 (size 64): comm "syz-executor.6", pid 6191, jiffies 4295756280 (age 24.138s) hex dump (first 32 bytes): 58 75 7d 3c 80 88 ff ff 22 01 00 00 00 00 ad de Xu}<...."....... 01 00 02 00 00 00 00 00 ac 1e 00 07 00 00 00 00 ................ backtrace: [<0000000072a9f72a>] kmalloc include/linux/slab.h:591 [inline] [<0000000072a9f72a>] mptcp_nl_cmd_add_addr+0x287/0x9f0 net/mptcp/pm_netlink.c:1170 [<00000000f6e931bf>] genl_family_rcv_msg_doit.isra.0+0x225/0x340 net/netlink/genetlink.c:731 [<00000000f1504a2c>] genl_family_rcv_msg net/netlink/genetlink.c:775 [inline] [<00000000f1504a2c>] genl_rcv_msg+0x341/0x5b0 net/netlink/genetlink.c:792 [<0000000097e76f6a>] netlink_rcv_skb+0x148/0x430 net/netlink/af_netlink.c:2504 [<00000000ceefa2b8>] genl_rcv+0x24/0x40 net/netlink/genetlink.c:803 [<000000008ff91aec>] netlink_unicast_kernel net/netlink/af_netlink.c:1314 [inline] [<000000008ff91aec>] netlink_unicast+0x537/0x750 net/netlink/af_netlink.c:1340 [<0000000041682c35>] netlink_sendmsg+0x846/0xd80 net/netlink/af_netlink.c:1929 [<00000000df3aa8e7>] sock_sendmsg_nosec net/socket.c:704 [inline] [<00000000df3aa8e7>] sock_sendmsg+0x14e/0x190 net/socket.c:724 [<000000002154c54c>] ____sys_sendmsg+0x709/0x870 net/socket.c:2403 [<000000001aab01d7>] ___sys_sendmsg+0xff/0x170 net/socket.c:2457 [<00000000fa3b1446>] __sys_sendmsg+0xe5/0x1b0 net/socket.c:2486 [<00000000db2ee9c7>] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [<00000000db2ee9c7>] do_syscall_64+0x38/0x90 arch/x86/entry/common.c:80 [<000000005873517d>] entry_SYSCALL_64_after_hwframe+0x44/0xae We should not require an allocation to cleanup stuff. Rework the code a bit so that the additional RCU work is no more needed. Fixes: 1729cf186d8a ("mptcp: create the listening socket for new port") Signed-off-by: Paolo Abeni <pabeni@redhat.com> Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-19 07:42:36 +08:00
__mptcp_pm_release_addr_entry(cur);
}
}
static void __reset_counters(struct pm_nl_pernet *pernet)
{
WRITE_ONCE(pernet->add_addr_signal_max, 0);
WRITE_ONCE(pernet->add_addr_accept_max, 0);
WRITE_ONCE(pernet->local_addr_max, 0);
pernet->addrs = 0;
}
int mptcp_pm_nl_flush_addrs_doit(struct sk_buff *skb, struct genl_info *info)
{
struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
LIST_HEAD(free_list);
spin_lock_bh(&pernet->lock);
list_splice_init(&pernet->local_addr_list, &free_list);
__reset_counters(pernet);
pernet->next_id = 1;
bitmap_zero(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
spin_unlock_bh(&pernet->lock);
mptcp_nl_flush_addrs_list(sock_net(skb->sk), &free_list);
mptcp: fix memory leak on address flush The endpoint cleanup path is prone to a memory leak, as reported by syzkaller: BUG: memory leak unreferenced object 0xffff88810680ea00 (size 64): comm "syz-executor.6", pid 6191, jiffies 4295756280 (age 24.138s) hex dump (first 32 bytes): 58 75 7d 3c 80 88 ff ff 22 01 00 00 00 00 ad de Xu}<...."....... 01 00 02 00 00 00 00 00 ac 1e 00 07 00 00 00 00 ................ backtrace: [<0000000072a9f72a>] kmalloc include/linux/slab.h:591 [inline] [<0000000072a9f72a>] mptcp_nl_cmd_add_addr+0x287/0x9f0 net/mptcp/pm_netlink.c:1170 [<00000000f6e931bf>] genl_family_rcv_msg_doit.isra.0+0x225/0x340 net/netlink/genetlink.c:731 [<00000000f1504a2c>] genl_family_rcv_msg net/netlink/genetlink.c:775 [inline] [<00000000f1504a2c>] genl_rcv_msg+0x341/0x5b0 net/netlink/genetlink.c:792 [<0000000097e76f6a>] netlink_rcv_skb+0x148/0x430 net/netlink/af_netlink.c:2504 [<00000000ceefa2b8>] genl_rcv+0x24/0x40 net/netlink/genetlink.c:803 [<000000008ff91aec>] netlink_unicast_kernel net/netlink/af_netlink.c:1314 [inline] [<000000008ff91aec>] netlink_unicast+0x537/0x750 net/netlink/af_netlink.c:1340 [<0000000041682c35>] netlink_sendmsg+0x846/0xd80 net/netlink/af_netlink.c:1929 [<00000000df3aa8e7>] sock_sendmsg_nosec net/socket.c:704 [inline] [<00000000df3aa8e7>] sock_sendmsg+0x14e/0x190 net/socket.c:724 [<000000002154c54c>] ____sys_sendmsg+0x709/0x870 net/socket.c:2403 [<000000001aab01d7>] ___sys_sendmsg+0xff/0x170 net/socket.c:2457 [<00000000fa3b1446>] __sys_sendmsg+0xe5/0x1b0 net/socket.c:2486 [<00000000db2ee9c7>] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [<00000000db2ee9c7>] do_syscall_64+0x38/0x90 arch/x86/entry/common.c:80 [<000000005873517d>] entry_SYSCALL_64_after_hwframe+0x44/0xae We should not require an allocation to cleanup stuff. Rework the code a bit so that the additional RCU work is no more needed. Fixes: 1729cf186d8a ("mptcp: create the listening socket for new port") Signed-off-by: Paolo Abeni <pabeni@redhat.com> Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-19 07:42:36 +08:00
synchronize_rcu();
__flush_addrs(&free_list);
return 0;
}
int mptcp_nl_fill_addr(struct sk_buff *skb,
struct mptcp_pm_addr_entry *entry)
{
struct mptcp_addr_info *addr = &entry->addr;
struct nlattr *attr;
attr = nla_nest_start(skb, MPTCP_PM_ATTR_ADDR);
if (!attr)
return -EMSGSIZE;
if (nla_put_u16(skb, MPTCP_PM_ADDR_ATTR_FAMILY, addr->family))
goto nla_put_failure;
if (nla_put_u16(skb, MPTCP_PM_ADDR_ATTR_PORT, ntohs(addr->port)))
goto nla_put_failure;
if (nla_put_u8(skb, MPTCP_PM_ADDR_ATTR_ID, addr->id))
goto nla_put_failure;
if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->flags))
goto nla_put_failure;
if (entry->ifindex &&
nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->ifindex))
goto nla_put_failure;
if (addr->family == AF_INET &&
nla_put_in_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR4,
addr->addr.s_addr))
goto nla_put_failure;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
else if (addr->family == AF_INET6 &&
nla_put_in6_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR6, &addr->addr6))
goto nla_put_failure;
#endif
nla_nest_end(skb, attr);
return 0;
nla_put_failure:
nla_nest_cancel(skb, attr);
return -EMSGSIZE;
}
int mptcp_pm_nl_get_addr(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR];
struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
struct mptcp_pm_addr_entry addr, *entry;
struct sk_buff *msg;
void *reply;
int ret;
ret = mptcp_pm_parse_entry(attr, info, false, &addr);
if (ret < 0)
return ret;
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0,
info->genlhdr->cmd);
if (!reply) {
GENL_SET_ERR_MSG(info, "not enough space in Netlink message");
ret = -EMSGSIZE;
goto fail;
}
spin_lock_bh(&pernet->lock);
entry = __lookup_addr_by_id(pernet, addr.addr.id);
if (!entry) {
GENL_SET_ERR_MSG(info, "address not found");
ret = -EINVAL;
goto unlock_fail;
}
ret = mptcp_nl_fill_addr(msg, entry);
if (ret)
goto unlock_fail;
genlmsg_end(msg, reply);
ret = genlmsg_reply(msg, info);
spin_unlock_bh(&pernet->lock);
return ret;
unlock_fail:
spin_unlock_bh(&pernet->lock);
fail:
nlmsg_free(msg);
return ret;
}
int mptcp_pm_nl_get_addr_doit(struct sk_buff *skb, struct genl_info *info)
{
return mptcp_pm_get_addr(skb, info);
}
int mptcp_pm_nl_dump_addr(struct sk_buff *msg,
struct netlink_callback *cb)
{
struct net *net = sock_net(msg->sk);
struct mptcp_pm_addr_entry *entry;
struct pm_nl_pernet *pernet;
int id = cb->args[0];
void *hdr;
int i;
pernet = pm_nl_get_pernet(net);
spin_lock_bh(&pernet->lock);
for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) {
if (test_bit(i, pernet->id_bitmap)) {
entry = __lookup_addr_by_id(pernet, i);
if (!entry)
break;
if (entry->addr.id <= id)
continue;
hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, &mptcp_genl_family,
NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR);
if (!hdr)
break;
if (mptcp_nl_fill_addr(msg, entry) < 0) {
genlmsg_cancel(msg, hdr);
break;
}
id = entry->addr.id;
genlmsg_end(msg, hdr);
}
}
spin_unlock_bh(&pernet->lock);
cb->args[0] = id;
return msg->len;
}
int mptcp_pm_nl_get_addr_dumpit(struct sk_buff *msg,
struct netlink_callback *cb)
{
return mptcp_pm_dump_addr(msg, cb);
}
static int parse_limit(struct genl_info *info, int id, unsigned int *limit)
{
struct nlattr *attr = info->attrs[id];
if (!attr)
return 0;
*limit = nla_get_u32(attr);
if (*limit > MPTCP_PM_ADDR_MAX) {
GENL_SET_ERR_MSG(info, "limit greater than maximum");
return -EINVAL;
}
return 0;
}
int mptcp_pm_nl_set_limits_doit(struct sk_buff *skb, struct genl_info *info)
{
struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
unsigned int rcv_addrs, subflows;
int ret;
spin_lock_bh(&pernet->lock);
rcv_addrs = pernet->add_addr_accept_max;
ret = parse_limit(info, MPTCP_PM_ATTR_RCV_ADD_ADDRS, &rcv_addrs);
if (ret)
goto unlock;
subflows = pernet->subflows_max;
ret = parse_limit(info, MPTCP_PM_ATTR_SUBFLOWS, &subflows);
if (ret)
goto unlock;
WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs);
WRITE_ONCE(pernet->subflows_max, subflows);
unlock:
spin_unlock_bh(&pernet->lock);
return ret;
}
int mptcp_pm_nl_get_limits_doit(struct sk_buff *skb, struct genl_info *info)
{
struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
struct sk_buff *msg;
void *reply;
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0,
MPTCP_PM_CMD_GET_LIMITS);
if (!reply)
goto fail;
if (nla_put_u32(msg, MPTCP_PM_ATTR_RCV_ADD_ADDRS,
READ_ONCE(pernet->add_addr_accept_max)))
goto fail;
if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS,
READ_ONCE(pernet->subflows_max)))
goto fail;
genlmsg_end(msg, reply);
return genlmsg_reply(msg, info);
fail:
GENL_SET_ERR_MSG(info, "not enough space in Netlink message");
nlmsg_free(msg);
return -EMSGSIZE;
}
static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk,
struct mptcp_addr_info *addr)
{
struct mptcp_rm_list list = { .nr = 0 };
list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr);
spin_lock_bh(&msk->pm.lock);
mptcp_pm_nl_rm_subflow_received(msk, &list);
__mark_subflow_endp_available(msk, list.ids[0]);
mptcp_pm_create_subflow_or_signal_addr(msk);
spin_unlock_bh(&msk->pm.lock);
}
static int mptcp_nl_set_flags(struct net *net,
struct mptcp_addr_info *addr,
u8 bkup, u8 changed)
{
long s_slot = 0, s_num = 0;
struct mptcp_sock *msk;
int ret = -EINVAL;
while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
struct sock *sk = (struct sock *)msk;
if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk))
goto next;
lock_sock(sk);
if (changed & MPTCP_PM_ADDR_FLAG_BACKUP)
ret = mptcp_pm_nl_mp_prio_send_ack(msk, addr, NULL, bkup);
if (changed & MPTCP_PM_ADDR_FLAG_FULLMESH)
mptcp_pm_nl_fullmesh(msk, addr);
release_sock(sk);
next:
sock_put(sk);
cond_resched();
}
return ret;
}
int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info)
{
struct mptcp_pm_addr_entry addr = { .addr = { .family = AF_UNSPEC }, };
struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
u8 changed, mask = MPTCP_PM_ADDR_FLAG_BACKUP |
MPTCP_PM_ADDR_FLAG_FULLMESH;
struct net *net = sock_net(skb->sk);
struct mptcp_pm_addr_entry *entry;
struct pm_nl_pernet *pernet;
u8 lookup_by_id = 0;
u8 bkup = 0;
int ret;
pernet = pm_nl_get_pernet(net);
ret = mptcp_pm_parse_entry(attr, info, false, &addr);
if (ret < 0)
return ret;
if (addr.addr.family == AF_UNSPEC) {
lookup_by_id = 1;
if (!addr.addr.id) {
GENL_SET_ERR_MSG(info, "missing required inputs");
return -EOPNOTSUPP;
}
}
if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP)
bkup = 1;
spin_lock_bh(&pernet->lock);
entry = lookup_by_id ? __lookup_addr_by_id(pernet, addr.addr.id) :
__lookup_addr(pernet, &addr.addr);
if (!entry) {
spin_unlock_bh(&pernet->lock);
GENL_SET_ERR_MSG(info, "address not found");
return -EINVAL;
}
if ((addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) &&
(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) {
spin_unlock_bh(&pernet->lock);
GENL_SET_ERR_MSG(info, "invalid addr flags");
return -EINVAL;
}
changed = (addr.flags ^ entry->flags) & mask;
entry->flags = (entry->flags & ~mask) | (addr.flags & mask);
addr = *entry;
spin_unlock_bh(&pernet->lock);
mptcp_nl_set_flags(net, &addr.addr, bkup, changed);
return 0;
}
int mptcp_pm_nl_set_flags_doit(struct sk_buff *skb, struct genl_info *info)
{
return mptcp_pm_set_flags(skb, info);
}
static void mptcp_nl_mcast_send(struct net *net, struct sk_buff *nlskb, gfp_t gfp)
{
genlmsg_multicast_netns(&mptcp_genl_family, net,
nlskb, 0, MPTCP_PM_EV_GRP_OFFSET, gfp);
}
bool mptcp_userspace_pm_active(const struct mptcp_sock *msk)
{
return genl_has_listeners(&mptcp_genl_family,
sock_net((const struct sock *)msk),
MPTCP_PM_EV_GRP_OFFSET);
}
static int mptcp_event_add_subflow(struct sk_buff *skb, const struct sock *ssk)
{
const struct inet_sock *issk = inet_sk(ssk);
const struct mptcp_subflow_context *sf;
if (nla_put_u16(skb, MPTCP_ATTR_FAMILY, ssk->sk_family))
return -EMSGSIZE;
switch (ssk->sk_family) {
case AF_INET:
if (nla_put_in_addr(skb, MPTCP_ATTR_SADDR4, issk->inet_saddr))
return -EMSGSIZE;
if (nla_put_in_addr(skb, MPTCP_ATTR_DADDR4, issk->inet_daddr))
return -EMSGSIZE;
break;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
case AF_INET6: {
const struct ipv6_pinfo *np = inet6_sk(ssk);
if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &np->saddr))
return -EMSGSIZE;
if (nla_put_in6_addr(skb, MPTCP_ATTR_DADDR6, &ssk->sk_v6_daddr))
return -EMSGSIZE;
break;
}
#endif
default:
WARN_ON_ONCE(1);
return -EMSGSIZE;
}
if (nla_put_be16(skb, MPTCP_ATTR_SPORT, issk->inet_sport))
return -EMSGSIZE;
if (nla_put_be16(skb, MPTCP_ATTR_DPORT, issk->inet_dport))
return -EMSGSIZE;
sf = mptcp_subflow_ctx(ssk);
if (WARN_ON_ONCE(!sf))
return -EINVAL;
if (nla_put_u8(skb, MPTCP_ATTR_LOC_ID, subflow_get_local_id(sf)))
return -EMSGSIZE;
if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, sf->remote_id))
return -EMSGSIZE;
return 0;
}
static int mptcp_event_put_token_and_ssk(struct sk_buff *skb,
const struct mptcp_sock *msk,
const struct sock *ssk)
{
const struct sock *sk = (const struct sock *)msk;
const struct mptcp_subflow_context *sf;
u8 sk_err;
if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)))
return -EMSGSIZE;
if (mptcp_event_add_subflow(skb, ssk))
return -EMSGSIZE;
sf = mptcp_subflow_ctx(ssk);
if (WARN_ON_ONCE(!sf))
return -EINVAL;
if (nla_put_u8(skb, MPTCP_ATTR_BACKUP, sf->backup))
return -EMSGSIZE;
if (ssk->sk_bound_dev_if &&
nla_put_s32(skb, MPTCP_ATTR_IF_IDX, ssk->sk_bound_dev_if))
return -EMSGSIZE;
sk_err = READ_ONCE(ssk->sk_err);
if (sk_err && sk->sk_state == TCP_ESTABLISHED &&
nla_put_u8(skb, MPTCP_ATTR_ERROR, sk_err))
return -EMSGSIZE;
return 0;
}
static int mptcp_event_sub_established(struct sk_buff *skb,
const struct mptcp_sock *msk,
const struct sock *ssk)
{
return mptcp_event_put_token_and_ssk(skb, msk, ssk);
}
static int mptcp_event_sub_closed(struct sk_buff *skb,
const struct mptcp_sock *msk,
const struct sock *ssk)
{
const struct mptcp_subflow_context *sf;
if (mptcp_event_put_token_and_ssk(skb, msk, ssk))
return -EMSGSIZE;
sf = mptcp_subflow_ctx(ssk);
if (!sf->reset_seen)
return 0;
if (nla_put_u32(skb, MPTCP_ATTR_RESET_REASON, sf->reset_reason))
return -EMSGSIZE;
if (nla_put_u32(skb, MPTCP_ATTR_RESET_FLAGS, sf->reset_transient))
return -EMSGSIZE;
return 0;
}
static int mptcp_event_created(struct sk_buff *skb,
const struct mptcp_sock *msk,
const struct sock *ssk)
{
int err = nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token));
if (err)
return err;
if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, READ_ONCE(msk->pm.server_side)))
return -EMSGSIZE;
return mptcp_event_add_subflow(skb, ssk);
}
void mptcp_event_addr_removed(const struct mptcp_sock *msk, uint8_t id)
{
struct net *net = sock_net((const struct sock *)msk);
struct nlmsghdr *nlh;
struct sk_buff *skb;
if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET))
return;
skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
if (!skb)
return;
nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, MPTCP_EVENT_REMOVED);
if (!nlh)
goto nla_put_failure;
if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)))
goto nla_put_failure;
if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, id))
goto nla_put_failure;
genlmsg_end(skb, nlh);
mptcp_nl_mcast_send(net, skb, GFP_ATOMIC);
return;
nla_put_failure:
nlmsg_free(skb);
}
void mptcp_event_addr_announced(const struct sock *ssk,
const struct mptcp_addr_info *info)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
struct net *net = sock_net(ssk);
struct nlmsghdr *nlh;
struct sk_buff *skb;
if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET))
return;
skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
if (!skb)
return;
nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0,
MPTCP_EVENT_ANNOUNCED);
if (!nlh)
goto nla_put_failure;
if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)))
goto nla_put_failure;
if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, info->id))
goto nla_put_failure;
if (nla_put_be16(skb, MPTCP_ATTR_DPORT,
info->port == 0 ?
inet_sk(ssk)->inet_dport :
info->port))
goto nla_put_failure;
switch (info->family) {
case AF_INET:
if (nla_put_in_addr(skb, MPTCP_ATTR_DADDR4, info->addr.s_addr))
goto nla_put_failure;
break;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
case AF_INET6:
if (nla_put_in6_addr(skb, MPTCP_ATTR_DADDR6, &info->addr6))
goto nla_put_failure;
break;
#endif
default:
WARN_ON_ONCE(1);
goto nla_put_failure;
}
genlmsg_end(skb, nlh);
mptcp_nl_mcast_send(net, skb, GFP_ATOMIC);
return;
nla_put_failure:
nlmsg_free(skb);
}
void mptcp_event_pm_listener(const struct sock *ssk,
enum mptcp_event_type event)
{
const struct inet_sock *issk = inet_sk(ssk);
struct net *net = sock_net(ssk);
struct nlmsghdr *nlh;
struct sk_buff *skb;
if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET))
return;
skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!skb)
return;
nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, event);
if (!nlh)
goto nla_put_failure;
if (nla_put_u16(skb, MPTCP_ATTR_FAMILY, ssk->sk_family))
goto nla_put_failure;
if (nla_put_be16(skb, MPTCP_ATTR_SPORT, issk->inet_sport))
goto nla_put_failure;
switch (ssk->sk_family) {
case AF_INET:
if (nla_put_in_addr(skb, MPTCP_ATTR_SADDR4, issk->inet_saddr))
goto nla_put_failure;
break;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
case AF_INET6: {
const struct ipv6_pinfo *np = inet6_sk(ssk);
if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &np->saddr))
goto nla_put_failure;
break;
}
#endif
default:
WARN_ON_ONCE(1);
goto nla_put_failure;
}
genlmsg_end(skb, nlh);
mptcp_nl_mcast_send(net, skb, GFP_KERNEL);
return;
nla_put_failure:
nlmsg_free(skb);
}
void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk,
const struct sock *ssk, gfp_t gfp)
{
struct net *net = sock_net((const struct sock *)msk);
struct nlmsghdr *nlh;
struct sk_buff *skb;
if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET))
return;
skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
if (!skb)
return;
nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, type);
if (!nlh)
goto nla_put_failure;
switch (type) {
case MPTCP_EVENT_UNSPEC:
WARN_ON_ONCE(1);
break;
case MPTCP_EVENT_CREATED:
case MPTCP_EVENT_ESTABLISHED:
if (mptcp_event_created(skb, msk, ssk) < 0)
goto nla_put_failure;
break;
case MPTCP_EVENT_CLOSED:
if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)) < 0)
goto nla_put_failure;
break;
case MPTCP_EVENT_ANNOUNCED:
case MPTCP_EVENT_REMOVED:
/* call mptcp_event_addr_announced()/removed instead */
WARN_ON_ONCE(1);
break;
case MPTCP_EVENT_SUB_ESTABLISHED:
case MPTCP_EVENT_SUB_PRIORITY:
if (mptcp_event_sub_established(skb, msk, ssk) < 0)
goto nla_put_failure;
break;
case MPTCP_EVENT_SUB_CLOSED:
if (mptcp_event_sub_closed(skb, msk, ssk) < 0)
goto nla_put_failure;
break;
case MPTCP_EVENT_LISTENER_CREATED:
case MPTCP_EVENT_LISTENER_CLOSED:
break;
}
genlmsg_end(skb, nlh);
mptcp_nl_mcast_send(net, skb, gfp);
return;
nla_put_failure:
nlmsg_free(skb);
}
struct genl_family mptcp_genl_family __ro_after_init = {
.name = MPTCP_PM_NAME,
.version = MPTCP_PM_VER,
.netnsok = true,
.module = THIS_MODULE,
.ops = mptcp_pm_nl_ops,
.n_ops = ARRAY_SIZE(mptcp_pm_nl_ops),
.resv_start_op = MPTCP_PM_CMD_SUBFLOW_DESTROY + 1,
.mcgrps = mptcp_pm_mcgrps,
.n_mcgrps = ARRAY_SIZE(mptcp_pm_mcgrps),
};
static int __net_init pm_nl_init_net(struct net *net)
{
struct pm_nl_pernet *pernet = pm_nl_get_pernet(net);
INIT_LIST_HEAD_RCU(&pernet->local_addr_list);
/* Cit. 2 subflows ought to be enough for anybody. */
pernet->subflows_max = 2;
pernet->next_id = 1;
pernet->stale_loss_cnt = 4;
spin_lock_init(&pernet->lock);
/* No need to initialize other pernet fields, the struct is zeroed at
* allocation time.
*/
return 0;
}
static void __net_exit pm_nl_exit_net(struct list_head *net_list)
{
struct net *net;
list_for_each_entry(net, net_list, exit_list) {
struct pm_nl_pernet *pernet = pm_nl_get_pernet(net);
/* net is removed from namespace list, can't race with
mptcp: fix memory leak on address flush The endpoint cleanup path is prone to a memory leak, as reported by syzkaller: BUG: memory leak unreferenced object 0xffff88810680ea00 (size 64): comm "syz-executor.6", pid 6191, jiffies 4295756280 (age 24.138s) hex dump (first 32 bytes): 58 75 7d 3c 80 88 ff ff 22 01 00 00 00 00 ad de Xu}<...."....... 01 00 02 00 00 00 00 00 ac 1e 00 07 00 00 00 00 ................ backtrace: [<0000000072a9f72a>] kmalloc include/linux/slab.h:591 [inline] [<0000000072a9f72a>] mptcp_nl_cmd_add_addr+0x287/0x9f0 net/mptcp/pm_netlink.c:1170 [<00000000f6e931bf>] genl_family_rcv_msg_doit.isra.0+0x225/0x340 net/netlink/genetlink.c:731 [<00000000f1504a2c>] genl_family_rcv_msg net/netlink/genetlink.c:775 [inline] [<00000000f1504a2c>] genl_rcv_msg+0x341/0x5b0 net/netlink/genetlink.c:792 [<0000000097e76f6a>] netlink_rcv_skb+0x148/0x430 net/netlink/af_netlink.c:2504 [<00000000ceefa2b8>] genl_rcv+0x24/0x40 net/netlink/genetlink.c:803 [<000000008ff91aec>] netlink_unicast_kernel net/netlink/af_netlink.c:1314 [inline] [<000000008ff91aec>] netlink_unicast+0x537/0x750 net/netlink/af_netlink.c:1340 [<0000000041682c35>] netlink_sendmsg+0x846/0xd80 net/netlink/af_netlink.c:1929 [<00000000df3aa8e7>] sock_sendmsg_nosec net/socket.c:704 [inline] [<00000000df3aa8e7>] sock_sendmsg+0x14e/0x190 net/socket.c:724 [<000000002154c54c>] ____sys_sendmsg+0x709/0x870 net/socket.c:2403 [<000000001aab01d7>] ___sys_sendmsg+0xff/0x170 net/socket.c:2457 [<00000000fa3b1446>] __sys_sendmsg+0xe5/0x1b0 net/socket.c:2486 [<00000000db2ee9c7>] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [<00000000db2ee9c7>] do_syscall_64+0x38/0x90 arch/x86/entry/common.c:80 [<000000005873517d>] entry_SYSCALL_64_after_hwframe+0x44/0xae We should not require an allocation to cleanup stuff. Rework the code a bit so that the additional RCU work is no more needed. Fixes: 1729cf186d8a ("mptcp: create the listening socket for new port") Signed-off-by: Paolo Abeni <pabeni@redhat.com> Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-19 07:42:36 +08:00
* other modifiers, also netns core already waited for a
* RCU grace period.
*/
__flush_addrs(&pernet->local_addr_list);
}
}
static struct pernet_operations mptcp_pm_pernet_ops = {
.init = pm_nl_init_net,
.exit_batch = pm_nl_exit_net,
.id = &pm_nl_pernet_id,
.size = sizeof(struct pm_nl_pernet),
};
void __init mptcp_pm_nl_init(void)
{
if (register_pernet_subsys(&mptcp_pm_pernet_ops) < 0)
panic("Failed to register MPTCP PM pernet subsystem.\n");
if (genl_register_family(&mptcp_genl_family))
panic("Failed to register MPTCP PM netlink family\n");
}