2019-05-27 14:55:01 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* UDP over IPv6
|
2007-02-09 22:24:49 +08:00
|
|
|
* Linux INET6 implementation
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* Authors:
|
2007-02-09 22:24:49 +08:00
|
|
|
* Pedro Roque <roque@di.fc.ul.pt>
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* Based on linux/ipv4/udp.c
|
|
|
|
*
|
|
|
|
* Fixes:
|
|
|
|
* Hideaki YOSHIFUJI : sin6_scope_id support
|
|
|
|
* YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
|
|
|
|
* Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
|
|
|
|
* a single port at the same time.
|
|
|
|
* Kazunori MIYAZAWA @USAGI: change process style to use ip6_append_data
|
|
|
|
* YOSHIFUJI Hideaki @USAGI: convert /proc/net/udp6 to seq_file.
|
|
|
|
*/
|
|
|
|
|
2021-12-16 10:55:37 +08:00
|
|
|
#include <linux/bpf-cgroup.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/errno.h>
|
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/socket.h>
|
|
|
|
#include <linux/sockios.h>
|
|
|
|
#include <linux/net.h>
|
|
|
|
#include <linux/in6.h>
|
|
|
|
#include <linux/netdevice.h>
|
|
|
|
#include <linux/if_arp.h>
|
|
|
|
#include <linux/ipv6.h>
|
|
|
|
#include <linux/icmpv6.h>
|
|
|
|
#include <linux/init.h>
|
2007-12-12 03:30:32 +08:00
|
|
|
#include <linux/module.h>
|
2005-12-14 15:16:37 +08:00
|
|
|
#include <linux/skbuff.h>
|
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the following.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and tries to put the new include such that its order conforms
to its surroundings. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
widely available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build tests were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 16:04:11 +08:00
|
|
|
#include <linux/slab.h>
|
2016-12-25 03:46:01 +08:00
|
|
|
#include <linux/uaccess.h>
|
2019-05-03 23:01:37 +08:00
|
|
|
#include <linux/indirect_call_wrapper.h>
|
2024-03-27 02:05:47 +08:00
|
|
|
#include <trace/events/udp.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2016-02-11 00:50:36 +08:00
|
|
|
#include <net/addrconf.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <net/ndisc.h>
|
|
|
|
#include <net/protocol.h>
|
|
|
|
#include <net/transp_v6.h>
|
|
|
|
#include <net/ip6_route.h>
|
|
|
|
#include <net/raw.h>
|
2022-01-04 01:11:32 +08:00
|
|
|
#include <net/seg6.h>
|
2005-08-10 11:08:28 +08:00
|
|
|
#include <net/tcp_states.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <net/ip6_checksum.h>
|
udp: Support for error handlers of tunnels with arbitrary destination port
ICMP error handling is currently not possible for UDP tunnels not
employing a receiving socket with local destination port matching the
remote one, because we have no way to look them up.
Add an err_handler tunnel encapsulation operation that can be exported by
tunnels in order to pass the error to the protocol implementing the
encapsulation. We can't easily use a lookup function as we did for VXLAN
and GENEVE, as protocol error handlers, which would be in turn called by
implementations of this new operation, handle the errors themselves,
together with the tunnel lookup.
Without a socket, we can't be sure which encapsulation error handler is
the appropriate one: encapsulation handlers (the ones for FoU and GUE
introduced in the next patch, e.g.) will need to check the new error codes
returned by protocol handlers to figure out if errors match the given
encapsulation, and, in turn, report this error back, so that we can try
all of them in __udp{4,6}_lib_err_encap_no_sk() until we have a match.
v2:
- Name all arguments in err_handler prototypes (David Miller)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:22 +08:00
|
|
|
#include <net/ip6_tunnel.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <net/xfrm.h>
|
2017-04-19 01:39:41 +08:00
|
|
|
#include <net/inet_hashtables.h>
|
2013-01-22 17:50:44 +08:00
|
|
|
#include <net/inet6_hashtables.h>
|
2013-07-10 22:13:17 +08:00
|
|
|
#include <net/busy_poll.h>
|
2016-01-05 06:41:46 +08:00
|
|
|
#include <net/sock_reuseport.h>
|
2023-07-27 23:33:56 +08:00
|
|
|
#include <net/gro.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#include <linux/proc_fs.h>
|
|
|
|
#include <linux/seq_file.h>
|
2012-06-27 08:23:44 +08:00
|
|
|
#include <trace/events/skb.h>
|
2006-11-28 03:10:57 +08:00
|
|
|
#include "udp_impl.h"
|
2005-04-17 06:20:36 +08:00
|
|
|
|
tcp/udp: Call inet6_destroy_sock() in IPv6 sk->sk_destruct().
Originally, inet6_sk(sk)->XXX were changed under lock_sock(), so we were
able to clean them up by calling inet6_destroy_sock() during the IPv6 ->
IPv4 conversion by IPV6_ADDRFORM. However, commit 03485f2adcde ("udpv6:
Add lockless sendmsg() support") added a lockless memory allocation path,
which could cause a memory leak:
setsockopt(IPV6_ADDRFORM) sendmsg()
+-----------------------+ +-------+
- do_ipv6_setsockopt(sk, ...) - udpv6_sendmsg(sk, ...)
- sockopt_lock_sock(sk) ^._ called via udpv6_prot
- lock_sock(sk) before WRITE_ONCE()
- WRITE_ONCE(sk->sk_prot, &tcp_prot)
- inet6_destroy_sock() - if (!corkreq)
- sockopt_release_sock(sk) - ip6_make_skb(sk, ...)
- release_sock(sk) ^._ lockless fast path for
the non-corking case
- __ip6_append_data(sk, ...)
- ipv6_local_rxpmtu(sk, ...)
- xchg(&np->rxpmtu, skb)
^._ rxpmtu is never freed.
- goto out_no_dst;
- lock_sock(sk)
For now, rxpmtu is only the case, but not to miss the future change
and a similar bug fixed in commit e27326009a3d ("net: ping6: Fix
memleak in ipv6_renew_options()."), let's set a new function to IPv6
sk->sk_destruct() and call inet6_cleanup_sock() there. Since the
conversion does not change sk->sk_destruct(), we can guarantee that
we can clean up IPv6 resources finally.
We can now remove all inet6_destroy_sock() calls from IPv6 protocol
specific ->destroy() functions, but such changes are invasive to
backport. So they can be posted as a follow-up later for net-next.
Fixes: 03485f2adcde ("udpv6: Add lockless sendmsg() support")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-10-07 02:53:47 +08:00
|
|
|
/*
 * Socket destructor for UDPv6 sockets; udpv6_init_sock() installs it as
 * sk->sk_destruct.
 *
 * Runs the common UDP teardown first and then the IPv6 socket destructor.
 */
static void udpv6_destruct_sock(struct sock *sk)
{
	udp_destruct_common(sk);
	inet6_sock_destruct(sk);
}
|
|
|
|
|
|
|
|
int udpv6_init_sock(struct sock *sk)
|
|
|
|
{
|
2022-10-21 01:48:52 +08:00
|
|
|
udp_lib_init_sock(sk);
|
tcp/udp: Call inet6_destroy_sock() in IPv6 sk->sk_destruct().
Originally, inet6_sk(sk)->XXX were changed under lock_sock(), so we were
able to clean them up by calling inet6_destroy_sock() during the IPv6 ->
IPv4 conversion by IPV6_ADDRFORM. However, commit 03485f2adcde ("udpv6:
Add lockless sendmsg() support") added a lockless memory allocation path,
which could cause a memory leak:
setsockopt(IPV6_ADDRFORM) sendmsg()
+-----------------------+ +-------+
- do_ipv6_setsockopt(sk, ...) - udpv6_sendmsg(sk, ...)
- sockopt_lock_sock(sk) ^._ called via udpv6_prot
- lock_sock(sk) before WRITE_ONCE()
- WRITE_ONCE(sk->sk_prot, &tcp_prot)
- inet6_destroy_sock() - if (!corkreq)
- sockopt_release_sock(sk) - ip6_make_skb(sk, ...)
- release_sock(sk) ^._ lockless fast path for
the non-corking case
- __ip6_append_data(sk, ...)
- ipv6_local_rxpmtu(sk, ...)
- xchg(&np->rxpmtu, skb)
^._ rxpmtu is never freed.
- goto out_no_dst;
- lock_sock(sk)
For now, rxpmtu is only the case, but not to miss the future change
and a similar bug fixed in commit e27326009a3d ("net: ping6: Fix
memleak in ipv6_renew_options()."), let's set a new function to IPv6
sk->sk_destruct() and call inet6_cleanup_sock() there. Since the
conversion does not change sk->sk_destruct(), we can guarantee that
we can clean up IPv6 resources finally.
We can now remove all inet6_destroy_sock() calls from IPv6 protocol
specific ->destroy() functions, but such changes are invasive to
backport. So they can be posted as a follow-up later for net-next.
Fixes: 03485f2adcde ("udpv6: Add lockless sendmsg() support")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-10-07 02:53:47 +08:00
|
|
|
sk->sk_destruct = udpv6_destruct_sock;
|
2022-10-27 07:25:56 +08:00
|
|
|
set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
|
tcp/udp: Call inet6_destroy_sock() in IPv6 sk->sk_destruct().
Originally, inet6_sk(sk)->XXX were changed under lock_sock(), so we were
able to clean them up by calling inet6_destroy_sock() during the IPv6 ->
IPv4 conversion by IPV6_ADDRFORM. However, commit 03485f2adcde ("udpv6:
Add lockless sendmsg() support") added a lockless memory allocation path,
which could cause a memory leak:
setsockopt(IPV6_ADDRFORM) sendmsg()
+-----------------------+ +-------+
- do_ipv6_setsockopt(sk, ...) - udpv6_sendmsg(sk, ...)
- sockopt_lock_sock(sk) ^._ called via udpv6_prot
- lock_sock(sk) before WRITE_ONCE()
- WRITE_ONCE(sk->sk_prot, &tcp_prot)
- inet6_destroy_sock() - if (!corkreq)
- sockopt_release_sock(sk) - ip6_make_skb(sk, ...)
- release_sock(sk) ^._ lockless fast path for
the non-corking case
- __ip6_append_data(sk, ...)
- ipv6_local_rxpmtu(sk, ...)
- xchg(&np->rxpmtu, skb)
^._ rxpmtu is never freed.
- goto out_no_dst;
- lock_sock(sk)
For now, rxpmtu is only the case, but not to miss the future change
and a similar bug fixed in commit e27326009a3d ("net: ping6: Fix
memleak in ipv6_renew_options()."), let's set a new function to IPv6
sk->sk_destruct() and call inet6_cleanup_sock() there. Since the
conversion does not change sk->sk_destruct(), we can guarantee that
we can clean up IPv6 resources finally.
We can now remove all inet6_destroy_sock() calls from IPv6 protocol
specific ->destroy() functions, but such changes are invasive to
backport. So they can be posted as a follow-up later for net-next.
Fixes: 03485f2adcde ("udpv6: Add lockless sendmsg() support")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-10-07 02:53:47 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2023-07-20 23:30:08 +08:00
|
|
|
INDIRECT_CALLABLE_SCOPE
|
|
|
|
u32 udp6_ehashfn(const struct net *net,
|
|
|
|
const struct in6_addr *laddr,
|
|
|
|
const u16 lport,
|
|
|
|
const struct in6_addr *faddr,
|
|
|
|
const __be16 fport)
|
2013-10-20 03:48:52 +08:00
|
|
|
{
|
2013-10-20 03:48:57 +08:00
|
|
|
u32 lhash, fhash;
|
|
|
|
|
|
|
|
net_get_random_once(&udp6_ehash_secret,
|
|
|
|
sizeof(udp6_ehash_secret));
|
|
|
|
net_get_random_once(&udp_ipv6_hash_secret,
|
|
|
|
sizeof(udp_ipv6_hash_secret));
|
|
|
|
|
|
|
|
lhash = (__force u32)laddr->s6_addr32[3];
|
|
|
|
fhash = __ipv6_addr_jhash(faddr, udp_ipv6_hash_secret);
|
|
|
|
|
2013-10-20 03:48:52 +08:00
|
|
|
return __inet6_ehashfn(lhash, lport, fhash, fport,
|
2023-07-08 16:29:58 +08:00
|
|
|
udp6_ehash_secret + net_hash_mix(net));
|
2013-10-20 03:48:52 +08:00
|
|
|
}
|
|
|
|
|
2008-03-23 07:51:21 +08:00
|
|
|
int udp_v6_get_port(struct sock *sk, unsigned short snum)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2009-11-09 13:26:33 +08:00
|
|
|
unsigned int hash2_nulladdr =
|
2017-12-02 04:52:30 +08:00
|
|
|
ipv6_portaddr_hash(sock_net(sk), &in6addr_any, snum);
|
2012-05-19 09:45:21 +08:00
|
|
|
unsigned int hash2_partial =
|
2017-12-02 04:52:30 +08:00
|
|
|
ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, 0);
|
2009-11-09 13:26:33 +08:00
|
|
|
|
2009-11-08 18:17:30 +08:00
|
|
|
/* precompute partial secondary hash */
|
2009-11-09 13:26:33 +08:00
|
|
|
udp_sk(sk)->udp_portaddr_hash = hash2_partial;
|
2017-01-17 23:51:01 +08:00
|
|
|
return udp_lib_get_port(sk, snum, hash2_nulladdr);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2019-01-17 00:17:45 +08:00
|
|
|
void udp_v6_rehash(struct sock *sk)
|
udp: add rehash on connect()
commit 30fff923 introduced in linux-2.6.33 (udp: bind() optimisation)
added a secondary hash on UDP, hashed on (local addr, local port).
Problem is that following sequence :
fd = socket(...)
connect(fd, &remote, ...)
not only selects remote end point (address and port), but also sets
local address, while UDP stack stored in secondary hash table the socket
while its local address was INADDR_ANY (or ipv6 equivalent)
Sequence is :
- autobind() : choose a random local port, insert socket in hash tables
[while local address is INADDR_ANY]
- connect() : set remote address and port, change local address to IP
given by a route lookup.
When an incoming UDP frame comes, if more than 10 sockets are found in
primary hash table, we switch to secondary table, and fail to find
socket because its local address changed.
One solution to this problem is to rehash datagram socket if needed.
We add a new rehash(struct socket *) method in "struct proto", and
implement this method for UDP v4 & v6, using a common helper.
This rehashing only takes care of secondary hash table, since primary
hash (based on local port only) is not changed.
Reported-by: Krzysztof Piotr Oledzki <ole@ans.pl>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Tested-by: Krzysztof Piotr Oledzki <ole@ans.pl>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-09-08 13:08:44 +08:00
|
|
|
{
|
2017-12-02 04:52:30 +08:00
|
|
|
u16 new_hash = ipv6_portaddr_hash(sock_net(sk),
|
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-04 06:42:29 +08:00
|
|
|
&sk->sk_v6_rcv_saddr,
|
udp: add rehash on connect()
commit 30fff923 introduced in linux-2.6.33 (udp: bind() optimisation)
added a secondary hash on UDP, hashed on (local addr, local port).
Problem is that following sequence :
fd = socket(...)
connect(fd, &remote, ...)
not only selects remote end point (address and port), but also sets
local address, while UDP stack stored in secondary hash table the socket
while its local address was INADDR_ANY (or ipv6 equivalent)
Sequence is :
- autobind() : choose a random local port, insert socket in hash tables
[while local address is INADDR_ANY]
- connect() : set remote address and port, change local address to IP
given by a route lookup.
When an incoming UDP frame comes, if more than 10 sockets are found in
primary hash table, we switch to secondary table, and fail to find
socket because its local address changed.
One solution to this problem is to rehash datagram socket if needed.
We add a new rehash(struct socket *) method in "struct proto", and
implement this method for UDP v4 & v6, using a common helper.
This rehashing only takes care of secondary hash table, since primary
hash (based on local port only) is not changed.
Reported-by: Krzysztof Piotr Oledzki <ole@ans.pl>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Tested-by: Krzysztof Piotr Oledzki <ole@ans.pl>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-09-08 13:08:44 +08:00
|
|
|
inet_sk(sk)->inet_num);
|
|
|
|
|
|
|
|
udp_lib_rehash(sk, new_hash);
|
|
|
|
}
|
|
|
|
|
2024-08-02 21:40:29 +08:00
|
|
|
static int compute_score(struct sock *sk, const struct net *net,
|
udp reuseport: fix packet of same flow hashed to different socket
There is a corner case in which udp packets belonging to a same
flow are hashed to different socket when hslot->count changes from 10
to 11:
1) When hslot->count <= 10, __udp_lib_lookup() searches udp_table->hash,
and always passes 'daddr' to udp_ehashfn().
2) When hslot->count > 10, __udp_lib_lookup() searches udp_table->hash2,
but may pass 'INADDR_ANY' to udp_ehashfn() if the sockets are bound to
INADDR_ANY instead of some specific addr.
That means when hslot->count changes from 10 to 11, the hash calculated by
udp_ehashfn() is also changed, and the udp packets belonging to a same
flow will be hashed to different socket.
This is easily reproduced:
1) Create 10 udp sockets and bind all of them to 0.0.0.0:40000.
2) From the same host send udp packets to 127.0.0.1:40000, record the
socket index which receives the packets.
3) Create 1 more udp socket and bind it to 0.0.0.0:44096. The number 44096
is 40000 + UDP_HASH_SIZE(4096), this makes the new socket put into the
same hslot as the aformentioned 10 sockets, and makes the hslot->count
change from 10 to 11.
4) From the same host send udp packets to 127.0.0.1:40000, and the socket
index which receives the packets will be different from the one received
in step 2.
This should not happen as the socket bound to 0.0.0.0:44096 should not
change the behavior of the sockets bound to 0.0.0.0:40000.
It's the same case for IPv6, and this patch also fixes that.
Signed-off-by: Su, Xuemin <suxm@chinanetcenter.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-13 11:02:50 +08:00
|
|
|
const struct in6_addr *saddr, __be16 sport,
|
|
|
|
const struct in6_addr *daddr, unsigned short hnum,
|
2019-06-14 12:41:26 +08:00
|
|
|
int dif, int sdif)
|
2008-10-29 16:41:45 +08:00
|
|
|
{
|
2022-05-14 02:55:41 +08:00
|
|
|
int bound_dev_if, score;
|
2014-12-02 12:29:06 +08:00
|
|
|
struct inet_sock *inet;
|
net: ensure unbound datagram socket to be chosen when not in a VRF
Ensure an unbound datagram skt is chosen when not in a VRF. The check
for a device match in compute_score() for UDP must be performed when
there is no device match. For this, a failure is returned when there is
no device match. This ensures that bound sockets are never selected,
even if there is no unbound socket.
Allow IPv6 packets to be sent over a datagram skt bound to a VRF. These
packets are currently blocked, as flowi6_oif was set to that of the
master vrf device, and the ipi6_ifindex is that of the slave device.
Allow these packets to be sent by checking the device with ipi6_ifindex
has the same L3 scope as that of the bound device of the skt, which is
the master vrf device. Note that this check always succeeds if the skt
is unbound.
Even though the right datagram skt is now selected by compute_score(),
a different skt is being returned that is bound to the wrong vrf. The
difference between these and stream sockets is the handling of the skt
option for SO_REUSEPORT. While the handling when adding a skt for reuse
correctly checks that the bound device of the skt is a match, the skts
in the hashslot are already incorrect. So for the same hash, a skt for
the wrong vrf may be selected for the required port. The root cause is
that the skt is immediately placed into a slot when it is created,
but when the skt is then bound using SO_BINDTODEVICE, it remains in the
same slot. The solution is to move the skt to the correct slot by
forcing a rehash.
Signed-off-by: Mike Manning <mmanning@vyatta.att-mail.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Tested-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-07 23:36:04 +08:00
|
|
|
bool dev_match;
|
2014-12-02 12:29:06 +08:00
|
|
|
|
|
|
|
if (!net_eq(sock_net(sk), net) ||
|
|
|
|
udp_sk(sk)->udp_port_hash != hnum ||
|
|
|
|
sk->sk_family != PF_INET6)
|
|
|
|
return -1;
|
|
|
|
|
2018-12-13 05:15:34 +08:00
|
|
|
if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
|
|
|
|
return -1;
|
|
|
|
|
2014-12-02 12:29:06 +08:00
|
|
|
score = 0;
|
|
|
|
inet = inet_sk(sk);
|
|
|
|
|
|
|
|
if (inet->inet_dport) {
|
|
|
|
if (inet->inet_dport != sport)
|
|
|
|
return -1;
|
|
|
|
score++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!ipv6_addr_any(&sk->sk_v6_daddr)) {
|
|
|
|
if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr))
|
|
|
|
return -1;
|
|
|
|
score++;
|
|
|
|
}
|
|
|
|
|
2022-05-14 02:55:41 +08:00
|
|
|
bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
|
|
|
|
dev_match = udp_sk_bound_dev_eq(net, bound_dev_if, dif, sdif);
|
net: ensure unbound datagram socket to be chosen when not in a VRF
Ensure an unbound datagram skt is chosen when not in a VRF. The check
for a device match in compute_score() for UDP must be performed when
there is no device match. For this, a failure is returned when there is
no device match. This ensures that bound sockets are never selected,
even if there is no unbound socket.
Allow IPv6 packets to be sent over a datagram skt bound to a VRF. These
packets are currently blocked, as flowi6_oif was set to that of the
master vrf device, and the ipi6_ifindex is that of the slave device.
Allow these packets to be sent by checking the device with ipi6_ifindex
has the same L3 scope as that of the bound device of the skt, which is
the master vrf device. Note that this check always succeeds if the skt
is unbound.
Even though the right datagram skt is now selected by compute_score(),
a different skt is being returned that is bound to the wrong vrf. The
difference between these and stream sockets is the handling of the skt
option for SO_REUSEPORT. While the handling when adding a skt for reuse
correctly checks that the bound device of the skt is a match, the skts
in the hashslot are already incorrect. So for the same hash, a skt for
the wrong vrf may be selected for the required port. The root cause is
that the skt is immediately placed into a slot when it is created,
but when the skt is then bound using SO_BINDTODEVICE, it remains in the
same slot. The solution is to move the skt to the correct slot by
forcing a rehash.
Signed-off-by: Mike Manning <mmanning@vyatta.att-mail.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Tested-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-07 23:36:04 +08:00
|
|
|
if (!dev_match)
|
|
|
|
return -1;
|
2022-05-14 02:55:41 +08:00
|
|
|
if (bound_dev_if)
|
2021-10-05 21:03:42 +08:00
|
|
|
score++;
|
2014-12-02 12:29:06 +08:00
|
|
|
|
2019-10-31 04:00:04 +08:00
|
|
|
if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
|
2015-10-09 10:33:21 +08:00
|
|
|
score++;
|
|
|
|
|
2008-10-29 16:41:45 +08:00
|
|
|
return score;
|
|
|
|
}
|
|
|
|
|
udp reuseport: fix packet of same flow hashed to different socket
There is a corner case in which udp packets belonging to a same
flow are hashed to different socket when hslot->count changes from 10
to 11:
1) When hslot->count <= 10, __udp_lib_lookup() searches udp_table->hash,
and always passes 'daddr' to udp_ehashfn().
2) When hslot->count > 10, __udp_lib_lookup() searches udp_table->hash2,
but may pass 'INADDR_ANY' to udp_ehashfn() if the sockets are bound to
INADDR_ANY instead of some specific addr.
That means when hslot->count changes from 10 to 11, the hash calculated by
udp_ehashfn() is also changed, and the udp packets belonging to a same
flow will be hashed to different socket.
This is easily reproduced:
1) Create 10 udp sockets and bind all of them to 0.0.0.0:40000.
2) From the same host send udp packets to 127.0.0.1:40000, record the
socket index which receives the packets.
3) Create 1 more udp socket and bind it to 0.0.0.0:44096. The number 44096
is 40000 + UDP_HASH_SIZE(4096), this makes the new socket put into the
same hslot as the aforementioned 10 sockets, and makes the hslot->count
change from 10 to 11.
4) From the same host send udp packets to 127.0.0.1:40000, and the socket
index which receives the packets will be different from the one received
in step 2.
This should not happen as the socket bound to 0.0.0.0:44096 should not
change the behavior of the sockets bound to 0.0.0.0:40000.
It's the same case for IPv6, and this patch also fixes that.
Signed-off-by: Su, Xuemin <suxm@chinanetcenter.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-13 11:02:50 +08:00
|
|
|
/* called with rcu_read_lock() */
|
2024-08-02 21:40:29 +08:00
|
|
|
static struct sock *udp6_lib_lookup2(const struct net *net,
|
ipv6: udp: optimize unicast RX path
We first locate the (local port) hash chain head
If few sockets are in this chain, we proceed with previous lookup algo.
If too many sockets are listed, we take a look at the secondary
(port, address) hash chain.
We choose the shortest chain and proceed with a RCU lookup on the elected chain.
But, if we chose (port, address) chain, and fail to find a socket on given address,
we must try another lookup on (port, in6addr_any) chain to find sockets not bound
to a particular IP.
-> No extra cost for typical setups, where the first lookup will probably
be performed.
RCU lookups everywhere, we don't acquire spinlock.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-11-08 18:18:30 +08:00
|
|
|
const struct in6_addr *saddr, __be16 sport,
|
2017-08-07 23:44:20 +08:00
|
|
|
const struct in6_addr *daddr, unsigned int hnum,
|
2019-06-14 12:41:26 +08:00
|
|
|
int dif, int sdif, struct udp_hslot *hslot2,
|
|
|
|
struct sk_buff *skb)
|
ipv6: udp: optimize unicast RX path
We first locate the (local port) hash chain head
If few sockets are in this chain, we proceed with previous lookup algo.
If too many sockets are listed, we take a look at the secondary
(port, address) hash chain.
We choose the shortest chain and proceed with a RCU lookup on the elected chain.
But, if we chose (port, address) chain, and fail to find a socket on given address,
we must try another lookup on (port, in6addr_any) chain to find sockets not bound
to a particular IP.
-> No extra cost for typical setups, where the first lookup will probabbly
be performed.
RCU lookups everywhere, we dont acquire spinlock.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-11-08 18:18:30 +08:00
|
|
|
{
|
|
|
|
struct sock *sk, *result;
|
2017-11-30 22:39:34 +08:00
|
|
|
int score, badness;
|
udp: Avoid call to compute_score on multiple sites
We've observed a 7-12% performance regression in iperf3 UDP ipv4 and
ipv6 tests with multiple sockets on Zen3 cpus, which we traced back to
commit f0ea27e7bfe1 ("udp: re-score reuseport groups when connected
sockets are present"). The failing tests were those that would spawn
UDP sockets per-cpu on systems that have a high number of cpus.
Unsurprisingly, it is not caused by the extra re-scoring of the reused
socket, but due to the compiler no longer inlining compute_score, once
it has the extra call site in udp4_lib_lookup2. This is augmented by
the "Safe RET" mitigation for SRSO, needed in our Zen3 cpus.
We could just explicitly inline it, but compute_score() is quite a large
function, around 300b. Inlining in two sites would almost double
udp4_lib_lookup2, which is a silly thing to do just to workaround a
mitigation. Instead, this patch shuffles the code a bit to avoid the
multiple calls to compute_score. Since it is a static function used in
one spot, the compiler can safely fold it in, as it did before, without
increasing the text size.
With this patch applied I ran my original iperf3 testcases. The failing
cases all looked like this (ipv4):
iperf3 -c 127.0.0.1 --udp -4 -f K -b $R -l 8920 -t 30 -i 5 -P 64 -O 2
where $R is either 1G/10G/0 (max, unlimited). I ran 3 times each.
baseline is v6.9-rc3. harmean == harmonic mean; CV == coefficient of
variation.
ipv4:
1G 10G MAX
HARMEAN (CV) HARMEAN (CV) HARMEAN (CV)
baseline 1743852.66(0.0208) 1725933.02(0.0167) 1705203.78(0.0386)
patched 1968727.61(0.0035) 1962283.22(0.0195) 1923853.50(0.0256)
ipv6:
1G 10G MAX
HARMEAN (CV) HARMEAN (CV) HARMEAN (CV)
baseline 1729020.03(0.0028) 1691704.49(0.0243) 1692251.34(0.0083)
patched 1900422.19(0.0067) 1900968.01(0.0067) 1568532.72(0.1519)
This restores the performance we had before the change above with this
benchmark. We obviously don't expect any real impact when mitigations
are disabled, but just to be sure it also doesn't regresses:
mitigations=off ipv4:
1G 10G MAX
HARMEAN (CV) HARMEAN (CV) HARMEAN (CV)
baseline 3230279.97(0.0066) 3229320.91(0.0060) 2605693.19(0.0697)
patched 3242802.36(0.0073) 3239310.71(0.0035) 2502427.19(0.0882)
Cc: Lorenz Bauer <lmb@isovalent.com>
Fixes: f0ea27e7bfe1 ("udp: re-score reuseport groups when connected sockets are present")
Signed-off-by: Gabriel Krisman Bertazi <krisman@suse.de>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-04-13 05:20:04 +08:00
|
|
|
bool need_rescore;
|
ipv6: udp: optimize unicast RX path
We first locate the (local port) hash chain head
If few sockets are in this chain, we proceed with previous lookup algo.
If too many sockets are listed, we take a look at the secondary
(port, address) hash chain.
We choose the shortest chain and proceed with a RCU lookup on the elected chain.
But, if we chose (port, address) chain, and fail to find a socket on given address,
we must try another lookup on (port, in6addr_any) chain to find sockets not bound
to a particular IP.
-> No extra cost for typical setups, where the first lookup will probably
be performed.
RCU lookups everywhere, we don't acquire spinlock.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-11-08 18:18:30 +08:00
|
|
|
|
|
|
|
result = NULL;
|
|
|
|
badness = -1;
|
2016-04-01 23:52:13 +08:00
|
|
|
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
|
udp: Avoid call to compute_score on multiple sites
We've observed a 7-12% performance regression in iperf3 UDP ipv4 and
ipv6 tests with multiple sockets on Zen3 cpus, which we traced back to
commit f0ea27e7bfe1 ("udp: re-score reuseport groups when connected
sockets are present"). The failing tests were those that would spawn
UDP sockets per-cpu on systems that have a high number of cpus.
Unsurprisingly, it is not caused by the extra re-scoring of the reused
socket, but due to the compiler no longer inlining compute_score, once
it has the extra call site in udp4_lib_lookup2. This is augmented by
the "Safe RET" mitigation for SRSO, needed in our Zen3 cpus.
We could just explicitly inline it, but compute_score() is quite a large
function, around 300b. Inlining in two sites would almost double
udp4_lib_lookup2, which is a silly thing to do just to work around a
mitigation. Instead, this patch shuffles the code a bit to avoid the
multiple calls to compute_score. Since it is a static function used in
one spot, the compiler can safely fold it in, as it did before, without
increasing the text size.
With this patch applied I ran my original iperf3 testcases. The failing
cases all looked like this (ipv4):
iperf3 -c 127.0.0.1 --udp -4 -f K -b $R -l 8920 -t 30 -i 5 -P 64 -O 2
where $R is either 1G/10G/0 (max, unlimited). I ran 3 times each.
baseline is v6.9-rc3. harmean == harmonic mean; CV == coefficient of
variation.
ipv4:
1G 10G MAX
HARMEAN (CV) HARMEAN (CV) HARMEAN (CV)
baseline 1743852.66(0.0208) 1725933.02(0.0167) 1705203.78(0.0386)
patched 1968727.61(0.0035) 1962283.22(0.0195) 1923853.50(0.0256)
ipv6:
1G 10G MAX
HARMEAN (CV) HARMEAN (CV) HARMEAN (CV)
baseline 1729020.03(0.0028) 1691704.49(0.0243) 1692251.34(0.0083)
patched 1900422.19(0.0067) 1900968.01(0.0067) 1568532.72(0.1519)
This restores the performance we had before the change above with this
benchmark. We obviously don't expect any real impact when mitigations
are disabled, but just to be sure it also doesn't regress:
mitigations=off ipv4:
1G 10G MAX
HARMEAN (CV) HARMEAN (CV) HARMEAN (CV)
baseline 3230279.97(0.0066) 3229320.91(0.0060) 2605693.19(0.0697)
patched 3242802.36(0.0073) 3239310.71(0.0035) 2502427.19(0.0882)
Cc: Lorenz Bauer <lmb@isovalent.com>
Fixes: f0ea27e7bfe1 ("udp: re-score reuseport groups when connected sockets are present")
Signed-off-by: Gabriel Krisman Bertazi <krisman@suse.de>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-04-13 05:20:04 +08:00
|
|
|
need_rescore = false;
|
|
|
|
rescore:
|
|
|
|
score = compute_score(need_rescore ? result : sk, net, saddr,
|
|
|
|
sport, daddr, hnum, dif, sdif);
|
ipv6: udp: optimize unicast RX path
We first locate the (local port) hash chain head
If few sockets are in this chain, we proceed with previous lookup algo.
If too many sockets are listed, we take a look at the secondary
(port, address) hash chain.
We choose the shortest chain and proceed with a RCU lookup on the elected chain.
But, if we chose (port, address) chain, and fail to find a socket on given address,
we must try another lookup on (port, in6addr_any) chain to find sockets not bound
to a particular IP.
-> No extra cost for typical setups, where the first lookup will probably
be performed.
RCU lookups everywhere, we don't acquire spinlock.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-11-08 18:18:30 +08:00
|
|
|
if (score > badness) {
|
2023-07-20 23:30:05 +08:00
|
|
|
badness = score;
|
2023-07-20 23:30:08 +08:00
|
|
|
|
udp: Avoid call to compute_score on multiple sites
We've observed a 7-12% performance regression in iperf3 UDP ipv4 and
ipv6 tests with multiple sockets on Zen3 cpus, which we traced back to
commit f0ea27e7bfe1 ("udp: re-score reuseport groups when connected
sockets are present"). The failing tests were those that would spawn
UDP sockets per-cpu on systems that have a high number of cpus.
Unsurprisingly, it is not caused by the extra re-scoring of the reused
socket, but due to the compiler no longer inlining compute_score, once
it has the extra call site in udp4_lib_lookup2. This is augmented by
the "Safe RET" mitigation for SRSO, needed in our Zen3 cpus.
We could just explicitly inline it, but compute_score() is quite a large
function, around 300b. Inlining in two sites would almost double
udp4_lib_lookup2, which is a silly thing to do just to work around a
mitigation. Instead, this patch shuffles the code a bit to avoid the
multiple calls to compute_score. Since it is a static function used in
one spot, the compiler can safely fold it in, as it did before, without
increasing the text size.
With this patch applied I ran my original iperf3 testcases. The failing
cases all looked like this (ipv4):
iperf3 -c 127.0.0.1 --udp -4 -f K -b $R -l 8920 -t 30 -i 5 -P 64 -O 2
where $R is either 1G/10G/0 (max, unlimited). I ran 3 times each.
baseline is v6.9-rc3. harmean == harmonic mean; CV == coefficient of
variation.
ipv4:
1G 10G MAX
HARMEAN (CV) HARMEAN (CV) HARMEAN (CV)
baseline 1743852.66(0.0208) 1725933.02(0.0167) 1705203.78(0.0386)
patched 1968727.61(0.0035) 1962283.22(0.0195) 1923853.50(0.0256)
ipv6:
1G 10G MAX
HARMEAN (CV) HARMEAN (CV) HARMEAN (CV)
baseline 1729020.03(0.0028) 1691704.49(0.0243) 1692251.34(0.0083)
patched 1900422.19(0.0067) 1900968.01(0.0067) 1568532.72(0.1519)
This restores the performance we had before the change above with this
benchmark. We obviously don't expect any real impact when mitigations
are disabled, but just to be sure it also doesn't regress:
mitigations=off ipv4:
1G 10G MAX
HARMEAN (CV) HARMEAN (CV) HARMEAN (CV)
baseline 3230279.97(0.0066) 3229320.91(0.0060) 2605693.19(0.0697)
patched 3242802.36(0.0073) 3239310.71(0.0035) 2502427.19(0.0882)
Cc: Lorenz Bauer <lmb@isovalent.com>
Fixes: f0ea27e7bfe1 ("udp: re-score reuseport groups when connected sockets are present")
Signed-off-by: Gabriel Krisman Bertazi <krisman@suse.de>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-04-13 05:20:04 +08:00
|
|
|
if (need_rescore)
|
|
|
|
continue;
|
|
|
|
|
2023-07-20 23:30:08 +08:00
|
|
|
if (sk->sk_state == TCP_ESTABLISHED) {
|
|
|
|
result = sk;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
result = inet6_lookup_reuseport(net, sk, skb, sizeof(struct udphdr),
|
|
|
|
saddr, sport, daddr, hnum, udp6_ehashfn);
|
2023-07-20 23:30:05 +08:00
|
|
|
if (!result) {
|
|
|
|
result = sk;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2020-07-26 08:49:04 +08:00
|
|
|
/* Fall back to scoring if group has connections */
|
2023-07-20 23:30:05 +08:00
|
|
|
if (!reuseport_has_conns(sk))
|
2020-07-17 18:35:30 +08:00
|
|
|
return result;
|
|
|
|
|
2023-07-20 23:30:05 +08:00
|
|
|
/* Reuseport logic returned an error, keep original score. */
|
|
|
|
if (IS_ERR(result))
|
|
|
|
continue;
|
|
|
|
|
udp: Avoid call to compute_score on multiple sites
We've observed a 7-12% performance regression in iperf3 UDP ipv4 and
ipv6 tests with multiple sockets on Zen3 cpus, which we traced back to
commit f0ea27e7bfe1 ("udp: re-score reuseport groups when connected
sockets are present"). The failing tests were those that would spawn
UDP sockets per-cpu on systems that have a high number of cpus.
Unsurprisingly, it is not caused by the extra re-scoring of the reused
socket, but due to the compiler no longer inlining compute_score, once
it has the extra call site in udp4_lib_lookup2. This is augmented by
the "Safe RET" mitigation for SRSO, needed in our Zen3 cpus.
We could just explicitly inline it, but compute_score() is quite a large
function, around 300b. Inlining in two sites would almost double
udp4_lib_lookup2, which is a silly thing to do just to work around a
mitigation. Instead, this patch shuffles the code a bit to avoid the
multiple calls to compute_score. Since it is a static function used in
one spot, the compiler can safely fold it in, as it did before, without
increasing the text size.
With this patch applied I ran my original iperf3 testcases. The failing
cases all looked like this (ipv4):
iperf3 -c 127.0.0.1 --udp -4 -f K -b $R -l 8920 -t 30 -i 5 -P 64 -O 2
where $R is either 1G/10G/0 (max, unlimited). I ran 3 times each.
baseline is v6.9-rc3. harmean == harmonic mean; CV == coefficient of
variation.
ipv4:
1G 10G MAX
HARMEAN (CV) HARMEAN (CV) HARMEAN (CV)
baseline 1743852.66(0.0208) 1725933.02(0.0167) 1705203.78(0.0386)
patched 1968727.61(0.0035) 1962283.22(0.0195) 1923853.50(0.0256)
ipv6:
1G 10G MAX
HARMEAN (CV) HARMEAN (CV) HARMEAN (CV)
baseline 1729020.03(0.0028) 1691704.49(0.0243) 1692251.34(0.0083)
patched 1900422.19(0.0067) 1900968.01(0.0067) 1568532.72(0.1519)
This restores the performance we had before the change above with this
benchmark. We obviously don't expect any real impact when mitigations
are disabled, but just to be sure it also doesn't regress:
mitigations=off ipv4:
1G 10G MAX
HARMEAN (CV) HARMEAN (CV) HARMEAN (CV)
baseline 3230279.97(0.0066) 3229320.91(0.0060) 2605693.19(0.0697)
patched 3242802.36(0.0073) 3239310.71(0.0035) 2502427.19(0.0882)
Cc: Lorenz Bauer <lmb@isovalent.com>
Fixes: f0ea27e7bfe1 ("udp: re-score reuseport groups when connected sockets are present")
Signed-off-by: Gabriel Krisman Bertazi <krisman@suse.de>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-04-13 05:20:04 +08:00
|
|
|
/* compute_score is too long of a function to be
|
|
|
|
* inlined, and calling it again here yields
|
|
|
|
* measureable overhead for some
|
|
|
|
* workloads. Work around it by jumping
|
|
|
|
* backwards to rescore 'result'.
|
|
|
|
*/
|
|
|
|
need_rescore = true;
|
|
|
|
goto rescore;
|
ipv6: udp: optimize unicast RX path
We first locate the (local port) hash chain head
If few sockets are in this chain, we proceed with previous lookup algo.
If too many sockets are listed, we take a look at the secondary
(port, address) hash chain.
We choose the shortest chain and proceed with a RCU lookup on the elected chain.
But, if we chose (port, address) chain, and fail to find a socket on given address,
we must try another lookup on (port, in6addr_any) chain to find sockets not bound
to a particular IP.
-> No extra cost for typical setups, where the first lookup will probably
be performed.
RCU lookups everywhere, we don't acquire spinlock.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-11-08 18:18:30 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2016-04-01 23:52:13 +08:00
|
|
|
/* rcu_read_lock() must be held */
|
2024-08-02 21:40:29 +08:00
|
|
|
/*
 * Find the UDPv6 socket that should receive a datagram described by
 * (saddr, sport) -> (daddr, dport) in the given udptable.
 *
 * dport is converted with ntohs() before use; sport is handed to the
 * helpers unchanged.  dif and sdif are forwarded untouched to the lookup
 * helpers (NOTE(review): presumably the incoming and L3-master ifindex -
 * confirm against callers).  skb is passed through to the helpers.
 *
 * Lookup order:
 *   1. The hash2 slot for (daddr, port).  A valid result whose sk_state is
 *      TCP_ESTABLISHED wins immediately.
 *   2. inet6_lookup_run_sk_lookup(), only when bpf_sk_lookup_enabled is set
 *      and udptable is net->ipv4.udp_table.  Any non-NULL result replaces
 *      the result of step 1.
 *   3. Otherwise, whatever step 1 produced, if it was not NULL.
 *   4. The hash2 slot for (in6addr_any, port).
 *
 * Returns the selected socket, or NULL when nothing matched or when the
 * chosen result is an error pointer.
 */
struct sock *__udp6_lib_lookup(const struct net *net,
|
2017-08-07 23:44:20 +08:00
|
|
|
const struct in6_addr *saddr, __be16 sport,
|
|
|
|
const struct in6_addr *daddr, __be16 dport,
|
|
|
|
int dif, int sdif, struct udp_table *udptable,
|
|
|
|
struct sk_buff *skb)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
unsigned short hnum = ntohs(dport);
|
2018-12-13 05:15:34 +08:00
|
|
|
unsigned int hash2, slot2;
|
|
|
|
struct udp_hslot *hslot2;
|
2020-07-17 18:35:31 +08:00
|
|
|
struct sock *result, *sk;
|
2008-10-29 16:41:45 +08:00
|
|
|
|
|
2018-12-13 05:15:34 +08:00
|
|
|
hash2 = ipv6_portaddr_hash(net, daddr, hnum);
|
|
|
|
slot2 = hash2 & udptable->mask;
|
|
|
|
hslot2 = &udptable->hash2[slot2];
|
|
|
|
|
2020-07-17 18:35:31 +08:00
|
|
|
/* Lookup connected or non-wildcard sockets */
|
2018-12-13 05:15:34 +08:00
|
|
|
result = udp6_lib_lookup2(net, saddr, sport,
|
2019-06-14 12:41:26 +08:00
|
|
|
daddr, hnum, dif, sdif,
|
2018-12-13 05:15:34 +08:00
|
|
|
hslot2, skb);
|
2020-07-17 18:35:31 +08:00
|
|
|
if (!IS_ERR_OR_NULL(result) && result->sk_state == TCP_ESTABLISHED)
|
|
|
|
goto done;
|
|
|
|
|
|
|
|
/* Lookup redirect from BPF */
|
2023-07-20 23:30:10 +08:00
|
|
|
if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
|
|
|
|
udptable == net->ipv4.udp_table) {
|
|
|
|
sk = inet6_lookup_run_sk_lookup(net, IPPROTO_UDP, skb, sizeof(struct udphdr),
|
|
|
|
saddr, sport, daddr, hnum, dif,
|
|
|
|
udp6_ehashfn);
|
2020-07-17 18:35:31 +08:00
|
|
|
if (sk) {
|
|
|
|
result = sk;
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
}
|
2018-12-13 05:15:34 +08:00
|
|
|
|
2020-07-17 18:35:31 +08:00
|
|
|
/* Got non-wildcard socket or error on first lookup */
|
|
|
|
if (result)
|
|
|
|
goto done;
|
ipv6: udp: optimize unicast RX path
We first locate the (local port) hash chain head
If few sockets are in this chain, we proceed with previous lookup algo.
If too many sockets are listed, we take a look at the secondary
(port, address) hash chain.
We choose the shortest chain and proceed with a RCU lookup on the elected chain.
But, if we chose (port, address) chain, and fail to find a socket on given address,
we must try another lookup on (port, in6addr_any) chain to find sockets not bound
to a particular IP.
-> No extra cost for typical setups, where the first lookup will probabbly
be performed.
RCU lookups everywhere, we dont acquire spinlock.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-11-08 18:18:30 +08:00
|
|
|
|
2020-07-17 18:35:31 +08:00
|
|
|
/* Lookup wildcard sockets */
|
|
|
|
hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
|
|
|
|
slot2 = hash2 & udptable->mask;
|
|
|
|
hslot2 = &udptable->hash2[slot2];
|
|
|
|
|
|
|
|
result = udp6_lib_lookup2(net, saddr, sport,
|
|
|
|
&in6addr_any, hnum, dif, sdif,
|
|
|
|
hslot2, skb);
|
|
|
|
done:
|
2019-06-06 05:11:34 +08:00
|
|
|
/* Error pointers produced by the lookups are reported to callers as NULL. */
if (IS_ERR(result))
|
2018-12-13 05:15:34 +08:00
|
|
|
return NULL;
|
2005-04-17 06:20:36 +08:00
|
|
|
return result;
|
|
|
|
}
|
2011-12-09 14:23:34 +08:00
|
|
|
EXPORT_SYMBOL_GPL(__udp6_lib_lookup);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-10-08 03:38:32 +08:00
|
|
|
/*
 * Look up the receiving socket for skb in udptable.
 *
 * The source and destination addresses are read from the header returned
 * by ipv6_hdr(skb); the ports are supplied by the caller.  The namespace
 * comes from dev_net(skb->dev), and the dif/sdif arguments from
 * inet6_iif(skb) and inet6_sdif(skb).  The skb itself is forwarded to
 * __udp6_lib_lookup(), whose result is returned unchanged.
 */
static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
|
|
|
|
__be16 sport, __be16 dport,
|
2008-10-29 16:41:45 +08:00
|
|
|
struct udp_table *udptable)
|
2008-10-08 03:38:32 +08:00
|
|
|
{
|
2011-04-22 12:53:02 +08:00
|
|
|
const struct ipv6hdr *iph = ipv6_hdr(skb);
|
2008-10-08 03:38:32 +08:00
|
|
|
|
|
2016-05-13 07:23:44 +08:00
|
|
|
return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport,
|
2009-06-02 13:19:30 +08:00
|
|
|
&iph->daddr, dport, inet6_iif(skb),
|
2017-08-07 23:44:20 +08:00
|
|
|
inet6_sdif(skb), udptable, skb);
|
2008-10-08 03:38:32 +08:00
|
|
|
}
|
|
|
|
|
2020-11-10 07:13:49 +08:00
|
|
|
/*
 * Socket lookup for an skb whose IPv6 header position is recorded in the
 * GRO control block rather than taken from ipv6_hdr().
 *
 * The IPv6 header is located at skb->data plus
 * NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation].  The interface
 * arguments come from inet6_get_iif_sdif().  The search always uses
 * net->ipv4.udp_table of dev_net(skb->dev) and passes a NULL skb to
 * __udp6_lib_lookup(), whose result is returned unchanged.
 */
struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb,
|
2016-04-05 23:22:50 +08:00
|
|
|
__be16 sport, __be16 dport)
|
|
|
|
{
|
net: gro: fix udp bad offset in socket lookup by adding {inner_}network_offset to napi_gro_cb
Commits a602456 ("udp: Add GRO functions to UDP socket") and 57c67ff ("udp:
additional GRO support") introduce incorrect usage of {ip,ipv6}_hdr in the
complete phase of gro. The functions always return skb->network_header,
which in the case of encapsulated packets at the gro complete phase, is
always set to the innermost L3 of the packet. That means that calling
{ip,ipv6}_hdr for skbs which completed the GRO receive phase (both in
gro_list and *_gro_complete) when parsing an encapsulated packet's _outer_
L3/L4 may return an unexpected value.
This incorrect usage leads to a bug in GRO's UDP socket lookup.
udp{4,6}_lib_lookup_skb functions use ip_hdr/ipv6_hdr respectively. These
*_hdr functions return network_header which will point to the innermost L3,
resulting in the wrong offset being used in __udp{4,6}_lib_lookup with
encapsulated packets.
This patch adds network_offset and inner_network_offset to napi_gro_cb, and
makes sure both are set correctly.
To fix the issue, network_offsets union is used inside napi_gro_cb, in
which both the outer and the inner network offsets are saved.
Reproduction example:
Endpoint configuration example (fou + local address bind)
# ip fou add port 6666 ipproto 4
# ip link add name tun1 type ipip remote 2.2.2.1 local 2.2.2.2 encap fou encap-dport 5555 encap-sport 6666 mode ipip
# ip link set tun1 up
# ip a add 1.1.1.2/24 dev tun1
Netperf TCP_STREAM result on net-next before patch is applied:
net-next main, GRO enabled:
$ netperf -H 1.1.1.2 -t TCP_STREAM -l 5
Recv Send Send
Socket Socket Message Elapsed
Size Size Size Time Throughput
bytes bytes bytes secs. 10^6bits/sec
131072 16384 16384 5.28 2.37
net-next main, GRO disabled:
$ netperf -H 1.1.1.2 -t TCP_STREAM -l 5
Recv Send Send
Socket Socket Message Elapsed
Size Size Size Time Throughput
bytes bytes bytes secs. 10^6bits/sec
131072 16384 16384 5.01 2745.06
patch applied, GRO enabled:
$ netperf -H 1.1.1.2 -t TCP_STREAM -l 5
Recv Send Send
Socket Socket Message Elapsed
Size Size Size Time Throughput
bytes bytes bytes secs. 10^6bits/sec
131072 16384 16384 5.01 2877.38
Fixes: a6024562ffd7 ("udp: Add GRO functions to UDP socket")
Signed-off-by: Richard Gobert <richardbgobert@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2024-04-30 22:35:54 +08:00
|
|
|
const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
|
|
|
|
const struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + offset);
|
2022-11-15 05:57:56 +08:00
|
|
|
struct net *net = dev_net(skb->dev);
|
2023-07-27 23:33:56 +08:00
|
|
|
int iif, sdif;
|
|
|
|
|
|
|
|
inet6_get_iif_sdif(skb, &iif, &sdif);
|
2016-04-05 23:22:50 +08:00
|
|
|
|
|
2022-11-15 05:57:56 +08:00
|
|
|
return __udp6_lib_lookup(net, &iph->saddr, sport,
|
2023-07-27 23:33:56 +08:00
|
|
|
&iph->daddr, dport, iif,
|
|
|
|
sdif, net->ipv4.udp_table, NULL);
|
2016-04-05 23:22:50 +08:00
|
|
|
}
|
|
|
|
|
2016-04-01 23:52:13 +08:00
|
|
|
/* Must be called under rcu_read_lock().
|
|
|
|
* Does increment socket refcount.
|
|
|
|
*/
|
2018-06-05 19:40:34 +08:00
|
|
|
#if IS_ENABLED(CONFIG_NF_TPROXY_IPV6) || IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
|
2024-08-02 21:40:29 +08:00
|
|
|
/*
 * Reference-taking wrapper around __udp6_lib_lookup().
 *
 * Searches net->ipv4.udp_table with sdif fixed to 0 and no skb.  When a
 * socket is found, refcount_inc_not_zero() is used to take a reference on
 * sk_refcnt; if that fails the socket is treated as not found.
 *
 * Returns the socket with its refcount incremented, or NULL.
 */
struct sock *udp6_lib_lookup(const struct net *net, const struct in6_addr *saddr, __be16 sport,
|
2010-10-21 22:05:41 +08:00
|
|
|
const struct in6_addr *daddr, __be16 dport, int dif)
|
|
|
|
{
|
2016-04-01 23:52:13 +08:00
|
|
|
struct sock *sk;
|
|
|
|
|
|
|
|
sk = __udp6_lib_lookup(net, saddr, sport, daddr, dport,
|
2022-11-15 05:57:56 +08:00
|
|
|
dif, 0, net->ipv4.udp_table, NULL);
|
2017-06-30 18:08:01 +08:00
|
|
|
/* Drop the result if the refcount had already reached zero. */
if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
|
2016-04-01 23:52:13 +08:00
|
|
|
sk = NULL;
|
|
|
|
return sk;
|
2010-10-21 22:05:41 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(udp6_lib_lookup);
|
2016-04-01 23:52:13 +08:00
|
|
|
#endif
|
2010-10-21 22:05:41 +08:00
|
|
|
|
2017-07-31 22:52:36 +08:00
|
|
|
/* do not use the scratch area len for jumbograms: their length exceeds the
|
|
|
|
* scratch area space; note that the IP6CB flags is still in the first
|
|
|
|
* cacheline, so checking for jumbograms is cheap
|
|
|
|
*/
|
|
|
|
/*
 * Return the length to use for a received UDPv6 skb: skb->len when
 * inet6_is_jumbogram(skb) is true (marked unlikely), otherwise the value
 * of udp_skb_len(skb).
 */
static int udp6_skb_len(struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
return unlikely(inet6_is_jumbogram(skb)) ? skb->len : udp_skb_len(skb);
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
2014-08-25 04:53:10 +08:00
|
|
|
* This should be easy, if there is something there we
|
|
|
|
* return it, otherwise we block.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
|
2015-03-02 15:37:48 +08:00
|
|
|
int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
|
net: remove noblock parameter from recvmsg() entities
The internal recvmsg() functions have two parameters 'flags' and 'noblock'
that were merged inside skb_recv_datagram(). As a follow up patch to commit
f4b41f062c42 ("net: remove noblock parameter from skb_recv_datagram()")
this patch removes the separate 'noblock' parameter for recvmsg().
Analogous to the referenced patch for skb_recv_datagram() the 'flags' and
'noblock' parameters are unnecessarily split up with e.g.
err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
flags & ~MSG_DONTWAIT, &addr_len);
or in
err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udp_recvmsg,
sk, msg, size, flags & MSG_DONTWAIT,
flags & ~MSG_DONTWAIT, &addr_len);
instead of simply using only flags all the time and check for MSG_DONTWAIT
where needed (to preserve for the formerly separated no(n)block condition).
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://lore.kernel.org/r/20220411124955.154876-1-socketcan@hartkopp.net
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2022-04-11 20:49:55 +08:00
|
|
|
int flags, int *addr_len)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
struct ipv6_pinfo *np = inet6_sk(sk);
|
|
|
|
struct inet_sock *inet = inet_sk(sk);
|
2007-02-09 22:24:49 +08:00
|
|
|
struct sk_buff *skb;
|
2011-12-02 03:12:55 +08:00
|
|
|
unsigned int ulen, copied;
|
2019-04-08 16:15:59 +08:00
|
|
|
int off, err, peeking = flags & MSG_PEEK;
|
2007-03-26 11:10:56 +08:00
|
|
|
int is_udplite = IS_UDPLITE(sk);
|
2019-02-22 00:43:57 +08:00
|
|
|
struct udp_mib __percpu *mib;
|
udp: properly support MSG_PEEK with truncated buffers
Backport of this upstream commit into stable kernels :
89c22d8c3b27 ("net: Fix skb csum races when peeking")
exposed a bug in udp stack vs MSG_PEEK support, when user provides
a buffer smaller than skb payload.
In this case,
skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr),
msg->msg_iov);
returns -EFAULT.
This bug does not happen in upstream kernels since Al Viro did a great
job to replace this into :
skb_copy_and_csum_datagram_msg(skb, sizeof(struct udphdr), msg);
This variant is safe vs short buffers.
For the time being, instead of reverting Herbert Xu's patch and adding back
skb->ip_summed invalid changes, simply store the result of
udp_lib_checksum_complete() so that we avoid computing the checksum a
second time, and avoid the problematic
skb_copy_and_csum_datagram_iovec() call.
This patch can be applied on recent kernels as it avoids a double
checksumming, then backported to stable kernels as a bug fix.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-12-30 21:51:12 +08:00
|
|
|
bool checksum_valid = false;
|
2008-11-03 00:11:01 +08:00
|
|
|
int is_udp4;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if (flags & MSG_ERRQUEUE)
|
2013-11-23 07:46:12 +08:00
|
|
|
return ipv6_recv_error(sk, msg, len, addr_len);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2010-04-23 19:26:09 +08:00
|
|
|
if (np->rxpmtu && np->rxopt.bits.rxpmtu)
|
2013-11-23 07:46:12 +08:00
|
|
|
return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
|
2010-04-23 19:26:09 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
try_again:
|
datagram: When peeking datagrams with offset < 0 don't skip empty skbs
Due to commit e6afc8ace6dd5cef5e812f26c72579da8806f5ac ("udp: remove
headers from UDP packets before queueing"), when udp packets are being
peeked the requested extra offset is always 0 as there is no need to skip
the udp header. However, when the offset is 0 and the next skb is
of length 0, it is only returned once. The behaviour can be seen with
the following python script:
from socket import *;
f=socket(AF_INET6, SOCK_DGRAM | SOCK_NONBLOCK, 0);
g=socket(AF_INET6, SOCK_DGRAM | SOCK_NONBLOCK, 0);
f.bind(('::', 0));
addr=('::1', f.getsockname()[1]);
g.sendto(b'', addr)
g.sendto(b'b', addr)
print(f.recvfrom(10, MSG_PEEK));
print(f.recvfrom(10, MSG_PEEK));
Where the expected output should be the empty string twice.
Instead, make sk_peek_offset return negative values, and pass those values
to __skb_try_recv_datagram/__skb_try_recv_from_queue. If the passed offset
to __skb_try_recv_from_queue is negative, the checked skb is never skipped.
__skb_try_recv_from_queue will then ensure the offset is reset back to 0
if a peek is requested without an offset, unless no packets are found.
Also simplify the if condition in __skb_try_recv_from_queue. If _off is
greater than 0, and off is greater than or equal to skb->len, then
(_off || skb->len) must always be true assuming skb->len >= 0 is always
true.
Also remove a redundant check around a call to sk_peek_offset in af_unix.c,
as it double checked if MSG_PEEK was set in the flags.
V2:
- Moved the negative fixup into __skb_try_recv_from_queue, and remove now
redundant checks
- Fix peeking in udp{,v6}_recvmsg to report the right value when the
offset is 0
V3:
- Marked new branch in __skb_try_recv_from_queue as unlikely.
Signed-off-by: Matthew Dawson <matthew@mjdsystems.ca>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-08-19 03:04:54 +08:00
|
|
|
off = sk_peek_offset(sk, flags);
|
net: remove noblock parameter from recvmsg() entities
The internal recvmsg() functions have two parameters 'flags' and 'noblock'
that were merged inside skb_recv_datagram(). As a follow up patch to commit
f4b41f062c42 ("net: remove noblock parameter from skb_recv_datagram()")
this patch removes the separate 'noblock' parameter for recvmsg().
Analogous to the referenced patch for skb_recv_datagram() the 'flags' and
'noblock' parameters are unnecessarily split up with e.g.
err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
flags & ~MSG_DONTWAIT, &addr_len);
or in
err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udp_recvmsg,
sk, msg, size, flags & MSG_DONTWAIT,
flags & ~MSG_DONTWAIT, &addr_len);
instead of simply using only flags all the time and check for MSG_DONTWAIT
where needed (to preserve for the formerly separated no(n)block condition).
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://lore.kernel.org/r/20220411124955.154876-1-socketcan@hartkopp.net
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2022-04-11 20:49:55 +08:00
|
|
|
skb = __skb_recv_udp(sk, flags, &off, &err);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!skb)
|
2016-04-06 00:41:16 +08:00
|
|
|
return err;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2017-07-31 22:52:36 +08:00
|
|
|
ulen = udp6_skb_len(skb);
|
2011-12-02 03:12:55 +08:00
|
|
|
copied = len;
|
2016-04-06 00:41:16 +08:00
|
|
|
if (copied > ulen - off)
|
|
|
|
copied = ulen - off;
|
2011-12-02 03:12:55 +08:00
|
|
|
else if (copied < ulen)
|
2007-02-09 22:24:49 +08:00
|
|
|
msg->msg_flags |= MSG_TRUNC;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-11-03 00:11:01 +08:00
|
|
|
is_udp4 = (skb->protocol == htons(ETH_P_IP));
|
2018-11-09 22:52:45 +08:00
|
|
|
mib = __UDPX_MIB(sk, is_udp4);
|
2008-11-03 00:11:01 +08:00
|
|
|
|
2006-11-28 03:10:57 +08:00
|
|
|
/*
|
2007-03-26 11:10:56 +08:00
|
|
|
* If checksum is needed at all, try to do it while copying the
|
|
|
|
* data. If the data is truncated, or if we only want a partial
|
|
|
|
* coverage checksum (UDP-Lite), do it before the copy.
|
2006-11-28 03:10:57 +08:00
|
|
|
*/
|
|
|
|
|
2016-11-19 09:18:03 +08:00
|
|
|
if (copied < ulen || peeking ||
|
|
|
|
(is_udplite && UDP_SKB_CB(skb)->partial_cov)) {
|
2017-06-27 01:01:51 +08:00
|
|
|
checksum_valid = udp_skb_csum_unnecessary(skb) ||
|
|
|
|
!__udp_lib_checksum_complete(skb);
|
udp: properly support MSG_PEEK with truncated buffers
Backport of this upstream commit into stable kernels :
89c22d8c3b27 ("net: Fix skb csum races when peeking")
exposed a bug in udp stack vs MSG_PEEK support, when user provides
a buffer smaller than skb payload.
In this case,
skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr),
msg->msg_iov);
returns -EFAULT.
This bug does not happen in upstream kernels since Al Viro did a great
job to replace this into :
skb_copy_and_csum_datagram_msg(skb, sizeof(struct udphdr), msg);
This variant is safe vs short buffers.
For the time being, instead reverting Herbert Xu patch and add back
skb->ip_summed invalid changes, simply store the result of
udp_lib_checksum_complete() so that we avoid computing the checksum a
second time, and avoid the problematic
skb_copy_and_csum_datagram_iovec() call.
This patch can be applied on recent kernels as it avoids a double
checksumming, then backported to stable kernels as a bug fix.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-12-30 21:51:12 +08:00
|
|
|
if (!checksum_valid)
|
2005-04-17 06:20:36 +08:00
|
|
|
goto csum_copy_err;
|
2006-11-28 03:10:57 +08:00
|
|
|
}
|
|
|
|
|
2017-06-27 01:01:51 +08:00
|
|
|
if (checksum_valid || udp_skb_csum_unnecessary(skb)) {
|
|
|
|
if (udp_skb_is_linear(skb))
|
|
|
|
err = copy_linear_skb(skb, copied, off, &msg->msg_iter);
|
|
|
|
else
|
|
|
|
err = skb_copy_datagram_msg(skb, off, msg, copied);
|
|
|
|
} else {
|
2016-04-06 00:41:16 +08:00
|
|
|
err = skb_copy_and_csum_datagram_msg(skb, off, msg);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (err == -EINVAL)
|
|
|
|
goto csum_copy_err;
|
|
|
|
}
|
2012-06-27 08:23:44 +08:00
|
|
|
if (unlikely(err)) {
|
2019-04-08 16:15:59 +08:00
|
|
|
if (!peeking) {
|
2012-09-06 07:34:44 +08:00
|
|
|
atomic_inc(&sk->sk_drops);
|
2018-11-09 22:52:45 +08:00
|
|
|
SNMP_INC_STATS(mib, UDP_MIB_INERRORS);
|
2012-09-06 07:34:44 +08:00
|
|
|
}
|
2016-10-21 19:55:47 +08:00
|
|
|
kfree_skb(skb);
|
2016-04-06 00:41:16 +08:00
|
|
|
return err;
|
2012-06-27 08:23:44 +08:00
|
|
|
}
|
2019-04-08 16:15:59 +08:00
|
|
|
if (!peeking)
|
2018-11-09 22:52:45 +08:00
|
|
|
SNMP_INC_STATS(mib, UDP_MIB_INDATAGRAMS);
|
2007-12-03 19:33:28 +08:00
|
|
|
|
2022-04-28 04:02:37 +08:00
|
|
|
sock_recv_cmsgs(msg, sk, skb);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Copy the address. */
|
|
|
|
if (msg->msg_name) {
|
2014-01-18 05:53:15 +08:00
|
|
|
DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
|
2005-04-17 06:20:36 +08:00
|
|
|
sin6->sin6_family = AF_INET6;
|
2007-03-14 01:28:48 +08:00
|
|
|
sin6->sin6_port = udp_hdr(skb)->source;
|
2005-04-17 06:20:36 +08:00
|
|
|
sin6->sin6_flowinfo = 0;
|
|
|
|
|
2013-03-08 10:07:19 +08:00
|
|
|
if (is_udp4) {
|
2009-10-08 04:58:25 +08:00
|
|
|
ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr,
|
|
|
|
&sin6->sin6_addr);
|
2013-03-08 10:07:19 +08:00
|
|
|
sin6->sin6_scope_id = 0;
|
|
|
|
} else {
|
2011-11-21 11:39:03 +08:00
|
|
|
sin6->sin6_addr = ipv6_hdr(skb)->saddr;
|
2013-03-08 10:07:19 +08:00
|
|
|
sin6->sin6_scope_id =
|
|
|
|
ipv6_iface_scope_id(&sin6->sin6_addr,
|
2014-08-01 09:52:58 +08:00
|
|
|
inet6_iif(skb));
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2013-11-18 11:20:45 +08:00
|
|
|
*addr_len = sizeof(*sin6);
|
bpf: fix unconnected udp hooks
Intention of cgroup bind/connect/sendmsg BPF hooks is to act transparently
to applications as also stated in original motivation in 7828f20e3779 ("Merge
branch 'bpf-cgroup-bind-connect'"). When recently integrating the latter
two hooks into Cilium to enable host based load-balancing with Kubernetes,
I ran into the issue that pods couldn't start up as DNS got broken. Kubernetes
typically sets up DNS as a service and is thus subject to load-balancing.
Upon further debugging, it turns out that the cgroupv2 sendmsg BPF hooks API
is currently insufficient and thus not usable as-is for standard applications
shipped with most distros. To break down the issue we ran into with a simple
example:
# cat /etc/resolv.conf
nameserver 147.75.207.207
nameserver 147.75.207.208
For the purpose of a simple test, we set up above IPs as service IPs and
transparently redirect traffic to a different DNS backend server for that
node:
# cilium service list
ID Frontend Backend
1 147.75.207.207:53 1 => 8.8.8.8:53
2 147.75.207.208:53 1 => 8.8.8.8:53
The attached BPF program is basically selecting one of the backends if the
service IP/port matches on the cgroup hook. DNS breaks here, because the
hooks are not transparent enough to applications which have built-in msg_name
address checks:
# nslookup 1.1.1.1
;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53
;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.208#53
;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53
[...]
;; connection timed out; no servers could be reached
# dig 1.1.1.1
;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53
;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.208#53
;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53
[...]
; <<>> DiG 9.11.3-1ubuntu1.7-Ubuntu <<>> 1.1.1.1
;; global options: +cmd
;; connection timed out; no servers could be reached
For comparison, if none of the service IPs is used, and we tell nslookup
to use 8.8.8.8 directly it works just fine, of course:
# nslookup 1.1.1.1 8.8.8.8
1.1.1.1.in-addr.arpa name = one.one.one.one.
In order to fix this and thus act more transparent to the application,
this needs reverse translation on recvmsg() side. A minimal fix for this
API is to add similar recvmsg() hooks behind the BPF cgroups static key
such that the program can track state and replace the current sockaddr_in{,6}
with the original service IP. From BPF side, this basically tracks the
service tuple plus socket cookie in an LRU map where the reverse NAT can
then be retrieved via map value as one example. Side-note: the BPF cgroups
static key should be converted to a per-hook static key in future.
Same example after this fix:
# cilium service list
ID Frontend Backend
1 147.75.207.207:53 1 => 8.8.8.8:53
2 147.75.207.208:53 1 => 8.8.8.8:53
Lookups work fine now:
# nslookup 1.1.1.1
1.1.1.1.in-addr.arpa name = one.one.one.one.
Authoritative answers can be found from:
# dig 1.1.1.1
; <<>> DiG 9.11.3-1ubuntu1.7-Ubuntu <<>> 1.1.1.1
;; global options: +cmd
;; Got answer:
;; ->>HEADER<<- opcode: QUERY, status: NXDOMAIN, id: 51550
;; flags: qr rd ra ad; QUERY: 1, ANSWER: 0, AUTHORITY: 1, ADDITIONAL: 1
;; OPT PSEUDOSECTION:
; EDNS: version: 0, flags:; udp: 512
;; QUESTION SECTION:
;1.1.1.1. IN A
;; AUTHORITY SECTION:
. 23426 IN SOA a.root-servers.net. nstld.verisign-grs.com. 2019052001 1800 900 604800 86400
;; Query time: 17 msec
;; SERVER: 147.75.207.207#53(147.75.207.207)
;; WHEN: Tue May 21 12:59:38 UTC 2019
;; MSG SIZE rcvd: 111
And from an actual packet level it shows that we're using the back end
server when talking via 147.75.207.20{7,8} front end:
# tcpdump -i any udp
[...]
12:59:52.698732 IP foo.42011 > google-public-dns-a.google.com.domain: 18803+ PTR? 1.1.1.1.in-addr.arpa. (38)
12:59:52.698735 IP foo.42011 > google-public-dns-a.google.com.domain: 18803+ PTR? 1.1.1.1.in-addr.arpa. (38)
12:59:52.701208 IP google-public-dns-a.google.com.domain > foo.42011: 18803 1/0/0 PTR one.one.one.one. (67)
12:59:52.701208 IP google-public-dns-a.google.com.domain > foo.42011: 18803 1/0/0 PTR one.one.one.one. (67)
[...]
In order to be flexible and to have same semantics as in sendmsg BPF
programs, we only allow return codes in [1,1] range. In the sendmsg case
the program is called if msg->msg_name is present which can be the case
in both, connected and unconnected UDP.
The former only relies on the sockaddr_in{,6} passed via connect(2) if
passed msg->msg_name was NULL. Therefore, on recvmsg side, we act in similar
way to call into the BPF program whenever a non-NULL msg->msg_name was
passed independent of sk->sk_state being TCP_ESTABLISHED or not. Note
that for TCP case, the msg->msg_name is ignored in the regular recvmsg
path and therefore not relevant.
For the case of ip{,v6}_recv_error() paths, picked up via MSG_ERRQUEUE,
the hook is not called. This is intentional as it aligns with the same
semantics as in case of TCP cgroup BPF hooks right now. This might be
better addressed in future through a different bpf_attach_type such
that this case can be distinguished from the regular recvmsg paths,
for example.
Fixes: 1cedee13d25a ("bpf: Hooks for sys_sendmsg")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Martynas Pumputis <m@lambda.lt>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2019-06-07 07:48:57 +08:00
|
|
|
|
2021-01-16 00:35:01 +08:00
|
|
|
BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk,
|
2023-10-12 02:51:04 +08:00
|
|
|
(struct sockaddr *)sin6,
|
|
|
|
addr_len);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2014-01-20 10:43:08 +08:00
|
|
|
|
2023-09-12 17:17:24 +08:00
|
|
|
if (udp_test_bit(GRO_ENABLED, sk))
|
2018-11-07 19:38:30 +08:00
|
|
|
udp_cmsg_recv(msg, sk, skb);
|
|
|
|
|
2014-01-20 10:43:08 +08:00
|
|
|
if (np->rxopt.all)
|
|
|
|
ip6_datagram_recv_common_ctl(sk, msg, skb);
|
|
|
|
|
2008-11-03 00:11:01 +08:00
|
|
|
if (is_udp4) {
|
2023-08-16 16:15:33 +08:00
|
|
|
if (inet_cmsg_flags(inet))
|
2016-11-04 18:28:58 +08:00
|
|
|
ip_cmsg_recv_offset(msg, sk, skb,
|
2016-10-24 09:03:06 +08:00
|
|
|
sizeof(struct udphdr), off);
|
2005-04-17 06:20:36 +08:00
|
|
|
} else {
|
|
|
|
if (np->rxopt.all)
|
2014-01-20 10:43:08 +08:00
|
|
|
ip6_datagram_recv_specific_ctl(sk, msg, skb);
|
2007-02-09 22:24:49 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-12-02 03:12:55 +08:00
|
|
|
err = copied;
|
2005-04-17 06:20:36 +08:00
|
|
|
if (flags & MSG_TRUNC)
|
2007-03-26 11:10:56 +08:00
|
|
|
err = ulen;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2016-10-21 19:55:47 +08:00
|
|
|
skb_consume_udp(sk, skb, peeking ? -err : err);
|
2005-04-17 06:20:36 +08:00
|
|
|
return err;
|
|
|
|
|
|
|
|
csum_copy_err:
|
2017-05-16 17:20:14 +08:00
|
|
|
if (!__sk_queue_drop_skb(sk, &udp_sk(sk)->reader_queue, skb, flags,
|
|
|
|
udp_skb_destructor)) {
|
2018-11-09 22:52:45 +08:00
|
|
|
SNMP_INC_STATS(mib, UDP_MIB_CSUMERRORS);
|
|
|
|
SNMP_INC_STATS(mib, UDP_MIB_INERRORS);
|
2008-11-03 00:14:27 +08:00
|
|
|
}
|
2016-10-21 19:55:47 +08:00
|
|
|
kfree_skb(skb);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2015-05-31 00:16:53 +08:00
|
|
|
/* starting over for a new packet, but check if we need to yield */
|
|
|
|
cond_resched();
|
2011-06-21 18:43:40 +08:00
|
|
|
msg->msg_flags &= ~MSG_TRUNC;
|
2005-04-17 06:20:36 +08:00
|
|
|
goto try_again;
|
|
|
|
}
|
|
|
|
|
2024-03-26 19:33:58 +08:00
|
|
|
DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
|
udp: Handle ICMP errors for tunnels with same destination port on both endpoints
For both IPv4 and IPv6, if we can't match errors to a socket, try
tunnels before ignoring them. Look up a socket with the original source
and destination ports as found in the UDP packet inside the ICMP payload,
this will work for tunnels that force the same destination port for both
endpoints, i.e. VXLAN and GENEVE.
Actually, lwtunnels could break this assumption if they are configured by
an external control plane to have different destination ports on the
endpoints: in this case, we won't be able to trace ICMP messages back to
them.
For IPv6 redirect messages, call ip6_redirect() directly with the output
interface argument set to the interface we received the packet from (as
it's the very interface we should build the exception on), otherwise the
new nexthop will be rejected. There's no such need for IPv4.
Tunnels can now export an encap_err_lookup() operation that indicates a
match. Pass the packet to the lookup function, and if the tunnel driver
reports a matching association, continue with regular ICMP error handling.
v2:
- Added newline between network and transport header sets in
__udp{4,6}_lib_err_encap() (David Miller)
- Removed redundant skb_reset_network_header(skb); in
__udp4_lib_err_encap()
- Removed redundant reassignment of iph in __udp4_lib_err_encap()
(Sabrina Dubroca)
- Edited comment to __udp{4,6}_lib_err_encap() to reflect the fact this
won't work with lwtunnels configured to use asymmetric ports. By the way,
it's VXLAN, not VxLAN (Jiri Benc)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:14 +08:00
|
|
|
void udpv6_encap_enable(void)
|
|
|
|
{
|
2018-11-15 09:34:50 +08:00
|
|
|
static_branch_inc(&udpv6_encap_needed_key);
|
udp: Handle ICMP errors for tunnels with same destination port on both endpoints
For both IPv4 and IPv6, if we can't match errors to a socket, try
tunnels before ignoring them. Look up a socket with the original source
and destination ports as found in the UDP packet inside the ICMP payload,
this will work for tunnels that force the same destination port for both
endpoints, i.e. VXLAN and GENEVE.
Actually, lwtunnels could break this assumption if they are configured by
an external control plane to have different destination ports on the
endpoints: in this case, we won't be able to trace ICMP messages back to
them.
For IPv6 redirect messages, call ip6_redirect() directly with the output
interface argument set to the interface we received the packet from (as
it's the very interface we should build the exception on), otherwise the
new nexthop will be rejected. There's no such need for IPv4.
Tunnels can now export an encap_err_lookup() operation that indicates a
match. Pass the packet to the lookup function, and if the tunnel driver
reports a matching association, continue with regular ICMP error handling.
v2:
- Added newline between network and transport header sets in
__udp{4,6}_lib_err_encap() (David Miller)
- Removed redundant skb_reset_network_header(skb); in
__udp4_lib_err_encap()
- Removed redundant reassignment of iph in __udp4_lib_err_encap()
(Sabrina Dubroca)
- Edited comment to __udp{4,6}_lib_err_encap() to reflect the fact this
won't work with lwtunnels configured to use asymmetric ports. By the way,
it's VXLAN, not VxLAN (Jiri Benc)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:14 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(udpv6_encap_enable);
|
|
|
|
|
udp: Support for error handlers of tunnels with arbitrary destination port
ICMP error handling is currently not possible for UDP tunnels not
employing a receiving socket with local destination port matching the
remote one, because we have no way to look them up.
Add an err_handler tunnel encapsulation operation that can be exported by
tunnels in order to pass the error to the protocol implementing the
encapsulation. We can't easily use a lookup function as we did for VXLAN
and GENEVE, as protocol error handlers, which would be in turn called by
implementations of this new operation, handle the errors themselves,
together with the tunnel lookup.
Without a socket, we can't be sure which encapsulation error handler is
the appropriate one: encapsulation handlers (the ones for FoU and GUE
introduced in the next patch, e.g.) will need to check the new error codes
returned by protocol handlers to figure out if errors match the given
encapsulation, and, in turn, report this error back, so that we can try
all of them in __udp{4,6}_lib_err_encap_no_sk() until we have a match.
v2:
- Name all arguments in err_handler prototypes (David Miller)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:22 +08:00
|
|
|
/* Handler for tunnels with arbitrary destination ports: no socket lookup, go
|
|
|
|
* through error handlers in encapsulations looking for a match.
|
|
|
|
*/
|
|
|
|
static int __udp6_lib_err_encap_no_sk(struct sk_buff *skb,
|
|
|
|
struct inet6_skb_parm *opt,
|
2019-02-22 00:43:59 +08:00
|
|
|
u8 type, u8 code, int offset, __be32 info)
|
udp: Support for error handlers of tunnels with arbitrary destination port
ICMP error handling is currently not possible for UDP tunnels not
employing a receiving socket with local destination port matching the
remote one, because we have no way to look them up.
Add an err_handler tunnel encapsulation operation that can be exported by
tunnels in order to pass the error to the protocol implementing the
encapsulation. We can't easily use a lookup function as we did for VXLAN
and GENEVE, as protocol error handlers, which would be in turn called by
implementations of this new operation, handle the errors themselves,
together with the tunnel lookup.
Without a socket, we can't be sure which encapsulation error handler is
the appropriate one: encapsulation handlers (the ones for FoU and GUE
introduced in the next patch, e.g.) will need to check the new error codes
returned by protocol handlers to figure out if errors match the given
encapsulation, and, in turn, report this error back, so that we can try
all of them in __udp{4,6}_lib_err_encap_no_sk() until we have a match.
v2:
- Name all arguments in err_handler prototypes (David Miller)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:22 +08:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) {
|
|
|
|
int (*handler)(struct sk_buff *skb, struct inet6_skb_parm *opt,
|
2019-02-22 00:43:59 +08:00
|
|
|
u8 type, u8 code, int offset, __be32 info);
|
|
|
|
const struct ip6_tnl_encap_ops *encap;
|
udp: Support for error handlers of tunnels with arbitrary destination port
ICMP error handling is currently not possible for UDP tunnels not
employing a receiving socket with local destination port matching the
remote one, because we have no way to look them up.
Add an err_handler tunnel encapsulation operation that can be exported by
tunnels in order to pass the error to the protocol implementing the
encapsulation. We can't easily use a lookup function as we did for VXLAN
and GENEVE, as protocol error handlers, which would be in turn called by
implementations of this new operation, handle the errors themselves,
together with the tunnel lookup.
Without a socket, we can't be sure which encapsulation error handler is
the appropriate one: encapsulation handlers (the ones for FoU and GUE
introduced in the next patch, e.g.) will need to check the new error codes
returned by protocol handlers to figure out if errors match the given
encapsulation, and, in turn, report this error back, so that we can try
all of them in __udp{4,6}_lib_err_encap_no_sk() until we have a match.
v2:
- Name all arguments in err_handler prototypes (David Miller)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:22 +08:00
|
|
|
|
2019-02-22 00:43:59 +08:00
|
|
|
encap = rcu_dereference(ip6tun_encaps[i]);
|
|
|
|
if (!encap)
|
udp: Support for error handlers of tunnels with arbitrary destination port
ICMP error handling is currently not possible for UDP tunnels not
employing a receiving socket with local destination port matching the
remote one, because we have no way to look them up.
Add an err_handler tunnel encapsulation operation that can be exported by
tunnels in order to pass the error to the protocol implementing the
encapsulation. We can't easily use a lookup function as we did for VXLAN
and GENEVE, as protocol error handlers, which would be in turn called by
implementations of this new operation, handle the errors themselves,
together with the tunnel lookup.
Without a socket, we can't be sure which encapsulation error handler is
the appropriate one: encapsulation handlers (the ones for FoU and GUE
introduced in the next patch, e.g.) will need to check the new error codes
returned by protocol handlers to figure out if errors match the given
encapsulation, and, in turn, report this error back, so that we can try
all of them in __udp{4,6}_lib_err_encap_no_sk() until we have a match.
v2:
- Name all arguments in err_handler prototypes (David Miller)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:22 +08:00
|
|
|
continue;
|
2019-02-22 00:43:59 +08:00
|
|
|
handler = encap->err_handler;
|
udp: Support for error handlers of tunnels with arbitrary destination port
ICMP error handling is currently not possible for UDP tunnels not
employing a receiving socket with local destination port matching the
remote one, because we have no way to look them up.
Add an err_handler tunnel encapsulation operation that can be exported by
tunnels in order to pass the error to the protocol implementing the
encapsulation. We can't easily use a lookup function as we did for VXLAN
and GENEVE, as protocol error handlers, which would be in turn called by
implementations of this new operation, handle the errors themselves,
together with the tunnel lookup.
Without a socket, we can't be sure which encapsulation error handler is
the appropriate one: encapsulation handlers (the ones for FoU and GUE
introduced in the next patch, e.g.) will need to check the new error codes
returned by protocol handlers to figure out if errors match the given
encapsulation, and, in turn, report this error back, so that we can try
all of them in __udp{4,6}_lib_err_encap_no_sk() until we have a match.
v2:
- Name all arguments in err_handler prototypes (David Miller)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:22 +08:00
|
|
|
if (handler && !handler(skb, opt, type, code, offset, info))
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return -ENOENT;
|
|
|
|
}
|
|
|
|
|
udp: Handle ICMP errors for tunnels with same destination port on both endpoints
For both IPv4 and IPv6, if we can't match errors to a socket, try
tunnels before ignoring them. Look up a socket with the original source
and destination ports as found in the UDP packet inside the ICMP payload,
this will work for tunnels that force the same destination port for both
endpoints, i.e. VXLAN and GENEVE.
Actually, lwtunnels could break this assumption if they are configured by
an external control plane to have different destination ports on the
endpoints: in this case, we won't be able to trace ICMP messages back to
them.
For IPv6 redirect messages, call ip6_redirect() directly with the output
interface argument set to the interface we received the packet from (as
it's the very interface we should build the exception on), otherwise the
new nexthop will be rejected. There's no such need for IPv4.
Tunnels can now export an encap_err_lookup() operation that indicates a
match. Pass the packet to the lookup function, and if the tunnel driver
reports a matching association, continue with regular ICMP error handling.
v2:
- Added newline between network and transport header sets in
__udp{4,6}_lib_err_encap() (David Miller)
- Removed redundant skb_reset_network_header(skb); in
__udp4_lib_err_encap()
- Removed redundant reassignment of iph in __udp4_lib_err_encap()
(Sabrina Dubroca)
- Edited comment to __udp{4,6}_lib_err_encap() to reflect the fact this
won't work with lwtunnels configured to use asymmetric ports. By the way,
it's VXLAN, not VxLAN (Jiri Benc)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:14 +08:00
|
|
|
/* Try to match ICMP errors to UDP tunnels by looking up a socket without
|
|
|
|
* reversing source and destination port: this will match tunnels that force the
|
|
|
|
* same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that
|
|
|
|
* lwtunnels might actually break this assumption by being configured with
|
|
|
|
* different destination ports on endpoints, in this case we won't be able to
|
|
|
|
* trace ICMP messages back to them.
|
|
|
|
*
|
udp: Support for error handlers of tunnels with arbitrary destination port
ICMP error handling is currently not possible for UDP tunnels not
employing a receiving socket with local destination port matching the
remote one, because we have no way to look them up.
Add an err_handler tunnel encapsulation operation that can be exported by
tunnels in order to pass the error to the protocol implementing the
encapsulation. We can't easily use a lookup function as we did for VXLAN
and GENEVE, as protocol error handlers, which would be in turn called by
implementations of this new operation, handle the errors themselves,
together with the tunnel lookup.
Without a socket, we can't be sure which encapsulation error handler is
the appropriate one: encapsulation handlers (the ones for FoU and GUE
introduced in the next patch, e.g.) will need to check the new error codes
returned by protocol handlers to figure out if errors match the given
encapsulation, and, in turn, report this error back, so that we can try
all of them in __udp{4,6}_lib_err_encap_no_sk() until we have a match.
v2:
- Name all arguments in err_handler prototypes (David Miller)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:22 +08:00
|
|
|
* If this doesn't match any socket, probe tunnels with arbitrary destination
|
|
|
|
* ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port
|
|
|
|
* we've sent packets to won't necessarily match the local destination port.
|
|
|
|
*
|
udp: Handle ICMP errors for tunnels with same destination port on both endpoints
For both IPv4 and IPv6, if we can't match errors to a socket, try
tunnels before ignoring them. Look up a socket with the original source
and destination ports as found in the UDP packet inside the ICMP payload,
this will work for tunnels that force the same destination port for both
endpoints, i.e. VXLAN and GENEVE.
Actually, lwtunnels could break this assumption if they are configured by
an external control plane to have different destination ports on the
endpoints: in this case, we won't be able to trace ICMP messages back to
them.
For IPv6 redirect messages, call ip6_redirect() directly with the output
interface argument set to the interface we received the packet from (as
it's the very interface we should build the exception on), otherwise the
new nexthop will be rejected. There's no such need for IPv4.
Tunnels can now export an encap_err_lookup() operation that indicates a
match. Pass the packet to the lookup function, and if the tunnel driver
reports a matching association, continue with regular ICMP error handling.
v2:
- Added newline between network and transport header sets in
__udp{4,6}_lib_err_encap() (David Miller)
- Removed redundant skb_reset_network_header(skb); in
__udp4_lib_err_encap()
- Removed redundant reassignment of iph in __udp4_lib_err_encap()
(Sabrina Dubroca)
- Edited comment to __udp{4,6}_lib_err_encap() to reflect the fact this
won't work with lwtunnels configured to use asymmetric ports. By the way,
it's VXLAN, not VxLAN (Jiri Benc)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:14 +08:00
|
|
|
* Then ask the tunnel implementation to match the error against a valid
|
|
|
|
* association.
|
|
|
|
*
|
udp: Support for error handlers of tunnels with arbitrary destination port
ICMP error handling is currently not possible for UDP tunnels not
employing a receiving socket with local destination port matching the
remote one, because we have no way to look them up.
Add an err_handler tunnel encapsulation operation that can be exported by
tunnels in order to pass the error to the protocol implementing the
encapsulation. We can't easily use a lookup function as we did for VXLAN
and GENEVE, as protocol error handlers, which would be in turn called by
implementations of this new operation, handle the errors themselves,
together with the tunnel lookup.
Without a socket, we can't be sure which encapsulation error handler is
the appropriate one: encapsulation handlers (the ones for FoU and GUE
introduced in the next patch, e.g.) will need to check the new error codes
returned by protocol handlers to figure out if errors match the given
encapsulation, and, in turn, report this error back, so that we can try
all of them in __udp{4,6}_lib_err_encap_no_sk() until we have a match.
v2:
- Name all arguments in err_handler prototypes (David Miller)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:22 +08:00
|
|
|
* Return an error if we can't find a match, the socket if we need further
|
|
|
|
* processing, zero otherwise.
|
udp: Handle ICMP errors for tunnels with same destination port on both endpoints
For both IPv4 and IPv6, if we can't match errors to a socket, try
tunnels before ignoring them. Look up a socket with the original source
and destination ports as found in the UDP packet inside the ICMP payload,
this will work for tunnels that force the same destination port for both
endpoints, i.e. VXLAN and GENEVE.
Actually, lwtunnels could break this assumption if they are configured by
an external control plane to have different destination ports on the
endpoints: in this case, we won't be able to trace ICMP messages back to
them.
For IPv6 redirect messages, call ip6_redirect() directly with the output
interface argument set to the interface we received the packet from (as
it's the very interface we should build the exception on), otherwise the
new nexthop will be rejected. There's no such need for IPv4.
Tunnels can now export an encap_err_lookup() operation that indicates a
match. Pass the packet to the lookup function, and if the tunnel driver
reports a matching association, continue with regular ICMP error handling.
v2:
- Added newline between network and transport header sets in
__udp{4,6}_lib_err_encap() (David Miller)
- Removed redundant skb_reset_network_header(skb); in
__udp4_lib_err_encap()
- Removed redundant reassignment of iph in __udp4_lib_err_encap()
(Sabrina Dubroca)
- Edited comment to __udp{4,6}_lib_err_encap() to reflect the fact this
won't work with lwtunnels configured to use asymmetric ports. By the way,
it's VXLAN, not VxLAN (Jiri Benc)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:14 +08:00
|
|
|
*/
|
|
|
|
static struct sock *__udp6_lib_err_encap(struct net *net,
|
|
|
|
const struct ipv6hdr *hdr, int offset,
|
|
|
|
struct udphdr *uh,
|
|
|
|
struct udp_table *udptable,
|
2021-07-21 04:35:28 +08:00
|
|
|
struct sock *sk,
|
udp: Support for error handlers of tunnels with arbitrary destination port
ICMP error handling is currently not possible for UDP tunnels not
employing a receiving socket with local destination port matching the
remote one, because we have no way to look them up.
Add an err_handler tunnel encapsulation operation that can be exported by
tunnels in order to pass the error to the protocol implementing the
encapsulation. We can't easily use a lookup function as we did for VXLAN
and GENEVE, as protocol error handlers, which would be in turn called by
implementations of this new operation, handle the errors themselves,
together with the tunnel lookup.
Without a socket, we can't be sure which encapsulation error handler is
the appropriate one: encapsulation handlers (the ones for FoU and GUE
introduced in the next patch, e.g.) will need to check the new error codes
returned by protocol handlers to figure out if errors match the given
encapsulation, and, in turn, report this error back, so that we can try
all of them in __udp{4,6}_lib_err_encap_no_sk() until we have a match.
v2:
- Name all arguments in err_handler prototypes (David Miller)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:22 +08:00
|
|
|
struct sk_buff *skb,
|
|
|
|
struct inet6_skb_parm *opt,
|
|
|
|
u8 type, u8 code, __be32 info)
|
udp: Handle ICMP errors for tunnels with same destination port on both endpoints
For both IPv4 and IPv6, if we can't match errors to a socket, try
tunnels before ignoring them. Look up a socket with the original source
and destination ports as found in the UDP packet inside the ICMP payload,
this will work for tunnels that force the same destination port for both
endpoints, i.e. VXLAN and GENEVE.
Actually, lwtunnels could break this assumption if they are configured by
an external control plane to have different destination ports on the
endpoints: in this case, we won't be able to trace ICMP messages back to
them.
For IPv6 redirect messages, call ip6_redirect() directly with the output
interface argument set to the interface we received the packet from (as
it's the very interface we should build the exception on), otherwise the
new nexthop will be rejected. There's no such need for IPv4.
Tunnels can now export an encap_err_lookup() operation that indicates a
match. Pass the packet to the lookup function, and if the tunnel driver
reports a matching association, continue with regular ICMP error handling.
v2:
- Added newline between network and transport header sets in
__udp{4,6}_lib_err_encap() (David Miller)
- Removed redundant skb_reset_network_header(skb); in
__udp4_lib_err_encap()
- Removed redundant reassignment of iph in __udp4_lib_err_encap()
(Sabrina Dubroca)
- Edited comment to __udp{4,6}_lib_err_encap() to reflect the fact this
won't work with lwtunnels configured to use asymmetric ports. By the way,
it's VXLAN, not VxLAN (Jiri Benc)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:14 +08:00
|
|
|
{
|
2021-07-21 04:35:28 +08:00
|
|
|
int (*lookup)(struct sock *sk, struct sk_buff *skb);
|
udp: Handle ICMP errors for tunnels with same destination port on both endpoints
For both IPv4 and IPv6, if we can't match errors to a socket, try
tunnels before ignoring them. Look up a socket with the original source
and destination ports as found in the UDP packet inside the ICMP payload,
this will work for tunnels that force the same destination port for both
endpoints, i.e. VXLAN and GENEVE.
Actually, lwtunnels could break this assumption if they are configured by
an external control plane to have different destination ports on the
endpoints: in this case, we won't be able to trace ICMP messages back to
them.
For IPv6 redirect messages, call ip6_redirect() directly with the output
interface argument set to the interface we received the packet from (as
it's the very interface we should build the exception on), otherwise the
new nexthop will be rejected. There's no such need for IPv4.
Tunnels can now export an encap_err_lookup() operation that indicates a
match. Pass the packet to the lookup function, and if the tunnel driver
reports a matching association, continue with regular ICMP error handling.
v2:
- Added newline between network and transport header sets in
__udp{4,6}_lib_err_encap() (David Miller)
- Removed redundant skb_reset_network_header(skb); in
__udp4_lib_err_encap()
- Removed redundant reassignment of iph in __udp4_lib_err_encap()
(Sabrina Dubroca)
- Edited comment to __udp{4,6}_lib_err_encap() to reflect the fact this
won't work with lwtunnels configured to use asymmetric ports. By the way,
it's VXLAN, not VxLAN (Jiri Benc)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:14 +08:00
|
|
|
int network_offset, transport_offset;
|
2021-07-21 04:35:28 +08:00
|
|
|
struct udp_sock *up;
|
udp: Handle ICMP errors for tunnels with same destination port on both endpoints
For both IPv4 and IPv6, if we can't match errors to a socket, try
tunnels before ignoring them. Look up a socket with the original source
and destination ports as found in the UDP packet inside the ICMP payload,
this will work for tunnels that force the same destination port for both
endpoints, i.e. VXLAN and GENEVE.
Actually, lwtunnels could break this assumption if they are configured by
an external control plane to have different destination ports on the
endpoints: in this case, we won't be able to trace ICMP messages back to
them.
For IPv6 redirect messages, call ip6_redirect() directly with the output
interface argument set to the interface we received the packet from (as
it's the very interface we should build the exception on), otherwise the
new nexthop will be rejected. There's no such need for IPv4.
Tunnels can now export an encap_err_lookup() operation that indicates a
match. Pass the packet to the lookup function, and if the tunnel driver
reports a matching association, continue with regular ICMP error handling.
v2:
- Added newline between network and transport header sets in
__udp{4,6}_lib_err_encap() (David Miller)
- Removed redundant skb_reset_network_header(skb); in
__udp4_lib_err_encap()
- Removed redundant reassignment of iph in __udp4_lib_err_encap()
(Sabrina Dubroca)
- Edited comment to __udp{4,6}_lib_err_encap() to reflect the fact this
won't work with lwtunnels configured to use asymmetric ports. By the way,
it's VXLAN, not VxLAN (Jiri Benc)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:14 +08:00
|
|
|
|
|
|
|
network_offset = skb_network_offset(skb);
|
|
|
|
transport_offset = skb_transport_offset(skb);
|
|
|
|
|
|
|
|
/* Network header needs to point to the outer IPv6 header inside ICMP */
|
|
|
|
skb_reset_network_header(skb);
|
|
|
|
|
|
|
|
/* Transport header needs to point to the UDP header */
|
|
|
|
skb_set_transport_header(skb, offset);
|
|
|
|
|
2021-07-21 04:35:28 +08:00
|
|
|
if (sk) {
|
|
|
|
up = udp_sk(sk);
|
|
|
|
|
|
|
|
lookup = READ_ONCE(up->encap_err_lookup);
|
|
|
|
if (lookup && lookup(sk, skb))
|
|
|
|
sk = NULL;
|
|
|
|
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
udp: Support for error handlers of tunnels with arbitrary destination port
ICMP error handling is currently not possible for UDP tunnels not
employing a receiving socket with local destination port matching the
remote one, because we have no way to look them up.
Add an err_handler tunnel encapsulation operation that can be exported by
tunnels in order to pass the error to the protocol implementing the
encapsulation. We can't easily use a lookup function as we did for VXLAN
and GENEVE, as protocol error handlers, which would be in turn called by
implementations of this new operation, handle the errors themselves,
together with the tunnel lookup.
Without a socket, we can't be sure which encapsulation error handler is
the appropriate one: encapsulation handlers (the ones for FoU and GUE
introduced in the next patch, e.g.) will need to check the new error codes
returned by protocol handlers to figure out if errors match the given
encapsulation, and, in turn, report this error back, so that we can try
all of them in __udp{4,6}_lib_err_encap_no_sk() until we have a match.
v2:
- Name all arguments in err_handler prototypes (David Miller)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:22 +08:00
|
|
|
sk = __udp6_lib_lookup(net, &hdr->daddr, uh->source,
|
|
|
|
&hdr->saddr, uh->dest,
|
|
|
|
inet6_iif(skb), 0, udptable, skb);
|
|
|
|
if (sk) {
|
2021-07-21 04:35:28 +08:00
|
|
|
up = udp_sk(sk);
|
udp: Support for error handlers of tunnels with arbitrary destination port
ICMP error handling is currently not possible for UDP tunnels not
employing a receiving socket with local destination port matching the
remote one, because we have no way to look them up.
Add an err_handler tunnel encapsulation operation that can be exported by
tunnels in order to pass the error to the protocol implementing the
encapsulation. We can't easily use a lookup function as we did for VXLAN
and GENEVE, as protocol error handlers, which would be in turn called by
implementations of this new operation, handle the errors themselves,
together with the tunnel lookup.
Without a socket, we can't be sure which encapsulation error handler is
the appropriate one: encapsulation handlers (the ones for FoU and GUE
introduced in the next patch, e.g.) will need to check the new error codes
returned by protocol handlers to figure out if errors match the given
encapsulation, and, in turn, report this error back, so that we can try
all of them in __udp{4,6}_lib_err_encap_no_sk() until we have a match.
v2:
- Name all arguments in err_handler prototypes (David Miller)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:22 +08:00
|
|
|
|
|
|
|
lookup = READ_ONCE(up->encap_err_lookup);
|
|
|
|
if (!lookup || lookup(sk, skb))
|
|
|
|
sk = NULL;
|
|
|
|
}
|
|
|
|
|
2021-07-21 04:35:28 +08:00
|
|
|
out:
|
udp: Support for error handlers of tunnels with arbitrary destination port
ICMP error handling is currently not possible for UDP tunnels not
employing a receiving socket with local destination port matching the
remote one, because we have no way to look them up.
Add an err_handler tunnel encapsulation operation that can be exported by
tunnels in order to pass the error to the protocol implementing the
encapsulation. We can't easily use a lookup function as we did for VXLAN
and GENEVE, as protocol error handlers, which would be in turn called by
implementations of this new operation, handle the errors themselves,
together with the tunnel lookup.
Without a socket, we can't be sure which encapsulation error handler is
the appropriate one: encapsulation handlers (the ones for FoU and GUE
introduced in the next patch, e.g.) will need to check the new error codes
returned by protocol handlers to figure out if errors match the given
encapsulation, and, in turn, report this error back, so that we can try
all of them in __udp{4,6}_lib_err_encap_no_sk() until we have a match.
v2:
- Name all arguments in err_handler prototypes (David Miller)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:22 +08:00
|
|
|
if (!sk) {
|
|
|
|
sk = ERR_PTR(__udp6_lib_err_encap_no_sk(skb, opt, type, code,
|
|
|
|
offset, info));
|
|
|
|
}
|
udp: Handle ICMP errors for tunnels with same destination port on both endpoints
For both IPv4 and IPv6, if we can't match errors to a socket, try
tunnels before ignoring them. Look up a socket with the original source
and destination ports as found in the UDP packet inside the ICMP payload,
this will work for tunnels that force the same destination port for both
endpoints, i.e. VXLAN and GENEVE.
Actually, lwtunnels could break this assumption if they are configured by
an external control plane to have different destination ports on the
endpoints: in this case, we won't be able to trace ICMP messages back to
them.
For IPv6 redirect messages, call ip6_redirect() directly with the output
interface argument set to the interface we received the packet from (as
it's the very interface we should build the exception on), otherwise the
new nexthop will be rejected. There's no such need for IPv4.
Tunnels can now export an encap_err_lookup() operation that indicates a
match. Pass the packet to the lookup function, and if the tunnel driver
reports a matching association, continue with regular ICMP error handling.
v2:
- Added newline between network and transport header sets in
__udp{4,6}_lib_err_encap() (David Miller)
- Removed redundant skb_reset_network_header(skb); in
__udp4_lib_err_encap()
- Removed redundant reassignment of iph in __udp4_lib_err_encap()
(Sabrina Dubroca)
- Edited comment to __udp{4,6}_lib_err_encap() to reflect the fact this
won't work with lwtunnels configured to use asymmetric ports. By the way,
it's VXLAN, not VxLAN (Jiri Benc)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:14 +08:00
|
|
|
|
|
|
|
skb_set_transport_header(skb, transport_offset);
|
|
|
|
skb_set_network_header(skb, network_offset);
|
udp: Support for error handlers of tunnels with arbitrary destination port
ICMP error handling is currently not possible for UDP tunnels not
employing a receiving socket with local destination port matching the
remote one, because we have no way to look them up.
Add an err_handler tunnel encapsulation operation that can be exported by
tunnels in order to pass the error to the protocol implementing the
encapsulation. We can't easily use a lookup function as we did for VXLAN
and GENEVE, as protocol error handlers, which would be in turn called by
implementations of this new operation, handle the errors themselves,
together with the tunnel lookup.
Without a socket, we can't be sure which encapsulation error handler is
the appropriate one: encapsulation handlers (the ones for FoU and GUE
introduced in the next patch, e.g.) will need to check the new error codes
returned by protocol handlers to figure out if errors match the given
encapsulation, and, in turn, report this error back, so that we can try
all of them in __udp{4,6}_lib_err_encap_no_sk() until we have a match.
v2:
- Name all arguments in err_handler prototypes (David Miller)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:22 +08:00
|
|
|
|
udp: Handle ICMP errors for tunnels with same destination port on both endpoints
For both IPv4 and IPv6, if we can't match errors to a socket, try
tunnels before ignoring them. Look up a socket with the original source
and destination ports as found in the UDP packet inside the ICMP payload,
this will work for tunnels that force the same destination port for both
endpoints, i.e. VXLAN and GENEVE.
Actually, lwtunnels could break this assumption if they are configured by
an external control plane to have different destination ports on the
endpoints: in this case, we won't be able to trace ICMP messages back to
them.
For IPv6 redirect messages, call ip6_redirect() directly with the output
interface argument set to the interface we received the packet from (as
it's the very interface we should build the exception on), otherwise the
new nexthop will be rejected. There's no such need for IPv4.
Tunnels can now export an encap_err_lookup() operation that indicates a
match. Pass the packet to the lookup function, and if the tunnel driver
reports a matching association, continue with regular ICMP error handling.
v2:
- Added newline between network and transport header sets in
__udp{4,6}_lib_err_encap() (David Miller)
- Removed redundant skb_reset_network_header(skb); in
__udp4_lib_err_encap()
- Removed redundant reassignment of iph in __udp4_lib_err_encap()
(Sabrina Dubroca)
- Edited comment to __udp{4,6}_lib_err_encap() to reflect the fact this
won't work with lwtunnels configured to use asymmetric ports. By the way,
it's VXLAN, not VxLAN (Jiri Benc)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:14 +08:00
|
|
|
return sk;
|
|
|
|
}
|
|
|
|
|
2018-11-08 19:19:21 +08:00
|
|
|
int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
|
|
|
|
u8 type, u8 code, int offset, __be32 info,
|
|
|
|
struct udp_table *udptable)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
struct ipv6_pinfo *np;
|
2011-04-22 12:53:02 +08:00
|
|
|
const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
|
|
|
|
const struct in6_addr *saddr = &hdr->saddr;
|
2022-01-04 01:11:32 +08:00
|
|
|
const struct in6_addr *daddr = seg6_get_daddr(skb, opt) ? : &hdr->daddr;
|
2014-08-25 04:53:10 +08:00
|
|
|
struct udphdr *uh = (struct udphdr *)(skb->data+offset);
|
udp: Handle ICMP errors for tunnels with same destination port on both endpoints
For both IPv4 and IPv6, if we can't match errors to a socket, try
tunnels before ignoring them. Look up a socket with the original source
and destination ports as found in the UDP packet inside the ICMP payload,
this will work for tunnels that force the same destination port for both
endpoints, i.e. VXLAN and GENEVE.
Actually, lwtunnels could break this assumption if they are configured by
an external control plane to have different destination ports on the
endpoints: in this case, we won't be able to trace ICMP messages back to
them.
For IPv6 redirect messages, call ip6_redirect() directly with the output
interface argument set to the interface we received the packet from (as
it's the very interface we should build the exception on), otherwise the
new nexthop will be rejected. There's no such need for IPv4.
Tunnels can now export an encap_err_lookup() operation that indicates a
match. Pass the packet to the lookup function, and if the tunnel driver
reports a matching association, continue with regular ICMP error handling.
v2:
- Added newline between network and transport header sets in
__udp{4,6}_lib_err_encap() (David Miller)
- Removed redundant skb_reset_network_header(skb); in
__udp4_lib_err_encap()
- Removed redundant reassignment of iph in __udp4_lib_err_encap()
(Sabrina Dubroca)
- Edited comment to __udp{4,6}_lib_err_encap() to reflect the fact this
won't work with lwtunnels configured to use asymmetric ports. By the way,
it's VXLAN, not VxLAN (Jiri Benc)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:14 +08:00
|
|
|
bool tunnel = false;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct sock *sk;
|
2016-02-18 05:58:22 +08:00
|
|
|
int harderr;
|
2005-04-17 06:20:36 +08:00
|
|
|
int err;
|
2014-07-31 17:54:32 +08:00
|
|
|
struct net *net = dev_net(skb->dev);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2016-01-05 06:41:46 +08:00
|
|
|
sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source,
|
2019-06-01 06:29:11 +08:00
|
|
|
inet6_iif(skb), inet6_sdif(skb), udptable, NULL);
|
2021-07-21 04:35:28 +08:00
|
|
|
|
2023-09-12 17:17:28 +08:00
|
|
|
if (!sk || READ_ONCE(udp_sk(sk)->encap_type)) {
|
udp: Handle ICMP errors for tunnels with same destination port on both endpoints
For both IPv4 and IPv6, if we can't match errors to a socket, try
tunnels before ignoring them. Look up a socket with the original source
and destination ports as found in the UDP packet inside the ICMP payload,
this will work for tunnels that force the same destination port for both
endpoints, i.e. VXLAN and GENEVE.
Actually, lwtunnels could break this assumption if they are configured by
an external control plane to have different destination ports on the
endpoints: in this case, we won't be able to trace ICMP messages back to
them.
For IPv6 redirect messages, call ip6_redirect() directly with the output
interface argument set to the interface we received the packet from (as
it's the very interface we should build the exception on), otherwise the
new nexthop will be rejected. There's no such need for IPv4.
Tunnels can now export an encap_err_lookup() operation that indicates a
match. Pass the packet to the lookup function, and if the tunnel driver
reports a matching association, continue with regular ICMP error handling.
v2:
- Added newline between network and transport header sets in
__udp{4,6}_lib_err_encap() (David Miller)
- Removed redundant skb_reset_network_header(skb); in
__udp4_lib_err_encap()
- Removed redundant reassignment of iph in __udp4_lib_err_encap()
(Sabrina Dubroca)
- Edited comment to __udp{4,6}_lib_err_encap() to reflect the fact this
won't work with lwtunnels configured to use asymmetric ports. By the way,
it's VXLAN, not VxLAN (Jiri Benc)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:14 +08:00
|
|
|
/* No socket for error: try tunnels before discarding */
|
|
|
|
if (static_branch_unlikely(&udpv6_encap_needed_key)) {
|
|
|
|
sk = __udp6_lib_err_encap(net, hdr, offset, uh,
|
2021-07-21 04:35:28 +08:00
|
|
|
udptable, sk, skb,
|
udp: Support for error handlers of tunnels with arbitrary destination port
ICMP error handling is currently not possible for UDP tunnels not
employing a receiving socket with local destination port matching the
remote one, because we have no way to look them up.
Add an err_handler tunnel encapsulation operation that can be exported by
tunnels in order to pass the error to the protocol implementing the
encapsulation. We can't easily use a lookup function as we did for VXLAN
and GENEVE, as protocol error handlers, which would be in turn called by
implementations of this new operation, handle the errors themselves,
together with the tunnel lookup.
Without a socket, we can't be sure which encapsulation error handler is
the appropriate one: encapsulation handlers (the ones for FoU and GUE
introduced in the next patch, e.g.) will need to check the new error codes
returned by protocol handlers to figure out if errors match the given
encapsulation, and, in turn, report this error back, so that we can try
all of them in __udp{4,6}_lib_err_encap_no_sk() until we have a match.
v2:
- Name all arguments in err_handler prototypes (David Miller)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:22 +08:00
|
|
|
opt, type, code, info);
|
|
|
|
if (!sk)
|
|
|
|
return 0;
|
2021-07-21 04:35:28 +08:00
|
|
|
} else
|
|
|
|
sk = ERR_PTR(-ENOENT);
|
udp: Handle ICMP errors for tunnels with same destination port on both endpoints
For both IPv4 and IPv6, if we can't match errors to a socket, try
tunnels before ignoring them. Look up a socket with the original source
and destination ports as found in the UDP packet inside the ICMP payload,
this will work for tunnels that force the same destination port for both
endpoints, i.e. VXLAN and GENEVE.
Actually, lwtunnels could break this assumption if they are configured by
an external control plane to have different destination ports on the
endpoints: in this case, we won't be able to trace ICMP messages back to
them.
For IPv6 redirect messages, call ip6_redirect() directly with the output
interface argument set to the interface we received the packet from (as
it's the very interface we should build the exception on), otherwise the
new nexthop will be rejected. There's no such need for IPv4.
Tunnels can now export an encap_err_lookup() operation that indicates a
match. Pass the packet to the lookup function, and if the tunnel driver
reports a matching association, continue with regular ICMP error handling.
v2:
- Added newline between network and transport header sets in
__udp{4,6}_lib_err_encap() (David Miller)
- Removed redundant skb_reset_network_header(skb); in
__udp4_lib_err_encap()
- Removed redundant reassignment of iph in __udp4_lib_err_encap()
(Sabrina Dubroca)
- Edited comment to __udp{4,6}_lib_err_encap() to reflect the fact this
won't work with lwtunnels configured to use asymmetric ports. By the way,
it's VXLAN, not VxLAN (Jiri Benc)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:14 +08:00
|
|
|
|
udp: Support for error handlers of tunnels with arbitrary destination port
ICMP error handling is currently not possible for UDP tunnels not
employing a receiving socket with local destination port matching the
remote one, because we have no way to look them up.
Add an err_handler tunnel encapsulation operation that can be exported by
tunnels in order to pass the error to the protocol implementing the
encapsulation. We can't easily use a lookup function as we did for VXLAN
and GENEVE, as protocol error handlers, which would be in turn called by
implementations of this new operation, handle the errors themselves,
together with the tunnel lookup.
Without a socket, we can't be sure which encapsulation error handler is
the appropriate one: encapsulation handlers (the ones for FoU and GUE
introduced in the next patch, e.g.) will need to check the new error codes
returned by protocol handlers to figure out if errors match the given
encapsulation, and, in turn, report this error back, so that we can try
all of them in __udp{4,6}_lib_err_encap_no_sk() until we have a match.
v2:
- Name all arguments in err_handler prototypes (David Miller)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:22 +08:00
|
|
|
if (IS_ERR(sk)) {
|
udp: Handle ICMP errors for tunnels with same destination port on both endpoints
For both IPv4 and IPv6, if we can't match errors to a socket, try
tunnels before ignoring them. Look up a socket with the original source
and destination ports as found in the UDP packet inside the ICMP payload,
this will work for tunnels that force the same destination port for both
endpoints, i.e. VXLAN and GENEVE.
Actually, lwtunnels could break this assumption if they are configured by
an external control plane to have different destination ports on the
endpoints: in this case, we won't be able to trace ICMP messages back to
them.
For IPv6 redirect messages, call ip6_redirect() directly with the output
interface argument set to the interface we received the packet from (as
it's the very interface we should build the exception on), otherwise the
new nexthop will be rejected. There's no such need for IPv4.
Tunnels can now export an encap_err_lookup() operation that indicates a
match. Pass the packet to the lookup function, and if the tunnel driver
reports a matching association, continue with regular ICMP error handling.
v2:
- Added newline between network and transport header sets in
__udp{4,6}_lib_err_encap() (David Miller)
- Removed redundant skb_reset_network_header(skb); in
__udp4_lib_err_encap()
- Removed redundant reassignment of iph in __udp4_lib_err_encap()
(Sabrina Dubroca)
- Edited comment to __udp{4,6}_lib_err_encap() to reflect the fact this
won't work with lwtunnels configured to use asymmetric ports. By the way,
it's VXLAN, not VxLAN (Jiri Benc)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:14 +08:00
|
|
|
__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
|
|
|
|
ICMP6_MIB_INERRORS);
|
udp: Support for error handlers of tunnels with arbitrary destination port
ICMP error handling is currently not possible for UDP tunnels not
employing a receiving socket with local destination port matching the
remote one, because we have no way to look them up.
Add an err_handler tunnel encapsulation operation that can be exported by
tunnels in order to pass the error to the protocol implementing the
encapsulation. We can't easily use a lookup function as we did for VXLAN
and GENEVE, as protocol error handlers, which would be in turn called by
implementations of this new operation, handle the errors themselves,
together with the tunnel lookup.
Without a socket, we can't be sure which encapsulation error handler is
the appropriate one: encapsulation handlers (the ones for FoU and GUE
introduced in the next patch, e.g.) will need to check the new error codes
returned by protocol handlers to figure out if errors match the given
encapsulation, and, in turn, report this error back, so that we can try
all of them in __udp{4,6}_lib_err_encap_no_sk() until we have a match.
v2:
- Name all arguments in err_handler prototypes (David Miller)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:22 +08:00
|
|
|
return PTR_ERR(sk);
|
udp: Handle ICMP errors for tunnels with same destination port on both endpoints
For both IPv4 and IPv6, if we can't match errors to a socket, try
tunnels before ignoring them. Look up a socket with the original source
and destination ports as found in the UDP packet inside the ICMP payload,
this will work for tunnels that force the same destination port for both
endpoints, i.e. VXLAN and GENEVE.
Actually, lwtunnels could break this assumption if they are configured by
an external control plane to have different destination ports on the
endpoints: in this case, we won't be able to trace ICMP messages back to
them.
For IPv6 redirect messages, call ip6_redirect() directly with the output
interface argument set to the interface we received the packet from (as
it's the very interface we should build the exception on), otherwise the
new nexthop will be rejected. There's no such need for IPv4.
Tunnels can now export an encap_err_lookup() operation that indicates a
match. Pass the packet to the lookup function, and if the tunnel driver
reports a matching association, continue with regular ICMP error handling.
v2:
- Added newline between network and transport header sets in
__udp{4,6}_lib_err_encap() (David Miller)
- Removed redundant skb_reset_network_header(skb); in
__udp4_lib_err_encap()
- Removed redundant reassignment of iph in __udp4_lib_err_encap()
(Sabrina Dubroca)
- Edited comment to __udp{4,6}_lib_err_encap() to reflect the fact this
won't work with lwtunnels configured to use asymmetric ports. By the way,
it's VXLAN, not VxLAN (Jiri Benc)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:14 +08:00
|
|
|
}
|
udp: Support for error handlers of tunnels with arbitrary destination port
ICMP error handling is currently not possible for UDP tunnels not
employing a receiving socket with local destination port matching the
remote one, because we have no way to look them up.
Add an err_handler tunnel encapsulation operation that can be exported by
tunnels in order to pass the error to the protocol implementing the
encapsulation. We can't easily use a lookup function as we did for VXLAN
and GENEVE, as protocol error handlers, which would be in turn called by
implementations of this new operation, handle the errors themselves,
together with the tunnel lookup.
Without a socket, we can't be sure which encapsulation error handler is
the appropriate one: encapsulation handlers (the ones for FoU and GUE
introduced in the next patch, e.g.) will need to check the new error codes
returned by protocol handlers to figure out if errors match the given
encapsulation, and, in turn, report this error back, so that we can try
all of them in __udp{4,6}_lib_err_encap_no_sk() until we have a match.
v2:
- Name all arguments in err_handler prototypes (David Miller)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:22 +08:00
|
|
|
|
udp: Handle ICMP errors for tunnels with same destination port on both endpoints
For both IPv4 and IPv6, if we can't match errors to a socket, try
tunnels before ignoring them. Look up a socket with the original source
and destination ports as found in the UDP packet inside the ICMP payload,
this will work for tunnels that force the same destination port for both
endpoints, i.e. VXLAN and GENEVE.
Actually, lwtunnels could break this assumption if they are configured by
an external control plane to have different destination ports on the
endpoints: in this case, we won't be able to trace ICMP messages back to
them.
For IPv6 redirect messages, call ip6_redirect() directly with the output
interface argument set to the interface we received the packet from (as
it's the very interface we should build the exception on), otherwise the
new nexthop will be rejected. There's no such need for IPv4.
Tunnels can now export an encap_err_lookup() operation that indicates a
match. Pass the packet to the lookup function, and if the tunnel driver
reports a matching association, continue with regular ICMP error handling.
v2:
- Added newline between network and transport header sets in
__udp{4,6}_lib_err_encap() (David Miller)
- Removed redundant skb_reset_network_header(skb); in
__udp4_lib_err_encap()
- Removed redundant reassignment of iph in __udp4_lib_err_encap()
(Sabrina Dubroca)
- Edited comment to __udp{4,6}_lib_err_encap() to reflect the fact this
won't work with lwtunnels configured to use asymmetric ports. By the way,
it's VXLAN, not VxLAN (Jiri Benc)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:14 +08:00
|
|
|
tunnel = true;
|
2014-07-31 17:54:32 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2016-02-18 05:58:22 +08:00
|
|
|
harderr = icmpv6_err_convert(type, code, &err);
|
|
|
|
np = inet6_sk(sk);
|
|
|
|
|
2013-12-15 10:41:14 +08:00
|
|
|
if (type == ICMPV6_PKT_TOOBIG) {
|
|
|
|
if (!ip6_sk_accept_pmtu(sk))
|
|
|
|
goto out;
|
2012-06-16 05:54:11 +08:00
|
|
|
ip6_sk_update_pmtu(skb, sk, info);
|
2023-09-13 00:02:11 +08:00
|
|
|
if (READ_ONCE(np->pmtudisc) != IPV6_PMTUDISC_DONT)
|
2016-02-18 05:58:22 +08:00
|
|
|
harderr = 1;
|
2013-12-15 10:41:14 +08:00
|
|
|
}
|
2013-09-20 18:20:28 +08:00
|
|
|
if (type == NDISC_REDIRECT) {
|
udp: Handle ICMP errors for tunnels with same destination port on both endpoints
For both IPv4 and IPv6, if we can't match errors to a socket, try
tunnels before ignoring them. Look up a socket with the original source
and destination ports as found in the UDP packet inside the ICMP payload,
this will work for tunnels that force the same destination port for both
endpoints, i.e. VXLAN and GENEVE.
Actually, lwtunnels could break this assumption if they are configured by
an external control plane to have different destination ports on the
endpoints: in this case, we won't be able to trace ICMP messages back to
them.
For IPv6 redirect messages, call ip6_redirect() directly with the output
interface argument set to the interface we received the packet from (as
it's the very interface we should build the exception on), otherwise the
new nexthop will be rejected. There's no such need for IPv4.
Tunnels can now export an encap_err_lookup() operation that indicates a
match. Pass the packet to the lookup function, and if the tunnel driver
reports a matching association, continue with regular ICMP error handling.
v2:
- Added newline between network and transport header sets in
__udp{4,6}_lib_err_encap() (David Miller)
- Removed redundant skb_reset_network_header(skb); in
__udp4_lib_err_encap()
- Removed redundant reassignment of iph in __udp4_lib_err_encap()
(Sabrina Dubroca)
- Edited comment to __udp{4,6}_lib_err_encap() to reflect the fact this
won't work with lwtunnels configured to use asymmetric ports. By the way,
it's VXLAN, not VxLAN (Jiri Benc)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:14 +08:00
|
|
|
if (tunnel) {
|
|
|
|
ip6_redirect(skb, sock_net(sk), inet6_iif(skb),
|
2023-07-28 23:03:15 +08:00
|
|
|
READ_ONCE(sk->sk_mark), sk->sk_uid);
|
udp: Handle ICMP errors for tunnels with same destination port on both endpoints
For both IPv4 and IPv6, if we can't match errors to a socket, try
tunnels before ignoring them. Look up a socket with the original source
and destination ports as found in the UDP packet inside the ICMP payload,
this will work for tunnels that force the same destination port for both
endpoints, i.e. VXLAN and GENEVE.
Actually, lwtunnels could break this assumption if they are configured by
an external control plane to have different destination ports on the
endpoints: in this case, we won't be able to trace ICMP messages back to
them.
For IPv6 redirect messages, call ip6_redirect() directly with the output
interface argument set to the interface we received the packet from (as
it's the very interface we should build the exception on), otherwise the
new nexthop will be rejected. There's no such need for IPv4.
Tunnels can now export an encap_err_lookup() operation that indicates a
match. Pass the packet to the lookup function, and if the tunnel driver
reports a matching association, continue with regular ICMP error handling.
v2:
- Added newline between network and transport header sets in
__udp{4,6}_lib_err_encap() (David Miller)
- Removed redundant skb_reset_network_header(skb); in
__udp4_lib_err_encap()
- Removed redundant reassignment of iph in __udp4_lib_err_encap()
(Sabrina Dubroca)
- Edited comment to __udp{4,6}_lib_err_encap() to reflect the fact this
won't work with lwtunnels configured to use asymmetric ports. By the way,
it's VXLAN, not VxLAN (Jiri Benc)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:14 +08:00
|
|
|
} else {
|
|
|
|
ip6_sk_redirect(skb, sk);
|
|
|
|
}
|
2013-09-20 18:20:28 +08:00
|
|
|
goto out;
|
|
|
|
}
|
2012-06-16 05:54:11 +08:00
|
|
|
|
udp: Handle ICMP errors for tunnels with same destination port on both endpoints
For both IPv4 and IPv6, if we can't match errors to a socket, try
tunnels before ignoring them. Look up a socket with the original source
and destination ports as found in the UDP packet inside the ICMP payload,
this will work for tunnels that force the same destination port for both
endpoints, i.e. VXLAN and GENEVE.
Actually, lwtunnels could break this assumption if they are configured by
an external control plane to have different destination ports on the
endpoints: in this case, we won't be able to trace ICMP messages back to
them.
For IPv6 redirect messages, call ip6_redirect() directly with the output
interface argument set to the interface we received the packet from (as
it's the very interface we should build the exception on), otherwise the
new nexthop will be rejected. There's no such need for IPv4.
Tunnels can now export an encap_err_lookup() operation that indicates a
match. Pass the packet to the lookup function, and if the tunnel driver
reports a matching association, continue with regular ICMP error handling.
v2:
- Added newline between network and transport header sets in
__udp{4,6}_lib_err_encap() (David Miller)
- Removed redundant skb_reset_network_header(skb); in
__udp4_lib_err_encap()
- Removed redundant reassignment of iph in __udp4_lib_err_encap()
(Sabrina Dubroca)
- Edited comment to __udp{4,6}_lib_err_encap() to reflect the fact this
won't work with lwtunnels configured to use asymmetric ports. By the way,
it's VXLAN, not VxLAN (Jiri Benc)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:14 +08:00
|
|
|
/* Tunnels don't have an application socket: don't pass errors back */
|
2022-08-26 22:39:28 +08:00
|
|
|
if (tunnel) {
|
|
|
|
if (udp_sk(sk)->encap_err_rcv)
|
2022-10-12 15:49:29 +08:00
|
|
|
udp_sk(sk)->encap_err_rcv(sk, skb, err, uh->dest,
|
|
|
|
ntohl(info), (u8 *)(uh+1));
|
udp: Handle ICMP errors for tunnels with same destination port on both endpoints
For both IPv4 and IPv6, if we can't match errors to a socket, try
tunnels before ignoring them. Look up a socket with the original source
and destination ports as found in the UDP packet inside the ICMP payload,
this will work for tunnels that force the same destination port for both
endpoints, i.e. VXLAN and GENEVE.
Actually, lwtunnels could break this assumption if they are configured by
an external control plane to have different destination ports on the
endpoints: in this case, we won't be able to trace ICMP messages back to
them.
For IPv6 redirect messages, call ip6_redirect() directly with the output
interface argument set to the interface we received the packet from (as
it's the very interface we should build the exception on), otherwise the
new nexthop will be rejected. There's no such need for IPv4.
Tunnels can now export an encap_err_lookup() operation that indicates a
match. Pass the packet to the lookup function, and if the tunnel driver
reports a matching association, continue with regular ICMP error handling.
v2:
- Added newline between network and transport header sets in
__udp{4,6}_lib_err_encap() (David Miller)
- Removed redundant skb_reset_network_header(skb); in
__udp4_lib_err_encap()
- Removed redundant reassignment of iph in __udp4_lib_err_encap()
(Sabrina Dubroca)
- Edited comment to __udp{4,6}_lib_err_encap() to reflect the fact this
won't work with lwtunnels configured to use asymmetric ports. By the way,
it's VXLAN, not VxLAN (Jiri Benc)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:14 +08:00
|
|
|
goto out;
|
2022-08-26 22:39:28 +08:00
|
|
|
}
|
udp: Handle ICMP errors for tunnels with same destination port on both endpoints
For both IPv4 and IPv6, if we can't match errors to a socket, try
tunnels before ignoring them. Look up a socket with the original source
and destination ports as found in the UDP packet inside the ICMP payload,
this will work for tunnels that force the same destination port for both
endpoints, i.e. VXLAN and GENEVE.
Actually, lwtunnels could break this assumption if they are configured by
an external control plane to have different destination ports on the
endpoints: in this case, we won't be able to trace ICMP messages back to
them.
For IPv6 redirect messages, call ip6_redirect() directly with the output
interface argument set to the interface we received the packet from (as
it's the very interface we should build the exception on), otherwise the
new nexthop will be rejected. There's no such need for IPv4.
Tunnels can now export an encap_err_lookup() operation that indicates a
match. Pass the packet to the lookup function, and if the tunnel driver
reports a matching association, continue with regular ICMP error handling.
v2:
- Added newline between network and transport header sets in
__udp{4,6}_lib_err_encap() (David Miller)
- Removed redundant skb_reset_network_header(skb); in
__udp4_lib_err_encap()
- Removed redundant reassignment of iph in __udp4_lib_err_encap()
(Sabrina Dubroca)
- Edited comment to __udp{4,6}_lib_err_encap() to reflect the fact this
won't work with lwtunnels configured to use asymmetric ports. By the way,
it's VXLAN, not VxLAN (Jiri Benc)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-08 19:19:14 +08:00
|
|
|
|
2023-09-13 00:02:08 +08:00
|
|
|
if (!inet6_test_bit(RECVERR6, sk)) {
|
2016-02-18 05:58:22 +08:00
|
|
|
if (!harderr || sk->sk_state != TCP_ESTABLISHED)
|
|
|
|
goto out;
|
|
|
|
} else {
|
2005-04-17 06:20:36 +08:00
|
|
|
ipv6_icmp_error(sk, skb, err, uh->dest, ntohl(info), (u8 *)(uh+1));
|
2016-02-18 05:58:22 +08:00
|
|
|
}
|
2010-06-01 14:44:05 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
sk->sk_err = err;
|
2021-06-28 06:48:21 +08:00
|
|
|
sk_error_report(sk);
|
2005-04-17 06:20:36 +08:00
|
|
|
out:
|
2018-11-08 19:19:21 +08:00
|
|
|
return 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2017-05-17 20:52:16 +08:00
|
|
|
static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
|
2012-04-27 16:23:21 +08:00
|
|
|
{
|
|
|
|
int rc;
|
|
|
|
|
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-04 06:42:29 +08:00
|
|
|
if (!ipv6_addr_any(&sk->sk_v6_daddr)) {
|
2012-04-27 16:23:21 +08:00
|
|
|
sock_rps_save_rxhash(sk, skb);
|
2013-10-08 00:01:38 +08:00
|
|
|
sk_mark_napi_id(sk, skb);
|
net: introduce SO_INCOMING_CPU
Alternative to RPS/RFS is to use hardware support for multiple
queues.
Then split a set of million of sockets into worker threads, each
one using epoll() to manage events on its own socket pool.
Ideally, we want one thread per RX/TX queue/cpu, but we have no way to
know after accept() or connect() on which queue/cpu a socket is managed.
We normally use one cpu per RX queue (IRQ smp_affinity being properly
set), so remembering on socket structure which cpu delivered last packet
is enough to solve the problem.
After accept(), connect(), or even file descriptor passing around
processes, applications can use :
int cpu;
socklen_t len = sizeof(cpu);
getsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, &len);
And use this information to put the socket into the right silo
for optimal performance, as all networking stack should run
on the appropriate cpu, without need to send IPI (RPS/RFS).
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-11-11 21:54:28 +08:00
|
|
|
sk_incoming_cpu_update(sk);
|
2016-11-17 01:10:42 +08:00
|
|
|
} else {
|
|
|
|
sk_mark_napi_id_once(sk, skb);
|
2013-10-08 00:01:38 +08:00
|
|
|
}
|
2012-04-27 16:23:21 +08:00
|
|
|
|
2016-10-21 19:55:47 +08:00
|
|
|
rc = __udp_enqueue_schedule_skb(sk, skb);
|
2012-04-27 16:23:21 +08:00
|
|
|
if (rc < 0) {
|
|
|
|
int is_udplite = IS_UDPLITE(sk);
|
2022-09-26 20:03:50 +08:00
|
|
|
enum skb_drop_reason drop_reason;
|
2012-04-27 16:23:21 +08:00
|
|
|
|
|
|
|
/* Note that an ENOMEM error is charged twice */
|
2022-09-26 20:03:50 +08:00
|
|
|
if (rc == -ENOMEM) {
|
2016-04-30 05:16:50 +08:00
|
|
|
UDP6_INC_STATS(sock_net(sk),
|
2016-04-28 07:44:30 +08:00
|
|
|
UDP_MIB_RCVBUFERRORS, is_udplite);
|
2022-09-26 20:03:50 +08:00
|
|
|
drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
|
|
|
|
} else {
|
2020-11-06 09:49:14 +08:00
|
|
|
UDP6_INC_STATS(sock_net(sk),
|
|
|
|
UDP_MIB_MEMERRORS, is_udplite);
|
2022-09-26 20:03:50 +08:00
|
|
|
drop_reason = SKB_DROP_REASON_PROTO_MEM;
|
|
|
|
}
|
2016-04-30 05:16:50 +08:00
|
|
|
UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
|
2024-03-27 02:05:47 +08:00
|
|
|
trace_udp_fail_queue_rcv_skb(rc, sk, skb);
|
2024-06-18 02:09:24 +08:00
|
|
|
sk_skb_reason_drop(sk, skb, drop_reason);
|
2012-04-27 16:23:21 +08:00
|
|
|
return -1;
|
|
|
|
}
|
2016-10-21 19:55:47 +08:00
|
|
|
|
2012-04-27 16:23:21 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-11-08 19:19:21 +08:00
|
|
|
/*
 * ICMPv6 error entry point for UDP: forwards to __udp6_lib_err() using the
 * UDP table of the network namespace that owns skb->dev.
 */
static __inline__ int udpv6_err(struct sk_buff *skb,
				struct inet6_skb_parm *opt, u8 type,
				u8 code, int offset, __be32 info)
{
	struct udp_table *table = dev_net(skb->dev)->ipv4.udp_table;

	return __udp6_lib_err(skb, opt, type, code, offset, info, table);
}
|
|
|
|
|
2018-11-07 19:38:33 +08:00
|
|
|
static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2022-09-26 20:03:50 +08:00
|
|
|
enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
|
2006-11-28 03:10:57 +08:00
|
|
|
struct udp_sock *up = udp_sk(sk);
|
2007-12-03 19:34:16 +08:00
|
|
|
int is_udplite = IS_UDPLITE(sk);
|
2006-08-15 15:00:09 +08:00
|
|
|
|
2022-09-26 20:03:50 +08:00
|
|
|
if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) {
|
|
|
|
drop_reason = SKB_DROP_REASON_XFRM_POLICY;
|
2006-11-28 03:10:57 +08:00
|
|
|
goto drop;
|
2022-09-26 20:03:50 +08:00
|
|
|
}
|
2023-03-21 23:58:44 +08:00
|
|
|
nf_reset_ct(skb);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2023-09-12 17:17:28 +08:00
|
|
|
if (static_branch_unlikely(&udpv6_encap_needed_key) &&
|
|
|
|
READ_ONCE(up->encap_type)) {
|
2012-04-27 16:24:08 +08:00
|
|
|
int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is an encapsulation socket so pass the skb to
|
|
|
|
* the socket's udp_encap_rcv() hook. Otherwise, just
|
|
|
|
* fall through and pass this up the UDP socket.
|
|
|
|
* up->encap_rcv() returns the following value:
|
|
|
|
* =0 if skb was successfully passed to the encap
|
|
|
|
* handler or was discarded by it.
|
|
|
|
* >0 if skb should be passed on to UDP.
|
|
|
|
* <0 if skb should be resubmitted as proto -N
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* if we're overly short, let UDP handle it */
|
locking/atomics: COCCINELLE/treewide: Convert trivial ACCESS_ONCE() patterns to READ_ONCE()/WRITE_ONCE()
Please do not apply this to mainline directly, instead please re-run the
coccinelle script shown below and apply its output.
For several reasons, it is desirable to use {READ,WRITE}_ONCE() in
preference to ACCESS_ONCE(), and new code is expected to use one of the
former. So far, there's been no reason to change most existing uses of
ACCESS_ONCE(), as these aren't harmful, and changing them results in
churn.
However, for some features, the read/write distinction is critical to
correct operation. To distinguish these cases, separate read/write
accessors must be used. This patch migrates (most) remaining
ACCESS_ONCE() instances to {READ,WRITE}_ONCE(), using the following
coccinelle script:
----
// Convert trivial ACCESS_ONCE() uses to equivalent READ_ONCE() and
// WRITE_ONCE()
// $ make coccicheck COCCI=/home/mark/once.cocci SPFLAGS="--include-headers" MODE=patch
virtual patch
@ depends on patch @
expression E1, E2;
@@
- ACCESS_ONCE(E1) = E2
+ WRITE_ONCE(E1, E2)
@ depends on patch @
expression E;
@@
- ACCESS_ONCE(E)
+ READ_ONCE(E)
----
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: davem@davemloft.net
Cc: linux-arch@vger.kernel.org
Cc: mpe@ellerman.id.au
Cc: shuah@kernel.org
Cc: snitzer@redhat.com
Cc: thor.thayer@linux.intel.com
Cc: tj@kernel.org
Cc: viro@zeniv.linux.org.uk
Cc: will.deacon@arm.com
Link: http://lkml.kernel.org/r/1508792849-3115-19-git-send-email-paulmck@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-10-24 05:07:29 +08:00
|
|
|
encap_rcv = READ_ONCE(up->encap_rcv);
|
2016-05-19 21:58:33 +08:00
|
|
|
if (encap_rcv) {
|
2012-04-27 16:24:08 +08:00
|
|
|
int ret;
|
|
|
|
|
2014-05-08 07:52:39 +08:00
|
|
|
/* Verify checksum before giving to encap */
|
|
|
|
if (udp_lib_checksum_complete(skb))
|
|
|
|
goto csum_error;
|
|
|
|
|
2012-04-27 16:24:08 +08:00
|
|
|
ret = encap_rcv(sk, skb);
|
|
|
|
if (ret <= 0) {
|
2021-11-03 16:28:43 +08:00
|
|
|
__UDP6_INC_STATS(sock_net(sk),
|
|
|
|
UDP_MIB_INDATAGRAMS,
|
|
|
|
is_udplite);
|
2012-04-27 16:24:08 +08:00
|
|
|
return -ret;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* FALLTHROUGH -- it's a UDP Packet */
|
|
|
|
}
|
|
|
|
|
2006-11-28 03:10:57 +08:00
|
|
|
/*
|
|
|
|
* UDP-Lite specific tests, ignored on UDP sockets (see net/ipv4/udp.c).
|
|
|
|
*/
|
2023-09-12 17:17:30 +08:00
|
|
|
if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) {
|
|
|
|
u16 pcrlen = READ_ONCE(up->pcrlen);
|
2006-11-28 03:10:57 +08:00
|
|
|
|
2023-09-12 17:17:30 +08:00
|
|
|
if (pcrlen == 0) { /* full coverage was set */
|
2014-11-12 02:59:17 +08:00
|
|
|
net_dbg_ratelimited("UDPLITE6: partial coverage %d while full coverage %d requested\n",
|
|
|
|
UDP_SKB_CB(skb)->cscov, skb->len);
|
2006-11-28 03:10:57 +08:00
|
|
|
goto drop;
|
|
|
|
}
|
2023-09-12 17:17:30 +08:00
|
|
|
if (UDP_SKB_CB(skb)->cscov < pcrlen) {
|
2014-11-12 02:59:17 +08:00
|
|
|
net_dbg_ratelimited("UDPLITE6: coverage %d too small, need min %d\n",
|
2023-09-12 17:17:30 +08:00
|
|
|
UDP_SKB_CB(skb)->cscov, pcrlen);
|
2006-11-28 03:10:57 +08:00
|
|
|
goto drop;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2017-06-22 21:01:22 +08:00
|
|
|
prefetch(&sk->sk_rmem_alloc);
|
2016-06-03 05:52:43 +08:00
|
|
|
if (rcu_access_pointer(sk->sk_filter) &&
|
|
|
|
udp_lib_checksum_complete(skb))
|
|
|
|
goto csum_error;
|
|
|
|
|
2022-09-26 20:03:50 +08:00
|
|
|
if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) {
|
|
|
|
drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
|
2016-07-08 23:52:33 +08:00
|
|
|
goto drop;
|
2022-09-26 20:03:50 +08:00
|
|
|
}
|
2006-11-28 03:10:57 +08:00
|
|
|
|
2016-04-06 00:41:15 +08:00
|
|
|
udp_csum_pull_header(skb);
|
2012-04-27 16:23:59 +08:00
|
|
|
|
ipv4: PKTINFO doesnt need dst reference
Le lundi 07 novembre 2011 à 15:33 +0100, Eric Dumazet a écrit :
> At least, in recent kernels we dont change dst->refcnt in forwarding
> patch (usinf NOREF skb->dst)
>
> One particular point is the atomic_inc(dst->refcnt) we have to perform
> when queuing an UDP packet if socket asked PKTINFO stuff (for example a
> typical DNS server has to setup this option)
>
> I have one patch somewhere that stores the information in skb->cb[] and
> avoid the atomic_{inc|dec}(dst->refcnt).
>
OK I found it, I did some extra tests and believe its ready.
[PATCH net-next] ipv4: IP_PKTINFO doesnt need dst reference
When a socket uses IP_PKTINFO notifications, we currently force a dst
reference for each received skb. Reader has to access dst to get needed
information (rt_iif & rt_spec_dst) and must release dst reference.
We also forced a dst reference if skb was put in socket backlog, even
without IP_PKTINFO handling. This happens under stress/load.
We can instead store the needed information in skb->cb[], so that only
softirq handler really access dst, improving cache hit ratios.
This removes two atomic operations per packet, and false sharing as
well.
On a benchmark using a mono threaded receiver (doing only recvmsg()
calls), I can reach 720.000 pps instead of 570.000 pps.
IP_PKTINFO is typically used by DNS servers, and any multihomed aware
UDP application.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 15:24:35 +08:00
|
|
|
skb_dst_drop(skb);
|
2007-12-03 19:33:28 +08:00
|
|
|
|
2016-10-21 19:55:47 +08:00
|
|
|
return __udpv6_queue_rcv_skb(sk, skb);
|
2014-06-26 05:38:13 +08:00
|
|
|
|
2013-04-29 16:39:56 +08:00
|
|
|
csum_error:
|
2022-09-26 20:03:50 +08:00
|
|
|
drop_reason = SKB_DROP_REASON_UDP_CSUM;
|
2016-04-28 07:44:30 +08:00
|
|
|
__UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
|
2006-11-28 03:10:57 +08:00
|
|
|
drop:
|
2016-04-28 07:44:30 +08:00
|
|
|
__UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
|
2012-04-27 16:23:59 +08:00
|
|
|
atomic_inc(&sk->sk_drops);
|
2024-06-18 02:09:24 +08:00
|
|
|
sk_skb_reason_drop(sk, skb, drop_reason);
|
2006-11-28 03:10:57 +08:00
|
|
|
return -1;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2018-11-07 19:38:33 +08:00
|
|
|
static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
struct sk_buff *next, *segs;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (likely(!udp_unexpected_gso(sk, skb)))
|
|
|
|
return udpv6_queue_rcv_one_skb(sk, skb);
|
|
|
|
|
|
|
|
__skb_push(skb, -skb_mac_offset(skb));
|
|
|
|
segs = udp_rcv_segment(sk, skb, false);
|
2020-01-14 07:42:27 +08:00
|
|
|
skb_list_walk_safe(segs, skb, next) {
|
2018-11-07 19:38:33 +08:00
|
|
|
__skb_pull(skb, skb_transport_offset(skb));
|
|
|
|
|
2021-03-30 18:28:49 +08:00
|
|
|
udp_post_segment_fix_csum(skb);
|
2018-11-07 19:38:33 +08:00
|
|
|
ret = udpv6_queue_rcv_one_skb(sk, skb);
|
|
|
|
if (ret > 0)
|
|
|
|
ip6_protocol_deliver_rcu(dev_net(skb->dev), skb, ret,
|
|
|
|
true);
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2023-03-16 23:31:59 +08:00
|
|
|
static bool __udp_v6_is_mcast_sock(struct net *net, const struct sock *sk,
|
2014-07-16 11:28:31 +08:00
|
|
|
__be16 loc_port, const struct in6_addr *loc_addr,
|
|
|
|
__be16 rmt_port, const struct in6_addr *rmt_addr,
|
2018-11-07 23:36:10 +08:00
|
|
|
int dif, int sdif, unsigned short hnum)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2023-03-16 23:31:59 +08:00
|
|
|
const struct inet_sock *inet = inet_sk(sk);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2014-07-16 11:28:31 +08:00
|
|
|
if (!net_eq(sock_net(sk), net))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (udp_sk(sk)->udp_port_hash != hnum ||
|
|
|
|
sk->sk_family != PF_INET6 ||
|
|
|
|
(inet->inet_dport && inet->inet_dport != rmt_port) ||
|
|
|
|
(!ipv6_addr_any(&sk->sk_v6_daddr) &&
|
|
|
|
!ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) ||
|
2022-05-14 02:55:41 +08:00
|
|
|
!udp_sk_bound_dev_eq(net, READ_ONCE(sk->sk_bound_dev_if), dif, sdif) ||
|
2015-05-19 03:08:49 +08:00
|
|
|
(!ipv6_addr_any(&sk->sk_v6_rcv_saddr) &&
|
|
|
|
!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr)))
|
2014-07-16 11:28:31 +08:00
|
|
|
return false;
|
|
|
|
if (!inet6_mc_check(sk, loc_addr, rmt_addr))
|
|
|
|
return false;
|
|
|
|
return true;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2014-05-03 07:29:58 +08:00
|
|
|
static void udp6_csum_zero_error(struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
/* RFC 2460 section 8.1 says that we SHOULD log
|
|
|
|
* this error. Well, it is reasonable.
|
|
|
|
*/
|
2014-11-12 02:59:17 +08:00
|
|
|
net_dbg_ratelimited("IPv6: udp checksum is 0 for [%pI6c]:%u->[%pI6c]:%u\n",
|
|
|
|
&ipv6_hdr(skb)->saddr, ntohs(udp_hdr(skb)->source),
|
|
|
|
&ipv6_hdr(skb)->daddr, ntohs(udp_hdr(skb)->dest));
|
2014-05-03 07:29:58 +08:00
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Note: called only from the BH handler context,
|
|
|
|
* so we don't need to lock the hashes.
|
|
|
|
*/
|
2008-06-17 08:12:11 +08:00
|
|
|
static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
|
2011-04-22 12:53:02 +08:00
|
|
|
const struct in6_addr *saddr, const struct in6_addr *daddr,
|
2014-11-07 02:37:54 +08:00
|
|
|
struct udp_table *udptable, int proto)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2016-04-01 23:52:13 +08:00
|
|
|
struct sock *sk, *first = NULL;
|
2007-03-14 01:28:48 +08:00
|
|
|
const struct udphdr *uh = udp_hdr(skb);
|
2014-07-16 11:28:31 +08:00
|
|
|
unsigned short hnum = ntohs(uh->dest);
|
|
|
|
struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
|
2016-04-01 23:52:13 +08:00
|
|
|
unsigned int offset = offsetof(typeof(*sk), sk_node);
|
2014-07-16 11:28:32 +08:00
|
|
|
unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
|
2016-04-01 23:52:13 +08:00
|
|
|
int dif = inet6_iif(skb);
|
2018-11-07 23:36:10 +08:00
|
|
|
int sdif = inet6_sdif(skb);
|
2016-04-01 23:52:13 +08:00
|
|
|
struct hlist_node *node;
|
|
|
|
struct sk_buff *nskb;
|
2014-07-16 11:28:32 +08:00
|
|
|
|
|
|
|
if (use_hash2) {
|
2017-12-02 04:52:30 +08:00
|
|
|
hash2_any = ipv6_portaddr_hash(net, &in6addr_any, hnum) &
|
2016-11-15 06:40:30 +08:00
|
|
|
udptable->mask;
|
2017-12-02 04:52:30 +08:00
|
|
|
hash2 = ipv6_portaddr_hash(net, daddr, hnum) & udptable->mask;
|
2014-07-16 11:28:32 +08:00
|
|
|
start_lookup:
|
2016-11-15 06:40:30 +08:00
|
|
|
hslot = &udptable->hash2[hash2];
|
2014-07-16 11:28:32 +08:00
|
|
|
offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2016-04-01 23:52:13 +08:00
|
|
|
sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
|
|
|
|
if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr,
|
2018-11-07 23:36:10 +08:00
|
|
|
uh->source, saddr, dif, sdif,
|
|
|
|
hnum))
|
2016-04-01 23:52:13 +08:00
|
|
|
continue;
|
|
|
|
/* If zero checksum and no_check is not on for
|
|
|
|
* the socket then skip it.
|
|
|
|
*/
|
2023-09-12 17:17:23 +08:00
|
|
|
if (!uh->check && !udp_get_no_check6_rx(sk))
|
2016-04-01 23:52:13 +08:00
|
|
|
continue;
|
|
|
|
if (!first) {
|
|
|
|
first = sk;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
nskb = skb_clone(skb, GFP_ATOMIC);
|
|
|
|
if (unlikely(!nskb)) {
|
|
|
|
atomic_inc(&sk->sk_drops);
|
2016-04-28 07:44:30 +08:00
|
|
|
__UDP6_INC_STATS(net, UDP_MIB_RCVBUFERRORS,
|
|
|
|
IS_UDPLITE(sk));
|
|
|
|
__UDP6_INC_STATS(net, UDP_MIB_INERRORS,
|
|
|
|
IS_UDPLITE(sk));
|
2016-04-01 23:52:13 +08:00
|
|
|
continue;
|
2007-12-31 16:29:24 +08:00
|
|
|
}
|
2009-11-08 18:18:52 +08:00
|
|
|
|
2016-04-01 23:52:13 +08:00
|
|
|
if (udpv6_queue_rcv_skb(sk, nskb) > 0)
|
|
|
|
consume_skb(nskb);
|
|
|
|
}
|
2009-11-08 18:18:52 +08:00
|
|
|
|
2014-07-16 11:28:32 +08:00
|
|
|
/* Also lookup *:port if we are using hash2 and haven't done so yet. */
|
|
|
|
if (use_hash2 && hash2 != hash2_any) {
|
|
|
|
hash2 = hash2_any;
|
|
|
|
goto start_lookup;
|
|
|
|
}
|
|
|
|
|
2016-04-01 23:52:13 +08:00
|
|
|
if (first) {
|
|
|
|
if (udpv6_queue_rcv_skb(first, skb) > 0)
|
|
|
|
consume_skb(skb);
|
2009-11-08 18:18:52 +08:00
|
|
|
} else {
|
2016-04-01 23:52:13 +08:00
|
|
|
kfree_skb(skb);
|
2016-04-28 07:44:30 +08:00
|
|
|
__UDP6_INC_STATS(net, UDP_MIB_IGNOREDMULTI,
|
|
|
|
proto == IPPROTO_UDPLITE);
|
2009-11-08 18:18:52 +08:00
|
|
|
}
|
2006-11-28 03:10:57 +08:00
|
|
|
return 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2017-08-25 20:31:01 +08:00
|
|
|
static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
|
|
|
|
{
|
2024-04-26 23:19:52 +08:00
|
|
|
if (udp_sk_rx_dst_set(sk, dst))
|
|
|
|
sk->sk_rx_dst_cookie = rt6_get_cookie(dst_rt6_info(dst));
|
2017-08-25 20:31:01 +08:00
|
|
|
}
|
|
|
|
|
2018-09-13 22:27:21 +08:00
|
|
|
/* wrapper for udp_queue_rcv_skb tacking care of csum conversion and
|
|
|
|
* return code conversion for ip layer consumption
|
|
|
|
*/
|
|
|
|
static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
|
|
|
|
struct udphdr *uh)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
|
2019-07-04 17:03:26 +08:00
|
|
|
skb_checksum_try_convert(skb, IPPROTO_UDP, ip6_compute_pseudo);
|
2018-09-13 22:27:21 +08:00
|
|
|
|
|
|
|
ret = udpv6_queue_rcv_skb(sk, skb);
|
|
|
|
|
2018-10-17 17:44:04 +08:00
|
|
|
/* a return value > 0 means to resubmit the input */
|
2018-09-13 22:27:21 +08:00
|
|
|
if (ret > 0)
|
2018-10-17 17:44:04 +08:00
|
|
|
return ret;
|
2018-09-13 22:27:21 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-10-29 16:41:45 +08:00
|
|
|
int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
|
2007-03-26 11:10:56 +08:00
|
|
|
int proto)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2022-02-12 01:15:07 +08:00
|
|
|
enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
|
2016-04-01 23:52:13 +08:00
|
|
|
const struct in6_addr *saddr, *daddr;
|
2010-02-18 16:25:24 +08:00
|
|
|
struct net *net = dev_net(skb->dev);
|
2024-06-18 02:09:24 +08:00
|
|
|
struct sock *sk = NULL;
|
2007-02-09 22:24:49 +08:00
|
|
|
struct udphdr *uh;
|
2020-03-30 06:53:39 +08:00
|
|
|
bool refcounted;
|
2005-04-17 06:20:36 +08:00
|
|
|
u32 ulen = 0;
|
|
|
|
|
|
|
|
if (!pskb_may_pull(skb, sizeof(struct udphdr)))
|
2010-05-06 11:44:35 +08:00
|
|
|
goto discard;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-04-26 08:54:47 +08:00
|
|
|
saddr = &ipv6_hdr(skb)->saddr;
|
|
|
|
daddr = &ipv6_hdr(skb)->daddr;
|
2007-03-14 01:28:48 +08:00
|
|
|
uh = udp_hdr(skb);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
ulen = ntohs(uh->len);
|
2006-11-28 03:10:57 +08:00
|
|
|
if (ulen > skb->len)
|
|
|
|
goto short_packet;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-03-26 11:10:56 +08:00
|
|
|
if (proto == IPPROTO_UDP) {
|
|
|
|
/* UDP validates ulen. */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-11-28 03:10:57 +08:00
|
|
|
/* Check for jumbo payload */
|
|
|
|
if (ulen == 0)
|
|
|
|
ulen = skb->len;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-11-28 03:10:57 +08:00
|
|
|
if (ulen < sizeof(*uh))
|
|
|
|
goto short_packet;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-11-28 03:10:57 +08:00
|
|
|
if (ulen < skb->len) {
|
|
|
|
if (pskb_trim_rcsum(skb, ulen))
|
|
|
|
goto short_packet;
|
2007-04-26 08:54:47 +08:00
|
|
|
saddr = &ipv6_hdr(skb)->saddr;
|
|
|
|
daddr = &ipv6_hdr(skb)->daddr;
|
2007-03-14 01:28:48 +08:00
|
|
|
uh = udp_hdr(skb);
|
2006-11-28 03:10:57 +08:00
|
|
|
}
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-03-26 11:10:56 +08:00
|
|
|
if (udp6_csum_init(skb, uh, proto))
|
2013-04-29 16:39:56 +08:00
|
|
|
goto csum_error;
|
2007-03-26 11:10:56 +08:00
|
|
|
|
2017-07-27 20:45:09 +08:00
|
|
|
/* Check if the socket is already available, e.g. due to early demux */
|
bpf, net: Support SO_REUSEPORT sockets with bpf_sk_assign
Currently the bpf_sk_assign helper in tc BPF context refuses SO_REUSEPORT
sockets. This means we can't use the helper to steer traffic to Envoy,
which configures SO_REUSEPORT on its sockets. In turn, we're blocked
from removing TPROXY from our setup.
The reason that bpf_sk_assign refuses such sockets is that the
bpf_sk_lookup helpers don't execute SK_REUSEPORT programs. Instead,
one of the reuseport sockets is selected by hash. This could cause
dispatch to the "wrong" socket:
sk = bpf_sk_lookup_tcp(...) // select SO_REUSEPORT by hash
bpf_sk_assign(skb, sk) // SK_REUSEPORT wasn't executed
Fixing this isn't as simple as invoking SK_REUSEPORT from the lookup
helpers unfortunately. In the tc context, L2 headers are at the start
of the skb, while SK_REUSEPORT expects L3 headers instead.
Instead, we execute the SK_REUSEPORT program when the assigned socket
is pulled out of the skb, further up the stack. This creates some
trickiness with regards to refcounting as bpf_sk_assign will put both
refcounted and RCU freed sockets in skb->sk. reuseport sockets are RCU
freed. We can infer that the sk_assigned socket is RCU freed if the
reuseport lookup succeeds, but convincing yourself of this fact isn't
straight forward. Therefore we defensively check refcounting on the
sk_assign sock even though it's probably not required in practice.
Fixes: 8e368dc72e86 ("bpf: Fix use of sk->sk_reuseport from sk_assign")
Fixes: cf7fbe660f2d ("bpf: Add socket assign support")
Co-developed-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Joe Stringer <joe@cilium.io>
Link: https://lore.kernel.org/bpf/CACAyw98+qycmpQzKupquhkxbvWK4OFyDuuLMBNROnfWMZxUWeA@mail.gmail.com/
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/r/20230720-so-reuseport-v6-7-7021b683cdae@isovalent.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2023-07-20 23:30:11 +08:00
|
|
|
sk = inet6_steal_sock(net, skb, sizeof(struct udphdr), saddr, uh->source, daddr, uh->dest,
|
|
|
|
&refcounted, udp6_ehashfn);
|
|
|
|
if (IS_ERR(sk))
|
|
|
|
goto no_sk;
|
|
|
|
|
2017-07-27 20:45:09 +08:00
|
|
|
if (sk) {
|
|
|
|
struct dst_entry *dst = skb_dst(skb);
|
|
|
|
int ret;
|
|
|
|
|
inet: fully convert sk->sk_rx_dst to RCU rules
syzbot reported various issues around early demux,
one being included in this changelog [1]
sk->sk_rx_dst is using RCU protection without clearly
documenting it.
The following sequences in tcp_v4_do_rcv()/tcp_v6_do_rcv()
do not follow standard RCU rules:
[a] dst_release(dst);
[b] sk->sk_rx_dst = NULL;
They look wrong because a delete operation on an RCU-protected
pointer is supposed to clear the pointer before
the call_rcu()/synchronize_rcu() guarding the actual memory freeing.
In some cases indeed, dst could be freed before [b] is done.
We could cheat by clearing sk_rx_dst before calling
dst_release(), but this seems the right time to stick
to standard RCU annotations and debugging facilities.
[1]
BUG: KASAN: use-after-free in dst_check include/net/dst.h:470 [inline]
BUG: KASAN: use-after-free in tcp_v4_early_demux+0x95b/0x960 net/ipv4/tcp_ipv4.c:1792
Read of size 2 at addr ffff88807f1cb73a by task syz-executor.5/9204
CPU: 0 PID: 9204 Comm: syz-executor.5 Not tainted 5.16.0-rc5-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106
print_address_description.constprop.0.cold+0x8d/0x320 mm/kasan/report.c:247
__kasan_report mm/kasan/report.c:433 [inline]
kasan_report.cold+0x83/0xdf mm/kasan/report.c:450
dst_check include/net/dst.h:470 [inline]
tcp_v4_early_demux+0x95b/0x960 net/ipv4/tcp_ipv4.c:1792
ip_rcv_finish_core.constprop.0+0x15de/0x1e80 net/ipv4/ip_input.c:340
ip_list_rcv_finish.constprop.0+0x1b2/0x6e0 net/ipv4/ip_input.c:583
ip_sublist_rcv net/ipv4/ip_input.c:609 [inline]
ip_list_rcv+0x34e/0x490 net/ipv4/ip_input.c:644
__netif_receive_skb_list_ptype net/core/dev.c:5508 [inline]
__netif_receive_skb_list_core+0x549/0x8e0 net/core/dev.c:5556
__netif_receive_skb_list net/core/dev.c:5608 [inline]
netif_receive_skb_list_internal+0x75e/0xd80 net/core/dev.c:5699
gro_normal_list net/core/dev.c:5853 [inline]
gro_normal_list net/core/dev.c:5849 [inline]
napi_complete_done+0x1f1/0x880 net/core/dev.c:6590
virtqueue_napi_complete drivers/net/virtio_net.c:339 [inline]
virtnet_poll+0xca2/0x11b0 drivers/net/virtio_net.c:1557
__napi_poll+0xaf/0x440 net/core/dev.c:7023
napi_poll net/core/dev.c:7090 [inline]
net_rx_action+0x801/0xb40 net/core/dev.c:7177
__do_softirq+0x29b/0x9c2 kernel/softirq.c:558
invoke_softirq kernel/softirq.c:432 [inline]
__irq_exit_rcu+0x123/0x180 kernel/softirq.c:637
irq_exit_rcu+0x5/0x20 kernel/softirq.c:649
common_interrupt+0x52/0xc0 arch/x86/kernel/irq.c:240
asm_common_interrupt+0x1e/0x40 arch/x86/include/asm/idtentry.h:629
RIP: 0033:0x7f5e972bfd57
Code: 39 d1 73 14 0f 1f 80 00 00 00 00 48 8b 50 f8 48 83 e8 08 48 39 ca 77 f3 48 39 c3 73 3e 48 89 13 48 8b 50 f8 48 89 38 49 8b 0e <48> 8b 3e 48 83 c3 08 48 83 c6 08 eb bc 48 39 d1 72 9e 48 39 d0 73
RSP: 002b:00007fff8a413210 EFLAGS: 00000283
RAX: 00007f5e97108990 RBX: 00007f5e97108338 RCX: ffffffff81d3aa45
RDX: ffffffff81d3aa45 RSI: 00007f5e97108340 RDI: ffffffff81d3aa45
RBP: 00007f5e97107eb8 R08: 00007f5e97108d88 R09: 0000000093c2e8d9
R10: 0000000000000000 R11: 0000000000000000 R12: 00007f5e97107eb0
R13: 00007f5e97108338 R14: 00007f5e97107ea8 R15: 0000000000000019
</TASK>
Allocated by task 13:
kasan_save_stack+0x1e/0x50 mm/kasan/common.c:38
kasan_set_track mm/kasan/common.c:46 [inline]
set_alloc_info mm/kasan/common.c:434 [inline]
__kasan_slab_alloc+0x90/0xc0 mm/kasan/common.c:467
kasan_slab_alloc include/linux/kasan.h:259 [inline]
slab_post_alloc_hook mm/slab.h:519 [inline]
slab_alloc_node mm/slub.c:3234 [inline]
slab_alloc mm/slub.c:3242 [inline]
kmem_cache_alloc+0x202/0x3a0 mm/slub.c:3247
dst_alloc+0x146/0x1f0 net/core/dst.c:92
rt_dst_alloc+0x73/0x430 net/ipv4/route.c:1613
ip_route_input_slow+0x1817/0x3a20 net/ipv4/route.c:2340
ip_route_input_rcu net/ipv4/route.c:2470 [inline]
ip_route_input_noref+0x116/0x2a0 net/ipv4/route.c:2415
ip_rcv_finish_core.constprop.0+0x288/0x1e80 net/ipv4/ip_input.c:354
ip_list_rcv_finish.constprop.0+0x1b2/0x6e0 net/ipv4/ip_input.c:583
ip_sublist_rcv net/ipv4/ip_input.c:609 [inline]
ip_list_rcv+0x34e/0x490 net/ipv4/ip_input.c:644
__netif_receive_skb_list_ptype net/core/dev.c:5508 [inline]
__netif_receive_skb_list_core+0x549/0x8e0 net/core/dev.c:5556
__netif_receive_skb_list net/core/dev.c:5608 [inline]
netif_receive_skb_list_internal+0x75e/0xd80 net/core/dev.c:5699
gro_normal_list net/core/dev.c:5853 [inline]
gro_normal_list net/core/dev.c:5849 [inline]
napi_complete_done+0x1f1/0x880 net/core/dev.c:6590
virtqueue_napi_complete drivers/net/virtio_net.c:339 [inline]
virtnet_poll+0xca2/0x11b0 drivers/net/virtio_net.c:1557
__napi_poll+0xaf/0x440 net/core/dev.c:7023
napi_poll net/core/dev.c:7090 [inline]
net_rx_action+0x801/0xb40 net/core/dev.c:7177
__do_softirq+0x29b/0x9c2 kernel/softirq.c:558
Freed by task 13:
kasan_save_stack+0x1e/0x50 mm/kasan/common.c:38
kasan_set_track+0x21/0x30 mm/kasan/common.c:46
kasan_set_free_info+0x20/0x30 mm/kasan/generic.c:370
____kasan_slab_free mm/kasan/common.c:366 [inline]
____kasan_slab_free mm/kasan/common.c:328 [inline]
__kasan_slab_free+0xff/0x130 mm/kasan/common.c:374
kasan_slab_free include/linux/kasan.h:235 [inline]
slab_free_hook mm/slub.c:1723 [inline]
slab_free_freelist_hook+0x8b/0x1c0 mm/slub.c:1749
slab_free mm/slub.c:3513 [inline]
kmem_cache_free+0xbd/0x5d0 mm/slub.c:3530
dst_destroy+0x2d6/0x3f0 net/core/dst.c:127
rcu_do_batch kernel/rcu/tree.c:2506 [inline]
rcu_core+0x7ab/0x1470 kernel/rcu/tree.c:2741
__do_softirq+0x29b/0x9c2 kernel/softirq.c:558
Last potentially related work creation:
kasan_save_stack+0x1e/0x50 mm/kasan/common.c:38
__kasan_record_aux_stack+0xf5/0x120 mm/kasan/generic.c:348
__call_rcu kernel/rcu/tree.c:2985 [inline]
call_rcu+0xb1/0x740 kernel/rcu/tree.c:3065
dst_release net/core/dst.c:177 [inline]
dst_release+0x79/0xe0 net/core/dst.c:167
tcp_v4_do_rcv+0x612/0x8d0 net/ipv4/tcp_ipv4.c:1712
sk_backlog_rcv include/net/sock.h:1030 [inline]
__release_sock+0x134/0x3b0 net/core/sock.c:2768
release_sock+0x54/0x1b0 net/core/sock.c:3300
tcp_sendmsg+0x36/0x40 net/ipv4/tcp.c:1441
inet_sendmsg+0x99/0xe0 net/ipv4/af_inet.c:819
sock_sendmsg_nosec net/socket.c:704 [inline]
sock_sendmsg+0xcf/0x120 net/socket.c:724
sock_write_iter+0x289/0x3c0 net/socket.c:1057
call_write_iter include/linux/fs.h:2162 [inline]
new_sync_write+0x429/0x660 fs/read_write.c:503
vfs_write+0x7cd/0xae0 fs/read_write.c:590
ksys_write+0x1ee/0x250 fs/read_write.c:643
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
The buggy address belongs to the object at ffff88807f1cb700
which belongs to the cache ip_dst_cache of size 176
The buggy address is located 58 bytes inside of
176-byte region [ffff88807f1cb700, ffff88807f1cb7b0)
The buggy address belongs to the page:
page:ffffea0001fc72c0 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x7f1cb
flags: 0xfff00000000200(slab|node=0|zone=1|lastcpupid=0x7ff)
raw: 00fff00000000200 dead000000000100 dead000000000122 ffff8881413bb780
raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected
page_owner tracks the page as allocated
page last allocated via order 0, migratetype Unmovable, gfp_mask 0x112a20(GFP_ATOMIC|__GFP_NOWARN|__GFP_NORETRY|__GFP_HARDWALL), pid 5, ts 108466983062, free_ts 108048976062
prep_new_page mm/page_alloc.c:2418 [inline]
get_page_from_freelist+0xa72/0x2f50 mm/page_alloc.c:4149
__alloc_pages+0x1b2/0x500 mm/page_alloc.c:5369
alloc_pages+0x1a7/0x300 mm/mempolicy.c:2191
alloc_slab_page mm/slub.c:1793 [inline]
allocate_slab mm/slub.c:1930 [inline]
new_slab+0x32d/0x4a0 mm/slub.c:1993
___slab_alloc+0x918/0xfe0 mm/slub.c:3022
__slab_alloc.constprop.0+0x4d/0xa0 mm/slub.c:3109
slab_alloc_node mm/slub.c:3200 [inline]
slab_alloc mm/slub.c:3242 [inline]
kmem_cache_alloc+0x35c/0x3a0 mm/slub.c:3247
dst_alloc+0x146/0x1f0 net/core/dst.c:92
rt_dst_alloc+0x73/0x430 net/ipv4/route.c:1613
__mkroute_output net/ipv4/route.c:2564 [inline]
ip_route_output_key_hash_rcu+0x921/0x2d00 net/ipv4/route.c:2791
ip_route_output_key_hash+0x18b/0x300 net/ipv4/route.c:2619
__ip_route_output_key include/net/route.h:126 [inline]
ip_route_output_flow+0x23/0x150 net/ipv4/route.c:2850
ip_route_output_key include/net/route.h:142 [inline]
geneve_get_v4_rt+0x3a6/0x830 drivers/net/geneve.c:809
geneve_xmit_skb drivers/net/geneve.c:899 [inline]
geneve_xmit+0xc4a/0x3540 drivers/net/geneve.c:1082
__netdev_start_xmit include/linux/netdevice.h:4994 [inline]
netdev_start_xmit include/linux/netdevice.h:5008 [inline]
xmit_one net/core/dev.c:3590 [inline]
dev_hard_start_xmit+0x1eb/0x920 net/core/dev.c:3606
__dev_queue_xmit+0x299a/0x3650 net/core/dev.c:4229
page last free stack trace:
reset_page_owner include/linux/page_owner.h:24 [inline]
free_pages_prepare mm/page_alloc.c:1338 [inline]
free_pcp_prepare+0x374/0x870 mm/page_alloc.c:1389
free_unref_page_prepare mm/page_alloc.c:3309 [inline]
free_unref_page+0x19/0x690 mm/page_alloc.c:3388
qlink_free mm/kasan/quarantine.c:146 [inline]
qlist_free_all+0x5a/0xc0 mm/kasan/quarantine.c:165
kasan_quarantine_reduce+0x180/0x200 mm/kasan/quarantine.c:272
__kasan_slab_alloc+0xa2/0xc0 mm/kasan/common.c:444
kasan_slab_alloc include/linux/kasan.h:259 [inline]
slab_post_alloc_hook mm/slab.h:519 [inline]
slab_alloc_node mm/slub.c:3234 [inline]
kmem_cache_alloc_node+0x255/0x3f0 mm/slub.c:3270
__alloc_skb+0x215/0x340 net/core/skbuff.c:414
alloc_skb include/linux/skbuff.h:1126 [inline]
alloc_skb_with_frags+0x93/0x620 net/core/skbuff.c:6078
sock_alloc_send_pskb+0x783/0x910 net/core/sock.c:2575
mld_newpack+0x1df/0x770 net/ipv6/mcast.c:1754
add_grhead+0x265/0x330 net/ipv6/mcast.c:1857
add_grec+0x1053/0x14e0 net/ipv6/mcast.c:1995
mld_send_initial_cr.part.0+0xf6/0x230 net/ipv6/mcast.c:2242
mld_send_initial_cr net/ipv6/mcast.c:1232 [inline]
mld_dad_work+0x1d3/0x690 net/ipv6/mcast.c:2268
process_one_work+0x9b2/0x1690 kernel/workqueue.c:2298
worker_thread+0x658/0x11f0 kernel/workqueue.c:2445
Memory state around the buggy address:
ffff88807f1cb600: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ffff88807f1cb680: fb fb fb fb fb fb fc fc fc fc fc fc fc fc fc fc
>ffff88807f1cb700: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
^
ffff88807f1cb780: fb fb fb fb fb fb fc fc fc fc fc fc fc fc fc fc
ffff88807f1cb800: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
Fixes: 41063e9dd119 ("ipv4: Early TCP socket demux.")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20211220143330.680945-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-12-20 22:33:30 +08:00
|
|
|
if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst))
|
2017-08-25 20:31:01 +08:00
|
|
|
udp6_sk_rx_dst_set(sk, dst);
|
2017-07-27 20:45:09 +08:00
|
|
|
|
2023-09-12 17:17:23 +08:00
|
|
|
if (!uh->check && !udp_get_no_check6_rx(sk)) {
|
2020-03-30 06:53:39 +08:00
|
|
|
if (refcounted)
|
|
|
|
sock_put(sk);
|
2018-09-13 22:27:21 +08:00
|
|
|
goto report_csum_error;
|
|
|
|
}
|
2017-07-27 20:45:09 +08:00
|
|
|
|
2018-09-13 22:27:21 +08:00
|
|
|
ret = udp6_unicast_rcv_skb(sk, skb, uh);
|
2020-03-30 06:53:39 +08:00
|
|
|
if (refcounted)
|
|
|
|
sock_put(sk);
|
2018-09-13 22:27:21 +08:00
|
|
|
return ret;
|
2017-07-27 20:45:09 +08:00
|
|
|
}
|
|
|
|
|
2007-02-09 22:24:49 +08:00
|
|
|
/*
|
|
|
|
* Multicast receive code
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2006-11-28 03:10:57 +08:00
|
|
|
if (ipv6_addr_is_multicast(daddr))
|
2008-06-17 08:12:11 +08:00
|
|
|
return __udp6_lib_mcast_deliver(net, skb,
|
2014-11-07 02:37:54 +08:00
|
|
|
saddr, daddr, udptable, proto);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Unicast */
|
2008-10-08 03:38:32 +08:00
|
|
|
sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
|
2015-03-29 21:00:05 +08:00
|
|
|
if (sk) {
|
2023-09-12 17:17:23 +08:00
|
|
|
if (!uh->check && !udp_get_no_check6_rx(sk))
|
2018-09-13 22:27:21 +08:00
|
|
|
goto report_csum_error;
|
|
|
|
return udp6_unicast_rcv_skb(sk, skb, uh);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
bpf, net: Support SO_REUSEPORT sockets with bpf_sk_assign
Currently the bpf_sk_assign helper in tc BPF context refuses SO_REUSEPORT
sockets. This means we can't use the helper to steer traffic to Envoy,
which configures SO_REUSEPORT on its sockets. In turn, we're blocked
from removing TPROXY from our setup.
The reason that bpf_sk_assign refuses such sockets is that the
bpf_sk_lookup helpers don't execute SK_REUSEPORT programs. Instead,
one of the reuseport sockets is selected by hash. This could cause
dispatch to the "wrong" socket:
sk = bpf_sk_lookup_tcp(...) // select SO_REUSEPORT by hash
bpf_sk_assign(skb, sk) // SK_REUSEPORT wasn't executed
Fixing this isn't as simple as invoking SK_REUSEPORT from the lookup
helpers unfortunately. In the tc context, L2 headers are at the start
of the skb, while SK_REUSEPORT expects L3 headers instead.
Instead, we execute the SK_REUSEPORT program when the assigned socket
is pulled out of the skb, further up the stack. This creates some
trickiness with regard to refcounting, as bpf_sk_assign will put both
refcounted and RCU-freed sockets in skb->sk. Reuseport sockets are RCU
freed. We can infer that the sk_assigned socket is RCU freed if the
reuseport lookup succeeds, but convincing yourself of this fact isn't
straightforward. Therefore we defensively check refcounting on the
sk_assign sock even though it's probably not required in practice.
Fixes: 8e368dc72e86 ("bpf: Fix use of sk->sk_reuseport from sk_assign")
Fixes: cf7fbe660f2d ("bpf: Add socket assign support")
Co-developed-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Joe Stringer <joe@cilium.io>
Link: https://lore.kernel.org/bpf/CACAyw98+qycmpQzKupquhkxbvWK4OFyDuuLMBNROnfWMZxUWeA@mail.gmail.com/
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/r/20230720-so-reuseport-v6-7-7021b683cdae@isovalent.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2023-07-20 23:30:11 +08:00
|
|
|
no_sk:
|
2022-02-12 01:15:07 +08:00
|
|
|
reason = SKB_DROP_REASON_NO_SOCKET;
|
|
|
|
|
2018-09-13 22:27:21 +08:00
|
|
|
if (!uh->check)
|
|
|
|
goto report_csum_error;
|
2014-05-03 07:29:58 +08:00
|
|
|
|
2012-04-27 16:23:59 +08:00
|
|
|
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
|
2010-04-28 06:13:20 +08:00
|
|
|
goto discard;
|
2023-03-21 23:58:44 +08:00
|
|
|
nf_reset_ct(skb);
|
2012-04-27 16:23:59 +08:00
|
|
|
|
|
|
|
if (udp_lib_checksum_complete(skb))
|
2013-04-29 16:39:56 +08:00
|
|
|
goto csum_error;
|
2012-04-27 16:23:59 +08:00
|
|
|
|
2016-04-28 07:44:30 +08:00
|
|
|
__UDP6_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
|
2012-04-27 16:23:59 +08:00
|
|
|
icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
|
|
|
|
|
2024-06-18 02:09:24 +08:00
|
|
|
sk_skb_reason_drop(sk, skb, reason);
|
2007-03-09 12:42:35 +08:00
|
|
|
return 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-02-09 22:24:49 +08:00
|
|
|
short_packet:
|
2022-02-12 01:15:07 +08:00
|
|
|
if (reason == SKB_DROP_REASON_NOT_SPECIFIED)
|
|
|
|
reason = SKB_DROP_REASON_PKT_TOO_SMALL;
|
2014-11-12 02:59:17 +08:00
|
|
|
net_dbg_ratelimited("UDP%sv6: short packet: From [%pI6c]:%u %d/%d to [%pI6c]:%u\n",
|
|
|
|
proto == IPPROTO_UDPLITE ? "-Lite" : "",
|
|
|
|
saddr, ntohs(uh->source),
|
|
|
|
ulen, skb->len,
|
|
|
|
daddr, ntohs(uh->dest));
|
2013-04-29 16:39:56 +08:00
|
|
|
goto discard;
|
2018-09-13 22:27:21 +08:00
|
|
|
|
|
|
|
report_csum_error:
|
|
|
|
udp6_csum_zero_error(skb);
|
2013-04-29 16:39:56 +08:00
|
|
|
csum_error:
|
2022-02-12 01:15:07 +08:00
|
|
|
if (reason == SKB_DROP_REASON_NOT_SPECIFIED)
|
|
|
|
reason = SKB_DROP_REASON_UDP_CSUM;
|
2016-04-28 07:44:30 +08:00
|
|
|
__UDP6_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
|
2005-04-17 06:20:36 +08:00
|
|
|
discard:
|
2016-04-28 07:44:30 +08:00
|
|
|
__UDP6_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
|
2024-06-18 02:09:24 +08:00
|
|
|
sk_skb_reason_drop(sk, skb, reason);
|
2007-03-09 12:42:35 +08:00
|
|
|
return 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2006-11-28 03:10:57 +08:00
|
|
|
|
2017-04-19 01:39:41 +08:00
|
|
|
|
2017-03-09 07:36:49 +08:00
|
|
|
static struct sock *__udp6_lib_demux_lookup(struct net *net,
|
|
|
|
__be16 loc_port, const struct in6_addr *loc_addr,
|
|
|
|
__be16 rmt_port, const struct in6_addr *rmt_addr,
|
2017-08-07 23:44:21 +08:00
|
|
|
int dif, int sdif)
|
2017-03-09 07:36:49 +08:00
|
|
|
{
|
2022-11-15 05:57:56 +08:00
|
|
|
struct udp_table *udptable = net->ipv4.udp_table;
|
2017-04-19 01:39:41 +08:00
|
|
|
unsigned short hnum = ntohs(loc_port);
|
2022-11-15 05:57:53 +08:00
|
|
|
unsigned int hash2, slot2;
|
|
|
|
struct udp_hslot *hslot2;
|
|
|
|
__portpair ports;
|
2017-03-09 07:36:49 +08:00
|
|
|
struct sock *sk;
|
|
|
|
|
2022-11-15 05:57:53 +08:00
|
|
|
hash2 = ipv6_portaddr_hash(net, loc_addr, hnum);
|
2022-11-15 05:57:56 +08:00
|
|
|
slot2 = hash2 & udptable->mask;
|
|
|
|
hslot2 = &udptable->hash2[slot2];
|
2022-11-15 05:57:53 +08:00
|
|
|
ports = INET_COMBINED_PORTS(rmt_port, hnum);
|
|
|
|
|
2017-04-19 01:39:41 +08:00
|
|
|
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
|
2017-06-24 06:25:37 +08:00
|
|
|
if (sk->sk_state == TCP_ESTABLISHED &&
|
2022-05-14 02:55:49 +08:00
|
|
|
inet6_match(net, sk, rmt_addr, loc_addr, ports, dif, sdif))
|
2017-04-19 01:39:41 +08:00
|
|
|
return sk;
|
|
|
|
/* Only check first socket in chain */
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
return NULL;
|
2017-03-09 07:36:49 +08:00
|
|
|
}
|
|
|
|
|
tcp/udp: Make early_demux back namespacified.
Commit e21145a9871a ("ipv4: namespacify ip_early_demux sysctl knob") made
it possible to enable/disable early_demux on a per-netns basis. Then, we
introduced two knobs, tcp_early_demux and udp_early_demux, to switch it for
TCP/UDP in commit dddb64bcb346 ("net: Add sysctl to toggle early demux for
tcp and udp"). However, the .proc_handler() was wrong and actually
disabled us from changing the behaviour in each netns.
We can execute early_demux if net.ipv4.ip_early_demux is on and each proto
.early_demux() handler is not NULL. When we toggle (tcp|udp)_early_demux,
the change itself is saved in each netns variable, but the .early_demux()
handler is a global variable, so the handler is switched based on the
init_net's sysctl variable. Thus, netns (tcp|udp)_early_demux knobs have
nothing to do with the logic. Whether we CAN execute proto .early_demux()
is always decided by init_net's sysctl knob, and whether we DO it or not is
by each netns ip_early_demux knob.
This patch namespacifies (tcp|udp)_early_demux again. For now, the users
of the .early_demux() handler are TCP and UDP only, and they are called
directly to avoid retpoline. So, we can remove the .early_demux() handler
from inet6?_protos and need not dereference them in ip6?_rcv_finish_core().
If another proto needs .early_demux(), we can restore it at that time.
Fixes: dddb64bcb346 ("net: Add sysctl to toggle early demux for tcp and udp")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://lore.kernel.org/r/20220713175207.7727-1-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-07-14 01:52:07 +08:00
|
|
|
void udp_v6_early_demux(struct sk_buff *skb)
|
2017-03-09 07:36:49 +08:00
|
|
|
{
|
|
|
|
struct net *net = dev_net(skb->dev);
|
|
|
|
const struct udphdr *uh;
|
|
|
|
struct sock *sk;
|
|
|
|
struct dst_entry *dst;
|
|
|
|
int dif = skb->dev->ifindex;
|
2017-08-07 23:44:21 +08:00
|
|
|
int sdif = inet6_sdif(skb);
|
2017-03-09 07:36:49 +08:00
|
|
|
|
|
|
|
if (!pskb_may_pull(skb, skb_transport_offset(skb) +
|
|
|
|
sizeof(struct udphdr)))
|
|
|
|
return;
|
|
|
|
|
|
|
|
uh = udp_hdr(skb);
|
|
|
|
|
|
|
|
if (skb->pkt_type == PACKET_HOST)
|
|
|
|
sk = __udp6_lib_demux_lookup(net, uh->dest,
|
|
|
|
&ipv6_hdr(skb)->daddr,
|
|
|
|
uh->source, &ipv6_hdr(skb)->saddr,
|
2017-08-07 23:44:21 +08:00
|
|
|
dif, sdif);
|
2017-03-09 07:36:49 +08:00
|
|
|
else
|
|
|
|
return;
|
|
|
|
|
2024-03-08 06:00:16 +08:00
|
|
|
if (!sk)
|
2017-03-09 07:36:49 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
skb->sk = sk;
|
2024-03-08 06:00:16 +08:00
|
|
|
DEBUG_NET_WARN_ON_ONCE(sk_is_refcounted(sk));
|
|
|
|
skb->destructor = sock_pfree;
|
inet: fully convert sk->sk_rx_dst to RCU rules
syzbot reported various issues around early demux,
one being included in this changelog [1]
sk->sk_rx_dst is using RCU protection without clearly
documenting it.
And following sequences in tcp_v4_do_rcv()/tcp_v6_do_rcv()
are not following standard RCU rules.
[a] dst_release(dst);
[b] sk->sk_rx_dst = NULL;
They look wrong because a delete operation of RCU protected
pointer is supposed to clear the pointer before
the call_rcu()/synchronize_rcu() guarding actual memory freeing.
In some cases indeed, dst could be freed before [b] is done.
We could cheat by clearing sk_rx_dst before calling
dst_release(), but this seems the right time to stick
to standard RCU annotations and debugging facilities.
[1]
BUG: KASAN: use-after-free in dst_check include/net/dst.h:470 [inline]
BUG: KASAN: use-after-free in tcp_v4_early_demux+0x95b/0x960 net/ipv4/tcp_ipv4.c:1792
Read of size 2 at addr ffff88807f1cb73a by task syz-executor.5/9204
CPU: 0 PID: 9204 Comm: syz-executor.5 Not tainted 5.16.0-rc5-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106
print_address_description.constprop.0.cold+0x8d/0x320 mm/kasan/report.c:247
__kasan_report mm/kasan/report.c:433 [inline]
kasan_report.cold+0x83/0xdf mm/kasan/report.c:450
dst_check include/net/dst.h:470 [inline]
tcp_v4_early_demux+0x95b/0x960 net/ipv4/tcp_ipv4.c:1792
ip_rcv_finish_core.constprop.0+0x15de/0x1e80 net/ipv4/ip_input.c:340
ip_list_rcv_finish.constprop.0+0x1b2/0x6e0 net/ipv4/ip_input.c:583
ip_sublist_rcv net/ipv4/ip_input.c:609 [inline]
ip_list_rcv+0x34e/0x490 net/ipv4/ip_input.c:644
__netif_receive_skb_list_ptype net/core/dev.c:5508 [inline]
__netif_receive_skb_list_core+0x549/0x8e0 net/core/dev.c:5556
__netif_receive_skb_list net/core/dev.c:5608 [inline]
netif_receive_skb_list_internal+0x75e/0xd80 net/core/dev.c:5699
gro_normal_list net/core/dev.c:5853 [inline]
gro_normal_list net/core/dev.c:5849 [inline]
napi_complete_done+0x1f1/0x880 net/core/dev.c:6590
virtqueue_napi_complete drivers/net/virtio_net.c:339 [inline]
virtnet_poll+0xca2/0x11b0 drivers/net/virtio_net.c:1557
__napi_poll+0xaf/0x440 net/core/dev.c:7023
napi_poll net/core/dev.c:7090 [inline]
net_rx_action+0x801/0xb40 net/core/dev.c:7177
__do_softirq+0x29b/0x9c2 kernel/softirq.c:558
invoke_softirq kernel/softirq.c:432 [inline]
__irq_exit_rcu+0x123/0x180 kernel/softirq.c:637
irq_exit_rcu+0x5/0x20 kernel/softirq.c:649
common_interrupt+0x52/0xc0 arch/x86/kernel/irq.c:240
asm_common_interrupt+0x1e/0x40 arch/x86/include/asm/idtentry.h:629
RIP: 0033:0x7f5e972bfd57
Code: 39 d1 73 14 0f 1f 80 00 00 00 00 48 8b 50 f8 48 83 e8 08 48 39 ca 77 f3 48 39 c3 73 3e 48 89 13 48 8b 50 f8 48 89 38 49 8b 0e <48> 8b 3e 48 83 c3 08 48 83 c6 08 eb bc 48 39 d1 72 9e 48 39 d0 73
RSP: 002b:00007fff8a413210 EFLAGS: 00000283
RAX: 00007f5e97108990 RBX: 00007f5e97108338 RCX: ffffffff81d3aa45
RDX: ffffffff81d3aa45 RSI: 00007f5e97108340 RDI: ffffffff81d3aa45
RBP: 00007f5e97107eb8 R08: 00007f5e97108d88 R09: 0000000093c2e8d9
R10: 0000000000000000 R11: 0000000000000000 R12: 00007f5e97107eb0
R13: 00007f5e97108338 R14: 00007f5e97107ea8 R15: 0000000000000019
</TASK>
Allocated by task 13:
kasan_save_stack+0x1e/0x50 mm/kasan/common.c:38
kasan_set_track mm/kasan/common.c:46 [inline]
set_alloc_info mm/kasan/common.c:434 [inline]
__kasan_slab_alloc+0x90/0xc0 mm/kasan/common.c:467
kasan_slab_alloc include/linux/kasan.h:259 [inline]
slab_post_alloc_hook mm/slab.h:519 [inline]
slab_alloc_node mm/slub.c:3234 [inline]
slab_alloc mm/slub.c:3242 [inline]
kmem_cache_alloc+0x202/0x3a0 mm/slub.c:3247
dst_alloc+0x146/0x1f0 net/core/dst.c:92
rt_dst_alloc+0x73/0x430 net/ipv4/route.c:1613
ip_route_input_slow+0x1817/0x3a20 net/ipv4/route.c:2340
ip_route_input_rcu net/ipv4/route.c:2470 [inline]
ip_route_input_noref+0x116/0x2a0 net/ipv4/route.c:2415
ip_rcv_finish_core.constprop.0+0x288/0x1e80 net/ipv4/ip_input.c:354
ip_list_rcv_finish.constprop.0+0x1b2/0x6e0 net/ipv4/ip_input.c:583
ip_sublist_rcv net/ipv4/ip_input.c:609 [inline]
ip_list_rcv+0x34e/0x490 net/ipv4/ip_input.c:644
__netif_receive_skb_list_ptype net/core/dev.c:5508 [inline]
__netif_receive_skb_list_core+0x549/0x8e0 net/core/dev.c:5556
__netif_receive_skb_list net/core/dev.c:5608 [inline]
netif_receive_skb_list_internal+0x75e/0xd80 net/core/dev.c:5699
gro_normal_list net/core/dev.c:5853 [inline]
gro_normal_list net/core/dev.c:5849 [inline]
napi_complete_done+0x1f1/0x880 net/core/dev.c:6590
virtqueue_napi_complete drivers/net/virtio_net.c:339 [inline]
virtnet_poll+0xca2/0x11b0 drivers/net/virtio_net.c:1557
__napi_poll+0xaf/0x440 net/core/dev.c:7023
napi_poll net/core/dev.c:7090 [inline]
net_rx_action+0x801/0xb40 net/core/dev.c:7177
__do_softirq+0x29b/0x9c2 kernel/softirq.c:558
Freed by task 13:
kasan_save_stack+0x1e/0x50 mm/kasan/common.c:38
kasan_set_track+0x21/0x30 mm/kasan/common.c:46
kasan_set_free_info+0x20/0x30 mm/kasan/generic.c:370
____kasan_slab_free mm/kasan/common.c:366 [inline]
____kasan_slab_free mm/kasan/common.c:328 [inline]
__kasan_slab_free+0xff/0x130 mm/kasan/common.c:374
kasan_slab_free include/linux/kasan.h:235 [inline]
slab_free_hook mm/slub.c:1723 [inline]
slab_free_freelist_hook+0x8b/0x1c0 mm/slub.c:1749
slab_free mm/slub.c:3513 [inline]
kmem_cache_free+0xbd/0x5d0 mm/slub.c:3530
dst_destroy+0x2d6/0x3f0 net/core/dst.c:127
rcu_do_batch kernel/rcu/tree.c:2506 [inline]
rcu_core+0x7ab/0x1470 kernel/rcu/tree.c:2741
__do_softirq+0x29b/0x9c2 kernel/softirq.c:558
Last potentially related work creation:
kasan_save_stack+0x1e/0x50 mm/kasan/common.c:38
__kasan_record_aux_stack+0xf5/0x120 mm/kasan/generic.c:348
__call_rcu kernel/rcu/tree.c:2985 [inline]
call_rcu+0xb1/0x740 kernel/rcu/tree.c:3065
dst_release net/core/dst.c:177 [inline]
dst_release+0x79/0xe0 net/core/dst.c:167
tcp_v4_do_rcv+0x612/0x8d0 net/ipv4/tcp_ipv4.c:1712
sk_backlog_rcv include/net/sock.h:1030 [inline]
__release_sock+0x134/0x3b0 net/core/sock.c:2768
release_sock+0x54/0x1b0 net/core/sock.c:3300
tcp_sendmsg+0x36/0x40 net/ipv4/tcp.c:1441
inet_sendmsg+0x99/0xe0 net/ipv4/af_inet.c:819
sock_sendmsg_nosec net/socket.c:704 [inline]
sock_sendmsg+0xcf/0x120 net/socket.c:724
sock_write_iter+0x289/0x3c0 net/socket.c:1057
call_write_iter include/linux/fs.h:2162 [inline]
new_sync_write+0x429/0x660 fs/read_write.c:503
vfs_write+0x7cd/0xae0 fs/read_write.c:590
ksys_write+0x1ee/0x250 fs/read_write.c:643
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
The buggy address belongs to the object at ffff88807f1cb700
which belongs to the cache ip_dst_cache of size 176
The buggy address is located 58 bytes inside of
176-byte region [ffff88807f1cb700, ffff88807f1cb7b0)
The buggy address belongs to the page:
page:ffffea0001fc72c0 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x7f1cb
flags: 0xfff00000000200(slab|node=0|zone=1|lastcpupid=0x7ff)
raw: 00fff00000000200 dead000000000100 dead000000000122 ffff8881413bb780
raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected
page_owner tracks the page as allocated
page last allocated via order 0, migratetype Unmovable, gfp_mask 0x112a20(GFP_ATOMIC|__GFP_NOWARN|__GFP_NORETRY|__GFP_HARDWALL), pid 5, ts 108466983062, free_ts 108048976062
prep_new_page mm/page_alloc.c:2418 [inline]
get_page_from_freelist+0xa72/0x2f50 mm/page_alloc.c:4149
__alloc_pages+0x1b2/0x500 mm/page_alloc.c:5369
alloc_pages+0x1a7/0x300 mm/mempolicy.c:2191
alloc_slab_page mm/slub.c:1793 [inline]
allocate_slab mm/slub.c:1930 [inline]
new_slab+0x32d/0x4a0 mm/slub.c:1993
___slab_alloc+0x918/0xfe0 mm/slub.c:3022
__slab_alloc.constprop.0+0x4d/0xa0 mm/slub.c:3109
slab_alloc_node mm/slub.c:3200 [inline]
slab_alloc mm/slub.c:3242 [inline]
kmem_cache_alloc+0x35c/0x3a0 mm/slub.c:3247
dst_alloc+0x146/0x1f0 net/core/dst.c:92
rt_dst_alloc+0x73/0x430 net/ipv4/route.c:1613
__mkroute_output net/ipv4/route.c:2564 [inline]
ip_route_output_key_hash_rcu+0x921/0x2d00 net/ipv4/route.c:2791
ip_route_output_key_hash+0x18b/0x300 net/ipv4/route.c:2619
__ip_route_output_key include/net/route.h:126 [inline]
ip_route_output_flow+0x23/0x150 net/ipv4/route.c:2850
ip_route_output_key include/net/route.h:142 [inline]
geneve_get_v4_rt+0x3a6/0x830 drivers/net/geneve.c:809
geneve_xmit_skb drivers/net/geneve.c:899 [inline]
geneve_xmit+0xc4a/0x3540 drivers/net/geneve.c:1082
__netdev_start_xmit include/linux/netdevice.h:4994 [inline]
netdev_start_xmit include/linux/netdevice.h:5008 [inline]
xmit_one net/core/dev.c:3590 [inline]
dev_hard_start_xmit+0x1eb/0x920 net/core/dev.c:3606
__dev_queue_xmit+0x299a/0x3650 net/core/dev.c:4229
page last free stack trace:
reset_page_owner include/linux/page_owner.h:24 [inline]
free_pages_prepare mm/page_alloc.c:1338 [inline]
free_pcp_prepare+0x374/0x870 mm/page_alloc.c:1389
free_unref_page_prepare mm/page_alloc.c:3309 [inline]
free_unref_page+0x19/0x690 mm/page_alloc.c:3388
qlink_free mm/kasan/quarantine.c:146 [inline]
qlist_free_all+0x5a/0xc0 mm/kasan/quarantine.c:165
kasan_quarantine_reduce+0x180/0x200 mm/kasan/quarantine.c:272
__kasan_slab_alloc+0xa2/0xc0 mm/kasan/common.c:444
kasan_slab_alloc include/linux/kasan.h:259 [inline]
slab_post_alloc_hook mm/slab.h:519 [inline]
slab_alloc_node mm/slub.c:3234 [inline]
kmem_cache_alloc_node+0x255/0x3f0 mm/slub.c:3270
__alloc_skb+0x215/0x340 net/core/skbuff.c:414
alloc_skb include/linux/skbuff.h:1126 [inline]
alloc_skb_with_frags+0x93/0x620 net/core/skbuff.c:6078
sock_alloc_send_pskb+0x783/0x910 net/core/sock.c:2575
mld_newpack+0x1df/0x770 net/ipv6/mcast.c:1754
add_grhead+0x265/0x330 net/ipv6/mcast.c:1857
add_grec+0x1053/0x14e0 net/ipv6/mcast.c:1995
mld_send_initial_cr.part.0+0xf6/0x230 net/ipv6/mcast.c:2242
mld_send_initial_cr net/ipv6/mcast.c:1232 [inline]
mld_dad_work+0x1d3/0x690 net/ipv6/mcast.c:2268
process_one_work+0x9b2/0x1690 kernel/workqueue.c:2298
worker_thread+0x658/0x11f0 kernel/workqueue.c:2445
Memory state around the buggy address:
ffff88807f1cb600: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ffff88807f1cb680: fb fb fb fb fb fb fc fc fc fc fc fc fc fc fc fc
>ffff88807f1cb700: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
^
ffff88807f1cb780: fb fb fb fb fb fb fc fc fc fc fc fc fc fc fc fc
ffff88807f1cb800: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
Fixes: 41063e9dd119 ("ipv4: Early TCP socket demux.")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20211220143330.680945-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-12-20 22:33:30 +08:00
|
|
|
dst = rcu_dereference(sk->sk_rx_dst);
|
2017-03-09 07:36:49 +08:00
|
|
|
|
|
|
|
if (dst)
|
2021-10-26 00:48:17 +08:00
|
|
|
dst = dst_check(dst, sk->sk_rx_dst_cookie);
|
2017-03-09 07:36:49 +08:00
|
|
|
if (dst) {
|
2017-06-18 01:42:25 +08:00
|
|
|
/* set noref for now.
|
|
|
|
* any place which wants to hold dst has to call
|
|
|
|
* dst_hold_safe()
|
|
|
|
*/
|
|
|
|
skb_dst_set_noref(skb, dst);
|
2017-03-09 07:36:49 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-05-03 23:01:37 +08:00
|
|
|
INDIRECT_CALLABLE_SCOPE int udpv6_rcv(struct sk_buff *skb)
|
2006-11-28 03:10:57 +08:00
|
|
|
{
|
2022-11-15 05:57:56 +08:00
|
|
|
return __udp6_lib_rcv(skb, dev_net(skb->dev)->ipv4.udp_table, IPPROTO_UDP);
|
2006-11-28 03:10:57 +08:00
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Throw away all pending data and cancel the corking. Socket is locked.
|
|
|
|
*/
|
|
|
|
static void udp_v6_flush_pending_frames(struct sock *sk)
|
|
|
|
{
|
|
|
|
struct udp_sock *up = udp_sk(sk);
|
|
|
|
|
2008-06-04 19:49:07 +08:00
|
|
|
if (up->pending == AF_INET)
|
|
|
|
udp_flush_pending_frames(sk);
|
|
|
|
else if (up->pending) {
|
2005-04-17 06:20:36 +08:00
|
|
|
up->len = 0;
|
2024-01-12 18:44:27 +08:00
|
|
|
WRITE_ONCE(up->pending, 0);
|
2005-04-17 06:20:36 +08:00
|
|
|
ip6_flush_pending_frames(sk);
|
2007-02-09 22:24:49 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2018-03-31 06:08:05 +08:00
|
|
|
static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
|
|
|
|
int addr_len)
|
|
|
|
{
|
2019-04-12 18:56:39 +08:00
|
|
|
if (addr_len < offsetofend(struct sockaddr, sa_family))
|
|
|
|
return -EINVAL;
|
2018-03-31 06:08:05 +08:00
|
|
|
/* The following checks are replicated from __ip6_datagram_connect()
|
|
|
|
* and intended to prevent BPF program called below from accessing
|
|
|
|
* bytes that are out of the bound specified by user in addr_len.
|
|
|
|
*/
|
|
|
|
if (uaddr->sa_family == AF_INET) {
|
2022-04-20 09:58:50 +08:00
|
|
|
if (ipv6_only_sock(sk))
|
2018-03-31 06:08:05 +08:00
|
|
|
return -EAFNOSUPPORT;
|
|
|
|
return udp_pre_connect(sk, uaddr, addr_len);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (addr_len < SIN6_LEN_RFC2133)
|
|
|
|
return -EINVAL;
|
|
|
|
|
2023-10-12 02:51:04 +08:00
|
|
|
return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, &addr_len);
|
2018-03-31 06:08:05 +08:00
|
|
|
}
|
|
|
|
|
2009-07-09 16:09:54 +08:00
|
|
|
/**
|
2014-08-25 04:53:10 +08:00
|
|
|
* udp6_hwcsum_outgoing - handle outgoing HW checksumming
|
|
|
|
* @sk: socket we are sending on
|
|
|
|
* @skb: sk_buff containing the filled-in UDP header
|
|
|
|
* (checksum field must be zeroed out)
|
2020-07-13 07:15:03 +08:00
|
|
|
* @saddr: source address
|
|
|
|
* @daddr: destination address
|
|
|
|
* @len: length of packet
|
2009-07-09 16:09:54 +08:00
|
|
|
*/
|
|
|
|
static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb,
|
|
|
|
const struct in6_addr *saddr,
|
|
|
|
const struct in6_addr *daddr, int len)
|
|
|
|
{
|
|
|
|
unsigned int offset;
|
|
|
|
struct udphdr *uh = udp_hdr(skb);
|
2015-01-31 23:40:16 +08:00
|
|
|
struct sk_buff *frags = skb_shinfo(skb)->frag_list;
|
2009-07-09 16:09:54 +08:00
|
|
|
__wsum csum = 0;
|
|
|
|
|
2015-01-31 23:40:16 +08:00
|
|
|
if (!frags) {
|
2009-07-09 16:09:54 +08:00
|
|
|
/* Only one fragment on the socket. */
|
|
|
|
skb->csum_start = skb_transport_header(skb) - skb->head;
|
|
|
|
skb->csum_offset = offsetof(struct udphdr, check);
|
|
|
|
uh->check = ~csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, 0);
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* HW-checksum won't work as there are two or more
|
|
|
|
* fragments on the socket so that all csums of sk_buffs
|
|
|
|
* should be together
|
|
|
|
*/
|
|
|
|
offset = skb_transport_offset(skb);
|
|
|
|
skb->csum = skb_checksum(skb, offset, skb->len - offset, 0);
|
2017-09-14 09:30:51 +08:00
|
|
|
csum = skb->csum;
|
2009-07-09 16:09:54 +08:00
|
|
|
|
|
|
|
skb->ip_summed = CHECKSUM_NONE;
|
|
|
|
|
2015-01-31 23:40:16 +08:00
|
|
|
do {
|
|
|
|
csum = csum_add(csum, frags->csum);
|
|
|
|
} while ((frags = frags->next));
|
2009-07-09 16:09:54 +08:00
|
|
|
|
|
|
|
uh->check = csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP,
|
|
|
|
csum);
|
|
|
|
if (uh->check == 0)
|
|
|
|
uh->check = CSUM_MANGLED_0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Sending
|
|
|
|
*/
|
|
|
|
|
udp: generate gso with UDP_SEGMENT
Support generic segmentation offload for udp datagrams. Callers can
concatenate and send at once the payload of multiple datagrams with
the same destination.
To set segment size, the caller sets socket option UDP_SEGMENT to the
length of each discrete payload. This value must be smaller than or
equal to the relevant MTU.
A follow-up patch adds cmsg UDP_SEGMENT to specify segment size on a
per send call basis.
Total byte length may then exceed MTU. If not an exact multiple of
segment size, the last segment will be shorter.
The implementation adds a gso_size field to the udp socket, ip(v6)
cmsg cookie and inet_cork structure to be able to set the value at
setsockopt or cmsg time and to work with both lockless and corked
paths.
Initial benchmark numbers show UDP GSO about as expensive as TCP GSO.
tcp tso
3197 MB/s 54232 msg/s 54232 calls/s
6,457,754,262 cycles
tcp gso
1765 MB/s 29939 msg/s 29939 calls/s
11,203,021,806 cycles
tcp without tso/gso *
739 MB/s 12548 msg/s 12548 calls/s
11,205,483,630 cycles
udp
876 MB/s 14873 msg/s 624666 calls/s
11,205,777,429 cycles
udp gso
2139 MB/s 36282 msg/s 36282 calls/s
11,204,374,561 cycles
[*] after reverting commit 0a6b2a1dc2a2
("tcp: switch to GSO being always on")
Measured total system cycles ('-a') for one core while pinning both
the network receive path and benchmark process to that core:
perf stat -a -C 12 -e cycles \
./udpgso_bench_tx -C 12 -4 -D "$DST" -l 4
Note the reduction in calls/s with GSO. Bytes per syscall
increases from 1470 to 61818.
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-27 01:42:17 +08:00
|
|
|
static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
|
|
|
|
struct inet_cork *cork)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2015-01-31 23:40:16 +08:00
|
|
|
struct sock *sk = skb->sk;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct udphdr *uh;
|
|
|
|
int err = 0;
|
2007-12-03 19:34:16 +08:00
|
|
|
int is_udplite = IS_UDPLITE(sk);
|
2006-11-15 13:35:48 +08:00
|
|
|
__wsum csum = 0;
|
2015-01-31 23:40:16 +08:00
|
|
|
int offset = skb_transport_offset(skb);
|
|
|
|
int len = skb->len - offset;
|
2019-10-03 01:29:23 +08:00
|
|
|
int datalen = len - sizeof(*uh);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Create a UDP header
|
|
|
|
*/
|
2007-03-14 01:28:48 +08:00
|
|
|
uh = udp_hdr(skb);
|
2011-03-13 05:36:19 +08:00
|
|
|
uh->source = fl6->fl6_sport;
|
|
|
|
uh->dest = fl6->fl6_dport;
|
2015-01-31 23:40:16 +08:00
|
|
|
uh->len = htons(len);
|
2005-04-17 06:20:36 +08:00
|
|
|
uh->check = 0;
|
|
|
|
|
udp: generate gso with UDP_SEGMENT
Support generic segmentation offload for udp datagrams. Callers can
concatenate and send at once the payload of multiple datagrams with
the same destination.
To set segment size, the caller sets socket option UDP_SEGMENT to the
length of each discrete payload. This value must be smaller than or
equal to the relevant MTU.
A follow-up patch adds cmsg UDP_SEGMENT to specify segment size on a
per send call basis.
Total byte length may then exceed MTU. If not an exact multiple of
segment size, the last segment will be shorter.
The implementation adds a gso_size field to the udp socket, ip(v6)
cmsg cookie and inet_cork structure to be able to set the value at
setsockopt or cmsg time and to work with both lockless and corked
paths.
Initial benchmark numbers show UDP GSO about as expensive as TCP GSO.
tcp tso
3197 MB/s 54232 msg/s 54232 calls/s
6,457,754,262 cycles
tcp gso
1765 MB/s 29939 msg/s 29939 calls/s
11,203,021,806 cycles
tcp without tso/gso *
739 MB/s 12548 msg/s 12548 calls/s
11,205,483,630 cycles
udp
876 MB/s 14873 msg/s 624666 calls/s
11,205,777,429 cycles
udp gso
2139 MB/s 36282 msg/s 36282 calls/s
11,204,374,561 cycles
[*] after reverting commit 0a6b2a1dc2a2
("tcp: switch to GSO being always on")
Measured total system cycles ('-a') for one core while pinning both
the network receive path and benchmark process to that core:
perf stat -a -C 12 -e cycles \
./udpgso_bench_tx -C 12 -4 -D "$DST" -l 4
Note the reduction in calls/s with GSO. Bytes per syscall drops
increases from 1470 to 61818.
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-27 01:42:17 +08:00
|
|
|
if (cork->gso_size) {
|
|
|
|
const int hlen = skb_network_header_len(skb) +
|
|
|
|
sizeof(struct udphdr);
|
|
|
|
|
2019-01-16 00:40:02 +08:00
|
|
|
if (hlen + cork->gso_size > cork->fragsize) {
|
|
|
|
kfree_skb(skb);
|
udp: generate gso with UDP_SEGMENT
Support generic segmentation offload for udp datagrams. Callers can
concatenate and send at once the payload of multiple datagrams with
the same destination.
To set segment size, the caller sets socket option UDP_SEGMENT to the
length of each discrete payload. This value must be smaller than or
equal to the relevant MTU.
A follow-up patch adds cmsg UDP_SEGMENT to specify segment size on a
per send call basis.
Total byte length may then exceed MTU. If not an exact multiple of
segment size, the last segment will be shorter.
The implementation adds a gso_size field to the udp socket, ip(v6)
cmsg cookie and inet_cork structure to be able to set the value at
setsockopt or cmsg time and to work with both lockless and corked
paths.
Initial benchmark numbers show UDP GSO about as expensive as TCP GSO.
tcp tso
3197 MB/s 54232 msg/s 54232 calls/s
6,457,754,262 cycles
tcp gso
1765 MB/s 29939 msg/s 29939 calls/s
11,203,021,806 cycles
tcp without tso/gso *
739 MB/s 12548 msg/s 12548 calls/s
11,205,483,630 cycles
udp
876 MB/s 14873 msg/s 624666 calls/s
11,205,777,429 cycles
udp gso
2139 MB/s 36282 msg/s 36282 calls/s
11,204,374,561 cycles
[*] after reverting commit 0a6b2a1dc2a2
("tcp: switch to GSO being always on")
Measured total system cycles ('-a') for one core while pinning both
the network receive path and benchmark process to that core:
perf stat -a -C 12 -e cycles \
./udpgso_bench_tx -C 12 -4 -D "$DST" -l 4
Note the reduction in calls/s with GSO. Bytes per syscall drops
increases from 1470 to 61818.
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-27 01:42:17 +08:00
|
|
|
return -EINVAL;
|
2019-01-16 00:40:02 +08:00
|
|
|
}
|
2021-12-24 06:24:40 +08:00
|
|
|
if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) {
|
2019-01-16 00:40:02 +08:00
|
|
|
kfree_skb(skb);
|
udp: generate gso with UDP_SEGMENT
Support generic segmentation offload for udp datagrams. Callers can
concatenate and send at once the payload of multiple datagrams with
the same destination.
To set segment size, the caller sets socket option UDP_SEGMENT to the
length of each discrete payload. This value must be smaller than or
equal to the relevant MTU.
A follow-up patch adds cmsg UDP_SEGMENT to specify segment size on a
per send call basis.
Total byte length may then exceed MTU. If not an exact multiple of
segment size, the last segment will be shorter.
The implementation adds a gso_size field to the udp socket, ip(v6)
cmsg cookie and inet_cork structure to be able to set the value at
setsockopt or cmsg time and to work with both lockless and corked
paths.
Initial benchmark numbers show UDP GSO about as expensive as TCP GSO.
tcp tso
3197 MB/s 54232 msg/s 54232 calls/s
6,457,754,262 cycles
tcp gso
1765 MB/s 29939 msg/s 29939 calls/s
11,203,021,806 cycles
tcp without tso/gso *
739 MB/s 12548 msg/s 12548 calls/s
11,205,483,630 cycles
udp
876 MB/s 14873 msg/s 624666 calls/s
11,205,777,429 cycles
udp gso
2139 MB/s 36282 msg/s 36282 calls/s
11,204,374,561 cycles
[*] after reverting commit 0a6b2a1dc2a2
("tcp: switch to GSO being always on")
Measured total system cycles ('-a') for one core while pinning both
the network receive path and benchmark process to that core:
perf stat -a -C 12 -e cycles \
./udpgso_bench_tx -C 12 -4 -D "$DST" -l 4
Note the reduction in calls/s with GSO. Bytes per syscall drops
increases from 1470 to 61818.
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-27 01:42:17 +08:00
|
|
|
return -EINVAL;
|
2019-01-16 00:40:02 +08:00
|
|
|
}
|
2023-09-12 17:17:22 +08:00
|
|
|
if (udp_get_no_check6_tx(sk)) {
|
2019-01-16 00:40:02 +08:00
|
|
|
kfree_skb(skb);
|
2018-05-01 03:58:36 +08:00
|
|
|
return -EINVAL;
|
2019-01-16 00:40:02 +08:00
|
|
|
}
|
udp: Allow GSO transmit from devices with no checksum offload
Today sending a UDP GSO packet from a TUN device results in an EIO error:
import fcntl, os, struct
from socket import *
TUNSETIFF = 0x400454CA
IFF_TUN = 0x0001
IFF_NO_PI = 0x1000
UDP_SEGMENT = 103
tun_fd = os.open("/dev/net/tun", os.O_RDWR)
ifr = struct.pack("16sH", b"tun0", IFF_TUN | IFF_NO_PI)
fcntl.ioctl(tun_fd, TUNSETIFF, ifr)
os.system("ip addr add 192.0.2.1/24 dev tun0")
os.system("ip link set dev tun0 up")
s = socket(AF_INET, SOCK_DGRAM)
s.setsockopt(SOL_UDP, UDP_SEGMENT, 1200)
s.sendto(b"x" * 3000, ("192.0.2.2", 9)) # EIO
This is due to a check in the udp stack if the egress device offers
checksum offload. While TUN/TAP devices, by default, don't advertise this
capability because it requires support from the TUN/TAP reader.
However, the GSO stack has a software fallback for checksum calculation,
which we can use. This way we don't force UDP_SEGMENT users to handle the
EIO error and implement a segmentation fallback.
Lift the restriction so that UDP_SEGMENT can be used with any egress
device. We also need to adjust the UDP GSO code to match the GSO stack
expectation about ip_summed field, as set in commit 8d63bee643f1 ("net:
avoid skb_warn_bad_offload false positives on UFO"). Otherwise we will hit
the bad offload check.
Users should, however, expect a potential performance impact when
batch-sending packets with UDP_SEGMENT without checksum offload on the
egress device. In such case the packet payload is read twice: first during
the sendmsg syscall when copying data from user memory, and then in the GSO
stack for checksum computation. This double memory read can be less
efficient than a regular sendmsg where the checksum is calculated during
the initial data copy from user memory.
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20240626-linux-udpgso-v2-1-422dfcbd6b48@cloudflare.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-06-27 01:51:26 +08:00
|
|
|
if (is_udplite || dst_xfrm(skb_dst(skb))) {
|
2019-01-16 00:40:02 +08:00
|
|
|
kfree_skb(skb);
|
udp: generate gso with UDP_SEGMENT
Support generic segmentation offload for udp datagrams. Callers can
concatenate and send at once the payload of multiple datagrams with
the same destination.
To set segment size, the caller sets socket option UDP_SEGMENT to the
length of each discrete payload. This value must be smaller than or
equal to the relevant MTU.
A follow-up patch adds cmsg UDP_SEGMENT to specify segment size on a
per send call basis.
Total byte length may then exceed MTU. If not an exact multiple of
segment size, the last segment will be shorter.
The implementation adds a gso_size field to the udp socket, ip(v6)
cmsg cookie and inet_cork structure to be able to set the value at
setsockopt or cmsg time and to work with both lockless and corked
paths.
Initial benchmark numbers show UDP GSO about as expensive as TCP GSO.
tcp tso
3197 MB/s 54232 msg/s 54232 calls/s
6,457,754,262 cycles
tcp gso
1765 MB/s 29939 msg/s 29939 calls/s
11,203,021,806 cycles
tcp without tso/gso *
739 MB/s 12548 msg/s 12548 calls/s
11,205,483,630 cycles
udp
876 MB/s 14873 msg/s 624666 calls/s
11,205,777,429 cycles
udp gso
2139 MB/s 36282 msg/s 36282 calls/s
11,204,374,561 cycles
[*] after reverting commit 0a6b2a1dc2a2
("tcp: switch to GSO being always on")
Measured total system cycles ('-a') for one core while pinning both
the network receive path and benchmark process to that core:
perf stat -a -C 12 -e cycles \
./udpgso_bench_tx -C 12 -4 -D "$DST" -l 4
Note the reduction in calls/s with GSO. Bytes per syscall drops
increases from 1470 to 61818.
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-27 01:42:17 +08:00
|
|
|
return -EIO;
|
2019-01-16 00:40:02 +08:00
|
|
|
}
|
udp: generate gso with UDP_SEGMENT
Support generic segmentation offload for udp datagrams. Callers can
concatenate and send at once the payload of multiple datagrams with
the same destination.
To set segment size, the caller sets socket option UDP_SEGMENT to the
length of each discrete payload. This value must be smaller than or
equal to the relevant MTU.
A follow-up patch adds cmsg UDP_SEGMENT to specify segment size on a
per send call basis.
Total byte length may then exceed MTU. If not an exact multiple of
segment size, the last segment will be shorter.
The implementation adds a gso_size field to the udp socket, ip(v6)
cmsg cookie and inet_cork structure to be able to set the value at
setsockopt or cmsg time and to work with both lockless and corked
paths.
Initial benchmark numbers show UDP GSO about as expensive as TCP GSO.
tcp tso
3197 MB/s 54232 msg/s 54232 calls/s
6,457,754,262 cycles
tcp gso
1765 MB/s 29939 msg/s 29939 calls/s
11,203,021,806 cycles
tcp without tso/gso *
739 MB/s 12548 msg/s 12548 calls/s
11,205,483,630 cycles
udp
876 MB/s 14873 msg/s 624666 calls/s
11,205,777,429 cycles
udp gso
2139 MB/s 36282 msg/s 36282 calls/s
11,204,374,561 cycles
[*] after reverting commit 0a6b2a1dc2a2
("tcp: switch to GSO being always on")
Measured total system cycles ('-a') for one core while pinning both
the network receive path and benchmark process to that core:
perf stat -a -C 12 -e cycles \
./udpgso_bench_tx -C 12 -4 -D "$DST" -l 4
Note the reduction in calls/s with GSO. Bytes per syscall drops
increases from 1470 to 61818.
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-27 01:42:17 +08:00
|
|
|
|
2019-10-03 01:29:23 +08:00
|
|
|
if (datalen > cork->gso_size) {
|
|
|
|
skb_shinfo(skb)->gso_size = cork->gso_size;
|
|
|
|
skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
|
|
|
|
skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen,
|
|
|
|
cork->gso_size);
|
2024-10-11 20:17:30 +08:00
|
|
|
|
|
|
|
/* Don't checksum the payload, skb will get segmented */
|
|
|
|
goto csum_partial;
|
2019-10-03 01:29:23 +08:00
|
|
|
}
|
udp: generate gso with UDP_SEGMENT
Support generic segmentation offload for udp datagrams. Callers can
concatenate and send at once the payload of multiple datagrams with
the same destination.
To set segment size, the caller sets socket option UDP_SEGMENT to the
length of each discrete payload. This value must be smaller than or
equal to the relevant MTU.
A follow-up patch adds cmsg UDP_SEGMENT to specify segment size on a
per send call basis.
Total byte length may then exceed MTU. If not an exact multiple of
segment size, the last segment will be shorter.
The implementation adds a gso_size field to the udp socket, ip(v6)
cmsg cookie and inet_cork structure to be able to set the value at
setsockopt or cmsg time and to work with both lockless and corked
paths.
Initial benchmark numbers show UDP GSO about as expensive as TCP GSO.
tcp tso
3197 MB/s 54232 msg/s 54232 calls/s
6,457,754,262 cycles
tcp gso
1765 MB/s 29939 msg/s 29939 calls/s
11,203,021,806 cycles
tcp without tso/gso *
739 MB/s 12548 msg/s 12548 calls/s
11,205,483,630 cycles
udp
876 MB/s 14873 msg/s 624666 calls/s
11,205,777,429 cycles
udp gso
2139 MB/s 36282 msg/s 36282 calls/s
11,204,374,561 cycles
[*] after reverting commit 0a6b2a1dc2a2
("tcp: switch to GSO being always on")
Measured total system cycles ('-a') for one core while pinning both
the network receive path and benchmark process to that core:
perf stat -a -C 12 -e cycles \
./udpgso_bench_tx -C 12 -4 -D "$DST" -l 4
Note the reduction in calls/s with GSO. Bytes per syscall drops
increases from 1470 to 61818.
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-27 01:42:17 +08:00
|
|
|
}
|
|
|
|
|
2007-12-03 19:34:16 +08:00
|
|
|
if (is_udplite)
|
2015-01-31 23:40:16 +08:00
|
|
|
csum = udplite_csum(skb);
|
2023-09-12 17:17:22 +08:00
|
|
|
else if (udp_get_no_check6_tx(sk)) { /* UDP csum disabled */
|
2014-05-03 07:29:58 +08:00
|
|
|
skb->ip_summed = CHECKSUM_NONE;
|
|
|
|
goto send;
|
|
|
|
} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
|
2018-05-01 03:58:36 +08:00
|
|
|
csum_partial:
|
2015-01-31 23:40:16 +08:00
|
|
|
udp6_hwcsum_outgoing(sk, skb, &fl6->saddr, &fl6->daddr, len);
|
2009-07-09 16:09:54 +08:00
|
|
|
goto send;
|
|
|
|
} else
|
2015-01-31 23:40:16 +08:00
|
|
|
csum = udp_csum(skb);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-11-28 03:10:57 +08:00
|
|
|
/* add protocol-dependent pseudo-header */
|
2011-03-13 05:22:43 +08:00
|
|
|
uh->check = csum_ipv6_magic(&fl6->saddr, &fl6->daddr,
|
2015-01-31 23:40:16 +08:00
|
|
|
len, fl6->flowi6_proto, csum);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (uh->check == 0)
|
2006-11-16 18:36:50 +08:00
|
|
|
uh->check = CSUM_MANGLED_0;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-07-09 16:09:54 +08:00
|
|
|
send:
|
2015-01-31 23:40:16 +08:00
|
|
|
err = ip6_send_skb(skb);
|
ip: Report qdisc packet drops
Christoph Lameter pointed out that packet drops at qdisc level where not
accounted in SNMP counters. Only if application sets IP_RECVERR, drops
are reported to user (-ENOBUFS errors) and SNMP counters updated.
IP_RECVERR is used to enable extended reliable error message passing,
but these are not needed to update system wide SNMP stats.
This patch changes things a bit to allow SNMP counters to be updated,
regardless of IP_RECVERR being set or not on the socket.
Example after an UDP tx flood
# netstat -s
...
IP:
1487048 outgoing packets dropped
...
Udp:
...
SndbufErrors: 1487048
send() syscalls, do however still return an OK status, to not
break applications.
Note : send() manual page explicitly says for -ENOBUFS error :
"The output queue for a network interface was full.
This generally indicates that the interface has stopped sending,
but may be caused by transient congestion.
(Normally, this does not occur in Linux. Packets are just silently
dropped when a device queue overflows.) "
This is not true for IP_RECVERR enabled sockets : a send() syscall
that hit a qdisc drop returns an ENOBUFS error.
Many thanks to Christoph, David, and last but not least, Alexey !
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-09-03 09:05:33 +08:00
|
|
|
if (err) {
|
2023-09-13 00:02:08 +08:00
|
|
|
if (err == -ENOBUFS && !inet6_test_bit(RECVERR6, sk)) {
|
net: snmp: kill various STATS_USER() helpers
In the old days (before linux-3.0), SNMP counters were duplicated,
one for user context, and one for BH context.
After commit 8f0ea0fe3a03 ("snmp: reduce percpu needs by 50%")
we have a single copy, and what really matters is preemption being
enabled or disabled, since we use this_cpu_inc() or __this_cpu_inc()
respectively.
We therefore kill SNMP_INC_STATS_USER(), SNMP_ADD_STATS_USER(),
NET_INC_STATS_USER(), NET_ADD_STATS_USER(), SCTP_INC_STATS_USER(),
SNMP_INC_STATS64_USER(), SNMP_ADD_STATS64_USER(), TCP_ADD_STATS_USER(),
UDP_INC_STATS_USER(), UDP6_INC_STATS_USER(), and XFRM_INC_STATS_USER()
Following patches will rename __BH helpers to make clear their
usage is not tied to BH being disabled.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-28 07:44:27 +08:00
|
|
|
UDP6_INC_STATS(sock_net(sk),
|
|
|
|
UDP_MIB_SNDBUFERRORS, is_udplite);
|
ip: Report qdisc packet drops
Christoph Lameter pointed out that packet drops at qdisc level where not
accounted in SNMP counters. Only if application sets IP_RECVERR, drops
are reported to user (-ENOBUFS errors) and SNMP counters updated.
IP_RECVERR is used to enable extended reliable error message passing,
but these are not needed to update system wide SNMP stats.
This patch changes things a bit to allow SNMP counters to be updated,
regardless of IP_RECVERR being set or not on the socket.
Example after an UDP tx flood
# netstat -s
...
IP:
1487048 outgoing packets dropped
...
Udp:
...
SndbufErrors: 1487048
send() syscalls, do however still return an OK status, to not
break applications.
Note : send() manual page explicitly says for -ENOBUFS error :
"The output queue for a network interface was full.
This generally indicates that the interface has stopped sending,
but may be caused by transient congestion.
(Normally, this does not occur in Linux. Packets are just silently
dropped when a device queue overflows.) "
This is not true for IP_RECVERR enabled sockets : a send() syscall
that hit a qdisc drop returns an ENOBUFS error.
Many thanks to Christoph, David, and last but not least, Alexey !
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-09-03 09:05:33 +08:00
|
|
|
err = 0;
|
|
|
|
}
|
net: snmp: kill various STATS_USER() helpers
In the old days (before linux-3.0), SNMP counters were duplicated,
one for user context, and one for BH context.
After commit 8f0ea0fe3a03 ("snmp: reduce percpu needs by 50%")
we have a single copy, and what really matters is preemption being
enabled or disabled, since we use this_cpu_inc() or __this_cpu_inc()
respectively.
We therefore kill SNMP_INC_STATS_USER(), SNMP_ADD_STATS_USER(),
NET_INC_STATS_USER(), NET_ADD_STATS_USER(), SCTP_INC_STATS_USER(),
SNMP_INC_STATS64_USER(), SNMP_ADD_STATS64_USER(), TCP_ADD_STATS_USER(),
UDP_INC_STATS_USER(), UDP6_INC_STATS_USER(), and XFRM_INC_STATS_USER()
Following patches will rename __BH helpers to make clear their
usage is not tied to BH being disabled.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-28 07:44:27 +08:00
|
|
|
} else {
|
|
|
|
UDP6_INC_STATS(sock_net(sk),
|
|
|
|
UDP_MIB_OUTDATAGRAMS, is_udplite);
|
|
|
|
}
|
2015-01-31 23:40:16 +08:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int udp_v6_push_pending_frames(struct sock *sk)
|
|
|
|
{
|
|
|
|
struct sk_buff *skb;
|
|
|
|
struct udp_sock *up = udp_sk(sk);
|
|
|
|
int err = 0;
|
|
|
|
|
|
|
|
if (up->pending == AF_INET)
|
|
|
|
return udp_push_pending_frames(sk);
|
|
|
|
|
|
|
|
skb = ip6_finish_skb(sk);
|
|
|
|
if (!skb)
|
|
|
|
goto out;
|
|
|
|
|
2022-01-27 08:36:26 +08:00
|
|
|
err = udp_v6_send_skb(skb, &inet_sk(sk)->cork.fl.u.ip6,
|
|
|
|
&inet_sk(sk)->cork.base);
|
2005-04-17 06:20:36 +08:00
|
|
|
out:
|
|
|
|
up->len = 0;
|
2024-01-12 18:44:27 +08:00
|
|
|
WRITE_ONCE(up->pending, 0);
|
2005-04-17 06:20:36 +08:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2015-03-02 15:37:48 +08:00
|
|
|
int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
struct ipv6_txoptions opt_space;
|
|
|
|
struct udp_sock *up = udp_sk(sk);
|
|
|
|
struct inet_sock *inet = inet_sk(sk);
|
|
|
|
struct ipv6_pinfo *np = inet6_sk(sk);
|
2014-01-18 05:53:15 +08:00
|
|
|
DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
|
2010-06-02 05:35:01 +08:00
|
|
|
struct in6_addr *daddr, *final_p, final;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct ipv6_txoptions *opt = NULL;
|
2015-11-30 11:37:57 +08:00
|
|
|
struct ipv6_txoptions *opt_to_free = NULL;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct ip6_flowlabel *flowlabel = NULL;
|
2022-01-27 08:36:29 +08:00
|
|
|
struct inet_cork_full cork;
|
|
|
|
struct flowi6 *fl6 = &cork.fl.u.ip6;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct dst_entry *dst;
|
2016-05-03 12:40:07 +08:00
|
|
|
struct ipcm6_cookie ipc6;
|
2005-04-17 06:20:36 +08:00
|
|
|
int addr_len = msg->msg_namelen;
|
2018-04-03 20:00:09 +08:00
|
|
|
bool connected = false;
|
2005-04-17 06:20:36 +08:00
|
|
|
int ulen = len;
|
2023-09-12 17:17:21 +08:00
|
|
|
int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE;
|
2005-04-17 06:20:36 +08:00
|
|
|
int err;
|
2007-12-03 19:34:16 +08:00
|
|
|
int is_udplite = IS_UDPLITE(sk);
|
2006-11-28 03:10:57 +08:00
|
|
|
int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2018-07-06 22:12:55 +08:00
|
|
|
ipcm6_init(&ipc6);
|
2021-07-01 00:42:44 +08:00
|
|
|
ipc6.gso_size = READ_ONCE(up->gso_size);
|
2023-08-31 21:52:11 +08:00
|
|
|
ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags);
|
2023-07-28 23:03:15 +08:00
|
|
|
ipc6.sockc.mark = READ_ONCE(sk->sk_mark);
|
2016-05-03 12:40:07 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* destination address check */
|
|
|
|
if (sin6) {
|
|
|
|
if (addr_len < offsetof(struct sockaddr, sa_data))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
switch (sin6->sin6_family) {
|
|
|
|
case AF_INET6:
|
|
|
|
if (addr_len < SIN6_LEN_RFC2133)
|
|
|
|
return -EINVAL;
|
|
|
|
daddr = &sin6->sin6_addr;
|
2017-02-13 06:26:07 +08:00
|
|
|
if (ipv6_addr_any(daddr) &&
|
|
|
|
ipv6_addr_v4mapped(&np->saddr))
|
|
|
|
ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK),
|
|
|
|
daddr);
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
|
|
|
case AF_INET:
|
|
|
|
goto do_udp_sendmsg;
|
|
|
|
case AF_UNSPEC:
|
|
|
|
msg->msg_name = sin6 = NULL;
|
|
|
|
msg->msg_namelen = addr_len = 0;
|
|
|
|
daddr = NULL;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2024-01-12 18:44:27 +08:00
|
|
|
} else if (!READ_ONCE(up->pending)) {
|
2005-04-17 06:20:36 +08:00
|
|
|
if (sk->sk_state != TCP_ESTABLISHED)
|
|
|
|
return -EDESTADDRREQ;
|
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-04 06:42:29 +08:00
|
|
|
daddr = &sk->sk_v6_daddr;
|
2007-02-09 22:24:49 +08:00
|
|
|
} else
|
2005-04-17 06:20:36 +08:00
|
|
|
daddr = NULL;
|
|
|
|
|
|
|
|
if (daddr) {
|
2007-08-25 14:16:08 +08:00
|
|
|
if (ipv6_addr_v4mapped(daddr)) {
|
2005-04-17 06:20:36 +08:00
|
|
|
struct sockaddr_in sin;
|
|
|
|
sin.sin_family = AF_INET;
|
2009-10-15 14:30:45 +08:00
|
|
|
sin.sin_port = sin6 ? sin6->sin6_port : inet->inet_dport;
|
2005-04-17 06:20:36 +08:00
|
|
|
sin.sin_addr.s_addr = daddr->s6_addr32[3];
|
|
|
|
msg->msg_name = &sin;
|
|
|
|
msg->msg_namelen = sizeof(sin);
|
|
|
|
do_udp_sendmsg:
|
2023-04-12 21:03:08 +08:00
|
|
|
err = ipv6_only_sock(sk) ?
|
|
|
|
-ENETUNREACH : udp_sendmsg(sk, msg, len);
|
|
|
|
msg->msg_name = sin6;
|
|
|
|
msg->msg_namelen = addr_len;
|
|
|
|
return err;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Rough check on arithmetic overflow,
|
[IPv6]: Fix incorrect length check in rawv6_sendmsg()
In article <20070329.142644.70222545.davem@davemloft.net> (at Thu, 29 Mar 2007 14:26:44 -0700 (PDT)), David Miller <davem@davemloft.net> says:
> From: Sridhar Samudrala <sri@us.ibm.com>
> Date: Thu, 29 Mar 2007 14:17:28 -0700
>
> > The check for length in rawv6_sendmsg() is incorrect.
> > As len is an unsigned int, (len < 0) will never be TRUE.
> > I think checking for IPV6_MAXPLEN(65535) is better.
> >
> > Is it possible to send ipv6 jumbo packets using raw
> > sockets? If so, we can remove this check.
>
> I don't see why such a limitation against jumbo would exist,
> does anyone else?
>
> Thanks for catching this Sridhar. A good compiler should simply
> fail to compile "if (x < 0)" when 'x' is an unsigned type, don't
> you think :-)
Dave, we use "int" for returning value,
so we should fix this anyway, IMHO;
we should not allow len > INT_MAX.
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Acked-by: Sridhar Samudrala <sri@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2007-03-31 05:45:35 +08:00
|
|
|
better check is made in ip6_append_data().
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
if (len > INT_MAX - sizeof(struct udphdr))
|
|
|
|
return -EMSGSIZE;
|
2007-02-09 22:24:49 +08:00
|
|
|
|
2015-01-31 23:40:17 +08:00
|
|
|
getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
|
2024-01-12 18:44:27 +08:00
|
|
|
if (READ_ONCE(up->pending)) {
|
|
|
|
if (READ_ONCE(up->pending) == AF_INET)
|
2022-01-27 08:36:23 +08:00
|
|
|
return udp_sendmsg(sk, msg, len);
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* There are pending frames.
|
|
|
|
* The socket lock must be held while it's corked.
|
|
|
|
*/
|
|
|
|
lock_sock(sk);
|
|
|
|
if (likely(up->pending)) {
|
|
|
|
if (unlikely(up->pending != AF_INET6)) {
|
|
|
|
release_sock(sk);
|
|
|
|
return -EAFNOSUPPORT;
|
|
|
|
}
|
|
|
|
dst = NULL;
|
|
|
|
goto do_append_data;
|
|
|
|
}
|
|
|
|
release_sock(sk);
|
|
|
|
}
|
|
|
|
ulen += sizeof(struct udphdr);
|
|
|
|
|
2022-01-27 08:36:29 +08:00
|
|
|
memset(fl6, 0, sizeof(*fl6));
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if (sin6) {
|
|
|
|
if (sin6->sin6_port == 0)
|
|
|
|
return -EINVAL;
|
|
|
|
|
2022-01-27 08:36:29 +08:00
|
|
|
fl6->fl6_dport = sin6->sin6_port;
|
2005-04-17 06:20:36 +08:00
|
|
|
daddr = &sin6->sin6_addr;
|
|
|
|
|
2023-09-13 00:02:12 +08:00
|
|
|
if (inet6_test_bit(SNDFLOW, sk)) {
|
2022-01-27 08:36:29 +08:00
|
|
|
fl6->flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
|
|
|
|
if (fl6->flowlabel & IPV6_FLOWLABEL_MASK) {
|
|
|
|
flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
|
2019-07-07 17:34:45 +08:00
|
|
|
if (IS_ERR(flowlabel))
|
2005-04-17 06:20:36 +08:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Otherwise it will be difficult to maintain
|
|
|
|
* sk->sk_dst_cache.
|
|
|
|
*/
|
|
|
|
if (sk->sk_state == TCP_ESTABLISHED &&
|
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now it is time to do the same for IPv6, because it permits us to have fast
lookups for all kinds of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-04 06:42:29 +08:00
|
|
|
ipv6_addr_equal(daddr, &sk->sk_v6_daddr))
|
|
|
|
daddr = &sk->sk_v6_daddr;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if (addr_len >= sizeof(struct sockaddr_in6) &&
|
|
|
|
sin6->sin6_scope_id &&
|
2013-03-08 10:07:19 +08:00
|
|
|
__ipv6_addr_needs_scope_id(__ipv6_addr_type(daddr)))
|
2022-01-27 08:36:29 +08:00
|
|
|
fl6->flowi6_oif = sin6->sin6_scope_id;
|
2005-04-17 06:20:36 +08:00
|
|
|
} else {
|
|
|
|
if (sk->sk_state != TCP_ESTABLISHED)
|
|
|
|
return -EDESTADDRREQ;
|
|
|
|
|
2022-01-27 08:36:29 +08:00
|
|
|
fl6->fl6_dport = inet->inet_dport;
|
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now it is time to do the same for IPv6, because it permits us to have fast
lookups for all kinds of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-04 06:42:29 +08:00
|
|
|
daddr = &sk->sk_v6_daddr;
|
2022-01-27 08:36:29 +08:00
|
|
|
fl6->flowlabel = np->flow_label;
|
2018-04-03 20:00:09 +08:00
|
|
|
connected = true;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2022-01-27 08:36:29 +08:00
|
|
|
if (!fl6->flowi6_oif)
|
2022-05-14 02:55:41 +08:00
|
|
|
fl6->flowi6_oif = READ_ONCE(sk->sk_bound_dev_if);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2022-01-27 08:36:29 +08:00
|
|
|
if (!fl6->flowi6_oif)
|
|
|
|
fl6->flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
|
2008-12-16 18:08:29 +08:00
|
|
|
|
2022-01-27 08:36:29 +08:00
|
|
|
fl6->flowi6_uid = sk->sk_uid;
|
2009-10-05 16:24:16 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
if (msg->msg_controllen) {
|
|
|
|
opt = &opt_space;
|
|
|
|
memset(opt, 0, sizeof(struct ipv6_txoptions));
|
|
|
|
opt->tot_len = sizeof(*opt);
|
2016-05-03 12:40:07 +08:00
|
|
|
ipc6.opt = opt;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2018-04-27 01:42:20 +08:00
|
|
|
err = udp_cmsg_send(sk, msg, &ipc6.gso_size);
|
2024-04-19 01:06:10 +08:00
|
|
|
if (err > 0) {
|
2022-01-27 08:36:29 +08:00
|
|
|
err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, fl6,
|
2018-07-06 22:12:57 +08:00
|
|
|
&ipc6);
|
2024-04-19 01:06:10 +08:00
|
|
|
connected = false;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
if (err < 0) {
|
|
|
|
fl6_sock_release(flowlabel);
|
|
|
|
return err;
|
|
|
|
}
|
2022-01-27 08:36:29 +08:00
|
|
|
if ((fl6->flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
|
|
|
|
flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
|
2019-07-07 17:34:45 +08:00
|
|
|
if (IS_ERR(flowlabel))
|
2005-04-17 06:20:36 +08:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
if (!(opt->opt_nflen|opt->opt_flen))
|
|
|
|
opt = NULL;
|
|
|
|
}
|
2015-11-30 11:37:57 +08:00
|
|
|
if (!opt) {
|
|
|
|
opt = txopt_get(np);
|
|
|
|
opt_to_free = opt;
|
|
|
|
}
|
2005-11-20 11:23:18 +08:00
|
|
|
if (flowlabel)
|
|
|
|
opt = fl6_merge_options(&opt_space, flowlabel, opt);
|
|
|
|
opt = ipv6_fixup_options(&opt_space, opt);
|
2016-05-03 12:40:07 +08:00
|
|
|
ipc6.opt = opt;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2022-01-27 08:36:29 +08:00
|
|
|
fl6->flowi6_proto = sk->sk_protocol;
|
|
|
|
fl6->flowi6_mark = ipc6.sockc.mark;
|
|
|
|
fl6->daddr = *daddr;
|
|
|
|
if (ipv6_addr_any(&fl6->saddr) && !ipv6_addr_any(&np->saddr))
|
|
|
|
fl6->saddr = np->saddr;
|
|
|
|
fl6->fl6_sport = inet->inet_sport;
|
2007-02-09 22:24:49 +08:00
|
|
|
|
2021-08-19 17:24:20 +08:00
|
|
|
if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) {
|
2018-05-25 23:55:23 +08:00
|
|
|
err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
|
2022-01-27 08:36:29 +08:00
|
|
|
(struct sockaddr *)sin6,
|
2023-10-12 02:51:04 +08:00
|
|
|
&addr_len,
|
2022-01-27 08:36:29 +08:00
|
|
|
&fl6->saddr);
|
2018-05-25 23:55:23 +08:00
|
|
|
if (err)
|
|
|
|
goto out_no_dst;
|
|
|
|
if (sin6) {
|
|
|
|
if (ipv6_addr_v4mapped(&sin6->sin6_addr)) {
|
|
|
|
/* BPF program rewrote IPv6-only by IPv4-mapped
|
|
|
|
* IPv6. It's currently unsupported.
|
|
|
|
*/
|
|
|
|
err = -ENOTSUPP;
|
|
|
|
goto out_no_dst;
|
|
|
|
}
|
|
|
|
if (sin6->sin6_port == 0) {
|
|
|
|
/* BPF program set invalid port. Reject it. */
|
|
|
|
err = -EINVAL;
|
|
|
|
goto out_no_dst;
|
|
|
|
}
|
2022-01-27 08:36:29 +08:00
|
|
|
fl6->fl6_dport = sin6->sin6_port;
|
|
|
|
fl6->daddr = sin6->sin6_addr;
|
2018-05-25 23:55:23 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-01-27 08:36:29 +08:00
|
|
|
if (ipv6_addr_any(&fl6->daddr))
|
|
|
|
fl6->daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */
|
2019-01-04 17:07:07 +08:00
|
|
|
|
2022-01-27 08:36:29 +08:00
|
|
|
final_p = fl6_update_dst(fl6, opt, &final);
|
2010-06-02 05:35:01 +08:00
|
|
|
if (final_p)
|
2018-04-03 20:00:09 +08:00
|
|
|
connected = false;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2022-01-27 08:36:29 +08:00
|
|
|
if (!fl6->flowi6_oif && ipv6_addr_is_multicast(&fl6->daddr)) {
|
2023-12-08 18:12:43 +08:00
|
|
|
fl6->flowi6_oif = READ_ONCE(np->mcast_oif);
|
2018-04-03 20:00:09 +08:00
|
|
|
connected = false;
|
2022-01-27 08:36:29 +08:00
|
|
|
} else if (!fl6->flowi6_oif)
|
2023-12-08 18:12:44 +08:00
|
|
|
fl6->flowi6_oif = READ_ONCE(np->ucast_oif);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2022-01-27 08:36:29 +08:00
|
|
|
security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
|
2006-08-05 14:12:42 +08:00
|
|
|
|
2016-06-12 02:08:19 +08:00
|
|
|
if (ipc6.tclass < 0)
|
|
|
|
ipc6.tclass = np->tclass;
|
|
|
|
|
2022-01-27 08:36:29 +08:00
|
|
|
fl6->flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6->flowlabel);
|
2016-06-12 02:08:19 +08:00
|
|
|
|
2022-01-27 08:36:29 +08:00
|
|
|
dst = ip6_sk_dst_lookup_flow(sk, fl6, final_p, connected);
|
2011-03-02 05:19:07 +08:00
|
|
|
if (IS_ERR(dst)) {
|
|
|
|
err = PTR_ERR(dst);
|
|
|
|
dst = NULL;
|
2005-04-17 06:20:36 +08:00
|
|
|
goto out;
|
2007-05-25 09:17:54 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2016-05-03 12:40:07 +08:00
|
|
|
if (ipc6.hlimit < 0)
|
2022-01-27 08:36:29 +08:00
|
|
|
ipc6.hlimit = ip6_sk_dst_hoplimit(np, fl6, dst);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if (msg->msg_flags&MSG_CONFIRM)
|
|
|
|
goto do_confirm;
|
|
|
|
back_from_confirm:
|
|
|
|
|
2015-01-31 23:40:17 +08:00
|
|
|
/* Lockless fast path for the non-corking case */
|
|
|
|
if (!corkreq) {
|
|
|
|
struct sk_buff *skb;
|
|
|
|
|
|
|
|
skb = ip6_make_skb(sk, getfrag, msg, ulen,
|
2016-05-03 12:40:07 +08:00
|
|
|
sizeof(struct udphdr), &ipc6,
|
2024-04-26 23:19:52 +08:00
|
|
|
dst_rt6_info(dst),
|
2018-07-06 22:12:57 +08:00
|
|
|
msg->msg_flags, &cork);
|
2015-01-31 23:40:17 +08:00
|
|
|
err = PTR_ERR(skb);
|
|
|
|
if (!IS_ERR_OR_NULL(skb))
|
2022-01-27 08:36:29 +08:00
|
|
|
err = udp_v6_send_skb(skb, fl6, &cork.base);
|
2022-01-27 08:36:30 +08:00
|
|
|
/* ip6_make_skb steals dst reference */
|
|
|
|
goto out_no_dst;
|
2015-01-31 23:40:17 +08:00
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
lock_sock(sk);
|
|
|
|
if (unlikely(up->pending)) {
|
|
|
|
/* The socket is already corked while preparing it. */
|
|
|
|
/* ... which is an evident application bug. --ANK */
|
|
|
|
release_sock(sk);
|
|
|
|
|
2014-11-12 02:59:17 +08:00
|
|
|
net_dbg_ratelimited("udp cork app bug 2\n");
|
2005-04-17 06:20:36 +08:00
|
|
|
err = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2024-01-12 18:44:27 +08:00
|
|
|
WRITE_ONCE(up->pending, AF_INET6);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
do_append_data:
|
2016-05-03 12:40:07 +08:00
|
|
|
if (ipc6.dontfrag < 0)
|
2023-09-13 00:02:07 +08:00
|
|
|
ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk);
|
2005-04-17 06:20:36 +08:00
|
|
|
up->len += ulen;
|
2016-05-03 12:40:07 +08:00
|
|
|
err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr),
|
2024-04-26 23:19:52 +08:00
|
|
|
&ipc6, fl6, dst_rt6_info(dst),
|
2018-07-06 22:12:57 +08:00
|
|
|
corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (err)
|
|
|
|
udp_v6_flush_pending_frames(sk);
|
|
|
|
else if (!corkreq)
|
2006-11-28 01:29:59 +08:00
|
|
|
err = udp_v6_push_pending_frames(sk);
|
2006-10-04 05:35:49 +08:00
|
|
|
else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
|
2024-01-12 18:44:27 +08:00
|
|
|
WRITE_ONCE(up->pending, 0);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2015-01-31 23:40:17 +08:00
|
|
|
if (err > 0)
|
2023-09-13 00:02:08 +08:00
|
|
|
err = inet6_test_bit(RECVERR6, sk) ? net_xmit_errno(err) : 0;
|
2015-01-31 23:40:17 +08:00
|
|
|
release_sock(sk);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
out:
|
2008-06-04 00:30:25 +08:00
|
|
|
dst_release(dst);
|
2018-05-25 23:55:23 +08:00
|
|
|
out_no_dst:
|
2005-04-17 06:20:36 +08:00
|
|
|
fl6_sock_release(flowlabel);
|
2015-11-30 11:37:57 +08:00
|
|
|
txopt_put(opt_to_free);
|
2007-09-15 08:15:01 +08:00
|
|
|
if (!err)
|
2005-04-17 06:20:36 +08:00
|
|
|
return len;
|
2006-08-15 15:00:09 +08:00
|
|
|
/*
|
|
|
|
* ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space. Reporting
|
|
|
|
* ENOBUFS might not be good (it's not tunable per se), but otherwise
|
|
|
|
* we don't have a good statistic (IpOutDiscards but it can be too many
|
|
|
|
* things). We could add another new stat but at least for now that
|
|
|
|
* seems like overkill.
|
|
|
|
*/
|
|
|
|
if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
|
net: snmp: kill various STATS_USER() helpers
In the old days (before linux-3.0), SNMP counters were duplicated,
one for user context, and one for BH context.
After commit 8f0ea0fe3a03 ("snmp: reduce percpu needs by 50%")
we have a single copy, and what really matters is preemption being
enabled or disabled, since we use this_cpu_inc() or __this_cpu_inc()
respectively.
We therefore kill SNMP_INC_STATS_USER(), SNMP_ADD_STATS_USER(),
NET_INC_STATS_USER(), NET_ADD_STATS_USER(), SCTP_INC_STATS_USER(),
SNMP_INC_STATS64_USER(), SNMP_ADD_STATS64_USER(), TCP_ADD_STATS_USER(),
UDP_INC_STATS_USER(), UDP6_INC_STATS_USER(), and XFRM_INC_STATS_USER()
Following patches will rename __BH helpers to make clear their
usage is not tied to BH being disabled.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-28 07:44:27 +08:00
|
|
|
UDP6_INC_STATS(sock_net(sk),
|
|
|
|
UDP_MIB_SNDBUFERRORS, is_udplite);
|
2006-08-15 15:00:09 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
return err;
|
|
|
|
|
|
|
|
do_confirm:
|
2017-02-07 05:14:16 +08:00
|
|
|
if (msg->msg_flags & MSG_PROBE)
|
2022-01-27 08:36:29 +08:00
|
|
|
dst_confirm_neigh(dst, &fl6->daddr);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!(msg->msg_flags&MSG_PROBE) || len)
|
|
|
|
goto back_from_confirm;
|
|
|
|
err = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
2022-03-22 19:07:20 +08:00
|
|
|
EXPORT_SYMBOL(udpv6_sendmsg);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2023-06-08 02:19:13 +08:00
|
|
|
static void udpv6_splice_eof(struct socket *sock)
|
|
|
|
{
|
|
|
|
struct sock *sk = sock->sk;
|
|
|
|
struct udp_sock *up = udp_sk(sk);
|
|
|
|
|
2024-01-12 18:44:27 +08:00
|
|
|
if (!READ_ONCE(up->pending) || udp_test_bit(CORK, sk))
|
2023-06-08 02:19:13 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
lock_sock(sk);
|
2023-09-12 17:17:21 +08:00
|
|
|
if (up->pending && !udp_test_bit(CORK, sk))
|
2023-06-08 02:19:13 +08:00
|
|
|
udp_v6_push_pending_frames(sk);
|
|
|
|
release_sock(sk);
|
|
|
|
}
|
|
|
|
|
2008-06-15 08:04:49 +08:00
|
|
|
void udpv6_destroy_sock(struct sock *sk)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2013-03-19 14:11:12 +08:00
|
|
|
struct udp_sock *up = udp_sk(sk);
|
2005-04-17 06:20:36 +08:00
|
|
|
lock_sock(sk);
|
2021-06-09 17:49:01 +08:00
|
|
|
|
|
|
|
/* protects from races with udp_abort() */
|
|
|
|
sock_set_flag(sk, SOCK_DEAD);
|
2005-04-17 06:20:36 +08:00
|
|
|
udp_v6_flush_pending_frames(sk);
|
|
|
|
release_sock(sk);
|
|
|
|
|
2018-11-07 19:38:28 +08:00
|
|
|
if (static_branch_unlikely(&udpv6_encap_needed_key)) {
|
|
|
|
if (up->encap_type) {
|
|
|
|
void (*encap_destroy)(struct sock *sk);
|
|
|
|
encap_destroy = READ_ONCE(up->encap_destroy);
|
|
|
|
if (encap_destroy)
|
|
|
|
encap_destroy(sk);
|
|
|
|
}
|
2023-09-12 17:17:27 +08:00
|
|
|
if (udp_test_bit(ENCAP_ENABLED, sk)) {
|
2018-11-15 09:34:50 +08:00
|
|
|
static_branch_dec(&udpv6_encap_needed_key);
|
2021-02-03 16:54:22 +08:00
|
|
|
udp_encap_disable();
|
|
|
|
}
|
2013-03-19 14:11:12 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Socket option code for UDP
|
|
|
|
*/
|
2020-07-23 14:09:07 +08:00
|
|
|
/*
 * setsockopt() entry point for UDPv6 sockets.
 *
 * Options at level SOL_UDP, SOL_UDPLITE or SOL_SOCKET are handed to
 * udp_lib_setsockopt() together with the IPv6 push-pending-frames helper;
 * every other level is forwarded to ipv6_setsockopt().  The return value of
 * the selected handler is returned unchanged.
 */
int udpv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
		     unsigned int optlen)
{
	switch (level) {
	case SOL_UDP:
	case SOL_UDPLITE:
	case SOL_SOCKET:
		return udp_lib_setsockopt(sk, level, optname, optval, optlen,
					  udp_v6_push_pending_frames);
	default:
		return ipv6_setsockopt(sk, level, optname, optval, optlen);
	}
}
|
|
|
|
|
2006-11-28 03:10:57 +08:00
|
|
|
/*
 * getsockopt() entry point for UDPv6 sockets.
 *
 * Levels SOL_UDP and SOL_UDPLITE are served by udp_lib_getsockopt(); any
 * other level is forwarded to ipv6_getsockopt().  The return value of the
 * selected handler is returned unchanged.
 */
int udpv6_getsockopt(struct sock *sk, int level, int optname,
		     char __user *optval, int __user *optlen)
{
	if (level != SOL_UDP && level != SOL_UDPLITE)
		return ipv6_getsockopt(sk, level, optname, optval, optlen);

	return udp_lib_getsockopt(sk, level, optname, optval, optlen);
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* ------------------------------------------------------------------------ */
|
|
|
|
#ifdef CONFIG_PROC_FS
|
2006-11-28 03:10:57 +08:00
|
|
|
int udp6_seq_show(struct seq_file *seq, void *v)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2013-05-31 23:05:48 +08:00
|
|
|
if (v == SEQ_START_TOKEN) {
|
|
|
|
seq_puts(seq, IPV6_SEQ_DGRAM_HEADER);
|
|
|
|
} else {
|
|
|
|
int bucket = ((struct udp_iter_state *)seq->private)->bucket;
|
2023-03-16 23:31:55 +08:00
|
|
|
const struct inet_sock *inet = inet_sk((const struct sock *)v);
|
2013-05-31 23:05:48 +08:00
|
|
|
__u16 srcp = ntohs(inet->inet_sport);
|
|
|
|
__u16 destp = ntohs(inet->inet_dport);
|
2018-06-08 17:35:40 +08:00
|
|
|
__ip6_dgram_sock_seq_show(seq, v, srcp, destp,
|
|
|
|
udp_rqueue_get(v), bucket);
|
2013-05-31 23:05:48 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-04-11 01:42:55 +08:00
|
|
|
const struct seq_operations udp6_seq_ops = {
|
2018-04-11 03:31:50 +08:00
|
|
|
.start = udp_seq_start,
|
|
|
|
.next = udp_seq_next,
|
|
|
|
.stop = udp_seq_stop,
|
|
|
|
.show = udp6_seq_show,
|
|
|
|
};
|
2018-04-11 01:42:55 +08:00
|
|
|
EXPORT_SYMBOL(udp6_seq_ops);
|
2011-10-30 14:46:30 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
static struct udp_seq_afinfo udp6_seq_afinfo = {
|
|
|
|
.family = AF_INET6,
|
2022-11-15 05:57:55 +08:00
|
|
|
.udp_table = NULL,
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
2010-01-17 11:35:32 +08:00
|
|
|
/*
 * Create the read-only (0444) "udp6" entry under @net's proc_net directory.
 *
 * Returns 0 on success, -ENOMEM when the entry could not be created.
 */
int __net_init udp6_proc_init(struct net *net)
{
	return proc_create_net_data("udp6", 0444, net->proc_net,
				    &udp6_seq_ops,
				    sizeof(struct udp_iter_state),
				    &udp6_seq_afinfo) ? 0 : -ENOMEM;
}
|
|
|
|
|
2015-08-15 05:43:38 +08:00
|
|
|
void udp6_proc_exit(struct net *net)
|
|
|
|
{
|
2018-04-11 03:31:50 +08:00
|
|
|
remove_proc_entry("udp6", net->proc_net);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
#endif /* CONFIG_PROC_FS */
|
|
|
|
|
|
|
|
/* ------------------------------------------------------------------------ */
|
|
|
|
|
|
|
|
struct proto udpv6_prot = {
|
2018-03-14 12:57:16 +08:00
|
|
|
.name = "UDPv6",
|
|
|
|
.owner = THIS_MODULE,
|
|
|
|
.close = udp_lib_close,
|
2018-03-31 06:08:05 +08:00
|
|
|
.pre_connect = udpv6_pre_connect,
|
2018-03-14 12:57:16 +08:00
|
|
|
.connect = ip6_datagram_connect,
|
|
|
|
.disconnect = udp_disconnect,
|
|
|
|
.ioctl = udp_ioctl,
|
tcp/udp: Call inet6_destroy_sock() in IPv6 sk->sk_destruct().
Originally, inet6_sk(sk)->XXX were changed under lock_sock(), so we were
able to clean them up by calling inet6_destroy_sock() during the IPv6 ->
IPv4 conversion by IPV6_ADDRFORM. However, commit 03485f2adcde ("udpv6:
Add lockless sendmsg() support") added a lockless memory allocation path,
which could cause a memory leak:
setsockopt(IPV6_ADDRFORM) sendmsg()
+-----------------------+ +-------+
- do_ipv6_setsockopt(sk, ...) - udpv6_sendmsg(sk, ...)
- sockopt_lock_sock(sk) ^._ called via udpv6_prot
- lock_sock(sk) before WRITE_ONCE()
- WRITE_ONCE(sk->sk_prot, &tcp_prot)
- inet6_destroy_sock() - if (!corkreq)
- sockopt_release_sock(sk) - ip6_make_skb(sk, ...)
- release_sock(sk) ^._ lockless fast path for
the non-corking case
- __ip6_append_data(sk, ...)
- ipv6_local_rxpmtu(sk, ...)
- xchg(&np->rxpmtu, skb)
^._ rxpmtu is never freed.
- goto out_no_dst;
- lock_sock(sk)
For now, rxpmtu is only the case, but not to miss the future change
and a similar bug fixed in commit e27326009a3d ("net: ping6: Fix
memleak in ipv6_renew_options()."), let's set a new function to IPv6
sk->sk_destruct() and call inet6_cleanup_sock() there. Since the
conversion does not change sk->sk_destruct(), we can guarantee that
we can clean up IPv6 resources finally.
We can now remove all inet6_destroy_sock() calls from IPv6 protocol
specific ->destroy() functions, but such changes are invasive to
backport. So they can be posted as a follow-up later for net-next.
Fixes: 03485f2adcde ("udpv6: Add lockless sendmsg() support")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-10-07 02:53:47 +08:00
|
|
|
.init = udpv6_init_sock,
|
2018-03-14 12:57:16 +08:00
|
|
|
.destroy = udpv6_destroy_sock,
|
|
|
|
.setsockopt = udpv6_setsockopt,
|
|
|
|
.getsockopt = udpv6_getsockopt,
|
|
|
|
.sendmsg = udpv6_sendmsg,
|
|
|
|
.recvmsg = udpv6_recvmsg,
|
2023-06-08 02:19:13 +08:00
|
|
|
.splice_eof = udpv6_splice_eof,
|
2018-03-14 12:57:16 +08:00
|
|
|
.release_cb = ip6_datagram_release_cb,
|
|
|
|
.hash = udp_lib_hash,
|
|
|
|
.unhash = udp_lib_unhash,
|
|
|
|
.rehash = udp_v6_rehash,
|
|
|
|
.get_port = udp_v6_get_port,
|
net: bpf: Handle return value of BPF_CGROUP_RUN_PROG_INET{4,6}_POST_BIND()
The return value of BPF_CGROUP_RUN_PROG_INET{4,6}_POST_BIND() in
__inet_bind() is not handled properly. While the return value
is non-zero, it will set inet_saddr and inet_rcv_saddr to 0 and
exit:
err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk);
if (err) {
inet->inet_saddr = inet->inet_rcv_saddr = 0;
goto out_release_sock;
}
Let's take UDP for example and see what will happen. For UDP
socket, it will be added to 'udp_prot.h.udp_table->hash' and
'udp_prot.h.udp_table->hash2' after the sk->sk_prot->get_port()
called success. If 'inet->inet_rcv_saddr' is specified here,
then 'sk' will be in the 'hslot2' of 'hash2' that it don't belong
to (because inet_saddr is changed to 0), and UDP packet received
will not be passed to this sock. If 'inet->inet_rcv_saddr' is not
specified here, the sock will work fine, as it can receive packet
properly, which is wired, as the 'bind()' is already failed.
To undo the get_port() operation, introduce the 'put_port' field
for 'struct proto'. For TCP proto, it is inet_put_port(); For UDP
proto, it is udp_lib_unhash(); For icmp proto, it is
ping_unhash().
Therefore, after sys_bind() fail caused by
BPF_CGROUP_RUN_PROG_INET4_POST_BIND(), it will be unbinded, which
means that it can try to be binded to another port.
Signed-off-by: Menglong Dong <imagedong@tencent.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220106132022.3470772-2-imagedong@tencent.com
2022-01-06 21:20:20 +08:00
|
|
|
.put_port = udp_lib_unhash,
|
2021-03-31 10:32:31 +08:00
|
|
|
#ifdef CONFIG_BPF_SYSCALL
|
|
|
|
.psock_update_sk_prot = udp_bpf_update_proto,
|
|
|
|
#endif
|
2022-06-09 14:34:08 +08:00
|
|
|
|
2018-03-14 12:57:16 +08:00
|
|
|
.memory_allocated = &udp_memory_allocated,
|
2022-06-09 14:34:08 +08:00
|
|
|
.per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc,
|
|
|
|
|
2018-03-14 12:57:16 +08:00
|
|
|
.sysctl_mem = sysctl_udp_mem,
|
|
|
|
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
|
|
|
|
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min),
|
|
|
|
.obj_size = sizeof(struct udp6_sock),
|
2023-07-20 19:09:01 +08:00
|
|
|
.ipv6_pinfo_offset = offsetof(struct udp6_sock, inet6),
|
2022-11-15 05:57:54 +08:00
|
|
|
.h.udp_table = NULL,
|
2018-03-14 12:57:16 +08:00
|
|
|
.diag_destroy = udp_abort,
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
static struct inet_protosw udpv6_protosw = {
|
|
|
|
.type = SOCK_DGRAM,
|
|
|
|
.protocol = IPPROTO_UDP,
|
|
|
|
.prot = &udpv6_prot,
|
|
|
|
.ops = &inet6_dgram_ops,
|
|
|
|
.flags = INET_PROTOSW_PERMANENT,
|
|
|
|
};
|
|
|
|
|
2007-12-11 18:25:35 +08:00
|
|
|
/*
 * Register UDP with the IPv6 stack.
 *
 * The receive/error handlers are installed in net_hotdata.udpv6_protocol and
 * added for IPPROTO_UDP, then the protocol switch entry is registered.  If
 * the second step fails the protocol handler is removed again.
 *
 * Returns 0 on success or the error code of the step that failed.
 */
int __init udpv6_init(void)
{
	int ret;

	net_hotdata.udpv6_protocol = (struct inet6_protocol) {
		.handler     = udpv6_rcv,
		.err_handler = udpv6_err,
		.flags       = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
	};

	ret = inet6_add_protocol(&net_hotdata.udpv6_protocol, IPPROTO_UDP);
	if (ret)
		return ret;

	ret = inet6_register_protosw(&udpv6_protosw);
	if (ret) {
		/* Undo the handler registration made above. */
		inet6_del_protocol(&net_hotdata.udpv6_protocol, IPPROTO_UDP);
	}

	return ret;
}
|
|
|
|
|
2007-12-13 21:34:58 +08:00
|
|
|
void udpv6_exit(void)
|
2007-12-11 18:25:35 +08:00
|
|
|
{
|
|
|
|
inet6_unregister_protosw(&udpv6_protosw);
|
2024-03-07 00:00:25 +08:00
|
|
|
inet6_del_protocol(&net_hotdata.udpv6_protocol, IPPROTO_UDP);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|