2019-05-27 14:55:01 +08:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
2005-08-12 20:19:38 +08:00
|
|
|
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 * Authors:	Lotsa people, from code originally in tcp
 */
|
|
|
|
|
|
|
|
#ifndef _INET6_HASHTABLES_H
|
|
|
|
#define _INET6_HASHTABLES_H
|
|
|
|
|
2005-08-12 20:26:18 +08:00
|
|
|
|
2011-12-10 17:48:31 +08:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2005-08-12 20:26:18 +08:00
|
|
|
#include <linux/in6.h>
|
|
|
|
#include <linux/ipv6.h>
|
2005-08-12 20:19:38 +08:00
|
|
|
#include <linux/types.h>
|
2007-03-24 02:40:27 +08:00
|
|
|
#include <linux/jhash.h>
|
|
|
|
|
|
|
|
#include <net/inet_sock.h>
|
2005-08-12 20:19:38 +08:00
|
|
|
|
2005-08-12 20:26:18 +08:00
|
|
|
#include <net/ipv6.h>
|
2008-06-17 08:14:11 +08:00
|
|
|
#include <net/netns/hash.h>
|
2005-08-12 20:26:18 +08:00
|
|
|
|
2005-08-12 20:19:38 +08:00
|
|
|
/* Forward declaration: the declarations visible in this header only use pointers to it. */
struct inet_hashinfo;
|
|
|
|
|
2013-10-20 03:48:52 +08:00
|
|
|
static inline unsigned int __inet6_ehashfn(const u32 lhash,
|
|
|
|
const u16 lport,
|
|
|
|
const u32 fhash,
|
|
|
|
const __be16 fport,
|
|
|
|
const u32 initval)
|
2005-08-12 20:26:18 +08:00
|
|
|
{
|
2013-10-20 03:48:52 +08:00
|
|
|
const u32 ports = (((u32)lport) << 16) | (__force u32)fport;
|
|
|
|
return jhash_3words(lhash, fhash, ports, initval);
|
2005-08-12 20:26:18 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
|
|
|
|
* we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
|
|
|
|
*
|
|
|
|
* The sockhash lock must be held as a reader here.
|
|
|
|
*/
|
2013-09-22 01:22:41 +08:00
|
|
|
struct sock *__inet6_lookup_established(struct net *net,
|
|
|
|
struct inet_hashinfo *hashinfo,
|
|
|
|
const struct in6_addr *saddr,
|
|
|
|
const __be16 sport,
|
|
|
|
const struct in6_addr *daddr,
|
2017-08-07 23:44:21 +08:00
|
|
|
const u16 hnum, const int dif,
|
|
|
|
const int sdif);
|
2013-09-22 01:22:41 +08:00
|
|
|
|
2023-07-20 23:30:08 +08:00
|
|
|
typedef u32 (inet6_ehashfn_t)(const struct net *net,
|
|
|
|
const struct in6_addr *laddr, const u16 lport,
|
|
|
|
const struct in6_addr *faddr, const __be16 fport);
|
|
|
|
|
|
|
|
inet6_ehashfn_t inet6_ehashfn;
|
|
|
|
|
|
|
|
INDIRECT_CALLABLE_DECLARE(inet6_ehashfn_t udp6_ehashfn);
|
|
|
|
|
2023-07-20 23:30:07 +08:00
|
|
|
struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk,
|
|
|
|
struct sk_buff *skb, int doff,
|
|
|
|
const struct in6_addr *saddr,
|
|
|
|
__be16 sport,
|
|
|
|
const struct in6_addr *daddr,
|
2023-07-20 23:30:08 +08:00
|
|
|
unsigned short hnum,
|
|
|
|
inet6_ehashfn_t *ehashfn);
|
2023-07-20 23:30:07 +08:00
|
|
|
|
2013-09-22 01:22:41 +08:00
|
|
|
struct sock *inet6_lookup_listener(struct net *net,
|
|
|
|
struct inet_hashinfo *hashinfo,
|
2016-02-11 00:50:38 +08:00
|
|
|
struct sk_buff *skb, int doff,
|
2013-09-22 01:22:41 +08:00
|
|
|
const struct in6_addr *saddr,
|
|
|
|
const __be16 sport,
|
|
|
|
const struct in6_addr *daddr,
|
2017-08-07 23:44:21 +08:00
|
|
|
const unsigned short hnum,
|
|
|
|
const int dif, const int sdif);
|
2005-08-12 20:26:18 +08:00
|
|
|
|
2023-07-20 23:30:10 +08:00
|
|
|
struct sock *inet6_lookup_run_sk_lookup(struct net *net,
|
|
|
|
int protocol,
|
|
|
|
struct sk_buff *skb, int doff,
|
|
|
|
const struct in6_addr *saddr,
|
|
|
|
const __be16 sport,
|
|
|
|
const struct in6_addr *daddr,
|
|
|
|
const u16 hnum, const int dif,
|
|
|
|
inet6_ehashfn_t *ehashfn);
|
|
|
|
|
2008-01-31 21:07:21 +08:00
|
|
|
static inline struct sock *__inet6_lookup(struct net *net,
|
|
|
|
struct inet_hashinfo *hashinfo,
|
2016-02-11 00:50:38 +08:00
|
|
|
struct sk_buff *skb, int doff,
|
2005-08-12 20:26:18 +08:00
|
|
|
const struct in6_addr *saddr,
|
2006-11-08 16:20:00 +08:00
|
|
|
const __be16 sport,
|
2005-08-12 20:26:18 +08:00
|
|
|
const struct in6_addr *daddr,
|
|
|
|
const u16 hnum,
|
2017-08-07 23:44:21 +08:00
|
|
|
const int dif, const int sdif,
|
2016-04-01 23:52:17 +08:00
|
|
|
bool *refcounted)
|
2005-08-12 20:26:18 +08:00
|
|
|
{
|
2008-01-31 21:07:21 +08:00
|
|
|
struct sock *sk = __inet6_lookup_established(net, hashinfo, saddr,
|
2017-08-07 23:44:21 +08:00
|
|
|
sport, daddr, hnum,
|
|
|
|
dif, sdif);
|
2016-04-01 23:52:17 +08:00
|
|
|
*refcounted = true;
|
2005-08-12 20:26:18 +08:00
|
|
|
if (sk)
|
|
|
|
return sk;
|
2016-04-01 23:52:17 +08:00
|
|
|
*refcounted = false;
|
2016-02-11 00:50:38 +08:00
|
|
|
return inet6_lookup_listener(net, hashinfo, skb, doff, saddr, sport,
|
2017-08-07 23:44:21 +08:00
|
|
|
daddr, hnum, dif, sdif);
|
2005-08-12 20:26:18 +08:00
|
|
|
}
|
|
|
|
|
bpf, net: Support SO_REUSEPORT sockets with bpf_sk_assign
Currently the bpf_sk_assign helper in tc BPF context refuses SO_REUSEPORT
sockets. This means we can't use the helper to steer traffic to Envoy,
which configures SO_REUSEPORT on its sockets. In turn, we're blocked
from removing TPROXY from our setup.
The reason that bpf_sk_assign refuses such sockets is that the
bpf_sk_lookup helpers don't execute SK_REUSEPORT programs. Instead,
one of the reuseport sockets is selected by hash. This could cause
dispatch to the "wrong" socket:
sk = bpf_sk_lookup_tcp(...) // select SO_REUSEPORT by hash
bpf_sk_assign(skb, sk) // SK_REUSEPORT wasn't executed
Fixing this isn't as simple as invoking SK_REUSEPORT from the lookup
helpers unfortunately. In the tc context, L2 headers are at the start
of the skb, while SK_REUSEPORT expects L3 headers instead.
Instead, we execute the SK_REUSEPORT program when the assigned socket
is pulled out of the skb, further up the stack. This creates some
trickiness with regards to refcounting as bpf_sk_assign will put both
refcounted and RCU freed sockets in skb->sk. reuseport sockets are RCU
freed. We can infer that the sk_assigned socket is RCU freed if the
reuseport lookup succeeds, but convincing yourself of this fact isn't
straight forward. Therefore we defensively check refcounting on the
sk_assign sock even though it's probably not required in practice.
Fixes: 8e368dc72e86 ("bpf: Fix use of sk->sk_reuseport from sk_assign")
Fixes: cf7fbe660f2d ("bpf: Add socket assign support")
Co-developed-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Joe Stringer <joe@cilium.io>
Link: https://lore.kernel.org/bpf/CACAyw98+qycmpQzKupquhkxbvWK4OFyDuuLMBNROnfWMZxUWeA@mail.gmail.com/
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/r/20230720-so-reuseport-v6-7-7021b683cdae@isovalent.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2023-07-20 23:30:11 +08:00
|
|
|
static inline
|
|
|
|
struct sock *inet6_steal_sock(struct net *net, struct sk_buff *skb, int doff,
|
|
|
|
const struct in6_addr *saddr, const __be16 sport,
|
|
|
|
const struct in6_addr *daddr, const __be16 dport,
|
|
|
|
bool *refcounted, inet6_ehashfn_t *ehashfn)
|
|
|
|
{
|
|
|
|
struct sock *sk, *reuse_sk;
|
|
|
|
bool prefetched;
|
|
|
|
|
|
|
|
sk = skb_steal_sock(skb, refcounted, &prefetched);
|
|
|
|
if (!sk)
|
|
|
|
return NULL;
|
|
|
|
|
2023-08-15 16:53:41 +08:00
|
|
|
if (!prefetched || !sk_fullsock(sk))
|
bpf, net: Support SO_REUSEPORT sockets with bpf_sk_assign
Currently the bpf_sk_assign helper in tc BPF context refuses SO_REUSEPORT
sockets. This means we can't use the helper to steer traffic to Envoy,
which configures SO_REUSEPORT on its sockets. In turn, we're blocked
from removing TPROXY from our setup.
The reason that bpf_sk_assign refuses such sockets is that the
bpf_sk_lookup helpers don't execute SK_REUSEPORT programs. Instead,
one of the reuseport sockets is selected by hash. This could cause
dispatch to the "wrong" socket:
sk = bpf_sk_lookup_tcp(...) // select SO_REUSEPORT by hash
bpf_sk_assign(skb, sk) // SK_REUSEPORT wasn't executed
Fixing this isn't as simple as invoking SK_REUSEPORT from the lookup
helpers unfortunately. In the tc context, L2 headers are at the start
of the skb, while SK_REUSEPORT expects L3 headers instead.
Instead, we execute the SK_REUSEPORT program when the assigned socket
is pulled out of the skb, further up the stack. This creates some
trickiness with regards to refcounting as bpf_sk_assign will put both
refcounted and RCU freed sockets in skb->sk. reuseport sockets are RCU
freed. We can infer that the sk_assigned socket is RCU freed if the
reuseport lookup succeeds, but convincing yourself of this fact isn't
straight forward. Therefore we defensively check refcounting on the
sk_assign sock even though it's probably not required in practice.
Fixes: 8e368dc72e86 ("bpf: Fix use of sk->sk_reuseport from sk_assign")
Fixes: cf7fbe660f2d ("bpf: Add socket assign support")
Co-developed-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Joe Stringer <joe@cilium.io>
Link: https://lore.kernel.org/bpf/CACAyw98+qycmpQzKupquhkxbvWK4OFyDuuLMBNROnfWMZxUWeA@mail.gmail.com/
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/r/20230720-so-reuseport-v6-7-7021b683cdae@isovalent.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2023-07-20 23:30:11 +08:00
|
|
|
return sk;
|
|
|
|
|
|
|
|
if (sk->sk_protocol == IPPROTO_TCP) {
|
|
|
|
if (sk->sk_state != TCP_LISTEN)
|
|
|
|
return sk;
|
|
|
|
} else if (sk->sk_protocol == IPPROTO_UDP) {
|
|
|
|
if (sk->sk_state != TCP_CLOSE)
|
|
|
|
return sk;
|
|
|
|
} else {
|
|
|
|
return sk;
|
|
|
|
}
|
|
|
|
|
|
|
|
reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff,
|
|
|
|
saddr, sport, daddr, ntohs(dport),
|
|
|
|
ehashfn);
|
|
|
|
if (!reuse_sk)
|
|
|
|
return sk;
|
|
|
|
|
|
|
|
/* We've chosen a new reuseport sock which is never refcounted. This
|
|
|
|
* implies that sk also isn't refcounted.
|
|
|
|
*/
|
|
|
|
WARN_ON_ONCE(*refcounted);
|
|
|
|
|
|
|
|
return reuse_sk;
|
|
|
|
}
|
|
|
|
|
2008-10-08 02:41:57 +08:00
|
|
|
static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo,
|
2016-02-11 00:50:38 +08:00
|
|
|
struct sk_buff *skb, int doff,
|
2008-10-08 02:41:57 +08:00
|
|
|
const __be16 sport,
|
2014-10-18 00:17:20 +08:00
|
|
|
const __be16 dport,
|
2017-08-07 23:44:21 +08:00
|
|
|
int iif, int sdif,
|
2016-04-01 23:52:17 +08:00
|
|
|
bool *refcounted)
|
2008-10-08 02:41:57 +08:00
|
|
|
{
|
bpf, net: Support SO_REUSEPORT sockets with bpf_sk_assign
Currently the bpf_sk_assign helper in tc BPF context refuses SO_REUSEPORT
sockets. This means we can't use the helper to steer traffic to Envoy,
which configures SO_REUSEPORT on its sockets. In turn, we're blocked
from removing TPROXY from our setup.
The reason that bpf_sk_assign refuses such sockets is that the
bpf_sk_lookup helpers don't execute SK_REUSEPORT programs. Instead,
one of the reuseport sockets is selected by hash. This could cause
dispatch to the "wrong" socket:
sk = bpf_sk_lookup_tcp(...) // select SO_REUSEPORT by hash
bpf_sk_assign(skb, sk) // SK_REUSEPORT wasn't executed
Fixing this isn't as simple as invoking SK_REUSEPORT from the lookup
helpers unfortunately. In the tc context, L2 headers are at the start
of the skb, while SK_REUSEPORT expects L3 headers instead.
Instead, we execute the SK_REUSEPORT program when the assigned socket
is pulled out of the skb, further up the stack. This creates some
trickiness with regards to refcounting as bpf_sk_assign will put both
refcounted and RCU freed sockets in skb->sk. reuseport sockets are RCU
freed. We can infer that the sk_assigned socket is RCU freed if the
reuseport lookup succeeds, but convincing yourself of this fact isn't
straight forward. Therefore we defensively check refcounting on the
sk_assign sock even though it's probably not required in practice.
Fixes: 8e368dc72e86 ("bpf: Fix use of sk->sk_reuseport from sk_assign")
Fixes: cf7fbe660f2d ("bpf: Add socket assign support")
Co-developed-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Joe Stringer <joe@cilium.io>
Link: https://lore.kernel.org/bpf/CACAyw98+qycmpQzKupquhkxbvWK4OFyDuuLMBNROnfWMZxUWeA@mail.gmail.com/
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/r/20230720-so-reuseport-v6-7-7021b683cdae@isovalent.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2023-07-20 23:30:11 +08:00
|
|
|
struct net *net = dev_net(skb_dst(skb)->dev);
|
|
|
|
const struct ipv6hdr *ip6h = ipv6_hdr(skb);
|
|
|
|
struct sock *sk;
|
|
|
|
|
|
|
|
sk = inet6_steal_sock(net, skb, doff, &ip6h->saddr, sport, &ip6h->daddr, dport,
|
|
|
|
refcounted, inet6_ehashfn);
|
|
|
|
if (IS_ERR(sk))
|
|
|
|
return NULL;
|
2012-07-26 20:18:11 +08:00
|
|
|
if (sk)
|
2008-10-08 03:41:01 +08:00
|
|
|
return sk;
|
2012-07-26 20:18:11 +08:00
|
|
|
|
bpf, net: Support SO_REUSEPORT sockets with bpf_sk_assign
Currently the bpf_sk_assign helper in tc BPF context refuses SO_REUSEPORT
sockets. This means we can't use the helper to steer traffic to Envoy,
which configures SO_REUSEPORT on its sockets. In turn, we're blocked
from removing TPROXY from our setup.
The reason that bpf_sk_assign refuses such sockets is that the
bpf_sk_lookup helpers don't execute SK_REUSEPORT programs. Instead,
one of the reuseport sockets is selected by hash. This could cause
dispatch to the "wrong" socket:
sk = bpf_sk_lookup_tcp(...) // select SO_REUSEPORT by hash
bpf_sk_assign(skb, sk) // SK_REUSEPORT wasn't executed
Fixing this isn't as simple as invoking SK_REUSEPORT from the lookup
helpers unfortunately. In the tc context, L2 headers are at the start
of the skb, while SK_REUSEPORT expects L3 headers instead.
Instead, we execute the SK_REUSEPORT program when the assigned socket
is pulled out of the skb, further up the stack. This creates some
trickiness with regards to refcounting as bpf_sk_assign will put both
refcounted and RCU freed sockets in skb->sk. reuseport sockets are RCU
freed. We can infer that the sk_assigned socket is RCU freed if the
reuseport lookup succeeds, but convincing yourself of this fact isn't
straight forward. Therefore we defensively check refcounting on the
sk_assign sock even though it's probably not required in practice.
Fixes: 8e368dc72e86 ("bpf: Fix use of sk->sk_reuseport from sk_assign")
Fixes: cf7fbe660f2d ("bpf: Add socket assign support")
Co-developed-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Joe Stringer <joe@cilium.io>
Link: https://lore.kernel.org/bpf/CACAyw98+qycmpQzKupquhkxbvWK4OFyDuuLMBNROnfWMZxUWeA@mail.gmail.com/
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/r/20230720-so-reuseport-v6-7-7021b683cdae@isovalent.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2023-07-20 23:30:11 +08:00
|
|
|
return __inet6_lookup(net, hashinfo, skb,
|
|
|
|
doff, &ip6h->saddr, sport,
|
|
|
|
&ip6h->daddr, ntohs(dport),
|
2017-08-07 23:44:21 +08:00
|
|
|
iif, sdif, refcounted);
|
2008-10-08 02:41:57 +08:00
|
|
|
}
|
|
|
|
|
2013-09-22 01:22:41 +08:00
|
|
|
/*
 * Declared here, defined elsewhere.  Unlike the __inet6_lookup*() helpers in
 * this header it takes the destination port as a __be16 (@dport) and has no
 * sdif or refcounted parameter.
 */
struct sock *inet6_lookup(struct net *net,
			  struct inet_hashinfo *hashinfo,
			  struct sk_buff *skb,
			  int doff,
			  const struct in6_addr *saddr,
			  const __be16 sport,
			  const struct in6_addr *daddr,
			  const __be16 dport,
			  const int dif);
|
2016-02-11 00:50:36 +08:00
|
|
|
|
|
|
|
/*
 * Declared here, defined elsewhere.
 * NOTE(review): presumably inserts @sk into the hash tables and returns 0 or
 * a negative error code - confirm against the definition.
 */
int inet6_hash(struct sock *sk);
|
2014-11-05 02:59:47 +08:00
|
|
|
|
2022-05-14 02:55:49 +08:00
|
|
|
static inline bool inet6_match(struct net *net, const struct sock *sk,
|
|
|
|
const struct in6_addr *saddr,
|
|
|
|
const struct in6_addr *daddr,
|
|
|
|
const __portpair ports,
|
|
|
|
const int dif, const int sdif)
|
|
|
|
{
|
|
|
|
if (!net_eq(sock_net(sk), net) ||
|
|
|
|
sk->sk_family != AF_INET6 ||
|
|
|
|
sk->sk_portpair != ports ||
|
|
|
|
!ipv6_addr_equal(&sk->sk_v6_daddr, saddr) ||
|
|
|
|
!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
|
|
|
|
return false;
|
|
|
|
|
2022-07-26 02:14:42 +08:00
|
|
|
/* READ_ONCE() paired with WRITE_ONCE() in sock_bindtoindex_locked() */
|
|
|
|
return inet_sk_bound_dev_eq(net, READ_ONCE(sk->sk_bound_dev_if), dif,
|
|
|
|
sdif);
|
2022-05-14 02:55:49 +08:00
|
|
|
}
|
|
|
|
#endif /* IS_ENABLED(CONFIG_IPV6) */
|
2014-11-05 02:59:47 +08:00
|
|
|
|
2005-08-12 20:19:38 +08:00
|
|
|
#endif /* _INET6_HASHTABLES_H */
|