linux/include/net/inet6_hashtables.h
Lorenz Bauer 9c02bec959 bpf, net: Support SO_REUSEPORT sockets with bpf_sk_assign
Currently the bpf_sk_assign helper in tc BPF context refuses SO_REUSEPORT
sockets. This means we can't use the helper to steer traffic to Envoy,
which configures SO_REUSEPORT on its sockets. In turn, we're blocked
from removing TPROXY from our setup.

The reason that bpf_sk_assign refuses such sockets is that the
bpf_sk_lookup helpers don't execute SK_REUSEPORT programs. Instead,
one of the reuseport sockets is selected by hash. This could cause
dispatch to the "wrong" socket:

    sk = bpf_sk_lookup_tcp(...) // select SO_REUSEPORT by hash
    bpf_sk_assign(skb, sk) // SK_REUSEPORT wasn't executed

Fixing this isn't as simple as invoking SK_REUSEPORT from the lookup
helpers unfortunately. In the tc context, L2 headers are at the start
of the skb, while SK_REUSEPORT expects L3 headers instead.

Instead, we execute the SK_REUSEPORT program when the assigned socket
is pulled out of the skb, further up the stack. This creates some
trickiness with regards to refcounting as bpf_sk_assign will put both
refcounted and RCU freed sockets in skb->sk. reuseport sockets are RCU
freed. We can infer that the sk_assigned socket is RCU freed if the
reuseport lookup succeeds, but convincing yourself of this fact isn't
straight forward. Therefore we defensively check refcounting on the
sk_assign sock even though it's probably not required in practice.

Fixes: 8e368dc72e ("bpf: Fix use of sk->sk_reuseport from sk_assign")
Fixes: cf7fbe660f ("bpf: Add socket assign support")
Co-developed-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Joe Stringer <joe@cilium.io>
Link: https://lore.kernel.org/bpf/CACAyw98+qycmpQzKupquhkxbvWK4OFyDuuLMBNROnfWMZxUWeA@mail.gmail.com/
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/r/20230720-so-reuseport-v6-7-7021b683cdae@isovalent.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2023-07-25 13:55:55 -07:00

198 lines
5.5 KiB
C

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* Authors: Lotsa people, from code originally in tcp
*/
#ifndef _INET6_HASHTABLES_H
#define _INET6_HASHTABLES_H
#if IS_ENABLED(CONFIG_IPV6)
#include <linux/in6.h>
#include <linux/ipv6.h>
#include <linux/types.h>
#include <linux/jhash.h>
#include <net/inet_sock.h>
#include <net/ipv6.h>
#include <net/netns/hash.h>
struct inet_hashinfo;
static inline unsigned int __inet6_ehashfn(const u32 lhash,
const u16 lport,
const u32 fhash,
const __be16 fport,
const u32 initval)
{
const u32 ports = (((u32)lport) << 16) | (__force u32)fport;
return jhash_3words(lhash, fhash, ports, initval);
}
/*
* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
* we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
*
* The sockhash lock must be held as a reader here.
*/
struct sock *__inet6_lookup_established(struct net *net,
struct inet_hashinfo *hashinfo,
const struct in6_addr *saddr,
const __be16 sport,
const struct in6_addr *daddr,
const u16 hnum, const int dif,
const int sdif);
typedef u32 (inet6_ehashfn_t)(const struct net *net,
const struct in6_addr *laddr, const u16 lport,
const struct in6_addr *faddr, const __be16 fport);
inet6_ehashfn_t inet6_ehashfn;
INDIRECT_CALLABLE_DECLARE(inet6_ehashfn_t udp6_ehashfn);
struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk,
struct sk_buff *skb, int doff,
const struct in6_addr *saddr,
__be16 sport,
const struct in6_addr *daddr,
unsigned short hnum,
inet6_ehashfn_t *ehashfn);
struct sock *inet6_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
const struct in6_addr *saddr,
const __be16 sport,
const struct in6_addr *daddr,
const unsigned short hnum,
const int dif, const int sdif);
struct sock *inet6_lookup_run_sk_lookup(struct net *net,
int protocol,
struct sk_buff *skb, int doff,
const struct in6_addr *saddr,
const __be16 sport,
const struct in6_addr *daddr,
const u16 hnum, const int dif,
inet6_ehashfn_t *ehashfn);
static inline struct sock *__inet6_lookup(struct net *net,
struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
const struct in6_addr *saddr,
const __be16 sport,
const struct in6_addr *daddr,
const u16 hnum,
const int dif, const int sdif,
bool *refcounted)
{
struct sock *sk = __inet6_lookup_established(net, hashinfo, saddr,
sport, daddr, hnum,
dif, sdif);
*refcounted = true;
if (sk)
return sk;
*refcounted = false;
return inet6_lookup_listener(net, hashinfo, skb, doff, saddr, sport,
daddr, hnum, dif, sdif);
}
static inline
struct sock *inet6_steal_sock(struct net *net, struct sk_buff *skb, int doff,
const struct in6_addr *saddr, const __be16 sport,
const struct in6_addr *daddr, const __be16 dport,
bool *refcounted, inet6_ehashfn_t *ehashfn)
{
struct sock *sk, *reuse_sk;
bool prefetched;
sk = skb_steal_sock(skb, refcounted, &prefetched);
if (!sk)
return NULL;
if (!prefetched)
return sk;
if (sk->sk_protocol == IPPROTO_TCP) {
if (sk->sk_state != TCP_LISTEN)
return sk;
} else if (sk->sk_protocol == IPPROTO_UDP) {
if (sk->sk_state != TCP_CLOSE)
return sk;
} else {
return sk;
}
reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff,
saddr, sport, daddr, ntohs(dport),
ehashfn);
if (!reuse_sk)
return sk;
/* We've chosen a new reuseport sock which is never refcounted. This
* implies that sk also isn't refcounted.
*/
WARN_ON_ONCE(*refcounted);
return reuse_sk;
}
static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
const __be16 sport,
const __be16 dport,
int iif, int sdif,
bool *refcounted)
{
struct net *net = dev_net(skb_dst(skb)->dev);
const struct ipv6hdr *ip6h = ipv6_hdr(skb);
struct sock *sk;
sk = inet6_steal_sock(net, skb, doff, &ip6h->saddr, sport, &ip6h->daddr, dport,
refcounted, inet6_ehashfn);
if (IS_ERR(sk))
return NULL;
if (sk)
return sk;
return __inet6_lookup(net, hashinfo, skb,
doff, &ip6h->saddr, sport,
&ip6h->daddr, ntohs(dport),
iif, sdif, refcounted);
}
struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
const struct in6_addr *saddr, const __be16 sport,
const struct in6_addr *daddr, const __be16 dport,
const int dif);
int inet6_hash(struct sock *sk);
static inline bool inet6_match(struct net *net, const struct sock *sk,
const struct in6_addr *saddr,
const struct in6_addr *daddr,
const __portpair ports,
const int dif, const int sdif)
{
if (!net_eq(sock_net(sk), net) ||
sk->sk_family != AF_INET6 ||
sk->sk_portpair != ports ||
!ipv6_addr_equal(&sk->sk_v6_daddr, saddr) ||
!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
return false;
/* READ_ONCE() paired with WRITE_ONCE() in sock_bindtoindex_locked() */
return inet_sk_bound_dev_eq(net, READ_ONCE(sk->sk_bound_dev_if), dif,
sdif);
}
#endif /* IS_ENABLED(CONFIG_IPV6) */
#endif /* _INET6_HASHTABLES_H */