mirror of
https://mirrors.bfsu.edu.cn/git/linux.git
synced 2024-12-01 08:04:22 +08:00
Merge branch 'udp-gro-L4'
Paolo Abeni says: ==================== udp: GRO L4 improvements This series improves the UDP L4 - either 'forward' or 'frag_list' - co-existence with UDP tunnel GRO, allowing the first to take place correctly even for encapsulated UDP traffic. The first four patches are mostly bugfixes, addressing some GRO edge-cases when both tunnels and L4 are present, enabled and in use. The next 3 patches avoid unneeded segmentation when UDP GRO traffic traverses UDP tunnels in the receive path. Finally, some self-tests are included, covering the relevant GRO scenarios. Even if most patches are actually bugfixes, this series is targeting net-next, as overall it makes available a new feature. v2 -> v3: - no code changes, more verbose commit messages and comment in patch 1/8 v1 -> v2: - restrict post segmentation csum fixup to only the relevant pkts - use individual 'accept_gso_type' fields instead of whole gso bitmask (Willem) - use only ipv6 addresses from test range in self-tests (Willem) - hopefully clarified most individual patches commit messages ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
commit
df82e9c6dd
@ -218,6 +218,7 @@ static struct socket *bareudp_create_sock(struct net *net, __be16 port)
|
||||
if (err < 0)
|
||||
return ERR_PTR(err);
|
||||
|
||||
udp_allow_gso(sock->sk);
|
||||
return sock;
|
||||
}
|
||||
|
||||
|
@ -461,6 +461,7 @@ static struct socket *geneve_create_sock(struct net *net, bool ipv6,
|
||||
if (err < 0)
|
||||
return ERR_PTR(err);
|
||||
|
||||
udp_allow_gso(sock->sk);
|
||||
return sock;
|
||||
}
|
||||
|
||||
|
@ -3484,6 +3484,7 @@ static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
|
||||
if (err < 0)
|
||||
return ERR_PTR(err);
|
||||
|
||||
udp_allow_gso(sock->sk);
|
||||
return sock;
|
||||
}
|
||||
|
||||
|
@ -51,7 +51,9 @@ struct udp_sock {
|
||||
* different encapsulation layer set
|
||||
* this
|
||||
*/
|
||||
gro_enabled:1; /* Can accept GRO packets */
|
||||
gro_enabled:1, /* Request GRO aggregation */
|
||||
accept_udp_l4:1,
|
||||
accept_udp_fraglist:1;
|
||||
/*
|
||||
* Following member retains the information to create a UDP header
|
||||
* when the socket is uncorked.
|
||||
@ -131,8 +133,22 @@ static inline void udp_cmsg_recv(struct msghdr *msg, struct sock *sk,
|
||||
|
||||
static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb)
|
||||
{
|
||||
return !udp_sk(sk)->gro_enabled && skb_is_gso(skb) &&
|
||||
skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4;
|
||||
if (!skb_is_gso(skb))
|
||||
return false;
|
||||
|
||||
if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && !udp_sk(sk)->accept_udp_l4)
|
||||
return true;
|
||||
|
||||
if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST && !udp_sk(sk)->accept_udp_fraglist)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline void udp_allow_gso(struct sock *sk)
|
||||
{
|
||||
udp_sk(sk)->accept_udp_l4 = 1;
|
||||
udp_sk(sk)->accept_udp_fraglist = 1;
|
||||
}
|
||||
|
||||
#define udp_portaddr_for_each_entry(__sk, list) \
|
||||
|
@ -515,6 +515,29 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk,
|
||||
return segs;
|
||||
}
|
||||
|
||||
static inline void udp_post_segment_fix_csum(struct sk_buff *skb)
|
||||
{
|
||||
/* UDP-lite can't land here - no GRO */
|
||||
WARN_ON_ONCE(UDP_SKB_CB(skb)->partial_cov);
|
||||
|
||||
/* UDP packets generated with UDP_SEGMENT and traversing:
|
||||
*
|
||||
* UDP tunnel(xmit) -> veth (segmentation) -> veth (gro) -> UDP tunnel (rx)
|
||||
*
|
||||
* can reach an UDP socket with CHECKSUM_NONE, because
|
||||
* __iptunnel_pull_header() converts CHECKSUM_PARTIAL into NONE.
|
||||
* SKB_GSO_UDP_L4 or SKB_GSO_FRAGLIST packets with no UDP tunnel will
|
||||
* have a valid checksum, as the GRO engine validates the UDP csum
|
||||
* before the aggregation and nobody strips such info in between.
|
||||
* Instead of adding another check in the tunnel fastpath, we can force
|
||||
* a valid csum after the segmentation.
|
||||
* Additionally fixup the UDP CB.
|
||||
*/
|
||||
UDP_SKB_CB(skb)->cscov = skb->len;
|
||||
if (skb->ip_summed == CHECKSUM_NONE && !skb->csum_valid)
|
||||
skb->csum_valid = 1;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BPF_SYSCALL
|
||||
struct sk_psock;
|
||||
struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock);
|
||||
|
@ -2178,6 +2178,8 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
|
||||
segs = udp_rcv_segment(sk, skb, true);
|
||||
skb_list_walk_safe(segs, skb, next) {
|
||||
__skb_pull(skb, skb_transport_offset(skb));
|
||||
|
||||
udp_post_segment_fix_csum(skb);
|
||||
ret = udp_queue_rcv_one_skb(sk, skb);
|
||||
if (ret > 0)
|
||||
ip_protocol_deliver_rcu(dev_net(skb->dev), skb, ret);
|
||||
@ -2664,9 +2666,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
|
||||
|
||||
case UDP_GRO:
|
||||
lock_sock(sk);
|
||||
|
||||
/* when enabling GRO, accept the related GSO packet type */
|
||||
if (valbool)
|
||||
udp_tunnel_encap_enable(sk->sk_socket);
|
||||
up->gro_enabled = valbool;
|
||||
up->accept_udp_l4 = valbool;
|
||||
release_sock(sk);
|
||||
break;
|
||||
|
||||
|
@ -515,21 +515,24 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
|
||||
unsigned int off = skb_gro_offset(skb);
|
||||
int flush = 1;
|
||||
|
||||
/* we can do L4 aggregation only if the packet can't land in a tunnel
|
||||
* otherwise we could corrupt the inner stream
|
||||
*/
|
||||
NAPI_GRO_CB(skb)->is_flist = 0;
|
||||
if (skb->dev->features & NETIF_F_GRO_FRAGLIST)
|
||||
NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled: 1;
|
||||
if (!sk || !udp_sk(sk)->gro_receive) {
|
||||
if (skb->dev->features & NETIF_F_GRO_FRAGLIST)
|
||||
NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled : 1;
|
||||
|
||||
if ((!sk && (skb->dev->features & NETIF_F_GRO_UDP_FWD)) ||
|
||||
(sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist) {
|
||||
pp = call_gro_receive(udp_gro_receive_segment, head, skb);
|
||||
if ((!sk && (skb->dev->features & NETIF_F_GRO_UDP_FWD)) ||
|
||||
(sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist)
|
||||
pp = call_gro_receive(udp_gro_receive_segment, head, skb);
|
||||
return pp;
|
||||
}
|
||||
|
||||
if (!sk || NAPI_GRO_CB(skb)->encap_mark ||
|
||||
if (NAPI_GRO_CB(skb)->encap_mark ||
|
||||
(uh->check && skb->ip_summed != CHECKSUM_PARTIAL &&
|
||||
NAPI_GRO_CB(skb)->csum_cnt == 0 &&
|
||||
!NAPI_GRO_CB(skb)->csum_valid) ||
|
||||
!udp_sk(sk)->gro_receive)
|
||||
!NAPI_GRO_CB(skb)->csum_valid))
|
||||
goto out;
|
||||
|
||||
/* mark that this skb passed once through the tunnel gro layer */
|
||||
@ -639,6 +642,11 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
|
||||
skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM
|
||||
: SKB_GSO_UDP_TUNNEL;
|
||||
|
||||
/* clear the encap mark, so that inner frag_list gro_complete
|
||||
* can take place
|
||||
*/
|
||||
NAPI_GRO_CB(skb)->encap_mark = 0;
|
||||
|
||||
/* Set encapsulation before calling into inner gro_complete()
|
||||
* functions to make them set up the inner offsets.
|
||||
*/
|
||||
@ -662,7 +670,8 @@ INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff)
|
||||
const struct iphdr *iph = ip_hdr(skb);
|
||||
struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
|
||||
|
||||
if (NAPI_GRO_CB(skb)->is_flist) {
|
||||
/* do fraglist only if there is no outer UDP encap (or we already processed it) */
|
||||
if (NAPI_GRO_CB(skb)->is_flist && !NAPI_GRO_CB(skb)->encap_mark) {
|
||||
uh->len = htons(skb->len - nhoff);
|
||||
|
||||
skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4);
|
||||
|
@ -749,6 +749,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
|
||||
skb_list_walk_safe(segs, skb, next) {
|
||||
__skb_pull(skb, skb_transport_offset(skb));
|
||||
|
||||
udp_post_segment_fix_csum(skb);
|
||||
ret = udpv6_queue_rcv_one_skb(sk, skb);
|
||||
if (ret > 0)
|
||||
ip6_protocol_deliver_rcu(dev_net(skb->dev), skb, ret,
|
||||
|
@ -163,7 +163,8 @@ INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff)
|
||||
const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
|
||||
struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
|
||||
|
||||
if (NAPI_GRO_CB(skb)->is_flist) {
|
||||
/* do fraglist only if there is no outer UDP encap (or we already processed it) */
|
||||
if (NAPI_GRO_CB(skb)->is_flist && !NAPI_GRO_CB(skb)->encap_mark) {
|
||||
uh->len = htons(skb->len - nhoff);
|
||||
|
||||
skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4);
|
||||
|
@ -23,6 +23,7 @@ TEST_PROGS += drop_monitor_tests.sh
|
||||
TEST_PROGS += vrf_route_leaking.sh
|
||||
TEST_PROGS += bareudp.sh
|
||||
TEST_PROGS += unicast_extensions.sh
|
||||
TEST_PROGS += udpgro_fwd.sh
|
||||
TEST_PROGS_EXTENDED := in_netns.sh
|
||||
TEST_GEN_FILES = socket nettest
|
||||
TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any
|
||||
|
251
tools/testing/selftests/net/udpgro_fwd.sh
Executable file
251
tools/testing/selftests/net/udpgro_fwd.sh
Executable file
@ -0,0 +1,251 @@
|
||||
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# This script relies on bash-only constructs ('local -r', '[[ ... =~ ... ]]'),
# so it must be run by bash rather than by a generic /bin/sh.

# common prefix of the two namespace names; the numeric ids below are
# appended to it and are also used as the host part of every address
readonly BASE="ns-$(mktemp -u XXXXXX)"
readonly SRC=2
readonly DST=1
readonly DST_NAT=100
readonly NS_SRC=$BASE$SRC
readonly NS_DST=$BASE$DST

# "baremetal" network used for raw UDP traffic
readonly BM_NET_V4=192.168.1.
readonly BM_NET_V6=2001:db8::

# "overlay" network used for UDP over UDP tunnel traffic
readonly OL_NET_V4=172.16.1.
readonly OL_NET_V6=2001:db8:1::

# number of available CPUs; the perf runs need at least two
readonly NPROCS=$(nproc)
|
||||
|
||||
# Send SIGHUP to any background job still running and delete both test
# namespaces. Errors are ignored, so calling it repeatedly is harmless.
cleanup() {
	local ns
	local -r jobs="$(jobs -p)"

	if [ -n "${jobs}" ]; then
		kill -1 ${jobs} 2>/dev/null
	fi

	for ns in $NS_SRC $NS_DST; do
		ip netns del $ns 2>/dev/null
	done
}
|
||||
|
||||
trap cleanup EXIT
|
||||
|
||||
# Create the source and destination namespaces and connect them with a
# veth pair carrying the "baremetal" IPv4 and IPv6 addresses.
create_ns() {
	local ns

	for ns in $NS_SRC $NS_DST; do
		ip netns add $ns
		ip -n $ns link set dev lo up
	done

	ip link add name veth$SRC type veth peer name veth$DST

	# move each veth end into its namespace and address it; the numeric
	# id doubles as the host part of the address
	for ns in $SRC $DST; do
		ip link set dev veth$ns netns $BASE$ns
		ip -n $BASE$ns link set dev veth$ns up
		ip -n $BASE$ns addr add dev veth$ns $BM_NET_V4$ns/24
		ip -n $BASE$ns addr add dev veth$ns $BM_NET_V6$ns/64 nodad
	done

	# attach a dummy XDP program to the receiver veth; errors are silenced.
	# NOTE(review): presumably required to have GRO running on veth - confirm
	ip -n $NS_DST link set veth$DST xdp object ../bpf/xdp_dummy.o section xdp_dummy 2>/dev/null
}
|
||||
|
||||
# Create one VXLAN device and bring it up.
#   $1 - namespace hosting the device
#   $2 - lower ("baremetal") device, brought up first
#   $3 - remote tunnel endpoint address
#   $4 - name of the VXLAN device to create
#   $5 - VXLAN network identifier
create_vxlan_endpoint() {
	local -r target_ns=$1
	local -r lower_dev=$2
	local -r remote_addr=$3
	local -r tun_dev=$4
	local -r tun_id=$5
	local -r tun_port=4789

	ip -n $target_ns link set dev $lower_dev up
	ip -n $target_ns link add dev $tun_dev type vxlan id $tun_id \
				dstport $tun_port remote $remote_addr
	ip -n $target_ns link set dev $tun_dev up
}
|
||||
|
||||
# Build the two namespaces and add, on each side, one VXLAN device over the
# IPv4 underlay (id 4) and one over the IPv6 underlay (id 6), each with its
# "overlay" address.
create_vxlan_pair() {
	local ns
	local peer

	create_ns

	for ns in $SRC $DST; do
		# note that 3 - $SRC == $DST and 3 - $DST == $SRC
		peer=$((3 - ns))
		create_vxlan_endpoint $BASE$ns veth$ns $BM_NET_V4$peer vxlan$ns 4
		ip -n $BASE$ns addr add dev vxlan$ns $OL_NET_V4$ns/24
	done

	for ns in $SRC $DST; do
		peer=$((3 - ns))
		create_vxlan_endpoint $BASE$ns veth$ns $BM_NET_V6$peer vxlan6$ns 6
		ip -n $BASE$ns addr add dev vxlan6$ns $OL_NET_V6$ns/24 nodad
	done
}
|
||||
|
||||
# Return 0 when $1 contains a ':' (i.e. looks like an IPv6 address),
# 1 otherwise. Uses a plain 'case' pattern so that it does not depend on
# the bash-only '[[ ... =~ ... ]]' construct.
is_ipv6() {
	case "$1" in
	*:*)
		return 0
		;;
	esac
	return 1
}
|
||||
|
||||
# Send a single GSO packet towards the receiver and check how many packets
# hit the INPUT chain of the destination namespace.
#   $1 - test description, printed as a left-aligned label
#   $2 - destination address used by the sender
#   $3 - expected number of packets with UDP destination port 8000
#   $4 - minimum expected number of packets with UDP destination port 4789
#        (up to 3 more are tolerated)
#   $5 - optional address the receiver binds to
# On failure prints a diagnostic and sets the global 'ret' to 1.
run_test() {
	local -r msg=$1
	local -r dst=$2
	local -r pkts=$3
	local -r vxpkts=$4
	local bind=$5
	local rx_args=""
	local rx_family="-4"
	local family=-4
	local ipt=iptables

	printf "%-40s" "$msg"

	if is_ipv6 $dst; then
		# rx program does not support '-6' and implies ipv6 usage by default
		rx_family=""
		family=-6
		ipt=ip6tables
	fi

	rx_args="$rx_family"
	[ -n "$bind" ] && rx_args="$rx_args -b $bind"

	# send a single GSO packet, segmented in 10 UDP frames.
	# Always expect 10 UDP frames on RX side as rx socket does
	# not enable GRO
	#
	# the two rules below have no target: they only count matching packets
	ip netns exec $NS_DST $ipt -A INPUT -p udp --dport 4789
	ip netns exec $NS_DST $ipt -A INPUT -p udp --dport 8000
	ip netns exec $NS_DST ./udpgso_bench_rx -C 1000 -R 10 -n 10 -l 1300 $rx_args &
	local spid=$!
	sleep 0.1
	ip netns exec $NS_SRC ./udpgso_bench_tx $family -M 1 -s 13000 -S 1300 -D $dst
	local retc=$?
	wait $spid
	local rets=$?
	if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then
		echo " fail client exit code $retc, server $rets"
		ret=1
		return
	fi

	# extract the packet counter from the "[pkts:bytes] -A INPUT ..." line
	local rcv=$(ip netns exec $NS_DST $ipt"-save" -c | grep 'dport 8000' | \
		    sed -e 's/\[//' -e 's/:.*//')
	if [ "$rcv" != "$pkts" ]; then
		echo " fail - received $rcv packets, expected $pkts"
		ret=1
		return
	fi

	local vxrcv=$(ip netns exec $NS_DST $ipt"-save" -c | grep 'dport 4789' | \
		      sed -e 's/\[//' -e 's/:.*//')

	# upper net can generate a little noise, allow some tolerance
	if [ "$vxrcv" -lt "$vxpkts" -o "$vxrcv" -gt $((vxpkts + 3)) ]; then
		echo " fail - received $vxrcv vxlan packets, expected $vxpkts"
		ret=1
		return
	fi
	echo " ok"
}
|
||||
|
||||
# Run a short throughput test between the two namespaces.
#   $1 - test description, printed as a left-aligned label
#   $2 - destination address used by the sender
# Skipped when fewer than 2 CPUs are available. On failure prints a
# diagnostic and sets the global 'ret' to 1.
run_bench() {
	local -r msg=$1
	local -r dst=$2
	local family=-4

	printf "%-40s" "$msg"
	if [ $NPROCS -lt 2 ]; then
		echo " skip - needed 2 CPUs found $NPROCS"
		return
	fi

	if is_ipv6 $dst; then
		family=-6
	fi

	# bind the sender and the receiver to different CPUs to try
	# get reproducible results
	ip netns exec $NS_DST bash -c "echo 2 > /sys/class/net/veth$DST/queues/rx-0/rps_cpus"
	ip netns exec $NS_DST taskset 0x2 ./udpgso_bench_rx -C 1000 -R 10 &
	local spid=$!
	sleep 0.1
	ip netns exec $NS_SRC taskset 0x1 ./udpgso_bench_tx $family -l 3 -S 1300 -D $dst
	local retc=$?
	wait $spid
	local rets=$?

	if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then
		echo " fail client exit code $retc, server $rets"
		ret=1
		return
	fi
}
|
||||
|
||||
# overall exit status: stays 0 unless a test or a benchmark reports a failure
ret=0

# run every scenario once per address family, each in freshly created
# namespaces that are torn down right after
for family in 4 6; do
	BM_NET=$BM_NET_V4
	OL_NET=$OL_NET_V4
	IPT=iptables
	SUFFIX=24
	VXDEV=vxlan

	if [ $family = 6 ]; then
		BM_NET=$BM_NET_V6
		OL_NET=$OL_NET_V6
		SUFFIX="64 nodad"
		VXDEV=vxlan6
		IPT=ip6tables
	fi

	echo "IPv$family"

	create_ns
	run_test "No GRO" $BM_NET$DST 10 0
	cleanup

	create_ns
	ip netns exec $NS_DST ethtool -K veth$DST rx-gro-list on
	run_test "GRO frag list" $BM_NET$DST 1 0
	cleanup

	# UDP GRO fwd skips aggregation when it finds an UDP socket without
	# the GRO option; such socket lookup takes place only if there is an
	# UDP tunnel in the running system.
	# use NAT to circumvent GRO FWD check
	create_ns
	ip -n $NS_DST addr add dev veth$DST $BM_NET$DST_NAT/$SUFFIX
	ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on
	ip netns exec $NS_DST $IPT -t nat -I PREROUTING -d $BM_NET$DST_NAT \
					-j DNAT --to-destination $BM_NET$DST
	run_test "GRO fwd" $BM_NET$DST_NAT 1 0 $BM_NET$DST
	cleanup

	create_ns
	run_bench "UDP fwd perf" $BM_NET$DST
	ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on
	run_bench "UDP GRO fwd perf" $BM_NET$DST
	cleanup

	create_vxlan_pair
	ip netns exec $NS_DST ethtool -K veth$DST rx-gro-list on
	run_test "GRO frag list over UDP tunnel" $OL_NET$DST 1 1
	cleanup

	# use NAT to circumvent GRO FWD check
	create_vxlan_pair
	ip -n $NS_DST addr add dev $VXDEV$DST $OL_NET$DST_NAT/$SUFFIX
	ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on
	ip netns exec $NS_DST $IPT -t nat -I PREROUTING -d $OL_NET$DST_NAT \
					-j DNAT --to-destination $OL_NET$DST

	# load arp cache before running the test to reduce the amount of
	# stray traffic on top of the UDP tunnel
	ip netns exec $NS_SRC ping -q -c 1 $OL_NET$DST_NAT >/dev/null
	run_test "GRO fwd over UDP tunnel" $OL_NET$DST_NAT 1 1 $OL_NET$DST
	cleanup

	create_vxlan_pair
	run_bench "UDP tunnel fwd perf" $OL_NET$DST
	ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on
	run_bench "UDP tunnel GRO fwd perf" $OL_NET$DST
	cleanup
done

exit $ret
|
Loading…
Reference in New Issue
Block a user