2021-02-03 21:51:11 +08:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
|
|
|
|
2024-03-08 18:22:30 +08:00
|
|
|
#ifndef _NET_GRO_H
|
|
|
|
#define _NET_GRO_H
|
2021-02-03 21:51:11 +08:00
|
|
|
|
2021-03-19 02:42:23 +08:00
|
|
|
#include <linux/indirect_call_wrapper.h>
|
2021-11-16 01:05:51 +08:00
|
|
|
#include <linux/ip.h>
|
|
|
|
#include <linux/ipv6.h>
|
2021-11-17 18:01:30 +08:00
|
|
|
#include <net/ip6_checksum.h>
|
2021-11-16 01:05:51 +08:00
|
|
|
#include <linux/skbuff.h>
|
|
|
|
#include <net/udp.h>
|
2024-03-07 00:00:14 +08:00
|
|
|
#include <net/hotdata.h>
|
2021-03-19 02:42:23 +08:00
|
|
|
|
2021-11-16 01:05:51 +08:00
|
|
|
struct napi_gro_cb {
|
2023-06-02 00:14:09 +08:00
|
|
|
union {
|
|
|
|
struct {
|
|
|
|
/* Virtual address of skb_shinfo(skb)->frags[0].page + offset. */
|
|
|
|
void *frag0;
|
2021-11-16 01:05:51 +08:00
|
|
|
|
2023-06-02 00:14:09 +08:00
|
|
|
/* Length of frag0. */
|
|
|
|
unsigned int frag0_len;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct {
|
|
|
|
/* used in skb_gro_receive() slow path */
|
|
|
|
struct sk_buff *last;
|
|
|
|
|
|
|
|
/* jiffies when first packet was created/queued */
|
|
|
|
unsigned long age;
|
|
|
|
};
|
|
|
|
};
|
2021-11-16 01:05:51 +08:00
|
|
|
|
|
|
|
/* This indicates where we are processing relative to skb->data. */
|
|
|
|
int data_offset;
|
|
|
|
|
|
|
|
/* This is non-zero if the packet cannot be merged with the new skb. */
|
|
|
|
u16 flush;
|
|
|
|
|
|
|
|
/* Number of segments aggregated. */
|
|
|
|
u16 count;
|
|
|
|
|
xfrm: Support GRO for IPv4 ESP in UDP encapsulation
This patch enables the GRO codepath for IPv4 ESP in UDP encapsulated
packets. Decapsulation happens at L2 and saves a full round through
the stack for each packet. This is also needed to support HW offload
for ESP in UDP encapsulation.
Enabling this would imporove performance for ESP in UDP datapath, i.e
IPsec with NAT in between.
By default GRP for ESP-in-UDP is disabled for UDP sockets.
To enable this feature for an ESP socket, the following two options
need to be set:
1. enable ESP-in-UDP: (this is already set by an IKE daemon).
int type = UDP_ENCAP_ESPINUDP;
setsockopt(fd, SOL_UDP, UDP_ENCAP, &type, sizeof(type));
2. To enable GRO for ESP in UDP socket:
type = true;
setsockopt(fd, SOL_UDP, UDP_GRO, &type, sizeof(type));
Enabling ESP-in-UDP has the side effect of preventing the Linux stack from
seeing ESP packets at the L3 (when ESP OFFLOAD is disabled), as packets are
immediately decapsulated from UDP and decrypted.
This change may affect nftable rules that match on ESP packets at L3.
Also tcpdump won't see the ESP packet.
Developers/admins are advised to review and adapt any nftable rules
accordingly before enabling this feature to prevent potential rule breakage.
Also tcpdump will not see from ESP packets from a ESP in UDP flow, when this
is enabled.
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Co-developed-by: Antony Antony <antony.antony@secunet.com>
Signed-off-by: Antony Antony <antony.antony@secunet.com>
Reviewed-by: Eyal Birger <eyal.birger@gmail.com>
2023-10-04 21:05:27 +08:00
|
|
|
/* Used in ipv6_gro_receive() and foo-over-udp and esp-in-udp */
|
2022-02-04 19:28:37 +08:00
|
|
|
u16 proto;
|
2021-11-16 01:05:51 +08:00
|
|
|
|
2024-05-15 01:06:15 +08:00
|
|
|
u16 pad;
|
net: gro: move L3 flush checks to tcp_gro_receive and udp_gro_receive_segment
{inet,ipv6}_gro_receive functions perform flush checks (ttl, flags,
iph->id, ...) against all packets in a loop. These flush checks are used in
all merging UDP and TCP flows.
These checks need to be done only once and only against the found p skb,
since they only affect flush and not same_flow.
This patch leverages correct network header offsets from the cb for both
outer and inner network headers - allowing these checks to be done only
once, in tcp_gro_receive and udp_gro_receive_segment. As a result,
NAPI_GRO_CB(p)->flush is not used at all. In addition, flush_id checks are
more declarative and contained in inet_gro_flush, thus removing the need
for flush_id in napi_gro_cb.
This results in less parsing code for non-loop flush tests for TCP and UDP
flows.
To make sure results are not within noise range - I've made netfilter drop
all TCP packets, and measured CPU performance in GRO (in this case GRO is
responsible for about 50% of the CPU utilization).
perf top while replaying 64 parallel IP/TCP streams merging in GRO:
(gro_receive_network_flush is compiled inline to tcp_gro_receive)
net-next:
6.94% [kernel] [k] inet_gro_receive
3.02% [kernel] [k] tcp_gro_receive
patch applied:
4.27% [kernel] [k] tcp_gro_receive
4.22% [kernel] [k] inet_gro_receive
perf top while replaying 64 parallel IP/IP/TCP streams merging in GRO (same
results for any encapsulation, in this case inet_gro_receive is top
offender in net-next)
net-next:
10.09% [kernel] [k] inet_gro_receive
2.08% [kernel] [k] tcp_gro_receive
patch applied:
6.97% [kernel] [k] inet_gro_receive
3.68% [kernel] [k] tcp_gro_receive
Signed-off-by: Richard Gobert <richardbgobert@gmail.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://lore.kernel.org/r/20240509190819.2985-3-richardbgobert@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-05-10 03:08:18 +08:00
|
|
|
|
2022-02-17 16:07:55 +08:00
|
|
|
/* Used in napi_gro_cb::free */
|
|
|
|
#define NAPI_GRO_FREE 1
|
|
|
|
#define NAPI_GRO_FREE_STOLEN_HEAD 2
|
2022-02-04 19:28:37 +08:00
|
|
|
/* portion of the cb set to zero at every gro iteration */
|
|
|
|
struct_group(zeroed,
|
|
|
|
|
|
|
|
/* Start offset for remote checksum offload */
|
|
|
|
u16 gro_remcsum_start;
|
2021-11-16 01:05:51 +08:00
|
|
|
|
2022-02-04 19:28:37 +08:00
|
|
|
/* This is non-zero if the packet may be of the same flow. */
|
|
|
|
u8 same_flow:1;
|
2021-11-16 01:05:51 +08:00
|
|
|
|
2022-02-04 19:28:37 +08:00
|
|
|
/* Used in tunnel GRO receive */
|
|
|
|
u8 encap_mark:1;
|
2021-11-16 01:05:51 +08:00
|
|
|
|
2022-02-04 19:28:37 +08:00
|
|
|
/* GRO checksum is valid */
|
|
|
|
u8 csum_valid:1;
|
2021-11-16 01:05:51 +08:00
|
|
|
|
2022-02-04 19:28:37 +08:00
|
|
|
/* Number of checksums via CHECKSUM_UNNECESSARY */
|
|
|
|
u8 csum_cnt:3;
|
2021-11-16 01:05:51 +08:00
|
|
|
|
2022-02-04 19:28:37 +08:00
|
|
|
/* Free the skb? */
|
|
|
|
u8 free:2;
|
2021-11-16 01:05:51 +08:00
|
|
|
|
2022-02-04 19:28:37 +08:00
|
|
|
/* Used in foo-over-udp, set in udp[46]_gro_receive */
|
|
|
|
u8 is_ipv6:1;
|
2021-11-16 01:05:51 +08:00
|
|
|
|
2022-02-04 19:28:37 +08:00
|
|
|
/* Used in GRE, set in fou/gue_gro_receive */
|
|
|
|
u8 is_fou:1;
|
2021-11-16 01:05:51 +08:00
|
|
|
|
net: gro: move L3 flush checks to tcp_gro_receive and udp_gro_receive_segment
{inet,ipv6}_gro_receive functions perform flush checks (ttl, flags,
iph->id, ...) against all packets in a loop. These flush checks are used in
all merging UDP and TCP flows.
These checks need to be done only once and only against the found p skb,
since they only affect flush and not same_flow.
This patch leverages correct network header offsets from the cb for both
outer and inner network headers - allowing these checks to be done only
once, in tcp_gro_receive and udp_gro_receive_segment. As a result,
NAPI_GRO_CB(p)->flush is not used at all. In addition, flush_id checks are
more declarative and contained in inet_gro_flush, thus removing the need
for flush_id in napi_gro_cb.
This results in less parsing code for non-loop flush tests for TCP and UDP
flows.
To make sure results are not within noise range - I've made netfilter drop
all TCP packets, and measured CPU performance in GRO (in this case GRO is
responsible for about 50% of the CPU utilization).
perf top while replaying 64 parallel IP/TCP streams merging in GRO:
(gro_receive_network_flush is compiled inline to tcp_gro_receive)
net-next:
6.94% [kernel] [k] inet_gro_receive
3.02% [kernel] [k] tcp_gro_receive
patch applied:
4.27% [kernel] [k] tcp_gro_receive
4.22% [kernel] [k] inet_gro_receive
perf top while replaying 64 parallel IP/IP/TCP streams merging in GRO (same
results for any encapsulation, in this case inet_gro_receive is top
offender in net-next)
net-next:
10.09% [kernel] [k] inet_gro_receive
2.08% [kernel] [k] tcp_gro_receive
patch applied:
6.97% [kernel] [k] inet_gro_receive
3.68% [kernel] [k] tcp_gro_receive
Signed-off-by: Richard Gobert <richardbgobert@gmail.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://lore.kernel.org/r/20240509190819.2985-3-richardbgobert@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-05-10 03:08:18 +08:00
|
|
|
/* Used to determine if ipid_offset can be ignored */
|
|
|
|
u8 ip_fixedid:1;
|
2021-11-16 01:05:51 +08:00
|
|
|
|
2022-02-04 19:28:37 +08:00
|
|
|
/* Number of gro_receive callbacks this packet already went through */
|
|
|
|
u8 recursion_counter:4;
|
2021-11-16 01:05:51 +08:00
|
|
|
|
2022-02-04 19:28:37 +08:00
|
|
|
/* GRO is done by frag_list pointer chaining. */
|
|
|
|
u8 is_flist:1;
|
|
|
|
);
|
2021-11-16 01:05:51 +08:00
|
|
|
|
2024-05-15 01:06:15 +08:00
|
|
|
/* used to support CHECKSUM_COMPLETE for tunneling protocols */
|
|
|
|
__wsum csum;
|
|
|
|
|
net: gro: fix udp bad offset in socket lookup by adding {inner_}network_offset to napi_gro_cb
Commits a602456 ("udp: Add GRO functions to UDP socket") and 57c67ff ("udp:
additional GRO support") introduce incorrect usage of {ip,ipv6}_hdr in the
complete phase of gro. The functions always return skb->network_header,
which in the case of encapsulated packets at the gro complete phase, is
always set to the innermost L3 of the packet. That means that calling
{ip,ipv6}_hdr for skbs which completed the GRO receive phase (both in
gro_list and *_gro_complete) when parsing an encapsulated packet's _outer_
L3/L4 may return an unexpected value.
This incorrect usage leads to a bug in GRO's UDP socket lookup.
udp{4,6}_lib_lookup_skb functions use ip_hdr/ipv6_hdr respectively. These
*_hdr functions return network_header which will point to the innermost L3,
resulting in the wrong offset being used in __udp{4,6}_lib_lookup with
encapsulated packets.
This patch adds network_offset and inner_network_offset to napi_gro_cb, and
makes sure both are set correctly.
To fix the issue, network_offsets union is used inside napi_gro_cb, in
which both the outer and the inner network offsets are saved.
Reproduction example:
Endpoint configuration example (fou + local address bind)
# ip fou add port 6666 ipproto 4
# ip link add name tun1 type ipip remote 2.2.2.1 local 2.2.2.2 encap fou encap-dport 5555 encap-sport 6666 mode ipip
# ip link set tun1 up
# ip a add 1.1.1.2/24 dev tun1
Netperf TCP_STREAM result on net-next before patch is applied:
net-next main, GRO enabled:
$ netperf -H 1.1.1.2 -t TCP_STREAM -l 5
Recv Send Send
Socket Socket Message Elapsed
Size Size Size Time Throughput
bytes bytes bytes secs. 10^6bits/sec
131072 16384 16384 5.28 2.37
net-next main, GRO disabled:
$ netperf -H 1.1.1.2 -t TCP_STREAM -l 5
Recv Send Send
Socket Socket Message Elapsed
Size Size Size Time Throughput
bytes bytes bytes secs. 10^6bits/sec
131072 16384 16384 5.01 2745.06
patch applied, GRO enabled:
$ netperf -H 1.1.1.2 -t TCP_STREAM -l 5
Recv Send Send
Socket Socket Message Elapsed
Size Size Size Time Throughput
bytes bytes bytes secs. 10^6bits/sec
131072 16384 16384 5.01 2877.38
Fixes: a6024562ffd7 ("udp: Add GRO functions to UDP socket")
Signed-off-by: Richard Gobert <richardbgobert@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2024-04-30 22:35:54 +08:00
|
|
|
/* L3 offsets */
|
|
|
|
union {
|
|
|
|
struct {
|
|
|
|
u16 network_offset;
|
|
|
|
u16 inner_network_offset;
|
|
|
|
};
|
|
|
|
u16 network_offsets[2];
|
|
|
|
};
|
2021-11-16 01:05:51 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
#define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb)
|
|
|
|
|
|
|
|
#define GRO_RECURSION_LIMIT 15
|
|
|
|
static inline int gro_recursion_inc_test(struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
return ++NAPI_GRO_CB(skb)->recursion_counter == GRO_RECURSION_LIMIT;
|
|
|
|
}
|
|
|
|
|
|
|
|
typedef struct sk_buff *(*gro_receive_t)(struct list_head *, struct sk_buff *);
|
|
|
|
static inline struct sk_buff *call_gro_receive(gro_receive_t cb,
|
|
|
|
struct list_head *head,
|
|
|
|
struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
if (unlikely(gro_recursion_inc_test(skb))) {
|
|
|
|
NAPI_GRO_CB(skb)->flush |= 1;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
return cb(head, skb);
|
|
|
|
}
|
|
|
|
|
|
|
|
typedef struct sk_buff *(*gro_receive_sk_t)(struct sock *, struct list_head *,
|
|
|
|
struct sk_buff *);
|
|
|
|
static inline struct sk_buff *call_gro_receive_sk(gro_receive_sk_t cb,
|
|
|
|
struct sock *sk,
|
|
|
|
struct list_head *head,
|
|
|
|
struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
if (unlikely(gro_recursion_inc_test(skb))) {
|
|
|
|
NAPI_GRO_CB(skb)->flush |= 1;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
return cb(sk, head, skb);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline unsigned int skb_gro_offset(const struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
return NAPI_GRO_CB(skb)->data_offset;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline unsigned int skb_gro_len(const struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
return skb->len - NAPI_GRO_CB(skb)->data_offset;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void skb_gro_pull(struct sk_buff *skb, unsigned int len)
|
|
|
|
{
|
|
|
|
NAPI_GRO_CB(skb)->data_offset += len;
|
|
|
|
}
|
|
|
|
|
2024-03-02 03:37:38 +08:00
|
|
|
static inline void *skb_gro_header_fast(const struct sk_buff *skb,
|
2021-11-16 01:05:51 +08:00
|
|
|
unsigned int offset)
|
|
|
|
{
|
|
|
|
return NAPI_GRO_CB(skb)->frag0 + offset;
|
|
|
|
}
|
|
|
|
|
2024-03-02 03:37:37 +08:00
|
|
|
static inline bool skb_gro_may_pull(const struct sk_buff *skb,
|
|
|
|
unsigned int hlen)
|
2021-11-16 01:05:51 +08:00
|
|
|
{
|
2024-03-02 03:37:39 +08:00
|
|
|
return likely(hlen <= NAPI_GRO_CB(skb)->frag0_len);
|
2021-11-16 01:05:51 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen,
|
|
|
|
unsigned int offset)
|
|
|
|
{
|
|
|
|
if (!pskb_may_pull(skb, hlen))
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
return skb->data + offset;
|
|
|
|
}
|
|
|
|
|
2024-03-02 03:37:38 +08:00
|
|
|
/* Return a pointer to the header at @offset, requiring @hlen bytes.
 *
 * Uses the frag0 fast path when skb_gro_may_pull() allows it and falls back
 * to skb_gro_header_slow() (which may return NULL) otherwise.
 */
static inline void *skb_gro_header(struct sk_buff *skb, unsigned int hlen,
				   unsigned int offset)
{
	if (skb_gro_may_pull(skb, hlen))
		return skb_gro_header_fast(skb, offset);

	return skb_gro_header_slow(skb, hlen, offset);
}
|
|
|
|
|
2024-05-10 03:08:17 +08:00
|
|
|
static inline int skb_gro_receive_network_offset(const struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
return NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark];
|
|
|
|
}
|
|
|
|
|
2024-03-02 03:37:38 +08:00
|
|
|
static inline void *skb_gro_network_header(const struct sk_buff *skb)
|
2021-11-16 01:05:51 +08:00
|
|
|
{
|
2024-03-02 03:37:38 +08:00
|
|
|
if (skb_gro_may_pull(skb, skb_gro_offset(skb)))
|
2024-05-10 03:08:17 +08:00
|
|
|
return skb_gro_header_fast(skb, skb_gro_receive_network_offset(skb));
|
2024-03-02 03:37:38 +08:00
|
|
|
|
2024-05-10 03:08:17 +08:00
|
|
|
return skb->data + skb_gro_receive_network_offset(skb);
|
2021-11-16 01:05:51 +08:00
|
|
|
}
|
|
|
|
|
2024-03-02 03:37:38 +08:00
|
|
|
static inline __wsum inet_gro_compute_pseudo(const struct sk_buff *skb,
|
|
|
|
int proto)
|
2021-11-16 01:05:51 +08:00
|
|
|
{
|
|
|
|
const struct iphdr *iph = skb_gro_network_header(skb);
|
|
|
|
|
|
|
|
return csum_tcpudp_nofold(iph->saddr, iph->daddr,
|
|
|
|
skb_gro_len(skb), proto, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void skb_gro_postpull_rcsum(struct sk_buff *skb,
|
|
|
|
const void *start, unsigned int len)
|
|
|
|
{
|
|
|
|
if (NAPI_GRO_CB(skb)->csum_valid)
|
net: fix recent csum changes
Vladimir reported csum issues after my recent change in skb_postpull_rcsum()
Issue here is the following:
initial skb->csum is the csum of
[part to be pulled][rest of packet]
Old code:
skb->csum = csum_sub(skb->csum, csum_partial(pull, pull_length, 0));
New code:
skb->csum = ~csum_partial(pull, pull_length, ~skb->csum);
This is broken if the csum of [pulled part]
happens to be equal to skb->csum, because end
result of skb->csum is 0 in new code, instead
of being 0xffffffff
David Laight suggested to use
skb->csum = -csum_partial(pull, pull_length, -skb->csum);
I based my patches on existing code present in include/net/seg6.h,
update_csum_diff4() and update_csum_diff16() which might need
a similar fix.
I guess that my tests, mostly pulling 40 bytes of IPv6 header
were not providing enough entropy to hit this bug.
v2: added wsum_negate() to make sparse happy.
Fixes: 29c3002644bd ("net: optimize skb_postpull_rcsum()")
Fixes: 0bd28476f636 ("gro: optimize skb_gro_postpull_rcsum()")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Suggested-by: David Laight <David.Laight@ACULAB.COM>
Cc: David Lebrun <dlebrun@google.com>
Tested-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://lore.kernel.org/r/20211204045356.3659278-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-12-04 12:53:56 +08:00
|
|
|
NAPI_GRO_CB(skb)->csum = wsum_negate(csum_partial(start, len,
|
|
|
|
wsum_negate(NAPI_GRO_CB(skb)->csum)));
|
2021-11-16 01:05:51 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* GRO checksum functions. These are logical equivalents of the normal
|
|
|
|
* checksum functions (in skbuff.h) except that they operate on the GRO
|
|
|
|
* offsets and fields in sk_buff.
|
|
|
|
*/
|
|
|
|
|
|
|
|
__sum16 __skb_gro_checksum_complete(struct sk_buff *skb);
|
|
|
|
|
|
|
|
static inline bool skb_at_gro_remcsum_start(struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
return (NAPI_GRO_CB(skb)->gro_remcsum_start == skb_gro_offset(skb));
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb,
|
|
|
|
bool zero_okay,
|
|
|
|
__sum16 check)
|
|
|
|
{
|
|
|
|
return ((skb->ip_summed != CHECKSUM_PARTIAL ||
|
|
|
|
skb_checksum_start_offset(skb) <
|
|
|
|
skb_gro_offset(skb)) &&
|
|
|
|
!skb_at_gro_remcsum_start(skb) &&
|
|
|
|
NAPI_GRO_CB(skb)->csum_cnt == 0 &&
|
|
|
|
(!zero_okay || check));
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline __sum16 __skb_gro_checksum_validate_complete(struct sk_buff *skb,
|
|
|
|
__wsum psum)
|
|
|
|
{
|
|
|
|
if (NAPI_GRO_CB(skb)->csum_valid &&
|
|
|
|
!csum_fold(csum_add(psum, NAPI_GRO_CB(skb)->csum)))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
NAPI_GRO_CB(skb)->csum = psum;
|
|
|
|
|
|
|
|
return __skb_gro_checksum_complete(skb);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
if (NAPI_GRO_CB(skb)->csum_cnt > 0) {
|
|
|
|
/* Consume a checksum from CHECKSUM_UNNECESSARY */
|
|
|
|
NAPI_GRO_CB(skb)->csum_cnt--;
|
|
|
|
} else {
|
|
|
|
/* Update skb for CHECKSUM_UNNECESSARY and csum_level when we
|
|
|
|
* verified a new top level checksum or an encapsulated one
|
|
|
|
* during GRO. This saves work if we fallback to normal path.
|
|
|
|
*/
|
|
|
|
__skb_incr_checksum_unnecessary(skb);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Core GRO checksum validation.
 *
 * Evaluates to a __sum16 that is 0 on success. Validation is performed via
 * compute_pseudo(skb, proto) only when
 * __skb_gro_checksum_validate_needed() says so; on a zero result
 * skb_gro_incr_csum_unnecessary() is called. Note that @skb is expanded
 * more than once.
 */
#define __skb_gro_checksum_validate(skb, proto, zero_okay, check,	\
				    compute_pseudo)			\
({									\
	__sum16 __ret = 0;						\
	if (__skb_gro_checksum_validate_needed(skb, zero_okay, check))	\
		__ret = __skb_gro_checksum_validate_complete(skb,	\
				compute_pseudo(skb, proto));		\
	if (!__ret)							\
		skb_gro_incr_csum_unnecessary(skb);			\
	__ret;								\
})

/* Validate with zero_okay = false and check = 0. */
#define skb_gro_checksum_validate(skb, proto, compute_pseudo)		\
	__skb_gro_checksum_validate(skb, proto, false, 0, compute_pseudo)

/* Validate with zero_okay = true and a caller-supplied @check value. */
#define skb_gro_checksum_validate_zero_check(skb, proto, check,		\
					     compute_pseudo)		\
	__skb_gro_checksum_validate(skb, proto, true, check, compute_pseudo)

/* Validate with proto 0 and null_compute_pseudo as the pseudo-header helper. */
#define skb_gro_checksum_simple_validate(skb)				\
	__skb_gro_checksum_validate(skb, 0, false, 0, null_compute_pseudo)
|
|
|
|
|
|
|
|
static inline bool __skb_gro_checksum_convert_check(struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
return (NAPI_GRO_CB(skb)->csum_cnt == 0 &&
|
|
|
|
!NAPI_GRO_CB(skb)->csum_valid);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void __skb_gro_checksum_convert(struct sk_buff *skb,
|
|
|
|
__wsum pseudo)
|
|
|
|
{
|
|
|
|
NAPI_GRO_CB(skb)->csum = ~pseudo;
|
|
|
|
NAPI_GRO_CB(skb)->csum_valid = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* If __skb_gro_checksum_convert_check() permits it, convert the GRO checksum
 * using compute_pseudo(skb, proto). @skb is expanded more than once.
 */
#define skb_gro_checksum_try_convert(skb, proto, compute_pseudo)	\
do {									\
	if (__skb_gro_checksum_convert_check(skb))			\
		__skb_gro_checksum_convert(skb,				\
				compute_pseudo(skb, proto));		\
} while (0)
|
|
|
|
|
|
|
|
struct gro_remcsum {
|
|
|
|
int offset;
|
|
|
|
__wsum delta;
|
|
|
|
};
|
|
|
|
|
|
|
|
static inline void skb_gro_remcsum_init(struct gro_remcsum *grc)
|
|
|
|
{
|
|
|
|
grc->offset = 0;
|
|
|
|
grc->delta = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void *skb_gro_remcsum_process(struct sk_buff *skb, void *ptr,
|
|
|
|
unsigned int off, size_t hdrlen,
|
|
|
|
int start, int offset,
|
|
|
|
struct gro_remcsum *grc,
|
|
|
|
bool nopartial)
|
|
|
|
{
|
|
|
|
__wsum delta;
|
|
|
|
size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
|
|
|
|
|
|
|
|
BUG_ON(!NAPI_GRO_CB(skb)->csum_valid);
|
|
|
|
|
|
|
|
if (!nopartial) {
|
|
|
|
NAPI_GRO_CB(skb)->gro_remcsum_start = off + hdrlen + start;
|
|
|
|
return ptr;
|
|
|
|
}
|
|
|
|
|
2022-08-23 15:10:49 +08:00
|
|
|
ptr = skb_gro_header(skb, off + plen, off);
|
|
|
|
if (!ptr)
|
|
|
|
return NULL;
|
2021-11-16 01:05:51 +08:00
|
|
|
|
|
|
|
delta = remcsum_adjust(ptr + hdrlen, NAPI_GRO_CB(skb)->csum,
|
|
|
|
start, offset);
|
|
|
|
|
|
|
|
/* Adjust skb->csum since we changed the packet */
|
|
|
|
NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta);
|
|
|
|
|
|
|
|
grc->offset = off + hdrlen + offset;
|
|
|
|
grc->delta = delta;
|
|
|
|
|
|
|
|
return ptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb,
|
|
|
|
struct gro_remcsum *grc)
|
|
|
|
{
|
|
|
|
void *ptr;
|
|
|
|
size_t plen = grc->offset + sizeof(u16);
|
|
|
|
|
|
|
|
if (!grc->delta)
|
|
|
|
return;
|
|
|
|
|
2022-08-23 15:10:49 +08:00
|
|
|
ptr = skb_gro_header(skb, plen, grc->offset);
|
|
|
|
if (!ptr)
|
|
|
|
return;
|
2021-11-16 01:05:51 +08:00
|
|
|
|
|
|
|
remcsum_unadjust((__sum16 *)ptr, grc->delta);
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_XFRM_OFFLOAD
|
|
|
|
static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush)
|
|
|
|
{
|
|
|
|
if (PTR_ERR(pp) != -EINPROGRESS)
|
|
|
|
NAPI_GRO_CB(skb)->flush |= flush;
|
|
|
|
}
|
|
|
|
static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb,
|
|
|
|
struct sk_buff *pp,
|
|
|
|
int flush,
|
|
|
|
struct gro_remcsum *grc)
|
|
|
|
{
|
|
|
|
if (PTR_ERR(pp) != -EINPROGRESS) {
|
|
|
|
NAPI_GRO_CB(skb)->flush |= flush;
|
|
|
|
skb_gro_remcsum_cleanup(skb, grc);
|
|
|
|
skb->remcsum_offload = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush)
|
|
|
|
{
|
|
|
|
NAPI_GRO_CB(skb)->flush |= flush;
|
|
|
|
}
|
|
|
|
static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb,
|
|
|
|
struct sk_buff *pp,
|
|
|
|
int flush,
|
|
|
|
struct gro_remcsum *grc)
|
|
|
|
{
|
|
|
|
NAPI_GRO_CB(skb)->flush |= flush;
|
|
|
|
skb_gro_remcsum_cleanup(skb, grc);
|
|
|
|
skb->remcsum_offload = 0;
|
|
|
|
}
|
|
|
|
#endif
|
2021-03-19 02:42:23 +08:00
|
|
|
|
2021-02-03 21:51:11 +08:00
|
|
|
INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *,
|
|
|
|
struct sk_buff *));
|
|
|
|
INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
|
|
|
|
INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
|
|
|
|
struct sk_buff *));
|
|
|
|
INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
|
2021-03-19 02:42:30 +08:00
|
|
|
|
2021-11-16 01:05:51 +08:00
|
|
|
INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *,
|
|
|
|
struct sk_buff *));
|
|
|
|
INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int));
|
|
|
|
|
|
|
|
INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp6_gro_receive(struct list_head *,
|
|
|
|
struct sk_buff *));
|
|
|
|
INDIRECT_CALLABLE_DECLARE(int udp6_gro_complete(struct sk_buff *, int));
|
|
|
|
|
2021-03-19 02:42:30 +08:00
|
|
|
/* INDIRECT_CALL_INET() wrapper with GRO recursion accounting: when the
 * recursion test trips, the skb is marked for flush and the expression
 * evaluates to NULL instead of making the call.
 */
#define indirect_call_gro_receive_inet(cb, f2, f1, head, skb)	\
({								\
	unlikely(gro_recursion_inc_test(skb)) ?			\
		NAPI_GRO_CB(skb)->flush |= 1, NULL :		\
		INDIRECT_CALL_INET(cb, f2, f1, head, skb);	\
})
|
|
|
|
|
2021-11-16 01:05:51 +08:00
|
|
|
struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
|
|
|
|
struct udphdr *uh, struct sock *sk);
|
|
|
|
int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup);
|
|
|
|
|
|
|
|
static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
struct udphdr *uh;
|
|
|
|
unsigned int hlen, off;
|
|
|
|
|
|
|
|
off = skb_gro_offset(skb);
|
|
|
|
hlen = off + sizeof(*uh);
|
2022-08-23 15:10:49 +08:00
|
|
|
uh = skb_gro_header(skb, hlen, off);
|
2021-11-16 01:05:51 +08:00
|
|
|
|
|
|
|
return uh;
|
|
|
|
}
|
|
|
|
|
2024-03-02 03:37:38 +08:00
|
|
|
static inline __wsum ip6_gro_compute_pseudo(const struct sk_buff *skb,
|
|
|
|
int proto)
|
2021-11-16 01:05:51 +08:00
|
|
|
{
|
|
|
|
const struct ipv6hdr *iph = skb_gro_network_header(skb);
|
|
|
|
|
|
|
|
return ~csum_unfold(csum_ipv6_magic(&iph->saddr, &iph->daddr,
|
|
|
|
skb_gro_len(skb), proto, 0));
|
|
|
|
}
|
|
|
|
|
net: gro: move L3 flush checks to tcp_gro_receive and udp_gro_receive_segment
{inet,ipv6}_gro_receive functions perform flush checks (ttl, flags,
iph->id, ...) against all packets in a loop. These flush checks are used in
all merging UDP and TCP flows.
These checks need to be done only once and only against the found p skb,
since they only affect flush and not same_flow.
This patch leverages correct network header offsets from the cb for both
outer and inner network headers - allowing these checks to be done only
once, in tcp_gro_receive and udp_gro_receive_segment. As a result,
NAPI_GRO_CB(p)->flush is not used at all. In addition, flush_id checks are
more declarative and contained in inet_gro_flush, thus removing the need
for flush_id in napi_gro_cb.
This results in less parsing code for non-loop flush tests for TCP and UDP
flows.
To make sure results are not within noise range - I've made netfilter drop
all TCP packets, and measured CPU performance in GRO (in this case GRO is
responsible for about 50% of the CPU utilization).
perf top while replaying 64 parallel IP/TCP streams merging in GRO:
(gro_receive_network_flush is compiled inline to tcp_gro_receive)
net-next:
6.94% [kernel] [k] inet_gro_receive
3.02% [kernel] [k] tcp_gro_receive
patch applied:
4.27% [kernel] [k] tcp_gro_receive
4.22% [kernel] [k] inet_gro_receive
perf top while replaying 64 parallel IP/IP/TCP streams merging in GRO (same
results for any encapsulation, in this case inet_gro_receive is top
offender in net-next)
net-next:
10.09% [kernel] [k] inet_gro_receive
2.08% [kernel] [k] tcp_gro_receive
patch applied:
6.97% [kernel] [k] inet_gro_receive
3.68% [kernel] [k] tcp_gro_receive
Signed-off-by: Richard Gobert <richardbgobert@gmail.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://lore.kernel.org/r/20240509190819.2985-3-richardbgobert@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-05-10 03:08:18 +08:00
|
|
|
/* IPv4 flush check between two headers of the same flow.
 *
 * @iph, @iph2: the two IPv4 headers being compared
 * @p:          skb whose GRO cb supplies count and ip_fixedid
 * @outer:      when true and DF is set, the IP ID comparison is skipped
 *
 * Returns non-zero when the packets must not be merged. May set
 * ip_fixedid in @p's cb as a side effect.
 */
static inline int inet_gro_flush(const struct iphdr *iph, const struct iphdr *iph2,
				 struct sk_buff *p, bool outer)
{
	struct napi_gro_cb *cb = NAPI_GRO_CB(p);
	/* One 32-bit load starting at the id field: the top 16 bits are used
	 * as the IP ID, IP_DF is tested in the low half.
	 */
	const u32 word = ntohl(*(__be32 *)&iph->id);
	const u32 word2 = ntohl(*(__be32 *)&iph2->id);
	const u16 ipid_offset = (word >> 16) - (word2 >> 16);
	const u16 count = cb->count;
	const u32 df = word & IP_DF;
	const u32 df2 = word2 & IP_DF;
	int flush;

	/* All fields must match except length and checksum. */
	flush = (iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | (df ^ df2);

	if (flush | (outer && df))
		return flush;

	/* When we receive our second frame we can make a decision on if we
	 * continue this flow as an atomic flow with a fixed ID or if we use
	 * an incrementing ID.
	 */
	if (count == 1 && df && !ipid_offset)
		cb->ip_fixedid = true;

	/* Fixed-ID flows expect an offset of 0, incrementing flows expect it
	 * to equal count; anything else is non-zero, i.e. flush.
	 */
	return ipid_offset ^ (count * !cb->ip_fixedid);
}
|
|
|
|
|
|
|
|
static inline int ipv6_gro_flush(const struct ipv6hdr *iph, const struct ipv6hdr *iph2)
|
|
|
|
{
|
|
|
|
/* <Version:4><Traffic_Class:8><Flow_Label:20> */
|
|
|
|
__be32 first_word = *(__be32 *)iph ^ *(__be32 *)iph2;
|
|
|
|
|
|
|
|
/* Flush if Traffic Class fields are different. */
|
|
|
|
return !!((first_word & htonl(0x0FF00000)) |
|
|
|
|
(__force __be32)(iph->hop_limit ^ iph2->hop_limit));
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int __gro_receive_network_flush(const void *th, const void *th2,
|
|
|
|
struct sk_buff *p, const u16 diff,
|
|
|
|
bool outer)
|
|
|
|
{
|
|
|
|
const void *nh = th - diff;
|
|
|
|
const void *nh2 = th2 - diff;
|
|
|
|
|
|
|
|
if (((struct iphdr *)nh)->version == 6)
|
|
|
|
return ipv6_gro_flush(nh, nh2);
|
|
|
|
else
|
|
|
|
return inet_gro_flush(nh, nh2, p, outer);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int gro_receive_network_flush(const void *th, const void *th2,
|
|
|
|
struct sk_buff *p)
|
|
|
|
{
|
|
|
|
const bool encap_mark = NAPI_GRO_CB(p)->encap_mark;
|
|
|
|
int off = skb_transport_offset(p);
|
|
|
|
int flush;
|
|
|
|
|
|
|
|
flush = __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->network_offset, encap_mark);
|
|
|
|
if (encap_mark)
|
|
|
|
flush |= __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->inner_network_offset, false);
|
|
|
|
|
|
|
|
return flush;
|
|
|
|
}
|
|
|
|
|
2021-11-16 01:05:53 +08:00
|
|
|
int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb);
|
2024-05-02 16:44:42 +08:00
|
|
|
int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb);
|
2021-11-16 01:05:53 +08:00
|
|
|
|
2021-11-16 01:05:54 +08:00
|
|
|
/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
|
|
|
|
static inline void gro_normal_list(struct napi_struct *napi)
|
|
|
|
{
|
|
|
|
if (!napi->rx_count)
|
|
|
|
return;
|
|
|
|
netif_receive_skb_list_internal(&napi->rx_list);
|
|
|
|
INIT_LIST_HEAD(&napi->rx_list);
|
|
|
|
napi->rx_count = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
|
|
|
|
* pass the whole batch up to the stack.
|
|
|
|
*/
|
|
|
|
static inline void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs)
|
|
|
|
{
|
|
|
|
list_add_tail(&skb->list, &napi->rx_list);
|
|
|
|
napi->rx_count += segs;
|
2024-03-07 00:00:14 +08:00
|
|
|
if (napi->rx_count >= READ_ONCE(net_hotdata.gro_normal_batch))
|
2021-11-16 01:05:54 +08:00
|
|
|
gro_normal_list(napi);
|
|
|
|
}
|
|
|
|
|
2023-07-27 23:33:56 +08:00
|
|
|
/* This function is the alternative of 'inet_iif' and 'inet_sdif'
|
|
|
|
* functions in case we can not rely on fields of IPCB.
|
|
|
|
*
|
|
|
|
* The caller must verify skb_valid_dst(skb) is false and skb->dev is initialized.
|
|
|
|
* The caller must hold the RCU read lock.
|
|
|
|
*/
|
|
|
|
static inline void inet_get_iif_sdif(const struct sk_buff *skb, int *iif, int *sdif)
|
|
|
|
{
|
|
|
|
*iif = inet_iif(skb) ?: skb->dev->ifindex;
|
|
|
|
*sdif = 0;
|
|
|
|
|
|
|
|
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
|
|
|
|
if (netif_is_l3_slave(skb->dev)) {
|
|
|
|
struct net_device *master = netdev_master_upper_dev_get_rcu(skb->dev);
|
|
|
|
|
|
|
|
*sdif = *iif;
|
|
|
|
*iif = master ? master->ifindex : 0;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
/* This function is the alternative of 'inet6_iif' and 'inet6_sdif'
|
|
|
|
* functions in case we can not rely on fields of IP6CB.
|
|
|
|
*
|
|
|
|
* The caller must verify skb_valid_dst(skb) is false and skb->dev is initialized.
|
|
|
|
* The caller must hold the RCU read lock.
|
|
|
|
*/
|
|
|
|
static inline void inet6_get_iif_sdif(const struct sk_buff *skb, int *iif, int *sdif)
|
|
|
|
{
|
|
|
|
/* using skb->dev->ifindex because skb_dst(skb) is not initialized */
|
|
|
|
*iif = skb->dev->ifindex;
|
|
|
|
*sdif = 0;
|
|
|
|
|
|
|
|
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
|
|
|
|
if (netif_is_l3_slave(skb->dev)) {
|
|
|
|
struct net_device *master = netdev_master_upper_dev_get_rcu(skb->dev);
|
|
|
|
|
|
|
|
*sdif = *iif;
|
|
|
|
*iif = master ? master->ifindex : 0;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2024-03-08 18:22:30 +08:00
|
|
|
struct packet_offload *gro_find_receive_by_type(__be16 type);
|
|
|
|
struct packet_offload *gro_find_complete_by_type(__be16 type);
|
|
|
|
|
|
|
|
#endif /* _NET_GRO_H */
|