2019-05-29 22:12:43 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2013-06-18 08:49:56 +08:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2013 Nicira, Inc.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
|
|
|
|
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/skbuff.h>
|
|
|
|
#include <linux/netdevice.h>
|
|
|
|
#include <linux/in.h>
|
|
|
|
#include <linux/if_arp.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/in6.h>
|
|
|
|
#include <linux/inetdevice.h>
|
|
|
|
#include <linux/netfilter_ipv4.h>
|
|
|
|
#include <linux/etherdevice.h>
|
|
|
|
#include <linux/if_ether.h>
|
|
|
|
#include <linux/if_vlan.h>
|
2015-07-21 16:44:01 +08:00
|
|
|
#include <linux/static_key.h>
|
2013-06-18 08:49:56 +08:00
|
|
|
|
|
|
|
#include <net/ip.h>
|
|
|
|
#include <net/icmp.h>
|
|
|
|
#include <net/protocol.h>
|
|
|
|
#include <net/ip_tunnels.h>
|
2016-05-19 00:06:17 +08:00
|
|
|
#include <net/ip6_tunnel.h>
|
2020-08-05 21:39:31 +08:00
|
|
|
#include <net/ip6_checksum.h>
|
2013-06-18 08:49:56 +08:00
|
|
|
#include <net/arp.h>
|
|
|
|
#include <net/checksum.h>
|
|
|
|
#include <net/dsfield.h>
|
|
|
|
#include <net/inet_ecn.h>
|
|
|
|
#include <net/xfrm.h>
|
|
|
|
#include <net/net_namespace.h>
|
|
|
|
#include <net/netns/generic.h>
|
|
|
|
#include <net/rtnetlink.h>
|
2015-09-23 00:12:11 +08:00
|
|
|
#include <net/dst_metadata.h>
|
2019-11-06 17:01:05 +08:00
|
|
|
#include <net/geneve.h>
|
2019-11-06 17:01:06 +08:00
|
|
|
#include <net/vxlan.h>
|
2019-11-06 17:01:07 +08:00
|
|
|
#include <net/erspan.h>
|
2013-06-18 08:49:56 +08:00
|
|
|
|
2016-05-19 00:06:13 +08:00
|
|
|
/*
 * IPv4 tunnel encapsulation ops table: MAX_IPTUN_ENCAP_OPS slots, each an
 * __rcu-annotated pointer to a const struct ip_tunnel_encap_ops.
 */
const struct ip_tunnel_encap_ops __rcu *iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
|
|
|
|
EXPORT_SYMBOL(iptun_encaps);
|
|
|
|
|
2016-05-19 00:06:17 +08:00
|
|
|
/*
 * IPv6 tunnel encapsulation ops table: MAX_IPTUN_ENCAP_OPS slots, each an
 * __rcu-annotated pointer to a const struct ip6_tnl_encap_ops.
 */
const struct ip6_tnl_encap_ops __rcu *ip6tun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
|
|
|
|
EXPORT_SYMBOL(ip6tun_encaps);
|
|
|
|
|
2015-12-25 06:34:54 +08:00
|
|
|
/*
 * iptunnel_xmit - prepend an outer IPv4 header to @skb and pass the result
 * to ip_local_out().
 * @sk:    socket handed through to ip_local_out()
 * @rt:    route; its dst is attached to @skb and its device selects the netns
 * @skb:   packet to encapsulate
 * @src:   outer source address
 * @dst:   outer destination address
 * @proto: outer protocol field
 * @tos:   outer TOS field
 * @ttl:   outer TTL field
 * @df:    frag_off value, used unless ip_mtu_locked() is true for the route
 * @xnet:  forwarded to skb_scrub_packet()
 *
 * If skb->dev was set on entry, iptunnel_xmit_stats() is called for that
 * device with the inner payload length, or with 0 when net_xmit_eval()
 * reports a non-zero result for the transmit.
 */
void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
		   __be32 src, __be32 dst, __u8 proto,
		   __u8 tos, __u8 ttl, __be16 df, bool xnet)
{
	/* Both values are sampled before the skb is modified below. */
	int payload_len = skb->len - skb_inner_network_offset(skb);
	struct net_device *stats_dev = skb->dev;
	struct net *net = dev_net(rt->dst.dev);
	struct iphdr *outer;
	int segs, rc;

	skb_scrub_packet(skb, xnet);
	skb_clear_hash_if_not_l4(skb);
	skb_dst_set(skb, &rt->dst);
	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));

	/* Make room for the outer header and point the network header at it. */
	skb_push(skb, sizeof(*outer));
	skb_reset_network_header(skb);
	outer = ip_hdr(skb);

	outer->version = 4;
	outer->ihl = sizeof(*outer) >> 2;
	outer->tos = tos;
	outer->ttl = ttl;
	outer->protocol = proto;
	outer->saddr = src;
	outer->daddr = dst;

	/* A locked MTU on the route forces frag_off to 0, ignoring @df. */
	if (ip_mtu_locked(&rt->dst))
		outer->frag_off = 0;
	else
		outer->frag_off = df;

	/* Use at least one segment when selecting the IP ID. */
	segs = skb_shinfo(skb)->gso_segs;
	if (!segs)
		segs = 1;
	__ip_select_ident(net, outer, segs);

	rc = ip_local_out(net, sk, skb);
	if (!stats_dev)
		return;

	if (unlikely(net_xmit_eval(rc)))
		payload_len = 0;
	iptunnel_xmit_stats(stats_dev, payload_len);
}
|
|
|
|
EXPORT_SYMBOL_GPL(iptunnel_xmit);
|
2013-06-18 08:50:02 +08:00
|
|
|
|
2016-04-05 20:47:12 +08:00
|
|
|
int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len,
|
|
|
|
__be16 inner_proto, bool raw_proto, bool xnet)
|
2013-06-18 08:50:02 +08:00
|
|
|
{
|
|
|
|
if (unlikely(!pskb_may_pull(skb, hdr_len)))
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
skb_pull_rcsum(skb, hdr_len);
|
|
|
|
|
2016-04-05 20:47:12 +08:00
|
|
|
if (!raw_proto && inner_proto == htons(ETH_P_TEB)) {
|
2014-10-17 16:53:23 +08:00
|
|
|
struct ethhdr *eh;
|
2013-06-18 08:50:02 +08:00
|
|
|
|
|
|
|
if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2014-10-17 16:53:23 +08:00
|
|
|
eh = (struct ethhdr *)skb->data;
|
2015-05-05 05:33:59 +08:00
|
|
|
if (likely(eth_proto_is_802_3(eh->h_proto)))
|
2013-06-18 08:50:02 +08:00
|
|
|
skb->protocol = eh->h_proto;
|
|
|
|
else
|
|
|
|
skb->protocol = htons(ETH_P_802_2);
|
|
|
|
|
|
|
|
} else {
|
|
|
|
skb->protocol = inner_proto;
|
|
|
|
}
|
|
|
|
|
2013-12-16 14:12:18 +08:00
|
|
|
skb_clear_hash_if_not_l4(skb);
|
2018-11-09 07:18:04 +08:00
|
|
|
__vlan_hwaccel_clear_tag(skb);
|
2013-06-18 08:50:02 +08:00
|
|
|
skb_set_queue_mapping(skb, 0);
|
2016-02-18 18:22:52 +08:00
|
|
|
skb_scrub_packet(skb, xnet);
|
2016-03-20 00:32:02 +08:00
|
|
|
|
|
|
|
return iptunnel_pull_offloads(skb);
|
2013-06-18 08:50:02 +08:00
|
|
|
}
|
2016-04-05 20:47:12 +08:00
|
|
|
EXPORT_SYMBOL_GPL(__iptunnel_pull_header);
|
2013-10-20 02:42:55 +08:00
|
|
|
|
2015-09-23 00:12:11 +08:00
|
|
|
struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
|
|
|
|
gfp_t flags)
|
|
|
|
{
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 23:23:53 +08:00
|
|
|
IP_TUNNEL_DECLARE_FLAGS(tun_flags) = { };
|
2015-09-23 00:12:11 +08:00
|
|
|
struct metadata_dst *res;
|
|
|
|
struct ip_tunnel_info *dst, *src;
|
|
|
|
|
2017-06-24 04:11:58 +08:00
|
|
|
if (!md || md->type != METADATA_IP_TUNNEL ||
|
|
|
|
md->u.tun_info.mode & IP_TUNNEL_INFO_TX)
|
2015-09-23 00:12:11 +08:00
|
|
|
return NULL;
|
|
|
|
|
2019-11-06 17:01:03 +08:00
|
|
|
src = &md->u.tun_info;
|
|
|
|
res = metadata_dst_alloc(src->options_len, METADATA_IP_TUNNEL, flags);
|
2015-09-23 00:12:11 +08:00
|
|
|
if (!res)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
dst = &res->u.tun_info;
|
|
|
|
dst->key.tun_id = src->key.tun_id;
|
|
|
|
if (src->mode & IP_TUNNEL_INFO_IPV6)
|
|
|
|
memcpy(&dst->key.u.ipv6.dst, &src->key.u.ipv6.src,
|
|
|
|
sizeof(struct in6_addr));
|
|
|
|
else
|
|
|
|
dst->key.u.ipv4.dst = src->key.u.ipv4.src;
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 23:23:53 +08:00
|
|
|
ip_tunnel_flags_copy(dst->key.tun_flags, src->key.tun_flags);
|
2015-09-23 00:12:11 +08:00
|
|
|
dst->mode = src->mode | IP_TUNNEL_INFO_TX;
|
2019-11-06 17:01:03 +08:00
|
|
|
ip_tunnel_info_opts_set(dst, ip_tunnel_info_opts(src),
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 23:23:53 +08:00
|
|
|
src->options_len, tun_flags);
|
2015-09-23 00:12:11 +08:00
|
|
|
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(iptunnel_metadata_reply);
|
|
|
|
|
2016-04-15 03:33:37 +08:00
|
|
|
int iptunnel_handle_offloads(struct sk_buff *skb,
|
|
|
|
int gso_type_mask)
|
2013-10-20 02:42:55 +08:00
|
|
|
{
|
|
|
|
int err;
|
|
|
|
|
|
|
|
if (likely(!skb->encapsulation)) {
|
|
|
|
skb_reset_inner_headers(skb);
|
|
|
|
skb->encapsulation = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (skb_is_gso(skb)) {
|
2016-05-01 01:19:29 +08:00
|
|
|
err = skb_header_unclone(skb, GFP_ATOMIC);
|
2013-10-20 02:42:55 +08:00
|
|
|
if (unlikely(err))
|
2016-04-15 03:33:37 +08:00
|
|
|
return err;
|
2013-10-20 02:42:55 +08:00
|
|
|
skb_shinfo(skb)->gso_type |= gso_type_mask;
|
2016-04-15 03:33:37 +08:00
|
|
|
return 0;
|
2013-10-20 02:42:55 +08:00
|
|
|
}
|
|
|
|
|
2016-02-12 05:02:31 +08:00
|
|
|
if (skb->ip_summed != CHECKSUM_PARTIAL) {
|
2013-10-20 02:42:55 +08:00
|
|
|
skb->ip_summed = CHECKSUM_NONE;
|
2016-02-12 05:02:31 +08:00
|
|
|
/* We clear encapsulation here to prevent badly-written
|
|
|
|
* drivers potentially deciding to offload an inner checksum
|
|
|
|
* if we set CHECKSUM_PARTIAL on the outer header.
|
|
|
|
* This should go away when the drivers are all fixed.
|
|
|
|
*/
|
2016-02-12 04:48:04 +08:00
|
|
|
skb->encapsulation = 0;
|
|
|
|
}
|
2013-10-20 02:42:55 +08:00
|
|
|
|
2016-04-15 03:33:37 +08:00
|
|
|
return 0;
|
2013-10-20 02:42:55 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
|
2014-02-20 15:14:23 +08:00
|
|
|
|
tunnels: PMTU discovery support for directly bridged IP packets
It's currently possible to bridge Ethernet tunnels carrying IP
packets directly to external interfaces without assigning them
addresses and routes on the bridged network itself: this is the case
for UDP tunnels bridged with a standard bridge or by Open vSwitch.
PMTU discovery is currently broken with those configurations, because
the encapsulation effectively decreases the MTU of the link, and
while we are able to account for this using PMTU discovery on the
lower layer, we don't have a way to relay ICMP or ICMPv6 messages
needed by the sender, because we don't have valid routes to it.
On the other hand, as a tunnel endpoint, we can't fragment packets
as a general approach: this is for instance clearly forbidden for
VXLAN by RFC 7348, section 4.3:
VTEPs MUST NOT fragment VXLAN packets. Intermediate routers may
fragment encapsulated VXLAN packets due to the larger frame size.
The destination VTEP MAY silently discard such VXLAN fragments.
The same paragraph recommends that the MTU over the physical network
accommodates for encapsulations, but this isn't a practical option for
complex topologies, especially for typical Open vSwitch use cases.
Further, it states that:
Other techniques like Path MTU discovery (see [RFC1191] and
[RFC1981]) MAY be used to address this requirement as well.
Now, PMTU discovery already works for routed interfaces, we get
route exceptions created by the encapsulation device as they receive
ICMP Fragmentation Needed and ICMPv6 Packet Too Big messages, and
we already rebuild those messages with the appropriate MTU and route
them back to the sender.
Add the missing bits for bridged cases:
- checks in skb_tunnel_check_pmtu() to understand if it's appropriate
to trigger a reply according to RFC 1122 section 3.2.2 for ICMP and
RFC 4443 section 2.4 for ICMPv6. This function is already called by
UDP tunnels
- a new function generating those ICMP or ICMPv6 replies. We can't
reuse icmp_send() and icmp6_send() as we don't see the sender as a
valid destination. This doesn't need to be generic, as we don't
cover any other type of ICMP errors given that we only provide an
encapsulation function to the sender
While at it, make the MTU check in skb_tunnel_check_pmtu() accurate:
we might receive GSO buffers here, and the passed headroom already
includes the inner MAC length, so we don't have to account for it
a second time (that would imply three MAC headers on the wire, but
there are just two).
This issue became visible while bridging IPv6 packets with 4500 bytes
of payload over GENEVE using IPv4 with a PMTU of 4000. Given the 50
bytes of encapsulation headroom, we would advertise MTU as 3950, and
we would reject fragmented IPv6 datagrams of 3958 bytes size on the
wire. We're exclusively dealing with network MTU here, though, so we
could get Ethernet frames up to 3964 octets in that case.
v2:
- moved skb_tunnel_check_pmtu() to ip_tunnel_core.c (David Ahern)
- split IPv4/IPv6 functions (David Ahern)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-08-04 13:53:43 +08:00
|
|
|
/**
|
|
|
|
* iptunnel_pmtud_build_icmp() - Build ICMP error message for PMTUD
|
|
|
|
* @skb: Original packet with L2 header
|
|
|
|
* @mtu: MTU value for ICMP error
|
|
|
|
*
|
|
|
|
* Return: length on success, negative error code if message couldn't be built.
|
|
|
|
*/
|
|
|
|
static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu)
|
|
|
|
{
|
|
|
|
const struct iphdr *iph = ip_hdr(skb);
|
|
|
|
struct icmphdr *icmph;
|
|
|
|
struct iphdr *niph;
|
|
|
|
struct ethhdr eh;
|
|
|
|
int len, err;
|
|
|
|
|
|
|
|
if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr)))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
|
|
|
|
pskb_pull(skb, ETH_HLEN);
|
|
|
|
skb_reset_network_header(skb);
|
|
|
|
|
|
|
|
err = pskb_trim(skb, 576 - sizeof(*niph) - sizeof(*icmph));
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
len = skb->len + sizeof(*icmph);
|
|
|
|
err = skb_cow(skb, sizeof(*niph) + sizeof(*icmph) + ETH_HLEN);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
icmph = skb_push(skb, sizeof(*icmph));
|
|
|
|
*icmph = (struct icmphdr) {
|
|
|
|
.type = ICMP_DEST_UNREACH,
|
|
|
|
.code = ICMP_FRAG_NEEDED,
|
|
|
|
.checksum = 0,
|
|
|
|
.un.frag.__unused = 0,
|
2021-01-07 22:40:08 +08:00
|
|
|
.un.frag.mtu = htons(mtu),
|
tunnels: PMTU discovery support for directly bridged IP packets
It's currently possible to bridge Ethernet tunnels carrying IP
packets directly to external interfaces without assigning them
addresses and routes on the bridged network itself: this is the case
for UDP tunnels bridged with a standard bridge or by Open vSwitch.
PMTU discovery is currently broken with those configurations, because
the encapsulation effectively decreases the MTU of the link, and
while we are able to account for this using PMTU discovery on the
lower layer, we don't have a way to relay ICMP or ICMPv6 messages
needed by the sender, because we don't have valid routes to it.
On the other hand, as a tunnel endpoint, we can't fragment packets
as a general approach: this is for instance clearly forbidden for
VXLAN by RFC 7348, section 4.3:
VTEPs MUST NOT fragment VXLAN packets. Intermediate routers may
fragment encapsulated VXLAN packets due to the larger frame size.
The destination VTEP MAY silently discard such VXLAN fragments.
The same paragraph recommends that the MTU over the physical network
accomodates for encapsulations, but this isn't a practical option for
complex topologies, especially for typical Open vSwitch use cases.
Further, it states that:
Other techniques like Path MTU discovery (see [RFC1191] and
[RFC1981]) MAY be used to address this requirement as well.
Now, PMTU discovery already works for routed interfaces, we get
route exceptions created by the encapsulation device as they receive
ICMP Fragmentation Needed and ICMPv6 Packet Too Big messages, and
we already rebuild those messages with the appropriate MTU and route
them back to the sender.
Add the missing bits for bridged cases:
- checks in skb_tunnel_check_pmtu() to understand if it's appropriate
to trigger a reply according to RFC 1122 section 3.2.2 for ICMP and
RFC 4443 section 2.4 for ICMPv6. This function is already called by
UDP tunnels
- a new function generating those ICMP or ICMPv6 replies. We can't
reuse icmp_send() and icmp6_send() as we don't see the sender as a
valid destination. This doesn't need to be generic, as we don't
cover any other type of ICMP errors given that we only provide an
encapsulation function to the sender
While at it, make the MTU check in skb_tunnel_check_pmtu() accurate:
we might receive GSO buffers here, and the passed headroom already
includes the inner MAC length, so we don't have to account for it
a second time (that would imply three MAC headers on the wire, but
there are just two).
This issue became visible while bridging IPv6 packets with 4500 bytes
of payload over GENEVE using IPv4 with a PMTU of 4000. Given the 50
bytes of encapsulation headroom, we would advertise MTU as 3950, and
we would reject fragmented IPv6 datagrams of 3958 bytes size on the
wire. We're exclusively dealing with network MTU here, though, so we
could get Ethernet frames up to 3964 octets in that case.
v2:
- moved skb_tunnel_check_pmtu() to ip_tunnel_core.c (David Ahern)
- split IPv4/IPv6 functions (David Ahern)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-08-04 13:53:43 +08:00
|
|
|
};
|
2023-08-03 23:26:49 +08:00
|
|
|
icmph->checksum = csum_fold(skb_checksum(skb, 0, len, 0));
|
tunnels: PMTU discovery support for directly bridged IP packets
It's currently possible to bridge Ethernet tunnels carrying IP
packets directly to external interfaces without assigning them
addresses and routes on the bridged network itself: this is the case
for UDP tunnels bridged with a standard bridge or by Open vSwitch.
PMTU discovery is currently broken with those configurations, because
the encapsulation effectively decreases the MTU of the link, and
while we are able to account for this using PMTU discovery on the
lower layer, we don't have a way to relay ICMP or ICMPv6 messages
needed by the sender, because we don't have valid routes to it.
On the other hand, as a tunnel endpoint, we can't fragment packets
as a general approach: this is for instance clearly forbidden for
VXLAN by RFC 7348, section 4.3:
VTEPs MUST NOT fragment VXLAN packets. Intermediate routers may
fragment encapsulated VXLAN packets due to the larger frame size.
The destination VTEP MAY silently discard such VXLAN fragments.
The same paragraph recommends that the MTU over the physical network
accomodates for encapsulations, but this isn't a practical option for
complex topologies, especially for typical Open vSwitch use cases.
Further, it states that:
Other techniques like Path MTU discovery (see [RFC1191] and
[RFC1981]) MAY be used to address this requirement as well.
Now, PMTU discovery already works for routed interfaces, we get
route exceptions created by the encapsulation device as they receive
ICMP Fragmentation Needed and ICMPv6 Packet Too Big messages, and
we already rebuild those messages with the appropriate MTU and route
them back to the sender.
Add the missing bits for bridged cases:
- checks in skb_tunnel_check_pmtu() to understand if it's appropriate
to trigger a reply according to RFC 1122 section 3.2.2 for ICMP and
RFC 4443 section 2.4 for ICMPv6. This function is already called by
UDP tunnels
- a new function generating those ICMP or ICMPv6 replies. We can't
reuse icmp_send() and icmp6_send() as we don't see the sender as a
valid destination. This doesn't need to be generic, as we don't
cover any other type of ICMP errors given that we only provide an
encapsulation function to the sender
While at it, make the MTU check in skb_tunnel_check_pmtu() accurate:
we might receive GSO buffers here, and the passed headroom already
includes the inner MAC length, so we don't have to account for it
a second time (that would imply three MAC headers on the wire, but
there are just two).
This issue became visible while bridging IPv6 packets with 4500 bytes
of payload over GENEVE using IPv4 with a PMTU of 4000. Given the 50
bytes of encapsulation headroom, we would advertise MTU as 3950, and
we would reject fragmented IPv6 datagrams of 3958 bytes size on the
wire. We're exclusively dealing with network MTU here, though, so we
could get Ethernet frames up to 3964 octets in that case.
v2:
- moved skb_tunnel_check_pmtu() to ip_tunnel_core.c (David Ahern)
- split IPv4/IPv6 functions (David Ahern)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-08-04 13:53:43 +08:00
|
|
|
skb_reset_transport_header(skb);
|
|
|
|
|
|
|
|
niph = skb_push(skb, sizeof(*niph));
|
|
|
|
*niph = (struct iphdr) {
|
|
|
|
.ihl = sizeof(*niph) / 4u,
|
|
|
|
.version = 4,
|
|
|
|
.tos = 0,
|
|
|
|
.tot_len = htons(len + sizeof(*niph)),
|
|
|
|
.id = 0,
|
|
|
|
.frag_off = htons(IP_DF),
|
|
|
|
.ttl = iph->ttl,
|
|
|
|
.protocol = IPPROTO_ICMP,
|
|
|
|
.saddr = iph->daddr,
|
|
|
|
.daddr = iph->saddr,
|
|
|
|
};
|
|
|
|
ip_send_check(niph);
|
|
|
|
skb_reset_network_header(skb);
|
|
|
|
|
|
|
|
skb->ip_summed = CHECKSUM_NONE;
|
|
|
|
|
2021-01-07 22:40:08 +08:00
|
|
|
eth_header(skb, skb->dev, ntohs(eh.h_proto), eh.h_source, eh.h_dest, 0);
|
tunnels: PMTU discovery support for directly bridged IP packets
It's currently possible to bridge Ethernet tunnels carrying IP
packets directly to external interfaces without assigning them
addresses and routes on the bridged network itself: this is the case
for UDP tunnels bridged with a standard bridge or by Open vSwitch.
PMTU discovery is currently broken with those configurations, because
the encapsulation effectively decreases the MTU of the link, and
while we are able to account for this using PMTU discovery on the
lower layer, we don't have a way to relay ICMP or ICMPv6 messages
needed by the sender, because we don't have valid routes to it.
On the other hand, as a tunnel endpoint, we can't fragment packets
as a general approach: this is for instance clearly forbidden for
VXLAN by RFC 7348, section 4.3:
VTEPs MUST NOT fragment VXLAN packets. Intermediate routers may
fragment encapsulated VXLAN packets due to the larger frame size.
The destination VTEP MAY silently discard such VXLAN fragments.
The same paragraph recommends that the MTU over the physical network
accomodates for encapsulations, but this isn't a practical option for
complex topologies, especially for typical Open vSwitch use cases.
Further, it states that:
Other techniques like Path MTU discovery (see [RFC1191] and
[RFC1981]) MAY be used to address this requirement as well.
Now, PMTU discovery already works for routed interfaces, we get
route exceptions created by the encapsulation device as they receive
ICMP Fragmentation Needed and ICMPv6 Packet Too Big messages, and
we already rebuild those messages with the appropriate MTU and route
them back to the sender.
Add the missing bits for bridged cases:
- checks in skb_tunnel_check_pmtu() to understand if it's appropriate
to trigger a reply according to RFC 1122 section 3.2.2 for ICMP and
RFC 4443 section 2.4 for ICMPv6. This function is already called by
UDP tunnels
- a new function generating those ICMP or ICMPv6 replies. We can't
reuse icmp_send() and icmp6_send() as we don't see the sender as a
valid destination. This doesn't need to be generic, as we don't
cover any other type of ICMP errors given that we only provide an
encapsulation function to the sender
While at it, make the MTU check in skb_tunnel_check_pmtu() accurate:
we might receive GSO buffers here, and the passed headroom already
includes the inner MAC length, so we don't have to account for it
a second time (that would imply three MAC headers on the wire, but
there are just two).
This issue became visible while bridging IPv6 packets with 4500 bytes
of payload over GENEVE using IPv4 with a PMTU of 4000. Given the 50
bytes of encapsulation headroom, we would advertise MTU as 3950, and
we would reject fragmented IPv6 datagrams of 3958 bytes size on the
wire. We're exclusively dealing with network MTU here, though, so we
could get Ethernet frames up to 3964 octets in that case.
v2:
- moved skb_tunnel_check_pmtu() to ip_tunnel_core.c (David Ahern)
- split IPv4/IPv6 functions (David Ahern)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-08-04 13:53:43 +08:00
|
|
|
skb_reset_mac_header(skb);
|
|
|
|
|
|
|
|
return skb->len;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* iptunnel_pmtud_check_icmp() - Trigger ICMP reply if needed and allowed
|
|
|
|
* @skb: Buffer being sent by encapsulation, L2 headers expected
|
|
|
|
* @mtu: Network MTU for path
|
|
|
|
*
|
|
|
|
* Return: 0 for no ICMP reply, length if built, negative value on error.
|
|
|
|
*/
|
|
|
|
static int iptunnel_pmtud_check_icmp(struct sk_buff *skb, int mtu)
|
|
|
|
{
|
|
|
|
const struct icmphdr *icmph = icmp_hdr(skb);
|
|
|
|
const struct iphdr *iph = ip_hdr(skb);
|
|
|
|
|
tunnels: Fix off-by-one in lower MTU bounds for ICMP/ICMPv6 replies
Jianlin reports that a bridged IPv6 VXLAN endpoint, carrying IPv6
packets over a link with a PMTU estimation of exactly 1350 bytes,
won't trigger ICMPv6 Packet Too Big replies when the encapsulated
datagrams exceed said PMTU value. VXLAN over IPv6 adds 70 bytes of
overhead, so an ICMPv6 reply indicating 1280 bytes as inner MTU
would be legitimate and expected.
This comes from an off-by-one error I introduced in checks added
as part of commit 4cb47a8644cc ("tunnels: PMTU discovery support
for directly bridged IP packets"), whose purpose was to prevent
sending ICMPv6 Packet Too Big messages with an MTU lower than the
smallest permissible IPv6 link MTU, i.e. 1280 bytes.
In iptunnel_pmtud_check_icmpv6(), avoid triggering a reply only if
the advertised MTU would be less than, and not equal to, 1280 bytes.
Also fix the analogous comparison for IPv4, that is, skip the ICMP
reply only if the resulting MTU is strictly less than 576 bytes.
This becomes apparent while running the net/pmtu.sh bridged VXLAN
or GENEVE selftests with adjusted lower-link MTU values. Using
e.g. GENEVE, setting ll_mtu to the values reported below, in the
test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception() test
function, we can see failures on the following tests:
test | ll_mtu
-------------------------------|--------
pmtu_ipv4_br_geneve4_exception | 626
pmtu_ipv6_br_geneve4_exception | 1330
pmtu_ipv6_br_geneve6_exception | 1350
owing to the different tunneling overheads implied by the
corresponding configurations.
Reported-by: Jianlin Shi <jishi@redhat.com>
Fixes: 4cb47a8644cc ("tunnels: PMTU discovery support for directly bridged IP packets")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Link: https://lore.kernel.org/r/4f5fc2f33bfdf8409549fafd4f952b008bf04d63.1604681709.git.sbrivio@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-11-07 00:59:52 +08:00
|
|
|
if (mtu < 576 || iph->frag_off != htons(IP_DF))
|
tunnels: PMTU discovery support for directly bridged IP packets
It's currently possible to bridge Ethernet tunnels carrying IP
packets directly to external interfaces without assigning them
addresses and routes on the bridged network itself: this is the case
for UDP tunnels bridged with a standard bridge or by Open vSwitch.
PMTU discovery is currently broken with those configurations, because
the encapsulation effectively decreases the MTU of the link, and
while we are able to account for this using PMTU discovery on the
lower layer, we don't have a way to relay ICMP or ICMPv6 messages
needed by the sender, because we don't have valid routes to it.
On the other hand, as a tunnel endpoint, we can't fragment packets
as a general approach: this is for instance clearly forbidden for
VXLAN by RFC 7348, section 4.3:
VTEPs MUST NOT fragment VXLAN packets. Intermediate routers may
fragment encapsulated VXLAN packets due to the larger frame size.
The destination VTEP MAY silently discard such VXLAN fragments.
The same paragraph recommends that the MTU over the physical network
accomodates for encapsulations, but this isn't a practical option for
complex topologies, especially for typical Open vSwitch use cases.
Further, it states that:
Other techniques like Path MTU discovery (see [RFC1191] and
[RFC1981]) MAY be used to address this requirement as well.
Now, PMTU discovery already works for routed interfaces, we get
route exceptions created by the encapsulation device as they receive
ICMP Fragmentation Needed and ICMPv6 Packet Too Big messages, and
we already rebuild those messages with the appropriate MTU and route
them back to the sender.
Add the missing bits for bridged cases:
- checks in skb_tunnel_check_pmtu() to understand if it's appropriate
to trigger a reply according to RFC 1122 section 3.2.2 for ICMP and
RFC 4443 section 2.4 for ICMPv6. This function is already called by
UDP tunnels
- a new function generating those ICMP or ICMPv6 replies. We can't
reuse icmp_send() and icmp6_send() as we don't see the sender as a
valid destination. This doesn't need to be generic, as we don't
cover any other type of ICMP errors given that we only provide an
encapsulation function to the sender
While at it, make the MTU check in skb_tunnel_check_pmtu() accurate:
we might receive GSO buffers here, and the passed headroom already
includes the inner MAC length, so we don't have to account for it
a second time (that would imply three MAC headers on the wire, but
there are just two).
This issue became visible while bridging IPv6 packets with 4500 bytes
of payload over GENEVE using IPv4 with a PMTU of 4000. Given the 50
bytes of encapsulation headroom, we would advertise MTU as 3950, and
we would reject fragmented IPv6 datagrams of 3958 bytes size on the
wire. We're exclusively dealing with network MTU here, though, so we
could get Ethernet frames up to 3964 octets in that case.
v2:
- moved skb_tunnel_check_pmtu() to ip_tunnel_core.c (David Ahern)
- split IPv4/IPv6 functions (David Ahern)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-08-04 13:53:43 +08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (ipv4_is_lbcast(iph->daddr) || ipv4_is_multicast(iph->daddr) ||
|
|
|
|
ipv4_is_zeronet(iph->saddr) || ipv4_is_loopback(iph->saddr) ||
|
|
|
|
ipv4_is_lbcast(iph->saddr) || ipv4_is_multicast(iph->saddr))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (iph->protocol == IPPROTO_ICMP && icmp_is_err(icmph->type))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
return iptunnel_pmtud_build_icmp(skb, mtu);
|
|
|
|
}
|
|
|
|
|
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
|
|
/**
|
|
|
|
* iptunnel_pmtud_build_icmpv6() - Build ICMPv6 error message for PMTUD
|
|
|
|
* @skb: Original packet with L2 header
|
|
|
|
* @mtu: MTU value for ICMPv6 error
|
|
|
|
*
|
|
|
|
* Return: length on success, negative error code if message couldn't be built.
|
|
|
|
*/
|
|
|
|
static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu)
|
|
|
|
{
|
|
|
|
const struct ipv6hdr *ip6h = ipv6_hdr(skb);
|
|
|
|
struct icmp6hdr *icmp6h;
|
|
|
|
struct ipv6hdr *nip6h;
|
|
|
|
struct ethhdr eh;
|
|
|
|
int len, err;
|
|
|
|
__wsum csum;
|
|
|
|
|
|
|
|
if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr)))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
|
|
|
|
pskb_pull(skb, ETH_HLEN);
|
|
|
|
skb_reset_network_header(skb);
|
|
|
|
|
|
|
|
err = pskb_trim(skb, IPV6_MIN_MTU - sizeof(*nip6h) - sizeof(*icmp6h));
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
len = skb->len + sizeof(*icmp6h);
|
|
|
|
err = skb_cow(skb, sizeof(*nip6h) + sizeof(*icmp6h) + ETH_HLEN);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
icmp6h = skb_push(skb, sizeof(*icmp6h));
|
|
|
|
*icmp6h = (struct icmp6hdr) {
|
|
|
|
.icmp6_type = ICMPV6_PKT_TOOBIG,
|
|
|
|
.icmp6_code = 0,
|
|
|
|
.icmp6_cksum = 0,
|
|
|
|
.icmp6_mtu = htonl(mtu),
|
|
|
|
};
|
|
|
|
skb_reset_transport_header(skb);
|
|
|
|
|
|
|
|
nip6h = skb_push(skb, sizeof(*nip6h));
|
|
|
|
*nip6h = (struct ipv6hdr) {
|
|
|
|
.priority = 0,
|
|
|
|
.version = 6,
|
|
|
|
.flow_lbl = { 0 },
|
|
|
|
.payload_len = htons(len),
|
|
|
|
.nexthdr = IPPROTO_ICMPV6,
|
|
|
|
.hop_limit = ip6h->hop_limit,
|
|
|
|
.saddr = ip6h->daddr,
|
|
|
|
.daddr = ip6h->saddr,
|
|
|
|
};
|
|
|
|
skb_reset_network_header(skb);
|
|
|
|
|
2024-02-01 16:38:15 +08:00
|
|
|
csum = skb_checksum(skb, skb_transport_offset(skb), len, 0);
|
tunnels: PMTU discovery support for directly bridged IP packets
It's currently possible to bridge Ethernet tunnels carrying IP
packets directly to external interfaces without assigning them
addresses and routes on the bridged network itself: this is the case
for UDP tunnels bridged with a standard bridge or by Open vSwitch.
PMTU discovery is currently broken with those configurations, because
the encapsulation effectively decreases the MTU of the link, and
while we are able to account for this using PMTU discovery on the
lower layer, we don't have a way to relay ICMP or ICMPv6 messages
needed by the sender, because we don't have valid routes to it.
On the other hand, as a tunnel endpoint, we can't fragment packets
as a general approach: this is for instance clearly forbidden for
VXLAN by RFC 7348, section 4.3:
VTEPs MUST NOT fragment VXLAN packets. Intermediate routers may
fragment encapsulated VXLAN packets due to the larger frame size.
The destination VTEP MAY silently discard such VXLAN fragments.
The same paragraph recommends that the MTU over the physical network
accomodates for encapsulations, but this isn't a practical option for
complex topologies, especially for typical Open vSwitch use cases.
Further, it states that:
Other techniques like Path MTU discovery (see [RFC1191] and
[RFC1981]) MAY be used to address this requirement as well.
Now, PMTU discovery already works for routed interfaces, we get
route exceptions created by the encapsulation device as they receive
ICMP Fragmentation Needed and ICMPv6 Packet Too Big messages, and
we already rebuild those messages with the appropriate MTU and route
them back to the sender.
Add the missing bits for bridged cases:
- checks in skb_tunnel_check_pmtu() to understand if it's appropriate
to trigger a reply according to RFC 1122 section 3.2.2 for ICMP and
RFC 4443 section 2.4 for ICMPv6. This function is already called by
UDP tunnels
- a new function generating those ICMP or ICMPv6 replies. We can't
reuse icmp_send() and icmp6_send() as we don't see the sender as a
valid destination. This doesn't need to be generic, as we don't
cover any other type of ICMP errors given that we only provide an
encapsulation function to the sender
While at it, make the MTU check in skb_tunnel_check_pmtu() accurate:
we might receive GSO buffers here, and the passed headroom already
includes the inner MAC length, so we don't have to account for it
a second time (that would imply three MAC headers on the wire, but
there are just two).
This issue became visible while bridging IPv6 packets with 4500 bytes
of payload over GENEVE using IPv4 with a PMTU of 4000. Given the 50
bytes of encapsulation headroom, we would advertise MTU as 3950, and
we would reject fragmented IPv6 datagrams of 3958 bytes size on the
wire. We're exclusively dealing with network MTU here, though, so we
could get Ethernet frames up to 3964 octets in that case.
v2:
- moved skb_tunnel_check_pmtu() to ip_tunnel_core.c (David Ahern)
- split IPv4/IPv6 functions (David Ahern)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-08-04 13:53:43 +08:00
|
|
|
icmp6h->icmp6_cksum = csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr, len,
|
|
|
|
IPPROTO_ICMPV6, csum);
|
|
|
|
|
|
|
|
skb->ip_summed = CHECKSUM_NONE;
|
|
|
|
|
2021-01-07 22:40:08 +08:00
|
|
|
eth_header(skb, skb->dev, ntohs(eh.h_proto), eh.h_source, eh.h_dest, 0);
|
tunnels: PMTU discovery support for directly bridged IP packets
It's currently possible to bridge Ethernet tunnels carrying IP
packets directly to external interfaces without assigning them
addresses and routes on the bridged network itself: this is the case
for UDP tunnels bridged with a standard bridge or by Open vSwitch.
PMTU discovery is currently broken with those configurations, because
the encapsulation effectively decreases the MTU of the link, and
while we are able to account for this using PMTU discovery on the
lower layer, we don't have a way to relay ICMP or ICMPv6 messages
needed by the sender, because we don't have valid routes to it.
On the other hand, as a tunnel endpoint, we can't fragment packets
as a general approach: this is for instance clearly forbidden for
VXLAN by RFC 7348, section 4.3:
VTEPs MUST NOT fragment VXLAN packets. Intermediate routers may
fragment encapsulated VXLAN packets due to the larger frame size.
The destination VTEP MAY silently discard such VXLAN fragments.
The same paragraph recommends that the MTU over the physical network
accomodates for encapsulations, but this isn't a practical option for
complex topologies, especially for typical Open vSwitch use cases.
Further, it states that:
Other techniques like Path MTU discovery (see [RFC1191] and
[RFC1981]) MAY be used to address this requirement as well.
Now, PMTU discovery already works for routed interfaces, we get
route exceptions created by the encapsulation device as they receive
ICMP Fragmentation Needed and ICMPv6 Packet Too Big messages, and
we already rebuild those messages with the appropriate MTU and route
them back to the sender.
Add the missing bits for bridged cases:
- checks in skb_tunnel_check_pmtu() to understand if it's appropriate
to trigger a reply according to RFC 1122 section 3.2.2 for ICMP and
RFC 4443 section 2.4 for ICMPv6. This function is already called by
UDP tunnels
- a new function generating those ICMP or ICMPv6 replies. We can't
reuse icmp_send() and icmp6_send() as we don't see the sender as a
valid destination. This doesn't need to be generic, as we don't
cover any other type of ICMP errors given that we only provide an
encapsulation function to the sender
While at it, make the MTU check in skb_tunnel_check_pmtu() accurate:
we might receive GSO buffers here, and the passed headroom already
includes the inner MAC length, so we don't have to account for it
a second time (that would imply three MAC headers on the wire, but
there are just two).
This issue became visible while bridging IPv6 packets with 4500 bytes
of payload over GENEVE using IPv4 with a PMTU of 4000. Given the 50
bytes of encapsulation headroom, we would advertise MTU as 3950, and
we would reject fragmented IPv6 datagrams of 3958 bytes size on the
wire. We're exclusively dealing with network MTU here, though, so we
could get Ethernet frames up to 3964 octets in that case.
v2:
- moved skb_tunnel_check_pmtu() to ip_tunnel_core.c (David Ahern)
- split IPv4/IPv6 functions (David Ahern)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-08-04 13:53:43 +08:00
|
|
|
skb_reset_mac_header(skb);
|
|
|
|
|
|
|
|
return skb->len;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* iptunnel_pmtud_check_icmpv6() - Trigger ICMPv6 reply if needed and allowed
|
|
|
|
* @skb: Buffer being sent by encapsulation, L2 headers expected
|
|
|
|
* @mtu: Network MTU for path
|
|
|
|
*
|
|
|
|
* Return: 0 for no ICMPv6 reply, length if built, negative value on error.
|
|
|
|
*/
|
|
|
|
static int iptunnel_pmtud_check_icmpv6(struct sk_buff *skb, int mtu)
|
|
|
|
{
|
|
|
|
const struct ipv6hdr *ip6h = ipv6_hdr(skb);
|
|
|
|
int stype = ipv6_addr_type(&ip6h->saddr);
|
|
|
|
u8 proto = ip6h->nexthdr;
|
|
|
|
__be16 frag_off;
|
|
|
|
int offset;
|
|
|
|
|
tunnels: Fix off-by-one in lower MTU bounds for ICMP/ICMPv6 replies
Jianlin reports that a bridged IPv6 VXLAN endpoint, carrying IPv6
packets over a link with a PMTU estimation of exactly 1350 bytes,
won't trigger ICMPv6 Packet Too Big replies when the encapsulated
datagrams exceed said PMTU value. VXLAN over IPv6 adds 70 bytes of
overhead, so an ICMPv6 reply indicating 1280 bytes as inner MTU
would be legitimate and expected.
This comes from an off-by-one error I introduced in checks added
as part of commit 4cb47a8644cc ("tunnels: PMTU discovery support
for directly bridged IP packets"), whose purpose was to prevent
sending ICMPv6 Packet Too Big messages with an MTU lower than the
smallest permissible IPv6 link MTU, i.e. 1280 bytes.
In iptunnel_pmtud_check_icmpv6(), avoid triggering a reply only if
the advertised MTU would be less than, and not equal to, 1280 bytes.
Also fix the analogous comparison for IPv4, that is, skip the ICMP
reply only if the resulting MTU is strictly less than 576 bytes.
This becomes apparent while running the net/pmtu.sh bridged VXLAN
or GENEVE selftests with adjusted lower-link MTU values. Using
e.g. GENEVE, setting ll_mtu to the values reported below, in the
test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception() test
function, we can see failures on the following tests:
test | ll_mtu
-------------------------------|--------
pmtu_ipv4_br_geneve4_exception | 626
pmtu_ipv6_br_geneve4_exception | 1330
pmtu_ipv6_br_geneve6_exception | 1350
owing to the different tunneling overheads implied by the
corresponding configurations.
Reported-by: Jianlin Shi <jishi@redhat.com>
Fixes: 4cb47a8644cc ("tunnels: PMTU discovery support for directly bridged IP packets")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Link: https://lore.kernel.org/r/4f5fc2f33bfdf8409549fafd4f952b008bf04d63.1604681709.git.sbrivio@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-11-07 00:59:52 +08:00
|
|
|
if (mtu < IPV6_MIN_MTU)
|
tunnels: PMTU discovery support for directly bridged IP packets
It's currently possible to bridge Ethernet tunnels carrying IP
packets directly to external interfaces without assigning them
addresses and routes on the bridged network itself: this is the case
for UDP tunnels bridged with a standard bridge or by Open vSwitch.
PMTU discovery is currently broken with those configurations, because
the encapsulation effectively decreases the MTU of the link, and
while we are able to account for this using PMTU discovery on the
lower layer, we don't have a way to relay ICMP or ICMPv6 messages
needed by the sender, because we don't have valid routes to it.
On the other hand, as a tunnel endpoint, we can't fragment packets
as a general approach: this is for instance clearly forbidden for
VXLAN by RFC 7348, section 4.3:
VTEPs MUST NOT fragment VXLAN packets. Intermediate routers may
fragment encapsulated VXLAN packets due to the larger frame size.
The destination VTEP MAY silently discard such VXLAN fragments.
The same paragraph recommends that the MTU over the physical network
accomodates for encapsulations, but this isn't a practical option for
complex topologies, especially for typical Open vSwitch use cases.
Further, it states that:
Other techniques like Path MTU discovery (see [RFC1191] and
[RFC1981]) MAY be used to address this requirement as well.
Now, PMTU discovery already works for routed interfaces, we get
route exceptions created by the encapsulation device as they receive
ICMP Fragmentation Needed and ICMPv6 Packet Too Big messages, and
we already rebuild those messages with the appropriate MTU and route
them back to the sender.
Add the missing bits for bridged cases:
- checks in skb_tunnel_check_pmtu() to understand if it's appropriate
to trigger a reply according to RFC 1122 section 3.2.2 for ICMP and
RFC 4443 section 2.4 for ICMPv6. This function is already called by
UDP tunnels
- a new function generating those ICMP or ICMPv6 replies. We can't
reuse icmp_send() and icmp6_send() as we don't see the sender as a
valid destination. This doesn't need to be generic, as we don't
cover any other type of ICMP errors given that we only provide an
encapsulation function to the sender
While at it, make the MTU check in skb_tunnel_check_pmtu() accurate:
we might receive GSO buffers here, and the passed headroom already
includes the inner MAC length, so we don't have to account for it
a second time (that would imply three MAC headers on the wire, but
there are just two).
This issue became visible while bridging IPv6 packets with 4500 bytes
of payload over GENEVE using IPv4 with a PMTU of 4000. Given the 50
bytes of encapsulation headroom, we would advertise MTU as 3950, and
we would reject fragmented IPv6 datagrams of 3958 bytes size on the
wire. We're exclusively dealing with network MTU here, though, so we
could get Ethernet frames up to 3964 octets in that case.
v2:
- moved skb_tunnel_check_pmtu() to ip_tunnel_core.c (David Ahern)
- split IPv4/IPv6 functions (David Ahern)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-08-04 13:53:43 +08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (stype == IPV6_ADDR_ANY || stype == IPV6_ADDR_MULTICAST ||
|
|
|
|
stype == IPV6_ADDR_LOOPBACK)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &proto,
|
|
|
|
&frag_off);
|
|
|
|
if (offset < 0 || (frag_off & htons(~0x7)))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (proto == IPPROTO_ICMPV6) {
|
|
|
|
struct icmp6hdr *icmp6h;
|
|
|
|
|
|
|
|
if (!pskb_may_pull(skb, skb_network_header(skb) +
|
|
|
|
offset + 1 - skb->data))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
icmp6h = (struct icmp6hdr *)(skb_network_header(skb) + offset);
|
|
|
|
if (icmpv6_is_err(icmp6h->icmp6_type) ||
|
|
|
|
icmp6h->icmp6_type == NDISC_REDIRECT)
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return iptunnel_pmtud_build_icmpv6(skb, mtu);
|
|
|
|
}
|
|
|
|
#endif /* IS_ENABLED(CONFIG_IPV6) */
|
|
|
|
|
|
|
|
/**
|
|
|
|
* skb_tunnel_check_pmtu() - Check, update PMTU and trigger ICMP reply as needed
|
|
|
|
* @skb: Buffer being sent by encapsulation, L2 headers expected
|
|
|
|
* @encap_dst: Destination for tunnel encapsulation (outer IP)
|
|
|
|
* @headroom: Encapsulation header size, bytes
|
|
|
|
* @reply: Build matching ICMP or ICMPv6 message as a result
|
|
|
|
*
|
|
|
|
* L2 tunnel implementations that can carry IP and can be directly bridged
|
|
|
|
* (currently UDP tunnels) can't always rely on IP forwarding paths to handle
|
|
|
|
* PMTU discovery. In the bridged case, ICMP or ICMPv6 messages need to be built
|
|
|
|
* based on payload and sent back by the encapsulation itself.
|
|
|
|
*
|
|
|
|
* For routable interfaces, we just need to update the PMTU for the destination.
|
|
|
|
*
|
|
|
|
* Return: 0 if ICMP error not needed, length if built, negative value on error
|
|
|
|
*/
|
|
|
|
int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst,
|
|
|
|
int headroom, bool reply)
|
|
|
|
{
|
|
|
|
u32 mtu = dst_mtu(encap_dst) - headroom;
|
|
|
|
|
|
|
|
if ((skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) ||
|
2022-06-24 23:30:20 +08:00
|
|
|
(!skb_is_gso(skb) && (skb->len - skb_network_offset(skb)) <= mtu))
|
tunnels: PMTU discovery support for directly bridged IP packets
It's currently possible to bridge Ethernet tunnels carrying IP
packets directly to external interfaces without assigning them
addresses and routes on the bridged network itself: this is the case
for UDP tunnels bridged with a standard bridge or by Open vSwitch.
PMTU discovery is currently broken with those configurations, because
the encapsulation effectively decreases the MTU of the link, and
while we are able to account for this using PMTU discovery on the
lower layer, we don't have a way to relay ICMP or ICMPv6 messages
needed by the sender, because we don't have valid routes to it.
On the other hand, as a tunnel endpoint, we can't fragment packets
as a general approach: this is for instance clearly forbidden for
VXLAN by RFC 7348, section 4.3:
VTEPs MUST NOT fragment VXLAN packets. Intermediate routers may
fragment encapsulated VXLAN packets due to the larger frame size.
The destination VTEP MAY silently discard such VXLAN fragments.
The same paragraph recommends that the MTU over the physical network
accomodates for encapsulations, but this isn't a practical option for
complex topologies, especially for typical Open vSwitch use cases.
Further, it states that:
Other techniques like Path MTU discovery (see [RFC1191] and
[RFC1981]) MAY be used to address this requirement as well.
Now, PMTU discovery already works for routed interfaces, we get
route exceptions created by the encapsulation device as they receive
ICMP Fragmentation Needed and ICMPv6 Packet Too Big messages, and
we already rebuild those messages with the appropriate MTU and route
them back to the sender.
Add the missing bits for bridged cases:
- checks in skb_tunnel_check_pmtu() to understand if it's appropriate
to trigger a reply according to RFC 1122 section 3.2.2 for ICMP and
RFC 4443 section 2.4 for ICMPv6. This function is already called by
UDP tunnels
- a new function generating those ICMP or ICMPv6 replies. We can't
reuse icmp_send() and icmp6_send() as we don't see the sender as a
valid destination. This doesn't need to be generic, as we don't
cover any other type of ICMP errors given that we only provide an
encapsulation function to the sender
While at it, make the MTU check in skb_tunnel_check_pmtu() accurate:
we might receive GSO buffers here, and the passed headroom already
includes the inner MAC length, so we don't have to account for it
a second time (that would imply three MAC headers on the wire, but
there are just two).
This issue became visible while bridging IPv6 packets with 4500 bytes
of payload over GENEVE using IPv4 with a PMTU of 4000. Given the 50
bytes of encapsulation headroom, we would advertise MTU as 3950, and
we would reject fragmented IPv6 datagrams of 3958 bytes size on the
wire. We're exclusively dealing with network MTU here, though, so we
could get Ethernet frames up to 3964 octets in that case.
v2:
- moved skb_tunnel_check_pmtu() to ip_tunnel_core.c (David Ahern)
- split IPv4/IPv6 functions (David Ahern)
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-08-04 13:53:43 +08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
skb_dst_update_pmtu_no_confirm(skb, mtu);
|
|
|
|
|
|
|
|
if (!reply || skb->pkt_type == PACKET_HOST)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (skb->protocol == htons(ETH_P_IP))
|
|
|
|
return iptunnel_pmtud_check_icmp(skb, mtu);
|
|
|
|
|
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
|
|
if (skb->protocol == htons(ETH_P_IPV6))
|
|
|
|
return iptunnel_pmtud_check_icmpv6(skb, mtu);
|
|
|
|
#endif
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(skb_tunnel_check_pmtu);
|
|
|
|
|
2015-08-14 22:40:40 +08:00
|
|
|
static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = {
|
2019-11-21 18:11:27 +08:00
|
|
|
[LWTUNNEL_IP_UNSPEC] = { .strict_start_type = LWTUNNEL_IP_OPTS },
|
2015-08-14 22:40:40 +08:00
|
|
|
[LWTUNNEL_IP_ID] = { .type = NLA_U64 },
|
|
|
|
[LWTUNNEL_IP_DST] = { .type = NLA_U32 },
|
|
|
|
[LWTUNNEL_IP_SRC] = { .type = NLA_U32 },
|
|
|
|
[LWTUNNEL_IP_TTL] = { .type = NLA_U8 },
|
|
|
|
[LWTUNNEL_IP_TOS] = { .type = NLA_U8 },
|
|
|
|
[LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 },
|
2019-11-06 17:01:05 +08:00
|
|
|
[LWTUNNEL_IP_OPTS] = { .type = NLA_NESTED },
|
2015-07-21 16:44:00 +08:00
|
|
|
};
|
|
|
|
|
2019-11-06 17:01:05 +08:00
|
|
|
static const struct nla_policy ip_opts_policy[LWTUNNEL_IP_OPTS_MAX + 1] = {
|
|
|
|
[LWTUNNEL_IP_OPTS_GENEVE] = { .type = NLA_NESTED },
|
2019-11-06 17:01:06 +08:00
|
|
|
[LWTUNNEL_IP_OPTS_VXLAN] = { .type = NLA_NESTED },
|
2019-11-06 17:01:07 +08:00
|
|
|
[LWTUNNEL_IP_OPTS_ERSPAN] = { .type = NLA_NESTED },
|
2019-11-06 17:01:05 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
static const struct nla_policy
|
|
|
|
geneve_opt_policy[LWTUNNEL_IP_OPT_GENEVE_MAX + 1] = {
|
|
|
|
[LWTUNNEL_IP_OPT_GENEVE_CLASS] = { .type = NLA_U16 },
|
|
|
|
[LWTUNNEL_IP_OPT_GENEVE_TYPE] = { .type = NLA_U8 },
|
|
|
|
[LWTUNNEL_IP_OPT_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 },
|
|
|
|
};
|
|
|
|
|
2019-11-06 17:01:06 +08:00
|
|
|
static const struct nla_policy
|
|
|
|
vxlan_opt_policy[LWTUNNEL_IP_OPT_VXLAN_MAX + 1] = {
|
|
|
|
[LWTUNNEL_IP_OPT_VXLAN_GBP] = { .type = NLA_U32 },
|
|
|
|
};
|
|
|
|
|
2019-11-06 17:01:07 +08:00
|
|
|
static const struct nla_policy
|
|
|
|
erspan_opt_policy[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1] = {
|
|
|
|
[LWTUNNEL_IP_OPT_ERSPAN_VER] = { .type = NLA_U8 },
|
|
|
|
[LWTUNNEL_IP_OPT_ERSPAN_INDEX] = { .type = NLA_U32 },
|
|
|
|
[LWTUNNEL_IP_OPT_ERSPAN_DIR] = { .type = NLA_U8 },
|
|
|
|
[LWTUNNEL_IP_OPT_ERSPAN_HWID] = { .type = NLA_U8 },
|
|
|
|
};
|
|
|
|
|
2019-11-06 17:01:05 +08:00
|
|
|
static int ip_tun_parse_opts_geneve(struct nlattr *attr,
|
2019-11-19 17:39:11 +08:00
|
|
|
struct ip_tunnel_info *info, int opts_len,
|
2019-11-06 17:01:05 +08:00
|
|
|
struct netlink_ext_ack *extack)
|
|
|
|
{
|
|
|
|
struct nlattr *tb[LWTUNNEL_IP_OPT_GENEVE_MAX + 1];
|
|
|
|
int data_len, err;
|
|
|
|
|
2019-11-10 12:16:22 +08:00
|
|
|
err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_GENEVE_MAX, attr,
|
|
|
|
geneve_opt_policy, extack);
|
2019-11-06 17:01:05 +08:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
if (!tb[LWTUNNEL_IP_OPT_GENEVE_CLASS] ||
|
|
|
|
!tb[LWTUNNEL_IP_OPT_GENEVE_TYPE] ||
|
|
|
|
!tb[LWTUNNEL_IP_OPT_GENEVE_DATA])
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
attr = tb[LWTUNNEL_IP_OPT_GENEVE_DATA];
|
|
|
|
data_len = nla_len(attr);
|
|
|
|
if (data_len % 4)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (info) {
|
2019-11-19 17:39:11 +08:00
|
|
|
struct geneve_opt *opt = ip_tunnel_info_opts(info) + opts_len;
|
2019-11-06 17:01:05 +08:00
|
|
|
|
|
|
|
memcpy(opt->opt_data, nla_data(attr), data_len);
|
|
|
|
opt->length = data_len / 4;
|
|
|
|
attr = tb[LWTUNNEL_IP_OPT_GENEVE_CLASS];
|
|
|
|
opt->opt_class = nla_get_be16(attr);
|
|
|
|
attr = tb[LWTUNNEL_IP_OPT_GENEVE_TYPE];
|
|
|
|
opt->type = nla_get_u8(attr);
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 23:23:53 +08:00
|
|
|
__set_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags);
|
2019-11-06 17:01:05 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return sizeof(struct geneve_opt) + data_len;
|
|
|
|
}
|
|
|
|
|
2019-11-06 17:01:06 +08:00
|
|
|
static int ip_tun_parse_opts_vxlan(struct nlattr *attr,
|
2019-11-19 17:39:11 +08:00
|
|
|
struct ip_tunnel_info *info, int opts_len,
|
2019-11-06 17:01:06 +08:00
|
|
|
struct netlink_ext_ack *extack)
|
|
|
|
{
|
|
|
|
struct nlattr *tb[LWTUNNEL_IP_OPT_VXLAN_MAX + 1];
|
|
|
|
int err;
|
|
|
|
|
2019-11-10 12:16:22 +08:00
|
|
|
err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_VXLAN_MAX, attr,
|
|
|
|
vxlan_opt_policy, extack);
|
2019-11-06 17:01:06 +08:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
if (!tb[LWTUNNEL_IP_OPT_VXLAN_GBP])
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (info) {
|
2019-11-19 17:39:11 +08:00
|
|
|
struct vxlan_metadata *md =
|
|
|
|
ip_tunnel_info_opts(info) + opts_len;
|
2019-11-06 17:01:06 +08:00
|
|
|
|
|
|
|
attr = tb[LWTUNNEL_IP_OPT_VXLAN_GBP];
|
|
|
|
md->gbp = nla_get_u32(attr);
|
2020-09-13 19:51:51 +08:00
|
|
|
md->gbp &= VXLAN_GBP_MASK;
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 23:23:53 +08:00
|
|
|
__set_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags);
|
2019-11-06 17:01:06 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return sizeof(struct vxlan_metadata);
|
|
|
|
}
|
|
|
|
|
2019-11-06 17:01:07 +08:00
|
|
|
static int ip_tun_parse_opts_erspan(struct nlattr *attr,
|
2019-11-19 17:39:11 +08:00
|
|
|
struct ip_tunnel_info *info, int opts_len,
|
2019-11-06 17:01:07 +08:00
|
|
|
struct netlink_ext_ack *extack)
|
|
|
|
{
|
|
|
|
struct nlattr *tb[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1];
|
|
|
|
int err;
|
2019-11-21 18:14:50 +08:00
|
|
|
u8 ver;
|
2019-11-06 17:01:07 +08:00
|
|
|
|
2019-11-10 12:16:22 +08:00
|
|
|
err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_ERSPAN_MAX, attr,
|
|
|
|
erspan_opt_policy, extack);
|
2019-11-06 17:01:07 +08:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
if (!tb[LWTUNNEL_IP_OPT_ERSPAN_VER])
|
|
|
|
return -EINVAL;
|
|
|
|
|
2019-11-21 18:14:50 +08:00
|
|
|
ver = nla_get_u8(tb[LWTUNNEL_IP_OPT_ERSPAN_VER]);
|
|
|
|
if (ver == 1) {
|
|
|
|
if (!tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX])
|
|
|
|
return -EINVAL;
|
|
|
|
} else if (ver == 2) {
|
|
|
|
if (!tb[LWTUNNEL_IP_OPT_ERSPAN_DIR] ||
|
|
|
|
!tb[LWTUNNEL_IP_OPT_ERSPAN_HWID])
|
|
|
|
return -EINVAL;
|
|
|
|
} else {
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2019-11-06 17:01:07 +08:00
|
|
|
if (info) {
|
2019-11-19 17:39:11 +08:00
|
|
|
struct erspan_metadata *md =
|
|
|
|
ip_tunnel_info_opts(info) + opts_len;
|
2019-11-06 17:01:07 +08:00
|
|
|
|
2019-11-21 18:14:50 +08:00
|
|
|
md->version = ver;
|
|
|
|
if (ver == 1) {
|
2019-11-06 17:01:07 +08:00
|
|
|
attr = tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX];
|
|
|
|
md->u.index = nla_get_be32(attr);
|
2019-11-21 18:14:50 +08:00
|
|
|
} else {
|
2019-11-06 17:01:07 +08:00
|
|
|
attr = tb[LWTUNNEL_IP_OPT_ERSPAN_DIR];
|
|
|
|
md->u.md2.dir = nla_get_u8(attr);
|
|
|
|
attr = tb[LWTUNNEL_IP_OPT_ERSPAN_HWID];
|
|
|
|
set_hwid(&md->u.md2, nla_get_u8(attr));
|
|
|
|
}
|
|
|
|
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 23:23:53 +08:00
|
|
|
__set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags);
|
2019-11-06 17:01:07 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return sizeof(struct erspan_metadata);
|
|
|
|
}
|
|
|
|
|
2019-11-06 17:01:05 +08:00
|
|
|
static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info,
|
|
|
|
struct netlink_ext_ack *extack)
|
|
|
|
{
|
2021-01-07 22:40:08 +08:00
|
|
|
int err, rem, opt_len, opts_len = 0;
|
2019-11-19 17:39:11 +08:00
|
|
|
struct nlattr *nla;
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 23:23:53 +08:00
|
|
|
u32 type = 0;
|
2019-11-06 17:01:05 +08:00
|
|
|
|
|
|
|
if (!attr)
|
|
|
|
return 0;
|
|
|
|
|
2019-11-19 17:39:11 +08:00
|
|
|
err = nla_validate(nla_data(attr), nla_len(attr), LWTUNNEL_IP_OPTS_MAX,
|
|
|
|
ip_opts_policy, extack);
|
2019-11-06 17:01:05 +08:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
2019-11-19 17:39:11 +08:00
|
|
|
nla_for_each_attr(nla, nla_data(attr), nla_len(attr), rem) {
|
|
|
|
switch (nla_type(nla)) {
|
|
|
|
case LWTUNNEL_IP_OPTS_GENEVE:
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 23:23:53 +08:00
|
|
|
if (type && type != IP_TUNNEL_GENEVE_OPT_BIT)
|
2019-11-19 17:39:11 +08:00
|
|
|
return -EINVAL;
|
|
|
|
opt_len = ip_tun_parse_opts_geneve(nla, info, opts_len,
|
|
|
|
extack);
|
|
|
|
if (opt_len < 0)
|
|
|
|
return opt_len;
|
|
|
|
opts_len += opt_len;
|
|
|
|
if (opts_len > IP_TUNNEL_OPTS_MAX)
|
|
|
|
return -EINVAL;
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 23:23:53 +08:00
|
|
|
type = IP_TUNNEL_GENEVE_OPT_BIT;
|
2019-11-19 17:39:11 +08:00
|
|
|
break;
|
|
|
|
case LWTUNNEL_IP_OPTS_VXLAN:
|
|
|
|
if (type)
|
|
|
|
return -EINVAL;
|
|
|
|
opt_len = ip_tun_parse_opts_vxlan(nla, info, opts_len,
|
|
|
|
extack);
|
|
|
|
if (opt_len < 0)
|
|
|
|
return opt_len;
|
|
|
|
opts_len += opt_len;
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 23:23:53 +08:00
|
|
|
type = IP_TUNNEL_VXLAN_OPT_BIT;
|
2019-11-19 17:39:11 +08:00
|
|
|
break;
|
|
|
|
case LWTUNNEL_IP_OPTS_ERSPAN:
|
|
|
|
if (type)
|
|
|
|
return -EINVAL;
|
|
|
|
opt_len = ip_tun_parse_opts_erspan(nla, info, opts_len,
|
|
|
|
extack);
|
|
|
|
if (opt_len < 0)
|
|
|
|
return opt_len;
|
|
|
|
opts_len += opt_len;
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 23:23:53 +08:00
|
|
|
type = IP_TUNNEL_ERSPAN_OPT_BIT;
|
2019-11-19 17:39:11 +08:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
}
|
2019-11-06 17:01:05 +08:00
|
|
|
|
2019-11-19 17:39:11 +08:00
|
|
|
return opts_len;
|
2019-11-06 17:01:05 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static int ip_tun_get_optlen(struct nlattr *attr,
|
|
|
|
struct netlink_ext_ack *extack)
|
|
|
|
{
|
|
|
|
return ip_tun_parse_opts(attr, NULL, extack);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Parse the tunnel options in @attr with @info as the destination handed to
 * the per-type option parsers.
 *
 * Returns the total option length or a negative errno.
 */
static int ip_tun_set_opts(struct nlattr *attr, struct ip_tunnel_info *info,
			   struct netlink_ext_ack *extack)
{
	int len;

	len = ip_tun_parse_opts(attr, info, extack);

	return len;
}
|
|
|
|
|
2020-03-28 06:00:21 +08:00
|
|
|
static int ip_tun_build_state(struct net *net, struct nlattr *attr,
|
2015-08-25 00:45:41 +08:00
|
|
|
unsigned int family, const void *cfg,
|
2017-05-28 06:19:28 +08:00
|
|
|
struct lwtunnel_state **ts,
|
|
|
|
struct netlink_ext_ack *extack)
|
2015-07-21 16:44:00 +08:00
|
|
|
{
|
2015-08-14 22:40:40 +08:00
|
|
|
struct nlattr *tb[LWTUNNEL_IP_MAX + 1];
|
2019-11-06 17:01:05 +08:00
|
|
|
struct lwtunnel_state *new_state;
|
|
|
|
struct ip_tunnel_info *tun_info;
|
|
|
|
int err, opt_len;
|
2015-07-21 16:44:00 +08:00
|
|
|
|
netlink: make validation more configurable for future strictness
We currently have two levels of strict validation:
1) liberal (default)
- undefined (type >= max) & NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
- garbage at end of message accepted
2) strict (opt-in)
- NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
Split out parsing strictness into four different options:
* TRAILING - check that there's no trailing data after parsing
attributes (in message or nested)
* MAXTYPE - reject attrs > max known type
* UNSPEC - reject attributes with NLA_UNSPEC policy entries
* STRICT_ATTRS - strictly validate attribute size
The default for future things should be *everything*.
The current *_strict() is a combination of TRAILING and MAXTYPE,
and is renamed to _deprecated_strict().
The current regular parsing has none of this, and is renamed to
*_parse_deprecated().
Additionally it allows us to selectively set one of the new flags
even on old policies. Notably, the UNSPEC flag could be useful in
this case, since it can be arranged (by filling in the policy) to
not be an incompatible userspace ABI change, but would then going
forward prevent forgetting attribute entries. Similar can apply
to the POLICY flag.
We end up with the following renames:
* nla_parse -> nla_parse_deprecated
* nla_parse_strict -> nla_parse_deprecated_strict
* nlmsg_parse -> nlmsg_parse_deprecated
* nlmsg_parse_strict -> nlmsg_parse_deprecated_strict
* nla_parse_nested -> nla_parse_nested_deprecated
* nla_validate_nested -> nla_validate_nested_deprecated
Using spatch, of course:
@@
expression TB, MAX, HEAD, LEN, POL, EXT;
@@
-nla_parse(TB, MAX, HEAD, LEN, POL, EXT)
+nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression TB, MAX, NLA, POL, EXT;
@@
-nla_parse_nested(TB, MAX, NLA, POL, EXT)
+nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT)
@@
expression START, MAX, POL, EXT;
@@
-nla_validate_nested(START, MAX, POL, EXT)
+nla_validate_nested_deprecated(START, MAX, POL, EXT)
@@
expression NLH, HDRLEN, MAX, POL, EXT;
@@
-nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT)
+nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT)
For this patch, don't actually add the strict, non-renamed versions
yet so that it breaks compile if I get it wrong.
Also, while at it, make nla_validate and nla_parse go down to a
common __nla_validate_parse() function to avoid code duplication.
Ultimately, this allows us to have very strict validation for every
new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the
next patch, while existing things will continue to work as is.
In effect then, this adds fully strict validation for any new command.
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-26 20:07:28 +08:00
|
|
|
err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_MAX, attr,
|
|
|
|
ip_tun_policy, extack);
|
2015-07-21 16:44:00 +08:00
|
|
|
if (err < 0)
|
|
|
|
return err;
|
|
|
|
|
2019-11-06 17:01:05 +08:00
|
|
|
opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP_OPTS], extack);
|
|
|
|
if (opt_len < 0)
|
|
|
|
return opt_len;
|
|
|
|
|
|
|
|
new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len);
|
2015-07-21 16:44:00 +08:00
|
|
|
if (!new_state)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
new_state->type = LWTUNNEL_ENCAP_IP;
|
|
|
|
|
|
|
|
tun_info = lwt_tun_info(new_state);
|
|
|
|
|
2019-11-06 17:01:05 +08:00
|
|
|
err = ip_tun_set_opts(tb[LWTUNNEL_IP_OPTS], tun_info, extack);
|
|
|
|
if (err < 0) {
|
|
|
|
lwtstate_free(new_state);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2019-02-23 21:32:54 +08:00
|
|
|
#ifdef CONFIG_DST_CACHE
|
|
|
|
err = dst_cache_init(&tun_info->dst_cache, GFP_KERNEL);
|
|
|
|
if (err) {
|
|
|
|
lwtstate_free(new_state);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2015-08-14 22:40:40 +08:00
|
|
|
if (tb[LWTUNNEL_IP_ID])
|
2016-01-07 06:22:45 +08:00
|
|
|
tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP_ID]);
|
2015-07-21 16:44:00 +08:00
|
|
|
|
2015-08-14 22:40:40 +08:00
|
|
|
if (tb[LWTUNNEL_IP_DST])
|
2016-03-31 18:21:38 +08:00
|
|
|
tun_info->key.u.ipv4.dst = nla_get_in_addr(tb[LWTUNNEL_IP_DST]);
|
2015-07-21 16:44:00 +08:00
|
|
|
|
2015-08-14 22:40:40 +08:00
|
|
|
if (tb[LWTUNNEL_IP_SRC])
|
2016-03-31 18:21:38 +08:00
|
|
|
tun_info->key.u.ipv4.src = nla_get_in_addr(tb[LWTUNNEL_IP_SRC]);
|
2015-07-21 16:44:00 +08:00
|
|
|
|
2015-08-14 22:40:40 +08:00
|
|
|
if (tb[LWTUNNEL_IP_TTL])
|
2015-08-20 19:56:24 +08:00
|
|
|
tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]);
|
2015-07-21 16:44:00 +08:00
|
|
|
|
2015-08-14 22:40:40 +08:00
|
|
|
if (tb[LWTUNNEL_IP_TOS])
|
2015-08-20 19:56:24 +08:00
|
|
|
tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]);
|
2015-07-21 16:44:00 +08:00
|
|
|
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 23:23:53 +08:00
|
|
|
if (tb[LWTUNNEL_IP_FLAGS]) {
|
|
|
|
IP_TUNNEL_DECLARE_FLAGS(flags);
|
|
|
|
|
|
|
|
ip_tunnel_flags_from_be16(flags,
|
|
|
|
nla_get_be16(tb[LWTUNNEL_IP_FLAGS]));
|
|
|
|
ip_tunnel_clear_options_present(flags);
|
|
|
|
|
|
|
|
ip_tunnel_flags_or(tun_info->key.tun_flags,
|
|
|
|
tun_info->key.tun_flags, flags);
|
|
|
|
}
|
2015-07-21 16:44:00 +08:00
|
|
|
|
|
|
|
tun_info->mode = IP_TUNNEL_INFO_TX;
|
2019-11-06 17:01:05 +08:00
|
|
|
tun_info->options_len = opt_len;
|
2015-07-21 16:44:00 +08:00
|
|
|
|
|
|
|
*ts = new_state;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-02-23 21:32:54 +08:00
|
|
|
/*
 * Release the resources attached to an IP tunnel lwtunnel state.
 *
 * The only thing torn down here is the dst cache embedded in the tunnel
 * info, and only when the kernel is built with CONFIG_DST_CACHE; without
 * that option the function body is empty.
 */
static void ip_tun_destroy_state(struct lwtunnel_state *lwtstate)
{
#ifdef CONFIG_DST_CACHE
	dst_cache_destroy(&lwt_tun_info(lwtstate)->dst_cache);
#endif
}
|
|
|
|
|
2019-11-06 17:01:05 +08:00
|
|
|
static int ip_tun_fill_encap_opts_geneve(struct sk_buff *skb,
|
|
|
|
struct ip_tunnel_info *tun_info)
|
|
|
|
{
|
|
|
|
struct geneve_opt *opt;
|
|
|
|
struct nlattr *nest;
|
2019-11-19 17:39:11 +08:00
|
|
|
int offset = 0;
|
2019-11-06 17:01:05 +08:00
|
|
|
|
|
|
|
nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_GENEVE);
|
|
|
|
if (!nest)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2019-11-19 17:39:11 +08:00
|
|
|
while (tun_info->options_len > offset) {
|
|
|
|
opt = ip_tunnel_info_opts(tun_info) + offset;
|
|
|
|
if (nla_put_be16(skb, LWTUNNEL_IP_OPT_GENEVE_CLASS,
|
|
|
|
opt->opt_class) ||
|
|
|
|
nla_put_u8(skb, LWTUNNEL_IP_OPT_GENEVE_TYPE, opt->type) ||
|
|
|
|
nla_put(skb, LWTUNNEL_IP_OPT_GENEVE_DATA, opt->length * 4,
|
|
|
|
opt->opt_data)) {
|
|
|
|
nla_nest_cancel(skb, nest);
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
offset += sizeof(*opt) + opt->length * 4;
|
2019-11-06 17:01:05 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
nla_nest_end(skb, nest);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-11-06 17:01:06 +08:00
|
|
|
static int ip_tun_fill_encap_opts_vxlan(struct sk_buff *skb,
|
|
|
|
struct ip_tunnel_info *tun_info)
|
|
|
|
{
|
|
|
|
struct vxlan_metadata *md;
|
|
|
|
struct nlattr *nest;
|
|
|
|
|
|
|
|
nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_VXLAN);
|
|
|
|
if (!nest)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
md = ip_tunnel_info_opts(tun_info);
|
|
|
|
if (nla_put_u32(skb, LWTUNNEL_IP_OPT_VXLAN_GBP, md->gbp)) {
|
|
|
|
nla_nest_cancel(skb, nest);
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
nla_nest_end(skb, nest);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-11-06 17:01:07 +08:00
|
|
|
static int ip_tun_fill_encap_opts_erspan(struct sk_buff *skb,
|
|
|
|
struct ip_tunnel_info *tun_info)
|
|
|
|
{
|
|
|
|
struct erspan_metadata *md;
|
|
|
|
struct nlattr *nest;
|
|
|
|
|
|
|
|
nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_ERSPAN);
|
|
|
|
if (!nest)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
md = ip_tunnel_info_opts(tun_info);
|
2019-11-18 18:10:12 +08:00
|
|
|
if (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_VER, md->version))
|
2019-11-06 17:01:07 +08:00
|
|
|
goto err;
|
|
|
|
|
|
|
|
if (md->version == 1 &&
|
|
|
|
nla_put_be32(skb, LWTUNNEL_IP_OPT_ERSPAN_INDEX, md->u.index))
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
if (md->version == 2 &&
|
|
|
|
(nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_DIR, md->u.md2.dir) ||
|
|
|
|
nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_HWID,
|
|
|
|
get_hwid(&md->u.md2))))
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
nla_nest_end(skb, nest);
|
|
|
|
return 0;
|
|
|
|
err:
|
|
|
|
nla_nest_cancel(skb, nest);
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
2019-11-06 17:01:05 +08:00
|
|
|
static int ip_tun_fill_encap_opts(struct sk_buff *skb, int type,
|
|
|
|
struct ip_tunnel_info *tun_info)
|
|
|
|
{
|
|
|
|
struct nlattr *nest;
|
|
|
|
int err = 0;
|
|
|
|
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 23:23:53 +08:00
|
|
|
if (!ip_tunnel_is_options_present(tun_info->key.tun_flags))
|
2019-11-06 17:01:05 +08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
nest = nla_nest_start_noflag(skb, type);
|
|
|
|
if (!nest)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 23:23:53 +08:00
|
|
|
if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_info->key.tun_flags))
|
2019-11-06 17:01:05 +08:00
|
|
|
err = ip_tun_fill_encap_opts_geneve(skb, tun_info);
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 23:23:53 +08:00
|
|
|
else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, tun_info->key.tun_flags))
|
2019-11-06 17:01:06 +08:00
|
|
|
err = ip_tun_fill_encap_opts_vxlan(skb, tun_info);
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 23:23:53 +08:00
|
|
|
else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_info->key.tun_flags))
|
2019-11-06 17:01:07 +08:00
|
|
|
err = ip_tun_fill_encap_opts_erspan(skb, tun_info);
|
2019-11-06 17:01:05 +08:00
|
|
|
|
|
|
|
if (err) {
|
|
|
|
nla_nest_cancel(skb, nest);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
nla_nest_end(skb, nest);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-07-21 16:44:00 +08:00
|
|
|
static int ip_tun_fill_encap_info(struct sk_buff *skb,
|
|
|
|
struct lwtunnel_state *lwtstate)
|
|
|
|
{
|
|
|
|
struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
|
|
|
|
|
2016-04-22 23:31:18 +08:00
|
|
|
if (nla_put_be64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id,
|
|
|
|
LWTUNNEL_IP_PAD) ||
|
2016-03-31 18:21:38 +08:00
|
|
|
nla_put_in_addr(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) ||
|
|
|
|
nla_put_in_addr(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) ||
|
2015-08-20 19:56:24 +08:00
|
|
|
nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) ||
|
|
|
|
nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) ||
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 23:23:53 +08:00
|
|
|
nla_put_be16(skb, LWTUNNEL_IP_FLAGS,
|
|
|
|
ip_tunnel_flags_to_be16(tun_info->key.tun_flags)) ||
|
2019-11-06 17:01:05 +08:00
|
|
|
ip_tun_fill_encap_opts(skb, LWTUNNEL_IP_OPTS, tun_info))
|
2015-07-21 16:44:00 +08:00
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-11-06 17:01:05 +08:00
|
|
|
static int ip_tun_opts_nlsize(struct ip_tunnel_info *info)
|
|
|
|
{
|
|
|
|
int opt_len;
|
|
|
|
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 23:23:53 +08:00
|
|
|
if (!ip_tunnel_is_options_present(info->key.tun_flags))
|
2019-11-06 17:01:05 +08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
opt_len = nla_total_size(0); /* LWTUNNEL_IP_OPTS */
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 23:23:53 +08:00
|
|
|
if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags)) {
|
2019-11-19 17:39:11 +08:00
|
|
|
struct geneve_opt *opt;
|
|
|
|
int offset = 0;
|
|
|
|
|
|
|
|
opt_len += nla_total_size(0); /* LWTUNNEL_IP_OPTS_GENEVE */
|
|
|
|
while (info->options_len > offset) {
|
|
|
|
opt = ip_tunnel_info_opts(info) + offset;
|
|
|
|
opt_len += nla_total_size(2) /* OPT_GENEVE_CLASS */
|
|
|
|
+ nla_total_size(1) /* OPT_GENEVE_TYPE */
|
|
|
|
+ nla_total_size(opt->length * 4);
|
|
|
|
/* OPT_GENEVE_DATA */
|
|
|
|
offset += sizeof(*opt) + opt->length * 4;
|
|
|
|
}
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 23:23:53 +08:00
|
|
|
} else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags)) {
|
2019-11-06 17:01:06 +08:00
|
|
|
opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_VXLAN */
|
|
|
|
+ nla_total_size(4); /* OPT_VXLAN_GBP */
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences with
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 23:23:53 +08:00
|
|
|
} else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags)) {
|
2019-11-10 12:21:18 +08:00
|
|
|
struct erspan_metadata *md = ip_tunnel_info_opts(info);
|
|
|
|
|
2019-11-06 17:01:07 +08:00
|
|
|
opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_ERSPAN */
|
|
|
|
+ nla_total_size(1) /* OPT_ERSPAN_VER */
|
2019-11-10 12:21:18 +08:00
|
|
|
+ (md->version == 1 ? nla_total_size(4)
|
|
|
|
/* OPT_ERSPAN_INDEX (v1) */
|
|
|
|
: nla_total_size(1) +
|
|
|
|
nla_total_size(1));
|
|
|
|
/* OPT_ERSPAN_DIR + HWID (v2) */
|
2019-11-06 17:01:05 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return opt_len;
|
|
|
|
}
|
|
|
|
|
2015-07-21 16:44:00 +08:00
|
|
|
/*
 * Sum of the netlink attribute sizes used when dumping an IPv4 tunnel
 * encap state: one entry per fixed LWTUNNEL_IP_* attribute plus whatever
 * ip_tun_opts_nlsize() reports for the tunnel options of @lwtstate.
 */
static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
{
	int size;

	size  = nla_total_size_64bit(8);	/* LWTUNNEL_IP_ID */
	size += nla_total_size(4);		/* LWTUNNEL_IP_DST */
	size += nla_total_size(4);		/* LWTUNNEL_IP_SRC */
	size += nla_total_size(1);		/* LWTUNNEL_IP_TOS */
	size += nla_total_size(1);		/* LWTUNNEL_IP_TTL */
	size += nla_total_size(2);		/* LWTUNNEL_IP_FLAGS */

	/* LWTUNNEL_IP_OPTS */
	size += ip_tun_opts_nlsize(lwt_tun_info(lwtstate));

	return size;
}
|
|
|
|
|
2015-08-19 00:42:09 +08:00
|
|
|
static int ip_tun_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b)
|
|
|
|
{
|
2019-11-06 17:01:04 +08:00
|
|
|
struct ip_tunnel_info *info_a = lwt_tun_info(a);
|
|
|
|
struct ip_tunnel_info *info_b = lwt_tun_info(b);
|
|
|
|
|
|
|
|
return memcmp(info_a, info_b, sizeof(info_a->key)) ||
|
|
|
|
info_a->mode != info_b->mode ||
|
|
|
|
info_a->options_len != info_b->options_len ||
|
|
|
|
memcmp(ip_tunnel_info_opts(info_a),
|
|
|
|
ip_tunnel_info_opts(info_b), info_a->options_len);
|
2015-08-19 00:42:09 +08:00
|
|
|
}
|
|
|
|
|
2015-07-21 16:44:00 +08:00
|
|
|
static const struct lwtunnel_encap_ops ip_tun_lwt_ops = {
|
|
|
|
.build_state = ip_tun_build_state,
|
2019-02-23 21:32:54 +08:00
|
|
|
.destroy_state = ip_tun_destroy_state,
|
2015-07-21 16:44:00 +08:00
|
|
|
.fill_encap = ip_tun_fill_encap_info,
|
|
|
|
.get_encap_size = ip_tun_encap_nlsize,
|
2015-08-19 00:42:09 +08:00
|
|
|
.cmp_encap = ip_tun_cmp_encap,
|
2017-01-25 00:26:47 +08:00
|
|
|
.owner = THIS_MODULE,
|
2015-07-21 16:44:00 +08:00
|
|
|
};
|
|
|
|
|
2015-08-20 19:56:32 +08:00
|
|
|
static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
|
2019-11-21 18:11:27 +08:00
|
|
|
[LWTUNNEL_IP6_UNSPEC] = { .strict_start_type = LWTUNNEL_IP6_OPTS },
|
2015-08-20 19:56:32 +08:00
|
|
|
[LWTUNNEL_IP6_ID] = { .type = NLA_U64 },
|
|
|
|
[LWTUNNEL_IP6_DST] = { .len = sizeof(struct in6_addr) },
|
|
|
|
[LWTUNNEL_IP6_SRC] = { .len = sizeof(struct in6_addr) },
|
|
|
|
[LWTUNNEL_IP6_HOPLIMIT] = { .type = NLA_U8 },
|
|
|
|
[LWTUNNEL_IP6_TC] = { .type = NLA_U8 },
|
|
|
|
[LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 },
|
2019-11-21 18:11:27 +08:00
|
|
|
[LWTUNNEL_IP6_OPTS] = { .type = NLA_NESTED },
|
2015-08-20 19:56:32 +08:00
|
|
|
};
|
|
|
|
|
2020-03-28 06:00:21 +08:00
|
|
|
static int ip6_tun_build_state(struct net *net, struct nlattr *attr,
|
2015-08-25 00:45:41 +08:00
|
|
|
unsigned int family, const void *cfg,
|
2017-05-28 06:19:28 +08:00
|
|
|
struct lwtunnel_state **ts,
|
|
|
|
struct netlink_ext_ack *extack)
|
2015-08-20 19:56:32 +08:00
|
|
|
{
|
|
|
|
struct nlattr *tb[LWTUNNEL_IP6_MAX + 1];
|
2019-11-06 17:01:05 +08:00
|
|
|
struct lwtunnel_state *new_state;
|
|
|
|
struct ip_tunnel_info *tun_info;
|
|
|
|
int err, opt_len;
|
2015-08-20 19:56:32 +08:00
|
|
|
|
netlink: make validation more configurable for future strictness
We currently have two levels of strict validation:
1) liberal (default)
- undefined (type >= max) & NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
- garbage at end of message accepted
2) strict (opt-in)
- NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
Split out parsing strictness into four different options:
* TRAILING - check that there's no trailing data after parsing
attributes (in message or nested)
* MAXTYPE - reject attrs > max known type
* UNSPEC - reject attributes with NLA_UNSPEC policy entries
* STRICT_ATTRS - strictly validate attribute size
The default for future things should be *everything*.
The current *_strict() is a combination of TRAILING and MAXTYPE,
and is renamed to _deprecated_strict().
The current regular parsing has none of this, and is renamed to
*_parse_deprecated().
Additionally it allows us to selectively set one of the new flags
even on old policies. Notably, the UNSPEC flag could be useful in
this case, since it can be arranged (by filling in the policy) to
not be an incompatible userspace ABI change, but would then going
forward prevent forgetting attribute entries. Similar can apply
to the POLICY flag.
We end up with the following renames:
* nla_parse -> nla_parse_deprecated
* nla_parse_strict -> nla_parse_deprecated_strict
* nlmsg_parse -> nlmsg_parse_deprecated
* nlmsg_parse_strict -> nlmsg_parse_deprecated_strict
* nla_parse_nested -> nla_parse_nested_deprecated
* nla_validate_nested -> nla_validate_nested_deprecated
Using spatch, of course:
@@
expression TB, MAX, HEAD, LEN, POL, EXT;
@@
-nla_parse(TB, MAX, HEAD, LEN, POL, EXT)
+nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression TB, MAX, NLA, POL, EXT;
@@
-nla_parse_nested(TB, MAX, NLA, POL, EXT)
+nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT)
@@
expression START, MAX, POL, EXT;
@@
-nla_validate_nested(START, MAX, POL, EXT)
+nla_validate_nested_deprecated(START, MAX, POL, EXT)
@@
expression NLH, HDRLEN, MAX, POL, EXT;
@@
-nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT)
+nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT)
For this patch, don't actually add the strict, non-renamed versions
yet so that it breaks compile if I get it wrong.
Also, while at it, make nla_validate and nla_parse go down to a
common __nla_validate_parse() function to avoid code duplication.
Ultimately, this allows us to have very strict validation for every
new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the
next patch, while existing things will continue to work as is.
In effect then, this adds fully strict validation for any new command.
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-26 20:07:28 +08:00
|
|
|
err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP6_MAX, attr,
|
|
|
|
ip6_tun_policy, extack);
|
2015-08-20 19:56:32 +08:00
|
|
|
if (err < 0)
|
|
|
|
return err;
|
|
|
|
|
2019-11-06 17:01:05 +08:00
|
|
|
opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP6_OPTS], extack);
|
|
|
|
if (opt_len < 0)
|
|
|
|
return opt_len;
|
|
|
|
|
|
|
|
new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len);
|
2015-08-20 19:56:32 +08:00
|
|
|
if (!new_state)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
new_state->type = LWTUNNEL_ENCAP_IP6;
|
|
|
|
|
|
|
|
tun_info = lwt_tun_info(new_state);
|
|
|
|
|
2019-11-06 17:01:05 +08:00
|
|
|
err = ip_tun_set_opts(tb[LWTUNNEL_IP6_OPTS], tun_info, extack);
|
|
|
|
if (err < 0) {
|
|
|
|
lwtstate_free(new_state);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2015-08-20 19:56:32 +08:00
|
|
|
if (tb[LWTUNNEL_IP6_ID])
|
2016-01-07 06:22:45 +08:00
|
|
|
tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP6_ID]);
|
2015-08-20 19:56:32 +08:00
|
|
|
|
|
|
|
if (tb[LWTUNNEL_IP6_DST])
|
|
|
|
tun_info->key.u.ipv6.dst = nla_get_in6_addr(tb[LWTUNNEL_IP6_DST]);
|
|
|
|
|
|
|
|
if (tb[LWTUNNEL_IP6_SRC])
|
|
|
|
tun_info->key.u.ipv6.src = nla_get_in6_addr(tb[LWTUNNEL_IP6_SRC]);
|
|
|
|
|
|
|
|
if (tb[LWTUNNEL_IP6_HOPLIMIT])
|
|
|
|
tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP6_HOPLIMIT]);
|
|
|
|
|
|
|
|
if (tb[LWTUNNEL_IP6_TC])
|
|
|
|
tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]);
|
|
|
|
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 23:23:53 +08:00
|
|
|
if (tb[LWTUNNEL_IP6_FLAGS]) {
|
|
|
|
IP_TUNNEL_DECLARE_FLAGS(flags);
|
|
|
|
__be16 data;
|
|
|
|
|
|
|
|
data = nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]);
|
|
|
|
ip_tunnel_flags_from_be16(flags, data);
|
|
|
|
ip_tunnel_clear_options_present(flags);
|
|
|
|
|
|
|
|
ip_tunnel_flags_or(tun_info->key.tun_flags,
|
|
|
|
tun_info->key.tun_flags, flags);
|
|
|
|
}
|
2015-08-20 19:56:32 +08:00
|
|
|
|
2015-08-29 02:48:20 +08:00
|
|
|
tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6;
|
2019-11-06 17:01:05 +08:00
|
|
|
tun_info->options_len = opt_len;
|
2015-08-20 19:56:32 +08:00
|
|
|
|
|
|
|
*ts = new_state;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int ip6_tun_fill_encap_info(struct sk_buff *skb,
|
|
|
|
struct lwtunnel_state *lwtstate)
|
|
|
|
{
|
|
|
|
struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
|
|
|
|
|
2016-04-22 23:31:18 +08:00
|
|
|
if (nla_put_be64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id,
|
|
|
|
LWTUNNEL_IP6_PAD) ||
|
2015-08-20 19:56:32 +08:00
|
|
|
nla_put_in6_addr(skb, LWTUNNEL_IP6_DST, &tun_info->key.u.ipv6.dst) ||
|
|
|
|
nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) ||
|
2016-03-28 00:06:11 +08:00
|
|
|
nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) ||
|
|
|
|
nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.ttl) ||
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 23:23:53 +08:00
|
|
|
nla_put_be16(skb, LWTUNNEL_IP6_FLAGS,
|
|
|
|
ip_tunnel_flags_to_be16(tun_info->key.tun_flags)) ||
|
2019-11-06 17:01:05 +08:00
|
|
|
ip_tun_fill_encap_opts(skb, LWTUNNEL_IP6_OPTS, tun_info))
|
2015-08-20 19:56:32 +08:00
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Sum of the netlink attribute sizes used when dumping an IPv6 tunnel
 * encap state: one entry per fixed LWTUNNEL_IP6_* attribute plus whatever
 * ip_tun_opts_nlsize() reports for the tunnel options of @lwtstate.
 */
static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
{
	int size;

	size  = nla_total_size_64bit(8);	/* LWTUNNEL_IP6_ID */
	size += nla_total_size(16);		/* LWTUNNEL_IP6_DST */
	size += nla_total_size(16);		/* LWTUNNEL_IP6_SRC */
	size += nla_total_size(1);		/* LWTUNNEL_IP6_HOPLIMIT */
	size += nla_total_size(1);		/* LWTUNNEL_IP6_TC */
	size += nla_total_size(2);		/* LWTUNNEL_IP6_FLAGS */

	/* LWTUNNEL_IP6_OPTS */
	size += ip_tun_opts_nlsize(lwt_tun_info(lwtstate));

	return size;
}
|
|
|
|
|
|
|
|
static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = {
|
|
|
|
.build_state = ip6_tun_build_state,
|
|
|
|
.fill_encap = ip6_tun_fill_encap_info,
|
|
|
|
.get_encap_size = ip6_tun_encap_nlsize,
|
|
|
|
.cmp_encap = ip_tun_cmp_encap,
|
2017-01-25 00:26:47 +08:00
|
|
|
.owner = THIS_MODULE,
|
2015-08-20 19:56:32 +08:00
|
|
|
};
|
|
|
|
|
2015-07-23 16:08:44 +08:00
|
|
|
/* Register the IPv4 and IPv6 lightweight-tunnel encap ops tables. */
void __init ip_tunnel_core_init(void)
{
	/*
	 * Build-time tripwire: if IP_TUNNEL_OPTS_MAX stops being 255, check
	 * whether growing ip_tunnel_info's options_len is still reasonable
	 * for its users in front ends (e.g. it is part of flow keys).
	 */
	BUILD_BUG_ON(IP_TUNNEL_OPTS_MAX != 255);

	lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP);
	lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6);
}
|
2015-07-21 16:44:01 +08:00
|
|
|
|
2018-05-09 00:06:58 +08:00
|
|
|
/*
 * Static key, initially false, adjusted by ip_tunnel_need_metadata() and
 * ip_tunnel_unneed_metadata().
 */
DEFINE_STATIC_KEY_FALSE(ip_tunnel_metadata_cnt);
EXPORT_SYMBOL(ip_tunnel_metadata_cnt);
|
|
|
|
|
|
|
|
void ip_tunnel_need_metadata(void)
|
|
|
|
{
|
2018-05-09 00:06:58 +08:00
|
|
|
static_branch_inc(&ip_tunnel_metadata_cnt);
|
2015-07-21 16:44:01 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(ip_tunnel_need_metadata);
|
|
|
|
|
|
|
|
void ip_tunnel_unneed_metadata(void)
|
|
|
|
{
|
2018-05-09 00:06:58 +08:00
|
|
|
static_branch_dec(&ip_tunnel_metadata_cnt);
|
2015-07-21 16:44:01 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata);
|
2020-06-30 09:06:18 +08:00
|
|
|
|
|
|
|
/* Returns either the correct skb->protocol value, or 0 if invalid. */
|
|
|
|
__be16 ip_tunnel_parse_protocol(const struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
if (skb_network_header(skb) >= skb->head &&
|
|
|
|
(skb_network_header(skb) + sizeof(struct iphdr)) <= skb_tail_pointer(skb) &&
|
|
|
|
ip_hdr(skb)->version == 4)
|
|
|
|
return htons(ETH_P_IP);
|
|
|
|
if (skb_network_header(skb) >= skb->head &&
|
|
|
|
(skb_network_header(skb) + sizeof(struct ipv6hdr)) <= skb_tail_pointer(skb) &&
|
|
|
|
ipv6_hdr(skb)->version == 6)
|
|
|
|
return htons(ETH_P_IPV6);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(ip_tunnel_parse_protocol);
|
|
|
|
|
|
|
|
/* header_ops that only provide ip_tunnel_parse_protocol(). */
const struct header_ops ip_tunnel_header_ops = {
	.parse_protocol	= ip_tunnel_parse_protocol,
};
EXPORT_SYMBOL(ip_tunnel_header_ops);
|
2022-09-29 21:52:02 +08:00
|
|
|
|
|
|
|
/* This function returns true when ENCAP attributes are present in the nl msg */
|
|
|
|
bool ip_tunnel_netlink_encap_parms(struct nlattr *data[],
|
|
|
|
struct ip_tunnel_encap *encap)
|
|
|
|
{
|
|
|
|
bool ret = false;
|
|
|
|
|
|
|
|
memset(encap, 0, sizeof(*encap));
|
|
|
|
|
|
|
|
if (!data)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
if (data[IFLA_IPTUN_ENCAP_TYPE]) {
|
|
|
|
ret = true;
|
|
|
|
encap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (data[IFLA_IPTUN_ENCAP_FLAGS]) {
|
|
|
|
ret = true;
|
|
|
|
encap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (data[IFLA_IPTUN_ENCAP_SPORT]) {
|
|
|
|
ret = true;
|
|
|
|
encap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (data[IFLA_IPTUN_ENCAP_DPORT]) {
|
|
|
|
ret = true;
|
|
|
|
encap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]);
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(ip_tunnel_netlink_encap_parms);
|
2022-09-29 21:52:03 +08:00
|
|
|
|
|
|
|
void ip_tunnel_netlink_parms(struct nlattr *data[],
|
2024-03-27 23:23:52 +08:00
|
|
|
struct ip_tunnel_parm_kern *parms)
|
2022-09-29 21:52:03 +08:00
|
|
|
{
|
|
|
|
if (data[IFLA_IPTUN_LINK])
|
|
|
|
parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]);
|
|
|
|
|
|
|
|
if (data[IFLA_IPTUN_LOCAL])
|
|
|
|
parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]);
|
|
|
|
|
|
|
|
if (data[IFLA_IPTUN_REMOTE])
|
|
|
|
parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]);
|
|
|
|
|
|
|
|
if (data[IFLA_IPTUN_TTL]) {
|
|
|
|
parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]);
|
|
|
|
if (parms->iph.ttl)
|
|
|
|
parms->iph.frag_off = htons(IP_DF);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (data[IFLA_IPTUN_TOS])
|
|
|
|
parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]);
|
|
|
|
|
|
|
|
if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
|
|
|
|
parms->iph.frag_off = htons(IP_DF);
|
|
|
|
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 23:23:53 +08:00
|
|
|
if (data[IFLA_IPTUN_FLAGS]) {
|
|
|
|
__be16 flags;
|
|
|
|
|
|
|
|
flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]);
|
|
|
|
ip_tunnel_flags_from_be16(parms->i_flags, flags);
|
|
|
|
}
|
2022-09-29 21:52:03 +08:00
|
|
|
|
|
|
|
if (data[IFLA_IPTUN_PROTO])
|
|
|
|
parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(ip_tunnel_netlink_parms);
|