// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Nicira, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif
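
/* Tunnel buckets are selected by hashing the tunnel key XORed with the
 * remote address, folded down to IP_TNL_HASH_BITS bits.
 */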
static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
		       IP_TNL_HASH_BITS);
}
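
/* A packet carrying IP_TUNNEL_KEY_BIT matches only tunnels configured with
 * the same key; a packet without it matches only keyless tunnels.
 */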
static bool ip_tunnel_key_match(const struct ip_tunnel_parm_kern *p,
				const unsigned long *flags, __be32 key)
{
	if (!test_bit(IP_TUNNEL_KEY_BIT, flags))
		return !test_bit(IP_TUNNEL_KEY_BIT, p->i_flags);

	return test_bit(IP_TUNNEL_KEY_BIT, p->i_flags) && p->i_key == key;
}

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless
   tunnel, will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
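
/* The lookup below runs in decreasing order of specificity: fully specified
 * (saddr, daddr) tunnels first, then daddr-only, then saddr-only or local
 * multicast, and finally key-only wildcard tunnels.  Within each pass a
 * tunnel on the packet's incoming link is preferred; if all passes fail,
 * the collect_md tunnel and then the fallback device are tried.
 */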
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, const unsigned long *flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;
	struct net_device *ndev;
	unsigned int hash;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		if (!cand)
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((!test_bit(IP_TUNNEL_NO_KEY_BIT, flags) &&
		     t->parms.i_key != key) ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		if (!cand)
			cand = t;
	}

	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	ndev = READ_ONCE(itn->fb_tunnel_dev);
	if (ndev && ndev->flags & IFF_UP)
		return netdev_priv(ndev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
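
/* ip_bucket() must mirror the hashing used at lookup time.  VTI tunnels set
 * IP_TUNNEL_VTI_BIT but not IP_TUNNEL_KEY_BIT; their i_key is used internally
 * (as an IPsec mark) rather than as an on-wire key, so they hash as keyless.
 */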
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm_kern *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!test_bit(IP_TUNNEL_KEY_BIT, parms->i_flags) &&
	    test_bit(IP_TUNNEL_VTI_BIT, parms->i_flags))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}
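
/* Unlike ip_tunnel_lookup(), ip_tunnel_find() scans a single bucket for an
 * exact match on addresses, link, device type and key, with no wildcard
 * fallbacks.
 */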
static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm_kern *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	IP_TUNNEL_DECLARE_FLAGS(flags);
	__be32 key = parms->i_key;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	ip_tunnel_flags_copy(flags, parms->i_flags);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == READ_ONCE(t->parms.link) &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm_kern *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	err = -E2BIG;
	if (parms->name[0]) {
		if (!dev_valid_name(parms->name))
			goto failed;
		strscpy(name, parms->name, IFNAMSIZ);
	} else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3))
			goto failed;
		strcpy(name, ops->kind);
		strcat(name, "%d");
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}
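
/* Returns the MTU the tunnel device should use (never below IPV4_MIN_MTU)
 * and updates dev->needed_headroom based on the guessed underlay device.
 */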
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
				    iph->saddr, tunnel->parms.o_key,
				    RT_TOS(iph->tos), dev_net(dev),
				    tunnel->parms.link, tunnel->fwmark, 0, 0);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = min(tdev->mtu, IP_MAX_MTU);
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0);

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm_kern *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;
	int mtu;
	int err;

	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	mtu = ip_tunnel_bind_dev(dev);
	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = IP_MAX_MTU - t_hlen;
	if (dev->type == ARPHRD_ETHER)
		dev->max_mtu -= dev->hard_header_len;

	ip_tunnel_add(itn, nt);
	return nt;

err_dev_set_mtu:
	unregister_netdevice(dev);
	return ERR_PTR(err);
}
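
/* If the outer packet is UDP-encapsulated, record the outer UDP source and
 * destination ports in the tunnel metadata, so that consumers of the
 * metadata can see how the packet was encapsulated.
 */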
void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info)
{
	const struct iphdr *iph = ip_hdr(skb);
	const struct udphdr *udph;

	if (iph->protocol != IPPROTO_UDP)
		return;

	udph = (struct udphdr *)((__u8 *)iph + (iph->ihl << 2));
	info->encap.sport = udph->source;
	info->encap.dport = udph->dest;
}
EXPORT_SYMBOL(ip_tunnel_md_udp_encap);
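
/* Common receive path for IPv4 tunnels: validate the checksum and sequence
 * flags against the tunnel configuration, decapsulate ECN, update stats and
 * hand the inner packet to the GRO cells.
 */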
|
|
|
|
|
2013-03-25 22:49:35 +08:00
|
|
|
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
|
2015-08-08 14:51:42 +08:00
|
|
|
const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
|
|
|
|
bool log_ecn_error)
|
2013-03-25 22:49:35 +08:00
|
|
|
{
|
|
|
|
const struct iphdr *iph = ip_hdr(skb);
|
2024-03-07 18:07:16 +08:00
|
|
|
int nh, err;
|
2013-03-25 22:49:35 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_NET_IPGRE_BROADCAST
|
|
|
|
if (ipv4_is_multicast(iph->daddr)) {
|
2022-11-15 16:53:58 +08:00
|
|
|
DEV_STATS_INC(tunnel->dev, multicast);
|
2013-03-25 22:49:35 +08:00
|
|
|
skb->pkt_type = PACKET_BROADCAST;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 23:23:53 +08:00
|
|
|
if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) !=
|
|
|
|
test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) {
|
2022-11-15 16:53:58 +08:00
|
|
|
DEV_STATS_INC(tunnel->dev, rx_crc_errors);
|
|
|
|
DEV_STATS_INC(tunnel->dev, rx_errors);
|
2013-03-25 22:49:35 +08:00
|
|
|
goto drop;
|
|
|
|
}
|
|
|
|
|
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 23:23:53 +08:00
|
|
|
if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) {
|
|
|
|
if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) ||
|
2013-03-25 22:49:35 +08:00
|
|
|
(tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
|
2022-11-15 16:53:58 +08:00
|
|
|
DEV_STATS_INC(tunnel->dev, rx_fifo_errors);
|
|
|
|
DEV_STATS_INC(tunnel->dev, rx_errors);
|
2013-03-25 22:49:35 +08:00
|
|
|
goto drop;
|
|
|
|
}
|
|
|
|
tunnel->i_seqno = ntohl(tpi->seq) + 1;
|
|
|
|
}
|
|
|
|
|
2024-03-07 18:07:16 +08:00
|
|
|
/* Save offset of outer header relative to skb->head,
|
|
|
|
* because we are going to reset the network header to the inner header
|
|
|
|
* and might change skb->head.
|
|
|
|
*/
|
|
|
|
nh = skb_network_header(skb) - skb->head;
|
|
|
|
|
net: Set true network header for ECN decapsulation
In cases where the header straight after the tunnel header was
another ethernet header (TEB), instead of the network header,
the ECN decapsulation code would treat the ethernet header as if
it was an IP header, resulting in mishandling and possible
wrong drops or corruption of the IP header.
In this case, ECT(1) is sent, so IP_ECN_decapsulate tries to copy it to the
inner IPv4 header, and correct its checksum.
The offset of the ECT bits in an IPv4 header corresponds to the
lower 2 bits of the second octet of the destination MAC address
in the ethernet header.
The IPv4 checksum corresponds to end of the source address.
In order to reproduce:
$ ip netns add A
$ ip netns add B
$ ip -n A link add _v0 type veth peer name _v1 netns B
$ ip -n A link set _v0 up
$ ip -n A addr add dev _v0 10.254.3.1/24
$ ip -n A route add default dev _v0 scope global
$ ip -n B link set _v1 up
$ ip -n B addr add dev _v1 10.254.1.6/24
$ ip -n B route add default dev _v1 scope global
$ ip -n B link add gre1 type gretap local 10.254.1.6 remote 10.254.3.1 key 0x49000000
$ ip -n B link set gre1 up
# Now send an IPv4/GRE/Eth/IPv4 frame where the outer header has ECT(1),
# and the inner header has no ECT bits set:
$ cat send_pkt.py
#!/usr/bin/env python3
from scapy.all import *
pkt = IP(b'E\x01\x00\xa7\x00\x00\x00\x00@/`%\n\xfe\x03\x01\n\xfe\x01\x06 \x00eXI\x00'
b'\x00\x00\x18\xbe\x92\xa0\xee&\x18\xb0\x92\xa0l&\x08\x00E\x00\x00}\x8b\x85'
b'@\x00\x01\x01\xe4\xf2\x82\x82\x82\x01\x82\x82\x82\x02\x08\x00d\x11\xa6\xeb'
b'3\x1e\x1e\\xf3\\xf7`\x00\x00\x00\x00ZN\x00\x00\x00\x00\x00\x00\x10\x11\x12'
b'\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./01234'
b'56789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ')
send(pkt)
$ sudo ip netns exec B tcpdump -neqlllvi gre1 icmp & ; sleep 1
$ sudo ip netns exec A python3 send_pkt.py
In the original packet, the source/destinatio MAC addresses are
dst=18:be:92:a0:ee:26 src=18:b0:92:a0:6c:26
In the received packet, they are
dst=18:bd:92:a0:ee:26 src=18:b0:92:a0:6c:27
Thanks to Lahav Schlesinger <lschlesinger@drivenets.com> and Isaac Garzon <isaac@speed.io>
for helping me pinpoint the origin.
Fixes: b723748750ec ("tunnel: Propagate ECT(1) when decapsulating as recommended by RFC6040")
Cc: David S. Miller <davem@davemloft.net>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: David Ahern <dsahern@kernel.org>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Gilad Naaman <gnaaman@drivenets.com>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-23 01:01:28 +08:00
|
|
|
skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0);
|
2014-05-05 06:20:04 +08:00
|
|
|
|
2024-03-07 18:07:16 +08:00
|
|
|
if (!pskb_inet_may_pull(skb)) {
|
|
|
|
DEV_STATS_INC(tunnel->dev, rx_length_errors);
|
|
|
|
DEV_STATS_INC(tunnel->dev, rx_errors);
|
|
|
|
goto drop;
|
|
|
|
}
|
|
|
|
iph = (struct iphdr *)(skb->head + nh);
|
|
|
|
|
2013-03-25 22:49:35 +08:00
|
|
|
err = IP_ECN_decapsulate(iph, skb);
|
|
|
|
if (unlikely(err)) {
|
|
|
|
if (log_ecn_error)
|
|
|
|
net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
|
|
|
|
&iph->saddr, iph->tos);
|
|
|
|
if (err > 1) {
|
2022-11-15 16:53:58 +08:00
|
|
|
DEV_STATS_INC(tunnel->dev, rx_frame_errors);
|
|
|
|
DEV_STATS_INC(tunnel->dev, rx_errors);
|
2013-03-25 22:49:35 +08:00
|
|
|
goto drop;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-10-06 04:37:12 +08:00
|
|
|
dev_sw_netstats_rx_add(tunnel->dev, skb->len);
|
2013-11-13 06:39:13 +08:00
|
|
|
skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
|
|
|
|
|
2013-06-18 08:50:02 +08:00
|
|
|
if (tunnel->dev->type == ARPHRD_ETHER) {
|
|
|
|
skb->protocol = eth_type_trans(skb, tunnel->dev);
|
|
|
|
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
|
|
|
|
} else {
|
|
|
|
skb->dev = tunnel->dev;
|
|
|
|
}
|
2013-08-13 23:51:09 +08:00
|
|
|
|
2015-08-08 14:51:42 +08:00
|
|
|
if (tun_dst)
|
|
|
|
skb_dst_set(skb, (struct dst_entry *)tun_dst);
|
|
|
|
|
2013-03-25 22:49:35 +08:00
|
|
|
gro_cells_receive(&tunnel->gro_cells, skb);
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
drop:
|
2017-06-15 10:29:29 +08:00
|
|
|
if (tun_dst)
|
|
|
|
dst_release((struct dst_entry *)tun_dst);
|
2013-03-25 22:49:35 +08:00
|
|
|
kfree_skb(skb);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
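
/* Encapsulation handlers register themselves in the global iptun_encaps[]
 * array; the lock-free cmpxchg() makes (un)registration of a slot atomic.
 */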
int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
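
/* Cache the encapsulation parameters on the tunnel and recompute the total
 * header length (encap header plus tunnel header) used for headroom and MTU.
 */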
int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
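
/* Check the packet against the path MTU of the outer route; if it does not
 * fit, update the inner dst, emit the appropriate ICMP/ICMPv6 error and
 * return -E2BIG so the caller drops the packet.
 */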
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			   struct rtable *rt, __be16 df,
			   const struct iphdr *inner_iph,
			   int tunnel_hlen, __be32 dst, bool md)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size;
	int mtu;

	tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
	pkt_size = skb->len - tunnel_hlen;
	pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;

	if (df) {
		mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
		mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
	} else {
		mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
	}

	if (skb_valid_dst(skb))
		skb_dst_update_pmtu_no_confirm(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6;
		__be32 daddr;

		rt6 = skb_valid_dst(skb) ? dst_rt6_info(skb_dst(skb)) :
					   NULL;
		daddr = md ? dst : tunnel->parms.iph.daddr;

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
		    mtu >= IPV6_MIN_MTU) {
			if ((daddr && !ipv4_is_multicast(daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
		    mtu < pkt_size) {
			icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}

static void ip_tunnel_adj_headroom(struct net_device *dev, unsigned int headroom)
{
	/* we must cap headroom to some upperlimit, else pskb_expand_head
	 * will overflow header offsets in skb_headers_offset_update().
	 */
	static const unsigned int max_allowed = 512;

	if (headroom > max_allowed)
		headroom = max_allowed;

	if (headroom > READ_ONCE(dev->needed_headroom))
		WRITE_ONCE(dev->needed_headroom, headroom);
}
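
/* Transmit path for metadata-based (collect_md) tunnels: the route key,
 * TOS/TTL and the DF bit all come from the per-packet tunnel info rather
 * than from the device configuration.
 */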
|
|
|
|
|
2019-01-22 18:39:50 +08:00
|
|
|
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
|
|
|
|
u8 proto, int tunnel_hlen)
|
2016-09-16 04:00:29 +08:00
|
|
|
{
|
|
|
|
struct ip_tunnel *tunnel = netdev_priv(dev);
|
|
|
|
u32 headroom = sizeof(struct iphdr);
|
|
|
|
struct ip_tunnel_info *tun_info;
|
|
|
|
const struct ip_tunnel_key *key;
|
|
|
|
const struct iphdr *inner_iph;
|
2019-01-22 18:39:49 +08:00
|
|
|
struct rtable *rt = NULL;
|
2016-09-16 04:00:29 +08:00
|
|
|
struct flowi4 fl4;
|
|
|
|
__be16 df = 0;
|
|
|
|
u8 tos, ttl;
|
2019-01-22 18:39:49 +08:00
|
|
|
bool use_cache;
|
2016-09-16 04:00:29 +08:00
|
|
|
|
|
|
|
tun_info = skb_tunnel_info(skb);
|
|
|
|
if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
|
|
|
|
ip_tunnel_info_af(tun_info) != AF_INET))
|
|
|
|
goto tx_error;
|
|
|
|
key = &tun_info->key;
|
|
|
|
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
|
|
|
|
inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
|
|
|
|
tos = key->tos;
|
|
|
|
if (tos == 1) {
|
|
|
|
if (skb->protocol == htons(ETH_P_IP))
|
|
|
|
tos = inner_iph->tos;
|
|
|
|
else if (skb->protocol == htons(ETH_P_IPV6))
|
|
|
|
tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
|
|
|
|
}
|
2019-01-22 18:39:51 +08:00
|
|
|
ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
|
|
|
|
tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
|
2022-08-18 15:41:18 +08:00
|
|
|
dev_net(dev), 0, skb->mark, skb_get_hash(skb),
|
|
|
|
key->flow_flags);
|
2023-04-07 21:38:53 +08:00
|
|
|
|
|
|
|
if (!tunnel_hlen)
|
|
|
|
tunnel_hlen = ip_encap_hlen(&tun_info->encap);
|
|
|
|
|
|
|
|
if (ip_tunnel_encap(skb, &tun_info->encap, &proto, &fl4) < 0)
|
2016-09-16 04:00:29 +08:00
|
|
|
goto tx_error;
|
2019-01-22 18:39:49 +08:00
|
|
|
|
|
|
|
use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
|
|
|
|
if (use_cache)
|
|
|
|
rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
|
|
|
|
if (!rt) {
|
|
|
|
rt = ip_route_output_key(tunnel->net, &fl4);
|
|
|
|
if (IS_ERR(rt)) {
|
2022-11-15 16:53:58 +08:00
|
|
|
DEV_STATS_INC(dev, tx_carrier_errors);
|
2019-01-22 18:39:49 +08:00
|
|
|
goto tx_error;
|
|
|
|
}
|
|
|
|
if (use_cache)
|
|
|
|
dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
|
|
|
|
fl4.saddr);
|
2016-09-16 04:00:29 +08:00
|
|
|
}
|
|
|
|
if (rt->dst.dev == dev) {
|
|
|
|
ip_rt_put(rt);
|
2022-11-15 16:53:58 +08:00
|
|
|
DEV_STATS_INC(dev, collisions);
|
2016-09-16 04:00:29 +08:00
|
|
|
goto tx_error;
|
|
|
|
}
|
2019-01-22 18:39:50 +08:00
|
|
|
|
2024-03-27 23:23:53 +08:00
|
|
|
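/* Only set DF on the outer header when the tunnel metadata explicitly
 * asks for it; tnl_update_pmtu() then checks the packet against the
 * resulting path MTU before we commit to transmission.
 */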
if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags))
|
2019-01-22 18:39:50 +08:00
|
|
|
df = htons(IP_DF);
|
|
|
|
if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
|
|
|
|
key->u.ipv4.dst, true)) {
|
|
|
|
ip_rt_put(rt);
|
|
|
|
goto tx_error;
|
|
|
|
}
|
|
|
|
|
2016-09-16 04:00:29 +08:00
|
|
|
tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
|
|
|
|
ttl = key->ttl;
|
|
|
|
if (ttl == 0) {
|
|
|
|
if (skb->protocol == htons(ETH_P_IP))
|
|
|
|
ttl = inner_iph->ttl;
|
|
|
|
else if (skb->protocol == htons(ETH_P_IPV6))
|
|
|
|
ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
|
|
|
|
else
|
|
|
|
ttl = ip4_dst_hoplimit(&rt->dst);
|
|
|
|
}
|
2019-01-22 18:39:50 +08:00
|
|
|
|
2016-09-16 04:00:29 +08:00
|
|
|
headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
|
2024-02-20 21:56:02 +08:00
|
|
|
if (skb_cow_head(skb, headroom)) {
|
2016-09-16 04:00:29 +08:00
|
|
|
ip_rt_put(rt);
|
|
|
|
goto tx_dropped;
|
|
|
|
}
|
2024-02-20 21:56:02 +08:00
|
|
|
|
|
|
|
ip_tunnel_adj_headroom(dev, headroom);
|
|
|
|
|
2017-09-07 14:08:34 +08:00
|
|
|
iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
|
|
|
|
df, !net_eq(tunnel->net, dev_net(dev)));
|
2016-09-16 04:00:29 +08:00
|
|
|
return;
|
|
|
|
tx_error:
|
2022-11-15 16:53:58 +08:00
|
|
|
DEV_STATS_INC(dev, tx_errors);
|
2016-09-16 04:00:29 +08:00
|
|
|
goto kfree;
|
|
|
|
tx_dropped:
|
2022-11-15 16:53:58 +08:00
|
|
|
DEV_STATS_INC(dev, tx_dropped);
|
2016-09-16 04:00:29 +08:00
|
|
|
kfree:
|
|
|
|
kfree_skb(skb);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
|
|
|
|
|
2013-03-25 22:49:35 +08:00
|
|
|
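/* ip_tunnel_xmit() is the transmit path for classically configured
 * tunnels: the outer header template comes from tnl_params (normally
 * tunnel->parms.iph), with lwtunnel metadata consulted only for NBMA
 * devices (daddr == 0).  Contrast ip_md_tunnel_xmit() above, which is
 * driven entirely by collect_md metadata.
 */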
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
|
2014-09-18 03:25:58 +08:00
|
|
|
const struct iphdr *tnl_params, u8 protocol)
|
2013-03-25 22:49:35 +08:00
|
|
|
{
|
|
|
|
struct ip_tunnel *tunnel = netdev_priv(dev);
|
2019-02-24 08:24:45 +08:00
|
|
|
struct ip_tunnel_info *tun_info = NULL;
|
2013-03-25 22:49:35 +08:00
|
|
|
const struct iphdr *inner_iph;
|
|
|
|
unsigned int max_headroom; /* The extra header space needed */
|
2019-02-24 08:24:45 +08:00
|
|
|
struct rtable *rt = NULL; /* Route to the other host */
|
2022-07-11 17:17:19 +08:00
|
|
|
__be16 payload_protocol;
|
2019-02-24 08:24:45 +08:00
|
|
|
bool use_cache = false;
|
|
|
|
struct flowi4 fl4;
|
|
|
|
bool md = false;
|
2014-05-16 13:34:39 +08:00
|
|
|
bool connected;
|
2019-02-24 08:24:45 +08:00
|
|
|
u8 tos, ttl;
|
|
|
|
__be32 dst;
|
|
|
|
__be16 df;
|
2013-03-25 22:49:35 +08:00
|
|
|
|
|
|
|
inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
|
2014-05-16 13:34:39 +08:00
|
|
|
connected = (tunnel->parms.iph.daddr != 0);
|
2022-07-11 17:17:19 +08:00
|
|
|
payload_protocol = skb_protocol(skb, true);
|
2013-03-25 22:49:35 +08:00
|
|
|
|
2016-02-22 07:58:05 +08:00
|
|
|
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
|
|
|
|
|
2013-03-25 22:49:35 +08:00
|
|
|
dst = tnl_params->daddr;
|
|
|
|
if (dst == 0) {
|
|
|
|
/* NBMA tunnel */
|
|
|
|
|
2015-04-03 16:17:26 +08:00
|
|
|
if (!skb_dst(skb)) {
|
2022-11-15 16:53:58 +08:00
|
|
|
DEV_STATS_INC(dev, tx_fifo_errors);
|
2013-03-25 22:49:35 +08:00
|
|
|
goto tx_error;
|
|
|
|
}
|
|
|
|
|
2019-01-19 13:11:25 +08:00
|
|
|
tun_info = skb_tunnel_info(skb);
|
|
|
|
if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
|
|
|
|
ip_tunnel_info_af(tun_info) == AF_INET &&
|
2019-02-24 08:24:45 +08:00
|
|
|
tun_info->key.u.ipv4.dst) {
|
2019-01-19 13:11:25 +08:00
|
|
|
dst = tun_info->key.u.ipv4.dst;
|
2019-02-24 08:24:45 +08:00
|
|
|
md = true;
|
|
|
|
connected = true;
|
2022-07-11 17:17:19 +08:00
|
|
|
} else if (payload_protocol == htons(ETH_P_IP)) {
|
2013-03-25 22:49:35 +08:00
|
|
|
rt = skb_rtable(skb);
|
|
|
|
dst = rt_nexthop(rt, inner_iph->daddr);
|
|
|
|
}
|
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2022-07-11 17:17:19 +08:00
|
|
|
else if (payload_protocol == htons(ETH_P_IPV6)) {
|
2013-03-25 22:49:35 +08:00
|
|
|
const struct in6_addr *addr6;
|
|
|
|
struct neighbour *neigh;
|
|
|
|
bool do_tx_error_icmp;
|
|
|
|
int addr_type;
|
|
|
|
|
|
|
|
neigh = dst_neigh_lookup(skb_dst(skb),
|
|
|
|
&ipv6_hdr(skb)->daddr);
|
2015-04-03 16:17:26 +08:00
|
|
|
if (!neigh)
|
2013-03-25 22:49:35 +08:00
|
|
|
goto tx_error;
|
|
|
|
|
|
|
|
addr6 = (const struct in6_addr *)&neigh->primary_key;
|
|
|
|
addr_type = ipv6_addr_type(addr6);
|
|
|
|
|
|
|
|
if (addr_type == IPV6_ADDR_ANY) {
|
|
|
|
addr6 = &ipv6_hdr(skb)->daddr;
|
|
|
|
addr_type = ipv6_addr_type(addr6);
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
|
|
|
|
do_tx_error_icmp = true;
|
|
|
|
else {
|
|
|
|
do_tx_error_icmp = false;
|
|
|
|
dst = addr6->s6_addr32[3];
|
|
|
|
}
|
|
|
|
neigh_release(neigh);
|
|
|
|
if (do_tx_error_icmp)
|
|
|
|
goto tx_error_icmp;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
else
|
|
|
|
goto tx_error;
|
2014-01-03 03:48:26 +08:00
|
|
|
|
2019-02-24 08:24:45 +08:00
|
|
|
if (!md)
|
|
|
|
connected = false;
|
2013-03-25 22:49:35 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
tos = tnl_params->tos;
|
|
|
|
if (tos & 0x1) {
|
|
|
|
tos &= ~0x1;
|
2022-07-11 17:17:19 +08:00
|
|
|
if (payload_protocol == htons(ETH_P_IP)) {
|
2013-03-25 22:49:35 +08:00
|
|
|
tos = inner_iph->tos;
|
2014-01-03 03:48:26 +08:00
|
|
|
connected = false;
|
2022-07-11 17:17:19 +08:00
|
|
|
} else if (payload_protocol == htons(ETH_P_IPV6)) {
|
2013-03-25 22:49:35 +08:00
|
|
|
tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
|
2014-01-03 03:48:26 +08:00
|
|
|
connected = false;
|
|
|
|
}
|
2013-03-25 22:49:35 +08:00
|
|
|
}
|
|
|
|
|
2018-03-06 13:53:44 +08:00
|
|
|
ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
|
2022-04-14 01:43:20 +08:00
|
|
|
tunnel->parms.o_key, RT_TOS(tos),
|
2024-02-13 14:32:34 +08:00
|
|
|
dev_net(dev), READ_ONCE(tunnel->parms.link),
|
2022-08-18 15:41:18 +08:00
|
|
|
tunnel->fwmark, skb_get_hash(skb), 0);
|
2014-01-03 03:48:26 +08:00
|
|
|
|
2023-04-07 21:38:53 +08:00
|
|
|
if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0)
|
2014-09-18 03:25:58 +08:00
|
|
|
goto tx_error;
|
|
|
|
|
2019-02-24 08:24:45 +08:00
|
|
|
if (connected && md) {
|
|
|
|
use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
|
|
|
|
if (use_cache)
|
|
|
|
rt = dst_cache_get_ip4(&tun_info->dst_cache,
|
|
|
|
&fl4.saddr);
|
|
|
|
} else {
|
|
|
|
rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
|
|
|
|
&fl4.saddr) : NULL;
|
|
|
|
}
|
2014-01-03 03:48:26 +08:00
|
|
|
|
|
|
|
if (!rt) {
|
|
|
|
rt = ip_route_output_key(tunnel->net, &fl4);
|
|
|
|
|
|
|
|
if (IS_ERR(rt)) {
|
2022-11-15 16:53:58 +08:00
|
|
|
DEV_STATS_INC(dev, tx_carrier_errors);
|
2014-01-03 03:48:26 +08:00
|
|
|
goto tx_error;
|
|
|
|
}
|
2019-02-24 08:24:45 +08:00
|
|
|
if (use_cache)
|
|
|
|
dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
|
|
|
|
fl4.saddr);
|
|
|
|
else if (!md && connected)
|
2016-02-12 22:43:55 +08:00
|
|
|
dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
|
|
|
|
fl4.saddr);
|
2013-03-25 22:49:35 +08:00
|
|
|
}
|
2014-01-03 03:48:26 +08:00
|
|
|
|
2013-06-18 08:49:56 +08:00
|
|
|
if (rt->dst.dev == dev) {
|
2013-03-25 22:49:35 +08:00
|
|
|
ip_rt_put(rt);
|
2022-11-15 16:53:58 +08:00
|
|
|
DEV_STATS_INC(dev, collisions);
|
2013-03-25 22:49:35 +08:00
|
|
|
goto tx_error;
|
|
|
|
}
|
|
|
|
|
2021-01-06 07:15:22 +08:00
|
|
|
df = tnl_params->frag_off;
|
2022-07-11 17:17:19 +08:00
|
|
|
if (payload_protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
|
2021-01-06 07:15:22 +08:00
|
|
|
df |= (inner_iph->frag_off & htons(IP_DF));
|
|
|
|
|
|
|
|
if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
|
2013-07-03 01:57:33 +08:00
|
|
|
ip_rt_put(rt);
|
|
|
|
goto tx_error;
|
2013-03-25 22:49:35 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
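/* While ICMP errors reported against this tunnel are still fresh
 * (within IPTUNNEL_ERR_TIMEO of err_time), notify the sender via
 * dst_link_failure() before transmitting; once the window has passed,
 * the error count is simply cleared.
 */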
if (tunnel->err_count > 0) {
|
|
|
|
if (time_before(jiffies,
|
|
|
|
tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
|
|
|
|
tunnel->err_count--;
|
|
|
|
|
|
|
|
dst_link_failure(skb);
|
|
|
|
} else
|
|
|
|
tunnel->err_count = 0;
|
|
|
|
}
|
|
|
|
|
2013-09-26 00:57:47 +08:00
|
|
|
tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
|
2013-03-25 22:49:35 +08:00
|
|
|
ttl = tnl_params->ttl;
|
|
|
|
if (ttl == 0) {
|
2022-07-11 17:17:19 +08:00
|
|
|
if (payload_protocol == htons(ETH_P_IP))
|
2013-03-25 22:49:35 +08:00
|
|
|
ttl = inner_iph->ttl;
|
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2022-07-11 17:17:19 +08:00
|
|
|
else if (payload_protocol == htons(ETH_P_IPV6))
|
2013-03-25 22:49:35 +08:00
|
|
|
ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
|
|
|
|
#endif
|
|
|
|
else
|
|
|
|
ttl = ip4_dst_hoplimit(&rt->dst);
|
|
|
|
}
|
|
|
|
|
2013-06-18 08:49:56 +08:00
|
|
|
max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
|
2014-10-04 06:48:07 +08:00
|
|
|
+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
|
2013-10-01 17:33:59 +08:00
|
|
|
|
2024-02-20 21:56:02 +08:00
|
|
|
if (skb_cow_head(skb, max_headroom)) {
|
2014-06-06 08:34:37 +08:00
|
|
|
ip_rt_put(rt);
|
2022-11-15 16:53:58 +08:00
|
|
|
DEV_STATS_INC(dev, tx_dropped);
|
2014-01-19 10:27:49 +08:00
|
|
|
kfree_skb(skb);
|
2013-10-01 17:33:59 +08:00
|
|
|
return;
|
2013-03-25 22:49:35 +08:00
|
|
|
}
|
|
|
|
|
2024-02-20 21:56:02 +08:00
|
|
|
ip_tunnel_adj_headroom(dev, max_headroom);
|
|
|
|
|
2015-12-25 06:34:54 +08:00
|
|
|
iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
|
|
|
|
df, !net_eq(tunnel->net, dev_net(dev)));
|
2013-03-25 22:49:35 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
|
|
tx_error_icmp:
|
|
|
|
dst_link_failure(skb);
|
|
|
|
#endif
|
|
|
|
tx_error:
|
2022-11-15 16:53:58 +08:00
|
|
|
DEV_STATS_INC(dev, tx_errors);
|
2014-01-19 10:27:49 +08:00
|
|
|
kfree_skb(skb);
|
2013-03-25 22:49:35 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
|
|
|
|
|
|
|
|
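/* Re-key an existing tunnel.  It must leave and re-enter the hash
 * table because saddr/daddr/i_key feed the hash, and the cached output
 * route is dropped afterwards via dst_cache_reset().
 */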
static void ip_tunnel_update(struct ip_tunnel_net *itn,
|
|
|
|
struct ip_tunnel *t,
|
|
|
|
struct net_device *dev,
|
2024-03-27 23:23:52 +08:00
|
|
|
struct ip_tunnel_parm_kern *p,
|
2017-04-20 00:30:54 +08:00
|
|
|
bool set_mtu,
|
|
|
|
__u32 fwmark)
|
2013-03-25 22:49:35 +08:00
|
|
|
{
|
2015-08-08 14:51:42 +08:00
|
|
|
ip_tunnel_del(itn, t);
|
2013-03-25 22:49:35 +08:00
|
|
|
t->parms.iph.saddr = p->iph.saddr;
|
|
|
|
t->parms.iph.daddr = p->iph.daddr;
|
|
|
|
t->parms.i_key = p->i_key;
|
|
|
|
t->parms.o_key = p->o_key;
|
|
|
|
if (dev->type != ARPHRD_ETHER) {
|
2021-10-13 00:06:34 +08:00
|
|
|
__dev_addr_set(dev, &p->iph.saddr, 4);
|
2013-03-25 22:49:35 +08:00
|
|
|
memcpy(dev->broadcast, &p->iph.daddr, 4);
|
|
|
|
}
|
|
|
|
ip_tunnel_add(itn, t);
|
|
|
|
|
|
|
|
t->parms.iph.ttl = p->iph.ttl;
|
|
|
|
t->parms.iph.tos = p->iph.tos;
|
|
|
|
t->parms.iph.frag_off = p->iph.frag_off;
|
|
|
|
|
2017-04-20 00:30:54 +08:00
|
|
|
if (t->parms.link != p->link || t->fwmark != fwmark) {
|
2013-03-25 22:49:35 +08:00
|
|
|
int mtu;
|
|
|
|
|
2024-02-13 14:32:34 +08:00
|
|
|
WRITE_ONCE(t->parms.link, p->link);
|
2017-04-20 00:30:54 +08:00
|
|
|
t->fwmark = fwmark;
|
2013-03-25 22:49:35 +08:00
|
|
|
mtu = ip_tunnel_bind_dev(dev);
|
|
|
|
if (set_mtu)
|
2024-05-06 18:28:12 +08:00
|
|
|
WRITE_ONCE(dev->mtu, mtu);
|
2013-03-25 22:49:35 +08:00
|
|
|
}
|
2016-02-12 22:43:55 +08:00
|
|
|
dst_cache_reset(&t->dst_cache);
|
2013-03-25 22:49:35 +08:00
|
|
|
netdev_state_change(dev);
|
|
|
|
}
|
|
|
|
|
2024-03-27 23:23:52 +08:00
|
|
|
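/* Common backend for the SIOCGETTUNNEL/SIOCADDTUNNEL/SIOCCHGTUNNEL/
 * SIOCDELTUNNEL ioctls.  Add, change and delete require CAP_NET_ADMIN
 * in the tunnel's user namespace; the fallback device can be queried
 * through here but never deleted.
 */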
int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p,
|
|
|
|
int cmd)
|
2013-03-25 22:49:35 +08:00
|
|
|
{
|
|
|
|
int err = 0;
|
2014-04-16 17:19:32 +08:00
|
|
|
struct ip_tunnel *t = netdev_priv(dev);
|
|
|
|
struct net *net = t->net;
|
|
|
|
struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
|
2013-03-25 22:49:35 +08:00
|
|
|
|
|
|
|
switch (cmd) {
|
|
|
|
case SIOCGETTUNNEL:
|
2014-04-16 17:19:32 +08:00
|
|
|
if (dev == itn->fb_tunnel_dev) {
|
2013-03-25 22:49:35 +08:00
|
|
|
t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
|
2015-04-03 16:17:26 +08:00
|
|
|
if (!t)
|
2014-04-16 17:19:32 +08:00
|
|
|
t = netdev_priv(dev);
|
|
|
|
}
|
2013-03-25 22:49:35 +08:00
|
|
|
memcpy(p, &t->parms, sizeof(*p));
|
|
|
|
break;
|
|
|
|
|
|
|
|
case SIOCADDTUNNEL:
|
|
|
|
case SIOCCHGTUNNEL:
|
|
|
|
err = -EPERM;
|
|
|
|
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
|
|
|
|
goto done;
|
|
|
|
if (p->iph.ttl)
|
|
|
|
p->iph.frag_off |= htons(IP_DF);
|
2024-03-27 23:23:53 +08:00
|
|
|
if (!test_bit(IP_TUNNEL_VTI_BIT, p->i_flags)) {
|
|
|
|
if (!test_bit(IP_TUNNEL_KEY_BIT, p->i_flags))
|
2014-06-08 06:06:25 +08:00
|
|
|
p->i_key = 0;
|
2024-03-27 23:23:53 +08:00
|
|
|
if (!test_bit(IP_TUNNEL_KEY_BIT, p->o_flags))
|
2014-06-08 06:06:25 +08:00
|
|
|
p->o_key = 0;
|
|
|
|
}
|
2013-03-25 22:49:35 +08:00
|
|
|
|
2018-03-09 04:51:41 +08:00
|
|
|
t = ip_tunnel_find(itn, p, itn->type);
|
2013-03-25 22:49:35 +08:00
|
|
|
|
2014-09-22 15:11:08 +08:00
|
|
|
if (cmd == SIOCADDTUNNEL) {
|
|
|
|
if (!t) {
|
|
|
|
t = ip_tunnel_create(net, itn, p);
|
|
|
|
err = PTR_ERR_OR_ZERO(t);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
err = -EEXIST;
|
2014-05-15 13:07:02 +08:00
|
|
|
break;
|
2014-02-14 20:14:39 +08:00
|
|
|
}
|
2013-03-25 22:49:35 +08:00
|
|
|
if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
|
2015-04-03 16:17:27 +08:00
|
|
|
if (t) {
|
2013-03-25 22:49:35 +08:00
|
|
|
if (t->dev != dev) {
|
|
|
|
err = -EEXIST;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
unsigned int nflags = 0;
|
|
|
|
|
|
|
|
if (ipv4_is_multicast(p->iph.daddr))
|
|
|
|
nflags = IFF_BROADCAST;
|
|
|
|
else if (p->iph.daddr)
|
|
|
|
nflags = IFF_POINTOPOINT;
|
|
|
|
|
|
|
|
if ((dev->flags ^ nflags) & (IFF_POINTOPOINT | IFF_BROADCAST)) {
|
|
|
|
err = -EINVAL;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
t = netdev_priv(dev);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (t) {
|
|
|
|
err = 0;
|
2017-04-20 00:30:54 +08:00
|
|
|
ip_tunnel_update(itn, t, dev, p, true, 0);
|
2014-02-14 20:14:39 +08:00
|
|
|
} else {
|
|
|
|
err = -ENOENT;
|
|
|
|
}
|
2013-03-25 22:49:35 +08:00
|
|
|
break;
|
|
|
|
|
|
|
|
case SIOCDELTUNNEL:
|
|
|
|
err = -EPERM;
|
|
|
|
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
|
|
|
|
goto done;
|
|
|
|
|
|
|
|
if (dev == itn->fb_tunnel_dev) {
|
|
|
|
err = -ENOENT;
|
|
|
|
t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
|
2015-04-03 16:17:26 +08:00
|
|
|
if (!t)
|
2013-03-25 22:49:35 +08:00
|
|
|
goto done;
|
|
|
|
err = -EPERM;
|
|
|
|
if (t == netdev_priv(itn->fb_tunnel_dev))
|
|
|
|
goto done;
|
|
|
|
dev = t->dev;
|
|
|
|
}
|
|
|
|
unregister_netdevice(dev);
|
|
|
|
err = 0;
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
err = -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
done:
|
|
|
|
return err;
|
|
|
|
}
|
2020-05-19 21:03:13 +08:00
|
|
|
EXPORT_SYMBOL_GPL(ip_tunnel_ctl);
|
|
|
|
|
2024-03-27 23:23:52 +08:00
|
|
|
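/* Copy a legacy struct ip_tunnel_parm in from userspace and widen it
 * into the kernel-internal ip_tunnel_parm_kern, converting the __be16
 * flag words into the long-based tunnel flag bitmaps.
 */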
bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp,
|
|
|
|
const void __user *data)
|
|
|
|
{
|
|
|
|
struct ip_tunnel_parm p;
|
|
|
|
|
|
|
|
if (copy_from_user(&p, data, sizeof(p)))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
strscpy(kp->name, p.name);
|
|
|
|
kp->link = p.link;
|
2024-03-27 23:23:53 +08:00
|
|
|
ip_tunnel_flags_from_be16(kp->i_flags, p.i_flags);
|
|
|
|
ip_tunnel_flags_from_be16(kp->o_flags, p.o_flags);
|
2024-03-27 23:23:52 +08:00
|
|
|
kp->i_key = p.i_key;
|
|
|
|
kp->o_key = p.o_key;
|
|
|
|
memcpy(&kp->iph, &p.iph, min(sizeof(kp->iph), sizeof(p.iph)));
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(ip_tunnel_parm_from_user);
|
|
|
|
|
|
|
|
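/* Inverse of ip_tunnel_parm_from_user().  Returns false when the
 * kernel flag bitmaps carry bits that do not fit the legacy 16-bit
 * ABI, so old userspace never sees silently truncated flags.
 */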
bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp)
|
|
|
|
{
|
|
|
|
struct ip_tunnel_parm p;
|
|
|
|
|
2024-03-27 23:23:53 +08:00
|
|
|
if (!ip_tunnel_flags_is_be16_compat(kp->i_flags) ||
|
|
|
|
!ip_tunnel_flags_is_be16_compat(kp->o_flags))
|
|
|
|
return false;
|
|
|
|
|
2024-04-05 00:03:02 +08:00
|
|
|
memset(&p, 0, sizeof(p));
|
|
|
|
|
2024-03-27 23:23:52 +08:00
|
|
|
strscpy(p.name, kp->name);
|
|
|
|
p.link = kp->link;
|
2024-03-27 23:23:53 +08:00
|
|
|
p.i_flags = ip_tunnel_flags_to_be16(kp->i_flags);
|
|
|
|
p.o_flags = ip_tunnel_flags_to_be16(kp->o_flags);
|
2024-03-27 23:23:52 +08:00
|
|
|
p.i_key = kp->i_key;
|
|
|
|
p.o_key = kp->o_key;
|
|
|
|
memcpy(&p.iph, &kp->iph, min(sizeof(p.iph), sizeof(kp->iph)));
|
|
|
|
|
|
|
|
return !copy_to_user(data, &p, sizeof(p));
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(ip_tunnel_parm_to_user);
|
|
|
|
|
2021-07-27 21:45:06 +08:00
|
|
|
int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
|
|
|
|
void __user *data, int cmd)
|
2020-05-19 21:03:13 +08:00
|
|
|
{
|
2024-03-27 23:23:52 +08:00
|
|
|
struct ip_tunnel_parm_kern p;
|
2020-05-19 21:03:13 +08:00
|
|
|
int err;
|
|
|
|
|
2024-03-27 23:23:52 +08:00
|
|
|
if (!ip_tunnel_parm_from_user(&p, data))
|
2020-05-19 21:03:13 +08:00
|
|
|
return -EFAULT;
|
|
|
|
err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
|
2024-03-27 23:23:52 +08:00
|
|
|
if (!err && !ip_tunnel_parm_to_user(data, &p))
|
2020-05-19 21:03:13 +08:00
|
|
|
return -EFAULT;
|
|
|
|
return err;
|
|
|
|
}
|
2021-07-27 21:45:06 +08:00
|
|
|
EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate);
|
2013-03-25 22:49:35 +08:00
|
|
|
|
2016-02-10 08:05:58 +08:00
|
|
|
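/* Validate and apply a new MTU.  The ceiling is IP_MAX_MTU minus the
 * tunnel's encapsulation overhead (t_hlen), further reduced by the
 * hard header length on ARPHRD_ETHER devices; with strict == false an
 * oversized request is clamped to the maximum instead of rejected.
 */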
int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
|
2013-03-25 22:49:35 +08:00
|
|
|
{
|
|
|
|
struct ip_tunnel *tunnel = netdev_priv(dev);
|
|
|
|
int t_hlen = tunnel->hlen + sizeof(struct iphdr);
|
2021-01-30 06:27:47 +08:00
|
|
|
int max_mtu = IP_MAX_MTU - t_hlen;
|
2013-03-25 22:49:35 +08:00
|
|
|
|
2021-07-09 11:45:02 +08:00
|
|
|
if (dev->type == ARPHRD_ETHER)
|
|
|
|
max_mtu -= dev->hard_header_len;
|
|
|
|
|
2016-10-21 01:55:24 +08:00
|
|
|
if (new_mtu < ETH_MIN_MTU)
|
2013-03-25 22:49:35 +08:00
|
|
|
return -EINVAL;
|
2016-02-10 08:05:58 +08:00
|
|
|
|
|
|
|
if (new_mtu > max_mtu) {
|
|
|
|
if (strict)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
new_mtu = max_mtu;
|
|
|
|
}
|
|
|
|
|
2024-05-06 18:28:12 +08:00
|
|
|
WRITE_ONCE(dev->mtu, new_mtu);
|
2013-03-25 22:49:35 +08:00
|
|
|
return 0;
|
|
|
|
}
|
2016-02-10 08:05:58 +08:00
|
|
|
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
|
|
|
|
|
|
|
|
int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
|
|
|
|
{
|
|
|
|
return __ip_tunnel_change_mtu(dev, new_mtu, true);
|
|
|
|
}
|
2013-03-25 22:49:35 +08:00
|
|
|
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
|
|
|
|
|
|
|
|
static void ip_tunnel_dev_free(struct net_device *dev)
|
|
|
|
{
|
|
|
|
struct ip_tunnel *tunnel = netdev_priv(dev);
|
|
|
|
|
|
|
|
gro_cells_destroy(&tunnel->gro_cells);
|
2016-02-12 22:43:55 +08:00
|
|
|
dst_cache_destroy(&tunnel->dst_cache);
|
2013-03-25 22:49:35 +08:00
|
|
|
free_percpu(dev->tstats);
|
|
|
|
}
|
|
|
|
|
|
|
|
void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
|
|
|
|
{
|
|
|
|
struct ip_tunnel *tunnel = netdev_priv(dev);
|
|
|
|
struct ip_tunnel_net *itn;
|
|
|
|
|
2013-08-13 23:51:11 +08:00
|
|
|
itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
|
2013-03-25 22:49:35 +08:00
|
|
|
|
|
|
|
if (itn->fb_tunnel_dev != dev) {
|
2015-08-08 14:51:42 +08:00
|
|
|
ip_tunnel_del(itn, netdev_priv(dev));
|
2013-03-25 22:49:35 +08:00
|
|
|
unregister_netdevice_queue(dev, head);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
|
|
|
|
|
2015-01-15 22:11:17 +08:00
|
|
|
struct net *ip_tunnel_get_link_net(const struct net_device *dev)
|
|
|
|
{
|
|
|
|
struct ip_tunnel *tunnel = netdev_priv(dev);
|
|
|
|
|
2024-05-04 03:20:59 +08:00
|
|
|
return READ_ONCE(tunnel->net);
|
2015-01-15 22:11:17 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(ip_tunnel_get_link_net);
|
|
|
|
|
2015-04-02 23:07:02 +08:00
|
|
|
int ip_tunnel_get_iflink(const struct net_device *dev)
|
|
|
|
{
|
2024-02-13 14:32:34 +08:00
|
|
|
const struct ip_tunnel *tunnel = netdev_priv(dev);
|
2015-04-02 23:07:02 +08:00
|
|
|
|
2024-02-13 14:32:34 +08:00
|
|
|
return READ_ONCE(tunnel->parms.link);
|
2015-04-02 23:07:02 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(ip_tunnel_get_iflink);
|
|
|
|
|
2016-11-17 09:58:21 +08:00
|
|
|
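/* Per-netns initialisation: set up the tunnel hash table and, unless
 * the fb_tunnels_only_for_init_net sysctl suppresses it for this
 * namespace, create the fallback device (tunl0, gre0, ...) that
 * anchors the tunnel type.
 */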
int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
|
2013-03-25 22:49:35 +08:00
|
|
|
struct rtnl_link_ops *ops, char *devname)
|
|
|
|
{
|
|
|
|
struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
|
2024-03-27 23:23:52 +08:00
|
|
|
struct ip_tunnel_parm_kern parms;
|
2013-08-06 13:51:37 +08:00
|
|
|
unsigned int i;
|
2013-03-25 22:49:35 +08:00
|
|
|
|
2018-03-09 04:51:41 +08:00
|
|
|
itn->rtnl_link_ops = ops;
|
2013-08-06 13:51:37 +08:00
|
|
|
for (i = 0; i < IP_TNL_HASH_SIZE; i++)
|
|
|
|
INIT_HLIST_HEAD(&itn->tunnels[i]);
|
2013-03-25 22:49:35 +08:00
|
|
|
|
2018-03-09 04:51:41 +08:00
|
|
|
if (!ops || !net_has_fallback_tunnels(net)) {
|
|
|
|
struct ip_tunnel_net *it_init_net;
|
|
|
|
|
|
|
|
it_init_net = net_generic(&init_net, ip_tnl_net_id);
|
|
|
|
itn->type = it_init_net->type;
|
2013-03-25 22:49:35 +08:00
|
|
|
itn->fb_tunnel_dev = NULL;
|
|
|
|
return 0;
|
|
|
|
}
|
2013-08-06 13:51:37 +08:00
|
|
|
|
2013-03-25 22:49:35 +08:00
|
|
|
memset(&parms, 0, sizeof(parms));
|
|
|
|
if (devname)
|
2022-07-11 21:55:37 +08:00
|
|
|
strscpy(parms.name, devname, IFNAMSIZ);
|
2013-03-25 22:49:35 +08:00
|
|
|
|
|
|
|
rtnl_lock();
|
|
|
|
itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
|
2013-08-19 15:05:10 +08:00
|
|
|
/* FB netdevice is special: we have one, and only one per netns.
|
|
|
|
* Allowing to move it to another netns is clearly unsafe.
|
|
|
|
*/
|
2013-10-01 17:34:48 +08:00
|
|
|
if (!IS_ERR(itn->fb_tunnel_dev)) {
|
2013-08-23 16:15:37 +08:00
|
|
|
itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
|
2014-05-19 17:36:56 +08:00
|
|
|
itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
|
2013-10-01 17:34:48 +08:00
|
|
|
ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
|
2018-03-09 04:51:41 +08:00
|
|
|
itn->type = itn->fb_tunnel_dev->type;
|
2013-10-01 17:34:48 +08:00
|
|
|
}
|
2013-08-23 16:15:37 +08:00
|
|
|
rtnl_unlock();
|
2013-03-25 22:49:35 +08:00
|
|
|
|
2014-01-27 14:43:57 +08:00
|
|
|
return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
|
2013-03-25 22:49:35 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
|
|
|
|
|
2018-03-09 04:51:41 +08:00
|
|
|
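/* Queue every device of this tunnel type for unregistration on netns
 * exit.  After walking the namespace's own devices, the hash walk
 * catches tunnels hashed here whose device was moved to another
 * namespace and so is missed by the first pass.
 */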
static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
|
|
|
|
struct list_head *head,
|
2013-08-13 23:51:11 +08:00
|
|
|
struct rtnl_link_ops *ops)
|
2013-03-25 22:49:35 +08:00
|
|
|
{
|
2013-08-13 23:51:11 +08:00
|
|
|
struct net_device *dev, *aux;
|
2013-03-25 22:49:35 +08:00
|
|
|
int h;
|
|
|
|
|
2013-08-13 23:51:11 +08:00
|
|
|
for_each_netdev_safe(net, dev, aux)
|
|
|
|
if (dev->rtnl_link_ops == ops)
|
|
|
|
unregister_netdevice_queue(dev, head);
|
|
|
|
|
2013-03-25 22:49:35 +08:00
|
|
|
for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
|
|
|
|
struct ip_tunnel *t;
|
|
|
|
struct hlist_node *n;
|
|
|
|
struct hlist_head *thead = &itn->tunnels[h];
|
|
|
|
|
|
|
|
hlist_for_each_entry_safe(t, n, thead, hash_node)
|
2013-08-13 23:51:11 +08:00
|
|
|
/* If dev is in the same netns, it has already
|
|
|
|
* been added to the list by the previous loop.
|
|
|
|
*/
|
|
|
|
if (!net_eq(dev_net(t->dev), net))
|
|
|
|
unregister_netdevice_queue(t->dev, head);
|
2013-03-25 22:49:35 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-09-20 07:27:09 +08:00
|
|
|
void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
|
2024-02-06 22:43:10 +08:00
|
|
|
struct rtnl_link_ops *ops,
|
|
|
|
struct list_head *dev_to_kill)
|
2013-03-25 22:49:35 +08:00
|
|
|
{
|
2017-09-20 07:27:09 +08:00
|
|
|
struct ip_tunnel_net *itn;
|
|
|
|
struct net *net;
|
2013-03-25 22:49:35 +08:00
|
|
|
|
2024-02-06 22:43:10 +08:00
|
|
|
ASSERT_RTNL();
|
2017-09-20 07:27:09 +08:00
|
|
|
list_for_each_entry(net, net_list, exit_list) {
|
|
|
|
itn = net_generic(net, id);
|
2024-02-06 22:43:10 +08:00
|
|
|
ip_tunnel_destroy(net, itn, dev_to_kill, ops);
|
2017-09-20 07:27:09 +08:00
|
|
|
}
|
2013-03-25 22:49:35 +08:00
|
|
|
}
|
2017-09-20 07:27:09 +08:00
|
|
|
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
|
2013-03-25 22:49:35 +08:00
|
|
|
|
|
|
|
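/* Netlink-driven creation.  collect_md tunnels are singletons per
 * namespace; normal tunnels must not alias an existing hash entry.
 * The device MTU honours IFLA_MTU when supplied, clamped to what the
 * encapsulation overhead leaves of IP_MAX_MTU.
 */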
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
|
2024-03-27 23:23:52 +08:00
|
|
|
struct ip_tunnel_parm_kern *p, __u32 fwmark)
|
2013-03-25 22:49:35 +08:00
|
|
|
{
|
|
|
|
struct ip_tunnel *nt;
|
|
|
|
struct net *net = dev_net(dev);
|
|
|
|
struct ip_tunnel_net *itn;
|
|
|
|
int mtu;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
nt = netdev_priv(dev);
|
|
|
|
itn = net_generic(net, nt->ip_tnl_net_id);
|
|
|
|
|
2015-08-08 14:51:42 +08:00
|
|
|
if (nt->collect_md) {
|
|
|
|
if (rtnl_dereference(itn->collect_md_tun))
|
|
|
|
return -EEXIST;
|
|
|
|
} else {
|
|
|
|
if (ip_tunnel_find(itn, p, dev->type))
|
|
|
|
return -EEXIST;
|
|
|
|
}
|
2013-03-25 22:49:35 +08:00
|
|
|
|
2013-06-26 22:11:28 +08:00
|
|
|
nt->net = net;
|
2013-03-25 22:49:35 +08:00
|
|
|
nt->parms = *p;
|
2017-04-20 00:30:54 +08:00
|
|
|
nt->fwmark = fwmark;
|
2013-03-25 22:49:35 +08:00
|
|
|
err = register_netdevice(dev);
|
|
|
|
if (err)
|
2018-03-23 01:53:33 +08:00
|
|
|
goto err_register_netdevice;
|
2013-03-25 22:49:35 +08:00
|
|
|
|
|
|
|
if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
|
|
|
|
eth_hw_addr_random(dev);
|
|
|
|
|
|
|
|
mtu = ip_tunnel_bind_dev(dev);
|
2018-03-16 00:16:28 +08:00
|
|
|
if (tb[IFLA_MTU]) {
|
2021-01-30 06:27:47 +08:00
|
|
|
unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));
|
2018-03-16 00:16:28 +08:00
|
|
|
|
2021-07-09 11:45:02 +08:00
|
|
|
if (dev->type == ARPHRD_ETHER)
|
|
|
|
max -= dev->hard_header_len;
|
|
|
|
|
2021-01-30 06:27:47 +08:00
|
|
|
mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
|
2018-03-23 01:53:33 +08:00
|
|
|
}
|
2013-03-25 22:49:35 +08:00
|
|
|
|
2018-03-29 23:42:14 +08:00
|
|
|
err = dev_set_mtu(dev, mtu);
|
|
|
|
if (err)
|
|
|
|
goto err_dev_set_mtu;
|
2013-03-25 22:49:35 +08:00
|
|
|
|
|
|
|
ip_tunnel_add(itn, nt);
|
2018-03-23 01:53:33 +08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
err_dev_set_mtu:
|
|
|
|
unregister_netdevice(dev);
|
|
|
|
err_register_netdevice:
|
2013-03-25 22:49:35 +08:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
|
|
|
|
|
|
|
|
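/* Netlink-driven reconfiguration.  Rejects changes to the fallback
 * device, parameter sets that would alias a different tunnel, and any
 * update that would flip a non-Ethernet device between broadcast and
 * point-to-point operation.
 */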
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
|
2024-03-27 23:23:52 +08:00
|
|
|
struct ip_tunnel_parm_kern *p, __u32 fwmark)
|
2013-03-25 22:49:35 +08:00
|
|
|
{
|
2013-08-13 23:51:11 +08:00
|
|
|
struct ip_tunnel *t;
|
2013-03-25 22:49:35 +08:00
|
|
|
struct ip_tunnel *tunnel = netdev_priv(dev);
|
2013-08-13 23:51:11 +08:00
|
|
|
struct net *net = tunnel->net;
|
2013-03-25 22:49:35 +08:00
|
|
|
struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
|
|
|
|
|
|
|
|
if (dev == itn->fb_tunnel_dev)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
t = ip_tunnel_find(itn, p, dev->type);
|
|
|
|
|
|
|
|
if (t) {
|
|
|
|
if (t->dev != dev)
|
|
|
|
return -EEXIST;
|
|
|
|
} else {
|
2013-08-13 23:51:11 +08:00
|
|
|
t = tunnel;
|
2013-03-25 22:49:35 +08:00
|
|
|
|
|
|
|
if (dev->type != ARPHRD_ETHER) {
|
|
|
|
unsigned int nflags = 0;
|
|
|
|
|
|
|
|
if (ipv4_is_multicast(p->iph.daddr))
|
|
|
|
nflags = IFF_BROADCAST;
|
|
|
|
else if (p->iph.daddr)
|
|
|
|
nflags = IFF_POINTOPOINT;
|
|
|
|
|
|
|
|
if ((dev->flags ^ nflags) &
|
|
|
|
(IFF_POINTOPOINT | IFF_BROADCAST))
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-04-20 00:30:54 +08:00
|
|
|
ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
|
2013-03-25 22:49:35 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
|
|
|
|
|
|
|
|
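/* ndo_init backend shared by the IPv4 tunnel drivers: allocates the
 * per-cpu stats, the destination cache and the GRO cell, and seeds the
 * outer IPv4 header template (version 4, ihl 5).
 */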
int ip_tunnel_init(struct net_device *dev)
|
|
|
|
{
|
|
|
|
struct ip_tunnel *tunnel = netdev_priv(dev);
|
|
|
|
struct iphdr *iph = &tunnel->parms.iph;
|
2014-02-14 03:46:28 +08:00
|
|
|
int err;
|
2013-03-25 22:49:35 +08:00
|
|
|
|
2017-05-09 00:52:56 +08:00
|
|
|
dev->needs_free_netdev = true;
|
|
|
|
dev->priv_destructor = ip_tunnel_dev_free;
|
2014-02-14 03:46:28 +08:00
|
|
|
dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
|
2013-03-25 22:49:35 +08:00
|
|
|
if (!dev->tstats)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2016-02-12 22:43:55 +08:00
|
|
|
err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
|
|
|
|
if (err) {
|
2014-01-03 03:48:33 +08:00
|
|
|
free_percpu(dev->tstats);
|
2016-02-12 22:43:55 +08:00
|
|
|
return err;
|
2014-01-03 03:48:33 +08:00
|
|
|
}
|
|
|
|
|
2013-03-25 22:49:35 +08:00
|
|
|
err = gro_cells_init(&tunnel->gro_cells, dev);
|
|
|
|
if (err) {
|
2016-02-12 22:43:55 +08:00
|
|
|
dst_cache_destroy(&tunnel->dst_cache);
|
2013-03-25 22:49:35 +08:00
|
|
|
free_percpu(dev->tstats);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
tunnel->dev = dev;
|
2013-08-13 23:51:11 +08:00
|
|
|
tunnel->net = dev_net(dev);
|
2013-03-25 22:49:35 +08:00
|
|
|
strcpy(tunnel->parms.name, dev->name);
|
|
|
|
iph->version = 4;
|
|
|
|
iph->ihl = 5;
|
|
|
|
|
2020-01-21 22:26:24 +08:00
|
|
|
if (tunnel->collect_md)
|
2015-08-08 14:51:42 +08:00
|
|
|
netif_keep_dst(dev);
|
net: add netdev_lockdep_set_classes() to virtual drivers
Based on a syzbot report, it appears many virtual
drivers do not yet use netdev_lockdep_set_classes(),
triggering lockdep false positives.
WARNING: possible recursive locking detected
6.8.0-rc4-next-20240212-syzkaller #0 Not tainted
syz-executor.0/19016 is trying to acquire lock:
ffff8880162cb298 (_xmit_ETHER#2){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline]
ffff8880162cb298 (_xmit_ETHER#2){+.-.}-{2:2}, at: __netif_tx_lock include/linux/netdevice.h:4452 [inline]
ffff8880162cb298 (_xmit_ETHER#2){+.-.}-{2:2}, at: sch_direct_xmit+0x1c4/0x5f0 net/sched/sch_generic.c:340
but task is already holding lock:
ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline]
ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: __netif_tx_lock include/linux/netdevice.h:4452 [inline]
ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: sch_direct_xmit+0x1c4/0x5f0 net/sched/sch_generic.c:340
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0
lock(_xmit_ETHER#2);
lock(_xmit_ETHER#2);
*** DEADLOCK ***
May be due to missing lock nesting notation
9 locks held by syz-executor.0/19016:
#0: ffffffff8f385208 (rtnl_mutex){+.+.}-{3:3}, at: rtnl_lock net/core/rtnetlink.c:79 [inline]
#0: ffffffff8f385208 (rtnl_mutex){+.+.}-{3:3}, at: rtnetlink_rcv_msg+0x82c/0x1040 net/core/rtnetlink.c:6603
#1: ffffc90000a08c00 ((&in_dev->mr_ifc_timer)){+.-.}-{0:0}, at: call_timer_fn+0xc0/0x600 kernel/time/timer.c:1697
#2: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:298 [inline]
#2: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:750 [inline]
#2: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: ip_finish_output2+0x45f/0x1360 net/ipv4/ip_output.c:228
#3: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: local_bh_disable include/linux/bottom_half.h:20 [inline]
#3: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: rcu_read_lock_bh include/linux/rcupdate.h:802 [inline]
#3: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: __dev_queue_xmit+0x2c4/0x3b10 net/core/dev.c:4284
#4: ffff8880416e3258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: spin_trylock include/linux/spinlock.h:361 [inline]
#4: ffff8880416e3258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: qdisc_run_begin include/net/sch_generic.h:195 [inline]
#4: ffff8880416e3258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: __dev_xmit_skb net/core/dev.c:3771 [inline]
#4: ffff8880416e3258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: __dev_queue_xmit+0x1262/0x3b10 net/core/dev.c:4325
#5: ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline]
#5: ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: __netif_tx_lock include/linux/netdevice.h:4452 [inline]
#5: ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: sch_direct_xmit+0x1c4/0x5f0 net/sched/sch_generic.c:340
#6: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:298 [inline]
#6: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:750 [inline]
#6: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: ip_finish_output2+0x45f/0x1360 net/ipv4/ip_output.c:228
#7: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: local_bh_disable include/linux/bottom_half.h:20 [inline]
#7: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: rcu_read_lock_bh include/linux/rcupdate.h:802 [inline]
#7: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: __dev_queue_xmit+0x2c4/0x3b10 net/core/dev.c:4284
#8: ffff888014d9d258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: spin_trylock include/linux/spinlock.h:361 [inline]
#8: ffff888014d9d258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: qdisc_run_begin include/net/sch_generic.h:195 [inline]
#8: ffff888014d9d258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: __dev_xmit_skb net/core/dev.c:3771 [inline]
#8: ffff888014d9d258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: __dev_queue_xmit+0x1262/0x3b10 net/core/dev.c:4325
stack backtrace:
CPU: 1 PID: 19016 Comm: syz-executor.0 Not tainted 6.8.0-rc4-next-20240212-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/25/2024
Call Trace:
<IRQ>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114
check_deadlock kernel/locking/lockdep.c:3062 [inline]
validate_chain+0x15c1/0x58e0 kernel/locking/lockdep.c:3856
__lock_acquire+0x1346/0x1fd0 kernel/locking/lockdep.c:5137
lock_acquire+0x1e4/0x530 kernel/locking/lockdep.c:5754
__raw_spin_lock include/linux/spinlock_api_smp.h:133 [inline]
_raw_spin_lock+0x2e/0x40 kernel/locking/spinlock.c:154
spin_lock include/linux/spinlock.h:351 [inline]
__netif_tx_lock include/linux/netdevice.h:4452 [inline]
sch_direct_xmit+0x1c4/0x5f0 net/sched/sch_generic.c:340
__dev_xmit_skb net/core/dev.c:3784 [inline]
__dev_queue_xmit+0x1912/0x3b10 net/core/dev.c:4325
neigh_output include/net/neighbour.h:542 [inline]
ip_finish_output2+0xe66/0x1360 net/ipv4/ip_output.c:235
iptunnel_xmit+0x540/0x9b0 net/ipv4/ip_tunnel_core.c:82
ip_tunnel_xmit+0x20ee/0x2960 net/ipv4/ip_tunnel.c:831
erspan_xmit+0x9de/0x1460 net/ipv4/ip_gre.c:720
__netdev_start_xmit include/linux/netdevice.h:4989 [inline]
netdev_start_xmit include/linux/netdevice.h:5003 [inline]
xmit_one net/core/dev.c:3555 [inline]
dev_hard_start_xmit+0x242/0x770 net/core/dev.c:3571
sch_direct_xmit+0x2b6/0x5f0 net/sched/sch_generic.c:342
__dev_xmit_skb net/core/dev.c:3784 [inline]
__dev_queue_xmit+0x1912/0x3b10 net/core/dev.c:4325
neigh_output include/net/neighbour.h:542 [inline]
ip_finish_output2+0xe66/0x1360 net/ipv4/ip_output.c:235
igmpv3_send_cr net/ipv4/igmp.c:723 [inline]
igmp_ifc_timer_expire+0xb71/0xd90 net/ipv4/igmp.c:813
call_timer_fn+0x17e/0x600 kernel/time/timer.c:1700
expire_timers kernel/time/timer.c:1751 [inline]
__run_timers+0x621/0x830 kernel/time/timer.c:2038
run_timer_softirq+0x67/0xf0 kernel/time/timer.c:2051
__do_softirq+0x2bc/0x943 kernel/softirq.c:554
invoke_softirq kernel/softirq.c:428 [inline]
__irq_exit_rcu+0xf2/0x1c0 kernel/softirq.c:633
irq_exit_rcu+0x9/0x30 kernel/softirq.c:645
instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1076 [inline]
sysvec_apic_timer_interrupt+0xa6/0xc0 arch/x86/kernel/apic/apic.c:1076
</IRQ>
<TASK>
asm_sysvec_apic_timer_interrupt+0x1a/0x20 arch/x86/include/asm/idtentry.h:702
RIP: 0010:resched_offsets_ok kernel/sched/core.c:10127 [inline]
RIP: 0010:__might_resched+0x16f/0x780 kernel/sched/core.c:10142
Code: 00 4c 89 e8 48 c1 e8 03 48 ba 00 00 00 00 00 fc ff df 48 89 44 24 38 0f b6 04 10 84 c0 0f 85 87 04 00 00 41 8b 45 00 c1 e0 08 <01> d8 44 39 e0 0f 85 d6 00 00 00 44 89 64 24 1c 48 8d bc 24 a0 00
RSP: 0018:ffffc9000ee069e0 EFLAGS: 00000246
RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff8880296a9e00
RDX: dffffc0000000000 RSI: ffff8880296a9e00 RDI: ffffffff8bfe8fa0
RBP: ffffc9000ee06b00 R08: ffffffff82326877 R09: 1ffff11002b5ad1b
R10: dffffc0000000000 R11: ffffed1002b5ad1c R12: 0000000000000000
R13: ffff8880296aa23c R14: 000000000000062a R15: 1ffff92001dc0d44
down_write+0x19/0x50 kernel/locking/rwsem.c:1578
kernfs_activate fs/kernfs/dir.c:1403 [inline]
kernfs_add_one+0x4af/0x8b0 fs/kernfs/dir.c:819
__kernfs_create_file+0x22e/0x2e0 fs/kernfs/file.c:1056
sysfs_add_file_mode_ns+0x24a/0x310 fs/sysfs/file.c:307
create_files fs/sysfs/group.c:64 [inline]
internal_create_group+0x4f4/0xf20 fs/sysfs/group.c:152
internal_create_groups fs/sysfs/group.c:192 [inline]
sysfs_create_groups+0x56/0x120 fs/sysfs/group.c:218
create_dir lib/kobject.c:78 [inline]
kobject_add_internal+0x472/0x8d0 lib/kobject.c:240
kobject_add_varg lib/kobject.c:374 [inline]
kobject_init_and_add+0x124/0x190 lib/kobject.c:457
netdev_queue_add_kobject net/core/net-sysfs.c:1706 [inline]
netdev_queue_update_kobjects+0x1f3/0x480 net/core/net-sysfs.c:1758
register_queue_kobjects net/core/net-sysfs.c:1819 [inline]
netdev_register_kobject+0x265/0x310 net/core/net-sysfs.c:2059
register_netdevice+0x1191/0x19c0 net/core/dev.c:10298
bond_newlink+0x3b/0x90 drivers/net/bonding/bond_netlink.c:576
rtnl_newlink_create net/core/rtnetlink.c:3506 [inline]
__rtnl_newlink net/core/rtnetlink.c:3726 [inline]
rtnl_newlink+0x158f/0x20a0 net/core/rtnetlink.c:3739
rtnetlink_rcv_msg+0x885/0x1040 net/core/rtnetlink.c:6606
netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2543
netlink_unicast_kernel net/netlink/af_netlink.c:1341 [inline]
netlink_unicast+0x7ea/0x980 net/netlink/af_netlink.c:1367
netlink_sendmsg+0xa3c/0xd70 net/netlink/af_netlink.c:1908
sock_sendmsg_nosec net/socket.c:730 [inline]
__sock_sendmsg+0x221/0x270 net/socket.c:745
__sys_sendto+0x3a4/0x4f0 net/socket.c:2191
__do_sys_sendto net/socket.c:2203 [inline]
__se_sys_sendto net/socket.c:2199 [inline]
__x64_sys_sendto+0xde/0x100 net/socket.c:2199
do_syscall_64+0xfb/0x240
entry_SYSCALL_64_after_hwframe+0x6d/0x75
RIP: 0033:0x7fc3fa87fa9c
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20240212140700.2795436-4-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
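The splat above is a false positive: the tunnel transmit path nests the
underlay device's TX lock inside the tunnel device's, and by default both
locks share the _xmit_ETHER lockdep class. A minimal sketch of the fix
pattern in a virtual driver's init path (hypothetical driver name;
netdev_lockdep_set_classes() is the real helper):

	static int my_virt_dev_init(struct net_device *dev)
	{
		/* give this device its own lock classes so nested TX
		 * locking (tunnel -> underlay) is not flagged as
		 * recursion on a single class */
		netdev_lockdep_set_classes(dev);
		return 0;
	}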
2024-02-12 22:07:00 +08:00
	netdev_lockdep_set_classes(dev);
2013-03-25 22:49:35 +08:00
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
2013-08-13 23:51:11 +08:00
	struct net *net = tunnel->net;
2013-03-25 22:49:35 +08:00
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
2020-06-17 00:51:51 +08:00
	ip_tunnel_del(itn, netdev_priv(dev));
	if (itn->fb_tunnel_dev == dev)
		WRITE_ONCE(itn->fb_tunnel_dev, NULL);
2016-02-12 22:43:55 +08:00
	dst_cache_reset(&tunnel->dst_cache);
2013-03-25 22:49:35 +08:00
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do least required initialization, rest of init is done in tunnel_init call */
netns: make struct pernet_operations::id unsigned int
Make struct pernet_operations::id unsigned.
There are 2 reasons to do so:
1)
This field is really an index into a zero-based array and
thus is an unsigned entity. Using a negative value is out-of-bounds
access by definition.
2)
On x86_64, unsigned 32-bit data which are mixed with pointers
via array indexing, or offsets added to or subtracted from pointers,
are preferred to signed 32-bit data.
An "int" being used as an array index needs to be sign-extended
to 64-bit before being used.
	void f(long *p, int i)
	{
		g(p[i]);
	}
roughly translates to
	movsx rsi, esi
	mov rdi, [rsi+...]
	call g
MOVSX is a 3-byte instruction which isn't necessary if the variable is
unsigned, because x86_64 zero-extends by default.
Now, there is net_generic() function which, you guessed it right, uses
"int" as an array index:
	static inline void *net_generic(const struct net *net, int id)
	{
		...
		ptr = ng->ptr[id - 1];
		...
	}
And this function is used a lot, so those sign extensions add up.
Patch snipes ~1730 bytes on allyesconfig kernel (without all junk
messing with code generation):
add/remove: 0/0 grow/shrink: 70/598 up/down: 396/-2126 (-1730)
Unfortunately some functions actually grow bigger.
This is a seemingly random artefact of code generation, with the
register allocator being used differently: gcc decides that some
variable needs to live in the new r8+ registers, and every access now
requires a REX prefix; or it is shifted into r12, so the [r12+0]
addressing mode has to be used, which is longer than [r8].
However, the overall balance is in the negative direction:
add/remove: 0/0 grow/shrink: 70/598 up/down: 396/-2126 (-1730)
function old new delta
nfsd4_lock 3886 3959 +73
tipc_link_build_proto_msg 1096 1140 +44
mac80211_hwsim_new_radio 2776 2808 +32
tipc_mon_rcv 1032 1058 +26
svcauth_gss_legacy_init 1413 1429 +16
tipc_bcbase_select_primary 379 392 +13
nfsd4_exchange_id 1247 1260 +13
nfsd4_setclientid_confirm 782 793 +11
...
put_client_renew_locked 494 480 -14
ip_set_sockfn_get 730 716 -14
geneve_sock_add 829 813 -16
nfsd4_sequence_done 721 703 -18
nlmclnt_lookup_host 708 686 -22
nfsd4_lockt 1085 1063 -22
nfs_get_client 1077 1050 -27
tcf_bpf_init 1106 1076 -30
nfsd4_encode_fattr 5997 5930 -67
Total: Before=154856051, After=154854321, chg -0.00%
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
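As a standalone illustration of the sign-extension point above (assumed
example, not from the patch):

	long load_signed(long *p, int i)	/* needs movsx to widen i */
	{
		return p[i];
	}

	long load_unsigned(long *p, unsigned int i)	/* zero-extension is free */
	{
		return p[i];
	}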
2016-11-17 09:58:21 +08:00
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
2013-03-25 22:49:35 +08:00
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);
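
For context, a sketch of how a tunnel driver typically wires its pernet id
into ip_tunnel_setup() (hypothetical names; the id itself comes from the
standard register_pernet_device() pattern):

	static unsigned int my_tnl_net_id;	/* assigned by register_pernet_device() */

	static void my_tnl_dev_setup(struct net_device *dev)
	{
		/* record the id so ip_tunnel_uninit() can look up the
		 * per-netns tunnel table via net_generic() */
		ip_tunnel_setup(dev, my_tnl_net_id);
	}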
2024-02-09 00:42:41 +08:00
MODULE_DESCRIPTION("IPv4 tunnel implementation library");
2013-03-25 22:49:35 +08:00
MODULE_LICENSE("GPL");