2019-05-27 14:55:01 +08:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Linux INET6 implementation
|
|
|
|
*
|
|
|
|
* Authors:
|
|
|
|
* Pedro Roque <roque@di.fc.ul.pt>
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef _NET_IPV6_H
|
|
|
|
#define _NET_IPV6_H
|
|
|
|
|
|
|
|
#include <linux/ipv6.h>
|
|
|
|
#include <linux/hardirq.h>
|
2013-02-21 20:18:52 +08:00
|
|
|
#include <linux/jhash.h>
|
2017-07-04 14:34:54 +08:00
|
|
|
#include <linux/refcount.h>
|
2019-07-07 17:34:45 +08:00
|
|
|
#include <linux/jump_label_ratelimit.h>
|
2007-07-31 08:05:49 +08:00
|
|
|
#include <net/if_inet6.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <net/flow.h>
|
2015-05-12 20:56:07 +08:00
|
|
|
#include <net/flow_dissector.h>
|
2022-02-04 21:58:11 +08:00
|
|
|
#include <net/inet_dscp.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <net/snmp.h>
|
2017-12-02 04:52:30 +08:00
|
|
|
#include <net/netns/hash.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2021-12-29 08:49:13 +08:00
|
|
|
struct ip_tunnel_info;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#define SIN6_LEN_RFC2133 24
|
|
|
|
|
|
|
|
#define IPV6_MAXPLEN 65535
|
|
|
|
|
|
|
|
/*
|
|
|
|
* NextHeader field of IPv6 header
|
|
|
|
*/
|
|
|
|
|
|
|
|
#define NEXTHDR_HOP 0 /* Hop-by-hop option header. */
|
2021-03-11 23:53:18 +08:00
|
|
|
#define NEXTHDR_IPV4 4 /* IPv4 in IPv6 */
|
2005-04-17 06:20:36 +08:00
|
|
|
#define NEXTHDR_TCP 6 /* TCP segment. */
|
|
|
|
#define NEXTHDR_UDP 17 /* UDP message. */
|
|
|
|
#define NEXTHDR_IPV6 41 /* IPv6 in IPv6 */
|
|
|
|
#define NEXTHDR_ROUTING 43 /* Routing header. */
|
|
|
|
#define NEXTHDR_FRAGMENT 44 /* Fragmentation/reassembly header. */
|
2012-08-10 08:51:50 +08:00
|
|
|
#define NEXTHDR_GRE 47 /* GRE header. */
|
2005-04-17 06:20:36 +08:00
|
|
|
#define NEXTHDR_ESP 50 /* Encapsulating security payload. */
|
|
|
|
#define NEXTHDR_AUTH 51 /* Authentication header. */
|
|
|
|
#define NEXTHDR_ICMP 58 /* ICMP for IPv6. */
|
|
|
|
#define NEXTHDR_NONE 59 /* No next header */
|
|
|
|
#define NEXTHDR_DEST 60 /* Destination options header. */
|
2013-07-23 12:37:45 +08:00
|
|
|
#define NEXTHDR_SCTP 132 /* SCTP message. */
|
2006-08-24 11:34:26 +08:00
|
|
|
#define NEXTHDR_MOBILITY 135 /* Mobility header. */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#define NEXTHDR_MAX 255
|
|
|
|
|
|
|
|
#define IPV6_DEFAULT_HOPLIMIT 64
|
|
|
|
#define IPV6_DEFAULT_MCASTHOPS 1
|
|
|
|
|
2017-10-31 05:16:00 +08:00
|
|
|
/* Limits on Hop-by-Hop and Destination options.
|
|
|
|
*
|
|
|
|
* Per RFC8200 there is no limit on the maximum number or lengths of options in
|
|
|
|
* Hop-by-Hop or Destination options other than the packet must fit in an MTU.
|
|
|
|
* We allow configurable limits in order to mitigate potential denial of
|
|
|
|
* service attacks.
|
|
|
|
*
|
|
|
|
* There are three limits that may be set:
|
|
|
|
* - Limit the number of options in a Hop-by-Hop or Destination options
|
|
|
|
* extension header
|
|
|
|
* - Limit the byte length of a Hop-by-Hop or Destination options extension
|
|
|
|
* header
|
|
|
|
* - Disallow unknown options
|
|
|
|
*
|
|
|
|
* The limits are expressed in corresponding sysctls:
|
|
|
|
*
|
|
|
|
* ipv6.sysctl.max_dst_opts_cnt
|
|
|
|
* ipv6.sysctl.max_hbh_opts_cnt
|
|
|
|
* ipv6.sysctl.max_dst_opts_len
|
|
|
|
* ipv6.sysctl.max_hbh_opts_len
|
|
|
|
*
|
|
|
|
* max_*_opts_cnt is the number of TLVs that are allowed for Destination
|
|
|
|
* options or Hop-by-Hop options. If the number is less than zero then unknown
|
|
|
|
* TLVs are disallowed and the number of known options that are allowed is the
|
|
|
|
* absolute value. Setting the value to INT_MAX indicates no limit.
|
|
|
|
*
|
|
|
|
* max_*_opts_len is the length limit in bytes of a Destination or
|
|
|
|
* Hop-by-Hop options extension header. Setting the value to INT_MAX
|
|
|
|
* indicates no length limit.
|
|
|
|
*
|
|
|
|
* If a limit is exceeded when processing an extension header the packet is
|
|
|
|
* silently discarded.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Default limits for Hop-by-Hop and Destination options */
|
|
|
|
#define IP6_DEFAULT_MAX_DST_OPTS_CNT 8
|
|
|
|
#define IP6_DEFAULT_MAX_HBH_OPTS_CNT 8
|
|
|
|
#define IP6_DEFAULT_MAX_DST_OPTS_LEN INT_MAX /* No limit */
|
|
|
|
#define IP6_DEFAULT_MAX_HBH_OPTS_LEN INT_MAX /* No limit */
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Addr type
|
|
|
|
*
|
|
|
|
* type - unicast | multicast
|
|
|
|
* scope - local | site | global
|
|
|
|
* v4 - compat
|
|
|
|
* v4mapped
|
|
|
|
* any
|
|
|
|
* loopback
|
|
|
|
*/
|
|
|
|
|
|
|
|
#define IPV6_ADDR_ANY 0x0000U
|
|
|
|
|
2018-02-28 07:48:21 +08:00
|
|
|
#define IPV6_ADDR_UNICAST 0x0001U
|
|
|
|
#define IPV6_ADDR_MULTICAST 0x0002U
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#define IPV6_ADDR_LOOPBACK 0x0010U
|
|
|
|
#define IPV6_ADDR_LINKLOCAL 0x0020U
|
|
|
|
#define IPV6_ADDR_SITELOCAL 0x0040U
|
|
|
|
|
|
|
|
#define IPV6_ADDR_COMPATv4 0x0080U
|
|
|
|
|
|
|
|
#define IPV6_ADDR_SCOPE_MASK 0x00f0U
|
|
|
|
|
|
|
|
#define IPV6_ADDR_MAPPED 0x1000U
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Addr scopes
|
|
|
|
*/
|
|
|
|
#define IPV6_ADDR_MC_SCOPE(a) \
|
|
|
|
((a)->s6_addr[1] & 0x0f) /* nonstandard */
|
|
|
|
#define __IPV6_ADDR_SCOPE_INVALID -1
|
|
|
|
#define IPV6_ADDR_SCOPE_NODELOCAL 0x01
|
|
|
|
#define IPV6_ADDR_SCOPE_LINKLOCAL 0x02
|
|
|
|
#define IPV6_ADDR_SCOPE_SITELOCAL 0x05
|
|
|
|
#define IPV6_ADDR_SCOPE_ORGLOCAL 0x08
|
|
|
|
#define IPV6_ADDR_SCOPE_GLOBAL 0x0e
|
|
|
|
|
2011-02-15 21:19:20 +08:00
|
|
|
/*
|
|
|
|
* Addr flags
|
|
|
|
*/
|
|
|
|
#define IPV6_ADDR_MC_FLAG_TRANSIENT(a) \
|
|
|
|
((a)->s6_addr[1] & 0x10)
|
|
|
|
#define IPV6_ADDR_MC_FLAG_PREFIX(a) \
|
|
|
|
((a)->s6_addr[1] & 0x20)
|
|
|
|
#define IPV6_ADDR_MC_FLAG_RENDEZVOUS(a) \
|
|
|
|
((a)->s6_addr[1] & 0x40)
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* fragmentation header
|
|
|
|
*/
|
|
|
|
|
|
|
|
struct frag_hdr {
|
2006-11-08 16:21:46 +08:00
|
|
|
__u8 nexthdr;
|
|
|
|
__u8 reserved;
|
|
|
|
__be16 frag_off;
|
|
|
|
__be32 identification;
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
2022-05-14 02:34:00 +08:00
|
|
|
/*
|
|
|
|
* Jumbo payload option, as described in RFC 2675 2.
|
|
|
|
*/
|
|
|
|
struct hop_jumbo_hdr {
|
|
|
|
u8 nexthdr;
|
|
|
|
u8 hdrlen;
|
|
|
|
u8 tlv_type; /* IPV6_TLV_JUMBO, 0xC2 */
|
|
|
|
u8 tlv_len; /* 4 */
|
|
|
|
__be32 jumbo_payload_len;
|
|
|
|
};
|
|
|
|
|
2013-12-04 01:39:29 +08:00
|
|
|
#define IP6_MF 0x0001
|
|
|
|
#define IP6_OFFSET 0xFFF8
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2019-05-29 19:25:32 +08:00
|
|
|
struct ip6_fraglist_iter {
|
|
|
|
struct ipv6hdr *tmp_hdr;
|
|
|
|
struct sk_buff *frag;
|
|
|
|
int offset;
|
|
|
|
unsigned int hlen;
|
|
|
|
__be32 frag_id;
|
|
|
|
u8 nexthdr;
|
|
|
|
};
|
|
|
|
|
|
|
|
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
|
|
|
|
u8 nexthdr, __be32 frag_id,
|
|
|
|
struct ip6_fraglist_iter *iter);
|
|
|
|
void ip6_fraglist_prepare(struct sk_buff *skb, struct ip6_fraglist_iter *iter);
|
|
|
|
|
|
|
|
static inline struct sk_buff *ip6_fraglist_next(struct ip6_fraglist_iter *iter)
|
|
|
|
{
|
|
|
|
struct sk_buff *skb = iter->frag;
|
|
|
|
|
|
|
|
iter->frag = skb->next;
|
|
|
|
skb_mark_not_on_list(skb);
|
|
|
|
|
|
|
|
return skb;
|
|
|
|
}
|
|
|
|
|
2019-05-29 19:25:34 +08:00
|
|
|
struct ip6_frag_state {
|
|
|
|
u8 *prevhdr;
|
|
|
|
unsigned int hlen;
|
|
|
|
unsigned int mtu;
|
|
|
|
unsigned int left;
|
|
|
|
int offset;
|
|
|
|
int ptr;
|
|
|
|
int hroom;
|
|
|
|
int troom;
|
|
|
|
__be32 frag_id;
|
|
|
|
u8 nexthdr;
|
|
|
|
};
|
|
|
|
|
|
|
|
void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
|
|
|
|
unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
|
|
|
|
u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state);
|
|
|
|
struct sk_buff *ip6_frag_next(struct sk_buff *skb,
|
|
|
|
struct ip6_frag_state *state);
|
|
|
|
|
2014-05-14 01:17:33 +08:00
|
|
|
#define IP6_REPLY_MARK(net, mark) \
|
|
|
|
((net)->ipv6.sysctl.fwmark_reflect ? (mark) : 0)
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <net/sock.h>
|
|
|
|
|
|
|
|
/* sysctls */
|
|
|
|
extern int sysctl_mld_max_msf;
|
2014-09-02 21:49:25 +08:00
|
|
|
extern int sysctl_mld_qrv;
|
2008-01-09 16:33:11 +08:00
|
|
|
|
2016-04-28 07:44:43 +08:00
|
|
|
/*
 * Increment the 64-bit counter @field of MIB @statname.
 *
 * The per-netns MIB is always updated; the per-device MIB is updated only
 * when @idev is non-NULL.  @mod is pasted in front of SNMP_INC_STATS64 and
 * is either empty or "__" (see IP6_INC_STATS / __IP6_INC_STATS).
 * @idev is evaluated exactly once.
 */
#define _DEVINC(net, statname, mod, idev, field)			\
({									\
	struct inet6_dev *_idev = (idev);				\
	if (likely(_idev != NULL))					\
		mod##SNMP_INC_STATS64((_idev)->stats.statname, (field));\
	mod##SNMP_INC_STATS64((net)->mib.statname##_statistics, (field));\
})
|
2007-09-17 07:52:35 +08:00
|
|
|
|
2011-05-19 09:14:23 +08:00
|
|
|
/*
 * Same idea as _DEVINC, but the per-device counters are atomic_long_t:
 * the device side goes through SNMP_INC_STATS_ATOMIC_LONG on
 * stats.<statname>dev, while the per-netns side uses @mod##SNMP_INC_STATS.
 * The per-device update is skipped when @idev is NULL.
 */
#define _DEVINCATOMIC(net, statname, mod, idev, field)			\
({									\
	struct inet6_dev *_idev = (idev);				\
	if (likely(_idev != NULL))					\
		SNMP_INC_STATS_ATOMIC_LONG((_idev)->stats.statname##dev, (field)); \
	mod##SNMP_INC_STATS((net)->mib.statname##_statistics, (field));\
})
|
|
|
|
|
2011-11-13 09:24:04 +08:00
|
|
|
/*
 * Variant for MIBs where both the per-device and the per-netns counters are
 * atomic_long_t: both sides use SNMP_INC_STATS_ATOMIC_LONG.
 * The per-device update is skipped when @idev is NULL.
 */
#define _DEVINC_ATOMIC_ATOMIC(net, statname, idev, field)		\
({									\
	struct inet6_dev *_idev = (idev);				\
	if (likely(_idev != NULL))					\
		SNMP_INC_STATS_ATOMIC_LONG((_idev)->stats.statname##dev, (field)); \
	SNMP_INC_STATS_ATOMIC_LONG((net)->mib.statname##_statistics, (field));\
})
|
|
|
|
|
2016-04-28 07:44:43 +08:00
|
|
|
/*
 * Add @val to counter @field of MIB @statname: always in the per-netns MIB,
 * and in the per-device MIB when @idev is non-NULL.  @mod (empty or "__")
 * is pasted in front of SNMP_ADD_STATS.
 */
#define _DEVADD(net, statname, mod, idev, field, val)			\
({									\
	struct inet6_dev *_idev = (idev);				\
	if (likely(_idev != NULL))					\
		mod##SNMP_ADD_STATS((_idev)->stats.statname, (field), (val)); \
	mod##SNMP_ADD_STATS((net)->mib.statname##_statistics, (field), (val));\
})
|
|
|
|
|
2016-04-28 07:44:43 +08:00
|
|
|
/*
 * Forward @field / @val to @mod##SNMP_UPD_PO_STATS for the per-netns MIB and,
 * when @idev is non-NULL, for the per-device MIB as well.
 *
 * NOTE(review): @field is deliberately passed through without parentheses,
 * presumably because SNMP_UPD_PO_STATS token-pastes it - confirm before
 * "fixing" this.
 */
#define _DEVUPD(net, statname, mod, idev, field, val)			\
({									\
	struct inet6_dev *_idev = (idev);				\
	if (likely(_idev != NULL))					\
		mod##SNMP_UPD_PO_STATS((_idev)->stats.statname, field, (val)); \
	mod##SNMP_UPD_PO_STATS((net)->mib.statname##_statistics, field, (val));\
})
|
|
|
|
|
2007-09-17 07:52:35 +08:00
|
|
|
/* MIBs */
|
|
|
|
|
2008-10-09 01:35:11 +08:00
|
|
|
/*
 * IPv6 ("ipv6" MIB) counters.  Each operation comes in two flavours that
 * differ only in the prefix handed to the underlying SNMP_* helper:
 * the plain name passes an empty prefix, the "__" name passes "__".
 */
#define IP6_INC_STATS(net, idev, field)		\
		_DEVINC(net, ipv6, , idev, field)
#define __IP6_INC_STATS(net, idev, field)	\
		_DEVINC(net, ipv6, __, idev, field)
#define IP6_ADD_STATS(net, idev, field, val)	\
		_DEVADD(net, ipv6, , idev, field, val)
#define __IP6_ADD_STATS(net, idev, field, val)	\
		_DEVADD(net, ipv6, __, idev, field, val)
#define IP6_UPD_PO_STATS(net, idev, field, val)   \
		_DEVUPD(net, ipv6, , idev, field, val)
#define __IP6_UPD_PO_STATS(net, idev, field, val)   \
		_DEVUPD(net, ipv6, __, idev, field, val)

/* ICMPv6 ("icmpv6" MIB) counters; per-device side is atomic_long_t. */
#define ICMP6_INC_STATS(net, idev, field)	\
		_DEVINCATOMIC(net, icmpv6, , idev, field)
#define __ICMP6_INC_STATS(net, idev, field)	\
		_DEVINCATOMIC(net, icmpv6, __, idev, field)
|
2008-10-09 01:35:11 +08:00
|
|
|
|
|
|
|
/*
 * Per-message-type ICMPv6 counters ("icmpv6msg" MIB).
 *
 * Input messages are counted at index @field, output messages at index
 * @field + 256.  @field is parenthesised before the addition so that an
 * argument containing a lower-precedence operator (e.g. "x & 1" or a
 * conditional expression) is not silently re-associated with the "+ 256".
 */
#define ICMP6MSGOUT_INC_STATS(net, idev, field)		\
	_DEVINC_ATOMIC_ATOMIC(net, icmpv6msg, idev, (field) + 256)
#define ICMP6MSGIN_INC_STATS(net, idev, field)	\
	_DEVINC_ATOMIC_ATOMIC(net, icmpv6msg, idev, field)
|
2007-09-17 07:52:35 +08:00
|
|
|
|
2009-11-03 11:26:03 +08:00
|
|
|
/* One entry of a singly linked list of sockets, chained through @next. */
struct ip6_ra_chain {
	struct ip6_ra_chain	*next;
	struct sock		*sk;
	int			sel;
	void			(*destructor)(struct sock *);
};
|
|
|
|
|
|
|
|
extern struct ip6_ra_chain *ip6_ra_chain;
|
|
|
|
extern rwlock_t ip6_ra_lock;
|
|
|
|
|
|
|
|
/*
|
|
|
|
This structure is prepared by protocol, when parsing
|
|
|
|
ancillary data and passed to IPv6.
|
|
|
|
*/
|
|
|
|
|
2009-11-03 11:26:03 +08:00
|
|
|
struct ipv6_txoptions {
|
2017-07-04 14:34:54 +08:00
|
|
|
refcount_t refcnt;
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Length of this structure */
|
|
|
|
int tot_len;
|
|
|
|
|
|
|
|
/* length of extension headers */
|
|
|
|
|
|
|
|
__u16 opt_flen; /* after fragment hdr */
|
|
|
|
__u16 opt_nflen; /* before fragment hdr */
|
|
|
|
|
|
|
|
struct ipv6_opt_hdr *hopopt;
|
|
|
|
struct ipv6_opt_hdr *dst0opt;
|
|
|
|
struct ipv6_rt_hdr *srcrt; /* Routing Header */
|
|
|
|
struct ipv6_opt_hdr *dst1opt;
|
2015-11-30 11:37:57 +08:00
|
|
|
struct rcu_head rcu;
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Option buffer, as read by IPV6_PKTOPTIONS, starts here. */
|
|
|
|
};
|
|
|
|
|
2019-07-01 21:39:36 +08:00
|
|
|
/*
 * Values of the flowlabel_reflect sysctl.  Each enumerator is a distinct
 * single bit (1, 2, 4).
 */
enum flowlabel_reflect {
	FLOWLABEL_REFLECT_ESTABLISHED		= 1,
	FLOWLABEL_REFLECT_TCP_RESET		= 2,
	FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES	= 4,
};
|
|
|
|
|
2009-11-03 11:26:03 +08:00
|
|
|
struct ip6_flowlabel {
|
2013-03-07 12:20:32 +08:00
|
|
|
struct ip6_flowlabel __rcu *next;
|
2006-11-08 16:25:17 +08:00
|
|
|
__be32 label;
|
2007-05-04 08:39:04 +08:00
|
|
|
atomic_t users;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct in6_addr dst;
|
|
|
|
struct ipv6_txoptions *opt;
|
|
|
|
unsigned long linger;
|
2013-01-30 17:27:47 +08:00
|
|
|
struct rcu_head rcu;
|
2005-04-17 06:20:36 +08:00
|
|
|
u8 share;
|
2012-05-25 00:37:59 +08:00
|
|
|
union {
|
|
|
|
struct pid *pid;
|
|
|
|
kuid_t uid;
|
|
|
|
} owner;
|
2005-04-17 06:20:36 +08:00
|
|
|
unsigned long lastuse;
|
|
|
|
unsigned long expires;
|
2008-03-27 07:53:08 +08:00
|
|
|
struct net *fl_net;
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
ipv6: Flow label state ranges
This patch divides the IPv6 flow label space into two ranges:
0-7ffff is reserved for flow label manager, 80000-fffff will be
used for creating auto flow labels (per RFC6438). This only affects how
labels are set on transmit, it does not affect receive. This range split
can be disabled by sysctl.
Background:
IPv6 flow labels have been an unmitigated disappointment thus far
in the lifetime of IPv6. Support in HW devices to use them for ECMP
is lacking, and OSes don't turn them on by default. If we had these
we could get much better hashing in IPv6 networks without resorting
to DPI, possibly eliminating some of the motivations to define new
encaps in UDP just for getting ECMP.
Unfortunately, the initial specifications of IPv6 did not clarify
how they are to be used. There has always been a vague concept that
these can be used for ECMP, flow hashing, etc. and we do now have a
good standard how to do this in RFC6438. The problem is that flow labels
can be either stateful or stateless (as in RFC6438), and we are
presented with the possibility that a stateless label may collide
with a stateful one. Attempts to split the flow label space were
rejected in IETF. When we added support in Linux for RFC6438, we
could not turn on flow labels by default due to this conflict.
This patch splits the flow label space and should give us
a path to enabling auto flow labels by default for all IPv6 packets.
This is an API change so we need to consider compatibility with
existing deployment. The stateful range is chosen to be the lower
values in hopes that most uses would have chosen small numbers.
Once we resolve the stateless/stateful issue, we can proceed to
look at enabling RFC6438 flow labels by default (starting with
scaled testing).
Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-04-30 06:33:21 +08:00
|
|
|
#define IPV6_FLOWINFO_MASK cpu_to_be32(0x0FFFFFFF)
|
|
|
|
#define IPV6_FLOWLABEL_MASK cpu_to_be32(0x000FFFFF)
|
|
|
|
#define IPV6_FLOWLABEL_STATELESS_FLAG cpu_to_be32(0x00080000)
|
|
|
|
|
2013-12-08 22:46:58 +08:00
|
|
|
#define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK)
|
2014-01-15 17:03:30 +08:00
|
|
|
#define IPV6_TCLASS_SHIFT 20
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-11-03 11:26:03 +08:00
|
|
|
struct ipv6_fl_socklist {
|
2013-03-07 12:20:32 +08:00
|
|
|
struct ipv6_fl_socklist __rcu *next;
|
|
|
|
struct ip6_flowlabel *fl;
|
|
|
|
struct rcu_head rcu;
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
2016-05-03 12:40:07 +08:00
|
|
|
struct ipcm6_cookie {
|
2018-07-06 22:12:57 +08:00
|
|
|
struct sockcm_cookie sockc;
|
2016-05-03 12:40:07 +08:00
|
|
|
__s16 hlimit;
|
|
|
|
__s16 tclass;
|
2021-11-16 03:02:36 +08:00
|
|
|
__u16 gso_size;
|
2016-05-03 12:40:07 +08:00
|
|
|
__s8 dontfrag;
|
|
|
|
struct ipv6_txoptions *opt;
|
|
|
|
};
|
|
|
|
|
2018-07-06 22:12:55 +08:00
|
|
|
static inline void ipcm6_init(struct ipcm6_cookie *ipc6)
|
|
|
|
{
|
|
|
|
*ipc6 = (struct ipcm6_cookie) {
|
|
|
|
.hlimit = -1,
|
|
|
|
.tclass = -1,
|
|
|
|
.dontfrag = -1,
|
|
|
|
};
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void ipcm6_init_sk(struct ipcm6_cookie *ipc6,
|
|
|
|
const struct ipv6_pinfo *np)
|
|
|
|
{
|
|
|
|
*ipc6 = (struct ipcm6_cookie) {
|
|
|
|
.hlimit = -1,
|
|
|
|
.tclass = np->tclass,
|
|
|
|
.dontfrag = np->dontfrag,
|
|
|
|
};
|
|
|
|
}
|
|
|
|
|
2015-11-30 11:37:57 +08:00
|
|
|
static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np)
|
|
|
|
{
|
|
|
|
struct ipv6_txoptions *opt;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
opt = rcu_dereference(np->opt);
|
2016-02-18 08:20:33 +08:00
|
|
|
if (opt) {
|
2017-07-04 14:34:54 +08:00
|
|
|
if (!refcount_inc_not_zero(&opt->refcnt))
|
2016-02-18 08:20:33 +08:00
|
|
|
opt = NULL;
|
|
|
|
else
|
|
|
|
opt = rcu_pointer_handoff(opt);
|
|
|
|
}
|
2015-11-30 11:37:57 +08:00
|
|
|
rcu_read_unlock();
|
|
|
|
return opt;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void txopt_put(struct ipv6_txoptions *opt)
|
|
|
|
{
|
2017-07-04 14:34:54 +08:00
|
|
|
if (opt && refcount_dec_and_test(&opt->refcnt))
|
2015-11-30 11:37:57 +08:00
|
|
|
kfree_rcu(opt, rcu);
|
|
|
|
}
|
|
|
|
|
2022-02-16 00:00:37 +08:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2019-07-07 17:34:45 +08:00
|
|
|
struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label);
|
|
|
|
|
|
|
|
extern struct static_key_false_deferred ipv6_flowlabel_exclusive;
|
|
|
|
static inline struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk,
|
|
|
|
__be32 label)
|
|
|
|
{
|
2022-02-16 00:00:37 +08:00
|
|
|
if (static_branch_unlikely(&ipv6_flowlabel_exclusive.key) &&
|
|
|
|
READ_ONCE(sock_net(sk)->ipv6.flowlabel_has_excl))
|
2019-07-07 17:34:45 +08:00
|
|
|
return __fl6_sock_lookup(sk, label) ? : ERR_PTR(-ENOENT);
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
2022-02-16 00:00:37 +08:00
|
|
|
#endif
|
2019-07-07 17:34:45 +08:00
|
|
|
|
2013-09-22 01:22:42 +08:00
|
|
|
struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space,
|
|
|
|
struct ip6_flowlabel *fl,
|
|
|
|
struct ipv6_txoptions *fopt);
|
|
|
|
void fl6_free_socklist(struct sock *sk);
|
2020-07-23 14:09:01 +08:00
|
|
|
int ipv6_flowlabel_opt(struct sock *sk, sockptr_t optval, int optlen);
|
2014-01-18 00:15:04 +08:00
|
|
|
int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq,
|
|
|
|
int flags);
|
2013-09-22 01:22:42 +08:00
|
|
|
int ip6_flowlabel_init(void);
|
|
|
|
void ip6_flowlabel_cleanup(void);
|
2018-01-23 04:06:42 +08:00
|
|
|
bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
static inline void fl6_sock_release(struct ip6_flowlabel *fl)
|
|
|
|
{
|
|
|
|
if (fl)
|
|
|
|
atomic_dec(&fl->users);
|
|
|
|
}
|
|
|
|
|
2023-02-11 02:47:07 +08:00
|
|
|
enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type,
|
|
|
|
u8 code, __be32 info);
|
2012-07-12 15:33:37 +08:00
|
|
|
|
2017-10-06 14:46:14 +08:00
|
|
|
void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
|
|
|
|
struct icmp6hdr *thdr, int len);
|
2013-05-23 04:17:31 +08:00
|
|
|
|
2013-09-22 01:22:42 +08:00
|
|
|
int ip6_ra_control(struct sock *sk, int sel);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2013-09-22 01:22:42 +08:00
|
|
|
int ipv6_parse_hopopts(struct sk_buff *skb);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2013-09-22 01:22:42 +08:00
|
|
|
struct ipv6_txoptions *ipv6_dup_options(struct sock *sk,
|
|
|
|
struct ipv6_txoptions *opt);
|
|
|
|
struct ipv6_txoptions *ipv6_renew_options(struct sock *sk,
|
|
|
|
struct ipv6_txoptions *opt,
|
|
|
|
int newtype,
|
2018-07-04 21:58:05 +08:00
|
|
|
struct ipv6_opt_hdr *newopt);
|
2022-01-27 08:36:31 +08:00
|
|
|
struct ipv6_txoptions *__ipv6_fixup_options(struct ipv6_txoptions *opt_space,
|
|
|
|
struct ipv6_txoptions *opt);
|
|
|
|
|
|
|
|
static inline struct ipv6_txoptions *
|
|
|
|
ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt)
|
|
|
|
{
|
|
|
|
if (!opt)
|
|
|
|
return NULL;
|
|
|
|
return __ipv6_fixup_options(opt_space, opt);
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2014-09-28 00:50:56 +08:00
|
|
|
bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb,
|
|
|
|
const struct inet6_skb_parm *opt);
|
2016-06-28 03:02:51 +08:00
|
|
|
struct ipv6_txoptions *ipv6_update_options(struct sock *sk,
|
|
|
|
struct ipv6_txoptions *opt);
|
2005-12-14 15:24:28 +08:00
|
|
|
|
2022-05-14 02:34:01 +08:00
|
|
|
/* This helper is specialized for BIG TCP needs.
|
|
|
|
* It assumes the hop_jumbo_hdr will immediately follow the IPV6 header.
|
|
|
|
* It assumes headers are already in skb->head.
|
|
|
|
* Returns 0, or IPPROTO_TCP if a BIG TCP packet is there.
|
|
|
|
*/
|
|
|
|
static inline int ipv6_has_hopopt_jumbo(const struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
const struct hop_jumbo_hdr *jhdr;
|
|
|
|
const struct ipv6hdr *nhdr;
|
|
|
|
|
2022-05-14 02:34:03 +08:00
|
|
|
if (likely(skb->len <= GRO_LEGACY_MAX_SIZE))
|
2022-05-14 02:34:01 +08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (skb->protocol != htons(ETH_P_IPV6))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (skb_network_offset(skb) +
|
|
|
|
sizeof(struct ipv6hdr) +
|
|
|
|
sizeof(struct hop_jumbo_hdr) > skb_headlen(skb))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
nhdr = ipv6_hdr(skb);
|
|
|
|
|
|
|
|
if (nhdr->nexthdr != NEXTHDR_HOP)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
jhdr = (const struct hop_jumbo_hdr *) (nhdr + 1);
|
|
|
|
if (jhdr->tlv_type != IPV6_TLV_JUMBO || jhdr->hdrlen != 0 ||
|
|
|
|
jhdr->nexthdr != IPPROTO_TCP)
|
|
|
|
return 0;
|
|
|
|
return jhdr->nexthdr;
|
|
|
|
}
|
|
|
|
|
2022-12-10 12:16:45 +08:00
|
|
|
/* Return 0 if HBH header is successfully removed
|
|
|
|
* Or if HBH removal is unnecessary (packet is not big TCP)
|
|
|
|
* Return error to indicate dropping the packet
|
|
|
|
*/
|
|
|
|
static inline int ipv6_hopopt_jumbo_remove(struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
const int hophdr_len = sizeof(struct hop_jumbo_hdr);
|
|
|
|
int nexthdr = ipv6_has_hopopt_jumbo(skb);
|
|
|
|
struct ipv6hdr *h6;
|
|
|
|
|
|
|
|
if (!nexthdr)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (skb_cow_head(skb, 0))
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
/* Remove the HBH header.
|
|
|
|
* Layout: [Ethernet header][IPv6 header][HBH][L4 Header]
|
|
|
|
*/
|
|
|
|
memmove(skb_mac_header(skb) + hophdr_len, skb_mac_header(skb),
|
|
|
|
skb_network_header(skb) - skb_mac_header(skb) +
|
|
|
|
sizeof(struct ipv6hdr));
|
|
|
|
|
|
|
|
__skb_pull(skb, hophdr_len);
|
|
|
|
skb->network_header += hophdr_len;
|
|
|
|
skb->mac_header += hophdr_len;
|
|
|
|
|
|
|
|
h6 = ipv6_hdr(skb);
|
|
|
|
h6->nexthdr = nexthdr;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2012-11-30 18:25:59 +08:00
|
|
|
static inline bool ipv6_accept_ra(struct inet6_dev *idev)
|
|
|
|
{
|
|
|
|
/* If forwarding is enabled, RA are not accepted unless the special
|
|
|
|
* hybrid mode (accept_ra=2) is enabled.
|
|
|
|
*/
|
|
|
|
return idev->cnf.forwarding ? idev->cnf.accept_ra == 2 :
|
|
|
|
idev->cnf.accept_ra;
|
|
|
|
}
|
|
|
|
|
net: increase fragment memory usage limits
Increase the amount of memory usage limits for incomplete
IP fragments.
Arguing for new thresh high/low values:
High threshold = 4 MBytes
Low threshold = 3 MBytes
The fragmentation memory accounting code, tries to account for the
real memory usage, by measuring both the size of frag queue struct
(inet_frag_queue (ipv4:ipq/ipv6:frag_queue)) and the SKB's truesize.
We want to be able to handle/hold-on-to enough fragments, to ensure
good performance, without causing incomplete fragments to hurt
scalability, by causing the number of inet_frag_queue to grow too much
(resulting longer searches for frag queues).
For IPv4, how much memory does the largest frag consume.
Maximum size fragment is 64K, which is approx 44 fragments with
MTU(1500) sized packets. Sizeof(struct ipq) is 200. A 1500 byte
packet results in a truesize of 2944 (not 2048 as I first assumed)
(44*2944)+200 = 129736 bytes
The current default high thresh of 262144 bytes, is obviously
problematic, as only two 64K fragments can fit in the queue at the
same time.
How many 64K fragment can we fit into 4 MBytes:
4*2^20/((44*2944)+200) = 32.34 fragment in queues
An attacker could send a separate/distinct fake fragment packets per
queue, causing us to allocate one inet_frag_queue per packet, and thus
attacking the hash table and its lists.
How many frag queue do we need to store, and given a current hash size
of 64, what is the average list length.
Using one MTU sized fragment per inet_frag_queue, each consuming
(2944+200) 3144 bytes.
4*2^20/(2944+200) = 1334 frag queues -> 21 avg list length
An attack could send small fragments, the smallest packet I could send
resulted in a truesize of 896 bytes (I'm a little surprised by this).
4*2^20/(896+200) = 3827 frag queues -> 59 avg list length
When increasing these number, we also need to followup with
improvements, that is going to help scalability. Simply increasing
the hash size, is not enough as the current implementation does not
have a per hash bucket locking.
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-15 15:16:35 +08:00
|
|
|
#define IPV6_FRAG_HIGH_THRESH (4 * 1024*1024) /* 4194304 */
|
|
|
|
#define IPV6_FRAG_LOW_THRESH (3 * 1024*1024) /* 3145728 */
|
2010-02-17 02:40:04 +08:00
|
|
|
#define IPV6_FRAG_TIMEOUT (60 * HZ) /* 60 seconds */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2013-09-22 01:22:42 +08:00
|
|
|
int __ipv6_addr_type(const struct in6_addr *addr);
|
2005-11-09 01:38:12 +08:00
|
|
|
/* Low 16 bits of __ipv6_addr_type() for @addr. */
static inline int ipv6_addr_type(const struct in6_addr *addr)
{
	const int full_type = __ipv6_addr_type(addr);

	return full_type & 0xffff;
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
static inline int ipv6_addr_scope(const struct in6_addr *addr)
|
|
|
|
{
|
2005-11-09 01:38:12 +08:00
|
|
|
return __ipv6_addr_type(addr) & IPV6_ADDR_SCOPE_MASK;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int __ipv6_addr_src_scope(int type)
|
|
|
|
{
|
2010-09-23 04:43:57 +08:00
|
|
|
return (type == IPV6_ADDR_ANY) ? __IPV6_ADDR_SCOPE_INVALID : (type >> 16);
|
2005-11-09 01:38:12 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Source scope of @addr; see __ipv6_addr_src_scope() for the encoding. */
static inline int ipv6_addr_src_scope(const struct in6_addr *addr)
{
	const int full_type = __ipv6_addr_type(addr);

	return __ipv6_addr_src_scope(full_type);
}
|
|
|
|
|
2013-03-08 10:07:16 +08:00
|
|
|
static inline bool __ipv6_addr_needs_scope_id(int type)
|
|
|
|
{
|
|
|
|
return type & IPV6_ADDR_LINKLOCAL ||
|
|
|
|
(type & IPV6_ADDR_MULTICAST &&
|
|
|
|
(type & (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL)));
|
|
|
|
}
|
|
|
|
|
|
|
|
/* @iface when @addr needs a scope id, 0 otherwise. */
static inline __u32 ipv6_iface_scope_id(const struct in6_addr *addr, int iface)
{
	if (__ipv6_addr_needs_scope_id(__ipv6_addr_type(addr)))
		return iface;

	return 0;
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
 * Byte-wise three-way comparison of two addresses (memcmp semantics):
 * zero when equal, otherwise the sign follows the first differing byte.
 */
static inline int ipv6_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2)
{
	return memcmp(a1, a2, sizeof(*a1));
}
|
|
|
|
|
2012-07-11 03:05:57 +08:00
|
|
|
static inline bool
|
2006-03-21 10:03:16 +08:00
|
|
|
ipv6_masked_addr_cmp(const struct in6_addr *a1, const struct in6_addr *m,
|
|
|
|
const struct in6_addr *a2)
|
|
|
|
{
|
2012-07-11 03:05:57 +08:00
|
|
|
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
|
|
|
|
const unsigned long *ul1 = (const unsigned long *)a1;
|
|
|
|
const unsigned long *ulm = (const unsigned long *)m;
|
|
|
|
const unsigned long *ul2 = (const unsigned long *)a2;
|
|
|
|
|
|
|
|
return !!(((ul1[0] ^ ul2[0]) & ulm[0]) |
|
|
|
|
((ul1[1] ^ ul2[1]) & ulm[1]));
|
|
|
|
#else
|
2010-09-23 04:43:57 +08:00
|
|
|
return !!(((a1->s6_addr32[0] ^ a2->s6_addr32[0]) & m->s6_addr32[0]) |
|
|
|
|
((a1->s6_addr32[1] ^ a2->s6_addr32[1]) & m->s6_addr32[1]) |
|
|
|
|
((a1->s6_addr32[2] ^ a2->s6_addr32[2]) & m->s6_addr32[2]) |
|
|
|
|
((a1->s6_addr32[3] ^ a2->s6_addr32[3]) & m->s6_addr32[3]));
|
2012-07-11 03:05:57 +08:00
|
|
|
#endif
|
2006-03-21 10:03:16 +08:00
|
|
|
}
|
|
|
|
|
2018-02-28 07:48:21 +08:00
|
|
|
/*
 * Write the first @plen bits of @addr into @pfx and clear the remaining
 * bits of @pfx.
 */
static inline void ipv6_addr_prefix(struct in6_addr *pfx,
				    const struct in6_addr *addr,
				    int plen)
{
	/* caller must guarantee 0 <= plen <= 128 */
	const int whole_bytes = plen >> 3;
	const int extra_bits = plen & 0x7;

	memset(pfx->s6_addr, 0, sizeof(pfx->s6_addr));
	memcpy(pfx->s6_addr, addr, whole_bytes);

	/* 0xff00 >> n leaves the top n bits set in the low byte. */
	if (extra_bits != 0)
		pfx->s6_addr[whole_bytes] =
			addr->s6_addr[whole_bytes] & (0xff00 >> extra_bits);
}
|
|
|
|
|
2015-12-10 05:46:31 +08:00
|
|
|
/*
 * Overwrite the first @plen bits of @addr with the first @plen bits of
 * @pfx; the remaining bits of @addr are left untouched.
 */
static inline void ipv6_addr_prefix_copy(struct in6_addr *addr,
					 const struct in6_addr *pfx,
					 int plen)
{
	/* caller must guarantee 0 <= plen <= 128 */
	const int whole_bytes = plen >> 3;
	const int extra_bits = plen & 0x7;

	memcpy(addr->s6_addr, pfx, whole_bytes);

	if (extra_bits != 0) {
		/* top @extra_bits bits of the partially covered byte */
		const int hi_mask = 0xff00 >> extra_bits;

		addr->s6_addr[whole_bytes] &= ~hi_mask;
		addr->s6_addr[whole_bytes] |= (pfx->s6_addr[whole_bytes] & hi_mask);
	}
}
|
|
|
|
|
2013-01-14 15:10:24 +08:00
|
|
|
/*
 * Store two 32-bit words: @wh into addr[0] and @wl into addr[1].
 *
 * On 64-bit builds with efficient unaligned access, a pair of compile-time
 * constants is combined (in the order dictated by endianness) and written
 * with a single 64-bit store; every other case falls back to two 32-bit
 * stores.
 */
static inline void __ipv6_addr_set_half(__be32 *addr,
					__be32 wh, __be32 wl)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
#if defined(__BIG_ENDIAN)
	/* big endian: first word in memory is the high half of the u64 */
	if (__builtin_constant_p(wh) && __builtin_constant_p(wl)) {
		*(__force u64 *)addr = ((__force u64)(wh) << 32 | (__force u64)(wl));
		return;
	}
#elif defined(__LITTLE_ENDIAN)
	/* little endian: first word in memory is the low half of the u64 */
	if (__builtin_constant_p(wl) && __builtin_constant_p(wh)) {
		*(__force u64 *)addr = ((__force u64)(wl) << 32 | (__force u64)(wh));
		return;
	}
#endif
#endif
	addr[0] = wh;
	addr[1] = wl;
}
|
|
|
|
|
2018-02-28 07:48:21 +08:00
|
|
|
/*
 * Fill @addr from four 32-bit words, @w1 first.  The work is done as two
 * halves by __ipv6_addr_set_half().
 */
static inline void ipv6_addr_set(struct in6_addr *addr,
				 __be32 w1, __be32 w2,
				 __be32 w3, __be32 w4)
{
	__be32 *words = addr->s6_addr32;

	__ipv6_addr_set_half(words, w1, w2);
	__ipv6_addr_set_half(words + 2, w3, w4);
}
|
|
|
|
|
2012-05-18 14:14:11 +08:00
|
|
|
static inline bool ipv6_addr_equal(const struct in6_addr *a1,
|
|
|
|
const struct in6_addr *a2)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2012-07-11 03:05:57 +08:00
|
|
|
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
|
|
|
|
const unsigned long *ul1 = (const unsigned long *)a1;
|
|
|
|
const unsigned long *ul2 = (const unsigned long *)a2;
|
|
|
|
|
|
|
|
return ((ul1[0] ^ ul2[0]) | (ul1[1] ^ ul2[1])) == 0UL;
|
|
|
|
#else
|
2010-09-23 04:43:57 +08:00
|
|
|
return ((a1->s6_addr32[0] ^ a2->s6_addr32[0]) |
|
|
|
|
(a1->s6_addr32[1] ^ a2->s6_addr32[1]) |
|
|
|
|
(a1->s6_addr32[2] ^ a2->s6_addr32[2]) |
|
|
|
|
(a1->s6_addr32[3] ^ a2->s6_addr32[3])) == 0;
|
2012-07-11 03:05:57 +08:00
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2013-01-14 15:10:38 +08:00
|
|
|
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
|
|
|
|
/*
 * Compare the leading @len bits of two big-endian 64-bit words.
 * A zero @len always compares equal.
 */
static inline bool __ipv6_prefix_equal64_half(const __be64 *a1,
					      const __be64 *a2,
					      unsigned int len)
{
	__be64 prefix_mask;

	if (!len)
		return true;

	prefix_mask = cpu_to_be64((~0UL) << (64 - len));
	return ((*a1 ^ *a2) & prefix_mask) == 0;
}
|
|
|
|
|
|
|
|
static inline bool ipv6_prefix_equal(const struct in6_addr *addr1,
|
|
|
|
const struct in6_addr *addr2,
|
|
|
|
unsigned int prefixlen)
|
|
|
|
{
|
|
|
|
const __be64 *a1 = (const __be64 *)addr1;
|
|
|
|
const __be64 *a2 = (const __be64 *)addr2;
|
|
|
|
|
|
|
|
if (prefixlen >= 64) {
|
|
|
|
if (a1[0] ^ a2[0])
|
|
|
|
return false;
|
|
|
|
return __ipv6_prefix_equal64_half(a1 + 1, a2 + 1, prefixlen - 64);
|
|
|
|
}
|
|
|
|
return __ipv6_prefix_equal64_half(a1, a2, prefixlen);
|
|
|
|
}
|
|
|
|
#else
|
2013-01-14 15:10:31 +08:00
|
|
|
static inline bool ipv6_prefix_equal(const struct in6_addr *addr1,
|
|
|
|
const struct in6_addr *addr2,
|
|
|
|
unsigned int prefixlen)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2013-01-14 15:10:31 +08:00
|
|
|
const __be32 *a1 = addr1->s6_addr32;
|
|
|
|
const __be32 *a2 = addr2->s6_addr32;
|
2012-04-15 13:58:06 +08:00
|
|
|
unsigned int pdw, pbi;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* check complete u32 in prefix */
|
|
|
|
pdw = prefixlen >> 5;
|
|
|
|
if (pdw && memcmp(a1, a2, pdw << 2))
|
2012-05-18 14:14:11 +08:00
|
|
|
return false;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* check incomplete u32 in prefix */
|
|
|
|
pbi = prefixlen & 0x1f;
|
|
|
|
if (pbi && ((a1[pdw] ^ a2[pdw]) & htonl((0xffffffff) << (32 - pbi))))
|
2012-05-18 14:14:11 +08:00
|
|
|
return false;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2012-05-18 14:14:11 +08:00
|
|
|
return true;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2013-01-14 15:10:38 +08:00
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2012-05-18 14:14:11 +08:00
|
|
|
static inline bool ipv6_addr_any(const struct in6_addr *a)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2012-07-11 03:05:57 +08:00
|
|
|
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
|
|
|
|
const unsigned long *ul = (const unsigned long *)a;
|
|
|
|
|
|
|
|
return (ul[0] | ul[1]) == 0UL;
|
|
|
|
#else
|
2010-09-23 04:43:57 +08:00
|
|
|
return (a->s6_addr32[0] | a->s6_addr32[1] |
|
|
|
|
a->s6_addr32[2] | a->s6_addr32[3]) == 0;
|
2012-07-11 03:05:57 +08:00
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2012-07-18 16:11:12 +08:00
|
|
|
static inline u32 ipv6_addr_hash(const struct in6_addr *a)
|
|
|
|
{
|
|
|
|
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
|
|
|
|
const unsigned long *ul = (const unsigned long *)a;
|
|
|
|
unsigned long x = ul[0] ^ ul[1];
|
|
|
|
|
|
|
|
return (u32)(x ^ (x >> 32));
|
|
|
|
#else
|
|
|
|
return (__force u32)(a->s6_addr32[0] ^ a->s6_addr32[1] ^
|
|
|
|
a->s6_addr32[2] ^ a->s6_addr32[3]);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2013-02-21 20:18:52 +08:00
|
|
|
/* more secure version of ipv6_addr_hash() */
|
2013-10-20 03:48:52 +08:00
|
|
|
static inline u32 __ipv6_addr_jhash(const struct in6_addr *a, const u32 initval)
|
2013-02-21 20:18:52 +08:00
|
|
|
{
|
tcp: Reduce chance of collisions in inet6_hashfn().
For both IPv4 and IPv6 incoming TCP connections are tracked in a hash
table with a hash over the source & destination addresses and ports.
However, the IPv6 hash is insufficient and can lead to a high rate of
collisions.
The IPv6 hash used an XOR to fit everything into the 96 bits for the
fast jenkins hash, meaning it is possible for an external entity to
ensure the hash collides, thus falling back to a linear search in the
bucket, which is slow.
We take the approach of hash the full length of IPv6 address in
__ipv6_addr_jhash() so that all users can benefit from a more secure
version.
While this may look like it adds overhead, the reality of modern CPUs
means that this is unmeasurable in real world scenarios.
In simulating with llvm-mca, the increase in cycles for the hashing
code was ~16 cycles on Skylake (from a base of ~155), and an extra ~9
on Nehalem (base of ~173).
In commit dd6d2910c5e0 ("netfilter: conntrack: switch to siphash")
netfilter switched from a jenkins hash to a siphash, but even the faster
hsiphash is a more significant overhead (~20-30%) in some preliminary
testing. So, in this patch, we keep to the more conservative approach to
ensure we don't add much overhead per SYN.
In testing, this results in a consistently even spread across the
connection buckets. In both testing and real-world scenarios, we have
not found any measurable performance impact.
Fixes: 08dcdbf6a7b9 ("ipv6: use a stronger hash for tcp")
Signed-off-by: Stewart Smith <trawets@amazon.com>
Signed-off-by: Samuel Mendoza-Jonas <samjonas@amazon.com>
Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20230721222410.17914-1-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2023-07-22 06:24:10 +08:00
|
|
|
return jhash2((__force const u32 *)a->s6_addr32,
|
|
|
|
ARRAY_SIZE(a->s6_addr32), initval);
|
2013-02-21 20:18:52 +08:00
|
|
|
}
|
|
|
|
|
2012-05-18 14:14:11 +08:00
|
|
|
static inline bool ipv6_addr_loopback(const struct in6_addr *a)
|
2008-06-20 07:33:57 +08:00
|
|
|
{
|
2013-01-14 15:10:06 +08:00
|
|
|
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
|
2014-07-16 18:55:46 +08:00
|
|
|
const __be64 *be = (const __be64 *)a;
|
2013-01-14 15:10:06 +08:00
|
|
|
|
2014-07-16 18:55:46 +08:00
|
|
|
return (be[0] | (be[1] ^ cpu_to_be64(1))) == 0UL;
|
2013-01-14 15:10:06 +08:00
|
|
|
#else
|
2010-09-23 04:43:57 +08:00
|
|
|
return (a->s6_addr32[0] | a->s6_addr32[1] |
|
2014-07-16 18:55:46 +08:00
|
|
|
a->s6_addr32[2] | (a->s6_addr32[3] ^ cpu_to_be32(1))) == 0;
|
2013-01-14 15:10:06 +08:00
|
|
|
#endif
|
2008-06-20 07:33:57 +08:00
|
|
|
}
|
|
|
|
|
2014-07-16 18:55:46 +08:00
|
|
|
/*
|
|
|
|
* Note that we must __force cast these to unsigned long to make sparse happy,
|
|
|
|
* since all of the endian-annotated types are fixed size regardless of arch.
|
|
|
|
*/
|
2012-05-18 14:14:11 +08:00
|
|
|
static inline bool ipv6_addr_v4mapped(const struct in6_addr *a)
|
2007-08-25 14:16:08 +08:00
|
|
|
{
|
2013-01-14 15:10:14 +08:00
|
|
|
return (
|
|
|
|
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
|
2014-07-16 18:55:46 +08:00
|
|
|
*(unsigned long *)a |
|
2013-01-14 15:10:14 +08:00
|
|
|
#else
|
2014-07-16 18:55:46 +08:00
|
|
|
(__force unsigned long)(a->s6_addr32[0] | a->s6_addr32[1]) |
|
2013-01-14 15:10:14 +08:00
|
|
|
#endif
|
2014-07-16 18:55:46 +08:00
|
|
|
(__force unsigned long)(a->s6_addr32[2] ^
|
|
|
|
cpu_to_be32(0x0000ffff))) == 0UL;
|
2007-08-25 14:16:08 +08:00
|
|
|
}
|
|
|
|
|
2019-10-02 01:49:06 +08:00
|
|
|
static inline bool ipv6_addr_v4mapped_loopback(const struct in6_addr *a)
|
|
|
|
{
|
|
|
|
return ipv6_addr_v4mapped(a) && ipv4_is_loopback(a->s6_addr32[3]);
|
|
|
|
}
|
|
|
|
|
2017-12-02 04:52:30 +08:00
|
|
|
static inline u32 ipv6_portaddr_hash(const struct net *net,
|
|
|
|
const struct in6_addr *addr6,
|
|
|
|
unsigned int port)
|
|
|
|
{
|
|
|
|
unsigned int hash, mix = net_hash_mix(net);
|
|
|
|
|
|
|
|
if (ipv6_addr_any(addr6))
|
|
|
|
hash = jhash_1word(0, mix);
|
|
|
|
else if (ipv6_addr_v4mapped(addr6))
|
|
|
|
hash = jhash_1word((__force u32)addr6->s6_addr32[3], mix);
|
|
|
|
else
|
|
|
|
hash = jhash2((__force u32 *)addr6->s6_addr32, 4, mix);
|
|
|
|
|
|
|
|
return hash ^ port;
|
|
|
|
}
|
|
|
|
|
2008-02-29 12:55:46 +08:00
|
|
|
/*
|
|
|
|
* Check for a RFC 4843 ORCHID address
|
|
|
|
* (Overlay Routable Cryptographic Hash Identifiers)
|
|
|
|
*/
|
2012-05-18 14:14:11 +08:00
|
|
|
static inline bool ipv6_addr_orchid(const struct in6_addr *a)
|
2008-02-29 12:55:46 +08:00
|
|
|
{
|
2010-09-23 04:43:57 +08:00
|
|
|
return (a->s6_addr32[0] & htonl(0xfffffff0)) == htonl(0x20010010);
|
2008-02-29 12:55:46 +08:00
|
|
|
}
|
|
|
|
|
2014-04-29 10:57:34 +08:00
|
|
|
static inline bool ipv6_addr_is_multicast(const struct in6_addr *addr)
|
|
|
|
{
|
|
|
|
return (addr->s6_addr32[0] & htonl(0xFF000000)) == htonl(0xFF000000);
|
|
|
|
}
|
|
|
|
|
2008-01-18 22:50:56 +08:00
|
|
|
/* Build the IPv4-mapped address ::ffff:@addr into @v4mapped. */
static inline void ipv6_addr_set_v4mapped(const __be32 addr,
					  struct in6_addr *v4mapped)
{
	/* words: 0, 0, 0x0000ffff marker, then the IPv4 address */
	ipv6_addr_set(v4mapped, 0, 0, htonl(0x0000FFFF), addr);
}
|
|
|
|
|
2005-11-09 01:37:56 +08:00
|
|
|
/*
|
|
|
|
* find the first different bit between two addresses
|
|
|
|
* length of address must be a multiple of 32bits
|
|
|
|
*/
|
2013-01-14 15:09:54 +08:00
|
|
|
static inline int __ipv6_addr_diff32(const void *token1, const void *token2, int addrlen)
|
2005-11-09 01:37:56 +08:00
|
|
|
{
|
2006-11-15 12:56:33 +08:00
|
|
|
const __be32 *a1 = token1, *a2 = token2;
|
2005-11-09 01:37:56 +08:00
|
|
|
int i;
|
|
|
|
|
|
|
|
addrlen >>= 2;
|
|
|
|
|
|
|
|
for (i = 0; i < addrlen; i++) {
|
2006-11-15 12:56:33 +08:00
|
|
|
__be32 xb = a1[i] ^ a2[i];
|
|
|
|
if (xb)
|
2010-03-29 14:00:05 +08:00
|
|
|
return i * 32 + 31 - __fls(ntohl(xb));
|
2005-11-09 01:37:56 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2018-02-28 07:48:21 +08:00
|
|
|
* we should *never* get to this point since that
|
2005-11-09 01:37:56 +08:00
|
|
|
* would mean the addrs are equal
|
|
|
|
*
|
|
|
|
* However, we do get to it 8) And exacly, when
|
|
|
|
* addresses are equal 8)
|
|
|
|
*
|
|
|
|
* ip route add 1111::/128 via ...
|
|
|
|
* ip route add 1111::/64 via ...
|
|
|
|
* and we are here.
|
|
|
|
*
|
|
|
|
* Ideally, this function should stop comparison
|
|
|
|
* at prefix length. It does not, but it is still OK,
|
|
|
|
* if returned value is greater than prefix length.
|
|
|
|
* --ANK (980803)
|
|
|
|
*/
|
2010-09-23 04:43:57 +08:00
|
|
|
return addrlen << 5;
|
2005-11-09 01:37:56 +08:00
|
|
|
}
|
|
|
|
|
2013-01-14 15:09:54 +08:00
|
|
|
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
|
|
|
|
static inline int __ipv6_addr_diff64(const void *token1, const void *token2, int addrlen)
|
|
|
|
{
|
|
|
|
const __be64 *a1 = token1, *a2 = token2;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
addrlen >>= 3;
|
|
|
|
|
|
|
|
for (i = 0; i < addrlen; i++) {
|
|
|
|
__be64 xb = a1[i] ^ a2[i];
|
|
|
|
if (xb)
|
|
|
|
return i * 64 + 63 - __fls(be64_to_cpu(xb));
|
|
|
|
}
|
|
|
|
|
|
|
|
return addrlen << 6;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
 * Find the first differing bit of two @addrlen-byte tokens.  The 64-bit
 * scanner is used only when @addrlen is a compile-time constant that is a
 * multiple of 8; everything else goes through the 32-bit scanner.
 */
static inline int __ipv6_addr_diff(const void *token1, const void *token2, int addrlen)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
	if (__builtin_constant_p(addrlen) && (addrlen & 7) == 0)
		return __ipv6_addr_diff64(token1, token2, addrlen);
#endif
	return __ipv6_addr_diff32(token1, token2, addrlen);
}
|
|
|
|
|
2005-11-09 01:37:56 +08:00
|
|
|
static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_addr *a2)
|
|
|
|
{
|
|
|
|
return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr));
|
|
|
|
}
|
|
|
|
|
2015-05-26 07:02:21 +08:00
|
|
|
__be32 ipv6_select_ident(struct net *net,
|
|
|
|
const struct in6_addr *daddr,
|
|
|
|
const struct in6_addr *saddr);
|
net: accept UFO datagrams from tuntap and packet
Tuntap and similar devices can inject GSO packets. Accept type
VIRTIO_NET_HDR_GSO_UDP, even though not generating UFO natively.
Processes are expected to use feature negotiation such as TUNSETOFFLOAD
to detect supported offload types and refrain from injecting other
packets. This process breaks down with live migration: guest kernels
do not renegotiate flags, so destination hosts need to expose all
features that the source host does.
Partially revert the UFO removal from 182e0b6b5846~1..d9d30adf5677.
This patch introduces nearly(*) no new code to simplify verification.
It brings back verbatim tuntap UFO negotiation, VIRTIO_NET_HDR_GSO_UDP
insertion and software UFO segmentation.
It does not reinstate protocol stack support, hardware offload
(NETIF_F_UFO), SKB_GSO_UDP tunneling in SKB_GSO_SOFTWARE or reception
of VIRTIO_NET_HDR_GSO_UDP packets in tuntap.
To support SKB_GSO_UDP reappearing in the stack, also reinstate
logic in act_csum and openvswitch. Achieve equivalence with v4.13 HEAD
by squashing in commit 939912216fa8 ("net: skb_needs_check() removes
CHECKSUM_UNNECESSARY check for tx.") and reverting commit 8d63bee643f1
("net: avoid skb_warn_bad_offload false positives on UFO").
(*) To avoid having to bring back skb_shinfo(skb)->ip6_frag_id,
ipv6_proxy_select_ident is changed to return a __be32 and this is
assigned directly to the frag_hdr. Also, SKB_GSO_UDP is inserted
at the end of the enum to minimize code churn.
Tested
Booted a v4.13 guest kernel with QEMU. On a host kernel before this
patch `ethtool -k eth0` shows UFO disabled. After the patch, it is
enabled, same as on a v4.13 host kernel.
A UFO packet sent from the guest appears on the tap device:
host:
nc -l -p -u 8000 &
tcpdump -n -i tap0
guest:
dd if=/dev/zero of=payload.txt bs=1 count=2000
nc -u 192.16.1.1 8000 < payload.txt
Direct tap to tap transmission of VIRTIO_NET_HDR_GSO_UDP succeeds,
packets arriving fragmented:
./with_tap_pair.sh ./tap_send_ufo tap0 tap1
(from https://github.com/wdebruij/kerneltools/tree/master/tests)
Changes
v1 -> v2
- simplified set_offload change (review comment)
- documented test procedure
Link: http://lkml.kernel.org/r/<CAF=yD-LuUeDuL9YWPJD9ykOZ0QCjNeznPDr6whqZ9NGMNF12Mw@mail.gmail.com>
Fixes: fb652fdfe837 ("macvlan/macvtap: Remove NETIF_F_UFO advertisement.")
Reported-by: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-11-21 23:22:25 +08:00
|
|
|
__be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb);
|
2014-10-31 02:27:17 +08:00
|
|
|
|
2013-09-22 01:22:42 +08:00
|
|
|
int ip6_dst_hoplimit(struct dst_entry *dst);
|
2013-08-31 13:44:28 +08:00
|
|
|
|
2014-04-29 10:57:34 +08:00
|
|
|
static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6,
|
|
|
|
struct dst_entry *dst)
|
|
|
|
{
|
|
|
|
int hlimit;
|
|
|
|
|
|
|
|
if (ipv6_addr_is_multicast(&fl6->daddr))
|
|
|
|
hlimit = np->mcast_hops;
|
|
|
|
else
|
|
|
|
hlimit = np->hop_limit;
|
|
|
|
if (hlimit < 0)
|
|
|
|
hlimit = ip6_dst_hoplimit(dst);
|
|
|
|
return hlimit;
|
|
|
|
}
|
|
|
|
|
2015-06-05 00:16:40 +08:00
|
|
|
/* copy IPv6 saddr & daddr to flow_keys, possibly using 64bit load/store
|
|
|
|
* Equivalent to : flow->v6addrs.src = iph->saddr;
|
|
|
|
* flow->v6addrs.dst = iph->daddr;
|
|
|
|
*/
|
|
|
|
static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow,
|
|
|
|
const struct ipv6hdr *iph)
|
|
|
|
{
|
|
|
|
BUILD_BUG_ON(offsetof(typeof(flow->addrs), v6addrs.dst) !=
|
|
|
|
offsetof(typeof(flow->addrs), v6addrs.src) +
|
|
|
|
sizeof(flow->addrs.v6addrs.src));
|
2022-11-15 22:24:00 +08:00
|
|
|
memcpy(&flow->addrs.v6addrs, &iph->addrs, sizeof(flow->addrs.v6addrs));
|
2015-06-05 00:16:40 +08:00
|
|
|
flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
|
|
|
|
}
|
|
|
|
|
2014-07-09 02:15:03 +08:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2015-08-01 07:52:12 +08:00
|
|
|
|
2018-08-02 04:05:10 +08:00
|
|
|
static inline bool ipv6_can_nonlocal_bind(struct net *net,
|
|
|
|
struct inet_sock *inet)
|
|
|
|
{
|
|
|
|
return net->ipv6.sysctl.ip_nonlocal_bind ||
|
|
|
|
inet->freebind || inet->transparent;
|
|
|
|
}
|
|
|
|
|
2015-08-01 07:52:12 +08:00
|
|
|
/* Sysctl settings for net ipv6.auto_flowlabels */
|
|
|
|
#define IP6_AUTO_FLOW_LABEL_OFF 0
|
|
|
|
#define IP6_AUTO_FLOW_LABEL_OPTOUT 1
|
|
|
|
#define IP6_AUTO_FLOW_LABEL_OPTIN 2
|
|
|
|
#define IP6_AUTO_FLOW_LABEL_FORCED 3
|
|
|
|
|
|
|
|
#define IP6_AUTO_FLOW_LABEL_MAX IP6_AUTO_FLOW_LABEL_FORCED
|
|
|
|
|
2015-08-01 07:52:14 +08:00
|
|
|
#define IP6_DEFAULT_AUTO_FLOW_LABELS IP6_AUTO_FLOW_LABEL_OPTOUT
|
2015-08-01 07:52:12 +08:00
|
|
|
|
2014-07-02 12:33:10 +08:00
|
|
|
/*
 * Decide the flow label for an outgoing packet.  A non-zero label supplied
 * by the caller is returned as is (masked to the label bits); otherwise,
 * when the auto_flowlabels sysctl and @autolabel permit it, a label is
 * derived from the flow hash.
 */
static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb,
					__be32 flowlabel, bool autolabel,
					struct flowi6 *fl6)
{
	u32 hash;

	/* @flowlabel may include more than a flow label, eg, the traffic class.
	 * Here we want only the flow label value.
	 */
	flowlabel &= IPV6_FLOWLABEL_MASK;

	/* explicit label wins */
	if (flowlabel)
		return flowlabel;

	/* automatic labels disabled altogether */
	if (net->ipv6.sysctl.auto_flowlabels == IP6_AUTO_FLOW_LABEL_OFF)
		return flowlabel;

	/* no auto label requested and the sysctl does not force one */
	if (!autolabel &&
	    net->ipv6.sysctl.auto_flowlabels != IP6_AUTO_FLOW_LABEL_FORCED)
		return flowlabel;

	/* Since this is being sent on the wire obfuscate hash a bit
	 * to minimize possibility that any useful information to an
	 * attacker is leaked. Only lower 20 bits are relevant.
	 */
	hash = rol32(skb_get_hash_flowi6(skb, fl6), 16);

	flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK;
	if (net->ipv6.sysctl.flowlabel_state_ranges)
		flowlabel |= IPV6_FLOWLABEL_STATELESS_FLAG;

	return flowlabel;
}
|
2015-08-01 07:52:12 +08:00
|
|
|
|
|
|
|
static inline int ip6_default_np_autolabel(struct net *net)
|
|
|
|
{
|
|
|
|
switch (net->ipv6.sysctl.auto_flowlabels) {
|
|
|
|
case IP6_AUTO_FLOW_LABEL_OFF:
|
|
|
|
case IP6_AUTO_FLOW_LABEL_OPTIN:
|
|
|
|
default:
|
|
|
|
return 0;
|
|
|
|
case IP6_AUTO_FLOW_LABEL_OPTOUT:
|
|
|
|
case IP6_AUTO_FLOW_LABEL_FORCED:
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
}
|
2014-07-09 02:15:03 +08:00
|
|
|
#else
|
|
|
|
/* Stub for builds without CONFIG_IPV6: hand @flowlabel back unchanged. */
static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb,
					__be32 flowlabel, bool autolabel,
					struct flowi6 *fl6)
{
	return flowlabel;
}
|
2015-08-01 07:52:12 +08:00
|
|
|
/* Stub for builds without CONFIG_IPV6: always 0. */
static inline int ip6_default_np_autolabel(struct net *net)
{
	return 0;
}
|
2014-07-09 02:15:03 +08:00
|
|
|
#endif
|
|
|
|
|
2018-03-11 15:45:47 +08:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
|
|
static inline int ip6_multipath_hash_policy(const struct net *net)
|
|
|
|
{
|
|
|
|
return net->ipv6.sysctl.multipath_hash_policy;
|
|
|
|
}
|
2021-05-18 02:15:22 +08:00
|
|
|
static inline u32 ip6_multipath_hash_fields(const struct net *net)
|
|
|
|
{
|
|
|
|
return net->ipv6.sysctl.multipath_hash_fields;
|
|
|
|
}
|
2018-03-11 15:45:47 +08:00
|
|
|
#else
|
|
|
|
/* Stub for builds without CONFIG_IPV6: always 0. */
static inline int ip6_multipath_hash_policy(const struct net *net)
{
	return 0;
}
|
2021-05-18 02:15:22 +08:00
|
|
|
/* Stub for builds without CONFIG_IPV6: always 0. */
static inline u32 ip6_multipath_hash_fields(const struct net *net)
{
	return 0;
}
|
2018-03-11 15:45:47 +08:00
|
|
|
#endif
|
2014-07-02 12:33:10 +08:00
|
|
|
|
2013-01-13 13:01:39 +08:00
|
|
|
/*
|
|
|
|
* Header manipulation
|
|
|
|
*/
|
|
|
|
static inline void ip6_flow_hdr(struct ipv6hdr *hdr, unsigned int tclass,
|
|
|
|
__be32 flowlabel)
|
|
|
|
{
|
2013-01-17 11:10:57 +08:00
|
|
|
*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | flowlabel;
|
2013-01-13 13:01:39 +08:00
|
|
|
}
|
|
|
|
|
2013-01-13 13:01:51 +08:00
|
|
|
static inline __be32 ip6_flowinfo(const struct ipv6hdr *hdr)
|
|
|
|
{
|
|
|
|
return *(__be32 *)hdr & IPV6_FLOWINFO_MASK;
|
|
|
|
}
|
|
|
|
|
2013-12-08 22:47:00 +08:00
|
|
|
static inline __be32 ip6_flowlabel(const struct ipv6hdr *hdr)
|
|
|
|
{
|
|
|
|
return *(__be32 *)hdr & IPV6_FLOWLABEL_MASK;
|
|
|
|
}
|
|
|
|
|
2014-01-15 17:03:30 +08:00
|
|
|
/* Extract the traffic class from a network-order flowinfo word. */
static inline u8 ip6_tclass(__be32 flowinfo)
{
	u32 tclass_bits = ntohl(flowinfo & IPV6_TCLASS_MASK);

	return tclass_bits >> IPV6_TCLASS_SHIFT;
}
|
2016-03-19 01:37:57 +08:00
|
|
|
|
2022-02-04 21:58:11 +08:00
|
|
|
/* Convert the traffic class carried in @flowinfo to a dscp_t. */
static inline dscp_t ip6_dscp(__be32 flowinfo)
{
	u8 tclass = ip6_tclass(flowinfo);

	return inet_dsfield_to_dscp(tclass);
}
|
|
|
|
|
2016-03-19 01:37:57 +08:00
|
|
|
/* Combine @tclass (host order) and @flowlabel (network order) into flowinfo. */
static inline __be32 ip6_make_flowinfo(unsigned int tclass, __be32 flowlabel)
{
	__be32 tclass_bits = htonl(tclass << IPV6_TCLASS_SHIFT);

	return tclass_bits | flowlabel;
}
|
|
|
|
|
2018-06-04 17:36:05 +08:00
|
|
|
static inline __be32 flowi6_get_flowlabel(const struct flowi6 *fl6)
|
|
|
|
{
|
|
|
|
return fl6->flowlabel & IPV6_FLOWLABEL_MASK;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Prototypes exported by ipv6
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* rcv function (called from netdevice level)
|
|
|
|
*/
|
|
|
|
|
2013-09-22 01:22:42 +08:00
|
|
|
int ipv6_rcv(struct sk_buff *skb, struct net_device *dev,
|
|
|
|
struct packet_type *pt, struct net_device *orig_dev);
|
2018-07-05 22:49:42 +08:00
|
|
|
void ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
|
|
|
|
struct net_device *orig_dev);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2015-09-16 09:04:18 +08:00
|
|
|
int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
|
2006-01-07 15:03:34 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* upper-layer output functions
|
|
|
|
*/
|
2015-09-25 22:39:20 +08:00
|
|
|
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
|
2019-09-24 23:01:14 +08:00
|
|
|
__u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority);
|
2013-09-22 01:22:42 +08:00
|
|
|
|
|
|
|
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr);
|
|
|
|
|
|
|
|
int ip6_append_data(struct sock *sk,
|
|
|
|
int getfrag(void *from, char *to, int offset, int len,
|
|
|
|
int odd, struct sk_buff *skb),
|
2022-06-07 20:00:27 +08:00
|
|
|
void *from, size_t length, int transhdrlen,
|
2016-05-03 12:40:07 +08:00
|
|
|
struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
|
2018-07-06 22:12:57 +08:00
|
|
|
struct rt6_info *rt, unsigned int flags);
|
2013-09-22 01:22:42 +08:00
|
|
|
|
|
|
|
int ip6_push_pending_frames(struct sock *sk);
|
|
|
|
|
|
|
|
void ip6_flush_pending_frames(struct sock *sk);
|
|
|
|
|
2015-01-31 23:40:15 +08:00
|
|
|
int ip6_send_skb(struct sk_buff *skb);
|
|
|
|
|
|
|
|
struct sk_buff *__ip6_make_skb(struct sock *sk, struct sk_buff_head *queue,
|
|
|
|
struct inet_cork_full *cork,
|
|
|
|
struct inet6_cork *v6_cork);
|
|
|
|
struct sk_buff *ip6_make_skb(struct sock *sk,
|
|
|
|
int getfrag(void *from, char *to, int offset,
|
|
|
|
int len, int odd, struct sk_buff *skb),
|
2022-06-07 20:00:27 +08:00
|
|
|
void *from, size_t length, int transhdrlen,
|
2022-01-27 08:36:28 +08:00
|
|
|
struct ipcm6_cookie *ipc6,
|
2016-05-03 12:40:07 +08:00
|
|
|
struct rt6_info *rt, unsigned int flags,
|
2018-07-06 22:12:57 +08:00
|
|
|
struct inet_cork_full *cork);
|
2015-01-31 23:40:15 +08:00
|
|
|
|
|
|
|
static inline struct sk_buff *ip6_finish_skb(struct sock *sk)
|
|
|
|
{
|
|
|
|
return __ip6_make_skb(sk, &sk->sk_write_queue, &inet_sk(sk)->cork,
|
|
|
|
&inet6_sk(sk)->cork);
|
|
|
|
}
|
|
|
|
|
2015-07-31 04:34:53 +08:00
|
|
|
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
|
|
|
|
struct flowi6 *fl6);
|
2019-12-04 22:35:52 +08:00
|
|
|
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
|
2013-08-28 14:04:14 +08:00
|
|
|
const struct in6_addr *final_dst);
|
2013-09-22 01:22:42 +08:00
|
|
|
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
|
2018-04-03 20:00:08 +08:00
|
|
|
const struct in6_addr *final_dst,
|
|
|
|
bool connected);
|
2020-02-24 13:27:50 +08:00
|
|
|
struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
|
|
|
|
struct net_device *dev,
|
|
|
|
struct net *net, struct socket *sock,
|
|
|
|
struct in6_addr *saddr,
|
|
|
|
const struct ip_tunnel_info *info,
|
|
|
|
u8 protocol, bool use_cache);
|
2013-09-22 01:22:42 +08:00
|
|
|
struct dst_entry *ip6_blackhole_route(struct net *net,
|
|
|
|
struct dst_entry *orig_dst);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* skb processing functions
|
|
|
|
*/
|
|
|
|
|
2015-10-08 05:48:47 +08:00
|
|
|
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb);
|
2013-09-22 01:22:42 +08:00
|
|
|
int ip6_forward(struct sk_buff *skb);
|
|
|
|
int ip6_input(struct sk_buff *skb);
|
|
|
|
int ip6_mc_input(struct sk_buff *skb);
|
2018-11-07 19:38:32 +08:00
|
|
|
void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr,
|
|
|
|
bool have_final);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2015-10-08 05:48:45 +08:00
|
|
|
int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);
|
2015-10-08 05:48:46 +08:00
|
|
|
int ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);
|
2008-01-12 11:15:08 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Extension header (options) processing
|
|
|
|
*/
|
|
|
|
|
2013-09-22 01:22:42 +08:00
|
|
|
void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt,
|
2016-11-08 21:59:20 +08:00
|
|
|
u8 *proto, struct in6_addr **daddr_p,
|
|
|
|
struct in6_addr *saddr);
|
2013-09-22 01:22:42 +08:00
|
|
|
void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt,
|
|
|
|
u8 *proto);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2013-09-22 01:22:42 +08:00
|
|
|
int ipv6_skip_exthdr(const struct sk_buff *, int start, u8 *nexthdrp,
|
|
|
|
__be16 *frag_offp);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2013-09-22 01:22:42 +08:00
|
|
|
bool ipv6_ext_hdr(u8 nexthdr);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2012-11-10 09:05:07 +08:00
|
|
|
/* Bit flags; each constant occupies its own bit. */
enum {
	IP6_FH_F_FRAG		= 0x1,
	IP6_FH_F_AUTH		= 0x2,
	IP6_FH_F_SKIP_RH	= 0x4,
};
|
|
|
|
|
|
|
|
/* find specified header and get offset to it */
|
2013-09-22 01:22:42 +08:00
|
|
|
int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, int target,
|
|
|
|
unsigned short *fragoff, int *fragflg);
|
2012-11-10 09:05:07 +08:00
|
|
|
|
2016-06-28 03:06:15 +08:00
|
|
|
int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type);
|
2006-08-24 10:18:35 +08:00
|
|
|
|
2013-09-22 01:22:42 +08:00
|
|
|
struct in6_addr *fl6_update_dst(struct flowi6 *fl6,
|
|
|
|
const struct ipv6_txoptions *opt,
|
|
|
|
struct in6_addr *orig);
|
2010-06-02 05:35:01 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* socket options (ipv6_sockglue.c)
|
|
|
|
*/
|
2021-10-26 00:48:22 +08:00
|
|
|
DECLARE_STATIC_KEY_FALSE(ip6_min_hopcount);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2022-08-17 14:18:34 +08:00
|
|
|
int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
|
|
|
|
unsigned int optlen);
|
2020-07-23 14:09:07 +08:00
|
|
|
int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
|
|
|
|
unsigned int optlen);
|
2022-09-02 08:29:31 +08:00
|
|
|
int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
|
|
|
|
sockptr_t optval, sockptr_t optlen);
|
2013-09-22 01:22:42 +08:00
|
|
|
int ipv6_getsockopt(struct sock *sk, int level, int optname,
|
|
|
|
char __user *optval, int __user *optlen);
|
|
|
|
|
2016-11-29 20:09:44 +08:00
|
|
|
int __ip6_datagram_connect(struct sock *sk, struct sockaddr *addr,
|
|
|
|
int addr_len);
|
2013-09-22 01:22:42 +08:00
|
|
|
int ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len);
|
2014-01-20 12:16:39 +08:00
|
|
|
int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *addr,
|
|
|
|
int addr_len);
|
ipv6: datagram: Update dst cache of a connected datagram sk during pmtu update
There is a case in connected UDP socket such that
getsockopt(IPV6_MTU) will return a stale MTU value. The reproducible
sequence could be the following:
1. Create a connected UDP socket
2. Send some datagrams out
3. Receive an ICMPV6_PKT_TOOBIG
4. No new outgoing datagrams to trigger the sk_dst_check()
logic to update the sk->sk_dst_cache.
5. getsockopt(IPV6_MTU) returns the mtu from the invalid
sk->sk_dst_cache instead of the newly created RTF_CACHE clone.
This patch updates the sk->sk_dst_cache for a connected datagram sk
during pmtu-update code path.
Note that the sk->sk_v6_daddr is used to do the route lookup
instead of skb->data (i.e. iph). It is because a UDP socket can become
connected after sending out some datagrams in un-connected state, or
it can be connected multiple times to different destinations. Hence,
iph may not be related to where sk is currently connected to.
It is done under '!sock_owned_by_user(sk)' condition because
the user may make another ip6_datagram_connect() (i.e changing
the sk->sk_v6_daddr) while dst lookup is happening in the pmtu-update
code path.
For the sock_owned_by_user(sk) == true case, the next patch will
introduce a release_cb() which will update the sk->sk_dst_cache.
Test:
Server (Connected UDP Socket):
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Route Details:
[root@arch-fb-vm1 ~]# ip -6 r show | egrep '2fac'
2fac::/64 dev eth0 proto kernel metric 256 pref medium
2fac:face::/64 via 2fac::face dev eth0 metric 1024 pref medium
A simple python code to create a connected UDP socket:
import socket
import errno
HOST = '2fac::1'
PORT = 8080
s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
s.bind((HOST, PORT))
s.connect(('2fac:face::face', 53))
print("connected")
while True:
try:
data = s.recv(1024)
except socket.error as se:
if se.errno == errno.EMSGSIZE:
pmtu = s.getsockopt(41, 24)
print("PMTU:%d" % pmtu)
break
s.close()
Python program output after getting an ICMPV6_PKT_TOOBIG:
[root@arch-fb-vm1 ~]# python2 ~/devshare/kernel/tasks/fib6/udp-connect-53-8080.py
connected
PMTU:1300
Cache routes after receiving TOOBIG:
[root@arch-fb-vm1 ~]# ip -6 r show table cache
2fac:face::face via 2fac::face dev eth0 metric 0
cache expires 463sec mtu 1300 pref medium
Client (Send the ICMPV6_PKT_TOOBIG):
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
scapy is used to generate the TOOBIG message. Here is the scapy script I have
used:
>>> p=Ether(src='da:75:4d:36:ac:32', dst='52:54:00:12:34:66', type=0x86dd)/IPv6(src='2fac::face', dst='2fac::1')/ICMPv6PacketTooBig(mtu=1300)/IPv6(src='2fac::
1',dst='2fac:face::face', nh='UDP')/UDP(sport=8080,dport=53)
>>> sendp(p, iface='qemubr0')
Fixes: 45e4fd26683c ("ipv6: Only create RTF_CACHE routes after encountering pmtu exception")
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Reported-by: Wei Wang <weiwan@google.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Wei Wang <weiwan@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-12 06:29:36 +08:00
|
|
|
int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr);
|
2016-04-12 06:29:37 +08:00
|
|
|
void ip6_datagram_release_cb(struct sock *sk);
|
2013-09-22 01:22:42 +08:00
|
|
|
|
2013-11-23 07:46:12 +08:00
|
|
|
int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len,
|
|
|
|
int *addr_len);
|
|
|
|
int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len,
|
|
|
|
int *addr_len);
|
2013-09-22 01:22:42 +08:00
|
|
|
void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port,
|
|
|
|
u32 info, u8 *payload);
|
|
|
|
void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info);
|
|
|
|
void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu);
|
|
|
|
|
2022-10-07 02:53:46 +08:00
|
|
|
void inet6_cleanup_sock(struct sock *sk);
|
tcp/udp: Call inet6_destroy_sock() in IPv6 sk->sk_destruct().
Originally, inet6_sk(sk)->XXX were changed under lock_sock(), so we were
able to clean them up by calling inet6_destroy_sock() during the IPv6 ->
IPv4 conversion by IPV6_ADDRFORM. However, commit 03485f2adcde ("udpv6:
Add lockless sendmsg() support") added a lockless memory allocation path,
which could cause a memory leak:
setsockopt(IPV6_ADDRFORM) sendmsg()
+-----------------------+ +-------+
- do_ipv6_setsockopt(sk, ...) - udpv6_sendmsg(sk, ...)
- sockopt_lock_sock(sk) ^._ called via udpv6_prot
- lock_sock(sk) before WRITE_ONCE()
- WRITE_ONCE(sk->sk_prot, &tcp_prot)
- inet6_destroy_sock() - if (!corkreq)
- sockopt_release_sock(sk) - ip6_make_skb(sk, ...)
- release_sock(sk) ^._ lockless fast path for
the non-corking case
- __ip6_append_data(sk, ...)
- ipv6_local_rxpmtu(sk, ...)
- xchg(&np->rxpmtu, skb)
^._ rxpmtu is never freed.
- goto out_no_dst;
- lock_sock(sk)
For now, rxpmtu is the only case, but so as not to miss future changes
and a similar bug fixed in commit e27326009a3d ("net: ping6: Fix
memleak in ipv6_renew_options()."), let's set a new function to IPv6
sk->sk_destruct() and call inet6_cleanup_sock() there. Since the
conversion does not change sk->sk_destruct(), we can guarantee that
we can clean up IPv6 resources finally.
We can now remove all inet6_destroy_sock() calls from IPv6 protocol
specific ->destroy() functions, but such changes are invasive to
backport. So they can be posted as a follow-up later for net-next.
Fixes: 03485f2adcde ("udpv6: Add lockless sendmsg() support")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-10-07 02:53:47 +08:00
|
|
|
void inet6_sock_destruct(struct sock *sk);
|
2013-09-22 01:22:42 +08:00
|
|
|
int inet6_release(struct socket *sock);
|
|
|
|
int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
|
2018-02-13 03:00:20 +08:00
|
|
|
int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
|
2013-09-22 01:22:42 +08:00
|
|
|
int peer);
|
|
|
|
int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
|
2020-05-18 14:28:06 +08:00
|
|
|
int inet6_compat_ioctl(struct socket *sock, unsigned int cmd,
|
|
|
|
unsigned long arg);
|
2013-09-22 01:22:42 +08:00
|
|
|
|
|
|
|
int inet6_hash_connect(struct inet_timewait_death_row *death_row,
|
2005-12-14 15:25:44 +08:00
|
|
|
struct sock *sk);
|
2020-01-25 08:04:02 +08:00
|
|
|
int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size);
|
|
|
|
int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
|
|
|
|
int flags);
|
2005-12-14 15:25:44 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* reassembly.c
|
|
|
|
*/
|
2005-12-23 04:49:22 +08:00
|
|
|
extern const struct proto_ops inet6_stream_ops;
|
|
|
|
extern const struct proto_ops inet6_dgram_ops;
|
2017-06-04 00:29:25 +08:00
|
|
|
extern const struct proto_ops inet6_sockraw_ops;
|
2005-08-16 13:18:02 +08:00
|
|
|
|
2005-12-27 12:43:12 +08:00
|
|
|
struct group_source_req;
|
|
|
|
struct group_filter;
|
|
|
|
|
2013-09-22 01:22:42 +08:00
|
|
|
int ip6_mc_source(int add, int omode, struct sock *sk,
|
|
|
|
struct group_source_req *pgsr);
|
2020-03-31 03:43:10 +08:00
|
|
|
int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf,
|
|
|
|
struct sockaddr_storage *list);
|
2013-09-22 01:22:42 +08:00
|
|
|
int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
|
2022-09-02 08:28:53 +08:00
|
|
|
sockptr_t optval, size_t ss_offset);
|
2005-08-16 13:18:02 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_PROC_FS
|
2013-09-22 01:22:42 +08:00
|
|
|
int ac6_proc_init(struct net *net);
|
|
|
|
void ac6_proc_exit(struct net *net);
|
|
|
|
int raw6_proc_init(void);
|
|
|
|
void raw6_proc_exit(void);
|
|
|
|
int tcp6_proc_init(struct net *net);
|
|
|
|
void tcp6_proc_exit(struct net *net);
|
|
|
|
int udp6_proc_init(struct net *net);
|
|
|
|
void udp6_proc_exit(struct net *net);
|
|
|
|
int udplite6_proc_init(void);
|
|
|
|
void udplite6_proc_exit(void);
|
|
|
|
int ipv6_misc_proc_init(void);
|
|
|
|
void ipv6_misc_proc_exit(void);
|
|
|
|
int snmp6_register_dev(struct inet6_dev *idev);
|
|
|
|
int snmp6_unregister_dev(struct inet6_dev *idev);
|
2005-08-16 13:18:02 +08:00
|
|
|
|
2007-04-25 12:54:09 +08:00
|
|
|
#else
|
2008-03-27 07:52:32 +08:00
|
|
|
/* Stub for builds without CONFIG_PROC_FS: does nothing and reports success. */
static inline int ac6_proc_init(struct net *net)
{
	return 0;
}
|
|
|
|
/* Stub for builds without CONFIG_PROC_FS: nothing to tear down. */
static inline void ac6_proc_exit(struct net *net)
{
}
|
|
|
|
/* Stub for builds without CONFIG_PROC_FS: does nothing and reports success. */
static inline int snmp6_register_dev(struct inet6_dev *idev)
{
	return 0;
}
|
|
|
|
/* Stub for builds without CONFIG_PROC_FS: does nothing and reports success. */
static inline int snmp6_unregister_dev(struct inet6_dev *idev)
{
	return 0;
}
|
2005-08-16 13:18:02 +08:00
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-08-16 13:18:02 +08:00
|
|
|
#ifdef CONFIG_SYSCTL
|
2013-09-22 01:22:42 +08:00
|
|
|
struct ctl_table *ipv6_icmp_sysctl_init(struct net *net);
|
|
|
|
struct ctl_table *ipv6_route_sysctl_init(struct net *net);
|
|
|
|
int ipv6_sysctl_register(void);
|
|
|
|
void ipv6_sysctl_unregister(void);
|
2005-08-16 13:18:02 +08:00
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2015-02-26 01:58:34 +08:00
|
|
|
int ipv6_sock_mc_join(struct sock *sk, int ifindex,
|
|
|
|
const struct in6_addr *addr);
|
ipv6/mcast: init as INCLUDE when join SSM INCLUDE group
This is an IPv6 version of the patch "ipv4/igmp: init group mode as INCLUDE when
join source group". From RFC3810, part 6.1:
If no per-interface state existed for that
multicast address before the change (i.e., the change consisted of
creating a new per-interface record), or if no state exists after the
change (i.e., the change consisted of deleting a per-interface
record), then the "non-existent" state is considered to have an
INCLUDE filter mode and an empty source list.
Which means a new multicast group should start with state IN(). Currently,
for MLDv2 SSM JOIN_SOURCE_GROUP mode, we first call ipv6_sock_mc_join(),
then ip6_mc_source(), which will trigger a TO_IN() message instead of
ALLOW().
The issue was exposed by commit a052517a8ff65 ("net/multicast: should not
send source list records when have filter mode change"). Before this change,
we sent both ALLOW(A) and TO_IN(A). Now, we only send TO_IN(A).
Fix it by adding a new parameter to init group mode. Also add some wrapper
functions to avoid changing too much code.
v1 -> v2:
In the first version I only cleared the group change record. But this is not
enough, because when a new group joins, it will init as EXCLUDE and trigger
a filter mode change in ip/ip6_mc_add_src(), which will clear all source
addresses sf_crcount. This will prevent early joined address sending state
change records if multiple source addresses are joined at the same time.
In v2 patch, I fixed it by directly initializing the mode to INCLUDE for SSM
JOIN_SOURCE_GROUP. I also split the original patch into two separated patches
for IPv4 and IPv6.
There is also a difference between v4 and v6 version. For IPv6, when the
interface goes down and up, we will send correct state change record with
unspecified IPv6 address (::) with function ipv6_mc_up(). But after DAD is
completed, we resend the change record TO_IN() in mld_send_initial_cr().
Fix it by sending ALLOW() for INCLUDE mode in mld_send_initial_cr().
Fixes: a052517a8ff65 ("net/multicast: should not send source list records when have filter mode change")
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-07-10 22:41:27 +08:00
|
|
|
int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex,
|
|
|
|
const struct in6_addr *addr, unsigned int mode);
|
2015-02-26 01:58:34 +08:00
|
|
|
int ipv6_sock_mc_drop(struct sock *sk, int ifindex,
|
|
|
|
const struct in6_addr *addr);
|
2020-05-28 13:12:31 +08:00
|
|
|
|
|
|
|
static inline int ip6_sock_set_v6only(struct sock *sk)
|
|
|
|
{
|
|
|
|
if (inet_sk(sk)->inet_num)
|
|
|
|
return -EINVAL;
|
|
|
|
lock_sock(sk);
|
|
|
|
sk->sk_ipv6only = true;
|
|
|
|
release_sock(sk);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-05-28 13:12:32 +08:00
|
|
|
static inline void ip6_sock_set_recverr(struct sock *sk)
|
|
|
|
{
|
|
|
|
lock_sock(sk);
|
|
|
|
inet6_sk(sk)->recverr = true;
|
|
|
|
release_sock(sk);
|
|
|
|
}
|
|
|
|
|
2020-05-28 13:12:33 +08:00
|
|
|
static inline int __ip6_sock_set_addr_preferences(struct sock *sk, int val)
|
|
|
|
{
|
|
|
|
unsigned int pref = 0;
|
|
|
|
unsigned int prefmask = ~0;
|
|
|
|
|
|
|
|
/* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */
|
|
|
|
switch (val & (IPV6_PREFER_SRC_PUBLIC |
|
|
|
|
IPV6_PREFER_SRC_TMP |
|
|
|
|
IPV6_PREFER_SRC_PUBTMP_DEFAULT)) {
|
|
|
|
case IPV6_PREFER_SRC_PUBLIC:
|
|
|
|
pref |= IPV6_PREFER_SRC_PUBLIC;
|
|
|
|
prefmask &= ~(IPV6_PREFER_SRC_PUBLIC |
|
|
|
|
IPV6_PREFER_SRC_TMP);
|
|
|
|
break;
|
|
|
|
case IPV6_PREFER_SRC_TMP:
|
|
|
|
pref |= IPV6_PREFER_SRC_TMP;
|
|
|
|
prefmask &= ~(IPV6_PREFER_SRC_PUBLIC |
|
|
|
|
IPV6_PREFER_SRC_TMP);
|
|
|
|
break;
|
|
|
|
case IPV6_PREFER_SRC_PUBTMP_DEFAULT:
|
|
|
|
prefmask &= ~(IPV6_PREFER_SRC_PUBLIC |
|
|
|
|
IPV6_PREFER_SRC_TMP);
|
|
|
|
break;
|
|
|
|
case 0:
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* check HOME/COA conflicts */
|
|
|
|
switch (val & (IPV6_PREFER_SRC_HOME | IPV6_PREFER_SRC_COA)) {
|
|
|
|
case IPV6_PREFER_SRC_HOME:
|
|
|
|
prefmask &= ~IPV6_PREFER_SRC_COA;
|
|
|
|
break;
|
|
|
|
case IPV6_PREFER_SRC_COA:
|
|
|
|
pref |= IPV6_PREFER_SRC_COA;
|
|
|
|
break;
|
|
|
|
case 0:
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* check CGA/NONCGA conflicts */
|
|
|
|
switch (val & (IPV6_PREFER_SRC_CGA|IPV6_PREFER_SRC_NONCGA)) {
|
|
|
|
case IPV6_PREFER_SRC_CGA:
|
|
|
|
case IPV6_PREFER_SRC_NONCGA:
|
|
|
|
case 0:
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
inet6_sk(sk)->srcprefs = (inet6_sk(sk)->srcprefs & prefmask) | pref;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * ip6_sock_set_addr_preferences - locked wrapper around
 * __ip6_sock_set_addr_preferences().
 *
 * @sk:  socket whose source-address preferences are updated
 * @val: mask of IPV6_PREFER_SRC_* flags
 *
 * @val must be an int.  Declaring it bool would collapse every non-zero
 * mask to 1 before it reached the helper, so the flags the caller asked
 * for would be replaced by whichever flag happens to have the value 1.
 *
 * Takes the socket lock around the helper call and returns the helper's
 * result: 0 on success, -EINVAL for a conflicting flag combination.
 */
static inline int ip6_sock_set_addr_preferences(struct sock *sk, int val)
{
	int ret;

	lock_sock(sk);
	ret = __ip6_sock_set_addr_preferences(sk, val);
	release_sock(sk);

	return ret;
}
|
|
|
|
|
2020-05-28 13:12:34 +08:00
|
|
|
static inline void ip6_sock_set_recvpktinfo(struct sock *sk)
|
|
|
|
{
|
|
|
|
lock_sock(sk);
|
|
|
|
inet6_sk(sk)->rxopt.bits.rxinfo = true;
|
|
|
|
release_sock(sk);
|
|
|
|
}
|
|
|
|
|
2005-08-16 13:18:02 +08:00
|
|
|
#endif /* _NET_IPV6_H */
|