2019-05-27 14:55:01 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
2005-06-22 03:43:18 +08:00
|
|
|
/*
|
|
|
|
*
|
|
|
|
* Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
|
|
|
|
* & Swedish University of Agricultural Sciences.
|
|
|
|
*
|
2007-02-09 22:24:47 +08:00
|
|
|
* Jens Laas <jens.laas@data.slu.se> Swedish University of
|
2005-06-22 03:43:18 +08:00
|
|
|
* Agricultural Sciences.
|
2007-02-09 22:24:47 +08:00
|
|
|
*
|
2005-06-22 03:43:18 +08:00
|
|
|
* Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
|
|
|
|
*
|
2011-03-31 09:57:33 +08:00
|
|
|
* This work is based on the LPC-trie which is originally described in:
|
2007-02-09 22:24:47 +08:00
|
|
|
*
|
2005-06-22 03:43:18 +08:00
|
|
|
* An experimental study of compression methods for dynamic tries
|
|
|
|
* Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
|
2020-07-07 01:38:50 +08:00
|
|
|
* https://www.csc.kth.se/~snilsson/software/dyntrie2/
|
2005-06-22 03:43:18 +08:00
|
|
|
*
|
|
|
|
* IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
|
|
|
|
* IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
|
|
|
|
*
|
|
|
|
* Code from fib_hash has been reused which includes the following header:
|
|
|
|
*
|
|
|
|
* INET An implementation of the TCP/IP protocol suite for the LINUX
|
|
|
|
* operating system. INET is implemented using the BSD Socket
|
|
|
|
* interface as the means of communication with the user level.
|
|
|
|
*
|
|
|
|
* IPv4 FIB: lookup engine and maintenance routines.
|
|
|
|
*
|
|
|
|
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
|
|
|
|
*
|
2005-12-23 03:25:10 +08:00
|
|
|
 * Substantial contributions to this work come from:
|
|
|
|
*
|
|
|
|
* David S. Miller, <davem@davemloft.net>
|
|
|
|
* Stephen Hemminger <shemminger@osdl.org>
|
|
|
|
* Paul E. McKenney <paulmck@us.ibm.com>
|
|
|
|
* Patrick McHardy <kaber@trash.net>
|
2005-06-22 03:43:18 +08:00
|
|
|
*/
|
2018-02-25 02:20:33 +08:00
|
|
|
#include <linux/cache.h>
|
2016-12-25 03:46:01 +08:00
|
|
|
#include <linux/uaccess.h>
|
2007-10-19 14:40:25 +08:00
|
|
|
#include <linux/bitops.h>
|
2005-06-22 03:43:18 +08:00
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/string.h>
|
|
|
|
#include <linux/socket.h>
|
|
|
|
#include <linux/sockios.h>
|
|
|
|
#include <linux/errno.h>
|
|
|
|
#include <linux/in.h>
|
|
|
|
#include <linux/inet.h>
|
2006-01-04 06:38:34 +08:00
|
|
|
#include <linux/inetdevice.h>
|
2005-06-22 03:43:18 +08:00
|
|
|
#include <linux/netdevice.h>
|
|
|
|
#include <linux/if_arp.h>
|
|
|
|
#include <linux/proc_fs.h>
|
2005-08-26 04:01:29 +08:00
|
|
|
#include <linux/rcupdate.h>
|
2005-06-22 03:43:18 +08:00
|
|
|
#include <linux/skbuff.h>
|
|
|
|
#include <linux/netlink.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/list.h>
|
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the following.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and try to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
wildly available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build test were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 16:04:11 +08:00
|
|
|
#include <linux/slab.h>
|
2011-07-15 23:47:34 +08:00
|
|
|
#include <linux/export.h>
|
2015-05-27 12:19:03 +08:00
|
|
|
#include <linux/vmalloc.h>
|
2016-09-26 18:52:29 +08:00
|
|
|
#include <linux/notifier.h>
|
2007-09-12 18:01:34 +08:00
|
|
|
#include <net/net_namespace.h>
|
2005-06-22 03:43:18 +08:00
|
|
|
#include <net/ip.h>
|
|
|
|
#include <net/protocol.h>
|
|
|
|
#include <net/route.h>
|
|
|
|
#include <net/tcp.h>
|
|
|
|
#include <net/sock.h>
|
|
|
|
#include <net/ip_fib.h>
|
2017-08-03 19:28:11 +08:00
|
|
|
#include <net/fib_notifier.h>
|
2015-08-28 23:42:09 +08:00
|
|
|
#include <trace/events/fib.h>
|
2005-06-22 03:43:18 +08:00
|
|
|
#include "fib_lookup.h"
|
|
|
|
|
2019-10-03 17:49:27 +08:00
|
|
|
/* Report one FIB entry event to a single notifier block.
 *
 * The prefix (@dst/@dst_len) and the relevant fields of the alias @fa are
 * packed into a fib_entry_notifier_info; its embedded generic info (which
 * carries @extack) is what gets handed to call_fib4_notifier().
 *
 * Returns whatever call_fib4_notifier() returns.
 */
static int call_fib_entry_notifier(struct notifier_block *nb,
				   enum fib_event_type event_type, u32 dst,
				   int dst_len, struct fib_alias *fa,
				   struct netlink_ext_ack *extack)
{
	struct fib_entry_notifier_info entry_info = {
		.info.extack	= extack,
		.dst		= dst,
		.dst_len	= dst_len,
		.fi		= fa->fa_info,
		.tos		= fa->fa_tos,
		.type		= fa->fa_type,
		.tb_id		= fa->tb_id,
	};
	int ret;

	ret = call_fib4_notifier(nb, event_type, &entry_info.info);

	return ret;
}
|
|
|
|
|
2016-09-26 18:52:29 +08:00
|
|
|
/* Report one FIB entry event through call_fib4_notifiers() for @net.
 *
 * Builds the same fib_entry_notifier_info as call_fib_entry_notifier():
 * prefix (@dst/@dst_len), the alias fields of @fa and @extack.
 *
 * Returns whatever call_fib4_notifiers() returns.
 */
static int call_fib_entry_notifiers(struct net *net,
				    enum fib_event_type event_type, u32 dst,
				    int dst_len, struct fib_alias *fa,
				    struct netlink_ext_ack *extack)
{
	struct fib_entry_notifier_info entry_info = {
		.info.extack	= extack,
		.dst		= dst,
		.dst_len	= dst_len,
		.fi		= fa->fa_info,
		.tos		= fa->fa_tos,
		.type		= fa->fa_type,
		.tb_id		= fa->tb_id,
	};
	int ret;

	ret = call_fib4_notifiers(net, event_type, &entry_info.info);

	return ret;
}
|
|
|
|
|
2006-03-21 13:35:01 +08:00
|
|
|
/* Number of slots in trie_stat.nodesizes[] */
#define MAX_STAT_DEPTH	32

/* Width of a key in bits, and the all-ones key value */
#define KEYLENGTH	(8 * sizeof(t_key))
#define KEY_MAX		((t_key)~0)

typedef unsigned int t_key;

/* Node classification, derived purely from the pos/bits fields:
 *  IS_TRIE  - pos is at or beyond KEYLENGTH
 *  IS_TNODE - bits is non-zero (internal node with a child array)
 *  IS_LEAF  - bits is zero
 */
#define IS_TRIE(n)	((n)->pos >= KEYLENGTH)
#define IS_TNODE(n)	((n)->bits)
#define IS_LEAF(n)	(!(n)->bits)
|
2005-08-26 04:01:29 +08:00
|
|
|
|
2015-03-07 01:54:08 +08:00
|
|
|
struct key_vector {
|
2015-01-01 02:55:35 +08:00
|
|
|
t_key key;
|
|
|
|
unsigned char pos; /* 2log(KEYLENGTH) bits needed */
|
2015-03-05 07:02:33 +08:00
|
|
|
unsigned char bits; /* 2log(KEYLENGTH) bits needed */
|
2015-01-01 02:57:08 +08:00
|
|
|
unsigned char slen;
|
2015-01-01 02:55:47 +08:00
|
|
|
union {
|
2015-03-05 07:02:33 +08:00
|
|
|
/* This list pointer if valid if (pos | bits) == 0 (LEAF) */
|
2015-02-26 07:31:51 +08:00
|
|
|
struct hlist_head leaf;
|
2015-03-05 07:02:33 +08:00
|
|
|
/* This array is valid if (pos | bits) > 0 (TNODE) */
|
2015-03-07 01:54:08 +08:00
|
|
|
struct key_vector __rcu *tnode[0];
|
2015-01-01 02:55:47 +08:00
|
|
|
};
|
2005-06-22 03:43:18 +08:00
|
|
|
};
|
|
|
|
|
2015-03-07 01:54:27 +08:00
|
|
|
struct tnode {
|
2015-03-07 01:54:33 +08:00
|
|
|
struct rcu_head rcu;
|
2015-03-07 01:54:39 +08:00
|
|
|
t_key empty_children; /* KEYLENGTH bits needed */
|
|
|
|
t_key full_children; /* KEYLENGTH bits needed */
|
2015-03-07 01:54:46 +08:00
|
|
|
struct key_vector __rcu *parent;
|
2015-03-07 01:54:27 +08:00
|
|
|
struct key_vector kv[1];
|
2015-03-07 01:54:33 +08:00
|
|
|
#define tn_bits kv[0].bits
|
2015-03-07 01:54:27 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
/* Bytes needed for a struct tnode whose child array holds n pointers */
#define TNODE_SIZE(n)	offsetof(struct tnode, kv[0].tnode[n])
/* A leaf is sized like a tnode with a single child slot */
#define LEAF_SIZE	TNODE_SIZE(1)
|
|
|
|
|
2005-06-22 03:43:18 +08:00
|
|
|
#ifdef CONFIG_IP_FIB_TRIE_STATS
/* Usage counters, only built when CONFIG_IP_FIB_TRIE_STATS is set;
 * struct trie keeps a per-cpu instance of these.
 */
struct trie_use_stats {
	unsigned int gets;
	unsigned int backtrack;
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;
	unsigned int resize_node_skipped;
};
#endif
|
|
|
|
|
|
|
|
struct trie_stat {
|
|
|
|
unsigned int totdepth;
|
|
|
|
unsigned int maxdepth;
|
|
|
|
unsigned int tnodes;
|
|
|
|
unsigned int leaves;
|
|
|
|
unsigned int nullpointers;
|
2008-01-23 13:54:05 +08:00
|
|
|
unsigned int prefixes;
|
2006-03-21 13:35:01 +08:00
|
|
|
unsigned int nodesizes[MAX_STAT_DEPTH];
|
2005-07-20 05:01:51 +08:00
|
|
|
};
|
2005-06-22 03:43:18 +08:00
|
|
|
|
|
|
|
struct trie {
|
2015-03-07 01:54:52 +08:00
|
|
|
struct key_vector kv[1];
|
2005-06-22 03:43:18 +08:00
|
|
|
#ifdef CONFIG_IP_FIB_TRIE_STATS
|
2015-01-01 02:55:29 +08:00
|
|
|
struct trie_use_stats __percpu *stats;
|
2005-06-22 03:43:18 +08:00
|
|
|
#endif
|
|
|
|
};
|
|
|
|
|
2015-03-07 01:54:52 +08:00
|
|
|
/* defined later in the file; replace() needs it before that point */
static struct key_vector *resize(struct trie *t, struct key_vector *tn);

/* Bytes of node memory queued through tnode_free() since the counter was
 * last reset; tnode_free() zeroes it when it calls synchronize_rcu().
 */
static unsigned int tnode_free_size;
|
2009-07-14 16:33:08 +08:00
|
|
|
|
|
|
|
/*
 * Amount of memory queued for RCU freeing after which tnode_free() calls
 * synchronize_rcu(); this should be especially useful before resizing the
 * root node with PREEMPT_NONE configs. The default value was obtained
 * experimentally, aiming to avoid visible slowdown.
 *
 * NOTE(review): the _min/_max values are presumably the bounds applied by
 * the sysctl handler, which is not part of this file section - confirm.
 */
unsigned int sysctl_fib_sync_mem = 512 * 1024;
unsigned int sysctl_fib_sync_mem_min = 64 * 1024;
unsigned int sysctl_fib_sync_mem_max = 64 * 1024 * 1024;
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2018-02-25 02:20:33 +08:00
|
|
|
/* slab cache that fib_alias objects are returned to (__alias_free_mem) */
static struct kmem_cache *fn_alias_kmem __ro_after_init;
/* slab cache that leaves are allocated from (leaf_new) */
static struct kmem_cache *trie_leaf_kmem __ro_after_init;
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2015-03-07 01:54:33 +08:00
|
|
|
static inline struct tnode *tn_info(struct key_vector *kv)
|
|
|
|
{
|
|
|
|
return container_of(kv, struct tnode, kv[0]);
|
|
|
|
}
|
|
|
|
|
2015-01-01 02:55:35 +08:00
|
|
|
/* Parent / i-th child of a node - caller must hold RTNL */
#define node_parent(tn)		rtnl_dereference(tn_info(tn)->parent)
#define get_child(tn, i)	rtnl_dereference((tn)->tnode[i])

/* Same accessors - caller must hold RCU read lock or RTNL */
#define node_parent_rcu(tn)	rcu_dereference_rtnl(tn_info(tn)->parent)
#define get_child_rcu(tn, i)	rcu_dereference_rtnl((tn)->tnode[i])
|
2011-03-31 16:51:35 +08:00
|
|
|
|
2015-01-01 02:55:35 +08:00
|
|
|
/* wrapper for rcu_assign_pointer */
|
2015-03-07 01:54:08 +08:00
|
|
|
static inline void node_set_parent(struct key_vector *n, struct key_vector *tp)
|
2008-01-18 19:31:36 +08:00
|
|
|
{
|
2015-01-01 02:55:47 +08:00
|
|
|
if (n)
|
2015-03-07 01:54:46 +08:00
|
|
|
rcu_assign_pointer(tn_info(n)->parent, tp);
|
2007-08-11 06:22:13 +08:00
|
|
|
}
|
|
|
|
|
2015-03-07 01:54:46 +08:00
|
|
|
/* Set the parent pointer of n to p using RCU_INIT_POINTER() */
#define NODE_INIT_PARENT(n, p)	RCU_INIT_POINTER(tn_info(n)->parent, p)
|
2015-01-01 02:55:35 +08:00
|
|
|
|
|
|
|
/* This provides us with the number of children in this node, in the case of a
|
|
|
|
* leaf this will return 0 meaning none of the children are accessible.
|
2008-03-23 08:59:58 +08:00
|
|
|
*/
|
2015-03-07 01:54:21 +08:00
|
|
|
static inline unsigned long child_length(const struct key_vector *tn)
|
2007-08-11 06:22:13 +08:00
|
|
|
{
|
2015-01-01 02:55:35 +08:00
|
|
|
return (1ul << tn->bits) & ~(1ul);
|
2007-08-11 06:22:13 +08:00
|
|
|
}
|
2005-08-26 04:01:29 +08:00
|
|
|
|
2015-03-07 01:54:52 +08:00
|
|
|
/* Child index of key below kv: the bits in which key differs from kv->key,
 * shifted down by kv->pos. Unlike get_index() there is no special case for
 * pos == KEYLENGTH, and kv is evaluated twice.
 */
#define get_cindex(key, kv)	(((key) ^ (kv)->key) >> (kv)->pos)
|
|
|
|
|
2015-03-07 01:54:21 +08:00
|
|
|
/* Child index of @key below @kv: (key ^ kv->key) >> kv->pos, evaluated in
 * unsigned long.
 */
static inline unsigned long get_index(t_key key, struct key_vector *kv)
{
	unsigned long diff = key ^ kv->key;

	/* Shifting by the full width of a long is undefined, so when pos is
	 * KEYLENGTH and a long is no wider than that, answer 0 directly.
	 */
	if ((KEYLENGTH >= BITS_PER_LONG) && (kv->pos == KEYLENGTH))
		return 0;

	return diff >> kv->pos;
}
|
|
|
|
|
2015-01-01 02:56:12 +08:00
|
|
|
/* To understand this stuff, an understanding of keys and all their bits is
|
|
|
|
* necessary. Every node in the trie has a key associated with it, but not
|
|
|
|
* all of the bits in that key are significant.
|
|
|
|
*
|
|
|
|
* Consider a node 'n' and its parent 'tp'.
|
|
|
|
*
|
|
|
|
* If n is a leaf, every bit in its key is significant. Its presence is
|
|
|
|
* necessitated by path compression, since during a tree traversal (when
|
|
|
|
* searching for a leaf - unless we are doing an insertion) we will completely
|
|
|
|
* ignore all skipped bits we encounter. Thus we need to verify, at the end of
|
|
|
|
* a potentially successful search, that we have indeed been walking the
|
|
|
|
* correct key path.
|
|
|
|
*
|
|
|
|
* Note that we can never "miss" the correct key in the tree if present by
|
|
|
|
* following the wrong path. Path compression ensures that segments of the key
|
|
|
|
* that are the same for all keys with a given prefix are skipped, but the
|
|
|
|
* skipped part *is* identical for each node in the subtrie below the skipped
|
|
|
|
* bit! trie_insert() in this implementation takes care of that.
|
|
|
|
*
|
|
|
|
 * If n is an internal node - a 'tnode' here, the various parts of its key
|
|
|
|
* have many different meanings.
|
|
|
|
*
|
|
|
|
* Example:
|
|
|
|
* _________________________________________________________________
|
|
|
|
* | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
|
|
|
|
* -----------------------------------------------------------------
|
|
|
|
* 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16
|
|
|
|
*
|
|
|
|
* _________________________________________________________________
|
|
|
|
* | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
|
|
|
|
* -----------------------------------------------------------------
|
|
|
|
* 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
|
|
|
|
*
|
|
|
|
* tp->pos = 22
|
|
|
|
* tp->bits = 3
|
|
|
|
* n->pos = 13
|
|
|
|
* n->bits = 4
|
|
|
|
*
|
|
|
|
* First, let's just ignore the bits that come before the parent tp, that is
|
|
|
|
* the bits from (tp->pos + tp->bits) to 31. They are *known* but at this
|
|
|
|
* point we do not use them for anything.
|
|
|
|
*
|
|
|
|
* The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
|
|
|
|
* index into the parent's child array. That is, they will be used to find
|
|
|
|
* 'n' among tp's children.
|
|
|
|
*
|
2016-08-18 12:33:28 +08:00
|
|
|
* The bits from (n->pos + n->bits) to (tp->pos - 1) - "S" - are skipped bits
|
2015-01-01 02:56:12 +08:00
|
|
|
* for the node n.
|
|
|
|
*
|
|
|
|
* All the bits we have seen so far are significant to the node n. The rest
|
|
|
|
* of the bits are really not needed or indeed known in n->key.
|
|
|
|
*
|
|
|
|
* The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
|
|
|
|
* n's child array, and will of course be different for each child.
|
|
|
|
*
|
2016-08-18 12:33:28 +08:00
|
|
|
* The rest of the bits, from 0 to (n->pos -1) - "u" - are completely unknown
|
2015-01-01 02:56:12 +08:00
|
|
|
* at this point.
|
|
|
|
*/
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2007-12-14 01:47:57 +08:00
|
|
|
/* Resize thresholds for ordinary (non-root) tnodes.
 * NOTE(review): presumably percentages consumed by the resize logic, which
 * is outside this section of the file - confirm.
 */
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
|
ipv4: Fix fib_trie rebalancing, part 4 (root thresholds)
Pawel Staszewski wrote:
<blockquote>
Some time ago i report this:
http://bugzilla.kernel.org/show_bug.cgi?id=6648
and now with 2.6.29 / 2.6.29.1 / 2.6.29.3 and 2.6.30 it back
dmesg output:
oprofile: using NMI interrupt.
Fix inflate_threshold_root. Now=15 size=11 bits
...
Fix inflate_threshold_root. Now=15 size=11 bits
cat /proc/net/fib_triestat
Basic info: size of leaf: 40 bytes, size of tnode: 56 bytes.
Main:
Aver depth: 2.28
Max depth: 6
Leaves: 276539
Prefixes: 289922
Internal nodes: 66762
1: 35046 2: 13824 3: 9508 4: 4897 5: 2331 6: 1149 7: 5
9: 1 18: 1
Pointers: 691228
Null ptrs: 347928
Total size: 35709 kB
</blockquote>
It seems, the current threshold for root resizing is too aggressive,
and it causes misleading warnings during big updates, but it might be
also responsible for memory problems, especially with non-preempt
configs, when RCU freeing is delayed long after call_rcu.
It should be also mentioned that because of non-atomic changes during
resizing/rebalancing the current lookup algorithm can miss valid leaves
so it's additional argument to shorten these activities even at a cost
of a minimally longer searching.
This patch restores values before the patch "[IPV4]: fib_trie root
node settings", commit: 965ffea43d4ebe8cd7b9fee78d651268dd7d23c5 from
v2.6.22.
Pawel's report:
<blockquote>
I dont see any big change of (cpu load or faster/slower
routing/propagating routes from bgpd or something else) - in avg there
is from 2% to 3% more of CPU load i dont know why but it is - i change
from "preempt" to "no preempt" 3 times and check this my "mpstat -P ALL
1 30"
always avg cpu load was from 2 to 3% more compared to "no preempt"
[...]
cat /proc/net/fib_triestat
Basic info: size of leaf: 20 bytes, size of tnode: 36 bytes.
Main:
Aver depth: 2.44
Max depth: 6
Leaves: 277814
Prefixes: 291306
Internal nodes: 66420
1: 32737 2: 14850 3: 10332 4: 4871 5: 2313 6: 942 7: 371 8: 3 17: 1
Pointers: 599098
Null ptrs: 254865
Total size: 18067 kB
</blockquote>
According to this and other similar reports average depth is slightly
increased (~0.2), and root nodes are shorter (log 17 vs. 18), but
there is no visible performance decrease. So, until memory handling is
improved or added parameters for changing this individually, this
patch resets to safer defaults.
Reported-by: Pawel Staszewski <pstaszewski@itcare.pl>
Reported-by: Jorge Boncompte [DTI2] <jorge@dti2.net>
Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
Tested-by: Pawel Staszewski <pstaszewski@itcare.pl>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-07-08 10:39:16 +08:00
|
|
|
/* Resize thresholds for the root node; both are lower than the values used
 * for ordinary tnodes (25 / 50).
 */
static const int halve_threshold_root = 15;
static const int inflate_threshold_root = 30;
|
2005-08-26 04:01:29 +08:00
|
|
|
|
|
|
|
static void __alias_free_mem(struct rcu_head *head)
|
2005-06-22 03:43:18 +08:00
|
|
|
{
|
2005-08-26 04:01:29 +08:00
|
|
|
struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
|
|
|
|
kmem_cache_free(fn_alias_kmem, fa);
|
2005-06-22 03:43:18 +08:00
|
|
|
}
|
|
|
|
|
2005-08-26 04:01:29 +08:00
|
|
|
static inline void alias_free_mem_rcu(struct fib_alias *fa)
|
2005-06-22 03:43:18 +08:00
|
|
|
{
|
2005-08-26 04:01:29 +08:00
|
|
|
call_rcu(&fa->rcu, __alias_free_mem);
|
|
|
|
}
|
2005-08-10 11:24:39 +08:00
|
|
|
|
2015-03-05 07:04:46 +08:00
|
|
|
/* Largest 'bits' value for which the size of a tnode with (1 << bits)
 * child pointers is still representable in a size_t.
 */
#define TNODE_VMALLOC_MAX \
	ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *))
|
2005-08-10 11:24:39 +08:00
|
|
|
|
2015-01-01 02:55:41 +08:00
|
|
|
static void __node_free_rcu(struct rcu_head *head)
|
2008-04-10 18:47:34 +08:00
|
|
|
{
|
2015-03-07 01:54:33 +08:00
|
|
|
struct tnode *n = container_of(head, struct tnode, rcu);
|
2015-01-01 02:55:41 +08:00
|
|
|
|
2015-03-07 01:54:33 +08:00
|
|
|
if (!n->tn_bits)
|
2015-01-01 02:55:41 +08:00
|
|
|
kmem_cache_free(trie_leaf_kmem, n);
|
|
|
|
else
|
2016-01-23 07:11:02 +08:00
|
|
|
kvfree(n);
|
2008-04-10 18:47:34 +08:00
|
|
|
}
|
|
|
|
|
2015-03-07 01:54:33 +08:00
|
|
|
/* Queue node n for freeing via call_rcu(); __node_free_rcu() releases it */
#define node_free(n)	call_rcu(&tn_info(n)->rcu, __node_free_rcu)
|
2015-01-01 02:55:41 +08:00
|
|
|
|
2015-03-07 01:54:27 +08:00
|
|
|
static struct tnode *tnode_alloc(int bits)
|
2005-07-06 05:44:55 +08:00
|
|
|
{
|
2015-03-05 07:04:46 +08:00
|
|
|
size_t size;
|
|
|
|
|
|
|
|
/* verify bits is within bounds */
|
|
|
|
if (bits > TNODE_VMALLOC_MAX)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
/* determine size and verify it is non-zero and didn't overflow */
|
|
|
|
size = TNODE_SIZE(1ul << bits);
|
|
|
|
|
2005-08-26 04:01:29 +08:00
|
|
|
if (size <= PAGE_SIZE)
|
2008-01-14 14:31:44 +08:00
|
|
|
return kzalloc(size, GFP_KERNEL);
|
2008-04-10 17:56:38 +08:00
|
|
|
else
|
2010-11-20 15:46:35 +08:00
|
|
|
return vzalloc(size);
|
2008-04-10 17:56:38 +08:00
|
|
|
}
|
2005-08-26 04:01:29 +08:00
|
|
|
|
2015-03-07 01:54:08 +08:00
|
|
|
static inline void empty_child_inc(struct key_vector *n)
|
2015-01-23 07:51:26 +08:00
|
|
|
{
|
2019-06-19 05:14:40 +08:00
|
|
|
tn_info(n)->empty_children++;
|
|
|
|
|
|
|
|
if (!tn_info(n)->empty_children)
|
|
|
|
tn_info(n)->full_children++;
|
2015-01-23 07:51:26 +08:00
|
|
|
}
|
|
|
|
|
2015-03-07 01:54:08 +08:00
|
|
|
static inline void empty_child_dec(struct key_vector *n)
|
2015-01-23 07:51:26 +08:00
|
|
|
{
|
2019-06-19 05:14:40 +08:00
|
|
|
if (!tn_info(n)->empty_children)
|
|
|
|
tn_info(n)->full_children--;
|
|
|
|
|
|
|
|
tn_info(n)->empty_children--;
|
2015-01-23 07:51:26 +08:00
|
|
|
}
|
|
|
|
|
2015-03-07 01:54:08 +08:00
|
|
|
/* Allocate a leaf for @key from trie_leaf_kmem and attach @fa to it.
 *
 * The leaf starts with pos == bits == 0, slen copied from @fa->fa_slen and
 * @fa as the only entry on its alias list.
 *
 * Returns the leaf's key_vector, or NULL if the allocation fails.
 */
static struct key_vector *leaf_new(t_key key, struct fib_alias *fa)
{
	struct tnode *node = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
	struct key_vector *leaf;

	if (!node)
		return NULL;

	/* fill in the embedded key vector */
	leaf = node->kv;
	leaf->key = key;
	leaf->pos = 0;
	leaf->bits = 0;
	leaf->slen = fa->fa_slen;

	/* start the alias list with fa */
	INIT_HLIST_HEAD(&leaf->leaf);
	hlist_add_head(&fa->fa_list, &leaf->leaf);

	return leaf;
}
|
|
|
|
|
2015-03-07 01:54:08 +08:00
|
|
|
/* Allocate and initialise an internal node with (1 << bits) empty children.
 *
 * @key:  key the node is derived from; bits below (pos + bits) are cleared
 * @pos:  position of the node's index bits; also the initial slen
 * @bits: log2 of the child array size, must be non-zero
 *
 * BUGs if @bits is zero or (pos + bits) exceeds KEYLENGTH. Returns the
 * node's key_vector, or NULL if the allocation fails.
 */
static struct key_vector *tnode_new(t_key key, int pos, int bits)
{
	unsigned int shift = pos + bits;
	struct tnode *node;
	struct key_vector *tn;

	/* bits must be non-zero and pos + bits must fit inside the key */
	BUG_ON(!bits || (shift > KEYLENGTH));

	node = tnode_alloc(bits);
	if (!node)
		return NULL;

	pr_debug("AT %p s=%zu %zu\n", node, TNODE_SIZE(0),
		 sizeof(struct key_vector *) << bits);

	/* All children start out empty. With bits == KEYLENGTH that count
	 * is not stored in empty_children; full_children is set to 1
	 * instead.
	 */
	if (bits != KEYLENGTH)
		node->empty_children = 1ul << bits;
	else
		node->full_children = 1;

	tn = node->kv;
	if (shift < KEYLENGTH)
		tn->key = (key >> shift) << shift;
	else
		tn->key = 0;
	tn->pos = pos;
	tn->bits = bits;
	tn->slen = pos;

	return tn;
}
|
|
|
|
|
2015-01-01 02:56:12 +08:00
|
|
|
/* Check whether a tnode 'n' is "full", i.e. it is an internal node
|
2005-06-22 03:43:18 +08:00
|
|
|
* and no bits are skipped. See discussion in dyntree paper p. 6
|
|
|
|
*/
|
2015-03-07 01:54:08 +08:00
|
|
|
static inline int tnode_full(struct key_vector *tn, struct key_vector *n)
|
2005-06-22 03:43:18 +08:00
|
|
|
{
|
2015-01-01 02:56:12 +08:00
|
|
|
return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
|
2005-06-22 03:43:18 +08:00
|
|
|
}
|
|
|
|
|
2015-01-01 02:56:43 +08:00
|
|
|
/* Add a child at position i overwriting the old value.
|
|
|
|
* Update the value of full_children and empty_children.
|
|
|
|
*/
|
2015-03-07 01:54:08 +08:00
|
|
|
static void put_child(struct key_vector *tn, unsigned long i,
|
|
|
|
struct key_vector *n)
|
2005-06-22 03:43:18 +08:00
|
|
|
{
|
2015-03-07 01:54:14 +08:00
|
|
|
struct key_vector *chi = get_child(tn, i);
|
2015-01-01 02:56:43 +08:00
|
|
|
int isfull, wasfull;
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2015-03-07 01:54:21 +08:00
|
|
|
BUG_ON(i >= child_length(tn));
|
2005-08-24 12:59:41 +08:00
|
|
|
|
2015-01-23 07:51:26 +08:00
|
|
|
/* update emptyChildren, overflow into fullChildren */
|
2015-04-03 16:17:27 +08:00
|
|
|
if (!n && chi)
|
2015-01-23 07:51:26 +08:00
|
|
|
empty_child_inc(tn);
|
2015-04-03 16:17:27 +08:00
|
|
|
if (n && !chi)
|
2015-01-23 07:51:26 +08:00
|
|
|
empty_child_dec(tn);
|
2005-07-20 05:01:51 +08:00
|
|
|
|
2005-06-22 03:43:18 +08:00
|
|
|
/* update fullChildren */
|
2015-01-01 02:56:43 +08:00
|
|
|
wasfull = tnode_full(tn, chi);
|
2005-06-22 03:43:18 +08:00
|
|
|
isfull = tnode_full(tn, n);
|
2015-01-01 02:56:43 +08:00
|
|
|
|
2005-07-20 05:01:51 +08:00
|
|
|
if (wasfull && !isfull)
|
2015-03-07 01:54:39 +08:00
|
|
|
tn_info(tn)->full_children--;
|
2005-07-20 05:01:51 +08:00
|
|
|
else if (!wasfull && isfull)
|
2015-03-07 01:54:39 +08:00
|
|
|
tn_info(tn)->full_children++;
|
2005-08-10 11:24:39 +08:00
|
|
|
|
2015-01-01 02:57:08 +08:00
|
|
|
if (n && (tn->slen < n->slen))
|
|
|
|
tn->slen = n->slen;
|
|
|
|
|
2015-03-05 07:02:33 +08:00
|
|
|
rcu_assign_pointer(tn->tnode[i], n);
|
2005-06-22 03:43:18 +08:00
|
|
|
}
|
|
|
|
|
2015-03-07 01:54:08 +08:00
|
|
|
/* Walk the children of @tn from the highest index down and make their
 * parent pointers refer to @tn.
 */
static void update_children(struct key_vector *tn)
{
	unsigned long i = child_length(tn);

	while (i) {
		struct key_vector *child = get_child(tn, --i);

		if (!child)
			continue;

		/* A child that does not point at us yet is simply
		 * re-parented; one that already does is descended into so
		 * that its own children get the same treatment.
		 */
		if (node_parent(child) != tn)
			node_set_parent(child, tn);
		else
			update_children(child);
	}
}
|
|
|
|
|
2015-03-07 01:54:52 +08:00
|
|
|
/* Install @n below @tp. For an ordinary tnode the slot is chosen by
 * get_index(key, tp) and put_child() does the accounting; when IS_TRIE(tp)
 * holds, @n is stored directly in slot 0.
 */
static inline void put_child_root(struct key_vector *tp, t_key key,
				  struct key_vector *n)
{
	if (!IS_TRIE(tp)) {
		put_child(tp, get_index(key, tp), n);
		return;
	}

	rcu_assign_pointer(tp->tnode[0], n);
}
|
|
|
|
|
2015-03-07 01:54:08 +08:00
|
|
|
static inline void tnode_free_init(struct key_vector *tn)
|
2011-03-31 16:51:35 +08:00
|
|
|
{
|
2015-03-07 01:54:33 +08:00
|
|
|
tn_info(tn)->rcu.next = NULL;
|
2015-01-01 02:56:49 +08:00
|
|
|
}
|
|
|
|
|
2015-03-07 01:54:08 +08:00
|
|
|
static inline void tnode_free_append(struct key_vector *tn,
|
|
|
|
struct key_vector *n)
|
2015-01-01 02:56:49 +08:00
|
|
|
{
|
2015-03-07 01:54:33 +08:00
|
|
|
tn_info(n)->rcu.next = tn_info(tn)->rcu.next;
|
|
|
|
tn_info(tn)->rcu.next = &tn_info(n)->rcu;
|
2015-01-01 02:56:49 +08:00
|
|
|
}
|
2011-03-31 16:51:35 +08:00
|
|
|
|
2015-03-07 01:54:08 +08:00
|
|
|
static void tnode_free(struct key_vector *tn)
|
2015-01-01 02:56:49 +08:00
|
|
|
{
|
2015-03-07 01:54:33 +08:00
|
|
|
struct callback_head *head = &tn_info(tn)->rcu;
|
2015-01-01 02:56:49 +08:00
|
|
|
|
|
|
|
while (head) {
|
|
|
|
head = head->next;
|
2015-03-05 07:02:33 +08:00
|
|
|
tnode_free_size += TNODE_SIZE(1ul << tn->bits);
|
2015-01-01 02:56:49 +08:00
|
|
|
node_free(tn);
|
|
|
|
|
2015-03-07 01:54:33 +08:00
|
|
|
tn = container_of(head, struct tnode, rcu)->kv;
|
2015-01-01 02:56:49 +08:00
|
|
|
}
|
|
|
|
|
2022-07-07 07:40:03 +08:00
|
|
|
if (tnode_free_size >= READ_ONCE(sysctl_fib_sync_mem)) {
|
2015-01-01 02:56:49 +08:00
|
|
|
tnode_free_size = 0;
|
|
|
|
synchronize_rcu();
|
2011-03-31 16:51:35 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-03-07 01:54:52 +08:00
|
|
|
static struct key_vector *replace(struct trie *t,
|
|
|
|
struct key_vector *oldtnode,
|
|
|
|
struct key_vector *tn)
|
2015-01-23 07:51:14 +08:00
|
|
|
{
|
2015-03-07 01:54:08 +08:00
|
|
|
struct key_vector *tp = node_parent(oldtnode);
|
2015-01-23 07:51:14 +08:00
|
|
|
unsigned long i;
|
|
|
|
|
|
|
|
/* setup the parent pointer out of and back into this node */
|
|
|
|
NODE_INIT_PARENT(tn, tp);
|
2015-03-07 01:54:52 +08:00
|
|
|
put_child_root(tp, tn->key, tn);
|
2015-01-23 07:51:14 +08:00
|
|
|
|
|
|
|
/* update all of the child parent pointers */
|
|
|
|
update_children(tn);
|
|
|
|
|
|
|
|
/* all pointers should be clean so we are done */
|
|
|
|
tnode_free(oldtnode);
|
|
|
|
|
|
|
|
/* resize children now that oldtnode is freed */
|
2015-03-07 01:54:21 +08:00
|
|
|
for (i = child_length(tn); i;) {
|
2015-03-07 01:54:14 +08:00
|
|
|
struct key_vector *inode = get_child(tn, --i);
|
2015-01-23 07:51:14 +08:00
|
|
|
|
|
|
|
/* resize child node */
|
|
|
|
if (tnode_full(tn, inode))
|
2015-03-07 01:54:52 +08:00
|
|
|
tn = resize(t, inode);
|
2015-01-23 07:51:14 +08:00
|
|
|
}
|
2015-03-07 01:54:02 +08:00
|
|
|
|
2015-03-07 01:54:52 +08:00
|
|
|
return tp;
|
2015-01-23 07:51:14 +08:00
|
|
|
}
|
|
|
|
|
2015-03-07 01:54:52 +08:00
|
|
|
static struct key_vector *inflate(struct trie *t,
|
|
|
|
struct key_vector *oldtnode)
|
2005-06-22 03:43:18 +08:00
|
|
|
{
|
2015-03-07 01:54:08 +08:00
|
|
|
struct key_vector *tn;
|
2015-01-23 07:51:14 +08:00
|
|
|
unsigned long i;
|
2015-01-01 02:56:12 +08:00
|
|
|
t_key m;
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2005-08-24 12:59:41 +08:00
|
|
|
pr_debug("In inflate\n");
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2015-01-01 02:56:12 +08:00
|
|
|
tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
|
2005-08-24 12:59:41 +08:00
|
|
|
if (!tn)
|
2015-03-07 01:54:02 +08:00
|
|
|
goto notnode;
|
2005-07-06 06:02:40 +08:00
|
|
|
|
2015-01-23 07:51:14 +08:00
|
|
|
/* prepare oldtnode to be freed */
|
|
|
|
tnode_free_init(oldtnode);
|
|
|
|
|
2015-01-01 02:56:55 +08:00
|
|
|
/* Assemble all of the pointers in our cluster, in this case that
|
|
|
|
* represents all of the pointers out of our allocated nodes that
|
|
|
|
* point to existing tnodes and the links between our allocated
|
|
|
|
* nodes.
|
2005-07-06 06:02:40 +08:00
|
|
|
*/
|
2015-03-07 01:54:21 +08:00
|
|
|
for (i = child_length(oldtnode), m = 1u << tn->pos; i;) {
|
2015-03-07 01:54:14 +08:00
|
|
|
struct key_vector *inode = get_child(oldtnode, --i);
|
2015-03-07 01:54:08 +08:00
|
|
|
struct key_vector *node0, *node1;
|
2015-01-23 07:51:14 +08:00
|
|
|
unsigned long j, k;
|
2005-07-20 05:01:51 +08:00
|
|
|
|
2005-06-22 03:43:18 +08:00
|
|
|
/* An empty child */
|
2015-04-03 16:17:26 +08:00
|
|
|
if (!inode)
|
2005-06-22 03:43:18 +08:00
|
|
|
continue;
|
|
|
|
|
|
|
|
/* A leaf or an internal node with skipped bits */
|
2015-01-01 02:55:47 +08:00
|
|
|
if (!tnode_full(oldtnode, inode)) {
|
2015-01-01 02:56:12 +08:00
|
|
|
put_child(tn, get_index(inode->key, tn), inode);
|
2005-06-22 03:43:18 +08:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2015-01-23 07:51:14 +08:00
|
|
|
/* drop the node in the old tnode free list */
|
|
|
|
tnode_free_append(oldtnode, inode);
|
|
|
|
|
2005-06-22 03:43:18 +08:00
|
|
|
/* An internal node with two children */
|
|
|
|
if (inode->bits == 1) {
|
2015-03-07 01:54:14 +08:00
|
|
|
put_child(tn, 2 * i + 1, get_child(inode, 1));
|
|
|
|
put_child(tn, 2 * i, get_child(inode, 0));
|
2005-08-10 11:24:39 +08:00
|
|
|
continue;
|
2005-06-22 03:43:18 +08:00
|
|
|
}
|
|
|
|
|
2005-08-10 11:24:39 +08:00
|
|
|
/* We will replace this node 'inode' with two new
|
2015-01-01 02:56:55 +08:00
|
|
|
* ones, 'node0' and 'node1', each with half of the
|
2005-08-10 11:24:39 +08:00
|
|
|
* original children. The two new nodes will have
|
|
|
|
* a position one bit further down the key and this
|
|
|
|
* means that the "significant" part of their keys
|
|
|
|
* (see the discussion near the top of this file)
|
|
|
|
* will differ by one bit, which will be "0" in
|
2015-01-01 02:56:55 +08:00
|
|
|
* node0's key and "1" in node1's key. Since we are
|
2005-08-10 11:24:39 +08:00
|
|
|
* moving the key position by one step, the bit that
|
|
|
|
* we are moving away from - the bit at position
|
2015-01-01 02:56:55 +08:00
|
|
|
* (tn->pos) - is the one that will differ between
|
|
|
|
* node0 and node1. So... we synthesize that bit in the
|
|
|
|
* two new keys.
|
2005-08-10 11:24:39 +08:00
|
|
|
*/
|
2015-01-01 02:56:55 +08:00
|
|
|
node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1);
|
|
|
|
if (!node1)
|
|
|
|
goto nomem;
|
2015-01-23 07:51:14 +08:00
|
|
|
node0 = tnode_new(inode->key, inode->pos, inode->bits - 1);
|
2015-01-01 02:56:55 +08:00
|
|
|
|
2015-01-23 07:51:14 +08:00
|
|
|
tnode_free_append(tn, node1);
|
2015-01-01 02:56:55 +08:00
|
|
|
if (!node0)
|
|
|
|
goto nomem;
|
|
|
|
tnode_free_append(tn, node0);
|
|
|
|
|
|
|
|
/* populate child pointers in new nodes */
|
2015-03-07 01:54:21 +08:00
|
|
|
for (k = child_length(inode), j = k / 2; j;) {
|
2015-03-07 01:54:14 +08:00
|
|
|
put_child(node1, --j, get_child(inode, --k));
|
|
|
|
put_child(node0, j, get_child(inode, j));
|
|
|
|
put_child(node1, --j, get_child(inode, --k));
|
|
|
|
put_child(node0, j, get_child(inode, j));
|
2015-01-01 02:56:55 +08:00
|
|
|
}
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2015-01-01 02:56:55 +08:00
|
|
|
/* link new nodes to parent */
|
|
|
|
NODE_INIT_PARENT(node1, tn);
|
|
|
|
NODE_INIT_PARENT(node0, tn);
|
2005-07-06 06:02:40 +08:00
|
|
|
|
2015-01-01 02:56:55 +08:00
|
|
|
/* link parent to nodes */
|
|
|
|
put_child(tn, 2 * i + 1, node1);
|
|
|
|
put_child(tn, 2 * i, node0);
|
|
|
|
}
|
2005-07-06 06:02:40 +08:00
|
|
|
|
2015-01-23 07:51:14 +08:00
|
|
|
/* setup the parent pointers into and out of this node */
|
2015-03-07 01:54:02 +08:00
|
|
|
return replace(t, oldtnode, tn);
|
2005-08-10 11:25:06 +08:00
|
|
|
nomem:
|
2015-01-01 02:56:49 +08:00
|
|
|
/* all pointers should be clean so we are done */
|
|
|
|
tnode_free(tn);
|
2015-03-07 01:54:02 +08:00
|
|
|
notnode:
|
|
|
|
return NULL;
|
2005-06-22 03:43:18 +08:00
|
|
|
}
|
|
|
|
|
2015-03-07 01:54:52 +08:00
|
|
|
static struct key_vector *halve(struct trie *t,
|
|
|
|
struct key_vector *oldtnode)
|
2005-06-22 03:43:18 +08:00
|
|
|
{
|
2015-03-07 01:54:08 +08:00
|
|
|
struct key_vector *tn;
|
2015-01-01 02:56:55 +08:00
|
|
|
unsigned long i;
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2005-08-24 12:59:41 +08:00
|
|
|
pr_debug("In halve\n");
|
2005-07-20 05:01:51 +08:00
|
|
|
|
2015-01-01 02:56:12 +08:00
|
|
|
tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
|
2005-08-10 11:25:06 +08:00
|
|
|
if (!tn)
|
2015-03-07 01:54:02 +08:00
|
|
|
goto notnode;
|
2005-07-06 06:02:40 +08:00
|
|
|
|
2015-01-23 07:51:14 +08:00
|
|
|
/* prepare oldtnode to be freed */
|
|
|
|
tnode_free_init(oldtnode);
|
|
|
|
|
2015-01-01 02:56:55 +08:00
|
|
|
/* Assemble all of the pointers in our cluster, in this case that
|
|
|
|
* represents all of the pointers out of our allocated nodes that
|
|
|
|
* point to existing tnodes and the links between our allocated
|
|
|
|
* nodes.
|
2005-07-06 06:02:40 +08:00
|
|
|
*/
|
2015-03-07 01:54:21 +08:00
|
|
|
for (i = child_length(oldtnode); i;) {
|
2015-03-07 01:54:14 +08:00
|
|
|
struct key_vector *node1 = get_child(oldtnode, --i);
|
|
|
|
struct key_vector *node0 = get_child(oldtnode, --i);
|
2015-03-07 01:54:08 +08:00
|
|
|
struct key_vector *inode;
|
2005-07-06 06:02:40 +08:00
|
|
|
|
2015-01-01 02:56:55 +08:00
|
|
|
/* At least one of the children is empty */
|
|
|
|
if (!node1 || !node0) {
|
|
|
|
put_child(tn, i / 2, node1 ? : node0);
|
|
|
|
continue;
|
|
|
|
}
|
2005-07-20 05:01:51 +08:00
|
|
|
|
2005-07-06 06:02:40 +08:00
|
|
|
/* Two nonempty children */
|
2015-01-01 02:56:55 +08:00
|
|
|
inode = tnode_new(node0->key, oldtnode->pos, 1);
|
2015-03-07 01:54:02 +08:00
|
|
|
if (!inode)
|
|
|
|
goto nomem;
|
2015-01-01 02:56:55 +08:00
|
|
|
tnode_free_append(tn, inode);
|
2005-07-06 06:02:40 +08:00
|
|
|
|
2015-01-01 02:56:55 +08:00
|
|
|
/* initialize pointers out of node */
|
|
|
|
put_child(inode, 1, node1);
|
|
|
|
put_child(inode, 0, node0);
|
|
|
|
NODE_INIT_PARENT(inode, tn);
|
|
|
|
|
|
|
|
/* link parent to node */
|
|
|
|
put_child(tn, i / 2, inode);
|
2005-07-06 06:02:40 +08:00
|
|
|
}
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2015-01-23 07:51:14 +08:00
|
|
|
/* setup the parent pointers into and out of this node */
|
2015-03-07 01:54:02 +08:00
|
|
|
return replace(t, oldtnode, tn);
|
|
|
|
nomem:
|
|
|
|
/* all pointers should be clean so we are done */
|
|
|
|
tnode_free(tn);
|
|
|
|
notnode:
|
|
|
|
return NULL;
|
2005-06-22 03:43:18 +08:00
|
|
|
}
|
|
|
|
|
2015-03-07 01:54:52 +08:00
|
|
|
static struct key_vector *collapse(struct trie *t,
|
|
|
|
struct key_vector *oldtnode)
|
2015-01-23 07:51:26 +08:00
|
|
|
{
|
2015-03-07 01:54:08 +08:00
|
|
|
struct key_vector *n, *tp;
|
2015-01-23 07:51:26 +08:00
|
|
|
unsigned long i;
|
|
|
|
|
|
|
|
/* scan the tnode looking for that one child that might still exist */
|
2015-03-07 01:54:21 +08:00
|
|
|
for (n = NULL, i = child_length(oldtnode); !n && i;)
|
2015-03-07 01:54:14 +08:00
|
|
|
n = get_child(oldtnode, --i);
|
2015-01-23 07:51:26 +08:00
|
|
|
|
|
|
|
/* compress one level */
|
|
|
|
tp = node_parent(oldtnode);
|
2015-03-07 01:54:52 +08:00
|
|
|
put_child_root(tp, oldtnode->key, n);
|
2015-01-23 07:51:26 +08:00
|
|
|
node_set_parent(n, tp);
|
|
|
|
|
|
|
|
/* drop dead node */
|
|
|
|
node_free(oldtnode);
|
2015-03-07 01:54:52 +08:00
|
|
|
|
|
|
|
return tp;
|
2015-01-23 07:51:26 +08:00
|
|
|
}
|
|
|
|
|
2015-03-07 01:54:08 +08:00
|
|
|
static unsigned char update_suffix(struct key_vector *tn)
|
2015-01-01 02:57:08 +08:00
|
|
|
{
|
|
|
|
unsigned char slen = tn->pos;
|
|
|
|
unsigned long stride, i;
|
2016-12-01 20:27:57 +08:00
|
|
|
unsigned char slen_max;
|
|
|
|
|
|
|
|
/* only vector 0 can have a suffix length greater than or equal to
|
|
|
|
* tn->pos + tn->bits, the second highest node will have a suffix
|
|
|
|
* length at most of tn->pos + tn->bits - 1
|
|
|
|
*/
|
|
|
|
slen_max = min_t(unsigned char, tn->pos + tn->bits - 1, tn->slen);
|
2015-01-01 02:57:08 +08:00
|
|
|
|
|
|
|
/* search though the list of children looking for nodes that might
|
|
|
|
* have a suffix greater than the one we currently have. This is
|
|
|
|
* why we start with a stride of 2 since a stride of 1 would
|
|
|
|
* represent the nodes with suffix length equal to tn->pos
|
|
|
|
*/
|
2015-03-07 01:54:21 +08:00
|
|
|
for (i = 0, stride = 0x2ul ; i < child_length(tn); i += stride) {
|
2015-03-07 01:54:14 +08:00
|
|
|
struct key_vector *n = get_child(tn, i);
|
2015-01-01 02:57:08 +08:00
|
|
|
|
|
|
|
if (!n || (n->slen <= slen))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
/* update stride and slen based on new value */
|
|
|
|
stride <<= (n->slen - slen);
|
|
|
|
slen = n->slen;
|
|
|
|
i &= ~(stride - 1);
|
|
|
|
|
2016-12-01 20:27:57 +08:00
|
|
|
/* stop searching if we have hit the maximum possible value */
|
|
|
|
if (slen >= slen_max)
|
2015-01-01 02:57:08 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
tn->slen = slen;
|
|
|
|
|
|
|
|
return slen;
|
|
|
|
}
|
|
|
|
|
2015-01-01 02:56:37 +08:00
|
|
|
/* From "Implementing a dynamic compressed trie" by Stefan Nilsson of
|
|
|
|
* the Helsinki University of Technology and Matti Tikkanen of Nokia
|
|
|
|
* Telecommunications, page 6:
|
|
|
|
* "A node is doubled if the ratio of non-empty children to all
|
|
|
|
* children in the *doubled* node is at least 'high'."
|
|
|
|
*
|
|
|
|
* 'high' in this instance is the variable 'inflate_threshold'. It
|
|
|
|
* is expressed as a percentage, so we multiply it with
|
2015-03-07 01:54:21 +08:00
|
|
|
* child_length() and instead of multiplying by 2 (since the
|
2015-01-01 02:56:37 +08:00
|
|
|
* child array will be doubled by inflate()) and multiplying
|
|
|
|
* the left-hand side by 100 (to handle the percentage thing) we
|
|
|
|
* multiply the left-hand side by 50.
|
|
|
|
*
|
2015-03-07 01:54:21 +08:00
|
|
|
* The left-hand side may look a bit weird: child_length(tn)
|
2015-01-01 02:56:37 +08:00
|
|
|
* - tn->empty_children is of course the number of non-null children
|
|
|
|
* in the current node. tn->full_children is the number of "full"
|
|
|
|
* children, that is non-null tnodes with a skip value of 0.
|
|
|
|
* All of those will be doubled in the resulting inflated tnode, so
|
|
|
|
* we just count them one extra time here.
|
|
|
|
*
|
|
|
|
* A clearer way to write this would be:
|
|
|
|
*
|
|
|
|
* to_be_doubled = tn->full_children;
|
2015-03-07 01:54:21 +08:00
|
|
|
* not_to_be_doubled = child_length(tn) - tn->empty_children -
|
2015-01-01 02:56:37 +08:00
|
|
|
* tn->full_children;
|
|
|
|
*
|
2015-03-07 01:54:21 +08:00
|
|
|
* new_child_length = child_length(tn) * 2;
|
2015-01-01 02:56:37 +08:00
|
|
|
*
|
|
|
|
* new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
|
|
|
|
* new_child_length;
|
|
|
|
* if (new_fill_factor >= inflate_threshold)
|
|
|
|
*
|
|
|
|
* ...and so on, tho it would mess up the while () loop.
|
|
|
|
*
|
|
|
|
* anyway,
|
|
|
|
* 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
|
|
|
|
* inflate_threshold
|
|
|
|
*
|
|
|
|
* avoid a division:
|
|
|
|
* 100 * (not_to_be_doubled + 2*to_be_doubled) >=
|
|
|
|
* inflate_threshold * new_child_length
|
|
|
|
*
|
|
|
|
* expand not_to_be_doubled and to_be_doubled, and shorten:
|
2015-03-07 01:54:21 +08:00
|
|
|
* 100 * (child_length(tn) - tn->empty_children +
|
2015-01-01 02:56:37 +08:00
|
|
|
* tn->full_children) >= inflate_threshold * new_child_length
|
|
|
|
*
|
|
|
|
* expand new_child_length:
|
2015-03-07 01:54:21 +08:00
|
|
|
* 100 * (child_length(tn) - tn->empty_children +
|
2015-01-01 02:56:37 +08:00
|
|
|
* tn->full_children) >=
|
2015-03-07 01:54:21 +08:00
|
|
|
* inflate_threshold * child_length(tn) * 2
|
2015-01-01 02:56:37 +08:00
|
|
|
*
|
|
|
|
* shorten again:
|
2015-03-07 01:54:21 +08:00
|
|
|
* 50 * (tn->full_children + child_length(tn) -
|
2015-01-01 02:56:37 +08:00
|
|
|
* tn->empty_children) >= inflate_threshold *
|
2015-03-07 01:54:21 +08:00
|
|
|
* child_length(tn)
|
2015-01-01 02:56:37 +08:00
|
|
|
*
|
|
|
|
*/
|
2015-03-07 01:54:08 +08:00
|
|
|
static inline bool should_inflate(struct key_vector *tp, struct key_vector *tn)
|
2015-01-01 02:56:37 +08:00
|
|
|
{
|
2015-03-07 01:54:21 +08:00
|
|
|
unsigned long used = child_length(tn);
|
2015-01-01 02:56:37 +08:00
|
|
|
unsigned long threshold = used;
|
|
|
|
|
|
|
|
/* Keep root node larger */
|
2015-03-07 01:54:52 +08:00
|
|
|
threshold *= IS_TRIE(tp) ? inflate_threshold_root : inflate_threshold;
|
2015-03-07 01:54:39 +08:00
|
|
|
used -= tn_info(tn)->empty_children;
|
|
|
|
used += tn_info(tn)->full_children;
|
2015-01-01 02:56:37 +08:00
|
|
|
|
2015-01-23 07:51:26 +08:00
|
|
|
/* if bits == KEYLENGTH then pos = 0, and will fail below */
|
|
|
|
|
|
|
|
return (used > 1) && tn->pos && ((50 * used) >= threshold);
|
2015-01-01 02:56:37 +08:00
|
|
|
}
|
|
|
|
|
2015-03-07 01:54:08 +08:00
|
|
|
static inline bool should_halve(struct key_vector *tp, struct key_vector *tn)
|
2015-01-01 02:56:37 +08:00
|
|
|
{
|
2015-03-07 01:54:21 +08:00
|
|
|
unsigned long used = child_length(tn);
|
2015-01-01 02:56:37 +08:00
|
|
|
unsigned long threshold = used;
|
|
|
|
|
|
|
|
/* Keep root node larger */
|
2015-03-07 01:54:52 +08:00
|
|
|
threshold *= IS_TRIE(tp) ? halve_threshold_root : halve_threshold;
|
2015-03-07 01:54:39 +08:00
|
|
|
used -= tn_info(tn)->empty_children;
|
2015-01-01 02:56:37 +08:00
|
|
|
|
2015-01-23 07:51:26 +08:00
|
|
|
/* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */
|
|
|
|
|
|
|
|
return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold);
|
|
|
|
}
|
|
|
|
|
2015-03-07 01:54:08 +08:00
|
|
|
static inline bool should_collapse(struct key_vector *tn)
|
2015-01-23 07:51:26 +08:00
|
|
|
{
|
2015-03-07 01:54:21 +08:00
|
|
|
unsigned long used = child_length(tn);
|
2015-01-23 07:51:26 +08:00
|
|
|
|
2015-03-07 01:54:39 +08:00
|
|
|
used -= tn_info(tn)->empty_children;
|
2015-01-23 07:51:26 +08:00
|
|
|
|
|
|
|
/* account for bits == KEYLENGTH case */
|
2015-03-07 01:54:39 +08:00
|
|
|
if ((tn->bits == KEYLENGTH) && tn_info(tn)->full_children)
|
2015-01-23 07:51:26 +08:00
|
|
|
used -= KEY_MAX;
|
|
|
|
|
|
|
|
/* One child or none, time to drop us from the trie */
|
|
|
|
return used < 2;
|
2015-01-01 02:56:37 +08:00
|
|
|
}
|
|
|
|
|
2015-01-01 02:56:31 +08:00
|
|
|
#define MAX_WORK 10
|
2015-03-07 01:54:52 +08:00
|
|
|
static struct key_vector *resize(struct trie *t, struct key_vector *tn)
|
2015-01-01 02:56:31 +08:00
|
|
|
{
|
2015-03-07 01:54:02 +08:00
|
|
|
#ifdef CONFIG_IP_FIB_TRIE_STATS
|
|
|
|
struct trie_use_stats __percpu *stats = t->stats;
|
|
|
|
#endif
|
2015-03-07 01:54:08 +08:00
|
|
|
struct key_vector *tp = node_parent(tn);
|
2015-03-07 01:54:52 +08:00
|
|
|
unsigned long cindex = get_index(tn->key, tp);
|
2015-01-23 07:51:20 +08:00
|
|
|
int max_work = MAX_WORK;
|
2015-01-01 02:56:31 +08:00
|
|
|
|
|
|
|
pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
|
|
|
|
tn, inflate_threshold, halve_threshold);
|
|
|
|
|
2015-01-01 02:56:43 +08:00
|
|
|
/* track the tnode via the pointer from the parent instead of
|
|
|
|
* doing it ourselves. This way we can let RCU fully do its
|
|
|
|
* thing without us interfering
|
|
|
|
*/
|
2015-03-07 01:54:52 +08:00
|
|
|
BUG_ON(tn != get_child(tp, cindex));
|
2015-01-01 02:56:43 +08:00
|
|
|
|
2015-01-01 02:56:37 +08:00
|
|
|
/* Double as long as the resulting node has a number of
|
|
|
|
* nonempty nodes that are above the threshold.
|
2015-01-01 02:56:31 +08:00
|
|
|
*/
|
2015-03-24 02:51:53 +08:00
|
|
|
while (should_inflate(tp, tn) && max_work) {
|
2015-03-07 01:54:52 +08:00
|
|
|
tp = inflate(t, tn);
|
|
|
|
if (!tp) {
|
2015-01-01 02:56:31 +08:00
|
|
|
#ifdef CONFIG_IP_FIB_TRIE_STATS
|
2015-03-07 01:54:02 +08:00
|
|
|
this_cpu_inc(stats->resize_node_skipped);
|
2015-01-01 02:56:31 +08:00
|
|
|
#endif
|
|
|
|
break;
|
|
|
|
}
|
2015-01-01 02:56:43 +08:00
|
|
|
|
2015-03-24 02:51:53 +08:00
|
|
|
max_work--;
|
2015-03-07 01:54:52 +08:00
|
|
|
tn = get_child(tp, cindex);
|
2015-01-01 02:56:31 +08:00
|
|
|
}
|
|
|
|
|
2015-03-24 02:51:53 +08:00
|
|
|
/* update parent in case inflate failed */
|
|
|
|
tp = node_parent(tn);
|
|
|
|
|
2015-01-01 02:56:31 +08:00
|
|
|
/* Return if at least one inflate is run */
|
|
|
|
if (max_work != MAX_WORK)
|
2015-03-24 02:51:53 +08:00
|
|
|
return tp;
|
2015-01-01 02:56:31 +08:00
|
|
|
|
2015-01-01 02:56:37 +08:00
|
|
|
/* Halve as long as the number of empty children in this
|
2015-01-01 02:56:31 +08:00
|
|
|
* node is above threshold.
|
|
|
|
*/
|
2015-03-24 02:51:53 +08:00
|
|
|
while (should_halve(tp, tn) && max_work) {
|
2015-03-07 01:54:52 +08:00
|
|
|
tp = halve(t, tn);
|
|
|
|
if (!tp) {
|
2015-01-01 02:56:31 +08:00
|
|
|
#ifdef CONFIG_IP_FIB_TRIE_STATS
|
2015-03-07 01:54:02 +08:00
|
|
|
this_cpu_inc(stats->resize_node_skipped);
|
2015-01-01 02:56:31 +08:00
|
|
|
#endif
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2015-03-24 02:51:53 +08:00
|
|
|
max_work--;
|
2015-03-07 01:54:52 +08:00
|
|
|
tn = get_child(tp, cindex);
|
2015-01-01 02:56:43 +08:00
|
|
|
}
|
2015-01-01 02:56:31 +08:00
|
|
|
|
|
|
|
/* Only one child remains */
|
2015-03-07 01:54:52 +08:00
|
|
|
if (should_collapse(tn))
|
|
|
|
return collapse(t, tn);
|
|
|
|
|
2015-03-24 02:51:53 +08:00
|
|
|
/* update parent in case halve failed */
|
2016-12-01 20:27:57 +08:00
|
|
|
return node_parent(tn);
|
2015-01-01 02:56:31 +08:00
|
|
|
}
|
|
|
|
|
2016-12-01 20:27:52 +08:00
|
|
|
static void node_pull_suffix(struct key_vector *tn, unsigned char slen)
|
2015-01-01 02:57:08 +08:00
|
|
|
{
|
2016-12-01 20:27:52 +08:00
|
|
|
unsigned char node_slen = tn->slen;
|
|
|
|
|
|
|
|
while ((node_slen > tn->pos) && (node_slen > slen)) {
|
|
|
|
slen = update_suffix(tn);
|
|
|
|
if (node_slen == slen)
|
2015-01-01 02:57:08 +08:00
|
|
|
break;
|
2016-12-01 20:27:52 +08:00
|
|
|
|
|
|
|
tn = node_parent(tn);
|
|
|
|
node_slen = tn->slen;
|
2015-01-01 02:57:08 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-12-01 20:27:52 +08:00
|
|
|
static void node_push_suffix(struct key_vector *tn, unsigned char slen)
|
2005-06-22 03:43:18 +08:00
|
|
|
{
|
2016-12-01 20:27:52 +08:00
|
|
|
while (tn->slen < slen) {
|
|
|
|
tn->slen = slen;
|
2015-01-01 02:57:08 +08:00
|
|
|
tn = node_parent(tn);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-08-26 04:01:29 +08:00
|
|
|
/* rcu_read_lock needs to be hold by caller from readside */
|
2015-03-07 01:54:08 +08:00
|
|
|
static struct key_vector *fib_find_node(struct trie *t,
|
|
|
|
struct key_vector **tp, u32 key)
|
2005-06-22 03:43:18 +08:00
|
|
|
{
|
2015-03-07 01:54:52 +08:00
|
|
|
struct key_vector *pn, *n = t->kv;
|
|
|
|
unsigned long index = 0;
|
|
|
|
|
|
|
|
do {
|
|
|
|
pn = n;
|
|
|
|
n = get_child_rcu(n, index);
|
|
|
|
|
|
|
|
if (!n)
|
|
|
|
break;
|
2015-01-01 02:56:00 +08:00
|
|
|
|
2015-03-07 01:54:52 +08:00
|
|
|
index = get_cindex(key, n);
|
2015-01-01 02:56:00 +08:00
|
|
|
|
|
|
|
/* This bit of code is a bit tricky but it combines multiple
|
|
|
|
* checks into a single check. The prefix consists of the
|
|
|
|
* prefix plus zeros for the bits in the cindex. The index
|
|
|
|
* is the difference between the key and this value. From
|
|
|
|
* this we can actually derive several pieces of data.
|
2015-03-05 07:01:59 +08:00
|
|
|
* if (index >= (1ul << bits))
|
2015-01-01 02:56:00 +08:00
|
|
|
* we have a mismatch in skip bits and failed
|
2015-01-23 07:51:08 +08:00
|
|
|
* else
|
|
|
|
* we know the value is cindex
|
2015-03-05 07:01:59 +08:00
|
|
|
*
|
|
|
|
* This check is safe even if bits == KEYLENGTH due to the
|
|
|
|
* fact that we can only allocate a node with 32 bits if a
|
|
|
|
* long is greater than 32 bits.
|
2015-01-01 02:56:00 +08:00
|
|
|
*/
|
2015-03-05 07:01:59 +08:00
|
|
|
if (index >= (1ul << n->bits)) {
|
|
|
|
n = NULL;
|
|
|
|
break;
|
|
|
|
}
|
2015-01-01 02:56:00 +08:00
|
|
|
|
2015-03-07 01:54:52 +08:00
|
|
|
/* keep searching until we find a perfect match leaf or NULL */
|
|
|
|
} while (IS_TNODE(n));
|
2005-08-10 11:24:39 +08:00
|
|
|
|
2015-03-07 01:54:08 +08:00
|
|
|
*tp = pn;
|
2015-03-05 07:01:59 +08:00
|
|
|
|
2015-01-01 02:56:00 +08:00
|
|
|
return n;
|
2005-06-22 03:43:18 +08:00
|
|
|
}
|
|
|
|
|
2015-01-23 07:51:39 +08:00
|
|
|
/* Return the first fib alias matching TOS with
|
|
|
|
* priority less than or equal to PRIO.
|
2019-12-14 23:53:08 +08:00
|
|
|
* If 'find_first' is set, return the first matching
|
|
|
|
* fib alias, regardless of TOS and priority.
|
2015-01-23 07:51:39 +08:00
|
|
|
*/
|
2015-02-26 07:31:51 +08:00
|
|
|
static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
|
2019-12-14 23:53:08 +08:00
|
|
|
u8 tos, u32 prio, u32 tb_id,
|
|
|
|
bool find_first)
|
2015-01-23 07:51:39 +08:00
|
|
|
{
|
|
|
|
struct fib_alias *fa;
|
|
|
|
|
|
|
|
if (!fah)
|
|
|
|
return NULL;
|
|
|
|
|
2015-02-26 07:31:31 +08:00
|
|
|
hlist_for_each_entry(fa, fah, fa_list) {
|
2015-02-26 07:31:51 +08:00
|
|
|
if (fa->fa_slen < slen)
|
|
|
|
continue;
|
|
|
|
if (fa->fa_slen != slen)
|
|
|
|
break;
|
2015-03-13 05:46:29 +08:00
|
|
|
if (fa->tb_id > tb_id)
|
|
|
|
continue;
|
|
|
|
if (fa->tb_id != tb_id)
|
|
|
|
break;
|
2019-12-14 23:53:08 +08:00
|
|
|
if (find_first)
|
|
|
|
return fa;
|
2015-01-23 07:51:39 +08:00
|
|
|
if (fa->fa_tos > tos)
|
|
|
|
continue;
|
|
|
|
if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
|
|
|
|
return fa;
|
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
ipv4: Add "offload" and "trap" indications to routes
When performing L3 offload, routes and nexthops are usually programmed
into two different tables in the underlying device. Therefore, the fact
that a nexthop resides in hardware does not necessarily mean that all
the associated routes also reside in hardware and vice-versa.
While the kernel can signal to user space the presence of a nexthop in
hardware (via 'RTNH_F_OFFLOAD'), it does not have a corresponding flag
for routes. In addition, the fact that a route resides in hardware does
not necessarily mean that the traffic is offloaded. For example,
unreachable routes (i.e., 'RTN_UNREACHABLE') are programmed to trap
packets to the CPU so that the kernel will be able to generate the
appropriate ICMP error packet.
This patch adds an "offload" and "trap" indications to IPv4 routes, so
that users will have better visibility into the offload process.
'struct fib_alias' is extended with two new fields that indicate if the
route resides in hardware or not and if it is offloading traffic from
the kernel or trapping packets to it. Note that the new fields are added
in the 6 bytes hole and therefore the struct still fits in a single
cache line [1].
Capable drivers are expected to invoke fib_alias_hw_flags_set() with the
route's key in order to set the flags.
The indications are dumped to user space via a new flags (i.e.,
'RTM_F_OFFLOAD' and 'RTM_F_TRAP') in the 'rtm_flags' field in the
ancillary header.
v2:
* Make use of 'struct fib_rt_info' in fib_alias_hw_flags_set()
[1]
struct fib_alias {
struct hlist_node fa_list; /* 0 16 */
struct fib_info * fa_info; /* 16 8 */
u8 fa_tos; /* 24 1 */
u8 fa_type; /* 25 1 */
u8 fa_state; /* 26 1 */
u8 fa_slen; /* 27 1 */
u32 tb_id; /* 28 4 */
s16 fa_default; /* 32 2 */
u8 offload:1; /* 34: 0 1 */
u8 trap:1; /* 34: 1 1 */
u8 unused:6; /* 34: 2 1 */
/* XXX 5 bytes hole, try to pack */
struct callback_head rcu __attribute__((__aligned__(8))); /* 40 16 */
/* size: 56, cachelines: 1, members: 12 */
/* sum members: 50, holes: 1, sum holes: 5 */
/* sum bitfield members: 8 bits (1 bytes) */
/* forced alignments: 1, forced holes: 1, sum forced holes: 5 */
/* last cacheline: 56 bytes */
} __attribute__((__aligned__(8)));
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-01-14 19:23:11 +08:00
|
|
|
static struct fib_alias *
|
|
|
|
fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri)
|
|
|
|
{
|
|
|
|
u8 slen = KEYLENGTH - fri->dst_len;
|
|
|
|
struct key_vector *l, *tp;
|
|
|
|
struct fib_table *tb;
|
|
|
|
struct fib_alias *fa;
|
|
|
|
struct trie *t;
|
|
|
|
|
|
|
|
tb = fib_get_table(net, fri->tb_id);
|
|
|
|
if (!tb)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
t = (struct trie *)tb->tb_data;
|
|
|
|
l = fib_find_node(t, &tp, be32_to_cpu(fri->dst));
|
|
|
|
if (!l)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
|
|
|
|
if (fa->fa_slen == slen && fa->tb_id == fri->tb_id &&
|
|
|
|
fa->fa_tos == fri->tos && fa->fa_info == fri->fi &&
|
|
|
|
fa->fa_type == fri->type)
|
|
|
|
return fa;
|
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri)
|
|
|
|
{
|
2022-07-23 02:22:05 +08:00
|
|
|
u8 fib_notify_on_flag_change;
|
ipv4: Add "offload" and "trap" indications to routes
When performing L3 offload, routes and nexthops are usually programmed
into two different tables in the underlying device. Therefore, the fact
that a nexthop resides in hardware does not necessarily mean that all
the associated routes also reside in hardware and vice-versa.
While the kernel can signal to user space the presence of a nexthop in
hardware (via 'RTNH_F_OFFLOAD'), it does not have a corresponding flag
for routes. In addition, the fact that a route resides in hardware does
not necessarily mean that the traffic is offloaded. For example,
unreachable routes (i.e., 'RTN_UNREACHABLE') are programmed to trap
packets to the CPU so that the kernel will be able to generate the
appropriate ICMP error packet.
This patch adds an "offload" and "trap" indications to IPv4 routes, so
that users will have better visibility into the offload process.
'struct fib_alias' is extended with two new fields that indicate if the
route resides in hardware or not and if it is offloading traffic from
the kernel or trapping packets to it. Note that the new fields are added
in the 6 bytes hole and therefore the struct still fits in a single
cache line [1].
Capable drivers are expected to invoke fib_alias_hw_flags_set() with the
route's key in order to set the flags.
The indications are dumped to user space via a new flags (i.e.,
'RTM_F_OFFLOAD' and 'RTM_F_TRAP') in the 'rtm_flags' field in the
ancillary header.
v2:
* Make use of 'struct fib_rt_info' in fib_alias_hw_flags_set()
[1]
struct fib_alias {
struct hlist_node fa_list; /* 0 16 */
struct fib_info * fa_info; /* 16 8 */
u8 fa_tos; /* 24 1 */
u8 fa_type; /* 25 1 */
u8 fa_state; /* 26 1 */
u8 fa_slen; /* 27 1 */
u32 tb_id; /* 28 4 */
s16 fa_default; /* 32 2 */
u8 offload:1; /* 34: 0 1 */
u8 trap:1; /* 34: 1 1 */
u8 unused:6; /* 34: 2 1 */
/* XXX 5 bytes hole, try to pack */
struct callback_head rcu __attribute__((__aligned__(8))); /* 40 16 */
/* size: 56, cachelines: 1, members: 12 */
/* sum members: 50, holes: 1, sum holes: 5 */
/* sum bitfield members: 8 bits (1 bytes) */
/* forced alignments: 1, forced holes: 1, sum forced holes: 5 */
/* last cacheline: 56 bytes */
} __attribute__((__aligned__(8)));
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-01-14 19:23:11 +08:00
|
|
|
struct fib_alias *fa_match;
|
2021-02-02 03:47:52 +08:00
|
|
|
struct sk_buff *skb;
|
|
|
|
int err;
|
ipv4: Add "offload" and "trap" indications to routes
When performing L3 offload, routes and nexthops are usually programmed
into two different tables in the underlying device. Therefore, the fact
that a nexthop resides in hardware does not necessarily mean that all
the associated routes also reside in hardware and vice-versa.
While the kernel can signal to user space the presence of a nexthop in
hardware (via 'RTNH_F_OFFLOAD'), it does not have a corresponding flag
for routes. In addition, the fact that a route resides in hardware does
not necessarily mean that the traffic is offloaded. For example,
unreachable routes (i.e., 'RTN_UNREACHABLE') are programmed to trap
packets to the CPU so that the kernel will be able to generate the
appropriate ICMP error packet.
This patch adds an "offload" and "trap" indications to IPv4 routes, so
that users will have better visibility into the offload process.
'struct fib_alias' is extended with two new fields that indicate if the
route resides in hardware or not and if it is offloading traffic from
the kernel or trapping packets to it. Note that the new fields are added
in the 6 bytes hole and therefore the struct still fits in a single
cache line [1].
Capable drivers are expected to invoke fib_alias_hw_flags_set() with the
route's key in order to set the flags.
The indications are dumped to user space via a new flags (i.e.,
'RTM_F_OFFLOAD' and 'RTM_F_TRAP') in the 'rtm_flags' field in the
ancillary header.
v2:
* Make use of 'struct fib_rt_info' in fib_alias_hw_flags_set()
[1]
struct fib_alias {
struct hlist_node fa_list; /* 0 16 */
struct fib_info * fa_info; /* 16 8 */
u8 fa_tos; /* 24 1 */
u8 fa_type; /* 25 1 */
u8 fa_state; /* 26 1 */
u8 fa_slen; /* 27 1 */
u32 tb_id; /* 28 4 */
s16 fa_default; /* 32 2 */
u8 offload:1; /* 34: 0 1 */
u8 trap:1; /* 34: 1 1 */
u8 unused:6; /* 34: 2 1 */
/* XXX 5 bytes hole, try to pack */
struct callback_head rcu __attribute__((__aligned__(8))); /* 40 16 */
/* size: 56, cachelines: 1, members: 12 */
/* sum members: 50, holes: 1, sum holes: 5 */
/* sum bitfield members: 8 bits (1 bytes) */
/* forced alignments: 1, forced holes: 1, sum forced holes: 5 */
/* last cacheline: 56 bytes */
} __attribute__((__aligned__(8)));
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-01-14 19:23:11 +08:00
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
|
|
|
|
fa_match = fib_find_matching_alias(net, fri);
|
|
|
|
if (!fa_match)
|
|
|
|
goto out;
|
|
|
|
|
2022-02-17 01:32:16 +08:00
|
|
|
/* These are paired with the WRITE_ONCE() happening in this function.
|
|
|
|
* The reason is that we are only protected by RCU at this point.
|
|
|
|
*/
|
|
|
|
if (READ_ONCE(fa_match->offload) == fri->offload &&
|
|
|
|
READ_ONCE(fa_match->trap) == fri->trap &&
|
|
|
|
READ_ONCE(fa_match->offload_failed) == fri->offload_failed)
|
2021-02-02 03:47:52 +08:00
|
|
|
goto out;
|
|
|
|
|
2022-02-17 01:32:16 +08:00
|
|
|
WRITE_ONCE(fa_match->offload, fri->offload);
|
|
|
|
WRITE_ONCE(fa_match->trap, fri->trap);
|
2021-02-07 16:22:51 +08:00
|
|
|
|
2022-07-23 02:22:05 +08:00
|
|
|
fib_notify_on_flag_change = READ_ONCE(net->ipv4.sysctl_fib_notify_on_flag_change);
|
|
|
|
|
2021-02-07 16:22:51 +08:00
|
|
|
/* 2 means send notifications only if offload_failed was changed. */
|
2022-07-23 02:22:05 +08:00
|
|
|
if (fib_notify_on_flag_change == 2 &&
|
2022-02-17 01:32:16 +08:00
|
|
|
READ_ONCE(fa_match->offload_failed) == fri->offload_failed)
|
2021-02-07 16:22:51 +08:00
|
|
|
goto out;
|
|
|
|
|
2022-02-17 01:32:16 +08:00
|
|
|
WRITE_ONCE(fa_match->offload_failed, fri->offload_failed);
|
ipv4: Add "offload" and "trap" indications to routes
When performing L3 offload, routes and nexthops are usually programmed
into two different tables in the underlying device. Therefore, the fact
that a nexthop resides in hardware does not necessarily mean that all
the associated routes also reside in hardware and vice-versa.
While the kernel can signal to user space the presence of a nexthop in
hardware (via 'RTNH_F_OFFLOAD'), it does not have a corresponding flag
for routes. In addition, the fact that a route resides in hardware does
not necessarily mean that the traffic is offloaded. For example,
unreachable routes (i.e., 'RTN_UNREACHABLE') are programmed to trap
packets to the CPU so that the kernel will be able to generate the
appropriate ICMP error packet.
This patch adds an "offload" and "trap" indications to IPv4 routes, so
that users will have better visibility into the offload process.
'struct fib_alias' is extended with two new fields that indicate if the
route resides in hardware or not and if it is offloading traffic from
the kernel or trapping packets to it. Note that the new fields are added
in the 6 bytes hole and therefore the struct still fits in a single
cache line [1].
Capable drivers are expected to invoke fib_alias_hw_flags_set() with the
route's key in order to set the flags.
The indications are dumped to user space via a new flags (i.e.,
'RTM_F_OFFLOAD' and 'RTM_F_TRAP') in the 'rtm_flags' field in the
ancillary header.
v2:
* Make use of 'struct fib_rt_info' in fib_alias_hw_flags_set()
[1]
struct fib_alias {
struct hlist_node fa_list; /* 0 16 */
struct fib_info * fa_info; /* 16 8 */
u8 fa_tos; /* 24 1 */
u8 fa_type; /* 25 1 */
u8 fa_state; /* 26 1 */
u8 fa_slen; /* 27 1 */
u32 tb_id; /* 28 4 */
s16 fa_default; /* 32 2 */
u8 offload:1; /* 34: 0 1 */
u8 trap:1; /* 34: 1 1 */
u8 unused:6; /* 34: 2 1 */
/* XXX 5 bytes hole, try to pack */
struct callback_head rcu __attribute__((__aligned__(8))); /* 40 16 */
/* size: 56, cachelines: 1, members: 12 */
/* sum members: 50, holes: 1, sum holes: 5 */
/* sum bitfield members: 8 bits (1 bytes) */
/* forced alignments: 1, forced holes: 1, sum forced holes: 5 */
/* last cacheline: 56 bytes */
} __attribute__((__aligned__(8)));
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-01-14 19:23:11 +08:00
|
|
|
|
2022-07-23 02:22:05 +08:00
|
|
|
if (!fib_notify_on_flag_change)
|
2021-02-02 03:47:52 +08:00
|
|
|
goto out;
|
|
|
|
|
|
|
|
skb = nlmsg_new(fib_nlmsg_size(fa_match->fa_info), GFP_ATOMIC);
|
|
|
|
if (!skb) {
|
|
|
|
err = -ENOBUFS;
|
|
|
|
goto errout;
|
|
|
|
}
|
|
|
|
|
|
|
|
err = fib_dump_info(skb, 0, 0, RTM_NEWROUTE, fri, 0);
|
|
|
|
if (err < 0) {
|
|
|
|
/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
|
|
|
|
WARN_ON(err == -EMSGSIZE);
|
|
|
|
kfree_skb(skb);
|
|
|
|
goto errout;
|
|
|
|
}
|
|
|
|
|
|
|
|
rtnl_notify(skb, net, 0, RTNLGRP_IPV4_ROUTE, NULL, GFP_ATOMIC);
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
errout:
|
|
|
|
rtnl_set_sk_err(net, RTNLGRP_IPV4_ROUTE, err);
|
ipv4: Add "offload" and "trap" indications to routes
When performing L3 offload, routes and nexthops are usually programmed
into two different tables in the underlying device. Therefore, the fact
that a nexthop resides in hardware does not necessarily mean that all
the associated routes also reside in hardware and vice-versa.
While the kernel can signal to user space the presence of a nexthop in
hardware (via 'RTNH_F_OFFLOAD'), it does not have a corresponding flag
for routes. In addition, the fact that a route resides in hardware does
not necessarily mean that the traffic is offloaded. For example,
unreachable routes (i.e., 'RTN_UNREACHABLE') are programmed to trap
packets to the CPU so that the kernel will be able to generate the
appropriate ICMP error packet.
This patch adds "offload" and "trap" indications to IPv4 routes, so
that users will have better visibility into the offload process.
'struct fib_alias' is extended with two new fields that indicate if the
route resides in hardware or not and if it is offloading traffic from
the kernel or trapping packets to it. Note that the new fields are added
in the 6 bytes hole and therefore the struct still fits in a single
cache line [1].
Capable drivers are expected to invoke fib_alias_hw_flags_set() with the
route's key in order to set the flags.
The indications are dumped to user space via new flags (i.e.,
'RTM_F_OFFLOAD' and 'RTM_F_TRAP') in the 'rtm_flags' field in the
ancillary header.
v2:
* Make use of 'struct fib_rt_info' in fib_alias_hw_flags_set()
[1]
struct fib_alias {
struct hlist_node fa_list; /* 0 16 */
struct fib_info * fa_info; /* 16 8 */
u8 fa_tos; /* 24 1 */
u8 fa_type; /* 25 1 */
u8 fa_state; /* 26 1 */
u8 fa_slen; /* 27 1 */
u32 tb_id; /* 28 4 */
s16 fa_default; /* 32 2 */
u8 offload:1; /* 34: 0 1 */
u8 trap:1; /* 34: 1 1 */
u8 unused:6; /* 34: 2 1 */
/* XXX 5 bytes hole, try to pack */
struct callback_head rcu __attribute__((__aligned__(8))); /* 40 16 */
/* size: 56, cachelines: 1, members: 12 */
/* sum members: 50, holes: 1, sum holes: 5 */
/* sum bitfield members: 8 bits (1 bytes) */
/* forced alignments: 1, forced holes: 1, sum forced holes: 5 */
/* last cacheline: 56 bytes */
} __attribute__((__aligned__(8)));
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-01-14 19:23:11 +08:00
|
|
|
out:
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(fib_alias_hw_flags_set);
|
|
|
|
|
2015-03-07 01:54:08 +08:00
|
|
|
/*
 * Repeatedly hand @tn to resize() and continue with the node it returns,
 * stopping as soon as the current node satisfies IS_TRIE().
 */
static void trie_rebalance(struct trie *t, struct key_vector *tn)
{
	for (;;) {
		if (IS_TRIE(tn))
			return;

		tn = resize(t, tn);
	}
}
|
|
|
|
|
2015-03-07 01:54:08 +08:00
|
|
|
/*
 * Create a leaf for @key carrying @new and hook it into the trie below @tp.
 *
 * When the slot that @key selects in @tp is already taken, a fresh tnode is
 * put into that slot first. The previous occupant moves to the child index
 * opposite to the one @key selects in the new tnode, which leaves the other
 * index free for the leaf.
 *
 * Returns 0 on success, or -ENOMEM when either the leaf or the intermediate
 * tnode could not be allocated (a leaf that was already allocated is
 * released with node_free() in the latter case).
 */
static int fib_insert_node(struct trie *t, struct key_vector *tp,
			   struct fib_alias *new, t_key key)
{
	struct key_vector *leaf, *occupant;

	leaf = leaf_new(key, new);
	if (!leaf)
		return -ENOMEM;

	/* whatever currently sits in the slot the key selects in the parent */
	occupant = get_child(tp, get_index(key, tp));
	if (occupant) {
		struct key_vector *split;

		/* second argument: most significant bit in which the two
		 * keys differ; third argument 1 - presumably a one-bit
		 * (two-child) tnode, confirm against tnode_new()
		 */
		split = tnode_new(key, __fls(key ^ occupant->key), 1);
		if (!split) {
			node_free(leaf);
			return -ENOMEM;
		}

		/* finish setting up the new tnode before the parent's slot
		 * is switched over to it
		 */
		NODE_INIT_PARENT(split, tp);
		put_child(split, get_index(key, split) ^ 1, occupant);

		/* switch the parent's slot to it, then re-parent the old child */
		put_child_root(tp, key, split);
		node_set_parent(occupant, split);

		/* the still-empty index of the new tnode receives the leaf */
		tp = split;
	}

	/* plain insertion of the leaf into the (now free) slot of tp */
	node_push_suffix(tp, new->fa_slen);
	NODE_INIT_PARENT(leaf, tp);
	put_child_root(tp, key, leaf);
	trie_rebalance(t, tp);

	return 0;
}
|
|
|
|
|
2015-03-07 01:54:08 +08:00
|
|
|
/*
 * Link alias @new into the alias list of leaf @l.
 *
 * - No leaf (@l == NULL): defer to fib_insert_node(), which creates one.
 * - @fa given: @new is inserted directly in front of @fa.
 * - @fa == NULL: the list is walked and @new goes behind the last entry
 *   that has a smaller fa_slen, or the same fa_slen and a tb_id that is
 *   not smaller than @new's; with no such entry it becomes the list head.
 *
 * Afterwards the leaf's slen is raised to @new's fa_slen if that is larger,
 * and the new value is passed to node_push_suffix() for @tp.
 *
 * Returns 0, or the result of fib_insert_node() when a leaf had to be
 * created.
 */
static int fib_insert_alias(struct trie *t, struct key_vector *tp,
			    struct key_vector *l, struct fib_alias *new,
			    struct fib_alias *fa, t_key key)
{
	if (!l)
		return fib_insert_node(t, tp, new, key);

	if (fa) {
		hlist_add_before_rcu(&new->fa_list, &fa->fa_list);
	} else {
		struct fib_alias *cur, *prev = NULL;

		/* find the last entry that has to stay ahead of the new one */
		hlist_for_each_entry(cur, &l->leaf, fa_list) {
			if (cur->fa_slen > new->fa_slen ||
			    (cur->fa_slen == new->fa_slen &&
			     cur->tb_id < new->tb_id))
				break;
			prev = cur;
		}

		if (!prev)
			hlist_add_head_rcu(&new->fa_list, &l->leaf);
		else
			hlist_add_behind_rcu(&new->fa_list, &prev->fa_list);
	}

	/* a longer suffix than the leaf has recorded must be written back */
	if (new->fa_slen > l->slen) {
		l->slen = new->fa_slen;
		node_push_suffix(tp, new->fa_slen);
	}

	return 0;
}
|
|
|
|
|
2017-05-28 06:19:26 +08:00
|
|
|
/*
 * Validate a prefix/length pair.
 *
 * @plen must not exceed KEYLENGTH, and for shorter prefixes @key must not
 * have any bit set below the first @plen bits (i.e. shifting the prefix
 * bits out must leave zero). On failure an explanatory message is stored
 * through @extack.
 *
 * Returns true when the pair is acceptable, false otherwise.
 */
static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack)
{
	/* a full-length prefix leaves no trailing bits to inspect */
	if (plen == KEYLENGTH)
		return true;

	if (plen > KEYLENGTH) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		return false;
	}

	/* anything that survives the shift lies outside the prefix */
	if (key << plen) {
		NL_SET_ERR_MSG(extack,
			       "Invalid prefix for given prefix length");
		return false;
	}

	return true;
}
|
|
|
|
|
2019-12-14 23:53:07 +08:00
|
|
|
static void fib_remove_alias(struct trie *t, struct key_vector *tp,
|
|
|
|
struct key_vector *l, struct fib_alias *old);
|
|
|
|
|
2015-03-05 07:02:18 +08:00
|
|
|
/* Caller must hold RTNL. */
|
2016-09-26 18:52:29 +08:00
|
|
|
int fib_table_insert(struct net *net, struct fib_table *tb,
|
2017-05-22 00:12:02 +08:00
|
|
|
struct fib_config *cfg, struct netlink_ext_ack *extack)
|
2005-06-22 03:43:18 +08:00
|
|
|
{
|
2015-03-05 07:01:59 +08:00
|
|
|
struct trie *t = (struct trie *)tb->tb_data;
|
2005-06-22 03:43:18 +08:00
|
|
|
struct fib_alias *fa, *new_fa;
|
2015-03-07 01:54:08 +08:00
|
|
|
struct key_vector *l, *tp;
|
2016-09-07 23:20:46 +08:00
|
|
|
u16 nlflags = NLM_F_EXCL;
|
2005-06-22 03:43:18 +08:00
|
|
|
struct fib_info *fi;
|
2015-02-26 07:31:51 +08:00
|
|
|
u8 plen = cfg->fc_dst_len;
|
|
|
|
u8 slen = KEYLENGTH - plen;
|
2006-08-18 09:14:52 +08:00
|
|
|
u8 tos = cfg->fc_tos;
|
2015-03-05 07:01:59 +08:00
|
|
|
u32 key;
|
2005-06-22 03:43:18 +08:00
|
|
|
int err;
|
|
|
|
|
2006-08-18 09:14:52 +08:00
|
|
|
key = ntohl(cfg->fc_dst);
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2017-05-28 06:19:26 +08:00
|
|
|
if (!fib_valid_key_len(key, plen, extack))
|
2005-06-22 03:43:18 +08:00
|
|
|
return -EINVAL;
|
|
|
|
|
2017-05-28 06:19:25 +08:00
|
|
|
pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
|
|
|
|
|
2017-05-22 00:12:02 +08:00
|
|
|
fi = fib_create_info(cfg, extack);
|
2006-08-18 09:14:52 +08:00
|
|
|
if (IS_ERR(fi)) {
|
|
|
|
err = PTR_ERR(fi);
|
2005-06-22 03:43:18 +08:00
|
|
|
goto err;
|
2006-08-18 09:14:52 +08:00
|
|
|
}
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2015-03-05 07:01:59 +08:00
|
|
|
l = fib_find_node(t, &tp, key);
|
2015-03-13 05:46:29 +08:00
|
|
|
fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority,
|
2019-12-14 23:53:08 +08:00
|
|
|
tb->tb_id, false) : NULL;
|
2005-06-22 03:43:18 +08:00
|
|
|
|
|
|
|
/* Now fa, if non-NULL, points to the first fib alias
|
|
|
|
* with the same keys [prefix,tos,priority], if such key already
|
|
|
|
* exists or to the node before which we will insert new one.
|
|
|
|
*
|
|
|
|
* If fa is NULL, we will need to allocate a new one and
|
2015-02-26 07:31:31 +08:00
|
|
|
* insert to the tail of the section matching the suffix length
|
|
|
|
* of the new alias.
|
2005-06-22 03:43:18 +08:00
|
|
|
*/
|
|
|
|
|
2008-01-29 13:18:06 +08:00
|
|
|
if (fa && fa->fa_tos == tos &&
|
|
|
|
fa->fa_info->fib_priority == fi->fib_priority) {
|
|
|
|
struct fib_alias *fa_first, *fa_match;
|
2005-06-22 03:43:18 +08:00
|
|
|
|
|
|
|
err = -EEXIST;
|
2006-08-18 09:14:52 +08:00
|
|
|
if (cfg->fc_nlflags & NLM_F_EXCL)
|
2005-06-22 03:43:18 +08:00
|
|
|
goto out;
|
|
|
|
|
2016-09-07 23:20:46 +08:00
|
|
|
nlflags &= ~NLM_F_EXCL;
|
|
|
|
|
2008-01-29 13:18:06 +08:00
|
|
|
/* We have 2 goals:
|
|
|
|
* 1. Find exact match for type, scope, fib_info to avoid
|
|
|
|
* duplicate routes
|
|
|
|
* 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
|
|
|
|
*/
|
|
|
|
fa_match = NULL;
|
|
|
|
fa_first = fa;
|
2015-02-26 07:31:31 +08:00
|
|
|
hlist_for_each_entry_from(fa, fa_list) {
|
2015-03-13 05:46:29 +08:00
|
|
|
if ((fa->fa_slen != slen) ||
|
|
|
|
(fa->tb_id != tb->tb_id) ||
|
|
|
|
(fa->fa_tos != tos))
|
2008-01-29 13:18:06 +08:00
|
|
|
break;
|
|
|
|
if (fa->fa_info->fib_priority != fi->fib_priority)
|
|
|
|
break;
|
|
|
|
if (fa->fa_type == cfg->fc_type &&
|
|
|
|
fa->fa_info == fi) {
|
|
|
|
fa_match = fa;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2006-08-18 09:14:52 +08:00
|
|
|
if (cfg->fc_nlflags & NLM_F_REPLACE) {
|
2005-06-22 03:43:18 +08:00
|
|
|
struct fib_info *fi_drop;
|
|
|
|
u8 state;
|
|
|
|
|
2016-09-07 23:20:46 +08:00
|
|
|
nlflags |= NLM_F_REPLACE;
|
2008-01-29 13:18:06 +08:00
|
|
|
fa = fa_first;
|
|
|
|
if (fa_match) {
|
|
|
|
if (fa == fa_match)
|
|
|
|
err = 0;
|
2008-01-18 19:45:18 +08:00
|
|
|
goto out;
|
2008-01-29 13:18:06 +08:00
|
|
|
}
|
2005-08-26 04:01:29 +08:00
|
|
|
err = -ENOBUFS;
|
2006-12-07 12:33:17 +08:00
|
|
|
new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
|
2015-04-03 16:17:26 +08:00
|
|
|
if (!new_fa)
|
2005-08-26 04:01:29 +08:00
|
|
|
goto out;
|
2005-06-22 03:43:18 +08:00
|
|
|
|
|
|
|
fi_drop = fa->fa_info;
|
2005-08-26 04:01:29 +08:00
|
|
|
new_fa->fa_tos = fa->fa_tos;
|
|
|
|
new_fa->fa_info = fi;
|
2006-08-18 09:14:52 +08:00
|
|
|
new_fa->fa_type = cfg->fc_type;
|
2005-06-22 03:43:18 +08:00
|
|
|
state = fa->fa_state;
|
2008-01-29 13:18:06 +08:00
|
|
|
new_fa->fa_state = state & ~FA_S_ACCESSED;
|
2015-02-26 07:31:44 +08:00
|
|
|
new_fa->fa_slen = fa->fa_slen;
|
2015-05-22 19:40:09 +08:00
|
|
|
new_fa->tb_id = tb->tb_id;
|
2015-07-22 15:43:23 +08:00
|
|
|
new_fa->fa_default = -1;
|
ipv4: Add "offload" and "trap" indications to routes
When performing L3 offload, routes and nexthops are usually programmed
into two different tables in the underlying device. Therefore, the fact
that a nexthop resides in hardware does not necessarily mean that all
the associated routes also reside in hardware and vice-versa.
While the kernel can signal to user space the presence of a nexthop in
hardware (via 'RTNH_F_OFFLOAD'), it does not have a corresponding flag
for routes. In addition, the fact that a route resides in hardware does
not necessarily mean that the traffic is offloaded. For example,
unreachable routes (i.e., 'RTN_UNREACHABLE') are programmed to trap
packets to the CPU so that the kernel will be able to generate the
appropriate ICMP error packet.
This patch adds an "offload" and "trap" indications to IPv4 routes, so
that users will have better visibility into the offload process.
'struct fib_alias' is extended with two new fields that indicate if the
route resides in hardware or not and if it is offloading traffic from
the kernel or trapping packets to it. Note that the new fields are added
in the 6 bytes hole and therefore the struct still fits in a single
cache line [1].
Capable drivers are expected to invoke fib_alias_hw_flags_set() with the
route's key in order to set the flags.
The indications are dumped to user space via a new flags (i.e.,
'RTM_F_OFFLOAD' and 'RTM_F_TRAP') in the 'rtm_flags' field in the
ancillary header.
v2:
* Make use of 'struct fib_rt_info' in fib_alias_hw_flags_set()
[1]
struct fib_alias {
struct hlist_node fa_list; /* 0 16 */
struct fib_info * fa_info; /* 16 8 */
u8 fa_tos; /* 24 1 */
u8 fa_type; /* 25 1 */
u8 fa_state; /* 26 1 */
u8 fa_slen; /* 27 1 */
u32 tb_id; /* 28 4 */
s16 fa_default; /* 32 2 */
u8 offload:1; /* 34: 0 1 */
u8 trap:1; /* 34: 1 1 */
u8 unused:6; /* 34: 2 1 */
/* XXX 5 bytes hole, try to pack */
struct callback_head rcu __attribute__((__aligned__(8))); /* 40 16 */
/* size: 56, cachelines: 1, members: 12 */
/* sum members: 50, holes: 1, sum holes: 5 */
/* sum bitfield members: 8 bits (1 bytes) */
/* forced alignments: 1, forced holes: 1, sum forced holes: 5 */
/* last cacheline: 56 bytes */
} __attribute__((__aligned__(8)));
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-01-14 19:23:11 +08:00
|
|
|
new_fa->offload = 0;
|
|
|
|
new_fa->trap = 0;
|
IPv4: Add "offload failed" indication to routes
After installing a route to the kernel, user space receives an
acknowledgment, which means the route was installed in the kernel, but not
necessarily in hardware.
The asynchronous nature of route installation in hardware can lead to a
routing daemon advertising a route before it was actually installed in
hardware. This can result in packet loss or mis-routed packets until the
route is installed in hardware.
To avoid such cases, previous patch set added the ability to emit
RTM_NEWROUTE notifications whenever RTM_F_OFFLOAD/RTM_F_TRAP flags
are changed, this behavior is controlled by sysctl.
With the above mentioned behavior, it is possible to know from user-space
if the route was offloaded, but if the offload fails there is no indication
to user-space. Following a failure, a routing daemon will wait indefinitely
for a notification that will never come.
This patch adds an "offload_failed" indication to IPv4 routes, so that
users will have better visibility into the offload process.
'struct fib_alias', and 'struct fib_rt_info' are extended with new field
that indicates if route offload failed. Note that the new field is added
using unused bit and therefore there is no need to increase structs size.
Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-02-07 16:22:50 +08:00
|
|
|
new_fa->offload_failed = 0;
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2020-01-14 19:23:09 +08:00
|
|
|
hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
|
|
|
|
|
2019-12-14 23:53:09 +08:00
|
|
|
if (fib_find_alias(&l->leaf, fa->fa_slen, 0, 0,
|
2020-01-14 19:23:09 +08:00
|
|
|
tb->tb_id, true) == new_fa) {
|
2019-12-14 23:53:09 +08:00
|
|
|
enum fib_event_type fib_event;
|
|
|
|
|
2019-12-14 23:53:15 +08:00
|
|
|
fib_event = FIB_EVENT_ENTRY_REPLACE;
|
2019-12-14 23:53:09 +08:00
|
|
|
err = call_fib_entry_notifiers(net, fib_event,
|
|
|
|
key, plen,
|
|
|
|
new_fa, extack);
|
2020-01-14 19:23:09 +08:00
|
|
|
if (err) {
|
|
|
|
hlist_replace_rcu(&new_fa->fa_list,
|
|
|
|
&fa->fa_list);
|
2019-12-14 23:53:09 +08:00
|
|
|
goto out_free_new_fa;
|
2020-01-14 19:23:09 +08:00
|
|
|
}
|
2019-12-14 23:53:09 +08:00
|
|
|
}
|
2018-03-28 09:21:58 +08:00
|
|
|
|
2017-02-09 17:28:40 +08:00
|
|
|
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
|
|
|
|
tb->tb_id, &cfg->fc_nlinfo, nlflags);
|
|
|
|
|
2005-08-26 04:01:29 +08:00
|
|
|
alias_free_mem_rcu(fa);
|
2005-06-22 03:43:18 +08:00
|
|
|
|
|
|
|
fib_release_info(fi_drop);
|
|
|
|
if (state & FA_S_ACCESSED)
|
2012-09-07 08:45:29 +08:00
|
|
|
rt_cache_flush(cfg->fc_nlinfo.nl_net);
|
2016-09-26 18:52:29 +08:00
|
|
|
|
2005-08-10 11:24:39 +08:00
|
|
|
goto succeeded;
|
2005-06-22 03:43:18 +08:00
|
|
|
}
|
|
|
|
/* Error if we find a perfect match which
|
|
|
|
* uses the same scope, type, and nexthop
|
|
|
|
* information.
|
|
|
|
*/
|
2008-01-29 13:18:06 +08:00
|
|
|
if (fa_match)
|
|
|
|
goto out;
|
2008-01-23 13:53:36 +08:00
|
|
|
|
2019-12-14 23:53:15 +08:00
|
|
|
if (cfg->fc_nlflags & NLM_F_APPEND)
|
2016-09-07 23:20:46 +08:00
|
|
|
nlflags |= NLM_F_APPEND;
|
2019-12-14 23:53:15 +08:00
|
|
|
else
|
2008-01-29 13:18:06 +08:00
|
|
|
fa = fa_first;
|
2005-06-22 03:43:18 +08:00
|
|
|
}
|
|
|
|
err = -ENOENT;
|
2006-08-18 09:14:52 +08:00
|
|
|
if (!(cfg->fc_nlflags & NLM_F_CREATE))
|
2005-06-22 03:43:18 +08:00
|
|
|
goto out;
|
|
|
|
|
2016-09-07 23:20:46 +08:00
|
|
|
nlflags |= NLM_F_CREATE;
|
2005-06-22 03:43:18 +08:00
|
|
|
err = -ENOBUFS;
|
2006-12-07 12:33:17 +08:00
|
|
|
new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
|
2015-04-03 16:17:26 +08:00
|
|
|
if (!new_fa)
|
2005-06-22 03:43:18 +08:00
|
|
|
goto out;
|
|
|
|
|
|
|
|
new_fa->fa_info = fi;
|
|
|
|
new_fa->fa_tos = tos;
|
2006-08-18 09:14:52 +08:00
|
|
|
new_fa->fa_type = cfg->fc_type;
|
2005-06-22 03:43:18 +08:00
|
|
|
new_fa->fa_state = 0;
|
2015-02-26 07:31:51 +08:00
|
|
|
new_fa->fa_slen = slen;
|
2015-03-07 05:47:00 +08:00
|
|
|
new_fa->tb_id = tb->tb_id;
|
2015-07-22 15:43:23 +08:00
|
|
|
new_fa->fa_default = -1;
|
ipv4: Add "offload" and "trap" indications to routes
When performing L3 offload, routes and nexthops are usually programmed
into two different tables in the underlying device. Therefore, the fact
that a nexthop resides in hardware does not necessarily mean that all
the associated routes also reside in hardware and vice-versa.
While the kernel can signal to user space the presence of a nexthop in
hardware (via 'RTNH_F_OFFLOAD'), it does not have a corresponding flag
for routes. In addition, the fact that a route resides in hardware does
not necessarily mean that the traffic is offloaded. For example,
unreachable routes (i.e., 'RTN_UNREACHABLE') are programmed to trap
packets to the CPU so that the kernel will be able to generate the
appropriate ICMP error packet.
This patch adds an "offload" and "trap" indications to IPv4 routes, so
that users will have better visibility into the offload process.
'struct fib_alias' is extended with two new fields that indicate if the
route resides in hardware or not and if it is offloading traffic from
the kernel or trapping packets to it. Note that the new fields are added
in the 6 bytes hole and therefore the struct still fits in a single
cache line [1].
Capable drivers are expected to invoke fib_alias_hw_flags_set() with the
route's key in order to set the flags.
The indications are dumped to user space via a new flags (i.e.,
'RTM_F_OFFLOAD' and 'RTM_F_TRAP') in the 'rtm_flags' field in the
ancillary header.
v2:
* Make use of 'struct fib_rt_info' in fib_alias_hw_flags_set()
[1]
struct fib_alias {
struct hlist_node fa_list; /* 0 16 */
struct fib_info * fa_info; /* 16 8 */
u8 fa_tos; /* 24 1 */
u8 fa_type; /* 25 1 */
u8 fa_state; /* 26 1 */
u8 fa_slen; /* 27 1 */
u32 tb_id; /* 28 4 */
s16 fa_default; /* 32 2 */
u8 offload:1; /* 34: 0 1 */
u8 trap:1; /* 34: 1 1 */
u8 unused:6; /* 34: 2 1 */
/* XXX 5 bytes hole, try to pack */
struct callback_head rcu __attribute__((__aligned__(8))); /* 40 16 */
/* size: 56, cachelines: 1, members: 12 */
/* sum members: 50, holes: 1, sum holes: 5 */
/* sum bitfield members: 8 bits (1 bytes) */
/* forced alignments: 1, forced holes: 1, sum forced holes: 5 */
/* last cacheline: 56 bytes */
} __attribute__((__aligned__(8)));
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-01-14 19:23:11 +08:00
|
|
|
new_fa->offload = 0;
|
|
|
|
new_fa->trap = 0;
|
IPv4: Add "offload failed" indication to routes
After installing a route to the kernel, user space receives an
acknowledgment, which means the route was installed in the kernel, but not
necessarily in hardware.
The asynchronous nature of route installation in hardware can lead to a
routing daemon advertising a route before it was actually installed in
hardware. This can result in packet loss or mis-routed packets until the
route is installed in hardware.
To avoid such cases, previous patch set added the ability to emit
RTM_NEWROUTE notifications whenever RTM_F_OFFLOAD/RTM_F_TRAP flags
are changed, this behavior is controlled by sysctl.
With the above mentioned behavior, it is possible to know from user-space
if the route was offloaded, but if the offload fails there is no indication
to user-space. Following a failure, a routing daemon will wait indefinitely
for a notification that will never come.
This patch adds an "offload_failed" indication to IPv4 routes, so that
users will have better visibility into the offload process.
'struct fib_alias', and 'struct fib_rt_info' are extended with new field
that indicates if route offload failed. Note that the new field is added
using unused bit and therefore there is no need to increase structs size.
Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-02-07 16:22:50 +08:00
|
|
|
new_fa->offload_failed = 0;
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2019-12-14 23:53:07 +08:00
|
|
|
/* Insert new entry to the list. */
|
|
|
|
err = fib_insert_alias(t, tp, l, new_fa, fa, key);
|
2018-03-28 09:21:57 +08:00
|
|
|
if (err)
|
|
|
|
goto out_free_new_fa;
|
|
|
|
|
2019-12-14 23:53:07 +08:00
|
|
|
/* The alias was already inserted, so the node must exist. */
|
|
|
|
l = l ? l : fib_find_node(t, &tp, key);
|
2022-11-20 15:28:38 +08:00
|
|
|
if (WARN_ON_ONCE(!l)) {
|
|
|
|
err = -ENOENT;
|
2019-12-14 23:53:07 +08:00
|
|
|
goto out_free_new_fa;
|
2022-11-20 15:28:38 +08:00
|
|
|
}
|
2019-12-14 23:53:07 +08:00
|
|
|
|
2019-12-14 23:53:10 +08:00
|
|
|
if (fib_find_alias(&l->leaf, new_fa->fa_slen, 0, 0, tb->tb_id, true) ==
|
|
|
|
new_fa) {
|
|
|
|
enum fib_event_type fib_event;
|
|
|
|
|
2019-12-14 23:53:15 +08:00
|
|
|
fib_event = FIB_EVENT_ENTRY_REPLACE;
|
2019-12-14 23:53:10 +08:00
|
|
|
err = call_fib_entry_notifiers(net, fib_event, key, plen,
|
|
|
|
new_fa, extack);
|
|
|
|
if (err)
|
|
|
|
goto out_remove_new_fa;
|
|
|
|
}
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2011-04-15 05:49:37 +08:00
|
|
|
if (!plen)
|
|
|
|
tb->tb_num_default++;
|
|
|
|
|
2012-09-07 08:45:29 +08:00
|
|
|
rt_cache_flush(cfg->fc_nlinfo.nl_net);
|
2015-03-07 05:47:00 +08:00
|
|
|
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
|
2015-06-18 02:07:01 +08:00
|
|
|
&cfg->fc_nlinfo, nlflags);
|
2005-06-22 03:43:18 +08:00
|
|
|
succeeded:
|
|
|
|
return 0;
|
2005-06-29 06:00:39 +08:00
|
|
|
|
2019-12-14 23:53:07 +08:00
|
|
|
out_remove_new_fa:
|
|
|
|
fib_remove_alias(t, tp, l, new_fa);
|
2005-06-29 06:00:39 +08:00
|
|
|
out_free_new_fa:
|
|
|
|
kmem_cache_free(fn_alias_kmem, new_fa);
|
2005-06-22 03:43:18 +08:00
|
|
|
out:
|
|
|
|
fib_release_info(fi);
|
2005-08-10 11:24:39 +08:00
|
|
|
err:
|
2005-06-22 03:43:18 +08:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2015-03-07 01:54:08 +08:00
|
|
|
/*
 * Return the bits in which @key differs from the key of node @n, limited
 * to the positions at or above the lowest set bit of the node's key.
 * A result of zero means no difference in that region. (Assumes t_key is
 * an unsigned type, so that negation wraps - confirm at the typedef.)
 */
static inline t_key prefix_mismatch(t_key key, struct key_vector *n)
{
	const t_key prefix = n->key;
	/* lowest set bit of the prefix together with every bit above it */
	const t_key region = prefix | -prefix;
	const t_key diff = key ^ prefix;

	return diff & region;
}
|
|
|
|
|
2020-05-27 02:56:17 +08:00
|
|
|
bool fib_lookup_good_nhc(const struct fib_nh_common *nhc, int fib_flags,
|
|
|
|
const struct flowi4 *flp)
|
|
|
|
{
|
|
|
|
if (nhc->nhc_flags & RTNH_F_DEAD)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (ip_ignore_linkdown(nhc->nhc_dev) &&
|
|
|
|
nhc->nhc_flags & RTNH_F_LINKDOWN &&
|
|
|
|
!(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) {
|
|
|
|
if (flp->flowi4_oif &&
|
|
|
|
flp->flowi4_oif != nhc->nhc_oif)
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2015-01-01 02:56:24 +08:00
|
|
|
/* should be called with rcu_read_lock */
|
2011-03-12 08:54:08 +08:00
|
|
|
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
|
2010-10-05 18:41:36 +08:00
|
|
|
struct fib_result *res, int fib_flags)
|
2005-06-22 03:43:18 +08:00
|
|
|
{
|
2015-03-07 05:47:00 +08:00
|
|
|
struct trie *t = (struct trie *) tb->tb_data;
|
2015-01-01 02:55:29 +08:00
|
|
|
#ifdef CONFIG_IP_FIB_TRIE_STATS
|
|
|
|
struct trie_use_stats __percpu *stats = t->stats;
|
|
|
|
#endif
|
2015-01-01 02:55:54 +08:00
|
|
|
const t_key key = ntohl(flp->daddr);
|
2015-03-07 01:54:08 +08:00
|
|
|
struct key_vector *n, *pn;
|
2015-02-26 07:31:51 +08:00
|
|
|
struct fib_alias *fa;
|
2015-03-05 07:04:03 +08:00
|
|
|
unsigned long index;
|
2015-01-01 02:55:54 +08:00
|
|
|
t_key cindex;
|
2005-08-10 11:24:39 +08:00
|
|
|
|
2015-03-07 01:54:52 +08:00
|
|
|
pn = t->kv;
|
|
|
|
cindex = 0;
|
|
|
|
|
|
|
|
n = get_child_rcu(pn, cindex);
|
2018-05-24 08:08:47 +08:00
|
|
|
if (!n) {
|
|
|
|
trace_fib_table_lookup(tb->tb_id, flp, NULL, -EAGAIN);
|
2015-01-01 02:56:24 +08:00
|
|
|
return -EAGAIN;
|
2018-05-24 08:08:47 +08:00
|
|
|
}
|
2005-06-22 03:43:18 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_IP_FIB_TRIE_STATS
|
2015-01-01 02:55:29 +08:00
|
|
|
this_cpu_inc(stats->gets);
|
2005-06-22 03:43:18 +08:00
|
|
|
#endif
|
|
|
|
|
2015-01-01 02:55:54 +08:00
|
|
|
/* Step 1: Travel to the longest prefix match in the trie */
|
|
|
|
for (;;) {
|
2015-03-07 01:54:52 +08:00
|
|
|
index = get_cindex(key, n);
|
2015-01-01 02:55:54 +08:00
|
|
|
|
|
|
|
/* This bit of code is a bit tricky but it combines multiple
|
|
|
|
* checks into a single check. The prefix consists of the
|
|
|
|
* prefix plus zeros for the "bits" in the prefix. The index
|
|
|
|
* is the difference between the key and this value. From
|
|
|
|
* this we can actually derive several pieces of data.
|
2015-03-05 07:04:03 +08:00
|
|
|
* if (index >= (1ul << bits))
|
2015-01-01 02:55:54 +08:00
|
|
|
* we have a mismatch in skip bits and failed
|
2015-01-23 07:51:08 +08:00
|
|
|
* else
|
|
|
|
* we know the value is cindex
|
2015-03-05 07:04:03 +08:00
|
|
|
*
|
|
|
|
* This check is safe even if bits == KEYLENGTH due to the
|
|
|
|
* fact that we can only allocate a node with 32 bits if a
|
|
|
|
* long is greater than 32 bits.
|
2015-01-01 02:55:54 +08:00
|
|
|
*/
|
2015-03-05 07:04:03 +08:00
|
|
|
if (index >= (1ul << n->bits))
|
2015-01-01 02:55:54 +08:00
|
|
|
break;
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2015-01-01 02:55:54 +08:00
|
|
|
/* we have found a leaf. Prefixes have already been compared */
|
|
|
|
if (IS_LEAF(n))
|
2008-01-23 13:53:36 +08:00
|
|
|
goto found;
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2015-01-01 02:55:54 +08:00
|
|
|
/* only record pn and cindex if we are going to be chopping
|
|
|
|
* bits later. Otherwise we are just wasting cycles.
|
2005-08-10 11:24:39 +08:00
|
|
|
*/
|
2015-01-01 02:57:08 +08:00
|
|
|
if (n->slen > n->pos) {
|
2015-01-01 02:55:54 +08:00
|
|
|
pn = n;
|
|
|
|
cindex = index;
|
2005-08-10 11:24:39 +08:00
|
|
|
}
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2015-03-07 01:54:14 +08:00
|
|
|
n = get_child_rcu(n, index);
|
2015-01-01 02:55:54 +08:00
|
|
|
if (unlikely(!n))
|
|
|
|
goto backtrace;
|
|
|
|
}
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2015-01-01 02:55:54 +08:00
|
|
|
/* Step 2: Sort out leaves and begin backtracing for longest prefix */
|
|
|
|
for (;;) {
|
|
|
|
/* record the pointer where our next node pointer is stored */
|
2015-03-07 01:54:08 +08:00
|
|
|
struct key_vector __rcu **cptr = n->tnode;
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2015-01-01 02:55:54 +08:00
|
|
|
/* This test verifies that none of the bits that differ
|
|
|
|
* between the key and the prefix exist in the region of
|
|
|
|
* the lsb and higher in the prefix.
|
2005-08-10 11:24:39 +08:00
|
|
|
*/
|
2015-01-01 02:57:08 +08:00
|
|
|
if (unlikely(prefix_mismatch(key, n)) || (n->slen == n->pos))
|
2015-01-01 02:55:54 +08:00
|
|
|
goto backtrace;
|
2005-08-10 11:24:39 +08:00
|
|
|
|
2015-01-01 02:55:54 +08:00
|
|
|
/* exit out and process leaf */
|
|
|
|
if (unlikely(IS_LEAF(n)))
|
|
|
|
break;
|
2005-08-10 11:24:39 +08:00
|
|
|
|
2015-01-01 02:55:54 +08:00
|
|
|
/* Don't bother recording parent info. Since we are in
|
|
|
|
* prefix match mode we will have to come back to wherever
|
|
|
|
* we started this traversal anyway
|
2005-08-10 11:24:39 +08:00
|
|
|
*/
|
|
|
|
|
2015-01-01 02:55:54 +08:00
|
|
|
while ((n = rcu_dereference(*cptr)) == NULL) {
|
2005-06-22 03:43:18 +08:00
|
|
|
backtrace:
|
|
|
|
#ifdef CONFIG_IP_FIB_TRIE_STATS
|
2015-01-01 02:55:54 +08:00
|
|
|
if (!n)
|
|
|
|
this_cpu_inc(stats->null_node_hit);
|
2005-06-22 03:43:18 +08:00
|
|
|
#endif
|
2015-01-01 02:55:54 +08:00
|
|
|
/* If we are at cindex 0 there are no more bits for
|
|
|
|
* us to strip at this level so we must ascend back
|
|
|
|
* up one level to see if there are any more bits to
|
|
|
|
* be stripped there.
|
|
|
|
*/
|
|
|
|
while (!cindex) {
|
|
|
|
t_key pkey = pn->key;
|
|
|
|
|
2015-03-07 01:54:52 +08:00
|
|
|
/* If we don't have a parent then there is
|
|
|
|
* nothing for us to do as we do not have any
|
|
|
|
* further nodes to parse.
|
|
|
|
*/
|
2018-05-24 08:08:47 +08:00
|
|
|
if (IS_TRIE(pn)) {
|
|
|
|
trace_fib_table_lookup(tb->tb_id, flp,
|
|
|
|
NULL, -EAGAIN);
|
2015-01-01 02:56:24 +08:00
|
|
|
return -EAGAIN;
|
2018-05-24 08:08:47 +08:00
|
|
|
}
|
2015-01-01 02:55:54 +08:00
|
|
|
#ifdef CONFIG_IP_FIB_TRIE_STATS
|
|
|
|
this_cpu_inc(stats->backtrack);
|
|
|
|
#endif
|
|
|
|
/* Get Child's index */
|
2015-03-07 01:54:52 +08:00
|
|
|
pn = node_parent_rcu(pn);
|
2015-01-01 02:55:54 +08:00
|
|
|
cindex = get_index(pkey, pn);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* strip the least significant bit from the cindex */
|
|
|
|
cindex &= cindex - 1;
|
|
|
|
|
|
|
|
/* grab pointer for next child node */
|
2015-03-05 07:02:33 +08:00
|
|
|
cptr = &pn->tnode[cindex];
|
2005-07-20 05:01:51 +08:00
|
|
|
}
|
2005-06-22 03:43:18 +08:00
|
|
|
}
|
2015-01-01 02:55:54 +08:00
|
|
|
|
2005-06-22 03:43:18 +08:00
|
|
|
found:
|
2015-03-05 07:04:03 +08:00
|
|
|
/* this line carries forward the xor from earlier in the function */
|
|
|
|
index = key ^ n->key;
|
|
|
|
|
2015-01-01 02:55:54 +08:00
|
|
|
/* Step 3: Process the leaf, if that fails fall back to backtracing */
|
2015-02-26 07:31:51 +08:00
|
|
|
hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
|
|
|
|
struct fib_info *fi = fa->fa_info;
|
2020-05-27 02:56:17 +08:00
|
|
|
struct fib_nh_common *nhc;
|
2015-02-26 07:31:51 +08:00
|
|
|
int nhsel, err;
|
2015-01-01 02:56:24 +08:00
|
|
|
|
2016-01-29 05:42:24 +08:00
|
|
|
if ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen < KEYLENGTH)) {
|
|
|
|
if (index >= (1ul << fa->fa_slen))
|
|
|
|
continue;
|
|
|
|
}
|
2015-02-26 07:31:51 +08:00
|
|
|
if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
|
|
|
|
continue;
|
|
|
|
if (fi->fib_dead)
|
|
|
|
continue;
|
|
|
|
if (fa->fa_info->fib_scope < flp->flowi4_scope)
|
|
|
|
continue;
|
|
|
|
fib_alias_accessed(fa);
|
|
|
|
err = fib_props[fa->fa_type].error;
|
|
|
|
if (unlikely(err < 0)) {
|
2019-06-04 11:19:51 +08:00
|
|
|
out_reject:
|
2015-01-01 02:56:24 +08:00
|
|
|
#ifdef CONFIG_IP_FIB_TRIE_STATS
|
2015-02-26 07:31:51 +08:00
|
|
|
this_cpu_inc(stats->semantic_match_passed);
|
2015-01-01 02:56:24 +08:00
|
|
|
#endif
|
2018-05-24 08:08:47 +08:00
|
|
|
trace_fib_table_lookup(tb->tb_id, flp, NULL, err);
|
2015-02-26 07:31:51 +08:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
if (fi->fib_flags & RTNH_F_DEAD)
|
|
|
|
continue;
|
2019-06-04 11:19:51 +08:00
|
|
|
|
2020-05-27 02:56:17 +08:00
|
|
|
if (unlikely(fi->nh)) {
|
|
|
|
if (nexthop_is_blackhole(fi->nh)) {
|
|
|
|
err = fib_props[RTN_BLACKHOLE].error;
|
|
|
|
goto out_reject;
|
|
|
|
}
|
|
|
|
|
|
|
|
nhc = nexthop_get_nhc_lookup(fi->nh, fib_flags, flp,
|
|
|
|
&nhsel);
|
|
|
|
if (nhc)
|
|
|
|
goto set_result;
|
|
|
|
goto miss;
|
2019-06-04 11:19:51 +08:00
|
|
|
}
|
|
|
|
|
2019-06-04 11:19:49 +08:00
|
|
|
for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
|
2020-05-27 02:56:17 +08:00
|
|
|
nhc = fib_info_nhc(fi, nhsel);
|
2015-02-26 07:31:51 +08:00
|
|
|
|
2020-05-27 02:56:17 +08:00
|
|
|
if (!fib_lookup_good_nhc(nhc, fib_flags, flp))
|
2015-02-26 07:31:51 +08:00
|
|
|
continue;
|
2020-05-27 02:56:17 +08:00
|
|
|
set_result:
|
2015-02-26 07:31:51 +08:00
|
|
|
if (!(fib_flags & FIB_LOOKUP_NOREF))
|
2017-07-04 14:35:02 +08:00
|
|
|
refcount_inc(&fi->fib_clntref);
|
2015-02-26 07:31:51 +08:00
|
|
|
|
2017-05-26 01:42:37 +08:00
|
|
|
res->prefix = htonl(n->key);
|
2015-02-26 07:31:51 +08:00
|
|
|
res->prefixlen = KEYLENGTH - fa->fa_slen;
|
|
|
|
res->nh_sel = nhsel;
|
2019-04-03 05:11:55 +08:00
|
|
|
res->nhc = nhc;
|
2015-02-26 07:31:51 +08:00
|
|
|
res->type = fa->fa_type;
|
|
|
|
res->scope = fi->fib_scope;
|
|
|
|
res->fi = fi;
|
|
|
|
res->table = tb;
|
|
|
|
res->fa_head = &n->leaf;
|
2015-01-01 02:56:24 +08:00
|
|
|
#ifdef CONFIG_IP_FIB_TRIE_STATS
|
2015-02-26 07:31:51 +08:00
|
|
|
this_cpu_inc(stats->semantic_match_passed);
|
2015-01-01 02:56:24 +08:00
|
|
|
#endif
|
2019-04-03 05:11:55 +08:00
|
|
|
trace_fib_table_lookup(tb->tb_id, flp, nhc, err);
|
2015-08-28 23:42:09 +08:00
|
|
|
|
2015-02-26 07:31:51 +08:00
|
|
|
return err;
|
2015-01-01 02:56:24 +08:00
|
|
|
}
|
2015-02-26 07:31:44 +08:00
|
|
|
}
|
2020-05-27 02:56:17 +08:00
|
|
|
miss:
|
2015-01-01 02:56:24 +08:00
|
|
|
#ifdef CONFIG_IP_FIB_TRIE_STATS
|
2015-02-26 07:31:44 +08:00
|
|
|
this_cpu_inc(stats->semantic_match_miss);
|
2015-01-01 02:56:24 +08:00
|
|
|
#endif
|
|
|
|
goto backtrace;
|
2005-06-22 03:43:18 +08:00
|
|
|
}
|
2011-08-25 19:46:12 +08:00
|
|
|
EXPORT_SYMBOL_GPL(fib_table_lookup);
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2015-03-07 01:54:08 +08:00
|
|
|
static void fib_remove_alias(struct trie *t, struct key_vector *tp,
|
|
|
|
struct key_vector *l, struct fib_alias *old)
|
2015-03-05 07:02:18 +08:00
|
|
|
{
|
|
|
|
/* record the location of the previous list_info entry */
|
|
|
|
struct hlist_node **pprev = old->fa_list.pprev;
|
|
|
|
struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next);
|
|
|
|
|
|
|
|
/* remove the fib_alias from the list */
|
|
|
|
hlist_del_rcu(&old->fa_list);
|
|
|
|
|
|
|
|
/* if we emptied the list this leaf will be freed and we can sort
|
|
|
|
* out parent suffix lengths as a part of trie_rebalance
|
|
|
|
*/
|
|
|
|
if (hlist_empty(&l->leaf)) {
|
2016-12-01 20:27:57 +08:00
|
|
|
if (tp->slen == l->slen)
|
|
|
|
node_pull_suffix(tp, tp->pos);
|
2015-03-07 01:54:52 +08:00
|
|
|
put_child_root(tp, l->key, NULL);
|
2015-03-05 07:02:18 +08:00
|
|
|
node_free(l);
|
|
|
|
trie_rebalance(t, tp);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* only access fa if it is pointing at the last valid hlist_node */
|
|
|
|
if (*pprev)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/* update the trie with the latest suffix length */
|
|
|
|
l->slen = fa->fa_slen;
|
2016-12-01 20:27:52 +08:00
|
|
|
node_pull_suffix(tp, fa->fa_slen);
|
2015-03-05 07:02:18 +08:00
|
|
|
}
|
|
|
|
|
2019-12-14 23:53:11 +08:00
|
|
|
/* Tell the FIB entry notifiers that @fa_to_delete is going away.
 *
 * @net:          namespace handed to call_fib_entry_notifiers()
 * @key:          key of the leaf holding the alias
 * @fah:          alias list of that leaf
 * @fa_to_delete: alias about to be removed
 * @extack:       netlink extended ack passed through to the notifiers
 *
 * Nothing is sent unless fib_find_alias() (called with zeros for its
 * third and fourth arguments and true for its last) returns
 * @fa_to_delete itself.
 */
static void fib_notify_alias_delete(struct net *net, u32 key,
				    struct hlist_head *fah,
				    struct fib_alias *fa_to_delete,
				    struct netlink_ext_ack *extack)
{
	/* default: plain deletion of the alias itself */
	enum fib_event_type event = FIB_EVENT_ENTRY_DEL;
	struct fib_alias *subject = fa_to_delete;
	struct fib_alias *successor;
	u32 tb_id = fa_to_delete->tb_id;
	u8 slen = fa_to_delete->fa_slen;

	/* Do not notify if we do not care about the route. */
	if (fib_find_alias(fah, slen, 0, 0, tb_id, true) != fa_to_delete)
		return;

	/* If the next alias in the list has the same suffix length and
	 * table id, report it as a replacement instead of a deletion.
	 */
	successor = hlist_entry_safe(fa_to_delete->fa_list.next,
				     struct fib_alias, fa_list);
	if (successor && successor->fa_slen == slen &&
	    successor->tb_id == tb_id) {
		event = FIB_EVENT_ENTRY_REPLACE;
		subject = successor;
	}

	call_fib_entry_notifiers(net, event, key, KEYLENGTH - slen,
				 subject, extack);
}
|
|
|
|
|
2015-03-05 07:02:18 +08:00
|
|
|
/* Caller must hold RTNL. */
|
2016-09-26 18:52:29 +08:00
|
|
|
int fib_table_delete(struct net *net, struct fib_table *tb,
|
2017-05-28 06:19:26 +08:00
|
|
|
struct fib_config *cfg, struct netlink_ext_ack *extack)
|
2005-06-22 03:43:18 +08:00
|
|
|
{
|
|
|
|
struct trie *t = (struct trie *) tb->tb_data;
|
|
|
|
struct fib_alias *fa, *fa_to_delete;
|
2015-03-07 01:54:08 +08:00
|
|
|
struct key_vector *l, *tp;
|
2015-02-26 07:31:51 +08:00
|
|
|
u8 plen = cfg->fc_dst_len;
|
|
|
|
u8 slen = KEYLENGTH - plen;
|
2015-03-05 07:01:59 +08:00
|
|
|
u8 tos = cfg->fc_tos;
|
|
|
|
u32 key;
|
2005-08-10 11:24:39 +08:00
|
|
|
|
2006-08-18 09:14:52 +08:00
|
|
|
key = ntohl(cfg->fc_dst);
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2017-05-28 06:19:26 +08:00
|
|
|
if (!fib_valid_key_len(key, plen, extack))
|
2005-06-22 03:43:18 +08:00
|
|
|
return -EINVAL;
|
|
|
|
|
2015-03-05 07:01:59 +08:00
|
|
|
l = fib_find_node(t, &tp, key);
|
2005-07-20 05:01:51 +08:00
|
|
|
if (!l)
|
2005-06-22 03:43:18 +08:00
|
|
|
return -ESRCH;
|
|
|
|
|
2019-12-14 23:53:08 +08:00
|
|
|
fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id, false);
|
2005-06-22 03:43:18 +08:00
|
|
|
if (!fa)
|
|
|
|
return -ESRCH;
|
|
|
|
|
2005-08-24 12:59:41 +08:00
|
|
|
pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
|
2005-06-22 03:43:18 +08:00
|
|
|
|
|
|
|
fa_to_delete = NULL;
|
2015-02-26 07:31:31 +08:00
|
|
|
hlist_for_each_entry_from(fa, fa_list) {
|
2005-06-22 03:43:18 +08:00
|
|
|
struct fib_info *fi = fa->fa_info;
|
|
|
|
|
2015-03-13 05:46:29 +08:00
|
|
|
if ((fa->fa_slen != slen) ||
|
|
|
|
(fa->tb_id != tb->tb_id) ||
|
|
|
|
(fa->fa_tos != tos))
|
2005-06-22 03:43:18 +08:00
|
|
|
break;
|
|
|
|
|
2006-08-18 09:14:52 +08:00
|
|
|
if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
|
|
|
|
(cfg->fc_scope == RT_SCOPE_NOWHERE ||
|
2011-03-25 09:06:47 +08:00
|
|
|
fa->fa_info->fib_scope == cfg->fc_scope) &&
|
2011-03-19 20:13:46 +08:00
|
|
|
(!cfg->fc_prefsrc ||
|
|
|
|
fi->fib_prefsrc == cfg->fc_prefsrc) &&
|
2006-08-18 09:14:52 +08:00
|
|
|
(!cfg->fc_protocol ||
|
|
|
|
fi->fib_protocol == cfg->fc_protocol) &&
|
2020-03-28 06:00:21 +08:00
|
|
|
fib_nh_match(net, cfg, fi, extack) == 0 &&
|
2017-08-23 10:07:26 +08:00
|
|
|
fib_metrics_match(cfg, fi)) {
|
2005-06-22 03:43:18 +08:00
|
|
|
fa_to_delete = fa;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-08-10 11:24:39 +08:00
|
|
|
if (!fa_to_delete)
|
|
|
|
return -ESRCH;
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2019-12-14 23:53:11 +08:00
|
|
|
fib_notify_alias_delete(net, key, &l->leaf, fa_to_delete, extack);
|
2015-03-05 07:02:18 +08:00
|
|
|
rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
|
2007-05-24 05:55:06 +08:00
|
|
|
&cfg->fc_nlinfo, 0);
|
2005-08-10 11:24:39 +08:00
|
|
|
|
2011-04-15 05:49:37 +08:00
|
|
|
if (!plen)
|
|
|
|
tb->tb_num_default--;
|
|
|
|
|
2015-03-05 07:02:18 +08:00
|
|
|
fib_remove_alias(t, tp, l, fa_to_delete);
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2015-03-05 07:02:18 +08:00
|
|
|
if (fa_to_delete->fa_state & FA_S_ACCESSED)
|
2012-09-07 08:45:29 +08:00
|
|
|
rt_cache_flush(cfg->fc_nlinfo.nl_net);
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2015-03-05 07:02:18 +08:00
|
|
|
fib_release_info(fa_to_delete->fa_info);
|
|
|
|
alias_free_mem_rcu(fa_to_delete);
|
2005-08-10 11:24:39 +08:00
|
|
|
return 0;
|
2005-06-22 03:43:18 +08:00
|
|
|
}
|
|
|
|
|
2015-03-05 06:59:19 +08:00
|
|
|
/* Scan for the next leaf starting at the provided key value */
|
2015-03-07 01:54:08 +08:00
|
|
|
static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key)
|
2005-06-22 03:43:18 +08:00
|
|
|
{
|
2015-03-07 01:54:08 +08:00
|
|
|
struct key_vector *pn, *n = *tn;
|
2015-03-05 06:59:19 +08:00
|
|
|
unsigned long cindex;
|
2008-01-23 13:55:32 +08:00
|
|
|
|
2015-03-05 06:59:19 +08:00
|
|
|
/* this loop is meant to try and find the key in the trie */
|
2015-03-07 01:54:52 +08:00
|
|
|
do {
|
2015-03-05 06:59:19 +08:00
|
|
|
/* record parent and next child index */
|
|
|
|
pn = n;
|
2015-10-28 06:06:45 +08:00
|
|
|
cindex = (key > pn->key) ? get_index(key, pn) : 0;
|
2015-03-07 01:54:52 +08:00
|
|
|
|
|
|
|
if (cindex >> pn->bits)
|
|
|
|
break;
|
2008-01-23 13:55:32 +08:00
|
|
|
|
2015-03-05 06:59:19 +08:00
|
|
|
/* descend into the next child */
|
2015-03-07 01:54:14 +08:00
|
|
|
n = get_child_rcu(pn, cindex++);
|
2015-03-07 01:54:52 +08:00
|
|
|
if (!n)
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* guarantee forward progress on the keys */
|
|
|
|
if (IS_LEAF(n) && (n->key >= key))
|
|
|
|
goto found;
|
|
|
|
} while (IS_TNODE(n));
|
2008-01-23 13:55:32 +08:00
|
|
|
|
2015-03-05 06:59:19 +08:00
|
|
|
/* this loop will search for the next leaf with a greater key */
|
2015-03-07 01:54:52 +08:00
|
|
|
while (!IS_TRIE(pn)) {
|
2015-03-05 06:59:19 +08:00
|
|
|
/* if we exhausted the parent node we will need to climb */
|
|
|
|
if (cindex >= (1ul << pn->bits)) {
|
|
|
|
t_key pkey = pn->key;
|
2008-01-23 13:55:32 +08:00
|
|
|
|
2015-03-05 06:59:19 +08:00
|
|
|
pn = node_parent_rcu(pn);
|
|
|
|
cindex = get_index(pkey, pn) + 1;
|
|
|
|
continue;
|
|
|
|
}
|
2008-01-23 13:55:32 +08:00
|
|
|
|
2015-03-05 06:59:19 +08:00
|
|
|
/* grab the next available node */
|
2015-03-07 01:54:14 +08:00
|
|
|
n = get_child_rcu(pn, cindex++);
|
2015-03-05 06:59:19 +08:00
|
|
|
if (!n)
|
|
|
|
continue;
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2015-03-05 06:59:19 +08:00
|
|
|
/* no need to compare keys since we bumped the index */
|
|
|
|
if (IS_LEAF(n))
|
|
|
|
goto found;
|
2008-02-01 08:45:47 +08:00
|
|
|
|
2015-03-05 06:59:19 +08:00
|
|
|
/* Rescan start scanning in new node */
|
|
|
|
pn = n;
|
|
|
|
cindex = 0;
|
|
|
|
}
|
2008-02-12 13:12:49 +08:00
|
|
|
|
2015-03-05 06:59:19 +08:00
|
|
|
*tn = pn;
|
|
|
|
return NULL; /* Root of trie */
|
|
|
|
found:
|
|
|
|
/* if we are at the limit for keys just return NULL for the tnode */
|
2015-03-07 01:54:52 +08:00
|
|
|
*tn = pn;
|
2015-03-05 06:59:19 +08:00
|
|
|
return n;
|
2008-02-01 08:45:47 +08:00
|
|
|
}
|
|
|
|
|
2015-03-07 05:47:00 +08:00
|
|
|
static void fib_trie_free(struct fib_table *tb)
|
|
|
|
{
|
|
|
|
struct trie *t = (struct trie *)tb->tb_data;
|
|
|
|
struct key_vector *pn = t->kv;
|
|
|
|
unsigned long cindex = 1;
|
|
|
|
struct hlist_node *tmp;
|
|
|
|
struct fib_alias *fa;
|
|
|
|
|
|
|
|
/* walk trie in reverse order and free everything */
|
|
|
|
for (;;) {
|
|
|
|
struct key_vector *n;
|
|
|
|
|
|
|
|
if (!(cindex--)) {
|
|
|
|
t_key pkey = pn->key;
|
|
|
|
|
|
|
|
if (IS_TRIE(pn))
|
|
|
|
break;
|
|
|
|
|
|
|
|
n = pn;
|
|
|
|
pn = node_parent(pn);
|
|
|
|
|
|
|
|
/* drop emptied tnode */
|
|
|
|
put_child_root(pn, n->key, NULL);
|
|
|
|
node_free(n);
|
|
|
|
|
|
|
|
cindex = get_index(pkey, pn);
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* grab the next available node */
|
|
|
|
n = get_child(pn, cindex);
|
|
|
|
if (!n)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (IS_TNODE(n)) {
|
|
|
|
/* record pn and cindex for leaf walking */
|
|
|
|
pn = n;
|
|
|
|
cindex = 1ul << n->bits;
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
|
|
|
|
hlist_del_rcu(&fa->fa_list);
|
|
|
|
alias_free_mem_rcu(fa);
|
|
|
|
}
|
|
|
|
|
|
|
|
put_child_root(pn, n->key, NULL);
|
|
|
|
node_free(n);
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_IP_FIB_TRIE_STATS
|
|
|
|
free_percpu(t->stats);
|
|
|
|
#endif
|
|
|
|
kfree(tb);
|
|
|
|
}
|
|
|
|
|
|
|
|
struct fib_table *fib_trie_unmerge(struct fib_table *oldtb)
|
|
|
|
{
|
|
|
|
struct trie *ot = (struct trie *)oldtb->tb_data;
|
|
|
|
struct key_vector *l, *tp = ot->kv;
|
|
|
|
struct fib_table *local_tb;
|
|
|
|
struct fib_alias *fa;
|
|
|
|
struct trie *lt;
|
|
|
|
t_key key = 0;
|
|
|
|
|
|
|
|
if (oldtb->tb_data == oldtb->__data)
|
|
|
|
return oldtb;
|
|
|
|
|
|
|
|
local_tb = fib_trie_table(RT_TABLE_LOCAL, NULL);
|
|
|
|
if (!local_tb)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
lt = (struct trie *)local_tb->tb_data;
|
|
|
|
|
|
|
|
while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
|
|
|
|
struct key_vector *local_l = NULL, *local_tp;
|
|
|
|
|
2020-07-29 16:37:13 +08:00
|
|
|
hlist_for_each_entry(fa, &l->leaf, fa_list) {
|
2015-03-07 05:47:00 +08:00
|
|
|
struct fib_alias *new_fa;
|
|
|
|
|
|
|
|
if (local_tb->tb_id != fa->tb_id)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
/* clone fa for new local table */
|
|
|
|
new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
|
|
|
|
if (!new_fa)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
memcpy(new_fa, fa, sizeof(*fa));
|
|
|
|
|
|
|
|
/* insert clone into table */
|
|
|
|
if (!local_l)
|
|
|
|
local_l = fib_find_node(lt, &local_tp, l->key);
|
|
|
|
|
|
|
|
if (fib_insert_alias(lt, local_tp, local_l, new_fa,
|
2016-11-15 18:46:12 +08:00
|
|
|
NULL, l->key)) {
|
|
|
|
kmem_cache_free(fn_alias_kmem, new_fa);
|
2015-03-07 05:47:00 +08:00
|
|
|
goto out;
|
2016-11-15 18:46:12 +08:00
|
|
|
}
|
2015-03-07 05:47:00 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* stop loop if key wrapped back to 0 */
|
|
|
|
key = l->key + 1;
|
|
|
|
if (key < l->key)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return local_tb;
|
|
|
|
out:
|
|
|
|
fib_trie_free(local_tb);
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2016-11-15 18:46:06 +08:00
|
|
|
/* Caller must hold RTNL */
|
|
|
|
void fib_table_flush_external(struct fib_table *tb)
|
|
|
|
{
|
|
|
|
struct trie *t = (struct trie *)tb->tb_data;
|
|
|
|
struct key_vector *pn = t->kv;
|
|
|
|
unsigned long cindex = 1;
|
|
|
|
struct hlist_node *tmp;
|
|
|
|
struct fib_alias *fa;
|
|
|
|
|
|
|
|
/* walk trie in reverse order */
|
|
|
|
for (;;) {
|
|
|
|
unsigned char slen = 0;
|
|
|
|
struct key_vector *n;
|
|
|
|
|
|
|
|
if (!(cindex--)) {
|
|
|
|
t_key pkey = pn->key;
|
|
|
|
|
|
|
|
/* cannot resize the trie vector */
|
|
|
|
if (IS_TRIE(pn))
|
|
|
|
break;
|
|
|
|
|
2016-12-01 20:27:57 +08:00
|
|
|
/* update the suffix to address pulled leaves */
|
|
|
|
if (pn->slen > pn->pos)
|
|
|
|
update_suffix(pn);
|
|
|
|
|
2016-11-15 18:46:06 +08:00
|
|
|
/* resize completed node */
|
|
|
|
pn = resize(t, pn);
|
|
|
|
cindex = get_index(pkey, pn);
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* grab the next available node */
|
|
|
|
n = get_child(pn, cindex);
|
|
|
|
if (!n)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (IS_TNODE(n)) {
|
|
|
|
/* record pn and cindex for leaf walking */
|
|
|
|
pn = n;
|
|
|
|
cindex = 1ul << n->bits;
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
|
|
|
|
/* if alias was cloned to local then we just
|
|
|
|
* need to remove the local copy from main
|
|
|
|
*/
|
|
|
|
if (tb->tb_id != fa->tb_id) {
|
|
|
|
hlist_del_rcu(&fa->fa_list);
|
|
|
|
alias_free_mem_rcu(fa);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* record local slen */
|
|
|
|
slen = fa->fa_slen;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* update leaf slen */
|
|
|
|
n->slen = slen;
|
|
|
|
|
|
|
|
if (hlist_empty(&n->leaf)) {
|
|
|
|
put_child_root(pn, n->key, NULL);
|
|
|
|
node_free(n);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-03-05 06:59:19 +08:00
|
|
|
/* Caller must hold RTNL. */
|
2019-01-09 17:57:39 +08:00
|
|
|
int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all)
|
2005-06-22 03:43:18 +08:00
|
|
|
{
|
2015-03-05 06:58:19 +08:00
|
|
|
struct trie *t = (struct trie *)tb->tb_data;
|
2015-03-07 01:54:52 +08:00
|
|
|
struct key_vector *pn = t->kv;
|
|
|
|
unsigned long cindex = 1;
|
2015-03-05 06:58:19 +08:00
|
|
|
struct hlist_node *tmp;
|
|
|
|
struct fib_alias *fa;
|
2008-01-23 13:55:32 +08:00
|
|
|
int found = 0;
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2015-03-07 01:54:52 +08:00
|
|
|
/* walk trie in reverse order */
|
|
|
|
for (;;) {
|
|
|
|
unsigned char slen = 0;
|
|
|
|
struct key_vector *n;
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2015-03-07 01:54:52 +08:00
|
|
|
if (!(cindex--)) {
|
|
|
|
t_key pkey = pn->key;
|
2015-03-05 06:58:19 +08:00
|
|
|
|
2015-03-07 01:54:52 +08:00
|
|
|
/* cannot resize the trie vector */
|
|
|
|
if (IS_TRIE(pn))
|
|
|
|
break;
|
2015-03-05 06:58:19 +08:00
|
|
|
|
2016-12-01 20:27:57 +08:00
|
|
|
/* update the suffix to address pulled leaves */
|
|
|
|
if (pn->slen > pn->pos)
|
|
|
|
update_suffix(pn);
|
|
|
|
|
2015-03-07 01:54:52 +08:00
|
|
|
/* resize completed node */
|
|
|
|
pn = resize(t, pn);
|
|
|
|
cindex = get_index(pkey, pn);
|
2015-03-05 06:58:19 +08:00
|
|
|
|
2015-03-07 01:54:52 +08:00
|
|
|
continue;
|
|
|
|
}
|
2015-03-05 06:58:19 +08:00
|
|
|
|
2015-03-07 01:54:52 +08:00
|
|
|
/* grab the next available node */
|
|
|
|
n = get_child(pn, cindex);
|
|
|
|
if (!n)
|
|
|
|
continue;
|
2015-03-05 06:58:19 +08:00
|
|
|
|
2015-03-07 01:54:52 +08:00
|
|
|
if (IS_TNODE(n)) {
|
|
|
|
/* record pn and cindex for leaf walking */
|
|
|
|
pn = n;
|
|
|
|
cindex = 1ul << n->bits;
|
2015-03-05 06:58:19 +08:00
|
|
|
|
2015-03-07 01:54:52 +08:00
|
|
|
continue;
|
|
|
|
}
|
2015-03-05 06:58:19 +08:00
|
|
|
|
2015-03-07 01:54:52 +08:00
|
|
|
hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
|
|
|
|
struct fib_info *fi = fa->fa_info;
|
2015-03-05 06:58:19 +08:00
|
|
|
|
2019-01-09 17:57:39 +08:00
|
|
|
if (!fi || tb->tb_id != fa->tb_id ||
|
|
|
|
(!(fi->fib_flags & RTNH_F_DEAD) &&
|
|
|
|
!fib_props[fa->fa_type].error)) {
|
|
|
|
slen = fa->fa_slen;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Do not flush error routes if network namespace is
|
|
|
|
* not being dismantled
|
|
|
|
*/
|
|
|
|
if (!flush_all && fib_props[fa->fa_type].error) {
|
2015-03-07 01:54:52 +08:00
|
|
|
slen = fa->fa_slen;
|
|
|
|
continue;
|
|
|
|
}
|
2015-03-05 06:58:19 +08:00
|
|
|
|
2019-12-14 23:53:12 +08:00
|
|
|
fib_notify_alias_delete(net, n->key, &n->leaf, fa,
|
|
|
|
NULL);
|
2015-03-05 06:58:19 +08:00
|
|
|
hlist_del_rcu(&fa->fa_list);
|
|
|
|
fib_release_info(fa->fa_info);
|
|
|
|
alias_free_mem_rcu(fa);
|
|
|
|
found++;
|
2015-01-23 07:51:45 +08:00
|
|
|
}
|
|
|
|
|
2015-03-07 01:54:52 +08:00
|
|
|
/* update leaf slen */
|
|
|
|
n->slen = slen;
|
2015-03-05 06:58:19 +08:00
|
|
|
|
2015-03-07 01:54:52 +08:00
|
|
|
if (hlist_empty(&n->leaf)) {
|
|
|
|
put_child_root(pn, n->key, NULL);
|
|
|
|
node_free(n);
|
|
|
|
}
|
2015-01-23 07:51:45 +08:00
|
|
|
}
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2005-08-24 12:59:41 +08:00
|
|
|
pr_debug("trie_flush found=%d\n", found);
|
2005-06-22 03:43:18 +08:00
|
|
|
return found;
|
|
|
|
}
|
|
|
|
|
2019-05-23 03:04:42 +08:00
|
|
|
/* derived from fib_trie_free */
|
|
|
|
static void __fib_info_notify_update(struct net *net, struct fib_table *tb,
|
|
|
|
struct nl_info *info)
|
|
|
|
{
|
|
|
|
struct trie *t = (struct trie *)tb->tb_data;
|
|
|
|
struct key_vector *pn = t->kv;
|
|
|
|
unsigned long cindex = 1;
|
|
|
|
struct fib_alias *fa;
|
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
struct key_vector *n;
|
|
|
|
|
|
|
|
if (!(cindex--)) {
|
|
|
|
t_key pkey = pn->key;
|
|
|
|
|
|
|
|
if (IS_TRIE(pn))
|
|
|
|
break;
|
|
|
|
|
|
|
|
pn = node_parent(pn);
|
|
|
|
cindex = get_index(pkey, pn);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* grab the next available node */
|
|
|
|
n = get_child(pn, cindex);
|
|
|
|
if (!n)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (IS_TNODE(n)) {
|
|
|
|
/* record pn and cindex for leaf walking */
|
|
|
|
pn = n;
|
|
|
|
cindex = 1ul << n->bits;
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
hlist_for_each_entry(fa, &n->leaf, fa_list) {
|
|
|
|
struct fib_info *fi = fa->fa_info;
|
|
|
|
|
|
|
|
if (!fi || !fi->nh_updated || fa->tb_id != tb->tb_id)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
rtmsg_fib(RTM_NEWROUTE, htonl(n->key), fa,
|
|
|
|
KEYLENGTH - fa->fa_slen, tb->tb_id,
|
|
|
|
info, NLM_F_REPLACE);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void fib_info_notify_update(struct net *net, struct nl_info *info)
|
|
|
|
{
|
|
|
|
unsigned int h;
|
|
|
|
|
|
|
|
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
|
|
|
|
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
|
|
|
|
struct fib_table *tb;
|
|
|
|
|
2020-08-27 00:48:10 +08:00
|
|
|
hlist_for_each_entry_rcu(tb, head, tb_hlist,
|
|
|
|
lockdep_rtnl_is_held())
|
2019-05-23 03:04:42 +08:00
|
|
|
__fib_info_notify_update(net, tb, info);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-10-03 17:49:28 +08:00
|
|
|
static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb,
|
2019-10-03 17:49:30 +08:00
|
|
|
struct notifier_block *nb,
|
|
|
|
struct netlink_ext_ack *extack)
|
2016-12-03 23:45:07 +08:00
|
|
|
{
|
|
|
|
struct fib_alias *fa;
|
2019-12-14 23:53:13 +08:00
|
|
|
int last_slen = -1;
|
2019-10-03 17:49:28 +08:00
|
|
|
int err;
|
2016-12-03 23:45:07 +08:00
|
|
|
|
|
|
|
hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
|
|
|
|
struct fib_info *fi = fa->fa_info;
|
|
|
|
|
|
|
|
if (!fi)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
/* local and main table can share the same trie,
|
|
|
|
* so don't notify twice for the same entry.
|
|
|
|
*/
|
|
|
|
if (tb->tb_id != fa->tb_id)
|
|
|
|
continue;
|
|
|
|
|
2019-12-14 23:53:13 +08:00
|
|
|
if (fa->fa_slen == last_slen)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
last_slen = fa->fa_slen;
|
2019-12-14 23:53:15 +08:00
|
|
|
err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_REPLACE,
|
2019-12-14 23:53:13 +08:00
|
|
|
l->key, KEYLENGTH - fa->fa_slen,
|
|
|
|
fa, extack);
|
|
|
|
if (err)
|
|
|
|
return err;
|
2016-12-03 23:45:07 +08:00
|
|
|
}
|
2019-10-03 17:49:28 +08:00
|
|
|
return 0;
|
2016-12-03 23:45:07 +08:00
|
|
|
}
|
|
|
|
|
2019-10-03 17:49:30 +08:00
|
|
|
static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb,
|
|
|
|
struct netlink_ext_ack *extack)
|
2016-12-03 23:45:07 +08:00
|
|
|
{
|
|
|
|
struct trie *t = (struct trie *)tb->tb_data;
|
|
|
|
struct key_vector *l, *tp = t->kv;
|
|
|
|
t_key key = 0;
|
2019-10-03 17:49:28 +08:00
|
|
|
int err;
|
2016-12-03 23:45:07 +08:00
|
|
|
|
|
|
|
while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
|
2019-10-03 17:49:30 +08:00
|
|
|
err = fib_leaf_notify(l, tb, nb, extack);
|
2019-10-03 17:49:28 +08:00
|
|
|
if (err)
|
|
|
|
return err;
|
2016-12-03 23:45:07 +08:00
|
|
|
|
|
|
|
key = l->key + 1;
|
|
|
|
/* stop in case of wrap around */
|
|
|
|
if (key < l->key)
|
|
|
|
break;
|
|
|
|
}
|
2019-10-03 17:49:28 +08:00
|
|
|
return 0;
|
2016-12-03 23:45:07 +08:00
|
|
|
}
|
|
|
|
|
2019-10-03 17:49:30 +08:00
|
|
|
int fib_notify(struct net *net, struct notifier_block *nb,
|
|
|
|
struct netlink_ext_ack *extack)
|
2016-12-03 23:45:07 +08:00
|
|
|
{
|
|
|
|
unsigned int h;
|
2019-10-03 17:49:28 +08:00
|
|
|
int err;
|
2016-12-03 23:45:07 +08:00
|
|
|
|
|
|
|
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
|
|
|
|
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
|
|
|
|
struct fib_table *tb;
|
|
|
|
|
2019-10-03 17:49:28 +08:00
|
|
|
hlist_for_each_entry_rcu(tb, head, tb_hlist) {
|
2019-10-03 17:49:30 +08:00
|
|
|
err = fib_table_notify(tb, nb, extack);
|
2019-10-03 17:49:28 +08:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
}
|
2016-12-03 23:45:07 +08:00
|
|
|
}
|
2019-10-03 17:49:28 +08:00
|
|
|
return 0;
|
2016-12-03 23:45:07 +08:00
|
|
|
}
|
|
|
|
|
2015-03-05 07:02:44 +08:00
|
|
|
static void __trie_free_rcu(struct rcu_head *head)
|
2010-10-28 10:00:43 +08:00
|
|
|
{
|
2015-03-05 07:02:44 +08:00
|
|
|
struct fib_table *tb = container_of(head, struct fib_table, rcu);
|
2015-01-01 02:55:29 +08:00
|
|
|
#ifdef CONFIG_IP_FIB_TRIE_STATS
|
|
|
|
struct trie *t = (struct trie *)tb->tb_data;
|
|
|
|
|
2015-03-07 05:47:00 +08:00
|
|
|
if (tb->tb_data == tb->__data)
|
|
|
|
free_percpu(t->stats);
|
2015-01-01 02:55:29 +08:00
|
|
|
#endif /* CONFIG_IP_FIB_TRIE_STATS */
|
2010-10-28 10:00:43 +08:00
|
|
|
kfree(tb);
|
|
|
|
}
|
|
|
|
|
2015-03-05 07:02:44 +08:00
|
|
|
void fib_free_table(struct fib_table *tb)
|
|
|
|
{
|
|
|
|
call_rcu(&tb->rcu, __trie_free_rcu);
|
|
|
|
}
|
|
|
|
|
2015-03-07 01:54:08 +08:00
|
|
|
static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
|
2018-10-16 09:56:43 +08:00
|
|
|
struct sk_buff *skb, struct netlink_callback *cb,
|
|
|
|
struct fib_dump_filter *filter)
|
2005-06-22 03:43:18 +08:00
|
|
|
{
|
2018-10-16 09:56:43 +08:00
|
|
|
unsigned int flags = NLM_F_MULTI;
|
2015-02-26 07:31:51 +08:00
|
|
|
__be32 xkey = htonl(l->key);
|
ipv4: Dump route exceptions if requested
Since commit 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions."), cached
exception routes are stored as a separate entity, so they are not dumped
on a FIB dump, even if the RTM_F_CLONED flag is passed.
This implies that the command 'ip route list cache' doesn't return any
result anymore.
If the RTM_F_CLONED is passed, and strict checking requested, retrieve
nexthop exception routes and dump them. If no strict checking is
requested, filtering can't be performed consistently: dump everything in
that case.
With this, we need to add an argument to the netlink callback in order to
track how many entries were already dumped for the last leaf included in
a partial netlink dump.
A single additional argument is sufficient, even if we traverse logically
nested structures (nexthop objects, hash table buckets, bucket chains): it
doesn't matter if we stop in the middle of any of those, because they are
always traversed the same way. As an example, s_i values in [], s_fa
values in ():
node (fa) #1 [1]
nexthop #1
bucket #1 -> #0 in chain (1)
bucket #2 -> #0 in chain (2) -> #1 in chain (3) -> #2 in chain (4)
bucket #3 -> #0 in chain (5) -> #1 in chain (6)
nexthop #2
bucket #1 -> #0 in chain (7) -> #1 in chain (8)
bucket #2 -> #0 in chain (9)
--
node (fa) #2 [2]
nexthop #1
bucket #1 -> #0 in chain (1) -> #1 in chain (2)
bucket #2 -> #0 in chain (3)
it doesn't matter if we stop at (3), (4), (7) for "node #1", or at (2)
for "node #2": walking flattens all that.
It would even be possible to drop the distinction between the in-tree
(s_i) and in-node (s_fa) counter, but a further improvement might
advise against this. This is only as accurate as the existing tracking
mechanism for leaves: if a partial dump is restarted after exceptions
are removed or expired, we might skip some non-dumped entries.
To improve this, we could attach a 'sernum' attribute (similar to the
one used for IPv6) to nexthop entities, and bump this counter whenever
exceptions change: having a distinction between the two counters would
make this more convenient.
Listing of exception routes (modified routes pre-3.5) was tested against
these versions of kernel and iproute2:
iproute2
kernel 4.14.0 4.15.0 4.19.0 5.0.0 5.1.0
3.5-rc4 + + + + +
4.4
4.9
4.14
4.15
4.19
5.0
5.1
fixed + + + + +
v7:
- Move loop over nexthop objects to route.c, and pass struct fib_info
and table ID to it, not a struct fib_alias (suggested by David Ahern)
- While at it, note that the NULL check on fa->fa_info is redundant,
and the check on RTNH_F_DEAD is also not consistent with what's done
with regular route listing: just keep it for nhc_flags
- Rename entry point function for dumping exceptions to
fib_dump_info_fnhe(), and rearrange arguments for consistency with
fib_dump_info()
- Rename fnhe_dump_buckets() to fnhe_dump_bucket() and make it handle
one bucket at a time
- Expand commit message to describe why we can have a single "skip"
counter for all exceptions stored in bucket chains in nexthop objects
(suggested by David Ahern)
v6:
- Rebased onto net-next
- Loop over nexthop paths too. Move loop over fnhe buckets to route.c,
avoids need to export rt_fill_info() and to touch exceptions from
fib_trie.c. Pass NULL as flow to rt_fill_info(), it now allows that
(suggested by David Ahern)
Fixes: 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions.")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-21 23:45:23 +08:00
|
|
|
int i, s_i, i_fa, s_fa, err;
|
2005-06-22 03:43:18 +08:00
|
|
|
struct fib_alias *fa;
|
|
|
|
|
ipv4: Dump route exceptions if requested
Since commit 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions."), cached
exception routes are stored as a separate entity, so they are not dumped
on a FIB dump, even if the RTM_F_CLONED flag is passed.
This implies that the command 'ip route list cache' doesn't return any
result anymore.
If the RTM_F_CLONED flag is passed and strict checking is requested, retrieve
nexthop exception routes and dump them. If no strict checking is
requested, filtering can't be performed consistently: dump everything in
that case.
With this, we need to add an argument to the netlink callback in order to
track how many entries were already dumped for the last leaf included in
a partial netlink dump.
A single additional argument is sufficient, even if we traverse logically
nested structures (nexthop objects, hash table buckets, bucket chains): it
doesn't matter if we stop in the middle of any of those, because they are
always traversed the same way. As an example, s_i values in [], s_fa
values in ():
node (fa) #1 [1]
nexthop #1
bucket #1 -> #0 in chain (1)
bucket #2 -> #0 in chain (2) -> #1 in chain (3) -> #2 in chain (4)
bucket #3 -> #0 in chain (5) -> #1 in chain (6)
nexthop #2
bucket #1 -> #0 in chain (7) -> #1 in chain (8)
bucket #2 -> #0 in chain (9)
--
node (fa) #2 [2]
nexthop #1
bucket #1 -> #0 in chain (1) -> #1 in chain (2)
bucket #2 -> #0 in chain (3)
it doesn't matter if we stop at (3), (4), (7) for "node #1", or at (2)
for "node #2": walking flattens all that.
It would even be possible to drop the distinction between the in-tree
(s_i) and in-node (s_fa) counter, but a further improvement might
advise against this. This is only as accurate as the existing tracking
mechanism for leaves: if a partial dump is restarted after exceptions
are removed or expired, we might skip some non-dumped entries.
To improve this, we could attach a 'sernum' attribute (similar to the
one used for IPv6) to nexthop entities, and bump this counter whenever
exceptions change: having a distinction between the two counters would
make this more convenient.
Listing of exception routes (modified routes pre-3.5) was tested against
these versions of kernel and iproute2:
iproute2
kernel 4.14.0 4.15.0 4.19.0 5.0.0 5.1.0
3.5-rc4 + + + + +
4.4
4.9
4.14
4.15
4.19
5.0
5.1
fixed + + + + +
v7:
- Move loop over nexthop objects to route.c, and pass struct fib_info
and table ID to it, not a struct fib_alias (suggested by David Ahern)
- While at it, note that the NULL check on fa->fa_info is redundant,
and the check on RTNH_F_DEAD is also not consistent with what's done
with regular route listing: just keep it for nhc_flags
- Rename entry point function for dumping exceptions to
fib_dump_info_fnhe(), and rearrange arguments for consistency with
fib_dump_info()
- Rename fnhe_dump_buckets() to fnhe_dump_bucket() and make it handle
one bucket at a time
- Expand commit message to describe why we can have a single "skip"
counter for all exceptions stored in bucket chains in nexthop objects
(suggested by David Ahern)
v6:
- Rebased onto net-next
- Loop over nexthop paths too. Move loop over fnhe buckets to route.c,
avoids need to export rt_fill_info() and to touch exceptions from
fib_trie.c. Pass NULL as flow to rt_fill_info(), it now allows that
(suggested by David Ahern)
Fixes: 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions.")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-21 23:45:23 +08:00
|
|
|
if (filter->filter_set ||
|
|
|
|
!filter->dump_exceptions || !filter->dump_routes)
|
2018-10-16 09:56:43 +08:00
|
|
|
flags |= NLM_F_DUMP_FILTERED;
|
|
|
|
|
2015-02-26 07:31:51 +08:00
|
|
|
s_i = cb->args[4];
|
ipv4: Dump route exceptions if requested
Since commit 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions."), cached
exception routes are stored as a separate entity, so they are not dumped
on a FIB dump, even if the RTM_F_CLONED flag is passed.
This implies that the command 'ip route list cache' doesn't return any
result anymore.
If the RTM_F_CLONED flag is passed and strict checking is requested, retrieve
nexthop exception routes and dump them. If no strict checking is
requested, filtering can't be performed consistently: dump everything in
that case.
With this, we need to add an argument to the netlink callback in order to
track how many entries were already dumped for the last leaf included in
a partial netlink dump.
A single additional argument is sufficient, even if we traverse logically
nested structures (nexthop objects, hash table buckets, bucket chains): it
doesn't matter if we stop in the middle of any of those, because they are
always traversed the same way. As an example, s_i values in [], s_fa
values in ():
node (fa) #1 [1]
nexthop #1
bucket #1 -> #0 in chain (1)
bucket #2 -> #0 in chain (2) -> #1 in chain (3) -> #2 in chain (4)
bucket #3 -> #0 in chain (5) -> #1 in chain (6)
nexthop #2
bucket #1 -> #0 in chain (7) -> #1 in chain (8)
bucket #2 -> #0 in chain (9)
--
node (fa) #2 [2]
nexthop #1
bucket #1 -> #0 in chain (1) -> #1 in chain (2)
bucket #2 -> #0 in chain (3)
it doesn't matter if we stop at (3), (4), (7) for "node #1", or at (2)
for "node #2": walking flattens all that.
It would even be possible to drop the distinction between the in-tree
(s_i) and in-node (s_fa) counter, but a further improvement might
advise against this. This is only as accurate as the existing tracking
mechanism for leaves: if a partial dump is restarted after exceptions
are removed or expired, we might skip some non-dumped entries.
To improve this, we could attach a 'sernum' attribute (similar to the
one used for IPv6) to nexthop entities, and bump this counter whenever
exceptions change: having a distinction between the two counters would
make this more convenient.
Listing of exception routes (modified routes pre-3.5) was tested against
these versions of kernel and iproute2:
iproute2
kernel 4.14.0 4.15.0 4.19.0 5.0.0 5.1.0
3.5-rc4 + + + + +
4.4
4.9
4.14
4.15
4.19
5.0
5.1
fixed + + + + +
v7:
- Move loop over nexthop objects to route.c, and pass struct fib_info
and table ID to it, not a struct fib_alias (suggested by David Ahern)
- While at it, note that the NULL check on fa->fa_info is redundant,
and the check on RTNH_F_DEAD is also not consistent with what's done
with regular route listing: just keep it for nhc_flags
- Rename entry point function for dumping exceptions to
fib_dump_info_fnhe(), and rearrange arguments for consistency with
fib_dump_info()
- Rename fnhe_dump_buckets() to fnhe_dump_bucket() and make it handle
one bucket at a time
- Expand commit message to describe why we can have a single "skip"
counter for all exceptions stored in bucket chains in nexthop objects
(suggested by David Ahern)
v6:
- Rebased onto net-next
- Loop over nexthop paths too. Move loop over fnhe buckets to route.c,
avoids need to export rt_fill_info() and to touch exceptions from
fib_trie.c. Pass NULL as flow to rt_fill_info(), it now allows that
(suggested by David Ahern)
Fixes: 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions.")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-21 23:45:23 +08:00
|
|
|
s_fa = cb->args[5];
|
2005-06-22 03:43:18 +08:00
|
|
|
i = 0;
|
|
|
|
|
2005-08-26 04:01:29 +08:00
|
|
|
/* rcu_read_lock is held by caller */
|
2015-02-26 07:31:51 +08:00
|
|
|
hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
|
ipv4: Dump route exceptions if requested
Since commit 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions."), cached
exception routes are stored as a separate entity, so they are not dumped
on a FIB dump, even if the RTM_F_CLONED flag is passed.
This implies that the command 'ip route list cache' doesn't return any
result anymore.
If the RTM_F_CLONED flag is passed and strict checking is requested, retrieve
nexthop exception routes and dump them. If no strict checking is
requested, filtering can't be performed consistently: dump everything in
that case.
With this, we need to add an argument to the netlink callback in order to
track how many entries were already dumped for the last leaf included in
a partial netlink dump.
A single additional argument is sufficient, even if we traverse logically
nested structures (nexthop objects, hash table buckets, bucket chains): it
doesn't matter if we stop in the middle of any of those, because they are
always traversed the same way. As an example, s_i values in [], s_fa
values in ():
node (fa) #1 [1]
nexthop #1
bucket #1 -> #0 in chain (1)
bucket #2 -> #0 in chain (2) -> #1 in chain (3) -> #2 in chain (4)
bucket #3 -> #0 in chain (5) -> #1 in chain (6)
nexthop #2
bucket #1 -> #0 in chain (7) -> #1 in chain (8)
bucket #2 -> #0 in chain (9)
--
node (fa) #2 [2]
nexthop #1
bucket #1 -> #0 in chain (1) -> #1 in chain (2)
bucket #2 -> #0 in chain (3)
it doesn't matter if we stop at (3), (4), (7) for "node #1", or at (2)
for "node #2": walking flattens all that.
It would even be possible to drop the distinction between the in-tree
(s_i) and in-node (s_fa) counter, but a further improvement might
advise against this. This is only as accurate as the existing tracking
mechanism for leaves: if a partial dump is restarted after exceptions
are removed or expired, we might skip some non-dumped entries.
To improve this, we could attach a 'sernum' attribute (similar to the
one used for IPv6) to nexthop entities, and bump this counter whenever
exceptions change: having a distinction between the two counters would
make this more convenient.
Listing of exception routes (modified routes pre-3.5) was tested against
these versions of kernel and iproute2:
iproute2
kernel 4.14.0 4.15.0 4.19.0 5.0.0 5.1.0
3.5-rc4 + + + + +
4.4
4.9
4.14
4.15
4.19
5.0
5.1
fixed + + + + +
v7:
- Move loop over nexthop objects to route.c, and pass struct fib_info
and table ID to it, not a struct fib_alias (suggested by David Ahern)
- While at it, note that the NULL check on fa->fa_info is redundant,
and the check on RTNH_F_DEAD is also not consistent with what's done
with regular route listing: just keep it for nhc_flags
- Rename entry point function for dumping exceptions to
fib_dump_info_fnhe(), and rearrange arguments for consistency with
fib_dump_info()
- Rename fnhe_dump_buckets() to fnhe_dump_bucket() and make it handle
one bucket at a time
- Expand commit message to describe why we can have a single "skip"
counter for all exceptions stored in bucket chains in nexthop objects
(suggested by David Ahern)
v6:
- Rebased onto net-next
- Loop over nexthop paths too. Move loop over fnhe buckets to route.c,
avoids need to export rt_fill_info() and to touch exceptions from
fib_trie.c. Pass NULL as flow to rt_fill_info(), it now allows that
(suggested by David Ahern)
Fixes: 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions.")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-21 23:45:23 +08:00
|
|
|
struct fib_info *fi = fa->fa_info;
|
2017-05-16 14:19:17 +08:00
|
|
|
|
2018-10-16 09:56:43 +08:00
|
|
|
if (i < s_i)
|
|
|
|
goto next;
|
2005-06-22 03:43:18 +08:00
|
|
|
|
ipv4: Dump route exceptions if requested
Since commit 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions."), cached
exception routes are stored as a separate entity, so they are not dumped
on a FIB dump, even if the RTM_F_CLONED flag is passed.
This implies that the command 'ip route list cache' doesn't return any
result anymore.
If the RTM_F_CLONED flag is passed and strict checking is requested, retrieve
nexthop exception routes and dump them. If no strict checking is
requested, filtering can't be performed consistently: dump everything in
that case.
With this, we need to add an argument to the netlink callback in order to
track how many entries were already dumped for the last leaf included in
a partial netlink dump.
A single additional argument is sufficient, even if we traverse logically
nested structures (nexthop objects, hash table buckets, bucket chains): it
doesn't matter if we stop in the middle of any of those, because they are
always traversed the same way. As an example, s_i values in [], s_fa
values in ():
node (fa) #1 [1]
nexthop #1
bucket #1 -> #0 in chain (1)
bucket #2 -> #0 in chain (2) -> #1 in chain (3) -> #2 in chain (4)
bucket #3 -> #0 in chain (5) -> #1 in chain (6)
nexthop #2
bucket #1 -> #0 in chain (7) -> #1 in chain (8)
bucket #2 -> #0 in chain (9)
--
node (fa) #2 [2]
nexthop #1
bucket #1 -> #0 in chain (1) -> #1 in chain (2)
bucket #2 -> #0 in chain (3)
it doesn't matter if we stop at (3), (4), (7) for "node #1", or at (2)
for "node #2": walking flattens all that.
It would even be possible to drop the distinction between the in-tree
(s_i) and in-node (s_fa) counter, but a further improvement might
advise against this. This is only as accurate as the existing tracking
mechanism for leaves: if a partial dump is restarted after exceptions
are removed or expired, we might skip some non-dumped entries.
To improve this, we could attach a 'sernum' attribute (similar to the
one used for IPv6) to nexthop entities, and bump this counter whenever
exceptions change: having a distinction between the two counters would
make this more convenient.
Listing of exception routes (modified routes pre-3.5) was tested against
these versions of kernel and iproute2:
iproute2
kernel 4.14.0 4.15.0 4.19.0 5.0.0 5.1.0
3.5-rc4 + + + + +
4.4
4.9
4.14
4.15
4.19
5.0
5.1
fixed + + + + +
v7:
- Move loop over nexthop objects to route.c, and pass struct fib_info
and table ID to it, not a struct fib_alias (suggested by David Ahern)
- While at it, note that the NULL check on fa->fa_info is redundant,
and the check on RTNH_F_DEAD is also not consistent with what's done
with regular route listing: just keep it for nhc_flags
- Rename entry point function for dumping exceptions to
fib_dump_info_fnhe(), and rearrange arguments for consistency with
fib_dump_info()
- Rename fnhe_dump_buckets() to fnhe_dump_bucket() and make it handle
one bucket at a time
- Expand commit message to describe why we can have a single "skip"
counter for all exceptions stored in bucket chains in nexthop objects
(suggested by David Ahern)
v6:
- Rebased onto net-next
- Loop over nexthop paths too. Move loop over fnhe buckets to route.c,
avoids need to export rt_fill_info() and to touch exceptions from
fib_trie.c. Pass NULL as flow to rt_fill_info(), it now allows that
(suggested by David Ahern)
Fixes: 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions.")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-21 23:45:23 +08:00
|
|
|
i_fa = 0;
|
|
|
|
|
2018-10-16 09:56:43 +08:00
|
|
|
if (tb->tb_id != fa->tb_id)
|
|
|
|
goto next;
|
|
|
|
|
|
|
|
if (filter->filter_set) {
|
|
|
|
if (filter->rt_type && fa->fa_type != filter->rt_type)
|
|
|
|
goto next;
|
|
|
|
|
|
|
|
if ((filter->protocol &&
|
ipv4: Dump route exceptions if requested
Since commit 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions."), cached
exception routes are stored as a separate entity, so they are not dumped
on a FIB dump, even if the RTM_F_CLONED flag is passed.
This implies that the command 'ip route list cache' doesn't return any
result anymore.
If the RTM_F_CLONED flag is passed and strict checking is requested, retrieve
nexthop exception routes and dump them. If no strict checking is
requested, filtering can't be performed consistently: dump everything in
that case.
With this, we need to add an argument to the netlink callback in order to
track how many entries were already dumped for the last leaf included in
a partial netlink dump.
A single additional argument is sufficient, even if we traverse logically
nested structures (nexthop objects, hash table buckets, bucket chains): it
doesn't matter if we stop in the middle of any of those, because they are
always traversed the same way. As an example, s_i values in [], s_fa
values in ():
node (fa) #1 [1]
nexthop #1
bucket #1 -> #0 in chain (1)
bucket #2 -> #0 in chain (2) -> #1 in chain (3) -> #2 in chain (4)
bucket #3 -> #0 in chain (5) -> #1 in chain (6)
nexthop #2
bucket #1 -> #0 in chain (7) -> #1 in chain (8)
bucket #2 -> #0 in chain (9)
--
node (fa) #2 [2]
nexthop #1
bucket #1 -> #0 in chain (1) -> #1 in chain (2)
bucket #2 -> #0 in chain (3)
it doesn't matter if we stop at (3), (4), (7) for "node #1", or at (2)
for "node #2": walking flattens all that.
It would even be possible to drop the distinction between the in-tree
(s_i) and in-node (s_fa) counter, but a further improvement might
advise against this. This is only as accurate as the existing tracking
mechanism for leaves: if a partial dump is restarted after exceptions
are removed or expired, we might skip some non-dumped entries.
To improve this, we could attach a 'sernum' attribute (similar to the
one used for IPv6) to nexthop entities, and bump this counter whenever
exceptions change: having a distinction between the two counters would
make this more convenient.
Listing of exception routes (modified routes pre-3.5) was tested against
these versions of kernel and iproute2:
iproute2
kernel 4.14.0 4.15.0 4.19.0 5.0.0 5.1.0
3.5-rc4 + + + + +
4.4
4.9
4.14
4.15
4.19
5.0
5.1
fixed + + + + +
v7:
- Move loop over nexthop objects to route.c, and pass struct fib_info
and table ID to it, not a struct fib_alias (suggested by David Ahern)
- While at it, note that the NULL check on fa->fa_info is redundant,
and the check on RTNH_F_DEAD is also not consistent with what's done
with regular route listing: just keep it for nhc_flags
- Rename entry point function for dumping exceptions to
fib_dump_info_fnhe(), and rearrange arguments for consistency with
fib_dump_info()
- Rename fnhe_dump_buckets() to fnhe_dump_bucket() and make it handle
one bucket at a time
- Expand commit message to describe why we can have a single "skip"
counter for all exceptions stored in bucket chains in nexthop objects
(suggested by David Ahern)
v6:
- Rebased onto net-next
- Loop over nexthop paths too. Move loop over fnhe buckets to route.c,
avoids need to export rt_fill_info() and to touch exceptions from
fib_trie.c. Pass NULL as flow to rt_fill_info(), it now allows that
(suggested by David Ahern)
Fixes: 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions.")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-21 23:45:23 +08:00
|
|
|
fi->fib_protocol != filter->protocol))
|
2018-10-16 09:56:43 +08:00
|
|
|
goto next;
|
|
|
|
|
|
|
|
if (filter->dev &&
|
ipv4: Dump route exceptions if requested
Since commit 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions."), cached
exception routes are stored as a separate entity, so they are not dumped
on a FIB dump, even if the RTM_F_CLONED flag is passed.
This implies that the command 'ip route list cache' doesn't return any
result anymore.
If the RTM_F_CLONED flag is passed and strict checking is requested, retrieve
nexthop exception routes and dump them. If no strict checking is
requested, filtering can't be performed consistently: dump everything in
that case.
With this, we need to add an argument to the netlink callback in order to
track how many entries were already dumped for the last leaf included in
a partial netlink dump.
A single additional argument is sufficient, even if we traverse logically
nested structures (nexthop objects, hash table buckets, bucket chains): it
doesn't matter if we stop in the middle of any of those, because they are
always traversed the same way. As an example, s_i values in [], s_fa
values in ():
node (fa) #1 [1]
nexthop #1
bucket #1 -> #0 in chain (1)
bucket #2 -> #0 in chain (2) -> #1 in chain (3) -> #2 in chain (4)
bucket #3 -> #0 in chain (5) -> #1 in chain (6)
nexthop #2
bucket #1 -> #0 in chain (7) -> #1 in chain (8)
bucket #2 -> #0 in chain (9)
--
node (fa) #2 [2]
nexthop #1
bucket #1 -> #0 in chain (1) -> #1 in chain (2)
bucket #2 -> #0 in chain (3)
it doesn't matter if we stop at (3), (4), (7) for "node #1", or at (2)
for "node #2": walking flattens all that.
It would even be possible to drop the distinction between the in-tree
(s_i) and in-node (s_fa) counter, but a further improvement might
advise against this. This is only as accurate as the existing tracking
mechanism for leaves: if a partial dump is restarted after exceptions
are removed or expired, we might skip some non-dumped entries.
To improve this, we could attach a 'sernum' attribute (similar to the
one used for IPv6) to nexthop entities, and bump this counter whenever
exceptions change: having a distinction between the two counters would
make this more convenient.
Listing of exception routes (modified routes pre-3.5) was tested against
these versions of kernel and iproute2:
iproute2
kernel 4.14.0 4.15.0 4.19.0 5.0.0 5.1.0
3.5-rc4 + + + + +
4.4
4.9
4.14
4.15
4.19
5.0
5.1
fixed + + + + +
v7:
- Move loop over nexthop objects to route.c, and pass struct fib_info
and table ID to it, not a struct fib_alias (suggested by David Ahern)
- While at it, note that the NULL check on fa->fa_info is redundant,
and the check on RTNH_F_DEAD is also not consistent with what's done
with regular route listing: just keep it for nhc_flags
- Rename entry point function for dumping exceptions to
fib_dump_info_fnhe(), and rearrange arguments for consistency with
fib_dump_info()
- Rename fnhe_dump_buckets() to fnhe_dump_bucket() and make it handle
one bucket at a time
- Expand commit message to describe why we can have a single "skip"
counter for all exceptions stored in bucket chains in nexthop objects
(suggested by David Ahern)
v6:
- Rebased onto net-next
- Loop over nexthop paths too. Move loop over fnhe buckets to route.c,
avoids need to export rt_fill_info() and to touch exceptions from
fib_trie.c. Pass NULL as flow to rt_fill_info(), it now allows that
(suggested by David Ahern)
Fixes: 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions.")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-21 23:45:23 +08:00
|
|
|
!fib_info_nh_uses_dev(fi, filter->dev))
|
2018-10-16 09:56:43 +08:00
|
|
|
goto next;
|
2015-03-07 05:47:00 +08:00
|
|
|
}
|
|
|
|
|
ipv4: Fix off-by-one in route dump counter without netlink strict checking
In commit ee28906fd7a1 ("ipv4: Dump route exceptions if requested") I
added a counter of per-node dumped routes (including actual routes and
exceptions), analogous to the existing counter for dumped nodes. Dumping
exceptions means we need to also keep track of how many routes are dumped
for each node: this would be just one route per node, without exceptions.
When netlink strict checking is not enabled, we dump both routes and
exceptions at the same time: the RTM_F_CLONED flag is not used as a
filter. In this case, the per-node counter 'i_fa' is incremented by one
to track the single dumped route, then also incremented by one for each
exception dumped, and then stored as netlink callback argument as skip
counter, 's_fa', to be used when a partial dump operation restarts.
The per-node counter needs to be increased by one also when we skip a
route (exception) due to a previous non-zero skip counter, because it
needs to match the existing skip counter, if we are dumping both routes
and exceptions. I missed this, and only incremented the counter, for
regular routes, if the previous skip counter was zero. This means that,
in case of a mixed dump, partial dump operations after the first one
will start with a mismatching skip counter value, one less than expected.
This means in turn that the first exception for a given node is skipped
every time a partial dump operation restarts, if netlink strict checking
is not enabled (iproute < 5.0).
It turns out I didn't repeat the test in its final version, commit
de755a85130e ("selftests: pmtu: Introduce list_flush_ipv4_exception test
case"), which also counts the number of route exceptions returned, with
iproute2 versions < 5.0 -- I was instead using the equivalent of the IPv6
test as it was before commit b964641e9925 ("selftests: pmtu: Make
list_flush_ipv6_exception test more demanding").
Always increment the per-node counter by one if we previously dumped
a regular route, so that it matches the current skip counter.
Fixes: ee28906fd7a1 ("ipv4: Dump route exceptions if requested")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-30 01:55:08 +08:00
|
|
|
if (filter->dump_routes) {
|
|
|
|
if (!s_fa) {
|
2020-01-14 19:23:10 +08:00
|
|
|
struct fib_rt_info fri;
|
|
|
|
|
|
|
|
fri.fi = fi;
|
|
|
|
fri.tb_id = tb->tb_id;
|
|
|
|
fri.dst = xkey;
|
|
|
|
fri.dst_len = KEYLENGTH - fa->fa_slen;
|
|
|
|
fri.tos = fa->fa_tos;
|
|
|
|
fri.type = fa->fa_type;
|
2022-02-17 01:32:16 +08:00
|
|
|
fri.offload = READ_ONCE(fa->offload);
|
|
|
|
fri.trap = READ_ONCE(fa->trap);
|
|
|
|
fri.offload_failed = READ_ONCE(fa->offload_failed);
|
ipv4: Fix off-by-one in route dump counter without netlink strict checking
In commit ee28906fd7a1 ("ipv4: Dump route exceptions if requested") I
added a counter of per-node dumped routes (including actual routes and
exceptions), analogous to the existing counter for dumped nodes. Dumping
exceptions means we need to also keep track of how many routes are dumped
for each node: this would be just one route per node, without exceptions.
When netlink strict checking is not enabled, we dump both routes and
exceptions at the same time: the RTM_F_CLONED flag is not used as a
filter. In this case, the per-node counter 'i_fa' is incremented by one
to track the single dumped route, then also incremented by one for each
exception dumped, and then stored as netlink callback argument as skip
counter, 's_fa', to be used when a partial dump operation restarts.
The per-node counter needs to be increased by one also when we skip a
route (exception) due to a previous non-zero skip counter, because it
needs to match the existing skip counter, if we are dumping both routes
and exceptions. I missed this, and only incremented the counter, for
regular routes, if the previous skip counter was zero. This means that,
in case of a mixed dump, partial dump operations after the first one
will start with a mismatching skip counter value, one less than expected.
This means in turn that the first exception for a given node is skipped
every time a partial dump operation restarts, if netlink strict checking
is not enabled (iproute < 5.0).
It turns out I didn't repeat the test in its final version, commit
de755a85130e ("selftests: pmtu: Introduce list_flush_ipv4_exception test
case"), which also counts the number of route exceptions returned, with
iproute2 versions < 5.0 -- I was instead using the equivalent of the IPv6
test as it was before commit b964641e9925 ("selftests: pmtu: Make
list_flush_ipv6_exception test more demanding").
Always increment the per-node counter by one if we previously dumped
a regular route, so that it matches the current skip counter.
Fixes: ee28906fd7a1 ("ipv4: Dump route exceptions if requested")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-30 01:55:08 +08:00
|
|
|
err = fib_dump_info(skb,
|
|
|
|
NETLINK_CB(cb->skb).portid,
|
|
|
|
cb->nlh->nlmsg_seq,
|
2020-01-14 19:23:10 +08:00
|
|
|
RTM_NEWROUTE, &fri, flags);
|
ipv4: Fix off-by-one in route dump counter without netlink strict checking
In commit ee28906fd7a1 ("ipv4: Dump route exceptions if requested") I
added a counter of per-node dumped routes (including actual routes and
exceptions), analogous to the existing counter for dumped nodes. Dumping
exceptions means we need to also keep track of how many routes are dumped
for each node: this would be just one route per node, without exceptions.
When netlink strict checking is not enabled, we dump both routes and
exceptions at the same time: the RTM_F_CLONED flag is not used as a
filter. In this case, the per-node counter 'i_fa' is incremented by one
to track the single dumped route, then also incremented by one for each
exception dumped, and then stored as netlink callback argument as skip
counter, 's_fa', to be used when a partial dump operation restarts.
The per-node counter needs to be increased by one also when we skip a
route (exception) due to a previous non-zero skip counter, because it
needs to match the existing skip counter, if we are dumping both routes
and exceptions. I missed this, and only incremented the counter, for
regular routes, if the previous skip counter was zero. This means that,
in case of a mixed dump, partial dump operations after the first one
will start with a mismatching skip counter value, one less than expected.
This means in turn that the first exception for a given node is skipped
every time a partial dump operation restarts, if netlink strict checking
is not enabled (iproute < 5.0).
It turns out I didn't repeat the test in its final version, commit
de755a85130e ("selftests: pmtu: Introduce list_flush_ipv4_exception test
case"), which also counts the number of route exceptions returned, with
iproute2 versions < 5.0 -- I was instead using the equivalent of the IPv6
test as it was before commit b964641e9925 ("selftests: pmtu: Make
list_flush_ipv6_exception test more demanding").
Always increment the per-node counter by one if we previously dumped
a regular route, so that it matches the current skip counter.
Fixes: ee28906fd7a1 ("ipv4: Dump route exceptions if requested")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-30 01:55:08 +08:00
|
|
|
if (err < 0)
|
|
|
|
goto stop;
|
|
|
|
}
|
|
|
|
|
ipv4: Dump route exceptions if requested
Since commit 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions."), cached
exception routes are stored as a separate entity, so they are not dumped
on a FIB dump, even if the RTM_F_CLONED flag is passed.
This implies that the command 'ip route list cache' doesn't return any
result anymore.
If the RTM_F_CLONED is passed, and strict checking requested, retrieve
nexthop exception routes and dump them. If no strict checking is
requested, filtering can't be performed consistently: dump everything in
that case.
With this, we need to add an argument to the netlink callback in order to
track how many entries were already dumped for the last leaf included in
a partial netlink dump.
A single additional argument is sufficient, even if we traverse logically
nested structures (nexthop objects, hash table buckets, bucket chains): it
doesn't matter if we stop in the middle of any of those, because they are
always traversed the same way. As an example, s_i values in [], s_fa
values in ():
node (fa) #1 [1]
nexthop #1
bucket #1 -> #0 in chain (1)
bucket #2 -> #0 in chain (2) -> #1 in chain (3) -> #2 in chain (4)
bucket #3 -> #0 in chain (5) -> #1 in chain (6)
nexthop #2
bucket #1 -> #0 in chain (7) -> #1 in chain (8)
bucket #2 -> #0 in chain (9)
--
node (fa) #2 [2]
nexthop #1
bucket #1 -> #0 in chain (1) -> #1 in chain (2)
bucket #2 -> #0 in chain (3)
it doesn't matter if we stop at (3), (4), (7) for "node #1", or at (2)
for "node #2": walking flattens all that.
It would even be possible to drop the distinction between the in-tree
(s_i) and in-node (s_fa) counter, but a further improvement might
advise against this. This is only as accurate as the existing tracking
mechanism for leaves: if a partial dump is restarted after exceptions
are removed or expired, we might skip some non-dumped entries.
To improve this, we could attach a 'sernum' attribute (similar to the
one used for IPv6) to nexthop entities, and bump this counter whenever
exceptions change: having a distinction between the two counters would
make this more convenient.
Listing of exception routes (modified routes pre-3.5) was tested against
these versions of kernel and iproute2:
iproute2
kernel 4.14.0 4.15.0 4.19.0 5.0.0 5.1.0
3.5-rc4 + + + + +
4.4
4.9
4.14
4.15
4.19
5.0
5.1
fixed + + + + +
v7:
- Move loop over nexthop objects to route.c, and pass struct fib_info
and table ID to it, not a struct fib_alias (suggested by David Ahern)
- While at it, note that the NULL check on fa->fa_info is redundant,
and the check on RTNH_F_DEAD is also not consistent with what's done
with regular route listing: just keep it for nhc_flags
- Rename entry point function for dumping exceptions to
fib_dump_info_fnhe(), and rearrange arguments for consistency with
fib_dump_info()
- Rename fnhe_dump_buckets() to fnhe_dump_bucket() and make it handle
one bucket at a time
- Expand commit message to describe why we can have a single "skip"
counter for all exceptions stored in bucket chains in nexthop objects
(suggested by David Ahern)
v6:
- Rebased onto net-next
- Loop over nexthop paths too. Move loop over fnhe buckets to route.c,
avoids need to export rt_fill_info() and to touch exceptions from
fib_trie.c. Pass NULL as flow to rt_fill_info(), it now allows that
(suggested by David Ahern)
Fixes: 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions.")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-21 23:45:23 +08:00
|
|
|
i_fa++;
|
2005-06-22 03:43:18 +08:00
|
|
|
}
|
ipv4: Dump route exceptions if requested
Since commit 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions."), cached
exception routes are stored as a separate entity, so they are not dumped
on a FIB dump, even if the RTM_F_CLONED flag is passed.
This implies that the command 'ip route list cache' doesn't return any
result anymore.
If the RTM_F_CLONED is passed, and strict checking requested, retrieve
nexthop exception routes and dump them. If no strict checking is
requested, filtering can't be performed consistently: dump everything in
that case.
With this, we need to add an argument to the netlink callback in order to
track how many entries were already dumped for the last leaf included in
a partial netlink dump.
A single additional argument is sufficient, even if we traverse logically
nested structures (nexthop objects, hash table buckets, bucket chains): it
doesn't matter if we stop in the middle of any of those, because they are
always traversed the same way. As an example, s_i values in [], s_fa
values in ():
node (fa) #1 [1]
nexthop #1
bucket #1 -> #0 in chain (1)
bucket #2 -> #0 in chain (2) -> #1 in chain (3) -> #2 in chain (4)
bucket #3 -> #0 in chain (5) -> #1 in chain (6)
nexthop #2
bucket #1 -> #0 in chain (7) -> #1 in chain (8)
bucket #2 -> #0 in chain (9)
--
node (fa) #2 [2]
nexthop #1
bucket #1 -> #0 in chain (1) -> #1 in chain (2)
bucket #2 -> #0 in chain (3)
it doesn't matter if we stop at (3), (4), (7) for "node #1", or at (2)
for "node #2": walking flattens all that.
It would even be possible to drop the distinction between the in-tree
(s_i) and in-node (s_fa) counter, but a further improvement might
advise against this. This is only as accurate as the existing tracking
mechanism for leaves: if a partial dump is restarted after exceptions
are removed or expired, we might skip some non-dumped entries.
To improve this, we could attach a 'sernum' attribute (similar to the
one used for IPv6) to nexthop entities, and bump this counter whenever
exceptions change: having a distinction between the two counters would
make this more convenient.
Listing of exception routes (modified routes pre-3.5) was tested against
these versions of kernel and iproute2:
iproute2
kernel 4.14.0 4.15.0 4.19.0 5.0.0 5.1.0
3.5-rc4 + + + + +
4.4
4.9
4.14
4.15
4.19
5.0
5.1
fixed + + + + +
v7:
- Move loop over nexthop objects to route.c, and pass struct fib_info
and table ID to it, not a struct fib_alias (suggested by David Ahern)
- While at it, note that the NULL check on fa->fa_info is redundant,
and the check on RTNH_F_DEAD is also not consistent with what's done
with regular route listing: just keep it for nhc_flags
- Rename entry point function for dumping exceptions to
fib_dump_info_fnhe(), and rearrange arguments for consistency with
fib_dump_info()
- Rename fnhe_dump_buckets() to fnhe_dump_bucket() and make it handle
one bucket at a time
- Expand commit message to describe why we can have a single "skip"
counter for all exceptions stored in bucket chains in nexthop objects
(suggested by David Ahern)
v6:
- Rebased onto net-next
- Loop over nexthop paths too. Move loop over fnhe buckets to route.c,
avoids need to export rt_fill_info() and to touch exceptions from
fib_trie.c. Pass NULL as flow to rt_fill_info(), it now allows that
(suggested by David Ahern)
Fixes: 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions.")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-21 23:45:23 +08:00
|
|
|
|
|
|
|
if (filter->dump_exceptions) {
|
|
|
|
err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi,
|
2019-08-24 08:11:38 +08:00
|
|
|
&i_fa, s_fa, flags);
|
ipv4: Dump route exceptions if requested
Since commit 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions."), cached
exception routes are stored as a separate entity, so they are not dumped
on a FIB dump, even if the RTM_F_CLONED flag is passed.
This implies that the command 'ip route list cache' doesn't return any
result anymore.
If the RTM_F_CLONED is passed, and strict checking requested, retrieve
nexthop exception routes and dump them. If no strict checking is
requested, filtering can't be performed consistently: dump everything in
that case.
With this, we need to add an argument to the netlink callback in order to
track how many entries were already dumped for the last leaf included in
a partial netlink dump.
A single additional argument is sufficient, even if we traverse logically
nested structures (nexthop objects, hash table buckets, bucket chains): it
doesn't matter if we stop in the middle of any of those, because they are
always traversed the same way. As an example, s_i values in [], s_fa
values in ():
node (fa) #1 [1]
nexthop #1
bucket #1 -> #0 in chain (1)
bucket #2 -> #0 in chain (2) -> #1 in chain (3) -> #2 in chain (4)
bucket #3 -> #0 in chain (5) -> #1 in chain (6)
nexthop #2
bucket #1 -> #0 in chain (7) -> #1 in chain (8)
bucket #2 -> #0 in chain (9)
--
node (fa) #2 [2]
nexthop #1
bucket #1 -> #0 in chain (1) -> #1 in chain (2)
bucket #2 -> #0 in chain (3)
it doesn't matter if we stop at (3), (4), (7) for "node #1", or at (2)
for "node #2": walking flattens all that.
It would even be possible to drop the distinction between the in-tree
(s_i) and in-node (s_fa) counter, but a further improvement might
advise against this. This is only as accurate as the existing tracking
mechanism for leaves: if a partial dump is restarted after exceptions
are removed or expired, we might skip some non-dumped entries.
To improve this, we could attach a 'sernum' attribute (similar to the
one used for IPv6) to nexthop entities, and bump this counter whenever
exceptions change: having a distinction between the two counters would
make this more convenient.
Listing of exception routes (modified routes pre-3.5) was tested against
these versions of kernel and iproute2:
iproute2
kernel 4.14.0 4.15.0 4.19.0 5.0.0 5.1.0
3.5-rc4 + + + + +
4.4
4.9
4.14
4.15
4.19
5.0
5.1
fixed + + + + +
v7:
- Move loop over nexthop objects to route.c, and pass struct fib_info
and table ID to it, not a struct fib_alias (suggested by David Ahern)
- While at it, note that the NULL check on fa->fa_info is redundant,
and the check on RTNH_F_DEAD is also not consistent with what's done
with regular route listing: just keep it for nhc_flags
- Rename entry point function for dumping exceptions to
fib_dump_info_fnhe(), and rearrange arguments for consistency with
fib_dump_info()
- Rename fnhe_dump_buckets() to fnhe_dump_bucket() and make it handle
one bucket at a time
- Expand commit message to describe why we can have a single "skip"
counter for all exceptions stored in bucket chains in nexthop objects
(suggested by David Ahern)
v6:
- Rebased onto net-next
- Loop over nexthop paths too. Move loop over fnhe buckets to route.c,
avoids need to export rt_fill_info() and to touch exceptions from
fib_trie.c. Pass NULL as flow to rt_fill_info(), it now allows that
(suggested by David Ahern)
Fixes: 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions.")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-21 23:45:23 +08:00
|
|
|
if (err < 0)
|
|
|
|
goto stop;
|
|
|
|
}
|
|
|
|
|
2018-10-16 09:56:43 +08:00
|
|
|
next:
|
2008-01-23 13:56:11 +08:00
|
|
|
i++;
|
2005-06-22 03:43:18 +08:00
|
|
|
}
|
2008-01-23 13:56:11 +08:00
|
|
|
|
2008-02-01 08:45:47 +08:00
|
|
|
cb->args[4] = i;
|
2005-06-22 03:43:18 +08:00
|
|
|
return skb->len;
|
ipv4: Dump route exceptions if requested
Since commit 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions."), cached
exception routes are stored as a separate entity, so they are not dumped
on a FIB dump, even if the RTM_F_CLONED flag is passed.
This implies that the command 'ip route list cache' doesn't return any
result anymore.
If the RTM_F_CLONED is passed, and strict checking requested, retrieve
nexthop exception routes and dump them. If no strict checking is
requested, filtering can't be performed consistently: dump everything in
that case.
With this, we need to add an argument to the netlink callback in order to
track how many entries were already dumped for the last leaf included in
a partial netlink dump.
A single additional argument is sufficient, even if we traverse logically
nested structures (nexthop objects, hash table buckets, bucket chains): it
doesn't matter if we stop in the middle of any of those, because they are
always traversed the same way. As an example, s_i values in [], s_fa
values in ():
node (fa) #1 [1]
nexthop #1
bucket #1 -> #0 in chain (1)
bucket #2 -> #0 in chain (2) -> #1 in chain (3) -> #2 in chain (4)
bucket #3 -> #0 in chain (5) -> #1 in chain (6)
nexthop #2
bucket #1 -> #0 in chain (7) -> #1 in chain (8)
bucket #2 -> #0 in chain (9)
--
node (fa) #2 [2]
nexthop #1
bucket #1 -> #0 in chain (1) -> #1 in chain (2)
bucket #2 -> #0 in chain (3)
it doesn't matter if we stop at (3), (4), (7) for "node #1", or at (2)
for "node #2": walking flattens all that.
It would even be possible to drop the distinction between the in-tree
(s_i) and in-node (s_fa) counter, but a further improvement might
advise against this. This is only as accurate as the existing tracking
mechanism for leaves: if a partial dump is restarted after exceptions
are removed or expired, we might skip some non-dumped entries.
To improve this, we could attach a 'sernum' attribute (similar to the
one used for IPv6) to nexthop entities, and bump this counter whenever
exceptions change: having a distinction between the two counters would
make this more convenient.
Listing of exception routes (modified routes pre-3.5) was tested against
these versions of kernel and iproute2:
iproute2
kernel 4.14.0 4.15.0 4.19.0 5.0.0 5.1.0
3.5-rc4 + + + + +
4.4
4.9
4.14
4.15
4.19
5.0
5.1
fixed + + + + +
v7:
- Move loop over nexthop objects to route.c, and pass struct fib_info
and table ID to it, not a struct fib_alias (suggested by David Ahern)
- While at it, note that the NULL check on fa->fa_info is redundant,
and the check on RTNH_F_DEAD is also not consistent with what's done
with regular route listing: just keep it for nhc_flags
- Rename entry point function for dumping exceptions to
fib_dump_info_fnhe(), and rearrange arguments for consistency with
fib_dump_info()
- Rename fnhe_dump_buckets() to fnhe_dump_bucket() and make it handle
one bucket at a time
- Expand commit message to describe why we can have a single "skip"
counter for all exceptions stored in bucket chains in nexthop objects
(suggested by David Ahern)
v6:
- Rebased onto net-next
- Loop over nexthop paths too. Move loop over fnhe buckets to route.c,
avoids need to export rt_fill_info() and to touch exceptions from
fib_trie.c. Pass NULL as flow to rt_fill_info(), it now allows that
(suggested by David Ahern)
Fixes: 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions.")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-21 23:45:23 +08:00
|
|
|
|
|
|
|
stop:
|
|
|
|
cb->args[4] = i;
|
|
|
|
cb->args[5] = i_fa;
|
|
|
|
return err;
|
2005-06-22 03:43:18 +08:00
|
|
|
}
|
|
|
|
|
2015-03-05 07:02:44 +08:00
|
|
|
/* rcu_read_lock needs to be hold by caller from readside */
|
2009-09-20 18:35:36 +08:00
|
|
|
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
|
2018-10-16 09:56:43 +08:00
|
|
|
struct netlink_callback *cb, struct fib_dump_filter *filter)
|
2005-06-22 03:43:18 +08:00
|
|
|
{
|
2015-03-05 06:59:19 +08:00
|
|
|
struct trie *t = (struct trie *)tb->tb_data;
|
2015-03-07 01:54:52 +08:00
|
|
|
struct key_vector *l, *tp = t->kv;
|
2008-01-23 13:57:22 +08:00
|
|
|
/* Dump starting at last key.
|
|
|
|
* Note: 0.0.0.0/0 (ie default) is first key.
|
|
|
|
*/
|
2015-03-05 06:59:19 +08:00
|
|
|
int count = cb->args[2];
|
|
|
|
t_key key = cb->args[3];
|
2008-01-23 13:56:11 +08:00
|
|
|
|
2020-01-11 01:03:58 +08:00
|
|
|
/* First time here, count and key are both always 0. Count > 0
|
|
|
|
* and key == 0 means the dump has wrapped around and we are done.
|
|
|
|
*/
|
|
|
|
if (count && !key)
|
|
|
|
return skb->len;
|
|
|
|
|
2015-03-05 06:59:19 +08:00
|
|
|
while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
|
2017-05-16 14:19:17 +08:00
|
|
|
int err;
|
|
|
|
|
2018-10-16 09:56:43 +08:00
|
|
|
err = fn_trie_dump_leaf(l, tb, skb, cb, filter);
|
2017-05-16 14:19:17 +08:00
|
|
|
if (err < 0) {
|
2015-03-05 06:59:19 +08:00
|
|
|
cb->args[3] = key;
|
|
|
|
cb->args[2] = count;
|
2017-05-16 14:19:17 +08:00
|
|
|
return err;
|
2005-06-22 03:43:18 +08:00
|
|
|
}
|
2008-01-23 13:57:22 +08:00
|
|
|
|
2008-02-01 08:45:47 +08:00
|
|
|
++count;
|
2015-03-05 06:59:19 +08:00
|
|
|
key = l->key + 1;
|
|
|
|
|
2008-02-01 08:45:47 +08:00
|
|
|
memset(&cb->args[4], 0,
|
|
|
|
sizeof(cb->args) - 4*sizeof(cb->args[0]));
|
2015-03-05 06:59:19 +08:00
|
|
|
|
|
|
|
/* stop loop if key wrapped back to 0 */
|
|
|
|
if (key < l->key)
|
|
|
|
break;
|
2005-06-22 03:43:18 +08:00
|
|
|
}
|
2015-03-05 06:59:19 +08:00
|
|
|
|
|
|
|
cb->args[3] = key;
|
|
|
|
cb->args[2] = count;
|
|
|
|
|
2005-06-22 03:43:18 +08:00
|
|
|
return skb->len;
|
|
|
|
}
|
|
|
|
|
2011-02-02 07:30:56 +08:00
|
|
|
/*
 * fib_trie_init - create the slab caches used by the FIB trie.
 *
 * Two caches are set up, in this order:
 *   "ip_fib_alias" - objects of sizeof(struct fib_alias)
 *   "ip_fib_trie"  - objects of LEAF_SIZE bytes
 *
 * Both are created with SLAB_PANIC | SLAB_ACCOUNT and no constructor;
 * the return values are stored without further checking.
 */
void __init fib_trie_init(void)
{
	fn_alias_kmem =
		kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), 0,
				  SLAB_PANIC | SLAB_ACCOUNT, NULL);

	trie_leaf_kmem =
		kmem_cache_create("ip_fib_trie", LEAF_SIZE, 0,
				  SLAB_PANIC | SLAB_ACCOUNT, NULL);
}
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2015-03-07 05:47:00 +08:00
|
|
|
/*
 * fib_trie_table - allocate and initialise a FIB table.
 * @id:    table identifier stored in tb_id
 * @alias: optional existing table whose trie storage is to be shared
 *
 * With @alias set, only the struct fib_table itself is allocated and its
 * tb_data points at @alias->__data.  Without it, room for a struct trie
 * is allocated right behind the table, tb_data points at the table's own
 * __data, and the root key vector gets pos and slen set to KEYLENGTH.
 *
 * Returns the new table, or NULL if an allocation failed.
 */
struct fib_table *fib_trie_table(u32 id, struct fib_table *alias)
{
	struct fib_table *tb;
	struct trie *t;
	size_t sz;

	/* an alias borrows the trie, a real table carries its own */
	sz = alias ? sizeof(*tb) : sizeof(*tb) + sizeof(struct trie);

	tb = kzalloc(sz, GFP_KERNEL);
	if (!tb)
		return NULL;

	tb->tb_id = id;
	tb->tb_num_default = 0;

	if (alias) {
		tb->tb_data = alias->__data;
		return tb;
	}

	tb->tb_data = tb->__data;

	t = (struct trie *)tb->tb_data;
	t->kv[0].pos = KEYLENGTH;
	t->kv[0].slen = KEYLENGTH;
#ifdef CONFIG_IP_FIB_TRIE_STATS
	t->stats = alloc_percpu(struct trie_use_stats);
	if (!t->stats) {
		/* no per-cpu counters: give the table back and fail */
		kfree(tb);
		return NULL;
	}
#endif

	return tb;
}
|
|
|
|
|
2005-09-10 04:35:42 +08:00
|
|
|
#ifdef CONFIG_PROC_FS
|
|
|
|
/* Depth first Trie walk iterator */
|
|
|
|
struct fib_trie_iter {
|
2008-01-10 19:27:17 +08:00
|
|
|
struct seq_net_private p;
|
2008-03-24 13:43:56 +08:00
|
|
|
struct fib_table *tb;
|
2015-03-07 01:54:08 +08:00
|
|
|
struct key_vector *tnode;
|
2010-09-10 07:32:28 +08:00
|
|
|
unsigned int index;
|
|
|
|
unsigned int depth;
|
2005-09-10 04:35:42 +08:00
|
|
|
};
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2015-03-07 01:54:08 +08:00
|
|
|
static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter)
|
2005-06-22 03:43:18 +08:00
|
|
|
{
|
2015-01-01 02:56:18 +08:00
|
|
|
unsigned long cindex = iter->index;
|
2015-03-07 01:54:52 +08:00
|
|
|
struct key_vector *pn = iter->tnode;
|
|
|
|
t_key pkey;
|
2007-01-25 06:42:04 +08:00
|
|
|
|
2005-09-10 04:35:42 +08:00
|
|
|
pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
|
|
|
|
iter->tnode, iter->index, iter->depth);
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2015-03-07 01:54:52 +08:00
|
|
|
while (!IS_TRIE(pn)) {
|
|
|
|
while (cindex < child_length(pn)) {
|
|
|
|
struct key_vector *n = get_child_rcu(pn, cindex++);
|
|
|
|
|
|
|
|
if (!n)
|
|
|
|
continue;
|
|
|
|
|
2005-09-10 04:35:42 +08:00
|
|
|
if (IS_LEAF(n)) {
|
2015-03-07 01:54:52 +08:00
|
|
|
iter->tnode = pn;
|
|
|
|
iter->index = cindex;
|
2005-09-10 04:35:42 +08:00
|
|
|
} else {
|
|
|
|
/* push down one level */
|
2015-01-01 02:55:47 +08:00
|
|
|
iter->tnode = n;
|
2005-09-10 04:35:42 +08:00
|
|
|
iter->index = 0;
|
|
|
|
++iter->depth;
|
|
|
|
}
|
2015-03-07 01:54:52 +08:00
|
|
|
|
2005-09-10 04:35:42 +08:00
|
|
|
return n;
|
|
|
|
}
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2015-03-07 01:54:52 +08:00
|
|
|
/* Current node exhausted, pop back up */
|
|
|
|
pkey = pn->key;
|
|
|
|
pn = node_parent_rcu(pn);
|
|
|
|
cindex = get_index(pkey, pn) + 1;
|
2005-09-10 04:35:42 +08:00
|
|
|
--iter->depth;
|
2005-06-22 03:43:18 +08:00
|
|
|
}
|
2005-09-10 04:35:42 +08:00
|
|
|
|
2015-03-07 01:54:52 +08:00
|
|
|
/* record root node so further searches know we are done */
|
|
|
|
iter->tnode = pn;
|
|
|
|
iter->index = 0;
|
|
|
|
|
2005-09-10 04:35:42 +08:00
|
|
|
return NULL;
|
2005-06-22 03:43:18 +08:00
|
|
|
}
|
|
|
|
|
2015-03-07 01:54:08 +08:00
|
|
|
static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter,
|
|
|
|
struct trie *t)
|
2005-06-22 03:43:18 +08:00
|
|
|
{
|
2015-06-08 11:54:51 +08:00
|
|
|
struct key_vector *n, *pn;
|
2006-03-21 13:34:12 +08:00
|
|
|
|
2007-03-09 12:44:43 +08:00
|
|
|
if (!t)
|
2006-03-21 13:34:12 +08:00
|
|
|
return NULL;
|
|
|
|
|
2015-06-08 11:54:51 +08:00
|
|
|
pn = t->kv;
|
2015-03-07 01:54:52 +08:00
|
|
|
n = rcu_dereference(pn->tnode[0]);
|
2008-03-24 13:43:56 +08:00
|
|
|
if (!n)
|
2006-03-21 13:34:12 +08:00
|
|
|
return NULL;
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2008-03-24 13:43:56 +08:00
|
|
|
if (IS_TNODE(n)) {
|
2015-01-01 02:55:47 +08:00
|
|
|
iter->tnode = n;
|
2008-03-24 13:43:56 +08:00
|
|
|
iter->index = 0;
|
|
|
|
iter->depth = 1;
|
|
|
|
} else {
|
2015-03-07 01:54:52 +08:00
|
|
|
iter->tnode = pn;
|
2008-03-24 13:43:56 +08:00
|
|
|
iter->index = 0;
|
|
|
|
iter->depth = 0;
|
2005-08-10 11:24:39 +08:00
|
|
|
}
|
2008-03-24 13:43:56 +08:00
|
|
|
|
|
|
|
return n;
|
2005-09-10 04:35:42 +08:00
|
|
|
}
|
2005-08-10 11:24:39 +08:00
|
|
|
|
2005-09-10 04:35:42 +08:00
|
|
|
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
|
|
|
|
{
|
2015-03-07 01:54:08 +08:00
|
|
|
struct key_vector *n;
|
2005-09-10 04:35:42 +08:00
|
|
|
struct fib_trie_iter iter;
|
2005-08-10 11:24:39 +08:00
|
|
|
|
2005-09-10 04:35:42 +08:00
|
|
|
memset(s, 0, sizeof(*s));
|
2005-08-10 11:24:39 +08:00
|
|
|
|
2005-09-10 04:35:42 +08:00
|
|
|
rcu_read_lock();
|
2008-03-24 13:43:56 +08:00
|
|
|
for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
|
2005-09-10 04:35:42 +08:00
|
|
|
if (IS_LEAF(n)) {
|
2015-02-26 07:31:51 +08:00
|
|
|
struct fib_alias *fa;
|
2008-01-23 13:54:05 +08:00
|
|
|
|
2005-09-10 04:35:42 +08:00
|
|
|
s->leaves++;
|
|
|
|
s->totdepth += iter.depth;
|
|
|
|
if (iter.depth > s->maxdepth)
|
|
|
|
s->maxdepth = iter.depth;
|
2008-01-23 13:54:05 +08:00
|
|
|
|
2015-02-26 07:31:51 +08:00
|
|
|
hlist_for_each_entry_rcu(fa, &n->leaf, fa_list)
|
2008-01-23 13:54:05 +08:00
|
|
|
++s->prefixes;
|
2005-09-10 04:35:42 +08:00
|
|
|
} else {
|
|
|
|
s->tnodes++;
|
2015-01-01 02:55:47 +08:00
|
|
|
if (n->bits < MAX_STAT_DEPTH)
|
|
|
|
s->nodesizes[n->bits]++;
|
2015-03-07 01:54:39 +08:00
|
|
|
s->nullpointers += tn_info(n)->empty_children;
|
2005-06-22 03:43:18 +08:00
|
|
|
}
|
|
|
|
}
|
2005-08-26 04:01:29 +08:00
|
|
|
rcu_read_unlock();
|
2005-06-22 03:43:18 +08:00
|
|
|
}
|
|
|
|
|
2005-09-10 04:35:42 +08:00
|
|
|
/*
|
|
|
|
* This outputs /proc/net/fib_triestats
|
|
|
|
*/
|
|
|
|
static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
|
2005-06-22 03:43:18 +08:00
|
|
|
{
|
2010-09-10 07:32:28 +08:00
|
|
|
unsigned int i, max, pointers, bytes, avdepth;
|
2005-07-20 05:01:51 +08:00
|
|
|
|
2005-09-10 04:35:42 +08:00
|
|
|
if (stat->leaves)
|
|
|
|
avdepth = stat->totdepth*100 / stat->leaves;
|
|
|
|
else
|
|
|
|
avdepth = 0;
|
2005-08-10 11:24:39 +08:00
|
|
|
|
2008-01-23 13:53:36 +08:00
|
|
|
seq_printf(seq, "\tAver depth: %u.%02d\n",
|
|
|
|
avdepth / 100, avdepth % 100);
|
2005-09-10 04:35:42 +08:00
|
|
|
seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
|
2005-08-10 11:24:39 +08:00
|
|
|
|
2005-09-10 04:35:42 +08:00
|
|
|
seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
|
2015-03-05 07:02:33 +08:00
|
|
|
bytes = LEAF_SIZE * stat->leaves;
|
2008-01-23 13:54:05 +08:00
|
|
|
|
|
|
|
seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
|
2015-02-26 07:31:51 +08:00
|
|
|
bytes += sizeof(struct fib_alias) * stat->prefixes;
|
2008-01-23 13:54:05 +08:00
|
|
|
|
2008-01-13 12:55:55 +08:00
|
|
|
seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
|
2015-03-05 07:02:33 +08:00
|
|
|
bytes += TNODE_SIZE(0) * stat->tnodes;
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2006-03-21 13:35:01 +08:00
|
|
|
max = MAX_STAT_DEPTH;
|
|
|
|
while (max > 0 && stat->nodesizes[max-1] == 0)
|
2005-09-10 04:35:42 +08:00
|
|
|
max--;
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2005-09-10 04:35:42 +08:00
|
|
|
pointers = 0;
|
2013-07-23 03:01:58 +08:00
|
|
|
for (i = 1; i < max; i++)
|
2005-09-10 04:35:42 +08:00
|
|
|
if (stat->nodesizes[i] != 0) {
|
2008-01-13 12:55:55 +08:00
|
|
|
seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
|
2005-09-10 04:35:42 +08:00
|
|
|
pointers += (1<<i) * stat->nodesizes[i];
|
|
|
|
}
|
|
|
|
seq_putc(seq, '\n');
|
2008-01-13 12:55:55 +08:00
|
|
|
seq_printf(seq, "\tPointers: %u\n", pointers);
|
2005-08-26 04:01:29 +08:00
|
|
|
|
2015-03-07 01:54:08 +08:00
|
|
|
bytes += sizeof(struct key_vector *) * pointers;
|
2008-01-13 12:55:55 +08:00
|
|
|
seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
|
|
|
|
seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
|
2008-01-13 13:23:17 +08:00
|
|
|
}
|
2005-08-26 04:01:29 +08:00
|
|
|
|
2005-09-10 04:35:42 +08:00
|
|
|
#ifdef CONFIG_IP_FIB_TRIE_STATS
|
2008-01-13 13:23:17 +08:00
|
|
|
static void trie_show_usage(struct seq_file *seq,
|
2015-01-01 02:55:29 +08:00
|
|
|
const struct trie_use_stats __percpu *stats)
|
2008-01-13 13:23:17 +08:00
|
|
|
{
|
2015-01-01 02:55:29 +08:00
|
|
|
struct trie_use_stats s = { 0 };
|
|
|
|
int cpu;
|
|
|
|
|
|
|
|
/* loop through all of the CPUs and gather up the stats */
|
|
|
|
for_each_possible_cpu(cpu) {
|
|
|
|
const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);
|
|
|
|
|
|
|
|
s.gets += pcpu->gets;
|
|
|
|
s.backtrack += pcpu->backtrack;
|
|
|
|
s.semantic_match_passed += pcpu->semantic_match_passed;
|
|
|
|
s.semantic_match_miss += pcpu->semantic_match_miss;
|
|
|
|
s.null_node_hit += pcpu->null_node_hit;
|
|
|
|
s.resize_node_skipped += pcpu->resize_node_skipped;
|
|
|
|
}
|
|
|
|
|
2008-01-13 13:23:17 +08:00
|
|
|
seq_printf(seq, "\nCounters:\n---------\n");
|
2015-01-01 02:55:29 +08:00
|
|
|
seq_printf(seq, "gets = %u\n", s.gets);
|
|
|
|
seq_printf(seq, "backtracks = %u\n", s.backtrack);
|
2008-01-23 13:53:36 +08:00
|
|
|
seq_printf(seq, "semantic match passed = %u\n",
|
2015-01-01 02:55:29 +08:00
|
|
|
s.semantic_match_passed);
|
|
|
|
seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
|
|
|
|
seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
|
|
|
|
seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
|
2005-09-10 04:35:42 +08:00
|
|
|
}
|
2008-01-13 13:23:17 +08:00
|
|
|
#endif /* CONFIG_IP_FIB_TRIE_STATS */
|
|
|
|
|
2008-03-24 13:43:56 +08:00
|
|
|
static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
|
2008-01-15 15:11:54 +08:00
|
|
|
{
|
2008-03-24 13:43:56 +08:00
|
|
|
if (tb->tb_id == RT_TABLE_LOCAL)
|
|
|
|
seq_puts(seq, "Local:\n");
|
|
|
|
else if (tb->tb_id == RT_TABLE_MAIN)
|
|
|
|
seq_puts(seq, "Main:\n");
|
|
|
|
else
|
|
|
|
seq_printf(seq, "Id %d:\n", tb->tb_id);
|
2008-01-15 15:11:54 +08:00
|
|
|
}
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2008-03-24 13:43:56 +08:00
|
|
|
|
2005-09-10 04:35:42 +08:00
|
|
|
static int fib_triestat_seq_show(struct seq_file *seq, void *v)
|
|
|
|
{
|
2008-01-10 19:27:17 +08:00
|
|
|
struct net *net = (struct net *)seq->private;
|
2008-03-24 13:43:56 +08:00
|
|
|
unsigned int h;
|
2007-12-07 16:47:47 +08:00
|
|
|
|
2008-01-15 15:11:54 +08:00
|
|
|
seq_printf(seq,
|
2008-01-23 13:53:36 +08:00
|
|
|
"Basic info: size of leaf:"
|
2017-02-28 06:30:02 +08:00
|
|
|
" %zd bytes, size of tnode: %zd bytes.\n",
|
2015-03-05 07:02:33 +08:00
|
|
|
LEAF_SIZE, TNODE_SIZE(0));
|
2008-01-15 15:11:54 +08:00
|
|
|
|
2020-03-26 06:01:00 +08:00
|
|
|
rcu_read_lock();
|
2008-03-24 13:43:56 +08:00
|
|
|
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
|
|
|
|
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
|
|
|
|
struct fib_table *tb;
|
|
|
|
|
hlist: drop the node parameter from iterators
I'm not sure why, but the hlist for each entry iterators were conceived
list_for_each_entry(pos, head, member)
The hlist ones were greedy and wanted an extra parameter:
hlist_for_each_entry(tpos, pos, head, member)
Why did they need an extra pos parameter? I'm not quite sure. Not only
they don't really need it, it also prevents the iterator from looking
exactly like the list iterator, which is unfortunate.
Besides the semantic patch, there was some manual work required:
- Fix up the actual hlist iterators in linux/list.h
- Fix up the declaration of other iterators based on the hlist ones.
- A very small amount of places were using the 'node' parameter, this
was modified to use 'obj->member' instead.
- Coccinelle didn't handle the hlist_for_each_entry_safe iterator
properly, so those had to be fixed up manually.
The semantic patch which is mostly the work of Peter Senna Tschudin is here:
@@
iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host;
type T;
expression a,c,d,e;
identifier b;
statement S;
@@
-T b;
<+... when != b
(
hlist_for_each_entry(a,
- b,
c, d) S
|
hlist_for_each_entry_continue(a,
- b,
c) S
|
hlist_for_each_entry_from(a,
- b,
c) S
|
hlist_for_each_entry_rcu(a,
- b,
c, d) S
|
hlist_for_each_entry_rcu_bh(a,
- b,
c, d) S
|
hlist_for_each_entry_continue_rcu_bh(a,
- b,
c) S
|
for_each_busy_worker(a, c,
- b,
d) S
|
ax25_uid_for_each(a,
- b,
c) S
|
ax25_for_each(a,
- b,
c) S
|
inet_bind_bucket_for_each(a,
- b,
c) S
|
sctp_for_each_hentry(a,
- b,
c) S
|
sk_for_each(a,
- b,
c) S
|
sk_for_each_rcu(a,
- b,
c) S
|
sk_for_each_from
-(a, b)
+(a)
S
+ sk_for_each_from(a) S
|
sk_for_each_safe(a,
- b,
c, d) S
|
sk_for_each_bound(a,
- b,
c) S
|
hlist_for_each_entry_safe(a,
- b,
c, d, e) S
|
hlist_for_each_entry_continue_rcu(a,
- b,
c) S
|
nr_neigh_for_each(a,
- b,
c) S
|
nr_neigh_for_each_safe(a,
- b,
c, d) S
|
nr_node_for_each(a,
- b,
c) S
|
nr_node_for_each_safe(a,
- b,
c, d) S
|
- for_each_gfn_sp(a, c, d, b) S
+ for_each_gfn_sp(a, c, d) S
|
- for_each_gfn_indirect_valid_sp(a, c, d, b) S
+ for_each_gfn_indirect_valid_sp(a, c, d) S
|
for_each_host(a,
- b,
c) S
|
for_each_host_safe(a,
- b,
c, d) S
|
for_each_mesh_entry(a,
- b,
c, d) S
)
...+>
[akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c]
[akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c]
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix warnings]
[akpm@linux-foudnation.org: redo intrusive kvm changes]
Tested-by: Peter Senna Tschudin <peter.senna@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-28 09:06:00 +08:00
|
|
|
hlist_for_each_entry_rcu(tb, head, tb_hlist) {
|
2008-03-24 13:43:56 +08:00
|
|
|
struct trie *t = (struct trie *) tb->tb_data;
|
|
|
|
struct trie_stat stat;
|
2007-12-07 16:47:47 +08:00
|
|
|
|
2008-03-24 13:43:56 +08:00
|
|
|
if (!t)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
fib_table_print(seq, tb);
|
|
|
|
|
|
|
|
trie_collect_stats(t, &stat);
|
|
|
|
trie_show_stats(seq, &stat);
|
|
|
|
#ifdef CONFIG_IP_FIB_TRIE_STATS
|
2015-01-01 02:55:29 +08:00
|
|
|
trie_show_usage(seq, t->stats);
|
2008-03-24 13:43:56 +08:00
|
|
|
#endif
|
|
|
|
}
|
2020-03-26 06:01:00 +08:00
|
|
|
cond_resched_rcu();
|
2008-03-24 13:43:56 +08:00
|
|
|
}
|
2020-03-26 06:01:00 +08:00
|
|
|
rcu_read_unlock();
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2005-09-10 04:35:42 +08:00
|
|
|
return 0;
|
2005-06-22 03:43:18 +08:00
|
|
|
}
|
|
|
|
|
2015-03-07 01:54:08 +08:00
|
|
|
/*
 * Return the node at zero-based position @pos, counting the nodes of
 * all tables in hash-bucket order and, within a table, in the order
 * produced by fib_trie_get_first()/fib_trie_get_next().
 *
 * On success the table owning the node is stored in the iterator kept
 * in seq->private.  Returns NULL when fewer than @pos + 1 nodes exist.
 */
static struct key_vector *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
{
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	loff_t seen = 0;
	unsigned int bucket;

	for (bucket = 0; bucket < FIB_TABLE_HASHSZ; bucket++) {
		struct hlist_head *chain = &net->ipv4.fib_table_hash[bucket];
		struct fib_table *tb;

		hlist_for_each_entry_rcu(tb, chain, tb_hlist) {
			struct key_vector *node;

			node = fib_trie_get_first(iter,
						  (struct trie *)tb->tb_data);
			while (node) {
				/* every visited node consumes one position */
				if (seen++ == pos) {
					iter->tb = tb;
					return node;
				}
				node = fib_trie_get_next(iter);
			}
		}
	}

	return NULL;
}
|
|
|
|
|
2005-09-10 04:35:42 +08:00
|
|
|
/*
 * seq_file start callback: take the RCU read lock and position the
 * iterator on the node with index *pos.  Returns that node, or NULL if
 * there is none.  The lock is dropped again in fib_trie_seq_stop().
 */
static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	void *node;

	rcu_read_lock();
	node = fib_trie_get_idx(seq, *pos);

	return node;
}
|
|
|
|
|
2005-09-10 04:35:42 +08:00
|
|
|
/*
 * seq_file next callback: bump *pos and return the node following the
 * current one.
 *
 * The search proceeds in three stages: the rest of the current table,
 * then the tables that follow it on the same hash chain, then every
 * table on the remaining hash chains.  Whenever the walk moves to a new
 * table, that table is recorded in the iterator.  Returns NULL once all
 * tables are exhausted.
 */
static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	struct fib_table *tb = iter->tb;
	struct hlist_node *next_tb;
	struct key_vector *node;
	unsigned int bucket;

	++*pos;

	/* next node in same table */
	node = fib_trie_get_next(iter);
	if (node)
		return node;

	/* walk rest of this hash chain */
	bucket = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
	for (;;) {
		next_tb = rcu_dereference(hlist_next_rcu(&tb->tb_hlist));
		if (!next_tb)
			break;

		tb = hlist_entry(next_tb, struct fib_table, tb_hlist);
		node = fib_trie_get_first(iter, (struct trie *)tb->tb_data);
		if (node) {
			iter->tb = tb;
			return node;
		}
	}

	/* new hash chain */
	for (bucket++; bucket < FIB_TABLE_HASHSZ; bucket++) {
		struct hlist_head *chain = &net->ipv4.fib_table_hash[bucket];

		hlist_for_each_entry_rcu(tb, chain, tb_hlist) {
			node = fib_trie_get_first(iter,
						  (struct trie *)tb->tb_data);
			if (node) {
				iter->tb = tb;
				return node;
			}
		}
	}

	return NULL;
}
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2005-09-10 04:35:42 +08:00
|
|
|
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
|
2008-01-13 13:25:02 +08:00
|
|
|
__releases(RCU)
|
2005-06-22 03:43:18 +08:00
|
|
|
{
|
2005-09-10 04:35:42 +08:00
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
2005-08-10 11:24:39 +08:00
|
|
|
|
2005-09-10 04:35:42 +08:00
|
|
|
/*
 * Emit @n copies of the indentation string to @seq; nothing is written
 * when @n is zero or negative.
 */
static void seq_indent(struct seq_file *seq, int n)
{
	for (; n > 0; n--)
		seq_puts(seq, " ");
}
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2008-01-15 15:09:56 +08:00
|
|
|
static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
|
2005-09-10 04:35:42 +08:00
|
|
|
{
|
2007-03-09 12:44:43 +08:00
|
|
|
switch (s) {
|
2005-09-10 04:35:42 +08:00
|
|
|
case RT_SCOPE_UNIVERSE: return "universe";
|
|
|
|
case RT_SCOPE_SITE: return "site";
|
|
|
|
case RT_SCOPE_LINK: return "link";
|
|
|
|
case RT_SCOPE_HOST: return "host";
|
|
|
|
case RT_SCOPE_NOWHERE: return "nowhere";
|
|
|
|
default:
|
2008-01-15 15:09:56 +08:00
|
|
|
snprintf(buf, len, "scope=%d", s);
|
2005-09-10 04:35:42 +08:00
|
|
|
return buf;
|
|
|
|
}
|
|
|
|
}
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2009-08-06 01:42:58 +08:00
|
|
|
static const char *const rtn_type_names[__RTN_MAX] = {
|
2005-09-10 04:35:42 +08:00
|
|
|
[RTN_UNSPEC] = "UNSPEC",
|
|
|
|
[RTN_UNICAST] = "UNICAST",
|
|
|
|
[RTN_LOCAL] = "LOCAL",
|
|
|
|
[RTN_BROADCAST] = "BROADCAST",
|
|
|
|
[RTN_ANYCAST] = "ANYCAST",
|
|
|
|
[RTN_MULTICAST] = "MULTICAST",
|
|
|
|
[RTN_BLACKHOLE] = "BLACKHOLE",
|
|
|
|
[RTN_UNREACHABLE] = "UNREACHABLE",
|
|
|
|
[RTN_PROHIBIT] = "PROHIBIT",
|
|
|
|
[RTN_THROW] = "THROW",
|
|
|
|
[RTN_NAT] = "NAT",
|
|
|
|
[RTN_XRESOLVE] = "XRESOLVE",
|
|
|
|
};
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2010-09-10 07:32:28 +08:00
|
|
|
static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
|
2005-09-10 04:35:42 +08:00
|
|
|
{
|
|
|
|
if (t < __RTN_MAX && rtn_type_names[t])
|
|
|
|
return rtn_type_names[t];
|
2008-01-15 15:09:56 +08:00
|
|
|
snprintf(buf, len, "type %u", t);
|
2005-09-10 04:35:42 +08:00
|
|
|
return buf;
|
2005-06-22 03:43:18 +08:00
|
|
|
}
|
|
|
|
|
2005-09-10 04:35:42 +08:00
|
|
|
/* Pretty print the trie */
|
|
|
|
static int fib_trie_seq_show(struct seq_file *seq, void *v)
|
2005-06-22 03:43:18 +08:00
|
|
|
{
|
2005-09-10 04:35:42 +08:00
|
|
|
const struct fib_trie_iter *iter = seq->private;
|
2015-03-07 01:54:08 +08:00
|
|
|
struct key_vector *n = v;
|
2005-07-20 05:01:51 +08:00
|
|
|
|
2015-03-07 01:54:52 +08:00
|
|
|
if (IS_TRIE(node_parent_rcu(n)))
|
2008-03-24 13:43:56 +08:00
|
|
|
fib_table_print(seq, iter->tb);
|
2007-01-27 11:06:01 +08:00
|
|
|
|
2005-09-10 04:35:42 +08:00
|
|
|
if (IS_TNODE(n)) {
|
2015-01-01 02:55:47 +08:00
|
|
|
__be32 prf = htonl(n->key);
|
2005-08-10 11:24:39 +08:00
|
|
|
|
2015-01-01 02:56:12 +08:00
|
|
|
seq_indent(seq, iter->depth-1);
|
|
|
|
seq_printf(seq, " +-- %pI4/%zu %u %u %u\n",
|
|
|
|
&prf, KEYLENGTH - n->pos - n->bits, n->bits,
|
2015-03-07 01:54:39 +08:00
|
|
|
tn_info(n)->full_children,
|
|
|
|
tn_info(n)->empty_children);
|
2005-09-10 04:35:42 +08:00
|
|
|
} else {
|
2015-01-01 02:55:47 +08:00
|
|
|
__be32 val = htonl(n->key);
|
2015-02-26 07:31:51 +08:00
|
|
|
struct fib_alias *fa;
|
2005-09-10 04:35:42 +08:00
|
|
|
|
|
|
|
seq_indent(seq, iter->depth);
|
2008-10-31 15:53:57 +08:00
|
|
|
seq_printf(seq, " |-- %pI4\n", &val);
|
2008-01-23 13:54:37 +08:00
|
|
|
|
2015-02-26 07:31:51 +08:00
|
|
|
hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
|
|
|
|
char buf1[32], buf2[32];
|
|
|
|
|
|
|
|
seq_indent(seq, iter->depth + 1);
|
|
|
|
seq_printf(seq, " /%zu %s %s",
|
|
|
|
KEYLENGTH - fa->fa_slen,
|
|
|
|
rtn_scope(buf1, sizeof(buf1),
|
|
|
|
fa->fa_info->fib_scope),
|
|
|
|
rtn_type(buf2, sizeof(buf2),
|
|
|
|
fa->fa_type));
|
|
|
|
if (fa->fa_tos)
|
|
|
|
seq_printf(seq, " tos=%d", fa->fa_tos);
|
|
|
|
seq_putc(seq, '\n');
|
2005-09-10 04:35:42 +08:00
|
|
|
}
|
2005-06-22 03:43:18 +08:00
|
|
|
}
|
2005-09-10 04:35:42 +08:00
|
|
|
|
2005-06-22 03:43:18 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2007-03-13 05:34:29 +08:00
|
|
|
static const struct seq_operations fib_trie_seq_ops = {
|
2005-09-10 04:35:42 +08:00
|
|
|
.start = fib_trie_seq_start,
|
|
|
|
.next = fib_trie_seq_next,
|
|
|
|
.stop = fib_trie_seq_stop,
|
|
|
|
.show = fib_trie_seq_show,
|
2005-06-22 03:43:18 +08:00
|
|
|
};
|
|
|
|
|
2008-02-12 13:14:39 +08:00
|
|
|
struct fib_route_iter {
|
|
|
|
struct seq_net_private p;
|
2015-03-05 06:59:19 +08:00
|
|
|
struct fib_table *main_tb;
|
2015-03-07 01:54:08 +08:00
|
|
|
struct key_vector *tnode;
|
2008-02-12 13:14:39 +08:00
|
|
|
loff_t pos;
|
|
|
|
t_key key;
|
|
|
|
};
|
|
|
|
|
2015-03-07 01:54:08 +08:00
|
|
|
static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter,
|
|
|
|
loff_t pos)
|
2008-02-12 13:14:39 +08:00
|
|
|
{
|
2015-03-07 01:54:08 +08:00
|
|
|
struct key_vector *l, **tp = &iter->tnode;
|
2015-03-05 06:59:19 +08:00
|
|
|
t_key key;
|
2008-02-12 13:14:39 +08:00
|
|
|
|
2016-11-05 03:11:57 +08:00
|
|
|
/* use cached location of previously found key */
|
2015-03-05 06:59:19 +08:00
|
|
|
if (iter->pos > 0 && pos >= iter->pos) {
|
|
|
|
key = iter->key;
|
|
|
|
} else {
|
2016-11-05 03:11:57 +08:00
|
|
|
iter->pos = 1;
|
2015-03-05 06:59:19 +08:00
|
|
|
key = 0;
|
2008-02-12 13:14:39 +08:00
|
|
|
}
|
|
|
|
|
2016-11-05 03:11:57 +08:00
|
|
|
pos -= iter->pos;
|
|
|
|
|
|
|
|
while ((l = leaf_walk_rcu(tp, key)) && (pos-- > 0)) {
|
2015-03-05 06:59:19 +08:00
|
|
|
key = l->key + 1;
|
2008-02-12 13:14:39 +08:00
|
|
|
iter->pos++;
|
2015-03-05 06:59:19 +08:00
|
|
|
l = NULL;
|
|
|
|
|
|
|
|
/* handle unlikely case of a key wrap */
|
|
|
|
if (!key)
|
|
|
|
break;
|
2008-02-12 13:14:39 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (l)
|
2016-11-05 03:11:57 +08:00
|
|
|
iter->key = l->key; /* remember it */
|
2008-02-12 13:14:39 +08:00
|
|
|
else
|
|
|
|
iter->pos = 0; /* forget it */
|
|
|
|
|
|
|
|
return l;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * seq_file start callback for the route dump.
 *
 * Takes the RCU read lock (released in fib_route_seq_stop()) and looks
 * up the main table.  Returns NULL if there is no main table.  For
 * *pos == 0 the iterator is reset and SEQ_START_TOKEN is returned;
 * otherwise the leaf at position *pos is looked up with
 * fib_route_get_idx().
 */
static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct fib_route_iter *iter = seq->private;
	struct fib_table *main_tb;

	rcu_read_lock();

	main_tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
	if (!main_tb)
		return NULL;

	iter->main_tb = main_tb;
	iter->tnode = ((struct trie *)main_tb->tb_data)->kv;

	if (*pos == 0) {
		/* fresh dump: no cached position yet */
		iter->pos = 0;
		iter->key = KEY_MAX;

		return SEQ_START_TOKEN;
	}

	return fib_route_get_idx(iter, *pos);
}
|
|
|
|
|
|
|
|
/*
 * seq_file next callback for the route dump: bump *pos and return the
 * first leaf whose key is at or after the last returned key plus one.
 *
 * When a leaf is found its key is cached and the cached position is
 * advanced.  Otherwise the cached position is reset and NULL is
 * returned.
 */
static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_route_iter *iter = seq->private;
	t_key next_key = iter->key + 1;
	struct key_vector *leaf;

	++*pos;

	/* only allow key of 0 for start of sequence */
	if (v == SEQ_START_TOKEN || next_key)
		leaf = leaf_walk_rcu(&iter->tnode, next_key);
	else
		leaf = NULL;

	if (!leaf) {
		iter->pos = 0;
		return NULL;
	}

	iter->key = leaf->key;
	iter->pos++;

	return leaf;
}
|
|
|
|
|
|
|
|
static void fib_route_seq_stop(struct seq_file *seq, void *v)
|
|
|
|
__releases(RCU)
|
|
|
|
{
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
|
2019-06-04 11:19:49 +08:00
|
|
|
/*
 * Build the RTF_* flag word shown in the route dump.
 *
 * RTF_UP is always set.  RTF_REJECT is added for unreachable and
 * prohibit route types, RTF_GATEWAY when @fi is given and its first
 * nexthop has a non-zero IPv4 gateway field, and RTF_HOST when @mask is
 * all ones.
 */
static unsigned int fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
{
	unsigned int flags = RTF_UP;

	switch (type) {
	case RTN_UNREACHABLE:
	case RTN_PROHIBIT:
		flags |= RTF_REJECT;
		break;
	default:
		break;
	}

	if (fi) {
		const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);

		if (nhc->nhc_gw.ipv4)
			flags |= RTF_GATEWAY;
	}

	if (mask == htonl(0xFFFFFFFF))
		flags |= RTF_HOST;

	return flags;
}
|
|
|
|
|
2005-09-10 04:35:42 +08:00
|
|
|
/*
|
|
|
|
* This outputs /proc/net/route.
|
|
|
|
* The format of the file is not supposed to be changed
|
2010-09-10 07:32:28 +08:00
|
|
|
* and needs to be same as fib_hash output to avoid breaking
|
2005-09-10 04:35:42 +08:00
|
|
|
* legacy utilities
|
|
|
|
*/
|
|
|
|
static int fib_route_seq_show(struct seq_file *seq, void *v)
|
2005-06-22 03:43:18 +08:00
|
|
|
{
|
2015-03-12 07:36:08 +08:00
|
|
|
struct fib_route_iter *iter = seq->private;
|
|
|
|
struct fib_table *tb = iter->main_tb;
|
2015-02-26 07:31:51 +08:00
|
|
|
struct fib_alias *fa;
|
2015-03-07 01:54:08 +08:00
|
|
|
struct key_vector *l = v;
|
2015-02-26 07:31:44 +08:00
|
|
|
__be32 prefix;
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2005-09-10 04:35:42 +08:00
|
|
|
if (v == SEQ_START_TOKEN) {
|
|
|
|
seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
|
|
|
|
"\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
|
|
|
|
"\tWindow\tIRTT");
|
|
|
|
return 0;
|
|
|
|
}
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2015-02-26 07:31:44 +08:00
|
|
|
prefix = htonl(l->key);
|
|
|
|
|
2015-02-26 07:31:51 +08:00
|
|
|
hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
|
2019-06-04 11:19:49 +08:00
|
|
|
struct fib_info *fi = fa->fa_info;
|
2015-02-26 07:31:51 +08:00
|
|
|
__be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen);
|
|
|
|
unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2015-02-26 07:31:51 +08:00
|
|
|
if ((fa->fa_type == RTN_BROADCAST) ||
|
|
|
|
(fa->fa_type == RTN_MULTICAST))
|
|
|
|
continue;
|
2005-06-22 03:43:18 +08:00
|
|
|
|
2015-03-12 07:36:08 +08:00
|
|
|
if (fa->tb_id != tb->tb_id)
|
|
|
|
continue;
|
|
|
|
|
2015-02-26 07:31:51 +08:00
|
|
|
seq_setwidth(seq, 127);
|
|
|
|
|
2019-06-04 11:19:49 +08:00
|
|
|
if (fi) {
|
2019-06-04 11:19:50 +08:00
|
|
|
struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
|
|
|
|
__be32 gw = 0;
|
|
|
|
|
|
|
|
if (nhc->nhc_gw_family == AF_INET)
|
|
|
|
gw = nhc->nhc_gw.ipv4;
|
2019-06-04 11:19:49 +08:00
|
|
|
|
2015-02-26 07:31:51 +08:00
|
|
|
seq_printf(seq,
|
|
|
|
"%s\t%08X\t%08X\t%04X\t%d\t%u\t"
|
|
|
|
"%d\t%08X\t%d\t%u\t%u",
|
2019-06-04 11:19:50 +08:00
|
|
|
nhc->nhc_dev ? nhc->nhc_dev->name : "*",
|
|
|
|
prefix, gw, flags, 0, 0,
|
2015-02-26 07:31:51 +08:00
|
|
|
fi->fib_priority,
|
|
|
|
mask,
|
|
|
|
(fi->fib_advmss ?
|
|
|
|
fi->fib_advmss + 40 : 0),
|
|
|
|
fi->fib_window,
|
|
|
|
fi->fib_rtt >> 3);
|
2019-06-04 11:19:49 +08:00
|
|
|
} else {
|
2015-02-26 07:31:51 +08:00
|
|
|
seq_printf(seq,
|
|
|
|
"*\t%08X\t%08X\t%04X\t%d\t%u\t"
|
|
|
|
"%d\t%08X\t%d\t%u\t%u",
|
|
|
|
prefix, 0, flags, 0, 0, 0,
|
|
|
|
mask, 0, 0, 0);
|
2019-06-04 11:19:49 +08:00
|
|
|
}
|
2015-02-26 07:31:51 +08:00
|
|
|
seq_pad(seq, '\n');
|
2005-06-22 03:43:18 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2007-03-13 05:34:29 +08:00
|
|
|
static const struct seq_operations fib_route_seq_ops = {
|
2008-02-12 13:14:39 +08:00
|
|
|
.start = fib_route_seq_start,
|
|
|
|
.next = fib_route_seq_next,
|
|
|
|
.stop = fib_route_seq_stop,
|
2005-09-10 04:35:42 +08:00
|
|
|
.show = fib_route_seq_show,
|
2005-06-22 03:43:18 +08:00
|
|
|
};
|
|
|
|
|
2008-01-10 19:21:09 +08:00
|
|
|
/*
 * Create the "fib_trie", "fib_triestat" and "route" entries under the
 * proc directory of namespace @net.
 *
 * Returns 0 on success.  If any entry cannot be created, the entries
 * created so far are removed again and -ENOMEM is returned.
 */
int __net_init fib_proc_init(struct net *net)
{
	if (!proc_create_net("fib_trie", 0444, net->proc_net,
			     &fib_trie_seq_ops,
			     sizeof(struct fib_trie_iter)))
		goto err;

	if (!proc_create_net_single("fib_triestat", 0444, net->proc_net,
				    fib_triestat_seq_show, NULL))
		goto err_remove_trie;

	if (!proc_create_net("route", 0444, net->proc_net,
			     &fib_route_seq_ops,
			     sizeof(struct fib_route_iter)))
		goto err_remove_triestat;

	return 0;

	/* unwind in reverse order of creation */
err_remove_triestat:
	remove_proc_entry("fib_triestat", net->proc_net);
err_remove_trie:
	remove_proc_entry("fib_trie", net->proc_net);
err:
	return -ENOMEM;
}
|
|
|
|
|
2008-01-10 19:21:09 +08:00
|
|
|
/*
 * Remove the three proc entries that fib_proc_init() creates for
 * namespace @net.
 */
void __net_exit fib_proc_exit(struct net *net)
{
	struct proc_dir_entry *dir = net->proc_net;

	remove_proc_entry("fib_trie", dir);
	remove_proc_entry("fib_triestat", dir);
	remove_proc_entry("route", dir);
}
|
|
|
|
|
|
|
|
#endif /* CONFIG_PROC_FS */
|