net/ipv6: Add knob to skip DELROUTE message on device down

Another difference between IPv4 and IPv6 is the generation of RTM_DELROUTE
notifications when a device is taken down (admin down) or deleted. IPv4
does not generate a message for routes evicted by the down or delete;
IPv6 does. A NOS at scale really needs to avoid these messages and have
IPv4 and IPv6 behave similarly, relying on userspace to handle link
notifications and evict the routes.

At this point existing user behavior needs to be preserved. Since
notifications are a global action (not per app) the only way to preserve
existing behavior and allow the messages to be skipped is to add a new
sysctl (net/ipv6/route/skip_notify_on_dev_down) which can be set to
disable the notifications.

IPv6 route code already supports the option to skip the message (it is
used for multipath routes for example). Besides the new sysctl we need
to pass the skip_notify setting through the generic fib6_clean and
fib6_walk functions to fib6_clean_node and to set skip_notify on calls
to __ip6_del_rt for the addrconf_ifdown path.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David Ahern 2018-10-11 20:17:21 -07:00 committed by David S. Miller
parent 7cc2d504da
commit 7c6bb7d2fa
5 changed files with 46 additions and 6 deletions

View File

@ -1442,6 +1442,14 @@ max_hbh_length - INTEGER
header.
Default: INT_MAX (unlimited)
skip_notify_on_dev_down - BOOLEAN
Controls whether an RTM_DELROUTE message is generated for routes
removed when a device is taken down or deleted. IPv4 does not
generate this message; IPv6 does by default. Setting this sysctl
to true skips the message, making IPv4 and IPv6 on par in relying
on userspace caches to track link events and evict routes.
Default: false (generate message)
IPv6 Fragmentation:
ip6frag_high_thresh - INTEGER

View File

@ -407,6 +407,9 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *arg),
void *arg);
/*
 * Same walk as fib6_clean_all(), but requests that notifications be
 * skipped for routes removed by the walk (skip_notify is passed as true).
 */
void fib6_clean_all_skip_notify(struct net *net,
int (*func)(struct fib6_info *, void *arg),
void *arg);
int fib6_add(struct fib6_node *root, struct fib6_info *rt,
struct nl_info *info, struct netlink_ext_ack *extack);

View File

@ -45,6 +45,7 @@ struct netns_sysctl_ipv6 {
int max_dst_opts_len;
int max_hbh_opts_len;
int seg6_flowlabel;
/*
 * Kept as int rather than bool: the sysctl entry backing this field
 * declares .maxlen = sizeof(int) and uses a proc_dointvec-family
 * handler, which reads and writes a full int through the data pointer.
 * A one-byte bool would be overrun (and mis-set on big-endian).
 */
int skip_notify_on_dev_down;
};
struct netns_ipv6 {

View File

@ -47,6 +47,7 @@ struct fib6_cleaner {
int (*func)(struct fib6_info *, void *arg);
int sernum;
void *arg;
bool skip_notify;
};
#ifdef CONFIG_IPV6_SUBTREES
@ -1956,6 +1957,7 @@ static int fib6_clean_node(struct fib6_walker *w)
struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w);
struct nl_info info = {
.nl_net = c->net,
.skip_notify = c->skip_notify,
};
if (c->sernum != FIB6_NO_SERNUM_CHANGE &&
@ -2007,7 +2009,7 @@ static int fib6_clean_node(struct fib6_walker *w)
static void fib6_clean_tree(struct net *net, struct fib6_node *root,
int (*func)(struct fib6_info *, void *arg),
int sernum, void *arg)
int sernum, void *arg, bool skip_notify)
{
struct fib6_cleaner c;
@ -2019,13 +2021,14 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root,
c.sernum = sernum;
c.arg = arg;
c.net = net;
c.skip_notify = skip_notify;
fib6_walk(net, &c.w);
}
static void __fib6_clean_all(struct net *net,
int (*func)(struct fib6_info *, void *),
int sernum, void *arg)
int sernum, void *arg, bool skip_notify)
{
struct fib6_table *table;
struct hlist_head *head;
@ -2037,7 +2040,7 @@ static void __fib6_clean_all(struct net *net,
hlist_for_each_entry_rcu(table, head, tb6_hlist) {
spin_lock_bh(&table->tb6_lock);
fib6_clean_tree(net, &table->tb6_root,
func, sernum, arg);
func, sernum, arg, skip_notify);
spin_unlock_bh(&table->tb6_lock);
}
}
@ -2047,14 +2050,21 @@ static void __fib6_clean_all(struct net *net,
void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *),
void *arg)
{
__fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg);
__fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false);
}
/*
 * Run @func over every table via __fib6_clean_all() without changing the
 * serial number, asking the walk to skip notifications.
 */
void fib6_clean_all_skip_notify(struct net *net,
				int (*func)(struct fib6_info *, void *),
				void *arg)
{
	const bool skip_notify = true;

	__fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, skip_notify);
}
static void fib6_flush_trees(struct net *net)
{
int new_sernum = fib6_new_sernum(net);
__fib6_clean_all(net, NULL, new_sernum, NULL);
__fib6_clean_all(net, NULL, new_sernum, NULL, false);
}
/*

View File

@ -4026,8 +4026,12 @@ void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
.event = event,
},
};
struct net *net = dev_net(dev);
fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
if (net->ipv6.sysctl.skip_notify_on_dev_down)
fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
else
fib6_clean_all(net, fib6_ifdown, &arg);
}
void rt6_disable_ip(struct net_device *dev, unsigned long event)
@ -5031,6 +5035,9 @@ int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
return 0;
}
/* Bounds referenced via extra1/extra2 by the skip_notify_on_dev_down entry. */
static int zero;
static int one = 1;
static struct ctl_table ipv6_route_table_template[] = {
{
.procname = "flush",
@ -5102,6 +5109,15 @@ static struct ctl_table ipv6_route_table_template[] = {
.mode = 0644,
.proc_handler = proc_dointvec_ms_jiffies,
},
{
	/*
	 * Boolean knob limited to 0..1 through extra1/extra2.  The
	 * _minmax handler is needed for that: plain proc_dointvec does
	 * not consult extra1/extra2 and would accept any integer.
	 * NOTE(review): the handler accesses sizeof(int) bytes through
	 * .data; confirm the backing field is int-sized.
	 */
	.procname	= "skip_notify_on_dev_down",
	.data		= &init_net.ipv6.sysctl.skip_notify_on_dev_down,
	.maxlen		= sizeof(int),
	.mode		= 0644,
	.proc_handler	= proc_dointvec_minmax,
	.extra1		= &zero,
	.extra2		= &one,
},
{ }
};
@ -5125,6 +5141,7 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
/* Don't export sysctls to unprivileged users */
if (net->user_ns != &init_user_ns)
@ -5189,6 +5206,7 @@ static int __net_init ip6_route_net_init(struct net *net)
net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
net->ipv6.sysctl.skip_notify_on_dev_down = 0;
net->ipv6.ip6_rt_gc_expire = 30*HZ;