From a2e2ff560f5113e8ca31432fbd985f5f1889efdc Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 16 Jun 2016 16:24:24 -0700 Subject: [PATCH 1/3] net: ipv6: Move ip6_route_get_saddr to inline VRF driver needs access to ip6_route_get_saddr code. Since it does little beyond ipv6_dev_get_saddr and ipv6_dev_get_saddr is already exported for modules move ip6_route_get_saddr to the header as an inline. Code move only; no functional change. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 21 ++++++++++++++++++--- net/ipv6/route.c | 17 ----------------- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index f55bf3d294aa..d97305d0e71f 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -18,6 +18,7 @@ struct route_info { __u8 prefix[0]; /* 0,8 or 16 */ }; +#include #include #include #include @@ -88,9 +89,23 @@ int ip6_route_add(struct fib6_config *cfg); int ip6_ins_rt(struct rt6_info *); int ip6_del_rt(struct rt6_info *); -int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, - const struct in6_addr *daddr, unsigned int prefs, - struct in6_addr *saddr); +static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, + const struct in6_addr *daddr, + unsigned int prefs, + struct in6_addr *saddr) +{ + struct inet6_dev *idev = + rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL; + int err = 0; + + if (rt && rt->rt6i_prefsrc.plen) + *saddr = rt->rt6i_prefsrc.addr; + else + err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL, + daddr, prefs, saddr); + + return err; +} struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr, int oif, int flags); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 9e1516785dac..08b77f421268 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2586,23 +2586,6 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, return rt; } -int ip6_route_get_saddr(struct net *net, - struct rt6_info *rt, - const struct in6_addr *daddr, - unsigned int prefs, - struct in6_addr *saddr) -{ - struct inet6_dev *idev = - rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL; - int err = 0; - if (rt && rt->rt6i_prefsrc.plen) - *saddr = rt->rt6i_prefsrc.addr; - else - err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL, - daddr, prefs, saddr); - return err; -} - /* remove deleted ip from prefsrc entries */ struct arg_dev_net_ip { struct net_device *dev; From 0d240e7811c4ec1965760ee4643b5bbc9cfacbb3 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 16 Jun 2016 16:24:25 -0700 Subject: [PATCH 2/3] net: vrf: Implement get_saddr for IPv6 IPv6 source address selection needs to consider the real egress route. Similar to IPv4 implement a get_saddr6 method which is called if source address has not been set. The get_saddr6 method does a full lookup which means pulling a route from the VRF FIB table and properly considering linklocal/multicast destination addresses. Lookup failures (eg., unreachable) then cause the source address selection to fail which gets propagated back to the caller. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 41 +++++++++++++++++++++++++++++++++++++++++ include/net/l3mdev.h | 11 +++++++++++ net/ipv6/ip6_output.c | 12 ++++++++++-- net/l3mdev/l3mdev.c | 24 ++++++++++++++++++++++++ 4 files changed, 86 insertions(+), 2 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 32173aa9208e..b3762822b653 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -999,6 +999,46 @@ static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev, return dst; } + +/* called under rcu_read_lock */ +static int vrf_get_saddr6(struct net_device *dev, const struct sock *sk, + struct flowi6 *fl6) +{ + struct net *net = dev_net(dev); + struct dst_entry *dst; + struct rt6_info *rt; + int err; + + if (rt6_need_strict(&fl6->daddr)) { + rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, + RT6_LOOKUP_F_IFACE); + if (unlikely(!rt)) + return 0; + + dst = &rt->dst; + } else { + __u8 flags = fl6->flowi6_flags; + + fl6->flowi6_flags |= FLOWI_FLAG_L3MDEV_SRC; + fl6->flowi6_flags |= FLOWI_FLAG_SKIP_NH_OIF; + + dst = ip6_route_output(net, sk, fl6); + rt = (struct rt6_info *)dst; + + fl6->flowi6_flags = flags; + } + + err = dst->error; + if (!err) { + err = ip6_route_get_saddr(net, rt, &fl6->daddr, + sk ? inet6_sk(sk)->srcprefs : 0, + &fl6->saddr); + } + + dst_release(dst); + + return err; +} #endif static const struct l3mdev_ops vrf_l3mdev_ops = { @@ -1008,6 +1048,7 @@ static const struct l3mdev_ops vrf_l3mdev_ops = { .l3mdev_l3_rcv = vrf_l3_rcv, #if IS_ENABLED(CONFIG_IPV6) .l3mdev_get_rt6_dst = vrf_get_rt6_dst, + .l3mdev_get_saddr6 = vrf_get_saddr6, #endif }; diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index f8a416ec674c..818fd4f100fc 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -39,6 +39,9 @@ struct l3mdev_ops { /* IPv6 ops */ struct dst_entry * (*l3mdev_get_rt6_dst)(const struct net_device *dev, struct flowi6 *fl6); + int (*l3mdev_get_saddr6)(struct net_device *dev, + const struct sock *sk, + struct flowi6 *fl6); }; #ifdef CONFIG_NET_L3_MASTER_DEV @@ -140,6 +143,8 @@ static inline bool netif_index_is_l3_master(struct net *net, int ifindex) int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4); struct dst_entry *l3mdev_get_rt6_dst(struct net *net, struct flowi6 *fl6); +int l3mdev_get_saddr6(struct net *net, const struct sock *sk, + struct flowi6 *fl6); static inline struct sk_buff *l3mdev_l3_rcv(struct sk_buff *skb, u16 proto) @@ -230,6 +235,12 @@ struct dst_entry *l3mdev_get_rt6_dst(struct net *net, struct flowi6 *fl6) return NULL; } +static inline int l3mdev_get_saddr6(struct net *net, const struct sock *sk, + struct flowi6 *fl6) +{ + return 0; +} + static inline struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb) { diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index fd3217579b65..1dfc402d9ad1 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -910,6 +910,13 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, int err; int flags = 0; + if (ipv6_addr_any(&fl6->saddr) && fl6->flowi6_oif && + (!*dst || !(*dst)->error)) { + err = l3mdev_get_saddr6(net, sk, fl6); + if (err) + goto out_err; + } + /* The correct way to handle this would be to do * ip6_route_get_saddr, and then ip6_route_output; however, * the route-specific preferred source forces the @@ -999,10 +1006,11 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, return 0; out_err_release: - if (err == -ENETUNREACH) - IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES); dst_release(*dst); *dst = NULL; +out_err: + if (err == -ENETUNREACH) + IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES); return err; } diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index d90e4ef09e85..c4a1c3e84e12 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -162,6 +162,30 @@ int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4) } EXPORT_SYMBOL_GPL(l3mdev_get_saddr); +int l3mdev_get_saddr6(struct net *net, const struct sock *sk, + struct flowi6 *fl6) +{ + struct net_device *dev; + int rc = 0; + + if (fl6->flowi6_oif) { + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); + if (dev && netif_is_l3_slave(dev)) + dev = netdev_master_upper_dev_get_rcu(dev); + + if (dev && netif_is_l3_master(dev) && + dev->l3mdev_ops->l3mdev_get_saddr6) + rc = dev->l3mdev_ops->l3mdev_get_saddr6(dev, sk, fl6); + + rcu_read_unlock(); + } + + return rc; +} +EXPORT_SYMBOL_GPL(l3mdev_get_saddr6); + /** * l3mdev_fib_rule_match - Determine if flowi references an * L3 master device From afbac6010aec514998214fb19a1f37732b7a1d77 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 16 Jun 2016 16:24:26 -0700 Subject: [PATCH 3/3] net: ipv6: Address selection needs to consider L3 domains IPv6 version of 3f2fb9a834cb ("net: l3mdev: address selection should only consider devices in L3 domain") and the follow up commit, a17b693cdd876 ("net: l3mdev: prefer VRF master for source address selection"). That is, if outbound device is given then the address preference order is an address from that device, an address from the master device if it is enslaved, and then an address from a device in the same L3 domain. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/l3mdev.h | 31 ++++++++++++++++++++++++++++ net/ipv6/addrconf.c | 48 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 818fd4f100fc..e90095091aa0 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -79,6 +79,31 @@ static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex) return rc; } +static inline +const struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev) +{ + /* netdev_master_upper_dev_get_rcu calls + * list_first_or_null_rcu to walk the upper dev list. + * list_first_or_null_rcu does not handle a const arg. We aren't + * making changes, just want the master device from that list so + * typecast to remove the const + */ + struct net_device *dev = (struct net_device *)_dev; + const struct net_device *master; + + if (!dev) + return NULL; + + if (netif_is_l3_master(dev)) + master = dev; + else if (netif_is_l3_slave(dev)) + master = netdev_master_upper_dev_get_rcu(dev); + else + master = NULL; + + return master; +} + /* get index of an interface to use for FIB lookups. For devices * enslaved to an L3 master device FIB lookups are based on the * master index @@ -190,6 +215,12 @@ static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex) return 0; } +static inline +const struct net_device *l3mdev_master_dev_rcu(const struct net_device *dev) +{ + return NULL; +} + static inline int l3mdev_fib_oif_rcu(struct net_device *dev) { return dev ? dev->ifindex : 0; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6c8fc3f96b11..a1f6b7b31531 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1524,6 +1524,28 @@ out: return hiscore_idx; } +static int ipv6_get_saddr_master(struct net *net, + const struct net_device *dst_dev, + const struct net_device *master, + struct ipv6_saddr_dst *dst, + struct ipv6_saddr_score *scores, + int hiscore_idx) +{ + struct inet6_dev *idev; + + idev = __in6_dev_get(dst_dev); + if (idev) + hiscore_idx = __ipv6_dev_get_saddr(net, dst, idev, + scores, hiscore_idx); + + idev = __in6_dev_get(master); + if (idev) + hiscore_idx = __ipv6_dev_get_saddr(net, dst, idev, + scores, hiscore_idx); + + return hiscore_idx; +} + int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, const struct in6_addr *daddr, unsigned int prefs, struct in6_addr *saddr) @@ -1577,13 +1599,39 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, if (idev) hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx); } else { + const struct net_device *master; + int master_idx = 0; + + /* if dst_dev exists and is enslaved to an L3 device, then + * prefer addresses from dst_dev and then the master over + * any other enslaved devices in the L3 domain. + */ + master = l3mdev_master_dev_rcu(dst_dev); + if (master) { + master_idx = master->ifindex; + + hiscore_idx = ipv6_get_saddr_master(net, dst_dev, + master, &dst, + scores, hiscore_idx); + + if (scores[hiscore_idx].ifa) + goto out; + } + for_each_netdev_rcu(net, dev) { + /* only consider addresses on devices in the + * same L3 domain + */ + if (l3mdev_master_ifindex_rcu(dev) != master_idx) + continue; idev = __in6_dev_get(dev); if (!idev) continue; hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx); } } + +out: rcu_read_unlock(); hiscore = &scores[hiscore_idx];