From 932bc4d7a53ba418de67fdab533248df5b36c752 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Thu, 21 Mar 2013 11:57:58 +0200
Subject: [PATCH 01/49] net: add skb_dst_set_noref_force

Rename skb_dst_set_noref to __skb_dst_set_noref and
add force flag as suggested by David Miller. The new wrapper
skb_dst_set_noref_force will force dst entries that are not
cached to be attached as skb dst without taking reference
as long as provided dst is reclaimed after RCU grace period.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off by: Hans Schillstrom <hans@schillstrom.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/linux/skbuff.h | 35 ++++++++++++++++++++++++++++++++++-
 net/core/dst.c         |  9 +++++----
 2 files changed, 39 insertions(+), 5 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 878e0ee81068..364e2440a7ee 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -575,7 +575,40 @@ static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst)
 	skb->_skb_refdst = (unsigned long)dst;
 }
 
-extern void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst);
+extern void __skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst,
+				bool force);
+
+/**
+ * skb_dst_set_noref - sets skb dst, hopefully, without taking reference
+ * @skb: buffer
+ * @dst: dst entry
+ *
+ * Sets skb dst, assuming a reference was not taken on dst.
+ * If dst entry is cached, we do not take reference and dst_release
+ * will be avoided by refdst_drop. If dst entry is not cached, we take
+ * reference, so that last dst_release can destroy the dst immediately.
+ */
+static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
+{
+	__skb_dst_set_noref(skb, dst, false);
+}
+
+/**
+ * skb_dst_set_noref_force - sets skb dst, without taking reference
+ * @skb: buffer
+ * @dst: dst entry
+ *
+ * Sets skb dst, assuming a reference was not taken on dst.
+ * No reference is taken and no dst_release will be called. While for
+ * cached dsts deferred reclaim is a basic feature, for entries that are
+ * not cached it is caller's job to guarantee that last dst_release for
+ * provided dst happens when nobody uses it, eg. after a RCU grace period.
+ */
+static inline void skb_dst_set_noref_force(struct sk_buff *skb,
+					   struct dst_entry *dst)
+{
+	__skb_dst_set_noref(skb, dst, true);
+}
 
 /**
  * skb_dst_is_noref - Test if skb dst isn't refcounted
diff --git a/net/core/dst.c b/net/core/dst.c
index 35fd12f1a69c..df9cc810ec8e 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -320,27 +320,28 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
 EXPORT_SYMBOL(__dst_destroy_metrics_generic);
 
 /**
- * skb_dst_set_noref - sets skb dst, without a reference
+ * __skb_dst_set_noref - sets skb dst, without a reference
  * @skb: buffer
  * @dst: dst entry
+ * @force: if force is set, use noref version even for DST_NOCACHE entries
  *
  * Sets skb dst, assuming a reference was not taken on dst
  * skb_dst_drop() should not dst_release() this dst
  */
-void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
+void __skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst, bool force)
 {
 	WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
 	/* If dst not in cache, we must take a reference, because
 	 * dst_release() will destroy dst as soon as its refcount becomes zero
 	 */
-	if (unlikely(dst->flags & DST_NOCACHE)) {
+	if (unlikely((dst->flags & DST_NOCACHE) && !force)) {
 		dst_hold(dst);
 		skb_dst_set(skb, dst);
 	} else {
 		skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF;
 	}
 }
-EXPORT_SYMBOL(skb_dst_set_noref);
+EXPORT_SYMBOL(__skb_dst_set_noref);
 
 /* Dirty hack. We did it in 2.2 (in __dst_free),
  * we have _very_ good reasons not to repeat

From c90558dae51cef334f3d9d447cf7c0fd1bfe725d Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Thu, 21 Mar 2013 11:57:59 +0200
Subject: [PATCH 02/49] ipvs: avoid routing by TOS for real server

Avoid replacing the cached route for real server
on every packet with different TOS. I doubt that routing
by TOS for real server is used at all, so we should be
better with such optimization.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off by: Hans Schillstrom <hans@schillstrom.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h             |  1 -
 net/netfilter/ipvs/ip_vs_xmit.c | 58 ++++++++++++++-------------------
 2 files changed, 25 insertions(+), 34 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index bee87badabef..64db11769cc2 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -753,7 +753,6 @@ struct ip_vs_dest {
 	/* for destination cache */
 	spinlock_t		dst_lock;	/* lock of dst_cache */
 	struct dst_entry	*dst_cache;	/* destination cache entry */
-	u32			dst_rtos;	/* RT_TOS(tos) for dst */
 	u32			dst_cookie;
 	union nf_inet_addr	dst_saddr;
 
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index ee6b7a9f1ec2..4b0bd15ad7c1 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -57,27 +57,24 @@ enum {
  *      Destination cache to speed up outgoing route lookup
  */
 static inline void
-__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst,
-		u32 dst_cookie)
+__ip_vs_dst_set(struct ip_vs_dest *dest, struct dst_entry *dst, u32 dst_cookie)
 {
 	struct dst_entry *old_dst;
 
 	old_dst = dest->dst_cache;
 	dest->dst_cache = dst;
-	dest->dst_rtos = rtos;
 	dest->dst_cookie = dst_cookie;
 	dst_release(old_dst);
 }
 
 static inline struct dst_entry *
-__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
+__ip_vs_dst_check(struct ip_vs_dest *dest)
 {
 	struct dst_entry *dst = dest->dst_cache;
 
 	if (!dst)
 		return NULL;
-	if ((dst->obsolete || rtos != dest->dst_rtos) &&
-	    dst->ops->check(dst, dest->dst_cookie) == NULL) {
+	if (dst->obsolete && dst->ops->check(dst, dest->dst_cookie) == NULL) {
 		dest->dst_cache = NULL;
 		dst_release(dst);
 		return NULL;
@@ -104,7 +101,7 @@ __mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
 
 /* Get route to daddr, update *saddr, optionally bind route to saddr */
 static struct rtable *do_output_route4(struct net *net, __be32 daddr,
-				       u32 rtos, int rt_mode, __be32 *saddr)
+				       int rt_mode, __be32 *saddr)
 {
 	struct flowi4 fl4;
 	struct rtable *rt;
@@ -113,7 +110,6 @@ static struct rtable *do_output_route4(struct net *net, __be32 daddr,
 	memset(&fl4, 0, sizeof(fl4));
 	fl4.daddr = daddr;
 	fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0;
-	fl4.flowi4_tos = rtos;
 	fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ?
 			   FLOWI_FLAG_KNOWN_NH : 0;
 
@@ -124,7 +120,7 @@ retry:
 		if (PTR_ERR(rt) == -EINVAL && *saddr &&
 		    rt_mode & IP_VS_RT_MODE_CONNECT && !loop) {
 			*saddr = 0;
-			flowi4_update_output(&fl4, 0, rtos, daddr, 0);
+			flowi4_update_output(&fl4, 0, 0, daddr, 0);
 			goto retry;
 		}
 		IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr);
@@ -132,7 +128,7 @@ retry:
 	} else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
 		ip_rt_put(rt);
 		*saddr = fl4.saddr;
-		flowi4_update_output(&fl4, 0, rtos, daddr, fl4.saddr);
+		flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr);
 		loop++;
 		goto retry;
 	}
@@ -143,7 +139,7 @@ retry:
 /* Get route to destination or remote server */
 static struct rtable *
 __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
-		   __be32 daddr, u32 rtos, int rt_mode, __be32 *ret_saddr)
+		   __be32 daddr, int rt_mode, __be32 *ret_saddr)
 {
 	struct net *net = dev_net(skb_dst(skb)->dev);
 	struct rtable *rt;			/* Route to the other host */
@@ -152,19 +148,18 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
 
 	if (dest) {
 		spin_lock(&dest->dst_lock);
-		if (!(rt = (struct rtable *)
-		      __ip_vs_dst_check(dest, rtos))) {
-			rt = do_output_route4(net, dest->addr.ip, rtos,
-					      rt_mode, &dest->dst_saddr.ip);
+		rt = (struct rtable *) __ip_vs_dst_check(dest);
+		if (!rt) {
+			rt = do_output_route4(net, dest->addr.ip, rt_mode,
+					      &dest->dst_saddr.ip);
 			if (!rt) {
 				spin_unlock(&dest->dst_lock);
 				return NULL;
 			}
-			__ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0);
-			IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d, "
-				  "rtos=%X\n",
+			__ip_vs_dst_set(dest, dst_clone(&rt->dst), 0);
+			IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
 				  &dest->addr.ip, &dest->dst_saddr.ip,
-				  atomic_read(&rt->dst.__refcnt), rtos);
+				  atomic_read(&rt->dst.__refcnt));
 		}
 		daddr = dest->addr.ip;
 		if (ret_saddr)
@@ -177,7 +172,7 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
 		 * for performance reasons because we do not remember saddr
 		 */
 		rt_mode &= ~IP_VS_RT_MODE_CONNECT;
-		rt = do_output_route4(net, daddr, rtos, rt_mode, &saddr);
+		rt = do_output_route4(net, daddr, rt_mode, &saddr);
 		if (!rt)
 			return NULL;
 		if (ret_saddr)
@@ -307,7 +302,7 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
 
 	if (dest) {
 		spin_lock(&dest->dst_lock);
-		rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0);
+		rt = (struct rt6_info *)__ip_vs_dst_check(dest);
 		if (!rt) {
 			u32 cookie;
 
@@ -320,7 +315,7 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
 			}
 			rt = (struct rt6_info *) dst;
 			cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
-			__ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie);
+			__ip_vs_dst_set(dest, dst_clone(&rt->dst), cookie);
 			IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
 				  &dest->addr.in6, &dest->dst_saddr.in6,
 				  atomic_read(&rt->dst.__refcnt));
@@ -449,8 +444,9 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	EnterFunction(10);
 
-	if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr, RT_TOS(iph->tos),
-				      IP_VS_RT_MODE_NON_LOCAL, NULL)))
+	rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL,
+				NULL);
+	if (!rt)
 		goto tx_error_icmp;
 
 	/* MTU checking */
@@ -581,10 +577,9 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	}
 
 	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
-				      RT_TOS(iph->tos),
 				      IP_VS_RT_MODE_LOCAL |
-					IP_VS_RT_MODE_NON_LOCAL |
-					IP_VS_RT_MODE_RDR, NULL)))
+				      IP_VS_RT_MODE_NON_LOCAL |
+				      IP_VS_RT_MODE_RDR, NULL)))
 		goto tx_error_icmp;
 	local = rt->rt_flags & RTCF_LOCAL;
 	/*
@@ -832,10 +827,9 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	EnterFunction(10);
 
 	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
-				      RT_TOS(tos), IP_VS_RT_MODE_LOCAL |
-						   IP_VS_RT_MODE_NON_LOCAL |
-						   IP_VS_RT_MODE_CONNECT,
-						   &saddr)))
+				      IP_VS_RT_MODE_LOCAL |
+				      IP_VS_RT_MODE_NON_LOCAL |
+				      IP_VS_RT_MODE_CONNECT, &saddr)))
 		goto tx_error_icmp;
 	if (rt->rt_flags & RTCF_LOCAL) {
 		ip_rt_put(rt);
@@ -1067,7 +1061,6 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	EnterFunction(10);
 
 	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
-				      RT_TOS(iph->tos),
 				      IP_VS_RT_MODE_LOCAL |
 				      IP_VS_RT_MODE_NON_LOCAL |
 				      IP_VS_RT_MODE_KNOWN_NH, NULL)))
@@ -1223,7 +1216,6 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
 		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
 	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
-				      RT_TOS(ip_hdr(skb)->tos),
 				      rt_mode, NULL)))
 		goto tx_error_icmp;
 	local = rt->rt_flags & RTCF_LOCAL;

From 313eae637f0ce2a37fc1e591f5ac930ec7301b8f Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Thu, 21 Mar 2013 11:58:00 +0200
Subject: [PATCH 03/49] ipvs: prefer NETDEV_DOWN event to free cached dsts

The real server becomes unreachable on down event,
no need to wait device unregistration. Should help in
releasing dsts early before dst->dev is replaced with lo.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off by: Hans Schillstrom <hans@schillstrom.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_ctl.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 8104120f16a5..6b55ba69c800 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1514,10 +1514,8 @@ __ip_vs_dev_reset(struct ip_vs_dest *dest, struct net_device *dev)
 	spin_unlock_bh(&dest->dst_lock);
 
 }
-/*
- * Netdev event receiver
- * Currently only NETDEV_UNREGISTER is handled, i.e. if we hold a reference to
- * a device that is "unregister" it must be released.
+/* Netdev event receiver
+ * Currently only NETDEV_DOWN is handled to release refs to cached dsts
  */
 static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
 			    void *ptr)
@@ -1529,7 +1527,7 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
 	struct ip_vs_dest *dest;
 	unsigned int idx;
 
-	if (event != NETDEV_UNREGISTER || !ipvs)
+	if (event != NETDEV_DOWN || !ipvs)
 		return NOTIFY_DONE;
 	IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
 	EnterFunction(2);

From b8abdf098487fe56dfcbeda029bb662effd57ac5 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Thu, 21 Mar 2013 11:58:01 +0200
Subject: [PATCH 04/49] ipvs: convert the IP_VS_XMIT macros to functions

It was a bad idea to hide return statements in macros.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off by: Hans Schillstrom <hans@schillstrom.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_xmit.c | 130 +++++++++++++++++---------------
 1 file changed, 70 insertions(+), 60 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 4b0bd15ad7c1..7cd7c61692ac 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -376,45 +376,59 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
 	dest->dst_saddr.ip = 0;
 }
 
-#define IP_VS_XMIT_TUNNEL(skb, cp)				\
-({								\
-	int __ret = NF_ACCEPT;					\
-								\
-	(skb)->ipvs_property = 1;				\
-	if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT))		\
-		__ret = ip_vs_confirm_conntrack(skb);		\
-	if (__ret == NF_ACCEPT) {				\
-		nf_reset(skb);					\
-		skb_forward_csum(skb);				\
-	}							\
-	__ret;							\
-})
+/* return NF_ACCEPT to allow forwarding or other NF_xxx on error */
+static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
+					    struct ip_vs_conn *cp)
+{
+	int ret = NF_ACCEPT;
 
-#define IP_VS_XMIT_NAT(pf, skb, cp, local)		\
-do {							\
-	(skb)->ipvs_property = 1;			\
-	if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT)))	\
-		ip_vs_notrack(skb);			\
-	else						\
-		ip_vs_update_conntrack(skb, cp, 1);	\
-	if (local)					\
-		return NF_ACCEPT;			\
-	skb_forward_csum(skb);				\
-	NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL,	\
-		skb_dst(skb)->dev, dst_output);		\
-} while (0)
+	skb->ipvs_property = 1;
+	if (unlikely(cp->flags & IP_VS_CONN_F_NFCT))
+		ret = ip_vs_confirm_conntrack(skb);
+	if (ret == NF_ACCEPT) {
+		nf_reset(skb);
+		skb_forward_csum(skb);
+	}
+	return ret;
+}
 
-#define IP_VS_XMIT(pf, skb, cp, local)			\
-do {							\
-	(skb)->ipvs_property = 1;			\
-	if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT)))	\
-		ip_vs_notrack(skb);			\
-	if (local)					\
-		return NF_ACCEPT;			\
-	skb_forward_csum(skb);				\
-	NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL,	\
-		skb_dst(skb)->dev, dst_output);		\
-} while (0)
+/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
+static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
+					 struct ip_vs_conn *cp, int local)
+{
+	int ret = NF_STOLEN;
+
+	skb->ipvs_property = 1;
+	if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
+		ip_vs_notrack(skb);
+	else
+		ip_vs_update_conntrack(skb, cp, 1);
+	if (!local) {
+		skb_forward_csum(skb);
+		NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
+			dst_output);
+	} else
+		ret = NF_ACCEPT;
+	return ret;
+}
+
+/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
+static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
+				     struct ip_vs_conn *cp, int local)
+{
+	int ret = NF_STOLEN;
+
+	skb->ipvs_property = 1;
+	if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
+		ip_vs_notrack(skb);
+	if (!local) {
+		skb_forward_csum(skb);
+		NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
+			dst_output);
+	} else
+		ret = NF_ACCEPT;
+	return ret;
+}
 
 
 /*
@@ -425,7 +439,7 @@ ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
 {
 	/* we do not touch skb and do not need pskb ptr */
-	IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+	return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
 }
 
 
@@ -476,7 +490,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
+	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -537,7 +551,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
+	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -562,7 +576,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	struct rtable *rt;		/* Route to the other host */
 	int mtu;
 	struct iphdr *iph = ip_hdr(skb);
-	int local;
+	int local, rc;
 
 	EnterFunction(10);
 
@@ -655,10 +669,10 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
+	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
 
 	LeaveFunction(10);
-	return NF_STOLEN;
+	return rc;
 
   tx_error_icmp:
 	dst_link_failure(skb);
@@ -678,7 +692,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 {
 	struct rt6_info *rt;		/* Route to the other host */
 	int mtu;
-	int local;
+	int local, rc;
 
 	EnterFunction(10);
 
@@ -771,10 +785,10 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
+	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
 
 	LeaveFunction(10);
-	return NF_STOLEN;
+	return rc;
 
 tx_error_icmp:
 	dst_link_failure(skb);
@@ -833,7 +847,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		goto tx_error_icmp;
 	if (rt->rt_flags & RTCF_LOCAL) {
 		ip_rt_put(rt);
-		IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
 	}
 
 	tdev = rt->dst.dev;
@@ -905,7 +919,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	ret = IP_VS_XMIT_TUNNEL(skb, cp);
+	ret = ip_vs_tunnel_xmit_prepare(skb, cp);
 	if (ret == NF_ACCEPT)
 		ip_local_out(skb);
 	else if (ret == NF_DROP)
@@ -948,7 +962,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		goto tx_error_icmp;
 	if (__ip_vs_is_local_route6(rt)) {
 		dst_release(&rt->dst);
-		IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
+		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
 	}
 
 	tdev = rt->dst.dev;
@@ -1023,7 +1037,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	ret = IP_VS_XMIT_TUNNEL(skb, cp);
+	ret = ip_vs_tunnel_xmit_prepare(skb, cp);
 	if (ret == NF_ACCEPT)
 		ip6_local_out(skb);
 	else if (ret == NF_DROP)
@@ -1067,7 +1081,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		goto tx_error_icmp;
 	if (rt->rt_flags & RTCF_LOCAL) {
 		ip_rt_put(rt);
-		IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
 	}
 
 	/* MTU checking */
@@ -1097,7 +1111,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
+	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -1126,7 +1140,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		goto tx_error_icmp;
 	if (__ip_vs_is_local_route6(rt)) {
 		dst_release(&rt->dst);
-		IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
+		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
 	}
 
 	/* MTU checking */
@@ -1162,7 +1176,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
+	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -1283,9 +1297,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
-
-	rc = NF_STOLEN;
+	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
 	goto out;
 
   tx_error_icmp:
@@ -1404,9 +1416,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
-
-	rc = NF_STOLEN;
+	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
 	goto out;
 
 tx_error_icmp:

From d1deae4d3ab37d833f278fec975a8f2ddeb78f3b Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Thu, 21 Mar 2013 11:58:02 +0200
Subject: [PATCH 05/49] ipvs: rename functions related to dst_cache reset

Move and give better names to two functions:

- ip_vs_dst_reset to __ip_vs_dst_cache_reset
- __ip_vs_dev_reset to ip_vs_forget_dev

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off by: Hans Schillstrom <hans@schillstrom.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h             |  1 -
 net/netfilter/ipvs/ip_vs_ctl.c  | 34 +++++++++++++++++++++------------
 net/netfilter/ipvs/ip_vs_xmit.c | 14 --------------
 3 files changed, 22 insertions(+), 27 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 64db11769cc2..8ad73a834655 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1427,7 +1427,6 @@ extern int ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 extern int ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 			   struct ip_vs_protocol *pp, int offset,
 			   unsigned int hooknum, struct ip_vs_iphdr *iph);
-extern void ip_vs_dst_reset(struct ip_vs_dest *dest);
 
 #ifdef CONFIG_IP_VS_IPV6
 extern int ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 6b55ba69c800..5265eaa7b42b 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -641,6 +641,17 @@ struct ip_vs_dest *ip_vs_find_dest(struct net  *net, int af,
 	return dest;
 }
 
+/* Release dst_cache for dest in user context */
+static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest)
+{
+	struct dst_entry *old_dst;
+
+	old_dst = dest->dst_cache;
+	dest->dst_cache = NULL;
+	dst_release(old_dst);
+	dest->dst_saddr.ip = 0;
+}
+
 /*
  *  Lookup dest by {svc,addr,port} in the destination trash.
  *  The destination trash is used to hold the destinations that are removed
@@ -690,7 +701,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
 				      IP_VS_DBG_ADDR(svc->af, &dest->addr),
 				      ntohs(dest->port));
 			list_del(&dest->n_list);
-			ip_vs_dst_reset(dest);
+			__ip_vs_dst_cache_reset(dest);
 			__ip_vs_unbind_svc(dest);
 			free_percpu(dest->stats.cpustats);
 			kfree(dest);
@@ -717,7 +728,7 @@ static void ip_vs_trash_cleanup(struct net *net)
 
 	list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
 		list_del(&dest->n_list);
-		ip_vs_dst_reset(dest);
+		__ip_vs_dst_cache_reset(dest);
 		__ip_vs_unbind_svc(dest);
 		free_percpu(dest->stats.cpustats);
 		kfree(dest);
@@ -811,7 +822,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
 	dest->l_threshold = udest->l_threshold;
 
 	spin_lock_bh(&dest->dst_lock);
-	ip_vs_dst_reset(dest);
+	__ip_vs_dst_cache_reset(dest);
 	spin_unlock_bh(&dest->dst_lock);
 
 	if (add)
@@ -1037,7 +1048,7 @@ static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
 			      dest->vfwmark,
 			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
 			      ntohs(dest->port));
-		ip_vs_dst_reset(dest);
+		__ip_vs_dst_cache_reset(dest);
 		/* simply decrease svc->refcnt here, let the caller check
 		   and release the service if nobody refers to it.
 		   Only user context can release destination and service,
@@ -1496,11 +1507,10 @@ void ip_vs_service_net_cleanup(struct net *net)
 	mutex_unlock(&__ip_vs_mutex);
 	LeaveFunction(2);
 }
-/*
- * Release dst hold by dst_cache
- */
+
+/* Put all references for device (dst_cache) */
 static inline void
-__ip_vs_dev_reset(struct ip_vs_dest *dest, struct net_device *dev)
+ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev)
 {
 	spin_lock_bh(&dest->dst_lock);
 	if (dest->dst_cache && dest->dst_cache->dev == dev) {
@@ -1509,7 +1519,7 @@ __ip_vs_dev_reset(struct ip_vs_dest *dest, struct net_device *dev)
 			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
 			      ntohs(dest->port),
 			      atomic_read(&dest->refcnt));
-		ip_vs_dst_reset(dest);
+		__ip_vs_dst_cache_reset(dest);
 	}
 	spin_unlock_bh(&dest->dst_lock);
 
@@ -1537,7 +1547,7 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
 			if (net_eq(svc->net, net)) {
 				list_for_each_entry(dest, &svc->destinations,
 						    n_list) {
-					__ip_vs_dev_reset(dest, dev);
+					ip_vs_forget_dev(dest, dev);
 				}
 			}
 		}
@@ -1546,7 +1556,7 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
 			if (net_eq(svc->net, net)) {
 				list_for_each_entry(dest, &svc->destinations,
 						    n_list) {
-					__ip_vs_dev_reset(dest, dev);
+					ip_vs_forget_dev(dest, dev);
 				}
 			}
 
@@ -1554,7 +1564,7 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
 	}
 
 	list_for_each_entry(dest, &ipvs->dest_trash, n_list) {
-		__ip_vs_dev_reset(dest, dev);
+		ip_vs_forget_dev(dest, dev);
 	}
 	mutex_unlock(&__ip_vs_mutex);
 	LeaveFunction(2);
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 7cd7c61692ac..6448a2e862dc 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -362,20 +362,6 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
 #endif
 
 
-/*
- *	Release dest->dst_cache before a dest is removed
- */
-void
-ip_vs_dst_reset(struct ip_vs_dest *dest)
-{
-	struct dst_entry *old_dst;
-
-	old_dst = dest->dst_cache;
-	dest->dst_cache = NULL;
-	dst_release(old_dst);
-	dest->dst_saddr.ip = 0;
-}
-
 /* return NF_ACCEPT to allow forwarding or other NF_xxx on error */
 static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
 					    struct ip_vs_conn *cp)

From 183dce554ac79302fbeb86d8419440ed7021b8a0 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Thu, 21 Mar 2013 11:58:03 +0200
Subject: [PATCH 06/49] ipvs: no need to reroute anymore on DNAT over loopback

After commit 70e7341673 (ipv4: Show that ip_send_reply()
is purely unicast routine.) we do not need to reroute DNAT-ed
traffic over loopback because reply uses iph daddr and not
rt_spec_dst.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off by: Hans Schillstrom <hans@schillstrom.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_xmit.c | 58 ++-------------------------------
 1 file changed, 2 insertions(+), 56 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 6448a2e862dc..c942d36e38a5 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -207,44 +207,6 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
 	return rt;
 }
 
-/* Reroute packet to local IPv4 stack after DNAT */
-static int
-__ip_vs_reroute_locally(struct sk_buff *skb)
-{
-	struct rtable *rt = skb_rtable(skb);
-	struct net_device *dev = rt->dst.dev;
-	struct net *net = dev_net(dev);
-	struct iphdr *iph = ip_hdr(skb);
-
-	if (rt_is_input_route(rt)) {
-		unsigned long orefdst = skb->_skb_refdst;
-
-		if (ip_route_input(skb, iph->daddr, iph->saddr,
-				   iph->tos, skb->dev))
-			return 0;
-		refdst_drop(orefdst);
-	} else {
-		struct flowi4 fl4 = {
-			.daddr = iph->daddr,
-			.saddr = iph->saddr,
-			.flowi4_tos = RT_TOS(iph->tos),
-			.flowi4_mark = skb->mark,
-		};
-
-		rt = ip_route_output_key(net, &fl4);
-		if (IS_ERR(rt))
-			return 0;
-		if (!(rt->rt_flags & RTCF_LOCAL)) {
-			ip_rt_put(rt);
-			return 0;
-		}
-		/* Drop old route. */
-		skb_dst_drop(skb);
-		skb_dst_set(skb, &rt->dst);
-	}
-	return 1;
-}
-
 #ifdef CONFIG_IP_VS_IPV6
 
 static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
@@ -635,16 +597,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		/* drop old route */
 		skb_dst_drop(skb);
 		skb_dst_set(skb, &rt->dst);
-	} else {
+	} else
 		ip_rt_put(rt);
-		/*
-		 * Some IPv4 replies get local address from routes,
-		 * not from iph, so while we DNAT after routing
-		 * we need this second input/output route.
-		 */
-		if (!__ip_vs_reroute_locally(skb))
-			goto tx_error;
-	}
 
 	IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT");
 
@@ -1269,16 +1223,8 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		/* drop the old route when skb is not shared */
 		skb_dst_drop(skb);
 		skb_dst_set(skb, &rt->dst);
-	} else {
+	} else
 		ip_rt_put(rt);
-		/*
-		 * Some IPv4 replies get local address from routes,
-		 * not from iph, so while we DNAT after routing
-		 * we need this second input/output route.
-		 */
-		if (!__ip_vs_reroute_locally(skb))
-			goto tx_error;
-	}
 
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;

From f11cb2c2aa1ba28091910eaecaa40906ec31101a Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Thu, 21 Mar 2013 11:58:04 +0200
Subject: [PATCH 07/49] ipvs: do not use skb_share_check

We run in contexts like ip_rcv, ipv6_rcv, br_handle_frame,
do not expect shared skbs.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off by: Hans Schillstrom <hans@schillstrom.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_xmit.c | 45 ++-------------------------------
 1 file changed, 2 insertions(+), 43 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index c942d36e38a5..e6e2d3f50091 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -421,14 +421,6 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		goto tx_error;
 	}
 
-	/*
-	 * Call ip_send_check because we are not sure it is called
-	 * after ip_defrag. Is copy-on-write needed?
-	 */
-	if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
-		ip_rt_put(rt);
-		return NF_STOLEN;
-	}
 	ip_send_check(ip_hdr(skb));
 
 	/* drop old route */
@@ -482,16 +474,6 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		goto tx_error;
 	}
 
-	/*
-	 * Call ip_send_check because we are not sure it is called
-	 * after ip_defrag. Is copy-on-write needed?
-	 */
-	skb = skb_share_check(skb, GFP_ATOMIC);
-	if (unlikely(skb == NULL)) {
-		dst_release(&rt->dst);
-		return NF_STOLEN;
-	}
-
 	/* drop old route */
 	skb_dst_drop(skb);
 	skb_dst_set(skb, &rt->dst);
@@ -708,7 +690,6 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	ipv6_hdr(skb)->daddr = cp->daddr.in6;
 
 	if (!local || !skb->dev) {
-		/* drop the old route when skb is not shared */
 		skb_dst_drop(skb);
 		skb_dst_set(skb, &rt->dst);
 	} else {
@@ -814,8 +795,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	 */
 	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
 
-	if (skb_headroom(skb) < max_headroom
-	    || skb_cloned(skb) || skb_shared(skb)) {
+	if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
 		struct sk_buff *new_skb =
 			skb_realloc_headroom(skb, max_headroom);
 		if (!new_skb) {
@@ -935,8 +915,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	 */
 	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
 
-	if (skb_headroom(skb) < max_headroom
-	    || skb_cloned(skb) || skb_shared(skb)) {
+	if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
 		struct sk_buff *new_skb =
 			skb_realloc_headroom(skb, max_headroom);
 		if (!new_skb) {
@@ -1034,14 +1013,6 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		goto tx_error;
 	}
 
-	/*
-	 * Call ip_send_check because we are not sure it is called
-	 * after ip_defrag. Is copy-on-write needed?
-	 */
-	if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
-		ip_rt_put(rt);
-		return NF_STOLEN;
-	}
 	ip_send_check(ip_hdr(skb));
 
 	/* drop old route */
@@ -1099,16 +1070,6 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		goto tx_error;
 	}
 
-	/*
-	 * Call ip_send_check because we are not sure it is called
-	 * after ip_defrag. Is copy-on-write needed?
-	 */
-	skb = skb_share_check(skb, GFP_ATOMIC);
-	if (unlikely(skb == NULL)) {
-		dst_release(&rt->dst);
-		return NF_STOLEN;
-	}
-
 	/* drop old route */
 	skb_dst_drop(skb);
 	skb_dst_set(skb, &rt->dst);
@@ -1220,7 +1181,6 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	ip_vs_nat_icmp(skb, pp, cp, 0);
 
 	if (!local) {
-		/* drop the old route when skb is not shared */
 		skb_dst_drop(skb);
 		skb_dst_set(skb, &rt->dst);
 	} else
@@ -1337,7 +1297,6 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	ip_vs_nat_icmp_v6(skb, pp, cp, 0);
 
 	if (!local || !skb->dev) {
-		/* drop the old route when skb is not shared */
 		skb_dst_drop(skb);
 		skb_dst_set(skb, &rt->dst);
 	} else {

From 4115ded131645a7311a5d9afa9ff2b263dd91c2d Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Thu, 21 Mar 2013 11:58:05 +0200
Subject: [PATCH 08/49] ipvs: consolidate all dst checks on transmit in one
 place

Consolidate the PMTU checks, ICMP sending and
skb_dst modification in __ip_vs_get_out_rt and
__ip_vs_get_out_rt_v6. Now skb_dst is changed early
to simplify the transmitters.

Make sure update_pmtu is called only for local clients.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off by: Hans Schillstrom <hans@schillstrom.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_xmit.c | 631 ++++++++++++--------------------
 1 file changed, 237 insertions(+), 394 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index e6e2d3f50091..603eb8a50e19 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -51,6 +51,7 @@ enum {
 				      */
 	IP_VS_RT_MODE_CONNECT	= 8, /* Always bind route to saddr */
 	IP_VS_RT_MODE_KNOWN_NH	= 16,/* Route via remote addr */
+	IP_VS_RT_MODE_TUNNEL	= 32,/* Tunnel mode */
 };
 
 /*
@@ -137,13 +138,17 @@ retry:
 }
 
 /* Get route to destination or remote server */
-static struct rtable *
+static int
 __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
 		   __be32 daddr, int rt_mode, __be32 *ret_saddr)
 {
 	struct net *net = dev_net(skb_dst(skb)->dev);
+	struct netns_ipvs *ipvs = net_ipvs(net);
 	struct rtable *rt;			/* Route to the other host */
 	struct rtable *ort;			/* Original route */
+	struct iphdr *iph;
+	__be16 df;
+	int mtu;
 	int local;
 
 	if (dest) {
@@ -154,7 +159,7 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
 					      &dest->dst_saddr.ip);
 			if (!rt) {
 				spin_unlock(&dest->dst_lock);
-				return NULL;
+				goto err_unreach;
 			}
 			__ip_vs_dst_set(dest, dst_clone(&rt->dst), 0);
 			IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
@@ -174,37 +179,78 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
 		rt_mode &= ~IP_VS_RT_MODE_CONNECT;
 		rt = do_output_route4(net, daddr, rt_mode, &saddr);
 		if (!rt)
-			return NULL;
+			goto err_unreach;
 		if (ret_saddr)
 			*ret_saddr = saddr;
 	}
 
-	local = rt->rt_flags & RTCF_LOCAL;
+	local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0;
 	if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) &
 	      rt_mode)) {
 		IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n",
 			     (rt->rt_flags & RTCF_LOCAL) ?
 			     "local":"non-local", &daddr);
-		ip_rt_put(rt);
-		return NULL;
+		goto err_put;
 	}
-	if (local && !(rt_mode & IP_VS_RT_MODE_RDR) &&
-	    !((ort = skb_rtable(skb)) && ort->rt_flags & RTCF_LOCAL)) {
-		IP_VS_DBG_RL("Redirect from non-local address %pI4 to local "
-			     "requires NAT method, dest: %pI4\n",
-			     &ip_hdr(skb)->daddr, &daddr);
+	iph = ip_hdr(skb);
+	if (likely(!local)) {
+		if (unlikely(ipv4_is_loopback(iph->saddr))) {
+			IP_VS_DBG_RL("Stopping traffic from loopback address "
+				     "%pI4 to non-local address, dest: %pI4\n",
+				     &iph->saddr, &daddr);
+			goto err_put;
+		}
+	} else {
+		ort = skb_rtable(skb);
+		if (!(rt_mode & IP_VS_RT_MODE_RDR) &&
+		    !(ort->rt_flags & RTCF_LOCAL)) {
+			IP_VS_DBG_RL("Redirect from non-local address %pI4 to "
+				     "local requires NAT method, dest: %pI4\n",
+				     &iph->daddr, &daddr);
+			goto err_put;
+		}
+		/* skb to local stack, preserve old route */
 		ip_rt_put(rt);
-		return NULL;
-	}
-	if (unlikely(!local && ipv4_is_loopback(ip_hdr(skb)->saddr))) {
-		IP_VS_DBG_RL("Stopping traffic from loopback address %pI4 "
-			     "to non-local address, dest: %pI4\n",
-			     &ip_hdr(skb)->saddr, &daddr);
-		ip_rt_put(rt);
-		return NULL;
+		return local;
 	}
 
-	return rt;
+	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) {
+		mtu = dst_mtu(&rt->dst);
+		df = iph->frag_off & htons(IP_DF);
+	} else {
+		struct sock *sk = skb->sk;
+
+		mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
+		if (mtu < 68) {
+			IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
+			goto err_put;
+		}
+		ort = skb_rtable(skb);
+		if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT)
+			ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu);
+		/* MTU check allowed? */
+		df = sysctl_pmtu_disc(ipvs) ? iph->frag_off & htons(IP_DF) : 0;
+	}
+
+	/* MTU checking */
+	if (unlikely(df && skb->len > mtu && !skb_is_gso(skb))) {
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+		IP_VS_DBG(1, "frag needed for %pI4\n", &iph->saddr);
+		goto err_put;
+	}
+
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+
+	return local;
+
+err_put:
+	ip_rt_put(rt);
+	return -1;
+
+err_unreach:
+	dst_link_failure(skb);
+	return -1;
 }
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -251,15 +297,16 @@ out_err:
 /*
  * Get route to destination or remote server
  */
-static struct rt6_info *
+static int
 __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
 		      struct in6_addr *daddr, struct in6_addr *ret_saddr,
-		      int do_xfrm, int rt_mode)
+		      struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode)
 {
 	struct net *net = dev_net(skb_dst(skb)->dev);
 	struct rt6_info *rt;			/* Route to the other host */
 	struct rt6_info *ort;			/* Original route */
 	struct dst_entry *dst;
+	int mtu;
 	int local;
 
 	if (dest) {
@@ -273,7 +320,7 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
 						      do_xfrm);
 			if (!dst) {
 				spin_unlock(&dest->dst_lock);
-				return NULL;
+				goto err_unreach;
 			}
 			rt = (struct rt6_info *) dst;
 			cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
@@ -288,7 +335,7 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
 	} else {
 		dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm);
 		if (!dst)
-			return NULL;
+			goto err_unreach;
 		rt = (struct rt6_info *) dst;
 	}
 
@@ -297,29 +344,72 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
 	      rt_mode)) {
 		IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6c\n",
 			     local ? "local":"non-local", daddr);
-		dst_release(&rt->dst);
-		return NULL;
+		goto err_put;
 	}
-	if (local && !(rt_mode & IP_VS_RT_MODE_RDR) &&
-	    !((ort = (struct rt6_info *) skb_dst(skb)) &&
-	      __ip_vs_is_local_route6(ort))) {
-		IP_VS_DBG_RL("Redirect from non-local address %pI6c to local "
-			     "requires NAT method, dest: %pI6c\n",
-			     &ipv6_hdr(skb)->daddr, daddr);
+	if (likely(!local)) {
+		if (unlikely((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
+			     ipv6_addr_type(&ipv6_hdr(skb)->saddr) &
+					    IPV6_ADDR_LOOPBACK)) {
+			IP_VS_DBG_RL("Stopping traffic from loopback address "
+				     "%pI6c to non-local address, "
+				     "dest: %pI6c\n",
+				     &ipv6_hdr(skb)->saddr, daddr);
+			goto err_put;
+		}
+	} else {
+		ort = (struct rt6_info *) skb_dst(skb);
+		if (!(rt_mode & IP_VS_RT_MODE_RDR) &&
+		    !__ip_vs_is_local_route6(ort)) {
+			IP_VS_DBG_RL("Redirect from non-local address %pI6c "
+				     "to local requires NAT method, "
+				     "dest: %pI6c\n",
+				     &ipv6_hdr(skb)->daddr, daddr);
+			goto err_put;
+		}
+		/* skb to local stack, preserve old route */
 		dst_release(&rt->dst);
-		return NULL;
-	}
-	if (unlikely(!local && (!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
-		     ipv6_addr_type(&ipv6_hdr(skb)->saddr) &
-				    IPV6_ADDR_LOOPBACK)) {
-		IP_VS_DBG_RL("Stopping traffic from loopback address %pI6c "
-			     "to non-local address, dest: %pI6c\n",
-			     &ipv6_hdr(skb)->saddr, daddr);
-		dst_release(&rt->dst);
-		return NULL;
+		return local;
 	}
 
-	return rt;
+	/* MTU checking */
+	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL)))
+		mtu = dst_mtu(&rt->dst);
+	else {
+		struct sock *sk = skb->sk;
+
+		mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
+		if (mtu < IPV6_MIN_MTU) {
+			IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
+				     IPV6_MIN_MTU);
+			goto err_put;
+		}
+		ort = (struct rt6_info *) skb_dst(skb);
+		if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT)
+			ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu);
+	}
+
+	if (unlikely(__mtu_check_toobig_v6(skb, mtu))) {
+		if (!skb->dev)
+			skb->dev = net->loopback_dev;
+		/* only send ICMP too big on first fragment */
+		if (!ipvsh->fragoffs)
+			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+		IP_VS_DBG(1, "frag needed for %pI6c\n", &ipv6_hdr(skb)->saddr);
+		goto err_put;
+	}
+
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+
+	return local;
+
+err_put:
+	dst_release(&rt->dst);
+	return -1;
+
+err_unreach:
+	dst_link_failure(skb);
+	return -1;
 }
 #endif
 
@@ -400,32 +490,15 @@ int
 ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
 {
-	struct rtable *rt;			/* Route to the other host */
 	struct iphdr  *iph = ip_hdr(skb);
-	int    mtu;
 
 	EnterFunction(10);
 
-	rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL,
-				NULL);
-	if (!rt)
-		goto tx_error_icmp;
-
-	/* MTU checking */
-	mtu = dst_mtu(&rt->dst);
-	if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) &&
-	    !skb_is_gso(skb)) {
-		ip_rt_put(rt);
-		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
-		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+	if (__ip_vs_get_out_rt(skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL,
+			       NULL) < 0)
 		goto tx_error;
-	}
 
-	ip_send_check(ip_hdr(skb));
-
-	/* drop old route */
-	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->dst);
+	ip_send_check(iph);
 
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
@@ -435,8 +508,6 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	LeaveFunction(10);
 	return NF_STOLEN;
 
- tx_error_icmp:
-	dst_link_failure(skb);
  tx_error:
 	kfree_skb(skb);
 	LeaveFunction(10);
@@ -446,37 +517,13 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 #ifdef CONFIG_IP_VS_IPV6
 int
 ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
-		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph)
+		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
 {
-	struct rt6_info *rt;			/* Route to the other host */
-	int    mtu;
-
 	EnterFunction(10);
 
-	rt = __ip_vs_get_out_rt_v6(skb, NULL, &iph->daddr.in6, NULL, 0,
-				   IP_VS_RT_MODE_NON_LOCAL);
-	if (!rt)
-		goto tx_error_icmp;
-
-	/* MTU checking */
-	mtu = dst_mtu(&rt->dst);
-	if (__mtu_check_toobig_v6(skb, mtu)) {
-		if (!skb->dev) {
-			struct net *net = dev_net(skb_dst(skb)->dev);
-
-			skb->dev = net->loopback_dev;
-		}
-		/* only send ICMP too big on first fragment */
-		if (!iph->fragoffs)
-			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
-		dst_release(&rt->dst);
-		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+	if (__ip_vs_get_out_rt_v6(skb, NULL, &ipvsh->daddr.in6, NULL,
+				  ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0)
 		goto tx_error;
-	}
-
-	/* drop old route */
-	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->dst);
 
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
@@ -486,8 +533,6 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	LeaveFunction(10);
 	return NF_STOLEN;
 
- tx_error_icmp:
-	dst_link_failure(skb);
  tx_error:
 	kfree_skb(skb);
 	LeaveFunction(10);
@@ -504,28 +549,29 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	       struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
 {
 	struct rtable *rt;		/* Route to the other host */
-	int mtu;
-	struct iphdr *iph = ip_hdr(skb);
-	int local, rc;
+	int local, rc, was_input;
 
 	EnterFunction(10);
 
 	/* check if it is a connection of no-client-port */
 	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
 		__be16 _pt, *p;
-		p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
+
+		p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
 		if (p == NULL)
 			goto tx_error;
 		ip_vs_conn_fill_cport(cp, *p);
 		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
 	}
 
-	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
-				      IP_VS_RT_MODE_LOCAL |
-				      IP_VS_RT_MODE_NON_LOCAL |
-				      IP_VS_RT_MODE_RDR, NULL)))
-		goto tx_error_icmp;
-	local = rt->rt_flags & RTCF_LOCAL;
+	was_input = rt_is_input_route(skb_rtable(skb));
+	local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+				   IP_VS_RT_MODE_LOCAL |
+				   IP_VS_RT_MODE_NON_LOCAL |
+				   IP_VS_RT_MODE_RDR, NULL);
+	if (local < 0)
+		goto tx_error;
+	rt = skb_rtable(skb);
 	/*
 	 * Avoid duplicate tuple in reply direction for NAT traffic
 	 * to local address when connection is sync-ed
@@ -539,49 +585,31 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 			IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0,
 					 "ip_vs_nat_xmit(): "
 					 "stopping DNAT to local address");
-			goto tx_error_put;
+			goto tx_error;
 		}
 	}
 #endif
 
 	/* From world but DNAT to loopback address? */
-	if (local && ipv4_is_loopback(cp->daddr.ip) &&
-	    rt_is_input_route(skb_rtable(skb))) {
+	if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
 		IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): "
 				 "stopping DNAT to loopback address");
-		goto tx_error_put;
-	}
-
-	/* MTU checking */
-	mtu = dst_mtu(&rt->dst);
-	if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) &&
-	    !skb_is_gso(skb)) {
-		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
-		IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0,
-				 "ip_vs_nat_xmit(): frag needed for");
-		goto tx_error_put;
+		goto tx_error;
 	}
 
 	/* copy-on-write the packet before mangling it */
 	if (!skb_make_writable(skb, sizeof(struct iphdr)))
-		goto tx_error_put;
+		goto tx_error;
 
 	if (skb_cow(skb, rt->dst.dev->hard_header_len))
-		goto tx_error_put;
+		goto tx_error;
 
 	/* mangle the packet */
 	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
-		goto tx_error_put;
+		goto tx_error;
 	ip_hdr(skb)->daddr = cp->daddr.ip;
 	ip_send_check(ip_hdr(skb));
 
-	if (!local) {
-		/* drop old route */
-		skb_dst_drop(skb);
-		skb_dst_set(skb, &rt->dst);
-	} else
-		ip_rt_put(rt);
-
 	IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT");
 
 	/* FIXME: when application helper enlarges the packet and the length
@@ -596,44 +624,40 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	LeaveFunction(10);
 	return rc;
 
-  tx_error_icmp:
-	dst_link_failure(skb);
   tx_error:
 	kfree_skb(skb);
 	LeaveFunction(10);
 	return NF_STOLEN;
-  tx_error_put:
-	ip_rt_put(rt);
-	goto tx_error;
 }
 
 #ifdef CONFIG_IP_VS_IPV6
 int
 ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
-		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph)
+		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
 {
 	struct rt6_info *rt;		/* Route to the other host */
-	int mtu;
 	int local, rc;
 
 	EnterFunction(10);
 
 	/* check if it is a connection of no-client-port */
-	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !iph->fragoffs)) {
+	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) {
 		__be16 _pt, *p;
-		p = skb_header_pointer(skb, iph->len, sizeof(_pt), &_pt);
+		p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
 		if (p == NULL)
 			goto tx_error;
 		ip_vs_conn_fill_cport(cp, *p);
 		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
 	}
 
-	if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
-					 0, (IP_VS_RT_MODE_LOCAL |
-					     IP_VS_RT_MODE_NON_LOCAL |
-					     IP_VS_RT_MODE_RDR))))
-		goto tx_error_icmp;
-	local = __ip_vs_is_local_route6(rt);
+	local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+				      ipvsh, 0,
+				      IP_VS_RT_MODE_LOCAL |
+				      IP_VS_RT_MODE_NON_LOCAL |
+				      IP_VS_RT_MODE_RDR);
+	if (local < 0)
+		goto tx_error;
+	rt = (struct rt6_info *) skb_dst(skb);
 	/*
 	 * Avoid duplicate tuple in reply direction for NAT traffic
 	 * to local address when connection is sync-ed
@@ -647,7 +671,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 			IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0,
 					 "ip_vs_nat_xmit_v6(): "
 					 "stopping DNAT to local address");
-			goto tx_error_put;
+			goto tx_error;
 		}
 	}
 #endif
@@ -658,45 +682,21 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0,
 				 "ip_vs_nat_xmit_v6(): "
 				 "stopping DNAT to loopback address");
-		goto tx_error_put;
-	}
-
-	/* MTU checking */
-	mtu = dst_mtu(&rt->dst);
-	if (__mtu_check_toobig_v6(skb, mtu)) {
-		if (!skb->dev) {
-			struct net *net = dev_net(skb_dst(skb)->dev);
-
-			skb->dev = net->loopback_dev;
-		}
-		/* only send ICMP too big on first fragment */
-		if (!iph->fragoffs)
-			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
-		IP_VS_DBG_RL_PKT(0, AF_INET6, pp, skb, 0,
-				 "ip_vs_nat_xmit_v6(): frag needed for");
-		goto tx_error_put;
+		goto tx_error;
 	}
 
 	/* copy-on-write the packet before mangling it */
 	if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
-		goto tx_error_put;
+		goto tx_error;
 
 	if (skb_cow(skb, rt->dst.dev->hard_header_len))
-		goto tx_error_put;
+		goto tx_error;
 
 	/* mangle the packet */
-	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, iph))
+	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
 		goto tx_error;
 	ipv6_hdr(skb)->daddr = cp->daddr.in6;
 
-	if (!local || !skb->dev) {
-		skb_dst_drop(skb);
-		skb_dst_set(skb, &rt->dst);
-	} else {
-		/* destined to loopback, do we need to change route? */
-		dst_release(&rt->dst);
-	}
-
 	IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT");
 
 	/* FIXME: when application helper enlarges the packet and the length
@@ -711,15 +711,10 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	LeaveFunction(10);
 	return rc;
 
-tx_error_icmp:
-	dst_link_failure(skb);
 tx_error:
 	LeaveFunction(10);
 	kfree_skb(skb);
 	return NF_STOLEN;
-tx_error_put:
-	dst_release(&rt->dst);
-	goto tx_error;
 }
 #endif
 
@@ -756,40 +751,26 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	__be16 df;
 	struct iphdr  *iph;			/* Our new IP header */
 	unsigned int max_headroom;		/* The extra header space needed */
-	int    mtu;
-	int ret;
+	int ret, local;
 
 	EnterFunction(10);
 
-	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
-				      IP_VS_RT_MODE_LOCAL |
-				      IP_VS_RT_MODE_NON_LOCAL |
-				      IP_VS_RT_MODE_CONNECT, &saddr)))
-		goto tx_error_icmp;
-	if (rt->rt_flags & RTCF_LOCAL) {
-		ip_rt_put(rt);
+	local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+				   IP_VS_RT_MODE_LOCAL |
+				   IP_VS_RT_MODE_NON_LOCAL |
+				   IP_VS_RT_MODE_CONNECT |
+				   IP_VS_RT_MODE_TUNNEL, &saddr);
+	if (local < 0)
+		goto tx_error;
+	if (local)
 		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
-	}
 
+	rt = skb_rtable(skb);
 	tdev = rt->dst.dev;
 
-	mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
-	if (mtu < 68) {
-		IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
-		goto tx_error_put;
-	}
-	if (rt_is_output_route(skb_rtable(skb)))
-		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
-
 	/* Copy DF, reset fragment offset and MF */
 	df = sysctl_pmtu_disc(ipvs) ? old_iph->frag_off & htons(IP_DF) : 0;
 
-	if (df && mtu < ntohs(old_iph->tot_len) && !skb_is_gso(skb)) {
-		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
-		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
-		goto tx_error_put;
-	}
-
 	/*
 	 * Okay, now see if we can stuff it in the buffer as-is.
 	 */
@@ -798,12 +779,9 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
 		struct sk_buff *new_skb =
 			skb_realloc_headroom(skb, max_headroom);
-		if (!new_skb) {
-			ip_rt_put(rt);
-			kfree_skb(skb);
-			IP_VS_ERR_RL("%s(): no memory\n", __func__);
-			return NF_STOLEN;
-		}
+
+		if (!new_skb)
+			goto tx_error;
 		consume_skb(skb);
 		skb = new_skb;
 		old_iph = ip_hdr(skb);
@@ -818,10 +796,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	skb_reset_network_header(skb);
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 
-	/* drop old route */
-	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->dst);
-
 	/*
 	 *	Push down and install the IPIP header.
 	 */
@@ -849,15 +823,10 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	return NF_STOLEN;
 
-  tx_error_icmp:
-	dst_link_failure(skb);
   tx_error:
 	kfree_skb(skb);
 	LeaveFunction(10);
 	return NF_STOLEN;
-tx_error_put:
-	ip_rt_put(rt);
-	goto tx_error;
 }
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -871,45 +840,23 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	struct ipv6hdr  *old_iph = ipv6_hdr(skb);
 	struct ipv6hdr  *iph;		/* Our new IP header */
 	unsigned int max_headroom;	/* The extra header space needed */
-	int    mtu;
-	int ret;
+	int ret, local;
 
 	EnterFunction(10);
 
-	if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6,
-					 &saddr, 1, (IP_VS_RT_MODE_LOCAL |
-						     IP_VS_RT_MODE_NON_LOCAL))))
-		goto tx_error_icmp;
-	if (__ip_vs_is_local_route6(rt)) {
-		dst_release(&rt->dst);
+	local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6,
+				      &saddr, ipvsh, 1,
+				      IP_VS_RT_MODE_LOCAL |
+				      IP_VS_RT_MODE_NON_LOCAL |
+				      IP_VS_RT_MODE_TUNNEL);
+	if (local < 0)
+		goto tx_error;
+	if (local)
 		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
-	}
 
+	rt = (struct rt6_info *) skb_dst(skb);
 	tdev = rt->dst.dev;
 
-	mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
-	if (mtu < IPV6_MIN_MTU) {
-		IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
-			     IPV6_MIN_MTU);
-		goto tx_error_put;
-	}
-	if (skb_dst(skb))
-		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
-
-	/* MTU checking: Notice that 'mtu' have been adjusted before hand */
-	if (__mtu_check_toobig_v6(skb, mtu)) {
-		if (!skb->dev) {
-			struct net *net = dev_net(skb_dst(skb)->dev);
-
-			skb->dev = net->loopback_dev;
-		}
-		/* only send ICMP too big on first fragment */
-		if (!ipvsh->fragoffs)
-			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
-		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
-		goto tx_error_put;
-	}
-
 	/*
 	 * Okay, now see if we can stuff it in the buffer as-is.
 	 */
@@ -918,12 +865,9 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
 		struct sk_buff *new_skb =
 			skb_realloc_headroom(skb, max_headroom);
-		if (!new_skb) {
-			dst_release(&rt->dst);
-			kfree_skb(skb);
-			IP_VS_ERR_RL("%s(): no memory\n", __func__);
-			return NF_STOLEN;
-		}
+
+		if (!new_skb)
+			goto tx_error;
 		consume_skb(skb);
 		skb = new_skb;
 		old_iph = ipv6_hdr(skb);
@@ -935,10 +879,6 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	skb_reset_network_header(skb);
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 
-	/* drop old route */
-	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->dst);
-
 	/*
 	 *	Push down and install the IPIP header.
 	 */
@@ -966,15 +906,10 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	return NF_STOLEN;
 
-tx_error_icmp:
-	dst_link_failure(skb);
 tx_error:
 	kfree_skb(skb);
 	LeaveFunction(10);
 	return NF_STOLEN;
-tx_error_put:
-	dst_release(&rt->dst);
-	goto tx_error;
 }
 #endif
 
@@ -987,38 +922,21 @@ int
 ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	      struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
 {
-	struct rtable *rt;			/* Route to the other host */
-	struct iphdr  *iph = ip_hdr(skb);
-	int    mtu;
+	int local;
 
 	EnterFunction(10);
 
-	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
-				      IP_VS_RT_MODE_LOCAL |
-				      IP_VS_RT_MODE_NON_LOCAL |
-				      IP_VS_RT_MODE_KNOWN_NH, NULL)))
-		goto tx_error_icmp;
-	if (rt->rt_flags & RTCF_LOCAL) {
-		ip_rt_put(rt);
-		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
-	}
-
-	/* MTU checking */
-	mtu = dst_mtu(&rt->dst);
-	if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu &&
-	    !skb_is_gso(skb)) {
-		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
-		ip_rt_put(rt);
-		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+	local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+				   IP_VS_RT_MODE_LOCAL |
+				   IP_VS_RT_MODE_NON_LOCAL |
+				   IP_VS_RT_MODE_KNOWN_NH, NULL);
+	if (local < 0)
 		goto tx_error;
-	}
+	if (local)
+		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
 
 	ip_send_check(ip_hdr(skb));
 
-	/* drop old route */
-	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->dst);
-
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
@@ -1027,8 +945,6 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	LeaveFunction(10);
 	return NF_STOLEN;
 
-  tx_error_icmp:
-	dst_link_failure(skb);
   tx_error:
 	kfree_skb(skb);
 	LeaveFunction(10);
@@ -1038,41 +954,20 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 #ifdef CONFIG_IP_VS_IPV6
 int
 ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
-		 struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph)
+		 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
 {
-	struct rt6_info *rt;			/* Route to the other host */
-	int    mtu;
+	int local;
 
 	EnterFunction(10);
 
-	if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
-					 0, (IP_VS_RT_MODE_LOCAL |
-					     IP_VS_RT_MODE_NON_LOCAL))))
-		goto tx_error_icmp;
-	if (__ip_vs_is_local_route6(rt)) {
-		dst_release(&rt->dst);
-		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
-	}
-
-	/* MTU checking */
-	mtu = dst_mtu(&rt->dst);
-	if (__mtu_check_toobig_v6(skb, mtu)) {
-		if (!skb->dev) {
-			struct net *net = dev_net(skb_dst(skb)->dev);
-
-			skb->dev = net->loopback_dev;
-		}
-		/* only send ICMP too big on first fragment */
-		if (!iph->fragoffs)
-			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
-		dst_release(&rt->dst);
-		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+	local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+				      ipvsh, 0,
+				      IP_VS_RT_MODE_LOCAL |
+				      IP_VS_RT_MODE_NON_LOCAL);
+	if (local < 0)
 		goto tx_error;
-	}
-
-	/* drop old route */
-	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->dst);
+	if (local)
+		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
 
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
@@ -1082,8 +977,6 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	LeaveFunction(10);
 	return NF_STOLEN;
 
-tx_error_icmp:
-	dst_link_failure(skb);
 tx_error:
 	kfree_skb(skb);
 	LeaveFunction(10);
@@ -1102,10 +995,9 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		struct ip_vs_iphdr *iph)
 {
 	struct rtable	*rt;	/* Route to the other host */
-	int mtu;
 	int rc;
 	int local;
-	int rt_mode;
+	int rt_mode, was_input;
 
 	EnterFunction(10);
 
@@ -1125,15 +1017,16 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/*
 	 * mangle and send the packet here (only for VS/NAT)
 	 */
+	was_input = rt_is_input_route(skb_rtable(skb));
 
 	/* LOCALNODE from FORWARD hook is not supported */
 	rt_mode = (hooknum != NF_INET_FORWARD) ?
 		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
 		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
-	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
-				      rt_mode, NULL)))
-		goto tx_error_icmp;
-	local = rt->rt_flags & RTCF_LOCAL;
+	local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, rt_mode, NULL);
+	if (local < 0)
+		goto tx_error;
+	rt = skb_rtable(skb);
 
 	/*
 	 * Avoid duplicate tuple in reply direction for NAT traffic
@@ -1148,71 +1041,49 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 			IP_VS_DBG(10, "%s(): "
 				  "stopping DNAT to local address %pI4\n",
 				  __func__, &cp->daddr.ip);
-			goto tx_error_put;
+			goto tx_error;
 		}
 	}
 #endif
 
 	/* From world but DNAT to loopback address? */
-	if (local && ipv4_is_loopback(cp->daddr.ip) &&
-	    rt_is_input_route(skb_rtable(skb))) {
+	if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
 		IP_VS_DBG(1, "%s(): "
 			  "stopping DNAT to loopback %pI4\n",
 			  __func__, &cp->daddr.ip);
-		goto tx_error_put;
-	}
-
-	/* MTU checking */
-	mtu = dst_mtu(&rt->dst);
-	if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF)) &&
-	    !skb_is_gso(skb)) {
-		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
-		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
-		goto tx_error_put;
+		goto tx_error;
 	}
 
 	/* copy-on-write the packet before mangling it */
 	if (!skb_make_writable(skb, offset))
-		goto tx_error_put;
+		goto tx_error;
 
 	if (skb_cow(skb, rt->dst.dev->hard_header_len))
-		goto tx_error_put;
+		goto tx_error;
 
 	ip_vs_nat_icmp(skb, pp, cp, 0);
 
-	if (!local) {
-		skb_dst_drop(skb);
-		skb_dst_set(skb, &rt->dst);
-	} else
-		ip_rt_put(rt);
-
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
 	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
 	goto out;
 
-  tx_error_icmp:
-	dst_link_failure(skb);
   tx_error:
 	dev_kfree_skb(skb);
 	rc = NF_STOLEN;
   out:
 	LeaveFunction(10);
 	return rc;
-  tx_error_put:
-	ip_rt_put(rt);
-	goto tx_error;
 }
 
 #ifdef CONFIG_IP_VS_IPV6
 int
 ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
-		struct ip_vs_iphdr *iph)
+		struct ip_vs_iphdr *ipvsh)
 {
 	struct rt6_info	*rt;	/* Route to the other host */
-	int mtu;
 	int rc;
 	int local;
 	int rt_mode;
@@ -1224,7 +1095,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	   translate address/port back */
 	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
 		if (cp->packet_xmit)
-			rc = cp->packet_xmit(skb, cp, pp, iph);
+			rc = cp->packet_xmit(skb, cp, pp, ipvsh);
 		else
 			rc = NF_ACCEPT;
 		/* do not touch skb anymore */
@@ -1240,11 +1111,11 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	rt_mode = (hooknum != NF_INET_FORWARD) ?
 		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
 		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
-	if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
-					 0, rt_mode)))
-		goto tx_error_icmp;
-
-	local = __ip_vs_is_local_route6(rt);
+	local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+				      ipvsh, 0, rt_mode);
+	if (local < 0)
+		goto tx_error;
+	rt = (struct rt6_info *) skb_dst(skb);
 	/*
 	 * Avoid duplicate tuple in reply direction for NAT traffic
 	 * to local address when connection is sync-ed
@@ -1258,7 +1129,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 			IP_VS_DBG(10, "%s(): "
 				  "stopping DNAT to local address %pI6\n",
 				  __func__, &cp->daddr.in6);
-			goto tx_error_put;
+			goto tx_error;
 		}
 	}
 #endif
@@ -1269,57 +1140,29 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		IP_VS_DBG(1, "%s(): "
 			  "stopping DNAT to loopback %pI6\n",
 			  __func__, &cp->daddr.in6);
-		goto tx_error_put;
-	}
-
-	/* MTU checking */
-	mtu = dst_mtu(&rt->dst);
-	if (__mtu_check_toobig_v6(skb, mtu)) {
-		if (!skb->dev) {
-			struct net *net = dev_net(skb_dst(skb)->dev);
-
-			skb->dev = net->loopback_dev;
-		}
-		/* only send ICMP too big on first fragment */
-		if (!iph->fragoffs)
-			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
-		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
-		goto tx_error_put;
+		goto tx_error;
 	}
 
 	/* copy-on-write the packet before mangling it */
 	if (!skb_make_writable(skb, offset))
-		goto tx_error_put;
+		goto tx_error;
 
 	if (skb_cow(skb, rt->dst.dev->hard_header_len))
-		goto tx_error_put;
+		goto tx_error;
 
 	ip_vs_nat_icmp_v6(skb, pp, cp, 0);
 
-	if (!local || !skb->dev) {
-		skb_dst_drop(skb);
-		skb_dst_set(skb, &rt->dst);
-	} else {
-		/* destined to loopback, do we need to change route? */
-		dst_release(&rt->dst);
-	}
-
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
 	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
 	goto out;
 
-tx_error_icmp:
-	dst_link_failure(skb);
 tx_error:
 	dev_kfree_skb(skb);
 	rc = NF_STOLEN;
 out:
 	LeaveFunction(10);
 	return rc;
-tx_error_put:
-	dst_release(&rt->dst);
-	goto tx_error;
 }
 #endif

From 026ace060dfe29275d2188297a62fa37d6c1a02c Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Thu, 21 Mar 2013 11:58:06 +0200
Subject: [PATCH 09/49] ipvs: optimize dst usage for real server

Currently when forwarding requests to real servers
we use dst_lock and atomic operations when cloning the
dst_cache value. As the dst_cache value does not change
most of the time it is better to use RCU and to lock
dst_lock only when we need to replace the obsoleted dst.
For this to work we keep dst_cache in new structure protected
by RCU. For packets to remote real servers we will use noref
version of dst_cache, it will be valid while we are in RCU
read-side critical section because now dst_release for replaced
dsts will be invoked after the grace period. Packets to
local real servers that are passed to local stack with
NF_ACCEPT need a dst clone.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off by: Hans Schillstrom <hans@schillstrom.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h             |  12 +-
 net/netfilter/ipvs/ip_vs_core.c |  11 +-
 net/netfilter/ipvs/ip_vs_ctl.c  |  25 +++--
 net/netfilter/ipvs/ip_vs_xmit.c | 192 ++++++++++++++++++++++++--------
 4 files changed, 179 insertions(+), 61 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 8ad73a834655..a150ff5d838c 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -724,6 +724,13 @@ struct ip_vs_service {
 	struct ip_vs_pe		*pe;
 };
 
+/* Information for cached dst */
+struct ip_vs_dest_dst {
+	struct dst_entry	*dst_cache;	/* destination cache entry */
+	u32			dst_cookie;
+	union nf_inet_addr	dst_saddr;
+	struct rcu_head		rcu_head;
+};
 
 /*
  *	The real server destination forwarding entry
@@ -752,9 +759,7 @@ struct ip_vs_dest {
 
 	/* for destination cache */
 	spinlock_t		dst_lock;	/* lock of dst_cache */
-	struct dst_entry	*dst_cache;	/* destination cache entry */
-	u32			dst_cookie;
-	union nf_inet_addr	dst_saddr;
+	struct ip_vs_dest_dst __rcu *dest_dst;	/* cached dst info */
 
 	/* for virtual service */
 	struct ip_vs_service	*svc;		/* service it belongs to */
@@ -1427,6 +1432,7 @@ extern int ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 extern int ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 			   struct ip_vs_protocol *pp, int offset,
 			   unsigned int hooknum, struct ip_vs_iphdr *iph);
+extern void ip_vs_dest_dst_rcu_free(struct rcu_head *head);
 
 #ifdef CONFIG_IP_VS_IPV6
 extern int ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 2aef23ed748b..6ad24e7ef440 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1395,10 +1395,13 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
 				goto ignore_ipip;
 			/* Prefer the resulting PMTU */
 			if (dest) {
-				spin_lock(&dest->dst_lock);
-				if (dest->dst_cache)
-					mtu = dst_mtu(dest->dst_cache);
-				spin_unlock(&dest->dst_lock);
+				struct ip_vs_dest_dst *dest_dst;
+
+				rcu_read_lock();
+				dest_dst = rcu_dereference(dest->dest_dst);
+				if (dest_dst)
+					mtu = dst_mtu(dest_dst->dst_cache);
+				rcu_read_unlock();
 			}
 			if (mtu > 68 + sizeof(struct iphdr))
 				mtu -= sizeof(struct iphdr);
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 5265eaa7b42b..ef48cc55f775 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -641,15 +641,26 @@ struct ip_vs_dest *ip_vs_find_dest(struct net  *net, int af,
 	return dest;
 }
 
-/* Release dst_cache for dest in user context */
+void ip_vs_dest_dst_rcu_free(struct rcu_head *head)
+{
+	struct ip_vs_dest_dst *dest_dst = container_of(head,
+						       struct ip_vs_dest_dst,
+						       rcu_head);
+
+	dst_release(dest_dst->dst_cache);
+	kfree(dest_dst);
+}
+
+/* Release dest_dst and dst_cache for dest in user context */
 static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest)
 {
-	struct dst_entry *old_dst;
+	struct ip_vs_dest_dst *old;
 
-	old_dst = dest->dst_cache;
-	dest->dst_cache = NULL;
-	dst_release(old_dst);
-	dest->dst_saddr.ip = 0;
+	old = rcu_dereference_protected(dest->dest_dst, 1);
+	if (old) {
+		RCU_INIT_POINTER(dest->dest_dst, NULL);
+		call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
+	}
 }
 
 /*
@@ -1513,7 +1524,7 @@ static inline void
 ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev)
 {
 	spin_lock_bh(&dest->dst_lock);
-	if (dest->dst_cache && dest->dst_cache->dev == dev) {
+	if (dest->dest_dst && dest->dest_dst->dst_cache->dev == dev) {
 		IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n",
 			      dev->name,
 			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 603eb8a50e19..3db7889edefc 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -17,6 +17,8 @@
  * - not all connections have destination server, for example,
  * connections in backup server when fwmark is used
  * - bypass connections use daddr from packet
+ * - we can use dst without ref while sending in RCU section, we use
+ * ref when returning NF_ACCEPT for NAT-ed packet via loopback
  * LOCAL_OUT rules:
  * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
  * - skb->pkt_type is not set yet
@@ -54,34 +56,51 @@ enum {
 	IP_VS_RT_MODE_TUNNEL	= 32,/* Tunnel mode */
 };
 
+static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void)
+{
+	return kmalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC);
+}
+
+static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst)
+{
+	kfree(dest_dst);
+}
+
 /*
  *      Destination cache to speed up outgoing route lookup
  */
 static inline void
-__ip_vs_dst_set(struct ip_vs_dest *dest, struct dst_entry *dst, u32 dst_cookie)
+__ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst,
+		struct dst_entry *dst, u32 dst_cookie)
 {
-	struct dst_entry *old_dst;
+	struct ip_vs_dest_dst *old;
 
-	old_dst = dest->dst_cache;
-	dest->dst_cache = dst;
-	dest->dst_cookie = dst_cookie;
-	dst_release(old_dst);
+	old = rcu_dereference_protected(dest->dest_dst,
+					lockdep_is_held(&dest->dst_lock));
+
+	if (dest_dst) {
+		dest_dst->dst_cache = dst;
+		dest_dst->dst_cookie = dst_cookie;
+	}
+	rcu_assign_pointer(dest->dest_dst, dest_dst);
+
+	if (old)
+		call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
 }
 
-static inline struct dst_entry *
+static inline struct ip_vs_dest_dst *
 __ip_vs_dst_check(struct ip_vs_dest *dest)
 {
-	struct dst_entry *dst = dest->dst_cache;
+	struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst);
+	struct dst_entry *dst;
 
-	if (!dst)
+	if (!dest_dst)
 		return NULL;
-	if (dst->obsolete && dst->ops->check(dst, dest->dst_cookie) == NULL) {
-		dest->dst_cache = NULL;
-		dst_release(dst);
+	dst = dest_dst->dst_cache;
+	if (dst->obsolete &&
+	    dst->ops->check(dst, dest_dst->dst_cookie) == NULL)
 		return NULL;
-	}
-	dst_hold(dst);
-	return dst;
+	return dest_dst;
 }
 
 static inline bool
@@ -144,35 +163,48 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
 {
 	struct net *net = dev_net(skb_dst(skb)->dev);
 	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_dest_dst *dest_dst;
 	struct rtable *rt;			/* Route to the other host */
 	struct rtable *ort;			/* Original route */
 	struct iphdr *iph;
 	__be16 df;
 	int mtu;
-	int local;
+	int local, noref = 1;
 
 	if (dest) {
-		spin_lock(&dest->dst_lock);
-		rt = (struct rtable *) __ip_vs_dst_check(dest);
-		if (!rt) {
-			rt = do_output_route4(net, dest->addr.ip, rt_mode,
-					      &dest->dst_saddr.ip);
-			if (!rt) {
+		dest_dst = __ip_vs_dst_check(dest);
+		if (likely(dest_dst))
+			rt = (struct rtable *) dest_dst->dst_cache;
+		else {
+			dest_dst = ip_vs_dest_dst_alloc();
+			spin_lock(&dest->dst_lock);
+			if (!dest_dst) {
+				__ip_vs_dst_set(dest, NULL, NULL, 0);
 				spin_unlock(&dest->dst_lock);
 				goto err_unreach;
 			}
-			__ip_vs_dst_set(dest, dst_clone(&rt->dst), 0);
+			rt = do_output_route4(net, dest->addr.ip, rt_mode,
+					      &dest_dst->dst_saddr.ip);
+			if (!rt) {
+				__ip_vs_dst_set(dest, NULL, NULL, 0);
+				spin_unlock(&dest->dst_lock);
+				ip_vs_dest_dst_free(dest_dst);
+				goto err_unreach;
+			}
+			__ip_vs_dst_set(dest, dest_dst, &rt->dst, 0);
+			spin_unlock(&dest->dst_lock);
 			IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
-				  &dest->addr.ip, &dest->dst_saddr.ip,
+				  &dest->addr.ip, &dest_dst->dst_saddr.ip,
 				  atomic_read(&rt->dst.__refcnt));
 		}
 		daddr = dest->addr.ip;
 		if (ret_saddr)
-			*ret_saddr = dest->dst_saddr.ip;
-		spin_unlock(&dest->dst_lock);
+			*ret_saddr = dest_dst->dst_saddr.ip;
 	} else {
 		__be32 saddr = htonl(INADDR_ANY);
 
+		noref = 0;
+
 		/* For such unconfigured boxes avoid many route lookups
 		 * for performance reasons because we do not remember saddr
 		 */
@@ -210,7 +242,8 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
 			goto err_put;
 		}
 		/* skb to local stack, preserve old route */
-		ip_rt_put(rt);
+		if (!noref)
+			ip_rt_put(rt);
 		return local;
 	}
 
@@ -240,12 +273,19 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
 	}
 
 	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->dst);
+	if (noref) {
+		if (!local)
+			skb_dst_set_noref_force(skb, &rt->dst);
+		else
+			skb_dst_set(skb, dst_clone(&rt->dst));
+	} else
+		skb_dst_set(skb, &rt->dst);
 
 	return local;
 
 err_put:
-	ip_rt_put(rt);
+	if (!noref)
+		ip_rt_put(rt);
 	return -1;
 
 err_unreach:
@@ -303,36 +343,48 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
 		      struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode)
 {
 	struct net *net = dev_net(skb_dst(skb)->dev);
+	struct ip_vs_dest_dst *dest_dst;
 	struct rt6_info *rt;			/* Route to the other host */
 	struct rt6_info *ort;			/* Original route */
 	struct dst_entry *dst;
 	int mtu;
-	int local;
+	int local, noref = 1;
 
 	if (dest) {
-		spin_lock(&dest->dst_lock);
-		rt = (struct rt6_info *)__ip_vs_dst_check(dest);
-		if (!rt) {
+		dest_dst = __ip_vs_dst_check(dest);
+		if (likely(dest_dst))
+			rt = (struct rt6_info *) dest_dst->dst_cache;
+		else {
 			u32 cookie;
 
+			dest_dst = ip_vs_dest_dst_alloc();
+			spin_lock(&dest->dst_lock);
+			if (!dest_dst) {
+				__ip_vs_dst_set(dest, NULL, NULL, 0);
+				spin_unlock(&dest->dst_lock);
+				goto err_unreach;
+			}
 			dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
-						      &dest->dst_saddr.in6,
+						      &dest_dst->dst_saddr.in6,
 						      do_xfrm);
 			if (!dst) {
+				__ip_vs_dst_set(dest, NULL, NULL, 0);
 				spin_unlock(&dest->dst_lock);
+				ip_vs_dest_dst_free(dest_dst);
 				goto err_unreach;
 			}
 			rt = (struct rt6_info *) dst;
 			cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
-			__ip_vs_dst_set(dest, dst_clone(&rt->dst), cookie);
+			__ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
+			spin_unlock(&dest->dst_lock);
 			IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
-				  &dest->addr.in6, &dest->dst_saddr.in6,
+				  &dest->addr.in6, &dest_dst->dst_saddr.in6,
 				  atomic_read(&rt->dst.__refcnt));
 		}
 		if (ret_saddr)
-			*ret_saddr = dest->dst_saddr.in6;
-		spin_unlock(&dest->dst_lock);
+			*ret_saddr = dest_dst->dst_saddr.in6;
 	} else {
+		noref = 0;
 		dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm);
 		if (!dst)
 			goto err_unreach;
@@ -367,7 +419,8 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
 			goto err_put;
 		}
 		/* skb to local stack, preserve old route */
-		dst_release(&rt->dst);
+		if (!noref)
+			dst_release(&rt->dst);
 		return local;
 	}
 
@@ -399,12 +452,19 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
 	}
 
 	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->dst);
+	if (noref) {
+		if (!local)
+			skb_dst_set_noref_force(skb, &rt->dst);
+		else
+			skb_dst_set(skb, dst_clone(&rt->dst));
+	} else
+		skb_dst_set(skb, &rt->dst);
 
 	return local;
 
 err_put:
-	dst_release(&rt->dst);
+	if (!noref)
+		dst_release(&rt->dst);
 	return -1;
 
 err_unreach:
@@ -494,6 +554,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	EnterFunction(10);
 
+	rcu_read_lock();
 	if (__ip_vs_get_out_rt(skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL,
 			       NULL) < 0)
 		goto tx_error;
@@ -504,12 +565,14 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	skb->local_df = 1;
 
 	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
+	rcu_read_unlock();
 
 	LeaveFunction(10);
 	return NF_STOLEN;
 
  tx_error:
 	kfree_skb(skb);
+	rcu_read_unlock();
 	LeaveFunction(10);
 	return NF_STOLEN;
 }
@@ -521,6 +584,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 {
 	EnterFunction(10);
 
+	rcu_read_lock();
 	if (__ip_vs_get_out_rt_v6(skb, NULL, &ipvsh->daddr.in6, NULL,
 				  ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0)
 		goto tx_error;
@@ -529,12 +593,14 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	skb->local_df = 1;
 
 	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
+	rcu_read_unlock();
 
 	LeaveFunction(10);
 	return NF_STOLEN;
 
  tx_error:
 	kfree_skb(skb);
+	rcu_read_unlock();
 	LeaveFunction(10);
 	return NF_STOLEN;
 }
@@ -553,6 +619,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	EnterFunction(10);
 
+	rcu_read_lock();
 	/* check if it is a connection of no-client-port */
 	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
 		__be16 _pt, *p;
@@ -620,12 +687,14 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	skb->local_df = 1;
 
 	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
+	rcu_read_unlock();
 
 	LeaveFunction(10);
 	return rc;
 
   tx_error:
 	kfree_skb(skb);
+	rcu_read_unlock();
 	LeaveFunction(10);
 	return NF_STOLEN;
 }
@@ -640,6 +709,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	EnterFunction(10);
 
+	rcu_read_lock();
 	/* check if it is a connection of no-client-port */
 	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) {
 		__be16 _pt, *p;
@@ -707,6 +777,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	skb->local_df = 1;
 
 	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
+	rcu_read_unlock();
 
 	LeaveFunction(10);
 	return rc;
@@ -714,6 +785,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 tx_error:
 	LeaveFunction(10);
 	kfree_skb(skb);
+	rcu_read_unlock();
 	return NF_STOLEN;
 }
 #endif
@@ -755,6 +827,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	EnterFunction(10);
 
+	rcu_read_lock();
 	local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
 				   IP_VS_RT_MODE_LOCAL |
 				   IP_VS_RT_MODE_NON_LOCAL |
@@ -762,8 +835,10 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 				   IP_VS_RT_MODE_TUNNEL, &saddr);
 	if (local < 0)
 		goto tx_error;
-	if (local)
+	if (local) {
+		rcu_read_unlock();
 		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
+	}
 
 	rt = skb_rtable(skb);
 	tdev = rt->dst.dev;
@@ -818,6 +893,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		ip_local_out(skb);
 	else if (ret == NF_DROP)
 		kfree_skb(skb);
+	rcu_read_unlock();
 
 	LeaveFunction(10);
 
@@ -825,6 +901,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
   tx_error:
 	kfree_skb(skb);
+	rcu_read_unlock();
 	LeaveFunction(10);
 	return NF_STOLEN;
 }
@@ -844,6 +921,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	EnterFunction(10);
 
+	rcu_read_lock();
 	local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6,
 				      &saddr, ipvsh, 1,
 				      IP_VS_RT_MODE_LOCAL |
@@ -851,8 +929,10 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 				      IP_VS_RT_MODE_TUNNEL);
 	if (local < 0)
 		goto tx_error;
-	if (local)
+	if (local) {
+		rcu_read_unlock();
 		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
+	}
 
 	rt = (struct rt6_info *) skb_dst(skb);
 	tdev = rt->dst.dev;
@@ -901,6 +981,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		ip6_local_out(skb);
 	else if (ret == NF_DROP)
 		kfree_skb(skb);
+	rcu_read_unlock();
 
 	LeaveFunction(10);
 
@@ -908,6 +989,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 tx_error:
 	kfree_skb(skb);
+	rcu_read_unlock();
 	LeaveFunction(10);
 	return NF_STOLEN;
 }
@@ -926,14 +1008,17 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	EnterFunction(10);
 
+	rcu_read_lock();
 	local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
 				   IP_VS_RT_MODE_LOCAL |
 				   IP_VS_RT_MODE_NON_LOCAL |
 				   IP_VS_RT_MODE_KNOWN_NH, NULL);
 	if (local < 0)
 		goto tx_error;
-	if (local)
+	if (local) {
+		rcu_read_unlock();
 		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
+	}
 
 	ip_send_check(ip_hdr(skb));
 
@@ -941,12 +1026,14 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	skb->local_df = 1;
 
 	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
+	rcu_read_unlock();
 
 	LeaveFunction(10);
 	return NF_STOLEN;
 
   tx_error:
 	kfree_skb(skb);
+	rcu_read_unlock();
 	LeaveFunction(10);
 	return NF_STOLEN;
 }
@@ -960,25 +1047,30 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	EnterFunction(10);
 
+	rcu_read_lock();
 	local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
 				      ipvsh, 0,
 				      IP_VS_RT_MODE_LOCAL |
 				      IP_VS_RT_MODE_NON_LOCAL);
 	if (local < 0)
 		goto tx_error;
-	if (local)
+	if (local) {
+		rcu_read_unlock();
 		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
+	}
 
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
 	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
+	rcu_read_unlock();
 
 	LeaveFunction(10);
 	return NF_STOLEN;
 
 tx_error:
 	kfree_skb(skb);
+	rcu_read_unlock();
 	LeaveFunction(10);
 	return NF_STOLEN;
 }
@@ -1023,6 +1115,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	rt_mode = (hooknum != NF_INET_FORWARD) ?
 		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
 		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
+	rcu_read_lock();
 	local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, rt_mode, NULL);
 	if (local < 0)
 		goto tx_error;
@@ -1067,10 +1160,12 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	skb->local_df = 1;
 
 	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
+	rcu_read_unlock();
 	goto out;
 
   tx_error:
-	dev_kfree_skb(skb);
+	kfree_skb(skb);
+	rcu_read_unlock();
 	rc = NF_STOLEN;
   out:
 	LeaveFunction(10);
@@ -1111,6 +1206,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	rt_mode = (hooknum != NF_INET_FORWARD) ?
 		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
 		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
+	rcu_read_lock();
 	local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
 				      ipvsh, 0, rt_mode);
 	if (local < 0)
@@ -1156,10 +1252,12 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	skb->local_df = 1;
 
 	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
+	rcu_read_unlock();
 	goto out;
 
 tx_error:
-	dev_kfree_skb(skb);
+	kfree_skb(skb);
+	rcu_read_unlock();
 	rc = NF_STOLEN;
 out:
 	LeaveFunction(10);

From 363c97d7435ebba8a040f86e29bdec79ee182f0c Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Thu, 21 Mar 2013 11:58:07 +0200
Subject: [PATCH 10/49] ipvs: convert app locks

We use locks like tcp_app_lock, udp_app_lock,
sctp_app_lock to protect access to the protocol hash tables
from readers in packet context while the application
instances (inc) are [un]registered under global mutex.

As the hash tables are mostly read when conns are
created and bound to app, use RCU for readers and reclaim
app instance after grace period.

Simplify ip_vs_app_inc_get because we use usecnt
only for statistics and rely on module refcounting.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off by: Hans Schillstrom <hans@schillstrom.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h                   |  4 +---
 net/netfilter/ipvs/ip_vs_app.c        | 27 +++++++++++++++++++--------
 net/netfilter/ipvs/ip_vs_ftp.c        |  2 ++
 net/netfilter/ipvs/ip_vs_proto_sctp.c | 18 ++++++------------
 net/netfilter/ipvs/ip_vs_proto_tcp.c  | 18 ++++++------------
 net/netfilter/ipvs/ip_vs_proto_udp.c  | 19 ++++++-------------
 6 files changed, 40 insertions(+), 48 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index a150ff5d838c..84ca17141a44 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -823,6 +823,7 @@ struct ip_vs_app {
 	struct ip_vs_app	*app;		/* its real application */
 	__be16			port;		/* port number in net order */
 	atomic_t		usecnt;		/* usage counter */
+	struct rcu_head		rcu_head;
 
 	/*
 	 * output hook: Process packet in inout direction, diff set for TCP.
@@ -908,7 +909,6 @@ struct netns_ipvs {
 	#define	TCP_APP_TAB_SIZE	(1 << TCP_APP_TAB_BITS)
 	#define	TCP_APP_TAB_MASK	(TCP_APP_TAB_SIZE - 1)
 	struct list_head	tcp_apps[TCP_APP_TAB_SIZE];
-	spinlock_t		tcp_app_lock;
 #endif
 	/* ip_vs_proto_udp */
 #ifdef CONFIG_IP_VS_PROTO_UDP
@@ -916,7 +916,6 @@ struct netns_ipvs {
 	#define	UDP_APP_TAB_SIZE	(1 << UDP_APP_TAB_BITS)
 	#define	UDP_APP_TAB_MASK	(UDP_APP_TAB_SIZE - 1)
 	struct list_head	udp_apps[UDP_APP_TAB_SIZE];
-	spinlock_t		udp_app_lock;
 #endif
 	/* ip_vs_proto_sctp */
 #ifdef CONFIG_IP_VS_PROTO_SCTP
@@ -925,7 +924,6 @@ struct netns_ipvs {
 	#define SCTP_APP_TAB_MASK	(SCTP_APP_TAB_SIZE - 1)
 	/* Hash table for SCTP application incarnations	 */
 	struct list_head	sctp_apps[SCTP_APP_TAB_SIZE];
-	spinlock_t		sctp_app_lock;
 #endif
 	/* ip_vs_conn */
 	atomic_t		conn_count;      /*  connection counter */
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index 0b779d7df881..a95603004bc4 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -58,6 +58,18 @@ static inline void ip_vs_app_put(struct ip_vs_app *app)
 	module_put(app->module);
 }
 
+static void ip_vs_app_inc_destroy(struct ip_vs_app *inc)
+{
+	kfree(inc->timeout_table);
+	kfree(inc);
+}
+
+static void ip_vs_app_inc_rcu_free(struct rcu_head *head)
+{
+	struct ip_vs_app *inc = container_of(head, struct ip_vs_app, rcu_head);
+
+	ip_vs_app_inc_destroy(inc);
+}
 
 /*
  *	Allocate/initialize app incarnation and register it in proto apps.
@@ -106,8 +118,7 @@ ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto,
 	return 0;
 
   out:
-	kfree(inc->timeout_table);
-	kfree(inc);
+	ip_vs_app_inc_destroy(inc);
 	return ret;
 }
 
@@ -131,8 +142,7 @@ ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc)
 
 	list_del(&inc->a_list);
 
-	kfree(inc->timeout_table);
-	kfree(inc);
+	call_rcu(&inc->rcu_head, ip_vs_app_inc_rcu_free);
 }
 
 
@@ -144,9 +154,9 @@ int ip_vs_app_inc_get(struct ip_vs_app *inc)
 {
 	int result;
 
-	atomic_inc(&inc->usecnt);
-	if (unlikely((result = ip_vs_app_get(inc->app)) != 1))
-		atomic_dec(&inc->usecnt);
+	result = ip_vs_app_get(inc->app);
+	if (result)
+		atomic_inc(&inc->usecnt);
 	return result;
 }
 
@@ -156,8 +166,8 @@ int ip_vs_app_inc_get(struct ip_vs_app *inc)
  */
 void ip_vs_app_inc_put(struct ip_vs_app *inc)
 {
-	ip_vs_app_put(inc->app);
 	atomic_dec(&inc->usecnt);
+	ip_vs_app_put(inc->app);
 }
 
 
@@ -218,6 +228,7 @@ out_unlock:
 /*
  *	ip_vs_app unregistration routine
  *	We are sure there are no app incarnations attached to services
+ *	Caller should use synchronize_rcu() or rcu_barrier()
  */
 void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app)
 {
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 4f53a5f04437..7f90825ef1ce 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -480,6 +480,7 @@ static int __init ip_vs_ftp_init(void)
 	int rv;
 
 	rv = register_pernet_subsys(&ip_vs_ftp_ops);
+	/* rcu_barrier() is called by netns on error */
 	return rv;
 }
 
@@ -489,6 +490,7 @@ static int __init ip_vs_ftp_init(void)
 static void __exit ip_vs_ftp_exit(void)
 {
 	unregister_pernet_subsys(&ip_vs_ftp_ops);
+	/* rcu_barrier() is called by netns */
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index cd1d7298f7ba..f7190cdf023e 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -1016,30 +1016,25 @@ static int sctp_register_app(struct net *net, struct ip_vs_app *inc)
 
 	hash = sctp_app_hashkey(port);
 
-	spin_lock_bh(&ipvs->sctp_app_lock);
 	list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) {
 		if (i->port == port) {
 			ret = -EEXIST;
 			goto out;
 		}
 	}
-	list_add(&inc->p_list, &ipvs->sctp_apps[hash]);
+	list_add_rcu(&inc->p_list, &ipvs->sctp_apps[hash]);
 	atomic_inc(&pd->appcnt);
 out:
-	spin_unlock_bh(&ipvs->sctp_app_lock);
 
 	return ret;
 }
 
 static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc)
 {
-	struct netns_ipvs *ipvs = net_ipvs(net);
 	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);
 
-	spin_lock_bh(&ipvs->sctp_app_lock);
 	atomic_dec(&pd->appcnt);
-	list_del(&inc->p_list);
-	spin_unlock_bh(&ipvs->sctp_app_lock);
+	list_del_rcu(&inc->p_list);
 }
 
 static int sctp_app_conn_bind(struct ip_vs_conn *cp)
@@ -1055,12 +1050,12 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp)
 	/* Lookup application incarnations and bind the right one */
 	hash = sctp_app_hashkey(cp->vport);
 
-	spin_lock(&ipvs->sctp_app_lock);
-	list_for_each_entry(inc, &ipvs->sctp_apps[hash], p_list) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(inc, &ipvs->sctp_apps[hash], p_list) {
 		if (inc->port == cp->vport) {
 			if (unlikely(!ip_vs_app_inc_get(inc)))
 				break;
-			spin_unlock(&ipvs->sctp_app_lock);
+			rcu_read_unlock();
 
 			IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
 					"%s:%u to app %s on port %u\n",
@@ -1076,7 +1071,7 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp)
 			goto out;
 		}
 	}
-	spin_unlock(&ipvs->sctp_app_lock);
+	rcu_read_unlock();
 out:
 	return result;
 }
@@ -1090,7 +1085,6 @@ static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd)
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
 	ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE);
-	spin_lock_init(&ipvs->sctp_app_lock);
 	pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts,
 							sizeof(sctp_timeouts));
 	if (!pd->timeout_table)
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 9af653a75825..0bbc3feae682 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -580,18 +580,16 @@ static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
 
 	hash = tcp_app_hashkey(port);
 
-	spin_lock_bh(&ipvs->tcp_app_lock);
 	list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
 		if (i->port == port) {
 			ret = -EEXIST;
 			goto out;
 		}
 	}
-	list_add(&inc->p_list, &ipvs->tcp_apps[hash]);
+	list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]);
 	atomic_inc(&pd->appcnt);
 
   out:
-	spin_unlock_bh(&ipvs->tcp_app_lock);
 	return ret;
 }
 
@@ -599,13 +597,10 @@ static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
 static void
 tcp_unregister_app(struct net *net, struct ip_vs_app *inc)
 {
-	struct netns_ipvs *ipvs = net_ipvs(net);
 	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
 
-	spin_lock_bh(&ipvs->tcp_app_lock);
 	atomic_dec(&pd->appcnt);
-	list_del(&inc->p_list);
-	spin_unlock_bh(&ipvs->tcp_app_lock);
+	list_del_rcu(&inc->p_list);
 }
 
 
@@ -624,12 +619,12 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
 	/* Lookup application incarnations and bind the right one */
 	hash = tcp_app_hashkey(cp->vport);
 
-	spin_lock(&ipvs->tcp_app_lock);
-	list_for_each_entry(inc, &ipvs->tcp_apps[hash], p_list) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) {
 		if (inc->port == cp->vport) {
 			if (unlikely(!ip_vs_app_inc_get(inc)))
 				break;
-			spin_unlock(&ipvs->tcp_app_lock);
+			rcu_read_unlock();
 
 			IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
 				      "%s:%u to app %s on port %u\n",
@@ -646,7 +641,7 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
 			goto out;
 		}
 	}
-	spin_unlock(&ipvs->tcp_app_lock);
+	rcu_read_unlock();
 
   out:
 	return result;
@@ -676,7 +671,6 @@ static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
 	ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
-	spin_lock_init(&ipvs->tcp_app_lock);
 	pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
 							sizeof(tcp_timeouts));
 	if (!pd->timeout_table)
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 503a842c90d2..1a03e2d9c6ba 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -359,19 +359,16 @@ static int udp_register_app(struct net *net, struct ip_vs_app *inc)
 
 	hash = udp_app_hashkey(port);
 
-
-	spin_lock_bh(&ipvs->udp_app_lock);
 	list_for_each_entry(i, &ipvs->udp_apps[hash], p_list) {
 		if (i->port == port) {
 			ret = -EEXIST;
 			goto out;
 		}
 	}
-	list_add(&inc->p_list, &ipvs->udp_apps[hash]);
+	list_add_rcu(&inc->p_list, &ipvs->udp_apps[hash]);
 	atomic_inc(&pd->appcnt);
 
   out:
-	spin_unlock_bh(&ipvs->udp_app_lock);
 	return ret;
 }
 
@@ -380,12 +377,9 @@ static void
 udp_unregister_app(struct net *net, struct ip_vs_app *inc)
 {
 	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
-	struct netns_ipvs *ipvs = net_ipvs(net);
 
-	spin_lock_bh(&ipvs->udp_app_lock);
 	atomic_dec(&pd->appcnt);
-	list_del(&inc->p_list);
-	spin_unlock_bh(&ipvs->udp_app_lock);
+	list_del_rcu(&inc->p_list);
 }
 
 
@@ -403,12 +397,12 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp)
 	/* Lookup application incarnations and bind the right one */
 	hash = udp_app_hashkey(cp->vport);
 
-	spin_lock(&ipvs->udp_app_lock);
-	list_for_each_entry(inc, &ipvs->udp_apps[hash], p_list) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(inc, &ipvs->udp_apps[hash], p_list) {
 		if (inc->port == cp->vport) {
 			if (unlikely(!ip_vs_app_inc_get(inc)))
 				break;
-			spin_unlock(&ipvs->udp_app_lock);
+			rcu_read_unlock();
 
 			IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
 				      "%s:%u to app %s on port %u\n",
@@ -425,7 +419,7 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp)
 			goto out;
 		}
 	}
-	spin_unlock(&ipvs->udp_app_lock);
+	rcu_read_unlock();
 
   out:
 	return result;
@@ -467,7 +461,6 @@ static int __udp_init(struct net *net, struct ip_vs_proto_data *pd)
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
 	ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE);
-	spin_lock_init(&ipvs->udp_app_lock);
 	pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts,
 							sizeof(udp_timeouts));
 	if (!pd->timeout_table)

From 276472eae063d717b775fdfc87529937402d0e08 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Thu, 21 Mar 2013 11:58:08 +0200
Subject: [PATCH 11/49] ipvs: remove rs_lock by using RCU

rs_lock was used to protect rs_table (hash table)
from updaters (under global mutex) and readers (packet handlers).
We can remove rs_lock by using RCU lock for readers. Reclaiming
dest only with kfree_rcu is enough because the readers access
only fields from the ip_vs_dest structure.

Use hlist for rs_table.

As we are now using hlist_del_rcu, introduce in_rs_table
flag as replacement for the list_empty checks which do not
work with RCU. It is needed because only NAT dests are in
the rs_table.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off by: Hans Schillstrom <hans@schillstrom.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h             | 14 +++---
 net/netfilter/ipvs/ip_vs_core.c |  5 +-
 net/netfilter/ipvs/ip_vs_ctl.c  | 88 ++++++++++++---------------------
 3 files changed, 41 insertions(+), 66 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 84ca17141a44..b06aa6c939fa 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -738,7 +738,7 @@ struct ip_vs_dest_dst {
  */
 struct ip_vs_dest {
 	struct list_head	n_list;   /* for the dests in the service */
-	struct list_head	d_list;   /* for table with all the dests */
+	struct hlist_node	d_list;   /* for table with all the dests */
 
 	u16			af;		/* address family */
 	__be16			port;		/* port number of the server */
@@ -767,6 +767,9 @@ struct ip_vs_dest {
 	__be16			vport;		/* virtual port number */
 	union nf_inet_addr	vaddr;		/* virtual IP address */
 	__u32			vfwmark;	/* firewall mark of service */
+
+	struct rcu_head		rcu_head;
+	unsigned int		in_rs_table:1;	/* we are in rs_table */
 };
 
 
@@ -897,7 +900,7 @@ struct netns_ipvs {
 	#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
 	#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
 
-	struct list_head	rs_table[IP_VS_RTAB_SIZE];
+	struct hlist_head	rs_table[IP_VS_RTAB_SIZE];
 	/* ip_vs_app */
 	struct list_head	app_list;
 	/* ip_vs_proto */
@@ -933,7 +936,6 @@ struct netns_ipvs {
 
 	int			num_services;    /* no of virtual services */
 
-	rwlock_t		rs_lock;         /* real services table */
 	/* Trash for destinations */
 	struct list_head	dest_trash;
 	/* Service counters */
@@ -1376,9 +1378,9 @@ static inline void ip_vs_service_put(struct ip_vs_service *svc)
 	atomic_dec(&svc->usecnt);
 }
 
-extern struct ip_vs_dest *
-ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
-			  const union nf_inet_addr *daddr, __be16 dport);
+extern bool
+ip_vs_has_real_service(struct net *net, int af, __u16 protocol,
+		       const union nf_inet_addr *daddr, __be16 dport);
 
 extern int ip_vs_use_count_inc(void);
 extern void ip_vs_use_count_dec(void);
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 6ad24e7ef440..4fc749c6ec27 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1161,9 +1161,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
 					 sizeof(_ports), _ports, &iph);
 		if (pptr == NULL)
 			return NF_ACCEPT;	/* Not for me */
-		if (ip_vs_lookup_real_service(net, af, iph.protocol,
-					      &iph.saddr,
-					      pptr[0])) {
+		if (ip_vs_has_real_service(net, af, iph.protocol, &iph.saddr,
+					   pptr[0])) {
 			/*
 			 * Notify the real server: there is no
 			 * existing entry if it is not RST
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index ef48cc55f775..182d95807bd5 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -508,17 +508,13 @@ static inline unsigned int ip_vs_rs_hashkey(int af,
 		& IP_VS_RTAB_MASK;
 }
 
-/*
- *	Hashes ip_vs_dest in rs_table by <proto,addr,port>.
- *	should be called with locked tables.
- */
-static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
+/* Hash ip_vs_dest in rs_table by <proto,addr,port>. */
+static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
 {
 	unsigned int hash;
 
-	if (!list_empty(&dest->d_list)) {
-		return 0;
-	}
+	if (dest->in_rs_table)
+		return;
 
 	/*
 	 *	Hash by proto,addr,port,
@@ -526,60 +522,47 @@ static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
 	 */
 	hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
 
-	list_add(&dest->d_list, &ipvs->rs_table[hash]);
-
-	return 1;
+	hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]);
+	dest->in_rs_table = 1;
 }
 
-/*
- *	UNhashes ip_vs_dest from rs_table.
- *	should be called with locked tables.
- */
-static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
+/* Unhash ip_vs_dest from rs_table. */
+static void ip_vs_rs_unhash(struct ip_vs_dest *dest)
 {
 	/*
 	 * Remove it from the rs_table table.
 	 */
-	if (!list_empty(&dest->d_list)) {
-		list_del_init(&dest->d_list);
+	if (dest->in_rs_table) {
+		hlist_del_rcu(&dest->d_list);
+		dest->in_rs_table = 0;
 	}
-
-	return 1;
 }
 
-/*
- *	Lookup real service by <proto,addr,port> in the real service table.
- */
-struct ip_vs_dest *
-ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
-			  const union nf_inet_addr *daddr,
-			  __be16 dport)
+/* Check if real service by <proto,addr,port> is present */
+bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol,
+			    const union nf_inet_addr *daddr, __be16 dport)
 {
 	struct netns_ipvs *ipvs = net_ipvs(net);
 	unsigned int hash;
 	struct ip_vs_dest *dest;
 
-	/*
-	 *	Check for "full" addressed entries
-	 *	Return the first found entry
-	 */
+	/* Check for "full" addressed entries */
 	hash = ip_vs_rs_hashkey(af, daddr, dport);
 
-	read_lock(&ipvs->rs_lock);
-	list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) {
-		if ((dest->af == af)
-		    && ip_vs_addr_equal(af, &dest->addr, daddr)
-		    && (dest->port == dport)
-		    && ((dest->protocol == protocol) ||
-			dest->vfwmark)) {
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
+		if (dest->port == dport &&
+		    dest->af == af &&
+		    ip_vs_addr_equal(af, &dest->addr, daddr) &&
+		    (dest->protocol == protocol || dest->vfwmark)) {
 			/* HIT */
-			read_unlock(&ipvs->rs_lock);
-			return dest;
+			rcu_read_unlock();
+			return true;
 		}
 	}
-	read_unlock(&ipvs->rs_lock);
+	rcu_read_unlock();
 
-	return NULL;
+	return false;
 }
 
 /*
@@ -612,9 +595,6 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
  * the backup synchronization daemon. It finds the
  * destination to be bound to the received connection
  * on the backup.
- *
- * ip_vs_lookup_real_service() looked promissing, but
- * seems not working as expected.
  */
 struct ip_vs_dest *ip_vs_find_dest(struct net  *net, int af,
 				   const union nf_inet_addr *daddr,
@@ -715,7 +695,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
 			__ip_vs_dst_cache_reset(dest);
 			__ip_vs_unbind_svc(dest);
 			free_percpu(dest->stats.cpustats);
-			kfree(dest);
+			kfree_rcu(dest, rcu_head);
 		}
 	}
 
@@ -742,7 +722,7 @@ static void ip_vs_trash_cleanup(struct net *net)
 		__ip_vs_dst_cache_reset(dest);
 		__ip_vs_unbind_svc(dest);
 		free_percpu(dest->stats.cpustats);
-		kfree(dest);
+		kfree_rcu(dest, rcu_head);
 	}
 }
 
@@ -807,9 +787,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
 		 *    Put the real service in rs_table if not present.
 		 *    For now only for NAT!
 		 */
-		write_lock_bh(&ipvs->rs_lock);
 		ip_vs_rs_hash(ipvs, dest);
-		write_unlock_bh(&ipvs->rs_lock);
 	}
 	atomic_set(&dest->conn_flags, conn_flags);
 
@@ -905,7 +883,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
 	atomic_set(&dest->persistconns, 0);
 	atomic_set(&dest->refcnt, 1);
 
-	INIT_LIST_HEAD(&dest->d_list);
+	INIT_HLIST_NODE(&dest->d_list);
 	spin_lock_init(&dest->dst_lock);
 	spin_lock_init(&dest->stats.lock);
 	__ip_vs_update_dest(svc, dest, udest, 1);
@@ -1045,9 +1023,7 @@ static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
 	/*
 	 *  Remove it from the d-linked list with the real services.
 	 */
-	write_lock_bh(&ipvs->rs_lock);
 	ip_vs_rs_unhash(dest);
-	write_unlock_bh(&ipvs->rs_lock);
 
 	/*
 	 *  Decrease the refcnt of the dest, and free the dest
@@ -1067,7 +1043,7 @@ static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
 		   time, so the operation here is OK */
 		atomic_dec(&dest->svc->refcnt);
 		free_percpu(dest->stats.cpustats);
-		kfree(dest);
+		kfree_rcu(dest, rcu_head);
 	} else {
 		IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
 			      "dest->refcnt=%d\n",
@@ -3811,11 +3787,9 @@ int __net_init ip_vs_control_net_init(struct net *net)
 	int idx;
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
-	rwlock_init(&ipvs->rs_lock);
-
 	/* Initialize rs_table */
 	for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
-		INIT_LIST_HEAD(&ipvs->rs_table[idx]);
+		INIT_HLIST_HEAD(&ipvs->rs_table[idx]);
 
 	INIT_LIST_HEAD(&ipvs->dest_trash);
 	atomic_set(&ipvs->ftpsvc_counter, 0);
@@ -3892,7 +3866,7 @@ int __init ip_vs_control_init(void)
 
 	EnterFunction(2);
 
-	/* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */
+	/* Initialize svc_table, ip_vs_svc_fwm_table */
 	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
 		INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
 		INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);

From 60b6aa3b319d902db49dbaee7433fe2ac7d0cdb5 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Thu, 21 Mar 2013 11:58:09 +0200
Subject: [PATCH 12/49] ipvs: convert locks used in persistence engines

Allow the readers to use RCU lock and for
PE module registrations use global mutex instead of
spinlock. All PE modules need to use synchronize_rcu
in their module exit handler.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off by: Hans Schillstrom <hans@schillstrom.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_pe.c     | 43 ++++++++++---------------------
 net/netfilter/ipvs/ip_vs_pe_sip.c |  1 +
 2 files changed, 14 insertions(+), 30 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c
index 5cf859ccb31b..5d9774c4cc4c 100644
--- a/net/netfilter/ipvs/ip_vs_pe.c
+++ b/net/netfilter/ipvs/ip_vs_pe.c
@@ -13,8 +13,8 @@
 /* IPVS pe list */
 static LIST_HEAD(ip_vs_pe);
 
-/* lock for service table */
-static DEFINE_SPINLOCK(ip_vs_pe_lock);
+/* semaphore for IPVS PEs. */
+static DEFINE_MUTEX(ip_vs_pe_mutex);
 
 /* Bind a service with a pe */
 void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe)
@@ -36,9 +36,8 @@ struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name)
 	IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__,
 		  pe_name);
 
-	spin_lock_bh(&ip_vs_pe_lock);
-
-	list_for_each_entry(pe, &ip_vs_pe, n_list) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(pe, &ip_vs_pe, n_list) {
 		/* Test and get the modules atomically */
 		if (pe->module &&
 		    !try_module_get(pe->module)) {
@@ -47,14 +46,14 @@ struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name)
 		}
 		if (strcmp(pe_name, pe->name)==0) {
 			/* HIT */
-			spin_unlock_bh(&ip_vs_pe_lock);
+			rcu_read_unlock();
 			return pe;
 		}
 		if (pe->module)
 			module_put(pe->module);
 	}
+	rcu_read_unlock();
 
-	spin_unlock_bh(&ip_vs_pe_lock);
 	return NULL;
 }
 
@@ -83,22 +82,13 @@ int register_ip_vs_pe(struct ip_vs_pe *pe)
 	/* increase the module use count */
 	ip_vs_use_count_inc();
 
-	spin_lock_bh(&ip_vs_pe_lock);
-
-	if (!list_empty(&pe->n_list)) {
-		spin_unlock_bh(&ip_vs_pe_lock);
-		ip_vs_use_count_dec();
-		pr_err("%s(): [%s] pe already linked\n",
-		       __func__, pe->name);
-		return -EINVAL;
-	}
-
+	mutex_lock(&ip_vs_pe_mutex);
 	/* Make sure that the pe with this name doesn't exist
 	 * in the pe list.
 	 */
 	list_for_each_entry(tmp, &ip_vs_pe, n_list) {
 		if (strcmp(tmp->name, pe->name) == 0) {
-			spin_unlock_bh(&ip_vs_pe_lock);
+			mutex_unlock(&ip_vs_pe_mutex);
 			ip_vs_use_count_dec();
 			pr_err("%s(): [%s] pe already existed "
 			       "in the system\n", __func__, pe->name);
@@ -106,8 +96,8 @@ int register_ip_vs_pe(struct ip_vs_pe *pe)
 		}
 	}
 	/* Add it into the d-linked pe list */
-	list_add(&pe->n_list, &ip_vs_pe);
-	spin_unlock_bh(&ip_vs_pe_lock);
+	list_add_rcu(&pe->n_list, &ip_vs_pe);
+	mutex_unlock(&ip_vs_pe_mutex);
 
 	pr_info("[%s] pe registered.\n", pe->name);
 
@@ -118,17 +108,10 @@ EXPORT_SYMBOL_GPL(register_ip_vs_pe);
 /* Unregister a pe from the pe list */
 int unregister_ip_vs_pe(struct ip_vs_pe *pe)
 {
-	spin_lock_bh(&ip_vs_pe_lock);
-	if (list_empty(&pe->n_list)) {
-		spin_unlock_bh(&ip_vs_pe_lock);
-		pr_err("%s(): [%s] pe is not in the list. failed\n",
-		       __func__, pe->name);
-		return -EINVAL;
-	}
-
+	mutex_lock(&ip_vs_pe_mutex);
 	/* Remove it from the d-linked pe list */
-	list_del(&pe->n_list);
-	spin_unlock_bh(&ip_vs_pe_lock);
+	list_del_rcu(&pe->n_list);
+	mutex_unlock(&ip_vs_pe_mutex);
 
 	/* decrease the module use count */
 	ip_vs_use_count_dec();
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
index 12475ef88daf..00cc0241ed87 100644
--- a/net/netfilter/ipvs/ip_vs_pe_sip.c
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -172,6 +172,7 @@ static int __init ip_vs_sip_init(void)
 static void __exit ip_vs_sip_cleanup(void)
 {
 	unregister_ip_vs_pe(&ip_vs_sip_pe);
+	synchronize_rcu();
 }
 
 module_init(ip_vs_sip_init);

From 088339a57d6042a8a19a3d5794594b558cd7b624 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Thu, 21 Mar 2013 11:58:10 +0200
Subject: [PATCH 13/49] ipvs: convert connection locking

Convert __ip_vs_conntbl_lock_array as follows:

- readers that do not modify conn lists will use RCU lock
- updaters that modify lists will use spinlock_t

Now for conn lookups we will use RCU read-side
critical section. Without using __ip_vs_conn_get such
places have access to connection fields and can
dereference some pointers like pe and pe_data plus
the ability to update timer expiration. If full access
is required we contend for reference.

We add barrier in __ip_vs_conn_put, so that
other CPUs see the refcnt operation after other writes.

With the introduction of ip_vs_conn_unlink()
we try to reorganize ip_vs_conn_expire(), so that
unhashing of connections that should stay more time is
avoided, even if it is for very short time.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off by: Hans Schillstrom <hans@schillstrom.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h             |  12 ++
 net/netfilter/ipvs/ip_vs_conn.c | 230 +++++++++++++++++---------------
 2 files changed, 134 insertions(+), 108 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index b06aa6c939fa..5700b07b5186 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -620,6 +620,8 @@ struct ip_vs_conn {
 	const struct ip_vs_pe	*pe;
 	char			*pe_data;
 	__u8			pe_data_len;
+
+	struct rcu_head		rcu_head;
 };
 
 /*
@@ -1185,9 +1187,19 @@ struct ip_vs_conn * ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
 					     const struct ip_vs_iphdr *iph,
 					     int inverse);
 
+/* Get reference to gain full access to conn.
+ * By default, RCU read-side critical sections have access only to
+ * conn fields and its PE data, see ip_vs_conn_rcu_free() for reference.
+ */
+static inline bool __ip_vs_conn_get(struct ip_vs_conn *cp)
+{
+	return atomic_inc_not_zero(&cp->refcnt);
+}
+
 /* put back the conn without restarting its timer */
 static inline void __ip_vs_conn_put(struct ip_vs_conn *cp)
 {
+	smp_mb__before_atomic_dec();
 	atomic_dec(&cp->refcnt);
 }
 extern void ip_vs_conn_put(struct ip_vs_conn *cp);
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 704e514e02ab..b0cd2be01d75 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -79,51 +79,21 @@ static unsigned int ip_vs_conn_rnd __read_mostly;
 
 struct ip_vs_aligned_lock
 {
-	rwlock_t	l;
+	spinlock_t	l;
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
 /* lock array for conn table */
 static struct ip_vs_aligned_lock
 __ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
 
-static inline void ct_read_lock(unsigned int key)
-{
-	read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_read_unlock(unsigned int key)
-{
-	read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
 static inline void ct_write_lock(unsigned int key)
 {
-	write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+	spin_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
 }
 
 static inline void ct_write_unlock(unsigned int key)
 {
-	write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_read_lock_bh(unsigned int key)
-{
-	read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_read_unlock_bh(unsigned int key)
-{
-	read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_write_lock_bh(unsigned int key)
-{
-	write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_write_unlock_bh(unsigned int key)
-{
-	write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+	spin_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
 }
 
 
@@ -201,9 +171,9 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
 	spin_lock(&cp->lock);
 
 	if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
-		hlist_add_head(&cp->c_list, &ip_vs_conn_tab[hash]);
 		cp->flags |= IP_VS_CONN_F_HASHED;
 		atomic_inc(&cp->refcnt);
+		hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]);
 		ret = 1;
 	} else {
 		pr_err("%s(): request for already hashed, called from %pF\n",
@@ -220,7 +190,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
 
 /*
  *	UNhashes ip_vs_conn from ip_vs_conn_tab.
- *	returns bool success.
+ *	returns bool success. Caller should hold conn reference.
  */
 static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
 {
@@ -234,7 +204,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
 	spin_lock(&cp->lock);
 
 	if (cp->flags & IP_VS_CONN_F_HASHED) {
-		hlist_del(&cp->c_list);
+		hlist_del_rcu(&cp->c_list);
 		cp->flags &= ~IP_VS_CONN_F_HASHED;
 		atomic_dec(&cp->refcnt);
 		ret = 1;
@@ -247,6 +217,36 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
 	return ret;
 }
 
+/* Try to unlink ip_vs_conn from ip_vs_conn_tab.
+ * returns bool success.
+ */
+static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
+{
+	unsigned int hash;
+	bool ret;
+
+	hash = ip_vs_conn_hashkey_conn(cp);
+
+	ct_write_lock(hash);
+	spin_lock(&cp->lock);
+
+	if (cp->flags & IP_VS_CONN_F_HASHED) {
+		ret = false;
+		/* Decrease refcnt and unlink conn only if we are last user */
+		if (atomic_cmpxchg(&cp->refcnt, 1, 0) == 1) {
+			hlist_del_rcu(&cp->c_list);
+			cp->flags &= ~IP_VS_CONN_F_HASHED;
+			ret = true;
+		}
+	} else
+		ret = atomic_read(&cp->refcnt) ? false : true;
+
+	spin_unlock(&cp->lock);
+	ct_write_unlock(hash);
+
+	return ret;
+}
+
 
 /*
  *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
@@ -262,9 +262,9 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
 
 	hash = ip_vs_conn_hashkey_param(p, false);
 
-	ct_read_lock(hash);
+	rcu_read_lock();
 
-	hlist_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+	hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
 		if (cp->af == p->af &&
 		    p->cport == cp->cport && p->vport == cp->vport &&
 		    ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
@@ -272,14 +272,15 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
 		    ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
 		    p->protocol == cp->protocol &&
 		    ip_vs_conn_net_eq(cp, p->net)) {
+			if (!__ip_vs_conn_get(cp))
+				continue;
 			/* HIT */
-			atomic_inc(&cp->refcnt);
-			ct_read_unlock(hash);
+			rcu_read_unlock();
 			return cp;
 		}
 	}
 
-	ct_read_unlock(hash);
+	rcu_read_unlock();
 
 	return NULL;
 }
@@ -346,14 +347,16 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
 
 	hash = ip_vs_conn_hashkey_param(p, false);
 
-	ct_read_lock(hash);
+	rcu_read_lock();
 
-	hlist_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+	hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
 		if (!ip_vs_conn_net_eq(cp, p->net))
 			continue;
 		if (p->pe_data && p->pe->ct_match) {
-			if (p->pe == cp->pe && p->pe->ct_match(p, cp))
-				goto out;
+			if (p->pe == cp->pe && p->pe->ct_match(p, cp)) {
+				if (__ip_vs_conn_get(cp))
+					goto out;
+			}
 			continue;
 		}
 
@@ -365,15 +368,15 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
 				     p->af, p->vaddr, &cp->vaddr) &&
 		    p->cport == cp->cport && p->vport == cp->vport &&
 		    cp->flags & IP_VS_CONN_F_TEMPLATE &&
-		    p->protocol == cp->protocol)
-			goto out;
+		    p->protocol == cp->protocol) {
+			if (__ip_vs_conn_get(cp))
+				goto out;
+		}
 	}
 	cp = NULL;
 
   out:
-	if (cp)
-		atomic_inc(&cp->refcnt);
-	ct_read_unlock(hash);
+	rcu_read_unlock();
 
 	IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
 		      ip_vs_proto_name(p->protocol),
@@ -398,23 +401,24 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
 	 */
 	hash = ip_vs_conn_hashkey_param(p, true);
 
-	ct_read_lock(hash);
+	rcu_read_lock();
 
-	hlist_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+	hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
 		if (cp->af == p->af &&
 		    p->vport == cp->cport && p->cport == cp->dport &&
 		    ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
 		    ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
 		    p->protocol == cp->protocol &&
 		    ip_vs_conn_net_eq(cp, p->net)) {
+			if (!__ip_vs_conn_get(cp))
+				continue;
 			/* HIT */
-			atomic_inc(&cp->refcnt);
 			ret = cp;
 			break;
 		}
 	}
 
-	ct_read_unlock(hash);
+	rcu_read_unlock();
 
 	IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
 		      ip_vs_proto_name(p->protocol),
@@ -757,41 +761,36 @@ int ip_vs_check_template(struct ip_vs_conn *ct)
 		 * Simply decrease the refcnt of the template,
 		 * don't restart its timer.
 		 */
-		atomic_dec(&ct->refcnt);
+		__ip_vs_conn_put(ct);
 		return 0;
 	}
 	return 1;
 }
 
+static void ip_vs_conn_rcu_free(struct rcu_head *head)
+{
+	struct ip_vs_conn *cp = container_of(head, struct ip_vs_conn,
+					     rcu_head);
+
+	ip_vs_pe_put(cp->pe);
+	kfree(cp->pe_data);
+	kmem_cache_free(ip_vs_conn_cachep, cp);
+}
+
 static void ip_vs_conn_expire(unsigned long data)
 {
 	struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
 	struct net *net = ip_vs_conn_net(cp);
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
-	cp->timeout = 60*HZ;
-
-	/*
-	 *	hey, I'm using it
-	 */
-	atomic_inc(&cp->refcnt);
-
 	/*
 	 *	do I control anybody?
 	 */
 	if (atomic_read(&cp->n_control))
 		goto expire_later;
 
-	/*
-	 *	unhash it if it is hashed in the conn table
-	 */
-	if (!ip_vs_conn_unhash(cp) && !(cp->flags & IP_VS_CONN_F_ONE_PACKET))
-		goto expire_later;
-
-	/*
-	 *	refcnt==1 implies I'm the only one referrer
-	 */
-	if (likely(atomic_read(&cp->refcnt) == 1)) {
+	/* Unlink conn if not referenced anymore */
+	if (likely(ip_vs_conn_unlink(cp))) {
 		/* delete the timer if it is activated by other users */
 		del_timer(&cp->timer);
 
@@ -810,38 +809,41 @@ static void ip_vs_conn_expire(unsigned long data)
 				ip_vs_conn_drop_conntrack(cp);
 		}
 
-		ip_vs_pe_put(cp->pe);
-		kfree(cp->pe_data);
 		if (unlikely(cp->app != NULL))
 			ip_vs_unbind_app(cp);
 		ip_vs_unbind_dest(cp);
 		if (cp->flags & IP_VS_CONN_F_NO_CPORT)
 			atomic_dec(&ip_vs_conn_no_cport_cnt);
+		call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free);
 		atomic_dec(&ipvs->conn_count);
-
-		kmem_cache_free(ip_vs_conn_cachep, cp);
 		return;
 	}
 
-	/* hash it back to the table */
-	ip_vs_conn_hash(cp);
-
   expire_later:
-	IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
-		  atomic_read(&cp->refcnt)-1,
+	IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n",
+		  atomic_read(&cp->refcnt),
 		  atomic_read(&cp->n_control));
 
+	atomic_inc(&cp->refcnt);
+	cp->timeout = 60*HZ;
+
 	if (ipvs->sync_state & IP_VS_STATE_MASTER)
 		ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs));
 
 	ip_vs_conn_put(cp);
 }
 
-
+/* Modify timer, so that it expires as soon as possible.
+ * Can be called without reference only if under RCU lock.
+ */
 void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
 {
-	if (del_timer(&cp->timer))
-		mod_timer(&cp->timer, jiffies);
+	/* Using mod_timer_pending will ensure the timer is not
+	 * modified after the final del_timer in ip_vs_conn_expire.
+	 */
+	if (timer_pending(&cp->timer) &&
+	    time_after(cp->timer.expires, jiffies))
+		mod_timer_pending(&cp->timer, jiffies);
 }
 
 
@@ -952,14 +954,17 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
 	struct ip_vs_iter_state *iter = seq->private;
 
 	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
-		ct_read_lock_bh(idx);
-		hlist_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
+			/* __ip_vs_conn_get() is not needed by
+			 * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show
+			 */
 			if (pos-- == 0) {
 				iter->l = &ip_vs_conn_tab[idx];
 				return cp;
 			}
 		}
-		ct_read_unlock_bh(idx);
+		rcu_read_unlock();
 	}
 
 	return NULL;
@@ -977,6 +982,7 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct ip_vs_conn *cp = v;
 	struct ip_vs_iter_state *iter = seq->private;
+	struct hlist_node *e;
 	struct hlist_head *l = iter->l;
 	int idx;
 
@@ -985,19 +991,19 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 		return ip_vs_conn_array(seq, 0);
 
 	/* more on same hash chain? */
-	if (cp->c_list.next)
-		return hlist_entry(cp->c_list.next, struct ip_vs_conn, c_list);
+	e = rcu_dereference(hlist_next_rcu(&cp->c_list));
+	if (e)
+		return hlist_entry(e, struct ip_vs_conn, c_list);
+	rcu_read_unlock();
 
 	idx = l - ip_vs_conn_tab;
-	ct_read_unlock_bh(idx);
-
 	while (++idx < ip_vs_conn_tab_size) {
-		ct_read_lock_bh(idx);
-		hlist_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
 			iter->l = &ip_vs_conn_tab[idx];
 			return cp;
 		}
-		ct_read_unlock_bh(idx);
+		rcu_read_unlock();
 	}
 	iter->l = NULL;
 	return NULL;
@@ -1009,7 +1015,7 @@ static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
 	struct hlist_head *l = iter->l;
 
 	if (l)
-		ct_read_unlock_bh(l - ip_vs_conn_tab);
+		rcu_read_unlock();
 }
 
 static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
@@ -1188,7 +1194,7 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
 void ip_vs_random_dropentry(struct net *net)
 {
 	int idx;
-	struct ip_vs_conn *cp;
+	struct ip_vs_conn *cp, *cp_c;
 
 	/*
 	 * Randomly scan 1/32 of the whole table every second
@@ -1199,9 +1205,9 @@ void ip_vs_random_dropentry(struct net *net)
 		/*
 		 *  Lock is actually needed in this loop.
 		 */
-		ct_write_lock_bh(hash);
+		rcu_read_lock();
 
-		hlist_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
 			if (cp->flags & IP_VS_CONN_F_TEMPLATE)
 				/* connection template */
 				continue;
@@ -1228,12 +1234,15 @@ void ip_vs_random_dropentry(struct net *net)
 
 			IP_VS_DBG(4, "del connection\n");
 			ip_vs_conn_expire_now(cp);
-			if (cp->control) {
+			cp_c = cp->control;
+			/* cp->control is valid only with reference to cp */
+			if (cp_c && __ip_vs_conn_get(cp)) {
 				IP_VS_DBG(4, "del conn template\n");
-				ip_vs_conn_expire_now(cp->control);
+				ip_vs_conn_expire_now(cp_c);
+				__ip_vs_conn_put(cp);
 			}
 		}
-		ct_write_unlock_bh(hash);
+		rcu_read_unlock();
 	}
 }
 
@@ -1244,7 +1253,7 @@ void ip_vs_random_dropentry(struct net *net)
 static void ip_vs_conn_flush(struct net *net)
 {
 	int idx;
-	struct ip_vs_conn *cp;
+	struct ip_vs_conn *cp, *cp_c;
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
 flush_again:
@@ -1252,19 +1261,22 @@ flush_again:
 		/*
 		 *  Lock is actually needed in this loop.
 		 */
-		ct_write_lock_bh(idx);
+		rcu_read_lock();
 
-		hlist_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
 			if (!ip_vs_conn_net_eq(cp, net))
 				continue;
 			IP_VS_DBG(4, "del connection\n");
 			ip_vs_conn_expire_now(cp);
-			if (cp->control) {
+			cp_c = cp->control;
+			/* cp->control is valid only with reference to cp */
+			if (cp_c && __ip_vs_conn_get(cp)) {
 				IP_VS_DBG(4, "del conn template\n");
-				ip_vs_conn_expire_now(cp->control);
+				ip_vs_conn_expire_now(cp_c);
+				__ip_vs_conn_put(cp);
 			}
 		}
-		ct_write_unlock_bh(idx);
+		rcu_read_unlock();
 	}
 
 	/* the counter may be not NULL, because maybe some conn entries
@@ -1331,7 +1343,7 @@ int __init ip_vs_conn_init(void)
 		INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]);
 
 	for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++)  {
-		rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
+		spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l);
 	}
 
 	/* calculate the random value for connection hash */
@@ -1342,6 +1354,8 @@ int __init ip_vs_conn_init(void)
 
 void ip_vs_conn_cleanup(void)
 {
+	/* Wait all ip_vs_conn_rcu_free() callbacks to complete */
+	rcu_barrier();
 	/* Release the empty cache */
 	kmem_cache_destroy(ip_vs_conn_cachep);
 	vfree(ip_vs_conn_tab);

From 1845ed0bb29fa7864781021e0c8d06af318f358a Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Thu, 21 Mar 2013 11:58:11 +0200
Subject: [PATCH 14/49] ipvs: reorder keys in connection structure

__ip_vs_conn_in_get and ip_vs_conn_out_get are
hot places. Optimize them, so that ports are matched first.
By moving net and fwmark below, on 32-bit arch we can fit
caddr in 32-byte cache line and all addresses in 64-byte
cache line.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off by: Hans Schillstrom <hans@schillstrom.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h             | 12 ++++++------
 net/netfilter/ipvs/ip_vs_conn.c | 19 ++++++++++---------
 2 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 5700b07b5186..929e04c6b82e 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -566,20 +566,19 @@ struct ip_vs_conn_param {
  */
 struct ip_vs_conn {
 	struct hlist_node	c_list;         /* hashed list heads */
-#ifdef CONFIG_NET_NS
-	struct net              *net;           /* Name space */
-#endif
 	/* Protocol, addresses and port numbers */
-	u16                     af;             /* address family */
 	__be16                  cport;
-	__be16                  vport;
 	__be16                  dport;
-	__u32                   fwmark;         /* Fire wall mark from skb */
+	__be16                  vport;
+	u16			af;		/* address family */
 	union nf_inet_addr      caddr;          /* client address */
 	union nf_inet_addr      vaddr;          /* virtual address */
 	union nf_inet_addr      daddr;          /* destination address */
 	volatile __u32          flags;          /* status flags */
 	__u16                   protocol;       /* Which protocol (TCP/UDP) */
+#ifdef CONFIG_NET_NS
+	struct net              *net;           /* Name space */
+#endif
 
 	/* counter and timer */
 	atomic_t		refcnt;		/* reference count */
@@ -593,6 +592,7 @@ struct ip_vs_conn {
 						 * state transition triggerd
 						 * synchronization
 						 */
+	__u32			fwmark;		/* Fire wall mark from skb */
 	unsigned long		sync_endtime;	/* jiffies + sent_retries */
 
 	/* Control members */
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index b0cd2be01d75..416015b34c23 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -265,8 +265,8 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
 	rcu_read_lock();
 
 	hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
-		if (cp->af == p->af &&
-		    p->cport == cp->cport && p->vport == cp->vport &&
+		if (p->cport == cp->cport && p->vport == cp->vport &&
+		    cp->af == p->af &&
 		    ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
 		    ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
 		    ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
@@ -350,9 +350,9 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
 	rcu_read_lock();
 
 	hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
-		if (!ip_vs_conn_net_eq(cp, p->net))
-			continue;
-		if (p->pe_data && p->pe->ct_match) {
+		if (unlikely(p->pe_data && p->pe->ct_match)) {
+			if (!ip_vs_conn_net_eq(cp, p->net))
+				continue;
 			if (p->pe == cp->pe && p->pe->ct_match(p, cp)) {
 				if (__ip_vs_conn_get(cp))
 					goto out;
@@ -366,9 +366,10 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
 		     * p->vaddr is a fwmark */
 		    ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC :
 				     p->af, p->vaddr, &cp->vaddr) &&
-		    p->cport == cp->cport && p->vport == cp->vport &&
+		    p->vport == cp->vport && p->cport == cp->cport &&
 		    cp->flags & IP_VS_CONN_F_TEMPLATE &&
-		    p->protocol == cp->protocol) {
+		    p->protocol == cp->protocol &&
+		    ip_vs_conn_net_eq(cp, p->net)) {
 			if (__ip_vs_conn_get(cp))
 				goto out;
 		}
@@ -404,8 +405,8 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
 	rcu_read_lock();
 
 	hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
-		if (cp->af == p->af &&
-		    p->vport == cp->cport && p->cport == cp->dport &&
+		if (p->vport == cp->cport && p->cport == cp->dport &&
+		    cp->af == p->af &&
 		    ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
 		    ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
 		    p->protocol == cp->protocol &&

From 9a05475cebdd6341884b5901e53870be26e65158 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Thu, 21 Mar 2013 11:58:12 +0200
Subject: [PATCH 15/49] ipvs: avoid kmem_cache_zalloc in ip_vs_conn_new

We have many fields to set and few to reset,
use kmem_cache_alloc instead to save some cycles.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off by: Hans Schillstrom <hans@schillstrom.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h             | 15 +++++++++++++++
 net/netfilter/ipvs/ip_vs_conn.c | 24 +++++++++++++++++++-----
 2 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 929e04c6b82e..43886bb282fb 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -233,6 +233,21 @@ static inline void ip_vs_addr_copy(int af, union nf_inet_addr *dst,
 	dst->ip = src->ip;
 }
 
+static inline void ip_vs_addr_set(int af, union nf_inet_addr *dst,
+				  const union nf_inet_addr *src)
+{
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6) {
+		dst->in6 = src->in6;
+		return;
+	}
+#endif
+	dst->ip = src->ip;
+	dst->all[1] = 0;
+	dst->all[2] = 0;
+	dst->all[3] = 0;
+}
+
 static inline int ip_vs_addr_equal(int af, const union nf_inet_addr *a,
 				   const union nf_inet_addr *b)
 {
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 416015b34c23..e3e2b4d3b6d8 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -861,7 +861,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
 	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net,
 							   p->protocol);
 
-	cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
+	cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
 	if (cp == NULL) {
 		IP_VS_ERR_RL("%s(): no memory\n", __func__);
 		return NULL;
@@ -872,13 +872,13 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
 	ip_vs_conn_net_set(cp, p->net);
 	cp->af		   = p->af;
 	cp->protocol	   = p->protocol;
-	ip_vs_addr_copy(p->af, &cp->caddr, p->caddr);
+	ip_vs_addr_set(p->af, &cp->caddr, p->caddr);
 	cp->cport	   = p->cport;
-	ip_vs_addr_copy(p->af, &cp->vaddr, p->vaddr);
+	ip_vs_addr_set(p->af, &cp->vaddr, p->vaddr);
 	cp->vport	   = p->vport;
 	/* proto should only be IPPROTO_IP if d_addr is a fwmark */
-	ip_vs_addr_copy(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
-			&cp->daddr, daddr);
+	ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
+		       &cp->daddr, daddr);
 	cp->dport          = dport;
 	cp->flags	   = flags;
 	cp->fwmark         = fwmark;
@@ -887,6 +887,10 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
 		cp->pe = p->pe;
 		cp->pe_data = p->pe_data;
 		cp->pe_data_len = p->pe_data_len;
+	} else {
+		cp->pe = NULL;
+		cp->pe_data = NULL;
+		cp->pe_data_len = 0;
 	}
 	spin_lock_init(&cp->lock);
 
@@ -897,18 +901,28 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
 	 */
 	atomic_set(&cp->refcnt, 1);
 
+	cp->control = NULL;
 	atomic_set(&cp->n_control, 0);
 	atomic_set(&cp->in_pkts, 0);
 
+	cp->packet_xmit = NULL;
+	cp->app = NULL;
+	cp->app_data = NULL;
+	/* reset struct ip_vs_seq */
+	cp->in_seq.delta = 0;
+	cp->out_seq.delta = 0;
+
 	atomic_inc(&ipvs->conn_count);
 	if (flags & IP_VS_CONN_F_NO_CPORT)
 		atomic_inc(&ip_vs_conn_no_cport_cnt);
 
 	/* Bind the connection with a destination server */
+	cp->dest = NULL;
 	ip_vs_bind_dest(cp, dest);
 
 	/* Set its state and timeout */
 	cp->state = 0;
+	cp->old_state = 0;
 	cp->timeout = 3*HZ;
 	cp->sync_endtime = jiffies & ~3UL;
 

From 71dfa982f177d7b1e51e8d48b4c69da0b0e17e3c Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 22 Mar 2013 11:46:36 +0200
Subject: [PATCH 16/49] ipvs: change ip_vs_sched_lock to mutex

The global list with schedulers ip_vs_schedulers
is accessed only from user context - configuration and
scheduler module [un]registration. Use ip_vs_sched_mutex
instead.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_sched.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
index d6bf20d6cdbe..7f11d3d37a42 100644
--- a/net/netfilter/ipvs/ip_vs_sched.c
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -35,8 +35,8 @@ EXPORT_SYMBOL(ip_vs_scheduler_err);
  */
 static LIST_HEAD(ip_vs_schedulers);
 
-/* lock for service table */
-static DEFINE_SPINLOCK(ip_vs_sched_lock);
+/* semaphore for schedulers */
+static DEFINE_MUTEX(ip_vs_sched_mutex);
 
 
 /*
@@ -92,7 +92,7 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
 
 	IP_VS_DBG(2, "%s(): sched_name \"%s\"\n", __func__, sched_name);
 
-	spin_lock_bh(&ip_vs_sched_lock);
+	mutex_lock(&ip_vs_sched_mutex);
 
 	list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
 		/*
@@ -106,14 +106,14 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
 		}
 		if (strcmp(sched_name, sched->name)==0) {
 			/* HIT */
-			spin_unlock_bh(&ip_vs_sched_lock);
+			mutex_unlock(&ip_vs_sched_mutex);
 			return sched;
 		}
 		if (sched->module)
 			module_put(sched->module);
 	}
 
-	spin_unlock_bh(&ip_vs_sched_lock);
+	mutex_unlock(&ip_vs_sched_mutex);
 	return NULL;
 }
 
@@ -192,10 +192,10 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
 	/* increase the module use count */
 	ip_vs_use_count_inc();
 
-	spin_lock_bh(&ip_vs_sched_lock);
+	mutex_lock(&ip_vs_sched_mutex);
 
 	if (!list_empty(&scheduler->n_list)) {
-		spin_unlock_bh(&ip_vs_sched_lock);
+		mutex_unlock(&ip_vs_sched_mutex);
 		ip_vs_use_count_dec();
 		pr_err("%s(): [%s] scheduler already linked\n",
 		       __func__, scheduler->name);
@@ -208,7 +208,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
 	 */
 	list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
 		if (strcmp(scheduler->name, sched->name) == 0) {
-			spin_unlock_bh(&ip_vs_sched_lock);
+			mutex_unlock(&ip_vs_sched_mutex);
 			ip_vs_use_count_dec();
 			pr_err("%s(): [%s] scheduler already existed "
 			       "in the system\n", __func__, scheduler->name);
@@ -219,7 +219,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
 	 *	Add it into the d-linked scheduler list
 	 */
 	list_add(&scheduler->n_list, &ip_vs_schedulers);
-	spin_unlock_bh(&ip_vs_sched_lock);
+	mutex_unlock(&ip_vs_sched_mutex);
 
 	pr_info("[%s] scheduler registered.\n", scheduler->name);
 
@@ -237,9 +237,9 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
 		return -EINVAL;
 	}
 
-	spin_lock_bh(&ip_vs_sched_lock);
+	mutex_lock(&ip_vs_sched_mutex);
 	if (list_empty(&scheduler->n_list)) {
-		spin_unlock_bh(&ip_vs_sched_lock);
+		mutex_unlock(&ip_vs_sched_mutex);
 		pr_err("%s(): [%s] scheduler is not in the list. failed\n",
 		       __func__, scheduler->name);
 		return -EINVAL;
@@ -249,7 +249,7 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
 	 *	Remove it from the d-linked scheduler list
 	 */
 	list_del(&scheduler->n_list);
-	spin_unlock_bh(&ip_vs_sched_lock);
+	mutex_unlock(&ip_vs_sched_mutex);
 
 	/* decrease the module use count */
 	ip_vs_use_count_dec();

From 6b6df46663e7aa6f7b1d82435a3488f9b81316b3 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 22 Mar 2013 11:46:37 +0200
Subject: [PATCH 17/49] ipvs: preparations for using rcu in schedulers

Allow schedulers to use rcu_dereference when
returning destination on lookup. The RCU read-side critical
section will allow ip_vs_bind_dest to get dest refcnt as
preparation for the step where destinations will be
deleted without an IP_VS_WAIT_WHILE guard that holds the
packet processing during update.

	Add new optional scheduler methods add_dest,
del_dest and upd_dest. For now the methods are called
together with update_service but update_service will be
removed in a following change.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h             | 6 ++++++
 net/netfilter/ipvs/ip_vs_core.c | 6 ++++++
 net/netfilter/ipvs/ip_vs_ctl.c  | 8 ++++++++
 3 files changed, 20 insertions(+)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 43886bb282fb..d91385c1b3a2 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -805,6 +805,12 @@ struct ip_vs_scheduler {
 	int (*done_service)(struct ip_vs_service *svc);
 	/* scheduler updating service */
 	int (*update_service)(struct ip_vs_service *svc);
+	/* dest is linked */
+	int (*add_dest)(struct ip_vs_service *svc, struct ip_vs_dest *dest);
+	/* dest is unlinked */
+	int (*del_dest)(struct ip_vs_service *svc, struct ip_vs_dest *dest);
+	/* dest is updated */
+	int (*upd_dest)(struct ip_vs_service *svc, struct ip_vs_dest *dest);
 
 	/* selecting a server from the given service */
 	struct ip_vs_dest* (*schedule)(struct ip_vs_service *svc,
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 4fc749c6ec27..939ad11ed534 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -301,8 +301,10 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 		 * template is not available.
 		 * return *ignored=0 i.e. ICMP and NF_DROP
 		 */
+		rcu_read_lock();
 		dest = svc->scheduler->schedule(svc, skb);
 		if (!dest) {
+			rcu_read_unlock();
 			IP_VS_DBG(1, "p-schedule: no dest found.\n");
 			kfree(param.pe_data);
 			*ignored = 0;
@@ -318,6 +320,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 		 * when the template expires */
 		ct = ip_vs_conn_new(&param, &dest->addr, dport,
 				    IP_VS_CONN_F_TEMPLATE, dest, skb->mark);
+		rcu_read_unlock();
 		if (ct == NULL) {
 			kfree(param.pe_data);
 			*ignored = -1;
@@ -446,8 +449,10 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 		return NULL;
 	}
 
+	rcu_read_lock();
 	dest = svc->scheduler->schedule(svc, skb);
 	if (dest == NULL) {
+		rcu_read_unlock();
 		IP_VS_DBG(1, "Schedule: no dest found.\n");
 		return NULL;
 	}
@@ -468,6 +473,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 		cp = ip_vs_conn_new(&p, &dest->addr,
 				    dest->port ? dest->port : pptr[1],
 				    flags, dest, skb->mark);
+		rcu_read_unlock();
 		if (!cp) {
 			*ignored = -1;
 			return NULL;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 182d95807bd5..d64f800a3426 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -825,6 +825,11 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
 	if (add) {
 		list_add(&dest->n_list, &svc->destinations);
 		svc->num_dests++;
+		if (svc->scheduler->add_dest)
+			svc->scheduler->add_dest(svc, dest);
+	} else {
+		if (svc->scheduler->upd_dest)
+			svc->scheduler->upd_dest(svc, dest);
 	}
 
 	/* call the update_service, because server weight may be changed */
@@ -1071,6 +1076,9 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
 	list_del(&dest->n_list);
 	svc->num_dests--;
 
+	if (svcupd && svc->scheduler->del_dest)
+		svc->scheduler->del_dest(svc, dest);
+
 	/*
 	 *  Call the update_service function of its scheduler
 	 */

From fca9c20ae1e510525f8a2aaa25861789fd721193 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 22 Mar 2013 11:46:38 +0200
Subject: [PATCH 18/49] ipvs: add ip_vs_dest_hold and ip_vs_dest_put

ip_vs_dest_hold will be used under RCU lock
while ip_vs_dest_put can be called even after dest
is removed from service, as it happens for conns and
some schedulers.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h             | 10 ++++++++++
 net/netfilter/ipvs/ip_vs_conn.c |  9 ++-------
 net/netfilter/ipvs/ip_vs_ctl.c  |  4 ++--
 net/netfilter/ipvs/ip_vs_sync.c |  4 ++--
 4 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index d91385c1b3a2..7d3027f2c06e 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1427,6 +1427,16 @@ ip_vs_find_dest(struct net *net, int af, const union nf_inet_addr *daddr,
 		__u16 protocol, __u32 fwmark, __u32 flags);
 extern struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp);
 
+static inline void ip_vs_dest_hold(struct ip_vs_dest *dest)
+{
+	atomic_inc(&dest->refcnt);
+}
+
+static inline void ip_vs_dest_put(struct ip_vs_dest *dest)
+{
+	smp_mb__before_atomic_dec();
+	atomic_dec(&dest->refcnt);
+}
 
 /*
  *      IPVS sync daemon data and function prototypes
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index e3e2b4d3b6d8..1b29e4a2b26c 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -554,7 +554,7 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
 		return;
 
 	/* Increase the refcnt counter of the dest */
-	atomic_inc(&dest->refcnt);
+	ip_vs_dest_hold(dest);
 
 	conn_flags = atomic_read(&dest->conn_flags);
 	if (cp->protocol != IPPROTO_UDP)
@@ -700,12 +700,7 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
 			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
 	}
 
-	/*
-	 * Simply decrease the refcnt of the dest, because the
-	 * dest will be either in service's destination list
-	 * or in the trash.
-	 */
-	atomic_dec(&dest->refcnt);
+	ip_vs_dest_put(dest);
 }
 
 static int expire_quiescent_template(struct netns_ipvs *ipvs,
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index d64f800a3426..a4f638880470 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -616,7 +616,7 @@ struct ip_vs_dest *ip_vs_find_dest(struct net  *net, int af,
 	if (!dest)
 		dest = ip_vs_lookup_dest(svc, daddr, port ^ dport);
 	if (dest)
-		atomic_inc(&dest->refcnt);
+		ip_vs_dest_hold(dest);
 	ip_vs_service_put(svc);
 	return dest;
 }
@@ -1056,7 +1056,7 @@ static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
 			      ntohs(dest->port),
 			      atomic_read(&dest->refcnt));
 		list_add(&dest->n_list, &ipvs->dest_trash);
-		atomic_inc(&dest->refcnt);
+		ip_vs_dest_hold(dest);
 	}
 }
 
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 44fd10c539ac..6cc3e52f1f35 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -861,7 +861,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
 		if (!dest) {
 			dest = ip_vs_try_bind_dest(cp);
 			if (dest)
-				atomic_dec(&dest->refcnt);
+				ip_vs_dest_put(dest);
 		}
 	} else {
 		/*
@@ -874,7 +874,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
 
 		cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark);
 		if (dest)
-			atomic_dec(&dest->refcnt);
+			ip_vs_dest_put(dest);
 		if (!cp) {
 			if (param->pe_data)
 				kfree(param->pe_data);

From 8f3d0023b9bde2e9806455287ca2af3837e1836c Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 22 Mar 2013 11:46:39 +0200
Subject: [PATCH 19/49] ipvs: convert dh scheduler to rcu

Use the new add_dest and del_dest methods
to reassign dests.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_dh.c | 81 +++++++++++++++++++----------------
 1 file changed, 45 insertions(+), 36 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c
index 7f3b0cc00b7a..ebe80f44e712 100644
--- a/net/netfilter/ipvs/ip_vs_dh.c
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -51,7 +51,7 @@
  *      IPVS DH bucket
  */
 struct ip_vs_dh_bucket {
-	struct ip_vs_dest       *dest;          /* real server (cache) */
+	struct ip_vs_dest __rcu	*dest;	/* real server (cache) */
 };
 
 /*
@@ -64,6 +64,10 @@ struct ip_vs_dh_bucket {
 #define IP_VS_DH_TAB_SIZE               (1 << IP_VS_DH_TAB_BITS)
 #define IP_VS_DH_TAB_MASK               (IP_VS_DH_TAB_SIZE - 1)
 
+struct ip_vs_dh_state {
+	struct ip_vs_dh_bucket		buckets[IP_VS_DH_TAB_SIZE];
+	struct rcu_head			rcu_head;
+};
 
 /*
  *	Returns hash value for IPVS DH entry
@@ -85,10 +89,9 @@ static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *ad
  *      Get ip_vs_dest associated with supplied parameters.
  */
 static inline struct ip_vs_dest *
-ip_vs_dh_get(int af, struct ip_vs_dh_bucket *tbl,
-	     const union nf_inet_addr *addr)
+ip_vs_dh_get(int af, struct ip_vs_dh_state *s, const union nf_inet_addr *addr)
 {
-	return (tbl[ip_vs_dh_hashkey(af, addr)]).dest;
+	return rcu_dereference(s->buckets[ip_vs_dh_hashkey(af, addr)].dest);
 }
 
 
@@ -96,25 +99,30 @@ ip_vs_dh_get(int af, struct ip_vs_dh_bucket *tbl,
  *      Assign all the hash buckets of the specified table with the service.
  */
 static int
-ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc)
+ip_vs_dh_reassign(struct ip_vs_dh_state *s, struct ip_vs_service *svc)
 {
 	int i;
 	struct ip_vs_dh_bucket *b;
 	struct list_head *p;
 	struct ip_vs_dest *dest;
+	bool empty;
 
-	b = tbl;
+	b = &s->buckets[0];
 	p = &svc->destinations;
+	empty = list_empty(p);
 	for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
-		if (list_empty(p)) {
-			b->dest = NULL;
-		} else {
+		dest = rcu_dereference_protected(b->dest, 1);
+		if (dest)
+			ip_vs_dest_put(dest);
+		if (empty)
+			RCU_INIT_POINTER(b->dest, NULL);
+		else {
 			if (p == &svc->destinations)
 				p = p->next;
 
 			dest = list_entry(p, struct ip_vs_dest, n_list);
-			atomic_inc(&dest->refcnt);
-			b->dest = dest;
+			ip_vs_dest_hold(dest);
+			RCU_INIT_POINTER(b->dest, dest);
 
 			p = p->next;
 		}
@@ -127,16 +135,18 @@ ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc)
 /*
  *      Flush all the hash buckets of the specified table.
  */
-static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl)
+static void ip_vs_dh_flush(struct ip_vs_dh_state *s)
 {
 	int i;
 	struct ip_vs_dh_bucket *b;
+	struct ip_vs_dest *dest;
 
-	b = tbl;
+	b = &s->buckets[0];
 	for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
-		if (b->dest) {
-			atomic_dec(&b->dest->refcnt);
-			b->dest = NULL;
+		dest = rcu_dereference_protected(b->dest, 1);
+		if (dest) {
+			ip_vs_dest_put(dest);
+			RCU_INIT_POINTER(b->dest, NULL);
 		}
 		b++;
 	}
@@ -145,21 +155,20 @@ static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl)
 
 static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
 {
-	struct ip_vs_dh_bucket *tbl;
+	struct ip_vs_dh_state *s;
 
 	/* allocate the DH table for this service */
-	tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE,
-		      GFP_KERNEL);
-	if (tbl == NULL)
+	s = kzalloc(sizeof(struct ip_vs_dh_state), GFP_KERNEL);
+	if (s == NULL)
 		return -ENOMEM;
 
-	svc->sched_data = tbl;
+	svc->sched_data = s;
 	IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for "
 		  "current service\n",
 		  sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
 
-	/* assign the hash buckets with the updated service */
-	ip_vs_dh_assign(tbl, svc);
+	/* assign the hash buckets with current dests */
+	ip_vs_dh_reassign(s, svc);
 
 	return 0;
 }
@@ -167,13 +176,13 @@ static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
 
 static int ip_vs_dh_done_svc(struct ip_vs_service *svc)
 {
-	struct ip_vs_dh_bucket *tbl = svc->sched_data;
+	struct ip_vs_dh_state *s = svc->sched_data;
 
 	/* got to clean up hash buckets here */
-	ip_vs_dh_flush(tbl);
+	ip_vs_dh_flush(s);
 
 	/* release the table itself */
-	kfree(svc->sched_data);
+	kfree_rcu(s, rcu_head);
 	IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n",
 		  sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
 
@@ -181,15 +190,13 @@ static int ip_vs_dh_done_svc(struct ip_vs_service *svc)
 }
 
 
-static int ip_vs_dh_update_svc(struct ip_vs_service *svc)
+static int ip_vs_dh_dest_changed(struct ip_vs_service *svc,
+				 struct ip_vs_dest *dest)
 {
-	struct ip_vs_dh_bucket *tbl = svc->sched_data;
-
-	/* got to clean up hash buckets here */
-	ip_vs_dh_flush(tbl);
+	struct ip_vs_dh_state *s = svc->sched_data;
 
 	/* assign the hash buckets with the updated service */
-	ip_vs_dh_assign(tbl, svc);
+	ip_vs_dh_reassign(s, svc);
 
 	return 0;
 }
@@ -212,19 +219,20 @@ static struct ip_vs_dest *
 ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 {
 	struct ip_vs_dest *dest;
-	struct ip_vs_dh_bucket *tbl;
+	struct ip_vs_dh_state *s;
 	struct ip_vs_iphdr iph;
 
 	ip_vs_fill_iph_addr_only(svc->af, skb, &iph);
 
 	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
 
-	tbl = (struct ip_vs_dh_bucket *)svc->sched_data;
-	dest = ip_vs_dh_get(svc->af, tbl, &iph.daddr);
+	s = (struct ip_vs_dh_state *) svc->sched_data;
+	dest = ip_vs_dh_get(svc->af, s, &iph.daddr);
 	if (!dest
 	    || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
 	    || atomic_read(&dest->weight) <= 0
 	    || is_overloaded(dest)) {
+		ip_vs_scheduler_err(svc, "no destination available");
 		return NULL;
 	}
 
@@ -248,7 +256,8 @@ static struct ip_vs_scheduler ip_vs_dh_scheduler =
 	.n_list =		LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list),
 	.init_service =		ip_vs_dh_init_svc,
 	.done_service =		ip_vs_dh_done_svc,
-	.update_service =	ip_vs_dh_update_svc,
+	.add_dest =		ip_vs_dh_dest_changed,
+	.del_dest =		ip_vs_dh_dest_changed,
 	.schedule =		ip_vs_dh_schedule,
 };
 

From c2a4ffb70eef396148234c1228824008501ab8d2 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 22 Mar 2013 11:46:40 +0200
Subject: [PATCH 20/49] ipvs: convert lblc scheduler to rcu

The schedule method now needs _rcu list-traversal
primitive for svc->destinations. The read_lock for sched_lock is
removed. Use a dead flag to prevent new entries to be created
while scheduler is reclaimed. Use hlist for the hash table.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_lblc.c | 96 +++++++++++++++++++--------------
 1 file changed, 55 insertions(+), 41 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index fdd89b9564ea..b873e177435a 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -90,11 +90,12 @@
  *      IP address and its destination server
  */
 struct ip_vs_lblc_entry {
-	struct list_head        list;
+	struct hlist_node	list;
 	int			af;		/* address family */
 	union nf_inet_addr      addr;           /* destination IP address */
-	struct ip_vs_dest       *dest;          /* real server (cache) */
+	struct ip_vs_dest __rcu	*dest;          /* real server (cache) */
 	unsigned long           lastuse;        /* last used time */
+	struct rcu_head		rcu_head;
 };
 
 
@@ -102,12 +103,14 @@ struct ip_vs_lblc_entry {
  *      IPVS lblc hash table
  */
 struct ip_vs_lblc_table {
-	struct list_head        bucket[IP_VS_LBLC_TAB_SIZE];  /* hash bucket */
+	struct rcu_head		rcu_head;
+	struct hlist_head __rcu bucket[IP_VS_LBLC_TAB_SIZE];  /* hash bucket */
+	struct timer_list       periodic_timer; /* collect stale entries */
 	atomic_t                entries;        /* number of entries */
 	int                     max_size;       /* maximum size of entries */
-	struct timer_list       periodic_timer; /* collect stale entries */
 	int                     rover;          /* rover for expire check */
 	int                     counter;        /* counter for no expire */
+	bool			dead;
 };
 
 
@@ -129,13 +132,16 @@ static ctl_table vs_vars_table[] = {
 
 static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
 {
-	list_del(&en->list);
+	struct ip_vs_dest *dest;
+
+	hlist_del_rcu(&en->list);
 	/*
 	 * We don't kfree dest because it is referred either by its service
 	 * or the trash dest list.
 	 */
-	atomic_dec(&en->dest->refcnt);
-	kfree(en);
+	dest = rcu_dereference_protected(en->dest, 1);
+	ip_vs_dest_put(dest);
+	kfree_rcu(en, rcu_head);
 }
 
 
@@ -165,15 +171,12 @@ ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
 {
 	unsigned int hash = ip_vs_lblc_hashkey(en->af, &en->addr);
 
-	list_add(&en->list, &tbl->bucket[hash]);
+	hlist_add_head_rcu(&en->list, &tbl->bucket[hash]);
 	atomic_inc(&tbl->entries);
 }
 
 
-/*
- *  Get ip_vs_lblc_entry associated with supplied parameters. Called under read
- *  lock
- */
+/* Get ip_vs_lblc_entry associated with supplied parameters. */
 static inline struct ip_vs_lblc_entry *
 ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl,
 	       const union nf_inet_addr *addr)
@@ -181,7 +184,7 @@ ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl,
 	unsigned int hash = ip_vs_lblc_hashkey(af, addr);
 	struct ip_vs_lblc_entry *en;
 
-	list_for_each_entry(en, &tbl->bucket[hash], list)
+	hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list)
 		if (ip_vs_addr_equal(af, &en->addr, addr))
 			return en;
 
@@ -209,14 +212,20 @@ ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr,
 		ip_vs_addr_copy(dest->af, &en->addr, daddr);
 		en->lastuse = jiffies;
 
-		atomic_inc(&dest->refcnt);
-		en->dest = dest;
+		ip_vs_dest_hold(dest);
+		RCU_INIT_POINTER(en->dest, dest);
 
 		ip_vs_lblc_hash(tbl, en);
-	} else if (en->dest != dest) {
-		atomic_dec(&en->dest->refcnt);
-		atomic_inc(&dest->refcnt);
-		en->dest = dest;
+	} else {
+		struct ip_vs_dest *old_dest;
+
+		old_dest = rcu_dereference_protected(en->dest, 1);
+		if (old_dest != dest) {
+			ip_vs_dest_put(old_dest);
+			ip_vs_dest_hold(dest);
+			/* No ordering constraints for refcnt */
+			RCU_INIT_POINTER(en->dest, dest);
+		}
 	}
 
 	return en;
@@ -226,17 +235,22 @@ ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr,
 /*
  *      Flush all the entries of the specified table.
  */
-static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
+static void ip_vs_lblc_flush(struct ip_vs_service *svc)
 {
-	struct ip_vs_lblc_entry *en, *nxt;
+	struct ip_vs_lblc_table *tbl = svc->sched_data;
+	struct ip_vs_lblc_entry *en;
+	struct hlist_node *next;
 	int i;
 
+	write_lock_bh(&svc->sched_lock);
+	tbl->dead = 1;
 	for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
-		list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
+		hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
 			ip_vs_lblc_free(en);
 			atomic_dec(&tbl->entries);
 		}
 	}
+	write_unlock_bh(&svc->sched_lock);
 }
 
 static int sysctl_lblc_expiration(struct ip_vs_service *svc)
@@ -252,7 +266,8 @@ static int sysctl_lblc_expiration(struct ip_vs_service *svc)
 static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
 {
 	struct ip_vs_lblc_table *tbl = svc->sched_data;
-	struct ip_vs_lblc_entry *en, *nxt;
+	struct ip_vs_lblc_entry *en;
+	struct hlist_node *next;
 	unsigned long now = jiffies;
 	int i, j;
 
@@ -260,7 +275,7 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
 		j = (j + 1) & IP_VS_LBLC_TAB_MASK;
 
 		write_lock(&svc->sched_lock);
-		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+		hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
 			if (time_before(now,
 					en->lastuse +
 					sysctl_lblc_expiration(svc)))
@@ -293,7 +308,8 @@ static void ip_vs_lblc_check_expire(unsigned long data)
 	unsigned long now = jiffies;
 	int goal;
 	int i, j;
-	struct ip_vs_lblc_entry *en, *nxt;
+	struct ip_vs_lblc_entry *en;
+	struct hlist_node *next;
 
 	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
 		/* do full expiration check */
@@ -315,7 +331,7 @@ static void ip_vs_lblc_check_expire(unsigned long data)
 		j = (j + 1) & IP_VS_LBLC_TAB_MASK;
 
 		write_lock(&svc->sched_lock);
-		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+		hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
 			if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
 				continue;
 
@@ -354,11 +370,12 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
 	 *    Initialize the hash buckets
 	 */
 	for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
-		INIT_LIST_HEAD(&tbl->bucket[i]);
+		INIT_HLIST_HEAD(&tbl->bucket[i]);
 	}
 	tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
 	tbl->rover = 0;
 	tbl->counter = 1;
+	tbl->dead = 0;
 
 	/*
 	 *    Hook periodic timer for garbage collection
@@ -379,10 +396,10 @@ static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
 	del_timer_sync(&tbl->periodic_timer);
 
 	/* got to clean up table entries here */
-	ip_vs_lblc_flush(tbl);
+	ip_vs_lblc_flush(svc);
 
 	/* release the table itself */
-	kfree(tbl);
+	kfree_rcu(tbl, rcu_head);
 	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
 		  sizeof(*tbl));
 
@@ -408,7 +425,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc)
 	 * The server with weight=0 is quiesced and will not receive any
 	 * new connection.
 	 */
-	list_for_each_entry(dest, &svc->destinations, n_list) {
+	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
 		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
 			continue;
 		if (atomic_read(&dest->weight) > 0) {
@@ -423,7 +440,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc)
 	 *    Find the destination with the least load.
 	 */
   nextstage:
-	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+	list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
 		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
 			continue;
 
@@ -457,7 +474,7 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
 	if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
 		struct ip_vs_dest *d;
 
-		list_for_each_entry(d, &svc->destinations, n_list) {
+		list_for_each_entry_rcu(d, &svc->destinations, n_list) {
 			if (atomic_read(&d->activeconns)*2
 			    < atomic_read(&d->weight)) {
 				return 1;
@@ -484,7 +501,6 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
 
 	/* First look in our cache */
-	read_lock(&svc->sched_lock);
 	en = ip_vs_lblc_get(svc->af, tbl, &iph.daddr);
 	if (en) {
 		/* We only hold a read lock, but this is atomic */
@@ -499,14 +515,11 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 		 * free up entries from the trash at any time.
 		 */
 
-		if (en->dest->flags & IP_VS_DEST_F_AVAILABLE)
-			dest = en->dest;
+		dest = rcu_dereference(en->dest);
+		if ((dest->flags & IP_VS_DEST_F_AVAILABLE) &&
+		    atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
+			goto out;
 	}
-	read_unlock(&svc->sched_lock);
-
-	/* If the destination has a weight and is not overloaded, use it */
-	if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
-		goto out;
 
 	/* No cache entry or it is invalid, time to schedule */
 	dest = __ip_vs_lblc_schedule(svc);
@@ -517,7 +530,8 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 
 	/* If we fail to create a cache entry, we'll just use the valid dest */
 	write_lock(&svc->sched_lock);
-	ip_vs_lblc_new(tbl, &iph.daddr, dest);
+	if (!tbl->dead)
+		ip_vs_lblc_new(tbl, &iph.daddr, dest);
 	write_unlock(&svc->sched_lock);
 
 out:

From c5549571f975ab519f9f3831327dc456bfd6b1ef Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 22 Mar 2013 11:46:41 +0200
Subject: [PATCH 21/49] ipvs: convert lblcr scheduler to rcu

The schedule method now needs _rcu list-traversal
primitive for svc->destinations. The read_lock for sched_lock is
removed. The set.lock is removed because now it is used in
rare cases, mostly under sched_lock.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_lblcr.c | 171 ++++++++++++++++---------------
 1 file changed, 90 insertions(+), 81 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index c03b6a3ade2f..c22f173d528c 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -89,40 +89,44 @@
  */
 struct ip_vs_dest_set_elem {
 	struct list_head	list;          /* list link */
-	struct ip_vs_dest       *dest;          /* destination server */
+	struct ip_vs_dest __rcu *dest;         /* destination server */
+	struct rcu_head		rcu_head;
 };
 
 struct ip_vs_dest_set {
 	atomic_t                size;           /* set size */
 	unsigned long           lastmod;        /* last modified time */
 	struct list_head	list;           /* destination list */
-	rwlock_t	        lock;           /* lock for this list */
 };
 
 
-static struct ip_vs_dest_set_elem *
-ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
+static void ip_vs_dest_set_insert(struct ip_vs_dest_set *set,
+				  struct ip_vs_dest *dest, bool check)
 {
 	struct ip_vs_dest_set_elem *e;
 
-	list_for_each_entry(e, &set->list, list) {
-		if (e->dest == dest)
-			/* already existed */
-			return NULL;
+	if (check) {
+		list_for_each_entry(e, &set->list, list) {
+			struct ip_vs_dest *d;
+
+			d = rcu_dereference_protected(e->dest, 1);
+			if (d == dest)
+				/* already existed */
+				return;
+		}
 	}
 
 	e = kmalloc(sizeof(*e), GFP_ATOMIC);
 	if (e == NULL)
-		return NULL;
+		return;
 
-	atomic_inc(&dest->refcnt);
-	e->dest = dest;
+	ip_vs_dest_hold(dest);
+	RCU_INIT_POINTER(e->dest, dest);
 
-	list_add(&e->list, &set->list);
+	list_add_rcu(&e->list, &set->list);
 	atomic_inc(&set->size);
 
 	set->lastmod = jiffies;
-	return e;
 }
 
 static void
@@ -131,13 +135,16 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
 	struct ip_vs_dest_set_elem *e;
 
 	list_for_each_entry(e, &set->list, list) {
-		if (e->dest == dest) {
+		struct ip_vs_dest *d;
+
+		d = rcu_dereference_protected(e->dest, 1);
+		if (d == dest) {
 			/* HIT */
 			atomic_dec(&set->size);
 			set->lastmod = jiffies;
-			atomic_dec(&e->dest->refcnt);
-			list_del(&e->list);
-			kfree(e);
+			ip_vs_dest_put(dest);
+			list_del_rcu(&e->list);
+			kfree_rcu(e, rcu_head);
 			break;
 		}
 	}
@@ -147,17 +154,18 @@ static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
 {
 	struct ip_vs_dest_set_elem *e, *ep;
 
-	write_lock(&set->lock);
 	list_for_each_entry_safe(e, ep, &set->list, list) {
+		struct ip_vs_dest *d;
+
+		d = rcu_dereference_protected(e->dest, 1);
 		/*
 		 * We don't kfree dest because it is referred either
 		 * by its service or by the trash dest list.
 		 */
-		atomic_dec(&e->dest->refcnt);
-		list_del(&e->list);
-		kfree(e);
+		ip_vs_dest_put(d);
+		list_del_rcu(&e->list);
+		kfree_rcu(e, rcu_head);
 	}
-	write_unlock(&set->lock);
 }
 
 /* get weighted least-connection node in the destination set */
@@ -171,8 +179,8 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
 		return NULL;
 
 	/* select the first destination server, whose weight > 0 */
-	list_for_each_entry(e, &set->list, list) {
-		least = e->dest;
+	list_for_each_entry_rcu(e, &set->list, list) {
+		least = rcu_dereference(e->dest);
 		if (least->flags & IP_VS_DEST_F_OVERLOAD)
 			continue;
 
@@ -186,8 +194,8 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
 
 	/* find the destination with the weighted least load */
   nextstage:
-	list_for_each_entry(e, &set->list, list) {
-		dest = e->dest;
+	list_for_each_entry_continue_rcu(e, &set->list, list) {
+		dest = rcu_dereference(e->dest);
 		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
 			continue;
 
@@ -224,7 +232,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
 
 	/* select the first destination server, whose weight > 0 */
 	list_for_each_entry(e, &set->list, list) {
-		most = e->dest;
+		most = rcu_dereference_protected(e->dest, 1);
 		if (atomic_read(&most->weight) > 0) {
 			moh = ip_vs_dest_conn_overhead(most);
 			goto nextstage;
@@ -234,8 +242,8 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
 
 	/* find the destination with the weighted most load */
   nextstage:
-	list_for_each_entry(e, &set->list, list) {
-		dest = e->dest;
+	list_for_each_entry_continue(e, &set->list, list) {
+		dest = rcu_dereference_protected(e->dest, 1);
 		doh = ip_vs_dest_conn_overhead(dest);
 		/* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
 		if ((moh * atomic_read(&dest->weight) <
@@ -262,11 +270,12 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
  *      IP address and its destination server set
  */
 struct ip_vs_lblcr_entry {
-	struct list_head        list;
+	struct hlist_node       list;
 	int			af;		/* address family */
 	union nf_inet_addr      addr;           /* destination IP address */
 	struct ip_vs_dest_set   set;            /* destination server set */
 	unsigned long           lastuse;        /* last used time */
+	struct rcu_head		rcu_head;
 };
 
 
@@ -274,12 +283,14 @@ struct ip_vs_lblcr_entry {
  *      IPVS lblcr hash table
  */
 struct ip_vs_lblcr_table {
-	struct list_head        bucket[IP_VS_LBLCR_TAB_SIZE];  /* hash bucket */
+	struct rcu_head		rcu_head;
+	struct hlist_head __rcu bucket[IP_VS_LBLCR_TAB_SIZE];  /* hash bucket */
 	atomic_t                entries;        /* number of entries */
 	int                     max_size;       /* maximum size of entries */
 	struct timer_list       periodic_timer; /* collect stale entries */
 	int                     rover;          /* rover for expire check */
 	int                     counter;        /* counter for no expire */
+	bool			dead;
 };
 
 
@@ -302,9 +313,9 @@ static ctl_table vs_vars_table[] = {
 
 static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
 {
-	list_del(&en->list);
+	hlist_del_rcu(&en->list);
 	ip_vs_dest_set_eraseall(&en->set);
-	kfree(en);
+	kfree_rcu(en, rcu_head);
 }
 
 
@@ -334,15 +345,12 @@ ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
 {
 	unsigned int hash = ip_vs_lblcr_hashkey(en->af, &en->addr);
 
-	list_add(&en->list, &tbl->bucket[hash]);
+	hlist_add_head_rcu(&en->list, &tbl->bucket[hash]);
 	atomic_inc(&tbl->entries);
 }
 
 
-/*
- *  Get ip_vs_lblcr_entry associated with supplied parameters. Called under
- *  read lock.
- */
+/* Get ip_vs_lblcr_entry associated with supplied parameters. */
 static inline struct ip_vs_lblcr_entry *
 ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl,
 		const union nf_inet_addr *addr)
@@ -350,7 +358,7 @@ ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl,
 	unsigned int hash = ip_vs_lblcr_hashkey(af, addr);
 	struct ip_vs_lblcr_entry *en;
 
-	list_for_each_entry(en, &tbl->bucket[hash], list)
+	hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list)
 		if (ip_vs_addr_equal(af, &en->addr, addr))
 			return en;
 
@@ -381,14 +389,14 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,
 		/* initialize its dest set */
 		atomic_set(&(en->set.size), 0);
 		INIT_LIST_HEAD(&en->set.list);
-		rwlock_init(&en->set.lock);
+
+		ip_vs_dest_set_insert(&en->set, dest, false);
 
 		ip_vs_lblcr_hash(tbl, en);
+		return en;
 	}
 
-	write_lock(&en->set.lock);
-	ip_vs_dest_set_insert(&en->set, dest);
-	write_unlock(&en->set.lock);
+	ip_vs_dest_set_insert(&en->set, dest, true);
 
 	return en;
 }
@@ -397,17 +405,21 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,
 /*
  *      Flush all the entries of the specified table.
  */
-static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
+static void ip_vs_lblcr_flush(struct ip_vs_service *svc)
 {
+	struct ip_vs_lblcr_table *tbl = svc->sched_data;
 	int i;
-	struct ip_vs_lblcr_entry *en, *nxt;
+	struct ip_vs_lblcr_entry *en;
+	struct hlist_node *next;
 
-	/* No locking required, only called during cleanup. */
+	write_lock_bh(&svc->sched_lock);
+	tbl->dead = 1;
 	for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
-		list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
+		hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
 			ip_vs_lblcr_free(en);
 		}
 	}
+	write_unlock_bh(&svc->sched_lock);
 }
 
 static int sysctl_lblcr_expiration(struct ip_vs_service *svc)
@@ -425,13 +437,14 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
 	struct ip_vs_lblcr_table *tbl = svc->sched_data;
 	unsigned long now = jiffies;
 	int i, j;
-	struct ip_vs_lblcr_entry *en, *nxt;
+	struct ip_vs_lblcr_entry *en;
+	struct hlist_node *next;
 
 	for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
 		j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
 
 		write_lock(&svc->sched_lock);
-		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+		hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
 			if (time_after(en->lastuse +
 				       sysctl_lblcr_expiration(svc), now))
 				continue;
@@ -463,7 +476,8 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
 	unsigned long now = jiffies;
 	int goal;
 	int i, j;
-	struct ip_vs_lblcr_entry *en, *nxt;
+	struct ip_vs_lblcr_entry *en;
+	struct hlist_node *next;
 
 	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
 		/* do full expiration check */
@@ -485,7 +499,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
 		j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
 
 		write_lock(&svc->sched_lock);
-		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+		hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
 			if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
 				continue;
 
@@ -523,11 +537,12 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
 	 *    Initialize the hash buckets
 	 */
 	for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
-		INIT_LIST_HEAD(&tbl->bucket[i]);
+		INIT_HLIST_HEAD(&tbl->bucket[i]);
 	}
 	tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
 	tbl->rover = 0;
 	tbl->counter = 1;
+	tbl->dead = 0;
 
 	/*
 	 *    Hook periodic timer for garbage collection
@@ -548,10 +563,10 @@ static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
 	del_timer_sync(&tbl->periodic_timer);
 
 	/* got to clean up table entries here */
-	ip_vs_lblcr_flush(tbl);
+	ip_vs_lblcr_flush(svc);
 
 	/* release the table itself */
-	kfree(tbl);
+	kfree_rcu(tbl, rcu_head);
 	IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
 		  sizeof(*tbl));
 
@@ -577,7 +592,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc)
 	 * The server with weight=0 is quiesced and will not receive any
 	 * new connection.
 	 */
-	list_for_each_entry(dest, &svc->destinations, n_list) {
+	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
 		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
 			continue;
 
@@ -593,7 +608,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc)
 	 *    Find the destination with the least load.
 	 */
   nextstage:
-	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+	list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
 		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
 			continue;
 
@@ -627,7 +642,7 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
 	if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
 		struct ip_vs_dest *d;
 
-		list_for_each_entry(d, &svc->destinations, n_list) {
+		list_for_each_entry_rcu(d, &svc->destinations, n_list) {
 			if (atomic_read(&d->activeconns)*2
 			    < atomic_read(&d->weight)) {
 				return 1;
@@ -646,7 +661,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 {
 	struct ip_vs_lblcr_table *tbl = svc->sched_data;
 	struct ip_vs_iphdr iph;
-	struct ip_vs_dest *dest = NULL;
+	struct ip_vs_dest *dest;
 	struct ip_vs_lblcr_entry *en;
 
 	ip_vs_fill_iph_addr_only(svc->af, skb, &iph);
@@ -654,53 +669,46 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
 
 	/* First look in our cache */
-	read_lock(&svc->sched_lock);
 	en = ip_vs_lblcr_get(svc->af, tbl, &iph.daddr);
 	if (en) {
-		/* We only hold a read lock, but this is atomic */
 		en->lastuse = jiffies;
 
 		/* Get the least loaded destination */
-		read_lock(&en->set.lock);
 		dest = ip_vs_dest_set_min(&en->set);
-		read_unlock(&en->set.lock);
 
 		/* More than one destination + enough time passed by, cleanup */
 		if (atomic_read(&en->set.size) > 1 &&
-				time_after(jiffies, en->set.lastmod +
+		    time_after(jiffies, en->set.lastmod +
 				sysctl_lblcr_expiration(svc))) {
-			struct ip_vs_dest *m;
+			write_lock(&svc->sched_lock);
+			if (atomic_read(&en->set.size) > 1) {
+				struct ip_vs_dest *m;
 
-			write_lock(&en->set.lock);
-			m = ip_vs_dest_set_max(&en->set);
-			if (m)
-				ip_vs_dest_set_erase(&en->set, m);
-			write_unlock(&en->set.lock);
+				m = ip_vs_dest_set_max(&en->set);
+				if (m)
+					ip_vs_dest_set_erase(&en->set, m);
+			}
+			write_unlock(&svc->sched_lock);
 		}
 
 		/* If the destination is not overloaded, use it */
-		if (dest && !is_overloaded(dest, svc)) {
-			read_unlock(&svc->sched_lock);
+		if (dest && !is_overloaded(dest, svc))
 			goto out;
-		}
 
 		/* The cache entry is invalid, time to schedule */
 		dest = __ip_vs_lblcr_schedule(svc);
 		if (!dest) {
 			ip_vs_scheduler_err(svc, "no destination available");
-			read_unlock(&svc->sched_lock);
 			return NULL;
 		}
 
 		/* Update our cache entry */
-		write_lock(&en->set.lock);
-		ip_vs_dest_set_insert(&en->set, dest);
-		write_unlock(&en->set.lock);
-	}
-	read_unlock(&svc->sched_lock);
-
-	if (dest)
+		write_lock(&svc->sched_lock);
+		if (!tbl->dead)
+			ip_vs_dest_set_insert(&en->set, dest, true);
+		write_unlock(&svc->sched_lock);
 		goto out;
+	}
 
 	/* No cache entry, time to schedule */
 	dest = __ip_vs_lblcr_schedule(svc);
@@ -711,7 +719,8 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 
 	/* If we fail to create a cache entry, we'll just use the valid dest */
 	write_lock(&svc->sched_lock);
-	ip_vs_lblcr_new(tbl, &iph.daddr, dest);
+	if (!tbl->dead)
+		ip_vs_lblcr_new(tbl, &iph.daddr, dest);
 	write_unlock(&svc->sched_lock);
 
 out:

From 4ebd288b69dcebde1adc5e4c21758ef6dfc7e06f Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 22 Mar 2013 11:46:42 +0200
Subject: [PATCH 22/49] ipvs: convert lc scheduler to rcu

The schedule method now needs _rcu list-traversal
primitive for svc->destinations.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_lc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c
index f391819c0cca..0cabf78fbc31 100644
--- a/net/netfilter/ipvs/ip_vs_lc.c
+++ b/net/netfilter/ipvs/ip_vs_lc.c
@@ -42,7 +42,7 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	 * served, but no new connection is assigned to the server.
 	 */
 
-	list_for_each_entry(dest, &svc->destinations, n_list) {
+	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
 		if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
 		    atomic_read(&dest->weight) == 0)
 			continue;

From f92ea8f09605c26e35789a6865e87d5e3d8aaddd Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 22 Mar 2013 11:46:43 +0200
Subject: [PATCH 23/49] ipvs: convert nq scheduler to rcu

The schedule method now needs _rcu list-traversal
primitive for svc->destinations.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_nq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c
index 984d9c137d84..51dc0cf20d90 100644
--- a/net/netfilter/ipvs/ip_vs_nq.c
+++ b/net/netfilter/ipvs/ip_vs_nq.c
@@ -75,7 +75,7 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	 * new connections.
 	 */
 
-	list_for_each_entry(dest, &svc->destinations, n_list) {
+	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
 
 		if (dest->flags & IP_VS_DEST_F_OVERLOAD ||
 		    !atomic_read(&dest->weight))

From c0d0c0a1c0d35b76f3859b69110f614056d845fe Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 22 Mar 2013 11:46:44 +0200
Subject: [PATCH 24/49] ipvs: convert rr scheduler to rcu

The schedule method now needs _rcu list-traversal
primitive for svc->destinations. As the previous entry
could be unlinked, limit the list traversals to 2 when
lookup started from previous entry.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_rr.c | 59 ++++++++++++++++++++++-------------
 1 file changed, 37 insertions(+), 22 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c
index c49b388d1085..3942890c6f19 100644
--- a/net/netfilter/ipvs/ip_vs_rr.c
+++ b/net/netfilter/ipvs/ip_vs_rr.c
@@ -35,9 +35,18 @@ static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
 }
 
 
-static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
+static int ip_vs_rr_del_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest)
 {
-	svc->sched_data = &svc->destinations;
+	struct list_head *p;
+
+	write_lock_bh(&svc->sched_lock);
+	p = (struct list_head *) svc->sched_data;
+	/* dest is already unlinked, so p->prev is not valid but
+	 * p->next is valid, use it to reach previous entry.
+	 */
+	if (p == &dest->n_list)
+		svc->sched_data = p->next->prev;
+	write_unlock_bh(&svc->sched_lock);
 	return 0;
 }
 
@@ -48,35 +57,40 @@ static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
 static struct ip_vs_dest *
 ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 {
-	struct list_head *p, *q;
-	struct ip_vs_dest *dest;
+	struct list_head *p;
+	struct ip_vs_dest *dest, *last;
+	int pass = 0;
 
 	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
 
 	write_lock(&svc->sched_lock);
-	p = (struct list_head *)svc->sched_data;
-	p = p->next;
-	q = p;
-	do {
-		/* skip list head */
-		if (q == &svc->destinations) {
-			q = q->next;
-			continue;
-		}
+	p = (struct list_head *) svc->sched_data;
+	last = dest = list_entry(p, struct ip_vs_dest, n_list);
 
-		dest = list_entry(q, struct ip_vs_dest, n_list);
-		if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
-		    atomic_read(&dest->weight) > 0)
-			/* HIT */
-			goto out;
-		q = q->next;
-	} while (q != p);
+	do {
+		list_for_each_entry_continue_rcu(dest,
+						 &svc->destinations,
+						 n_list) {
+			if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+			    atomic_read(&dest->weight) > 0)
+				/* HIT */
+				goto out;
+			if (dest == last)
+				goto stop;
+		}
+		pass++;
+		/* Previous dest could be unlinked, do not loop forever.
+		 * If we stay at head there is no need for 2nd pass.
+		 */
+	} while (pass < 2 && p != &svc->destinations);
+
+stop:
 	write_unlock(&svc->sched_lock);
 	ip_vs_scheduler_err(svc, "no destination available");
 	return NULL;
 
   out:
-	svc->sched_data = q;
+	svc->sched_data = &dest->n_list;
 	write_unlock(&svc->sched_lock);
 	IP_VS_DBG_BUF(6, "RR: server %s:%u "
 		      "activeconns %d refcnt %d weight %d\n",
@@ -94,7 +108,8 @@ static struct ip_vs_scheduler ip_vs_rr_scheduler = {
 	.module =		THIS_MODULE,
 	.n_list =		LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list),
 	.init_service =		ip_vs_rr_init_svc,
-	.update_service =	ip_vs_rr_update_svc,
+	.add_dest =		NULL,
+	.del_dest =		ip_vs_rr_del_dest,
 	.schedule =		ip_vs_rr_schedule,
 };
 

From 9be52aba7a7fdaaad82d88b2e66b0d215877a1fd Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 22 Mar 2013 11:46:45 +0200
Subject: [PATCH 25/49] ipvs: convert sed scheduler to rcu

The schedule method now needs _rcu list-traversal
primitive for svc->destinations.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_sed.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c
index 89ead246ed3d..d01187084b7f 100644
--- a/net/netfilter/ipvs/ip_vs_sed.c
+++ b/net/netfilter/ipvs/ip_vs_sed.c
@@ -79,7 +79,7 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	 * new connections.
 	 */
 
-	list_for_each_entry(dest, &svc->destinations, n_list) {
+	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
 		if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
 		    atomic_read(&dest->weight) > 0) {
 			least = dest;
@@ -94,7 +94,7 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	 *    Find the destination with the least load.
 	 */
   nextstage:
-	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+	list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
 		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
 			continue;
 		doh = ip_vs_sed_dest_overhead(dest);

From 1acb7f6761626f4834ea5ce16d8a4dd2a5dd1949 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 22 Mar 2013 11:46:46 +0200
Subject: [PATCH 26/49] ipvs: convert sh scheduler to rcu

Use the 3 new methods to reassign dests.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_sh.c | 81 +++++++++++++++++++----------------
 1 file changed, 45 insertions(+), 36 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index e33126994628..55e76d8db711 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -53,7 +53,7 @@
  *      IPVS SH bucket
  */
 struct ip_vs_sh_bucket {
-	struct ip_vs_dest       *dest;          /* real server (cache) */
+	struct ip_vs_dest __rcu	*dest;	/* real server (cache) */
 };
 
 /*
@@ -66,6 +66,10 @@ struct ip_vs_sh_bucket {
 #define IP_VS_SH_TAB_SIZE               (1 << IP_VS_SH_TAB_BITS)
 #define IP_VS_SH_TAB_MASK               (IP_VS_SH_TAB_SIZE - 1)
 
+struct ip_vs_sh_state {
+	struct ip_vs_sh_bucket		buckets[IP_VS_SH_TAB_SIZE];
+	struct rcu_head			rcu_head;
+};
 
 /*
  *	Returns hash value for IPVS SH entry
@@ -87,10 +91,9 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad
  *      Get ip_vs_dest associated with supplied parameters.
  */
 static inline struct ip_vs_dest *
-ip_vs_sh_get(int af, struct ip_vs_sh_bucket *tbl,
-	     const union nf_inet_addr *addr)
+ip_vs_sh_get(int af, struct ip_vs_sh_state *s, const union nf_inet_addr *addr)
 {
-	return (tbl[ip_vs_sh_hashkey(af, addr)]).dest;
+	return rcu_dereference(s->buckets[ip_vs_sh_hashkey(af, addr)].dest);
 }
 
 
@@ -98,27 +101,32 @@ ip_vs_sh_get(int af, struct ip_vs_sh_bucket *tbl,
  *      Assign all the hash buckets of the specified table with the service.
  */
 static int
-ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
+ip_vs_sh_reassign(struct ip_vs_sh_state *s, struct ip_vs_service *svc)
 {
 	int i;
 	struct ip_vs_sh_bucket *b;
 	struct list_head *p;
 	struct ip_vs_dest *dest;
 	int d_count;
+	bool empty;
 
-	b = tbl;
+	b = &s->buckets[0];
 	p = &svc->destinations;
+	empty = list_empty(p);
 	d_count = 0;
 	for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
-		if (list_empty(p)) {
-			b->dest = NULL;
-		} else {
+		dest = rcu_dereference_protected(b->dest, 1);
+		if (dest)
+			ip_vs_dest_put(dest);
+		if (empty)
+			RCU_INIT_POINTER(b->dest, NULL);
+		else {
 			if (p == &svc->destinations)
 				p = p->next;
 
 			dest = list_entry(p, struct ip_vs_dest, n_list);
-			atomic_inc(&dest->refcnt);
-			b->dest = dest;
+			ip_vs_dest_hold(dest);
+			RCU_INIT_POINTER(b->dest, dest);
 
 			IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n",
 				      i, IP_VS_DBG_ADDR(svc->af, &dest->addr),
@@ -140,16 +148,18 @@ ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
 /*
  *      Flush all the hash buckets of the specified table.
  */
-static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)
+static void ip_vs_sh_flush(struct ip_vs_sh_state *s)
 {
 	int i;
 	struct ip_vs_sh_bucket *b;
+	struct ip_vs_dest *dest;
 
-	b = tbl;
+	b = &s->buckets[0];
 	for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
-		if (b->dest) {
-			atomic_dec(&b->dest->refcnt);
-			b->dest = NULL;
+		dest = rcu_dereference_protected(b->dest, 1);
+		if (dest) {
+			ip_vs_dest_put(dest);
+			RCU_INIT_POINTER(b->dest, NULL);
 		}
 		b++;
 	}
@@ -158,21 +168,20 @@ static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)
 
 static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
 {
-	struct ip_vs_sh_bucket *tbl;
+	struct ip_vs_sh_state *s;
 
 	/* allocate the SH table for this service */
-	tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE,
-		      GFP_KERNEL);
-	if (tbl == NULL)
+	s = kzalloc(sizeof(struct ip_vs_sh_state), GFP_KERNEL);
+	if (s == NULL)
 		return -ENOMEM;
 
-	svc->sched_data = tbl;
+	svc->sched_data = s;
 	IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
 		  "current service\n",
 		  sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
 
-	/* assign the hash buckets with the updated service */
-	ip_vs_sh_assign(tbl, svc);
+	/* assign the hash buckets with current dests */
+	ip_vs_sh_reassign(s, svc);
 
 	return 0;
 }
@@ -180,13 +189,13 @@ static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
 
 static int ip_vs_sh_done_svc(struct ip_vs_service *svc)
 {
-	struct ip_vs_sh_bucket *tbl = svc->sched_data;
+	struct ip_vs_sh_state *s = svc->sched_data;
 
 	/* got to clean up hash buckets here */
-	ip_vs_sh_flush(tbl);
+	ip_vs_sh_flush(s);
 
 	/* release the table itself */
-	kfree(svc->sched_data);
+	kfree_rcu(s, rcu_head);
 	IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
 		  sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
 
@@ -194,15 +203,13 @@ static int ip_vs_sh_done_svc(struct ip_vs_service *svc)
 }
 
 
-static int ip_vs_sh_update_svc(struct ip_vs_service *svc)
+static int ip_vs_sh_dest_changed(struct ip_vs_service *svc,
+				 struct ip_vs_dest *dest)
 {
-	struct ip_vs_sh_bucket *tbl = svc->sched_data;
-
-	/* got to clean up hash buckets here */
-	ip_vs_sh_flush(tbl);
+	struct ip_vs_sh_state *s = svc->sched_data;
 
 	/* assign the hash buckets with the updated service */
-	ip_vs_sh_assign(tbl, svc);
+	ip_vs_sh_reassign(s, svc);
 
 	return 0;
 }
@@ -225,15 +232,15 @@ static struct ip_vs_dest *
 ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 {
 	struct ip_vs_dest *dest;
-	struct ip_vs_sh_bucket *tbl;
+	struct ip_vs_sh_state *s;
 	struct ip_vs_iphdr iph;
 
 	ip_vs_fill_iph_addr_only(svc->af, skb, &iph);
 
 	IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
 
-	tbl = (struct ip_vs_sh_bucket *)svc->sched_data;
-	dest = ip_vs_sh_get(svc->af, tbl, &iph.saddr);
+	s = (struct ip_vs_sh_state *) svc->sched_data;
+	dest = ip_vs_sh_get(svc->af, s, &iph.saddr);
 	if (!dest
 	    || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
 	    || atomic_read(&dest->weight) <= 0
@@ -262,7 +269,9 @@ static struct ip_vs_scheduler ip_vs_sh_scheduler =
 	.n_list	 =		LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list),
 	.init_service =		ip_vs_sh_init_svc,
 	.done_service =		ip_vs_sh_done_svc,
-	.update_service =	ip_vs_sh_update_svc,
+	.add_dest =		ip_vs_sh_dest_changed,
+	.del_dest =		ip_vs_sh_dest_changed,
+	.upd_dest =		ip_vs_sh_dest_changed,
 	.schedule =		ip_vs_sh_schedule,
 };
 

From b310faad3e710b6822ebc0ec100db3682444e412 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 22 Mar 2013 11:46:47 +0200
Subject: [PATCH 27/49] ipvs: convert wlc scheduler to rcu

The schedule method now needs _rcu list-traversal
primitive for svc->destinations.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_wlc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c
index bc1bfc48a17f..dafae881c622 100644
--- a/net/netfilter/ipvs/ip_vs_wlc.c
+++ b/net/netfilter/ipvs/ip_vs_wlc.c
@@ -51,7 +51,7 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	 * new connections.
 	 */
 
-	list_for_each_entry(dest, &svc->destinations, n_list) {
+	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
 		if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
 		    atomic_read(&dest->weight) > 0) {
 			least = dest;
@@ -66,7 +66,7 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	 *    Find the destination with the least load.
 	 */
   nextstage:
-	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+	list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
 		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
 			continue;
 		doh = ip_vs_dest_conn_overhead(dest);

From 08cb2d032f13da4a076b51639b104a830b6bd18c Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 22 Mar 2013 11:46:48 +0200
Subject: [PATCH 28/49] ipvs: convert wrr scheduler to rcu

The schedule method now needs _rcu list-traversal
primitive for svc->destinations. As the weight for some
dest can be reduced during dest selection, change the
algorithm to check weights by using minimum weights in the
1 .. max_weight-(di-1) range, with the same step (di). By this
way we ensure that there will be always a weight >= 1 check
before claiming that all destinations are overloaded.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_wrr.c | 167 ++++++++++++++++++++-------------
 1 file changed, 103 insertions(+), 64 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c
index 231be7dd547a..98cb05e345c9 100644
--- a/net/netfilter/ipvs/ip_vs_wrr.c
+++ b/net/netfilter/ipvs/ip_vs_wrr.c
@@ -29,14 +29,45 @@
 
 #include <net/ip_vs.h>
 
+/* The WRR algorithm depends on some caclulations:
+ * - mw: maximum weight
+ * - di: weight step, greatest common divisor from all weights
+ * - cw: current required weight
+ * As result, all weights are in the [di..mw] range with a step=di.
+ *
+ * First, we start with cw = mw and select dests with weight >= cw.
+ * Then cw is reduced with di and all dests are checked again.
+ * Last pass should be with cw = di. We have mw/di passes in total:
+ *
+ * pass 1: cw = max weight
+ * pass 2: cw = max weight - di
+ * pass 3: cw = max weight - 2 * di
+ * ...
+ * last pass: cw = di
+ *
+ * Weights are supposed to be >= di but we run in parallel with
+ * weight changes, it is possible some dest weight to be reduced
+ * below di, bad if it is the only available dest.
+ *
+ * So, we modify how mw is calculated, now it is reduced with (di - 1),
+ * so that last cw is 1 to catch such dests with weight below di:
+ * pass 1: cw = max weight - (di - 1)
+ * pass 2: cw = max weight - di - (di - 1)
+ * pass 3: cw = max weight - 2 * di - (di - 1)
+ * ...
+ * last pass: cw = 1
+ *
+ */
+
 /*
  * current destination pointer for weighted round-robin scheduling
  */
 struct ip_vs_wrr_mark {
-	struct list_head *cl;	/* current list head */
+	struct ip_vs_dest *cl;	/* current dest or head */
 	int cw;			/* current weight */
 	int mw;			/* maximum weight */
 	int di;			/* decreasing interval */
+	struct rcu_head		rcu_head;
 };
 
 
@@ -88,10 +119,10 @@ static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
 	if (mark == NULL)
 		return -ENOMEM;
 
-	mark->cl = &svc->destinations;
-	mark->cw = 0;
-	mark->mw = ip_vs_wrr_max_weight(svc);
+	mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list);
 	mark->di = ip_vs_wrr_gcd_weight(svc);
+	mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1);
+	mark->cw = mark->mw;
 	svc->sched_data = mark;
 
 	return 0;
@@ -100,24 +131,31 @@ static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
 
 static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
 {
+	struct ip_vs_wrr_mark *mark = svc->sched_data;
+
 	/*
 	 *    Release the mark variable
 	 */
-	kfree(svc->sched_data);
+	kfree_rcu(mark, rcu_head);
 
 	return 0;
 }
 
 
-static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
+static int ip_vs_wrr_dest_changed(struct ip_vs_service *svc,
+				  struct ip_vs_dest *dest)
 {
 	struct ip_vs_wrr_mark *mark = svc->sched_data;
 
-	mark->cl = &svc->destinations;
-	mark->mw = ip_vs_wrr_max_weight(svc);
+	write_lock_bh(&svc->sched_lock);
+	mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list);
 	mark->di = ip_vs_wrr_gcd_weight(svc);
-	if (mark->cw > mark->mw)
-		mark->cw = 0;
+	mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1);
+	if (mark->cw > mark->mw || !mark->cw)
+		mark->cw = mark->mw;
+	else if (mark->di > 1)
+		mark->cw = (mark->cw / mark->di) * mark->di + 1;
+	write_unlock_bh(&svc->sched_lock);
 	return 0;
 }
 
@@ -128,80 +166,79 @@ static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
 static struct ip_vs_dest *
 ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 {
-	struct ip_vs_dest *dest;
+	struct ip_vs_dest *dest, *last, *stop = NULL;
 	struct ip_vs_wrr_mark *mark = svc->sched_data;
-	struct list_head *p;
+	bool last_pass = false, restarted = false;
 
 	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
 
-	/*
-	 * This loop will always terminate, because mark->cw in (0, max_weight]
-	 * and at least one server has its weight equal to max_weight.
-	 */
 	write_lock(&svc->sched_lock);
-	p = mark->cl;
+	dest = mark->cl;
+	/* No available dests? */
+	if (mark->mw == 0)
+		goto err_noavail;
+	last = dest;
+	/* Stop only after all dests were checked for weight >= 1 (last pass) */
 	while (1) {
-		if (mark->cl == &svc->destinations) {
-			/* it is at the head of the destination list */
-
-			if (mark->cl == mark->cl->next) {
-				/* no dest entry */
-				ip_vs_scheduler_err(svc,
-					"no destination available: "
-					"no destinations present");
-				dest = NULL;
-				goto out;
-			}
-
-			mark->cl = svc->destinations.next;
-			mark->cw -= mark->di;
-			if (mark->cw <= 0) {
-				mark->cw = mark->mw;
-				/*
-				 * Still zero, which means no available servers.
-				 */
-				if (mark->cw == 0) {
-					mark->cl = &svc->destinations;
-					ip_vs_scheduler_err(svc,
-						"no destination available");
-					dest = NULL;
-					goto out;
-				}
-			}
-		} else
-			mark->cl = mark->cl->next;
-
-		if (mark->cl != &svc->destinations) {
-			/* not at the head of the list */
-			dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
+		list_for_each_entry_continue_rcu(dest,
+						 &svc->destinations,
+						 n_list) {
 			if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
-			    atomic_read(&dest->weight) >= mark->cw) {
-				/* got it */
-				break;
-			}
+			    atomic_read(&dest->weight) >= mark->cw)
+				goto found;
+			if (dest == stop)
+				goto err_over;
 		}
-
-		if (mark->cl == p && mark->cw == mark->di) {
-			/* back to the start, and no dest is found.
-			   It is only possible when all dests are OVERLOADED */
-			dest = NULL;
-			ip_vs_scheduler_err(svc,
-				"no destination available: "
-				"all destinations are overloaded");
-			goto out;
+		mark->cw -= mark->di;
+		if (mark->cw <= 0) {
+			mark->cw = mark->mw;
+			/* Stop if we tried last pass from first dest:
+			 * 1. last_pass: we started checks when cw > di but
+			 *	then all dests were checked for w >= 1
+			 * 2. last was head: the first and only traversal
+			 *	was for weight >= 1, for all dests.
+			 */
+			if (last_pass ||
+			    &last->n_list == &svc->destinations)
+				goto err_over;
+			restarted = true;
+		}
+		last_pass = mark->cw <= mark->di;
+		if (last_pass && restarted &&
+		    &last->n_list != &svc->destinations) {
+			/* First traversal was for w >= 1 but only
+			 * for dests after 'last', now do the same
+			 * for all dests up to 'last'.
+			 */
+			stop = last;
 		}
 	}
 
+found:
 	IP_VS_DBG_BUF(6, "WRR: server %s:%u "
 		      "activeconns %d refcnt %d weight %d\n",
 		      IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
 		      atomic_read(&dest->activeconns),
 		      atomic_read(&dest->refcnt),
 		      atomic_read(&dest->weight));
+	mark->cl = dest;
 
   out:
 	write_unlock(&svc->sched_lock);
 	return dest;
+
+err_noavail:
+	mark->cl = dest;
+	dest = NULL;
+	ip_vs_scheduler_err(svc, "no destination available");
+	goto out;
+
+err_over:
+	mark->cl = dest;
+	dest = NULL;
+	ip_vs_scheduler_err(svc, "no destination available: "
+			    "all destinations are overloaded");
+	goto out;
 }
 
 
@@ -212,7 +249,9 @@ static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
 	.n_list =		LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list),
 	.init_service =		ip_vs_wrr_init_svc,
 	.done_service =		ip_vs_wrr_done_svc,
-	.update_service =	ip_vs_wrr_update_svc,
+	.add_dest =		ip_vs_wrr_dest_changed,
+	.del_dest =		ip_vs_wrr_dest_changed,
+	.upd_dest =		ip_vs_wrr_dest_changed,
 	.schedule =		ip_vs_wrr_schedule,
 };
 

From 578bc3ef1e473abb9ea99046a307fef0094b22af Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 22 Mar 2013 11:46:49 +0200
Subject: [PATCH 29/49] ipvs: reorganize dest trash

All dests will go to trash, no exceptions.
But we have to use new list node t_list for this, due
to RCU changes in following patches. Dests will wait there
initial grace period and later all conns and schedulers to
put their reference. The dests don't get reference for
staying in dest trash as before.

	As result, we do not load ip_vs_dest_put with
extra checks for last refcnt and the schedulers do not
need to play games with atomic_inc_not_zero while
selecting best destination.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h            |   9 ++
 net/netfilter/ipvs/ip_vs_ctl.c | 178 +++++++++++++++++++--------------
 2 files changed, 113 insertions(+), 74 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 7d3027f2c06e..18aeb85079f8 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -749,6 +749,8 @@ struct ip_vs_dest_dst {
 	struct rcu_head		rcu_head;
 };
 
+/* In grace period after removing */
+#define IP_VS_DEST_STATE_REMOVING	0x01
 /*
  *	The real server destination forwarding entry
  *	with ip address, port number, and so on.
@@ -766,6 +768,7 @@ struct ip_vs_dest {
 
 	atomic_t		refcnt;		/* reference counter */
 	struct ip_vs_stats      stats;          /* statistics */
+	unsigned long		state;		/* state flags */
 
 	/* connection counters and thresholds */
 	atomic_t		activeconns;	/* active connections */
@@ -785,6 +788,7 @@ struct ip_vs_dest {
 	union nf_inet_addr	vaddr;		/* virtual IP address */
 	__u32			vfwmark;	/* firewall mark of service */
 
+	struct list_head	t_list;		/* in dest_trash */
 	struct rcu_head		rcu_head;
 	unsigned int		in_rs_table:1;	/* we are in rs_table */
 };
@@ -912,6 +916,9 @@ struct ipvs_master_sync_state {
 	struct netns_ipvs	*ipvs;
 };
 
+/* How much time to keep dests in trash */
+#define IP_VS_DEST_TRASH_PERIOD		(120 * HZ)
+
 /* IPVS in network namespace */
 struct netns_ipvs {
 	int			gen;		/* Generation */
@@ -961,6 +968,8 @@ struct netns_ipvs {
 
 	/* Trash for destinations */
 	struct list_head	dest_trash;
+	spinlock_t		dest_trash_lock;
+	struct timer_list	dest_trash_timer; /* expiration timer */
 	/* Service counters */
 	atomic_t		ftpsvc_counter;
 	atomic_t		nullsvc_counter;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index a4f638880470..80d366b7daab 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -71,7 +71,7 @@ int ip_vs_get_debug_level(void)
 
 
 /*  Protos */
-static void __ip_vs_del_service(struct ip_vs_service *svc);
+static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup);
 
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -657,19 +657,25 @@ static struct ip_vs_dest *
 ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
 		     __be16 dport)
 {
-	struct ip_vs_dest *dest, *nxt;
+	struct ip_vs_dest *dest;
 	struct netns_ipvs *ipvs = net_ipvs(svc->net);
 
 	/*
 	 * Find the destination in trash
 	 */
-	list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
+	spin_lock_bh(&ipvs->dest_trash_lock);
+	list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
 		IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
 			      "dest->refcnt=%d\n",
 			      dest->vfwmark,
 			      IP_VS_DBG_ADDR(svc->af, &dest->addr),
 			      ntohs(dest->port),
 			      atomic_read(&dest->refcnt));
+		/* We can not reuse dest while in grace period
+		 * because conns still can use dest->svc
+		 */
+		if (test_bit(IP_VS_DEST_STATE_REMOVING, &dest->state))
+			continue;
 		if (dest->af == svc->af &&
 		    ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
 		    dest->port == dport &&
@@ -679,29 +685,27 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
 		     (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
 		      dest->vport == svc->port))) {
 			/* HIT */
-			return dest;
-		}
-
-		/*
-		 * Try to purge the destination from trash if not referenced
-		 */
-		if (atomic_read(&dest->refcnt) == 1) {
-			IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u "
-				      "from trash\n",
-				      dest->vfwmark,
-				      IP_VS_DBG_ADDR(svc->af, &dest->addr),
-				      ntohs(dest->port));
-			list_del(&dest->n_list);
-			__ip_vs_dst_cache_reset(dest);
-			__ip_vs_unbind_svc(dest);
-			free_percpu(dest->stats.cpustats);
-			kfree_rcu(dest, rcu_head);
+			list_del(&dest->t_list);
+			ip_vs_dest_hold(dest);
+			goto out;
 		}
 	}
 
-	return NULL;
+	dest = NULL;
+
+out:
+	spin_unlock_bh(&ipvs->dest_trash_lock);
+
+	return dest;
 }
 
+static void ip_vs_dest_free(struct ip_vs_dest *dest)
+{
+	__ip_vs_dst_cache_reset(dest);
+	__ip_vs_unbind_svc(dest);
+	free_percpu(dest->stats.cpustats);
+	kfree(dest);
+}
 
 /*
  *  Clean up all the destinations in the trash
@@ -710,19 +714,18 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
  *  When the ip_vs_control_clearup is activated by ipvs module exit,
  *  the service tables must have been flushed and all the connections
  *  are expired, and the refcnt of each destination in the trash must
- *  be 1, so we simply release them here.
+ *  be 0, so we simply release them here.
  */
 static void ip_vs_trash_cleanup(struct net *net)
 {
 	struct ip_vs_dest *dest, *nxt;
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
-	list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
-		list_del(&dest->n_list);
-		__ip_vs_dst_cache_reset(dest);
-		__ip_vs_unbind_svc(dest);
-		free_percpu(dest->stats.cpustats);
-		kfree_rcu(dest, rcu_head);
+	del_timer_sync(&ipvs->dest_trash_timer);
+	/* No need to use dest_trash_lock */
+	list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) {
+		list_del(&dest->t_list);
+		ip_vs_dest_free(dest);
 	}
 }
 
@@ -955,11 +958,6 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 			      IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
 			      ntohs(dest->vport));
 
-		/*
-		 * Get the destination from the trash
-		 */
-		list_del(&dest->n_list);
-
 		__ip_vs_update_dest(svc, dest, udest, 1);
 		ret = 0;
 	} else {
@@ -1015,11 +1013,21 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 	return 0;
 }
 
+static void ip_vs_dest_wait_readers(struct rcu_head *head)
+{
+	struct ip_vs_dest *dest = container_of(head, struct ip_vs_dest,
+					       rcu_head);
+
+	/* End of grace period after unlinking */
+	clear_bit(IP_VS_DEST_STATE_REMOVING, &dest->state);
+}
+
 
 /*
  *	Delete a destination (must be already unlinked from the service)
  */
-static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
+static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest,
+			     bool cleanup)
 {
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
@@ -1030,34 +1038,22 @@ static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
 	 */
 	ip_vs_rs_unhash(dest);
 
-	/*
-	 *  Decrease the refcnt of the dest, and free the dest
-	 *  if nobody refers to it (refcnt=0). Otherwise, throw
-	 *  the destination into the trash.
-	 */
-	if (atomic_dec_and_test(&dest->refcnt)) {
-		IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n",
-			      dest->vfwmark,
-			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
-			      ntohs(dest->port));
-		__ip_vs_dst_cache_reset(dest);
-		/* simply decrease svc->refcnt here, let the caller check
-		   and release the service if nobody refers to it.
-		   Only user context can release destination and service,
-		   and only one user context can update virtual service at a
-		   time, so the operation here is OK */
-		atomic_dec(&dest->svc->refcnt);
-		free_percpu(dest->stats.cpustats);
-		kfree_rcu(dest, rcu_head);
-	} else {
-		IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
-			      "dest->refcnt=%d\n",
-			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
-			      ntohs(dest->port),
-			      atomic_read(&dest->refcnt));
-		list_add(&dest->n_list, &ipvs->dest_trash);
-		ip_vs_dest_hold(dest);
+	if (!cleanup) {
+		set_bit(IP_VS_DEST_STATE_REMOVING, &dest->state);
+		call_rcu(&dest->rcu_head, ip_vs_dest_wait_readers);
 	}
+
+	spin_lock_bh(&ipvs->dest_trash_lock);
+	IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
+		      IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
+		      atomic_read(&dest->refcnt));
+	if (list_empty(&ipvs->dest_trash) && !cleanup)
+		mod_timer(&ipvs->dest_trash_timer,
+			  jiffies + IP_VS_DEST_TRASH_PERIOD);
+	/* dest lives in trash without reference */
+	list_add(&dest->t_list, &ipvs->dest_trash);
+	spin_unlock_bh(&ipvs->dest_trash_lock);
+	ip_vs_dest_put(dest);
 }
 
 
@@ -1122,13 +1118,38 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 	/*
 	 *	Delete the destination
 	 */
-	__ip_vs_del_dest(svc->net, dest);
+	__ip_vs_del_dest(svc->net, dest, false);
 
 	LeaveFunction(2);
 
 	return 0;
 }
 
+static void ip_vs_dest_trash_expire(unsigned long data)
+{
+	struct net *net = (struct net *) data;
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_dest *dest, *next;
+
+	spin_lock(&ipvs->dest_trash_lock);
+	list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) {
+		/* Skip if dest is in grace period */
+		if (test_bit(IP_VS_DEST_STATE_REMOVING, &dest->state))
+			continue;
+		if (atomic_read(&dest->refcnt) > 0)
+			continue;
+		IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n",
+			      dest->vfwmark,
+			      IP_VS_DBG_ADDR(dest->svc->af, &dest->addr),
+			      ntohs(dest->port));
+		list_del(&dest->t_list);
+		ip_vs_dest_free(dest);
+	}
+	if (!list_empty(&ipvs->dest_trash))
+		mod_timer(&ipvs->dest_trash_timer,
+			  jiffies + IP_VS_DEST_TRASH_PERIOD);
+	spin_unlock(&ipvs->dest_trash_lock);
+}
 
 /*
  *	Add a service into the service hash table
@@ -1358,7 +1379,7 @@ out:
  *	- The service must be unlinked, unlocked and not referenced!
  *	- We are called under _bh lock
  */
-static void __ip_vs_del_service(struct ip_vs_service *svc)
+static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
 {
 	struct ip_vs_dest *dest, *nxt;
 	struct ip_vs_scheduler *old_sched;
@@ -1394,7 +1415,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
 	 */
 	list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
 		__ip_vs_unlink_dest(svc, dest, 0);
-		__ip_vs_del_dest(svc->net, dest);
+		__ip_vs_del_dest(svc->net, dest, cleanup);
 	}
 
 	/*
@@ -1424,7 +1445,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
 /*
  * Unlink a service from list and try to delete it if its refcnt reached 0
  */
-static void ip_vs_unlink_service(struct ip_vs_service *svc)
+static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup)
 {
 	/*
 	 * Unhash it from the service table
@@ -1438,7 +1459,7 @@ static void ip_vs_unlink_service(struct ip_vs_service *svc)
 	 */
 	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
 
-	__ip_vs_del_service(svc);
+	__ip_vs_del_service(svc, cleanup);
 
 	write_unlock_bh(&__ip_vs_svc_lock);
 }
@@ -1450,7 +1471,7 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
 {
 	if (svc == NULL)
 		return -EEXIST;
-	ip_vs_unlink_service(svc);
+	ip_vs_unlink_service(svc, false);
 
 	return 0;
 }
@@ -1459,7 +1480,7 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
 /*
  *	Flush all the virtual services
  */
-static int ip_vs_flush(struct net *net)
+static int ip_vs_flush(struct net *net, bool cleanup)
 {
 	int idx;
 	struct ip_vs_service *svc, *nxt;
@@ -1471,7 +1492,7 @@ static int ip_vs_flush(struct net *net)
 		list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx],
 					 s_list) {
 			if (net_eq(svc->net, net))
-				ip_vs_unlink_service(svc);
+				ip_vs_unlink_service(svc, cleanup);
 		}
 	}
 
@@ -1482,7 +1503,7 @@ static int ip_vs_flush(struct net *net)
 		list_for_each_entry_safe(svc, nxt,
 					 &ip_vs_svc_fwm_table[idx], f_list) {
 			if (net_eq(svc->net, net))
-				ip_vs_unlink_service(svc);
+				ip_vs_unlink_service(svc, cleanup);
 		}
 	}
 
@@ -1498,7 +1519,7 @@ void ip_vs_service_net_cleanup(struct net *net)
 	EnterFunction(2);
 	/* Check for "full" addressed entries */
 	mutex_lock(&__ip_vs_mutex);
-	ip_vs_flush(net);
+	ip_vs_flush(net, true);
 	mutex_unlock(&__ip_vs_mutex);
 	LeaveFunction(2);
 }
@@ -1558,9 +1579,11 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
 		}
 	}
 
-	list_for_each_entry(dest, &ipvs->dest_trash, n_list) {
+	spin_lock_bh(&ipvs->dest_trash_lock);
+	list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
 		ip_vs_forget_dev(dest, dev);
 	}
+	spin_unlock_bh(&ipvs->dest_trash_lock);
 	mutex_unlock(&__ip_vs_mutex);
 	LeaveFunction(2);
 	return NOTIFY_DONE;
@@ -2394,7 +2417,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 
 	if (cmd == IP_VS_SO_SET_FLUSH) {
 		/* Flush the virtual service */
-		ret = ip_vs_flush(net);
+		ret = ip_vs_flush(net, false);
 		goto out_unlock;
 	} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
 		/* Set timeout values for (tcp tcpfin udp) */
@@ -3403,7 +3426,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
 	mutex_lock(&__ip_vs_mutex);
 
 	if (cmd == IPVS_CMD_FLUSH) {
-		ret = ip_vs_flush(net);
+		ret = ip_vs_flush(net, false);
 		goto out;
 	} else if (cmd == IPVS_CMD_SET_CONFIG) {
 		ret = ip_vs_genl_set_config(net, info->attrs);
@@ -3800,6 +3823,9 @@ int __net_init ip_vs_control_net_init(struct net *net)
 		INIT_HLIST_HEAD(&ipvs->rs_table[idx]);
 
 	INIT_LIST_HEAD(&ipvs->dest_trash);
+	spin_lock_init(&ipvs->dest_trash_lock);
+	setup_timer(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire,
+		    (unsigned long) net);
 	atomic_set(&ipvs->ftpsvc_counter, 0);
 	atomic_set(&ipvs->nullsvc_counter, 0);
 
@@ -3829,6 +3855,10 @@ void __net_exit ip_vs_control_net_cleanup(struct net *net)
 {
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
+	/* Some dest can be in grace period even before cleanup, we have to
+	 * defer ip_vs_trash_cleanup until ip_vs_dest_wait_readers is called.
+	 */
+	rcu_barrier();
 	ip_vs_trash_cleanup(net);
 	ip_vs_stop_estimator(net, &ipvs->tot_stats);
 	ip_vs_control_net_cleanup_sysctl(net);

From ed3ffc4e48e2b03d5b23988f3cfa0ad8d79e0092 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 22 Mar 2013 11:46:50 +0200
Subject: [PATCH 30/49] ipvs: do not expect result from done_service

This method releases the scheduler state,
it can not fail. Such change will help to properly
replace the scheduler in following patch.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h              |  4 ++--
 net/netfilter/ipvs/ip_vs_ctl.c   |  5 +----
 net/netfilter/ipvs/ip_vs_dh.c    |  4 +---
 net/netfilter/ipvs/ip_vs_lblc.c  |  4 +---
 net/netfilter/ipvs/ip_vs_lblcr.c |  4 +---
 net/netfilter/ipvs/ip_vs_sched.c | 13 ++++---------
 net/netfilter/ipvs/ip_vs_sh.c    |  4 +---
 net/netfilter/ipvs/ip_vs_wrr.c   |  4 +---
 8 files changed, 12 insertions(+), 30 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 18aeb85079f8..4990de698576 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -806,7 +806,7 @@ struct ip_vs_scheduler {
 	/* scheduler initializing service */
 	int (*init_service)(struct ip_vs_service *svc);
 	/* scheduling service finish */
-	int (*done_service)(struct ip_vs_service *svc);
+	void (*done_service)(struct ip_vs_service *svc);
 	/* scheduler updating service */
 	int (*update_service)(struct ip_vs_service *svc);
 	/* dest is linked */
@@ -1392,7 +1392,7 @@ extern int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler);
 extern int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler);
 extern int ip_vs_bind_scheduler(struct ip_vs_service *svc,
 				struct ip_vs_scheduler *scheduler);
-extern int ip_vs_unbind_scheduler(struct ip_vs_service *svc);
+extern void ip_vs_unbind_scheduler(struct ip_vs_service *svc);
 extern struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name);
 extern void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler);
 extern struct ip_vs_conn *
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 80d366b7daab..d02272685dda 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1334,10 +1334,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
 		/*
 		 * Unbind the old scheduler
 		 */
-		if ((ret = ip_vs_unbind_scheduler(svc))) {
-			old_sched = sched;
-			goto out_unlock;
-		}
+		ip_vs_unbind_scheduler(svc);
 
 		/*
 		 * Bind the new scheduler
diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c
index ebe80f44e712..89c27230d93a 100644
--- a/net/netfilter/ipvs/ip_vs_dh.c
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -174,7 +174,7 @@ static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
 }
 
 
-static int ip_vs_dh_done_svc(struct ip_vs_service *svc)
+static void ip_vs_dh_done_svc(struct ip_vs_service *svc)
 {
 	struct ip_vs_dh_state *s = svc->sched_data;
 
@@ -185,8 +185,6 @@ static int ip_vs_dh_done_svc(struct ip_vs_service *svc)
 	kfree_rcu(s, rcu_head);
 	IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n",
 		  sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
-
-	return 0;
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index b873e177435a..c7ff97895237 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -388,7 +388,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
 }
 
 
-static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
+static void ip_vs_lblc_done_svc(struct ip_vs_service *svc)
 {
 	struct ip_vs_lblc_table *tbl = svc->sched_data;
 
@@ -402,8 +402,6 @@ static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
 	kfree_rcu(tbl, rcu_head);
 	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
 		  sizeof(*tbl));
-
-	return 0;
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index c22f173d528c..6049b85df41f 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -555,7 +555,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
 }
 
 
-static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
+static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
 {
 	struct ip_vs_lblcr_table *tbl = svc->sched_data;
 
@@ -569,8 +569,6 @@ static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
 	kfree_rcu(tbl, rcu_head);
 	IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
 		  sizeof(*tbl));
-
-	return 0;
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
index 7f11d3d37a42..1b715d0caf43 100644
--- a/net/netfilter/ipvs/ip_vs_sched.c
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -64,22 +64,17 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
 /*
  *  Unbind a service with its scheduler
  */
-int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
+void ip_vs_unbind_scheduler(struct ip_vs_service *svc)
 {
 	struct ip_vs_scheduler *sched = svc->scheduler;
 
 	if (!sched)
-		return 0;
+		return;
 
-	if (sched->done_service) {
-		if (sched->done_service(svc) != 0) {
-			pr_err("%s(): done error\n", __func__);
-			return -EINVAL;
-		}
-	}
+	if (sched->done_service)
+		sched->done_service(svc);
 
 	svc->scheduler = NULL;
-	return 0;
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index 55e76d8db711..81c1a10c7b49 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -187,7 +187,7 @@ static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
 }
 
 
-static int ip_vs_sh_done_svc(struct ip_vs_service *svc)
+static void ip_vs_sh_done_svc(struct ip_vs_service *svc)
 {
 	struct ip_vs_sh_state *s = svc->sched_data;
 
@@ -198,8 +198,6 @@ static int ip_vs_sh_done_svc(struct ip_vs_service *svc)
 	kfree_rcu(s, rcu_head);
 	IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
 		  sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
-
-	return 0;
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c
index 98cb05e345c9..a74fd9bab953 100644
--- a/net/netfilter/ipvs/ip_vs_wrr.c
+++ b/net/netfilter/ipvs/ip_vs_wrr.c
@@ -129,7 +129,7 @@ static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
 }
 
 
-static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
+static void ip_vs_wrr_done_svc(struct ip_vs_service *svc)
 {
 	struct ip_vs_wrr_mark *mark = svc->sched_data;
 
@@ -137,8 +137,6 @@ static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
 	 *    Release the mark variable
 	 */
 	kfree_rcu(mark, rcu_head);
-
-	return 0;
 }
 
 

From ba3a3ce14ea26d602b253ef13a56d540827cd51d Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 22 Mar 2013 11:46:51 +0200
Subject: [PATCH 31/49] ipvs: convert sched_lock to spin lock

As all read_locks are gone spin lock is preferred.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h              |  2 +-
 net/netfilter/ipvs/ip_vs_ctl.c   |  2 +-
 net/netfilter/ipvs/ip_vs_lblc.c  | 18 +++++++++---------
 net/netfilter/ipvs/ip_vs_lblcr.c | 26 +++++++++++++-------------
 net/netfilter/ipvs/ip_vs_rr.c    | 10 +++++-----
 net/netfilter/ipvs/ip_vs_wrr.c   |  8 ++++----
 6 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 4990de698576..4a7bc63b9cba 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -734,7 +734,7 @@ struct ip_vs_service {
 
 	/* for scheduling */
 	struct ip_vs_scheduler	*scheduler;    /* bound scheduler object */
-	rwlock_t		sched_lock;    /* lock sched_data */
+	spinlock_t		sched_lock;    /* lock sched_data */
 	void			*sched_data;   /* scheduler application data */
 
 	/* alternate persistence engine */
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index d02272685dda..2bfd807bc93f 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1219,7 +1219,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
 	svc->net = net;
 
 	INIT_LIST_HEAD(&svc->destinations);
-	rwlock_init(&svc->sched_lock);
+	spin_lock_init(&svc->sched_lock);
 	spin_lock_init(&svc->stats.lock);
 
 	/* Bind the scheduler */
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index c7ff97895237..ffef8a162148 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -194,7 +194,7 @@ ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl,
 
 /*
  * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP
- * address to a server. Called under write lock.
+ * address to a server. Called under spin lock.
  */
 static inline struct ip_vs_lblc_entry *
 ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr,
@@ -242,7 +242,7 @@ static void ip_vs_lblc_flush(struct ip_vs_service *svc)
 	struct hlist_node *next;
 	int i;
 
-	write_lock_bh(&svc->sched_lock);
+	spin_lock_bh(&svc->sched_lock);
 	tbl->dead = 1;
 	for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
 		hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
@@ -250,7 +250,7 @@ static void ip_vs_lblc_flush(struct ip_vs_service *svc)
 			atomic_dec(&tbl->entries);
 		}
 	}
-	write_unlock_bh(&svc->sched_lock);
+	spin_unlock_bh(&svc->sched_lock);
 }
 
 static int sysctl_lblc_expiration(struct ip_vs_service *svc)
@@ -274,7 +274,7 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
 	for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
 		j = (j + 1) & IP_VS_LBLC_TAB_MASK;
 
-		write_lock(&svc->sched_lock);
+		spin_lock(&svc->sched_lock);
 		hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
 			if (time_before(now,
 					en->lastuse +
@@ -284,7 +284,7 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
 			ip_vs_lblc_free(en);
 			atomic_dec(&tbl->entries);
 		}
-		write_unlock(&svc->sched_lock);
+		spin_unlock(&svc->sched_lock);
 	}
 	tbl->rover = j;
 }
@@ -330,7 +330,7 @@ static void ip_vs_lblc_check_expire(unsigned long data)
 	for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
 		j = (j + 1) & IP_VS_LBLC_TAB_MASK;
 
-		write_lock(&svc->sched_lock);
+		spin_lock(&svc->sched_lock);
 		hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
 			if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
 				continue;
@@ -339,7 +339,7 @@ static void ip_vs_lblc_check_expire(unsigned long data)
 			atomic_dec(&tbl->entries);
 			goal--;
 		}
-		write_unlock(&svc->sched_lock);
+		spin_unlock(&svc->sched_lock);
 		if (goal <= 0)
 			break;
 	}
@@ -527,10 +527,10 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	}
 
 	/* If we fail to create a cache entry, we'll just use the valid dest */
-	write_lock(&svc->sched_lock);
+	spin_lock(&svc->sched_lock);
 	if (!tbl->dead)
 		ip_vs_lblc_new(tbl, &iph.daddr, dest);
-	write_unlock(&svc->sched_lock);
+	spin_unlock(&svc->sched_lock);
 
 out:
 	IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n",
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 6049b85df41f..cdfe6a95eddb 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -368,7 +368,7 @@ ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl,
 
 /*
  * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination
- * IP address to a server. Called under write lock.
+ * IP address to a server. Called under spin lock.
  */
 static inline struct ip_vs_lblcr_entry *
 ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,
@@ -412,14 +412,14 @@ static void ip_vs_lblcr_flush(struct ip_vs_service *svc)
 	struct ip_vs_lblcr_entry *en;
 	struct hlist_node *next;
 
-	write_lock_bh(&svc->sched_lock);
+	spin_lock_bh(&svc->sched_lock);
 	tbl->dead = 1;
 	for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
 		hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
 			ip_vs_lblcr_free(en);
 		}
 	}
-	write_unlock_bh(&svc->sched_lock);
+	spin_unlock_bh(&svc->sched_lock);
 }
 
 static int sysctl_lblcr_expiration(struct ip_vs_service *svc)
@@ -443,7 +443,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
 	for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
 		j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
 
-		write_lock(&svc->sched_lock);
+		spin_lock(&svc->sched_lock);
 		hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
 			if (time_after(en->lastuse +
 				       sysctl_lblcr_expiration(svc), now))
@@ -452,7 +452,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
 			ip_vs_lblcr_free(en);
 			atomic_dec(&tbl->entries);
 		}
-		write_unlock(&svc->sched_lock);
+		spin_unlock(&svc->sched_lock);
 	}
 	tbl->rover = j;
 }
@@ -498,7 +498,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
 	for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
 		j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
 
-		write_lock(&svc->sched_lock);
+		spin_lock(&svc->sched_lock);
 		hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
 			if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
 				continue;
@@ -507,7 +507,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
 			atomic_dec(&tbl->entries);
 			goal--;
 		}
-		write_unlock(&svc->sched_lock);
+		spin_unlock(&svc->sched_lock);
 		if (goal <= 0)
 			break;
 	}
@@ -678,7 +678,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 		if (atomic_read(&en->set.size) > 1 &&
 		    time_after(jiffies, en->set.lastmod +
 				sysctl_lblcr_expiration(svc))) {
-			write_lock(&svc->sched_lock);
+			spin_lock(&svc->sched_lock);
 			if (atomic_read(&en->set.size) > 1) {
 				struct ip_vs_dest *m;
 
@@ -686,7 +686,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 				if (m)
 					ip_vs_dest_set_erase(&en->set, m);
 			}
-			write_unlock(&svc->sched_lock);
+			spin_unlock(&svc->sched_lock);
 		}
 
 		/* If the destination is not overloaded, use it */
@@ -701,10 +701,10 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 		}
 
 		/* Update our cache entry */
-		write_lock(&svc->sched_lock);
+		spin_lock(&svc->sched_lock);
 		if (!tbl->dead)
 			ip_vs_dest_set_insert(&en->set, dest, true);
-		write_unlock(&svc->sched_lock);
+		spin_unlock(&svc->sched_lock);
 		goto out;
 	}
 
@@ -716,10 +716,10 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	}
 
 	/* If we fail to create a cache entry, we'll just use the valid dest */
-	write_lock(&svc->sched_lock);
+	spin_lock(&svc->sched_lock);
 	if (!tbl->dead)
 		ip_vs_lblcr_new(tbl, &iph.daddr, dest);
-	write_unlock(&svc->sched_lock);
+	spin_unlock(&svc->sched_lock);
 
 out:
 	IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n",
diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c
index 3942890c6f19..aa4601ff1ccc 100644
--- a/net/netfilter/ipvs/ip_vs_rr.c
+++ b/net/netfilter/ipvs/ip_vs_rr.c
@@ -39,14 +39,14 @@ static int ip_vs_rr_del_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest)
 {
 	struct list_head *p;
 
-	write_lock_bh(&svc->sched_lock);
+	spin_lock_bh(&svc->sched_lock);
 	p = (struct list_head *) svc->sched_data;
 	/* dest is already unlinked, so p->prev is not valid but
 	 * p->next is valid, use it to reach previous entry.
 	 */
 	if (p == &dest->n_list)
 		svc->sched_data = p->next->prev;
-	write_unlock_bh(&svc->sched_lock);
+	spin_unlock_bh(&svc->sched_lock);
 	return 0;
 }
 
@@ -63,7 +63,7 @@ ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 
 	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
 
-	write_lock(&svc->sched_lock);
+	spin_lock(&svc->sched_lock);
 	p = (struct list_head *) svc->sched_data;
 	last = dest = list_entry(p, struct ip_vs_dest, n_list);
 
@@ -85,13 +85,13 @@ ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	} while (pass < 2 && p != &svc->destinations);
 
 stop:
-	write_unlock(&svc->sched_lock);
+	spin_unlock(&svc->sched_lock);
 	ip_vs_scheduler_err(svc, "no destination available");
 	return NULL;
 
   out:
 	svc->sched_data = &dest->n_list;
-	write_unlock(&svc->sched_lock);
+	spin_unlock(&svc->sched_lock);
 	IP_VS_DBG_BUF(6, "RR: server %s:%u "
 		      "activeconns %d refcnt %d weight %d\n",
 		      IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c
index a74fd9bab953..b173ef907a14 100644
--- a/net/netfilter/ipvs/ip_vs_wrr.c
+++ b/net/netfilter/ipvs/ip_vs_wrr.c
@@ -145,7 +145,7 @@ static int ip_vs_wrr_dest_changed(struct ip_vs_service *svc,
 {
 	struct ip_vs_wrr_mark *mark = svc->sched_data;
 
-	write_lock_bh(&svc->sched_lock);
+	spin_lock_bh(&svc->sched_lock);
 	mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list);
 	mark->di = ip_vs_wrr_gcd_weight(svc);
 	mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1);
@@ -153,7 +153,7 @@ static int ip_vs_wrr_dest_changed(struct ip_vs_service *svc,
 		mark->cw = mark->mw;
 	else if (mark->di > 1)
 		mark->cw = (mark->cw / mark->di) * mark->di + 1;
-	write_unlock_bh(&svc->sched_lock);
+	spin_unlock_bh(&svc->sched_lock);
 	return 0;
 }
 
@@ -170,7 +170,7 @@ ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 
 	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
 
-	write_lock(&svc->sched_lock);
+	spin_lock(&svc->sched_lock);
 	dest = mark->cl;
 	/* No available dests? */
 	if (mark->mw == 0)
@@ -222,7 +222,7 @@ found:
 	mark->cl = dest;
 
   out:
-	write_unlock(&svc->sched_lock);
+	spin_unlock(&svc->sched_lock);
 	return dest;
 
 err_noavail:

From 413c2d04e9494ca38629d8a7ffeff1e4398a9fe3 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 22 Mar 2013 11:46:52 +0200
Subject: [PATCH 32/49] ipvs: convert dests to rcu

In previous commits the schedulers started to access
svc->destinations with _rcu list traversal primitives
because the IP_VS_WAIT_WHILE macro still plays the role of
grace period. Now it is time to finish the updating part,
i.e. adding and deleting of dests with _rcu suffix before
removing the IP_VS_WAIT_WHILE in next commit.

We use the same rule for conns as for the
schedulers: dests can be searched in RCU read-side critical
section where ip_vs_dest_hold can be called by ip_vs_bind_dest.

Some things are not perfect, for example, calling
functions like ip_vs_lookup_dest from updating code under
RCU, just because we use some function both from reader
and from updater.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h             |  2 +-
 net/netfilter/ipvs/ip_vs_conn.c |  8 +++++---
 net/netfilter/ipvs/ip_vs_ctl.c  | 32 +++++++++++++++++---------------
 net/netfilter/ipvs/ip_vs_sync.c | 11 ++++-------
 4 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 4a7bc63b9cba..78a6634653a2 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1434,7 +1434,7 @@ extern struct ip_vs_dest *
 ip_vs_find_dest(struct net *net, int af, const union nf_inet_addr *daddr,
 		__be16 dport, const union nf_inet_addr *vaddr, __be16 vport,
 		__u16 protocol, __u32 fwmark, __u32 flags);
-extern struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp);
+extern void ip_vs_try_bind_dest(struct ip_vs_conn *cp);
 
 static inline void ip_vs_dest_hold(struct ip_vs_dest *dest)
 {
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 1b29e4a2b26c..54de34077b63 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -611,10 +611,11 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
  * Check if there is a destination for the connection, if so
  * bind the connection to the destination.
  */
-struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
+void ip_vs_try_bind_dest(struct ip_vs_conn *cp)
 {
 	struct ip_vs_dest *dest;
 
+	rcu_read_lock();
 	dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr,
 			       cp->dport, &cp->vaddr, cp->vport,
 			       cp->protocol, cp->fwmark, cp->flags);
@@ -624,7 +625,8 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
 		spin_lock(&cp->lock);
 		if (cp->dest) {
 			spin_unlock(&cp->lock);
-			return dest;
+			rcu_read_unlock();
+			return;
 		}
 
 		/* Applications work depending on the forwarding method
@@ -648,7 +650,7 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
 		if (pd && atomic_read(&pd->appcnt))
 			ip_vs_bind_app(cp, pd->pp);
 	}
-	return dest;
+	rcu_read_unlock();
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 2bfd807bc93f..0763cc6e092b 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -565,8 +565,8 @@ bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol,
 	return false;
 }
 
-/*
- *	Lookup destination by {addr,port} in the given service
+/* Lookup destination by {addr,port} in the given service
+ * Called under RCU lock.
  */
 static struct ip_vs_dest *
 ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
@@ -577,7 +577,7 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
 	/*
 	 * Find the destination for the given service
 	 */
-	list_for_each_entry(dest, &svc->destinations, n_list) {
+	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
 		if ((dest->af == svc->af)
 		    && ip_vs_addr_equal(svc->af, &dest->addr, daddr)
 		    && (dest->port == dport)) {
@@ -591,10 +591,11 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
 
 /*
  * Find destination by {daddr,dport,vaddr,protocol}
- * Cretaed to be used in ip_vs_process_message() in
+ * Created to be used in ip_vs_process_message() in
  * the backup synchronization daemon. It finds the
  * destination to be bound to the received connection
  * on the backup.
+ * Called under RCU lock, no refcnt is returned.
  */
 struct ip_vs_dest *ip_vs_find_dest(struct net  *net, int af,
 				   const union nf_inet_addr *daddr,
@@ -615,8 +616,6 @@ struct ip_vs_dest *ip_vs_find_dest(struct net  *net, int af,
 	dest = ip_vs_lookup_dest(svc, daddr, port);
 	if (!dest)
 		dest = ip_vs_lookup_dest(svc, daddr, port ^ dport);
-	if (dest)
-		ip_vs_dest_hold(dest);
 	ip_vs_service_put(svc);
 	return dest;
 }
@@ -826,7 +825,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
 	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
 
 	if (add) {
-		list_add(&dest->n_list, &svc->destinations);
+		list_add_rcu(&dest->n_list, &svc->destinations);
 		svc->num_dests++;
 		if (svc->scheduler->add_dest)
 			svc->scheduler->add_dest(svc, dest);
@@ -933,10 +932,10 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 
 	ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
 
-	/*
-	 * Check if the dest already exists in the list
-	 */
+	/* We use function that requires RCU lock */
+	rcu_read_lock();
 	dest = ip_vs_lookup_dest(svc, &daddr, dport);
+	rcu_read_unlock();
 
 	if (dest != NULL) {
 		IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
@@ -997,10 +996,10 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 
 	ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
 
-	/*
-	 *  Lookup the destination list
-	 */
+	/* We use function that requires RCU lock */
+	rcu_read_lock();
 	dest = ip_vs_lookup_dest(svc, &daddr, dport);
+	rcu_read_unlock();
 
 	if (dest == NULL) {
 		IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
@@ -1069,7 +1068,7 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
 	/*
 	 *  Remove it from the d-linked destination list.
 	 */
-	list_del(&dest->n_list);
+	list_del_rcu(&dest->n_list);
 	svc->num_dests--;
 
 	if (svcupd && svc->scheduler->del_dest)
@@ -1094,7 +1093,10 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 
 	EnterFunction(2);
 
+	/* We use function that requires RCU lock */
+	rcu_read_lock();
 	dest = ip_vs_lookup_dest(svc, &udest->addr, dport);
+	rcu_read_unlock();
 
 	if (dest == NULL) {
 		IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
@@ -2104,7 +2106,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
 		else
 			seq_putc(seq, '\n');
 
-		list_for_each_entry(dest, &svc->destinations, n_list) {
+		list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
 #ifdef CONFIG_IP_VS_IPV6
 			if (dest->af == AF_INET6)
 				seq_printf(seq,
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 6cc3e52f1f35..97241749216d 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -858,23 +858,20 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
 		flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK;
 		cp->flags = flags;
 		spin_unlock(&cp->lock);
-		if (!dest) {
-			dest = ip_vs_try_bind_dest(cp);
-			if (dest)
-				ip_vs_dest_put(dest);
-		}
+		if (!dest)
+			ip_vs_try_bind_dest(cp);
 	} else {
 		/*
 		 * Find the appropriate destination for the connection.
 		 * If it is not found the connection will remain unbound
 		 * but still handled.
 		 */
+		rcu_read_lock();
 		dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr,
 				       param->vport, protocol, fwmark, flags);
 
 		cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark);
-		if (dest)
-			ip_vs_dest_put(dest);
+		rcu_read_unlock();
 		if (!cp) {
 			if (param->pe_data)
 				kfree(param->pe_data);

From ceec4c3816818459d90c92152e61371ff5b1d5a1 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 22 Mar 2013 11:46:53 +0200
Subject: [PATCH 33/49] ipvs: convert services to rcu

This is the final step in RCU conversion.

Things that are removed:

- svc->usecnt: now svc is accessed under RCU read lock
- svc->inc: and some unused code
- ip_vs_bind_pe and ip_vs_unbind_pe: no ability to replace PE
- __ip_vs_svc_lock: replaced with RCU
- IP_VS_WAIT_WHILE: now readers lookup svcs and dests under
	RCU and work in parallel with configuration

Other changes:

- before now, a RCU read-side critical section included the
calling of the schedule method, now it is extended to include
service lookup
- ip_vs_svc_table and ip_vs_svc_fwm_table are now using hlist
- svc->pe and svc->scheduler remain to the end (of grace period),
	the schedulers are prepared for such RCU readers
	even after done_service is called but they need
	to use synchronize_rcu because last ip_vs_scheduler_put
	can happen while RCU read-side critical sections
	use an outdated svc->scheduler pointer
- as planned, update_service is removed
- empty services can be freed immediately after grace period.
	If dests were present, the services are freed from
	the dest trash code

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h                   |  28 +--
 net/netfilter/ipvs/ip_vs_core.c       |  26 +--
 net/netfilter/ipvs/ip_vs_ctl.c        | 299 +++++++++++---------------
 net/netfilter/ipvs/ip_vs_dh.c         |   1 +
 net/netfilter/ipvs/ip_vs_lblc.c       |   1 +
 net/netfilter/ipvs/ip_vs_lblcr.c      |   1 +
 net/netfilter/ipvs/ip_vs_lc.c         |   1 +
 net/netfilter/ipvs/ip_vs_nq.c         |   1 +
 net/netfilter/ipvs/ip_vs_pe.c         |  12 --
 net/netfilter/ipvs/ip_vs_proto_sctp.c |  14 +-
 net/netfilter/ipvs/ip_vs_proto_tcp.c  |  14 +-
 net/netfilter/ipvs/ip_vs_proto_udp.c  |  14 +-
 net/netfilter/ipvs/ip_vs_rr.c         |   1 +
 net/netfilter/ipvs/ip_vs_sched.c      |  28 +--
 net/netfilter/ipvs/ip_vs_sed.c        |   1 +
 net/netfilter/ipvs/ip_vs_sh.c         |   1 +
 net/netfilter/ipvs/ip_vs_wlc.c        |   1 +
 net/netfilter/ipvs/ip_vs_wrr.c        |   1 +
 18 files changed, 186 insertions(+), 259 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 78a6634653a2..f9f5b057b480 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -359,8 +359,6 @@ static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len,
 #define LeaveFunction(level)   do {} while (0)
 #endif
 
-#define	IP_VS_WAIT_WHILE(expr)	while (expr) { cpu_relax(); }
-
 
 /*
  *      The port number of FTP service (in network order).
@@ -712,10 +710,9 @@ struct ip_vs_dest_user_kern {
  *	and the forwarding entries
  */
 struct ip_vs_service {
-	struct list_head	s_list;   /* for normal service table */
-	struct list_head	f_list;   /* for fwmark-based service table */
+	struct hlist_node	s_list;   /* for normal service table */
+	struct hlist_node	f_list;   /* for fwmark-based service table */
 	atomic_t		refcnt;   /* reference counter */
-	atomic_t		usecnt;   /* use counter */
 
 	u16			af;       /* address family */
 	__u16			protocol; /* which protocol (TCP/UDP) */
@@ -730,15 +727,16 @@ struct ip_vs_service {
 	struct list_head	destinations;  /* real server d-linked list */
 	__u32			num_dests;     /* number of servers */
 	struct ip_vs_stats      stats;         /* statistics for the service */
-	struct ip_vs_app	*inc;	  /* bind conns to this app inc */
 
 	/* for scheduling */
-	struct ip_vs_scheduler	*scheduler;    /* bound scheduler object */
+	struct ip_vs_scheduler __rcu *scheduler; /* bound scheduler object */
 	spinlock_t		sched_lock;    /* lock sched_data */
 	void			*sched_data;   /* scheduler application data */
 
 	/* alternate persistence engine */
-	struct ip_vs_pe		*pe;
+	struct ip_vs_pe __rcu	*pe;
+
+	struct rcu_head		rcu_head;
 };
 
 /* Information for cached dst */
@@ -807,8 +805,6 @@ struct ip_vs_scheduler {
 	int (*init_service)(struct ip_vs_service *svc);
 	/* scheduling service finish */
 	void (*done_service)(struct ip_vs_service *svc);
-	/* scheduler updating service */
-	int (*update_service)(struct ip_vs_service *svc);
 	/* dest is linked */
 	int (*add_dest)(struct ip_vs_service *svc, struct ip_vs_dest *dest);
 	/* dest is unlinked */
@@ -1344,8 +1340,6 @@ extern void ip_vs_app_inc_put(struct ip_vs_app *inc);
 extern int ip_vs_app_pkt_out(struct ip_vs_conn *, struct sk_buff *skb);
 extern int ip_vs_app_pkt_in(struct ip_vs_conn *, struct sk_buff *skb);
 
-void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe);
-void ip_vs_unbind_pe(struct ip_vs_service *svc);
 int register_ip_vs_pe(struct ip_vs_pe *pe);
 int unregister_ip_vs_pe(struct ip_vs_pe *pe);
 struct ip_vs_pe *ip_vs_pe_getbyname(const char *name);
@@ -1392,7 +1386,8 @@ extern int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler);
 extern int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler);
 extern int ip_vs_bind_scheduler(struct ip_vs_service *svc,
 				struct ip_vs_scheduler *scheduler);
-extern void ip_vs_unbind_scheduler(struct ip_vs_service *svc);
+extern void ip_vs_unbind_scheduler(struct ip_vs_service *svc,
+				   struct ip_vs_scheduler *sched);
 extern struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name);
 extern void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler);
 extern struct ip_vs_conn *
@@ -1412,14 +1407,9 @@ extern struct ip_vs_stats ip_vs_stats;
 extern int sysctl_ip_vs_sync_ver;
 
 extern struct ip_vs_service *
-ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
+ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol,
 		  const union nf_inet_addr *vaddr, __be16 vport);
 
-static inline void ip_vs_service_put(struct ip_vs_service *svc)
-{
-	atomic_dec(&svc->usecnt);
-}
-
 extern bool
 ip_vs_has_real_service(struct net *net, int af, __u16 protocol,
 		       const union nf_inet_addr *daddr, __be16 dport);
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 939ad11ed534..79df3c61d4d8 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -203,7 +203,7 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
 {
 	ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr,
 			      vport, p);
-	p->pe = svc->pe;
+	p->pe = rcu_dereference(svc->pe);
 	if (p->pe && p->pe->fill_param)
 		return p->pe->fill_param(p, skb);
 
@@ -296,15 +296,16 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 	/* Check if a template already exists */
 	ct = ip_vs_ct_in_get(&param);
 	if (!ct || !ip_vs_check_template(ct)) {
+		struct ip_vs_scheduler *sched;
+
 		/*
 		 * No template found or the dest of the connection
 		 * template is not available.
 		 * return *ignored=0 i.e. ICMP and NF_DROP
 		 */
-		rcu_read_lock();
-		dest = svc->scheduler->schedule(svc, skb);
+		sched = rcu_dereference(svc->scheduler);
+		dest = sched->schedule(svc, skb);
 		if (!dest) {
-			rcu_read_unlock();
 			IP_VS_DBG(1, "p-schedule: no dest found.\n");
 			kfree(param.pe_data);
 			*ignored = 0;
@@ -320,7 +321,6 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 		 * when the template expires */
 		ct = ip_vs_conn_new(&param, &dest->addr, dport,
 				    IP_VS_CONN_F_TEMPLATE, dest, skb->mark);
-		rcu_read_unlock();
 		if (ct == NULL) {
 			kfree(param.pe_data);
 			*ignored = -1;
@@ -394,6 +394,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 {
 	struct ip_vs_protocol *pp = pd->pp;
 	struct ip_vs_conn *cp = NULL;
+	struct ip_vs_scheduler *sched;
 	struct ip_vs_dest *dest;
 	__be16 _ports[2], *pptr;
 	unsigned int flags;
@@ -449,10 +450,9 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 		return NULL;
 	}
 
-	rcu_read_lock();
-	dest = svc->scheduler->schedule(svc, skb);
+	sched = rcu_dereference(svc->scheduler);
+	dest = sched->schedule(svc, skb);
 	if (dest == NULL) {
-		rcu_read_unlock();
 		IP_VS_DBG(1, "Schedule: no dest found.\n");
 		return NULL;
 	}
@@ -473,7 +473,6 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 		cp = ip_vs_conn_new(&p, &dest->addr,
 				    dest->port ? dest->port : pptr[1],
 				    flags, dest, skb->mark);
-		rcu_read_unlock();
 		if (!cp) {
 			*ignored = -1;
 			return NULL;
@@ -510,7 +509,6 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 
 	pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
 	if (pptr == NULL) {
-		ip_vs_service_put(svc);
 		return NF_DROP;
 	}
 
@@ -536,8 +534,6 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 				      IP_VS_CONN_F_ONE_PACKET : 0;
 		union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } };
 
-		ip_vs_service_put(svc);
-
 		/* create a new connection entry */
 		IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
 		{
@@ -574,12 +570,8 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 	 * listed in the ipvs table), pass the packets, because it is
 	 * not ipvs job to decide to drop the packets.
 	 */
-	if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
-		ip_vs_service_put(svc);
+	if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT))
 		return NF_ACCEPT;
-	}
-
-	ip_vs_service_put(svc);
 
 	/*
 	 * Notify the client that the destination is unreachable, and
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 0763cc6e092b..9e4074c26dc2 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -55,9 +55,6 @@
 /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
 static DEFINE_MUTEX(__ip_vs_mutex);
 
-/* lock for service table */
-static DEFINE_RWLOCK(__ip_vs_svc_lock);
-
 /* sysctl variables */
 
 #ifdef CONFIG_IP_VS_DEBUG
@@ -257,9 +254,9 @@ ip_vs_use_count_dec(void)
 #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
 
 /* the service table hashed by <protocol, addr, port> */
-static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
+static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
 /* the service table hashed by fwmark */
-static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
+static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
 
 
 /*
@@ -314,13 +311,13 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc)
 		 */
 		hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
 					 &svc->addr, svc->port);
-		list_add(&svc->s_list, &ip_vs_svc_table[hash]);
+		hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]);
 	} else {
 		/*
 		 *  Hash it by fwmark in svc_fwm_table
 		 */
 		hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
-		list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
+		hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
 	}
 
 	svc->flags |= IP_VS_SVC_F_HASHED;
@@ -344,10 +341,10 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
 
 	if (svc->fwmark == 0) {
 		/* Remove it from the svc_table table */
-		list_del(&svc->s_list);
+		hlist_del_rcu(&svc->s_list);
 	} else {
 		/* Remove it from the svc_fwm_table table */
-		list_del(&svc->f_list);
+		hlist_del_rcu(&svc->f_list);
 	}
 
 	svc->flags &= ~IP_VS_SVC_F_HASHED;
@@ -369,7 +366,7 @@ __ip_vs_service_find(struct net *net, int af, __u16 protocol,
 	/* Check for "full" addressed entries */
 	hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
 
-	list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
+	hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) {
 		if ((svc->af == af)
 		    && ip_vs_addr_equal(af, &svc->addr, vaddr)
 		    && (svc->port == vport)
@@ -396,7 +393,7 @@ __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
 	/* Check for fwmark addressed entries */
 	hash = ip_vs_svc_fwm_hashkey(net, fwmark);
 
-	list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
+	hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) {
 		if (svc->fwmark == fwmark && svc->af == af
 		    && net_eq(svc->net, net)) {
 			/* HIT */
@@ -407,15 +404,14 @@ __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
 	return NULL;
 }
 
+/* Find service, called under RCU lock */
 struct ip_vs_service *
-ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
-		  const union nf_inet_addr *vaddr, __be16 vport)
+ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol,
+		   const union nf_inet_addr *vaddr, __be16 vport)
 {
 	struct ip_vs_service *svc;
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
-	read_lock(&__ip_vs_svc_lock);
-
 	/*
 	 *	Check the table hashed by fwmark first
 	 */
@@ -451,10 +447,6 @@ ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
 	}
 
   out:
-	if (svc)
-		atomic_inc(&svc->usecnt);
-	read_unlock(&__ip_vs_svc_lock);
-
 	IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
 		      fwmark, ip_vs_proto_name(protocol),
 		      IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
@@ -471,6 +463,13 @@ __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
 	dest->svc = svc;
 }
 
+static void ip_vs_service_free(struct ip_vs_service *svc)
+{
+	if (svc->stats.cpustats)
+		free_percpu(svc->stats.cpustats);
+	kfree(svc);
+}
+
 static void
 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
 {
@@ -478,12 +477,11 @@ __ip_vs_unbind_svc(struct ip_vs_dest *dest)
 
 	dest->svc = NULL;
 	if (atomic_dec_and_test(&svc->refcnt)) {
-		IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
+		IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
 			      svc->fwmark,
 			      IP_VS_DBG_ADDR(svc->af, &svc->addr),
-			      ntohs(svc->port), atomic_read(&svc->usecnt));
-		free_percpu(svc->stats.cpustats);
-		kfree(svc);
+			      ntohs(svc->port));
+		ip_vs_service_free(svc);
 	}
 }
 
@@ -608,7 +606,7 @@ struct ip_vs_dest *ip_vs_find_dest(struct net  *net, int af,
 	struct ip_vs_service *svc;
 	__be16 port = dport;
 
-	svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport);
+	svc = ip_vs_service_find(net, af, fwmark, protocol, vaddr, vport);
 	if (!svc)
 		return NULL;
 	if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
@@ -616,7 +614,6 @@ struct ip_vs_dest *ip_vs_find_dest(struct net  *net, int af,
 	dest = ip_vs_lookup_dest(svc, daddr, port);
 	if (!dest)
 		dest = ip_vs_lookup_dest(svc, daddr, port ^ dport);
-	ip_vs_service_put(svc);
 	return dest;
 }
 
@@ -774,6 +771,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
 		    struct ip_vs_dest_user_kern *udest, int add)
 {
 	struct netns_ipvs *ipvs = net_ipvs(svc->net);
+	struct ip_vs_scheduler *sched;
 	int conn_flags;
 
 	/* set the weight and the flags */
@@ -816,29 +814,17 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
 	__ip_vs_dst_cache_reset(dest);
 	spin_unlock_bh(&dest->dst_lock);
 
-	if (add)
-		ip_vs_start_estimator(svc->net, &dest->stats);
-
-	write_lock_bh(&__ip_vs_svc_lock);
-
-	/* Wait until all other svc users go away */
-	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
-
+	sched = rcu_dereference_protected(svc->scheduler, 1);
 	if (add) {
+		ip_vs_start_estimator(svc->net, &dest->stats);
 		list_add_rcu(&dest->n_list, &svc->destinations);
 		svc->num_dests++;
-		if (svc->scheduler->add_dest)
-			svc->scheduler->add_dest(svc, dest);
+		if (sched->add_dest)
+			sched->add_dest(svc, dest);
 	} else {
-		if (svc->scheduler->upd_dest)
-			svc->scheduler->upd_dest(svc, dest);
+		if (sched->upd_dest)
+			sched->upd_dest(svc, dest);
 	}
-
-	/* call the update_service, because server weight may be changed */
-	if (svc->scheduler->update_service)
-		svc->scheduler->update_service(svc);
-
-	write_unlock_bh(&__ip_vs_svc_lock);
 }
 
 
@@ -1071,14 +1057,13 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
 	list_del_rcu(&dest->n_list);
 	svc->num_dests--;
 
-	if (svcupd && svc->scheduler->del_dest)
-		svc->scheduler->del_dest(svc, dest);
+	if (svcupd) {
+		struct ip_vs_scheduler *sched;
 
-	/*
-	 *  Call the update_service function of its scheduler
-	 */
-	if (svcupd && svc->scheduler->update_service)
-			svc->scheduler->update_service(svc);
+		sched = rcu_dereference_protected(svc->scheduler, 1);
+		if (sched->del_dest)
+			sched->del_dest(svc, dest);
+	}
 }
 
 
@@ -1103,20 +1088,11 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 		return -ENOENT;
 	}
 
-	write_lock_bh(&__ip_vs_svc_lock);
-
-	/*
-	 *	Wait until all other svc users go away.
-	 */
-	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
-
 	/*
 	 *	Unlink dest from the service
 	 */
 	__ip_vs_unlink_dest(svc, dest, 1);
 
-	write_unlock_bh(&__ip_vs_svc_lock);
-
 	/*
 	 *	Delete the destination
 	 */
@@ -1207,7 +1183,6 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
 	}
 
 	/* I'm the first user of the service */
-	atomic_set(&svc->usecnt, 0);
 	atomic_set(&svc->refcnt, 0);
 
 	svc->af = u->af;
@@ -1231,7 +1206,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
 	sched = NULL;
 
 	/* Bind the ct retriever */
-	ip_vs_bind_pe(svc, pe);
+	RCU_INIT_POINTER(svc->pe, pe);
 	pe = NULL;
 
 	/* Update the virtual service counters */
@@ -1247,9 +1222,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
 		ipvs->num_services++;
 
 	/* Hash the service into the service table */
-	write_lock_bh(&__ip_vs_svc_lock);
 	ip_vs_svc_hash(svc);
-	write_unlock_bh(&__ip_vs_svc_lock);
 
 	*svc_p = svc;
 	/* Now there is a service - full throttle */
@@ -1259,15 +1232,8 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
 
  out_err:
 	if (svc != NULL) {
-		ip_vs_unbind_scheduler(svc);
-		if (svc->inc) {
-			local_bh_disable();
-			ip_vs_app_inc_put(svc->inc);
-			local_bh_enable();
-		}
-		if (svc->stats.cpustats)
-			free_percpu(svc->stats.cpustats);
-		kfree(svc);
+		ip_vs_unbind_scheduler(svc, sched);
+		ip_vs_service_free(svc);
 	}
 	ip_vs_scheduler_put(sched);
 	ip_vs_pe_put(pe);
@@ -1317,12 +1283,17 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
 	}
 #endif
 
-	write_lock_bh(&__ip_vs_svc_lock);
-
-	/*
-	 * Wait until all other svc users go away.
-	 */
-	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
+	old_sched = rcu_dereference_protected(svc->scheduler, 1);
+	if (sched != old_sched) {
+		/* Bind the new scheduler */
+		ret = ip_vs_bind_scheduler(svc, sched);
+		if (ret) {
+			old_sched = sched;
+			goto out;
+		}
+		/* Unbind the old scheduler on success */
+		ip_vs_unbind_scheduler(svc, old_sched);
+	}
 
 	/*
 	 * Set the flags and timeout value
@@ -1331,47 +1302,23 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
 	svc->timeout = u->timeout * HZ;
 	svc->netmask = u->netmask;
 
-	old_sched = svc->scheduler;
-	if (sched != old_sched) {
-		/*
-		 * Unbind the old scheduler
-		 */
-		ip_vs_unbind_scheduler(svc);
+	old_pe = rcu_dereference_protected(svc->pe, 1);
+	if (pe != old_pe)
+		rcu_assign_pointer(svc->pe, pe);
 
-		/*
-		 * Bind the new scheduler
-		 */
-		if ((ret = ip_vs_bind_scheduler(svc, sched))) {
-			/*
-			 * If ip_vs_bind_scheduler fails, restore the old
-			 * scheduler.
-			 * The main reason of failure is out of memory.
-			 *
-			 * The question is if the old scheduler can be
-			 * restored all the time. TODO: if it cannot be
-			 * restored some time, we must delete the service,
-			 * otherwise the system may crash.
-			 */
-			ip_vs_bind_scheduler(svc, old_sched);
-			old_sched = sched;
-			goto out_unlock;
-		}
-	}
-
-	old_pe = svc->pe;
-	if (pe != old_pe) {
-		ip_vs_unbind_pe(svc);
-		ip_vs_bind_pe(svc, pe);
-	}
-
-out_unlock:
-	write_unlock_bh(&__ip_vs_svc_lock);
 out:
 	ip_vs_scheduler_put(old_sched);
 	ip_vs_pe_put(old_pe);
 	return ret;
 }
 
+static void ip_vs_service_rcu_free(struct rcu_head *head)
+{
+	struct ip_vs_service *svc;
+
+	svc = container_of(head, struct ip_vs_service, rcu_head);
+	ip_vs_service_free(svc);
+}
 
 /*
  *	Delete a service from the service list
@@ -1394,21 +1341,14 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
 	ip_vs_stop_estimator(svc->net, &svc->stats);
 
 	/* Unbind scheduler */
-	old_sched = svc->scheduler;
-	ip_vs_unbind_scheduler(svc);
+	old_sched = rcu_dereference_protected(svc->scheduler, 1);
+	ip_vs_unbind_scheduler(svc, old_sched);
 	ip_vs_scheduler_put(old_sched);
 
-	/* Unbind persistence engine */
-	old_pe = svc->pe;
-	ip_vs_unbind_pe(svc);
+	/* Unbind persistence engine, keep svc->pe */
+	old_pe = rcu_dereference_protected(svc->pe, 1);
 	ip_vs_pe_put(old_pe);
 
-	/* Unbind app inc */
-	if (svc->inc) {
-		ip_vs_app_inc_put(svc->inc);
-		svc->inc = NULL;
-	}
-
 	/*
 	 *    Unlink the whole destination list
 	 */
@@ -1428,13 +1368,12 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
 	/*
 	 *    Free the service if nobody refers to it
 	 */
-	if (atomic_read(&svc->refcnt) == 0) {
-		IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
+	if (atomic_dec_and_test(&svc->refcnt)) {
+		IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
 			      svc->fwmark,
 			      IP_VS_DBG_ADDR(svc->af, &svc->addr),
-			      ntohs(svc->port), atomic_read(&svc->usecnt));
-		free_percpu(svc->stats.cpustats);
-		kfree(svc);
+			      ntohs(svc->port));
+		call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
 	}
 
 	/* decrease the module use count */
@@ -1446,21 +1385,14 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
  */
 static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup)
 {
+	/* Hold svc to avoid double release from dest_trash */
+	atomic_inc(&svc->refcnt);
 	/*
 	 * Unhash it from the service table
 	 */
-	write_lock_bh(&__ip_vs_svc_lock);
-
 	ip_vs_svc_unhash(svc);
 
-	/*
-	 * Wait until all the svc users go away.
-	 */
-	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
-
 	__ip_vs_del_service(svc, cleanup);
-
-	write_unlock_bh(&__ip_vs_svc_lock);
 }
 
 /*
@@ -1482,14 +1414,15 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
 static int ip_vs_flush(struct net *net, bool cleanup)
 {
 	int idx;
-	struct ip_vs_service *svc, *nxt;
+	struct ip_vs_service *svc;
+	struct hlist_node *n;
 
 	/*
 	 * Flush the service table hashed by <netns,protocol,addr,port>
 	 */
 	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
-		list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx],
-					 s_list) {
+		hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx],
+					  s_list) {
 			if (net_eq(svc->net, net))
 				ip_vs_unlink_service(svc, cleanup);
 		}
@@ -1499,8 +1432,8 @@ static int ip_vs_flush(struct net *net, bool cleanup)
 	 * Flush the service table hashed by fwmark
 	 */
 	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
-		list_for_each_entry_safe(svc, nxt,
-					 &ip_vs_svc_fwm_table[idx], f_list) {
+		hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx],
+					  f_list) {
 			if (net_eq(svc->net, net))
 				ip_vs_unlink_service(svc, cleanup);
 		}
@@ -1558,7 +1491,7 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
 	EnterFunction(2);
 	mutex_lock(&__ip_vs_mutex);
 	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
-		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+		hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
 			if (net_eq(svc->net, net)) {
 				list_for_each_entry(dest, &svc->destinations,
 						    n_list) {
@@ -1567,7 +1500,7 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
 			}
 		}
 
-		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
 			if (net_eq(svc->net, net)) {
 				list_for_each_entry(dest, &svc->destinations,
 						    n_list) {
@@ -1595,12 +1528,10 @@ static int ip_vs_zero_service(struct ip_vs_service *svc)
 {
 	struct ip_vs_dest *dest;
 
-	write_lock_bh(&__ip_vs_svc_lock);
 	list_for_each_entry(dest, &svc->destinations, n_list) {
 		ip_vs_zero_stats(&dest->stats);
 	}
 	ip_vs_zero_stats(&svc->stats);
-	write_unlock_bh(&__ip_vs_svc_lock);
 	return 0;
 }
 
@@ -1610,14 +1541,14 @@ static int ip_vs_zero_all(struct net *net)
 	struct ip_vs_service *svc;
 
 	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
-		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+		hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
 			if (net_eq(svc->net, net))
 				ip_vs_zero_service(svc);
 		}
 	}
 
 	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
-		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
 			if (net_eq(svc->net, net))
 				ip_vs_zero_service(svc);
 		}
@@ -1945,7 +1876,7 @@ static struct ctl_table vs_vars[] = {
 
 struct ip_vs_iter {
 	struct seq_net_private p;  /* Do not move this, netns depends upon it*/
-	struct list_head *table;
+	struct hlist_head *table;
 	int bucket;
 };
 
@@ -1978,7 +1909,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
 
 	/* look in hash by protocol */
 	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
-		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+		hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) {
 			if (net_eq(svc->net, net) && pos-- == 0) {
 				iter->table = ip_vs_svc_table;
 				iter->bucket = idx;
@@ -1989,7 +1920,8 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
 
 	/* keep looking in fwmark */
 	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
-		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+		hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx],
+					 f_list) {
 			if (net_eq(svc->net, net) && pos-- == 0) {
 				iter->table = ip_vs_svc_fwm_table;
 				iter->bucket = idx;
@@ -2002,17 +1934,16 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
 }
 
 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
-__acquires(__ip_vs_svc_lock)
 {
 
-	read_lock_bh(&__ip_vs_svc_lock);
+	rcu_read_lock();
 	return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
 }
 
 
 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-	struct list_head *e;
+	struct hlist_node *e;
 	struct ip_vs_iter *iter;
 	struct ip_vs_service *svc;
 
@@ -2025,13 +1956,14 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
 	if (iter->table == ip_vs_svc_table) {
 		/* next service in table hashed by protocol */
-		if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
-			return list_entry(e, struct ip_vs_service, s_list);
-
+		e = rcu_dereference(hlist_next_rcu(&svc->s_list));
+		if (e)
+			return hlist_entry(e, struct ip_vs_service, s_list);
 
 		while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
-			list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
-					    s_list) {
+			hlist_for_each_entry_rcu(svc,
+						 &ip_vs_svc_table[iter->bucket],
+						 s_list) {
 				return svc;
 			}
 		}
@@ -2042,13 +1974,15 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	}
 
 	/* next service in hashed by fwmark */
-	if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
-		return list_entry(e, struct ip_vs_service, f_list);
+	e = rcu_dereference(hlist_next_rcu(&svc->f_list));
+	if (e)
+		return hlist_entry(e, struct ip_vs_service, f_list);
 
  scan_fwmark:
 	while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
-		list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
-				    f_list)
+		hlist_for_each_entry_rcu(svc,
+					 &ip_vs_svc_fwm_table[iter->bucket],
+					 f_list)
 			return svc;
 	}
 
@@ -2056,9 +1990,8 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 }
 
 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
-__releases(__ip_vs_svc_lock)
 {
-	read_unlock_bh(&__ip_vs_svc_lock);
+	rcu_read_unlock();
 }
 
 
@@ -2076,6 +2009,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
 		const struct ip_vs_service *svc = v;
 		const struct ip_vs_iter *iter = seq->private;
 		const struct ip_vs_dest *dest;
+		struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler);
 
 		if (iter->table == ip_vs_svc_table) {
 #ifdef CONFIG_IP_VS_IPV6
@@ -2084,18 +2018,18 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
 					   ip_vs_proto_name(svc->protocol),
 					   &svc->addr.in6,
 					   ntohs(svc->port),
-					   svc->scheduler->name);
+					   sched->name);
 			else
 #endif
 				seq_printf(seq, "%s  %08X:%04X %s %s ",
 					   ip_vs_proto_name(svc->protocol),
 					   ntohl(svc->addr.ip),
 					   ntohs(svc->port),
-					   svc->scheduler->name,
+					   sched->name,
 					   (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
 		} else {
 			seq_printf(seq, "FWM  %08X %s %s",
-				   svc->fwmark, svc->scheduler->name,
+				   svc->fwmark, sched->name,
 				   (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
 		}
 
@@ -2451,11 +2385,13 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 	}
 
 	/* Lookup the exact service by <protocol, addr, port> or fwmark */
+	rcu_read_lock();
 	if (usvc.fwmark == 0)
 		svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
 					   &usvc.addr, usvc.port);
 	else
 		svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
+	rcu_read_unlock();
 
 	if (cmd != IP_VS_SO_SET_ADD
 	    && (svc == NULL || svc->protocol != usvc.protocol)) {
@@ -2507,11 +2443,14 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 static void
 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
 {
+	struct ip_vs_scheduler *sched;
+
+	sched = rcu_dereference_protected(src->scheduler, 1);
 	dst->protocol = src->protocol;
 	dst->addr = src->addr.ip;
 	dst->port = src->port;
 	dst->fwmark = src->fwmark;
-	strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
+	strlcpy(dst->sched_name, sched->name, sizeof(dst->sched_name));
 	dst->flags = src->flags;
 	dst->timeout = src->timeout / HZ;
 	dst->netmask = src->netmask;
@@ -2530,7 +2469,7 @@ __ip_vs_get_service_entries(struct net *net,
 	int ret = 0;
 
 	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
-		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+		hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
 			/* Only expose IPv4 entries to old interface */
 			if (svc->af != AF_INET || !net_eq(svc->net, net))
 				continue;
@@ -2549,7 +2488,7 @@ __ip_vs_get_service_entries(struct net *net,
 	}
 
 	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
-		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
 			/* Only expose IPv4 entries to old interface */
 			if (svc->af != AF_INET || !net_eq(svc->net, net))
 				continue;
@@ -2578,11 +2517,13 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
 	union nf_inet_addr addr = { .ip = get->addr };
 	int ret = 0;
 
+	rcu_read_lock();
 	if (get->fwmark)
 		svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
 	else
 		svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
 					   get->port);
+	rcu_read_unlock();
 
 	if (svc) {
 		int count = 0;
@@ -2765,12 +2706,14 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 
 		entry = (struct ip_vs_service_entry *)arg;
 		addr.ip = entry->addr;
+		rcu_read_lock();
 		if (entry->fwmark)
 			svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
 		else
 			svc = __ip_vs_service_find(net, AF_INET,
 						   entry->protocol, &addr,
 						   entry->port);
+		rcu_read_unlock();
 		if (svc) {
 			ip_vs_copy_service(entry, svc);
 			if (copy_to_user(user, entry, sizeof(*entry)) != 0)
@@ -2927,6 +2870,7 @@ nla_put_failure:
 static int ip_vs_genl_fill_service(struct sk_buff *skb,
 				   struct ip_vs_service *svc)
 {
+	struct ip_vs_scheduler *sched;
 	struct nlattr *nl_service;
 	struct ip_vs_flags flags = { .flags = svc->flags,
 				     .mask = ~0 };
@@ -2947,7 +2891,8 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb,
 			goto nla_put_failure;
 	}
 
-	if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name) ||
+	sched = rcu_dereference_protected(svc->scheduler, 1);
+	if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched->name) ||
 	    (svc->pe &&
 	     nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name)) ||
 	    nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
@@ -2998,7 +2943,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
 
 	mutex_lock(&__ip_vs_mutex);
 	for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
-		list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
+		hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
 			if (++idx <= start || !net_eq(svc->net, net))
 				continue;
 			if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
@@ -3009,7 +2954,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
 	}
 
 	for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
-		list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
+		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
 			if (++idx <= start || !net_eq(svc->net, net))
 				continue;
 			if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
@@ -3069,11 +3014,13 @@ static int ip_vs_genl_parse_service(struct net *net,
 		usvc->fwmark = 0;
 	}
 
+	rcu_read_lock();
 	if (usvc->fwmark)
 		svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
 	else
 		svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
 					   &usvc->addr, usvc->port);
+	rcu_read_unlock();
 	*ret_svc = svc;
 
 	/* If a full entry was requested, check for the additional fields */
@@ -3905,8 +3852,8 @@ int __init ip_vs_control_init(void)
 
 	/* Initialize svc_table, ip_vs_svc_fwm_table */
 	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
-		INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
-		INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
+		INIT_HLIST_HEAD(&ip_vs_svc_table[idx]);
+		INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[idx]);
 	}
 
 	smp_wmb();	/* Do we really need it now ? */
diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c
index 89c27230d93a..ccab120df45e 100644
--- a/net/netfilter/ipvs/ip_vs_dh.c
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -269,6 +269,7 @@ static int __init ip_vs_dh_init(void)
 static void __exit ip_vs_dh_cleanup(void)
 {
 	unregister_ip_vs_scheduler(&ip_vs_dh_scheduler);
+	synchronize_rcu();
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index ffef8a162148..d8e5238254db 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -633,6 +633,7 @@ static void __exit ip_vs_lblc_cleanup(void)
 {
 	unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
 	unregister_pernet_subsys(&ip_vs_lblc_ops);
+	synchronize_rcu();
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index cdfe6a95eddb..041b7cc356fb 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -821,6 +821,7 @@ static void __exit ip_vs_lblcr_cleanup(void)
 {
 	unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
 	unregister_pernet_subsys(&ip_vs_lblcr_ops);
+	synchronize_rcu();
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c
index 0cabf78fbc31..5128e338a749 100644
--- a/net/netfilter/ipvs/ip_vs_lc.c
+++ b/net/netfilter/ipvs/ip_vs_lc.c
@@ -84,6 +84,7 @@ static int __init ip_vs_lc_init(void)
 static void __exit ip_vs_lc_cleanup(void)
 {
 	unregister_ip_vs_scheduler(&ip_vs_lc_scheduler);
+	synchronize_rcu();
 }
 
 module_init(ip_vs_lc_init);
diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c
index 51dc0cf20d90..646cfd4baa73 100644
--- a/net/netfilter/ipvs/ip_vs_nq.c
+++ b/net/netfilter/ipvs/ip_vs_nq.c
@@ -133,6 +133,7 @@ static int __init ip_vs_nq_init(void)
 static void __exit ip_vs_nq_cleanup(void)
 {
 	unregister_ip_vs_scheduler(&ip_vs_nq_scheduler);
+	synchronize_rcu();
 }
 
 module_init(ip_vs_nq_init);
diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c
index 5d9774c4cc4c..1a82b29ce8ea 100644
--- a/net/netfilter/ipvs/ip_vs_pe.c
+++ b/net/netfilter/ipvs/ip_vs_pe.c
@@ -16,18 +16,6 @@ static LIST_HEAD(ip_vs_pe);
 /* semaphore for IPVS PEs. */
 static DEFINE_MUTEX(ip_vs_pe_mutex);
 
-/* Bind a service with a pe */
-void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe)
-{
-	svc->pe = pe;
-}
-
-/* Unbind a service from its pe */
-void ip_vs_unbind_pe(struct ip_vs_service *svc)
-{
-	svc->pe = NULL;
-}
-
 /* Get pe in the pe list by name */
 struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name)
 {
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index f7190cdf023e..4de5176a9981 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -27,9 +27,10 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 	if (sch == NULL)
 		return 0;
 	net = skb_net(skb);
+	rcu_read_lock();
 	if ((sch->type == SCTP_CID_INIT) &&
-	    (svc = ip_vs_service_get(net, af, skb->mark, iph->protocol,
-				     &iph->daddr, sh->dest))) {
+	    (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
+				      &iph->daddr, sh->dest))) {
 		int ignored;
 
 		if (ip_vs_todrop(net_ipvs(net))) {
@@ -37,7 +38,7 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 			 * It seems that we are very loaded.
 			 * We have to drop this packet :(
 			 */
-			ip_vs_service_put(svc);
+			rcu_read_unlock();
 			*verdict = NF_DROP;
 			return 0;
 		}
@@ -49,14 +50,13 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 		if (!*cpp && ignored <= 0) {
 			if (!ignored)
 				*verdict = ip_vs_leave(svc, skb, pd, iph);
-			else {
-				ip_vs_service_put(svc);
+			else
 				*verdict = NF_DROP;
-			}
+			rcu_read_unlock();
 			return 0;
 		}
-		ip_vs_service_put(svc);
 	}
+	rcu_read_unlock();
 	/* NF_ACCEPT */
 	return 1;
 }
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 0bbc3feae682..7de3342e9797 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -47,9 +47,10 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 	}
 	net = skb_net(skb);
 	/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
+	rcu_read_lock();
 	if (th->syn &&
-	    (svc = ip_vs_service_get(net, af, skb->mark, iph->protocol,
-				     &iph->daddr, th->dest))) {
+	    (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
+				      &iph->daddr, th->dest))) {
 		int ignored;
 
 		if (ip_vs_todrop(net_ipvs(net))) {
@@ -57,7 +58,7 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 			 * It seems that we are very loaded.
 			 * We have to drop this packet :(
 			 */
-			ip_vs_service_put(svc);
+			rcu_read_unlock();
 			*verdict = NF_DROP;
 			return 0;
 		}
@@ -70,14 +71,13 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 		if (!*cpp && ignored <= 0) {
 			if (!ignored)
 				*verdict = ip_vs_leave(svc, skb, pd, iph);
-			else {
-				ip_vs_service_put(svc);
+			else
 				*verdict = NF_DROP;
-			}
+			rcu_read_unlock();
 			return 0;
 		}
-		ip_vs_service_put(svc);
 	}
+	rcu_read_unlock();
 	/* NF_ACCEPT */
 	return 1;
 }
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 1a03e2d9c6ba..b62a3c0ff9bf 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -44,8 +44,9 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 		return 0;
 	}
 	net = skb_net(skb);
-	svc = ip_vs_service_get(net, af, skb->mark, iph->protocol,
-				&iph->daddr, uh->dest);
+	rcu_read_lock();
+	svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
+				 &iph->daddr, uh->dest);
 	if (svc) {
 		int ignored;
 
@@ -54,7 +55,7 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 			 * It seems that we are very loaded.
 			 * We have to drop this packet :(
 			 */
-			ip_vs_service_put(svc);
+			rcu_read_unlock();
 			*verdict = NF_DROP;
 			return 0;
 		}
@@ -67,14 +68,13 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 		if (!*cpp && ignored <= 0) {
 			if (!ignored)
 				*verdict = ip_vs_leave(svc, skb, pd, iph);
-			else {
-				ip_vs_service_put(svc);
+			else
 				*verdict = NF_DROP;
-			}
+			rcu_read_unlock();
 			return 0;
 		}
-		ip_vs_service_put(svc);
 	}
+	rcu_read_unlock();
 	/* NF_ACCEPT */
 	return 1;
 }
diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c
index aa4601ff1ccc..749c98a7dd2c 100644
--- a/net/netfilter/ipvs/ip_vs_rr.c
+++ b/net/netfilter/ipvs/ip_vs_rr.c
@@ -121,6 +121,7 @@ static int __init ip_vs_rr_init(void)
 static void __exit ip_vs_rr_cleanup(void)
 {
 	unregister_ip_vs_scheduler(&ip_vs_rr_scheduler);
+	synchronize_rcu();
 }
 
 module_init(ip_vs_rr_init);
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
index 1b715d0caf43..4dbcda6258bc 100644
--- a/net/netfilter/ipvs/ip_vs_sched.c
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -47,8 +47,6 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
 {
 	int ret;
 
-	svc->scheduler = scheduler;
-
 	if (scheduler->init_service) {
 		ret = scheduler->init_service(svc);
 		if (ret) {
@@ -56,7 +54,7 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
 			return ret;
 		}
 	}
-
+	rcu_assign_pointer(svc->scheduler, scheduler);
 	return 0;
 }
 
@@ -64,17 +62,19 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
 /*
  *  Unbind a service with its scheduler
  */
-void ip_vs_unbind_scheduler(struct ip_vs_service *svc)
+void ip_vs_unbind_scheduler(struct ip_vs_service *svc,
+			    struct ip_vs_scheduler *sched)
 {
-	struct ip_vs_scheduler *sched = svc->scheduler;
+	struct ip_vs_scheduler *cur_sched;
 
-	if (!sched)
+	cur_sched = rcu_dereference_protected(svc->scheduler, 1);
+	/* This check proves that old 'sched' was installed */
+	if (!cur_sched)
 		return;
 
 	if (sched->done_service)
 		sched->done_service(svc);
-
-	svc->scheduler = NULL;
+	/* svc->scheduler can not be set to NULL */
 }
 
 
@@ -148,21 +148,21 @@ void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
 
 void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg)
 {
+	struct ip_vs_scheduler *sched;
+
+	sched = rcu_dereference(svc->scheduler);
 	if (svc->fwmark) {
 		IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n",
-			     svc->scheduler->name, svc->fwmark,
-			     svc->fwmark, msg);
+			     sched->name, svc->fwmark, svc->fwmark, msg);
 #ifdef CONFIG_IP_VS_IPV6
 	} else if (svc->af == AF_INET6) {
 		IP_VS_ERR_RL("%s: %s [%pI6c]:%d - %s\n",
-			     svc->scheduler->name,
-			     ip_vs_proto_name(svc->protocol),
+			     sched->name, ip_vs_proto_name(svc->protocol),
 			     &svc->addr.in6, ntohs(svc->port), msg);
 #endif
 	} else {
 		IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n",
-			     svc->scheduler->name,
-			     ip_vs_proto_name(svc->protocol),
+			     sched->name, ip_vs_proto_name(svc->protocol),
 			     &svc->addr.ip, ntohs(svc->port), msg);
 	}
 }
diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c
index d01187084b7f..f3205925359a 100644
--- a/net/netfilter/ipvs/ip_vs_sed.c
+++ b/net/netfilter/ipvs/ip_vs_sed.c
@@ -134,6 +134,7 @@ static int __init ip_vs_sed_init(void)
 static void __exit ip_vs_sed_cleanup(void)
 {
 	unregister_ip_vs_scheduler(&ip_vs_sed_scheduler);
+	synchronize_rcu();
 }
 
 module_init(ip_vs_sed_init);
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index 81c1a10c7b49..0df269d7c99f 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -283,6 +283,7 @@ static int __init ip_vs_sh_init(void)
 static void __exit ip_vs_sh_cleanup(void)
 {
 	unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
+	synchronize_rcu();
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c
index dafae881c622..c60a81c4ce9a 100644
--- a/net/netfilter/ipvs/ip_vs_wlc.c
+++ b/net/netfilter/ipvs/ip_vs_wlc.c
@@ -106,6 +106,7 @@ static int __init ip_vs_wlc_init(void)
 static void __exit ip_vs_wlc_cleanup(void)
 {
 	unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler);
+	synchronize_rcu();
 }
 
 module_init(ip_vs_wlc_init);
diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c
index b173ef907a14..32c646eb8747 100644
--- a/net/netfilter/ipvs/ip_vs_wrr.c
+++ b/net/netfilter/ipvs/ip_vs_wrr.c
@@ -261,6 +261,7 @@ static int __init ip_vs_wrr_init(void)
 static void __exit ip_vs_wrr_cleanup(void)
 {
 	unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
+	synchronize_rcu();
 }
 
 module_init(ip_vs_wrr_init);

From ac69269a45e84c1772dcb9e77db976a932f4af22 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 22 Mar 2013 11:46:54 +0200
Subject: [PATCH 34/49] ipvs: do not disable bh for long time

We used a global BH disable in LOCAL_OUT hook.
Add _bh suffix to all places that need it and remove
the disabling from LOCAL_OUT and sync code.

Functions like ip_defrag need protection from
BH, so add it. As for nf_nat_mangle_tcp_packet, it needs
RCU lock.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_app.c        |  4 +--
 net/netfilter/ipvs/ip_vs_conn.c       | 30 +++++++++++-----------
 net/netfilter/ipvs/ip_vs_core.c       | 37 ++++++---------------------
 net/netfilter/ipvs/ip_vs_ftp.c        |  2 ++
 net/netfilter/ipvs/ip_vs_lblc.c       |  4 +--
 net/netfilter/ipvs/ip_vs_lblcr.c      | 12 ++++-----
 net/netfilter/ipvs/ip_vs_proto_sctp.c |  4 +--
 net/netfilter/ipvs/ip_vs_proto_tcp.c  |  8 +++---
 net/netfilter/ipvs/ip_vs_rr.c         |  6 ++---
 net/netfilter/ipvs/ip_vs_sync.c       | 24 ++++++++---------
 net/netfilter/ipvs/ip_vs_wrr.c        |  4 +--
 net/netfilter/ipvs/ip_vs_xmit.c       | 16 ++++++------
 12 files changed, 64 insertions(+), 87 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index a95603004bc4..dfd7b65b3d2a 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -352,14 +352,14 @@ static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
 				 unsigned int flag, __u32 seq, int diff)
 {
 	/* spinlock is to keep updating cp->flags atomic */
-	spin_lock(&cp->lock);
+	spin_lock_bh(&cp->lock);
 	if (!(cp->flags & flag) || after(seq, vseq->init_seq)) {
 		vseq->previous_delta = vseq->delta;
 		vseq->delta += diff;
 		vseq->init_seq = seq;
 		cp->flags |= flag;
 	}
-	spin_unlock(&cp->lock);
+	spin_unlock_bh(&cp->lock);
 }
 
 static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 54de34077b63..de6475894a39 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -86,14 +86,14 @@ struct ip_vs_aligned_lock
 static struct ip_vs_aligned_lock
 __ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
 
-static inline void ct_write_lock(unsigned int key)
+static inline void ct_write_lock_bh(unsigned int key)
 {
-	spin_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+	spin_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
 }
 
-static inline void ct_write_unlock(unsigned int key)
+static inline void ct_write_unlock_bh(unsigned int key)
 {
-	spin_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+	spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
 }
 
 
@@ -167,7 +167,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
 	/* Hash by protocol, client address and port */
 	hash = ip_vs_conn_hashkey_conn(cp);
 
-	ct_write_lock(hash);
+	ct_write_lock_bh(hash);
 	spin_lock(&cp->lock);
 
 	if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
@@ -182,7 +182,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
 	}
 
 	spin_unlock(&cp->lock);
-	ct_write_unlock(hash);
+	ct_write_unlock_bh(hash);
 
 	return ret;
 }
@@ -200,7 +200,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
 	/* unhash it and decrease its reference counter */
 	hash = ip_vs_conn_hashkey_conn(cp);
 
-	ct_write_lock(hash);
+	ct_write_lock_bh(hash);
 	spin_lock(&cp->lock);
 
 	if (cp->flags & IP_VS_CONN_F_HASHED) {
@@ -212,7 +212,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
 		ret = 0;
 
 	spin_unlock(&cp->lock);
-	ct_write_unlock(hash);
+	ct_write_unlock_bh(hash);
 
 	return ret;
 }
@@ -227,7 +227,7 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
 
 	hash = ip_vs_conn_hashkey_conn(cp);
 
-	ct_write_lock(hash);
+	ct_write_lock_bh(hash);
 	spin_lock(&cp->lock);
 
 	if (cp->flags & IP_VS_CONN_F_HASHED) {
@@ -242,7 +242,7 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
 		ret = atomic_read(&cp->refcnt) ? false : true;
 
 	spin_unlock(&cp->lock);
-	ct_write_unlock(hash);
+	ct_write_unlock_bh(hash);
 
 	return ret;
 }
@@ -462,13 +462,13 @@ void ip_vs_conn_put(struct ip_vs_conn *cp)
 void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
 {
 	if (ip_vs_conn_unhash(cp)) {
-		spin_lock(&cp->lock);
+		spin_lock_bh(&cp->lock);
 		if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
 			atomic_dec(&ip_vs_conn_no_cport_cnt);
 			cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
 			cp->cport = cport;
 		}
-		spin_unlock(&cp->lock);
+		spin_unlock_bh(&cp->lock);
 
 		/* hash on new dport */
 		ip_vs_conn_hash(cp);
@@ -622,9 +622,9 @@ void ip_vs_try_bind_dest(struct ip_vs_conn *cp)
 	if (dest) {
 		struct ip_vs_proto_data *pd;
 
-		spin_lock(&cp->lock);
+		spin_lock_bh(&cp->lock);
 		if (cp->dest) {
-			spin_unlock(&cp->lock);
+			spin_unlock_bh(&cp->lock);
 			rcu_read_unlock();
 			return;
 		}
@@ -635,7 +635,7 @@ void ip_vs_try_bind_dest(struct ip_vs_conn *cp)
 			ip_vs_unbind_app(cp);
 
 		ip_vs_bind_dest(cp, dest);
-		spin_unlock(&cp->lock);
+		spin_unlock_bh(&cp->lock);
 
 		/* Update its packet transmitter */
 		cp->packet_xmit = NULL;
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 79df3c61d4d8..f26fe3353a30 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -638,8 +638,11 @@ static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
 
 static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
 {
-	int err = ip_defrag(skb, user);
+	int err;
 
+	local_bh_disable();
+	err = ip_defrag(skb, user);
+	local_bh_enable();
 	if (!err)
 		ip_send_check(ip_hdr(skb));
 
@@ -1217,13 +1220,7 @@ ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,
 		   const struct net_device *in, const struct net_device *out,
 		   int (*okfn)(struct sk_buff *))
 {
-	unsigned int verdict;
-
-	/* Disable BH in LOCAL_OUT until all places are fixed */
-	local_bh_disable();
-	verdict = ip_vs_out(hooknum, skb, AF_INET);
-	local_bh_enable();
-	return verdict;
+	return ip_vs_out(hooknum, skb, AF_INET);
 }
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -1250,13 +1247,7 @@ ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,
 		   const struct net_device *in, const struct net_device *out,
 		   int (*okfn)(struct sk_buff *))
 {
-	unsigned int verdict;
-
-	/* Disable BH in LOCAL_OUT until all places are fixed */
-	local_bh_disable();
-	verdict = ip_vs_out(hooknum, skb, AF_INET6);
-	local_bh_enable();
-	return verdict;
+	return ip_vs_out(hooknum, skb, AF_INET6);
 }
 
 #endif
@@ -1714,13 +1705,7 @@ ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
 		     const struct net_device *in, const struct net_device *out,
 		     int (*okfn)(struct sk_buff *))
 {
-	unsigned int verdict;
-
-	/* Disable BH in LOCAL_OUT until all places are fixed */
-	local_bh_disable();
-	verdict = ip_vs_in(hooknum, skb, AF_INET);
-	local_bh_enable();
-	return verdict;
+	return ip_vs_in(hooknum, skb, AF_INET);
 }
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -1779,13 +1764,7 @@ ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
 		     const struct net_device *in, const struct net_device *out,
 		     int (*okfn)(struct sk_buff *))
 {
-	unsigned int verdict;
-
-	/* Disable BH in LOCAL_OUT until all places are fixed */
-	local_bh_disable();
-	verdict = ip_vs_in(hooknum, skb, AF_INET6);
-	local_bh_enable();
-	return verdict;
+	return ip_vs_in(hooknum, skb, AF_INET6);
 }
 
 #endif
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 7f90825ef1ce..77c173282f38 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -267,10 +267,12 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 			 * hopefully it will succeed on the retransmitted
 			 * packet.
 			 */
+			rcu_read_lock();
 			ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
 						       iph->ihl * 4,
 						       start-data, end-start,
 						       buf, buf_len);
+			rcu_read_unlock();
 			if (ret) {
 				ip_vs_nfct_expect_related(skb, ct, n_cp,
 							  IPPROTO_TCP, 0, 0);
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index d8e5238254db..b2cc2528a4df 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -527,10 +527,10 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	}
 
 	/* If we fail to create a cache entry, we'll just use the valid dest */
-	spin_lock(&svc->sched_lock);
+	spin_lock_bh(&svc->sched_lock);
 	if (!tbl->dead)
 		ip_vs_lblc_new(tbl, &iph.daddr, dest);
-	spin_unlock(&svc->sched_lock);
+	spin_unlock_bh(&svc->sched_lock);
 
 out:
 	IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n",
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 041b7cc356fb..feb9656eac58 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -678,7 +678,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 		if (atomic_read(&en->set.size) > 1 &&
 		    time_after(jiffies, en->set.lastmod +
 				sysctl_lblcr_expiration(svc))) {
-			spin_lock(&svc->sched_lock);
+			spin_lock_bh(&svc->sched_lock);
 			if (atomic_read(&en->set.size) > 1) {
 				struct ip_vs_dest *m;
 
@@ -686,7 +686,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 				if (m)
 					ip_vs_dest_set_erase(&en->set, m);
 			}
-			spin_unlock(&svc->sched_lock);
+			spin_unlock_bh(&svc->sched_lock);
 		}
 
 		/* If the destination is not overloaded, use it */
@@ -701,10 +701,10 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 		}
 
 		/* Update our cache entry */
-		spin_lock(&svc->sched_lock);
+		spin_lock_bh(&svc->sched_lock);
 		if (!tbl->dead)
 			ip_vs_dest_set_insert(&en->set, dest, true);
-		spin_unlock(&svc->sched_lock);
+		spin_unlock_bh(&svc->sched_lock);
 		goto out;
 	}
 
@@ -716,10 +716,10 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	}
 
 	/* If we fail to create a cache entry, we'll just use the valid dest */
-	spin_lock(&svc->sched_lock);
+	spin_lock_bh(&svc->sched_lock);
 	if (!tbl->dead)
 		ip_vs_lblcr_new(tbl, &iph.daddr, dest);
-	spin_unlock(&svc->sched_lock);
+	spin_unlock_bh(&svc->sched_lock);
 
 out:
 	IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n",
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 4de5176a9981..6e14a7b5602f 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -994,9 +994,9 @@ static void
 sctp_state_transition(struct ip_vs_conn *cp, int direction,
 		const struct sk_buff *skb, struct ip_vs_proto_data *pd)
 {
-	spin_lock(&cp->lock);
+	spin_lock_bh(&cp->lock);
 	set_sctp_state(pd, cp, direction, skb);
-	spin_unlock(&cp->lock);
+	spin_unlock_bh(&cp->lock);
 }
 
 static inline __u16 sctp_app_hashkey(__be16 port)
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 7de3342e9797..50a15944c6c1 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -557,9 +557,9 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction,
 	if (th == NULL)
 		return;
 
-	spin_lock(&cp->lock);
+	spin_lock_bh(&cp->lock);
 	set_tcp_state(pd, cp, direction, th);
-	spin_unlock(&cp->lock);
+	spin_unlock_bh(&cp->lock);
 }
 
 static inline __u16 tcp_app_hashkey(__be16 port)
@@ -655,11 +655,11 @@ void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)
 {
 	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
 
-	spin_lock(&cp->lock);
+	spin_lock_bh(&cp->lock);
 	cp->state = IP_VS_TCP_S_LISTEN;
 	cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
 			   : tcp_timeouts[IP_VS_TCP_S_LISTEN]);
-	spin_unlock(&cp->lock);
+	spin_unlock_bh(&cp->lock);
 }
 
 /* ---------------------------------------------
diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c
index 749c98a7dd2c..c35986c793d9 100644
--- a/net/netfilter/ipvs/ip_vs_rr.c
+++ b/net/netfilter/ipvs/ip_vs_rr.c
@@ -63,7 +63,7 @@ ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 
 	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
 
-	spin_lock(&svc->sched_lock);
+	spin_lock_bh(&svc->sched_lock);
 	p = (struct list_head *) svc->sched_data;
 	last = dest = list_entry(p, struct ip_vs_dest, n_list);
 
@@ -85,13 +85,13 @@ ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	} while (pass < 2 && p != &svc->destinations);
 
 stop:
-	spin_unlock(&svc->sched_lock);
+	spin_unlock_bh(&svc->sched_lock);
 	ip_vs_scheduler_err(svc, "no destination available");
 	return NULL;
 
   out:
 	svc->sched_data = &dest->n_list;
-	spin_unlock(&svc->sched_lock);
+	spin_unlock_bh(&svc->sched_lock);
 	IP_VS_DBG_BUF(6, "RR: server %s:%u "
 		      "activeconns %d refcnt %d weight %d\n",
 		      IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 97241749216d..8e57077e5540 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -531,9 +531,9 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
 	if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
 		return;
 
-	spin_lock(&ipvs->sync_buff_lock);
+	spin_lock_bh(&ipvs->sync_buff_lock);
 	if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
-		spin_unlock(&ipvs->sync_buff_lock);
+		spin_unlock_bh(&ipvs->sync_buff_lock);
 		return;
 	}
 
@@ -552,7 +552,7 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
 	if (!buff) {
 		buff = ip_vs_sync_buff_create_v0(ipvs);
 		if (!buff) {
-			spin_unlock(&ipvs->sync_buff_lock);
+			spin_unlock_bh(&ipvs->sync_buff_lock);
 			pr_err("ip_vs_sync_buff_create failed.\n");
 			return;
 		}
@@ -590,7 +590,7 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
 		sb_queue_tail(ipvs, ms);
 		ms->sync_buff = NULL;
 	}
-	spin_unlock(&ipvs->sync_buff_lock);
+	spin_unlock_bh(&ipvs->sync_buff_lock);
 
 	/* synchronize its controller if it has */
 	cp = cp->control;
@@ -641,9 +641,9 @@ sloop:
 		pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN);
 	}
 
-	spin_lock(&ipvs->sync_buff_lock);
+	spin_lock_bh(&ipvs->sync_buff_lock);
 	if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
-		spin_unlock(&ipvs->sync_buff_lock);
+		spin_unlock_bh(&ipvs->sync_buff_lock);
 		return;
 	}
 
@@ -683,7 +683,7 @@ sloop:
 	if (!buff) {
 		buff = ip_vs_sync_buff_create(ipvs);
 		if (!buff) {
-			spin_unlock(&ipvs->sync_buff_lock);
+			spin_unlock_bh(&ipvs->sync_buff_lock);
 			pr_err("ip_vs_sync_buff_create failed.\n");
 			return;
 		}
@@ -750,7 +750,7 @@ sloop:
 		}
 	}
 
-	spin_unlock(&ipvs->sync_buff_lock);
+	spin_unlock_bh(&ipvs->sync_buff_lock);
 
 control:
 	/* synchronize its controller if it has */
@@ -843,7 +843,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
 		kfree(param->pe_data);
 
 		dest = cp->dest;
-		spin_lock(&cp->lock);
+		spin_lock_bh(&cp->lock);
 		if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE &&
 		    !(flags & IP_VS_CONN_F_TEMPLATE) && dest) {
 			if (flags & IP_VS_CONN_F_INACTIVE) {
@@ -857,7 +857,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
 		flags &= IP_VS_CONN_F_BACKUP_UPD_MASK;
 		flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK;
 		cp->flags = flags;
-		spin_unlock(&cp->lock);
+		spin_unlock_bh(&cp->lock);
 		if (!dest)
 			ip_vs_try_bind_dest(cp);
 	} else {
@@ -1689,11 +1689,7 @@ static int sync_thread_backup(void *data)
 				break;
 			}
 
-			/* disable bottom half, because it accesses the data
-			   shared by softirq while getting/creating conns */
-			local_bh_disable();
 			ip_vs_process_message(tinfo->net, tinfo->buf, len);
-			local_bh_enable();
 		}
 	}
 
diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c
index 32c646eb8747..0e68555bceb9 100644
--- a/net/netfilter/ipvs/ip_vs_wrr.c
+++ b/net/netfilter/ipvs/ip_vs_wrr.c
@@ -170,7 +170,7 @@ ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 
 	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
 
-	spin_lock(&svc->sched_lock);
+	spin_lock_bh(&svc->sched_lock);
 	dest = mark->cl;
 	/* No available dests? */
 	if (mark->mw == 0)
@@ -222,7 +222,7 @@ found:
 	mark->cl = dest;
 
   out:
-	spin_unlock(&svc->sched_lock);
+	spin_unlock_bh(&svc->sched_lock);
 	return dest;
 
 err_noavail:
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 3db7889edefc..b75ff6429a04 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -177,22 +177,22 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
 			rt = (struct rtable *) dest_dst->dst_cache;
 		else {
 			dest_dst = ip_vs_dest_dst_alloc();
-			spin_lock(&dest->dst_lock);
+			spin_lock_bh(&dest->dst_lock);
 			if (!dest_dst) {
 				__ip_vs_dst_set(dest, NULL, NULL, 0);
-				spin_unlock(&dest->dst_lock);
+				spin_unlock_bh(&dest->dst_lock);
 				goto err_unreach;
 			}
 			rt = do_output_route4(net, dest->addr.ip, rt_mode,
 					      &dest_dst->dst_saddr.ip);
 			if (!rt) {
 				__ip_vs_dst_set(dest, NULL, NULL, 0);
-				spin_unlock(&dest->dst_lock);
+				spin_unlock_bh(&dest->dst_lock);
 				ip_vs_dest_dst_free(dest_dst);
 				goto err_unreach;
 			}
 			__ip_vs_dst_set(dest, dest_dst, &rt->dst, 0);
-			spin_unlock(&dest->dst_lock);
+			spin_unlock_bh(&dest->dst_lock);
 			IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
 				  &dest->addr.ip, &dest_dst->dst_saddr.ip,
 				  atomic_read(&rt->dst.__refcnt));
@@ -358,10 +358,10 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
 			u32 cookie;
 
 			dest_dst = ip_vs_dest_dst_alloc();
-			spin_lock(&dest->dst_lock);
+			spin_lock_bh(&dest->dst_lock);
 			if (!dest_dst) {
 				__ip_vs_dst_set(dest, NULL, NULL, 0);
-				spin_unlock(&dest->dst_lock);
+				spin_unlock_bh(&dest->dst_lock);
 				goto err_unreach;
 			}
 			dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
@@ -369,14 +369,14 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
 						      do_xfrm);
 			if (!dst) {
 				__ip_vs_dst_set(dest, NULL, NULL, 0);
-				spin_unlock(&dest->dst_lock);
+				spin_unlock_bh(&dest->dst_lock);
 				ip_vs_dest_dst_free(dest_dst);
 				goto err_unreach;
 			}
 			rt = (struct rt6_info *) dst;
 			cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
 			__ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
-			spin_unlock(&dest->dst_lock);
+			spin_unlock_bh(&dest->dst_lock);
 			IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
 				  &dest->addr.in6, &dest_dst->dst_saddr.in6,
 				  atomic_read(&rt->dst.__refcnt));

From f0165888610a1701a39670c7eadf63a61fad708d Mon Sep 17 00:00:00 2001
From: Gao feng <gaofeng@cn.fujitsu.com>
Date: Thu, 21 Mar 2013 19:48:42 +0000
Subject: [PATCH 35/49] netfilter: use IS_ENABLE to replace if defined in TRACE
 target

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/ip_output.c           | 3 +--
 net/ipv4/netfilter/ip_tables.c | 6 ++----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 5e12dca7b3dd..147abf5275aa 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -430,8 +430,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	to->tc_index = from->tc_index;
 #endif
 	nf_copy(to, from);
-#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
-    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
 	to->nf_trace = from->nf_trace;
 #endif
 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 3efcf87400c3..1b433aac2663 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -182,8 +182,7 @@ ipt_get_target_c(const struct ipt_entry *e)
 	return ipt_get_target((struct ipt_entry *)e);
 }
 
-#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
-    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
 static const char *const hooknames[] = {
 	[NF_INET_PRE_ROUTING]		= "PREROUTING",
 	[NF_INET_LOCAL_IN]		= "INPUT",
@@ -361,8 +360,7 @@ ipt_do_table(struct sk_buff *skb,
 		t = ipt_get_target(e);
 		IP_NF_ASSERT(t->u.kernel.target);
 
-#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
-    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
 		/* The packet is traced: log it */
 		if (unlikely(skb->nf_trace))
 			trace_packet(skb, hook, in, out,

From 8746ddcf12bb263ad240e095ef16531006caeb50 Mon Sep 17 00:00:00 2001
From: "holger@eitzenberger.org" <holger@eitzenberger.org>
Date: Sat, 23 Mar 2013 10:04:03 +0000
Subject: [PATCH 36/49] netfilter: xt_NFQUEUE: introduce CPU fanout

Current NFQUEUE target uses a hash, computed over source and
destination address (and other parameters), for steering the packet
to the actual NFQUEUE. This, however forgets about the fact that the
packet eventually is handled by a particular CPU on user request.

If E. g.

  1) IRQ affinity is used to handle packets on a particular CPU already
     (both single-queue or multi-queue case)

and/or

  2) RPS is used to steer packets to a specific softirq

the target easily chooses an NFQUEUE which is not handled by a process
pinned to the same CPU.

The idea is therefore to use the CPU index for determining the
NFQUEUE handling the packet.

E. g. when having a system with 4 CPUs, 4 MQ queues and 4 NFQUEUEs it
looks like this:

 +-----+  +-----+  +-----+  +-----+
 |NFQ#0|  |NFQ#1|  |NFQ#2|  |NFQ#3|
 +-----+  +-----+  +-----+  +-----+
    ^        ^        ^        ^
    |        |NFQUEUE |        |
    +        +        +        +
 +-----+  +-----+  +-----+  +-----+
 |rx-0 |  |rx-1 |  |rx-2 |  |rx-3 |
 +-----+  +-----+  +-----+  +-----+

The NFQUEUEs not necessarily have to start with number 0, setups with
less NFQUEUEs than packet-handling CPUs are not a problem as well.

This patch extends the NFQUEUE target to accept a new
NFQ_FLAG_CPU_FANOUT flag. If this is specified the target uses the
CPU index for determining the NFQUEUE being used. I have to introduce
rev3 for this. The 'flags' are folded into _v2 'bypass'.

By changing the way which queue is assigned, I'm able to improve the
performance if the processes reading on the NFQUEUs are pinned
correctly.

Signed-off-by: Holger Eitzenberger <holger@eitzenberger.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/xt_NFQUEUE.h |  9 +++++
 net/netfilter/xt_NFQUEUE.c                | 41 +++++++++++++++++++++--
 2 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/netfilter/xt_NFQUEUE.h b/include/uapi/linux/netfilter/xt_NFQUEUE.h
index 9eafdbbb401c..8bb5fe657d34 100644
--- a/include/uapi/linux/netfilter/xt_NFQUEUE.h
+++ b/include/uapi/linux/netfilter/xt_NFQUEUE.h
@@ -26,4 +26,13 @@ struct xt_NFQ_info_v2 {
 	__u16 bypass;
 };
 
+struct xt_NFQ_info_v3 {
+	__u16 queuenum;
+	__u16 queues_total;
+	__u16 flags;
+#define NFQ_FLAG_BYPASS		0x01 /* for compatibility with v2 */
+#define NFQ_FLAG_CPU_FANOUT	0x02 /* use current CPU (no hashing) */
+#define NFQ_FLAG_MASK		0x03
+};
+
 #endif /* _XT_NFQ_TARGET_H */
diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c
index 817f9e9f2b16..a287ef262d41 100644
--- a/net/netfilter/xt_NFQUEUE.c
+++ b/net/netfilter/xt_NFQUEUE.c
@@ -108,7 +108,7 @@ nfqueue_tg_v2(struct sk_buff *skb, const struct xt_action_param *par)
 
 static int nfqueue_tg_check(const struct xt_tgchk_param *par)
 {
-	const struct xt_NFQ_info_v2 *info = par->targinfo;
+	const struct xt_NFQ_info_v3 *info = par->targinfo;
 	u32 maxid;
 
 	if (unlikely(!rnd_inited)) {
@@ -125,11 +125,39 @@ static int nfqueue_tg_check(const struct xt_tgchk_param *par)
 		       info->queues_total, maxid);
 		return -ERANGE;
 	}
-	if (par->target->revision == 2 && info->bypass > 1)
+	if (par->target->revision == 2 && info->flags > 1)
 		return -EINVAL;
+	if (par->target->revision == 3 && info->flags & ~NFQ_FLAG_MASK)
+		return -EINVAL;
+
 	return 0;
 }
 
+static unsigned int
+nfqueue_tg_v3(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_NFQ_info_v3 *info = par->targinfo;
+	u32 queue = info->queuenum;
+
+	if (info->queues_total > 1) {
+		if (info->flags & NFQ_FLAG_CPU_FANOUT) {
+			int cpu = smp_processor_id();
+
+			queue = info->queuenum + cpu % info->queues_total;
+		} else {
+			if (par->family == NFPROTO_IPV4)
+				queue = (((u64) hash_v4(skb) * info->queues_total) >>
+						 32) + queue;
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+			else if (par->family == NFPROTO_IPV6)
+				queue = (((u64) hash_v6(skb) * info->queues_total) >>
+						 32) + queue;
+#endif
+		}
+	}
+	return NF_QUEUE_NR(queue);
+}
+
 static struct xt_target nfqueue_tg_reg[] __read_mostly = {
 	{
 		.name		= "NFQUEUE",
@@ -156,6 +184,15 @@ static struct xt_target nfqueue_tg_reg[] __read_mostly = {
 		.targetsize	= sizeof(struct xt_NFQ_info_v2),
 		.me		= THIS_MODULE,
 	},
+	{
+		.name		= "NFQUEUE",
+		.revision	= 3,
+		.family		= NFPROTO_UNSPEC,
+		.checkentry	= nfqueue_tg_check,
+		.target		= nfqueue_tg_v3,
+		.targetsize	= sizeof(struct xt_NFQ_info_v3),
+		.me		= THIS_MODULE,
+	},
 };
 
 static int __init nfqueue_tg_init(void)

From 5c33448c405adfe1562df76215f24ef0a7947872 Mon Sep 17 00:00:00 2001
From: "holger@eitzenberger.org" <holger@eitzenberger.org>
Date: Sat, 23 Mar 2013 10:04:04 +0000
Subject: [PATCH 37/49] netfilter: xt_NFQUEUE: coalesce IPv4 and IPv6 hashing

Because rev1 and rev3 of the target share the same hashing
generalize it by introduing nfqueue_hash().

Signed-off-by: Holger Eitzenberger <holger@eitzenberger.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_NFQUEUE.c | 42 ++++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 20 deletions(-)

diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c
index a287ef262d41..1e2fae32f81b 100644
--- a/net/netfilter/xt_NFQUEUE.c
+++ b/net/netfilter/xt_NFQUEUE.c
@@ -76,22 +76,31 @@ static u32 hash_v6(const struct sk_buff *skb)
 }
 #endif
 
+static u32
+nfqueue_hash(const struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_NFQ_info_v1 *info = par->targinfo;
+	u32 queue = info->queuenum;
+
+	if (par->family == NFPROTO_IPV4)
+		queue += ((u64) hash_v4(skb) * info->queues_total) >> 32;
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+	else if (par->family == NFPROTO_IPV6)
+		queue += ((u64) hash_v6(skb) * info->queues_total) >> 32;
+#endif
+
+	return queue;
+}
+
 static unsigned int
 nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct xt_NFQ_info_v1 *info = par->targinfo;
 	u32 queue = info->queuenum;
 
-	if (info->queues_total > 1) {
-		if (par->family == NFPROTO_IPV4)
-			queue = (((u64) hash_v4(skb) * info->queues_total) >>
-				 32) + queue;
-#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
-		else if (par->family == NFPROTO_IPV6)
-			queue = (((u64) hash_v6(skb) * info->queues_total) >>
-				 32) + queue;
-#endif
-	}
+	if (info->queues_total > 1)
+		queue = nfqueue_hash(skb, par);
+
 	return NF_QUEUE_NR(queue);
 }
 
@@ -144,17 +153,10 @@ nfqueue_tg_v3(struct sk_buff *skb, const struct xt_action_param *par)
 			int cpu = smp_processor_id();
 
 			queue = info->queuenum + cpu % info->queues_total;
-		} else {
-			if (par->family == NFPROTO_IPV4)
-				queue = (((u64) hash_v4(skb) * info->queues_total) >>
-						 32) + queue;
-#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
-			else if (par->family == NFPROTO_IPV6)
-				queue = (((u64) hash_v6(skb) * info->queues_total) >>
-						 32) + queue;
-#endif
-		}
+		} else
+			queue = nfqueue_hash(skb, par);
 	}
+
 	return NF_QUEUE_NR(queue);
 }
 

From 152b0f5da798c56566737f4d0bd85f69688e7d7b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Kube=C4=8Dek?= <mkubecek@suse.cz>
Date: Tue, 2 Apr 2013 08:12:11 +0200
Subject: [PATCH 38/49] netfilter: fix struct ip6t_frag field description

Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter_ipv6/ip6t_frag.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/netfilter_ipv6/ip6t_frag.h b/include/uapi/linux/netfilter_ipv6/ip6t_frag.h
index b47f61b9e082..dfd8bc2268cf 100644
--- a/include/uapi/linux/netfilter_ipv6/ip6t_frag.h
+++ b/include/uapi/linux/netfilter_ipv6/ip6t_frag.h
@@ -4,9 +4,9 @@
 #include <linux/types.h>
 
 struct ip6t_frag {
-	__u32 ids[2];			/* Security Parameter Index */
+	__u32 ids[2];			/* Identification range */
 	__u32 hdrlen;			/* Header Length */
-	__u8  flags;			/*  */
+	__u8  flags;			/* Flags */
 	__u8  invflags;			/* Inverse flags */
 };
 

From f3c1a44a2208d14b061ad665d9549c9b321f38e5 Mon Sep 17 00:00:00 2001
From: Gao feng <gaofeng@cn.fujitsu.com>
Date: Sun, 24 Mar 2013 23:50:39 +0000
Subject: [PATCH 39/49] netfilter: make /proc/net/netfilter pernet

This patch makes this proc dentry pernet. So far only init_net
had a /proc/net/netfilter directory.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/net_namespace.h   |  2 ++
 include/net/netns/netfilter.h | 11 +++++++++++
 net/netfilter/core.c          | 33 +++++++++++++++++++++++++++++----
 3 files changed, 42 insertions(+), 4 deletions(-)
 create mode 100644 include/net/netns/netfilter.h

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index de644bcd8613..b17697827482 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -17,6 +17,7 @@
 #include <net/netns/ipv6.h>
 #include <net/netns/sctp.h>
 #include <net/netns/dccp.h>
+#include <net/netns/netfilter.h>
 #include <net/netns/x_tables.h>
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 #include <net/netns/conntrack.h>
@@ -94,6 +95,7 @@ struct net {
 	struct netns_dccp	dccp;
 #endif
 #ifdef CONFIG_NETFILTER
+	struct netns_nf		nf;
 	struct netns_xt		xt;
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 	struct netns_ct		ct;
diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h
new file mode 100644
index 000000000000..248ca1c68dba
--- /dev/null
+++ b/include/net/netns/netfilter.h
@@ -0,0 +1,11 @@
+#ifndef __NETNS_NETFILTER_H
+#define __NETNS_NETFILTER_H
+
+#include <linux/proc_fs.h>
+
+struct netns_nf {
+#if defined CONFIG_PROC_FS
+	struct proc_dir_entry *proc_netfilter;
+#endif
+};
+#endif
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index a9c488b6c50d..b085184d9b45 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -281,6 +281,34 @@ struct proc_dir_entry *proc_net_netfilter;
 EXPORT_SYMBOL(proc_net_netfilter);
 #endif
 
+static int __net_init netfilter_net_init(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+	net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter",
+						net->proc_net);
+	if (net_eq(net, &init_net)) {
+		if (!net->nf.proc_netfilter)
+			return -ENOMEM;
+		else
+			proc_net_netfilter = net->nf.proc_netfilter;
+	} else if (!net->nf.proc_netfilter) {
+		pr_err("cannot create netfilter proc entry");
+		return -ENOMEM;
+	}
+#endif
+	return 0;
+}
+
+static void __net_exit netfilter_net_exit(struct net *net)
+{
+	remove_proc_entry("netfilter", net->proc_net);
+}
+
+static struct pernet_operations netfilter_net_ops = {
+	.init = netfilter_net_init,
+	.exit = netfilter_net_exit,
+};
+
 void __init netfilter_init(void)
 {
 	int i, h;
@@ -289,11 +317,8 @@ void __init netfilter_init(void)
 			INIT_LIST_HEAD(&nf_hooks[i][h]);
 	}
 
-#ifdef CONFIG_PROC_FS
-	proc_net_netfilter = proc_mkdir("netfilter", init_net.proc_net);
-	if (!proc_net_netfilter)
+	if (register_pernet_subsys(&netfilter_net_ops) < 0)
 		panic("cannot create netfilter proc entry");
-#endif
 
 	if (netfilter_log_init() < 0)
 		panic("cannot initialize nf_log");

From 30e0c6a6bee24db0166b7ca709277cd693e179f2 Mon Sep 17 00:00:00 2001
From: Gao feng <gaofeng@cn.fujitsu.com>
Date: Sun, 24 Mar 2013 23:50:40 +0000
Subject: [PATCH 40/49] netfilter: nf_log: prepare net namespace support for
 loggers

This patch adds netns support to nf_log and it prepares netns
support for existing loggers. It is composed of four major
changes.

1) nf_log_register has been split to two functions: nf_log_register
   and nf_log_set. The new nf_log_register is used to globally
   register the nf_logger and nf_log_set is used for enabling
   pernet support from nf_loggers.

   Per netns is not yet complete after this patch, it comes in
   separate follow up patches.

2) Add net as a parameter of nf_log_bind_pf. Per netns is not
   yet complete after this patch, it only allows to bind the
   nf_logger to the protocol family from init_net and it skips
   other cases.

3) Adapt all nf_log_packet callers to pass netns as parameter.
   After this patch, this function only works for init_net.

4) Make the sysctl net/netfilter/nf_log pernet.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_log.h                |  14 +-
 include/net/netns/netfilter.h                 |   7 +
 net/bridge/netfilter/ebt_log.c                |   7 +-
 net/bridge/netfilter/ebt_nflog.c              |   5 +-
 net/ipv4/netfilter/ip_tables.c                |   3 +-
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c  |   8 +-
 net/ipv6/netfilter/ip6_tables.c               |   3 +-
 .../netfilter/nf_conntrack_proto_icmpv6.c     |   7 +-
 net/netfilter/nf_conntrack_helper.c           |   2 +-
 net/netfilter/nf_conntrack_proto_dccp.c       |   9 +-
 net/netfilter/nf_conntrack_proto_tcp.c        |  18 +-
 net/netfilter/nf_conntrack_proto_udp.c        |   6 +-
 net/netfilter/nf_conntrack_proto_udplite.c    |   8 +-
 net/netfilter/nf_log.c                        | 223 +++++++++++++-----
 net/netfilter/nfnetlink_log.c                 |   5 +-
 net/netfilter/xt_osf.c                        |   6 +-
 16 files changed, 232 insertions(+), 99 deletions(-)

diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h
index e991bd0a27af..31f1fb9eb784 100644
--- a/include/net/netfilter/nf_log.h
+++ b/include/net/netfilter/nf_log.h
@@ -49,12 +49,18 @@ struct nf_logger {
 int nf_log_register(u_int8_t pf, struct nf_logger *logger);
 void nf_log_unregister(struct nf_logger *logger);
 
-int nf_log_bind_pf(u_int8_t pf, const struct nf_logger *logger);
-void nf_log_unbind_pf(u_int8_t pf);
+void nf_log_set(struct net *net, u_int8_t pf,
+		const struct nf_logger *logger);
+void nf_log_unset(struct net *net, const struct nf_logger *logger);
+
+int nf_log_bind_pf(struct net *net, u_int8_t pf,
+		   const struct nf_logger *logger);
+void nf_log_unbind_pf(struct net *net, u_int8_t pf);
 
 /* Calls the registered backend logging function */
-__printf(7, 8)
-void nf_log_packet(u_int8_t pf,
+__printf(8, 9)
+void nf_log_packet(struct net *net,
+		   u_int8_t pf,
 		   unsigned int hooknum,
 		   const struct sk_buff *skb,
 		   const struct net_device *in,
diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h
index 248ca1c68dba..88740024ccf3 100644
--- a/include/net/netns/netfilter.h
+++ b/include/net/netns/netfilter.h
@@ -2,10 +2,17 @@
 #define __NETNS_NETFILTER_H
 
 #include <linux/proc_fs.h>
+#include <linux/netfilter.h>
+
+struct nf_logger;
 
 struct netns_nf {
 #if defined CONFIG_PROC_FS
 	struct proc_dir_entry *proc_netfilter;
+#endif
+	const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO];
+#ifdef CONFIG_SYSCTL
+	struct ctl_table_header *nf_log_dir_header;
 #endif
 };
 #endif
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 92de5e5f9db2..08e5ea5ec4ed 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -176,17 +176,18 @@ ebt_log_tg(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct ebt_log_info *info = par->targinfo;
 	struct nf_loginfo li;
+	struct net *net = dev_net(par->in ? par->in : par->out);
 
 	li.type = NF_LOG_TYPE_LOG;
 	li.u.log.level = info->loglevel;
 	li.u.log.logflags = info->bitmask;
 
 	if (info->bitmask & EBT_LOG_NFLOG)
-		nf_log_packet(NFPROTO_BRIDGE, par->hooknum, skb, par->in,
-		              par->out, &li, "%s", info->prefix);
+		nf_log_packet(net, NFPROTO_BRIDGE, par->hooknum, skb,
+			      par->in, par->out, &li, "%s", info->prefix);
 	else
 		ebt_log_packet(NFPROTO_BRIDGE, par->hooknum, skb, par->in,
-		               par->out, &li, info->prefix);
+			       par->out, &li, info->prefix);
 	return EBT_CONTINUE;
 }
 
diff --git a/net/bridge/netfilter/ebt_nflog.c b/net/bridge/netfilter/ebt_nflog.c
index 5be68bbcc341..59ac7952010d 100644
--- a/net/bridge/netfilter/ebt_nflog.c
+++ b/net/bridge/netfilter/ebt_nflog.c
@@ -24,14 +24,15 @@ ebt_nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct ebt_nflog_info *info = par->targinfo;
 	struct nf_loginfo li;
+	struct net *net = dev_net(par->in ? par->in : par->out);
 
 	li.type = NF_LOG_TYPE_ULOG;
 	li.u.ulog.copy_len = info->len;
 	li.u.ulog.group = info->group;
 	li.u.ulog.qthreshold = info->threshold;
 
-	nf_log_packet(PF_BRIDGE, par->hooknum, skb, par->in, par->out,
-	              &li, "%s", info->prefix);
+	nf_log_packet(net, PF_BRIDGE, par->hooknum, skb, par->in,
+		      par->out, &li, "%s", info->prefix);
 	return EBT_CONTINUE;
 }
 
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 1b433aac2663..e391db1f056d 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -258,6 +258,7 @@ static void trace_packet(const struct sk_buff *skb,
 	const char *hookname, *chainname, *comment;
 	const struct ipt_entry *iter;
 	unsigned int rulenum = 0;
+	struct net *net = dev_net(in ? in : out);
 
 	table_base = private->entries[smp_processor_id()];
 	root = get_entry(table_base, private->hook_entry[hook]);
@@ -270,7 +271,7 @@ static void trace_packet(const struct sk_buff *skb,
 		    &chainname, &comment, &rulenum) != 0)
 			break;
 
-	nf_log_packet(AF_INET, hook, skb, in, out, &trace_loginfo,
+	nf_log_packet(net, AF_INET, hook, skb, in, out, &trace_loginfo,
 		      "TRACE: %s:%s:%s:%u ",
 		      tablename, chainname, comment, rulenum);
 }
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 5241d997ab75..c2cd63d2d892 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -187,8 +187,8 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
 	icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih);
 	if (icmph == NULL) {
 		if (LOG_INVALID(net, IPPROTO_ICMP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
-				      "nf_ct_icmp: short packet ");
+			nf_log_packet(net, PF_INET, 0, skb, NULL, NULL,
+				      NULL, "nf_ct_icmp: short packet ");
 		return -NF_ACCEPT;
 	}
 
@@ -196,7 +196,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
 	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
 	    nf_ip_checksum(skb, hooknum, dataoff, 0)) {
 		if (LOG_INVALID(net, IPPROTO_ICMP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
+			nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL,
 				      "nf_ct_icmp: bad HW ICMP checksum ");
 		return -NF_ACCEPT;
 	}
@@ -209,7 +209,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
 	 */
 	if (icmph->type > NR_ICMP_TYPES) {
 		if (LOG_INVALID(net, IPPROTO_ICMP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
+			nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL,
 				      "nf_ct_icmp: invalid ICMP type ");
 		return -NF_ACCEPT;
 	}
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 341b54ade72c..8861b1ef420e 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -284,6 +284,7 @@ static void trace_packet(const struct sk_buff *skb,
 	const char *hookname, *chainname, *comment;
 	const struct ip6t_entry *iter;
 	unsigned int rulenum = 0;
+	struct net *net = dev_net(in ? in : out);
 
 	table_base = private->entries[smp_processor_id()];
 	root = get_entry(table_base, private->hook_entry[hook]);
@@ -296,7 +297,7 @@ static void trace_packet(const struct sk_buff *skb,
 		    &chainname, &comment, &rulenum) != 0)
 			break;
 
-	nf_log_packet(AF_INET6, hook, skb, in, out, &trace_loginfo,
+	nf_log_packet(net, AF_INET6, hook, skb, in, out, &trace_loginfo,
 		      "TRACE: %s:%s:%s:%u ",
 		      tablename, chainname, comment, rulenum);
 }
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 24df3dde0076..b3807c5cb888 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -131,7 +131,8 @@ static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb,
 			 type + 128);
 		nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple);
 		if (LOG_INVALID(nf_ct_net(ct), IPPROTO_ICMPV6))
-			nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL,
+			nf_log_packet(nf_ct_net(ct), PF_INET6, 0, skb, NULL,
+				      NULL, NULL,
 				      "nf_ct_icmpv6: invalid new with type %d ",
 				      type + 128);
 		return false;
@@ -203,7 +204,7 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl,
 	icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih);
 	if (icmp6h == NULL) {
 		if (LOG_INVALID(net, IPPROTO_ICMPV6))
-		nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL,
+			nf_log_packet(net, PF_INET6, 0, skb, NULL, NULL, NULL,
 			      "nf_ct_icmpv6: short packet ");
 		return -NF_ACCEPT;
 	}
@@ -211,7 +212,7 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl,
 	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
 	    nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) {
 		if (LOG_INVALID(net, IPPROTO_ICMPV6))
-			nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL,
+			nf_log_packet(net, PF_INET6, 0, skb, NULL, NULL, NULL,
 				      "nf_ct_icmpv6: ICMPv6 checksum failed ");
 		return -NF_ACCEPT;
 	}
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 94b4b9853f60..a0b1c5c23d1c 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -353,7 +353,7 @@ void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct,
 	/* rcu_read_lock()ed by nf_hook_slow */
 	helper = rcu_dereference(help->helper);
 
-	nf_log_packet(nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL,
+	nf_log_packet(nf_ct_net(ct), nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL,
 		      "nf_ct_%s: dropping packet: %pV ", helper->name, &vaf);
 
 	va_end(args);
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index ba65b2041eb4..a99b6c3427b0 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -456,7 +456,8 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
 
 out_invalid:
 	if (LOG_INVALID(net, IPPROTO_DCCP))
-		nf_log_packet(nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL, msg);
+		nf_log_packet(net, nf_ct_l3num(ct), 0, skb, NULL, NULL,
+			      NULL, msg);
 	return false;
 }
 
@@ -542,13 +543,13 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
 
 		spin_unlock_bh(&ct->lock);
 		if (LOG_INVALID(net, IPPROTO_DCCP))
-			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
 				      "nf_ct_dccp: invalid packet ignored ");
 		return NF_ACCEPT;
 	case CT_DCCP_INVALID:
 		spin_unlock_bh(&ct->lock);
 		if (LOG_INVALID(net, IPPROTO_DCCP))
-			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
 				      "nf_ct_dccp: invalid state transition ");
 		return -NF_ACCEPT;
 	}
@@ -613,7 +614,7 @@ static int dccp_error(struct net *net, struct nf_conn *tmpl,
 
 out_invalid:
 	if (LOG_INVALID(net, IPPROTO_DCCP))
-		nf_log_packet(pf, 0, skb, NULL, NULL, NULL, msg);
+		nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, msg);
 	return -NF_ACCEPT;
 }
 
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 83876e9877f1..f021a2076c87 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -720,7 +720,7 @@ static bool tcp_in_window(const struct nf_conn *ct,
 		    tn->tcp_be_liberal)
 			res = true;
 		if (!res && LOG_INVALID(net, IPPROTO_TCP))
-			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
 			"nf_ct_tcp: %s ",
 			before(seq, sender->td_maxend + 1) ?
 			after(end, sender->td_end - receiver->td_maxwin - 1) ?
@@ -772,7 +772,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,
 	th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
 	if (th == NULL) {
 		if (LOG_INVALID(net, IPPROTO_TCP))
-			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
 				"nf_ct_tcp: short packet ");
 		return -NF_ACCEPT;
 	}
@@ -780,7 +780,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,
 	/* Not whole TCP header or malformed packet */
 	if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
 		if (LOG_INVALID(net, IPPROTO_TCP))
-			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
 				"nf_ct_tcp: truncated/malformed packet ");
 		return -NF_ACCEPT;
 	}
@@ -793,7 +793,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,
 	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
 	    nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) {
 		if (LOG_INVALID(net, IPPROTO_TCP))
-			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
 				  "nf_ct_tcp: bad TCP checksum ");
 		return -NF_ACCEPT;
 	}
@@ -802,7 +802,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,
 	tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH));
 	if (!tcp_valid_flags[tcpflags]) {
 		if (LOG_INVALID(net, IPPROTO_TCP))
-			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
 				  "nf_ct_tcp: invalid TCP flag combination ");
 		return -NF_ACCEPT;
 	}
@@ -949,7 +949,7 @@ static int tcp_packet(struct nf_conn *ct,
 		}
 		spin_unlock_bh(&ct->lock);
 		if (LOG_INVALID(net, IPPROTO_TCP))
-			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
 				  "nf_ct_tcp: invalid packet ignored in "
 				  "state %s ", tcp_conntrack_names[old_state]);
 		return NF_ACCEPT;
@@ -959,7 +959,7 @@ static int tcp_packet(struct nf_conn *ct,
 			 dir, get_conntrack_index(th), old_state);
 		spin_unlock_bh(&ct->lock);
 		if (LOG_INVALID(net, IPPROTO_TCP))
-			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
 				  "nf_ct_tcp: invalid state ");
 		return -NF_ACCEPT;
 	case TCP_CONNTRACK_CLOSE:
@@ -969,8 +969,8 @@ static int tcp_packet(struct nf_conn *ct,
 			/* Invalid RST  */
 			spin_unlock_bh(&ct->lock);
 			if (LOG_INVALID(net, IPPROTO_TCP))
-				nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
-					  "nf_ct_tcp: invalid RST ");
+				nf_log_packet(net, pf, 0, skb, NULL, NULL,
+					      NULL, "nf_ct_tcp: invalid RST ");
 			return -NF_ACCEPT;
 		}
 		if (index == TCP_RST_SET
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 59623cc56e8d..fee43228e115 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -119,7 +119,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
 	hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
 	if (hdr == NULL) {
 		if (LOG_INVALID(net, IPPROTO_UDP))
-			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
 				      "nf_ct_udp: short packet ");
 		return -NF_ACCEPT;
 	}
@@ -127,7 +127,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
 	/* Truncated/malformed packets */
 	if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
 		if (LOG_INVALID(net, IPPROTO_UDP))
-			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
 				"nf_ct_udp: truncated/malformed packet ");
 		return -NF_ACCEPT;
 	}
@@ -143,7 +143,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
 	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
 	    nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) {
 		if (LOG_INVALID(net, IPPROTO_UDP))
-			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
 				"nf_ct_udp: bad UDP checksum ");
 		return -NF_ACCEPT;
 	}
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
index ca969f6273f7..2750e6c69f82 100644
--- a/net/netfilter/nf_conntrack_proto_udplite.c
+++ b/net/netfilter/nf_conntrack_proto_udplite.c
@@ -131,7 +131,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl,
 	hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
 	if (hdr == NULL) {
 		if (LOG_INVALID(net, IPPROTO_UDPLITE))
-			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
 				      "nf_ct_udplite: short packet ");
 		return -NF_ACCEPT;
 	}
@@ -141,7 +141,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl,
 		cscov = udplen;
 	else if (cscov < sizeof(*hdr) || cscov > udplen) {
 		if (LOG_INVALID(net, IPPROTO_UDPLITE))
-			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
 				"nf_ct_udplite: invalid checksum coverage ");
 		return -NF_ACCEPT;
 	}
@@ -149,7 +149,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl,
 	/* UDPLITE mandates checksums */
 	if (!hdr->check) {
 		if (LOG_INVALID(net, IPPROTO_UDPLITE))
-			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
 				      "nf_ct_udplite: checksum missing ");
 		return -NF_ACCEPT;
 	}
@@ -159,7 +159,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl,
 	    nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP,
 	    			pf)) {
 		if (LOG_INVALID(net, IPPROTO_UDPLITE))
-			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
 				      "nf_ct_udplite: bad UDPLite checksum ");
 		return -NF_ACCEPT;
 	}
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 9e312695c818..8d331dc9b1ea 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -16,7 +16,6 @@
 #define NF_LOG_PREFIXLEN		128
 #define NFLOGGER_NAME_LEN		64
 
-static const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO] __read_mostly;
 static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly;
 static DEFINE_MUTEX(nf_log_mutex);
 
@@ -32,13 +31,52 @@ static struct nf_logger *__find_logger(int pf, const char *str_logger)
 	return NULL;
 }
 
+void nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger)
+{
+	const struct nf_logger *log;
+
+	if (!net_eq(net, &init_net))
+		return;
+
+	if (pf == NFPROTO_UNSPEC)
+		return;
+
+	mutex_lock(&nf_log_mutex);
+	log = rcu_dereference_protected(net->nf.nf_loggers[pf],
+					lockdep_is_held(&nf_log_mutex));
+	if (log == NULL)
+		rcu_assign_pointer(net->nf.nf_loggers[pf], logger);
+
+	mutex_unlock(&nf_log_mutex);
+}
+EXPORT_SYMBOL(nf_log_set);
+
+void nf_log_unset(struct net *net, const struct nf_logger *logger)
+{
+	int i;
+	const struct nf_logger *log;
+
+	if (!net_eq(net, &init_net))
+		return;
+
+	mutex_lock(&nf_log_mutex);
+	for (i = 0; i < NFPROTO_NUMPROTO; i++) {
+		log = rcu_dereference_protected(net->nf.nf_loggers[i],
+				lockdep_is_held(&nf_log_mutex));
+		if (log == logger)
+			RCU_INIT_POINTER(net->nf.nf_loggers[i], NULL);
+	}
+	mutex_unlock(&nf_log_mutex);
+	synchronize_rcu();
+}
+EXPORT_SYMBOL(nf_log_unset);
+
 /* return EEXIST if the same logger is registered, 0 on success. */
 int nf_log_register(u_int8_t pf, struct nf_logger *logger)
 {
-	const struct nf_logger *llog;
 	int i;
 
-	if (pf >= ARRAY_SIZE(nf_loggers))
+	if (pf >= ARRAY_SIZE(init_net.nf.nf_loggers))
 		return -EINVAL;
 
 	for (i = 0; i < ARRAY_SIZE(logger->list); i++)
@@ -52,63 +90,62 @@ int nf_log_register(u_int8_t pf, struct nf_logger *logger)
 	} else {
 		/* register at end of list to honor first register win */
 		list_add_tail(&logger->list[pf], &nf_loggers_l[pf]);
-		llog = rcu_dereference_protected(nf_loggers[pf],
-						 lockdep_is_held(&nf_log_mutex));
-		if (llog == NULL)
-			rcu_assign_pointer(nf_loggers[pf], logger);
 	}
 
 	mutex_unlock(&nf_log_mutex);
 
+	nf_log_set(&init_net, pf, logger);
 	return 0;
 }
 EXPORT_SYMBOL(nf_log_register);
 
 void nf_log_unregister(struct nf_logger *logger)
 {
-	const struct nf_logger *c_logger;
 	int i;
 
 	mutex_lock(&nf_log_mutex);
-	for (i = 0; i < ARRAY_SIZE(nf_loggers); i++) {
-		c_logger = rcu_dereference_protected(nf_loggers[i],
-						     lockdep_is_held(&nf_log_mutex));
-		if (c_logger == logger)
-			RCU_INIT_POINTER(nf_loggers[i], NULL);
+	for (i = 0; i < NFPROTO_NUMPROTO; i++)
 		list_del(&logger->list[i]);
-	}
 	mutex_unlock(&nf_log_mutex);
 
-	synchronize_rcu();
+	nf_log_unset(&init_net, logger);
 }
 EXPORT_SYMBOL(nf_log_unregister);
 
-int nf_log_bind_pf(u_int8_t pf, const struct nf_logger *logger)
+int nf_log_bind_pf(struct net *net, u_int8_t pf,
+		   const struct nf_logger *logger)
 {
-	if (pf >= ARRAY_SIZE(nf_loggers))
+	if (!net_eq(net, &init_net))
+		return 0;
+
+	if (pf >= ARRAY_SIZE(net->nf.nf_loggers))
 		return -EINVAL;
 	mutex_lock(&nf_log_mutex);
 	if (__find_logger(pf, logger->name) == NULL) {
 		mutex_unlock(&nf_log_mutex);
 		return -ENOENT;
 	}
-	rcu_assign_pointer(nf_loggers[pf], logger);
+	rcu_assign_pointer(net->nf.nf_loggers[pf], logger);
 	mutex_unlock(&nf_log_mutex);
 	return 0;
 }
 EXPORT_SYMBOL(nf_log_bind_pf);
 
-void nf_log_unbind_pf(u_int8_t pf)
+void nf_log_unbind_pf(struct net *net, u_int8_t pf)
 {
-	if (pf >= ARRAY_SIZE(nf_loggers))
+	if (!net_eq(net, &init_net))
+		return;
+
+	if (pf >= ARRAY_SIZE(net->nf.nf_loggers))
 		return;
 	mutex_lock(&nf_log_mutex);
-	RCU_INIT_POINTER(nf_loggers[pf], NULL);
+	RCU_INIT_POINTER(net->nf.nf_loggers[pf], NULL);
 	mutex_unlock(&nf_log_mutex);
 }
 EXPORT_SYMBOL(nf_log_unbind_pf);
 
-void nf_log_packet(u_int8_t pf,
+void nf_log_packet(struct net *net,
+		   u_int8_t pf,
 		   unsigned int hooknum,
 		   const struct sk_buff *skb,
 		   const struct net_device *in,
@@ -120,8 +157,11 @@ void nf_log_packet(u_int8_t pf,
 	char prefix[NF_LOG_PREFIXLEN];
 	const struct nf_logger *logger;
 
+	if (!net_eq(net, &init_net))
+		return;
+
 	rcu_read_lock();
-	logger = rcu_dereference(nf_loggers[pf]);
+	logger = rcu_dereference(net->nf.nf_loggers[pf]);
 	if (logger) {
 		va_start(args, fmt);
 		vsnprintf(prefix, sizeof(prefix), fmt, args);
@@ -135,9 +175,11 @@ EXPORT_SYMBOL(nf_log_packet);
 #ifdef CONFIG_PROC_FS
 static void *seq_start(struct seq_file *seq, loff_t *pos)
 {
+	struct net *net = seq_file_net(seq);
+
 	mutex_lock(&nf_log_mutex);
 
-	if (*pos >= ARRAY_SIZE(nf_loggers))
+	if (*pos >= ARRAY_SIZE(net->nf.nf_loggers))
 		return NULL;
 
 	return pos;
@@ -145,9 +187,11 @@ static void *seq_start(struct seq_file *seq, loff_t *pos)
 
 static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
 {
+	struct net *net = seq_file_net(s);
+
 	(*pos)++;
 
-	if (*pos >= ARRAY_SIZE(nf_loggers))
+	if (*pos >= ARRAY_SIZE(net->nf.nf_loggers))
 		return NULL;
 
 	return pos;
@@ -164,8 +208,9 @@ static int seq_show(struct seq_file *s, void *v)
 	const struct nf_logger *logger;
 	struct nf_logger *t;
 	int ret;
+	struct net *net = seq_file_net(s);
 
-	logger = rcu_dereference_protected(nf_loggers[*pos],
+	logger = rcu_dereference_protected(net->nf.nf_loggers[*pos],
 					   lockdep_is_held(&nf_log_mutex));
 
 	if (!logger)
@@ -199,7 +244,8 @@ static const struct seq_operations nflog_seq_ops = {
 
 static int nflog_open(struct inode *inode, struct file *file)
 {
-	return seq_open(file, &nflog_seq_ops);
+	return seq_open_net(inode, file, &nflog_seq_ops,
+			    sizeof(struct seq_net_private));
 }
 
 static const struct file_operations nflog_file_ops = {
@@ -207,7 +253,7 @@ static const struct file_operations nflog_file_ops = {
 	.open	 = nflog_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
-	.release = seq_release,
+	.release = seq_release_net,
 };
 
 
@@ -216,7 +262,6 @@ static const struct file_operations nflog_file_ops = {
 #ifdef CONFIG_SYSCTL
 static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3];
 static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1];
-static struct ctl_table_header *nf_log_dir_header;
 
 static int nf_log_proc_dostring(ctl_table *table, int write,
 			 void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -226,15 +271,19 @@ static int nf_log_proc_dostring(ctl_table *table, int write,
 	size_t size = *lenp;
 	int r = 0;
 	int tindex = (unsigned long)table->extra1;
+	struct net *net = current->nsproxy->net_ns;
 
 	if (write) {
+		if (!net_eq(net, &init_net))
+			return -EPERM;
+
 		if (size > sizeof(buf))
 			size = sizeof(buf);
 		if (copy_from_user(buf, buffer, size))
 			return -EFAULT;
 
 		if (!strcmp(buf, "NONE")) {
-			nf_log_unbind_pf(tindex);
+			nf_log_unbind_pf(net, tindex);
 			return 0;
 		}
 		mutex_lock(&nf_log_mutex);
@@ -243,11 +292,11 @@ static int nf_log_proc_dostring(ctl_table *table, int write,
 			mutex_unlock(&nf_log_mutex);
 			return -ENOENT;
 		}
-		rcu_assign_pointer(nf_loggers[tindex], logger);
+		rcu_assign_pointer(net->nf.nf_loggers[tindex], logger);
 		mutex_unlock(&nf_log_mutex);
 	} else {
 		mutex_lock(&nf_log_mutex);
-		logger = rcu_dereference_protected(nf_loggers[tindex],
+		logger = rcu_dereference_protected(net->nf.nf_loggers[tindex],
 						   lockdep_is_held(&nf_log_mutex));
 		if (!logger)
 			table->data = "NONE";
@@ -260,49 +309,111 @@ static int nf_log_proc_dostring(ctl_table *table, int write,
 	return r;
 }
 
-static __init int netfilter_log_sysctl_init(void)
+static int netfilter_log_sysctl_init(struct net *net)
 {
 	int i;
+	struct ctl_table *table;
 
-	for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) {
-		snprintf(nf_log_sysctl_fnames[i-NFPROTO_UNSPEC], 3, "%d", i);
-		nf_log_sysctl_table[i].procname	=
-			nf_log_sysctl_fnames[i-NFPROTO_UNSPEC];
-		nf_log_sysctl_table[i].data = NULL;
-		nf_log_sysctl_table[i].maxlen =
-			NFLOGGER_NAME_LEN * sizeof(char);
-		nf_log_sysctl_table[i].mode = 0644;
-		nf_log_sysctl_table[i].proc_handler = nf_log_proc_dostring;
-		nf_log_sysctl_table[i].extra1 = (void *)(unsigned long) i;
+	table = nf_log_sysctl_table;
+	if (!net_eq(net, &init_net)) {
+		table = kmemdup(nf_log_sysctl_table,
+				 sizeof(nf_log_sysctl_table),
+				 GFP_KERNEL);
+		if (!table)
+			goto err_alloc;
+	} else {
+		for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) {
+			snprintf(nf_log_sysctl_fnames[i],
+				 3, "%d", i);
+			nf_log_sysctl_table[i].procname	=
+				nf_log_sysctl_fnames[i];
+			nf_log_sysctl_table[i].data = NULL;
+			nf_log_sysctl_table[i].maxlen =
+				NFLOGGER_NAME_LEN * sizeof(char);
+			nf_log_sysctl_table[i].mode = 0644;
+			nf_log_sysctl_table[i].proc_handler =
+				nf_log_proc_dostring;
+			nf_log_sysctl_table[i].extra1 =
+				(void *)(unsigned long) i;
+		}
 	}
 
-	nf_log_dir_header = register_net_sysctl(&init_net, "net/netfilter/nf_log",
-				       nf_log_sysctl_table);
-	if (!nf_log_dir_header)
-		return -ENOMEM;
+	net->nf.nf_log_dir_header = register_net_sysctl(net,
+						"net/netfilter/nf_log",
+						table);
+	if (!net->nf.nf_log_dir_header)
+		goto err_reg;
 
 	return 0;
+
+err_reg:
+	if (!net_eq(net, &init_net))
+		kfree(table);
+err_alloc:
+	return -ENOMEM;
+}
+
+static void netfilter_log_sysctl_exit(struct net *net)
+{
+	struct ctl_table *table;
+
+	table = net->nf.nf_log_dir_header->ctl_table_arg;
+	unregister_net_sysctl_table(net->nf.nf_log_dir_header);
+	if (!net_eq(net, &init_net))
+		kfree(table);
 }
 #else
-static __init int netfilter_log_sysctl_init(void)
+static int netfilter_log_sysctl_init(struct net *net)
 {
 	return 0;
 }
+
+static void netfilter_log_sysctl_exit(struct net *net)
+{
+}
 #endif /* CONFIG_SYSCTL */
 
-int __init netfilter_log_init(void)
+static int __net_init nf_log_net_init(struct net *net)
 {
-	int i, r;
+	int ret = -ENOMEM;
+
 #ifdef CONFIG_PROC_FS
 	if (!proc_create("nf_log", S_IRUGO,
-			 proc_net_netfilter, &nflog_file_ops))
-		return -1;
+			 net->nf.proc_netfilter, &nflog_file_ops))
+		return ret;
 #endif
+	ret = netfilter_log_sysctl_init(net);
+	if (ret < 0)
+		goto out_sysctl;
 
-	/* Errors will trigger panic, unroll on error is unnecessary. */
-	r = netfilter_log_sysctl_init();
-	if (r < 0)
-		return r;
+	return 0;
+
+out_sysctl:
+	/* For init_net: errors will trigger panic, don't unroll on error. */
+	if (!net_eq(net, &init_net))
+		remove_proc_entry("nf_log", net->nf.proc_netfilter);
+
+	return ret;
+}
+
+static void __net_exit nf_log_net_exit(struct net *net)
+{
+	netfilter_log_sysctl_exit(net);
+	remove_proc_entry("nf_log", net->nf.proc_netfilter);
+}
+
+static struct pernet_operations nf_log_net_ops = {
+	.init = nf_log_net_init,
+	.exit = nf_log_net_exit,
+};
+
+int __init netfilter_log_init(void)
+{
+	int i, ret;
+
+	ret = register_pernet_subsys(&nf_log_net_ops);
+	if (ret < 0)
+		return ret;
 
 	for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++)
 		INIT_LIST_HEAD(&(nf_loggers_l[i]));
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index f248db572972..b593fd1f8cab 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -767,6 +767,7 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
 	u_int16_t group_num = ntohs(nfmsg->res_id);
 	struct nfulnl_instance *inst;
 	struct nfulnl_msg_config_cmd *cmd = NULL;
+	struct net *net = sock_net(ctnl);
 	int ret = 0;
 
 	if (nfula[NFULA_CFG_CMD]) {
@@ -776,9 +777,9 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
 		/* Commands without queue context */
 		switch (cmd->command) {
 		case NFULNL_CFG_CMD_PF_BIND:
-			return nf_log_bind_pf(pf, &nfulnl_logger);
+			return nf_log_bind_pf(net, pf, &nfulnl_logger);
 		case NFULNL_CFG_CMD_PF_UNBIND:
-			nf_log_unbind_pf(pf);
+			nf_log_unbind_pf(net, pf);
 			return 0;
 		}
 	}
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index a5e673d32bda..647d989a01e6 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -201,6 +201,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
 	unsigned char opts[MAX_IPOPTLEN];
 	const struct xt_osf_finger *kf;
 	const struct xt_osf_user_finger *f;
+	struct net *net = dev_net(p->in ? p->in : p->out);
 
 	if (!info)
 		return false;
@@ -325,7 +326,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
 			fcount++;
 
 			if (info->flags & XT_OSF_LOG)
-				nf_log_packet(p->family, p->hooknum, skb,
+				nf_log_packet(net, p->family, p->hooknum, skb,
 					p->in, p->out, NULL,
 					"%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n",
 					f->genre, f->version, f->subtype,
@@ -341,7 +342,8 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
 	rcu_read_unlock();
 
 	if (!fcount && (info->flags & XT_OSF_LOG))
-		nf_log_packet(p->family, p->hooknum, skb, p->in, p->out, NULL,
+		nf_log_packet(net, p->family, p->hooknum, skb, p->in,
+			      p->out, NULL,
 			"Remote OS is not known: %pI4:%u -> %pI4:%u\n",
 				&ip->saddr, ntohs(tcp->source),
 				&ip->daddr, ntohs(tcp->dest));

From 7d2789246cc9423d66d903f992d13a022710592a Mon Sep 17 00:00:00 2001
From: Gao feng <gaofeng@cn.fujitsu.com>
Date: Sun, 24 Mar 2013 23:50:41 +0000
Subject: [PATCH 41/49] netfilter: ebt_log: add net namespace support for
 ebt_log

Add pernet support to ebt_log by means of the new nf_log_set
function added in (30e0c6a netfilter: nf_log: prepare net
namespace support for loggers).

Since syslog ns has yet not been implemented, we don't want
the containers to DDOS host's syslogd. So only enable ebt_log
only from init_net and wait for syslog ns support.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/ebt_log.c | 37 ++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 08e5ea5ec4ed..9878eb8204c5 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -78,6 +78,11 @@ ebt_log_packet(u_int8_t pf, unsigned int hooknum,
    const char *prefix)
 {
 	unsigned int bitmask;
+	struct net *net = dev_net(in ? in : out);
+
+	/* FIXME: Disabled from containers until syslog ns is supported */
+	if (!net_eq(net, &init_net))
+		return;
 
 	spin_lock_bh(&ebt_log_lock);
 	printk(KERN_SOH "%c%s IN=%s OUT=%s MAC source = %pM MAC dest = %pM proto = 0x%04x",
@@ -207,19 +212,47 @@ static struct nf_logger ebt_log_logger __read_mostly = {
 	.me		= THIS_MODULE,
 };
 
+static int __net_init ebt_log_net_init(struct net *net)
+{
+	nf_log_set(net, NFPROTO_BRIDGE, &ebt_log_logger);
+	return 0;
+}
+
+static void __net_exit ebt_log_net_fini(struct net *net)
+{
+	nf_log_unset(net, &ebt_log_logger);
+}
+
+static struct pernet_operations ebt_log_net_ops = {
+	.init = ebt_log_net_init,
+	.exit = ebt_log_net_fini,
+};
+
 static int __init ebt_log_init(void)
 {
 	int ret;
 
+	ret = register_pernet_subsys(&ebt_log_net_ops);
+	if (ret < 0)
+		goto err_pernet;
+
 	ret = xt_register_target(&ebt_log_tg_reg);
 	if (ret < 0)
-		return ret;
+		goto err_target;
+
 	nf_log_register(NFPROTO_BRIDGE, &ebt_log_logger);
-	return 0;
+
+	return ret;
+
+err_target:
+	unregister_pernet_subsys(&ebt_log_net_ops);
+err_pernet:
+	return ret;
 }
 
 static void __exit ebt_log_fini(void)
 {
+	unregister_pernet_subsys(&ebt_log_net_ops);
 	nf_log_unregister(&ebt_log_logger);
 	xt_unregister_target(&ebt_log_tg_reg);
 }

From 69b34fb996b2eee3970548cf6eb516d3ecb5eeed Mon Sep 17 00:00:00 2001
From: Gao feng <gaofeng@cn.fujitsu.com>
Date: Sun, 24 Mar 2013 23:50:42 +0000
Subject: [PATCH 42/49] netfilter: xt_LOG: add net namespace support for xt_LOG

Add pernet support to xt_LOG by means of the new nf_log_set
function added in (30e0c6a netfilter: nf_log: prepare net
namespace support for loggers).

Since syslog ns has yet not been implemented, we don't want
the containers to DDOS host's syslogd. So only enable ebt_log
only from init_net and wait for syslog ns support

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_LOG.c | 52 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 49 insertions(+), 3 deletions(-)

diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c
index fa40096940a1..fe573f6c9e91 100644
--- a/net/netfilter/xt_LOG.c
+++ b/net/netfilter/xt_LOG.c
@@ -474,7 +474,14 @@ ipt_log_packet(u_int8_t pf,
 	       const struct nf_loginfo *loginfo,
 	       const char *prefix)
 {
-	struct sbuff *m = sb_open();
+	struct sbuff *m;
+	struct net *net = dev_net(in ? in : out);
+
+	/* FIXME: Disabled from containers until syslog ns is supported */
+	if (!net_eq(net, &init_net))
+		return;
+
+	m = sb_open();
 
 	if (!loginfo)
 		loginfo = &default_loginfo;
@@ -798,7 +805,14 @@ ip6t_log_packet(u_int8_t pf,
 		const struct nf_loginfo *loginfo,
 		const char *prefix)
 {
-	struct sbuff *m = sb_open();
+	struct sbuff *m;
+	struct net *net = dev_net(in ? in : out);
+
+	/* FIXME: Disabled from containers until syslog ns is supported */
+	if (!net_eq(net, &init_net))
+		return;
+
+	m = sb_open();
 
 	if (!loginfo)
 		loginfo = &default_loginfo;
@@ -893,23 +907,55 @@ static struct nf_logger ip6t_log_logger __read_mostly = {
 };
 #endif
 
+static int __net_init log_net_init(struct net *net)
+{
+	nf_log_set(net, NFPROTO_IPV4, &ipt_log_logger);
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+	nf_log_set(net, NFPROTO_IPV6, &ip6t_log_logger);
+#endif
+	return 0;
+}
+
+static void __net_exit log_net_exit(struct net *net)
+{
+	nf_log_unset(net, &ipt_log_logger);
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+	nf_log_unset(net, &ip6t_log_logger);
+#endif
+}
+
+static struct pernet_operations log_net_ops = {
+	.init = log_net_init,
+	.exit = log_net_exit,
+};
+
 static int __init log_tg_init(void)
 {
 	int ret;
 
+	ret = register_pernet_subsys(&log_net_ops);
+	if (ret < 0)
+		goto err_pernet;
+
 	ret = xt_register_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs));
 	if (ret < 0)
-		return ret;
+		goto err_target;
 
 	nf_log_register(NFPROTO_IPV4, &ipt_log_logger);
 #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
 	nf_log_register(NFPROTO_IPV6, &ip6t_log_logger);
 #endif
 	return 0;
+
+err_target:
+	unregister_pernet_subsys(&log_net_ops);
+err_pernet:
+	return ret;
 }
 
 static void __exit log_tg_exit(void)
 {
+	unregister_pernet_subsys(&log_net_ops);
 	nf_log_unregister(&ipt_log_logger);
 #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
 	nf_log_unregister(&ip6t_log_logger);

From 5b175b7ebd4af6cf3e9e60fdb21b76c1c5a8d713 Mon Sep 17 00:00:00 2001
From: Gao feng <gaofeng@cn.fujitsu.com>
Date: Sun, 24 Mar 2013 23:50:43 +0000
Subject: [PATCH 43/49] netfilter: ebt_ulog: add net namespace support for
 ebt_ulog

Add pernet support to ebt_ulog by means of the new nf_log_set
function added in (30e0c6a netfilter: nf_log: prepare net
namespace support for loggers).

This patch also make ulog_buffers and netlink socket
ebtulognl per netns.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/ebt_ulog.c | 125 ++++++++++++++++++++++----------
 1 file changed, 88 insertions(+), 37 deletions(-)

diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index 442b0321acb9..0ddd612787e3 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -41,6 +41,7 @@
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_ulog.h>
 #include <net/netfilter/nf_log.h>
+#include <net/netns/generic.h>
 #include <net/sock.h>
 #include "../br_private.h"
 
@@ -62,13 +63,22 @@ typedef struct {
 	spinlock_t lock;		/* the per-queue lock */
 } ebt_ulog_buff_t;
 
-static ebt_ulog_buff_t ulog_buffers[EBT_ULOG_MAXNLGROUPS];
-static struct sock *ebtulognl;
+static int ebt_ulog_net_id __read_mostly;
+struct ebt_ulog_net {
+	unsigned int nlgroup[EBT_ULOG_MAXNLGROUPS];
+	ebt_ulog_buff_t ulog_buffers[EBT_ULOG_MAXNLGROUPS];
+	struct sock *ebtulognl;
+};
+
+static struct ebt_ulog_net *ebt_ulog_pernet(struct net *net)
+{
+	return net_generic(net, ebt_ulog_net_id);
+}
 
 /* send one ulog_buff_t to userspace */
-static void ulog_send(unsigned int nlgroup)
+static void ulog_send(struct ebt_ulog_net *ebt, unsigned int nlgroup)
 {
-	ebt_ulog_buff_t *ub = &ulog_buffers[nlgroup];
+	ebt_ulog_buff_t *ub = &ebt->ulog_buffers[nlgroup];
 
 	del_timer(&ub->timer);
 
@@ -80,7 +90,7 @@ static void ulog_send(unsigned int nlgroup)
 		ub->lastnlh->nlmsg_type = NLMSG_DONE;
 
 	NETLINK_CB(ub->skb).dst_group = nlgroup + 1;
-	netlink_broadcast(ebtulognl, ub->skb, 0, nlgroup + 1, GFP_ATOMIC);
+	netlink_broadcast(ebt->ebtulognl, ub->skb, 0, nlgroup + 1, GFP_ATOMIC);
 
 	ub->qlen = 0;
 	ub->skb = NULL;
@@ -89,10 +99,15 @@ static void ulog_send(unsigned int nlgroup)
 /* timer function to flush queue in flushtimeout time */
 static void ulog_timer(unsigned long data)
 {
-	spin_lock_bh(&ulog_buffers[data].lock);
-	if (ulog_buffers[data].skb)
-		ulog_send(data);
-	spin_unlock_bh(&ulog_buffers[data].lock);
+	struct ebt_ulog_net *ebt = container_of((void *)data,
+						struct ebt_ulog_net,
+						nlgroup[*(unsigned int *)data]);
+
+	ebt_ulog_buff_t *ub = &ebt->ulog_buffers[*(unsigned int *)data];
+	spin_lock_bh(&ub->lock);
+	if (ub->skb)
+		ulog_send(ebt, *(unsigned int *)data);
+	spin_unlock_bh(&ub->lock);
 }
 
 static struct sk_buff *ulog_alloc_skb(unsigned int size)
@@ -123,8 +138,10 @@ static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb,
 	ebt_ulog_packet_msg_t *pm;
 	size_t size, copy_len;
 	struct nlmsghdr *nlh;
+	struct net *net = dev_net(in ? in : out);
+	struct ebt_ulog_net *ebt = ebt_ulog_pernet(net);
 	unsigned int group = uloginfo->nlgroup;
-	ebt_ulog_buff_t *ub = &ulog_buffers[group];
+	ebt_ulog_buff_t *ub = &ebt->ulog_buffers[group];
 	spinlock_t *lock = &ub->lock;
 	ktime_t kt;
 
@@ -146,7 +163,7 @@ static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb,
 		if (!(ub->skb = ulog_alloc_skb(size)))
 			goto unlock;
 	} else if (size > skb_tailroom(ub->skb)) {
-		ulog_send(group);
+		ulog_send(ebt, group);
 
 		if (!(ub->skb = ulog_alloc_skb(size)))
 			goto unlock;
@@ -205,7 +222,7 @@ static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb,
 	ub->lastnlh = nlh;
 
 	if (ub->qlen >= uloginfo->qthreshold)
-		ulog_send(group);
+		ulog_send(ebt, group);
 	else if (!timer_pending(&ub->timer)) {
 		ub->timer.expires = jiffies + flushtimeout * HZ / 100;
 		add_timer(&ub->timer);
@@ -277,47 +294,39 @@ static struct nf_logger ebt_ulog_logger __read_mostly = {
 	.me		= THIS_MODULE,
 };
 
-static int __init ebt_ulog_init(void)
+static int __net_init ebt_ulog_net_init(struct net *net)
 {
-	int ret;
 	int i;
+	struct ebt_ulog_net *ebt = ebt_ulog_pernet(net);
+
 	struct netlink_kernel_cfg cfg = {
 		.groups	= EBT_ULOG_MAXNLGROUPS,
 	};
 
-	if (nlbufsiz >= 128*1024) {
-		pr_warning("Netlink buffer has to be <= 128kB,"
-			   " please try a smaller nlbufsiz parameter.\n");
-		return -EINVAL;
-	}
-
 	/* initialize ulog_buffers */
 	for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) {
-		setup_timer(&ulog_buffers[i].timer, ulog_timer, i);
-		spin_lock_init(&ulog_buffers[i].lock);
+		ebt->nlgroup[i] = i;
+		setup_timer(&ebt->ulog_buffers[i].timer, ulog_timer,
+			    (unsigned long)&ebt->nlgroup[i]);
+		spin_lock_init(&ebt->ulog_buffers[i].lock);
 	}
 
-	ebtulognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, &cfg);
-	if (!ebtulognl)
-		ret = -ENOMEM;
-	else if ((ret = xt_register_target(&ebt_ulog_tg_reg)) != 0)
-		netlink_kernel_release(ebtulognl);
+	ebt->ebtulognl = netlink_kernel_create(net, NETLINK_NFLOG, &cfg);
+	if (!ebt->ebtulognl)
+		return -ENOMEM;
 
-	if (ret == 0)
-		nf_log_register(NFPROTO_BRIDGE, &ebt_ulog_logger);
-
-	return ret;
+	nf_log_set(net, NFPROTO_BRIDGE, &ebt_ulog_logger);
+	return 0;
 }
 
-static void __exit ebt_ulog_fini(void)
+static void __net_exit ebt_ulog_net_fini(struct net *net)
 {
-	ebt_ulog_buff_t *ub;
 	int i;
+	struct ebt_ulog_net *ebt = ebt_ulog_pernet(net);
 
-	nf_log_unregister(&ebt_ulog_logger);
-	xt_unregister_target(&ebt_ulog_tg_reg);
+	nf_log_unset(net, &ebt_ulog_logger);
 	for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) {
-		ub = &ulog_buffers[i];
+		ebt_ulog_buff_t *ub = &ebt->ulog_buffers[i];
 		del_timer(&ub->timer);
 
 		if (ub->skb) {
@@ -325,7 +334,49 @@ static void __exit ebt_ulog_fini(void)
 			ub->skb = NULL;
 		}
 	}
-	netlink_kernel_release(ebtulognl);
+	netlink_kernel_release(ebt->ebtulognl);
+}
+
+static struct pernet_operations ebt_ulog_net_ops = {
+	.init = ebt_ulog_net_init,
+	.exit = ebt_ulog_net_fini,
+	.id   = &ebt_ulog_net_id,
+	.size = sizeof(struct ebt_ulog_net),
+};
+
+static int __init ebt_ulog_init(void)
+{
+	int ret;
+
+	if (nlbufsiz >= 128*1024) {
+		pr_warn("Netlink buffer has to be <= 128kB,"
+			"please try a smaller nlbufsiz parameter.\n");
+		return -EINVAL;
+	}
+
+	ret = register_pernet_subsys(&ebt_ulog_net_ops);
+	if (ret)
+		goto out_pernet;
+
+	ret = xt_register_target(&ebt_ulog_tg_reg);
+	if (ret)
+		goto out_target;
+
+	nf_log_register(NFPROTO_BRIDGE, &ebt_ulog_logger);
+
+	return 0;
+
+out_target:
+	unregister_pernet_subsys(&ebt_ulog_net_ops);
+out_pernet:
+	return ret;
+}
+
+static void __exit ebt_ulog_fini(void)
+{
+	nf_log_unregister(&ebt_ulog_logger);
+	xt_unregister_target(&ebt_ulog_tg_reg);
+	unregister_pernet_subsys(&ebt_ulog_net_ops);
 }
 
 module_init(ebt_ulog_init);

From 355430671ad93546b34b4e91bdf720f3a704efa4 Mon Sep 17 00:00:00 2001
From: Gao feng <gaofeng@cn.fujitsu.com>
Date: Sun, 24 Mar 2013 23:50:44 +0000
Subject: [PATCH 44/49] netfilter: ipt_ULOG: add net namespace support for
 ipt_ULOG

Add pernet support to ipt_ULOG by means of the new nf_log_set
function added in (30e0c6a netfilter: nf_log: prepare net
namespace support for loggers).

This patch also make ulog_buffers and netlink socket
nflognl per netns.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/ipt_ULOG.c | 129 +++++++++++++++++++++++-----------
 1 file changed, 89 insertions(+), 40 deletions(-)

diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 7d168dcbd135..642ecfb711bc 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -45,6 +45,7 @@
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_ipv4/ipt_ULOG.h>
 #include <net/netfilter/nf_log.h>
+#include <net/netns/generic.h>
 #include <net/sock.h>
 #include <linux/bitops.h>
 #include <asm/unaligned.h>
@@ -78,15 +79,23 @@ typedef struct {
 	struct timer_list timer;	/* the timer function */
 } ulog_buff_t;
 
-static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS];	/* array of buffers */
+static int ulog_net_id __read_mostly;
+struct ulog_net {
+	unsigned int nlgroup[ULOG_MAXNLGROUPS];
+	ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS];
+	struct sock *nflognl;
+	spinlock_t lock;
+};
 
-static struct sock *nflognl;		/* our socket */
-static DEFINE_SPINLOCK(ulog_lock);	/* spinlock */
+static struct ulog_net *ulog_pernet(struct net *net)
+{
+	return net_generic(net, ulog_net_id);
+}
 
 /* send one ulog_buff_t to userspace */
-static void ulog_send(unsigned int nlgroupnum)
+static void ulog_send(struct ulog_net *ulog, unsigned int nlgroupnum)
 {
-	ulog_buff_t *ub = &ulog_buffers[nlgroupnum];
+	ulog_buff_t *ub = &ulog->ulog_buffers[nlgroupnum];
 
 	pr_debug("ulog_send: timer is deleting\n");
 	del_timer(&ub->timer);
@@ -103,7 +112,8 @@ static void ulog_send(unsigned int nlgroupnum)
 	NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1;
 	pr_debug("throwing %d packets to netlink group %u\n",
 		 ub->qlen, nlgroupnum + 1);
-	netlink_broadcast(nflognl, ub->skb, 0, nlgroupnum + 1, GFP_ATOMIC);
+	netlink_broadcast(ulog->nflognl, ub->skb, 0, nlgroupnum + 1,
+			  GFP_ATOMIC);
 
 	ub->qlen = 0;
 	ub->skb = NULL;
@@ -114,13 +124,16 @@ static void ulog_send(unsigned int nlgroupnum)
 /* timer function to flush queue in flushtimeout time */
 static void ulog_timer(unsigned long data)
 {
+	struct ulog_net *ulog = container_of((void *)data,
+					     struct ulog_net,
+					     nlgroup[*(unsigned int *)data]);
 	pr_debug("timer function called, calling ulog_send\n");
 
 	/* lock to protect against somebody modifying our structure
 	 * from ipt_ulog_target at the same time */
-	spin_lock_bh(&ulog_lock);
-	ulog_send(data);
-	spin_unlock_bh(&ulog_lock);
+	spin_lock_bh(&ulog->lock);
+	ulog_send(ulog, data);
+	spin_unlock_bh(&ulog->lock);
 }
 
 static struct sk_buff *ulog_alloc_skb(unsigned int size)
@@ -160,6 +173,8 @@ static void ipt_ulog_packet(unsigned int hooknum,
 	size_t size, copy_len;
 	struct nlmsghdr *nlh;
 	struct timeval tv;
+	struct net *net = dev_net(in ? in : out);
+	struct ulog_net *ulog = ulog_pernet(net);
 
 	/* ffs == find first bit set, necessary because userspace
 	 * is already shifting groupnumber, but we need unshifted.
@@ -174,9 +189,9 @@ static void ipt_ulog_packet(unsigned int hooknum,
 
 	size = NLMSG_SPACE(sizeof(*pm) + copy_len);
 
-	ub = &ulog_buffers[groupnum];
+	ub = &ulog->ulog_buffers[groupnum];
 
-	spin_lock_bh(&ulog_lock);
+	spin_lock_bh(&ulog->lock);
 
 	if (!ub->skb) {
 		if (!(ub->skb = ulog_alloc_skb(size)))
@@ -186,7 +201,7 @@ static void ipt_ulog_packet(unsigned int hooknum,
 		/* either the queue len is too high or we don't have
 		 * enough room in nlskb left. send it to userspace. */
 
-		ulog_send(groupnum);
+		ulog_send(ulog, groupnum);
 
 		if (!(ub->skb = ulog_alloc_skb(size)))
 			goto alloc_failure;
@@ -260,16 +275,16 @@ static void ipt_ulog_packet(unsigned int hooknum,
 	if (ub->qlen >= loginfo->qthreshold) {
 		if (loginfo->qthreshold > 1)
 			nlh->nlmsg_type = NLMSG_DONE;
-		ulog_send(groupnum);
+		ulog_send(ulog, groupnum);
 	}
 out_unlock:
-	spin_unlock_bh(&ulog_lock);
+	spin_unlock_bh(&ulog->lock);
 
 	return;
 
 alloc_failure:
 	pr_debug("Error building netlink message\n");
-	spin_unlock_bh(&ulog_lock);
+	spin_unlock_bh(&ulog->lock);
 }
 
 static unsigned int
@@ -376,54 +391,43 @@ static struct nf_logger ipt_ulog_logger __read_mostly = {
 	.me		= THIS_MODULE,
 };
 
-static int __init ulog_tg_init(void)
+static int __net_init ulog_tg_net_init(struct net *net)
 {
-	int ret, i;
+	int i;
+	struct ulog_net *ulog = ulog_pernet(net);
 	struct netlink_kernel_cfg cfg = {
 		.groups	= ULOG_MAXNLGROUPS,
 	};
 
-	pr_debug("init module\n");
-
-	if (nlbufsiz > 128*1024) {
-		pr_warning("Netlink buffer has to be <= 128kB\n");
-		return -EINVAL;
-	}
-
+	spin_lock_init(&ulog->lock);
 	/* initialize ulog_buffers */
 	for (i = 0; i < ULOG_MAXNLGROUPS; i++)
-		setup_timer(&ulog_buffers[i].timer, ulog_timer, i);
+		setup_timer(&ulog->ulog_buffers[i].timer, ulog_timer, i);
 
-	nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, &cfg);
-	if (!nflognl)
+	ulog->nflognl = netlink_kernel_create(net, NETLINK_NFLOG, &cfg);
+	if (!ulog->nflognl)
 		return -ENOMEM;
 
-	ret = xt_register_target(&ulog_tg_reg);
-	if (ret < 0) {
-		netlink_kernel_release(nflognl);
-		return ret;
-	}
 	if (nflog)
-		nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger);
+		nf_log_set(net, NFPROTO_IPV4, &ipt_ulog_logger);
 
 	return 0;
 }
 
-static void __exit ulog_tg_exit(void)
+static void __net_exit ulog_tg_net_exit(struct net *net)
 {
 	ulog_buff_t *ub;
 	int i;
-
-	pr_debug("cleanup_module\n");
+	struct ulog_net *ulog = ulog_pernet(net);
 
 	if (nflog)
-		nf_log_unregister(&ipt_ulog_logger);
-	xt_unregister_target(&ulog_tg_reg);
-	netlink_kernel_release(nflognl);
+		nf_log_unset(net, &ipt_ulog_logger);
+
+	netlink_kernel_release(ulog->nflognl);
 
 	/* remove pending timers and free allocated skb's */
 	for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
-		ub = &ulog_buffers[i];
+		ub = &ulog->ulog_buffers[i];
 		pr_debug("timer is deleting\n");
 		del_timer(&ub->timer);
 
@@ -434,5 +438,50 @@ static void __exit ulog_tg_exit(void)
 	}
 }
 
+static struct pernet_operations ulog_tg_net_ops = {
+	.init = ulog_tg_net_init,
+	.exit = ulog_tg_net_exit,
+	.id   = &ulog_net_id,
+	.size = sizeof(struct ulog_net),
+};
+
+static int __init ulog_tg_init(void)
+{
+	int ret;
+	pr_debug("init module\n");
+
+	if (nlbufsiz > 128*1024) {
+		pr_warn("Netlink buffer has to be <= 128kB\n");
+		return -EINVAL;
+	}
+
+	ret = register_pernet_subsys(&ulog_tg_net_ops);
+	if (ret)
+		goto out_pernet;
+
+	ret = xt_register_target(&ulog_tg_reg);
+	if (ret < 0)
+		goto out_target;
+
+	if (nflog)
+		nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger);
+
+	return 0;
+
+out_target:
+	unregister_pernet_subsys(&ulog_tg_net_ops);
+out_pernet:
+	return ret;
+}
+
+static void __exit ulog_tg_exit(void)
+{
+	pr_debug("cleanup_module\n");
+	if (nflog)
+		nf_log_unregister(&ipt_ulog_logger);
+	xt_unregister_target(&ulog_tg_reg);
+	unregister_pernet_subsys(&ulog_tg_net_ops);
+}
+
 module_init(ulog_tg_init);
 module_exit(ulog_tg_exit);

From 9368a53c471b42a1bd99117d590ce2ccdc8dc3c2 Mon Sep 17 00:00:00 2001
From: Gao feng <gaofeng@cn.fujitsu.com>
Date: Sun, 24 Mar 2013 23:50:45 +0000
Subject: [PATCH 45/49] netfilter: nfnetlink_log: add net namespace support for
 nfnetlink_log

This patch makes /proc/net/netfilter/nfnetlink_log pernet.
Moreover, there's a pernet instance table and lock.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink_log.c | 177 ++++++++++++++++++++++------------
 1 file changed, 117 insertions(+), 60 deletions(-)

diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index b593fd1f8cab..8e7bf641e096 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -32,6 +32,7 @@
 #include <linux/slab.h>
 #include <net/sock.h>
 #include <net/netfilter/nf_log.h>
+#include <net/netns/generic.h>
 #include <net/netfilter/nfnetlink_log.h>
 
 #include <linux/atomic.h>
@@ -56,6 +57,7 @@ struct nfulnl_instance {
 	unsigned int qlen;		/* number of nlmsgs in skb */
 	struct sk_buff *skb;		/* pre-allocatd skb */
 	struct timer_list timer;
+	struct net *net;
 	struct user_namespace *peer_user_ns;	/* User namespace of the peer process */
 	int peer_portid;			/* PORTID of the peer process */
 
@@ -71,25 +73,34 @@ struct nfulnl_instance {
 	struct rcu_head rcu;
 };
 
-static DEFINE_SPINLOCK(instances_lock);
-static atomic_t global_seq;
-
 #define INSTANCE_BUCKETS	16
-static struct hlist_head instance_table[INSTANCE_BUCKETS];
 static unsigned int hash_init;
 
+static int nfnl_log_net_id __read_mostly;
+
+struct nfnl_log_net {
+	spinlock_t instances_lock;
+	struct hlist_head instance_table[INSTANCE_BUCKETS];
+	atomic_t global_seq;
+};
+
+static struct nfnl_log_net *nfnl_log_pernet(struct net *net)
+{
+	return net_generic(net, nfnl_log_net_id);
+}
+
 static inline u_int8_t instance_hashfn(u_int16_t group_num)
 {
 	return ((group_num & 0xff) % INSTANCE_BUCKETS);
 }
 
 static struct nfulnl_instance *
-__instance_lookup(u_int16_t group_num)
+__instance_lookup(struct nfnl_log_net *log, u_int16_t group_num)
 {
 	struct hlist_head *head;
 	struct nfulnl_instance *inst;
 
-	head = &instance_table[instance_hashfn(group_num)];
+	head = &log->instance_table[instance_hashfn(group_num)];
 	hlist_for_each_entry_rcu(inst, head, hlist) {
 		if (inst->group_num == group_num)
 			return inst;
@@ -104,12 +115,12 @@ instance_get(struct nfulnl_instance *inst)
 }
 
 static struct nfulnl_instance *
-instance_lookup_get(u_int16_t group_num)
+instance_lookup_get(struct nfnl_log_net *log, u_int16_t group_num)
 {
 	struct nfulnl_instance *inst;
 
 	rcu_read_lock_bh();
-	inst = __instance_lookup(group_num);
+	inst = __instance_lookup(log, group_num);
 	if (inst && !atomic_inc_not_zero(&inst->use))
 		inst = NULL;
 	rcu_read_unlock_bh();
@@ -119,7 +130,11 @@ instance_lookup_get(u_int16_t group_num)
 
 static void nfulnl_instance_free_rcu(struct rcu_head *head)
 {
-	kfree(container_of(head, struct nfulnl_instance, rcu));
+	struct nfulnl_instance *inst =
+		container_of(head, struct nfulnl_instance, rcu);
+
+	put_net(inst->net);
+	kfree(inst);
 	module_put(THIS_MODULE);
 }
 
@@ -133,13 +148,15 @@ instance_put(struct nfulnl_instance *inst)
 static void nfulnl_timer(unsigned long data);
 
 static struct nfulnl_instance *
-instance_create(u_int16_t group_num, int portid, struct user_namespace *user_ns)
+instance_create(struct net *net, u_int16_t group_num,
+		int portid, struct user_namespace *user_ns)
 {
 	struct nfulnl_instance *inst;
+	struct nfnl_log_net *log = nfnl_log_pernet(net);
 	int err;
 
-	spin_lock_bh(&instances_lock);
-	if (__instance_lookup(group_num)) {
+	spin_lock_bh(&log->instances_lock);
+	if (__instance_lookup(log, group_num)) {
 		err = -EEXIST;
 		goto out_unlock;
 	}
@@ -163,6 +180,7 @@ instance_create(u_int16_t group_num, int portid, struct user_namespace *user_ns)
 
 	setup_timer(&inst->timer, nfulnl_timer, (unsigned long)inst);
 
+	inst->net = get_net(net);
 	inst->peer_user_ns = user_ns;
 	inst->peer_portid = portid;
 	inst->group_num = group_num;
@@ -174,14 +192,15 @@ instance_create(u_int16_t group_num, int portid, struct user_namespace *user_ns)
 	inst->copy_range 	= NFULNL_COPY_RANGE_MAX;
 
 	hlist_add_head_rcu(&inst->hlist,
-		       &instance_table[instance_hashfn(group_num)]);
+		       &log->instance_table[instance_hashfn(group_num)]);
 
-	spin_unlock_bh(&instances_lock);
+
+	spin_unlock_bh(&log->instances_lock);
 
 	return inst;
 
 out_unlock:
-	spin_unlock_bh(&instances_lock);
+	spin_unlock_bh(&log->instances_lock);
 	return ERR_PTR(err);
 }
 
@@ -210,11 +229,12 @@ __instance_destroy(struct nfulnl_instance *inst)
 }
 
 static inline void
-instance_destroy(struct nfulnl_instance *inst)
+instance_destroy(struct nfnl_log_net *log,
+		 struct nfulnl_instance *inst)
 {
-	spin_lock_bh(&instances_lock);
+	spin_lock_bh(&log->instances_lock);
 	__instance_destroy(inst);
-	spin_unlock_bh(&instances_lock);
+	spin_unlock_bh(&log->instances_lock);
 }
 
 static int
@@ -336,7 +356,7 @@ __nfulnl_send(struct nfulnl_instance *inst)
 		if (!nlh)
 			goto out;
 	}
-	status = nfnetlink_unicast(inst->skb, &init_net, inst->peer_portid,
+	status = nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid,
 				   MSG_DONTWAIT);
 
 	inst->qlen = 0;
@@ -370,7 +390,8 @@ nfulnl_timer(unsigned long data)
 /* This is an inline function, we don't really care about a long
  * list of arguments */
 static inline int
-__build_packet_message(struct nfulnl_instance *inst,
+__build_packet_message(struct nfnl_log_net *log,
+			struct nfulnl_instance *inst,
 			const struct sk_buff *skb,
 			unsigned int data_len,
 			u_int8_t pf,
@@ -536,7 +557,7 @@ __build_packet_message(struct nfulnl_instance *inst,
 	/* global sequence number */
 	if ((inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) &&
 	    nla_put_be32(inst->skb, NFULA_SEQ_GLOBAL,
-			 htonl(atomic_inc_return(&global_seq))))
+			 htonl(atomic_inc_return(&log->global_seq))))
 		goto nla_put_failure;
 
 	if (data_len) {
@@ -592,13 +613,15 @@ nfulnl_log_packet(u_int8_t pf,
 	const struct nf_loginfo *li;
 	unsigned int qthreshold;
 	unsigned int plen;
+	struct net *net = dev_net(in ? in : out);
+	struct nfnl_log_net *log = nfnl_log_pernet(net);
 
 	if (li_user && li_user->type == NF_LOG_TYPE_ULOG)
 		li = li_user;
 	else
 		li = &default_loginfo;
 
-	inst = instance_lookup_get(li->u.ulog.group);
+	inst = instance_lookup_get(log, li->u.ulog.group);
 	if (!inst)
 		return;
 
@@ -680,7 +703,7 @@ nfulnl_log_packet(u_int8_t pf,
 
 	inst->qlen++;
 
-	__build_packet_message(inst, skb, data_len, pf,
+	__build_packet_message(log, inst, skb, data_len, pf,
 				hooknum, in, out, prefix, plen);
 
 	if (inst->qlen >= qthreshold)
@@ -709,24 +732,24 @@ nfulnl_rcv_nl_event(struct notifier_block *this,
 		   unsigned long event, void *ptr)
 {
 	struct netlink_notify *n = ptr;
+	struct nfnl_log_net *log = nfnl_log_pernet(n->net);
 
 	if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) {
 		int i;
 
 		/* destroy all instances for this portid */
-		spin_lock_bh(&instances_lock);
+		spin_lock_bh(&log->instances_lock);
 		for  (i = 0; i < INSTANCE_BUCKETS; i++) {
 			struct hlist_node *t2;
 			struct nfulnl_instance *inst;
-			struct hlist_head *head = &instance_table[i];
+			struct hlist_head *head = &log->instance_table[i];
 
 			hlist_for_each_entry_safe(inst, t2, head, hlist) {
-				if ((net_eq(n->net, &init_net)) &&
-				    (n->portid == inst->peer_portid))
+				if (n->portid == inst->peer_portid)
 					__instance_destroy(inst);
 			}
 		}
-		spin_unlock_bh(&instances_lock);
+		spin_unlock_bh(&log->instances_lock);
 	}
 	return NOTIFY_DONE;
 }
@@ -768,6 +791,7 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
 	struct nfulnl_instance *inst;
 	struct nfulnl_msg_config_cmd *cmd = NULL;
 	struct net *net = sock_net(ctnl);
+	struct nfnl_log_net *log = nfnl_log_pernet(net);
 	int ret = 0;
 
 	if (nfula[NFULA_CFG_CMD]) {
@@ -784,7 +808,7 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
 		}
 	}
 
-	inst = instance_lookup_get(group_num);
+	inst = instance_lookup_get(log, group_num);
 	if (inst && inst->peer_portid != NETLINK_CB(skb).portid) {
 		ret = -EPERM;
 		goto out_put;
@@ -798,7 +822,7 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
 				goto out_put;
 			}
 
-			inst = instance_create(group_num,
+			inst = instance_create(net, group_num,
 					       NETLINK_CB(skb).portid,
 					       sk_user_ns(NETLINK_CB(skb).ssk));
 			if (IS_ERR(inst)) {
@@ -812,7 +836,7 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
 				goto out;
 			}
 
-			instance_destroy(inst);
+			instance_destroy(log, inst);
 			goto out_put;
 		default:
 			ret = -ENOTSUPP;
@@ -895,55 +919,68 @@ static const struct nfnetlink_subsystem nfulnl_subsys = {
 
 #ifdef CONFIG_PROC_FS
 struct iter_state {
+	struct seq_net_private p;
 	unsigned int bucket;
 };
 
-static struct hlist_node *get_first(struct iter_state *st)
+static struct hlist_node *get_first(struct net *net, struct iter_state *st)
 {
+	struct nfnl_log_net *log;
 	if (!st)
 		return NULL;
 
+	log = nfnl_log_pernet(net);
+
 	for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) {
-		if (!hlist_empty(&instance_table[st->bucket]))
-			return rcu_dereference_bh(hlist_first_rcu(&instance_table[st->bucket]));
+		struct hlist_head *head = &log->instance_table[st->bucket];
+
+		if (!hlist_empty(head))
+			return rcu_dereference_bh(hlist_first_rcu(head));
 	}
 	return NULL;
 }
 
-static struct hlist_node *get_next(struct iter_state *st, struct hlist_node *h)
+static struct hlist_node *get_next(struct net *net, struct iter_state *st,
+				   struct hlist_node *h)
 {
 	h = rcu_dereference_bh(hlist_next_rcu(h));
 	while (!h) {
+		struct nfnl_log_net *log;
+		struct hlist_head *head;
+
 		if (++st->bucket >= INSTANCE_BUCKETS)
 			return NULL;
 
-		h = rcu_dereference_bh(hlist_first_rcu(&instance_table[st->bucket]));
+		log = nfnl_log_pernet(net);
+		head = &log->instance_table[st->bucket];
+		h = rcu_dereference_bh(hlist_first_rcu(head));
 	}
 	return h;
 }
 
-static struct hlist_node *get_idx(struct iter_state *st, loff_t pos)
+static struct hlist_node *get_idx(struct net *net, struct iter_state *st,
+				  loff_t pos)
 {
 	struct hlist_node *head;
-	head = get_first(st);
+	head = get_first(net, st);
 
 	if (head)
-		while (pos && (head = get_next(st, head)))
+		while (pos && (head = get_next(net, st, head)))
 			pos--;
 	return pos ? NULL : head;
 }
 
-static void *seq_start(struct seq_file *seq, loff_t *pos)
+static void *seq_start(struct seq_file *s, loff_t *pos)
 	__acquires(rcu_bh)
 {
 	rcu_read_lock_bh();
-	return get_idx(seq->private, *pos);
+	return get_idx(seq_file_net(s), s->private, *pos);
 }
 
 static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
 {
 	(*pos)++;
-	return get_next(s->private, v);
+	return get_next(seq_file_net(s), s->private, v);
 }
 
 static void seq_stop(struct seq_file *s, void *v)
@@ -972,8 +1009,8 @@ static const struct seq_operations nful_seq_ops = {
 
 static int nful_open(struct inode *inode, struct file *file)
 {
-	return seq_open_private(file, &nful_seq_ops,
-			sizeof(struct iter_state));
+	return seq_open_net(inode, file, &nful_seq_ops,
+			    sizeof(struct iter_state));
 }
 
 static const struct file_operations nful_file_ops = {
@@ -981,17 +1018,43 @@ static const struct file_operations nful_file_ops = {
 	.open	 = nful_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
-	.release = seq_release_private,
+	.release = seq_release_net,
 };
 
 #endif /* PROC_FS */
 
-static int __init nfnetlink_log_init(void)
+static int __net_init nfnl_log_net_init(struct net *net)
 {
-	int i, status = -ENOMEM;
+	unsigned int i;
+	struct nfnl_log_net *log = nfnl_log_pernet(net);
 
 	for (i = 0; i < INSTANCE_BUCKETS; i++)
-		INIT_HLIST_HEAD(&instance_table[i]);
+		INIT_HLIST_HEAD(&log->instance_table[i]);
+	spin_lock_init(&log->instances_lock);
+
+#ifdef CONFIG_PROC_FS
+	if (!proc_create("nfnetlink_log", 0440,
+			 net->nf.proc_netfilter, &nful_file_ops))
+		return -ENOMEM;
+#endif
+	return 0;
+}
+
+static void __net_exit nfnl_log_net_exit(struct net *net)
+{
+	remove_proc_entry("nfnetlink_log", net->nf.proc_netfilter);
+}
+
+static struct pernet_operations nfnl_log_net_ops = {
+	.init	= nfnl_log_net_init,
+	.exit	= nfnl_log_net_exit,
+	.id	= &nfnl_log_net_id,
+	.size	= sizeof(struct nfnl_log_net),
+};
+
+static int __init nfnetlink_log_init(void)
+{
+	int status = -ENOMEM;
 
 	/* it's not really all that important to have a random value, so
 	 * we can do this from the init function, even if there hasn't
@@ -1001,29 +1064,25 @@ static int __init nfnetlink_log_init(void)
 	netlink_register_notifier(&nfulnl_rtnl_notifier);
 	status = nfnetlink_subsys_register(&nfulnl_subsys);
 	if (status < 0) {
-		printk(KERN_ERR "log: failed to create netlink socket\n");
+		pr_err("log: failed to create netlink socket\n");
 		goto cleanup_netlink_notifier;
 	}
 
 	status = nf_log_register(NFPROTO_UNSPEC, &nfulnl_logger);
 	if (status < 0) {
-		printk(KERN_ERR "log: failed to register logger\n");
+		pr_err("log: failed to register logger\n");
 		goto cleanup_subsys;
 	}
 
-#ifdef CONFIG_PROC_FS
-	if (!proc_create("nfnetlink_log", 0440,
-			 proc_net_netfilter, &nful_file_ops)) {
-		status = -ENOMEM;
+	status = register_pernet_subsys(&nfnl_log_net_ops);
+	if (status < 0) {
+		pr_err("log: failed to register pernet ops\n");
 		goto cleanup_logger;
 	}
-#endif
 	return status;
 
-#ifdef CONFIG_PROC_FS
 cleanup_logger:
 	nf_log_unregister(&nfulnl_logger);
-#endif
 cleanup_subsys:
 	nfnetlink_subsys_unregister(&nfulnl_subsys);
 cleanup_netlink_notifier:
@@ -1033,10 +1092,8 @@ cleanup_netlink_notifier:
 
 static void __exit nfnetlink_log_fini(void)
 {
+	unregister_pernet_subsys(&nfnl_log_net_ops);
 	nf_log_unregister(&nfulnl_logger);
-#ifdef CONFIG_PROC_FS
-	remove_proc_entry("nfnetlink_log", proc_net_netfilter);
-#endif
 	nfnetlink_subsys_unregister(&nfulnl_subsys);
 	netlink_unregister_notifier(&nfulnl_rtnl_notifier);
 }

From 5b023fc8d8e0997e0b7ea6506d243afd5478c96e Mon Sep 17 00:00:00 2001
From: Gao feng <gaofeng@cn.fujitsu.com>
Date: Fri, 5 Apr 2013 17:04:35 +0200
Subject: [PATCH 46/49] netfilter: enable per netns support for nf_loggers

After this patch, all nf_loggers support net namespace. Still
xt_LOG and ebt_log require syslog netns support.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_log.c | 21 ---------------------
 1 file changed, 21 deletions(-)

diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 8d331dc9b1ea..388656d5a9ec 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -35,9 +35,6 @@ void nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger)
 {
 	const struct nf_logger *log;
 
-	if (!net_eq(net, &init_net))
-		return;
-
 	if (pf == NFPROTO_UNSPEC)
 		return;
 
@@ -56,9 +53,6 @@ void nf_log_unset(struct net *net, const struct nf_logger *logger)
 	int i;
 	const struct nf_logger *log;
 
-	if (!net_eq(net, &init_net))
-		return;
-
 	mutex_lock(&nf_log_mutex);
 	for (i = 0; i < NFPROTO_NUMPROTO; i++) {
 		log = rcu_dereference_protected(net->nf.nf_loggers[i],
@@ -94,7 +88,6 @@ int nf_log_register(u_int8_t pf, struct nf_logger *logger)
 
 	mutex_unlock(&nf_log_mutex);
 
-	nf_log_set(&init_net, pf, logger);
 	return 0;
 }
 EXPORT_SYMBOL(nf_log_register);
@@ -107,17 +100,12 @@ void nf_log_unregister(struct nf_logger *logger)
 	for (i = 0; i < NFPROTO_NUMPROTO; i++)
 		list_del(&logger->list[i]);
 	mutex_unlock(&nf_log_mutex);
-
-	nf_log_unset(&init_net, logger);
 }
 EXPORT_SYMBOL(nf_log_unregister);
 
 int nf_log_bind_pf(struct net *net, u_int8_t pf,
 		   const struct nf_logger *logger)
 {
-	if (!net_eq(net, &init_net))
-		return 0;
-
 	if (pf >= ARRAY_SIZE(net->nf.nf_loggers))
 		return -EINVAL;
 	mutex_lock(&nf_log_mutex);
@@ -133,9 +121,6 @@ EXPORT_SYMBOL(nf_log_bind_pf);
 
 void nf_log_unbind_pf(struct net *net, u_int8_t pf)
 {
-	if (!net_eq(net, &init_net))
-		return;
-
 	if (pf >= ARRAY_SIZE(net->nf.nf_loggers))
 		return;
 	mutex_lock(&nf_log_mutex);
@@ -157,9 +142,6 @@ void nf_log_packet(struct net *net,
 	char prefix[NF_LOG_PREFIXLEN];
 	const struct nf_logger *logger;
 
-	if (!net_eq(net, &init_net))
-		return;
-
 	rcu_read_lock();
 	logger = rcu_dereference(net->nf.nf_loggers[pf]);
 	if (logger) {
@@ -274,9 +256,6 @@ static int nf_log_proc_dostring(ctl_table *table, int write,
 	struct net *net = current->nsproxy->net_ns;
 
 	if (write) {
-		if (!net_eq(net, &init_net))
-			return -EPERM;
-
 		if (size > sizeof(buf))
 			size = sizeof(buf);
 		if (copy_from_user(buf, buffer, size))

From e817961048ecd12cf9cdfcec0062deb5f7970592 Mon Sep 17 00:00:00 2001
From: Gao feng <gaofeng@cn.fujitsu.com>
Date: Sun, 24 Mar 2013 23:50:47 +0000
Subject: [PATCH 47/49] netfilter: nfnetlink_queue: add net namespace support
 for nfnetlink_queue

This patch makes /proc/net/netfilter/nfnetlink_queue pernet.
Moreover, there's a pernet instance table and lock.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink_queue_core.c | 173 +++++++++++++++++----------
 1 file changed, 113 insertions(+), 60 deletions(-)

diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c
index 19845e34f70f..d20c52cc80c0 100644
--- a/net/netfilter/nfnetlink_queue_core.c
+++ b/net/netfilter/nfnetlink_queue_core.c
@@ -30,6 +30,7 @@
 #include <linux/list.h>
 #include <net/sock.h>
 #include <net/netfilter/nf_queue.h>
+#include <net/netns/generic.h>
 #include <net/netfilter/nfnetlink_queue.h>
 
 #include <linux/atomic.h>
@@ -66,10 +67,18 @@ struct nfqnl_instance {
 
 typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long);
 
-static DEFINE_SPINLOCK(instances_lock);
+static int nfnl_queue_net_id __read_mostly;
 
 #define INSTANCE_BUCKETS	16
-static struct hlist_head instance_table[INSTANCE_BUCKETS] __read_mostly;
+struct nfnl_queue_net {
+	spinlock_t instances_lock;
+	struct hlist_head instance_table[INSTANCE_BUCKETS];
+};
+
+static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net)
+{
+	return net_generic(net, nfnl_queue_net_id);
+}
 
 static inline u_int8_t instance_hashfn(u_int16_t queue_num)
 {
@@ -77,12 +86,12 @@ static inline u_int8_t instance_hashfn(u_int16_t queue_num)
 }
 
 static struct nfqnl_instance *
-instance_lookup(u_int16_t queue_num)
+instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num)
 {
 	struct hlist_head *head;
 	struct nfqnl_instance *inst;
 
-	head = &instance_table[instance_hashfn(queue_num)];
+	head = &q->instance_table[instance_hashfn(queue_num)];
 	hlist_for_each_entry_rcu(inst, head, hlist) {
 		if (inst->queue_num == queue_num)
 			return inst;
@@ -91,14 +100,15 @@ instance_lookup(u_int16_t queue_num)
 }
 
 static struct nfqnl_instance *
-instance_create(u_int16_t queue_num, int portid)
+instance_create(struct nfnl_queue_net *q, u_int16_t queue_num,
+		int portid)
 {
 	struct nfqnl_instance *inst;
 	unsigned int h;
 	int err;
 
-	spin_lock(&instances_lock);
-	if (instance_lookup(queue_num)) {
+	spin_lock(&q->instances_lock);
+	if (instance_lookup(q, queue_num)) {
 		err = -EEXIST;
 		goto out_unlock;
 	}
@@ -123,16 +133,16 @@ instance_create(u_int16_t queue_num, int portid)
 	}
 
 	h = instance_hashfn(queue_num);
-	hlist_add_head_rcu(&inst->hlist, &instance_table[h]);
+	hlist_add_head_rcu(&inst->hlist, &q->instance_table[h]);
 
-	spin_unlock(&instances_lock);
+	spin_unlock(&q->instances_lock);
 
 	return inst;
 
 out_free:
 	kfree(inst);
 out_unlock:
-	spin_unlock(&instances_lock);
+	spin_unlock(&q->instances_lock);
 	return ERR_PTR(err);
 }
 
@@ -158,11 +168,11 @@ __instance_destroy(struct nfqnl_instance *inst)
 }
 
 static void
-instance_destroy(struct nfqnl_instance *inst)
+instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst)
 {
-	spin_lock(&instances_lock);
+	spin_lock(&q->instances_lock);
 	__instance_destroy(inst);
-	spin_unlock(&instances_lock);
+	spin_unlock(&q->instances_lock);
 }
 
 static inline void
@@ -473,9 +483,12 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
 	int err = -ENOBUFS;
 	__be32 *packet_id_ptr;
 	int failopen = 0;
+	struct net *net = dev_net(entry->indev ?
+				  entry->indev : entry->outdev);
+	struct nfnl_queue_net *q = nfnl_queue_pernet(net);
 
 	/* rcu_read_lock()ed by nf_hook_slow() */
-	queue = instance_lookup(queuenum);
+	queue = instance_lookup(q, queuenum);
 	if (!queue) {
 		err = -ESRCH;
 		goto err_out;
@@ -512,7 +525,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
 	*packet_id_ptr = htonl(entry->id);
 
 	/* nfnetlink_unicast will either free the nskb or add it to a socket */
-	err = nfnetlink_unicast(nskb, &init_net, queue->peer_portid, MSG_DONTWAIT);
+	err = nfnetlink_unicast(nskb, net, queue->peer_portid, MSG_DONTWAIT);
 	if (err < 0) {
 		queue->queue_user_dropped++;
 		goto err_out_unlock;
@@ -625,15 +638,16 @@ dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex)
 /* drop all packets with either indev or outdev == ifindex from all queue
  * instances */
 static void
-nfqnl_dev_drop(int ifindex)
+nfqnl_dev_drop(struct net *net, int ifindex)
 {
 	int i;
+	struct nfnl_queue_net *q = nfnl_queue_pernet(net);
 
 	rcu_read_lock();
 
 	for (i = 0; i < INSTANCE_BUCKETS; i++) {
 		struct nfqnl_instance *inst;
-		struct hlist_head *head = &instance_table[i];
+		struct hlist_head *head = &q->instance_table[i];
 
 		hlist_for_each_entry_rcu(inst, head, hlist)
 			nfqnl_flush(inst, dev_cmp, ifindex);
@@ -650,12 +664,9 @@ nfqnl_rcv_dev_event(struct notifier_block *this,
 {
 	struct net_device *dev = ptr;
 
-	if (!net_eq(dev_net(dev), &init_net))
-		return NOTIFY_DONE;
-
 	/* Drop any packets associated with the downed device */
 	if (event == NETDEV_DOWN)
-		nfqnl_dev_drop(dev->ifindex);
+		nfqnl_dev_drop(dev_net(dev), dev->ifindex);
 	return NOTIFY_DONE;
 }
 
@@ -668,24 +679,24 @@ nfqnl_rcv_nl_event(struct notifier_block *this,
 		   unsigned long event, void *ptr)
 {
 	struct netlink_notify *n = ptr;
+	struct nfnl_queue_net *q = nfnl_queue_pernet(n->net);
 
 	if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) {
 		int i;
 
 		/* destroy all instances for this portid */
-		spin_lock(&instances_lock);
+		spin_lock(&q->instances_lock);
 		for (i = 0; i < INSTANCE_BUCKETS; i++) {
 			struct hlist_node *t2;
 			struct nfqnl_instance *inst;
-			struct hlist_head *head = &instance_table[i];
+			struct hlist_head *head = &q->instance_table[i];
 
 			hlist_for_each_entry_safe(inst, t2, head, hlist) {
-				if ((n->net == &init_net) &&
-				    (n->portid == inst->peer_portid))
+				if (n->portid == inst->peer_portid)
 					__instance_destroy(inst);
 			}
 		}
-		spin_unlock(&instances_lock);
+		spin_unlock(&q->instances_lock);
 	}
 	return NOTIFY_DONE;
 }
@@ -706,11 +717,12 @@ static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = {
 	[NFQA_MARK]		= { .type = NLA_U32 },
 };
 
-static struct nfqnl_instance *verdict_instance_lookup(u16 queue_num, int nlportid)
+static struct nfqnl_instance *
+verdict_instance_lookup(struct nfnl_queue_net *q, u16 queue_num, int nlportid)
 {
 	struct nfqnl_instance *queue;
 
-	queue = instance_lookup(queue_num);
+	queue = instance_lookup(q, queue_num);
 	if (!queue)
 		return ERR_PTR(-ENODEV);
 
@@ -754,7 +766,11 @@ nfqnl_recv_verdict_batch(struct sock *ctnl, struct sk_buff *skb,
 	LIST_HEAD(batch_list);
 	u16 queue_num = ntohs(nfmsg->res_id);
 
-	queue = verdict_instance_lookup(queue_num, NETLINK_CB(skb).portid);
+	struct net *net = sock_net(ctnl);
+	struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+
+	queue = verdict_instance_lookup(q, queue_num,
+					NETLINK_CB(skb).portid);
 	if (IS_ERR(queue))
 		return PTR_ERR(queue);
 
@@ -802,10 +818,13 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
 	enum ip_conntrack_info uninitialized_var(ctinfo);
 	struct nf_conn *ct = NULL;
 
-	queue = instance_lookup(queue_num);
-	if (!queue)
+	struct net *net = sock_net(ctnl);
+	struct nfnl_queue_net *q = nfnl_queue_pernet(net);
 
-	queue = verdict_instance_lookup(queue_num, NETLINK_CB(skb).portid);
+	queue = instance_lookup(q, queue_num);
+	if (!queue)
+		queue = verdict_instance_lookup(q, queue_num,
+						NETLINK_CB(skb).portid);
 	if (IS_ERR(queue))
 		return PTR_ERR(queue);
 
@@ -869,6 +888,8 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
 	u_int16_t queue_num = ntohs(nfmsg->res_id);
 	struct nfqnl_instance *queue;
 	struct nfqnl_msg_config_cmd *cmd = NULL;
+	struct net *net = sock_net(ctnl);
+	struct nfnl_queue_net *q = nfnl_queue_pernet(net);
 	int ret = 0;
 
 	if (nfqa[NFQA_CFG_CMD]) {
@@ -882,7 +903,7 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
 	}
 
 	rcu_read_lock();
-	queue = instance_lookup(queue_num);
+	queue = instance_lookup(q, queue_num);
 	if (queue && queue->peer_portid != NETLINK_CB(skb).portid) {
 		ret = -EPERM;
 		goto err_out_unlock;
@@ -895,7 +916,8 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
 				ret = -EBUSY;
 				goto err_out_unlock;
 			}
-			queue = instance_create(queue_num, NETLINK_CB(skb).portid);
+			queue = instance_create(q, queue_num,
+						NETLINK_CB(skb).portid);
 			if (IS_ERR(queue)) {
 				ret = PTR_ERR(queue);
 				goto err_out_unlock;
@@ -906,7 +928,7 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
 				ret = -ENODEV;
 				goto err_out_unlock;
 			}
-			instance_destroy(queue);
+			instance_destroy(q, queue);
 			break;
 		case NFQNL_CFG_CMD_PF_BIND:
 		case NFQNL_CFG_CMD_PF_UNBIND:
@@ -1000,19 +1022,24 @@ static const struct nfnetlink_subsystem nfqnl_subsys = {
 
 #ifdef CONFIG_PROC_FS
 struct iter_state {
+	struct seq_net_private p;
 	unsigned int bucket;
 };
 
 static struct hlist_node *get_first(struct seq_file *seq)
 {
 	struct iter_state *st = seq->private;
+	struct net *net;
+	struct nfnl_queue_net *q;
 
 	if (!st)
 		return NULL;
 
+	net = seq_file_net(seq);
+	q = nfnl_queue_pernet(net);
 	for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) {
-		if (!hlist_empty(&instance_table[st->bucket]))
-			return instance_table[st->bucket].first;
+		if (!hlist_empty(&q->instance_table[st->bucket]))
+			return q->instance_table[st->bucket].first;
 	}
 	return NULL;
 }
@@ -1020,13 +1047,17 @@ static struct hlist_node *get_first(struct seq_file *seq)
 static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h)
 {
 	struct iter_state *st = seq->private;
+	struct net *net = seq_file_net(seq);
 
 	h = h->next;
 	while (!h) {
+		struct nfnl_queue_net *q;
+
 		if (++st->bucket >= INSTANCE_BUCKETS)
 			return NULL;
 
-		h = instance_table[st->bucket].first;
+		q = nfnl_queue_pernet(net);
+		h = q->instance_table[st->bucket].first;
 	}
 	return h;
 }
@@ -1042,11 +1073,11 @@ static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos)
 	return pos ? NULL : head;
 }
 
-static void *seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(instances_lock)
+static void *seq_start(struct seq_file *s, loff_t *pos)
+	__acquires(nfnl_queue_pernet(seq_file_net(s))->instances_lock)
 {
-	spin_lock(&instances_lock);
-	return get_idx(seq, *pos);
+	spin_lock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock);
+	return get_idx(s, *pos);
 }
 
 static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
@@ -1056,9 +1087,9 @@ static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
 }
 
 static void seq_stop(struct seq_file *s, void *v)
-	__releases(instances_lock)
+	__releases(nfnl_queue_pernet(seq_file_net(s))->instances_lock)
 {
-	spin_unlock(&instances_lock);
+	spin_unlock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock);
 }
 
 static int seq_show(struct seq_file *s, void *v)
@@ -1082,7 +1113,7 @@ static const struct seq_operations nfqnl_seq_ops = {
 
 static int nfqnl_open(struct inode *inode, struct file *file)
 {
-	return seq_open_private(file, &nfqnl_seq_ops,
+	return seq_open_net(inode, file, &nfqnl_seq_ops,
 			sizeof(struct iter_state));
 }
 
@@ -1091,39 +1122,63 @@ static const struct file_operations nfqnl_file_ops = {
 	.open	 = nfqnl_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
-	.release = seq_release_private,
+	.release = seq_release_net,
 };
 
 #endif /* PROC_FS */
 
-static int __init nfnetlink_queue_init(void)
+static int __net_init nfnl_queue_net_init(struct net *net)
 {
-	int i, status = -ENOMEM;
+	unsigned int i;
+	struct nfnl_queue_net *q = nfnl_queue_pernet(net);
 
 	for (i = 0; i < INSTANCE_BUCKETS; i++)
-		INIT_HLIST_HEAD(&instance_table[i]);
+		INIT_HLIST_HEAD(&q->instance_table[i]);
+
+	spin_lock_init(&q->instances_lock);
+
+#ifdef CONFIG_PROC_FS
+	if (!proc_create("nfnetlink_queue", 0440,
+			 net->nf.proc_netfilter, &nfqnl_file_ops))
+		return -ENOMEM;
+#endif
+	return 0;
+}
+
+static void __net_exit nfnl_queue_net_exit(struct net *net)
+{
+	remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter);
+}
+
+static struct pernet_operations nfnl_queue_net_ops = {
+	.init	= nfnl_queue_net_init,
+	.exit	= nfnl_queue_net_exit,
+	.id	= &nfnl_queue_net_id,
+	.size	= sizeof(struct nfnl_queue_net),
+};
+
+static int __init nfnetlink_queue_init(void)
+{
+	int status = -ENOMEM;
 
 	netlink_register_notifier(&nfqnl_rtnl_notifier);
 	status = nfnetlink_subsys_register(&nfqnl_subsys);
 	if (status < 0) {
-		printk(KERN_ERR "nf_queue: failed to create netlink socket\n");
+		pr_err("nf_queue: failed to create netlink socket\n");
 		goto cleanup_netlink_notifier;
 	}
 
-#ifdef CONFIG_PROC_FS
-	if (!proc_create("nfnetlink_queue", 0440,
-			 proc_net_netfilter, &nfqnl_file_ops))
+	status = register_pernet_subsys(&nfnl_queue_net_ops);
+	if (status < 0) {
+		pr_err("nf_queue: failed to register pernet ops\n");
 		goto cleanup_subsys;
-#endif
-
+	}
 	register_netdevice_notifier(&nfqnl_dev_notifier);
 	nf_register_queue_handler(&nfqh);
 	return status;
 
-#ifdef CONFIG_PROC_FS
 cleanup_subsys:
 	nfnetlink_subsys_unregister(&nfqnl_subsys);
-#endif
 cleanup_netlink_notifier:
 	netlink_unregister_notifier(&nfqnl_rtnl_notifier);
 	return status;
@@ -1133,9 +1188,7 @@ static void __exit nfnetlink_queue_fini(void)
 {
 	nf_unregister_queue_handler();
 	unregister_netdevice_notifier(&nfqnl_dev_notifier);
-#ifdef CONFIG_PROC_FS
-	remove_proc_entry("nfnetlink_queue", proc_net_netfilter);
-#endif
+	unregister_pernet_subsys(&nfnl_queue_net_ops);
 	nfnetlink_subsys_unregister(&nfqnl_subsys);
 	netlink_unregister_notifier(&nfqnl_rtnl_notifier);
 

From 12202fa7573d32aa0915cde6e8fab4c86b63ca2c Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 5 Apr 2013 19:40:10 +0200
Subject: [PATCH 48/49] netfilter: remove unneeded variable proc_net_netfilter

Now that this supports net namespace for nflog and nfqueue,
we can remove the global proc_net_netfilter which has no
clients anymore.

Based on patch from Gao feng.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h |  5 -----
 net/netfilter/core.c      | 16 ++++------------
 2 files changed, 4 insertions(+), 17 deletions(-)

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index ee142846f56a..0060fde3160e 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -289,11 +289,6 @@ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family)
 #endif
 }
 
-#ifdef CONFIG_PROC_FS
-#include <linux/proc_fs.h>
-extern struct proc_dir_entry *proc_net_netfilter;
-#endif
-
 #else /* !CONFIG_NETFILTER */
 #define NF_HOOK(pf, hook, skb, indev, outdev, okfn) (okfn)(skb)
 #define NF_HOOK_COND(pf, hook, skb, indev, outdev, okfn, cond) (okfn)(skb)
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index b085184d9b45..7d97302f7c07 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -276,23 +276,15 @@ void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *);
 EXPORT_SYMBOL(nf_nat_decode_session_hook);
 #endif
 
-#ifdef CONFIG_PROC_FS
-struct proc_dir_entry *proc_net_netfilter;
-EXPORT_SYMBOL(proc_net_netfilter);
-#endif
-
 static int __net_init netfilter_net_init(struct net *net)
 {
 #ifdef CONFIG_PROC_FS
 	net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter",
 						net->proc_net);
-	if (net_eq(net, &init_net)) {
-		if (!net->nf.proc_netfilter)
-			return -ENOMEM;
-		else
-			proc_net_netfilter = net->nf.proc_netfilter;
-	} else if (!net->nf.proc_netfilter) {
-		pr_err("cannot create netfilter proc entry");
+	if (!net->nf.proc_netfilter) {
+		if (!net_eq(net, &init_net))
+			pr_err("cannot create netfilter proc entry");
+
 		return -ENOMEM;
 	}
 #endif

From b8dd6a223eb86d537c2c6d8d28916c1f0ba3ea3c Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Sun, 24 Mar 2013 15:20:51 +0000
Subject: [PATCH 49/49] netfilter: implement RFC3168 5.3 (ecn protection) for
 ipv6 fragmentation handling

This change brings netfilter reassembly logic on par with
reassembly.c. The corresponding change in net-next is
(eec2e61 ipv6: implement RFC3168 5.3 (ecn protection) for
ipv6 fragmentation handling)

Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Jesper Dangaard Brouer <jbrouer@redhat.com>
Cc: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/nf_conntrack_reasm.c | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 6700069949dd..dffdc1a389c5 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -41,6 +41,7 @@
 #include <net/rawv6.h>
 #include <net/ndisc.h>
 #include <net/addrconf.h>
+#include <net/inet_ecn.h>
 #include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
 #include <linux/sysctl.h>
 #include <linux/netfilter.h>
@@ -138,6 +139,11 @@ static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net)
 }
 #endif
 
+static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
+{
+	return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK);
+}
+
 static unsigned int nf_hashfn(struct inet_frag_queue *q)
 {
 	const struct frag_queue *nq;
@@ -166,7 +172,7 @@ static void nf_ct_frag6_expire(unsigned long data)
 /* Creation primitives. */
 static inline struct frag_queue *fq_find(struct net *net, __be32 id,
 					 u32 user, struct in6_addr *src,
-					 struct in6_addr *dst)
+					 struct in6_addr *dst, u8 ecn)
 {
 	struct inet_frag_queue *q;
 	struct ip6_create_arg arg;
@@ -176,6 +182,7 @@ static inline struct frag_queue *fq_find(struct net *net, __be32 id,
 	arg.user = user;
 	arg.src = src;
 	arg.dst = dst;
+	arg.ecn = ecn;
 
 	read_lock_bh(&nf_frags.lock);
 	hash = inet6_hash_frag(id, src, dst, nf_frags.rnd);
@@ -196,6 +203,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
 	struct sk_buff *prev, *next;
 	unsigned int payload_len;
 	int offset, end;
+	u8 ecn;
 
 	if (fq->q.last_in & INET_FRAG_COMPLETE) {
 		pr_debug("Already completed\n");
@@ -213,6 +221,8 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
 		return -1;
 	}
 
+	ecn = ip6_frag_ecn(ipv6_hdr(skb));
+
 	if (skb->ip_summed == CHECKSUM_COMPLETE) {
 		const unsigned char *nh = skb_network_header(skb);
 		skb->csum = csum_sub(skb->csum,
@@ -317,6 +327,7 @@ found:
 	}
 	fq->q.stamp = skb->tstamp;
 	fq->q.meat += skb->len;
+	fq->ecn |= ecn;
 	if (payload_len > fq->q.max_size)
 		fq->q.max_size = payload_len;
 	add_frag_mem_limit(&fq->q, skb->truesize);
@@ -352,12 +363,17 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev)
 {
 	struct sk_buff *fp, *op, *head = fq->q.fragments;
 	int    payload_len;
+	u8 ecn;
 
 	inet_frag_kill(&fq->q, &nf_frags);
 
 	WARN_ON(head == NULL);
 	WARN_ON(NFCT_FRAG6_CB(head)->offset != 0);
 
+	ecn = ip_frag_ecn_table[fq->ecn];
+	if (unlikely(ecn == 0xff))
+		goto out_fail;
+
 	/* Unfragmented part is taken from the first segment. */
 	payload_len = ((head->data - skb_network_header(head)) -
 		       sizeof(struct ipv6hdr) + fq->q.len -
@@ -428,6 +444,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev)
 	head->dev = dev;
 	head->tstamp = fq->q.stamp;
 	ipv6_hdr(head)->payload_len = htons(payload_len);
+	ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn);
 	IP6CB(head)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size;
 
 	/* Yes, and fold redundant checksum back. 8) */
@@ -572,7 +589,8 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user)
 	inet_frag_evictor(&net->nf_frag.frags, &nf_frags, false);
 	local_bh_enable();
 
-	fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr);
+	fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr,
+		     ip6_frag_ecn(hdr));
 	if (fq == NULL) {
 		pr_debug("Can't find and can't create new queue\n");
 		goto ret_orig;