From 5142967ab524eb8e5c1f6122e46e2df81bae178b Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 24 May 2019 22:26:34 +0200
Subject: [PATCH 01/24] netfilter: nf_tables: fix module autoload with inet
 family

Use MODULE_ALIAS_NFT_EXPR() to make happy the inet family with nat.

Fixes: 63ce3940f3ab ("netfilter: nft_redir: add inet support")
Fixes: 071657d2c38c ("netfilter: nft_masq: add inet support")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_masq.c  | 3 +--
 net/netfilter/nft_redir.c | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
index 86fd90085eaf..8c1612d6bc2c 100644
--- a/net/netfilter/nft_masq.c
+++ b/net/netfilter/nft_masq.c
@@ -307,5 +307,4 @@ module_exit(nft_masq_module_exit);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
-MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "masq");
-MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "masq");
+MODULE_ALIAS_NFT_EXPR("masq");
diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c
index da74fdc4a684..8787e9f8ed71 100644
--- a/net/netfilter/nft_redir.c
+++ b/net/netfilter/nft_redir.c
@@ -294,5 +294,4 @@ module_exit(nft_redir_module_exit);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
-MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "redir");
-MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "redir");
+MODULE_ALIAS_NFT_EXPR("nat");

From a0d56cb911ca301de81735f1d73c2aab424654ba Mon Sep 17 00:00:00 2001
From: Guillaume Nault <gnault@redhat.com>
Date: Sun, 2 Jun 2019 15:13:47 +0200
Subject: [PATCH 02/24] netfilter: ipv6: nf_defrag: fix leakage of unqueued
 fragments

With commit 997dd9647164 ("net: IP6 defrag: use rbtrees in
nf_conntrack_reasm.c"), nf_ct_frag6_reasm() is now called from
nf_ct_frag6_queue(). With this change, nf_ct_frag6_queue() can fail
after the skb has been added to the fragment queue and
nf_ct_frag6_gather() was adapted to handle this case.

But nf_ct_frag6_queue() can still fail before the fragment has been
queued. nf_ct_frag6_gather() can't handle this case anymore, because it
has no way to know if nf_ct_frag6_queue() queued the fragment before
failing. If it didn't, the skb is lost as the error code is overwritten
with -EINPROGRESS.

Fix this by setting -EINPROGRESS directly in nf_ct_frag6_queue(), so
that nf_ct_frag6_gather() can propagate the error as is.

Fixes: 997dd9647164 ("net: IP6 defrag: use rbtrees in nf_conntrack_reasm.c")
Signed-off-by: Guillaume Nault <gnault@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/nf_conntrack_reasm.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 3de0e9b0a482..5b3f65e29b6f 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -293,7 +293,11 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
 		skb->_skb_refdst = 0UL;
 		err = nf_ct_frag6_reasm(fq, skb, prev, dev);
 		skb->_skb_refdst = orefdst;
-		return err;
+
+		/* After queue has assumed skb ownership, only 0 or
+		 * -EINPROGRESS must be returned.
+		 */
+		return err ? -EINPROGRESS : 0;
 	}
 
 	skb_dst_drop(skb);
@@ -480,12 +484,6 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
 		ret = 0;
 	}
 
-	/* after queue has assumed skb ownership, only 0 or -EINPROGRESS
-	 * must be returned.
-	 */
-	if (ret)
-		ret = -EINPROGRESS;
-
 	spin_unlock_bh(&fq->q.lock);
 	inet_frag_put(&fq->q);
 	return ret;

From 8a3dca632538c550930ce8bafa8c906b130d35cf Mon Sep 17 00:00:00 2001
From: Guillaume Nault <gnault@redhat.com>
Date: Thu, 6 Jun 2019 18:04:00 +0200
Subject: [PATCH 03/24] netfilter: ipv6: nf_defrag: accept duplicate fragments
 again

When fixing the skb leak introduced by the conversion to rbtree, I
forgot about the special case of duplicate fragments. The condition
under the 'insert_error' label isn't effective anymore as
nf_ct_frg6_gather() doesn't override the returned value anymore. So
duplicate fragments now get NF_DROP verdict.

To accept duplicate fragments again, handle them specially as soon as
inet_frag_queue_insert() reports them. Return -EINPROGRESS which will
translate to NF_STOLEN verdict, like any accepted fragment. However,
such packets don't carry any new information and aren't queued, so we
just drop them immediately.

Fixes: a0d56cb911ca ("netfilter: ipv6: nf_defrag: fix leakage of unqueued fragments")
Signed-off-by: Guillaume Nault <gnault@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/nf_conntrack_reasm.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 5b3f65e29b6f..8951de8b568f 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -265,8 +265,14 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
 
 	prev = fq->q.fragments_tail;
 	err = inet_frag_queue_insert(&fq->q, skb, offset, end);
-	if (err)
+	if (err) {
+		if (err == IPFRAG_DUP) {
+			/* No error for duplicates, pretend they got queued. */
+			kfree_skb(skb);
+			return -EINPROGRESS;
+		}
 		goto insert_error;
+	}
 
 	if (dev)
 		fq->iif = dev->ifindex;
@@ -304,8 +310,6 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
 	return -EINPROGRESS;
 
 insert_error:
-	if (err == IPFRAG_DUP)
-		goto err;
 	inet_frag_kill(&fq->q);
 err:
 	skb_dst_drop(skb);

From cf18cecca911c0db96b868072665347efe6df46f Mon Sep 17 00:00:00 2001
From: "Mauro S. M. Rodrigues" <maurosr@linux.vnet.ibm.com>
Date: Thu, 13 Jun 2019 16:25:40 -0300
Subject: [PATCH 04/24] bnx2x: Check if transceiver implements DDM before
 access

Some transceivers may comply with SFF-8472 even though they do not
implement the Digital Diagnostic Monitoring (DDM) interface described in
the spec. The existence of such area is specified by the 6th bit of byte
92, set to 1 if implemented.

Currently, without checking this bit, bnx2x fails trying to read sfp
module's EEPROM with the follow message:

ethtool -m enP5p1s0f1
Cannot get Module EEPROM data: Input/output error

Because it fails to read the additional 256 bytes in which it is assumed
to exist the DDM data.

This issue was noticed using a Mellanox Passive DAC PN 01FT738. The EEPROM
data was confirmed by Mellanox as correct and similar to other Passive
DACs from other manufacturers.

Signed-off-by: Mauro S. M. Rodrigues <maurosr@linux.vnet.ibm.com>
Acked-by: Sudarsana Reddy Kalluru <skalluru@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c | 3 ++-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.h    | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
index 0745cccd416d..51fc845de31a 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
@@ -1609,7 +1609,8 @@ static int bnx2x_get_module_info(struct net_device *dev,
 	}
 
 	if (!sff8472_comp ||
-	    (diag_type & SFP_EEPROM_DIAG_ADDR_CHANGE_REQ)) {
+	    (diag_type & SFP_EEPROM_DIAG_ADDR_CHANGE_REQ) ||
+	    !(diag_type & SFP_EEPROM_DDM_IMPLEMENTED)) {
 		modinfo->type = ETH_MODULE_SFF_8079;
 		modinfo->eeprom_len = ETH_MODULE_SFF_8079_LEN;
 	} else {
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.h
index b7d251108c19..7115f5025664 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.h
@@ -62,6 +62,7 @@
 #define SFP_EEPROM_DIAG_TYPE_ADDR		0x5c
 #define SFP_EEPROM_DIAG_TYPE_SIZE		1
 #define SFP_EEPROM_DIAG_ADDR_CHANGE_REQ		(1<<2)
+#define SFP_EEPROM_DDM_IMPLEMENTED		(1<<6)
 #define SFP_EEPROM_SFF_8472_COMP_ADDR		0x5e
 #define SFP_EEPROM_SFF_8472_COMP_SIZE		1
 

From d0f84d0856c11fbafadae3d580f6a9c98d818ccd Mon Sep 17 00:00:00 2001
From: Tuong Lien <tuong.t.lien@dektech.com.au>
Date: Mon, 17 Jun 2019 11:56:12 +0700
Subject: [PATCH 05/24] tipc: fix issues with early FAILOVER_MSG from peer

It appears that a FAILOVER_MSG can come from peer even when the failure
link is resetting (i.e. just after the 'node_write_unlock()'...). This
means the failover procedure on the node has not been started yet.
The situation is as follows:

         node1                                node2
  linkb          linka                  linka        linkb
    |              |                      |            |
    |              |                      x failure    |
    |              |                  RESETTING        |
    |              |                      |            |
    |              x failure            RESET          |
    |          RESETTING             FAILINGOVER       |
    |              |   (FAILOVER_MSG)     |            |
    |<-------------------------------------------------|
    | *FAILINGOVER |                      |            |
    |              | (dummy FAILOVER_MSG) |            |
    |------------------------------------------------->|
    |            RESET                    |            | FAILOVER_END
    |         FAILINGOVER               RESET          |
    .              .                      .            .
    .              .                      .            .
    .              .                      .            .

Once this happens, the link failover procedure will be triggered
wrongly on the receiving node since the node isn't in FAILINGOVER state
but then another link failover will be carried out.
The consequences are:

1) A peer might get stuck in FAILINGOVER state because the 'sync_point'
was set, reset and set incorrectly, the criteria to end the failover
would not be met, it could keep waiting for a message that has already
received.

2) The early FAILOVER_MSG(s) could be queued in the link failover
deferdq but would be purged or not pulled out because the 'drop_point'
was not set correctly.

3) The early FAILOVER_MSG(s) could be dropped too.

4) The dummy FAILOVER_MSG could make the peer leaving FAILINGOVER state
shortly, but later on it would be restarted.

The same situation can also happen when the link is in PEER_RESET state
and a FAILOVER_MSG arrives.

The commit resolves the issues by forcing the link down immediately, so
the failover procedure will be started normally (which is the same as
when receiving a FAILOVER_MSG and the link is in up state).

Also, the function "tipc_node_link_failover()" is toughen to avoid such
a situation from happening.

Acked-by: Jon Maloy <jon.maloy@ericsson.se>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/link.c |  1 -
 net/tipc/node.c | 10 +++++++---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/tipc/link.c b/net/tipc/link.c
index f5cd986e1e50..2050fd386642 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -1728,7 +1728,6 @@ void tipc_link_failover_prepare(struct tipc_link *l, struct tipc_link *tnl,
 	 * node has entered SELF_DOWN_PEER_LEAVING and both peer nodes
 	 * would have to start over from scratch instead.
 	 */
-	WARN_ON(l && tipc_link_is_up(l));
 	tnl->drop_point = 1;
 	tnl->failover_reasm_skb = NULL;
 
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 9e106d3ed187..550581d47d51 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -766,9 +766,9 @@ static void tipc_node_link_up(struct tipc_node *n, int bearer_id,
  *	   disturbance, wrong session, etc.)
  *	3. Link <1B-2B> up
  *	4. Link endpoint 2A down (e.g. due to link tolerance timeout)
- *	5. Node B starts failover onto link <1B-2B>
+ *	5. Node 2 starts failover onto link <1B-2B>
  *
- *	==> Node A does never start link/node failover!
+ *	==> Node 1 does never start link/node failover!
  *
  * @n: tipc node structure
  * @l: link peer endpoint failingover (- can be NULL)
@@ -783,6 +783,10 @@ static void tipc_node_link_failover(struct tipc_node *n, struct tipc_link *l,
 	if (!tipc_link_is_up(tnl))
 		return;
 
+	/* Don't rush, failure link may be in the process of resetting */
+	if (l && !tipc_link_is_reset(l))
+		return;
+
 	tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT);
 	tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT);
 
@@ -1706,7 +1710,7 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb,
 	/* Initiate or update failover mode if applicable */
 	if ((usr == TUNNEL_PROTOCOL) && (mtyp == FAILOVER_MSG)) {
 		syncpt = oseqno + exp_pkts - 1;
-		if (pl && tipc_link_is_up(pl)) {
+		if (pl && !tipc_link_is_reset(pl)) {
 			__tipc_node_link_down(n, &pb_id, xmitq, &maddr);
 			trace_tipc_node_link_down(n, true,
 						  "node link down <- failover!");

From d0bae4a0e3d8c5690a885204d7eb2341a5b4884d Mon Sep 17 00:00:00 2001
From: JingYi Hou <houjingyi647@gmail.com>
Date: Mon, 17 Jun 2019 14:56:05 +0800
Subject: [PATCH 06/24] net: remove duplicate fetch in sock_getsockopt

In sock_getsockopt(), 'optlen' is fetched the first time from userspace.
'len < 0' is then checked. Then in condition 'SO_MEMINFO', 'optlen' is
fetched the second time from userspace.

If change it between two fetches may cause security problems or unexpected
behaivor, and there is no reason to fetch it a second time.

To fix this, we need to remove the second fetch.

Signed-off-by: JingYi Hou <houjingyi647@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index af09a23e4822..aa4a00d381e3 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1477,9 +1477,6 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 	{
 		u32 meminfo[SK_MEMINFO_VARS];
 
-		if (get_user(len, optlen))
-			return -EFAULT;
-
 		sk_get_meminfo(sk, meminfo);
 
 		len = min_t(unsigned int, len, sizeof(meminfo));

From 72b319dc08b4924a29f5e2560ef6d966fa54c429 Mon Sep 17 00:00:00 2001
From: Fei Li <lifei.shirley@bytedance.com>
Date: Mon, 17 Jun 2019 21:26:36 +0800
Subject: [PATCH 07/24] tun: wake up waitqueues after IFF_UP is set

Currently after setting tap0 link up, the tun code wakes tx/rx waited
queues up in tun_net_open() when .ndo_open() is called, however the
IFF_UP flag has not been set yet. If there's already a wait queue, it
would fail to transmit when checking the IFF_UP flag in tun_sendmsg().
Then the saving vhost_poll_start() will add the wq into wqh until it
is waken up again. Although this works when IFF_UP flag has been set
when tun_chr_poll detects; this is not true if IFF_UP flag has not
been set at that time. Sadly the latter case is a fatal error, as
the wq will never be waken up in future unless later manually
setting link up on purpose.

Fix this by moving the wakeup process into the NETDEV_UP event
notifying process, this makes sure IFF_UP has been set before all
waited queues been waken up.

Signed-off-by: Fei Li <lifei.shirley@bytedance.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index c452d6d831dd..d7c55e0fa8f4 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1014,18 +1014,8 @@ static void tun_net_uninit(struct net_device *dev)
 /* Net device open. */
 static int tun_net_open(struct net_device *dev)
 {
-	struct tun_struct *tun = netdev_priv(dev);
-	int i;
-
 	netif_tx_start_all_queues(dev);
 
-	for (i = 0; i < tun->numqueues; i++) {
-		struct tun_file *tfile;
-
-		tfile = rtnl_dereference(tun->tfiles[i]);
-		tfile->socket.sk->sk_write_space(tfile->socket.sk);
-	}
-
 	return 0;
 }
 
@@ -3634,6 +3624,7 @@ static int tun_device_event(struct notifier_block *unused,
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 	struct tun_struct *tun = netdev_priv(dev);
+	int i;
 
 	if (dev->rtnl_link_ops != &tun_link_ops)
 		return NOTIFY_DONE;
@@ -3643,6 +3634,14 @@ static int tun_device_event(struct notifier_block *unused,
 		if (tun_queue_resize(tun))
 			return NOTIFY_BAD;
 		break;
+	case NETDEV_UP:
+		for (i = 0; i < tun->numqueues; i++) {
+			struct tun_file *tfile;
+
+			tfile = rtnl_dereference(tun->tfiles[i]);
+			tfile->socket.sk->sk_write_space(tfile->socket.sk);
+		}
+		break;
 	default:
 		break;
 	}

From 5684abf7020dfc5f0b6ba1d68eda3663871fce52 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 17 Jun 2019 21:34:13 +0800
Subject: [PATCH 08/24] ip_tunnel: allow not to count pkts on tstats by setting
 skb's dev to NULL

iptunnel_xmit() works as a common function, also used by a udp tunnel
which doesn't have to have a tunnel device, like how TIPC works with
udp media.

In these cases, we should allow not to count pkts on dev's tstats, so
that udp tunnel can work with no tunnel device safely.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_tunnel_core.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 9e3846388fb3..1452a97914a0 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -76,9 +76,12 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
 	__ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1);
 
 	err = ip_local_out(net, sk, skb);
-	if (unlikely(net_xmit_eval(err)))
-		pkt_len = 0;
-	iptunnel_xmit_stats(dev, pkt_len);
+
+	if (dev) {
+		if (unlikely(net_xmit_eval(err)))
+			pkt_len = 0;
+		iptunnel_xmit_stats(dev, pkt_len);
+	}
 }
 EXPORT_SYMBOL_GPL(iptunnel_xmit);
 

From 6f6a8622057c92408930c31698394fae1557b188 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 17 Jun 2019 21:34:14 +0800
Subject: [PATCH 09/24] ip6_tunnel: allow not to count pkts on tstats by
 passing dev as NULL

A similar fix to Patch "ip_tunnel: allow not to count pkts on tstats by
setting skb's dev to NULL" is also needed by ip6_tunnel.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_tunnel.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
index 69b4bcf880c9..028eaea1c854 100644
--- a/include/net/ip6_tunnel.h
+++ b/include/net/ip6_tunnel.h
@@ -158,9 +158,12 @@ static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb,
 	memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
 	pkt_len = skb->len - skb_inner_network_offset(skb);
 	err = ip6_local_out(dev_net(skb_dst(skb)->dev), sk, skb);
-	if (unlikely(net_xmit_eval(err)))
-		pkt_len = -1;
-	iptunnel_xmit_stats(dev, pkt_len);
+
+	if (dev) {
+		if (unlikely(net_xmit_eval(err)))
+			pkt_len = -1;
+		iptunnel_xmit_stats(dev, pkt_len);
+	}
 }
 #endif
 #endif

From c3bcde026684c62d7a2b6f626dc7cf763833875c Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 17 Jun 2019 21:34:15 +0800
Subject: [PATCH 10/24] tipc: pass tunnel dev as NULL to udp_tunnel(6)_xmit_skb

udp_tunnel(6)_xmit_skb() called by tipc_udp_xmit() expects a tunnel device
to count packets on dev->tstats, a perpcu variable. However, TIPC is using
udp tunnel with no tunnel device, and pass the lower dev, like veth device
that only initializes dev->lstats(a perpcu variable) when creating it.

Later iptunnel_xmit_stats() called by ip(6)tunnel_xmit() thinks the dev as
a tunnel device, and uses dev->tstats instead of dev->lstats. tstats' each
pointer points to a bigger struct than lstats, so when tstats->tx_bytes is
increased, other percpu variable's members could be overwritten.

syzbot has reported quite a few crashes due to fib_nh_common percpu member
'nhc_pcpu_rth_output' overwritten, call traces are like:

  BUG: KASAN: slab-out-of-bounds in rt_cache_valid+0x158/0x190
  net/ipv4/route.c:1556
    rt_cache_valid+0x158/0x190 net/ipv4/route.c:1556
    __mkroute_output net/ipv4/route.c:2332 [inline]
    ip_route_output_key_hash_rcu+0x819/0x2d50 net/ipv4/route.c:2564
    ip_route_output_key_hash+0x1ef/0x360 net/ipv4/route.c:2393
    __ip_route_output_key include/net/route.h:125 [inline]
    ip_route_output_flow+0x28/0xc0 net/ipv4/route.c:2651
    ip_route_output_key include/net/route.h:135 [inline]
  ...

or:

  kasan: GPF could be caused by NULL-ptr deref or user memory access
  RIP: 0010:dst_dev_put+0x24/0x290 net/core/dst.c:168
    <IRQ>
    rt_fibinfo_free_cpus net/ipv4/fib_semantics.c:200 [inline]
    free_fib_info_rcu+0x2e1/0x490 net/ipv4/fib_semantics.c:217
    __rcu_reclaim kernel/rcu/rcu.h:240 [inline]
    rcu_do_batch kernel/rcu/tree.c:2437 [inline]
    invoke_rcu_callbacks kernel/rcu/tree.c:2716 [inline]
    rcu_process_callbacks+0x100a/0x1ac0 kernel/rcu/tree.c:2697
  ...

The issue exists since tunnel stats update is moved to iptunnel_xmit by
Commit 039f50629b7f ("ip_tunnel: Move stats update to iptunnel_xmit()"),
and here to fix it by passing a NULL tunnel dev to udp_tunnel(6)_xmit_skb
so that the packets counting won't happen on dev->tstats.

Reported-by: syzbot+9d4c12bfd45a58738d0a@syzkaller.appspotmail.com
Reported-by: syzbot+a9e23ea2aa21044c2798@syzkaller.appspotmail.com
Reported-by: syzbot+c4c4b2bb358bb936ad7e@syzkaller.appspotmail.com
Reported-by: syzbot+0290d2290a607e035ba1@syzkaller.appspotmail.com
Reported-by: syzbot+a43d8d4e7e8a7a9e149e@syzkaller.appspotmail.com
Reported-by: syzbot+a47c5f4c6c00fc1ed16e@syzkaller.appspotmail.com
Fixes: 039f50629b7f ("ip_tunnel: Move stats update to iptunnel_xmit()")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/udp_media.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index 7fc02d84c4f1..1405ccc9101c 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -176,7 +176,6 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb,
 			goto tx_error;
 		}
 
-		skb->dev = rt->dst.dev;
 		ttl = ip4_dst_hoplimit(&rt->dst);
 		udp_tunnel_xmit_skb(rt, ub->ubsock->sk, skb, src->ipv4.s_addr,
 				    dst->ipv4.s_addr, 0, ttl, 0, src->port,
@@ -195,10 +194,9 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb,
 		if (err)
 			goto tx_error;
 		ttl = ip6_dst_hoplimit(ndst);
-		err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb,
-					   ndst->dev, &src->ipv6,
-					   &dst->ipv6, 0, ttl, 0, src->port,
-					   dst->port, false);
+		err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, NULL,
+					   &src->ipv6, &dst->ipv6, 0, ttl, 0,
+					   src->port, dst->port, false);
 #endif
 	}
 	return err;

From 9476274093a0e79b905f4cd6cf6d149f65e02c17 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Mon, 17 Jun 2019 17:12:49 +0100
Subject: [PATCH 11/24] net: lio_core: fix potential sign-extension overflow on
 large shift

Left shifting the signed int value 1 by 31 bits has undefined behaviour
and the shift amount oq_no can be as much as 63.  Fix this by using
BIT_ULL(oq_no) instead.

Addresses-Coverity: ("Bad shift operation")
Fixes: f21fb3ed364b ("Add support of Cavium Liquidio ethernet adapters")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Reviewed-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/cavium/liquidio/lio_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_core.c b/drivers/net/ethernet/cavium/liquidio/lio_core.c
index 1c50c10b5a16..d7e805749a5b 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_core.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_core.c
@@ -964,7 +964,7 @@ static void liquidio_schedule_droq_pkt_handlers(struct octeon_device *oct)
 
 			if (droq->ops.poll_mode) {
 				droq->ops.napi_fn(droq);
-				oct_priv->napi_mask |= (1 << oq_no);
+				oct_priv->napi_mask |= BIT_ULL(oq_no);
 			} else {
 				tasklet_schedule(&oct_priv->droq_tasklet);
 			}

From 177b8007463c4f36c9a2c7ce7aa9875a4cad9bd5 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 17 Jun 2019 11:11:10 -0700
Subject: [PATCH 12/24] net: netem: fix backlog accounting for corrupted GSO
 frames

When GSO frame has to be corrupted netem uses skb_gso_segment()
to produce the list of frames, and re-enqueues the segments one
by one.  The backlog length has to be adjusted to account for
new frames.

The current calculation is incorrect, leading to wrong backlog
lengths in the parent qdisc (both bytes and packets), and
incorrect packet backlog count in netem itself.

Parent backlog goes negative, netem's packet backlog counts
all non-first segments twice (thus remaining non-zero even
after qdisc is emptied).

Move the variables used to count the adjustment into local
scope to make 100% sure they aren't used at any stage in
backports.

Fixes: 6071bd1aa13e ("netem: Segment GSO packets on enqueue")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_netem.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 956ff3da81f4..3b3e2d772c3b 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -439,8 +439,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	struct netem_skb_cb *cb;
 	struct sk_buff *skb2;
 	struct sk_buff *segs = NULL;
-	unsigned int len = 0, last_len, prev_len = qdisc_pkt_len(skb);
-	int nb = 0;
+	unsigned int prev_len = qdisc_pkt_len(skb);
 	int count = 1;
 	int rc = NET_XMIT_SUCCESS;
 	int rc_drop = NET_XMIT_DROP;
@@ -497,6 +496,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 			segs = netem_segment(skb, sch, to_free);
 			if (!segs)
 				return rc_drop;
+			qdisc_skb_cb(segs)->pkt_len = segs->len;
 		} else {
 			segs = skb;
 		}
@@ -593,6 +593,11 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 
 finish_segs:
 	if (segs) {
+		unsigned int len, last_len;
+		int nb = 0;
+
+		len = skb->len;
+
 		while (segs) {
 			skb2 = segs->next;
 			skb_mark_not_on_list(segs);
@@ -608,9 +613,7 @@ finish_segs:
 			}
 			segs = skb2;
 		}
-		sch->q.qlen += nb;
-		if (nb > 1)
-			qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
+		qdisc_tree_reduce_backlog(sch, -nb, prev_len - len);
 	}
 	return NET_XMIT_SUCCESS;
 }

From 3e14c383de349a86895b8c6c410b9222646574f6 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 17 Jun 2019 11:11:11 -0700
Subject: [PATCH 13/24] net: netem: fix use after free and double free with
 packet corruption

Brendan reports that the use of netem's packet corruption capability
leads to strange crashes.  This seems to be caused by
commit d66280b12bd7 ("net: netem: use a list in addition to rbtree")
which uses skb->next pointer to construct a fast-path queue of
in-order skbs.

Packet corruption code has to invoke skb_gso_segment() in case
of skbs in need of GSO.  skb_gso_segment() returns a list of
skbs.  If next pointers of the skbs on that list do not get cleared
fast path list may point to freed skbs or skbs which are also on
the RB tree.

Let's say skb gets segmented into 3 frames:

A -> B -> C

A gets hooked to the t_head t_tail list by tfifo_enqueue(), but it's
next pointer didn't get cleared so we have:

h t
|/
A -> B -> C

Now if B and C get also get enqueued successfully all is fine, because
tfifo_enqueue() will overwrite the list in order.  IOW:

Enqueue B:

h    t
|    |
A -> B    C

Enqueue C:

h         t
|         |
A -> B -> C

But if B and C get reordered we may end up with:

h t            RB tree
|/                |
A -> B -> C       B
                   \
                    C

Or if they get dropped just:

h t
|/
A -> B -> C

where A and B are already freed.

To reproduce either limit has to be set low to cause freeing of
segs or reorders have to happen (due to delay jitter).

Note that we only have to mark the first segment as not on the
list, "finish_segs" handling of other frags already does that.

Another caveat is that qdisc_drop_all() still has to free all
segments correctly in case of drop of first segment, therefore
we re-link segs before calling it.

v2:
 - re-link before drop, v1 was leaking non-first segs if limit
   was hit at the first seg
 - better commit message which lead to discovering the above :)

Reported-by: Brendan Galloway <brendan.galloway@netronome.com>
Fixes: d66280b12bd7 ("net: netem: use a list in addition to rbtree")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_netem.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 3b3e2d772c3b..b17f2ed970e2 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -493,17 +493,14 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	 */
 	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
 		if (skb_is_gso(skb)) {
-			segs = netem_segment(skb, sch, to_free);
-			if (!segs)
+			skb = netem_segment(skb, sch, to_free);
+			if (!skb)
 				return rc_drop;
-			qdisc_skb_cb(segs)->pkt_len = segs->len;
-		} else {
-			segs = skb;
+			segs = skb->next;
+			skb_mark_not_on_list(skb);
+			qdisc_skb_cb(skb)->pkt_len = skb->len;
 		}
 
-		skb = segs;
-		segs = segs->next;
-
 		skb = skb_unshare(skb, GFP_ATOMIC);
 		if (unlikely(!skb)) {
 			qdisc_qstats_drop(sch);
@@ -520,6 +517,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	}
 
 	if (unlikely(sch->q.qlen >= sch->limit)) {
+		/* re-link segs, so that qdisc_drop_all() frees them all */
+		skb->next = segs;
 		qdisc_drop_all(skb, sch, to_free);
 		return rc_drop;
 	}

From 76e21533a48bb42d1fa894f93f6233bf4554f45e Mon Sep 17 00:00:00 2001
From: Fred Klassen <fklassen@appneta.com>
Date: Mon, 17 Jun 2019 12:05:07 -0700
Subject: [PATCH 14/24] net/udp_gso: Allow TX timestamp with UDP GSO

Fixes an issue where TX Timestamps are not arriving on the error queue
when UDP_SEGMENT CMSG type is combined with CMSG type SO_TIMESTAMPING.
This can be illustrated with an updated updgso_bench_tx program which
includes the '-T' option to test for this condition. It also introduces
the '-P' option which will call poll() before reading the error queue.

    ./udpgso_bench_tx -4ucTPv -S 1472 -l2 -D 172.16.120.18
    poll timeout
    udp tx:      0 MB/s        1 calls/s      1 msg/s

The "poll timeout" message above indicates that TX timestamp never
arrived.

This patch preserves tx_flags for the first UDP GSO segment. Only the
first segment is timestamped, even though in some cases there may be
benefital in timestamping both the first and last segment.

Factors in deciding on first segment timestamp only:

- Timestamping both first and last segmented is not feasible. Hardware
can only have one outstanding TS request at a time.

- Timestamping last segment may under report network latency of the
previous segments. Even though the doorbell is suppressed, the ring
producer counter has been incremented.

- Timestamping the first segment has the upside in that it reports
timestamps from the application's view, e.g. RTT.

- Timestamping the first segment has the downside that it may
underreport tx host network latency. It appears that we have to pick
one or the other. And possibly follow-up with a config flag to choose
behavior.

v2: Remove tests as noted by Willem de Bruijn <willemb@google.com>
    Moving tests from net to net-next

v3: Update only relevant tx_flag bits as per
    Willem de Bruijn <willemb@google.com>

v4: Update comments and commit message as per
    Willem de Bruijn <willemb@google.com>

Fixes: ee80d1ebe5ba ("udp: add udp gso")
Signed-off-by: Fred Klassen <fklassen@appneta.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp_offload.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 06b3e2c1fcdc..9763464a75d7 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -224,6 +224,11 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 	seg = segs;
 	uh = udp_hdr(seg);
 
+	/* preserve TX timestamp flags and TS key for first segment */
+	skb_shinfo(seg)->tskey = skb_shinfo(gso_skb)->tskey;
+	skb_shinfo(seg)->tx_flags |=
+			(skb_shinfo(gso_skb)->tx_flags & SKBTX_ANY_TSTAMP);
+
 	/* compute checksum adjustment based on old length versus new */
 	newlen = htons(sizeof(*uh) + mss);
 	check = csum16_add(csum16_sub(uh->check, uh->len), newlen);

From cb359b60416701c8bed82fec79de25a144beb893 Mon Sep 17 00:00:00 2001
From: Sunil Muthuswamy <sunilmut@microsoft.com>
Date: Mon, 17 Jun 2019 19:26:25 +0000
Subject: [PATCH 15/24] hvsock: fix epollout hang from race condition

Currently, hvsock can enter into a state where epoll_wait on EPOLLOUT will
not return even when the hvsock socket is writable, under some race
condition. This can happen under the following sequence:
- fd = socket(hvsocket)
- fd_out = dup(fd)
- fd_in = dup(fd)
- start a writer thread that writes data to fd_out with a combination of
  epoll_wait(fd_out, EPOLLOUT) and
- start a reader thread that reads data from fd_in with a combination of
  epoll_wait(fd_in, EPOLLIN)
- On the host, there are two threads that are reading/writing data to the
  hvsocket

stack:
hvs_stream_has_space
hvs_notify_poll_out
vsock_poll
sock_poll
ep_poll

Race condition:
check for epollout from ep_poll():
	assume no writable space in the socket
	hvs_stream_has_space() returns 0
check for epollin from ep_poll():
	assume socket has some free space < HVS_PKT_LEN(HVS_SEND_BUF_SIZE)
	hvs_stream_has_space() will clear the channel pending send size
	host will not notify the guest because the pending send size has
		been cleared and so the hvsocket will never mark the
		socket writable

Now, the EPOLLOUT will never return even if the socket write buffer is
empty.

The fix is to set the pending size to the default size and never change it.
This way the host will always notify the guest whenever the writable space
is bigger than the pending size. The host is already optimized to *only*
notify the guest when the pending size threshold boundary is crossed and
not everytime.

This change also reduces the cpu usage somewhat since hv_stream_has_space()
is in the hotpath of send:
vsock_stream_sendmsg()->hv_stream_has_space()
Earlier hv_stream_has_space was setting/clearing the pending size on every
call.

Signed-off-by: Sunil Muthuswamy <sunilmut@microsoft.com>
Reviewed-by: Dexuan Cui <decui@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/vmw_vsock/hyperv_transport.c | 39 +++++++-------------------------
 1 file changed, 8 insertions(+), 31 deletions(-)

diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index fb2df6e068fa..62dcdf082349 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -211,18 +211,6 @@ static void hvs_set_channel_pending_send_size(struct vmbus_channel *chan)
 	set_channel_pending_send_size(chan,
 				      HVS_PKT_LEN(HVS_SEND_BUF_SIZE));
 
-	/* See hvs_stream_has_space(): we must make sure the host has seen
-	 * the new pending send size, before we can re-check the writable
-	 * bytes.
-	 */
-	virt_mb();
-}
-
-static void hvs_clear_channel_pending_send_size(struct vmbus_channel *chan)
-{
-	set_channel_pending_send_size(chan, 0);
-
-	/* Ditto */
 	virt_mb();
 }
 
@@ -292,9 +280,6 @@ static void hvs_channel_cb(void *ctx)
 	if (hvs_channel_readable(chan))
 		sk->sk_data_ready(sk);
 
-	/* See hvs_stream_has_space(): when we reach here, the writable bytes
-	 * may be already less than HVS_PKT_LEN(HVS_SEND_BUF_SIZE).
-	 */
 	if (hv_get_bytes_to_write(&chan->outbound) > 0)
 		sk->sk_write_space(sk);
 }
@@ -395,6 +380,13 @@ static void hvs_open_connection(struct vmbus_channel *chan)
 	set_per_channel_state(chan, conn_from_host ? new : sk);
 	vmbus_set_chn_rescind_callback(chan, hvs_close_connection);
 
+	/* Set the pending send size to max packet size to always get
+	 * notifications from the host when there is enough writable space.
+	 * The host is optimized to send notifications only when the pending
+	 * size boundary is crossed, and not always.
+	 */
+	hvs_set_channel_pending_send_size(chan);
+
 	if (conn_from_host) {
 		new->sk_state = TCP_ESTABLISHED;
 		sk->sk_ack_backlog++;
@@ -688,23 +680,8 @@ static s64 hvs_stream_has_data(struct vsock_sock *vsk)
 static s64 hvs_stream_has_space(struct vsock_sock *vsk)
 {
 	struct hvsock *hvs = vsk->trans;
-	struct vmbus_channel *chan = hvs->chan;
-	s64 ret;
 
-	ret = hvs_channel_writable_bytes(chan);
-	if (ret > 0)  {
-		hvs_clear_channel_pending_send_size(chan);
-	} else {
-		/* See hvs_channel_cb() */
-		hvs_set_channel_pending_send_size(chan);
-
-		/* Re-check the writable bytes to avoid race */
-		ret = hvs_channel_writable_bytes(chan);
-		if (ret > 0)
-			hvs_clear_channel_pending_send_size(chan);
-	}
-
-	return ret;
+	return hvs_channel_writable_bytes(hvs->chan);
 }
 
 static u64 hvs_stream_rcvhiwat(struct vsock_sock *vsk)

From 48620e341659f6e4b978ec229f6944dabe6df709 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <rasmus.villemoes@prevas.dk>
Date: Wed, 19 Jun 2019 10:02:13 +0000
Subject: [PATCH 16/24] net: dsa: mv88e6xxx: fix shift of FID bits in
 mv88e6185_g1_vtu_loadpurge()

The comment is correct, but the code ends up moving the bits four
places too far, into the VTUOp field.

Fixes: 11ea809f1a74 (net: dsa: mv88e6xxx: support 256 databases)
Signed-off-by: Rasmus Villemoes <rasmus.villemoes@prevas.dk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/global1_vtu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/dsa/mv88e6xxx/global1_vtu.c b/drivers/net/dsa/mv88e6xxx/global1_vtu.c
index 3e9be3f51196..ecef69045a42 100644
--- a/drivers/net/dsa/mv88e6xxx/global1_vtu.c
+++ b/drivers/net/dsa/mv88e6xxx/global1_vtu.c
@@ -415,7 +415,7 @@ int mv88e6185_g1_vtu_loadpurge(struct mv88e6xxx_chip *chip,
 		 * VTU DBNum[7:4] are located in VTU Operation 11:8
 		 */
 		op |= entry->fid & 0x000f;
-		op |= (entry->fid & 0x00f0) << 8;
+		op |= (entry->fid & 0x00f0) << 4;
 	}
 
 	return mv88e6xxx_g1_vtu_op(chip, op);

From fdbf6326912d578a31ac4ca0933c919eadf1d54c Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi@linux.ibm.com>
Date: Tue, 18 Jun 2019 20:42:59 +0200
Subject: [PATCH 17/24] net/af_iucv: remove GFP_DMA restriction for
 HiperTransport

af_iucv sockets over z/VM IUCV require that their skbs are allocated
in DMA memory. This restriction doesn't apply to connections over
HiperSockets. So only set this limit for z/VM IUCV sockets, thereby
increasing the likelihood that the large (and linear!) allocations for
HiperTransport messages succeed.

Fixes: 3881ac441f64 ("af_iucv: add HiperSockets transport")
Signed-off-by: Julian Wiedmann <jwi@linux.ibm.com>
Reviewed-by: Ursula Braun <ubraun@linux.ibm.com>
Reviewed-by: Hendrik Brueckner <brueckner@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/iucv/af_iucv.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 36eb8d1d9128..2ee68d2d4693 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -588,7 +588,6 @@ static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio,
 
 	sk->sk_destruct = iucv_sock_destruct;
 	sk->sk_sndtimeo = IUCV_CONN_TIMEOUT;
-	sk->sk_allocation = GFP_DMA;
 
 	sock_reset_flag(sk, SOCK_ZAPPED);
 
@@ -782,6 +781,7 @@ vm_bind:
 		memcpy(iucv->src_user_id, iucv_userid, 8);
 		sk->sk_state = IUCV_BOUND;
 		iucv->transport = AF_IUCV_TRANS_IUCV;
+		sk->sk_allocation |= GFP_DMA;
 		if (!iucv->msglimit)
 			iucv->msglimit = IUCV_QUEUELEN_DEFAULT;
 		goto done_unlock;
@@ -806,6 +806,8 @@ static int iucv_sock_autobind(struct sock *sk)
 		return -EPROTO;
 
 	memcpy(iucv->src_user_id, iucv_userid, 8);
+	iucv->transport = AF_IUCV_TRANS_IUCV;
+	sk->sk_allocation |= GFP_DMA;
 
 	write_lock_bh(&iucv_sk_list.lock);
 	__iucv_auto_name(iucv);
@@ -1781,6 +1783,8 @@ static int iucv_callback_connreq(struct iucv_path *path,
 
 	niucv = iucv_sk(nsk);
 	iucv_sock_init(nsk, sk);
+	niucv->transport = AF_IUCV_TRANS_IUCV;
+	nsk->sk_allocation |= GFP_DMA;
 
 	/* Set the new iucv_sock */
 	memcpy(niucv->dst_name, ipuser + 8, 8);

From 238965b71b968dc5b3c0fe430e946f488322c4b5 Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi@linux.ibm.com>
Date: Tue, 18 Jun 2019 20:43:00 +0200
Subject: [PATCH 18/24] net/af_iucv: build proper skbs for HiperTransport

The HiperSockets-based transport path in af_iucv is still too closely
entangled with qeth.
With commit a647a02512ca ("s390/qeth: speed-up L3 IQD xmit"), the
relevant xmit code in qeth has begun to use skb_cow_head(). So to avoid
unnecessary skb head expansions, af_iucv must learn to
1) respect dev->needed_headroom when allocating skbs, and
2) drop the header reference before cloning the skb.

While at it, also stop hard-coding the LL-header creation stage and just
use the appropriate helper.

Fixes: a647a02512ca ("s390/qeth: speed-up L3 IQD xmit")
Signed-off-by: Julian Wiedmann <jwi@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/iucv/af_iucv.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 2ee68d2d4693..5264a182d2c3 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -14,6 +14,7 @@
 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
 
 #include <linux/module.h>
+#include <linux/netdevice.h>
 #include <linux/types.h>
 #include <linux/list.h>
 #include <linux/errno.h>
@@ -347,14 +348,14 @@ static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock,
 	if (imsg)
 		memcpy(&phs_hdr->iucv_hdr, imsg, sizeof(struct iucv_message));
 
-	skb_push(skb, ETH_HLEN);
-	memset(skb->data, 0, ETH_HLEN);
-
 	skb->dev = iucv->hs_dev;
 	if (!skb->dev) {
 		err = -ENODEV;
 		goto err_free;
 	}
+
+	dev_hard_header(skb, skb->dev, ETH_P_AF_IUCV, NULL, NULL, skb->len);
+
 	if (!(skb->dev->flags & IFF_UP) || !netif_carrier_ok(skb->dev)) {
 		err = -ENETDOWN;
 		goto err_free;
@@ -367,6 +368,8 @@ static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock,
 		skb_trim(skb, skb->dev->mtu);
 	}
 	skb->protocol = cpu_to_be16(ETH_P_AF_IUCV);
+
+	__skb_header_release(skb);
 	nskb = skb_clone(skb, GFP_ATOMIC);
 	if (!nskb) {
 		err = -ENOMEM;
@@ -466,12 +469,14 @@ static void iucv_sever_path(struct sock *sk, int with_user_data)
 /* Send controlling flags through an IUCV socket for HIPER transport */
 static int iucv_send_ctrl(struct sock *sk, u8 flags)
 {
+	struct iucv_sock *iucv = iucv_sk(sk);
 	int err = 0;
 	int blen;
 	struct sk_buff *skb;
 	u8 shutdown = 0;
 
-	blen = sizeof(struct af_iucv_trans_hdr) + ETH_HLEN;
+	blen = sizeof(struct af_iucv_trans_hdr) +
+	       LL_RESERVED_SPACE(iucv->hs_dev);
 	if (sk->sk_shutdown & SEND_SHUTDOWN) {
 		/* controlling flags should be sent anyway */
 		shutdown = sk->sk_shutdown;
@@ -1133,7 +1138,8 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg,
 	 * segmented records using the MSG_EOR flag), but
 	 * for SOCK_STREAM we might want to improve it in future */
 	if (iucv->transport == AF_IUCV_TRANS_HIPER) {
-		headroom = sizeof(struct af_iucv_trans_hdr) + ETH_HLEN;
+		headroom = sizeof(struct af_iucv_trans_hdr) +
+			   LL_RESERVED_SPACE(iucv->hs_dev);
 		linear = len;
 	} else {
 		if (len < PAGE_SIZE) {

From 06996c1d4088a0d5f3e7789d7f96b4653cc947cc Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi@linux.ibm.com>
Date: Tue, 18 Jun 2019 20:43:01 +0200
Subject: [PATCH 19/24] net/af_iucv: always register net_device notifier

Even when running as VM guest (ie pr_iucv != NULL), af_iucv can still
open HiperTransport-based connections. For robust operation these
connections require the af_iucv_netdev_notifier, so register it
unconditionally.

Also handle any error that register_netdevice_notifier() returns.

Fixes: 9fbd87d41392 ("af_iucv: handle netdev events")
Signed-off-by: Julian Wiedmann <jwi@linux.ibm.com>
Reviewed-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/iucv/af_iucv.c | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 5264a182d2c3..09e1694b6d34 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -2440,6 +2440,13 @@ out:
 	return err;
 }
 
+static void afiucv_iucv_exit(void)
+{
+	device_unregister(af_iucv_dev);
+	driver_unregister(&af_iucv_driver);
+	pr_iucv->iucv_unregister(&af_iucv_handler, 0);
+}
+
 static int __init afiucv_init(void)
 {
 	int err;
@@ -2473,11 +2480,18 @@ static int __init afiucv_init(void)
 		err = afiucv_iucv_init();
 		if (err)
 			goto out_sock;
-	} else
-		register_netdevice_notifier(&afiucv_netdev_notifier);
+	}
+
+	err = register_netdevice_notifier(&afiucv_netdev_notifier);
+	if (err)
+		goto out_notifier;
+
 	dev_add_pack(&iucv_packet_type);
 	return 0;
 
+out_notifier:
+	if (pr_iucv)
+		afiucv_iucv_exit();
 out_sock:
 	sock_unregister(PF_IUCV);
 out_proto:
@@ -2491,12 +2505,11 @@ out:
 static void __exit afiucv_exit(void)
 {
 	if (pr_iucv) {
-		device_unregister(af_iucv_dev);
-		driver_unregister(&af_iucv_driver);
-		pr_iucv->iucv_unregister(&af_iucv_handler, 0);
+		afiucv_iucv_exit();
 		symbol_put(iucv_if);
-	} else
-		unregister_netdevice_notifier(&afiucv_netdev_notifier);
+	}
+
+	unregister_netdevice_notifier(&afiucv_netdev_notifier);
 	dev_remove_pack(&iucv_packet_type);
 	sock_unregister(PF_IUCV);
 	proto_unregister(&iucv_proto);

From bf6de2315362473e14817ccbdbd00ade6de2756e Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk@kernel.org>
Date: Tue, 18 Jun 2019 20:54:22 +0200
Subject: [PATCH 20/24] net: hns3: Fix inconsistent indenting

Fix wrong indentation of goto return.

Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 196a3d780dcf..f326805543a4 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -3803,7 +3803,7 @@ static int hns3_client_init(struct hnae3_handle *handle)
 	ret = hns3_client_start(handle);
 	if (ret) {
 		dev_err(priv->dev, "hns3_client_start fail! ret=%d\n", ret);
-			goto out_client_start;
+		goto out_client_start;
 	}
 
 	hns3_dcbnl_setup(handle);

From c7036d97acd2527cef145b5ef9ad1a37ed21bbe6 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 19 Jun 2019 10:50:24 -0700
Subject: [PATCH 21/24] ipv6: Default fib6_type to RTN_UNICAST when not set

A user reported that routes are getting installed with type 0 (RTN_UNSPEC)
where before the routes were RTN_UNICAST. One example is from accel-ppp
which apparently still uses the ioctl interface and does not set
rtmsg_type. Another is the netlink interface where ipv6 does not require
rtm_type to be set (v4 does). Prior to the commit in the Fixes tag the
ipv6 stack converted type 0 to RTN_UNICAST, so restore that behavior.

Fixes: e8478e80e5a7 ("net/ipv6: Save route type in rt6_info")
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 0f60eb3a2873..11ad62effd56 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3184,7 +3184,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 
 	rt->fib6_table = table;
 	rt->fib6_metric = cfg->fc_metric;
-	rt->fib6_type = cfg->fc_type;
+	rt->fib6_type = cfg->fc_type ? : RTN_UNICAST;
 	rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;
 
 	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);

From 8110a7a7d295b08433dfa7e692e347a43e63f475 Mon Sep 17 00:00:00 2001
From: Nathan Huckleberry <nhuck@google.com>
Date: Wed, 19 Jun 2019 11:17:15 -0700
Subject: [PATCH 22/24] net: mvpp2: debugfs: Add pmap to fs dump

There was an unused variable 'mvpp2_dbgfs_prs_pmap_fops'
Added a usage consistent with other fops to dump pmap
to userspace.

Cc: clang-built-linux@googlegroups.com
Link: https://github.com/ClangBuiltLinux/linux/issues/529
Signed-off-by: Nathan Huckleberry <nhuck@google.com>
Tested-by: Nick Desaulniers <ndesaulniers@google.com>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/mvpp2/mvpp2_debugfs.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_debugfs.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_debugfs.c
index 0ee39ea47b6b..274fb07362cb 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_debugfs.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_debugfs.c
@@ -566,6 +566,9 @@ static int mvpp2_dbgfs_prs_entry_init(struct dentry *parent,
 	debugfs_create_file("hits", 0444, prs_entry_dir, entry,
 			    &mvpp2_dbgfs_prs_hits_fops);
 
+	debugfs_create_file("pmap", 0444, prs_entry_dir, entry,
+			     &mvpp2_dbgfs_prs_pmap_fops);
+
 	return 0;
 }
 

From 85f9aa7565bd79b039325f2c01af7ffa717924df Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 19 Jun 2019 09:38:38 -0700
Subject: [PATCH 23/24] inet: clear num_timeout reqsk_alloc()

KMSAN caught uninit-value in tcp_create_openreq_child() [1]
This is caused by a recent change, combined by the fact
that TCP cleared num_timeout, num_retrans and sk fields only
when a request socket was about to be queued.

Under syncookie mode, a temporary request socket is used,
and req->num_timeout could contain garbage.

Lets clear these three fields sooner, there is really no
point trying to defer this and risk other bugs.

[1]

BUG: KMSAN: uninit-value in tcp_create_openreq_child+0x157f/0x1cc0 net/ipv4/tcp_minisocks.c:526
CPU: 1 PID: 13357 Comm: syz-executor591 Not tainted 5.2.0-rc4+ #3
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 <IRQ>
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x191/0x1f0 lib/dump_stack.c:113
 kmsan_report+0x162/0x2d0 mm/kmsan/kmsan.c:611
 __msan_warning+0x75/0xe0 mm/kmsan/kmsan_instr.c:304
 tcp_create_openreq_child+0x157f/0x1cc0 net/ipv4/tcp_minisocks.c:526
 tcp_v6_syn_recv_sock+0x761/0x2d80 net/ipv6/tcp_ipv6.c:1152
 tcp_get_cookie_sock+0x16e/0x6b0 net/ipv4/syncookies.c:209
 cookie_v6_check+0x27e0/0x29a0 net/ipv6/syncookies.c:252
 tcp_v6_cookie_check net/ipv6/tcp_ipv6.c:1039 [inline]
 tcp_v6_do_rcv+0xf1c/0x1ce0 net/ipv6/tcp_ipv6.c:1344
 tcp_v6_rcv+0x60b7/0x6a30 net/ipv6/tcp_ipv6.c:1554
 ip6_protocol_deliver_rcu+0x1433/0x22f0 net/ipv6/ip6_input.c:397
 ip6_input_finish net/ipv6/ip6_input.c:438 [inline]
 NF_HOOK include/linux/netfilter.h:305 [inline]
 ip6_input+0x2af/0x340 net/ipv6/ip6_input.c:447
 dst_input include/net/dst.h:439 [inline]
 ip6_rcv_finish net/ipv6/ip6_input.c:76 [inline]
 NF_HOOK include/linux/netfilter.h:305 [inline]
 ipv6_rcv+0x683/0x710 net/ipv6/ip6_input.c:272
 __netif_receive_skb_one_core net/core/dev.c:4981 [inline]
 __netif_receive_skb net/core/dev.c:5095 [inline]
 process_backlog+0x721/0x1410 net/core/dev.c:5906
 napi_poll net/core/dev.c:6329 [inline]
 net_rx_action+0x738/0x1940 net/core/dev.c:6395
 __do_softirq+0x4ad/0x858 kernel/softirq.c:293
 do_softirq_own_stack+0x49/0x80 arch/x86/entry/entry_64.S:1052
 </IRQ>
 do_softirq kernel/softirq.c:338 [inline]
 __local_bh_enable_ip+0x199/0x1e0 kernel/softirq.c:190
 local_bh_enable+0x36/0x40 include/linux/bottom_half.h:32
 rcu_read_unlock_bh include/linux/rcupdate.h:682 [inline]
 ip6_finish_output2+0x213f/0x2670 net/ipv6/ip6_output.c:117
 ip6_finish_output+0xae4/0xbc0 net/ipv6/ip6_output.c:150
 NF_HOOK_COND include/linux/netfilter.h:294 [inline]
 ip6_output+0x5d3/0x720 net/ipv6/ip6_output.c:167
 dst_output include/net/dst.h:433 [inline]
 NF_HOOK include/linux/netfilter.h:305 [inline]
 ip6_xmit+0x1f53/0x2650 net/ipv6/ip6_output.c:271
 inet6_csk_xmit+0x3df/0x4f0 net/ipv6/inet6_connection_sock.c:135
 __tcp_transmit_skb+0x4076/0x5b40 net/ipv4/tcp_output.c:1156
 tcp_transmit_skb net/ipv4/tcp_output.c:1172 [inline]
 tcp_write_xmit+0x39a9/0xa730 net/ipv4/tcp_output.c:2397
 __tcp_push_pending_frames+0x124/0x4e0 net/ipv4/tcp_output.c:2573
 tcp_send_fin+0xd43/0x1540 net/ipv4/tcp_output.c:3118
 tcp_close+0x16ba/0x1860 net/ipv4/tcp.c:2403
 inet_release+0x1f7/0x270 net/ipv4/af_inet.c:427
 inet6_release+0xaf/0x100 net/ipv6/af_inet6.c:470
 __sock_release net/socket.c:601 [inline]
 sock_close+0x156/0x490 net/socket.c:1273
 __fput+0x4c9/0xba0 fs/file_table.c:280
 ____fput+0x37/0x40 fs/file_table.c:313
 task_work_run+0x22e/0x2a0 kernel/task_work.c:113
 tracehook_notify_resume include/linux/tracehook.h:185 [inline]
 exit_to_usermode_loop arch/x86/entry/common.c:168 [inline]
 prepare_exit_to_usermode+0x39d/0x4d0 arch/x86/entry/common.c:199
 syscall_return_slowpath+0x90/0x5c0 arch/x86/entry/common.c:279
 do_syscall_64+0xe2/0xf0 arch/x86/entry/common.c:305
 entry_SYSCALL_64_after_hwframe+0x63/0xe7
RIP: 0033:0x401d50
Code: 01 f0 ff ff 0f 83 40 0d 00 00 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d dd 8d 2d 00 00 75 14 b8 03 00 00 00 0f 05 <48> 3d 01 f0 ff ff 0f 83 14 0d 00 00 c3 48 83 ec 08 e8 7a 02 00 00
RSP: 002b:00007fff1cf58cf8 EFLAGS: 00000246 ORIG_RAX: 0000000000000003
RAX: 0000000000000000 RBX: 0000000000000004 RCX: 0000000000401d50
RDX: 000000000000001c RSI: 0000000000000000 RDI: 0000000000000003
RBP: 00000000004a9050 R08: 0000000020000040 R09: 000000000000001c
R10: 0000000020004004 R11: 0000000000000246 R12: 0000000000402ef0
R13: 0000000000402f80 R14: 0000000000000000 R15: 0000000000000000

Uninit was created at:
 kmsan_save_stack_with_flags mm/kmsan/kmsan.c:201 [inline]
 kmsan_internal_poison_shadow+0x53/0xa0 mm/kmsan/kmsan.c:160
 kmsan_kmalloc+0xa4/0x130 mm/kmsan/kmsan_hooks.c:177
 kmem_cache_alloc+0x534/0xb00 mm/slub.c:2781
 reqsk_alloc include/net/request_sock.h:84 [inline]
 inet_reqsk_alloc+0xa8/0x600 net/ipv4/tcp_input.c:6384
 cookie_v6_check+0xadb/0x29a0 net/ipv6/syncookies.c:173
 tcp_v6_cookie_check net/ipv6/tcp_ipv6.c:1039 [inline]
 tcp_v6_do_rcv+0xf1c/0x1ce0 net/ipv6/tcp_ipv6.c:1344
 tcp_v6_rcv+0x60b7/0x6a30 net/ipv6/tcp_ipv6.c:1554
 ip6_protocol_deliver_rcu+0x1433/0x22f0 net/ipv6/ip6_input.c:397
 ip6_input_finish net/ipv6/ip6_input.c:438 [inline]
 NF_HOOK include/linux/netfilter.h:305 [inline]
 ip6_input+0x2af/0x340 net/ipv6/ip6_input.c:447
 dst_input include/net/dst.h:439 [inline]
 ip6_rcv_finish net/ipv6/ip6_input.c:76 [inline]
 NF_HOOK include/linux/netfilter.h:305 [inline]
 ipv6_rcv+0x683/0x710 net/ipv6/ip6_input.c:272
 __netif_receive_skb_one_core net/core/dev.c:4981 [inline]
 __netif_receive_skb net/core/dev.c:5095 [inline]
 process_backlog+0x721/0x1410 net/core/dev.c:5906
 napi_poll net/core/dev.c:6329 [inline]
 net_rx_action+0x738/0x1940 net/core/dev.c:6395
 __do_softirq+0x4ad/0x858 kernel/softirq.c:293
 do_softirq_own_stack+0x49/0x80 arch/x86/entry/entry_64.S:1052
 do_softirq kernel/softirq.c:338 [inline]
 __local_bh_enable_ip+0x199/0x1e0 kernel/softirq.c:190
 local_bh_enable+0x36/0x40 include/linux/bottom_half.h:32
 rcu_read_unlock_bh include/linux/rcupdate.h:682 [inline]
 ip6_finish_output2+0x213f/0x2670 net/ipv6/ip6_output.c:117
 ip6_finish_output+0xae4/0xbc0 net/ipv6/ip6_output.c:150
 NF_HOOK_COND include/linux/netfilter.h:294 [inline]
 ip6_output+0x5d3/0x720 net/ipv6/ip6_output.c:167
 dst_output include/net/dst.h:433 [inline]
 NF_HOOK include/linux/netfilter.h:305 [inline]
 ip6_xmit+0x1f53/0x2650 net/ipv6/ip6_output.c:271
 inet6_csk_xmit+0x3df/0x4f0 net/ipv6/inet6_connection_sock.c:135
 __tcp_transmit_skb+0x4076/0x5b40 net/ipv4/tcp_output.c:1156
 tcp_transmit_skb net/ipv4/tcp_output.c:1172 [inline]
 tcp_write_xmit+0x39a9/0xa730 net/ipv4/tcp_output.c:2397
 __tcp_push_pending_frames+0x124/0x4e0 net/ipv4/tcp_output.c:2573
 tcp_send_fin+0xd43/0x1540 net/ipv4/tcp_output.c:3118
 tcp_close+0x16ba/0x1860 net/ipv4/tcp.c:2403
 inet_release+0x1f7/0x270 net/ipv4/af_inet.c:427
 inet6_release+0xaf/0x100 net/ipv6/af_inet6.c:470
 __sock_release net/socket.c:601 [inline]
 sock_close+0x156/0x490 net/socket.c:1273
 __fput+0x4c9/0xba0 fs/file_table.c:280
 ____fput+0x37/0x40 fs/file_table.c:313
 task_work_run+0x22e/0x2a0 kernel/task_work.c:113
 tracehook_notify_resume include/linux/tracehook.h:185 [inline]
 exit_to_usermode_loop arch/x86/entry/common.c:168 [inline]
 prepare_exit_to_usermode+0x39d/0x4d0 arch/x86/entry/common.c:199
 syscall_return_slowpath+0x90/0x5c0 arch/x86/entry/common.c:279
 do_syscall_64+0xe2/0xf0 arch/x86/entry/common.c:305
 entry_SYSCALL_64_after_hwframe+0x63/0xe7

Fixes: 336c39a03151 ("tcp: undo init congestion window on false SYNACK timeout")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/request_sock.h      | 3 +++
 net/ipv4/inet_connection_sock.c | 4 ----
 net/ipv4/tcp_fastopen.c         | 4 ----
 3 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index b3ea21f2732e..fd178d58fa84 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -97,6 +97,9 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener,
 	sk_node_init(&req_to_sk(req)->sk_node);
 	sk_tx_queue_clear(req_to_sk(req));
 	req->saved_syn = NULL;
+	req->num_timeout = 0;
+	req->num_retrans = 0;
+	req->sk = NULL;
 	refcount_set(&req->rsk_refcnt, 0);
 
 	return req;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 13ec7c3a9c49..7fd6db3fe366 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -752,10 +752,6 @@ drop:
 static void reqsk_queue_hash_req(struct request_sock *req,
 				 unsigned long timeout)
 {
-	req->num_retrans = 0;
-	req->num_timeout = 0;
-	req->sk = NULL;
-
 	timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED);
 	mod_timer(&req->rsk_timer, jiffies + timeout);
 
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 018a48477355..f5a45e1e1182 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -221,10 +221,6 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
 	struct sock *child;
 	bool own_req;
 
-	req->num_retrans = 0;
-	req->num_timeout = 0;
-	req->sk = NULL;
-
 	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
 							 NULL, &own_req);
 	if (!child)

From b6653b3629e5b88202be3c9abc44713973f5c4b4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 21 Jun 2019 06:09:55 -0700
Subject: [PATCH 24/24] tcp: refine memory limit test in tcp_fragment()

tcp_fragment() might be called for skbs in the write queue.

Memory limits might have been exceeded because tcp_sendmsg() only
checks limits at full skb (64KB) boundaries.

Therefore, we need to make sure tcp_fragment() wont punish applications
that might have setup very low SO_SNDBUF values.

Fixes: f070ef2ac667 ("tcp: tcp_fragment() should apply sane memory limits")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Christoph Paasch <cpaasch@apple.com>
Tested-by: Christoph Paasch <cpaasch@apple.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 00c01a01b547..0ebc33d1c9e5 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1296,7 +1296,8 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
 	if (nsize < 0)
 		nsize = 0;
 
-	if (unlikely((sk->sk_wmem_queued >> 1) > sk->sk_sndbuf)) {
+	if (unlikely((sk->sk_wmem_queued >> 1) > sk->sk_sndbuf &&
+		     tcp_queue != TCP_FRAG_IN_WRITE_QUEUE)) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG);
 		return -ENOMEM;
 	}