From 5142967ab524eb8e5c1f6122e46e2df81bae178b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 24 May 2019 22:26:34 +0200 Subject: [PATCH 01/24] netfilter: nf_tables: fix module autoload with inet family Use MODULE_ALIAS_NFT_EXPR() to make happy the inet family with nat. Fixes: 63ce3940f3ab ("netfilter: nft_redir: add inet support") Fixes: 071657d2c38c ("netfilter: nft_masq: add inet support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_masq.c | 3 +-- net/netfilter/nft_redir.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index 86fd90085eaf..8c1612d6bc2c 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -307,5 +307,4 @@ module_exit(nft_masq_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Arturo Borrero Gonzalez "); -MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "masq"); -MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "masq"); +MODULE_ALIAS_NFT_EXPR("masq"); diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index da74fdc4a684..8787e9f8ed71 100644 --- a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -294,5 +294,4 @@ module_exit(nft_redir_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Arturo Borrero Gonzalez "); -MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "redir"); -MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "redir"); +MODULE_ALIAS_NFT_EXPR("nat"); From a0d56cb911ca301de81735f1d73c2aab424654ba Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Sun, 2 Jun 2019 15:13:47 +0200 Subject: [PATCH 02/24] netfilter: ipv6: nf_defrag: fix leakage of unqueued fragments With commit 997dd9647164 ("net: IP6 defrag: use rbtrees in nf_conntrack_reasm.c"), nf_ct_frag6_reasm() is now called from nf_ct_frag6_queue(). With this change, nf_ct_frag6_queue() can fail after the skb has been added to the fragment queue and nf_ct_frag6_gather() was adapted to handle this case. But nf_ct_frag6_queue() can still fail before the fragment has been queued. nf_ct_frag6_gather() can't handle this case anymore, because it has no way to know if nf_ct_frag6_queue() queued the fragment before failing. If it didn't, the skb is lost as the error code is overwritten with -EINPROGRESS. Fix this by setting -EINPROGRESS directly in nf_ct_frag6_queue(), so that nf_ct_frag6_gather() can propagate the error as is. Fixes: 997dd9647164 ("net: IP6 defrag: use rbtrees in nf_conntrack_reasm.c") Signed-off-by: Guillaume Nault Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/nf_conntrack_reasm.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 3de0e9b0a482..5b3f65e29b6f 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -293,7 +293,11 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, skb->_skb_refdst = 0UL; err = nf_ct_frag6_reasm(fq, skb, prev, dev); skb->_skb_refdst = orefdst; - return err; + + /* After queue has assumed skb ownership, only 0 or + * -EINPROGRESS must be returned. + */ + return err ? -EINPROGRESS : 0; } skb_dst_drop(skb); @@ -480,12 +484,6 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) ret = 0; } - /* after queue has assumed skb ownership, only 0 or -EINPROGRESS - * must be returned. - */ - if (ret) - ret = -EINPROGRESS; - spin_unlock_bh(&fq->q.lock); inet_frag_put(&fq->q); return ret; From 8a3dca632538c550930ce8bafa8c906b130d35cf Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 6 Jun 2019 18:04:00 +0200 Subject: [PATCH 03/24] netfilter: ipv6: nf_defrag: accept duplicate fragments again When fixing the skb leak introduced by the conversion to rbtree, I forgot about the special case of duplicate fragments. The condition under the 'insert_error' label isn't effective anymore as nf_ct_frg6_gather() doesn't override the returned value anymore. So duplicate fragments now get NF_DROP verdict. To accept duplicate fragments again, handle them specially as soon as inet_frag_queue_insert() reports them. Return -EINPROGRESS which will translate to NF_STOLEN verdict, like any accepted fragment. However, such packets don't carry any new information and aren't queued, so we just drop them immediately. Fixes: a0d56cb911ca ("netfilter: ipv6: nf_defrag: fix leakage of unqueued fragments") Signed-off-by: Guillaume Nault Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/nf_conntrack_reasm.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 5b3f65e29b6f..8951de8b568f 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -265,8 +265,14 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, prev = fq->q.fragments_tail; err = inet_frag_queue_insert(&fq->q, skb, offset, end); - if (err) + if (err) { + if (err == IPFRAG_DUP) { + /* No error for duplicates, pretend they got queued. */ + kfree_skb(skb); + return -EINPROGRESS; + } goto insert_error; + } if (dev) fq->iif = dev->ifindex; @@ -304,8 +310,6 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, return -EINPROGRESS; insert_error: - if (err == IPFRAG_DUP) - goto err; inet_frag_kill(&fq->q); err: skb_dst_drop(skb); From cf18cecca911c0db96b868072665347efe6df46f Mon Sep 17 00:00:00 2001 From: "Mauro S. M. Rodrigues" Date: Thu, 13 Jun 2019 16:25:40 -0300 Subject: [PATCH 04/24] bnx2x: Check if transceiver implements DDM before access Some transceivers may comply with SFF-8472 even though they do not implement the Digital Diagnostic Monitoring (DDM) interface described in the spec. The existence of such area is specified by the 6th bit of byte 92, set to 1 if implemented. Currently, without checking this bit, bnx2x fails trying to read sfp module's EEPROM with the follow message: ethtool -m enP5p1s0f1 Cannot get Module EEPROM data: Input/output error Because it fails to read the additional 256 bytes in which it is assumed to exist the DDM data. This issue was noticed using a Mellanox Passive DAC PN 01FT738. The EEPROM data was confirmed by Mellanox as correct and similar to other Passive DACs from other manufacturers. Signed-off-by: Mauro S. M. Rodrigues Acked-by: Sudarsana Reddy Kalluru Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c | 3 ++- drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c index 0745cccd416d..51fc845de31a 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c @@ -1609,7 +1609,8 @@ static int bnx2x_get_module_info(struct net_device *dev, } if (!sff8472_comp || - (diag_type & SFP_EEPROM_DIAG_ADDR_CHANGE_REQ)) { + (diag_type & SFP_EEPROM_DIAG_ADDR_CHANGE_REQ) || + !(diag_type & SFP_EEPROM_DDM_IMPLEMENTED)) { modinfo->type = ETH_MODULE_SFF_8079; modinfo->eeprom_len = ETH_MODULE_SFF_8079_LEN; } else { diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.h index b7d251108c19..7115f5025664 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.h @@ -62,6 +62,7 @@ #define SFP_EEPROM_DIAG_TYPE_ADDR 0x5c #define SFP_EEPROM_DIAG_TYPE_SIZE 1 #define SFP_EEPROM_DIAG_ADDR_CHANGE_REQ (1<<2) +#define SFP_EEPROM_DDM_IMPLEMENTED (1<<6) #define SFP_EEPROM_SFF_8472_COMP_ADDR 0x5e #define SFP_EEPROM_SFF_8472_COMP_SIZE 1 From d0f84d0856c11fbafadae3d580f6a9c98d818ccd Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Mon, 17 Jun 2019 11:56:12 +0700 Subject: [PATCH 05/24] tipc: fix issues with early FAILOVER_MSG from peer It appears that a FAILOVER_MSG can come from peer even when the failure link is resetting (i.e. just after the 'node_write_unlock()'...). This means the failover procedure on the node has not been started yet. The situation is as follows: node1 node2 linkb linka linka linkb | | | | | | x failure | | | RESETTING | | | | | | x failure RESET | | RESETTING FAILINGOVER | | | (FAILOVER_MSG) | | |<-------------------------------------------------| | *FAILINGOVER | | | | | (dummy FAILOVER_MSG) | | |------------------------------------------------->| | RESET | | FAILOVER_END | FAILINGOVER RESET | . . . . . . . . . . . . Once this happens, the link failover procedure will be triggered wrongly on the receiving node since the node isn't in FAILINGOVER state but then another link failover will be carried out. The consequences are: 1) A peer might get stuck in FAILINGOVER state because the 'sync_point' was set, reset and set incorrectly, the criteria to end the failover would not be met, it could keep waiting for a message that has already received. 2) The early FAILOVER_MSG(s) could be queued in the link failover deferdq but would be purged or not pulled out because the 'drop_point' was not set correctly. 3) The early FAILOVER_MSG(s) could be dropped too. 4) The dummy FAILOVER_MSG could make the peer leaving FAILINGOVER state shortly, but later on it would be restarted. The same situation can also happen when the link is in PEER_RESET state and a FAILOVER_MSG arrives. The commit resolves the issues by forcing the link down immediately, so the failover procedure will be started normally (which is the same as when receiving a FAILOVER_MSG and the link is in up state). Also, the function "tipc_node_link_failover()" is toughen to avoid such a situation from happening. Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- net/tipc/link.c | 1 - net/tipc/node.c | 10 +++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/net/tipc/link.c b/net/tipc/link.c index f5cd986e1e50..2050fd386642 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1728,7 +1728,6 @@ void tipc_link_failover_prepare(struct tipc_link *l, struct tipc_link *tnl, * node has entered SELF_DOWN_PEER_LEAVING and both peer nodes * would have to start over from scratch instead. */ - WARN_ON(l && tipc_link_is_up(l)); tnl->drop_point = 1; tnl->failover_reasm_skb = NULL; diff --git a/net/tipc/node.c b/net/tipc/node.c index 9e106d3ed187..550581d47d51 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -766,9 +766,9 @@ static void tipc_node_link_up(struct tipc_node *n, int bearer_id, * disturbance, wrong session, etc.) * 3. Link <1B-2B> up * 4. Link endpoint 2A down (e.g. due to link tolerance timeout) - * 5. Node B starts failover onto link <1B-2B> + * 5. Node 2 starts failover onto link <1B-2B> * - * ==> Node A does never start link/node failover! + * ==> Node 1 does never start link/node failover! * * @n: tipc node structure * @l: link peer endpoint failingover (- can be NULL) @@ -783,6 +783,10 @@ static void tipc_node_link_failover(struct tipc_node *n, struct tipc_link *l, if (!tipc_link_is_up(tnl)) return; + /* Don't rush, failure link may be in the process of resetting */ + if (l && !tipc_link_is_reset(l)) + return; + tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); @@ -1706,7 +1710,7 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, /* Initiate or update failover mode if applicable */ if ((usr == TUNNEL_PROTOCOL) && (mtyp == FAILOVER_MSG)) { syncpt = oseqno + exp_pkts - 1; - if (pl && tipc_link_is_up(pl)) { + if (pl && !tipc_link_is_reset(pl)) { __tipc_node_link_down(n, &pb_id, xmitq, &maddr); trace_tipc_node_link_down(n, true, "node link down <- failover!"); From d0bae4a0e3d8c5690a885204d7eb2341a5b4884d Mon Sep 17 00:00:00 2001 From: JingYi Hou Date: Mon, 17 Jun 2019 14:56:05 +0800 Subject: [PATCH 06/24] net: remove duplicate fetch in sock_getsockopt In sock_getsockopt(), 'optlen' is fetched the first time from userspace. 'len < 0' is then checked. Then in condition 'SO_MEMINFO', 'optlen' is fetched the second time from userspace. If change it between two fetches may cause security problems or unexpected behaivor, and there is no reason to fetch it a second time. To fix this, we need to remove the second fetch. Signed-off-by: JingYi Hou Signed-off-by: David S. Miller --- net/core/sock.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index af09a23e4822..aa4a00d381e3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1477,9 +1477,6 @@ int sock_getsockopt(struct socket *sock, int level, int optname, { u32 meminfo[SK_MEMINFO_VARS]; - if (get_user(len, optlen)) - return -EFAULT; - sk_get_meminfo(sk, meminfo); len = min_t(unsigned int, len, sizeof(meminfo)); From 72b319dc08b4924a29f5e2560ef6d966fa54c429 Mon Sep 17 00:00:00 2001 From: Fei Li Date: Mon, 17 Jun 2019 21:26:36 +0800 Subject: [PATCH 07/24] tun: wake up waitqueues after IFF_UP is set Currently after setting tap0 link up, the tun code wakes tx/rx waited queues up in tun_net_open() when .ndo_open() is called, however the IFF_UP flag has not been set yet. If there's already a wait queue, it would fail to transmit when checking the IFF_UP flag in tun_sendmsg(). Then the saving vhost_poll_start() will add the wq into wqh until it is waken up again. Although this works when IFF_UP flag has been set when tun_chr_poll detects; this is not true if IFF_UP flag has not been set at that time. Sadly the latter case is a fatal error, as the wq will never be waken up in future unless later manually setting link up on purpose. Fix this by moving the wakeup process into the NETDEV_UP event notifying process, this makes sure IFF_UP has been set before all waited queues been waken up. Signed-off-by: Fei Li Acked-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/tun.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index c452d6d831dd..d7c55e0fa8f4 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1014,18 +1014,8 @@ static void tun_net_uninit(struct net_device *dev) /* Net device open. */ static int tun_net_open(struct net_device *dev) { - struct tun_struct *tun = netdev_priv(dev); - int i; - netif_tx_start_all_queues(dev); - for (i = 0; i < tun->numqueues; i++) { - struct tun_file *tfile; - - tfile = rtnl_dereference(tun->tfiles[i]); - tfile->socket.sk->sk_write_space(tfile->socket.sk); - } - return 0; } @@ -3634,6 +3624,7 @@ static int tun_device_event(struct notifier_block *unused, { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct tun_struct *tun = netdev_priv(dev); + int i; if (dev->rtnl_link_ops != &tun_link_ops) return NOTIFY_DONE; @@ -3643,6 +3634,14 @@ static int tun_device_event(struct notifier_block *unused, if (tun_queue_resize(tun)) return NOTIFY_BAD; break; + case NETDEV_UP: + for (i = 0; i < tun->numqueues; i++) { + struct tun_file *tfile; + + tfile = rtnl_dereference(tun->tfiles[i]); + tfile->socket.sk->sk_write_space(tfile->socket.sk); + } + break; default: break; } From 5684abf7020dfc5f0b6ba1d68eda3663871fce52 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 17 Jun 2019 21:34:13 +0800 Subject: [PATCH 08/24] ip_tunnel: allow not to count pkts on tstats by setting skb's dev to NULL iptunnel_xmit() works as a common function, also used by a udp tunnel which doesn't have to have a tunnel device, like how TIPC works with udp media. In these cases, we should allow not to count pkts on dev's tstats, so that udp tunnel can work with no tunnel device safely. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel_core.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 9e3846388fb3..1452a97914a0 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -76,9 +76,12 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1); err = ip_local_out(net, sk, skb); - if (unlikely(net_xmit_eval(err))) - pkt_len = 0; - iptunnel_xmit_stats(dev, pkt_len); + + if (dev) { + if (unlikely(net_xmit_eval(err))) + pkt_len = 0; + iptunnel_xmit_stats(dev, pkt_len); + } } EXPORT_SYMBOL_GPL(iptunnel_xmit); From 6f6a8622057c92408930c31698394fae1557b188 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 17 Jun 2019 21:34:14 +0800 Subject: [PATCH 09/24] ip6_tunnel: allow not to count pkts on tstats by passing dev as NULL A similar fix to Patch "ip_tunnel: allow not to count pkts on tstats by setting skb's dev to NULL" is also needed by ip6_tunnel. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/ip6_tunnel.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 69b4bcf880c9..028eaea1c854 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -158,9 +158,12 @@ static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb, memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); pkt_len = skb->len - skb_inner_network_offset(skb); err = ip6_local_out(dev_net(skb_dst(skb)->dev), sk, skb); - if (unlikely(net_xmit_eval(err))) - pkt_len = -1; - iptunnel_xmit_stats(dev, pkt_len); + + if (dev) { + if (unlikely(net_xmit_eval(err))) + pkt_len = -1; + iptunnel_xmit_stats(dev, pkt_len); + } } #endif #endif From c3bcde026684c62d7a2b6f626dc7cf763833875c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 17 Jun 2019 21:34:15 +0800 Subject: [PATCH 10/24] tipc: pass tunnel dev as NULL to udp_tunnel(6)_xmit_skb udp_tunnel(6)_xmit_skb() called by tipc_udp_xmit() expects a tunnel device to count packets on dev->tstats, a perpcu variable. However, TIPC is using udp tunnel with no tunnel device, and pass the lower dev, like veth device that only initializes dev->lstats(a perpcu variable) when creating it. Later iptunnel_xmit_stats() called by ip(6)tunnel_xmit() thinks the dev as a tunnel device, and uses dev->tstats instead of dev->lstats. tstats' each pointer points to a bigger struct than lstats, so when tstats->tx_bytes is increased, other percpu variable's members could be overwritten. syzbot has reported quite a few crashes due to fib_nh_common percpu member 'nhc_pcpu_rth_output' overwritten, call traces are like: BUG: KASAN: slab-out-of-bounds in rt_cache_valid+0x158/0x190 net/ipv4/route.c:1556 rt_cache_valid+0x158/0x190 net/ipv4/route.c:1556 __mkroute_output net/ipv4/route.c:2332 [inline] ip_route_output_key_hash_rcu+0x819/0x2d50 net/ipv4/route.c:2564 ip_route_output_key_hash+0x1ef/0x360 net/ipv4/route.c:2393 __ip_route_output_key include/net/route.h:125 [inline] ip_route_output_flow+0x28/0xc0 net/ipv4/route.c:2651 ip_route_output_key include/net/route.h:135 [inline] ... or: kasan: GPF could be caused by NULL-ptr deref or user memory access RIP: 0010:dst_dev_put+0x24/0x290 net/core/dst.c:168 rt_fibinfo_free_cpus net/ipv4/fib_semantics.c:200 [inline] free_fib_info_rcu+0x2e1/0x490 net/ipv4/fib_semantics.c:217 __rcu_reclaim kernel/rcu/rcu.h:240 [inline] rcu_do_batch kernel/rcu/tree.c:2437 [inline] invoke_rcu_callbacks kernel/rcu/tree.c:2716 [inline] rcu_process_callbacks+0x100a/0x1ac0 kernel/rcu/tree.c:2697 ... The issue exists since tunnel stats update is moved to iptunnel_xmit by Commit 039f50629b7f ("ip_tunnel: Move stats update to iptunnel_xmit()"), and here to fix it by passing a NULL tunnel dev to udp_tunnel(6)_xmit_skb so that the packets counting won't happen on dev->tstats. Reported-by: syzbot+9d4c12bfd45a58738d0a@syzkaller.appspotmail.com Reported-by: syzbot+a9e23ea2aa21044c2798@syzkaller.appspotmail.com Reported-by: syzbot+c4c4b2bb358bb936ad7e@syzkaller.appspotmail.com Reported-by: syzbot+0290d2290a607e035ba1@syzkaller.appspotmail.com Reported-by: syzbot+a43d8d4e7e8a7a9e149e@syzkaller.appspotmail.com Reported-by: syzbot+a47c5f4c6c00fc1ed16e@syzkaller.appspotmail.com Fixes: 039f50629b7f ("ip_tunnel: Move stats update to iptunnel_xmit()") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/tipc/udp_media.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 7fc02d84c4f1..1405ccc9101c 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -176,7 +176,6 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, goto tx_error; } - skb->dev = rt->dst.dev; ttl = ip4_dst_hoplimit(&rt->dst); udp_tunnel_xmit_skb(rt, ub->ubsock->sk, skb, src->ipv4.s_addr, dst->ipv4.s_addr, 0, ttl, 0, src->port, @@ -195,10 +194,9 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, if (err) goto tx_error; ttl = ip6_dst_hoplimit(ndst); - err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, - ndst->dev, &src->ipv6, - &dst->ipv6, 0, ttl, 0, src->port, - dst->port, false); + err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, NULL, + &src->ipv6, &dst->ipv6, 0, ttl, 0, + src->port, dst->port, false); #endif } return err; From 9476274093a0e79b905f4cd6cf6d149f65e02c17 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 17 Jun 2019 17:12:49 +0100 Subject: [PATCH 11/24] net: lio_core: fix potential sign-extension overflow on large shift Left shifting the signed int value 1 by 31 bits has undefined behaviour and the shift amount oq_no can be as much as 63. Fix this by using BIT_ULL(oq_no) instead. Addresses-Coverity: ("Bad shift operation") Fixes: f21fb3ed364b ("Add support of Cavium Liquidio ethernet adapters") Signed-off-by: Colin Ian King Reviewed-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/ethernet/cavium/liquidio/lio_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cavium/liquidio/lio_core.c b/drivers/net/ethernet/cavium/liquidio/lio_core.c index 1c50c10b5a16..d7e805749a5b 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_core.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_core.c @@ -964,7 +964,7 @@ static void liquidio_schedule_droq_pkt_handlers(struct octeon_device *oct) if (droq->ops.poll_mode) { droq->ops.napi_fn(droq); - oct_priv->napi_mask |= (1 << oq_no); + oct_priv->napi_mask |= BIT_ULL(oq_no); } else { tasklet_schedule(&oct_priv->droq_tasklet); } From 177b8007463c4f36c9a2c7ce7aa9875a4cad9bd5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 17 Jun 2019 11:11:10 -0700 Subject: [PATCH 12/24] net: netem: fix backlog accounting for corrupted GSO frames When GSO frame has to be corrupted netem uses skb_gso_segment() to produce the list of frames, and re-enqueues the segments one by one. The backlog length has to be adjusted to account for new frames. The current calculation is incorrect, leading to wrong backlog lengths in the parent qdisc (both bytes and packets), and incorrect packet backlog count in netem itself. Parent backlog goes negative, netem's packet backlog counts all non-first segments twice (thus remaining non-zero even after qdisc is emptied). Move the variables used to count the adjustment into local scope to make 100% sure they aren't used at any stage in backports. Fixes: 6071bd1aa13e ("netem: Segment GSO packets on enqueue") Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/sch_netem.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 956ff3da81f4..3b3e2d772c3b 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -439,8 +439,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct netem_skb_cb *cb; struct sk_buff *skb2; struct sk_buff *segs = NULL; - unsigned int len = 0, last_len, prev_len = qdisc_pkt_len(skb); - int nb = 0; + unsigned int prev_len = qdisc_pkt_len(skb); int count = 1; int rc = NET_XMIT_SUCCESS; int rc_drop = NET_XMIT_DROP; @@ -497,6 +496,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, segs = netem_segment(skb, sch, to_free); if (!segs) return rc_drop; + qdisc_skb_cb(segs)->pkt_len = segs->len; } else { segs = skb; } @@ -593,6 +593,11 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, finish_segs: if (segs) { + unsigned int len, last_len; + int nb = 0; + + len = skb->len; + while (segs) { skb2 = segs->next; skb_mark_not_on_list(segs); @@ -608,9 +613,7 @@ finish_segs: } segs = skb2; } - sch->q.qlen += nb; - if (nb > 1) - qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len); + qdisc_tree_reduce_backlog(sch, -nb, prev_len - len); } return NET_XMIT_SUCCESS; } From 3e14c383de349a86895b8c6c410b9222646574f6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 17 Jun 2019 11:11:11 -0700 Subject: [PATCH 13/24] net: netem: fix use after free and double free with packet corruption Brendan reports that the use of netem's packet corruption capability leads to strange crashes. This seems to be caused by commit d66280b12bd7 ("net: netem: use a list in addition to rbtree") which uses skb->next pointer to construct a fast-path queue of in-order skbs. Packet corruption code has to invoke skb_gso_segment() in case of skbs in need of GSO. skb_gso_segment() returns a list of skbs. If next pointers of the skbs on that list do not get cleared fast path list may point to freed skbs or skbs which are also on the RB tree. Let's say skb gets segmented into 3 frames: A -> B -> C A gets hooked to the t_head t_tail list by tfifo_enqueue(), but it's next pointer didn't get cleared so we have: h t |/ A -> B -> C Now if B and C get also get enqueued successfully all is fine, because tfifo_enqueue() will overwrite the list in order. IOW: Enqueue B: h t | | A -> B C Enqueue C: h t | | A -> B -> C But if B and C get reordered we may end up with: h t RB tree |/ | A -> B -> C B \ C Or if they get dropped just: h t |/ A -> B -> C where A and B are already freed. To reproduce either limit has to be set low to cause freeing of segs or reorders have to happen (due to delay jitter). Note that we only have to mark the first segment as not on the list, "finish_segs" handling of other frags already does that. Another caveat is that qdisc_drop_all() still has to free all segments correctly in case of drop of first segment, therefore we re-link segs before calling it. v2: - re-link before drop, v1 was leaking non-first segs if limit was hit at the first seg - better commit message which lead to discovering the above :) Reported-by: Brendan Galloway Fixes: d66280b12bd7 ("net: netem: use a list in addition to rbtree") Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/sch_netem.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 3b3e2d772c3b..b17f2ed970e2 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -493,17 +493,14 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, */ if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) { if (skb_is_gso(skb)) { - segs = netem_segment(skb, sch, to_free); - if (!segs) + skb = netem_segment(skb, sch, to_free); + if (!skb) return rc_drop; - qdisc_skb_cb(segs)->pkt_len = segs->len; - } else { - segs = skb; + segs = skb->next; + skb_mark_not_on_list(skb); + qdisc_skb_cb(skb)->pkt_len = skb->len; } - skb = segs; - segs = segs->next; - skb = skb_unshare(skb, GFP_ATOMIC); if (unlikely(!skb)) { qdisc_qstats_drop(sch); @@ -520,6 +517,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, } if (unlikely(sch->q.qlen >= sch->limit)) { + /* re-link segs, so that qdisc_drop_all() frees them all */ + skb->next = segs; qdisc_drop_all(skb, sch, to_free); return rc_drop; } From 76e21533a48bb42d1fa894f93f6233bf4554f45e Mon Sep 17 00:00:00 2001 From: Fred Klassen Date: Mon, 17 Jun 2019 12:05:07 -0700 Subject: [PATCH 14/24] net/udp_gso: Allow TX timestamp with UDP GSO Fixes an issue where TX Timestamps are not arriving on the error queue when UDP_SEGMENT CMSG type is combined with CMSG type SO_TIMESTAMPING. This can be illustrated with an updated updgso_bench_tx program which includes the '-T' option to test for this condition. It also introduces the '-P' option which will call poll() before reading the error queue. ./udpgso_bench_tx -4ucTPv -S 1472 -l2 -D 172.16.120.18 poll timeout udp tx: 0 MB/s 1 calls/s 1 msg/s The "poll timeout" message above indicates that TX timestamp never arrived. This patch preserves tx_flags for the first UDP GSO segment. Only the first segment is timestamped, even though in some cases there may be benefital in timestamping both the first and last segment. Factors in deciding on first segment timestamp only: - Timestamping both first and last segmented is not feasible. Hardware can only have one outstanding TS request at a time. - Timestamping last segment may under report network latency of the previous segments. Even though the doorbell is suppressed, the ring producer counter has been incremented. - Timestamping the first segment has the upside in that it reports timestamps from the application's view, e.g. RTT. - Timestamping the first segment has the downside that it may underreport tx host network latency. It appears that we have to pick one or the other. And possibly follow-up with a config flag to choose behavior. v2: Remove tests as noted by Willem de Bruijn Moving tests from net to net-next v3: Update only relevant tx_flag bits as per Willem de Bruijn v4: Update comments and commit message as per Willem de Bruijn Fixes: ee80d1ebe5ba ("udp: add udp gso") Signed-off-by: Fred Klassen Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/udp_offload.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 06b3e2c1fcdc..9763464a75d7 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -224,6 +224,11 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, seg = segs; uh = udp_hdr(seg); + /* preserve TX timestamp flags and TS key for first segment */ + skb_shinfo(seg)->tskey = skb_shinfo(gso_skb)->tskey; + skb_shinfo(seg)->tx_flags |= + (skb_shinfo(gso_skb)->tx_flags & SKBTX_ANY_TSTAMP); + /* compute checksum adjustment based on old length versus new */ newlen = htons(sizeof(*uh) + mss); check = csum16_add(csum16_sub(uh->check, uh->len), newlen); From cb359b60416701c8bed82fec79de25a144beb893 Mon Sep 17 00:00:00 2001 From: Sunil Muthuswamy Date: Mon, 17 Jun 2019 19:26:25 +0000 Subject: [PATCH 15/24] hvsock: fix epollout hang from race condition Currently, hvsock can enter into a state where epoll_wait on EPOLLOUT will not return even when the hvsock socket is writable, under some race condition. This can happen under the following sequence: - fd = socket(hvsocket) - fd_out = dup(fd) - fd_in = dup(fd) - start a writer thread that writes data to fd_out with a combination of epoll_wait(fd_out, EPOLLOUT) and - start a reader thread that reads data from fd_in with a combination of epoll_wait(fd_in, EPOLLIN) - On the host, there are two threads that are reading/writing data to the hvsocket stack: hvs_stream_has_space hvs_notify_poll_out vsock_poll sock_poll ep_poll Race condition: check for epollout from ep_poll(): assume no writable space in the socket hvs_stream_has_space() returns 0 check for epollin from ep_poll(): assume socket has some free space < HVS_PKT_LEN(HVS_SEND_BUF_SIZE) hvs_stream_has_space() will clear the channel pending send size host will not notify the guest because the pending send size has been cleared and so the hvsocket will never mark the socket writable Now, the EPOLLOUT will never return even if the socket write buffer is empty. The fix is to set the pending size to the default size and never change it. This way the host will always notify the guest whenever the writable space is bigger than the pending size. The host is already optimized to *only* notify the guest when the pending size threshold boundary is crossed and not everytime. This change also reduces the cpu usage somewhat since hv_stream_has_space() is in the hotpath of send: vsock_stream_sendmsg()->hv_stream_has_space() Earlier hv_stream_has_space was setting/clearing the pending size on every call. Signed-off-by: Sunil Muthuswamy Reviewed-by: Dexuan Cui Signed-off-by: David S. Miller --- net/vmw_vsock/hyperv_transport.c | 39 +++++++------------------------- 1 file changed, 8 insertions(+), 31 deletions(-) diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index fb2df6e068fa..62dcdf082349 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -211,18 +211,6 @@ static void hvs_set_channel_pending_send_size(struct vmbus_channel *chan) set_channel_pending_send_size(chan, HVS_PKT_LEN(HVS_SEND_BUF_SIZE)); - /* See hvs_stream_has_space(): we must make sure the host has seen - * the new pending send size, before we can re-check the writable - * bytes. - */ - virt_mb(); -} - -static void hvs_clear_channel_pending_send_size(struct vmbus_channel *chan) -{ - set_channel_pending_send_size(chan, 0); - - /* Ditto */ virt_mb(); } @@ -292,9 +280,6 @@ static void hvs_channel_cb(void *ctx) if (hvs_channel_readable(chan)) sk->sk_data_ready(sk); - /* See hvs_stream_has_space(): when we reach here, the writable bytes - * may be already less than HVS_PKT_LEN(HVS_SEND_BUF_SIZE). - */ if (hv_get_bytes_to_write(&chan->outbound) > 0) sk->sk_write_space(sk); } @@ -395,6 +380,13 @@ static void hvs_open_connection(struct vmbus_channel *chan) set_per_channel_state(chan, conn_from_host ? new : sk); vmbus_set_chn_rescind_callback(chan, hvs_close_connection); + /* Set the pending send size to max packet size to always get + * notifications from the host when there is enough writable space. + * The host is optimized to send notifications only when the pending + * size boundary is crossed, and not always. + */ + hvs_set_channel_pending_send_size(chan); + if (conn_from_host) { new->sk_state = TCP_ESTABLISHED; sk->sk_ack_backlog++; @@ -688,23 +680,8 @@ static s64 hvs_stream_has_data(struct vsock_sock *vsk) static s64 hvs_stream_has_space(struct vsock_sock *vsk) { struct hvsock *hvs = vsk->trans; - struct vmbus_channel *chan = hvs->chan; - s64 ret; - ret = hvs_channel_writable_bytes(chan); - if (ret > 0) { - hvs_clear_channel_pending_send_size(chan); - } else { - /* See hvs_channel_cb() */ - hvs_set_channel_pending_send_size(chan); - - /* Re-check the writable bytes to avoid race */ - ret = hvs_channel_writable_bytes(chan); - if (ret > 0) - hvs_clear_channel_pending_send_size(chan); - } - - return ret; + return hvs_channel_writable_bytes(hvs->chan); } static u64 hvs_stream_rcvhiwat(struct vsock_sock *vsk) From 48620e341659f6e4b978ec229f6944dabe6df709 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 19 Jun 2019 10:02:13 +0000 Subject: [PATCH 16/24] net: dsa: mv88e6xxx: fix shift of FID bits in mv88e6185_g1_vtu_loadpurge() The comment is correct, but the code ends up moving the bits four places too far, into the VTUOp field. Fixes: 11ea809f1a74 (net: dsa: mv88e6xxx: support 256 databases) Signed-off-by: Rasmus Villemoes Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx/global1_vtu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/mv88e6xxx/global1_vtu.c b/drivers/net/dsa/mv88e6xxx/global1_vtu.c index 3e9be3f51196..ecef69045a42 100644 --- a/drivers/net/dsa/mv88e6xxx/global1_vtu.c +++ b/drivers/net/dsa/mv88e6xxx/global1_vtu.c @@ -415,7 +415,7 @@ int mv88e6185_g1_vtu_loadpurge(struct mv88e6xxx_chip *chip, * VTU DBNum[7:4] are located in VTU Operation 11:8 */ op |= entry->fid & 0x000f; - op |= (entry->fid & 0x00f0) << 8; + op |= (entry->fid & 0x00f0) << 4; } return mv88e6xxx_g1_vtu_op(chip, op); From fdbf6326912d578a31ac4ca0933c919eadf1d54c Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 18 Jun 2019 20:42:59 +0200 Subject: [PATCH 17/24] net/af_iucv: remove GFP_DMA restriction for HiperTransport af_iucv sockets over z/VM IUCV require that their skbs are allocated in DMA memory. This restriction doesn't apply to connections over HiperSockets. So only set this limit for z/VM IUCV sockets, thereby increasing the likelihood that the large (and linear!) allocations for HiperTransport messages succeed. Fixes: 3881ac441f64 ("af_iucv: add HiperSockets transport") Signed-off-by: Julian Wiedmann Reviewed-by: Ursula Braun Reviewed-by: Hendrik Brueckner Signed-off-by: David S. Miller --- net/iucv/af_iucv.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 36eb8d1d9128..2ee68d2d4693 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -588,7 +588,6 @@ static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio, sk->sk_destruct = iucv_sock_destruct; sk->sk_sndtimeo = IUCV_CONN_TIMEOUT; - sk->sk_allocation = GFP_DMA; sock_reset_flag(sk, SOCK_ZAPPED); @@ -782,6 +781,7 @@ vm_bind: memcpy(iucv->src_user_id, iucv_userid, 8); sk->sk_state = IUCV_BOUND; iucv->transport = AF_IUCV_TRANS_IUCV; + sk->sk_allocation |= GFP_DMA; if (!iucv->msglimit) iucv->msglimit = IUCV_QUEUELEN_DEFAULT; goto done_unlock; @@ -806,6 +806,8 @@ static int iucv_sock_autobind(struct sock *sk) return -EPROTO; memcpy(iucv->src_user_id, iucv_userid, 8); + iucv->transport = AF_IUCV_TRANS_IUCV; + sk->sk_allocation |= GFP_DMA; write_lock_bh(&iucv_sk_list.lock); __iucv_auto_name(iucv); @@ -1781,6 +1783,8 @@ static int iucv_callback_connreq(struct iucv_path *path, niucv = iucv_sk(nsk); iucv_sock_init(nsk, sk); + niucv->transport = AF_IUCV_TRANS_IUCV; + nsk->sk_allocation |= GFP_DMA; /* Set the new iucv_sock */ memcpy(niucv->dst_name, ipuser + 8, 8); From 238965b71b968dc5b3c0fe430e946f488322c4b5 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 18 Jun 2019 20:43:00 +0200 Subject: [PATCH 18/24] net/af_iucv: build proper skbs for HiperTransport The HiperSockets-based transport path in af_iucv is still too closely entangled with qeth. With commit a647a02512ca ("s390/qeth: speed-up L3 IQD xmit"), the relevant xmit code in qeth has begun to use skb_cow_head(). So to avoid unnecessary skb head expansions, af_iucv must learn to 1) respect dev->needed_headroom when allocating skbs, and 2) drop the header reference before cloning the skb. While at it, also stop hard-coding the LL-header creation stage and just use the appropriate helper. Fixes: a647a02512ca ("s390/qeth: speed-up L3 IQD xmit") Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- net/iucv/af_iucv.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 2ee68d2d4693..5264a182d2c3 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -14,6 +14,7 @@ #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include +#include #include #include #include @@ -347,14 +348,14 @@ static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock, if (imsg) memcpy(&phs_hdr->iucv_hdr, imsg, sizeof(struct iucv_message)); - skb_push(skb, ETH_HLEN); - memset(skb->data, 0, ETH_HLEN); - skb->dev = iucv->hs_dev; if (!skb->dev) { err = -ENODEV; goto err_free; } + + dev_hard_header(skb, skb->dev, ETH_P_AF_IUCV, NULL, NULL, skb->len); + if (!(skb->dev->flags & IFF_UP) || !netif_carrier_ok(skb->dev)) { err = -ENETDOWN; goto err_free; @@ -367,6 +368,8 @@ static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock, skb_trim(skb, skb->dev->mtu); } skb->protocol = cpu_to_be16(ETH_P_AF_IUCV); + + __skb_header_release(skb); nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) { err = -ENOMEM; @@ -466,12 +469,14 @@ static void iucv_sever_path(struct sock *sk, int with_user_data) /* Send controlling flags through an IUCV socket for HIPER transport */ static int iucv_send_ctrl(struct sock *sk, u8 flags) { + struct iucv_sock *iucv = iucv_sk(sk); int err = 0; int blen; struct sk_buff *skb; u8 shutdown = 0; - blen = sizeof(struct af_iucv_trans_hdr) + ETH_HLEN; + blen = sizeof(struct af_iucv_trans_hdr) + + LL_RESERVED_SPACE(iucv->hs_dev); if (sk->sk_shutdown & SEND_SHUTDOWN) { /* controlling flags should be sent anyway */ shutdown = sk->sk_shutdown; @@ -1133,7 +1138,8 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, * segmented records using the MSG_EOR flag), but * for SOCK_STREAM we might want to improve it in future */ if (iucv->transport == AF_IUCV_TRANS_HIPER) { - headroom = sizeof(struct af_iucv_trans_hdr) + ETH_HLEN; + headroom = sizeof(struct af_iucv_trans_hdr) + + LL_RESERVED_SPACE(iucv->hs_dev); linear = len; } else { if (len < PAGE_SIZE) { From 06996c1d4088a0d5f3e7789d7f96b4653cc947cc Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 18 Jun 2019 20:43:01 +0200 Subject: [PATCH 19/24] net/af_iucv: always register net_device notifier Even when running as VM guest (ie pr_iucv != NULL), af_iucv can still open HiperTransport-based connections. For robust operation these connections require the af_iucv_netdev_notifier, so register it unconditionally. Also handle any error that register_netdevice_notifier() returns. Fixes: 9fbd87d41392 ("af_iucv: handle netdev events") Signed-off-by: Julian Wiedmann Reviewed-by: Ursula Braun Signed-off-by: David S. Miller --- net/iucv/af_iucv.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 5264a182d2c3..09e1694b6d34 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -2440,6 +2440,13 @@ out: return err; } +static void afiucv_iucv_exit(void) +{ + device_unregister(af_iucv_dev); + driver_unregister(&af_iucv_driver); + pr_iucv->iucv_unregister(&af_iucv_handler, 0); +} + static int __init afiucv_init(void) { int err; @@ -2473,11 +2480,18 @@ static int __init afiucv_init(void) err = afiucv_iucv_init(); if (err) goto out_sock; - } else - register_netdevice_notifier(&afiucv_netdev_notifier); + } + + err = register_netdevice_notifier(&afiucv_netdev_notifier); + if (err) + goto out_notifier; + dev_add_pack(&iucv_packet_type); return 0; +out_notifier: + if (pr_iucv) + afiucv_iucv_exit(); out_sock: sock_unregister(PF_IUCV); out_proto: @@ -2491,12 +2505,11 @@ out: static void __exit afiucv_exit(void) { if (pr_iucv) { - device_unregister(af_iucv_dev); - driver_unregister(&af_iucv_driver); - pr_iucv->iucv_unregister(&af_iucv_handler, 0); + afiucv_iucv_exit(); symbol_put(iucv_if); - } else - unregister_netdevice_notifier(&afiucv_netdev_notifier); + } + + unregister_netdevice_notifier(&afiucv_netdev_notifier); dev_remove_pack(&iucv_packet_type); sock_unregister(PF_IUCV); proto_unregister(&iucv_proto); From bf6de2315362473e14817ccbdbd00ade6de2756e Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 18 Jun 2019 20:54:22 +0200 Subject: [PATCH 20/24] net: hns3: Fix inconsistent indenting Fix wrong indentation of goto return. Signed-off-by: Krzysztof Kozlowski Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 196a3d780dcf..f326805543a4 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -3803,7 +3803,7 @@ static int hns3_client_init(struct hnae3_handle *handle) ret = hns3_client_start(handle); if (ret) { dev_err(priv->dev, "hns3_client_start fail! ret=%d\n", ret); - goto out_client_start; + goto out_client_start; } hns3_dcbnl_setup(handle); From c7036d97acd2527cef145b5ef9ad1a37ed21bbe6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 19 Jun 2019 10:50:24 -0700 Subject: [PATCH 21/24] ipv6: Default fib6_type to RTN_UNICAST when not set A user reported that routes are getting installed with type 0 (RTN_UNSPEC) where before the routes were RTN_UNICAST. One example is from accel-ppp which apparently still uses the ioctl interface and does not set rtmsg_type. Another is the netlink interface where ipv6 does not require rtm_type to be set (v4 does). Prior to the commit in the Fixes tag the ipv6 stack converted type 0 to RTN_UNICAST, so restore that behavior. Fixes: e8478e80e5a7 ("net/ipv6: Save route type in rt6_info") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0f60eb3a2873..11ad62effd56 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3184,7 +3184,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, rt->fib6_table = table; rt->fib6_metric = cfg->fc_metric; - rt->fib6_type = cfg->fc_type; + rt->fib6_type = cfg->fc_type ? : RTN_UNICAST; rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY; ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); From 8110a7a7d295b08433dfa7e692e347a43e63f475 Mon Sep 17 00:00:00 2001 From: Nathan Huckleberry Date: Wed, 19 Jun 2019 11:17:15 -0700 Subject: [PATCH 22/24] net: mvpp2: debugfs: Add pmap to fs dump There was an unused variable 'mvpp2_dbgfs_prs_pmap_fops' Added a usage consistent with other fops to dump pmap to userspace. Cc: clang-built-linux@googlegroups.com Link: https://github.com/ClangBuiltLinux/linux/issues/529 Signed-off-by: Nathan Huckleberry Tested-by: Nick Desaulniers Reviewed-by: Nick Desaulniers Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2/mvpp2_debugfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_debugfs.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_debugfs.c index 0ee39ea47b6b..274fb07362cb 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_debugfs.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_debugfs.c @@ -566,6 +566,9 @@ static int mvpp2_dbgfs_prs_entry_init(struct dentry *parent, debugfs_create_file("hits", 0444, prs_entry_dir, entry, &mvpp2_dbgfs_prs_hits_fops); + debugfs_create_file("pmap", 0444, prs_entry_dir, entry, + &mvpp2_dbgfs_prs_pmap_fops); + return 0; } From 85f9aa7565bd79b039325f2c01af7ffa717924df Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 19 Jun 2019 09:38:38 -0700 Subject: [PATCH 23/24] inet: clear num_timeout reqsk_alloc() KMSAN caught uninit-value in tcp_create_openreq_child() [1] This is caused by a recent change, combined by the fact that TCP cleared num_timeout, num_retrans and sk fields only when a request socket was about to be queued. Under syncookie mode, a temporary request socket is used, and req->num_timeout could contain garbage. Lets clear these three fields sooner, there is really no point trying to defer this and risk other bugs. [1] BUG: KMSAN: uninit-value in tcp_create_openreq_child+0x157f/0x1cc0 net/ipv4/tcp_minisocks.c:526 CPU: 1 PID: 13357 Comm: syz-executor591 Not tainted 5.2.0-rc4+ #3 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x191/0x1f0 lib/dump_stack.c:113 kmsan_report+0x162/0x2d0 mm/kmsan/kmsan.c:611 __msan_warning+0x75/0xe0 mm/kmsan/kmsan_instr.c:304 tcp_create_openreq_child+0x157f/0x1cc0 net/ipv4/tcp_minisocks.c:526 tcp_v6_syn_recv_sock+0x761/0x2d80 net/ipv6/tcp_ipv6.c:1152 tcp_get_cookie_sock+0x16e/0x6b0 net/ipv4/syncookies.c:209 cookie_v6_check+0x27e0/0x29a0 net/ipv6/syncookies.c:252 tcp_v6_cookie_check net/ipv6/tcp_ipv6.c:1039 [inline] tcp_v6_do_rcv+0xf1c/0x1ce0 net/ipv6/tcp_ipv6.c:1344 tcp_v6_rcv+0x60b7/0x6a30 net/ipv6/tcp_ipv6.c:1554 ip6_protocol_deliver_rcu+0x1433/0x22f0 net/ipv6/ip6_input.c:397 ip6_input_finish net/ipv6/ip6_input.c:438 [inline] NF_HOOK include/linux/netfilter.h:305 [inline] ip6_input+0x2af/0x340 net/ipv6/ip6_input.c:447 dst_input include/net/dst.h:439 [inline] ip6_rcv_finish net/ipv6/ip6_input.c:76 [inline] NF_HOOK include/linux/netfilter.h:305 [inline] ipv6_rcv+0x683/0x710 net/ipv6/ip6_input.c:272 __netif_receive_skb_one_core net/core/dev.c:4981 [inline] __netif_receive_skb net/core/dev.c:5095 [inline] process_backlog+0x721/0x1410 net/core/dev.c:5906 napi_poll net/core/dev.c:6329 [inline] net_rx_action+0x738/0x1940 net/core/dev.c:6395 __do_softirq+0x4ad/0x858 kernel/softirq.c:293 do_softirq_own_stack+0x49/0x80 arch/x86/entry/entry_64.S:1052 do_softirq kernel/softirq.c:338 [inline] __local_bh_enable_ip+0x199/0x1e0 kernel/softirq.c:190 local_bh_enable+0x36/0x40 include/linux/bottom_half.h:32 rcu_read_unlock_bh include/linux/rcupdate.h:682 [inline] ip6_finish_output2+0x213f/0x2670 net/ipv6/ip6_output.c:117 ip6_finish_output+0xae4/0xbc0 net/ipv6/ip6_output.c:150 NF_HOOK_COND include/linux/netfilter.h:294 [inline] ip6_output+0x5d3/0x720 net/ipv6/ip6_output.c:167 dst_output include/net/dst.h:433 [inline] NF_HOOK include/linux/netfilter.h:305 [inline] ip6_xmit+0x1f53/0x2650 net/ipv6/ip6_output.c:271 inet6_csk_xmit+0x3df/0x4f0 net/ipv6/inet6_connection_sock.c:135 __tcp_transmit_skb+0x4076/0x5b40 net/ipv4/tcp_output.c:1156 tcp_transmit_skb net/ipv4/tcp_output.c:1172 [inline] tcp_write_xmit+0x39a9/0xa730 net/ipv4/tcp_output.c:2397 __tcp_push_pending_frames+0x124/0x4e0 net/ipv4/tcp_output.c:2573 tcp_send_fin+0xd43/0x1540 net/ipv4/tcp_output.c:3118 tcp_close+0x16ba/0x1860 net/ipv4/tcp.c:2403 inet_release+0x1f7/0x270 net/ipv4/af_inet.c:427 inet6_release+0xaf/0x100 net/ipv6/af_inet6.c:470 __sock_release net/socket.c:601 [inline] sock_close+0x156/0x490 net/socket.c:1273 __fput+0x4c9/0xba0 fs/file_table.c:280 ____fput+0x37/0x40 fs/file_table.c:313 task_work_run+0x22e/0x2a0 kernel/task_work.c:113 tracehook_notify_resume include/linux/tracehook.h:185 [inline] exit_to_usermode_loop arch/x86/entry/common.c:168 [inline] prepare_exit_to_usermode+0x39d/0x4d0 arch/x86/entry/common.c:199 syscall_return_slowpath+0x90/0x5c0 arch/x86/entry/common.c:279 do_syscall_64+0xe2/0xf0 arch/x86/entry/common.c:305 entry_SYSCALL_64_after_hwframe+0x63/0xe7 RIP: 0033:0x401d50 Code: 01 f0 ff ff 0f 83 40 0d 00 00 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d dd 8d 2d 00 00 75 14 b8 03 00 00 00 0f 05 <48> 3d 01 f0 ff ff 0f 83 14 0d 00 00 c3 48 83 ec 08 e8 7a 02 00 00 RSP: 002b:00007fff1cf58cf8 EFLAGS: 00000246 ORIG_RAX: 0000000000000003 RAX: 0000000000000000 RBX: 0000000000000004 RCX: 0000000000401d50 RDX: 000000000000001c RSI: 0000000000000000 RDI: 0000000000000003 RBP: 00000000004a9050 R08: 0000000020000040 R09: 000000000000001c R10: 0000000020004004 R11: 0000000000000246 R12: 0000000000402ef0 R13: 0000000000402f80 R14: 0000000000000000 R15: 0000000000000000 Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:201 [inline] kmsan_internal_poison_shadow+0x53/0xa0 mm/kmsan/kmsan.c:160 kmsan_kmalloc+0xa4/0x130 mm/kmsan/kmsan_hooks.c:177 kmem_cache_alloc+0x534/0xb00 mm/slub.c:2781 reqsk_alloc include/net/request_sock.h:84 [inline] inet_reqsk_alloc+0xa8/0x600 net/ipv4/tcp_input.c:6384 cookie_v6_check+0xadb/0x29a0 net/ipv6/syncookies.c:173 tcp_v6_cookie_check net/ipv6/tcp_ipv6.c:1039 [inline] tcp_v6_do_rcv+0xf1c/0x1ce0 net/ipv6/tcp_ipv6.c:1344 tcp_v6_rcv+0x60b7/0x6a30 net/ipv6/tcp_ipv6.c:1554 ip6_protocol_deliver_rcu+0x1433/0x22f0 net/ipv6/ip6_input.c:397 ip6_input_finish net/ipv6/ip6_input.c:438 [inline] NF_HOOK include/linux/netfilter.h:305 [inline] ip6_input+0x2af/0x340 net/ipv6/ip6_input.c:447 dst_input include/net/dst.h:439 [inline] ip6_rcv_finish net/ipv6/ip6_input.c:76 [inline] NF_HOOK include/linux/netfilter.h:305 [inline] ipv6_rcv+0x683/0x710 net/ipv6/ip6_input.c:272 __netif_receive_skb_one_core net/core/dev.c:4981 [inline] __netif_receive_skb net/core/dev.c:5095 [inline] process_backlog+0x721/0x1410 net/core/dev.c:5906 napi_poll net/core/dev.c:6329 [inline] net_rx_action+0x738/0x1940 net/core/dev.c:6395 __do_softirq+0x4ad/0x858 kernel/softirq.c:293 do_softirq_own_stack+0x49/0x80 arch/x86/entry/entry_64.S:1052 do_softirq kernel/softirq.c:338 [inline] __local_bh_enable_ip+0x199/0x1e0 kernel/softirq.c:190 local_bh_enable+0x36/0x40 include/linux/bottom_half.h:32 rcu_read_unlock_bh include/linux/rcupdate.h:682 [inline] ip6_finish_output2+0x213f/0x2670 net/ipv6/ip6_output.c:117 ip6_finish_output+0xae4/0xbc0 net/ipv6/ip6_output.c:150 NF_HOOK_COND include/linux/netfilter.h:294 [inline] ip6_output+0x5d3/0x720 net/ipv6/ip6_output.c:167 dst_output include/net/dst.h:433 [inline] NF_HOOK include/linux/netfilter.h:305 [inline] ip6_xmit+0x1f53/0x2650 net/ipv6/ip6_output.c:271 inet6_csk_xmit+0x3df/0x4f0 net/ipv6/inet6_connection_sock.c:135 __tcp_transmit_skb+0x4076/0x5b40 net/ipv4/tcp_output.c:1156 tcp_transmit_skb net/ipv4/tcp_output.c:1172 [inline] tcp_write_xmit+0x39a9/0xa730 net/ipv4/tcp_output.c:2397 __tcp_push_pending_frames+0x124/0x4e0 net/ipv4/tcp_output.c:2573 tcp_send_fin+0xd43/0x1540 net/ipv4/tcp_output.c:3118 tcp_close+0x16ba/0x1860 net/ipv4/tcp.c:2403 inet_release+0x1f7/0x270 net/ipv4/af_inet.c:427 inet6_release+0xaf/0x100 net/ipv6/af_inet6.c:470 __sock_release net/socket.c:601 [inline] sock_close+0x156/0x490 net/socket.c:1273 __fput+0x4c9/0xba0 fs/file_table.c:280 ____fput+0x37/0x40 fs/file_table.c:313 task_work_run+0x22e/0x2a0 kernel/task_work.c:113 tracehook_notify_resume include/linux/tracehook.h:185 [inline] exit_to_usermode_loop arch/x86/entry/common.c:168 [inline] prepare_exit_to_usermode+0x39d/0x4d0 arch/x86/entry/common.c:199 syscall_return_slowpath+0x90/0x5c0 arch/x86/entry/common.c:279 do_syscall_64+0xe2/0xf0 arch/x86/entry/common.c:305 entry_SYSCALL_64_after_hwframe+0x63/0xe7 Fixes: 336c39a03151 ("tcp: undo init congestion window on false SYNACK timeout") Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Neal Cardwell Cc: Soheil Hassas Yeganeh Reported-by: syzbot Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/net/request_sock.h | 3 +++ net/ipv4/inet_connection_sock.c | 4 ---- net/ipv4/tcp_fastopen.c | 4 ---- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/include/net/request_sock.h b/include/net/request_sock.h index b3ea21f2732e..fd178d58fa84 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -97,6 +97,9 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, sk_node_init(&req_to_sk(req)->sk_node); sk_tx_queue_clear(req_to_sk(req)); req->saved_syn = NULL; + req->num_timeout = 0; + req->num_retrans = 0; + req->sk = NULL; refcount_set(&req->rsk_refcnt, 0); return req; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 13ec7c3a9c49..7fd6db3fe366 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -752,10 +752,6 @@ drop: static void reqsk_queue_hash_req(struct request_sock *req, unsigned long timeout) { - req->num_retrans = 0; - req->num_timeout = 0; - req->sk = NULL; - timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED); mod_timer(&req->rsk_timer, jiffies + timeout); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 018a48477355..f5a45e1e1182 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -221,10 +221,6 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, struct sock *child; bool own_req; - req->num_retrans = 0; - req->num_timeout = 0; - req->sk = NULL; - child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, NULL, &own_req); if (!child) From b6653b3629e5b88202be3c9abc44713973f5c4b4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Jun 2019 06:09:55 -0700 Subject: [PATCH 24/24] tcp: refine memory limit test in tcp_fragment() tcp_fragment() might be called for skbs in the write queue. Memory limits might have been exceeded because tcp_sendmsg() only checks limits at full skb (64KB) boundaries. Therefore, we need to make sure tcp_fragment() wont punish applications that might have setup very low SO_SNDBUF values. Fixes: f070ef2ac667 ("tcp: tcp_fragment() should apply sane memory limits") Signed-off-by: Eric Dumazet Reported-by: Christoph Paasch Tested-by: Christoph Paasch Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 00c01a01b547..0ebc33d1c9e5 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1296,7 +1296,8 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, if (nsize < 0) nsize = 0; - if (unlikely((sk->sk_wmem_queued >> 1) > sk->sk_sndbuf)) { + if (unlikely((sk->sk_wmem_queued >> 1) > sk->sk_sndbuf && + tcp_queue != TCP_FRAG_IN_WRITE_QUEUE)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG); return -ENOMEM; }