From dfa73c17d55b921e1d4e154976de35317e43a93a Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Tue, 27 Jun 2023 11:31:38 +0800 Subject: [PATCH 01/49] net: xfrm: Fix xfrm_address_filter OOB read We found below OOB crash: [ 44.211730] ================================================================== [ 44.212045] BUG: KASAN: slab-out-of-bounds in memcmp+0x8b/0xb0 [ 44.212045] Read of size 8 at addr ffff88800870f320 by task poc.xfrm/97 [ 44.212045] [ 44.212045] CPU: 0 PID: 97 Comm: poc.xfrm Not tainted 6.4.0-rc7-00072-gdad9774deaf1-dirty #4 [ 44.212045] Call Trace: [ 44.212045] [ 44.212045] dump_stack_lvl+0x37/0x50 [ 44.212045] print_report+0xcc/0x620 [ 44.212045] ? __virt_addr_valid+0xf3/0x170 [ 44.212045] ? memcmp+0x8b/0xb0 [ 44.212045] kasan_report+0xb2/0xe0 [ 44.212045] ? memcmp+0x8b/0xb0 [ 44.212045] kasan_check_range+0x39/0x1c0 [ 44.212045] memcmp+0x8b/0xb0 [ 44.212045] xfrm_state_walk+0x21c/0x420 [ 44.212045] ? __pfx_dump_one_state+0x10/0x10 [ 44.212045] xfrm_dump_sa+0x1e2/0x290 [ 44.212045] ? __pfx_xfrm_dump_sa+0x10/0x10 [ 44.212045] ? __kernel_text_address+0xd/0x40 [ 44.212045] ? kasan_unpoison+0x27/0x60 [ 44.212045] ? mutex_lock+0x60/0xe0 [ 44.212045] ? __pfx_mutex_lock+0x10/0x10 [ 44.212045] ? kasan_save_stack+0x22/0x50 [ 44.212045] netlink_dump+0x322/0x6c0 [ 44.212045] ? __pfx_netlink_dump+0x10/0x10 [ 44.212045] ? mutex_unlock+0x7f/0xd0 [ 44.212045] ? __pfx_mutex_unlock+0x10/0x10 [ 44.212045] __netlink_dump_start+0x353/0x430 [ 44.212045] xfrm_user_rcv_msg+0x3a4/0x410 [ 44.212045] ? __pfx__raw_spin_lock_irqsave+0x10/0x10 [ 44.212045] ? __pfx_xfrm_user_rcv_msg+0x10/0x10 [ 44.212045] ? __pfx_xfrm_dump_sa+0x10/0x10 [ 44.212045] ? __pfx_xfrm_dump_sa_done+0x10/0x10 [ 44.212045] ? __stack_depot_save+0x382/0x4e0 [ 44.212045] ? filter_irq_stacks+0x1c/0x70 [ 44.212045] ? kasan_save_stack+0x32/0x50 [ 44.212045] ? kasan_save_stack+0x22/0x50 [ 44.212045] ? kasan_set_track+0x25/0x30 [ 44.212045] ? __kasan_slab_alloc+0x59/0x70 [ 44.212045] ? kmem_cache_alloc_node+0xf7/0x260 [ 44.212045] ? kmalloc_reserve+0xab/0x120 [ 44.212045] ? __alloc_skb+0xcf/0x210 [ 44.212045] ? netlink_sendmsg+0x509/0x700 [ 44.212045] ? sock_sendmsg+0xde/0xe0 [ 44.212045] ? __sys_sendto+0x18d/0x230 [ 44.212045] ? __x64_sys_sendto+0x71/0x90 [ 44.212045] ? do_syscall_64+0x3f/0x90 [ 44.212045] ? entry_SYSCALL_64_after_hwframe+0x72/0xdc [ 44.212045] ? netlink_sendmsg+0x509/0x700 [ 44.212045] ? sock_sendmsg+0xde/0xe0 [ 44.212045] ? __sys_sendto+0x18d/0x230 [ 44.212045] ? __x64_sys_sendto+0x71/0x90 [ 44.212045] ? do_syscall_64+0x3f/0x90 [ 44.212045] ? entry_SYSCALL_64_after_hwframe+0x72/0xdc [ 44.212045] ? kasan_save_stack+0x22/0x50 [ 44.212045] ? kasan_set_track+0x25/0x30 [ 44.212045] ? kasan_save_free_info+0x2e/0x50 [ 44.212045] ? __kasan_slab_free+0x10a/0x190 [ 44.212045] ? kmem_cache_free+0x9c/0x340 [ 44.212045] ? netlink_recvmsg+0x23c/0x660 [ 44.212045] ? sock_recvmsg+0xeb/0xf0 [ 44.212045] ? __sys_recvfrom+0x13c/0x1f0 [ 44.212045] ? __x64_sys_recvfrom+0x71/0x90 [ 44.212045] ? do_syscall_64+0x3f/0x90 [ 44.212045] ? entry_SYSCALL_64_after_hwframe+0x72/0xdc [ 44.212045] ? copyout+0x3e/0x50 [ 44.212045] netlink_rcv_skb+0xd6/0x210 [ 44.212045] ? __pfx_xfrm_user_rcv_msg+0x10/0x10 [ 44.212045] ? __pfx_netlink_rcv_skb+0x10/0x10 [ 44.212045] ? __pfx_sock_has_perm+0x10/0x10 [ 44.212045] ? mutex_lock+0x8d/0xe0 [ 44.212045] ? __pfx_mutex_lock+0x10/0x10 [ 44.212045] xfrm_netlink_rcv+0x44/0x50 [ 44.212045] netlink_unicast+0x36f/0x4c0 [ 44.212045] ? __pfx_netlink_unicast+0x10/0x10 [ 44.212045] ? netlink_recvmsg+0x500/0x660 [ 44.212045] netlink_sendmsg+0x3b7/0x700 [ 44.212045] ? __pfx_netlink_sendmsg+0x10/0x10 [ 44.212045] ? __pfx_netlink_sendmsg+0x10/0x10 [ 44.212045] sock_sendmsg+0xde/0xe0 [ 44.212045] __sys_sendto+0x18d/0x230 [ 44.212045] ? __pfx___sys_sendto+0x10/0x10 [ 44.212045] ? rcu_core+0x44a/0xe10 [ 44.212045] ? __rseq_handle_notify_resume+0x45b/0x740 [ 44.212045] ? _raw_spin_lock_irq+0x81/0xe0 [ 44.212045] ? __pfx___rseq_handle_notify_resume+0x10/0x10 [ 44.212045] ? __pfx_restore_fpregs_from_fpstate+0x10/0x10 [ 44.212045] ? __pfx_blkcg_maybe_throttle_current+0x10/0x10 [ 44.212045] ? __pfx_task_work_run+0x10/0x10 [ 44.212045] __x64_sys_sendto+0x71/0x90 [ 44.212045] do_syscall_64+0x3f/0x90 [ 44.212045] entry_SYSCALL_64_after_hwframe+0x72/0xdc [ 44.212045] RIP: 0033:0x44b7da [ 44.212045] RSP: 002b:00007ffdc8838548 EFLAGS: 00000246 ORIG_RAX: 000000000000002c [ 44.212045] RAX: ffffffffffffffda RBX: 00007ffdc8839978 RCX: 000000000044b7da [ 44.212045] RDX: 0000000000000038 RSI: 00007ffdc8838770 RDI: 0000000000000003 [ 44.212045] RBP: 00007ffdc88385b0 R08: 00007ffdc883858c R09: 000000000000000c [ 44.212045] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000001 [ 44.212045] R13: 00007ffdc8839968 R14: 00000000004c37d0 R15: 0000000000000001 [ 44.212045] [ 44.212045] [ 44.212045] Allocated by task 97: [ 44.212045] kasan_save_stack+0x22/0x50 [ 44.212045] kasan_set_track+0x25/0x30 [ 44.212045] __kasan_kmalloc+0x7f/0x90 [ 44.212045] __kmalloc_node_track_caller+0x5b/0x140 [ 44.212045] kmemdup+0x21/0x50 [ 44.212045] xfrm_dump_sa+0x17d/0x290 [ 44.212045] netlink_dump+0x322/0x6c0 [ 44.212045] __netlink_dump_start+0x353/0x430 [ 44.212045] xfrm_user_rcv_msg+0x3a4/0x410 [ 44.212045] netlink_rcv_skb+0xd6/0x210 [ 44.212045] xfrm_netlink_rcv+0x44/0x50 [ 44.212045] netlink_unicast+0x36f/0x4c0 [ 44.212045] netlink_sendmsg+0x3b7/0x700 [ 44.212045] sock_sendmsg+0xde/0xe0 [ 44.212045] __sys_sendto+0x18d/0x230 [ 44.212045] __x64_sys_sendto+0x71/0x90 [ 44.212045] do_syscall_64+0x3f/0x90 [ 44.212045] entry_SYSCALL_64_after_hwframe+0x72/0xdc [ 44.212045] [ 44.212045] The buggy address belongs to the object at ffff88800870f300 [ 44.212045] which belongs to the cache kmalloc-64 of size 64 [ 44.212045] The buggy address is located 32 bytes inside of [ 44.212045] allocated 36-byte region [ffff88800870f300, ffff88800870f324) [ 44.212045] [ 44.212045] The buggy address belongs to the physical page: [ 44.212045] page:00000000e4de16ee refcount:1 mapcount:0 mapping:000000000 ... [ 44.212045] flags: 0x100000000000200(slab|node=0|zone=1) [ 44.212045] page_type: 0xffffffff() [ 44.212045] raw: 0100000000000200 ffff888004c41640 dead000000000122 0000000000000000 [ 44.212045] raw: 0000000000000000 0000000080200020 00000001ffffffff 0000000000000000 [ 44.212045] page dumped because: kasan: bad access detected [ 44.212045] [ 44.212045] Memory state around the buggy address: [ 44.212045] ffff88800870f200: fa fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [ 44.212045] ffff88800870f280: 00 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc [ 44.212045] >ffff88800870f300: 00 00 00 00 04 fc fc fc fc fc fc fc fc fc fc fc [ 44.212045] ^ [ 44.212045] ffff88800870f380: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 44.212045] ffff88800870f400: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 44.212045] ================================================================== By investigating the code, we find the root cause of this OOB is the lack of checks in xfrm_dump_sa(). The buggy code allows a malicious user to pass arbitrary value of filter->splen/dplen. Hence, with crafted xfrm states, the attacker can achieve 8 bytes heap OOB read, which causes info leak. if (attrs[XFRMA_ADDRESS_FILTER]) { filter = kmemdup(nla_data(attrs[XFRMA_ADDRESS_FILTER]), sizeof(*filter), GFP_KERNEL); if (filter == NULL) return -ENOMEM; // NO MORE CHECKS HERE !!! } This patch fixes the OOB by adding necessary boundary checks, just like the code in pfkey_dump() function. Fixes: d3623099d350 ("ipsec: add support of limited SA dump") Signed-off-by: Lin Ma Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index c34a2a06ca94..7c91deadc36e 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1267,6 +1267,15 @@ static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb) sizeof(*filter), GFP_KERNEL); if (filter == NULL) return -ENOMEM; + + /* see addr_match(), (prefix length >> 5) << 2 + * will be used to compare xfrm_address_t + */ + if (filter->splen > (sizeof(xfrm_address_t) << 3) || + filter->dplen > (sizeof(xfrm_address_t) << 3)) { + kfree(filter); + return -EINVAL; + } } if (attrs[XFRMA_PROTO]) From 75065a8929069bc93181848818e23f147a73f83a Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Tue, 27 Jun 2023 11:39:54 +0800 Subject: [PATCH 02/49] net: af_key: fix sadb_x_filter validation When running xfrm_state_walk_init(), the xfrm_address_filter being used is okay to have a splen/dplen that equals to sizeof(xfrm_address_t)<<3. This commit replaces >= to > to make sure the boundary checking is correct. Fixes: 37bd22420f85 ("af_key: pfkey_dump needs parameter validation") Signed-off-by: Lin Ma Signed-off-by: Steffen Klassert --- net/key/af_key.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/key/af_key.c b/net/key/af_key.c index ede3c6a60353..b4ea4cf9fad4 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1848,9 +1848,9 @@ static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_ms if (ext_hdrs[SADB_X_EXT_FILTER - 1]) { struct sadb_x_filter *xfilter = ext_hdrs[SADB_X_EXT_FILTER - 1]; - if ((xfilter->sadb_x_filter_splen >= + if ((xfilter->sadb_x_filter_splen > (sizeof(xfrm_address_t) << 3)) || - (xfilter->sadb_x_filter_dplen >= + (xfilter->sadb_x_filter_dplen > (sizeof(xfrm_address_t) << 3))) { mutex_unlock(&pfk->dump_lock); return -EINVAL; From d1e0e61d617ba17aa516db707aa871387566bbf7 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Fri, 30 Jun 2023 16:19:11 +0800 Subject: [PATCH 03/49] net: xfrm: Amend XFRMA_SEC_CTX nla_policy structure According to all consumers code of attrs[XFRMA_SEC_CTX], like * verify_sec_ctx_len(), convert to xfrm_user_sec_ctx* * xfrm_state_construct(), call security_xfrm_state_alloc whose prototype is int security_xfrm_state_alloc(.., struct xfrm_user_sec_ctx *sec_ctx); * copy_from_user_sec_ctx(), convert to xfrm_user_sec_ctx * ... It seems that the expected parsing result for XFRMA_SEC_CTX should be structure xfrm_user_sec_ctx, and the current xfrm_sec_ctx is confusing and misleading (Luckily, they happen to have same size 8 bytes). This commit amend the policy structure to xfrm_user_sec_ctx to avoid ambiguity. Fixes: cf5cb79f6946 ("[XFRM] netlink: Establish an attribute policy") Signed-off-by: Lin Ma Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_compat.c | 2 +- net/xfrm/xfrm_user.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index 8cbf45a8bcdc..655fe4ff8621 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -108,7 +108,7 @@ static const struct nla_policy compat_policy[XFRMA_MAX+1] = { [XFRMA_ALG_COMP] = { .len = sizeof(struct xfrm_algo) }, [XFRMA_ENCAP] = { .len = sizeof(struct xfrm_encap_tmpl) }, [XFRMA_TMPL] = { .len = sizeof(struct xfrm_user_tmpl) }, - [XFRMA_SEC_CTX] = { .len = sizeof(struct xfrm_sec_ctx) }, + [XFRMA_SEC_CTX] = { .len = sizeof(struct xfrm_user_sec_ctx) }, [XFRMA_LTIME_VAL] = { .len = sizeof(struct xfrm_lifetime_cur) }, [XFRMA_REPLAY_VAL] = { .len = sizeof(struct xfrm_replay_state) }, [XFRMA_REPLAY_THRESH] = { .type = NLA_U32 }, diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 7c91deadc36e..fdc0c17122b6 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -3024,7 +3024,7 @@ const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_ALG_COMP] = { .len = sizeof(struct xfrm_algo) }, [XFRMA_ENCAP] = { .len = sizeof(struct xfrm_encap_tmpl) }, [XFRMA_TMPL] = { .len = sizeof(struct xfrm_user_tmpl) }, - [XFRMA_SEC_CTX] = { .len = sizeof(struct xfrm_sec_ctx) }, + [XFRMA_SEC_CTX] = { .len = sizeof(struct xfrm_user_sec_ctx) }, [XFRMA_LTIME_VAL] = { .len = sizeof(struct xfrm_lifetime_cur) }, [XFRMA_REPLAY_VAL] = { .len = sizeof(struct xfrm_replay_state) }, [XFRMA_REPLAY_THRESH] = { .type = NLA_U32 }, From 57010b8ece2821a1fdfdba2197d14a022f3769db Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 4 Jul 2023 08:53:49 +0800 Subject: [PATCH 04/49] xfrm: Silence warnings triggerable by bad packets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the elimination of inner modes, a couple of warnings that were previously unreachable can now be triggered by malformed inbound packets. Fix this by: 1. Moving the setting of skb->protocol into the decap functions. 2. Returning -EINVAL when unexpected protocol is seen. Reported-by: Maciej Żenczykowski Fixes: 5f24f41e8ea6 ("xfrm: Remove inner/outer modes from input path") Signed-off-by: Herbert Xu Reviewed-by: Maciej Żenczykowski Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_input.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 815b38080401..d5ee96789d4b 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -180,6 +180,8 @@ static int xfrm4_remove_beet_encap(struct xfrm_state *x, struct sk_buff *skb) int optlen = 0; int err = -EINVAL; + skb->protocol = htons(ETH_P_IP); + if (unlikely(XFRM_MODE_SKB_CB(skb)->protocol == IPPROTO_BEETPH)) { struct ip_beet_phdr *ph; int phlen; @@ -232,6 +234,8 @@ static int xfrm4_remove_tunnel_encap(struct xfrm_state *x, struct sk_buff *skb) { int err = -EINVAL; + skb->protocol = htons(ETH_P_IP); + if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto out; @@ -267,6 +271,8 @@ static int xfrm6_remove_tunnel_encap(struct xfrm_state *x, struct sk_buff *skb) { int err = -EINVAL; + skb->protocol = htons(ETH_P_IPV6); + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto out; @@ -296,6 +302,8 @@ static int xfrm6_remove_beet_encap(struct xfrm_state *x, struct sk_buff *skb) int size = sizeof(struct ipv6hdr); int err; + skb->protocol = htons(ETH_P_IPV6); + err = skb_cow_head(skb, size + skb->mac_len); if (err) goto out; @@ -346,6 +354,7 @@ xfrm_inner_mode_encap_remove(struct xfrm_state *x, return xfrm6_remove_tunnel_encap(x, skb); break; } + return -EINVAL; } WARN_ON_ONCE(1); @@ -366,19 +375,6 @@ static int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb) return -EAFNOSUPPORT; } - switch (XFRM_MODE_SKB_CB(skb)->protocol) { - case IPPROTO_IPIP: - case IPPROTO_BEETPH: - skb->protocol = htons(ETH_P_IP); - break; - case IPPROTO_IPV6: - skb->protocol = htons(ETH_P_IPV6); - break; - default: - WARN_ON_ONCE(1); - break; - } - return xfrm_inner_mode_encap_remove(x, skb); } From 53223f2ed1ef5c90dad814daaaefea4e68a933c8 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Mon, 10 Jul 2023 17:40:51 +0800 Subject: [PATCH 05/49] xfrm: fix slab-use-after-free in decode_session6 When the xfrm device is set to the qdisc of the sfb type, the cb field of the sent skb may be modified during enqueuing. Then, slab-use-after-free may occur when the xfrm device sends IPv6 packets. The stack information is as follows: BUG: KASAN: slab-use-after-free in decode_session6+0x103f/0x1890 Read of size 1 at addr ffff8881111458ef by task swapper/3/0 CPU: 3 PID: 0 Comm: swapper/3 Not tainted 6.4.0-next-20230707 #409 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1.fc33 04/01/2014 Call Trace: dump_stack_lvl+0xd9/0x150 print_address_description.constprop.0+0x2c/0x3c0 kasan_report+0x11d/0x130 decode_session6+0x103f/0x1890 __xfrm_decode_session+0x54/0xb0 xfrmi_xmit+0x173/0x1ca0 dev_hard_start_xmit+0x187/0x700 sch_direct_xmit+0x1a3/0xc30 __qdisc_run+0x510/0x17a0 __dev_queue_xmit+0x2215/0x3b10 neigh_connected_output+0x3c2/0x550 ip6_finish_output2+0x55a/0x1550 ip6_finish_output+0x6b9/0x1270 ip6_output+0x1f1/0x540 ndisc_send_skb+0xa63/0x1890 ndisc_send_rs+0x132/0x6f0 addrconf_rs_timer+0x3f1/0x870 call_timer_fn+0x1a0/0x580 expire_timers+0x29b/0x4b0 run_timer_softirq+0x326/0x910 __do_softirq+0x1d4/0x905 irq_exit_rcu+0xb7/0x120 sysvec_apic_timer_interrupt+0x97/0xc0 asm_sysvec_apic_timer_interrupt+0x1a/0x20 RIP: 0010:intel_idle_hlt+0x23/0x30 Code: 1f 84 00 00 00 00 00 f3 0f 1e fa 41 54 41 89 d4 0f 1f 44 00 00 66 90 0f 1f 44 00 00 0f 00 2d c4 9f ab 00 0f 1f 44 00 00 fb f4 44 89 e0 41 5c c3 66 0f 1f 44 00 00 f3 0f 1e fa 41 54 41 89 d4 RSP: 0018:ffffc90000197d78 EFLAGS: 00000246 RAX: 00000000000a83c3 RBX: ffffe8ffffd09c50 RCX: ffffffff8a22d8e5 RDX: 0000000000000001 RSI: ffffffff8d3f8080 RDI: ffffe8ffffd09c50 RBP: ffffffff8d3f8080 R08: 0000000000000001 R09: ffffed1026ba6d9d R10: ffff888135d36ceb R11: 0000000000000001 R12: 0000000000000001 R13: ffffffff8d3f8100 R14: 0000000000000001 R15: 0000000000000000 cpuidle_enter_state+0xd3/0x6f0 cpuidle_enter+0x4e/0xa0 do_idle+0x2fe/0x3c0 cpu_startup_entry+0x18/0x20 start_secondary+0x200/0x290 secondary_startup_64_no_verify+0x167/0x16b Allocated by task 939: kasan_save_stack+0x22/0x40 kasan_set_track+0x25/0x30 __kasan_slab_alloc+0x7f/0x90 kmem_cache_alloc_node+0x1cd/0x410 kmalloc_reserve+0x165/0x270 __alloc_skb+0x129/0x330 inet6_ifa_notify+0x118/0x230 __ipv6_ifa_notify+0x177/0xbe0 addrconf_dad_completed+0x133/0xe00 addrconf_dad_work+0x764/0x1390 process_one_work+0xa32/0x16f0 worker_thread+0x67d/0x10c0 kthread+0x344/0x440 ret_from_fork+0x1f/0x30 The buggy address belongs to the object at ffff888111145800 which belongs to the cache skbuff_small_head of size 640 The buggy address is located 239 bytes inside of freed 640-byte region [ffff888111145800, ffff888111145a80) As commit f855691975bb ("xfrm6: Fix the nexthdr offset in _decode_session6.") showed, xfrm_decode_session was originally intended only for the receive path. IP6CB(skb)->nhoff is not set during transmission. Therefore, set the cb field in the skb to 0 before sending packets. Fixes: f855691975bb ("xfrm6: Fix the nexthdr offset in _decode_session6.") Signed-off-by: Zhengchao Shao Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_interface_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index a3319965470a..b86474084690 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -537,8 +537,8 @@ static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) switch (skb->protocol) { case htons(ETH_P_IPV6): - xfrm_decode_session(skb, &fl, AF_INET6); memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + xfrm_decode_session(skb, &fl, AF_INET6); if (!dst) { fl.u.ip6.flowi6_oif = dev->ifindex; fl.u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; @@ -552,8 +552,8 @@ static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) } break; case htons(ETH_P_IP): - xfrm_decode_session(skb, &fl, AF_INET); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + xfrm_decode_session(skb, &fl, AF_INET); if (!dst) { struct rtable *rt; From 9fd41f1ba638938c9a1195d09bc6fa3be2712f25 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Mon, 10 Jul 2023 17:40:52 +0800 Subject: [PATCH 06/49] ip6_vti: fix slab-use-after-free in decode_session6 When ipv6_vti device is set to the qdisc of the sfb type, the cb field of the sent skb may be modified during enqueuing. Then, slab-use-after-free may occur when ipv6_vti device sends IPv6 packets. The stack information is as follows: BUG: KASAN: slab-use-after-free in decode_session6+0x103f/0x1890 Read of size 1 at addr ffff88802e08edc2 by task swapper/0/0 CPU: 0 PID: 0 Comm: swapper/0 Not tainted 6.4.0-next-20230707-00001-g84e2cad7f979 #410 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1.fc33 04/01/2014 Call Trace: dump_stack_lvl+0xd9/0x150 print_address_description.constprop.0+0x2c/0x3c0 kasan_report+0x11d/0x130 decode_session6+0x103f/0x1890 __xfrm_decode_session+0x54/0xb0 vti6_tnl_xmit+0x3e6/0x1ee0 dev_hard_start_xmit+0x187/0x700 sch_direct_xmit+0x1a3/0xc30 __qdisc_run+0x510/0x17a0 __dev_queue_xmit+0x2215/0x3b10 neigh_connected_output+0x3c2/0x550 ip6_finish_output2+0x55a/0x1550 ip6_finish_output+0x6b9/0x1270 ip6_output+0x1f1/0x540 ndisc_send_skb+0xa63/0x1890 ndisc_send_rs+0x132/0x6f0 addrconf_rs_timer+0x3f1/0x870 call_timer_fn+0x1a0/0x580 expire_timers+0x29b/0x4b0 run_timer_softirq+0x326/0x910 __do_softirq+0x1d4/0x905 irq_exit_rcu+0xb7/0x120 sysvec_apic_timer_interrupt+0x97/0xc0 Allocated by task 9176: kasan_save_stack+0x22/0x40 kasan_set_track+0x25/0x30 __kasan_slab_alloc+0x7f/0x90 kmem_cache_alloc_node+0x1cd/0x410 kmalloc_reserve+0x165/0x270 __alloc_skb+0x129/0x330 netlink_sendmsg+0x9b1/0xe30 sock_sendmsg+0xde/0x190 ____sys_sendmsg+0x739/0x920 ___sys_sendmsg+0x110/0x1b0 __sys_sendmsg+0xf7/0x1c0 do_syscall_64+0x39/0xb0 entry_SYSCALL_64_after_hwframe+0x63/0xcd Freed by task 9176: kasan_save_stack+0x22/0x40 kasan_set_track+0x25/0x30 kasan_save_free_info+0x2b/0x40 ____kasan_slab_free+0x160/0x1c0 slab_free_freelist_hook+0x11b/0x220 kmem_cache_free+0xf0/0x490 skb_free_head+0x17f/0x1b0 skb_release_data+0x59c/0x850 consume_skb+0xd2/0x170 netlink_unicast+0x54f/0x7f0 netlink_sendmsg+0x926/0xe30 sock_sendmsg+0xde/0x190 ____sys_sendmsg+0x739/0x920 ___sys_sendmsg+0x110/0x1b0 __sys_sendmsg+0xf7/0x1c0 do_syscall_64+0x39/0xb0 entry_SYSCALL_64_after_hwframe+0x63/0xcd The buggy address belongs to the object at ffff88802e08ed00 which belongs to the cache skbuff_small_head of size 640 The buggy address is located 194 bytes inside of freed 640-byte region [ffff88802e08ed00, ffff88802e08ef80) As commit f855691975bb ("xfrm6: Fix the nexthdr offset in _decode_session6.") showed, xfrm_decode_session was originally intended only for the receive path. IP6CB(skb)->nhoff is not set during transmission. Therefore, set the cb field in the skb to 0 before sending packets. Fixes: f855691975bb ("xfrm6: Fix the nexthdr offset in _decode_session6.") Signed-off-by: Zhengchao Shao Signed-off-by: Steffen Klassert --- net/ipv6/ip6_vti.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 10b222865d46..73c85d4e0e9c 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -568,12 +568,12 @@ vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) vti6_addr_conflict(t, ipv6_hdr(skb))) goto tx_err; - xfrm_decode_session(skb, &fl, AF_INET6); memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + xfrm_decode_session(skb, &fl, AF_INET6); break; case htons(ETH_P_IP): - xfrm_decode_session(skb, &fl, AF_INET); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + xfrm_decode_session(skb, &fl, AF_INET); break; default: goto tx_err; From 6018a266279b1a75143c7c0804dd08a5fc4c3e0b Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Mon, 10 Jul 2023 17:40:53 +0800 Subject: [PATCH 07/49] ip_vti: fix potential slab-use-after-free in decode_session6 When ip_vti device is set to the qdisc of the sfb type, the cb field of the sent skb may be modified during enqueuing. Then, slab-use-after-free may occur when ip_vti device sends IPv6 packets. As commit f855691975bb ("xfrm6: Fix the nexthdr offset in _decode_session6.") showed, xfrm_decode_session was originally intended only for the receive path. IP6CB(skb)->nhoff is not set during transmission. Therefore, set the cb field in the skb to 0 before sending packets. Fixes: f855691975bb ("xfrm6: Fix the nexthdr offset in _decode_session6.") Signed-off-by: Zhengchao Shao Signed-off-by: Steffen Klassert --- net/ipv4/ip_vti.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 53bfd8af6920..d1e7d0ceb7ed 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -287,12 +287,12 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) switch (skb->protocol) { case htons(ETH_P_IP): - xfrm_decode_session(skb, &fl, AF_INET); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + xfrm_decode_session(skb, &fl, AF_INET); break; case htons(ETH_P_IPV6): - xfrm_decode_session(skb, &fl, AF_INET6); memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + xfrm_decode_session(skb, &fl, AF_INET6); break; default: goto tx_err; From 00374d9b6d9f932802b55181be9831aa948e5b7c Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Fri, 21 Jul 2023 22:51:03 +0800 Subject: [PATCH 08/49] xfrm: add NULL check in xfrm_update_ae_params Normally, x->replay_esn and x->preplay_esn should be allocated at xfrm_alloc_replay_state_esn(...) in xfrm_state_construct(...), hence the xfrm_update_ae_params(...) is okay to update them. However, the current implementation of xfrm_new_ae(...) allows a malicious user to directly dereference a NULL pointer and crash the kernel like below. BUG: kernel NULL pointer dereference, address: 0000000000000000 PGD 8253067 P4D 8253067 PUD 8e0e067 PMD 0 Oops: 0002 [#1] PREEMPT SMP KASAN NOPTI CPU: 0 PID: 98 Comm: poc.npd Not tainted 6.4.0-rc7-00072-gdad9774deaf1 #8 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.o4 RIP: 0010:memcpy_orig+0xad/0x140 Code: e8 4c 89 5f e0 48 8d 7f e0 73 d2 83 c2 20 48 29 d6 48 29 d7 83 fa 10 72 34 4c 8b 06 4c 8b 4e 08 c RSP: 0018:ffff888008f57658 EFLAGS: 00000202 RAX: 0000000000000000 RBX: ffff888008bd0000 RCX: ffffffff8238e571 RDX: 0000000000000018 RSI: ffff888007f64844 RDI: 0000000000000000 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffff888008f57818 R13: ffff888007f64aa4 R14: 0000000000000000 R15: 0000000000000000 FS: 00000000014013c0(0000) GS:ffff88806d600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000000054d8000 CR4: 00000000000006f0 Call Trace: ? __die+0x1f/0x70 ? page_fault_oops+0x1e8/0x500 ? __pfx_is_prefetch.constprop.0+0x10/0x10 ? __pfx_page_fault_oops+0x10/0x10 ? _raw_spin_unlock_irqrestore+0x11/0x40 ? fixup_exception+0x36/0x460 ? _raw_spin_unlock_irqrestore+0x11/0x40 ? exc_page_fault+0x5e/0xc0 ? asm_exc_page_fault+0x26/0x30 ? xfrm_update_ae_params+0xd1/0x260 ? memcpy_orig+0xad/0x140 ? __pfx__raw_spin_lock_bh+0x10/0x10 xfrm_update_ae_params+0xe7/0x260 xfrm_new_ae+0x298/0x4e0 ? __pfx_xfrm_new_ae+0x10/0x10 ? __pfx_xfrm_new_ae+0x10/0x10 xfrm_user_rcv_msg+0x25a/0x410 ? __pfx_xfrm_user_rcv_msg+0x10/0x10 ? __alloc_skb+0xcf/0x210 ? stack_trace_save+0x90/0xd0 ? filter_irq_stacks+0x1c/0x70 ? __stack_depot_save+0x39/0x4e0 ? __kasan_slab_free+0x10a/0x190 ? kmem_cache_free+0x9c/0x340 ? netlink_recvmsg+0x23c/0x660 ? sock_recvmsg+0xeb/0xf0 ? __sys_recvfrom+0x13c/0x1f0 ? __x64_sys_recvfrom+0x71/0x90 ? do_syscall_64+0x3f/0x90 ? entry_SYSCALL_64_after_hwframe+0x72/0xdc ? copyout+0x3e/0x50 netlink_rcv_skb+0xd6/0x210 ? __pfx_xfrm_user_rcv_msg+0x10/0x10 ? __pfx_netlink_rcv_skb+0x10/0x10 ? __pfx_sock_has_perm+0x10/0x10 ? mutex_lock+0x8d/0xe0 ? __pfx_mutex_lock+0x10/0x10 xfrm_netlink_rcv+0x44/0x50 netlink_unicast+0x36f/0x4c0 ? __pfx_netlink_unicast+0x10/0x10 ? netlink_recvmsg+0x500/0x660 netlink_sendmsg+0x3b7/0x700 This Null-ptr-deref bug is assigned CVE-2023-3772. And this commit adds additional NULL check in xfrm_update_ae_params to fix the NPD. Fixes: d8647b79c3b7 ("xfrm: Add user interface for esn and big anti-replay windows") Signed-off-by: Lin Ma Reviewed-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index fdc0c17122b6..8f74dde4a55f 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -628,7 +628,7 @@ static void xfrm_update_ae_params(struct xfrm_state *x, struct nlattr **attrs, struct nlattr *rt = attrs[XFRMA_REPLAY_THRESH]; struct nlattr *mt = attrs[XFRMA_MTIMER_THRESH]; - if (re) { + if (re && x->replay_esn && x->preplay_esn) { struct xfrm_replay_state_esn *replay_esn; replay_esn = nla_data(re); memcpy(x->replay_esn, replay_esn, From 5e2424708da7207087934c5c75211e8584d553a0 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Sun, 23 Jul 2023 15:41:10 +0800 Subject: [PATCH 09/49] xfrm: add forgotten nla_policy for XFRMA_MTIMER_THRESH The previous commit 4e484b3e969b ("xfrm: rate limit SA mapping change message to user space") added one additional attribute named XFRMA_MTIMER_THRESH and described its type at compat_policy (net/xfrm/xfrm_compat.c). However, the author forgot to also describe the nla_policy at xfrma_policy (net/xfrm/xfrm_user.c). Hence, this suppose NLA_U32 (4 bytes) value can be faked as empty (0 bytes) by a malicious user, which leads to 4 bytes overflow read and heap information leak when parsing nlattrs. To exploit this, one malicious user can spray the SLUB objects and then leverage this 4 bytes OOB read to leak the heap data into x->mapping_maxage (see xfrm_update_ae_params(...)), and leak it to userspace via copy_to_user_state_extra(...). The above bug is assigned CVE-2023-3773. To fix it, this commit just completes the nla_policy description for XFRMA_MTIMER_THRESH, which enforces the length check and avoids such OOB read. Fixes: 4e484b3e969b ("xfrm: rate limit SA mapping change message to user space") Signed-off-by: Lin Ma Reviewed-by: Simon Horman Reviewed-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 8f74dde4a55f..f06d6deb58dd 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -3044,6 +3044,7 @@ const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_SET_MARK] = { .type = NLA_U32 }, [XFRMA_SET_MARK_MASK] = { .type = NLA_U32 }, [XFRMA_IF_ID] = { .type = NLA_U32 }, + [XFRMA_MTIMER_THRESH] = { .type = NLA_U32 }, }; EXPORT_SYMBOL_GPL(xfrma_policy); From 982c3aca8bac8ae38acdc940e4f1ecec3bffc623 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 31 Jul 2023 14:38:26 +0300 Subject: [PATCH 10/49] xfrm: delete offloaded policy The policy memory was released but not HW driver data. Add call to xfrm_dev_policy_delete(), so drivers will have a chance to release their resources. Fixes: 919e43fad516 ("xfrm: add an interface to offload policy") Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index f06d6deb58dd..ad01997c3aa9 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2345,6 +2345,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, NETLINK_CB(skb).portid); } } else { + xfrm_dev_policy_delete(xp); xfrm_audit_policy_delete(xp, err ? 0 : 1, true); if (err != 0) From f3ec2b5d879ef5bbcb24678914641343cb6399a2 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 31 Jul 2023 14:38:27 +0300 Subject: [PATCH 11/49] xfrm: don't skip free of empty state in acquire policy In destruction flow, the assignment of NULL to xso->dev caused to skip of xfrm_dev_state_free() call, which was called in xfrm_state_put(to_put) routine. Instead of open-coded variant of xfrm_dev_state_delete() and xfrm_dev_state_free(), let's use them directly. Fixes: f8a70afafc17 ("xfrm: add TX datapath support for IPsec packet offload mode") Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 + net/xfrm/xfrm_state.c | 8 ++------ 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 151ca95dd08d..363c7d510554 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1984,6 +1984,7 @@ static inline void xfrm_dev_state_free(struct xfrm_state *x) if (dev->xfrmdev_ops->xdo_dev_state_free) dev->xfrmdev_ops->xdo_dev_state_free(x); xso->dev = NULL; + xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; netdev_put(dev, &xso->dev_tracker); } } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 49e63eea841d..bda5327bf34d 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1324,12 +1324,8 @@ found: struct xfrm_dev_offload *xso = &x->xso; if (xso->type == XFRM_DEV_OFFLOAD_PACKET) { - xso->dev->xfrmdev_ops->xdo_dev_state_delete(x); - xso->dir = 0; - netdev_put(xso->dev, &xso->dev_tracker); - xso->dev = NULL; - xso->real_dev = NULL; - xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; + xfrm_dev_state_delete(x); + xfrm_dev_state_free(x); } #endif x->km.state = XFRM_STATE_DEAD; From 51b813176f098ff61bd2833f627f5319ead098a5 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 9 Aug 2023 23:12:56 -0400 Subject: [PATCH 12/49] virtio-net: set queues after driver_ok Commit 25266128fe16 ("virtio-net: fix race between set queues and probe") tries to fix the race between set queues and probe by calling _virtnet_set_queues() before DRIVER_OK is set. This violates virtio spec. Fixing this by setting queues after virtio_device_ready(). Note that rtnl needs to be held for userspace requests to change the number of queues. So we are serialized in this way. Fixes: 25266128fe16 ("virtio-net: fix race between set queues and probe") Reported-by: Dragos Tatulea Acked-by: Michael S. Tsirkin Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 1270c8d23463..ff03921e46df 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -4219,8 +4219,6 @@ static int virtnet_probe(struct virtio_device *vdev) if (vi->has_rss || vi->has_rss_hash_report) virtnet_init_default_rss(vi); - _virtnet_set_queues(vi, vi->curr_queue_pairs); - /* serialize netdev register + virtio_device_ready() with ndo_open() */ rtnl_lock(); @@ -4233,6 +4231,8 @@ static int virtnet_probe(struct virtio_device *vdev) virtio_device_ready(vdev); + _virtnet_set_queues(vi, vi->curr_queue_pairs); + /* a random MAC address has been assigned, notify the device. * We don't fail probe if VIRTIO_NET_F_CTRL_MAC_ADDR is not there * because many devices work fine without getting MAC explicitly From 829c6524d6729d05a82575dbcc16f99be5ee843d Mon Sep 17 00:00:00 2001 From: Xiang Yang Date: Thu, 10 Aug 2023 22:06:39 +0800 Subject: [PATCH 13/49] net: pcs: Add missing put_device call in miic_create The reference of pdev->dev is taken by of_find_device_by_node, so it should be released when not need anymore. Fixes: 7dc54d3b8d91 ("net: pcs: add Renesas MII converter driver") Signed-off-by: Xiang Yang Reviewed-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/pcs/pcs-rzn1-miic.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/pcs/pcs-rzn1-miic.c b/drivers/net/pcs/pcs-rzn1-miic.c index 323bec5e57f8..356099169003 100644 --- a/drivers/net/pcs/pcs-rzn1-miic.c +++ b/drivers/net/pcs/pcs-rzn1-miic.c @@ -313,15 +313,21 @@ struct phylink_pcs *miic_create(struct device *dev, struct device_node *np) pdev = of_find_device_by_node(pcs_np); of_node_put(pcs_np); - if (!pdev || !platform_get_drvdata(pdev)) + if (!pdev || !platform_get_drvdata(pdev)) { + if (pdev) + put_device(&pdev->dev); return ERR_PTR(-EPROBE_DEFER); + } miic_port = kzalloc(sizeof(*miic_port), GFP_KERNEL); - if (!miic_port) + if (!miic_port) { + put_device(&pdev->dev); return ERR_PTR(-ENOMEM); + } miic = platform_get_drvdata(pdev); device_link_add(dev, miic->dev, DL_FLAG_AUTOREMOVE_CONSUMER); + put_device(&pdev->dev); miic_port->miic = miic; miic_port->port = port - 1; From cc941e548bffc01b5816b4edc5cb432a137a58b3 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 11 Aug 2023 11:26:30 +0100 Subject: [PATCH 14/49] net: phy: fix IRQ-based wake-on-lan over hibernate / power off MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Uwe reports: "Most PHYs signal WoL using an interrupt. So disabling interrupts [at shutdown] breaks WoL at least on PHYs covered by the marvell driver." Discussing with Ioana, the problem which was trying to be solved was: "The board in question is a LS1021ATSN which has two AR8031 PHYs that share an interrupt line. In case only one of the PHYs is probed and there are pending interrupts on the PHY#2 an IRQ storm will happen since there is no entity to clear the interrupt from PHY#2's registers. PHY#1's driver will get stuck in .handle_interrupt() indefinitely." Further confirmation that "the two AR8031 PHYs are on the same MDIO bus." With WoL using interrupts to wake the system, in such a case, the system will begin booting with an asserted interrupt. Thus, we need to cope with an interrupt asserted during boot. Solve this instead by disabling interrupts during PHY probe. This will ensure in Ioana's situation that both PHYs of the same type sharing an interrupt line on a common MDIO bus will have their interrupt outputs disabled when the driver probes the device, but before we hook in any interrupt handlers - thus avoiding the interrupt storm. A better fix would be for platform firmware to disable the interrupting devices at source during boot, before control is handed to the kernel. Fixes: e2f016cf7751 ("net: phy: add a shutdown procedure") Link: 20230804071757.383971-1-u.kleine-koenig@pengutronix.de Reported-by: Uwe Kleine-König Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/phy_device.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 61921d4dbb13..c7cf61fe41cf 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -3216,6 +3216,8 @@ static int phy_probe(struct device *dev) goto out; } + phy_disable_interrupts(phydev); + /* Start out supporting everything. Eventually, * a controller will attach, and may modify one * or both of these values @@ -3333,16 +3335,6 @@ static int phy_remove(struct device *dev) return 0; } -static void phy_shutdown(struct device *dev) -{ - struct phy_device *phydev = to_phy_device(dev); - - if (phydev->state == PHY_READY || !phydev->attached_dev) - return; - - phy_disable_interrupts(phydev); -} - /** * phy_driver_register - register a phy_driver with the PHY layer * @new_driver: new phy_driver to register @@ -3376,7 +3368,6 @@ int phy_driver_register(struct phy_driver *new_driver, struct module *owner) new_driver->mdiodrv.driver.bus = &mdio_bus_type; new_driver->mdiodrv.driver.probe = phy_probe; new_driver->mdiodrv.driver.remove = phy_remove; - new_driver->mdiodrv.driver.shutdown = phy_shutdown; new_driver->mdiodrv.driver.owner = owner; new_driver->mdiodrv.driver.probe_type = PROBE_FORCE_SYNCHRONOUS; From ace0ab3a4b54205a01d3f4a0fd9bdb4616cfb60b Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 11 Aug 2023 17:45:23 +0200 Subject: [PATCH 15/49] Revert "vlan: Fix VLAN 0 memory leak" This reverts commit 718cb09aaa6fa78cc8124e9517efbc6c92665384. The commit triggers multiple syzbot issues, probably due to possibility of manually creating VLAN 0 on netdevice which will cause the code to delete it since it can't distinguish such VLAN from implicit VLAN 0 automatically created for devices with NETIF_F_HW_VLAN_CTAG_FILTER feature. Reported-by: syzbot+662f783a5cdf3add2719@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/00000000000090196d0602a6167d@google.com/ Reported-by: syzbot+4b4f06495414e92701d5@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/00000000000096ae870602a61602@google.com/ Reported-by: syzbot+d810d3cd45ed1848c3f7@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/0000000000009f0f9c0602a616ce@google.com/ Fixes: 718cb09aaa6f ("vlan: Fix VLAN 0 memory leak") Signed-off-by: Vlad Buslov Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/8021q/vlan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index b3662119ddbc..e40aa3e3641c 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -384,7 +384,8 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, dev->name); vlan_vid_add(dev, htons(ETH_P_8021Q), 0); } - if (event == NETDEV_DOWN) + if (event == NETDEV_DOWN && + (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) vlan_vid_del(dev, htons(ETH_P_8021Q), 0); vlan_info = rtnl_dereference(dev->vlan_info); From 855067defa36b1f9effad8c219d9a85b655cf500 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 11 Aug 2023 17:59:27 +0200 Subject: [PATCH 16/49] selftests: mirror_gre_changes: Tighten up the TTL test match This test verifies whether the encapsulated packets have the correct configured TTL. It does so by sending ICMP packets through the test topology and mirroring them to a gretap netdevice. On a busy host however, more than just the test ICMP packets may end up flowing through the topology, get mirrored, and counted. This leads to potential spurious failures as the test observes much more mirrored packets than the sent test packets, and assumes a bug. Fix this by tightening up the mirror action match. Change it from matchall to a flower classifier matching on ICMP packets specifically. Fixes: 45315673e0c5 ("selftests: forwarding: Test changes in mirror-to-gretap") Signed-off-by: Petr Machata Tested-by: Mirsad Todorovac Reviewed-by: Ido Schimmel Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- tools/testing/selftests/net/forwarding/mirror_gre_changes.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh b/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh index aff88f78e339..5ea9d63915f7 100755 --- a/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh +++ b/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh @@ -72,7 +72,8 @@ test_span_gre_ttl() RET=0 - mirror_install $swp1 ingress $tundev "matchall $tcflags" + mirror_install $swp1 ingress $tundev \ + "prot ip flower $tcflags ip_prot icmp" tc filter add dev $h3 ingress pref 77 prot $prot \ flower skip_hw ip_ttl 50 action pass From 6c461e394d11a981c662cc16cebfb05b602e23ba Mon Sep 17 00:00:00 2001 From: Radhey Shyam Pandey Date: Mon, 7 Aug 2023 18:44:51 +0530 Subject: [PATCH 17/49] net: macb: In ZynqMP resume always configure PS GTR for non-wakeup source On Zynq UltraScale+ MPSoC ubuntu platform when systemctl issues suspend, network manager bring down the interface and goes into suspend. When it wakes up it again enables the interface. This leads to xilinx-psgtr "PLL lock timeout" on interface bringup, as the power management controller power down the entire FPD (including SERDES) if none of the FPD devices are in use and serdes is not initialized on resume. $ sudo rtcwake -m no -s 120 -v $ sudo systemctl suspend $ ifconfig eth1 up xilinx-psgtr fd400000.phy: lane 0 (type 10, protocol 5): PLL lock timeout phy phy-fd400000.phy.0: phy poweron failed --> -110 macb driver is called in this way: 1. macb_close: Stop network interface. In this function, it reset MACB IP and disables PHY and network interface. 2. macb_suspend: It is called in kernel suspend flow. But because network interface has been disabled(netif_running(ndev) is false), it does nothing and returns directly; 3. System goes into suspend state. Some time later, system is waken up by RTC wakeup device; 4. macb_resume: It does nothing because network interface has been disabled; 5. macb_open: It is called to enable network interface again. ethernet interface is initialized in this API but serdes which is power-off by PMUFW during FPD-off suspend is not initialized again and so we hit GT PLL lock issue on open. To resolve this PLL timeout issue always do PS GTR initialization when ethernet device is configured as non-wakeup source. Fixes: f22bd29ba19a ("net: macb: Fix ZynqMP SGMII non-wakeup source resume failure") Fixes: 8b73fa3ae02b ("net: macb: Added ZynqMP-specific initialization") Signed-off-by: Radhey Shyam Pandey Link: https://lore.kernel.org/r/1691414091-2260697-1-git-send-email-radhey.shyam.pandey@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cadence/macb_main.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index f6a0f12a6d52..82929ee76739 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -5194,6 +5194,9 @@ static int __maybe_unused macb_suspend(struct device *dev) unsigned int q; int err; + if (!device_may_wakeup(&bp->dev->dev)) + phy_exit(bp->sgmii_phy); + if (!netif_running(netdev)) return 0; @@ -5254,7 +5257,6 @@ static int __maybe_unused macb_suspend(struct device *dev) if (!(bp->wol & MACB_WOL_ENABLED)) { rtnl_lock(); phylink_stop(bp->phylink); - phy_exit(bp->sgmii_phy); rtnl_unlock(); spin_lock_irqsave(&bp->lock, flags); macb_reset_hw(bp); @@ -5284,6 +5286,9 @@ static int __maybe_unused macb_resume(struct device *dev) unsigned int q; int err; + if (!device_may_wakeup(&bp->dev->dev)) + phy_init(bp->sgmii_phy); + if (!netif_running(netdev)) return 0; @@ -5344,8 +5349,6 @@ static int __maybe_unused macb_resume(struct device *dev) macb_set_rx_mode(netdev); macb_restore_features(bp); rtnl_lock(); - if (!device_may_wakeup(&bp->dev->dev)) - phy_init(bp->sgmii_phy); phylink_start(bp->phylink); rtnl_unlock(); From 519b227904f0e70d4a1d6cf41daa5392715f2d2f Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Thu, 10 Aug 2023 17:01:11 +0200 Subject: [PATCH 18/49] octeon_ep: fix timeout value for waiting on mbox response The intention was to wait up to 500 ms for the mbox response. The third argument to wait_event_interruptible_timeout() is supposed to be the timeout duration. The driver mistakenly passed absolute time instead. Fixes: 577f0d1b1c5f ("octeon_ep: add separate mailbox command and response queues") Signed-off-by: Michal Schmidt Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230810150114.107765-2-mschmidt@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeon_ep/octep_ctrl_net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_ctrl_net.c b/drivers/net/ethernet/marvell/octeon_ep/octep_ctrl_net.c index 1cc6af2feb38..565320ec24f8 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_ctrl_net.c +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_ctrl_net.c @@ -55,7 +55,7 @@ static int octep_send_mbox_req(struct octep_device *oct, list_add_tail(&d->list, &oct->ctrl_req_wait_list); ret = wait_event_interruptible_timeout(oct->ctrl_req_wait_q, (d->done != 0), - jiffies + msecs_to_jiffies(500)); + msecs_to_jiffies(500)); list_del(&d->list); if (ret == 0 || ret == 1) return -EAGAIN; From 28458c80006bb4e993a09fc094094a8578cad292 Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Thu, 10 Aug 2023 17:01:12 +0200 Subject: [PATCH 19/49] octeon_ep: cancel tx_timeout_task later in remove sequence tx_timeout_task is canceled too early when removing the driver. Nothing prevents .ndo_tx_timeout from triggering and queuing the work again. Better cancel it after the netdev is unregistered. It's harmless for octep_tx_timeout_task to run in the window between the unregistration and cancelation, because it checks netif_running. Fixes: 862cd659a6fb ("octeon_ep: Add driver framework and device initialization") Signed-off-by: Michal Schmidt Link: https://lore.kernel.org/r/20230810150114.107765-3-mschmidt@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeon_ep/octep_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c index 43eb6e871351..d8066bff5f7b 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c @@ -1200,12 +1200,12 @@ static void octep_remove(struct pci_dev *pdev) if (!oct) return; - cancel_work_sync(&oct->tx_timeout_task); cancel_work_sync(&oct->ctrl_mbox_task); netdev = oct->netdev; if (netdev->reg_state == NETREG_REGISTERED) unregister_netdev(netdev); + cancel_work_sync(&oct->tx_timeout_task); oct->poll_non_ioq_intr = false; cancel_delayed_work_sync(&oct->intr_poll_task); octep_device_cleanup(oct); From 607a7a45cdf38c1901e0d81e4e00a2a88786330a Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Thu, 10 Aug 2023 17:01:13 +0200 Subject: [PATCH 20/49] octeon_ep: cancel ctrl_mbox_task after intr_poll_task intr_poll_task may queue ctrl_mbox_task. The function octep_poll_non_ioq_interrupts_cn93_pf does this. When removing the driver and canceling these two works, cancel ctrl_mbox_task last to guarantee it does not run anymore. Fixes: 24d4333233b3 ("octeon_ep: poll for control messages") Signed-off-by: Michal Schmidt Link: https://lore.kernel.org/r/20230810150114.107765-4-mschmidt@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeon_ep/octep_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c index d8066bff5f7b..ab69b6d62509 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c @@ -1200,7 +1200,6 @@ static void octep_remove(struct pci_dev *pdev) if (!oct) return; - cancel_work_sync(&oct->ctrl_mbox_task); netdev = oct->netdev; if (netdev->reg_state == NETREG_REGISTERED) unregister_netdev(netdev); @@ -1208,6 +1207,7 @@ static void octep_remove(struct pci_dev *pdev) cancel_work_sync(&oct->tx_timeout_task); oct->poll_non_ioq_intr = false; cancel_delayed_work_sync(&oct->intr_poll_task); + cancel_work_sync(&oct->ctrl_mbox_task); octep_device_cleanup(oct); pci_release_mem_regions(pdev); free_netdev(netdev); From 758c91078165ae641b698750a72eafe7968b3756 Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Thu, 10 Aug 2023 17:01:14 +0200 Subject: [PATCH 21/49] octeon_ep: cancel queued works in probe error path If it fails to get the devices's MAC address, octep_probe exits while leaving the delayed work intr_poll_task queued. When the work later runs, it's a use after free. Move the cancelation of intr_poll_task from octep_remove into octep_device_cleanup. This does not change anything in the octep_remove flow, but octep_device_cleanup is called also in the octep_probe error path, where the cancelation is needed. Note that the cancelation of ctrl_mbox_task has to follow intr_poll_task's, because the ctrl_mbox_task may be queued by intr_poll_task. Fixes: 24d4333233b3 ("octeon_ep: poll for control messages") Signed-off-by: Michal Schmidt Link: https://lore.kernel.org/r/20230810150114.107765-5-mschmidt@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeon_ep/octep_main.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c index ab69b6d62509..4424de2ffd70 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c @@ -1038,6 +1038,10 @@ static void octep_device_cleanup(struct octep_device *oct) { int i; + oct->poll_non_ioq_intr = false; + cancel_delayed_work_sync(&oct->intr_poll_task); + cancel_work_sync(&oct->ctrl_mbox_task); + dev_info(&oct->pdev->dev, "Cleaning up Octeon Device ...\n"); for (i = 0; i < OCTEP_MAX_VF; i++) { @@ -1205,9 +1209,6 @@ static void octep_remove(struct pci_dev *pdev) unregister_netdev(netdev); cancel_work_sync(&oct->tx_timeout_task); - oct->poll_non_ioq_intr = false; - cancel_delayed_work_sync(&oct->intr_poll_task); - cancel_work_sync(&oct->ctrl_mbox_task); octep_device_cleanup(oct); pci_release_mem_regions(pdev); free_netdev(netdev); From 8a519a572598b7c0c07b02f69bf5b4e8dd4b2d7d Mon Sep 17 00:00:00 2001 From: Liang Chen Date: Sat, 12 Aug 2023 10:30:16 +0800 Subject: [PATCH 22/49] net: veth: Page pool creation error handling for existing pools only The failure handling procedure destroys page pools for all queues, including those that haven't had their page pool created yet. this patch introduces necessary adjustments to prevent potential risks and inconsistency with the error handling behavior. Fixes: 0ebab78cbcbf ("net: veth: add page_pool for page recycling") Acked-by: Jesper Dangaard Brouer Signed-off-by: Liang Chen Link: https://lore.kernel.org/r/20230812023016.10553-1-liangchen.linux@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/veth.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 614f3e3efab0..509e901da41d 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1081,8 +1081,9 @@ static int __veth_napi_enable_range(struct net_device *dev, int start, int end) err_xdp_ring: for (i--; i >= start; i--) ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); + i = end; err_page_pool: - for (i = start; i < end; i++) { + for (i--; i >= start; i--) { page_pool_destroy(priv->rq[i].page_pool); priv->rq[i].page_pool = NULL; } From e4dd0d3a2f64b8bd8029ec70f52bdbebd0644408 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Fri, 11 Aug 2023 10:37:47 +0800 Subject: [PATCH 23/49] net: fix the RTO timer retransmitting skb every 1ms if linear option is enabled In the real workload, I encountered an issue which could cause the RTO timer to retransmit the skb per 1ms with linear option enabled. The amount of lost-retransmitted skbs can go up to 1000+ instantly. The root cause is that if the icsk_rto happens to be zero in the 6th round (which is the TCP_THIN_LINEAR_RETRIES value), then it will always be zero due to the changed calculation method in tcp_retransmit_timer() as follows: icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); Above line could be converted to icsk->icsk_rto = min(0 << 1, TCP_RTO_MAX) = 0 Therefore, the timer expires so quickly without any doubt. I read through the RFC 6298 and found that the RTO value can be rounded up to a certain value, in Linux, say TCP_RTO_MIN as default, which is regarded as the lower bound in this patch as suggested by Eric. Fixes: 36e31b0af587 ("net: TCP thin linear timeouts") Suggested-by: Eric Dumazet Signed-off-by: Jason Xing Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 470f581eedd4..206418b6d7c4 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -591,7 +591,9 @@ out_reset_timer: tcp_stream_is_thin(tp) && icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) { icsk->icsk_backoff = 0; - icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX); + icsk->icsk_rto = clamp(__tcp_set_rto(tp), + tcp_rto_min(sk), + TCP_RTO_MAX); } else if (sk->sk_state != TCP_SYN_SENT || icsk->icsk_backoff > READ_ONCE(net->ipv4.sysctl_tcp_syn_linear_timeouts)) { From b9f052dc68f69dac89fe1e24693354c033daa091 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 8 Aug 2023 20:40:17 +0200 Subject: [PATCH 24/49] netfilter: nf_tables: fix false-positive lockdep splat ->abort invocation may cause splat on debug kernels: WARNING: suspicious RCU usage net/netfilter/nft_set_pipapo.c:1697 suspicious rcu_dereference_check() usage! [..] rcu_scheduler_active = 2, debug_locks = 1 1 lock held by nft/133554: [..] (nft_net->commit_mutex){+.+.}-{3:3}, at: nf_tables_valid_genid [..] lockdep_rcu_suspicious+0x1ad/0x260 nft_pipapo_abort+0x145/0x180 __nf_tables_abort+0x5359/0x63d0 nf_tables_abort+0x24/0x40 nfnetlink_rcv+0x1a0a/0x22c0 netlink_unicast+0x73c/0x900 netlink_sendmsg+0x7f0/0xc20 ____sys_sendmsg+0x48d/0x760 Transaction mutex is held, so parallel updates are not possible. Switch to _protected and check mutex is held for lockdep enabled builds. Fixes: 212ed75dc5fb ("netfilter: nf_tables: integrate pipapo into commit protocol") Signed-off-by: Florian Westphal --- net/netfilter/nft_set_pipapo.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index a5b8301afe4a..5fa12cfc7b84 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1697,6 +1697,17 @@ static void nft_pipapo_commit(const struct nft_set *set) priv->clone = new_clone; } +static bool nft_pipapo_transaction_mutex_held(const struct nft_set *set) +{ +#ifdef CONFIG_PROVE_LOCKING + const struct net *net = read_pnet(&set->net); + + return lockdep_is_held(&nft_pernet(net)->commit_mutex); +#else + return true; +#endif +} + static void nft_pipapo_abort(const struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); @@ -1705,7 +1716,7 @@ static void nft_pipapo_abort(const struct nft_set *set) if (!priv->dirty) return; - m = rcu_dereference(priv->match); + m = rcu_dereference_protected(priv->match, nft_pipapo_transaction_mutex_held(set)); new_clone = pipapo_clone(m); if (IS_ERR(new_clone)) From 08713cb006b6f07434f276c5ee214fb20c7fd965 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 10 Aug 2023 23:59:03 +0200 Subject: [PATCH 25/49] netfilter: nf_tables: fix kdoc warnings after gc rework Jakub Kicinski says: We've got some new kdoc warnings here: net/netfilter/nft_set_pipapo.c:1557: warning: Function parameter or member '_set' not described in 'pipapo_gc' net/netfilter/nft_set_pipapo.c:1557: warning: Excess function parameter 'set' description in 'pipapo_gc' include/net/netfilter/nf_tables.h:577: warning: Function parameter or member 'dead' not described in 'nft_set' Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API") Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/netdev/20230810104638.746e46f1@kernel.org/ Signed-off-by: Florian Westphal --- include/net/netfilter/nf_tables.h | 1 + net/netfilter/nft_set_pipapo.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 35870858ddf2..e9ae567c037d 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -534,6 +534,7 @@ struct nft_set_elem_expr { * @expr: stateful expression * @ops: set ops * @flags: set flags + * @dead: set will be freed, never cleared * @genmask: generation mask * @klen: key length * @dlen: data length diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 5fa12cfc7b84..f95b3844162e 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1549,7 +1549,7 @@ static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set, /** * pipapo_gc() - Drop expired entries from set, destroy start and end elements - * @set: nftables API set representation + * @_set: nftables API set representation * @m: Matching data */ static void pipapo_gc(const struct nft_set *_set, struct nft_pipapo_match *m) From 90e5b3462efa37b8bba82d7c4e63683856e188af Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 12 Aug 2023 13:05:16 +0200 Subject: [PATCH 26/49] netfilter: nf_tables: deactivate catchall elements in next generation When flushing, individual set elements are disabled in the next generation via the ->flush callback. Catchall elements are not disabled. This is incorrect and may lead to double-deactivations of catchall elements which then results in memory leaks: WARNING: CPU: 1 PID: 3300 at include/net/netfilter/nf_tables.h:1172 nft_map_deactivate+0x549/0x730 CPU: 1 PID: 3300 Comm: nft Not tainted 6.5.0-rc5+ #60 RIP: 0010:nft_map_deactivate+0x549/0x730 [..] ? nft_map_deactivate+0x549/0x730 nf_tables_delset+0xb66/0xeb0 (the warn is due to nft_use_dec() detecting underflow). Fixes: aaa31047a6d2 ("netfilter: nftables: add catch-all set element support") Reported-by: lonial con Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c62227ae7746..6f31022cacc6 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7091,6 +7091,7 @@ static int nft_set_catchall_flush(const struct nft_ctx *ctx, ret = __nft_set_catchall_flush(ctx, set, &elem); if (ret < 0) break; + nft_set_elem_change_active(ctx->net, set, ext); } return ret; From 7845914f45f066497ac75b30c50dbc735e84e884 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 12 Aug 2023 20:03:57 +0200 Subject: [PATCH 27/49] netfilter: nf_tables: don't fail inserts if duplicate has expired nftables selftests fail: run-tests.sh testcases/sets/0044interval_overlap_0 Expected: 0-2 . 0-3, got: W: [FAILED] ./testcases/sets/0044interval_overlap_0: got 1 Insertion must ignore duplicate but expired entries. Moreover, there is a strange asymmetry in nft_pipapo_activate: It refetches the current element, whereas the other ->activate callbacks (bitmap, hash, rhash, rbtree) use elem->priv. Same for .remove: other set implementations take elem->priv, nft_pipapo_remove fetches elem->priv, then does a relookup, remove this. I suspect this was the reason for the change that prompted the removal of the expired check in pipapo_get() in the first place, but skipping exired elements there makes no sense to me, this helper is used for normal get requests, insertions (duplicate check) and deactivate callback. In first two cases expired elements must be skipped. For ->deactivate(), this gets called for DELSETELEM, so it seems to me that expired elements should be skipped as well, i.e. delete request should fail with -ENOENT error. Fixes: 24138933b97b ("netfilter: nf_tables: don't skip expired elements during walk") Signed-off-by: Florian Westphal --- net/netfilter/nft_set_pipapo.c | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index f95b3844162e..3757fcc55723 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -566,6 +566,8 @@ next_match: goto out; if (last) { + if (nft_set_elem_expired(&f->mt[b].e->ext)) + goto next_match; if ((genmask && !nft_set_elem_active(&f->mt[b].e->ext, genmask))) goto next_match; @@ -600,17 +602,8 @@ out: static void *nft_pipapo_get(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags) { - struct nft_pipapo_elem *ret; - - ret = pipapo_get(net, set, (const u8 *)elem->key.val.data, + return pipapo_get(net, set, (const u8 *)elem->key.val.data, nft_genmask_cur(net)); - if (IS_ERR(ret)) - return ret; - - if (nft_set_elem_expired(&ret->ext)) - return ERR_PTR(-ENOENT); - - return ret; } /** @@ -1743,11 +1736,7 @@ static void nft_pipapo_activate(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { - struct nft_pipapo_elem *e; - - e = pipapo_get(net, set, (const u8 *)elem->key.val.data, 0); - if (IS_ERR(e)) - return; + struct nft_pipapo_elem *e = elem->priv; nft_set_elem_change_active(net, set, &e->ext); } @@ -1961,10 +1950,6 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, data = (const u8 *)nft_set_ext_key(&e->ext); - e = pipapo_get(net, set, data, 0); - if (IS_ERR(e)) - return; - while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; const u8 *match_start, *match_end; From 9bfab6d23a2865966a4f89a96536fbf23f83bc8c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 15 Aug 2023 14:08:47 -0400 Subject: [PATCH 28/49] netfilter: set default timeout to 3 secs for sctp shutdown send and recv state In SCTP protocol, it is using the same timer (T2 timer) for SHUTDOWN and SHUTDOWN_ACK retransmission. However in sctp conntrack the default timeout value for SCTP_CONNTRACK_SHUTDOWN_ACK_SENT state is 3 secs while it's 300 msecs for SCTP_CONNTRACK_SHUTDOWN_SEND/RECV state. As Paolo Valerio noticed, this might cause unwanted expiration of the ct entry. In my test, with 1s tc netem delay set on the NAT path, after the SHUTDOWN is sent, the sctp ct entry enters SCTP_CONNTRACK_SHUTDOWN_SEND state. However, due to 300ms (too short) delay, when the SHUTDOWN_ACK is sent back from the peer, the sctp ct entry has expired and been deleted, and then the SHUTDOWN_ACK has to be dropped. Also, it is confusing these two sysctl options always show 0 due to all timeout values using sec as unit: net.netfilter.nf_conntrack_sctp_timeout_shutdown_recd = 0 net.netfilter.nf_conntrack_sctp_timeout_shutdown_sent = 0 This patch fixes it by also using 3 secs for sctp shutdown send and recv state in sctp conntrack, which is also RTO.initial value in SCTP protocol. Note that the very short time value for SCTP_CONNTRACK_SHUTDOWN_SEND/RECV was probably used for a rare scenario where SHUTDOWN is sent on 1st path but SHUTDOWN_ACK is replied on 2nd path, then a new connection started immediately on 1st path. So this patch also moves from SHUTDOWN_SEND/RECV to CLOSE when receiving INIT in the ORIGINAL direction. Fixes: 9fb9cbb1082d ("[NETFILTER]: Add nf_conntrack subsystem.") Reported-by: Paolo Valerio Signed-off-by: Xin Long Reviewed-by: Simon Horman Signed-off-by: Florian Westphal --- Documentation/networking/nf_conntrack-sysctl.rst | 4 ++-- net/netfilter/nf_conntrack_proto_sctp.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst index 8b1045c3b59e..c383a394c665 100644 --- a/Documentation/networking/nf_conntrack-sysctl.rst +++ b/Documentation/networking/nf_conntrack-sysctl.rst @@ -178,10 +178,10 @@ nf_conntrack_sctp_timeout_established - INTEGER (seconds) Default is set to (hb_interval * path_max_retrans + rto_max) nf_conntrack_sctp_timeout_shutdown_sent - INTEGER (seconds) - default 0.3 + default 3 nf_conntrack_sctp_timeout_shutdown_recd - INTEGER (seconds) - default 0.3 + default 3 nf_conntrack_sctp_timeout_shutdown_ack_sent - INTEGER (seconds) default 3 diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 91eacc9b0b98..b6bcc8f2f46b 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -49,8 +49,8 @@ static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { [SCTP_CONNTRACK_COOKIE_WAIT] = 3 SECS, [SCTP_CONNTRACK_COOKIE_ECHOED] = 3 SECS, [SCTP_CONNTRACK_ESTABLISHED] = 210 SECS, - [SCTP_CONNTRACK_SHUTDOWN_SENT] = 300 SECS / 1000, - [SCTP_CONNTRACK_SHUTDOWN_RECD] = 300 SECS / 1000, + [SCTP_CONNTRACK_SHUTDOWN_SENT] = 3 SECS, + [SCTP_CONNTRACK_SHUTDOWN_RECD] = 3 SECS, [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS, [SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS, }; @@ -105,7 +105,7 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */ -/* init */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCW}, +/* init */ {sCL, sCL, sCW, sCE, sES, sCL, sCL, sSA, sCW}, /* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL}, /* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, /* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL}, From 5310760af1d4fbea1452bfc77db5f9a680f7ae47 Mon Sep 17 00:00:00 2001 From: Sishuai Gong Date: Thu, 10 Aug 2023 15:12:42 -0400 Subject: [PATCH 29/49] ipvs: fix racy memcpy in proc_do_sync_threshold When two threads run proc_do_sync_threshold() in parallel, data races could happen between the two memcpy(): Thread-1 Thread-2 memcpy(val, valp, sizeof(val)); memcpy(valp, val, sizeof(val)); This race might mess up the (struct ctl_table *) table->data, so we add a mutex lock to serialize them. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Link: https://lore.kernel.org/netdev/B6988E90-0A1E-4B85-BF26-2DAF6D482433@gmail.com/ Signed-off-by: Sishuai Gong Acked-by: Simon Horman Acked-by: Julian Anastasov Signed-off-by: Florian Westphal --- net/netfilter/ipvs/ip_vs_ctl.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 62606fb44d02..4bb0d90eca1c 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1876,6 +1876,7 @@ static int proc_do_sync_threshold(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { + struct netns_ipvs *ipvs = table->extra2; int *valp = table->data; int val[2]; int rc; @@ -1885,6 +1886,7 @@ proc_do_sync_threshold(struct ctl_table *table, int write, .mode = table->mode, }; + mutex_lock(&ipvs->sync_mutex); memcpy(val, valp, sizeof(val)); rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); if (write) { @@ -1894,6 +1896,7 @@ proc_do_sync_threshold(struct ctl_table *table, int write, else memcpy(valp, val, sizeof(val)); } + mutex_unlock(&ipvs->sync_mutex); return rc; } @@ -4321,6 +4324,7 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD; ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD; tbl[idx].data = &ipvs->sysctl_sync_threshold; + tbl[idx].extra2 = ipvs; tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold); ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD; tbl[idx++].data = &ipvs->sysctl_sync_refresh_period; From 6a33d8b73dfac0a41f3877894b38082bd0c9a5bc Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 15 Aug 2023 15:39:00 +0200 Subject: [PATCH 30/49] netfilter: nf_tables: fix GC transaction races with netns and netlink event exit path Netlink event path is missing a synchronization point with GC transactions. Add GC sequence number update to netns release path and netlink event path, any GC transaction losing race will be discarded. Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 36 +++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 6f31022cacc6..8ac4dd8be1a2 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9739,6 +9739,22 @@ static void nft_set_commit_update(struct list_head *set_update_list) } } +static unsigned int nft_gc_seq_begin(struct nftables_pernet *nft_net) +{ + unsigned int gc_seq; + + /* Bump gc counter, it becomes odd, this is the busy mark. */ + gc_seq = READ_ONCE(nft_net->gc_seq); + WRITE_ONCE(nft_net->gc_seq, ++gc_seq); + + return gc_seq; +} + +static void nft_gc_seq_end(struct nftables_pernet *nft_net, unsigned int gc_seq) +{ + WRITE_ONCE(nft_net->gc_seq, ++gc_seq); +} + static int nf_tables_commit(struct net *net, struct sk_buff *skb) { struct nftables_pernet *nft_net = nft_pernet(net); @@ -9824,9 +9840,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) WRITE_ONCE(nft_net->base_seq, base_seq); - /* Bump gc counter, it becomes odd, this is the busy mark. */ - gc_seq = READ_ONCE(nft_net->gc_seq); - WRITE_ONCE(nft_net->gc_seq, ++gc_seq); + gc_seq = nft_gc_seq_begin(nft_net); /* step 3. Start new generation, rules_gen_X now in use. */ net->nft.gencursor = nft_gencursor_next(net); @@ -10039,7 +10053,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); nf_tables_commit_audit_log(&adl, nft_net->base_seq); - WRITE_ONCE(nft_net->gc_seq, ++gc_seq); + nft_gc_seq_end(nft_net, gc_seq); nf_tables_commit_release(net); return 0; @@ -11040,6 +11054,7 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, struct net *net = n->net; unsigned int deleted; bool restart = false; + unsigned int gc_seq; if (event != NETLINK_URELEASE || n->protocol != NETLINK_NETFILTER) return NOTIFY_DONE; @@ -11047,6 +11062,9 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, nft_net = nft_pernet(net); deleted = 0; mutex_lock(&nft_net->commit_mutex); + + gc_seq = nft_gc_seq_begin(nft_net); + if (!list_empty(&nf_tables_destroy_list)) rcu_barrier(); again: @@ -11069,6 +11087,8 @@ again: if (restart) goto again; } + nft_gc_seq_end(nft_net, gc_seq); + mutex_unlock(&nft_net->commit_mutex); return NOTIFY_DONE; @@ -11106,12 +11126,20 @@ static void __net_exit nf_tables_pre_exit_net(struct net *net) static void __net_exit nf_tables_exit_net(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); + unsigned int gc_seq; mutex_lock(&nft_net->commit_mutex); + + gc_seq = nft_gc_seq_begin(nft_net); + if (!list_empty(&nft_net->commit_list) || !list_empty(&nft_net->module_list)) __nf_tables_abort(net, NFNL_ABORT_NONE); + __nft_release_tables(net); + + nft_gc_seq_end(nft_net, gc_seq); + mutex_unlock(&nft_net->commit_mutex); WARN_ON_ONCE(!list_empty(&nft_net->tables)); WARN_ON_ONCE(!list_empty(&nft_net->module_list)); From 02c6c24402bf1c1e986899c14ba22a10b510916b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 15 Aug 2023 15:39:01 +0200 Subject: [PATCH 31/49] netfilter: nf_tables: GC transaction race with netns dismantle Use maybe_get_net() since GC workqueue might race with netns exit path. Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 8ac4dd8be1a2..3e841e45f2c0 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9481,9 +9481,14 @@ struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, if (!trans) return NULL; + trans->net = maybe_get_net(net); + if (!trans->net) { + kfree(trans); + return NULL; + } + refcount_inc(&set->refs); trans->set = set; - trans->net = get_net(net); trans->seq = gc_seq; return trans; From 23185c6aed1ffb8fc44087880ba2767aba493779 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 15 Aug 2023 15:39:02 +0200 Subject: [PATCH 32/49] netfilter: nft_dynset: disallow object maps Do not allow to insert elements from datapath to objects maps. Fixes: 8aeff920dcc9 ("netfilter: nf_tables: add stateful object reference to set elements") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal --- net/netfilter/nft_dynset.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 4fb34d76dbea..5c5cc01c73c5 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -191,6 +191,9 @@ static int nft_dynset_init(const struct nft_ctx *ctx, if (IS_ERR(set)) return PTR_ERR(set); + if (set->flags & NFT_SET_OBJECT) + return -EOPNOTSUPP; + if (set->ops->update == NULL) return -EOPNOTSUPP; From 096516d092d54604d590827d05b1022c8f326639 Mon Sep 17 00:00:00 2001 From: Justin Chen Date: Sat, 12 Aug 2023 21:41:47 -0700 Subject: [PATCH 33/49] net: phy: broadcom: stub c45 read/write for 54810 The 54810 does not support c45. The mmd_phy_indirect accesses return arbirtary values leading to odd behavior like saying it supports EEE when it doesn't. We also see that reading/writing these non-existent MMD registers leads to phy instability in some cases. Fixes: b14995ac2527 ("net: phy: broadcom: Add BCM54810 PHY entry") Signed-off-by: Justin Chen Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/1691901708-28650-1-git-send-email-justin.chen@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/broadcom.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c index 59cae0d808aa..04b2e6eeb195 100644 --- a/drivers/net/phy/broadcom.c +++ b/drivers/net/phy/broadcom.c @@ -542,6 +542,17 @@ static int bcm54xx_resume(struct phy_device *phydev) return bcm54xx_config_init(phydev); } +static int bcm54810_read_mmd(struct phy_device *phydev, int devnum, u16 regnum) +{ + return -EOPNOTSUPP; +} + +static int bcm54810_write_mmd(struct phy_device *phydev, int devnum, u16 regnum, + u16 val) +{ + return -EOPNOTSUPP; +} + static int bcm54811_config_init(struct phy_device *phydev) { int err, reg; @@ -1103,6 +1114,8 @@ static struct phy_driver broadcom_drivers[] = { .get_strings = bcm_phy_get_strings, .get_stats = bcm54xx_get_stats, .probe = bcm54xx_phy_probe, + .read_mmd = bcm54810_read_mmd, + .write_mmd = bcm54810_write_mmd, .config_init = bcm54xx_config_init, .config_aneg = bcm5481_config_aneg, .config_intr = bcm_phy_config_intr, From dafcbce07136d799edc4c67f04f9fd69ff1eac1f Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Mon, 14 Aug 2023 11:23:01 +0800 Subject: [PATCH 34/49] team: Fix incorrect deletion of ETH_P_8021AD protocol vid from slaves Similar to commit 01f4fd270870 ("bonding: Fix incorrect deletion of ETH_P_8021AD protocol vid from slaves"), we can trigger BUG_ON(!vlan_info) in unregister_vlan_dev() with the following testcase: # ip netns add ns1 # ip netns exec ns1 ip link add team1 type team # ip netns exec ns1 ip link add team_slave type veth peer veth2 # ip netns exec ns1 ip link set team_slave master team1 # ip netns exec ns1 ip link add link team_slave name team_slave.10 type vlan id 10 protocol 802.1ad # ip netns exec ns1 ip link add link team1 name team1.10 type vlan id 10 protocol 802.1ad # ip netns exec ns1 ip link set team_slave nomaster # ip netns del ns1 Add S-VLAN tag related features support to team driver. So the team driver will always propagate the VLAN info to its slaves. Fixes: 8ad227ff89a7 ("net: vlan: add 802.1ad support") Suggested-by: Ido Schimmel Signed-off-by: Ziyang Xuan Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230814032301.2804971-1-william.xuanziyang@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/team/team.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index d3dc22509ea5..382756c3fb83 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -2200,7 +2200,9 @@ static void team_setup(struct net_device *dev) dev->hw_features = TEAM_VLAN_FEATURES | NETIF_F_HW_VLAN_CTAG_RX | - NETIF_F_HW_VLAN_CTAG_FILTER; + NETIF_F_HW_VLAN_CTAG_FILTER | + NETIF_F_HW_VLAN_STAG_RX | + NETIF_F_HW_VLAN_STAG_FILTER; dev->hw_features |= NETIF_F_GSO_ENCAP_ALL; dev->features |= dev->hw_features; From a552bfa16bab4ce901ee721346a28c4e483f4066 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 14 Aug 2023 13:38:40 -0700 Subject: [PATCH 35/49] net: openvswitch: reject negative ifindex Recent changes in net-next (commit 759ab1edb56c ("net: store netdevs in an xarray")) refactored the handling of pre-assigned ifindexes and let syzbot surface a latent problem in ovs. ovs does not validate ifindex, making it possible to create netdev ports with negative ifindex values. It's easy to repro with YNL: $ ./cli.py --spec netlink/specs/ovs_datapath.yaml \ --do new \ --json '{"upcall-pid": 1, "name":"my-dp"}' $ ./cli.py --spec netlink/specs/ovs_vport.yaml \ --do new \ --json '{"upcall-pid": "00000001", "name": "some-port0", "dp-ifindex":3,"ifindex":4294901760,"type":2}' $ ip link show -65536: some-port0: mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether 7a:48:21:ad:0b:fb brd ff:ff:ff:ff:ff:ff ... Validate the inputs. Now the second command correctly returns: $ ./cli.py --spec netlink/specs/ovs_vport.yaml \ --do new \ --json '{"upcall-pid": "00000001", "name": "some-port0", "dp-ifindex":3,"ifindex":4294901760,"type":2}' lib.ynl.NlError: Netlink error: Numerical result out of range nl_len = 108 (92) nl_flags = 0x300 nl_type = 2 error: -34 extack: {'msg': 'integer out of range', 'unknown': [[type:4 len:36] b'\x0c\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0c\x00\x03\x00\xff\xff\xff\x7f\x00\x00\x00\x00\x08\x00\x01\x00\x08\x00\x00\x00'], 'bad-attr': '.ifindex'} Accept 0 since it used to be silently ignored. Fixes: 54c4ef34c4b6 ("openvswitch: allow specifying ifindex of new interfaces") Reported-by: syzbot+7456b5dcf65111553320@syzkaller.appspotmail.com Reviewed-by: Leon Romanovsky Reviewed-by: Aaron Conole Link: https://lore.kernel.org/r/20230814203840.2908710-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/openvswitch/datapath.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index a6d2a0b1aa21..3d7a91e64c88 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1829,7 +1829,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) parms.port_no = OVSP_LOCAL; parms.upcall_portids = a[OVS_DP_ATTR_UPCALL_PID]; parms.desired_ifindex = a[OVS_DP_ATTR_IFINDEX] - ? nla_get_u32(a[OVS_DP_ATTR_IFINDEX]) : 0; + ? nla_get_s32(a[OVS_DP_ATTR_IFINDEX]) : 0; /* So far only local changes have been made, now need the lock. */ ovs_lock(); @@ -2049,7 +2049,7 @@ static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { [OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 }, [OVS_DP_ATTR_MASKS_CACHE_SIZE] = NLA_POLICY_RANGE(NLA_U32, 0, PCPU_MIN_UNIT_SIZE / sizeof(struct mask_cache_entry)), - [OVS_DP_ATTR_IFINDEX] = {.type = NLA_U32 }, + [OVS_DP_ATTR_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 0), }; static const struct genl_small_ops dp_datapath_genl_ops[] = { @@ -2302,7 +2302,7 @@ restart: parms.port_no = port_no; parms.upcall_portids = a[OVS_VPORT_ATTR_UPCALL_PID]; parms.desired_ifindex = a[OVS_VPORT_ATTR_IFINDEX] - ? nla_get_u32(a[OVS_VPORT_ATTR_IFINDEX]) : 0; + ? nla_get_s32(a[OVS_VPORT_ATTR_IFINDEX]) : 0; vport = new_vport(&parms); err = PTR_ERR(vport); @@ -2539,7 +2539,7 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 }, [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_UNSPEC }, [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED }, - [OVS_VPORT_ATTR_IFINDEX] = { .type = NLA_U32 }, + [OVS_VPORT_ATTR_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 0), [OVS_VPORT_ATTR_NETNSID] = { .type = NLA_S32 }, [OVS_VPORT_ATTR_UPCALL_STATS] = { .type = NLA_NESTED }, }; From 9944d203fa63721b87eee84a89f7275dc3d25c05 Mon Sep 17 00:00:00 2001 From: Artem Chernyshev Date: Tue, 15 Aug 2023 00:00:30 +0300 Subject: [PATCH 36/49] broadcom: b44: Use b44_writephy() return value Return result of b44_writephy() instead of zero to deal with possible error. Found by Linux Verification Center (linuxtesting.org) with SVACE. Signed-off-by: Artem Chernyshev Reviewed-by: Leon Romanovsky Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/b44.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/broadcom/b44.c b/drivers/net/ethernet/broadcom/b44.c index 392ec09a1d8a..3e4fb3c3e834 100644 --- a/drivers/net/ethernet/broadcom/b44.c +++ b/drivers/net/ethernet/broadcom/b44.c @@ -1793,11 +1793,9 @@ static int b44_nway_reset(struct net_device *dev) b44_readphy(bp, MII_BMCR, &bmcr); b44_readphy(bp, MII_BMCR, &bmcr); r = -EINVAL; - if (bmcr & BMCR_ANENABLE) { - b44_writephy(bp, MII_BMCR, - bmcr | BMCR_ANRESTART); - r = 0; - } + if (bmcr & BMCR_ANENABLE) + r = b44_writephy(bp, MII_BMCR, + bmcr | BMCR_ANRESTART); spin_unlock_irq(&bp->lock); return r; From 0b70f1950e79b37df5617c83ed1ad1a4cc7fc89c Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 15 Aug 2023 17:27:49 +0200 Subject: [PATCH 37/49] mailmap: add entries for Simon Horman Retire some of my email addresses from Kernel activities. Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- .mailmap | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index 5dd318121982..e50662536c48 100644 --- a/.mailmap +++ b/.mailmap @@ -538,6 +538,8 @@ Shuah Khan Sibi Sankar Sid Manning Simon Arlott +Simon Horman +Simon Horman Simon Kelley Sricharan Ramabadhran Srinivas Ramana From b35c968363c036e93f95cb233182f2d1c44605c2 Mon Sep 17 00:00:00 2001 From: Prasad Pandit Date: Wed, 16 Aug 2023 13:26:06 +0530 Subject: [PATCH 38/49] ipv6: fix indentation of a config attribute Fix indentation of a type attribute of IPV6_VTI config entry. Signed-off-by: Prasad Pandit Signed-off-by: David S. Miller --- net/ipv6/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 658bfed1df8b..08d4b7132d4c 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -152,7 +152,7 @@ config INET6_TUNNEL default n config IPV6_VTI -tristate "Virtual (secure) IPv6: tunneling" + tristate "Virtual (secure) IPv6: tunneling" select IPV6_TUNNEL select NET_IP_TUNNEL select XFRM From 751969e5b1196821ef78f0aa664a8a97c92c9057 Mon Sep 17 00:00:00 2001 From: Piotr Gardocki Date: Mon, 7 Aug 2023 16:46:04 +0200 Subject: [PATCH 39/49] iavf: fix FDIR rule fields masks validation Return an error if a field's mask is neither full nor empty. When a mask is only partial the field is not being used for rule programming but it gives a wrong impression it is used. Fix by returning an error on any partial mask to make it clear they are not supported. The ip_ver assignment is moved earlier in code to allow using it in iavf_validate_fdir_fltr_masks. Fixes: 527691bf0682 ("iavf: Support IPv4 Flow Director filters") Fixes: e90cbc257a6f ("iavf: Support IPv6 Flow Director filters") Signed-off-by: Piotr Gardocki Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- .../net/ethernet/intel/iavf/iavf_ethtool.c | 10 +++ drivers/net/ethernet/intel/iavf/iavf_fdir.c | 77 ++++++++++++++++++- drivers/net/ethernet/intel/iavf/iavf_fdir.h | 2 + 3 files changed, 85 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c index 460ca561819a..a34303ad057d 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c +++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c @@ -1289,6 +1289,7 @@ iavf_add_fdir_fltr_info(struct iavf_adapter *adapter, struct ethtool_rx_flow_spe fltr->ip_mask.src_port = fsp->m_u.tcp_ip4_spec.psrc; fltr->ip_mask.dst_port = fsp->m_u.tcp_ip4_spec.pdst; fltr->ip_mask.tos = fsp->m_u.tcp_ip4_spec.tos; + fltr->ip_ver = 4; break; case AH_V4_FLOW: case ESP_V4_FLOW: @@ -1300,6 +1301,7 @@ iavf_add_fdir_fltr_info(struct iavf_adapter *adapter, struct ethtool_rx_flow_spe fltr->ip_mask.v4_addrs.dst_ip = fsp->m_u.ah_ip4_spec.ip4dst; fltr->ip_mask.spi = fsp->m_u.ah_ip4_spec.spi; fltr->ip_mask.tos = fsp->m_u.ah_ip4_spec.tos; + fltr->ip_ver = 4; break; case IPV4_USER_FLOW: fltr->ip_data.v4_addrs.src_ip = fsp->h_u.usr_ip4_spec.ip4src; @@ -1312,6 +1314,7 @@ iavf_add_fdir_fltr_info(struct iavf_adapter *adapter, struct ethtool_rx_flow_spe fltr->ip_mask.l4_header = fsp->m_u.usr_ip4_spec.l4_4_bytes; fltr->ip_mask.tos = fsp->m_u.usr_ip4_spec.tos; fltr->ip_mask.proto = fsp->m_u.usr_ip4_spec.proto; + fltr->ip_ver = 4; break; case TCP_V6_FLOW: case UDP_V6_FLOW: @@ -1330,6 +1333,7 @@ iavf_add_fdir_fltr_info(struct iavf_adapter *adapter, struct ethtool_rx_flow_spe fltr->ip_mask.src_port = fsp->m_u.tcp_ip6_spec.psrc; fltr->ip_mask.dst_port = fsp->m_u.tcp_ip6_spec.pdst; fltr->ip_mask.tclass = fsp->m_u.tcp_ip6_spec.tclass; + fltr->ip_ver = 6; break; case AH_V6_FLOW: case ESP_V6_FLOW: @@ -1345,6 +1349,7 @@ iavf_add_fdir_fltr_info(struct iavf_adapter *adapter, struct ethtool_rx_flow_spe sizeof(struct in6_addr)); fltr->ip_mask.spi = fsp->m_u.ah_ip6_spec.spi; fltr->ip_mask.tclass = fsp->m_u.ah_ip6_spec.tclass; + fltr->ip_ver = 6; break; case IPV6_USER_FLOW: memcpy(&fltr->ip_data.v6_addrs.src_ip, fsp->h_u.usr_ip6_spec.ip6src, @@ -1361,6 +1366,7 @@ iavf_add_fdir_fltr_info(struct iavf_adapter *adapter, struct ethtool_rx_flow_spe fltr->ip_mask.l4_header = fsp->m_u.usr_ip6_spec.l4_4_bytes; fltr->ip_mask.tclass = fsp->m_u.usr_ip6_spec.tclass; fltr->ip_mask.proto = fsp->m_u.usr_ip6_spec.l4_proto; + fltr->ip_ver = 6; break; case ETHER_FLOW: fltr->eth_data.etype = fsp->h_u.ether_spec.h_proto; @@ -1371,6 +1377,10 @@ iavf_add_fdir_fltr_info(struct iavf_adapter *adapter, struct ethtool_rx_flow_spe return -EINVAL; } + err = iavf_validate_fdir_fltr_masks(adapter, fltr); + if (err) + return err; + if (iavf_fdir_is_dup_fltr(adapter, fltr)) return -EEXIST; diff --git a/drivers/net/ethernet/intel/iavf/iavf_fdir.c b/drivers/net/ethernet/intel/iavf/iavf_fdir.c index 505e82ebafe4..03e774bd2a5b 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_fdir.c +++ b/drivers/net/ethernet/intel/iavf/iavf_fdir.c @@ -18,6 +18,79 @@ static const struct in6_addr ipv6_addr_full_mask = { } }; +static const struct in6_addr ipv6_addr_zero_mask = { + .in6_u = { + .u6_addr8 = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + } + } +}; + +/** + * iavf_validate_fdir_fltr_masks - validate Flow Director filter fields masks + * @adapter: pointer to the VF adapter structure + * @fltr: Flow Director filter data structure + * + * Returns 0 if all masks of packet fields are either full or empty. Returns + * error on at least one partial mask. + */ +int iavf_validate_fdir_fltr_masks(struct iavf_adapter *adapter, + struct iavf_fdir_fltr *fltr) +{ + if (fltr->eth_mask.etype && fltr->eth_mask.etype != htons(U16_MAX)) + goto partial_mask; + + if (fltr->ip_ver == 4) { + if (fltr->ip_mask.v4_addrs.src_ip && + fltr->ip_mask.v4_addrs.src_ip != htonl(U32_MAX)) + goto partial_mask; + + if (fltr->ip_mask.v4_addrs.dst_ip && + fltr->ip_mask.v4_addrs.dst_ip != htonl(U32_MAX)) + goto partial_mask; + + if (fltr->ip_mask.tos && fltr->ip_mask.tos != U8_MAX) + goto partial_mask; + } else if (fltr->ip_ver == 6) { + if (memcmp(&fltr->ip_mask.v6_addrs.src_ip, &ipv6_addr_zero_mask, + sizeof(struct in6_addr)) && + memcmp(&fltr->ip_mask.v6_addrs.src_ip, &ipv6_addr_full_mask, + sizeof(struct in6_addr))) + goto partial_mask; + + if (memcmp(&fltr->ip_mask.v6_addrs.dst_ip, &ipv6_addr_zero_mask, + sizeof(struct in6_addr)) && + memcmp(&fltr->ip_mask.v6_addrs.dst_ip, &ipv6_addr_full_mask, + sizeof(struct in6_addr))) + goto partial_mask; + + if (fltr->ip_mask.tclass && fltr->ip_mask.tclass != U8_MAX) + goto partial_mask; + } + + if (fltr->ip_mask.proto && fltr->ip_mask.proto != U8_MAX) + goto partial_mask; + + if (fltr->ip_mask.src_port && fltr->ip_mask.src_port != htons(U16_MAX)) + goto partial_mask; + + if (fltr->ip_mask.dst_port && fltr->ip_mask.dst_port != htons(U16_MAX)) + goto partial_mask; + + if (fltr->ip_mask.spi && fltr->ip_mask.spi != htonl(U32_MAX)) + goto partial_mask; + + if (fltr->ip_mask.l4_header && + fltr->ip_mask.l4_header != htonl(U32_MAX)) + goto partial_mask; + + return 0; + +partial_mask: + dev_err(&adapter->pdev->dev, "Failed to add Flow Director filter, partial masks are not supported\n"); + return -EOPNOTSUPP; +} + /** * iavf_pkt_udp_no_pay_len - the length of UDP packet without payload * @fltr: Flow Director filter data structure @@ -263,8 +336,6 @@ iavf_fill_fdir_ip4_hdr(struct iavf_fdir_fltr *fltr, VIRTCHNL_ADD_PROTO_HDR_FIELD_BIT(hdr, IPV4, DST); } - fltr->ip_ver = 4; - return 0; } @@ -309,8 +380,6 @@ iavf_fill_fdir_ip6_hdr(struct iavf_fdir_fltr *fltr, VIRTCHNL_ADD_PROTO_HDR_FIELD_BIT(hdr, IPV6, DST); } - fltr->ip_ver = 6; - return 0; } diff --git a/drivers/net/ethernet/intel/iavf/iavf_fdir.h b/drivers/net/ethernet/intel/iavf/iavf_fdir.h index 33c55c366315..9eb9f73f6adf 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_fdir.h +++ b/drivers/net/ethernet/intel/iavf/iavf_fdir.h @@ -110,6 +110,8 @@ struct iavf_fdir_fltr { struct virtchnl_fdir_add vc_add_msg; }; +int iavf_validate_fdir_fltr_masks(struct iavf_adapter *adapter, + struct iavf_fdir_fltr *fltr); int iavf_fill_fdir_add_msg(struct iavf_adapter *adapter, struct iavf_fdir_fltr *fltr); void iavf_print_fdir_fltr(struct iavf_adapter *adapter, struct iavf_fdir_fltr *fltr); bool iavf_fdir_is_dup_fltr(struct iavf_adapter *adapter, struct iavf_fdir_fltr *fltr); From 2f2beb8874cb0844e84ad26e990f05f4f13ff63f Mon Sep 17 00:00:00 2001 From: Andrii Staikov Date: Wed, 2 Aug 2023 09:47:32 +0200 Subject: [PATCH 40/49] i40e: fix misleading debug logs Change "write" into the actual "read" word. Change parameters description. Fixes: 7073f46e443e ("i40e: Add AQ commands for NVM Update for X722") Signed-off-by: Aleksandr Loktionov Signed-off-by: Andrii Staikov Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_nvm.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_nvm.c b/drivers/net/ethernet/intel/i40e/i40e_nvm.c index 9da0c87f0328..f99c1f7fec40 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_nvm.c +++ b/drivers/net/ethernet/intel/i40e/i40e_nvm.c @@ -210,11 +210,11 @@ read_nvm_exit: * @hw: pointer to the HW structure. * @module_pointer: module pointer location in words from the NVM beginning * @offset: offset in words from module start - * @words: number of words to write - * @data: buffer with words to write to the Shadow RAM + * @words: number of words to read + * @data: buffer with words to read to the Shadow RAM * @last_command: tells the AdminQ that this is the last command * - * Writes a 16 bit words buffer to the Shadow RAM using the admin command. + * Reads a 16 bit words buffer to the Shadow RAM using the admin command. **/ static int i40e_read_nvm_aq(struct i40e_hw *hw, u8 module_pointer, u32 offset, @@ -234,18 +234,18 @@ static int i40e_read_nvm_aq(struct i40e_hw *hw, */ if ((offset + words) > hw->nvm.sr_size) i40e_debug(hw, I40E_DEBUG_NVM, - "NVM write error: offset %d beyond Shadow RAM limit %d\n", + "NVM read error: offset %d beyond Shadow RAM limit %d\n", (offset + words), hw->nvm.sr_size); else if (words > I40E_SR_SECTOR_SIZE_IN_WORDS) - /* We can write only up to 4KB (one sector), in one AQ write */ + /* We can read only up to 4KB (one sector), in one AQ write */ i40e_debug(hw, I40E_DEBUG_NVM, - "NVM write fail error: tried to write %d words, limit is %d.\n", + "NVM read fail error: tried to read %d words, limit is %d.\n", words, I40E_SR_SECTOR_SIZE_IN_WORDS); else if (((offset + (words - 1)) / I40E_SR_SECTOR_SIZE_IN_WORDS) != (offset / I40E_SR_SECTOR_SIZE_IN_WORDS)) - /* A single write cannot spread over two sectors */ + /* A single read cannot spread over two sectors */ i40e_debug(hw, I40E_DEBUG_NVM, - "NVM write error: cannot spread over two sectors in a single write offset=%d words=%d\n", + "NVM read error: cannot spread over two sectors in a single read offset=%d words=%d\n", offset, words); else ret_code = i40e_aq_read_nvm(hw, module_pointer, From 34a79876d9f77e971115236bcf7b5d14a8ecf542 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Tue, 1 Aug 2023 20:41:03 +0300 Subject: [PATCH 41/49] net/mlx5e: XDP, Fix fifo overrun on XDP_REDIRECT Before this fix, running high rate traffic through XDP_REDIRECT with multibuf could overrun the fifo used to release the xdp frames after tx completion. This resulted in corrupted data being consumed on the free side. The culplirt was a miscalculation of the fifo size: the maximum ratio between fifo entries / data segments was incorrect. This ratio serves to calculate the max fifo size for a full sq where each packet uses the worst case number of entries in the fifo. This patch fixes the formula and names the constant. It also makes sure that future values will use a power of 2 number of entries for the fifo mask to work. Signed-off-by: Dragos Tatulea Fixes: 3f734b8c594b ("net/mlx5e: XDP, Use multiple single-entry objects in xdpi_fifo") Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h | 2 ++ drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 8 +++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h index 9e8e6184f9e4..ecfe93a479da 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h @@ -84,6 +84,8 @@ enum mlx5e_xdp_xmit_mode { * MLX5E_XDP_XMIT_MODE_XSK: * none. */ +#define MLX5E_XDP_FIFO_ENTRIES2DS_MAX_RATIO 4 + union mlx5e_xdp_info { enum mlx5e_xdp_xmit_mode mode; union { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index c27df14df145..f7b494125eee 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1298,11 +1298,13 @@ static int mlx5e_alloc_xdpsq_fifo(struct mlx5e_xdpsq *sq, int numa) { struct mlx5e_xdp_info_fifo *xdpi_fifo = &sq->db.xdpi_fifo; int wq_sz = mlx5_wq_cyc_get_size(&sq->wq); - int entries = wq_sz * MLX5_SEND_WQEBB_NUM_DS * 2; /* upper bound for maximum num of - * entries of all xmit_modes. - */ + int entries; size_t size; + /* upper bound for maximum num of entries of all xmit_modes. */ + entries = roundup_pow_of_two(wq_sz * MLX5_SEND_WQEBB_NUM_DS * + MLX5E_XDP_FIFO_ENTRIES2DS_MAX_RATIO); + size = array_size(sizeof(*xdpi_fifo->xi), entries); xdpi_fifo->xi = kvzalloc_node(size, GFP_KERNEL, numa); if (!xdpi_fifo->xi) From 0fd23db0cc74cf6d28d26ce5e7802e982608d830 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Sun, 25 Jun 2023 10:43:03 +0300 Subject: [PATCH 42/49] net/mlx5: Fix mlx5_cmd_update_root_ft() error flow The cited patch change mlx5_cmd_update_root_ft() to work with multiple peer devices. However, it didn't align the error flow as well. Hence, Fix the error code to work with multiple peer devices. Fixes: 222dd185833e ("{net/RDMA}/mlx5: introduce lag_for_each_peer") Signed-off-by: Shay Drory Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index aab7059bf6e9..244cfd470903 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -245,12 +245,20 @@ static int mlx5_cmd_update_root_ft(struct mlx5_flow_root_namespace *ns, mlx5_lag_is_shared_fdb(dev) && mlx5_lag_is_master(dev)) { struct mlx5_core_dev *peer_dev; - int i; + int i, j; mlx5_lag_for_each_peer_mdev(dev, peer_dev, i) { err = mlx5_cmd_set_slave_root_fdb(dev, peer_dev, !disconnect, (!disconnect) ? ft->id : 0); if (err && !disconnect) { + mlx5_lag_for_each_peer_mdev(dev, peer_dev, j) { + if (j < i) + mlx5_cmd_set_slave_root_fdb(dev, peer_dev, 1, + ns->root_ft->id); + else + break; + } + MLX5_SET(set_flow_table_root_in, in, op_mod, 0); MLX5_SET(set_flow_table_root_in, in, table_id, ns->root_ft->id); From 23d775f12dcd23d052a4927195f15e970e27ab26 Mon Sep 17 00:00:00 2001 From: Alfred Lee Date: Mon, 14 Aug 2023 17:13:23 -0700 Subject: [PATCH 43/49] net: dsa: mv88e6xxx: Wait for EEPROM done before HW reset If the switch is reset during active EEPROM transactions, as in just after an SoC reset after power up, the I2C bus transaction may be cut short leaving the EEPROM internal I2C state machine in the wrong state. When the switch is reset again, the bad state machine state may result in data being read from the wrong memory location causing the switch to enter unexpected mode rendering it inoperational. Fixes: a3dcb3e7e70c ("net: dsa: mv88e6xxx: Wait for EEPROM done after HW reset") Signed-off-by: Alfred Lee Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20230815001323.24739-1-l00g33k@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/chip.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index c7d51a539451..7af2f08a62f1 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -3034,6 +3034,14 @@ static void mv88e6xxx_hardware_reset(struct mv88e6xxx_chip *chip) /* If there is a GPIO connected to the reset pin, toggle it */ if (gpiod) { + /* If the switch has just been reset and not yet completed + * loading EEPROM, the reset may interrupt the I2C transaction + * mid-byte, causing the first EEPROM read after the reset + * from the wrong location resulting in the switch booting + * to wrong mode and inoperable. + */ + mv88e6xxx_g1_wait_eeprom_done(chip); + gpiod_set_value_cansleep(gpiod, 1); usleep_range(10000, 20000); gpiod_set_value_cansleep(gpiod, 0); From fa165e1949976704500a442faeef8d9596faee76 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Tue, 15 Aug 2023 16:57:27 +0100 Subject: [PATCH 44/49] sfc: don't unregister flow_indr if it was never registered In efx_init_tc(), move the setting of efx->tc->up after the flow_indr_dev_register() call, so that if it fails, efx_fini_tc() won't call flow_indr_dev_unregister(). Fixes: 5b2e12d51bd8 ("sfc: bind indirect blocks for TC offload on EF100") Suggested-by: Pieter Jansen van Vuuren Reviewed-by: Martin Habets Signed-off-by: Edward Cree Link: https://lore.kernel.org/r/a81284d7013aba74005277bd81104e4cfbea3f6f.1692114888.git.ecree.xilinx@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/sfc/tc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sfc/tc.c b/drivers/net/ethernet/sfc/tc.c index 15ebd3973922..fe268b6c1cac 100644 --- a/drivers/net/ethernet/sfc/tc.c +++ b/drivers/net/ethernet/sfc/tc.c @@ -1657,10 +1657,10 @@ int efx_init_tc(struct efx_nic *efx) rc = efx_tc_configure_fallback_acts_reps(efx); if (rc) return rc; - efx->tc->up = true; rc = flow_indr_dev_register(efx_tc_indr_setup_cb, efx); if (rc) return rc; + efx->tc->up = true; return 0; } From 54c9016eb8eda55952a195b071359cd13f50ed9b Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Tue, 15 Aug 2023 16:57:28 +0100 Subject: [PATCH 45/49] sfc: don't fail probe if MAE/TC setup fails Existing comment in the source explains why we don't want efx_init_tc() failure to be fatal. Cited commit erroneously consolidated failure paths causing the probe to be failed in this case. Fixes: 7e056e2360d9 ("sfc: obtain device mac address based on firmware handle for ef100") Reviewed-by: Martin Habets Signed-off-by: Edward Cree Link: https://lore.kernel.org/r/aa7f589dd6028bd1ad49f0a85f37ab33c09b2b45.1692114888.git.ecree.xilinx@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/sfc/ef100_nic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sfc/ef100_nic.c b/drivers/net/ethernet/sfc/ef100_nic.c index 7adde9639c8a..35d8e9811998 100644 --- a/drivers/net/ethernet/sfc/ef100_nic.c +++ b/drivers/net/ethernet/sfc/ef100_nic.c @@ -1194,7 +1194,7 @@ int ef100_probe_netdev_pf(struct efx_nic *efx) net_dev->features |= NETIF_F_HW_TC; efx->fixed_features |= NETIF_F_HW_TC; } - return rc; + return 0; } int ef100_probe_vf(struct efx_nic *efx) From 2d0c88e84e483982067a82073f6125490ddf3614 Mon Sep 17 00:00:00 2001 From: Abel Wu Date: Wed, 16 Aug 2023 17:12:22 +0800 Subject: [PATCH 46/49] sock: Fix misuse of sk_under_memory_pressure() The status of global socket memory pressure is updated when: a) __sk_mem_raise_allocated(): enter: sk_memory_allocated(sk) > sysctl_mem[1] leave: sk_memory_allocated(sk) <= sysctl_mem[0] b) __sk_mem_reduce_allocated(): leave: sk_under_memory_pressure(sk) && sk_memory_allocated(sk) < sysctl_mem[0] So the conditions of leaving global pressure are inconstant, which may lead to the situation that one pressured net-memcg prevents the global pressure from being cleared when there is indeed no global pressure, thus the global constrains are still in effect unexpectedly on the other sockets. This patch fixes this by ignoring the net-memcg's pressure when deciding whether should leave global memory pressure. Fixes: e1aab161e013 ("socket: initial cgroup code.") Signed-off-by: Abel Wu Acked-by: Shakeel Butt Link: https://lore.kernel.org/r/20230816091226.1542-1-wuyun.abel@bytedance.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 6 ++++++ net/core/sock.c | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/net/sock.h b/include/net/sock.h index 2eb916d1ff64..e3d987b2ef12 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1420,6 +1420,12 @@ static inline bool sk_has_memory_pressure(const struct sock *sk) return sk->sk_prot->memory_pressure != NULL; } +static inline bool sk_under_global_memory_pressure(const struct sock *sk) +{ + return sk->sk_prot->memory_pressure && + !!*sk->sk_prot->memory_pressure; +} + static inline bool sk_under_memory_pressure(const struct sock *sk) { if (!sk->sk_prot->memory_pressure) diff --git a/net/core/sock.c b/net/core/sock.c index 732fc37a4771..c9cffb7acbea 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3159,7 +3159,7 @@ void __sk_mem_reduce_allocated(struct sock *sk, int amount) if (mem_cgroup_sockets_enabled && sk->sk_memcg) mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); - if (sk_under_memory_pressure(sk) && + if (sk_under_global_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) sk_leave_memory_pressure(sk); } From b616be6b97688f2f2bd7c4a47ab32f27f94fb2a9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2023 14:21:58 +0000 Subject: [PATCH 47/49] net: do not allow gso_size to be set to GSO_BY_FRAGS One missing check in virtio_net_hdr_to_skb() allowed syzbot to crash kernels again [1] Do not allow gso_size to be set to GSO_BY_FRAGS (0xffff), because this magic value is used by the kernel. [1] general protection fault, probably for non-canonical address 0xdffffc000000000e: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000070-0x0000000000000077] CPU: 0 PID: 5039 Comm: syz-executor401 Not tainted 6.5.0-rc5-next-20230809-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/26/2023 RIP: 0010:skb_segment+0x1a52/0x3ef0 net/core/skbuff.c:4500 Code: 00 00 00 e9 ab eb ff ff e8 6b 96 5d f9 48 8b 84 24 00 01 00 00 48 8d 78 70 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 04 02 84 c0 74 08 3c 03 0f 8e ea 21 00 00 48 8b 84 24 00 01 RSP: 0018:ffffc90003d3f1c8 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: 000000000001fffe RCX: 0000000000000000 RDX: 000000000000000e RSI: ffffffff882a3115 RDI: 0000000000000070 RBP: ffffc90003d3f378 R08: 0000000000000005 R09: 000000000000ffff R10: 000000000000ffff R11: 5ee4a93e456187d6 R12: 000000000001ffc6 R13: dffffc0000000000 R14: 0000000000000008 R15: 000000000000ffff FS: 00005555563f2380(0000) GS:ffff8880b9800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020020000 CR3: 000000001626d000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: udp6_ufo_fragment+0x9d2/0xd50 net/ipv6/udp_offload.c:109 ipv6_gso_segment+0x5c4/0x17b0 net/ipv6/ip6_offload.c:120 skb_mac_gso_segment+0x292/0x610 net/core/gso.c:53 __skb_gso_segment+0x339/0x710 net/core/gso.c:124 skb_gso_segment include/net/gso.h:83 [inline] validate_xmit_skb+0x3a5/0xf10 net/core/dev.c:3625 __dev_queue_xmit+0x8f0/0x3d60 net/core/dev.c:4329 dev_queue_xmit include/linux/netdevice.h:3082 [inline] packet_xmit+0x257/0x380 net/packet/af_packet.c:276 packet_snd net/packet/af_packet.c:3087 [inline] packet_sendmsg+0x24c7/0x5570 net/packet/af_packet.c:3119 sock_sendmsg_nosec net/socket.c:727 [inline] sock_sendmsg+0xd9/0x180 net/socket.c:750 ____sys_sendmsg+0x6ac/0x940 net/socket.c:2496 ___sys_sendmsg+0x135/0x1d0 net/socket.c:2550 __sys_sendmsg+0x117/0x1e0 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x38/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7ff27cdb34d9 Fixes: 3953c46c3ac7 ("sk_buff: allow segmenting based on frag sizes") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Xin Long Cc: "Michael S. Tsirkin" Cc: Jason Wang Reviewed-by: Willem de Bruijn Reviewed-by: Marcelo Ricardo Leitner Reviewed-by: Xuan Zhuo Link: https://lore.kernel.org/r/20230816142158.1779798-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/virtio_net.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index bdf8de2cdd93..7b4dd69555e4 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -155,6 +155,10 @@ retry: if (gso_type & SKB_GSO_UDP) nh_off -= thlen; + /* Kernel has a special handling for GSO_BY_FRAGS. */ + if (gso_size == GSO_BY_FRAGS) + return -EINVAL; + /* Too small packets are not really GSO ones. */ if (skb->len - nh_off > gso_size) { shinfo->gso_size = gso_size; From 2eb9625a3a32251ecea470cd576659a3a03b4e59 Mon Sep 17 00:00:00 2001 From: Manish Chopra Date: Wed, 16 Aug 2023 20:37:11 +0530 Subject: [PATCH 48/49] qede: fix firmware halt over suspend and resume While performing certain power-off sequences, PCI drivers are called to suspend and resume their underlying devices through PCI PM (power management) interface. However this NIC hardware does not support PCI PM suspend/resume operations so system wide suspend/resume leads to bad MFW (management firmware) state which causes various follow-up errors in driver when communicating with the device/firmware afterwards. To fix this driver implements PCI PM suspend handler to indicate unsupported operation to the PCI subsystem explicitly, thus avoiding system to go into suspended/standby mode. Without this fix device/firmware does not recover unless system is power cycled. Fixes: 2950219d87b0 ("qede: Add basic network device support") Signed-off-by: Manish Chopra Signed-off-by: Alok Prasad Reviewed-by: John Meneghini Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230816150711.59035-1-manishc@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/qlogic/qede/qede_main.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index 4b004a728190..99df00c30b8c 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -176,6 +176,15 @@ static int qede_sriov_configure(struct pci_dev *pdev, int num_vfs_param) } #endif +static int __maybe_unused qede_suspend(struct device *dev) +{ + dev_info(dev, "Device does not support suspend operation\n"); + + return -EOPNOTSUPP; +} + +static DEFINE_SIMPLE_DEV_PM_OPS(qede_pm_ops, qede_suspend, NULL); + static const struct pci_error_handlers qede_err_handler = { .error_detected = qede_io_error_detected, }; @@ -190,6 +199,7 @@ static struct pci_driver qede_pci_driver = { .sriov_configure = qede_sriov_configure, #endif .err_handler = &qede_err_handler, + .driver.pm = &qede_pm_ops, }; static struct qed_eth_cb_ops qede_ll_ops = { From 43d00e102d9ecbe2635d7e3f2e14d2e90183d6af Mon Sep 17 00:00:00 2001 From: Marcin Szycik Date: Wed, 16 Aug 2023 12:34:05 -0700 Subject: [PATCH 49/49] ice: Block switchdev mode when ADQ is active and vice versa ADQ and switchdev are not supported simultaneously. Enabling both at the same time can result in nullptr dereference. To prevent this, check if ADQ is active when changing devlink mode to switchdev mode, and check if switchdev is active when enabling ADQ. Fixes: fbc7b27af0f9 ("ice: enable ndo_setup_tc support for mqprio_qdisc") Signed-off-by: Marcin Szycik Reviewed-by: Przemek Kitszel Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230816193405.1307580-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_eswitch.c | 6 ++++++ drivers/net/ethernet/intel/ice/ice_main.c | 5 +++++ 2 files changed, 11 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_eswitch.c b/drivers/net/ethernet/intel/ice/ice_eswitch.c index ad0a007b7398..8f232c41a89e 100644 --- a/drivers/net/ethernet/intel/ice/ice_eswitch.c +++ b/drivers/net/ethernet/intel/ice/ice_eswitch.c @@ -538,6 +538,12 @@ ice_eswitch_mode_set(struct devlink *devlink, u16 mode, break; case DEVLINK_ESWITCH_MODE_SWITCHDEV: { + if (ice_is_adq_active(pf)) { + dev_err(ice_pf_to_dev(pf), "Couldn't change eswitch mode to switchdev - ADQ is active. Delete ADQ configs and try again, e.g. tc qdisc del dev $PF root"); + NL_SET_ERR_MSG_MOD(extack, "Couldn't change eswitch mode to switchdev - ADQ is active. Delete ADQ configs and try again, e.g. tc qdisc del dev $PF root"); + return -EOPNOTSUPP; + } + dev_info(ice_pf_to_dev(pf), "PF %d changed eswitch mode to switchdev", pf->hw.pf_id); NL_SET_ERR_MSG_MOD(extack, "Changed eswitch mode to switchdev"); diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index cf92c39467c8..b40dfe6ae321 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -8823,6 +8823,11 @@ ice_setup_tc(struct net_device *netdev, enum tc_setup_type type, ice_setup_tc_block_cb, np, np, true); case TC_SETUP_QDISC_MQPRIO: + if (ice_is_eswitch_mode_switchdev(pf)) { + netdev_err(netdev, "TC MQPRIO offload not supported, switchdev is enabled\n"); + return -EOPNOTSUPP; + } + if (pf->adev) { mutex_lock(&pf->adev_mutex); device_lock(&pf->adev->dev);