From 9b49f55838b1f42b2624216ce494f0536669e8f0 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Sun, 11 Aug 2024 11:56:42 +0300 Subject: [PATCH 01/13] xfrm: Remove documentation WARN_ON to limit return values for offloaded SA The original idea to put WARN_ON() on return value from driver code was to make sure that packet offload doesn't have silent fallback to SW implementation, like crypto offload has. In reality, this is not needed as all *swan implementations followed this request and used explicit configuration style to make sure that "users will get what they ask". So instead of forcing drivers to make sure that even their internal flows don't return -EOPNOTSUPP, let's remove this WARN_ON. Signed-off-by: Patrisious Haddad Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_device.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 9a44d363ba62..f123b7c9ec82 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -328,12 +328,8 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, /* User explicitly requested packet offload mode and configured * policy in addition to the XFRM state. So be civil to users, * and return an error instead of taking fallback path. - * - * This WARN_ON() can be seen as a documentation for driver - * authors to do not return -EOPNOTSUPP in packet offload mode. */ - WARN_ON(err == -EOPNOTSUPP && is_packet_offload); - if (err != -EOPNOTSUPP || is_packet_offload) { + if ((err != -EOPNOTSUPP && !is_packet_offload) || is_packet_offload) { NL_SET_ERR_MSG_WEAK(extack, "Device failed to offload this state"); return err; } From 6ad8bc92a47702f0b5d0b96b680199910b89f688 Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Thu, 15 Aug 2024 13:21:14 -0400 Subject: [PATCH 02/13] net: add copy from skb_seq_state to buffer function Add an skb helper function to copy a range of bytes from within an existing skb_seq_state. Signed-off-by: Christian Hopps Signed-off-by: Steffen Klassert --- include/linux/skbuff.h | 1 + net/core/skbuff.c | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 29c3ea5b6e93..a871533b8568 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1433,6 +1433,7 @@ void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, unsigned int skb_seq_read(unsigned int consumed, const u8 **data, struct skb_seq_state *st); void skb_abort_seq_read(struct skb_seq_state *st); +int skb_copy_seq_read(struct skb_seq_state *st, int offset, void *to, int len); unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, unsigned int to, struct ts_config *config); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 83f8cd8aa2d1..fe4b2dc5c19b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4409,6 +4409,41 @@ void skb_abort_seq_read(struct skb_seq_state *st) } EXPORT_SYMBOL(skb_abort_seq_read); +/** + * skb_copy_seq_read() - copy from a skb_seq_state to a buffer + * @st: source skb_seq_state + * @offset: offset in source + * @to: destination buffer + * @len: number of bytes to copy + * + * Copy @len bytes from @offset bytes into the source @st to the destination + * buffer @to. `offset` should increase (or be unchanged) with each subsequent + * call to this function. If offset needs to decrease from the previous use `st` + * should be reset first. + * + * Return: 0 on success or -EINVAL if the copy ended early + */ +int skb_copy_seq_read(struct skb_seq_state *st, int offset, void *to, int len) +{ + const u8 *data; + u32 sqlen; + + for (;;) { + sqlen = skb_seq_read(offset, &data, st); + if (sqlen == 0) + return -EINVAL; + if (sqlen >= len) { + memcpy(to, data, len); + return 0; + } + memcpy(to, data, sqlen); + to += sqlen; + offset += sqlen; + len -= sqlen; + } +} +EXPORT_SYMBOL(skb_copy_seq_read); + #define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, From 54f2f78d6b9f1f90e91aaf5dbb34a6198f65fdfd Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 21 Aug 2024 16:30:13 +0100 Subject: [PATCH 03/13] xfrm: Correct spelling in xfrm.h Correct spelling in xfrm.h. As reported by codespell. Signed-off-by: Simon Horman Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 54cef89f6c1e..f7244ac4fa08 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -79,7 +79,7 @@ policy entry has list of up to XFRM_MAX_DEPTH transformations, described by templates xfrm_tmpl. Each template is resolved to a complete xfrm_state (see below) and we pack bundle of transformations - to a dst_entry returned to requestor. + to a dst_entry returned to requester. dst -. xfrm .-> xfrm_state #1 |---. child .-> dst -. xfrm .-> xfrm_state #2 @@ -1016,7 +1016,7 @@ void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev); struct xfrm_if_parms { int link; /* ifindex of underlying L2 interface */ - u32 if_id; /* interface identifyer */ + u32 if_id; /* interface identifier */ bool collect_md; }; From 9c5b6d4e33dd78a0511ec34756d783b7e36028c2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 22 Aug 2024 15:04:29 +0200 Subject: [PATCH 04/13] selftests: add xfrm policy insertion speed test script Nothing special, just test how long insertion of x policies takes. This should ideally show linear insertion speeds. Do not run this by default, it has little value, but it can be useful to check for insertion speed chahnges when altering the xfrm policy db implementation. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- tools/testing/selftests/net/Makefile | 2 +- .../selftests/net/xfrm_policy_add_speed.sh | 83 +++++++++++++++++++ 2 files changed, 84 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/net/xfrm_policy_add_speed.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 8eaffd7a641c..e127a80ff713 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -56,7 +56,7 @@ TEST_PROGS += ip_local_port_range.sh TEST_PROGS += rps_default_mask.sh TEST_PROGS += big_tcp.sh TEST_PROGS += netns-sysctl.sh -TEST_PROGS_EXTENDED := toeplitz_client.sh toeplitz.sh +TEST_PROGS_EXTENDED := toeplitz_client.sh toeplitz.sh xfrm_policy_add_speed.sh TEST_GEN_FILES = socket nettest TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any TEST_GEN_FILES += tcp_mmap tcp_inq psock_snd txring_overwrite diff --git a/tools/testing/selftests/net/xfrm_policy_add_speed.sh b/tools/testing/selftests/net/xfrm_policy_add_speed.sh new file mode 100755 index 000000000000..2fab29d3cb91 --- /dev/null +++ b/tools/testing/selftests/net/xfrm_policy_add_speed.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +source lib.sh + +timeout=4m +ret=0 +tmp=$(mktemp) +cleanup() { + cleanup_all_ns + rm -f "$tmp" +} + +trap cleanup EXIT + +maxpolicies=100000 +[ "$KSFT_MACHINE_SLOW" = "yes" ] && maxpolicies=10000 + +do_dummies4() { + local dir="$1" + local max="$2" + + local policies + local pfx + pfx=30 + policies=0 + + ip netns exec "$ns" ip xfrm policy flush + + for i in $(seq 1 100);do + local s + local d + for j in $(seq 1 255);do + s=$((i+0)) + d=$((i+100)) + + for a in $(seq 1 8 255); do + policies=$((policies+1)) + [ "$policies" -gt "$max" ] && return + echo xfrm policy add src 10.$s.$j.0/30 dst 10.$d.$j.$a/$pfx dir $dir action block + done + for a in $(seq 1 8 255); do + policies=$((policies+1)) + [ "$policies" -gt "$max" ] && return + echo xfrm policy add src 10.$s.$j.$a/30 dst 10.$d.$j.0/$pfx dir $dir action block + done + done + done +} + +setup_ns ns + +do_bench() +{ + local max="$1" + + start=$(date +%s%3N) + do_dummies4 "out" "$max" > "$tmp" + if ! timeout "$timeout" ip netns exec "$ns" ip -batch "$tmp";then + echo "WARNING: policy insertion cancelled after $timeout" + ret=1 + fi + stop=$(date +%s%3N) + + result=$((stop-start)) + + policies=$(wc -l < "$tmp") + printf "Inserted %-06s policies in $result ms\n" $policies + + have=$(ip netns exec "$ns" ip xfrm policy show | grep "action block" | wc -l) + if [ "$have" -ne "$policies" ]; then + echo "WARNING: mismatch, have $have policies, expected $policies" + ret=1 + fi +} + +p=100 +while [ $p -le "$maxpolicies" ]; do + do_bench "$p" + p="${p}0" +done + +exit $ret From 33f611cf7d52ffdaebdcd6e42672ca1d06e22974 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 22 Aug 2024 15:04:30 +0200 Subject: [PATCH 05/13] xfrm: policy: don't iterate inexact policies twice at insert time Since commit 6be3b0db6db8 ("xfrm: policy: add inexact policy search tree infrastructure") policy lookup no longer walks a list but has a set of candidate lists. This set has to be searched for the best match. In case there are several matches, the priority wins. If the priority is also the same, then the historic behaviour with a single list was to return the first match (first-in-list). With introduction of serval lists, this doesn't work and a new 'pos' member was added that reflects the xfrm_policy structs position in the list. This value is not exported to userspace and it does not need to be the 'position in the list', it just needs to make sure that a->pos < b->pos means that a was added to the lists more recently than b. This re-walk is expensive when many inexact policies are in use. Speed this up: when appending the policy to the end of the walker list, then just take the ->pos value of the last entry made and add 1. Add a slowpath version to prevent overflow, if we'd assign UINT_MAX then iterate the entire list and fix the ordering. While this speeds up insertion considerably finding the insertion spot in the inexact list still requires a partial list walk. This is addressed in followup patches. Before: ./xfrm_policy_add_speed.sh Inserted 1000 policies in 72 ms Inserted 10000 policies in 1540 ms Inserted 100000 policies in 334780 ms After: Inserted 1000 policies in 68 ms Inserted 10000 policies in 1137 ms Inserted 100000 policies in 157307 ms Reported-by: Noel Kuntze Cc: Tobias Brunner Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 59 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 6 deletions(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index c56c61b0c12e..423d1eb24f31 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1237,6 +1237,17 @@ xfrm_policy_inexact_insert(struct xfrm_policy *policy, u8 dir, int excl) return delpol; } +static bool xfrm_policy_is_dead_or_sk(const struct xfrm_policy *policy) +{ + int dir; + + if (policy->walk.dead) + return true; + + dir = xfrm_policy_id2dir(policy->index); + return dir >= XFRM_POLICY_MAX; +} + static void xfrm_hash_rebuild(struct work_struct *work) { struct net *net = container_of(work, struct net, @@ -1524,7 +1535,6 @@ static void xfrm_policy_insert_inexact_list(struct hlist_head *chain, { struct xfrm_policy *pol, *delpol = NULL; struct hlist_node *newpos = NULL; - int i = 0; hlist_for_each_entry(pol, chain, bydst_inexact_list) { if (pol->type == policy->type && @@ -1548,11 +1558,6 @@ static void xfrm_policy_insert_inexact_list(struct hlist_head *chain, hlist_add_behind_rcu(&policy->bydst_inexact_list, newpos); else hlist_add_head_rcu(&policy->bydst_inexact_list, chain); - - hlist_for_each_entry(pol, chain, bydst_inexact_list) { - pol->pos = i; - i++; - } } static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain, @@ -2294,10 +2299,52 @@ out: return pol; } +static u32 xfrm_gen_pos_slow(struct net *net) +{ + struct xfrm_policy *policy; + u32 i = 0; + + /* oldest entry is last in list */ + list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { + if (!xfrm_policy_is_dead_or_sk(policy)) + policy->pos = ++i; + } + + return i; +} + +static u32 xfrm_gen_pos(struct net *net) +{ + const struct xfrm_policy *policy; + u32 i = 0; + + /* most recently added policy is at the head of the list */ + list_for_each_entry(policy, &net->xfrm.policy_all, walk.all) { + if (xfrm_policy_is_dead_or_sk(policy)) + continue; + + if (policy->pos == UINT_MAX) + return xfrm_gen_pos_slow(net); + + i = policy->pos + 1; + break; + } + + return i; +} + static void __xfrm_policy_link(struct xfrm_policy *pol, int dir) { struct net *net = xp_net(pol); + switch (dir) { + case XFRM_POLICY_IN: + case XFRM_POLICY_FWD: + case XFRM_POLICY_OUT: + pol->pos = xfrm_gen_pos(net); + break; + } + list_add(&pol->walk.all, &net->xfrm.policy_all); net->xfrm.policy_count[dir]++; xfrm_pol_hold(pol); From 563d5ca93e883b9dcb4b7dc8967ac569fd91820d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 22 Aug 2024 15:04:31 +0200 Subject: [PATCH 06/13] xfrm: switch migrate to xfrm_policy_lookup_bytype XFRM_MIGRATE still uses the old lookup method: first check the bydst hash table, then search the list of all the other policies. Switch MIGRATE to use the same lookup function as the packetpath. This is done to remove the last remaining users of the pernet xfrm.policy_inexact lists with the intent of removing this list. After this patch, policies are still added to the list on insertion and they are rehashed as-needed but no single API makes use of these anymore. This change is compile tested only. Cc: Tobias Brunner Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 106 +++++++++++++++-------------------------- 1 file changed, 39 insertions(+), 67 deletions(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 423d1eb24f31..d2feee60bb62 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1252,13 +1252,10 @@ static void xfrm_hash_rebuild(struct work_struct *work) { struct net *net = container_of(work, struct net, xfrm.policy_hthresh.work); - unsigned int hmask; struct xfrm_policy *pol; struct xfrm_policy *policy; struct hlist_head *chain; - struct hlist_head *odst; struct hlist_node *newpos; - int i; int dir; unsigned seq; u8 lbits4, rbits4, lbits6, rbits6; @@ -1322,23 +1319,7 @@ static void xfrm_hash_rebuild(struct work_struct *work) goto out_unlock; } - /* reset the bydst and inexact table in all directions */ for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { - struct hlist_node *n; - - hlist_for_each_entry_safe(policy, n, - &net->xfrm.policy_inexact[dir], - bydst_inexact_list) { - hlist_del_rcu(&policy->bydst); - hlist_del_init(&policy->bydst_inexact_list); - } - - hmask = net->xfrm.policy_bydst[dir].hmask; - odst = net->xfrm.policy_bydst[dir].table; - for (i = hmask; i >= 0; i--) { - hlist_for_each_entry_safe(policy, n, odst + i, bydst) - hlist_del_rcu(&policy->bydst); - } if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) { /* dir out => dst = remote, src = local */ net->xfrm.policy_bydst[dir].dbits4 = rbits4; @@ -1363,6 +1344,10 @@ static void xfrm_hash_rebuild(struct work_struct *work) /* skip socket policies */ continue; } + + hlist_del_rcu(&policy->bydst); + hlist_del_init(&policy->bydst_inexact_list); + newpos = NULL; chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); @@ -4484,63 +4469,50 @@ EXPORT_SYMBOL_GPL(xfrm_audit_policy_delete); #endif #ifdef CONFIG_XFRM_MIGRATE -static bool xfrm_migrate_selector_match(const struct xfrm_selector *sel_cmp, - const struct xfrm_selector *sel_tgt) -{ - if (sel_cmp->proto == IPSEC_ULPROTO_ANY) { - if (sel_tgt->family == sel_cmp->family && - xfrm_addr_equal(&sel_tgt->daddr, &sel_cmp->daddr, - sel_cmp->family) && - xfrm_addr_equal(&sel_tgt->saddr, &sel_cmp->saddr, - sel_cmp->family) && - sel_tgt->prefixlen_d == sel_cmp->prefixlen_d && - sel_tgt->prefixlen_s == sel_cmp->prefixlen_s) { - return true; - } - } else { - if (memcmp(sel_tgt, sel_cmp, sizeof(*sel_tgt)) == 0) { - return true; - } - } - return false; -} - static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *sel, u8 dir, u8 type, struct net *net, u32 if_id) { struct xfrm_policy *pol, *ret = NULL; - struct hlist_head *chain; - u32 priority = ~0U; + struct flowi fl; - spin_lock_bh(&net->xfrm.xfrm_policy_lock); - chain = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir); - hlist_for_each_entry(pol, chain, bydst) { - if ((if_id == 0 || pol->if_id == if_id) && - xfrm_migrate_selector_match(sel, &pol->selector) && - pol->type == type) { - ret = pol; - priority = ret->priority; - break; - } - } - chain = &net->xfrm.policy_inexact[dir]; - hlist_for_each_entry(pol, chain, bydst_inexact_list) { - if ((pol->priority >= priority) && ret) - break; + memset(&fl, 0, sizeof(fl)); - if ((if_id == 0 || pol->if_id == if_id) && - xfrm_migrate_selector_match(sel, &pol->selector) && - pol->type == type) { - ret = pol; + fl.flowi_proto = sel->proto; + + switch (sel->family) { + case AF_INET: + fl.u.ip4.saddr = sel->saddr.a4; + fl.u.ip4.daddr = sel->daddr.a4; + if (sel->proto == IPSEC_ULPROTO_ANY) break; - } + fl.u.flowi4_oif = sel->ifindex; + fl.u.ip4.fl4_sport = sel->sport; + fl.u.ip4.fl4_dport = sel->dport; + break; + case AF_INET6: + fl.u.ip6.saddr = sel->saddr.in6; + fl.u.ip6.daddr = sel->daddr.in6; + if (sel->proto == IPSEC_ULPROTO_ANY) + break; + fl.u.flowi6_oif = sel->ifindex; + fl.u.ip6.fl4_sport = sel->sport; + fl.u.ip6.fl4_dport = sel->dport; + break; + default: + return ERR_PTR(-EAFNOSUPPORT); } - xfrm_pol_hold(ret); + rcu_read_lock(); - spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + pol = xfrm_policy_lookup_bytype(net, type, &fl, sel->family, dir, if_id); + if (IS_ERR_OR_NULL(pol)) + goto out_unlock; - return ret; + if (!xfrm_pol_hold_rcu(ret)) + pol = NULL; +out_unlock: + rcu_read_unlock(); + return pol; } static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tmpl *t) @@ -4677,9 +4649,9 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, /* Stage 1 - find policy */ pol = xfrm_migrate_policy_find(sel, dir, type, net, if_id); - if (!pol) { + if (IS_ERR_OR_NULL(pol)) { NL_SET_ERR_MSG(extack, "Target policy not found"); - err = -ENOENT; + err = IS_ERR(pol) ? PTR_ERR(pol) : -ENOENT; goto out; } From a54ad727f74559f7c3dfcfd2a63d0ce7683a82e8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 22 Aug 2024 15:04:32 +0200 Subject: [PATCH 07/13] xfrm: policy: remove remaining use of inexact list No consumers anymore, remove it. After this, insertion of policies no longer require list walk of all inexact policies but only those that are reachable via the candidate sets. This gives almost linear insertion speeds provided the inserted policies are for non-overlapping networks. Before: Inserted 1000 policies in 70 ms Inserted 10000 policies in 1155 ms Inserted 100000 policies in 216848 ms After: Inserted 1000 policies in 56 ms Inserted 10000 policies in 478 ms Inserted 100000 policies in 4580 ms Insertion of 1m entries takes about ~40s after this change on my test vm. Cc: Noel Kuntze Cc: Tobias Brunner Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - net/xfrm/xfrm_policy.c | 38 -------------------------------------- 2 files changed, 39 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index f7244ac4fa08..1fa2da22a49e 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -555,7 +555,6 @@ struct xfrm_policy { u16 family; struct xfrm_sec_ctx *security; struct xfrm_tmpl xfrm_vec[XFRM_MAX_DEPTH]; - struct hlist_node bydst_inexact_list; struct rcu_head rcu; struct xfrm_dev_offload xdo; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index d2feee60bb62..b79ac453ea37 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -196,8 +196,6 @@ xfrm_policy_inexact_lookup_rcu(struct net *net, static struct xfrm_policy * xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy, bool excl); -static void xfrm_policy_insert_inexact_list(struct hlist_head *chain, - struct xfrm_policy *policy); static bool xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand, @@ -410,7 +408,6 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp) if (policy) { write_pnet(&policy->xp_net, net); INIT_LIST_HEAD(&policy->walk.all); - INIT_HLIST_NODE(&policy->bydst_inexact_list); INIT_HLIST_NODE(&policy->bydst); INIT_HLIST_NODE(&policy->byidx); rwlock_init(&policy->lock); @@ -1228,9 +1225,6 @@ xfrm_policy_inexact_insert(struct xfrm_policy *policy, u8 dir, int excl) return ERR_PTR(-EEXIST); } - chain = &net->xfrm.policy_inexact[dir]; - xfrm_policy_insert_inexact_list(chain, policy); - if (delpol) __xfrm_policy_inexact_prune_bin(bin, false); @@ -1346,7 +1340,6 @@ static void xfrm_hash_rebuild(struct work_struct *work) } hlist_del_rcu(&policy->bydst); - hlist_del_init(&policy->bydst_inexact_list); newpos = NULL; chain = policy_hash_bysel(net, &policy->selector, @@ -1515,36 +1508,6 @@ static const struct rhashtable_params xfrm_pol_inexact_params = { .automatic_shrinking = true, }; -static void xfrm_policy_insert_inexact_list(struct hlist_head *chain, - struct xfrm_policy *policy) -{ - struct xfrm_policy *pol, *delpol = NULL; - struct hlist_node *newpos = NULL; - - hlist_for_each_entry(pol, chain, bydst_inexact_list) { - if (pol->type == policy->type && - pol->if_id == policy->if_id && - !selector_cmp(&pol->selector, &policy->selector) && - xfrm_policy_mark_match(&policy->mark, pol) && - xfrm_sec_ctx_match(pol->security, policy->security) && - !WARN_ON(delpol)) { - delpol = pol; - if (policy->priority > pol->priority) - continue; - } else if (policy->priority >= pol->priority) { - newpos = &pol->bydst_inexact_list; - continue; - } - if (delpol) - break; - } - - if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET) - hlist_add_behind_rcu(&policy->bydst_inexact_list, newpos); - else - hlist_add_head_rcu(&policy->bydst_inexact_list, chain); -} - static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy, bool excl) @@ -2346,7 +2309,6 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, /* Socket policies are not hashed. */ if (!hlist_unhashed(&pol->bydst)) { hlist_del_rcu(&pol->bydst); - hlist_del_init(&pol->bydst_inexact_list); hlist_del(&pol->byidx); } From e7cd191f83fd899c233dfbe7dc6d96ef703dcbbd Mon Sep 17 00:00:00 2001 From: wangfe Date: Thu, 22 Aug 2024 13:02:52 -0700 Subject: [PATCH 08/13] xfrm: add SA information to the offloaded packet In packet offload mode, append Security Association (SA) information to each packet, replicating the crypto offload implementation. The XFRM_XMIT flag is set to enable packet to be returned immediately from the validate_xmit_xfrm function, thus aligning with the existing code path for packet offload mode. This SA info helps HW offload match packets to their correct security policies. The XFRM interface ID is included, which is crucial in setups with multiple XFRM interfaces where source/destination addresses alone can't pinpoint the right policy. Signed-off-by: wangfe Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_output.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index e5722c95b8bb..a12588e7b060 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -706,6 +706,8 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) struct xfrm_state *x = skb_dst(skb)->xfrm; int family; int err; + struct xfrm_offload *xo; + struct sec_path *sp; family = (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) ? x->outer_mode.family : skb_dst(skb)->ops->family; @@ -728,6 +730,25 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) kfree_skb(skb); return -EHOSTUNREACH; } + sp = secpath_set(skb); + if (!sp) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); + kfree_skb(skb); + return -ENOMEM; + } + + sp->olen++; + sp->xvec[sp->len++] = x; + xfrm_state_hold(x); + + xo = xfrm_offload(skb); + if (!xo) { + secpath_reset(skb); + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); + kfree_skb(skb); + return -EINVAL; + } + xo->flags |= XFRM_XMIT; return xfrm_output_resume(sk, skb, 0); } From 08c2182cf0b47646af7f3daa292980f95f5e1ea6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 27 Aug 2024 15:37:32 +0200 Subject: [PATCH 09/13] xfrm: policy: use recently added helper in more places No logical change intended. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index b79ac453ea37..94859b2182ec 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1276,11 +1276,7 @@ static void xfrm_hash_rebuild(struct work_struct *work) struct xfrm_pol_inexact_bin *bin; u8 dbits, sbits; - if (policy->walk.dead) - continue; - - dir = xfrm_policy_id2dir(policy->index); - if (dir >= XFRM_POLICY_MAX) + if (xfrm_policy_is_dead_or_sk(policy)) continue; if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) { @@ -1331,13 +1327,8 @@ static void xfrm_hash_rebuild(struct work_struct *work) /* re-insert all policies by order of creation */ list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { - if (policy->walk.dead) + if (xfrm_policy_is_dead_or_sk(policy)) continue; - dir = xfrm_policy_id2dir(policy->index); - if (dir >= XFRM_POLICY_MAX) { - /* skip socket policies */ - continue; - } hlist_del_rcu(&policy->bydst); From 17163f23678c7599e40758d7b96f68e3f3f2ea15 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 27 Aug 2024 15:38:23 +0200 Subject: [PATCH 10/13] xfrm: minor update to sdb and xfrm_policy comments The spd is no longer maintained as a linear list. We also haven't been caching bundles in the xfrm_policy struct since 2010. While at it, add kdoc style comments for the xfrm_policy structure and extend the description of the current rbtree based search to mention why it needs to search the candidate set. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 40 +++++++++++++++++++++++++++++++++++----- net/xfrm/xfrm_policy.c | 6 +++++- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 1fa2da22a49e..b6bfdc6416c7 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -67,13 +67,15 @@ - instance of a transformer, struct xfrm_state (=SA) - template to clone xfrm_state, struct xfrm_tmpl - SPD is plain linear list of xfrm_policy rules, ordered by priority. + SPD is organized as hash table (for policies that meet minimum address prefix + length setting, net->xfrm.policy_hthresh). Other policies are stored in + lists, sorted into rbtree ordered by destination and source address networks. + See net/xfrm/xfrm_policy.c for details. + (To be compatible with existing pfkeyv2 implementations, many rules with priority of 0x7fffffff are allowed to exist and such rules are ordered in an unpredictable way, thanks to bsd folks.) - Lookup is plain linear search until the first match with selector. - If "action" is "block", then we prohibit the flow, otherwise: if "xfrms_nr" is zero, the flow passes untransformed. Otherwise, policy entry has list of up to XFRM_MAX_DEPTH transformations, @@ -86,8 +88,6 @@ |---. child .-> dst -. xfrm .-> xfrm_state #3 |---. child .-> NULL - Bundles are cached at xrfm_policy struct (field ->bundles). - Resolution of xrfm_tmpl ----------------------- @@ -526,6 +526,36 @@ struct xfrm_policy_queue { unsigned long timeout; }; +/** + * struct xfrm_policy - xfrm policy + * @xp_net: network namespace the policy lives in + * @bydst: hlist node for SPD hash table or rbtree list + * @byidx: hlist node for index hash table + * @lock: serialize changes to policy structure members + * @refcnt: reference count, freed once it reaches 0 + * @pos: kernel internal tie-breaker to determine age of policy + * @timer: timer + * @genid: generation, used to invalidate old policies + * @priority: priority, set by userspace + * @index: policy index (autogenerated) + * @if_id: virtual xfrm interface id + * @mark: packet mark + * @selector: selector + * @lft: liftime configuration data + * @curlft: liftime state + * @walk: list head on pernet policy list + * @polq: queue to hold packets while aqcuire operaion in progress + * @bydst_reinsert: policy tree node needs to be merged + * @type: XFRM_POLICY_TYPE_MAIN or _SUB + * @action: XFRM_POLICY_ALLOW or _BLOCK + * @flags: XFRM_POLICY_LOCALOK, XFRM_POLICY_ICMP + * @xfrm_nr: number of used templates in @xfrm_vec + * @family: protocol family + * @security: SELinux security label + * @xfrm_vec: array of templates to resolve state + * @rcu: rcu head, used to defer memory release + * @xdo: hardware offload state + */ struct xfrm_policy { possible_net_t xp_net; struct hlist_node bydst; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 94859b2182ec..6336baa8a93c 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -109,7 +109,11 @@ struct xfrm_pol_inexact_node { * 4. saddr:any list from saddr tree * * This result set then needs to be searched for the policy with - * the lowest priority. If two results have same prio, youngest one wins. + * the lowest priority. If two candidates have the same priority, the + * struct xfrm_policy pos member with the lower number is used. + * + * This replicates previous single-list-search algorithm which would + * return first matching policy in the (ordered-by-priority) list. */ struct xfrm_pol_inexact_key { From 69716a3babe1165a9ab1b3770b55d303d5bb5fa3 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 9 Sep 2024 11:43:39 +0200 Subject: [PATCH 11/13] Revert "xfrm: add SA information to the offloaded packet" This reverts commit e7cd191f83fd899c233dfbe7dc6d96ef703dcbbd. While supporting xfrm interfaces in the packet offload API is needed, this patch does not do the right thing. There are more things to do to really support xfrm interfaces, so revert it for now. Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_output.c | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index a12588e7b060..e5722c95b8bb 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -706,8 +706,6 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) struct xfrm_state *x = skb_dst(skb)->xfrm; int family; int err; - struct xfrm_offload *xo; - struct sec_path *sp; family = (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) ? x->outer_mode.family : skb_dst(skb)->ops->family; @@ -730,25 +728,6 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) kfree_skb(skb); return -EHOSTUNREACH; } - sp = secpath_set(skb); - if (!sp) { - XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); - kfree_skb(skb); - return -ENOMEM; - } - - sp->olen++; - sp->xvec[sp->len++] = x; - xfrm_state_hold(x); - - xo = xfrm_offload(skb); - if (!xo) { - secpath_reset(skb); - XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); - kfree_skb(skb); - return -EINVAL; - } - xo->flags |= XFRM_XMIT; return xfrm_output_resume(sk, skb, 0); } From 6a13f5afd39d17320316dbb8dd533fe7c6a613e6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 30 Aug 2024 16:39:10 +0200 Subject: [PATCH 12/13] xfrm: policy: fix null dereference Julian Wiedmann says: > + if (!xfrm_pol_hold_rcu(ret)) Coverity spotted that ^^^ needs a s/ret/pol fix-up: > CID 1599386: Null pointer dereferences (FORWARD_NULL) > Passing null pointer "ret" to "xfrm_pol_hold_rcu", which dereferences it. Ditch the bogus 'ret' variable. Fixes: 563d5ca93e88 ("xfrm: switch migrate to xfrm_policy_lookup_bytype") Reported-by: Julian Wiedmann Closes: https://lore.kernel.org/netdev/06dc2499-c095-4bd4-aee3-a1d0e3ec87c4@gmail.com/ Signed-off-by: Florian Westphal Reviewed-by: Simon Horman Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 6336baa8a93c..31c14457fdaf 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -4429,7 +4429,7 @@ EXPORT_SYMBOL_GPL(xfrm_audit_policy_delete); static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *sel, u8 dir, u8 type, struct net *net, u32 if_id) { - struct xfrm_policy *pol, *ret = NULL; + struct xfrm_policy *pol; struct flowi fl; memset(&fl, 0, sizeof(fl)); @@ -4465,7 +4465,7 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector * if (IS_ERR_OR_NULL(pol)) goto out_unlock; - if (!xfrm_pol_hold_rcu(ret)) + if (!xfrm_pol_hold_rcu(pol)) pol = NULL; out_unlock: rcu_read_unlock(); From e62d39332d4b9400a69ba902797aa17a1a0037e7 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 29 Aug 2024 11:14:54 -0700 Subject: [PATCH 13/13] xfrm: policy: Restore dir assignments in xfrm_hash_rebuild() Clang warns (or errors with CONFIG_WERROR): net/xfrm/xfrm_policy.c:1286:8: error: variable 'dir' is uninitialized when used here [-Werror,-Wuninitialized] 1286 | if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) { | ^~~ net/xfrm/xfrm_policy.c:1257:9: note: initialize the variable 'dir' to silence this warning 1257 | int dir; | ^ | = 0 1 error generated. A recent refactoring removed some assignments to dir because xfrm_policy_is_dead_or_sk() has a dir assignment in it. However, dir is used elsewhere in xfrm_hash_rebuild(), including within loops where it needs to be reloaded for each policy. Restore the assignments before the first use of dir to fix the warning and ensure dir is properly initialized throughout the function. Fixes: 08c2182cf0b4 ("xfrm: policy: use recently added helper in more places") Acked-by: Florian Westphal Signed-off-by: Nathan Chancellor Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 31c14457fdaf..428ee83fe298 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1283,6 +1283,7 @@ static void xfrm_hash_rebuild(struct work_struct *work) if (xfrm_policy_is_dead_or_sk(policy)) continue; + dir = xfrm_policy_id2dir(policy->index); if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) { if (policy->family == AF_INET) { dbits = rbits4; @@ -1337,6 +1338,7 @@ static void xfrm_hash_rebuild(struct work_struct *work) hlist_del_rcu(&policy->bydst); newpos = NULL; + dir = xfrm_policy_id2dir(policy->index); chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);