From 5311591fbb349fe9f5c555dcba3b13a5831aa72d Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Tue, 26 Mar 2024 10:17:40 +0000 Subject: [PATCH 001/147] bpf: Add support for passing mark with bpf_fib_lookup Extend the bpf_fib_lookup() helper by making it to utilize mark if the BPF_FIB_LOOKUP_MARK flag is set. In order to pass the mark the four bytes of struct bpf_fib_lookup are used, shared with the output-only smac/dmac fields. Signed-off-by: Anton Protopopov Signed-off-by: Daniel Borkmann Reviewed-by: David Ahern Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240326101742.17421-2-aspsk@isovalent.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 20 ++++++++++++++++++-- net/core/filter.c | 12 +++++++++--- tools/include/uapi/linux/bpf.h | 20 ++++++++++++++++++-- 3 files changed, 45 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9585f5345353..96d57e483133 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3394,6 +3394,10 @@ union bpf_attr { * for the nexthop. If the src addr cannot be derived, * **BPF_FIB_LKUP_RET_NO_SRC_ADDR** is returned. In this * case, *params*->dmac and *params*->smac are not set either. + * **BPF_FIB_LOOKUP_MARK** + * Use the mark present in *params*->mark for the fib lookup. + * This option should not be used with BPF_FIB_LOOKUP_DIRECT, + * as it only has meaning for full lookups. * * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** tc cls_act programs. @@ -7120,6 +7124,7 @@ enum { BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2), BPF_FIB_LOOKUP_TBID = (1U << 3), BPF_FIB_LOOKUP_SRC = (1U << 4), + BPF_FIB_LOOKUP_MARK = (1U << 5), }; enum { @@ -7197,8 +7202,19 @@ struct bpf_fib_lookup { __u32 tbid; }; - __u8 smac[6]; /* ETH_ALEN */ - __u8 dmac[6]; /* ETH_ALEN */ + union { + /* input */ + struct { + __u32 mark; /* policy routing */ + /* 2 4-byte holes for input */ + }; + + /* output: source and dest mac */ + struct { + __u8 smac[6]; /* ETH_ALEN */ + __u8 dmac[6]; /* ETH_ALEN */ + }; + }; }; struct bpf_redir_neigh { diff --git a/net/core/filter.c b/net/core/filter.c index 0c66e4a3fc5b..1205dd777dc2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5884,7 +5884,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); } else { - fl4.flowi4_mark = 0; + if (flags & BPF_FIB_LOOKUP_MARK) + fl4.flowi4_mark = params->mark; + else + fl4.flowi4_mark = 0; fl4.flowi4_secid = 0; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_uid = sock_net_uid(net, NULL); @@ -6027,7 +6030,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res, strict); } else { - fl6.flowi6_mark = 0; + if (flags & BPF_FIB_LOOKUP_MARK) + fl6.flowi6_mark = params->mark; + else + fl6.flowi6_mark = 0; fl6.flowi6_secid = 0; fl6.flowi6_tun_key.tun_id = 0; fl6.flowi6_uid = sock_net_uid(net, NULL); @@ -6105,7 +6111,7 @@ set_fwd_params: #define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \ BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \ - BPF_FIB_LOOKUP_SRC) + BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_MARK) BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, struct bpf_fib_lookup *, params, int, plen, u32, flags) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 9585f5345353..96d57e483133 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3394,6 +3394,10 @@ union bpf_attr { * for the nexthop. If the src addr cannot be derived, * **BPF_FIB_LKUP_RET_NO_SRC_ADDR** is returned. In this * case, *params*->dmac and *params*->smac are not set either. + * **BPF_FIB_LOOKUP_MARK** + * Use the mark present in *params*->mark for the fib lookup. + * This option should not be used with BPF_FIB_LOOKUP_DIRECT, + * as it only has meaning for full lookups. * * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** tc cls_act programs. @@ -7120,6 +7124,7 @@ enum { BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2), BPF_FIB_LOOKUP_TBID = (1U << 3), BPF_FIB_LOOKUP_SRC = (1U << 4), + BPF_FIB_LOOKUP_MARK = (1U << 5), }; enum { @@ -7197,8 +7202,19 @@ struct bpf_fib_lookup { __u32 tbid; }; - __u8 smac[6]; /* ETH_ALEN */ - __u8 dmac[6]; /* ETH_ALEN */ + union { + /* input */ + struct { + __u32 mark; /* policy routing */ + /* 2 4-byte holes for input */ + }; + + /* output: source and dest mac */ + struct { + __u8 smac[6]; /* ETH_ALEN */ + __u8 dmac[6]; /* ETH_ALEN */ + }; + }; }; struct bpf_redir_neigh { From 6efec2cb06411a577125b5f531a852c08ead1209 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Tue, 26 Mar 2024 10:17:41 +0000 Subject: [PATCH 002/147] selftests/bpf: Add BPF_FIB_LOOKUP_MARK tests This patch extends the fib_lookup test suite by adding a few test cases for each IP family to test the new BPF_FIB_LOOKUP_MARK flag to the bpf_fib_lookup: * Test destination IP address selection with and without a mark and/or the BPF_FIB_LOOKUP_MARK flag set Signed-off-by: Anton Protopopov Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240326101742.17421-3-aspsk@isovalent.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/fib_lookup.c | 132 ++++++++++++++---- 1 file changed, 103 insertions(+), 29 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/fib_lookup.c b/tools/testing/selftests/bpf/prog_tests/fib_lookup.c index 3379df2d4cf2..bd7658958004 100644 --- a/tools/testing/selftests/bpf/prog_tests/fib_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/fib_lookup.c @@ -26,6 +26,17 @@ #define IPV6_TBID_ADDR "fd00::FFFF" #define IPV6_TBID_NET "fd00::" #define IPV6_TBID_DST "fd00::2" +#define MARK_NO_POLICY 33 +#define MARK 42 +#define MARK_TABLE "200" +#define IPV4_REMOTE_DST "1.2.3.4" +#define IPV4_LOCAL "10.4.0.3" +#define IPV4_GW1 "10.4.0.1" +#define IPV4_GW2 "10.4.0.2" +#define IPV6_REMOTE_DST "be:ef::b0:10" +#define IPV6_LOCAL "fd01::3" +#define IPV6_GW1 "fd01::1" +#define IPV6_GW2 "fd01::2" #define DMAC "11:11:11:11:11:11" #define DMAC_INIT { 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, } #define DMAC2 "01:01:01:01:01:01" @@ -36,9 +47,11 @@ struct fib_lookup_test { const char *daddr; int expected_ret; const char *expected_src; + const char *expected_dst; int lookup_flags; __u32 tbid; __u8 dmac[6]; + __u32 mark; }; static const struct fib_lookup_test tests[] = { @@ -90,10 +103,47 @@ static const struct fib_lookup_test tests[] = { .daddr = IPV6_ADDR_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, .expected_src = IPV6_IFACE_ADDR_SEC, .lookup_flags = BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_SKIP_NEIGH, }, + /* policy routing */ + { .desc = "IPv4 policy routing, default", + .daddr = IPV4_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV4_GW1, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, }, + { .desc = "IPv4 policy routing, mark doesn't point to a policy", + .daddr = IPV4_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV4_GW1, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK_NO_POLICY, }, + { .desc = "IPv4 policy routing, mark points to a policy", + .daddr = IPV4_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV4_GW2, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK, }, + { .desc = "IPv4 policy routing, mark points to a policy, but no flag", + .daddr = IPV4_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV4_GW1, + .lookup_flags = BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK, }, + { .desc = "IPv6 policy routing, default", + .daddr = IPV6_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV6_GW1, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, }, + { .desc = "IPv6 policy routing, mark doesn't point to a policy", + .daddr = IPV6_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV6_GW1, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK_NO_POLICY, }, + { .desc = "IPv6 policy routing, mark points to a policy", + .daddr = IPV6_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV6_GW2, + .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK, }, + { .desc = "IPv6 policy routing, mark points to a policy, but no flag", + .daddr = IPV6_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS, + .expected_dst = IPV6_GW1, + .lookup_flags = BPF_FIB_LOOKUP_SKIP_NEIGH, + .mark = MARK, }, }; -static int ifindex; - static int setup_netns(void) { int err; @@ -144,12 +194,24 @@ static int setup_netns(void) if (!ASSERT_OK(err, "write_sysctl(net.ipv6.conf.veth1.forwarding)")) goto fail; + /* Setup for policy routing tests */ + SYS(fail, "ip addr add %s/24 dev veth1", IPV4_LOCAL); + SYS(fail, "ip addr add %s/64 dev veth1 nodad", IPV6_LOCAL); + SYS(fail, "ip route add %s/32 via %s", IPV4_REMOTE_DST, IPV4_GW1); + SYS(fail, "ip route add %s/32 via %s table %s", IPV4_REMOTE_DST, IPV4_GW2, MARK_TABLE); + SYS(fail, "ip -6 route add %s/128 via %s", IPV6_REMOTE_DST, IPV6_GW1); + SYS(fail, "ip -6 route add %s/128 via %s table %s", IPV6_REMOTE_DST, IPV6_GW2, MARK_TABLE); + SYS(fail, "ip rule add prio 2 fwmark %d lookup %s", MARK, MARK_TABLE); + SYS(fail, "ip -6 rule add prio 2 fwmark %d lookup %s", MARK, MARK_TABLE); + return 0; fail: return -1; } -static int set_lookup_params(struct bpf_fib_lookup *params, const struct fib_lookup_test *test) +static int set_lookup_params(struct bpf_fib_lookup *params, + const struct fib_lookup_test *test, + int ifindex) { int ret; @@ -158,6 +220,7 @@ static int set_lookup_params(struct bpf_fib_lookup *params, const struct fib_loo params->l4_protocol = IPPROTO_TCP; params->ifindex = ifindex; params->tbid = test->tbid; + params->mark = test->mark; if (inet_pton(AF_INET6, test->daddr, params->ipv6_dst) == 1) { params->family = AF_INET6; @@ -190,38 +253,43 @@ static void mac_str(char *b, const __u8 *mac) mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); } -static void assert_src_ip(struct bpf_fib_lookup *fib_params, const char *expected_src) +static void assert_ip_address(int family, void *addr, const char *expected_str) { + char str[INET6_ADDRSTRLEN]; + u8 expected_addr[16]; + int addr_len = 0; int ret; - __u32 src6[4]; - __be32 src4; - switch (fib_params->family) { + switch (family) { case AF_INET6: - ret = inet_pton(AF_INET6, expected_src, src6); - ASSERT_EQ(ret, 1, "inet_pton(expected_src)"); - - ret = memcmp(src6, fib_params->ipv6_src, sizeof(fib_params->ipv6_src)); - if (!ASSERT_EQ(ret, 0, "fib_lookup ipv6 src")) { - char str_src6[64]; - - inet_ntop(AF_INET6, fib_params->ipv6_src, str_src6, - sizeof(str_src6)); - printf("ipv6 expected %s actual %s ", expected_src, - str_src6); - } - + ret = inet_pton(AF_INET6, expected_str, expected_addr); + ASSERT_EQ(ret, 1, "inet_pton(AF_INET6, expected_str)"); + addr_len = 16; break; case AF_INET: - ret = inet_pton(AF_INET, expected_src, &src4); - ASSERT_EQ(ret, 1, "inet_pton(expected_src)"); - - ASSERT_EQ(fib_params->ipv4_src, src4, "fib_lookup ipv4 src"); - + ret = inet_pton(AF_INET, expected_str, expected_addr); + ASSERT_EQ(ret, 1, "inet_pton(AF_INET, expected_str)"); + addr_len = 4; break; default: - PRINT_FAIL("invalid addr family: %d", fib_params->family); + PRINT_FAIL("invalid address family: %d", family); + break; } + + if (memcmp(addr, expected_addr, addr_len)) { + inet_ntop(family, addr, str, sizeof(str)); + PRINT_FAIL("expected %s actual %s ", expected_str, str); + } +} + +static void assert_src_ip(struct bpf_fib_lookup *params, const char *expected) +{ + assert_ip_address(params->family, params->ipv6_src, expected); +} + +static void assert_dst_ip(struct bpf_fib_lookup *params, const char *expected) +{ + assert_ip_address(params->family, params->ipv6_dst, expected); } void test_fib_lookup(void) @@ -256,15 +324,18 @@ void test_fib_lookup(void) if (setup_netns()) goto fail; - ifindex = if_nametoindex("veth1"); - skb.ifindex = ifindex; + skb.ifindex = if_nametoindex("veth1"); + if (!ASSERT_NEQ(skb.ifindex, 0, "if_nametoindex(veth1)")) + goto fail; + fib_params = &skel->bss->fib_params; for (i = 0; i < ARRAY_SIZE(tests); i++) { printf("Testing %s ", tests[i].desc); - if (set_lookup_params(fib_params, &tests[i])) + if (set_lookup_params(fib_params, &tests[i], skb.ifindex)) continue; + skel->bss->fib_lookup_ret = -1; skel->bss->lookup_flags = tests[i].lookup_flags; @@ -278,6 +349,9 @@ void test_fib_lookup(void) if (tests[i].expected_src) assert_src_ip(fib_params, tests[i].expected_src); + if (tests[i].expected_dst) + assert_dst_ip(fib_params, tests[i].expected_dst); + ret = memcmp(tests[i].dmac, fib_params->dmac, sizeof(tests[i].dmac)); if (!ASSERT_EQ(ret, 0, "dmac not match")) { char expected[18], actual[18]; From 59b418c7063d30e0a3e1f592d47df096db83185c Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Tue, 26 Mar 2024 10:17:42 +0000 Subject: [PATCH 003/147] bpf: Add a check for struct bpf_fib_lookup size The struct bpf_fib_lookup should not grow outside of its 64 bytes. Add a static assert to validate this. Suggested-by: David Ahern Signed-off-by: Anton Protopopov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240326101742.17421-4-aspsk@isovalent.com Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 1205dd777dc2..786d792ac816 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -87,6 +87,9 @@ #include "dev.h" +/* Keep the struct bpf_fib_lookup small so that it fits into a cacheline */ +static_assert(sizeof(struct bpf_fib_lookup) == 64, "struct bpf_fib_lookup size check"); + static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); From 786bf0e7e2ec90b349a7bab6e97947982ab31f2c Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Mon, 25 Mar 2024 15:22:10 +0000 Subject: [PATCH 004/147] bpf: improve error message for unsupported helper BPF verifier emits "unknown func" message when given BPF program type does not support BPF helper. This message may be confusing for users, as important context that helper is unknown only to current program type is not provided. This patch changes message to "program of this type cannot use helper " and aligns dependent code in libbpf and tests. Any suggestions on improving/changing this message are welcome. Signed-off-by: Mykyta Yatsenko Acked-by: Andrii Nakryiko Acked-by: Quentin Monnet Link: https://lore.kernel.org/r/20240325152210.377548-1-yatsenko@meta.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++-- tools/bpf/bpftool/feature.c | 3 ++- tools/lib/bpf/libbpf_probes.c | 6 ++++-- tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 2 +- .../selftests/bpf/progs/verifier_helper_restricted.c | 8 ++++---- 5 files changed, 13 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 17f26ba1a9e0..edb650667f44 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10201,8 +10201,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (env->ops->get_func_proto) fn = env->ops->get_func_proto(func_id, env->prog); if (!fn) { - verbose(env, "unknown func %s#%d\n", func_id_name(func_id), - func_id); + verbose(env, "program of this type cannot use helper %s#%d\n", + func_id_name(func_id), func_id); return -EINVAL; } diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 708733b0ea06..c754a428c8c6 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -664,7 +664,8 @@ probe_helper_ifindex(enum bpf_func_id id, enum bpf_prog_type prog_type, probe_prog_load_ifindex(prog_type, insns, ARRAY_SIZE(insns), buf, sizeof(buf), ifindex); - res = !grep(buf, "invalid func ") && !grep(buf, "unknown func "); + res = !grep(buf, "invalid func ") && !grep(buf, "unknown func ") && + !grep(buf, "program of this type cannot use helper "); switch (get_vendor_id(ifindex)) { case 0x19ee: /* Netronome specific */ diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index 302188122439..9dfbe7750f56 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -448,7 +448,8 @@ int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helpe /* If BPF verifier doesn't recognize BPF helper ID (enum bpf_func_id) * at all, it will emit something like "invalid func unknown#181". * If BPF verifier recognizes BPF helper but it's not supported for - * given BPF program type, it will emit "unknown func bpf_sys_bpf#166". + * given BPF program type, it will emit "unknown func bpf_sys_bpf#166" + * or "program of this type cannot use helper bpf_sys_bpf#166". * In both cases, provided combination of BPF program type and BPF * helper is not supported by the kernel. * In all other cases, probe_prog_load() above will either succeed (e.g., @@ -457,7 +458,8 @@ int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helpe * that), or we'll get some more specific BPF verifier error about * some unsatisfied conditions. */ - if (ret == 0 && (strstr(buf, "invalid func ") || strstr(buf, "unknown func "))) + if (ret == 0 && (strstr(buf, "invalid func ") || strstr(buf, "unknown func ") || + strstr(buf, "program of this type cannot use helper "))) return 0; return 1; /* assume supported */ } diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 94cb22b01482..99dc60fe59d5 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -304,7 +304,7 @@ static void test_rel_setsockopt(void) struct bpf_dctcp_release *rel_skel; libbpf_print_fn_t old_print_fn; - err_str = "unknown func bpf_setsockopt"; + err_str = "program of this type cannot use helper bpf_setsockopt"; found = false; old_print_fn = libbpf_set_print(libbpf_debug_print); diff --git a/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c b/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c index 0ede0ccd090c..059aa716e3d0 100644 --- a/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c +++ b/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c @@ -30,7 +30,7 @@ struct { SEC("kprobe") __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_KPROBE") -__failure __msg("unknown func bpf_ktime_get_coarse_ns") +__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") __naked void in_bpf_prog_type_kprobe_1(void) { asm volatile (" \ @@ -44,7 +44,7 @@ __naked void in_bpf_prog_type_kprobe_1(void) SEC("tracepoint") __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_TRACEPOINT") -__failure __msg("unknown func bpf_ktime_get_coarse_ns") +__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") __naked void in_bpf_prog_type_tracepoint_1(void) { asm volatile (" \ @@ -58,7 +58,7 @@ __naked void in_bpf_prog_type_tracepoint_1(void) SEC("perf_event") __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_PERF_EVENT") -__failure __msg("unknown func bpf_ktime_get_coarse_ns") +__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") __naked void bpf_prog_type_perf_event_1(void) { asm volatile (" \ @@ -72,7 +72,7 @@ __naked void bpf_prog_type_perf_event_1(void) SEC("raw_tracepoint") __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_RAW_TRACEPOINT") -__failure __msg("unknown func bpf_ktime_get_coarse_ns") +__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") __naked void bpf_prog_type_raw_tracepoint_1(void) { asm volatile (" \ From 55fc888ded83ed542f3de3e51bae03936a998349 Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Wed, 27 Mar 2024 14:53:29 +0800 Subject: [PATCH 005/147] bpf,arena: Use helper sizeof_field in struct accessors Use the well defined helper sizeof_field() to calculate the size of a struct member, instead of doing custom calculations. Signed-off-by: Haiyue Wang Link: https://lore.kernel.org/r/20240327065334.8140-1-haiyue.wang@intel.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/arena.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 343c3456c8dd..6c81630c5293 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -37,7 +37,7 @@ */ /* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */ -#define GUARD_SZ (1ull << sizeof(((struct bpf_insn *)0)->off) * 8) +#define GUARD_SZ (1ull << sizeof_field(struct bpf_insn, off) * 8) #define KERN_VM_SZ (SZ_4G + GUARD_SZ) struct bpf_arena { From 1175f8dea349e5999d99727346db24f38306a793 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 26 Mar 2024 09:21:46 -0700 Subject: [PATCH 006/147] selftests/bpf: rename and clean up userspace-triggered benchmarks Rename uprobe-base to more precise usermode-count (it will match other baseline-like benchmarks, kernel-count and syscall-count). Also use BENCH_TRIG_USERMODE() macro to define all usermode-based triggering benchmarks, which include usermode-count and uprobe/uretprobe benchmarks. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240326162151.3981687-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bench.c | 14 ++- .../selftests/bpf/benchs/bench_trigger.c | 104 ++++++------------ .../selftests/bpf/benchs/run_bench_uprobes.sh | 2 +- 3 files changed, 48 insertions(+), 72 deletions(-) diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index b2b4c391eb0a..7ca1e1eb5c30 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -491,7 +491,12 @@ extern const struct bench bench_rename_kretprobe; extern const struct bench bench_rename_rawtp; extern const struct bench bench_rename_fentry; extern const struct bench bench_rename_fexit; + +/* pure counting benchmarks to establish theoretical lmits */ +extern const struct bench bench_trig_usermode_count; extern const struct bench bench_trig_base; + +/* kernel-side syscall-triggered benchmarks */ extern const struct bench bench_trig_tp; extern const struct bench bench_trig_rawtp; extern const struct bench bench_trig_kprobe; @@ -502,13 +507,15 @@ extern const struct bench bench_trig_fentry; extern const struct bench bench_trig_fexit; extern const struct bench bench_trig_fentry_sleep; extern const struct bench bench_trig_fmodret; -extern const struct bench bench_trig_uprobe_base; + +/* uprobe/uretprobe benchmarks */ extern const struct bench bench_trig_uprobe_nop; extern const struct bench bench_trig_uretprobe_nop; extern const struct bench bench_trig_uprobe_push; extern const struct bench bench_trig_uretprobe_push; extern const struct bench bench_trig_uprobe_ret; extern const struct bench bench_trig_uretprobe_ret; + extern const struct bench bench_rb_libbpf; extern const struct bench bench_rb_custom; extern const struct bench bench_pb_libbpf; @@ -539,7 +546,10 @@ static const struct bench *benchs[] = { &bench_rename_rawtp, &bench_rename_fentry, &bench_rename_fexit, + /* pure counting benchmarks for establishing theoretical limits */ + &bench_trig_usermode_count, &bench_trig_base, + /* syscall-driven triggering benchmarks */ &bench_trig_tp, &bench_trig_rawtp, &bench_trig_kprobe, @@ -550,7 +560,7 @@ static const struct bench *benchs[] = { &bench_trig_fexit, &bench_trig_fentry_sleep, &bench_trig_fmodret, - &bench_trig_uprobe_base, + /* uprobes */ &bench_trig_uprobe_nop, &bench_trig_uretprobe_nop, &bench_trig_uprobe_push, diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index d66eddacd642..97aba7e6458d 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -14,6 +14,7 @@ /* BPF triggering benchmarks */ static struct trigger_ctx { struct trigger_bench *skel; + bool usermode_counters; } ctx; static struct counter base_hits[MAX_BUCKETS]; @@ -74,7 +75,10 @@ static void *trigger_producer(void *input) static void trigger_measure(struct bench_res *res) { - res->hits = sum_and_reset_counters(ctx.skel->bss->hits); + if (ctx.usermode_counters) + res->hits = sum_and_reset_counters(base_hits); + else + res->hits = sum_and_reset_counters(ctx.skel->bss->hits); } static void setup_ctx(void) @@ -192,7 +196,7 @@ __nocf_check __weak void uprobe_target_ret(void) asm volatile (""); } -static void *uprobe_base_producer(void *input) +static void *uprobe_producer_count(void *input) { while (true) { uprobe_target_nop(); @@ -248,32 +252,37 @@ static void usetup(bool use_retprobe, void *target_addr) ctx.skel->links.bench_trigger_uprobe = link; } -static void uprobe_setup_nop(void) +static void usermode_count_setup(void) +{ + ctx.usermode_counters = true; +} + +static void uprobe_nop_setup(void) { usetup(false, &uprobe_target_nop); } -static void uretprobe_setup_nop(void) +static void uretprobe_nop_setup(void) { usetup(true, &uprobe_target_nop); } -static void uprobe_setup_push(void) +static void uprobe_push_setup(void) { usetup(false, &uprobe_target_push); } -static void uretprobe_setup_push(void) +static void uretprobe_push_setup(void) { usetup(true, &uprobe_target_push); } -static void uprobe_setup_ret(void) +static void uprobe_ret_setup(void) { usetup(false, &uprobe_target_ret); } -static void uretprobe_setup_ret(void) +static void uretprobe_ret_setup(void) { usetup(true, &uprobe_target_ret); } @@ -387,65 +396,22 @@ const struct bench bench_trig_fmodret = { .report_final = hits_drops_report_final, }; -const struct bench bench_trig_uprobe_base = { - .name = "trig-uprobe-base", - .setup = NULL, /* no uprobe/uretprobe is attached */ - .producer_thread = uprobe_base_producer, - .measure = trigger_base_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; +/* uprobe benchmarks */ +#define BENCH_TRIG_USERMODE(KIND, PRODUCER, NAME) \ +const struct bench bench_trig_##KIND = { \ + .name = "trig-" NAME, \ + .validate = trigger_validate, \ + .setup = KIND##_setup, \ + .producer_thread = uprobe_producer_##PRODUCER, \ + .measure = trigger_measure, \ + .report_progress = hits_drops_report_progress, \ + .report_final = hits_drops_report_final, \ +} -const struct bench bench_trig_uprobe_nop = { - .name = "trig-uprobe-nop", - .setup = uprobe_setup_nop, - .producer_thread = uprobe_producer_nop, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uretprobe_nop = { - .name = "trig-uretprobe-nop", - .setup = uretprobe_setup_nop, - .producer_thread = uprobe_producer_nop, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uprobe_push = { - .name = "trig-uprobe-push", - .setup = uprobe_setup_push, - .producer_thread = uprobe_producer_push, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uretprobe_push = { - .name = "trig-uretprobe-push", - .setup = uretprobe_setup_push, - .producer_thread = uprobe_producer_push, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uprobe_ret = { - .name = "trig-uprobe-ret", - .setup = uprobe_setup_ret, - .producer_thread = uprobe_producer_ret, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_uretprobe_ret = { - .name = "trig-uretprobe-ret", - .setup = uretprobe_setup_ret, - .producer_thread = uprobe_producer_ret, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; +BENCH_TRIG_USERMODE(usermode_count, count, "usermode-count"); +BENCH_TRIG_USERMODE(uprobe_nop, nop, "uprobe-nop"); +BENCH_TRIG_USERMODE(uprobe_push, push, "uprobe-push"); +BENCH_TRIG_USERMODE(uprobe_ret, ret, "uprobe-ret"); +BENCH_TRIG_USERMODE(uretprobe_nop, nop, "uretprobe-nop"); +BENCH_TRIG_USERMODE(uretprobe_push, push, "uretprobe-push"); +BENCH_TRIG_USERMODE(uretprobe_ret, ret, "uretprobe-ret"); diff --git a/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh b/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh index 9bdcc74e03a4..36021d243397 100755 --- a/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh +++ b/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh @@ -2,7 +2,7 @@ set -eufo pipefail -for i in base {uprobe,uretprobe}-{nop,push,ret} +for i in usermode-count base {uprobe,uretprobe}-{nop,push,ret} do summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) printf "%-15s: %s\n" $i "$summary" From 7df4e597ea2cfd677e65730948153d5544986a10 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 26 Mar 2024 09:21:47 -0700 Subject: [PATCH 007/147] selftests/bpf: add batched, mostly in-kernel BPF triggering benchmarks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Existing kprobe/fentry triggering benchmarks have 1-to-1 mapping between one syscall execution and BPF program run. While we use a fast get_pgid() syscall, syscall overhead can still be non-trivial. This patch adds kprobe/fentry set of benchmarks significantly amortizing the cost of syscall vs actual BPF triggering overhead. We do this by employing BPF_PROG_TEST_RUN command to trigger "driver" raw_tp program which does a tight parameterized loop calling cheap BPF helper (bpf_get_numa_node_id()), to which kprobe/fentry programs are attached for benchmarking. This way 1 bpf() syscall causes N executions of BPF program being benchmarked. N defaults to 100, but can be adjusted with --trig-batch-iters CLI argument. For comparison we also implement a new baseline program that instead of triggering another BPF program just does N atomic per-CPU counter increments, establishing the limit for all other types of program within this batched benchmarking setup. Taking the final set of benchmarks added in this patch set (including tp/raw_tp/fmodret, added in later patch), and keeping for now "legacy" syscall-driven benchmarks, we can capture all triggering benchmarks in one place for comparison, before we remove the legacy ones (and rename xxx-batched into just xxx). $ benchs/run_bench_trigger.sh usermode-count : 79.500 ± 0.024M/s kernel-count : 49.949 ± 0.081M/s syscall-count : 9.009 ± 0.007M/s fentry-batch : 31.002 ± 0.015M/s fexit-batch : 20.372 ± 0.028M/s fmodret-batch : 21.651 ± 0.659M/s rawtp-batch : 36.775 ± 0.264M/s tp-batch : 19.411 ± 0.248M/s kprobe-batch : 12.949 ± 0.220M/s kprobe-multi-batch : 15.400 ± 0.007M/s kretprobe-batch : 5.559 ± 0.011M/s kretprobe-multi-batch: 5.861 ± 0.003M/s fentry-legacy : 8.329 ± 0.004M/s fexit-legacy : 6.239 ± 0.003M/s fmodret-legacy : 6.595 ± 0.001M/s rawtp-legacy : 8.305 ± 0.004M/s tp-legacy : 6.382 ± 0.001M/s kprobe-legacy : 5.528 ± 0.003M/s kprobe-multi-legacy : 5.864 ± 0.022M/s kretprobe-legacy : 3.081 ± 0.001M/s kretprobe-multi-legacy: 3.193 ± 0.001M/s Note how xxx-batch variants are measured with significantly higher throughput, even though it's exactly the same in-kernel overhead. As such, results can be compared only between benchmarks of the same kind (syscall vs batched): fentry-legacy : 8.329 ± 0.004M/s fentry-batch : 31.002 ± 0.015M/s kprobe-multi-legacy : 5.864 ± 0.022M/s kprobe-multi-batch : 15.400 ± 0.007M/s Note also that syscall-count is setting a theoretical limit for syscall-triggered benchmarks, while kernel-count is setting similar limits for batch variants. usermode-count is a happy and unachievable case of user space counting without doing any syscalls, and is mostly the measure of CPU speed for such a trivial benchmark. As was mentioned, tp/raw_tp/fmodret require kernel-side kfunc to produce similar benchmark, which we address in a separate patch. Note that run_bench_trigger.sh allows to override a list of benchmarks to run, which is very useful for performance work. Cc: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240326162151.3981687-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bench.c | 21 ++- .../selftests/bpf/benchs/bench_trigger.c | 133 +++++++++++++++++- .../selftests/bpf/benchs/run_bench_trigger.sh | 24 +++- .../selftests/bpf/progs/trigger_bench.c | 67 ++++++++- 4 files changed, 238 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 7ca1e1eb5c30..484bcbeaa819 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -280,6 +280,7 @@ extern struct argp bench_strncmp_argp; extern struct argp bench_hashmap_lookup_argp; extern struct argp bench_local_storage_create_argp; extern struct argp bench_htab_mem_argp; +extern struct argp bench_trigger_batch_argp; static const struct argp_child bench_parsers[] = { { &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 }, @@ -292,6 +293,7 @@ static const struct argp_child bench_parsers[] = { { &bench_hashmap_lookup_argp, 0, "Hashmap lookup benchmark", 0 }, { &bench_local_storage_create_argp, 0, "local-storage-create benchmark", 0 }, { &bench_htab_mem_argp, 0, "hash map memory benchmark", 0 }, + { &bench_trigger_batch_argp, 0, "BPF triggering benchmark", 0 }, {}, }; @@ -508,6 +510,15 @@ extern const struct bench bench_trig_fexit; extern const struct bench bench_trig_fentry_sleep; extern const struct bench bench_trig_fmodret; +/* batched, staying mostly in-kernel benchmarks */ +extern const struct bench bench_trig_kernel_count; +extern const struct bench bench_trig_kprobe_batch; +extern const struct bench bench_trig_kretprobe_batch; +extern const struct bench bench_trig_kprobe_multi_batch; +extern const struct bench bench_trig_kretprobe_multi_batch; +extern const struct bench bench_trig_fentry_batch; +extern const struct bench bench_trig_fexit_batch; + /* uprobe/uretprobe benchmarks */ extern const struct bench bench_trig_uprobe_nop; extern const struct bench bench_trig_uretprobe_nop; @@ -548,7 +559,7 @@ static const struct bench *benchs[] = { &bench_rename_fexit, /* pure counting benchmarks for establishing theoretical limits */ &bench_trig_usermode_count, - &bench_trig_base, + &bench_trig_kernel_count, /* syscall-driven triggering benchmarks */ &bench_trig_tp, &bench_trig_rawtp, @@ -560,6 +571,13 @@ static const struct bench *benchs[] = { &bench_trig_fexit, &bench_trig_fentry_sleep, &bench_trig_fmodret, + /* batched, staying mostly in-kernel triggers */ + &bench_trig_kprobe_batch, + &bench_trig_kretprobe_batch, + &bench_trig_kprobe_multi_batch, + &bench_trig_kretprobe_multi_batch, + &bench_trig_fentry_batch, + &bench_trig_fexit_batch, /* uprobes */ &bench_trig_uprobe_nop, &bench_trig_uretprobe_nop, @@ -567,6 +585,7 @@ static const struct bench *benchs[] = { &bench_trig_uretprobe_push, &bench_trig_uprobe_ret, &bench_trig_uretprobe_ret, + /* ringbuf/perfbuf benchmarks */ &bench_rb_libbpf, &bench_rb_custom, &bench_pb_libbpf, diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index 97aba7e6458d..20277dabdaf9 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -1,11 +1,57 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ #define _GNU_SOURCE +#include #include +#include #include "bench.h" #include "trigger_bench.skel.h" #include "trace_helpers.h" +#define MAX_TRIG_BATCH_ITERS 1000 + +static struct { + __u32 batch_iters; +} args = { + .batch_iters = 100, +}; + +enum { + ARG_TRIG_BATCH_ITERS = 7000, +}; + +static const struct argp_option opts[] = { + { "trig-batch-iters", ARG_TRIG_BATCH_ITERS, "BATCH_ITER_CNT", 0, + "Number of in-kernel iterations per one driver test run"}, + {}, +}; + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + long ret; + + switch (key) { + case ARG_TRIG_BATCH_ITERS: + ret = strtol(arg, NULL, 10); + if (ret < 1 || ret > MAX_TRIG_BATCH_ITERS) { + fprintf(stderr, "invalid --trig-batch-iters value (should be between %d and %d)\n", + 1, MAX_TRIG_BATCH_ITERS); + argp_usage(state); + } + args.batch_iters = ret; + break; + default: + return ARGP_ERR_UNKNOWN; + } + + return 0; +} + +const struct argp bench_trigger_batch_argp = { + .options = opts, + .parser = parse_arg, +}; + /* adjust slot shift in inc_hits() if changing */ #define MAX_BUCKETS 256 @@ -15,6 +61,7 @@ static struct trigger_ctx { struct trigger_bench *skel; bool usermode_counters; + int driver_prog_fd; } ctx; static struct counter base_hits[MAX_BUCKETS]; @@ -73,6 +120,16 @@ static void *trigger_producer(void *input) return NULL; } +static void *trigger_producer_batch(void *input) +{ + int fd = ctx.driver_prog_fd ?: bpf_program__fd(ctx.skel->progs.trigger_driver); + + while (true) + bpf_prog_test_run_opts(fd, NULL); + + return NULL; +} + static void trigger_measure(struct bench_res *res) { if (ctx.usermode_counters) @@ -83,13 +140,23 @@ static void trigger_measure(struct bench_res *res) static void setup_ctx(void) { + int err; + setup_libbpf(); - ctx.skel = trigger_bench__open_and_load(); + ctx.skel = trigger_bench__open(); if (!ctx.skel) { fprintf(stderr, "failed to open skeleton\n"); exit(1); } + + ctx.skel->rodata->batch_iters = args.batch_iters; + + err = trigger_bench__load(ctx.skel); + if (err) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } } static void attach_bpf(struct bpf_program *prog) @@ -163,6 +230,50 @@ static void trigger_fmodret_setup(void) attach_bpf(ctx.skel->progs.bench_trigger_fmodret); } +/* Batched, staying mostly in-kernel triggering setups */ +static void trigger_kernel_count_setup(void) +{ + setup_ctx(); + /* override driver program */ + ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_count); +} + +static void trigger_kprobe_batch_setup(void) +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.bench_trigger_kprobe_batch); +} + +static void trigger_kretprobe_batch_setup(void) +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.bench_trigger_kretprobe_batch); +} + +static void trigger_kprobe_multi_batch_setup(void) +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.bench_trigger_kprobe_multi_batch); +} + +static void trigger_kretprobe_multi_batch_setup(void) +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.bench_trigger_kretprobe_multi_batch); +} + +static void trigger_fentry_batch_setup(void) +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.bench_trigger_fentry_batch); +} + +static void trigger_fexit_batch_setup(void) +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.bench_trigger_fexit_batch); +} + /* make sure call is not inlined and not avoided by compiler, so __weak and * inline asm volatile in the body of the function * @@ -396,6 +507,26 @@ const struct bench bench_trig_fmodret = { .report_final = hits_drops_report_final, }; +/* batched (staying mostly in kernel) kprobe/fentry benchmarks */ +#define BENCH_TRIG_BATCH(KIND, NAME) \ +const struct bench bench_trig_##KIND = { \ + .name = "trig-" NAME, \ + .setup = trigger_##KIND##_setup, \ + .producer_thread = trigger_producer_batch, \ + .measure = trigger_measure, \ + .report_progress = hits_drops_report_progress, \ + .report_final = hits_drops_report_final, \ + .argp = &bench_trigger_batch_argp, \ +} + +BENCH_TRIG_BATCH(kernel_count, "kernel-count"); +BENCH_TRIG_BATCH(kprobe_batch, "kprobe-batch"); +BENCH_TRIG_BATCH(kretprobe_batch, "kretprobe-batch"); +BENCH_TRIG_BATCH(kprobe_multi_batch, "kprobe-multi-batch"); +BENCH_TRIG_BATCH(kretprobe_multi_batch, "kretprobe-multi-batch"); +BENCH_TRIG_BATCH(fentry_batch, "fentry-batch"); +BENCH_TRIG_BATCH(fexit_batch, "fexit-batch"); + /* uprobe benchmarks */ #define BENCH_TRIG_USERMODE(KIND, PRODUCER, NAME) \ const struct bench bench_trig_##KIND = { \ diff --git a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh index 78e83f243294..b58ec33ea18c 100755 --- a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh +++ b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh @@ -2,8 +2,24 @@ set -eufo pipefail -for i in base tp rawtp kprobe fentry fmodret -do - summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) - printf "%-10s: %s\n" $i "$summary" +def_tests=( \ + usermode-count kernel-count syscall-count \ + fentry-batch fexit-batch \ + kprobe-batch kprobe-multi-batch \ + kretprobe-batch kretprobe-multi-batch \ + fentry fexit fmodret \ + rawtp tp \ + kprobe kprobe-multi kretprobe kretprobe-multi \ +) + +tests=("$@") +if [ ${#tests[@]} -eq 0 ]; then + tests=("${def_tests[@]}") +fi + +p=${PROD_CNT:-1} + +for t in "${tests[@]}"; do + summary=$(sudo ./bench -w2 -d5 -a -p$p trig-$t | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) + printf "%-21s: %s\n" $t "$summary" done diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c index 42ec202015ed..f0b76afa5017 100644 --- a/tools/testing/selftests/bpf/progs/trigger_bench.c +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2020 Facebook - #include #include #include @@ -103,3 +102,69 @@ int bench_trigger_uprobe(void *ctx) inc_counter(); return 0; } + +const volatile int batch_iters = 0; + +SEC("raw_tp") +int trigger_count(void *ctx) +{ + int i; + + for (i = 0; i < batch_iters; i++) + inc_counter(); + + return 0; +} + +SEC("raw_tp") +int trigger_driver(void *ctx) +{ + int i; + + for (i = 0; i < batch_iters; i++) + (void)bpf_get_numa_node_id(); /* attach point for benchmarking */ + + return 0; +} + +SEC("kprobe/bpf_get_numa_node_id") +int bench_trigger_kprobe_batch(void *ctx) +{ + inc_counter(); + return 0; +} + +SEC("kretprobe/bpf_get_numa_node_id") +int bench_trigger_kretprobe_batch(void *ctx) +{ + inc_counter(); + return 0; +} + +SEC("kprobe.multi/bpf_get_numa_node_id") +int bench_trigger_kprobe_multi_batch(void *ctx) +{ + inc_counter(); + return 0; +} + +SEC("kretprobe.multi/bpf_get_numa_node_id") +int bench_trigger_kretprobe_multi_batch(void *ctx) +{ + inc_counter(); + return 0; +} + +SEC("fentry/bpf_get_numa_node_id") +int bench_trigger_fentry_batch(void *ctx) +{ + inc_counter(); + return 0; +} + +SEC("fexit/bpf_get_numa_node_id") +int bench_trigger_fexit_batch(void *ctx) +{ + inc_counter(); + return 0; +} From 208c4391204d25d9178fbc87f216daffad00cd15 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 26 Mar 2024 09:21:48 -0700 Subject: [PATCH 008/147] selftests/bpf: remove syscall-driven benchs, keep syscall-count only Remove "legacy" benchmarks triggered by syscalls in favor of newly added in-kernel/batched benchmarks. Drop -batched suffix now as well. Next patch will restore "feature parity" by adding back tp/raw_tp/fmodret benchmarks based on in-kernel kfunc approach. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240326162151.3981687-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bench.c | 32 +-- .../selftests/bpf/benchs/bench_trigger.c | 213 +++--------------- .../selftests/bpf/benchs/run_bench_trigger.sh | 11 +- .../selftests/bpf/benchs/run_bench_uprobes.sh | 2 +- .../selftests/bpf/progs/trigger_bench.c | 83 +------ 5 files changed, 42 insertions(+), 299 deletions(-) diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 484bcbeaa819..8b16841a3dec 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -496,28 +496,16 @@ extern const struct bench bench_rename_fexit; /* pure counting benchmarks to establish theoretical lmits */ extern const struct bench bench_trig_usermode_count; -extern const struct bench bench_trig_base; +extern const struct bench bench_trig_syscall_count; +extern const struct bench bench_trig_kernel_count; -/* kernel-side syscall-triggered benchmarks */ -extern const struct bench bench_trig_tp; -extern const struct bench bench_trig_rawtp; +/* batched, staying mostly in-kernel benchmarks */ extern const struct bench bench_trig_kprobe; extern const struct bench bench_trig_kretprobe; extern const struct bench bench_trig_kprobe_multi; extern const struct bench bench_trig_kretprobe_multi; extern const struct bench bench_trig_fentry; extern const struct bench bench_trig_fexit; -extern const struct bench bench_trig_fentry_sleep; -extern const struct bench bench_trig_fmodret; - -/* batched, staying mostly in-kernel benchmarks */ -extern const struct bench bench_trig_kernel_count; -extern const struct bench bench_trig_kprobe_batch; -extern const struct bench bench_trig_kretprobe_batch; -extern const struct bench bench_trig_kprobe_multi_batch; -extern const struct bench bench_trig_kretprobe_multi_batch; -extern const struct bench bench_trig_fentry_batch; -extern const struct bench bench_trig_fexit_batch; /* uprobe/uretprobe benchmarks */ extern const struct bench bench_trig_uprobe_nop; @@ -560,24 +548,14 @@ static const struct bench *benchs[] = { /* pure counting benchmarks for establishing theoretical limits */ &bench_trig_usermode_count, &bench_trig_kernel_count, - /* syscall-driven triggering benchmarks */ - &bench_trig_tp, - &bench_trig_rawtp, + &bench_trig_syscall_count, + /* batched, staying mostly in-kernel triggers */ &bench_trig_kprobe, &bench_trig_kretprobe, &bench_trig_kprobe_multi, &bench_trig_kretprobe_multi, &bench_trig_fentry, &bench_trig_fexit, - &bench_trig_fentry_sleep, - &bench_trig_fmodret, - /* batched, staying mostly in-kernel triggers */ - &bench_trig_kprobe_batch, - &bench_trig_kretprobe_batch, - &bench_trig_kprobe_multi_batch, - &bench_trig_kretprobe_multi_batch, - &bench_trig_fentry_batch, - &bench_trig_fexit_batch, /* uprobes */ &bench_trig_uprobe_nop, &bench_trig_uretprobe_nop, diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index 20277dabdaf9..7d4f34adfd64 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -99,24 +99,17 @@ static void trigger_validate(void) } } -static void *trigger_base_producer(void *input) -{ - while (true) { - (void)syscall(__NR_getpgid); - inc_counter(base_hits); - } - return NULL; -} - -static void trigger_base_measure(struct bench_res *res) -{ - res->hits = sum_and_reset_counters(base_hits); -} - static void *trigger_producer(void *input) { - while (true) - (void)syscall(__NR_getpgid); + if (ctx.usermode_counters) { + while (true) { + (void)syscall(__NR_getpgid); + inc_counter(base_hits); + } + } else { + while (true) + (void)syscall(__NR_getpgid); + } return NULL; } @@ -170,16 +163,17 @@ static void attach_bpf(struct bpf_program *prog) } } -static void trigger_tp_setup(void) +static void trigger_syscall_count_setup(void) { - setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_tp); + ctx.usermode_counters = true; } -static void trigger_rawtp_setup(void) +/* Batched, staying mostly in-kernel triggering setups */ +static void trigger_kernel_count_setup(void) { setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_raw_tp); + /* override driver program */ + ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_count); } static void trigger_kprobe_setup(void) @@ -218,62 +212,6 @@ static void trigger_fexit_setup(void) attach_bpf(ctx.skel->progs.bench_trigger_fexit); } -static void trigger_fentry_sleep_setup(void) -{ - setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_fentry_sleep); -} - -static void trigger_fmodret_setup(void) -{ - setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_fmodret); -} - -/* Batched, staying mostly in-kernel triggering setups */ -static void trigger_kernel_count_setup(void) -{ - setup_ctx(); - /* override driver program */ - ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_count); -} - -static void trigger_kprobe_batch_setup(void) -{ - setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_kprobe_batch); -} - -static void trigger_kretprobe_batch_setup(void) -{ - setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_kretprobe_batch); -} - -static void trigger_kprobe_multi_batch_setup(void) -{ - setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_kprobe_multi_batch); -} - -static void trigger_kretprobe_multi_batch_setup(void) -{ - setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_kretprobe_multi_batch); -} - -static void trigger_fentry_batch_setup(void) -{ - setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_fentry_batch); -} - -static void trigger_fexit_batch_setup(void) -{ - setup_ctx(); - attach_bpf(ctx.skel->progs.bench_trigger_fexit_batch); -} - /* make sure call is not inlined and not avoided by compiler, so __weak and * inline asm volatile in the body of the function * @@ -398,109 +336,10 @@ static void uretprobe_ret_setup(void) usetup(true, &uprobe_target_ret); } -const struct bench bench_trig_base = { - .name = "trig-base", +const struct bench bench_trig_syscall_count = { + .name = "trig-syscall-count", .validate = trigger_validate, - .producer_thread = trigger_base_producer, - .measure = trigger_base_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_tp = { - .name = "trig-tp", - .validate = trigger_validate, - .setup = trigger_tp_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_rawtp = { - .name = "trig-rawtp", - .validate = trigger_validate, - .setup = trigger_rawtp_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_kprobe = { - .name = "trig-kprobe", - .validate = trigger_validate, - .setup = trigger_kprobe_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_kretprobe = { - .name = "trig-kretprobe", - .validate = trigger_validate, - .setup = trigger_kretprobe_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_kprobe_multi = { - .name = "trig-kprobe-multi", - .validate = trigger_validate, - .setup = trigger_kprobe_multi_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_kretprobe_multi = { - .name = "trig-kretprobe-multi", - .validate = trigger_validate, - .setup = trigger_kretprobe_multi_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_fentry = { - .name = "trig-fentry", - .validate = trigger_validate, - .setup = trigger_fentry_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_fexit = { - .name = "trig-fexit", - .validate = trigger_validate, - .setup = trigger_fexit_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_fentry_sleep = { - .name = "trig-fentry-sleep", - .validate = trigger_validate, - .setup = trigger_fentry_sleep_setup, - .producer_thread = trigger_producer, - .measure = trigger_measure, - .report_progress = hits_drops_report_progress, - .report_final = hits_drops_report_final, -}; - -const struct bench bench_trig_fmodret = { - .name = "trig-fmodret", - .validate = trigger_validate, - .setup = trigger_fmodret_setup, + .setup = trigger_syscall_count_setup, .producer_thread = trigger_producer, .measure = trigger_measure, .report_progress = hits_drops_report_progress, @@ -508,7 +347,7 @@ const struct bench bench_trig_fmodret = { }; /* batched (staying mostly in kernel) kprobe/fentry benchmarks */ -#define BENCH_TRIG_BATCH(KIND, NAME) \ +#define BENCH_TRIG_KERNEL(KIND, NAME) \ const struct bench bench_trig_##KIND = { \ .name = "trig-" NAME, \ .setup = trigger_##KIND##_setup, \ @@ -519,13 +358,13 @@ const struct bench bench_trig_##KIND = { \ .argp = &bench_trigger_batch_argp, \ } -BENCH_TRIG_BATCH(kernel_count, "kernel-count"); -BENCH_TRIG_BATCH(kprobe_batch, "kprobe-batch"); -BENCH_TRIG_BATCH(kretprobe_batch, "kretprobe-batch"); -BENCH_TRIG_BATCH(kprobe_multi_batch, "kprobe-multi-batch"); -BENCH_TRIG_BATCH(kretprobe_multi_batch, "kretprobe-multi-batch"); -BENCH_TRIG_BATCH(fentry_batch, "fentry-batch"); -BENCH_TRIG_BATCH(fexit_batch, "fexit-batch"); +BENCH_TRIG_KERNEL(kernel_count, "kernel-count"); +BENCH_TRIG_KERNEL(kprobe, "kprobe"); +BENCH_TRIG_KERNEL(kretprobe, "kretprobe"); +BENCH_TRIG_KERNEL(kprobe_multi, "kprobe-multi"); +BENCH_TRIG_KERNEL(kretprobe_multi, "kretprobe-multi"); +BENCH_TRIG_KERNEL(fentry, "fentry"); +BENCH_TRIG_KERNEL(fexit, "fexit"); /* uprobe benchmarks */ #define BENCH_TRIG_USERMODE(KIND, PRODUCER, NAME) \ diff --git a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh index b58ec33ea18c..6e790e294260 100755 --- a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh +++ b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh @@ -4,12 +4,9 @@ set -eufo pipefail def_tests=( \ usermode-count kernel-count syscall-count \ - fentry-batch fexit-batch \ - kprobe-batch kprobe-multi-batch \ - kretprobe-batch kretprobe-multi-batch \ - fentry fexit fmodret \ - rawtp tp \ - kprobe kprobe-multi kretprobe kretprobe-multi \ + fentry fexit \ + kprobe kprobe-multi \ + kretprobe kretprobe-multi \ ) tests=("$@") @@ -21,5 +18,5 @@ p=${PROD_CNT:-1} for t in "${tests[@]}"; do summary=$(sudo ./bench -w2 -d5 -a -p$p trig-$t | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) - printf "%-21s: %s\n" $t "$summary" + printf "%-15s: %s\n" $t "$summary" done diff --git a/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh b/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh index 36021d243397..af169f831f2f 100755 --- a/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh +++ b/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh @@ -2,7 +2,7 @@ set -eufo pipefail -for i in usermode-count base {uprobe,uretprobe}-{nop,push,ret} +for i in usermode-count syscall-count {uprobe,uretprobe}-{nop,push,ret} do summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) printf "%-15s: %s\n" $i "$summary" diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c index f0b76afa5017..81990e45b547 100644 --- a/tools/testing/selftests/bpf/progs/trigger_bench.c +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -25,77 +25,6 @@ static __always_inline void inc_counter(void) __sync_add_and_fetch(&hits[cpu & CPU_MASK].value, 1); } -SEC("tp/syscalls/sys_enter_getpgid") -int bench_trigger_tp(void *ctx) -{ - inc_counter(); - return 0; -} - -SEC("raw_tp/sys_enter") -int BPF_PROG(bench_trigger_raw_tp, struct pt_regs *regs, long id) -{ - if (id == __NR_getpgid) - inc_counter(); - return 0; -} - -SEC("kprobe/" SYS_PREFIX "sys_getpgid") -int bench_trigger_kprobe(void *ctx) -{ - inc_counter(); - return 0; -} - -SEC("kretprobe/" SYS_PREFIX "sys_getpgid") -int bench_trigger_kretprobe(void *ctx) -{ - inc_counter(); - return 0; -} - -SEC("kprobe.multi/" SYS_PREFIX "sys_getpgid") -int bench_trigger_kprobe_multi(void *ctx) -{ - inc_counter(); - return 0; -} - -SEC("kretprobe.multi/" SYS_PREFIX "sys_getpgid") -int bench_trigger_kretprobe_multi(void *ctx) -{ - inc_counter(); - return 0; -} - -SEC("fentry/" SYS_PREFIX "sys_getpgid") -int bench_trigger_fentry(void *ctx) -{ - inc_counter(); - return 0; -} - -SEC("fexit/" SYS_PREFIX "sys_getpgid") -int bench_trigger_fexit(void *ctx) -{ - inc_counter(); - return 0; -} - -SEC("fentry.s/" SYS_PREFIX "sys_getpgid") -int bench_trigger_fentry_sleep(void *ctx) -{ - inc_counter(); - return 0; -} - -SEC("fmod_ret/" SYS_PREFIX "sys_getpgid") -int bench_trigger_fmodret(void *ctx) -{ - inc_counter(); - return -22; -} - SEC("uprobe") int bench_trigger_uprobe(void *ctx) { @@ -128,42 +57,42 @@ int trigger_driver(void *ctx) } SEC("kprobe/bpf_get_numa_node_id") -int bench_trigger_kprobe_batch(void *ctx) +int bench_trigger_kprobe(void *ctx) { inc_counter(); return 0; } SEC("kretprobe/bpf_get_numa_node_id") -int bench_trigger_kretprobe_batch(void *ctx) +int bench_trigger_kretprobe(void *ctx) { inc_counter(); return 0; } SEC("kprobe.multi/bpf_get_numa_node_id") -int bench_trigger_kprobe_multi_batch(void *ctx) +int bench_trigger_kprobe_multi(void *ctx) { inc_counter(); return 0; } SEC("kretprobe.multi/bpf_get_numa_node_id") -int bench_trigger_kretprobe_multi_batch(void *ctx) +int bench_trigger_kretprobe_multi(void *ctx) { inc_counter(); return 0; } SEC("fentry/bpf_get_numa_node_id") -int bench_trigger_fentry_batch(void *ctx) +int bench_trigger_fentry(void *ctx) { inc_counter(); return 0; } SEC("fexit/bpf_get_numa_node_id") -int bench_trigger_fexit_batch(void *ctx) +int bench_trigger_fexit(void *ctx) { inc_counter(); return 0; From b4ccf9158f5893dedb898687272fabfe80f58907 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 26 Mar 2024 09:21:49 -0700 Subject: [PATCH 009/147] selftests/bpf: lazy-load trigger bench BPF programs Instead of front-loading all possible benchmarking BPF programs for trigger benchmarks, explicitly specify which BPF programs are used by specific benchmark and load only it. This allows to be more flexible in supporting older kernels, where some program types might not be possible to load (e.g., those that rely on newly added kfunc). Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240326162151.3981687-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/benchs/bench_trigger.c | 36 +++++++++++++++++-- .../selftests/bpf/progs/trigger_bench.c | 18 +++++----- 2 files changed, 42 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index 7d4f34adfd64..2c477808a6f0 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -133,8 +133,6 @@ static void trigger_measure(struct bench_res *res) static void setup_ctx(void) { - int err; - setup_libbpf(); ctx.skel = trigger_bench__open(); @@ -143,7 +141,15 @@ static void setup_ctx(void) exit(1); } + /* default "driver" BPF program */ + bpf_program__set_autoload(ctx.skel->progs.trigger_driver, true); + ctx.skel->rodata->batch_iters = args.batch_iters; +} + +static void load_ctx(void) +{ + int err; err = trigger_bench__load(ctx.skel); if (err) { @@ -172,6 +178,9 @@ static void trigger_syscall_count_setup(void) static void trigger_kernel_count_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false); + bpf_program__set_autoload(ctx.skel->progs.trigger_count, true); + load_ctx(); /* override driver program */ ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_count); } @@ -179,36 +188,48 @@ static void trigger_kernel_count_setup(void) static void trigger_kprobe_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kprobe, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_kprobe); } static void trigger_kretprobe_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kretprobe, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_kretprobe); } static void trigger_kprobe_multi_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kprobe_multi, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_kprobe_multi); } static void trigger_kretprobe_multi_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kretprobe_multi, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_kretprobe_multi); } static void trigger_fentry_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_fentry, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_fentry); } static void trigger_fexit_setup(void) { setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_fexit, true); + load_ctx(); attach_bpf(ctx.skel->progs.bench_trigger_fexit); } @@ -279,15 +300,24 @@ static void usetup(bool use_retprobe, void *target_addr) { size_t uprobe_offset; struct bpf_link *link; + int err; setup_libbpf(); - ctx.skel = trigger_bench__open_and_load(); + ctx.skel = trigger_bench__open(); if (!ctx.skel) { fprintf(stderr, "failed to open skeleton\n"); exit(1); } + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_uprobe, true); + + err = trigger_bench__load(ctx.skel); + if (err) { + fprintf(stderr, "failed to load skeleton\n"); + exit(1); + } + uprobe_offset = get_uprobe_offset(target_addr); link = bpf_program__attach_uprobe(ctx.skel->progs.bench_trigger_uprobe, use_retprobe, diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c index 81990e45b547..07587cb3c9f5 100644 --- a/tools/testing/selftests/bpf/progs/trigger_bench.c +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -25,7 +25,7 @@ static __always_inline void inc_counter(void) __sync_add_and_fetch(&hits[cpu & CPU_MASK].value, 1); } -SEC("uprobe") +SEC("?uprobe") int bench_trigger_uprobe(void *ctx) { inc_counter(); @@ -34,7 +34,7 @@ int bench_trigger_uprobe(void *ctx) const volatile int batch_iters = 0; -SEC("raw_tp") +SEC("?raw_tp") int trigger_count(void *ctx) { int i; @@ -45,7 +45,7 @@ int trigger_count(void *ctx) return 0; } -SEC("raw_tp") +SEC("?raw_tp") int trigger_driver(void *ctx) { int i; @@ -56,42 +56,42 @@ int trigger_driver(void *ctx) return 0; } -SEC("kprobe/bpf_get_numa_node_id") +SEC("?kprobe/bpf_get_numa_node_id") int bench_trigger_kprobe(void *ctx) { inc_counter(); return 0; } -SEC("kretprobe/bpf_get_numa_node_id") +SEC("?kretprobe/bpf_get_numa_node_id") int bench_trigger_kretprobe(void *ctx) { inc_counter(); return 0; } -SEC("kprobe.multi/bpf_get_numa_node_id") +SEC("?kprobe.multi/bpf_get_numa_node_id") int bench_trigger_kprobe_multi(void *ctx) { inc_counter(); return 0; } -SEC("kretprobe.multi/bpf_get_numa_node_id") +SEC("?kretprobe.multi/bpf_get_numa_node_id") int bench_trigger_kretprobe_multi(void *ctx) { inc_counter(); return 0; } -SEC("fentry/bpf_get_numa_node_id") +SEC("?fentry/bpf_get_numa_node_id") int bench_trigger_fentry(void *ctx) { inc_counter(); return 0; } -SEC("fexit/bpf_get_numa_node_id") +SEC("?fexit/bpf_get_numa_node_id") int bench_trigger_fexit(void *ctx) { inc_counter(); From 3124591f686115aca25d772c2ccb7b1e202c3197 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 26 Mar 2024 09:21:50 -0700 Subject: [PATCH 010/147] bpf: add bpf_modify_return_test_tp() kfunc triggering tracepoint Add a simple bpf_modify_return_test_tp() kfunc, available to all program types, that is useful for various testing and benchmarking scenarios, as it allows to trigger most tracing BPF program types from BPF side, allowing to do complex testing and benchmarking scenarios. It is also attachable to for fmod_ret programs, making it a good and simple way to trigger fmod_ret program under test/benchmark. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240326162151.3981687-6-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/trace/events/bpf_test_run.h | 17 +++++++++++++++++ kernel/bpf/helpers.c | 1 + net/bpf/test_run.c | 8 ++++++++ 3 files changed, 26 insertions(+) diff --git a/include/trace/events/bpf_test_run.h b/include/trace/events/bpf_test_run.h index 265447e3f71a..0c924d39b7cb 100644 --- a/include/trace/events/bpf_test_run.h +++ b/include/trace/events/bpf_test_run.h @@ -7,6 +7,23 @@ #include +TRACE_EVENT(bpf_trigger_tp, + + TP_PROTO(int nonce), + + TP_ARGS(nonce), + + TP_STRUCT__entry( + __field(int, nonce) + ), + + TP_fast_assign( + __entry->nonce = nonce; + ), + + TP_printk("nonce %d", __entry->nonce) +); + DECLARE_EVENT_CLASS(bpf_test_finish, TP_PROTO(int *err), diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index f860adab3eb9..d9e7aca8ae9e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2625,6 +2625,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_null) BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) +BTF_ID_FLAGS(func, bpf_modify_return_test_tp) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 61efeadaff8d..f6aad4ed2ab2 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -575,6 +575,13 @@ __bpf_kfunc int bpf_modify_return_test2(int a, int *b, short c, int d, return a + *b + c + d + (long)e + f + g; } +__bpf_kfunc int bpf_modify_return_test_tp(int nonce) +{ + trace_bpf_trigger_tp(nonce); + + return nonce; +} + int noinline bpf_fentry_shadow_test(int a) { return a + 1; @@ -622,6 +629,7 @@ __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_test_modify_return_ids) BTF_ID_FLAGS(func, bpf_modify_return_test) BTF_ID_FLAGS(func, bpf_modify_return_test2) +BTF_ID_FLAGS(func, bpf_modify_return_test_tp) BTF_ID_FLAGS(func, bpf_fentry_test1, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_test_modify_return_ids) From 985d0681b46be7db5ccc330d9a7f318b96ce0029 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 26 Mar 2024 09:21:51 -0700 Subject: [PATCH 011/147] selftests/bpf: add batched tp/raw_tp/fmodret tests Utilize bpf_modify_return_test_tp() kfunc to have a fast way to trigger tp/raw_tp/fmodret programs from another BPF program, which gives us comparable batched benchmarks to (batched) kprobe/fentry benchmarks. We don't switch kprobe/fentry batched benchmarks to this kfunc to make bench tool usable on older kernels as well. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240326162151.3981687-7-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bench.c | 6 +++ .../selftests/bpf/benchs/bench_trigger.c | 39 +++++++++++++++++++ .../selftests/bpf/benchs/run_bench_trigger.sh | 3 +- .../selftests/bpf/progs/trigger_bench.c | 34 ++++++++++++++++ 4 files changed, 81 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 8b16841a3dec..82de56c8162e 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -506,6 +506,9 @@ extern const struct bench bench_trig_kprobe_multi; extern const struct bench bench_trig_kretprobe_multi; extern const struct bench bench_trig_fentry; extern const struct bench bench_trig_fexit; +extern const struct bench bench_trig_fmodret; +extern const struct bench bench_trig_tp; +extern const struct bench bench_trig_rawtp; /* uprobe/uretprobe benchmarks */ extern const struct bench bench_trig_uprobe_nop; @@ -556,6 +559,9 @@ static const struct bench *benchs[] = { &bench_trig_kretprobe_multi, &bench_trig_fentry, &bench_trig_fexit, + &bench_trig_fmodret, + &bench_trig_tp, + &bench_trig_rawtp, /* uprobes */ &bench_trig_uprobe_nop, &bench_trig_uretprobe_nop, diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index 2c477808a6f0..4b05539f167d 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -233,6 +233,42 @@ static void trigger_fexit_setup(void) attach_bpf(ctx.skel->progs.bench_trigger_fexit); } +static void trigger_fmodret_setup(void) +{ + setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver_kfunc, true); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_fmodret, true); + load_ctx(); + /* override driver program */ + ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_driver_kfunc); + attach_bpf(ctx.skel->progs.bench_trigger_fmodret); +} + +static void trigger_tp_setup(void) +{ + setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver_kfunc, true); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_tp, true); + load_ctx(); + /* override driver program */ + ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_driver_kfunc); + attach_bpf(ctx.skel->progs.bench_trigger_tp); +} + +static void trigger_rawtp_setup(void) +{ + setup_ctx(); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false); + bpf_program__set_autoload(ctx.skel->progs.trigger_driver_kfunc, true); + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_rawtp, true); + load_ctx(); + /* override driver program */ + ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_driver_kfunc); + attach_bpf(ctx.skel->progs.bench_trigger_rawtp); +} + /* make sure call is not inlined and not avoided by compiler, so __weak and * inline asm volatile in the body of the function * @@ -395,6 +431,9 @@ BENCH_TRIG_KERNEL(kprobe_multi, "kprobe-multi"); BENCH_TRIG_KERNEL(kretprobe_multi, "kretprobe-multi"); BENCH_TRIG_KERNEL(fentry, "fentry"); BENCH_TRIG_KERNEL(fexit, "fexit"); +BENCH_TRIG_KERNEL(fmodret, "fmodret"); +BENCH_TRIG_KERNEL(tp, "tp"); +BENCH_TRIG_KERNEL(rawtp, "rawtp"); /* uprobe benchmarks */ #define BENCH_TRIG_USERMODE(KIND, PRODUCER, NAME) \ diff --git a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh index 6e790e294260..a690f5a68b6b 100755 --- a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh +++ b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh @@ -4,7 +4,8 @@ set -eufo pipefail def_tests=( \ usermode-count kernel-count syscall-count \ - fentry fexit \ + fentry fexit fmodret \ + rawtp tp \ kprobe kprobe-multi \ kretprobe kretprobe-multi \ ) diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c index 07587cb3c9f5..2619ed193c65 100644 --- a/tools/testing/selftests/bpf/progs/trigger_bench.c +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -56,6 +56,19 @@ int trigger_driver(void *ctx) return 0; } +extern int bpf_modify_return_test_tp(int nonce) __ksym __weak; + +SEC("?raw_tp") +int trigger_driver_kfunc(void *ctx) +{ + int i; + + for (i = 0; i < batch_iters; i++) + (void)bpf_modify_return_test_tp(0); /* attach point for benchmarking */ + + return 0; +} + SEC("?kprobe/bpf_get_numa_node_id") int bench_trigger_kprobe(void *ctx) { @@ -97,3 +110,24 @@ int bench_trigger_fexit(void *ctx) inc_counter(); return 0; } + +SEC("?fmod_ret/bpf_modify_return_test_tp") +int bench_trigger_fmodret(void *ctx) +{ + inc_counter(); + return -22; +} + +SEC("?tp/bpf_test_run/bpf_trigger_tp") +int bench_trigger_tp(void *ctx) +{ + inc_counter(); + return 0; +} + +SEC("?raw_tp/bpf_trigger_tp") +int bench_trigger_rawtp(void *ctx) +{ + inc_counter(); + return 0; +} From ee3bad033d01066348913f3220ea81987721ed53 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 27 Mar 2024 11:20:22 +0800 Subject: [PATCH 012/147] bpf: Mitigate latency spikes associated with freeing non-preallocated htab Following the recent upgrade of one of our BPF programs, we encountered significant latency spikes affecting other applications running on the same host. After thorough investigation, we identified that these spikes were primarily caused by the prolonged duration required to free a non-preallocated htab with approximately 2 million keys. Notably, our kernel configuration lacks the presence of CONFIG_PREEMPT. In scenarios where kernel execution extends excessively, other threads might be starved of CPU time, resulting in latency issues across the system. To mitigate this, we've adopted a proactive approach by incorporating cond_resched() calls within the kernel code. This ensures that during lengthy kernel operations, the scheduler is invoked periodically to provide opportunities for other threads to execute. Signed-off-by: Yafang Shao Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240327032022.78391-1-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3a088a5349bc..e81059faae63 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1490,6 +1490,7 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_nulls_del_rcu(&l->hash_node); htab_elem_free(htab, l); } + cond_resched(); } migrate_enable(); } From 42e4ebd390be8e0090d64d58433b6cba45d919e9 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 22 Mar 2024 12:14:32 -0700 Subject: [PATCH 013/147] bpf: Remove CONFIG_X86 and CONFIG_DYNAMIC_FTRACE guard from the tcp-cc kfuncs The commit 7aae231ac93b ("bpf: tcp: Limit calling some tcp cc functions to CONFIG_DYNAMIC_FTRACE") added CONFIG_DYNAMIC_FTRACE guard because pahole was only generating btf for ftrace-able functions. The ftrace filter had already been removed from pahole, so the CONFIG_DYNAMIC_FTRACE guard can be removed. The commit 569c484f9995 ("bpf: Limit static tcp-cc functions in the .BTF_ids list to x86") has added CONFIG_X86 guard because it failed the powerpc arch which prepended a "." to the local static function, so "cubictcp_init" becomes ".cubictcp_init". "__bpf_kfunc" has been added to kfunc since then and it uses the __unused compiler attribute. There is an existing "__bpf_kfunc static u32 bpf_kfunc_call_test_static_unused_arg(u32 arg, u32 unused)" test in bpf_testmod.c to cover the static kfunc case. cross compile on ppc64 with CONFIG_DYNAMIC_FTRACE disabled: > readelf -s vmlinux | grep cubictcp_ 56938: c00000000144fd00 184 FUNC LOCAL DEFAULT 2 cubictcp_cwnd_event [: 8] 56939: c00000000144fdb8 200 FUNC LOCAL DEFAULT 2 cubictcp_recalc_[...] [: 8] 56940: c00000000144fe80 296 FUNC LOCAL DEFAULT 2 cubictcp_init [: 8] 56941: c00000000144ffa8 228 FUNC LOCAL DEFAULT 2 cubictcp_state [: 8] 56942: c00000000145008c 1908 FUNC LOCAL DEFAULT 2 cubictcp_cong_avoid [: 8] 56943: c000000001450800 1644 FUNC LOCAL DEFAULT 2 cubictcp_acked [: 8] > bpftool btf dump file vmlinux | grep cubictcp_ [51540] FUNC 'cubictcp_acked' type_id=38137 linkage=static [51541] FUNC 'cubictcp_cong_avoid' type_id=38122 linkage=static [51543] FUNC 'cubictcp_cwnd_event' type_id=51542 linkage=static [51544] FUNC 'cubictcp_init' type_id=9186 linkage=static [51545] FUNC 'cubictcp_recalc_ssthresh' type_id=35021 linkage=static [51547] FUNC 'cubictcp_state' type_id=38141 linkage=static The patch removed both config guards. Cc: Jiri Olsa Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240322191433.4133280-1-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- net/ipv4/tcp_bbr.c | 4 ---- net/ipv4/tcp_cubic.c | 4 ---- net/ipv4/tcp_dctcp.c | 4 ---- 3 files changed, 12 deletions(-) diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 05dc2d05bc7c..7e52ab24e40a 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -1156,8 +1156,6 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { }; BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids) -#ifdef CONFIG_X86 -#ifdef CONFIG_DYNAMIC_FTRACE BTF_ID_FLAGS(func, bbr_init) BTF_ID_FLAGS(func, bbr_main) BTF_ID_FLAGS(func, bbr_sndbuf_expand) @@ -1166,8 +1164,6 @@ BTF_ID_FLAGS(func, bbr_cwnd_event) BTF_ID_FLAGS(func, bbr_ssthresh) BTF_ID_FLAGS(func, bbr_min_tso_segs) BTF_ID_FLAGS(func, bbr_set_state) -#endif -#endif BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) static const struct btf_kfunc_id_set tcp_bbr_kfunc_set = { diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 44869ea089e3..5dbed91c6178 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -486,16 +486,12 @@ static struct tcp_congestion_ops cubictcp __read_mostly = { }; BTF_KFUNCS_START(tcp_cubic_check_kfunc_ids) -#ifdef CONFIG_X86 -#ifdef CONFIG_DYNAMIC_FTRACE BTF_ID_FLAGS(func, cubictcp_init) BTF_ID_FLAGS(func, cubictcp_recalc_ssthresh) BTF_ID_FLAGS(func, cubictcp_cong_avoid) BTF_ID_FLAGS(func, cubictcp_state) BTF_ID_FLAGS(func, cubictcp_cwnd_event) BTF_ID_FLAGS(func, cubictcp_acked) -#endif -#endif BTF_KFUNCS_END(tcp_cubic_check_kfunc_ids) static const struct btf_kfunc_id_set tcp_cubic_kfunc_set = { diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index e33fbe4933e4..6b712a33d49f 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -261,16 +261,12 @@ static struct tcp_congestion_ops dctcp_reno __read_mostly = { }; BTF_KFUNCS_START(tcp_dctcp_check_kfunc_ids) -#ifdef CONFIG_X86 -#ifdef CONFIG_DYNAMIC_FTRACE BTF_ID_FLAGS(func, dctcp_init) BTF_ID_FLAGS(func, dctcp_update_alpha) BTF_ID_FLAGS(func, dctcp_cwnd_event) BTF_ID_FLAGS(func, dctcp_ssthresh) BTF_ID_FLAGS(func, dctcp_cwnd_undo) BTF_ID_FLAGS(func, dctcp_state) -#endif -#endif BTF_KFUNCS_END(tcp_dctcp_check_kfunc_ids) static const struct btf_kfunc_id_set tcp_dctcp_kfunc_set = { From 5da7fb04902b0f0fcd13bc5ef216e232fa971efa Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 22 Mar 2024 12:14:33 -0700 Subject: [PATCH 014/147] selftests/bpf: Test loading bpf-tcp-cc prog calling the kernel tcp-cc kfuncs This patch adds a test to ensure all static tcp-cc kfuncs is visible to the struct_ops bpf programs. It is checked by successfully loading the struct_ops programs calling these tcp-cc kfuncs. This patch needs to enable the CONFIG_TCP_CONG_DCTCP and the CONFIG_TCP_CONG_BBR. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240322191433.4133280-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/config | 2 + .../selftests/bpf/prog_tests/bpf_tcp_ca.c | 12 ++ .../selftests/bpf/progs/tcp_ca_kfunc.c | 121 ++++++++++++++++++ 3 files changed, 135 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 01f241ea2c67..afd675b1bf80 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -88,3 +88,5 @@ CONFIG_VSOCKETS=y CONFIG_VXLAN=y CONFIG_XDP_SOCKETS=y CONFIG_XFRM_INTERFACE=y +CONFIG_TCP_CONG_DCTCP=y +CONFIG_TCP_CONG_BBR=y diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 99dc60fe59d5..332ed54cc6af 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -13,6 +13,7 @@ #include "tcp_ca_write_sk_pacing.skel.h" #include "tcp_ca_incompl_cong_ops.skel.h" #include "tcp_ca_unsupp_cong_op.skel.h" +#include "tcp_ca_kfunc.skel.h" #ifndef ENOTSUPP #define ENOTSUPP 524 @@ -518,6 +519,15 @@ static void test_link_replace(void) tcp_ca_update__destroy(skel); } +static void test_tcp_ca_kfunc(void) +{ + struct tcp_ca_kfunc *skel; + + skel = tcp_ca_kfunc__open_and_load(); + ASSERT_OK_PTR(skel, "tcp_ca_kfunc__open_and_load"); + tcp_ca_kfunc__destroy(skel); +} + void test_bpf_tcp_ca(void) { if (test__start_subtest("dctcp")) @@ -546,4 +556,6 @@ void test_bpf_tcp_ca(void) test_multi_links(); if (test__start_subtest("link_replace")) test_link_replace(); + if (test__start_subtest("tcp_ca_kfunc")) + test_tcp_ca_kfunc(); } diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c new file mode 100644 index 000000000000..fcfbfe0336b4 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Facebook */ + +#include "vmlinux.h" +#include + +extern void bbr_init(struct sock *sk) __ksym; +extern void bbr_main(struct sock *sk, const struct rate_sample *rs) __ksym; +extern u32 bbr_sndbuf_expand(struct sock *sk) __ksym; +extern u32 bbr_undo_cwnd(struct sock *sk) __ksym; +extern void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym; +extern u32 bbr_ssthresh(struct sock *sk) __ksym; +extern u32 bbr_min_tso_segs(struct sock *sk) __ksym; +extern void bbr_set_state(struct sock *sk, u8 new_state) __ksym; + +extern void dctcp_init(struct sock *sk) __ksym; +extern void dctcp_update_alpha(struct sock *sk, u32 flags) __ksym; +extern void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) __ksym; +extern u32 dctcp_ssthresh(struct sock *sk) __ksym; +extern u32 dctcp_cwnd_undo(struct sock *sk) __ksym; +extern void dctcp_state(struct sock *sk, u8 new_state) __ksym; + +extern void cubictcp_init(struct sock *sk) __ksym; +extern u32 cubictcp_recalc_ssthresh(struct sock *sk) __ksym; +extern void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) __ksym; +extern void cubictcp_state(struct sock *sk, u8 new_state) __ksym; +extern void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym; +extern void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) __ksym; + +SEC("struct_ops/init") +void BPF_PROG(init, struct sock *sk) +{ + bbr_init(sk); + dctcp_init(sk); + cubictcp_init(sk); +} + +SEC("struct_ops/in_ack_event") +void BPF_PROG(in_ack_event, struct sock *sk, u32 flags) +{ + dctcp_update_alpha(sk, flags); +} + +SEC("struct_ops/cong_control") +void BPF_PROG(cong_control, struct sock *sk, const struct rate_sample *rs) +{ + bbr_main(sk, rs); +} + +SEC("struct_ops/cong_avoid") +void BPF_PROG(cong_avoid, struct sock *sk, u32 ack, u32 acked) +{ + cubictcp_cong_avoid(sk, ack, acked); +} + +SEC("struct_ops/sndbuf_expand") +u32 BPF_PROG(sndbuf_expand, struct sock *sk) +{ + return bbr_sndbuf_expand(sk); +} + +SEC("struct_ops/undo_cwnd") +u32 BPF_PROG(undo_cwnd, struct sock *sk) +{ + bbr_undo_cwnd(sk); + return dctcp_cwnd_undo(sk); +} + +SEC("struct_ops/cwnd_event") +void BPF_PROG(cwnd_event, struct sock *sk, enum tcp_ca_event event) +{ + bbr_cwnd_event(sk, event); + dctcp_cwnd_event(sk, event); + cubictcp_cwnd_event(sk, event); +} + +SEC("struct_ops/ssthresh") +u32 BPF_PROG(ssthresh, struct sock *sk) +{ + bbr_ssthresh(sk); + dctcp_ssthresh(sk); + return cubictcp_recalc_ssthresh(sk); +} + +SEC("struct_ops/min_tso_segs") +u32 BPF_PROG(min_tso_segs, struct sock *sk) +{ + return bbr_min_tso_segs(sk); +} + +SEC("struct_ops/set_state") +void BPF_PROG(set_state, struct sock *sk, u8 new_state) +{ + bbr_set_state(sk, new_state); + dctcp_state(sk, new_state); + cubictcp_state(sk, new_state); +} + +SEC("struct_ops/pkts_acked") +void BPF_PROG(pkts_acked, struct sock *sk, const struct ack_sample *sample) +{ + cubictcp_acked(sk, sample); +} + +SEC(".struct_ops") +struct tcp_congestion_ops tcp_ca_kfunc = { + .init = (void *)init, + .in_ack_event = (void *)in_ack_event, + .cong_control = (void *)cong_control, + .cong_avoid = (void *)cong_avoid, + .sndbuf_expand = (void *)sndbuf_expand, + .undo_cwnd = (void *)undo_cwnd, + .cwnd_event = (void *)cwnd_event, + .ssthresh = (void *)ssthresh, + .min_tso_segs = (void *)min_tso_segs, + .set_state = (void *)set_state, + .pkts_acked = (void *)pkts_acked, + .name = "tcp_ca_kfunc", +}; + +char _license[] SEC("license") = "GPL"; From cdfd9cc3ba147ceea650afa6b7031e31a98d500e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 25 Mar 2024 21:14:48 -0700 Subject: [PATCH 015/147] selftests/bpf: Replace CHECK with ASSERT macros for ksyms test Replace CHECK with ASSERT macros for ksyms tests. This test failed earlier with clang lto kernel, but the issue is gone with latest code base. But replacing CHECK with ASSERT still improves code as ASSERT is preferred in selftests. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240326041448.1197812-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/ksyms.c | 30 +++++++------------ 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms.c b/tools/testing/selftests/bpf/prog_tests/ksyms.c index b295969b263b..dc7aab532fb1 100644 --- a/tools/testing/selftests/bpf/prog_tests/ksyms.c +++ b/tools/testing/selftests/bpf/prog_tests/ksyms.c @@ -5,8 +5,6 @@ #include "test_ksyms.skel.h" #include -static int duration; - void test_ksyms(void) { const char *btf_path = "/sys/kernel/btf/vmlinux"; @@ -18,43 +16,37 @@ void test_ksyms(void) int err; err = kallsyms_find("bpf_link_fops", &link_fops_addr); - if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno)) + if (!ASSERT_NEQ(err, -EINVAL, "bpf_link_fops: kallsyms_fopen")) return; - if (CHECK(err == -ENOENT, "ksym_find", "symbol 'bpf_link_fops' not found\n")) + if (!ASSERT_NEQ(err, -ENOENT, "bpf_link_fops: ksym_find")) return; err = kallsyms_find("__per_cpu_start", &per_cpu_start_addr); - if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno)) + if (!ASSERT_NEQ(err, -EINVAL, "__per_cpu_start: kallsyms_fopen")) return; - if (CHECK(err == -ENOENT, "ksym_find", "symbol 'per_cpu_start' not found\n")) + if (!ASSERT_NEQ(err, -ENOENT, "__per_cpu_start: ksym_find")) return; - if (CHECK(stat(btf_path, &st), "stat_btf", "err %d\n", errno)) + if (!ASSERT_OK(stat(btf_path, &st), "stat_btf")) return; btf_size = st.st_size; skel = test_ksyms__open_and_load(); - if (CHECK(!skel, "skel_open", "failed to open and load skeleton\n")) + if (!ASSERT_OK_PTR(skel, "test_ksyms__open_and_load")) return; err = test_ksyms__attach(skel); - if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err)) + if (!ASSERT_OK(err, "test_ksyms__attach")) goto cleanup; /* trigger tracepoint */ usleep(1); data = skel->data; - CHECK(data->out__bpf_link_fops != link_fops_addr, "bpf_link_fops", - "got 0x%llx, exp 0x%llx\n", - data->out__bpf_link_fops, link_fops_addr); - CHECK(data->out__bpf_link_fops1 != 0, "bpf_link_fops1", - "got %llu, exp %llu\n", data->out__bpf_link_fops1, (__u64)0); - CHECK(data->out__btf_size != btf_size, "btf_size", - "got %llu, exp %llu\n", data->out__btf_size, btf_size); - CHECK(data->out__per_cpu_start != per_cpu_start_addr, "__per_cpu_start", - "got %llu, exp %llu\n", data->out__per_cpu_start, - per_cpu_start_addr); + ASSERT_EQ(data->out__bpf_link_fops, link_fops_addr, "bpf_link_fops"); + ASSERT_EQ(data->out__bpf_link_fops1, 0, "bpf_link_fops1"); + ASSERT_EQ(data->out__btf_size, btf_size, "btf_size"); + ASSERT_EQ(data->out__per_cpu_start, per_cpu_start_addr, "__per_cpu_start"); cleanup: test_ksyms__destroy(skel); From ad2b05286e94485070475e473963724fa657491c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 25 Mar 2024 21:14:53 -0700 Subject: [PATCH 016/147] libbpf: Mark libbpf_kallsyms_parse static function Currently libbpf_kallsyms_parse() function is declared as a global function but actually it is not a API and there is no external users in bpftool/bpf-selftests. So let us mark the function as static. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240326041453.1197949-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 5 ++++- tools/lib/bpf/libbpf_internal.h | 5 ----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index d7d8f78f8846..1822fb8a02d2 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -7986,7 +7986,10 @@ static int bpf_object__sanitize_maps(struct bpf_object *obj) return 0; } -int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *ctx) +typedef int (*kallsyms_cb_t)(unsigned long long sym_addr, char sym_type, + const char *sym_name, void *ctx); + +static int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *ctx) { char sym_type, sym_name[500]; unsigned long long sym_addr; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 864b36177424..a0dcfb82e455 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -518,11 +518,6 @@ int btf_ext_visit_str_offs(struct btf_ext *btf_ext, str_off_visit_fn visit, void __s32 btf__find_by_name_kind_own(const struct btf *btf, const char *type_name, __u32 kind); -typedef int (*kallsyms_cb_t)(unsigned long long sym_addr, char sym_type, - const char *sym_name, void *ctx); - -int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *arg); - /* handle direct returned errors */ static inline int libbpf_err(int ret) { From c56e59776f46d77984329488878a52baf4969457 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 25 Mar 2024 21:14:58 -0700 Subject: [PATCH 017/147] libbpf: Handle .llvm. symbol properly With CONFIG_LTO_CLANG_THIN enabled, with some of previous version of kernel code base ([1]), I hit the following error: test_ksyms:PASS:kallsyms_fopen 0 nsec test_ksyms:FAIL:ksym_find symbol 'bpf_link_fops' not found #118 ksyms:FAIL The reason is that 'bpf_link_fops' is renamed to bpf_link_fops.llvm.8325593422554671469 Due to cross-file inlining, the static variable 'bpf_link_fops' in syscall.c is used by a function in another file. To avoid potential duplicated names, the llvm added suffix '.llvm.' ([2]) to 'bpf_link_fops' variable. Such renaming caused a problem in libbpf if 'bpf_link_fops' is used in bpf prog as a ksym but 'bpf_link_fops' does not match any symbol in /proc/kallsyms. To fix this issue, libbpf needs to understand that suffix '.llvm.' is caused by clang lto kernel and to process such symbols properly. With latest bpf-next code base built with CONFIG_LTO_CLANG_THIN, I cannot reproduce the above failure any more. But such an issue could happen with other symbols or in the future for bpf_link_fops symbol. For example, with my current kernel, I got the following from /proc/kallsyms: ffffffff84782154 d __func__.net_ratelimit.llvm.6135436931166841955 ffffffff85f0a500 d tk_core.llvm.726630847145216431 ffffffff85fdb960 d __fs_reclaim_map.llvm.10487989720912350772 ffffffff864c7300 d fake_dst_ops.llvm.54750082607048300 I could not easily create a selftest to test newly-added libbpf functionality with a static C test since I do not know which symbol is cross-file inlined. But based on my particular kernel, the following test change can run successfully. > diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms.c b/tools/testing/selftests/bpf/prog_tests/ksyms.c > index 6a86d1f07800..904a103f7b1d 100644 > --- a/tools/testing/selftests/bpf/prog_tests/ksyms.c > +++ b/tools/testing/selftests/bpf/prog_tests/ksyms.c > @@ -42,6 +42,7 @@ void test_ksyms(void) > ASSERT_EQ(data->out__bpf_link_fops, link_fops_addr, "bpf_link_fops"); > ASSERT_EQ(data->out__bpf_link_fops1, 0, "bpf_link_fops1"); > ASSERT_EQ(data->out__btf_size, btf_size, "btf_size"); > + ASSERT_NEQ(data->out__fake_dst_ops, 0, "fake_dst_ops"); > ASSERT_EQ(data->out__per_cpu_start, per_cpu_start_addr, "__per_cpu_start"); > > cleanup: > diff --git a/tools/testing/selftests/bpf/progs/test_ksyms.c b/tools/testing/selftests/bpf/progs/test_ksyms.c > index 6c9cbb5a3bdf..fe91eef54b66 100644 > --- a/tools/testing/selftests/bpf/progs/test_ksyms.c > +++ b/tools/testing/selftests/bpf/progs/test_ksyms.c > @@ -9,11 +9,13 @@ __u64 out__bpf_link_fops = -1; > __u64 out__bpf_link_fops1 = -1; > __u64 out__btf_size = -1; > __u64 out__per_cpu_start = -1; > +__u64 out__fake_dst_ops = -1; > > extern const void bpf_link_fops __ksym; > extern const void __start_BTF __ksym; > extern const void __stop_BTF __ksym; > extern const void __per_cpu_start __ksym; > +extern const void fake_dst_ops __ksym; > /* non-existing symbol, weak, default to zero */ > extern const void bpf_link_fops1 __ksym __weak; > > @@ -23,6 +25,7 @@ int handler(const void *ctx) > out__bpf_link_fops = (__u64)&bpf_link_fops; > out__btf_size = (__u64)(&__stop_BTF - &__start_BTF); > out__per_cpu_start = (__u64)&__per_cpu_start; > + out__fake_dst_ops = (__u64)&fake_dst_ops; > > out__bpf_link_fops1 = (__u64)&bpf_link_fops1; This patch fixed the issue in libbpf such that the suffix '.llvm.' will be ignored during comparison of bpf prog ksym vs. symbols in /proc/kallsyms, this resolved the issue. Currently, only static variables in /proc/kallsyms are checked with '.llvm.' suffix since in bpf programs function ksyms with '.llvm.' suffix are most likely kfunc's and unlikely to be cross-file inlined. Note that currently kernel does not support gcc build with lto. [1] https://lore.kernel.org/bpf/20240302165017.1627295-1-yonghong.song@linux.dev/ [2] https://github.com/llvm/llvm-project/blob/release/18.x/llvm/include/llvm/IR/ModuleSummaryIndex.h#L1714-L1719 Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240326041458.1198161-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 1822fb8a02d2..b091154bc5b5 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1970,6 +1970,20 @@ static struct extern_desc *find_extern_by_name(const struct bpf_object *obj, return NULL; } +static struct extern_desc *find_extern_by_name_with_len(const struct bpf_object *obj, + const void *name, int len) +{ + const char *ext_name; + int i; + + for (i = 0; i < obj->nr_extern; i++) { + ext_name = obj->externs[i].name; + if (strlen(ext_name) == len && strncmp(ext_name, name, len) == 0) + return &obj->externs[i]; + } + return NULL; +} + static int set_kcfg_value_tri(struct extern_desc *ext, void *ext_val, char value) { @@ -8029,8 +8043,13 @@ static int kallsyms_cb(unsigned long long sym_addr, char sym_type, struct bpf_object *obj = ctx; const struct btf_type *t; struct extern_desc *ext; + char *res; - ext = find_extern_by_name(obj, sym_name); + res = strstr(sym_name, ".llvm."); + if (sym_type == 'd' && res) + ext = find_extern_by_name_with_len(obj, sym_name, res - sym_name); + else + ext = find_extern_by_name(obj, sym_name); if (!ext || ext->type != EXT_KSYM) return 0; From d1320649346c36c5ba7b579533bf518960ef71e1 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 25 Mar 2024 21:15:03 -0700 Subject: [PATCH 018/147] selftests/bpf: Refactor some functions for kprobe_multi_test Refactor some functions in kprobe_multi_test.c to extract some helper functions who will be used in later patches to avoid code duplication. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240326041503.1198982-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/kprobe_multi_test.c | 100 +++++++++++------- 1 file changed, 60 insertions(+), 40 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c index 05000810e28e..46e28edda595 100644 --- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c @@ -336,6 +336,37 @@ static bool symbol_equal(long key1, long key2, void *ctx __maybe_unused) return strcmp((const char *) key1, (const char *) key2) == 0; } +static bool is_invalid_entry(char *buf, bool kernel) +{ + if (kernel && strchr(buf, '[')) + return true; + if (!kernel && !strchr(buf, '[')) + return true; + return false; +} + +static bool skip_entry(char *name) +{ + /* + * We attach to almost all kernel functions and some of them + * will cause 'suspicious RCU usage' when fprobe is attached + * to them. Filter out the current culprits - arch_cpu_idle + * default_idle and rcu_* functions. + */ + if (!strcmp(name, "arch_cpu_idle")) + return true; + if (!strcmp(name, "default_idle")) + return true; + if (!strncmp(name, "rcu_", 4)) + return true; + if (!strcmp(name, "bpf_dispatcher_xdp_func")) + return true; + if (!strncmp(name, "__ftrace_invalid_address__", + sizeof("__ftrace_invalid_address__") - 1)) + return true; + return false; +} + static int get_syms(char ***symsp, size_t *cntp, bool kernel) { size_t cap = 0, cnt = 0, i; @@ -368,30 +399,13 @@ static int get_syms(char ***symsp, size_t *cntp, bool kernel) } while (fgets(buf, sizeof(buf), f)) { - if (kernel && strchr(buf, '[')) - continue; - if (!kernel && !strchr(buf, '[')) + if (is_invalid_entry(buf, kernel)) continue; free(name); if (sscanf(buf, "%ms$*[^\n]\n", &name) != 1) continue; - /* - * We attach to almost all kernel functions and some of them - * will cause 'suspicious RCU usage' when fprobe is attached - * to them. Filter out the current culprits - arch_cpu_idle - * default_idle and rcu_* functions. - */ - if (!strcmp(name, "arch_cpu_idle")) - continue; - if (!strcmp(name, "default_idle")) - continue; - if (!strncmp(name, "rcu_", 4)) - continue; - if (!strcmp(name, "bpf_dispatcher_xdp_func")) - continue; - if (!strncmp(name, "__ftrace_invalid_address__", - sizeof("__ftrace_invalid_address__") - 1)) + if (skip_entry(name)) continue; err = hashmap__add(map, name, 0); @@ -426,14 +440,37 @@ error: return err; } -static void test_kprobe_multi_bench_attach(bool kernel) +static void do_bench_test(struct kprobe_multi_empty *skel, struct bpf_kprobe_multi_opts *opts) { - LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); - struct kprobe_multi_empty *skel = NULL; long attach_start_ns, attach_end_ns; long detach_start_ns, detach_end_ns; double attach_delta, detach_delta; struct bpf_link *link = NULL; + + attach_start_ns = get_time_ns(); + link = bpf_program__attach_kprobe_multi_opts(skel->progs.test_kprobe_empty, + NULL, opts); + attach_end_ns = get_time_ns(); + + if (!ASSERT_OK_PTR(link, "bpf_program__attach_kprobe_multi_opts")) + return; + + detach_start_ns = get_time_ns(); + bpf_link__destroy(link); + detach_end_ns = get_time_ns(); + + attach_delta = (attach_end_ns - attach_start_ns) / 1000000000.0; + detach_delta = (detach_end_ns - detach_start_ns) / 1000000000.0; + + printf("%s: found %lu functions\n", __func__, opts->cnt); + printf("%s: attached in %7.3lfs\n", __func__, attach_delta); + printf("%s: detached in %7.3lfs\n", __func__, detach_delta); +} + +static void test_kprobe_multi_bench_attach(bool kernel) +{ + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); + struct kprobe_multi_empty *skel = NULL; char **syms = NULL; size_t cnt = 0, i; @@ -447,24 +484,7 @@ static void test_kprobe_multi_bench_attach(bool kernel) opts.syms = (const char **) syms; opts.cnt = cnt; - attach_start_ns = get_time_ns(); - link = bpf_program__attach_kprobe_multi_opts(skel->progs.test_kprobe_empty, - NULL, &opts); - attach_end_ns = get_time_ns(); - - if (!ASSERT_OK_PTR(link, "bpf_program__attach_kprobe_multi_opts")) - goto cleanup; - - detach_start_ns = get_time_ns(); - bpf_link__destroy(link); - detach_end_ns = get_time_ns(); - - attach_delta = (attach_end_ns - attach_start_ns) / 1000000000.0; - detach_delta = (detach_end_ns - detach_start_ns) / 1000000000.0; - - printf("%s: found %lu functions\n", __func__, cnt); - printf("%s: attached in %7.3lfs\n", __func__, attach_delta); - printf("%s: detached in %7.3lfs\n", __func__, detach_delta); + do_bench_test(skel, &opts); cleanup: kprobe_multi_empty__destroy(skel); From 9475dacb75e0b5efae086dc904f4d27c31f15157 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 25 Mar 2024 21:15:08 -0700 Subject: [PATCH 019/147] selftests/bpf: Refactor trace helper func load_kallsyms_local() Refactor trace helper function load_kallsyms_local() such that it invokes a common function with a compare function as input. The common function will be used later for other local functions. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240326041508.1199239-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/trace_helpers.c | 19 ++++++++++++------- tools/testing/selftests/bpf/trace_helpers.h | 2 ++ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 27fd7ed3e4b0..fc9de4a89aea 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -61,12 +61,7 @@ void free_kallsyms_local(struct ksyms *ksyms) free(ksyms); } -static int ksym_cmp(const void *p1, const void *p2) -{ - return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr; -} - -struct ksyms *load_kallsyms_local(void) +static struct ksyms *load_kallsyms_local_common(ksym_cmp_t cmp_cb) { FILE *f; char func[256], buf[256]; @@ -100,7 +95,7 @@ struct ksyms *load_kallsyms_local(void) goto error; } fclose(f); - qsort(ksyms->syms, ksyms->sym_cnt, sizeof(struct ksym), ksym_cmp); + qsort(ksyms->syms, ksyms->sym_cnt, sizeof(struct ksym), cmp_cb); return ksyms; error: @@ -109,6 +104,16 @@ error: return NULL; } +static int ksym_cmp(const void *p1, const void *p2) +{ + return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr; +} + +struct ksyms *load_kallsyms_local(void) +{ + return load_kallsyms_local_common(ksym_cmp); +} + int load_kallsyms(void) { pthread_mutex_lock(&ksyms_mutex); diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h index 04fd1da7079d..9da0d0484eed 100644 --- a/tools/testing/selftests/bpf/trace_helpers.h +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -13,6 +13,8 @@ struct ksym { }; struct ksyms; +typedef int (*ksym_cmp_t)(const void *p1, const void *p2); + int load_kallsyms(void); struct ksym *ksym_search(long key); long ksym_get_addr(const char *name); From d1f02581059e42d8daf944aae2a296254cc7a5d5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 25 Mar 2024 21:15:13 -0700 Subject: [PATCH 020/147] selftests/bpf: Add {load,search}_kallsyms_custom_local() These two functions allow selftests to do loading/searching kallsyms based on their specific compare functions. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240326041513.1199440-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/trace_helpers.c | 27 +++++++++++++++++++++ tools/testing/selftests/bpf/trace_helpers.h | 5 ++++ 2 files changed, 32 insertions(+) diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index fc9de4a89aea..7f45b4cb41fe 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -114,6 +114,11 @@ struct ksyms *load_kallsyms_local(void) return load_kallsyms_local_common(ksym_cmp); } +struct ksyms *load_kallsyms_custom_local(ksym_cmp_t cmp_cb) +{ + return load_kallsyms_local_common(cmp_cb); +} + int load_kallsyms(void) { pthread_mutex_lock(&ksyms_mutex); @@ -153,6 +158,28 @@ struct ksym *ksym_search_local(struct ksyms *ksyms, long key) return &ksyms->syms[0]; } +struct ksym *search_kallsyms_custom_local(struct ksyms *ksyms, const void *p, + ksym_search_cmp_t cmp_cb) +{ + int start = 0, mid, end = ksyms->sym_cnt; + struct ksym *ks; + int result; + + while (start < end) { + mid = start + (end - start) / 2; + ks = &ksyms->syms[mid]; + result = cmp_cb(p, ks); + if (result < 0) + end = mid; + else if (result > 0) + start = mid + 1; + else + return ks; + } + + return NULL; +} + struct ksym *ksym_search(long key) { if (!ksyms) diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h index 9da0d0484eed..d1ed71789049 100644 --- a/tools/testing/selftests/bpf/trace_helpers.h +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -14,6 +14,7 @@ struct ksym { struct ksyms; typedef int (*ksym_cmp_t)(const void *p1, const void *p2); +typedef int (*ksym_search_cmp_t)(const void *p1, const struct ksym *p2); int load_kallsyms(void); struct ksym *ksym_search(long key); @@ -24,6 +25,10 @@ struct ksym *ksym_search_local(struct ksyms *ksyms, long key); long ksym_get_addr_local(struct ksyms *ksyms, const char *name); void free_kallsyms_local(struct ksyms *ksyms); +struct ksyms *load_kallsyms_custom_local(ksym_cmp_t cmp_cb); +struct ksym *search_kallsyms_custom_local(struct ksyms *ksyms, const void *p1, + ksym_search_cmp_t cmp_cb); + /* open kallsyms and find addresses on the fly, faster than load + search. */ int kallsyms_find(const char *sym, unsigned long long *addr); From 9edaafadc2c50f2af99ee5b3bad6831e1b6ad54f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 25 Mar 2024 21:15:18 -0700 Subject: [PATCH 021/147] selftests/bpf: Fix kprobe_multi_bench_attach test failure with LTO kernel In my locally build clang LTO kernel (enabling CONFIG_LTO and CONFIG_LTO_CLANG_THIN), kprobe_multi_bench_attach/kernel subtest failed like: test_kprobe_multi_bench_attach:PASS:get_syms 0 nsec test_kprobe_multi_bench_attach:PASS:kprobe_multi_empty__open_and_load 0 nsec libbpf: prog 'test_kprobe_empty': failed to attach: No such process test_kprobe_multi_bench_attach:FAIL:bpf_program__attach_kprobe_multi_opts unexpected error: -3 #117/1 kprobe_multi_bench_attach/kernel:FAIL There are multiple symbols in /sys/kernel/debug/tracing/available_filter_functions are renamed in /proc/kallsyms due to cross file inlining. One example is for static function __access_remote_vm in mm/memory.c. In a non-LTO kernel, we have the following call stack: ptrace_access_vm (global, kernel/ptrace.c) access_remote_vm (global, mm/memory.c) __access_remote_vm (static, mm/memory.c) With LTO kernel, it is possible that access_remote_vm() is inlined by ptrace_access_vm(). So we end up with the following call stack: ptrace_access_vm (global, kernel/ptrace.c) __access_remote_vm (static, mm/memory.c) The compiler renames __access_remote_vm to __access_remote_vm.llvm. to prevent potential name collision. The kernel bpf_kprobe_multi_link_attach() and ftrace_lookup_symbols() try to find addresses based on /proc/kallsyms, hence the current test failed with LTO kenrel. This patch consulted /proc/kallsyms to find the corresponding entries for the ksym and this solved the issue. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240326041518.1199758-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/kprobe_multi_test.c | 62 ++++++++++++++----- 1 file changed, 48 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c index 46e28edda595..3b9059164360 100644 --- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c @@ -367,15 +367,49 @@ static bool skip_entry(char *name) return false; } +/* Do comparision by ignoring '.llvm.' suffixes. */ +static int compare_name(const char *name1, const char *name2) +{ + const char *res1, *res2; + int len1, len2; + + res1 = strstr(name1, ".llvm."); + res2 = strstr(name2, ".llvm."); + len1 = res1 ? res1 - name1 : strlen(name1); + len2 = res2 ? res2 - name2 : strlen(name2); + + if (len1 == len2) + return strncmp(name1, name2, len1); + if (len1 < len2) + return strncmp(name1, name2, len1) <= 0 ? -1 : 1; + return strncmp(name1, name2, len2) >= 0 ? 1 : -1; +} + +static int load_kallsyms_compare(const void *p1, const void *p2) +{ + return compare_name(((const struct ksym *)p1)->name, ((const struct ksym *)p2)->name); +} + +static int search_kallsyms_compare(const void *p1, const struct ksym *p2) +{ + return compare_name(p1, p2->name); +} + static int get_syms(char ***symsp, size_t *cntp, bool kernel) { - size_t cap = 0, cnt = 0, i; - char *name = NULL, **syms = NULL; + size_t cap = 0, cnt = 0; + char *name = NULL, *ksym_name, **syms = NULL; struct hashmap *map; + struct ksyms *ksyms; + struct ksym *ks; char buf[256]; FILE *f; int err = 0; + ksyms = load_kallsyms_custom_local(load_kallsyms_compare); + if (!ASSERT_OK_PTR(ksyms, "load_kallsyms_custom_local")) + return -EINVAL; + /* * The available_filter_functions contains many duplicates, * but other than that all symbols are usable in kprobe multi @@ -408,7 +442,14 @@ static int get_syms(char ***symsp, size_t *cntp, bool kernel) if (skip_entry(name)) continue; - err = hashmap__add(map, name, 0); + ks = search_kallsyms_custom_local(ksyms, name, search_kallsyms_compare); + if (!ks) { + err = -EINVAL; + goto error; + } + + ksym_name = ks->name; + err = hashmap__add(map, ksym_name, 0); if (err == -EEXIST) { err = 0; continue; @@ -421,8 +462,7 @@ static int get_syms(char ***symsp, size_t *cntp, bool kernel) if (err) goto error; - syms[cnt++] = name; - name = NULL; + syms[cnt++] = ksym_name; } *symsp = syms; @@ -432,11 +472,8 @@ error: free(name); fclose(f); hashmap__free(map); - if (err) { - for (i = 0; i < cnt; i++) - free(syms[i]); + if (err) free(syms); - } return err; } @@ -472,7 +509,7 @@ static void test_kprobe_multi_bench_attach(bool kernel) LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); struct kprobe_multi_empty *skel = NULL; char **syms = NULL; - size_t cnt = 0, i; + size_t cnt = 0; if (!ASSERT_OK(get_syms(&syms, &cnt, kernel), "get_syms")) return; @@ -488,11 +525,8 @@ static void test_kprobe_multi_bench_attach(bool kernel) cleanup: kprobe_multi_empty__destroy(skel); - if (syms) { - for (i = 0; i < cnt; i++) - free(syms[i]); + if (syms) free(syms); - } } static void test_attach_override(void) From 6302bdeb91df9b4484b9d537c29f8b6117f3f73d Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 25 Mar 2024 21:15:23 -0700 Subject: [PATCH 022/147] selftests/bpf: Add a kprobe_multi subtest to use addrs instead of syms Get addrs directly from available_filter_functions_addrs and send to the kernel during kprobe_multi_attach. This avoids consultation of /proc/kallsyms. But available_filter_functions_addrs is introduced in 6.5, i.e., it is introduced recently, so I skip the test if the kernel does not support it. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240326041523.1200301-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/kprobe_multi_test.c | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c index 3b9059164360..51628455b6f5 100644 --- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c @@ -477,6 +477,69 @@ error: return err; } +static int get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel) +{ + unsigned long *addr, *addrs, *tmp_addrs; + int err = 0, max_cnt, inc_cnt; + char *name = NULL; + size_t cnt = 0; + char buf[256]; + FILE *f; + + if (access("/sys/kernel/tracing/trace", F_OK) == 0) + f = fopen("/sys/kernel/tracing/available_filter_functions_addrs", "r"); + else + f = fopen("/sys/kernel/debug/tracing/available_filter_functions_addrs", "r"); + + if (!f) + return -ENOENT; + + /* In my local setup, the number of entries is 50k+ so Let us initially + * allocate space to hold 64k entries. If 64k is not enough, incrementally + * increase 1k each time. + */ + max_cnt = 65536; + inc_cnt = 1024; + addrs = malloc(max_cnt * sizeof(long)); + if (addrs == NULL) { + err = -ENOMEM; + goto error; + } + + while (fgets(buf, sizeof(buf), f)) { + if (is_invalid_entry(buf, kernel)) + continue; + + free(name); + if (sscanf(buf, "%p %ms$*[^\n]\n", &addr, &name) != 2) + continue; + if (skip_entry(name)) + continue; + + if (cnt == max_cnt) { + max_cnt += inc_cnt; + tmp_addrs = realloc(addrs, max_cnt); + if (!tmp_addrs) { + err = -ENOMEM; + goto error; + } + addrs = tmp_addrs; + } + + addrs[cnt++] = (unsigned long)addr; + } + + *addrsp = addrs; + *cntp = cnt; + +error: + free(name); + fclose(f); + if (err) + free(addrs); + return err; +} + static void do_bench_test(struct kprobe_multi_empty *skel, struct bpf_kprobe_multi_opts *opts) { long attach_start_ns, attach_end_ns; @@ -529,6 +592,37 @@ cleanup: free(syms); } +static void test_kprobe_multi_bench_attach_addr(bool kernel) +{ + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); + struct kprobe_multi_empty *skel = NULL; + unsigned long *addrs = NULL; + size_t cnt = 0; + int err; + + err = get_addrs(&addrs, &cnt, kernel); + if (err == -ENOENT) { + test__skip(); + return; + } + + if (!ASSERT_OK(err, "get_addrs")) + return; + + skel = kprobe_multi_empty__open_and_load(); + if (!ASSERT_OK_PTR(skel, "kprobe_multi_empty__open_and_load")) + goto cleanup; + + opts.addrs = addrs; + opts.cnt = cnt; + + do_bench_test(skel, &opts); + +cleanup: + kprobe_multi_empty__destroy(skel); + free(addrs); +} + static void test_attach_override(void) { struct kprobe_multi_override *skel = NULL; @@ -569,6 +663,10 @@ void serial_test_kprobe_multi_bench_attach(void) test_kprobe_multi_bench_attach(true); if (test__start_subtest("modules")) test_kprobe_multi_bench_attach(false); + if (test__start_subtest("kernel")) + test_kprobe_multi_bench_attach_addr(true); + if (test__start_subtest("modules")) + test_kprobe_multi_bench_attach_addr(false); } void test_kprobe_multi_test(void) From e8742081db7d01f980c6161ae1e8a1dbc1e30979 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 28 Mar 2024 11:58:01 -0700 Subject: [PATCH 023/147] bpf: Mark bpf prog stack with kmsan_unposion_memory in interpreter mode syzbot reported uninit memory usages during map_{lookup,delete}_elem. ========== BUG: KMSAN: uninit-value in __dev_map_lookup_elem kernel/bpf/devmap.c:441 [inline] BUG: KMSAN: uninit-value in dev_map_lookup_elem+0xf3/0x170 kernel/bpf/devmap.c:796 __dev_map_lookup_elem kernel/bpf/devmap.c:441 [inline] dev_map_lookup_elem+0xf3/0x170 kernel/bpf/devmap.c:796 ____bpf_map_lookup_elem kernel/bpf/helpers.c:42 [inline] bpf_map_lookup_elem+0x5c/0x80 kernel/bpf/helpers.c:38 ___bpf_prog_run+0x13fe/0xe0f0 kernel/bpf/core.c:1997 __bpf_prog_run256+0xb5/0xe0 kernel/bpf/core.c:2237 ========== The reproducer should be in the interpreter mode. The C reproducer is trying to run the following bpf prog: 0: (18) r0 = 0x0 2: (18) r1 = map[id:49] 4: (b7) r8 = 16777216 5: (7b) *(u64 *)(r10 -8) = r8 6: (bf) r2 = r10 7: (07) r2 += -229 ^^^^^^^^^^ 8: (b7) r3 = 8 9: (b7) r4 = 0 10: (85) call dev_map_lookup_elem#1543472 11: (95) exit It is due to the "void *key" (r2) passed to the helper. bpf allows uninit stack memory access for bpf prog with the right privileges. This patch uses kmsan_unpoison_memory() to mark the stack as initialized. This should address different syzbot reports on the uninit "void *key" argument during map_{lookup,delete}_elem. Reported-by: syzbot+603bcd9b0bf1d94dbb9b@syzkaller.appspotmail.com Closes: https://lore.kernel.org/bpf/000000000000f9ce6d061494e694@google.com/ Reported-by: syzbot+eb02dc7f03dce0ef39f3@syzkaller.appspotmail.com Closes: https://lore.kernel.org/bpf/000000000000a5c69c06147c2238@google.com/ Reported-by: syzbot+b4e65ca24fd4d0c734c3@syzkaller.appspotmail.com Closes: https://lore.kernel.org/bpf/000000000000ac56fb06143b6cfa@google.com/ Reported-by: syzbot+d2b113dc9fea5e1d2848@syzkaller.appspotmail.com Closes: https://lore.kernel.org/bpf/0000000000000d69b206142d1ff7@google.com/ Reported-by: syzbot+1a3cf6f08d68868f9db3@syzkaller.appspotmail.com Closes: https://lore.kernel.org/bpf/0000000000006f876b061478e878@google.com/ Tested-by: syzbot+1a3cf6f08d68868f9db3@syzkaller.appspotmail.com Suggested-by: Yonghong Song Suggested-by: Alexei Starovoitov Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240328185801.1843078-1-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5aacb1d3c4cc..ab400cdd7d7a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2218,6 +2218,7 @@ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn u64 stack[stack_size / sizeof(u64)]; \ u64 regs[MAX_BPF_EXT_REG] = {}; \ \ + kmsan_unpoison_memory(stack, sizeof(stack)); \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ ARG1 = (u64) (unsigned long) ctx; \ return ___bpf_prog_run(regs, insn); \ @@ -2231,6 +2232,7 @@ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \ u64 stack[stack_size / sizeof(u64)]; \ u64 regs[MAX_BPF_EXT_REG]; \ \ + kmsan_unpoison_memory(stack, sizeof(stack)); \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ BPF_R1 = r1; \ BPF_R2 = r2; \ From e5e1a3aa56773d55dfb71c4d58176bc19ecfa739 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 26 Mar 2024 18:03:38 +0800 Subject: [PATCH 024/147] selftests/bpf: Use connect_fd_to_fd in bpf_tcp_ca To simplify the code, use BPF selftests helper connect_fd_to_fd() in bpf_tcp_ca.c instead of open-coding it. This helper is defined in network_helpers.c, and exported in network_helpers.h, which is already included in bpf_tcp_ca.c. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/e105d1f225c643bee838409378dd90fd9aabb6dc.1711447102.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 332ed54cc6af..2f6b8b2780e6 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -79,11 +79,9 @@ done: static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) { - struct sockaddr_in6 sa6 = {}; ssize_t nr_recv = 0, bytes = 0; int lfd = -1, fd = -1; pthread_t srv_thread; - socklen_t addrlen = sizeof(sa6); void *thread_ret; char batch[1500]; int err; @@ -104,10 +102,6 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) settimeo(lfd, 0) || settimeo(fd, 0)) goto done; - err = getsockname(lfd, (struct sockaddr *)&sa6, &addrlen); - if (!ASSERT_NEQ(err, -1, "getsockname")) - goto done; - if (sk_stg_map) { err = bpf_map_update_elem(bpf_map__fd(sk_stg_map), &fd, &expected_stg, BPF_NOEXIST); @@ -116,7 +110,7 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) } /* connect to server */ - err = connect(fd, (struct sockaddr *)&sa6, addrlen); + err = connect_fd_to_fd(fd, lfd, 0); if (!ASSERT_NEQ(err, -1, "connect")) goto done; From 426670929fda4485a23094e03cea5d9b3ca918aa Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 26 Mar 2024 18:03:39 +0800 Subject: [PATCH 025/147] selftests/bpf: Drop settimeo in do_test settimeo is invoked in start_server() and in connect_fd_to_fd() already, no need to invoke settimeo(lfd, 0) and settimeo(fd, 0) in do_test() anymore. This patch drops them. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/dbc3613bee3b1c78f95ac9ff468bf47c92f106ea.1711447102.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 2f6b8b2780e6..077b107130f6 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -98,8 +98,7 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) return; } - if (settcpca(lfd, tcp_ca) || settcpca(fd, tcp_ca) || - settimeo(lfd, 0) || settimeo(fd, 0)) + if (settcpca(lfd, tcp_ca) || settcpca(fd, tcp_ca)) goto done; if (sk_stg_map) { From 59f2f841179aa6a0899cb9cf53659149a35749b7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 29 Mar 2024 10:14:39 -0700 Subject: [PATCH 026/147] bpf: Avoid kfree_rcu() under lock in bpf_lpm_trie. syzbot reported the following lock sequence: cpu 2: grabs timer_base lock spins on bpf_lpm lock cpu 1: grab rcu krcp lock spins on timer_base lock cpu 0: grab bpf_lpm lock spins on rcu krcp lock bpf_lpm lock can be the same. timer_base lock can also be the same due to timer migration. but rcu krcp lock is always per-cpu, so it cannot be the same lock. Hence it's a false positive. To avoid lockdep complaining move kfree_rcu() after spin_unlock. Reported-by: syzbot+1fa663a2100308ab6eab@syzkaller.appspotmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240329171439.37813-1-alexei.starovoitov@gmail.com --- kernel/bpf/lpm_trie.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 939620b91c0e..0218a5132ab5 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -316,6 +316,7 @@ static long trie_update_elem(struct bpf_map *map, { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL; + struct lpm_trie_node *free_node = NULL; struct lpm_trie_node __rcu **slot; struct bpf_lpm_trie_key_u8 *key = _key; unsigned long irq_flags; @@ -390,7 +391,7 @@ static long trie_update_elem(struct bpf_map *map, trie->n_entries--; rcu_assign_pointer(*slot, new_node); - kfree_rcu(node, rcu); + free_node = node; goto out; } @@ -437,6 +438,7 @@ out: } spin_unlock_irqrestore(&trie->lock, irq_flags); + kfree_rcu(free_node, rcu); return ret; } @@ -445,6 +447,7 @@ out: static long trie_delete_elem(struct bpf_map *map, void *_key) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct lpm_trie_node *free_node = NULL, *free_parent = NULL; struct bpf_lpm_trie_key_u8 *key = _key; struct lpm_trie_node __rcu **trim, **trim2; struct lpm_trie_node *node, *parent; @@ -514,8 +517,8 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) else rcu_assign_pointer( *trim2, rcu_access_pointer(parent->child[0])); - kfree_rcu(parent, rcu); - kfree_rcu(node, rcu); + free_parent = parent; + free_node = node; goto out; } @@ -529,10 +532,12 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1])); else RCU_INIT_POINTER(*trim, NULL); - kfree_rcu(node, rcu); + free_node = node; out: spin_unlock_irqrestore(&trie->lock, irq_flags); + kfree_rcu(free_parent, rcu); + kfree_rcu(free_node, rcu); return ret; } From 623bdd58be3727318d374f0052f9dfff1e87b854 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 29 Mar 2024 12:04:10 -0700 Subject: [PATCH 027/147] selftests/bpf: make multi-uprobe tests work in RELEASE=1 mode When BPF selftests are built in RELEASE=1 mode with -O2 optimization level, uprobe_multi binary, called from multi-uprobe tests is optimized to the point that all the thousands of target uprobe_multi_func_XXX functions are eliminated, breaking tests. So ensure they are preserved by using weak attribute. But, actually, compiling uprobe_multi binary with -O2 takes a really long time, and is quite useless (it's not a benchmark). So in addition to ensuring that uprobe_multi_func_XXX functions are preserved, opt-out of -O2 explicitly in Makefile and stick to -O0. This saves a lot of compilation time. With -O2, just recompiling uprobe_multi: $ touch uprobe_multi.c $ time make RELEASE=1 -j90 make RELEASE=1 -j90 291.66s user 2.54s system 99% cpu 4:55.52 total With -O0: $ touch uprobe_multi.c $ time make RELEASE=1 -j90 make RELEASE=1 -j90 22.40s user 1.91s system 99% cpu 24.355 total 5 minutes vs (still slow, but...) 24 seconds. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240329190410.4191353-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 2 +- tools/testing/selftests/bpf/uprobe_multi.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 3b9eb40d6343..b0ac3dd80acf 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -759,7 +759,7 @@ $(OUTPUT)/veristat: $(OUTPUT)/veristat.o $(OUTPUT)/uprobe_multi: uprobe_multi.c $(call msg,BINARY,,$@) - $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $^ $(LDLIBS) -o $@ + $(Q)$(CC) $(CFLAGS) -O0 $(LDFLAGS) $^ $(LDLIBS) -o $@ EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ diff --git a/tools/testing/selftests/bpf/uprobe_multi.c b/tools/testing/selftests/bpf/uprobe_multi.c index a61ceab60b68..7ffa563ffeba 100644 --- a/tools/testing/selftests/bpf/uprobe_multi.c +++ b/tools/testing/selftests/bpf/uprobe_multi.c @@ -9,7 +9,7 @@ #define NAME(name, idx) PASTE(name, idx) -#define DEF(name, idx) int NAME(name, idx)(void) { return 0; } +#define DEF(name, idx) int __attribute__((weak)) NAME(name, idx)(void) { return 0; } #define CALL(name, idx) NAME(name, idx)(); #define F(body, name, idx) body(name, idx) From f7b68543642136164ce7348945d3ada707c4e635 Mon Sep 17 00:00:00 2001 From: Rameez Rehman Date: Sun, 31 Mar 2024 21:03:44 +0100 Subject: [PATCH 028/147] bpftool: Use simpler indentation in source rST for documentation The rST manual pages for bpftool would use a mix of tabs and spaces for indentation. While this is the norm in C code, this is rather unusual for rST documents, and over time we've seen many contributors use a wrong level of indentation for documentation update. Let's fix bpftool's indentation in docs once and for all: - Let's use spaces, that are more common in rST files. - Remove one level of indentation for the synopsis, the command description, and the "see also" section. As a result, all sections start with the same indentation level in the generated man page. - Rewrap the paragraphs after the changes. There is no content change in this patch, only indentation and rewrapping changes. The wrapping in the generated source files for the manual pages is changed, but the pages displayed with "man" remain the same, apart from the adjusted indentation level on relevant sections. [ Quentin: rebased on bpf-next, removed indent level for command description and options, updated synopsis, command summary, and "see also" sections. ] Signed-off-by: Rameez Rehman Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240331200346.29118-2-qmo@kernel.org --- tools/bpf/bpftool/Documentation/Makefile | 6 +- .../bpf/bpftool/Documentation/bpftool-btf.rst | 100 +++-- .../bpftool/Documentation/bpftool-cgroup.rst | 198 +++++----- .../bpftool/Documentation/bpftool-feature.rst | 99 +++-- .../bpf/bpftool/Documentation/bpftool-gen.rst | 284 ++++++------- .../bpftool/Documentation/bpftool-iter.rst | 50 ++- .../bpftool/Documentation/bpftool-link.rst | 73 ++-- .../bpf/bpftool/Documentation/bpftool-map.rst | 232 ++++++----- .../bpf/bpftool/Documentation/bpftool-net.rst | 98 +++-- .../bpftool/Documentation/bpftool-perf.rst | 34 +- .../bpftool/Documentation/bpftool-prog.rst | 374 ++++++++---------- .../Documentation/bpftool-struct_ops.rst | 71 ++-- tools/bpf/bpftool/Documentation/bpftool.rst | 60 +-- .../bpftool/Documentation/common_options.rst | 26 +- 14 files changed, 811 insertions(+), 894 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/Makefile b/tools/bpf/bpftool/Documentation/Makefile index ac8487dcff1d..4315652678b9 100644 --- a/tools/bpf/bpftool/Documentation/Makefile +++ b/tools/bpf/bpftool/Documentation/Makefile @@ -31,9 +31,9 @@ see_also = $(subst " ",, \ "\n" \ "SEE ALSO\n" \ "========\n" \ - "\t**bpf**\ (2),\n" \ - "\t**bpf-helpers**\\ (7)" \ - $(foreach page,$(call list_pages,$(1)),",\n\t**$(page)**\\ (8)") \ + "**bpf**\ (2),\n" \ + "**bpf-helpers**\\ (7)" \ + $(foreach page,$(call list_pages,$(1)),",\n**$(page)**\\ (8)") \ "\n") $(OUTPUT)%.8: %.rst diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst index 342716f74ec4..1be6b9aae053 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst @@ -14,82 +14,76 @@ tool for inspection of BTF data SYNOPSIS ======== - **bpftool** [*OPTIONS*] **btf** *COMMAND* +**bpftool** [*OPTIONS*] **btf** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | { **-B** | **--base-btf** } } +*OPTIONS* := { |COMMON_OPTIONS| | { **-B** | **--base-btf** } } - *COMMANDS* := { **dump** | **help** } +*COMMANDS* := { **dump** | **help** } BTF COMMANDS ============= -| **bpftool** **btf** { **show** | **list** } [**id** *BTF_ID*] -| **bpftool** **btf dump** *BTF_SRC* [**format** *FORMAT*] -| **bpftool** **btf help** +| **bpftool** **btf** { **show** | **list** } [**id** *BTF_ID*] +| **bpftool** **btf dump** *BTF_SRC* [**format** *FORMAT*] +| **bpftool** **btf help** | -| *BTF_SRC* := { **id** *BTF_ID* | **prog** *PROG* | **map** *MAP* [{**key** | **value** | **kv** | **all**}] | **file** *FILE* } -| *FORMAT* := { **raw** | **c** } -| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } +| *BTF_SRC* := { **id** *BTF_ID* | **prog** *PROG* | **map** *MAP* [{**key** | **value** | **kv** | **all**}] | **file** *FILE* } +| *FORMAT* := { **raw** | **c** } +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } DESCRIPTION =========== - **bpftool btf { show | list }** [**id** *BTF_ID*] - Show information about loaded BTF objects. If a BTF ID is - specified, show information only about given BTF object, - otherwise list all BTF objects currently loaded on the - system. +**bpftool btf { show | list }** [**id** *BTF_ID*] + Show information about loaded BTF objects. If a BTF ID is specified, show + information only about given BTF object, otherwise list all BTF objects + currently loaded on the system. - Since Linux 5.8 bpftool is able to discover information about - processes that hold open file descriptors (FDs) against BTF - objects. On such kernels bpftool will automatically emit this - information as well. + Since Linux 5.8 bpftool is able to discover information about processes + that hold open file descriptors (FDs) against BTF objects. On such kernels + bpftool will automatically emit this information as well. - **bpftool btf dump** *BTF_SRC* - Dump BTF entries from a given *BTF_SRC*. +**bpftool btf dump** *BTF_SRC* + Dump BTF entries from a given *BTF_SRC*. - When **id** is specified, BTF object with that ID will be - loaded and all its BTF types emitted. + When **id** is specified, BTF object with that ID will be loaded and all + its BTF types emitted. - When **map** is provided, it's expected that map has - associated BTF object with BTF types describing key and - value. It's possible to select whether to dump only BTF - type(s) associated with key (**key**), value (**value**), - both key and value (**kv**), or all BTF types present in - associated BTF object (**all**). If not specified, **kv** - is assumed. + When **map** is provided, it's expected that map has associated BTF object + with BTF types describing key and value. It's possible to select whether to + dump only BTF type(s) associated with key (**key**), value (**value**), + both key and value (**kv**), or all BTF types present in associated BTF + object (**all**). If not specified, **kv** is assumed. - When **prog** is provided, it's expected that program has - associated BTF object with BTF types. + When **prog** is provided, it's expected that program has associated BTF + object with BTF types. - When specifying *FILE*, an ELF file is expected, containing - .BTF section with well-defined BTF binary format data, - typically produced by clang or pahole. + When specifying *FILE*, an ELF file is expected, containing .BTF section + with well-defined BTF binary format data, typically produced by clang or + pahole. - **format** option can be used to override default (raw) - output format. Raw (**raw**) or C-syntax (**c**) output - formats are supported. + **format** option can be used to override default (raw) output format. Raw + (**raw**) or C-syntax (**c**) output formats are supported. - **bpftool btf help** - Print short help message. +**bpftool btf help** + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst - -B, --base-btf *FILE* - Pass a base BTF object. Base BTF objects are typically used - with BTF objects for kernel modules. To avoid duplicating - all kernel symbols required by modules, BTF objects for - modules are "split", they are built incrementally on top of - the kernel (vmlinux) BTF object. So the base BTF reference - should usually point to the kernel BTF. +-B, --base-btf *FILE* + Pass a base BTF object. Base BTF objects are typically used with BTF + objects for kernel modules. To avoid duplicating all kernel symbols + required by modules, BTF objects for modules are "split", they are + built incrementally on top of the kernel (vmlinux) BTF object. So the + base BTF reference should usually point to the kernel BTF. - When the main BTF object to process (for example, the - module BTF to dump) is passed as a *FILE*, bpftool attempts - to autodetect the path for the base object, and passing - this option is optional. When the main BTF object is passed - through other handles, this option becomes necessary. + When the main BTF object to process (for example, the module BTF to + dump) is passed as a *FILE*, bpftool attempts to autodetect the path + for the base object, and passing this option is optional. When the main + BTF object is passed through other handles, this option becomes + necessary. EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst index 2ce900f66d6e..1dbabe33e56a 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst @@ -14,134 +14,130 @@ tool for inspection and simple manipulation of eBPF progs SYNOPSIS ======== - **bpftool** [*OPTIONS*] **cgroup** *COMMAND* +**bpftool** [*OPTIONS*] **cgroup** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } } +*OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } } - *COMMANDS* := - { **show** | **list** | **tree** | **attach** | **detach** | **help** } +*COMMANDS* := +{ **show** | **list** | **tree** | **attach** | **detach** | **help** } CGROUP COMMANDS =============== -| **bpftool** **cgroup** { **show** | **list** } *CGROUP* [**effective**] -| **bpftool** **cgroup tree** [*CGROUP_ROOT*] [**effective**] -| **bpftool** **cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] -| **bpftool** **cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* -| **bpftool** **cgroup help** +| **bpftool** **cgroup** { **show** | **list** } *CGROUP* [**effective**] +| **bpftool** **cgroup tree** [*CGROUP_ROOT*] [**effective**] +| **bpftool** **cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] +| **bpftool** **cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* +| **bpftool** **cgroup help** | -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } -| *ATTACH_TYPE* := { **cgroup_inet_ingress** | **cgroup_inet_egress** | -| **cgroup_inet_sock_create** | **cgroup_sock_ops** | -| **cgroup_device** | **cgroup_inet4_bind** | **cgroup_inet6_bind** | -| **cgroup_inet4_post_bind** | **cgroup_inet6_post_bind** | -| **cgroup_inet4_connect** | **cgroup_inet6_connect** | -| **cgroup_unix_connect** | **cgroup_inet4_getpeername** | -| **cgroup_inet6_getpeername** | **cgroup_unix_getpeername** | -| **cgroup_inet4_getsockname** | **cgroup_inet6_getsockname** | -| **cgroup_unix_getsockname** | **cgroup_udp4_sendmsg** | -| **cgroup_udp6_sendmsg** | **cgroup_unix_sendmsg** | -| **cgroup_udp4_recvmsg** | **cgroup_udp6_recvmsg** | -| **cgroup_unix_recvmsg** | **cgroup_sysctl** | -| **cgroup_getsockopt** | **cgroup_setsockopt** | -| **cgroup_inet_sock_release** } -| *ATTACH_FLAGS* := { **multi** | **override** } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } +| *ATTACH_TYPE* := { **cgroup_inet_ingress** | **cgroup_inet_egress** | +| **cgroup_inet_sock_create** | **cgroup_sock_ops** | +| **cgroup_device** | **cgroup_inet4_bind** | **cgroup_inet6_bind** | +| **cgroup_inet4_post_bind** | **cgroup_inet6_post_bind** | +| **cgroup_inet4_connect** | **cgroup_inet6_connect** | +| **cgroup_unix_connect** | **cgroup_inet4_getpeername** | +| **cgroup_inet6_getpeername** | **cgroup_unix_getpeername** | +| **cgroup_inet4_getsockname** | **cgroup_inet6_getsockname** | +| **cgroup_unix_getsockname** | **cgroup_udp4_sendmsg** | +| **cgroup_udp6_sendmsg** | **cgroup_unix_sendmsg** | +| **cgroup_udp4_recvmsg** | **cgroup_udp6_recvmsg** | +| **cgroup_unix_recvmsg** | **cgroup_sysctl** | +| **cgroup_getsockopt** | **cgroup_setsockopt** | +| **cgroup_inet_sock_release** } +| *ATTACH_FLAGS* := { **multi** | **override** } DESCRIPTION =========== - **bpftool cgroup { show | list }** *CGROUP* [**effective**] - List all programs attached to the cgroup *CGROUP*. +**bpftool cgroup { show | list }** *CGROUP* [**effective**] + List all programs attached to the cgroup *CGROUP*. - Output will start with program ID followed by attach type, - attach flags and program name. + Output will start with program ID followed by attach type, attach flags and + program name. - If **effective** is specified retrieve effective programs that - will execute for events within a cgroup. This includes - inherited along with attached ones. + If **effective** is specified retrieve effective programs that will execute + for events within a cgroup. This includes inherited along with attached + ones. - **bpftool cgroup tree** [*CGROUP_ROOT*] [**effective**] - Iterate over all cgroups in *CGROUP_ROOT* and list all - attached programs. If *CGROUP_ROOT* is not specified, - bpftool uses cgroup v2 mountpoint. +**bpftool cgroup tree** [*CGROUP_ROOT*] [**effective**] + Iterate over all cgroups in *CGROUP_ROOT* and list all attached programs. + If *CGROUP_ROOT* is not specified, bpftool uses cgroup v2 mountpoint. - The output is similar to the output of cgroup show/list - commands: it starts with absolute cgroup path, followed by - program ID, attach type, attach flags and program name. + The output is similar to the output of cgroup show/list commands: it starts + with absolute cgroup path, followed by program ID, attach type, attach + flags and program name. - If **effective** is specified retrieve effective programs that - will execute for events within a cgroup. This includes - inherited along with attached ones. + If **effective** is specified retrieve effective programs that will execute + for events within a cgroup. This includes inherited along with attached + ones. - **bpftool cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] - Attach program *PROG* to the cgroup *CGROUP* with attach type - *ATTACH_TYPE* and optional *ATTACH_FLAGS*. +**bpftool cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] + Attach program *PROG* to the cgroup *CGROUP* with attach type *ATTACH_TYPE* + and optional *ATTACH_FLAGS*. - *ATTACH_FLAGS* can be one of: **override** if a sub-cgroup installs - some bpf program, the program in this cgroup yields to sub-cgroup - program; **multi** if a sub-cgroup installs some bpf program, - that cgroup program gets run in addition to the program in this - cgroup. + *ATTACH_FLAGS* can be one of: **override** if a sub-cgroup installs some + bpf program, the program in this cgroup yields to sub-cgroup program; + **multi** if a sub-cgroup installs some bpf program, that cgroup program + gets run in addition to the program in this cgroup. - Only one program is allowed to be attached to a cgroup with - no attach flags or the **override** flag. Attaching another - program will release old program and attach the new one. + Only one program is allowed to be attached to a cgroup with no attach flags + or the **override** flag. Attaching another program will release old + program and attach the new one. - Multiple programs are allowed to be attached to a cgroup with - **multi**. They are executed in FIFO order (those that were - attached first, run first). + Multiple programs are allowed to be attached to a cgroup with **multi**. + They are executed in FIFO order (those that were attached first, run + first). - Non-default *ATTACH_FLAGS* are supported by kernel version 4.14 - and later. + Non-default *ATTACH_FLAGS* are supported by kernel version 4.14 and later. - *ATTACH_TYPE* can be on of: - **ingress** ingress path of the inet socket (since 4.10); - **egress** egress path of the inet socket (since 4.10); - **sock_create** opening of an inet socket (since 4.10); - **sock_ops** various socket operations (since 4.12); - **device** device access (since 4.15); - **bind4** call to bind(2) for an inet4 socket (since 4.17); - **bind6** call to bind(2) for an inet6 socket (since 4.17); - **post_bind4** return from bind(2) for an inet4 socket (since 4.17); - **post_bind6** return from bind(2) for an inet6 socket (since 4.17); - **connect4** call to connect(2) for an inet4 socket (since 4.17); - **connect6** call to connect(2) for an inet6 socket (since 4.17); - **connect_unix** call to connect(2) for a unix socket (since 6.7); - **sendmsg4** call to sendto(2), sendmsg(2), sendmmsg(2) for an - unconnected udp4 socket (since 4.18); - **sendmsg6** call to sendto(2), sendmsg(2), sendmmsg(2) for an - unconnected udp6 socket (since 4.18); - **sendmsg_unix** call to sendto(2), sendmsg(2), sendmmsg(2) for - an unconnected unix socket (since 6.7); - **recvmsg4** call to recvfrom(2), recvmsg(2), recvmmsg(2) for - an unconnected udp4 socket (since 5.2); - **recvmsg6** call to recvfrom(2), recvmsg(2), recvmmsg(2) for - an unconnected udp6 socket (since 5.2); - **recvmsg_unix** call to recvfrom(2), recvmsg(2), recvmmsg(2) for - an unconnected unix socket (since 6.7); - **sysctl** sysctl access (since 5.2); - **getsockopt** call to getsockopt (since 5.3); - **setsockopt** call to setsockopt (since 5.3); - **getpeername4** call to getpeername(2) for an inet4 socket (since 5.8); - **getpeername6** call to getpeername(2) for an inet6 socket (since 5.8); - **getpeername_unix** call to getpeername(2) for a unix socket (since 6.7); - **getsockname4** call to getsockname(2) for an inet4 socket (since 5.8); - **getsockname6** call to getsockname(2) for an inet6 socket (since 5.8). - **getsockname_unix** call to getsockname(2) for a unix socket (since 6.7); - **sock_release** closing an userspace inet socket (since 5.9). + *ATTACH_TYPE* can be on of: + **ingress** ingress path of the inet socket (since 4.10); + **egress** egress path of the inet socket (since 4.10); + **sock_create** opening of an inet socket (since 4.10); + **sock_ops** various socket operations (since 4.12); + **device** device access (since 4.15); + **bind4** call to bind(2) for an inet4 socket (since 4.17); + **bind6** call to bind(2) for an inet6 socket (since 4.17); + **post_bind4** return from bind(2) for an inet4 socket (since 4.17); + **post_bind6** return from bind(2) for an inet6 socket (since 4.17); + **connect4** call to connect(2) for an inet4 socket (since 4.17); + **connect6** call to connect(2) for an inet6 socket (since 4.17); + **connect_unix** call to connect(2) for a unix socket (since 6.7); + **sendmsg4** call to sendto(2), sendmsg(2), sendmmsg(2) for an unconnected + udp4 socket (since 4.18); + **sendmsg6** call to sendto(2), sendmsg(2), sendmmsg(2) for an unconnected + udp6 socket (since 4.18); + **sendmsg_unix** call to sendto(2), sendmsg(2), sendmmsg(2) for an + unconnected unix socket (since 6.7); + **recvmsg4** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an + unconnected udp4 socket (since 5.2); + **recvmsg6** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an + unconnected udp6 socket (since 5.2); + **recvmsg_unix** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an + unconnected unix socket (since 6.7); + **sysctl** sysctl access (since 5.2); + **getsockopt** call to getsockopt (since 5.3); + **setsockopt** call to setsockopt (since 5.3); + **getpeername4** call to getpeername(2) for an inet4 socket (since 5.8); + **getpeername6** call to getpeername(2) for an inet6 socket (since 5.8); + **getpeername_unix** call to getpeername(2) for a unix socket (since 6.7); + **getsockname4** call to getsockname(2) for an inet4 socket (since 5.8); + **getsockname6** call to getsockname(2) for an inet6 socket (since 5.8). + **getsockname_unix** call to getsockname(2) for a unix socket (since 6.7); + **sock_release** closing an userspace inet socket (since 5.9). - **bpftool cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* - Detach *PROG* from the cgroup *CGROUP* and attach type - *ATTACH_TYPE*. +**bpftool cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* + Detach *PROG* from the cgroup *CGROUP* and attach type *ATTACH_TYPE*. - **bpftool prog help** - Print short help message. +**bpftool prog help** + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst - -f, --bpffs - Show file names of pinned programs. +-f, --bpffs + Show file names of pinned programs. EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst index e44039f89be7..5c2433147ba4 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst @@ -14,77 +14,70 @@ tool for inspection of eBPF-related parameters for Linux kernel or net device SYNOPSIS ======== - **bpftool** [*OPTIONS*] **feature** *COMMAND* +**bpftool** [*OPTIONS*] **feature** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| } +*OPTIONS* := { |COMMON_OPTIONS| } - *COMMANDS* := { **probe** | **help** } +*COMMANDS* := { **probe** | **help** } FEATURE COMMANDS ================ -| **bpftool** **feature probe** [*COMPONENT*] [**full**] [**unprivileged**] [**macros** [**prefix** *PREFIX*]] -| **bpftool** **feature list_builtins** *GROUP* -| **bpftool** **feature help** +| **bpftool** **feature probe** [*COMPONENT*] [**full**] [**unprivileged**] [**macros** [**prefix** *PREFIX*]] +| **bpftool** **feature list_builtins** *GROUP* +| **bpftool** **feature help** | -| *COMPONENT* := { **kernel** | **dev** *NAME* } -| *GROUP* := { **prog_types** | **map_types** | **attach_types** | **link_types** | **helpers** } +| *COMPONENT* := { **kernel** | **dev** *NAME* } +| *GROUP* := { **prog_types** | **map_types** | **attach_types** | **link_types** | **helpers** } DESCRIPTION =========== - **bpftool feature probe** [**kernel**] [**full**] [**macros** [**prefix** *PREFIX*]] - Probe the running kernel and dump a number of eBPF-related - parameters, such as availability of the **bpf**\ () system call, - JIT status, eBPF program types availability, eBPF helper - functions availability, and more. +**bpftool feature probe** [**kernel**] [**full**] [**macros** [**prefix** *PREFIX*]] + Probe the running kernel and dump a number of eBPF-related parameters, such + as availability of the **bpf**\ () system call, JIT status, eBPF program + types availability, eBPF helper functions availability, and more. - By default, bpftool **does not run probes** for - **bpf_probe_write_user**\ () and **bpf_trace_printk**\() - helpers which print warnings to kernel logs. To enable them - and run all probes, the **full** keyword should be used. + By default, bpftool **does not run probes** for **bpf_probe_write_user**\ + () and **bpf_trace_printk**\() helpers which print warnings to kernel logs. + To enable them and run all probes, the **full** keyword should be used. - If the **macros** keyword (but not the **-j** option) is - passed, a subset of the output is dumped as a list of - **#define** macros that are ready to be included in a C - header file, for example. If, additionally, **prefix** is - used to define a *PREFIX*, the provided string will be used - as a prefix to the names of the macros: this can be used to - avoid conflicts on macro names when including the output of - this command as a header file. + If the **macros** keyword (but not the **-j** option) is passed, a subset + of the output is dumped as a list of **#define** macros that are ready to + be included in a C header file, for example. If, additionally, **prefix** + is used to define a *PREFIX*, the provided string will be used as a prefix + to the names of the macros: this can be used to avoid conflicts on macro + names when including the output of this command as a header file. - Keyword **kernel** can be omitted. If no probe target is - specified, probing the kernel is the default behaviour. + Keyword **kernel** can be omitted. If no probe target is specified, probing + the kernel is the default behaviour. - When the **unprivileged** keyword is used, bpftool will dump - only the features available to a user who does not have the - **CAP_SYS_ADMIN** capability set. The features available in - that case usually represent a small subset of the parameters - supported by the system. Unprivileged users MUST use the - **unprivileged** keyword: This is to avoid misdetection if - bpftool is inadvertently run as non-root, for example. This - keyword is unavailable if bpftool was compiled without - libcap. + When the **unprivileged** keyword is used, bpftool will dump only the + features available to a user who does not have the **CAP_SYS_ADMIN** + capability set. The features available in that case usually represent a + small subset of the parameters supported by the system. Unprivileged users + MUST use the **unprivileged** keyword: This is to avoid misdetection if + bpftool is inadvertently run as non-root, for example. This keyword is + unavailable if bpftool was compiled without libcap. - **bpftool feature probe dev** *NAME* [**full**] [**macros** [**prefix** *PREFIX*]] - Probe network device for supported eBPF features and dump - results to the console. +**bpftool feature probe dev** *NAME* [**full**] [**macros** [**prefix** *PREFIX*]] + Probe network device for supported eBPF features and dump results to the + console. - The keywords **full**, **macros** and **prefix** have the - same role as when probing the kernel. + The keywords **full**, **macros** and **prefix** have the same role as when + probing the kernel. - **bpftool feature list_builtins** *GROUP* - List items known to bpftool. These can be BPF program types - (**prog_types**), BPF map types (**map_types**), attach types - (**attach_types**), link types (**link_types**), or BPF helper - functions (**helpers**). The command does not probe the system, but - simply lists the elements that bpftool knows from compilation time, - as provided from libbpf (for all object types) or from the BPF UAPI - header (list of helpers). This can be used in scripts to iterate over - BPF types or helpers. +**bpftool feature list_builtins** *GROUP* + List items known to bpftool. These can be BPF program types + (**prog_types**), BPF map types (**map_types**), attach types + (**attach_types**), link types (**link_types**), or BPF helper functions + (**helpers**). The command does not probe the system, but simply lists the + elements that bpftool knows from compilation time, as provided from libbpf + (for all object types) or from the BPF UAPI header (list of helpers). This + can be used in scripts to iterate over BPF types or helpers. - **bpftool feature help** - Print short help message. +**bpftool feature help** + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index 5e60825818dd..68104347d592 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -14,199 +14,177 @@ tool for BPF code-generation SYNOPSIS ======== - **bpftool** [*OPTIONS*] **gen** *COMMAND* +**bpftool** [*OPTIONS*] **gen** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | { **-L** | **--use-loader** } } +*OPTIONS* := { |COMMON_OPTIONS| | { **-L** | **--use-loader** } } - *COMMAND* := { **object** | **skeleton** | **help** } +*COMMAND* := { **object** | **skeleton** | **help** } GEN COMMANDS ============= -| **bpftool** **gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] -| **bpftool** **gen skeleton** *FILE* [**name** *OBJECT_NAME*] -| **bpftool** **gen subskeleton** *FILE* [**name** *OBJECT_NAME*] -| **bpftool** **gen min_core_btf** *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...] -| **bpftool** **gen help** +| **bpftool** **gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] +| **bpftool** **gen skeleton** *FILE* [**name** *OBJECT_NAME*] +| **bpftool** **gen subskeleton** *FILE* [**name** *OBJECT_NAME*] +| **bpftool** **gen min_core_btf** *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...] +| **bpftool** **gen help** DESCRIPTION =========== - **bpftool gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] - Statically link (combine) together one or more *INPUT_FILE*'s - into a single resulting *OUTPUT_FILE*. All the files involved - are BPF ELF object files. +**bpftool gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] + Statically link (combine) together one or more *INPUT_FILE*'s into a single + resulting *OUTPUT_FILE*. All the files involved are BPF ELF object files. - The rules of BPF static linking are mostly the same as for - user-space object files, but in addition to combining data - and instruction sections, .BTF and .BTF.ext (if present in - any of the input files) data are combined together. .BTF - data is deduplicated, so all the common types across - *INPUT_FILE*'s will only be represented once in the resulting - BTF information. + The rules of BPF static linking are mostly the same as for user-space + object files, but in addition to combining data and instruction sections, + .BTF and .BTF.ext (if present in any of the input files) data are combined + together. .BTF data is deduplicated, so all the common types across + *INPUT_FILE*'s will only be represented once in the resulting BTF + information. - BPF static linking allows to partition BPF source code into - individually compiled files that are then linked into - a single resulting BPF object file, which can be used to - generated BPF skeleton (with **gen skeleton** command) or - passed directly into **libbpf** (using **bpf_object__open()** - family of APIs). + BPF static linking allows to partition BPF source code into individually + compiled files that are then linked into a single resulting BPF object + file, which can be used to generated BPF skeleton (with **gen skeleton** + command) or passed directly into **libbpf** (using **bpf_object__open()** + family of APIs). - **bpftool gen skeleton** *FILE* - Generate BPF skeleton C header file for a given *FILE*. +**bpftool gen skeleton** *FILE* + Generate BPF skeleton C header file for a given *FILE*. - BPF skeleton is an alternative interface to existing libbpf - APIs for working with BPF objects. Skeleton code is intended - to significantly shorten and simplify code to load and work - with BPF programs from userspace side. Generated code is - tailored to specific input BPF object *FILE*, reflecting its - structure by listing out available maps, program, variables, - etc. Skeleton eliminates the need to lookup mentioned - components by name. Instead, if skeleton instantiation - succeeds, they are populated in skeleton structure as valid - libbpf types (e.g., **struct bpf_map** pointer) and can be - passed to existing generic libbpf APIs. + BPF skeleton is an alternative interface to existing libbpf APIs for + working with BPF objects. Skeleton code is intended to significantly + shorten and simplify code to load and work with BPF programs from userspace + side. Generated code is tailored to specific input BPF object *FILE*, + reflecting its structure by listing out available maps, program, variables, + etc. Skeleton eliminates the need to lookup mentioned components by name. + Instead, if skeleton instantiation succeeds, they are populated in skeleton + structure as valid libbpf types (e.g., **struct bpf_map** pointer) and can + be passed to existing generic libbpf APIs. - In addition to simple and reliable access to maps and - programs, skeleton provides a storage for BPF links (**struct - bpf_link**) for each BPF program within BPF object. When - requested, supported BPF programs will be automatically - attached and resulting BPF links stored for further use by - user in pre-allocated fields in skeleton struct. For BPF - programs that can't be automatically attached by libbpf, - user can attach them manually, but store resulting BPF link - in per-program link field. All such set up links will be - automatically destroyed on BPF skeleton destruction. This - eliminates the need for users to manage links manually and - rely on libbpf support to detach programs and free up - resources. + In addition to simple and reliable access to maps and programs, skeleton + provides a storage for BPF links (**struct bpf_link**) for each BPF program + within BPF object. When requested, supported BPF programs will be + automatically attached and resulting BPF links stored for further use by + user in pre-allocated fields in skeleton struct. For BPF programs that + can't be automatically attached by libbpf, user can attach them manually, + but store resulting BPF link in per-program link field. All such set up + links will be automatically destroyed on BPF skeleton destruction. This + eliminates the need for users to manage links manually and rely on libbpf + support to detach programs and free up resources. - Another facility provided by BPF skeleton is an interface to - global variables of all supported kinds: mutable, read-only, - as well as extern ones. This interface allows to pre-setup - initial values of variables before BPF object is loaded and - verified by kernel. For non-read-only variables, the same - interface can be used to fetch values of global variables on - userspace side, even if they are modified by BPF code. + Another facility provided by BPF skeleton is an interface to global + variables of all supported kinds: mutable, read-only, as well as extern + ones. This interface allows to pre-setup initial values of variables before + BPF object is loaded and verified by kernel. For non-read-only variables, + the same interface can be used to fetch values of global variables on + userspace side, even if they are modified by BPF code. - During skeleton generation, contents of source BPF object - *FILE* is embedded within generated code and is thus not - necessary to keep around. This ensures skeleton and BPF - object file are matching 1-to-1 and always stay in sync. - Generated code is dual-licensed under LGPL-2.1 and - BSD-2-Clause licenses. + During skeleton generation, contents of source BPF object *FILE* is + embedded within generated code and is thus not necessary to keep around. + This ensures skeleton and BPF object file are matching 1-to-1 and always + stay in sync. Generated code is dual-licensed under LGPL-2.1 and + BSD-2-Clause licenses. - It is a design goal and guarantee that skeleton interfaces - are interoperable with generic libbpf APIs. User should - always be able to use skeleton API to create and load BPF - object, and later use libbpf APIs to keep working with - specific maps, programs, etc. + It is a design goal and guarantee that skeleton interfaces are + interoperable with generic libbpf APIs. User should always be able to use + skeleton API to create and load BPF object, and later use libbpf APIs to + keep working with specific maps, programs, etc. - As part of skeleton, few custom functions are generated. - Each of them is prefixed with object name. Object name can - either be derived from object file name, i.e., if BPF object - file name is **example.o**, BPF object name will be - **example**. Object name can be also specified explicitly - through **name** *OBJECT_NAME* parameter. The following - custom functions are provided (assuming **example** as - the object name): + As part of skeleton, few custom functions are generated. Each of them is + prefixed with object name. Object name can either be derived from object + file name, i.e., if BPF object file name is **example.o**, BPF object name + will be **example**. Object name can be also specified explicitly through + **name** *OBJECT_NAME* parameter. The following custom functions are + provided (assuming **example** as the object name): - - **example__open** and **example__open_opts**. - These functions are used to instantiate skeleton. It - corresponds to libbpf's **bpf_object__open**\ () API. - **_opts** variants accepts extra **bpf_object_open_opts** - options. + - **example__open** and **example__open_opts**. + These functions are used to instantiate skeleton. It corresponds to + libbpf's **bpf_object__open**\ () API. **_opts** variants accepts extra + **bpf_object_open_opts** options. - - **example__load**. - This function creates maps, loads and verifies BPF - programs, initializes global data maps. It corresponds to - libppf's **bpf_object__load**\ () API. + - **example__load**. + This function creates maps, loads and verifies BPF programs, initializes + global data maps. It corresponds to libppf's **bpf_object__load**\ () + API. - - **example__open_and_load** combines **example__open** and - **example__load** invocations in one commonly used - operation. + - **example__open_and_load** combines **example__open** and + **example__load** invocations in one commonly used operation. - - **example__attach** and **example__detach** - This pair of functions allow to attach and detach, - correspondingly, already loaded BPF object. Only BPF - programs of types supported by libbpf for auto-attachment - will be auto-attached and their corresponding BPF links - instantiated. For other BPF programs, user can manually - create a BPF link and assign it to corresponding fields in - skeleton struct. **example__detach** will detach both - links created automatically, as well as those populated by - user manually. + - **example__attach** and **example__detach** + This pair of functions allow to attach and detach, correspondingly, + already loaded BPF object. Only BPF programs of types supported by libbpf + for auto-attachment will be auto-attached and their corresponding BPF + links instantiated. For other BPF programs, user can manually create a + BPF link and assign it to corresponding fields in skeleton struct. + **example__detach** will detach both links created automatically, as well + as those populated by user manually. - - **example__destroy** - Detach and unload BPF programs, free up all the resources - used by skeleton and BPF object. + - **example__destroy** + Detach and unload BPF programs, free up all the resources used by + skeleton and BPF object. - If BPF object has global variables, corresponding structs - with memory layout corresponding to global data data section - layout will be created. Currently supported ones are: *.data*, - *.bss*, *.rodata*, and *.kconfig* structs/data sections. - These data sections/structs can be used to set up initial - values of variables, if set before **example__load**. - Afterwards, if target kernel supports memory-mapped BPF - arrays, same structs can be used to fetch and update - (non-read-only) data from userspace, with same simplicity - as for BPF side. + If BPF object has global variables, corresponding structs with memory + layout corresponding to global data data section layout will be created. + Currently supported ones are: *.data*, *.bss*, *.rodata*, and *.kconfig* + structs/data sections. These data sections/structs can be used to set up + initial values of variables, if set before **example__load**. Afterwards, + if target kernel supports memory-mapped BPF arrays, same structs can be + used to fetch and update (non-read-only) data from userspace, with same + simplicity as for BPF side. - **bpftool gen subskeleton** *FILE* - Generate BPF subskeleton C header file for a given *FILE*. +**bpftool gen subskeleton** *FILE* + Generate BPF subskeleton C header file for a given *FILE*. - Subskeletons are similar to skeletons, except they do not own - the corresponding maps, programs, or global variables. They - require that the object file used to generate them is already - loaded into a *bpf_object* by some other means. + Subskeletons are similar to skeletons, except they do not own the + corresponding maps, programs, or global variables. They require that the + object file used to generate them is already loaded into a *bpf_object* by + some other means. - This functionality is useful when a library is included into a - larger BPF program. A subskeleton for the library would have - access to all objects and globals defined in it, without - having to know about the larger program. + This functionality is useful when a library is included into a larger BPF + program. A subskeleton for the library would have access to all objects and + globals defined in it, without having to know about the larger program. - Consequently, there are only two functions defined - for subskeletons: + Consequently, there are only two functions defined for subskeletons: - - **example__open(bpf_object\*)** - Instantiates a subskeleton from an already opened (but not - necessarily loaded) **bpf_object**. + - **example__open(bpf_object\*)** + Instantiates a subskeleton from an already opened (but not necessarily + loaded) **bpf_object**. - - **example__destroy()** - Frees the storage for the subskeleton but *does not* unload - any BPF programs or maps. + - **example__destroy()** + Frees the storage for the subskeleton but *does not* unload any BPF + programs or maps. - **bpftool** **gen min_core_btf** *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...] - Generate a minimum BTF file as *OUTPUT*, derived from a given - *INPUT* BTF file, containing all needed BTF types so one, or - more, given eBPF objects CO-RE relocations may be satisfied. +**bpftool** **gen min_core_btf** *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...] + Generate a minimum BTF file as *OUTPUT*, derived from a given *INPUT* BTF + file, containing all needed BTF types so one, or more, given eBPF objects + CO-RE relocations may be satisfied. - When kernels aren't compiled with CONFIG_DEBUG_INFO_BTF, - libbpf, when loading an eBPF object, has to rely on external - BTF files to be able to calculate CO-RE relocations. + When kernels aren't compiled with CONFIG_DEBUG_INFO_BTF, libbpf, when + loading an eBPF object, has to rely on external BTF files to be able to + calculate CO-RE relocations. - Usually, an external BTF file is built from existing kernel - DWARF data using pahole. It contains all the types used by - its respective kernel image and, because of that, is big. + Usually, an external BTF file is built from existing kernel DWARF data + using pahole. It contains all the types used by its respective kernel image + and, because of that, is big. - The min_core_btf feature builds smaller BTF files, customized - to one or multiple eBPF objects, so they can be distributed - together with an eBPF CO-RE based application, turning the - application portable to different kernel versions. + The min_core_btf feature builds smaller BTF files, customized to one or + multiple eBPF objects, so they can be distributed together with an eBPF + CO-RE based application, turning the application portable to different + kernel versions. - Check examples bellow for more information how to use it. + Check examples bellow for more information how to use it. - **bpftool gen help** - Print short help message. +**bpftool gen help** + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst - -L, --use-loader - For skeletons, generate a "light" skeleton (also known as "loader" - skeleton). A light skeleton contains a loader eBPF program. It does - not use the majority of the libbpf infrastructure, and does not need - libelf. +-L, --use-loader + For skeletons, generate a "light" skeleton (also known as "loader" + skeleton). A light skeleton contains a loader eBPF program. It does not use + the majority of the libbpf infrastructure, and does not need libelf. EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-iter.rst b/tools/bpf/bpftool/Documentation/bpftool-iter.rst index 84839d488621..af17cc0201d4 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-iter.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-iter.rst @@ -14,50 +14,46 @@ tool to create BPF iterators SYNOPSIS ======== - **bpftool** [*OPTIONS*] **iter** *COMMAND* +**bpftool** [*OPTIONS*] **iter** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| } +*OPTIONS* := { |COMMON_OPTIONS| } - *COMMANDS* := { **pin** | **help** } +*COMMANDS* := { **pin** | **help** } ITER COMMANDS =================== -| **bpftool** **iter pin** *OBJ* *PATH* [**map** *MAP*] -| **bpftool** **iter help** +| **bpftool** **iter pin** *OBJ* *PATH* [**map** *MAP*] +| **bpftool** **iter help** | -| *OBJ* := /a/file/of/bpf_iter_target.o -| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } +| *OBJ* := /a/file/of/bpf_iter_target.o +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } DESCRIPTION =========== - **bpftool iter pin** *OBJ* *PATH* [**map** *MAP*] - A bpf iterator combines a kernel iterating of - particular kernel data (e.g., tasks, bpf_maps, etc.) - and a bpf program called for each kernel data object - (e.g., one task, one bpf_map, etc.). User space can - *read* kernel iterator output through *read()* syscall. +**bpftool iter pin** *OBJ* *PATH* [**map** *MAP*] + A bpf iterator combines a kernel iterating of particular kernel data (e.g., + tasks, bpf_maps, etc.) and a bpf program called for each kernel data object + (e.g., one task, one bpf_map, etc.). User space can *read* kernel iterator + output through *read()* syscall. - The *pin* command creates a bpf iterator from *OBJ*, - and pin it to *PATH*. The *PATH* should be located - in *bpffs* mount. It must not contain a dot - character ('.'), which is reserved for future extensions - of *bpffs*. + The *pin* command creates a bpf iterator from *OBJ*, and pin it to *PATH*. + The *PATH* should be located in *bpffs* mount. It must not contain a dot + character ('.'), which is reserved for future extensions of *bpffs*. - Map element bpf iterator requires an additional parameter - *MAP* so bpf program can iterate over map elements for - that map. User can have a bpf program in kernel to run - with each map element, do checking, filtering, aggregation, - etc. without copying data to user space. + Map element bpf iterator requires an additional parameter *MAP* so bpf + program can iterate over map elements for that map. User can have a bpf + program in kernel to run with each map element, do checking, filtering, + aggregation, etc. without copying data to user space. - User can then *cat PATH* to see the bpf iterator output. + User can then *cat PATH* to see the bpf iterator output. - **bpftool iter help** - Print short help message. +**bpftool iter help** + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-link.rst b/tools/bpf/bpftool/Documentation/bpftool-link.rst index 52a4eee4af54..6d39ad890fea 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-link.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-link.rst @@ -14,67 +14,62 @@ tool for inspection and simple manipulation of eBPF links SYNOPSIS ======== - **bpftool** [*OPTIONS*] **link** *COMMAND* +**bpftool** [*OPTIONS*] **link** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } } +*OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } } - *COMMANDS* := { **show** | **list** | **pin** | **help** } +*COMMANDS* := { **show** | **list** | **pin** | **help** } LINK COMMANDS ============= -| **bpftool** **link { show | list }** [*LINK*] -| **bpftool** **link pin** *LINK* *FILE* -| **bpftool** **link detach** *LINK* -| **bpftool** **link help** +| **bpftool** **link { show | list }** [*LINK*] +| **bpftool** **link pin** *LINK* *FILE* +| **bpftool** **link detach** *LINK* +| **bpftool** **link help** | -| *LINK* := { **id** *LINK_ID* | **pinned** *FILE* } +| *LINK* := { **id** *LINK_ID* | **pinned** *FILE* } DESCRIPTION =========== - **bpftool link { show | list }** [*LINK*] - Show information about active links. If *LINK* is - specified show information only about given link, - otherwise list all links currently active on the system. +**bpftool link { show | list }** [*LINK*] + Show information about active links. If *LINK* is specified show + information only about given link, otherwise list all links currently + active on the system. - Output will start with link ID followed by link type and - zero or more named attributes, some of which depend on type - of link. + Output will start with link ID followed by link type and zero or more named + attributes, some of which depend on type of link. - Since Linux 5.8 bpftool is able to discover information about - processes that hold open file descriptors (FDs) against BPF - links. On such kernels bpftool will automatically emit this - information as well. + Since Linux 5.8 bpftool is able to discover information about processes + that hold open file descriptors (FDs) against BPF links. On such kernels + bpftool will automatically emit this information as well. - **bpftool link pin** *LINK* *FILE* - Pin link *LINK* as *FILE*. +**bpftool link pin** *LINK* *FILE* + Pin link *LINK* as *FILE*. - Note: *FILE* must be located in *bpffs* mount. It must not - contain a dot character ('.'), which is reserved for future - extensions of *bpffs*. + Note: *FILE* must be located in *bpffs* mount. It must not contain a dot + character ('.'), which is reserved for future extensions of *bpffs*. - **bpftool link detach** *LINK* - Force-detach link *LINK*. BPF link and its underlying BPF - program will stay valid, but they will be detached from the - respective BPF hook and BPF link will transition into - a defunct state until last open file descriptor for that - link is closed. +**bpftool link detach** *LINK* + Force-detach link *LINK*. BPF link and its underlying BPF program will stay + valid, but they will be detached from the respective BPF hook and BPF link + will transition into a defunct state until last open file descriptor for + that link is closed. - **bpftool link help** - Print short help message. +**bpftool link help** + Print short help message. OPTIONS ======= - .. include:: common_options.rst + .. include:: common_options.rst - -f, --bpffs - When showing BPF links, show file names of pinned - links. + -f, --bpffs + When showing BPF links, show file names of pinned links. - -n, --nomount - Do not automatically attempt to mount any virtual file system - (such as tracefs or BPF virtual file system) when necessary. + -n, --nomount + Do not automatically attempt to mount any virtual file system (such as + tracefs or BPF virtual file system) when necessary. EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 9d6a314dfd7a..864257169942 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -14,166 +14,160 @@ tool for inspection and simple manipulation of eBPF maps SYNOPSIS ======== - **bpftool** [*OPTIONS*] **map** *COMMAND* +**bpftool** [*OPTIONS*] **map** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } } +*OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } } - *COMMANDS* := - { **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** | - **delete** | **pin** | **help** } +*COMMANDS* := +{ **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** | +**delete** | **pin** | **help** } MAP COMMANDS ============= -| **bpftool** **map** { **show** | **list** } [*MAP*] -| **bpftool** **map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* \ -| **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**inner_map** *MAP*] \ -| [**offload_dev** *NAME*] -| **bpftool** **map dump** *MAP* -| **bpftool** **map update** *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*] -| **bpftool** **map lookup** *MAP* [**key** *DATA*] -| **bpftool** **map getnext** *MAP* [**key** *DATA*] -| **bpftool** **map delete** *MAP* **key** *DATA* -| **bpftool** **map pin** *MAP* *FILE* -| **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] -| **bpftool** **map peek** *MAP* -| **bpftool** **map push** *MAP* **value** *VALUE* -| **bpftool** **map pop** *MAP* -| **bpftool** **map enqueue** *MAP* **value** *VALUE* -| **bpftool** **map dequeue** *MAP* -| **bpftool** **map freeze** *MAP* -| **bpftool** **map help** +| **bpftool** **map** { **show** | **list** } [*MAP*] +| **bpftool** **map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* \ +| **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**inner_map** *MAP*] \ +| [**offload_dev** *NAME*] +| **bpftool** **map dump** *MAP* +| **bpftool** **map update** *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*] +| **bpftool** **map lookup** *MAP* [**key** *DATA*] +| **bpftool** **map getnext** *MAP* [**key** *DATA*] +| **bpftool** **map delete** *MAP* **key** *DATA* +| **bpftool** **map pin** *MAP* *FILE* +| **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] +| **bpftool** **map peek** *MAP* +| **bpftool** **map push** *MAP* **value** *VALUE* +| **bpftool** **map pop** *MAP* +| **bpftool** **map enqueue** *MAP* **value** *VALUE* +| **bpftool** **map dequeue** *MAP* +| **bpftool** **map freeze** *MAP* +| **bpftool** **map help** | -| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* | **name** *MAP_NAME* } -| *DATA* := { [**hex**] *BYTES* } -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } -| *VALUE* := { *DATA* | *MAP* | *PROG* } -| *UPDATE_FLAGS* := { **any** | **exist** | **noexist** } -| *TYPE* := { **hash** | **array** | **prog_array** | **perf_event_array** | **percpu_hash** -| | **percpu_array** | **stack_trace** | **cgroup_array** | **lru_hash** -| | **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps** -| | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** -| | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** -| | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** -| | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** | **arena** } +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* | **name** *MAP_NAME* } +| *DATA* := { [**hex**] *BYTES* } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } +| *VALUE* := { *DATA* | *MAP* | *PROG* } +| *UPDATE_FLAGS* := { **any** | **exist** | **noexist** } +| *TYPE* := { **hash** | **array** | **prog_array** | **perf_event_array** | **percpu_hash** +| | **percpu_array** | **stack_trace** | **cgroup_array** | **lru_hash** +| | **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps** +| | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** +| | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** +| | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** +| | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** | **arena** } DESCRIPTION =========== - **bpftool map { show | list }** [*MAP*] - Show information about loaded maps. If *MAP* is specified - show information only about given maps, otherwise list all - maps currently loaded on the system. In case of **name**, - *MAP* may match several maps which will all be shown. +**bpftool map { show | list }** [*MAP*] + Show information about loaded maps. If *MAP* is specified show information + only about given maps, otherwise list all maps currently loaded on the + system. In case of **name**, *MAP* may match several maps which will all + be shown. - Output will start with map ID followed by map type and - zero or more named attributes (depending on kernel version). + Output will start with map ID followed by map type and zero or more named + attributes (depending on kernel version). - Since Linux 5.8 bpftool is able to discover information about - processes that hold open file descriptors (FDs) against BPF - maps. On such kernels bpftool will automatically emit this - information as well. + Since Linux 5.8 bpftool is able to discover information about processes + that hold open file descriptors (FDs) against BPF maps. On such kernels + bpftool will automatically emit this information as well. - **bpftool map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**inner_map** *MAP*] [**offload_dev** *NAME*] - Create a new map with given parameters and pin it to *bpffs* - as *FILE*. +**bpftool map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**inner_map** *MAP*] [**offload_dev** *NAME*] + Create a new map with given parameters and pin it to *bpffs* as *FILE*. - *FLAGS* should be an integer which is the combination of - desired flags, e.g. 1024 for **BPF_F_MMAPABLE** (see bpf.h - UAPI header for existing flags). + *FLAGS* should be an integer which is the combination of desired flags, + e.g. 1024 for **BPF_F_MMAPABLE** (see bpf.h UAPI header for existing + flags). - To create maps of type array-of-maps or hash-of-maps, the - **inner_map** keyword must be used to pass an inner map. The - kernel needs it to collect metadata related to the inner maps - that the new map will work with. + To create maps of type array-of-maps or hash-of-maps, the **inner_map** + keyword must be used to pass an inner map. The kernel needs it to collect + metadata related to the inner maps that the new map will work with. - Keyword **offload_dev** expects a network interface name, - and is used to request hardware offload for the map. + Keyword **offload_dev** expects a network interface name, and is used to + request hardware offload for the map. - **bpftool map dump** *MAP* - Dump all entries in a given *MAP*. In case of **name**, - *MAP* may match several maps which will all be dumped. +**bpftool map dump** *MAP* + Dump all entries in a given *MAP*. In case of **name**, *MAP* may match + several maps which will all be dumped. - **bpftool map update** *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*] - Update map entry for a given *KEY*. +**bpftool map update** *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*] + Update map entry for a given *KEY*. - *UPDATE_FLAGS* can be one of: **any** update existing entry - or add if doesn't exit; **exist** update only if entry already - exists; **noexist** update only if entry doesn't exist. + *UPDATE_FLAGS* can be one of: **any** update existing entry or add if + doesn't exit; **exist** update only if entry already exists; **noexist** + update only if entry doesn't exist. - If the **hex** keyword is provided in front of the bytes - sequence, the bytes are parsed as hexadecimal values, even if - no "0x" prefix is added. If the keyword is not provided, then - the bytes are parsed as decimal values, unless a "0x" prefix - (for hexadecimal) or a "0" prefix (for octal) is provided. + If the **hex** keyword is provided in front of the bytes sequence, the + bytes are parsed as hexadecimal values, even if no "0x" prefix is added. If + the keyword is not provided, then the bytes are parsed as decimal values, + unless a "0x" prefix (for hexadecimal) or a "0" prefix (for octal) is + provided. - **bpftool map lookup** *MAP* [**key** *DATA*] - Lookup **key** in the map. +**bpftool map lookup** *MAP* [**key** *DATA*] + Lookup **key** in the map. - **bpftool map getnext** *MAP* [**key** *DATA*] - Get next key. If *key* is not specified, get first key. +**bpftool map getnext** *MAP* [**key** *DATA*] + Get next key. If *key* is not specified, get first key. - **bpftool map delete** *MAP* **key** *DATA* - Remove entry from the map. +**bpftool map delete** *MAP* **key** *DATA* + Remove entry from the map. - **bpftool map pin** *MAP* *FILE* - Pin map *MAP* as *FILE*. +**bpftool map pin** *MAP* *FILE* + Pin map *MAP* as *FILE*. - Note: *FILE* must be located in *bpffs* mount. It must not - contain a dot character ('.'), which is reserved for future - extensions of *bpffs*. + Note: *FILE* must be located in *bpffs* mount. It must not contain a dot + character ('.'), which is reserved for future extensions of *bpffs*. - **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] - Read events from a **BPF_MAP_TYPE_PERF_EVENT_ARRAY** map. +**bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] + Read events from a **BPF_MAP_TYPE_PERF_EVENT_ARRAY** map. - Install perf rings into a perf event array map and dump - output of any **bpf_perf_event_output**\ () call in the kernel. - By default read the number of CPUs on the system and - install perf ring for each CPU in the corresponding index - in the array. + Install perf rings into a perf event array map and dump output of any + **bpf_perf_event_output**\ () call in the kernel. By default read the + number of CPUs on the system and install perf ring for each CPU in the + corresponding index in the array. - If **cpu** and **index** are specified, install perf ring - for given **cpu** at **index** in the array (single ring). + If **cpu** and **index** are specified, install perf ring for given **cpu** + at **index** in the array (single ring). - Note that installing a perf ring into an array will silently - replace any existing ring. Any other application will stop - receiving events if it installed its rings earlier. + Note that installing a perf ring into an array will silently replace any + existing ring. Any other application will stop receiving events if it + installed its rings earlier. - **bpftool map peek** *MAP* - Peek next value in the queue or stack. +**bpftool map peek** *MAP* + Peek next value in the queue or stack. - **bpftool map push** *MAP* **value** *VALUE* - Push *VALUE* onto the stack. +**bpftool map push** *MAP* **value** *VALUE* + Push *VALUE* onto the stack. - **bpftool map pop** *MAP* - Pop and print value from the stack. +**bpftool map pop** *MAP* + Pop and print value from the stack. - **bpftool map enqueue** *MAP* **value** *VALUE* - Enqueue *VALUE* into the queue. +**bpftool map enqueue** *MAP* **value** *VALUE* + Enqueue *VALUE* into the queue. - **bpftool map dequeue** *MAP* - Dequeue and print value from the queue. +**bpftool map dequeue** *MAP* + Dequeue and print value from the queue. - **bpftool map freeze** *MAP* - Freeze the map as read-only from user space. Entries from a - frozen map can not longer be updated or deleted with the - **bpf**\ () system call. This operation is not reversible, - and the map remains immutable from user space until its - destruction. However, read and write permissions for BPF - programs to the map remain unchanged. +**bpftool map freeze** *MAP* + Freeze the map as read-only from user space. Entries from a frozen map can + not longer be updated or deleted with the **bpf**\ () system call. This + operation is not reversible, and the map remains immutable from user space + until its destruction. However, read and write permissions for BPF programs + to the map remain unchanged. - **bpftool map help** - Print short help message. +**bpftool map help** + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst - -f, --bpffs - Show file names of pinned maps. +-f, --bpffs + Show file names of pinned maps. - -n, --nomount - Do not automatically attempt to mount any virtual file system - (such as tracefs or BPF virtual file system) when necessary. +-n, --nomount + Do not automatically attempt to mount any virtual file system (such as + tracefs or BPF virtual file system) when necessary. EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index dd3f9469765b..43d2c858fdf8 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -14,76 +14,74 @@ tool for inspection of networking related bpf prog attachments SYNOPSIS ======== - **bpftool** [*OPTIONS*] **net** *COMMAND* +**bpftool** [*OPTIONS*] **net** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| } +*OPTIONS* := { |COMMON_OPTIONS| } - *COMMANDS* := - { **show** | **list** | **attach** | **detach** | **help** } +*COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** } NET COMMANDS ============ -| **bpftool** **net** { **show** | **list** } [ **dev** *NAME* ] -| **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ] -| **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME* -| **bpftool** **net help** +| **bpftool** **net** { **show** | **list** } [ **dev** *NAME* ] +| **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ] +| **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME* +| **bpftool** **net help** | -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } -| *ATTACH_TYPE* := { **xdp** | **xdpgeneric** | **xdpdrv** | **xdpoffload** } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } +| *ATTACH_TYPE* := { **xdp** | **xdpgeneric** | **xdpdrv** | **xdpoffload** } DESCRIPTION =========== - **bpftool net { show | list }** [ **dev** *NAME* ] - List bpf program attachments in the kernel networking subsystem. +**bpftool net { show | list }** [ **dev** *NAME* ] + List bpf program attachments in the kernel networking subsystem. - Currently, device driver xdp attachments, tcx, netkit and old-style tc - classifier/action attachments, flow_dissector as well as netfilter - attachments are implemented, i.e., for - program types **BPF_PROG_TYPE_XDP**, **BPF_PROG_TYPE_SCHED_CLS**, - **BPF_PROG_TYPE_SCHED_ACT**, **BPF_PROG_TYPE_FLOW_DISSECTOR**, - **BPF_PROG_TYPE_NETFILTER**. + Currently, device driver xdp attachments, tcx, netkit and old-style tc + classifier/action attachments, flow_dissector as well as netfilter + attachments are implemented, i.e., for program types **BPF_PROG_TYPE_XDP**, + **BPF_PROG_TYPE_SCHED_CLS**, **BPF_PROG_TYPE_SCHED_ACT**, + **BPF_PROG_TYPE_FLOW_DISSECTOR**, **BPF_PROG_TYPE_NETFILTER**. - For programs attached to a particular cgroup, e.g., - **BPF_PROG_TYPE_CGROUP_SKB**, **BPF_PROG_TYPE_CGROUP_SOCK**, - **BPF_PROG_TYPE_SOCK_OPS** and **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, - users can use **bpftool cgroup** to dump cgroup attachments. - For sk_{filter, skb, msg, reuseport} and lwt/seg6 - bpf programs, users should consult other tools, e.g., iproute2. + For programs attached to a particular cgroup, e.g., + **BPF_PROG_TYPE_CGROUP_SKB**, **BPF_PROG_TYPE_CGROUP_SOCK**, + **BPF_PROG_TYPE_SOCK_OPS** and **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, users + can use **bpftool cgroup** to dump cgroup attachments. For sk_{filter, skb, + msg, reuseport} and lwt/seg6 bpf programs, users should consult other + tools, e.g., iproute2. - The current output will start with all xdp program attachments, followed by - all tcx, netkit, then tc class/qdisc bpf program attachments, then flow_dissector - and finally netfilter programs. Both xdp programs and tcx/netkit/tc programs are - ordered based on ifindex number. If multiple bpf programs attached - to the same networking device through **tc**, the order will be first - all bpf programs attached to tcx, netkit, then tc classes, then all bpf programs - attached to non clsact qdiscs, and finally all bpf programs attached - to root and clsact qdisc. + The current output will start with all xdp program attachments, followed by + all tcx, netkit, then tc class/qdisc bpf program attachments, then + flow_dissector and finally netfilter programs. Both xdp programs and + tcx/netkit/tc programs are ordered based on ifindex number. If multiple bpf + programs attached to the same networking device through **tc**, the order + will be first all bpf programs attached to tcx, netkit, then tc classes, + then all bpf programs attached to non clsact qdiscs, and finally all bpf + programs attached to root and clsact qdisc. - **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ] - Attach bpf program *PROG* to network interface *NAME* with - type specified by *ATTACH_TYPE*. Previously attached bpf program - can be replaced by the command used with **overwrite** option. - Currently, only XDP-related modes are supported for *ATTACH_TYPE*. +**bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ] + Attach bpf program *PROG* to network interface *NAME* with type specified + by *ATTACH_TYPE*. Previously attached bpf program can be replaced by the + command used with **overwrite** option. Currently, only XDP-related modes + are supported for *ATTACH_TYPE*. - *ATTACH_TYPE* can be of: - **xdp** - try native XDP and fallback to generic XDP if NIC driver does not support it; - **xdpgeneric** - Generic XDP. runs at generic XDP hook when packet already enters receive path as skb; - **xdpdrv** - Native XDP. runs earliest point in driver's receive path; - **xdpoffload** - Offload XDP. runs directly on NIC on each packet reception; + *ATTACH_TYPE* can be of: + **xdp** - try native XDP and fallback to generic XDP if NIC driver does not support it; + **xdpgeneric** - Generic XDP. runs at generic XDP hook when packet already enters receive path as skb; + **xdpdrv** - Native XDP. runs earliest point in driver's receive path; + **xdpoffload** - Offload XDP. runs directly on NIC on each packet reception; - **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME* - Detach bpf program attached to network interface *NAME* with - type specified by *ATTACH_TYPE*. To detach bpf program, same - *ATTACH_TYPE* previously used for attach must be specified. - Currently, only XDP-related modes are supported for *ATTACH_TYPE*. +**bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME* + Detach bpf program attached to network interface *NAME* with type specified + by *ATTACH_TYPE*. To detach bpf program, same *ATTACH_TYPE* previously used + for attach must be specified. Currently, only XDP-related modes are + supported for *ATTACH_TYPE*. - **bpftool net help** - Print short help message. +**bpftool net help** + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst index 5fea633a82f1..841789e30acd 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-perf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst @@ -14,37 +14,37 @@ tool for inspection of perf related bpf prog attachments SYNOPSIS ======== - **bpftool** [*OPTIONS*] **perf** *COMMAND* +**bpftool** [*OPTIONS*] **perf** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| } +*OPTIONS* := { |COMMON_OPTIONS| } - *COMMANDS* := - { **show** | **list** | **help** } +*COMMANDS* := +{ **show** | **list** | **help** } PERF COMMANDS ============= -| **bpftool** **perf** { **show** | **list** } -| **bpftool** **perf help** +| **bpftool** **perf** { **show** | **list** } +| **bpftool** **perf help** DESCRIPTION =========== - **bpftool perf { show | list }** - List all raw_tracepoint, tracepoint, kprobe attachment in the system. +**bpftool perf { show | list }** + List all raw_tracepoint, tracepoint, kprobe attachment in the system. - Output will start with process id and file descriptor in that process, - followed by bpf program id, attachment information, and attachment point. - The attachment point for raw_tracepoint/tracepoint is the trace probe name. - The attachment point for k[ret]probe is either symbol name and offset, - or a kernel virtual address. - The attachment point for u[ret]probe is the file name and the file offset. + Output will start with process id and file descriptor in that process, + followed by bpf program id, attachment information, and attachment point. + The attachment point for raw_tracepoint/tracepoint is the trace probe name. + The attachment point for k[ret]probe is either symbol name and offset, or a + kernel virtual address. The attachment point for u[ret]probe is the file + name and the file offset. - **bpftool perf help** - Print short help message. +**bpftool perf help** + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 58e6a5b10ef7..cc456d357db3 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -14,250 +14,226 @@ tool for inspection and simple manipulation of eBPF progs SYNOPSIS ======== - **bpftool** [*OPTIONS*] **prog** *COMMAND* +**bpftool** [*OPTIONS*] **prog** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| | - { **-f** | **--bpffs** } | { **-m** | **--mapcompat** } | { **-n** | **--nomount** } | - { **-L** | **--use-loader** } } +*OPTIONS* := { |COMMON_OPTIONS| | +{ **-f** | **--bpffs** } | { **-m** | **--mapcompat** } | { **-n** | **--nomount** } | +{ **-L** | **--use-loader** } } - *COMMANDS* := - { **show** | **list** | **dump xlated** | **dump jited** | **pin** | **load** | - **loadall** | **help** } +*COMMANDS* := +{ **show** | **list** | **dump xlated** | **dump jited** | **pin** | **load** | +**loadall** | **help** } PROG COMMANDS ============= -| **bpftool** **prog** { **show** | **list** } [*PROG*] -| **bpftool** **prog dump xlated** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] [**visual**] }] -| **bpftool** **prog dump jited** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] }] -| **bpftool** **prog pin** *PROG* *FILE* -| **bpftool** **prog** { **load** | **loadall** } *OBJ* *PATH* [**type** *TYPE*] [**map** { **idx** *IDX* | **name** *NAME* } *MAP*] [{ **offload_dev** | **xdpmeta_dev** } *NAME*] [**pinmaps** *MAP_DIR*] [**autoattach**] -| **bpftool** **prog attach** *PROG* *ATTACH_TYPE* [*MAP*] -| **bpftool** **prog detach** *PROG* *ATTACH_TYPE* [*MAP*] -| **bpftool** **prog tracelog** -| **bpftool** **prog run** *PROG* **data_in** *FILE* [**data_out** *FILE* [**data_size_out** *L*]] [**ctx_in** *FILE* [**ctx_out** *FILE* [**ctx_size_out** *M*]]] [**repeat** *N*] -| **bpftool** **prog profile** *PROG* [**duration** *DURATION*] *METRICs* -| **bpftool** **prog help** +| **bpftool** **prog** { **show** | **list** } [*PROG*] +| **bpftool** **prog dump xlated** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] [**visual**] }] +| **bpftool** **prog dump jited** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] }] +| **bpftool** **prog pin** *PROG* *FILE* +| **bpftool** **prog** { **load** | **loadall** } *OBJ* *PATH* [**type** *TYPE*] [**map** { **idx** *IDX* | **name** *NAME* } *MAP*] [{ **offload_dev** | **xdpmeta_dev** } *NAME*] [**pinmaps** *MAP_DIR*] [**autoattach**] +| **bpftool** **prog attach** *PROG* *ATTACH_TYPE* [*MAP*] +| **bpftool** **prog detach** *PROG* *ATTACH_TYPE* [*MAP*] +| **bpftool** **prog tracelog** +| **bpftool** **prog run** *PROG* **data_in** *FILE* [**data_out** *FILE* [**data_size_out** *L*]] [**ctx_in** *FILE* [**ctx_out** *FILE* [**ctx_size_out** *M*]]] [**repeat** *N*] +| **bpftool** **prog profile** *PROG* [**duration** *DURATION*] *METRICs* +| **bpftool** **prog help** | -| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } -| *TYPE* := { -| **socket** | **kprobe** | **kretprobe** | **classifier** | **action** | -| **tracepoint** | **raw_tracepoint** | **xdp** | **perf_event** | **cgroup/skb** | -| **cgroup/sock** | **cgroup/dev** | **lwt_in** | **lwt_out** | **lwt_xmit** | -| **lwt_seg6local** | **sockops** | **sk_skb** | **sk_msg** | **lirc_mode2** | -| **cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** | -| **cgroup/connect4** | **cgroup/connect6** | **cgroup/connect_unix** | -| **cgroup/getpeername4** | **cgroup/getpeername6** | **cgroup/getpeername_unix** | -| **cgroup/getsockname4** | **cgroup/getsockname6** | **cgroup/getsockname_unix** | -| **cgroup/sendmsg4** | **cgroup/sendmsg6** | **cgroup/sendmsg_unix** | -| **cgroup/recvmsg4** | **cgroup/recvmsg6** | **cgroup/recvmsg_unix** | **cgroup/sysctl** | -| **cgroup/getsockopt** | **cgroup/setsockopt** | **cgroup/sock_release** | -| **struct_ops** | **fentry** | **fexit** | **freplace** | **sk_lookup** -| } -| *ATTACH_TYPE* := { -| **sk_msg_verdict** | **sk_skb_verdict** | **sk_skb_stream_verdict** | -| **sk_skb_stream_parser** | **flow_dissector** -| } -| *METRICs* := { -| **cycles** | **instructions** | **l1d_loads** | **llc_misses** | -| **itlb_misses** | **dtlb_misses** -| } +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } +| *TYPE* := { +| **socket** | **kprobe** | **kretprobe** | **classifier** | **action** | +| **tracepoint** | **raw_tracepoint** | **xdp** | **perf_event** | **cgroup/skb** | +| **cgroup/sock** | **cgroup/dev** | **lwt_in** | **lwt_out** | **lwt_xmit** | +| **lwt_seg6local** | **sockops** | **sk_skb** | **sk_msg** | **lirc_mode2** | +| **cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** | +| **cgroup/connect4** | **cgroup/connect6** | **cgroup/connect_unix** | +| **cgroup/getpeername4** | **cgroup/getpeername6** | **cgroup/getpeername_unix** | +| **cgroup/getsockname4** | **cgroup/getsockname6** | **cgroup/getsockname_unix** | +| **cgroup/sendmsg4** | **cgroup/sendmsg6** | **cgroup/sendmsg_unix** | +| **cgroup/recvmsg4** | **cgroup/recvmsg6** | **cgroup/recvmsg_unix** | **cgroup/sysctl** | +| **cgroup/getsockopt** | **cgroup/setsockopt** | **cgroup/sock_release** | +| **struct_ops** | **fentry** | **fexit** | **freplace** | **sk_lookup** +| } +| *ATTACH_TYPE* := { +| **sk_msg_verdict** | **sk_skb_verdict** | **sk_skb_stream_verdict** | +| **sk_skb_stream_parser** | **flow_dissector** +| } +| *METRICs* := { +| **cycles** | **instructions** | **l1d_loads** | **llc_misses** | +| **itlb_misses** | **dtlb_misses** +| } DESCRIPTION =========== - **bpftool prog { show | list }** [*PROG*] - Show information about loaded programs. If *PROG* is - specified show information only about given programs, - otherwise list all programs currently loaded on the system. - In case of **tag** or **name**, *PROG* may match several - programs which will all be shown. +**bpftool prog { show | list }** [*PROG*] + Show information about loaded programs. If *PROG* is specified show + information only about given programs, otherwise list all programs + currently loaded on the system. In case of **tag** or **name**, *PROG* may + match several programs which will all be shown. - Output will start with program ID followed by program type and - zero or more named attributes (depending on kernel version). + Output will start with program ID followed by program type and zero or more + named attributes (depending on kernel version). - Since Linux 5.1 the kernel can collect statistics on BPF - programs (such as the total time spent running the program, - and the number of times it was run). If available, bpftool - shows such statistics. However, the kernel does not collect - them by defaults, as it slightly impacts performance on each - program run. Activation or deactivation of the feature is - performed via the **kernel.bpf_stats_enabled** sysctl knob. + Since Linux 5.1 the kernel can collect statistics on BPF programs (such as + the total time spent running the program, and the number of times it was + run). If available, bpftool shows such statistics. However, the kernel does + not collect them by defaults, as it slightly impacts performance on each + program run. Activation or deactivation of the feature is performed via the + **kernel.bpf_stats_enabled** sysctl knob. - Since Linux 5.8 bpftool is able to discover information about - processes that hold open file descriptors (FDs) against BPF - programs. On such kernels bpftool will automatically emit this - information as well. + Since Linux 5.8 bpftool is able to discover information about processes + that hold open file descriptors (FDs) against BPF programs. On such kernels + bpftool will automatically emit this information as well. - **bpftool prog dump xlated** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] [**visual**] }] - Dump eBPF instructions of the programs from the kernel. By - default, eBPF will be disassembled and printed to standard - output in human-readable format. In this case, **opcodes** - controls if raw opcodes should be printed as well. +**bpftool prog dump xlated** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] [**visual**] }] + Dump eBPF instructions of the programs from the kernel. By default, eBPF + will be disassembled and printed to standard output in human-readable + format. In this case, **opcodes** controls if raw opcodes should be printed + as well. - In case of **tag** or **name**, *PROG* may match several - programs which will all be dumped. However, if **file** or - **visual** is specified, *PROG* must match a single program. + In case of **tag** or **name**, *PROG* may match several programs which + will all be dumped. However, if **file** or **visual** is specified, + *PROG* must match a single program. - If **file** is specified, the binary image will instead be - written to *FILE*. + If **file** is specified, the binary image will instead be written to + *FILE*. - If **visual** is specified, control flow graph (CFG) will be - built instead, and eBPF instructions will be presented with - CFG in DOT format, on standard output. + If **visual** is specified, control flow graph (CFG) will be built instead, + and eBPF instructions will be presented with CFG in DOT format, on standard + output. - If the programs have line_info available, the source line will - be displayed. If **linum** is specified, the filename, line - number and line column will also be displayed. + If the programs have line_info available, the source line will be + displayed. If **linum** is specified, the filename, line number and line + column will also be displayed. - **bpftool prog dump jited** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] }] - Dump jited image (host machine code) of the program. +**bpftool prog dump jited** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] }] + Dump jited image (host machine code) of the program. - If *FILE* is specified image will be written to a file, - otherwise it will be disassembled and printed to stdout. - *PROG* must match a single program when **file** is specified. + If *FILE* is specified image will be written to a file, otherwise it will + be disassembled and printed to stdout. *PROG* must match a single program + when **file** is specified. - **opcodes** controls if raw opcodes will be printed. + **opcodes** controls if raw opcodes will be printed. - If the prog has line_info available, the source line will - be displayed. If **linum** is specified, the filename, line - number and line column will also be displayed. + If the prog has line_info available, the source line will be displayed. If + **linum** is specified, the filename, line number and line column will also + be displayed. - **bpftool prog pin** *PROG* *FILE* - Pin program *PROG* as *FILE*. +**bpftool prog pin** *PROG* *FILE* + Pin program *PROG* as *FILE*. - Note: *FILE* must be located in *bpffs* mount. It must not - contain a dot character ('.'), which is reserved for future - extensions of *bpffs*. + Note: *FILE* must be located in *bpffs* mount. It must not contain a dot + character ('.'), which is reserved for future extensions of *bpffs*. - **bpftool prog { load | loadall }** *OBJ* *PATH* [**type** *TYPE*] [**map** { **idx** *IDX* | **name** *NAME* } *MAP*] [{ **offload_dev** | **xdpmeta_dev** } *NAME*] [**pinmaps** *MAP_DIR*] [**autoattach**] - Load bpf program(s) from binary *OBJ* and pin as *PATH*. - **bpftool prog load** pins only the first program from the - *OBJ* as *PATH*. **bpftool prog loadall** pins all programs - from the *OBJ* under *PATH* directory. - **type** is optional, if not specified program type will be - inferred from section names. - By default bpftool will create new maps as declared in the ELF - object being loaded. **map** parameter allows for the reuse - of existing maps. It can be specified multiple times, each - time for a different map. *IDX* refers to index of the map - to be replaced in the ELF file counting from 0, while *NAME* - allows to replace a map by name. *MAP* specifies the map to - use, referring to it by **id** or through a **pinned** file. - If **offload_dev** *NAME* is specified program will be loaded - onto given networking device (offload). - If **xdpmeta_dev** *NAME* is specified program will become - device-bound without offloading, this facilitates access - to XDP metadata. - Optional **pinmaps** argument can be provided to pin all - maps under *MAP_DIR* directory. +**bpftool prog { load | loadall }** *OBJ* *PATH* [**type** *TYPE*] [**map** { **idx** *IDX* | **name** *NAME* } *MAP*] [{ **offload_dev** | **xdpmeta_dev** } *NAME*] [**pinmaps** *MAP_DIR*] [**autoattach**] + Load bpf program(s) from binary *OBJ* and pin as *PATH*. **bpftool prog + load** pins only the first program from the *OBJ* as *PATH*. **bpftool prog + loadall** pins all programs from the *OBJ* under *PATH* directory. **type** + is optional, if not specified program type will be inferred from section + names. By default bpftool will create new maps as declared in the ELF + object being loaded. **map** parameter allows for the reuse of existing + maps. It can be specified multiple times, each time for a different map. + *IDX* refers to index of the map to be replaced in the ELF file counting + from 0, while *NAME* allows to replace a map by name. *MAP* specifies the + map to use, referring to it by **id** or through a **pinned** file. If + **offload_dev** *NAME* is specified program will be loaded onto given + networking device (offload). If **xdpmeta_dev** *NAME* is specified program + will become device-bound without offloading, this facilitates access to XDP + metadata. Optional **pinmaps** argument can be provided to pin all maps + under *MAP_DIR* directory. - If **autoattach** is specified program will be attached - before pin. In that case, only the link (representing the - program attached to its hook) is pinned, not the program as - such, so the path won't show in **bpftool prog show -f**, - only show in **bpftool link show -f**. Also, this only works - when bpftool (libbpf) is able to infer all necessary - information from the object file, in particular, it's not - supported for all program types. If a program does not - support autoattach, bpftool falls back to regular pinning - for that program instead. + If **autoattach** is specified program will be attached before pin. In that + case, only the link (representing the program attached to its hook) is + pinned, not the program as such, so the path won't show in **bpftool prog + show -f**, only show in **bpftool link show -f**. Also, this only works + when bpftool (libbpf) is able to infer all necessary information from the + object file, in particular, it's not supported for all program types. If a + program does not support autoattach, bpftool falls back to regular pinning + for that program instead. - Note: *PATH* must be located in *bpffs* mount. It must not - contain a dot character ('.'), which is reserved for future - extensions of *bpffs*. + Note: *PATH* must be located in *bpffs* mount. It must not contain a dot + character ('.'), which is reserved for future extensions of *bpffs*. - **bpftool prog attach** *PROG* *ATTACH_TYPE* [*MAP*] - Attach bpf program *PROG* (with type specified by - *ATTACH_TYPE*). Most *ATTACH_TYPEs* require a *MAP* - parameter, with the exception of *flow_dissector* which is - attached to current networking name space. +**bpftool prog attach** *PROG* *ATTACH_TYPE* [*MAP*] + Attach bpf program *PROG* (with type specified by *ATTACH_TYPE*). Most + *ATTACH_TYPEs* require a *MAP* parameter, with the exception of + *flow_dissector* which is attached to current networking name space. - **bpftool prog detach** *PROG* *ATTACH_TYPE* [*MAP*] - Detach bpf program *PROG* (with type specified by - *ATTACH_TYPE*). Most *ATTACH_TYPEs* require a *MAP* - parameter, with the exception of *flow_dissector* which is - detached from the current networking name space. +**bpftool prog detach** *PROG* *ATTACH_TYPE* [*MAP*] + Detach bpf program *PROG* (with type specified by *ATTACH_TYPE*). Most + *ATTACH_TYPEs* require a *MAP* parameter, with the exception of + *flow_dissector* which is detached from the current networking name space. - **bpftool prog tracelog** - Dump the trace pipe of the system to the console (stdout). - Hit to stop printing. BPF programs can write to this - trace pipe at runtime with the **bpf_trace_printk**\ () helper. - This should be used only for debugging purposes. For - streaming data from BPF programs to user space, one can use - perf events (see also **bpftool-map**\ (8)). +**bpftool prog tracelog** + Dump the trace pipe of the system to the console (stdout). Hit to + stop printing. BPF programs can write to this trace pipe at runtime with + the **bpf_trace_printk**\ () helper. This should be used only for debugging + purposes. For streaming data from BPF programs to user space, one can use + perf events (see also **bpftool-map**\ (8)). - **bpftool prog run** *PROG* **data_in** *FILE* [**data_out** *FILE* [**data_size_out** *L*]] [**ctx_in** *FILE* [**ctx_out** *FILE* [**ctx_size_out** *M*]]] [**repeat** *N*] - Run BPF program *PROG* in the kernel testing infrastructure - for BPF, meaning that the program works on the data and - context provided by the user, and not on actual packets or - monitored functions etc. Return value and duration for the - test run are printed out to the console. +**bpftool prog run** *PROG* **data_in** *FILE* [**data_out** *FILE* [**data_size_out** *L*]] [**ctx_in** *FILE* [**ctx_out** *FILE* [**ctx_size_out** *M*]]] [**repeat** *N*] + Run BPF program *PROG* in the kernel testing infrastructure for BPF, + meaning that the program works on the data and context provided by the + user, and not on actual packets or monitored functions etc. Return value + and duration for the test run are printed out to the console. - Input data is read from the *FILE* passed with **data_in**. - If this *FILE* is "**-**", input data is read from standard - input. Input context, if any, is read from *FILE* passed with - **ctx_in**. Again, "**-**" can be used to read from standard - input, but only if standard input is not already in use for - input data. If a *FILE* is passed with **data_out**, output - data is written to that file. Similarly, output context is - written to the *FILE* passed with **ctx_out**. For both - output flows, "**-**" can be used to print to the standard - output (as plain text, or JSON if relevant option was - passed). If output keywords are omitted, output data and - context are discarded. Keywords **data_size_out** and - **ctx_size_out** are used to pass the size (in bytes) for the - output buffers to the kernel, although the default of 32 kB - should be more than enough for most cases. + Input data is read from the *FILE* passed with **data_in**. If this *FILE* + is "**-**", input data is read from standard input. Input context, if any, + is read from *FILE* passed with **ctx_in**. Again, "**-**" can be used to + read from standard input, but only if standard input is not already in use + for input data. If a *FILE* is passed with **data_out**, output data is + written to that file. Similarly, output context is written to the *FILE* + passed with **ctx_out**. For both output flows, "**-**" can be used to + print to the standard output (as plain text, or JSON if relevant option was + passed). If output keywords are omitted, output data and context are + discarded. Keywords **data_size_out** and **ctx_size_out** are used to pass + the size (in bytes) for the output buffers to the kernel, although the + default of 32 kB should be more than enough for most cases. - Keyword **repeat** is used to indicate the number of - consecutive runs to perform. Note that output data and - context printed to files correspond to the last of those - runs. The duration printed out at the end of the runs is an - average over all runs performed by the command. + Keyword **repeat** is used to indicate the number of consecutive runs to + perform. Note that output data and context printed to files correspond to + the last of those runs. The duration printed out at the end of the runs is + an average over all runs performed by the command. - Not all program types support test run. Among those which do, - not all of them can take the **ctx_in**/**ctx_out** - arguments. bpftool does not perform checks on program types. + Not all program types support test run. Among those which do, not all of + them can take the **ctx_in**/**ctx_out** arguments. bpftool does not + perform checks on program types. - **bpftool prog profile** *PROG* [**duration** *DURATION*] *METRICs* - Profile *METRICs* for bpf program *PROG* for *DURATION* - seconds or until user hits . *DURATION* is optional. - If *DURATION* is not specified, the profiling will run up to - **UINT_MAX** seconds. +**bpftool prog profile** *PROG* [**duration** *DURATION*] *METRICs* + Profile *METRICs* for bpf program *PROG* for *DURATION* seconds or until + user hits . *DURATION* is optional. If *DURATION* is not specified, + the profiling will run up to **UINT_MAX** seconds. - **bpftool prog help** - Print short help message. +**bpftool prog help** + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst - -f, --bpffs - When showing BPF programs, show file names of pinned - programs. +-f, --bpffs + When showing BPF programs, show file names of pinned programs. - -m, --mapcompat - Allow loading maps with unknown map definitions. +-m, --mapcompat + Allow loading maps with unknown map definitions. - -n, --nomount - Do not automatically attempt to mount any virtual file system - (such as tracefs or BPF virtual file system) when necessary. +-n, --nomount + Do not automatically attempt to mount any virtual file system (such as + tracefs or BPF virtual file system) when necessary. - -L, --use-loader - Load program as a "loader" program. This is useful to debug - the generation of such programs. When this option is in - use, bpftool attempts to load the programs from the object - file into the kernel, but does not pin them (therefore, the - *PATH* must not be provided). +-L, --use-loader + Load program as a "loader" program. This is useful to debug the generation + of such programs. When this option is in use, bpftool attempts to load the + programs from the object file into the kernel, but does not pin them + (therefore, the *PATH* must not be provided). - When combined with the **-d**\ \|\ **--debug** option, - additional debug messages are generated, and the execution - of the loader program will use the **bpf_trace_printk**\ () - helper to log each step of loading BTF, creating the maps, - and loading the programs (see **bpftool prog tracelog** as - a way to dump those messages). + When combined with the **-d**\ \|\ **--debug** option, additional debug + messages are generated, and the execution of the loader program will use + the **bpf_trace_printk**\ () helper to log each step of loading BTF, + creating the maps, and loading the programs (see **bpftool prog tracelog** + as a way to dump those messages). EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst index 8022b5321dbe..61ec1286446c 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst @@ -14,61 +14,60 @@ tool to register/unregister/introspect BPF struct_ops SYNOPSIS ======== - **bpftool** [*OPTIONS*] **struct_ops** *COMMAND* +**bpftool** [*OPTIONS*] **struct_ops** *COMMAND* - *OPTIONS* := { |COMMON_OPTIONS| } +*OPTIONS* := { |COMMON_OPTIONS| } - *COMMANDS* := - { **show** | **list** | **dump** | **register** | **unregister** | **help** } +*COMMANDS* := +{ **show** | **list** | **dump** | **register** | **unregister** | **help** } STRUCT_OPS COMMANDS =================== -| **bpftool** **struct_ops { show | list }** [*STRUCT_OPS_MAP*] -| **bpftool** **struct_ops dump** [*STRUCT_OPS_MAP*] -| **bpftool** **struct_ops register** *OBJ* [*LINK_DIR*] -| **bpftool** **struct_ops unregister** *STRUCT_OPS_MAP* -| **bpftool** **struct_ops help** +| **bpftool** **struct_ops { show | list }** [*STRUCT_OPS_MAP*] +| **bpftool** **struct_ops dump** [*STRUCT_OPS_MAP*] +| **bpftool** **struct_ops register** *OBJ* [*LINK_DIR*] +| **bpftool** **struct_ops unregister** *STRUCT_OPS_MAP* +| **bpftool** **struct_ops help** | -| *STRUCT_OPS_MAP* := { **id** *STRUCT_OPS_MAP_ID* | **name** *STRUCT_OPS_MAP_NAME* } -| *OBJ* := /a/file/of/bpf_struct_ops.o +| *STRUCT_OPS_MAP* := { **id** *STRUCT_OPS_MAP_ID* | **name** *STRUCT_OPS_MAP_NAME* } +| *OBJ* := /a/file/of/bpf_struct_ops.o DESCRIPTION =========== - **bpftool struct_ops { show | list }** [*STRUCT_OPS_MAP*] - Show brief information about the struct_ops in the system. - If *STRUCT_OPS_MAP* is specified, it shows information only - for the given struct_ops. Otherwise, it lists all struct_ops - currently existing in the system. +**bpftool struct_ops { show | list }** [*STRUCT_OPS_MAP*] + Show brief information about the struct_ops in the system. If + *STRUCT_OPS_MAP* is specified, it shows information only for the given + struct_ops. Otherwise, it lists all struct_ops currently existing in the + system. - Output will start with struct_ops map ID, followed by its map - name and its struct_ops's kernel type. + Output will start with struct_ops map ID, followed by its map name and its + struct_ops's kernel type. - **bpftool struct_ops dump** [*STRUCT_OPS_MAP*] - Dump details information about the struct_ops in the system. - If *STRUCT_OPS_MAP* is specified, it dumps information only - for the given struct_ops. Otherwise, it dumps all struct_ops - currently existing in the system. +**bpftool struct_ops dump** [*STRUCT_OPS_MAP*] + Dump details information about the struct_ops in the system. If + *STRUCT_OPS_MAP* is specified, it dumps information only for the given + struct_ops. Otherwise, it dumps all struct_ops currently existing in the + system. - **bpftool struct_ops register** *OBJ* [*LINK_DIR*] - Register bpf struct_ops from *OBJ*. All struct_ops under - the ELF section ".struct_ops" and ".struct_ops.link" will - be registered to its kernel subsystem. For each - struct_ops in the ".struct_ops.link" section, a link - will be created. You can give *LINK_DIR* to provide a - directory path where these links will be pinned with the - same name as their corresponding map name. +**bpftool struct_ops register** *OBJ* [*LINK_DIR*] + Register bpf struct_ops from *OBJ*. All struct_ops under the ELF section + ".struct_ops" and ".struct_ops.link" will be registered to its kernel + subsystem. For each struct_ops in the ".struct_ops.link" section, a link + will be created. You can give *LINK_DIR* to provide a directory path where + these links will be pinned with the same name as their corresponding map + name. - **bpftool struct_ops unregister** *STRUCT_OPS_MAP* - Unregister the *STRUCT_OPS_MAP* from the kernel subsystem. +**bpftool struct_ops unregister** *STRUCT_OPS_MAP* + Unregister the *STRUCT_OPS_MAP* from the kernel subsystem. - **bpftool struct_ops help** - Print short help message. +**bpftool struct_ops help** + Print short help message. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index 09e4f2ff5658..f38ae5c40439 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -14,57 +14,57 @@ tool for inspection and simple manipulation of eBPF programs and maps SYNOPSIS ======== - **bpftool** [*OPTIONS*] *OBJECT* { *COMMAND* | **help** } +**bpftool** [*OPTIONS*] *OBJECT* { *COMMAND* | **help** } - **bpftool** **batch file** *FILE* +**bpftool** **batch file** *FILE* - **bpftool** **version** +**bpftool** **version** - *OBJECT* := { **map** | **prog** | **link** | **cgroup** | **perf** | **net** | **feature** | - **btf** | **gen** | **struct_ops** | **iter** } +*OBJECT* := { **map** | **prog** | **link** | **cgroup** | **perf** | **net** | **feature** | +**btf** | **gen** | **struct_ops** | **iter** } - *OPTIONS* := { { **-V** | **--version** } | |COMMON_OPTIONS| } +*OPTIONS* := { { **-V** | **--version** } | |COMMON_OPTIONS| } - *MAP-COMMANDS* := - { **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** | - **delete** | **pin** | **event_pipe** | **help** } +*MAP-COMMANDS* := +{ **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** | +**delete** | **pin** | **event_pipe** | **help** } - *PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin** | - **load** | **attach** | **detach** | **help** } +*PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin** | +**load** | **attach** | **detach** | **help** } - *LINK-COMMANDS* := { **show** | **list** | **pin** | **detach** | **help** } +*LINK-COMMANDS* := { **show** | **list** | **pin** | **detach** | **help** } - *CGROUP-COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** } +*CGROUP-COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** } - *PERF-COMMANDS* := { **show** | **list** | **help** } +*PERF-COMMANDS* := { **show** | **list** | **help** } - *NET-COMMANDS* := { **show** | **list** | **help** } +*NET-COMMANDS* := { **show** | **list** | **help** } - *FEATURE-COMMANDS* := { **probe** | **help** } +*FEATURE-COMMANDS* := { **probe** | **help** } - *BTF-COMMANDS* := { **show** | **list** | **dump** | **help** } +*BTF-COMMANDS* := { **show** | **list** | **dump** | **help** } - *GEN-COMMANDS* := { **object** | **skeleton** | **min_core_btf** | **help** } +*GEN-COMMANDS* := { **object** | **skeleton** | **min_core_btf** | **help** } - *STRUCT-OPS-COMMANDS* := { **show** | **list** | **dump** | **register** | **unregister** | **help** } +*STRUCT-OPS-COMMANDS* := { **show** | **list** | **dump** | **register** | **unregister** | **help** } - *ITER-COMMANDS* := { **pin** | **help** } +*ITER-COMMANDS* := { **pin** | **help** } DESCRIPTION =========== - *bpftool* allows for inspection and simple modification of BPF objects - on the system. +*bpftool* allows for inspection and simple modification of BPF objects on the +system. - Note that format of the output of all tools is not guaranteed to be - stable and should not be depended upon. +Note that format of the output of all tools is not guaranteed to be stable and +should not be depended upon. OPTIONS ======= - .. include:: common_options.rst +.. include:: common_options.rst - -m, --mapcompat - Allow loading maps with unknown map definitions. +-m, --mapcompat + Allow loading maps with unknown map definitions. - -n, --nomount - Do not automatically attempt to mount any virtual file system - (such as tracefs or BPF virtual file system) when necessary. +-n, --nomount + Do not automatically attempt to mount any virtual file system (such as + tracefs or BPF virtual file system) when necessary. diff --git a/tools/bpf/bpftool/Documentation/common_options.rst b/tools/bpf/bpftool/Documentation/common_options.rst index 30df7a707f02..9234b9dab768 100644 --- a/tools/bpf/bpftool/Documentation/common_options.rst +++ b/tools/bpf/bpftool/Documentation/common_options.rst @@ -1,25 +1,23 @@ .. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) -h, --help - Print short help message (similar to **bpftool help**). + Print short help message (similar to **bpftool help**). -V, --version - Print bpftool's version number (similar to **bpftool version**), the - number of the libbpf version in use, and optional features that were - included when bpftool was compiled. Optional features include linking - against LLVM or libbfd to provide the disassembler for JIT-ted - programs (**bpftool prog dump jited**) and usage of BPF skeletons - (some features like **bpftool prog profile** or showing pids - associated to BPF objects may rely on it). + Print bpftool's version number (similar to **bpftool version**), the number + of the libbpf version in use, and optional features that were included when + bpftool was compiled. Optional features include linking against LLVM or + libbfd to provide the disassembler for JIT-ted programs (**bpftool prog + dump jited**) and usage of BPF skeletons (some features like **bpftool prog + profile** or showing pids associated to BPF objects may rely on it). -j, --json - Generate JSON output. For commands that cannot produce JSON, this - option has no effect. + Generate JSON output. For commands that cannot produce JSON, this option + has no effect. -p, --pretty - Generate human-readable JSON output. Implies **-j**. + Generate human-readable JSON output. Implies **-j**. -d, --debug - Print all logs available, even debug-level information. This includes - logs from libbpf as well as from the verifier, when attempting to - load programs. + Print all logs available, even debug-level information. This includes logs + from libbpf as well as from the verifier, when attempting to load programs. From ea379b3ccc2e4dff9c3d616f0611b5312fe389ad Mon Sep 17 00:00:00 2001 From: Rameez Rehman Date: Sun, 31 Mar 2024 21:03:45 +0100 Subject: [PATCH 029/147] bpftool: Remove useless emphasis on command description in man pages As it turns out, the terms in definition lists in the rST file are already rendered with bold-ish formatting when generating the man pages; all double-star sequences we have in the commands for the command description are unnecessary, and can be removed to make the documentation easier to read. The rST files were automatically processed with: sed -i '/DESCRIPTION/,/OPTIONS/ { /^\*/ s/\*\*//g }' b*.rst Signed-off-by: Rameez Rehman Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240331200346.29118-3-qmo@kernel.org --- .../bpf/bpftool/Documentation/bpftool-btf.rst | 6 ++-- .../bpftool/Documentation/bpftool-cgroup.rst | 10 +++--- .../bpftool/Documentation/bpftool-feature.rst | 8 ++--- .../bpf/bpftool/Documentation/bpftool-gen.rst | 10 +++--- .../bpftool/Documentation/bpftool-iter.rst | 4 +-- .../bpftool/Documentation/bpftool-link.rst | 8 ++--- .../bpf/bpftool/Documentation/bpftool-map.rst | 32 +++++++++---------- .../bpf/bpftool/Documentation/bpftool-net.rst | 8 ++--- .../bpftool/Documentation/bpftool-perf.rst | 4 +-- .../bpftool/Documentation/bpftool-prog.rst | 22 ++++++------- .../Documentation/bpftool-struct_ops.rst | 10 +++--- 11 files changed, 61 insertions(+), 61 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst index 1be6b9aae053..f66781f20af2 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst @@ -34,7 +34,7 @@ BTF COMMANDS DESCRIPTION =========== -**bpftool btf { show | list }** [**id** *BTF_ID*] +bpftool btf { show | list } [id *BTF_ID*] Show information about loaded BTF objects. If a BTF ID is specified, show information only about given BTF object, otherwise list all BTF objects currently loaded on the system. @@ -43,7 +43,7 @@ DESCRIPTION that hold open file descriptors (FDs) against BTF objects. On such kernels bpftool will automatically emit this information as well. -**bpftool btf dump** *BTF_SRC* +bpftool btf dump *BTF_SRC* Dump BTF entries from a given *BTF_SRC*. When **id** is specified, BTF object with that ID will be loaded and all @@ -65,7 +65,7 @@ DESCRIPTION **format** option can be used to override default (raw) output format. Raw (**raw**) or C-syntax (**c**) output formats are supported. -**bpftool btf help** +bpftool btf help Print short help message. OPTIONS diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst index 1dbabe33e56a..1acf9f58fca2 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst @@ -49,7 +49,7 @@ CGROUP COMMANDS DESCRIPTION =========== -**bpftool cgroup { show | list }** *CGROUP* [**effective**] +bpftool cgroup { show | list } *CGROUP* [effective] List all programs attached to the cgroup *CGROUP*. Output will start with program ID followed by attach type, attach flags and @@ -59,7 +59,7 @@ DESCRIPTION for events within a cgroup. This includes inherited along with attached ones. -**bpftool cgroup tree** [*CGROUP_ROOT*] [**effective**] +bpftool cgroup tree [*CGROUP_ROOT*] [effective] Iterate over all cgroups in *CGROUP_ROOT* and list all attached programs. If *CGROUP_ROOT* is not specified, bpftool uses cgroup v2 mountpoint. @@ -71,7 +71,7 @@ DESCRIPTION for events within a cgroup. This includes inherited along with attached ones. -**bpftool cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] +bpftool cgroup attach *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] Attach program *PROG* to the cgroup *CGROUP* with attach type *ATTACH_TYPE* and optional *ATTACH_FLAGS*. @@ -126,10 +126,10 @@ DESCRIPTION **getsockname_unix** call to getsockname(2) for a unix socket (since 6.7); **sock_release** closing an userspace inet socket (since 5.9). -**bpftool cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* +bpftool cgroup detach *CGROUP* *ATTACH_TYPE* *PROG* Detach *PROG* from the cgroup *CGROUP* and attach type *ATTACH_TYPE*. -**bpftool prog help** +bpftool prog help Print short help message. OPTIONS diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst index 5c2433147ba4..c7f837898bc7 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst @@ -32,7 +32,7 @@ FEATURE COMMANDS DESCRIPTION =========== -**bpftool feature probe** [**kernel**] [**full**] [**macros** [**prefix** *PREFIX*]] +bpftool feature probe [kernel] [full] [macros [prefix *PREFIX*]] Probe the running kernel and dump a number of eBPF-related parameters, such as availability of the **bpf**\ () system call, JIT status, eBPF program types availability, eBPF helper functions availability, and more. @@ -59,14 +59,14 @@ DESCRIPTION bpftool is inadvertently run as non-root, for example. This keyword is unavailable if bpftool was compiled without libcap. -**bpftool feature probe dev** *NAME* [**full**] [**macros** [**prefix** *PREFIX*]] +bpftool feature probe dev *NAME* [full] [macros [prefix *PREFIX*]] Probe network device for supported eBPF features and dump results to the console. The keywords **full**, **macros** and **prefix** have the same role as when probing the kernel. -**bpftool feature list_builtins** *GROUP* +bpftool feature list_builtins *GROUP* List items known to bpftool. These can be BPF program types (**prog_types**), BPF map types (**map_types**), attach types (**attach_types**), link types (**link_types**), or BPF helper functions @@ -75,7 +75,7 @@ DESCRIPTION (for all object types) or from the BPF UAPI header (list of helpers). This can be used in scripts to iterate over BPF types or helpers. -**bpftool feature help** +bpftool feature help Print short help message. OPTIONS diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index 68104347d592..e9589c21e9c3 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -31,7 +31,7 @@ GEN COMMANDS DESCRIPTION =========== -**bpftool gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] +bpftool gen object *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] Statically link (combine) together one or more *INPUT_FILE*'s into a single resulting *OUTPUT_FILE*. All the files involved are BPF ELF object files. @@ -48,7 +48,7 @@ DESCRIPTION command) or passed directly into **libbpf** (using **bpf_object__open()** family of APIs). -**bpftool gen skeleton** *FILE* +bpftool gen skeleton *FILE* Generate BPF skeleton C header file for a given *FILE*. BPF skeleton is an alternative interface to existing libbpf APIs for @@ -132,7 +132,7 @@ DESCRIPTION used to fetch and update (non-read-only) data from userspace, with same simplicity as for BPF side. -**bpftool gen subskeleton** *FILE* +bpftool gen subskeleton *FILE* Generate BPF subskeleton C header file for a given *FILE*. Subskeletons are similar to skeletons, except they do not own the @@ -154,7 +154,7 @@ DESCRIPTION Frees the storage for the subskeleton but *does not* unload any BPF programs or maps. -**bpftool** **gen min_core_btf** *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...] +bpftool gen min_core_btf *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...] Generate a minimum BTF file as *OUTPUT*, derived from a given *INPUT* BTF file, containing all needed BTF types so one, or more, given eBPF objects CO-RE relocations may be satisfied. @@ -174,7 +174,7 @@ DESCRIPTION Check examples bellow for more information how to use it. -**bpftool gen help** +bpftool gen help Print short help message. OPTIONS diff --git a/tools/bpf/bpftool/Documentation/bpftool-iter.rst b/tools/bpf/bpftool/Documentation/bpftool-iter.rst index af17cc0201d4..7bcb4e993d7d 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-iter.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-iter.rst @@ -31,7 +31,7 @@ ITER COMMANDS DESCRIPTION =========== -**bpftool iter pin** *OBJ* *PATH* [**map** *MAP*] +bpftool iter pin *OBJ* *PATH* [map *MAP*] A bpf iterator combines a kernel iterating of particular kernel data (e.g., tasks, bpf_maps, etc.) and a bpf program called for each kernel data object (e.g., one task, one bpf_map, etc.). User space can *read* kernel iterator @@ -48,7 +48,7 @@ DESCRIPTION User can then *cat PATH* to see the bpf iterator output. -**bpftool iter help** +bpftool iter help Print short help message. OPTIONS diff --git a/tools/bpf/bpftool/Documentation/bpftool-link.rst b/tools/bpf/bpftool/Documentation/bpftool-link.rst index 6d39ad890fea..6f09d4405ed8 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-link.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-link.rst @@ -33,7 +33,7 @@ LINK COMMANDS DESCRIPTION =========== -**bpftool link { show | list }** [*LINK*] +bpftool link { show | list } [*LINK*] Show information about active links. If *LINK* is specified show information only about given link, otherwise list all links currently active on the system. @@ -45,19 +45,19 @@ DESCRIPTION that hold open file descriptors (FDs) against BPF links. On such kernels bpftool will automatically emit this information as well. -**bpftool link pin** *LINK* *FILE* +bpftool link pin *LINK* *FILE* Pin link *LINK* as *FILE*. Note: *FILE* must be located in *bpffs* mount. It must not contain a dot character ('.'), which is reserved for future extensions of *bpffs*. -**bpftool link detach** *LINK* +bpftool link detach *LINK* Force-detach link *LINK*. BPF link and its underlying BPF program will stay valid, but they will be detached from the respective BPF hook and BPF link will transition into a defunct state until last open file descriptor for that link is closed. -**bpftool link help** +bpftool link help Print short help message. OPTIONS diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 864257169942..252e4c538edb 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -59,7 +59,7 @@ MAP COMMANDS DESCRIPTION =========== -**bpftool map { show | list }** [*MAP*] +bpftool map { show | list } [*MAP*] Show information about loaded maps. If *MAP* is specified show information only about given maps, otherwise list all maps currently loaded on the system. In case of **name**, *MAP* may match several maps which will all @@ -72,7 +72,7 @@ DESCRIPTION that hold open file descriptors (FDs) against BPF maps. On such kernels bpftool will automatically emit this information as well. -**bpftool map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**inner_map** *MAP*] [**offload_dev** *NAME*] +bpftool map create *FILE* type *TYPE* key *KEY_SIZE* value *VALUE_SIZE* entries *MAX_ENTRIES* name *NAME* [flags *FLAGS*] [inner_map *MAP*] [offload_dev *NAME*] Create a new map with given parameters and pin it to *bpffs* as *FILE*. *FLAGS* should be an integer which is the combination of desired flags, @@ -86,11 +86,11 @@ DESCRIPTION Keyword **offload_dev** expects a network interface name, and is used to request hardware offload for the map. -**bpftool map dump** *MAP* +bpftool map dump *MAP* Dump all entries in a given *MAP*. In case of **name**, *MAP* may match several maps which will all be dumped. -**bpftool map update** *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*] +bpftool map update *MAP* [key *DATA*] [value *VALUE*] [*UPDATE_FLAGS*] Update map entry for a given *KEY*. *UPDATE_FLAGS* can be one of: **any** update existing entry or add if @@ -103,22 +103,22 @@ DESCRIPTION unless a "0x" prefix (for hexadecimal) or a "0" prefix (for octal) is provided. -**bpftool map lookup** *MAP* [**key** *DATA*] +bpftool map lookup *MAP* [key *DATA*] Lookup **key** in the map. -**bpftool map getnext** *MAP* [**key** *DATA*] +bpftool map getnext *MAP* [key *DATA*] Get next key. If *key* is not specified, get first key. -**bpftool map delete** *MAP* **key** *DATA* +bpftool map delete *MAP* key *DATA* Remove entry from the map. -**bpftool map pin** *MAP* *FILE* +bpftool map pin *MAP* *FILE* Pin map *MAP* as *FILE*. Note: *FILE* must be located in *bpffs* mount. It must not contain a dot character ('.'), which is reserved for future extensions of *bpffs*. -**bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] +bpftool map event_pipe *MAP* [cpu *N* index *M*] Read events from a **BPF_MAP_TYPE_PERF_EVENT_ARRAY** map. Install perf rings into a perf event array map and dump output of any @@ -133,29 +133,29 @@ DESCRIPTION existing ring. Any other application will stop receiving events if it installed its rings earlier. -**bpftool map peek** *MAP* +bpftool map peek *MAP* Peek next value in the queue or stack. -**bpftool map push** *MAP* **value** *VALUE* +bpftool map push *MAP* value *VALUE* Push *VALUE* onto the stack. -**bpftool map pop** *MAP* +bpftool map pop *MAP* Pop and print value from the stack. -**bpftool map enqueue** *MAP* **value** *VALUE* +bpftool map enqueue *MAP* value *VALUE* Enqueue *VALUE* into the queue. -**bpftool map dequeue** *MAP* +bpftool map dequeue *MAP* Dequeue and print value from the queue. -**bpftool map freeze** *MAP* +bpftool map freeze *MAP* Freeze the map as read-only from user space. Entries from a frozen map can not longer be updated or deleted with the **bpf**\ () system call. This operation is not reversible, and the map remains immutable from user space until its destruction. However, read and write permissions for BPF programs to the map remain unchanged. -**bpftool map help** +bpftool map help Print short help message. OPTIONS diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index 43d2c858fdf8..f8e65869f8b4 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -33,7 +33,7 @@ NET COMMANDS DESCRIPTION =========== -**bpftool net { show | list }** [ **dev** *NAME* ] +bpftool net { show | list } [ dev *NAME* ] List bpf program attachments in the kernel networking subsystem. Currently, device driver xdp attachments, tcx, netkit and old-style tc @@ -58,7 +58,7 @@ DESCRIPTION then all bpf programs attached to non clsact qdiscs, and finally all bpf programs attached to root and clsact qdisc. -**bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ] +bpftool net attach *ATTACH_TYPE* *PROG* dev *NAME* [ overwrite ] Attach bpf program *PROG* to network interface *NAME* with type specified by *ATTACH_TYPE*. Previously attached bpf program can be replaced by the command used with **overwrite** option. Currently, only XDP-related modes @@ -70,13 +70,13 @@ DESCRIPTION **xdpdrv** - Native XDP. runs earliest point in driver's receive path; **xdpoffload** - Offload XDP. runs directly on NIC on each packet reception; -**bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME* +bpftool net detach *ATTACH_TYPE* dev *NAME* Detach bpf program attached to network interface *NAME* with type specified by *ATTACH_TYPE*. To detach bpf program, same *ATTACH_TYPE* previously used for attach must be specified. Currently, only XDP-related modes are supported for *ATTACH_TYPE*. -**bpftool net help** +bpftool net help Print short help message. OPTIONS diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst index 841789e30acd..8c1ae55be596 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-perf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst @@ -29,7 +29,7 @@ PERF COMMANDS DESCRIPTION =========== -**bpftool perf { show | list }** +bpftool perf { show | list } List all raw_tracepoint, tracepoint, kprobe attachment in the system. Output will start with process id and file descriptor in that process, @@ -39,7 +39,7 @@ DESCRIPTION kernel virtual address. The attachment point for u[ret]probe is the file name and the file offset. -**bpftool perf help** +bpftool perf help Print short help message. OPTIONS diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index cc456d357db3..8e730cfb2589 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -67,7 +67,7 @@ PROG COMMANDS DESCRIPTION =========== -**bpftool prog { show | list }** [*PROG*] +bpftool prog { show | list } [*PROG*] Show information about loaded programs. If *PROG* is specified show information only about given programs, otherwise list all programs currently loaded on the system. In case of **tag** or **name**, *PROG* may @@ -87,7 +87,7 @@ DESCRIPTION that hold open file descriptors (FDs) against BPF programs. On such kernels bpftool will automatically emit this information as well. -**bpftool prog dump xlated** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] [**visual**] }] +bpftool prog dump xlated *PROG* [{ file *FILE* | [opcodes] [linum] [visual] }] Dump eBPF instructions of the programs from the kernel. By default, eBPF will be disassembled and printed to standard output in human-readable format. In this case, **opcodes** controls if raw opcodes should be printed @@ -108,7 +108,7 @@ DESCRIPTION displayed. If **linum** is specified, the filename, line number and line column will also be displayed. -**bpftool prog dump jited** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] }] +bpftool prog dump jited *PROG* [{ file *FILE* | [opcodes] [linum] }] Dump jited image (host machine code) of the program. If *FILE* is specified image will be written to a file, otherwise it will @@ -121,13 +121,13 @@ DESCRIPTION **linum** is specified, the filename, line number and line column will also be displayed. -**bpftool prog pin** *PROG* *FILE* +bpftool prog pin *PROG* *FILE* Pin program *PROG* as *FILE*. Note: *FILE* must be located in *bpffs* mount. It must not contain a dot character ('.'), which is reserved for future extensions of *bpffs*. -**bpftool prog { load | loadall }** *OBJ* *PATH* [**type** *TYPE*] [**map** { **idx** *IDX* | **name** *NAME* } *MAP*] [{ **offload_dev** | **xdpmeta_dev** } *NAME*] [**pinmaps** *MAP_DIR*] [**autoattach**] +bpftool prog { load | loadall } *OBJ* *PATH* [type *TYPE*] [map { idx *IDX* | name *NAME* } *MAP*] [{ offload_dev | xdpmeta_dev } *NAME*] [pinmaps *MAP_DIR*] [autoattach] Load bpf program(s) from binary *OBJ* and pin as *PATH*. **bpftool prog load** pins only the first program from the *OBJ* as *PATH*. **bpftool prog loadall** pins all programs from the *OBJ* under *PATH* directory. **type** @@ -156,24 +156,24 @@ DESCRIPTION Note: *PATH* must be located in *bpffs* mount. It must not contain a dot character ('.'), which is reserved for future extensions of *bpffs*. -**bpftool prog attach** *PROG* *ATTACH_TYPE* [*MAP*] +bpftool prog attach *PROG* *ATTACH_TYPE* [*MAP*] Attach bpf program *PROG* (with type specified by *ATTACH_TYPE*). Most *ATTACH_TYPEs* require a *MAP* parameter, with the exception of *flow_dissector* which is attached to current networking name space. -**bpftool prog detach** *PROG* *ATTACH_TYPE* [*MAP*] +bpftool prog detach *PROG* *ATTACH_TYPE* [*MAP*] Detach bpf program *PROG* (with type specified by *ATTACH_TYPE*). Most *ATTACH_TYPEs* require a *MAP* parameter, with the exception of *flow_dissector* which is detached from the current networking name space. -**bpftool prog tracelog** +bpftool prog tracelog Dump the trace pipe of the system to the console (stdout). Hit to stop printing. BPF programs can write to this trace pipe at runtime with the **bpf_trace_printk**\ () helper. This should be used only for debugging purposes. For streaming data from BPF programs to user space, one can use perf events (see also **bpftool-map**\ (8)). -**bpftool prog run** *PROG* **data_in** *FILE* [**data_out** *FILE* [**data_size_out** *L*]] [**ctx_in** *FILE* [**ctx_out** *FILE* [**ctx_size_out** *M*]]] [**repeat** *N*] +bpftool prog run *PROG* data_in *FILE* [data_out *FILE* [data_size_out *L*]] [ctx_in *FILE* [ctx_out *FILE* [ctx_size_out *M*]]] [repeat *N*] Run BPF program *PROG* in the kernel testing infrastructure for BPF, meaning that the program works on the data and context provided by the user, and not on actual packets or monitored functions etc. Return value @@ -201,12 +201,12 @@ DESCRIPTION them can take the **ctx_in**/**ctx_out** arguments. bpftool does not perform checks on program types. -**bpftool prog profile** *PROG* [**duration** *DURATION*] *METRICs* +bpftool prog profile *PROG* [duration *DURATION*] *METRICs* Profile *METRICs* for bpf program *PROG* for *DURATION* seconds or until user hits . *DURATION* is optional. If *DURATION* is not specified, the profiling will run up to **UINT_MAX** seconds. -**bpftool prog help** +bpftool prog help Print short help message. OPTIONS diff --git a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst index 61ec1286446c..e871b9539ac7 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst @@ -36,7 +36,7 @@ STRUCT_OPS COMMANDS DESCRIPTION =========== -**bpftool struct_ops { show | list }** [*STRUCT_OPS_MAP*] +bpftool struct_ops { show | list } [*STRUCT_OPS_MAP*] Show brief information about the struct_ops in the system. If *STRUCT_OPS_MAP* is specified, it shows information only for the given struct_ops. Otherwise, it lists all struct_ops currently existing in the @@ -45,13 +45,13 @@ DESCRIPTION Output will start with struct_ops map ID, followed by its map name and its struct_ops's kernel type. -**bpftool struct_ops dump** [*STRUCT_OPS_MAP*] +bpftool struct_ops dump [*STRUCT_OPS_MAP*] Dump details information about the struct_ops in the system. If *STRUCT_OPS_MAP* is specified, it dumps information only for the given struct_ops. Otherwise, it dumps all struct_ops currently existing in the system. -**bpftool struct_ops register** *OBJ* [*LINK_DIR*] +bpftool struct_ops register *OBJ* [*LINK_DIR*] Register bpf struct_ops from *OBJ*. All struct_ops under the ELF section ".struct_ops" and ".struct_ops.link" will be registered to its kernel subsystem. For each struct_ops in the ".struct_ops.link" section, a link @@ -59,10 +59,10 @@ DESCRIPTION these links will be pinned with the same name as their corresponding map name. -**bpftool struct_ops unregister** *STRUCT_OPS_MAP* +bpftool struct_ops unregister *STRUCT_OPS_MAP* Unregister the *STRUCT_OPS_MAP* from the kernel subsystem. -**bpftool struct_ops help** +bpftool struct_ops help Print short help message. OPTIONS From a70f5d840a56a82c576385ca79ee55c1598f1bc3 Mon Sep 17 00:00:00 2001 From: Rameez Rehman Date: Sun, 31 Mar 2024 21:03:46 +0100 Subject: [PATCH 030/147] bpftool: Clean-up typos, punctuation, list formatting in docs Improve the formatting of the attach flags for cgroup programs in the relevant man page, and fix typos ("can be on of", "an userspace inet socket") when introducing that list. Also fix a couple of other trivial issues in docs. [ Quentin: Fixed trival issues in bpftool-gen.rst and bpftool-iter.rst ] Signed-off-by: Rameez Rehman Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240331200346.29118-4-qmo@kernel.org --- .../bpftool/Documentation/bpftool-cgroup.rst | 65 +++++++++---------- .../bpf/bpftool/Documentation/bpftool-gen.rst | 8 +-- .../bpftool/Documentation/bpftool-iter.rst | 2 +- 3 files changed, 35 insertions(+), 40 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst index 1acf9f58fca2..b2610d169e60 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst @@ -90,41 +90,36 @@ bpftool cgroup attach *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] Non-default *ATTACH_FLAGS* are supported by kernel version 4.14 and later. - *ATTACH_TYPE* can be on of: - **ingress** ingress path of the inet socket (since 4.10); - **egress** egress path of the inet socket (since 4.10); - **sock_create** opening of an inet socket (since 4.10); - **sock_ops** various socket operations (since 4.12); - **device** device access (since 4.15); - **bind4** call to bind(2) for an inet4 socket (since 4.17); - **bind6** call to bind(2) for an inet6 socket (since 4.17); - **post_bind4** return from bind(2) for an inet4 socket (since 4.17); - **post_bind6** return from bind(2) for an inet6 socket (since 4.17); - **connect4** call to connect(2) for an inet4 socket (since 4.17); - **connect6** call to connect(2) for an inet6 socket (since 4.17); - **connect_unix** call to connect(2) for a unix socket (since 6.7); - **sendmsg4** call to sendto(2), sendmsg(2), sendmmsg(2) for an unconnected - udp4 socket (since 4.18); - **sendmsg6** call to sendto(2), sendmsg(2), sendmmsg(2) for an unconnected - udp6 socket (since 4.18); - **sendmsg_unix** call to sendto(2), sendmsg(2), sendmmsg(2) for an - unconnected unix socket (since 6.7); - **recvmsg4** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an - unconnected udp4 socket (since 5.2); - **recvmsg6** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an - unconnected udp6 socket (since 5.2); - **recvmsg_unix** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an - unconnected unix socket (since 6.7); - **sysctl** sysctl access (since 5.2); - **getsockopt** call to getsockopt (since 5.3); - **setsockopt** call to setsockopt (since 5.3); - **getpeername4** call to getpeername(2) for an inet4 socket (since 5.8); - **getpeername6** call to getpeername(2) for an inet6 socket (since 5.8); - **getpeername_unix** call to getpeername(2) for a unix socket (since 6.7); - **getsockname4** call to getsockname(2) for an inet4 socket (since 5.8); - **getsockname6** call to getsockname(2) for an inet6 socket (since 5.8). - **getsockname_unix** call to getsockname(2) for a unix socket (since 6.7); - **sock_release** closing an userspace inet socket (since 5.9). + *ATTACH_TYPE* can be one of: + + - **ingress** ingress path of the inet socket (since 4.10) + - **egress** egress path of the inet socket (since 4.10) + - **sock_create** opening of an inet socket (since 4.10) + - **sock_ops** various socket operations (since 4.12) + - **device** device access (since 4.15) + - **bind4** call to bind(2) for an inet4 socket (since 4.17) + - **bind6** call to bind(2) for an inet6 socket (since 4.17) + - **post_bind4** return from bind(2) for an inet4 socket (since 4.17) + - **post_bind6** return from bind(2) for an inet6 socket (since 4.17) + - **connect4** call to connect(2) for an inet4 socket (since 4.17) + - **connect6** call to connect(2) for an inet6 socket (since 4.17) + - **connect_unix** call to connect(2) for a unix socket (since 6.7) + - **sendmsg4** call to sendto(2), sendmsg(2), sendmmsg(2) for an unconnected udp4 socket (since 4.18) + - **sendmsg6** call to sendto(2), sendmsg(2), sendmmsg(2) for an unconnected udp6 socket (since 4.18) + - **sendmsg_unix** call to sendto(2), sendmsg(2), sendmmsg(2) for an unconnected unix socket (since 6.7) + - **recvmsg4** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an unconnected udp4 socket (since 5.2) + - **recvmsg6** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an unconnected udp6 socket (since 5.2) + - **recvmsg_unix** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an unconnected unix socket (since 6.7) + - **sysctl** sysctl access (since 5.2) + - **getsockopt** call to getsockopt (since 5.3) + - **setsockopt** call to setsockopt (since 5.3) + - **getpeername4** call to getpeername(2) for an inet4 socket (since 5.8) + - **getpeername6** call to getpeername(2) for an inet6 socket (since 5.8) + - **getpeername_unix** call to getpeername(2) for a unix socket (since 6.7) + - **getsockname4** call to getsockname(2) for an inet4 socket (since 5.8) + - **getsockname6** call to getsockname(2) for an inet6 socket (since 5.8) + - **getsockname_unix** call to getsockname(2) for a unix socket (since 6.7) + - **sock_release** closing a userspace inet socket (since 5.9) bpftool cgroup detach *CGROUP* *ATTACH_TYPE* *PROG* Detach *PROG* from the cgroup *CGROUP* and attach type *ATTACH_TYPE*. diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index e9589c21e9c3..c768e6d4ae09 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -110,7 +110,7 @@ bpftool gen skeleton *FILE* - **example__open_and_load** combines **example__open** and **example__load** invocations in one commonly used operation. - - **example__attach** and **example__detach** + - **example__attach** and **example__detach**. This pair of functions allow to attach and detach, correspondingly, already loaded BPF object. Only BPF programs of types supported by libbpf for auto-attachment will be auto-attached and their corresponding BPF @@ -119,7 +119,7 @@ bpftool gen skeleton *FILE* **example__detach** will detach both links created automatically, as well as those populated by user manually. - - **example__destroy** + - **example__destroy**. Detach and unload BPF programs, free up all the resources used by skeleton and BPF object. @@ -146,11 +146,11 @@ bpftool gen subskeleton *FILE* Consequently, there are only two functions defined for subskeletons: - - **example__open(bpf_object\*)** + - **example__open(bpf_object\*)**. Instantiates a subskeleton from an already opened (but not necessarily loaded) **bpf_object**. - - **example__destroy()** + - **example__destroy()**. Frees the storage for the subskeleton but *does not* unload any BPF programs or maps. diff --git a/tools/bpf/bpftool/Documentation/bpftool-iter.rst b/tools/bpf/bpftool/Documentation/bpftool-iter.rst index 7bcb4e993d7d..2e5d81c906dc 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-iter.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-iter.rst @@ -21,7 +21,7 @@ SYNOPSIS *COMMANDS* := { **pin** | **help** } ITER COMMANDS -=================== +============= | **bpftool** **iter pin** *OBJ* *PATH* [**map** *MAP*] | **bpftool** **iter help** From ca4ddc26f8acaa9cb451fcb20f7ab0f02e4970cb Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 29 Mar 2024 10:28:46 -0500 Subject: [PATCH 031/147] bpf: Fix typo in uapi doc comments In a few places in the bpf uapi headers, EOPNOTSUPP is missing a "P" in the doc comments. This adds the missing "P". Signed-off-by: David Lechner Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240329152900.398260-2-dlechner@baylibre.com --- include/uapi/linux/bpf.h | 4 ++-- tools/include/uapi/linux/bpf.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 96d57e483133..79c548276b6b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5026,7 +5026,7 @@ union bpf_attr { * bytes will be copied to *dst* * Return * The **hash_algo** is returned on success, - * **-EOPNOTSUP** if IMA is disabled or **-EINVAL** if + * **-EOPNOTSUPP** if IMA is disabled or **-EINVAL** if * invalid arguments are passed. * * struct socket *bpf_sock_from_file(struct file *file) @@ -5512,7 +5512,7 @@ union bpf_attr { * bytes will be copied to *dst* * Return * The **hash_algo** is returned on success, - * **-EOPNOTSUP** if the hash calculation failed or **-EINVAL** if + * **-EOPNOTSUPP** if the hash calculation failed or **-EINVAL** if * invalid arguments are passed. * * void *bpf_kptr_xchg(void *map_value, void *ptr) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 96d57e483133..79c548276b6b 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5026,7 +5026,7 @@ union bpf_attr { * bytes will be copied to *dst* * Return * The **hash_algo** is returned on success, - * **-EOPNOTSUP** if IMA is disabled or **-EINVAL** if + * **-EOPNOTSUPP** if IMA is disabled or **-EINVAL** if * invalid arguments are passed. * * struct socket *bpf_sock_from_file(struct file *file) @@ -5512,7 +5512,7 @@ union bpf_attr { * bytes will be copied to *dst* * Return * The **hash_algo** is returned on success, - * **-EOPNOTSUP** if the hash calculation failed or **-EINVAL** if + * **-EOPNOTSUPP** if the hash calculation failed or **-EINVAL** if * invalid arguments are passed. * * void *bpf_kptr_xchg(void *map_value, void *ptr) From 9dc182c58b5f5d4ac125ac85ad553f7142aa08d4 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Tue, 2 Apr 2024 07:33:47 +0000 Subject: [PATCH 032/147] bpf: Add a verbose message if map limit is reached When more than 64 maps are used by a program and its subprograms the verifier returns -E2BIG. Add a verbose message which highlights the source of the error and also print the actual limit. Signed-off-by: Anton Protopopov Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20240402073347.195920-1-aspsk@isovalent.com --- kernel/bpf/verifier.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index edb650667f44..fcb62300f407 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18348,6 +18348,8 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) } if (env->used_map_cnt >= MAX_USED_MAPS) { + verbose(env, "The total number of maps per program has reached the limit of %u\n", + MAX_USED_MAPS); fdput(f); return -E2BIG; } From 965c6167c93f3fac53e25807f83c07e87b3c085a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 1 Apr 2024 19:54:46 -0700 Subject: [PATCH 033/147] selftests/bpf: Using llvm may_goto inline asm for cond_break macro Currently, cond_break macro uses bytes to encode the may_goto insn. Patch [1] in llvm implemented may_goto insn in BPF backend. Replace byte-level encoding with llvm inline asm for better usability. Using llvm may_goto insn is controlled by macro __BPF_FEATURE_MAY_GOTO. [1] https://github.com/llvm/llvm-project/commit/0e0bfacff71859d1f9212205f8f873d47029d3fb Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20240402025446.3215182-1-yonghong.song@linux.dev --- tools/testing/selftests/bpf/bpf_experimental.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index a5b9df38c162..3329ea080865 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -326,6 +326,16 @@ l_true: \ }) #endif +#ifdef __BPF_FEATURE_MAY_GOTO +#define cond_break \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("may_goto %l[l_break]" \ + :::: l_break); \ + goto l_continue; \ + l_break: break; \ + l_continue:; \ + }) +#else #define cond_break \ ({ __label__ l_break, l_continue; \ asm volatile goto("1:.byte 0xe5; \ @@ -337,6 +347,7 @@ l_true: \ l_break: break; \ l_continue:; \ }) +#endif #ifndef bpf_nop_mov #define bpf_nop_mov(var) \ From 2a24e2485722b0e12e17a2bd473bd15c9e420bdb Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 1 Apr 2024 10:07:13 -0700 Subject: [PATCH 034/147] bpftool: Use __typeof__() instead of typeof() in BPF skeleton When generated BPF skeleton header is included in C++ code base, some compiler setups will emit warning about using language extensions due to typeof() usage, resulting in something like: error: extension used [-Werror,-Wlanguage-extension-token] obj->struct_ops.empty_tcp_ca = (typeof(obj->struct_ops.empty_tcp_ca)) ^ It looks like __typeof__() is a preferred way to do typeof() with better C++ compatibility behavior, so switch to that. With __typeof__() we get no such warning. Fixes: c2a0257c1edf ("bpftool: Cast pointers for shadow types explicitly.") Fixes: 00389c58ffe9 ("bpftool: Add support for subskeletons") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Kui-Feng Lee Acked-by: Quentin Monnet Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20240401170713.2081368-1-andrii@kernel.org --- tools/bpf/bpftool/gen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 786268f1a483..b3979ddc0189 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -386,7 +386,7 @@ static int codegen_subskel_datasecs(struct bpf_object *obj, const char *obj_name */ needs_typeof = btf_is_array(var) || btf_is_ptr_to_func_proto(btf, var); if (needs_typeof) - printf("typeof("); + printf("__typeof__("); err = btf_dump__emit_type_decl(d, var_type_id, &opts); if (err) @@ -1131,7 +1131,7 @@ static void gen_st_ops_shadow_init(struct btf *btf, struct bpf_object *obj) continue; codegen("\ \n\ - obj->struct_ops.%1$s = (typeof(obj->struct_ops.%1$s))\n\ + obj->struct_ops.%1$s = (__typeof__(obj->struct_ops.%1$s))\n\ bpf_map__initial_value(obj->maps.%1$s, NULL);\n\ \n\ ", ident); From c186ed12a8ec498532d13de43094bdec9ac6f121 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Tue, 2 Apr 2024 07:30:29 +0000 Subject: [PATCH 035/147] selftests/bpf: Skip test when perf_event_open returns EOPNOTSUPP When testing send_signal and stacktrace_build_id_nmi using the riscv sbi pmu driver without the sscofpmf extension or the riscv legacy pmu driver, then failures as follows are encountered: test_send_signal_common:FAIL:perf_event_open unexpected perf_event_open: actual -1 < expected 0 #272/3 send_signal/send_signal_nmi:FAIL test_stacktrace_build_id_nmi:FAIL:perf_event_open err -1 errno 95 #304 stacktrace_build_id_nmi:FAIL The reason is that the above pmu driver or hardware does not support sampling events, that is, PERF_PMU_CAP_NO_INTERRUPT is set to pmu capabilities, and then perf_event_open returns EOPNOTSUPP. Since PERF_PMU_CAP_NO_INTERRUPT is not only set in the riscv-related pmu driver, it is better to skip testing when this capability is set. Signed-off-by: Pu Lehui Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240402073029.1299085-1-pulehui@huaweicloud.com --- tools/testing/selftests/bpf/prog_tests/send_signal.c | 2 +- .../testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c index b15b343ebb6b..920aee41bd58 100644 --- a/tools/testing/selftests/bpf/prog_tests/send_signal.c +++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c @@ -179,7 +179,7 @@ static void test_send_signal_nmi(bool signal_thread) pmu_fd = syscall(__NR_perf_event_open, &attr, 0 /* pid */, -1 /* cpu */, -1 /* group_fd */, 0 /* flags */); if (pmu_fd == -1) { - if (errno == ENOENT) { + if (errno == ENOENT || errno == EOPNOTSUPP) { printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n", __func__); test__skip(); diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c index 5db9eec24b5b..0832fd787457 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c @@ -35,7 +35,7 @@ retry: pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, 0 /* cpu 0 */, -1 /* group id */, 0 /* flags */); - if (pmu_fd < 0 && errno == ENOENT) { + if (pmu_fd < 0 && (errno == ENOENT || errno == EOPNOTSUPP)) { printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n", __func__); test__skip(); goto cleanup; From ce09cbdd988887662546a1175bcfdfc6c8fdd150 Mon Sep 17 00:00:00 2001 From: Jose Fernandez Date: Mon, 1 Apr 2024 21:40:10 -0600 Subject: [PATCH 036/147] bpf: Improve program stats run-time calculation This patch improves the run-time calculation for program stats by capturing the duration as soon as possible after the program returns. Previously, the duration included u64_stats_t operations. While the instrumentation overhead is part of the total time spent when stats are enabled, distinguishing between the program's native execution time and the time spent due to instrumentation is crucial for accurate performance analysis. By making this change, the patch facilitates more precise optimization of BPF programs, enabling users to understand their performance in environments without stats enabled. I used a virtualized environment to measure the run-time over one minute for a basic raw_tracepoint/sys_enter program, which just increments a local counter. Although the virtualization introduced some performance degradation that could affect the results, I observed approximately a 16% decrease in average run-time reported by stats with this change (310 -> 260 nsec). Signed-off-by: Jose Fernandez Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240402034010.25060-1-josef@netflix.com --- include/linux/filter.h | 6 ++++-- kernel/bpf/trampoline.c | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 44934b968b57..531b36090122 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -654,14 +654,16 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog, cant_migrate(); if (static_branch_unlikely(&bpf_stats_enabled_key)) { struct bpf_prog_stats *stats; - u64 start = sched_clock(); + u64 duration, start = sched_clock(); unsigned long flags; ret = dfunc(ctx, prog->insnsi, prog->bpf_func); + + duration = sched_clock() - start; stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->cnt); - u64_stats_add(&stats->nsecs, sched_clock() - start); + u64_stats_add(&stats->nsecs, duration); u64_stats_update_end_irqrestore(&stats->syncp, flags); } else { ret = dfunc(ctx, prog->insnsi, prog->bpf_func); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index cc50607f8d8c..26ae703d3c3b 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -885,12 +885,13 @@ static void notrace update_prog_stats(struct bpf_prog *prog, * Hence check that 'start' is valid. */ start > NO_START_TIME) { + u64 duration = sched_clock() - start; unsigned long flags; stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->cnt); - u64_stats_add(&stats->nsecs, sched_clock() - start); + u64_stats_add(&stats->nsecs, duration); u64_stats_update_end_irqrestore(&stats->syncp, flags); } } From 15ea39ad7e83af16480bbf20144fcc6edf4757f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobias=20B=C3=B6hm?= Date: Tue, 2 Apr 2024 17:10:52 +0200 Subject: [PATCH 037/147] libbpf: Use local bpf_helpers.h include MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 20d59ee55172fdf6 ("libbpf: add bpf_core_cast() macro") added a bpf_helpers include in bpf_core_read.h as a system include. Usually, the includes are local, though, like in bpf_tracing.h. This commit adjusts the include to be local as well. Signed-off-by: Tobias Böhm Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/q5d5bgc6vty2fmaazd5e73efd6f5bhiru2le6fxn43vkw45bls@fhlw2s5ootdb --- tools/lib/bpf/bpf_core_read.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h index 1ce738d91685..b5c7ce5c243a 100644 --- a/tools/lib/bpf/bpf_core_read.h +++ b/tools/lib/bpf/bpf_core_read.h @@ -2,7 +2,7 @@ #ifndef __BPF_CORE_READ_H__ #define __BPF_CORE_READ_H__ -#include +#include "bpf_helpers.h" /* * enum bpf_field_info_kind is passed as a second argument into From c07b4bcd5163c2929d8bfc55140325fc15afb4eb Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 2 Apr 2024 18:50:21 +0800 Subject: [PATCH 038/147] selftests/bpf: Add pid limit for mptcpify prog In order to prevent mptcpify prog from affecting the running results of other BPF tests, a pid limit was added to restrict it from only modifying its own program. Suggested-by: Martin KaFai Lau Signed-off-by: Geliang Tang Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/8987e2938e15e8ec390b85b5dcbee704751359dc.1712054986.git.tanggeliang@kylinos.cn --- tools/testing/selftests/bpf/prog_tests/mptcp.c | 2 ++ tools/testing/selftests/bpf/progs/mptcpify.c | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/mptcp.c b/tools/testing/selftests/bpf/prog_tests/mptcp.c index 8f8d792307c1..4e0f69295872 100644 --- a/tools/testing/selftests/bpf/prog_tests/mptcp.c +++ b/tools/testing/selftests/bpf/prog_tests/mptcp.c @@ -273,6 +273,8 @@ static int run_mptcpify(int cgroup_fd) if (!ASSERT_OK_PTR(mptcpify_skel, "skel_open_load")) return libbpf_get_error(mptcpify_skel); + mptcpify_skel->bss->pid = getpid(); + err = mptcpify__attach(mptcpify_skel); if (!ASSERT_OK(err, "skel_attach")) goto out; diff --git a/tools/testing/selftests/bpf/progs/mptcpify.c b/tools/testing/selftests/bpf/progs/mptcpify.c index 53301ae8a8f7..cbdc730c3a47 100644 --- a/tools/testing/selftests/bpf/progs/mptcpify.c +++ b/tools/testing/selftests/bpf/progs/mptcpify.c @@ -6,10 +6,14 @@ #include "bpf_tracing_net.h" char _license[] SEC("license") = "GPL"; +int pid; SEC("fmod_ret/update_socket_protocol") int BPF_PROG(mptcpify, int family, int type, int protocol) { + if (bpf_get_current_pid_tgid() >> 32 != pid) + return protocol; + if ((family == AF_INET || family == AF_INET6) && type == SOCK_STREAM && (!protocol || protocol == IPPROTO_TCP)) { From 339af577ec05c8fc0b96f23579614ae853d913ab Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 25 Mar 2024 15:07:15 +0000 Subject: [PATCH 039/147] bpf: Add arm64 JIT support for PROBE_MEM32 pseudo instructions. Add support for [LDX | STX | ST], PROBE_MEM32, [B | H | W | DW] instructions. They are similar to PROBE_MEM instructions with the following differences: - PROBE_MEM32 supports store. - PROBE_MEM32 relies on the verifier to clear upper 32-bit of the src/dst register - PROBE_MEM32 adds 64-bit kern_vm_start address (which is stored in R28 in the prologue). Due to bpf_arena constructions such R28 + reg + off16 access is guaranteed to be within arena virtual range, so no address check at run-time. - PROBE_MEM32 allows STX and ST. If they fault the store is a nop. When LDX faults the destination register is zeroed. To support these on arm64, we do tmp2 = R28 + src/dst reg and then use tmp2 as the new src/dst register. This allows us to reuse most of the code for normal [LDX | STX | ST]. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20240325150716.4387-2-puranjay12@gmail.com Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 70 ++++++++++++++++++++++++++++++----- 1 file changed, 60 insertions(+), 10 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 82d6c562e8c7..c6210b150b6a 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -29,6 +29,7 @@ #define TCALL_CNT (MAX_BPF_JIT_REG + 2) #define TMP_REG_3 (MAX_BPF_JIT_REG + 3) #define FP_BOTTOM (MAX_BPF_JIT_REG + 4) +#define ARENA_VM_START (MAX_BPF_JIT_REG + 5) #define check_imm(bits, imm) do { \ if ((((imm) > 0) && ((imm) >> (bits))) || \ @@ -67,6 +68,8 @@ static const int bpf2a64[] = { /* temporary register for blinding constants */ [BPF_REG_AX] = A64_R(9), [FP_BOTTOM] = A64_R(27), + /* callee saved register for kern_vm_start address */ + [ARENA_VM_START] = A64_R(28), }; struct jit_ctx { @@ -295,7 +298,7 @@ static bool is_lsi_offset(int offset, int scale) #define PROLOGUE_OFFSET (BTI_INSNS + 2 + PAC_INSNS + 8) static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf, - bool is_exception_cb) + bool is_exception_cb, u64 arena_vm_start) { const struct bpf_prog *prog = ctx->prog; const bool is_main_prog = !bpf_is_subprog(prog); @@ -306,6 +309,7 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf, const u8 fp = bpf2a64[BPF_REG_FP]; const u8 tcc = bpf2a64[TCALL_CNT]; const u8 fpb = bpf2a64[FP_BOTTOM]; + const u8 arena_vm_base = bpf2a64[ARENA_VM_START]; const int idx0 = ctx->idx; int cur_offset; @@ -411,6 +415,10 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf, /* Set up function call stack */ emit(A64_SUB_I(1, A64_SP, A64_SP, ctx->stack_size), ctx); + + if (arena_vm_start) + emit_a64_mov_i64(arena_vm_base, arena_vm_start, ctx); + return 0; } @@ -738,6 +746,7 @@ static void build_epilogue(struct jit_ctx *ctx, bool is_exception_cb) #define BPF_FIXUP_OFFSET_MASK GENMASK(26, 0) #define BPF_FIXUP_REG_MASK GENMASK(31, 27) +#define DONT_CLEAR 5 /* Unused ARM64 register from BPF's POV */ bool ex_handler_bpf(const struct exception_table_entry *ex, struct pt_regs *regs) @@ -745,7 +754,8 @@ bool ex_handler_bpf(const struct exception_table_entry *ex, off_t offset = FIELD_GET(BPF_FIXUP_OFFSET_MASK, ex->fixup); int dst_reg = FIELD_GET(BPF_FIXUP_REG_MASK, ex->fixup); - regs->regs[dst_reg] = 0; + if (dst_reg != DONT_CLEAR) + regs->regs[dst_reg] = 0; regs->pc = (unsigned long)&ex->fixup - offset; return true; } @@ -765,7 +775,8 @@ static int add_exception_handler(const struct bpf_insn *insn, return 0; if (BPF_MODE(insn->code) != BPF_PROBE_MEM && - BPF_MODE(insn->code) != BPF_PROBE_MEMSX) + BPF_MODE(insn->code) != BPF_PROBE_MEMSX && + BPF_MODE(insn->code) != BPF_PROBE_MEM32) return 0; if (!ctx->prog->aux->extable || @@ -810,6 +821,9 @@ static int add_exception_handler(const struct bpf_insn *insn, ex->insn = ins_offset; + if (BPF_CLASS(insn->code) != BPF_LDX) + dst_reg = DONT_CLEAR; + ex->fixup = FIELD_PREP(BPF_FIXUP_OFFSET_MASK, fixup_offset) | FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg); @@ -829,12 +843,13 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, bool extra_pass) { const u8 code = insn->code; - const u8 dst = bpf2a64[insn->dst_reg]; - const u8 src = bpf2a64[insn->src_reg]; + u8 dst = bpf2a64[insn->dst_reg]; + u8 src = bpf2a64[insn->src_reg]; const u8 tmp = bpf2a64[TMP_REG_1]; const u8 tmp2 = bpf2a64[TMP_REG_2]; const u8 fp = bpf2a64[BPF_REG_FP]; const u8 fpb = bpf2a64[FP_BOTTOM]; + const u8 arena_vm_base = bpf2a64[ARENA_VM_START]; const s16 off = insn->off; const s32 imm = insn->imm; const int i = insn - ctx->prog->insnsi; @@ -1237,7 +1252,15 @@ emit_cond_jmp: case BPF_LDX | BPF_PROBE_MEMSX | BPF_B: case BPF_LDX | BPF_PROBE_MEMSX | BPF_H: case BPF_LDX | BPF_PROBE_MEMSX | BPF_W: - if (ctx->fpb_offset > 0 && src == fp) { + case BPF_LDX | BPF_PROBE_MEM32 | BPF_B: + case BPF_LDX | BPF_PROBE_MEM32 | BPF_H: + case BPF_LDX | BPF_PROBE_MEM32 | BPF_W: + case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW: + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) { + emit(A64_ADD(1, tmp2, src, arena_vm_base), ctx); + src = tmp2; + } + if (ctx->fpb_offset > 0 && src == fp && BPF_MODE(insn->code) != BPF_PROBE_MEM32) { src_adj = fpb; off_adj = off + ctx->fpb_offset; } else { @@ -1322,7 +1345,15 @@ emit_cond_jmp: case BPF_ST | BPF_MEM | BPF_H: case BPF_ST | BPF_MEM | BPF_B: case BPF_ST | BPF_MEM | BPF_DW: - if (ctx->fpb_offset > 0 && dst == fp) { + case BPF_ST | BPF_PROBE_MEM32 | BPF_B: + case BPF_ST | BPF_PROBE_MEM32 | BPF_H: + case BPF_ST | BPF_PROBE_MEM32 | BPF_W: + case BPF_ST | BPF_PROBE_MEM32 | BPF_DW: + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) { + emit(A64_ADD(1, tmp2, dst, arena_vm_base), ctx); + dst = tmp2; + } + if (ctx->fpb_offset > 0 && dst == fp && BPF_MODE(insn->code) != BPF_PROBE_MEM32) { dst_adj = fpb; off_adj = off + ctx->fpb_offset; } else { @@ -1365,6 +1396,10 @@ emit_cond_jmp: } break; } + + ret = add_exception_handler(insn, ctx, dst); + if (ret) + return ret; break; /* STX: *(size *)(dst + off) = src */ @@ -1372,7 +1407,15 @@ emit_cond_jmp: case BPF_STX | BPF_MEM | BPF_H: case BPF_STX | BPF_MEM | BPF_B: case BPF_STX | BPF_MEM | BPF_DW: - if (ctx->fpb_offset > 0 && dst == fp) { + case BPF_STX | BPF_PROBE_MEM32 | BPF_B: + case BPF_STX | BPF_PROBE_MEM32 | BPF_H: + case BPF_STX | BPF_PROBE_MEM32 | BPF_W: + case BPF_STX | BPF_PROBE_MEM32 | BPF_DW: + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) { + emit(A64_ADD(1, tmp2, dst, arena_vm_base), ctx); + dst = tmp2; + } + if (ctx->fpb_offset > 0 && dst == fp && BPF_MODE(insn->code) != BPF_PROBE_MEM32) { dst_adj = fpb; off_adj = off + ctx->fpb_offset; } else { @@ -1413,6 +1456,10 @@ emit_cond_jmp: } break; } + + ret = add_exception_handler(insn, ctx, dst); + if (ret) + return ret; break; case BPF_STX | BPF_ATOMIC | BPF_W: @@ -1594,6 +1641,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) bool tmp_blinded = false; bool extra_pass = false; struct jit_ctx ctx; + u64 arena_vm_start; u8 *image_ptr; u8 *ro_image_ptr; @@ -1611,6 +1659,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) prog = tmp; } + arena_vm_start = bpf_arena_get_kern_vm_start(prog->aux->arena); jit_data = prog->aux->jit_data; if (!jit_data) { jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL); @@ -1648,7 +1697,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) * BPF line info needs ctx->offset[i] to be the offset of * instruction[i] in jited image, so build prologue first. */ - if (build_prologue(&ctx, was_classic, prog->aux->exception_cb)) { + if (build_prologue(&ctx, was_classic, prog->aux->exception_cb, + arena_vm_start)) { prog = orig_prog; goto out_off; } @@ -1696,7 +1746,7 @@ skip_init_ctx: ctx.idx = 0; ctx.exentry_idx = 0; - build_prologue(&ctx, was_classic, prog->aux->exception_cb); + build_prologue(&ctx, was_classic, prog->aux->exception_cb, arena_vm_start); if (build_body(&ctx, extra_pass)) { prog = orig_prog; From 4dd31243e30843d5f63bccfb0369146e4de1a130 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 25 Mar 2024 15:07:16 +0000 Subject: [PATCH 040/147] bpf: Add arm64 JIT support for bpf_addr_space_cast instruction. LLVM generates bpf_addr_space_cast instruction while translating pointers between native (zero) address space and __attribute__((address_space(N))). The addr_space=0 is reserved as bpf_arena address space. rY = addr_space_cast(rX, 0, 1) is processed by the verifier and converted to normal 32-bit move: wX = wY. rY = addr_space_cast(rX, 1, 0) : used to convert a bpf arena pointer to a pointer in the userspace vma. This has to be converted by the JIT. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20240325150716.4387-3-puranjay12@gmail.com Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 16 ++++++++++++++++ tools/testing/selftests/bpf/DENYLIST.aarch64 | 2 -- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index c6210b150b6a..76b91f36c729 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -82,6 +82,7 @@ struct jit_ctx { __le32 *ro_image; u32 stack_size; int fpb_offset; + u64 user_vm_start; }; struct bpf_plt { @@ -868,6 +869,15 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, /* dst = src */ case BPF_ALU | BPF_MOV | BPF_X: case BPF_ALU64 | BPF_MOV | BPF_X: + if (insn_is_cast_user(insn)) { + emit(A64_MOV(0, tmp, src), ctx); // 32-bit mov clears the upper 32 bits + emit_a64_mov_i(0, dst, ctx->user_vm_start >> 32, ctx); + emit(A64_LSL(1, dst, dst, 32), ctx); + emit(A64_CBZ(1, tmp, 2), ctx); + emit(A64_ORR(1, tmp, dst, tmp), ctx); + emit(A64_MOV(1, dst, tmp), ctx); + break; + } switch (insn->off) { case 0: emit(A64_MOV(is64, dst, src), ctx); @@ -1690,6 +1700,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) } ctx.fpb_offset = find_fpb_offset(prog); + ctx.user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena); /* * 1. Initial fake pass to compute ctx->idx and ctx->offset. @@ -2511,6 +2522,11 @@ bool bpf_jit_supports_exceptions(void) return true; } +bool bpf_jit_supports_arena(void) +{ + return true; +} + void bpf_jit_free(struct bpf_prog *prog) { if (prog->jited) { diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 b/tools/testing/selftests/bpf/DENYLIST.aarch64 index d8ade15e2789..0445ac38bc07 100644 --- a/tools/testing/selftests/bpf/DENYLIST.aarch64 +++ b/tools/testing/selftests/bpf/DENYLIST.aarch64 @@ -10,5 +10,3 @@ fill_link_info/kprobe_multi_link_info # bpf_program__attach_kprobe_mu fill_link_info/kretprobe_multi_link_info # bpf_program__attach_kprobe_multi_opts unexpected error: -95 fill_link_info/kprobe_multi_invalid_ubuff # bpf_program__attach_kprobe_multi_opts unexpected error: -95 missed/kprobe_recursion # missed_kprobe_recursion__attach unexpected error: -95 (errno 95) -verifier_arena # JIT does not support arena -arena_htab # JIT does not support arena From 7effe3fdc049a34c56a68671100b5570e53e8f0a Mon Sep 17 00:00:00 2001 From: Tushar Vyavahare Date: Tue, 2 Apr 2024 11:45:23 +0000 Subject: [PATCH 041/147] tools: Add ethtool.h header to tooling infra This commit duplicates the ethtool.h file from the include/uapi/linux directory in the kernel source to the tools/include/uapi/linux directory. This action ensures that the ethtool.h file used in the tools directory is in sync with the kernel's version, maintaining consistency across the codebase. There are some checkpatch warnings in this file that could be cleaned up, but I preferred to move it over as-is for now to avoid disrupting the code. Signed-off-by: Tushar Vyavahare Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20240402114529.545475-2-tushar.vyavahare@intel.com --- tools/include/uapi/linux/ethtool.h | 2261 +++++++++++++++++++++++++++- 1 file changed, 2214 insertions(+), 47 deletions(-) diff --git a/tools/include/uapi/linux/ethtool.h b/tools/include/uapi/linux/ethtool.h index 47afae3895ec..11fc18988bc2 100644 --- a/tools/include/uapi/linux/ethtool.h +++ b/tools/include/uapi/linux/ethtool.h @@ -14,11 +14,524 @@ #ifndef _UAPI_LINUX_ETHTOOL_H #define _UAPI_LINUX_ETHTOOL_H -#include +#include #include #include -#define ETHTOOL_GCHANNELS 0x0000003c /* Get no of channels */ +#ifndef __KERNEL__ +#include /* for INT_MAX */ +#endif + +/* All structures exposed to userland should be defined such that they + * have the same layout for 32-bit and 64-bit userland. + */ + +/* Note on reserved space. + * Reserved fields must not be accessed directly by user space because + * they may be replaced by a different field in the future. They must + * be initialized to zero before making the request, e.g. via memset + * of the entire structure or implicitly by not being set in a structure + * initializer. + */ + +/** + * struct ethtool_cmd - DEPRECATED, link control and status + * This structure is DEPRECATED, please use struct ethtool_link_settings. + * @cmd: Command number = %ETHTOOL_GSET or %ETHTOOL_SSET + * @supported: Bitmask of %SUPPORTED_* flags for the link modes, + * physical connectors and other link features for which the + * interface supports autonegotiation or auto-detection. + * Read-only. + * @advertising: Bitmask of %ADVERTISED_* flags for the link modes, + * physical connectors and other link features that are + * advertised through autonegotiation or enabled for + * auto-detection. + * @speed: Low bits of the speed, 1Mb units, 0 to INT_MAX or SPEED_UNKNOWN + * @duplex: Duplex mode; one of %DUPLEX_* + * @port: Physical connector type; one of %PORT_* + * @phy_address: MDIO address of PHY (transceiver); 0 or 255 if not + * applicable. For clause 45 PHYs this is the PRTAD. + * @transceiver: Historically used to distinguish different possible + * PHY types, but not in a consistent way. Deprecated. + * @autoneg: Enable/disable autonegotiation and auto-detection; + * either %AUTONEG_DISABLE or %AUTONEG_ENABLE + * @mdio_support: Bitmask of %ETH_MDIO_SUPPORTS_* flags for the MDIO + * protocols supported by the interface; 0 if unknown. + * Read-only. + * @maxtxpkt: Historically used to report TX IRQ coalescing; now + * obsoleted by &struct ethtool_coalesce. Read-only; deprecated. + * @maxrxpkt: Historically used to report RX IRQ coalescing; now + * obsoleted by &struct ethtool_coalesce. Read-only; deprecated. + * @speed_hi: High bits of the speed, 1Mb units, 0 to INT_MAX or SPEED_UNKNOWN + * @eth_tp_mdix: Ethernet twisted-pair MDI(-X) status; one of + * %ETH_TP_MDI_*. If the status is unknown or not applicable, the + * value will be %ETH_TP_MDI_INVALID. Read-only. + * @eth_tp_mdix_ctrl: Ethernet twisted pair MDI(-X) control; one of + * %ETH_TP_MDI_*. If MDI(-X) control is not implemented, reads + * yield %ETH_TP_MDI_INVALID and writes may be ignored or rejected. + * When written successfully, the link should be renegotiated if + * necessary. + * @lp_advertising: Bitmask of %ADVERTISED_* flags for the link modes + * and other link features that the link partner advertised + * through autonegotiation; 0 if unknown or not applicable. + * Read-only. + * @reserved: Reserved for future use; see the note on reserved space. + * + * The link speed in Mbps is split between @speed and @speed_hi. Use + * the ethtool_cmd_speed() and ethtool_cmd_speed_set() functions to + * access it. + * + * If autonegotiation is disabled, the speed and @duplex represent the + * fixed link mode and are writable if the driver supports multiple + * link modes. If it is enabled then they are read-only; if the link + * is up they represent the negotiated link mode; if the link is down, + * the speed is 0, %SPEED_UNKNOWN or the highest enabled speed and + * @duplex is %DUPLEX_UNKNOWN or the best enabled duplex mode. + * + * Some hardware interfaces may have multiple PHYs and/or physical + * connectors fitted or do not allow the driver to detect which are + * fitted. For these interfaces @port and/or @phy_address may be + * writable, possibly dependent on @autoneg being %AUTONEG_DISABLE. + * Otherwise, attempts to write different values may be ignored or + * rejected. + * + * Users should assume that all fields not marked read-only are + * writable and subject to validation by the driver. They should use + * %ETHTOOL_GSET to get the current values before making specific + * changes and then applying them with %ETHTOOL_SSET. + * + * Deprecated fields should be ignored by both users and drivers. + */ +struct ethtool_cmd { + __u32 cmd; + __u32 supported; + __u32 advertising; + __u16 speed; + __u8 duplex; + __u8 port; + __u8 phy_address; + __u8 transceiver; + __u8 autoneg; + __u8 mdio_support; + __u32 maxtxpkt; + __u32 maxrxpkt; + __u16 speed_hi; + __u8 eth_tp_mdix; + __u8 eth_tp_mdix_ctrl; + __u32 lp_advertising; + __u32 reserved[2]; +}; + +static inline void ethtool_cmd_speed_set(struct ethtool_cmd *ep, + __u32 speed) +{ + ep->speed = (__u16)(speed & 0xFFFF); + ep->speed_hi = (__u16)(speed >> 16); +} + +static inline __u32 ethtool_cmd_speed(const struct ethtool_cmd *ep) +{ + return (ep->speed_hi << 16) | ep->speed; +} + +/* Device supports clause 22 register access to PHY or peripherals + * using the interface defined in . This should not be + * set if there are known to be no such peripherals present or if + * the driver only emulates clause 22 registers for compatibility. + */ +#define ETH_MDIO_SUPPORTS_C22 1 + +/* Device supports clause 45 register access to PHY or peripherals + * using the interface defined in and . + * This should not be set if there are known to be no such peripherals + * present. + */ +#define ETH_MDIO_SUPPORTS_C45 2 + +#define ETHTOOL_FWVERS_LEN 32 +#define ETHTOOL_BUSINFO_LEN 32 +#define ETHTOOL_EROMVERS_LEN 32 + +/** + * struct ethtool_drvinfo - general driver and device information + * @cmd: Command number = %ETHTOOL_GDRVINFO + * @driver: Driver short name. This should normally match the name + * in its bus driver structure (e.g. pci_driver::name). Must + * not be an empty string. + * @version: Driver version string; may be an empty string + * @fw_version: Firmware version string; driver defined; may be an + * empty string + * @erom_version: Expansion ROM version string; driver defined; may be + * an empty string + * @bus_info: Device bus address. This should match the dev_name() + * string for the underlying bus device, if there is one. May be + * an empty string. + * @reserved2: Reserved for future use; see the note on reserved space. + * @n_priv_flags: Number of flags valid for %ETHTOOL_GPFLAGS and + * %ETHTOOL_SPFLAGS commands; also the number of strings in the + * %ETH_SS_PRIV_FLAGS set + * @n_stats: Number of u64 statistics returned by the %ETHTOOL_GSTATS + * command; also the number of strings in the %ETH_SS_STATS set + * @testinfo_len: Number of results returned by the %ETHTOOL_TEST + * command; also the number of strings in the %ETH_SS_TEST set + * @eedump_len: Size of EEPROM accessible through the %ETHTOOL_GEEPROM + * and %ETHTOOL_SEEPROM commands, in bytes + * @regdump_len: Size of register dump returned by the %ETHTOOL_GREGS + * command, in bytes + * + * Users can use the %ETHTOOL_GSSET_INFO command to get the number of + * strings in any string set (from Linux 2.6.34). + */ +struct ethtool_drvinfo { + __u32 cmd; + char driver[32]; + char version[32]; + char fw_version[ETHTOOL_FWVERS_LEN]; + char bus_info[ETHTOOL_BUSINFO_LEN]; + char erom_version[ETHTOOL_EROMVERS_LEN]; + char reserved2[12]; + __u32 n_priv_flags; + __u32 n_stats; + __u32 testinfo_len; + __u32 eedump_len; + __u32 regdump_len; +}; + +#define SOPASS_MAX 6 + +/** + * struct ethtool_wolinfo - Wake-On-Lan configuration + * @cmd: Command number = %ETHTOOL_GWOL or %ETHTOOL_SWOL + * @supported: Bitmask of %WAKE_* flags for supported Wake-On-Lan modes. + * Read-only. + * @wolopts: Bitmask of %WAKE_* flags for enabled Wake-On-Lan modes. + * @sopass: SecureOn(tm) password; meaningful only if %WAKE_MAGICSECURE + * is set in @wolopts. + */ +struct ethtool_wolinfo { + __u32 cmd; + __u32 supported; + __u32 wolopts; + __u8 sopass[SOPASS_MAX]; +}; + +/* for passing single values */ +struct ethtool_value { + __u32 cmd; + __u32 data; +}; + +#define PFC_STORM_PREVENTION_AUTO 0xffff +#define PFC_STORM_PREVENTION_DISABLE 0 + +enum tunable_id { + ETHTOOL_ID_UNSPEC, + ETHTOOL_RX_COPYBREAK, + ETHTOOL_TX_COPYBREAK, + ETHTOOL_PFC_PREVENTION_TOUT, /* timeout in msecs */ + ETHTOOL_TX_COPYBREAK_BUF_SIZE, + /* + * Add your fresh new tunable attribute above and remember to update + * tunable_strings[] in net/ethtool/common.c + */ + __ETHTOOL_TUNABLE_COUNT, +}; + +enum tunable_type_id { + ETHTOOL_TUNABLE_UNSPEC, + ETHTOOL_TUNABLE_U8, + ETHTOOL_TUNABLE_U16, + ETHTOOL_TUNABLE_U32, + ETHTOOL_TUNABLE_U64, + ETHTOOL_TUNABLE_STRING, + ETHTOOL_TUNABLE_S8, + ETHTOOL_TUNABLE_S16, + ETHTOOL_TUNABLE_S32, + ETHTOOL_TUNABLE_S64, +}; + +struct ethtool_tunable { + __u32 cmd; + __u32 id; + __u32 type_id; + __u32 len; + void *data[]; +}; + +#define DOWNSHIFT_DEV_DEFAULT_COUNT 0xff +#define DOWNSHIFT_DEV_DISABLE 0 + +/* Time in msecs after which link is reported as down + * 0 = lowest time supported by the PHY + * 0xff = off, link down detection according to standard + */ +#define ETHTOOL_PHY_FAST_LINK_DOWN_ON 0 +#define ETHTOOL_PHY_FAST_LINK_DOWN_OFF 0xff + +/* Energy Detect Power Down (EDPD) is a feature supported by some PHYs, where + * the PHY's RX & TX blocks are put into a low-power mode when there is no + * link detected (typically cable is un-plugged). For RX, only a minimal + * link-detection is available, and for TX the PHY wakes up to send link pulses + * to avoid any lock-ups in case the peer PHY may also be running in EDPD mode. + * + * Some PHYs may support configuration of the wake-up interval for TX pulses, + * and some PHYs may support only disabling TX pulses entirely. For the latter + * a special value is required (ETHTOOL_PHY_EDPD_NO_TX) so that this can be + * configured from userspace (should the user want it). + * + * The interval units for TX wake-up are in milliseconds, since this should + * cover a reasonable range of intervals: + * - from 1 millisecond, which does not sound like much of a power-saver + * - to ~65 seconds which is quite a lot to wait for a link to come up when + * plugging a cable + */ +#define ETHTOOL_PHY_EDPD_DFLT_TX_MSECS 0xffff +#define ETHTOOL_PHY_EDPD_NO_TX 0xfffe +#define ETHTOOL_PHY_EDPD_DISABLE 0 + +enum phy_tunable_id { + ETHTOOL_PHY_ID_UNSPEC, + ETHTOOL_PHY_DOWNSHIFT, + ETHTOOL_PHY_FAST_LINK_DOWN, + ETHTOOL_PHY_EDPD, + /* + * Add your fresh new phy tunable attribute above and remember to update + * phy_tunable_strings[] in net/ethtool/common.c + */ + __ETHTOOL_PHY_TUNABLE_COUNT, +}; + +/** + * struct ethtool_regs - hardware register dump + * @cmd: Command number = %ETHTOOL_GREGS + * @version: Dump format version. This is driver-specific and may + * distinguish different chips/revisions. Drivers must use new + * version numbers whenever the dump format changes in an + * incompatible way. + * @len: On entry, the real length of @data. On return, the number of + * bytes used. + * @data: Buffer for the register dump + * + * Users should use %ETHTOOL_GDRVINFO to find the maximum length of + * a register dump for the interface. They must allocate the buffer + * immediately following this structure. + */ +struct ethtool_regs { + __u32 cmd; + __u32 version; + __u32 len; + __u8 data[]; +}; + +/** + * struct ethtool_eeprom - EEPROM dump + * @cmd: Command number = %ETHTOOL_GEEPROM, %ETHTOOL_GMODULEEEPROM or + * %ETHTOOL_SEEPROM + * @magic: A 'magic cookie' value to guard against accidental changes. + * The value passed in to %ETHTOOL_SEEPROM must match the value + * returned by %ETHTOOL_GEEPROM for the same device. This is + * unused when @cmd is %ETHTOOL_GMODULEEEPROM. + * @offset: Offset within the EEPROM to begin reading/writing, in bytes + * @len: On entry, number of bytes to read/write. On successful + * return, number of bytes actually read/written. In case of + * error, this may indicate at what point the error occurred. + * @data: Buffer to read/write from + * + * Users may use %ETHTOOL_GDRVINFO or %ETHTOOL_GMODULEINFO to find + * the length of an on-board or module EEPROM, respectively. They + * must allocate the buffer immediately following this structure. + */ +struct ethtool_eeprom { + __u32 cmd; + __u32 magic; + __u32 offset; + __u32 len; + __u8 data[]; +}; + +/** + * struct ethtool_eee - Energy Efficient Ethernet information + * @cmd: ETHTOOL_{G,S}EEE + * @supported: Mask of %SUPPORTED_* flags for the speed/duplex combinations + * for which there is EEE support. + * @advertised: Mask of %ADVERTISED_* flags for the speed/duplex combinations + * advertised as eee capable. + * @lp_advertised: Mask of %ADVERTISED_* flags for the speed/duplex + * combinations advertised by the link partner as eee capable. + * @eee_active: Result of the eee auto negotiation. + * @eee_enabled: EEE configured mode (enabled/disabled). + * @tx_lpi_enabled: Whether the interface should assert its tx lpi, given + * that eee was negotiated. + * @tx_lpi_timer: Time in microseconds the interface delays prior to asserting + * its tx lpi (after reaching 'idle' state). Effective only when eee + * was negotiated and tx_lpi_enabled was set. + * @reserved: Reserved for future use; see the note on reserved space. + */ +struct ethtool_eee { + __u32 cmd; + __u32 supported; + __u32 advertised; + __u32 lp_advertised; + __u32 eee_active; + __u32 eee_enabled; + __u32 tx_lpi_enabled; + __u32 tx_lpi_timer; + __u32 reserved[2]; +}; + +/** + * struct ethtool_modinfo - plugin module eeprom information + * @cmd: %ETHTOOL_GMODULEINFO + * @type: Standard the module information conforms to %ETH_MODULE_SFF_xxxx + * @eeprom_len: Length of the eeprom + * @reserved: Reserved for future use; see the note on reserved space. + * + * This structure is used to return the information to + * properly size memory for a subsequent call to %ETHTOOL_GMODULEEEPROM. + * The type code indicates the eeprom data format + */ +struct ethtool_modinfo { + __u32 cmd; + __u32 type; + __u32 eeprom_len; + __u32 reserved[8]; +}; + +/** + * struct ethtool_coalesce - coalescing parameters for IRQs and stats updates + * @cmd: ETHTOOL_{G,S}COALESCE + * @rx_coalesce_usecs: How many usecs to delay an RX interrupt after + * a packet arrives. + * @rx_max_coalesced_frames: Maximum number of packets to receive + * before an RX interrupt. + * @rx_coalesce_usecs_irq: Same as @rx_coalesce_usecs, except that + * this value applies while an IRQ is being serviced by the host. + * @rx_max_coalesced_frames_irq: Same as @rx_max_coalesced_frames, + * except that this value applies while an IRQ is being serviced + * by the host. + * @tx_coalesce_usecs: How many usecs to delay a TX interrupt after + * a packet is sent. + * @tx_max_coalesced_frames: Maximum number of packets to be sent + * before a TX interrupt. + * @tx_coalesce_usecs_irq: Same as @tx_coalesce_usecs, except that + * this value applies while an IRQ is being serviced by the host. + * @tx_max_coalesced_frames_irq: Same as @tx_max_coalesced_frames, + * except that this value applies while an IRQ is being serviced + * by the host. + * @stats_block_coalesce_usecs: How many usecs to delay in-memory + * statistics block updates. Some drivers do not have an + * in-memory statistic block, and in such cases this value is + * ignored. This value must not be zero. + * @use_adaptive_rx_coalesce: Enable adaptive RX coalescing. + * @use_adaptive_tx_coalesce: Enable adaptive TX coalescing. + * @pkt_rate_low: Threshold for low packet rate (packets per second). + * @rx_coalesce_usecs_low: How many usecs to delay an RX interrupt after + * a packet arrives, when the packet rate is below @pkt_rate_low. + * @rx_max_coalesced_frames_low: Maximum number of packets to be received + * before an RX interrupt, when the packet rate is below @pkt_rate_low. + * @tx_coalesce_usecs_low: How many usecs to delay a TX interrupt after + * a packet is sent, when the packet rate is below @pkt_rate_low. + * @tx_max_coalesced_frames_low: Maximum nuumber of packets to be sent before + * a TX interrupt, when the packet rate is below @pkt_rate_low. + * @pkt_rate_high: Threshold for high packet rate (packets per second). + * @rx_coalesce_usecs_high: How many usecs to delay an RX interrupt after + * a packet arrives, when the packet rate is above @pkt_rate_high. + * @rx_max_coalesced_frames_high: Maximum number of packets to be received + * before an RX interrupt, when the packet rate is above @pkt_rate_high. + * @tx_coalesce_usecs_high: How many usecs to delay a TX interrupt after + * a packet is sent, when the packet rate is above @pkt_rate_high. + * @tx_max_coalesced_frames_high: Maximum number of packets to be sent before + * a TX interrupt, when the packet rate is above @pkt_rate_high. + * @rate_sample_interval: How often to do adaptive coalescing packet rate + * sampling, measured in seconds. Must not be zero. + * + * Each pair of (usecs, max_frames) fields specifies that interrupts + * should be coalesced until + * (usecs > 0 && time_since_first_completion >= usecs) || + * (max_frames > 0 && completed_frames >= max_frames) + * + * It is illegal to set both usecs and max_frames to zero as this + * would cause interrupts to never be generated. To disable + * coalescing, set usecs = 0 and max_frames = 1. + * + * Some implementations ignore the value of max_frames and use the + * condition time_since_first_completion >= usecs + * + * This is deprecated. Drivers for hardware that does not support + * counting completions should validate that max_frames == !rx_usecs. + * + * Adaptive RX/TX coalescing is an algorithm implemented by some + * drivers to improve latency under low packet rates and improve + * throughput under high packet rates. Some drivers only implement + * one of RX or TX adaptive coalescing. Anything not implemented by + * the driver causes these values to be silently ignored. + * + * When the packet rate is below @pkt_rate_high but above + * @pkt_rate_low (both measured in packets per second) the + * normal {rx,tx}_* coalescing parameters are used. + */ +struct ethtool_coalesce { + __u32 cmd; + __u32 rx_coalesce_usecs; + __u32 rx_max_coalesced_frames; + __u32 rx_coalesce_usecs_irq; + __u32 rx_max_coalesced_frames_irq; + __u32 tx_coalesce_usecs; + __u32 tx_max_coalesced_frames; + __u32 tx_coalesce_usecs_irq; + __u32 tx_max_coalesced_frames_irq; + __u32 stats_block_coalesce_usecs; + __u32 use_adaptive_rx_coalesce; + __u32 use_adaptive_tx_coalesce; + __u32 pkt_rate_low; + __u32 rx_coalesce_usecs_low; + __u32 rx_max_coalesced_frames_low; + __u32 tx_coalesce_usecs_low; + __u32 tx_max_coalesced_frames_low; + __u32 pkt_rate_high; + __u32 rx_coalesce_usecs_high; + __u32 rx_max_coalesced_frames_high; + __u32 tx_coalesce_usecs_high; + __u32 tx_max_coalesced_frames_high; + __u32 rate_sample_interval; +}; + +/** + * struct ethtool_ringparam - RX/TX ring parameters + * @cmd: Command number = %ETHTOOL_GRINGPARAM or %ETHTOOL_SRINGPARAM + * @rx_max_pending: Maximum supported number of pending entries per + * RX ring. Read-only. + * @rx_mini_max_pending: Maximum supported number of pending entries + * per RX mini ring. Read-only. + * @rx_jumbo_max_pending: Maximum supported number of pending entries + * per RX jumbo ring. Read-only. + * @tx_max_pending: Maximum supported number of pending entries per + * TX ring. Read-only. + * @rx_pending: Current maximum number of pending entries per RX ring + * @rx_mini_pending: Current maximum number of pending entries per RX + * mini ring + * @rx_jumbo_pending: Current maximum number of pending entries per RX + * jumbo ring + * @tx_pending: Current maximum supported number of pending entries + * per TX ring + * + * If the interface does not have separate RX mini and/or jumbo rings, + * @rx_mini_max_pending and/or @rx_jumbo_max_pending will be 0. + * + * There may also be driver-dependent minimum values for the number + * of entries per ring. + */ +struct ethtool_ringparam { + __u32 cmd; + __u32 rx_max_pending; + __u32 rx_mini_max_pending; + __u32 rx_jumbo_max_pending; + __u32 tx_max_pending; + __u32 rx_pending; + __u32 rx_mini_pending; + __u32 rx_jumbo_pending; + __u32 tx_pending; +}; /** * struct ethtool_channels - configuring number of network channel @@ -48,57 +561,1711 @@ struct ethtool_channels { __u32 combined_count; }; -#define ETHTOOL_FWVERS_LEN 32 -#define ETHTOOL_BUSINFO_LEN 32 -#define ETHTOOL_EROMVERS_LEN 32 - /** - * struct ethtool_drvinfo - general driver and device information - * @cmd: Command number = %ETHTOOL_GDRVINFO - * @driver: Driver short name. This should normally match the name - * in its bus driver structure (e.g. pci_driver::name). Must - * not be an empty string. - * @version: Driver version string; may be an empty string - * @fw_version: Firmware version string; may be an empty string - * @erom_version: Expansion ROM version string; may be an empty string - * @bus_info: Device bus address. This should match the dev_name() - * string for the underlying bus device, if there is one. May be - * an empty string. - * @reserved2: Reserved for future use; see the note on reserved space. - * @n_priv_flags: Number of flags valid for %ETHTOOL_GPFLAGS and - * %ETHTOOL_SPFLAGS commands; also the number of strings in the - * %ETH_SS_PRIV_FLAGS set - * @n_stats: Number of u64 statistics returned by the %ETHTOOL_GSTATS - * command; also the number of strings in the %ETH_SS_STATS set - * @testinfo_len: Number of results returned by the %ETHTOOL_TEST - * command; also the number of strings in the %ETH_SS_TEST set - * @eedump_len: Size of EEPROM accessible through the %ETHTOOL_GEEPROM - * and %ETHTOOL_SEEPROM commands, in bytes - * @regdump_len: Size of register dump returned by the %ETHTOOL_GREGS - * command, in bytes + * struct ethtool_pauseparam - Ethernet pause (flow control) parameters + * @cmd: Command number = %ETHTOOL_GPAUSEPARAM or %ETHTOOL_SPAUSEPARAM + * @autoneg: Flag to enable autonegotiation of pause frame use + * @rx_pause: Flag to enable reception of pause frames + * @tx_pause: Flag to enable transmission of pause frames * - * Users can use the %ETHTOOL_GSSET_INFO command to get the number of - * strings in any string set (from Linux 2.6.34). + * Drivers should reject a non-zero setting of @autoneg when + * autoneogotiation is disabled (or not supported) for the link. * - * Drivers should set at most @driver, @version, @fw_version and - * @bus_info in their get_drvinfo() implementation. The ethtool - * core fills in the other fields using other driver operations. + * If the link is autonegotiated, drivers should use + * mii_advertise_flowctrl() or similar code to set the advertised + * pause frame capabilities based on the @rx_pause and @tx_pause flags, + * even if @autoneg is zero. They should also allow the advertised + * pause frame capabilities to be controlled directly through the + * advertising field of &struct ethtool_cmd. + * + * If @autoneg is non-zero, the MAC is configured to send and/or + * receive pause frames according to the result of autonegotiation. + * Otherwise, it is configured directly based on the @rx_pause and + * @tx_pause flags. */ -struct ethtool_drvinfo { +struct ethtool_pauseparam { __u32 cmd; - char driver[32]; - char version[32]; - char fw_version[ETHTOOL_FWVERS_LEN]; - char bus_info[ETHTOOL_BUSINFO_LEN]; - char erom_version[ETHTOOL_EROMVERS_LEN]; - char reserved2[12]; - __u32 n_priv_flags; - __u32 n_stats; - __u32 testinfo_len; - __u32 eedump_len; - __u32 regdump_len; + __u32 autoneg; + __u32 rx_pause; + __u32 tx_pause; }; -#define ETHTOOL_GDRVINFO 0x00000003 +/* Link extended state */ +enum ethtool_link_ext_state { + ETHTOOL_LINK_EXT_STATE_AUTONEG, + ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE, + ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH, + ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY, + ETHTOOL_LINK_EXT_STATE_NO_CABLE, + ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE, + ETHTOOL_LINK_EXT_STATE_EEPROM_ISSUE, + ETHTOOL_LINK_EXT_STATE_CALIBRATION_FAILURE, + ETHTOOL_LINK_EXT_STATE_POWER_BUDGET_EXCEEDED, + ETHTOOL_LINK_EXT_STATE_OVERHEAT, + ETHTOOL_LINK_EXT_STATE_MODULE, +}; +/* More information in addition to ETHTOOL_LINK_EXT_STATE_AUTONEG. */ +enum ethtool_link_ext_substate_autoneg { + ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_PARTNER_DETECTED = 1, + ETHTOOL_LINK_EXT_SUBSTATE_AN_ACK_NOT_RECEIVED, + ETHTOOL_LINK_EXT_SUBSTATE_AN_NEXT_PAGE_EXCHANGE_FAILED, + ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_PARTNER_DETECTED_FORCE_MODE, + ETHTOOL_LINK_EXT_SUBSTATE_AN_FEC_MISMATCH_DURING_OVERRIDE, + ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_HCD, +}; + +/* More information in addition to ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE. + */ +enum ethtool_link_ext_substate_link_training { + ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_FRAME_LOCK_NOT_ACQUIRED = 1, + ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_LINK_INHIBIT_TIMEOUT, + ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_LINK_PARTNER_DID_NOT_SET_RECEIVER_READY, + ETHTOOL_LINK_EXT_SUBSTATE_LT_REMOTE_FAULT, +}; + +/* More information in addition to ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH. + */ +enum ethtool_link_ext_substate_link_logical_mismatch { + ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_ACQUIRE_BLOCK_LOCK = 1, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_ACQUIRE_AM_LOCK, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_GET_ALIGN_STATUS, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_FC_FEC_IS_NOT_LOCKED, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_RS_FEC_IS_NOT_LOCKED, +}; + +/* More information in addition to ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY. + */ +enum ethtool_link_ext_substate_bad_signal_integrity { + ETHTOOL_LINK_EXT_SUBSTATE_BSI_LARGE_NUMBER_OF_PHYSICAL_ERRORS = 1, + ETHTOOL_LINK_EXT_SUBSTATE_BSI_UNSUPPORTED_RATE, + ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_REFERENCE_CLOCK_LOST, + ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_ALOS, +}; + +/* More information in addition to ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE. */ +enum ethtool_link_ext_substate_cable_issue { + ETHTOOL_LINK_EXT_SUBSTATE_CI_UNSUPPORTED_CABLE = 1, + ETHTOOL_LINK_EXT_SUBSTATE_CI_CABLE_TEST_FAILURE, +}; + +/* More information in addition to ETHTOOL_LINK_EXT_STATE_MODULE. */ +enum ethtool_link_ext_substate_module { + ETHTOOL_LINK_EXT_SUBSTATE_MODULE_CMIS_NOT_READY = 1, +}; + +#define ETH_GSTRING_LEN 32 + +/** + * enum ethtool_stringset - string set ID + * @ETH_SS_TEST: Self-test result names, for use with %ETHTOOL_TEST + * @ETH_SS_STATS: Statistic names, for use with %ETHTOOL_GSTATS + * @ETH_SS_PRIV_FLAGS: Driver private flag names, for use with + * %ETHTOOL_GPFLAGS and %ETHTOOL_SPFLAGS + * @ETH_SS_NTUPLE_FILTERS: Previously used with %ETHTOOL_GRXNTUPLE; + * now deprecated + * @ETH_SS_FEATURES: Device feature names + * @ETH_SS_RSS_HASH_FUNCS: RSS hush function names + * @ETH_SS_TUNABLES: tunable names + * @ETH_SS_PHY_STATS: Statistic names, for use with %ETHTOOL_GPHYSTATS + * @ETH_SS_PHY_TUNABLES: PHY tunable names + * @ETH_SS_LINK_MODES: link mode names + * @ETH_SS_MSG_CLASSES: debug message class names + * @ETH_SS_WOL_MODES: wake-on-lan modes + * @ETH_SS_SOF_TIMESTAMPING: SOF_TIMESTAMPING_* flags + * @ETH_SS_TS_TX_TYPES: timestamping Tx types + * @ETH_SS_TS_RX_FILTERS: timestamping Rx filters + * @ETH_SS_UDP_TUNNEL_TYPES: UDP tunnel types + * @ETH_SS_STATS_STD: standardized stats + * @ETH_SS_STATS_ETH_PHY: names of IEEE 802.3 PHY statistics + * @ETH_SS_STATS_ETH_MAC: names of IEEE 802.3 MAC statistics + * @ETH_SS_STATS_ETH_CTRL: names of IEEE 802.3 MAC Control statistics + * @ETH_SS_STATS_RMON: names of RMON statistics + * + * @ETH_SS_COUNT: number of defined string sets + */ +enum ethtool_stringset { + ETH_SS_TEST = 0, + ETH_SS_STATS, + ETH_SS_PRIV_FLAGS, + ETH_SS_NTUPLE_FILTERS, + ETH_SS_FEATURES, + ETH_SS_RSS_HASH_FUNCS, + ETH_SS_TUNABLES, + ETH_SS_PHY_STATS, + ETH_SS_PHY_TUNABLES, + ETH_SS_LINK_MODES, + ETH_SS_MSG_CLASSES, + ETH_SS_WOL_MODES, + ETH_SS_SOF_TIMESTAMPING, + ETH_SS_TS_TX_TYPES, + ETH_SS_TS_RX_FILTERS, + ETH_SS_UDP_TUNNEL_TYPES, + ETH_SS_STATS_STD, + ETH_SS_STATS_ETH_PHY, + ETH_SS_STATS_ETH_MAC, + ETH_SS_STATS_ETH_CTRL, + ETH_SS_STATS_RMON, + + /* add new constants above here */ + ETH_SS_COUNT +}; + +/** + * enum ethtool_mac_stats_src - source of ethtool MAC statistics + * @ETHTOOL_MAC_STATS_SRC_AGGREGATE: + * if device supports a MAC merge layer, this retrieves the aggregate + * statistics of the eMAC and pMAC. Otherwise, it retrieves just the + * statistics of the single (express) MAC. + * @ETHTOOL_MAC_STATS_SRC_EMAC: + * if device supports a MM layer, this retrieves the eMAC statistics. + * Otherwise, it retrieves the statistics of the single (express) MAC. + * @ETHTOOL_MAC_STATS_SRC_PMAC: + * if device supports a MM layer, this retrieves the pMAC statistics. + */ +enum ethtool_mac_stats_src { + ETHTOOL_MAC_STATS_SRC_AGGREGATE, + ETHTOOL_MAC_STATS_SRC_EMAC, + ETHTOOL_MAC_STATS_SRC_PMAC, +}; + +/** + * enum ethtool_module_power_mode_policy - plug-in module power mode policy + * @ETHTOOL_MODULE_POWER_MODE_POLICY_HIGH: Module is always in high power mode. + * @ETHTOOL_MODULE_POWER_MODE_POLICY_AUTO: Module is transitioned by the host + * to high power mode when the first port using it is put administratively + * up and to low power mode when the last port using it is put + * administratively down. + */ +enum ethtool_module_power_mode_policy { + ETHTOOL_MODULE_POWER_MODE_POLICY_HIGH = 1, + ETHTOOL_MODULE_POWER_MODE_POLICY_AUTO, +}; + +/** + * enum ethtool_module_power_mode - plug-in module power mode + * @ETHTOOL_MODULE_POWER_MODE_LOW: Module is in low power mode. + * @ETHTOOL_MODULE_POWER_MODE_HIGH: Module is in high power mode. + */ +enum ethtool_module_power_mode { + ETHTOOL_MODULE_POWER_MODE_LOW = 1, + ETHTOOL_MODULE_POWER_MODE_HIGH, +}; + +/** + * enum ethtool_podl_pse_admin_state - operational state of the PoDL PSE + * functions. IEEE 802.3-2018 30.15.1.1.2 aPoDLPSEAdminState + * @ETHTOOL_PODL_PSE_ADMIN_STATE_UNKNOWN: state of PoDL PSE functions are + * unknown + * @ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED: PoDL PSE functions are disabled + * @ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED: PoDL PSE functions are enabled + */ +enum ethtool_podl_pse_admin_state { + ETHTOOL_PODL_PSE_ADMIN_STATE_UNKNOWN = 1, + ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED, + ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED, +}; + +/** + * enum ethtool_podl_pse_pw_d_status - power detection status of the PoDL PSE. + * IEEE 802.3-2018 30.15.1.1.3 aPoDLPSEPowerDetectionStatus: + * @ETHTOOL_PODL_PSE_PW_D_STATUS_UNKNOWN: PoDL PSE + * @ETHTOOL_PODL_PSE_PW_D_STATUS_DISABLED: "The enumeration “disabled” is + * asserted true when the PoDL PSE state diagram variable mr_pse_enable is + * false" + * @ETHTOOL_PODL_PSE_PW_D_STATUS_SEARCHING: "The enumeration “searching” is + * asserted true when either of the PSE state diagram variables + * pi_detecting or pi_classifying is true." + * @ETHTOOL_PODL_PSE_PW_D_STATUS_DELIVERING: "The enumeration “deliveringPower” + * is asserted true when the PoDL PSE state diagram variable pi_powered is + * true." + * @ETHTOOL_PODL_PSE_PW_D_STATUS_SLEEP: "The enumeration “sleep” is asserted + * true when the PoDL PSE state diagram variable pi_sleeping is true." + * @ETHTOOL_PODL_PSE_PW_D_STATUS_IDLE: "The enumeration “idle” is asserted true + * when the logical combination of the PoDL PSE state diagram variables + * pi_prebiased*!pi_sleeping is true." + * @ETHTOOL_PODL_PSE_PW_D_STATUS_ERROR: "The enumeration “error” is asserted + * true when the PoDL PSE state diagram variable overload_held is true." + */ +enum ethtool_podl_pse_pw_d_status { + ETHTOOL_PODL_PSE_PW_D_STATUS_UNKNOWN = 1, + ETHTOOL_PODL_PSE_PW_D_STATUS_DISABLED, + ETHTOOL_PODL_PSE_PW_D_STATUS_SEARCHING, + ETHTOOL_PODL_PSE_PW_D_STATUS_DELIVERING, + ETHTOOL_PODL_PSE_PW_D_STATUS_SLEEP, + ETHTOOL_PODL_PSE_PW_D_STATUS_IDLE, + ETHTOOL_PODL_PSE_PW_D_STATUS_ERROR, +}; + +/** + * enum ethtool_mm_verify_status - status of MAC Merge Verify function + * @ETHTOOL_MM_VERIFY_STATUS_UNKNOWN: + * verification status is unknown + * @ETHTOOL_MM_VERIFY_STATUS_INITIAL: + * the 802.3 Verify State diagram is in the state INIT_VERIFICATION + * @ETHTOOL_MM_VERIFY_STATUS_VERIFYING: + * the Verify State diagram is in the state VERIFICATION_IDLE, + * SEND_VERIFY or WAIT_FOR_RESPONSE + * @ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED: + * indicates that the Verify State diagram is in the state VERIFIED + * @ETHTOOL_MM_VERIFY_STATUS_FAILED: + * the Verify State diagram is in the state VERIFY_FAIL + * @ETHTOOL_MM_VERIFY_STATUS_DISABLED: + * verification of preemption operation is disabled + */ +enum ethtool_mm_verify_status { + ETHTOOL_MM_VERIFY_STATUS_UNKNOWN, + ETHTOOL_MM_VERIFY_STATUS_INITIAL, + ETHTOOL_MM_VERIFY_STATUS_VERIFYING, + ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED, + ETHTOOL_MM_VERIFY_STATUS_FAILED, + ETHTOOL_MM_VERIFY_STATUS_DISABLED, +}; + +/** + * struct ethtool_gstrings - string set for data tagging + * @cmd: Command number = %ETHTOOL_GSTRINGS + * @string_set: String set ID; one of &enum ethtool_stringset + * @len: On return, the number of strings in the string set + * @data: Buffer for strings. Each string is null-padded to a size of + * %ETH_GSTRING_LEN. + * + * Users must use %ETHTOOL_GSSET_INFO to find the number of strings in + * the string set. They must allocate a buffer of the appropriate + * size immediately following this structure. + */ +struct ethtool_gstrings { + __u32 cmd; + __u32 string_set; + __u32 len; + __u8 data[]; +}; + +/** + * struct ethtool_sset_info - string set information + * @cmd: Command number = %ETHTOOL_GSSET_INFO + * @reserved: Reserved for future use; see the note on reserved space. + * @sset_mask: On entry, a bitmask of string sets to query, with bits + * numbered according to &enum ethtool_stringset. On return, a + * bitmask of those string sets queried that are supported. + * @data: Buffer for string set sizes. On return, this contains the + * size of each string set that was queried and supported, in + * order of ID. + * + * Example: The user passes in @sset_mask = 0x7 (sets 0, 1, 2) and on + * return @sset_mask == 0x6 (sets 1, 2). Then @data[0] contains the + * size of set 1 and @data[1] contains the size of set 2. + * + * Users must allocate a buffer of the appropriate size (4 * number of + * sets queried) immediately following this structure. + */ +struct ethtool_sset_info { + __u32 cmd; + __u32 reserved; + __u64 sset_mask; + __u32 data[]; +}; + +/** + * enum ethtool_test_flags - flags definition of ethtool_test + * @ETH_TEST_FL_OFFLINE: if set perform online and offline tests, otherwise + * only online tests. + * @ETH_TEST_FL_FAILED: Driver set this flag if test fails. + * @ETH_TEST_FL_EXTERNAL_LB: Application request to perform external loopback + * test. + * @ETH_TEST_FL_EXTERNAL_LB_DONE: Driver performed the external loopback test + */ + +enum ethtool_test_flags { + ETH_TEST_FL_OFFLINE = (1 << 0), + ETH_TEST_FL_FAILED = (1 << 1), + ETH_TEST_FL_EXTERNAL_LB = (1 << 2), + ETH_TEST_FL_EXTERNAL_LB_DONE = (1 << 3), +}; + +/** + * struct ethtool_test - device self-test invocation + * @cmd: Command number = %ETHTOOL_TEST + * @flags: A bitmask of flags from &enum ethtool_test_flags. Some + * flags may be set by the user on entry; others may be set by + * the driver on return. + * @reserved: Reserved for future use; see the note on reserved space. + * @len: On return, the number of test results + * @data: Array of test results + * + * Users must use %ETHTOOL_GSSET_INFO or %ETHTOOL_GDRVINFO to find the + * number of test results that will be returned. They must allocate a + * buffer of the appropriate size (8 * number of results) immediately + * following this structure. + */ +struct ethtool_test { + __u32 cmd; + __u32 flags; + __u32 reserved; + __u32 len; + __u64 data[]; +}; + +/** + * struct ethtool_stats - device-specific statistics + * @cmd: Command number = %ETHTOOL_GSTATS + * @n_stats: On return, the number of statistics + * @data: Array of statistics + * + * Users must use %ETHTOOL_GSSET_INFO or %ETHTOOL_GDRVINFO to find the + * number of statistics that will be returned. They must allocate a + * buffer of the appropriate size (8 * number of statistics) + * immediately following this structure. + */ +struct ethtool_stats { + __u32 cmd; + __u32 n_stats; + __u64 data[]; +}; + +/** + * struct ethtool_perm_addr - permanent hardware address + * @cmd: Command number = %ETHTOOL_GPERMADDR + * @size: On entry, the size of the buffer. On return, the size of the + * address. The command fails if the buffer is too small. + * @data: Buffer for the address + * + * Users must allocate the buffer immediately following this structure. + * A buffer size of %MAX_ADDR_LEN should be sufficient for any address + * type. + */ +struct ethtool_perm_addr { + __u32 cmd; + __u32 size; + __u8 data[]; +}; + +/* boolean flags controlling per-interface behavior characteristics. + * When reading, the flag indicates whether or not a certain behavior + * is enabled/present. When writing, the flag indicates whether + * or not the driver should turn on (set) or off (clear) a behavior. + * + * Some behaviors may read-only (unconditionally absent or present). + * If such is the case, return EINVAL in the set-flags operation if the + * flag differs from the read-only value. + */ +enum ethtool_flags { + ETH_FLAG_TXVLAN = (1 << 7), /* TX VLAN offload enabled */ + ETH_FLAG_RXVLAN = (1 << 8), /* RX VLAN offload enabled */ + ETH_FLAG_LRO = (1 << 15), /* LRO is enabled */ + ETH_FLAG_NTUPLE = (1 << 27), /* N-tuple filters enabled */ + ETH_FLAG_RXHASH = (1 << 28), +}; + +/* The following structures are for supporting RX network flow + * classification and RX n-tuple configuration. Note, all multibyte + * fields, e.g., ip4src, ip4dst, psrc, pdst, spi, etc. are expected to + * be in network byte order. + */ + +/** + * struct ethtool_tcpip4_spec - flow specification for TCP/IPv4 etc. + * @ip4src: Source host + * @ip4dst: Destination host + * @psrc: Source port + * @pdst: Destination port + * @tos: Type-of-service + * + * This can be used to specify a TCP/IPv4, UDP/IPv4 or SCTP/IPv4 flow. + */ +struct ethtool_tcpip4_spec { + __be32 ip4src; + __be32 ip4dst; + __be16 psrc; + __be16 pdst; + __u8 tos; +}; + +/** + * struct ethtool_ah_espip4_spec - flow specification for IPsec/IPv4 + * @ip4src: Source host + * @ip4dst: Destination host + * @spi: Security parameters index + * @tos: Type-of-service + * + * This can be used to specify an IPsec transport or tunnel over IPv4. + */ +struct ethtool_ah_espip4_spec { + __be32 ip4src; + __be32 ip4dst; + __be32 spi; + __u8 tos; +}; + +#define ETH_RX_NFC_IP4 1 + +/** + * struct ethtool_usrip4_spec - general flow specification for IPv4 + * @ip4src: Source host + * @ip4dst: Destination host + * @l4_4_bytes: First 4 bytes of transport (layer 4) header + * @tos: Type-of-service + * @ip_ver: Value must be %ETH_RX_NFC_IP4; mask must be 0 + * @proto: Transport protocol number; mask must be 0 + */ +struct ethtool_usrip4_spec { + __be32 ip4src; + __be32 ip4dst; + __be32 l4_4_bytes; + __u8 tos; + __u8 ip_ver; + __u8 proto; +}; + +/** + * struct ethtool_tcpip6_spec - flow specification for TCP/IPv6 etc. + * @ip6src: Source host + * @ip6dst: Destination host + * @psrc: Source port + * @pdst: Destination port + * @tclass: Traffic Class + * + * This can be used to specify a TCP/IPv6, UDP/IPv6 or SCTP/IPv6 flow. + */ +struct ethtool_tcpip6_spec { + __be32 ip6src[4]; + __be32 ip6dst[4]; + __be16 psrc; + __be16 pdst; + __u8 tclass; +}; + +/** + * struct ethtool_ah_espip6_spec - flow specification for IPsec/IPv6 + * @ip6src: Source host + * @ip6dst: Destination host + * @spi: Security parameters index + * @tclass: Traffic Class + * + * This can be used to specify an IPsec transport or tunnel over IPv6. + */ +struct ethtool_ah_espip6_spec { + __be32 ip6src[4]; + __be32 ip6dst[4]; + __be32 spi; + __u8 tclass; +}; + +/** + * struct ethtool_usrip6_spec - general flow specification for IPv6 + * @ip6src: Source host + * @ip6dst: Destination host + * @l4_4_bytes: First 4 bytes of transport (layer 4) header + * @tclass: Traffic Class + * @l4_proto: Transport protocol number (nexthdr after any Extension Headers) + */ +struct ethtool_usrip6_spec { + __be32 ip6src[4]; + __be32 ip6dst[4]; + __be32 l4_4_bytes; + __u8 tclass; + __u8 l4_proto; +}; + +union ethtool_flow_union { + struct ethtool_tcpip4_spec tcp_ip4_spec; + struct ethtool_tcpip4_spec udp_ip4_spec; + struct ethtool_tcpip4_spec sctp_ip4_spec; + struct ethtool_ah_espip4_spec ah_ip4_spec; + struct ethtool_ah_espip4_spec esp_ip4_spec; + struct ethtool_usrip4_spec usr_ip4_spec; + struct ethtool_tcpip6_spec tcp_ip6_spec; + struct ethtool_tcpip6_spec udp_ip6_spec; + struct ethtool_tcpip6_spec sctp_ip6_spec; + struct ethtool_ah_espip6_spec ah_ip6_spec; + struct ethtool_ah_espip6_spec esp_ip6_spec; + struct ethtool_usrip6_spec usr_ip6_spec; + struct ethhdr ether_spec; + __u8 hdata[52]; +}; + +/** + * struct ethtool_flow_ext - additional RX flow fields + * @h_dest: destination MAC address + * @vlan_etype: VLAN EtherType + * @vlan_tci: VLAN tag control information + * @data: user defined data + * @padding: Reserved for future use; see the note on reserved space. + * + * Note, @vlan_etype, @vlan_tci, and @data are only valid if %FLOW_EXT + * is set in &struct ethtool_rx_flow_spec @flow_type. + * @h_dest is valid if %FLOW_MAC_EXT is set. + */ +struct ethtool_flow_ext { + __u8 padding[2]; + unsigned char h_dest[ETH_ALEN]; + __be16 vlan_etype; + __be16 vlan_tci; + __be32 data[2]; +}; + +/** + * struct ethtool_rx_flow_spec - classification rule for RX flows + * @flow_type: Type of match to perform, e.g. %TCP_V4_FLOW + * @h_u: Flow fields to match (dependent on @flow_type) + * @h_ext: Additional fields to match + * @m_u: Masks for flow field bits to be matched + * @m_ext: Masks for additional field bits to be matched + * Note, all additional fields must be ignored unless @flow_type + * includes the %FLOW_EXT or %FLOW_MAC_EXT flag + * (see &struct ethtool_flow_ext description). + * @ring_cookie: RX ring/queue index to deliver to, or %RX_CLS_FLOW_DISC + * if packets should be discarded, or %RX_CLS_FLOW_WAKE if the + * packets should be used for Wake-on-LAN with %WAKE_FILTER + * @location: Location of rule in the table. Locations must be + * numbered such that a flow matching multiple rules will be + * classified according to the first (lowest numbered) rule. + */ +struct ethtool_rx_flow_spec { + __u32 flow_type; + union ethtool_flow_union h_u; + struct ethtool_flow_ext h_ext; + union ethtool_flow_union m_u; + struct ethtool_flow_ext m_ext; + __u64 ring_cookie; + __u32 location; +}; + +/* How rings are laid out when accessing virtual functions or + * offloaded queues is device specific. To allow users to do flow + * steering and specify these queues the ring cookie is partitioned + * into a 32bit queue index with an 8 bit virtual function id. + * This also leaves the 3bytes for further specifiers. It is possible + * future devices may support more than 256 virtual functions if + * devices start supporting PCIe w/ARI. However at the moment I + * do not know of any devices that support this so I do not reserve + * space for this at this time. If a future patch consumes the next + * byte it should be aware of this possibility. + */ +#define ETHTOOL_RX_FLOW_SPEC_RING 0x00000000FFFFFFFFLL +#define ETHTOOL_RX_FLOW_SPEC_RING_VF 0x000000FF00000000LL +#define ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF 32 +static inline __u64 ethtool_get_flow_spec_ring(__u64 ring_cookie) +{ + return ETHTOOL_RX_FLOW_SPEC_RING & ring_cookie; +} + +static inline __u64 ethtool_get_flow_spec_ring_vf(__u64 ring_cookie) +{ + return (ETHTOOL_RX_FLOW_SPEC_RING_VF & ring_cookie) >> + ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF; +} + +/** + * struct ethtool_rxnfc - command to get or set RX flow classification rules + * @cmd: Specific command number - %ETHTOOL_GRXFH, %ETHTOOL_SRXFH, + * %ETHTOOL_GRXRINGS, %ETHTOOL_GRXCLSRLCNT, %ETHTOOL_GRXCLSRULE, + * %ETHTOOL_GRXCLSRLALL, %ETHTOOL_SRXCLSRLDEL or %ETHTOOL_SRXCLSRLINS + * @flow_type: Type of flow to be affected, e.g. %TCP_V4_FLOW + * @data: Command-dependent value + * @fs: Flow classification rule + * @rss_context: RSS context to be affected + * @rule_cnt: Number of rules to be affected + * @rule_locs: Array of used rule locations + * + * For %ETHTOOL_GRXFH and %ETHTOOL_SRXFH, @data is a bitmask indicating + * the fields included in the flow hash, e.g. %RXH_IP_SRC. The following + * structure fields must not be used, except that if @flow_type includes + * the %FLOW_RSS flag, then @rss_context determines which RSS context to + * act on. + * + * For %ETHTOOL_GRXRINGS, @data is set to the number of RX rings/queues + * on return. + * + * For %ETHTOOL_GRXCLSRLCNT, @rule_cnt is set to the number of defined + * rules on return. If @data is non-zero on return then it is the + * size of the rule table, plus the flag %RX_CLS_LOC_SPECIAL if the + * driver supports any special location values. If that flag is not + * set in @data then special location values should not be used. + * + * For %ETHTOOL_GRXCLSRULE, @fs.@location specifies the location of an + * existing rule on entry and @fs contains the rule on return; if + * @fs.@flow_type includes the %FLOW_RSS flag, then @rss_context is + * filled with the RSS context ID associated with the rule. + * + * For %ETHTOOL_GRXCLSRLALL, @rule_cnt specifies the array size of the + * user buffer for @rule_locs on entry. On return, @data is the size + * of the rule table, @rule_cnt is the number of defined rules, and + * @rule_locs contains the locations of the defined rules. Drivers + * must use the second parameter to get_rxnfc() instead of @rule_locs. + * + * For %ETHTOOL_SRXCLSRLINS, @fs specifies the rule to add or update. + * @fs.@location either specifies the location to use or is a special + * location value with %RX_CLS_LOC_SPECIAL flag set. On return, + * @fs.@location is the actual rule location. If @fs.@flow_type + * includes the %FLOW_RSS flag, @rss_context is the RSS context ID to + * use for flow spreading traffic which matches this rule. The value + * from the rxfh indirection table will be added to @fs.@ring_cookie + * to choose which ring to deliver to. + * + * For %ETHTOOL_SRXCLSRLDEL, @fs.@location specifies the location of an + * existing rule on entry. + * + * A driver supporting the special location values for + * %ETHTOOL_SRXCLSRLINS may add the rule at any suitable unused + * location, and may remove a rule at a later location (lower + * priority) that matches exactly the same set of flows. The special + * values are %RX_CLS_LOC_ANY, selecting any location; + * %RX_CLS_LOC_FIRST, selecting the first suitable location (maximum + * priority); and %RX_CLS_LOC_LAST, selecting the last suitable + * location (minimum priority). Additional special values may be + * defined in future and drivers must return -%EINVAL for any + * unrecognised value. + */ +struct ethtool_rxnfc { + __u32 cmd; + __u32 flow_type; + __u64 data; + struct ethtool_rx_flow_spec fs; + union { + __u32 rule_cnt; + __u32 rss_context; + }; + __u32 rule_locs[]; +}; + + +/** + * struct ethtool_rxfh_indir - command to get or set RX flow hash indirection + * @cmd: Specific command number - %ETHTOOL_GRXFHINDIR or %ETHTOOL_SRXFHINDIR + * @size: On entry, the array size of the user buffer, which may be zero. + * On return from %ETHTOOL_GRXFHINDIR, the array size of the hardware + * indirection table. + * @ring_index: RX ring/queue index for each hash value + * + * For %ETHTOOL_GRXFHINDIR, a @size of zero means that only the size + * should be returned. For %ETHTOOL_SRXFHINDIR, a @size of zero means + * the table should be reset to default values. This last feature + * is not supported by the original implementations. + */ +struct ethtool_rxfh_indir { + __u32 cmd; + __u32 size; + __u32 ring_index[]; +}; + +/** + * struct ethtool_rxfh - command to get/set RX flow hash indir or/and hash key. + * @cmd: Specific command number - %ETHTOOL_GRSSH or %ETHTOOL_SRSSH + * @rss_context: RSS context identifier. Context 0 is the default for normal + * traffic; other contexts can be referenced as the destination for RX flow + * classification rules. %ETH_RXFH_CONTEXT_ALLOC is used with command + * %ETHTOOL_SRSSH to allocate a new RSS context; on return this field will + * contain the ID of the newly allocated context. + * @indir_size: On entry, the array size of the user buffer for the + * indirection table, which may be zero, or (for %ETHTOOL_SRSSH), + * %ETH_RXFH_INDIR_NO_CHANGE. On return from %ETHTOOL_GRSSH, + * the array size of the hardware indirection table. + * @key_size: On entry, the array size of the user buffer for the hash key, + * which may be zero. On return from %ETHTOOL_GRSSH, the size of the + * hardware hash key. + * @hfunc: Defines the current RSS hash function used by HW (or to be set to). + * Valid values are one of the %ETH_RSS_HASH_*. + * @input_xfrm: Defines how the input data is transformed. Valid values are one + * of %RXH_XFRM_*. + * @rsvd8: Reserved for future use; see the note on reserved space. + * @rsvd32: Reserved for future use; see the note on reserved space. + * @rss_config: RX ring/queue index for each hash value i.e., indirection table + * of @indir_size __u32 elements, followed by hash key of @key_size + * bytes. + * + * For %ETHTOOL_GRSSH, a @indir_size and key_size of zero means that only the + * size should be returned. For %ETHTOOL_SRSSH, an @indir_size of + * %ETH_RXFH_INDIR_NO_CHANGE means that indir table setting is not requested + * and a @indir_size of zero means the indir table should be reset to default + * values (if @rss_context == 0) or that the RSS context should be deleted. + * An hfunc of zero means that hash function setting is not requested. + */ +struct ethtool_rxfh { + __u32 cmd; + __u32 rss_context; + __u32 indir_size; + __u32 key_size; + __u8 hfunc; + __u8 input_xfrm; + __u8 rsvd8[2]; + __u32 rsvd32; + __u32 rss_config[]; +}; +#define ETH_RXFH_CONTEXT_ALLOC 0xffffffff +#define ETH_RXFH_INDIR_NO_CHANGE 0xffffffff + +/** + * struct ethtool_rx_ntuple_flow_spec - specification for RX flow filter + * @flow_type: Type of match to perform, e.g. %TCP_V4_FLOW + * @h_u: Flow field values to match (dependent on @flow_type) + * @m_u: Masks for flow field value bits to be ignored + * @vlan_tag: VLAN tag to match + * @vlan_tag_mask: Mask for VLAN tag bits to be ignored + * @data: Driver-dependent data to match + * @data_mask: Mask for driver-dependent data bits to be ignored + * @action: RX ring/queue index to deliver to (non-negative) or other action + * (negative, e.g. %ETHTOOL_RXNTUPLE_ACTION_DROP) + * + * For flow types %TCP_V4_FLOW, %UDP_V4_FLOW and %SCTP_V4_FLOW, where + * a field value and mask are both zero this is treated as if all mask + * bits are set i.e. the field is ignored. + */ +struct ethtool_rx_ntuple_flow_spec { + __u32 flow_type; + union { + struct ethtool_tcpip4_spec tcp_ip4_spec; + struct ethtool_tcpip4_spec udp_ip4_spec; + struct ethtool_tcpip4_spec sctp_ip4_spec; + struct ethtool_ah_espip4_spec ah_ip4_spec; + struct ethtool_ah_espip4_spec esp_ip4_spec; + struct ethtool_usrip4_spec usr_ip4_spec; + struct ethhdr ether_spec; + __u8 hdata[72]; + } h_u, m_u; + + __u16 vlan_tag; + __u16 vlan_tag_mask; + __u64 data; + __u64 data_mask; + + __s32 action; +#define ETHTOOL_RXNTUPLE_ACTION_DROP (-1) /* drop packet */ +#define ETHTOOL_RXNTUPLE_ACTION_CLEAR (-2) /* clear filter */ +}; + +/** + * struct ethtool_rx_ntuple - command to set or clear RX flow filter + * @cmd: Command number - %ETHTOOL_SRXNTUPLE + * @fs: Flow filter specification + */ +struct ethtool_rx_ntuple { + __u32 cmd; + struct ethtool_rx_ntuple_flow_spec fs; +}; + +#define ETHTOOL_FLASH_MAX_FILENAME 128 +enum ethtool_flash_op_type { + ETHTOOL_FLASH_ALL_REGIONS = 0, +}; + +/* for passing firmware flashing related parameters */ +struct ethtool_flash { + __u32 cmd; + __u32 region; + char data[ETHTOOL_FLASH_MAX_FILENAME]; +}; + +/** + * struct ethtool_dump - used for retrieving, setting device dump + * @cmd: Command number - %ETHTOOL_GET_DUMP_FLAG, %ETHTOOL_GET_DUMP_DATA, or + * %ETHTOOL_SET_DUMP + * @version: FW version of the dump, filled in by driver + * @flag: driver dependent flag for dump setting, filled in by driver during + * get and filled in by ethtool for set operation. + * flag must be initialized by macro ETH_FW_DUMP_DISABLE value when + * firmware dump is disabled. + * @len: length of dump data, used as the length of the user buffer on entry to + * %ETHTOOL_GET_DUMP_DATA and this is returned as dump length by driver + * for %ETHTOOL_GET_DUMP_FLAG command + * @data: data collected for get dump data operation + */ +struct ethtool_dump { + __u32 cmd; + __u32 version; + __u32 flag; + __u32 len; + __u8 data[]; +}; + +#define ETH_FW_DUMP_DISABLE 0 + +/* for returning and changing feature sets */ + +/** + * struct ethtool_get_features_block - block with state of 32 features + * @available: mask of changeable features + * @requested: mask of features requested to be enabled if possible + * @active: mask of currently enabled features + * @never_changed: mask of features not changeable for any device + */ +struct ethtool_get_features_block { + __u32 available; + __u32 requested; + __u32 active; + __u32 never_changed; +}; + +/** + * struct ethtool_gfeatures - command to get state of device's features + * @cmd: command number = %ETHTOOL_GFEATURES + * @size: On entry, the number of elements in the features[] array; + * on return, the number of elements in features[] needed to hold + * all features + * @features: state of features + */ +struct ethtool_gfeatures { + __u32 cmd; + __u32 size; + struct ethtool_get_features_block features[]; +}; + +/** + * struct ethtool_set_features_block - block with request for 32 features + * @valid: mask of features to be changed + * @requested: values of features to be changed + */ +struct ethtool_set_features_block { + __u32 valid; + __u32 requested; +}; + +/** + * struct ethtool_sfeatures - command to request change in device's features + * @cmd: command number = %ETHTOOL_SFEATURES + * @size: array size of the features[] array + * @features: feature change masks + */ +struct ethtool_sfeatures { + __u32 cmd; + __u32 size; + struct ethtool_set_features_block features[]; +}; + +/** + * struct ethtool_ts_info - holds a device's timestamping and PHC association + * @cmd: command number = %ETHTOOL_GET_TS_INFO + * @so_timestamping: bit mask of the sum of the supported SO_TIMESTAMPING flags + * @phc_index: device index of the associated PHC, or -1 if there is none + * @tx_types: bit mask of the supported hwtstamp_tx_types enumeration values + * @tx_reserved: Reserved for future use; see the note on reserved space. + * @rx_filters: bit mask of the supported hwtstamp_rx_filters enumeration values + * @rx_reserved: Reserved for future use; see the note on reserved space. + * + * The bits in the 'tx_types' and 'rx_filters' fields correspond to + * the 'hwtstamp_tx_types' and 'hwtstamp_rx_filters' enumeration values, + * respectively. For example, if the device supports HWTSTAMP_TX_ON, + * then (1 << HWTSTAMP_TX_ON) in 'tx_types' will be set. + * + * Drivers should only report the filters they actually support without + * upscaling in the SIOCSHWTSTAMP ioctl. If the SIOCSHWSTAMP request for + * HWTSTAMP_FILTER_V1_SYNC is supported by HWTSTAMP_FILTER_V1_EVENT, then the + * driver should only report HWTSTAMP_FILTER_V1_EVENT in this op. + */ +struct ethtool_ts_info { + __u32 cmd; + __u32 so_timestamping; + __s32 phc_index; + __u32 tx_types; + __u32 tx_reserved[3]; + __u32 rx_filters; + __u32 rx_reserved[3]; +}; + +/* + * %ETHTOOL_SFEATURES changes features present in features[].valid to the + * values of corresponding bits in features[].requested. Bits in .requested + * not set in .valid or not changeable are ignored. + * + * Returns %EINVAL when .valid contains undefined or never-changeable bits + * or size is not equal to required number of features words (32-bit blocks). + * Returns >= 0 if request was completed; bits set in the value mean: + * %ETHTOOL_F_UNSUPPORTED - there were bits set in .valid that are not + * changeable (not present in %ETHTOOL_GFEATURES' features[].available) + * those bits were ignored. + * %ETHTOOL_F_WISH - some or all changes requested were recorded but the + * resulting state of bits masked by .valid is not equal to .requested. + * Probably there are other device-specific constraints on some features + * in the set. When %ETHTOOL_F_UNSUPPORTED is set, .valid is considered + * here as though ignored bits were cleared. + * %ETHTOOL_F_COMPAT - some or all changes requested were made by calling + * compatibility functions. Requested offload state cannot be properly + * managed by kernel. + * + * Meaning of bits in the masks are obtained by %ETHTOOL_GSSET_INFO (number of + * bits in the arrays - always multiple of 32) and %ETHTOOL_GSTRINGS commands + * for ETH_SS_FEATURES string set. First entry in the table corresponds to least + * significant bit in features[0] fields. Empty strings mark undefined features. + */ +enum ethtool_sfeatures_retval_bits { + ETHTOOL_F_UNSUPPORTED__BIT, + ETHTOOL_F_WISH__BIT, + ETHTOOL_F_COMPAT__BIT, +}; + +#define ETHTOOL_F_UNSUPPORTED (1 << ETHTOOL_F_UNSUPPORTED__BIT) +#define ETHTOOL_F_WISH (1 << ETHTOOL_F_WISH__BIT) +#define ETHTOOL_F_COMPAT (1 << ETHTOOL_F_COMPAT__BIT) + +#define MAX_NUM_QUEUE 4096 + +/** + * struct ethtool_per_queue_op - apply sub command to the queues in mask. + * @cmd: ETHTOOL_PERQUEUE + * @sub_command: the sub command which apply to each queues + * @queue_mask: Bitmap of the queues which sub command apply to + * @data: A complete command structure following for each of the queues addressed + */ +struct ethtool_per_queue_op { + __u32 cmd; + __u32 sub_command; + __u32 queue_mask[__KERNEL_DIV_ROUND_UP(MAX_NUM_QUEUE, 32)]; + char data[]; +}; + +/** + * struct ethtool_fecparam - Ethernet Forward Error Correction parameters + * @cmd: Command number = %ETHTOOL_GFECPARAM or %ETHTOOL_SFECPARAM + * @active_fec: FEC mode which is active on the port, single bit set, GET only. + * @fec: Bitmask of configured FEC modes. + * @reserved: Reserved for future extensions, ignore on GET, write 0 for SET. + * + * Note that @reserved was never validated on input and ethtool user space + * left it uninitialized when calling SET. Hence going forward it can only be + * used to return a value to userspace with GET. + * + * FEC modes supported by the device can be read via %ETHTOOL_GLINKSETTINGS. + * FEC settings are configured by link autonegotiation whenever it's enabled. + * With autoneg on %ETHTOOL_GFECPARAM can be used to read the current mode. + * + * When autoneg is disabled %ETHTOOL_SFECPARAM controls the FEC settings. + * It is recommended that drivers only accept a single bit set in @fec. + * When multiple bits are set in @fec drivers may pick mode in an implementation + * dependent way. Drivers should reject mixing %ETHTOOL_FEC_AUTO_BIT with other + * FEC modes, because it's unclear whether in this case other modes constrain + * AUTO or are independent choices. + * Drivers must reject SET requests if they support none of the requested modes. + * + * If device does not support FEC drivers may use %ETHTOOL_FEC_NONE instead + * of returning %EOPNOTSUPP from %ETHTOOL_GFECPARAM. + * + * See enum ethtool_fec_config_bits for definition of valid bits for both + * @fec and @active_fec. + */ +struct ethtool_fecparam { + __u32 cmd; + /* bitmask of FEC modes */ + __u32 active_fec; + __u32 fec; + __u32 reserved; +}; + +/** + * enum ethtool_fec_config_bits - flags definition of ethtool_fec_configuration + * @ETHTOOL_FEC_NONE_BIT: FEC mode configuration is not supported. Should not + * be used together with other bits. GET only. + * @ETHTOOL_FEC_AUTO_BIT: Select default/best FEC mode automatically, usually + * based link mode and SFP parameters read from module's + * EEPROM. This bit does _not_ mean autonegotiation. + * @ETHTOOL_FEC_OFF_BIT: No FEC Mode + * @ETHTOOL_FEC_RS_BIT: Reed-Solomon FEC Mode + * @ETHTOOL_FEC_BASER_BIT: Base-R/Reed-Solomon FEC Mode + * @ETHTOOL_FEC_LLRS_BIT: Low Latency Reed Solomon FEC Mode (25G/50G Ethernet + * Consortium) + */ +enum ethtool_fec_config_bits { + ETHTOOL_FEC_NONE_BIT, + ETHTOOL_FEC_AUTO_BIT, + ETHTOOL_FEC_OFF_BIT, + ETHTOOL_FEC_RS_BIT, + ETHTOOL_FEC_BASER_BIT, + ETHTOOL_FEC_LLRS_BIT, +}; + +#define ETHTOOL_FEC_NONE (1 << ETHTOOL_FEC_NONE_BIT) +#define ETHTOOL_FEC_AUTO (1 << ETHTOOL_FEC_AUTO_BIT) +#define ETHTOOL_FEC_OFF (1 << ETHTOOL_FEC_OFF_BIT) +#define ETHTOOL_FEC_RS (1 << ETHTOOL_FEC_RS_BIT) +#define ETHTOOL_FEC_BASER (1 << ETHTOOL_FEC_BASER_BIT) +#define ETHTOOL_FEC_LLRS (1 << ETHTOOL_FEC_LLRS_BIT) + +/* CMDs currently supported */ +#define ETHTOOL_GSET 0x00000001 /* DEPRECATED, Get settings. + * Please use ETHTOOL_GLINKSETTINGS + */ +#define ETHTOOL_SSET 0x00000002 /* DEPRECATED, Set settings. + * Please use ETHTOOL_SLINKSETTINGS + */ +#define ETHTOOL_GDRVINFO 0x00000003 /* Get driver info. */ +#define ETHTOOL_GREGS 0x00000004 /* Get NIC registers. */ +#define ETHTOOL_GWOL 0x00000005 /* Get wake-on-lan options. */ +#define ETHTOOL_SWOL 0x00000006 /* Set wake-on-lan options. */ +#define ETHTOOL_GMSGLVL 0x00000007 /* Get driver message level */ +#define ETHTOOL_SMSGLVL 0x00000008 /* Set driver msg level. */ +#define ETHTOOL_NWAY_RST 0x00000009 /* Restart autonegotiation. */ +/* Get link status for host, i.e. whether the interface *and* the + * physical port (if there is one) are up (ethtool_value). */ +#define ETHTOOL_GLINK 0x0000000a +#define ETHTOOL_GEEPROM 0x0000000b /* Get EEPROM data */ +#define ETHTOOL_SEEPROM 0x0000000c /* Set EEPROM data. */ +#define ETHTOOL_GCOALESCE 0x0000000e /* Get coalesce config */ +#define ETHTOOL_SCOALESCE 0x0000000f /* Set coalesce config. */ +#define ETHTOOL_GRINGPARAM 0x00000010 /* Get ring parameters */ +#define ETHTOOL_SRINGPARAM 0x00000011 /* Set ring parameters. */ +#define ETHTOOL_GPAUSEPARAM 0x00000012 /* Get pause parameters */ +#define ETHTOOL_SPAUSEPARAM 0x00000013 /* Set pause parameters. */ +#define ETHTOOL_GRXCSUM 0x00000014 /* Get RX hw csum enable (ethtool_value) */ +#define ETHTOOL_SRXCSUM 0x00000015 /* Set RX hw csum enable (ethtool_value) */ +#define ETHTOOL_GTXCSUM 0x00000016 /* Get TX hw csum enable (ethtool_value) */ +#define ETHTOOL_STXCSUM 0x00000017 /* Set TX hw csum enable (ethtool_value) */ +#define ETHTOOL_GSG 0x00000018 /* Get scatter-gather enable + * (ethtool_value) */ +#define ETHTOOL_SSG 0x00000019 /* Set scatter-gather enable + * (ethtool_value). */ +#define ETHTOOL_TEST 0x0000001a /* execute NIC self-test. */ +#define ETHTOOL_GSTRINGS 0x0000001b /* get specified string set */ +#define ETHTOOL_PHYS_ID 0x0000001c /* identify the NIC */ +#define ETHTOOL_GSTATS 0x0000001d /* get NIC-specific statistics */ +#define ETHTOOL_GTSO 0x0000001e /* Get TSO enable (ethtool_value) */ +#define ETHTOOL_STSO 0x0000001f /* Set TSO enable (ethtool_value) */ +#define ETHTOOL_GPERMADDR 0x00000020 /* Get permanent hardware address */ +#define ETHTOOL_GUFO 0x00000021 /* Get UFO enable (ethtool_value) */ +#define ETHTOOL_SUFO 0x00000022 /* Set UFO enable (ethtool_value) */ +#define ETHTOOL_GGSO 0x00000023 /* Get GSO enable (ethtool_value) */ +#define ETHTOOL_SGSO 0x00000024 /* Set GSO enable (ethtool_value) */ +#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */ +#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */ +#define ETHTOOL_GPFLAGS 0x00000027 /* Get driver-private flags bitmap */ +#define ETHTOOL_SPFLAGS 0x00000028 /* Set driver-private flags bitmap */ + +#define ETHTOOL_GRXFH 0x00000029 /* Get RX flow hash configuration */ +#define ETHTOOL_SRXFH 0x0000002a /* Set RX flow hash configuration */ +#define ETHTOOL_GGRO 0x0000002b /* Get GRO enable (ethtool_value) */ +#define ETHTOOL_SGRO 0x0000002c /* Set GRO enable (ethtool_value) */ +#define ETHTOOL_GRXRINGS 0x0000002d /* Get RX rings available for LB */ +#define ETHTOOL_GRXCLSRLCNT 0x0000002e /* Get RX class rule count */ +#define ETHTOOL_GRXCLSRULE 0x0000002f /* Get RX classification rule */ +#define ETHTOOL_GRXCLSRLALL 0x00000030 /* Get all RX classification rule */ +#define ETHTOOL_SRXCLSRLDEL 0x00000031 /* Delete RX classification rule */ +#define ETHTOOL_SRXCLSRLINS 0x00000032 /* Insert RX classification rule */ +#define ETHTOOL_FLASHDEV 0x00000033 /* Flash firmware to device */ +#define ETHTOOL_RESET 0x00000034 /* Reset hardware */ +#define ETHTOOL_SRXNTUPLE 0x00000035 /* Add an n-tuple filter to device */ +#define ETHTOOL_GRXNTUPLE 0x00000036 /* deprecated */ +#define ETHTOOL_GSSET_INFO 0x00000037 /* Get string set info */ +#define ETHTOOL_GRXFHINDIR 0x00000038 /* Get RX flow hash indir'n table */ +#define ETHTOOL_SRXFHINDIR 0x00000039 /* Set RX flow hash indir'n table */ + +#define ETHTOOL_GFEATURES 0x0000003a /* Get device offload settings */ +#define ETHTOOL_SFEATURES 0x0000003b /* Change device offload settings */ +#define ETHTOOL_GCHANNELS 0x0000003c /* Get no of channels */ +#define ETHTOOL_SCHANNELS 0x0000003d /* Set no of channels */ +#define ETHTOOL_SET_DUMP 0x0000003e /* Set dump settings */ +#define ETHTOOL_GET_DUMP_FLAG 0x0000003f /* Get dump settings */ +#define ETHTOOL_GET_DUMP_DATA 0x00000040 /* Get dump data */ +#define ETHTOOL_GET_TS_INFO 0x00000041 /* Get time stamping and PHC info */ +#define ETHTOOL_GMODULEINFO 0x00000042 /* Get plug-in module information */ +#define ETHTOOL_GMODULEEEPROM 0x00000043 /* Get plug-in module eeprom */ +#define ETHTOOL_GEEE 0x00000044 /* Get EEE settings */ +#define ETHTOOL_SEEE 0x00000045 /* Set EEE settings */ + +#define ETHTOOL_GRSSH 0x00000046 /* Get RX flow hash configuration */ +#define ETHTOOL_SRSSH 0x00000047 /* Set RX flow hash configuration */ +#define ETHTOOL_GTUNABLE 0x00000048 /* Get tunable configuration */ +#define ETHTOOL_STUNABLE 0x00000049 /* Set tunable configuration */ +#define ETHTOOL_GPHYSTATS 0x0000004a /* get PHY-specific statistics */ + +#define ETHTOOL_PERQUEUE 0x0000004b /* Set per queue options */ + +#define ETHTOOL_GLINKSETTINGS 0x0000004c /* Get ethtool_link_settings */ +#define ETHTOOL_SLINKSETTINGS 0x0000004d /* Set ethtool_link_settings */ +#define ETHTOOL_PHY_GTUNABLE 0x0000004e /* Get PHY tunable configuration */ +#define ETHTOOL_PHY_STUNABLE 0x0000004f /* Set PHY tunable configuration */ +#define ETHTOOL_GFECPARAM 0x00000050 /* Get FEC settings */ +#define ETHTOOL_SFECPARAM 0x00000051 /* Set FEC settings */ + +/* compatibility with older code */ +#define SPARC_ETH_GSET ETHTOOL_GSET +#define SPARC_ETH_SSET ETHTOOL_SSET + +/* Link mode bit indices */ +enum ethtool_link_mode_bit_indices { + ETHTOOL_LINK_MODE_10baseT_Half_BIT = 0, + ETHTOOL_LINK_MODE_10baseT_Full_BIT = 1, + ETHTOOL_LINK_MODE_100baseT_Half_BIT = 2, + ETHTOOL_LINK_MODE_100baseT_Full_BIT = 3, + ETHTOOL_LINK_MODE_1000baseT_Half_BIT = 4, + ETHTOOL_LINK_MODE_1000baseT_Full_BIT = 5, + ETHTOOL_LINK_MODE_Autoneg_BIT = 6, + ETHTOOL_LINK_MODE_TP_BIT = 7, + ETHTOOL_LINK_MODE_AUI_BIT = 8, + ETHTOOL_LINK_MODE_MII_BIT = 9, + ETHTOOL_LINK_MODE_FIBRE_BIT = 10, + ETHTOOL_LINK_MODE_BNC_BIT = 11, + ETHTOOL_LINK_MODE_10000baseT_Full_BIT = 12, + ETHTOOL_LINK_MODE_Pause_BIT = 13, + ETHTOOL_LINK_MODE_Asym_Pause_BIT = 14, + ETHTOOL_LINK_MODE_2500baseX_Full_BIT = 15, + ETHTOOL_LINK_MODE_Backplane_BIT = 16, + ETHTOOL_LINK_MODE_1000baseKX_Full_BIT = 17, + ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT = 18, + ETHTOOL_LINK_MODE_10000baseKR_Full_BIT = 19, + ETHTOOL_LINK_MODE_10000baseR_FEC_BIT = 20, + ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT = 21, + ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT = 22, + ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT = 23, + ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT = 24, + ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT = 25, + ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT = 26, + ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT = 27, + ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT = 28, + ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT = 29, + ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT = 30, + ETHTOOL_LINK_MODE_25000baseCR_Full_BIT = 31, + + /* Last allowed bit for __ETHTOOL_LINK_MODE_LEGACY_MASK is bit + * 31. Please do NOT define any SUPPORTED_* or ADVERTISED_* + * macro for bits > 31. The only way to use indices > 31 is to + * use the new ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. + */ + + ETHTOOL_LINK_MODE_25000baseKR_Full_BIT = 32, + ETHTOOL_LINK_MODE_25000baseSR_Full_BIT = 33, + ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT = 34, + ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT = 35, + ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT = 36, + ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT = 37, + ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT = 38, + ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT = 39, + ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT = 40, + ETHTOOL_LINK_MODE_1000baseX_Full_BIT = 41, + ETHTOOL_LINK_MODE_10000baseCR_Full_BIT = 42, + ETHTOOL_LINK_MODE_10000baseSR_Full_BIT = 43, + ETHTOOL_LINK_MODE_10000baseLR_Full_BIT = 44, + ETHTOOL_LINK_MODE_10000baseLRM_Full_BIT = 45, + ETHTOOL_LINK_MODE_10000baseER_Full_BIT = 46, + ETHTOOL_LINK_MODE_2500baseT_Full_BIT = 47, + ETHTOOL_LINK_MODE_5000baseT_Full_BIT = 48, + + ETHTOOL_LINK_MODE_FEC_NONE_BIT = 49, + ETHTOOL_LINK_MODE_FEC_RS_BIT = 50, + ETHTOOL_LINK_MODE_FEC_BASER_BIT = 51, + ETHTOOL_LINK_MODE_50000baseKR_Full_BIT = 52, + ETHTOOL_LINK_MODE_50000baseSR_Full_BIT = 53, + ETHTOOL_LINK_MODE_50000baseCR_Full_BIT = 54, + ETHTOOL_LINK_MODE_50000baseLR_ER_FR_Full_BIT = 55, + ETHTOOL_LINK_MODE_50000baseDR_Full_BIT = 56, + ETHTOOL_LINK_MODE_100000baseKR2_Full_BIT = 57, + ETHTOOL_LINK_MODE_100000baseSR2_Full_BIT = 58, + ETHTOOL_LINK_MODE_100000baseCR2_Full_BIT = 59, + ETHTOOL_LINK_MODE_100000baseLR2_ER2_FR2_Full_BIT = 60, + ETHTOOL_LINK_MODE_100000baseDR2_Full_BIT = 61, + ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT = 62, + ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT = 63, + ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT = 64, + ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT = 65, + ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT = 66, + ETHTOOL_LINK_MODE_100baseT1_Full_BIT = 67, + ETHTOOL_LINK_MODE_1000baseT1_Full_BIT = 68, + ETHTOOL_LINK_MODE_400000baseKR8_Full_BIT = 69, + ETHTOOL_LINK_MODE_400000baseSR8_Full_BIT = 70, + ETHTOOL_LINK_MODE_400000baseLR8_ER8_FR8_Full_BIT = 71, + ETHTOOL_LINK_MODE_400000baseDR8_Full_BIT = 72, + ETHTOOL_LINK_MODE_400000baseCR8_Full_BIT = 73, + ETHTOOL_LINK_MODE_FEC_LLRS_BIT = 74, + ETHTOOL_LINK_MODE_100000baseKR_Full_BIT = 75, + ETHTOOL_LINK_MODE_100000baseSR_Full_BIT = 76, + ETHTOOL_LINK_MODE_100000baseLR_ER_FR_Full_BIT = 77, + ETHTOOL_LINK_MODE_100000baseCR_Full_BIT = 78, + ETHTOOL_LINK_MODE_100000baseDR_Full_BIT = 79, + ETHTOOL_LINK_MODE_200000baseKR2_Full_BIT = 80, + ETHTOOL_LINK_MODE_200000baseSR2_Full_BIT = 81, + ETHTOOL_LINK_MODE_200000baseLR2_ER2_FR2_Full_BIT = 82, + ETHTOOL_LINK_MODE_200000baseDR2_Full_BIT = 83, + ETHTOOL_LINK_MODE_200000baseCR2_Full_BIT = 84, + ETHTOOL_LINK_MODE_400000baseKR4_Full_BIT = 85, + ETHTOOL_LINK_MODE_400000baseSR4_Full_BIT = 86, + ETHTOOL_LINK_MODE_400000baseLR4_ER4_FR4_Full_BIT = 87, + ETHTOOL_LINK_MODE_400000baseDR4_Full_BIT = 88, + ETHTOOL_LINK_MODE_400000baseCR4_Full_BIT = 89, + ETHTOOL_LINK_MODE_100baseFX_Half_BIT = 90, + ETHTOOL_LINK_MODE_100baseFX_Full_BIT = 91, + ETHTOOL_LINK_MODE_10baseT1L_Full_BIT = 92, + ETHTOOL_LINK_MODE_800000baseCR8_Full_BIT = 93, + ETHTOOL_LINK_MODE_800000baseKR8_Full_BIT = 94, + ETHTOOL_LINK_MODE_800000baseDR8_Full_BIT = 95, + ETHTOOL_LINK_MODE_800000baseDR8_2_Full_BIT = 96, + ETHTOOL_LINK_MODE_800000baseSR8_Full_BIT = 97, + ETHTOOL_LINK_MODE_800000baseVR8_Full_BIT = 98, + ETHTOOL_LINK_MODE_10baseT1S_Full_BIT = 99, + ETHTOOL_LINK_MODE_10baseT1S_Half_BIT = 100, + ETHTOOL_LINK_MODE_10baseT1S_P2MP_Half_BIT = 101, + + /* must be last entry */ + __ETHTOOL_LINK_MODE_MASK_NBITS +}; + +#define __ETHTOOL_LINK_MODE_LEGACY_MASK(base_name) \ + (1UL << (ETHTOOL_LINK_MODE_ ## base_name ## _BIT)) + +/* DEPRECATED macros. Please migrate to + * ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. Please do NOT + * define any new SUPPORTED_* macro for bits > 31. + */ +#define SUPPORTED_10baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Half) +#define SUPPORTED_10baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Full) +#define SUPPORTED_100baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Half) +#define SUPPORTED_100baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Full) +#define SUPPORTED_1000baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Half) +#define SUPPORTED_1000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Full) +#define SUPPORTED_Autoneg __ETHTOOL_LINK_MODE_LEGACY_MASK(Autoneg) +#define SUPPORTED_TP __ETHTOOL_LINK_MODE_LEGACY_MASK(TP) +#define SUPPORTED_AUI __ETHTOOL_LINK_MODE_LEGACY_MASK(AUI) +#define SUPPORTED_MII __ETHTOOL_LINK_MODE_LEGACY_MASK(MII) +#define SUPPORTED_FIBRE __ETHTOOL_LINK_MODE_LEGACY_MASK(FIBRE) +#define SUPPORTED_BNC __ETHTOOL_LINK_MODE_LEGACY_MASK(BNC) +#define SUPPORTED_10000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseT_Full) +#define SUPPORTED_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Pause) +#define SUPPORTED_Asym_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Asym_Pause) +#define SUPPORTED_2500baseX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(2500baseX_Full) +#define SUPPORTED_Backplane __ETHTOOL_LINK_MODE_LEGACY_MASK(Backplane) +#define SUPPORTED_1000baseKX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseKX_Full) +#define SUPPORTED_10000baseKX4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKX4_Full) +#define SUPPORTED_10000baseKR_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKR_Full) +#define SUPPORTED_10000baseR_FEC __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseR_FEC) +#define SUPPORTED_20000baseMLD2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseMLD2_Full) +#define SUPPORTED_20000baseKR2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseKR2_Full) +#define SUPPORTED_40000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseKR4_Full) +#define SUPPORTED_40000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseCR4_Full) +#define SUPPORTED_40000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseSR4_Full) +#define SUPPORTED_40000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseLR4_Full) +#define SUPPORTED_56000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseKR4_Full) +#define SUPPORTED_56000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseCR4_Full) +#define SUPPORTED_56000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseSR4_Full) +#define SUPPORTED_56000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseLR4_Full) +/* Please do not define any new SUPPORTED_* macro for bits > 31, see + * notice above. + */ + +/* + * DEPRECATED macros. Please migrate to + * ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. Please do NOT + * define any new ADERTISE_* macro for bits > 31. + */ +#define ADVERTISED_10baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Half) +#define ADVERTISED_10baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Full) +#define ADVERTISED_100baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Half) +#define ADVERTISED_100baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Full) +#define ADVERTISED_1000baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Half) +#define ADVERTISED_1000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Full) +#define ADVERTISED_Autoneg __ETHTOOL_LINK_MODE_LEGACY_MASK(Autoneg) +#define ADVERTISED_TP __ETHTOOL_LINK_MODE_LEGACY_MASK(TP) +#define ADVERTISED_AUI __ETHTOOL_LINK_MODE_LEGACY_MASK(AUI) +#define ADVERTISED_MII __ETHTOOL_LINK_MODE_LEGACY_MASK(MII) +#define ADVERTISED_FIBRE __ETHTOOL_LINK_MODE_LEGACY_MASK(FIBRE) +#define ADVERTISED_BNC __ETHTOOL_LINK_MODE_LEGACY_MASK(BNC) +#define ADVERTISED_10000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseT_Full) +#define ADVERTISED_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Pause) +#define ADVERTISED_Asym_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Asym_Pause) +#define ADVERTISED_2500baseX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(2500baseX_Full) +#define ADVERTISED_Backplane __ETHTOOL_LINK_MODE_LEGACY_MASK(Backplane) +#define ADVERTISED_1000baseKX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseKX_Full) +#define ADVERTISED_10000baseKX4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKX4_Full) +#define ADVERTISED_10000baseKR_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKR_Full) +#define ADVERTISED_10000baseR_FEC __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseR_FEC) +#define ADVERTISED_20000baseMLD2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseMLD2_Full) +#define ADVERTISED_20000baseKR2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseKR2_Full) +#define ADVERTISED_40000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseKR4_Full) +#define ADVERTISED_40000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseCR4_Full) +#define ADVERTISED_40000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseSR4_Full) +#define ADVERTISED_40000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseLR4_Full) +#define ADVERTISED_56000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseKR4_Full) +#define ADVERTISED_56000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseCR4_Full) +#define ADVERTISED_56000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseSR4_Full) +#define ADVERTISED_56000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseLR4_Full) +/* Please do not define any new ADVERTISED_* macro for bits > 31, see + * notice above. + */ + +/* The following are all involved in forcing a particular link + * mode for the device for setting things. When getting the + * devices settings, these indicate the current mode and whether + * it was forced up into this mode or autonegotiated. + */ + +/* The forced speed, in units of 1Mb. All values 0 to INT_MAX are legal. + * Update drivers/net/phy/phy.c:phy_speed_to_str() and + * drivers/net/bonding/bond_3ad.c:__get_link_speed() when adding new values. + */ +#define SPEED_10 10 +#define SPEED_100 100 +#define SPEED_1000 1000 +#define SPEED_2500 2500 +#define SPEED_5000 5000 +#define SPEED_10000 10000 +#define SPEED_14000 14000 +#define SPEED_20000 20000 +#define SPEED_25000 25000 +#define SPEED_40000 40000 +#define SPEED_50000 50000 +#define SPEED_56000 56000 +#define SPEED_100000 100000 +#define SPEED_200000 200000 +#define SPEED_400000 400000 +#define SPEED_800000 800000 + +#define SPEED_UNKNOWN -1 + +static inline int ethtool_validate_speed(__u32 speed) +{ + return speed <= INT_MAX || speed == (__u32)SPEED_UNKNOWN; +} + +/* Duplex, half or full. */ +#define DUPLEX_HALF 0x00 +#define DUPLEX_FULL 0x01 +#define DUPLEX_UNKNOWN 0xff + +static inline int ethtool_validate_duplex(__u8 duplex) +{ + switch (duplex) { + case DUPLEX_HALF: + case DUPLEX_FULL: + case DUPLEX_UNKNOWN: + return 1; + } + + return 0; +} + +#define MASTER_SLAVE_CFG_UNSUPPORTED 0 +#define MASTER_SLAVE_CFG_UNKNOWN 1 +#define MASTER_SLAVE_CFG_MASTER_PREFERRED 2 +#define MASTER_SLAVE_CFG_SLAVE_PREFERRED 3 +#define MASTER_SLAVE_CFG_MASTER_FORCE 4 +#define MASTER_SLAVE_CFG_SLAVE_FORCE 5 +#define MASTER_SLAVE_STATE_UNSUPPORTED 0 +#define MASTER_SLAVE_STATE_UNKNOWN 1 +#define MASTER_SLAVE_STATE_MASTER 2 +#define MASTER_SLAVE_STATE_SLAVE 3 +#define MASTER_SLAVE_STATE_ERR 4 + +/* These are used to throttle the rate of data on the phy interface when the + * native speed of the interface is higher than the link speed. These should + * not be used for phy interfaces which natively support multiple speeds (e.g. + * MII or SGMII). + */ +/* No rate matching performed. */ +#define RATE_MATCH_NONE 0 +/* The phy sends pause frames to throttle the MAC. */ +#define RATE_MATCH_PAUSE 1 +/* The phy asserts CRS to prevent the MAC from transmitting. */ +#define RATE_MATCH_CRS 2 +/* The MAC is programmed with a sufficiently-large IPG. */ +#define RATE_MATCH_OPEN_LOOP 3 + +/* Which connector port. */ +#define PORT_TP 0x00 +#define PORT_AUI 0x01 +#define PORT_MII 0x02 +#define PORT_FIBRE 0x03 +#define PORT_BNC 0x04 +#define PORT_DA 0x05 +#define PORT_NONE 0xef +#define PORT_OTHER 0xff + +/* Which transceiver to use. */ +#define XCVR_INTERNAL 0x00 /* PHY and MAC are in the same package */ +#define XCVR_EXTERNAL 0x01 /* PHY and MAC are in different packages */ +#define XCVR_DUMMY1 0x02 +#define XCVR_DUMMY2 0x03 +#define XCVR_DUMMY3 0x04 + +/* Enable or disable autonegotiation. */ +#define AUTONEG_DISABLE 0x00 +#define AUTONEG_ENABLE 0x01 + +/* MDI or MDI-X status/control - if MDI/MDI_X/AUTO is set then + * the driver is required to renegotiate link + */ +#define ETH_TP_MDI_INVALID 0x00 /* status: unknown; control: unsupported */ +#define ETH_TP_MDI 0x01 /* status: MDI; control: force MDI */ +#define ETH_TP_MDI_X 0x02 /* status: MDI-X; control: force MDI-X */ +#define ETH_TP_MDI_AUTO 0x03 /* control: auto-select */ + +/* Wake-On-Lan options. */ +#define WAKE_PHY (1 << 0) +#define WAKE_UCAST (1 << 1) +#define WAKE_MCAST (1 << 2) +#define WAKE_BCAST (1 << 3) +#define WAKE_ARP (1 << 4) +#define WAKE_MAGIC (1 << 5) +#define WAKE_MAGICSECURE (1 << 6) /* only meaningful if WAKE_MAGIC */ +#define WAKE_FILTER (1 << 7) + +#define WOL_MODE_COUNT 8 + +/* RSS hash function data + * XOR the corresponding source and destination fields of each specified + * protocol. Both copies of the XOR'ed fields are fed into the RSS and RXHASH + * calculation. Note that this XORing reduces the input set entropy and could + * be exploited to reduce the RSS queue spread. + */ +#define RXH_XFRM_SYM_XOR (1 << 0) +#define RXH_XFRM_NO_CHANGE 0xff + +/* L2-L4 network traffic flow types */ +#define TCP_V4_FLOW 0x01 /* hash or spec (tcp_ip4_spec) */ +#define UDP_V4_FLOW 0x02 /* hash or spec (udp_ip4_spec) */ +#define SCTP_V4_FLOW 0x03 /* hash or spec (sctp_ip4_spec) */ +#define AH_ESP_V4_FLOW 0x04 /* hash only */ +#define TCP_V6_FLOW 0x05 /* hash or spec (tcp_ip6_spec; nfc only) */ +#define UDP_V6_FLOW 0x06 /* hash or spec (udp_ip6_spec; nfc only) */ +#define SCTP_V6_FLOW 0x07 /* hash or spec (sctp_ip6_spec; nfc only) */ +#define AH_ESP_V6_FLOW 0x08 /* hash only */ +#define AH_V4_FLOW 0x09 /* hash or spec (ah_ip4_spec) */ +#define ESP_V4_FLOW 0x0a /* hash or spec (esp_ip4_spec) */ +#define AH_V6_FLOW 0x0b /* hash or spec (ah_ip6_spec; nfc only) */ +#define ESP_V6_FLOW 0x0c /* hash or spec (esp_ip6_spec; nfc only) */ +#define IPV4_USER_FLOW 0x0d /* spec only (usr_ip4_spec) */ +#define IP_USER_FLOW IPV4_USER_FLOW +#define IPV6_USER_FLOW 0x0e /* spec only (usr_ip6_spec; nfc only) */ +#define IPV4_FLOW 0x10 /* hash only */ +#define IPV6_FLOW 0x11 /* hash only */ +#define ETHER_FLOW 0x12 /* spec only (ether_spec) */ + +/* Used for GTP-U IPv4 and IPv6. + * The format of GTP packets only includes + * elements such as TEID and GTP version. + * It is primarily intended for data communication of the UE. + */ +#define GTPU_V4_FLOW 0x13 /* hash only */ +#define GTPU_V6_FLOW 0x14 /* hash only */ + +/* Use for GTP-C IPv4 and v6. + * The format of these GTP packets does not include TEID. + * Primarily expected to be used for communication + * to create sessions for UE data communication, + * commonly referred to as CSR (Create Session Request). + */ +#define GTPC_V4_FLOW 0x15 /* hash only */ +#define GTPC_V6_FLOW 0x16 /* hash only */ + +/* Use for GTP-C IPv4 and v6. + * Unlike GTPC_V4_FLOW, the format of these GTP packets includes TEID. + * After session creation, it becomes this packet. + * This is mainly used for requests to realize UE handover. + */ +#define GTPC_TEID_V4_FLOW 0x17 /* hash only */ +#define GTPC_TEID_V6_FLOW 0x18 /* hash only */ + +/* Use for GTP-U and extended headers for the PSC (PDU Session Container). + * The format of these GTP packets includes TEID and QFI. + * In 5G communication using UPF (User Plane Function), + * data communication with this extended header is performed. + */ +#define GTPU_EH_V4_FLOW 0x19 /* hash only */ +#define GTPU_EH_V6_FLOW 0x1a /* hash only */ + +/* Use for GTP-U IPv4 and v6 PSC (PDU Session Container) extended headers. + * This differs from GTPU_EH_V(4|6)_FLOW in that it is distinguished by + * UL/DL included in the PSC. + * There are differences in the data included based on Downlink/Uplink, + * and can be used to distinguish packets. + * The functions described so far are useful when you want to + * handle communication from the mobile network in UPF, PGW, etc. + */ +#define GTPU_UL_V4_FLOW 0x1b /* hash only */ +#define GTPU_UL_V6_FLOW 0x1c /* hash only */ +#define GTPU_DL_V4_FLOW 0x1d /* hash only */ +#define GTPU_DL_V6_FLOW 0x1e /* hash only */ + +/* Flag to enable additional fields in struct ethtool_rx_flow_spec */ +#define FLOW_EXT 0x80000000 +#define FLOW_MAC_EXT 0x40000000 +/* Flag to enable RSS spreading of traffic matching rule (nfc only) */ +#define FLOW_RSS 0x20000000 + +/* L3-L4 network traffic flow hash options */ +#define RXH_L2DA (1 << 1) +#define RXH_VLAN (1 << 2) +#define RXH_L3_PROTO (1 << 3) +#define RXH_IP_SRC (1 << 4) +#define RXH_IP_DST (1 << 5) +#define RXH_L4_B_0_1 (1 << 6) /* src port in case of TCP/UDP/SCTP */ +#define RXH_L4_B_2_3 (1 << 7) /* dst port in case of TCP/UDP/SCTP */ +#define RXH_GTP_TEID (1 << 8) /* teid in case of GTP */ +#define RXH_DISCARD (1 << 31) + +#define RX_CLS_FLOW_DISC 0xffffffffffffffffULL +#define RX_CLS_FLOW_WAKE 0xfffffffffffffffeULL + +/* Special RX classification rule insert location values */ +#define RX_CLS_LOC_SPECIAL 0x80000000 /* flag */ +#define RX_CLS_LOC_ANY 0xffffffff +#define RX_CLS_LOC_FIRST 0xfffffffe +#define RX_CLS_LOC_LAST 0xfffffffd + +/* EEPROM Standards for plug in modules */ +#define ETH_MODULE_SFF_8079 0x1 +#define ETH_MODULE_SFF_8079_LEN 256 +#define ETH_MODULE_SFF_8472 0x2 +#define ETH_MODULE_SFF_8472_LEN 512 +#define ETH_MODULE_SFF_8636 0x3 +#define ETH_MODULE_SFF_8636_LEN 256 +#define ETH_MODULE_SFF_8436 0x4 +#define ETH_MODULE_SFF_8436_LEN 256 + +#define ETH_MODULE_SFF_8636_MAX_LEN 640 +#define ETH_MODULE_SFF_8436_MAX_LEN 640 + +/* Reset flags */ +/* The reset() operation must clear the flags for the components which + * were actually reset. On successful return, the flags indicate the + * components which were not reset, either because they do not exist + * in the hardware or because they cannot be reset independently. The + * driver must never reset any components that were not requested. + */ +enum ethtool_reset_flags { + /* These flags represent components dedicated to the interface + * the command is addressed to. Shift any flag left by + * ETH_RESET_SHARED_SHIFT to reset a shared component of the + * same type. + */ + ETH_RESET_MGMT = 1 << 0, /* Management processor */ + ETH_RESET_IRQ = 1 << 1, /* Interrupt requester */ + ETH_RESET_DMA = 1 << 2, /* DMA engine */ + ETH_RESET_FILTER = 1 << 3, /* Filtering/flow direction */ + ETH_RESET_OFFLOAD = 1 << 4, /* Protocol offload */ + ETH_RESET_MAC = 1 << 5, /* Media access controller */ + ETH_RESET_PHY = 1 << 6, /* Transceiver/PHY */ + ETH_RESET_RAM = 1 << 7, /* RAM shared between + * multiple components */ + ETH_RESET_AP = 1 << 8, /* Application processor */ + + ETH_RESET_DEDICATED = 0x0000ffff, /* All components dedicated to + * this interface */ + ETH_RESET_ALL = 0xffffffff, /* All components used by this + * interface, even if shared */ +}; +#define ETH_RESET_SHARED_SHIFT 16 + + +/** + * struct ethtool_link_settings - link control and status + * + * IMPORTANT, Backward compatibility notice: When implementing new + * user-space tools, please first try %ETHTOOL_GLINKSETTINGS, and + * if it succeeds use %ETHTOOL_SLINKSETTINGS to change link + * settings; do not use %ETHTOOL_SSET if %ETHTOOL_GLINKSETTINGS + * succeeded: stick to %ETHTOOL_GLINKSETTINGS/%SLINKSETTINGS in + * that case. Conversely, if %ETHTOOL_GLINKSETTINGS fails, use + * %ETHTOOL_GSET to query and %ETHTOOL_SSET to change link + * settings; do not use %ETHTOOL_SLINKSETTINGS if + * %ETHTOOL_GLINKSETTINGS failed: stick to + * %ETHTOOL_GSET/%ETHTOOL_SSET in that case. + * + * @cmd: Command number = %ETHTOOL_GLINKSETTINGS or %ETHTOOL_SLINKSETTINGS + * @speed: Link speed (Mbps) + * @duplex: Duplex mode; one of %DUPLEX_* + * @port: Physical connector type; one of %PORT_* + * @phy_address: MDIO address of PHY (transceiver); 0 or 255 if not + * applicable. For clause 45 PHYs this is the PRTAD. + * @autoneg: Enable/disable autonegotiation and auto-detection; + * either %AUTONEG_DISABLE or %AUTONEG_ENABLE + * @mdio_support: Bitmask of %ETH_MDIO_SUPPORTS_* flags for the MDIO + * protocols supported by the interface; 0 if unknown. + * Read-only. + * @eth_tp_mdix: Ethernet twisted-pair MDI(-X) status; one of + * %ETH_TP_MDI_*. If the status is unknown or not applicable, the + * value will be %ETH_TP_MDI_INVALID. Read-only. + * @eth_tp_mdix_ctrl: Ethernet twisted pair MDI(-X) control; one of + * %ETH_TP_MDI_*. If MDI(-X) control is not implemented, reads + * yield %ETH_TP_MDI_INVALID and writes may be ignored or rejected. + * When written successfully, the link should be renegotiated if + * necessary. + * @link_mode_masks_nwords: Number of 32-bit words for each of the + * supported, advertising, lp_advertising link mode bitmaps. For + * %ETHTOOL_GLINKSETTINGS: on entry, number of words passed by user + * (>= 0); on return, if handshake in progress, negative if + * request size unsupported by kernel: absolute value indicates + * kernel expected size and all the other fields but cmd + * are 0; otherwise (handshake completed), strictly positive + * to indicate size used by kernel and cmd field stays + * %ETHTOOL_GLINKSETTINGS, all other fields populated by driver. For + * %ETHTOOL_SLINKSETTINGS: must be valid on entry, ie. a positive + * value returned previously by %ETHTOOL_GLINKSETTINGS, otherwise + * refused. For drivers: ignore this field (use kernel's + * __ETHTOOL_LINK_MODE_MASK_NBITS instead), any change to it will + * be overwritten by kernel. + * @transceiver: Used to distinguish different possible PHY types, + * reported consistently by PHYLIB. Read-only. + * @master_slave_cfg: Master/slave port mode. + * @master_slave_state: Master/slave port state. + * @rate_matching: Rate adaptation performed by the PHY + * @reserved: Reserved for future use; see the note on reserved space. + * @link_mode_masks: Variable length bitmaps. + * + * If autonegotiation is disabled, the speed and @duplex represent the + * fixed link mode and are writable if the driver supports multiple + * link modes. If it is enabled then they are read-only; if the link + * is up they represent the negotiated link mode; if the link is down, + * the speed is 0, %SPEED_UNKNOWN or the highest enabled speed and + * @duplex is %DUPLEX_UNKNOWN or the best enabled duplex mode. + * + * Some hardware interfaces may have multiple PHYs and/or physical + * connectors fitted or do not allow the driver to detect which are + * fitted. For these interfaces @port and/or @phy_address may be + * writable, possibly dependent on @autoneg being %AUTONEG_DISABLE. + * Otherwise, attempts to write different values may be ignored or + * rejected. + * + * Deprecated %ethtool_cmd fields transceiver, maxtxpkt and maxrxpkt + * are not available in %ethtool_link_settings. These fields will be + * always set to zero in %ETHTOOL_GSET reply and %ETHTOOL_SSET will + * fail if any of them is set to non-zero value. + * + * Users should assume that all fields not marked read-only are + * writable and subject to validation by the driver. They should use + * %ETHTOOL_GLINKSETTINGS to get the current values before making specific + * changes and then applying them with %ETHTOOL_SLINKSETTINGS. + * + * Drivers that implement %get_link_ksettings and/or + * %set_link_ksettings should ignore the @cmd + * and @link_mode_masks_nwords fields (any change to them overwritten + * by kernel), and rely only on kernel's internal + * %__ETHTOOL_LINK_MODE_MASK_NBITS and + * %ethtool_link_mode_mask_t. Drivers that implement + * %set_link_ksettings() should validate all fields other than @cmd + * and @link_mode_masks_nwords that are not described as read-only or + * deprecated, and must ignore all fields described as read-only. + * + * @link_mode_masks is divided into three bitfields, each of length + * @link_mode_masks_nwords: + * - supported: Bitmap with each bit meaning given by + * %ethtool_link_mode_bit_indices for the link modes, physical + * connectors and other link features for which the interface + * supports autonegotiation or auto-detection. Read-only. + * - advertising: Bitmap with each bit meaning given by + * %ethtool_link_mode_bit_indices for the link modes, physical + * connectors and other link features that are advertised through + * autonegotiation or enabled for auto-detection. + * - lp_advertising: Bitmap with each bit meaning given by + * %ethtool_link_mode_bit_indices for the link modes, and other + * link features that the link partner advertised through + * autonegotiation; 0 if unknown or not applicable. Read-only. + */ +struct ethtool_link_settings { + __u32 cmd; + __u32 speed; + __u8 duplex; + __u8 port; + __u8 phy_address; + __u8 autoneg; + __u8 mdio_support; + __u8 eth_tp_mdix; + __u8 eth_tp_mdix_ctrl; + __s8 link_mode_masks_nwords; + __u8 transceiver; + __u8 master_slave_cfg; + __u8 master_slave_state; + __u8 rate_matching; + __u32 reserved[7]; + __u32 link_mode_masks[]; + /* layout of link_mode_masks fields: + * __u32 map_supported[link_mode_masks_nwords]; + * __u32 map_advertising[link_mode_masks_nwords]; + * __u32 map_lp_advertising[link_mode_masks_nwords]; + */ +}; #endif /* _UAPI_LINUX_ETHTOOL_H */ From c3bd015090f24dcd2e839db1401e948ad95ce803 Mon Sep 17 00:00:00 2001 From: Tushar Vyavahare Date: Tue, 2 Apr 2024 11:45:24 +0000 Subject: [PATCH 042/147] selftests/xsk: Make batch size variable Convert the constant BATCH_SIZE into a variable named batch_size to allow dynamic modification at runtime. This is required for the forthcoming changes to support testing different hardware ring sizes. While running these tests, a bug was identified when the batch size is roughly the same as the NIC ring size. This has now been addressed by Maciej's fix in commit 913eda2b08cc ("i40e: xsk: remove count_mask"). Signed-off-by: Tushar Vyavahare Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20240402114529.545475-3-tushar.vyavahare@intel.com --- tools/testing/selftests/bpf/xskxceiver.c | 20 +++++++++++--------- tools/testing/selftests/bpf/xskxceiver.h | 3 ++- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index b1102ee13faa..eaa102c8098b 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -239,7 +239,7 @@ static void enable_busy_poll(struct xsk_socket_info *xsk) (void *)&sock_opt, sizeof(sock_opt)) < 0) exit_with_error(errno); - sock_opt = BATCH_SIZE; + sock_opt = xsk->batch_size; if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL_BUDGET, (void *)&sock_opt, sizeof(sock_opt)) < 0) exit_with_error(errno); @@ -439,6 +439,7 @@ static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, for (j = 0; j < MAX_SOCKETS; j++) { memset(&ifobj->xsk_arr[j], 0, sizeof(ifobj->xsk_arr[j])); ifobj->xsk_arr[j].rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; + ifobj->xsk_arr[j].batch_size = DEFAULT_BATCH_SIZE; if (i == 0) ifobj->xsk_arr[j].pkt_stream = test->tx_pkt_stream_default; else @@ -1087,7 +1088,7 @@ static int __receive_pkts(struct test_spec *test, struct xsk_socket_info *xsk) return TEST_CONTINUE; } - rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx); + rcvd = xsk_ring_cons__peek(&xsk->rx, xsk->batch_size, &idx_rx); if (!rcvd) return TEST_CONTINUE; @@ -1239,7 +1240,8 @@ static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, b buffer_len = pkt_get_buffer_len(umem, pkt_stream->max_pkt_len); /* pkts_in_flight might be negative if many invalid packets are sent */ - if (pkts_in_flight >= (int)((umem_size(umem) - BATCH_SIZE * buffer_len) / buffer_len)) { + if (pkts_in_flight >= (int)((umem_size(umem) - xsk->batch_size * buffer_len) / + buffer_len)) { ret = kick_tx(xsk); if (ret) return TEST_FAILURE; @@ -1249,7 +1251,7 @@ static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, b fds.fd = xsk_socket__fd(xsk->xsk); fds.events = POLLOUT; - while (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) < BATCH_SIZE) { + while (xsk_ring_prod__reserve(&xsk->tx, xsk->batch_size, &idx) < xsk->batch_size) { if (use_poll) { ret = poll(&fds, 1, POLL_TMOUT); if (timeout) { @@ -1269,10 +1271,10 @@ static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, b } } - complete_pkts(xsk, BATCH_SIZE); + complete_pkts(xsk, xsk->batch_size); } - for (i = 0; i < BATCH_SIZE; i++) { + for (i = 0; i < xsk->batch_size; i++) { struct pkt *pkt = pkt_stream_get_next_tx_pkt(pkt_stream); u32 nb_frags_left, nb_frags, bytes_written = 0; @@ -1280,9 +1282,9 @@ static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, b break; nb_frags = pkt_nb_frags(umem->frame_size, pkt_stream, pkt); - if (nb_frags > BATCH_SIZE - i) { + if (nb_frags > xsk->batch_size - i) { pkt_stream_cancel(pkt_stream); - xsk_ring_prod__cancel(&xsk->tx, BATCH_SIZE - i); + xsk_ring_prod__cancel(&xsk->tx, xsk->batch_size - i); break; } nb_frags_left = nb_frags; @@ -1370,7 +1372,7 @@ static int wait_for_tx_completion(struct xsk_socket_info *xsk) return TEST_FAILURE; } - complete_pkts(xsk, BATCH_SIZE); + complete_pkts(xsk, xsk->batch_size); } return TEST_PASS; diff --git a/tools/testing/selftests/bpf/xskxceiver.h b/tools/testing/selftests/bpf/xskxceiver.h index f174df2d693f..425304e52f35 100644 --- a/tools/testing/selftests/bpf/xskxceiver.h +++ b/tools/testing/selftests/bpf/xskxceiver.h @@ -44,7 +44,7 @@ #define MAX_ETH_JUMBO_SIZE 9000 #define USLEEP_MAX 10000 #define SOCK_RECONF_CTR 10 -#define BATCH_SIZE 64 +#define DEFAULT_BATCH_SIZE 64 #define POLL_TMOUT 1000 #define THREAD_TMOUT 3 #define DEFAULT_PKT_CNT (4 * 1024) @@ -91,6 +91,7 @@ struct xsk_socket_info { struct pkt_stream *pkt_stream; u32 outstanding_tx; u32 rxqsize; + u32 batch_size; u8 dst_mac[ETH_ALEN]; u8 src_mac[ETH_ALEN]; }; From 90a695c3d31e1c9f0adb8c4c80028ed4ea7ed5ab Mon Sep 17 00:00:00 2001 From: Tushar Vyavahare Date: Tue, 2 Apr 2024 11:45:25 +0000 Subject: [PATCH 043/147] selftests/bpf: Implement get_hw_ring_size function to retrieve current and max interface size Introduce a new function called get_hw_size that retrieves both the current and maximum size of the interface and stores this information in the 'ethtool_ringparam' structure. Remove ethtool_channels struct from xdp_hw_metadata.c due to redefinition error. Remove unused linux/if.h include from flow_dissector BPF test to address CI pipeline failure. Signed-off-by: Tushar Vyavahare Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20240402114529.545475-4-tushar.vyavahare@intel.com --- tools/testing/selftests/bpf/network_helpers.c | 24 +++++++++++++++++++ tools/testing/selftests/bpf/network_helpers.h | 4 ++++ .../selftests/bpf/prog_tests/flow_dissector.c | 1 - tools/testing/selftests/bpf/xdp_hw_metadata.c | 14 ----------- 4 files changed, 28 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 6db27a9088e9..1cab20020f94 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -497,3 +497,27 @@ int get_socket_local_port(int sock_fd) return -1; } + +int get_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param) +{ + struct ifreq ifr = {0}; + int sockfd, err; + + sockfd = socket(AF_INET, SOCK_DGRAM, 0); + if (sockfd < 0) + return -errno; + + memcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name)); + + ring_param->cmd = ETHTOOL_GRINGPARAM; + ifr.ifr_data = (char *)ring_param; + + if (ioctl(sockfd, SIOCETHTOOL, &ifr) < 0) { + err = errno; + close(sockfd); + return -err; + } + + close(sockfd); + return 0; +} diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 94b9be24e39b..de4d27d201cb 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -9,8 +9,11 @@ typedef __u16 __sum16; #include #include #include +#include +#include #include #include +#include #define MAGIC_VAL 0x1234 #define NUM_ITER 100000 @@ -61,6 +64,7 @@ int make_sockaddr(int family, const char *addr_str, __u16 port, struct sockaddr_storage *addr, socklen_t *len); char *ping_command(int family); int get_socket_local_port(int sock_fd); +int get_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param); struct nstoken; /** diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index c4773173a4e4..9e5f38739104 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -2,7 +2,6 @@ #include #include #include -#include #include #include diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c index bdf5d8180067..0859fe727da7 100644 --- a/tools/testing/selftests/bpf/xdp_hw_metadata.c +++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c @@ -495,20 +495,6 @@ peek: return 0; } -struct ethtool_channels { - __u32 cmd; - __u32 max_rx; - __u32 max_tx; - __u32 max_other; - __u32 max_combined; - __u32 rx_count; - __u32 tx_count; - __u32 other_count; - __u32 combined_count; -}; - -#define ETHTOOL_GCHANNELS 0x0000003c /* Get no of channels */ - static int rxq_num(const char *ifname) { struct ethtool_channels ch = { From bee3a7b07624223526c1fea465557068546d3b3c Mon Sep 17 00:00:00 2001 From: Tushar Vyavahare Date: Tue, 2 Apr 2024 11:45:26 +0000 Subject: [PATCH 044/147] selftests/bpf: Implement set_hw_ring_size function to configure interface ring size Introduce a new function called set_hw_ring_size that allows for the dynamic configuration of the ring size within the interface. Signed-off-by: Tushar Vyavahare Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20240402114529.545475-5-tushar.vyavahare@intel.com --- tools/testing/selftests/bpf/network_helpers.c | 24 +++++++++++++++++++ tools/testing/selftests/bpf/network_helpers.h | 1 + 2 files changed, 25 insertions(+) diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 1cab20020f94..04175e16195a 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -521,3 +521,27 @@ int get_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param) close(sockfd); return 0; } + +int set_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param) +{ + struct ifreq ifr = {0}; + int sockfd, err; + + sockfd = socket(AF_INET, SOCK_DGRAM, 0); + if (sockfd < 0) + return -errno; + + memcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name)); + + ring_param->cmd = ETHTOOL_SRINGPARAM; + ifr.ifr_data = (char *)ring_param; + + if (ioctl(sockfd, SIOCETHTOOL, &ifr) < 0) { + err = errno; + close(sockfd); + return -err; + } + + close(sockfd); + return 0; +} diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index de4d27d201cb..6457445cc6e2 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -65,6 +65,7 @@ int make_sockaddr(int family, const char *addr_str, __u16 port, char *ping_command(int family); int get_socket_local_port(int sock_fd); int get_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param); +int set_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param); struct nstoken; /** From 776021e07fd0d7592c767e60929b954e82676186 Mon Sep 17 00:00:00 2001 From: Tushar Vyavahare Date: Tue, 2 Apr 2024 11:45:27 +0000 Subject: [PATCH 045/147] selftests/xsk: Introduce set_ring_size function with a retry mechanism for handling AF_XDP socket closures Introduce a new function, set_ring_size(), to manage asynchronous AF_XDP socket closure. Retry set_hw_ring_size up to SOCK_RECONF_CTR times if it fails due to an active AF_XDP socket. Return an error immediately for non-EBUSY errors. This enhances robustness against asynchronous AF_XDP socket closures during ring size changes. Signed-off-by: Tushar Vyavahare Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20240402114529.545475-6-tushar.vyavahare@intel.com --- tools/testing/selftests/bpf/Makefile | 2 +- tools/testing/selftests/bpf/xskxceiver.c | 57 +++++++++++++++++++++++- tools/testing/selftests/bpf/xskxceiver.h | 9 ++++ 3 files changed, 66 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index b0ac3dd80acf..eea5b8deaaf0 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -695,7 +695,7 @@ $(OUTPUT)/test_verifier: test_verifier.c verifier/tests.h $(BPFOBJ) | $(OUTPUT) # Include find_bit.c to compile xskxceiver. EXTRA_SRC := $(TOOLSDIR)/lib/find_bit.c -$(OUTPUT)/xskxceiver: $(EXTRA_SRC) xskxceiver.c xskxceiver.h $(OUTPUT)/xsk.o $(OUTPUT)/xsk_xdp_progs.skel.h $(BPFOBJ) | $(OUTPUT) +$(OUTPUT)/xskxceiver: $(EXTRA_SRC) xskxceiver.c xskxceiver.h $(OUTPUT)/network_helpers.o $(OUTPUT)/xsk.o $(OUTPUT)/xsk_xdp_progs.skel.h $(BPFOBJ) | $(OUTPUT) $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@ diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index eaa102c8098b..8c26868e17cf 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -81,6 +81,7 @@ #include #include #include +#include #include #include #include @@ -105,11 +106,15 @@ #include "../kselftest.h" #include "xsk_xdp_common.h" +#include + static bool opt_verbose; static bool opt_print_tests; static enum test_mode opt_mode = TEST_MODE_ALL; static u32 opt_run_test = RUN_ALL_TESTS; +void test__fail(void) { /* for network_helpers.c */ } + static void __exit_with_error(int error, const char *file, const char *func, int line) { ksft_test_result_fail("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, @@ -409,6 +414,33 @@ static void parse_command_line(struct ifobject *ifobj_tx, struct ifobject *ifobj } } +static int set_ring_size(struct ifobject *ifobj) +{ + int ret; + u32 ctr = 0; + + while (ctr++ < SOCK_RECONF_CTR) { + ret = set_hw_ring_size(ifobj->ifname, &ifobj->ring); + if (!ret) + break; + + /* Retry if it fails */ + if (ctr >= SOCK_RECONF_CTR || errno != EBUSY) + return -errno; + + usleep(USLEEP_MAX); + } + + return ret; +} + +static int hw_ring_size_reset(struct ifobject *ifobj) +{ + ifobj->ring.tx_pending = ifobj->set_ring.default_tx; + ifobj->ring.rx_pending = ifobj->set_ring.default_rx; + return set_ring_size(ifobj); +} + static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, struct ifobject *ifobj_rx) { @@ -452,12 +484,16 @@ static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, } } + if (ifobj_tx->hw_ring_size_supp) + hw_ring_size_reset(ifobj_tx); + test->ifobj_tx = ifobj_tx; test->ifobj_rx = ifobj_rx; test->current_step = 0; test->total_steps = 1; test->nb_sockets = 1; test->fail = false; + test->set_ring = false; test->mtu = MAX_ETH_PKT_SIZE; test->xdp_prog_rx = ifobj_rx->xdp_progs->progs.xsk_def_prog; test->xskmap_rx = ifobj_rx->xdp_progs->maps.xsk; @@ -1862,6 +1898,14 @@ static int testapp_validate_traffic(struct test_spec *test) return TEST_SKIP; } + if (test->set_ring) { + if (ifobj_tx->hw_ring_size_supp) + return set_ring_size(ifobj_tx); + + ksft_test_result_skip("Changing HW ring size not supported.\n"); + return TEST_SKIP; + } + xsk_attach_xdp_progs(test, ifobj_rx, ifobj_tx); return __testapp_validate_traffic(test, ifobj_rx, ifobj_tx); } @@ -2479,7 +2523,7 @@ static const struct test_spec tests[] = { {.name = "ALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_aligned_inv_desc_mb}, {.name = "UNALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_unaligned_inv_desc_mb}, {.name = "TOO_MANY_FRAGS", .test_func = testapp_too_many_frags}, -}; + }; static void print_tests(void) { @@ -2499,6 +2543,7 @@ int main(int argc, char **argv) int modes = TEST_MODE_SKB + 1; struct test_spec test; bool shared_netdev; + int ret; /* Use libbpf 1.0 API mode */ libbpf_set_strict_mode(LIBBPF_STRICT_ALL); @@ -2536,6 +2581,13 @@ int main(int argc, char **argv) modes++; } + ret = get_hw_ring_size(ifobj_tx->ifname, &ifobj_tx->ring); + if (!ret) { + ifobj_tx->hw_ring_size_supp = true; + ifobj_tx->set_ring.default_tx = ifobj_tx->ring.tx_pending; + ifobj_tx->set_ring.default_rx = ifobj_tx->ring.rx_pending; + } + init_iface(ifobj_rx, worker_testapp_validate_rx); init_iface(ifobj_tx, worker_testapp_validate_tx); @@ -2583,6 +2635,9 @@ int main(int argc, char **argv) } } + if (ifobj_tx->hw_ring_size_supp) + hw_ring_size_reset(ifobj_tx); + pkt_stream_delete(tx_pkt_stream_default); pkt_stream_delete(rx_pkt_stream_default); xsk_unload_xdp_programs(ifobj_tx); diff --git a/tools/testing/selftests/bpf/xskxceiver.h b/tools/testing/selftests/bpf/xskxceiver.h index 425304e52f35..906de5fab7a3 100644 --- a/tools/testing/selftests/bpf/xskxceiver.h +++ b/tools/testing/selftests/bpf/xskxceiver.h @@ -114,6 +114,11 @@ struct pkt_stream { bool verbatim; }; +struct set_hw_ring { + u32 default_tx; + u32 default_rx; +}; + struct ifobject; struct test_spec; typedef int (*validation_func_t)(struct ifobject *ifobj); @@ -130,6 +135,8 @@ struct ifobject { struct xsk_xdp_progs *xdp_progs; struct bpf_map *xskmap; struct bpf_program *xdp_prog; + struct ethtool_ringparam ring; + struct set_hw_ring set_ring; enum test_mode mode; int ifindex; int mtu; @@ -146,6 +153,7 @@ struct ifobject { bool unaligned_supp; bool multi_buff_supp; bool multi_buff_zc_supp; + bool hw_ring_size_supp; }; struct test_spec { @@ -163,6 +171,7 @@ struct test_spec { u16 current_step; u16 nb_sockets; bool fail; + bool set_ring; enum test_mode mode; char name[MAX_TEST_NAME_SIZE]; }; From c4f960539fae6f04617d3909cc0dfdb88e7d197b Mon Sep 17 00:00:00 2001 From: Tushar Vyavahare Date: Tue, 2 Apr 2024 11:45:28 +0000 Subject: [PATCH 046/147] selftests/xsk: Test AF_XDP functionality under minimal ring configurations Add a new test case that stresses AF_XDP and the driver by configuring small hardware and software ring sizes. This verifies that AF_XDP continues to function properly even with insufficient ring space that could lead to frequent producer/consumer throttling. The test procedure involves: 1. Set the minimum possible ring configuration(tx 64 and rx 128). 2. Run tests with various batch sizes(1 and 63) to validate the system's behavior under different configurations. Update Makefile to include network_helpers.o in the build process for xskxceiver. Signed-off-by: Tushar Vyavahare Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20240402114529.545475-7-tushar.vyavahare@intel.com --- tools/testing/selftests/bpf/xskxceiver.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index 8c26868e17cf..dac2dcf39bb8 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -2419,6 +2419,26 @@ static int testapp_xdp_metadata_mb(struct test_spec *test) return testapp_xdp_metadata_copy(test); } +static int testapp_hw_sw_min_ring_size(struct test_spec *test) +{ + int ret; + + test->set_ring = true; + test->total_steps = 2; + test->ifobj_tx->ring.tx_pending = DEFAULT_BATCH_SIZE; + test->ifobj_tx->ring.rx_pending = DEFAULT_BATCH_SIZE * 2; + test->ifobj_tx->xsk->batch_size = 1; + test->ifobj_rx->xsk->batch_size = 1; + ret = testapp_validate_traffic(test); + if (ret) + return ret; + + /* Set batch size to hw_ring_size - 1 */ + test->ifobj_tx->xsk->batch_size = DEFAULT_BATCH_SIZE - 1; + test->ifobj_rx->xsk->batch_size = DEFAULT_BATCH_SIZE - 1; + return testapp_validate_traffic(test); +} + static void run_pkt_test(struct test_spec *test) { int ret; @@ -2523,6 +2543,7 @@ static const struct test_spec tests[] = { {.name = "ALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_aligned_inv_desc_mb}, {.name = "UNALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_unaligned_inv_desc_mb}, {.name = "TOO_MANY_FRAGS", .test_func = testapp_too_many_frags}, + {.name = "HW_SW_MIN_RING_SIZE", .test_func = testapp_hw_sw_min_ring_size}, }; static void print_tests(void) From c53908b254fcf25b05bcdf6634adb36eaccac111 Mon Sep 17 00:00:00 2001 From: Tushar Vyavahare Date: Tue, 2 Apr 2024 11:45:29 +0000 Subject: [PATCH 047/147] selftests/xsk: Add new test case for AF_XDP under max ring sizes Introduce a test case to evaluate AF_XDP's robustness by pushing hardware and software ring sizes to their limits. This test ensures AF_XDP's reliability amidst potential producer/consumer throttling due to maximum ring utilization. The testing strategy includes: 1. Configuring rings to their maximum allowable sizes. 2. Executing a series of tests across diverse batch sizes to assess system's behavior under different configurations. Signed-off-by: Tushar Vyavahare Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20240402114529.545475-8-tushar.vyavahare@intel.com --- tools/testing/selftests/bpf/xskxceiver.c | 25 ++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index dac2dcf39bb8..2eac0895b0a1 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -2439,6 +2439,30 @@ static int testapp_hw_sw_min_ring_size(struct test_spec *test) return testapp_validate_traffic(test); } +static int testapp_hw_sw_max_ring_size(struct test_spec *test) +{ + u32 max_descs = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2; + int ret; + + test->set_ring = true; + test->total_steps = 2; + test->ifobj_tx->ring.tx_pending = test->ifobj_tx->ring.tx_max_pending; + test->ifobj_tx->ring.rx_pending = test->ifobj_tx->ring.rx_max_pending; + test->ifobj_rx->umem->num_frames = max_descs; + test->ifobj_rx->xsk->rxqsize = max_descs; + test->ifobj_tx->xsk->batch_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + test->ifobj_rx->xsk->batch_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + + ret = testapp_validate_traffic(test); + if (ret) + return ret; + + /* Set batch_size to 4095 */ + test->ifobj_tx->xsk->batch_size = max_descs - 1; + test->ifobj_rx->xsk->batch_size = max_descs - 1; + return testapp_validate_traffic(test); +} + static void run_pkt_test(struct test_spec *test) { int ret; @@ -2544,6 +2568,7 @@ static const struct test_spec tests[] = { {.name = "UNALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_unaligned_inv_desc_mb}, {.name = "TOO_MANY_FRAGS", .test_func = testapp_too_many_frags}, {.name = "HW_SW_MIN_RING_SIZE", .test_func = testapp_hw_sw_min_ring_size}, + {.name = "HW_SW_MAX_RING_SIZE", .test_func = testapp_hw_sw_max_ring_size}, }; static void print_tests(void) From 2e114248e086fb376405ed3f89b220f8586a2541 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Tue, 2 Apr 2024 23:52:50 +0000 Subject: [PATCH 048/147] bpf: Replace deprecated strncpy with strscpy strncpy() is deprecated for use on NUL-terminated destination strings [1] and as such we should prefer more robust and less ambiguous string interfaces. bpf sym names get looked up and compared/cleaned with various string apis. This suggests they need to be NUL-terminated (strncpy() suggests this but does not guarantee it). | static int compare_symbol_name(const char *name, char *namebuf) | { | cleanup_symbol_name(namebuf); | return strcmp(name, namebuf); | } | static void cleanup_symbol_name(char *s) | { | ... | res = strstr(s, ".llvm."); | ... | } Use strscpy() as this method guarantees NUL-termination on the destination buffer. This patch also replaces two uses of strncpy() used in log.c. These are simple replacements as postfix has been zero-initialized on the stack and has source arguments with a size less than the destination's size. Note that this patch uses the new 2-argument version of strscpy introduced in commit e6584c3964f2f ("string: Allow 2-argument strscpy()"). Signed-off-by: Justin Stitt Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strncpy-on-nul-terminated-strings [1] Link: https://manpages.debian.org/testing/linux-manual-4.8/strscpy.9.en.html [2] Link: https://github.com/KSPP/linux/issues/90 Link: https://lore.kernel.org/bpf/20240402-strncpy-kernel-bpf-core-c-v1-1-7cb07a426e78@google.com --- kernel/bpf/core.c | 4 ++-- kernel/bpf/log.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ab400cdd7d7a..ae406a2814db 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -747,7 +747,7 @@ const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long symbol_start = ksym->start; unsigned long symbol_end = ksym->end; - strncpy(sym, ksym->name, KSYM_NAME_LEN); + strscpy(sym, ksym->name, KSYM_NAME_LEN); ret = sym; if (size) @@ -813,7 +813,7 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, if (it++ != symnum) continue; - strncpy(sym, ksym->name, KSYM_NAME_LEN); + strscpy(sym, ksym->name, KSYM_NAME_LEN); *value = ksym->start; *type = BPF_SYM_ELF_TYPE; diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 2a243cf37c60..4bd8f17a9f24 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -467,9 +467,9 @@ const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) if (type & PTR_MAYBE_NULL) { if (base_type(type) == PTR_TO_BTF_ID) - strncpy(postfix, "or_null_", 16); + strscpy(postfix, "or_null_"); else - strncpy(postfix, "_or_null", 16); + strscpy(postfix, "_or_null"); } snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s", From 7bdbf7446305cb65c510c16d57cde82bc76b234a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 1 Apr 2024 19:13:02 -0700 Subject: [PATCH 049/147] bpf: add special internal-only MOV instruction to resolve per-CPU addrs Add a new BPF instruction for resolving absolute addresses of per-CPU data from their per-CPU offsets. This instruction is internal-only and users are not allowed to use them directly. They will only be used for internal inlining optimizations for now between BPF verifier and BPF JITs. We use a special BPF_MOV | BPF_ALU64 | BPF_X form with insn->off field set to BPF_ADDR_PERCPU = -1. I used negative offset value to distinguish them from positive ones used by user-exposed instructions. Such instruction performs a resolution of a per-CPU offset stored in a register to a valid kernel address which can be dereferenced. It is useful in any use case where absolute address of a per-CPU data has to be resolved (e.g., in inlining bpf_map_lookup_elem()). BPF disassembler is also taught to recognize them to support dumping final BPF assembly code (non-JIT'ed version). Add arch-specific way for BPF JITs to mark support for this instructions. This patch also adds support for these instructions in x86-64 BPF JIT. Signed-off-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/r/20240402021307.1012571-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 16 ++++++++++++++++ include/linux/filter.h | 20 ++++++++++++++++++++ kernel/bpf/core.c | 5 +++++ kernel/bpf/disasm.c | 14 ++++++++++++++ 4 files changed, 55 insertions(+) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 3b639d6f2f54..af89dd117dce 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1382,6 +1382,17 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image maybe_emit_mod(&prog, AUX_REG, dst_reg, true); EMIT3(0x0F, 0x44, add_2reg(0xC0, AUX_REG, dst_reg)); break; + } else if (insn_is_mov_percpu_addr(insn)) { + u32 off = (u32)(unsigned long)&this_cpu_off; + + /* mov , (if necessary) */ + EMIT_mov(dst_reg, src_reg); + + /* add , gs:[] */ + EMIT2(0x65, add_1mod(0x48, dst_reg)); + EMIT3(0x03, add_1reg(0x04, dst_reg), 0x25); + EMIT(off, 4); + break; } fallthrough; case BPF_ALU | BPF_MOV | BPF_X: @@ -3365,6 +3376,11 @@ bool bpf_jit_supports_subprog_tailcalls(void) return true; } +bool bpf_jit_supports_percpu_insn(void) +{ + return true; +} + void bpf_jit_free(struct bpf_prog *prog) { if (prog->jited) { diff --git a/include/linux/filter.h b/include/linux/filter.h index 531b36090122..161d5f7b64ed 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -178,6 +178,25 @@ struct ctl_table_header; .off = 0, \ .imm = 0 }) +/* Special (internal-only) form of mov, used to resolve per-CPU addrs: + * dst_reg = src_reg + + * BPF_ADDR_PERCPU is used as a special insn->off value. + */ +#define BPF_ADDR_PERCPU (-1) + +#define BPF_MOV64_PERCPU_REG(DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = BPF_ADDR_PERCPU, \ + .imm = 0 }) + +static inline bool insn_is_mov_percpu_addr(const struct bpf_insn *insn) +{ + return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->off == BPF_ADDR_PERCPU; +} + /* Short form of mov, dst_reg = imm32 */ #define BPF_MOV64_IMM(DST, IMM) \ @@ -972,6 +991,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); bool bpf_jit_needs_zext(void); bool bpf_jit_supports_subprog_tailcalls(void); +bool bpf_jit_supports_percpu_insn(void); bool bpf_jit_supports_kfunc_call(void); bool bpf_jit_supports_far_kfunc_call(void); bool bpf_jit_supports_exceptions(void); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ae406a2814db..7a33a3a7e63c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2945,6 +2945,11 @@ bool __weak bpf_jit_supports_subprog_tailcalls(void) return false; } +bool __weak bpf_jit_supports_percpu_insn(void) +{ + return false; +} + bool __weak bpf_jit_supports_kfunc_call(void) { return false; diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index bd2e2dd04740..309c4aa1b026 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -172,6 +172,17 @@ static bool is_addr_space_cast(const struct bpf_insn *insn) insn->off == BPF_ADDR_SPACE_CAST; } +/* Special (internal-only) form of mov, used to resolve per-CPU addrs: + * dst_reg = src_reg + + * BPF_ADDR_PERCPU is used as a special insn->off value. + */ +#define BPF_ADDR_PERCPU (-1) + +static inline bool is_mov_percpu_addr(const struct bpf_insn *insn) +{ + return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->off == BPF_ADDR_PERCPU; +} + void print_bpf_insn(const struct bpf_insn_cbs *cbs, const struct bpf_insn *insn, bool allow_ptr_leaks) @@ -194,6 +205,9 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %d, %d)\n", insn->code, insn->dst_reg, insn->src_reg, ((u32)insn->imm) >> 16, (u16)insn->imm); + } else if (is_mov_percpu_addr(insn)) { + verbose(cbs->private_data, "(%02x) r%d = &(void __percpu *)(r%d)\n", + insn->code, insn->dst_reg, insn->src_reg); } else if (BPF_SRC(insn->code) == BPF_X) { verbose(cbs->private_data, "(%02x) %c%d %s %s%c%d\n", insn->code, class == BPF_ALU ? 'w' : 'r', From 1ae6921009e5d72787e07ccc04754514ccf6bc99 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 1 Apr 2024 19:13:03 -0700 Subject: [PATCH 050/147] bpf: inline bpf_get_smp_processor_id() helper If BPF JIT supports per-CPU MOV instruction, inline bpf_get_smp_processor_id() to eliminate unnecessary function calls. Signed-off-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/r/20240402021307.1012571-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fcb62300f407..17c06f1505e4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20074,6 +20074,30 @@ patch_map_ops_generic: goto next_insn; } +#ifdef CONFIG_X86_64 + /* Implement bpf_get_smp_processor_id() inline. */ + if (insn->imm == BPF_FUNC_get_smp_processor_id && + prog->jit_requested && bpf_jit_supports_percpu_insn()) { + /* BPF_FUNC_get_smp_processor_id inlining is an + * optimization, so if pcpu_hot.cpu_number is ever + * changed in some incompatible and hard to support + * way, it's fine to back out this inlining logic + */ + insn_buf[0] = BPF_MOV32_IMM(BPF_REG_0, (u32)(unsigned long)&pcpu_hot.cpu_number); + insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); + insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0); + cnt = 3; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } +#endif /* Implement bpf_get_func_arg inline. */ if (prog_type == BPF_PROG_TYPE_TRACING && insn->imm == BPF_FUNC_get_func_arg) { From db69718b8efac802c7cc20d5a6c7dfc913f99c43 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 1 Apr 2024 19:13:04 -0700 Subject: [PATCH 051/147] bpf: inline bpf_map_lookup_elem() for PERCPU_ARRAY maps Using new per-CPU BPF instruction implement inlining for per-CPU ARRAY map lookup helper, if BPF JIT support is present. Signed-off-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/r/20240402021307.1012571-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 13358675ff2e..8c1e6d7654bb 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -246,6 +246,38 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) return this_cpu_ptr(array->pptrs[index & array->index_mask]); } +/* emit BPF instructions equivalent to C code of percpu_array_map_lookup_elem() */ +static int percpu_array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_insn *insn = insn_buf; + + if (!bpf_jit_supports_percpu_insn()) + return -EOPNOTSUPP; + + if (map->map_flags & BPF_F_INNER_MAP) + return -EOPNOTSUPP; + + BUILD_BUG_ON(offsetof(struct bpf_array, map) != 0); + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct bpf_array, pptrs)); + + *insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 0); + if (!map->bypass_spec_v1) { + *insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 6); + *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_0, array->index_mask); + } else { + *insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 5); + } + + *insn++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); + *insn++ = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); + *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); + *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *insn++ = BPF_MOV64_IMM(BPF_REG_0, 0); + return insn - insn_buf; +} + static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) { struct bpf_array *array = container_of(map, struct bpf_array, map); @@ -776,6 +808,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = percpu_array_map_lookup_elem, + .map_gen_lookup = percpu_array_map_gen_lookup, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, .map_lookup_percpu_elem = percpu_array_map_lookup_percpu_elem, From 0b56e637f705836b9aa51e2b1058c3c814c121a8 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 1 Apr 2024 19:13:05 -0700 Subject: [PATCH 052/147] bpf: inline bpf_map_lookup_elem() helper for PERCPU_HASH map Using new per-CPU BPF instruction, partially inline bpf_map_lookup_elem() helper for per-CPU hashmap BPF map. Just like for normal HASH map, we still generate a call into __htab_map_lookup_elem(), but after that we resolve per-CPU element address using a new instruction, saving on extra functions calls. Signed-off-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/r/20240402021307.1012571-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index e81059faae63..83a9a74260e9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -2308,6 +2308,26 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +/* inline bpf_map_lookup_elem() call for per-CPU hashmap */ +static int htab_percpu_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + + if (!bpf_jit_supports_percpu_insn()) + return -EOPNOTSUPP; + + BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, + (void *(*)(struct bpf_map *map, void *key))NULL)); + *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3); + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, + offsetof(struct htab_elem, key) + map->key_size); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); + *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); + + return insn - insn_buf; +} + static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) { struct htab_elem *l; @@ -2436,6 +2456,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_percpu_map_lookup_elem, + .map_gen_lookup = htab_percpu_map_gen_lookup, .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem, .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, From 1e9e0b85255e6eca6036b59d8a5fbca6501905ac Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 3 Apr 2024 20:47:26 -0700 Subject: [PATCH 053/147] bpf: handle CONFIG_SMP=n configuration in x86 BPF JIT On non-SMP systems, there is no "per-CPU" data, it's just global data. So in such case just don't do this_cpu_off-based per-CPU address adjustment. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202404040951.d4CUx5S6-lkp@intel.com/ Fixes: 7bdbf7446305 ("bpf: add special internal-only MOV instruction to resolve per-CPU addrs") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240404034726.2766740-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index af89dd117dce..964e8154da66 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1383,15 +1383,14 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image EMIT3(0x0F, 0x44, add_2reg(0xC0, AUX_REG, dst_reg)); break; } else if (insn_is_mov_percpu_addr(insn)) { - u32 off = (u32)(unsigned long)&this_cpu_off; - /* mov , (if necessary) */ EMIT_mov(dst_reg, src_reg); - +#ifdef CONFIG_SMP /* add , gs:[] */ EMIT2(0x65, add_1mod(0x48, dst_reg)); EMIT3(0x03, add_1reg(0x04, dst_reg), 0x25); - EMIT(off, 4); + EMIT((u32)(unsigned long)&this_cpu_off, 4); +#endif break; } fallthrough; From af682b767a41772499f8e54ca7d7e1deb3395f44 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2024 16:38:00 -0700 Subject: [PATCH 054/147] bpf: Optimize emit_mov_imm64(). Turned out that bpf prog callback addresses, bpf prog addresses used in bpf_trampoline, and in other cases the 64-bit address can be represented as sign extended 32-bit value. According to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82339 "Skylake has 0.64c throughput for mov r64, imm64, vs. 0.25 for mov r32, imm32." So use shorter encoding and faster instruction when possible. Special care is needed in jit_subprogs(), since bpf_pseudo_func() instruction cannot change its size during the last step of JIT. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/CAADnVQKFfpY-QZBrOU2CG8v2du8Lgyb7MNVmOZVK_yTyOdNbBA@mail.gmail.com Link: https://lore.kernel.org/bpf/20240401233800.42737-1-alexei.starovoitov@gmail.com --- arch/x86/net/bpf_jit_comp.c | 5 ++++- kernel/bpf/verifier.c | 13 ++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 964e8154da66..6cf9a5697c09 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -816,9 +816,10 @@ done: static void emit_mov_imm64(u8 **pprog, u32 dst_reg, const u32 imm32_hi, const u32 imm32_lo) { + u64 imm64 = ((u64)imm32_hi << 32) | (u32)imm32_lo; u8 *prog = *pprog; - if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) { + if (is_uimm32(imm64)) { /* * For emitting plain u32, where sign bit must not be * propagated LLVM tends to load imm64 over mov32 @@ -826,6 +827,8 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg, * 'mov %eax, imm32' instead. */ emit_mov_imm32(&prog, false, dst_reg, imm32_lo); + } else if (is_simm32(imm64)) { + emit_mov_imm32(&prog, true, dst_reg, imm32_lo); } else { /* movabsq rax, imm64 */ EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg)); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 17c06f1505e4..1e03ba9ed07b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19147,12 +19147,19 @@ static int jit_subprogs(struct bpf_verifier_env *env) env->insn_aux_data[i].call_imm = insn->imm; /* point imm to __bpf_call_base+1 from JITs point of view */ insn->imm = 1; - if (bpf_pseudo_func(insn)) + if (bpf_pseudo_func(insn)) { +#if defined(MODULES_VADDR) + u64 addr = MODULES_VADDR; +#else + u64 addr = VMALLOC_START; +#endif /* jit (e.g. x86_64) may emit fewer instructions * if it learns a u32 imm is the same as a u64 imm. - * Force a non zero here. + * Set close enough to possible prog address. */ - insn[1].imm = 1; + insn[0].imm = (u32)addr; + insn[1].imm = addr >> 32; + } } err = bpf_prog_alloc_jited_linfo(prog); From 633a6e01d1a20b24a16899094c249a8cb2aad4b2 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 4 Apr 2024 11:42:02 +0000 Subject: [PATCH 055/147] bpf, riscv: Implement PROBE_MEM32 pseudo instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for [LDX | STX | ST], PROBE_MEM32, [B | H | W | DW] instructions. They are similar to PROBE_MEM instructions with the following differences: - PROBE_MEM32 supports store. - PROBE_MEM32 relies on the verifier to clear upper 32-bit of the src/dst register - PROBE_MEM32 adds 64-bit kern_vm_start address (which is stored in S7 in the prologue). Due to bpf_arena constructions such S7 + reg + off16 access is guaranteed to be within arena virtual range, so no address check at run-time. - S11 is a free callee-saved register, so it is used to store kern_vm_start - PROBE_MEM32 allows STX and ST. If they fault the store is a nop. When LDX faults the destination register is zeroed. To support these on riscv, we do tmp = S7 + src/dst reg and then use tmp2 as the new src/dst register. This allows us to reuse most of the code for normal [LDX | STX | ST]. Signed-off-by: Puranjay Mohan Signed-off-by: Daniel Borkmann Tested-by: Björn Töpel Tested-by: Pu Lehui Reviewed-by: Pu Lehui Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20240404114203.105970-2-puranjay12@gmail.com --- arch/riscv/net/bpf_jit.h | 1 + arch/riscv/net/bpf_jit_comp64.c | 189 +++++++++++++++++++++++++++++++- arch/riscv/net/bpf_jit_core.c | 1 + 3 files changed, 189 insertions(+), 2 deletions(-) diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h index f4b6b3b9edda..8a47da08dd9c 100644 --- a/arch/riscv/net/bpf_jit.h +++ b/arch/riscv/net/bpf_jit.h @@ -81,6 +81,7 @@ struct rv_jit_context { int nexentries; unsigned long flags; int stack_size; + u64 arena_vm_start; }; /* Convert from ninsns to bytes. */ diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 1adf2f39ce59..a4c8e1e6c1e2 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -18,6 +18,7 @@ #define RV_REG_TCC RV_REG_A6 #define RV_REG_TCC_SAVED RV_REG_S6 /* Store A6 in S6 if program do calls */ +#define RV_REG_ARENA RV_REG_S7 /* For storing arena_vm_start */ static const int regmap[] = { [BPF_REG_0] = RV_REG_A5, @@ -255,6 +256,10 @@ static void __build_epilogue(bool is_tail_call, struct rv_jit_context *ctx) emit_ld(RV_REG_S6, store_offset, RV_REG_SP, ctx); store_offset -= 8; } + if (ctx->arena_vm_start) { + emit_ld(RV_REG_ARENA, store_offset, RV_REG_SP, ctx); + store_offset -= 8; + } emit_addi(RV_REG_SP, RV_REG_SP, stack_adjust, ctx); /* Set return value. */ @@ -548,6 +553,7 @@ static void emit_atomic(u8 rd, u8 rs, s16 off, s32 imm, bool is64, #define BPF_FIXUP_OFFSET_MASK GENMASK(26, 0) #define BPF_FIXUP_REG_MASK GENMASK(31, 27) +#define REG_DONT_CLEAR_MARKER 0 /* RV_REG_ZERO unused in pt_regmap */ bool ex_handler_bpf(const struct exception_table_entry *ex, struct pt_regs *regs) @@ -555,7 +561,8 @@ bool ex_handler_bpf(const struct exception_table_entry *ex, off_t offset = FIELD_GET(BPF_FIXUP_OFFSET_MASK, ex->fixup); int regs_offset = FIELD_GET(BPF_FIXUP_REG_MASK, ex->fixup); - *(unsigned long *)((void *)regs + pt_regmap[regs_offset]) = 0; + if (regs_offset != REG_DONT_CLEAR_MARKER) + *(unsigned long *)((void *)regs + pt_regmap[regs_offset]) = 0; regs->epc = (unsigned long)&ex->fixup - offset; return true; @@ -572,7 +579,8 @@ static int add_exception_handler(const struct bpf_insn *insn, off_t fixup_offset; if (!ctx->insns || !ctx->ro_insns || !ctx->prog->aux->extable || - (BPF_MODE(insn->code) != BPF_PROBE_MEM && BPF_MODE(insn->code) != BPF_PROBE_MEMSX)) + (BPF_MODE(insn->code) != BPF_PROBE_MEM && BPF_MODE(insn->code) != BPF_PROBE_MEMSX && + BPF_MODE(insn->code) != BPF_PROBE_MEM32)) return 0; if (WARN_ON_ONCE(ctx->nexentries >= ctx->prog->aux->num_exentries)) @@ -1539,6 +1547,11 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, case BPF_LDX | BPF_PROBE_MEMSX | BPF_B: case BPF_LDX | BPF_PROBE_MEMSX | BPF_H: case BPF_LDX | BPF_PROBE_MEMSX | BPF_W: + /* LDX | PROBE_MEM32: dst = *(unsigned size *)(src + RV_REG_ARENA + off) */ + case BPF_LDX | BPF_PROBE_MEM32 | BPF_B: + case BPF_LDX | BPF_PROBE_MEM32 | BPF_H: + case BPF_LDX | BPF_PROBE_MEM32 | BPF_W: + case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW: { int insn_len, insns_start; bool sign_ext; @@ -1546,6 +1559,11 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, sign_ext = BPF_MODE(insn->code) == BPF_MEMSX || BPF_MODE(insn->code) == BPF_PROBE_MEMSX; + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) { + emit_add(RV_REG_T2, rs, RV_REG_ARENA, ctx); + rs = RV_REG_T2; + } + switch (BPF_SIZE(code)) { case BPF_B: if (is_12b_int(off)) { @@ -1682,6 +1700,86 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, emit_sd(RV_REG_T2, 0, RV_REG_T1, ctx); break; + case BPF_ST | BPF_PROBE_MEM32 | BPF_B: + case BPF_ST | BPF_PROBE_MEM32 | BPF_H: + case BPF_ST | BPF_PROBE_MEM32 | BPF_W: + case BPF_ST | BPF_PROBE_MEM32 | BPF_DW: + { + int insn_len, insns_start; + + emit_add(RV_REG_T3, rd, RV_REG_ARENA, ctx); + rd = RV_REG_T3; + + /* Load imm to a register then store it */ + emit_imm(RV_REG_T1, imm, ctx); + + switch (BPF_SIZE(code)) { + case BPF_B: + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + emit(rv_sb(rd, off, RV_REG_T1), ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + emit_imm(RV_REG_T2, off, ctx); + emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); + insns_start = ctx->ninsns; + emit(rv_sb(RV_REG_T2, 0, RV_REG_T1), ctx); + insn_len = ctx->ninsns - insns_start; + break; + case BPF_H: + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + emit(rv_sh(rd, off, RV_REG_T1), ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + emit_imm(RV_REG_T2, off, ctx); + emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); + insns_start = ctx->ninsns; + emit(rv_sh(RV_REG_T2, 0, RV_REG_T1), ctx); + insn_len = ctx->ninsns - insns_start; + break; + case BPF_W: + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + emit_sw(rd, off, RV_REG_T1, ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + emit_imm(RV_REG_T2, off, ctx); + emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); + insns_start = ctx->ninsns; + emit_sw(RV_REG_T2, 0, RV_REG_T1, ctx); + insn_len = ctx->ninsns - insns_start; + break; + case BPF_DW: + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + emit_sd(rd, off, RV_REG_T1, ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + emit_imm(RV_REG_T2, off, ctx); + emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); + insns_start = ctx->ninsns; + emit_sd(RV_REG_T2, 0, RV_REG_T1, ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + ret = add_exception_handler(insn, ctx, REG_DONT_CLEAR_MARKER, + insn_len); + if (ret) + return ret; + + break; + } + /* STX: *(size *)(dst + off) = src */ case BPF_STX | BPF_MEM | BPF_B: if (is_12b_int(off)) { @@ -1728,6 +1826,84 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, emit_atomic(rd, rs, off, imm, BPF_SIZE(code) == BPF_DW, ctx); break; + + case BPF_STX | BPF_PROBE_MEM32 | BPF_B: + case BPF_STX | BPF_PROBE_MEM32 | BPF_H: + case BPF_STX | BPF_PROBE_MEM32 | BPF_W: + case BPF_STX | BPF_PROBE_MEM32 | BPF_DW: + { + int insn_len, insns_start; + + emit_add(RV_REG_T2, rd, RV_REG_ARENA, ctx); + rd = RV_REG_T2; + + switch (BPF_SIZE(code)) { + case BPF_B: + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + emit(rv_sb(rd, off, rs), ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); + insns_start = ctx->ninsns; + emit(rv_sb(RV_REG_T1, 0, rs), ctx); + insn_len = ctx->ninsns - insns_start; + break; + case BPF_H: + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + emit(rv_sh(rd, off, rs), ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); + insns_start = ctx->ninsns; + emit(rv_sh(RV_REG_T1, 0, rs), ctx); + insn_len = ctx->ninsns - insns_start; + break; + case BPF_W: + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + emit_sw(rd, off, rs, ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); + insns_start = ctx->ninsns; + emit_sw(RV_REG_T1, 0, rs, ctx); + insn_len = ctx->ninsns - insns_start; + break; + case BPF_DW: + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + emit_sd(rd, off, rs, ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); + insns_start = ctx->ninsns; + emit_sd(RV_REG_T1, 0, rs, ctx); + insn_len = ctx->ninsns - insns_start; + break; + } + + ret = add_exception_handler(insn, ctx, REG_DONT_CLEAR_MARKER, + insn_len); + if (ret) + return ret; + + break; + } + default: pr_err("bpf-jit: unknown opcode %02x\n", code); return -EINVAL; @@ -1759,6 +1935,8 @@ void bpf_jit_build_prologue(struct rv_jit_context *ctx, bool is_subprog) stack_adjust += 8; if (seen_reg(RV_REG_S6, ctx)) stack_adjust += 8; + if (ctx->arena_vm_start) + stack_adjust += 8; stack_adjust = round_up(stack_adjust, 16); stack_adjust += bpf_stack_adjust; @@ -1810,6 +1988,10 @@ void bpf_jit_build_prologue(struct rv_jit_context *ctx, bool is_subprog) emit_sd(RV_REG_SP, store_offset, RV_REG_S6, ctx); store_offset -= 8; } + if (ctx->arena_vm_start) { + emit_sd(RV_REG_SP, store_offset, RV_REG_ARENA, ctx); + store_offset -= 8; + } emit_addi(RV_REG_FP, RV_REG_SP, stack_adjust, ctx); @@ -1823,6 +2005,9 @@ void bpf_jit_build_prologue(struct rv_jit_context *ctx, bool is_subprog) emit_mv(RV_REG_TCC_SAVED, RV_REG_TCC, ctx); ctx->stack_size = stack_adjust; + + if (ctx->arena_vm_start) + emit_imm(RV_REG_ARENA, ctx->arena_vm_start, ctx); } void bpf_jit_build_epilogue(struct rv_jit_context *ctx) diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c index 6b3acac30c06..9ab739b9f9a2 100644 --- a/arch/riscv/net/bpf_jit_core.c +++ b/arch/riscv/net/bpf_jit_core.c @@ -80,6 +80,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) goto skip_init_ctx; } + ctx->arena_vm_start = bpf_arena_get_kern_vm_start(prog->aux->arena); ctx->prog = prog; ctx->offset = kcalloc(prog->len, sizeof(int), GFP_KERNEL); if (!ctx->offset) { From 21ab0b6d0cfcb8aa98e33baa83f933f963514027 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 4 Apr 2024 11:42:03 +0000 Subject: [PATCH 056/147] bpf, riscv: Implement bpf_addr_space_cast instruction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LLVM generates bpf_addr_space_cast instruction while translating pointers between native (zero) address space and __attribute__((address_space(N))). The addr_space=0 is reserved as bpf_arena address space. rY = addr_space_cast(rX, 0, 1) is processed by the verifier and converted to normal 32-bit move: wX = wY rY = addr_space_cast(rX, 1, 0) has to be converted by JIT. Signed-off-by: Puranjay Mohan Signed-off-by: Daniel Borkmann Tested-by: Björn Töpel Tested-by: Pu Lehui Reviewed-by: Pu Lehui Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20240404114203.105970-3-puranjay12@gmail.com --- arch/riscv/net/bpf_jit.h | 1 + arch/riscv/net/bpf_jit_comp64.c | 14 ++++++++++++++ arch/riscv/net/bpf_jit_core.c | 1 + 3 files changed, 16 insertions(+) diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h index 8a47da08dd9c..5fc374ed98ea 100644 --- a/arch/riscv/net/bpf_jit.h +++ b/arch/riscv/net/bpf_jit.h @@ -82,6 +82,7 @@ struct rv_jit_context { unsigned long flags; int stack_size; u64 arena_vm_start; + u64 user_vm_start; }; /* Convert from ninsns to bytes. */ diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index a4c8e1e6c1e2..15e482f2c657 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -1081,6 +1081,15 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, /* dst = src */ case BPF_ALU | BPF_MOV | BPF_X: case BPF_ALU64 | BPF_MOV | BPF_X: + if (insn_is_cast_user(insn)) { + emit_mv(RV_REG_T1, rs, ctx); + emit_zextw(RV_REG_T1, RV_REG_T1, ctx); + emit_imm(rd, (ctx->user_vm_start >> 32) << 32, ctx); + emit(rv_beq(RV_REG_T1, RV_REG_ZERO, 4), ctx); + emit_or(RV_REG_T1, rd, RV_REG_T1, ctx); + emit_mv(rd, RV_REG_T1, ctx); + break; + } if (imm == 1) { /* Special mov32 for zext */ emit_zextw(rd, rd, ctx); @@ -2024,3 +2033,8 @@ bool bpf_jit_supports_ptr_xchg(void) { return true; } + +bool bpf_jit_supports_arena(void) +{ + return true; +} diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c index 9ab739b9f9a2..8a69d6d81e32 100644 --- a/arch/riscv/net/bpf_jit_core.c +++ b/arch/riscv/net/bpf_jit_core.c @@ -81,6 +81,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) } ctx->arena_vm_start = bpf_arena_get_kern_vm_start(prog->aux->arena); + ctx->user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena); ctx->prog = prog; ctx->offset = kcalloc(prog->len, sizeof(int), GFP_KERNEL); if (!ctx->offset) { From 5e6a3c1ee693da1793739bb378b224bcf33d7f14 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 3 Apr 2024 17:26:39 -0700 Subject: [PATCH 057/147] bpf: make bpf_get_branch_snapshot() architecture-agnostic perf_snapshot_branch_stack is set up in an architecture-agnostic way, so there is no reason for BPF subsystem to keep track of which architectures do support LBR or not. E.g., it looks like ARM64 might soon get support for BRBE ([0]), which (with proper integration) should be possible to utilize using this BPF helper. perf_snapshot_branch_stack static call will point to __static_call_return0() by default, which just returns zero, which will lead to -ENOENT, as expected. So no need to guard anything here. [0] https://lore.kernel.org/linux-arm-kernel/20240125094119.2542332-1-anshuman.khandual@arm.com/ Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240404002640.1774210-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6d0c95638e1b..afb232b1d7c2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1188,9 +1188,6 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto_tracing = { BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags) { -#ifndef CONFIG_X86 - return -ENOENT; -#else static const u32 br_entry_size = sizeof(struct perf_branch_entry); u32 entry_cnt = size / br_entry_size; @@ -1203,7 +1200,6 @@ BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags) return -ENOENT; return entry_cnt * br_entry_size; -#endif } static const struct bpf_func_proto bpf_get_branch_snapshot_proto = { From 314a53623cd4e62e1b88126e5ed2bc87073d90ee Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 3 Apr 2024 17:26:40 -0700 Subject: [PATCH 058/147] bpf: inline bpf_get_branch_snapshot() helper Inline bpf_get_branch_snapshot() helper using architecture-agnostic inline BPF code which calls directly into underlying callback of perf_snapshot_branch_stack static call. This callback is set early during kernel initialization and is never updated or reset, so it's ok to fetch actual implementation using static_call_query() and call directly into it. This change eliminates a full function call and saves one LBR entry in PERF_SAMPLE_BRANCH_ANY LBR mode. Acked-by: John Fastabend Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240404002640.1774210-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 55 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1e03ba9ed07b..ffaa9f7f153c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20188,6 +20188,61 @@ patch_map_ops_generic: goto next_insn; } + /* Implement bpf_get_branch_snapshot inline. */ + if (prog->jit_requested && BITS_PER_LONG == 64 && + insn->imm == BPF_FUNC_get_branch_snapshot) { + /* We are dealing with the following func protos: + * u64 bpf_get_branch_snapshot(void *buf, u32 size, u64 flags); + * int perf_snapshot_branch_stack(struct perf_branch_entry *entries, u32 cnt); + */ + const u32 br_entry_size = sizeof(struct perf_branch_entry); + + /* struct perf_branch_entry is part of UAPI and is + * used as an array element, so extremely unlikely to + * ever grow or shrink + */ + BUILD_BUG_ON(br_entry_size != 24); + + /* if (unlikely(flags)) return -EINVAL */ + insn_buf[0] = BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 0, 7); + + /* Transform size (bytes) into number of entries (cnt = size / 24). + * But to avoid expensive division instruction, we implement + * divide-by-3 through multiplication, followed by further + * division by 8 through 3-bit right shift. + * Refer to book "Hacker's Delight, 2nd ed." by Henry S. Warren, Jr., + * p. 227, chapter "Unsigned Divison by 3" for details and proofs. + * + * N / 3 <=> M * N / 2^33, where M = (2^33 + 1) / 3 = 0xaaaaaaab. + */ + insn_buf[1] = BPF_MOV32_IMM(BPF_REG_0, 0xaaaaaaab); + insn_buf[2] = BPF_ALU64_REG(BPF_MUL, BPF_REG_2, BPF_REG_0); + insn_buf[3] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36); + + /* call perf_snapshot_branch_stack implementation */ + insn_buf[4] = BPF_EMIT_CALL(static_call_query(perf_snapshot_branch_stack)); + /* if (entry_cnt == 0) return -ENOENT */ + insn_buf[5] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4); + /* return entry_cnt * sizeof(struct perf_branch_entry) */ + insn_buf[6] = BPF_ALU32_IMM(BPF_MUL, BPF_REG_0, br_entry_size); + insn_buf[7] = BPF_JMP_A(3); + /* return -EINVAL; */ + insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL); + insn_buf[9] = BPF_JMP_A(1); + /* return -ENOENT; */ + insn_buf[10] = BPF_MOV64_IMM(BPF_REG_0, -ENOENT); + cnt = 11; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + /* Implement bpf_kptr_xchg inline */ if (prog->jit_requested && BITS_PER_LONG == 64 && insn->imm == BPF_FUNC_kptr_xchg && From 478a535ae54ad3831371904d93b5dfc403222e17 Mon Sep 17 00:00:00 2001 From: Sahil Siddiq Date: Fri, 5 Apr 2024 00:52:19 +0530 Subject: [PATCH 059/147] bpftool: Mount bpffs on provided dir instead of parent dir When pinning programs/objects under PATH (eg: during "bpftool prog loadall") the bpffs is mounted on the parent dir of PATH in the following situations: - the given dir exists but it is not bpffs. - the given dir doesn't exist and the parent dir is not bpffs. Mounting on the parent dir can also have the unintentional side- effect of hiding other files located under the parent dir. If the given dir exists but is not bpffs, then the bpffs should be mounted on the given dir and not its parent dir. Similarly, if the given dir doesn't exist and its parent dir is not bpffs, then the given dir should be created and the bpffs should be mounted on this new dir. Fixes: 2a36c26fe3b8 ("bpftool: Support bpffs mountpoint as pin path for prog loadall") Signed-off-by: Sahil Siddiq Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/2da44d24-74ae-a564-1764-afccf395eeec@isovalent.com/T/#t Link: https://lore.kernel.org/bpf/20240404192219.52373-1-icegambit91@gmail.com Closes: https://github.com/libbpf/bpftool/issues/100 Changes since v1: - Split "mount_bpffs_for_pin" into two functions. This is done to improve maintainability and readability. Changes since v2: - mount_bpffs_for_pin: rename to "create_and_mount_bpffs_dir". - mount_bpffs_given_file: rename to "mount_bpffs_given_file". - create_and_mount_bpffs_dir: - introduce "dir_exists" boolean. - remove new dir if "mnt_fs" fails. - improve error handling and error messages. Changes since v3: - Rectify function name. - Improve error messages and formatting. - mount_bpffs_for_file: - Check if dir exists before block_mount check. Changes since v4: - Use strdup instead of strcpy. - create_and_mount_bpffs_dir: - Use S_IRWXU instead of 0700. - Improve error handling and formatting. --- tools/bpf/bpftool/common.c | 96 +++++++++++++++++++++++++++++----- tools/bpf/bpftool/iter.c | 2 +- tools/bpf/bpftool/main.h | 3 +- tools/bpf/bpftool/prog.c | 5 +- tools/bpf/bpftool/struct_ops.c | 2 +- 5 files changed, 92 insertions(+), 16 deletions(-) diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index cc6e6aae2447..958e92acca8e 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -244,29 +244,101 @@ int open_obj_pinned_any(const char *path, enum bpf_obj_type exp_type) return fd; } -int mount_bpffs_for_pin(const char *name, bool is_dir) +int create_and_mount_bpffs_dir(const char *dir_name) { char err_str[ERR_MAX_LEN]; - char *file; + bool dir_exists; + int err = 0; + + if (is_bpffs(dir_name)) + return err; + + dir_exists = access(dir_name, F_OK) == 0; + + if (!dir_exists) { + char *temp_name; + char *parent_name; + + temp_name = strdup(dir_name); + if (!temp_name) { + p_err("mem alloc failed"); + return -1; + } + + parent_name = dirname(temp_name); + + if (is_bpffs(parent_name)) { + /* nothing to do if already mounted */ + free(temp_name); + return err; + } + + if (access(parent_name, F_OK) == -1) { + p_err("can't create dir '%s' to pin BPF object: parent dir '%s' doesn't exist", + dir_name, parent_name); + free(temp_name); + return -1; + } + + free(temp_name); + } + + if (block_mount) { + p_err("no BPF file system found, not mounting it due to --nomount option"); + return -1; + } + + if (!dir_exists) { + err = mkdir(dir_name, S_IRWXU); + if (err) { + p_err("failed to create dir '%s': %s", dir_name, strerror(errno)); + return err; + } + } + + err = mnt_fs(dir_name, "bpf", err_str, ERR_MAX_LEN); + if (err) { + err_str[ERR_MAX_LEN - 1] = '\0'; + p_err("can't mount BPF file system on given dir '%s': %s", + dir_name, err_str); + + if (!dir_exists) + rmdir(dir_name); + } + + return err; +} + +int mount_bpffs_for_file(const char *file_name) +{ + char err_str[ERR_MAX_LEN]; + char *temp_name; char *dir; int err = 0; - if (is_dir && is_bpffs(name)) - return err; + if (access(file_name, F_OK) != -1) { + p_err("can't pin BPF object: path '%s' already exists", file_name); + return -1; + } - file = malloc(strlen(name) + 1); - if (!file) { + temp_name = strdup(file_name); + if (!temp_name) { p_err("mem alloc failed"); return -1; } - strcpy(file, name); - dir = dirname(file); + dir = dirname(temp_name); if (is_bpffs(dir)) /* nothing to do if already mounted */ goto out_free; + if (access(dir, F_OK) == -1) { + p_err("can't pin BPF object: dir '%s' doesn't exist", dir); + err = -1; + goto out_free; + } + if (block_mount) { p_err("no BPF file system found, not mounting it due to --nomount option"); err = -1; @@ -276,12 +348,12 @@ int mount_bpffs_for_pin(const char *name, bool is_dir) err = mnt_fs(dir, "bpf", err_str, ERR_MAX_LEN); if (err) { err_str[ERR_MAX_LEN - 1] = '\0'; - p_err("can't mount BPF file system to pin the object (%s): %s", - name, err_str); + p_err("can't mount BPF file system to pin the object '%s': %s", + file_name, err_str); } out_free: - free(file); + free(temp_name); return err; } @@ -289,7 +361,7 @@ int do_pin_fd(int fd, const char *name) { int err; - err = mount_bpffs_for_pin(name, false); + err = mount_bpffs_for_file(name); if (err) return err; diff --git a/tools/bpf/bpftool/iter.c b/tools/bpf/bpftool/iter.c index 6b0e5202ca7a..5c39c2ed36a2 100644 --- a/tools/bpf/bpftool/iter.c +++ b/tools/bpf/bpftool/iter.c @@ -76,7 +76,7 @@ static int do_pin(int argc, char **argv) goto close_obj; } - err = mount_bpffs_for_pin(path, false); + err = mount_bpffs_for_file(path); if (err) goto close_link; diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index b8bb08d10dec..9eb764fe4cc8 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -142,7 +142,8 @@ const char *get_fd_type_name(enum bpf_obj_type type); char *get_fdinfo(int fd, const char *key); int open_obj_pinned(const char *path, bool quiet); int open_obj_pinned_any(const char *path, enum bpf_obj_type exp_type); -int mount_bpffs_for_pin(const char *name, bool is_dir); +int mount_bpffs_for_file(const char *file_name); +int create_and_mount_bpffs_dir(const char *dir_name); int do_pin_any(int argc, char **argv, int (*get_fd_by_id)(int *, char ***)); int do_pin_fd(int fd, const char *name); diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 9cb42a3366c0..4c4cf16a40ba 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -1778,7 +1778,10 @@ offload_dev: goto err_close_obj; } - err = mount_bpffs_for_pin(pinfile, !first_prog_only); + if (first_prog_only) + err = mount_bpffs_for_file(pinfile); + else + err = create_and_mount_bpffs_dir(pinfile); if (err) goto err_close_obj; diff --git a/tools/bpf/bpftool/struct_ops.c b/tools/bpf/bpftool/struct_ops.c index d573f2640d8e..aa43dead249c 100644 --- a/tools/bpf/bpftool/struct_ops.c +++ b/tools/bpf/bpftool/struct_ops.c @@ -515,7 +515,7 @@ static int do_register(int argc, char **argv) if (argc == 1) linkdir = GET_ARG(); - if (linkdir && mount_bpffs_for_pin(linkdir, true)) { + if (linkdir && create_and_mount_bpffs_dir(linkdir)) { p_err("can't mount bpffs for pinning"); return -1; } From f91717007217d975aa975ddabd91ae1a107b9bff Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Wed, 3 Apr 2024 14:33:03 +0200 Subject: [PATCH 060/147] bpf: Pack struct bpf_fib_lookup The struct bpf_fib_lookup is supposed to be of size 64. A recent commit 59b418c7063d ("bpf: Add a check for struct bpf_fib_lookup size") added a static assertion to check this property so that future changes to the structure will not accidentally break this assumption. As it immediately turned out, on some 32-bit arm systems, when AEABI=n, the total size of the structure was equal to 68, see [1]. This happened because the bpf_fib_lookup structure contains a union of two 16-bit fields: union { __u16 tot_len; __u16 mtu_result; }; which was supposed to compile to a 16-bit-aligned 16-bit field. On the aforementioned setups it was instead both aligned and padded to 32-bits. Declare this inner union as __attribute__((packed, aligned(2))) such that it always is of size 2 and is aligned to 16 bits. [1] https://lore.kernel.org/all/CA+G9fYtsoP51f-oP_Sp5MOq-Ffv8La2RztNpwvE6+R1VtFiLrw@mail.gmail.com/#t Reported-by: Naresh Kamboju Fixes: e1850ea9bd9e ("bpf: bpf_fib_lookup return MTU value as output when looked up") Signed-off-by: Anton Protopopov Signed-off-by: Andrii Nakryiko Reviewed-by: Alexander Lobakin Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240403123303.1452184-1-aspsk@isovalent.com --- include/uapi/linux/bpf.h | 2 +- tools/include/uapi/linux/bpf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 79c548276b6b..6fe9f11c8abe 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -7157,7 +7157,7 @@ struct bpf_fib_lookup { /* output: MTU value */ __u16 mtu_result; - }; + } __attribute__((packed, aligned(2))); /* input: L3 device index for lookup * output: device index from FIB lookup */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 79c548276b6b..6fe9f11c8abe 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -7157,7 +7157,7 @@ struct bpf_fib_lookup { /* output: MTU value */ __u16 mtu_result; - }; + } __attribute__((packed, aligned(2))); /* input: L3 device index for lookup * output: device index from FIB lookup */ From 1f2a74b41ea8b902687eb97c4e7e3f558801865b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 4 Apr 2024 14:45:35 -0700 Subject: [PATCH 061/147] bpf: prevent r10 register from being marked as precise r10 is a special register that is not under BPF program's control and is always effectively precise. The rest of precision logic assumes that only r0-r9 SCALAR registers are marked as precise, so prevent r10 from being marked precise. This can happen due to signed cast instruction allowing to do something like `r0 = (s8)r10;`, which later, if r0 needs to be precise, would lead to an attempt to mark r10 as precise. Prevent this with an extra check during instruction backtracking. Fixes: 8100928c8814 ("bpf: Support new sign-extension mov insns") Reported-by: syzbot+148110ee7cf72f39f33e@syzkaller.appspotmail.com Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240404214536.3551295-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ffaa9f7f153c..41446bea8fab 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3615,7 +3615,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * sreg needs precision before this insn */ bt_clear_reg(bt, dreg); - bt_set_reg(bt, sreg); + if (sreg != BPF_REG_FP) + bt_set_reg(bt, sreg); } else { /* dreg = K * dreg needs precision after this insn. @@ -3631,7 +3632,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * both dreg and sreg need precision * before this insn */ - bt_set_reg(bt, sreg); + if (sreg != BPF_REG_FP) + bt_set_reg(bt, sreg); } /* else dreg += K * dreg still needs precision before this insn */ From 343ca8131c35ba132d200fd9752b60e65357924d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 4 Apr 2024 14:45:36 -0700 Subject: [PATCH 062/147] selftests/bpf: add fp-leaking precise subprog result tests Add selftests validating that BPF verifier handles precision marking for SCALAR registers derived from r10 (fp) register correctly. Given `r0 = (s8)r10;` syntax is not supported by older Clang compilers, use the raw BPF instruction syntax to maximize compatibility. Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240404214536.3551295-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/include/linux/filter.h | 18 ++++ .../bpf/progs/verifier_subprog_precision.c | 89 +++++++++++++++++++ 2 files changed, 107 insertions(+) diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h index 736bdeccdfe4..65aa8ce142e5 100644 --- a/tools/include/linux/filter.h +++ b/tools/include/linux/filter.h @@ -111,6 +111,24 @@ .off = 0, \ .imm = IMM }) +/* Short form of movsx, dst_reg = (s8,s16,s32)src_reg */ + +#define BPF_MOVSX64_REG(DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +#define BPF_MOVSX32_REG(DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + /* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32 */ #define BPF_MOV64_RAW(TYPE, DST, SRC, IMM) \ diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c index 6f5d19665cf6..4a58e0398e72 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c @@ -6,6 +6,7 @@ #include #include #include "bpf_misc.h" +#include <../../../tools/include/linux/filter.h> #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0])) @@ -76,6 +77,94 @@ __naked int subprog_result_precise(void) ); } +__naked __noinline __used +static unsigned long fp_leaking_subprog() +{ + asm volatile ( + ".8byte %[r0_eq_r10_cast_s8];" + "exit;" + :: __imm_insn(r0_eq_r10_cast_s8, BPF_MOVSX64_REG(BPF_REG_0, BPF_REG_10, 8)) + ); +} + +__naked __noinline __used +static unsigned long sneaky_fp_leaking_subprog() +{ + asm volatile ( + "r1 = r10;" + ".8byte %[r0_eq_r1_cast_s8];" + "exit;" + :: __imm_insn(r0_eq_r1_cast_s8, BPF_MOVSX64_REG(BPF_REG_0, BPF_REG_1, 8)) + ); +} + +SEC("?raw_tp") +__success __log_level(2) +__msg("6: (0f) r1 += r0") +__msg("mark_precise: frame0: last_idx 6 first_idx 0 subseq_idx -1") +__msg("mark_precise: frame0: regs=r0 stack= before 5: (bf) r1 = r6") +__msg("mark_precise: frame0: regs=r0 stack= before 4: (27) r0 *= 4") +__msg("mark_precise: frame0: regs=r0 stack= before 3: (57) r0 &= 3") +__msg("mark_precise: frame0: regs=r0 stack= before 10: (95) exit") +__msg("mark_precise: frame1: regs=r0 stack= before 9: (bf) r0 = (s8)r10") +__msg("7: R0_w=scalar") +__naked int fp_precise_subprog_result(void) +{ + asm volatile ( + "call fp_leaking_subprog;" + /* use subprog's returned value (which is derived from r10=fp + * register), as index into vals array, forcing all of that to + * be known precisely + */ + "r0 &= 3;" + "r0 *= 4;" + "r1 = %[vals];" + /* force precision marking */ + "r1 += r0;" + "r0 = *(u32 *)(r1 + 0);" + "exit;" + : + : __imm_ptr(vals) + : __clobber_common + ); +} + +SEC("?raw_tp") +__success __log_level(2) +__msg("6: (0f) r1 += r0") +__msg("mark_precise: frame0: last_idx 6 first_idx 0 subseq_idx -1") +__msg("mark_precise: frame0: regs=r0 stack= before 5: (bf) r1 = r6") +__msg("mark_precise: frame0: regs=r0 stack= before 4: (27) r0 *= 4") +__msg("mark_precise: frame0: regs=r0 stack= before 3: (57) r0 &= 3") +__msg("mark_precise: frame0: regs=r0 stack= before 11: (95) exit") +__msg("mark_precise: frame1: regs=r0 stack= before 10: (bf) r0 = (s8)r1") +/* here r1 is marked precise, even though it's fp register, but that's fine + * because by the time we get out of subprogram it has to be derived from r10 + * anyways, at which point we'll break precision chain + */ +__msg("mark_precise: frame1: regs=r1 stack= before 9: (bf) r1 = r10") +__msg("7: R0_w=scalar") +__naked int sneaky_fp_precise_subprog_result(void) +{ + asm volatile ( + "call sneaky_fp_leaking_subprog;" + /* use subprog's returned value (which is derived from r10=fp + * register), as index into vals array, forcing all of that to + * be known precisely + */ + "r0 &= 3;" + "r0 *= 4;" + "r1 = %[vals];" + /* force precision marking */ + "r1 += r0;" + "r0 = *(u32 *)(r1 + 0);" + "exit;" + : + : __imm_ptr(vals) + : __clobber_common + ); +} + SEC("?raw_tp") __success __log_level(2) __msg("9: (0f) r1 += r0") From 58babe27180c8d4cb54d831589cf801bd9268876 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 5 Apr 2024 16:26:25 +0200 Subject: [PATCH 063/147] bpf: fix perf_snapshot_branch_stack link failure The newly added code to handle bpf_get_branch_snapshot fails to link when CONFIG_PERF_EVENTS is disabled: aarch64-linux-ld: kernel/bpf/verifier.o: in function `do_misc_fixups': verifier.c:(.text+0x1090c): undefined reference to `__SCK__perf_snapshot_branch_stack' Add a build-time check for that Kconfig symbol around the code to remove the link time dependency. Fixes: 314a53623cd4 ("bpf: inline bpf_get_branch_snapshot() helper") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20240405142637.577046-1-arnd@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 41446bea8fab..4353cb09c35b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20191,7 +20191,8 @@ patch_map_ops_generic: } /* Implement bpf_get_branch_snapshot inline. */ - if (prog->jit_requested && BITS_PER_LONG == 64 && + if (IS_ENABLED(CONFIG_PERF_EVENTS) && + prog->jit_requested && BITS_PER_LONG == 64 && insn->imm == BPF_FUNC_get_branch_snapshot) { /* We are dealing with the following func protos: * u64 bpf_get_branch_snapshot(void *buf, u32 size, u64 flags); From 0a525621b7e5b49202b19d8f75382c6778fdd0c1 Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Fri, 5 Apr 2024 10:55:34 +0800 Subject: [PATCH 064/147] bpf: store both map ptr and state in bpf_insn_aux_data Currently, bpf_insn_aux_data->map_ptr_state is used to store either map_ptr or its poison state (i.e., BPF_MAP_PTR_POISON). Thus BPF_MAP_PTR_POISON must be checked before reading map_ptr. In certain cases, we may need valid map_ptr even in case of poison state. This will be explained in next patch with bpf_for_each_map_elem() helper. This patch changes map_ptr_state into a new struct including both map pointer and its state (poison/unpriv). It's in the same union with struct bpf_loop_inline_state, so there is no extra memory overhead. Besides, macros BPF_MAP_PTR_UNPRIV/BPF_MAP_PTR_POISON/BPF_MAP_PTR are no longer needed. This patch does not change any existing functionality. Signed-off-by: Philo Lu Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240405025536.18113-2-lulie@linux.alibaba.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 9 ++++++++- kernel/bpf/verifier.c | 36 ++++++++++++++++-------------------- 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7cb1b75eee38..36d19cd32eb5 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -502,6 +502,13 @@ struct bpf_loop_inline_state { u32 callback_subprogno; /* valid when fit_for_inline is true */ }; +/* pointer and state for maps */ +struct bpf_map_ptr_state { + struct bpf_map *map_ptr; + bool poison; + bool unpriv; +}; + /* Possible states for alu_state member. */ #define BPF_ALU_SANITIZE_SRC (1U << 0) #define BPF_ALU_SANITIZE_DST (1U << 1) @@ -514,7 +521,7 @@ struct bpf_loop_inline_state { struct bpf_insn_aux_data { union { enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ - unsigned long map_ptr_state; /* pointer/poison value for maps */ + struct bpf_map_ptr_state map_ptr_state; s32 call_imm; /* saved imm field of call insn */ u32 alu_limit; /* limit for add/sub register with pointer */ struct { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4353cb09c35b..62fedb1f3312 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -190,11 +190,6 @@ struct bpf_verifier_stack_elem { #define BPF_MAP_KEY_POISON (1ULL << 63) #define BPF_MAP_KEY_SEEN (1ULL << 62) -#define BPF_MAP_PTR_UNPRIV 1UL -#define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \ - POISON_POINTER_DELTA)) -#define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV)) - #define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512 static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx); @@ -209,21 +204,22 @@ static bool is_trusted_reg(const struct bpf_reg_state *reg); static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) { - return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON; + return aux->map_ptr_state.poison; } static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux) { - return aux->map_ptr_state & BPF_MAP_PTR_UNPRIV; + return aux->map_ptr_state.unpriv; } static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux, - const struct bpf_map *map, bool unpriv) + struct bpf_map *map, + bool unpriv, bool poison) { - BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV); unpriv |= bpf_map_ptr_unpriv(aux); - aux->map_ptr_state = (unsigned long)map | - (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL); + aux->map_ptr_state.unpriv = unpriv; + aux->map_ptr_state.poison = poison; + aux->map_ptr_state.map_ptr = map; } static bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux) @@ -9660,7 +9656,7 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env, return -EINVAL; } - map = BPF_MAP_PTR(insn_aux->map_ptr_state); + map = insn_aux->map_ptr_state.map_ptr; if (!map->ops->map_set_for_each_callback_args || !map->ops->map_for_each_callback) { verbose(env, "callback function not allowed for map\n"); @@ -10019,12 +10015,12 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return -EACCES; } - if (!BPF_MAP_PTR(aux->map_ptr_state)) + if (!aux->map_ptr_state.map_ptr) bpf_map_ptr_store(aux, meta->map_ptr, - !meta->map_ptr->bypass_spec_v1); - else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr) - bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON, - !meta->map_ptr->bypass_spec_v1); + !meta->map_ptr->bypass_spec_v1, false); + else if (aux->map_ptr_state.map_ptr != meta->map_ptr) + bpf_map_ptr_store(aux, meta->map_ptr, + !meta->map_ptr->bypass_spec_v1, true); return 0; } @@ -19840,7 +19836,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) !bpf_map_ptr_unpriv(aux)) { struct bpf_jit_poke_descriptor desc = { .reason = BPF_POKE_REASON_TAIL_CALL, - .tail_call.map = BPF_MAP_PTR(aux->map_ptr_state), + .tail_call.map = aux->map_ptr_state.map_ptr, .tail_call.key = bpf_map_key_immediate(aux), .insn_idx = i + delta, }; @@ -19869,7 +19865,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) return -EINVAL; } - map_ptr = BPF_MAP_PTR(aux->map_ptr_state); + map_ptr = aux->map_ptr_state.map_ptr; insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, map_ptr->max_entries, 2); insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, @@ -19977,7 +19973,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) if (bpf_map_ptr_poisoned(aux)) goto patch_call_imm; - map_ptr = BPF_MAP_PTR(aux->map_ptr_state); + map_ptr = aux->map_ptr_state.map_ptr; ops = map_ptr->ops; if (insn->imm == BPF_FUNC_map_lookup_elem && ops->map_gen_lookup) { From 9d482da9e17a4ddd5563428f74302a36b2610306 Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Fri, 5 Apr 2024 10:55:35 +0800 Subject: [PATCH 065/147] bpf: allow invoking bpf_for_each_map_elem with different maps Taking different maps within a single bpf_for_each_map_elem call is not allowed before, because from the second map, bpf_insn_aux_data->map_ptr_state will be marked as *poison*. In fact both map_ptr and state are needed to support this use case: map_ptr is used by set_map_elem_callback_state() while poison state is needed to determine whether to use direct call. Signed-off-by: Philo Lu Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240405025536.18113-3-lulie@linux.alibaba.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 62fedb1f3312..590db4e4c071 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9651,11 +9651,7 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env, struct bpf_map *map; int err; - if (bpf_map_ptr_poisoned(insn_aux)) { - verbose(env, "tail_call abusing map_ptr\n"); - return -EINVAL; - } - + /* valid map_ptr and poison value does not matter */ map = insn_aux->map_ptr_state.map_ptr; if (!map->ops->map_set_for_each_callback_args || !map->ops->map_for_each_callback) { From fecb1597cc11a23f32faa90d70a199533871686a Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Fri, 5 Apr 2024 10:55:36 +0800 Subject: [PATCH 066/147] selftests/bpf: add test for bpf_for_each_map_elem() with different maps A test is added for bpf_for_each_map_elem() with either an arraymap or a hashmap. $ tools/testing/selftests/bpf/test_progs -t for_each #93/1 for_each/hash_map:OK #93/2 for_each/array_map:OK #93/3 for_each/write_map_key:OK #93/4 for_each/multi_maps:OK #93 for_each:OK Summary: 1/4 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Philo Lu Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240405025536.18113-4-lulie@linux.alibaba.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/for_each.c | 62 +++++++++++++++++++ .../selftests/bpf/progs/for_each_multi_maps.c | 49 +++++++++++++++ 2 files changed, 111 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/for_each_multi_maps.c diff --git a/tools/testing/selftests/bpf/prog_tests/for_each.c b/tools/testing/selftests/bpf/prog_tests/for_each.c index 8963f8a549f2..09f6487f58b9 100644 --- a/tools/testing/selftests/bpf/prog_tests/for_each.c +++ b/tools/testing/selftests/bpf/prog_tests/for_each.c @@ -5,6 +5,7 @@ #include "for_each_hash_map_elem.skel.h" #include "for_each_array_map_elem.skel.h" #include "for_each_map_elem_write_key.skel.h" +#include "for_each_multi_maps.skel.h" static unsigned int duration; @@ -143,6 +144,65 @@ static void test_write_map_key(void) for_each_map_elem_write_key__destroy(skel); } +static void test_multi_maps(void) +{ + struct for_each_multi_maps *skel; + __u64 val, array_total, hash_total; + __u32 key, max_entries; + int i, err; + + LIBBPF_OPTS(bpf_test_run_opts, topts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .repeat = 1, + ); + + skel = for_each_multi_maps__open_and_load(); + if (!ASSERT_OK_PTR(skel, "for_each_multi_maps__open_and_load")) + return; + + array_total = 0; + max_entries = bpf_map__max_entries(skel->maps.arraymap); + for (i = 0; i < max_entries; i++) { + key = i; + val = i + 1; + array_total += val; + err = bpf_map__update_elem(skel->maps.arraymap, &key, sizeof(key), + &val, sizeof(val), BPF_ANY); + if (!ASSERT_OK(err, "array_map_update")) + goto out; + } + + hash_total = 0; + max_entries = bpf_map__max_entries(skel->maps.hashmap); + for (i = 0; i < max_entries; i++) { + key = i + 100; + val = i + 1; + hash_total += val; + err = bpf_map__update_elem(skel->maps.hashmap, &key, sizeof(key), + &val, sizeof(val), BPF_ANY); + if (!ASSERT_OK(err, "hash_map_update")) + goto out; + } + + skel->bss->data_output = 0; + skel->bss->use_array = 1; + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_pkt_access), &topts); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + ASSERT_OK(topts.retval, "retval"); + ASSERT_EQ(skel->bss->data_output, array_total, "array output"); + + skel->bss->data_output = 0; + skel->bss->use_array = 0; + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_pkt_access), &topts); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + ASSERT_OK(topts.retval, "retval"); + ASSERT_EQ(skel->bss->data_output, hash_total, "hash output"); + +out: + for_each_multi_maps__destroy(skel); +} + void test_for_each(void) { if (test__start_subtest("hash_map")) @@ -151,4 +211,6 @@ void test_for_each(void) test_array_map(); if (test__start_subtest("write_map_key")) test_write_map_key(); + if (test__start_subtest("multi_maps")) + test_multi_maps(); } diff --git a/tools/testing/selftests/bpf/progs/for_each_multi_maps.c b/tools/testing/selftests/bpf/progs/for_each_multi_maps.c new file mode 100644 index 000000000000..ff0bed7d4459 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/for_each_multi_maps.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 3); + __type(key, __u32); + __type(value, __u64); +} arraymap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 5); + __type(key, __u32); + __type(value, __u64); +} hashmap SEC(".maps"); + +struct callback_ctx { + int output; +}; + +u32 data_output = 0; +int use_array = 0; + +static __u64 +check_map_elem(struct bpf_map *map, __u32 *key, __u64 *val, + struct callback_ctx *data) +{ + data->output += *val; + return 0; +} + +SEC("tc") +int test_pkt_access(struct __sk_buff *skb) +{ + struct callback_ctx data; + + data.output = 0; + if (use_array) + bpf_for_each_map_elem(&arraymap, check_map_elem, &data, 0); + else + bpf_for_each_map_elem(&hashmap, check_map_elem, &data, 0); + data_output = data.output; + + return 0; +} From ba0cbe2bb4ab8aa266e48c6399bebf6e1217828a Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Thu, 4 Apr 2024 16:23:42 -0700 Subject: [PATCH 067/147] selftests/bpf: Make sure libbpf doesn't enforce the signature of a func pointer. The verifier in the kernel ensures that the struct_ops operators behave correctly by checking that they access parameters and context appropriately. The verifier will approve a program as long as it correctly accesses the context/parameters, regardless of its function signature. In contrast, libbpf should not verify the signature of function pointers and functions to enable flexibility in loading various implementations of an operator even if the signature of the function pointer does not match those in the implementations or the kernel. With this flexibility, user space applications can adapt to different kernel versions by loading a specific implementation of an operator based on feature detection. This is a follow-up of the commit c911fc61a7ce ("libbpf: Skip zeroed or null fields if not found in the kernel type.") Signed-off-by: Kui-Feng Lee Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240404232342.991414-1-thinker.li@gmail.com --- .../bpf/prog_tests/test_struct_ops_module.c | 24 +++++++++++++++++++ .../selftests/bpf/progs/struct_ops_module.c | 13 ++++++++++ 2 files changed, 37 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c index 098776d00ab4..7cf2b9ddd3e1 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c @@ -138,11 +138,35 @@ static void test_struct_ops_not_zeroed(void) struct_ops_module__destroy(skel); } +/* The signature of an implementation might not match the signature of the + * function pointer prototype defined in the BPF program. This mismatch + * should be allowed as long as the behavior of the operator program + * adheres to the signature in the kernel. Libbpf should not enforce the + * signature; rather, let the kernel verifier handle the enforcement. + */ +static void test_struct_ops_incompatible(void) +{ + struct struct_ops_module *skel; + struct bpf_link *link; + + skel = struct_ops_module__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + link = bpf_map__attach_struct_ops(skel->maps.testmod_incompatible); + if (ASSERT_OK_PTR(link, "attach_struct_ops")) + bpf_link__destroy(link); + + struct_ops_module__destroy(skel); +} + void serial_test_struct_ops_module(void) { if (test__start_subtest("test_struct_ops_load")) test_struct_ops_load(); if (test__start_subtest("test_struct_ops_not_zeroed")) test_struct_ops_not_zeroed(); + if (test__start_subtest("test_struct_ops_incompatible")) + test_struct_ops_incompatible(); } diff --git a/tools/testing/selftests/bpf/progs/struct_ops_module.c b/tools/testing/selftests/bpf/progs/struct_ops_module.c index 86e1e50c5531..63b065dae002 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_module.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_module.c @@ -68,3 +68,16 @@ struct bpf_testmod_ops___zeroed testmod_zeroed = { .test_1 = (void *)test_1, .test_2 = (void *)test_2_v2, }; + +struct bpf_testmod_ops___incompatible { + int (*test_1)(void); + void (*test_2)(int *a); + int data; +}; + +SEC(".struct_ops.link") +struct bpf_testmod_ops___incompatible testmod_incompatible = { + .test_1 = (void *)test_1, + .test_2 = (void *)test_2, + .data = 3, +}; From 00d5d22a5b42c3ffdfd1b29526885bbcec2d2231 Mon Sep 17 00:00:00 2001 From: Dave Thaler Date: Fri, 5 Apr 2024 08:52:45 -0700 Subject: [PATCH 068/147] bpf, docs: Editorial nits in instruction-set.rst This patch addresses a number of editorial nits including spelling, punctuation, grammar, and wording consistency issues in instruction-set.rst. Signed-off-by: Dave Thaler Acked-by: David Vernet Link: https://lore.kernel.org/r/20240405155245.3618-1-dthaler1968@gmail.com Signed-off-by: Alexei Starovoitov --- .../bpf/standardization/instruction-set.rst | 47 ++++++++++--------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/Documentation/bpf/standardization/instruction-set.rst b/Documentation/bpf/standardization/instruction-set.rst index a5ab00ac0b14..8d0781f0bd17 100644 --- a/Documentation/bpf/standardization/instruction-set.rst +++ b/Documentation/bpf/standardization/instruction-set.rst @@ -43,7 +43,7 @@ a type's signedness (`S`) and bit width (`N`), respectively. ===== ========= For example, `u32` is a type whose valid values are all the 32-bit unsigned -numbers and `s16` is a types whose valid values are all the 16-bit signed +numbers and `s16` is a type whose valid values are all the 16-bit signed numbers. Functions @@ -108,7 +108,7 @@ conformance group means it must support all instructions in that conformance group. The use of named conformance groups enables interoperability between a runtime -that executes instructions, and tools as such compilers that generate +that executes instructions, and tools such as compilers that generate instructions for the runtime. Thus, capability discovery in terms of conformance groups might be done manually by users or automatically by tools. @@ -181,10 +181,13 @@ A basic instruction is encoded as follows:: (`64-bit immediate instructions`_ reuse this field for other purposes) **dst_reg** - destination register number (0-10) + destination register number (0-10), unless otherwise specified + (future instructions might reuse this field for other purposes) **offset** - signed integer offset used with pointer arithmetic + signed integer offset used with pointer arithmetic, except where + otherwise specified (some arithmetic instructions reuse this field + for other purposes) **imm** signed integer immediate value @@ -228,10 +231,12 @@ This is depicted in the following figure:: operation to perform, encoded as explained above **regs** - The source and destination register numbers, encoded as explained above + The source and destination register numbers (unless otherwise + specified), encoded as explained above **offset** - signed integer offset used with pointer arithmetic + signed integer offset used with pointer arithmetic, unless + otherwise specified **imm** signed integer immediate value @@ -342,8 +347,8 @@ where '(u32)' indicates that the upper 32 bits are zeroed. dst = dst ^ imm -Note that most instructions have instruction offset of 0. Only three instructions -(``SDIV``, ``SMOD``, ``MOVSX``) have a non-zero offset. +Note that most arithmetic instructions have 'offset' set to 0. Only three instructions +(``SDIV``, ``SMOD``, ``MOVSX``) have a non-zero 'offset'. Division, multiplication, and modulo operations for ``ALU`` are part of the "divmul32" conformance group, and division, multiplication, and @@ -370,10 +375,10 @@ etc. This specification requires that signed modulo use truncated division a % n = a - n * trunc(a / n) The ``MOVSX`` instruction does a move operation with sign extension. -``{MOVSX, X, ALU}`` :term:`sign extends` 8-bit and 16-bit operands into 32 -bit operands, and zeroes the remaining upper 32 bits. +``{MOVSX, X, ALU}`` :term:`sign extends` 8-bit and 16-bit operands into +32-bit operands, and zeroes the remaining upper 32 bits. ``{MOVSX, X, ALU64}`` :term:`sign extends` 8-bit, 16-bit, and 32-bit -operands into 64 bit operands. Unlike other arithmetic instructions, +operands into 64-bit operands. Unlike other arithmetic instructions, ``MOVSX`` is only defined for register source operands (``X``). The ``NEG`` instruction is only defined when the source bit is clear @@ -411,19 +416,19 @@ conformance group. Examples: -``{END, TO_LE, ALU}`` with imm = 16/32/64 means:: +``{END, TO_LE, ALU}`` with 'imm' = 16/32/64 means:: dst = htole16(dst) dst = htole32(dst) dst = htole64(dst) -``{END, TO_BE, ALU}`` with imm = 16/32/64 means:: +``{END, TO_BE, ALU}`` with 'imm' = 16/32/64 means:: dst = htobe16(dst) dst = htobe32(dst) dst = htobe64(dst) -``{END, TO_LE, ALU64}`` with imm = 16/32/64 means:: +``{END, TO_LE, ALU64}`` with 'imm' = 16/32/64 means:: dst = bswap16(dst) dst = bswap32(dst) @@ -475,7 +480,7 @@ where 's>=' indicates a signed '>=' comparison. gotol +imm -where 'imm' means the branch offset comes from insn 'imm' field. +where 'imm' means the branch offset comes from the 'imm' field. Note that there are two flavors of ``JA`` instructions. The ``JMP`` class permits a 16-bit jump offset specified by the 'offset' @@ -494,25 +499,25 @@ Helper functions are a concept whereby BPF programs can call into a set of function calls exposed by the underlying platform. Historically, each helper function was identified by an address -encoded in the imm field. The available helper functions may differ +encoded in the 'imm' field. The available helper functions may differ for each program type, but address values are unique across all program types. Platforms that support the BPF Type Format (BTF) support identifying -a helper function by a BTF ID encoded in the imm field, where the BTF ID +a helper function by a BTF ID encoded in the 'imm' field, where the BTF ID identifies the helper name and type. Program-local functions ~~~~~~~~~~~~~~~~~~~~~~~ Program-local functions are functions exposed by the same BPF program as the caller, and are referenced by offset from the call instruction, similar to -``JA``. The offset is encoded in the imm field of the call instruction. -A ``EXIT`` within the program-local function will return to the caller. +``JA``. The offset is encoded in the 'imm' field of the call instruction. +An ``EXIT`` within the program-local function will return to the caller. Load and store instructions =========================== For load and store instructions (``LD``, ``LDX``, ``ST``, and ``STX``), the -8-bit 'opcode' field is divided as:: +8-bit 'opcode' field is divided as follows:: +-+-+-+-+-+-+-+-+ |mode |sz |class| @@ -580,7 +585,7 @@ instructions that transfer data between a register and memory. dst = *(signed size *) (src + offset) -Where size is one of: ``B``, ``H``, or ``W``, and +Where '' is one of: ``B``, ``H``, or ``W``, and 'signed size' is one of: s8, s16, or s32. Atomic operations From a8e03b6bbb2cc7cf387d1ce335e4ce4c3bdfef9b Mon Sep 17 00:00:00 2001 From: David Vernet Date: Fri, 5 Apr 2024 09:30:40 -0500 Subject: [PATCH 069/147] bpf: Allow invoking kfuncs from BPF_PROG_TYPE_SYSCALL progs Currently, a set of core BPF kfuncs (e.g. bpf_task_*, bpf_cgroup_*, bpf_cpumask_*, etc) cannot be invoked from BPF_PROG_TYPE_SYSCALL programs. The whitelist approach taken for enabling kfuncs makes sense: it not safe to call these kfuncs from every program type. For example, it may not be safe to call bpf_task_acquire() in an fentry to free_task(). BPF_PROG_TYPE_SYSCALL, on the other hand, is a perfectly safe program type from which to invoke these kfuncs, as it's a very controlled environment, and we should never be able to run into any of the typical problems such as recursive invoations, acquiring references on freeing kptrs, etc. Being able to invoke these kfuncs would be useful, as BPF_PROG_TYPE_SYSCALL can be invoked with BPF_PROG_RUN, and would therefore enable user space programs to synchronously call into BPF to manipulate these kptrs. This patch therefore enables invoking the aforementioned core kfuncs from BPF_PROG_TYPE_SYSCALL progs. Signed-off-by: David Vernet Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240405143041.632519-2-void@manifault.com --- kernel/bpf/cpumask.c | 1 + kernel/bpf/helpers.c | 1 + 2 files changed, 2 insertions(+) diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index dad0fb1c8e87..33c473d676a5 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -474,6 +474,7 @@ static int __init cpumask_kfunc_init(void) ret = bpf_mem_alloc_init(&bpf_cpumask_ma, sizeof(struct bpf_cpumask), false); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &cpumask_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &cpumask_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &cpumask_kfunc_set); return ret ?: register_btf_id_dtor_kfuncs(cpumask_dtors, ARRAY_SIZE(cpumask_dtors), THIS_MODULE); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index d9e7aca8ae9e..8cde717137bd 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2653,6 +2653,7 @@ static int __init kfunc_init(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &generic_kfunc_set); ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors, ARRAY_SIZE(generic_dtors), THIS_MODULE); From 1bc724af00cc48ef03e3fa6d7a2f6731ac915c37 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Fri, 5 Apr 2024 09:30:41 -0500 Subject: [PATCH 070/147] selftests/bpf: Verify calling core kfuncs from BPF_PROG_TYPE_SYCALL Now that we can call some kfuncs from BPF_PROG_TYPE_SYSCALL progs, let's add some selftests that verify as much. As a bonus, let's also verify that we can't call the progs from raw tracepoints. Do do this, we add a new selftest suite called verifier_kfunc_prog_types. Signed-off-by: David Vernet Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240405143041.632519-3-void@manifault.com --- .../prog_tests/verifier_kfunc_prog_types.c | 11 ++ .../selftests/bpf/progs/cgrp_kfunc_common.h | 2 +- .../selftests/bpf/progs/task_kfunc_common.h | 2 +- .../bpf/progs/verifier_kfunc_prog_types.c | 122 ++++++++++++++++++ 4 files changed, 135 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/verifier_kfunc_prog_types.c create mode 100644 tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c diff --git a/tools/testing/selftests/bpf/prog_tests/verifier_kfunc_prog_types.c b/tools/testing/selftests/bpf/prog_tests/verifier_kfunc_prog_types.c new file mode 100644 index 000000000000..3918ecc2ee91 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/verifier_kfunc_prog_types.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include + +#include "verifier_kfunc_prog_types.skel.h" + +void test_verifier_kfunc_prog_types(void) +{ + RUN_TESTS(verifier_kfunc_prog_types); +} diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h b/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h index 22914a70db54..73ba32e9a693 100644 --- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h +++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h @@ -13,7 +13,7 @@ struct __cgrps_kfunc_map_value { struct cgroup __kptr * cgrp; }; -struct hash_map { +struct { __uint(type, BPF_MAP_TYPE_HASH); __type(key, int); __type(value, struct __cgrps_kfunc_map_value); diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_common.h b/tools/testing/selftests/bpf/progs/task_kfunc_common.h index 41f2d44f49cb..6720c4b5be41 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_common.h +++ b/tools/testing/selftests/bpf/progs/task_kfunc_common.h @@ -13,7 +13,7 @@ struct __tasks_kfunc_map_value { struct task_struct __kptr * task; }; -struct hash_map { +struct { __uint(type, BPF_MAP_TYPE_HASH); __type(key, int); __type(value, struct __tasks_kfunc_map_value); diff --git a/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c b/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c new file mode 100644 index 000000000000..cb32b0cfc84b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include + +#include "bpf_misc.h" +#include "cgrp_kfunc_common.h" +#include "cpumask_common.h" +#include "task_kfunc_common.h" + +char _license[] SEC("license") = "GPL"; + +/*************** + * Task kfuncs * + ***************/ + +static void task_kfunc_load_test(void) +{ + struct task_struct *current, *ref_1, *ref_2; + + current = bpf_get_current_task_btf(); + ref_1 = bpf_task_from_pid(current->pid); + if (!ref_1) + return; + + ref_2 = bpf_task_acquire(ref_1); + if (ref_2) + bpf_task_release(ref_2); + bpf_task_release(ref_1); +} + +SEC("raw_tp") +__failure __msg("calling kernel function") +int BPF_PROG(task_kfunc_raw_tp) +{ + task_kfunc_load_test(); + return 0; +} + +SEC("syscall") +__success +int BPF_PROG(task_kfunc_syscall) +{ + task_kfunc_load_test(); + return 0; +} + +/***************** + * cgroup kfuncs * + *****************/ + +static void cgrp_kfunc_load_test(void) +{ + struct cgroup *cgrp, *ref; + + cgrp = bpf_cgroup_from_id(0); + if (!cgrp) + return; + + ref = bpf_cgroup_acquire(cgrp); + if (!ref) { + bpf_cgroup_release(cgrp); + return; + } + + bpf_cgroup_release(ref); + bpf_cgroup_release(cgrp); +} + +SEC("raw_tp") +__failure __msg("calling kernel function") +int BPF_PROG(cgrp_kfunc_raw_tp) +{ + cgrp_kfunc_load_test(); + return 0; +} + +SEC("syscall") +__success +int BPF_PROG(cgrp_kfunc_syscall) +{ + cgrp_kfunc_load_test(); + return 0; +} + +/****************** + * cpumask kfuncs * + ******************/ + +static void cpumask_kfunc_load_test(void) +{ + struct bpf_cpumask *alloc, *ref; + + alloc = bpf_cpumask_create(); + if (!alloc) + return; + + ref = bpf_cpumask_acquire(alloc); + bpf_cpumask_set_cpu(0, alloc); + bpf_cpumask_test_cpu(0, (const struct cpumask *)ref); + + bpf_cpumask_release(ref); + bpf_cpumask_release(alloc); +} + +SEC("raw_tp") +__failure __msg("calling kernel function") +int BPF_PROG(cpumask_kfunc_raw_tp) +{ + cpumask_kfunc_load_test(); + return 0; +} + +SEC("syscall") +__success +int BPF_PROG(cpumask_kfunc_syscall) +{ + cpumask_kfunc_load_test(); + return 0; +} From 5bd2ed658231b0698211e94efb06393836a4539d Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sat, 6 Apr 2024 11:15:41 +0200 Subject: [PATCH 071/147] libbpf: Start v1.5 development cycle Bump libbpf.map to v1.5.0 to start a new libbpf version cycle. Signed-off-by: Andrea Righi Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240406092005.92399-2-andrea.righi@canonical.com --- tools/lib/bpf/libbpf.map | 3 +++ tools/lib/bpf/libbpf_version.h | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 51732ecb1385..5dd81a7b96b5 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -416,3 +416,6 @@ LIBBPF_1.4.0 { btf__new_split; btf_ext__raw_data; } LIBBPF_1.3.0; + +LIBBPF_1.5.0 { +} LIBBPF_1.4.0; diff --git a/tools/lib/bpf/libbpf_version.h b/tools/lib/bpf/libbpf_version.h index e783a47da815..d6e5eff967cb 100644 --- a/tools/lib/bpf/libbpf_version.h +++ b/tools/lib/bpf/libbpf_version.h @@ -4,6 +4,6 @@ #define __LIBBPF_VERSION_H #define LIBBPF_MAJOR_VERSION 1 -#define LIBBPF_MINOR_VERSION 4 +#define LIBBPF_MINOR_VERSION 5 #endif /* __LIBBPF_VERSION_H */ From 13e8125a22763557d719db996f70c71f77c9509c Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sat, 6 Apr 2024 11:15:42 +0200 Subject: [PATCH 072/147] libbpf: ringbuf: Allow to consume up to a certain amount of items In some cases, instead of always consuming all items from ring buffers in a greedy way, we may want to consume up to a certain amount of items, for example when we need to copy items from the BPF ring buffer to a limited user buffer. This change allows to set an upper limit to the amount of items consumed from one or more ring buffers. Signed-off-by: Andrea Righi Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240406092005.92399-3-andrea.righi@canonical.com --- tools/lib/bpf/ringbuf.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c index aacb64278a01..db05e6f526de 100644 --- a/tools/lib/bpf/ringbuf.c +++ b/tools/lib/bpf/ringbuf.c @@ -231,7 +231,7 @@ static inline int roundup_len(__u32 len) return (len + 7) / 8 * 8; } -static int64_t ringbuf_process_ring(struct ring *r) +static int64_t ringbuf_process_ring(struct ring *r, size_t n) { int *len_ptr, len, err; /* 64-bit to avoid overflow in case of extreme application behavior */ @@ -268,6 +268,9 @@ static int64_t ringbuf_process_ring(struct ring *r) } smp_store_release(r->consumer_pos, cons_pos); + + if (cnt >= n) + goto done; } } while (got_new_data); done: @@ -287,13 +290,15 @@ int ring_buffer__consume(struct ring_buffer *rb) for (i = 0; i < rb->ring_cnt; i++) { struct ring *ring = rb->rings[i]; - err = ringbuf_process_ring(ring); + err = ringbuf_process_ring(ring, INT_MAX); if (err < 0) return libbpf_err(err); res += err; + if (res > INT_MAX) { + res = INT_MAX; + break; + } } - if (res > INT_MAX) - return INT_MAX; return res; } @@ -314,13 +319,13 @@ int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms) __u32 ring_id = rb->events[i].data.fd; struct ring *ring = rb->rings[ring_id]; - err = ringbuf_process_ring(ring); + err = ringbuf_process_ring(ring, INT_MAX); if (err < 0) return libbpf_err(err); res += err; } if (res > INT_MAX) - return INT_MAX; + res = INT_MAX; return res; } @@ -375,7 +380,7 @@ int ring__consume(struct ring *r) { int64_t res; - res = ringbuf_process_ring(r); + res = ringbuf_process_ring(r, INT_MAX); if (res < 0) return libbpf_err(res); From 4d22ea94ea33550538b3b14429d52cb9f96ad2c3 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sat, 6 Apr 2024 11:15:43 +0200 Subject: [PATCH 073/147] libbpf: Add ring__consume_n / ring_buffer__consume_n Introduce a new API to consume items from a ring buffer, limited to a specified amount, and return to the caller the actual number of items consumed. Signed-off-by: Andrea Righi Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/lkml/20240310154726.734289-1-andrea.righi@canonical.com/T Link: https://lore.kernel.org/bpf/20240406092005.92399-4-andrea.righi@canonical.com --- tools/lib/bpf/libbpf.h | 12 ++++++++++++ tools/lib/bpf/libbpf.map | 3 +++ tools/lib/bpf/ringbuf.c | 38 +++++++++++++++++++++++++++++++++++--- 3 files changed, 50 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index f88ab50c0229..4f775a6dcaa0 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1293,6 +1293,7 @@ LIBBPF_API int ring_buffer__add(struct ring_buffer *rb, int map_fd, ring_buffer_sample_fn sample_cb, void *ctx); LIBBPF_API int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms); LIBBPF_API int ring_buffer__consume(struct ring_buffer *rb); +LIBBPF_API int ring_buffer__consume_n(struct ring_buffer *rb, size_t n); LIBBPF_API int ring_buffer__epoll_fd(const struct ring_buffer *rb); /** @@ -1367,6 +1368,17 @@ LIBBPF_API int ring__map_fd(const struct ring *r); */ LIBBPF_API int ring__consume(struct ring *r); +/** + * @brief **ring__consume_n()** consumes up to a requested amount of items from + * a ringbuffer without event polling. + * + * @param r A ringbuffer object. + * @param n Maximum amount of items to consume. + * @return The number of items consumed, or a negative number if any of the + * callbacks return an error. + */ +LIBBPF_API int ring__consume_n(struct ring *r, size_t n); + struct user_ring_buffer_opts { size_t sz; /* size of this struct, for forward/backward compatibility */ }; diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 5dd81a7b96b5..23d82bba021a 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -418,4 +418,7 @@ LIBBPF_1.4.0 { } LIBBPF_1.3.0; LIBBPF_1.5.0 { + global: + ring__consume_n; + ring_buffer__consume_n; } LIBBPF_1.4.0; diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c index db05e6f526de..99e44cf02321 100644 --- a/tools/lib/bpf/ringbuf.c +++ b/tools/lib/bpf/ringbuf.c @@ -277,6 +277,33 @@ done: return cnt; } +/* Consume available ring buffer(s) data without event polling, up to n + * records. + * + * Returns number of records consumed across all registered ring buffers (or + * n, whichever is less), or negative number if any of the callbacks return + * error. + */ +int ring_buffer__consume_n(struct ring_buffer *rb, size_t n) +{ + int64_t err, res = 0; + int i; + + for (i = 0; i < rb->ring_cnt; i++) { + struct ring *ring = rb->rings[i]; + + err = ringbuf_process_ring(ring, n); + if (err < 0) + return libbpf_err(err); + res += err; + n -= err; + + if (n == 0) + break; + } + return res; +} + /* Consume available ring buffer(s) data without event polling. * Returns number of records consumed across all registered ring buffers (or * INT_MAX, whichever is less), or negative number if any of the callbacks @@ -376,17 +403,22 @@ int ring__map_fd(const struct ring *r) return r->map_fd; } -int ring__consume(struct ring *r) +int ring__consume_n(struct ring *r, size_t n) { - int64_t res; + int res; - res = ringbuf_process_ring(r, INT_MAX); + res = ringbuf_process_ring(r, n); if (res < 0) return libbpf_err(res); return res > INT_MAX ? INT_MAX : res; } +int ring__consume(struct ring *r) +{ + return ring__consume_n(r, INT_MAX); +} + static void user_ringbuf_unmap_ring(struct user_ring_buffer *rb) { if (rb->consumer_pos) { From bb761fcb821738e8d3b720e1460d3783db74c68a Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Sat, 6 Apr 2024 22:46:13 +0800 Subject: [PATCH 074/147] selftests/bpf: eliminate warning of get_cgroup_id_from_path() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The output goes like this if I make samples/bpf: ...warning: no previous prototype for ‘get_cgroup_id_from_path’... Make this function static could solve the warning problem since no one outside of the file calls it. Signed-off-by: Jason Xing Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240406144613.4434-1-kerneljasonxing@gmail.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/cgroup_helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/selftests/bpf/cgroup_helpers.c index 19be9c63d5e8..f2952a65dcc2 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.c +++ b/tools/testing/selftests/bpf/cgroup_helpers.c @@ -429,7 +429,7 @@ int create_and_get_cgroup(const char *relative_path) * which is an invalid cgroup id. * If there is a failure, it prints the error to stderr. */ -unsigned long long get_cgroup_id_from_path(const char *cgroup_workdir) +static unsigned long long get_cgroup_id_from_path(const char *cgroup_workdir) { int dirfd, err, flags, mount_id, fhsize; union { From d503a04f8bc0c75dc9db9452d8cc79d748afb752 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 5 Apr 2024 16:11:33 -0700 Subject: [PATCH 075/147] bpf: Add support for certain atomics in bpf_arena to x86 JIT Support atomics in bpf_arena that can be JITed as a single x86 instruction. Instructions that are JITed as loops are not supported at the moment, since they require more complex extable and loop logic. JITs can choose to do smarter things with bpf_jit_supports_insn(). Like arm64 may decide to support all bpf atomics instructions when emit_lse_atomic is available and none in ll_sc mode. bpf_jit_supports_percpu_insn(), bpf_jit_supports_ptr_xchg() and other such callbacks can be replaced with bpf_jit_supports_insn() in the future. Signed-off-by: Alexei Starovoitov Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240405231134.17274-1-alexei.starovoitov@gmail.com Signed-off-by: Martin KaFai Lau --- arch/x86/net/bpf_jit_comp.c | 72 +++++++++++++++++++++++++++++++++++++ include/linux/filter.h | 4 +++ kernel/bpf/core.c | 5 +++ kernel/bpf/verifier.c | 19 +++++++++- 4 files changed, 99 insertions(+), 1 deletion(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 6cf9a5697c09..2b5a475c4dd0 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1172,6 +1172,54 @@ static int emit_atomic(u8 **pprog, u8 atomic_op, return 0; } +static int emit_atomic_index(u8 **pprog, u8 atomic_op, u32 size, + u32 dst_reg, u32 src_reg, u32 index_reg, int off) +{ + u8 *prog = *pprog; + + EMIT1(0xF0); /* lock prefix */ + switch (size) { + case BPF_W: + EMIT1(add_3mod(0x40, dst_reg, src_reg, index_reg)); + break; + case BPF_DW: + EMIT1(add_3mod(0x48, dst_reg, src_reg, index_reg)); + break; + default: + pr_err("bpf_jit: 1 and 2 byte atomics are not supported\n"); + return -EFAULT; + } + + /* emit opcode */ + switch (atomic_op) { + case BPF_ADD: + case BPF_AND: + case BPF_OR: + case BPF_XOR: + /* lock *(u32/u64*)(dst_reg + idx_reg + off) = src_reg */ + EMIT1(simple_alu_opcodes[atomic_op]); + break; + case BPF_ADD | BPF_FETCH: + /* src_reg = atomic_fetch_add(dst_reg + idx_reg + off, src_reg); */ + EMIT2(0x0F, 0xC1); + break; + case BPF_XCHG: + /* src_reg = atomic_xchg(dst_reg + idx_reg + off, src_reg); */ + EMIT1(0x87); + break; + case BPF_CMPXCHG: + /* r0 = atomic_cmpxchg(dst_reg + idx_reg + off, r0, src_reg); */ + EMIT2(0x0F, 0xB1); + break; + default: + pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op); + return -EFAULT; + } + emit_insn_suffix_SIB(&prog, dst_reg, src_reg, index_reg, off); + *pprog = prog; + return 0; +} + #define DONT_CLEAR 1 bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs) @@ -1982,6 +2030,15 @@ populate_extable: return err; break; + case BPF_STX | BPF_PROBE_ATOMIC | BPF_W: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_DW: + start_of_ldx = prog; + err = emit_atomic_index(&prog, insn->imm, BPF_SIZE(insn->code), + dst_reg, src_reg, X86_REG_R12, insn->off); + if (err) + return err; + goto populate_extable; + /* call */ case BPF_JMP | BPF_CALL: { int offs; @@ -3486,6 +3543,21 @@ bool bpf_jit_supports_arena(void) return true; } +bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) +{ + if (!in_arena) + return true; + switch (insn->code) { + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + if (insn->imm == (BPF_AND | BPF_FETCH) || + insn->imm == (BPF_OR | BPF_FETCH) || + insn->imm == (BPF_XOR | BPF_FETCH)) + return false; + } + return true; +} + bool bpf_jit_supports_ptr_xchg(void) { return true; diff --git a/include/linux/filter.h b/include/linux/filter.h index 161d5f7b64ed..7a27f19bf44d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -75,6 +75,9 @@ struct ctl_table_header; /* unused opcode to mark special load instruction. Same as BPF_MSH */ #define BPF_PROBE_MEM32 0xa0 +/* unused opcode to mark special atomic instruction */ +#define BPF_PROBE_ATOMIC 0xe0 + /* unused opcode to mark call to interpreter with arguments */ #define BPF_CALL_ARGS 0xe0 @@ -997,6 +1000,7 @@ bool bpf_jit_supports_far_kfunc_call(void); bool bpf_jit_supports_exceptions(void); bool bpf_jit_supports_ptr_xchg(void); bool bpf_jit_supports_arena(void); +bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena); void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); bool bpf_helper_changes_pkt_data(void *func); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7a33a3a7e63c..a41718eaeefe 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2965,6 +2965,11 @@ bool __weak bpf_jit_supports_arena(void) return false; } +bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) +{ + return false; +} + /* Return TRUE if the JIT backend satisfies the following two conditions: * 1) JIT backend supports atomic_xchg() on pointer-sized words. * 2) Under the specific arch, the implementation of xchg() is the same diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 590db4e4c071..2aad6d90550f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6970,6 +6970,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return err; } +static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, + bool allow_trust_missmatch); + static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) { int load_reg; @@ -7030,7 +7033,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i is_pkt_reg(env, insn->dst_reg) || is_flow_key_reg(env, insn->dst_reg) || is_sk_reg(env, insn->dst_reg) || - is_arena_reg(env, insn->dst_reg)) { + (is_arena_reg(env, insn->dst_reg) && !bpf_jit_supports_insn(insn, true))) { verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str(env, reg_state(env, insn->dst_reg)->type)); @@ -7066,6 +7069,11 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i if (err) return err; + if (is_arena_reg(env, insn->dst_reg)) { + err = save_aux_ptr_type(env, PTR_TO_ARENA, false); + if (err) + return err; + } /* Check whether we can write into the same memory. */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, true, false); @@ -18955,6 +18963,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->code == (BPF_ST | BPF_MEM | BPF_W) || insn->code == (BPF_ST | BPF_MEM | BPF_DW)) { type = BPF_WRITE; + } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) || + insn->code == (BPF_STX | BPF_ATOMIC | BPF_DW)) && + env->insn_aux_data[i + delta].ptr_type == PTR_TO_ARENA) { + insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code); + env->prog->aux->num_exentries++; + continue; } else { continue; } @@ -19226,6 +19240,9 @@ static int jit_subprogs(struct bpf_verifier_env *env) BPF_CLASS(insn->code) == BPF_ST) && BPF_MODE(insn->code) == BPF_PROBE_MEM32) num_exentries++; + if (BPF_CLASS(insn->code) == BPF_STX && + BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) + num_exentries++; } func[i]->aux->num_exentries = num_exentries; func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; From d0a2ba197bcbeaa795c5c961d927bcaf55964669 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 5 Apr 2024 16:11:34 -0700 Subject: [PATCH 076/147] selftests/bpf: Add tests for atomics in bpf_arena. Add selftests for atomic instructions in bpf_arena. Signed-off-by: Alexei Starovoitov Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240405231134.17274-2-alexei.starovoitov@gmail.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/DENYLIST.aarch64 | 1 + tools/testing/selftests/bpf/DENYLIST.s390x | 1 + .../selftests/bpf/prog_tests/arena_atomics.c | 186 ++++++++++++++++++ .../selftests/bpf/progs/arena_atomics.c | 178 +++++++++++++++++ 4 files changed, 366 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/arena_atomics.c create mode 100644 tools/testing/selftests/bpf/progs/arena_atomics.c diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 b/tools/testing/selftests/bpf/DENYLIST.aarch64 index 0445ac38bc07..cf657fc35619 100644 --- a/tools/testing/selftests/bpf/DENYLIST.aarch64 +++ b/tools/testing/selftests/bpf/DENYLIST.aarch64 @@ -10,3 +10,4 @@ fill_link_info/kprobe_multi_link_info # bpf_program__attach_kprobe_mu fill_link_info/kretprobe_multi_link_info # bpf_program__attach_kprobe_multi_opts unexpected error: -95 fill_link_info/kprobe_multi_invalid_ubuff # bpf_program__attach_kprobe_multi_opts unexpected error: -95 missed/kprobe_recursion # missed_kprobe_recursion__attach unexpected error: -95 (errno 95) +arena_atomics diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index f4a2f66a683d..c34adf39eeb2 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -6,3 +6,4 @@ stacktrace_build_id # compare_map_keys stackid_hmap vs. sta verifier_iterating_callbacks verifier_arena # JIT does not support arena arena_htab # JIT does not support arena +arena_atomics diff --git a/tools/testing/selftests/bpf/prog_tests/arena_atomics.c b/tools/testing/selftests/bpf/prog_tests/arena_atomics.c new file mode 100644 index 000000000000..0807a48a58ee --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/arena_atomics.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include +#include "arena_atomics.skel.h" + +static void test_add(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.add); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->add64_value, 3, "add64_value"); + ASSERT_EQ(skel->arena->add64_result, 1, "add64_result"); + + ASSERT_EQ(skel->arena->add32_value, 3, "add32_value"); + ASSERT_EQ(skel->arena->add32_result, 1, "add32_result"); + + ASSERT_EQ(skel->arena->add_stack_value_copy, 3, "add_stack_value"); + ASSERT_EQ(skel->arena->add_stack_result, 1, "add_stack_result"); + + ASSERT_EQ(skel->arena->add_noreturn_value, 3, "add_noreturn_value"); +} + +static void test_sub(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.sub); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->sub64_value, -1, "sub64_value"); + ASSERT_EQ(skel->arena->sub64_result, 1, "sub64_result"); + + ASSERT_EQ(skel->arena->sub32_value, -1, "sub32_value"); + ASSERT_EQ(skel->arena->sub32_result, 1, "sub32_result"); + + ASSERT_EQ(skel->arena->sub_stack_value_copy, -1, "sub_stack_value"); + ASSERT_EQ(skel->arena->sub_stack_result, 1, "sub_stack_result"); + + ASSERT_EQ(skel->arena->sub_noreturn_value, -1, "sub_noreturn_value"); +} + +static void test_and(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.and); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->and64_value, 0x010ull << 32, "and64_value"); + ASSERT_EQ(skel->arena->and32_value, 0x010, "and32_value"); +} + +static void test_or(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.or); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->or64_value, 0x111ull << 32, "or64_value"); + ASSERT_EQ(skel->arena->or32_value, 0x111, "or32_value"); +} + +static void test_xor(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.xor); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->xor64_value, 0x101ull << 32, "xor64_value"); + ASSERT_EQ(skel->arena->xor32_value, 0x101, "xor32_value"); +} + +static void test_cmpxchg(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.cmpxchg); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->cmpxchg64_value, 2, "cmpxchg64_value"); + ASSERT_EQ(skel->arena->cmpxchg64_result_fail, 1, "cmpxchg_result_fail"); + ASSERT_EQ(skel->arena->cmpxchg64_result_succeed, 1, "cmpxchg_result_succeed"); + + ASSERT_EQ(skel->arena->cmpxchg32_value, 2, "lcmpxchg32_value"); + ASSERT_EQ(skel->arena->cmpxchg32_result_fail, 1, "cmpxchg_result_fail"); + ASSERT_EQ(skel->arena->cmpxchg32_result_succeed, 1, "cmpxchg_result_succeed"); +} + +static void test_xchg(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.xchg); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->xchg64_value, 2, "xchg64_value"); + ASSERT_EQ(skel->arena->xchg64_result, 1, "xchg64_result"); + + ASSERT_EQ(skel->arena->xchg32_value, 2, "xchg32_value"); + ASSERT_EQ(skel->arena->xchg32_result, 1, "xchg32_result"); +} + +void test_arena_atomics(void) +{ + struct arena_atomics *skel; + int err; + + skel = arena_atomics__open(); + if (!ASSERT_OK_PTR(skel, "arena atomics skeleton open")) + return; + + if (skel->data->skip_tests) { + printf("%s:SKIP:no ENABLE_ATOMICS_TESTS or no addr_space_cast support in clang", + __func__); + test__skip(); + goto cleanup; + } + err = arena_atomics__load(skel); + if (!ASSERT_OK(err, "arena atomics skeleton load")) + return; + skel->bss->pid = getpid(); + + if (test__start_subtest("add")) + test_add(skel); + if (test__start_subtest("sub")) + test_sub(skel); + if (test__start_subtest("and")) + test_and(skel); + if (test__start_subtest("or")) + test_or(skel); + if (test__start_subtest("xor")) + test_xor(skel); + if (test__start_subtest("cmpxchg")) + test_cmpxchg(skel); + if (test__start_subtest("xchg")) + test_xchg(skel); + +cleanup: + arena_atomics__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/arena_atomics.c b/tools/testing/selftests/bpf/progs/arena_atomics.c new file mode 100644 index 000000000000..55f10563208d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/arena_atomics.c @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include +#include +#include "bpf_arena_common.h" + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, 10); /* number of pages */ +#ifdef __TARGET_ARCH_arm64 + __ulong(map_extra, 0x1ull << 32); /* start of mmap() region */ +#else + __ulong(map_extra, 0x1ull << 44); /* start of mmap() region */ +#endif +} arena SEC(".maps"); + +#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST) +bool skip_tests __attribute((__section__(".data"))) = false; +#else +bool skip_tests = true; +#endif + +__u32 pid = 0; + +#undef __arena +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) +#define __arena __attribute__((address_space(1))) +#else +#define __arena SEC(".addr_space.1") +#endif + +__u64 __arena add64_value = 1; +__u64 __arena add64_result = 0; +__u32 __arena add32_value = 1; +__u32 __arena add32_result = 0; +__u64 __arena add_stack_value_copy = 0; +__u64 __arena add_stack_result = 0; +__u64 __arena add_noreturn_value = 1; + +SEC("raw_tp/sys_enter") +int add(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + __u64 add_stack_value = 1; + + add64_result = __sync_fetch_and_add(&add64_value, 2); + add32_result = __sync_fetch_and_add(&add32_value, 2); + add_stack_result = __sync_fetch_and_add(&add_stack_value, 2); + add_stack_value_copy = add_stack_value; + __sync_fetch_and_add(&add_noreturn_value, 2); +#endif + + return 0; +} + +__s64 __arena sub64_value = 1; +__s64 __arena sub64_result = 0; +__s32 __arena sub32_value = 1; +__s32 __arena sub32_result = 0; +__s64 __arena sub_stack_value_copy = 0; +__s64 __arena sub_stack_result = 0; +__s64 __arena sub_noreturn_value = 1; + +SEC("raw_tp/sys_enter") +int sub(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + __u64 sub_stack_value = 1; + + sub64_result = __sync_fetch_and_sub(&sub64_value, 2); + sub32_result = __sync_fetch_and_sub(&sub32_value, 2); + sub_stack_result = __sync_fetch_and_sub(&sub_stack_value, 2); + sub_stack_value_copy = sub_stack_value; + __sync_fetch_and_sub(&sub_noreturn_value, 2); +#endif + + return 0; +} + +__u64 __arena and64_value = (0x110ull << 32); +__u32 __arena and32_value = 0x110; + +SEC("raw_tp/sys_enter") +int and(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + + __sync_fetch_and_and(&and64_value, 0x011ull << 32); + __sync_fetch_and_and(&and32_value, 0x011); +#endif + + return 0; +} + +__u32 __arena or32_value = 0x110; +__u64 __arena or64_value = (0x110ull << 32); + +SEC("raw_tp/sys_enter") +int or(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + __sync_fetch_and_or(&or64_value, 0x011ull << 32); + __sync_fetch_and_or(&or32_value, 0x011); +#endif + + return 0; +} + +__u64 __arena xor64_value = (0x110ull << 32); +__u32 __arena xor32_value = 0x110; + +SEC("raw_tp/sys_enter") +int xor(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + __sync_fetch_and_xor(&xor64_value, 0x011ull << 32); + __sync_fetch_and_xor(&xor32_value, 0x011); +#endif + + return 0; +} + +__u32 __arena cmpxchg32_value = 1; +__u32 __arena cmpxchg32_result_fail = 0; +__u32 __arena cmpxchg32_result_succeed = 0; +__u64 __arena cmpxchg64_value = 1; +__u64 __arena cmpxchg64_result_fail = 0; +__u64 __arena cmpxchg64_result_succeed = 0; + +SEC("raw_tp/sys_enter") +int cmpxchg(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + cmpxchg64_result_fail = __sync_val_compare_and_swap(&cmpxchg64_value, 0, 3); + cmpxchg64_result_succeed = __sync_val_compare_and_swap(&cmpxchg64_value, 1, 2); + + cmpxchg32_result_fail = __sync_val_compare_and_swap(&cmpxchg32_value, 0, 3); + cmpxchg32_result_succeed = __sync_val_compare_and_swap(&cmpxchg32_value, 1, 2); +#endif + + return 0; +} + +__u64 __arena xchg64_value = 1; +__u64 __arena xchg64_result = 0; +__u32 __arena xchg32_value = 1; +__u32 __arena xchg32_result = 0; + +SEC("raw_tp/sys_enter") +int xchg(const void *ctx) +{ + if (pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; +#ifdef ENABLE_ATOMICS_TESTS + __u64 val64 = 2; + __u32 val32 = 2; + + xchg64_result = __sync_lock_test_and_set(&xchg64_value, val64); + xchg32_result = __sync_lock_test_and_set(&xchg32_value, val32); +#endif + + return 0; +} From 699c23f02c65cbfc3e638f14ce0d70c23a2e1f02 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 9 Apr 2024 21:35:27 -0700 Subject: [PATCH 077/147] bpf: Add bpf_link support for sk_msg and sk_skb progs Add bpf_link support for sk_msg and sk_skb programs. We have an internal request to support bpf_link for sk_msg programs so user space can have a uniform handling with bpf_link based libbpf APIs. Using bpf_link based libbpf API also has a benefit which makes system robust by decoupling prog life cycle and attachment life cycle. Reviewed-by: John Fastabend Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240410043527.3737160-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 + include/linux/skmsg.h | 4 + include/uapi/linux/bpf.h | 5 + kernel/bpf/syscall.c | 4 + net/core/sock_map.c | 263 +++++++++++++++++++++++++++++++-- tools/include/uapi/linux/bpf.h | 5 + 6 files changed, 271 insertions(+), 16 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 62762390c93d..5034c1b4ded7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2996,6 +2996,7 @@ int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags); int sock_map_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); +int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog); void sock_map_unhash(struct sock *sk); void sock_map_destroy(struct sock *sk); @@ -3094,6 +3095,11 @@ static inline int sock_map_bpf_prog_query(const union bpf_attr *attr, { return -EINVAL; } + +static inline int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_BPF_SYSCALL */ #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index e65ec3fd2799..9c8dd4c01412 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -58,6 +58,10 @@ struct sk_psock_progs { struct bpf_prog *stream_parser; struct bpf_prog *stream_verdict; struct bpf_prog *skb_verdict; + struct bpf_link *msg_parser_link; + struct bpf_link *stream_parser_link; + struct bpf_link *stream_verdict_link; + struct bpf_link *skb_verdict_link; }; enum sk_psock_state_bits { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6fe9f11c8abe..cee0a7915c08 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1135,6 +1135,7 @@ enum bpf_link_type { BPF_LINK_TYPE_TCX = 11, BPF_LINK_TYPE_UPROBE_MULTI = 12, BPF_LINK_TYPE_NETKIT = 13, + BPF_LINK_TYPE_SOCKMAP = 14, __MAX_BPF_LINK_TYPE, }; @@ -6724,6 +6725,10 @@ struct bpf_link_info { __u32 ifindex; __u32 attach_type; } netkit; + struct { + __u32 map_id; + __u32 attach_type; + } sockmap; }; } __attribute__((aligned(8))); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e44c276e8617..7d392ec83655 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5213,6 +5213,10 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_SK_LOOKUP: ret = netns_bpf_link_create(attr, prog); break; + case BPF_PROG_TYPE_SK_MSG: + case BPF_PROG_TYPE_SK_SKB: + ret = sock_map_link_create(attr, prog); + break; #ifdef CONFIG_NET case BPF_PROG_TYPE_XDP: ret = bpf_xdp_link_attach(attr, prog); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 27d733c0f65e..63c016b4c169 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -24,8 +24,16 @@ struct bpf_stab { #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +/* This mutex is used to + * - protect race between prog/link attach/detach and link prog update, and + * - protect race between releasing and accessing map in bpf_link. + * A single global mutex lock is used since it is expected contention is low. + */ +static DEFINE_MUTEX(sockmap_mutex); + static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which); + struct bpf_prog *old, struct bpf_link *link, + u32 which); static struct sk_psock_progs *sock_map_progs(struct bpf_map *map); static struct bpf_map *sock_map_alloc(union bpf_attr *attr) @@ -71,7 +79,9 @@ int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - ret = sock_map_prog_update(map, prog, NULL, attr->attach_type); + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, prog, NULL, NULL, attr->attach_type); + mutex_unlock(&sockmap_mutex); fdput(f); return ret; } @@ -103,7 +113,9 @@ int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) goto put_prog; } - ret = sock_map_prog_update(map, NULL, prog, attr->attach_type); + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, NULL, prog, NULL, attr->attach_type); + mutex_unlock(&sockmap_mutex); put_prog: bpf_prog_put(prog); put_map: @@ -1454,55 +1466,84 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) return NULL; } -static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog, - u32 which) +static int sock_map_prog_link_lookup(struct bpf_map *map, struct bpf_prog ***pprog, + struct bpf_link ***plink, u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); + struct bpf_prog **cur_pprog; + struct bpf_link **cur_plink; if (!progs) return -EOPNOTSUPP; switch (which) { case BPF_SK_MSG_VERDICT: - *pprog = &progs->msg_parser; + cur_pprog = &progs->msg_parser; + cur_plink = &progs->msg_parser_link; break; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: - *pprog = &progs->stream_parser; + cur_pprog = &progs->stream_parser; + cur_plink = &progs->stream_parser_link; break; #endif case BPF_SK_SKB_STREAM_VERDICT: if (progs->skb_verdict) return -EBUSY; - *pprog = &progs->stream_verdict; + cur_pprog = &progs->stream_verdict; + cur_plink = &progs->stream_verdict_link; break; case BPF_SK_SKB_VERDICT: if (progs->stream_verdict) return -EBUSY; - *pprog = &progs->skb_verdict; + cur_pprog = &progs->skb_verdict; + cur_plink = &progs->skb_verdict_link; break; default: return -EOPNOTSUPP; } + *pprog = cur_pprog; + if (plink) + *plink = cur_plink; return 0; } +/* Handle the following four cases: + * prog_attach: prog != NULL, old == NULL, link == NULL + * prog_detach: prog == NULL, old != NULL, link == NULL + * link_attach: prog != NULL, old == NULL, link != NULL + * link_detach: prog == NULL, old != NULL, link != NULL + */ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which) + struct bpf_prog *old, struct bpf_link *link, + u32 which) { struct bpf_prog **pprog; + struct bpf_link **plink; int ret; - ret = sock_map_prog_lookup(map, &pprog, which); + ret = sock_map_prog_link_lookup(map, &pprog, &plink, which); if (ret) return ret; - if (old) - return psock_replace_prog(pprog, prog, old); + /* for prog_attach/prog_detach/link_attach, return error if a bpf_link + * exists for that prog. + */ + if ((!link || prog) && *plink) + return -EBUSY; - psock_set_prog(pprog, prog); - return 0; + if (old) { + ret = psock_replace_prog(pprog, prog, old); + if (!ret) + *plink = NULL; + } else { + psock_set_prog(pprog, prog); + if (link) + *plink = link; + } + + return ret; } int sock_map_bpf_prog_query(const union bpf_attr *attr, @@ -1527,7 +1568,7 @@ int sock_map_bpf_prog_query(const union bpf_attr *attr, rcu_read_lock(); - ret = sock_map_prog_lookup(map, &pprog, attr->query.attach_type); + ret = sock_map_prog_link_lookup(map, &pprog, NULL, attr->query.attach_type); if (ret) goto end; @@ -1657,6 +1698,196 @@ void sock_map_close(struct sock *sk, long timeout) } EXPORT_SYMBOL_GPL(sock_map_close); +struct sockmap_link { + struct bpf_link link; + struct bpf_map *map; + enum bpf_attach_type attach_type; +}; + +static void sock_map_link_release(struct bpf_link *link) +{ + struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + + mutex_lock(&sockmap_mutex); + if (!sockmap_link->map) + goto out; + + WARN_ON_ONCE(sock_map_prog_update(sockmap_link->map, NULL, link->prog, link, + sockmap_link->attach_type)); + + bpf_map_put_with_uref(sockmap_link->map); + sockmap_link->map = NULL; +out: + mutex_unlock(&sockmap_mutex); +} + +static int sock_map_link_detach(struct bpf_link *link) +{ + sock_map_link_release(link); + return 0; +} + +static void sock_map_link_dealloc(struct bpf_link *link) +{ + kfree(link); +} + +/* Handle the following two cases: + * case 1: link != NULL, prog != NULL, old != NULL + * case 2: link != NULL, prog != NULL, old == NULL + */ +static int sock_map_link_update_prog(struct bpf_link *link, + struct bpf_prog *prog, + struct bpf_prog *old) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + struct bpf_prog **pprog, *old_link_prog; + struct bpf_link **plink; + int ret = 0; + + mutex_lock(&sockmap_mutex); + + /* If old prog is not NULL, ensure old prog is the same as link->prog. */ + if (old && link->prog != old) { + ret = -EPERM; + goto out; + } + /* Ensure link->prog has the same type/attach_type as the new prog. */ + if (link->prog->type != prog->type || + link->prog->expected_attach_type != prog->expected_attach_type) { + ret = -EINVAL; + goto out; + } + + ret = sock_map_prog_link_lookup(sockmap_link->map, &pprog, &plink, + sockmap_link->attach_type); + if (ret) + goto out; + + /* return error if the stored bpf_link does not match the incoming bpf_link. */ + if (link != *plink) { + ret = -EBUSY; + goto out; + } + + if (old) { + ret = psock_replace_prog(pprog, prog, old); + if (ret) + goto out; + } else { + psock_set_prog(pprog, prog); + } + + bpf_prog_inc(prog); + old_link_prog = xchg(&link->prog, prog); + bpf_prog_put(old_link_prog); + +out: + mutex_unlock(&sockmap_mutex); + return ret; +} + +static u32 sock_map_link_get_map_id(const struct sockmap_link *sockmap_link) +{ + u32 map_id = 0; + + mutex_lock(&sockmap_mutex); + if (sockmap_link->map) + map_id = sockmap_link->map->id; + mutex_unlock(&sockmap_mutex); + return map_id; +} + +static int sock_map_link_fill_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + u32 map_id = sock_map_link_get_map_id(sockmap_link); + + info->sockmap.map_id = map_id; + info->sockmap.attach_type = sockmap_link->attach_type; + return 0; +} + +static void sock_map_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + u32 map_id = sock_map_link_get_map_id(sockmap_link); + + seq_printf(seq, "map_id:\t%u\n", map_id); + seq_printf(seq, "attach_type:\t%u\n", sockmap_link->attach_type); +} + +static const struct bpf_link_ops sock_map_link_ops = { + .release = sock_map_link_release, + .dealloc = sock_map_link_dealloc, + .detach = sock_map_link_detach, + .update_prog = sock_map_link_update_prog, + .fill_link_info = sock_map_link_fill_info, + .show_fdinfo = sock_map_link_show_fdinfo, +}; + +int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_link_primer link_primer; + struct sockmap_link *sockmap_link; + enum bpf_attach_type attach_type; + struct bpf_map *map; + int ret; + + if (attr->link_create.flags) + return -EINVAL; + + map = bpf_map_get_with_uref(attr->link_create.target_fd); + if (IS_ERR(map)) + return PTR_ERR(map); + if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) { + ret = -EINVAL; + goto out; + } + + sockmap_link = kzalloc(sizeof(*sockmap_link), GFP_USER); + if (!sockmap_link) { + ret = -ENOMEM; + goto out; + } + + attach_type = attr->link_create.attach_type; + bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog); + sockmap_link->map = map; + sockmap_link->attach_type = attach_type; + + ret = bpf_link_prime(&sockmap_link->link, &link_primer); + if (ret) { + kfree(sockmap_link); + goto out; + } + + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, prog, NULL, &sockmap_link->link, attach_type); + mutex_unlock(&sockmap_mutex); + if (ret) { + bpf_link_cleanup(&link_primer); + goto out; + } + + /* Increase refcnt for the prog since when old prog is replaced with + * psock_replace_prog() and psock_set_prog() its refcnt will be decreased. + * + * Actually, we do not need to increase refcnt for the prog since bpf_link + * will hold a reference. But in order to have less complexity w.r.t. + * replacing/setting prog, let us increase the refcnt to make things simpler. + */ + bpf_prog_inc(prog); + + return bpf_link_settle(&link_primer); + +out: + bpf_map_put_with_uref(map); + return ret; +} + static int sock_map_iter_attach_target(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6fe9f11c8abe..cee0a7915c08 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1135,6 +1135,7 @@ enum bpf_link_type { BPF_LINK_TYPE_TCX = 11, BPF_LINK_TYPE_UPROBE_MULTI = 12, BPF_LINK_TYPE_NETKIT = 13, + BPF_LINK_TYPE_SOCKMAP = 14, __MAX_BPF_LINK_TYPE, }; @@ -6724,6 +6725,10 @@ struct bpf_link_info { __u32 ifindex; __u32 attach_type; } netkit; + struct { + __u32 map_id; + __u32 attach_type; + } sockmap; }; } __attribute__((aligned(8))); From 849989af61added13b2a9005608b1cf46f36f88b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 9 Apr 2024 21:35:32 -0700 Subject: [PATCH 078/147] libbpf: Add bpf_link support for BPF_PROG_TYPE_SOCKMAP Introduce a libbpf API function bpf_program__attach_sockmap() which allow user to get a bpf_link for their corresponding programs. Acked-by: Andrii Nakryiko Acked-by: Eduard Zingerman Reviewed-by: John Fastabend Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240410043532.3737722-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 7 +++++++ tools/lib/bpf/libbpf.h | 2 ++ tools/lib/bpf/libbpf.map | 1 + 3 files changed, 10 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index b091154bc5b5..97eb6e5dd7c8 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -149,6 +149,7 @@ static const char * const link_type_name[] = { [BPF_LINK_TYPE_TCX] = "tcx", [BPF_LINK_TYPE_UPROBE_MULTI] = "uprobe_multi", [BPF_LINK_TYPE_NETKIT] = "netkit", + [BPF_LINK_TYPE_SOCKMAP] = "sockmap", }; static const char * const map_type_name[] = { @@ -12533,6 +12534,12 @@ bpf_program__attach_netns(const struct bpf_program *prog, int netns_fd) return bpf_program_attach_fd(prog, netns_fd, "netns", NULL); } +struct bpf_link * +bpf_program__attach_sockmap(const struct bpf_program *prog, int map_fd) +{ + return bpf_program_attach_fd(prog, map_fd, "sockmap", NULL); +} + struct bpf_link *bpf_program__attach_xdp(const struct bpf_program *prog, int ifindex) { /* target_fd/target_ifindex use the same field in LINK_CREATE */ diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 4f775a6dcaa0..1333ae20ebe6 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -795,6 +795,8 @@ bpf_program__attach_cgroup(const struct bpf_program *prog, int cgroup_fd); LIBBPF_API struct bpf_link * bpf_program__attach_netns(const struct bpf_program *prog, int netns_fd); LIBBPF_API struct bpf_link * +bpf_program__attach_sockmap(const struct bpf_program *prog, int map_fd); +LIBBPF_API struct bpf_link * bpf_program__attach_xdp(const struct bpf_program *prog, int ifindex); LIBBPF_API struct bpf_link * bpf_program__attach_freplace(const struct bpf_program *prog, diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 23d82bba021a..c1ce8aa3520b 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -419,6 +419,7 @@ LIBBPF_1.4.0 { LIBBPF_1.5.0 { global: + bpf_program__attach_sockmap; ring__consume_n; ring_buffer__consume_n; } LIBBPF_1.4.0; From 1f3e2091d25b2b140967480177fcaee2f0eebfb1 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 9 Apr 2024 21:35:37 -0700 Subject: [PATCH 079/147] bpftool: Add link dump support for BPF_LINK_TYPE_SOCKMAP An example output looks like: $ bpftool link 1776: sk_skb prog 49730 map_id 0 attach_type sk_skb_verdict pids test_progs(8424) 1777: sk_skb prog 49755 map_id 0 attach_type sk_skb_stream_verdict pids test_progs(8424) 1778: sk_msg prog 49770 map_id 8208 attach_type sk_msg_verdict pids test_progs(8424) Reviewed-by: John Fastabend Reviewed-by: Quentin Monnet Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240410043537.3737928-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/link.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index afde9d0c2ea1..5cd503b763d7 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -526,6 +526,10 @@ static int show_link_close_json(int fd, struct bpf_link_info *info) show_link_ifindex_json(info->netkit.ifindex, json_wtr); show_link_attach_type_json(info->netkit.attach_type, json_wtr); break; + case BPF_LINK_TYPE_SOCKMAP: + jsonw_uint_field(json_wtr, "map_id", info->sockmap.map_id); + show_link_attach_type_json(info->sockmap.attach_type, json_wtr); + break; case BPF_LINK_TYPE_XDP: show_link_ifindex_json(info->xdp.ifindex, json_wtr); break; @@ -915,6 +919,11 @@ static int show_link_close_plain(int fd, struct bpf_link_info *info) show_link_ifindex_plain(info->netkit.ifindex); show_link_attach_type_plain(info->netkit.attach_type); break; + case BPF_LINK_TYPE_SOCKMAP: + printf("\n\t"); + printf("map_id %u ", info->sockmap.map_id); + show_link_attach_type_plain(info->sockmap.attach_type); + break; case BPF_LINK_TYPE_XDP: printf("\n\t"); show_link_ifindex_plain(info->xdp.ifindex); From a15d58b2bc82abd8c4c994af158b0410424a18d3 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 9 Apr 2024 21:35:42 -0700 Subject: [PATCH 080/147] selftests/bpf: Refactor out helper functions for a few tests These helper functions will be used later new tests as well. There are no functionality change. Acked-by: Eduard Zingerman Reviewed-by: John Fastabend Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240410043542.3738166-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/sockmap_basic.c | 38 +++++++++++-------- .../bpf/progs/test_skmsg_load_helpers.c | 9 ++++- 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 77e26ecffa9d..63fb2da7930a 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -475,30 +475,19 @@ out: test_sockmap_drop_prog__destroy(drop); } -static void test_sockmap_skb_verdict_peek(void) +static void test_sockmap_skb_verdict_peek_helper(int map) { - int err, map, verdict, s, c1, p1, zero = 0, sent, recvd, avail; - struct test_sockmap_pass_prog *pass; + int err, s, c1, p1, zero = 0, sent, recvd, avail; char snd[256] = "0123456789"; char rcv[256] = "0"; - pass = test_sockmap_pass_prog__open_and_load(); - if (!ASSERT_OK_PTR(pass, "open_and_load")) - return; - verdict = bpf_program__fd(pass->progs.prog_skb_verdict); - map = bpf_map__fd(pass->maps.sock_map_rx); - - err = bpf_prog_attach(verdict, map, BPF_SK_SKB_STREAM_VERDICT, 0); - if (!ASSERT_OK(err, "bpf_prog_attach")) - goto out; - s = socket_loopback(AF_INET, SOCK_STREAM); if (!ASSERT_GT(s, -1, "socket_loopback(s)")) - goto out; + return; err = create_pair(s, AF_INET, SOCK_STREAM, &c1, &p1); if (!ASSERT_OK(err, "create_pairs(s)")) - goto out; + return; err = bpf_map_update_elem(map, &zero, &c1, BPF_NOEXIST); if (!ASSERT_OK(err, "bpf_map_update_elem(c1)")) @@ -520,6 +509,25 @@ static void test_sockmap_skb_verdict_peek(void) out_close: close(c1); close(p1); +} + +static void test_sockmap_skb_verdict_peek(void) +{ + struct test_sockmap_pass_prog *pass; + int err, map, verdict; + + pass = test_sockmap_pass_prog__open_and_load(); + if (!ASSERT_OK_PTR(pass, "open_and_load")) + return; + verdict = bpf_program__fd(pass->progs.prog_skb_verdict); + map = bpf_map__fd(pass->maps.sock_map_rx); + + err = bpf_prog_attach(verdict, map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach")) + goto out; + + test_sockmap_skb_verdict_peek_helper(map); + out: test_sockmap_pass_prog__destroy(pass); } diff --git a/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c b/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c index 45e8fc75a739..b753672f04c9 100644 --- a/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c +++ b/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c @@ -24,8 +24,7 @@ struct { __type(value, __u64); } socket_storage SEC(".maps"); -SEC("sk_msg") -int prog_msg_verdict(struct sk_msg_md *msg) +static int prog_msg_verdict_common(struct sk_msg_md *msg) { struct task_struct *task = (struct task_struct *)bpf_get_current_task(); int verdict = SK_PASS; @@ -44,4 +43,10 @@ int prog_msg_verdict(struct sk_msg_md *msg) return verdict; } +SEC("sk_msg") +int prog_msg_verdict(struct sk_msg_md *msg) +{ + return prog_msg_verdict_common(msg); +} + char _license[] SEC("license") = "GPL"; From 8ba218e625f0dfb3ef46fe0721dcdf565726ff76 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 9 Apr 2024 21:35:47 -0700 Subject: [PATCH 081/147] selftests/bpf: Add some tests with new bpf_program__attach_sockmap() APIs Add a few more tests in sockmap_basic.c and sockmap_listen.c to test bpf_link based APIs for SK_MSG and SK_SKB programs. Link attach/detach/update are all tested. All tests are passed. Acked-by: Eduard Zingerman Reviewed-by: John Fastabend Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240410043547.3738448-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/sockmap_basic.c | 133 ++++++++++++++++++ .../selftests/bpf/prog_tests/sockmap_listen.c | 38 +++++ .../bpf/progs/test_skmsg_load_helpers.c | 18 +++ .../bpf/progs/test_sockmap_pass_prog.c | 17 ++- .../progs/test_sockmap_skb_verdict_attach.c | 2 +- 5 files changed, 206 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 63fb2da7930a..1337153eb0ad 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -131,6 +131,65 @@ out: test_skmsg_load_helpers__destroy(skel); } +static void test_skmsg_helpers_with_link(enum bpf_map_type map_type) +{ + struct bpf_program *prog, *prog_clone, *prog_clone2; + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, opts); + struct test_skmsg_load_helpers *skel; + struct bpf_link *link, *link2; + int err, map; + + skel = test_skmsg_load_helpers__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_skmsg_load_helpers__open_and_load")) + return; + + prog = skel->progs.prog_msg_verdict; + prog_clone = skel->progs.prog_msg_verdict_clone; + prog_clone2 = skel->progs.prog_msg_verdict_clone2; + map = bpf_map__fd(skel->maps.sock_map); + + link = bpf_program__attach_sockmap(prog, map); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_sockmap")) + goto out; + + /* Fail since bpf_link for the same prog has been created. */ + err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_MSG_VERDICT, 0); + if (!ASSERT_ERR(err, "bpf_prog_attach")) + goto out; + + /* Fail since bpf_link for the same prog type has been created. */ + link2 = bpf_program__attach_sockmap(prog_clone, map); + if (!ASSERT_ERR_PTR(link2, "bpf_program__attach_sockmap")) { + bpf_link__detach(link2); + goto out; + } + + err = bpf_link__update_program(link, prog_clone); + if (!ASSERT_OK(err, "bpf_link__update_program")) + goto out; + + /* Fail since a prog with different type attempts to do update. */ + err = bpf_link__update_program(link, skel->progs.prog_skb_verdict); + if (!ASSERT_ERR(err, "bpf_link__update_program")) + goto out; + + /* Fail since the old prog does not match the one in the kernel. */ + opts.old_prog_fd = bpf_program__fd(prog_clone2); + opts.flags = BPF_F_REPLACE; + err = bpf_link_update(bpf_link__fd(link), bpf_program__fd(prog), &opts); + if (!ASSERT_ERR(err, "bpf_link_update")) + goto out; + + opts.old_prog_fd = bpf_program__fd(prog_clone); + opts.flags = BPF_F_REPLACE; + err = bpf_link_update(bpf_link__fd(link), bpf_program__fd(prog), &opts); + if (!ASSERT_OK(err, "bpf_link_update")) + goto out; +out: + bpf_link__detach(link); + test_skmsg_load_helpers__destroy(skel); +} + static void test_sockmap_update(enum bpf_map_type map_type) { int err, prog, src; @@ -298,6 +357,40 @@ out: test_sockmap_skb_verdict_attach__destroy(skel); } +static void test_sockmap_skb_verdict_attach_with_link(void) +{ + struct test_sockmap_skb_verdict_attach *skel; + struct bpf_program *prog; + struct bpf_link *link; + int err, map; + + skel = test_sockmap_skb_verdict_attach__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + prog = skel->progs.prog_skb_verdict; + map = bpf_map__fd(skel->maps.sock_map); + link = bpf_program__attach_sockmap(prog, map); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_sockmap")) + goto out; + + bpf_link__detach(link); + + err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach")) + goto out; + + /* Fail since attaching with the same prog/map has been done. */ + link = bpf_program__attach_sockmap(prog, map); + if (!ASSERT_ERR_PTR(link, "bpf_program__attach_sockmap")) + bpf_link__detach(link); + + err = bpf_prog_detach2(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT); + if (!ASSERT_OK(err, "bpf_prog_detach2")) + goto out; +out: + test_sockmap_skb_verdict_attach__destroy(skel); +} + static __u32 query_prog_id(int prog_fd) { struct bpf_prog_info info = {}; @@ -532,6 +625,38 @@ out: test_sockmap_pass_prog__destroy(pass); } +static void test_sockmap_skb_verdict_peek_with_link(void) +{ + struct test_sockmap_pass_prog *pass; + struct bpf_program *prog; + struct bpf_link *link; + int err, map; + + pass = test_sockmap_pass_prog__open_and_load(); + if (!ASSERT_OK_PTR(pass, "open_and_load")) + return; + prog = pass->progs.prog_skb_verdict; + map = bpf_map__fd(pass->maps.sock_map_rx); + link = bpf_program__attach_sockmap(prog, map); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_sockmap")) + goto out; + + err = bpf_link__update_program(link, pass->progs.prog_skb_verdict_clone); + if (!ASSERT_OK(err, "bpf_link__update_program")) + goto out; + + /* Fail since a prog with different attach type attempts to do update. */ + err = bpf_link__update_program(link, pass->progs.prog_skb_parser); + if (!ASSERT_ERR(err, "bpf_link__update_program")) + goto out; + + test_sockmap_skb_verdict_peek_helper(map); + ASSERT_EQ(pass->bss->clone_called, 1, "clone_called"); +out: + bpf_link__detach(link); + test_sockmap_pass_prog__destroy(pass); +} + static void test_sockmap_unconnected_unix(void) { int err, map, stream = 0, dgram = 0, zero = 0; @@ -796,6 +921,8 @@ void test_sockmap_basic(void) test_sockmap_skb_verdict_attach(BPF_SK_SKB_STREAM_VERDICT, BPF_SK_SKB_VERDICT); } + if (test__start_subtest("sockmap skb_verdict attach_with_link")) + test_sockmap_skb_verdict_attach_with_link(); if (test__start_subtest("sockmap msg_verdict progs query")) test_sockmap_progs_query(BPF_SK_MSG_VERDICT); if (test__start_subtest("sockmap stream_parser progs query")) @@ -812,6 +939,8 @@ void test_sockmap_basic(void) test_sockmap_skb_verdict_fionread(false); if (test__start_subtest("sockmap skb_verdict msg_f_peek")) test_sockmap_skb_verdict_peek(); + if (test__start_subtest("sockmap skb_verdict msg_f_peek with link")) + test_sockmap_skb_verdict_peek_with_link(); if (test__start_subtest("sockmap unconnected af_unix")) test_sockmap_unconnected_unix(); if (test__start_subtest("sockmap one socket to many map entries")) @@ -820,4 +949,8 @@ void test_sockmap_basic(void) test_sockmap_many_maps(); if (test__start_subtest("sockmap same socket replace")) test_sockmap_same_sock(); + if (test__start_subtest("sockmap sk_msg attach sockmap helpers with link")) + test_skmsg_helpers_with_link(BPF_MAP_TYPE_SOCKMAP); + if (test__start_subtest("sockhash sk_msg attach sockhash helpers with link")) + test_skmsg_helpers_with_link(BPF_MAP_TYPE_SOCKHASH); } diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index a92807bfcd13..e91b59366030 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -767,6 +767,24 @@ static void test_msg_redir_to_connected(struct test_sockmap_listen *skel, xbpf_prog_detach2(verdict, sock_map, BPF_SK_MSG_VERDICT); } +static void test_msg_redir_to_connected_with_link(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int family, + int sotype) +{ + int prog_msg_verdict = bpf_program__fd(skel->progs.prog_msg_verdict); + int verdict_map = bpf_map__fd(skel->maps.verdict_map); + int sock_map = bpf_map__fd(inner_map); + int link_fd; + + link_fd = bpf_link_create(prog_msg_verdict, sock_map, BPF_SK_MSG_VERDICT, NULL); + if (!ASSERT_GE(link_fd, 0, "bpf_link_create")) + return; + + redir_to_connected(family, sotype, sock_map, verdict_map, REDIR_EGRESS); + + close(link_fd); +} + static void redir_to_listening(int family, int sotype, int sock_mapfd, int verd_mapfd, enum redir_mode mode) { @@ -869,6 +887,24 @@ static void test_msg_redir_to_listening(struct test_sockmap_listen *skel, xbpf_prog_detach2(verdict, sock_map, BPF_SK_MSG_VERDICT); } +static void test_msg_redir_to_listening_with_link(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int family, + int sotype) +{ + struct bpf_program *verdict = skel->progs.prog_msg_verdict; + int verdict_map = bpf_map__fd(skel->maps.verdict_map); + int sock_map = bpf_map__fd(inner_map); + struct bpf_link *link; + + link = bpf_program__attach_sockmap(verdict, sock_map); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_sockmap")) + return; + + redir_to_listening(family, sotype, sock_map, verdict_map, REDIR_EGRESS); + + bpf_link__detach(link); +} + static void redir_partial(int family, int sotype, int sock_map, int parser_map) { int s, c0 = -1, c1 = -1, p0 = -1, p1 = -1; @@ -1316,7 +1352,9 @@ static void test_redir(struct test_sockmap_listen *skel, struct bpf_map *map, TEST(test_skb_redir_to_listening), TEST(test_skb_redir_partial), TEST(test_msg_redir_to_connected), + TEST(test_msg_redir_to_connected_with_link), TEST(test_msg_redir_to_listening), + TEST(test_msg_redir_to_listening_with_link), }; const char *family_name, *map_name; const struct redir_test *t; diff --git a/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c b/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c index b753672f04c9..996b177324ba 100644 --- a/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c +++ b/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c @@ -49,4 +49,22 @@ int prog_msg_verdict(struct sk_msg_md *msg) return prog_msg_verdict_common(msg); } +SEC("sk_msg") +int prog_msg_verdict_clone(struct sk_msg_md *msg) +{ + return prog_msg_verdict_common(msg); +} + +SEC("sk_msg") +int prog_msg_verdict_clone2(struct sk_msg_md *msg) +{ + return prog_msg_verdict_common(msg); +} + +SEC("sk_skb/stream_verdict") +int prog_skb_verdict(struct __sk_buff *skb) +{ + return SK_PASS; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c b/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c index 1d86a717a290..69aacc96db36 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c +++ b/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c @@ -23,10 +23,25 @@ struct { __type(value, int); } sock_map_msg SEC(".maps"); -SEC("sk_skb") +SEC("sk_skb/stream_verdict") int prog_skb_verdict(struct __sk_buff *skb) { return SK_PASS; } +int clone_called; + +SEC("sk_skb/stream_verdict") +int prog_skb_verdict_clone(struct __sk_buff *skb) +{ + clone_called = 1; + return SK_PASS; +} + +SEC("sk_skb/stream_parser") +int prog_skb_parser(struct __sk_buff *skb) +{ + return SK_PASS; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c b/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c index 3c69aa971738..d25b0bb30fc0 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c +++ b/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c @@ -9,7 +9,7 @@ struct { __type(value, __u64); } sock_map SEC(".maps"); -SEC("sk_skb") +SEC("sk_skb/verdict") int prog_skb_verdict(struct __sk_buff *skb) { return SK_DROP; From ffa6b26b4d8a0520b78636ca9373ab842cb3b1a8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 10 Apr 2024 08:33:26 -0700 Subject: [PATCH 082/147] selftests/bpf: Enable tests for atomics with cpuv4 When looking at Alexei's patch ([1]) which added tests for atomics, I noticed that the tests will be skipped with cpuv4. For example, with latest llvm19, I see: [root@arch-fb-vm1 bpf]# ./test_progs -t arena_atomics #3/1 arena_atomics/add:OK ... #3/7 arena_atomics/xchg:OK #3 arena_atomics:OK Summary: 1/7 PASSED, 0 SKIPPED, 0 FAILED [root@arch-fb-vm1 bpf]# ./test_progs-cpuv4 -t arena_atomics #3 arena_atomics:SKIP Summary: 1/0 PASSED, 1 SKIPPED, 0 FAILED [root@arch-fb-vm1 bpf]# It is perfectly fine to enable atomics-related tests for cpuv4. With this patch, I have [root@arch-fb-vm1 bpf]# ./test_progs-cpuv4 -t arena_atomics #3/1 arena_atomics/add:OK ... #3/7 arena_atomics/xchg:OK #3 arena_atomics:OK Summary: 1/7 PASSED, 0 SKIPPED, 0 FAILED [1] https://lore.kernel.org/r/20240405231134.17274-2-alexei.starovoitov@gmail.com Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240410153326.1851055-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index eea5b8deaaf0..edc73f8f5aef 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -658,7 +658,7 @@ $(eval $(call DEFINE_TEST_RUNNER,test_progs,no_alu32)) # Define test_progs-cpuv4 test runner. ifneq ($(CLANG_CPUV4),) TRUNNER_BPF_BUILD_RULE := CLANG_CPUV4_BPF_BUILD_RULE -TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) +TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) -DENABLE_ATOMICS_TESTS $(eval $(call DEFINE_TEST_RUNNER,test_progs,cpuv4)) endif From d75142dbeb2bd1587b9cc19f841578f541275a64 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 9 Apr 2024 13:18:40 +0800 Subject: [PATCH 083/147] selftests/bpf: Fix umount cgroup2 error in test_sockmap This patch fixes the following "umount cgroup2" error in test_sockmap.c: (cgroup_helpers.c:353: errno: Device or resource busy) umount cgroup2 Cgroup fd cg_fd should be closed before cleanup_cgroup_environment(). Fixes: 13a5f3ffd202 ("bpf: Selftests, sockmap test prog run without setting cgroup") Signed-off-by: Geliang Tang Acked-by: Yonghong Song Link: https://lore.kernel.org/r/0399983bde729708773416b8488bac2cd5e022b8.1712639568.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/test_sockmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 024a0faafb3b..43612de44fbf 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -2104,9 +2104,9 @@ out: free(options.whitelist); if (options.blacklist) free(options.blacklist); + close(cg_fd); if (cg_created) cleanup_cgroup_environment(); - close(cg_fd); return err; } From 68acca6e6f99b1f928a2c05b92bb1c272edb8ae7 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 11 Apr 2024 13:43:11 +0800 Subject: [PATCH 084/147] selftests/bpf: Add struct send_recv_arg Avoid setting total_bytes and stop as global variables, this patch adds a new struct named send_recv_arg to pass arguments between threads. Put these two variables together with fd into this struct and pass it to server thread, so that server thread can access these two variables without setting them as global ones. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/ca1dd703b796f6810985418373e750f7068b4186.1712813933.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/bpf_tcp_ca.c | 34 ++++++++++++------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 077b107130f6..64f172f02a9a 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -21,7 +21,6 @@ static const unsigned int total_bytes = 10 * 1024 * 1024; static int expected_stg = 0xeB9F; -static int stop; static int settcpca(int fd, const char *tcp_ca) { @@ -34,13 +33,20 @@ static int settcpca(int fd, const char *tcp_ca) return 0; } +struct send_recv_arg { + int fd; + uint32_t bytes; + int stop; +}; + static void *server(void *arg) { - int lfd = (int)(long)arg, err = 0, fd; + struct send_recv_arg *a = (struct send_recv_arg *)arg; ssize_t nr_sent = 0, bytes = 0; char batch[1500]; + int err = 0, fd; - fd = accept(lfd, NULL, NULL); + fd = accept(a->fd, NULL, NULL); while (fd == -1) { if (errno == EINTR) continue; @@ -53,9 +59,9 @@ static void *server(void *arg) goto done; } - while (bytes < total_bytes && !READ_ONCE(stop)) { + while (bytes < a->bytes && !READ_ONCE(a->stop)) { nr_sent = send(fd, &batch, - MIN(total_bytes - bytes, sizeof(batch)), 0); + MIN(a->bytes - bytes, sizeof(batch)), 0); if (nr_sent == -1 && errno == EINTR) continue; if (nr_sent == -1) { @@ -65,13 +71,13 @@ static void *server(void *arg) bytes += nr_sent; } - ASSERT_EQ(bytes, total_bytes, "send"); + ASSERT_EQ(bytes, a->bytes, "send"); done: if (fd >= 0) close(fd); if (err) { - WRITE_ONCE(stop, 1); + WRITE_ONCE(a->stop, 1); return ERR_PTR(err); } return NULL; @@ -80,18 +86,22 @@ done: static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) { ssize_t nr_recv = 0, bytes = 0; + struct send_recv_arg arg = { + .bytes = total_bytes, + .stop = 0, + }; int lfd = -1, fd = -1; pthread_t srv_thread; void *thread_ret; char batch[1500]; int err; - WRITE_ONCE(stop, 0); - lfd = start_server(AF_INET6, SOCK_STREAM, NULL, 0, 0); if (!ASSERT_NEQ(lfd, -1, "socket")) return; + arg.fd = lfd; + fd = socket(AF_INET6, SOCK_STREAM, 0); if (!ASSERT_NEQ(fd, -1, "socket")) { close(lfd); @@ -123,12 +133,12 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) goto done; } - err = pthread_create(&srv_thread, NULL, server, (void *)(long)lfd); + err = pthread_create(&srv_thread, NULL, server, (void *)&arg); if (!ASSERT_OK(err, "pthread_create")) goto done; /* recv total_bytes */ - while (bytes < total_bytes && !READ_ONCE(stop)) { + while (bytes < total_bytes && !READ_ONCE(arg.stop)) { nr_recv = recv(fd, &batch, MIN(total_bytes - bytes, sizeof(batch)), 0); if (nr_recv == -1 && errno == EINTR) @@ -140,7 +150,7 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) ASSERT_EQ(bytes, total_bytes, "recv"); - WRITE_ONCE(stop, 1); + WRITE_ONCE(arg.stop, 1); pthread_join(srv_thread, &thread_ret); ASSERT_OK(IS_ERR(thread_ret), "thread_ret"); From dc34e44ea6a1c11cc517adc6df527b457acb9eaf Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 11 Apr 2024 13:43:12 +0800 Subject: [PATCH 085/147] selftests/bpf: Export send_recv_data helper This patch extracts the code to send and receive data into a new helper named send_recv_data() in network_helpers.c and export it in network_helpers.h. This helper will be used for MPTCP BPF selftests. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/5231103be91fadcce3674a589542c63b6a5eedd4.1712813933.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/network_helpers.c | 102 ++++++++++++++++++ tools/testing/selftests/bpf/network_helpers.h | 1 + .../selftests/bpf/prog_tests/bpf_tcp_ca.c | 81 +------------- 3 files changed, 104 insertions(+), 80 deletions(-) diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 04175e16195a..dc1fd7af9c7a 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -545,3 +545,105 @@ int set_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param) close(sockfd); return 0; } + +struct send_recv_arg { + int fd; + uint32_t bytes; + int stop; +}; + +static void *send_recv_server(void *arg) +{ + struct send_recv_arg *a = (struct send_recv_arg *)arg; + ssize_t nr_sent = 0, bytes = 0; + char batch[1500]; + int err = 0, fd; + + fd = accept(a->fd, NULL, NULL); + while (fd == -1) { + if (errno == EINTR) + continue; + err = -errno; + goto done; + } + + if (settimeo(fd, 0)) { + err = -errno; + goto done; + } + + while (bytes < a->bytes && !READ_ONCE(a->stop)) { + nr_sent = send(fd, &batch, + MIN(a->bytes - bytes, sizeof(batch)), 0); + if (nr_sent == -1 && errno == EINTR) + continue; + if (nr_sent == -1) { + err = -errno; + break; + } + bytes += nr_sent; + } + + if (bytes != a->bytes) { + log_err("send %zd expected %u", bytes, a->bytes); + if (!err) + err = bytes > a->bytes ? -E2BIG : -EINTR; + } + +done: + if (fd >= 0) + close(fd); + if (err) { + WRITE_ONCE(a->stop, 1); + return ERR_PTR(err); + } + return NULL; +} + +int send_recv_data(int lfd, int fd, uint32_t total_bytes) +{ + ssize_t nr_recv = 0, bytes = 0; + struct send_recv_arg arg = { + .fd = lfd, + .bytes = total_bytes, + .stop = 0, + }; + pthread_t srv_thread; + void *thread_ret; + char batch[1500]; + int err = 0; + + err = pthread_create(&srv_thread, NULL, send_recv_server, (void *)&arg); + if (err) { + log_err("Failed to pthread_create"); + return err; + } + + /* recv total_bytes */ + while (bytes < total_bytes && !READ_ONCE(arg.stop)) { + nr_recv = recv(fd, &batch, + MIN(total_bytes - bytes, sizeof(batch)), 0); + if (nr_recv == -1 && errno == EINTR) + continue; + if (nr_recv == -1) { + err = -errno; + break; + } + bytes += nr_recv; + } + + if (bytes != total_bytes) { + log_err("recv %zd expected %u", bytes, total_bytes); + if (!err) + err = bytes > total_bytes ? -E2BIG : -EINTR; + } + + WRITE_ONCE(arg.stop, 1); + pthread_join(srv_thread, &thread_ret); + if (IS_ERR(thread_ret)) { + log_err("Failed in thread_ret %ld", PTR_ERR(thread_ret)); + err = err ? : PTR_ERR(thread_ret); + } + + return err; +} diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 6457445cc6e2..70f4e4c92733 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -76,6 +76,7 @@ struct nstoken; */ struct nstoken *open_netns(const char *name); void close_netns(struct nstoken *token); +int send_recv_data(int lfd, int fd, uint32_t total_bytes); static __u16 csum_fold(__u32 csum) { diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 64f172f02a9a..907bac46c774 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -33,75 +33,15 @@ static int settcpca(int fd, const char *tcp_ca) return 0; } -struct send_recv_arg { - int fd; - uint32_t bytes; - int stop; -}; - -static void *server(void *arg) -{ - struct send_recv_arg *a = (struct send_recv_arg *)arg; - ssize_t nr_sent = 0, bytes = 0; - char batch[1500]; - int err = 0, fd; - - fd = accept(a->fd, NULL, NULL); - while (fd == -1) { - if (errno == EINTR) - continue; - err = -errno; - goto done; - } - - if (settimeo(fd, 0)) { - err = -errno; - goto done; - } - - while (bytes < a->bytes && !READ_ONCE(a->stop)) { - nr_sent = send(fd, &batch, - MIN(a->bytes - bytes, sizeof(batch)), 0); - if (nr_sent == -1 && errno == EINTR) - continue; - if (nr_sent == -1) { - err = -errno; - break; - } - bytes += nr_sent; - } - - ASSERT_EQ(bytes, a->bytes, "send"); - -done: - if (fd >= 0) - close(fd); - if (err) { - WRITE_ONCE(a->stop, 1); - return ERR_PTR(err); - } - return NULL; -} - static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) { - ssize_t nr_recv = 0, bytes = 0; - struct send_recv_arg arg = { - .bytes = total_bytes, - .stop = 0, - }; int lfd = -1, fd = -1; - pthread_t srv_thread; - void *thread_ret; - char batch[1500]; int err; lfd = start_server(AF_INET6, SOCK_STREAM, NULL, 0, 0); if (!ASSERT_NEQ(lfd, -1, "socket")) return; - arg.fd = lfd; - fd = socket(AF_INET6, SOCK_STREAM, 0); if (!ASSERT_NEQ(fd, -1, "socket")) { close(lfd); @@ -133,26 +73,7 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) goto done; } - err = pthread_create(&srv_thread, NULL, server, (void *)&arg); - if (!ASSERT_OK(err, "pthread_create")) - goto done; - - /* recv total_bytes */ - while (bytes < total_bytes && !READ_ONCE(arg.stop)) { - nr_recv = recv(fd, &batch, - MIN(total_bytes - bytes, sizeof(batch)), 0); - if (nr_recv == -1 && errno == EINTR) - continue; - if (nr_recv == -1) - break; - bytes += nr_recv; - } - - ASSERT_EQ(bytes, total_bytes, "recv"); - - WRITE_ONCE(arg.stop, 1); - pthread_join(srv_thread, &thread_ret); - ASSERT_OK(IS_ERR(thread_ret), "thread_ret"); + ASSERT_OK(send_recv_data(lfd, fd, total_bytes), "send_recv_data"); done: close(lfd); From 23cc4fe44f1df5ccce088a7c9398f96794047c2a Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Thu, 11 Apr 2024 18:43:00 +0200 Subject: [PATCH 086/147] bpftool: Fix typo in error message s/at at/at a/ Signed-off-by: Thorsten Blum Signed-off-by: Daniel Borkmann Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240411164258.533063-3-thorsten.blum@toblux.com --- tools/bpf/bpftool/prog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 4c4cf16a40ba..1a501cf09e78 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -2081,7 +2081,7 @@ static int profile_parse_metrics(int argc, char **argv) NEXT_ARG(); } if (selected_cnt > MAX_NUM_PROFILE_METRICS) { - p_err("too many (%d) metrics, please specify no more than %d metrics at at time", + p_err("too many (%d) metrics, please specify no more than %d metrics at a time", selected_cnt, MAX_NUM_PROFILE_METRICS); return -1; } From 4d4992ff587604455e8843a0e76dce0b99175319 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 10 Apr 2024 16:09:52 +0200 Subject: [PATCH 087/147] selftests/bpf: Add read_trace_pipe_iter function We have two printk tests reading trace_pipe in non blocking way, with the very same code. Moving that in new read_trace_pipe_iter function. Current read_trace_pipe is used from samples/bpf and needs to do blocking read and printf of the trace_pipe data, using new read_trace_pipe_iter to implement that. Both printk tests do early checks for the number of found messages and can bail earlier, but I did not find any speed difference w/o that condition, so I did not complicate the change more for that. Some of the samples/bpf programs use read_trace_pipe function, so I kept that interface untouched. I did not see any issues with affected samples/bpf programs other than there's slight change in read_trace_pipe output. The current code uses puts that adds new line after the printed string, so we would occasionally see extra new line. With this patch we read output per lines, so there's no need to use puts and we can use just printf instead without extra new line. Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240410140952.292261-1-jolsa@kernel.org --- .../selftests/bpf/prog_tests/trace_printk.c | 36 +++-------- .../selftests/bpf/prog_tests/trace_vprintk.c | 36 +++-------- tools/testing/selftests/bpf/trace_helpers.c | 63 ++++++++++++------- tools/testing/selftests/bpf/trace_helpers.h | 2 + 4 files changed, 60 insertions(+), 77 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/trace_printk.c b/tools/testing/selftests/bpf/prog_tests/trace_printk.c index 7b9124d506a5..e56e88596d64 100644 --- a/tools/testing/selftests/bpf/prog_tests/trace_printk.c +++ b/tools/testing/selftests/bpf/prog_tests/trace_printk.c @@ -5,18 +5,19 @@ #include "trace_printk.lskel.h" -#define TRACEFS_PIPE "/sys/kernel/tracing/trace_pipe" -#define DEBUGFS_PIPE "/sys/kernel/debug/tracing/trace_pipe" #define SEARCHMSG "testing,testing" +static void trace_pipe_cb(const char *str, void *data) +{ + if (strstr(str, SEARCHMSG) != NULL) + (*(int *)data)++; +} + void serial_test_trace_printk(void) { struct trace_printk_lskel__bss *bss; - int err = 0, iter = 0, found = 0; struct trace_printk_lskel *skel; - char *buf = NULL; - FILE *fp = NULL; - size_t buflen; + int err = 0, found = 0; skel = trace_printk_lskel__open(); if (!ASSERT_OK_PTR(skel, "trace_printk__open")) @@ -35,16 +36,6 @@ void serial_test_trace_printk(void) if (!ASSERT_OK(err, "trace_printk__attach")) goto cleanup; - if (access(TRACEFS_PIPE, F_OK) == 0) - fp = fopen(TRACEFS_PIPE, "r"); - else - fp = fopen(DEBUGFS_PIPE, "r"); - if (!ASSERT_OK_PTR(fp, "fopen(TRACE_PIPE)")) - goto cleanup; - - /* We do not want to wait forever if this test fails... */ - fcntl(fileno(fp), F_SETFL, O_NONBLOCK); - /* wait for tracepoint to trigger */ usleep(1); trace_printk_lskel__detach(skel); @@ -56,21 +47,12 @@ void serial_test_trace_printk(void) goto cleanup; /* verify our search string is in the trace buffer */ - while (getline(&buf, &buflen, fp) >= 0 || errno == EAGAIN) { - if (strstr(buf, SEARCHMSG) != NULL) - found++; - if (found == bss->trace_printk_ran) - break; - if (++iter > 1000) - break; - } + ASSERT_OK(read_trace_pipe_iter(trace_pipe_cb, &found, 1000), + "read_trace_pipe_iter"); if (!ASSERT_EQ(found, bss->trace_printk_ran, "found")) goto cleanup; cleanup: trace_printk_lskel__destroy(skel); - free(buf); - if (fp) - fclose(fp); } diff --git a/tools/testing/selftests/bpf/prog_tests/trace_vprintk.c b/tools/testing/selftests/bpf/prog_tests/trace_vprintk.c index 44ea2fd88f4c..2af6a6f2096a 100644 --- a/tools/testing/selftests/bpf/prog_tests/trace_vprintk.c +++ b/tools/testing/selftests/bpf/prog_tests/trace_vprintk.c @@ -5,18 +5,19 @@ #include "trace_vprintk.lskel.h" -#define TRACEFS_PIPE "/sys/kernel/tracing/trace_pipe" -#define DEBUGFS_PIPE "/sys/kernel/debug/tracing/trace_pipe" #define SEARCHMSG "1,2,3,4,5,6,7,8,9,10" +static void trace_pipe_cb(const char *str, void *data) +{ + if (strstr(str, SEARCHMSG) != NULL) + (*(int *)data)++; +} + void serial_test_trace_vprintk(void) { struct trace_vprintk_lskel__bss *bss; - int err = 0, iter = 0, found = 0; struct trace_vprintk_lskel *skel; - char *buf = NULL; - FILE *fp = NULL; - size_t buflen; + int err = 0, found = 0; skel = trace_vprintk_lskel__open_and_load(); if (!ASSERT_OK_PTR(skel, "trace_vprintk__open_and_load")) @@ -28,16 +29,6 @@ void serial_test_trace_vprintk(void) if (!ASSERT_OK(err, "trace_vprintk__attach")) goto cleanup; - if (access(TRACEFS_PIPE, F_OK) == 0) - fp = fopen(TRACEFS_PIPE, "r"); - else - fp = fopen(DEBUGFS_PIPE, "r"); - if (!ASSERT_OK_PTR(fp, "fopen(TRACE_PIPE)")) - goto cleanup; - - /* We do not want to wait forever if this test fails... */ - fcntl(fileno(fp), F_SETFL, O_NONBLOCK); - /* wait for tracepoint to trigger */ usleep(1); trace_vprintk_lskel__detach(skel); @@ -49,14 +40,8 @@ void serial_test_trace_vprintk(void) goto cleanup; /* verify our search string is in the trace buffer */ - while (getline(&buf, &buflen, fp) >= 0 || errno == EAGAIN) { - if (strstr(buf, SEARCHMSG) != NULL) - found++; - if (found == bss->trace_vprintk_ran) - break; - if (++iter > 1000) - break; - } + ASSERT_OK(read_trace_pipe_iter(trace_pipe_cb, &found, 1000), + "read_trace_pipe_iter"); if (!ASSERT_EQ(found, bss->trace_vprintk_ran, "found")) goto cleanup; @@ -66,7 +51,4 @@ void serial_test_trace_vprintk(void) cleanup: trace_vprintk_lskel__destroy(skel); - free(buf); - if (fp) - fclose(fp); } diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 7f45b4cb41fe..70e29f316fe7 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -233,29 +233,6 @@ out: return err; } -void read_trace_pipe(void) -{ - int trace_fd; - - if (access(TRACEFS_PIPE, F_OK) == 0) - trace_fd = open(TRACEFS_PIPE, O_RDONLY, 0); - else - trace_fd = open(DEBUGFS_PIPE, O_RDONLY, 0); - if (trace_fd < 0) - return; - - while (1) { - static char buf[4096]; - ssize_t sz; - - sz = read(trace_fd, buf, sizeof(buf) - 1); - if (sz > 0) { - buf[sz] = 0; - puts(buf); - } - } -} - ssize_t get_uprobe_offset(const void *addr) { size_t start, end, base; @@ -413,3 +390,43 @@ out: close(fd); return err; } + +int read_trace_pipe_iter(void (*cb)(const char *str, void *data), void *data, int iter) +{ + size_t buflen, n; + char *buf = NULL; + FILE *fp = NULL; + + if (access(TRACEFS_PIPE, F_OK) == 0) + fp = fopen(TRACEFS_PIPE, "r"); + else + fp = fopen(DEBUGFS_PIPE, "r"); + if (!fp) + return -1; + + /* We do not want to wait forever when iter is specified. */ + if (iter) + fcntl(fileno(fp), F_SETFL, O_NONBLOCK); + + while ((n = getline(&buf, &buflen, fp) >= 0) || errno == EAGAIN) { + if (n > 0) + cb(buf, data); + if (iter && !(--iter)) + break; + } + + free(buf); + if (fp) + fclose(fp); + return 0; +} + +static void trace_pipe_cb(const char *str, void *data) +{ + printf("%s", str); +} + +void read_trace_pipe(void) +{ + read_trace_pipe_iter(trace_pipe_cb, NULL, 0); +} diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h index d1ed71789049..2ce873c9f9aa 100644 --- a/tools/testing/selftests/bpf/trace_helpers.h +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -33,6 +33,8 @@ struct ksym *search_kallsyms_custom_local(struct ksyms *ksyms, const void *p1, int kallsyms_find(const char *sym, unsigned long long *addr); void read_trace_pipe(void); +int read_trace_pipe_iter(void (*cb)(const char *str, void *data), + void *data, int iter); ssize_t get_uprobe_offset(const void *addr); ssize_t get_rel_offset(uintptr_t addr); From fc5eb4a84e4c063e75a6a6e92308e9533c0f19b5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 15 Apr 2024 18:20:45 +0200 Subject: [PATCH 088/147] btf: Avoid weak external references If the BTF code is enabled in the build configuration, the start/stop BTF markers are guaranteed to exist. Only when CONFIG_DEBUG_INFO_BTF=n, the references in btf_parse_vmlinux() will remain unsatisfied, relying on the weak linkage of the external references to avoid breaking the build. Avoid GOT based relocations to these markers in the final executable by dropping the weak attribute and instead, make btf_parse_vmlinux() return ERR_PTR(-ENOENT) directly if CONFIG_DEBUG_INFO_BTF is not enabled to begin with. The compiler will drop any subsequent references to __start_BTF and __stop_BTF in that case, allowing the link to succeed. Note that Clang will notice that taking the address of __start_BTF can no longer yield NULL, so testing for that condition becomes unnecessary. Signed-off-by: Ard Biesheuvel Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Arnd Bergmann Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20240415162041.2491523-8-ardb+git@google.com --- kernel/bpf/btf.c | 7 +++++-- kernel/bpf/sysfs_btf.c | 6 +++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 90c4a32d89ff..6d46cee47ae3 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5642,8 +5642,8 @@ errout_free: return ERR_PTR(err); } -extern char __weak __start_BTF[]; -extern char __weak __stop_BTF[]; +extern char __start_BTF[]; +extern char __stop_BTF[]; extern struct btf *btf_vmlinux; #define BPF_MAP_TYPE(_id, _ops) @@ -5971,6 +5971,9 @@ struct btf *btf_parse_vmlinux(void) struct btf *btf = NULL; int err; + if (!IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) + return ERR_PTR(-ENOENT); + env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); if (!env) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index ef6911aee3bb..fedb54c94cdb 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -9,8 +9,8 @@ #include /* See scripts/link-vmlinux.sh, gen_btf() func for details */ -extern char __weak __start_BTF[]; -extern char __weak __stop_BTF[]; +extern char __start_BTF[]; +extern char __stop_BTF[]; static ssize_t btf_vmlinux_read(struct file *file, struct kobject *kobj, @@ -32,7 +32,7 @@ static int __init btf_vmlinux_init(void) { bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF; - if (!__start_BTF || bin_attr_btf_vmlinux.size == 0) + if (bin_attr_btf_vmlinux.size == 0) return 0; btf_kobj = kobject_create_and_add("btf", kernel_kobj); From dac045fc9fa653e250f991ea8350b32cfec690d2 Mon Sep 17 00:00:00 2001 From: Chen Pei Date: Mon, 15 Apr 2024 16:19:28 +0800 Subject: [PATCH 089/147] bpf, tests: Fix typos in comments Currently, there are two comments with same name "64-bit ATOMIC magnitudes", the second one should be "32-bit ATOMIC magnitudes" based on the context. Signed-off-by: Chen Pei Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240415081928.17440-1-cp0613@linux.alibaba.com --- lib/test_bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 569e6d2dc55c..207ff87194db 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -13431,7 +13431,7 @@ static struct bpf_test tests[] = { .stack_depth = 8, .nr_testruns = NR_PATTERN_RUNS, }, - /* 64-bit atomic magnitudes */ + /* 32-bit atomic magnitudes */ { "ATOMIC_W_ADD: all operand magnitudes", { }, From 1f586614f3ffa80fdf2116b2a1bebcdb5969cef8 Mon Sep 17 00:00:00 2001 From: Harishankar Vishwanathan Date: Tue, 16 Apr 2024 07:53:02 -0400 Subject: [PATCH 090/147] bpf: Harden and/or/xor value tracking in verifier This patch addresses a latent unsoundness issue in the scalar(32)_min_max_and/or/xor functions. While it is not a bugfix, it ensures that the functions produce sound outputs for all inputs. The issue occurs in these functions when setting signed bounds. The following example illustrates the issue for scalar_min_max_and(), but it applies to the other functions. In scalar_min_max_and() the following clause is executed when ANDing positive numbers: /* ANDing two positives gives a positive, so safe to * cast result into s64. */ dst_reg->smin_value = dst_reg->umin_value; dst_reg->smax_value = dst_reg->umax_value; However, if umin_value and umax_value of dst_reg cross the sign boundary (i.e., if (s64)dst_reg->umin_value > (s64)dst_reg->umax_value), then we will end up with smin_value > smax_value, which is unsound. Previous works [1, 2] have discovered and reported this issue. Our tool Agni [2, 3] consideres it a false positive. This is because, during the verification of the abstract operator scalar_min_max_and(), Agni restricts its inputs to those passing through reg_bounds_sync(). This mimics real-world verifier behavior, as reg_bounds_sync() is invariably executed at the tail of every abstract operator. Therefore, such behavior is unlikely in an actual verifier execution. However, it is still unsound for an abstract operator to set signed bounds such that smin_value > smax_value. This patch fixes it, making the abstract operator sound for all (well-formed) inputs. It is worth noting that while the previous code updated the signed bounds (using the output unsigned bounds) only when the *input signed* bounds were positive, the new code updates them whenever the *output unsigned* bounds do not cross the sign boundary. An alternative approach to fix this latent unsoundness would be to unconditionally set the signed bounds to unbounded [S64_MIN, S64_MAX], and let reg_bounds_sync() refine the signed bounds using the unsigned bounds and the tnum. We found that our approach produces more precise (tighter) bounds. For example, consider these inputs to BPF_AND: /* dst_reg */ var_off.value: 8608032320201083347 var_off.mask: 615339716653692460 smin_value: 8070450532247928832 smax_value: 8070450532247928832 umin_value: 13206380674380886586 umax_value: 13206380674380886586 s32_min_value: -2110561598 s32_max_value: -133438816 u32_min_value: 4135055354 u32_max_value: 4135055354 /* src_reg */ var_off.value: 8584102546103074815 var_off.mask: 9862641527606476800 smin_value: 2920655011908158522 smax_value: 7495731535348625717 umin_value: 7001104867969363969 umax_value: 8584102543730304042 s32_min_value: -2097116671 s32_max_value: 71704632 u32_min_value: 1047457619 u32_max_value: 4268683090 After going through tnum_and() -> scalar32_min_max_and() -> scalar_min_max_and() -> reg_bounds_sync(), our patch produces the following bounds for s32: s32_min_value: -1263875629 s32_max_value: -159911942 Whereas, setting the signed bounds to unbounded in scalar_min_max_and() produces: s32_min_value: -1263875629 s32_max_value: -1 As observed, our patch produces a tighter s32 bound. We also confirmed using Agni and SMT verification that our patch always produces signed bounds that are equal to or more precise than setting the signed bounds to unbounded in scalar_min_max_and(). [1] https://sanjit-bhat.github.io/assets/pdf/ebpf-verifier-range-analysis22.pdf [2] https://link.springer.com/chapter/10.1007/978-3-031-37709-9_12 [3] https://github.com/bpfverif/agni Co-developed-by: Matan Shachnai Signed-off-by: Matan Shachnai Co-developed-by: Srinivas Narayana Signed-off-by: Srinivas Narayana Co-developed-by: Santosh Nagarakatte Signed-off-by: Santosh Nagarakatte Signed-off-by: Harishankar Vishwanathan Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240402212039.51815-1-harishankar.vishwanathan@gmail.com Link: https://lore.kernel.org/bpf/20240416115303.331688-1-harishankar.vishwanathan@gmail.com --- kernel/bpf/verifier.c | 94 ++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 54 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2aad6d90550f..68cfd6fc6ad4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13320,7 +13320,6 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); - s32 smin_val = src_reg->s32_min_value; u32 umax_val = src_reg->u32_max_value; if (src_known && dst_known) { @@ -13333,18 +13332,16 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, */ dst_reg->u32_min_value = var32_off.value; dst_reg->u32_max_value = min(dst_reg->u32_max_value, umax_val); - if (dst_reg->s32_min_value < 0 || smin_val < 0) { - /* Lose signed bounds when ANDing negative numbers, - * ain't nobody got time for that. - */ - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; - } else { - /* ANDing two positives gives a positive, so safe to - * cast result into s64. - */ + + /* Safe to set s32 bounds by casting u32 result into s32 when u32 + * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. + */ + if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { dst_reg->s32_min_value = dst_reg->u32_min_value; dst_reg->s32_max_value = dst_reg->u32_max_value; + } else { + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; } } @@ -13353,7 +13350,6 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg, { bool src_known = tnum_is_const(src_reg->var_off); bool dst_known = tnum_is_const(dst_reg->var_off); - s64 smin_val = src_reg->smin_value; u64 umax_val = src_reg->umax_value; if (src_known && dst_known) { @@ -13366,18 +13362,16 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg, */ dst_reg->umin_value = dst_reg->var_off.value; dst_reg->umax_value = min(dst_reg->umax_value, umax_val); - if (dst_reg->smin_value < 0 || smin_val < 0) { - /* Lose signed bounds when ANDing negative numbers, - * ain't nobody got time for that. - */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } else { - /* ANDing two positives gives a positive, so safe to - * cast result into s64. - */ + + /* Safe to set s64 bounds by casting u64 result into s64 when u64 + * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. + */ + if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { dst_reg->smin_value = dst_reg->umin_value; dst_reg->smax_value = dst_reg->umax_value; + } else { + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; } /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); @@ -13389,7 +13383,6 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); - s32 smin_val = src_reg->s32_min_value; u32 umin_val = src_reg->u32_min_value; if (src_known && dst_known) { @@ -13402,18 +13395,16 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, */ dst_reg->u32_min_value = max(dst_reg->u32_min_value, umin_val); dst_reg->u32_max_value = var32_off.value | var32_off.mask; - if (dst_reg->s32_min_value < 0 || smin_val < 0) { - /* Lose signed bounds when ORing negative numbers, - * ain't nobody got time for that. - */ - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; - } else { - /* ORing two positives gives a positive, so safe to - * cast result into s64. - */ + + /* Safe to set s32 bounds by casting u32 result into s32 when u32 + * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. + */ + if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { dst_reg->s32_min_value = dst_reg->u32_min_value; dst_reg->s32_max_value = dst_reg->u32_max_value; + } else { + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; } } @@ -13422,7 +13413,6 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg, { bool src_known = tnum_is_const(src_reg->var_off); bool dst_known = tnum_is_const(dst_reg->var_off); - s64 smin_val = src_reg->smin_value; u64 umin_val = src_reg->umin_value; if (src_known && dst_known) { @@ -13435,18 +13425,16 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg, */ dst_reg->umin_value = max(dst_reg->umin_value, umin_val); dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask; - if (dst_reg->smin_value < 0 || smin_val < 0) { - /* Lose signed bounds when ORing negative numbers, - * ain't nobody got time for that. - */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } else { - /* ORing two positives gives a positive, so safe to - * cast result into s64. - */ + + /* Safe to set s64 bounds by casting u64 result into s64 when u64 + * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. + */ + if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { dst_reg->smin_value = dst_reg->umin_value; dst_reg->smax_value = dst_reg->umax_value; + } else { + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; } /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); @@ -13458,7 +13446,6 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg, bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); - s32 smin_val = src_reg->s32_min_value; if (src_known && dst_known) { __mark_reg32_known(dst_reg, var32_off.value); @@ -13469,10 +13456,10 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg, dst_reg->u32_min_value = var32_off.value; dst_reg->u32_max_value = var32_off.value | var32_off.mask; - if (dst_reg->s32_min_value >= 0 && smin_val >= 0) { - /* XORing two positive sign numbers gives a positive, - * so safe to cast u32 result into s32. - */ + /* Safe to set s32 bounds by casting u32 result into s32 when u32 + * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. + */ + if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { dst_reg->s32_min_value = dst_reg->u32_min_value; dst_reg->s32_max_value = dst_reg->u32_max_value; } else { @@ -13486,7 +13473,6 @@ static void scalar_min_max_xor(struct bpf_reg_state *dst_reg, { bool src_known = tnum_is_const(src_reg->var_off); bool dst_known = tnum_is_const(dst_reg->var_off); - s64 smin_val = src_reg->smin_value; if (src_known && dst_known) { /* dst_reg->var_off.value has been updated earlier */ @@ -13498,10 +13484,10 @@ static void scalar_min_max_xor(struct bpf_reg_state *dst_reg, dst_reg->umin_value = dst_reg->var_off.value; dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask; - if (dst_reg->smin_value >= 0 && smin_val >= 0) { - /* XORing two positive sign numbers gives a positive, - * so safe to cast u64 result into s64. - */ + /* Safe to set s64 bounds by casting u64 result into s64 when u64 + * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. + */ + if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { dst_reg->smin_value = dst_reg->umin_value; dst_reg->smax_value = dst_reg->umax_value; } else { From 986e7663f98ec7441d9d948263ec0dda71e7479f Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Sat, 13 Apr 2024 02:14:26 +0100 Subject: [PATCH 091/147] bpftool: Update documentation where progs/maps can be passed by name When using references to BPF programs, bpftool supports passing programs by name on the command line. The manual pages for "bpftool prog" and "bpftool map" (for prog_array updates) mention it, but we have a few additional subcommands that support referencing programs by name but do not mention it in their documentation. Let's update the pages for subcommands "btf", "cgroup", and "net". Similarly, we can reference maps by name when passing them to "bpftool prog load", so we update the page for "bpftool prog" as well. Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240413011427.14402-2-qmo@kernel.org --- tools/bpf/bpftool/Documentation/bpftool-btf.rst | 2 +- tools/bpf/bpftool/Documentation/bpftool-cgroup.rst | 2 +- tools/bpf/bpftool/Documentation/bpftool-net.rst | 2 +- tools/bpf/bpftool/Documentation/bpftool-prog.rst | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst index f66781f20af2..eaba24320fb2 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst @@ -30,7 +30,7 @@ BTF COMMANDS | *BTF_SRC* := { **id** *BTF_ID* | **prog** *PROG* | **map** *MAP* [{**key** | **value** | **kv** | **all**}] | **file** *FILE* } | *FORMAT* := { **raw** | **c** } | *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } DESCRIPTION =========== diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst index b2610d169e60..e8185596a759 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst @@ -30,7 +30,7 @@ CGROUP COMMANDS | **bpftool** **cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* | **bpftool** **cgroup help** | -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } | *ATTACH_TYPE* := { **cgroup_inet_ingress** | **cgroup_inet_egress** | | **cgroup_inet_sock_create** | **cgroup_sock_ops** | | **cgroup_device** | **cgroup_inet4_bind** | **cgroup_inet6_bind** | diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index f8e65869f8b4..348812881297 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -28,7 +28,7 @@ NET COMMANDS | **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME* | **bpftool** **net help** | -| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } | *ATTACH_TYPE* := { **xdp** | **xdpgeneric** | **xdpdrv** | **xdpoffload** } DESCRIPTION diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 8e730cfb2589..d6304e01afe0 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -39,7 +39,7 @@ PROG COMMANDS | **bpftool** **prog profile** *PROG* [**duration** *DURATION*] *METRICs* | **bpftool** **prog help** | -| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* | **name** *MAP_NAME* } | *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } | *TYPE* := { | **socket** | **kprobe** | **kretprobe** | **classifier** | **action** | From ad2d22b617b7c0ca2cff4da6dc063183822484bb Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Sat, 13 Apr 2024 02:14:27 +0100 Subject: [PATCH 092/147] bpftool: Address minor issues in bash completion This commit contains a series of clean-ups and fixes for bpftool's bash completion file: - Make sure all local variables are declared as such. - Make sure variables are initialised before being read. - Update ELF section ("maps" -> ".maps") for looking up map names in object files. - Fix call to _init_completion. - Move definition for MAP_TYPE and PROG_TYPE higher up in the scope to avoid defining them multiple times, reuse MAP_TYPE where relevant. - Simplify completion for "duration" keyword in "bpftool prog profile". - Fix completion for "bpftool struct_ops register" and "bpftool link (pin|detach)" where we would repeatedly suggest file names instead of suggesting just one name. - Fix completion for "bpftool iter pin ... map MAP" to account for the "map" keyword. - Add missing "detach" suggestion for "bpftool link". Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240413011427.14402-3-qmo@kernel.org --- tools/bpf/bpftool/bash-completion/bpftool | 61 ++++++++++------------- 1 file changed, 25 insertions(+), 36 deletions(-) diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 6e4f7ce6bc01..04afe2ac2228 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -106,19 +106,19 @@ _bpftool_get_link_ids() _bpftool_get_obj_map_names() { - local obj + local obj maps obj=$1 - maps=$(objdump -j maps -t $obj 2>/dev/null | \ - command awk '/g . maps/ {print $NF}') + maps=$(objdump -j .maps -t $obj 2>/dev/null | \ + command awk '/g . .maps/ {print $NF}') COMPREPLY+=( $( compgen -W "$maps" -- "$cur" ) ) } _bpftool_get_obj_map_idxs() { - local obj + local obj nmaps obj=$1 @@ -136,7 +136,7 @@ _sysfs_get_netdevs() # Retrieve type of the map that we are operating on. _bpftool_map_guess_map_type() { - local keyword ref + local keyword idx ref="" for (( idx=3; idx < ${#words[@]}-1; idx++ )); do case "${words[$((idx-2))]}" in lookup|update) @@ -255,8 +255,9 @@ _bpftool_map_update_get_name() _bpftool() { - local cur prev words objword json=0 - _init_completion || return + local cur prev words cword comp_args + local json=0 + _init_completion -- "$@" || return # Deal with options if [[ ${words[cword]} == -* ]]; then @@ -293,7 +294,7 @@ _bpftool() esac # Remove all options so completions don't have to deal with them. - local i + local i pprev for (( i=1; i < ${#words[@]}; )); do if [[ ${words[i]::1} == - ]] && [[ ${words[i]} != "-B" ]] && [[ ${words[i]} != "--base-btf" ]]; then @@ -307,7 +308,7 @@ _bpftool() prev=${words[cword - 1]} pprev=${words[cword - 2]} - local object=${words[1]} command=${words[2]} + local object=${words[1]} if [[ -z $object || $cword -eq 1 ]]; then case $cur in @@ -324,8 +325,12 @@ _bpftool() esac fi + local command=${words[2]} [[ $command == help ]] && return 0 + local MAP_TYPE='id pinned name' + local PROG_TYPE='id pinned tag name' + # Completion depends on object and command in use case $object in prog) @@ -346,8 +351,6 @@ _bpftool() ;; esac - local PROG_TYPE='id pinned tag name' - local MAP_TYPE='id pinned name' local METRIC_TYPE='cycles instructions l1d_loads llc_misses \ itlb_misses dtlb_misses' case $command in @@ -457,7 +460,7 @@ _bpftool() obj=${words[3]} if [[ ${words[-4]} == "map" ]]; then - COMPREPLY=( $( compgen -W "id pinned" -- "$cur" ) ) + COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) ) return 0 fi if [[ ${words[-3]} == "map" ]]; then @@ -541,20 +544,9 @@ _bpftool() COMPREPLY=( $( compgen -W "$METRIC_TYPE duration" -- "$cur" ) ) return 0 ;; - 6) - case $prev in - duration) - return 0 - ;; - *) - COMPREPLY=( $( compgen -W "$METRIC_TYPE" -- "$cur" ) ) - return 0 - ;; - esac - return 0 - ;; *) - COMPREPLY=( $( compgen -W "$METRIC_TYPE" -- "$cur" ) ) + [[ $prev == duration ]] && return 0 + _bpftool_once_attr "$METRIC_TYPE" return 0 ;; esac @@ -612,7 +604,7 @@ _bpftool() return 0 ;; register) - _filedir + [[ $prev == $command ]] && _filedir return 0 ;; *) @@ -638,9 +630,12 @@ _bpftool() pinned) _filedir ;; - *) + map) _bpftool_one_of_list $MAP_TYPE ;; + *) + _bpftool_once_attr 'map' + ;; esac return 0 ;; @@ -652,7 +647,6 @@ _bpftool() esac ;; map) - local MAP_TYPE='id pinned name' case $command in show|list|dump|peek|pop|dequeue|freeze) case $prev in @@ -793,13 +787,11 @@ _bpftool() # map, depending on the type of the map to update. case "$(_bpftool_map_guess_map_type)" in array_of_maps|hash_of_maps) - local MAP_TYPE='id pinned name' COMPREPLY+=( $( compgen -W "$MAP_TYPE" \ -- "$cur" ) ) return 0 ;; prog_array) - local PROG_TYPE='id pinned tag name' COMPREPLY+=( $( compgen -W "$PROG_TYPE" \ -- "$cur" ) ) return 0 @@ -821,7 +813,7 @@ _bpftool() esac _bpftool_once_attr 'key' - local UPDATE_FLAGS='any exist noexist' + local UPDATE_FLAGS='any exist noexist' idx for (( idx=3; idx < ${#words[@]}-1; idx++ )); do if [[ ${words[idx]} == 'value' ]]; then # 'value' is present, but is not the last @@ -893,7 +885,6 @@ _bpftool() esac ;; btf) - local PROG_TYPE='id pinned tag name' local MAP_TYPE='id pinned name' case $command in dump) @@ -1033,7 +1024,6 @@ _bpftool() local BPFTOOL_CGROUP_ATTACH_TYPES="$(bpftool feature list_builtins attach_types 2>/dev/null | \ grep '^cgroup_')" local ATTACH_FLAGS='multi override' - local PROG_TYPE='id pinned tag name' # Check for $prev = $command first if [ $prev = $command ]; then _filedir @@ -1086,7 +1076,6 @@ _bpftool() esac ;; net) - local PROG_TYPE='id pinned tag name' local ATTACH_TYPES='xdp xdpgeneric xdpdrv xdpoffload' case $command in show|list) @@ -1193,14 +1182,14 @@ _bpftool() pin|detach) if [[ $prev == "$command" ]]; then COMPREPLY=( $( compgen -W "$LINK_TYPE" -- "$cur" ) ) - else + elif [[ $pprev == "$command" ]]; then _filedir fi return 0 ;; *) [[ $prev == $object ]] && \ - COMPREPLY=( $( compgen -W 'help pin show list' -- "$cur" ) ) + COMPREPLY=( $( compgen -W 'help pin detach show list' -- "$cur" ) ) ;; esac ;; From 9213e52970a5997c9eb176c7afcc6ec67b1b1e6f Mon Sep 17 00:00:00 2001 From: Quentin Deslandes Date: Sat, 13 Apr 2024 23:12:57 +0200 Subject: [PATCH 093/147] libbpf: Fix misaligned array closing bracket In btf_dump_array_data(), libbpf will call btf_dump_dump_type_data() for each element. For an array of characters, each element will be processed the following way: - btf_dump_dump_type_data() is called to print the character - btf_dump_data_pfx() prefixes the current line with the proper number of indentations - btf_dump_int_data() is called to print the character - After the last character is printed, btf_dump_dump_type_data() calls btf_dump_data_pfx() before writing the closing bracket However, for an array containing characters, btf_dump_int_data() won't print any '\0' and subsequent characters. This leads to situations where the line prefix is written, no character is added, then the prefix is written again before adding the closing bracket: (struct sk_metadata){ .str_array = (__u8[14])[ 'H', 'e', 'l', 'l', 'o', ], This change solves this issue by printing the '\0' character, which has two benefits: - The bracket closing the array is properly aligned - It's clear from a user point of view that libbpf uses '\0' as a terminator for arrays of characters. Signed-off-by: Quentin Deslandes Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240413211258.134421-2-qde@naccy.de --- tools/lib/bpf/btf_dump.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 4d9f30bf7f01..6a37e8517435 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -1929,6 +1929,7 @@ static int btf_dump_int_data(struct btf_dump *d, if (d->typed_dump->is_array_terminated) break; if (*(char *)data == '\0') { + btf_dump_type_values(d, "'\\0'"); d->typed_dump->is_array_terminated = true; break; } From e739e01d8df8bf26dbc9dcaeaee3c8c55e8ffa71 Mon Sep 17 00:00:00 2001 From: Quentin Deslandes Date: Sat, 13 Apr 2024 23:12:58 +0200 Subject: [PATCH 094/147] libbpf: Fix dump of subsequent char arrays When dumping a character array, libbpf will watch for a '\0' and set is_array_terminated=true if found. This prevents libbpf from printing the remaining characters of the array, treating it as a nul-terminated string. However, once this flag is set, it's never reset, leading to subsequent characters array not being printed properly: .str_multi = (__u8[2][16])[ [ 'H', 'e', 'l', ], ], This patch saves the is_array_terminated flag and restores its default (false) value before looping over the elements of an array, then restores it afterward. This way, libbpf's behavior is unchanged when dumping the characters of an array, but subsequent arrays are printed properly: .str_multi = (__u8[2][16])[ [ 'H', 'e', 'l', ], [ 'l', 'o', ], ], Signed-off-by: Quentin Deslandes Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240413211258.134421-3-qde@naccy.de --- tools/lib/bpf/btf_dump.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 6a37e8517435..5dbca76b953f 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -2032,6 +2032,7 @@ static int btf_dump_array_data(struct btf_dump *d, __u32 i, elem_type_id; __s64 elem_size; bool is_array_member; + bool is_array_terminated; elem_type_id = array->type; elem_type = skip_mods_and_typedefs(d->btf, elem_type_id, NULL); @@ -2067,12 +2068,15 @@ static int btf_dump_array_data(struct btf_dump *d, */ is_array_member = d->typed_dump->is_array_member; d->typed_dump->is_array_member = true; + is_array_terminated = d->typed_dump->is_array_terminated; + d->typed_dump->is_array_terminated = false; for (i = 0; i < array->nelems; i++, data += elem_size) { if (d->typed_dump->is_array_terminated) break; btf_dump_dump_type_data(d, NULL, elem_type, elem_type_id, data, 0, 0); } d->typed_dump->is_array_member = is_array_member; + d->typed_dump->is_array_terminated = is_array_terminated; d->typed_dump->depth--; btf_dump_data_pfx(d); btf_dump_type_values(d, "]"); From 462e5e2a5938d0241ad146d21dd0da1be8e7eaf0 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 17 Apr 2024 14:44:06 -0700 Subject: [PATCH 095/147] bpf: Fix JIT of is_mov_percpu_addr instruction. The codegen for is_mov_percpu_addr instruction works for rax/r8 registers only. Fix it to generate proper x86 byte code for other registers. Fixes: 7bdbf7446305 ("bpf: add special internal-only MOV instruction to resolve per-CPU addrs") Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240417214406.15788-1-alexei.starovoitov@gmail.com --- arch/x86/net/bpf_jit_comp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 2b5a475c4dd0..673fdbd765d7 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1439,7 +1439,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image #ifdef CONFIG_SMP /* add , gs:[] */ EMIT2(0x65, add_1mod(0x48, dst_reg)); - EMIT3(0x03, add_1reg(0x04, dst_reg), 0x25); + EMIT3(0x03, add_2reg(0x04, 0, dst_reg), 0x25); EMIT((u32)(unsigned long)&this_cpu_off, 4); #endif break; From 9c598a83b7eab7b05002911fd25b0be7b996ce6d Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 18 Apr 2024 16:09:07 +0800 Subject: [PATCH 096/147] selftests/bpf: Add start_server_addr helper In order to pair up with connect_to_addr(), this patch adds a new helper start_server_addr(), which is a wrapper of __start_server(). It accepts an argument 'addr' of 'struct sockaddr_storage' type instead of a string type argument like start_server(), and a network_helper_opts argument as the last one. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/2f01d48fa026467926738debe554ac452c19b86f.1713427236.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/network_helpers.c | 14 ++++++++++++-- tools/testing/selftests/bpf/network_helpers.h | 2 ++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index dc1fd7af9c7a..28fe8367451b 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -52,6 +52,8 @@ struct ipv6_packet pkt_v6 = { .tcp.doff = 5, }; +static const struct network_helper_opts default_opts; + int settimeo(int fd, int timeout_ms) { struct timeval timeout = { .tv_sec = 3 }; @@ -185,6 +187,16 @@ close_fds: return NULL; } +int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t len, + const struct network_helper_opts *opts) +{ + if (!opts) + opts = &default_opts; + + return __start_server(type, 0, (struct sockaddr *)addr, len, + opts->timeout_ms, 0); +} + void free_fds(int *fds, unsigned int nr_close_fds) { if (fds) { @@ -278,8 +290,6 @@ error_close: return -1; } -static const struct network_helper_opts default_opts; - int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts) { struct sockaddr_storage addr; diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 70f4e4c92733..414ea50bb3fc 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -53,6 +53,8 @@ int start_mptcp_server(int family, const char *addr, __u16 port, int *start_reuseport_server(int family, int type, const char *addr_str, __u16 port, int timeout_ms, unsigned int nr_listens); +int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t len, + const struct network_helper_opts *opts); void free_fds(int *fds, unsigned int nr_close_fds); int connect_to_addr(const struct sockaddr_storage *addr, socklen_t len, int type); int connect_to_fd(int server_fd, int timeout_ms); From 9851382fb3697efb2f4234d70596bd845b3af535 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 18 Apr 2024 16:09:08 +0800 Subject: [PATCH 097/147] selftests/bpf: Use start_server_addr in cls_redirect Include network_helpers.h in prog_tests/cls_redirect.c, use the newly added public helper start_server_addr() instead of the local defined function start_server(). This can avoid duplicate code. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/13f336cb4c6680175d50bb963d9532e11528c758.1713427236.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/cls_redirect.c | 20 ++----------------- 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c index 2a55f717fc07..68cb93106658 100644 --- a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c @@ -10,6 +10,7 @@ #include #include +#include "network_helpers.h" #include "progs/test_cls_redirect.h" #include "test_cls_redirect.skel.h" @@ -35,23 +36,6 @@ struct tuple { struct addr_port dst; }; -static int start_server(const struct sockaddr *addr, socklen_t len, int type) -{ - int fd = socket(addr->sa_family, type, 0); - if (CHECK_FAIL(fd == -1)) - return -1; - if (CHECK_FAIL(bind(fd, addr, len) == -1)) - goto err; - if (type == SOCK_STREAM && CHECK_FAIL(listen(fd, 128) == -1)) - goto err; - - return fd; - -err: - close(fd); - return -1; -} - static int connect_to_server(const struct sockaddr *addr, socklen_t len, int type) { @@ -98,7 +82,7 @@ static bool set_up_conn(const struct sockaddr *addr, socklen_t len, int type, socklen_t slen = sizeof(ss); struct sockaddr *sa = (struct sockaddr *)&ss; - *server = start_server(addr, len, type); + *server = start_server_addr(type, (struct sockaddr_storage *)addr, len, NULL); if (*server < 0) return false; From a2e4979536c440838794ee292955ce9ed55ad181 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 18 Apr 2024 16:09:09 +0800 Subject: [PATCH 098/147] selftests/bpf: Use start_server_addr in sk_assign Include network_helpers.h in prog_tests/sk_assign.c, use the newly added public helper start_server_addr() instead of the local defined function start_server(). This can avoid duplicate code. The code that sets SO_RCVTIMEO timeout as timeo_sec (3s) can be dropped, since start_server_addr() sets default timeout as 3s. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/2af706ffbad63b4f7eaf93a426ed1076eadf1a05.1713427236.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/sk_assign.c | 29 +++---------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c index 1374b626a985..b066b6b88d7c 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c @@ -15,6 +15,7 @@ #include #include "test_progs.h" +#include "network_helpers.h" #define BIND_PORT 1234 #define CONNECT_PORT 4321 @@ -73,30 +74,6 @@ configure_stack(void) return true; } -static int -start_server(const struct sockaddr *addr, socklen_t len, int type) -{ - int fd; - - fd = socket(addr->sa_family, type, 0); - if (CHECK_FAIL(fd == -1)) - goto out; - if (CHECK_FAIL(setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec, - timeo_optlen))) - goto close_out; - if (CHECK_FAIL(bind(fd, addr, len) == -1)) - goto close_out; - if (type == SOCK_STREAM && CHECK_FAIL(listen(fd, 128) == -1)) - goto close_out; - - goto out; -close_out: - close(fd); - fd = -1; -out: - return fd; -} - static int connect_to_server(const struct sockaddr *addr, socklen_t len, int type) { @@ -310,7 +287,9 @@ void test_sk_assign(void) continue; prepare_addr(test->addr, test->family, BIND_PORT, false); addr = (const struct sockaddr *)test->addr; - server = start_server(addr, test->len, test->type); + server = start_server_addr(test->type, + (const struct sockaddr_storage *)addr, + test->len, NULL); if (server == -1) goto close; From db9994d022ecd42006354609f6e4e3f57e5a4b03 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 18 Apr 2024 16:09:10 +0800 Subject: [PATCH 099/147] selftests/bpf: Update arguments of connect_to_addr Move the third argument "int type" of connect_to_addr() to the first one which is closer to how the socket syscall is doing it. And add a network_helper_opts argument as the fourth one. Then change its usages in sock_addr.c too. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/088ea8a95055f93409c5f57d12f0e58d43059ac4.1713427236.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/network_helpers.c | 13 ++++++++++--- tools/testing/selftests/bpf/network_helpers.h | 3 ++- tools/testing/selftests/bpf/prog_tests/sock_addr.c | 6 +++--- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 28fe8367451b..9d63d2ac13d8 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -270,17 +270,24 @@ static int connect_fd_to_addr(int fd, return 0; } -int connect_to_addr(const struct sockaddr_storage *addr, socklen_t addrlen, int type) +int connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t addrlen, + const struct network_helper_opts *opts) { int fd; - fd = socket(addr->ss_family, type, 0); + if (!opts) + opts = &default_opts; + + fd = socket(addr->ss_family, type, opts->proto); if (fd < 0) { log_err("Failed to create client socket"); return -1; } - if (connect_fd_to_addr(fd, addr, addrlen, false)) + if (settimeo(fd, opts->timeout_ms)) + goto error_close; + + if (connect_fd_to_addr(fd, addr, addrlen, opts->must_fail)) goto error_close; return fd; diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 414ea50bb3fc..aef297dfa6ca 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -56,7 +56,8 @@ int *start_reuseport_server(int family, int type, const char *addr_str, int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t len, const struct network_helper_opts *opts); void free_fds(int *fds, unsigned int nr_close_fds); -int connect_to_addr(const struct sockaddr_storage *addr, socklen_t len, int type); +int connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t len, + const struct network_helper_opts *opts); int connect_to_fd(int server_fd, int timeout_ms); int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts); int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms); diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index 5fd617718991..61668e0f11b0 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -328,7 +328,7 @@ static void test_bind(struct sock_addr_test *test) goto cleanup; /* Try to connect to server just in case */ - client = connect_to_addr(&expected_addr, expected_addr_len, test->socket_type); + client = connect_to_addr(test->socket_type, &expected_addr, expected_addr_len, NULL); if (!ASSERT_GE(client, 0, "connect_to_addr")) goto cleanup; @@ -357,7 +357,7 @@ static void test_connect(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - client = connect_to_addr(&addr, addr_len, test->socket_type); + client = connect_to_addr(test->socket_type, &addr, addr_len, NULL); if (!ASSERT_GE(client, 0, "connect_to_addr")) goto cleanup; @@ -538,7 +538,7 @@ static void test_getpeername(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - client = connect_to_addr(&addr, addr_len, test->socket_type); + client = connect_to_addr(test->socket_type, &addr, addr_len, NULL); if (!ASSERT_GE(client, 0, "connect_to_addr")) goto cleanup; From 805b4d90c0df79a88907623b2528954a0125313d Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 18 Apr 2024 16:09:11 +0800 Subject: [PATCH 100/147] selftests/bpf: Use connect_to_addr in cls_redirect This patch uses public helper connect_to_addr() exported in network_helpers.h instead of the local defined function connect_to_server() in prog_tests/cls_redirect.c. This can avoid duplicate code. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/4a03ac92d2d392f8721f398fa449a83ac75577bc.1713427236.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/cls_redirect.c | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c index 68cb93106658..34b59f6baca1 100644 --- a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c @@ -36,22 +36,6 @@ struct tuple { struct addr_port dst; }; -static int connect_to_server(const struct sockaddr *addr, socklen_t len, - int type) -{ - int fd = socket(addr->sa_family, type, 0); - if (CHECK_FAIL(fd == -1)) - return -1; - if (CHECK_FAIL(connect(fd, addr, len))) - goto err; - - return fd; - -err: - close(fd); - return -1; -} - static bool fill_addr_port(const struct sockaddr *sa, struct addr_port *ap) { const struct sockaddr_in6 *in6; @@ -89,7 +73,7 @@ static bool set_up_conn(const struct sockaddr *addr, socklen_t len, int type, if (CHECK_FAIL(getsockname(*server, sa, &slen))) goto close_server; - *conn = connect_to_server(sa, slen, type); + *conn = connect_to_addr(type, (struct sockaddr_storage *)sa, slen, NULL); if (*conn < 0) goto close_server; From 63a51820d29bdfcf52abed2db2d32b4f6843988e Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 18 Apr 2024 16:09:12 +0800 Subject: [PATCH 101/147] selftests/bpf: Use connect_to_addr in sk_assign This patch uses public helper connect_to_addr() exported in network_helpers.h instead of the local defined function connect_to_server() in prog_tests/sk_assign.c. This can avoid duplicate code. The code that sets SO_SNDTIMEO timeout as timeo_sec (3s) can be dropped, since connect_to_addr() sets default timeout as 3s. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/98fdd384872bda10b2adb052e900a2212c9047b9.1713427236.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/sk_assign.c | 26 +------------------ 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c index b066b6b88d7c..0b9bd1d6f7cc 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c @@ -23,8 +23,6 @@ #define NS_SELF "/proc/self/ns/net" #define SERVER_MAP_PATH "/sys/fs/bpf/tc/globals/server_map" -static const struct timeval timeo_sec = { .tv_sec = 3 }; -static const size_t timeo_optlen = sizeof(timeo_sec); static int stop, duration; static bool @@ -74,28 +72,6 @@ configure_stack(void) return true; } -static int -connect_to_server(const struct sockaddr *addr, socklen_t len, int type) -{ - int fd = -1; - - fd = socket(addr->sa_family, type, 0); - if (CHECK_FAIL(fd == -1)) - goto out; - if (CHECK_FAIL(setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &timeo_sec, - timeo_optlen))) - goto close_out; - if (CHECK_FAIL(connect(fd, addr, len))) - goto close_out; - - goto out; -close_out: - close(fd); - fd = -1; -out: - return fd; -} - static in_port_t get_port(int fd) { @@ -138,7 +114,7 @@ run_test(int server_fd, const struct sockaddr *addr, socklen_t len, int type) in_port_t port; int ret = 1; - client = connect_to_server(addr, len, type); + client = connect_to_addr(type, (struct sockaddr_storage *)addr, len, NULL); if (client == -1) { perror("Cannot connect to server"); goto out; From db50040d09cc511889b2c949a0c93d017e44f081 Mon Sep 17 00:00:00 2001 From: Dave Thaler Date: Fri, 19 Apr 2024 13:36:17 -0700 Subject: [PATCH 102/147] bpf, docs: Clarify helper ID and pointer terms in instruction-set.rst Per IETF 119 meeting discussion and mailing list discussion at https://mailarchive.ietf.org/arch/msg/bpf/2JwWQwFdOeMGv0VTbD0CKWwAOEA/ the following changes are made. First, say call by "static ID" rather than call by "address" Second, change "pointer" to "address" Signed-off-by: Dave Thaler Acked-by: David Vernet Link: https://lore.kernel.org/r/20240419203617.6850-1-dthaler1968@gmail.com Signed-off-by: Alexei Starovoitov --- .../bpf/standardization/instruction-set.rst | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/Documentation/bpf/standardization/instruction-set.rst b/Documentation/bpf/standardization/instruction-set.rst index 8d0781f0bd17..1f92551a34b9 100644 --- a/Documentation/bpf/standardization/instruction-set.rst +++ b/Documentation/bpf/standardization/instruction-set.rst @@ -443,27 +443,27 @@ otherwise identical operations, and indicates the base64 conformance group unless otherwise specified. The 'code' field encodes the operation as below: -======== ===== ======= =============================== =================================================== -code value src_reg description notes -======== ===== ======= =============================== =================================================== -JA 0x0 0x0 PC += offset {JA, K, JMP} only -JA 0x0 0x0 PC += imm {JA, K, JMP32} only +======== ===== ======= ================================= =================================================== +code value src_reg description notes +======== ===== ======= ================================= =================================================== +JA 0x0 0x0 PC += offset {JA, K, JMP} only +JA 0x0 0x0 PC += imm {JA, K, JMP32} only JEQ 0x1 any PC += offset if dst == src -JGT 0x2 any PC += offset if dst > src unsigned -JGE 0x3 any PC += offset if dst >= src unsigned +JGT 0x2 any PC += offset if dst > src unsigned +JGE 0x3 any PC += offset if dst >= src unsigned JSET 0x4 any PC += offset if dst & src JNE 0x5 any PC += offset if dst != src -JSGT 0x6 any PC += offset if dst > src signed -JSGE 0x7 any PC += offset if dst >= src signed -CALL 0x8 0x0 call helper function by address {CALL, K, JMP} only, see `Helper functions`_ -CALL 0x8 0x1 call PC += imm {CALL, K, JMP} only, see `Program-local functions`_ -CALL 0x8 0x2 call helper function by BTF ID {CALL, K, JMP} only, see `Helper functions`_ -EXIT 0x9 0x0 return {CALL, K, JMP} only -JLT 0xa any PC += offset if dst < src unsigned -JLE 0xb any PC += offset if dst <= src unsigned -JSLT 0xc any PC += offset if dst < src signed -JSLE 0xd any PC += offset if dst <= src signed -======== ===== ======= =============================== =================================================== +JSGT 0x6 any PC += offset if dst > src signed +JSGE 0x7 any PC += offset if dst >= src signed +CALL 0x8 0x0 call helper function by static ID {CALL, K, JMP} only, see `Helper functions`_ +CALL 0x8 0x1 call PC += imm {CALL, K, JMP} only, see `Program-local functions`_ +CALL 0x8 0x2 call helper function by BTF ID {CALL, K, JMP} only, see `Helper functions`_ +EXIT 0x9 0x0 return {CALL, K, JMP} only +JLT 0xa any PC += offset if dst < src unsigned +JLE 0xb any PC += offset if dst <= src unsigned +JSLT 0xc any PC += offset if dst < src signed +JSLE 0xd any PC += offset if dst <= src signed +======== ===== ======= ================================= =================================================== The BPF program needs to store the return value into register R0 before doing an ``EXIT``. @@ -498,9 +498,9 @@ Helper functions Helper functions are a concept whereby BPF programs can call into a set of function calls exposed by the underlying platform. -Historically, each helper function was identified by an address +Historically, each helper function was identified by a static ID encoded in the 'imm' field. The available helper functions may differ -for each program type, but address values are unique across all program types. +for each program type, but static IDs are unique across all program types. Platforms that support the BPF Type Format (BTF) support identifying a helper function by a BTF ID encoded in the 'imm' field, where the BTF ID @@ -667,11 +667,11 @@ src_reg pseudocode imm type dst type ======= ========================================= =========== ============== 0x0 dst = (next_imm << 32) | imm integer integer 0x1 dst = map_by_fd(imm) map fd map -0x2 dst = map_val(map_by_fd(imm)) + next_imm map fd data pointer -0x3 dst = var_addr(imm) variable id data pointer -0x4 dst = code_addr(imm) integer code pointer +0x2 dst = map_val(map_by_fd(imm)) + next_imm map fd data address +0x3 dst = var_addr(imm) variable id data address +0x4 dst = code_addr(imm) integer code address 0x5 dst = map_by_idx(imm) map index map -0x6 dst = map_val(map_by_idx(imm)) + next_imm map index data pointer +0x6 dst = map_val(map_by_idx(imm)) + next_imm map index data address ======= ========================================= =========== ============== where From 735f5b8a7ccf383e50d76f7d1c25769eee474812 Mon Sep 17 00:00:00 2001 From: Dave Thaler Date: Fri, 19 Apr 2024 14:38:26 -0700 Subject: [PATCH 103/147] bpf, docs: Fix formatting nit in instruction-set.rst Other places that had pseudocode were prefixed with :: so as to appear in a literal block, but one place was inconsistent. This patch fixes that inconsistency. Signed-off-by: Dave Thaler Acked-by: David Vernet Link: https://lore.kernel.org/r/20240419213826.7301-1-dthaler1968@gmail.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/standardization/instruction-set.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/bpf/standardization/instruction-set.rst b/Documentation/bpf/standardization/instruction-set.rst index 1f92551a34b9..d03d90afbd7d 100644 --- a/Documentation/bpf/standardization/instruction-set.rst +++ b/Documentation/bpf/standardization/instruction-set.rst @@ -370,7 +370,7 @@ Note that there are varying definitions of the signed modulo operation when the dividend or divisor are negative, where implementations often vary by language such that Python, Ruby, etc. differ from C, Go, Java, etc. This specification requires that signed modulo use truncated division -(where -13 % 3 == -1) as implemented in C, Go, etc.: +(where -13 % 3 == -1) as implemented in C, Go, etc.:: a % n = a - n * trunc(a / n) From e1a7545981e2086feaa40dcb7b0db8de0bdada19 Mon Sep 17 00:00:00 2001 From: Rafael Passos Date: Wed, 17 Apr 2024 14:52:26 -0300 Subject: [PATCH 104/147] bpf: Fix typo in function save_aux_ptr_type I found this typo in the save_aux_ptr_type function. s/allow_trust_missmatch/allow_trust_mismatch/ I did not find this anywhere else in the codebase. Signed-off-by: Rafael Passos Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/fbe1d636-8172-4698-9a5a-5a3444b55322@smtp-relay.sendinblue.com --- kernel/bpf/verifier.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 68cfd6fc6ad4..7b2aa065e552 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6971,7 +6971,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, - bool allow_trust_missmatch); + bool allow_trust_mismatch); static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) { @@ -17530,7 +17530,7 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) } static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, - bool allow_trust_missmatch) + bool allow_trust_mismatch) { enum bpf_reg_type *prev_type = &env->insn_aux_data[env->insn_idx].ptr_type; @@ -17548,7 +17548,7 @@ static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type typ * src_reg == stack|map in some other branch. * Reject it. */ - if (allow_trust_missmatch && + if (allow_trust_mismatch && base_type(type) == PTR_TO_BTF_ID && base_type(*prev_type) == PTR_TO_BTF_ID) { /* From a7de265cb2d849f8986a197499ad58dca0a4f209 Mon Sep 17 00:00:00 2001 From: Rafael Passos Date: Wed, 17 Apr 2024 15:49:14 -0300 Subject: [PATCH 105/147] bpf: Fix typos in comments Found the following typos in comments, and fixed them: s/unpriviledged/unprivileged/ s/reponsible/responsible/ s/possiblities/possibilities/ s/Divison/Division/ s/precsion/precision/ s/havea/have a/ s/reponsible/responsible/ s/responsibile/responsible/ s/tigher/tighter/ s/respecitve/respective/ Signed-off-by: Rafael Passos Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/6af7deb4-bb24-49e8-b3f1-8dd410597337@smtp-relay.sendinblue.com --- kernel/bpf/bpf_local_storage.c | 2 +- kernel/bpf/core.c | 2 +- kernel/bpf/hashtab.c | 2 +- kernel/bpf/helpers.c | 2 +- kernel/bpf/verifier.c | 12 ++++++------ 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index bdea1a459153..976cb258a0ed 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -318,7 +318,7 @@ static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage, * * If the local_storage->list is already empty, the caller will not * care about the bpf_ma value also because the caller is not - * responsibile to free the local_storage. + * responsible to free the local_storage. */ if (storage_smap) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a41718eaeefe..95c7fd093e55 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2814,7 +2814,7 @@ void bpf_prog_free(struct bpf_prog *fp) } EXPORT_SYMBOL_GPL(bpf_prog_free); -/* RNG for unpriviledged user space with separated state from prandom_u32(). */ +/* RNG for unprivileged user space with separated state from prandom_u32(). */ static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state); void bpf_user_rnd_init_once(void) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 83a9a74260e9..c3e79a0b9361 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1539,7 +1539,7 @@ static void htab_map_free(struct bpf_map *map) */ /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it - * underneath and is reponsible for waiting for callbacks to finish + * underneath and is responsible for waiting for callbacks to finish * during bpf_mem_alloc_destroy(). */ if (!htab_is_prealloc(htab)) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 8cde717137bd..61126180d398 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2412,7 +2412,7 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 o /* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice. * * For skb-type dynptrs, it is safe to write into the returned pointer - * if the bpf program allows skb data writes. There are two possiblities + * if the bpf program allows skb data writes. There are two possibilities * that may occur when calling bpf_dynptr_slice_rdwr: * * 1) The requested slice is in the head of the skb. In this case, the diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7b2aa065e552..5a7e34e83a5b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -172,7 +172,7 @@ static bool bpf_global_percpu_ma_set; /* verifier_state + insn_idx are pushed to stack when branch is encountered */ struct bpf_verifier_stack_elem { - /* verifer state is 'st' + /* verifier state is 'st' * before processing instruction 'insn_idx' * and after processing instruction 'prev_insn_idx' */ @@ -2131,7 +2131,7 @@ static void __reg64_deduce_bounds(struct bpf_reg_state *reg) static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg) { /* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit - * values on both sides of 64-bit range in hope to have tigher range. + * values on both sides of 64-bit range in hope to have tighter range. * E.g., if r1 is [0x1'00000000, 0x3'80000000], and we learn from * 32-bit signed > 0 operation that s32 bounds are now [1; 0x7fffffff]. * With this, we can substitute 1 as low 32-bits of _low_ 64-bit bound @@ -2139,7 +2139,7 @@ static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg) * _high_ 64-bit bound (0x380000000 -> 0x37fffffff) and arrive at a * better overall bounds for r1 as [0x1'000000001; 0x3'7fffffff]. * We just need to make sure that derived bounds we are intersecting - * with are well-formed ranges in respecitve s64 or u64 domain, just + * with are well-formed ranges in respective s64 or u64 domain, just * like we do with similar kinds of 32-to-64 or 64-to-32 adjustments. */ __u64 new_umin, new_umax; @@ -14714,7 +14714,7 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state /* Adjusts the register min/max values in the case that the dst_reg and * src_reg are both SCALAR_VALUE registers (or we are simply doing a BPF_K - * check, in which case we havea fake SCALAR_VALUE representing insn->imm). + * check, in which case we have a fake SCALAR_VALUE representing insn->imm). * Technically we can do similar adjustments for pointers to the same object, * but we don't support that right now. */ @@ -17352,7 +17352,7 @@ hit: err = propagate_liveness(env, &sl->state, cur); /* if previous state reached the exit with precision and - * current state is equivalent to it (except precsion marks) + * current state is equivalent to it (except precision marks) * the precision needs to be propagated back in * the current state. */ @@ -20209,7 +20209,7 @@ patch_map_ops_generic: * divide-by-3 through multiplication, followed by further * division by 8 through 3-bit right shift. * Refer to book "Hacker's Delight, 2nd ed." by Henry S. Warren, Jr., - * p. 227, chapter "Unsigned Divison by 3" for details and proofs. + * p. 227, chapter "Unsigned Division by 3" for details and proofs. * * N / 3 <=> M * N / 2^33, where M = (2^33 + 1) / 3 = 0xaaaaaaab. */ From be2749beff62e0d63cf97fe63cabc79a68443139 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:01 +0200 Subject: [PATCH 106/147] bpf: make timer data struct more generic To be able to add workqueues and reuse most of the timer code, we need to make bpf_hrtimer more generic. There is no code change except that the new struct gets a new u64 flags attribute. We are still below 2 cache lines, so this shouldn't impact the current running codes. The ordering is also changed. Everything related to async callback is now on top of bpf_hrtimer. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-1-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 71 ++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 33 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 61126180d398..7c7f31dd87ba 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1079,11 +1079,20 @@ const struct bpf_func_proto bpf_snprintf_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +struct bpf_async_cb { + struct bpf_map *map; + struct bpf_prog *prog; + void __rcu *callback_fn; + void *value; + struct rcu_head rcu; + u64 flags; +}; + /* BPF map elements can contain 'struct bpf_timer'. * Such map owns all of its BPF timers. * 'struct bpf_timer' is allocated as part of map element allocation * and it's zero initialized. - * That space is used to keep 'struct bpf_timer_kern'. + * That space is used to keep 'struct bpf_async_kern'. * bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and * remembers 'struct bpf_map *' pointer it's part of. * bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn. @@ -1096,16 +1105,12 @@ const struct bpf_func_proto bpf_snprintf_proto = { * freeing the timers when inner map is replaced or deleted by user space. */ struct bpf_hrtimer { + struct bpf_async_cb cb; struct hrtimer timer; - struct bpf_map *map; - struct bpf_prog *prog; - void __rcu *callback_fn; - void *value; - struct rcu_head rcu; }; /* the actual struct hidden inside uapi struct bpf_timer */ -struct bpf_timer_kern { +struct bpf_async_kern { struct bpf_hrtimer *timer; /* bpf_spin_lock is used here instead of spinlock_t to make * sure that it always fits into space reserved by struct bpf_timer @@ -1119,14 +1124,14 @@ static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running); static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) { struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer); - struct bpf_map *map = t->map; - void *value = t->value; + struct bpf_map *map = t->cb.map; + void *value = t->cb.value; bpf_callback_t callback_fn; void *key; u32 idx; BTF_TYPE_EMIT(struct bpf_timer); - callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held()); + callback_fn = rcu_dereference_check(t->cb.callback_fn, rcu_read_lock_bh_held()); if (!callback_fn) goto out; @@ -1155,7 +1160,7 @@ out: return HRTIMER_NORESTART; } -BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map, +BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map, u64, flags) { clockid_t clockid = flags & (MAX_CLOCKS - 1); @@ -1163,8 +1168,8 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map int ret = 0; BUILD_BUG_ON(MAX_CLOCKS != 16); - BUILD_BUG_ON(sizeof(struct bpf_timer_kern) > sizeof(struct bpf_timer)); - BUILD_BUG_ON(__alignof__(struct bpf_timer_kern) != __alignof__(struct bpf_timer)); + BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_timer)); + BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_timer)); if (in_nmi()) return -EOPNOTSUPP; @@ -1187,10 +1192,10 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map ret = -ENOMEM; goto out; } - t->value = (void *)timer - map->record->timer_off; - t->map = map; - t->prog = NULL; - rcu_assign_pointer(t->callback_fn, NULL); + t->cb.value = (void *)timer - map->record->timer_off; + t->cb.map = map; + t->cb.prog = NULL; + rcu_assign_pointer(t->cb.callback_fn, NULL); hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); t->timer.function = bpf_timer_cb; WRITE_ONCE(timer->timer, t); @@ -1222,7 +1227,7 @@ static const struct bpf_func_proto bpf_timer_init_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callback_fn, +BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn, struct bpf_prog_aux *, aux) { struct bpf_prog *prev, *prog = aux->prog; @@ -1237,7 +1242,7 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callb ret = -EINVAL; goto out; } - if (!atomic64_read(&t->map->usercnt)) { + if (!atomic64_read(&t->cb.map->usercnt)) { /* maps with timers must be either held by user space * or pinned in bpffs. Otherwise timer might still be * running even when bpf prog is detached and user space @@ -1246,7 +1251,7 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callb ret = -EPERM; goto out; } - prev = t->prog; + prev = t->cb.prog; if (prev != prog) { /* Bump prog refcnt once. Every bpf_timer_set_callback() * can pick different callback_fn-s within the same prog. @@ -1259,9 +1264,9 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callb if (prev) /* Drop prev prog refcnt when swapping with new prog */ bpf_prog_put(prev); - t->prog = prog; + t->cb.prog = prog; } - rcu_assign_pointer(t->callback_fn, callback_fn); + rcu_assign_pointer(t->cb.callback_fn, callback_fn); out: __bpf_spin_unlock_irqrestore(&timer->lock); return ret; @@ -1275,7 +1280,7 @@ static const struct bpf_func_proto bpf_timer_set_callback_proto = { .arg2_type = ARG_PTR_TO_FUNC, }; -BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, flags) +BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, flags) { struct bpf_hrtimer *t; int ret = 0; @@ -1287,7 +1292,7 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla return -EINVAL; __bpf_spin_lock_irqsave(&timer->lock); t = timer->timer; - if (!t || !t->prog) { + if (!t || !t->cb.prog) { ret = -EINVAL; goto out; } @@ -1315,18 +1320,18 @@ static const struct bpf_func_proto bpf_timer_start_proto = { .arg3_type = ARG_ANYTHING, }; -static void drop_prog_refcnt(struct bpf_hrtimer *t) +static void drop_prog_refcnt(struct bpf_async_cb *async) { - struct bpf_prog *prog = t->prog; + struct bpf_prog *prog = async->prog; if (prog) { bpf_prog_put(prog); - t->prog = NULL; - rcu_assign_pointer(t->callback_fn, NULL); + async->prog = NULL; + rcu_assign_pointer(async->callback_fn, NULL); } } -BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer) +BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) { struct bpf_hrtimer *t; int ret = 0; @@ -1348,7 +1353,7 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer) ret = -EDEADLK; goto out; } - drop_prog_refcnt(t); + drop_prog_refcnt(&t->cb); out: __bpf_spin_unlock_irqrestore(&timer->lock); /* Cancel the timer and wait for associated callback to finish @@ -1371,7 +1376,7 @@ static const struct bpf_func_proto bpf_timer_cancel_proto = { */ void bpf_timer_cancel_and_free(void *val) { - struct bpf_timer_kern *timer = val; + struct bpf_async_kern *timer = val; struct bpf_hrtimer *t; /* Performance optimization: read timer->timer without lock first. */ @@ -1383,7 +1388,7 @@ void bpf_timer_cancel_and_free(void *val) t = timer->timer; if (!t) goto out; - drop_prog_refcnt(t); + drop_prog_refcnt(&t->cb); /* The subsequent bpf_timer_start/cancel() helpers won't be able to use * this timer, since it won't be initialized. */ @@ -1410,7 +1415,7 @@ out: */ if (this_cpu_read(hrtimer_running) != t) hrtimer_cancel(&t->timer); - kfree_rcu(t, rcu); + kfree_rcu(t, cb.rcu); } BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr) From 56b4a177ae6322173360a93ea828ad18570a5a14 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:02 +0200 Subject: [PATCH 107/147] bpf: replace bpf_timer_init with a generic helper No code change except for the new flags argument being stored in the local data struct. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-2-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 91 ++++++++++++++++++++++++++++++-------------- 1 file changed, 63 insertions(+), 28 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 7c7f31dd87ba..0f0201171576 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1111,7 +1111,10 @@ struct bpf_hrtimer { /* the actual struct hidden inside uapi struct bpf_timer */ struct bpf_async_kern { - struct bpf_hrtimer *timer; + union { + struct bpf_async_cb *cb; + struct bpf_hrtimer *timer; + }; /* bpf_spin_lock is used here instead of spinlock_t to make * sure that it always fits into space reserved by struct bpf_timer * regardless of LOCKDEP and spinlock debug flags. @@ -1119,6 +1122,10 @@ struct bpf_async_kern { struct bpf_spin_lock lock; } __attribute__((aligned(8))); +enum bpf_async_type { + BPF_ASYNC_TYPE_TIMER = 0, +}; + static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running); static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) @@ -1160,46 +1167,55 @@ out: return HRTIMER_NORESTART; } -BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map, - u64, flags) +static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags, + enum bpf_async_type type) { - clockid_t clockid = flags & (MAX_CLOCKS - 1); + struct bpf_async_cb *cb; struct bpf_hrtimer *t; + clockid_t clockid; + size_t size; int ret = 0; - BUILD_BUG_ON(MAX_CLOCKS != 16); - BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_timer)); - BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_timer)); - if (in_nmi()) return -EOPNOTSUPP; - if (flags >= MAX_CLOCKS || - /* similar to timerfd except _ALARM variants are not supported */ - (clockid != CLOCK_MONOTONIC && - clockid != CLOCK_REALTIME && - clockid != CLOCK_BOOTTIME)) + switch (type) { + case BPF_ASYNC_TYPE_TIMER: + size = sizeof(struct bpf_hrtimer); + break; + default: return -EINVAL; - __bpf_spin_lock_irqsave(&timer->lock); - t = timer->timer; + } + + __bpf_spin_lock_irqsave(&async->lock); + t = async->timer; if (t) { ret = -EBUSY; goto out; } + /* allocate hrtimer via map_kmalloc to use memcg accounting */ - t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node); - if (!t) { + cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node); + if (!cb) { ret = -ENOMEM; goto out; } - t->cb.value = (void *)timer - map->record->timer_off; - t->cb.map = map; - t->cb.prog = NULL; - rcu_assign_pointer(t->cb.callback_fn, NULL); - hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); - t->timer.function = bpf_timer_cb; - WRITE_ONCE(timer->timer, t); - /* Guarantee the order between timer->timer and map->usercnt. So + + if (type == BPF_ASYNC_TYPE_TIMER) { + clockid = flags & (MAX_CLOCKS - 1); + t = (struct bpf_hrtimer *)cb; + + hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); + t->timer.function = bpf_timer_cb; + cb->value = (void *)async - map->record->timer_off; + } + cb->map = map; + cb->prog = NULL; + cb->flags = flags; + rcu_assign_pointer(cb->callback_fn, NULL); + + WRITE_ONCE(async->cb, cb); + /* Guarantee the order between async->cb and map->usercnt. So * when there are concurrent uref release and bpf timer init, either * bpf_timer_cancel_and_free() called by uref release reads a no-NULL * timer or atomic64_read() below returns a zero usercnt. @@ -1209,15 +1225,34 @@ BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map /* maps with timers must be either held by user space * or pinned in bpffs. */ - WRITE_ONCE(timer->timer, NULL); - kfree(t); + WRITE_ONCE(async->cb, NULL); + kfree(cb); ret = -EPERM; } out: - __bpf_spin_unlock_irqrestore(&timer->lock); + __bpf_spin_unlock_irqrestore(&async->lock); return ret; } +BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map, + u64, flags) +{ + clock_t clockid = flags & (MAX_CLOCKS - 1); + + BUILD_BUG_ON(MAX_CLOCKS != 16); + BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_timer)); + BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_timer)); + + if (flags >= MAX_CLOCKS || + /* similar to timerfd except _ALARM variants are not supported */ + (clockid != CLOCK_MONOTONIC && + clockid != CLOCK_REALTIME && + clockid != CLOCK_BOOTTIME)) + return -EINVAL; + + return __bpf_async_init(timer, map, flags, BPF_ASYNC_TYPE_TIMER); +} + static const struct bpf_func_proto bpf_timer_init_proto = { .func = bpf_timer_init, .gpl_only = true, From 073f11b0264310b85754b6a0946afee753790c66 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:03 +0200 Subject: [PATCH 108/147] bpf: replace bpf_timer_set_callback with a generic helper In the same way we have a generic __bpf_async_init(), we also need to share code between timer and workqueue for the set_callback call. We just add an unused flags parameter, as it will be used for workqueues. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-3-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 0f0201171576..1cbbf28e9398 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1262,22 +1262,23 @@ static const struct bpf_func_proto bpf_timer_init_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn, - struct bpf_prog_aux *, aux) +static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn, + struct bpf_prog_aux *aux, unsigned int flags, + enum bpf_async_type type) { struct bpf_prog *prev, *prog = aux->prog; - struct bpf_hrtimer *t; + struct bpf_async_cb *cb; int ret = 0; if (in_nmi()) return -EOPNOTSUPP; - __bpf_spin_lock_irqsave(&timer->lock); - t = timer->timer; - if (!t) { + __bpf_spin_lock_irqsave(&async->lock); + cb = async->cb; + if (!cb) { ret = -EINVAL; goto out; } - if (!atomic64_read(&t->cb.map->usercnt)) { + if (!atomic64_read(&cb->map->usercnt)) { /* maps with timers must be either held by user space * or pinned in bpffs. Otherwise timer might still be * running even when bpf prog is detached and user space @@ -1286,7 +1287,7 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callb ret = -EPERM; goto out; } - prev = t->cb.prog; + prev = cb->prog; if (prev != prog) { /* Bump prog refcnt once. Every bpf_timer_set_callback() * can pick different callback_fn-s within the same prog. @@ -1299,14 +1300,20 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callb if (prev) /* Drop prev prog refcnt when swapping with new prog */ bpf_prog_put(prev); - t->cb.prog = prog; + cb->prog = prog; } - rcu_assign_pointer(t->cb.callback_fn, callback_fn); + rcu_assign_pointer(cb->callback_fn, callback_fn); out: - __bpf_spin_unlock_irqrestore(&timer->lock); + __bpf_spin_unlock_irqrestore(&async->lock); return ret; } +BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn, + struct bpf_prog_aux *, aux) +{ + return __bpf_async_set_callback(timer, callback_fn, aux, 0, BPF_ASYNC_TYPE_TIMER); +} + static const struct bpf_func_proto bpf_timer_set_callback_proto = { .func = bpf_timer_set_callback, .gpl_only = true, From fc22d9495f0b32d75b5d25a17b300b7aad05c55d Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:04 +0200 Subject: [PATCH 109/147] bpf: replace bpf_timer_cancel_and_free with a generic helper Same reason than most bpf_timer* functions, we need almost the same for workqueues. So extract the generic part out of it so bpf_wq_cancel_and_free can reuse it. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-4-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1cbbf28e9398..0b8ba6c81994 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1413,36 +1413,44 @@ static const struct bpf_func_proto bpf_timer_cancel_proto = { .arg1_type = ARG_PTR_TO_TIMER, }; +static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *async) +{ + struct bpf_async_cb *cb; + + /* Performance optimization: read async->cb without lock first. */ + if (!READ_ONCE(async->cb)) + return NULL; + + __bpf_spin_lock_irqsave(&async->lock); + /* re-read it under lock */ + cb = async->cb; + if (!cb) + goto out; + drop_prog_refcnt(cb); + /* The subsequent bpf_timer_start/cancel() helpers won't be able to use + * this timer, since it won't be initialized. + */ + WRITE_ONCE(async->cb, NULL); +out: + __bpf_spin_unlock_irqrestore(&async->lock); + return cb; +} + /* This function is called by map_delete/update_elem for individual element and * by ops->map_release_uref when the user space reference to a map reaches zero. */ void bpf_timer_cancel_and_free(void *val) { - struct bpf_async_kern *timer = val; struct bpf_hrtimer *t; - /* Performance optimization: read timer->timer without lock first. */ - if (!READ_ONCE(timer->timer)) - return; + t = (struct bpf_hrtimer *)__bpf_async_cancel_and_free(val); - __bpf_spin_lock_irqsave(&timer->lock); - /* re-read it under lock */ - t = timer->timer; - if (!t) - goto out; - drop_prog_refcnt(&t->cb); - /* The subsequent bpf_timer_start/cancel() helpers won't be able to use - * this timer, since it won't be initialized. - */ - WRITE_ONCE(timer->timer, NULL); -out: - __bpf_spin_unlock_irqrestore(&timer->lock); if (!t) return; /* Cancel the timer and wait for callback to complete if it was running. * If hrtimer_cancel() can be safely called it's safe to call kfree(t) * right after for both preallocated and non-preallocated maps. - * The timer->timer = NULL was already done and no code path can + * The async->cb = NULL was already done and no code path can * see address 't' anymore. * * Check that bpf_map_delete/update_elem() wasn't called from timer @@ -1451,7 +1459,7 @@ out: * return -1). Though callback_fn is still running on this cpu it's * safe to do kfree(t) because bpf_timer_cb() read everything it needed * from 't'. The bpf subprog callback_fn won't be able to access 't', - * since timer->timer = NULL was already done. The timer will be + * since async->cb = NULL was already done. The timer will be * effectively cancelled because bpf_timer_cb() will return * HRTIMER_NORESTART. */ From d56b63cf0c0f71e1b2e04dd8220b408f049e67ff Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:05 +0200 Subject: [PATCH 110/147] bpf: add support for bpf_wq user type Mostly a copy/paste from the bpf_timer API, without the initialization and free, as they will be done in a separate patch. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-5-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 11 ++++++++++- include/uapi/linux/bpf.h | 4 ++++ kernel/bpf/btf.c | 17 +++++++++++++++++ kernel/bpf/syscall.c | 6 +++++- kernel/bpf/verifier.c | 9 +++++++++ 5 files changed, 45 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5034c1b4ded7..c7dcfd395555 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -185,7 +185,7 @@ struct bpf_map_ops { enum { /* Support at most 10 fields in a BTF type */ - BTF_FIELDS_MAX = 10, + BTF_FIELDS_MAX = 11, }; enum btf_field_type { @@ -202,6 +202,7 @@ enum btf_field_type { BPF_GRAPH_NODE = BPF_RB_NODE | BPF_LIST_NODE, BPF_GRAPH_ROOT = BPF_RB_ROOT | BPF_LIST_HEAD, BPF_REFCOUNT = (1 << 9), + BPF_WORKQUEUE = (1 << 10), }; typedef void (*btf_dtor_kfunc_t)(void *); @@ -238,6 +239,7 @@ struct btf_record { u32 field_mask; int spin_lock_off; int timer_off; + int wq_off; int refcount_off; struct btf_field fields[]; }; @@ -312,6 +314,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type) return "bpf_spin_lock"; case BPF_TIMER: return "bpf_timer"; + case BPF_WORKQUEUE: + return "bpf_wq"; case BPF_KPTR_UNREF: case BPF_KPTR_REF: return "kptr"; @@ -340,6 +344,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type) return sizeof(struct bpf_spin_lock); case BPF_TIMER: return sizeof(struct bpf_timer); + case BPF_WORKQUEUE: + return sizeof(struct bpf_wq); case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: @@ -367,6 +373,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type) return __alignof__(struct bpf_spin_lock); case BPF_TIMER: return __alignof__(struct bpf_timer); + case BPF_WORKQUEUE: + return __alignof__(struct bpf_wq); case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: @@ -406,6 +414,7 @@ static inline void bpf_obj_init_field(const struct btf_field *field, void *addr) /* RB_ROOT_CACHED 0-inits, no need to do anything after memset */ case BPF_SPIN_LOCK: case BPF_TIMER: + case BPF_WORKQUEUE: case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index cee0a7915c08..e4ae83550fb3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -7306,6 +7306,10 @@ struct bpf_timer { __u64 __opaque[2]; } __attribute__((aligned(8))); +struct bpf_wq { + __u64 __opaque[2]; +} __attribute__((aligned(8))); + struct bpf_dynptr { __u64 __opaque[2]; } __attribute__((aligned(8))); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6d46cee47ae3..8291fbfd27b1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3464,6 +3464,15 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask, goto end; } } + if (field_mask & BPF_WORKQUEUE) { + if (!strcmp(name, "bpf_wq")) { + if (*seen_mask & BPF_WORKQUEUE) + return -E2BIG; + *seen_mask |= BPF_WORKQUEUE; + type = BPF_WORKQUEUE; + goto end; + } + } field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head"); field_mask_test_name(BPF_LIST_NODE, "bpf_list_node"); field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root"); @@ -3515,6 +3524,7 @@ static int btf_find_struct_field(const struct btf *btf, switch (field_type) { case BPF_SPIN_LOCK: case BPF_TIMER: + case BPF_WORKQUEUE: case BPF_LIST_NODE: case BPF_RB_NODE: case BPF_REFCOUNT: @@ -3582,6 +3592,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, switch (field_type) { case BPF_SPIN_LOCK: case BPF_TIMER: + case BPF_WORKQUEUE: case BPF_LIST_NODE: case BPF_RB_NODE: case BPF_REFCOUNT: @@ -3816,6 +3827,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type rec->spin_lock_off = -EINVAL; rec->timer_off = -EINVAL; + rec->wq_off = -EINVAL; rec->refcount_off = -EINVAL; for (i = 0; i < cnt; i++) { field_type_size = btf_field_type_size(info_arr[i].type); @@ -3846,6 +3858,11 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type /* Cache offset for faster lookup at runtime */ rec->timer_off = rec->fields[i].offset; break; + case BPF_WORKQUEUE: + WARN_ON_ONCE(rec->wq_off >= 0); + /* Cache offset for faster lookup at runtime */ + rec->wq_off = rec->fields[i].offset; + break; case BPF_REFCOUNT: WARN_ON_ONCE(rec->refcount_off >= 0); /* Cache offset for faster lookup at runtime */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7d392ec83655..0848e4141b00 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -559,6 +559,7 @@ void btf_record_free(struct btf_record *rec) case BPF_SPIN_LOCK: case BPF_TIMER: case BPF_REFCOUNT: + case BPF_WORKQUEUE: /* Nothing to release */ break; default: @@ -608,6 +609,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) case BPF_SPIN_LOCK: case BPF_TIMER: case BPF_REFCOUNT: + case BPF_WORKQUEUE: /* Nothing to acquire */ break; default: @@ -679,6 +681,8 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) case BPF_TIMER: bpf_timer_cancel_and_free(field_ptr); break; + case BPF_WORKQUEUE: + break; case BPF_KPTR_UNREF: WRITE_ONCE(*(u64 *)field_ptr, 0); break; @@ -1085,7 +1089,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | - BPF_RB_ROOT | BPF_REFCOUNT, + BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { int i; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5a7e34e83a5b..89490a95b120 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1838,6 +1838,8 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) */ if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER)) reg->map_uid = reg->id; + if (btf_record_has_field(map->inner_map_meta->record, BPF_WORKQUEUE)) + reg->map_uid = reg->id; } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || @@ -18141,6 +18143,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } + if (btf_record_has_field(map->record, BPF_WORKQUEUE)) { + if (is_tracing_prog_type(prog_type)) { + verbose(env, "tracing progs cannot use bpf_wq yet\n"); + return -EINVAL; + } + } + if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) && !bpf_offload_prog_map_match(prog, map)) { verbose(env, "offload device mismatch between prog and map\n"); From f1d0a2fbb0088675cceeab73d1f4b4308b289984 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:06 +0200 Subject: [PATCH 111/147] tools: sync include/uapi/linux/bpf.h cp include/uapi/linux/bpf.h tools/include/uapi/linux/bpf.h Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-6-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/bpf.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index cee0a7915c08..e4ae83550fb3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -7306,6 +7306,10 @@ struct bpf_timer { __u64 __opaque[2]; } __attribute__((aligned(8))); +struct bpf_wq { + __u64 __opaque[2]; +} __attribute__((aligned(8))); + struct bpf_dynptr { __u64 __opaque[2]; } __attribute__((aligned(8))); From ad2c03e691be3268eefc75ff1d892db3f0e79f62 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:07 +0200 Subject: [PATCH 112/147] bpf: verifier: bail out if the argument is not a map When a kfunc is declared with a KF_ARG_PTR_TO_MAP, we should have reg->map_ptr set to a non NULL value, otherwise, that means that the underlying type is not a map. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-7-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 89490a95b120..4adf7fc33e5a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11720,6 +11720,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_NULL: continue; case KF_ARG_PTR_TO_MAP: + if (!reg->map_ptr) { + verbose(env, "pointer in R%d isn't map pointer\n", regno); + return -EINVAL; + } + fallthrough; case KF_ARG_PTR_TO_ALLOC_BTF_ID: case KF_ARG_PTR_TO_BTF_ID: if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta)) From d940c9b94d7e6d9cff288623e3e8bf5fdea98b79 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:08 +0200 Subject: [PATCH 113/147] bpf: add support for KF_ARG_PTR_TO_WORKQUEUE Introduce support for KF_ARG_PTR_TO_WORKQUEUE. The kfuncs will use bpf_wq as argument and that will be recognized as workqueue argument by verifier. bpf_wq_kern casting can happen inside kfunc, but using bpf_wq in argument makes life easier for users who work with non-kern type in BPF progs. Duplicate process_timer_func into process_wq_func. meta argument is only needed to ensure bpf_wq_init's workqueue and map arguments are coming from the same map (map_uid logic is necessary for correct inner-map handling), so also amend check_kfunc_args() to match what helpers functions check is doing. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-8-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 65 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4adf7fc33e5a..cc20fda907fb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -332,6 +332,10 @@ struct bpf_kfunc_call_arg_meta { u8 spi; u8 frameno; } iter; + struct { + struct bpf_map *ptr; + int uid; + } map; u64 mem_size; }; @@ -7598,6 +7602,23 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, return 0; } +static int process_wq_func(struct bpf_verifier_env *env, int regno, + struct bpf_kfunc_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_map *map = reg->map_ptr; + u64 val = reg->var_off.value; + + if (map->record->wq_off != val + reg->off) { + verbose(env, "off %lld doesn't point to 'struct bpf_wq' that is at %d\n", + val + reg->off, map->record->wq_off); + return -EINVAL; + } + meta->map.uid = reg->map_uid; + meta->map.ptr = map; + return 0; +} + static int process_kptr_func(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { @@ -10843,6 +10864,7 @@ enum { KF_ARG_LIST_NODE_ID, KF_ARG_RB_ROOT_ID, KF_ARG_RB_NODE_ID, + KF_ARG_WORKQUEUE_ID, }; BTF_ID_LIST(kf_arg_btf_ids) @@ -10851,6 +10873,7 @@ BTF_ID(struct, bpf_list_head) BTF_ID(struct, bpf_list_node) BTF_ID(struct, bpf_rb_root) BTF_ID(struct, bpf_rb_node) +BTF_ID(struct, bpf_wq) static bool __is_kfunc_ptr_arg_type(const struct btf *btf, const struct btf_param *arg, int type) @@ -10894,6 +10917,11 @@ static bool is_kfunc_arg_rbtree_node(const struct btf *btf, const struct btf_par return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_NODE_ID); } +static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID); +} + static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf, const struct btf_param *arg) { @@ -10963,6 +10991,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_NULL, KF_ARG_PTR_TO_CONST_STR, KF_ARG_PTR_TO_MAP, + KF_ARG_PTR_TO_WORKQUEUE, }; enum special_kfunc_type { @@ -11119,6 +11148,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_map(meta->btf, &args[argno])) return KF_ARG_PTR_TO_MAP; + if (is_kfunc_arg_wq(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_WORKQUEUE; + if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { if (!btf_type_is_struct(ref_t)) { verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n", @@ -11724,6 +11756,29 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "pointer in R%d isn't map pointer\n", regno); return -EINVAL; } + if (meta->map.ptr && reg->map_ptr->record->wq_off >= 0) { + /* Use map_uid (which is unique id of inner map) to reject: + * inner_map1 = bpf_map_lookup_elem(outer_map, key1) + * inner_map2 = bpf_map_lookup_elem(outer_map, key2) + * if (inner_map1 && inner_map2) { + * wq = bpf_map_lookup_elem(inner_map1); + * if (wq) + * // mismatch would have been allowed + * bpf_wq_init(wq, inner_map2); + * } + * + * Comparing map_ptr is enough to distinguish normal and outer maps. + */ + if (meta->map.ptr != reg->map_ptr || + meta->map.uid != reg->map_uid) { + verbose(env, + "workqueue pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n", + meta->map.uid, reg->map_uid); + return -EINVAL; + } + } + meta->map.ptr = reg->map_ptr; + meta->map.uid = reg->map_uid; fallthrough; case KF_ARG_PTR_TO_ALLOC_BTF_ID: case KF_ARG_PTR_TO_BTF_ID: @@ -11757,6 +11812,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_CALLBACK: case KF_ARG_PTR_TO_REFCOUNTED_KPTR: case KF_ARG_PTR_TO_CONST_STR: + case KF_ARG_PTR_TO_WORKQUEUE: /* Trusted by default */ break; default: @@ -12043,6 +12099,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (ret) return ret; break; + case KF_ARG_PTR_TO_WORKQUEUE: + if (reg->type != PTR_TO_MAP_VALUE) { + verbose(env, "arg#%d doesn't point to a map value\n", i); + return -EINVAL; + } + ret = process_wq_func(env, regno, meta); + if (ret < 0) + return ret; + break; } } From 246331e3f1eac905170a923f0ec76725c2558232 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:09 +0200 Subject: [PATCH 114/147] bpf: allow struct bpf_wq to be embedded in arraymaps and hashmaps Currently bpf_wq_cancel_and_free() is just a placeholder as there is no memory allocation for bpf_wq just yet. Again, duplication of the bpf_timer approach Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-9-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ kernel/bpf/arraymap.c | 18 ++++++++------ kernel/bpf/hashtab.c | 55 +++++++++++++++++++++++++++++++++---------- kernel/bpf/helpers.c | 8 +++++++ kernel/bpf/syscall.c | 9 +++++++ 5 files changed, 73 insertions(+), 19 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c7dcfd395555..64bf0cf3ee95 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -534,6 +534,7 @@ static inline void zero_map_value(struct bpf_map *map, void *dst) void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); void bpf_timer_cancel_and_free(void *timer); +void bpf_wq_cancel_and_free(void *timer); void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock); void bpf_rb_root_free(const struct btf_field *field, void *rb_root, @@ -2204,6 +2205,7 @@ void bpf_map_free_record(struct bpf_map *map); struct btf_record *btf_record_dup(const struct btf_record *rec); bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); void bpf_obj_free_timer(const struct btf_record *rec, void *obj); +void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj); void bpf_obj_free_fields(const struct btf_record *rec, void *obj); void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 8c1e6d7654bb..580d07b15471 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -428,17 +428,21 @@ static void *array_map_vmalloc_addr(struct bpf_array *array) return (void *)round_down((unsigned long)array, PAGE_SIZE); } -static void array_map_free_timers(struct bpf_map *map) +static void array_map_free_timers_wq(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - /* We don't reset or free fields other than timer on uref dropping to zero. */ - if (!btf_record_has_field(map->record, BPF_TIMER)) - return; + /* We don't reset or free fields other than timer and workqueue + * on uref dropping to zero. + */ + if (btf_record_has_field(map->record, BPF_TIMER)) + for (i = 0; i < array->map.max_entries; i++) + bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); - for (i = 0; i < array->map.max_entries; i++) - bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); + if (btf_record_has_field(map->record, BPF_WORKQUEUE)) + for (i = 0; i < array->map.max_entries; i++) + bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i)); } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ @@ -782,7 +786,7 @@ const struct bpf_map_ops array_map_ops = { .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, - .map_release_uref = array_map_free_timers, + .map_release_uref = array_map_free_timers_wq, .map_lookup_elem = array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index c3e79a0b9361..0179183c543a 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -240,6 +240,26 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) } } +static void htab_free_prealloced_wq(struct bpf_htab *htab) +{ + u32 num_entries = htab->map.max_entries; + int i; + + if (!btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) + return; + if (htab_has_extra_elems(htab)) + num_entries += num_possible_cpus(); + + for (i = 0; i < num_entries; i++) { + struct htab_elem *elem; + + elem = get_htab_elem(htab, i); + bpf_obj_free_workqueue(htab->map.record, + elem->key + round_up(htab->map.key_size, 8)); + cond_resched(); + } +} + static void htab_free_prealloced_fields(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; @@ -1495,7 +1515,7 @@ static void delete_all_elements(struct bpf_htab *htab) migrate_enable(); } -static void htab_free_malloced_timers(struct bpf_htab *htab) +static void htab_free_malloced_timers_or_wq(struct bpf_htab *htab, bool is_timer) { int i; @@ -1507,24 +1527,35 @@ static void htab_free_malloced_timers(struct bpf_htab *htab) hlist_nulls_for_each_entry(l, n, head, hash_node) { /* We only free timer on uref dropping to zero */ - bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8)); + if (is_timer) + bpf_obj_free_timer(htab->map.record, + l->key + round_up(htab->map.key_size, 8)); + else + bpf_obj_free_workqueue(htab->map.record, + l->key + round_up(htab->map.key_size, 8)); } cond_resched_rcu(); } rcu_read_unlock(); } -static void htab_map_free_timers(struct bpf_map *map) +static void htab_map_free_timers_and_wq(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - /* We only free timer on uref dropping to zero */ - if (!btf_record_has_field(htab->map.record, BPF_TIMER)) - return; - if (!htab_is_prealloc(htab)) - htab_free_malloced_timers(htab); - else - htab_free_prealloced_timers(htab); + /* We only free timer and workqueue on uref dropping to zero */ + if (btf_record_has_field(htab->map.record, BPF_TIMER)) { + if (!htab_is_prealloc(htab)) + htab_free_malloced_timers_or_wq(htab, true); + else + htab_free_prealloced_timers(htab); + } + if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) { + if (!htab_is_prealloc(htab)) + htab_free_malloced_timers_or_wq(htab, false); + else + htab_free_prealloced_wq(htab); + } } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ @@ -2260,7 +2291,7 @@ const struct bpf_map_ops htab_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, - .map_release_uref = htab_map_free_timers, + .map_release_uref = htab_map_free_timers_and_wq, .map_lookup_elem = htab_map_lookup_elem, .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem, .map_update_elem = htab_map_update_elem, @@ -2281,7 +2312,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, - .map_release_uref = htab_map_free_timers, + .map_release_uref = htab_map_free_timers_and_wq, .map_lookup_elem = htab_lru_map_lookup_elem, .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem, .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 0b8ba6c81994..fe827f6e5234 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1468,6 +1468,14 @@ void bpf_timer_cancel_and_free(void *val) kfree_rcu(t, cb.rcu); } +/* This function is called by map_delete/update_elem for individual element and + * by ops->map_release_uref when the user space reference to a map reaches zero. + */ +void bpf_wq_cancel_and_free(void *val) +{ + BTF_TYPE_EMIT(struct bpf_wq); +} + BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr) { unsigned long *kptr = map_value; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0848e4141b00..63e368337483 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -661,6 +661,13 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj) bpf_timer_cancel_and_free(obj + rec->timer_off); } +void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj) +{ + if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_WORKQUEUE))) + return; + bpf_wq_cancel_and_free(obj + rec->wq_off); +} + void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; @@ -682,6 +689,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) bpf_timer_cancel_and_free(field_ptr); break; case BPF_WORKQUEUE: + bpf_wq_cancel_and_free(field_ptr); break; case BPF_KPTR_UNREF: WRITE_ONCE(*(u64 *)field_ptr, 0); @@ -1119,6 +1127,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, } break; case BPF_TIMER: + case BPF_WORKQUEUE: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) { From b4abee7c1ae3d59440e7915da28c6d2cd394738a Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:10 +0200 Subject: [PATCH 115/147] selftests/bpf: add bpf_wq tests We simply try in all supported map types if we can store/load a bpf_wq. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-10-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/wq.c | 11 ++ tools/testing/selftests/bpf/progs/wq.c | 129 ++++++++++++++++++++ 2 files changed, 140 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/wq.c create mode 100644 tools/testing/selftests/bpf/progs/wq.c diff --git a/tools/testing/selftests/bpf/prog_tests/wq.c b/tools/testing/selftests/bpf/prog_tests/wq.c new file mode 100644 index 000000000000..9a07b8bc2c52 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/wq.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Benjamin Tissoires */ +#include +#include "wq.skel.h" + +void serial_test_wq(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + + RUN_TESTS(wq); +} diff --git a/tools/testing/selftests/bpf/progs/wq.c b/tools/testing/selftests/bpf/progs/wq.c new file mode 100644 index 000000000000..833f54aa8fdf --- /dev/null +++ b/tools/testing/selftests/bpf/progs/wq.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Benjamin Tissoires + */ + +#include "bpf_experimental.h" +#include +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +struct hmap_elem { + int counter; + struct bpf_timer timer; /* unused */ + struct bpf_spin_lock lock; /* unused */ + struct bpf_wq work; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1000); + __type(key, int); + __type(value, struct hmap_elem); +} hmap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(map_flags, BPF_F_NO_PREALLOC); + __uint(max_entries, 1000); + __type(key, int); + __type(value, struct hmap_elem); +} hmap_malloc SEC(".maps"); + +struct elem { + struct bpf_wq w; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 2); + __type(key, int); + __type(value, struct elem); +} array SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __uint(max_entries, 4); + __type(key, int); + __type(value, struct elem); +} lru SEC(".maps"); + +static int test_elem_callback(void *map, int *key) +{ + struct elem init = {}, *val; + + if (map == &lru && + bpf_map_update_elem(map, key, &init, 0)) + return -1; + + val = bpf_map_lookup_elem(map, key); + if (!val) + return -2; + + return 0; +} + +static int test_hmap_elem_callback(void *map, int *key) +{ + struct hmap_elem init = {}, *val; + + if (bpf_map_update_elem(map, key, &init, 0)) + return -1; + + val = bpf_map_lookup_elem(map, key); + if (!val) + return -2; + + return 0; +} + +SEC("tc") +/* test that workqueues can be used from an array */ +__retval(0) +long test_call_array_sleepable(void *ctx) +{ + int key = 0; + + return test_elem_callback(&array, &key); +} + +SEC("syscall") +/* Same test than above but from a sleepable context. */ +__retval(0) +long test_syscall_array_sleepable(void *ctx) +{ + int key = 1; + + return test_elem_callback(&array, &key); +} + +SEC("tc") +/* test that workqueues can be used from a hashmap */ +__retval(0) +long test_call_hash_sleepable(void *ctx) +{ + int key = 2; + + return test_hmap_elem_callback(&hmap, &key); +} + +SEC("tc") +/* test that workqueues can be used from a hashmap with NO_PREALLOC. */ +__retval(0) +long test_call_hash_malloc_sleepable(void *ctx) +{ + int key = 3; + + return test_hmap_elem_callback(&hmap_malloc, &key); +} + +SEC("tc") +/* test that workqueues can be used from a LRU map */ +__retval(0) +long test_call_lru_sleepable(void *ctx) +{ + int key = 4; + + return test_elem_callback(&lru, &key); +} From eb48f6cd41a0f7803770a76bbffb6bd5b1b2ae2f Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:11 +0200 Subject: [PATCH 116/147] bpf: wq: add bpf_wq_init We need to teach the verifier about the second argument which is declared as void * but which is of type KF_ARG_PTR_TO_MAP. We could have dropped this extra case if we declared the second argument as struct bpf_map *, but that means users will have to do extra casting to have their program compile. We also need to duplicate the timer code for the checking if the map argument is matching the provided workqueue. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-11-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 104 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 102 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index fe827f6e5234..1fc002cfe2b1 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1109,11 +1109,18 @@ struct bpf_hrtimer { struct hrtimer timer; }; -/* the actual struct hidden inside uapi struct bpf_timer */ +struct bpf_work { + struct bpf_async_cb cb; + struct work_struct work; + struct work_struct delete_work; +}; + +/* the actual struct hidden inside uapi struct bpf_timer and bpf_wq */ struct bpf_async_kern { union { struct bpf_async_cb *cb; struct bpf_hrtimer *timer; + struct bpf_work *work; }; /* bpf_spin_lock is used here instead of spinlock_t to make * sure that it always fits into space reserved by struct bpf_timer @@ -1124,6 +1131,7 @@ struct bpf_async_kern { enum bpf_async_type { BPF_ASYNC_TYPE_TIMER = 0, + BPF_ASYNC_TYPE_WQ, }; static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running); @@ -1167,11 +1175,64 @@ out: return HRTIMER_NORESTART; } +static void bpf_wq_work(struct work_struct *work) +{ + struct bpf_work *w = container_of(work, struct bpf_work, work); + struct bpf_tramp_run_ctx __maybe_unused run_ctx; + struct bpf_async_cb *cb = &w->cb; + struct bpf_prog *prog = cb->prog; + struct bpf_map *map = cb->map; + bpf_callback_t callback_fn; + void *value = cb->value; + void *key; + u32 idx; + + BTF_TYPE_EMIT(struct bpf_wq); + + callback_fn = READ_ONCE(cb->callback_fn); + if (!callback_fn || !prog) + return; + + if (map->map_type == BPF_MAP_TYPE_ARRAY) { + struct bpf_array *array = container_of(map, struct bpf_array, map); + + /* compute the key */ + idx = ((char *)value - array->value) / array->elem_size; + key = &idx; + } else { /* hash or lru */ + key = value - round_up(map->key_size, 8); + } + + run_ctx.bpf_cookie = 0; + + if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) { + /* recursion detected */ + __bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx); + return; + } + + callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0); + /* The verifier checked that return value is zero. */ + + __bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */, + &run_ctx); +} + +static void bpf_wq_delete_work(struct work_struct *work) +{ + struct bpf_work *w = container_of(work, struct bpf_work, delete_work); + + cancel_work_sync(&w->work); + + kfree_rcu(w, cb.rcu); +} + static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags, enum bpf_async_type type) { struct bpf_async_cb *cb; struct bpf_hrtimer *t; + struct bpf_work *w; clockid_t clockid; size_t size; int ret = 0; @@ -1183,6 +1244,9 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u case BPF_ASYNC_TYPE_TIMER: size = sizeof(struct bpf_hrtimer); break; + case BPF_ASYNC_TYPE_WQ: + size = sizeof(struct bpf_work); + break; default: return -EINVAL; } @@ -1201,13 +1265,22 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u goto out; } - if (type == BPF_ASYNC_TYPE_TIMER) { + switch (type) { + case BPF_ASYNC_TYPE_TIMER: clockid = flags & (MAX_CLOCKS - 1); t = (struct bpf_hrtimer *)cb; hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); t->timer.function = bpf_timer_cb; cb->value = (void *)async - map->record->timer_off; + break; + case BPF_ASYNC_TYPE_WQ: + w = (struct bpf_work *)cb; + + INIT_WORK(&w->work, bpf_wq_work); + INIT_WORK(&w->delete_work, bpf_wq_delete_work); + cb->value = (void *)async - map->record->wq_off; + break; } cb->map = map; cb->prog = NULL; @@ -1473,7 +1546,19 @@ void bpf_timer_cancel_and_free(void *val) */ void bpf_wq_cancel_and_free(void *val) { + struct bpf_work *work; + BTF_TYPE_EMIT(struct bpf_wq); + + work = (struct bpf_work *)__bpf_async_cancel_and_free(val); + if (!work) + return; + /* Trigger cancel of the sleepable work, but *do not* wait for + * it to finish if it was running as we might not be in a + * sleepable context. + * kfree will be called once the work has finished. + */ + schedule_work(&work->delete_work); } BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr) @@ -2612,6 +2697,20 @@ __bpf_kfunc void bpf_throw(u64 cookie) WARN(1, "A call to BPF exception callback should never return\n"); } +__bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) +{ + struct bpf_async_kern *async = (struct bpf_async_kern *)wq; + struct bpf_map *map = p__map; + + BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_wq)); + BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_wq)); + + if (flags) + return -EINVAL; + + return __bpf_async_init(async, map, flags, BPF_ASYNC_TYPE_WQ); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(generic_btf_ids) @@ -2689,6 +2788,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) BTF_ID_FLAGS(func, bpf_modify_return_test_tp) +BTF_ID_FLAGS(func, bpf_wq_init) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { From e3d9eac99afd94980475833479332fefd74c5c2b Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:12 +0200 Subject: [PATCH 117/147] selftests/bpf: wq: add bpf_wq_init() checks Allows to test if allocation/free works Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-12-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/bpf_experimental.h | 1 + tools/testing/selftests/bpf/prog_tests/wq.c | 8 ++ tools/testing/selftests/bpf/progs/wq.c | 10 +++ .../testing/selftests/bpf/progs/wq_failures.c | 78 +++++++++++++++++++ 4 files changed, 97 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/wq_failures.c diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 3329ea080865..785b91b629be 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -470,4 +470,5 @@ extern int bpf_iter_css_new(struct bpf_iter_css *it, extern struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it) __weak __ksym; extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym; +extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym; #endif diff --git a/tools/testing/selftests/bpf/prog_tests/wq.c b/tools/testing/selftests/bpf/prog_tests/wq.c index 9a07b8bc2c52..26ab69796103 100644 --- a/tools/testing/selftests/bpf/prog_tests/wq.c +++ b/tools/testing/selftests/bpf/prog_tests/wq.c @@ -2,6 +2,7 @@ /* Copyright (c) 2024 Benjamin Tissoires */ #include #include "wq.skel.h" +#include "wq_failures.skel.h" void serial_test_wq(void) { @@ -9,3 +10,10 @@ void serial_test_wq(void) RUN_TESTS(wq); } + +void serial_test_failures_wq(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + + RUN_TESTS(wq_failures); +} diff --git a/tools/testing/selftests/bpf/progs/wq.c b/tools/testing/selftests/bpf/progs/wq.c index 833f54aa8fdf..ed2fe26e14ef 100644 --- a/tools/testing/selftests/bpf/progs/wq.c +++ b/tools/testing/selftests/bpf/progs/wq.c @@ -52,6 +52,7 @@ struct { static int test_elem_callback(void *map, int *key) { struct elem init = {}, *val; + struct bpf_wq *wq; if (map == &lru && bpf_map_update_elem(map, key, &init, 0)) @@ -61,12 +62,17 @@ static int test_elem_callback(void *map, int *key) if (!val) return -2; + wq = &val->w; + if (bpf_wq_init(wq, map, 0) != 0) + return -3; + return 0; } static int test_hmap_elem_callback(void *map, int *key) { struct hmap_elem init = {}, *val; + struct bpf_wq *wq; if (bpf_map_update_elem(map, key, &init, 0)) return -1; @@ -75,6 +81,10 @@ static int test_hmap_elem_callback(void *map, int *key) if (!val) return -2; + wq = &val->work; + if (bpf_wq_init(wq, map, 0) != 0) + return -3; + return 0; } diff --git a/tools/testing/selftests/bpf/progs/wq_failures.c b/tools/testing/selftests/bpf/progs/wq_failures.c new file mode 100644 index 000000000000..db7015c7d541 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/wq_failures.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Benjamin Tissoires + */ + +#include "bpf_experimental.h" +#include +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +struct elem { + struct bpf_wq w; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 2); + __type(key, int); + __type(value, struct elem); +} array SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __uint(max_entries, 4); + __type(key, int); + __type(value, struct elem); +} lru SEC(".maps"); + +SEC("tc") +/* test that bpf_wq_init takes a map as a second argument + */ +__log_level(2) +__flag(BPF_F_TEST_STATE_FREQ) +__failure +__msg(": (85) call bpf_wq_init#") /* anchor message */ +__msg("pointer in R2 isn't map pointer") +long test_wq_init_nomap(void *ctx) +{ + struct bpf_wq *wq; + struct elem *val; + int key = 0; + + val = bpf_map_lookup_elem(&array, &key); + if (!val) + return -1; + + wq = &val->w; + if (bpf_wq_init(wq, &key, 0) != 0) + return -3; + + return 0; +} + +SEC("tc") +/* test that the workqueue is part of the map in bpf_wq_init + */ +__log_level(2) +__flag(BPF_F_TEST_STATE_FREQ) +__failure +__msg(": (85) call bpf_wq_init#") /* anchor message */ +__msg("workqueue pointer in R1 map_uid=0 doesn't match map pointer in R2 map_uid=0") +long test_wq_init_wrong_map(void *ctx) +{ + struct bpf_wq *wq; + struct elem *val; + int key = 0; + + val = bpf_map_lookup_elem(&array, &key); + if (!val) + return -1; + + wq = &val->w; + if (bpf_wq_init(wq, &lru, 0) != 0) + return -3; + + return 0; +} From 81f1d7a583fa1fa14f0c4e6140d34b5e3d08d227 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:13 +0200 Subject: [PATCH 118/147] bpf: wq: add bpf_wq_set_callback_impl To support sleepable async callbacks, we need to tell push_async_cb() whether the cb is sleepable or not. The verifier now detects that we are in bpf_wq_set_callback_impl and can allow a sleepable callback to happen. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-13-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/helpers.c | 15 +++++++++ kernel/bpf/verifier.c | 60 ++++++++++++++++++++++++++++++++---- 3 files changed, 70 insertions(+), 6 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 36d19cd32eb5..9db35530c878 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -426,6 +426,7 @@ struct bpf_verifier_state { * while they are still in use. */ bool used_as_loop_entry; + bool in_sleepable; /* first and last insn idx of this verifier state */ u32 first_insn_idx; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1fc002cfe2b1..dd2eaaf77c93 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2711,6 +2711,20 @@ __bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) return __bpf_async_init(async, map, flags, BPF_ASYNC_TYPE_WQ); } +__bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq, + int (callback_fn)(void *map, int *key, struct bpf_wq *wq), + unsigned int flags, + void *aux__ign) +{ + struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__ign; + struct bpf_async_kern *async = (struct bpf_async_kern *)wq; + + if (flags) + return -EINVAL; + + return __bpf_async_set_callback(async, callback_fn, aux, flags, BPF_ASYNC_TYPE_WQ); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(generic_btf_ids) @@ -2789,6 +2803,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) BTF_ID_FLAGS(func, bpf_modify_return_test_tp) BTF_ID_FLAGS(func, bpf_wq_init) +BTF_ID_FLAGS(func, bpf_wq_set_callback_impl) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cc20fda907fb..9715c88cc025 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -501,8 +501,12 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id) } static bool is_sync_callback_calling_kfunc(u32 btf_id); +static bool is_async_callback_calling_kfunc(u32 btf_id); +static bool is_callback_calling_kfunc(u32 btf_id); static bool is_bpf_throw_kfunc(struct bpf_insn *insn); +static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id); + static bool is_sync_callback_calling_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_for_each_map_elem || @@ -530,7 +534,8 @@ static bool is_sync_callback_calling_insn(struct bpf_insn *insn) static bool is_async_callback_calling_insn(struct bpf_insn *insn) { - return bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm); + return (bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm)) || + (bpf_pseudo_kfunc_call(insn) && is_async_callback_calling_kfunc(insn->imm)); } static bool is_may_goto_insn(struct bpf_insn *insn) @@ -1429,6 +1434,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, } dst_state->speculative = src->speculative; dst_state->active_rcu_lock = src->active_rcu_lock; + dst_state->in_sleepable = src->in_sleepable; dst_state->curframe = src->curframe; dst_state->active_lock.ptr = src->active_lock.ptr; dst_state->active_lock.id = src->active_lock.id; @@ -2404,7 +2410,7 @@ static void init_func_state(struct bpf_verifier_env *env, /* Similar to push_stack(), but for async callbacks */ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx, - int subprog) + int subprog, bool is_sleepable) { struct bpf_verifier_stack_elem *elem; struct bpf_func_state *frame; @@ -2431,6 +2437,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, * Initialize it similar to do_check_common(). */ elem->st.branches = 1; + elem->st.in_sleepable = is_sleepable; frame = kzalloc(sizeof(*frame), GFP_KERNEL); if (!frame) goto err; @@ -5278,7 +5285,8 @@ bad_type: static bool in_sleepable(struct bpf_verifier_env *env) { - return env->prog->sleepable; + return env->prog->sleepable || + (env->cur_state && env->cur_state->in_sleepable); } /* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock() @@ -9513,7 +9521,7 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins */ env->subprog_info[subprog].is_cb = true; if (bpf_pseudo_kfunc_call(insn) && - !is_sync_callback_calling_kfunc(insn->imm)) { + !is_callback_calling_kfunc(insn->imm)) { verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n", func_id_name(insn->imm), insn->imm); return -EFAULT; @@ -9527,10 +9535,11 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins if (is_async_callback_calling_insn(insn)) { struct bpf_verifier_state *async_cb; - /* there is no real recursion here. timer callbacks are async */ + /* there is no real recursion here. timer and workqueue callbacks are async */ env->subprog_info[subprog].is_async_cb = true; async_cb = push_async_cb(env, env->subprog_info[subprog].start, - insn_idx, subprog); + insn_idx, subprog, + is_bpf_wq_set_callback_impl_kfunc(insn->imm)); if (!async_cb) return -EFAULT; callee = async_cb->frame[0]; @@ -11017,6 +11026,7 @@ enum special_kfunc_type { KF_bpf_percpu_obj_new_impl, KF_bpf_percpu_obj_drop_impl, KF_bpf_throw, + KF_bpf_wq_set_callback_impl, KF_bpf_iter_css_task_new, }; @@ -11041,6 +11051,7 @@ BTF_ID(func, bpf_dynptr_clone) BTF_ID(func, bpf_percpu_obj_new_impl) BTF_ID(func, bpf_percpu_obj_drop_impl) BTF_ID(func, bpf_throw) +BTF_ID(func, bpf_wq_set_callback_impl) #ifdef CONFIG_CGROUPS BTF_ID(func, bpf_iter_css_task_new) #endif @@ -11069,6 +11080,7 @@ BTF_ID(func, bpf_dynptr_clone) BTF_ID(func, bpf_percpu_obj_new_impl) BTF_ID(func, bpf_percpu_obj_drop_impl) BTF_ID(func, bpf_throw) +BTF_ID(func, bpf_wq_set_callback_impl) #ifdef CONFIG_CGROUPS BTF_ID(func, bpf_iter_css_task_new) #else @@ -11402,12 +11414,28 @@ static bool is_sync_callback_calling_kfunc(u32 btf_id) return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]; } +static bool is_async_callback_calling_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl]; +} + static bool is_bpf_throw_kfunc(struct bpf_insn *insn) { return bpf_pseudo_kfunc_call(insn) && insn->off == 0 && insn->imm == special_kfunc_list[KF_bpf_throw]; } +static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl]; +} + +static bool is_callback_calling_kfunc(u32 btf_id) +{ + return is_sync_callback_calling_kfunc(btf_id) || + is_async_callback_calling_kfunc(btf_id); +} + static bool is_rbtree_lock_required_kfunc(u32 btf_id) { return is_bpf_rbtree_api_kfunc(btf_id); @@ -12219,6 +12247,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } + if (is_bpf_wq_set_callback_impl_kfunc(meta.func_id)) { + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_timer_callback_state); + if (err) { + verbose(env, "kfunc %s#%d failed callback verification\n", + func_name, meta.func_id); + return err; + } + } + rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta); rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta); @@ -16968,6 +17006,9 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->active_rcu_lock != cur->active_rcu_lock) return false; + if (old->in_sleepable != cur->in_sleepable) + return false; + /* for states to be equal callsites have to be the same * and all frame states need to be equivalent */ @@ -19639,6 +19680,13 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); *cnt = 1; + } else if (is_bpf_wq_set_callback_impl_kfunc(desc->func_id)) { + struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(BPF_REG_4, (long)env->prog->aux) }; + + insn_buf[0] = ld_addrs[0]; + insn_buf[1] = ld_addrs[1]; + insn_buf[2] = *insn; + *cnt = 3; } return 0; } From 01b7b1c5f3cc029bdd2652eba61e953ccd286c0e Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:14 +0200 Subject: [PATCH 119/147] selftests/bpf: add checks for bpf_wq_set_callback() We assign the callback and set everything up. The actual tests of these callbacks will be done when bpf_wq_start() is available. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-14-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/bpf_experimental.h | 5 ++ .../selftests/bpf/bpf_testmod/bpf_testmod.c | 5 ++ .../bpf/bpf_testmod/bpf_testmod_kfunc.h | 1 + tools/testing/selftests/bpf/progs/wq.c | 41 ++++++++++-- .../testing/selftests/bpf/progs/wq_failures.c | 66 +++++++++++++++++++ 5 files changed, 111 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 785b91b629be..b80b39f76034 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -471,4 +471,9 @@ extern struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it) __ extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym; extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym; +extern int bpf_wq_set_callback_impl(struct bpf_wq *wq, + int (callback_fn)(void *map, int *key, struct bpf_wq *wq), + unsigned int flags__k, void *aux__ign) __ksym; +#define bpf_wq_set_callback(timer, cb, flags) \ + bpf_wq_set_callback_impl(timer, cb, flags, NULL) #endif diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 39ad96a18123..eb2b78552ca2 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -494,6 +494,10 @@ __bpf_kfunc static u32 bpf_kfunc_call_test_static_unused_arg(u32 arg, u32 unused return arg; } +__bpf_kfunc void bpf_kfunc_call_test_sleepable(void) +{ +} + BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc) BTF_ID_FLAGS(func, bpf_kfunc_call_test1) @@ -520,6 +524,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS | KF_RCU) BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE) BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg) BTF_ID_FLAGS(func, bpf_kfunc_call_test_offset) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_sleepable, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h index 7c664dd61059..ce5cd763561c 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h @@ -96,6 +96,7 @@ void bpf_kfunc_call_test_pass2(struct prog_test_pass2 *p) __ksym; void bpf_kfunc_call_test_mem_len_fail2(__u64 *mem, int len) __ksym; void bpf_kfunc_call_test_destructive(void) __ksym; +void bpf_kfunc_call_test_sleepable(void) __ksym; void bpf_kfunc_call_test_offset(struct prog_test_ref_kfunc *p); struct prog_test_member *bpf_kfunc_call_memb_acquire(void); diff --git a/tools/testing/selftests/bpf/progs/wq.c b/tools/testing/selftests/bpf/progs/wq.c index ed2fe26e14ef..dc301b52f91c 100644 --- a/tools/testing/selftests/bpf/progs/wq.c +++ b/tools/testing/selftests/bpf/progs/wq.c @@ -49,7 +49,8 @@ struct { __type(value, struct elem); } lru SEC(".maps"); -static int test_elem_callback(void *map, int *key) +static int test_elem_callback(void *map, int *key, + int (callback_fn)(void *map, int *key, struct bpf_wq *wq)) { struct elem init = {}, *val; struct bpf_wq *wq; @@ -66,10 +67,14 @@ static int test_elem_callback(void *map, int *key) if (bpf_wq_init(wq, map, 0) != 0) return -3; + if (bpf_wq_set_callback(wq, callback_fn, 0)) + return -4; + return 0; } -static int test_hmap_elem_callback(void *map, int *key) +static int test_hmap_elem_callback(void *map, int *key, + int (callback_fn)(void *map, int *key, struct bpf_wq *wq)) { struct hmap_elem init = {}, *val; struct bpf_wq *wq; @@ -85,6 +90,28 @@ static int test_hmap_elem_callback(void *map, int *key) if (bpf_wq_init(wq, map, 0) != 0) return -3; + if (bpf_wq_set_callback(wq, callback_fn, 0)) + return -4; + + return 0; +} + +__u32 ok; +__u32 ok_sleepable; + +/* callback for non sleepable workqueue */ +static int wq_callback(void *map, int *key, struct bpf_wq *work) +{ + bpf_kfunc_common_test(); + ok |= (1 << *key); + return 0; +} + +/* callback for sleepable workqueue */ +static int wq_cb_sleepable(void *map, int *key, struct bpf_wq *work) +{ + bpf_kfunc_call_test_sleepable(); + ok_sleepable |= (1 << *key); return 0; } @@ -95,7 +122,7 @@ long test_call_array_sleepable(void *ctx) { int key = 0; - return test_elem_callback(&array, &key); + return test_elem_callback(&array, &key, wq_cb_sleepable); } SEC("syscall") @@ -105,7 +132,7 @@ long test_syscall_array_sleepable(void *ctx) { int key = 1; - return test_elem_callback(&array, &key); + return test_elem_callback(&array, &key, wq_cb_sleepable); } SEC("tc") @@ -115,7 +142,7 @@ long test_call_hash_sleepable(void *ctx) { int key = 2; - return test_hmap_elem_callback(&hmap, &key); + return test_hmap_elem_callback(&hmap, &key, wq_callback); } SEC("tc") @@ -125,7 +152,7 @@ long test_call_hash_malloc_sleepable(void *ctx) { int key = 3; - return test_hmap_elem_callback(&hmap_malloc, &key); + return test_hmap_elem_callback(&hmap_malloc, &key, wq_callback); } SEC("tc") @@ -135,5 +162,5 @@ long test_call_lru_sleepable(void *ctx) { int key = 4; - return test_elem_callback(&lru, &key); + return test_elem_callback(&lru, &key, wq_callback); } diff --git a/tools/testing/selftests/bpf/progs/wq_failures.c b/tools/testing/selftests/bpf/progs/wq_failures.c index db7015c7d541..4cbdb425f223 100644 --- a/tools/testing/selftests/bpf/progs/wq_failures.c +++ b/tools/testing/selftests/bpf/progs/wq_failures.c @@ -27,6 +27,20 @@ struct { __type(value, struct elem); } lru SEC(".maps"); +/* callback for non sleepable workqueue */ +static int wq_callback(void *map, int *key, struct bpf_wq *work) +{ + bpf_kfunc_common_test(); + return 0; +} + +/* callback for sleepable workqueue */ +static int wq_cb_sleepable(void *map, int *key, struct bpf_wq *work) +{ + bpf_kfunc_call_test_sleepable(); + return 0; +} + SEC("tc") /* test that bpf_wq_init takes a map as a second argument */ @@ -76,3 +90,55 @@ long test_wq_init_wrong_map(void *ctx) return 0; } + +SEC("?tc") +__log_level(2) +__failure +/* check that the first argument of bpf_wq_set_callback() + * is a correct bpf_wq pointer. + */ +__msg(": (85) call bpf_wq_set_callback_impl#") /* anchor message */ +__msg("arg#0 doesn't point to a map value") +long test_wrong_wq_pointer(void *ctx) +{ + int key = 0; + struct bpf_wq *wq; + + wq = bpf_map_lookup_elem(&array, &key); + if (!wq) + return 1; + + if (bpf_wq_init(wq, &array, 0)) + return 2; + + if (bpf_wq_set_callback((void *)&wq, wq_callback, 0)) + return 3; + + return -22; +} + +SEC("?tc") +__log_level(2) +__failure +/* check that the first argument of bpf_wq_set_callback() + * is a correct bpf_wq pointer. + */ +__msg(": (85) call bpf_wq_set_callback_impl#") /* anchor message */ +__msg("off 1 doesn't point to 'struct bpf_wq' that is at 0") +long test_wrong_wq_pointer_offset(void *ctx) +{ + int key = 0; + struct bpf_wq *wq; + + wq = bpf_map_lookup_elem(&array, &key); + if (!wq) + return 1; + + if (bpf_wq_init(wq, &array, 0)) + return 2; + + if (bpf_wq_set_callback((void *)wq + 1, wq_cb_sleepable, 0)) + return 3; + + return -22; +} From 8e83da9732d91c60fdc651b2486c8e5935eb0ca2 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:15 +0200 Subject: [PATCH 120/147] bpf: add bpf_wq_start again, copy/paste from bpf_timer_start(). Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-15-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index dd2eaaf77c93..047a21b7e4ba 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2711,6 +2711,23 @@ __bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) return __bpf_async_init(async, map, flags, BPF_ASYNC_TYPE_WQ); } +__bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) +{ + struct bpf_async_kern *async = (struct bpf_async_kern *)wq; + struct bpf_work *w; + + if (in_nmi()) + return -EOPNOTSUPP; + if (flags) + return -EINVAL; + w = READ_ONCE(async->work); + if (!w || !READ_ONCE(w->cb.prog)) + return -EINVAL; + + schedule_work(&w->work); + return 0; +} + __bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq, int (callback_fn)(void *map, int *key, struct bpf_wq *wq), unsigned int flags, @@ -2804,6 +2821,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_clone) BTF_ID_FLAGS(func, bpf_modify_return_test_tp) BTF_ID_FLAGS(func, bpf_wq_init) BTF_ID_FLAGS(func, bpf_wq_set_callback_impl) +BTF_ID_FLAGS(func, bpf_wq_start) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { From 8290dba51910d36721ced6ccf03049ed6b7ea2ce Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:16 +0200 Subject: [PATCH 121/147] selftests/bpf: wq: add bpf_wq_start() checks Allows to test if allocation/free works Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-16-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/bpf_experimental.h | 1 + tools/testing/selftests/bpf/prog_tests/wq.c | 22 +++++++++++++++++++ tools/testing/selftests/bpf/progs/wq.c | 20 ++++++++++++++--- 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index b80b39f76034..93c5a6c446b3 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -471,6 +471,7 @@ extern struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it) __ extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym; extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym; +extern int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __weak __ksym; extern int bpf_wq_set_callback_impl(struct bpf_wq *wq, int (callback_fn)(void *map, int *key, struct bpf_wq *wq), unsigned int flags__k, void *aux__ign) __ksym; diff --git a/tools/testing/selftests/bpf/prog_tests/wq.c b/tools/testing/selftests/bpf/prog_tests/wq.c index 26ab69796103..8a4a91d944cc 100644 --- a/tools/testing/selftests/bpf/prog_tests/wq.c +++ b/tools/testing/selftests/bpf/prog_tests/wq.c @@ -6,9 +6,31 @@ void serial_test_wq(void) { + struct wq *wq_skel = NULL; + int err, prog_fd; + LIBBPF_OPTS(bpf_test_run_opts, topts); RUN_TESTS(wq); + + /* re-run the success test to check if the timer was actually executed */ + + wq_skel = wq__open_and_load(); + if (!ASSERT_OK_PTR(wq_skel, "wq_skel_load")) + return; + + err = wq__attach(wq_skel); + if (!ASSERT_OK(err, "wq_attach")) + return; + + prog_fd = bpf_program__fd(wq_skel->progs.test_syscall_array_sleepable); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(topts.retval, 0, "test_run"); + + usleep(50); /* 10 usecs should be enough, but give it extra */ + + ASSERT_EQ(wq_skel->bss->ok_sleepable, (1 << 1), "ok_sleepable"); } void serial_test_failures_wq(void) diff --git a/tools/testing/selftests/bpf/progs/wq.c b/tools/testing/selftests/bpf/progs/wq.c index dc301b52f91c..49e712acbf60 100644 --- a/tools/testing/selftests/bpf/progs/wq.c +++ b/tools/testing/selftests/bpf/progs/wq.c @@ -49,12 +49,19 @@ struct { __type(value, struct elem); } lru SEC(".maps"); +__u32 ok; +__u32 ok_sleepable; + static int test_elem_callback(void *map, int *key, int (callback_fn)(void *map, int *key, struct bpf_wq *wq)) { struct elem init = {}, *val; struct bpf_wq *wq; + if ((ok & (1 << *key) || + (ok_sleepable & (1 << *key)))) + return -22; + if (map == &lru && bpf_map_update_elem(map, key, &init, 0)) return -1; @@ -70,6 +77,9 @@ static int test_elem_callback(void *map, int *key, if (bpf_wq_set_callback(wq, callback_fn, 0)) return -4; + if (bpf_wq_start(wq, 0)) + return -5; + return 0; } @@ -79,6 +89,10 @@ static int test_hmap_elem_callback(void *map, int *key, struct hmap_elem init = {}, *val; struct bpf_wq *wq; + if ((ok & (1 << *key) || + (ok_sleepable & (1 << *key)))) + return -22; + if (bpf_map_update_elem(map, key, &init, 0)) return -1; @@ -93,12 +107,12 @@ static int test_hmap_elem_callback(void *map, int *key, if (bpf_wq_set_callback(wq, callback_fn, 0)) return -4; + if (bpf_wq_start(wq, 0)) + return -5; + return 0; } -__u32 ok; -__u32 ok_sleepable; - /* callback for non sleepable workqueue */ static int wq_callback(void *map, int *key, struct bpf_wq *work) { From dc92febf7b93da5049fe177804e6b1961fcc6bd7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 24 Apr 2024 09:00:23 -0700 Subject: [PATCH 122/147] bpf: Don't check for recursion in bpf_wq_work. __bpf_prog_enter_sleepable_recur does recursion check which is not applicable to wq callback. The callback function is part of bpf program and bpf prog might be running on the same cpu. So recursion check would incorrectly prevent callback from running. The code can call __bpf_prog_enter_sleepable(), but run_ctx would be fake, hence use explicit rcu_read_lock_trace(); migrate_disable(); to address this problem. Another reason to open code is __bpf_prog_enter* are not available in !JIT configs. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202404241719.IIGdpAku-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202404241811.FFV4Bku3-lkp@intel.com/ Fixes: eb48f6cd41a0 ("bpf: wq: add bpf_wq_init") Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 047a21b7e4ba..5f61ffa7505a 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1178,9 +1178,7 @@ out: static void bpf_wq_work(struct work_struct *work) { struct bpf_work *w = container_of(work, struct bpf_work, work); - struct bpf_tramp_run_ctx __maybe_unused run_ctx; struct bpf_async_cb *cb = &w->cb; - struct bpf_prog *prog = cb->prog; struct bpf_map *map = cb->map; bpf_callback_t callback_fn; void *value = cb->value; @@ -1190,7 +1188,7 @@ static void bpf_wq_work(struct work_struct *work) BTF_TYPE_EMIT(struct bpf_wq); callback_fn = READ_ONCE(cb->callback_fn); - if (!callback_fn || !prog) + if (!callback_fn) return; if (map->map_type == BPF_MAP_TYPE_ARRAY) { @@ -1203,19 +1201,13 @@ static void bpf_wq_work(struct work_struct *work) key = value - round_up(map->key_size, 8); } - run_ctx.bpf_cookie = 0; - - if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) { - /* recursion detected */ - __bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx); - return; - } + rcu_read_lock_trace(); + migrate_disable(); callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0); - /* The verifier checked that return value is zero. */ - __bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */, - &run_ctx); + migrate_enable(); + rcu_read_unlock_trace(); } static void bpf_wq_delete_work(struct work_struct *work) From fc7566ad0a826cdc8886c5dbbb39ce72a0dc6333 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 24 Apr 2024 03:13:14 +0000 Subject: [PATCH 123/147] bpf: Introduce bpf_preempt_[disable,enable] kfuncs Introduce two new BPF kfuncs, bpf_preempt_disable and bpf_preempt_enable. These kfuncs allow disabling preemption in BPF programs. Nesting is allowed, since the intended use cases includes building native BPF spin locks without kernel helper involvement. Apart from that, this can be used to per-CPU data structures for cases where programs (or userspace) may preempt one or the other. Currently, while per-CPU access is stable, whether it will be consistent is not guaranteed, as only migration is disabled for BPF programs. Global functions are disallowed from being called, but support for them will be added as a follow up not just preempt kfuncs, but rcu_read_lock kfuncs as well. Static subprog calls are permitted. Sleepable helpers and kfuncs are disallowed in non-preemptible regions. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20240424031315.2757363-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/helpers.c | 12 ++++++ kernel/bpf/verifier.c | 71 +++++++++++++++++++++++++++++++++++- 3 files changed, 82 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 9db35530c878..50aa87f8d77f 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -421,6 +421,7 @@ struct bpf_verifier_state { struct bpf_active_lock active_lock; bool speculative; bool active_rcu_lock; + u32 active_preempt_lock; /* If this state was ever pointed-to by other state's loop_entry field * this flag would be set to true. Used to avoid freeing such states * while they are still in use. diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5f61ffa7505a..14c401b78e12 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2734,6 +2734,16 @@ __bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq, return __bpf_async_set_callback(async, callback_fn, aux, flags, BPF_ASYNC_TYPE_WQ); } +__bpf_kfunc void bpf_preempt_disable(void) +{ + preempt_disable(); +} + +__bpf_kfunc void bpf_preempt_enable(void) +{ + preempt_enable(); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(generic_btf_ids) @@ -2814,6 +2824,8 @@ BTF_ID_FLAGS(func, bpf_modify_return_test_tp) BTF_ID_FLAGS(func, bpf_wq_init) BTF_ID_FLAGS(func, bpf_wq_set_callback_impl) BTF_ID_FLAGS(func, bpf_wq_start) +BTF_ID_FLAGS(func, bpf_preempt_disable) +BTF_ID_FLAGS(func, bpf_preempt_enable) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9715c88cc025..8312c7a0ee19 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1434,6 +1434,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, } dst_state->speculative = src->speculative; dst_state->active_rcu_lock = src->active_rcu_lock; + dst_state->active_preempt_lock = src->active_preempt_lock; dst_state->in_sleepable = src->in_sleepable; dst_state->curframe = src->curframe; dst_state->active_lock.ptr = src->active_lock.ptr; @@ -9599,6 +9600,13 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EINVAL; } + /* Only global subprogs cannot be called with preemption disabled. */ + if (env->cur_state->active_preempt_lock) { + verbose(env, "global function calls are not allowed with preemption disabled,\n" + "use static function instead\n"); + return -EINVAL; + } + if (err) { verbose(env, "Caller passes invalid args into func#%d ('%s')\n", subprog, sub_name); @@ -10285,6 +10293,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn env->insn_aux_data[insn_idx].storage_get_func_atomic = true; } + if (env->cur_state->active_preempt_lock) { + if (fn->might_sleep) { + verbose(env, "sleepable helper %s#%d in non-preemptible region\n", + func_id_name(func_id), func_id); + return -EINVAL; + } + + if (in_sleepable(env) && is_storage_get_function(func_id)) + env->insn_aux_data[insn_idx].storage_get_func_atomic = true; + } + meta.func_id = func_id; /* check args */ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { @@ -11027,6 +11046,8 @@ enum special_kfunc_type { KF_bpf_percpu_obj_drop_impl, KF_bpf_throw, KF_bpf_wq_set_callback_impl, + KF_bpf_preempt_disable, + KF_bpf_preempt_enable, KF_bpf_iter_css_task_new, }; @@ -11081,6 +11102,8 @@ BTF_ID(func, bpf_percpu_obj_new_impl) BTF_ID(func, bpf_percpu_obj_drop_impl) BTF_ID(func, bpf_throw) BTF_ID(func, bpf_wq_set_callback_impl) +BTF_ID(func, bpf_preempt_disable) +BTF_ID(func, bpf_preempt_enable) #ifdef CONFIG_CGROUPS BTF_ID(func, bpf_iter_css_task_new) #else @@ -11107,6 +11130,16 @@ static bool is_kfunc_bpf_rcu_read_unlock(struct bpf_kfunc_call_arg_meta *meta) return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_unlock]; } +static bool is_kfunc_bpf_preempt_disable(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->func_id == special_kfunc_list[KF_bpf_preempt_disable]; +} + +static bool is_kfunc_bpf_preempt_enable(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->func_id == special_kfunc_list[KF_bpf_preempt_enable]; +} + static enum kfunc_ptr_arg_type get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, @@ -12195,11 +12228,11 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { - const struct btf_type *t, *ptr_type; + bool sleepable, rcu_lock, rcu_unlock, preempt_disable, preempt_enable; u32 i, nargs, ptr_type_id, release_ref_obj_id; struct bpf_reg_state *regs = cur_regs(env); const char *func_name, *ptr_type_name; - bool sleepable, rcu_lock, rcu_unlock; + const struct btf_type *t, *ptr_type; struct bpf_kfunc_call_arg_meta meta; struct bpf_insn_aux_data *insn_aux; int err, insn_idx = *insn_idx_p; @@ -12260,6 +12293,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta); rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta); + preempt_disable = is_kfunc_bpf_preempt_disable(&meta); + preempt_enable = is_kfunc_bpf_preempt_enable(&meta); + if (env->cur_state->active_rcu_lock) { struct bpf_func_state *state; struct bpf_reg_state *reg; @@ -12292,6 +12328,22 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EINVAL; } + if (env->cur_state->active_preempt_lock) { + if (preempt_disable) { + env->cur_state->active_preempt_lock++; + } else if (preempt_enable) { + env->cur_state->active_preempt_lock--; + } else if (sleepable) { + verbose(env, "kernel func %s is sleepable within non-preemptible region\n", func_name); + return -EACCES; + } + } else if (preempt_disable) { + env->cur_state->active_preempt_lock++; + } else if (preempt_enable) { + verbose(env, "unmatched attempt to enable preemption (kernel function %s)\n", func_name); + return -EINVAL; + } + /* In case of release function, we get register number of refcounted * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now. */ @@ -15439,6 +15491,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if (env->cur_state->active_preempt_lock) { + verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_preempt_disable-ed region\n"); + return -EINVAL; + } + if (regs[ctx_reg].type != PTR_TO_CTX) { verbose(env, "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); @@ -17006,6 +17063,9 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->active_rcu_lock != cur->active_rcu_lock) return false; + if (old->active_preempt_lock != cur->active_preempt_lock) + return false; + if (old->in_sleepable != cur->in_sleepable) return false; @@ -17957,6 +18017,13 @@ process_bpf_exit_full: return -EINVAL; } + if (env->cur_state->active_preempt_lock && !env->cur_state->curframe) { + verbose(env, "%d bpf_preempt_enable%s missing\n", + env->cur_state->active_preempt_lock, + env->cur_state->active_preempt_lock == 1 ? " is" : "(s) are"); + return -EINVAL; + } + /* We must do check_reference_leak here before * prepare_func_exit to handle the case when * state->curframe > 0, it may be a callback From 3134396f1cba939783b87c63dae3a54708285a9a Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 24 Apr 2024 03:13:15 +0000 Subject: [PATCH 124/147] selftests/bpf: Add tests for preempt kfuncs Add tests for nested cases, nested count preservation upon different subprog calls that disable/enable preemption, and test sleepable helper call in non-preemptible regions. 182/1 preempt_lock/preempt_lock_missing_1:OK 182/2 preempt_lock/preempt_lock_missing_2:OK 182/3 preempt_lock/preempt_lock_missing_3:OK 182/4 preempt_lock/preempt_lock_missing_3_minus_2:OK 182/5 preempt_lock/preempt_lock_missing_1_subprog:OK 182/6 preempt_lock/preempt_lock_missing_2_subprog:OK 182/7 preempt_lock/preempt_lock_missing_2_minus_1_subprog:OK 182/8 preempt_lock/preempt_balance:OK 182/9 preempt_lock/preempt_balance_subprog_test:OK 182/10 preempt_lock/preempt_global_subprog_test:OK 182/11 preempt_lock/preempt_sleepable_helper:OK 182 preempt_lock:OK Summary: 1/11 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20240424031315.2757363-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/preempt_lock.c | 9 ++ .../selftests/bpf/progs/preempt_lock.c | 135 ++++++++++++++++++ 2 files changed, 144 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/preempt_lock.c create mode 100644 tools/testing/selftests/bpf/progs/preempt_lock.c diff --git a/tools/testing/selftests/bpf/prog_tests/preempt_lock.c b/tools/testing/selftests/bpf/prog_tests/preempt_lock.c new file mode 100644 index 000000000000..02917c672441 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/preempt_lock.c @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +void test_preempt_lock(void) +{ + RUN_TESTS(preempt_lock); +} diff --git a/tools/testing/selftests/bpf/progs/preempt_lock.c b/tools/testing/selftests/bpf/progs/preempt_lock.c new file mode 100644 index 000000000000..6c637ee01ec4 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/preempt_lock.c @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include "bpf_misc.h" + +void bpf_preempt_disable(void) __ksym; +void bpf_preempt_enable(void) __ksym; + +SEC("?tc") +__failure __msg("1 bpf_preempt_enable is missing") +int preempt_lock_missing_1(struct __sk_buff *ctx) +{ + bpf_preempt_disable(); + return 0; +} + +SEC("?tc") +__failure __msg("2 bpf_preempt_enable(s) are missing") +int preempt_lock_missing_2(struct __sk_buff *ctx) +{ + bpf_preempt_disable(); + bpf_preempt_disable(); + return 0; +} + +SEC("?tc") +__failure __msg("3 bpf_preempt_enable(s) are missing") +int preempt_lock_missing_3(struct __sk_buff *ctx) +{ + bpf_preempt_disable(); + bpf_preempt_disable(); + bpf_preempt_disable(); + return 0; +} + +SEC("?tc") +__failure __msg("1 bpf_preempt_enable is missing") +int preempt_lock_missing_3_minus_2(struct __sk_buff *ctx) +{ + bpf_preempt_disable(); + bpf_preempt_disable(); + bpf_preempt_disable(); + bpf_preempt_enable(); + bpf_preempt_enable(); + return 0; +} + +static __noinline void preempt_disable(void) +{ + bpf_preempt_disable(); +} + +static __noinline void preempt_enable(void) +{ + bpf_preempt_enable(); +} + +SEC("?tc") +__failure __msg("1 bpf_preempt_enable is missing") +int preempt_lock_missing_1_subprog(struct __sk_buff *ctx) +{ + preempt_disable(); + return 0; +} + +SEC("?tc") +__failure __msg("2 bpf_preempt_enable(s) are missing") +int preempt_lock_missing_2_subprog(struct __sk_buff *ctx) +{ + preempt_disable(); + preempt_disable(); + return 0; +} + +SEC("?tc") +__failure __msg("1 bpf_preempt_enable is missing") +int preempt_lock_missing_2_minus_1_subprog(struct __sk_buff *ctx) +{ + preempt_disable(); + preempt_disable(); + preempt_enable(); + return 0; +} + +static __noinline void preempt_balance_subprog(void) +{ + preempt_disable(); + preempt_enable(); +} + +SEC("?tc") +__success int preempt_balance(struct __sk_buff *ctx) +{ + bpf_preempt_disable(); + bpf_preempt_enable(); + return 0; +} + +SEC("?tc") +__success int preempt_balance_subprog_test(struct __sk_buff *ctx) +{ + preempt_balance_subprog(); + return 0; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__failure __msg("sleepable helper bpf_copy_from_user#") +int preempt_sleepable_helper(void *ctx) +{ + u32 data; + + bpf_preempt_disable(); + bpf_copy_from_user(&data, sizeof(data), NULL); + bpf_preempt_enable(); + return 0; +} + +int __noinline preempt_global_subprog(void) +{ + preempt_balance_subprog(); + return 0; +} + +SEC("?tc") +__failure __msg("global function calls are not allowed with preemption disabled") +int preempt_global_subprog_test(struct __sk_buff *ctx) +{ + preempt_disable(); + preempt_global_subprog(); + preempt_enable(); + return 0; +} + +char _license[] SEC("license") = "GPL"; From 151f7442436658ee84076681d8f52e987fe147ea Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 23 Apr 2024 18:35:27 +0800 Subject: [PATCH 125/147] selftests/bpf: Fix a fd leak in error paths in open_netns As Martin mentioned in review comment, there is an existing bug that orig_netns_fd will be leaked in the later "goto fail;" case after open("/proc/self/ns/net") in open_netns() in network_helpers.c. This patch adds "close(token->orig_netns_fd);" before "free(token);" to fix it. Fixes: a30338840fa5 ("selftests/bpf: Move open_netns() and close_netns() into network_helpers.c") Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/a104040b47c3c34c67f3f125cdfdde244a870d3c.1713868264.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/network_helpers.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 9d63d2ac13d8..04676572fc1e 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -478,6 +478,8 @@ struct nstoken *open_netns(const char *name) return token; fail: + if (token->orig_netns_fd != -1) + close(token->orig_netns_fd); free(token); return NULL; } From 285cffbaa8e6056c2595e07e3a320e55c71870ad Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 23 Apr 2024 18:35:28 +0800 Subject: [PATCH 126/147] selftests/bpf: Use log_err in open_netns/close_netns ASSERT helpers defined in test_progs.h shouldn't be used in public functions like open_netns() and close_netns(). Since they depend on test__fail() which defined in test_progs.c. Public functions may be used not only in test_progs.c, but in other tests like test_sock_addr.c in the next commit. This patch uses log_err() to replace ASSERT helpers in open_netns() and close_netns() in network_helpers.c to decouple dependencies, then uses ASSERT_OK_PTR() to check the return values of all open_netns(). Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/d1dad22b2ff4909af3f8bfd0667d046e235303cb.1713868264.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/network_helpers.c | 19 ++++++++++++++----- .../selftests/bpf/prog_tests/empty_skb.c | 2 ++ .../bpf/prog_tests/ip_check_defrag.c | 2 ++ .../selftests/bpf/prog_tests/tc_redirect.c | 2 +- .../selftests/bpf/prog_tests/test_tunnel.c | 4 ++++ .../selftests/bpf/prog_tests/xdp_metadata.c | 16 ++++++++++++++++ 6 files changed, 39 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 04676572fc1e..a9e4905a2115 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -459,22 +459,30 @@ struct nstoken *open_netns(const char *name) struct nstoken *token; token = calloc(1, sizeof(struct nstoken)); - if (!ASSERT_OK_PTR(token, "malloc token")) + if (!token) { + log_err("Failed to malloc token"); return NULL; + } token->orig_netns_fd = open("/proc/self/ns/net", O_RDONLY); - if (!ASSERT_GE(token->orig_netns_fd, 0, "open /proc/self/ns/net")) + if (token->orig_netns_fd == -1) { + log_err("Failed to open(/proc/self/ns/net)"); goto fail; + } snprintf(nspath, sizeof(nspath), "%s/%s", "/var/run/netns", name); nsfd = open(nspath, O_RDONLY | O_CLOEXEC); - if (!ASSERT_GE(nsfd, 0, "open netns fd")) + if (nsfd == -1) { + log_err("Failed to open(%s)", nspath); goto fail; + } err = setns(nsfd, CLONE_NEWNET); close(nsfd); - if (!ASSERT_OK(err, "setns")) + if (err) { + log_err("Failed to setns(nsfd)"); goto fail; + } return token; fail: @@ -489,7 +497,8 @@ void close_netns(struct nstoken *token) if (!token) return; - ASSERT_OK(setns(token->orig_netns_fd, CLONE_NEWNET), "setns"); + if (setns(token->orig_netns_fd, CLONE_NEWNET)) + log_err("Failed to setns(orig_netns_fd)"); close(token->orig_netns_fd); free(token); } diff --git a/tools/testing/selftests/bpf/prog_tests/empty_skb.c b/tools/testing/selftests/bpf/prog_tests/empty_skb.c index 261228eb68e8..438583e1f2d1 100644 --- a/tools/testing/selftests/bpf/prog_tests/empty_skb.c +++ b/tools/testing/selftests/bpf/prog_tests/empty_skb.c @@ -94,6 +94,8 @@ void test_empty_skb(void) SYS(out, "ip netns add empty_skb"); tok = open_netns("empty_skb"); + if (!ASSERT_OK_PTR(tok, "setns")) + goto out; SYS(out, "ip link add veth0 type veth peer veth1"); SYS(out, "ip link set dev veth0 up"); SYS(out, "ip link set dev veth1 up"); diff --git a/tools/testing/selftests/bpf/prog_tests/ip_check_defrag.c b/tools/testing/selftests/bpf/prog_tests/ip_check_defrag.c index 8dd2af9081f4..284764e7179f 100644 --- a/tools/testing/selftests/bpf/prog_tests/ip_check_defrag.c +++ b/tools/testing/selftests/bpf/prog_tests/ip_check_defrag.c @@ -88,6 +88,8 @@ static int attach(struct ip_check_defrag *skel, bool ipv6) int err = -1; nstoken = open_netns(NS1); + if (!ASSERT_OK_PTR(nstoken, "setns")) + goto out; skel->links.defrag = bpf_program__attach_netfilter(skel->progs.defrag, &opts); if (!ASSERT_OK_PTR(skel->links.defrag, "program attach")) diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index dbe06aeaa2b2..b1073d36d77a 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -530,7 +530,7 @@ static int wait_netstamp_needed_key(void) __u64 tstamp = 0; nstoken = open_netns(NS_DST); - if (!nstoken) + if (!ASSERT_OK_PTR(nstoken, "setns dst")) return -1; srv_fd = start_server(AF_INET6, SOCK_DGRAM, "::1", 0, 0); diff --git a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c index 5f1fb0a2ea56..cec746e77cd3 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c +++ b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c @@ -612,6 +612,8 @@ static void test_ipip_tunnel(enum ipip_encap encap) /* ping from at_ns0 namespace test */ nstoken = open_netns("at_ns0"); + if (!ASSERT_OK_PTR(nstoken, "setns")) + goto done; err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV1); if (!ASSERT_OK(err, "test_ping")) goto done; @@ -666,6 +668,8 @@ static void test_xfrm_tunnel(void) /* ping from at_ns0 namespace test */ nstoken = open_netns("at_ns0"); + if (!ASSERT_OK_PTR(nstoken, "setns")) + goto done; err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV1); close_netns(nstoken); if (!ASSERT_OK(err, "test_ping")) diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c index 05edcf32f528..f76b5d67a3ee 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c @@ -384,6 +384,8 @@ void test_xdp_metadata(void) SYS(out, "ip netns add " RX_NETNS_NAME); tok = open_netns(TX_NETNS_NAME); + if (!ASSERT_OK_PTR(tok, "setns")) + goto out; SYS(out, "ip link add numtxqueues 1 numrxqueues 1 " TX_NAME " type veth peer " RX_NAME " numtxqueues 1 numrxqueues 1"); SYS(out, "ip link set " RX_NAME " netns " RX_NETNS_NAME); @@ -400,6 +402,8 @@ void test_xdp_metadata(void) SYS(out, "ip -4 neigh add " RX_ADDR " lladdr " RX_MAC " dev " TX_NAME_VLAN); switch_ns_to_rx(&tok); + if (!ASSERT_OK_PTR(tok, "setns rx")) + goto out; SYS(out, "ip link set dev " RX_NAME " address " RX_MAC); SYS(out, "ip link set dev " RX_NAME " up"); @@ -449,6 +453,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_tx(&tok); + if (!ASSERT_OK_PTR(tok, "setns tx")) + goto out; /* Setup separate AF_XDP for TX interface nad send packet to the RX socket. */ tx_ifindex = if_nametoindex(TX_NAME); @@ -461,6 +467,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_rx(&tok); + if (!ASSERT_OK_PTR(tok, "setns rx")) + goto out; /* Verify packet sent from AF_XDP has proper metadata. */ if (!ASSERT_GE(verify_xsk_metadata(&rx_xsk, true), 0, @@ -468,6 +476,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_tx(&tok); + if (!ASSERT_OK_PTR(tok, "setns tx")) + goto out; complete_tx(&tx_xsk); /* Now check metadata of packet, generated with network stack */ @@ -475,6 +485,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_rx(&tok); + if (!ASSERT_OK_PTR(tok, "setns rx")) + goto out; if (!ASSERT_GE(verify_xsk_metadata(&rx_xsk, false), 0, "verify_xsk_metadata")) @@ -498,6 +510,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_tx(&tok); + if (!ASSERT_OK_PTR(tok, "setns tx")) + goto out; /* Send packet to trigger . */ if (!ASSERT_GE(generate_packet(&tx_xsk, AF_XDP_CONSUMER_PORT), 0, @@ -505,6 +519,8 @@ void test_xdp_metadata(void) goto out; switch_ns_to_rx(&tok); + if (!ASSERT_OK_PTR(tok, "setns rx")) + goto out; while (!retries--) { if (bpf_obj2->bss->called) From e1cdb70d075e02b1f410b8446a8ff959fa15f0ee Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 23 Apr 2024 18:35:29 +0800 Subject: [PATCH 127/147] selftests/bpf: Use start_server_addr in test_sock_addr Include network_helpers.h in test_sock_addr.c, use the newly added public helper start_server_addr() instead of the local defined function start_server(). This can avoid duplicate code. In order to use functions defined in network_helpers.c in test_sock_addr.c, Makefile needs to be updated and needs to be included in network_helpers.h to avoid compilation errors. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/3101f57bde5502383eb41723c8956cc26be06893.1713868264.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/Makefile | 3 +- tools/testing/selftests/bpf/network_helpers.h | 1 + tools/testing/selftests/bpf/test_sock_addr.c | 38 ++----------------- 3 files changed, 7 insertions(+), 35 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index edc73f8f5aef..0ce5c3c26b5c 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -290,11 +290,12 @@ UNPRIV_HELPERS := $(OUTPUT)/unpriv_helpers.o TRACE_HELPERS := $(OUTPUT)/trace_helpers.o JSON_WRITER := $(OUTPUT)/json_writer.o CAP_HELPERS := $(OUTPUT)/cap_helpers.o +NETWORK_HELPERS := $(OUTPUT)/network_helpers.o $(OUTPUT)/test_dev_cgroup: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_skb_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sock: $(CGROUP_HELPERS) $(TESTING_HELPERS) -$(OUTPUT)/test_sock_addr: $(CGROUP_HELPERS) $(TESTING_HELPERS) +$(OUTPUT)/test_sock_addr: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(NETWORK_HELPERS) $(OUTPUT)/test_sockmap: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_tcpnotify_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(TRACE_HELPERS) $(OUTPUT)/get_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index aef297dfa6ca..5a8c5cf4ec1a 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -11,6 +11,7 @@ typedef __u16 __sum16; #include #include #include +#include #include #include #include diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index 80c42583f597..ca4fb142b7b7 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -19,6 +19,7 @@ #include #include "cgroup_helpers.h" +#include "network_helpers.h" #include "testing_helpers.h" #include "bpf_util.h" @@ -939,37 +940,6 @@ static int cmp_peer_addr(int sock1, const struct sockaddr_storage *addr2) return cmp_sock_addr(getpeername, sock1, addr2, /*cmp_port*/ 1); } -static int start_server(int type, const struct sockaddr_storage *addr, - socklen_t addr_len) -{ - int fd; - - fd = socket(addr->ss_family, type, 0); - if (fd == -1) { - log_err("Failed to create server socket"); - goto out; - } - - if (bind(fd, (const struct sockaddr *)addr, addr_len) == -1) { - log_err("Failed to bind server socket"); - goto close_out; - } - - if (type == SOCK_STREAM) { - if (listen(fd, 128) == -1) { - log_err("Failed to listen on server socket"); - goto close_out; - } - } - - goto out; -close_out: - close(fd); - fd = -1; -out: - return fd; -} - static int connect_to_server(int type, const struct sockaddr_storage *addr, socklen_t addr_len) { @@ -1178,7 +1148,7 @@ static int run_bind_test_case(const struct sock_addr_test *test) if (init_addrs(test, &requested_addr, &expected_addr, NULL)) goto err; - servfd = start_server(test->type, &requested_addr, addr_len); + servfd = start_server_addr(test->type, &requested_addr, addr_len, NULL); if (servfd == -1) goto err; @@ -1214,7 +1184,7 @@ static int run_connect_test_case(const struct sock_addr_test *test) goto err; /* Prepare server to connect to */ - servfd = start_server(test->type, &expected_addr, addr_len); + servfd = start_server_addr(test->type, &expected_addr, addr_len, NULL); if (servfd == -1) goto err; @@ -1271,7 +1241,7 @@ static int run_xmsg_test_case(const struct sock_addr_test *test, int max_cmsg) goto err; /* Prepare server to sendmsg to */ - servfd = start_server(test->type, &server_addr, addr_len); + servfd = start_server_addr(test->type, &server_addr, addr_len, NULL); if (servfd == -1) goto err; From c6c40798428180516df80ce89da7bbfe1f6a828a Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 23 Apr 2024 18:35:30 +0800 Subject: [PATCH 128/147] selftests/bpf: Use connect_to_addr in test_sock_addr This patch uses public network helper connect_to_addr() exported in network_helpers.h instead of the local defined function connect_to_server() in test_sock_addr.c. This can avoid duplicate code. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/f263797712d93fdfaf2943585c5dfae56714a00b.1713868264.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/test_sock_addr.c | 36 ++------------------ 1 file changed, 2 insertions(+), 34 deletions(-) diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index ca4fb142b7b7..2e29dd9d8fc3 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -940,38 +940,6 @@ static int cmp_peer_addr(int sock1, const struct sockaddr_storage *addr2) return cmp_sock_addr(getpeername, sock1, addr2, /*cmp_port*/ 1); } -static int connect_to_server(int type, const struct sockaddr_storage *addr, - socklen_t addr_len) -{ - int domain; - int fd = -1; - - domain = addr->ss_family; - - if (domain != AF_INET && domain != AF_INET6) { - log_err("Unsupported address family"); - goto err; - } - - fd = socket(domain, type, 0); - if (fd == -1) { - log_err("Failed to create client socket"); - goto err; - } - - if (connect(fd, (const struct sockaddr *)addr, addr_len) == -1) { - log_err("Fail to connect to server"); - goto err; - } - - goto out; -err: - close(fd); - fd = -1; -out: - return fd; -} - int init_pktinfo(int domain, struct cmsghdr *cmsg) { struct in6_pktinfo *pktinfo6; @@ -1156,7 +1124,7 @@ static int run_bind_test_case(const struct sock_addr_test *test) goto err; /* Try to connect to server just in case */ - clientfd = connect_to_server(test->type, &expected_addr, addr_len); + clientfd = connect_to_addr(test->type, &expected_addr, addr_len, NULL); if (clientfd == -1) goto err; @@ -1188,7 +1156,7 @@ static int run_connect_test_case(const struct sock_addr_test *test) if (servfd == -1) goto err; - clientfd = connect_to_server(test->type, &requested_addr, addr_len); + clientfd = connect_to_addr(test->type, &requested_addr, addr_len, NULL); if (clientfd == -1) goto err; From e4c68bbaff1153e8730c16f566c78a25b2046372 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 23 Apr 2024 18:35:31 +0800 Subject: [PATCH 129/147] selftests/bpf: Use make_sockaddr in test_sock_addr This patch uses public helper make_sockaddr() exported in network_helpers.h instead of the local defined function mk_sockaddr() in test_sock_addr.c. This can avoid duplicate code. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/1473e189d6ca1a3925de4c5354d191a14eca0f3f.1713868264.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/test_sock_addr.c | 64 ++++---------------- 1 file changed, 12 insertions(+), 52 deletions(-) diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index 2e29dd9d8fc3..c412de84b88f 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -605,44 +605,6 @@ static struct sock_addr_test tests[] = { }, }; -static int mk_sockaddr(int domain, const char *ip, unsigned short port, - struct sockaddr *addr, socklen_t addr_len) -{ - struct sockaddr_in6 *addr6; - struct sockaddr_in *addr4; - - if (domain != AF_INET && domain != AF_INET6) { - log_err("Unsupported address family"); - return -1; - } - - memset(addr, 0, addr_len); - - if (domain == AF_INET) { - if (addr_len < sizeof(struct sockaddr_in)) - return -1; - addr4 = (struct sockaddr_in *)addr; - addr4->sin_family = domain; - addr4->sin_port = htons(port); - if (inet_pton(domain, ip, (void *)&addr4->sin_addr) != 1) { - log_err("Invalid IPv4: %s", ip); - return -1; - } - } else if (domain == AF_INET6) { - if (addr_len < sizeof(struct sockaddr_in6)) - return -1; - addr6 = (struct sockaddr_in6 *)addr; - addr6->sin6_family = domain; - addr6->sin6_port = htons(port); - if (inet_pton(domain, ip, (void *)&addr6->sin6_addr) != 1) { - log_err("Invalid IPv6: %s", ip); - return -1; - } - } - - return 0; -} - static int load_insns(const struct sock_addr_test *test, const struct bpf_insn *insns, size_t insns_cnt) { @@ -757,9 +719,9 @@ static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test) return -1; } - if (mk_sockaddr(AF_INET, SERV4_REWRITE_IP, SERV4_REWRITE_PORT, - (struct sockaddr *)&dst4_rw_addr, - sizeof(dst4_rw_addr)) == -1) + if (make_sockaddr(AF_INET, SERV4_REWRITE_IP, SERV4_REWRITE_PORT, + (struct sockaddr_storage *)&dst4_rw_addr, + NULL) == -1) return -1; struct bpf_insn insns[] = { @@ -820,9 +782,9 @@ static int sendmsg6_rw_dst_asm_prog_load(const struct sock_addr_test *test, return -1; } - if (mk_sockaddr(AF_INET6, rw_dst_ip, SERV6_REWRITE_PORT, - (struct sockaddr *)&dst6_rw_addr, - sizeof(dst6_rw_addr)) == -1) + if (make_sockaddr(AF_INET6, rw_dst_ip, SERV6_REWRITE_PORT, + (struct sockaddr_storage *)&dst6_rw_addr, + NULL) == -1) return -1; struct bpf_insn insns[] = { @@ -1084,19 +1046,17 @@ static int init_addrs(const struct sock_addr_test *test, struct sockaddr_storage *expected_addr, struct sockaddr_storage *expected_src_addr) { - socklen_t addr_len = sizeof(struct sockaddr_storage); - - if (mk_sockaddr(test->domain, test->expected_ip, test->expected_port, - (struct sockaddr *)expected_addr, addr_len) == -1) + if (make_sockaddr(test->domain, test->expected_ip, test->expected_port, + expected_addr, NULL) == -1) goto err; - if (mk_sockaddr(test->domain, test->requested_ip, test->requested_port, - (struct sockaddr *)requested_addr, addr_len) == -1) + if (make_sockaddr(test->domain, test->requested_ip, test->requested_port, + requested_addr, NULL) == -1) goto err; if (test->expected_src_ip && - mk_sockaddr(test->domain, test->expected_src_ip, 0, - (struct sockaddr *)expected_src_addr, addr_len) == -1) + make_sockaddr(test->domain, test->expected_src_ip, 0, + expected_src_addr, NULL) == -1) goto err; return 0; From 82e38a505c9868e784ec31e743fd8a9fa5ca1084 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 24 Apr 2024 13:57:18 -0700 Subject: [PATCH 130/147] selftests/bpf: Fix wq test. The wq test was missing destroy(skel) part which was causing bpf progs to stay loaded. That was causing test_progs to complain with "Failed to unload bpf_testmod.ko from kernel: -11" message, but adding destroy() wasn't enough, since wq callback may be delayed, so loop on unload of bpf_testmod if errno is EAGAIN. Acked-by: Andrii Nakryiko Fixes: 8290dba51910 ("selftests/bpf: wq: add bpf_wq_start() checks") Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/wq.c | 1 + tools/testing/selftests/bpf/testing_helpers.c | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/wq.c b/tools/testing/selftests/bpf/prog_tests/wq.c index 8a4a91d944cc..c4bacd3160e1 100644 --- a/tools/testing/selftests/bpf/prog_tests/wq.c +++ b/tools/testing/selftests/bpf/prog_tests/wq.c @@ -31,6 +31,7 @@ void serial_test_wq(void) usleep(50); /* 10 usecs should be enough, but give it extra */ ASSERT_EQ(wq_skel->bss->ok_sleepable, (1 << 1), "ok_sleepable"); + wq__destroy(wq_skel); } void serial_test_failures_wq(void) diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c index 28b6646662af..d5379a0e6da8 100644 --- a/tools/testing/selftests/bpf/testing_helpers.c +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -368,9 +368,23 @@ int delete_module(const char *name, int flags) int unload_bpf_testmod(bool verbose) { + int ret, cnt = 0; + if (kern_sync_rcu()) fprintf(stdout, "Failed to trigger kernel-side RCU sync!\n"); - if (delete_module("bpf_testmod", 0)) { + + for (;;) { + ret = delete_module("bpf_testmod", 0); + if (!ret || errno != EAGAIN) + break; + if (++cnt > 10000) { + fprintf(stdout, "Unload of bpf_testmod timed out\n"); + break; + } + usleep(100); + } + + if (ret) { if (errno == ENOENT) { if (verbose) fprintf(stdout, "bpf_testmod.ko is already unloaded.\n"); From 95c07d58250ca3ed01855c20be568cf04e15382f Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Wed, 24 Apr 2024 13:45:07 +0800 Subject: [PATCH 131/147] bpf: update the comment for BTF_FIELDS_MAX The commit d56b63cf0c0f ("bpf: add support for bpf_wq user type") changes the fields support number to 11, just sync the comment. Signed-off-by: Haiyue Wang Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240424054526.8031-1-haiyue.wang@intel.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 64bf0cf3ee95..978200f6d925 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -184,7 +184,7 @@ struct bpf_map_ops { }; enum { - /* Support at most 10 fields in a BTF type */ + /* Support at most 11 fields in a BTF type */ BTF_FIELDS_MAX = 11, }; From 3e1c6f35409f9e447bf37f64840f5b65576bfb78 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Mon, 22 Apr 2024 15:50:21 -0700 Subject: [PATCH 132/147] bpf: make common crypto API for TC/XDP programs Add crypto API support to BPF to be able to decrypt or encrypt packets in TC/XDP BPF programs. Special care should be taken for initialization part of crypto algo because crypto alloc) doesn't work with preemtion disabled, it can be run only in sleepable BPF program. Also async crypto is not supported because of the very same issue - TC/XDP BPF programs are not sleepable. Signed-off-by: Vadim Fedorenko Link: https://lore.kernel.org/r/20240422225024.2847039-2-vadfed@meta.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf.h | 1 + include/linux/bpf_crypto.h | 24 +++ kernel/bpf/Makefile | 3 + kernel/bpf/crypto.c | 385 +++++++++++++++++++++++++++++++++++++ kernel/bpf/helpers.c | 2 +- kernel/bpf/verifier.c | 1 + 6 files changed, 415 insertions(+), 1 deletion(-) create mode 100644 include/linux/bpf_crypto.h create mode 100644 kernel/bpf/crypto.c diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 978200f6d925..364563b74db6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1275,6 +1275,7 @@ int bpf_dynptr_check_size(u32 size); u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr); const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len); void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len); +bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr); #ifdef CONFIG_BPF_JIT int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr); diff --git a/include/linux/bpf_crypto.h b/include/linux/bpf_crypto.h new file mode 100644 index 000000000000..a41e71d4e2d9 --- /dev/null +++ b/include/linux/bpf_crypto.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#ifndef _BPF_CRYPTO_H +#define _BPF_CRYPTO_H + +struct bpf_crypto_type { + void *(*alloc_tfm)(const char *algo); + void (*free_tfm)(void *tfm); + int (*has_algo)(const char *algo); + int (*setkey)(void *tfm, const u8 *key, unsigned int keylen); + int (*setauthsize)(void *tfm, unsigned int authsize); + int (*encrypt)(void *tfm, const u8 *src, u8 *dst, unsigned int len, u8 *iv); + int (*decrypt)(void *tfm, const u8 *src, u8 *dst, unsigned int len, u8 *iv); + unsigned int (*ivsize)(void *tfm); + unsigned int (*statesize)(void *tfm); + u32 (*get_flags)(void *tfm); + struct module *owner; + char name[14]; +}; + +int bpf_crypto_register_type(const struct bpf_crypto_type *type); +int bpf_crypto_unregister_type(const struct bpf_crypto_type *type); + +#endif /* _BPF_CRYPTO_H */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 368c5d86b5b7..736bd22e5ce0 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -44,6 +44,9 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o obj-$(CONFIG_BPF_SYSCALL) += cpumask.o obj-${CONFIG_BPF_LSM} += bpf_lsm.o endif +ifeq ($(CONFIG_CRYPTO),y) +obj-$(CONFIG_BPF_SYSCALL) += crypto.o +endif obj-$(CONFIG_BPF_PRELOAD) += preload/ obj-$(CONFIG_BPF_SYSCALL) += relo_core.o diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c new file mode 100644 index 000000000000..2bee4af91e38 --- /dev/null +++ b/kernel/bpf/crypto.c @@ -0,0 +1,385 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024 Meta, Inc */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct bpf_crypto_type_list { + const struct bpf_crypto_type *type; + struct list_head list; +}; + +/* BPF crypto initialization parameters struct */ +/** + * struct bpf_crypto_params - BPF crypto initialization parameters structure + * @type: The string of crypto operation type. + * @reserved: Reserved member, will be reused for more options in future + * Values: + * 0 + * @algo: The string of algorithm to initialize. + * @key: The cipher key used to init crypto algorithm. + * @key_len: The length of cipher key. + * @authsize: The length of authentication tag used by algorithm. + */ +struct bpf_crypto_params { + char type[14]; + u8 reserved[2]; + char algo[128]; + u8 key[256]; + u32 key_len; + u32 authsize; +}; + +static LIST_HEAD(bpf_crypto_types); +static DECLARE_RWSEM(bpf_crypto_types_sem); + +/** + * struct bpf_crypto_ctx - refcounted BPF crypto context structure + * @type: The pointer to bpf crypto type + * @tfm: The pointer to instance of crypto API struct. + * @siv_len: Size of IV and state storage for cipher + * @rcu: The RCU head used to free the crypto context with RCU safety. + * @usage: Object reference counter. When the refcount goes to 0, the + * memory is released back to the BPF allocator, which provides + * RCU safety. + */ +struct bpf_crypto_ctx { + const struct bpf_crypto_type *type; + void *tfm; + u32 siv_len; + struct rcu_head rcu; + refcount_t usage; +}; + +int bpf_crypto_register_type(const struct bpf_crypto_type *type) +{ + struct bpf_crypto_type_list *node; + int err = -EEXIST; + + down_write(&bpf_crypto_types_sem); + list_for_each_entry(node, &bpf_crypto_types, list) { + if (!strcmp(node->type->name, type->name)) + goto unlock; + } + + node = kmalloc(sizeof(*node), GFP_KERNEL); + err = -ENOMEM; + if (!node) + goto unlock; + + node->type = type; + list_add(&node->list, &bpf_crypto_types); + err = 0; + +unlock: + up_write(&bpf_crypto_types_sem); + + return err; +} +EXPORT_SYMBOL_GPL(bpf_crypto_register_type); + +int bpf_crypto_unregister_type(const struct bpf_crypto_type *type) +{ + struct bpf_crypto_type_list *node; + int err = -ENOENT; + + down_write(&bpf_crypto_types_sem); + list_for_each_entry(node, &bpf_crypto_types, list) { + if (strcmp(node->type->name, type->name)) + continue; + + list_del(&node->list); + kfree(node); + err = 0; + break; + } + up_write(&bpf_crypto_types_sem); + + return err; +} +EXPORT_SYMBOL_GPL(bpf_crypto_unregister_type); + +static const struct bpf_crypto_type *bpf_crypto_get_type(const char *name) +{ + const struct bpf_crypto_type *type = ERR_PTR(-ENOENT); + struct bpf_crypto_type_list *node; + + down_read(&bpf_crypto_types_sem); + list_for_each_entry(node, &bpf_crypto_types, list) { + if (strcmp(node->type->name, name)) + continue; + + if (try_module_get(node->type->owner)) + type = node->type; + break; + } + up_read(&bpf_crypto_types_sem); + + return type; +} + +__bpf_kfunc_start_defs(); + +/** + * bpf_crypto_ctx_create() - Create a mutable BPF crypto context. + * + * Allocates a crypto context that can be used, acquired, and released by + * a BPF program. The crypto context returned by this function must either + * be embedded in a map as a kptr, or freed with bpf_crypto_ctx_release(). + * As crypto API functions use GFP_KERNEL allocations, this function can + * only be used in sleepable BPF programs. + * + * bpf_crypto_ctx_create() allocates memory for crypto context. + * It may return NULL if no memory is available. + * @params: pointer to struct bpf_crypto_params which contains all the + * details needed to initialise crypto context. + * @params__sz: size of steuct bpf_crypto_params usef by bpf program + * @err: integer to store error code when NULL is returned. + */ +__bpf_kfunc struct bpf_crypto_ctx * +bpf_crypto_ctx_create(const struct bpf_crypto_params *params, u32 params__sz, + int *err) +{ + const struct bpf_crypto_type *type; + struct bpf_crypto_ctx *ctx; + + if (!params || params->reserved[0] || params->reserved[1] || + params__sz != sizeof(struct bpf_crypto_params)) { + *err = -EINVAL; + return NULL; + } + + type = bpf_crypto_get_type(params->type); + if (IS_ERR(type)) { + *err = PTR_ERR(type); + return NULL; + } + + if (!type->has_algo(params->algo)) { + *err = -EOPNOTSUPP; + goto err_module_put; + } + + if (!!params->authsize ^ !!type->setauthsize) { + *err = -EOPNOTSUPP; + goto err_module_put; + } + + if (!params->key_len || params->key_len > sizeof(params->key)) { + *err = -EINVAL; + goto err_module_put; + } + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) { + *err = -ENOMEM; + goto err_module_put; + } + + ctx->type = type; + ctx->tfm = type->alloc_tfm(params->algo); + if (IS_ERR(ctx->tfm)) { + *err = PTR_ERR(ctx->tfm); + goto err_free_ctx; + } + + if (params->authsize) { + *err = type->setauthsize(ctx->tfm, params->authsize); + if (*err) + goto err_free_tfm; + } + + *err = type->setkey(ctx->tfm, params->key, params->key_len); + if (*err) + goto err_free_tfm; + + if (type->get_flags(ctx->tfm) & CRYPTO_TFM_NEED_KEY) { + *err = -EINVAL; + goto err_free_tfm; + } + + ctx->siv_len = type->ivsize(ctx->tfm) + type->statesize(ctx->tfm); + + refcount_set(&ctx->usage, 1); + + return ctx; + +err_free_tfm: + type->free_tfm(ctx->tfm); +err_free_ctx: + kfree(ctx); +err_module_put: + module_put(type->owner); + + return NULL; +} + +static void crypto_free_cb(struct rcu_head *head) +{ + struct bpf_crypto_ctx *ctx; + + ctx = container_of(head, struct bpf_crypto_ctx, rcu); + ctx->type->free_tfm(ctx->tfm); + module_put(ctx->type->owner); + kfree(ctx); +} + +/** + * bpf_crypto_ctx_acquire() - Acquire a reference to a BPF crypto context. + * @ctx: The BPF crypto context being acquired. The ctx must be a trusted + * pointer. + * + * Acquires a reference to a BPF crypto context. The context returned by this function + * must either be embedded in a map as a kptr, or freed with + * bpf_crypto_ctx_release(). + */ +__bpf_kfunc struct bpf_crypto_ctx * +bpf_crypto_ctx_acquire(struct bpf_crypto_ctx *ctx) +{ + if (!refcount_inc_not_zero(&ctx->usage)) + return NULL; + return ctx; +} + +/** + * bpf_crypto_ctx_release() - Release a previously acquired BPF crypto context. + * @ctx: The crypto context being released. + * + * Releases a previously acquired reference to a BPF crypto context. When the final + * reference of the BPF crypto context has been released, its memory + * will be released. + */ +__bpf_kfunc void bpf_crypto_ctx_release(struct bpf_crypto_ctx *ctx) +{ + if (refcount_dec_and_test(&ctx->usage)) + call_rcu(&ctx->rcu, crypto_free_cb); +} + +static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx, + const struct bpf_dynptr_kern *src, + const struct bpf_dynptr_kern *dst, + const struct bpf_dynptr_kern *siv, + bool decrypt) +{ + u32 src_len, dst_len, siv_len; + const u8 *psrc; + u8 *pdst, *piv; + int err; + + if (__bpf_dynptr_is_rdonly(dst)) + return -EINVAL; + + siv_len = __bpf_dynptr_size(siv); + src_len = __bpf_dynptr_size(src); + dst_len = __bpf_dynptr_size(dst); + if (!src_len || !dst_len) + return -EINVAL; + + if (siv_len != ctx->siv_len) + return -EINVAL; + + psrc = __bpf_dynptr_data(src, src_len); + if (!psrc) + return -EINVAL; + pdst = __bpf_dynptr_data_rw(dst, dst_len); + if (!pdst) + return -EINVAL; + + piv = siv_len ? __bpf_dynptr_data_rw(siv, siv_len) : NULL; + if (siv_len && !piv) + return -EINVAL; + + err = decrypt ? ctx->type->decrypt(ctx->tfm, psrc, pdst, src_len, piv) + : ctx->type->encrypt(ctx->tfm, psrc, pdst, src_len, piv); + + return err; +} + +/** + * bpf_crypto_decrypt() - Decrypt buffer using configured context and IV provided. + * @ctx: The crypto context being used. The ctx must be a trusted pointer. + * @src: bpf_dynptr to the encrypted data. Must be a trusted pointer. + * @dst: bpf_dynptr to the buffer where to store the result. Must be a trusted pointer. + * @siv: bpf_dynptr to IV data and state data to be used by decryptor. + * + * Decrypts provided buffer using IV data and the crypto context. Crypto context must be configured. + */ +__bpf_kfunc int bpf_crypto_decrypt(struct bpf_crypto_ctx *ctx, + const struct bpf_dynptr_kern *src, + const struct bpf_dynptr_kern *dst, + const struct bpf_dynptr_kern *siv) +{ + return bpf_crypto_crypt(ctx, src, dst, siv, true); +} + +/** + * bpf_crypto_encrypt() - Encrypt buffer using configured context and IV provided. + * @ctx: The crypto context being used. The ctx must be a trusted pointer. + * @src: bpf_dynptr to the plain data. Must be a trusted pointer. + * @dst: bpf_dynptr to buffer where to store the result. Must be a trusted pointer. + * @siv: bpf_dynptr to IV data and state data to be used by decryptor. + * + * Encrypts provided buffer using IV data and the crypto context. Crypto context must be configured. + */ +__bpf_kfunc int bpf_crypto_encrypt(struct bpf_crypto_ctx *ctx, + const struct bpf_dynptr_kern *src, + const struct bpf_dynptr_kern *dst, + const struct bpf_dynptr_kern *siv) +{ + return bpf_crypto_crypt(ctx, src, dst, siv, false); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(crypt_init_kfunc_btf_ids) +BTF_ID_FLAGS(func, bpf_crypto_ctx_create, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_crypto_ctx_release, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_crypto_ctx_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) +BTF_KFUNCS_END(crypt_init_kfunc_btf_ids) + +static const struct btf_kfunc_id_set crypt_init_kfunc_set = { + .owner = THIS_MODULE, + .set = &crypt_init_kfunc_btf_ids, +}; + +BTF_KFUNCS_START(crypt_kfunc_btf_ids) +BTF_ID_FLAGS(func, bpf_crypto_decrypt, KF_RCU) +BTF_ID_FLAGS(func, bpf_crypto_encrypt, KF_RCU) +BTF_KFUNCS_END(crypt_kfunc_btf_ids) + +static const struct btf_kfunc_id_set crypt_kfunc_set = { + .owner = THIS_MODULE, + .set = &crypt_kfunc_btf_ids, +}; + +BTF_ID_LIST(bpf_crypto_dtor_ids) +BTF_ID(struct, bpf_crypto_ctx) +BTF_ID(func, bpf_crypto_ctx_release) + +static int __init crypto_kfunc_init(void) +{ + int ret; + const struct btf_id_dtor_kfunc bpf_crypto_dtors[] = { + { + .btf_id = bpf_crypto_dtor_ids[0], + .kfunc_btf_id = bpf_crypto_dtor_ids[1] + }, + }; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &crypt_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &crypt_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &crypt_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, + &crypt_init_kfunc_set); + return ret ?: register_btf_id_dtor_kfuncs(bpf_crypto_dtors, + ARRAY_SIZE(bpf_crypto_dtors), + THIS_MODULE); +} + +late_initcall(crypto_kfunc_init); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 14c401b78e12..2a69a9a36c0f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1583,7 +1583,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = { #define DYNPTR_SIZE_MASK 0xFFFFFF #define DYNPTR_RDONLY_BIT BIT(31) -static bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) +bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_RDONLY_BIT; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8312c7a0ee19..4e474ef44e9c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5310,6 +5310,7 @@ BTF_ID(struct, cgroup) BTF_ID(struct, bpf_cpumask) #endif BTF_ID(struct, task_struct) +BTF_ID(struct, bpf_crypto_ctx) BTF_SET_END(rcu_protected_types) static bool rcu_protected_object(const struct btf *btf, u32 btf_id) From fda4f71282b21a8cf230b656781efb0a41371fd4 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Mon, 22 Apr 2024 15:50:22 -0700 Subject: [PATCH 133/147] bpf: crypto: add skcipher to bpf crypto Implement skcipher crypto in BPF crypto framework. Signed-off-by: Vadim Fedorenko Acked-by: Herbert Xu Link: https://lore.kernel.org/r/20240422225024.2847039-3-vadfed@meta.com Signed-off-by: Martin KaFai Lau --- MAINTAINERS | 8 ++++ crypto/Makefile | 3 ++ crypto/bpf_crypto_skcipher.c | 82 ++++++++++++++++++++++++++++++++++++ 3 files changed, 93 insertions(+) create mode 100644 crypto/bpf_crypto_skcipher.c diff --git a/MAINTAINERS b/MAINTAINERS index 6a233e1a3cf2..c9f887fbb477 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3822,6 +3822,14 @@ F: kernel/bpf/tnum.c F: kernel/bpf/trampoline.c F: kernel/bpf/verifier.c +BPF [CRYPTO] +M: Vadim Fedorenko +L: bpf@vger.kernel.org +S: Maintained +F: crypto/bpf_crypto_skcipher.c +F: include/linux/bpf_crypto.h +F: kernel/bpf/crypto.c + BPF [DOCUMENTATION] (Related to Standardization) R: David Vernet L: bpf@vger.kernel.org diff --git a/crypto/Makefile b/crypto/Makefile index 408f0a1f9ab9..538124f8bf8a 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -20,6 +20,9 @@ crypto_skcipher-y += lskcipher.o crypto_skcipher-y += skcipher.o obj-$(CONFIG_CRYPTO_SKCIPHER2) += crypto_skcipher.o +ifeq ($(CONFIG_BPF_SYSCALL),y) +obj-$(CONFIG_CRYPTO_SKCIPHER2) += bpf_crypto_skcipher.o +endif obj-$(CONFIG_CRYPTO_SEQIV) += seqiv.o obj-$(CONFIG_CRYPTO_ECHAINIV) += echainiv.o diff --git a/crypto/bpf_crypto_skcipher.c b/crypto/bpf_crypto_skcipher.c new file mode 100644 index 000000000000..b5e657415770 --- /dev/null +++ b/crypto/bpf_crypto_skcipher.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024 Meta, Inc */ +#include +#include +#include +#include + +static void *bpf_crypto_lskcipher_alloc_tfm(const char *algo) +{ + return crypto_alloc_lskcipher(algo, 0, 0); +} + +static void bpf_crypto_lskcipher_free_tfm(void *tfm) +{ + crypto_free_lskcipher(tfm); +} + +static int bpf_crypto_lskcipher_has_algo(const char *algo) +{ + return crypto_has_skcipher(algo, CRYPTO_ALG_TYPE_LSKCIPHER, CRYPTO_ALG_TYPE_MASK); +} + +static int bpf_crypto_lskcipher_setkey(void *tfm, const u8 *key, unsigned int keylen) +{ + return crypto_lskcipher_setkey(tfm, key, keylen); +} + +static u32 bpf_crypto_lskcipher_get_flags(void *tfm) +{ + return crypto_lskcipher_get_flags(tfm); +} + +static unsigned int bpf_crypto_lskcipher_ivsize(void *tfm) +{ + return crypto_lskcipher_ivsize(tfm); +} + +static unsigned int bpf_crypto_lskcipher_statesize(void *tfm) +{ + return crypto_lskcipher_statesize(tfm); +} + +static int bpf_crypto_lskcipher_encrypt(void *tfm, const u8 *src, u8 *dst, + unsigned int len, u8 *siv) +{ + return crypto_lskcipher_encrypt(tfm, src, dst, len, siv); +} + +static int bpf_crypto_lskcipher_decrypt(void *tfm, const u8 *src, u8 *dst, + unsigned int len, u8 *siv) +{ + return crypto_lskcipher_decrypt(tfm, src, dst, len, siv); +} + +static const struct bpf_crypto_type bpf_crypto_lskcipher_type = { + .alloc_tfm = bpf_crypto_lskcipher_alloc_tfm, + .free_tfm = bpf_crypto_lskcipher_free_tfm, + .has_algo = bpf_crypto_lskcipher_has_algo, + .setkey = bpf_crypto_lskcipher_setkey, + .encrypt = bpf_crypto_lskcipher_encrypt, + .decrypt = bpf_crypto_lskcipher_decrypt, + .ivsize = bpf_crypto_lskcipher_ivsize, + .statesize = bpf_crypto_lskcipher_statesize, + .get_flags = bpf_crypto_lskcipher_get_flags, + .owner = THIS_MODULE, + .name = "skcipher", +}; + +static int __init bpf_crypto_skcipher_init(void) +{ + return bpf_crypto_register_type(&bpf_crypto_lskcipher_type); +} + +static void __exit bpf_crypto_skcipher_exit(void) +{ + int err = bpf_crypto_unregister_type(&bpf_crypto_lskcipher_type); + WARN_ON_ONCE(err); +} + +module_init(bpf_crypto_skcipher_init); +module_exit(bpf_crypto_skcipher_exit); +MODULE_LICENSE("GPL"); From 91541ab192fc7f573e6c711ba9c2ae22a299c408 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Mon, 22 Apr 2024 15:50:23 -0700 Subject: [PATCH 134/147] selftests: bpf: crypto skcipher algo selftests Add simple tc hook selftests to show the way to work with new crypto BPF API. Some tricky dynptr initialization is used to provide empty iv dynptr. Simple AES-ECB algo is used to demonstrate encryption and decryption of fixed size buffers. Signed-off-by: Vadim Fedorenko Link: https://lore.kernel.org/r/20240422225024.2847039-4-vadfed@meta.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/config | 5 + .../selftests/bpf/prog_tests/crypto_sanity.c | 197 ++++++++++++++++++ .../selftests/bpf/progs/crypto_basic.c | 68 ++++++ .../selftests/bpf/progs/crypto_common.h | 66 ++++++ .../selftests/bpf/progs/crypto_sanity.c | 169 +++++++++++++++ 5 files changed, 505 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/crypto_sanity.c create mode 100644 tools/testing/selftests/bpf/progs/crypto_basic.c create mode 100644 tools/testing/selftests/bpf/progs/crypto_common.h create mode 100644 tools/testing/selftests/bpf/progs/crypto_sanity.c diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index afd675b1bf80..eeabd798bc3a 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -13,7 +13,12 @@ CONFIG_BPF_SYSCALL=y CONFIG_CGROUP_BPF=y CONFIG_CRYPTO_HMAC=y CONFIG_CRYPTO_SHA256=y +CONFIG_CRYPTO_USER_API=y CONFIG_CRYPTO_USER_API_HASH=y +CONFIG_CRYPTO_USER_API_SKCIPHER=y +CONFIG_CRYPTO_SKCIPHER=y +CONFIG_CRYPTO_ECB=y +CONFIG_CRYPTO_AES=y CONFIG_DEBUG_INFO=y CONFIG_DEBUG_INFO_BTF=y CONFIG_DEBUG_INFO_DWARF4=y diff --git a/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c b/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c new file mode 100644 index 000000000000..b1a3a49a822a --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include +#include +#include + +#include "test_progs.h" +#include "network_helpers.h" +#include "crypto_sanity.skel.h" +#include "crypto_basic.skel.h" + +#define NS_TEST "crypto_sanity_ns" +#define IPV6_IFACE_ADDR "face::1" +static const unsigned char crypto_key[] = "testtest12345678"; +static const char plain_text[] = "stringtoencrypt0"; +static int opfd = -1, tfmfd = -1; +static const char algo[] = "ecb(aes)"; +static int init_afalg(void) +{ + struct sockaddr_alg sa = { + .salg_family = AF_ALG, + .salg_type = "skcipher", + .salg_name = "ecb(aes)" + }; + + tfmfd = socket(AF_ALG, SOCK_SEQPACKET, 0); + if (tfmfd == -1) + return errno; + if (bind(tfmfd, (struct sockaddr *)&sa, sizeof(sa)) == -1) + return errno; + if (setsockopt(tfmfd, SOL_ALG, ALG_SET_KEY, crypto_key, 16) == -1) + return errno; + opfd = accept(tfmfd, NULL, 0); + if (opfd == -1) + return errno; + return 0; +} + +static void deinit_afalg(void) +{ + if (tfmfd != -1) + close(tfmfd); + if (opfd != -1) + close(opfd); +} + +static void do_crypt_afalg(const void *src, void *dst, int size, bool encrypt) +{ + struct msghdr msg = {}; + struct cmsghdr *cmsg; + char cbuf[CMSG_SPACE(4)] = {0}; + struct iovec iov; + + msg.msg_control = cbuf; + msg.msg_controllen = sizeof(cbuf); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_ALG; + cmsg->cmsg_type = ALG_SET_OP; + cmsg->cmsg_len = CMSG_LEN(4); + *(__u32 *)CMSG_DATA(cmsg) = encrypt ? ALG_OP_ENCRYPT : ALG_OP_DECRYPT; + + iov.iov_base = (char *)src; + iov.iov_len = size; + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + sendmsg(opfd, &msg, 0); + read(opfd, dst, size); +} + +void test_crypto_basic(void) +{ + RUN_TESTS(crypto_basic); +} + +void test_crypto_sanity(void) +{ + LIBBPF_OPTS(bpf_tc_hook, qdisc_hook, .attach_point = BPF_TC_EGRESS); + LIBBPF_OPTS(bpf_tc_opts, tc_attach_enc); + LIBBPF_OPTS(bpf_tc_opts, tc_attach_dec); + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct nstoken *nstoken = NULL; + struct crypto_sanity *skel; + char afalg_plain[16] = {0}; + char afalg_dst[16] = {0}; + struct sockaddr_in6 addr; + int sockfd, err, pfd; + socklen_t addrlen; + u16 udp_test_port; + + skel = crypto_sanity__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel open")) + return; + + SYS(fail, "ip netns add %s", NS_TEST); + SYS(fail, "ip -net %s -6 addr add %s/128 dev lo nodad", NS_TEST, IPV6_IFACE_ADDR); + SYS(fail, "ip -net %s link set dev lo up", NS_TEST); + + nstoken = open_netns(NS_TEST); + if (!ASSERT_OK_PTR(nstoken, "open_netns")) + goto fail; + + err = init_afalg(); + if (!ASSERT_OK(err, "AF_ALG init fail")) + goto fail; + + qdisc_hook.ifindex = if_nametoindex("lo"); + if (!ASSERT_GT(qdisc_hook.ifindex, 0, "if_nametoindex lo")) + goto fail; + + skel->bss->key_len = 16; + skel->bss->authsize = 0; + udp_test_port = skel->data->udp_test_port; + memcpy(skel->bss->key, crypto_key, sizeof(crypto_key)); + snprintf(skel->bss->algo, 128, "%s", algo); + pfd = bpf_program__fd(skel->progs.skb_crypto_setup); + if (!ASSERT_GT(pfd, 0, "skb_crypto_setup fd")) + goto fail; + + err = bpf_prog_test_run_opts(pfd, &opts); + if (!ASSERT_OK(err, "skb_crypto_setup") || + !ASSERT_OK(opts.retval, "skb_crypto_setup retval")) + goto fail; + + if (!ASSERT_OK(skel->bss->status, "skb_crypto_setup status")) + goto fail; + + err = bpf_tc_hook_create(&qdisc_hook); + if (!ASSERT_OK(err, "create qdisc hook")) + goto fail; + + addrlen = sizeof(addr); + err = make_sockaddr(AF_INET6, IPV6_IFACE_ADDR, udp_test_port, + (void *)&addr, &addrlen); + if (!ASSERT_OK(err, "make_sockaddr")) + goto fail; + + tc_attach_enc.prog_fd = bpf_program__fd(skel->progs.encrypt_sanity); + err = bpf_tc_attach(&qdisc_hook, &tc_attach_enc); + if (!ASSERT_OK(err, "attach encrypt filter")) + goto fail; + + sockfd = socket(AF_INET6, SOCK_DGRAM, 0); + if (!ASSERT_NEQ(sockfd, -1, "encrypt socket")) + goto fail; + err = sendto(sockfd, plain_text, sizeof(plain_text), 0, (void *)&addr, addrlen); + close(sockfd); + if (!ASSERT_EQ(err, sizeof(plain_text), "encrypt send")) + goto fail; + + do_crypt_afalg(plain_text, afalg_dst, sizeof(afalg_dst), true); + + if (!ASSERT_OK(skel->bss->status, "encrypt status")) + goto fail; + if (!ASSERT_STRNEQ(skel->bss->dst, afalg_dst, sizeof(afalg_dst), "encrypt AF_ALG")) + goto fail; + + tc_attach_enc.flags = tc_attach_enc.prog_fd = tc_attach_enc.prog_id = 0; + err = bpf_tc_detach(&qdisc_hook, &tc_attach_enc); + if (!ASSERT_OK(err, "bpf_tc_detach encrypt")) + goto fail; + + tc_attach_dec.prog_fd = bpf_program__fd(skel->progs.decrypt_sanity); + err = bpf_tc_attach(&qdisc_hook, &tc_attach_dec); + if (!ASSERT_OK(err, "attach decrypt filter")) + goto fail; + + sockfd = socket(AF_INET6, SOCK_DGRAM, 0); + if (!ASSERT_NEQ(sockfd, -1, "decrypt socket")) + goto fail; + err = sendto(sockfd, afalg_dst, sizeof(afalg_dst), 0, (void *)&addr, addrlen); + close(sockfd); + if (!ASSERT_EQ(err, sizeof(afalg_dst), "decrypt send")) + goto fail; + + do_crypt_afalg(afalg_dst, afalg_plain, sizeof(afalg_plain), false); + + if (!ASSERT_OK(skel->bss->status, "decrypt status")) + goto fail; + if (!ASSERT_STRNEQ(skel->bss->dst, afalg_plain, sizeof(afalg_plain), "decrypt AF_ALG")) + goto fail; + + tc_attach_dec.flags = tc_attach_dec.prog_fd = tc_attach_dec.prog_id = 0; + err = bpf_tc_detach(&qdisc_hook, &tc_attach_dec); + ASSERT_OK(err, "bpf_tc_detach decrypt"); + +fail: + close_netns(nstoken); + deinit_afalg(); + SYS_NOFAIL("ip netns del " NS_TEST " &> /dev/null"); + crypto_sanity__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/crypto_basic.c b/tools/testing/selftests/bpf/progs/crypto_basic.c new file mode 100644 index 000000000000..8cf7168b42d5 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/crypto_basic.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */ + +#include "vmlinux.h" +#include +#include +#include "bpf_misc.h" +#include "bpf_kfuncs.h" +#include "crypto_common.h" + +int status; +SEC("syscall") +int crypto_release(void *ctx) +{ + struct bpf_crypto_params params = { + .type = "skcipher", + .algo = "ecb(aes)", + .key_len = 16, + }; + + struct bpf_crypto_ctx *cctx; + int err = 0; + + status = 0; + + cctx = bpf_crypto_ctx_create(¶ms, sizeof(params), &err); + + if (!cctx) { + status = err; + return 0; + } + + bpf_crypto_ctx_release(cctx); + + return 0; +} + +SEC("syscall") +__failure __msg("Unreleased reference") +int crypto_acquire(void *ctx) +{ + struct bpf_crypto_params params = { + .type = "skcipher", + .algo = "ecb(aes)", + .key_len = 16, + }; + struct bpf_crypto_ctx *cctx; + int err = 0; + + status = 0; + + cctx = bpf_crypto_ctx_create(¶ms, sizeof(params), &err); + + if (!cctx) { + status = err; + return 0; + } + + cctx = bpf_crypto_ctx_acquire(cctx); + if (!cctx) + return -EINVAL; + + bpf_crypto_ctx_release(cctx); + + return 0; +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/crypto_common.h b/tools/testing/selftests/bpf/progs/crypto_common.h new file mode 100644 index 000000000000..57dd7a68a8c3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/crypto_common.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#ifndef _CRYPTO_COMMON_H +#define _CRYPTO_COMMON_H + +#include "errno.h" +#include + +struct bpf_crypto_ctx *bpf_crypto_ctx_create(const struct bpf_crypto_params *params, + u32 params__sz, int *err) __ksym; +struct bpf_crypto_ctx *bpf_crypto_ctx_acquire(struct bpf_crypto_ctx *ctx) __ksym; +void bpf_crypto_ctx_release(struct bpf_crypto_ctx *ctx) __ksym; +int bpf_crypto_encrypt(struct bpf_crypto_ctx *ctx, const struct bpf_dynptr *src, + const struct bpf_dynptr *dst, const struct bpf_dynptr *iv) __ksym; +int bpf_crypto_decrypt(struct bpf_crypto_ctx *ctx, const struct bpf_dynptr *src, + const struct bpf_dynptr *dst, const struct bpf_dynptr *iv) __ksym; + +struct __crypto_ctx_value { + struct bpf_crypto_ctx __kptr * ctx; +}; + +struct array_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, struct __crypto_ctx_value); + __uint(max_entries, 1); +} __crypto_ctx_map SEC(".maps"); + +static inline struct __crypto_ctx_value *crypto_ctx_value_lookup(void) +{ + u32 key = 0; + + return bpf_map_lookup_elem(&__crypto_ctx_map, &key); +} + +static inline int crypto_ctx_insert(struct bpf_crypto_ctx *ctx) +{ + struct __crypto_ctx_value local, *v; + struct bpf_crypto_ctx *old; + u32 key = 0; + int err; + + local.ctx = NULL; + err = bpf_map_update_elem(&__crypto_ctx_map, &key, &local, 0); + if (err) { + bpf_crypto_ctx_release(ctx); + return err; + } + + v = bpf_map_lookup_elem(&__crypto_ctx_map, &key); + if (!v) { + bpf_crypto_ctx_release(ctx); + return -ENOENT; + } + + old = bpf_kptr_xchg(&v->ctx, ctx); + if (old) { + bpf_crypto_ctx_release(old); + return -EEXIST; + } + + return 0; +} + +#endif /* _CRYPTO_COMMON_H */ diff --git a/tools/testing/selftests/bpf/progs/crypto_sanity.c b/tools/testing/selftests/bpf/progs/crypto_sanity.c new file mode 100644 index 000000000000..1be0a3fa5efd --- /dev/null +++ b/tools/testing/selftests/bpf/progs/crypto_sanity.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include "vmlinux.h" +#include "bpf_tracing_net.h" +#include +#include +#include +#include "bpf_misc.h" +#include "bpf_kfuncs.h" +#include "crypto_common.h" + +unsigned char key[256] = {}; +u16 udp_test_port = 7777; +u32 authsize, key_len; +char algo[128] = {}; +char dst[16] = {}; +int status; + +static int skb_dynptr_validate(struct __sk_buff *skb, struct bpf_dynptr *psrc) +{ + struct ipv6hdr ip6h; + struct udphdr udph; + u32 offset; + + if (skb->protocol != __bpf_constant_htons(ETH_P_IPV6)) + return -1; + + if (bpf_skb_load_bytes(skb, ETH_HLEN, &ip6h, sizeof(ip6h))) + return -1; + + if (ip6h.nexthdr != IPPROTO_UDP) + return -1; + + if (bpf_skb_load_bytes(skb, ETH_HLEN + sizeof(ip6h), &udph, sizeof(udph))) + return -1; + + if (udph.dest != __bpf_htons(udp_test_port)) + return -1; + + offset = ETH_HLEN + sizeof(ip6h) + sizeof(udph); + if (skb->len < offset + 16) + return -1; + + /* let's make sure that 16 bytes of payload are in the linear part of skb */ + bpf_skb_pull_data(skb, offset + 16); + bpf_dynptr_from_skb(skb, 0, psrc); + bpf_dynptr_adjust(psrc, offset, offset + 16); + + return 0; +} + +SEC("syscall") +int skb_crypto_setup(void *ctx) +{ + struct bpf_crypto_params params = { + .type = "skcipher", + .key_len = key_len, + .authsize = authsize, + }; + struct bpf_crypto_ctx *cctx; + int err = 0; + + status = 0; + + if (key_len > 256) { + status = -EINVAL; + return 0; + } + + __builtin_memcpy(¶ms.algo, algo, sizeof(algo)); + __builtin_memcpy(¶ms.key, key, sizeof(key)); + cctx = bpf_crypto_ctx_create(¶ms, sizeof(params), &err); + + if (!cctx) { + status = err; + return 0; + } + + err = crypto_ctx_insert(cctx); + if (err && err != -EEXIST) + status = err; + + return 0; +} + +SEC("tc") +int decrypt_sanity(struct __sk_buff *skb) +{ + struct __crypto_ctx_value *v; + struct bpf_crypto_ctx *ctx; + struct bpf_dynptr psrc, pdst, iv; + int err; + + err = skb_dynptr_validate(skb, &psrc); + if (err < 0) { + status = err; + return TC_ACT_SHOT; + } + + v = crypto_ctx_value_lookup(); + if (!v) { + status = -ENOENT; + return TC_ACT_SHOT; + } + + ctx = v->ctx; + if (!ctx) { + status = -ENOENT; + return TC_ACT_SHOT; + } + + /* dst is a global variable to make testing part easier to check. In real + * production code, a percpu map should be used to store the result. + */ + bpf_dynptr_from_mem(dst, sizeof(dst), 0, &pdst); + /* iv dynptr has to be initialized with 0 size, but proper memory region + * has to be provided anyway + */ + bpf_dynptr_from_mem(dst, 0, 0, &iv); + + status = bpf_crypto_decrypt(ctx, &psrc, &pdst, &iv); + + return TC_ACT_SHOT; +} + +SEC("tc") +int encrypt_sanity(struct __sk_buff *skb) +{ + struct __crypto_ctx_value *v; + struct bpf_crypto_ctx *ctx; + struct bpf_dynptr psrc, pdst, iv; + int err; + + status = 0; + + err = skb_dynptr_validate(skb, &psrc); + if (err < 0) { + status = err; + return TC_ACT_SHOT; + } + + v = crypto_ctx_value_lookup(); + if (!v) { + status = -ENOENT; + return TC_ACT_SHOT; + } + + ctx = v->ctx; + if (!ctx) { + status = -ENOENT; + return TC_ACT_SHOT; + } + + /* dst is a global variable to make testing part easier to check. In real + * production code, a percpu map should be used to store the result. + */ + bpf_dynptr_from_mem(dst, sizeof(dst), 0, &pdst); + /* iv dynptr has to be initialized with 0 size, but proper memory region + * has to be provided anyway + */ + bpf_dynptr_from_mem(dst, 0, 0, &iv); + + status = bpf_crypto_encrypt(ctx, &psrc, &pdst, &iv); + + return TC_ACT_SHOT; +} + +char __license[] SEC("license") = "GPL"; From 8000e627dc98efc44658af6150fd81c62d936b1b Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Mon, 22 Apr 2024 15:50:24 -0700 Subject: [PATCH 135/147] selftests: bpf: crypto: add benchmark for crypto functions Some simple benchmarks are added to understand the baseline of performance. Signed-off-by: Vadim Fedorenko Link: https://lore.kernel.org/r/20240422225024.2847039-5-vadfed@meta.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/Makefile | 2 + tools/testing/selftests/bpf/bench.c | 6 + .../selftests/bpf/benchs/bench_bpf_crypto.c | 185 ++++++++++++++++++ .../selftests/bpf/progs/crypto_bench.c | 109 +++++++++++ 4 files changed, 302 insertions(+) create mode 100644 tools/testing/selftests/bpf/benchs/bench_bpf_crypto.c create mode 100644 tools/testing/selftests/bpf/progs/crypto_bench.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 0ce5c3c26b5c..ff1af05add6e 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -730,6 +730,7 @@ $(OUTPUT)/bench_local_storage_rcu_tasks_trace.o: $(OUTPUT)/local_storage_rcu_tas $(OUTPUT)/bench_local_storage_create.o: $(OUTPUT)/bench_local_storage_create.skel.h $(OUTPUT)/bench_bpf_hashmap_lookup.o: $(OUTPUT)/bpf_hashmap_lookup.skel.h $(OUTPUT)/bench_htab_mem.o: $(OUTPUT)/htab_mem_bench.skel.h +$(OUTPUT)/bench_bpf_crypto.o: $(OUTPUT)/crypto_bench.skel.h $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ) $(OUTPUT)/bench: LDLIBS += -lm $(OUTPUT)/bench: $(OUTPUT)/bench.o \ @@ -749,6 +750,7 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \ $(OUTPUT)/bench_bpf_hashmap_lookup.o \ $(OUTPUT)/bench_local_storage_create.o \ $(OUTPUT)/bench_htab_mem.o \ + $(OUTPUT)/bench_bpf_crypto.o \ # $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@ diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 82de56c8162e..627b74ae041b 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -281,6 +281,7 @@ extern struct argp bench_hashmap_lookup_argp; extern struct argp bench_local_storage_create_argp; extern struct argp bench_htab_mem_argp; extern struct argp bench_trigger_batch_argp; +extern struct argp bench_crypto_argp; static const struct argp_child bench_parsers[] = { { &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 }, @@ -294,6 +295,7 @@ static const struct argp_child bench_parsers[] = { { &bench_local_storage_create_argp, 0, "local-storage-create benchmark", 0 }, { &bench_htab_mem_argp, 0, "hash map memory benchmark", 0 }, { &bench_trigger_batch_argp, 0, "BPF triggering benchmark", 0 }, + { &bench_crypto_argp, 0, "bpf crypto benchmark", 0 }, {}, }; @@ -538,6 +540,8 @@ extern const struct bench bench_local_storage_tasks_trace; extern const struct bench bench_bpf_hashmap_lookup; extern const struct bench bench_local_storage_create; extern const struct bench bench_htab_mem; +extern const struct bench bench_crypto_encrypt; +extern const struct bench bench_crypto_decrypt; static const struct bench *benchs[] = { &bench_count_global, @@ -590,6 +594,8 @@ static const struct bench *benchs[] = { &bench_bpf_hashmap_lookup, &bench_local_storage_create, &bench_htab_mem, + &bench_crypto_encrypt, + &bench_crypto_decrypt, }; static void find_benchmark(void) diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_crypto.c b/tools/testing/selftests/bpf/benchs/bench_bpf_crypto.c new file mode 100644 index 000000000000..2845edaba8db --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_bpf_crypto.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include +#include "bench.h" +#include "crypto_bench.skel.h" + +#define MAX_CIPHER_LEN 32 +static char *input; +static struct crypto_ctx { + struct crypto_bench *skel; + int pfd; +} ctx; + +static struct crypto_args { + u32 crypto_len; + char *crypto_cipher; +} args = { + .crypto_len = 16, + .crypto_cipher = "ecb(aes)", +}; + +enum { + ARG_CRYPTO_LEN = 5000, + ARG_CRYPTO_CIPHER = 5001, +}; + +static const struct argp_option opts[] = { + { "crypto-len", ARG_CRYPTO_LEN, "CRYPTO_LEN", 0, + "Set the length of crypto buffer" }, + { "crypto-cipher", ARG_CRYPTO_CIPHER, "CRYPTO_CIPHER", 0, + "Set the cipher to use (default:ecb(aes))" }, + {}, +}; + +static error_t crypto_parse_arg(int key, char *arg, struct argp_state *state) +{ + switch (key) { + case ARG_CRYPTO_LEN: + args.crypto_len = strtoul(arg, NULL, 10); + if (!args.crypto_len || + args.crypto_len > sizeof(ctx.skel->bss->dst)) { + fprintf(stderr, "Invalid crypto buffer len (limit %zu)\n", + sizeof(ctx.skel->bss->dst)); + argp_usage(state); + } + break; + case ARG_CRYPTO_CIPHER: + args.crypto_cipher = strdup(arg); + if (!strlen(args.crypto_cipher) || + strlen(args.crypto_cipher) > MAX_CIPHER_LEN) { + fprintf(stderr, "Invalid crypto cipher len (limit %d)\n", + MAX_CIPHER_LEN); + argp_usage(state); + } + break; + default: + return ARGP_ERR_UNKNOWN; + } + + return 0; +} + +const struct argp bench_crypto_argp = { + .options = opts, + .parser = crypto_parse_arg, +}; + +static void crypto_validate(void) +{ + if (env.consumer_cnt != 0) { + fprintf(stderr, "bpf crypto benchmark doesn't support consumer!\n"); + exit(1); + } +} + +static void crypto_setup(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + + int err, pfd; + size_t i, sz; + + sz = args.crypto_len; + if (!sz || sz > sizeof(ctx.skel->bss->dst)) { + fprintf(stderr, "invalid encrypt buffer size (source %zu, target %zu)\n", + sz, sizeof(ctx.skel->bss->dst)); + exit(1); + } + + setup_libbpf(); + + ctx.skel = crypto_bench__open(); + if (!ctx.skel) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } + + snprintf(ctx.skel->bss->cipher, 128, "%s", args.crypto_cipher); + memcpy(ctx.skel->bss->key, "12345678testtest", 16); + ctx.skel->bss->key_len = 16; + ctx.skel->bss->authsize = 0; + + srandom(time(NULL)); + input = malloc(sz); + for (i = 0; i < sz - 1; i++) + input[i] = '1' + random() % 9; + input[sz - 1] = '\0'; + + ctx.skel->rodata->len = args.crypto_len; + + err = crypto_bench__load(ctx.skel); + if (err) { + fprintf(stderr, "failed to load skeleton\n"); + crypto_bench__destroy(ctx.skel); + exit(1); + } + + pfd = bpf_program__fd(ctx.skel->progs.crypto_setup); + if (pfd < 0) { + fprintf(stderr, "failed to get fd for setup prog\n"); + crypto_bench__destroy(ctx.skel); + exit(1); + } + + err = bpf_prog_test_run_opts(pfd, &opts); + if (err || ctx.skel->bss->status) { + fprintf(stderr, "failed to run setup prog: err %d, status %d\n", + err, ctx.skel->bss->status); + crypto_bench__destroy(ctx.skel); + exit(1); + } +} + +static void crypto_encrypt_setup(void) +{ + crypto_setup(); + ctx.pfd = bpf_program__fd(ctx.skel->progs.crypto_encrypt); +} + +static void crypto_decrypt_setup(void) +{ + crypto_setup(); + ctx.pfd = bpf_program__fd(ctx.skel->progs.crypto_decrypt); +} + +static void crypto_measure(struct bench_res *res) +{ + res->hits = atomic_swap(&ctx.skel->bss->hits, 0); +} + +static void *crypto_producer(void *unused) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts, + .repeat = 64, + .data_in = input, + .data_size_in = args.crypto_len, + ); + + while (true) + (void)bpf_prog_test_run_opts(ctx.pfd, &opts); + return NULL; +} + +const struct bench bench_crypto_encrypt = { + .name = "crypto-encrypt", + .argp = &bench_crypto_argp, + .validate = crypto_validate, + .setup = crypto_encrypt_setup, + .producer_thread = crypto_producer, + .measure = crypto_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_crypto_decrypt = { + .name = "crypto-decrypt", + .argp = &bench_crypto_argp, + .validate = crypto_validate, + .setup = crypto_decrypt_setup, + .producer_thread = crypto_producer, + .measure = crypto_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; diff --git a/tools/testing/selftests/bpf/progs/crypto_bench.c b/tools/testing/selftests/bpf/progs/crypto_bench.c new file mode 100644 index 000000000000..e61fe0882293 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/crypto_bench.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include "vmlinux.h" +#include "bpf_tracing_net.h" +#include +#include +#include +#include "bpf_misc.h" +#include "bpf_kfuncs.h" +#include "crypto_common.h" + +const volatile unsigned int len = 16; +char cipher[128] = {}; +u32 key_len, authsize; +char dst[256] = {}; +u8 key[256] = {}; +long hits = 0; +int status; + +SEC("syscall") +int crypto_setup(void *args) +{ + struct bpf_crypto_ctx *cctx; + struct bpf_crypto_params params = { + .type = "skcipher", + .key_len = key_len, + .authsize = authsize, + }; + int err = 0; + + status = 0; + + if (!cipher[0] || !key_len || key_len > 256) { + status = -EINVAL; + return 0; + } + + __builtin_memcpy(¶ms.algo, cipher, sizeof(cipher)); + __builtin_memcpy(¶ms.key, key, sizeof(key)); + cctx = bpf_crypto_ctx_create(¶ms, sizeof(params), &err); + + if (!cctx) { + status = err; + return 0; + } + + err = crypto_ctx_insert(cctx); + if (err && err != -EEXIST) + status = err; + + return 0; +} + +SEC("tc") +int crypto_encrypt(struct __sk_buff *skb) +{ + struct __crypto_ctx_value *v; + struct bpf_crypto_ctx *ctx; + struct bpf_dynptr psrc, pdst, iv; + + v = crypto_ctx_value_lookup(); + if (!v) { + status = -ENOENT; + return 0; + } + + ctx = v->ctx; + if (!ctx) { + status = -ENOENT; + return 0; + } + + bpf_dynptr_from_skb(skb, 0, &psrc); + bpf_dynptr_from_mem(dst, len, 0, &pdst); + bpf_dynptr_from_mem(dst, 0, 0, &iv); + + status = bpf_crypto_encrypt(ctx, &psrc, &pdst, &iv); + __sync_add_and_fetch(&hits, 1); + + return 0; +} + +SEC("tc") +int crypto_decrypt(struct __sk_buff *skb) +{ + struct bpf_dynptr psrc, pdst, iv; + struct __crypto_ctx_value *v; + struct bpf_crypto_ctx *ctx; + + v = crypto_ctx_value_lookup(); + if (!v) + return -ENOENT; + + ctx = v->ctx; + if (!ctx) + return -ENOENT; + + bpf_dynptr_from_skb(skb, 0, &psrc); + bpf_dynptr_from_mem(dst, len, 0, &pdst); + bpf_dynptr_from_mem(dst, 0, 0, &iv); + + status = bpf_crypto_decrypt(ctx, &psrc, &pdst, &iv); + __sync_add_and_fetch(&hits, 1); + + return 0; +} + +char __license[] SEC("license") = "GPL"; From 8ec3bf5c31d2a59c320ff59397f6ac362341d9d7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 24 Apr 2024 15:55:29 -0700 Subject: [PATCH 136/147] bpf: Add bpf_guard_preempt() convenience macro Add bpf_guard_preempt() macro that uses newly introduced bpf_preempt_disable/enable() kfuncs to guard a critical section. Signed-off-by: Alexei Starovoitov Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20240424225529.16782-1-alexei.starovoitov@gmail.com Signed-off-by: Martin KaFai Lau --- .../testing/selftests/bpf/bpf_experimental.h | 22 +++++++++++++++++++ .../selftests/bpf/progs/preempt_lock.c | 7 ++---- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 93c5a6c446b3..8b9cc87be4c4 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -397,6 +397,28 @@ l_true: \ , [as]"i"((dst_as << 16) | src_as)); #endif +void bpf_preempt_disable(void) __weak __ksym; +void bpf_preempt_enable(void) __weak __ksym; + +typedef struct { +} __bpf_preempt_t; + +static inline __bpf_preempt_t __bpf_preempt_constructor(void) +{ + __bpf_preempt_t ret = {}; + + bpf_preempt_disable(); + return ret; +} +static inline void __bpf_preempt_destructor(__bpf_preempt_t *t) +{ + bpf_preempt_enable(); +} +#define bpf_guard_preempt() \ + __bpf_preempt_t ___bpf_apply(preempt, __COUNTER__) \ + __attribute__((__unused__, __cleanup__(__bpf_preempt_destructor))) = \ + __bpf_preempt_constructor() + /* Description * Assert that a conditional expression is true. * Returns diff --git a/tools/testing/selftests/bpf/progs/preempt_lock.c b/tools/testing/selftests/bpf/progs/preempt_lock.c index 6c637ee01ec4..672fc368d9c4 100644 --- a/tools/testing/selftests/bpf/progs/preempt_lock.c +++ b/tools/testing/selftests/bpf/progs/preempt_lock.c @@ -3,9 +3,7 @@ #include #include #include "bpf_misc.h" - -void bpf_preempt_disable(void) __ksym; -void bpf_preempt_enable(void) __ksym; +#include "bpf_experimental.h" SEC("?tc") __failure __msg("1 bpf_preempt_enable is missing") @@ -92,8 +90,7 @@ static __noinline void preempt_balance_subprog(void) SEC("?tc") __success int preempt_balance(struct __sk_buff *ctx) { - bpf_preempt_disable(); - bpf_preempt_enable(); + bpf_guard_preempt(); return 0; } From 638a485c4996be1d38303cf25ea8d12dfd16011b Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Thu, 25 Apr 2024 16:06:27 +0200 Subject: [PATCH 137/147] selftests/bpf: Add ring_buffer__consume_n test. Add a testcase for the ring_buffer__consume_n() API. The test produces multiple samples in a ring buffer, using a sys_getpid() fentry prog, and consumes them from user-space in batches, rather than consuming all of them greedily, like ring_buffer__consume() does. Signed-off-by: Andrea Righi Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/lkml/CAEf4BzaR4zqUpDmj44KNLdpJ=Tpa97GrvzuzVNO5nM6b7oWd1w@mail.gmail.com Link: https://lore.kernel.org/bpf/20240425140627.112728-1-andrea.righi@canonical.com --- tools/testing/selftests/bpf/Makefile | 2 +- .../selftests/bpf/prog_tests/ringbuf.c | 65 +++++++++++++++++++ .../selftests/bpf/progs/test_ringbuf_n.c | 47 ++++++++++++++ 3 files changed, 113 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/progs/test_ringbuf_n.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index ff1af05add6e..ca8b73f7c774 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -456,7 +456,7 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \ LSKELS := fentry_test.c fexit_test.c fexit_sleep.c atomics.c \ trace_printk.c trace_vprintk.c map_ptr_kern.c \ core_kern.c core_kern_overflow.c test_ringbuf.c \ - test_ringbuf_map_key.c + test_ringbuf_n.c test_ringbuf_map_key.c # Generate both light skeleton and libbpf skeleton for these LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test.c \ diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c index 48c5695b7abf..4c6f42dae409 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -13,6 +13,7 @@ #include #include #include "test_ringbuf.lskel.h" +#include "test_ringbuf_n.lskel.h" #include "test_ringbuf_map_key.lskel.h" #define EDONE 7777 @@ -326,6 +327,68 @@ cleanup: test_ringbuf_lskel__destroy(skel); } +/* + * Test ring_buffer__consume_n() by producing N_TOT_SAMPLES samples in the ring + * buffer, via getpid(), and consuming them in chunks of N_SAMPLES. + */ +#define N_TOT_SAMPLES 32 +#define N_SAMPLES 4 + +/* Sample value to verify the callback validity */ +#define SAMPLE_VALUE 42L + +static int process_n_sample(void *ctx, void *data, size_t len) +{ + struct sample *s = data; + + ASSERT_EQ(s->value, SAMPLE_VALUE, "sample_value"); + + return 0; +} + +static void ringbuf_n_subtest(void) +{ + struct test_ringbuf_n_lskel *skel_n; + int err, i; + + skel_n = test_ringbuf_n_lskel__open(); + if (!ASSERT_OK_PTR(skel_n, "test_ringbuf_n_lskel__open")) + return; + + skel_n->maps.ringbuf.max_entries = getpagesize(); + skel_n->bss->pid = getpid(); + + err = test_ringbuf_n_lskel__load(skel_n); + if (!ASSERT_OK(err, "test_ringbuf_n_lskel__load")) + goto cleanup; + + ringbuf = ring_buffer__new(skel_n->maps.ringbuf.map_fd, + process_n_sample, NULL, NULL); + if (!ASSERT_OK_PTR(ringbuf, "ring_buffer__new")) + goto cleanup; + + err = test_ringbuf_n_lskel__attach(skel_n); + if (!ASSERT_OK(err, "test_ringbuf_n_lskel__attach")) + goto cleanup_ringbuf; + + /* Produce N_TOT_SAMPLES samples in the ring buffer by calling getpid() */ + skel_n->bss->value = SAMPLE_VALUE; + for (i = 0; i < N_TOT_SAMPLES; i++) + syscall(__NR_getpgid); + + /* Consume all samples from the ring buffer in batches of N_SAMPLES */ + for (i = 0; i < N_TOT_SAMPLES; i += err) { + err = ring_buffer__consume_n(ringbuf, N_SAMPLES); + if (!ASSERT_EQ(err, N_SAMPLES, "rb_consume")) + goto cleanup_ringbuf; + } + +cleanup_ringbuf: + ring_buffer__free(ringbuf); +cleanup: + test_ringbuf_n_lskel__destroy(skel_n); +} + static int process_map_key_sample(void *ctx, void *data, size_t len) { struct sample *s; @@ -384,6 +447,8 @@ void test_ringbuf(void) { if (test__start_subtest("ringbuf")) ringbuf_subtest(); + if (test__start_subtest("ringbuf_n")) + ringbuf_n_subtest(); if (test__start_subtest("ringbuf_map_key")) ringbuf_map_key_subtest(); } diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf_n.c b/tools/testing/selftests/bpf/progs/test_ringbuf_n.c new file mode 100644 index 000000000000..8669eb42dbe0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ringbuf_n.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2024 Andrea Righi + +#include +#include +#include +#include +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +#define TASK_COMM_LEN 16 + +struct sample { + int pid; + long value; + char comm[16]; +}; + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); +} ringbuf SEC(".maps"); + +int pid = 0; +long value = 0; + +SEC("fentry/" SYS_PREFIX "sys_getpgid") +int test_ringbuf_n(void *ctx) +{ + int cur_pid = bpf_get_current_pid_tgid() >> 32; + struct sample *sample; + + if (cur_pid != pid) + return 0; + + sample = bpf_ringbuf_reserve(&ringbuf, sizeof(*sample), 0); + if (!sample) + return 0; + + sample->pid = pid; + sample->value = value; + bpf_get_current_comm(sample->comm, sizeof(sample->comm)); + + bpf_ringbuf_submit(sample, 0); + + return 0; +} From 1479eaff1f16983d8fda7c5a08a586c21891087d Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 23 Apr 2024 18:28:17 -0700 Subject: [PATCH 138/147] bpf: mark bpf_dummy_struct_ops.test_1 parameter as nullable Test case dummy_st_ops/dummy_init_ret_value passes NULL as the first parameter of the test_1() function. Mark this parameter as nullable to make verifier aware of such possibility. Otherwise, NULL check in the test_1() code: SEC("struct_ops/test_1") int BPF_PROG(test_1, struct bpf_dummy_ops_state *state) { if (!state) return ...; ... access state ... } Might be removed by verifier, thus triggering NULL pointer dereference under certain conditions. Reported-by: Jose E. Marchesi Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240424012821.595216-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- net/bpf/bpf_dummy_struct_ops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index 25b75844891a..8f413cdfd91a 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -232,7 +232,7 @@ static void bpf_dummy_unreg(void *kdata) { } -static int bpf_dummy_test_1(struct bpf_dummy_ops_state *cb) +static int bpf_dummy_ops__test_1(struct bpf_dummy_ops_state *cb__nullable) { return 0; } @@ -249,7 +249,7 @@ static int bpf_dummy_test_sleepable(struct bpf_dummy_ops_state *cb) } static struct bpf_dummy_ops __bpf_bpf_dummy_ops = { - .test_1 = bpf_dummy_test_1, + .test_1 = bpf_dummy_ops__test_1, .test_2 = bpf_dummy_test_2, .test_sleepable = bpf_dummy_test_sleepable, }; From 3b3b84aacb4420226576c9732e7b539ca7b79633 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 23 Apr 2024 18:28:18 -0700 Subject: [PATCH 139/147] selftests/bpf: adjust dummy_st_ops_success to detect additional error As reported by Jose E. Marchesi in off-list discussion, GCC and LLVM generate slightly different code for dummy_st_ops_success/test_1(): SEC("struct_ops/test_1") int BPF_PROG(test_1, struct bpf_dummy_ops_state *state) { int ret; if (!state) return 0xf2f3f4f5; ret = state->val; state->val = 0x5a; return ret; } GCC-generated LLVM-generated ---------------------------- --------------------------- 0: r1 = *(u64 *)(r1 + 0x0) 0: w0 = -0xd0c0b0b 1: if r1 == 0x0 goto 5f 1: r1 = *(u64 *)(r1 + 0x0) 2: r0 = *(s32 *)(r1 + 0x0) 2: if r1 == 0x0 goto 6f 3: *(u32 *)(r1 + 0x0) = 0x5a 3: r0 = *(u32 *)(r1 + 0x0) 4: exit 4: w2 = 0x5a 5: r0 = -0xd0c0b0b 5: *(u32 *)(r1 + 0x0) = r2 6: exit 6: exit If the 'state' argument is not marked as nullable in net/bpf/bpf_dummy_struct_ops.c, the verifier would assume that 'r1 == 0x0' is never true: - for the GCC version, this means that instructions #5-6 would be marked as dead and removed; - for the LLVM version, all instructions would be marked as live. The test dummy_st_ops/dummy_init_ret_value actually sets the 'state' parameter to NULL. Therefore, when the 'state' argument is not marked as nullable, the GCC-generated version of the code would trigger a NULL pointer dereference at instruction #3. This patch updates the test_1() test case to always follow a shape similar to the GCC-generated version above, in order to verify whether the 'state' nullability is marked correctly. Reported-by: Jose E. Marchesi Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240424012821.595216-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/dummy_st_ops_success.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c b/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c index 1efa746c25dc..cc7b69b001aa 100644 --- a/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c +++ b/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c @@ -11,8 +11,17 @@ int BPF_PROG(test_1, struct bpf_dummy_ops_state *state) { int ret; - if (!state) - return 0xf2f3f4f5; + /* Check that 'state' nullable status is detected correctly. + * If 'state' argument would be assumed non-null by verifier + * the code below would be deleted as dead (which it shouldn't). + * Hide it from the compiler behind 'asm' block to avoid + * unnecessary optimizations. + */ + asm volatile ( + "if %[state] != 0 goto +2;" + "r0 = 0xf2f3f4f5;" + "exit;" + ::[state]"p"(state)); ret = state->val; state->val = 0x5a; From f612210d456a0b969a0adca91e68dbea0e0ea301 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 23 Apr 2024 18:28:19 -0700 Subject: [PATCH 140/147] selftests/bpf: do not pass NULL for non-nullable params in dummy_st_ops dummy_st_ops.test_2 and dummy_st_ops.test_sleepable do not have their 'state' parameter marked as nullable. Update dummy_st_ops.c to avoid passing NULL for such parameters, as the next patch would allow kernel to enforce this restriction. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240424012821.595216-4-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c | 7 +++++-- tools/testing/selftests/bpf/progs/dummy_st_ops_success.c | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c b/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c index f43fcb13d2c4..dd926c00f414 100644 --- a/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c +++ b/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c @@ -98,7 +98,8 @@ done: static void test_dummy_multiple_args(void) { - __u64 args[5] = {0, -100, 0x8a5f, 'c', 0x1234567887654321ULL}; + struct bpf_dummy_ops_state st = { 7 }; + __u64 args[5] = {(__u64)&st, -100, 0x8a5f, 'c', 0x1234567887654321ULL}; LIBBPF_OPTS(bpf_test_run_opts, attr, .ctx_in = args, .ctx_size_in = sizeof(args), @@ -115,6 +116,7 @@ static void test_dummy_multiple_args(void) fd = bpf_program__fd(skel->progs.test_2); err = bpf_prog_test_run_opts(fd, &attr); ASSERT_OK(err, "test_run"); + args[0] = 7; for (i = 0; i < ARRAY_SIZE(args); i++) { snprintf(name, sizeof(name), "arg %zu", i); ASSERT_EQ(skel->bss->test_2_args[i], args[i], name); @@ -125,7 +127,8 @@ static void test_dummy_multiple_args(void) static void test_dummy_sleepable(void) { - __u64 args[1] = {0}; + struct bpf_dummy_ops_state st; + __u64 args[1] = {(__u64)&st}; LIBBPF_OPTS(bpf_test_run_opts, attr, .ctx_in = args, .ctx_size_in = sizeof(args), diff --git a/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c b/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c index cc7b69b001aa..ec0c595d47af 100644 --- a/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c +++ b/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c @@ -34,7 +34,7 @@ SEC("struct_ops/test_2") int BPF_PROG(test_2, struct bpf_dummy_ops_state *state, int a1, unsigned short a2, char a3, unsigned long a4) { - test_2_args[0] = (unsigned long)state; + test_2_args[0] = state->val; test_2_args[1] = a1; test_2_args[2] = a2; test_2_args[3] = a3; From 980ca8ceeae69ddf362870ea9183f389ae26324a Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 23 Apr 2024 18:28:20 -0700 Subject: [PATCH 141/147] bpf: check bpf_dummy_struct_ops program params for test runs When doing BPF_PROG_TEST_RUN for bpf_dummy_struct_ops programs, reject execution when NULL is passed for non-nullable params. For programs with non-nullable params verifier assumes that such params are never NULL and thus might optimize out NULL checks. Suggested-by: Kui-Feng Lee Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240424012821.595216-5-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- net/bpf/bpf_dummy_struct_ops.c | 51 +++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index 8f413cdfd91a..891cdf61c65a 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -79,6 +79,51 @@ static int dummy_ops_call_op(void *image, struct bpf_dummy_ops_test_args *args) args->args[3], args->args[4]); } +static const struct bpf_ctx_arg_aux *find_ctx_arg_info(struct bpf_prog_aux *aux, int offset) +{ + int i; + + for (i = 0; i < aux->ctx_arg_info_size; i++) + if (aux->ctx_arg_info[i].offset == offset) + return &aux->ctx_arg_info[i]; + + return NULL; +} + +/* There is only one check at the moment: + * - zero should not be passed for pointer parameters not marked as nullable. + */ +static int check_test_run_args(struct bpf_prog *prog, struct bpf_dummy_ops_test_args *args) +{ + const struct btf_type *func_proto = prog->aux->attach_func_proto; + + for (u32 arg_no = 0; arg_no < btf_type_vlen(func_proto) ; ++arg_no) { + const struct btf_param *param = &btf_params(func_proto)[arg_no]; + const struct bpf_ctx_arg_aux *info; + const struct btf_type *t; + int offset; + + if (args->args[arg_no] != 0) + continue; + + /* Program is validated already, so there is no need + * to check if t is NULL. + */ + t = btf_type_skip_modifiers(bpf_dummy_ops_btf, param->type, NULL); + if (!btf_type_is_ptr(t)) + continue; + + offset = btf_ctx_arg_offset(bpf_dummy_ops_btf, func_proto, arg_no); + info = find_ctx_arg_info(prog->aux, offset); + if (info && (info->reg_type & PTR_MAYBE_NULL)) + continue; + + return -EINVAL; + } + + return 0; +} + extern const struct bpf_link_ops bpf_struct_ops_link_lops; int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, @@ -87,7 +132,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, const struct bpf_struct_ops *st_ops = &bpf_bpf_dummy_ops; const struct btf_type *func_proto; struct bpf_dummy_ops_test_args *args; - struct bpf_tramp_links *tlinks; + struct bpf_tramp_links *tlinks = NULL; struct bpf_tramp_link *link = NULL; void *image = NULL; unsigned int op_idx; @@ -109,6 +154,10 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, if (IS_ERR(args)) return PTR_ERR(args); + err = check_test_run_args(prog, args); + if (err) + goto out; + tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL); if (!tlinks) { err = -ENOMEM; From 6a2d30d3c5bf9f088dcfd5f3746b04d84f2fab83 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 23 Apr 2024 18:28:21 -0700 Subject: [PATCH 142/147] selftests/bpf: dummy_st_ops should reject 0 for non-nullable params Check if BPF_PROG_TEST_RUN for bpf_dummy_struct_ops programs rejects execution if NULL is passed for non-nullable parameter. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240424012821.595216-6-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/dummy_st_ops.c | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c b/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c index dd926c00f414..d3d94596ab79 100644 --- a/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c +++ b/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c @@ -147,6 +147,31 @@ static void test_dummy_sleepable(void) dummy_st_ops_success__destroy(skel); } +/* dummy_st_ops.test_sleepable() parameter is not marked as nullable, + * thus bpf_prog_test_run_opts() below should be rejected as it tries + * to pass NULL for this parameter. + */ +static void test_dummy_sleepable_reject_null(void) +{ + __u64 args[1] = {0}; + LIBBPF_OPTS(bpf_test_run_opts, attr, + .ctx_in = args, + .ctx_size_in = sizeof(args), + ); + struct dummy_st_ops_success *skel; + int fd, err; + + skel = dummy_st_ops_success__open_and_load(); + if (!ASSERT_OK_PTR(skel, "dummy_st_ops_load")) + return; + + fd = bpf_program__fd(skel->progs.test_sleepable); + err = bpf_prog_test_run_opts(fd, &attr); + ASSERT_EQ(err, -EINVAL, "test_run"); + + dummy_st_ops_success__destroy(skel); +} + void test_dummy_st_ops(void) { if (test__start_subtest("dummy_st_ops_attach")) @@ -159,6 +184,8 @@ void test_dummy_st_ops(void) test_dummy_multiple_args(); if (test__start_subtest("dummy_sleepable")) test_dummy_sleepable(); + if (test__start_subtest("dummy_sleepable_reject_null")) + test_dummy_sleepable_reject_null(); RUN_TESTS(dummy_st_ops_fail); } From 48e2cd3e3dcfe04f212df4fb189fa04c2a87b980 Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Fri, 26 Apr 2024 00:17:23 +0800 Subject: [PATCH 143/147] bpf: add mrtt and srtt as BPF_SOCK_OPS_RTT_CB args Two important arguments in RTT estimation, mrtt and srtt, are passed to tcp_bpf_rtt(), so that bpf programs get more information about RTT computation in BPF_SOCK_OPS_RTT_CB. The difference between bpf_sock_ops->srtt_us and the srtt here is: the former is an old rtt before update, while srtt passed by tcp_bpf_rtt() is that after update. Signed-off-by: Philo Lu Link: https://lore.kernel.org/r/20240425161724.73707-2-lulie@linux.alibaba.com Signed-off-by: Martin KaFai Lau --- include/net/tcp.h | 4 ++-- include/uapi/linux/bpf.h | 2 ++ net/ipv4/tcp_input.c | 4 ++-- tools/include/uapi/linux/bpf.h | 2 ++ 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 6ae35199d3b3..0f75d03287c2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2706,10 +2706,10 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1); } -static inline void tcp_bpf_rtt(struct sock *sk) +static inline void tcp_bpf_rtt(struct sock *sk, long mrtt, u32 srtt) { if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_RTT_CB_FLAG)) - tcp_call_bpf(sk, BPF_SOCK_OPS_RTT_CB, 0, NULL); + tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_RTT_CB, mrtt, srtt); } #if IS_ENABLED(CONFIG_SMC) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e4ae83550fb3..d94a72593ead 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6947,6 +6947,8 @@ enum { * socket transition to LISTEN state. */ BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. + * Arg1: measured RTT input (mrtt) + * Arg2: updated srtt */ BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option. * It will be called to handle diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5d874817a78d..d1115d7c3936 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -911,7 +911,7 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tp->rtt_seq = tp->snd_nxt; tp->mdev_max_us = tcp_rto_min_us(sk); - tcp_bpf_rtt(sk); + tcp_bpf_rtt(sk, mrtt_us, srtt); } } else { /* no previous measure. */ @@ -921,7 +921,7 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tp->mdev_max_us = tp->rttvar_us; tp->rtt_seq = tp->snd_nxt; - tcp_bpf_rtt(sk); + tcp_bpf_rtt(sk, mrtt_us, srtt); } tp->srtt_us = max(1U, srtt); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e4ae83550fb3..d94a72593ead 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6947,6 +6947,8 @@ enum { * socket transition to LISTEN state. */ BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. + * Arg1: measured RTT input (mrtt) + * Arg2: updated srtt */ BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option. * It will be called to handle From 7eb4f66b38069eec9c86c9d115f0bba1cf73ef2c Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Fri, 26 Apr 2024 00:17:24 +0800 Subject: [PATCH 144/147] selftests/bpf: extend BPF_SOCK_OPS_RTT_CB test for srtt and mrtt_us Because srtt and mrtt_us are added as args in bpf_sock_ops at BPF_SOCK_OPS_RTT_CB, a simple check is added to make sure they are both non-zero. $ ./test_progs -t tcp_rtt #373 tcp_rtt:OK Summary: 1/0 PASSED, 0 SKIPPED, 0 FAILED Suggested-by: Stanislav Fomichev Signed-off-by: Philo Lu Link: https://lore.kernel.org/r/20240425161724.73707-3-lulie@linux.alibaba.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/tcp_rtt.c | 14 ++++++++++++++ tools/testing/selftests/bpf/progs/tcp_rtt.c | 6 ++++++ 2 files changed, 20 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c index 8fe84da1b9b4..f2b99d95d916 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c +++ b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c @@ -10,6 +10,9 @@ struct tcp_rtt_storage { __u32 delivered; __u32 delivered_ce; __u32 icsk_retransmits; + + __u32 mrtt_us; /* args[0] */ + __u32 srtt; /* args[1] */ }; static void send_byte(int fd) @@ -83,6 +86,17 @@ static int verify_sk(int map_fd, int client_fd, const char *msg, __u32 invoked, err++; } + /* Precise values of mrtt and srtt are unavailable, just make sure they are nonzero */ + if (val.mrtt_us == 0) { + log_err("%s: unexpected bpf_tcp_sock.args[0] (mrtt_us) %u == 0", msg, val.mrtt_us); + err++; + } + + if (val.srtt == 0) { + log_err("%s: unexpected bpf_tcp_sock.args[1] (srtt) %u == 0", msg, val.srtt); + err++; + } + return err; } diff --git a/tools/testing/selftests/bpf/progs/tcp_rtt.c b/tools/testing/selftests/bpf/progs/tcp_rtt.c index 0988d79f1587..42c729f85524 100644 --- a/tools/testing/selftests/bpf/progs/tcp_rtt.c +++ b/tools/testing/selftests/bpf/progs/tcp_rtt.c @@ -10,6 +10,9 @@ struct tcp_rtt_storage { __u32 delivered; __u32 delivered_ce; __u32 icsk_retransmits; + + __u32 mrtt_us; /* args[0] */ + __u32 srtt; /* args[1] */ }; struct { @@ -55,5 +58,8 @@ int _sockops(struct bpf_sock_ops *ctx) storage->delivered_ce = tcp_sk->delivered_ce; storage->icsk_retransmits = tcp_sk->icsk_retransmits; + storage->mrtt_us = ctx->args[0]; + storage->srtt = ctx->args[1]; + return 1; } From e51b907d40329d4b4517a155e0bc0bf593d6767d Mon Sep 17 00:00:00 2001 From: Dave Thaler Date: Mon, 22 Apr 2024 12:09:42 -0700 Subject: [PATCH 145/147] bpf, docs: Add introduction for use in the ISA Internet Draft The proposed intro paragraph text is derived from the first paragraph of the IETF BPF WG charter at https://datatracker.ietf.org/wg/bpf/about/ Signed-off-by: Dave Thaler Acked-by: David Vernet Link: https://lore.kernel.org/r/20240422190942.24658-1-dthaler1968@gmail.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/standardization/instruction-set.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Documentation/bpf/standardization/instruction-set.rst b/Documentation/bpf/standardization/instruction-set.rst index d03d90afbd7d..b44bdacd0195 100644 --- a/Documentation/bpf/standardization/instruction-set.rst +++ b/Documentation/bpf/standardization/instruction-set.rst @@ -5,7 +5,11 @@ BPF Instruction Set Architecture (ISA) ====================================== -This document specifies the BPF instruction set architecture (ISA). +eBPF (which is no longer an acronym for anything), also commonly +referred to as BPF, is a technology with origins in the Linux kernel +that can run untrusted programs in a privileged context such as an +operating system kernel. This document specifies the BPF instruction +set architecture (ISA). Documentation conventions ========================= From 6e25bcf06af0341691f7058e17e04800f6a19e26 Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Fri, 26 Apr 2024 16:51:58 +0200 Subject: [PATCH 146/147] bpf_helpers.h: Define bpf_tail_call_static when building with GCC The definition of bpf_tail_call_static in tools/lib/bpf/bpf_helpers.h is guarded by a preprocessor check to assure that clang is recent enough to support it. This patch updates the guard so the function is compiled when using GCC 13 or later as well. Tested in bpf-next master. No regressions. Signed-off-by: Jose E. Marchesi Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240426145158.14409-1-jose.marchesi@oracle.com --- tools/lib/bpf/bpf_helpers.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index cd17f6d0791f..62e1c0cc4a59 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -137,7 +137,8 @@ /* * Helper function to perform a tail call with a constant/immediate map slot. */ -#if __clang_major__ >= 8 && defined(__bpf__) +#if (defined(__clang__) && __clang_major__ >= 8) || (!defined(__clang__) && __GNUC__ > 12) +#if defined(__bpf__) static __always_inline void bpf_tail_call_static(void *ctx, const void *map, const __u32 slot) { @@ -165,6 +166,7 @@ bpf_tail_call_static(void *ctx, const void *map, const __u32 slot) : "r0", "r1", "r2", "r3", "r4", "r5"); } #endif +#endif enum libbpf_pin_type { LIBBPF_PIN_NONE, From 07801a24e2f18624cd2400ce15f14569eb416c9a Mon Sep 17 00:00:00 2001 From: Dave Thaler Date: Fri, 26 Apr 2024 16:11:26 -0700 Subject: [PATCH 147/147] bpf, docs: Clarify PC use in instruction-set.rst This patch elaborates on the use of PC by expanding the PC acronym, explaining the units, and the relative position to which the offset applies. Signed-off-by: Dave Thaler Signed-off-by: Daniel Borkmann Reviewed-by: David Vernet Link: https://lore.kernel.org/bpf/20240426231126.5130-1-dthaler1968@gmail.com --- Documentation/bpf/standardization/instruction-set.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/bpf/standardization/instruction-set.rst b/Documentation/bpf/standardization/instruction-set.rst index b44bdacd0195..997560abadab 100644 --- a/Documentation/bpf/standardization/instruction-set.rst +++ b/Documentation/bpf/standardization/instruction-set.rst @@ -469,6 +469,12 @@ JSLT 0xc any PC += offset if dst < src signed JSLE 0xd any PC += offset if dst <= src signed ======== ===== ======= ================================= =================================================== +where 'PC' denotes the program counter, and the offset to increment by +is in units of 64-bit instructions relative to the instruction following +the jump instruction. Thus 'PC += 1' skips execution of the next +instruction if it's a basic instruction or results in undefined behavior +if the next instruction is a 128-bit wide instruction. + The BPF program needs to store the return value into register R0 before doing an ``EXIT``.