From 203d7b054fc719561fe258e46e280930890dceaf Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 1 Sep 2020 19:31:12 -0700 Subject: [PATCH 01/95] bpf: Avoid iterating duplicated files for task_file iterator Currently, task_file iterator iterates all files from all tasks. This may potentially visit a lot of duplicated files if there are many tasks sharing the same files, e.g., typical pthreads where these pthreads and the main thread are sharing the same files. This patch changed task_file iterator to skip a particular task if that task shares the same files as its group_leader (the task having the same tgid and also task->tgid == task->pid). This will preserve the same result, visiting all files from all tasks, and will reduce runtime cost significantl, e.g., if there are a lot of pthreads and the process has a lot of open files. Suggested-by: Andrii Nakryiko Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Reviewed-by: Josef Bacik Link: https://lore.kernel.org/bpf/20200902023112.1672792-1-yhs@fb.com --- kernel/bpf/task_iter.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 99af4cea1102..5b6af30bfbcd 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -22,7 +22,8 @@ struct bpf_iter_seq_task_info { }; static struct task_struct *task_seq_get_next(struct pid_namespace *ns, - u32 *tid) + u32 *tid, + bool skip_if_dup_files) { struct task_struct *task = NULL; struct pid *pid; @@ -36,6 +37,12 @@ retry: if (!task) { ++*tid; goto retry; + } else if (skip_if_dup_files && task->tgid != task->pid && + task->files == task->group_leader->files) { + put_task_struct(task); + task = NULL; + ++*tid; + goto retry; } } rcu_read_unlock(); @@ -48,7 +55,7 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos) struct bpf_iter_seq_task_info *info = seq->private; struct task_struct *task; - task = task_seq_get_next(info->common.ns, &info->tid); + task = task_seq_get_next(info->common.ns, &info->tid, false); if (!task) return NULL; @@ -65,7 +72,7 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++*pos; ++info->tid; put_task_struct((struct task_struct *)v); - task = task_seq_get_next(info->common.ns, &info->tid); + task = task_seq_get_next(info->common.ns, &info->tid, false); if (!task) return NULL; @@ -148,7 +155,7 @@ again: curr_files = *fstruct; curr_fd = info->fd; } else { - curr_task = task_seq_get_next(ns, &curr_tid); + curr_task = task_seq_get_next(ns, &curr_tid, true); if (!curr_task) return NULL; From 858e8b2eb4dd6fe6c4640463ad2f2ed3b8249ad4 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 1 Sep 2020 19:31:13 -0700 Subject: [PATCH 02/95] selftests/bpf: Test task_file iterator without visiting pthreads Modified existing bpf_iter_test_file.c program to check whether all accessed files from the main thread or not. Modified existing bpf_iter_test_file program to check whether all accessed files from the main thread or not. $ ./test_progs -n 4 ... #4/7 task_file:OK ... #4 bpf_iter:OK Summary: 1/24 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200902023113.1672863-1-yhs@fb.com --- .../selftests/bpf/prog_tests/bpf_iter.c | 21 +++++++++++++++++++ .../selftests/bpf/progs/bpf_iter_task_file.c | 10 ++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c index 7375d9a6d242..fe1a83b9875c 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -132,17 +132,38 @@ static void test_task_stack(void) bpf_iter_task_stack__destroy(skel); } +static void *do_nothing(void *arg) +{ + pthread_exit(arg); +} + static void test_task_file(void) { struct bpf_iter_task_file *skel; + pthread_t thread_id; + void *ret; skel = bpf_iter_task_file__open_and_load(); if (CHECK(!skel, "bpf_iter_task_file__open_and_load", "skeleton open_and_load failed\n")) return; + skel->bss->tgid = getpid(); + + if (CHECK(pthread_create(&thread_id, NULL, &do_nothing, NULL), + "pthread_create", "pthread_create failed\n")) + goto done; + do_dummy_read(skel->progs.dump_task_file); + if (CHECK(pthread_join(thread_id, &ret) || ret != NULL, + "pthread_join", "pthread_join failed\n")) + goto done; + + CHECK(skel->bss->count != 0, "check_count", + "invalid non pthread file visit count %d\n", skel->bss->count); + +done: bpf_iter_task_file__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c index 8b787baa2654..b2f7c7c5f952 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c @@ -6,6 +6,9 @@ char _license[] SEC("license") = "GPL"; +int count = 0; +int tgid = 0; + SEC("iter/task_file") int dump_task_file(struct bpf_iter__task_file *ctx) { @@ -17,8 +20,13 @@ int dump_task_file(struct bpf_iter__task_file *ctx) if (task == (void *)0 || file == (void *)0) return 0; - if (ctx->meta->seq_num == 0) + if (ctx->meta->seq_num == 0) { + count = 0; BPF_SEQ_PRINTF(seq, " tgid gid fd file\n"); + } + + if (tgid == task->tgid && task->tgid != task->pid) + count++; BPF_SEQ_PRINTF(seq, "%8d %8d %8d %lx\n", task->tgid, task->pid, fd, (long)file->f_op); From 53ea2076d851ee37e4f3954c5ae569439b138248 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 Sep 2020 10:52:23 +0200 Subject: [PATCH 03/95] xsk: Fix possible segfault in xsk umem diagnostics Fix possible segfault in the xsk diagnostics code when dumping information about the umem. This can happen when a umem has been created, but the socket has not been bound yet. In this case, the xsk buffer pool does not exist yet and we cannot dump the information that was moved from the umem to the buffer pool. Fix this by testing for the existence of the buffer pool and if not there, do not dump any of that information. Fixes: c2d3d6a47462 ("xsk: Move queue_id, dev and need_wakeup to buffer pool") Fixes: 7361f9c3d719 ("xsk: Move fill and completion rings to buffer pool") Reported-by: syzbot+3f04d36b7336f7868066@syzkaller.appspotmail.com Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/1599036743-26454-1-git-send-email-magnus.karlsson@intel.com --- net/xdp/xsk_diag.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c index 5bd8ea9d206a..c014217f5fa7 100644 --- a/net/xdp/xsk_diag.c +++ b/net/xdp/xsk_diag.c @@ -59,22 +59,20 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb) du.num_pages = umem->npgs; du.chunk_size = umem->chunk_size; du.headroom = umem->headroom; - du.ifindex = pool->netdev ? pool->netdev->ifindex : 0; - du.queue_id = pool->queue_id; + du.ifindex = (pool && pool->netdev) ? pool->netdev->ifindex : 0; + du.queue_id = pool ? pool->queue_id : 0; du.flags = 0; if (umem->zc) du.flags |= XDP_DU_F_ZEROCOPY; du.refs = refcount_read(&umem->users); err = nla_put(nlskb, XDP_DIAG_UMEM, sizeof(du), &du); - - if (!err && pool->fq) + if (!err && pool && pool->fq) err = xsk_diag_put_ring(pool->fq, XDP_DIAG_UMEM_FILL_RING, nlskb); - if (!err && pool->cq) { - err = xsk_diag_put_ring(pool->cq, XDP_DIAG_UMEM_COMPLETION_RING, - nlskb); - } + if (!err && pool && pool->cq) + err = xsk_diag_put_ring(pool->cq, + XDP_DIAG_UMEM_COMPLETION_RING, nlskb); return err; } From 968be23ceaca1f402dfad0a30a8da4649ee32940 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 Sep 2020 11:06:09 +0200 Subject: [PATCH 04/95] xsk: Fix possible segfault at xskmap entry insertion Fix possible segfault when entry is inserted into xskmap. This can happen if the socket is in a state where the umem has been set up, the Rx ring created but it has yet to be bound to a device. In this case the pool has not yet been created and we cannot reference it for the existence of the fill ring. Fix this by removing the whole xsk_is_setup_for_bpf_map function. Once upon a time, it was used to make sure that the Rx and fill rings where set up before the driver could call xsk_rcv, since there are no tests for the existence of these rings in the data path. But these days, we have a state variable that we test instead. When it is XSK_BOUND, everything has been set up correctly and the socket has been bound. So no reason to have the xsk_is_setup_for_bpf_map function anymore. Fixes: 7361f9c3d719 ("xsk: Move fill and completion rings to buffer pool") Reported-by: syzbot+febe51d44243fbc564ee@syzkaller.appspotmail.com Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/1599037569-26690-1-git-send-email-magnus.karlsson@intel.com --- net/xdp/xsk.c | 6 ------ net/xdp/xsk.h | 1 - net/xdp/xskmap.c | 5 ----- 3 files changed, 12 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 5eb6662f562a..07c32276c527 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -33,12 +33,6 @@ static DEFINE_PER_CPU(struct list_head, xskmap_flush_list); -bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) -{ - return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) && - (xs->pool->fq || READ_ONCE(xs->fq_tmp)); -} - void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) { if (pool->cached_need_wakeup & XDP_WAKEUP_RX) diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h index da1f73e43924..b9e896cee5bb 100644 --- a/net/xdp/xsk.h +++ b/net/xdp/xsk.h @@ -39,7 +39,6 @@ static inline struct xdp_sock *xdp_sk(struct sock *sk) return (struct xdp_sock *)sk; } -bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, struct xdp_sock **map_entry); int xsk_map_inc(struct xsk_map *map); diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 2a4fd6677155..0c5df593bc56 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -185,11 +185,6 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, xs = (struct xdp_sock *)sock->sk; - if (!xsk_is_setup_for_bpf_map(xs)) { - sockfd_put(sock); - return -EOPNOTSUPP; - } - map_entry = &m->xsk_map[i]; node = xsk_map_node_alloc(m, map_entry); if (IS_ERR(node)) { From 1d6fd78a213ee3874f46bdce083b7a41d208886d Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 2 Sep 2020 10:07:50 -0500 Subject: [PATCH 05/95] xsk: Fix null check on error return path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, dma_map is being checked, when the right object identifier to be null-checked is dma_map->dma_pages, instead. Fix this by null-checking dma_map->dma_pages. Fixes: 921b68692abb ("xsk: Enable sharing of dma mappings") Addresses-Coverity-ID: 1496811 ("Logically dead code") Signed-off-by: Gustavo A. R. Silva Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20200902150750.GA7257@embeddedor --- net/xdp/xsk_buff_pool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 795d7c81c0ca..5b00bc5707f2 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -287,7 +287,7 @@ static struct xsk_dma_map *xp_create_dma_map(struct device *dev, struct net_devi return NULL; dma_map->dma_pages = kvcalloc(nr_pages, sizeof(*dma_map->dma_pages), GFP_KERNEL); - if (!dma_map) { + if (!dma_map->dma_pages) { kfree(dma_map); return NULL; } From 83cf5c68d663fc78ce529c41bf24f9f6be88bef4 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 Sep 2020 09:36:04 +0200 Subject: [PATCH 06/95] xsk: Fix use-after-free in failed shared_umem bind Fix use-after-free when a shared umem bind fails. The code incorrectly tried to free the allocated buffer pool both in the bind code and then later also when the socket was released. Fix this by setting the buffer pool pointer to NULL after the bind code has freed the pool, so that the socket release code will not try to free the pool. This is the same solution as the regular, non-shared umem code path has. This was missing from the shared umem path. Fixes: b5aea28dca13 ("xsk: Add shared umem support between queue ids") Reported-by: syzbot+5334f62e4d22804e646a@syzkaller.appspotmail.com Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/1599032164-25684-1-git-send-email-magnus.karlsson@intel.com --- net/xdp/xsk.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 07c32276c527..3895697f8540 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -711,6 +711,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) dev, qid); if (err) { xp_destroy(xs->pool); + xs->pool = NULL; sockfd_put(sock); goto out_unlock; } From 0201c575831171292489f14a8b6f79f98936b4d1 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Sep 2020 13:35:29 -0700 Subject: [PATCH 07/95] libbpf: Ensure ELF symbols table is found before further ELF processing libbpf ELF parsing logic might need symbols available before ELF parsing is completed, so we need to make sure that symbols table section is found in a separate pass before all the subsequent sections are processed. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200903203542.15944-2-andriin@fb.com --- tools/lib/bpf/libbpf.c | 40 +++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index b688aadf09c5..ac56d4db6d3e 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -2720,14 +2720,38 @@ static int bpf_object__elf_collect(struct bpf_object *obj) Elf *elf = obj->efile.elf; Elf_Data *btf_ext_data = NULL; Elf_Data *btf_data = NULL; - Elf_Scn *scn = NULL; int idx = 0, err = 0; + const char *name; + Elf_Data *data; + Elf_Scn *scn; + GElf_Shdr sh; + /* a bunch of ELF parsing functionality depends on processing symbols, + * so do the first pass and find the symbol table + */ + scn = NULL; while ((scn = elf_nextscn(elf, scn)) != NULL) { - const char *name; - GElf_Shdr sh; - Elf_Data *data; + if (elf_sec_hdr(obj, scn, &sh)) + return -LIBBPF_ERRNO__FORMAT; + if (sh.sh_type == SHT_SYMTAB) { + if (obj->efile.symbols) { + pr_warn("elf: multiple symbol tables in %s\n", obj->path); + return -LIBBPF_ERRNO__FORMAT; + } + + data = elf_sec_data(obj, scn); + if (!data) + return -LIBBPF_ERRNO__FORMAT; + + obj->efile.symbols = data; + obj->efile.symbols_shndx = elf_ndxscn(scn); + obj->efile.strtabidx = sh.sh_link; + } + } + + scn = NULL; + while ((scn = elf_nextscn(elf, scn)) != NULL) { idx++; if (elf_sec_hdr(obj, scn, &sh)) @@ -2766,13 +2790,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj) } else if (strcmp(name, BTF_EXT_ELF_SEC) == 0) { btf_ext_data = data; } else if (sh.sh_type == SHT_SYMTAB) { - if (obj->efile.symbols) { - pr_warn("elf: multiple symbol tables in %s\n", obj->path); - return -LIBBPF_ERRNO__FORMAT; - } - obj->efile.symbols = data; - obj->efile.symbols_shndx = idx; - obj->efile.strtabidx = sh.sh_link; + /* already processed during the first pass above */ } else if (sh.sh_type == SHT_PROGBITS && data->d_size > 0) { if (sh.sh_flags & SHF_EXECINSTR) { if (strcmp(name, ".text") == 0) From c112239272c6bacf7fcd0a0dc71999ad8eb1c55c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Sep 2020 13:35:30 -0700 Subject: [PATCH 08/95] libbpf: Parse multi-function sections into multiple BPF programs Teach libbpf how to parse code sections into potentially multiple bpf_program instances, based on ELF FUNC symbols. Each BPF program will keep track of its position within containing ELF section for translating section instruction offsets into program instruction offsets: regardless of BPF program's location in ELF section, it's first instruction is always at local instruction offset 0, so when libbpf is working with relocations (which use section-based instruction offsets) this is critical to make proper translations. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200903203542.15944-3-andriin@fb.com --- tools/lib/bpf/libbpf.c | 261 +++++++++++++++++++++++------------------ 1 file changed, 148 insertions(+), 113 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index ac56d4db6d3e..57f87eee5be5 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -217,20 +217,45 @@ struct bpf_sec_def { * linux/filter.h. */ struct bpf_program { - /* Index in elf obj file, for relocation use. */ - int idx; - char *name; - int prog_ifindex; - char *section_name; const struct bpf_sec_def *sec_def; + char *section_name; + size_t sec_idx; + /* this program's instruction offset (in number of instructions) + * within its containing ELF section + */ + size_t sec_insn_off; + /* number of original instructions in ELF section belonging to this + * program, not taking into account subprogram instructions possible + * appended later during relocation + */ + size_t sec_insn_cnt; + /* Offset (in number of instructions) of the start of instruction + * belonging to this BPF program within its containing main BPF + * program. For the entry-point (main) BPF program, this is always + * zero. For a sub-program, this gets reset before each of main BPF + * programs are processed and relocated and is used to determined + * whether sub-program was already appended to the main program, and + * if yes, at which instruction offset. + */ + size_t sub_insn_off; + + char *name; /* section_name with / replaced by _; makes recursive pinning * in bpf_object__pin_programs easier */ char *pin_name; + + /* instructions that belong to BPF program; insns[0] is located at + * sec_insn_off instruction within its ELF section in ELF file, so + * when mapping ELF file instruction index to the local instruction, + * one needs to subtract sec_insn_off; and vice versa. + */ struct bpf_insn *insns; + /* actual number of instruction in this BPF program's image; for + * entry-point BPF programs this includes the size of main program + * itself plus all the used sub-programs, appended at the end + */ size_t insns_cnt, main_prog_cnt; - enum bpf_prog_type type; - bool load; struct reloc_desc *reloc_desc; int nr_reloc; @@ -246,7 +271,10 @@ struct bpf_program { void *priv; bpf_program_clear_priv_t clear_priv; + bool load; + enum bpf_prog_type type; enum bpf_attach_type expected_attach_type; + int prog_ifindex; __u32 attach_btf_id; __u32 attach_prog_fd; void *func_info; @@ -446,6 +474,8 @@ static Elf_Scn *elf_sec_by_name(const struct bpf_object *obj, const char *name); static int elf_sec_hdr(const struct bpf_object *obj, Elf_Scn *scn, GElf_Shdr *hdr); static const char *elf_sec_name(const struct bpf_object *obj, Elf_Scn *scn); static Elf_Data *elf_sec_data(const struct bpf_object *obj, Elf_Scn *scn); +static int elf_sym_by_sec_off(const struct bpf_object *obj, size_t sec_idx, + size_t off, __u32 sym_type, GElf_Sym *sym); void bpf_program__unload(struct bpf_program *prog) { @@ -493,7 +523,7 @@ static void bpf_program__exit(struct bpf_program *prog) prog->nr_reloc = 0; prog->insns_cnt = 0; - prog->idx = -1; + prog->sec_idx = -1; } static char *__bpf_program__pin_name(struct bpf_program *prog) @@ -508,130 +538,118 @@ static char *__bpf_program__pin_name(struct bpf_program *prog) } static int -bpf_program__init(void *data, size_t size, const char *section_name, int idx, - struct bpf_program *prog) +bpf_program__init(struct bpf_program *prog, const char *name, + size_t sec_idx, const char *sec_name, size_t sec_off, + void *insn_data, size_t insn_data_sz) { - const size_t bpf_insn_sz = sizeof(struct bpf_insn); - - if (size == 0 || size % bpf_insn_sz) { - pr_warn("corrupted section '%s', size: %zu\n", - section_name, size); + if (insn_data_sz == 0 || insn_data_sz % BPF_INSN_SZ || sec_off % BPF_INSN_SZ) { + pr_warn("sec '%s': corrupted program '%s', offset %zu, size %zu\n", + sec_name, name, sec_off, insn_data_sz); return -EINVAL; } - memset(prog, 0, sizeof(*prog)); + prog->sec_idx = sec_idx; + prog->sec_insn_off = sec_off / BPF_INSN_SZ; + prog->sec_insn_cnt = insn_data_sz / BPF_INSN_SZ; + /* insns_cnt can later be increased by appending used subprograms */ + prog->insns_cnt = prog->sec_insn_cnt; - prog->section_name = strdup(section_name); - if (!prog->section_name) { - pr_warn("failed to alloc name for prog under section(%d) %s\n", - idx, section_name); - goto errout; - } - - prog->pin_name = __bpf_program__pin_name(prog); - if (!prog->pin_name) { - pr_warn("failed to alloc pin name for prog under section(%d) %s\n", - idx, section_name); - goto errout; - } - - prog->insns = malloc(size); - if (!prog->insns) { - pr_warn("failed to alloc insns for prog under section %s\n", - section_name); - goto errout; - } - prog->insns_cnt = size / bpf_insn_sz; - memcpy(prog->insns, data, size); - prog->idx = idx; - prog->instances.fds = NULL; - prog->instances.nr = -1; prog->type = BPF_PROG_TYPE_UNSPEC; prog->load = true; + prog->instances.fds = NULL; + prog->instances.nr = -1; + + prog->section_name = strdup(sec_name); + if (!prog->section_name) + goto errout; + + prog->name = strdup(name); + if (!prog->name) + goto errout; + + prog->pin_name = __bpf_program__pin_name(prog); + if (!prog->pin_name) + goto errout; + + prog->insns = malloc(insn_data_sz); + if (!prog->insns) + goto errout; + memcpy(prog->insns, insn_data, insn_data_sz); + return 0; errout: + pr_warn("sec '%s': failed to allocate memory for prog '%s'\n", sec_name, name); bpf_program__exit(prog); return -ENOMEM; } static int -bpf_object__add_program(struct bpf_object *obj, void *data, size_t size, - const char *section_name, int idx) +bpf_object__add_programs(struct bpf_object *obj, Elf_Data *sec_data, + const char *sec_name, int sec_idx) { - struct bpf_program prog, *progs; + struct bpf_program *prog, *progs; + void *data = sec_data->d_buf; + size_t sec_sz = sec_data->d_size, sec_off, prog_sz; int nr_progs, err; - - err = bpf_program__init(data, size, section_name, idx, &prog); - if (err) - return err; + const char *name; + GElf_Sym sym; progs = obj->programs; nr_progs = obj->nr_programs; + sec_off = 0; - progs = libbpf_reallocarray(progs, nr_progs + 1, sizeof(progs[0])); - if (!progs) { - /* - * In this case the original obj->programs - * is still valid, so don't need special treat for - * bpf_close_object(). - */ - pr_warn("failed to alloc a new program under section '%s'\n", - section_name); - bpf_program__exit(&prog); - return -ENOMEM; - } - - pr_debug("elf: found program '%s'\n", prog.section_name); - obj->programs = progs; - obj->nr_programs = nr_progs + 1; - prog.obj = obj; - progs[nr_progs] = prog; - return 0; -} - -static int -bpf_object__init_prog_names(struct bpf_object *obj) -{ - Elf_Data *symbols = obj->efile.symbols; - struct bpf_program *prog; - size_t pi, si; - - for (pi = 0; pi < obj->nr_programs; pi++) { - const char *name = NULL; - - prog = &obj->programs[pi]; - - for (si = 0; si < symbols->d_size / sizeof(GElf_Sym) && !name; si++) { - GElf_Sym sym; - - if (!gelf_getsym(symbols, si, &sym)) - continue; - if (sym.st_shndx != prog->idx) - continue; - if (GELF_ST_BIND(sym.st_info) != STB_GLOBAL) - continue; - - name = elf_sym_str(obj, sym.st_name); - if (!name) { - pr_warn("prog '%s': failed to get symbol name\n", - prog->section_name); - return -LIBBPF_ERRNO__LIBELF; - } + while (sec_off < sec_sz) { + if (elf_sym_by_sec_off(obj, sec_idx, sec_off, STT_FUNC, &sym)) { + pr_warn("sec '%s': failed to find program symbol at offset %zu\n", + sec_name, sec_off); + return -LIBBPF_ERRNO__FORMAT; } - if (!name && prog->idx == obj->efile.text_shndx) - name = ".text"; + prog_sz = sym.st_size; + name = elf_sym_str(obj, sym.st_name); if (!name) { - pr_warn("prog '%s': failed to find program symbol\n", - prog->section_name); - return -EINVAL; + pr_warn("sec '%s': failed to get symbol name for offset %zu\n", + sec_name, sec_off); + return -LIBBPF_ERRNO__FORMAT; } - prog->name = strdup(name); - if (!prog->name) + if (sec_off + prog_sz > sec_sz) { + pr_warn("sec '%s': program at offset %zu crosses section boundary\n", + sec_name, sec_off); + return -LIBBPF_ERRNO__FORMAT; + } + + pr_debug("sec '%s': found program '%s' at offset %zu, code size %zu bytes\n", + sec_name, name, sec_off, prog_sz); + + progs = reallocarray(progs, nr_progs + 1, sizeof(*progs)); + if (!progs) { + /* + * In this case the original obj->programs + * is still valid, so don't need special treat for + * bpf_close_object(). + */ + pr_warn("sec '%s': failed to alloc memory for new program '%s'\n", + sec_name, name); return -ENOMEM; + } + obj->programs = progs; + + prog = &progs[nr_progs]; + memset(prog, 0, sizeof(*prog)); + prog->obj = obj; + + err = bpf_program__init(prog, name, sec_idx, sec_name, sec_off, + data + sec_off, prog_sz); + if (err) + return err; + + nr_progs++; + obj->nr_programs = nr_progs; + + sec_off += prog_sz; } return 0; @@ -2675,6 +2693,26 @@ static Elf_Data *elf_sec_data(const struct bpf_object *obj, Elf_Scn *scn) return data; } +static int elf_sym_by_sec_off(const struct bpf_object *obj, size_t sec_idx, + size_t off, __u32 sym_type, GElf_Sym *sym) +{ + Elf_Data *symbols = obj->efile.symbols; + size_t n = symbols->d_size / sizeof(GElf_Sym); + int i; + + for (i = 0; i < n; i++) { + if (!gelf_getsym(symbols, i, sym)) + continue; + if (sym->st_shndx != sec_idx || sym->st_value != off) + continue; + if (GELF_ST_TYPE(sym->st_info) != sym_type) + continue; + return 0; + } + + return -ENOENT; +} + static bool is_sec_name_dwarf(const char *name) { /* approximation, but the actual list is too long */ @@ -2795,9 +2833,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj) if (sh.sh_flags & SHF_EXECINSTR) { if (strcmp(name, ".text") == 0) obj->efile.text_shndx = idx; - err = bpf_object__add_program(obj, data->d_buf, - data->d_size, - name, idx); + err = bpf_object__add_programs(obj, data, name, idx); if (err) return err; } else if (strcmp(name, DATA_SEC) == 0) { @@ -3183,7 +3219,7 @@ bpf_object__find_prog_by_idx(struct bpf_object *obj, int idx) for (i = 0; i < obj->nr_programs; i++) { prog = &obj->programs[i]; - if (prog->idx == idx) + if (prog->sec_idx == idx) return prog; } return NULL; @@ -5660,7 +5696,7 @@ bpf_program__reloc_text(struct bpf_program *prog, struct bpf_object *obj, size_t new_cnt; int err; - if (prog->idx != obj->efile.text_shndx && prog->main_prog_cnt == 0) { + if (prog->sec_idx != obj->efile.text_shndx && prog->main_prog_cnt == 0) { text = bpf_object__find_prog_by_idx(obj, obj->efile.text_shndx); if (!text) { pr_warn("no .text section found yet relo into text exist\n"); @@ -5783,7 +5819,7 @@ bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path) */ for (i = 0; i < obj->nr_programs; i++) { prog = &obj->programs[i]; - if (prog->idx != obj->efile.text_shndx) + if (prog->sec_idx != obj->efile.text_shndx) continue; err = bpf_program__relocate(prog, obj); @@ -5799,7 +5835,7 @@ bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path) */ for (i = 0; i < obj->nr_programs; i++) { prog = &obj->programs[i]; - if (prog->idx == obj->efile.text_shndx) + if (prog->sec_idx == obj->efile.text_shndx) continue; err = bpf_program__relocate(prog, obj); @@ -6215,7 +6251,7 @@ out: static bool bpf_program__is_function_storage(const struct bpf_program *prog, const struct bpf_object *obj) { - return prog->idx == obj->efile.text_shndx && obj->has_pseudo_calls; + return prog->sec_idx == obj->efile.text_shndx && obj->has_pseudo_calls; } static int @@ -6298,7 +6334,6 @@ __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, err = err ? : bpf_object__collect_externs(obj); err = err ? : bpf_object__finalize_btf(obj); err = err ? : bpf_object__init_maps(obj, opts); - err = err ? : bpf_object__init_prog_names(obj); err = err ? : bpf_object__collect_reloc(obj); if (err) goto out; From db2b8b06423c7eb4abcb4310b7234f00b30d7730 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Sep 2020 13:35:31 -0700 Subject: [PATCH 09/95] libbpf: Support CO-RE relocations for multi-prog sections Fix up CO-RE relocation code to handle relocations against ELF sections containing multiple BPF programs. This requires lookup of a BPF program by its section name and instruction index it contains. While it could have been done as a simple loop, it could run into performance issues pretty quickly, as number of CO-RE relocations can be quite large in real-world applications, and each CO-RE relocation incurs BPF program look up now. So instead of simple loop, implement a binary search by section name + insn offset. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200903203542.15944-4-andriin@fb.com --- tools/lib/bpf/libbpf.c | 82 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 74 insertions(+), 8 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 57f87eee5be5..11ad230ec20c 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -2753,6 +2753,18 @@ static bool ignore_elf_section(GElf_Shdr *hdr, const char *name) return false; } +static int cmp_progs(const void *_a, const void *_b) +{ + const struct bpf_program *a = _a; + const struct bpf_program *b = _b; + + if (a->sec_idx != b->sec_idx) + return a->sec_idx < b->sec_idx ? -1 : 1; + + /* sec_insn_off can't be the same within the section */ + return a->sec_insn_off < b->sec_insn_off ? -1 : 1; +} + static int bpf_object__elf_collect(struct bpf_object *obj) { Elf *elf = obj->efile.elf; @@ -2887,6 +2899,11 @@ static int bpf_object__elf_collect(struct bpf_object *obj) pr_warn("elf: symbol strings section missing or invalid in %s\n", obj->path); return -LIBBPF_ERRNO__FORMAT; } + + /* sort BPF programs by section name and in-section instruction offset + * for faster search */ + qsort(obj->programs, obj->nr_programs, sizeof(*obj->programs), cmp_progs); + return bpf_object__init_btf(obj, btf_data, btf_ext_data); } @@ -3415,6 +3432,37 @@ static int bpf_program__record_reloc(struct bpf_program *prog, return 0; } +static bool prog_contains_insn(const struct bpf_program *prog, size_t insn_idx) +{ + return insn_idx >= prog->sec_insn_off && + insn_idx < prog->sec_insn_off + prog->sec_insn_cnt; +} + +static struct bpf_program *find_prog_by_sec_insn(const struct bpf_object *obj, + size_t sec_idx, size_t insn_idx) +{ + int l = 0, r = obj->nr_programs - 1, m; + struct bpf_program *prog; + + while (l < r) { + m = l + (r - l + 1) / 2; + prog = &obj->programs[m]; + + if (prog->sec_idx < sec_idx || + (prog->sec_idx == sec_idx && prog->sec_insn_off <= insn_idx)) + l = m; + else + r = m - 1; + } + /* matching program could be at index l, but it still might be the + * wrong one, so we need to double check conditions for the last time + */ + prog = &obj->programs[l]; + if (prog->sec_idx == sec_idx && prog_contains_insn(prog, insn_idx)) + return prog; + return NULL; +} + static int bpf_program__collect_reloc(struct bpf_program *prog, GElf_Shdr *shdr, Elf_Data *data, struct bpf_object *obj) @@ -5229,6 +5277,11 @@ static int bpf_core_patch_insn(struct bpf_program *prog, if (relo->insn_off % BPF_INSN_SZ) return -EINVAL; insn_idx = relo->insn_off / BPF_INSN_SZ; + /* adjust insn_idx from section frame of reference to the local + * program's frame of reference; (sub-)program code is not yet + * relocated, so it's enough to just subtract in-section offset + */ + insn_idx = insn_idx - prog->sec_insn_off; insn = &prog->insns[insn_idx]; class = BPF_CLASS(insn->code); @@ -5619,7 +5672,7 @@ bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path) struct bpf_program *prog; struct btf *targ_btf; const char *sec_name; - int i, err = 0; + int i, err = 0, insn_idx, sec_idx; if (obj->btf_ext->core_relo_info.len == 0) return 0; @@ -5646,24 +5699,37 @@ bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path) err = -EINVAL; goto out; } + /* bpf_object's ELF is gone by now so it's not easy to find + * section index by section name, but we can find *any* + * bpf_program within desired section name and use it's + * prog->sec_idx to do a proper search by section index and + * instruction offset + */ prog = NULL; for (i = 0; i < obj->nr_programs; i++) { - if (!strcmp(obj->programs[i].section_name, sec_name)) { - prog = &obj->programs[i]; + prog = &obj->programs[i]; + if (strcmp(prog->section_name, sec_name) == 0) break; - } } if (!prog) { - pr_warn("failed to find program '%s' for CO-RE offset relocation\n", - sec_name); - err = -EINVAL; - goto out; + pr_warn("sec '%s': failed to find a BPF program\n", sec_name); + return -ENOENT; } + sec_idx = prog->sec_idx; pr_debug("sec '%s': found %d CO-RE relocations\n", sec_name, sec->num_info); for_each_btf_ext_rec(seg, sec, i, rec) { + insn_idx = rec->insn_off / BPF_INSN_SZ; + prog = find_prog_by_sec_insn(obj, sec_idx, insn_idx); + if (!prog) { + pr_warn("sec '%s': failed to find program at insn #%d for CO-RE offset relocation #%d\n", + sec_name, insn_idx, i); + err = -EINVAL; + goto out; + } + err = bpf_core_apply_relo(prog, rec, i, obj->btf, targ_btf, cand_cache); if (err) { From c3c556966de7a0d5f8c908f407c673070fcdbf2b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Sep 2020 13:35:32 -0700 Subject: [PATCH 10/95] libbpf: Make RELO_CALL work for multi-prog sections and sub-program calls This patch implements general and correct logic for bpf-to-bpf sub-program calls. Only sub-programs used (called into) from entry-point (main) BPF program are going to be appended at the end of main BPF program. This ensures that BPF verifier won't encounter any dead code due to copying unreferenced sub-program. This change means that each entry-point (main) BPF program might have a different set of sub-programs appended to it and potentially in different order. This has implications on how sub-program call relocations need to be handled, described below. All relocations are now split into two categores: data references (maps and global variables) and code references (sub-program calls). This distinction is important because data references need to be relocated just once per each BPF program and sub-program. These relocation are agnostic to instruction locations, because they are not code-relative and they are relocating against static targets (maps, variables with fixes offsets, etc). Sub-program RELO_CALL relocations, on the other hand, are highly-dependent on code position, because they are recorded as instruction-relative offset. So BPF sub-programs (those that do calls into other sub-programs) can't be relocated once, they need to be relocated each time such a sub-program is appended at the end of the main entry-point BPF program. As mentioned above, each main BPF program might have different subset and differen order of sub-programs, so call relocations can't be done just once. Splitting data reference and calls relocations as described above allows to do this efficiently and cleanly. bpf_object__find_program_by_name() will now ignore non-entry BPF programs. Previously one could have looked up '.text' fake BPF program, but the existence of such BPF program was always an implementation detail and you can't do much useful with it. Now, though, all non-entry sub-programs get their own BPF program with name corresponding to a function name, so there is no more '.text' name for BPF program. This means there is no regression, effectively, w.r.t. API behavior. But this is important aspect to highlight, because it's going to be critical once libbpf implements static linking of BPF programs. Non-entry static BPF programs will be allowed to have conflicting names, but global and main-entry BPF program names should be unique. Just like with normal user-space linking process. So it's important to restrict this aspect right now, keep static and non-entry functions as internal implementation details, and not have to deal with regressions in behavior later. This patch leaves .BTF.ext adjustment as is until next patch. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200903203542.15944-5-andriin@fb.com --- tools/lib/bpf/libbpf.c | 520 ++++++++++++++++++++++++++++++----------- 1 file changed, 381 insertions(+), 139 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 11ad230ec20c..4e32a1028379 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -193,6 +193,7 @@ struct reloc_desc { int insn_idx; int map_idx; int sym_off; + bool processed; }; struct bpf_sec_def; @@ -255,7 +256,7 @@ struct bpf_program { * entry-point BPF programs this includes the size of main program * itself plus all the used sub-programs, appended at the end */ - size_t insns_cnt, main_prog_cnt; + size_t insns_cnt; struct reloc_desc *reloc_desc; int nr_reloc; @@ -412,7 +413,7 @@ struct bpf_object { int kconfig_map_idx; bool loaded; - bool has_pseudo_calls; + bool has_subcalls; /* * Information when doing elf related work. Only valid if fd @@ -537,17 +538,32 @@ static char *__bpf_program__pin_name(struct bpf_program *prog) return name; } -static int -bpf_program__init(struct bpf_program *prog, const char *name, - size_t sec_idx, const char *sec_name, size_t sec_off, - void *insn_data, size_t insn_data_sz) +static bool insn_is_subprog_call(const struct bpf_insn *insn) { + return BPF_CLASS(insn->code) == BPF_JMP && + BPF_OP(insn->code) == BPF_CALL && + BPF_SRC(insn->code) == BPF_K && + insn->src_reg == BPF_PSEUDO_CALL && + insn->dst_reg == 0 && + insn->off == 0; +} + +static int +bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog, + const char *name, size_t sec_idx, const char *sec_name, + size_t sec_off, void *insn_data, size_t insn_data_sz) +{ + int i; + if (insn_data_sz == 0 || insn_data_sz % BPF_INSN_SZ || sec_off % BPF_INSN_SZ) { pr_warn("sec '%s': corrupted program '%s', offset %zu, size %zu\n", sec_name, name, sec_off, insn_data_sz); return -EINVAL; } + memset(prog, 0, sizeof(*prog)); + prog->obj = obj; + prog->sec_idx = sec_idx; prog->sec_insn_off = sec_off / BPF_INSN_SZ; prog->sec_insn_cnt = insn_data_sz / BPF_INSN_SZ; @@ -577,6 +593,13 @@ bpf_program__init(struct bpf_program *prog, const char *name, goto errout; memcpy(prog->insns, insn_data, insn_data_sz); + for (i = 0; i < prog->insns_cnt; i++) { + if (insn_is_subprog_call(&prog->insns[i])) { + obj->has_subcalls = true; + break; + } + } + return 0; errout: pr_warn("sec '%s': failed to allocate memory for prog '%s'\n", sec_name, name); @@ -621,10 +644,10 @@ bpf_object__add_programs(struct bpf_object *obj, Elf_Data *sec_data, return -LIBBPF_ERRNO__FORMAT; } - pr_debug("sec '%s': found program '%s' at offset %zu, code size %zu bytes\n", - sec_name, name, sec_off, prog_sz); + pr_debug("sec '%s': found program '%s' at insn offset %zu (%zu bytes), code size %zu insns (%zu bytes)\n", + sec_name, name, sec_off / BPF_INSN_SZ, sec_off, prog_sz / BPF_INSN_SZ, prog_sz); - progs = reallocarray(progs, nr_progs + 1, sizeof(*progs)); + progs = libbpf_reallocarray(progs, nr_progs + 1, sizeof(*progs)); if (!progs) { /* * In this case the original obj->programs @@ -638,11 +661,9 @@ bpf_object__add_programs(struct bpf_object *obj, Elf_Data *sec_data, obj->programs = progs; prog = &progs[nr_progs]; - memset(prog, 0, sizeof(*prog)); - prog->obj = obj; - err = bpf_program__init(prog, name, sec_idx, sec_name, sec_off, - data + sec_off, prog_sz); + err = bpf_object__init_prog(obj, prog, name, sec_idx, sec_name, + sec_off, data + sec_off, prog_sz); if (err) return err; @@ -3255,6 +3276,12 @@ bpf_object__find_program_by_title(const struct bpf_object *obj, return NULL; } +static bool prog_is_subprog(const struct bpf_object *obj, + const struct bpf_program *prog) +{ + return prog->sec_idx == obj->efile.text_shndx && obj->has_subcalls; +} + struct bpf_program * bpf_object__find_program_by_name(const struct bpf_object *obj, const char *name) @@ -3262,6 +3289,8 @@ bpf_object__find_program_by_name(const struct bpf_object *obj, struct bpf_program *prog; bpf_object__for_each_program(prog, obj) { + if (prog_is_subprog(obj, prog)) + continue; if (!strcmp(prog->name, name)) return prog; } @@ -3311,6 +3340,8 @@ static int bpf_program__record_reloc(struct bpf_program *prog, const char *sym_sec_name; struct bpf_map *map; + reloc_desc->processed = false; + /* sub-program call relocation */ if (insn->code == (BPF_JMP | BPF_CALL)) { if (insn->src_reg != BPF_PSEUDO_CALL) { @@ -3332,7 +3363,6 @@ static int bpf_program__record_reloc(struct bpf_program *prog, reloc_desc->type = RELO_CALL; reloc_desc->insn_idx = insn_idx; reloc_desc->sym_off = sym->st_value; - obj->has_pseudo_calls = true; return 0; } @@ -3464,13 +3494,18 @@ static struct bpf_program *find_prog_by_sec_insn(const struct bpf_object *obj, } static int -bpf_program__collect_reloc(struct bpf_program *prog, GElf_Shdr *shdr, - Elf_Data *data, struct bpf_object *obj) +bpf_object__collect_prog_relos(struct bpf_object *obj, GElf_Shdr *shdr, Elf_Data *data) { Elf_Data *symbols = obj->efile.symbols; const char *relo_sec_name, *sec_name; size_t sec_idx = shdr->sh_info; + struct bpf_program *prog; + struct reloc_desc *relos; int err, i, nrels; + const char *sym_name; + __u32 insn_idx; + GElf_Sym sym; + GElf_Rel rel; relo_sec_name = elf_sec_str(obj, shdr->sh_name); sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, sec_idx)); @@ -3481,19 +3516,7 @@ bpf_program__collect_reloc(struct bpf_program *prog, GElf_Shdr *shdr, relo_sec_name, sec_idx, sec_name); nrels = shdr->sh_size / shdr->sh_entsize; - prog->reloc_desc = malloc(sizeof(*prog->reloc_desc) * nrels); - if (!prog->reloc_desc) { - pr_warn("failed to alloc memory in relocation\n"); - return -ENOMEM; - } - prog->nr_reloc = nrels; - for (i = 0; i < nrels; i++) { - const char *sym_name; - __u32 insn_idx; - GElf_Sym sym; - GElf_Rel rel; - if (!gelf_getrel(data, i, &rel)) { pr_warn("sec '%s': failed to get relo #%d\n", relo_sec_name, i); return -LIBBPF_ERRNO__FORMAT; @@ -3510,15 +3533,42 @@ bpf_program__collect_reloc(struct bpf_program *prog, GElf_Shdr *shdr, } insn_idx = rel.r_offset / BPF_INSN_SZ; - sym_name = elf_sym_str(obj, sym.st_name) ?: ""; + /* relocations against static functions are recorded as + * relocations against the section that contains a function; + * in such case, symbol will be STT_SECTION and sym.st_name + * will point to empty string (0), so fetch section name + * instead + */ + if (GELF_ST_TYPE(sym.st_info) == STT_SECTION && sym.st_name == 0) + sym_name = elf_sec_name(obj, elf_sec_by_idx(obj, sym.st_shndx)); + else + sym_name = elf_sym_str(obj, sym.st_name); + sym_name = sym_name ?: "reloc_desc[i], + prog = find_prog_by_sec_insn(obj, sec_idx, insn_idx); + if (!prog) { + pr_warn("sec '%s': relo #%d: program not found in section '%s' for insn #%u\n", + relo_sec_name, i, sec_name, insn_idx); + return -LIBBPF_ERRNO__RELOC; + } + + relos = libbpf_reallocarray(prog->reloc_desc, + prog->nr_reloc + 1, sizeof(*relos)); + if (!relos) + return -ENOMEM; + prog->reloc_desc = relos; + + /* adjust insn_idx to local BPF program frame of reference */ + insn_idx -= prog->sec_insn_off; + err = bpf_program__record_reloc(prog, &relos[prog->nr_reloc], insn_idx, sym_name, &sym, &rel); if (err) return err; + + prog->nr_reloc++; } return 0; } @@ -5753,89 +5803,32 @@ out: return err; } +/* Relocate data references within program code: + * - map references; + * - global variable references; + * - extern references. + */ static int -bpf_program__reloc_text(struct bpf_program *prog, struct bpf_object *obj, - struct reloc_desc *relo) +bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) { - struct bpf_insn *insn, *new_insn; - struct bpf_program *text; - size_t new_cnt; - int err; - - if (prog->sec_idx != obj->efile.text_shndx && prog->main_prog_cnt == 0) { - text = bpf_object__find_prog_by_idx(obj, obj->efile.text_shndx); - if (!text) { - pr_warn("no .text section found yet relo into text exist\n"); - return -LIBBPF_ERRNO__RELOC; - } - new_cnt = prog->insns_cnt + text->insns_cnt; - new_insn = libbpf_reallocarray(prog->insns, new_cnt, sizeof(*insn)); - if (!new_insn) { - pr_warn("oom in prog realloc\n"); - return -ENOMEM; - } - prog->insns = new_insn; - - if (obj->btf_ext) { - err = bpf_program_reloc_btf_ext(prog, obj, - text->section_name, - prog->insns_cnt); - if (err) - return err; - } - - memcpy(new_insn + prog->insns_cnt, text->insns, - text->insns_cnt * sizeof(*insn)); - prog->main_prog_cnt = prog->insns_cnt; - prog->insns_cnt = new_cnt; - pr_debug("added %zd insn from %s to prog %s\n", - text->insns_cnt, text->section_name, - prog->section_name); - } - - insn = &prog->insns[relo->insn_idx]; - insn->imm += relo->sym_off / 8 + prog->main_prog_cnt - relo->insn_idx; - return 0; -} - -static int -bpf_program__relocate(struct bpf_program *prog, struct bpf_object *obj) -{ - int i, err; - - if (!prog) - return 0; - - if (obj->btf_ext) { - err = bpf_program_reloc_btf_ext(prog, obj, - prog->section_name, 0); - if (err) - return err; - } - - if (!prog->reloc_desc) - return 0; + int i; for (i = 0; i < prog->nr_reloc; i++) { struct reloc_desc *relo = &prog->reloc_desc[i]; struct bpf_insn *insn = &prog->insns[relo->insn_idx]; struct extern_desc *ext; - if (relo->insn_idx + 1 >= (int)prog->insns_cnt) { - pr_warn("relocation out of range: '%s'\n", - prog->section_name); - return -LIBBPF_ERRNO__RELOC; - } - switch (relo->type) { case RELO_LD64: insn[0].src_reg = BPF_PSEUDO_MAP_FD; insn[0].imm = obj->maps[relo->map_idx].fd; + relo->processed = true; break; case RELO_DATA: insn[0].src_reg = BPF_PSEUDO_MAP_VALUE; insn[1].imm = insn[0].imm + relo->sym_off; insn[0].imm = obj->maps[relo->map_idx].fd; + relo->processed = true; break; case RELO_EXTERN: ext = &obj->externs[relo->sym_off]; @@ -5847,11 +5840,10 @@ bpf_program__relocate(struct bpf_program *prog, struct bpf_object *obj) insn[0].imm = (__u32)ext->ksym.addr; insn[1].imm = ext->ksym.addr >> 32; } + relo->processed = true; break; case RELO_CALL: - err = bpf_program__reloc_text(prog, obj, relo); - if (err) - return err; + /* will be handled as a follow up pass */ break; default: pr_warn("prog '%s': relo #%d: bad relo type %d\n", @@ -5860,8 +5852,244 @@ bpf_program__relocate(struct bpf_program *prog, struct bpf_object *obj) } } - zfree(&prog->reloc_desc); - prog->nr_reloc = 0; + return 0; +} + +static int cmp_relo_by_insn_idx(const void *key, const void *elem) +{ + size_t insn_idx = *(const size_t *)key; + const struct reloc_desc *relo = elem; + + if (insn_idx == relo->insn_idx) + return 0; + return insn_idx < relo->insn_idx ? -1 : 1; +} + +static struct reloc_desc *find_prog_insn_relo(const struct bpf_program *prog, size_t insn_idx) +{ + return bsearch(&insn_idx, prog->reloc_desc, prog->nr_reloc, + sizeof(*prog->reloc_desc), cmp_relo_by_insn_idx); +} + +static int +bpf_object__reloc_code(struct bpf_object *obj, struct bpf_program *main_prog, + struct bpf_program *prog) +{ + size_t sub_insn_idx, insn_idx, new_cnt; + struct bpf_program *subprog; + struct bpf_insn *insns, *insn; + struct reloc_desc *relo; + int err; + + err = reloc_prog_func_and_line_info(obj, main_prog, prog); + if (err) + return err; + + for (insn_idx = 0; insn_idx < prog->sec_insn_cnt; insn_idx++) { + insn = &main_prog->insns[prog->sub_insn_off + insn_idx]; + if (!insn_is_subprog_call(insn)) + continue; + + relo = find_prog_insn_relo(prog, insn_idx); + if (relo && relo->type != RELO_CALL) { + pr_warn("prog '%s': unexpected relo for insn #%zu, type %d\n", + prog->name, insn_idx, relo->type); + return -LIBBPF_ERRNO__RELOC; + } + if (relo) { + /* sub-program instruction index is a combination of + * an offset of a symbol pointed to by relocation and + * call instruction's imm field; for global functions, + * call always has imm = -1, but for static functions + * relocation is against STT_SECTION and insn->imm + * points to a start of a static function + */ + sub_insn_idx = relo->sym_off / BPF_INSN_SZ + insn->imm + 1; + } else { + /* if subprogram call is to a static function within + * the same ELF section, there won't be any relocation + * emitted, but it also means there is no additional + * offset necessary, insns->imm is relative to + * instruction's original position within the section + */ + sub_insn_idx = prog->sec_insn_off + insn_idx + insn->imm + 1; + } + + /* we enforce that sub-programs should be in .text section */ + subprog = find_prog_by_sec_insn(obj, obj->efile.text_shndx, sub_insn_idx); + if (!subprog) { + pr_warn("prog '%s': no .text section found yet sub-program call exists\n", + prog->name); + return -LIBBPF_ERRNO__RELOC; + } + + /* if it's the first call instruction calling into this + * subprogram (meaning this subprog hasn't been processed + * yet) within the context of current main program: + * - append it at the end of main program's instructions blog; + * - process is recursively, while current program is put on hold; + * - if that subprogram calls some other not yet processes + * subprogram, same thing will happen recursively until + * there are no more unprocesses subprograms left to append + * and relocate. + */ + if (subprog->sub_insn_off == 0) { + subprog->sub_insn_off = main_prog->insns_cnt; + + new_cnt = main_prog->insns_cnt + subprog->insns_cnt; + insns = libbpf_reallocarray(main_prog->insns, new_cnt, sizeof(*insns)); + if (!insns) { + pr_warn("prog '%s': failed to realloc prog code\n", main_prog->name); + return -ENOMEM; + } + main_prog->insns = insns; + main_prog->insns_cnt = new_cnt; + + memcpy(main_prog->insns + subprog->sub_insn_off, subprog->insns, + subprog->insns_cnt * sizeof(*insns)); + + pr_debug("prog '%s': added %zu insns from sub-prog '%s'\n", + main_prog->name, subprog->insns_cnt, subprog->name); + + err = bpf_object__reloc_code(obj, main_prog, subprog); + if (err) + return err; + } + + /* main_prog->insns memory could have been re-allocated, so + * calculate pointer again + */ + insn = &main_prog->insns[prog->sub_insn_off + insn_idx]; + /* calculate correct instruction position within current main + * prog; each main prog can have a different set of + * subprograms appended (potentially in different order as + * well), so position of any subprog can be different for + * different main programs */ + insn->imm = subprog->sub_insn_off - (prog->sub_insn_off + insn_idx) - 1; + + if (relo) + relo->processed = true; + + pr_debug("prog '%s': insn #%zu relocated, imm %d points to subprog '%s' (now at %zu offset)\n", + prog->name, insn_idx, insn->imm, subprog->name, subprog->sub_insn_off); + } + + return 0; +} + +/* + * Relocate sub-program calls. + * + * Algorithm operates as follows. Each entry-point BPF program (referred to as + * main prog) is processed separately. For each subprog (non-entry functions, + * that can be called from either entry progs or other subprogs) gets their + * sub_insn_off reset to zero. This serves as indicator that this subprogram + * hasn't been yet appended and relocated within current main prog. Once its + * relocated, sub_insn_off will point at the position within current main prog + * where given subprog was appended. This will further be used to relocate all + * the call instructions jumping into this subprog. + * + * We start with main program and process all call instructions. If the call + * is into a subprog that hasn't been processed (i.e., subprog->sub_insn_off + * is zero), subprog instructions are appended at the end of main program's + * instruction array. Then main program is "put on hold" while we recursively + * process newly appended subprogram. If that subprogram calls into another + * subprogram that hasn't been appended, new subprogram is appended again to + * the *main* prog's instructions (subprog's instructions are always left + * untouched, as they need to be in unmodified state for subsequent main progs + * and subprog instructions are always sent only as part of a main prog) and + * the process continues recursively. Once all the subprogs called from a main + * prog or any of its subprogs are appended (and relocated), all their + * positions within finalized instructions array are known, so it's easy to + * rewrite call instructions with correct relative offsets, corresponding to + * desired target subprog. + * + * Its important to realize that some subprogs might not be called from some + * main prog and any of its called/used subprogs. Those will keep their + * subprog->sub_insn_off as zero at all times and won't be appended to current + * main prog and won't be relocated within the context of current main prog. + * They might still be used from other main progs later. + * + * Visually this process can be shown as below. Suppose we have two main + * programs mainA and mainB and BPF object contains three subprogs: subA, + * subB, and subC. mainA calls only subA, mainB calls only subC, but subA and + * subC both call subB: + * + * +--------+ +-------+ + * | v v | + * +--+---+ +--+-+-+ +---+--+ + * | subA | | subB | | subC | + * +--+---+ +------+ +---+--+ + * ^ ^ + * | | + * +---+-------+ +------+----+ + * | mainA | | mainB | + * +-----------+ +-----------+ + * + * We'll start relocating mainA, will find subA, append it and start + * processing sub A recursively: + * + * +-----------+------+ + * | mainA | subA | + * +-----------+------+ + * + * At this point we notice that subB is used from subA, so we append it and + * relocate (there are no further subcalls from subB): + * + * +-----------+------+------+ + * | mainA | subA | subB | + * +-----------+------+------+ + * + * At this point, we relocate subA calls, then go one level up and finish with + * relocatin mainA calls. mainA is done. + * + * For mainB process is similar but results in different order. We start with + * mainB and skip subA and subB, as mainB never calls them (at least + * directly), but we see subC is needed, so we append and start processing it: + * + * +-----------+------+ + * | mainB | subC | + * +-----------+------+ + * Now we see subC needs subB, so we go back to it, append and relocate it: + * + * +-----------+------+------+ + * | mainB | subC | subB | + * +-----------+------+------+ + * + * At this point we unwind recursion, relocate calls in subC, then in mainB. + */ +static int +bpf_object__relocate_calls(struct bpf_object *obj, struct bpf_program *prog) +{ + struct bpf_program *subprog; + int i, j, err; + + if (obj->btf_ext) { + err = bpf_program_reloc_btf_ext(prog, obj, + prog->section_name, 0); + if (err) + return err; + } + + /* mark all subprogs as not relocated (yet) within the context of + * current main program + */ + for (i = 0; i < obj->nr_programs; i++) { + subprog = &obj->programs[i]; + if (!prog_is_subprog(obj, subprog)) + continue; + + subprog->sub_insn_off = 0; + for (j = 0; j < subprog->nr_reloc; j++) + if (subprog->reloc_desc[j].type == RELO_CALL) + subprog->reloc_desc[j].processed = false; + } + + err = bpf_object__reloc_code(obj, prog, prog); + if (err) + return err; + + return 0; } @@ -5880,37 +6108,45 @@ bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path) return err; } } - /* ensure .text is relocated first, as it's going to be copied as-is - * later for sub-program calls + /* relocate data references first for all programs and sub-programs, + * as they don't change relative to code locations, so subsequent + * subprogram processing won't need to re-calculate any of them */ for (i = 0; i < obj->nr_programs; i++) { prog = &obj->programs[i]; - if (prog->sec_idx != obj->efile.text_shndx) - continue; - - err = bpf_program__relocate(prog, obj); + err = bpf_object__relocate_data(obj, prog); if (err) { pr_warn("prog '%s': failed to relocate data references: %d\n", prog->name, err); return err; } - break; } - /* now relocate everything but .text, which by now is relocated - * properly, so we can copy raw sub-program instructions as is safely + /* now relocate subprogram calls and append used subprograms to main + * programs; each copy of subprogram code needs to be relocated + * differently for each main program, because its code location might + * have changed */ for (i = 0; i < obj->nr_programs; i++) { prog = &obj->programs[i]; - if (prog->sec_idx == obj->efile.text_shndx) + /* sub-program's sub-calls are relocated within the context of + * its main program only + */ + if (prog_is_subprog(obj, prog)) continue; - err = bpf_program__relocate(prog, obj); + err = bpf_object__relocate_calls(obj, prog); if (err) { pr_warn("prog '%s': failed to relocate calls: %d\n", prog->name, err); return err; } } + /* free up relocation descriptors */ + for (i = 0; i < obj->nr_programs; i++) { + prog = &obj->programs[i]; + zfree(&prog->reloc_desc); + prog->nr_reloc = 0; + } return 0; } @@ -6030,41 +6266,53 @@ static int bpf_object__collect_map_relos(struct bpf_object *obj, return 0; } -static int bpf_object__collect_reloc(struct bpf_object *obj) +static int cmp_relocs(const void *_a, const void *_b) +{ + const struct reloc_desc *a = _a; + const struct reloc_desc *b = _b; + + if (a->insn_idx != b->insn_idx) + return a->insn_idx < b->insn_idx ? -1 : 1; + + /* no two relocations should have the same insn_idx, but ... */ + if (a->type != b->type) + return a->type < b->type ? -1 : 1; + + return 0; +} + +static int bpf_object__collect_relos(struct bpf_object *obj) { int i, err; - if (!obj_elf_valid(obj)) { - pr_warn("Internal error: elf object is closed\n"); - return -LIBBPF_ERRNO__INTERNAL; - } - for (i = 0; i < obj->efile.nr_reloc_sects; i++) { GElf_Shdr *shdr = &obj->efile.reloc_sects[i].shdr; Elf_Data *data = obj->efile.reloc_sects[i].data; int idx = shdr->sh_info; - struct bpf_program *prog; if (shdr->sh_type != SHT_REL) { pr_warn("internal error at %d\n", __LINE__); return -LIBBPF_ERRNO__INTERNAL; } - if (idx == obj->efile.st_ops_shndx) { + if (idx == obj->efile.st_ops_shndx) err = bpf_object__collect_st_ops_relos(obj, shdr, data); - } else if (idx == obj->efile.btf_maps_shndx) { + else if (idx == obj->efile.btf_maps_shndx) err = bpf_object__collect_map_relos(obj, shdr, data); - } else { - prog = bpf_object__find_prog_by_idx(obj, idx); - if (!prog) { - pr_warn("relocation failed: no prog in section(%d)\n", idx); - return -LIBBPF_ERRNO__RELOC; - } - err = bpf_program__collect_reloc(prog, shdr, data, obj); - } + else + err = bpf_object__collect_prog_relos(obj, shdr, data); if (err) return err; } + + for (i = 0; i < obj->nr_programs; i++) { + struct bpf_program *p = &obj->programs[i]; + + if (!p->nr_reloc) + continue; + + qsort(p->reloc_desc, p->nr_reloc, sizeof(*p->reloc_desc), cmp_relocs); + } return 0; } @@ -6314,12 +6562,6 @@ out: return err; } -static bool bpf_program__is_function_storage(const struct bpf_program *prog, - const struct bpf_object *obj) -{ - return prog->sec_idx == obj->efile.text_shndx && obj->has_pseudo_calls; -} - static int bpf_object__load_progs(struct bpf_object *obj, int log_level) { @@ -6336,7 +6578,7 @@ bpf_object__load_progs(struct bpf_object *obj, int log_level) for (i = 0; i < obj->nr_programs; i++) { prog = &obj->programs[i]; - if (bpf_program__is_function_storage(prog, obj)) + if (prog_is_subprog(obj, prog)) continue; if (!prog->load) { pr_debug("prog '%s': skipped loading\n", prog->name); @@ -6400,7 +6642,7 @@ __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, err = err ? : bpf_object__collect_externs(obj); err = err ? : bpf_object__finalize_btf(obj); err = err ? : bpf_object__init_maps(obj, opts); - err = err ? : bpf_object__collect_reloc(obj); + err = err ? : bpf_object__collect_relos(obj); if (err) goto out; bpf_object__elf_finish(obj); @@ -7404,7 +7646,7 @@ bpf_program__next(struct bpf_program *prev, const struct bpf_object *obj) do { prog = __bpf_program__iter(prog, obj, true); - } while (prog && bpf_program__is_function_storage(prog, obj)); + } while (prog && prog_is_subprog(obj, prog)); return prog; } @@ -7416,7 +7658,7 @@ bpf_program__prev(struct bpf_program *next, const struct bpf_object *obj) do { prog = __bpf_program__iter(prog, obj, false); - } while (prog && bpf_program__is_function_storage(prog, obj)); + } while (prog && prog_is_subprog(obj, prog)); return prog; } From 8505e8709b5eedb7c662354e8436a2a164181109 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Sep 2020 13:35:33 -0700 Subject: [PATCH 11/95] libbpf: Implement generalized .BTF.ext func/line info adjustment Complete multi-prog sections and multi sub-prog support in libbpf by properly adjusting .BTF.ext's line and function information. Mark exposed btf_ext__reloc_func_info() and btf_ext__reloc_func_info() APIs as deprecated. These APIs have simplistic assumption that all sub-programs are going to be appended to all main BPF programs, which doesn't hold in real life. It's unlikely there are any users of this API, as it's very libbpf internals-specific. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200903203542.15944-6-andriin@fb.com --- tools/lib/bpf/btf.h | 18 +-- tools/lib/bpf/libbpf.c | 217 ++++++++++++++++++++++------------ tools/lib/bpf/libbpf_common.h | 2 + 3 files changed, 153 insertions(+), 84 deletions(-) diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 91f0ad0e0325..2a55320d87d0 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -57,14 +57,16 @@ LIBBPF_API struct btf_ext *btf_ext__new(__u8 *data, __u32 size); LIBBPF_API void btf_ext__free(struct btf_ext *btf_ext); LIBBPF_API const void *btf_ext__get_raw_data(const struct btf_ext *btf_ext, __u32 *size); -LIBBPF_API int btf_ext__reloc_func_info(const struct btf *btf, - const struct btf_ext *btf_ext, - const char *sec_name, __u32 insns_cnt, - void **func_info, __u32 *cnt); -LIBBPF_API int btf_ext__reloc_line_info(const struct btf *btf, - const struct btf_ext *btf_ext, - const char *sec_name, __u32 insns_cnt, - void **line_info, __u32 *cnt); +LIBBPF_API LIBBPF_DEPRECATED("btf_ext__reloc_func_info was never meant as a public API and has wrong assumptions embedded in it; it will be removed in the future libbpf versions") +int btf_ext__reloc_func_info(const struct btf *btf, + const struct btf_ext *btf_ext, + const char *sec_name, __u32 insns_cnt, + void **func_info, __u32 *cnt); +LIBBPF_API LIBBPF_DEPRECATED("btf_ext__reloc_line_info was never meant as a public API and has wrong assumptions embedded in it; it will be removed in the future libbpf versions") +int btf_ext__reloc_line_info(const struct btf *btf, + const struct btf_ext *btf_ext, + const char *sec_name, __u32 insns_cnt, + void **line_info, __u32 *cnt); LIBBPF_API __u32 btf_ext__func_info_rec_size(const struct btf_ext *btf_ext); LIBBPF_API __u32 btf_ext__line_info_rec_size(const struct btf_ext *btf_ext); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 4e32a1028379..ca2b5c9145da 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -4241,75 +4241,6 @@ err_out: return err; } -static int -check_btf_ext_reloc_err(struct bpf_program *prog, int err, - void *btf_prog_info, const char *info_name) -{ - if (err != -ENOENT) { - pr_warn("Error in loading %s for sec %s.\n", - info_name, prog->section_name); - return err; - } - - /* err == -ENOENT (i.e. prog->section_name not found in btf_ext) */ - - if (btf_prog_info) { - /* - * Some info has already been found but has problem - * in the last btf_ext reloc. Must have to error out. - */ - pr_warn("Error in relocating %s for sec %s.\n", - info_name, prog->section_name); - return err; - } - - /* Have problem loading the very first info. Ignore the rest. */ - pr_warn("Cannot find %s for main program sec %s. Ignore all %s.\n", - info_name, prog->section_name, info_name); - return 0; -} - -static int -bpf_program_reloc_btf_ext(struct bpf_program *prog, struct bpf_object *obj, - const char *section_name, __u32 insn_offset) -{ - int err; - - if (!insn_offset || prog->func_info) { - /* - * !insn_offset => main program - * - * For sub prog, the main program's func_info has to - * be loaded first (i.e. prog->func_info != NULL) - */ - err = btf_ext__reloc_func_info(obj->btf, obj->btf_ext, - section_name, insn_offset, - &prog->func_info, - &prog->func_info_cnt); - if (err) - return check_btf_ext_reloc_err(prog, err, - prog->func_info, - "bpf_func_info"); - - prog->func_info_rec_size = btf_ext__func_info_rec_size(obj->btf_ext); - } - - if (!insn_offset || prog->line_info) { - err = btf_ext__reloc_line_info(obj->btf, obj->btf_ext, - section_name, insn_offset, - &prog->line_info, - &prog->line_info_cnt); - if (err) - return check_btf_ext_reloc_err(prog, err, - prog->line_info, - "bpf_line_info"); - - prog->line_info_rec_size = btf_ext__line_info_rec_size(obj->btf_ext); - } - - return 0; -} - #define BPF_CORE_SPEC_MAX_LEN 64 /* represents BPF CO-RE field or array element accessor */ @@ -5855,6 +5786,147 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) return 0; } +static int adjust_prog_btf_ext_info(const struct bpf_object *obj, + const struct bpf_program *prog, + const struct btf_ext_info *ext_info, + void **prog_info, __u32 *prog_rec_cnt, + __u32 *prog_rec_sz) +{ + void *copy_start = NULL, *copy_end = NULL; + void *rec, *rec_end, *new_prog_info; + const struct btf_ext_info_sec *sec; + size_t old_sz, new_sz; + const char *sec_name; + int i, off_adj; + + for_each_btf_ext_sec(ext_info, sec) { + sec_name = btf__name_by_offset(obj->btf, sec->sec_name_off); + if (!sec_name) + return -EINVAL; + if (strcmp(sec_name, prog->section_name) != 0) + continue; + + for_each_btf_ext_rec(ext_info, sec, i, rec) { + __u32 insn_off = *(__u32 *)rec / BPF_INSN_SZ; + + if (insn_off < prog->sec_insn_off) + continue; + if (insn_off >= prog->sec_insn_off + prog->sec_insn_cnt) + break; + + if (!copy_start) + copy_start = rec; + copy_end = rec + ext_info->rec_size; + } + + if (!copy_start) + return -ENOENT; + + /* append func/line info of a given (sub-)program to the main + * program func/line info + */ + old_sz = (*prog_rec_cnt) * ext_info->rec_size; + new_sz = old_sz + (copy_end - copy_start); + new_prog_info = realloc(*prog_info, new_sz); + if (!new_prog_info) + return -ENOMEM; + *prog_info = new_prog_info; + *prog_rec_cnt = new_sz / ext_info->rec_size; + memcpy(new_prog_info + old_sz, copy_start, copy_end - copy_start); + + /* Kernel instruction offsets are in units of 8-byte + * instructions, while .BTF.ext instruction offsets generated + * by Clang are in units of bytes. So convert Clang offsets + * into kernel offsets and adjust offset according to program + * relocated position. + */ + off_adj = prog->sub_insn_off - prog->sec_insn_off; + rec = new_prog_info + old_sz; + rec_end = new_prog_info + new_sz; + for (; rec < rec_end; rec += ext_info->rec_size) { + __u32 *insn_off = rec; + + *insn_off = *insn_off / BPF_INSN_SZ + off_adj; + } + *prog_rec_sz = ext_info->rec_size; + return 0; + } + + return -ENOENT; +} + +static int +reloc_prog_func_and_line_info(const struct bpf_object *obj, + struct bpf_program *main_prog, + const struct bpf_program *prog) +{ + int err; + + /* no .BTF.ext relocation if .BTF.ext is missing or kernel doesn't + * supprot func/line info + */ + if (!obj->btf_ext || !kernel_supports(FEAT_BTF_FUNC)) + return 0; + + /* only attempt func info relocation if main program's func_info + * relocation was successful + */ + if (main_prog != prog && !main_prog->func_info) + goto line_info; + + err = adjust_prog_btf_ext_info(obj, prog, &obj->btf_ext->func_info, + &main_prog->func_info, + &main_prog->func_info_cnt, + &main_prog->func_info_rec_size); + if (err) { + if (err != -ENOENT) { + pr_warn("prog '%s': error relocating .BTF.ext function info: %d\n", + prog->name, err); + return err; + } + if (main_prog->func_info) { + /* + * Some info has already been found but has problem + * in the last btf_ext reloc. Must have to error out. + */ + pr_warn("prog '%s': missing .BTF.ext function info.\n", prog->name); + return err; + } + /* Have problem loading the very first info. Ignore the rest. */ + pr_warn("prog '%s': missing .BTF.ext function info for the main program, skipping all of .BTF.ext func info.\n", + prog->name); + } + +line_info: + /* don't relocate line info if main program's relocation failed */ + if (main_prog != prog && !main_prog->line_info) + return 0; + + err = adjust_prog_btf_ext_info(obj, prog, &obj->btf_ext->line_info, + &main_prog->line_info, + &main_prog->line_info_cnt, + &main_prog->line_info_rec_size); + if (err) { + if (err != -ENOENT) { + pr_warn("prog '%s': error relocating .BTF.ext line info: %d\n", + prog->name, err); + return err; + } + if (main_prog->line_info) { + /* + * Some info has already been found but has problem + * in the last btf_ext reloc. Must have to error out. + */ + pr_warn("prog '%s': missing .BTF.ext line info.\n", prog->name); + return err; + } + /* Have problem loading the very first info. Ignore the rest. */ + pr_warn("prog '%s': missing .BTF.ext line info for the main program, skipping all of .BTF.ext line info.\n", + prog->name); + } + return 0; +} + static int cmp_relo_by_insn_idx(const void *key, const void *elem) { size_t insn_idx = *(const size_t *)key; @@ -6064,13 +6136,6 @@ bpf_object__relocate_calls(struct bpf_object *obj, struct bpf_program *prog) struct bpf_program *subprog; int i, j, err; - if (obj->btf_ext) { - err = bpf_program_reloc_btf_ext(prog, obj, - prog->section_name, 0); - if (err) - return err; - } - /* mark all subprogs as not relocated (yet) within the context of * current main program */ diff --git a/tools/lib/bpf/libbpf_common.h b/tools/lib/bpf/libbpf_common.h index a23ae1ac27eb..947d8bd8a7bb 100644 --- a/tools/lib/bpf/libbpf_common.h +++ b/tools/lib/bpf/libbpf_common.h @@ -15,6 +15,8 @@ #define LIBBPF_API __attribute__((visibility("default"))) #endif +#define LIBBPF_DEPRECATED(msg) __attribute__((deprecated(msg))) + /* Helper macro to declare and initialize libbpf options struct * * This dance with uninitialized declaration, followed by memset to zero, From 7e06aad52929d17610f28acd40dd10ed62295f85 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Sep 2020 13:35:34 -0700 Subject: [PATCH 12/95] libbpf: Add multi-prog section support for struct_ops Adjust struct_ops handling code to work with multi-program ELF sections properly. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200903203542.15944-7-andriin@fb.com --- tools/lib/bpf/libbpf.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index ca2b5c9145da..2d6bf0af3305 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -73,8 +73,6 @@ #define __printf(a, b) __attribute__((format(printf, a, b))) static struct bpf_map *bpf_object__add_map(struct bpf_object *obj); -static struct bpf_program *bpf_object__find_prog_by_idx(struct bpf_object *obj, - int idx); static const struct btf_type * skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id); @@ -3249,20 +3247,6 @@ static int bpf_object__collect_externs(struct bpf_object *obj) return 0; } -static struct bpf_program * -bpf_object__find_prog_by_idx(struct bpf_object *obj, int idx) -{ - struct bpf_program *prog; - size_t i; - - for (i = 0; i < obj->nr_programs; i++) { - prog = &obj->programs[i]; - if (prog->sec_idx == idx) - return prog; - } - return NULL; -} - struct bpf_program * bpf_object__find_program_by_title(const struct bpf_object *obj, const char *title) @@ -8198,7 +8182,7 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, const struct btf *btf; struct bpf_map *map; Elf_Data *symbols; - unsigned int moff; + unsigned int moff, insn_idx; const char *name; __u32 member_idx; GElf_Sym sym; @@ -8243,6 +8227,12 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, map->name, (size_t)rel.r_offset, shdr_idx); return -LIBBPF_ERRNO__RELOC; } + if (sym.st_value % BPF_INSN_SZ) { + pr_warn("struct_ops reloc %s: invalid target program offset %llu\n", + map->name, (__u64)sym.st_value); + return -LIBBPF_ERRNO__FORMAT; + } + insn_idx = sym.st_value / BPF_INSN_SZ; member = find_member_by_offset(st_ops->type, moff * 8); if (!member) { @@ -8259,7 +8249,7 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, return -EINVAL; } - prog = bpf_object__find_prog_by_idx(obj, shdr_idx); + prog = find_prog_by_sec_insn(obj, shdr_idx, insn_idx); if (!prog) { pr_warn("struct_ops reloc %s: cannot find prog at shdr_idx %u to relocate func ptr %s\n", map->name, shdr_idx, name); From a08c02f8d4ae50527b9f26a7147686ed55c93782 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Sep 2020 13:35:35 -0700 Subject: [PATCH 13/95] selftests/bpf: Add selftest for multi-prog sections and bpf-to-bpf calls Add a selftest excercising bpf-to-bpf subprogram calls, as well as multiple entry-point BPF programs per section. Also make sure that BPF CO-RE works for such set ups both for sub-programs and for multi-entry sections. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200903203542.15944-8-andriin@fb.com --- .../selftests/bpf/prog_tests/subprogs.c | 31 ++++++ .../selftests/bpf/progs/test_subprogs.c | 103 ++++++++++++++++++ 2 files changed, 134 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/subprogs.c create mode 100644 tools/testing/selftests/bpf/progs/test_subprogs.c diff --git a/tools/testing/selftests/bpf/prog_tests/subprogs.c b/tools/testing/selftests/bpf/prog_tests/subprogs.c new file mode 100644 index 000000000000..a00abf58c037 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/subprogs.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include +#include +#include "test_subprogs.skel.h" + +static int duration; + +void test_subprogs(void) +{ + struct test_subprogs *skel; + int err; + + skel = test_subprogs__open_and_load(); + if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) + return; + + err = test_subprogs__attach(skel); + if (CHECK(err, "skel_attach", "failed to attach skeleton: %d\n", err)) + goto cleanup; + + usleep(1); + + CHECK(skel->bss->res1 != 12, "res1", "got %d, exp %d\n", skel->bss->res1, 12); + CHECK(skel->bss->res2 != 17, "res2", "got %d, exp %d\n", skel->bss->res2, 17); + CHECK(skel->bss->res3 != 19, "res3", "got %d, exp %d\n", skel->bss->res3, 19); + CHECK(skel->bss->res4 != 36, "res4", "got %d, exp %d\n", skel->bss->res4, 36); + +cleanup: + test_subprogs__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_subprogs.c b/tools/testing/selftests/bpf/progs/test_subprogs.c new file mode 100644 index 000000000000..d3c5673c0218 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_subprogs.c @@ -0,0 +1,103 @@ +#include "vmlinux.h" +#include +#include + +const char LICENSE[] SEC("license") = "GPL"; + +__noinline int sub1(int x) +{ + return x + 1; +} + +static __noinline int sub5(int v); + +__noinline int sub2(int y) +{ + return sub5(y + 2); +} + +static __noinline int sub3(int z) +{ + return z + 3 + sub1(4); +} + +static __noinline int sub4(int w) +{ + return w + sub3(5) + sub1(6); +} + +/* sub5() is an identitify function, just to test weirder functions layout and + * call patterns + */ +static __noinline int sub5(int v) +{ + return sub1(v) - 1; /* compensates sub1()'s + 1 */ +} + +/* unfortunately verifier rejects `struct task_struct *t` as an unkown pointer + * type, so we need to accept pointer as integer and then cast it inside the + * function + */ +__noinline int get_task_tgid(uintptr_t t) +{ + /* this ensures that CO-RE relocs work in multi-subprogs .text */ + return BPF_CORE_READ((struct task_struct *)(void *)t, tgid); +} + +int res1 = 0; +int res2 = 0; +int res3 = 0; +int res4 = 0; + +SEC("raw_tp/sys_enter") +int prog1(void *ctx) +{ + /* perform some CO-RE relocations to ensure they work with multi-prog + * sections correctly + */ + struct task_struct *t = (void *)bpf_get_current_task(); + + if (!BPF_CORE_READ(t, pid) || !get_task_tgid((uintptr_t)t)) + return 1; + + res1 = sub1(1) + sub3(2); /* (1 + 1) + (2 + 3 + (4 + 1)) = 12 */ + return 0; +} + +SEC("raw_tp/sys_exit") +int prog2(void *ctx) +{ + struct task_struct *t = (void *)bpf_get_current_task(); + + if (!BPF_CORE_READ(t, pid) || !get_task_tgid((uintptr_t)t)) + return 1; + + res2 = sub2(3) + sub3(4); /* (3 + 2) + (4 + 3 + (4 + 1)) = 17 */ + return 0; +} + +/* prog3 has the same section name as prog1 */ +SEC("raw_tp/sys_enter") +int prog3(void *ctx) +{ + struct task_struct *t = (void *)bpf_get_current_task(); + + if (!BPF_CORE_READ(t, pid) || !get_task_tgid((uintptr_t)t)) + return 1; + + res3 = sub3(5) + 6; /* (5 + 3 + (4 + 1)) + 6 = 19 */ + return 0; +} + +/* prog4 has the same section name as prog2 */ +SEC("raw_tp/sys_exit") +int prog4(void *ctx) +{ + struct task_struct *t = (void *)bpf_get_current_task(); + + if (!BPF_CORE_READ(t, pid) || !get_task_tgid((uintptr_t)t)) + return 1; + + res4 = sub4(7) + sub1(8); /* (7 + (5 + 3 + (4 + 1)) + (6 + 1)) + (8 + 1) = 36 */ + return 0; +} From fd17e272be9c50190f1185a1007d72dc1eedb903 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Sep 2020 13:35:36 -0700 Subject: [PATCH 14/95] tools/bpftool: Replace bpf_program__title() with bpf_program__section_name() bpf_program__title() is deprecated, switch to bpf_program__section_name() and avoid compilation warnings. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200903203542.15944-9-andriin@fb.com --- tools/bpf/bpftool/prog.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index d393eb8263a6..f7923414a052 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -1304,7 +1304,7 @@ static int load_with_options(int argc, char **argv, bool first_prog_only) enum bpf_prog_type prog_type = common_prog_type; if (prog_type == BPF_PROG_TYPE_UNSPEC) { - const char *sec_name = bpf_program__title(pos, false); + const char *sec_name = bpf_program__section_name(pos); err = get_prog_type_by_name(sec_name, &prog_type, &expected_attach_type); @@ -1398,7 +1398,7 @@ static int load_with_options(int argc, char **argv, bool first_prog_only) err = bpf_obj_pin(bpf_program__fd(prog), pinfile); if (err) { p_err("failed to pin program %s", - bpf_program__title(prog, false)); + bpf_program__section_name(prog)); goto err_close_obj; } } else { From a7659cc30bc9210d885692d175f4bc6f3a9a2175 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Sep 2020 13:35:37 -0700 Subject: [PATCH 15/95] selftests/bpf: Don't use deprecated libbpf APIs Remove all uses of bpf_program__title() and bpf_program__find_program_by_title(). Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200903203542.15944-10-andriin@fb.com --- tools/testing/selftests/bpf/flow_dissector_load.h | 8 +++++++- .../testing/selftests/bpf/prog_tests/reference_tracking.c | 2 +- tools/testing/selftests/bpf/test_socket_cookie.c | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/flow_dissector_load.h b/tools/testing/selftests/bpf/flow_dissector_load.h index daeaeb518894..7290401ec172 100644 --- a/tools/testing/selftests/bpf/flow_dissector_load.h +++ b/tools/testing/selftests/bpf/flow_dissector_load.h @@ -23,7 +23,13 @@ static inline int bpf_flow_load(struct bpf_object **obj, if (ret) return ret; - main_prog = bpf_object__find_program_by_title(*obj, section_name); + main_prog = NULL; + bpf_object__for_each_program(prog, *obj) { + if (strcmp(section_name, bpf_program__section_name(prog)) == 0) { + main_prog = prog; + break; + } + } if (!main_prog) return -1; diff --git a/tools/testing/selftests/bpf/prog_tests/reference_tracking.c b/tools/testing/selftests/bpf/prog_tests/reference_tracking.c index fc0d7f4f02cf..ac1ee10cffd8 100644 --- a/tools/testing/selftests/bpf/prog_tests/reference_tracking.c +++ b/tools/testing/selftests/bpf/prog_tests/reference_tracking.c @@ -27,7 +27,7 @@ void test_reference_tracking(void) const char *title; /* Ignore .text sections */ - title = bpf_program__title(prog, false); + title = bpf_program__section_name(prog); if (strstr(title, ".text") != NULL) continue; diff --git a/tools/testing/selftests/bpf/test_socket_cookie.c b/tools/testing/selftests/bpf/test_socket_cookie.c index 154a8fd2a48d..ca7ca87e91aa 100644 --- a/tools/testing/selftests/bpf/test_socket_cookie.c +++ b/tools/testing/selftests/bpf/test_socket_cookie.c @@ -151,7 +151,7 @@ static int run_test(int cgfd) } bpf_object__for_each_program(prog, pobj) { - prog_name = bpf_program__title(prog, /*needs_copy*/ false); + prog_name = bpf_program__section_name(prog); if (libbpf_attach_type_by_name(prog_name, &attach_type)) goto err; From 52109584202783e90a83c95103307a9a03ba7d9e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Sep 2020 13:35:38 -0700 Subject: [PATCH 16/95] libbpf: Deprecate notion of BPF program "title" in favor of "section name" BPF program title is ambigious and misleading term. It is ELF section name, so let's just call it that and deprecate bpf_program__title() API in favor of bpf_program__section_name(). Additionally, using bpf_object__find_program_by_title() is now inherently dangerous and ambiguous, as multiple BPF program can have the same section name. So deprecate this API as well and recommend to switch to non-ambiguous bpf_object__find_program_by_name(). Internally, clean up usage and mis-usage of BPF program section name for denoting BPF program name. Shorten the field name to prog->sec_name to be consistent with all other prog->sec_* variables. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200903203542.15944-11-andriin@fb.com --- tools/lib/bpf/libbpf.c | 215 ++++++++++++++++++--------------------- tools/lib/bpf/libbpf.h | 5 +- tools/lib/bpf/libbpf.map | 1 + 3 files changed, 101 insertions(+), 120 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 2d6bf0af3305..47b43c13eee5 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -217,7 +217,7 @@ struct bpf_sec_def { */ struct bpf_program { const struct bpf_sec_def *sec_def; - char *section_name; + char *sec_name; size_t sec_idx; /* this program's instruction offset (in number of instructions) * within its containing ELF section @@ -239,7 +239,7 @@ struct bpf_program { size_t sub_insn_off; char *name; - /* section_name with / replaced by _; makes recursive pinning + /* sec_name with / replaced by _; makes recursive pinning * in bpf_object__pin_programs easier */ char *pin_name; @@ -515,7 +515,7 @@ static void bpf_program__exit(struct bpf_program *prog) bpf_program__unload(prog); zfree(&prog->name); - zfree(&prog->section_name); + zfree(&prog->sec_name); zfree(&prog->pin_name); zfree(&prog->insns); zfree(&prog->reloc_desc); @@ -529,7 +529,7 @@ static char *__bpf_program__pin_name(struct bpf_program *prog) { char *name, *p; - name = p = strdup(prog->section_name); + name = p = strdup(prog->sec_name); while ((p = strchr(p, '/'))) *p = '_'; @@ -574,8 +574,8 @@ bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog, prog->instances.fds = NULL; prog->instances.nr = -1; - prog->section_name = strdup(sec_name); - if (!prog->section_name) + prog->sec_name = strdup(sec_name); + if (!prog->sec_name) goto errout; prog->name = strdup(name); @@ -3254,7 +3254,7 @@ bpf_object__find_program_by_title(const struct bpf_object *obj, struct bpf_program *pos; bpf_object__for_each_program(pos, obj) { - if (pos->section_name && !strcmp(pos->section_name, title)) + if (pos->sec_name && !strcmp(pos->sec_name, title)) return pos; } return NULL; @@ -4994,8 +4994,7 @@ static int bpf_core_calc_field_relo(const struct bpf_program *prog, *val = sz; } else { pr_warn("prog '%s': relo %d at insn #%d can't be applied to array access\n", - bpf_program__title(prog, false), - relo->kind, relo->insn_off / 8); + prog->name, relo->kind, relo->insn_off / 8); return -EINVAL; } if (validate) @@ -5017,8 +5016,7 @@ static int bpf_core_calc_field_relo(const struct bpf_program *prog, if (byte_sz >= 8) { /* bitfield can't be read with 64-bit read */ pr_warn("prog '%s': relo %d at insn #%d can't be satisfied for bitfield\n", - bpf_program__title(prog, false), - relo->kind, relo->insn_off / 8); + prog->name, relo->kind, relo->insn_off / 8); return -E2BIG; } byte_sz *= 2; @@ -5183,8 +5181,8 @@ static int bpf_core_calc_relo(const struct bpf_program *prog, } else if (err == -EOPNOTSUPP) { /* EOPNOTSUPP means unknown/unsupported relocation */ pr_warn("prog '%s': relo #%d: unrecognized CO-RE relocation %s (%d) at insn #%d\n", - bpf_program__title(prog, false), relo_idx, - core_relo_kind_str(relo->kind), relo->kind, relo->insn_off / 8); + prog->name, relo_idx, core_relo_kind_str(relo->kind), + relo->kind, relo->insn_off / 8); } return err; @@ -5198,7 +5196,7 @@ static void bpf_core_poison_insn(struct bpf_program *prog, int relo_idx, int insn_idx, struct bpf_insn *insn) { pr_debug("prog '%s': relo #%d: substituting insn #%d w/ invalid insn\n", - bpf_program__title(prog, false), relo_idx, insn_idx); + prog->name, relo_idx, insn_idx); insn->code = BPF_JMP | BPF_CALL; insn->dst_reg = 0; insn->src_reg = 0; @@ -5270,14 +5268,14 @@ static int bpf_core_patch_insn(struct bpf_program *prog, return -EINVAL; if (res->validate && insn->imm != orig_val) { pr_warn("prog '%s': relo #%d: unexpected insn #%d (ALU/ALU64) value: got %u, exp %u -> %u\n", - bpf_program__title(prog, false), relo_idx, + prog->name, relo_idx, insn_idx, insn->imm, orig_val, new_val); return -EINVAL; } orig_val = insn->imm; insn->imm = new_val; pr_debug("prog '%s': relo #%d: patched insn #%d (ALU/ALU64) imm %u -> %u\n", - bpf_program__title(prog, false), relo_idx, insn_idx, + prog->name, relo_idx, insn_idx, orig_val, new_val); break; case BPF_LDX: @@ -5285,21 +5283,18 @@ static int bpf_core_patch_insn(struct bpf_program *prog, case BPF_STX: if (res->validate && insn->off != orig_val) { pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDX/ST/STX) value: got %u, exp %u -> %u\n", - bpf_program__title(prog, false), relo_idx, - insn_idx, insn->off, orig_val, new_val); + prog->name, relo_idx, insn_idx, insn->off, orig_val, new_val); return -EINVAL; } if (new_val > SHRT_MAX) { pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) value too big: %u\n", - bpf_program__title(prog, false), relo_idx, - insn_idx, new_val); + prog->name, relo_idx, insn_idx, new_val); return -ERANGE; } orig_val = insn->off; insn->off = new_val; pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) off %u -> %u\n", - bpf_program__title(prog, false), relo_idx, insn_idx, - orig_val, new_val); + prog->name, relo_idx, insn_idx, orig_val, new_val); break; case BPF_LD: { __u64 imm; @@ -5310,14 +5305,14 @@ static int bpf_core_patch_insn(struct bpf_program *prog, insn[1].code != 0 || insn[1].dst_reg != 0 || insn[1].src_reg != 0 || insn[1].off != 0) { pr_warn("prog '%s': relo #%d: insn #%d (LDIMM64) has unexpected form\n", - bpf_program__title(prog, false), relo_idx, insn_idx); + prog->name, relo_idx, insn_idx); return -EINVAL; } imm = insn[0].imm + ((__u64)insn[1].imm << 32); if (res->validate && imm != orig_val) { pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDIMM64) value: got %llu, exp %u -> %u\n", - bpf_program__title(prog, false), relo_idx, + prog->name, relo_idx, insn_idx, (unsigned long long)imm, orig_val, new_val); return -EINVAL; @@ -5326,15 +5321,14 @@ static int bpf_core_patch_insn(struct bpf_program *prog, insn[0].imm = new_val; insn[1].imm = 0; /* currently only 32-bit values are supported */ pr_debug("prog '%s': relo #%d: patched insn #%d (LDIMM64) imm64 %llu -> %u\n", - bpf_program__title(prog, false), relo_idx, insn_idx, + prog->name, relo_idx, insn_idx, (unsigned long long)imm, new_val); break; } default: pr_warn("prog '%s': relo #%d: trying to relocate unrecognized insn #%d, code:0x%x, src:0x%x, dst:0x%x, off:0x%x, imm:0x%x\n", - bpf_program__title(prog, false), relo_idx, - insn_idx, insn->code, insn->src_reg, insn->dst_reg, - insn->off, insn->imm); + prog->name, relo_idx, insn_idx, insn->code, + insn->src_reg, insn->dst_reg, insn->off, insn->imm); return -EINVAL; } @@ -5464,7 +5458,6 @@ static int bpf_core_apply_relo(struct bpf_program *prog, const struct btf *targ_btf, struct hashmap *cand_cache) { - const char *prog_name = bpf_program__title(prog, false); struct bpf_core_spec local_spec, cand_spec, targ_spec = {}; const void *type_key = u32_as_hash_key(relo->type_id); struct bpf_core_relo_res cand_res, targ_res; @@ -5491,13 +5484,13 @@ static int bpf_core_apply_relo(struct bpf_program *prog, err = bpf_core_parse_spec(local_btf, local_id, spec_str, relo->kind, &local_spec); if (err) { pr_warn("prog '%s': relo #%d: parsing [%d] %s %s + %s failed: %d\n", - prog_name, relo_idx, local_id, btf_kind_str(local_type), + prog->name, relo_idx, local_id, btf_kind_str(local_type), str_is_empty(local_name) ? "" : local_name, spec_str, err); return -EINVAL; } - pr_debug("prog '%s': relo #%d: kind <%s> (%d), spec is ", prog_name, + pr_debug("prog '%s': relo #%d: kind <%s> (%d), spec is ", prog->name, relo_idx, core_relo_kind_str(relo->kind), relo->kind); bpf_core_dump_spec(LIBBPF_DEBUG, &local_spec); libbpf_print(LIBBPF_DEBUG, "\n"); @@ -5514,7 +5507,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog, /* libbpf doesn't support candidate search for anonymous types */ if (str_is_empty(spec_str)) { pr_warn("prog '%s': relo #%d: <%s> (%d) relocation doesn't support anonymous types\n", - prog_name, relo_idx, core_relo_kind_str(relo->kind), relo->kind); + prog->name, relo_idx, core_relo_kind_str(relo->kind), relo->kind); return -EOPNOTSUPP; } @@ -5522,7 +5515,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog, cand_ids = bpf_core_find_cands(local_btf, local_id, targ_btf); if (IS_ERR(cand_ids)) { pr_warn("prog '%s': relo #%d: target candidate search failed for [%d] %s %s: %ld", - prog_name, relo_idx, local_id, btf_kind_str(local_type), + prog->name, relo_idx, local_id, btf_kind_str(local_type), local_name, PTR_ERR(cand_ids)); return PTR_ERR(cand_ids); } @@ -5538,13 +5531,13 @@ static int bpf_core_apply_relo(struct bpf_program *prog, err = bpf_core_spec_match(&local_spec, targ_btf, cand_id, &cand_spec); if (err < 0) { pr_warn("prog '%s': relo #%d: error matching candidate #%d ", - prog_name, relo_idx, i); + prog->name, relo_idx, i); bpf_core_dump_spec(LIBBPF_WARN, &cand_spec); libbpf_print(LIBBPF_WARN, ": %d\n", err); return err; } - pr_debug("prog '%s': relo #%d: %s candidate #%d ", prog_name, + pr_debug("prog '%s': relo #%d: %s candidate #%d ", prog->name, relo_idx, err == 0 ? "non-matching" : "matching", i); bpf_core_dump_spec(LIBBPF_DEBUG, &cand_spec); libbpf_print(LIBBPF_DEBUG, "\n"); @@ -5564,7 +5557,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog, * should all resolve to the same bit offset */ pr_warn("prog '%s': relo #%d: field offset ambiguity: %u != %u\n", - prog_name, relo_idx, cand_spec.bit_offset, + prog->name, relo_idx, cand_spec.bit_offset, targ_spec.bit_offset); return -EINVAL; } else if (cand_res.poison != targ_res.poison || cand_res.new_val != targ_res.new_val) { @@ -5573,7 +5566,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog, * proceed due to ambiguity */ pr_warn("prog '%s': relo #%d: relocation decision ambiguity: %s %u != %s %u\n", - prog_name, relo_idx, + prog->name, relo_idx, cand_res.poison ? "failure" : "success", cand_res.new_val, targ_res.poison ? "failure" : "success", targ_res.new_val); return -EINVAL; @@ -5606,7 +5599,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog, */ if (j == 0) { pr_debug("prog '%s': relo #%d: no matching targets found\n", - prog_name, relo_idx); + prog->name, relo_idx); /* calculate single target relo result explicitly */ err = bpf_core_calc_relo(prog, relo, relo_idx, &local_spec, NULL, &targ_res); @@ -5619,7 +5612,7 @@ patch_insn: err = bpf_core_patch_insn(prog, relo, relo_idx, &targ_res); if (err) { pr_warn("prog '%s': relo #%d: failed to patch insn at offset %d: %d\n", - prog_name, relo_idx, relo->insn_off, err); + prog->name, relo_idx, relo->insn_off, err); return -EINVAL; } @@ -5673,7 +5666,7 @@ bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path) prog = NULL; for (i = 0; i < obj->nr_programs; i++) { prog = &obj->programs[i]; - if (strcmp(prog->section_name, sec_name) == 0) + if (strcmp(prog->sec_name, sec_name) == 0) break; } if (!prog) { @@ -5787,7 +5780,7 @@ static int adjust_prog_btf_ext_info(const struct bpf_object *obj, sec_name = btf__name_by_offset(obj->btf, sec->sec_name_off); if (!sec_name) return -EINVAL; - if (strcmp(sec_name, prog->section_name) != 0) + if (strcmp(sec_name, prog->sec_name) != 0) continue; for_each_btf_ext_rec(ext_info, sec, i, rec) { @@ -6527,8 +6520,7 @@ int bpf_program__load(struct bpf_program *prog, char *license, __u32 kern_ver) int err = 0, fd, i, btf_id; if (prog->obj->loaded) { - pr_warn("prog '%s'('%s'): can't load after object was loaded\n", - prog->name, prog->section_name); + pr_warn("prog '%s': can't load after object was loaded\n", prog->name); return -EINVAL; } @@ -6544,7 +6536,7 @@ int bpf_program__load(struct bpf_program *prog, char *license, __u32 kern_ver) if (prog->instances.nr < 0 || !prog->instances.fds) { if (prog->preprocessor) { pr_warn("Internal error: can't load program '%s'\n", - prog->section_name); + prog->name); return -LIBBPF_ERRNO__INTERNAL; } @@ -6559,8 +6551,8 @@ int bpf_program__load(struct bpf_program *prog, char *license, __u32 kern_ver) if (!prog->preprocessor) { if (prog->instances.nr != 1) { - pr_warn("Program '%s' is inconsistent: nr(%d) != 1\n", - prog->section_name, prog->instances.nr); + pr_warn("prog '%s': inconsistent nr(%d) != 1\n", + prog->name, prog->instances.nr); } err = load_program(prog, prog->insns, prog->insns_cnt, license, kern_ver, &fd); @@ -6578,13 +6570,13 @@ int bpf_program__load(struct bpf_program *prog, char *license, __u32 kern_ver) prog->insns_cnt, &result); if (err) { pr_warn("Preprocessing the %dth instance of program '%s' failed\n", - i, prog->section_name); + i, prog->name); goto out; } if (!result.new_insn_ptr || !result.new_insn_cnt) { pr_debug("Skip loading the %dth instance of program '%s'\n", - i, prog->section_name); + i, prog->name); prog->instances.fds[i] = -1; if (result.pfd) *result.pfd = -1; @@ -6595,7 +6587,7 @@ int bpf_program__load(struct bpf_program *prog, char *license, __u32 kern_ver) result.new_insn_cnt, license, kern_ver, &fd); if (err) { pr_warn("Loading the %dth instance of program '%s' failed\n", - i, prog->section_name); + i, prog->name); goto out; } @@ -6605,7 +6597,7 @@ int bpf_program__load(struct bpf_program *prog, char *license, __u32 kern_ver) } out: if (err) - pr_warn("failed to load program '%s'\n", prog->section_name); + pr_warn("failed to load program '%s'\n", prog->name); zfree(&prog->insns); prog->insns_cnt = 0; return err; @@ -6697,7 +6689,7 @@ __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, bpf_object__elf_finish(obj); bpf_object__for_each_program(prog, obj) { - prog->sec_def = find_sec_def(prog->section_name); + prog->sec_def = find_sec_def(prog->sec_name); if (!prog->sec_def) /* couldn't guess, but user might manually specify */ continue; @@ -7078,7 +7070,7 @@ int bpf_program__pin_instance(struct bpf_program *prog, const char *path, if (instance < 0 || instance >= prog->instances.nr) { pr_warn("invalid prog instance %d of prog %s (max %d)\n", - instance, prog->section_name, prog->instances.nr); + instance, prog->name, prog->instances.nr); return -EINVAL; } @@ -7109,7 +7101,7 @@ int bpf_program__unpin_instance(struct bpf_program *prog, const char *path, if (instance < 0 || instance >= prog->instances.nr) { pr_warn("invalid prog instance %d of prog %s (max %d)\n", - instance, prog->section_name, prog->instances.nr); + instance, prog->name, prog->instances.nr); return -EINVAL; } @@ -7139,8 +7131,7 @@ int bpf_program__pin(struct bpf_program *prog, const char *path) } if (prog->instances.nr <= 0) { - pr_warn("no instances of prog %s to pin\n", - prog->section_name); + pr_warn("no instances of prog %s to pin\n", prog->name); return -EINVAL; } @@ -7202,8 +7193,7 @@ int bpf_program__unpin(struct bpf_program *prog, const char *path) } if (prog->instances.nr <= 0) { - pr_warn("no instances of prog %s to pin\n", - prog->section_name); + pr_warn("no instances of prog %s to pin\n", prog->name); return -EINVAL; } @@ -7738,11 +7728,16 @@ const char *bpf_program__name(const struct bpf_program *prog) return prog->name; } +const char *bpf_program__section_name(const struct bpf_program *prog) +{ + return prog->sec_name; +} + const char *bpf_program__title(const struct bpf_program *prog, bool needs_copy) { const char *title; - title = prog->section_name; + title = prog->sec_name; if (needs_copy) { title = strdup(title); if (!title) { @@ -7815,14 +7810,14 @@ int bpf_program__nth_fd(const struct bpf_program *prog, int n) if (n >= prog->instances.nr || n < 0) { pr_warn("Can't get the %dth fd from program %s: only %d instances\n", - n, prog->section_name, prog->instances.nr); + n, prog->name, prog->instances.nr); return -EINVAL; } fd = prog->instances.fds[n]; if (fd < 0) { pr_warn("%dth instance of program '%s' is invalid\n", - n, prog->section_name); + n, prog->name); return -ENOENT; } @@ -8259,7 +8254,7 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, if (prog->type == BPF_PROG_TYPE_UNSPEC) { const struct bpf_sec_def *sec_def; - sec_def = find_sec_def(prog->section_name); + sec_def = find_sec_def(prog->sec_name); if (sec_def && sec_def->prog_type != BPF_PROG_TYPE_STRUCT_OPS) { /* for pr_warn */ @@ -8282,7 +8277,7 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, invalid_prog: pr_warn("struct_ops reloc %s: cannot use prog %s in sec %s with type %u attach_btf_id %u expected_attach_type %u for func ptr %s\n", - map->name, prog->name, prog->section_name, prog->type, + map->name, prog->name, prog->sec_name, prog->type, prog->attach_btf_id, prog->expected_attach_type, name); return -EINVAL; } @@ -8386,7 +8381,7 @@ static int libbpf_find_attach_btf_id(struct bpf_program *prog) { enum bpf_attach_type attach_type = prog->expected_attach_type; __u32 attach_prog_fd = prog->attach_prog_fd; - const char *name = prog->section_name; + const char *name = prog->sec_name; int i, err; if (!name) @@ -8913,14 +8908,14 @@ struct bpf_link *bpf_program__attach_perf_event(struct bpf_program *prog, int prog_fd, err; if (pfd < 0) { - pr_warn("program '%s': invalid perf event FD %d\n", - bpf_program__title(prog, false), pfd); + pr_warn("prog '%s': invalid perf event FD %d\n", + prog->name, pfd); return ERR_PTR(-EINVAL); } prog_fd = bpf_program__fd(prog); if (prog_fd < 0) { - pr_warn("program '%s': can't attach BPF program w/o FD (did you load it?)\n", - bpf_program__title(prog, false)); + pr_warn("prog '%s': can't attach BPF program w/o FD (did you load it?)\n", + prog->name); return ERR_PTR(-EINVAL); } @@ -8933,20 +8928,18 @@ struct bpf_link *bpf_program__attach_perf_event(struct bpf_program *prog, if (ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd) < 0) { err = -errno; free(link); - pr_warn("program '%s': failed to attach to pfd %d: %s\n", - bpf_program__title(prog, false), pfd, - libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + pr_warn("prog '%s': failed to attach to pfd %d: %s\n", + prog->name, pfd, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); if (err == -EPROTO) - pr_warn("program '%s': try add PERF_SAMPLE_CALLCHAIN to or remove exclude_callchain_[kernel|user] from pfd %d\n", - bpf_program__title(prog, false), pfd); + pr_warn("prog '%s': try add PERF_SAMPLE_CALLCHAIN to or remove exclude_callchain_[kernel|user] from pfd %d\n", + prog->name, pfd); return ERR_PTR(err); } if (ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) < 0) { err = -errno; free(link); - pr_warn("program '%s': failed to enable pfd %d: %s\n", - bpf_program__title(prog, false), pfd, - libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + pr_warn("prog '%s': failed to enable pfd %d: %s\n", + prog->name, pfd, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); return ERR_PTR(err); } return link; @@ -9068,9 +9061,8 @@ struct bpf_link *bpf_program__attach_kprobe(struct bpf_program *prog, pfd = perf_event_open_probe(false /* uprobe */, retprobe, func_name, 0 /* offset */, -1 /* pid */); if (pfd < 0) { - pr_warn("program '%s': failed to create %s '%s' perf event: %s\n", - bpf_program__title(prog, false), - retprobe ? "kretprobe" : "kprobe", func_name, + pr_warn("prog '%s': failed to create %s '%s' perf event: %s\n", + prog->name, retprobe ? "kretprobe" : "kprobe", func_name, libbpf_strerror_r(pfd, errmsg, sizeof(errmsg))); return ERR_PTR(pfd); } @@ -9078,9 +9070,8 @@ struct bpf_link *bpf_program__attach_kprobe(struct bpf_program *prog, if (IS_ERR(link)) { close(pfd); err = PTR_ERR(link); - pr_warn("program '%s': failed to attach to %s '%s': %s\n", - bpf_program__title(prog, false), - retprobe ? "kretprobe" : "kprobe", func_name, + pr_warn("prog '%s': failed to attach to %s '%s': %s\n", + prog->name, retprobe ? "kretprobe" : "kprobe", func_name, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); return link; } @@ -9093,7 +9084,7 @@ static struct bpf_link *attach_kprobe(const struct bpf_sec_def *sec, const char *func_name; bool retprobe; - func_name = bpf_program__title(prog, false) + sec->len; + func_name = prog->sec_name + sec->len; retprobe = strcmp(sec->sec, "kretprobe/") == 0; return bpf_program__attach_kprobe(prog, retprobe, func_name); @@ -9111,9 +9102,8 @@ struct bpf_link *bpf_program__attach_uprobe(struct bpf_program *prog, pfd = perf_event_open_probe(true /* uprobe */, retprobe, binary_path, func_offset, pid); if (pfd < 0) { - pr_warn("program '%s': failed to create %s '%s:0x%zx' perf event: %s\n", - bpf_program__title(prog, false), - retprobe ? "uretprobe" : "uprobe", + pr_warn("prog '%s': failed to create %s '%s:0x%zx' perf event: %s\n", + prog->name, retprobe ? "uretprobe" : "uprobe", binary_path, func_offset, libbpf_strerror_r(pfd, errmsg, sizeof(errmsg))); return ERR_PTR(pfd); @@ -9122,9 +9112,8 @@ struct bpf_link *bpf_program__attach_uprobe(struct bpf_program *prog, if (IS_ERR(link)) { close(pfd); err = PTR_ERR(link); - pr_warn("program '%s': failed to attach to %s '%s:0x%zx': %s\n", - bpf_program__title(prog, false), - retprobe ? "uretprobe" : "uprobe", + pr_warn("prog '%s': failed to attach to %s '%s:0x%zx': %s\n", + prog->name, retprobe ? "uretprobe" : "uprobe", binary_path, func_offset, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); return link; @@ -9192,9 +9181,8 @@ struct bpf_link *bpf_program__attach_tracepoint(struct bpf_program *prog, pfd = perf_event_open_tracepoint(tp_category, tp_name); if (pfd < 0) { - pr_warn("program '%s': failed to create tracepoint '%s/%s' perf event: %s\n", - bpf_program__title(prog, false), - tp_category, tp_name, + pr_warn("prog '%s': failed to create tracepoint '%s/%s' perf event: %s\n", + prog->name, tp_category, tp_name, libbpf_strerror_r(pfd, errmsg, sizeof(errmsg))); return ERR_PTR(pfd); } @@ -9202,9 +9190,8 @@ struct bpf_link *bpf_program__attach_tracepoint(struct bpf_program *prog, if (IS_ERR(link)) { close(pfd); err = PTR_ERR(link); - pr_warn("program '%s': failed to attach to tracepoint '%s/%s': %s\n", - bpf_program__title(prog, false), - tp_category, tp_name, + pr_warn("prog '%s': failed to attach to tracepoint '%s/%s': %s\n", + prog->name, tp_category, tp_name, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); return link; } @@ -9217,7 +9204,7 @@ static struct bpf_link *attach_tp(const struct bpf_sec_def *sec, char *sec_name, *tp_cat, *tp_name; struct bpf_link *link; - sec_name = strdup(bpf_program__title(prog, false)); + sec_name = strdup(prog->sec_name); if (!sec_name) return ERR_PTR(-ENOMEM); @@ -9246,8 +9233,7 @@ struct bpf_link *bpf_program__attach_raw_tracepoint(struct bpf_program *prog, prog_fd = bpf_program__fd(prog); if (prog_fd < 0) { - pr_warn("program '%s': can't attach before loaded\n", - bpf_program__title(prog, false)); + pr_warn("prog '%s': can't attach before loaded\n", prog->name); return ERR_PTR(-EINVAL); } @@ -9260,9 +9246,8 @@ struct bpf_link *bpf_program__attach_raw_tracepoint(struct bpf_program *prog, if (pfd < 0) { pfd = -errno; free(link); - pr_warn("program '%s': failed to attach to raw tracepoint '%s': %s\n", - bpf_program__title(prog, false), tp_name, - libbpf_strerror_r(pfd, errmsg, sizeof(errmsg))); + pr_warn("prog '%s': failed to attach to raw tracepoint '%s': %s\n", + prog->name, tp_name, libbpf_strerror_r(pfd, errmsg, sizeof(errmsg))); return ERR_PTR(pfd); } link->fd = pfd; @@ -9272,7 +9257,7 @@ struct bpf_link *bpf_program__attach_raw_tracepoint(struct bpf_program *prog, static struct bpf_link *attach_raw_tp(const struct bpf_sec_def *sec, struct bpf_program *prog) { - const char *tp_name = bpf_program__title(prog, false) + sec->len; + const char *tp_name = prog->sec_name + sec->len; return bpf_program__attach_raw_tracepoint(prog, tp_name); } @@ -9286,8 +9271,7 @@ static struct bpf_link *bpf_program__attach_btf_id(struct bpf_program *prog) prog_fd = bpf_program__fd(prog); if (prog_fd < 0) { - pr_warn("program '%s': can't attach before loaded\n", - bpf_program__title(prog, false)); + pr_warn("prog '%s': can't attach before loaded\n", prog->name); return ERR_PTR(-EINVAL); } @@ -9300,9 +9284,8 @@ static struct bpf_link *bpf_program__attach_btf_id(struct bpf_program *prog) if (pfd < 0) { pfd = -errno; free(link); - pr_warn("program '%s': failed to attach: %s\n", - bpf_program__title(prog, false), - libbpf_strerror_r(pfd, errmsg, sizeof(errmsg))); + pr_warn("prog '%s': failed to attach: %s\n", + prog->name, libbpf_strerror_r(pfd, errmsg, sizeof(errmsg))); return ERR_PTR(pfd); } link->fd = pfd; @@ -9348,8 +9331,7 @@ bpf_program__attach_fd(struct bpf_program *prog, int target_fd, prog_fd = bpf_program__fd(prog); if (prog_fd < 0) { - pr_warn("program '%s': can't attach before loaded\n", - bpf_program__title(prog, false)); + pr_warn("prog '%s': can't attach before loaded\n", prog->name); return ERR_PTR(-EINVAL); } @@ -9363,8 +9345,8 @@ bpf_program__attach_fd(struct bpf_program *prog, int target_fd, if (link_fd < 0) { link_fd = -errno; free(link); - pr_warn("program '%s': failed to attach to %s: %s\n", - bpf_program__title(prog, false), target_name, + pr_warn("prog '%s': failed to attach to %s: %s\n", + prog->name, target_name, libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg))); return ERR_PTR(link_fd); } @@ -9408,8 +9390,7 @@ bpf_program__attach_iter(struct bpf_program *prog, prog_fd = bpf_program__fd(prog); if (prog_fd < 0) { - pr_warn("program '%s': can't attach before loaded\n", - bpf_program__title(prog, false)); + pr_warn("prog '%s': can't attach before loaded\n", prog->name); return ERR_PTR(-EINVAL); } @@ -9423,9 +9404,8 @@ bpf_program__attach_iter(struct bpf_program *prog, if (link_fd < 0) { link_fd = -errno; free(link); - pr_warn("program '%s': failed to attach to iterator: %s\n", - bpf_program__title(prog, false), - libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg))); + pr_warn("prog '%s': failed to attach to iterator: %s\n", + prog->name, libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg))); return ERR_PTR(link_fd); } link->fd = link_fd; @@ -9436,7 +9416,7 @@ struct bpf_link *bpf_program__attach(struct bpf_program *prog) { const struct bpf_sec_def *sec_def; - sec_def = find_sec_def(bpf_program__title(prog, false)); + sec_def = find_sec_def(prog->sec_name); if (!sec_def || !sec_def->attach_fn) return ERR_PTR(-ESRCH); @@ -10506,12 +10486,11 @@ int bpf_object__attach_skeleton(struct bpf_object_skeleton *s) struct bpf_program *prog = *s->progs[i].prog; struct bpf_link **link = s->progs[i].link; const struct bpf_sec_def *sec_def; - const char *sec_name = bpf_program__title(prog, false); if (!prog->load) continue; - sec_def = find_sec_def(sec_name); + sec_def = find_sec_def(prog->sec_name); if (!sec_def || !sec_def->attach_fn) continue; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 308e0ded8f14..a750f67a23f6 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -198,8 +198,9 @@ LIBBPF_API void bpf_program__set_ifindex(struct bpf_program *prog, __u32 ifindex); LIBBPF_API const char *bpf_program__name(const struct bpf_program *prog); -LIBBPF_API const char *bpf_program__title(const struct bpf_program *prog, - bool needs_copy); +LIBBPF_API const char *bpf_program__section_name(const struct bpf_program *prog); +LIBBPF_API LIBBPF_DEPRECATED("BPF program title is confusing term; please use bpf_program__section_name() instead") +const char *bpf_program__title(const struct bpf_program *prog, bool needs_copy); LIBBPF_API bool bpf_program__autoload(const struct bpf_program *prog); LIBBPF_API int bpf_program__set_autoload(struct bpf_program *prog, bool autoload); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 3fedcdc4ae2f..92ceb48a5ca2 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -302,6 +302,7 @@ LIBBPF_0.1.0 { LIBBPF_0.2.0 { global: + bpf_program__section_name; perf_buffer__buffer_cnt; perf_buffer__buffer_fd; perf_buffer__epoll_fd; From d86687ae6b759a3ac3e2db63390cef6006144681 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Sep 2020 13:35:39 -0700 Subject: [PATCH 17/95] selftests/bpf: Turn fexit_bpf2bpf into test with subtests There are clearly 4 subtests, so make it official. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200903203542.15944-12-andriin@fb.com --- .../selftests/bpf/prog_tests/fexit_bpf2bpf.c | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c index a550dab9ba7a..eda682727787 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c @@ -208,11 +208,18 @@ static void test_func_map_prog_compatibility(void) void test_fexit_bpf2bpf(void) { - test_target_no_callees(); - test_target_yes_callees(); - test_func_replace(); - test_func_replace_verify(); - test_func_sockmap_update(); - test_func_replace_return_code(); - test_func_map_prog_compatibility(); + if (test__start_subtest("target_no_callees")) + test_target_no_callees(); + if (test__start_subtest("target_yes_callees")) + test_target_yes_callees(); + if (test__start_subtest("func_replace")) + test_func_replace(); + if (test__start_subtest("func_replace_verify")) + test_func_replace_verify(); + if (test__start_subtest("func_sockmap_update")) + test_func_sockmap_update(); + if (test__start_subtest("func_replace_return_code")) + test_func_replace_return_code(); + if (test__start_subtest("func_map_prog_compatibility")) + test_func_map_prog_compatibility(); } From fab45be1d26e8c3cee62e09bcc39c149b175f848 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Sep 2020 13:35:40 -0700 Subject: [PATCH 18/95] selftests/bpf: Add subprogs to pyperf, strobemeta, and l4lb_noinline tests Add use of non-inlined subprogs to few bigger selftests to excercise libbpf's bpf2bpf handling logic. Also split l4lb_all selftest into two sub-tests. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200903203542.15944-13-andriin@fb.com --- .../bpf/prog_tests/bpf_verif_scale.c | 4 ++ .../selftests/bpf/prog_tests/l4lb_all.c | 9 ++-- tools/testing/selftests/bpf/progs/pyperf.h | 11 ++++- .../selftests/bpf/progs/pyperf_subprogs.c | 5 +++ .../testing/selftests/bpf/progs/strobemeta.h | 30 ++++++++++---- .../selftests/bpf/progs/strobemeta_subprogs.c | 10 +++++ .../selftests/bpf/progs/test_l4lb_noinline.c | 41 +++++++++---------- 7 files changed, 73 insertions(+), 37 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/pyperf_subprogs.c create mode 100644 tools/testing/selftests/bpf/progs/strobemeta_subprogs.c diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c index e9f2f12ba06b..e698ee6bb6c2 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c @@ -49,6 +49,7 @@ void test_bpf_verif_scale(void) { "test_verif_scale3.o", BPF_PROG_TYPE_SCHED_CLS }, { "pyperf_global.o", BPF_PROG_TYPE_RAW_TRACEPOINT }, + { "pyperf_subprogs.o", BPF_PROG_TYPE_RAW_TRACEPOINT }, /* full unroll by llvm */ { "pyperf50.o", BPF_PROG_TYPE_RAW_TRACEPOINT }, @@ -86,6 +87,9 @@ void test_bpf_verif_scale(void) { "strobemeta_nounroll1.o", BPF_PROG_TYPE_RAW_TRACEPOINT }, { "strobemeta_nounroll2.o", BPF_PROG_TYPE_RAW_TRACEPOINT }, + /* non-inlined subprogs */ + { "strobemeta_subprogs.o", BPF_PROG_TYPE_RAW_TRACEPOINT }, + { "test_sysctl_loop1.o", BPF_PROG_TYPE_CGROUP_SYSCTL }, { "test_sysctl_loop2.o", BPF_PROG_TYPE_CGROUP_SYSCTL }, diff --git a/tools/testing/selftests/bpf/prog_tests/l4lb_all.c b/tools/testing/selftests/bpf/prog_tests/l4lb_all.c index c2d373e294bb..8073105548ff 100644 --- a/tools/testing/selftests/bpf/prog_tests/l4lb_all.c +++ b/tools/testing/selftests/bpf/prog_tests/l4lb_all.c @@ -80,9 +80,8 @@ out: void test_l4lb_all(void) { - const char *file1 = "./test_l4lb.o"; - const char *file2 = "./test_l4lb_noinline.o"; - - test_l4lb(file1); - test_l4lb(file2); + if (test__start_subtest("l4lb_inline")) + test_l4lb("test_l4lb.o"); + if (test__start_subtest("l4lb_noinline")) + test_l4lb("test_l4lb_noinline.o"); } diff --git a/tools/testing/selftests/bpf/progs/pyperf.h b/tools/testing/selftests/bpf/progs/pyperf.h index cc615b82b56e..2fb7adafb6b6 100644 --- a/tools/testing/selftests/bpf/progs/pyperf.h +++ b/tools/testing/selftests/bpf/progs/pyperf.h @@ -67,7 +67,12 @@ typedef struct { void* co_name; // PyCodeObject.co_name } FrameData; -static __always_inline void *get_thread_state(void *tls_base, PidData *pidData) +#ifdef SUBPROGS +__noinline +#else +__always_inline +#endif +static void *get_thread_state(void *tls_base, PidData *pidData) { void* thread_state; int key; @@ -155,7 +160,9 @@ struct { } stackmap SEC(".maps"); #ifdef GLOBAL_FUNC -__attribute__((noinline)) +__noinline +#elif defined(SUBPROGS) +static __noinline #else static __always_inline #endif diff --git a/tools/testing/selftests/bpf/progs/pyperf_subprogs.c b/tools/testing/selftests/bpf/progs/pyperf_subprogs.c new file mode 100644 index 000000000000..60e27a7f0cca --- /dev/null +++ b/tools/testing/selftests/bpf/progs/pyperf_subprogs.c @@ -0,0 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#define STACK_MAX_LEN 50 +#define SUBPROGS +#include "pyperf.h" diff --git a/tools/testing/selftests/bpf/progs/strobemeta.h b/tools/testing/selftests/bpf/progs/strobemeta.h index ad61b722a9de..7de534f38c3f 100644 --- a/tools/testing/selftests/bpf/progs/strobemeta.h +++ b/tools/testing/selftests/bpf/progs/strobemeta.h @@ -266,8 +266,12 @@ struct tls_index { uint64_t offset; }; -static __always_inline void *calc_location(struct strobe_value_loc *loc, - void *tls_base) +#ifdef SUBPROGS +__noinline +#else +__always_inline +#endif +static void *calc_location(struct strobe_value_loc *loc, void *tls_base) { /* * tls_mode value is: @@ -327,10 +331,15 @@ static __always_inline void *calc_location(struct strobe_value_loc *loc, : NULL; } -static __always_inline void read_int_var(struct strobemeta_cfg *cfg, - size_t idx, void *tls_base, - struct strobe_value_generic *value, - struct strobemeta_payload *data) +#ifdef SUBPROGS +__noinline +#else +__always_inline +#endif +static void read_int_var(struct strobemeta_cfg *cfg, + size_t idx, void *tls_base, + struct strobe_value_generic *value, + struct strobemeta_payload *data) { void *location = calc_location(&cfg->int_locs[idx], tls_base); if (!location) @@ -440,8 +449,13 @@ static __always_inline void *read_map_var(struct strobemeta_cfg *cfg, * read_strobe_meta returns NULL, if no metadata was read; otherwise returns * pointer to *right after* payload ends */ -static __always_inline void *read_strobe_meta(struct task_struct *task, - struct strobemeta_payload *data) +#ifdef SUBPROGS +__noinline +#else +__always_inline +#endif +static void *read_strobe_meta(struct task_struct *task, + struct strobemeta_payload *data) { pid_t pid = bpf_get_current_pid_tgid() >> 32; struct strobe_value_generic value = {0}; diff --git a/tools/testing/selftests/bpf/progs/strobemeta_subprogs.c b/tools/testing/selftests/bpf/progs/strobemeta_subprogs.c new file mode 100644 index 000000000000..b6c01f8fc559 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/strobemeta_subprogs.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +// Copyright (c) 2019 Facebook + +#define STROBE_MAX_INTS 2 +#define STROBE_MAX_STRS 25 +#define STROBE_MAX_MAPS 13 +#define STROBE_MAX_MAP_ENTRIES 20 +#define NO_UNROLL +#define SUBPROGS +#include "strobemeta.h" diff --git a/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c b/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c index 28351936a438..b9e2753f4f91 100644 --- a/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c +++ b/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c @@ -17,9 +17,7 @@ #include "test_iptunnel_common.h" #include -int _version SEC("version") = 1; - -static __u32 rol32(__u32 word, unsigned int shift) +static __always_inline __u32 rol32(__u32 word, unsigned int shift) { return (word << shift) | (word >> ((-shift) & 31)); } @@ -52,7 +50,7 @@ static __u32 rol32(__u32 word, unsigned int shift) typedef unsigned int u32; -static u32 jhash(const void *key, u32 length, u32 initval) +static __noinline u32 jhash(const void *key, u32 length, u32 initval) { u32 a, b, c; const unsigned char *k = key; @@ -88,7 +86,7 @@ static u32 jhash(const void *key, u32 length, u32 initval) return c; } -static u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval) +static __noinline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval) { a += initval; b += initval; @@ -97,7 +95,7 @@ static u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval) return c; } -static u32 jhash_2words(u32 a, u32 b, u32 initval) +static __noinline u32 jhash_2words(u32 a, u32 b, u32 initval) { return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2)); } @@ -200,8 +198,7 @@ struct { __type(value, struct ctl_value); } ctl_array SEC(".maps"); -static __u32 get_packet_hash(struct packet_description *pckt, - bool ipv6) +static __noinline __u32 get_packet_hash(struct packet_description *pckt, bool ipv6) { if (ipv6) return jhash_2words(jhash(pckt->srcv6, 16, MAX_VIPS), @@ -210,10 +207,10 @@ static __u32 get_packet_hash(struct packet_description *pckt, return jhash_2words(pckt->src, pckt->ports, CH_RINGS_SIZE); } -static bool get_packet_dst(struct real_definition **real, - struct packet_description *pckt, - struct vip_meta *vip_info, - bool is_ipv6) +static __noinline bool get_packet_dst(struct real_definition **real, + struct packet_description *pckt, + struct vip_meta *vip_info, + bool is_ipv6) { __u32 hash = get_packet_hash(pckt, is_ipv6); __u32 key = RING_SIZE * vip_info->vip_num + hash % RING_SIZE; @@ -233,8 +230,8 @@ static bool get_packet_dst(struct real_definition **real, return true; } -static int parse_icmpv6(void *data, void *data_end, __u64 off, - struct packet_description *pckt) +static __noinline int parse_icmpv6(void *data, void *data_end, __u64 off, + struct packet_description *pckt) { struct icmp6hdr *icmp_hdr; struct ipv6hdr *ip6h; @@ -255,8 +252,8 @@ static int parse_icmpv6(void *data, void *data_end, __u64 off, return TC_ACT_UNSPEC; } -static int parse_icmp(void *data, void *data_end, __u64 off, - struct packet_description *pckt) +static __noinline int parse_icmp(void *data, void *data_end, __u64 off, + struct packet_description *pckt) { struct icmphdr *icmp_hdr; struct iphdr *iph; @@ -280,8 +277,8 @@ static int parse_icmp(void *data, void *data_end, __u64 off, return TC_ACT_UNSPEC; } -static bool parse_udp(void *data, __u64 off, void *data_end, - struct packet_description *pckt) +static __noinline bool parse_udp(void *data, __u64 off, void *data_end, + struct packet_description *pckt) { struct udphdr *udp; udp = data + off; @@ -299,8 +296,8 @@ static bool parse_udp(void *data, __u64 off, void *data_end, return true; } -static bool parse_tcp(void *data, __u64 off, void *data_end, - struct packet_description *pckt) +static __noinline bool parse_tcp(void *data, __u64 off, void *data_end, + struct packet_description *pckt) { struct tcphdr *tcp; @@ -321,8 +318,8 @@ static bool parse_tcp(void *data, __u64 off, void *data_end, return true; } -static int process_packet(void *data, __u64 off, void *data_end, - bool is_ipv6, struct __sk_buff *skb) +static __noinline int process_packet(void *data, __u64 off, void *data_end, + bool is_ipv6, struct __sk_buff *skb) { void *pkt_start = (void *)(long)skb->data; struct packet_description pckt = {}; From baaf680e089f34468f26e86f76af712105f6019b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Sep 2020 13:35:41 -0700 Subject: [PATCH 19/95] selftests/bpf: Modernize xdp_noinline test w/ skeleton and __noinline Update xdp_noinline to use BPF skeleton and force __noinline on helper sub-programs. Also, split existing logic into v4- and v6-only to complicate sub-program calling patterns (partially overlapped sets of functions for entry-level BPF programs). Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200903203542.15944-14-andriin@fb.com --- .../selftests/bpf/prog_tests/xdp_noinline.c | 49 +++++++------------ .../selftests/bpf/progs/test_xdp_noinline.c | 36 ++++++++++---- 2 files changed, 43 insertions(+), 42 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c b/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c index f284f72158ef..a1f06424cf83 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c @@ -1,11 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include "test_xdp_noinline.skel.h" void test_xdp_noinline(void) { - const char *file = "./test_xdp_noinline.o"; unsigned int nr_cpus = bpf_num_possible_cpus(); + struct test_xdp_noinline *skel; struct vip key = {.protocol = 6}; struct vip_meta { __u32 flags; @@ -25,58 +26,42 @@ void test_xdp_noinline(void) } real_def = {.dst = MAGIC_VAL}; __u32 ch_key = 11, real_num = 3; __u32 duration, retval, size; - int err, i, prog_fd, map_fd; + int err, i; __u64 bytes = 0, pkts = 0; - struct bpf_object *obj; char buf[128]; u32 *magic = (u32 *)buf; - err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd); - if (CHECK_FAIL(err)) + skel = test_xdp_noinline__open_and_load(); + if (CHECK(!skel, "skel_open_and_load", "failed\n")) return; - map_fd = bpf_find_map(__func__, obj, "vip_map"); - if (map_fd < 0) - goto out; - bpf_map_update_elem(map_fd, &key, &value, 0); + bpf_map_update_elem(bpf_map__fd(skel->maps.vip_map), &key, &value, 0); + bpf_map_update_elem(bpf_map__fd(skel->maps.ch_rings), &ch_key, &real_num, 0); + bpf_map_update_elem(bpf_map__fd(skel->maps.reals), &real_num, &real_def, 0); - map_fd = bpf_find_map(__func__, obj, "ch_rings"); - if (map_fd < 0) - goto out; - bpf_map_update_elem(map_fd, &ch_key, &real_num, 0); - - map_fd = bpf_find_map(__func__, obj, "reals"); - if (map_fd < 0) - goto out; - bpf_map_update_elem(map_fd, &real_num, &real_def, 0); - - err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v4, sizeof(pkt_v4), + err = bpf_prog_test_run(bpf_program__fd(skel->progs.balancer_ingress_v4), + NUM_ITER, &pkt_v4, sizeof(pkt_v4), buf, &size, &retval, &duration); CHECK(err || retval != 1 || size != 54 || *magic != MAGIC_VAL, "ipv4", "err %d errno %d retval %d size %d magic %x\n", err, errno, retval, size, *magic); - err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v6, sizeof(pkt_v6), + err = bpf_prog_test_run(bpf_program__fd(skel->progs.balancer_ingress_v6), + NUM_ITER, &pkt_v6, sizeof(pkt_v6), buf, &size, &retval, &duration); CHECK(err || retval != 1 || size != 74 || *magic != MAGIC_VAL, "ipv6", "err %d errno %d retval %d size %d magic %x\n", err, errno, retval, size, *magic); - map_fd = bpf_find_map(__func__, obj, "stats"); - if (map_fd < 0) - goto out; - bpf_map_lookup_elem(map_fd, &stats_key, stats); + bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), &stats_key, stats); for (i = 0; i < nr_cpus; i++) { bytes += stats[i].bytes; pkts += stats[i].pkts; } - if (CHECK_FAIL(bytes != MAGIC_BYTES * NUM_ITER * 2 || - pkts != NUM_ITER * 2)) { - printf("test_xdp_noinline:FAIL:stats %lld %lld\n", - bytes, pkts); - } -out: - bpf_object__close(obj); + CHECK(bytes != MAGIC_BYTES * NUM_ITER * 2 || pkts != NUM_ITER * 2, + "stats", "bytes %lld pkts %lld\n", + (unsigned long long)bytes, (unsigned long long)pkts); + test_xdp_noinline__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c index 8beecec166d9..3a67921f62b5 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c @@ -16,7 +16,7 @@ #include #include -static __u32 rol32(__u32 word, unsigned int shift) +static __always_inline __u32 rol32(__u32 word, unsigned int shift) { return (word << shift) | (word >> ((-shift) & 31)); } @@ -49,7 +49,7 @@ static __u32 rol32(__u32 word, unsigned int shift) typedef unsigned int u32; -static __attribute__ ((noinline)) +static __noinline u32 jhash(const void *key, u32 length, u32 initval) { u32 a, b, c; @@ -86,7 +86,7 @@ u32 jhash(const void *key, u32 length, u32 initval) return c; } -__attribute__ ((noinline)) +__noinline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval) { a += initval; @@ -96,7 +96,7 @@ u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval) return c; } -__attribute__ ((noinline)) +__noinline u32 jhash_2words(u32 a, u32 b, u32 initval) { return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2)); @@ -213,7 +213,7 @@ struct eth_hdr { unsigned short eth_proto; }; -static inline __u64 calc_offset(bool is_ipv6, bool is_icmp) +static __noinline __u64 calc_offset(bool is_ipv6, bool is_icmp) { __u64 off = sizeof(struct eth_hdr); if (is_ipv6) { @@ -797,8 +797,8 @@ out: return XDP_DROP; } -__attribute__ ((section("xdp-test"), used)) -int balancer_ingress(struct xdp_md *ctx) +SEC("xdp-test-v4") +int balancer_ingress_v4(struct xdp_md *ctx) { void *data = (void *)(long)ctx->data; void *data_end = (void *)(long)ctx->data_end; @@ -812,11 +812,27 @@ int balancer_ingress(struct xdp_md *ctx) eth_proto = bpf_ntohs(eth->eth_proto); if (eth_proto == ETH_P_IP) return process_packet(data, nh_off, data_end, 0, ctx); - else if (eth_proto == ETH_P_IPV6) + else + return XDP_DROP; +} + +SEC("xdp-test-v6") +int balancer_ingress_v6(struct xdp_md *ctx) +{ + void *data = (void *)(long)ctx->data; + void *data_end = (void *)(long)ctx->data_end; + struct eth_hdr *eth = data; + __u32 eth_proto; + __u32 nh_off; + + nh_off = sizeof(struct eth_hdr); + if (data + nh_off > data_end) + return XDP_DROP; + eth_proto = bpf_ntohs(eth->eth_proto); + if (eth_proto == ETH_P_IPV6) return process_packet(data, nh_off, data_end, 1, ctx); else return XDP_DROP; } -char _license[] __attribute__ ((section("license"), used)) = "GPL"; -int _version __attribute__ ((section("version"), used)) = 1; +char _license[] SEC("license") = "GPL"; From ee333df50bff22956ad37f3784cf120028f343a8 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Sep 2020 13:35:42 -0700 Subject: [PATCH 20/95] selftests/bpf: Add __noinline variant of cls_redirect selftest As one of the most complicated and close-to-real-world programs, cls_redirect is a good candidate to exercise libbpf's logic of handling bpf2bpf calls. So add variant with using explicit __noinline for majority of functions except few most basic ones. If those few functions are inlined, verifier starts to complain about program instruction limit of 1mln instructions being exceeded, most probably due to instruction overhead of doing a sub-program call. Convert user-space part of selftest to have to sub-tests: with and without inlining. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Cc: Lorenz Bauer Link: https://lore.kernel.org/bpf/20200903203542.15944-15-andriin@fb.com --- .../selftests/bpf/prog_tests/cls_redirect.c | 72 +++++++++--- .../selftests/bpf/progs/test_cls_redirect.c | 103 ++++++++++-------- .../bpf/progs/test_cls_redirect_subprogs.c | 2 + 3 files changed, 114 insertions(+), 63 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/test_cls_redirect_subprogs.c diff --git a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c index f259085cca6a..9781d85cb223 100644 --- a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c @@ -12,10 +12,13 @@ #include "progs/test_cls_redirect.h" #include "test_cls_redirect.skel.h" +#include "test_cls_redirect_subprogs.skel.h" #define ENCAP_IP INADDR_LOOPBACK #define ENCAP_PORT (1234) +static int duration = 0; + struct addr_port { in_port_t port; union { @@ -361,30 +364,18 @@ static void close_fds(int *fds, int n) close(fds[i]); } -void test_cls_redirect(void) +static void test_cls_redirect_common(struct bpf_program *prog) { - struct test_cls_redirect *skel = NULL; struct bpf_prog_test_run_attr tattr = {}; int families[] = { AF_INET, AF_INET6 }; struct sockaddr_storage ss; struct sockaddr *addr; socklen_t slen; int i, j, err; - int servers[__NR_KIND][ARRAY_SIZE(families)] = {}; int conns[__NR_KIND][ARRAY_SIZE(families)] = {}; struct tuple tuples[__NR_KIND][ARRAY_SIZE(families)]; - skel = test_cls_redirect__open(); - if (CHECK_FAIL(!skel)) - return; - - skel->rodata->ENCAPSULATION_IP = htonl(ENCAP_IP); - skel->rodata->ENCAPSULATION_PORT = htons(ENCAP_PORT); - - if (CHECK_FAIL(test_cls_redirect__load(skel))) - goto cleanup; - addr = (struct sockaddr *)&ss; for (i = 0; i < ARRAY_SIZE(families); i++) { slen = prepare_addr(&ss, families[i]); @@ -402,7 +393,7 @@ void test_cls_redirect(void) goto cleanup; } - tattr.prog_fd = bpf_program__fd(skel->progs.cls_redirect); + tattr.prog_fd = bpf_program__fd(prog); for (i = 0; i < ARRAY_SIZE(tests); i++) { struct test_cfg *test = &tests[i]; @@ -450,7 +441,58 @@ void test_cls_redirect(void) } cleanup: - test_cls_redirect__destroy(skel); close_fds((int *)servers, sizeof(servers) / sizeof(servers[0][0])); close_fds((int *)conns, sizeof(conns) / sizeof(conns[0][0])); } + +static void test_cls_redirect_inlined(void) +{ + struct test_cls_redirect *skel; + int err; + + skel = test_cls_redirect__open(); + if (CHECK(!skel, "skel_open", "failed\n")) + return; + + skel->rodata->ENCAPSULATION_IP = htonl(ENCAP_IP); + skel->rodata->ENCAPSULATION_PORT = htons(ENCAP_PORT); + + err = test_cls_redirect__load(skel); + if (CHECK(err, "skel_load", "failed: %d\n", err)) + goto cleanup; + + test_cls_redirect_common(skel->progs.cls_redirect); + +cleanup: + test_cls_redirect__destroy(skel); +} + +static void test_cls_redirect_subprogs(void) +{ + struct test_cls_redirect_subprogs *skel; + int err; + + skel = test_cls_redirect_subprogs__open(); + if (CHECK(!skel, "skel_open", "failed\n")) + return; + + skel->rodata->ENCAPSULATION_IP = htonl(ENCAP_IP); + skel->rodata->ENCAPSULATION_PORT = htons(ENCAP_PORT); + + err = test_cls_redirect_subprogs__load(skel); + if (CHECK(err, "skel_load", "failed: %d\n", err)) + goto cleanup; + + test_cls_redirect_common(skel->progs.cls_redirect); + +cleanup: + test_cls_redirect_subprogs__destroy(skel); +} + +void test_cls_redirect(void) +{ + if (test__start_subtest("cls_redirect_inlined")) + test_cls_redirect_inlined(); + if (test__start_subtest("cls_redirect_subprogs")) + test_cls_redirect_subprogs(); +} diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect.c b/tools/testing/selftests/bpf/progs/test_cls_redirect.c index f0b72e86bee5..c9f8464996ea 100644 --- a/tools/testing/selftests/bpf/progs/test_cls_redirect.c +++ b/tools/testing/selftests/bpf/progs/test_cls_redirect.c @@ -22,6 +22,12 @@ #include "test_cls_redirect.h" +#ifdef SUBPROGS +#define INLINING __noinline +#else +#define INLINING __always_inline +#endif + #define offsetofend(TYPE, MEMBER) \ (offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER))) @@ -125,7 +131,7 @@ typedef struct buf { uint8_t *const tail; } buf_t; -static size_t buf_off(const buf_t *buf) +static __always_inline size_t buf_off(const buf_t *buf) { /* Clang seems to optimize constructs like * a - b + c @@ -145,7 +151,7 @@ static size_t buf_off(const buf_t *buf) return off; } -static bool buf_copy(buf_t *buf, void *dst, size_t len) +static __always_inline bool buf_copy(buf_t *buf, void *dst, size_t len) { if (bpf_skb_load_bytes(buf->skb, buf_off(buf), dst, len)) { return false; @@ -155,7 +161,7 @@ static bool buf_copy(buf_t *buf, void *dst, size_t len) return true; } -static bool buf_skip(buf_t *buf, const size_t len) +static __always_inline bool buf_skip(buf_t *buf, const size_t len) { /* Check whether off + len is valid in the non-linear part. */ if (buf_off(buf) + len > buf->skb->len) { @@ -173,7 +179,7 @@ static bool buf_skip(buf_t *buf, const size_t len) * If scratch is not NULL, the function will attempt to load non-linear * data via bpf_skb_load_bytes. On success, scratch is returned. */ -static void *buf_assign(buf_t *buf, const size_t len, void *scratch) +static __always_inline void *buf_assign(buf_t *buf, const size_t len, void *scratch) { if (buf->head + len > buf->tail) { if (scratch == NULL) { @@ -188,7 +194,7 @@ static void *buf_assign(buf_t *buf, const size_t len, void *scratch) return ptr; } -static bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4) +static INLINING bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4) { if (ipv4->ihl <= 5) { return true; @@ -197,13 +203,13 @@ static bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4) return buf_skip(buf, (ipv4->ihl - 5) * 4); } -static bool ipv4_is_fragment(const struct iphdr *ip) +static INLINING bool ipv4_is_fragment(const struct iphdr *ip) { uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK); return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0; } -static struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch) +static __always_inline struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch) { struct iphdr *ipv4 = buf_assign(pkt, sizeof(*ipv4), scratch); if (ipv4 == NULL) { @@ -222,7 +228,7 @@ static struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch) } /* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */ -static bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports) +static INLINING bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports) { if (!buf_copy(pkt, ports, sizeof(*ports))) { return false; @@ -237,7 +243,7 @@ static bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports) return true; } -static uint16_t pkt_checksum_fold(uint32_t csum) +static INLINING uint16_t pkt_checksum_fold(uint32_t csum) { /* The highest reasonable value for an IPv4 header * checksum requires two folds, so we just do that always. @@ -247,7 +253,7 @@ static uint16_t pkt_checksum_fold(uint32_t csum) return (uint16_t)~csum; } -static void pkt_ipv4_checksum(struct iphdr *iph) +static INLINING void pkt_ipv4_checksum(struct iphdr *iph) { iph->check = 0; @@ -268,10 +274,11 @@ static void pkt_ipv4_checksum(struct iphdr *iph) iph->check = pkt_checksum_fold(acc); } -static bool pkt_skip_ipv6_extension_headers(buf_t *pkt, - const struct ipv6hdr *ipv6, - uint8_t *upper_proto, - bool *is_fragment) +static INLINING +bool pkt_skip_ipv6_extension_headers(buf_t *pkt, + const struct ipv6hdr *ipv6, + uint8_t *upper_proto, + bool *is_fragment) { /* We understand five extension headers. * https://tools.ietf.org/html/rfc8200#section-4.1 states that all @@ -336,7 +343,7 @@ static bool pkt_skip_ipv6_extension_headers(buf_t *pkt, * scratch is allocated on the stack. However, this usage should be safe since * it's the callers stack after all. */ -static inline __attribute__((__always_inline__)) struct ipv6hdr * +static __always_inline struct ipv6hdr * pkt_parse_ipv6(buf_t *pkt, struct ipv6hdr *scratch, uint8_t *proto, bool *is_fragment) { @@ -354,20 +361,20 @@ pkt_parse_ipv6(buf_t *pkt, struct ipv6hdr *scratch, uint8_t *proto, /* Global metrics, per CPU */ -struct bpf_map_def metrics_map SEC("maps") = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(unsigned int), - .value_size = sizeof(metrics_t), - .max_entries = 1, -}; +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, unsigned int); + __type(value, metrics_t); +} metrics_map SEC(".maps"); -static metrics_t *get_global_metrics(void) +static INLINING metrics_t *get_global_metrics(void) { uint64_t key = 0; return bpf_map_lookup_elem(&metrics_map, &key); } -static ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap) +static INLINING ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap) { const int payload_off = sizeof(*encap) + @@ -388,8 +395,8 @@ static ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap) return bpf_redirect(skb->ifindex, BPF_F_INGRESS); } -static ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap, - struct in_addr *next_hop, metrics_t *metrics) +static INLINING ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap, + struct in_addr *next_hop, metrics_t *metrics) { metrics->forwarded_packets_total_gre++; @@ -509,8 +516,8 @@ static ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap, return bpf_redirect(skb->ifindex, 0); } -static ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap, - struct in_addr *next_hop, metrics_t *metrics) +static INLINING ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap, + struct in_addr *next_hop, metrics_t *metrics) { /* swap L2 addresses */ /* This assumes that packets are received from a router. @@ -546,7 +553,7 @@ static ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap, return bpf_redirect(skb->ifindex, 0); } -static ret_t skip_next_hops(buf_t *pkt, int n) +static INLINING ret_t skip_next_hops(buf_t *pkt, int n) { switch (n) { case 1: @@ -566,8 +573,8 @@ static ret_t skip_next_hops(buf_t *pkt, int n) * pkt is positioned just after the variable length GLB header * iff the call is successful. */ -static ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap, - struct in_addr *next_hop) +static INLINING ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap, + struct in_addr *next_hop) { if (encap->unigue.next_hop > encap->unigue.hop_count) { return TC_ACT_SHOT; @@ -601,8 +608,8 @@ static ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap, * return value, and calling code works while still being "generic" to * IPv4 and IPv6. */ -static uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph, - uint64_t iphlen, uint16_t sport, uint16_t dport) +static INLINING uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph, + uint64_t iphlen, uint16_t sport, uint16_t dport) { switch (iphlen) { case sizeof(struct iphdr): { @@ -630,9 +637,9 @@ static uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph, } } -static verdict_t classify_tcp(struct __sk_buff *skb, - struct bpf_sock_tuple *tuple, uint64_t tuplen, - void *iph, struct tcphdr *tcp) +static INLINING verdict_t classify_tcp(struct __sk_buff *skb, + struct bpf_sock_tuple *tuple, uint64_t tuplen, + void *iph, struct tcphdr *tcp) { struct bpf_sock *sk = bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0); @@ -663,8 +670,8 @@ static verdict_t classify_tcp(struct __sk_buff *skb, return UNKNOWN; } -static verdict_t classify_udp(struct __sk_buff *skb, - struct bpf_sock_tuple *tuple, uint64_t tuplen) +static INLINING verdict_t classify_udp(struct __sk_buff *skb, + struct bpf_sock_tuple *tuple, uint64_t tuplen) { struct bpf_sock *sk = bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0); @@ -681,9 +688,9 @@ static verdict_t classify_udp(struct __sk_buff *skb, return UNKNOWN; } -static verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto, - struct bpf_sock_tuple *tuple, uint64_t tuplen, - metrics_t *metrics) +static INLINING verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto, + struct bpf_sock_tuple *tuple, uint64_t tuplen, + metrics_t *metrics) { switch (proto) { case IPPROTO_TCP: @@ -698,7 +705,7 @@ static verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto, } } -static verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics) +static INLINING verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics) { struct icmphdr icmp; if (!buf_copy(pkt, &icmp, sizeof(icmp))) { @@ -745,7 +752,7 @@ static verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics) sizeof(tuple.ipv4), metrics); } -static verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics) +static INLINING verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics) { struct icmp6hdr icmp6; if (!buf_copy(pkt, &icmp6, sizeof(icmp6))) { @@ -797,8 +804,8 @@ static verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics) metrics); } -static verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen, - metrics_t *metrics) +static INLINING verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen, + metrics_t *metrics) { metrics->l4_protocol_packets_total_tcp++; @@ -819,8 +826,8 @@ static verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen, return classify_tcp(pkt->skb, &tuple, tuplen, iph, tcp); } -static verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen, - metrics_t *metrics) +static INLINING verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen, + metrics_t *metrics) { metrics->l4_protocol_packets_total_udp++; @@ -837,7 +844,7 @@ static verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen, return classify_udp(pkt->skb, &tuple, tuplen); } -static verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics) +static INLINING verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics) { metrics->l3_protocol_packets_total_ipv4++; @@ -874,7 +881,7 @@ static verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics) } } -static verdict_t process_ipv6(buf_t *pkt, metrics_t *metrics) +static INLINING verdict_t process_ipv6(buf_t *pkt, metrics_t *metrics) { metrics->l3_protocol_packets_total_ipv6++; diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect_subprogs.c b/tools/testing/selftests/bpf/progs/test_cls_redirect_subprogs.c new file mode 100644 index 000000000000..eed26b70e3a2 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_cls_redirect_subprogs.c @@ -0,0 +1,2 @@ +#define SUBPROGS +#include "test_cls_redirect.c" From 95cec14b0308085c028c4d4fb3d09fad3902b4c3 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 3 Sep 2020 13:05:28 -0700 Subject: [PATCH 21/95] selftests/bpf: Fix check in global_data_init. The returned value of bpf_object__open_file() should be checked with libbpf_get_error() rather than NULL. This fix prevents test_progs from crash when test_global_data.o is not present. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200903200528.747884-1-haoluo@google.com --- tools/testing/selftests/bpf/prog_tests/global_data_init.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/global_data_init.c b/tools/testing/selftests/bpf/prog_tests/global_data_init.c index 3bdaa5a40744..ee46b11f1f9a 100644 --- a/tools/testing/selftests/bpf/prog_tests/global_data_init.c +++ b/tools/testing/selftests/bpf/prog_tests/global_data_init.c @@ -12,7 +12,8 @@ void test_global_data_init(void) size_t sz; obj = bpf_object__open_file(file, NULL); - if (CHECK_FAIL(!obj)) + err = libbpf_get_error(obj); + if (CHECK_FAIL(err)) return; map = bpf_object__find_map_by_name(obj, "test_glo.rodata"); From 17e54b096e6a8dc92b92c59c2e3437550383b27a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Sep 2020 21:16:10 -0700 Subject: [PATCH 22/95] libbpf: Fix another __u64 cast in printf Another issue of __u64 needing either %lu or %llu, depending on the architecture. Fix with cast to `unsigned long long`. Fixes: 7e06aad52929 ("libbpf: Add multi-prog section support for struct_ops") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200904041611.1695163-1-andriin@fb.com --- tools/lib/bpf/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 47b43c13eee5..53be32a2b9fc 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -8224,7 +8224,7 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, } if (sym.st_value % BPF_INSN_SZ) { pr_warn("struct_ops reloc %s: invalid target program offset %llu\n", - map->name, (__u64)sym.st_value); + map->name, (unsigned long long)sym.st_value); return -LIBBPF_ERRNO__FORMAT; } insn_idx = sym.st_value / BPF_INSN_SZ; From 8eb629585d2231e90112148009e2a11b0979ca38 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Sep 2020 21:16:11 -0700 Subject: [PATCH 23/95] libbpf: Fix potential multiplication overflow Detected by LGTM static analyze in Github repo, fix potential multiplication overflow before result is casted to size_t. Fixes: 8505e8709b5e ("libbpf: Implement generalized .BTF.ext func/line info adjustment") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200904041611.1695163-2-andriin@fb.com --- tools/lib/bpf/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 53be32a2b9fc..550950eb1860 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5802,7 +5802,7 @@ static int adjust_prog_btf_ext_info(const struct bpf_object *obj, /* append func/line info of a given (sub-)program to the main * program func/line info */ - old_sz = (*prog_rec_cnt) * ext_info->rec_size; + old_sz = (size_t)(*prog_rec_cnt) * ext_info->rec_size; new_sz = old_sz + (copy_end - copy_start); new_prog_info = realloc(*prog_info, new_sz); if (!new_prog_info) From 698584dffd4bca834ac4733fc6a659a0a0d213e7 Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Fri, 4 Sep 2020 15:34:33 +0900 Subject: [PATCH 24/95] samples, bpf: Replace bpf_program__title() with bpf_program__section_name() From commit 521095842027 ("libbpf: Deprecate notion of BPF program "title" in favor of "section name""), the term title has been replaced with section name in libbpf. Since the bpf_program__title() has been deprecated, this commit switches this function to bpf_program__section_name(). Due to this commit, the compilation warning issue has also been resolved. Fixes: 521095842027 ("libbpf: Deprecate notion of BPF program "title" in favor of "section name"") Signed-off-by: Daniel T. Lee Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200904063434.24963-1-danieltimlee@gmail.com --- samples/bpf/sockex3_user.c | 6 +++--- samples/bpf/spintest_user.c | 6 +++--- samples/bpf/tracex5_user.c | 6 +++--- samples/bpf/xdp_redirect_cpu_user.c | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c index 4dbee7427d47..7793f6a6ae7e 100644 --- a/samples/bpf/sockex3_user.c +++ b/samples/bpf/sockex3_user.c @@ -29,8 +29,8 @@ int main(int argc, char **argv) struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; struct bpf_program *prog; struct bpf_object *obj; + const char *section; char filename[256]; - const char *title; FILE *f; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); @@ -58,8 +58,8 @@ int main(int argc, char **argv) bpf_object__for_each_program(prog, obj) { fd = bpf_program__fd(prog); - title = bpf_program__title(prog, false); - if (sscanf(title, "socket/%d", &key) != 1) { + section = bpf_program__section_name(prog); + if (sscanf(section, "socket/%d", &key) != 1) { fprintf(stderr, "ERROR: finding prog failed\n"); goto cleanup; } diff --git a/samples/bpf/spintest_user.c b/samples/bpf/spintest_user.c index 847da9284fa8..f090d0dc60d6 100644 --- a/samples/bpf/spintest_user.c +++ b/samples/bpf/spintest_user.c @@ -17,7 +17,7 @@ int main(int ac, char **argv) long key, next_key, value; struct bpf_program *prog; int map_fd, i, j = 0; - const char *title; + const char *section; struct ksym *sym; if (setrlimit(RLIMIT_MEMLOCK, &r)) { @@ -51,8 +51,8 @@ int main(int ac, char **argv) } bpf_object__for_each_program(prog, obj) { - title = bpf_program__title(prog, false); - if (sscanf(title, "kprobe/%s", symbol) != 1) + section = bpf_program__section_name(prog); + if (sscanf(section, "kprobe/%s", symbol) != 1) continue; /* Attach prog only when symbol exists */ diff --git a/samples/bpf/tracex5_user.c b/samples/bpf/tracex5_user.c index 98dad57a96c4..c17d3fb5fd64 100644 --- a/samples/bpf/tracex5_user.c +++ b/samples/bpf/tracex5_user.c @@ -39,8 +39,8 @@ int main(int ac, char **argv) struct bpf_program *prog; struct bpf_object *obj; int key, fd, progs_fd; + const char *section; char filename[256]; - const char *title; FILE *f; setrlimit(RLIMIT_MEMLOCK, &r); @@ -78,9 +78,9 @@ int main(int ac, char **argv) } bpf_object__for_each_program(prog, obj) { - title = bpf_program__title(prog, false); + section = bpf_program__section_name(prog); /* register only syscalls to PROG_ARRAY */ - if (sscanf(title, "kprobe/%d", &key) != 1) + if (sscanf(section, "kprobe/%d", &key) != 1) continue; fd = bpf_program__fd(prog); diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c index 004c0622c913..3dd366e9474d 100644 --- a/samples/bpf/xdp_redirect_cpu_user.c +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -111,7 +111,7 @@ static void print_avail_progs(struct bpf_object *obj) bpf_object__for_each_program(pos, obj) { if (bpf_program__is_xdp(pos)) - printf(" %s\n", bpf_program__title(pos, false)); + printf(" %s\n", bpf_program__section_name(pos)); } } From f9bec5d756b30d5b21aa5ff9b7d5d115741517c1 Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Fri, 4 Sep 2020 15:34:34 +0900 Subject: [PATCH 25/95] samples, bpf: Add xsk_fwd test file to .gitignore This commit adds xsk_fwd test file to .gitignore which is newly added to samples/bpf. Signed-off-by: Daniel T. Lee Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200904063434.24963-2-danieltimlee@gmail.com --- samples/bpf/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/samples/bpf/.gitignore b/samples/bpf/.gitignore index 034800c4d1e6..b2f29bc8dc43 100644 --- a/samples/bpf/.gitignore +++ b/samples/bpf/.gitignore @@ -50,4 +50,5 @@ xdp_rxq_info xdp_sample_pkts xdp_tx_iptunnel xdpsock +xsk_fwd testfile.img From 1a7581b174e99e18d459856bb26ab01258c027c3 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 4 Sep 2020 17:14:52 +0100 Subject: [PATCH 26/95] tools: bpftool: Fix formatting in bpftool-link documentation Fix a formatting error in the documentation for bpftool-link, so that the man page can build correctly. Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200904161454.31135-2-quentin@isovalent.com --- tools/bpf/bpftool/Documentation/bpftool-link.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-link.rst b/tools/bpf/bpftool/Documentation/bpftool-link.rst index 4a52e7a93339..dc7693b5e606 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-link.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-link.rst @@ -21,7 +21,7 @@ LINK COMMANDS | **bpftool** **link { show | list }** [*LINK*] | **bpftool** **link pin** *LINK* *FILE* -| **bpftool** **link detach *LINK* +| **bpftool** **link detach** *LINK* | **bpftool** **link help** | | *LINK* := { **id** *LINK_ID* | **pinned** *FILE* } From 938c3efd9e650ca343d04e70d11a17c64119e17c Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 4 Sep 2020 17:14:53 +0100 Subject: [PATCH 27/95] bpf: Fix formatting in documentation for BPF helpers Fix a formatting error in the description of bpf_load_hdr_opt() (rst2man complains about a wrong indentation, but what is missing is actually a blank line before the bullet list). Fix and harmonise the formatting for other helpers. Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200904161454.31135-3-quentin@isovalent.com --- include/uapi/linux/bpf.h | 87 +++++++++++++++++++++------------------- 1 file changed, 45 insertions(+), 42 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8dda13880957..90359cab501d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3349,38 +3349,38 @@ union bpf_attr { * Description * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * struct tcp_sock *bpf_skc_to_tcp_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *tcp_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * struct tcp_timewait_sock *bpf_skc_to_tcp_timewait_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * struct tcp_request_sock *bpf_skc_to_tcp_request_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * struct udp6_sock *bpf_skc_to_udp6_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) * Description * Return a user or a kernel stack in bpf program provided buffer. * To achieve this, the helper needs *task*, which is a valid - * pointer to struct task_struct. To store the stacktrace, the - * bpf program provides *buf* with a nonnegative *size*. + * pointer to **struct task_struct**. To store the stacktrace, the + * bpf program provides *buf* with a nonnegative *size*. * * The last argument, *flags*, holds the number of stack frames to * skip (from 0 to 255), masked with @@ -3410,12 +3410,12 @@ union bpf_attr { * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags) * Description * Load header option. Support reading a particular TCP header - * option for bpf program (BPF_PROG_TYPE_SOCK_OPS). + * option for bpf program (**BPF_PROG_TYPE_SOCK_OPS**). * * If *flags* is 0, it will search the option from the - * sock_ops->skb_data. The comment in "struct bpf_sock_ops" + * *skops*\ **->skb_data**. The comment in **struct bpf_sock_ops** * has details on what skb_data contains under different - * sock_ops->op. + * *skops*\ **->op**. * * The first byte of the *searchby_res* specifies the * kind that it wants to search. @@ -3435,7 +3435,7 @@ union bpf_attr { * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ]. * * To search for the standard window scale option (3), - * the searchby_res should be [ 3, 0, 0, .... 0 ]. + * the *searchby_res* should be [ 3, 0, 0, .... 0 ]. * Note, kind-length must be 0 for regular option. * * Searching for No-Op (0) and End-of-Option-List (1) are @@ -3445,27 +3445,30 @@ union bpf_attr { * of a header option. * * Supported flags: + * * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the * saved_syn packet or the just-received syn packet. * * Return - * >0 when found, the header option is copied to *searchby_res*. - * The return value is the total length copied. + * > 0 when found, the header option is copied to *searchby_res*. + * The return value is the total length copied. On failure, a + * negative error code is returned: * - * **-EINVAL** If param is invalid + * **-EINVAL** if a parameter is invalid. * - * **-ENOMSG** The option is not found + * **-ENOMSG** if the option is not found. * - * **-ENOENT** No syn packet available when - * **BPF_LOAD_HDR_OPT_TCP_SYN** is used + * **-ENOENT** if no syn packet is available when + * **BPF_LOAD_HDR_OPT_TCP_SYN** is used. * - * **-ENOSPC** Not enough space. Only *len* number of - * bytes are copied. + * **-ENOSPC** if there is not enough space. Only *len* number of + * bytes are copied. * - * **-EFAULT** Cannot parse the header options in the packet + * **-EFAULT** on failure to parse the header options in the + * packet. * - * **-EPERM** This helper cannot be used under the - * current sock_ops->op. + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. * * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags) * Description @@ -3483,44 +3486,44 @@ union bpf_attr { * by searching the same option in the outgoing skb. * * This helper can only be called during - * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. * * Return * 0 on success, or negative error in case of failure: * - * **-EINVAL** If param is invalid + * **-EINVAL** If param is invalid. * - * **-ENOSPC** Not enough space in the header. - * Nothing has been written + * **-ENOSPC** if there is not enough space in the header. + * Nothing has been written * - * **-EEXIST** The option has already existed + * **-EEXIST** if the option already exists. * - * **-EFAULT** Cannot parse the existing header options + * **-EFAULT** on failrue to parse the existing header options. * - * **-EPERM** This helper cannot be used under the - * current sock_ops->op. + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. * * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags) * Description * Reserve *len* bytes for the bpf header option. The - * space will be used by bpf_store_hdr_opt() later in - * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * space will be used by **bpf_store_hdr_opt**\ () later in + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. * - * If bpf_reserve_hdr_opt() is called multiple times, + * If **bpf_reserve_hdr_opt**\ () is called multiple times, * the total number of bytes will be reserved. * * This helper can only be called during - * BPF_SOCK_OPS_HDR_OPT_LEN_CB. + * **BPF_SOCK_OPS_HDR_OPT_LEN_CB**. * * Return * 0 on success, or negative error in case of failure: * - * **-EINVAL** if param is invalid + * **-EINVAL** if a parameter is invalid. * - * **-ENOSPC** Not enough space in the header. + * **-ENOSPC** if there is not enough space in the header. * - * **-EPERM** This helper cannot be used under the - * current sock_ops->op. + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. * * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags) * Description @@ -3560,9 +3563,9 @@ union bpf_attr { * * long bpf_d_path(struct path *path, char *buf, u32 sz) * Description - * Return full path for given 'struct path' object, which - * needs to be the kernel BTF 'path' object. The path is - * returned in the provided buffer 'buf' of size 'sz' and + * Return full path for given **struct path** object, which + * needs to be the kernel BTF *path* object. The path is + * returned in the provided buffer *buf* of size *sz* and * is zero terminated. * * Return @@ -3573,7 +3576,7 @@ union bpf_attr { * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr) * Description * Read *size* bytes from user space address *user_ptr* and store - * the data in *dst*. This is a wrapper of copy_from_user(). + * the data in *dst*. This is a wrapper of **copy_from_user**\ (). * Return * 0 on success, or a negative error in case of failure. */ From bc0b5a03079bd78fb3d5cba1ccabf0a7efb1d99f Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 4 Sep 2020 17:14:54 +0100 Subject: [PATCH 28/95] tools, bpf: Synchronise BPF UAPI header with tools Synchronise the bpf.h header under tools, to report the fixes recently brought to the documentation for the BPF helpers. Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200904161454.31135-4-quentin@isovalent.com --- tools/include/uapi/linux/bpf.h | 87 ++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 42 deletions(-) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 8dda13880957..90359cab501d 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3349,38 +3349,38 @@ union bpf_attr { * Description * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * struct tcp_sock *bpf_skc_to_tcp_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *tcp_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * struct tcp_timewait_sock *bpf_skc_to_tcp_timewait_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * struct tcp_request_sock *bpf_skc_to_tcp_request_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * struct udp6_sock *bpf_skc_to_udp6_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) * Description * Return a user or a kernel stack in bpf program provided buffer. * To achieve this, the helper needs *task*, which is a valid - * pointer to struct task_struct. To store the stacktrace, the - * bpf program provides *buf* with a nonnegative *size*. + * pointer to **struct task_struct**. To store the stacktrace, the + * bpf program provides *buf* with a nonnegative *size*. * * The last argument, *flags*, holds the number of stack frames to * skip (from 0 to 255), masked with @@ -3410,12 +3410,12 @@ union bpf_attr { * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags) * Description * Load header option. Support reading a particular TCP header - * option for bpf program (BPF_PROG_TYPE_SOCK_OPS). + * option for bpf program (**BPF_PROG_TYPE_SOCK_OPS**). * * If *flags* is 0, it will search the option from the - * sock_ops->skb_data. The comment in "struct bpf_sock_ops" + * *skops*\ **->skb_data**. The comment in **struct bpf_sock_ops** * has details on what skb_data contains under different - * sock_ops->op. + * *skops*\ **->op**. * * The first byte of the *searchby_res* specifies the * kind that it wants to search. @@ -3435,7 +3435,7 @@ union bpf_attr { * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ]. * * To search for the standard window scale option (3), - * the searchby_res should be [ 3, 0, 0, .... 0 ]. + * the *searchby_res* should be [ 3, 0, 0, .... 0 ]. * Note, kind-length must be 0 for regular option. * * Searching for No-Op (0) and End-of-Option-List (1) are @@ -3445,27 +3445,30 @@ union bpf_attr { * of a header option. * * Supported flags: + * * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the * saved_syn packet or the just-received syn packet. * * Return - * >0 when found, the header option is copied to *searchby_res*. - * The return value is the total length copied. + * > 0 when found, the header option is copied to *searchby_res*. + * The return value is the total length copied. On failure, a + * negative error code is returned: * - * **-EINVAL** If param is invalid + * **-EINVAL** if a parameter is invalid. * - * **-ENOMSG** The option is not found + * **-ENOMSG** if the option is not found. * - * **-ENOENT** No syn packet available when - * **BPF_LOAD_HDR_OPT_TCP_SYN** is used + * **-ENOENT** if no syn packet is available when + * **BPF_LOAD_HDR_OPT_TCP_SYN** is used. * - * **-ENOSPC** Not enough space. Only *len* number of - * bytes are copied. + * **-ENOSPC** if there is not enough space. Only *len* number of + * bytes are copied. * - * **-EFAULT** Cannot parse the header options in the packet + * **-EFAULT** on failure to parse the header options in the + * packet. * - * **-EPERM** This helper cannot be used under the - * current sock_ops->op. + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. * * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags) * Description @@ -3483,44 +3486,44 @@ union bpf_attr { * by searching the same option in the outgoing skb. * * This helper can only be called during - * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. * * Return * 0 on success, or negative error in case of failure: * - * **-EINVAL** If param is invalid + * **-EINVAL** If param is invalid. * - * **-ENOSPC** Not enough space in the header. - * Nothing has been written + * **-ENOSPC** if there is not enough space in the header. + * Nothing has been written * - * **-EEXIST** The option has already existed + * **-EEXIST** if the option already exists. * - * **-EFAULT** Cannot parse the existing header options + * **-EFAULT** on failrue to parse the existing header options. * - * **-EPERM** This helper cannot be used under the - * current sock_ops->op. + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. * * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags) * Description * Reserve *len* bytes for the bpf header option. The - * space will be used by bpf_store_hdr_opt() later in - * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * space will be used by **bpf_store_hdr_opt**\ () later in + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. * - * If bpf_reserve_hdr_opt() is called multiple times, + * If **bpf_reserve_hdr_opt**\ () is called multiple times, * the total number of bytes will be reserved. * * This helper can only be called during - * BPF_SOCK_OPS_HDR_OPT_LEN_CB. + * **BPF_SOCK_OPS_HDR_OPT_LEN_CB**. * * Return * 0 on success, or negative error in case of failure: * - * **-EINVAL** if param is invalid + * **-EINVAL** if a parameter is invalid. * - * **-ENOSPC** Not enough space in the header. + * **-ENOSPC** if there is not enough space in the header. * - * **-EPERM** This helper cannot be used under the - * current sock_ops->op. + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. * * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags) * Description @@ -3560,9 +3563,9 @@ union bpf_attr { * * long bpf_d_path(struct path *path, char *buf, u32 sz) * Description - * Return full path for given 'struct path' object, which - * needs to be the kernel BTF 'path' object. The path is - * returned in the provided buffer 'buf' of size 'sz' and + * Return full path for given **struct path** object, which + * needs to be the kernel BTF *path* object. The path is + * returned in the provided buffer *buf* of size *sz* and * is zero terminated. * * Return @@ -3573,7 +3576,7 @@ union bpf_attr { * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr) * Description * Read *size* bytes from user space address *user_ptr* and store - * the data in *dst*. This is a wrapper of copy_from_user(). + * the data in *dst*. This is a wrapper of **copy_from_user**\ (). * Return * 0 on success, or a negative error in case of failure. */ From 7c6967326267bd5c0dded0a99541357d70dd11ac Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 8 Sep 2020 10:57:02 -0700 Subject: [PATCH 29/95] bpf: Permit map_ptr arithmetic with opcode add and offset 0 Commit 41c48f3a98231 ("bpf: Support access to bpf map fields") added support to access map fields with CORE support. For example, struct bpf_map { __u32 max_entries; } __attribute__((preserve_access_index)); struct bpf_array { struct bpf_map map; __u32 elem_size; } __attribute__((preserve_access_index)); struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, 4); __type(key, __u32); __type(value, __u32); } m_array SEC(".maps"); SEC("cgroup_skb/egress") int cg_skb(void *ctx) { struct bpf_array *array = (struct bpf_array *)&m_array; /* .. array->map.max_entries .. */ } In kernel, bpf_htab has similar structure, struct bpf_htab { struct bpf_map map; ... } In the above cg_skb(), to access array->map.max_entries, with CORE, the clang will generate two builtin's. base = &m_array; /* access array.map */ map_addr = __builtin_preserve_struct_access_info(base, 0, 0); /* access array.map.max_entries */ max_entries_addr = __builtin_preserve_struct_access_info(map_addr, 0, 0); max_entries = *max_entries_addr; In the current llvm, if two builtin's are in the same function or in the same function after inlining, the compiler is smart enough to chain them together and generates like below: base = &m_array; max_entries = *(base + reloc_offset); /* reloc_offset = 0 in this case */ and we are fine. But if we force no inlining for one of functions in test_map_ptr() selftest, e.g., check_default(), the above two __builtin_preserve_* will be in two different functions. In this case, we will have code like: func check_hash(): reloc_offset_map = 0; base = &m_array; map_base = base + reloc_offset_map; check_default(map_base, ...) func check_default(map_base, ...): max_entries = *(map_base + reloc_offset_max_entries); In kernel, map_ptr (CONST_PTR_TO_MAP) does not allow any arithmetic. The above "map_base = base + reloc_offset_map" will trigger a verifier failure. ; VERIFY(check_default(&hash->map, map)); 0: (18) r7 = 0xffffb4fe8018a004 2: (b4) w1 = 110 3: (63) *(u32 *)(r7 +0) = r1 R1_w=invP110 R7_w=map_value(id=0,off=4,ks=4,vs=8,imm=0) R10=fp0 ; VERIFY_TYPE(BPF_MAP_TYPE_HASH, check_hash); 4: (18) r1 = 0xffffb4fe8018a000 6: (b4) w2 = 1 7: (63) *(u32 *)(r1 +0) = r2 R1_w=map_value(id=0,off=0,ks=4,vs=8,imm=0) R2_w=invP1 R7_w=map_value(id=0,off=4,ks=4,vs=8,imm=0) R10=fp0 8: (b7) r2 = 0 9: (18) r8 = 0xffff90bcb500c000 11: (18) r1 = 0xffff90bcb500c000 13: (0f) r1 += r2 R1 pointer arithmetic on map_ptr prohibited To fix the issue, let us permit map_ptr + 0 arithmetic which will result in exactly the same map_ptr. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200908175702.2463625-1-yhs@fb.com --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b4e9c56b8b32..814bc6c1ad16 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5317,6 +5317,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst, reg_type_str[ptr_reg->type]); return -EACCES; case CONST_PTR_TO_MAP: + /* smin_val represents the known value */ + if (known && smin_val == 0 && opcode == BPF_ADD) + break; + /* fall-through */ case PTR_TO_PACKET_END: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: From e6054fc1f865ac46936a6b5517ae2d36554c94f8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 8 Sep 2020 10:57:03 -0700 Subject: [PATCH 30/95] selftests/bpf: Add test for map_ptr arithmetic Change selftest map_ptr_kern.c with disabling inlining for one of subtests, which will fail the test without previous verifier change. Also added to verifier test for both "map_ptr += scalar" and "scalar += map_ptr" arithmetic. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200908175703.2463721-1-yhs@fb.com --- .../selftests/bpf/progs/map_ptr_kern.c | 10 +++++- .../testing/selftests/bpf/verifier/map_ptr.c | 32 +++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/map_ptr_kern.c b/tools/testing/selftests/bpf/progs/map_ptr_kern.c index 982a2d8aa844..c325405751e2 100644 --- a/tools/testing/selftests/bpf/progs/map_ptr_kern.c +++ b/tools/testing/selftests/bpf/progs/map_ptr_kern.c @@ -82,6 +82,14 @@ static inline int check_default(struct bpf_map *indirect, return 1; } +static __noinline int +check_default_noinline(struct bpf_map *indirect, struct bpf_map *direct) +{ + VERIFY(check(indirect, direct, sizeof(__u32), sizeof(__u32), + MAX_ENTRIES)); + return 1; +} + typedef struct { int counter; } atomic_t; @@ -107,7 +115,7 @@ static inline int check_hash(void) struct bpf_map *map = (struct bpf_map *)&m_hash; int i; - VERIFY(check_default(&hash->map, map)); + VERIFY(check_default_noinline(&hash->map, map)); VERIFY(hash->n_buckets == MAX_ENTRIES); VERIFY(hash->elem_size == 64); diff --git a/tools/testing/selftests/bpf/verifier/map_ptr.c b/tools/testing/selftests/bpf/verifier/map_ptr.c index b52209db8250..637f9293bda8 100644 --- a/tools/testing/selftests/bpf/verifier/map_ptr.c +++ b/tools/testing/selftests/bpf/verifier/map_ptr.c @@ -60,3 +60,35 @@ .result = ACCEPT, .retval = 1, }, +{ + "bpf_map_ptr: r = 0, map_ptr = map_ptr + r", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_16b = { 4 }, + .result = ACCEPT, +}, +{ + "bpf_map_ptr: r = 0, r = r + map_ptr", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_MOV64_IMM(BPF_REG_1, 0), + BPF_LD_MAP_FD(BPF_REG_0, 0), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_16b = { 4 }, + .result = ACCEPT, +}, From 7fb5eefd76394cfefb380724a87ca40b47d44405 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 9 Sep 2020 10:15:42 -0700 Subject: [PATCH 31/95] selftests/bpf: Fix test_sysctl_loop{1, 2} failure due to clang change Andrii reported that with latest clang, when building selftests, we have error likes: error: progs/test_sysctl_loop1.c:23:16: in function sysctl_tcp_mem i32 (%struct.bpf_sysctl*): Looks like the BPF stack limit of 512 bytes is exceeded. Please move large on stack variables into BPF per-cpu array map. The error is triggered by the following LLVM patch: https://reviews.llvm.org/D87134 For example, the following code is from test_sysctl_loop1.c: static __always_inline int is_tcp_mem(struct bpf_sysctl *ctx) { volatile char tcp_mem_name[] = "net/ipv4/tcp_mem/very_very_very_very_long_pointless_string"; ... } Without the above LLVM patch, the compiler did optimization to load the string (59 bytes long) with 7 64bit loads, 1 8bit load and 1 16bit load, occupying 64 byte stack size. With the above LLVM patch, the compiler only uses 8bit loads, but subregister is 32bit. So stack requirements become 4 * 59 = 236 bytes. Together with other stuff on the stack, total stack size exceeds 512 bytes, hence compiler complains and quits. To fix the issue, removing "volatile" key word or changing "volatile" to "const"/"static const" does not work, the string is put in .rodata.str1.1 section, which libbpf did not process it and errors out with libbpf: elf: skipping unrecognized data section(6) .rodata.str1.1 libbpf: prog 'sysctl_tcp_mem': bad map relo against '.L__const.is_tcp_mem.tcp_mem_name' in section '.rodata.str1.1' Defining the string const as global variable can fix the issue as it puts the string constant in '.rodata' section which is recognized by libbpf. In the future, when libbpf can process '.rodata.str*.*' properly, the global definition can be changed back to local definition. Defining tcp_mem_name as a global, however, triggered a verifier failure. ./test_progs -n 7/21 libbpf: load bpf program failed: Permission denied libbpf: -- BEGIN DUMP LOG --- libbpf: invalid stack off=0 size=1 verification time 6975 usec stack depth 160+64 processed 889 insns (limit 1000000) max_states_per_insn 4 total_states 14 peak_states 14 mark_read 10 libbpf: -- END LOG -- libbpf: failed to load program 'sysctl_tcp_mem' libbpf: failed to load object 'test_sysctl_loop2.o' test_bpf_verif_scale:FAIL:114 #7/21 test_sysctl_loop2.o:FAIL This actually exposed a bpf program bug. In test_sysctl_loop{1,2}, we have code like const char tcp_mem_name[] = "<...long string...>"; ... char name[64]; ... for (i = 0; i < sizeof(tcp_mem_name); ++i) if (name[i] != tcp_mem_name[i]) return 0; In the above code, if sizeof(tcp_mem_name) > 64, name[i] access may be out of bound. The sizeof(tcp_mem_name) is 59 for test_sysctl_loop1.c and 79 for test_sysctl_loop2.c. Without promotion-to-global change, old compiler generates code where the overflowed stack access is actually filled with valid value, so hiding the bpf program bug. With promotion-to-global change, the code is different, more specifically, the previous loading constants to stack is gone, and "name" occupies stack[-64:0] and overflow access triggers a verifier error. To fix the issue, adjust "name" buffer size properly. Reported-by: Andrii Nakryiko Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200909171542.3673449-1-yhs@fb.com --- tools/testing/selftests/bpf/progs/test_sysctl_loop1.c | 4 ++-- tools/testing/selftests/bpf/progs/test_sysctl_loop2.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_sysctl_loop1.c b/tools/testing/selftests/bpf/progs/test_sysctl_loop1.c index 458b0d69133e..553a282d816a 100644 --- a/tools/testing/selftests/bpf/progs/test_sysctl_loop1.c +++ b/tools/testing/selftests/bpf/progs/test_sysctl_loop1.c @@ -18,11 +18,11 @@ #define MAX_ULONG_STR_LEN 7 #define MAX_VALUE_STR_LEN (TCP_MEM_LOOPS * MAX_ULONG_STR_LEN) +const char tcp_mem_name[] = "net/ipv4/tcp_mem/very_very_very_very_long_pointless_string"; static __always_inline int is_tcp_mem(struct bpf_sysctl *ctx) { - volatile char tcp_mem_name[] = "net/ipv4/tcp_mem/very_very_very_very_long_pointless_string"; unsigned char i; - char name[64]; + char name[sizeof(tcp_mem_name)]; int ret; memset(name, 0, sizeof(name)); diff --git a/tools/testing/selftests/bpf/progs/test_sysctl_loop2.c b/tools/testing/selftests/bpf/progs/test_sysctl_loop2.c index b2e6f9b0894d..2b64bc563a12 100644 --- a/tools/testing/selftests/bpf/progs/test_sysctl_loop2.c +++ b/tools/testing/selftests/bpf/progs/test_sysctl_loop2.c @@ -18,11 +18,11 @@ #define MAX_ULONG_STR_LEN 7 #define MAX_VALUE_STR_LEN (TCP_MEM_LOOPS * MAX_ULONG_STR_LEN) +const char tcp_mem_name[] = "net/ipv4/tcp_mem/very_very_very_very_long_pointless_string_to_stress_byte_loop"; static __attribute__((noinline)) int is_tcp_mem(struct bpf_sysctl *ctx) { - volatile char tcp_mem_name[] = "net/ipv4/tcp_mem/very_very_very_very_long_pointless_string_to_stress_byte_loop"; unsigned char i; - char name[64]; + char name[sizeof(tcp_mem_name)]; int ret; memset(name, 0, sizeof(name)); From 8081ede1f73139687a75147d529cebcc5e4149c5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 8 Sep 2020 11:01:27 -0700 Subject: [PATCH 32/95] perf: Stop using deprecated bpf_program__title() Switch from deprecated bpf_program__title() API to bpf_program__section_name(). Also drop unnecessary error checks because neither bpf_program__title() nor bpf_program__section_name() can fail or return NULL. Fixes: 521095842027 ("libbpf: Deprecate notion of BPF program "title" in favor of "section name"") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Reviewed-by: Tobias Klauser Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20200908180127.1249-1-andriin@fb.com --- tools/perf/util/bpf-loader.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index 2feb751516ab..0374adcb223c 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -328,12 +328,6 @@ config_bpf_program(struct bpf_program *prog) probe_conf.no_inlines = false; probe_conf.force_add = false; - config_str = bpf_program__title(prog, false); - if (IS_ERR(config_str)) { - pr_debug("bpf: unable to get title for program\n"); - return PTR_ERR(config_str); - } - priv = calloc(sizeof(*priv), 1); if (!priv) { pr_debug("bpf: failed to alloc priv\n"); @@ -341,6 +335,7 @@ config_bpf_program(struct bpf_program *prog) } pev = &priv->pev; + config_str = bpf_program__section_name(prog); pr_debug("bpf: config program '%s'\n", config_str); err = parse_prog_config(config_str, &main_str, &is_tp, pev); if (err) @@ -454,10 +449,7 @@ preproc_gen_prologue(struct bpf_program *prog, int n, if (err) { const char *title; - title = bpf_program__title(prog, false); - if (!title) - title = "[unknown]"; - + title = bpf_program__section_name(prog); pr_debug("Failed to generate prologue for program %s\n", title); return err; From e9091bb77f6e130e42f2edd2e6bb4e5a2b2f9d77 Mon Sep 17 00:00:00 2001 From: Chen Zhou Date: Tue, 8 Sep 2020 21:22:01 +0800 Subject: [PATCH 33/95] bpf: Remove duplicate headers Remove duplicate headers which are included twice. Signed-off-by: Chen Zhou Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200908132201.184005-1-chenzhou10@huawei.com --- net/core/bpf_sk_storage.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index a0d1a3265b71..4a86ea34f29e 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -12,7 +12,6 @@ #include #include #include -#include DEFINE_BPF_STORAGE_CACHE(sk_cache); From 16f3ddfbad52fddd7dafdf4b540ca6c506c9d815 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 9 Sep 2020 17:22:50 +0100 Subject: [PATCH 34/95] tools: bpftool: Log info-level messages when building bpftool man pages To build man pages for bpftool (and for eBPF helper functions), rst2man can log different levels of information. Let's make it log all levels to keep the RST files clean. Doing so, rst2man complains about double colons, used for literal blocks, that look like underlines for section titles. Let's add the necessary blank lines. v2: - Use "--verbose" instead of "-r 1" (same behaviour but more readable). - Pass it through a RST2MAN_OPTS variable so we can easily pass other options too. Signed-off-by: Quentin Monnet Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200909162251.15498-2-quentin@isovalent.com --- tools/bpf/bpftool/Documentation/Makefile | 3 ++- tools/bpf/bpftool/Documentation/bpftool-btf.rst | 3 +++ tools/bpf/bpftool/Documentation/bpftool-gen.rst | 4 ++++ tools/bpf/bpftool/Documentation/bpftool-map.rst | 3 +++ 4 files changed, 12 insertions(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/Documentation/Makefile b/tools/bpf/bpftool/Documentation/Makefile index 815ac9804aee..a45b51d98468 100644 --- a/tools/bpf/bpftool/Documentation/Makefile +++ b/tools/bpf/bpftool/Documentation/Makefile @@ -28,12 +28,13 @@ man: man8 helpers man8: $(DOC_MAN8) RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null) +RST2MAN_OPTS += --verbose $(OUTPUT)%.8: %.rst ifndef RST2MAN_DEP $(error "rst2man not found, but required to generate man pages") endif - $(QUIET_GEN)rst2man $< > $@ + $(QUIET_GEN)rst2man $(RST2MAN_OPTS) $< > $@ clean: helpers-clean $(call QUIET_CLEAN, Documentation) diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst index 896f4c6c2870..864553e62af4 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst @@ -91,6 +91,7 @@ OPTIONS EXAMPLES ======== **# bpftool btf dump id 1226** + :: [1] PTR '(anon)' type_id=2 @@ -104,6 +105,7 @@ EXAMPLES This gives an example of default output for all supported BTF kinds. **$ cat prog.c** + :: struct fwd_struct; @@ -144,6 +146,7 @@ This gives an example of default output for all supported BTF kinds. } **$ bpftool btf dump file prog.o** + :: [1] PTR '(anon)' type_id=2 diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index df85dbd962c0..d52b03a352d7 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -146,6 +146,7 @@ OPTIONS EXAMPLES ======== **$ cat example.c** + :: #include @@ -187,6 +188,7 @@ This is example BPF application with two BPF programs and a mix of BPF maps and global variables. **$ bpftool gen skeleton example.o** + :: /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ @@ -241,6 +243,7 @@ and global variables. #endif /* __EXAMPLE_SKEL_H__ */ **$ cat example_user.c** + :: #include "example.skel.h" @@ -283,6 +286,7 @@ and global variables. } **# ./example_user** + :: my_map name: my_map diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 083db6c2fc67..8f187c6416cd 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -182,6 +182,7 @@ OPTIONS EXAMPLES ======== **# bpftool map show** + :: 10: hash name some_map flags 0x0 @@ -203,6 +204,7 @@ The following three commands are equivalent: **# bpftool map dump id 10** + :: key: 00 01 02 03 value: 00 01 02 03 04 05 06 07 @@ -210,6 +212,7 @@ The following three commands are equivalent: Found 2 elements **# bpftool map getnext id 10 key 0 1 2 3** + :: key: From 41d5c37b74084b44ccc688dbab508acb5fdddc48 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 9 Sep 2020 17:22:51 +0100 Subject: [PATCH 35/95] selftests, bpftool: Add bpftool (and eBPF helpers) documentation build eBPF selftests include a script to check that bpftool builds correctly with different command lines. Let's add one build for bpftool's documentation so as to detect errors or warning reported by rst2man when compiling the man pages. Also add a build to the selftests Makefile to make sure we build bpftool documentation along with bpftool when building the selftests. This also builds and checks warnings for the man page for eBPF helpers, which is built along bpftool's documentation. This change adds rst2man as a dependency for selftests (it comes with Python's "docutils"). v2: - Use "--exit-status=1" option for rst2man instead of counting lines from stderr. - Also build bpftool as part as the selftests build (and not only when the tests are actually run). Signed-off-by: Quentin Monnet Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200909162251.15498-3-quentin@isovalent.com --- tools/testing/selftests/bpf/Makefile | 5 +++++ .../selftests/bpf/test_bpftool_build.sh | 21 +++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 65d3d9aaeb31..05798c2b5c67 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -176,6 +176,11 @@ $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR) \ OUTPUT=$(BUILD_DIR)/bpftool/ \ prefix= DESTDIR=$(SCRATCH_DIR)/ install + $(Q)mkdir -p $(BUILD_DIR)/bpftool/Documentation + $(Q)RST2MAN_OPTS="--exit-status=1" $(MAKE) $(submake_extras) \ + -C $(BPFTOOLDIR)/Documentation \ + OUTPUT=$(BUILD_DIR)/bpftool/Documentation/ \ + prefix= DESTDIR=$(SCRATCH_DIR)/ install $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ ../../../include/uapi/linux/bpf.h \ diff --git a/tools/testing/selftests/bpf/test_bpftool_build.sh b/tools/testing/selftests/bpf/test_bpftool_build.sh index ac349a5cea7e..2db3c60e1e61 100755 --- a/tools/testing/selftests/bpf/test_bpftool_build.sh +++ b/tools/testing/selftests/bpf/test_bpftool_build.sh @@ -85,6 +85,23 @@ make_with_tmpdir() { echo } +make_doc_and_clean() { + echo -e "\$PWD: $PWD" + echo -e "command: make -s $* doc >/dev/null" + RST2MAN_OPTS="--exit-status=1" make $J -s $* doc + if [ $? -ne 0 ] ; then + ERROR=1 + printf "FAILURE: Errors or warnings when building documentation\n" + fi + ( + if [ $# -ge 1 ] ; then + cd ${@: -1} + fi + make -s doc-clean + ) + echo +} + echo "Trying to build bpftool" echo -e "... through kbuild\n" @@ -145,3 +162,7 @@ make_and_clean make_with_tmpdir OUTPUT make_with_tmpdir O + +echo -e "Checking documentation build\n" +# From tools/bpf/bpftool +make_doc_and_clean From 82b8cf0acc7bfe7b247e5ddc4417f6146d74c0b8 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 9 Sep 2020 17:24:58 +0100 Subject: [PATCH 36/95] tools: bpftool: Print optional built-in features along with version Bpftool has a number of features that can be included or left aside during compilation. This includes: - Support for libbfd, providing the disassembler for JIT-compiled programs. - Support for BPF skeletons, used for profiling programs or iterating on the PIDs of processes associated with BPF objects. In order to make it easy for users to understand what features were compiled for a given bpftool binary, print the status of the two features above when showing the version number for bpftool ("bpftool -V" or "bpftool version"). Document this in the main manual page. Example invocations: $ bpftool version ./bpftool v5.9.0-rc1 features: libbfd, skeletons $ bpftool -p version { "version": "5.9.0-rc1", "features": { "libbfd": true, "skeletons": true } } Some other parameters are optional at compilation ("DISASM_FOUR_ARGS_SIGNATURE", LIBCAP support) but they do not impact significantly bpftool's behaviour from a user's point of view, so their status is not reported. Available commands and supported program types depend on the version number, and are therefore not reported either. Note that they are already available, albeit without JSON, via bpftool's help messages. v3: - Use a simple list instead of boolean values for plain output. v2: - Fix JSON (object instead or array for the features). Signed-off-by: Quentin Monnet Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200909162500.17010-2-quentin@isovalent.com --- tools/bpf/bpftool/Documentation/bpftool.rst | 8 ++++- tools/bpf/bpftool/main.c | 33 +++++++++++++++++++-- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index 420d4d5df8b6..a3629a3f1175 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -50,7 +50,13 @@ OPTIONS Print short help message (similar to **bpftool help**). -V, --version - Print version number (similar to **bpftool version**). + Print version number (similar to **bpftool version**), and + optional features that were included when bpftool was + compiled. Optional features include linking against libbfd to + provide the disassembler for JIT-ted programs (**bpftool prog + dump jited**) and usage of BPF skeletons (some features like + **bpftool prog profile** or showing pids associated to BPF + objects may rely on it). -j, --json Generate JSON output. For commands that cannot produce JSON, this diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index 4a191fcbeb82..682daaa49e6a 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -70,13 +70,42 @@ static int do_help(int argc, char **argv) static int do_version(int argc, char **argv) { +#ifdef HAVE_LIBBFD_SUPPORT + const bool has_libbfd = true; +#else + const bool has_libbfd = false; +#endif +#ifdef BPFTOOL_WITHOUT_SKELETONS + const bool has_skeletons = false; +#else + const bool has_skeletons = true; +#endif + if (json_output) { - jsonw_start_object(json_wtr); + jsonw_start_object(json_wtr); /* root object */ + jsonw_name(json_wtr, "version"); jsonw_printf(json_wtr, "\"%s\"", BPFTOOL_VERSION); - jsonw_end_object(json_wtr); + + jsonw_name(json_wtr, "features"); + jsonw_start_object(json_wtr); /* features */ + jsonw_bool_field(json_wtr, "libbfd", has_libbfd); + jsonw_bool_field(json_wtr, "skeletons", has_skeletons); + jsonw_end_object(json_wtr); /* features */ + + jsonw_end_object(json_wtr); /* root object */ } else { + unsigned int nb_features = 0; + printf("%s v%s\n", bin_name, BPFTOOL_VERSION); + printf("features:"); + if (has_libbfd) { + printf(" libbfd"); + nb_features++; + } + if (has_skeletons) + printf("%s skeletons", nb_features++ ? "," : ""); + printf("\n"); } return 0; } From f28ef96d7b04a76575a020a9da4f92358abe68c6 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 9 Sep 2020 17:24:59 +0100 Subject: [PATCH 37/95] tools: bpftool: Include common options from separate file Nearly all man pages for bpftool have the same common set of option flags (--help, --version, --json, --pretty, --debug). The description is duplicated across all the pages, which is more difficult to maintain if the description of an option changes. It may also be confusing to sort out what options are not "common" and should not be copied when creating new manual pages. Let's move the description for those common options to a separate file, which is included with a RST directive when generating the man pages. Signed-off-by: Quentin Monnet Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200909162500.17010-3-quentin@isovalent.com --- tools/bpf/bpftool/Documentation/Makefile | 2 +- .../bpf/bpftool/Documentation/bpftool-btf.rst | 17 +------------ .../bpftool/Documentation/bpftool-cgroup.rst | 17 +------------ .../bpftool/Documentation/bpftool-feature.rst | 17 +------------ .../bpf/bpftool/Documentation/bpftool-gen.rst | 17 +------------ .../bpftool/Documentation/bpftool-iter.rst | 11 +-------- .../bpftool/Documentation/bpftool-link.rst | 17 +------------ .../bpf/bpftool/Documentation/bpftool-map.rst | 17 +------------ .../bpf/bpftool/Documentation/bpftool-net.rst | 17 +------------ .../bpftool/Documentation/bpftool-perf.rst | 17 +------------ .../bpftool/Documentation/bpftool-prog.rst | 18 +------------- .../Documentation/bpftool-struct_ops.rst | 18 +------------- tools/bpf/bpftool/Documentation/bpftool.rst | 24 +------------------ .../bpftool/Documentation/common_options.rst | 22 +++++++++++++++++ 14 files changed, 35 insertions(+), 196 deletions(-) create mode 100644 tools/bpf/bpftool/Documentation/common_options.rst diff --git a/tools/bpf/bpftool/Documentation/Makefile b/tools/bpf/bpftool/Documentation/Makefile index a45b51d98468..5e3815320dab 100644 --- a/tools/bpf/bpftool/Documentation/Makefile +++ b/tools/bpf/bpftool/Documentation/Makefile @@ -19,7 +19,7 @@ man8dir = $(mandir)/man8 # Load targets for building eBPF helpers man page. include ../../Makefile.helpers -MAN8_RST = $(filter-out $(HELPERS_RST),$(wildcard *.rst)) +MAN8_RST = $(wildcard bpftool*.rst) _DOC_MAN8 = $(patsubst %.rst,%.8,$(MAN8_RST)) DOC_MAN8 = $(addprefix $(OUTPUT),$(_DOC_MAN8)) diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst index 864553e62af4..52a7b2f6c9cb 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst @@ -71,22 +71,7 @@ DESCRIPTION OPTIONS ======= - -h, --help - Print short generic help message (similar to **bpftool help**). - - -V, --version - Print version number (similar to **bpftool version**). - - -j, --json - Generate JSON output. For commands that cannot produce JSON, this - option has no effect. - - -p, --pretty - Generate human-readable JSON output. Implies **-j**. - - -d, --debug - Print all logs available from libbpf, including debug-level - information. + .. include:: common_options.rst EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst index a226aee3574f..3dba89db000e 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst @@ -116,26 +116,11 @@ DESCRIPTION OPTIONS ======= - -h, --help - Print short generic help message (similar to **bpftool help**). - - -V, --version - Print version number (similar to **bpftool version**). - - -j, --json - Generate JSON output. For commands that cannot produce JSON, this - option has no effect. - - -p, --pretty - Generate human-readable JSON output. Implies **-j**. + .. include:: common_options.rst -f, --bpffs Show file names of pinned programs. - -d, --debug - Print all logs available from libbpf, including debug-level - information. - EXAMPLES ======== | diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst index 8609f06e71de..f1aae5690e3c 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst @@ -71,22 +71,7 @@ DESCRIPTION OPTIONS ======= - -h, --help - Print short generic help message (similar to **bpftool help**). - - -V, --version - Print version number (similar to **bpftool version**). - - -j, --json - Generate JSON output. For commands that cannot produce JSON, this - option has no effect. - - -p, --pretty - Generate human-readable JSON output. Implies **-j**. - - -d, --debug - Print all logs available from libbpf, including debug-level - information. + .. include:: common_options.rst SEE ALSO ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index d52b03a352d7..29d4cf4c3422 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -126,22 +126,7 @@ DESCRIPTION OPTIONS ======= - -h, --help - Print short generic help message (similar to **bpftool help**). - - -V, --version - Print version number (similar to **bpftool version**). - - -j, --json - Generate JSON output. For commands that cannot produce JSON, - this option has no effect. - - -p, --pretty - Generate human-readable JSON output. Implies **-j**. - - -d, --debug - Print all logs available from libbpf, including debug-level - information. + .. include:: common_options.rst EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-iter.rst b/tools/bpf/bpftool/Documentation/bpftool-iter.rst index 070ffacb42b5..b688cf11805c 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-iter.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-iter.rst @@ -51,16 +51,7 @@ DESCRIPTION OPTIONS ======= - -h, --help - Print short generic help message (similar to **bpftool help**). - - -V, --version - Print version number (similar to **bpftool version**). - - -d, --debug - Print all logs available, even debug-level information. This - includes logs from libbpf as well as from the verifier, when - attempting to load programs. + .. include:: common_options.rst EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-link.rst b/tools/bpf/bpftool/Documentation/bpftool-link.rst index dc7693b5e606..ce122be58bae 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-link.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-link.rst @@ -62,18 +62,7 @@ DESCRIPTION OPTIONS ======= - -h, --help - Print short generic help message (similar to **bpftool help**). - - -V, --version - Print version number (similar to **bpftool version**). - - -j, --json - Generate JSON output. For commands that cannot produce JSON, this - option has no effect. - - -p, --pretty - Generate human-readable JSON output. Implies **-j**. + .. include:: common_options.rst -f, --bpffs When showing BPF links, show file names of pinned @@ -83,10 +72,6 @@ OPTIONS Do not automatically attempt to mount any virtual file system (such as tracefs or BPF virtual file system) when necessary. - -d, --debug - Print all logs available, even debug-level information. This - includes logs from libbpf. - EXAMPLES ======== **# bpftool link show** diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 8f187c6416cd..4b42629ade3e 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -155,18 +155,7 @@ DESCRIPTION OPTIONS ======= - -h, --help - Print short generic help message (similar to **bpftool help**). - - -V, --version - Print version number (similar to **bpftool version**). - - -j, --json - Generate JSON output. For commands that cannot produce JSON, this - option has no effect. - - -p, --pretty - Generate human-readable JSON output. Implies **-j**. + .. include:: common_options.rst -f, --bpffs Show file names of pinned maps. @@ -175,10 +164,6 @@ OPTIONS Do not automatically attempt to mount any virtual file system (such as tracefs or BPF virtual file system) when necessary. - -d, --debug - Print all logs available from libbpf, including debug-level - information. - EXAMPLES ======== **# bpftool map show** diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index aa7450736179..56439c32934d 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -75,22 +75,7 @@ DESCRIPTION OPTIONS ======= - -h, --help - Print short generic help message (similar to **bpftool help**). - - -V, --version - Print version number (similar to **bpftool version**). - - -j, --json - Generate JSON output. For commands that cannot produce JSON, this - option has no effect. - - -p, --pretty - Generate human-readable JSON output. Implies **-j**. - - -d, --debug - Print all logs available from libbpf, including debug-level - information. + .. include:: common_options.rst EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst index 9c592b7c6775..36d257a36e9b 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-perf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst @@ -40,22 +40,7 @@ DESCRIPTION OPTIONS ======= - -h, --help - Print short generic help message (similar to **bpftool help**). - - -V, --version - Print version number (similar to **bpftool version**). - - -j, --json - Generate JSON output. For commands that cannot produce JSON, this - option has no effect. - - -p, --pretty - Generate human-readable JSON output. Implies **-j**. - - -d, --debug - Print all logs available from libbpf, including debug-level - information. + .. include:: common_options.rst EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 82e356b664e8..9b2b18e2a3ac 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -210,18 +210,7 @@ DESCRIPTION OPTIONS ======= - -h, --help - Print short generic help message (similar to **bpftool help**). - - -V, --version - Print version number (similar to **bpftool version**). - - -j, --json - Generate JSON output. For commands that cannot produce JSON, this - option has no effect. - - -p, --pretty - Generate human-readable JSON output. Implies **-j**. + .. include:: common_options.rst -f, --bpffs When showing BPF programs, show file names of pinned @@ -234,11 +223,6 @@ OPTIONS Do not automatically attempt to mount any virtual file system (such as tracefs or BPF virtual file system) when necessary. - -d, --debug - Print all logs available, even debug-level information. This - includes logs from libbpf as well as from the verifier, when - attempting to load programs. - EXAMPLES ======== **# bpftool prog show** diff --git a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst index d93cd1cb8b0f..315f1f21f2ba 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst @@ -60,23 +60,7 @@ DESCRIPTION OPTIONS ======= - -h, --help - Print short generic help message (similar to **bpftool help**). - - -V, --version - Print version number (similar to **bpftool version**). - - -j, --json - Generate JSON output. For commands that cannot produce JSON, this - option has no effect. - - -p, --pretty - Generate human-readable JSON output. Implies **-j**. - - -d, --debug - Print all logs available, even debug-level information. This - includes logs from libbpf as well as from the verifier, when - attempting to load programs. + .. include:: common_options.rst EXAMPLES ======== diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index a3629a3f1175..b87f8c2df49d 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -46,24 +46,7 @@ DESCRIPTION OPTIONS ======= - -h, --help - Print short help message (similar to **bpftool help**). - - -V, --version - Print version number (similar to **bpftool version**), and - optional features that were included when bpftool was - compiled. Optional features include linking against libbfd to - provide the disassembler for JIT-ted programs (**bpftool prog - dump jited**) and usage of BPF skeletons (some features like - **bpftool prog profile** or showing pids associated to BPF - objects may rely on it). - - -j, --json - Generate JSON output. For commands that cannot produce JSON, this - option has no effect. - - -p, --pretty - Generate human-readable JSON output. Implies **-j**. + .. include:: common_options.rst -m, --mapcompat Allow loading maps with unknown map definitions. @@ -72,11 +55,6 @@ OPTIONS Do not automatically attempt to mount any virtual file system (such as tracefs or BPF virtual file system) when necessary. - -d, --debug - Print all logs available, even debug-level information. This - includes logs from libbpf as well as from the verifier, when - attempting to load programs. - SEE ALSO ======== **bpf**\ (2), diff --git a/tools/bpf/bpftool/Documentation/common_options.rst b/tools/bpf/bpftool/Documentation/common_options.rst new file mode 100644 index 000000000000..05d06c74dcaa --- /dev/null +++ b/tools/bpf/bpftool/Documentation/common_options.rst @@ -0,0 +1,22 @@ +-h, --help + Print short help message (similar to **bpftool help**). + +-V, --version + Print version number (similar to **bpftool version**), and optional + features that were included when bpftool was compiled. Optional + features include linking against libbfd to provide the disassembler + for JIT-ted programs (**bpftool prog dump jited**) and usage of BPF + skeletons (some features like **bpftool prog profile** or showing + pids associated to BPF objects may rely on it). + +-j, --json + Generate JSON output. For commands that cannot produce JSON, this + option has no effect. + +-p, --pretty + Generate human-readable JSON output. Implies **-j**. + +-d, --debug + Print all logs available, even debug-level information. This includes + logs from libbpf as well as from the verifier, when attempting to + load programs. From 654785a1afe1b1428106c322c7ea650e3f8d29e9 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Wed, 9 Sep 2020 17:27:10 +0100 Subject: [PATCH 38/95] net: sockmap: Remove unnecessary sk_fullsock checks The lookup paths for sockmap and sockhash currently include a check that returns NULL if the socket we just found is not a full socket. However, this check is not necessary. On insertion we ensure that we have a full socket (caveat around sock_ops), so request sockets are not a problem. Time-wait sockets are allocated separate from the original socket and then fed into the hashdance. They don't affect the sockets already stored in the sockmap. Suggested-by: Jakub Sitnicki Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200909162712.221874-2-lmb@cloudflare.com --- net/core/sock_map.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 078386d7d9a2..82494810d0ee 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -382,7 +382,7 @@ static void *sock_map_lookup(struct bpf_map *map, void *key) struct sock *sk; sk = __sock_map_lookup_elem(map, *(u32 *)key); - if (!sk || !sk_fullsock(sk)) + if (!sk) return NULL; if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) return NULL; @@ -1110,7 +1110,7 @@ static void *sock_hash_lookup(struct bpf_map *map, void *key) struct sock *sk; sk = __sock_hash_lookup_elem(map, key); - if (!sk || !sk_fullsock(sk)) + if (!sk) return NULL; if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) return NULL; From 0365351524d7560d8ed7a42801a15252c6c56f41 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Wed, 9 Sep 2020 17:27:11 +0100 Subject: [PATCH 39/95] net: Allow iterating sockmap and sockhash Add bpf_iter support for sockmap / sockhash, based on the bpf_sk_storage and hashtable implementation. sockmap and sockhash share the same iteration context: a pointer to an arbitrary key and a pointer to a socket. Both pointers may be NULL, and so BPF has to perform a NULL check before accessing them. Technically it's not possible for sockhash iteration to yield a NULL socket, but we ignore this to be able to use a single iteration point. Iteration will visit all keys that remain unmodified during the lifetime of the iterator. It may or may not visit newly added ones. Switch from using rcu_dereference_raw to plain rcu_dereference, so we gain another guard rail if CONFIG_PROVE_RCU is enabled. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200909162712.221874-3-lmb@cloudflare.com --- net/core/sock_map.c | 280 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 278 insertions(+), 2 deletions(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 82494810d0ee..e1f05e3fa1d0 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -2,6 +2,7 @@ /* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ #include +#include #include #include #include @@ -703,6 +704,109 @@ const struct bpf_func_proto bpf_msg_redirect_map_proto = { .arg4_type = ARG_ANYTHING, }; +struct sock_map_seq_info { + struct bpf_map *map; + struct sock *sk; + u32 index; +}; + +struct bpf_iter__sockmap { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_map *, map); + __bpf_md_ptr(void *, key); + __bpf_md_ptr(struct sock *, sk); +}; + +DEFINE_BPF_ITER_FUNC(sockmap, struct bpf_iter_meta *meta, + struct bpf_map *map, void *key, + struct sock *sk) + +static void *sock_map_seq_lookup_elem(struct sock_map_seq_info *info) +{ + if (unlikely(info->index >= info->map->max_entries)) + return NULL; + + info->sk = __sock_map_lookup_elem(info->map, info->index); + + /* can't return sk directly, since that might be NULL */ + return info; +} + +static void *sock_map_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct sock_map_seq_info *info = seq->private; + + if (*pos == 0) + ++*pos; + + /* pairs with sock_map_seq_stop */ + rcu_read_lock(); + return sock_map_seq_lookup_elem(info); +} + +static void *sock_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sock_map_seq_info *info = seq->private; + + ++*pos; + ++info->index; + + return sock_map_seq_lookup_elem(info); +} + +static int sock_map_seq_show(struct seq_file *seq, void *v) +{ + struct sock_map_seq_info *info = seq->private; + struct bpf_iter__sockmap ctx = {}; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, !v); + if (!prog) + return 0; + + ctx.meta = &meta; + ctx.map = info->map; + if (v) { + ctx.key = &info->index; + ctx.sk = info->sk; + } + + return bpf_iter_run_prog(prog, &ctx); +} + +static void sock_map_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)sock_map_seq_show(seq, NULL); + + /* pairs with sock_map_seq_start */ + rcu_read_unlock(); +} + +static const struct seq_operations sock_map_seq_ops = { + .start = sock_map_seq_start, + .next = sock_map_seq_next, + .stop = sock_map_seq_stop, + .show = sock_map_seq_show, +}; + +static int sock_map_init_seq_private(void *priv_data, + struct bpf_iter_aux_info *aux) +{ + struct sock_map_seq_info *info = priv_data; + + info->map = aux->map; + return 0; +} + +static const struct bpf_iter_seq_info sock_map_iter_seq_info = { + .seq_ops = &sock_map_seq_ops, + .init_seq_private = sock_map_init_seq_private, + .seq_priv_size = sizeof(struct sock_map_seq_info), +}; + static int sock_map_btf_id; const struct bpf_map_ops sock_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -717,6 +821,7 @@ const struct bpf_map_ops sock_map_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "bpf_stab", .map_btf_id = &sock_map_btf_id, + .iter_seq_info = &sock_map_iter_seq_info, }; struct bpf_shtab_elem { @@ -953,7 +1058,7 @@ static int sock_hash_get_next_key(struct bpf_map *map, void *key, if (!elem) goto find_first_elem; - elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&elem->node)), + elem_next = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&elem->node)), struct bpf_shtab_elem, node); if (elem_next) { memcpy(key_next, elem_next->key, key_size); @@ -965,7 +1070,7 @@ static int sock_hash_get_next_key(struct bpf_map *map, void *key, find_first_elem: for (; i < htab->buckets_num; i++) { head = &sock_hash_select_bucket(htab, i)->head; - elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), + elem_next = hlist_entry_safe(rcu_dereference(hlist_first_rcu(head)), struct bpf_shtab_elem, node); if (elem_next) { memcpy(key_next, elem_next->key, key_size); @@ -1199,6 +1304,117 @@ const struct bpf_func_proto bpf_msg_redirect_hash_proto = { .arg4_type = ARG_ANYTHING, }; +struct sock_hash_seq_info { + struct bpf_map *map; + struct bpf_shtab *htab; + u32 bucket_id; +}; + +static void *sock_hash_seq_find_next(struct sock_hash_seq_info *info, + struct bpf_shtab_elem *prev_elem) +{ + const struct bpf_shtab *htab = info->htab; + struct bpf_shtab_bucket *bucket; + struct bpf_shtab_elem *elem; + struct hlist_node *node; + + /* try to find next elem in the same bucket */ + if (prev_elem) { + node = rcu_dereference(hlist_next_rcu(&prev_elem->node)); + elem = hlist_entry_safe(node, struct bpf_shtab_elem, node); + if (elem) + return elem; + + /* no more elements, continue in the next bucket */ + info->bucket_id++; + } + + for (; info->bucket_id < htab->buckets_num; info->bucket_id++) { + bucket = &htab->buckets[info->bucket_id]; + node = rcu_dereference(hlist_first_rcu(&bucket->head)); + elem = hlist_entry_safe(node, struct bpf_shtab_elem, node); + if (elem) + return elem; + } + + return NULL; +} + +static void *sock_hash_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct sock_hash_seq_info *info = seq->private; + + if (*pos == 0) + ++*pos; + + /* pairs with sock_hash_seq_stop */ + rcu_read_lock(); + return sock_hash_seq_find_next(info, NULL); +} + +static void *sock_hash_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sock_hash_seq_info *info = seq->private; + + ++*pos; + return sock_hash_seq_find_next(info, v); +} + +static int sock_hash_seq_show(struct seq_file *seq, void *v) +{ + struct sock_hash_seq_info *info = seq->private; + struct bpf_iter__sockmap ctx = {}; + struct bpf_shtab_elem *elem = v; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, !elem); + if (!prog) + return 0; + + ctx.meta = &meta; + ctx.map = info->map; + if (elem) { + ctx.key = elem->key; + ctx.sk = elem->sk; + } + + return bpf_iter_run_prog(prog, &ctx); +} + +static void sock_hash_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)sock_hash_seq_show(seq, NULL); + + /* pairs with sock_hash_seq_start */ + rcu_read_unlock(); +} + +static const struct seq_operations sock_hash_seq_ops = { + .start = sock_hash_seq_start, + .next = sock_hash_seq_next, + .stop = sock_hash_seq_stop, + .show = sock_hash_seq_show, +}; + +static int sock_hash_init_seq_private(void *priv_data, + struct bpf_iter_aux_info *aux) +{ + struct sock_hash_seq_info *info = priv_data; + + info->map = aux->map; + info->htab = container_of(aux->map, struct bpf_shtab, map); + return 0; +} + +static const struct bpf_iter_seq_info sock_hash_iter_seq_info = { + .seq_ops = &sock_hash_seq_ops, + .init_seq_private = sock_hash_init_seq_private, + .seq_priv_size = sizeof(struct sock_hash_seq_info), +}; + static int sock_hash_map_btf_id; const struct bpf_map_ops sock_hash_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -1213,6 +1429,7 @@ const struct bpf_map_ops sock_hash_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "bpf_shtab", .map_btf_id = &sock_hash_map_btf_id, + .iter_seq_info = &sock_hash_iter_seq_info, }; static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) @@ -1323,3 +1540,62 @@ void sock_map_close(struct sock *sk, long timeout) release_sock(sk); saved_close(sk, timeout); } + +static int sock_map_iter_attach_target(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) +{ + struct bpf_map *map; + int err = -EINVAL; + + if (!linfo->map.map_fd) + return -EBADF; + + map = bpf_map_get_with_uref(linfo->map.map_fd); + if (IS_ERR(map)) + return PTR_ERR(map); + + if (map->map_type != BPF_MAP_TYPE_SOCKMAP && + map->map_type != BPF_MAP_TYPE_SOCKHASH) + goto put_map; + + if (prog->aux->max_rdonly_access > map->key_size) { + err = -EACCES; + goto put_map; + } + + aux->map = map; + return 0; + +put_map: + bpf_map_put_with_uref(map); + return err; +} + +static void sock_map_iter_detach_target(struct bpf_iter_aux_info *aux) +{ + bpf_map_put_with_uref(aux->map); +} + +static struct bpf_iter_reg sock_map_iter_reg = { + .target = "sockmap", + .attach_target = sock_map_iter_attach_target, + .detach_target = sock_map_iter_detach_target, + .show_fdinfo = bpf_iter_map_show_fdinfo, + .fill_link_info = bpf_iter_map_fill_link_info, + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__sockmap, key), + PTR_TO_RDONLY_BUF_OR_NULL }, + { offsetof(struct bpf_iter__sockmap, sk), + PTR_TO_BTF_ID_OR_NULL }, + }, +}; + +static int __init bpf_sockmap_iter_init(void) +{ + sock_map_iter_reg.ctx_arg_info[1].btf_id = + btf_sock_ids[BTF_SOCK_TYPE_SOCK]; + return bpf_iter_reg_target(&sock_map_iter_reg); +} +late_initcall(bpf_sockmap_iter_init); From 2f7de9865ba3cbfcf8b504f07154fdb6124176a4 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Wed, 9 Sep 2020 17:27:12 +0100 Subject: [PATCH 40/95] selftests: bpf: Test iterating a sockmap Add a test that exercises a basic sockmap / sockhash iteration. For now we simply count the number of elements seen. Once sockmap update from iterators works we can extend this to perform a full copy. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200909162712.221874-4-lmb@cloudflare.com --- .../selftests/bpf/prog_tests/sockmap_basic.c | 89 +++++++++++++++++++ tools/testing/selftests/bpf/progs/bpf_iter.h | 9 ++ .../selftests/bpf/progs/bpf_iter_sockmap.c | 43 +++++++++ .../selftests/bpf/progs/bpf_iter_sockmap.h | 3 + 4 files changed, 144 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_sockmap.c create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_sockmap.h diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 0b79d78b98db..4b7a527e7e82 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -6,6 +6,9 @@ #include "test_skmsg_load_helpers.skel.h" #include "test_sockmap_update.skel.h" #include "test_sockmap_invalid_update.skel.h" +#include "bpf_iter_sockmap.skel.h" + +#include "progs/bpf_iter_sockmap.h" #define TCP_REPAIR 19 /* TCP sock is under repair right now */ @@ -171,6 +174,88 @@ static void test_sockmap_invalid_update(void) test_sockmap_invalid_update__destroy(skel); } +static void test_sockmap_iter(enum bpf_map_type map_type) +{ + DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); + int err, len, src_fd, iter_fd, duration = 0; + union bpf_iter_link_info linfo = {0}; + __s64 sock_fd[SOCKMAP_MAX_ENTRIES]; + __u32 i, num_sockets, max_elems; + struct bpf_iter_sockmap *skel; + struct bpf_link *link; + struct bpf_map *src; + char buf[64]; + + skel = bpf_iter_sockmap__open_and_load(); + if (CHECK(!skel, "bpf_iter_sockmap__open_and_load", "skeleton open_and_load failed\n")) + return; + + for (i = 0; i < ARRAY_SIZE(sock_fd); i++) + sock_fd[i] = -1; + + /* Make sure we have at least one "empty" entry to test iteration of + * an empty slot. + */ + num_sockets = ARRAY_SIZE(sock_fd) - 1; + + if (map_type == BPF_MAP_TYPE_SOCKMAP) { + src = skel->maps.sockmap; + max_elems = bpf_map__max_entries(src); + } else { + src = skel->maps.sockhash; + max_elems = num_sockets; + } + + src_fd = bpf_map__fd(src); + + for (i = 0; i < num_sockets; i++) { + sock_fd[i] = connected_socket_v4(); + if (CHECK(sock_fd[i] == -1, "connected_socket_v4", "cannot connect\n")) + goto out; + + err = bpf_map_update_elem(src_fd, &i, &sock_fd[i], BPF_NOEXIST); + if (CHECK(err, "map_update", "failed: %s\n", strerror(errno))) + goto out; + } + + linfo.map.map_fd = src_fd; + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + link = bpf_program__attach_iter(skel->progs.count_elems, &opts); + if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + goto out; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n")) + goto free_link; + + /* do some tests */ + while ((len = read(iter_fd, buf, sizeof(buf))) > 0) + ; + if (CHECK(len < 0, "read", "failed: %s\n", strerror(errno))) + goto close_iter; + + /* test results */ + if (CHECK(skel->bss->elems != max_elems, "elems", "got %u expected %u\n", + skel->bss->elems, max_elems)) + goto close_iter; + + if (CHECK(skel->bss->socks != num_sockets, "socks", "got %u expected %u\n", + skel->bss->socks, num_sockets)) + goto close_iter; + +close_iter: + close(iter_fd); +free_link: + bpf_link__destroy(link); +out: + for (i = 0; i < num_sockets; i++) { + if (sock_fd[i] >= 0) + close(sock_fd[i]); + } + bpf_iter_sockmap__destroy(skel); +} + void test_sockmap_basic(void) { if (test__start_subtest("sockmap create_update_free")) @@ -187,4 +272,8 @@ void test_sockmap_basic(void) test_sockmap_update(BPF_MAP_TYPE_SOCKHASH); if (test__start_subtest("sockmap update in unsafe context")) test_sockmap_invalid_update(); + if (test__start_subtest("sockmap iter")) + test_sockmap_iter(BPF_MAP_TYPE_SOCKMAP); + if (test__start_subtest("sockhash iter")) + test_sockmap_iter(BPF_MAP_TYPE_SOCKHASH); } diff --git a/tools/testing/selftests/bpf/progs/bpf_iter.h b/tools/testing/selftests/bpf/progs/bpf_iter.h index c196280df90d..df682af75510 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter.h +++ b/tools/testing/selftests/bpf/progs/bpf_iter.h @@ -13,6 +13,7 @@ #define udp6_sock udp6_sock___not_used #define bpf_iter__bpf_map_elem bpf_iter__bpf_map_elem___not_used #define bpf_iter__bpf_sk_storage_map bpf_iter__bpf_sk_storage_map___not_used +#define bpf_iter__sockmap bpf_iter__sockmap___not_used #include "vmlinux.h" #undef bpf_iter_meta #undef bpf_iter__bpf_map @@ -26,6 +27,7 @@ #undef udp6_sock #undef bpf_iter__bpf_map_elem #undef bpf_iter__bpf_sk_storage_map +#undef bpf_iter__sockmap struct bpf_iter_meta { struct seq_file *seq; @@ -96,3 +98,10 @@ struct bpf_iter__bpf_sk_storage_map { struct sock *sk; void *value; }; + +struct bpf_iter__sockmap { + struct bpf_iter_meta *meta; + struct bpf_map *map; + void *key; + struct sock *sk; +}; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_sockmap.c b/tools/testing/selftests/bpf/progs/bpf_iter_sockmap.c new file mode 100644 index 000000000000..0e27f73dd803 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_sockmap.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Cloudflare */ +#include "bpf_iter.h" +#include "bpf_tracing_net.h" +#include "bpf_iter_sockmap.h" +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, SOCKMAP_MAX_ENTRIES); + __type(key, __u32); + __type(value, __u64); +} sockmap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_SOCKHASH); + __uint(max_entries, SOCKMAP_MAX_ENTRIES); + __type(key, __u32); + __type(value, __u64); +} sockhash SEC(".maps"); + +__u32 elems = 0; +__u32 socks = 0; + +SEC("iter/sockmap") +int count_elems(struct bpf_iter__sockmap *ctx) +{ + struct sock *sk = ctx->sk; + __u32 tmp, *key = ctx->key; + int ret; + + if (key) + elems++; + + if (sk) + socks++; + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_sockmap.h b/tools/testing/selftests/bpf/progs/bpf_iter_sockmap.h new file mode 100644 index 000000000000..35a675d13c0f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_sockmap.h @@ -0,0 +1,3 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#define SOCKMAP_MAX_ENTRIES (64) From a20693b6e72e59fb9b896193df0f8007693447d0 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Thu, 10 Sep 2020 11:26:50 +0100 Subject: [PATCH 41/95] tools: bpftool: Clean up function to dump map entry The function used to dump a map entry in bpftool is a bit difficult to follow, as a consequence to earlier refactorings. There is a variable ("num_elems") which does not appear to be necessary, and the error handling would look cleaner if moved to its own function. Let's clean it up. No functional change. v2: - v1 was erroneously removing the check on fd maps in an attempt to get support for outer map dumps. This is already working. Instead, v2 focuses on cleaning up the dump_map_elem() function, to avoid similar confusion in the future. Signed-off-by: Quentin Monnet Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200910102652.10509-2-quentin@isovalent.com --- tools/bpf/bpftool/map.c | 101 +++++++++++++++++++++------------------- 1 file changed, 52 insertions(+), 49 deletions(-) diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index bc0071228f88..c8159cb4fb1e 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -213,8 +213,9 @@ static void print_entry_json(struct bpf_map_info *info, unsigned char *key, jsonw_end_object(json_wtr); } -static void print_entry_error(struct bpf_map_info *info, unsigned char *key, - const char *error_msg) +static void +print_entry_error_msg(struct bpf_map_info *info, unsigned char *key, + const char *error_msg) { int msg_size = strlen(error_msg); bool single_line, break_names; @@ -232,6 +233,40 @@ static void print_entry_error(struct bpf_map_info *info, unsigned char *key, printf("\n"); } +static void +print_entry_error(struct bpf_map_info *map_info, void *key, int lookup_errno) +{ + /* For prog_array maps or arrays of maps, failure to lookup the value + * means there is no entry for that key. Do not print an error message + * in that case. + */ + if (map_is_map_of_maps(map_info->type) || + map_is_map_of_progs(map_info->type)) + return; + + if (json_output) { + jsonw_start_object(json_wtr); /* entry */ + jsonw_name(json_wtr, "key"); + print_hex_data_json(key, map_info->key_size); + jsonw_name(json_wtr, "value"); + jsonw_start_object(json_wtr); /* error */ + jsonw_string_field(json_wtr, "error", strerror(lookup_errno)); + jsonw_end_object(json_wtr); /* error */ + jsonw_end_object(json_wtr); /* entry */ + } else { + const char *msg = NULL; + + if (lookup_errno == ENOENT) + msg = ""; + else if (lookup_errno == ENOSPC && + map_info->type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) + msg = ""; + + print_entry_error_msg(map_info, key, + msg ? : strerror(lookup_errno)); + } +} + static void print_entry_plain(struct bpf_map_info *info, unsigned char *key, unsigned char *value) { @@ -713,56 +748,23 @@ static int dump_map_elem(int fd, void *key, void *value, struct bpf_map_info *map_info, struct btf *btf, json_writer_t *btf_wtr) { - int num_elems = 0; - int lookup_errno; - - if (!bpf_map_lookup_elem(fd, key, value)) { - if (json_output) { - print_entry_json(map_info, key, value, btf); - } else { - if (btf) { - struct btf_dumper d = { - .btf = btf, - .jw = btf_wtr, - .is_plain_text = true, - }; - - do_dump_btf(&d, map_info, key, value); - } else { - print_entry_plain(map_info, key, value); - } - num_elems++; - } - return num_elems; + if (bpf_map_lookup_elem(fd, key, value)) { + print_entry_error(map_info, key, errno); + return -1; } - /* lookup error handling */ - lookup_errno = errno; - - if (map_is_map_of_maps(map_info->type) || - map_is_map_of_progs(map_info->type)) - return 0; - if (json_output) { - jsonw_start_object(json_wtr); - jsonw_name(json_wtr, "key"); - print_hex_data_json(key, map_info->key_size); - jsonw_name(json_wtr, "value"); - jsonw_start_object(json_wtr); - jsonw_string_field(json_wtr, "error", strerror(lookup_errno)); - jsonw_end_object(json_wtr); - jsonw_end_object(json_wtr); + print_entry_json(map_info, key, value, btf); + } else if (btf) { + struct btf_dumper d = { + .btf = btf, + .jw = btf_wtr, + .is_plain_text = true, + }; + + do_dump_btf(&d, map_info, key, value); } else { - const char *msg = NULL; - - if (lookup_errno == ENOENT) - msg = ""; - else if (lookup_errno == ENOSPC && - map_info->type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) - msg = ""; - - print_entry_error(map_info, key, - msg ? : strerror(lookup_errno)); + print_entry_plain(map_info, key, value); } return 0; @@ -873,7 +875,8 @@ map_dump(int fd, struct bpf_map_info *info, json_writer_t *wtr, err = 0; break; } - num_elems += dump_map_elem(fd, key, value, info, btf, wtr); + if (!dump_map_elem(fd, key, value, info, btf, wtr)) + num_elems++; prev_key = key; } From 86233ce35e4b886dc5998c72ec6158ed72150782 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Thu, 10 Sep 2020 11:26:51 +0100 Subject: [PATCH 42/95] tools: bpftool: Keep errors for map-of-map dumps if distinct from ENOENT When dumping outer maps or prog_array maps, and on lookup failure, bpftool simply skips the entry with no error message. This is because the kernel returns non-zero when no value is found for the provided key, which frequently happen for those maps if they have not been filled. When such a case occurs, errno is set to ENOENT. It seems unlikely we could receive other error codes at this stage (we successfully retrieved map info just before), but to be on the safe side, let's skip the entry only if errno was ENOENT, and not for the other errors. v3: New patch Signed-off-by: Quentin Monnet Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200910102652.10509-3-quentin@isovalent.com --- tools/bpf/bpftool/map.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index c8159cb4fb1e..d8581d5e98a1 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -240,8 +240,8 @@ print_entry_error(struct bpf_map_info *map_info, void *key, int lookup_errno) * means there is no entry for that key. Do not print an error message * in that case. */ - if (map_is_map_of_maps(map_info->type) || - map_is_map_of_progs(map_info->type)) + if ((map_is_map_of_maps(map_info->type) || + map_is_map_of_progs(map_info->type)) && lookup_errno == ENOENT) return; if (json_output) { From e3b9626f09d429788d929c9b9000a069fcfc056e Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Thu, 10 Sep 2020 11:26:52 +0100 Subject: [PATCH 43/95] tools: bpftool: Add "inner_map" to "bpftool map create" outer maps There is no support for creating maps of types array-of-map or hash-of-map in bpftool. This is because the kernel needs an inner_map_fd to collect metadata on the inner maps to be supported by the new map, but bpftool does not provide a way to pass this file descriptor. Add a new optional "inner_map" keyword that can be used to pass a reference to a map, retrieve a fd to that map, and pass it as the inner_map_fd. Add related documentation and bash completion. Note that we can reference the inner map by its name, meaning we can have several times the keyword "name" with different meanings (mandatory outer map name, and possibly a name to use to find the inner_map_fd). The bash completion will offer it just once, and will not suggest "name" on the following command: # bpftool map create /sys/fs/bpf/my_outer_map type hash_of_maps \ inner_map name my_inner_map [TAB] Fixing that specific case seems too convoluted. Completion will work as expected, however, if the outer map name comes first and the "inner_map name ..." is passed second. Signed-off-by: Quentin Monnet Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200910102652.10509-4-quentin@isovalent.com --- .../bpf/bpftool/Documentation/bpftool-map.rst | 10 +++- tools/bpf/bpftool/bash-completion/bpftool | 22 ++++++++- tools/bpf/bpftool/map.c | 48 +++++++++++++------ 3 files changed, 62 insertions(+), 18 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 4b42629ade3e..8eac254ade48 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -23,7 +23,8 @@ MAP COMMANDS | **bpftool** **map** { **show** | **list** } [*MAP*] | **bpftool** **map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* \ -| **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**dev** *NAME*] +| **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**inner_map** *MAP*] \ +| [**dev** *NAME*] | **bpftool** **map dump** *MAP* | **bpftool** **map update** *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*] | **bpftool** **map lookup** *MAP* [**key** *DATA*] @@ -67,7 +68,7 @@ DESCRIPTION maps. On such kernels bpftool will automatically emit this information as well. - **bpftool map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**dev** *NAME*] + **bpftool map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**inner_map** *MAP*] [**dev** *NAME*] Create a new map with given parameters and pin it to *bpffs* as *FILE*. @@ -75,6 +76,11 @@ DESCRIPTION desired flags, e.g. 1024 for **BPF_F_MMAPABLE** (see bpf.h UAPI header for existing flags). + To create maps of type array-of-maps or hash-of-maps, the + **inner_map** keyword must be used to pass an inner map. The + kernel needs it to collect metadata related to the inner maps + that the new map will work with. + Keyword **dev** expects a network interface name, and is used to request hardware offload for the map. diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 7b68e3c0a5fb..3f1da30c4da6 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -709,9 +709,26 @@ _bpftool() "$cur" ) ) return 0 ;; - key|value|flags|name|entries) + key|value|flags|entries) return 0 ;; + inner_map) + COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) ) + return 0 + ;; + id) + _bpftool_get_map_ids + ;; + name) + case $pprev in + inner_map) + _bpftool_get_map_names + ;; + *) + return 0 + ;; + esac + ;; *) _bpftool_once_attr 'type' _bpftool_once_attr 'key' @@ -719,6 +736,9 @@ _bpftool() _bpftool_once_attr 'entries' _bpftool_once_attr 'name' _bpftool_once_attr 'flags' + if _bpftool_search_list 'array_of_maps' 'hash_of_maps'; then + _bpftool_once_attr 'inner_map' + fi _bpftool_once_attr 'dev' return 0 ;; diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index d8581d5e98a1..a7efbd84fbcc 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -1250,7 +1250,7 @@ static int do_create(int argc, char **argv) { struct bpf_create_map_attr attr = { NULL, }; const char *pinfile; - int err, fd; + int err = -1, fd; if (!REQ_ARGS(7)) return -1; @@ -1265,13 +1265,13 @@ static int do_create(int argc, char **argv) if (attr.map_type) { p_err("map type already specified"); - return -1; + goto exit; } attr.map_type = map_type_from_str(*argv); if ((int)attr.map_type < 0) { p_err("unrecognized map type: %s", *argv); - return -1; + goto exit; } NEXT_ARG(); } else if (is_prefix(*argv, "name")) { @@ -1280,43 +1280,56 @@ static int do_create(int argc, char **argv) } else if (is_prefix(*argv, "key")) { if (parse_u32_arg(&argc, &argv, &attr.key_size, "key size")) - return -1; + goto exit; } else if (is_prefix(*argv, "value")) { if (parse_u32_arg(&argc, &argv, &attr.value_size, "value size")) - return -1; + goto exit; } else if (is_prefix(*argv, "entries")) { if (parse_u32_arg(&argc, &argv, &attr.max_entries, "max entries")) - return -1; + goto exit; } else if (is_prefix(*argv, "flags")) { if (parse_u32_arg(&argc, &argv, &attr.map_flags, "flags")) - return -1; + goto exit; } else if (is_prefix(*argv, "dev")) { NEXT_ARG(); if (attr.map_ifindex) { p_err("offload device already specified"); - return -1; + goto exit; } attr.map_ifindex = if_nametoindex(*argv); if (!attr.map_ifindex) { p_err("unrecognized netdevice '%s': %s", *argv, strerror(errno)); - return -1; + goto exit; } NEXT_ARG(); + } else if (is_prefix(*argv, "inner_map")) { + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + int inner_map_fd; + + NEXT_ARG(); + if (!REQ_ARGS(2)) + usage(); + inner_map_fd = map_parse_fd_and_info(&argc, &argv, + &info, &len); + if (inner_map_fd < 0) + return -1; + attr.inner_map_fd = inner_map_fd; } else { p_err("unknown arg %s", *argv); - return -1; + goto exit; } } if (!attr.name) { p_err("map name not specified"); - return -1; + goto exit; } set_max_rlimit(); @@ -1324,17 +1337,22 @@ static int do_create(int argc, char **argv) fd = bpf_create_map_xattr(&attr); if (fd < 0) { p_err("map create failed: %s", strerror(errno)); - return -1; + goto exit; } err = do_pin_fd(fd, pinfile); close(fd); if (err) - return err; + goto exit; if (json_output) jsonw_null(json_wtr); - return 0; + +exit: + if (attr.inner_map_fd > 0) + close(attr.inner_map_fd); + + return err; } static int do_pop_dequeue(int argc, char **argv) @@ -1420,7 +1438,7 @@ static int do_help(int argc, char **argv) "Usage: %1$s %2$s { show | list } [MAP]\n" " %1$s %2$s create FILE type TYPE key KEY_SIZE value VALUE_SIZE \\\n" " entries MAX_ENTRIES name NAME [flags FLAGS] \\\n" - " [dev NAME]\n" + " [inner_map MAP] [dev NAME]\n" " %1$s %2$s dump MAP\n" " %1$s %2$s update MAP [key DATA] [value VALUE] [UPDATE_FLAGS]\n" " %1$s %2$s lookup MAP [key DATA]\n" From d66423fbe11e141206f117b232916aa899d44959 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Thu, 10 Sep 2020 12:02:48 +0100 Subject: [PATCH 44/95] bpf: Plug hole in struct bpf_sk_lookup_kern As Alexei points out, struct bpf_sk_lookup_kern has two 4-byte holes. This leads to suboptimal instructions being generated (IPv4, x86): 1372 struct bpf_sk_lookup_kern ctx = { 0xffffffff81b87f30 <+624>: xor %eax,%eax 0xffffffff81b87f32 <+626>: mov $0x6,%ecx 0xffffffff81b87f37 <+631>: lea 0x90(%rsp),%rdi 0xffffffff81b87f3f <+639>: movl $0x110002,0x88(%rsp) 0xffffffff81b87f4a <+650>: rep stos %rax,%es:(%rdi) 0xffffffff81b87f4d <+653>: mov 0x8(%rsp),%eax 0xffffffff81b87f51 <+657>: mov %r13d,0x90(%rsp) 0xffffffff81b87f59 <+665>: incl %gs:0x7e4970a0(%rip) 0xffffffff81b87f60 <+672>: mov %eax,0x8c(%rsp) 0xffffffff81b87f67 <+679>: movzwl 0x10(%rsp),%eax 0xffffffff81b87f6c <+684>: mov %ax,0xa8(%rsp) 0xffffffff81b87f74 <+692>: movzwl 0x38(%rsp),%eax 0xffffffff81b87f79 <+697>: mov %ax,0xaa(%rsp) Fix this by moving around sport and dport. pahole confirms there are no more holes: struct bpf_sk_lookup_kern { u16 family; /* 0 2 */ u16 protocol; /* 2 2 */ __be16 sport; /* 4 2 */ u16 dport; /* 6 2 */ struct { __be32 saddr; /* 8 4 */ __be32 daddr; /* 12 4 */ } v4; /* 8 8 */ struct { const struct in6_addr * saddr; /* 16 8 */ const struct in6_addr * daddr; /* 24 8 */ } v6; /* 16 16 */ struct sock * selected_sk; /* 32 8 */ bool no_reuseport; /* 40 1 */ /* size: 48, cachelines: 1, members: 8 */ /* padding: 7 */ /* last cacheline: 48 bytes */ }; The assembly also doesn't contain the pesky rep stos anymore: 1372 struct bpf_sk_lookup_kern ctx = { 0xffffffff81b87f60 <+624>: movzwl 0x10(%rsp),%eax 0xffffffff81b87f65 <+629>: movq $0x0,0xa8(%rsp) 0xffffffff81b87f71 <+641>: movq $0x0,0xb0(%rsp) 0xffffffff81b87f7d <+653>: mov %ax,0x9c(%rsp) 0xffffffff81b87f85 <+661>: movzwl 0x38(%rsp),%eax 0xffffffff81b87f8a <+666>: movq $0x0,0xb8(%rsp) 0xffffffff81b87f96 <+678>: mov %ax,0x9e(%rsp) 0xffffffff81b87f9e <+686>: mov 0x8(%rsp),%eax 0xffffffff81b87fa2 <+690>: movq $0x0,0xc0(%rsp) 0xffffffff81b87fae <+702>: movl $0x110002,0x98(%rsp) 0xffffffff81b87fb9 <+713>: mov %eax,0xa0(%rsp) 0xffffffff81b87fc0 <+720>: mov %r13d,0xa4(%rsp) 1: https://lore.kernel.org/bpf/CAADnVQKE6y9h2fwX6OS837v-Uf+aBXnT_JXiN_bbo2gitZQ3tA@mail.gmail.com/ Fixes: e9ddbb7707ff ("bpf: Introduce SK_LOOKUP program type with a dedicated attach point") Suggested-by: Alexei Starovoitov Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/20200910110248.198326-1-lmb@cloudflare.com --- include/linux/filter.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 995625950cc1..e962dd8117d8 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1287,6 +1287,8 @@ int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len); struct bpf_sk_lookup_kern { u16 family; u16 protocol; + __be16 sport; + u16 dport; struct { __be32 saddr; __be32 daddr; @@ -1295,8 +1297,6 @@ struct bpf_sk_lookup_kern { const struct in6_addr *saddr; const struct in6_addr *daddr; } v6; - __be16 sport; - u16 dport; struct sock *selected_sk; bool no_reuseport; }; From 90a1deda75c6b749fb89310e4744554a4c36ea33 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 10 Sep 2020 19:13:36 +0200 Subject: [PATCH 45/95] selftests/bpf: Fix test_ksyms on non-SMP kernels On non-SMP kernels __per_cpu_start is not 0, so look it up in kallsyms. Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200910171336.3161995-1-iii@linux.ibm.com --- tools/testing/selftests/bpf/prog_tests/ksyms.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms.c b/tools/testing/selftests/bpf/prog_tests/ksyms.c index e3d6777226a8..b771804b2342 100644 --- a/tools/testing/selftests/bpf/prog_tests/ksyms.c +++ b/tools/testing/selftests/bpf/prog_tests/ksyms.c @@ -32,6 +32,7 @@ out: void test_ksyms(void) { + __u64 per_cpu_start_addr = kallsyms_find("__per_cpu_start"); __u64 link_fops_addr = kallsyms_find("bpf_link_fops"); const char *btf_path = "/sys/kernel/btf/vmlinux"; struct test_ksyms *skel; @@ -63,8 +64,9 @@ void test_ksyms(void) "got %llu, exp %llu\n", data->out__bpf_link_fops1, (__u64)0); CHECK(data->out__btf_size != btf_size, "btf_size", "got %llu, exp %llu\n", data->out__btf_size, btf_size); - CHECK(data->out__per_cpu_start != 0, "__per_cpu_start", - "got %llu, exp %llu\n", data->out__per_cpu_start, (__u64)0); + CHECK(data->out__per_cpu_start != per_cpu_start_addr, "__per_cpu_start", + "got %llu, exp %llu\n", data->out__per_cpu_start, + per_cpu_start_addr); cleanup: test_ksyms__destroy(skel); From 6e057fc15a2da4ee03eb1fa6889cf687e690106e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 10 Sep 2020 13:27:18 -0700 Subject: [PATCH 46/95] selftests/bpf: Define string const as global for test_sysctl_prog.c When tweaking llvm optimizations, I found that selftest build failed with the following error: libbpf: elf: skipping unrecognized data section(6) .rodata.str1.1 libbpf: prog 'sysctl_tcp_mem': bad map relo against '.L__const.is_tcp_mem.tcp_mem_name' in section '.rodata.str1.1' Error: failed to open BPF object file: Relocation failed make: *** [/work/net-next/tools/testing/selftests/bpf/test_sysctl_prog.skel.h] Error 255 make: *** Deleting file `/work/net-next/tools/testing/selftests/bpf/test_sysctl_prog.skel.h' The local string constant "tcp_mem_name" is put into '.rodata.str1.1' section which libbpf cannot handle. Using untweaked upstream llvm, "tcp_mem_name" is completely inlined after loop unrolling. Commit 7fb5eefd7639 ("selftests/bpf: Fix test_sysctl_loop{1, 2} failure due to clang change") solved a similar problem by defining the string const as a global. Let us do the same here for test_sysctl_prog.c so it can weather future potential llvm changes. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200910202718.956042-1-yhs@fb.com --- tools/testing/selftests/bpf/progs/test_sysctl_prog.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_sysctl_prog.c b/tools/testing/selftests/bpf/progs/test_sysctl_prog.c index 50525235380e..5489823c83fc 100644 --- a/tools/testing/selftests/bpf/progs/test_sysctl_prog.c +++ b/tools/testing/selftests/bpf/progs/test_sysctl_prog.c @@ -19,11 +19,11 @@ #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) #endif +const char tcp_mem_name[] = "net/ipv4/tcp_mem"; static __always_inline int is_tcp_mem(struct bpf_sysctl *ctx) { - char tcp_mem_name[] = "net/ipv4/tcp_mem"; unsigned char i; - char name[64]; + char name[sizeof(tcp_mem_name)]; int ret; memset(name, 0, sizeof(name)); From 1aef5b4391f0c75c0a1523706a7b0311846ee12f Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 10 Sep 2020 13:33:14 -0700 Subject: [PATCH 47/95] bpf: Fix comment for helper bpf_current_task_under_cgroup() This should be "current" not "skb". Fixes: c6b5fb8690fa ("bpf: add documentation for eBPF helpers (42-50)") Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Cc: Link: https://lore.kernel.org/bpf/20200910203314.70018-1-songliubraving@fb.com --- include/uapi/linux/bpf.h | 4 ++-- tools/include/uapi/linux/bpf.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 90359cab501d..7dd314176df7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1447,8 +1447,8 @@ union bpf_attr { * Return * The return value depends on the result of the test, and can be: * - * * 0, if the *skb* task belongs to the cgroup2. - * * 1, if the *skb* task does not belong to the cgroup2. + * * 0, if current task belongs to the cgroup2. + * * 1, if current task does not belong to the cgroup2. * * A negative error code, if an error occurred. * * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 90359cab501d..7dd314176df7 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1447,8 +1447,8 @@ union bpf_attr { * Return * The return value depends on the result of the test, and can be: * - * * 0, if the *skb* task belongs to the cgroup2. - * * 1, if the *skb* task does not belong to the cgroup2. + * * 0, if current task belongs to the cgroup2. + * * 1, if current task does not belong to the cgroup2. * * A negative error code, if an error occurred. * * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) From 18841da98100c936990ac9013d55bf6b40e1f609 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Thu, 10 Sep 2020 21:39:35 +0100 Subject: [PATCH 48/95] tools: bpftool: Automate generation for "SEE ALSO" sections in man pages The "SEE ALSO" sections of bpftool's manual pages refer to bpf(2), bpf-helpers(7), then all existing bpftool man pages (save the current one). This leads to nearly-identical lists being duplicated in all manual pages. Ideally, when a new page is created, all lists should be updated accordingly, but this has led to omissions and inconsistencies multiple times in the past. Let's take it out of the RST files and generate the "SEE ALSO" sections automatically in the Makefile when generating the man pages. The lists are not really useful in the RST anyway because all other pages are available in the same directory. v3: - Fix conflict with a previous patchset that introduced RST2MAN_OPTS variable passed to rst2man. v2: - Use "echo -n" instead of "printf" in Makefile, to avoid any risk of passing a format string directly to the command. Signed-off-by: Quentin Monnet Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200910203935.25304-1-quentin@isovalent.com --- tools/bpf/bpftool/Documentation/Makefile | 12 +++++++++++- tools/bpf/bpftool/Documentation/bpftool-btf.rst | 17 ----------------- .../bpftool/Documentation/bpftool-cgroup.rst | 16 ---------------- .../bpftool/Documentation/bpftool-feature.rst | 16 ---------------- tools/bpf/bpftool/Documentation/bpftool-gen.rst | 16 ---------------- .../bpf/bpftool/Documentation/bpftool-iter.rst | 16 ---------------- .../bpf/bpftool/Documentation/bpftool-link.rst | 17 ----------------- tools/bpf/bpftool/Documentation/bpftool-map.rst | 16 ---------------- tools/bpf/bpftool/Documentation/bpftool-net.rst | 17 ----------------- .../bpf/bpftool/Documentation/bpftool-perf.rst | 17 ----------------- .../bpf/bpftool/Documentation/bpftool-prog.rst | 16 ---------------- .../Documentation/bpftool-struct_ops.rst | 17 ----------------- tools/bpf/bpftool/Documentation/bpftool.rst | 16 ---------------- 13 files changed, 11 insertions(+), 198 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/Makefile b/tools/bpf/bpftool/Documentation/Makefile index 5e3815320dab..4c9dd1e45244 100644 --- a/tools/bpf/bpftool/Documentation/Makefile +++ b/tools/bpf/bpftool/Documentation/Makefile @@ -30,11 +30,21 @@ man8: $(DOC_MAN8) RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null) RST2MAN_OPTS += --verbose +list_pages = $(sort $(basename $(filter-out $(1),$(MAN8_RST)))) +see_also = $(subst " ",, \ + "\n" \ + "SEE ALSO\n" \ + "========\n" \ + "\t**bpf**\ (2),\n" \ + "\t**bpf-helpers**\\ (7)" \ + $(foreach page,$(call list_pages,$(1)),",\n\t**$(page)**\\ (8)") \ + "\n") + $(OUTPUT)%.8: %.rst ifndef RST2MAN_DEP $(error "rst2man not found, but required to generate man pages") endif - $(QUIET_GEN)rst2man $(RST2MAN_OPTS) $< > $@ + $(QUIET_GEN)( cat $< ; echo -n $(call see_also,$<) ) | rst2man $(RST2MAN_OPTS) > $@ clean: helpers-clean $(call QUIET_CLEAN, Documentation) diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst index 52a7b2f6c9cb..ff4d327a582e 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst @@ -217,20 +217,3 @@ All the standard ways to specify map or program are supported: **# bpftool btf dump prog tag b88e0a09b1d9759d** **# bpftool btf dump prog pinned /sys/fs/bpf/prog_name** - -SEE ALSO -======== - **bpf**\ (2), - **bpf-helpers**\ (7), - **bpftool**\ (8), - **bpftool-btf**\ (8), - **bpftool-cgroup**\ (8), - **bpftool-feature**\ (8), - **bpftool-gen**\ (8), - **bpftool-iter**\ (8), - **bpftool-link**\ (8), - **bpftool-map**\ (8), - **bpftool-net**\ (8), - **bpftool-perf**\ (8), - **bpftool-prog**\ (8), - **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst index 3dba89db000e..790944c35602 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst @@ -143,19 +143,3 @@ EXAMPLES :: ID AttachType AttachFlags Name - -SEE ALSO -======== - **bpf**\ (2), - **bpf-helpers**\ (7), - **bpftool**\ (8), - **bpftool-btf**\ (8), - **bpftool-feature**\ (8), - **bpftool-gen**\ (8), - **bpftool-iter**\ (8), - **bpftool-link**\ (8), - **bpftool-map**\ (8), - **bpftool-net**\ (8), - **bpftool-perf**\ (8), - **bpftool-prog**\ (8), - **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst index f1aae5690e3c..dd3771bdbc57 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst @@ -72,19 +72,3 @@ DESCRIPTION OPTIONS ======= .. include:: common_options.rst - -SEE ALSO -======== - **bpf**\ (2), - **bpf-helpers**\ (7), - **bpftool**\ (8), - **bpftool-btf**\ (8), - **bpftool-cgroup**\ (8), - **bpftool-gen**\ (8), - **bpftool-iter**\ (8), - **bpftool-link**\ (8), - **bpftool-map**\ (8), - **bpftool-net**\ (8), - **bpftool-perf**\ (8), - **bpftool-prog**\ (8), - **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index 29d4cf4c3422..84cf0639696f 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -279,19 +279,3 @@ and global variables. my_static_var: 7 This is a stripped-out version of skeleton generated for above example code. - -SEE ALSO -======== - **bpf**\ (2), - **bpf-helpers**\ (7), - **bpftool**\ (8), - **bpftool-btf**\ (8), - **bpftool-cgroup**\ (8), - **bpftool-feature**\ (8), - **bpftool-iter**\ (8), - **bpftool-link**\ (8), - **bpftool-map**\ (8), - **bpftool-net**\ (8), - **bpftool-perf**\ (8), - **bpftool-prog**\ (8), - **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-iter.rst b/tools/bpf/bpftool/Documentation/bpftool-iter.rst index b688cf11805c..51f49bead619 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-iter.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-iter.rst @@ -68,19 +68,3 @@ EXAMPLES Create a file-based bpf iterator from bpf_iter_hashmap.o and map with id 20, and pin it to /sys/fs/bpf/my_hashmap - -SEE ALSO -======== - **bpf**\ (2), - **bpf-helpers**\ (7), - **bpftool**\ (8), - **bpftool-btf**\ (8), - **bpftool-cgroup**\ (8), - **bpftool-feature**\ (8), - **bpftool-gen**\ (8), - **bpftool-link**\ (8), - **bpftool-map**\ (8), - **bpftool-net**\ (8), - **bpftool-perf**\ (8), - **bpftool-prog**\ (8), - **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-link.rst b/tools/bpf/bpftool/Documentation/bpftool-link.rst index ce122be58bae..5f7db2a837cc 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-link.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-link.rst @@ -106,20 +106,3 @@ EXAMPLES :: -rw------- 1 root root 0 Apr 23 21:39 link - - -SEE ALSO -======== - **bpf**\ (2), - **bpf-helpers**\ (7), - **bpftool**\ (8), - **bpftool-btf**\ (8), - **bpftool-cgroup**\ (8), - **bpftool-feature**\ (8), - **bpftool-gen**\ (8), - **bpftool-iter**\ (8), - **bpftool-map**\ (8), - **bpftool-net**\ (8), - **bpftool-perf**\ (8), - **bpftool-prog**\ (8), - **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 8eac254ade48..dade10cdf295 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -270,19 +270,3 @@ would be lost as soon as bpftool exits). key: 00 00 00 00 value: 22 02 00 00 Found 1 element - -SEE ALSO -======== - **bpf**\ (2), - **bpf-helpers**\ (7), - **bpftool**\ (8), - **bpftool-btf**\ (8), - **bpftool-cgroup**\ (8), - **bpftool-feature**\ (8), - **bpftool-gen**\ (8), - **bpftool-iter**\ (8), - **bpftool-link**\ (8), - **bpftool-net**\ (8), - **bpftool-perf**\ (8), - **bpftool-prog**\ (8), - **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index 56439c32934d..d8165d530937 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -172,20 +172,3 @@ EXAMPLES :: xdp: - - -SEE ALSO -======== - **bpf**\ (2), - **bpf-helpers**\ (7), - **bpftool**\ (8), - **bpftool-btf**\ (8), - **bpftool-cgroup**\ (8), - **bpftool-feature**\ (8), - **bpftool-gen**\ (8), - **bpftool-iter**\ (8), - **bpftool-link**\ (8), - **bpftool-map**\ (8), - **bpftool-perf**\ (8), - **bpftool-prog**\ (8), - **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst index 36d257a36e9b..e958ce91de72 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-perf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst @@ -63,20 +63,3 @@ EXAMPLES {"pid":21765,"fd":5,"prog_id":7,"fd_type":"kretprobe","func":"__x64_sys_nanosleep","offset":0}, \ {"pid":21767,"fd":5,"prog_id":8,"fd_type":"tracepoint","tracepoint":"sys_enter_nanosleep"}, \ {"pid":21800,"fd":5,"prog_id":9,"fd_type":"uprobe","filename":"/home/yhs/a.out","offset":1159}] - - -SEE ALSO -======== - **bpf**\ (2), - **bpf-helpers**\ (7), - **bpftool**\ (8), - **bpftool-btf**\ (8), - **bpftool-cgroup**\ (8), - **bpftool-feature**\ (8), - **bpftool-gen**\ (8), - **bpftool-iter**\ (8), - **bpftool-link**\ (8), - **bpftool-map**\ (8), - **bpftool-net**\ (8), - **bpftool-prog**\ (8), - **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 9b2b18e2a3ac..358c7309d419 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -326,19 +326,3 @@ EXAMPLES 40176203 cycles (83.05%) 42518139 instructions # 1.06 insns per cycle (83.39%) 123 llc_misses # 2.89 LLC misses per million insns (83.15%) - -SEE ALSO -======== - **bpf**\ (2), - **bpf-helpers**\ (7), - **bpftool**\ (8), - **bpftool-btf**\ (8), - **bpftool-cgroup**\ (8), - **bpftool-feature**\ (8), - **bpftool-gen**\ (8), - **bpftool-iter**\ (8), - **bpftool-link**\ (8), - **bpftool-map**\ (8), - **bpftool-net**\ (8), - **bpftool-perf**\ (8), - **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst index 315f1f21f2ba..506e70ee78e9 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst @@ -82,20 +82,3 @@ EXAMPLES :: Registered tcp_congestion_ops cubic id 110 - - -SEE ALSO -======== - **bpf**\ (2), - **bpf-helpers**\ (7), - **bpftool**\ (8), - **bpftool-btf**\ (8), - **bpftool-cgroup**\ (8), - **bpftool-feature**\ (8), - **bpftool-gen**\ (8), - **bpftool-iter**\ (8), - **bpftool-link**\ (8), - **bpftool-map**\ (8), - **bpftool-net**\ (8), - **bpftool-perf**\ (8), - **bpftool-prog**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index b87f8c2df49d..e7d949334961 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -54,19 +54,3 @@ OPTIONS -n, --nomount Do not automatically attempt to mount any virtual file system (such as tracefs or BPF virtual file system) when necessary. - -SEE ALSO -======== - **bpf**\ (2), - **bpf-helpers**\ (7), - **bpftool-btf**\ (8), - **bpftool-cgroup**\ (8), - **bpftool-feature**\ (8), - **bpftool-gen**\ (8), - **bpftool-iter**\ (8), - **bpftool-link**\ (8), - **bpftool-map**\ (8), - **bpftool-net**\ (8), - **bpftool-perf**\ (8), - **bpftool-prog**\ (8), - **bpftool-struct_ops**\ (8) From 8919a9b31eb4fb4c0a93e5fb350a626924302aa6 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 10 Sep 2020 15:35:32 -0400 Subject: [PATCH 49/95] tcp: Only init congestion control if not initialized already Change tcp_init_transfer() to only initialize congestion control if it has not been initialized already. With this new approach, we can arrange things so that if the EBPF code sets the congestion control by calling setsockopt(TCP_CONGESTION) then tcp_init_transfer() will not re-initialize the CC module. This is an approach that has the following beneficial properties: (1) This allows CC module customizations made by the EBPF called in tcp_init_transfer() to persist, and not be wiped out by a later call to tcp_init_congestion_control() in tcp_init_transfer(). (2) Does not flip the order of EBPF and CC init, to avoid causing bugs for existing code upstream that depends on the current order. (3) Does not cause 2 initializations for for CC in the case where the EBPF called in tcp_init_transfer() wants to set the CC to a new CC algorithm. (4) Allows follow-on simplifications to the code in net/core/filter.c and net/ipv4/tcp_cong.c, which currently both have some complexity to special-case CC initialization to avoid double CC initialization if EBPF sets the CC. Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: Alexei Starovoitov Acked-by: Yuchung Cheng Acked-by: Kevin Yang Cc: Lawrence Brakmo --- include/net/inet_connection_sock.h | 3 ++- net/ipv4/tcp.c | 1 + net/ipv4/tcp_cong.c | 3 ++- net/ipv4/tcp_input.c | 4 +++- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index c738abeb3265..dc763ca9413c 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -96,7 +96,8 @@ struct inet_connection_sock { void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq); struct hlist_node icsk_listen_portaddr_node; unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); - __u8 icsk_ca_state:6, + __u8 icsk_ca_state:5, + icsk_ca_initialized:1, icsk_ca_setsockopt:1, icsk_ca_dst_locked:1; __u8 icsk_retransmits; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 57a568875539..7360d3db2b61 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2698,6 +2698,7 @@ int tcp_disconnect(struct sock *sk, int flags) if (icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + icsk->icsk_ca_initialized = 0; tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; tcp_clear_retrans(tp); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 62878cf26d9c..d18d7a1ce4ce 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -176,7 +176,7 @@ void tcp_assign_congestion_control(struct sock *sk) void tcp_init_congestion_control(struct sock *sk) { - const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); tcp_sk(sk)->prior_ssthresh = 0; if (icsk->icsk_ca_ops->init) @@ -185,6 +185,7 @@ void tcp_init_congestion_control(struct sock *sk) INET_ECN_xmit(sk); else INET_ECN_dontxmit(sk); + icsk->icsk_ca_initialized = 1; } static void tcp_reinit_congestion_control(struct sock *sk, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4337841faeff..0e5ac0d33fd3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5894,8 +5894,10 @@ void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb) tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); tp->snd_cwnd_stamp = tcp_jiffies32; + icsk->icsk_ca_initialized = 0; bpf_skops_established(sk, bpf_op, skb); - tcp_init_congestion_control(sk); + if (!icsk->icsk_ca_initialized) + tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); } From e7b10a4dd1b13c8fb8ee1686da7d01437c5da1d2 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 10 Sep 2020 15:35:33 -0400 Subject: [PATCH 50/95] tcp: Simplify EBPF TCP_CONGESTION to always init CC Now that the previous patch ensures we don't initialize the congestion control twice, when EBPF sets the congestion control algorithm at connection establishment we can simplify the code by simply initializing the congestion control module at that time. Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: Alexei Starovoitov Acked-by: Yuchung Cheng Acked-by: Kevin Yang Cc: Lawrence Brakmo --- net/core/filter.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 47eef9a0be6a..067f6759a68f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4313,8 +4313,6 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -#define SOCKOPT_CC_REINIT (1 << 0) - static int _bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen, u32 flags) { @@ -4449,13 +4447,12 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, sk->sk_prot->setsockopt == tcp_setsockopt) { if (optname == TCP_CONGESTION) { char name[TCP_CA_NAME_MAX]; - bool reinit = flags & SOCKOPT_CC_REINIT; strncpy(name, optval, min_t(long, optlen, TCP_CA_NAME_MAX-1)); name[TCP_CA_NAME_MAX-1] = 0; ret = tcp_set_congestion_control(sk, name, false, - reinit, true); + true, true); } else { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -4652,8 +4649,6 @@ BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { u32 flags = 0; - if (bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN) - flags |= SOCKOPT_CC_REINIT; return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen, flags); } From 29a949325c6c90f1431db9af64592275c83d9b2a Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 10 Sep 2020 15:35:34 -0400 Subject: [PATCH 51/95] tcp: simplify tcp_set_congestion_control(): Always reinitialize Now that the previous patches ensure that all call sites for tcp_set_congestion_control() want to initialize congestion control, we can simplify tcp_set_congestion_control() by removing the reinit argument and the code to support it. Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: Alexei Starovoitov Acked-by: Yuchung Cheng Acked-by: Kevin Yang Cc: Lawrence Brakmo --- include/net/tcp.h | 2 +- net/core/filter.c | 3 +-- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_cong.c | 11 ++--------- 4 files changed, 5 insertions(+), 13 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index e85d564446c6..f857146c17a5 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1104,7 +1104,7 @@ void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, - bool reinit, bool cap_net_admin); + bool cap_net_admin); u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); diff --git a/net/core/filter.c b/net/core/filter.c index 067f6759a68f..e89d6d7da03c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4451,8 +4451,7 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, strncpy(name, optval, min_t(long, optlen, TCP_CA_NAME_MAX-1)); name[TCP_CA_NAME_MAX-1] = 0; - ret = tcp_set_congestion_control(sk, name, false, - true, true); + ret = tcp_set_congestion_control(sk, name, false, true); } else { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7360d3db2b61..e58ab9db73ff 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3050,7 +3050,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, name[val] = 0; lock_sock(sk); - err = tcp_set_congestion_control(sk, name, true, true, + err = tcp_set_congestion_control(sk, name, true, ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)); release_sock(sk); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index d18d7a1ce4ce..a9b0fb52a1ec 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -341,7 +341,7 @@ out: * already initialized. */ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, - bool reinit, bool cap_net_admin) + bool cap_net_admin) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; @@ -365,15 +365,8 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, if (!ca) { err = -ENOENT; } else if (!load) { - const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops; - if (bpf_try_module_get(ca, ca->owner)) { - if (reinit) { - tcp_reinit_congestion_control(sk, ca); - } else { - icsk->icsk_ca_ops = ca; - bpf_module_put(old_ca, old_ca->owner); - } + tcp_reinit_congestion_control(sk, ca); } else { err = -EBUSY; } From 5cdc744caab7cb0d0a3ed479246cbab3dbc59c0e Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 10 Sep 2020 15:35:35 -0400 Subject: [PATCH 52/95] tcp: simplify _bpf_setsockopt(): Remove flags argument Now that the previous patches have removed the code that uses the flags argument to _bpf_setsockopt(), we can remove that argument. Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: Alexei Starovoitov Acked-by: Yuchung Cheng Acked-by: Kevin Yang Cc: Lawrence Brakmo --- net/core/filter.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index e89d6d7da03c..d266c6941967 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4314,7 +4314,7 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { }; static int _bpf_setsockopt(struct sock *sk, int level, int optname, - char *optval, int optlen, u32 flags) + char *optval, int optlen) { char devname[IFNAMSIZ]; int val, valbool; @@ -4611,9 +4611,7 @@ err_clear: BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx, int, level, int, optname, char *, optval, int, optlen) { - u32 flags = 0; - return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen, - flags); + return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { @@ -4647,9 +4645,7 @@ static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { - u32 flags = 0; - return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen, - flags); + return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { From 5050bef8736facac341fb7e2c0eda5002619acf8 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 10 Sep 2020 15:35:36 -0400 Subject: [PATCH 53/95] tcp: Simplify tcp_set_congestion_control() load=false case Simplify tcp_set_congestion_control() by removing the initialization code path for the !load case. There are only two call sites for tcp_set_congestion_control(). The EBPF call site is the only one that passes load=false; it also passes cap_net_admin=true. Because of that, the exact same behavior can be achieved by removing the special if (!load) branch of the logic. Both before and after this commit, the EBPF case will call bpf_try_module_get(), and if that succeeds then call tcp_reinit_congestion_control() or if that fails then return EBUSY. Note that this returns the logic to a structure very similar to the structure before: commit 91b5b21c7c16 ("bpf: Add support for changing congestion control") except that the CAP_NET_ADMIN status is passed in as a function argument. This clean-up was suggested by Martin KaFai Lau. Suggested-by: Martin KaFai Lau Signed-off-by: Neal Cardwell Signed-off-by: Alexei Starovoitov Cc: Lawrence Brakmo Cc: Eric Dumazet Cc: Yuchung Cheng Cc: Kevin Yang --- net/ipv4/tcp_cong.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index a9b0fb52a1ec..db47ac24d057 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -362,21 +362,14 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, goto out; } - if (!ca) { + if (!ca) err = -ENOENT; - } else if (!load) { - if (bpf_try_module_get(ca, ca->owner)) { - tcp_reinit_congestion_control(sk, ca); - } else { - err = -EBUSY; - } - } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) { + else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) err = -EPERM; - } else if (!bpf_try_module_get(ca, ca->owner)) { + else if (!bpf_try_module_get(ca, ca->owner)) err = -EBUSY; - } else { + else tcp_reinit_congestion_control(sk, ca); - } out: rcu_read_unlock(); return err; From d72714c1da138e6755d3bd14662dc5b7f17fae7f Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 10 Sep 2020 01:21:41 +0200 Subject: [PATCH 54/95] s390/bpf: Fix multiple tail calls In order to branch around tail calls (due to out-of-bounds index, exceeding tail call count or missing tail call target), JIT uses label[0] field, which contains the address of the instruction following the tail call. When there are multiple tail calls, label[0] value comes from handling of a previous tail call, which is incorrect. Fix by getting rid of label array and resolving the label address locally: for all 3 branches that jump to it, emit 0 offsets at the beginning, and then backpatch them with the correct value. Also, do not use the long jump infrastructure: the tail call sequence is known to be short, so make all 3 jumps short. Fixes: 6651ee070b31 ("s390/bpf: implement bpf_tail_call() helper") Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200909232141.3099367-1-iii@linux.ibm.com --- arch/s390/net/bpf_jit_comp.c | 61 ++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 34 deletions(-) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index be4b8532dd3c..0a4182792876 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -50,7 +50,6 @@ struct bpf_jit { int r14_thunk_ip; /* Address of expoline thunk for 'br %r14' */ int tail_call_start; /* Tail call start offset */ int excnt; /* Number of exception table entries */ - int labels[1]; /* Labels for local jumps */ }; #define SEEN_MEM BIT(0) /* use mem[] for temporary storage */ @@ -229,18 +228,18 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) REG_SET_SEEN(b3); \ }) -#define EMIT6_PCREL_LABEL(op1, op2, b1, b2, label, mask) \ +#define EMIT6_PCREL_RIEB(op1, op2, b1, b2, mask, target) \ ({ \ - int rel = (jit->labels[label] - jit->prg) >> 1; \ + unsigned int rel = (int)((target) - jit->prg) / 2; \ _EMIT6((op1) | reg(b1, b2) << 16 | (rel & 0xffff), \ (op2) | (mask) << 12); \ REG_SET_SEEN(b1); \ REG_SET_SEEN(b2); \ }) -#define EMIT6_PCREL_IMM_LABEL(op1, op2, b1, imm, label, mask) \ +#define EMIT6_PCREL_RIEC(op1, op2, b1, imm, mask, target) \ ({ \ - int rel = (jit->labels[label] - jit->prg) >> 1; \ + unsigned int rel = (int)((target) - jit->prg) / 2; \ _EMIT6((op1) | (reg_high(b1) | (mask)) << 16 | \ (rel & 0xffff), (op2) | ((imm) & 0xff) << 8); \ REG_SET_SEEN(b1); \ @@ -1282,7 +1281,9 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4(0xb9040000, BPF_REG_0, REG_2); break; } - case BPF_JMP | BPF_TAIL_CALL: + case BPF_JMP | BPF_TAIL_CALL: { + int patch_1_clrj, patch_2_clij, patch_3_brc; + /* * Implicit input: * B1: pointer to ctx @@ -1300,16 +1301,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT6_DISP_LH(0xe3000000, 0x0016, REG_W1, REG_0, BPF_REG_2, offsetof(struct bpf_array, map.max_entries)); /* if ((u32)%b3 >= (u32)%w1) goto out; */ - if (!is_first_pass(jit) && can_use_rel(jit, jit->labels[0])) { - /* clrj %b3,%w1,0xa,label0 */ - EMIT6_PCREL_LABEL(0xec000000, 0x0077, BPF_REG_3, - REG_W1, 0, 0xa); - } else { - /* clr %b3,%w1 */ - EMIT2(0x1500, BPF_REG_3, REG_W1); - /* brcl 0xa,label0 */ - EMIT6_PCREL_RILC(0xc0040000, 0xa, jit->labels[0]); - } + /* clrj %b3,%w1,0xa,out */ + patch_1_clrj = jit->prg; + EMIT6_PCREL_RIEB(0xec000000, 0x0077, BPF_REG_3, REG_W1, 0xa, + jit->prg); /* * if (tail_call_cnt++ > MAX_TAIL_CALL_CNT) @@ -1324,16 +1319,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4_IMM(0xa7080000, REG_W0, 1); /* laal %w1,%w0,off(%r15) */ EMIT6_DISP_LH(0xeb000000, 0x00fa, REG_W1, REG_W0, REG_15, off); - if (!is_first_pass(jit) && can_use_rel(jit, jit->labels[0])) { - /* clij %w1,MAX_TAIL_CALL_CNT,0x2,label0 */ - EMIT6_PCREL_IMM_LABEL(0xec000000, 0x007f, REG_W1, - MAX_TAIL_CALL_CNT, 0, 0x2); - } else { - /* clfi %w1,MAX_TAIL_CALL_CNT */ - EMIT6_IMM(0xc20f0000, REG_W1, MAX_TAIL_CALL_CNT); - /* brcl 0x2,label0 */ - EMIT6_PCREL_RILC(0xc0040000, 0x2, jit->labels[0]); - } + /* clij %w1,MAX_TAIL_CALL_CNT,0x2,out */ + patch_2_clij = jit->prg; + EMIT6_PCREL_RIEC(0xec000000, 0x007f, REG_W1, MAX_TAIL_CALL_CNT, + 2, jit->prg); /* * prog = array->ptrs[index]; @@ -1348,13 +1337,9 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, /* ltg %r1,prog(%b2,%r1) */ EMIT6_DISP_LH(0xe3000000, 0x0002, REG_1, BPF_REG_2, REG_1, offsetof(struct bpf_array, ptrs)); - if (!is_first_pass(jit) && can_use_rel(jit, jit->labels[0])) { - /* brc 0x8,label0 */ - EMIT4_PCREL_RIC(0xa7040000, 0x8, jit->labels[0]); - } else { - /* brcl 0x8,label0 */ - EMIT6_PCREL_RILC(0xc0040000, 0x8, jit->labels[0]); - } + /* brc 0x8,out */ + patch_3_brc = jit->prg; + EMIT4_PCREL_RIC(0xa7040000, 8, jit->prg); /* * Restore registers before calling function @@ -1371,8 +1356,16 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, /* bc 0xf,tail_call_start(%r1) */ _EMIT4(0x47f01000 + jit->tail_call_start); /* out: */ - jit->labels[0] = jit->prg; + if (jit->prg_buf) { + *(u16 *)(jit->prg_buf + patch_1_clrj + 2) = + (jit->prg - patch_1_clrj) >> 1; + *(u16 *)(jit->prg_buf + patch_2_clij + 2) = + (jit->prg - patch_2_clij) >> 1; + *(u16 *)(jit->prg_buf + patch_3_brc + 2) = + (jit->prg - patch_3_brc) >> 1; + } break; + } case BPF_JMP | BPF_EXIT: /* return b0 */ last = (i == fp->len - 1) ? 1 : 0; if (last) From 3131cf66d3033aa40db5b5d72a2673315b61c862 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 10 Sep 2020 10:31:04 +0200 Subject: [PATCH 55/95] samples/bpf: Fix one packet sending in xdpsock Fix the sending of a single packet (or small burst) in xdpsock when executing in copy mode. Currently, the l2fwd application in xdpsock only transmits the packets after a batch of them has been received, which might be confusing if you only send one packet and expect that it is returned pronto. Fix this by calling sendto() more often and add a comment in the code that states that this can be optimized if needed. Reported-by: Tirthendu Sarkar Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1599726666-8431-2-git-send-email-magnus.karlsson@gmail.com --- samples/bpf/xdpsock_user.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 4cead341ae57..b6175cb9a31d 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -897,6 +897,14 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk, if (!xsk->outstanding_tx) return; + /* In copy mode, Tx is driven by a syscall so we need to use e.g. sendto() to + * really send the packets. In zero-copy mode we do not have to do this, since Tx + * is driven by the NAPI loop. So as an optimization, we do not have to call + * sendto() all the time in zero-copy mode for l2fwd. + */ + if (opt_xdp_bind_flags & XDP_COPY) + kick_tx(xsk); + ndescs = (xsk->outstanding_tx > opt_batch_size) ? opt_batch_size : xsk->outstanding_tx; From 5a2a0dd88f0f267ac5953acd81050ae43a82201f Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 10 Sep 2020 10:31:05 +0200 Subject: [PATCH 56/95] samples/bpf: Fix possible deadlock in xdpsock Fix a possible deadlock in the l2fwd application in xdpsock that can occur when there is no space in the Tx ring. There are two ways to get the kernel to consume entries in the Tx ring: calling sendto() to make it send packets and freeing entries from the completion ring, as the kernel will not send a packet if there is no space for it to add a completion entry in the completion ring. The Tx loop in l2fwd only used to call sendto(). This patches adds cleaning the completion ring in that loop. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1599726666-8431-3-git-send-email-magnus.karlsson@gmail.com --- samples/bpf/xdpsock_user.c | 1 + 1 file changed, 1 insertion(+) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index b6175cb9a31d..b60bf4ef7cdb 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -1125,6 +1125,7 @@ static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds) while (ret != rcvd) { if (ret < 0) exit_with_error(-ret); + complete_tx_l2fwd(xsk, fds); if (xsk_ring_prod__needs_wakeup(&xsk->tx)) kick_tx(xsk); ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); From 74e00676d7f1f3c0676fdfffcf404a09a037bfe3 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 10 Sep 2020 10:31:06 +0200 Subject: [PATCH 57/95] samples/bpf: Add quiet option to xdpsock Add a quiet option (-Q) that disables the statistics print outs of xdpsock. This is good to have when measuring 0% loss rate performance as it will be quite terrible if the application uses printfs. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1599726666-8431-4-git-send-email-magnus.karlsson@gmail.com --- samples/bpf/xdpsock_user.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index b60bf4ef7cdb..b220173dbe1e 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -78,6 +78,7 @@ static int opt_pkt_count; static u16 opt_pkt_size = MIN_PKT_SIZE; static u32 opt_pkt_fill_pattern = 0x12345678; static bool opt_extra_stats; +static bool opt_quiet; static int opt_poll; static int opt_interval = 1; static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; @@ -718,6 +719,7 @@ static struct option long_options[] = { {"tx-pkt-size", required_argument, 0, 's'}, {"tx-pkt-pattern", required_argument, 0, 'P'}, {"extra-stats", no_argument, 0, 'x'}, + {"quiet", no_argument, 0, 'Q'}, {0, 0, 0, 0} }; @@ -753,6 +755,7 @@ static void usage(const char *prog) " Min size: %d, Max size %d.\n" " -P, --tx-pkt-pattern=nPacket fill pattern. Default: 0x%x\n" " -x, --extra-stats Display extra statistics.\n" + " -Q, --quiet Do not display any stats.\n" "\n"; fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE, opt_batch_size, MIN_PKT_SIZE, MIN_PKT_SIZE, @@ -768,7 +771,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:x", + c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQ", long_options, &option_index); if (c == -1) break; @@ -852,6 +855,9 @@ static void parse_command_line(int argc, char **argv) case 'x': opt_extra_stats = 1; break; + case 'Q': + opt_quiet = 1; + break; default: usage(basename(argv[0])); } @@ -1286,9 +1292,11 @@ int main(int argc, char **argv) setlocale(LC_ALL, ""); - ret = pthread_create(&pt, NULL, poller, NULL); - if (ret) - exit_with_error(ret); + if (!opt_quiet) { + ret = pthread_create(&pt, NULL, poller, NULL); + if (ret) + exit_with_error(ret); + } prev_time = get_nsecs(); start_time = prev_time; @@ -1302,7 +1310,8 @@ int main(int argc, char **argv) benchmark_done = true; - pthread_join(pt, NULL); + if (!opt_quiet) + pthread_join(pt, NULL); xdpsock_cleanup(); From bf74a370eb4086d3aee77c73e95303ee24779ee8 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 14 Sep 2020 16:50:36 +0200 Subject: [PATCH 58/95] xsk: Fix refcount warning in xp_dma_map Fix a potential refcount warning that a zero value is increased to one in xp_dma_map, by initializing the refcount to one to start with, instead of zero plus a refcount_inc(). Fixes: 921b68692abb ("xsk: Enable sharing of dma mappings") Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/1600095036-23868-1-git-send-email-magnus.karlsson@gmail.com --- net/xdp/xsk_buff_pool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 5b00bc5707f2..e63fadd000db 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -296,7 +296,7 @@ static struct xsk_dma_map *xp_create_dma_map(struct device *dev, struct net_devi dma_map->dev = dev; dma_map->dma_need_sync = false; dma_map->dma_pages_cnt = nr_pages; - refcount_set(&dma_map->users, 0); + refcount_set(&dma_map->users, 1); list_add(&dma_map->list, &umem->xsk_dma_list); return dma_map; } @@ -369,7 +369,6 @@ static int xp_init_dma_info(struct xsk_buff_pool *pool, struct xsk_dma_map *dma_ pool->dev = dma_map->dev; pool->dma_pages_cnt = dma_map->dma_pages_cnt; pool->dma_need_sync = dma_map->dma_need_sync; - refcount_inc(&dma_map->users); memcpy(pool->dma_pages, dma_map->dma_pages, pool->dma_pages_cnt * sizeof(*pool->dma_pages)); @@ -390,6 +389,7 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, if (err) return err; + refcount_inc(&dma_map->users); return 0; } From 63bea244fee23f31a4379769ef09e30365bb74b7 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 14 Sep 2020 11:31:10 -0700 Subject: [PATCH 59/95] bpftool: Fix build failure When building bpf selftests like make -C tools/testing/selftests/bpf -j20 I hit the following errors: ... GEN /net-next/tools/testing/selftests/bpf/tools/build/bpftool/Documentation/bpftool-gen.8 :75: (WARNING/2) Block quote ends without a blank line; unexpected unindent. :71: (WARNING/2) Literal block ends without a blank line; unexpected unindent. :85: (WARNING/2) Literal block ends without a blank line; unexpected unindent. :57: (WARNING/2) Block quote ends without a blank line; unexpected unindent. :66: (WARNING/2) Literal block ends without a blank line; unexpected unindent. :109: (WARNING/2) Literal block ends without a blank line; unexpected unindent. :175: (WARNING/2) Literal block ends without a blank line; unexpected unindent. :273: (WARNING/2) Literal block ends without a blank line; unexpected unindent. make[1]: *** [/net-next/tools/testing/selftests/bpf/tools/build/bpftool/Documentation/bpftool-perf.8] Error 12 make[1]: *** Waiting for unfinished jobs.... make[1]: *** [/net-next/tools/testing/selftests/bpf/tools/build/bpftool/Documentation/bpftool-iter.8] Error 12 make[1]: *** [/net-next/tools/testing/selftests/bpf/tools/build/bpftool/Documentation/bpftool-struct_ops.8] Error 12 ... I am using: -bash-4.4$ rst2man --version rst2man (Docutils 0.11 [repository], Python 2.7.5, on linux2) -bash-4.4$ The Makefile generated final .rst file (e.g., bpftool-cgroup.rst) looks like ... ID AttachType AttachFlags Name \n SEE ALSO\n========\n\t**bpf**\ (2),\n\t**bpf-helpers**\ (7),\n\t**bpftool**\ (8),\n\t**bpftool-btf**\ (8),\n\t**bpftool-feature**\ (8),\n\t**bpftool-gen**\ (8),\n\t**bpftool-iter**\ (8),\n\t**bpftool-link**\ (8),\n\t**bpftool-map**\ (8),\n\t**bpftool-net**\ (8),\n\t**bpftool-perf**\ (8),\n\t**bpftool-prog**\ (8),\n\t**bpftool-struct_ops**\ (8)\n The rst2man generated .8 file looks like Literal block ends without a blank line; unexpected unindent. .sp n SEEALSOn========nt**bpf**(2),nt**bpf\-helpers**(7),nt**bpftool**(8),nt**bpftool\-btf**(8),nt** bpftool\-feature**(8),nt**bpftool\-gen**(8),nt**bpftool\-iter**(8),nt**bpftool\-link**(8),nt** bpftool\-map**(8),nt**bpftool\-net**(8),nt**bpftool\-perf**(8),nt**bpftool\-prog**(8),nt** bpftool\-struct_ops**(8)n Looks like that particular version of rst2man prefers to have actual new line instead of \n. Since `echo -e` may not be available in some environment, let us use `printf`. Format string "%b" is used for `printf` to ensure all escape characters are interpretted properly. Fixes: 18841da98100 ("tools: bpftool: Automate generation for "SEE ALSO" sections in man pages") Suggested-by: Andrii Nakryiko Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: Andrii Nakryiko Cc: Quentin Monnet Link: https://lore.kernel.org/bpf/20200914183110.999906-1-yhs@fb.com --- tools/bpf/bpftool/Documentation/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/Documentation/Makefile b/tools/bpf/bpftool/Documentation/Makefile index 4c9dd1e45244..f33cb02de95c 100644 --- a/tools/bpf/bpftool/Documentation/Makefile +++ b/tools/bpf/bpftool/Documentation/Makefile @@ -44,7 +44,7 @@ $(OUTPUT)%.8: %.rst ifndef RST2MAN_DEP $(error "rst2man not found, but required to generate man pages") endif - $(QUIET_GEN)( cat $< ; echo -n $(call see_also,$<) ) | rst2man $(RST2MAN_OPTS) > $@ + $(QUIET_GEN)( cat $< ; printf "%b" $(call see_also,$<) ) | rst2man $(RST2MAN_OPTS) > $@ clean: helpers-clean $(call QUIET_CLEAN, Documentation) From d317b0a8acfc4b126858e4cdadb03338d22f8ce0 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 14 Sep 2020 15:32:10 -0700 Subject: [PATCH 60/95] libbpf: Fix a compilation error with xsk.c for ubuntu 16.04 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When syncing latest libbpf repo to bcc, ubuntu 16.04 (4.4.0 LTS kernel) failed compilation for xsk.c: In file included from /tmp/debuild.0jkauG/bcc/src/cc/libbpf/src/xsk.c:23:0: /tmp/debuild.0jkauG/bcc/src/cc/libbpf/src/xsk.c: In function ‘xsk_get_ctx’: /tmp/debuild.0jkauG/bcc/src/cc/libbpf/include/linux/list.h:81:9: warning: implicit declaration of function ‘container_of’ [-Wimplicit-function-declaration] container_of(ptr, type, member) ^ /tmp/debuild.0jkauG/bcc/src/cc/libbpf/include/linux/list.h:83:9: note: in expansion of macro ‘list_entry’ list_entry((ptr)->next, type, member) ... src/cc/CMakeFiles/bpf-static.dir/build.make:209: recipe for target 'src/cc/CMakeFiles/bpf-static.dir/libbpf/src/xsk.c.o' failed Commit 2f6324a3937f ("libbpf: Support shared umems between queues and devices") added include file , which uses macro "container_of". xsk.c file also includes before . In a more recent distro kernel, includes which contains the macro definition for "container_of". So compilation is all fine. But in ubuntu 16.04 kernel, does not contain which caused the above compilation error. Let explicitly add in xsk.c to avoid compilation error in old distro's. Fixes: 2f6324a3937f ("libbpf: Support shared umems between queues and devices") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200914223210.1831262-1-yhs@fb.com --- tools/lib/bpf/xsk.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index 49c324594792..30b4ca5d2ac7 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include From 984fe94f94756dacb3c8cc52904a23adf9e04da1 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Tue, 15 Sep 2020 16:45:39 -0700 Subject: [PATCH 61/95] bpf: Mutex protect used_maps array and count To support modifying the used_maps array, we use a mutex to protect the use of the counter and the array. The mutex is initialized right after the prog aux is allocated, and destroyed right before prog aux is freed. This way we guarantee it's initialized for both cBPF and eBPF. Signed-off-by: YiFei Zhu Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Cc: YiFei Zhu Link: https://lore.kernel.org/bpf/20200915234543.3220146-2-sdf@google.com --- .../net/ethernet/netronome/nfp/bpf/offload.c | 18 ++++++++++++------ include/linux/bpf.h | 1 + kernel/bpf/core.c | 15 +++++++++++---- kernel/bpf/syscall.c | 16 ++++++++++++---- net/core/dev.c | 11 ++++++++--- 5 files changed, 44 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index ac02369174a9..53851853562c 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -111,7 +111,9 @@ static int nfp_map_ptrs_record(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog, struct bpf_prog *prog) { - int i, cnt, err; + int i, cnt, err = 0; + + mutex_lock(&prog->aux->used_maps_mutex); /* Quickly count the maps we will have to remember */ cnt = 0; @@ -119,13 +121,15 @@ nfp_map_ptrs_record(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog, if (bpf_map_offload_neutral(prog->aux->used_maps[i])) cnt++; if (!cnt) - return 0; + goto out; nfp_prog->map_records = kmalloc_array(cnt, sizeof(nfp_prog->map_records[0]), GFP_KERNEL); - if (!nfp_prog->map_records) - return -ENOMEM; + if (!nfp_prog->map_records) { + err = -ENOMEM; + goto out; + } for (i = 0; i < prog->aux->used_map_cnt; i++) if (bpf_map_offload_neutral(prog->aux->used_maps[i])) { @@ -133,12 +137,14 @@ nfp_map_ptrs_record(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog, prog->aux->used_maps[i]); if (err) { nfp_map_ptrs_forget(bpf, nfp_prog); - return err; + goto out; } } WARN_ON(cnt != nfp_prog->map_records_cnt); - return 0; +out: + mutex_unlock(&prog->aux->used_maps_mutex); + return err; } static int diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c6d9f2c444f4..5dcce0364634 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -751,6 +751,7 @@ struct bpf_prog_aux { struct bpf_ksym ksym; const struct bpf_prog_ops *ops; struct bpf_map **used_maps; + struct mutex used_maps_mutex; /* mutex for used_maps and used_map_cnt */ struct bpf_prog *prog; struct user_struct *user; u64 load_time; /* ns since boottime */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ed0b3578867c..2a20c2833996 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -98,6 +98,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag fp->jit_requested = ebpf_jit_enabled(); INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); + mutex_init(&fp->aux->used_maps_mutex); return fp; } @@ -253,6 +254,7 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, void __bpf_prog_free(struct bpf_prog *fp) { if (fp->aux) { + mutex_destroy(&fp->aux->used_maps_mutex); free_percpu(fp->aux->stats); kfree(fp->aux->poke_tab); kfree(fp->aux); @@ -1747,8 +1749,9 @@ bool bpf_prog_array_compatible(struct bpf_array *array, static int bpf_check_tail_call(const struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; - int i; + int i, ret = 0; + mutex_lock(&aux->used_maps_mutex); for (i = 0; i < aux->used_map_cnt; i++) { struct bpf_map *map = aux->used_maps[i]; struct bpf_array *array; @@ -1757,11 +1760,15 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) continue; array = container_of(map, struct bpf_array, map); - if (!bpf_prog_array_compatible(array, fp)) - return -EINVAL; + if (!bpf_prog_array_compatible(array, fp)) { + ret = -EINVAL; + goto out; + } } - return 0; +out: + mutex_unlock(&aux->used_maps_mutex); + return ret; } static void bpf_prog_select_func(struct bpf_prog *fp) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4108ef3b828b..a67b8c6746be 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3162,21 +3162,25 @@ static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, const struct bpf_map *map; int i; + mutex_lock(&prog->aux->used_maps_mutex); for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) { map = prog->aux->used_maps[i]; if (map == (void *)addr) { *type = BPF_PSEUDO_MAP_FD; - return map; + goto out; } if (!map->ops->map_direct_value_meta) continue; if (!map->ops->map_direct_value_meta(map, addr, off)) { *type = BPF_PSEUDO_MAP_VALUE; - return map; + goto out; } } + map = NULL; - return NULL; +out: + mutex_unlock(&prog->aux->used_maps_mutex); + return map; } static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, @@ -3294,6 +3298,7 @@ static int bpf_prog_get_info_by_fd(struct file *file, memcpy(info.tag, prog->tag, sizeof(prog->tag)); memcpy(info.name, prog->aux->name, sizeof(prog->aux->name)); + mutex_lock(&prog->aux->used_maps_mutex); ulen = info.nr_map_ids; info.nr_map_ids = prog->aux->used_map_cnt; ulen = min_t(u32, info.nr_map_ids, ulen); @@ -3303,9 +3308,12 @@ static int bpf_prog_get_info_by_fd(struct file *file, for (i = 0; i < ulen; i++) if (put_user(prog->aux->used_maps[i]->id, - &user_map_ids[i])) + &user_map_ids[i])) { + mutex_unlock(&prog->aux->used_maps_mutex); return -EFAULT; + } } + mutex_unlock(&prog->aux->used_maps_mutex); err = set_info_rec_size(&info); if (err) diff --git a/net/core/dev.c b/net/core/dev.c index d42c9ea0c3c0..75d7f91337d9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5441,15 +5441,20 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) if (new) { u32 i; + mutex_lock(&new->aux->used_maps_mutex); + /* generic XDP does not work with DEVMAPs that can * have a bpf_prog installed on an entry */ for (i = 0; i < new->aux->used_map_cnt; i++) { - if (dev_map_can_have_prog(new->aux->used_maps[i])) - return -EINVAL; - if (cpu_map_prog_allowed(new->aux->used_maps[i])) + if (dev_map_can_have_prog(new->aux->used_maps[i]) || + cpu_map_prog_allowed(new->aux->used_maps[i])) { + mutex_unlock(&new->aux->used_maps_mutex); return -EINVAL; + } } + + mutex_unlock(&new->aux->used_maps_mutex); } switch (xdp->command) { From ef15314aa5de955c6afd87d512e8b00f5ac08d06 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Tue, 15 Sep 2020 16:45:40 -0700 Subject: [PATCH 62/95] bpf: Add BPF_PROG_BIND_MAP syscall This syscall binds a map to a program. Returns success if the map is already bound to the program. Signed-off-by: YiFei Zhu Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Cc: YiFei Zhu Link: https://lore.kernel.org/bpf/20200915234543.3220146-3-sdf@google.com --- include/uapi/linux/bpf.h | 7 ++++ kernel/bpf/syscall.c | 63 ++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 7 ++++ 3 files changed, 77 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7dd314176df7..a22812561064 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -124,6 +124,7 @@ enum bpf_cmd { BPF_ENABLE_STATS, BPF_ITER_CREATE, BPF_LINK_DETACH, + BPF_PROG_BIND_MAP, }; enum bpf_map_type { @@ -658,6 +659,12 @@ union bpf_attr { __u32 flags; } iter_create; + struct { /* struct used by BPF_PROG_BIND_MAP command */ + __u32 prog_fd; + __u32 map_fd; + __u32 flags; /* extra flags */ + } prog_bind_map; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a67b8c6746be..2ce32cad5c8e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4161,6 +4161,66 @@ static int bpf_iter_create(union bpf_attr *attr) return err; } +#define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags + +static int bpf_prog_bind_map(union bpf_attr *attr) +{ + struct bpf_prog *prog; + struct bpf_map *map; + struct bpf_map **used_maps_old, **used_maps_new; + int i, ret = 0; + + if (CHECK_ATTR(BPF_PROG_BIND_MAP)) + return -EINVAL; + + if (attr->prog_bind_map.flags) + return -EINVAL; + + prog = bpf_prog_get(attr->prog_bind_map.prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + map = bpf_map_get(attr->prog_bind_map.map_fd); + if (IS_ERR(map)) { + ret = PTR_ERR(map); + goto out_prog_put; + } + + mutex_lock(&prog->aux->used_maps_mutex); + + used_maps_old = prog->aux->used_maps; + + for (i = 0; i < prog->aux->used_map_cnt; i++) + if (used_maps_old[i] == map) + goto out_unlock; + + used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1, + sizeof(used_maps_new[0]), + GFP_KERNEL); + if (!used_maps_new) { + ret = -ENOMEM; + goto out_unlock; + } + + memcpy(used_maps_new, used_maps_old, + sizeof(used_maps_old[0]) * prog->aux->used_map_cnt); + used_maps_new[prog->aux->used_map_cnt] = map; + + prog->aux->used_map_cnt++; + prog->aux->used_maps = used_maps_new; + + kfree(used_maps_old); + +out_unlock: + mutex_unlock(&prog->aux->used_maps_mutex); + + if (ret) + bpf_map_put(map); +out_prog_put: + bpf_prog_put(prog); + return ret; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; @@ -4294,6 +4354,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_LINK_DETACH: err = link_detach(&attr); break; + case BPF_PROG_BIND_MAP: + err = bpf_prog_bind_map(&attr); + break; default: err = -EINVAL; break; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7dd314176df7..a22812561064 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -124,6 +124,7 @@ enum bpf_cmd { BPF_ENABLE_STATS, BPF_ITER_CREATE, BPF_LINK_DETACH, + BPF_PROG_BIND_MAP, }; enum bpf_map_type { @@ -658,6 +659,12 @@ union bpf_attr { __u32 flags; } iter_create; + struct { /* struct used by BPF_PROG_BIND_MAP command */ + __u32 prog_fd; + __u32 map_fd; + __u32 flags; /* extra flags */ + } prog_bind_map; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF From 5d23328dccd93c47e2719cb9d2ae303c235d277d Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Tue, 15 Sep 2020 16:45:41 -0700 Subject: [PATCH 63/95] libbpf: Add BPF_PROG_BIND_MAP syscall and use it on .rodata section The patch adds a simple wrapper bpf_prog_bind_map around the syscall. When the libbpf tries to load a program, it will probe the kernel for the support of this syscall and unconditionally bind .rodata section to the program. Signed-off-by: YiFei Zhu Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Cc: YiFei Zhu Link: https://lore.kernel.org/bpf/20200915234543.3220146-4-sdf@google.com --- tools/lib/bpf/bpf.c | 16 ++++++++++ tools/lib/bpf/bpf.h | 8 +++++ tools/lib/bpf/libbpf.c | 69 ++++++++++++++++++++++++++++++++++++++++ tools/lib/bpf/libbpf.map | 1 + 4 files changed, 94 insertions(+) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 82b983ff6569..2baa1308737c 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -872,3 +872,19 @@ int bpf_enable_stats(enum bpf_stats_type type) return sys_bpf(BPF_ENABLE_STATS, &attr, sizeof(attr)); } + +int bpf_prog_bind_map(int prog_fd, int map_fd, + const struct bpf_prog_bind_opts *opts) +{ + union bpf_attr attr; + + if (!OPTS_VALID(opts, bpf_prog_bind_opts)) + return -EINVAL; + + memset(&attr, 0, sizeof(attr)); + attr.prog_bind_map.prog_fd = prog_fd; + attr.prog_bind_map.map_fd = map_fd; + attr.prog_bind_map.flags = OPTS_GET(opts, flags, 0); + + return sys_bpf(BPF_PROG_BIND_MAP, &attr, sizeof(attr)); +} diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 015d13f25fcc..8c1ac4b42f90 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -243,6 +243,14 @@ LIBBPF_API int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, enum bpf_stats_type; /* defined in up-to-date linux/bpf.h */ LIBBPF_API int bpf_enable_stats(enum bpf_stats_type type); +struct bpf_prog_bind_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 flags; +}; +#define bpf_prog_bind_opts__last_field flags + +LIBBPF_API int bpf_prog_bind_map(int prog_fd, int map_fd, + const struct bpf_prog_bind_opts *opts); #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 550950eb1860..570235dbc922 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -174,6 +174,8 @@ enum kern_feature_id { FEAT_EXP_ATTACH_TYPE, /* bpf_probe_read_{kernel,user}[_str] helpers */ FEAT_PROBE_READ_KERN, + /* BPF_PROG_BIND_MAP is supported */ + FEAT_PROG_BIND_MAP, __FEAT_CNT, }; @@ -409,6 +411,7 @@ struct bpf_object { struct extern_desc *externs; int nr_extern; int kconfig_map_idx; + int rodata_map_idx; bool loaded; bool has_subcalls; @@ -1070,6 +1073,7 @@ static struct bpf_object *bpf_object__new(const char *path, obj->efile.bss_shndx = -1; obj->efile.st_ops_shndx = -1; obj->kconfig_map_idx = -1; + obj->rodata_map_idx = -1; obj->kern_version = get_kernel_version(); obj->loaded = false; @@ -1428,6 +1432,8 @@ static int bpf_object__init_global_data_maps(struct bpf_object *obj) obj->efile.rodata->d_size); if (err) return err; + + obj->rodata_map_idx = obj->nr_maps - 1; } if (obj->efile.bss_shndx >= 0) { err = bpf_object__init_internal_map(obj, LIBBPF_MAP_BSS, @@ -3894,6 +3900,52 @@ static int probe_kern_probe_read_kernel(void) return probe_fd(bpf_load_program_xattr(&attr, NULL, 0)); } +static int probe_prog_bind_map(void) +{ + struct bpf_load_program_attr prg_attr; + struct bpf_create_map_attr map_attr; + char *cp, errmsg[STRERR_BUFSIZE]; + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int ret, map, prog; + + memset(&map_attr, 0, sizeof(map_attr)); + map_attr.map_type = BPF_MAP_TYPE_ARRAY; + map_attr.key_size = sizeof(int); + map_attr.value_size = 32; + map_attr.max_entries = 1; + + map = bpf_create_map_xattr(&map_attr); + if (map < 0) { + ret = -errno; + cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); + pr_warn("Error in %s():%s(%d). Couldn't create simple array map.\n", + __func__, cp, -ret); + return ret; + } + + memset(&prg_attr, 0, sizeof(prg_attr)); + prg_attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER; + prg_attr.insns = insns; + prg_attr.insns_cnt = ARRAY_SIZE(insns); + prg_attr.license = "GPL"; + + prog = bpf_load_program_xattr(&prg_attr, NULL, 0); + if (prog < 0) { + close(map); + return 0; + } + + ret = bpf_prog_bind_map(prog, map, NULL); + + close(map); + close(prog); + + return ret >= 0; +} + enum kern_feature_result { FEAT_UNKNOWN = 0, FEAT_SUPPORTED = 1, @@ -3934,6 +3986,9 @@ static struct kern_feature_desc { }, [FEAT_PROBE_READ_KERN] = { "bpf_probe_read_kernel() helper", probe_kern_probe_read_kernel, + }, + [FEAT_PROG_BIND_MAP] = { + "BPF_PROG_BIND_MAP support", probe_prog_bind_map, } }; @@ -6468,6 +6523,20 @@ retry_load: if (ret >= 0) { if (log_buf && load_attr.log_level) pr_debug("verifier log:\n%s", log_buf); + + if (prog->obj->rodata_map_idx >= 0 && + kernel_supports(FEAT_PROG_BIND_MAP)) { + struct bpf_map *rodata_map = + &prog->obj->maps[prog->obj->rodata_map_idx]; + + if (bpf_prog_bind_map(ret, bpf_map__fd(rodata_map), NULL)) { + cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); + pr_warn("prog '%s': failed to bind .rodata map: %s\n", + prog->name, cp); + /* Don't fail hard if can't bind rodata. */ + } + } + *pfd = ret; ret = 0; goto out; diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 92ceb48a5ca2..5f054dadf082 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -302,6 +302,7 @@ LIBBPF_0.1.0 { LIBBPF_0.2.0 { global: + bpf_prog_bind_map; bpf_program__section_name; perf_buffer__buffer_cnt; perf_buffer__buffer_fd; From aff52e685eb39984f3a613e8a5c570d97e5d2414 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Tue, 15 Sep 2020 16:45:42 -0700 Subject: [PATCH 64/95] bpftool: Support dumping metadata Dump metadata in the 'bpftool prog' list if it's present. For some formatting some BTF code is put directly in the metadata dumping. Sanity checks on the map and the kind of the btf_type to make sure we are actually dumping what we are expecting. A helper jsonw_reset is added to json writer so we can reuse the same json writer without having extraneous commas. Sample output: $ bpftool prog 6: cgroup_skb name prog tag bcf7977d3b93787c gpl [...] btf_id 4 metadata: a = "foo" b = 1 $ bpftool prog --json --pretty [{ "id": 6, [...] "btf_id": 4, "metadata": { "a": "foo", "b": 1 } } ] Signed-off-by: YiFei Zhu Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Cc: YiFei Zhu Link: https://lore.kernel.org/bpf/20200915234543.3220146-5-sdf@google.com --- tools/bpf/bpftool/json_writer.c | 6 + tools/bpf/bpftool/json_writer.h | 3 + tools/bpf/bpftool/prog.c | 199 ++++++++++++++++++++++++++++++++ 3 files changed, 208 insertions(+) diff --git a/tools/bpf/bpftool/json_writer.c b/tools/bpf/bpftool/json_writer.c index 86501cd3c763..7fea83bedf48 100644 --- a/tools/bpf/bpftool/json_writer.c +++ b/tools/bpf/bpftool/json_writer.c @@ -119,6 +119,12 @@ void jsonw_pretty(json_writer_t *self, bool on) self->pretty = on; } +void jsonw_reset(json_writer_t *self) +{ + assert(self->depth == 0); + self->sep = '\0'; +} + /* Basic blocks */ static void jsonw_begin(json_writer_t *self, int c) { diff --git a/tools/bpf/bpftool/json_writer.h b/tools/bpf/bpftool/json_writer.h index 35cf1f00f96c..8ace65cdb92f 100644 --- a/tools/bpf/bpftool/json_writer.h +++ b/tools/bpf/bpftool/json_writer.h @@ -27,6 +27,9 @@ void jsonw_destroy(json_writer_t **self_p); /* Cause output to have pretty whitespace */ void jsonw_pretty(json_writer_t *self, bool on); +/* Reset separator to create new JSON */ +void jsonw_reset(json_writer_t *self); + /* Add property name */ void jsonw_name(json_writer_t *self, const char *name); diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index f7923414a052..d942c1e3372c 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -29,6 +29,9 @@ #include "main.h" #include "xlated_dumper.h" +#define BPF_METADATA_PREFIX "bpf_metadata_" +#define BPF_METADATA_PREFIX_LEN (sizeof(BPF_METADATA_PREFIX) - 1) + const char * const prog_type_name[] = { [BPF_PROG_TYPE_UNSPEC] = "unspec", [BPF_PROG_TYPE_SOCKET_FILTER] = "socket_filter", @@ -151,6 +154,198 @@ static void show_prog_maps(int fd, __u32 num_maps) } } +static void *find_metadata(int prog_fd, struct bpf_map_info *map_info) +{ + struct bpf_prog_info prog_info; + __u32 prog_info_len; + __u32 map_info_len; + void *value = NULL; + __u32 *map_ids; + int nr_maps; + int key = 0; + int map_fd; + int ret; + __u32 i; + + memset(&prog_info, 0, sizeof(prog_info)); + prog_info_len = sizeof(prog_info); + ret = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &prog_info_len); + if (ret) + return NULL; + + if (!prog_info.nr_map_ids) + return NULL; + + map_ids = calloc(prog_info.nr_map_ids, sizeof(__u32)); + if (!map_ids) + return NULL; + + nr_maps = prog_info.nr_map_ids; + memset(&prog_info, 0, sizeof(prog_info)); + prog_info.nr_map_ids = nr_maps; + prog_info.map_ids = ptr_to_u64(map_ids); + prog_info_len = sizeof(prog_info); + + ret = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &prog_info_len); + if (ret) + goto free_map_ids; + + for (i = 0; i < prog_info.nr_map_ids; i++) { + map_fd = bpf_map_get_fd_by_id(map_ids[i]); + if (map_fd < 0) + goto free_map_ids; + + memset(map_info, 0, sizeof(*map_info)); + map_info_len = sizeof(*map_info); + ret = bpf_obj_get_info_by_fd(map_fd, map_info, &map_info_len); + if (ret < 0) { + close(map_fd); + goto free_map_ids; + } + + if (map_info->type != BPF_MAP_TYPE_ARRAY || + map_info->key_size != sizeof(int) || + map_info->max_entries != 1 || + !map_info->btf_value_type_id || + !strstr(map_info->name, ".rodata")) { + close(map_fd); + continue; + } + + value = malloc(map_info->value_size); + if (!value) { + close(map_fd); + goto free_map_ids; + } + + if (bpf_map_lookup_elem(map_fd, &key, value)) { + close(map_fd); + free(value); + value = NULL; + goto free_map_ids; + } + + close(map_fd); + break; + } + +free_map_ids: + free(map_ids); + return value; +} + +static bool has_metadata_prefix(const char *s) +{ + return strncmp(s, BPF_METADATA_PREFIX, BPF_METADATA_PREFIX_LEN) == 0; +} + +static void show_prog_metadata(int fd, __u32 num_maps) +{ + const struct btf_type *t_datasec, *t_var; + struct bpf_map_info map_info; + struct btf_var_secinfo *vsi; + bool printed_header = false; + struct btf *btf = NULL; + unsigned int i, vlen; + void *value = NULL; + const char *name; + int err; + + if (!num_maps) + return; + + memset(&map_info, 0, sizeof(map_info)); + value = find_metadata(fd, &map_info); + if (!value) + return; + + err = btf__get_from_id(map_info.btf_id, &btf); + if (err || !btf) + goto out_free; + + t_datasec = btf__type_by_id(btf, map_info.btf_value_type_id); + if (!btf_is_datasec(t_datasec)) + goto out_free; + + vlen = btf_vlen(t_datasec); + vsi = btf_var_secinfos(t_datasec); + + /* We don't proceed to check the kinds of the elements of the DATASEC. + * The verifier enforces them to be BTF_KIND_VAR. + */ + + if (json_output) { + struct btf_dumper d = { + .btf = btf, + .jw = json_wtr, + .is_plain_text = false, + }; + + for (i = 0; i < vlen; i++, vsi++) { + t_var = btf__type_by_id(btf, vsi->type); + name = btf__name_by_offset(btf, t_var->name_off); + + if (!has_metadata_prefix(name)) + continue; + + if (!printed_header) { + jsonw_name(json_wtr, "metadata"); + jsonw_start_object(json_wtr); + printed_header = true; + } + + jsonw_name(json_wtr, name + BPF_METADATA_PREFIX_LEN); + err = btf_dumper_type(&d, t_var->type, value + vsi->offset); + if (err) { + p_err("btf dump failed: %d", err); + break; + } + } + if (printed_header) + jsonw_end_object(json_wtr); + } else { + json_writer_t *btf_wtr = jsonw_new(stdout); + struct btf_dumper d = { + .btf = btf, + .jw = btf_wtr, + .is_plain_text = true, + }; + + if (!btf_wtr) { + p_err("jsonw alloc failed"); + goto out_free; + } + + for (i = 0; i < vlen; i++, vsi++) { + t_var = btf__type_by_id(btf, vsi->type); + name = btf__name_by_offset(btf, t_var->name_off); + + if (!has_metadata_prefix(name)) + continue; + + if (!printed_header) { + printf("\tmetadata:"); + printed_header = true; + } + + printf("\n\t\t%s = ", name + BPF_METADATA_PREFIX_LEN); + + jsonw_reset(btf_wtr); + err = btf_dumper_type(&d, t_var->type, value + vsi->offset); + if (err) { + p_err("btf dump failed: %d", err); + break; + } + } + if (printed_header) + jsonw_destroy(&btf_wtr); + } + +out_free: + btf__free(btf); + free(value); +} + static void print_prog_header_json(struct bpf_prog_info *info) { jsonw_uint_field(json_wtr, "id", info->id); @@ -228,6 +423,8 @@ static void print_prog_json(struct bpf_prog_info *info, int fd) emit_obj_refs_json(&refs_table, info->id, json_wtr); + show_prog_metadata(fd, info->nr_map_ids); + jsonw_end_object(json_wtr); } @@ -297,6 +494,8 @@ static void print_prog_plain(struct bpf_prog_info *info, int fd) emit_obj_refs_plain(&refs_table, info->id, "\n\tpids "); printf("\n"); + + show_prog_metadata(fd, info->nr_map_ids); } static int show_prog(int fd) From d42d1cc44d702123d6ff12ce54a0e854036d29cb Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Tue, 15 Sep 2020 16:45:43 -0700 Subject: [PATCH 65/95] selftests/bpf: Test load and dump metadata with btftool and skel This is a simple test to check that loading and dumping metadata in btftool works, whether or not metadata contents are used by the program. A C test is also added to make sure the skeleton code can read the metadata values. Signed-off-by: YiFei Zhu Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Cc: YiFei Zhu Link: https://lore.kernel.org/bpf/20200915234543.3220146-6-sdf@google.com --- tools/testing/selftests/bpf/Makefile | 3 +- .../selftests/bpf/prog_tests/metadata.c | 141 ++++++++++++++++++ .../selftests/bpf/progs/metadata_unused.c | 15 ++ .../selftests/bpf/progs/metadata_used.c | 15 ++ .../selftests/bpf/test_bpftool_metadata.sh | 82 ++++++++++ 5 files changed, 255 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/metadata.c create mode 100644 tools/testing/selftests/bpf/progs/metadata_unused.c create mode 100644 tools/testing/selftests/bpf/progs/metadata_used.c create mode 100755 tools/testing/selftests/bpf/test_bpftool_metadata.sh diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 05798c2b5c67..2a63791177c4 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -68,7 +68,8 @@ TEST_PROGS := test_kmod.sh \ test_tc_edt.sh \ test_xdping.sh \ test_bpftool_build.sh \ - test_bpftool.sh + test_bpftool.sh \ + test_bpftool_metadata.sh \ TEST_PROGS_EXTENDED := with_addr.sh \ with_tunnels.sh \ diff --git a/tools/testing/selftests/bpf/prog_tests/metadata.c b/tools/testing/selftests/bpf/prog_tests/metadata.c new file mode 100644 index 000000000000..2c53eade88e3 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/metadata.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Copyright 2020 Google LLC. + */ + +#include +#include +#include + +#include "metadata_unused.skel.h" +#include "metadata_used.skel.h" + +static int duration; + +static int prog_holds_map(int prog_fd, int map_fd) +{ + struct bpf_prog_info prog_info = {}; + struct bpf_prog_info map_info = {}; + __u32 prog_info_len; + __u32 map_info_len; + __u32 *map_ids; + int nr_maps; + int ret; + int i; + + map_info_len = sizeof(map_info); + ret = bpf_obj_get_info_by_fd(map_fd, &map_info, &map_info_len); + if (ret) + return -errno; + + prog_info_len = sizeof(prog_info); + ret = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &prog_info_len); + if (ret) + return -errno; + + map_ids = calloc(prog_info.nr_map_ids, sizeof(__u32)); + if (!map_ids) + return -ENOMEM; + + nr_maps = prog_info.nr_map_ids; + memset(&prog_info, 0, sizeof(prog_info)); + prog_info.nr_map_ids = nr_maps; + prog_info.map_ids = ptr_to_u64(map_ids); + prog_info_len = sizeof(prog_info); + + ret = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &prog_info_len); + if (ret) { + ret = -errno; + goto free_map_ids; + } + + ret = -ENOENT; + for (i = 0; i < prog_info.nr_map_ids; i++) { + if (map_ids[i] == map_info.id) { + ret = 0; + break; + } + } + +free_map_ids: + free(map_ids); + return ret; +} + +static void test_metadata_unused(void) +{ + struct metadata_unused *obj; + int err; + + obj = metadata_unused__open_and_load(); + if (CHECK(!obj, "skel-load", "errno %d", errno)) + return; + + err = prog_holds_map(bpf_program__fd(obj->progs.prog), + bpf_map__fd(obj->maps.rodata)); + if (CHECK(err, "prog-holds-rodata", "errno: %d", err)) + return; + + /* Assert that we can access the metadata in skel and the values are + * what we expect. + */ + if (CHECK(strncmp(obj->rodata->bpf_metadata_a, "foo", + sizeof(obj->rodata->bpf_metadata_a)), + "bpf_metadata_a", "expected \"foo\", value differ")) + goto close_bpf_object; + if (CHECK(obj->rodata->bpf_metadata_b != 1, "bpf_metadata_b", + "expected 1, got %d", obj->rodata->bpf_metadata_b)) + goto close_bpf_object; + + /* Assert that binding metadata map to prog again succeeds. */ + err = bpf_prog_bind_map(bpf_program__fd(obj->progs.prog), + bpf_map__fd(obj->maps.rodata), NULL); + CHECK(err, "rebind_map", "errno %d, expected 0", errno); + +close_bpf_object: + metadata_unused__destroy(obj); +} + +static void test_metadata_used(void) +{ + struct metadata_used *obj; + int err; + + obj = metadata_used__open_and_load(); + if (CHECK(!obj, "skel-load", "errno %d", errno)) + return; + + err = prog_holds_map(bpf_program__fd(obj->progs.prog), + bpf_map__fd(obj->maps.rodata)); + if (CHECK(err, "prog-holds-rodata", "errno: %d", err)) + return; + + /* Assert that we can access the metadata in skel and the values are + * what we expect. + */ + if (CHECK(strncmp(obj->rodata->bpf_metadata_a, "bar", + sizeof(obj->rodata->bpf_metadata_a)), + "metadata_a", "expected \"bar\", value differ")) + goto close_bpf_object; + if (CHECK(obj->rodata->bpf_metadata_b != 2, "metadata_b", + "expected 2, got %d", obj->rodata->bpf_metadata_b)) + goto close_bpf_object; + + /* Assert that binding metadata map to prog again succeeds. */ + err = bpf_prog_bind_map(bpf_program__fd(obj->progs.prog), + bpf_map__fd(obj->maps.rodata), NULL); + CHECK(err, "rebind_map", "errno %d, expected 0", errno); + +close_bpf_object: + metadata_used__destroy(obj); +} + +void test_metadata(void) +{ + if (test__start_subtest("unused")) + test_metadata_unused(); + + if (test__start_subtest("used")) + test_metadata_used(); +} diff --git a/tools/testing/selftests/bpf/progs/metadata_unused.c b/tools/testing/selftests/bpf/progs/metadata_unused.c new file mode 100644 index 000000000000..672a0d19f8d0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/metadata_unused.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include + +volatile const char bpf_metadata_a[] SEC(".rodata") = "foo"; +volatile const int bpf_metadata_b SEC(".rodata") = 1; + +SEC("cgroup_skb/egress") +int prog(struct xdp_md *ctx) +{ + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/metadata_used.c b/tools/testing/selftests/bpf/progs/metadata_used.c new file mode 100644 index 000000000000..b7198e65383d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/metadata_used.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include + +volatile const char bpf_metadata_a[] SEC(".rodata") = "bar"; +volatile const int bpf_metadata_b SEC(".rodata") = 2; + +SEC("cgroup_skb/egress") +int prog(struct xdp_md *ctx) +{ + return bpf_metadata_b ? 1 : 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_bpftool_metadata.sh b/tools/testing/selftests/bpf/test_bpftool_metadata.sh new file mode 100755 index 000000000000..1bf81b49457a --- /dev/null +++ b/tools/testing/selftests/bpf/test_bpftool_metadata.sh @@ -0,0 +1,82 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +TESTNAME=bpftool_metadata +BPF_FS=$(awk '$3 == "bpf" {print $2; exit}' /proc/mounts) +BPF_DIR=$BPF_FS/test_$TESTNAME + +_cleanup() +{ + set +e + rm -rf $BPF_DIR 2> /dev/null +} + +cleanup_skip() +{ + echo "selftests: $TESTNAME [SKIP]" + _cleanup + + exit $ksft_skip +} + +cleanup() +{ + if [ "$?" = 0 ]; then + echo "selftests: $TESTNAME [PASS]" + else + echo "selftests: $TESTNAME [FAILED]" + fi + _cleanup +} + +if [ $(id -u) -ne 0 ]; then + echo "selftests: $TESTNAME [SKIP] Need root privileges" + exit $ksft_skip +fi + +if [ -z "$BPF_FS" ]; then + echo "selftests: $TESTNAME [SKIP] Could not run test without bpffs mounted" + exit $ksft_skip +fi + +if ! bpftool version > /dev/null 2>&1; then + echo "selftests: $TESTNAME [SKIP] Could not run test without bpftool" + exit $ksft_skip +fi + +set -e + +trap cleanup_skip EXIT + +mkdir $BPF_DIR + +trap cleanup EXIT + +bpftool prog load metadata_unused.o $BPF_DIR/unused + +METADATA_PLAIN="$(bpftool prog)" +echo "$METADATA_PLAIN" | grep 'a = "foo"' > /dev/null +echo "$METADATA_PLAIN" | grep 'b = 1' > /dev/null + +bpftool prog --json | grep '"metadata":{"a":"foo","b":1}' > /dev/null + +bpftool map | grep 'metadata.rodata' > /dev/null + +rm $BPF_DIR/unused + +bpftool prog load metadata_used.o $BPF_DIR/used + +METADATA_PLAIN="$(bpftool prog)" +echo "$METADATA_PLAIN" | grep 'a = "bar"' > /dev/null +echo "$METADATA_PLAIN" | grep 'b = 2' > /dev/null + +bpftool prog --json | grep '"metadata":{"a":"bar","b":2}' > /dev/null + +bpftool map | grep 'metadata.rodata' > /dev/null + +rm $BPF_DIR/used + +exit 0 From c64779e24e88a9915b2f35fbb9851ef2c5462cc2 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 15 Sep 2020 17:48:19 -0700 Subject: [PATCH 66/95] selftests/bpf: Merge most of test_btf into test_progs Merge 183 tests from test_btf into test_progs framework to be exercised regularly. All the test_btf tests that were moved are modeled as proper sub-tests in test_progs framework for ease of debugging and reporting. No functional or behavioral changes were intended, I tried to preserve original behavior as much as possible. E.g., `test_progs -v` will activate "always_log" flag to emit BTF validation log. The only difference is in reducing the max_entries limit for pretty-printing tests from (128 * 1024) to just 128 to reduce tests running time without reducing the coverage. Example test run: $ sudo ./test_progs -n 8 ... #8 btf:OK Summary: 1/183 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200916004819.3767489-1-andriin@fb.com --- tools/testing/selftests/bpf/.gitignore | 1 - tools/testing/selftests/bpf/Makefile | 2 +- .../bpf/{test_btf.c => prog_tests/btf.c} | 410 ++++-------------- 3 files changed, 78 insertions(+), 335 deletions(-) rename tools/testing/selftests/bpf/{test_btf.c => prog_tests/btf.c} (96%) diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 9a0946ddb705..e8fed558b8b8 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -15,7 +15,6 @@ test_sock test_sock_addr test_sock_fields urandom_read -test_btf test_sockmap test_lirc_mode2_user get_cgroup_id_user diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 2a63791177c4..59a5fa5fe837 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -33,7 +33,7 @@ LDLIBS += -lcap -lelf -lz -lrt -lpthread # Order correspond to 'make run_tests' order TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ test_verifier_log test_dev_cgroup test_tcpbpf_user \ - test_sock test_btf test_sockmap get_cgroup_id_user test_socket_cookie \ + test_sock test_sockmap get_cgroup_id_user test_socket_cookie \ test_cgroup_storage \ test_netcnt test_tcpnotify_user test_sock_fields test_sysctl \ test_progs-no_alu32 \ diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c similarity index 96% rename from tools/testing/selftests/bpf/test_btf.c rename to tools/testing/selftests/bpf/prog_tests/btf.c index c75fc6447186..93162484c2ca 100644 --- a/tools/testing/selftests/bpf/test_btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -24,40 +24,17 @@ #include "bpf_rlimit.h" #include "bpf_util.h" -#include "test_btf.h" +#include "../test_btf.h" +#include "test_progs.h" #define MAX_INSNS 512 #define MAX_SUBPROGS 16 -static uint32_t pass_cnt; -static uint32_t error_cnt; -static uint32_t skip_cnt; +static int duration = 0; +static bool always_log; -#define CHECK(condition, format...) ({ \ - int __ret = !!(condition); \ - if (__ret) { \ - fprintf(stderr, "%s:%d:FAIL ", __func__, __LINE__); \ - fprintf(stderr, format); \ - } \ - __ret; \ -}) - -static int count_result(int err) -{ - if (err) - error_cnt++; - else - pass_cnt++; - - fprintf(stderr, "\n"); - return err; -} - -static int __base_pr(enum libbpf_print_level level __attribute__((unused)), - const char *format, va_list args) -{ - return vfprintf(stderr, format, args); -} +#undef CHECK +#define CHECK(condition, format...) _CHECK(condition, "check", duration, format) #define BTF_END_RAW 0xdeadbeef #define NAME_TBD 0xdeadb33f @@ -69,21 +46,6 @@ static int __base_pr(enum libbpf_print_level level __attribute__((unused)), #define MAX_NR_RAW_U32 1024 #define BTF_LOG_BUF_SIZE 65535 -static struct args { - unsigned int raw_test_num; - unsigned int file_test_num; - unsigned int get_info_test_num; - unsigned int info_raw_test_num; - unsigned int dedup_test_num; - bool raw_test; - bool file_test; - bool get_info_test; - bool pprint_test; - bool always_log; - bool info_raw_test; - bool dedup_test; -} args; - static char btf_log_buf[BTF_LOG_BUF_SIZE]; static struct btf_header hdr_tmpl = { @@ -3664,7 +3626,7 @@ done: return raw_btf; } -static int do_test_raw(unsigned int test_num) +static void do_test_raw(unsigned int test_num) { struct btf_raw_test *test = &raw_tests[test_num - 1]; struct bpf_create_map_attr create_attr = {}; @@ -3674,15 +3636,16 @@ static int do_test_raw(unsigned int test_num) void *raw_btf; int err; - fprintf(stderr, "BTF raw test[%u] (%s): ", test_num, test->descr); + if (!test__start_subtest(test->descr)) + return; + raw_btf = btf_raw_create(&hdr_tmpl, test->raw_types, test->str_sec, test->str_sec_size, &raw_btf_size, NULL); - if (!raw_btf) - return -1; + return; hdr = raw_btf; @@ -3694,7 +3657,7 @@ static int do_test_raw(unsigned int test_num) *btf_log_buf = '\0'; btf_fd = bpf_load_btf(raw_btf, raw_btf_size, btf_log_buf, BTF_LOG_BUF_SIZE, - args.always_log); + always_log); free(raw_btf); err = ((btf_fd == -1) != test->btf_load_err); @@ -3725,32 +3688,12 @@ static int do_test_raw(unsigned int test_num) map_fd, test->map_create_err); done: - if (!err) - fprintf(stderr, "OK"); - - if (*btf_log_buf && (err || args.always_log)) + if (*btf_log_buf && (err || always_log)) fprintf(stderr, "\n%s", btf_log_buf); - if (btf_fd != -1) close(btf_fd); if (map_fd != -1) close(map_fd); - - return err; -} - -static int test_raw(void) -{ - unsigned int i; - int err = 0; - - if (args.raw_test_num) - return count_result(do_test_raw(args.raw_test_num)); - - for (i = 1; i <= ARRAY_SIZE(raw_tests); i++) - err |= count_result(do_test_raw(i)); - - return err; } struct btf_get_info_test { @@ -3814,11 +3757,6 @@ const struct btf_get_info_test get_info_tests[] = { }, }; -static inline __u64 ptr_to_u64(const void *ptr) -{ - return (__u64)(unsigned long)ptr; -} - static int test_big_btf_info(unsigned int test_num) { const struct btf_get_info_test *test = &get_info_tests[test_num - 1]; @@ -3851,7 +3789,7 @@ static int test_big_btf_info(unsigned int test_num) btf_fd = bpf_load_btf(raw_btf, raw_btf_size, btf_log_buf, BTF_LOG_BUF_SIZE, - args.always_log); + always_log); if (CHECK(btf_fd == -1, "errno:%d", errno)) { err = -1; goto done; @@ -3892,7 +3830,7 @@ static int test_big_btf_info(unsigned int test_num) fprintf(stderr, "OK"); done: - if (*btf_log_buf && (err || args.always_log)) + if (*btf_log_buf && (err || always_log)) fprintf(stderr, "\n%s", btf_log_buf); free(raw_btf); @@ -3939,7 +3877,7 @@ static int test_btf_id(unsigned int test_num) btf_fd[0] = bpf_load_btf(raw_btf, raw_btf_size, btf_log_buf, BTF_LOG_BUF_SIZE, - args.always_log); + always_log); if (CHECK(btf_fd[0] == -1, "errno:%d", errno)) { err = -1; goto done; @@ -4024,7 +3962,7 @@ static int test_btf_id(unsigned int test_num) fprintf(stderr, "OK"); done: - if (*btf_log_buf && (err || args.always_log)) + if (*btf_log_buf && (err || always_log)) fprintf(stderr, "\n%s", btf_log_buf); free(raw_btf); @@ -4039,7 +3977,7 @@ done: return err; } -static int do_test_get_info(unsigned int test_num) +static void do_test_get_info(unsigned int test_num) { const struct btf_get_info_test *test = &get_info_tests[test_num - 1]; unsigned int raw_btf_size, user_btf_size, expected_nbytes; @@ -4048,11 +3986,14 @@ static int do_test_get_info(unsigned int test_num) int btf_fd = -1, err, ret; uint32_t info_len; - fprintf(stderr, "BTF GET_INFO test[%u] (%s): ", - test_num, test->descr); + if (!test__start_subtest(test->descr)) + return; - if (test->special_test) - return test->special_test(test_num); + if (test->special_test) { + err = test->special_test(test_num); + if (CHECK(err, "failed: %d\n", err)) + return; + } raw_btf = btf_raw_create(&hdr_tmpl, test->raw_types, @@ -4061,7 +4002,7 @@ static int do_test_get_info(unsigned int test_num) &raw_btf_size, NULL); if (!raw_btf) - return -1; + return; *btf_log_buf = '\0'; @@ -4073,7 +4014,7 @@ static int do_test_get_info(unsigned int test_num) btf_fd = bpf_load_btf(raw_btf, raw_btf_size, btf_log_buf, BTF_LOG_BUF_SIZE, - args.always_log); + always_log); if (CHECK(btf_fd == -1, "errno:%d", errno)) { err = -1; goto done; @@ -4114,7 +4055,7 @@ static int do_test_get_info(unsigned int test_num) fprintf(stderr, "OK"); done: - if (*btf_log_buf && (err || args.always_log)) + if (*btf_log_buf && (err || always_log)) fprintf(stderr, "\n%s", btf_log_buf); free(raw_btf); @@ -4122,22 +4063,6 @@ done: if (btf_fd != -1) close(btf_fd); - - return err; -} - -static int test_get_info(void) -{ - unsigned int i; - int err = 0; - - if (args.get_info_test_num) - return count_result(do_test_get_info(args.get_info_test_num)); - - for (i = 1; i <= ARRAY_SIZE(get_info_tests); i++) - err |= count_result(do_test_get_info(i)); - - return err; } struct btf_file_test { @@ -4151,7 +4076,7 @@ static struct btf_file_test file_tests[] = { { .file = "test_btf_nokv.o", .btf_kv_notfound = true, }, }; -static int do_test_file(unsigned int test_num) +static void do_test_file(unsigned int test_num) { const struct btf_file_test *test = &file_tests[test_num - 1]; const char *expected_fnames[] = {"_dummy_tracepoint", @@ -4169,17 +4094,17 @@ static int do_test_file(unsigned int test_num) struct bpf_map *map; int i, err, prog_fd; - fprintf(stderr, "BTF libbpf test[%u] (%s): ", test_num, - test->file); + if (!test__start_subtest(test->file)) + return; btf = btf__parse_elf(test->file, &btf_ext); if (IS_ERR(btf)) { if (PTR_ERR(btf) == -ENOENT) { - fprintf(stderr, "SKIP. No ELF %s found", BTF_ELF_SEC); - skip_cnt++; - return 0; + printf("%s:SKIP: No ELF %s found", __func__, BTF_ELF_SEC); + test__skip(); + return; } - return PTR_ERR(btf); + return; } btf__free(btf); @@ -4188,7 +4113,7 @@ static int do_test_file(unsigned int test_num) obj = bpf_object__open(test->file); if (CHECK(IS_ERR(obj), "obj: %ld", PTR_ERR(obj))) - return PTR_ERR(obj); + return; prog = bpf_program__next(NULL, obj); if (CHECK(!prog, "Cannot find bpf_prog")) { @@ -4310,21 +4235,6 @@ skip: done: free(func_info); bpf_object__close(obj); - return err; -} - -static int test_file(void) -{ - unsigned int i; - int err = 0; - - if (args.file_test_num) - return count_result(do_test_file(args.file_test_num)); - - for (i = 1; i <= ARRAY_SIZE(file_tests); i++) - err |= count_result(do_test_file(i)); - - return err; } const char *pprint_enum_str[] = { @@ -4428,7 +4338,7 @@ static struct btf_raw_test pprint_test_template[] = { .value_size = sizeof(struct pprint_mapv), .key_type_id = 3, /* unsigned int */ .value_type_id = 16, /* struct pprint_mapv */ - .max_entries = 128 * 1024, + .max_entries = 128, }, { @@ -4493,7 +4403,7 @@ static struct btf_raw_test pprint_test_template[] = { .value_size = sizeof(struct pprint_mapv), .key_type_id = 3, /* unsigned int */ .value_type_id = 16, /* struct pprint_mapv */ - .max_entries = 128 * 1024, + .max_entries = 128, }, { @@ -4564,7 +4474,7 @@ static struct btf_raw_test pprint_test_template[] = { .value_size = sizeof(struct pprint_mapv), .key_type_id = 3, /* unsigned int */ .value_type_id = 16, /* struct pprint_mapv */ - .max_entries = 128 * 1024, + .max_entries = 128, }, #ifdef __SIZEOF_INT128__ @@ -4591,7 +4501,7 @@ static struct btf_raw_test pprint_test_template[] = { .value_size = sizeof(struct pprint_mapv_int128), .key_type_id = 1, .value_type_id = 4, - .max_entries = 128 * 1024, + .max_entries = 128, .mapv_kind = PPRINT_MAPV_KIND_INT128, }, #endif @@ -4790,7 +4700,7 @@ static int check_line(const char *expected_line, int nexpected_line, } -static int do_test_pprint(int test_num) +static void do_test_pprint(int test_num) { const struct btf_raw_test *test = &pprint_test_template[test_num]; enum pprint_mapv_kind_t mapv_kind = test->mapv_kind; @@ -4809,18 +4719,20 @@ static int do_test_pprint(int test_num) uint8_t *raw_btf; ssize_t nread; - fprintf(stderr, "%s(#%d)......", test->descr, test_num); + if (!test__start_subtest(test->descr)) + return; + raw_btf = btf_raw_create(&hdr_tmpl, test->raw_types, test->str_sec, test->str_sec_size, &raw_btf_size, NULL); if (!raw_btf) - return -1; + return; *btf_log_buf = '\0'; btf_fd = bpf_load_btf(raw_btf, raw_btf_size, btf_log_buf, BTF_LOG_BUF_SIZE, - args.always_log); + always_log); free(raw_btf); if (CHECK(btf_fd == -1, "errno:%d", errno)) { @@ -4971,7 +4883,7 @@ done: free(mapv); if (!err) fprintf(stderr, "OK"); - if (*btf_log_buf && (err || args.always_log)) + if (*btf_log_buf && (err || always_log)) fprintf(stderr, "\n%s", btf_log_buf); if (btf_fd != -1) close(btf_fd); @@ -4981,14 +4893,11 @@ done: fclose(pin_file); unlink(pin_path); free(line); - - return err; } -static int test_pprint(void) +static void test_pprint(void) { unsigned int i; - int err = 0; /* test various maps with the first test template */ for (i = 0; i < ARRAY_SIZE(pprint_tests_meta); i++) { @@ -4999,7 +4908,7 @@ static int test_pprint(void) pprint_test_template[0].lossless_map = pprint_tests_meta[i].lossless_map; pprint_test_template[0].percpu_map = pprint_tests_meta[i].percpu_map; - err |= count_result(do_test_pprint(0)); + do_test_pprint(0); } /* test rest test templates with the first map */ @@ -5010,10 +4919,8 @@ static int test_pprint(void) pprint_test_template[i].ordered_map = pprint_tests_meta[0].ordered_map; pprint_test_template[i].lossless_map = pprint_tests_meta[0].lossless_map; pprint_test_template[i].percpu_map = pprint_tests_meta[0].percpu_map; - err |= count_result(do_test_pprint(i)); + do_test_pprint(i); } - - return err; } #define BPF_LINE_INFO_ENC(insn_off, file_off, line_off, line_num, line_col) \ @@ -6178,7 +6085,7 @@ done: return err; } -static int do_test_info_raw(unsigned int test_num) +static void do_test_info_raw(unsigned int test_num) { const struct prog_info_raw_test *test = &info_raw_tests[test_num - 1]; unsigned int raw_btf_size, linfo_str_off, linfo_size; @@ -6187,18 +6094,19 @@ static int do_test_info_raw(unsigned int test_num) const char *ret_next_str; union bpf_attr attr = {}; - fprintf(stderr, "BTF prog info raw test[%u] (%s): ", test_num, test->descr); + if (!test__start_subtest(test->descr)) + return; + raw_btf = btf_raw_create(&hdr_tmpl, test->raw_types, test->str_sec, test->str_sec_size, &raw_btf_size, &ret_next_str); - if (!raw_btf) - return -1; + return; *btf_log_buf = '\0'; btf_fd = bpf_load_btf(raw_btf, raw_btf_size, btf_log_buf, BTF_LOG_BUF_SIZE, - args.always_log); + always_log); free(raw_btf); if (CHECK(btf_fd == -1, "invalid btf_fd errno:%d", errno)) { @@ -6206,7 +6114,7 @@ static int do_test_info_raw(unsigned int test_num) goto done; } - if (*btf_log_buf && args.always_log) + if (*btf_log_buf && always_log) fprintf(stderr, "\n%s", btf_log_buf); *btf_log_buf = '\0'; @@ -6261,10 +6169,7 @@ static int do_test_info_raw(unsigned int test_num) goto done; done: - if (!err) - fprintf(stderr, "OK"); - - if (*btf_log_buf && (err || args.always_log)) + if (*btf_log_buf && (err || always_log)) fprintf(stderr, "\n%s", btf_log_buf); if (btf_fd != -1) @@ -6274,22 +6179,6 @@ done: if (!IS_ERR(patched_linfo)) free(patched_linfo); - - return err; -} - -static int test_info_raw(void) -{ - unsigned int i; - int err = 0; - - if (args.info_raw_test_num) - return count_result(do_test_info_raw(args.info_raw_test_num)); - - for (i = 1; i <= ARRAY_SIZE(info_raw_tests); i++) - err |= count_result(do_test_info_raw(i)); - - return err; } struct btf_raw_data { @@ -6754,7 +6643,7 @@ static void dump_btf_strings(const char *strs, __u32 len) } } -static int do_test_dedup(unsigned int test_num) +static void do_test_dedup(unsigned int test_num) { const struct btf_dedup_test *test = &dedup_tests[test_num - 1]; __u32 test_nr_types, expect_nr_types, test_btf_size, expect_btf_size; @@ -6769,13 +6658,15 @@ static int do_test_dedup(unsigned int test_num) void *raw_btf; int err = 0, i; - fprintf(stderr, "BTF dedup test[%u] (%s):", test_num, test->descr); + if (!test__start_subtest(test->descr)) + return; raw_btf = btf_raw_create(&hdr_tmpl, test->input.raw_types, test->input.str_sec, test->input.str_sec_size, &raw_btf_size, &ret_test_next_str); if (!raw_btf) - return -1; + return; + test_btf = btf__new((__u8 *)raw_btf, raw_btf_size); free(raw_btf); if (CHECK(IS_ERR(test_btf), "invalid test_btf errno:%ld", @@ -6789,7 +6680,7 @@ static int do_test_dedup(unsigned int test_num) test->expect.str_sec_size, &raw_btf_size, &ret_expect_next_str); if (!raw_btf) - return -1; + return; expect_btf = btf__new((__u8 *)raw_btf, raw_btf_size); free(raw_btf); if (CHECK(IS_ERR(expect_btf), "invalid expect_btf errno:%ld", @@ -6894,174 +6785,27 @@ static int do_test_dedup(unsigned int test_num) } done: - if (!err) - fprintf(stderr, "OK"); if (!IS_ERR(test_btf)) btf__free(test_btf); if (!IS_ERR(expect_btf)) btf__free(expect_btf); - - return err; } -static int test_dedup(void) +void test_btf(void) { - unsigned int i; - int err = 0; + int i; - if (args.dedup_test_num) - return count_result(do_test_dedup(args.dedup_test_num)); + always_log = env.verbosity > VERBOSE_NONE; + for (i = 1; i <= ARRAY_SIZE(raw_tests); i++) + do_test_raw(i); + for (i = 1; i <= ARRAY_SIZE(get_info_tests); i++) + do_test_get_info(i); + for (i = 1; i <= ARRAY_SIZE(file_tests); i++) + do_test_file(i); + for (i = 1; i <= ARRAY_SIZE(info_raw_tests); i++) + do_test_info_raw(i); for (i = 1; i <= ARRAY_SIZE(dedup_tests); i++) - err |= count_result(do_test_dedup(i)); - - return err; -} - -static void usage(const char *cmd) -{ - fprintf(stderr, "Usage: %s [-l] [[-r btf_raw_test_num (1 - %zu)] |\n" - "\t[-g btf_get_info_test_num (1 - %zu)] |\n" - "\t[-f btf_file_test_num (1 - %zu)] |\n" - "\t[-k btf_prog_info_raw_test_num (1 - %zu)] |\n" - "\t[-p (pretty print test)] |\n" - "\t[-d btf_dedup_test_num (1 - %zu)]]\n", - cmd, ARRAY_SIZE(raw_tests), ARRAY_SIZE(get_info_tests), - ARRAY_SIZE(file_tests), ARRAY_SIZE(info_raw_tests), - ARRAY_SIZE(dedup_tests)); -} - -static int parse_args(int argc, char **argv) -{ - const char *optstr = "hlpk:f:r:g:d:"; - int opt; - - while ((opt = getopt(argc, argv, optstr)) != -1) { - switch (opt) { - case 'l': - args.always_log = true; - break; - case 'f': - args.file_test_num = atoi(optarg); - args.file_test = true; - break; - case 'r': - args.raw_test_num = atoi(optarg); - args.raw_test = true; - break; - case 'g': - args.get_info_test_num = atoi(optarg); - args.get_info_test = true; - break; - case 'p': - args.pprint_test = true; - break; - case 'k': - args.info_raw_test_num = atoi(optarg); - args.info_raw_test = true; - break; - case 'd': - args.dedup_test_num = atoi(optarg); - args.dedup_test = true; - break; - case 'h': - usage(argv[0]); - exit(0); - default: - usage(argv[0]); - return -1; - } - } - - if (args.raw_test_num && - (args.raw_test_num < 1 || - args.raw_test_num > ARRAY_SIZE(raw_tests))) { - fprintf(stderr, "BTF raw test number must be [1 - %zu]\n", - ARRAY_SIZE(raw_tests)); - return -1; - } - - if (args.file_test_num && - (args.file_test_num < 1 || - args.file_test_num > ARRAY_SIZE(file_tests))) { - fprintf(stderr, "BTF file test number must be [1 - %zu]\n", - ARRAY_SIZE(file_tests)); - return -1; - } - - if (args.get_info_test_num && - (args.get_info_test_num < 1 || - args.get_info_test_num > ARRAY_SIZE(get_info_tests))) { - fprintf(stderr, "BTF get info test number must be [1 - %zu]\n", - ARRAY_SIZE(get_info_tests)); - return -1; - } - - if (args.info_raw_test_num && - (args.info_raw_test_num < 1 || - args.info_raw_test_num > ARRAY_SIZE(info_raw_tests))) { - fprintf(stderr, "BTF prog info raw test number must be [1 - %zu]\n", - ARRAY_SIZE(info_raw_tests)); - return -1; - } - - if (args.dedup_test_num && - (args.dedup_test_num < 1 || - args.dedup_test_num > ARRAY_SIZE(dedup_tests))) { - fprintf(stderr, "BTF dedup test number must be [1 - %zu]\n", - ARRAY_SIZE(dedup_tests)); - return -1; - } - - return 0; -} - -static void print_summary(void) -{ - fprintf(stderr, "PASS:%u SKIP:%u FAIL:%u\n", - pass_cnt - skip_cnt, skip_cnt, error_cnt); -} - -int main(int argc, char **argv) -{ - int err = 0; - - err = parse_args(argc, argv); - if (err) - return err; - - if (args.always_log) - libbpf_set_print(__base_pr); - - if (args.raw_test) - err |= test_raw(); - - if (args.get_info_test) - err |= test_get_info(); - - if (args.file_test) - err |= test_file(); - - if (args.pprint_test) - err |= test_pprint(); - - if (args.info_raw_test) - err |= test_info_raw(); - - if (args.dedup_test) - err |= test_dedup(); - - if (args.raw_test || args.get_info_test || args.file_test || - args.pprint_test || args.info_raw_test || args.dedup_test) - goto done; - - err |= test_raw(); - err |= test_get_info(); - err |= test_file(); - err |= test_info_raw(); - err |= test_dedup(); - -done: - print_summary(); - return err; + do_test_dedup(i); + test_pprint(); } From 0d4ddce300bd9031a36b55451045d505ebf7cae2 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 16 Sep 2020 23:10:04 +0200 Subject: [PATCH 67/95] bpf, x64: use %rcx instead of %rax for tail call retpolines Currently, %rax is used to store the jump target when BPF program is emitting the retpoline instructions that are handling the indirect tailcall. There is a plan to use %rax for different purpose, which is storing the tail call counter. In order to preserve this value across the tailcalls, adjust the BPF indirect tailcalls so that the target program will reside in %rcx and teach the retpoline instructions about new location of jump target. Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov --- arch/x86/include/asm/nospec-branch.h | 16 ++++++++-------- arch/x86/net/bpf_jit_comp.c | 20 ++++++++++---------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index e7752b4038ff..e491c3d9f227 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -314,19 +314,19 @@ static inline void mds_idle_clear_cpu_buffers(void) * lfence * jmp spec_trap * do_rop: - * mov %rax,(%rsp) for x86_64 + * mov %rcx,(%rsp) for x86_64 * mov %edx,(%esp) for x86_32 * retq * * Without retpolines configured: * - * jmp *%rax for x86_64 + * jmp *%rcx for x86_64 * jmp *%edx for x86_32 */ #ifdef CONFIG_RETPOLINE # ifdef CONFIG_X86_64 -# define RETPOLINE_RAX_BPF_JIT_SIZE 17 -# define RETPOLINE_RAX_BPF_JIT() \ +# define RETPOLINE_RCX_BPF_JIT_SIZE 17 +# define RETPOLINE_RCX_BPF_JIT() \ do { \ EMIT1_off32(0xE8, 7); /* callq do_rop */ \ /* spec_trap: */ \ @@ -334,7 +334,7 @@ do { \ EMIT3(0x0F, 0xAE, 0xE8); /* lfence */ \ EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \ /* do_rop: */ \ - EMIT4(0x48, 0x89, 0x04, 0x24); /* mov %rax,(%rsp) */ \ + EMIT4(0x48, 0x89, 0x0C, 0x24); /* mov %rcx,(%rsp) */ \ EMIT1(0xC3); /* retq */ \ } while (0) # else /* !CONFIG_X86_64 */ @@ -352,9 +352,9 @@ do { \ # endif #else /* !CONFIG_RETPOLINE */ # ifdef CONFIG_X86_64 -# define RETPOLINE_RAX_BPF_JIT_SIZE 2 -# define RETPOLINE_RAX_BPF_JIT() \ - EMIT2(0xFF, 0xE0); /* jmp *%rax */ +# define RETPOLINE_RCX_BPF_JIT_SIZE 2 +# define RETPOLINE_RCX_BPF_JIT() \ + EMIT2(0xFF, 0xE1); /* jmp *%rcx */ # else /* !CONFIG_X86_64 */ # define RETPOLINE_EDX_BPF_JIT() \ EMIT2(0xFF, 0xE2) /* jmp *%edx */ diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 7d9ea7b41c71..6fb8c9435980 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -370,7 +370,7 @@ static void emit_bpf_tail_call_indirect(u8 **pprog) EMIT2(0x89, 0xD2); /* mov edx, edx */ EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */ offsetof(struct bpf_array, map.max_entries)); -#define OFFSET1 (41 + RETPOLINE_RAX_BPF_JIT_SIZE) /* Number of bytes to jump */ +#define OFFSET1 (41 + RETPOLINE_RCX_BPF_JIT_SIZE) /* Number of bytes to jump */ EMIT2(X86_JBE, OFFSET1); /* jbe out */ label1 = cnt; @@ -380,36 +380,36 @@ static void emit_bpf_tail_call_indirect(u8 **pprog) */ EMIT2_off32(0x8B, 0x85, -36 - MAX_BPF_STACK); /* mov eax, dword ptr [rbp - 548] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ -#define OFFSET2 (30 + RETPOLINE_RAX_BPF_JIT_SIZE) +#define OFFSET2 (30 + RETPOLINE_RCX_BPF_JIT_SIZE) EMIT2(X86_JA, OFFSET2); /* ja out */ label2 = cnt; EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ EMIT2_off32(0x89, 0x85, -36 - MAX_BPF_STACK); /* mov dword ptr [rbp -548], eax */ /* prog = array->ptrs[index]; */ - EMIT4_off32(0x48, 0x8B, 0x84, 0xD6, /* mov rax, [rsi + rdx * 8 + offsetof(...)] */ + EMIT4_off32(0x48, 0x8B, 0x8C, 0xD6, /* mov rcx, [rsi + rdx * 8 + offsetof(...)] */ offsetof(struct bpf_array, ptrs)); /* * if (prog == NULL) * goto out; */ - EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */ -#define OFFSET3 (8 + RETPOLINE_RAX_BPF_JIT_SIZE) + EMIT3(0x48, 0x85, 0xC9); /* test rcx,rcx */ +#define OFFSET3 (8 + RETPOLINE_RCX_BPF_JIT_SIZE) EMIT2(X86_JE, OFFSET3); /* je out */ label3 = cnt; /* goto *(prog->bpf_func + prologue_size); */ - EMIT4(0x48, 0x8B, 0x40, /* mov rax, qword ptr [rax + 32] */ + EMIT4(0x48, 0x8B, 0x49, /* mov rcx, qword ptr [rcx + 32] */ offsetof(struct bpf_prog, bpf_func)); - EMIT4(0x48, 0x83, 0xC0, PROLOGUE_SIZE); /* add rax, prologue_size */ + EMIT4(0x48, 0x83, 0xC1, PROLOGUE_SIZE); /* add rcx, prologue_size */ /* - * Wow we're ready to jump into next BPF program + * Now we're ready to jump into next BPF program * rdi == ctx (1st arg) - * rax == prog->bpf_func + prologue_size + * rcx == prog->bpf_func + prologue_size */ - RETPOLINE_RAX_BPF_JIT(); + RETPOLINE_RCX_BPF_JIT(); /* out: */ BUILD_BUG_ON(cnt - label1 != OFFSET1); From a748c6975dea325da540610c2ba9b5f332c603e6 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 16 Sep 2020 23:10:05 +0200 Subject: [PATCH 68/95] bpf: propagate poke descriptors to subprograms Previously, there was no need for poke descriptors being present in subprogram's bpf_prog_aux struct since tailcalls were simply not allowed in them. Each subprog is JITed independently so in order to enable JITing subprograms that use tailcalls, do the following: - in fixup_bpf_calls() store the index of tailcall insn onto the generated poke descriptor, - in case when insn patching occurs, adjust the tailcall insn idx from bpf_patch_insn_data, - then in jit_subprogs() check whether the given poke descriptor belongs to the current subprog by checking if that previously stored absolute index of tail call insn is in the scope of the insns of given subprog, - update the insn->imm with new poke descriptor slot so that while JITing the proper poke descriptor will be grabbed This way each of the main program's poke descriptors are distributed across the subprograms poke descriptor array, so main program's descriptors can be untracked out of the prog array map. Add also subprog's aux struct to the BPF map poke_progs list by calling on it map_poke_track(). In case of any error, call the map_poke_untrack() on subprog's aux structs that have already been registered to prog array map. Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/verifier.c | 66 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 64 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5dcce0364634..a23e5eb728c8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -707,6 +707,7 @@ struct bpf_jit_poke_descriptor { bool ip_stable; u8 adj_off; u16 reason; + u32 insn_idx; }; /* reg_type info for ctx arguments */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 814bc6c1ad16..8a18756953de 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9623,6 +9623,18 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len } } +static void adjust_poke_descs(struct bpf_prog *prog, u32 len) +{ + struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab; + int i, sz = prog->aux->size_poke_tab; + struct bpf_jit_poke_descriptor *desc; + + for (i = 0; i < sz; i++) { + desc = &tab[i]; + desc->insn_idx += len - 1; + } +} + static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, const struct bpf_insn *patch, u32 len) { @@ -9639,6 +9651,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of if (adjust_insn_aux_data(env, new_prog, off, len)) return NULL; adjust_subprog_starts(env, off, len); + adjust_poke_descs(new_prog, len); return new_prog; } @@ -10169,6 +10182,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog, **func, *tmp; int i, j, subprog_start, subprog_end = 0, len, subprog; + struct bpf_map *map_ptr; struct bpf_insn *insn; void *old_bpf_func; int err, num_exentries; @@ -10236,6 +10250,31 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->btf = prog->aux->btf; func[i]->aux->func_info = prog->aux->func_info; + for (j = 0; j < prog->aux->size_poke_tab; j++) { + u32 insn_idx = prog->aux->poke_tab[j].insn_idx; + int ret; + + if (!(insn_idx >= subprog_start && + insn_idx <= subprog_end)) + continue; + + ret = bpf_jit_add_poke_descriptor(func[i], + &prog->aux->poke_tab[j]); + if (ret < 0) { + verbose(env, "adding tail call poke descriptor failed\n"); + goto out_free; + } + + func[i]->insnsi[insn_idx - subprog_start].imm = ret + 1; + + map_ptr = func[i]->aux->poke_tab[ret].tail_call.map; + ret = map_ptr->ops->map_poke_track(map_ptr, func[i]->aux); + if (ret < 0) { + verbose(env, "tracking tail call prog failed\n"); + goto out_free; + } + } + /* Use bpf_prog_F_tag to indicate functions in stack traces. * Long term would need debug info to populate names */ @@ -10261,6 +10300,19 @@ static int jit_subprogs(struct bpf_verifier_env *env) } cond_resched(); } + + /* Untrack main program's aux structs so that during map_poke_run() + * we will not stumble upon the unfilled poke descriptors; each + * of the main program's poke descs got distributed across subprogs + * and got tracked onto map, so we are sure that none of them will + * be missed after the operation below + */ + for (i = 0; i < prog->aux->size_poke_tab; i++) { + map_ptr = prog->aux->poke_tab[i].tail_call.map; + + map_ptr->ops->map_poke_untrack(map_ptr, prog->aux); + } + /* at this point all bpf functions were successfully JITed * now populate all bpf_calls with correct addresses and * run last pass of JIT @@ -10329,9 +10381,16 @@ static int jit_subprogs(struct bpf_verifier_env *env) bpf_prog_free_unused_jited_linfo(prog); return 0; out_free: - for (i = 0; i < env->subprog_cnt; i++) - if (func[i]) - bpf_jit_free(func[i]); + for (i = 0; i < env->subprog_cnt; i++) { + if (!func[i]) + continue; + + for (j = 0; j < func[i]->aux->size_poke_tab; j++) { + map_ptr = func[i]->aux->poke_tab[j].tail_call.map; + map_ptr->ops->map_poke_untrack(map_ptr, func[i]->aux); + } + bpf_jit_free(func[i]); + } kfree(func); out_undo_insn: /* cleanup main prog to be interpreted */ @@ -10549,6 +10608,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) .reason = BPF_POKE_REASON_TAIL_CALL, .tail_call.map = BPF_MAP_PTR(aux->map_ptr_state), .tail_call.key = bpf_map_key_immediate(aux), + .insn_idx = i + delta, }; ret = bpf_jit_add_poke_descriptor(prog, &desc); From cf71b174d3464c7dc22f86f25d629a8d9d5c3519 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 16 Sep 2020 23:10:06 +0200 Subject: [PATCH 69/95] bpf: rename poke descriptor's 'ip' member to 'tailcall_target' Reflect the actual purpose of poke->ip and rename it to poke->tailcall_target so that it will not the be confused with another poke target that will be introduced in next commit. While at it, do the same thing with poke->ip_stable - rename it to poke->tailcall_target_stable. Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 20 +++++++++++--------- include/linux/bpf.h | 4 ++-- kernel/bpf/arraymap.c | 17 +++++++++-------- kernel/bpf/core.c | 3 ++- 4 files changed, 24 insertions(+), 20 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 6fb8c9435980..7b0ff169c9a0 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -434,7 +434,7 @@ static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke, EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ EMIT2_off32(0x89, 0x85, -36 - MAX_BPF_STACK); /* mov dword ptr [rbp -548], eax */ - poke->ip = image + (addr - X86_PATCH_SIZE); + poke->tailcall_target = image + (addr - X86_PATCH_SIZE); poke->adj_off = PROLOGUE_SIZE; memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE); @@ -453,7 +453,7 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog) for (i = 0; i < prog->aux->size_poke_tab; i++) { poke = &prog->aux->poke_tab[i]; - WARN_ON_ONCE(READ_ONCE(poke->ip_stable)); + WARN_ON_ONCE(READ_ONCE(poke->tailcall_target_stable)); if (poke->reason != BPF_POKE_REASON_TAIL_CALL) continue; @@ -464,18 +464,20 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog) if (target) { /* Plain memcpy is used when image is not live yet * and still not locked as read-only. Once poke - * location is active (poke->ip_stable), any parallel - * bpf_arch_text_poke() might occur still on the - * read-write image until we finally locked it as - * read-only. Both modifications on the given image - * are under text_mutex to avoid interference. + * location is active (poke->tailcall_target_stable), + * any parallel bpf_arch_text_poke() might occur + * still on the read-write image until we finally + * locked it as read-only. Both modifications on + * the given image are under text_mutex to avoid + * interference. */ - ret = __bpf_arch_text_poke(poke->ip, BPF_MOD_JUMP, NULL, + ret = __bpf_arch_text_poke(poke->tailcall_target, + BPF_MOD_JUMP, NULL, (u8 *)target->bpf_func + poke->adj_off, false); BUG_ON(ret < 0); } - WRITE_ONCE(poke->ip_stable, true); + WRITE_ONCE(poke->tailcall_target_stable, true); mutex_unlock(&array->aux->poke_mutex); } } diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a23e5eb728c8..f3790c9cf542 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -697,14 +697,14 @@ enum bpf_jit_poke_reason { /* Descriptor of pokes pointing /into/ the JITed image. */ struct bpf_jit_poke_descriptor { - void *ip; + void *tailcall_target; union { struct { struct bpf_map *map; u32 key; } tail_call; }; - bool ip_stable; + bool tailcall_target_stable; u8 adj_off; u16 reason; u32 insn_idx; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index e046fb7d17cd..60abf7fe12de 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -918,12 +918,13 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, * there could be danger of use after free otherwise. * 2) Initially when we start tracking aux, the program * is not JITed yet and also does not have a kallsyms - * entry. We skip these as poke->ip_stable is not - * active yet. The JIT will do the final fixup before - * setting it stable. The various poke->ip_stable are - * successively activated, so tail call updates can - * arrive from here while JIT is still finishing its - * final fixup for non-activated poke entries. + * entry. We skip these as poke->tailcall_target_stable + * is not active yet. The JIT will do the final fixup + * before setting it stable. The various + * poke->tailcall_target_stable are successively + * activated, so tail call updates can arrive from here + * while JIT is still finishing its final fixup for + * non-activated poke entries. * 3) On program teardown, the program's kallsym entry gets * removed out of RCU callback, but we can only untrack * from sleepable context, therefore bpf_arch_text_poke() @@ -940,7 +941,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, * 5) Any other error happening below from bpf_arch_text_poke() * is a unexpected bug. */ - if (!READ_ONCE(poke->ip_stable)) + if (!READ_ONCE(poke->tailcall_target_stable)) continue; if (poke->reason != BPF_POKE_REASON_TAIL_CALL) continue; @@ -948,7 +949,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, poke->tail_call.key != key) continue; - ret = bpf_arch_text_poke(poke->ip, BPF_MOD_JUMP, + ret = bpf_arch_text_poke(poke->tailcall_target, BPF_MOD_JUMP, old ? (u8 *)old->bpf_func + poke->adj_off : NULL, new ? (u8 *)new->bpf_func + diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2a20c2833996..2e00ac028d38 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -775,7 +775,8 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, if (size > poke_tab_max) return -ENOSPC; - if (poke->ip || poke->ip_stable || poke->adj_off) + if (poke->tailcall_target || poke->tailcall_target_stable || + poke->adj_off) return -EINVAL; switch (poke->reason) { From 7f6e4312e15a5c370e84eaa685879b6bdcc717e4 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 16 Sep 2020 23:10:07 +0200 Subject: [PATCH 70/95] bpf: Limit caller's stack depth 256 for subprogs with tailcalls Protect against potential stack overflow that might happen when bpf2bpf calls get combined with tailcalls. Limit the caller's stack depth for such case down to 256 so that the worst case scenario would result in 8k stack size (32 which is tailcall limit * 256 = 8k). Suggested-by: Alexei Starovoitov Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 53c7bd568c5d..5026b75db972 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -358,6 +358,7 @@ struct bpf_subprog_info { u32 start; /* insn idx of function entry point */ u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u16 stack_depth; /* max. stack depth used by this function */ + bool has_tail_call; }; /* single container for all structs diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8a18756953de..0958fba48d59 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1490,6 +1490,10 @@ static int check_subprogs(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++) { u8 code = insn[i].code; + if (code == (BPF_JMP | BPF_CALL) && + insn[i].imm == BPF_FUNC_tail_call && + insn[i].src_reg != BPF_PSEUDO_CALL) + subprog[cur_subprog].has_tail_call = true; if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) goto next; if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) @@ -2983,6 +2987,31 @@ static int check_max_stack_depth(struct bpf_verifier_env *env) int ret_prog[MAX_CALL_FRAMES]; process_func: + /* protect against potential stack overflow that might happen when + * bpf2bpf calls get combined with tailcalls. Limit the caller's stack + * depth for such case down to 256 so that the worst case scenario + * would result in 8k stack size (32 which is tailcall limit * 256 = + * 8k). + * + * To get the idea what might happen, see an example: + * func1 -> sub rsp, 128 + * subfunc1 -> sub rsp, 256 + * tailcall1 -> add rsp, 256 + * func2 -> sub rsp, 192 (total stack size = 128 + 192 = 320) + * subfunc2 -> sub rsp, 64 + * subfunc22 -> sub rsp, 128 + * tailcall2 -> add rsp, 128 + * func3 -> sub rsp, 32 (total stack size 128 + 192 + 64 + 32 = 416) + * + * tailcall will unwind the current stack frame but it will not get rid + * of caller's stack as shown on the example above. + */ + if (idx && subprog[idx].has_tail_call && depth >= 256) { + verbose(env, + "tail_calls are not allowed when call stack of previous frames is %d bytes. Too large\n", + depth); + return -EACCES; + } /* round up to 32-bytes, since this is granularity * of interpreter stack size */ From ebf7d1f508a73871acf3b2bfbfa1323a477acdb3 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 16 Sep 2020 23:10:08 +0200 Subject: [PATCH 71/95] bpf, x64: rework pro/epilogue and tailcall handling in JIT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit serves two things: 1) it optimizes BPF prologue/epilogue generation 2) it makes possible to have tailcalls within BPF subprogram Both points are related to each other since without 1), 2) could not be achieved. In [1], Alexei says: "The prologue will look like: nop5 xor eax,eax  // two new bytes if bpf_tail_call() is used in this // function push rbp mov rbp, rsp sub rsp, rounded_stack_depth push rax // zero init tail_call counter variable number of push rbx,r13,r14,r15 Then bpf_tail_call will pop variable number rbx,.. and final 'pop rax' Then 'add rsp, size_of_current_stack_frame' jmp to next function and skip over 'nop5; xor eax,eax; push rpb; mov rbp, rsp' This way new function will set its own stack size and will init tail call counter with whatever value the parent had. If next function doesn't use bpf_tail_call it won't have 'xor eax,eax'. Instead it would need to have 'nop2' in there." Implement that suggestion. Since the layout of stack is changed, tail call counter handling can not rely anymore on popping it to rbx just like it have been handled for constant prologue case and later overwrite of rbx with actual value of rbx pushed to stack. Therefore, let's use one of the register (%rcx) that is considered to be volatile/caller-saved and pop the value of tail call counter in there in the epilogue. Drop the BUILD_BUG_ON in emit_prologue and in emit_bpf_tail_call_indirect where instruction layout is not constant anymore. Introduce new poke target, 'tailcall_bypass' to poke descriptor that is dedicated for skipping the register pops and stack unwind that are generated right before the actual jump to target program. For case when the target program is not present, BPF program will skip the pop instructions and nop5 dedicated for jmpq $target. An example of such state when only R6 of callee saved registers is used by program: ffffffffc0513aa1: e9 0e 00 00 00 jmpq 0xffffffffc0513ab4 ffffffffc0513aa6: 5b pop %rbx ffffffffc0513aa7: 58 pop %rax ffffffffc0513aa8: 48 81 c4 00 00 00 00 add $0x0,%rsp ffffffffc0513aaf: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) ffffffffc0513ab4: 48 89 df mov %rbx,%rdi When target program is inserted, the jump that was there to skip pops/nop5 will become the nop5, so CPU will go over pops and do the actual tailcall. One might ask why there simply can not be pushes after the nop5? In the following example snippet: ffffffffc037030c: 48 89 fb mov %rdi,%rbx (...) ffffffffc0370332: 5b pop %rbx ffffffffc0370333: 58 pop %rax ffffffffc0370334: 48 81 c4 00 00 00 00 add $0x0,%rsp ffffffffc037033b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) ffffffffc0370340: 48 81 ec 00 00 00 00 sub $0x0,%rsp ffffffffc0370347: 50 push %rax ffffffffc0370348: 53 push %rbx ffffffffc0370349: 48 89 df mov %rbx,%rdi ffffffffc037034c: e8 f7 21 00 00 callq 0xffffffffc0372548 There is the bpf2bpf call (at ffffffffc037034c) right after the tailcall and jump target is not present. ctx is in %rbx register and BPF subprogram that we will call into on ffffffffc037034c is relying on it, e.g. it will pick ctx from there. Such code layout is therefore broken as we would overwrite the content of %rbx with the value that was pushed on the prologue. That is the reason for the 'bypass' approach. Special care needs to be taken during the install/update/remove of tailcall target. In case when target program is not present, the CPU must not execute the pop instructions that precede the tailcall. To address that, the following states can be defined: A nop, unwind, nop B nop, unwind, tail C skip, unwind, nop D skip, unwind, tail A is forbidden (lead to incorrectness). The state transitions between tailcall install/update/remove will work as follows: First install tail call f: C->D->B(f) * poke the tailcall, after that get rid of the skip Update tail call f to f': B(f)->B(f') * poke the tailcall (poke->tailcall_target) and do NOT touch the poke->tailcall_bypass Remove tail call: B(f')->C(f') * poke->tailcall_bypass is poked back to jump, then we wait the RCU grace period so that other programs will finish its execution and after that we are safe to remove the poke->tailcall_target Install new tail call (f''): C(f')->D(f'')->B(f''). * same as first step This way CPU can never be exposed to "unwind, tail" state. Last but not least, when tailcalls get mixed with bpf2bpf calls, it would be possible to encounter the endless loop due to clearing the tailcall counter if for example we would use the tailcall3-like from BPF selftests program that would be subprogram-based, meaning the tailcall would be present within the BPF subprogram. This test, broken down to particular steps, would do: entry -> set tailcall counter to 0, bump it by 1, tailcall to func0 func0 -> call subprog_tail (we are NOT skipping the first 11 bytes of prologue and this subprogram has a tailcall, therefore we clear the counter...) subprog -> do the same thing as entry and then loop forever. To address this, the idea is to go through the call chain of bpf2bpf progs and look for a tailcall presence throughout whole chain. If we saw a single tail call then each node in this call chain needs to be marked as a subprog that can reach the tailcall. We would later feed the JIT with this info and: - set eax to 0 only when tailcall is reachable and this is the entry prog - if tailcall is reachable but there's no tailcall in insns of currently JITed prog then push rax anyway, so that it will be possible to propagate further down the call chain - finally if tailcall is reachable, then we need to precede the 'call' insn with mov rax, [rbp - (stack_depth + 8)] Tail call related cases from test_verifier kselftest are also working fine. Sample BPF programs that utilize tail calls (sockex3, tracex5) work properly as well. [1]: https://lore.kernel.org/bpf/20200517043227.2gpq22ifoq37ogst@ast-mbp.dhcp.thefacebook.com/ Suggested-by: Alexei Starovoitov Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 237 ++++++++++++++++++++++++++++------- include/linux/bpf.h | 3 + include/linux/bpf_verifier.h | 1 + kernel/bpf/arraymap.c | 40 +++++- kernel/bpf/core.c | 2 +- kernel/bpf/verifier.c | 16 +++ 6 files changed, 244 insertions(+), 55 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 7b0ff169c9a0..26f43279b78b 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -221,14 +221,48 @@ struct jit_context { /* Number of bytes emit_patch() needs to generate instructions */ #define X86_PATCH_SIZE 5 +/* Number of bytes that will be skipped on tailcall */ +#define X86_TAIL_CALL_OFFSET 11 -#define PROLOGUE_SIZE 25 +static void push_callee_regs(u8 **pprog, bool *callee_regs_used) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (callee_regs_used[0]) + EMIT1(0x53); /* push rbx */ + if (callee_regs_used[1]) + EMIT2(0x41, 0x55); /* push r13 */ + if (callee_regs_used[2]) + EMIT2(0x41, 0x56); /* push r14 */ + if (callee_regs_used[3]) + EMIT2(0x41, 0x57); /* push r15 */ + *pprog = prog; +} + +static void pop_callee_regs(u8 **pprog, bool *callee_regs_used) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (callee_regs_used[3]) + EMIT2(0x41, 0x5F); /* pop r15 */ + if (callee_regs_used[2]) + EMIT2(0x41, 0x5E); /* pop r14 */ + if (callee_regs_used[1]) + EMIT2(0x41, 0x5D); /* pop r13 */ + if (callee_regs_used[0]) + EMIT1(0x5B); /* pop rbx */ + *pprog = prog; +} /* - * Emit x86-64 prologue code for BPF program and check its size. - * bpf_tail_call helper will skip it while jumping into another program + * Emit x86-64 prologue code for BPF program. + * bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes + * while jumping to another program */ -static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) +static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, + bool tail_call_reachable, bool is_subprog) { u8 *prog = *pprog; int cnt = X86_PATCH_SIZE; @@ -238,19 +272,18 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) */ memcpy(prog, ideal_nops[NOP_ATOMIC5], cnt); prog += cnt; + if (!ebpf_from_cbpf) { + if (tail_call_reachable && !is_subprog) + EMIT2(0x31, 0xC0); /* xor eax, eax */ + else + EMIT2(0x66, 0x90); /* nop2 */ + } EMIT1(0x55); /* push rbp */ EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ /* sub rsp, rounded_stack_depth */ EMIT3_off32(0x48, 0x81, 0xEC, round_up(stack_depth, 8)); - EMIT1(0x53); /* push rbx */ - EMIT2(0x41, 0x55); /* push r13 */ - EMIT2(0x41, 0x56); /* push r14 */ - EMIT2(0x41, 0x57); /* push r15 */ - if (!ebpf_from_cbpf) { - /* zero init tail_call_cnt */ - EMIT2(0x6a, 0x00); - BUILD_BUG_ON(cnt != PROLOGUE_SIZE); - } + if (tail_call_reachable) + EMIT1(0x50); /* push rax */ *pprog = prog; } @@ -314,13 +347,14 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, mutex_lock(&text_mutex); if (memcmp(ip, old_insn, X86_PATCH_SIZE)) goto out; + ret = 1; if (memcmp(ip, new_insn, X86_PATCH_SIZE)) { if (text_live) text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL); else memcpy(ip, new_insn, X86_PATCH_SIZE); + ret = 0; } - ret = 0; out: mutex_unlock(&text_mutex); return ret; @@ -337,6 +371,22 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, return __bpf_arch_text_poke(ip, t, old_addr, new_addr, true); } +static int get_pop_bytes(bool *callee_regs_used) +{ + int bytes = 0; + + if (callee_regs_used[3]) + bytes += 2; + if (callee_regs_used[2]) + bytes += 2; + if (callee_regs_used[1]) + bytes += 2; + if (callee_regs_used[0]) + bytes += 1; + + return bytes; +} + /* * Generate the following code: * @@ -351,12 +401,26 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, * goto *(prog->bpf_func + prologue_size); * out: */ -static void emit_bpf_tail_call_indirect(u8 **pprog) +static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, + u32 stack_depth) { + int tcc_off = -4 - round_up(stack_depth, 8); u8 *prog = *pprog; - int label1, label2, label3; + int pop_bytes = 0; + int off1 = 49; + int off2 = 38; + int off3 = 16; int cnt = 0; + /* count the additional bytes used for popping callee regs from stack + * that need to be taken into account for each of the offsets that + * are used for bailing out of the tail call + */ + pop_bytes = get_pop_bytes(callee_regs_used); + off1 += pop_bytes; + off2 += pop_bytes; + off3 += pop_bytes; + /* * rdi - pointer to ctx * rsi - pointer to bpf_array @@ -370,21 +434,19 @@ static void emit_bpf_tail_call_indirect(u8 **pprog) EMIT2(0x89, 0xD2); /* mov edx, edx */ EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */ offsetof(struct bpf_array, map.max_entries)); -#define OFFSET1 (41 + RETPOLINE_RCX_BPF_JIT_SIZE) /* Number of bytes to jump */ +#define OFFSET1 (off1 + RETPOLINE_RCX_BPF_JIT_SIZE) /* Number of bytes to jump */ EMIT2(X86_JBE, OFFSET1); /* jbe out */ - label1 = cnt; /* * if (tail_call_cnt > MAX_TAIL_CALL_CNT) * goto out; */ - EMIT2_off32(0x8B, 0x85, -36 - MAX_BPF_STACK); /* mov eax, dword ptr [rbp - 548] */ + EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ -#define OFFSET2 (30 + RETPOLINE_RCX_BPF_JIT_SIZE) +#define OFFSET2 (off2 + RETPOLINE_RCX_BPF_JIT_SIZE) EMIT2(X86_JA, OFFSET2); /* ja out */ - label2 = cnt; EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ - EMIT2_off32(0x89, 0x85, -36 - MAX_BPF_STACK); /* mov dword ptr [rbp -548], eax */ + EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */ /* prog = array->ptrs[index]; */ EMIT4_off32(0x48, 0x8B, 0x8C, 0xD6, /* mov rcx, [rsi + rdx * 8 + offsetof(...)] */ @@ -394,48 +456,84 @@ static void emit_bpf_tail_call_indirect(u8 **pprog) * if (prog == NULL) * goto out; */ - EMIT3(0x48, 0x85, 0xC9); /* test rcx,rcx */ -#define OFFSET3 (8 + RETPOLINE_RCX_BPF_JIT_SIZE) + EMIT3(0x48, 0x85, 0xC9); /* test rcx,rcx */ +#define OFFSET3 (off3 + RETPOLINE_RCX_BPF_JIT_SIZE) EMIT2(X86_JE, OFFSET3); /* je out */ - label3 = cnt; - /* goto *(prog->bpf_func + prologue_size); */ + *pprog = prog; + pop_callee_regs(pprog, callee_regs_used); + prog = *pprog; + + EMIT1(0x58); /* pop rax */ + EMIT3_off32(0x48, 0x81, 0xC4, /* add rsp, sd */ + round_up(stack_depth, 8)); + + /* goto *(prog->bpf_func + X86_TAIL_CALL_OFFSET); */ EMIT4(0x48, 0x8B, 0x49, /* mov rcx, qword ptr [rcx + 32] */ offsetof(struct bpf_prog, bpf_func)); - EMIT4(0x48, 0x83, 0xC1, PROLOGUE_SIZE); /* add rcx, prologue_size */ - + EMIT4(0x48, 0x83, 0xC1, /* add rcx, X86_TAIL_CALL_OFFSET */ + X86_TAIL_CALL_OFFSET); /* * Now we're ready to jump into next BPF program * rdi == ctx (1st arg) - * rcx == prog->bpf_func + prologue_size + * rcx == prog->bpf_func + X86_TAIL_CALL_OFFSET */ RETPOLINE_RCX_BPF_JIT(); /* out: */ - BUILD_BUG_ON(cnt - label1 != OFFSET1); - BUILD_BUG_ON(cnt - label2 != OFFSET2); - BUILD_BUG_ON(cnt - label3 != OFFSET3); *pprog = prog; } static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke, - u8 **pprog, int addr, u8 *image) + u8 **pprog, int addr, u8 *image, + bool *callee_regs_used, u32 stack_depth) { + int tcc_off = -4 - round_up(stack_depth, 8); u8 *prog = *pprog; + int pop_bytes = 0; + int off1 = 27; + int poke_off; int cnt = 0; + /* count the additional bytes used for popping callee regs to stack + * that need to be taken into account for jump offset that is used for + * bailing out from of the tail call when limit is reached + */ + pop_bytes = get_pop_bytes(callee_regs_used); + off1 += pop_bytes; + + /* + * total bytes for: + * - nop5/ jmpq $off + * - pop callee regs + * - sub rsp, $val + * - pop rax + */ + poke_off = X86_PATCH_SIZE + pop_bytes + 7 + 1; + /* * if (tail_call_cnt > MAX_TAIL_CALL_CNT) * goto out; */ - EMIT2_off32(0x8B, 0x85, -36 - MAX_BPF_STACK); /* mov eax, dword ptr [rbp - 548] */ + EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ - EMIT2(X86_JA, 14); /* ja out */ + EMIT2(X86_JA, off1); /* ja out */ EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ - EMIT2_off32(0x89, 0x85, -36 - MAX_BPF_STACK); /* mov dword ptr [rbp -548], eax */ + EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */ + poke->tailcall_bypass = image + (addr - poke_off - X86_PATCH_SIZE); + poke->adj_off = X86_TAIL_CALL_OFFSET; poke->tailcall_target = image + (addr - X86_PATCH_SIZE); - poke->adj_off = PROLOGUE_SIZE; + poke->bypass_addr = (u8 *)poke->tailcall_target + X86_PATCH_SIZE; + + emit_jump(&prog, (u8 *)poke->tailcall_target + X86_PATCH_SIZE, + poke->tailcall_bypass); + + *pprog = prog; + pop_callee_regs(pprog, callee_regs_used); + prog = *pprog; + EMIT1(0x58); /* pop rax */ + EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8)); memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE); prog += X86_PATCH_SIZE; @@ -476,6 +574,11 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog) (u8 *)target->bpf_func + poke->adj_off, false); BUG_ON(ret < 0); + ret = __bpf_arch_text_poke(poke->tailcall_bypass, + BPF_MOD_JUMP, + (u8 *)poke->tailcall_target + + X86_PATCH_SIZE, NULL, false); + BUG_ON(ret < 0); } WRITE_ONCE(poke->tailcall_target_stable, true); mutex_unlock(&array->aux->poke_mutex); @@ -654,19 +757,49 @@ static bool ex_handler_bpf(const struct exception_table_entry *x, return true; } +static void detect_reg_usage(struct bpf_insn *insn, int insn_cnt, + bool *regs_used, bool *tail_call_seen) +{ + int i; + + for (i = 1; i <= insn_cnt; i++, insn++) { + if (insn->code == (BPF_JMP | BPF_TAIL_CALL)) + *tail_call_seen = true; + if (insn->dst_reg == BPF_REG_6 || insn->src_reg == BPF_REG_6) + regs_used[0] = true; + if (insn->dst_reg == BPF_REG_7 || insn->src_reg == BPF_REG_7) + regs_used[1] = true; + if (insn->dst_reg == BPF_REG_8 || insn->src_reg == BPF_REG_8) + regs_used[2] = true; + if (insn->dst_reg == BPF_REG_9 || insn->src_reg == BPF_REG_9) + regs_used[3] = true; + } +} + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int oldproglen, struct jit_context *ctx) { + bool tail_call_reachable = bpf_prog->aux->tail_call_reachable; struct bpf_insn *insn = bpf_prog->insnsi; + bool callee_regs_used[4] = {}; int insn_cnt = bpf_prog->len; + bool tail_call_seen = false; bool seen_exit = false; u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; int i, cnt = 0, excnt = 0; int proglen = 0; u8 *prog = temp; + detect_reg_usage(insn, insn_cnt, callee_regs_used, + &tail_call_seen); + + /* tail call's presence in current prog implies it is reachable */ + tail_call_reachable |= tail_call_seen; + emit_prologue(&prog, bpf_prog->aux->stack_depth, - bpf_prog_was_classic(bpf_prog)); + bpf_prog_was_classic(bpf_prog), tail_call_reachable, + bpf_prog->aux->func_idx != 0); + push_callee_regs(&prog, callee_regs_used); addrs[0] = prog - temp; for (i = 1; i <= insn_cnt; i++, insn++) { @@ -1104,16 +1237,27 @@ xadd: if (is_imm8(insn->off)) /* call */ case BPF_JMP | BPF_CALL: func = (u8 *) __bpf_call_base + imm32; - if (!imm32 || emit_call(&prog, func, image + addrs[i - 1])) - return -EINVAL; + if (tail_call_reachable) { + EMIT3_off32(0x48, 0x8B, 0x85, + -(bpf_prog->aux->stack_depth + 8)); + if (!imm32 || emit_call(&prog, func, image + addrs[i - 1] + 7)) + return -EINVAL; + } else { + if (!imm32 || emit_call(&prog, func, image + addrs[i - 1])) + return -EINVAL; + } break; case BPF_JMP | BPF_TAIL_CALL: if (imm32) emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1], - &prog, addrs[i], image); + &prog, addrs[i], image, + callee_regs_used, + bpf_prog->aux->stack_depth); else - emit_bpf_tail_call_indirect(&prog); + emit_bpf_tail_call_indirect(&prog, + callee_regs_used, + bpf_prog->aux->stack_depth); break; /* cond jump */ @@ -1296,12 +1440,9 @@ emit_jmp: seen_exit = true; /* Update cleanup_addr */ ctx->cleanup_addr = proglen; - if (!bpf_prog_was_classic(bpf_prog)) - EMIT1(0x5B); /* get rid of tail_call_cnt */ - EMIT2(0x41, 0x5F); /* pop r15 */ - EMIT2(0x41, 0x5E); /* pop r14 */ - EMIT2(0x41, 0x5D); /* pop r13 */ - EMIT1(0x5B); /* pop rbx */ + pop_callee_regs(&prog, callee_regs_used); + if (tail_call_reachable) + EMIT1(0x59); /* pop rcx, get rid of tail_call_cnt */ EMIT1(0xC9); /* leave */ EMIT1(0xC3); /* ret */ break; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f3790c9cf542..d7c5a6ed87e3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -698,6 +698,8 @@ enum bpf_jit_poke_reason { /* Descriptor of pokes pointing /into/ the JITed image. */ struct bpf_jit_poke_descriptor { void *tailcall_target; + void *tailcall_bypass; + void *bypass_addr; union { struct { struct bpf_map *map; @@ -738,6 +740,7 @@ struct bpf_prog_aux { bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ bool func_proto_unreliable; bool sleepable; + bool tail_call_reachable; enum bpf_tramp_prog_type trampoline_prog_type; struct bpf_trampoline *trampoline; struct hlist_node tramp_hlist; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 5026b75db972..fbc964526ba3 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -359,6 +359,7 @@ struct bpf_subprog_info { u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u16 stack_depth; /* max. stack depth used by this function */ bool has_tail_call; + bool tail_call_reachable; }; /* single container for all structs diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 60abf7fe12de..e5fd31268ae0 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -898,6 +898,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, struct bpf_prog *old, struct bpf_prog *new) { + u8 *old_addr, *new_addr, *old_bypass_addr; struct prog_poke_elem *elem; struct bpf_array_aux *aux; @@ -949,12 +950,39 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, poke->tail_call.key != key) continue; - ret = bpf_arch_text_poke(poke->tailcall_target, BPF_MOD_JUMP, - old ? (u8 *)old->bpf_func + - poke->adj_off : NULL, - new ? (u8 *)new->bpf_func + - poke->adj_off : NULL); - BUG_ON(ret < 0 && ret != -EINVAL); + old_bypass_addr = old ? NULL : poke->bypass_addr; + old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL; + new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL; + + if (new) { + ret = bpf_arch_text_poke(poke->tailcall_target, + BPF_MOD_JUMP, + old_addr, new_addr); + BUG_ON(ret < 0 && ret != -EINVAL); + if (!old) { + ret = bpf_arch_text_poke(poke->tailcall_bypass, + BPF_MOD_JUMP, + poke->bypass_addr, + NULL); + BUG_ON(ret < 0 && ret != -EINVAL); + } + } else { + ret = bpf_arch_text_poke(poke->tailcall_bypass, + BPF_MOD_JUMP, + old_bypass_addr, + poke->bypass_addr); + BUG_ON(ret < 0 && ret != -EINVAL); + /* let other CPUs finish the execution of program + * so that it will not possible to expose them + * to invalid nop, stack unwind, nop state + */ + if (!ret) + synchronize_rcu(); + ret = bpf_arch_text_poke(poke->tailcall_target, + BPF_MOD_JUMP, + old_addr, NULL); + BUG_ON(ret < 0 && ret != -EINVAL); + } } } } diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2e00ac028d38..c4811b139caa 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -776,7 +776,7 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, if (size > poke_tab_max) return -ENOSPC; if (poke->tailcall_target || poke->tailcall_target_stable || - poke->adj_off) + poke->tailcall_bypass || poke->adj_off || poke->bypass_addr) return -EINVAL; switch (poke->reason) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0958fba48d59..172e12df9eaa 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2983,8 +2983,10 @@ static int check_max_stack_depth(struct bpf_verifier_env *env) int depth = 0, frame = 0, idx = 0, i = 0, subprog_end; struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; + bool tail_call_reachable = false; int ret_insn[MAX_CALL_FRAMES]; int ret_prog[MAX_CALL_FRAMES]; + int j; process_func: /* protect against potential stack overflow that might happen when @@ -3040,6 +3042,10 @@ continue_func: i); return -EFAULT; } + + if (subprog[idx].has_tail_call) + tail_call_reachable = true; + frame++; if (frame >= MAX_CALL_FRAMES) { verbose(env, "the call stack of %d frames is too deep !\n", @@ -3048,6 +3054,15 @@ continue_func: } goto process_func; } + /* if tail call got detected across bpf2bpf calls then mark each of the + * currently present subprog frames as tail call reachable subprogs; + * this info will be utilized by JIT so that we will be preserving the + * tail call counter throughout bpf2bpf calls combined with tailcalls + */ + if (tail_call_reachable) + for (j = 0; j < frame; j++) + subprog[ret_prog[j]].tail_call_reachable = true; + /* end of for() loop means the last insn of the 'subprog' * was reached. Doesn't matter whether it was JA or EXIT */ @@ -10322,6 +10337,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) num_exentries++; } func[i]->aux->num_exentries = num_exentries; + func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; func[i] = bpf_int_jit_compile(func[i]); if (!func[i]->jited) { err = -ENOTSUPP; From e411901c0b775a3ae7f3e2505f8d2d90ac696178 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 16 Sep 2020 23:10:09 +0200 Subject: [PATCH 72/95] bpf: allow for tailcalls in BPF subprograms for x64 JIT Relax verifier's restriction that was meant to forbid tailcall usage when subprog count was higher than 1. Also, do not max out the stack depth of program that utilizes tailcalls. Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 172e12df9eaa..d1c009e8c57f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4268,6 +4268,11 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id) return false; } +static bool allow_tail_call_in_subprogs(struct bpf_verifier_env *env) +{ + return env->prog->jit_requested && IS_ENABLED(CONFIG_X86_64); +} + static int check_map_func_compatibility(struct bpf_verifier_env *env, struct bpf_map *map, int func_id) { @@ -4383,8 +4388,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_tail_call: if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) goto error; - if (env->subprog_cnt > 1) { - verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n"); + if (env->subprog_cnt > 1 && !allow_tail_call_in_subprogs(env)) { + verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n"); return -EINVAL; } break; @@ -10469,6 +10474,13 @@ static int fixup_call_args(struct bpf_verifier_env *env) return err; } #ifndef CONFIG_BPF_JIT_ALWAYS_ON + if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) { + /* When JIT fails the progs with bpf2bpf calls and tail_calls + * have to be rejected, since interpreter doesn't support them yet. + */ + verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n"); + return -EINVAL; + } for (i = 0; i < prog->len; i++, insn++) { if (insn->code != (BPF_JMP | BPF_CALL) || insn->src_reg != BPF_PSEUDO_CALL) @@ -10632,8 +10644,9 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) * the program array. */ prog->cb_access = 1; - env->prog->aux->stack_depth = MAX_BPF_STACK; - env->prog->aux->max_pkt_offset = MAX_PACKET_OFF; + if (!allow_tail_call_in_subprogs(env)) + prog->aux->stack_depth = MAX_BPF_STACK; + prog->aux->max_pkt_offset = MAX_PACKET_OFF; /* mark bpf_tail_call as different opcode to avoid * conditional branch in the interpeter for every normal From 09b28d76eac48e922dc293da1aa2b2b85c32aeee Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 17 Sep 2020 19:09:18 -0700 Subject: [PATCH 73/95] bpf: Add abnormal return checks. LD_[ABS|IND] instructions may return from the function early. bpf_tail_call pseudo instruction is either fallthrough or return. Allow them in the subprograms only when subprograms are BTF annotated and have scalar return types. Allow ld_abs and tail_call in the main program even if it calls into subprograms. In the past that was not ok to do for ld_abs, since it was JITed with special exit sequence. Since bpf_gen_ld_abs() was introduced the ld_abs looks like normal exit insn from JIT point of view, so it's safe to allow them in the main program. Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 67 ++++++++++++++------ tools/testing/selftests/bpf/verifier/calls.c | 6 +- 3 files changed, 52 insertions(+), 22 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index fbc964526ba3..2bb48a2c4d08 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -360,6 +360,7 @@ struct bpf_subprog_info { u16 stack_depth; /* max. stack depth used by this function */ bool has_tail_call; bool tail_call_reachable; + bool has_ld_abs; }; /* single container for all structs diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d1c009e8c57f..4161b6c406bc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1494,6 +1494,9 @@ static int check_subprogs(struct bpf_verifier_env *env) insn[i].imm == BPF_FUNC_tail_call && insn[i].src_reg != BPF_PSEUDO_CALL) subprog[cur_subprog].has_tail_call = true; + if (BPF_CLASS(code) == BPF_LD && + (BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND)) + subprog[cur_subprog].has_ld_abs = true; if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) goto next; if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) @@ -7514,18 +7517,6 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } - if (env->subprog_cnt > 1) { - /* when program has LD_ABS insn JITs and interpreter assume - * that r1 == ctx == skb which is not the case for callees - * that can have arbitrary arguments. It's problematic - * for main prog as well since JITs would need to analyze - * all functions in order to make proper register save/restore - * decisions in the main prog. Hence disallow LD_ABS with calls - */ - verbose(env, "BPF_LD_[ABS|IND] instructions cannot be mixed with bpf-to-bpf calls\n"); - return -EINVAL; - } - if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || BPF_SIZE(insn->code) == BPF_DW || (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { @@ -7936,6 +7927,23 @@ err_free: return ret; } +static int check_abnormal_return(struct bpf_verifier_env *env) +{ + int i; + + for (i = 1; i < env->subprog_cnt; i++) { + if (env->subprog_info[i].has_ld_abs) { + verbose(env, "LD_ABS is not allowed in subprogs without BTF\n"); + return -EINVAL; + } + if (env->subprog_info[i].has_tail_call) { + verbose(env, "tail_call is not allowed in subprogs without BTF\n"); + return -EINVAL; + } + } + return 0; +} + /* The minimum supported BTF func info size */ #define MIN_BPF_FUNCINFO_SIZE 8 #define MAX_FUNCINFO_REC_SIZE 252 @@ -7944,20 +7952,24 @@ static int check_btf_func(struct bpf_verifier_env *env, const union bpf_attr *attr, union bpf_attr __user *uattr) { + const struct btf_type *type, *func_proto, *ret_type; u32 i, nfuncs, urec_size, min_size; u32 krec_size = sizeof(struct bpf_func_info); struct bpf_func_info *krecord; struct bpf_func_info_aux *info_aux = NULL; - const struct btf_type *type; struct bpf_prog *prog; const struct btf *btf; void __user *urecord; u32 prev_offset = 0; + bool scalar_return; int ret = -ENOMEM; nfuncs = attr->func_info_cnt; - if (!nfuncs) + if (!nfuncs) { + if (check_abnormal_return(env)) + return -EINVAL; return 0; + } if (nfuncs != env->subprog_cnt) { verbose(env, "number of funcs in func_info doesn't match number of subprogs\n"); @@ -8005,25 +8017,23 @@ static int check_btf_func(struct bpf_verifier_env *env, } /* check insn_off */ + ret = -EINVAL; if (i == 0) { if (krecord[i].insn_off) { verbose(env, "nonzero insn_off %u for the first func info record", krecord[i].insn_off); - ret = -EINVAL; goto err_free; } } else if (krecord[i].insn_off <= prev_offset) { verbose(env, "same or smaller insn offset (%u) than previous func info record (%u)", krecord[i].insn_off, prev_offset); - ret = -EINVAL; goto err_free; } if (env->subprog_info[i].start != krecord[i].insn_off) { verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n"); - ret = -EINVAL; goto err_free; } @@ -8032,10 +8042,26 @@ static int check_btf_func(struct bpf_verifier_env *env, if (!type || !btf_type_is_func(type)) { verbose(env, "invalid type id %d in func info", krecord[i].type_id); - ret = -EINVAL; goto err_free; } info_aux[i].linkage = BTF_INFO_VLEN(type->info); + + func_proto = btf_type_by_id(btf, type->type); + if (unlikely(!func_proto || !btf_type_is_func_proto(func_proto))) + /* btf_func_check() already verified it during BTF load */ + goto err_free; + ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL); + scalar_return = + btf_type_is_small_int(ret_type) || btf_type_is_enum(ret_type); + if (i && !scalar_return && env->subprog_info[i].has_ld_abs) { + verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n"); + goto err_free; + } + if (i && !scalar_return && env->subprog_info[i].has_tail_call) { + verbose(env, "tail_call is only allowed in functions that return 'int'.\n"); + goto err_free; + } + prev_offset = krecord[i].insn_off; urecord += urec_size; } @@ -8196,8 +8222,11 @@ static int check_btf_info(struct bpf_verifier_env *env, struct btf *btf; int err; - if (!attr->func_info_cnt && !attr->line_info_cnt) + if (!attr->func_info_cnt && !attr->line_info_cnt) { + if (check_abnormal_return(env)) + return -EINVAL; return 0; + } btf = btf_get_by_fd(attr->prog_btf_fd); if (IS_ERR(btf)) diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 94258c6b5235..c4f5d909e58a 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -647,13 +647,14 @@ .result = REJECT, }, { - "calls: ld_abs with changing ctx data in callee", + "calls: subprog call with ld_abs in main prog", .insns = { BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), BPF_LD_ABS(BPF_B, 0), BPF_LD_ABS(BPF_H, 0), BPF_LD_ABS(BPF_W, 0), BPF_MOV64_REG(BPF_REG_7, BPF_REG_6), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 5), BPF_MOV64_REG(BPF_REG_6, BPF_REG_7), BPF_LD_ABS(BPF_B, 0), @@ -666,8 +667,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, - .errstr = "BPF_LD_[ABS|IND] instructions cannot be mixed", - .result = REJECT, + .result = ACCEPT, }, { "calls: two calls with bad fallthrough", From 3b0379111197fb97f4f46f93946fe30b22a15223 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 16 Sep 2020 23:10:10 +0200 Subject: [PATCH 74/95] selftests/bpf: Add tailcall_bpf2bpf tests Add four tests to tailcalls selftest explicitly named "tailcall_bpf2bpf_X" as their purpose is to validate that combination of tailcalls with bpf2bpf calls are working properly. These tests also validate LD_ABS from subprograms. Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/tailcalls.c | 332 ++++++++++++++++++ .../selftests/bpf/progs/tailcall_bpf2bpf1.c | 38 ++ .../selftests/bpf/progs/tailcall_bpf2bpf2.c | 41 +++ .../selftests/bpf/progs/tailcall_bpf2bpf3.c | 61 ++++ .../selftests/bpf/progs/tailcall_bpf2bpf4.c | 61 ++++ 5 files changed, 533 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/tailcall_bpf2bpf1.c create mode 100644 tools/testing/selftests/bpf/progs/tailcall_bpf2bpf2.c create mode 100644 tools/testing/selftests/bpf/progs/tailcall_bpf2bpf3.c create mode 100644 tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c index bb8fe646dd9f..ee27d68d2a1c 100644 --- a/tools/testing/selftests/bpf/prog_tests/tailcalls.c +++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include /* test_tailcall_1 checks basic functionality by patching multiple locations * in a single program for a single tail call slot with nop->jmp, jmp->nop @@ -472,6 +473,329 @@ out: bpf_object__close(obj); } +/* test_tailcall_bpf2bpf_1 purpose is to make sure that tailcalls are working + * correctly in correlation with BPF subprograms + */ +static void test_tailcall_bpf2bpf_1(void) +{ + int err, map_fd, prog_fd, main_fd, i; + struct bpf_map *prog_array; + struct bpf_program *prog; + struct bpf_object *obj; + __u32 retval, duration; + char prog_name[32]; + + err = bpf_prog_load("tailcall_bpf2bpf1.o", BPF_PROG_TYPE_SCHED_CLS, + &obj, &prog_fd); + if (CHECK_FAIL(err)) + return; + + prog = bpf_object__find_program_by_title(obj, "classifier"); + if (CHECK_FAIL(!prog)) + goto out; + + main_fd = bpf_program__fd(prog); + if (CHECK_FAIL(main_fd < 0)) + goto out; + + prog_array = bpf_object__find_map_by_name(obj, "jmp_table"); + if (CHECK_FAIL(!prog_array)) + goto out; + + map_fd = bpf_map__fd(prog_array); + if (CHECK_FAIL(map_fd < 0)) + goto out; + + /* nop -> jmp */ + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + snprintf(prog_name, sizeof(prog_name), "classifier/%i", i); + + prog = bpf_object__find_program_by_title(obj, prog_name); + if (CHECK_FAIL(!prog)) + goto out; + + prog_fd = bpf_program__fd(prog); + if (CHECK_FAIL(prog_fd < 0)) + goto out; + + err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY); + if (CHECK_FAIL(err)) + goto out; + } + + err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0, + 0, &retval, &duration); + CHECK(err || retval != 1, "tailcall", + "err %d errno %d retval %d\n", err, errno, retval); + + /* jmp -> nop, call subprog that will do tailcall */ + i = 1; + err = bpf_map_delete_elem(map_fd, &i); + if (CHECK_FAIL(err)) + goto out; + + err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0, + 0, &retval, &duration); + CHECK(err || retval != 0, "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); + + /* make sure that subprog can access ctx and entry prog that + * called this subprog can properly return + */ + i = 0; + err = bpf_map_delete_elem(map_fd, &i); + if (CHECK_FAIL(err)) + goto out; + + err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0, + 0, &retval, &duration); + CHECK(err || retval != sizeof(pkt_v4) * 2, + "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); +out: + bpf_object__close(obj); +} + +/* test_tailcall_bpf2bpf_2 checks that the count value of the tail call limit + * enforcement matches with expectations when tailcall is preceded with + * bpf2bpf call. + */ +static void test_tailcall_bpf2bpf_2(void) +{ + int err, map_fd, prog_fd, main_fd, data_fd, i, val; + struct bpf_map *prog_array, *data_map; + struct bpf_program *prog; + struct bpf_object *obj; + __u32 retval, duration; + char buff[128] = {}; + + err = bpf_prog_load("tailcall_bpf2bpf2.o", BPF_PROG_TYPE_SCHED_CLS, + &obj, &prog_fd); + if (CHECK_FAIL(err)) + return; + + prog = bpf_object__find_program_by_title(obj, "classifier"); + if (CHECK_FAIL(!prog)) + goto out; + + main_fd = bpf_program__fd(prog); + if (CHECK_FAIL(main_fd < 0)) + goto out; + + prog_array = bpf_object__find_map_by_name(obj, "jmp_table"); + if (CHECK_FAIL(!prog_array)) + goto out; + + map_fd = bpf_map__fd(prog_array); + if (CHECK_FAIL(map_fd < 0)) + goto out; + + prog = bpf_object__find_program_by_title(obj, "classifier/0"); + if (CHECK_FAIL(!prog)) + goto out; + + prog_fd = bpf_program__fd(prog); + if (CHECK_FAIL(prog_fd < 0)) + goto out; + + i = 0; + err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY); + if (CHECK_FAIL(err)) + goto out; + + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != 1, "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); + + data_map = bpf_object__find_map_by_name(obj, "tailcall.bss"); + if (CHECK_FAIL(!data_map || !bpf_map__is_internal(data_map))) + return; + + data_fd = bpf_map__fd(data_map); + if (CHECK_FAIL(map_fd < 0)) + return; + + i = 0; + err = bpf_map_lookup_elem(data_fd, &i, &val); + CHECK(err || val != 33, "tailcall count", "err %d errno %d count %d\n", + err, errno, val); + + i = 0; + err = bpf_map_delete_elem(map_fd, &i); + if (CHECK_FAIL(err)) + goto out; + + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != 0, "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); +out: + bpf_object__close(obj); +} + +/* test_tailcall_bpf2bpf_3 checks that non-trivial amount of stack (up to + * 256 bytes) can be used within bpf subprograms that have the tailcalls + * in them + */ +static void test_tailcall_bpf2bpf_3(void) +{ + int err, map_fd, prog_fd, main_fd, i; + struct bpf_map *prog_array; + struct bpf_program *prog; + struct bpf_object *obj; + __u32 retval, duration; + char prog_name[32]; + + err = bpf_prog_load("tailcall_bpf2bpf3.o", BPF_PROG_TYPE_SCHED_CLS, + &obj, &prog_fd); + if (CHECK_FAIL(err)) + return; + + prog = bpf_object__find_program_by_title(obj, "classifier"); + if (CHECK_FAIL(!prog)) + goto out; + + main_fd = bpf_program__fd(prog); + if (CHECK_FAIL(main_fd < 0)) + goto out; + + prog_array = bpf_object__find_map_by_name(obj, "jmp_table"); + if (CHECK_FAIL(!prog_array)) + goto out; + + map_fd = bpf_map__fd(prog_array); + if (CHECK_FAIL(map_fd < 0)) + goto out; + + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + snprintf(prog_name, sizeof(prog_name), "classifier/%i", i); + + prog = bpf_object__find_program_by_title(obj, prog_name); + if (CHECK_FAIL(!prog)) + goto out; + + prog_fd = bpf_program__fd(prog); + if (CHECK_FAIL(prog_fd < 0)) + goto out; + + err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY); + if (CHECK_FAIL(err)) + goto out; + } + + err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0, + &duration, &retval, NULL); + CHECK(err || retval != sizeof(pkt_v4) * 3, + "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); + + i = 1; + err = bpf_map_delete_elem(map_fd, &i); + if (CHECK_FAIL(err)) + goto out; + + err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0, + &duration, &retval, NULL); + CHECK(err || retval != sizeof(pkt_v4), + "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); + + i = 0; + err = bpf_map_delete_elem(map_fd, &i); + if (CHECK_FAIL(err)) + goto out; + + err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0, + &duration, &retval, NULL); + CHECK(err || retval != sizeof(pkt_v4) * 2, + "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); +out: + bpf_object__close(obj); +} + +/* test_tailcall_bpf2bpf_4 checks that tailcall counter is correctly preserved + * across tailcalls combined with bpf2bpf calls. for making sure that tailcall + * counter behaves correctly, bpf program will go through following flow: + * + * entry -> entry_subprog -> tailcall0 -> bpf_func0 -> subprog0 -> + * -> tailcall1 -> bpf_func1 -> subprog1 -> tailcall2 -> bpf_func2 -> + * subprog2 [here bump global counter] --------^ + * + * We go through first two tailcalls and start counting from the subprog2 where + * the loop begins. At the end of the test make sure that the global counter is + * equal to 31, because tailcall counter includes the first two tailcalls + * whereas global counter is incremented only on loop presented on flow above. + */ +static void test_tailcall_bpf2bpf_4(void) +{ + int err, map_fd, prog_fd, main_fd, data_fd, i, val; + struct bpf_map *prog_array, *data_map; + struct bpf_program *prog; + struct bpf_object *obj; + __u32 retval, duration; + char prog_name[32]; + + err = bpf_prog_load("tailcall_bpf2bpf4.o", BPF_PROG_TYPE_SCHED_CLS, + &obj, &prog_fd); + if (CHECK_FAIL(err)) + return; + + prog = bpf_object__find_program_by_title(obj, "classifier"); + if (CHECK_FAIL(!prog)) + goto out; + + main_fd = bpf_program__fd(prog); + if (CHECK_FAIL(main_fd < 0)) + goto out; + + prog_array = bpf_object__find_map_by_name(obj, "jmp_table"); + if (CHECK_FAIL(!prog_array)) + goto out; + + map_fd = bpf_map__fd(prog_array); + if (CHECK_FAIL(map_fd < 0)) + goto out; + + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + snprintf(prog_name, sizeof(prog_name), "classifier/%i", i); + + prog = bpf_object__find_program_by_title(obj, prog_name); + if (CHECK_FAIL(!prog)) + goto out; + + prog_fd = bpf_program__fd(prog); + if (CHECK_FAIL(prog_fd < 0)) + goto out; + + err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY); + if (CHECK_FAIL(err)) + goto out; + } + + err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0, + &duration, &retval, NULL); + CHECK(err || retval != sizeof(pkt_v4) * 3, "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); + + data_map = bpf_object__find_map_by_name(obj, "tailcall.bss"); + if (CHECK_FAIL(!data_map || !bpf_map__is_internal(data_map))) + return; + + data_fd = bpf_map__fd(data_map); + if (CHECK_FAIL(map_fd < 0)) + return; + + i = 0; + err = bpf_map_lookup_elem(data_fd, &i, &val); + CHECK(err || val != 31, "tailcall count", "err %d errno %d count %d\n", + err, errno, val); + +out: + bpf_object__close(obj); +} + void test_tailcalls(void) { if (test__start_subtest("tailcall_1")) @@ -484,4 +808,12 @@ void test_tailcalls(void) test_tailcall_4(); if (test__start_subtest("tailcall_5")) test_tailcall_5(); + if (test__start_subtest("tailcall_bpf2bpf_1")) + test_tailcall_bpf2bpf_1(); + if (test__start_subtest("tailcall_bpf2bpf_2")) + test_tailcall_bpf2bpf_2(); + if (test__start_subtest("tailcall_bpf2bpf_3")) + test_tailcall_bpf2bpf_3(); + if (test__start_subtest("tailcall_bpf2bpf_4")) + test_tailcall_bpf2bpf_4(); } diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf1.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf1.c new file mode 100644 index 000000000000..b5d9c8e778ae --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf1.c @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 2); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +#define TAIL_FUNC(x) \ + SEC("classifier/" #x) \ + int bpf_func_##x(struct __sk_buff *skb) \ + { \ + return x; \ + } +TAIL_FUNC(0) +TAIL_FUNC(1) + +static __noinline +int subprog_tail(struct __sk_buff *skb) +{ + bpf_tail_call(skb, &jmp_table, 0); + + return skb->len * 2; +} + +SEC("classifier") +int entry(struct __sk_buff *skb) +{ + bpf_tail_call(skb, &jmp_table, 1); + + return subprog_tail(skb); +} + +char __license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf2.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf2.c new file mode 100644 index 000000000000..a004ab28ce28 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf2.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "bpf_legacy.h" + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +static __noinline +int subprog_tail(struct __sk_buff *skb) +{ + if (load_byte(skb, 0)) + bpf_tail_call(skb, &jmp_table, 1); + else + bpf_tail_call(skb, &jmp_table, 0); + return 1; +} + +static volatile int count; + +SEC("classifier/0") +int bpf_func_0(struct __sk_buff *skb) +{ + count++; + return subprog_tail(skb); +} + +SEC("classifier") +int entry(struct __sk_buff *skb) +{ + bpf_tail_call(skb, &jmp_table, 0); + + return 0; +} + +char __license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf3.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf3.c new file mode 100644 index 000000000000..96dbef2b6b7c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf3.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "bpf_legacy.h" + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 2); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +__noinline +int subprog_tail2(struct __sk_buff *skb) +{ + volatile char arr[64] = {}; + + if (load_word(skb, 0) || load_half(skb, 0)) + bpf_tail_call(skb, &jmp_table, 10); + else + bpf_tail_call(skb, &jmp_table, 1); + + return skb->len; +} + +static __noinline +int subprog_tail(struct __sk_buff *skb) +{ + volatile char arr[64] = {}; + + bpf_tail_call(skb, &jmp_table, 0); + + return skb->len * 2; +} + +SEC("classifier/0") +int bpf_func_0(struct __sk_buff *skb) +{ + volatile char arr[128] = {}; + + return subprog_tail2(skb); +} + +SEC("classifier/1") +int bpf_func_1(struct __sk_buff *skb) +{ + volatile char arr[128] = {}; + + return skb->len * 3; +} + +SEC("classifier") +int entry(struct __sk_buff *skb) +{ + volatile char arr[128] = {}; + + return subprog_tail(skb); +} + +char __license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c new file mode 100644 index 000000000000..98b40a95bc67 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 3); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +static volatile int count; + +__noinline +int subprog_tail_2(struct __sk_buff *skb) +{ + bpf_tail_call(skb, &jmp_table, 2); + return skb->len * 3; +} + +__noinline +int subprog_tail_1(struct __sk_buff *skb) +{ + bpf_tail_call(skb, &jmp_table, 1); + return skb->len * 2; +} + +__noinline +int subprog_tail(struct __sk_buff *skb) +{ + bpf_tail_call(skb, &jmp_table, 0); + return skb->len; +} + +SEC("classifier/1") +int bpf_func_1(struct __sk_buff *skb) +{ + return subprog_tail_2(skb); +} + +SEC("classifier/2") +int bpf_func_2(struct __sk_buff *skb) +{ + count++; + return subprog_tail_2(skb); +} + +SEC("classifier/0") +int bpf_func_0(struct __sk_buff *skb) +{ + return subprog_tail_1(skb); +} + +SEC("classifier") +int entry(struct __sk_buff *skb) +{ + return subprog_tail(skb); +} + +char __license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; From b6ed6cf4a3acdeab9aed8e0a524850761ec9b152 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 15 Sep 2020 13:38:15 +0200 Subject: [PATCH 75/95] selftests/bpf: Fix endianness issue in sk_assign server_map's value size is 8, but the test tries to put an int there. This sort of works on x86 (unless followed by non-0), but hard fails on s390. Fix by using __s64 instead of int. Fixes: 2d7824ffd25c ("selftests: bpf: Add test for sk_assign") Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200915113815.3768217-1-iii@linux.ibm.com --- tools/testing/selftests/bpf/prog_tests/sk_assign.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c index a49a26f95a8b..3a469099f30d 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c @@ -265,7 +265,7 @@ void test_sk_assign(void) TEST("ipv6 udp port redir", AF_INET6, SOCK_DGRAM, false), TEST("ipv6 udp addr redir", AF_INET6, SOCK_DGRAM, true), }; - int server = -1; + __s64 server = -1; int server_map; int self_net; int i; From fec47bbc10b243690f5d0ee484a0bbdee273e71b Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 15 Sep 2020 13:39:28 +0200 Subject: [PATCH 76/95] selftests/bpf: Fix endianness issue in test_sockopt_sk getsetsockopt() calls getsockopt() with optlen == 1, but then checks the resulting int. It is ok on little endian, but not on big endian. Fix by checking char instead. Fixes: 8a027dc0d8f5 ("selftests/bpf: add sockopt test that exercises sk helpers") Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200915113928.3768496-1-iii@linux.ibm.com --- tools/testing/selftests/bpf/prog_tests/sockopt_sk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c index 5f54c6aec7f0..b25c9c45c148 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c @@ -45,9 +45,9 @@ static int getsetsockopt(void) goto err; } - if (*(int *)big_buf != 0x08) { + if (*big_buf != 0x08) { log_err("Unexpected getsockopt(IP_TOS) optval 0x%x != 0x08", - *(int *)big_buf); + (int)*big_buf); goto err; } From f55f4c349a03d820c27145bdf457013b42e4b487 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 15 Sep 2020 13:55:19 +0200 Subject: [PATCH 77/95] samples/bpf: Fix test_map_in_map on s390 s390 uses socketcall multiplexer instead of individual socket syscalls. Therefore, "kprobe/" SYSCALL(sys_connect) does not trigger and test_map_in_map fails. Fix by using "kprobe/__sys_connect" instead. Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200915115519.3769807-1-iii@linux.ibm.com --- samples/bpf/test_map_in_map_kern.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/samples/bpf/test_map_in_map_kern.c b/samples/bpf/test_map_in_map_kern.c index 8def45c5b697..b0200c8eac09 100644 --- a/samples/bpf/test_map_in_map_kern.c +++ b/samples/bpf/test_map_in_map_kern.c @@ -103,10 +103,9 @@ static __always_inline int do_inline_hash_lookup(void *inner_map, u32 port) return result ? *result : -ENOENT; } -SEC("kprobe/" SYSCALL(sys_connect)) +SEC("kprobe/__sys_connect") int trace_sys_connect(struct pt_regs *ctx) { - struct pt_regs *real_regs = (struct pt_regs *)PT_REGS_PARM1_CORE(ctx); struct sockaddr_in6 *in6; u16 test_case, port, dst6[8]; int addrlen, ret, inline_ret, ret_key = 0; @@ -114,8 +113,8 @@ int trace_sys_connect(struct pt_regs *ctx) void *outer_map, *inner_map; bool inline_hash = false; - in6 = (struct sockaddr_in6 *)PT_REGS_PARM2_CORE(real_regs); - addrlen = (int)PT_REGS_PARM3_CORE(real_regs); + in6 = (struct sockaddr_in6 *)PT_REGS_PARM2_CORE(ctx); + addrlen = (int)PT_REGS_PARM3_CORE(ctx); if (addrlen != sizeof(*in6)) return 0; From 70b971118e074d5042715587953f27929e99117a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 16 Sep 2020 13:44:53 -0700 Subject: [PATCH 78/95] bpf: Use hlist_add_head_rcu when linking to local_storage The local_storage->list will be traversed by rcu reader in parallel. Thus, hlist_add_head_rcu() is needed in bpf_selem_link_storage_nolock(). This patch fixes it. This part of the code has recently been refactored in bpf-next and this patch makes changes to the new file "bpf_local_storage.c". Instead of using the original offending commit in the Fixes tag, the commit that created the file "bpf_local_storage.c" is used. A separate fix has been provided to the bpf tree. Fixes: 450af8d0f6be ("bpf: Split bpf_local_storage to bpf_sk_storage") Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200916204453.2003915-1-kafai@fb.com --- kernel/bpf/bpf_local_storage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index ffa7d11fc2bd..5d3a7af9ba9b 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -159,7 +159,7 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem) { RCU_INIT_POINTER(selem->local_storage, local_storage); - hlist_add_head(&selem->snode, &local_storage->list); + hlist_add_head_rcu(&selem->snode, &local_storage->list); } void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) From 31f23a6a181c81543b10a1a9056b0e6c7ef1c747 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 17 Sep 2020 15:44:53 +0800 Subject: [PATCH 79/95] bpf: Fix potential call bpf_link_free() in atomic context The in_atomic() macro cannot always detect atomic context, in particular, it cannot know about held spinlocks in non-preemptible kernels. Although, there is no user call bpf_link_put() with holding spinlock now, be on the safe side, so we can avoid this in the future. Signed-off-by: Muchun Song Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200917074453.20621-1-songmuchun@bytedance.com --- kernel/bpf/syscall.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2ce32cad5c8e..ec68d3a23a2b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2345,12 +2345,8 @@ void bpf_link_put(struct bpf_link *link) if (!atomic64_dec_and_test(&link->refcnt)) return; - if (in_atomic()) { - INIT_WORK(&link->work, bpf_link_put_deferred); - schedule_work(&link->work); - } else { - bpf_link_free(link); - } + INIT_WORK(&link->work, bpf_link_put_deferred); + schedule_work(&link->work); } static int bpf_link_release(struct inode *inode, struct file *filp) From 2af30f115d6957f372ce3096c7198763ff253d97 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:17 +0100 Subject: [PATCH 80/95] btf: Make btf_set_contains take a const pointer bsearch doesn't modify the contents of the array, so we can take a const pointer. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200921121227.255763-2-lmb@cloudflare.com --- include/linux/bpf.h | 2 +- kernel/bpf/btf.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d7c5a6ed87e3..0478b20d335b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1905,6 +1905,6 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2); struct btf_id_set; -bool btf_id_set_contains(struct btf_id_set *set, u32 id); +bool btf_id_set_contains(const struct btf_id_set *set, u32 id); #endif /* _LINUX_BPF_H */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f9ac6935ab3c..a2330f6fe2e6 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4772,7 +4772,7 @@ static int btf_id_cmp_func(const void *a, const void *b) return *pa - *pb; } -bool btf_id_set_contains(struct btf_id_set *set, u32 id) +bool btf_id_set_contains(const struct btf_id_set *set, u32 id) { return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; } From 0d004c020b5574e51f7a525e57d2a11958b334b5 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:18 +0100 Subject: [PATCH 81/95] bpf: Check scalar or invalid register in check_helper_mem_access Move the check for a NULL or zero register to check_helper_mem_access. This makes check_stack_boundary easier to understand. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200921121227.255763-3-lmb@cloudflare.com --- kernel/bpf/verifier.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4161b6c406bc..06238395f244 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3641,18 +3641,6 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, struct bpf_func_state *state = func(env, reg); int err, min_off, max_off, i, j, slot, spi; - if (reg->type != PTR_TO_STACK) { - /* Allow zero-byte read from NULL, regardless of pointer type */ - if (zero_size_allowed && access_size == 0 && - register_is_null(reg)) - return 0; - - verbose(env, "R%d type=%s expected=%s\n", regno, - reg_type_str[reg->type], - reg_type_str[PTR_TO_STACK]); - return -EACCES; - } - if (tnum_is_const(reg->var_off)) { min_off = max_off = reg->var_off.value + reg->off; err = __check_stack_boundary(env, regno, min_off, access_size, @@ -3797,9 +3785,19 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, access_size, zero_size_allowed, "rdwr", &env->prog->aux->max_rdwr_access); - default: /* scalar_value|ptr_to_stack or invalid ptr */ + case PTR_TO_STACK: return check_stack_boundary(env, regno, access_size, zero_size_allowed, meta); + default: /* scalar_value or invalid ptr */ + /* Allow zero-byte read from NULL, regardless of pointer type */ + if (zero_size_allowed && access_size == 0 && + register_is_null(reg)) + return 0; + + verbose(env, "R%d type=%s expected=%s\n", regno, + reg_type_str[reg->type], + reg_type_str[PTR_TO_STACK]); + return -EACCES; } } From 27774b7073b5d520c80f1fcb8e9993fc139f21bd Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:19 +0100 Subject: [PATCH 82/95] btf: Add BTF_ID_LIST_SINGLE macro Add a convenience macro that allows defining a BTF ID list with a single item. This lets us cut down on repetitive macros. Suggested-by: Andrii Nakryiko Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200921121227.255763-4-lmb@cloudflare.com --- include/linux/btf_ids.h | 8 ++++++++ tools/include/linux/btf_ids.h | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 210b086188a3..57890b357f85 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -76,6 +76,13 @@ extern u32 name[]; #define BTF_ID_LIST_GLOBAL(name) \ __BTF_ID_LIST(name, globl) +/* The BTF_ID_LIST_SINGLE macro defines a BTF_ID_LIST with + * a single entry. + */ +#define BTF_ID_LIST_SINGLE(name, prefix, typename) \ + BTF_ID_LIST(name) \ + BTF_ID(prefix, typename) + /* * The BTF_ID_UNUSED macro defines 4 zero bytes. * It's used when we want to define 'unused' entry @@ -140,6 +147,7 @@ extern struct btf_id_set name; #define BTF_ID(prefix, name) #define BTF_ID_UNUSED #define BTF_ID_LIST_GLOBAL(name) u32 name[1]; +#define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 name[1]; #define BTF_SET_START(name) static struct btf_id_set name = { 0 }; #define BTF_SET_START_GLOBAL(name) static struct btf_id_set name = { 0 }; #define BTF_SET_END(name) diff --git a/tools/include/linux/btf_ids.h b/tools/include/linux/btf_ids.h index 210b086188a3..57890b357f85 100644 --- a/tools/include/linux/btf_ids.h +++ b/tools/include/linux/btf_ids.h @@ -76,6 +76,13 @@ extern u32 name[]; #define BTF_ID_LIST_GLOBAL(name) \ __BTF_ID_LIST(name, globl) +/* The BTF_ID_LIST_SINGLE macro defines a BTF_ID_LIST with + * a single entry. + */ +#define BTF_ID_LIST_SINGLE(name, prefix, typename) \ + BTF_ID_LIST(name) \ + BTF_ID(prefix, typename) + /* * The BTF_ID_UNUSED macro defines 4 zero bytes. * It's used when we want to define 'unused' entry @@ -140,6 +147,7 @@ extern struct btf_id_set name; #define BTF_ID(prefix, name) #define BTF_ID_UNUSED #define BTF_ID_LIST_GLOBAL(name) u32 name[1]; +#define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 name[1]; #define BTF_SET_START(name) static struct btf_id_set name = { 0 }; #define BTF_SET_START_GLOBAL(name) static struct btf_id_set name = { 0 }; #define BTF_SET_END(name) From 9436ef6e862b9ca22e5b12f87b106e07d5af4cae Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:20 +0100 Subject: [PATCH 83/95] bpf: Allow specifying a BTF ID per argument in function protos Function prototypes using ARG_PTR_TO_BTF_ID currently use two ways to signal which BTF IDs are acceptable. First, bpf_func_proto.btf_id is an array of IDs, one for each argument. This array is only accessed up to the highest numbered argument that uses ARG_PTR_TO_BTF_ID and may therefore be less than five arguments long. It usually points at a BTF_ID_LIST. Second, check_btf_id is a function pointer that is called by the verifier if present. It gets the actual BTF ID of the register, and the argument number we're currently checking. It turns out that the only user check_arg_btf_id ignores the argument, and is simply used to check whether the BTF ID has a struct sock_common at it's start. Replace both of these mechanisms with an explicit BTF ID for each argument in a function proto. Thanks to btf_struct_ids_match this is very flexible: check_arg_btf_id can be replaced by requiring struct sock_common. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200921121227.255763-5-lmb@cloudflare.com --- include/linux/bpf.h | 18 +++++++------- kernel/bpf/bpf_inode_storage.c | 8 +++---- kernel/bpf/btf.c | 13 ---------- kernel/bpf/stackmap.c | 5 ++-- kernel/bpf/verifier.c | 44 +++++++++++++++++----------------- kernel/trace/bpf_trace.c | 15 ++++-------- net/core/bpf_sk_storage.c | 8 ++----- net/core/filter.c | 31 +++++++----------------- net/ipv4/bpf_tcp_ca.c | 19 ++++----------- 9 files changed, 58 insertions(+), 103 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0478b20d335b..87b0d5dcc1ff 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -326,12 +326,16 @@ struct bpf_func_proto { }; enum bpf_arg_type arg_type[5]; }; - int *btf_id; /* BTF ids of arguments */ - bool (*check_btf_id)(u32 btf_id, u32 arg); /* if the argument btf_id is - * valid. Often used if more - * than one btf id is permitted - * for this argument. - */ + union { + struct { + u32 *arg1_btf_id; + u32 *arg2_btf_id; + u32 *arg3_btf_id; + u32 *arg4_btf_id; + u32 *arg5_btf_id; + }; + u32 *arg_btf_id[5]; + }; int *ret_btf_id; /* return value btf_id */ bool (*allowed)(const struct bpf_prog *prog); }; @@ -1385,8 +1389,6 @@ int btf_struct_access(struct bpf_verifier_log *log, u32 *next_btf_id); bool btf_struct_ids_match(struct bpf_verifier_log *log, int off, u32 id, u32 need_type_id); -int btf_resolve_helper_id(struct bpf_verifier_log *log, - const struct bpf_func_proto *fn, int); int btf_distill_func_proto(struct bpf_verifier_log *log, struct btf *btf, diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 75be02799c0f..6edff97ad594 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -249,9 +249,7 @@ const struct bpf_map_ops inode_storage_map_ops = { .map_owner_storage_ptr = inode_storage_ptr, }; -BTF_ID_LIST(bpf_inode_storage_btf_ids) -BTF_ID_UNUSED -BTF_ID(struct, inode) +BTF_ID_LIST_SINGLE(bpf_inode_storage_btf_ids, struct, inode) const struct bpf_func_proto bpf_inode_storage_get_proto = { .func = bpf_inode_storage_get, @@ -259,9 +257,9 @@ const struct bpf_func_proto bpf_inode_storage_get_proto = { .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &bpf_inode_storage_btf_ids[0], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, - .btf_id = bpf_inode_storage_btf_ids, }; const struct bpf_func_proto bpf_inode_storage_delete_proto = { @@ -270,5 +268,5 @@ const struct bpf_func_proto bpf_inode_storage_delete_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID, - .btf_id = bpf_inode_storage_btf_ids, + .arg2_btf_id = &bpf_inode_storage_btf_ids[0], }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a2330f6fe2e6..5d3c36e13139 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4193,19 +4193,6 @@ again: return true; } -int btf_resolve_helper_id(struct bpf_verifier_log *log, - const struct bpf_func_proto *fn, int arg) -{ - int id; - - if (fn->arg_type[arg] != ARG_PTR_TO_BTF_ID || !btf_vmlinux) - return -EINVAL; - id = fn->btf_id[arg]; - if (!id || id > btf_vmlinux->nr_types) - return -EINVAL; - return id; -} - static int __get_type_size(struct btf *btf, u32 btf_id, const struct btf_type **bad_type) { diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index a2fa006f430e..06065fa27124 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -665,18 +665,17 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, return __bpf_get_stack(regs, task, NULL, buf, size, flags); } -BTF_ID_LIST(bpf_get_task_stack_btf_ids) -BTF_ID(struct, task_struct) +BTF_ID_LIST_SINGLE(bpf_get_task_stack_btf_ids, struct, task_struct) const struct bpf_func_proto bpf_get_task_stack_proto = { .func = bpf_get_task_stack, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_get_task_stack_btf_ids[0], .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, - .btf_id = bpf_get_task_stack_btf_ids, }; BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 06238395f244..30da34d9b93b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -238,7 +238,6 @@ struct bpf_call_arg_meta { u64 msize_max_value; int ref_obj_id; int func_id; - u32 btf_id; }; struct btf *btf_vmlinux; @@ -4049,29 +4048,23 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, goto err_type; } } else if (arg_type == ARG_PTR_TO_BTF_ID) { - bool ids_match = false; + const u32 *btf_id = fn->arg_btf_id[arg]; expected_type = PTR_TO_BTF_ID; if (type != expected_type) goto err_type; - if (!fn->check_btf_id) { - if (reg->btf_id != meta->btf_id) { - ids_match = btf_struct_ids_match(&env->log, reg->off, reg->btf_id, - meta->btf_id); - if (!ids_match) { - verbose(env, "Helper has type %s got %s in R%d\n", - kernel_type_name(meta->btf_id), - kernel_type_name(reg->btf_id), regno); - return -EACCES; - } - } - } else if (!fn->check_btf_id(reg->btf_id, arg)) { - verbose(env, "Helper does not support %s in R%d\n", - kernel_type_name(reg->btf_id), regno); + if (!btf_id) { + verbose(env, "verifier internal error: missing BTF ID\n"); + return -EFAULT; + } + + if (!btf_struct_ids_match(&env->log, reg->off, reg->btf_id, *btf_id)) { + verbose(env, "R%d is of type %s but %s is expected\n", + regno, kernel_type_name(reg->btf_id), kernel_type_name(*btf_id)); return -EACCES; } - if ((reg->off && !ids_match) || !tnum_is_const(reg->var_off) || reg->var_off.value) { + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n", regno); return -EACCES; @@ -4545,10 +4538,22 @@ static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id) return count <= 1; } +static bool check_btf_id_ok(const struct bpf_func_proto *fn) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) + if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) + return false; + + return true; +} + static int check_func_proto(const struct bpf_func_proto *fn, int func_id) { return check_raw_mode_ok(fn) && check_arg_pair_ok(fn) && + check_btf_id_ok(fn) && check_refcount_ok(fn, func_id) ? 0 : -EINVAL; } @@ -4944,11 +4949,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn meta.func_id = func_id; /* check args */ for (i = 0; i < 5; i++) { - if (!fn->check_btf_id) { - err = btf_resolve_helper_id(&env->log, fn, i); - if (err > 0) - meta.btf_id = err; - } err = check_func_arg(env, i, &meta, fn); if (err) return err; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b2a5380eb187..ebf9be4d0d6a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -743,19 +743,18 @@ out: return err; } -BTF_ID_LIST(bpf_seq_printf_btf_ids) -BTF_ID(struct, seq_file) +BTF_ID_LIST_SINGLE(btf_seq_file_ids, struct, seq_file) static const struct bpf_func_proto bpf_seq_printf_proto = { .func = bpf_seq_printf, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_seq_file_ids[0], .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_PTR_TO_MEM_OR_NULL, .arg5_type = ARG_CONST_SIZE_OR_ZERO, - .btf_id = bpf_seq_printf_btf_ids, }; BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len) @@ -763,17 +762,14 @@ BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len) return seq_write(m, data, len) ? -EOVERFLOW : 0; } -BTF_ID_LIST(bpf_seq_write_btf_ids) -BTF_ID(struct, seq_file) - static const struct bpf_func_proto bpf_seq_write_proto = { .func = bpf_seq_write, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_seq_file_ids[0], .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, - .btf_id = bpf_seq_write_btf_ids, }; static __always_inline int @@ -1130,17 +1126,16 @@ static bool bpf_d_path_allowed(const struct bpf_prog *prog) return btf_id_set_contains(&btf_allowlist_d_path, prog->aux->attach_btf_id); } -BTF_ID_LIST(bpf_d_path_btf_ids) -BTF_ID(struct, path) +BTF_ID_LIST_SINGLE(bpf_d_path_btf_ids, struct, path) static const struct bpf_func_proto bpf_d_path_proto = { .func = bpf_d_path, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_d_path_btf_ids[0], .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, - .btf_id = bpf_d_path_btf_ids, .allowed = bpf_d_path_allowed, }; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 4a86ea34f29e..d653a583dbc9 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -378,19 +378,15 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = { .arg2_type = ARG_PTR_TO_SOCKET, }; -BTF_ID_LIST(sk_storage_btf_ids) -BTF_ID_UNUSED -BTF_ID(struct, sock) - const struct bpf_func_proto sk_storage_get_btf_proto = { .func = bpf_sk_storage_get, .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, - .btf_id = sk_storage_btf_ids, }; const struct bpf_func_proto sk_storage_delete_btf_proto = { @@ -399,7 +395,7 @@ const struct bpf_func_proto sk_storage_delete_btf_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID, - .btf_id = sk_storage_btf_ids, + .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], }; struct bpf_sk_storage_diag { diff --git a/net/core/filter.c b/net/core/filter.c index d266c6941967..6014e5f40c58 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3803,19 +3803,18 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; -BTF_ID_LIST(bpf_skb_output_btf_ids) -BTF_ID(struct, sk_buff) +BTF_ID_LIST_SINGLE(bpf_skb_output_btf_ids, struct, sk_buff) const struct bpf_func_proto bpf_skb_output_proto = { .func = bpf_skb_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_skb_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM, .arg5_type = ARG_CONST_SIZE_OR_ZERO, - .btf_id = bpf_skb_output_btf_ids, }; static unsigned short bpf_tunnel_key_af(u64 flags) @@ -4199,19 +4198,18 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; -BTF_ID_LIST(bpf_xdp_output_btf_ids) -BTF_ID(struct, xdp_buff) +BTF_ID_LIST_SINGLE(bpf_xdp_output_btf_ids, struct, xdp_buff) const struct bpf_func_proto bpf_xdp_output_proto = { .func = bpf_xdp_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_xdp_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM, .arg5_type = ARG_CONST_SIZE_OR_ZERO, - .btf_id = bpf_xdp_output_btf_ids, }; BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) @@ -9897,17 +9895,6 @@ BTF_SOCK_TYPE_xxx u32 btf_sock_ids[MAX_BTF_SOCK_TYPE]; #endif -static bool check_arg_btf_id(u32 btf_id, u32 arg) -{ - int i; - - /* only one argument, no need to check arg */ - for (i = 0; i < MAX_BTF_SOCK_TYPE; i++) - if (btf_sock_ids[i] == btf_id) - return true; - return false; -} - BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk) { /* tcp6_sock type is not generated in dwarf and hence btf, @@ -9926,7 +9913,7 @@ const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID, - .check_btf_id = check_arg_btf_id, + .arg1_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6], }; @@ -9943,7 +9930,7 @@ const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID, - .check_btf_id = check_arg_btf_id, + .arg1_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP], }; @@ -9967,7 +9954,7 @@ const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID, - .check_btf_id = check_arg_btf_id, + .arg1_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW], }; @@ -9991,7 +9978,7 @@ const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID, - .check_btf_id = check_arg_btf_id, + .arg1_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ], }; @@ -10013,6 +10000,6 @@ const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID, - .check_btf_id = check_arg_btf_id, + .arg1_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6], }; diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index e3939f76b024..74a2ef598c31 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -28,23 +28,18 @@ static u32 unsupported_ops[] = { static const struct btf_type *tcp_sock_type; static u32 tcp_sock_id, sock_id; -static int btf_sk_storage_get_ids[5]; static struct bpf_func_proto btf_sk_storage_get_proto __read_mostly; - -static int btf_sk_storage_delete_ids[5]; static struct bpf_func_proto btf_sk_storage_delete_proto __read_mostly; -static void convert_sk_func_proto(struct bpf_func_proto *to, int *to_btf_ids, - const struct bpf_func_proto *from) +static void convert_sk_func_proto(struct bpf_func_proto *to, const struct bpf_func_proto *from) { int i; *to = *from; - to->btf_id = to_btf_ids; for (i = 0; i < ARRAY_SIZE(to->arg_type); i++) { if (to->arg_type[i] == ARG_PTR_TO_SOCKET) { to->arg_type[i] = ARG_PTR_TO_BTF_ID; - to->btf_id[i] = tcp_sock_id; + to->arg_btf_id[i] = &tcp_sock_id; } } } @@ -64,12 +59,8 @@ static int bpf_tcp_ca_init(struct btf *btf) tcp_sock_id = type_id; tcp_sock_type = btf_type_by_id(btf, tcp_sock_id); - convert_sk_func_proto(&btf_sk_storage_get_proto, - btf_sk_storage_get_ids, - &bpf_sk_storage_get_proto); - convert_sk_func_proto(&btf_sk_storage_delete_proto, - btf_sk_storage_delete_ids, - &bpf_sk_storage_delete_proto); + convert_sk_func_proto(&btf_sk_storage_get_proto, &bpf_sk_storage_get_proto); + convert_sk_func_proto(&btf_sk_storage_delete_proto, &bpf_sk_storage_delete_proto); return 0; } @@ -185,8 +176,8 @@ static const struct bpf_func_proto bpf_tcp_send_ack_proto = { /* In case we want to report error later */ .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &tcp_sock_id, .arg2_type = ARG_ANYTHING, - .btf_id = &tcp_sock_id, }; static const struct bpf_func_proto * From d7b9454a4f6333bf145189b8e769011d15bdd50e Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:21 +0100 Subject: [PATCH 84/95] bpf: Make BTF pointer type checking generic Perform BTF type checks if the register we're working on contains a BTF pointer, rather than if the argument is for a BTF pointer. This is easier to understand, and allows removing the code from the arg_type checking section of the function. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200921121227.255763-6-lmb@cloudflare.com --- kernel/bpf/verifier.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 30da34d9b93b..a0e919232968 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4048,27 +4048,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, goto err_type; } } else if (arg_type == ARG_PTR_TO_BTF_ID) { - const u32 *btf_id = fn->arg_btf_id[arg]; - expected_type = PTR_TO_BTF_ID; if (type != expected_type) goto err_type; - - if (!btf_id) { - verbose(env, "verifier internal error: missing BTF ID\n"); - return -EFAULT; - } - - if (!btf_struct_ids_match(&env->log, reg->off, reg->btf_id, *btf_id)) { - verbose(env, "R%d is of type %s but %s is expected\n", - regno, kernel_type_name(reg->btf_id), kernel_type_name(*btf_id)); - return -EACCES; - } - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { - verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n", - regno); - return -EACCES; - } } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { if (meta->func_id == BPF_FUNC_spin_lock) { if (process_spin_lock(env, regno, true)) @@ -4123,6 +4105,26 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return -EFAULT; } + if (type == PTR_TO_BTF_ID) { + const u32 *btf_id = fn->arg_btf_id[arg]; + + if (!btf_id) { + verbose(env, "verifier internal error: missing BTF ID\n"); + return -EFAULT; + } + + if (!btf_struct_ids_match(&env->log, reg->off, reg->btf_id, *btf_id)) { + verbose(env, "R%d is of type %s but %s is expected\n", + regno, kernel_type_name(reg->btf_id), kernel_type_name(*btf_id)); + return -EACCES; + } + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n", + regno); + return -EACCES; + } + } + if (arg_type == ARG_CONST_MAP_PTR) { /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ meta->map_ptr = reg->map_ptr; From 02f7c9585d1e2d5d76cac497bd5ced8ecf9d6f56 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:22 +0100 Subject: [PATCH 85/95] bpf: Make reference tracking generic Instead of dealing with reg->ref_obj_id individually for every arg type that needs it, rely on the fact that ref_obj_id is zero if the register is not reference tracked. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200921121227.255763-7-lmb@cloudflare.com --- kernel/bpf/verifier.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a0e919232968..a4549b2656ad 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4030,15 +4030,6 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */ if (!type_is_sk_pointer(type)) goto err_type; - if (reg->ref_obj_id) { - if (meta->ref_obj_id) { - verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", - regno, reg->ref_obj_id, - meta->ref_obj_id); - return -EFAULT; - } - meta->ref_obj_id = reg->ref_obj_id; - } } else if (arg_type == ARG_PTR_TO_SOCKET || arg_type == ARG_PTR_TO_SOCKET_OR_NULL) { expected_type = PTR_TO_SOCKET; @@ -4087,13 +4078,6 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, /* final test in check_stack_boundary() */; else if (type != expected_type) goto err_type; - if (meta->ref_obj_id) { - verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", - regno, reg->ref_obj_id, - meta->ref_obj_id); - return -EFAULT; - } - meta->ref_obj_id = reg->ref_obj_id; } else if (arg_type_is_int_ptr(arg_type)) { expected_type = PTR_TO_STACK; if (!type_is_pkt_pointer(type) && @@ -4125,6 +4109,16 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, } } + if (reg->ref_obj_id) { + if (meta->ref_obj_id) { + verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", + regno, reg->ref_obj_id, + meta->ref_obj_id); + return -EFAULT; + } + meta->ref_obj_id = reg->ref_obj_id; + } + if (arg_type == ARG_CONST_MAP_PTR) { /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ meta->map_ptr = reg->map_ptr; From feec70401672bd9b0268ae59ec5efd15d86ae138 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:23 +0100 Subject: [PATCH 86/95] bpf: Make context access check generic Always check context access if the register we're operating on is PTR_TO_CTX, rather than relying on ARG_PTR_TO_CTX. This allows simplifying the arg_type checking section of the function. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200921121227.255763-8-lmb@cloudflare.com --- kernel/bpf/verifier.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a4549b2656ad..fc795bac42ed 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4021,9 +4021,6 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, arg_type == ARG_PTR_TO_CTX_OR_NULL)) { if (type != expected_type) goto err_type; - err = check_ctx_reg(env, reg, regno); - if (err < 0) - return err; } } else if (arg_type == ARG_PTR_TO_SOCK_COMMON) { expected_type = PTR_TO_SOCK_COMMON; @@ -4107,6 +4104,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, regno); return -EACCES; } + } else if (type == PTR_TO_CTX) { + err = check_ctx_reg(env, reg, regno); + if (err < 0) + return err; } if (reg->ref_obj_id) { From a2bbe7cc90755283f1db719eb757616cefd2a9fd Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:24 +0100 Subject: [PATCH 87/95] bpf: Set meta->raw_mode for pointers close to use If we encounter a pointer to memory, we set meta->raw_mode depending on the type of memory we point at. What isn't obvious is that this information is only used when the next memory size argument is encountered. Move the assignment closer to where it's used, and add a comment that explains what is going on. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200921121227.255763-9-lmb@cloudflare.com --- kernel/bpf/verifier.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fc795bac42ed..446fbe7f6b49 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4067,7 +4067,6 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, type != PTR_TO_RDWR_BUF && type != expected_type) goto err_type; - meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; } else if (arg_type_is_alloc_mem_ptr(arg_type)) { expected_type = PTR_TO_MEM; if (register_is_null(reg) && @@ -4156,6 +4155,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, false, meta); + } else if (arg_type_is_mem_ptr(arg_type)) { + /* The access to this pointer is only checked when we hit the + * next is_mem_size argument below. + */ + meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MEM); } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); From c18f0b6aee2aaa6ab2aefd4b9aa1d89142a48824 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:25 +0100 Subject: [PATCH 88/95] bpf: Check ARG_PTR_TO_SPINLOCK register type in check_func_arg Move the check for PTR_TO_MAP_VALUE to check_func_arg, where all other checking is done as well. Move the invocation of process_spin_lock away from the register type checking, to allow a future refactoring. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200921121227.255763-10-lmb@cloudflare.com --- kernel/bpf/verifier.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 446fbe7f6b49..779091cb5bdd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3828,10 +3828,6 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, struct bpf_map *map = reg->map_ptr; u64 val = reg->var_off.value; - if (reg->type != PTR_TO_MAP_VALUE) { - verbose(env, "R%d is not a pointer to map_value\n", regno); - return -EINVAL; - } if (!is_const) { verbose(env, "R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n", @@ -4040,16 +4036,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, if (type != expected_type) goto err_type; } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { - if (meta->func_id == BPF_FUNC_spin_lock) { - if (process_spin_lock(env, regno, true)) - return -EACCES; - } else if (meta->func_id == BPF_FUNC_spin_unlock) { - if (process_spin_lock(env, regno, false)) - return -EACCES; - } else { - verbose(env, "verifier internal error\n"); - return -EFAULT; - } + expected_type = PTR_TO_MAP_VALUE; + if (type != expected_type) + goto err_type; } else if (arg_type_is_mem_ptr(arg_type)) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be @@ -4155,6 +4144,17 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, false, meta); + } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { + if (meta->func_id == BPF_FUNC_spin_lock) { + if (process_spin_lock(env, regno, true)) + return -EACCES; + } else if (meta->func_id == BPF_FUNC_spin_unlock) { + if (process_spin_lock(env, regno, false)) + return -EACCES; + } else { + verbose(env, "verifier internal error\n"); + return -EFAULT; + } } else if (arg_type_is_mem_ptr(arg_type)) { /* The access to this pointer is only checked when we hit the * next is_mem_size argument below. From fd1b0d604c56e0d9f143b39a92132a2ea9625e6d Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:26 +0100 Subject: [PATCH 89/95] bpf: Hoist type checking for nullable arg types check_func_arg has a plethora of weird if statements with empty branches. They work around the fact that *_OR_NULL argument types should accept a SCALAR_VALUE register, as long as it's value is 0. These statements make it difficult to reason about the type checking logic. Instead, skip more detailed type checking logic iff the register is 0, and the function expects a nullable type. This allows simplifying the type checking itself. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200921121227.255763-11-lmb@cloudflare.com --- kernel/bpf/verifier.c | 64 ++++++++++++++++++++----------------------- 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 779091cb5bdd..129416ea0256 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -435,6 +435,15 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type) return type == ARG_PTR_TO_SOCK_COMMON; } +static bool arg_type_may_be_null(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_MAP_VALUE_OR_NULL || + type == ARG_PTR_TO_MEM_OR_NULL || + type == ARG_PTR_TO_CTX_OR_NULL || + type == ARG_PTR_TO_SOCKET_OR_NULL || + type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; +} + /* Determine whether the function releases some resources allocated by another * function call. The first reference type argument will be assumed to be * released by release_reference(). @@ -3988,17 +3997,20 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return err; } + if (register_is_null(reg) && arg_type_may_be_null(arg_type)) + /* A NULL register has a SCALAR_VALUE type, so skip + * type checking. + */ + goto skip_type_check; + if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE || arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE || arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) { expected_type = PTR_TO_STACK; - if (register_is_null(reg) && - arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) - /* final test in check_stack_boundary() */; - else if (!type_is_pkt_pointer(type) && - type != PTR_TO_MAP_VALUE && - type != expected_type) + if (!type_is_pkt_pointer(type) && + type != PTR_TO_MAP_VALUE && + type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || arg_type == ARG_CONST_SIZE_OR_ZERO || @@ -4013,11 +4025,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, } else if (arg_type == ARG_PTR_TO_CTX || arg_type == ARG_PTR_TO_CTX_OR_NULL) { expected_type = PTR_TO_CTX; - if (!(register_is_null(reg) && - arg_type == ARG_PTR_TO_CTX_OR_NULL)) { - if (type != expected_type) - goto err_type; - } + if (type != expected_type) + goto err_type; } else if (arg_type == ARG_PTR_TO_SOCK_COMMON) { expected_type = PTR_TO_SOCK_COMMON; /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */ @@ -4026,11 +4035,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, } else if (arg_type == ARG_PTR_TO_SOCKET || arg_type == ARG_PTR_TO_SOCKET_OR_NULL) { expected_type = PTR_TO_SOCKET; - if (!(register_is_null(reg) && - arg_type == ARG_PTR_TO_SOCKET_OR_NULL)) { - if (type != expected_type) - goto err_type; - } + if (type != expected_type) + goto err_type; } else if (arg_type == ARG_PTR_TO_BTF_ID) { expected_type = PTR_TO_BTF_ID; if (type != expected_type) @@ -4041,27 +4047,16 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, goto err_type; } else if (arg_type_is_mem_ptr(arg_type)) { expected_type = PTR_TO_STACK; - /* One exception here. In case function allows for NULL to be - * passed in as argument, it's a SCALAR_VALUE type. Final test - * happens during stack boundary checking. - */ - if (register_is_null(reg) && - (arg_type == ARG_PTR_TO_MEM_OR_NULL || - arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL)) - /* final test in check_stack_boundary() */; - else if (!type_is_pkt_pointer(type) && - type != PTR_TO_MAP_VALUE && - type != PTR_TO_MEM && - type != PTR_TO_RDONLY_BUF && - type != PTR_TO_RDWR_BUF && - type != expected_type) + if (!type_is_pkt_pointer(type) && + type != PTR_TO_MAP_VALUE && + type != PTR_TO_MEM && + type != PTR_TO_RDONLY_BUF && + type != PTR_TO_RDWR_BUF && + type != expected_type) goto err_type; } else if (arg_type_is_alloc_mem_ptr(arg_type)) { expected_type = PTR_TO_MEM; - if (register_is_null(reg) && - arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL) - /* final test in check_stack_boundary() */; - else if (type != expected_type) + if (type != expected_type) goto err_type; } else if (arg_type_is_int_ptr(arg_type)) { expected_type = PTR_TO_STACK; @@ -4098,6 +4093,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return err; } +skip_type_check: if (reg->ref_obj_id) { if (meta->ref_obj_id) { verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", From f79e7ea571732a6e16f15c6e2f000c347e2d7431 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:27 +0100 Subject: [PATCH 90/95] bpf: Use a table to drive helper arg type checks The mapping between bpf_arg_type and bpf_reg_type is encoded in a big hairy if statement that is hard to follow. The debug output also leaves to be desired: if a reg_type doesn't match we only print one of the options, instead printing all the valid ones. Convert the if statement into a table which is then used to drive type checking. If none of the reg_types match we print all options, e.g.: R2 type=rdonly_buf expected=fp, pkt, pkt_meta, map_value Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200921121227.255763-12-lmb@cloudflare.com --- include/linux/bpf.h | 1 + kernel/bpf/verifier.c | 183 +++++++++++++++++++++++++----------------- 2 files changed, 110 insertions(+), 74 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 87b0d5dcc1ff..fc5c901c7542 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -292,6 +292,7 @@ enum bpf_arg_type { ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */ ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ + __BPF_ARG_TYPE_MAX, }; /* type of values returned from helper functions */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 129416ea0256..15ab889b0a3f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3903,12 +3903,6 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type) type == ARG_CONST_SIZE_OR_ZERO; } -static bool arg_type_is_alloc_mem_ptr(enum bpf_arg_type type) -{ - return type == ARG_PTR_TO_ALLOC_MEM || - type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; -} - static bool arg_type_is_alloc_size(enum bpf_arg_type type) { return type == ARG_CONST_ALLOC_SIZE_OR_ZERO; @@ -3957,14 +3951,115 @@ static int resolve_map_arg_type(struct bpf_verifier_env *env, return 0; } +struct bpf_reg_types { + const enum bpf_reg_type types[10]; +}; + +static const struct bpf_reg_types map_key_value_types = { + .types = { + PTR_TO_STACK, + PTR_TO_PACKET, + PTR_TO_PACKET_META, + PTR_TO_MAP_VALUE, + }, +}; + +static const struct bpf_reg_types sock_types = { + .types = { + PTR_TO_SOCK_COMMON, + PTR_TO_SOCKET, + PTR_TO_TCP_SOCK, + PTR_TO_XDP_SOCK, + }, +}; + +static const struct bpf_reg_types mem_types = { + .types = { + PTR_TO_STACK, + PTR_TO_PACKET, + PTR_TO_PACKET_META, + PTR_TO_MAP_VALUE, + PTR_TO_MEM, + PTR_TO_RDONLY_BUF, + PTR_TO_RDWR_BUF, + }, +}; + +static const struct bpf_reg_types int_ptr_types = { + .types = { + PTR_TO_STACK, + PTR_TO_PACKET, + PTR_TO_PACKET_META, + PTR_TO_MAP_VALUE, + }, +}; + +static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } }; +static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } }; +static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } }; +static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM } }; +static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; +static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; +static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; + +static const struct bpf_reg_types *compatible_reg_types[] = { + [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, + [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types, + [ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types, + [ARG_PTR_TO_MAP_VALUE_OR_NULL] = &map_key_value_types, + [ARG_CONST_SIZE] = &scalar_types, + [ARG_CONST_SIZE_OR_ZERO] = &scalar_types, + [ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types, + [ARG_CONST_MAP_PTR] = &const_map_ptr_types, + [ARG_PTR_TO_CTX] = &context_types, + [ARG_PTR_TO_CTX_OR_NULL] = &context_types, + [ARG_PTR_TO_SOCK_COMMON] = &sock_types, + [ARG_PTR_TO_SOCKET] = &fullsock_types, + [ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types, + [ARG_PTR_TO_BTF_ID] = &btf_ptr_types, + [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types, + [ARG_PTR_TO_MEM] = &mem_types, + [ARG_PTR_TO_MEM_OR_NULL] = &mem_types, + [ARG_PTR_TO_UNINIT_MEM] = &mem_types, + [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types, + [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types, + [ARG_PTR_TO_INT] = &int_ptr_types, + [ARG_PTR_TO_LONG] = &int_ptr_types, + [__BPF_ARG_TYPE_MAX] = NULL, +}; + +static int check_reg_type(struct bpf_verifier_env *env, u32 regno, + const struct bpf_reg_types *compatible) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + enum bpf_reg_type expected, type = reg->type; + int i, j; + + for (i = 0; i < ARRAY_SIZE(compatible->types); i++) { + expected = compatible->types[i]; + if (expected == NOT_INIT) + break; + + if (type == expected) + return 0; + } + + verbose(env, "R%d type=%s expected=", regno, reg_type_str[type]); + for (j = 0; j + 1 < i; j++) + verbose(env, "%s, ", reg_type_str[compatible->types[j]]); + verbose(env, "%s\n", reg_type_str[compatible->types[j]]); + return -EACCES; +} + static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_call_arg_meta *meta, const struct bpf_func_proto *fn) { u32 regno = BPF_REG_1 + arg; struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; - enum bpf_reg_type expected_type, type = reg->type; enum bpf_arg_type arg_type = fn->arg_type[arg]; + const struct bpf_reg_types *compatible; + enum bpf_reg_type type = reg->type; int err = 0; if (arg_type == ARG_DONTCARE) @@ -4003,72 +4098,16 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, */ goto skip_type_check; - if (arg_type == ARG_PTR_TO_MAP_KEY || - arg_type == ARG_PTR_TO_MAP_VALUE || - arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE || - arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) { - expected_type = PTR_TO_STACK; - if (!type_is_pkt_pointer(type) && - type != PTR_TO_MAP_VALUE && - type != expected_type) - goto err_type; - } else if (arg_type == ARG_CONST_SIZE || - arg_type == ARG_CONST_SIZE_OR_ZERO || - arg_type == ARG_CONST_ALLOC_SIZE_OR_ZERO) { - expected_type = SCALAR_VALUE; - if (type != expected_type) - goto err_type; - } else if (arg_type == ARG_CONST_MAP_PTR) { - expected_type = CONST_PTR_TO_MAP; - if (type != expected_type) - goto err_type; - } else if (arg_type == ARG_PTR_TO_CTX || - arg_type == ARG_PTR_TO_CTX_OR_NULL) { - expected_type = PTR_TO_CTX; - if (type != expected_type) - goto err_type; - } else if (arg_type == ARG_PTR_TO_SOCK_COMMON) { - expected_type = PTR_TO_SOCK_COMMON; - /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */ - if (!type_is_sk_pointer(type)) - goto err_type; - } else if (arg_type == ARG_PTR_TO_SOCKET || - arg_type == ARG_PTR_TO_SOCKET_OR_NULL) { - expected_type = PTR_TO_SOCKET; - if (type != expected_type) - goto err_type; - } else if (arg_type == ARG_PTR_TO_BTF_ID) { - expected_type = PTR_TO_BTF_ID; - if (type != expected_type) - goto err_type; - } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { - expected_type = PTR_TO_MAP_VALUE; - if (type != expected_type) - goto err_type; - } else if (arg_type_is_mem_ptr(arg_type)) { - expected_type = PTR_TO_STACK; - if (!type_is_pkt_pointer(type) && - type != PTR_TO_MAP_VALUE && - type != PTR_TO_MEM && - type != PTR_TO_RDONLY_BUF && - type != PTR_TO_RDWR_BUF && - type != expected_type) - goto err_type; - } else if (arg_type_is_alloc_mem_ptr(arg_type)) { - expected_type = PTR_TO_MEM; - if (type != expected_type) - goto err_type; - } else if (arg_type_is_int_ptr(arg_type)) { - expected_type = PTR_TO_STACK; - if (!type_is_pkt_pointer(type) && - type != PTR_TO_MAP_VALUE && - type != expected_type) - goto err_type; - } else { - verbose(env, "unsupported arg_type %d\n", arg_type); + compatible = compatible_reg_types[arg_type]; + if (!compatible) { + verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type); return -EFAULT; } + err = check_reg_type(env, regno, compatible); + if (err) + return err; + if (type == PTR_TO_BTF_ID) { const u32 *btf_id = fn->arg_btf_id[arg]; @@ -4221,10 +4260,6 @@ skip_type_check: } return err; -err_type: - verbose(env, "R%d type=%s expected=%s\n", regno, - reg_type_str[type], reg_type_str[expected_type]); - return -EACCES; } static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id) From c69d2ddb2072eb7ffca1a31ee5ddc16dcd414ed9 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 16 Sep 2020 15:46:45 -0700 Subject: [PATCH 91/95] bpf: Using rcu_read_lock for bpf_sk_storage_map iterator If a bucket contains a lot of sockets, during bpf_iter traversing a bucket, concurrent userspace bpf_map_update_elem() and bpf program bpf_sk_storage_{get,delete}() may experience some undesirable delays as they will compete with bpf_iter for bucket lock. Note that the number of buckets for bpf_sk_storage_map is roughly the same as the number of cpus. So if there are lots of sockets in the system, each bucket could contain lots of sockets. Different actual use cases may experience different delays. Here, using selftest bpf_iter subtest bpf_sk_storage_map, I hacked the kernel with ktime_get_mono_fast_ns() to collect the time when a bucket was locked during bpf_iter prog traversing that bucket. This way, the maximum incurred delay was measured w.r.t. the number of elements in a bucket. # elems in each bucket delay(ns) 64 17000 256 72512 2048 875246 The potential delays will be further increased if we have even more elemnts in a bucket. Using rcu_read_lock() is a reasonable compromise here. It may lose some precision, e.g., access stale sockets, but it will not hurt performance of bpf program or user space application which also tries to get/delete or update map elements. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Cc: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200916224645.720172-1-yhs@fb.com --- net/core/bpf_sk_storage.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index d653a583dbc9..838efc682cff 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -674,6 +674,7 @@ struct bpf_iter_seq_sk_storage_map_info { static struct bpf_local_storage_elem * bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info, struct bpf_local_storage_elem *prev_selem) + __acquires(RCU) __releases(RCU) { struct bpf_local_storage *sk_storage; struct bpf_local_storage_elem *selem; @@ -692,16 +693,16 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info, selem = prev_selem; count = 0; while (selem) { - selem = hlist_entry_safe(selem->map_node.next, + selem = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&selem->map_node)), struct bpf_local_storage_elem, map_node); if (!selem) { /* not found, unlock and go to the next bucket */ b = &smap->buckets[bucket_id++]; - raw_spin_unlock_bh(&b->lock); + rcu_read_unlock(); skip_elems = 0; break; } - sk_storage = rcu_dereference_raw(selem->local_storage); + sk_storage = rcu_dereference(selem->local_storage); if (sk_storage) { info->skip_elems = skip_elems + count; return selem; @@ -711,10 +712,10 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info, for (i = bucket_id; i < (1U << smap->bucket_log); i++) { b = &smap->buckets[i]; - raw_spin_lock_bh(&b->lock); + rcu_read_lock(); count = 0; - hlist_for_each_entry(selem, &b->list, map_node) { - sk_storage = rcu_dereference_raw(selem->local_storage); + hlist_for_each_entry_rcu(selem, &b->list, map_node) { + sk_storage = rcu_dereference(selem->local_storage); if (sk_storage && count >= skip_elems) { info->bucket_id = i; info->skip_elems = count; @@ -722,7 +723,7 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info, } count++; } - raw_spin_unlock_bh(&b->lock); + rcu_read_unlock(); skip_elems = 0; } @@ -781,7 +782,7 @@ static int __bpf_sk_storage_map_seq_show(struct seq_file *seq, ctx.meta = &meta; ctx.map = info->map; if (selem) { - sk_storage = rcu_dereference_raw(selem->local_storage); + sk_storage = rcu_dereference(selem->local_storage); ctx.sk = sk_storage->owner; ctx.value = SDATA(selem)->data; } @@ -797,18 +798,12 @@ static int bpf_sk_storage_map_seq_show(struct seq_file *seq, void *v) } static void bpf_sk_storage_map_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) { - struct bpf_iter_seq_sk_storage_map_info *info = seq->private; - struct bpf_local_storage_map *smap; - struct bpf_local_storage_map_bucket *b; - - if (!v) { + if (!v) (void)__bpf_sk_storage_map_seq_show(seq, v); - } else { - smap = (struct bpf_local_storage_map *)info->map; - b = &smap->buckets[info->bucket_id]; - raw_spin_unlock_bh(&b->lock); - } + else + rcu_read_unlock(); } static int bpf_iter_init_sk_storage_map(void *priv_data, From a8a717963fe5ecfd274eb93dd1285ee9428ffca7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 18 Sep 2020 13:23:38 +0200 Subject: [PATCH 92/95] selftests/bpf: Fix stat probe in d_path test Some kernels builds might inline vfs_getattr call within fstat syscall code path, so fentry/vfs_getattr trampoline is not called. Add security_inode_getattr to allowlist and switch the d_path test stat trampoline to security_inode_getattr. Keeping dentry_open and filp_close, because they are in their own files, so unlikely to be inlined, but in case they are, adding security_file_open. Adding flags that indicate trampolines were called and failing the test if any of them got missed, so it's easier to identify the issue next time. Fixes: e4d1af4b16f8 ("selftests/bpf: Add test for d_path helper") Suggested-by: Alexei Starovoitov Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200918112338.2618444-1-jolsa@kernel.org --- kernel/trace/bpf_trace.c | 8 ++++++++ tools/testing/selftests/bpf/prog_tests/d_path.c | 10 ++++++++++ tools/testing/selftests/bpf/progs/test_d_path.c | 9 ++++++++- 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ebf9be4d0d6a..36508f46a8db 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1114,6 +1114,14 @@ BPF_CALL_3(bpf_d_path, struct path *, path, char *, buf, u32, sz) } BTF_SET_START(btf_allowlist_d_path) +#ifdef CONFIG_SECURITY +BTF_ID(func, security_file_permission) +BTF_ID(func, security_inode_getattr) +BTF_ID(func, security_file_open) +#endif +#ifdef CONFIG_SECURITY_PATH +BTF_ID(func, security_path_truncate) +#endif BTF_ID(func, vfs_truncate) BTF_ID(func, vfs_fallocate) BTF_ID(func, dentry_open) diff --git a/tools/testing/selftests/bpf/prog_tests/d_path.c b/tools/testing/selftests/bpf/prog_tests/d_path.c index fc12e0d445ff..0a577a248d34 100644 --- a/tools/testing/selftests/bpf/prog_tests/d_path.c +++ b/tools/testing/selftests/bpf/prog_tests/d_path.c @@ -120,6 +120,16 @@ void test_d_path(void) if (err < 0) goto cleanup; + if (CHECK(!bss->called_stat, + "stat", + "trampoline for security_inode_getattr was not called\n")) + goto cleanup; + + if (CHECK(!bss->called_close, + "close", + "trampoline for filp_close was not called\n")) + goto cleanup; + for (int i = 0; i < MAX_FILES; i++) { CHECK(strncmp(src.paths[i], bss->paths_stat[i], MAX_PATH_LEN), "check", diff --git a/tools/testing/selftests/bpf/progs/test_d_path.c b/tools/testing/selftests/bpf/progs/test_d_path.c index 61f007855649..84e1f883f97b 100644 --- a/tools/testing/selftests/bpf/progs/test_d_path.c +++ b/tools/testing/selftests/bpf/progs/test_d_path.c @@ -15,7 +15,10 @@ char paths_close[MAX_FILES][MAX_PATH_LEN] = {}; int rets_stat[MAX_FILES] = {}; int rets_close[MAX_FILES] = {}; -SEC("fentry/vfs_getattr") +int called_stat = 0; +int called_close = 0; + +SEC("fentry/security_inode_getattr") int BPF_PROG(prog_stat, struct path *path, struct kstat *stat, __u32 request_mask, unsigned int query_flags) { @@ -23,6 +26,8 @@ int BPF_PROG(prog_stat, struct path *path, struct kstat *stat, __u32 cnt = cnt_stat; int ret; + called_stat = 1; + if (pid != my_pid) return 0; @@ -42,6 +47,8 @@ int BPF_PROG(prog_close, struct file *file, void *id) __u32 cnt = cnt_close; int ret; + called_close = 1; + if (pid != my_pid) return 0; From 0789e13bc3f84f0adafe1935af036956638950f9 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Wed, 23 Sep 2020 17:01:55 +0100 Subject: [PATCH 93/95] bpf: Explicitly size compatible_reg_types Arrays with designated initializers have an implicit length of the highest initialized value plus one. I used this to ensure that newly added entries in enum bpf_reg_type get a NULL entry in compatible_reg_types. This is difficult to understand since it requires knowledge of the peculiarities of designated initializers. Use __BPF_ARG_TYPE_MAX to size the array instead. Suggested-by: Alexei Starovoitov Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200923160156.80814-1-lmb@cloudflare.com --- kernel/bpf/verifier.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 15ab889b0a3f..d7c993ded26a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4002,7 +4002,7 @@ static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_T static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; -static const struct bpf_reg_types *compatible_reg_types[] = { +static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types, [ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types, @@ -4025,7 +4025,6 @@ static const struct bpf_reg_types *compatible_reg_types[] = { [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types, [ARG_PTR_TO_INT] = &int_ptr_types, [ARG_PTR_TO_LONG] = &int_ptr_types, - [__BPF_ARG_TYPE_MAX] = NULL, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, From 017dab341ee75c0d3675b9b3d28be28d38e6defc Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 23 Sep 2020 20:57:34 +0200 Subject: [PATCH 94/95] bpf: Check CONFIG_BPF option for resolve_btfids Currently all the resolve_btfids 'users' are under CONFIG_BPF code, so if we have CONFIG_BPF disabled, resolve_btfids will fail, because there's no data to resolve. Disabling resolve_btfids if there's CONFIG_BPF disabled, so we won't fail such builds. Suggested-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200923185735.3048198-1-jolsa@kernel.org --- Makefile | 4 +++- scripts/link-vmlinux.sh | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index c4470a4e131f..b05682eea2c0 100644 --- a/Makefile +++ b/Makefile @@ -1084,13 +1084,15 @@ ifdef CONFIG_STACK_VALIDATION endif endif +ifdef CONFIG_BPF ifdef CONFIG_DEBUG_INFO_BTF ifeq ($(has_libelf),1) resolve_btfids_target := tools/bpf/resolve_btfids FORCE else ERROR_RESOLVE_BTFIDS := 1 endif -endif +endif # CONFIG_DEBUG_INFO_BTF +endif # CONFIG_BPF PHONY += prepare0 diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index e6e2d9e5ff48..dbde59d343b1 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -341,9 +341,9 @@ fi vmlinux_link vmlinux "${kallsymso}" ${btf_vmlinux_bin_o} # fill in BTF IDs -if [ -n "${CONFIG_DEBUG_INFO_BTF}" ]; then -info BTFIDS vmlinux -${RESOLVE_BTFIDS} vmlinux +if [ -n "${CONFIG_DEBUG_INFO_BTF}" -a -n "${CONFIG_BPF}" ]; then + info BTFIDS vmlinux + ${RESOLVE_BTFIDS} vmlinux fi if [ -n "${CONFIG_BUILDTIME_TABLE_SORT}" ]; then From dc3652d3f0d5479768ec8eb7f7aabbba6ed75d95 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 23 Sep 2020 20:57:35 +0200 Subject: [PATCH 95/95] tools resolve_btfids: Always force HOSTARCH Seth reported problem with cross builds, that fail on resolve_btfids build, because we are trying to build it on cross build arch. Fixing this by always forcing the host arch. Reported-by: Seth Forshee Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200923185735.3048198-2-jolsa@kernel.org --- tools/bpf/resolve_btfids/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/bpf/resolve_btfids/Makefile b/tools/bpf/resolve_btfids/Makefile index a88cd4426398..d3c818b8d8d3 100644 --- a/tools/bpf/resolve_btfids/Makefile +++ b/tools/bpf/resolve_btfids/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only include ../../scripts/Makefile.include +include ../../scripts/Makefile.arch ifeq ($(srctree),) srctree := $(patsubst %/,%,$(dir $(CURDIR))) @@ -29,6 +30,7 @@ endif AR = $(HOSTAR) CC = $(HOSTCC) LD = $(HOSTLD) +ARCH = $(HOSTARCH) OUTPUT ?= $(srctree)/tools/bpf/resolve_btfids/