bpf-for-netdev

-----BEGIN PGP SIGNATURE-----
 
 iHUEABYIAB0WIQTFp0I1jqZrAX+hPRXbK58LschIgwUCZnlmXgAKCRDbK58LschI
 g2ovAP9iynwwFEjMSxHjQVXSq1J1PMqF4966vmy30RCKJMMN/QD/SRsRRKcfsPis
 BzKOdsOVbWlDl2CUqvBrPZGT6laKoQc=
 =6/0V
 -----END PGP SIGNATURE-----

Merge tag 'for-netdev' of ssh://gitolite.kernel.org/pub/scm/linux/kernel/git/bpf/bpf

Daniel Borkmann says:

====================
pull-request: bpf 2024-06-24

We've added 12 non-merge commits during the last 10 day(s) which contain
a total of 10 files changed, 412 insertions(+), 16 deletions(-).

The main changes are:

1) Fix a BPF verifier issue validating may_goto with a negative offset,
   from Alexei Starovoitov.

2) Fix a BPF verifier validation bug with may_goto combined with jump to
   the first instruction, also from Alexei Starovoitov.

3) Fix a bug with overrunning reservations in BPF ring buffer,
   from Daniel Borkmann.

4) Fix a bug in BPF verifier due to missing proper var_off setting related
   to movsx instruction, from Yonghong Song.

5) Silence unnecessary syzkaller-triggered warning in __xdp_reg_mem_model(),
   from Daniil Dulov.

* tag 'for-netdev' of ssh://gitolite.kernel.org/pub/scm/linux/kernel/git/bpf/bpf:
  xdp: Remove WARN() from __xdp_reg_mem_model()
  selftests/bpf: Add tests for may_goto with negative offset.
  bpf: Fix may_goto with negative offset.
  selftests/bpf: Add more ring buffer test coverage
  bpf: Fix overrunning reservations in ringbuf
  selftests/bpf: Tests with may_goto and jumps to the 1st insn
  bpf: Fix the corner case with may_goto and jump to the 1st insn.
  bpf: Update BPF LSM maintainer list
  bpf: Fix remap of arena.
  selftests/bpf: Add a few tests to cover
  bpf: Add missed var_off setting in coerce_subreg_to_size_sx()
  bpf: Add missed var_off setting in set_sext32_default_val()
====================

Link: https://patch.msgid.link/20240624124330.8401-1-daniel@iogearbox.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Jakub Kicinski 2024-06-24 18:15:21 -07:00
commit 482000cf7f
10 changed files with 412 additions and 16 deletions

View File

@ -4083,12 +4083,13 @@ F: kernel/bpf/ringbuf.c
BPF [SECURITY & LSM] (Security Audit and Enforcement using BPF)
M: KP Singh <kpsingh@kernel.org>
R: Matt Bobrowski <mattbobrowski@google.com>
M: Matt Bobrowski <mattbobrowski@google.com>
L: bpf@vger.kernel.org
S: Maintained
F: Documentation/bpf/prog_lsm.rst
F: include/linux/bpf_lsm.h
F: kernel/bpf/bpf_lsm.c
F: kernel/trace/bpf_trace.c
F: security/bpf/
BPF [SELFTESTS] (Test Runners & Infrastructure)

View File

@ -212,6 +212,7 @@ static u64 arena_map_mem_usage(const struct bpf_map *map)
/* Bookkeeping entry for one VMA that maps the arena; linked into arena->vma_list. */
struct vma_list {
struct vm_area_struct *vma;
struct list_head head;
/* set to 1 in remember_vma(), incremented in arena_vm_open(), dropped in arena_vm_close() */
atomic_t mmap_count;
};
static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma)
@ -221,20 +222,30 @@ static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma)
vml = kmalloc(sizeof(*vml), GFP_KERNEL);
if (!vml)
return -ENOMEM;
atomic_set(&vml->mmap_count, 1);
vma->vm_private_data = vml;
vml->vma = vma;
list_add(&vml->head, &arena->vma_list);
return 0;
}
/*
 * .open callback of arena_vm_ops: take one more reference on the
 * vma_list entry stored in the VMA's vm_private_data.
 */
static void arena_vm_open(struct vm_area_struct *vma)
{
	struct vma_list *entry = vma->vm_private_data;

	atomic_inc(&entry->mmap_count);
}
static void arena_vm_close(struct vm_area_struct *vma)
{
struct bpf_map *map = vma->vm_file->private_data;
struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
struct vma_list *vml;
struct vma_list *vml = vma->vm_private_data;
if (!atomic_dec_and_test(&vml->mmap_count))
return;
guard(mutex)(&arena->lock);
vml = vma->vm_private_data;
/* update link list under lock */
list_del(&vml->head);
vma->vm_private_data = NULL;
kfree(vml);
@ -287,6 +298,7 @@ out:
}
/*
 * VMA callbacks for arena mappings: open/close adjust the per-VMA
 * mmap_count, fault is delegated to arena_vm_fault().
 */
static const struct vm_operations_struct arena_vm_ops = {
.open = arena_vm_open,
.close = arena_vm_close,
.fault = arena_vm_fault,
};

View File

@ -51,7 +51,8 @@ struct bpf_ringbuf {
* This prevents a user-space application from modifying the
* position and ruining in-kernel tracking. The permissions of the
* pages depend on who is producing samples: user-space or the
* kernel.
* kernel. Note that the pending counter is placed in the same
* page as the producer, so that it shares the same cache line.
*
* Kernel-producer
* ---------------
@ -70,6 +71,7 @@ struct bpf_ringbuf {
*/
unsigned long consumer_pos __aligned(PAGE_SIZE);
unsigned long producer_pos __aligned(PAGE_SIZE);
unsigned long pending_pos;
char data[] __aligned(PAGE_SIZE);
};
@ -179,6 +181,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
rb->mask = data_sz - 1;
rb->consumer_pos = 0;
rb->producer_pos = 0;
rb->pending_pos = 0;
return rb;
}
@ -404,9 +407,9 @@ bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr)
static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
{
unsigned long cons_pos, prod_pos, new_prod_pos, flags;
u32 len, pg_off;
unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, flags;
struct bpf_ringbuf_hdr *hdr;
u32 len, pg_off, tmp_size, hdr_len;
if (unlikely(size > RINGBUF_MAX_RECORD_SZ))
return NULL;
@ -424,13 +427,29 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
spin_lock_irqsave(&rb->spinlock, flags);
}
pend_pos = rb->pending_pos;
prod_pos = rb->producer_pos;
new_prod_pos = prod_pos + len;
/* check for out of ringbuf space by ensuring producer position
* doesn't advance more than (ringbuf_size - 1) ahead
while (pend_pos < prod_pos) {
hdr = (void *)rb->data + (pend_pos & rb->mask);
hdr_len = READ_ONCE(hdr->len);
if (hdr_len & BPF_RINGBUF_BUSY_BIT)
break;
tmp_size = hdr_len & ~BPF_RINGBUF_DISCARD_BIT;
tmp_size = round_up(tmp_size + BPF_RINGBUF_HDR_SZ, 8);
pend_pos += tmp_size;
}
rb->pending_pos = pend_pos;
/* check for out of ringbuf space:
* - by ensuring producer position doesn't advance more than
* (ringbuf_size - 1) ahead
* - by ensuring oldest not yet committed record until newest
* record does not span more than (ringbuf_size - 1)
*/
if (new_prod_pos - cons_pos > rb->mask) {
if (new_prod_pos - cons_pos > rb->mask ||
new_prod_pos - pend_pos > rb->mask) {
spin_unlock_irqrestore(&rb->spinlock, flags);
return NULL;
}

View File

@ -6236,6 +6236,7 @@ static void set_sext32_default_val(struct bpf_reg_state *reg, int size)
}
reg->u32_min_value = 0;
reg->u32_max_value = U32_MAX;
reg->var_off = tnum_subreg(tnum_unknown);
}
static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size)
@ -6280,6 +6281,7 @@ static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size)
reg->s32_max_value = s32_max;
reg->u32_min_value = (u32)s32_min;
reg->u32_max_value = (u32)s32_max;
reg->var_off = tnum_subreg(tnum_range(s32_min, s32_max));
return;
}
@ -12719,6 +12721,16 @@ static bool signed_add32_overflows(s32 a, s32 b)
return res < a;
}
/* Returns true when the signed 16-bit sum a + b does not fit in s16. */
static bool signed_add16_overflows(s16 a, s16 b)
{
	/* Add the unsigned representations, then truncate back to 16 bits */
	s16 sum = (s16)((u16)a + (u16)b);

	/*
	 * A negative addend must not make the result larger than 'a';
	 * a non-negative addend must not make it smaller.
	 */
	return b < 0 ? sum > a : sum < a;
}
static bool signed_sub_overflows(s64 a, s64 b)
{
/* Do the sub in u64, where overflow is well-defined */
@ -17448,11 +17460,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
goto skip_inf_loop_check;
}
if (is_may_goto_insn_at(env, insn_idx)) {
if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
if (sl->state.may_goto_depth != cur->may_goto_depth &&
states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
update_loop_entry(cur, &sl->state);
goto hit;
}
goto skip_inf_loop_check;
}
if (calls_callback(env, insn_idx)) {
if (states_equal(env, &sl->state, cur, RANGE_WITHIN))
@ -18730,6 +18742,39 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
return new_prog;
}
/*
 * For all jmp insns in a given 'prog' that point to 'tgt_idx' insn adjust the
 * jump offset by 'delta'.
 *
 * BPF_JMP32 | BPF_JA keeps its displacement in the 32-bit 'imm' field; every
 * other jump handled here keeps it in the 16-bit 'off' field. The overflow
 * check is done on the same field that is about to be updated.
 *
 * Returns 0 on success, or -ERANGE if an adjusted displacement would not fit
 * in its field. On -ERANGE, insns visited before the failing one have already
 * been adjusted.
 */
static int adjust_jmp_off(struct bpf_prog *prog, u32 tgt_idx, u32 delta)
{
	struct bpf_insn *insn = prog->insnsi;
	u32 insn_cnt = prog->len, i;

	for (i = 0; i < insn_cnt; i++, insn++) {
		u8 code = insn->code;

		/* Only jump-class insns carry a displacement; calls and exits do not */
		if ((BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) ||
		    BPF_OP(code) == BPF_CALL || BPF_OP(code) == BPF_EXIT)
			continue;

		if (insn->code == (BPF_JMP32 | BPF_JA)) {
			/* 32-bit displacement lives in 'imm' */
			if (i + 1 + insn->imm != tgt_idx)
				continue;
			if (signed_add32_overflows(insn->imm, delta))
				return -ERANGE;
			insn->imm += delta;
		} else {
			/* 16-bit displacement lives in 'off'; check 'off', not 'imm' */
			if (i + 1 + insn->off != tgt_idx)
				continue;
			if (signed_add16_overflows(insn->off, delta))
				return -ERANGE;
			insn->off += delta;
		}
	}
	return 0;
}
static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env,
u32 off, u32 cnt)
{
@ -20004,7 +20049,10 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
stack_depth_extra = 8;
insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off);
insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2);
if (insn->off >= 0)
insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2);
else
insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1);
insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
insn_buf[3] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off);
cnt = 4;
@ -20546,6 +20594,13 @@ next_insn:
if (!new_prog)
return -ENOMEM;
env->prog = prog = new_prog;
/*
* If may_goto is a first insn of a prog there could be a jmp
* insn that points to it, hence adjust all such jmps to point
* to insn after BPF_ST that inits may_goto count.
* Adjustment will succeed because bpf_patch_insn_data() didn't fail.
*/
WARN_ON(adjust_jmp_off(env->prog, subprog_start, 1));
}
/* Since poke tab is now finalized, publish aux to tracker. */

View File

@ -295,10 +295,8 @@ static struct xdp_mem_allocator *__xdp_reg_mem_model(struct xdp_mem_info *mem,
mutex_lock(&mem_id_lock);
ret = __mem_id_init_hash_table();
mutex_unlock(&mem_id_lock);
if (ret < 0) {
WARN_ON(1);
if (ret < 0)
return ERR_PTR(ret);
}
}
xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp);

View File

@ -457,7 +457,7 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \
LSKELS := fentry_test.c fexit_test.c fexit_sleep.c atomics.c \
trace_printk.c trace_vprintk.c map_ptr_kern.c \
core_kern.c core_kern_overflow.c test_ringbuf.c \
test_ringbuf_n.c test_ringbuf_map_key.c
test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c
# Generate both light skeleton and libbpf skeleton for these
LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test.c \

View File

@ -12,9 +12,11 @@
#include <sys/sysinfo.h>
#include <linux/perf_event.h>
#include <linux/ring_buffer.h>
#include "test_ringbuf.lskel.h"
#include "test_ringbuf_n.lskel.h"
#include "test_ringbuf_map_key.lskel.h"
#include "test_ringbuf_write.lskel.h"
#define EDONE 7777
@ -84,6 +86,58 @@ static void *poll_thread(void *input)
return (void *)(long)ring_buffer__poll(ringbuf, timeout);
}
/*
 * Subtest for ring buffer reservation limits: the ring buffer is sized to
 * 0x4000 bytes and the attached program attempts two 0x3000-byte
 * reservations per getpgid() call. Both triggered calls are expected to end
 * on the program's "discarded" path and none on its "passed" path.
 */
static void ringbuf_write_subtest(void)
{
struct test_ringbuf_write_lskel *skel;
int page_size = getpagesize();
size_t *mmap_ptr;
int err, rb_fd;
skel = test_ringbuf_write_lskel__open();
if (!ASSERT_OK_PTR(skel, "skel_open"))
return;
/* ring buffer size used by this subtest */
skel->maps.ringbuf.max_entries = 0x4000;
err = test_ringbuf_write_lskel__load(skel);
if (!ASSERT_OK(err, "skel_load"))
goto cleanup;
rb_fd = skel->maps.ringbuf.map_fd;
/*
 * Map the page at offset 0 of the ring buffer fd read-write and store
 * 0x3000 in its first word, then unmap it again. The assert label
 * ("rw_cons_pos") suggests this word is the consumer position --
 * NOTE(review): confirm against the ring buffer mmap layout.
 */
mmap_ptr = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, rb_fd, 0);
if (!ASSERT_OK_PTR(mmap_ptr, "rw_cons_pos"))
goto cleanup;
*mmap_ptr = 0x3000;
ASSERT_OK(munmap(mmap_ptr, page_size), "unmap_rw");
/* the BPF program ignores calls from any other pid */
skel->bss->pid = getpid();
ringbuf = ring_buffer__new(rb_fd, process_sample, NULL, NULL);
if (!ASSERT_OK_PTR(ringbuf, "ringbuf_new"))
goto cleanup;
err = test_ringbuf_write_lskel__attach(skel);
if (!ASSERT_OK(err, "skel_attach"))
goto cleanup_ringbuf;
/* reset the program's output counters before triggering it */
skel->bss->discarded = 0;
skel->bss->passed = 0;
/* trigger exactly two samples */
syscall(__NR_getpgid);
syscall(__NR_getpgid);
/* each trigger must have had its second reservation refused */
ASSERT_EQ(skel->bss->discarded, 2, "discarded");
ASSERT_EQ(skel->bss->passed, 0, "passed");
test_ringbuf_write_lskel__detach(skel);
cleanup_ringbuf:
ring_buffer__free(ringbuf);
cleanup:
test_ringbuf_write_lskel__destroy(skel);
}
static void ringbuf_subtest(void)
{
const size_t rec_sz = BPF_RINGBUF_HDR_SZ + sizeof(struct sample);
@ -451,4 +505,6 @@ void test_ringbuf(void)
ringbuf_n_subtest();
if (test__start_subtest("ringbuf_map_key"))
ringbuf_map_key_subtest();
if (test__start_subtest("ringbuf_write"))
ringbuf_write_subtest();
}

View File

@ -0,0 +1,46 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include "bpf_misc.h"
char _license[] SEC("license") = "GPL";
struct {
__uint(type, BPF_MAP_TYPE_RINGBUF);
} ringbuf SEC(".maps");
/* inputs */
int pid = 0;
/* outputs */
long passed = 0;
long discarded = 0;
/*
 * Runs on entry to sys_getpgid for the task whose tgid equals 'pid'.
 * Attempts two 0x3000-byte reservations back to back and records in
 * 'discarded' / 'passed' whether the second one was refused or granted.
 */
SEC("fentry/" SYS_PREFIX "sys_getpgid")
int test_ringbuf_write(void *ctx)
{
int *foo, cur_pid = bpf_get_current_pid_tgid() >> 32;
void *sample1, *sample2;
/* ignore calls made by other processes */
if (cur_pid != pid)
return 0;
sample1 = bpf_ringbuf_reserve(&ringbuf, 0x3000, 0);
if (!sample1)
return 0;
/* first one can pass */
sample2 = bpf_ringbuf_reserve(&ringbuf, 0x3000, 0);
if (!sample2) {
/* expected outcome: second reservation refused, release the first */
bpf_ringbuf_discard(sample1, 0);
__sync_fetch_and_add(&discarded, 1);
return 0;
}
/* second one must not */
__sync_fetch_and_add(&passed, 1);
/*
 * Store 256 at byte offset 4084 of the second sample.
 * NOTE(review): presumably chosen so that the store lands on the first
 * record's header when the two reservations overlap -- confirm.
 */
foo = sample2 + 4084;
*foo = 256;
bpf_ringbuf_discard(sample1, 0);
bpf_ringbuf_discard(sample2, 0);
return 0;
}

View File

@ -274,6 +274,58 @@ static __naked void iter_limit_bug_cb(void)
);
}
int tmp_var;
/*
 * may_goto (hand-encoded as opcode 0xe5) with a backward offset of -3,
 * placed after a 64-bit immediate load and followed by a conditional jump
 * back to it. Loading is expected to fail with
 * "infinite loop detected at insn 2".
 */
SEC("socket")
__failure __msg("infinite loop detected at insn 2")
__naked void jgt_imm64_and_may_goto(void)
{
asm volatile (" \
r0 = %[tmp_var] ll; \
l0_%=: .byte 0xe5; /* may_goto */ \
.byte 0; /* regs */ \
.short -3; /* off -3 */ \
.long 0; /* imm */ \
if r0 > 10 goto l0_%=; \
r0 = 0; \
exit; \
" :: __imm_addr(tmp_var)
: __clobber_all);
}
/*
 * may_goto with offset -1, i.e. targeting itself, followed by a conditional
 * jump back to it. Loading is expected to fail with
 * "infinite loop detected at insn 1".
 */
SEC("socket")
__failure __msg("infinite loop detected at insn 1")
__naked void may_goto_self(void)
{
asm volatile (" \
r0 = *(u32 *)(r10 - 4); \
l0_%=: .byte 0xe5; /* may_goto */ \
.byte 0; /* regs */ \
.short -1; /* off -1 */ \
.long 0; /* imm */ \
if r0 > 10 goto l0_%=; \
r0 = 0; \
exit; \
" ::: __clobber_all);
}
/*
 * may_goto with offset -2, which points back at the preceding "goto l1"
 * instruction; that jump leads to the exit path. The program is expected to
 * load and return 0.
 */
SEC("socket")
__success __retval(0)
__naked void may_goto_neg_off(void)
{
asm volatile (" \
r0 = *(u32 *)(r10 - 4); \
goto l0_%=; \
goto l1_%=; \
l0_%=: .byte 0xe5; /* may_goto */ \
.byte 0; /* regs */ \
.short -2; /* off -2 */ \
.long 0; /* imm */ \
if r0 > 10 goto l0_%=; \
l1_%=: r0 = 0; \
exit; \
" ::: __clobber_all);
}
SEC("tc")
__failure
__flag(BPF_F_TEST_STATE_FREQ)
@ -307,6 +359,100 @@ int iter_limit_bug(struct __sk_buff *skb)
return 0;
}
/*
 * may_goto is the very first instruction and an unconditional "goto" jumps
 * back to it. The program is expected to load and return 0.
 */
SEC("socket")
__success __retval(0)
__naked void ja_and_may_goto(void)
{
asm volatile (" \
l0_%=: .byte 0xe5; /* may_goto */ \
.byte 0; /* regs */ \
.short 1; /* off 1 */ \
.long 0; /* imm */ \
goto l0_%=; \
r0 = 0; \
exit; \
" ::: __clobber_common);
}
/*
 * Variant where the back-jump target is a regular first instruction
 * ("r0 = 0") and may_goto comes second. The program is expected to load and
 * return 0.
 */
SEC("socket")
__success __retval(0)
__naked void ja_and_may_goto2(void)
{
asm volatile (" \
l0_%=: r0 = 0; \
.byte 0xe5; /* may_goto */ \
.byte 0; /* regs */ \
.short 1; /* off 1 */ \
.long 0; /* imm */ \
goto l0_%=; \
r0 = 0; \
exit; \
" ::: __clobber_common);
}
/*
 * Loop whose first instruction is a call to bpf_jiffies64, followed by
 * may_goto and a conditional jump ("r0 < 10") back to the first instruction.
 * The program is expected to load and return 0.
 */
SEC("socket")
__success __retval(0)
__naked void jlt_and_may_goto(void)
{
asm volatile (" \
l0_%=: call %[bpf_jiffies64]; \
.byte 0xe5; /* may_goto */ \
.byte 0; /* regs */ \
.short 1; /* off 1 */ \
.long 0; /* imm */ \
if r0 < 10 goto l0_%=; \
r0 = 0; \
exit; \
" :: __imm(bpf_jiffies64)
: __clobber_all);
}
/*
 * Same loop shape as the "goto" variants, but the back-jump to the first
 * instruction uses "gotol". Only built for the listed target architectures
 * and clang >= 18. The program is expected to load and return 0.
 */
#if (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \
(defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64) || \
defined(__TARGET_ARCH_arm) || defined(__TARGET_ARCH_s390) || \
defined(__TARGET_ARCH_loongarch)) && \
__clang_major__ >= 18
SEC("socket")
__success __retval(0)
__naked void gotol_and_may_goto(void)
{
asm volatile (" \
l0_%=: r0 = 0; \
.byte 0xe5; /* may_goto */ \
.byte 0; /* regs */ \
.short 1; /* off 1 */ \
.long 0; /* imm */ \
gotol l0_%=; \
r0 = 0; \
exit; \
" ::: __clobber_common);
}
#endif
/*
 * Main program that only calls subprog_with_may_goto and exits. The program
 * is expected to load and return 0.
 */
SEC("socket")
__success __retval(0)
__naked void ja_and_may_goto_subprog(void)
{
asm volatile (" \
call subprog_with_may_goto; \
exit; \
" ::: __clobber_all);
}
/*
 * Subprogram whose first instruction is may_goto, with an unconditional
 * "goto" jumping back to it; called from ja_and_may_goto_subprog.
 */
static __naked __noinline __used
void subprog_with_may_goto(void)
{
asm volatile (" \
l0_%=: .byte 0xe5; /* may_goto */ \
.byte 0; /* regs */ \
.short 1; /* off 1 */ \
.long 0; /* imm */ \
goto l0_%=; \
r0 = 0; \
exit; \
" ::: __clobber_all);
}
#define ARR_SZ 1000000
int zero;
char arr[ARR_SZ];

View File

@ -224,6 +224,69 @@ l0_%=: \
: __clobber_all);
}
/*
 * Loads a byte from the stack, sign-extends its low 8 bits into w7 and loops
 * back to the start while w7 >= 0x2533823b (unsigned 32-bit compare).
 * Privileged load is expected to fail with "infinite loop detected";
 * unprivileged load with "back-edge from insn 2 to 0".
 */
SEC("socket")
__description("MOV32SX, S8, var_off u32_max")
__failure __msg("infinite loop detected")
__failure_unpriv __msg_unpriv("back-edge from insn 2 to 0")
__naked void mov64sx_s32_varoff_1(void)
{
asm volatile (" \
l0_%=: \
r3 = *(u8 *)(r10 -387); \
w7 = (s8)w3; \
if w7 >= 0x2533823b goto l0_%=; \
w0 = 0; \
exit; \
" :
:
: __clobber_all);
}
/*
 * Masks a random value to its low 4 bits, sign-extends it from 8 bits into
 * w7 and branches on "w7 s>= 16" to a block that writes r10. Privileged load
 * is expected to succeed and return 0; unprivileged load is expected to fail
 * with "frame pointer is read only".
 */
SEC("socket")
__description("MOV32SX, S8, var_off not u32_max, positive after s8 extension")
__success __retval(0)
__failure_unpriv __msg_unpriv("frame pointer is read only")
__naked void mov64sx_s32_varoff_2(void)
{
asm volatile (" \
call %[bpf_get_prandom_u32]; \
r3 = r0; \
r3 &= 0xf; \
w7 = (s8)w3; \
if w7 s>= 16 goto l0_%=; \
w0 = 0; \
exit; \
l0_%=: \
r10 = 1; \
exit; \
" :
: __imm(bpf_get_prandom_u32)
: __clobber_all);
}
/*
 * Like the previous test, but bit 7 is forced on (r3 |= 0x80) before the
 * 8-bit sign extension, and the branch condition is "w7 s>= -5". Privileged
 * load is expected to succeed and return 0; unprivileged load is expected to
 * fail with "frame pointer is read only".
 */
SEC("socket")
__description("MOV32SX, S8, var_off not u32_max, negative after s8 extension")
__success __retval(0)
__failure_unpriv __msg_unpriv("frame pointer is read only")
__naked void mov64sx_s32_varoff_3(void)
{
asm volatile (" \
call %[bpf_get_prandom_u32]; \
r3 = r0; \
r3 &= 0xf; \
r3 |= 0x80; \
w7 = (s8)w3; \
if w7 s>= -5 goto l0_%=; \
w0 = 0; \
exit; \
l0_%=: \
r10 = 1; \
exit; \
" :
: __imm(bpf_get_prandom_u32)
: __clobber_all);
}
#else
SEC("socket")