2014-12-02 07:06:37 +08:00
|
|
|
#include <stdio.h>
|
|
|
|
#include <sys/types.h>
|
|
|
|
#include <sys/stat.h>
|
|
|
|
#include <fcntl.h>
|
|
|
|
#include <libelf.h>
|
|
|
|
#include <gelf.h>
|
|
|
|
#include <errno.h>
|
|
|
|
#include <unistd.h>
|
|
|
|
#include <string.h>
|
|
|
|
#include <stdbool.h>
|
2015-03-26 03:49:23 +08:00
|
|
|
#include <stdlib.h>
|
2014-12-02 07:06:37 +08:00
|
|
|
#include <linux/bpf.h>
|
|
|
|
#include <linux/filter.h>
|
2015-03-26 03:49:23 +08:00
|
|
|
#include <linux/perf_event.h>
|
2016-12-08 07:53:14 +08:00
|
|
|
#include <linux/netlink.h>
|
|
|
|
#include <linux/rtnetlink.h>
|
|
|
|
#include <sys/types.h>
|
|
|
|
#include <sys/socket.h>
|
2015-03-26 03:49:23 +08:00
|
|
|
#include <sys/syscall.h>
|
|
|
|
#include <sys/ioctl.h>
|
|
|
|
#include <sys/mman.h>
|
|
|
|
#include <poll.h>
|
samples/bpf: bpf_tail_call example for tracing
kprobe example that demonstrates what future seccomp programs may look like.
It attaches to seccomp_phase1() function and tail-calls other BPF programs
depending on syscall number.
Existing optimized classic BPF seccomp programs generated by Chrome look like:
if (sd.nr < 121) {
if (sd.nr < 57) {
if (sd.nr < 22) {
if (sd.nr < 7) {
if (sd.nr < 4) {
if (sd.nr < 1) {
check sys_read
} else {
if (sd.nr < 3) {
check sys_write and sys_open
} else {
check sys_close
}
}
} else {
} else {
} else {
} else {
} else {
}
the future seccomp using native eBPF may look like:
bpf_tail_call(&sd, &syscall_jmp_table, sd.nr);
which is simpler, faster and leaves more room for per-syscall checks.
Usage:
$ sudo ./tracex5
<...>-366 [001] d... 4.870033: : read(fd=1, buf=00007f6d5bebf000, size=771)
<...>-369 [003] d... 4.870066: : mmap
<...>-369 [003] d... 4.870077: : syscall=110 (one of get/set uid/pid/gid)
<...>-369 [003] d... 4.870089: : syscall=107 (one of get/set uid/pid/gid)
sh-369 [000] d... 4.891740: : read(fd=0, buf=00000000023d1000, size=512)
sh-369 [000] d... 4.891747: : write(fd=1, buf=00000000023d3000, size=512)
sh-369 [000] d... 4.891747: : read(fd=1, buf=00000000023d3000, size=512)
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:05 +08:00
|
|
|
#include <ctype.h>
|
2014-12-02 07:06:37 +08:00
|
|
|
#include "libbpf.h"
|
|
|
|
#include "bpf_load.h"
|
2016-12-09 10:46:19 +08:00
|
|
|
#include "perf-sys.h"
|
2014-12-02 07:06:37 +08:00
|
|
|
|
2015-03-26 03:49:23 +08:00
|
|
|
/* Path prefix of the tracing directory under debugfs. */
#define DEBUGFS "/sys/kernel/debug/tracing/"
|
|
|
|
|
2014-12-02 07:06:37 +08:00
|
|
|
/*
 * Buffer for a license string (up to 128 bytes including the terminator).
 * NOTE(review): presumably filled from the object's "license" ELF section;
 * confirm against the loader code.
 */
static char license[128];
|
2015-03-26 03:49:23 +08:00
|
|
|
/*
 * Kernel version value kept at file scope.
 * NOTE(review): presumably read from the object's "version" ELF section;
 * confirm against the loader code.
 */
static int kern_version;
|
2014-12-02 07:06:37 +08:00
|
|
|
/*
 * One flag per slot, 128 slots.
 * NOTE(review): presumably indexed by ELF section number to mark sections
 * that were already handled; confirm against the loader code.
 */
static bool processed_sec[128];
|
samples/bpf: Make samples more libbpf-centric
Switch all of the sample code to use the function names from
tools/lib/bpf so that they're consistent with that, and to declare their
own log buffers. This allows the next commit to be purely devoted to
getting rid of the duplicate library in samples/bpf.
Committer notes:
Testing it:
On a fedora rawhide container, with clang/llvm 3.9, sharing the host
linux kernel git tree:
# make O=/tmp/build/linux/ headers_install
# make O=/tmp/build/linux -C samples/bpf/
Since I forgot to make it privileged, just tested it outside the
container, using what it generated:
# uname -a
Linux jouet 4.9.0-rc8+ #1 SMP Mon Dec 12 11:20:49 BRT 2016 x86_64 x86_64 x86_64 GNU/Linux
# cd /var/lib/docker/devicemapper/mnt/c43e09a53ff56c86a07baf79847f00e2cc2a17a1e2220e1adbf8cbc62734feda/rootfs/tmp/build/linux/samples/bpf/
# ls -la offwaketime
-rwxr-xr-x. 1 root root 24200 Dec 15 12:19 offwaketime
# file offwaketime
offwaketime: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, for GNU/Linux 2.6.32, BuildID[sha1]=c940d3f127d5e66cdd680e42d885cb0b64f8a0e4, not stripped
# readelf -SW offwaketime_kern.o | grep PROGBITS
[ 2] .text PROGBITS 0000000000000000 000040 000000 00 AX 0 0 4
[ 3] kprobe/try_to_wake_up PROGBITS 0000000000000000 000040 0000d8 00 AX 0 0 8
[ 5] tracepoint/sched/sched_switch PROGBITS 0000000000000000 000118 000318 00 AX 0 0 8
[ 7] maps PROGBITS 0000000000000000 000430 000050 00 WA 0 0 4
[ 8] license PROGBITS 0000000000000000 000480 000004 00 WA 0 0 1
[ 9] version PROGBITS 0000000000000000 000484 000004 00 WA 0 0 4
# ./offwaketime | head -5
swapper/1;start_secondary;cpu_startup_entry;schedule_preempt_disabled;schedule;__schedule;-;---;; 106
CPU 0/KVM;entry_SYSCALL_64_fastpath;sys_ioctl;do_vfs_ioctl;kvm_vcpu_ioctl;kvm_arch_vcpu_ioctl_run;kvm_vcpu_block;schedule;__schedule;-;try_to_wake_up;swake_up_locked;swake_up;apic_timer_expired;apic_timer_fn;__hrtimer_run_queues;hrtimer_interrupt;local_apic_timer_interrupt;smp_apic_timer_interrupt;__irqentry_text_start;cpuidle_enter;call_cpuidle;cpu_startup_entry;start_secondary;;swapper/3 2
Compositor;entry_SYSCALL_64_fastpath;sys_futex;do_futex;futex_wait;futex_wait_queue_me;schedule;__schedule;-;try_to_wake_up;futex_requeue;do_futex;sys_futex;entry_SYSCALL_64_fastpath;;SoftwareVsyncTh 5
firefox;entry_SYSCALL_64_fastpath;sys_poll;do_sys_poll;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule;-;try_to_wake_up;pollwake;__wake_up_common;__wake_up_sync_key;pipe_write;__vfs_write;vfs_write;sys_write;entry_SYSCALL_64_fastpath;;Timer 13
JS Helper;entry_SYSCALL_64_fastpath;sys_futex;do_futex;futex_wait;futex_wait_queue_me;schedule;__schedule;-;try_to_wake_up;do_futex;sys_futex;entry_SYSCALL_64_fastpath;;firefox 2
#
Signed-off-by: Joe Stringer <joe@ovn.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: netdev@vger.kernel.org
Link: http://lkml.kernel.org/r/20161214224342.12858-2-joe@ovn.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-12-15 06:43:38 +08:00
|
|
|
/*
 * Log buffer of BPF_LOG_BUF_SIZE bytes, defined with external linkage.
 * NOTE(review): presumably receives the verifier log when a program is
 * loaded; confirm against the load call.
 */
char bpf_log_buf[BPF_LOG_BUF_SIZE];
|
2014-12-02 07:06:37 +08:00
|
|
|
/*
 * Table of MAX_MAPS file descriptors, visible to other translation units.
 * NOTE(review): presumably one fd per created map; confirm.
 */
int map_fd[MAX_MAPS];
|
|
|
|
/*
 * Table of MAX_PROGS file descriptors, visible to other translation units.
 * Note that populate_prog_array() has a parameter with the same name which
 * shadows this array inside that function.
 * NOTE(review): presumably one fd per loaded program; confirm.
 */
int prog_fd[MAX_PROGS];
|
2015-03-26 03:49:23 +08:00
|
|
|
/*
 * Table of MAX_PROGS file descriptors, visible to other translation units.
 * NOTE(review): presumably the perf event fd each program is attached to;
 * confirm.
 */
int event_fd[MAX_PROGS];
|
2014-12-02 07:06:37 +08:00
|
|
|
/*
 * Counter with external linkage.
 * NOTE(review): presumably the number of valid entries in prog_fd[] and
 * event_fd[]; confirm.
 */
int prog_cnt;
|
samples/bpf: bpf_tail_call example for tracing
kprobe example that demonstrates what future seccomp programs may look like.
It attaches to seccomp_phase1() function and tail-calls other BPF programs
depending on syscall number.
Existing optimized classic BPF seccomp programs generated by Chrome look like:
if (sd.nr < 121) {
if (sd.nr < 57) {
if (sd.nr < 22) {
if (sd.nr < 7) {
if (sd.nr < 4) {
if (sd.nr < 1) {
check sys_read
} else {
if (sd.nr < 3) {
check sys_write and sys_open
} else {
check sys_close
}
}
} else {
} else {
} else {
} else {
} else {
}
the future seccomp using native eBPF may look like:
bpf_tail_call(&sd, &syscall_jmp_table, sd.nr);
which is simpler, faster and leaves more room for per-syscall checks.
Usage:
$ sudo ./tracex5
<...>-366 [001] d... 4.870033: : read(fd=1, buf=00007f6d5bebf000, size=771)
<...>-369 [003] d... 4.870066: : mmap
<...>-369 [003] d... 4.870077: : syscall=110 (one of get/set uid/pid/gid)
<...>-369 [003] d... 4.870089: : syscall=107 (one of get/set uid/pid/gid)
sh-369 [000] d... 4.891740: : read(fd=0, buf=00000000023d1000, size=512)
sh-369 [000] d... 4.891747: : write(fd=1, buf=00000000023d3000, size=512)
sh-369 [000] d... 4.891747: : read(fd=1, buf=00000000023d3000, size=512)
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:05 +08:00
|
|
|
/*
 * File descriptor of the map that populate_prog_array() updates.
 * Starts out as -1.
 */
int prog_array_fd = -1;
|
|
|
|
|
samples/bpf: Make samples more libbpf-centric
Switch all of the sample code to use the function names from
tools/lib/bpf so that they're consistent with that, and to declare their
own log buffers. This allows the next commit to be purely devoted to
getting rid of the duplicate library in samples/bpf.
Committer notes:
Testing it:
On a fedora rawhide container, with clang/llvm 3.9, sharing the host
linux kernel git tree:
# make O=/tmp/build/linux/ headers_install
# make O=/tmp/build/linux -C samples/bpf/
Since I forgot to make it privileged, just tested it outside the
container, using what it generated:
# uname -a
Linux jouet 4.9.0-rc8+ #1 SMP Mon Dec 12 11:20:49 BRT 2016 x86_64 x86_64 x86_64 GNU/Linux
# cd /var/lib/docker/devicemapper/mnt/c43e09a53ff56c86a07baf79847f00e2cc2a17a1e2220e1adbf8cbc62734feda/rootfs/tmp/build/linux/samples/bpf/
# ls -la offwaketime
-rwxr-xr-x. 1 root root 24200 Dec 15 12:19 offwaketime
# file offwaketime
offwaketime: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, for GNU/Linux 2.6.32, BuildID[sha1]=c940d3f127d5e66cdd680e42d885cb0b64f8a0e4, not stripped
# readelf -SW offwaketime_kern.o | grep PROGBITS
[ 2] .text PROGBITS 0000000000000000 000040 000000 00 AX 0 0 4
[ 3] kprobe/try_to_wake_up PROGBITS 0000000000000000 000040 0000d8 00 AX 0 0 8
[ 5] tracepoint/sched/sched_switch PROGBITS 0000000000000000 000118 000318 00 AX 0 0 8
[ 7] maps PROGBITS 0000000000000000 000430 000050 00 WA 0 0 4
[ 8] license PROGBITS 0000000000000000 000480 000004 00 WA 0 0 1
[ 9] version PROGBITS 0000000000000000 000484 000004 00 WA 0 0 4
# ./offwaketime | head -5
swapper/1;start_secondary;cpu_startup_entry;schedule_preempt_disabled;schedule;__schedule;-;---;; 106
CPU 0/KVM;entry_SYSCALL_64_fastpath;sys_ioctl;do_vfs_ioctl;kvm_vcpu_ioctl;kvm_arch_vcpu_ioctl_run;kvm_vcpu_block;schedule;__schedule;-;try_to_wake_up;swake_up_locked;swake_up;apic_timer_expired;apic_timer_fn;__hrtimer_run_queues;hrtimer_interrupt;local_apic_timer_interrupt;smp_apic_timer_interrupt;__irqentry_text_start;cpuidle_enter;call_cpuidle;cpu_startup_entry;start_secondary;;swapper/3 2
Compositor;entry_SYSCALL_64_fastpath;sys_futex;do_futex;futex_wait;futex_wait_queue_me;schedule;__schedule;-;try_to_wake_up;futex_requeue;do_futex;sys_futex;entry_SYSCALL_64_fastpath;;SoftwareVsyncTh 5
firefox;entry_SYSCALL_64_fastpath;sys_poll;do_sys_poll;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule;-;try_to_wake_up;pollwake;__wake_up_common;__wake_up_sync_key;pipe_write;__vfs_write;vfs_write;sys_write;entry_SYSCALL_64_fastpath;;Timer 13
JS Helper;entry_SYSCALL_64_fastpath;sys_futex;do_futex;futex_wait;futex_wait_queue_me;schedule;__schedule;-;try_to_wake_up;do_futex;sys_futex;entry_SYSCALL_64_fastpath;;firefox 2
#
Signed-off-by: Joe Stringer <joe@ovn.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: netdev@vger.kernel.org
Link: http://lkml.kernel.org/r/20161214224342.12858-2-joe@ovn.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-12-15 06:43:38 +08:00
|
|
|
/*
 * Description of a single BPF map: five unsigned int fields, in this order.
 * NOTE(review): presumably mirrors the records stored in the object's
 * "maps" ELF section, so member order and width must not change; confirm
 * against the code that reads that section.
 */
struct bpf_map_def {
	unsigned int type;		/* map type selector */
	unsigned int key_size;		/* key size (presumably bytes; confirm) */
	unsigned int value_size;	/* value size (presumably bytes; confirm) */
	unsigned int max_entries;	/* maximum number of entries */
	unsigned int map_flags;		/* map flags */
};
|
|
|
|
|
samples/bpf: bpf_tail_call example for tracing
kprobe example that demonstrates what future seccomp programs may look like.
It attaches to seccomp_phase1() function and tail-calls other BPF programs
depending on syscall number.
Existing optimized classic BPF seccomp programs generated by Chrome look like:
if (sd.nr < 121) {
if (sd.nr < 57) {
if (sd.nr < 22) {
if (sd.nr < 7) {
if (sd.nr < 4) {
if (sd.nr < 1) {
check sys_read
} else {
if (sd.nr < 3) {
check sys_write and sys_open
} else {
check sys_close
}
}
} else {
} else {
} else {
} else {
} else {
}
the future seccomp using native eBPF may look like:
bpf_tail_call(&sd, &syscall_jmp_table, sd.nr);
which is simpler, faster and leaves more room for per-syscall checks.
Usage:
$ sudo ./tracex5
<...>-366 [001] d... 4.870033: : read(fd=1, buf=00007f6d5bebf000, size=771)
<...>-369 [003] d... 4.870066: : mmap
<...>-369 [003] d... 4.870077: : syscall=110 (one of get/set uid/pid/gid)
<...>-369 [003] d... 4.870089: : syscall=107 (one of get/set uid/pid/gid)
sh-369 [000] d... 4.891740: : read(fd=0, buf=00000000023d1000, size=512)
sh-369 [000] d... 4.891747: : write(fd=1, buf=00000000023d3000, size=512)
sh-369 [000] d... 4.891747: : read(fd=1, buf=00000000023d3000, size=512)
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:05 +08:00
|
|
|
static int populate_prog_array(const char *event, int prog_fd)
|
|
|
|
{
|
|
|
|
int ind = atoi(event), err;
|
|
|
|
|
samples/bpf: Make samples more libbpf-centric
Switch all of the sample code to use the function names from
tools/lib/bpf so that they're consistent with that, and to declare their
own log buffers. This allow the next commit to be purely devoted to
getting rid of the duplicate library in samples/bpf.
Committer notes:
Testing it:
On a fedora rawhide container, with clang/llvm 3.9, sharing the host
linux kernel git tree:
# make O=/tmp/build/linux/ headers_install
# make O=/tmp/build/linux -C samples/bpf/
Since I forgot to make it privileged, just tested it outside the
container, using what it generated:
# uname -a
Linux jouet 4.9.0-rc8+ #1 SMP Mon Dec 12 11:20:49 BRT 2016 x86_64 x86_64 x86_64 GNU/Linux
# cd /var/lib/docker/devicemapper/mnt/c43e09a53ff56c86a07baf79847f00e2cc2a17a1e2220e1adbf8cbc62734feda/rootfs/tmp/build/linux/samples/bpf/
# ls -la offwaketime
-rwxr-xr-x. 1 root root 24200 Dec 15 12:19 offwaketime
# file offwaketime
offwaketime: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, for GNU/Linux 2.6.32, BuildID[sha1]=c940d3f127d5e66cdd680e42d885cb0b64f8a0e4, not stripped
# readelf -SW offwaketime_kern.o | grep PROGBITS
[ 2] .text PROGBITS 0000000000000000 000040 000000 00 AX 0 0 4
[ 3] kprobe/try_to_wake_up PROGBITS 0000000000000000 000040 0000d8 00 AX 0 0 8
[ 5] tracepoint/sched/sched_switch PROGBITS 0000000000000000 000118 000318 00 AX 0 0 8
[ 7] maps PROGBITS 0000000000000000 000430 000050 00 WA 0 0 4
[ 8] license PROGBITS 0000000000000000 000480 000004 00 WA 0 0 1
[ 9] version PROGBITS 0000000000000000 000484 000004 00 WA 0 0 4
# ./offwaketime | head -5
swapper/1;start_secondary;cpu_startup_entry;schedule_preempt_disabled;schedule;__schedule;-;---;; 106
CPU 0/KVM;entry_SYSCALL_64_fastpath;sys_ioctl;do_vfs_ioctl;kvm_vcpu_ioctl;kvm_arch_vcpu_ioctl_run;kvm_vcpu_block;schedule;__schedule;-;try_to_wake_up;swake_up_locked;swake_up;apic_timer_expired;apic_timer_fn;__hrtimer_run_queues;hrtimer_interrupt;local_apic_timer_interrupt;smp_apic_timer_interrupt;__irqentry_text_start;cpuidle_enter;call_cpuidle;cpu_startup_entry;start_secondary;;swapper/3 2
Compositor;entry_SYSCALL_64_fastpath;sys_futex;do_futex;futex_wait;futex_wait_queue_me;schedule;__schedule;-;try_to_wake_up;futex_requeue;do_futex;sys_futex;entry_SYSCALL_64_fastpath;;SoftwareVsyncTh 5
firefox;entry_SYSCALL_64_fastpath;sys_poll;do_sys_poll;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule;-;try_to_wake_up;pollwake;__wake_up_common;__wake_up_sync_key;pipe_write;__vfs_write;vfs_write;sys_write;entry_SYSCALL_64_fastpath;;Timer 13
JS Helper;entry_SYSCALL_64_fastpath;sys_futex;do_futex;futex_wait;futex_wait_queue_me;schedule;__schedule;-;try_to_wake_up;do_futex;sys_futex;entry_SYSCALL_64_fastpath;;firefox 2
#
Signed-off-by: Joe Stringer <joe@ovn.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: netdev@vger.kernel.org
Link: http://lkml.kernel.org/r/20161214224342.12858-2-joe@ovn.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-12-15 06:43:38 +08:00
|
|
|
err = bpf_map_update_elem(prog_array_fd, &ind, &prog_fd, BPF_ANY);
|
samples/bpf: bpf_tail_call example for tracing
kprobe example that demonstrates how future seccomp programs may look like.
It attaches to seccomp_phase1() function and tail-calls other BPF programs
depending on syscall number.
Existing optimized classic BPF seccomp programs generated by Chrome look like:
if (sd.nr < 121) {
if (sd.nr < 57) {
if (sd.nr < 22) {
if (sd.nr < 7) {
if (sd.nr < 4) {
if (sd.nr < 1) {
check sys_read
} else {
if (sd.nr < 3) {
check sys_write and sys_open
} else {
check sys_close
}
}
} else {
} else {
} else {
} else {
} else {
}
the future seccomp using native eBPF may look like:
bpf_tail_call(&sd, &syscall_jmp_table, sd.nr);
which is simpler, faster and leaves more room for per-syscall checks.
Usage:
$ sudo ./tracex5
<...>-366 [001] d... 4.870033: : read(fd=1, buf=00007f6d5bebf000, size=771)
<...>-369 [003] d... 4.870066: : mmap
<...>-369 [003] d... 4.870077: : syscall=110 (one of get/set uid/pid/gid)
<...>-369 [003] d... 4.870089: : syscall=107 (one of get/set uid/pid/gid)
sh-369 [000] d... 4.891740: : read(fd=0, buf=00000000023d1000, size=512)
sh-369 [000] d... 4.891747: : write(fd=1, buf=00000000023d3000, size=512)
sh-369 [000] d... 4.891747: : read(fd=1, buf=00000000023d3000, size=512)
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:05 +08:00
|
|
|
if (err < 0) {
|
|
|
|
printf("failed to store prog_fd in prog_array\n");
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
2014-12-02 07:06:37 +08:00
|
|
|
|
|
|
|
static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
|
|
|
|
{
|
|
|
|
bool is_socket = strncmp(event, "socket", 6) == 0;
|
2015-03-26 03:49:23 +08:00
|
|
|
bool is_kprobe = strncmp(event, "kprobe/", 7) == 0;
|
|
|
|
bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
|
2016-04-07 09:43:29 +08:00
|
|
|
bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0;
|
Add sample for adding simple drop program to link
Add a sample program that only drops packets at the BPF_PROG_TYPE_XDP_RX
hook of a link. With the drop-only program, observed single core rate is
~20Mpps.
Other tests were run, for instance without the dropcnt increment or
without reading from the packet header, the packet rate was mostly
unchanged.
$ perf record -a samples/bpf/xdp1 $(</sys/class/net/eth0/ifindex)
proto 17: 20403027 drops/s
./pktgen_sample03_burst_single_flow.sh -i $DEV -d $IP -m $MAC -t 4
Running... ctrl^C to stop
Device: eth4@0
Result: OK: 11791017(c11788327+d2689) usec, 59622913 (60byte,0frags)
5056638pps 2427Mb/sec (2427186240bps) errors: 0
Device: eth4@1
Result: OK: 11791012(c11787906+d3106) usec, 60526944 (60byte,0frags)
5133311pps 2463Mb/sec (2463989280bps) errors: 0
Device: eth4@2
Result: OK: 11791019(c11788249+d2769) usec, 59868091 (60byte,0frags)
5077431pps 2437Mb/sec (2437166880bps) errors: 0
Device: eth4@3
Result: OK: 11795039(c11792403+d2636) usec, 59483181 (60byte,0frags)
5043067pps 2420Mb/sec (2420672160bps) errors: 0
perf report --no-children:
26.05% ksoftirqd/0 [mlx4_en] [k] mlx4_en_process_rx_cq
17.84% ksoftirqd/0 [mlx4_en] [k] mlx4_en_alloc_frags
5.52% ksoftirqd/0 [mlx4_en] [k] mlx4_en_free_frag
4.90% swapper [kernel.vmlinux] [k] poll_idle
4.14% ksoftirqd/0 [kernel.vmlinux] [k] get_page_from_freelist
2.78% ksoftirqd/0 [kernel.vmlinux] [k] __free_pages_ok
2.57% ksoftirqd/0 [kernel.vmlinux] [k] bpf_map_lookup_elem
2.51% swapper [mlx4_en] [k] mlx4_en_process_rx_cq
1.94% ksoftirqd/0 [kernel.vmlinux] [k] percpu_array_map_lookup_elem
1.45% swapper [mlx4_en] [k] mlx4_en_alloc_frags
1.35% ksoftirqd/0 [kernel.vmlinux] [k] free_one_page
1.33% swapper [kernel.vmlinux] [k] intel_idle
1.04% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5c5
0.96% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c58d
0.93% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c6ee
0.92% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c6b9
0.89% ksoftirqd/0 [kernel.vmlinux] [k] __alloc_pages_nodemask
0.83% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c686
0.83% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5d5
0.78% ksoftirqd/0 [mlx4_en] [k] mlx4_alloc_pages.isra.23
0.77% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5b4
0.77% ksoftirqd/0 [kernel.vmlinux] [k] net_rx_action
machine specs:
receiver - Intel E5-1630 v3 @ 3.70GHz
sender - Intel E5645 @ 2.40GHz
Mellanox ConnectX-3 @40G
Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-07-20 03:16:51 +08:00
|
|
|
bool is_xdp = strncmp(event, "xdp", 3) == 0;
|
2016-09-02 09:37:25 +08:00
|
|
|
bool is_perf_event = strncmp(event, "perf_event", 10) == 0;
|
2016-12-02 00:48:07 +08:00
|
|
|
bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0;
|
|
|
|
bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0;
|
samples/bpf: Switch over to libbpf
Now that libbpf under tools/lib/bpf/* is synced with the version from
samples/bpf, we can get rid of most of the libbpf library here.
Committer notes:
Built it in a docker fedora rawhide container and ran it in the f25 host, seems
to work just like it did before this patch, i.e. the switch to tools/lib/bpf/
doesn't seem to have introduced problems and Joe said he tested it with
all the entries in samples/bpf/ and other code he found:
[root@f5065a7d6272 linux]# make -j4 O=/tmp/build/linux headers_install
<SNIP>
[root@f5065a7d6272 linux]# rm -rf /tmp/build/linux/samples/bpf/
[root@f5065a7d6272 linux]# make -j4 O=/tmp/build/linux samples/bpf/
make[1]: Entering directory '/tmp/build/linux'
CHK include/config/kernel.release
HOSTCC scripts/basic/fixdep
GEN ./Makefile
CHK include/generated/uapi/linux/version.h
Using /git/linux as source for kernel
CHK include/generated/utsrelease.h
HOSTCC scripts/basic/bin2c
HOSTCC arch/x86/tools/relocs_32.o
HOSTCC arch/x86/tools/relocs_64.o
LD samples/bpf/built-in.o
<SNIP>
HOSTCC samples/bpf/fds_example.o
HOSTCC samples/bpf/sockex1_user.o
/git/linux/samples/bpf/fds_example.c: In function 'bpf_prog_create':
/git/linux/samples/bpf/fds_example.c:63:6: warning: passing argument 2 of 'bpf_load_program' discards 'const' qualifier from pointer target type [-Wdiscarded-qualifiers]
insns, insns_cnt, "GPL", 0,
^~~~~
In file included from /git/linux/samples/bpf/libbpf.h:5:0,
from /git/linux/samples/bpf/bpf_load.h:4,
from /git/linux/samples/bpf/fds_example.c:15:
/git/linux/tools/lib/bpf/bpf.h:31:5: note: expected 'struct bpf_insn *' but argument is of type 'const struct bpf_insn *'
int bpf_load_program(enum bpf_prog_type type, struct bpf_insn *insns,
^~~~~~~~~~~~~~~~
HOSTCC samples/bpf/sockex2_user.o
<SNIP>
HOSTCC samples/bpf/xdp_tx_iptunnel_user.o
clang -nostdinc -isystem /usr/lib/gcc/x86_64-redhat-linux/6.2.1/include -I/git/linux/arch/x86/include -I./arch/x86/include/generated/uapi -I./arch/x86/include/generated -I/git/linux/include -I./include -I/git/linux/arch/x86/include/uapi -I/git/linux/include/uapi -I./include/generated/uapi -include /git/linux/include/linux/kconfig.h \
-D__KERNEL__ -D__ASM_SYSREG_H -Wno-unused-value -Wno-pointer-sign \
-Wno-compare-distinct-pointer-types \
-Wno-gnu-variable-sized-type-not-at-end \
-Wno-address-of-packed-member -Wno-tautological-compare \
-O2 -emit-llvm -c /git/linux/samples/bpf/sockex1_kern.c -o -| llc -march=bpf -filetype=obj -o samples/bpf/sockex1_kern.o
HOSTLD samples/bpf/tc_l2_redirect
<SNIP>
HOSTLD samples/bpf/lwt_len_hist
HOSTLD samples/bpf/xdp_tx_iptunnel
make[1]: Leaving directory '/tmp/build/linux'
[root@f5065a7d6272 linux]#
And then, in the host:
[root@jouet bpf]# mount | grep "docker.*devicemapper\/"
/dev/mapper/docker-253:0-1705076-9bd8aa1e0af33adce89ff42090847868ca676932878942be53941a06ec5923f9 on /var/lib/docker/devicemapper/mnt/9bd8aa1e0af33adce89ff42090847868ca676932878942be53941a06ec5923f9 type xfs (rw,relatime,context="system_u:object_r:container_file_t:s0:c73,c276",nouuid,attr2,inode64,sunit=1024,swidth=1024,noquota)
[root@jouet bpf]# cd /var/lib/docker/devicemapper/mnt/9bd8aa1e0af33adce89ff42090847868ca676932878942be53941a06ec5923f9/rootfs/tmp/build/linux/samples/bpf/
[root@jouet bpf]# file offwaketime
offwaketime: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, for GNU/Linux 2.6.32, BuildID[sha1]=f423d171e0487b2f802b6a792657f0f3c8f6d155, not stripped
[root@jouet bpf]# readelf -SW offwaketime
offwaketime offwaketime_kern.o offwaketime_user.o
[root@jouet bpf]# readelf -SW offwaketime_kern.o
There are 11 section headers, starting at offset 0x700:
Section Headers:
[Nr] Name Type Address Off Size ES Flg Lk Inf Al
[ 0] NULL 0000000000000000 000000 000000 00 0 0 0
[ 1] .strtab STRTAB 0000000000000000 000658 0000a8 00 0 0 1
[ 2] .text PROGBITS 0000000000000000 000040 000000 00 AX 0 0 4
[ 3] kprobe/try_to_wake_up PROGBITS 0000000000000000 000040 0000d8 00 AX 0 0 8
[ 4] .relkprobe/try_to_wake_up REL 0000000000000000 0005a8 000020 10 10 3 8
[ 5] tracepoint/sched/sched_switch PROGBITS 0000000000000000 000118 000318 00 AX 0 0 8
[ 6] .reltracepoint/sched/sched_switch REL 0000000000000000 0005c8 000090 10 10 5 8
[ 7] maps PROGBITS 0000000000000000 000430 000050 00 WA 0 0 4
[ 8] license PROGBITS 0000000000000000 000480 000004 00 WA 0 0 1
[ 9] version PROGBITS 0000000000000000 000484 000004 00 WA 0 0 4
[10] .symtab SYMTAB 0000000000000000 000488 000120 18 1 4 8
Key to Flags:
W (write), A (alloc), X (execute), M (merge), S (strings)
I (info), L (link order), G (group), T (TLS), E (exclude), x (unknown)
O (extra OS processing required) o (OS specific), p (processor specific)
[root@jouet bpf]# ./offwaketime | head -3
qemu-system-x86;entry_SYSCALL_64_fastpath;sys_ppoll;do_sys_poll;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule;-;try_to_wake_up;hrtimer_wakeup;__hrtimer_run_queues;hrtimer_interrupt;local_apic_timer_interrupt;smp_apic_timer_interrupt;__irqentry_text_start;cpuidle_enter_state;cpuidle_enter;call_cpuidle;cpu_startup_entry;rest_init;start_kernel;x86_64_start_reservations;x86_64_start_kernel;start_cpu;;swapper/0 4
firefox;entry_SYSCALL_64_fastpath;sys_poll;do_sys_poll;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule;-;try_to_wake_up;pollwake;__wake_up_common;__wake_up_sync_key;pipe_write;__vfs_write;vfs_write;sys_write;entry_SYSCALL_64_fastpath;;Timer 1
swapper/2;start_cpu;start_secondary;cpu_startup_entry;schedule_preempt_disabled;schedule;__schedule;-;---;; 61
[root@jouet bpf]#
Signed-off-by: Joe Stringer <joe@ovn.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: netdev@vger.kernel.org
Link: https://github.com/joestringer/linux/commit/5c40f54a52b1f437123c81e21873f4b4b1f9bd55.patch
Link: http://lkml.kernel.org/n/tip-xr8twtx7sjh5821g8qw47yxk@git.kernel.org
[ Use -I$(srctree)/tools/lib/ to support out of source code tree builds, as noticed by Wang Nan ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-12-15 06:43:39 +08:00
|
|
|
size_t insns_cnt = size / sizeof(struct bpf_insn);
|
2015-03-26 03:49:23 +08:00
|
|
|
enum bpf_prog_type prog_type;
|
|
|
|
char buf[256];
|
|
|
|
int fd, efd, err, id;
|
|
|
|
struct perf_event_attr attr = {};
|
|
|
|
|
|
|
|
attr.type = PERF_TYPE_TRACEPOINT;
|
|
|
|
attr.sample_type = PERF_SAMPLE_RAW;
|
|
|
|
attr.sample_period = 1;
|
|
|
|
attr.wakeup_events = 1;
|
|
|
|
|
|
|
|
if (is_socket) {
|
|
|
|
prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
|
|
|
|
} else if (is_kprobe || is_kretprobe) {
|
|
|
|
prog_type = BPF_PROG_TYPE_KPROBE;
|
2016-04-07 09:43:29 +08:00
|
|
|
} else if (is_tracepoint) {
|
|
|
|
prog_type = BPF_PROG_TYPE_TRACEPOINT;
|
Add sample for adding simple drop program to link
Add a sample program that only drops packets at the BPF_PROG_TYPE_XDP_RX
hook of a link. With the drop-only program, observed single core rate is
~20Mpps.
Other tests were run, for instance without the dropcnt increment or
without reading from the packet header, the packet rate was mostly
unchanged.
$ perf record -a samples/bpf/xdp1 $(</sys/class/net/eth0/ifindex)
proto 17: 20403027 drops/s
./pktgen_sample03_burst_single_flow.sh -i $DEV -d $IP -m $MAC -t 4
Running... ctrl^C to stop
Device: eth4@0
Result: OK: 11791017(c11788327+d2689) usec, 59622913 (60byte,0frags)
5056638pps 2427Mb/sec (2427186240bps) errors: 0
Device: eth4@1
Result: OK: 11791012(c11787906+d3106) usec, 60526944 (60byte,0frags)
5133311pps 2463Mb/sec (2463989280bps) errors: 0
Device: eth4@2
Result: OK: 11791019(c11788249+d2769) usec, 59868091 (60byte,0frags)
5077431pps 2437Mb/sec (2437166880bps) errors: 0
Device: eth4@3
Result: OK: 11795039(c11792403+d2636) usec, 59483181 (60byte,0frags)
5043067pps 2420Mb/sec (2420672160bps) errors: 0
perf report --no-children:
26.05% ksoftirqd/0 [mlx4_en] [k] mlx4_en_process_rx_cq
17.84% ksoftirqd/0 [mlx4_en] [k] mlx4_en_alloc_frags
5.52% ksoftirqd/0 [mlx4_en] [k] mlx4_en_free_frag
4.90% swapper [kernel.vmlinux] [k] poll_idle
4.14% ksoftirqd/0 [kernel.vmlinux] [k] get_page_from_freelist
2.78% ksoftirqd/0 [kernel.vmlinux] [k] __free_pages_ok
2.57% ksoftirqd/0 [kernel.vmlinux] [k] bpf_map_lookup_elem
2.51% swapper [mlx4_en] [k] mlx4_en_process_rx_cq
1.94% ksoftirqd/0 [kernel.vmlinux] [k] percpu_array_map_lookup_elem
1.45% swapper [mlx4_en] [k] mlx4_en_alloc_frags
1.35% ksoftirqd/0 [kernel.vmlinux] [k] free_one_page
1.33% swapper [kernel.vmlinux] [k] intel_idle
1.04% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5c5
0.96% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c58d
0.93% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c6ee
0.92% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c6b9
0.89% ksoftirqd/0 [kernel.vmlinux] [k] __alloc_pages_nodemask
0.83% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c686
0.83% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5d5
0.78% ksoftirqd/0 [mlx4_en] [k] mlx4_alloc_pages.isra.23
0.77% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5b4
0.77% ksoftirqd/0 [kernel.vmlinux] [k] net_rx_action
machine specs:
receiver - Intel E5-1630 v3 @ 3.70GHz
sender - Intel E5645 @ 2.40GHz
Mellanox ConnectX-3 @40G
Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-07-20 03:16:51 +08:00
|
|
|
} else if (is_xdp) {
|
|
|
|
prog_type = BPF_PROG_TYPE_XDP;
|
2016-09-02 09:37:25 +08:00
|
|
|
} else if (is_perf_event) {
|
|
|
|
prog_type = BPF_PROG_TYPE_PERF_EVENT;
|
2016-12-02 00:48:07 +08:00
|
|
|
} else if (is_cgroup_skb) {
|
|
|
|
prog_type = BPF_PROG_TYPE_CGROUP_SKB;
|
|
|
|
} else if (is_cgroup_sk) {
|
|
|
|
prog_type = BPF_PROG_TYPE_CGROUP_SOCK;
|
2015-03-26 03:49:23 +08:00
|
|
|
} else {
|
|
|
|
printf("Unknown event '%s'\n", event);
|
2014-12-02 07:06:37 +08:00
|
|
|
return -1;
|
2015-03-26 03:49:23 +08:00
|
|
|
}
|
|
|
|
|
samples/bpf: Switch over to libbpf
Now that libbpf under tools/lib/bpf/* is synced with the version from
samples/bpf, we can get rid of most of the libbpf library here.
Committer notes:
Built it in a docker fedora rawhide container and ran it in the f25 host, seems
to work just like it did before this patch, i.e. the switch to tools/lib/bpf/
doesn't seem to have introduced problems and Joe said he tested it with
all the entries in samples/bpf/ and other code he found:
[root@f5065a7d6272 linux]# make -j4 O=/tmp/build/linux headers_install
<SNIP>
[root@f5065a7d6272 linux]# rm -rf /tmp/build/linux/samples/bpf/
[root@f5065a7d6272 linux]# make -j4 O=/tmp/build/linux samples/bpf/
make[1]: Entering directory '/tmp/build/linux'
CHK include/config/kernel.release
HOSTCC scripts/basic/fixdep
GEN ./Makefile
CHK include/generated/uapi/linux/version.h
Using /git/linux as source for kernel
CHK include/generated/utsrelease.h
HOSTCC scripts/basic/bin2c
HOSTCC arch/x86/tools/relocs_32.o
HOSTCC arch/x86/tools/relocs_64.o
LD samples/bpf/built-in.o
<SNIP>
HOSTCC samples/bpf/fds_example.o
HOSTCC samples/bpf/sockex1_user.o
/git/linux/samples/bpf/fds_example.c: In function 'bpf_prog_create':
/git/linux/samples/bpf/fds_example.c:63:6: warning: passing argument 2 of 'bpf_load_program' discards 'const' qualifier from pointer target type [-Wdiscarded-qualifiers]
insns, insns_cnt, "GPL", 0,
^~~~~
In file included from /git/linux/samples/bpf/libbpf.h:5:0,
from /git/linux/samples/bpf/bpf_load.h:4,
from /git/linux/samples/bpf/fds_example.c:15:
/git/linux/tools/lib/bpf/bpf.h:31:5: note: expected 'struct bpf_insn *' but argument is of type 'const struct bpf_insn *'
int bpf_load_program(enum bpf_prog_type type, struct bpf_insn *insns,
^~~~~~~~~~~~~~~~
HOSTCC samples/bpf/sockex2_user.o
<SNIP>
HOSTCC samples/bpf/xdp_tx_iptunnel_user.o
clang -nostdinc -isystem /usr/lib/gcc/x86_64-redhat-linux/6.2.1/include -I/git/linux/arch/x86/include -I./arch/x86/include/generated/uapi -I./arch/x86/include/generated -I/git/linux/include -I./include -I/git/linux/arch/x86/include/uapi -I/git/linux/include/uapi -I./include/generated/uapi -include /git/linux/include/linux/kconfig.h \
-D__KERNEL__ -D__ASM_SYSREG_H -Wno-unused-value -Wno-pointer-sign \
-Wno-compare-distinct-pointer-types \
-Wno-gnu-variable-sized-type-not-at-end \
-Wno-address-of-packed-member -Wno-tautological-compare \
-O2 -emit-llvm -c /git/linux/samples/bpf/sockex1_kern.c -o -| llc -march=bpf -filetype=obj -o samples/bpf/sockex1_kern.o
HOSTLD samples/bpf/tc_l2_redirect
<SNIP>
HOSTLD samples/bpf/lwt_len_hist
HOSTLD samples/bpf/xdp_tx_iptunnel
make[1]: Leaving directory '/tmp/build/linux'
[root@f5065a7d6272 linux]#
And then, in the host:
[root@jouet bpf]# mount | grep "docker.*devicemapper\/"
/dev/mapper/docker-253:0-1705076-9bd8aa1e0af33adce89ff42090847868ca676932878942be53941a06ec5923f9 on /var/lib/docker/devicemapper/mnt/9bd8aa1e0af33adce89ff42090847868ca676932878942be53941a06ec5923f9 type xfs (rw,relatime,context="system_u:object_r:container_file_t:s0:c73,c276",nouuid,attr2,inode64,sunit=1024,swidth=1024,noquota)
[root@jouet bpf]# cd /var/lib/docker/devicemapper/mnt/9bd8aa1e0af33adce89ff42090847868ca676932878942be53941a06ec5923f9/rootfs/tmp/build/linux/samples/bpf/
[root@jouet bpf]# file offwaketime
offwaketime: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, for GNU/Linux 2.6.32, BuildID[sha1]=f423d171e0487b2f802b6a792657f0f3c8f6d155, not stripped
[root@jouet bpf]# readelf -SW offwaketime
offwaketime offwaketime_kern.o offwaketime_user.o
[root@jouet bpf]# readelf -SW offwaketime_kern.o
There are 11 section headers, starting at offset 0x700:
Section Headers:
[Nr] Name Type Address Off Size ES Flg Lk Inf Al
[ 0] NULL 0000000000000000 000000 000000 00 0 0 0
[ 1] .strtab STRTAB 0000000000000000 000658 0000a8 00 0 0 1
[ 2] .text PROGBITS 0000000000000000 000040 000000 00 AX 0 0 4
[ 3] kprobe/try_to_wake_up PROGBITS 0000000000000000 000040 0000d8 00 AX 0 0 8
[ 4] .relkprobe/try_to_wake_up REL 0000000000000000 0005a8 000020 10 10 3 8
[ 5] tracepoint/sched/sched_switch PROGBITS 0000000000000000 000118 000318 00 AX 0 0 8
[ 6] .reltracepoint/sched/sched_switch REL 0000000000000000 0005c8 000090 10 10 5 8
[ 7] maps PROGBITS 0000000000000000 000430 000050 00 WA 0 0 4
[ 8] license PROGBITS 0000000000000000 000480 000004 00 WA 0 0 1
[ 9] version PROGBITS 0000000000000000 000484 000004 00 WA 0 0 4
[10] .symtab SYMTAB 0000000000000000 000488 000120 18 1 4 8
Key to Flags:
W (write), A (alloc), X (execute), M (merge), S (strings)
I (info), L (link order), G (group), T (TLS), E (exclude), x (unknown)
O (extra OS processing required) o (OS specific), p (processor specific)
[root@jouet bpf]# ./offwaketime | head -3
qemu-system-x86;entry_SYSCALL_64_fastpath;sys_ppoll;do_sys_poll;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule;-;try_to_wake_up;hrtimer_wakeup;__hrtimer_run_queues;hrtimer_interrupt;local_apic_timer_interrupt;smp_apic_timer_interrupt;__irqentry_text_start;cpuidle_enter_state;cpuidle_enter;call_cpuidle;cpu_startup_entry;rest_init;start_kernel;x86_64_start_reservations;x86_64_start_kernel;start_cpu;;swapper/0 4
firefox;entry_SYSCALL_64_fastpath;sys_poll;do_sys_poll;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule;-;try_to_wake_up;pollwake;__wake_up_common;__wake_up_sync_key;pipe_write;__vfs_write;vfs_write;sys_write;entry_SYSCALL_64_fastpath;;Timer 1
swapper/2;start_cpu;start_secondary;cpu_startup_entry;schedule_preempt_disabled;schedule;__schedule;-;---;; 61
[root@jouet bpf]#
Signed-off-by: Joe Stringer <joe@ovn.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: netdev@vger.kernel.org
Link: https://github.com/joestringer/linux/commit/5c40f54a52b1f437123c81e21873f4b4b1f9bd55.patch
Link: http://lkml.kernel.org/n/tip-xr8twtx7sjh5821g8qw47yxk@git.kernel.org
[ Use -I$(srctree)/tools/lib/ to support out of source code tree builds, as noticed by Wang Nan ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-12-15 06:43:39 +08:00
|
|
|
fd = bpf_load_program(prog_type, prog, insns_cnt, license, kern_version,
|
samples/bpf: Make samples more libbpf-centric
Switch all of the sample code to use the function names from
tools/lib/bpf so that they're consistent with that, and to declare their
own log buffers. This allows the next commit to be purely devoted to
getting rid of the duplicate library in samples/bpf.
Committer notes:
Testing it:
On a fedora rawhide container, with clang/llvm 3.9, sharing the host
linux kernel git tree:
# make O=/tmp/build/linux/ headers_install
# make O=/tmp/build/linux -C samples/bpf/
Since I forgot to make it privileged, just tested it outside the
container, using what it generated:
# uname -a
Linux jouet 4.9.0-rc8+ #1 SMP Mon Dec 12 11:20:49 BRT 2016 x86_64 x86_64 x86_64 GNU/Linux
# cd /var/lib/docker/devicemapper/mnt/c43e09a53ff56c86a07baf79847f00e2cc2a17a1e2220e1adbf8cbc62734feda/rootfs/tmp/build/linux/samples/bpf/
# ls -la offwaketime
-rwxr-xr-x. 1 root root 24200 Dec 15 12:19 offwaketime
# file offwaketime
offwaketime: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, for GNU/Linux 2.6.32, BuildID[sha1]=c940d3f127d5e66cdd680e42d885cb0b64f8a0e4, not stripped
# readelf -SW offwaketime_kern.o | grep PROGBITS
[ 2] .text PROGBITS 0000000000000000 000040 000000 00 AX 0 0 4
[ 3] kprobe/try_to_wake_up PROGBITS 0000000000000000 000040 0000d8 00 AX 0 0 8
[ 5] tracepoint/sched/sched_switch PROGBITS 0000000000000000 000118 000318 00 AX 0 0 8
[ 7] maps PROGBITS 0000000000000000 000430 000050 00 WA 0 0 4
[ 8] license PROGBITS 0000000000000000 000480 000004 00 WA 0 0 1
[ 9] version PROGBITS 0000000000000000 000484 000004 00 WA 0 0 4
# ./offwaketime | head -5
swapper/1;start_secondary;cpu_startup_entry;schedule_preempt_disabled;schedule;__schedule;-;---;; 106
CPU 0/KVM;entry_SYSCALL_64_fastpath;sys_ioctl;do_vfs_ioctl;kvm_vcpu_ioctl;kvm_arch_vcpu_ioctl_run;kvm_vcpu_block;schedule;__schedule;-;try_to_wake_up;swake_up_locked;swake_up;apic_timer_expired;apic_timer_fn;__hrtimer_run_queues;hrtimer_interrupt;local_apic_timer_interrupt;smp_apic_timer_interrupt;__irqentry_text_start;cpuidle_enter;call_cpuidle;cpu_startup_entry;start_secondary;;swapper/3 2
Compositor;entry_SYSCALL_64_fastpath;sys_futex;do_futex;futex_wait;futex_wait_queue_me;schedule;__schedule;-;try_to_wake_up;futex_requeue;do_futex;sys_futex;entry_SYSCALL_64_fastpath;;SoftwareVsyncTh 5
firefox;entry_SYSCALL_64_fastpath;sys_poll;do_sys_poll;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule;-;try_to_wake_up;pollwake;__wake_up_common;__wake_up_sync_key;pipe_write;__vfs_write;vfs_write;sys_write;entry_SYSCALL_64_fastpath;;Timer 13
JS Helper;entry_SYSCALL_64_fastpath;sys_futex;do_futex;futex_wait;futex_wait_queue_me;schedule;__schedule;-;try_to_wake_up;do_futex;sys_futex;entry_SYSCALL_64_fastpath;;firefox 2
#
Signed-off-by: Joe Stringer <joe@ovn.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: netdev@vger.kernel.org
Link: http://lkml.kernel.org/r/20161214224342.12858-2-joe@ovn.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-12-15 06:43:38 +08:00
|
|
|
bpf_log_buf, BPF_LOG_BUF_SIZE);
|
samples/bpf: bpf_tail_call example for tracing
kprobe example that demonstrates how future seccomp programs may look like.
It attaches to seccomp_phase1() function and tail-calls other BPF programs
depending on syscall number.
Existing optimized classic BPF seccomp programs generated by Chrome look like:
if (sd.nr < 121) {
if (sd.nr < 57) {
if (sd.nr < 22) {
if (sd.nr < 7) {
if (sd.nr < 4) {
if (sd.nr < 1) {
check sys_read
} else {
if (sd.nr < 3) {
check sys_write and sys_open
} else {
check sys_close
}
}
} else {
} else {
} else {
} else {
} else {
}
the future seccomp using native eBPF may look like:
bpf_tail_call(&sd, &syscall_jmp_table, sd.nr);
which is simpler, faster and leaves more room for per-syscall checks.
Usage:
$ sudo ./tracex5
<...>-366 [001] d... 4.870033: : read(fd=1, buf=00007f6d5bebf000, size=771)
<...>-369 [003] d... 4.870066: : mmap
<...>-369 [003] d... 4.870077: : syscall=110 (one of get/set uid/pid/gid)
<...>-369 [003] d... 4.870089: : syscall=107 (one of get/set uid/pid/gid)
sh-369 [000] d... 4.891740: : read(fd=0, buf=00000000023d1000, size=512)
sh-369 [000] d... 4.891747: : write(fd=1, buf=00000000023d3000, size=512)
sh-369 [000] d... 4.891747: : read(fd=1, buf=00000000023d3000, size=512)
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:05 +08:00
|
|
|
if (fd < 0) {
|
samples/bpf: Make samples more libbpf-centric
Switch all of the sample code to use the function names from
tools/lib/bpf so that they're consistent with that, and to declare their
own log buffers. This allows the next commit to be purely devoted to
getting rid of the duplicate library in samples/bpf.
Committer notes:
Testing it:
On a fedora rawhide container, with clang/llvm 3.9, sharing the host
linux kernel git tree:
# make O=/tmp/build/linux/ headers_install
# make O=/tmp/build/linux -C samples/bpf/
Since I forgot to make it privileged, just tested it outside the
container, using what it generated:
# uname -a
Linux jouet 4.9.0-rc8+ #1 SMP Mon Dec 12 11:20:49 BRT 2016 x86_64 x86_64 x86_64 GNU/Linux
# cd /var/lib/docker/devicemapper/mnt/c43e09a53ff56c86a07baf79847f00e2cc2a17a1e2220e1adbf8cbc62734feda/rootfs/tmp/build/linux/samples/bpf/
# ls -la offwaketime
-rwxr-xr-x. 1 root root 24200 Dec 15 12:19 offwaketime
# file offwaketime
offwaketime: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, for GNU/Linux 2.6.32, BuildID[sha1]=c940d3f127d5e66cdd680e42d885cb0b64f8a0e4, not stripped
# readelf -SW offwaketime_kern.o | grep PROGBITS
[ 2] .text PROGBITS 0000000000000000 000040 000000 00 AX 0 0 4
[ 3] kprobe/try_to_wake_up PROGBITS 0000000000000000 000040 0000d8 00 AX 0 0 8
[ 5] tracepoint/sched/sched_switch PROGBITS 0000000000000000 000118 000318 00 AX 0 0 8
[ 7] maps PROGBITS 0000000000000000 000430 000050 00 WA 0 0 4
[ 8] license PROGBITS 0000000000000000 000480 000004 00 WA 0 0 1
[ 9] version PROGBITS 0000000000000000 000484 000004 00 WA 0 0 4
# ./offwaketime | head -5
swapper/1;start_secondary;cpu_startup_entry;schedule_preempt_disabled;schedule;__schedule;-;---;; 106
CPU 0/KVM;entry_SYSCALL_64_fastpath;sys_ioctl;do_vfs_ioctl;kvm_vcpu_ioctl;kvm_arch_vcpu_ioctl_run;kvm_vcpu_block;schedule;__schedule;-;try_to_wake_up;swake_up_locked;swake_up;apic_timer_expired;apic_timer_fn;__hrtimer_run_queues;hrtimer_interrupt;local_apic_timer_interrupt;smp_apic_timer_interrupt;__irqentry_text_start;cpuidle_enter;call_cpuidle;cpu_startup_entry;start_secondary;;swapper/3 2
Compositor;entry_SYSCALL_64_fastpath;sys_futex;do_futex;futex_wait;futex_wait_queue_me;schedule;__schedule;-;try_to_wake_up;futex_requeue;do_futex;sys_futex;entry_SYSCALL_64_fastpath;;SoftwareVsyncTh 5
firefox;entry_SYSCALL_64_fastpath;sys_poll;do_sys_poll;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule;-;try_to_wake_up;pollwake;__wake_up_common;__wake_up_sync_key;pipe_write;__vfs_write;vfs_write;sys_write;entry_SYSCALL_64_fastpath;;Timer 13
JS Helper;entry_SYSCALL_64_fastpath;sys_futex;do_futex;futex_wait;futex_wait_queue_me;schedule;__schedule;-;try_to_wake_up;do_futex;sys_futex;entry_SYSCALL_64_fastpath;;firefox 2
#
Signed-off-by: Joe Stringer <joe@ovn.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: netdev@vger.kernel.org
Link: http://lkml.kernel.org/r/20161214224342.12858-2-joe@ovn.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-12-15 06:43:38 +08:00
|
|
|
printf("bpf_load_program() err=%d\n%s", errno, bpf_log_buf);
|
samples/bpf: bpf_tail_call example for tracing
kprobe example that demonstrates how future seccomp programs may look like.
It attaches to seccomp_phase1() function and tail-calls other BPF programs
depending on syscall number.
Existing optimized classic BPF seccomp programs generated by Chrome look like:
if (sd.nr < 121) {
if (sd.nr < 57) {
if (sd.nr < 22) {
if (sd.nr < 7) {
if (sd.nr < 4) {
if (sd.nr < 1) {
check sys_read
} else {
if (sd.nr < 3) {
check sys_write and sys_open
} else {
check sys_close
}
}
} else {
} else {
} else {
} else {
} else {
}
the future seccomp using native eBPF may look like:
bpf_tail_call(&sd, &syscall_jmp_table, sd.nr);
which is simpler, faster and leaves more room for per-syscall checks.
Usage:
$ sudo ./tracex5
<...>-366 [001] d... 4.870033: : read(fd=1, buf=00007f6d5bebf000, size=771)
<...>-369 [003] d... 4.870066: : mmap
<...>-369 [003] d... 4.870077: : syscall=110 (one of get/set uid/pid/gid)
<...>-369 [003] d... 4.870089: : syscall=107 (one of get/set uid/pid/gid)
sh-369 [000] d... 4.891740: : read(fd=0, buf=00000000023d1000, size=512)
sh-369 [000] d... 4.891747: : write(fd=1, buf=00000000023d3000, size=512)
sh-369 [000] d... 4.891747: : read(fd=1, buf=00000000023d3000, size=512)
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:05 +08:00
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
prog_fd[prog_cnt++] = fd;
|
|
|
|
|
2016-12-02 00:48:07 +08:00
|
|
|
if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk)
|
Add sample for adding simple drop program to link
Add a sample program that only drops packets at the BPF_PROG_TYPE_XDP_RX
hook of a link. With the drop-only program, observed single core rate is
~20Mpps.
Other tests were run, for instance without the dropcnt increment or
without reading from the packet header, the packet rate was mostly
unchanged.
$ perf record -a samples/bpf/xdp1 $(</sys/class/net/eth0/ifindex)
proto 17: 20403027 drops/s
./pktgen_sample03_burst_single_flow.sh -i $DEV -d $IP -m $MAC -t 4
Running... ctrl^C to stop
Device: eth4@0
Result: OK: 11791017(c11788327+d2689) usec, 59622913 (60byte,0frags)
5056638pps 2427Mb/sec (2427186240bps) errors: 0
Device: eth4@1
Result: OK: 11791012(c11787906+d3106) usec, 60526944 (60byte,0frags)
5133311pps 2463Mb/sec (2463989280bps) errors: 0
Device: eth4@2
Result: OK: 11791019(c11788249+d2769) usec, 59868091 (60byte,0frags)
5077431pps 2437Mb/sec (2437166880bps) errors: 0
Device: eth4@3
Result: OK: 11795039(c11792403+d2636) usec, 59483181 (60byte,0frags)
5043067pps 2420Mb/sec (2420672160bps) errors: 0
perf report --no-children:
26.05% ksoftirqd/0 [mlx4_en] [k] mlx4_en_process_rx_cq
17.84% ksoftirqd/0 [mlx4_en] [k] mlx4_en_alloc_frags
5.52% ksoftirqd/0 [mlx4_en] [k] mlx4_en_free_frag
4.90% swapper [kernel.vmlinux] [k] poll_idle
4.14% ksoftirqd/0 [kernel.vmlinux] [k] get_page_from_freelist
2.78% ksoftirqd/0 [kernel.vmlinux] [k] __free_pages_ok
2.57% ksoftirqd/0 [kernel.vmlinux] [k] bpf_map_lookup_elem
2.51% swapper [mlx4_en] [k] mlx4_en_process_rx_cq
1.94% ksoftirqd/0 [kernel.vmlinux] [k] percpu_array_map_lookup_elem
1.45% swapper [mlx4_en] [k] mlx4_en_alloc_frags
1.35% ksoftirqd/0 [kernel.vmlinux] [k] free_one_page
1.33% swapper [kernel.vmlinux] [k] intel_idle
1.04% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5c5
0.96% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c58d
0.93% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c6ee
0.92% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c6b9
0.89% ksoftirqd/0 [kernel.vmlinux] [k] __alloc_pages_nodemask
0.83% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c686
0.83% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5d5
0.78% ksoftirqd/0 [mlx4_en] [k] mlx4_alloc_pages.isra.23
0.77% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5b4
0.77% ksoftirqd/0 [kernel.vmlinux] [k] net_rx_action
machine specs:
receiver - Intel E5-1630 v3 @ 3.70GHz
sender - Intel E5645 @ 2.40GHz
Mellanox ConnectX-3 @40G
Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-07-20 03:16:51 +08:00
|
|
|
return 0;
|
|
|
|
|
samples/bpf: bpf_tail_call example for tracing
kprobe example that demonstrates how future seccomp programs may look like.
It attaches to seccomp_phase1() function and tail-calls other BPF programs
depending on syscall number.
Existing optimized classic BPF seccomp programs generated by Chrome look like:
if (sd.nr < 121) {
if (sd.nr < 57) {
if (sd.nr < 22) {
if (sd.nr < 7) {
if (sd.nr < 4) {
if (sd.nr < 1) {
check sys_read
} else {
if (sd.nr < 3) {
check sys_write and sys_open
} else {
check sys_close
}
}
} else {
} else {
} else {
} else {
} else {
}
the future seccomp using native eBPF may look like:
bpf_tail_call(&sd, &syscall_jmp_table, sd.nr);
which is simpler, faster and leaves more room for per-syscall checks.
Usage:
$ sudo ./tracex5
<...>-366 [001] d... 4.870033: : read(fd=1, buf=00007f6d5bebf000, size=771)
<...>-369 [003] d... 4.870066: : mmap
<...>-369 [003] d... 4.870077: : syscall=110 (one of get/set uid/pid/gid)
<...>-369 [003] d... 4.870089: : syscall=107 (one of get/set uid/pid/gid)
sh-369 [000] d... 4.891740: : read(fd=0, buf=00000000023d1000, size=512)
sh-369 [000] d... 4.891747: : write(fd=1, buf=00000000023d3000, size=512)
sh-369 [000] d... 4.891747: : read(fd=1, buf=00000000023d3000, size=512)
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:05 +08:00
|
|
|
if (is_socket) {
|
|
|
|
event += 6;
|
|
|
|
if (*event != '/')
|
|
|
|
return 0;
|
|
|
|
event++;
|
|
|
|
if (!isdigit(*event)) {
|
|
|
|
printf("invalid prog number\n");
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
return populate_prog_array(event, fd);
|
|
|
|
}
|
|
|
|
|
2015-03-26 03:49:23 +08:00
|
|
|
if (is_kprobe || is_kretprobe) {
|
|
|
|
if (is_kprobe)
|
|
|
|
event += 7;
|
|
|
|
else
|
|
|
|
event += 10;
|
|
|
|
|
samples/bpf: bpf_tail_call example for tracing
kprobe example that demonstrates how future seccomp programs may look like.
It attaches to seccomp_phase1() function and tail-calls other BPF programs
depending on syscall number.
Existing optimized classic BPF seccomp programs generated by Chrome look like:
if (sd.nr < 121) {
if (sd.nr < 57) {
if (sd.nr < 22) {
if (sd.nr < 7) {
if (sd.nr < 4) {
if (sd.nr < 1) {
check sys_read
} else {
if (sd.nr < 3) {
check sys_write and sys_open
} else {
check sys_close
}
}
} else {
} else {
} else {
} else {
} else {
}
the future seccomp using native eBPF may look like:
bpf_tail_call(&sd, &syscall_jmp_table, sd.nr);
which is simpler, faster and leaves more room for per-syscall checks.
Usage:
$ sudo ./tracex5
<...>-366 [001] d... 4.870033: : read(fd=1, buf=00007f6d5bebf000, size=771)
<...>-369 [003] d... 4.870066: : mmap
<...>-369 [003] d... 4.870077: : syscall=110 (one of get/set uid/pid/gid)
<...>-369 [003] d... 4.870089: : syscall=107 (one of get/set uid/pid/gid)
sh-369 [000] d... 4.891740: : read(fd=0, buf=00000000023d1000, size=512)
sh-369 [000] d... 4.891747: : write(fd=1, buf=00000000023d3000, size=512)
sh-369 [000] d... 4.891747: : read(fd=1, buf=00000000023d3000, size=512)
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:05 +08:00
|
|
|
if (*event == 0) {
|
|
|
|
printf("event name cannot be empty\n");
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (isdigit(*event))
|
|
|
|
return populate_prog_array(event, fd);
|
|
|
|
|
2015-03-26 03:49:23 +08:00
|
|
|
snprintf(buf, sizeof(buf),
|
|
|
|
"echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events",
|
|
|
|
is_kprobe ? 'p' : 'r', event, event);
|
|
|
|
err = system(buf);
|
|
|
|
if (err < 0) {
|
|
|
|
printf("failed to create kprobe '%s' error '%s'\n",
|
|
|
|
event, strerror(errno));
|
|
|
|
return -1;
|
|
|
|
}
|
2014-12-02 07:06:37 +08:00
|
|
|
|
2016-04-07 09:43:29 +08:00
|
|
|
strcpy(buf, DEBUGFS);
|
|
|
|
strcat(buf, "events/kprobes/");
|
|
|
|
strcat(buf, event);
|
|
|
|
strcat(buf, "/id");
|
|
|
|
} else if (is_tracepoint) {
|
|
|
|
event += 11;
|
|
|
|
|
|
|
|
if (*event == 0) {
|
|
|
|
printf("event name cannot be empty\n");
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
strcpy(buf, DEBUGFS);
|
|
|
|
strcat(buf, "events/");
|
|
|
|
strcat(buf, event);
|
|
|
|
strcat(buf, "/id");
|
|
|
|
}
|
2015-03-26 03:49:23 +08:00
|
|
|
|
|
|
|
efd = open(buf, O_RDONLY, 0);
|
|
|
|
if (efd < 0) {
|
|
|
|
printf("failed to open event %s\n", event);
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
err = read(efd, buf, sizeof(buf));
|
|
|
|
if (err < 0 || err >= sizeof(buf)) {
|
|
|
|
printf("read from '%s' failed '%s'\n", event, strerror(errno));
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
close(efd);
|
|
|
|
|
|
|
|
buf[err] = 0;
|
|
|
|
id = atoi(buf);
|
|
|
|
attr.config = id;
|
|
|
|
|
2016-12-09 10:46:19 +08:00
|
|
|
efd = sys_perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
|
2015-03-26 03:49:23 +08:00
|
|
|
if (efd < 0) {
|
|
|
|
printf("event %d fd %d err %s\n", id, efd, strerror(errno));
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
event_fd[prog_cnt - 1] = efd;
|
|
|
|
ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
|
|
|
|
ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd);
|
|
|
|
|
2014-12-02 07:06:37 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int load_maps(struct bpf_map_def *maps, int len)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < len / sizeof(struct bpf_map_def); i++) {
|
|
|
|
|
|
|
|
map_fd[i] = bpf_create_map(maps[i].type,
|
|
|
|
maps[i].key_size,
|
|
|
|
maps[i].value_size,
|
2016-03-08 13:57:20 +08:00
|
|
|
maps[i].max_entries,
|
|
|
|
maps[i].map_flags);
|
2016-03-08 13:57:18 +08:00
|
|
|
if (map_fd[i] < 0) {
|
|
|
|
printf("failed to create a map: %d %s\n",
|
|
|
|
errno, strerror(errno));
|
2014-12-02 07:06:37 +08:00
|
|
|
return 1;
|
2016-03-08 13:57:18 +08:00
|
|
|
}
|
samples/bpf: bpf_tail_call example for tracing
kprobe example that demonstrates how future seccomp programs may look like.
It attaches to seccomp_phase1() function and tail-calls other BPF programs
depending on syscall number.
Existing optimized classic BPF seccomp programs generated by Chrome look like:
if (sd.nr < 121) {
if (sd.nr < 57) {
if (sd.nr < 22) {
if (sd.nr < 7) {
if (sd.nr < 4) {
if (sd.nr < 1) {
check sys_read
} else {
if (sd.nr < 3) {
check sys_write and sys_open
} else {
check sys_close
}
}
} else {
} else {
} else {
} else {
} else {
}
the future seccomp using native eBPF may look like:
bpf_tail_call(&sd, &syscall_jmp_table, sd.nr);
which is simpler, faster and leaves more room for per-syscall checks.
Usage:
$ sudo ./tracex5
<...>-366 [001] d... 4.870033: : read(fd=1, buf=00007f6d5bebf000, size=771)
<...>-369 [003] d... 4.870066: : mmap
<...>-369 [003] d... 4.870077: : syscall=110 (one of get/set uid/pid/gid)
<...>-369 [003] d... 4.870089: : syscall=107 (one of get/set uid/pid/gid)
sh-369 [000] d... 4.891740: : read(fd=0, buf=00000000023d1000, size=512)
sh-369 [000] d... 4.891747: : write(fd=1, buf=00000000023d3000, size=512)
sh-369 [000] d... 4.891747: : read(fd=1, buf=00000000023d3000, size=512)
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:05 +08:00
|
|
|
|
|
|
|
if (maps[i].type == BPF_MAP_TYPE_PROG_ARRAY)
|
|
|
|
prog_array_fd = map_fd[i];
|
2014-12-02 07:06:37 +08:00
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname,
|
|
|
|
GElf_Shdr *shdr, Elf_Data **data)
|
|
|
|
{
|
|
|
|
Elf_Scn *scn;
|
|
|
|
|
|
|
|
scn = elf_getscn(elf, i);
|
|
|
|
if (!scn)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
if (gelf_getshdr(scn, shdr) != shdr)
|
|
|
|
return 2;
|
|
|
|
|
|
|
|
*shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
|
|
|
|
if (!*shname || !shdr->sh_size)
|
|
|
|
return 3;
|
|
|
|
|
|
|
|
*data = elf_getdata(scn, 0);
|
|
|
|
if (!*data || elf_getdata(scn, *data) != NULL)
|
|
|
|
return 4;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols,
|
|
|
|
GElf_Shdr *shdr, struct bpf_insn *insn)
|
|
|
|
{
|
|
|
|
int i, nrels;
|
|
|
|
|
|
|
|
nrels = shdr->sh_size / shdr->sh_entsize;
|
|
|
|
|
|
|
|
for (i = 0; i < nrels; i++) {
|
|
|
|
GElf_Sym sym;
|
|
|
|
GElf_Rel rel;
|
|
|
|
unsigned int insn_idx;
|
|
|
|
|
|
|
|
gelf_getrel(data, i, &rel);
|
|
|
|
|
|
|
|
insn_idx = rel.r_offset / sizeof(struct bpf_insn);
|
|
|
|
|
|
|
|
gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym);
|
|
|
|
|
|
|
|
if (insn[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) {
|
|
|
|
printf("invalid relo for insn[%d].code 0x%x\n",
|
|
|
|
insn_idx, insn[insn_idx].code);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD;
|
|
|
|
insn[insn_idx].imm = map_fd[sym.st_value / sizeof(struct bpf_map_def)];
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
int load_bpf_file(char *path)
|
|
|
|
{
|
|
|
|
int fd, i;
|
|
|
|
Elf *elf;
|
|
|
|
GElf_Ehdr ehdr;
|
|
|
|
GElf_Shdr shdr, shdr_prog;
|
|
|
|
Elf_Data *data, *data_prog, *symbols = NULL;
|
|
|
|
char *shname, *shname_prog;
|
|
|
|
|
2017-02-09 04:27:43 +08:00
|
|
|
/* reset global variables */
|
|
|
|
kern_version = 0;
|
|
|
|
memset(license, 0, sizeof(license));
|
|
|
|
memset(processed_sec, 0, sizeof(processed_sec));
|
|
|
|
|
2014-12-02 07:06:37 +08:00
|
|
|
if (elf_version(EV_CURRENT) == EV_NONE)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
fd = open(path, O_RDONLY, 0);
|
|
|
|
if (fd < 0)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
elf = elf_begin(fd, ELF_C_READ, NULL);
|
|
|
|
|
|
|
|
if (!elf)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
if (gelf_getehdr(elf, &ehdr) != &ehdr)
|
|
|
|
return 1;
|
|
|
|
|
2015-03-26 03:49:23 +08:00
|
|
|
/* clear all kprobes */
|
|
|
|
i = system("echo \"\" > /sys/kernel/debug/tracing/kprobe_events");
|
|
|
|
|
2014-12-02 07:06:37 +08:00
|
|
|
/* scan over all elf sections to get license and map info */
|
|
|
|
for (i = 1; i < ehdr.e_shnum; i++) {
|
|
|
|
|
|
|
|
if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (0) /* helpful for llvm debugging */
|
|
|
|
printf("section %d:%s data %p size %zd link %d flags %d\n",
|
|
|
|
i, shname, data->d_buf, data->d_size,
|
|
|
|
shdr.sh_link, (int) shdr.sh_flags);
|
|
|
|
|
|
|
|
if (strcmp(shname, "license") == 0) {
|
|
|
|
processed_sec[i] = true;
|
|
|
|
memcpy(license, data->d_buf, data->d_size);
|
2015-03-26 03:49:23 +08:00
|
|
|
} else if (strcmp(shname, "version") == 0) {
|
|
|
|
processed_sec[i] = true;
|
|
|
|
if (data->d_size != sizeof(int)) {
|
|
|
|
printf("invalid size of version section %zd\n",
|
|
|
|
data->d_size);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
memcpy(&kern_version, data->d_buf, sizeof(int));
|
2014-12-02 07:06:37 +08:00
|
|
|
} else if (strcmp(shname, "maps") == 0) {
|
|
|
|
processed_sec[i] = true;
|
|
|
|
if (load_maps(data->d_buf, data->d_size))
|
|
|
|
return 1;
|
|
|
|
} else if (shdr.sh_type == SHT_SYMTAB) {
|
|
|
|
symbols = data;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* load programs that need map fixup (relocations) */
|
|
|
|
for (i = 1; i < ehdr.e_shnum; i++) {
|
2017-02-09 04:27:42 +08:00
|
|
|
if (processed_sec[i])
|
|
|
|
continue;
|
2014-12-02 07:06:37 +08:00
|
|
|
|
|
|
|
if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
|
|
|
|
continue;
|
|
|
|
if (shdr.sh_type == SHT_REL) {
|
|
|
|
struct bpf_insn *insns;
|
|
|
|
|
|
|
|
if (get_sec(elf, shdr.sh_info, &ehdr, &shname_prog,
|
|
|
|
&shdr_prog, &data_prog))
|
|
|
|
continue;
|
|
|
|
|
2016-11-23 08:52:09 +08:00
|
|
|
if (shdr_prog.sh_type != SHT_PROGBITS ||
|
|
|
|
!(shdr_prog.sh_flags & SHF_EXECINSTR))
|
|
|
|
continue;
|
|
|
|
|
2014-12-02 07:06:37 +08:00
|
|
|
insns = (struct bpf_insn *) data_prog->d_buf;
|
|
|
|
|
|
|
|
processed_sec[shdr.sh_info] = true;
|
|
|
|
processed_sec[i] = true;
|
|
|
|
|
|
|
|
if (parse_relo_and_apply(data, symbols, &shdr, insns))
|
|
|
|
continue;
|
|
|
|
|
2015-03-26 03:49:23 +08:00
|
|
|
if (memcmp(shname_prog, "kprobe/", 7) == 0 ||
|
|
|
|
memcmp(shname_prog, "kretprobe/", 10) == 0 ||
|
2016-04-07 09:43:29 +08:00
|
|
|
memcmp(shname_prog, "tracepoint/", 11) == 0 ||
|
Add sample for adding simple drop program to link
Add a sample program that only drops packets at the BPF_PROG_TYPE_XDP_RX
hook of a link. With the drop-only program, observed single core rate is
~20Mpps.
Other tests were run, for instance without the dropcnt increment or
without reading from the packet header, the packet rate was mostly
unchanged.
$ perf record -a samples/bpf/xdp1 $(</sys/class/net/eth0/ifindex)
proto 17: 20403027 drops/s
./pktgen_sample03_burst_single_flow.sh -i $DEV -d $IP -m $MAC -t 4
Running... ctrl^C to stop
Device: eth4@0
Result: OK: 11791017(c11788327+d2689) usec, 59622913 (60byte,0frags)
5056638pps 2427Mb/sec (2427186240bps) errors: 0
Device: eth4@1
Result: OK: 11791012(c11787906+d3106) usec, 60526944 (60byte,0frags)
5133311pps 2463Mb/sec (2463989280bps) errors: 0
Device: eth4@2
Result: OK: 11791019(c11788249+d2769) usec, 59868091 (60byte,0frags)
5077431pps 2437Mb/sec (2437166880bps) errors: 0
Device: eth4@3
Result: OK: 11795039(c11792403+d2636) usec, 59483181 (60byte,0frags)
5043067pps 2420Mb/sec (2420672160bps) errors: 0
perf report --no-children:
26.05% ksoftirqd/0 [mlx4_en] [k] mlx4_en_process_rx_cq
17.84% ksoftirqd/0 [mlx4_en] [k] mlx4_en_alloc_frags
5.52% ksoftirqd/0 [mlx4_en] [k] mlx4_en_free_frag
4.90% swapper [kernel.vmlinux] [k] poll_idle
4.14% ksoftirqd/0 [kernel.vmlinux] [k] get_page_from_freelist
2.78% ksoftirqd/0 [kernel.vmlinux] [k] __free_pages_ok
2.57% ksoftirqd/0 [kernel.vmlinux] [k] bpf_map_lookup_elem
2.51% swapper [mlx4_en] [k] mlx4_en_process_rx_cq
1.94% ksoftirqd/0 [kernel.vmlinux] [k] percpu_array_map_lookup_elem
1.45% swapper [mlx4_en] [k] mlx4_en_alloc_frags
1.35% ksoftirqd/0 [kernel.vmlinux] [k] free_one_page
1.33% swapper [kernel.vmlinux] [k] intel_idle
1.04% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5c5
0.96% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c58d
0.93% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c6ee
0.92% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c6b9
0.89% ksoftirqd/0 [kernel.vmlinux] [k] __alloc_pages_nodemask
0.83% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c686
0.83% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5d5
0.78% ksoftirqd/0 [mlx4_en] [k] mlx4_alloc_pages.isra.23
0.77% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5b4
0.77% ksoftirqd/0 [kernel.vmlinux] [k] net_rx_action
machine specs:
receiver - Intel E5-1630 v3 @ 3.70GHz
sender - Intel E5645 @ 2.40GHz
Mellanox ConnectX-3 @40G
Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-07-20 03:16:51 +08:00
|
|
|
memcmp(shname_prog, "xdp", 3) == 0 ||
|
2016-09-02 09:37:25 +08:00
|
|
|
memcmp(shname_prog, "perf_event", 10) == 0 ||
|
2016-12-02 00:48:07 +08:00
|
|
|
memcmp(shname_prog, "socket", 6) == 0 ||
|
|
|
|
memcmp(shname_prog, "cgroup/", 7) == 0)
|
2014-12-02 07:06:37 +08:00
|
|
|
load_and_attach(shname_prog, insns, data_prog->d_size);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* load programs that don't use maps */
|
|
|
|
for (i = 1; i < ehdr.e_shnum; i++) {
|
|
|
|
|
|
|
|
if (processed_sec[i])
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
|
|
|
|
continue;
|
|
|
|
|
2015-03-26 03:49:23 +08:00
|
|
|
if (memcmp(shname, "kprobe/", 7) == 0 ||
|
|
|
|
memcmp(shname, "kretprobe/", 10) == 0 ||
|
2016-04-07 09:43:29 +08:00
|
|
|
memcmp(shname, "tracepoint/", 11) == 0 ||
|
Add sample for adding simple drop program to link
Add a sample program that only drops packets at the BPF_PROG_TYPE_XDP_RX
hook of a link. With the drop-only program, observed single core rate is
~20Mpps.
Other tests were run, for instance without the dropcnt increment or
without reading from the packet header, the packet rate was mostly
unchanged.
$ perf record -a samples/bpf/xdp1 $(</sys/class/net/eth0/ifindex)
proto 17: 20403027 drops/s
./pktgen_sample03_burst_single_flow.sh -i $DEV -d $IP -m $MAC -t 4
Running... ctrl^C to stop
Device: eth4@0
Result: OK: 11791017(c11788327+d2689) usec, 59622913 (60byte,0frags)
5056638pps 2427Mb/sec (2427186240bps) errors: 0
Device: eth4@1
Result: OK: 11791012(c11787906+d3106) usec, 60526944 (60byte,0frags)
5133311pps 2463Mb/sec (2463989280bps) errors: 0
Device: eth4@2
Result: OK: 11791019(c11788249+d2769) usec, 59868091 (60byte,0frags)
5077431pps 2437Mb/sec (2437166880bps) errors: 0
Device: eth4@3
Result: OK: 11795039(c11792403+d2636) usec, 59483181 (60byte,0frags)
5043067pps 2420Mb/sec (2420672160bps) errors: 0
perf report --no-children:
26.05% ksoftirqd/0 [mlx4_en] [k] mlx4_en_process_rx_cq
17.84% ksoftirqd/0 [mlx4_en] [k] mlx4_en_alloc_frags
5.52% ksoftirqd/0 [mlx4_en] [k] mlx4_en_free_frag
4.90% swapper [kernel.vmlinux] [k] poll_idle
4.14% ksoftirqd/0 [kernel.vmlinux] [k] get_page_from_freelist
2.78% ksoftirqd/0 [kernel.vmlinux] [k] __free_pages_ok
2.57% ksoftirqd/0 [kernel.vmlinux] [k] bpf_map_lookup_elem
2.51% swapper [mlx4_en] [k] mlx4_en_process_rx_cq
1.94% ksoftirqd/0 [kernel.vmlinux] [k] percpu_array_map_lookup_elem
1.45% swapper [mlx4_en] [k] mlx4_en_alloc_frags
1.35% ksoftirqd/0 [kernel.vmlinux] [k] free_one_page
1.33% swapper [kernel.vmlinux] [k] intel_idle
1.04% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5c5
0.96% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c58d
0.93% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c6ee
0.92% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c6b9
0.89% ksoftirqd/0 [kernel.vmlinux] [k] __alloc_pages_nodemask
0.83% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c686
0.83% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5d5
0.78% ksoftirqd/0 [mlx4_en] [k] mlx4_alloc_pages.isra.23
0.77% ksoftirqd/0 [mlx4_en] [k] 0x000000000001c5b4
0.77% ksoftirqd/0 [kernel.vmlinux] [k] net_rx_action
machine specs:
receiver - Intel E5-1630 v3 @ 3.70GHz
sender - Intel E5645 @ 2.40GHz
Mellanox ConnectX-3 @40G
Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-07-20 03:16:51 +08:00
|
|
|
memcmp(shname, "xdp", 3) == 0 ||
|
2016-09-02 09:37:25 +08:00
|
|
|
memcmp(shname, "perf_event", 10) == 0 ||
|
2016-12-02 00:48:07 +08:00
|
|
|
memcmp(shname, "socket", 6) == 0 ||
|
|
|
|
memcmp(shname, "cgroup/", 7) == 0)
|
2014-12-02 07:06:37 +08:00
|
|
|
load_and_attach(shname, data->d_buf, data->d_size);
|
|
|
|
}
|
|
|
|
|
|
|
|
close(fd);
|
|
|
|
return 0;
|
|
|
|
}
|
2015-03-26 03:49:23 +08:00
|
|
|
|
|
|
|
void read_trace_pipe(void)
|
|
|
|
{
|
|
|
|
int trace_fd;
|
|
|
|
|
|
|
|
trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
|
|
|
|
if (trace_fd < 0)
|
|
|
|
return;
|
|
|
|
|
|
|
|
while (1) {
|
|
|
|
static char buf[4096];
|
|
|
|
ssize_t sz;
|
|
|
|
|
|
|
|
sz = read(trace_fd, buf, sizeof(buf));
|
|
|
|
if (sz > 0) {
|
|
|
|
buf[sz] = 0;
|
|
|
|
puts(buf);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2016-03-08 13:57:19 +08:00
|
|
|
|
|
|
|
#define MAX_SYMS 300000
|
|
|
|
static struct ksym syms[MAX_SYMS];
|
|
|
|
static int sym_cnt;
|
|
|
|
|
|
|
|
static int ksym_cmp(const void *p1, const void *p2)
|
|
|
|
{
|
|
|
|
return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr;
|
|
|
|
}
|
|
|
|
|
|
|
|
int load_kallsyms(void)
|
|
|
|
{
|
|
|
|
FILE *f = fopen("/proc/kallsyms", "r");
|
|
|
|
char func[256], buf[256];
|
|
|
|
char symbol;
|
|
|
|
void *addr;
|
|
|
|
int i = 0;
|
|
|
|
|
|
|
|
if (!f)
|
|
|
|
return -ENOENT;
|
|
|
|
|
|
|
|
while (!feof(f)) {
|
|
|
|
if (!fgets(buf, sizeof(buf), f))
|
|
|
|
break;
|
|
|
|
if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3)
|
|
|
|
break;
|
|
|
|
if (!addr)
|
|
|
|
continue;
|
|
|
|
syms[i].addr = (long) addr;
|
|
|
|
syms[i].name = strdup(func);
|
|
|
|
i++;
|
|
|
|
}
|
|
|
|
sym_cnt = i;
|
|
|
|
qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct ksym *ksym_search(long key)
|
|
|
|
{
|
|
|
|
int start = 0, end = sym_cnt;
|
|
|
|
int result;
|
|
|
|
|
|
|
|
while (start < end) {
|
|
|
|
size_t mid = start + (end - start) / 2;
|
|
|
|
|
|
|
|
result = key - syms[mid].addr;
|
|
|
|
if (result < 0)
|
|
|
|
end = mid;
|
|
|
|
else if (result > 0)
|
|
|
|
start = mid + 1;
|
|
|
|
else
|
|
|
|
return &syms[mid];
|
|
|
|
}
|
|
|
|
|
|
|
|
if (start >= 1 && syms[start - 1].addr < key &&
|
|
|
|
key < syms[start].addr)
|
|
|
|
/* valid ksym */
|
|
|
|
return &syms[start - 1];
|
|
|
|
|
|
|
|
/* out of range. return _stext */
|
|
|
|
return &syms[0];
|
|
|
|
}
|
2016-12-08 07:53:14 +08:00
|
|
|
|
|
|
|
/*
 * Attach the program referred to by @fd as the XDP program of the network
 * interface @ifindex, using an RTM_SETLINK rtnetlink request.
 *
 * The request carries a nested IFLA_XDP attribute containing a single
 * IFLA_XDP_FD attribute with @fd as payload, and asks for an ACK. The
 * reply is then read and checked for errors.
 *
 * Returns 0 on success, -1 on any failure (a diagnostic is printed to
 * stdout).
 */
int set_link_xdp_fd(int ifindex, int fd)
{
	struct sockaddr_nl sa;
	socklen_t addrlen;
	int sock, seq = 0, len, ret = -1;
	char buf[4096];
	struct nlattr *nla, *nla_xdp;
	struct {
		struct nlmsghdr nh;
		struct ifinfomsg ifinfo;
		char attrbuf[64];
	} req;
	struct nlmsghdr *nh;
	struct nlmsgerr *err;
	__u32 nl_pid;

	memset(&sa, 0, sizeof(sa));
	sa.nl_family = AF_NETLINK;

	sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (sock < 0) {
		printf("open netlink socket: %s\n", strerror(errno));
		return -1;
	}

	if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
		printf("bind to netlink: %s\n", strerror(errno));
		goto cleanup;
	}

	/*
	 * The socket was bound with nl_pid == 0, so the kernel chose the
	 * port id. It equals getpid() only for the first netlink socket of
	 * the process; ask the kernel which id was actually assigned so the
	 * reply check below does not fail spuriously.
	 */
	addrlen = sizeof(sa);
	if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) {
		printf("getsockname on netlink: %s\n", strerror(errno));
		goto cleanup;
	}
	if (addrlen != sizeof(sa)) {
		printf("Wrong address length %d\n", (int)addrlen);
		goto cleanup;
	}
	nl_pid = sa.nl_pid;

	memset(&req, 0, sizeof(req));
	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
	req.nh.nlmsg_type = RTM_SETLINK;
	req.nh.nlmsg_pid = 0;
	req.nh.nlmsg_seq = ++seq;
	req.ifinfo.ifi_family = AF_UNSPEC;
	req.ifinfo.ifi_index = ifindex;

	/* outer nested attribute starts right after the aligned header */
	nla = (struct nlattr *)(((char *)&req)
				+ NLMSG_ALIGN(req.nh.nlmsg_len));
	nla->nla_type = NLA_F_NESTED | 43/*IFLA_XDP*/;

	/* inner attribute: the program fd */
	nla_xdp = (struct nlattr *)((char *)nla + NLA_HDRLEN);
	nla_xdp->nla_type = 1/*IFLA_XDP_FD*/;
	nla_xdp->nla_len = NLA_HDRLEN + sizeof(int);
	memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd));
	nla->nla_len = NLA_HDRLEN + nla_xdp->nla_len;

	req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);

	if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
		printf("send to netlink: %s\n", strerror(errno));
		goto cleanup;
	}

	len = recv(sock, buf, sizeof(buf), 0);
	if (len < 0) {
		printf("recv from netlink: %s\n", strerror(errno));
		goto cleanup;
	}

	for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
	     nh = NLMSG_NEXT(nh, len)) {
		if (nh->nlmsg_pid != nl_pid) {
			printf("Wrong pid %d, expected %d\n",
			       nh->nlmsg_pid, nl_pid);
			goto cleanup;
		}
		if (nh->nlmsg_seq != seq) {
			printf("Wrong seq %d, expected %d\n",
			       nh->nlmsg_seq, seq);
			goto cleanup;
		}
		switch (nh->nlmsg_type) {
		case NLMSG_ERROR:
			err = (struct nlmsgerr *)NLMSG_DATA(nh);
			/* error == 0 is the ACK for our request */
			if (!err->error)
				continue;
			printf("nlmsg error %s\n", strerror(-err->error));
			goto cleanup;
		case NLMSG_DONE:
			break;
		default:
			break;
		}
	}

	ret = 0;

cleanup:
	close(sock);
	return ret;
}
|