linux/net/xdp/xskmap.c

274 lines
6.2 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0
/* XSKMAP used for AF_XDP sockets
* Copyright(c) 2018 Intel Corporation.
*/
#include <linux/bpf.h>
#include <linux/capability.h>
#include <net/xdp_sock.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include "xsk.h"
int xsk_map_inc(struct xsk_map *map)
{
bpf: Switch bpf_map ref counter to atomic64_t so bpf_map_inc() never fails 92117d8443bc ("bpf: fix refcnt overflow") turned refcounting of bpf_map into potentially failing operation, when refcount reaches BPF_MAX_REFCNT limit (32k). Due to using 32-bit counter, it's possible in practice to overflow refcounter and make it wrap around to 0, causing erroneous map free, while there are still references to it, causing use-after-free problems. But having a failing refcounting operations are problematic in some cases. One example is mmap() interface. After establishing initial memory-mapping, user is allowed to arbitrarily map/remap/unmap parts of mapped memory, arbitrarily splitting it into multiple non-contiguous regions. All this happening without any control from the users of mmap subsystem. Rather mmap subsystem sends notifications to original creator of memory mapping through open/close callbacks, which are optionally specified during initial memory mapping creation. These callbacks are used to maintain accurate refcount for bpf_map (see next patch in this series). The problem is that open() callback is not supposed to fail, because memory-mapped resource is set up and properly referenced. This is posing a problem for using memory-mapping with BPF maps. One solution to this is to maintain separate refcount for just memory-mappings and do single bpf_map_inc/bpf_map_put when it goes from/to zero, respectively. There are similar use cases in current work on tcp-bpf, necessitating extra counter as well. This seems like a rather unfortunate and ugly solution that doesn't scale well to various new use cases. Another approach to solve this is to use non-failing refcount_t type, which uses 32-bit counter internally, but, once reaching overflow state at UINT_MAX, stays there. This utlimately causes memory leak, but prevents use after free. But given refcounting is not the most performance-critical operation with BPF maps (it's not used from running BPF program code), we can also just switch to 64-bit counter that can't overflow in practice, potentially disadvantaging 32-bit platforms a tiny bit. This simplifies semantics and allows above described scenarios to not worry about failing refcount increment operation. In terms of struct bpf_map size, we are still good and use the same amount of space: BEFORE (3 cache lines, 8 bytes of padding at the end): struct bpf_map { const struct bpf_map_ops * ops __attribute__((__aligned__(64))); /* 0 8 */ struct bpf_map * inner_map_meta; /* 8 8 */ void * security; /* 16 8 */ enum bpf_map_type map_type; /* 24 4 */ u32 key_size; /* 28 4 */ u32 value_size; /* 32 4 */ u32 max_entries; /* 36 4 */ u32 map_flags; /* 40 4 */ int spin_lock_off; /* 44 4 */ u32 id; /* 48 4 */ int numa_node; /* 52 4 */ u32 btf_key_type_id; /* 56 4 */ u32 btf_value_type_id; /* 60 4 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct btf * btf; /* 64 8 */ struct bpf_map_memory memory; /* 72 16 */ bool unpriv_array; /* 88 1 */ bool frozen; /* 89 1 */ /* XXX 38 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ atomic_t refcnt __attribute__((__aligned__(64))); /* 128 4 */ atomic_t usercnt; /* 132 4 */ struct work_struct work; /* 136 32 */ char name[16]; /* 168 16 */ /* size: 192, cachelines: 3, members: 21 */ /* sum members: 146, holes: 1, sum holes: 38 */ /* padding: 8 */ /* forced alignments: 2, forced holes: 1, sum forced holes: 38 */ } __attribute__((__aligned__(64))); AFTER (same 3 cache lines, no extra padding now): struct bpf_map { const struct bpf_map_ops * ops __attribute__((__aligned__(64))); /* 0 8 */ struct bpf_map * inner_map_meta; /* 8 8 */ void * security; /* 16 8 */ enum bpf_map_type map_type; /* 24 4 */ u32 key_size; /* 28 4 */ u32 value_size; /* 32 4 */ u32 max_entries; /* 36 4 */ u32 map_flags; /* 40 4 */ int spin_lock_off; /* 44 4 */ u32 id; /* 48 4 */ int numa_node; /* 52 4 */ u32 btf_key_type_id; /* 56 4 */ u32 btf_value_type_id; /* 60 4 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct btf * btf; /* 64 8 */ struct bpf_map_memory memory; /* 72 16 */ bool unpriv_array; /* 88 1 */ bool frozen; /* 89 1 */ /* XXX 38 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ atomic64_t refcnt __attribute__((__aligned__(64))); /* 128 8 */ atomic64_t usercnt; /* 136 8 */ struct work_struct work; /* 144 32 */ char name[16]; /* 176 16 */ /* size: 192, cachelines: 3, members: 21 */ /* sum members: 154, holes: 1, sum holes: 38 */ /* forced alignments: 2, forced holes: 1, sum forced holes: 38 */ } __attribute__((__aligned__(64))); This patch, while modifying all users of bpf_map_inc, also cleans up its interface to match bpf_map_put with separate operations for bpf_map_inc and bpf_map_inc_with_uref (to match bpf_map_put and bpf_map_put_with_uref, respectively). Also, given there are no users of bpf_map_inc_not_zero specifying uref=true, remove uref flag and default to uref=false internally. Signed-off-by: Andrii Nakryiko <andriin@fb.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Song Liu <songliubraving@fb.com> Link: https://lore.kernel.org/bpf/20191117172806.2195367-2-andriin@fb.com
2019-11-18 01:28:02 +08:00
bpf_map_inc(&map->map);
return 0;
}
void xsk_map_put(struct xsk_map *map)
{
bpf_map_put(&map->map);
}
static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
struct xdp_sock **map_entry)
{
struct xsk_map_node *node;
int err;
node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN);
if (!node)
return ERR_PTR(-ENOMEM);
err = xsk_map_inc(map);
if (err) {
kfree(node);
return ERR_PTR(err);
}
node->map = map;
node->map_entry = map_entry;
return node;
}
static void xsk_map_node_free(struct xsk_map_node *node)
{
xsk_map_put(node->map);
kfree(node);
}
static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node)
{
spin_lock_bh(&xs->map_list_lock);
list_add_tail(&node->node, &xs->map_list);
spin_unlock_bh(&xs->map_list_lock);
}
static void xsk_map_sock_delete(struct xdp_sock *xs,
struct xdp_sock **map_entry)
{
struct xsk_map_node *n, *tmp;
spin_lock_bh(&xs->map_list_lock);
list_for_each_entry_safe(n, tmp, &xs->map_list, node) {
if (map_entry == n->map_entry) {
list_del(&n->node);
xsk_map_node_free(n);
}
}
spin_unlock_bh(&xs->map_list_lock);
}
static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
{
struct bpf_map_memory mem;
int err, numa_node;
struct xsk_map *m;
u64 size;
if (!capable(CAP_NET_ADMIN))
return ERR_PTR(-EPERM);
if (attr->max_entries == 0 || attr->key_size != 4 ||
attr->value_size != 4 ||
attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
return ERR_PTR(-EINVAL);
numa_node = bpf_map_attr_numa_node(attr);
size = struct_size(m, xsk_map, attr->max_entries);
err = bpf_map_charge_init(&mem, size);
if (err < 0)
return ERR_PTR(err);
m = bpf_map_area_alloc(size, numa_node);
if (!m) {
bpf_map_charge_finish(&mem);
return ERR_PTR(-ENOMEM);
}
bpf_map_init_from_attr(&m->map, attr);
bpf_map_charge_move(&m->map.memory, &mem);
spin_lock_init(&m->lock);
return &m->map;
}
static void xsk_map_free(struct bpf_map *map)
{
struct xsk_map *m = container_of(map, struct xsk_map, map);
bpf: fix redirect to map under tail calls Commits 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs") and 7c3001313396 ("bpf: fix ri->map_owner pointer on bpf_prog_realloc") tried to mitigate that buggy programs using bpf_redirect_map() helper call do not leave stale maps behind. Idea was to add a map_owner cookie into the per CPU struct redirect_info which was set to prog->aux by the prog making the helper call as a proof that the map is not stale since the prog is implicitly holding a reference to it. This owner cookie could later on get compared with the program calling into BPF whether they match and therefore the redirect could proceed with processing the map safely. In (obvious) hindsight, this approach breaks down when tail calls are involved since the original caller's prog->aux pointer does not have to match the one from one of the progs out of the tail call chain, and therefore the xdp buffer will be dropped instead of redirected. A way around that would be to fix the issue differently (which also allows to remove related work in fast path at the same time): once the life-time of a redirect map has come to its end we use it's map free callback where we need to wait on synchronize_rcu() for current outstanding xdp buffers and remove such a map pointer from the redirect info if found to be present. At that time no program is using this map anymore so we simply invalidate the map pointers to NULL iff they previously pointed to that instance while making sure that the redirect path only reads out the map once. Fixes: 97f91a7cf04f ("bpf: add bpf_redirect_map helper routine") Fixes: 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs") Reported-by: Sebastiano Miano <sebastiano.miano@polito.it> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: John Fastabend <john.fastabend@gmail.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-08-18 05:26:14 +08:00
bpf_clear_redirect_map(map);
synchronize_net();
bpf_map_area_free(m);
}
static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
struct xsk_map *m = container_of(map, struct xsk_map, map);
u32 index = key ? *(u32 *)key : U32_MAX;
u32 *next = next_key;
if (index >= m->map.max_entries) {
*next = 0;
return 0;
}
if (index == m->map.max_entries - 1)
return -ENOENT;
*next = index + 1;
return 0;
}
bpf: Allow for map-in-map with dynamic inner array map entries Recent work in f4d05259213f ("bpf: Add map_meta_equal map ops") and 134fede4eecf ("bpf: Relax max_entries check for most of the inner map types") added support for dynamic inner max elements for most map-in-map types. Exceptions were maps like array or prog array where the map_gen_lookup() callback uses the maps' max_entries field as a constant when emitting instructions. We recently implemented Maglev consistent hashing into Cilium's load balancer which uses map-in-map with an outer map being hash and inner being array holding the Maglev backend table for each service. This has been designed this way in order to reduce overall memory consumption given the outer hash map allows to avoid preallocating a large, flat memory area for all services. Also, the number of service mappings is not always known a-priori. The use case for dynamic inner array map entries is to further reduce memory overhead, for example, some services might just have a small number of back ends while others could have a large number. Right now the Maglev backend table for small and large number of backends would need to have the same inner array map entries which adds a lot of unneeded overhead. Dynamic inner array map entries can be realized by avoiding the inlined code generation for their lookup. The lookup will still be efficient since it will be calling into array_map_lookup_elem() directly and thus avoiding retpoline. The patch adds a BPF_F_INNER_MAP flag to map creation which therefore skips inline code generation and relaxes array_map_meta_equal() check to ignore both maps' max_entries. This also still allows to have faster lookups for map-in-map when BPF_F_INNER_MAP is not specified and hence dynamic max_entries not needed. Example code generation where inner map is dynamic sized array: # bpftool p d x i 125 int handle__sys_enter(void * ctx): ; int handle__sys_enter(void *ctx) 0: (b4) w1 = 0 ; int key = 0; 1: (63) *(u32 *)(r10 -4) = r1 2: (bf) r2 = r10 ; 3: (07) r2 += -4 ; inner_map = bpf_map_lookup_elem(&outer_arr_dyn, &key); 4: (18) r1 = map[id:468] 6: (07) r1 += 272 7: (61) r0 = *(u32 *)(r2 +0) 8: (35) if r0 >= 0x3 goto pc+5 9: (67) r0 <<= 3 10: (0f) r0 += r1 11: (79) r0 = *(u64 *)(r0 +0) 12: (15) if r0 == 0x0 goto pc+1 13: (05) goto pc+1 14: (b7) r0 = 0 15: (b4) w6 = -1 ; if (!inner_map) 16: (15) if r0 == 0x0 goto pc+6 17: (bf) r2 = r10 ; 18: (07) r2 += -4 ; val = bpf_map_lookup_elem(inner_map, &key); 19: (bf) r1 = r0 | No inlining but instead 20: (85) call array_map_lookup_elem#149280 | call to array_map_lookup_elem() ; return val ? *val : -1; | for inner array lookup. 21: (15) if r0 == 0x0 goto pc+1 ; return val ? *val : -1; 22: (61) r6 = *(u32 *)(r0 +0) ; } 23: (bc) w0 = w6 24: (95) exit Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Signed-off-by: Alexei Starovoitov <ast@kernel.org> Acked-by: Andrii Nakryiko <andrii@kernel.org> Link: https://lore.kernel.org/bpf/20201010234006.7075-4-daniel@iogearbox.net
2020-10-11 07:40:03 +08:00
static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2;
struct bpf_insn *insn = insn_buf;
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *)));
*insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map));
*insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp);
*insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0);
*insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
*insn++ = BPF_MOV64_IMM(ret, 0);
return insn - insn_buf;
}
static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
{
WARN_ON_ONCE(!rcu_read_lock_held());
return __xsk_map_lookup_elem(map, *(u32 *)key);
}
static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key)
{
return ERR_PTR(-EOPNOTSUPP);
}
static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
{
struct xsk_map *m = container_of(map, struct xsk_map, map);
struct xdp_sock *xs, *old_xs, **map_entry;
u32 i = *(u32 *)key, fd = *(u32 *)value;
struct xsk_map_node *node;
struct socket *sock;
int err;
if (unlikely(map_flags > BPF_EXIST))
return -EINVAL;
if (unlikely(i >= m->map.max_entries))
return -E2BIG;
sock = sockfd_lookup(fd, &err);
if (!sock)
return err;
if (sock->sk->sk_family != PF_XDP) {
sockfd_put(sock);
return -EOPNOTSUPP;
}
xs = (struct xdp_sock *)sock->sk;
map_entry = &m->xsk_map[i];
node = xsk_map_node_alloc(m, map_entry);
if (IS_ERR(node)) {
sockfd_put(sock);
return PTR_ERR(node);
}
spin_lock_bh(&m->lock);
old_xs = READ_ONCE(*map_entry);
if (old_xs == xs) {
err = 0;
goto out;
} else if (old_xs && map_flags == BPF_NOEXIST) {
err = -EEXIST;
goto out;
} else if (!old_xs && map_flags == BPF_EXIST) {
err = -ENOENT;
goto out;
}
xsk_map_sock_add(xs, node);
WRITE_ONCE(*map_entry, xs);
if (old_xs)
xsk_map_sock_delete(old_xs, map_entry);
spin_unlock_bh(&m->lock);
sockfd_put(sock);
return 0;
out:
spin_unlock_bh(&m->lock);
sockfd_put(sock);
xsk_map_node_free(node);
return err;
}
static int xsk_map_delete_elem(struct bpf_map *map, void *key)
{
struct xsk_map *m = container_of(map, struct xsk_map, map);
struct xdp_sock *old_xs, **map_entry;
int k = *(u32 *)key;
if (k >= map->max_entries)
return -EINVAL;
spin_lock_bh(&m->lock);
map_entry = &m->xsk_map[k];
old_xs = xchg(map_entry, NULL);
if (old_xs)
xsk_map_sock_delete(old_xs, map_entry);
spin_unlock_bh(&m->lock);
return 0;
}
void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
struct xdp_sock **map_entry)
{
spin_lock_bh(&map->lock);
if (READ_ONCE(*map_entry) == xs) {
WRITE_ONCE(*map_entry, NULL);
xsk_map_sock_delete(xs, map_entry);
}
spin_unlock_bh(&map->lock);
}
static bool xsk_map_meta_equal(const struct bpf_map *meta0,
const struct bpf_map *meta1)
{
return meta0->max_entries == meta1->max_entries &&
bpf_map_meta_equal(meta0, meta1);
}
static int xsk_map_btf_id;
const struct bpf_map_ops xsk_map_ops = {
.map_meta_equal = xsk_map_meta_equal,
.map_alloc = xsk_map_alloc,
.map_free = xsk_map_free,
.map_get_next_key = xsk_map_get_next_key,
.map_lookup_elem = xsk_map_lookup_elem,
.map_gen_lookup = xsk_map_gen_lookup,
.map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only,
.map_update_elem = xsk_map_update_elem,
.map_delete_elem = xsk_map_delete_elem,
.map_check_btf = map_check_no_btf,
.map_btf_name = "xsk_map",
.map_btf_id = &xsk_map_btf_id,
};