/* linux/net/core/lwt_bpf.c
 * (listing metadata: 398 lines, 8.9 KiB, C)
 */

/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <linux/bpf.h>
#include <net/lwtunnel.h>
/* One BPF program attached to a lightweight-tunnel hook. */
struct bpf_lwt_prog {
/* Program reference obtained in bpf_parse_prog(); released with
 * bpf_prog_put() in bpf_lwt_prog_destroy(). NULL when the hook is unused.
 */
struct bpf_prog *prog;
/* Heap copy of the LWT_BPF_PROG_NAME attribute; kfree()d on destroy. */
char *name;
};
/* Per-route state stored in the data area of a struct lwtunnel_state
 * (see bpf_lwt_lwtunnel()): one optional program per hook.
 */
struct bpf_lwt {
/* Program run by bpf_input(). */
struct bpf_lwt_prog in;
/* Program run by bpf_output(). */
struct bpf_lwt_prog out;
/* Program run by bpf_xmit(). */
struct bpf_lwt_prog xmit;
/* Address family passed to bpf_build_state() (AF_INET or AF_INET6). */
int family;
};
/* Length limit applied to the LWT_BPF_PROG_NAME attribute by
 * bpf_prog_policy, and the per-name size assumed by bpf_encap_nlsize().
 */
#define MAX_PROG_NAME 256
/* Return the BPF-specific state kept in the data area of @lwt. */
static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
{
	void *priv = lwt->data;

	return priv;
}
/* Readable values for the can_redirect argument of run_lwt_bpf(). */
#define NO_REDIRECT false
#define CAN_REDIRECT true
/* Run the program in @lwt on @skb and translate its verdict.
 *
 * @skb:          packet handed to the program
 * @lwt:          program slot to execute (lwt->prog must be set)
 * @dst:          not used by this function
 * @can_redirect: whether a BPF_REDIRECT verdict may be acted upon
 *
 * Returns BPF_OK when the packet should continue (also used when a
 * redirect verdict is not permitted), BPF_REDIRECT when skb_do_redirect()
 * returned 0, otherwise whatever non-zero value skb_do_redirect()
 * returned. For BPF_DROP the skb is freed and -EPERM is returned; for any
 * other verdict the skb is freed and -EINVAL is returned.
 */
static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
		       struct dst_entry *dst, bool can_redirect)
{
	int verdict;

	/* Preemption stays off from program execution through
	 * skb_do_redirect() so the per-cpu redirect_info written by the
	 * program is still the one consumed by the redirect. The call_rcu
	 * in bpf_prog_put() and map accesses strictly need rcu_read_lock()
	 * protection; mixing with the BH RCU lock doesn't work.
	 */
	preempt_disable();
	bpf_compute_data_pointers(skb);
	verdict = bpf_prog_run_save_cb(lwt->prog, skb);

	if (verdict == BPF_REDIRECT) {
		if (unlikely(!can_redirect)) {
			pr_warn_once("Illegal redirect return code in prog %s\n",
				     lwt->name ? lwt->name : "<unknown>");
			verdict = BPF_OK;
		} else {
			verdict = skb_do_redirect(skb);
			if (!verdict)
				verdict = BPF_REDIRECT;
		}
	} else if (verdict == BPF_DROP) {
		kfree_skb(skb);
		verdict = -EPERM;
	} else if (verdict != BPF_OK) {
		pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", verdict);
		kfree_skb(skb);
		verdict = -EINVAL;
	}

	preempt_enable();

	return verdict;
}
/* Input hook: run the "in" program (if one is attached) and then hand the
 * packet to the input handler saved in the lwtunnel state.
 *
 * Redirects are not permitted on this hook (NO_REDIRECT).
 *
 * Returns the negative error from run_lwt_bpf() (which has already freed
 * the skb in that case), -EINVAL after freeing the skb when no orig_input
 * handler is set, otherwise the result of orig_input().
 */
static int bpf_input(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *bpf;
	int ret;

	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
	if (bpf->in.prog) {
		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
		if (ret < 0)
			return ret;
	}

	if (unlikely(!dst->lwtstate->orig_input)) {
		/* Report the input program's name; this is the input path. */
		pr_warn_once("orig_input not set on dst for prog %s\n",
			     bpf->in.name);
		kfree_skb(skb);
		return -EINVAL;
	}

	return dst->lwtstate->orig_input(skb);
}
/* Output hook: run the "out" program (if one is attached) and then pass
 * the packet to the output handler saved in the lwtunnel state.
 *
 * Redirects are not permitted on this hook (NO_REDIRECT).
 *
 * Returns the negative error from run_lwt_bpf(), -EINVAL after freeing
 * the skb when no orig_output handler is set, otherwise the result of
 * orig_output().
 */
static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *state = bpf_lwt_lwtunnel(dst->lwtstate);

	if (state->out.prog) {
		int err = run_lwt_bpf(skb, &state->out, dst, NO_REDIRECT);

		if (err < 0)
			return err;
	}

	if (likely(dst->lwtstate->orig_output))
		return dst->lwtstate->orig_output(net, sk, skb);

	pr_warn_once("orig_output not set on dst for prog %s\n",
		     state->out.name);
	kfree_skb(skb);

	return -EINVAL;
}
/* Make sure @skb has at least hard_header_len bytes of headroom for the
 * output device of its dst, growing the head if it does not.
 *
 * Returns 0 on success or -ENOMEM if pskb_expand_head() fails.
 */
static int xmit_check_hhlen(struct sk_buff *skb)
{
	int needed = skb_dst(skb)->dev->hard_header_len;
	unsigned int room = skb_headroom(skb);
	int extra;

	if (room >= needed)
		return 0;

	extra = HH_DATA_ALIGN(needed - room);

	return pskb_expand_head(skb, extra, 0, GFP_ATOMIC) ? -ENOMEM : 0;
}
/* Xmit hook: run the "xmit" program (if one is attached); this is the only
 * hook on which redirects are allowed (CAN_REDIRECT).
 *
 * Returns LWTUNNEL_XMIT_CONTINUE when there is no program or the program
 * returned BPF_OK and headroom is sufficient, LWTUNNEL_XMIT_DONE when the
 * packet was redirected, the xmit_check_hhlen() error if headroom could
 * not be ensured, or any other value from run_lwt_bpf() unchanged.
 */
static int bpf_xmit(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *state = bpf_lwt_lwtunnel(dst->lwtstate);
	int rc;

	if (!state->xmit.prog)
		return LWTUNNEL_XMIT_CONTINUE;

	rc = run_lwt_bpf(skb, &state->xmit, dst, CAN_REDIRECT);
	if (rc == BPF_REDIRECT)
		return LWTUNNEL_XMIT_DONE;
	if (rc != BPF_OK)
		return rc;

	/* The program may have grown the header; make sure there is still
	 * enough headroom for the L2 header that follows.
	 */
	rc = xmit_check_hhlen(skb);
	if (unlikely(rc))
		return rc;

	return LWTUNNEL_XMIT_CONTINUE;
}
/* Release one program slot: drop the program reference, if any, and free
 * the name copy.
 */
static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog)
{
	struct bpf_prog *attached = prog->prog;

	if (attached)
		bpf_prog_put(attached);

	kfree(prog->name);
}
static void bpf_destroy_state(struct lwtunnel_state *lwt)
{
struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);
bpf_lwt_prog_destroy(&bpf->in);
bpf_lwt_prog_destroy(&bpf->out);
bpf_lwt_prog_destroy(&bpf->xmit);
}
/* Netlink policy for the attributes nested inside one program attribute,
 * as parsed by bpf_parse_prog(): a u32 program fd and a NUL-terminated
 * name limited by MAX_PROG_NAME.
 */
static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = {
[LWT_BPF_PROG_FD] = { .type = NLA_U32, },
[LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
.len = MAX_PROG_NAME },
};
static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog,
enum bpf_prog_type type)
{
struct nlattr *tb[LWT_BPF_PROG_MAX + 1];
struct bpf_prog *p;
int ret;
u32 fd;
ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy,
NULL);
if (ret < 0)
return ret;
if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME])
return -EINVAL;
bpf: use GFP_ATOMIC instead of GFP_KERNEL in bpf_parse_prog() bpf_parse_prog() is protected by rcu_read_lock(). so that GFP_KERNEL is not allowed in the bpf_parse_prog(). [51015.579396] ============================= [51015.579418] WARNING: suspicious RCU usage [51015.579444] 4.18.0-rc6+ #208 Not tainted [51015.579464] ----------------------------- [51015.579488] ./include/linux/rcupdate.h:303 Illegal context switch in RCU read-side critical section! [51015.579510] other info that might help us debug this: [51015.579532] rcu_scheduler_active = 2, debug_locks = 1 [51015.579556] 2 locks held by ip/1861: [51015.579577] #0: 00000000a8c12fd1 (rtnl_mutex){+.+.}, at: rtnetlink_rcv_msg+0x2e0/0x910 [51015.579711] #1: 00000000bf815f8e (rcu_read_lock){....}, at: lwtunnel_build_state+0x96/0x390 [51015.579842] stack backtrace: [51015.579869] CPU: 0 PID: 1861 Comm: ip Not tainted 4.18.0-rc6+ #208 [51015.579891] Hardware name: To be filled by O.E.M. To be filled by O.E.M./Aptio CRB, BIOS 5.6.5 07/08/2015 [51015.579911] Call Trace: [51015.579950] dump_stack+0x74/0xbb [51015.580000] ___might_sleep+0x16b/0x3a0 [51015.580047] __kmalloc_track_caller+0x220/0x380 [51015.580077] kmemdup+0x1c/0x40 [51015.580077] bpf_parse_prog+0x10e/0x230 [51015.580164] ? kasan_kmalloc+0xa0/0xd0 [51015.580164] ? bpf_destroy_state+0x30/0x30 [51015.580164] ? bpf_build_state+0xe2/0x3e0 [51015.580164] bpf_build_state+0x1bb/0x3e0 [51015.580164] ? bpf_parse_prog+0x230/0x230 [51015.580164] ? lock_is_held_type+0x123/0x1a0 [51015.580164] lwtunnel_build_state+0x1aa/0x390 [51015.580164] fib_create_info+0x1579/0x33d0 [51015.580164] ? sched_clock_local+0xe2/0x150 [51015.580164] ? fib_info_update_nh_saddr+0x1f0/0x1f0 [51015.580164] ? sched_clock_local+0xe2/0x150 [51015.580164] fib_table_insert+0x201/0x1990 [51015.580164] ? lock_downgrade+0x610/0x610 [51015.580164] ? fib_table_lookup+0x1920/0x1920 [51015.580164] ? lwtunnel_valid_encap_type.part.6+0xcb/0x3a0 [51015.580164] ? 
rtm_to_fib_config+0x637/0xbd0 [51015.580164] inet_rtm_newroute+0xed/0x1b0 [51015.580164] ? rtm_to_fib_config+0xbd0/0xbd0 [51015.580164] rtnetlink_rcv_msg+0x331/0x910 [ ... ] Fixes: 3a0af8fd61f9 ("bpf: BPF for lightweight tunnel infrastructure") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-07-28 23:28:31 +08:00
prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_ATOMIC);
if (!prog->name)
return -ENOMEM;
fd = nla_get_u32(tb[LWT_BPF_PROG_FD]);
p = bpf_prog_get_type(fd, type);
if (IS_ERR(p))
return PTR_ERR(p);
prog->prog = p;
return 0;
}
/* Netlink policy for the top-level encap attributes parsed by
 * bpf_build_state(): one nested attribute per hook plus an optional u32
 * xmit headroom.
 */
static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
[LWT_BPF_IN] = { .type = NLA_NESTED, },
[LWT_BPF_OUT] = { .type = NLA_NESTED, },
[LWT_BPF_XMIT] = { .type = NLA_NESTED, },
[LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 },
};
/* Build the lwtunnel state for a BPF encap from netlink attributes.
 *
 * @nla:    nested LWT_BPF_* attributes
 * @family: address family; only AF_INET and AF_INET6 are accepted
 * @cfg:    not used by this function
 * @ts:     receives the newly allocated state on success
 * @extack: extended ack passed to the top-level attribute parse
 *
 * At least one of the in/out/xmit programs must be supplied. Each supplied
 * program sets the matching LWTUNNEL_STATE_*_REDIRECT flag on the state.
 *
 * Returns 0 on success, -EAFNOSUPPORT for other families, -EINVAL when no
 * program attribute is present, -ENOMEM if the state cannot be allocated,
 * -ERANGE if the requested headroom exceeds LWT_BPF_MAX_HEADROOM, or the
 * error from attribute/program parsing. On failure after allocation the
 * partially built state is destroyed and freed.
 */
static int bpf_build_state(struct nlattr *nla,
			   unsigned int family, const void *cfg,
			   struct lwtunnel_state **ts,
			   struct netlink_ext_ack *extack)
{
	struct nlattr *attrs[LWT_BPF_MAX + 1];
	struct lwtunnel_state *state;
	struct bpf_lwt *priv;
	int err;

	if (!(family == AF_INET || family == AF_INET6))
		return -EAFNOSUPPORT;

	err = nla_parse_nested(attrs, LWT_BPF_MAX, nla, bpf_nl_policy, extack);
	if (err < 0)
		return err;

	if (!(attrs[LWT_BPF_IN] || attrs[LWT_BPF_OUT] || attrs[LWT_BPF_XMIT]))
		return -EINVAL;

	state = lwtunnel_state_alloc(sizeof(*priv));
	if (!state)
		return -ENOMEM;

	state->type = LWTUNNEL_ENCAP_BPF;
	priv = bpf_lwt_lwtunnel(state);

	if (attrs[LWT_BPF_IN]) {
		state->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
		err = bpf_parse_prog(attrs[LWT_BPF_IN], &priv->in,
				     BPF_PROG_TYPE_LWT_IN);
		if (err < 0)
			goto free_state;
	}

	if (attrs[LWT_BPF_OUT]) {
		state->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
		err = bpf_parse_prog(attrs[LWT_BPF_OUT], &priv->out,
				     BPF_PROG_TYPE_LWT_OUT);
		if (err < 0)
			goto free_state;
	}

	if (attrs[LWT_BPF_XMIT]) {
		state->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
		err = bpf_parse_prog(attrs[LWT_BPF_XMIT], &priv->xmit,
				     BPF_PROG_TYPE_LWT_XMIT);
		if (err < 0)
			goto free_state;
	}

	if (attrs[LWT_BPF_XMIT_HEADROOM]) {
		u32 room = nla_get_u32(attrs[LWT_BPF_XMIT_HEADROOM]);

		if (room > LWT_BPF_MAX_HEADROOM) {
			err = -ERANGE;
			goto free_state;
		}

		state->headroom = room;
	}

	priv->family = family;
	*ts = state;

	return 0;

free_state:
	/* Drops any program references and names acquired so far. */
	bpf_destroy_state(state);
	kfree(state);

	return err;
}
/* Dump one program slot into @skb as a nested attribute of type @attr
 * containing the program name (when one is set).
 *
 * Returns 0 if the slot has no program, -EMSGSIZE if the message has no
 * room, otherwise the value returned by nla_nest_end().
 */
static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr,
			     struct bpf_lwt_prog *prog)
{
	struct nlattr *container;

	if (!prog->prog)
		return 0;

	container = nla_nest_start(skb, attr);
	if (!container)
		return -EMSGSIZE;

	if (prog->name) {
		if (nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name))
			return -EMSGSIZE;
	}

	return nla_nest_end(skb, container);
}
/* Dump the in, out and xmit program slots of @lwt into @skb, in that
 * order, stopping at the first failure.
 *
 * Returns 0 on success or -EMSGSIZE if any slot could not be written.
 */
static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt)
{
	struct bpf_lwt *state = bpf_lwt_lwtunnel(lwt);

	if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &state->in) < 0)
		return -EMSGSIZE;

	if (bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &state->out) < 0)
		return -EMSGSIZE;

	if (bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &state->xmit) < 0)
		return -EMSGSIZE;

	return 0;
}
/* Upper bound on the netlink space needed to dump this encap: three
 * nested program attributes (LWT_BPF_IN, LWT_BPF_OUT, LWT_BPF_XMIT), each
 * sized for a nest header plus a MAX_PROG_NAME-byte LWT_BPF_PROG_NAME.
 * @lwtstate is not consulted.
 */
static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate)
{
	int per_prog = nla_total_size(sizeof(struct nlattr));

	per_prog += nla_total_size(MAX_PROG_NAME);

	return 3 * per_prog;
}
/* Compare two program slots by name.
 *
 * FIXME: the LWT state is currently rebuilt for delete requests, which
 * yields a new bpf_prog instance, so only the names are compared for now.
 *
 * Returns 0 when both names are absent, 1 when exactly one is absent,
 * otherwise the strcmp() result of the two names.
 */
static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
{
	if (a->name && b->name)
		return strcmp(a->name, b->name);

	return (a->name || b->name) ? 1 : 0;
}
/* Compare two BPF encap states slot by slot (in, out, xmit).
 *
 * Returns 0 when all three slots compare equal, 1 otherwise.
 */
static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
{
	struct bpf_lwt *lhs = bpf_lwt_lwtunnel(a);
	struct bpf_lwt *rhs = bpf_lwt_lwtunnel(b);

	if (bpf_lwt_prog_cmp(&lhs->in, &rhs->in))
		return 1;

	if (bpf_lwt_prog_cmp(&lhs->out, &rhs->out))
		return 1;

	return bpf_lwt_prog_cmp(&lhs->xmit, &rhs->xmit) != 0;
}
/* Callback table registered for LWTUNNEL_ENCAP_BPF by bpf_lwt_init(). */
static const struct lwtunnel_encap_ops bpf_encap_ops = {
.build_state = bpf_build_state,
.destroy_state = bpf_destroy_state,
.input = bpf_input,
.output = bpf_output,
.xmit = bpf_xmit,
.fill_encap = bpf_fill_encap_info,
.get_encap_size = bpf_encap_nlsize,
.cmp_encap = bpf_encap_cmp,
.owner = THIS_MODULE,
};
/* Register the BPF encap callbacks with the lwtunnel core at subsys
 * initcall time; returns the registration result.
 */
static int __init bpf_lwt_init(void)
{
	int err = lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);

	return err;
}
subsys_initcall(bpf_lwt_init)