linux/net/sched/cls_api.c

751 lines
17 KiB
C
Raw Normal View History

/*
* net/sched/cls_api.c Packet classifier API.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* Changes:
*
* Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
*
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/err.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 16:04:11 +08:00
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
/* The list of all installed classifier types */
static LIST_HEAD(tcf_proto_base);
/* Protects list of registered TC modules. It is pure SMP lock. */
static DEFINE_RWLOCK(cls_mod_lock);
/* Find classifier type by string name */
static const struct tcf_proto_ops *tcf_proto_lookup_ops(const char *kind)
{
const struct tcf_proto_ops *t, *res = NULL;
if (kind) {
read_lock(&cls_mod_lock);
list_for_each_entry(t, &tcf_proto_base, head) {
if (strcmp(kind, t->kind) == 0) {
if (try_module_get(t->owner))
res = t;
break;
}
}
read_unlock(&cls_mod_lock);
}
return res;
}
/* Register(unregister) new classifier type */
int register_tcf_proto_ops(struct tcf_proto_ops *ops)
{
struct tcf_proto_ops *t;
int rc = -EEXIST;
write_lock(&cls_mod_lock);
list_for_each_entry(t, &tcf_proto_base, head)
if (!strcmp(ops->kind, t->kind))
goto out;
list_add_tail(&ops->head, &tcf_proto_base);
rc = 0;
out:
write_unlock(&cls_mod_lock);
return rc;
}
EXPORT_SYMBOL(register_tcf_proto_ops);
int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
{
struct tcf_proto_ops *t;
int rc = -ENOENT;
net: sched: fix call_rcu() race on classifier module unloads Vijay reported that a loop as simple as ... while true; do tc qdisc add dev foo root handle 1: prio tc filter add dev foo parent 1: u32 match u32 0 0 flowid 1 tc qdisc del dev foo root rmmod cls_u32 done ... will panic the kernel. Moreover, he bisected the change apparently introducing it to 78fd1d0ab072 ("netlink: Re-add locking to netlink_lookup() and seq walker"). The removal of synchronize_net() from the netlink socket triggering the qdisc to be removed, seems to have uncovered an RCU resp. module reference count race from the tc API. Given that RCU conversion was done after e341694e3eb5 ("netlink: Convert netlink_lookup() to use RCU protected hash table") which added the synchronize_net() originally, occasion of hitting the bug was less likely (not impossible though): When qdiscs that i) support attaching classifiers and, ii) have at least one of them attached, get deleted, they invoke tcf_destroy_chain(), and thus call into ->destroy() handler from a classifier module. After RCU conversion, all classifier that have an internal prio list, unlink them and initiate freeing via call_rcu() deferral. Meanhile, tcf_destroy() releases already reference to the tp->ops->owner module before the queued RCU callback handler has been invoked. Subsequent rmmod on the classifier module is then not prevented since all module references are already dropped. By the time, the kernel invokes the RCU callback handler from the module, that function address is then invalid. One way to fix it would be to add an rcu_barrier() to unregister_tcf_proto_ops() to wait for all pending call_rcu()s to complete. synchronize_rcu() is not appropriate as under heavy RCU callback load, registered call_rcu()s could be deferred longer than a grace period. In case we don't have any pending call_rcu()s, the barrier is allowed to return immediately. Since we came here via unregister_tcf_proto_ops(), there are no users of a given classifier anymore. Further nested call_rcu()s pointing into the module space are not being done anywhere. Only cls_bpf_delete_prog() may schedule a work item, to unlock pages eventually, but that is not in the range/context of cls_bpf anymore. Fixes: 25d8c0d55f24 ("net: rcu-ify tcf_proto") Fixes: 9888faefe132 ("net: sched: cls_basic use RCU") Reported-by: Vijay Subramanian <subramanian.vijay@gmail.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Cc: John Fastabend <john.r.fastabend@intel.com> Cc: Eric Dumazet <edumazet@google.com> Cc: Thomas Graf <tgraf@suug.ch> Cc: Jamal Hadi Salim <jhs@mojatatu.com> Cc: Alexei Starovoitov <ast@plumgrid.com> Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com> Acked-by: Alexei Starovoitov <ast@plumgrid.com> Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 23:13:33 +08:00
/* Wait for outstanding call_rcu()s, if any, from a
* tcf_proto_ops's destroy() handler.
*/
rcu_barrier();
write_lock(&cls_mod_lock);
list_for_each_entry(t, &tcf_proto_base, head) {
if (t == ops) {
list_del(&t->head);
rc = 0;
break;
}
}
write_unlock(&cls_mod_lock);
return rc;
}
EXPORT_SYMBOL(unregister_tcf_proto_ops);
static int tfilter_notify(struct net *net, struct sk_buff *oskb,
struct nlmsghdr *n, struct tcf_proto *tp,
unsigned long fh, int event, bool unicast);
net, cls: allow for deleting all filters for given parent Add a possibility where the user can just specify the parent and all filters under that parent are then being purged. Currently, for example for scripting, one needs to specify pref/prio to have a well-defined number for 'tc filter del' command for addressing the previously created instance or additionally filter handle in case of priorities being the same. Improve usage by allowing the option for tc to specify the parent and removing the whole chain for that given parent. Example usage after patch, no tc changes required: # tc qdisc replace dev foo clsact # tc filter add dev foo egress bpf da obj ./bpf.o # tc filter add dev foo egress bpf da obj ./bpf.o # tc filter show dev foo egress filter protocol all pref 49151 bpf filter protocol all pref 49151 bpf handle 0x1 bpf.o:[classifier] direct-action filter protocol all pref 49152 bpf filter protocol all pref 49152 bpf handle 0x1 bpf.o:[classifier] direct-action # tc filter del dev foo egress # tc filter show dev foo egress # Previously, RTM_DELTFILTER requests with invalid prio of 0 were rejected, so only netlink requests with RTM_NEWTFILTER and NLM_F_CREATE flag were allowed where the kernel would auto-generate a pref/prio. We can piggyback on that and use prio of 0 as a wildcard for requests of RTM_DELTFILTER. For notifying tc netlink monitoring users (e.g. libnl uses this for caching), there are two options, that is, sending individual tfilter_notify() notifications for each tcf_proto, or sending a single one indicating wildcard removal. I tried both and there are pros and cons for each, eventually I decided for sending individual tfilter_notify(), so that user space can support this seamlessly and there won't be a mess of changing each and every application to make sure expectations from the kernel won't break when they don't understand single notification. Since linear chains don't really scale, I expect only a handful of classifiers to be attached at max for a given parent anyway. Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Jamal Hadi Salim <jhs@mojatatu.com> Acked-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-11 05:10:22 +08:00
static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
struct nlmsghdr *n,
struct tcf_proto __rcu **chain, int event)
{
struct tcf_proto __rcu **it_chain;
struct tcf_proto *tp;
for (it_chain = chain; (tp = rtnl_dereference(*it_chain)) != NULL;
it_chain = &tp->next)
tfilter_notify(net, oskb, n, tp, 0, event, false);
net, cls: allow for deleting all filters for given parent Add a possibility where the user can just specify the parent and all filters under that parent are then being purged. Currently, for example for scripting, one needs to specify pref/prio to have a well-defined number for 'tc filter del' command for addressing the previously created instance or additionally filter handle in case of priorities being the same. Improve usage by allowing the option for tc to specify the parent and removing the whole chain for that given parent. Example usage after patch, no tc changes required: # tc qdisc replace dev foo clsact # tc filter add dev foo egress bpf da obj ./bpf.o # tc filter add dev foo egress bpf da obj ./bpf.o # tc filter show dev foo egress filter protocol all pref 49151 bpf filter protocol all pref 49151 bpf handle 0x1 bpf.o:[classifier] direct-action filter protocol all pref 49152 bpf filter protocol all pref 49152 bpf handle 0x1 bpf.o:[classifier] direct-action # tc filter del dev foo egress # tc filter show dev foo egress # Previously, RTM_DELTFILTER requests with invalid prio of 0 were rejected, so only netlink requests with RTM_NEWTFILTER and NLM_F_CREATE flag were allowed where the kernel would auto-generate a pref/prio. We can piggyback on that and use prio of 0 as a wildcard for requests of RTM_DELTFILTER. For notifying tc netlink monitoring users (e.g. libnl uses this for caching), there are two options, that is, sending individual tfilter_notify() notifications for each tcf_proto, or sending a single one indicating wildcard removal. I tried both and there are pros and cons for each, eventually I decided for sending individual tfilter_notify(), so that user space can support this seamlessly and there won't be a mess of changing each and every application to make sure expectations from the kernel won't break when they don't understand single notification. Since linear chains don't really scale, I expect only a handful of classifiers to be attached at max for a given parent anyway. Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Jamal Hadi Salim <jhs@mojatatu.com> Acked-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-11 05:10:22 +08:00
}
/* Select new prio value from the range, managed by kernel. */
static inline u32 tcf_auto_prio(struct tcf_proto *tp)
{
u32 first = TC_H_MAKE(0xC0000000U, 0U);
if (tp)
first = tp->prio - 1;
return first;
}
static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol,
u32 prio, u32 parent, struct Qdisc *q)
{
struct tcf_proto *tp;
int err;
tp = kzalloc(sizeof(*tp), GFP_KERNEL);
if (!tp)
return ERR_PTR(-ENOBUFS);
err = -ENOENT;
tp->ops = tcf_proto_lookup_ops(kind);
if (!tp->ops) {
#ifdef CONFIG_MODULES
rtnl_unlock();
request_module("cls_%s", kind);
rtnl_lock();
tp->ops = tcf_proto_lookup_ops(kind);
/* We dropped the RTNL semaphore in order to perform
* the module load. So, even if we succeeded in loading
* the module we have to replay the request. We indicate
* this using -EAGAIN.
*/
if (tp->ops) {
module_put(tp->ops->owner);
err = -EAGAIN;
} else {
err = -ENOENT;
}
goto errout;
#endif
}
tp->classify = tp->ops->classify;
tp->protocol = protocol;
tp->prio = prio;
tp->classid = parent;
tp->q = q;
err = tp->ops->init(tp);
if (err) {
module_put(tp->ops->owner);
goto errout;
}
return tp;
errout:
kfree(tp);
return ERR_PTR(err);
}
static bool tcf_proto_destroy(struct tcf_proto *tp, bool force)
{
if (tp->ops->destroy(tp, force)) {
module_put(tp->ops->owner);
kfree_rcu(tp, rcu);
return true;
}
return false;
}
void tcf_destroy_chain(struct tcf_proto __rcu **fl)
{
struct tcf_proto *tp;
while ((tp = rtnl_dereference(*fl)) != NULL) {
RCU_INIT_POINTER(*fl, tp->next);
tcf_proto_destroy(tp, true);
}
}
EXPORT_SYMBOL(tcf_destroy_chain);
/* Add/change/delete/get a filter node */
static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
{
struct net *net = sock_net(skb->sk);
struct nlattr *tca[TCA_MAX + 1];
struct tcmsg *t;
u32 protocol;
u32 prio;
u32 nprio;
u32 parent;
struct net_device *dev;
struct Qdisc *q;
struct tcf_proto __rcu **back;
struct tcf_proto __rcu **chain;
struct tcf_proto *tp;
const struct Qdisc_class_ops *cops;
unsigned long cl;
unsigned long fh;
int err;
net, sched: fix soft lockup in tc_classify Shahar reported a soft lockup in tc_classify(), where we run into an endless loop when walking the classifier chain due to tp->next == tp which is a state we should never run into. The issue only seems to trigger under load in the tc control path. What happens is that in tc_ctl_tfilter(), thread A allocates a new tp, initializes it, sets tp_created to 1, and calls into tp->ops->change() with it. In that classifier callback we had to unlock/lock the rtnl mutex and returned with -EAGAIN. One reason why we need to drop there is, for example, that we need to request an action module to be loaded. This happens via tcf_exts_validate() -> tcf_action_init/_1() meaning after we loaded and found the requested action, we need to redo the whole request so we don't race against others. While we had to unlock rtnl in that time, thread B's request was processed next on that CPU. Thread B added a new tp instance successfully to the classifier chain. When thread A returned grabbing the rtnl mutex again, propagating -EAGAIN and destroying its tp instance which never got linked, we goto replay and redo A's request. This time when walking the classifier chain in tc_ctl_tfilter() for checking for existing tp instances we had a priority match and found the tp instance that was created and linked by thread B. Now calling again into tp->ops->change() with that tp was successful and returned without error. tp_created was never cleared in the second round, thus kernel thinks that we need to link it into the classifier chain (once again). tp and *back point to the same object due to the match we had earlier on. Thus for thread B's already public tp, we reset tp->next to tp itself and link it into the chain, which eventually causes the mentioned endless loop in tc_classify() once a packet hits the data path. Fix is to clear tp_created at the beginning of each request, also when we replay it. On the paths that can cause -EAGAIN we already destroy the original tp instance we had and on replay we really need to start from scratch. It seems that this issue was first introduced in commit 12186be7d2e1 ("net_cls: fix unconfigured struct tcf_proto keeps chaining and avoid kernel panic when we use cls_cgroup"). Fixes: 12186be7d2e1 ("net_cls: fix unconfigured struct tcf_proto keeps chaining and avoid kernel panic when we use cls_cgroup") Reported-by: Shahar Klein <shahark@mellanox.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Cc: Cong Wang <xiyou.wangcong@gmail.com> Acked-by: Eric Dumazet <edumazet@google.com> Tested-by: Shahar Klein <shahark@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-12-22 01:04:11 +08:00
int tp_created;
if ((n->nlmsg_type != RTM_GETTFILTER) &&
!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
return -EPERM;
replay:
net, sched: fix soft lockup in tc_classify Shahar reported a soft lockup in tc_classify(), where we run into an endless loop when walking the classifier chain due to tp->next == tp which is a state we should never run into. The issue only seems to trigger under load in the tc control path. What happens is that in tc_ctl_tfilter(), thread A allocates a new tp, initializes it, sets tp_created to 1, and calls into tp->ops->change() with it. In that classifier callback we had to unlock/lock the rtnl mutex and returned with -EAGAIN. One reason why we need to drop there is, for example, that we need to request an action module to be loaded. This happens via tcf_exts_validate() -> tcf_action_init/_1() meaning after we loaded and found the requested action, we need to redo the whole request so we don't race against others. While we had to unlock rtnl in that time, thread B's request was processed next on that CPU. Thread B added a new tp instance successfully to the classifier chain. When thread A returned grabbing the rtnl mutex again, propagating -EAGAIN and destroying its tp instance which never got linked, we goto replay and redo A's request. This time when walking the classifier chain in tc_ctl_tfilter() for checking for existing tp instances we had a priority match and found the tp instance that was created and linked by thread B. Now calling again into tp->ops->change() with that tp was successful and returned without error. tp_created was never cleared in the second round, thus kernel thinks that we need to link it into the classifier chain (once again). tp and *back point to the same object due to the match we had earlier on. Thus for thread B's already public tp, we reset tp->next to tp itself and link it into the chain, which eventually causes the mentioned endless loop in tc_classify() once a packet hits the data path. Fix is to clear tp_created at the beginning of each request, also when we replay it. On the paths that can cause -EAGAIN we already destroy the original tp instance we had and on replay we really need to start from scratch. It seems that this issue was first introduced in commit 12186be7d2e1 ("net_cls: fix unconfigured struct tcf_proto keeps chaining and avoid kernel panic when we use cls_cgroup"). Fixes: 12186be7d2e1 ("net_cls: fix unconfigured struct tcf_proto keeps chaining and avoid kernel panic when we use cls_cgroup") Reported-by: Shahar Klein <shahark@mellanox.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Cc: Cong Wang <xiyou.wangcong@gmail.com> Acked-by: Eric Dumazet <edumazet@google.com> Tested-by: Shahar Klein <shahark@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-12-22 01:04:11 +08:00
tp_created = 0;
err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL);
if (err < 0)
return err;
t = nlmsg_data(n);
protocol = TC_H_MIN(t->tcm_info);
prio = TC_H_MAJ(t->tcm_info);
nprio = prio;
parent = t->tcm_parent;
cl = 0;
if (prio == 0) {
net, cls: allow for deleting all filters for given parent Add a possibility where the user can just specify the parent and all filters under that parent are then being purged. Currently, for example for scripting, one needs to specify pref/prio to have a well-defined number for 'tc filter del' command for addressing the previously created instance or additionally filter handle in case of priorities being the same. Improve usage by allowing the option for tc to specify the parent and removing the whole chain for that given parent. Example usage after patch, no tc changes required: # tc qdisc replace dev foo clsact # tc filter add dev foo egress bpf da obj ./bpf.o # tc filter add dev foo egress bpf da obj ./bpf.o # tc filter show dev foo egress filter protocol all pref 49151 bpf filter protocol all pref 49151 bpf handle 0x1 bpf.o:[classifier] direct-action filter protocol all pref 49152 bpf filter protocol all pref 49152 bpf handle 0x1 bpf.o:[classifier] direct-action # tc filter del dev foo egress # tc filter show dev foo egress # Previously, RTM_DELTFILTER requests with invalid prio of 0 were rejected, so only netlink requests with RTM_NEWTFILTER and NLM_F_CREATE flag were allowed where the kernel would auto-generate a pref/prio. We can piggyback on that and use prio of 0 as a wildcard for requests of RTM_DELTFILTER. For notifying tc netlink monitoring users (e.g. libnl uses this for caching), there are two options, that is, sending individual tfilter_notify() notifications for each tcf_proto, or sending a single one indicating wildcard removal. I tried both and there are pros and cons for each, eventually I decided for sending individual tfilter_notify(), so that user space can support this seamlessly and there won't be a mess of changing each and every application to make sure expectations from the kernel won't break when they don't understand single notification. Since linear chains don't really scale, I expect only a handful of classifiers to be attached at max for a given parent anyway. Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Jamal Hadi Salim <jhs@mojatatu.com> Acked-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-11 05:10:22 +08:00
switch (n->nlmsg_type) {
case RTM_DELTFILTER:
if (protocol || t->tcm_handle || tca[TCA_KIND])
net, cls: allow for deleting all filters for given parent Add a possibility where the user can just specify the parent and all filters under that parent are then being purged. Currently, for example for scripting, one needs to specify pref/prio to have a well-defined number for 'tc filter del' command for addressing the previously created instance or additionally filter handle in case of priorities being the same. Improve usage by allowing the option for tc to specify the parent and removing the whole chain for that given parent. Example usage after patch, no tc changes required: # tc qdisc replace dev foo clsact # tc filter add dev foo egress bpf da obj ./bpf.o # tc filter add dev foo egress bpf da obj ./bpf.o # tc filter show dev foo egress filter protocol all pref 49151 bpf filter protocol all pref 49151 bpf handle 0x1 bpf.o:[classifier] direct-action filter protocol all pref 49152 bpf filter protocol all pref 49152 bpf handle 0x1 bpf.o:[classifier] direct-action # tc filter del dev foo egress # tc filter show dev foo egress # Previously, RTM_DELTFILTER requests with invalid prio of 0 were rejected, so only netlink requests with RTM_NEWTFILTER and NLM_F_CREATE flag were allowed where the kernel would auto-generate a pref/prio. We can piggyback on that and use prio of 0 as a wildcard for requests of RTM_DELTFILTER. For notifying tc netlink monitoring users (e.g. libnl uses this for caching), there are two options, that is, sending individual tfilter_notify() notifications for each tcf_proto, or sending a single one indicating wildcard removal. I tried both and there are pros and cons for each, eventually I decided for sending individual tfilter_notify(), so that user space can support this seamlessly and there won't be a mess of changing each and every application to make sure expectations from the kernel won't break when they don't understand single notification. Since linear chains don't really scale, I expect only a handful of classifiers to be attached at max for a given parent anyway. Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Jamal Hadi Salim <jhs@mojatatu.com> Acked-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-11 05:10:22 +08:00
return -ENOENT;
break;
case RTM_NEWTFILTER:
/* If no priority is provided by the user,
* we allocate one.
*/
if (n->nlmsg_flags & NLM_F_CREATE) {
prio = TC_H_MAKE(0x80000000U, 0U);
break;
}
/* fall-through */
default:
return -ENOENT;
net, cls: allow for deleting all filters for given parent Add a possibility where the user can just specify the parent and all filters under that parent are then being purged. Currently, for example for scripting, one needs to specify pref/prio to have a well-defined number for 'tc filter del' command for addressing the previously created instance or additionally filter handle in case of priorities being the same. Improve usage by allowing the option for tc to specify the parent and removing the whole chain for that given parent. Example usage after patch, no tc changes required: # tc qdisc replace dev foo clsact # tc filter add dev foo egress bpf da obj ./bpf.o # tc filter add dev foo egress bpf da obj ./bpf.o # tc filter show dev foo egress filter protocol all pref 49151 bpf filter protocol all pref 49151 bpf handle 0x1 bpf.o:[classifier] direct-action filter protocol all pref 49152 bpf filter protocol all pref 49152 bpf handle 0x1 bpf.o:[classifier] direct-action # tc filter del dev foo egress # tc filter show dev foo egress # Previously, RTM_DELTFILTER requests with invalid prio of 0 were rejected, so only netlink requests with RTM_NEWTFILTER and NLM_F_CREATE flag were allowed where the kernel would auto-generate a pref/prio. We can piggyback on that and use prio of 0 as a wildcard for requests of RTM_DELTFILTER. For notifying tc netlink monitoring users (e.g. libnl uses this for caching), there are two options, that is, sending individual tfilter_notify() notifications for each tcf_proto, or sending a single one indicating wildcard removal. I tried both and there are pros and cons for each, eventually I decided for sending individual tfilter_notify(), so that user space can support this seamlessly and there won't be a mess of changing each and every application to make sure expectations from the kernel won't break when they don't understand single notification. Since linear chains don't really scale, I expect only a handful of classifiers to be attached at max for a given parent anyway. Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Jamal Hadi Salim <jhs@mojatatu.com> Acked-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-11 05:10:22 +08:00
}
}
/* Find head of filter chain. */
/* Find link */
dev = __dev_get_by_index(net, t->tcm_ifindex);
if (dev == NULL)
return -ENODEV;
/* Find qdisc */
if (!parent) {
q = dev->qdisc;
parent = q->handle;
} else {
q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent));
if (q == NULL)
return -EINVAL;
}
/* Is it classful? */
cops = q->ops->cl_ops;
if (!cops)
return -EINVAL;
if (cops->tcf_chain == NULL)
return -EOPNOTSUPP;
/* Do we search for filter, attached to class? */
if (TC_H_MIN(parent)) {
cl = cops->get(q, parent);
if (cl == 0)
return -ENOENT;
}
/* And the last stroke */
chain = cops->tcf_chain(q, cl);
err = -EINVAL;
if (chain == NULL)
goto errout;
net, cls: allow for deleting all filters for given parent Add a possibility where the user can just specify the parent and all filters under that parent are then being purged. Currently, for example for scripting, one needs to specify pref/prio to have a well-defined number for 'tc filter del' command for addressing the previously created instance or additionally filter handle in case of priorities being the same. Improve usage by allowing the option for tc to specify the parent and removing the whole chain for that given parent. Example usage after patch, no tc changes required: # tc qdisc replace dev foo clsact # tc filter add dev foo egress bpf da obj ./bpf.o # tc filter add dev foo egress bpf da obj ./bpf.o # tc filter show dev foo egress filter protocol all pref 49151 bpf filter protocol all pref 49151 bpf handle 0x1 bpf.o:[classifier] direct-action filter protocol all pref 49152 bpf filter protocol all pref 49152 bpf handle 0x1 bpf.o:[classifier] direct-action # tc filter del dev foo egress # tc filter show dev foo egress # Previously, RTM_DELTFILTER requests with invalid prio of 0 were rejected, so only netlink requests with RTM_NEWTFILTER and NLM_F_CREATE flag were allowed where the kernel would auto-generate a pref/prio. We can piggyback on that and use prio of 0 as a wildcard for requests of RTM_DELTFILTER. For notifying tc netlink monitoring users (e.g. libnl uses this for caching), there are two options, that is, sending individual tfilter_notify() notifications for each tcf_proto, or sending a single one indicating wildcard removal. I tried both and there are pros and cons for each, eventually I decided for sending individual tfilter_notify(), so that user space can support this seamlessly and there won't be a mess of changing each and every application to make sure expectations from the kernel won't break when they don't understand single notification. Since linear chains don't really scale, I expect only a handful of classifiers to be attached at max for a given parent anyway. Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Jamal Hadi Salim <jhs@mojatatu.com> Acked-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-11 05:10:22 +08:00
if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) {
tfilter_notify_chain(net, skb, n, chain, RTM_DELTFILTER);
tcf_destroy_chain(chain);
err = 0;
goto errout;
}
/* Check the chain for existence of proto-tcf with this priority */
for (back = chain;
(tp = rtnl_dereference(*back)) != NULL;
back = &tp->next) {
if (tp->prio >= prio) {
if (tp->prio == prio) {
if (!nprio ||
(tp->protocol != protocol && protocol))
goto errout;
} else
tp = NULL;
break;
}
}
if (tp == NULL) {
/* Proto-tcf does not exist, create new one */
if (tca[TCA_KIND] == NULL || !protocol)
goto errout;
err = -ENOENT;
if (n->nlmsg_type != RTM_NEWTFILTER ||
!(n->nlmsg_flags & NLM_F_CREATE))
goto errout;
if (!nprio)
nprio = TC_H_MAJ(tcf_auto_prio(rtnl_dereference(*back)));
tp = tcf_proto_create(nla_data(tca[TCA_KIND]),
protocol, nprio, parent, q);
if (IS_ERR(tp)) {
err = PTR_ERR(tp);
goto errout;
}
tp_created = 1;
} else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind))
goto errout;
fh = tp->ops->get(tp, t->tcm_handle);
if (fh == 0) {
if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
struct tcf_proto *next = rtnl_dereference(tp->next);
RCU_INIT_POINTER(*back, next);
tfilter_notify(net, skb, n, tp, fh,
RTM_DELTFILTER, false);
tcf_proto_destroy(tp, true);
err = 0;
goto errout;
}
err = -ENOENT;
if (n->nlmsg_type != RTM_NEWTFILTER ||
!(n->nlmsg_flags & NLM_F_CREATE))
goto errout;
} else {
switch (n->nlmsg_type) {
case RTM_NEWTFILTER:
err = -EEXIST;
if (n->nlmsg_flags & NLM_F_EXCL) {
if (tp_created)
tcf_proto_destroy(tp, true);
goto errout;
}
break;
case RTM_DELTFILTER:
err = tp->ops->delete(tp, fh);
if (err == 0) {
struct tcf_proto *next = rtnl_dereference(tp->next);
net sched filters: fix notification of filter delete with proper handle Daniel says: While trying out [1][2], I noticed that tc monitor doesn't show the correct handle on delete: $ tc monitor qdisc clsact ffff: dev eno1 parent ffff:fff1 filter dev eno1 ingress protocol all pref 49152 bpf handle 0x2a [...] deleted filter dev eno1 ingress protocol all pref 49152 bpf handle 0xf3be0c80 some context to explain the above: The user identity of any tc filter is represented by a 32-bit identifier encoded in tcm->tcm_handle. Example 0x2a in the bpf filter above. A user wishing to delete, get or even modify a specific filter uses this handle to reference it. Every classifier is free to provide its own semantics for the 32 bit handle. Example: classifiers like u32 use schemes like 800:1:801 to describe the semantics of their filters represented as hash table, bucket and node ids etc. Classifiers also have internal per-filter representation which is different from this externally visible identity. Most classifiers set this internal representation to be a pointer address (which allows fast retrieval of said filters in their implementations). This internal representation is referenced with the "fh" variable in the kernel control code. When a user successfuly deletes a specific filter, by specifying the correct tcm->tcm_handle, an event is generated to user space which indicates which specific filter was deleted. Before this patch, the "fh" value was sent to user space as the identity. As an example what is shown in the sample bpf filter delete event above is 0xf3be0c80. This is infact a 32-bit truncation of 0xffff8807f3be0c80 which happens to be a 64-bit memory address of the internal filter representation (address of the corresponding filter's struct cls_bpf_prog); After this patch the appropriate user identifiable handle as encoded in the originating request tcm->tcm_handle is generated in the event. One of the cardinal rules of netlink rules is to be able to take an event (such as a delete in this case) and reflect it back to the kernel and successfully delete the filter. This patch achieves that. Note, this issue has existed since the original TC action infrastructure code patch back in 2004 as found in: https://git.kernel.org/cgit/linux/kernel/git/history/history.git/commit/ [1] http://patchwork.ozlabs.org/patch/682828/ [2] http://patchwork.ozlabs.org/patch/682829/ Fixes: 4e54c4816bfe ("[NET]: Add tc extensions infrastructure.") Reported-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Cong Wang <xiyou.wangcong@gmail.com> Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-10-25 08:18:27 +08:00
tfilter_notify(net, skb, n, tp,
t->tcm_handle,
RTM_DELTFILTER, false);
if (tcf_proto_destroy(tp, false))
RCU_INIT_POINTER(*back, next);
}
goto errout;
case RTM_GETTFILTER:
err = tfilter_notify(net, skb, n, tp, fh,
RTM_NEWTFILTER, true);
goto errout;
default:
err = -EINVAL;
goto errout;
}
}
err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh,
n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE);
if (err == 0) {
if (tp_created) {
RCU_INIT_POINTER(tp->next, rtnl_dereference(*back));
rcu_assign_pointer(*back, tp);
}
tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER, false);
} else {
if (tp_created)
tcf_proto_destroy(tp, true);
}
errout:
if (cl)
cops->put(q, cl);
if (err == -EAGAIN)
/* Replay the request. */
goto replay;
return err;
}
static int tcf_fill_node(struct net *net, struct sk_buff *skb,
struct tcf_proto *tp, unsigned long fh, u32 portid,
u32 seq, u16 flags, int event)
{
struct tcmsg *tcm;
struct nlmsghdr *nlh;
unsigned char *b = skb_tail_pointer(skb);
nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
if (!nlh)
goto out_nlmsg_trim;
tcm = nlmsg_data(nlh);
tcm->tcm_family = AF_UNSPEC;
tcm->tcm__pad1 = 0;
tcm->tcm__pad2 = 0;
tcm->tcm_ifindex = qdisc_dev(tp->q)->ifindex;
tcm->tcm_parent = tp->classid;
tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
if (nla_put_string(skb, TCA_KIND, tp->ops->kind))
goto nla_put_failure;
tcm->tcm_handle = fh;
if (RTM_DELTFILTER != event) {
tcm->tcm_handle = 0;
if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm) < 0)
goto nla_put_failure;
}
nlh->nlmsg_len = skb_tail_pointer(skb) - b;
return skb->len;
out_nlmsg_trim:
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
static int tfilter_notify(struct net *net, struct sk_buff *oskb,
struct nlmsghdr *n, struct tcf_proto *tp,
unsigned long fh, int event, bool unicast)
{
struct sk_buff *skb;
u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb)
return -ENOBUFS;
if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq,
n->nlmsg_flags, event) <= 0) {
kfree_skb(skb);
return -EINVAL;
}
if (unicast)
return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
n->nlmsg_flags & NLM_F_ECHO);
}
struct tcf_dump_args {
struct tcf_walker w;
struct sk_buff *skb;
struct netlink_callback *cb;
};
static int tcf_node_dump(struct tcf_proto *tp, unsigned long n,
struct tcf_walker *arg)
{
struct tcf_dump_args *a = (void *)arg;
struct net *net = sock_net(a->skb->sk);
return tcf_fill_node(net, a->skb, tp, n, NETLINK_CB(a->cb->skb).portid,
a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
RTM_NEWTFILTER);
}
/* called with RTNL */
static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
int t;
int s_t;
struct net_device *dev;
struct Qdisc *q;
struct tcf_proto *tp, __rcu **chain;
struct tcmsg *tcm = nlmsg_data(cb->nlh);
unsigned long cl = 0;
const struct Qdisc_class_ops *cops;
struct tcf_dump_args arg;
if (nlmsg_len(cb->nlh) < sizeof(*tcm))
return skb->len;
dev = __dev_get_by_index(net, tcm->tcm_ifindex);
if (!dev)
return skb->len;
if (!tcm->tcm_parent)
q = dev->qdisc;
else
q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
if (!q)
goto out;
cops = q->ops->cl_ops;
if (!cops)
goto errout;
if (cops->tcf_chain == NULL)
goto errout;
if (TC_H_MIN(tcm->tcm_parent)) {
cl = cops->get(q, tcm->tcm_parent);
if (cl == 0)
goto errout;
}
chain = cops->tcf_chain(q, cl);
if (chain == NULL)
goto errout;
s_t = cb->args[0];
for (tp = rtnl_dereference(*chain), t = 0;
tp; tp = rtnl_dereference(tp->next), t++) {
if (t < s_t)
continue;
if (TC_H_MAJ(tcm->tcm_info) &&
TC_H_MAJ(tcm->tcm_info) != tp->prio)
continue;
if (TC_H_MIN(tcm->tcm_info) &&
TC_H_MIN(tcm->tcm_info) != tp->protocol)
continue;
if (t > s_t)
memset(&cb->args[1], 0,
sizeof(cb->args)-sizeof(cb->args[0]));
if (cb->args[1] == 0) {
if (tcf_fill_node(net, skb, tp, 0,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
RTM_NEWTFILTER) <= 0)
break;
cb->args[1] = 1;
}
if (tp->ops->walk == NULL)
continue;
arg.w.fn = tcf_node_dump;
arg.skb = skb;
arg.cb = cb;
arg.w.stop = 0;
arg.w.skip = cb->args[1] - 1;
arg.w.count = 0;
tp->ops->walk(tp, &arg.w);
cb->args[1] = arg.w.count + 1;
if (arg.w.stop)
break;
}
cb->args[0] = t;
errout:
if (cl)
cops->put(q, cl);
out:
return skb->len;
}
void tcf_exts_destroy(struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
LIST_HEAD(actions);
tcf_exts_to_list(exts, &actions);
tcf_action_destroy(&actions, TCA_ACT_UNBIND);
kfree(exts->actions);
exts->nr_actions = 0;
#endif
}
EXPORT_SYMBOL(tcf_exts_destroy);
int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr)
{
#ifdef CONFIG_NET_CLS_ACT
{
struct tc_action *act;
if (exts->police && tb[exts->police]) {
act = tcf_action_init_1(net, tb[exts->police], rate_tlv,
"police", ovr, TCA_ACT_BIND);
if (IS_ERR(act))
return PTR_ERR(act);
act->type = exts->type = TCA_OLD_COMPAT;
exts->actions[0] = act;
exts->nr_actions = 1;
} else if (exts->action && tb[exts->action]) {
LIST_HEAD(actions);
int err, i = 0;
err = tcf_action_init(net, tb[exts->action], rate_tlv,
NULL, ovr, TCA_ACT_BIND,
&actions);
if (err)
return err;
list_for_each_entry(act, &actions, list)
exts->actions[i++] = act;
exts->nr_actions = i;
}
}
#else
if ((exts->action && tb[exts->action]) ||
(exts->police && tb[exts->police]))
return -EOPNOTSUPP;
#endif
return 0;
}
EXPORT_SYMBOL(tcf_exts_validate);
void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst,
struct tcf_exts *src)
{
#ifdef CONFIG_NET_CLS_ACT
struct tcf_exts old = *dst;
tcf_tree_lock(tp);
dst->nr_actions = src->nr_actions;
dst->actions = src->actions;
dst->type = src->type;
tcf_tree_unlock(tp);
tcf_exts_destroy(&old);
#endif
}
EXPORT_SYMBOL(tcf_exts_change);
#ifdef CONFIG_NET_CLS_ACT
static struct tc_action *tcf_exts_first_act(struct tcf_exts *exts)
{
if (exts->nr_actions == 0)
return NULL;
else
return exts->actions[0];
}
#endif
int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
struct nlattr *nest;
if (exts->action && exts->nr_actions) {
/*
* again for backward compatible mode - we want
* to work with both old and new modes of entering
* tc data even if iproute2 was newer - jhs
*/
if (exts->type != TCA_OLD_COMPAT) {
LIST_HEAD(actions);
nest = nla_nest_start(skb, exts->action);
if (nest == NULL)
goto nla_put_failure;
tcf_exts_to_list(exts, &actions);
if (tcf_action_dump(skb, &actions, 0, 0) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
} else if (exts->police) {
struct tc_action *act = tcf_exts_first_act(exts);
nest = nla_nest_start(skb, exts->police);
if (nest == NULL || !act)
goto nla_put_failure;
if (tcf_action_dump_old(skb, act, 0, 0) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
}
}
return 0;
nla_put_failure:
nla_nest_cancel(skb, nest);
return -1;
#else
return 0;
#endif
}
EXPORT_SYMBOL(tcf_exts_dump);
int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
struct tc_action *a = tcf_exts_first_act(exts);
if (a != NULL && tcf_action_copy_stats(skb, a, 1) < 0)
return -1;
#endif
return 0;
}
EXPORT_SYMBOL(tcf_exts_dump_stats);
int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts,
struct net_device **hw_dev)
{
#ifdef CONFIG_NET_CLS_ACT
const struct tc_action *a;
LIST_HEAD(actions);
if (tc_no_actions(exts))
return -EINVAL;
tcf_exts_to_list(exts, &actions);
list_for_each_entry(a, &actions, list) {
if (a->ops->get_dev) {
a->ops->get_dev(a, dev_net(dev), hw_dev);
break;
}
}
if (*hw_dev)
return 0;
#endif
return -EOPNOTSUPP;
}
EXPORT_SYMBOL(tcf_exts_get_dev);
static int __init tc_filter_init(void)
{
rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, NULL);
rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, NULL);
rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter,
tc_dump_tfilter, NULL);
return 0;
}
subsys_initcall(tc_filter_init);