2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* net/sched/cls_api.c Packet classifier API.
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public License
|
|
|
|
* as published by the Free Software Foundation; either version
|
|
|
|
* 2 of the License, or (at your option) any later version.
|
|
|
|
*
|
|
|
|
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
|
|
|
|
*
|
|
|
|
* Changes:
|
|
|
|
*
|
|
|
|
* Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/string.h>
|
|
|
|
#include <linux/errno.h>
|
2017-02-09 21:38:57 +08:00
|
|
|
#include <linux/err.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/skbuff.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/kmod.h>
|
2008-01-24 12:33:13 +08:00
|
|
|
#include <linux/err.h>
|
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the following.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and try to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
widely available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build tests were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 16:04:11 +08:00
|
|
|
#include <linux/slab.h>
|
2007-11-30 21:21:31 +08:00
|
|
|
#include <net/net_namespace.h>
|
|
|
|
#include <net/sock.h>
|
2007-03-26 14:06:12 +08:00
|
|
|
#include <net/netlink.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <net/pkt_sched.h>
|
|
|
|
#include <net/pkt_cls.h>
|
|
|
|
|
|
|
|
/* The list of all installed classifier types */
|
2013-12-16 12:15:11 +08:00
|
|
|
static LIST_HEAD(tcf_proto_base);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Protects list of registered TC modules. It is pure SMP lock. */
|
|
|
|
static DEFINE_RWLOCK(cls_mod_lock);
|
|
|
|
|
|
|
|
/* Find classifier type by string name */
|
|
|
|
|
2017-02-09 21:38:57 +08:00
|
|
|
static const struct tcf_proto_ops *tcf_proto_lookup_ops(const char *kind)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2013-12-21 02:04:18 +08:00
|
|
|
const struct tcf_proto_ops *t, *res = NULL;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if (kind) {
|
|
|
|
read_lock(&cls_mod_lock);
|
2013-12-16 12:15:11 +08:00
|
|
|
list_for_each_entry(t, &tcf_proto_base, head) {
|
2017-02-09 21:38:57 +08:00
|
|
|
if (strcmp(kind, t->kind) == 0) {
|
2013-12-21 02:04:18 +08:00
|
|
|
if (try_module_get(t->owner))
|
|
|
|
res = t;
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
read_unlock(&cls_mod_lock);
|
|
|
|
}
|
2013-12-21 02:04:18 +08:00
|
|
|
return res;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Register(unregister) new classifier type */
|
|
|
|
|
|
|
|
int register_tcf_proto_ops(struct tcf_proto_ops *ops)
|
|
|
|
{
|
2013-12-16 12:15:11 +08:00
|
|
|
struct tcf_proto_ops *t;
|
2005-04-17 06:20:36 +08:00
|
|
|
int rc = -EEXIST;
|
|
|
|
|
|
|
|
write_lock(&cls_mod_lock);
|
2013-12-16 12:15:11 +08:00
|
|
|
list_for_each_entry(t, &tcf_proto_base, head)
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!strcmp(ops->kind, t->kind))
|
|
|
|
goto out;
|
|
|
|
|
2013-12-16 12:15:11 +08:00
|
|
|
list_add_tail(&ops->head, &tcf_proto_base);
|
2005-04-17 06:20:36 +08:00
|
|
|
rc = 0;
|
|
|
|
out:
|
|
|
|
write_unlock(&cls_mod_lock);
|
|
|
|
return rc;
|
|
|
|
}
|
2008-01-21 18:26:41 +08:00
|
|
|
EXPORT_SYMBOL(register_tcf_proto_ops);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
|
|
|
|
{
|
2013-12-16 12:15:11 +08:00
|
|
|
struct tcf_proto_ops *t;
|
2005-04-17 06:20:36 +08:00
|
|
|
int rc = -ENOENT;
|
|
|
|
|
net: sched: fix call_rcu() race on classifier module unloads
Vijay reported that a loop as simple as ...
while true; do
tc qdisc add dev foo root handle 1: prio
tc filter add dev foo parent 1: u32 match u32 0 0 flowid 1
tc qdisc del dev foo root
rmmod cls_u32
done
... will panic the kernel. Moreover, he bisected the change
apparently introducing it to 78fd1d0ab072 ("netlink: Re-add
locking to netlink_lookup() and seq walker").
The removal of synchronize_net() from the netlink socket
triggering the qdisc to be removed, seems to have uncovered
an RCU resp. module reference count race from the tc API.
Given that RCU conversion was done after e341694e3eb5 ("netlink:
Convert netlink_lookup() to use RCU protected hash table")
which added the synchronize_net() originally, occasion of
hitting the bug was less likely (not impossible though):
When qdiscs that i) support attaching classifiers and,
ii) have at least one of them attached, get deleted, they
invoke tcf_destroy_chain(), and thus call into ->destroy()
handler from a classifier module.
After RCU conversion, all classifier that have an internal
prio list, unlink them and initiate freeing via call_rcu()
deferral.
Meanhile, tcf_destroy() releases already reference to the
tp->ops->owner module before the queued RCU callback handler
has been invoked.
Subsequent rmmod on the classifier module is then not prevented
since all module references are already dropped.
By the time, the kernel invokes the RCU callback handler from
the module, that function address is then invalid.
One way to fix it would be to add an rcu_barrier() to
unregister_tcf_proto_ops() to wait for all pending call_rcu()s
to complete.
synchronize_rcu() is not appropriate as under heavy RCU
callback load, registered call_rcu()s could be deferred
longer than a grace period. In case we don't have any pending
call_rcu()s, the barrier is allowed to return immediately.
Since we came here via unregister_tcf_proto_ops(), there
are no users of a given classifier anymore. Further nested
call_rcu()s pointing into the module space are not being
done anywhere.
Only cls_bpf_delete_prog() may schedule a work item, to
unlock pages eventually, but that is not in the range/context
of cls_bpf anymore.
Fixes: 25d8c0d55f24 ("net: rcu-ify tcf_proto")
Fixes: 9888faefe132 ("net: sched: cls_basic use RCU")
Reported-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: John Fastabend <john.r.fastabend@intel.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 23:13:33 +08:00
|
|
|
/* Wait for outstanding call_rcu()s, if any, from a
|
|
|
|
* tcf_proto_ops's destroy() handler.
|
|
|
|
*/
|
|
|
|
rcu_barrier();
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
write_lock(&cls_mod_lock);
|
2013-12-21 02:04:18 +08:00
|
|
|
list_for_each_entry(t, &tcf_proto_base, head) {
|
|
|
|
if (t == ops) {
|
|
|
|
list_del(&t->head);
|
|
|
|
rc = 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
2013-12-21 02:04:18 +08:00
|
|
|
}
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
write_unlock(&cls_mod_lock);
|
|
|
|
return rc;
|
|
|
|
}
|
2008-01-21 18:26:41 +08:00
|
|
|
EXPORT_SYMBOL(unregister_tcf_proto_ops);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2010-03-19 23:40:13 +08:00
|
|
|
static int tfilter_notify(struct net *net, struct sk_buff *oskb,
|
|
|
|
struct nlmsghdr *n, struct tcf_proto *tp,
|
2016-10-10 11:25:55 +08:00
|
|
|
unsigned long fh, int event, bool unicast);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
net, cls: allow for deleting all filters for given parent
Add a possibility where the user can just specify the parent and
all filters under that parent are then being purged. Currently,
for example for scripting, one needs to specify pref/prio to have
a well-defined number for 'tc filter del' command for addressing
the previously created instance or additionally filter handle in
case of priorities being the same. Improve usage by allowing the
option for tc to specify the parent and removing the whole chain
for that given parent.
Example usage after patch, no tc changes required:
# tc qdisc replace dev foo clsact
# tc filter add dev foo egress bpf da obj ./bpf.o
# tc filter add dev foo egress bpf da obj ./bpf.o
# tc filter show dev foo egress
filter protocol all pref 49151 bpf
filter protocol all pref 49151 bpf handle 0x1 bpf.o:[classifier] direct-action
filter protocol all pref 49152 bpf
filter protocol all pref 49152 bpf handle 0x1 bpf.o:[classifier] direct-action
# tc filter del dev foo egress
# tc filter show dev foo egress
#
Previously, RTM_DELTFILTER requests with invalid prio of 0 were
rejected, so only netlink requests with RTM_NEWTFILTER and NLM_F_CREATE
flag were allowed where the kernel would auto-generate a pref/prio.
We can piggyback on that and use prio of 0 as a wildcard for
requests of RTM_DELTFILTER.
For notifying tc netlink monitoring users (e.g. libnl uses this
for caching), there are two options, that is, sending individual
tfilter_notify() notifications for each tcf_proto, or sending a
single one indicating wildcard removal. I tried both and there
are pros and cons for each, eventually I decided for sending
individual tfilter_notify(), so that user space can support this
seamlessly and there won't be a mess of changing each and every
application to make sure expectations from the kernel won't break
when they don't understand single notification. Since linear chains
don't really scale, I expect only a handful of classifiers to be
attached at max for a given parent anyway.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-11 05:10:22 +08:00
|
|
|
static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
|
|
|
|
struct nlmsghdr *n,
|
|
|
|
struct tcf_proto __rcu **chain, int event)
|
|
|
|
{
|
|
|
|
struct tcf_proto __rcu **it_chain;
|
|
|
|
struct tcf_proto *tp;
|
|
|
|
|
|
|
|
for (it_chain = chain; (tp = rtnl_dereference(*it_chain)) != NULL;
|
|
|
|
it_chain = &tp->next)
|
2016-11-23 09:57:04 +08:00
|
|
|
tfilter_notify(net, oskb, n, tp, 0, event, false);
|
net, cls: allow for deleting all filters for given parent
Add a possibility where the user can just specify the parent and
all filters under that parent are then being purged. Currently,
for example for scripting, one needs to specify pref/prio to have
a well-defined number for 'tc filter del' command for addressing
the previously created instance or additionally filter handle in
case of priorities being the same. Improve usage by allowing the
option for tc to specify the parent and removing the whole chain
for that given parent.
Example usage after patch, no tc changes required:
# tc qdisc replace dev foo clsact
# tc filter add dev foo egress bpf da obj ./bpf.o
# tc filter add dev foo egress bpf da obj ./bpf.o
# tc filter show dev foo egress
filter protocol all pref 49151 bpf
filter protocol all pref 49151 bpf handle 0x1 bpf.o:[classifier] direct-action
filter protocol all pref 49152 bpf
filter protocol all pref 49152 bpf handle 0x1 bpf.o:[classifier] direct-action
# tc filter del dev foo egress
# tc filter show dev foo egress
#
Previously, RTM_DELTFILTER requests with invalid prio of 0 were
rejected, so only netlink requests with RTM_NEWTFILTER and NLM_F_CREATE
flag were allowed where the kernel would auto-generate a pref/prio.
We can piggyback on that and use prio of 0 as a wildcard for
requests of RTM_DELTFILTER.
For notifying tc netlink monitoring users (e.g. libnl uses this
for caching), there are two options, that is, sending individual
tfilter_notify() notifications for each tcf_proto, or sending a
single one indicating wildcard removal. I tried both and there
are pros and cons for each, eventually I decided for sending
individual tfilter_notify(), so that user space can support this
seamlessly and there won't be a mess of changing each and every
application to make sure expectations from the kernel won't break
when they don't understand single notification. Since linear chains
don't really scale, I expect only a handful of classifiers to be
attached at max for a given parent anyway.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-11 05:10:22 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Select new prio value from the range, managed by kernel. */
|
|
|
|
|
2008-01-21 18:26:41 +08:00
|
|
|
static inline u32 tcf_auto_prio(struct tcf_proto *tp)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2008-01-21 18:26:41 +08:00
|
|
|
u32 first = TC_H_MAKE(0xC0000000U, 0U);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if (tp)
|
2011-01-20 03:26:56 +08:00
|
|
|
first = tp->prio - 1;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
return first;
|
|
|
|
}
|
|
|
|
|
2017-02-09 21:38:57 +08:00
|
|
|
/* Allocate a tcf_proto and bind it to the classifier type named @kind.
 *
 * @kind:     classifier kind string used to look up the tcf_proto_ops
 * @protocol: stored in tp->protocol
 * @prio:     stored in tp->prio
 * @parent:   stored in tp->classid
 * @q:        stored in tp->q
 *
 * Returns the initialised tcf_proto on success. On failure the
 * allocation is freed and an ERR_PTR() is returned:
 *   -ENOBUFS  the tcf_proto could not be allocated;
 *   -ENOENT   no classifier of this kind is registered (and, with
 *             CONFIG_MODULES, loading "cls_<kind>" did not provide one);
 *   -EAGAIN   the classifier only became available after a module load,
 *             during which RTNL was dropped, so the caller must replay
 *             the request;
 *   other     whatever tp->ops->init() returned.
 *
 * The module reference taken by tcf_proto_lookup_ops() is kept on
 * success and dropped on the -EAGAIN and init-failure paths.
 */
static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol,
					  u32 prio, u32 parent, struct Qdisc *q)
{
	struct tcf_proto *tp;
	int err;

	tp = kzalloc(sizeof(*tp), GFP_KERNEL);
	if (!tp)
		return ERR_PTR(-ENOBUFS);

	err = -ENOENT;
	tp->ops = tcf_proto_lookup_ops(kind);
	if (!tp->ops) {
#ifdef CONFIG_MODULES
		rtnl_unlock();
		request_module("cls_%s", kind);
		rtnl_lock();
		tp->ops = tcf_proto_lookup_ops(kind);
		/* We dropped the RTNL semaphore in order to perform
		 * the module load. So, even if we succeeded in loading
		 * the module we have to replay the request. We indicate
		 * this using -EAGAIN.
		 */
		if (tp->ops) {
			module_put(tp->ops->owner);
			err = -EAGAIN;
		}
#endif
		/* Bail out whether or not module loading is available:
		 * without CONFIG_MODULES tp->ops is still NULL here and must
		 * not be dereferenced below. err is -ENOENT unless the
		 * module load above turned it into -EAGAIN.
		 */
		goto errout;
	}
	tp->classify = tp->ops->classify;
	tp->protocol = protocol;
	tp->prio = prio;
	tp->classid = parent;
	tp->q = q;

	err = tp->ops->init(tp);
	if (err) {
		module_put(tp->ops->owner);
		goto errout;
	}
	return tp;

errout:
	kfree(tp);
	return ERR_PTR(err);
}
|
|
|
|
|
2017-02-09 21:38:56 +08:00
|
|
|
static bool tcf_proto_destroy(struct tcf_proto *tp, bool force)
|
|
|
|
{
|
|
|
|
if (tp->ops->destroy(tp, force)) {
|
|
|
|
module_put(tp->ops->owner);
|
|
|
|
kfree_rcu(tp, rcu);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Tear down every tcf_proto on the chain headed at @fl.
 *
 * The head is repeatedly unlinked (by pointing *fl at its successor)
 * and then passed to tcf_proto_destroy() with force set, until *fl is
 * NULL. The result of tcf_proto_destroy() is not checked.
 */
void tcf_destroy_chain(struct tcf_proto __rcu **fl)
{
	struct tcf_proto *head;

	for (;;) {
		head = rtnl_dereference(*fl);
		if (!head)
			break;

		RCU_INIT_POINTER(*fl, head->next);
		tcf_proto_destroy(head, true);
	}
}
|
|
|
|
EXPORT_SYMBOL(tcf_destroy_chain);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Add/change/delete/get a filter node */
|
|
|
|
|
2017-04-17 00:48:24 +08:00
|
|
|
static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
|
|
|
|
struct netlink_ext_ack *extack)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2008-03-26 01:26:21 +08:00
|
|
|
struct net *net = sock_net(skb->sk);
|
2008-01-23 14:11:33 +08:00
|
|
|
struct nlattr *tca[TCA_MAX + 1];
|
2005-04-17 06:20:36 +08:00
|
|
|
struct tcmsg *t;
|
|
|
|
u32 protocol;
|
|
|
|
u32 prio;
|
|
|
|
u32 nprio;
|
|
|
|
u32 parent;
|
|
|
|
struct net_device *dev;
|
|
|
|
struct Qdisc *q;
|
2014-09-13 11:05:27 +08:00
|
|
|
struct tcf_proto __rcu **back;
|
|
|
|
struct tcf_proto __rcu **chain;
|
2017-02-09 21:39:00 +08:00
|
|
|
struct tcf_proto *next;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct tcf_proto *tp;
|
2007-11-14 17:44:41 +08:00
|
|
|
const struct Qdisc_class_ops *cops;
|
2005-04-17 06:20:36 +08:00
|
|
|
unsigned long cl;
|
|
|
|
unsigned long fh;
|
|
|
|
int err;
|
net, sched: fix soft lockup in tc_classify
Shahar reported a soft lockup in tc_classify(), where we run into an
endless loop when walking the classifier chain due to tp->next == tp
which is a state we should never run into. The issue only seems to
trigger under load in the tc control path.
What happens is that in tc_ctl_tfilter(), thread A allocates a new
tp, initializes it, sets tp_created to 1, and calls into tp->ops->change()
with it. In that classifier callback we had to unlock/lock the rtnl
mutex and returned with -EAGAIN. One reason why we need to drop there
is, for example, that we need to request an action module to be loaded.
This happens via tcf_exts_validate() -> tcf_action_init/_1() meaning
after we loaded and found the requested action, we need to redo the
whole request so we don't race against others. While we had to unlock
rtnl in that time, thread B's request was processed next on that CPU.
Thread B added a new tp instance successfully to the classifier chain.
When thread A returned grabbing the rtnl mutex again, propagating -EAGAIN
and destroying its tp instance which never got linked, we goto replay
and redo A's request.
This time when walking the classifier chain in tc_ctl_tfilter() for
checking for existing tp instances we had a priority match and found
the tp instance that was created and linked by thread B. Now calling
again into tp->ops->change() with that tp was successful and returned
without error.
tp_created was never cleared in the second round, thus kernel thinks
that we need to link it into the classifier chain (once again). tp and
*back point to the same object due to the match we had earlier on. Thus
for thread B's already public tp, we reset tp->next to tp itself and
link it into the chain, which eventually causes the mentioned endless
loop in tc_classify() once a packet hits the data path.
Fix is to clear tp_created at the beginning of each request, also when
we replay it. On the paths that can cause -EAGAIN we already destroy
the original tp instance we had and on replay we really need to start
from scratch. It seems that this issue was first introduced in commit
12186be7d2e1 ("net_cls: fix unconfigured struct tcf_proto keeps chaining
and avoid kernel panic when we use cls_cgroup").
Fixes: 12186be7d2e1 ("net_cls: fix unconfigured struct tcf_proto keeps chaining and avoid kernel panic when we use cls_cgroup")
Reported-by: Shahar Klein <shahark@mellanox.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Tested-by: Shahar Klein <shahark@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-12-22 01:04:11 +08:00
|
|
|
int tp_created;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2014-04-30 23:25:43 +08:00
|
|
|
if ((n->nlmsg_type != RTM_GETTFILTER) &&
|
2014-05-13 01:19:14 +08:00
|
|
|
!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
|
2012-11-16 11:03:00 +08:00
|
|
|
return -EPERM;
|
2013-03-26 01:36:33 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
replay:
|
net, sched: fix soft lockup in tc_classify
Shahar reported a soft lockup in tc_classify(), where we run into an
endless loop when walking the classifier chain due to tp->next == tp
which is a state we should never run into. The issue only seems to
trigger under load in the tc control path.
What happens is that in tc_ctl_tfilter(), thread A allocates a new
tp, initializes it, sets tp_created to 1, and calls into tp->ops->change()
with it. In that classifier callback we had to unlock/lock the rtnl
mutex and returned with -EAGAIN. One reason why we need to drop there
is, for example, that we need to request an action module to be loaded.
This happens via tcf_exts_validate() -> tcf_action_init/_1() meaning
after we loaded and found the requested action, we need to redo the
whole request so we don't race against others. While we had to unlock
rtnl in that time, thread B's request was processed next on that CPU.
Thread B added a new tp instance successfully to the classifier chain.
When thread A returned grabbing the rtnl mutex again, propagating -EAGAIN
and destroying its tp instance which never got linked, we goto replay
and redo A's request.
This time when walking the classifier chain in tc_ctl_tfilter() for
checking for existing tp instances we had a priority match and found
the tp instance that was created and linked by thread B. Now calling
again into tp->ops->change() with that tp was successful and returned
without error.
tp_created was never cleared in the second round, thus kernel thinks
that we need to link it into the classifier chain (once again). tp and
*back point to the same object due to the match we had earlier on. Thus
for thread B's already public tp, we reset tp->next to tp itself and
link it into the chain, which eventually causes the mentioned endless
loop in tc_classify() once a packet hits the data path.
Fix is to clear tp_created at the beginning of each request, also when
we replay it. On the paths that can cause -EAGAIN we already destroy
the original tp instance we had and on replay we really need to start
from scratch. It seems that this issue was first introduced in commit
12186be7d2e1 ("net_cls: fix unconfigured struct tcf_proto keeps chaining
and avoid kernel panic when we use cls_cgroup").
Fixes: 12186be7d2e1 ("net_cls: fix unconfigured struct tcf_proto keeps chaining and avoid kernel panic when we use cls_cgroup")
Reported-by: Shahar Klein <shahark@mellanox.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Tested-by: Shahar Klein <shahark@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-12-22 01:04:11 +08:00
|
|
|
tp_created = 0;
|
|
|
|
|
2017-04-17 00:48:24 +08:00
|
|
|
err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack);
|
2013-03-26 01:36:33 +08:00
|
|
|
if (err < 0)
|
|
|
|
return err;
|
|
|
|
|
2012-06-27 12:48:50 +08:00
|
|
|
t = nlmsg_data(n);
|
2005-04-17 06:20:36 +08:00
|
|
|
protocol = TC_H_MIN(t->tcm_info);
|
|
|
|
prio = TC_H_MAJ(t->tcm_info);
|
|
|
|
nprio = prio;
|
|
|
|
parent = t->tcm_parent;
|
|
|
|
cl = 0;
|
|
|
|
|
|
|
|
if (prio == 0) {
|
net, cls: allow for deleting all filters for given parent
Add a possibility where the user can just specify the parent and
all filters under that parent are then being purged. Currently,
for example for scripting, one needs to specify pref/prio to have
a well-defined number for 'tc filter del' command for addressing
the previously created instance or additionally filter handle in
case of priorities being the same. Improve usage by allowing the
option for tc to specify the parent and removing the whole chain
for that given parent.
Example usage after patch, no tc changes required:
# tc qdisc replace dev foo clsact
# tc filter add dev foo egress bpf da obj ./bpf.o
# tc filter add dev foo egress bpf da obj ./bpf.o
# tc filter show dev foo egress
filter protocol all pref 49151 bpf
filter protocol all pref 49151 bpf handle 0x1 bpf.o:[classifier] direct-action
filter protocol all pref 49152 bpf
filter protocol all pref 49152 bpf handle 0x1 bpf.o:[classifier] direct-action
# tc filter del dev foo egress
# tc filter show dev foo egress
#
Previously, RTM_DELTFILTER requests with invalid prio of 0 were
rejected, so only netlink requests with RTM_NEWTFILTER and NLM_F_CREATE
flag were allowed where the kernel would auto-generate a pref/prio.
We can piggyback on that and use prio of 0 as a wildcard for
requests of RTM_DELTFILTER.
For notifying tc netlink monitoring users (e.g. libnl uses this
for caching), there are two options, that is, sending individual
tfilter_notify() notifications for each tcf_proto, or sending a
single one indicating wildcard removal. I tried both and there
are pros and cons for each, eventually I decided for sending
individual tfilter_notify(), so that user space can support this
seamlessly and there won't be a mess of changing each and every
application to make sure expectations from the kernel won't break
when they don't understand single notification. Since linear chains
don't really scale, I expect only a handful of classifiers to be
attached at max for a given parent anyway.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-11 05:10:22 +08:00
|
|
|
switch (n->nlmsg_type) {
|
|
|
|
case RTM_DELTFILTER:
|
2016-06-17 05:19:29 +08:00
|
|
|
if (protocol || t->tcm_handle || tca[TCA_KIND])
|
net, cls: allow for deleting all filters for given parent
Add a possibility where the user can just specify the parent and
all filters under that parent are then being purged. Currently,
for example for scripting, one needs to specify pref/prio to have
a well-defined number for 'tc filter del' command for addressing
the previously created instance or additionally filter handle in
case of priorities being the same. Improve usage by allowing the
option for tc to specify the parent and removing the whole chain
for that given parent.
Example usage after patch, no tc changes required:
# tc qdisc replace dev foo clsact
# tc filter add dev foo egress bpf da obj ./bpf.o
# tc filter add dev foo egress bpf da obj ./bpf.o
# tc filter show dev foo egress
filter protocol all pref 49151 bpf
filter protocol all pref 49151 bpf handle 0x1 bpf.o:[classifier] direct-action
filter protocol all pref 49152 bpf
filter protocol all pref 49152 bpf handle 0x1 bpf.o:[classifier] direct-action
# tc filter del dev foo egress
# tc filter show dev foo egress
#
Previously, RTM_DELTFILTER requests with invalid prio of 0 were
rejected, so only netlink requests with RTM_NEWTFILTER and NLM_F_CREATE
flag were allowed where the kernel would auto-generate a pref/prio.
We can piggyback on that and use prio of 0 as a wildcard for
requests of RTM_DELTFILTER.
For notifying tc netlink monitoring users (e.g. libnl uses this
for caching), there are two options, that is, sending individual
tfilter_notify() notifications for each tcf_proto, or sending a
single one indicating wildcard removal. I tried both and there
are pros and cons for each, eventually I decided for sending
individual tfilter_notify(), so that user space can support this
seamlessly and there won't be a mess of changing each and every
application to make sure expectations from the kernel won't break
when they don't understand single notification. Since linear chains
don't really scale, I expect only a handful of classifiers to be
attached at max for a given parent anyway.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-11 05:10:22 +08:00
|
|
|
return -ENOENT;
|
|
|
|
break;
|
|
|
|
case RTM_NEWTFILTER:
|
|
|
|
/* If no priority is provided by the user,
|
|
|
|
* we allocate one.
|
|
|
|
*/
|
|
|
|
if (n->nlmsg_flags & NLM_F_CREATE) {
|
|
|
|
prio = TC_H_MAKE(0x80000000U, 0U);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
/* fall-through */
|
|
|
|
default:
|
2005-04-17 06:20:36 +08:00
|
|
|
return -ENOENT;
|
net, cls: allow for deleting all filters for given parent
Add a possibility where the user can just specify the parent and
all filters under that parent are then being purged. Currently,
for example for scripting, one needs to specify pref/prio to have
a well-defined number for 'tc filter del' command for addressing
the previously created instance or additionally filter handle in
case of priorities being the same. Improve usage by allowing the
option for tc to specify the parent and removing the whole chain
for that given parent.
Example usage after patch, no tc changes required:
# tc qdisc replace dev foo clsact
# tc filter add dev foo egress bpf da obj ./bpf.o
# tc filter add dev foo egress bpf da obj ./bpf.o
# tc filter show dev foo egress
filter protocol all pref 49151 bpf
filter protocol all pref 49151 bpf handle 0x1 bpf.o:[classifier] direct-action
filter protocol all pref 49152 bpf
filter protocol all pref 49152 bpf handle 0x1 bpf.o:[classifier] direct-action
# tc filter del dev foo egress
# tc filter show dev foo egress
#
Previously, RTM_DELTFILTER requests with invalid prio of 0 were
rejected, so only netlink requests with RTM_NEWTFILTER and NLM_F_CREATE
flag were allowed where the kernel would auto-generate a pref/prio.
We can piggyback on that and use prio of 0 as a wildcard for
requests of RTM_DELTFILTER.
For notifying tc netlink monitoring users (e.g. libnl uses this
for caching), there are two options, that is, sending individual
tfilter_notify() notifications for each tcf_proto, or sending a
single one indicating wildcard removal. I tried both and there
are pros and cons for each, eventually I decided for sending
individual tfilter_notify(), so that user space can support this
seamlessly and there won't be a mess of changing each and every
application to make sure expectations from the kernel won't break
when they don't understand single notification. Since linear chains
don't really scale, I expect only a handful of classifiers to be
attached at max for a given parent anyway.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-11 05:10:22 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Find head of filter chain. */
|
|
|
|
|
|
|
|
/* Find link */
|
2010-03-19 23:40:13 +08:00
|
|
|
dev = __dev_get_by_index(net, t->tcm_ifindex);
|
2008-01-21 18:26:41 +08:00
|
|
|
if (dev == NULL)
|
2005-04-17 06:20:36 +08:00
|
|
|
return -ENODEV;
|
|
|
|
|
|
|
|
/* Find qdisc */
|
|
|
|
if (!parent) {
|
2009-09-04 14:41:18 +08:00
|
|
|
q = dev->qdisc;
|
2005-04-17 06:20:36 +08:00
|
|
|
parent = q->handle;
|
2008-01-21 18:26:41 +08:00
|
|
|
} else {
|
|
|
|
q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent));
|
|
|
|
if (q == NULL)
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Is it classful? */
|
2011-01-20 03:26:56 +08:00
|
|
|
cops = q->ops->cl_ops;
|
|
|
|
if (!cops)
|
2005-04-17 06:20:36 +08:00
|
|
|
return -EINVAL;
|
|
|
|
|
2009-09-04 14:41:15 +08:00
|
|
|
if (cops->tcf_chain == NULL)
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Do we search for filter, attached to class? */
|
|
|
|
if (TC_H_MIN(parent)) {
|
|
|
|
cl = cops->get(q, parent);
|
|
|
|
if (cl == 0)
|
|
|
|
return -ENOENT;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* And the last stroke */
|
|
|
|
chain = cops->tcf_chain(q, cl);
|
2017-02-09 21:38:58 +08:00
|
|
|
if (chain == NULL) {
|
|
|
|
err = -EINVAL;
|
2005-04-17 06:20:36 +08:00
|
|
|
goto errout;
|
2017-02-09 21:38:58 +08:00
|
|
|
}
|
net, cls: allow for deleting all filters for given parent
Add a possibility where the user can just specify the parent and
all filters under that parent are then being purged. Currently,
for example for scripting, one needs to specify pref/prio to have
a well-defined number for 'tc filter del' command for addressing
the previously created instance or additionally filter handle in
case of priorities being the same. Improve usage by allowing the
option for tc to specify the parent and removing the whole chain
for that given parent.
Example usage after patch, no tc changes required:
# tc qdisc replace dev foo clsact
# tc filter add dev foo egress bpf da obj ./bpf.o
# tc filter add dev foo egress bpf da obj ./bpf.o
# tc filter show dev foo egress
filter protocol all pref 49151 bpf
filter protocol all pref 49151 bpf handle 0x1 bpf.o:[classifier] direct-action
filter protocol all pref 49152 bpf
filter protocol all pref 49152 bpf handle 0x1 bpf.o:[classifier] direct-action
# tc filter del dev foo egress
# tc filter show dev foo egress
#
Previously, RTM_DELTFILTER requests with invalid prio of 0 were
rejected, so only netlink requests with RTM_NEWTFILTER and NLM_F_CREATE
flag were allowed where the kernel would auto-generate a pref/prio.
We can piggyback on that and use prio of 0 as a wildcard for
requests of RTM_DELTFILTER.
For notifying tc netlink monitoring users (e.g. libnl uses this
for caching), there are two options, that is, sending individual
tfilter_notify() notifications for each tcf_proto, or sending a
single one indicating wildcard removal. I tried both and there
are pros and cons for each, eventually I decided for sending
individual tfilter_notify(), so that user space can support this
seamlessly and there won't be a mess of changing each and every
application to make sure expectations from the kernel won't break
when they don't understand single notification. Since linear chains
don't really scale, I expect only a handful of classifiers to be
attached at max for a given parent anyway.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-11 05:10:22 +08:00
|
|
|
if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) {
|
|
|
|
tfilter_notify_chain(net, skb, n, chain, RTM_DELTFILTER);
|
|
|
|
tcf_destroy_chain(chain);
|
|
|
|
err = 0;
|
|
|
|
goto errout;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Check the chain for existence of proto-tcf with this priority */
|
2014-09-13 11:05:27 +08:00
|
|
|
for (back = chain;
|
|
|
|
(tp = rtnl_dereference(*back)) != NULL;
|
|
|
|
back = &tp->next) {
|
2005-04-17 06:20:36 +08:00
|
|
|
if (tp->prio >= prio) {
|
|
|
|
if (tp->prio == prio) {
|
2011-01-20 03:26:56 +08:00
|
|
|
if (!nprio ||
|
2017-02-09 21:38:58 +08:00
|
|
|
(tp->protocol != protocol && protocol)) {
|
|
|
|
err = -EINVAL;
|
2005-04-17 06:20:36 +08:00
|
|
|
goto errout;
|
2017-02-09 21:38:58 +08:00
|
|
|
}
|
2017-02-09 21:38:59 +08:00
|
|
|
} else {
|
2005-04-17 06:20:36 +08:00
|
|
|
tp = NULL;
|
2017-02-09 21:38:59 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (tp == NULL) {
|
|
|
|
/* Proto-tcf does not exist, create new one */
|
|
|
|
|
2017-02-09 21:38:58 +08:00
|
|
|
if (tca[TCA_KIND] == NULL || !protocol) {
|
|
|
|
err = -EINVAL;
|
2005-04-17 06:20:36 +08:00
|
|
|
goto errout;
|
2017-02-09 21:38:58 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-01-20 03:26:56 +08:00
|
|
|
if (n->nlmsg_type != RTM_NEWTFILTER ||
|
2017-02-09 21:38:58 +08:00
|
|
|
!(n->nlmsg_flags & NLM_F_CREATE)) {
|
|
|
|
err = -ENOENT;
|
2005-04-17 06:20:36 +08:00
|
|
|
goto errout;
|
2017-02-09 21:38:58 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2017-02-09 21:38:57 +08:00
|
|
|
if (!nprio)
|
|
|
|
nprio = TC_H_MAJ(tcf_auto_prio(rtnl_dereference(*back)));
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2017-02-09 21:38:57 +08:00
|
|
|
tp = tcf_proto_create(nla_data(tca[TCA_KIND]),
|
|
|
|
protocol, nprio, parent, q);
|
|
|
|
if (IS_ERR(tp)) {
|
|
|
|
err = PTR_ERR(tp);
|
2005-04-17 06:20:36 +08:00
|
|
|
goto errout;
|
|
|
|
}
|
2009-06-02 17:17:34 +08:00
|
|
|
tp_created = 1;
|
2017-02-09 21:38:58 +08:00
|
|
|
} else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) {
|
|
|
|
err = -EINVAL;
|
2005-04-17 06:20:36 +08:00
|
|
|
goto errout;
|
2017-02-09 21:38:58 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
fh = tp->ops->get(tp, t->tcm_handle);
|
|
|
|
|
|
|
|
if (fh == 0) {
|
|
|
|
if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
|
2017-02-09 21:39:00 +08:00
|
|
|
next = rtnl_dereference(tp->next);
|
2014-09-13 11:05:27 +08:00
|
|
|
RCU_INIT_POINTER(*back, next);
|
2016-10-10 11:25:55 +08:00
|
|
|
tfilter_notify(net, skb, n, tp, fh,
|
|
|
|
RTM_DELTFILTER, false);
|
2017-02-09 21:38:55 +08:00
|
|
|
tcf_proto_destroy(tp, true);
|
2005-04-17 06:20:36 +08:00
|
|
|
err = 0;
|
|
|
|
goto errout;
|
|
|
|
}
|
|
|
|
|
2008-01-21 18:26:41 +08:00
|
|
|
if (n->nlmsg_type != RTM_NEWTFILTER ||
|
2017-02-09 21:38:58 +08:00
|
|
|
!(n->nlmsg_flags & NLM_F_CREATE)) {
|
|
|
|
err = -ENOENT;
|
2005-04-17 06:20:36 +08:00
|
|
|
goto errout;
|
2017-02-09 21:38:58 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
} else {
|
|
|
|
switch (n->nlmsg_type) {
|
2007-02-09 22:25:16 +08:00
|
|
|
case RTM_NEWTFILTER:
|
2009-06-02 17:17:34 +08:00
|
|
|
if (n->nlmsg_flags & NLM_F_EXCL) {
|
|
|
|
if (tp_created)
|
2017-02-09 21:38:55 +08:00
|
|
|
tcf_proto_destroy(tp, true);
|
2017-02-09 21:38:58 +08:00
|
|
|
err = -EEXIST;
|
2005-04-17 06:20:36 +08:00
|
|
|
goto errout;
|
2009-06-02 17:17:34 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
|
|
|
case RTM_DELTFILTER:
|
|
|
|
err = tp->ops->delete(tp, fh);
|
2017-02-09 21:39:00 +08:00
|
|
|
if (err)
|
|
|
|
goto errout;
|
|
|
|
next = rtnl_dereference(tp->next);
|
|
|
|
tfilter_notify(net, skb, n, tp, t->tcm_handle,
|
|
|
|
RTM_DELTFILTER, false);
|
|
|
|
if (tcf_proto_destroy(tp, false))
|
|
|
|
RCU_INIT_POINTER(*back, next);
|
2017-02-14 23:27:13 +08:00
|
|
|
goto errout;
|
2005-04-17 06:20:36 +08:00
|
|
|
case RTM_GETTFILTER:
|
2016-09-18 20:45:33 +08:00
|
|
|
err = tfilter_notify(net, skb, n, tp, fh,
|
2016-10-10 11:25:55 +08:00
|
|
|
RTM_NEWTFILTER, true);
|
2005-04-17 06:20:36 +08:00
|
|
|
goto errout;
|
|
|
|
default:
|
|
|
|
err = -EINVAL;
|
|
|
|
goto errout;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-04-26 04:54:06 +08:00
|
|
|
err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh,
|
|
|
|
n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE);
|
2009-06-02 17:17:34 +08:00
|
|
|
if (err == 0) {
|
|
|
|
if (tp_created) {
|
2014-09-13 11:05:27 +08:00
|
|
|
RCU_INIT_POINTER(tp->next, rtnl_dereference(*back));
|
|
|
|
rcu_assign_pointer(*back, tp);
|
2009-06-02 17:17:34 +08:00
|
|
|
}
|
2016-10-10 11:25:55 +08:00
|
|
|
tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER, false);
|
2009-06-02 17:17:34 +08:00
|
|
|
} else {
|
|
|
|
if (tp_created)
|
2017-02-09 21:38:55 +08:00
|
|
|
tcf_proto_destroy(tp, true);
|
2009-06-02 17:17:34 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
errout:
|
|
|
|
if (cl)
|
|
|
|
cops->put(q, cl);
|
|
|
|
if (err == -EAGAIN)
|
|
|
|
/* Replay the request. */
|
|
|
|
goto replay;
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2016-06-05 22:41:32 +08:00
|
|
|
/*
 * Append one traffic-control filter message for @tp to @skb.
 *
 * @net:    network namespace, passed through to the classifier's dump hook
 * @skb:    buffer the netlink message is appended to
 * @tp:     classifier instance being described
 * @fh:     filter handle within @tp
 * @portid: netlink port id placed in the message header
 * @seq:    netlink sequence number placed in the message header
 * @flags:  netlink header flags
 * @event:  netlink message type (RTM_NEWTFILTER, RTM_DELTFILTER, ...)
 *
 * For RTM_DELTFILTER only the header and TCA_KIND are emitted and
 * tcm_handle is set to @fh.  For every other event tcm_handle is zeroed
 * and the classifier's dump callback (when it has one) is invoked with the
 * message header so that it can append its own attributes.
 *
 * Returns skb->len on success.  On any failure the buffer is trimmed back
 * to where it was on entry and -1 is returned.
 */
static int tcf_fill_node(struct net *net, struct sk_buff *skb,
			 struct tcf_proto *tp, unsigned long fh, u32 portid,
			 u32 seq, u16 flags, int event)
{
	/* Remember the tail so a partial message can be rolled back. */
	unsigned char *start = skb_tail_pointer(skb);
	struct nlmsghdr *hdr;
	struct tcmsg *msg;

	hdr = nlmsg_put(skb, portid, seq, event, sizeof(*msg), flags);
	if (hdr == NULL)
		goto rollback;

	msg = nlmsg_data(hdr);
	msg->tcm_family = AF_UNSPEC;
	msg->tcm__pad1 = 0;
	msg->tcm__pad2 = 0;
	msg->tcm_ifindex = qdisc_dev(tp->q)->ifindex;
	msg->tcm_parent = tp->classid;
	msg->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);

	if (nla_put_string(skb, TCA_KIND, tp->ops->kind) != 0)
		goto rollback;

	if (event == RTM_DELTFILTER) {
		msg->tcm_handle = fh;
	} else {
		/* Zero the handle before handing the header to ->dump(). */
		msg->tcm_handle = 0;
		if (tp->ops->dump != NULL &&
		    tp->ops->dump(net, tp, fh, skb, msg) < 0)
			goto rollback;
	}

	hdr->nlmsg_len = skb_tail_pointer(skb) - start;
	return skb->len;

rollback:
	nlmsg_trim(skb, start);
	return -1;
}
|
|
|
|
|
2010-03-19 23:40:13 +08:00
|
|
|
/*
 * Build a filter notification for @tp / @fh and send it.
 *
 * @net:     network namespace the notification is sent in
 * @oskb:    request buffer the notification answers; may be NULL, in which
 *           case port id 0 is used
 * @n:       request header supplying the sequence number and flags
 * @tp:      classifier instance being reported
 * @fh:      filter handle being reported
 * @event:   netlink message type to emit
 * @unicast: when true the message goes only to the requesting port via
 *           netlink_unicast(); otherwise it is sent with rtnetlink_send()
 *           to RTNLGRP_TC, echoing to the requester if NLM_F_ECHO is set
 *
 * Returns -ENOBUFS when no buffer could be allocated, -EINVAL when the
 * message could not be filled in, otherwise the result of the send call.
 */
static int tfilter_notify(struct net *net, struct sk_buff *oskb,
			  struct nlmsghdr *n, struct tcf_proto *tp,
			  unsigned long fh, int event, bool unicast)
{
	struct sk_buff *msg;
	u32 portid = 0;
	int filled;

	if (oskb)
		portid = NETLINK_CB(oskb).portid;

	msg = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (msg == NULL)
		return -ENOBUFS;

	filled = tcf_fill_node(net, msg, tp, fh, portid, n->nlmsg_seq,
			       n->nlmsg_flags, event);
	if (filled <= 0) {
		kfree_skb(msg);
		return -EINVAL;
	}

	if (!unicast)
		return rtnetlink_send(msg, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

	return netlink_unicast(net->rtnl, msg, portid, MSG_DONTWAIT);
}
|
|
|
|
|
2008-01-21 18:26:41 +08:00
|
|
|
struct tcf_dump_args {
|
2005-04-17 06:20:36 +08:00
|
|
|
struct tcf_walker w;
|
|
|
|
struct sk_buff *skb;
|
|
|
|
struct netlink_callback *cb;
|
|
|
|
};
|
|
|
|
|
2008-01-21 18:26:41 +08:00
|
|
|
static int tcf_node_dump(struct tcf_proto *tp, unsigned long n,
|
|
|
|
struct tcf_walker *arg)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2008-01-21 18:26:41 +08:00
|
|
|
struct tcf_dump_args *a = (void *)arg;
|
2014-01-10 08:14:01 +08:00
|
|
|
struct net *net = sock_net(a->skb->sk);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2014-01-10 08:14:01 +08:00
|
|
|
return tcf_fill_node(net, a->skb, tp, n, NETLINK_CB(a->cb->skb).portid,
|
2016-09-18 20:45:33 +08:00
|
|
|
a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
|
|
|
|
RTM_NEWTFILTER);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2009-11-06 12:57:26 +08:00
|
|
|
/* called with RTNL */
|
2005-04-17 06:20:36 +08:00
|
|
|
static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
|
|
|
|
{
|
2008-03-26 01:26:21 +08:00
|
|
|
struct net *net = sock_net(skb->sk);
|
2005-04-17 06:20:36 +08:00
|
|
|
int t;
|
|
|
|
int s_t;
|
|
|
|
struct net_device *dev;
|
|
|
|
struct Qdisc *q;
|
2014-09-13 11:05:27 +08:00
|
|
|
struct tcf_proto *tp, __rcu **chain;
|
2012-06-27 12:48:50 +08:00
|
|
|
struct tcmsg *tcm = nlmsg_data(cb->nlh);
|
2005-04-17 06:20:36 +08:00
|
|
|
unsigned long cl = 0;
|
2007-11-14 17:44:41 +08:00
|
|
|
const struct Qdisc_class_ops *cops;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct tcf_dump_args arg;
|
|
|
|
|
2013-03-27 14:47:04 +08:00
|
|
|
if (nlmsg_len(cb->nlh) < sizeof(*tcm))
|
2005-04-17 06:20:36 +08:00
|
|
|
return skb->len;
|
2011-01-20 03:26:56 +08:00
|
|
|
dev = __dev_get_by_index(net, tcm->tcm_ifindex);
|
|
|
|
if (!dev)
|
2005-04-17 06:20:36 +08:00
|
|
|
return skb->len;
|
|
|
|
|
|
|
|
if (!tcm->tcm_parent)
|
2009-09-04 14:41:18 +08:00
|
|
|
q = dev->qdisc;
|
2005-04-17 06:20:36 +08:00
|
|
|
else
|
|
|
|
q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
|
|
|
|
if (!q)
|
|
|
|
goto out;
|
2011-01-20 03:26:56 +08:00
|
|
|
cops = q->ops->cl_ops;
|
|
|
|
if (!cops)
|
2005-04-17 06:20:36 +08:00
|
|
|
goto errout;
|
2009-09-04 14:41:15 +08:00
|
|
|
if (cops->tcf_chain == NULL)
|
|
|
|
goto errout;
|
2005-04-17 06:20:36 +08:00
|
|
|
if (TC_H_MIN(tcm->tcm_parent)) {
|
|
|
|
cl = cops->get(q, tcm->tcm_parent);
|
|
|
|
if (cl == 0)
|
|
|
|
goto errout;
|
|
|
|
}
|
|
|
|
chain = cops->tcf_chain(q, cl);
|
|
|
|
if (chain == NULL)
|
|
|
|
goto errout;
|
|
|
|
|
|
|
|
s_t = cb->args[0];
|
|
|
|
|
2014-09-13 11:05:27 +08:00
|
|
|
for (tp = rtnl_dereference(*chain), t = 0;
|
|
|
|
tp; tp = rtnl_dereference(tp->next), t++) {
|
2011-01-20 03:26:56 +08:00
|
|
|
if (t < s_t)
|
|
|
|
continue;
|
2005-04-17 06:20:36 +08:00
|
|
|
if (TC_H_MAJ(tcm->tcm_info) &&
|
|
|
|
TC_H_MAJ(tcm->tcm_info) != tp->prio)
|
|
|
|
continue;
|
|
|
|
if (TC_H_MIN(tcm->tcm_info) &&
|
|
|
|
TC_H_MIN(tcm->tcm_info) != tp->protocol)
|
|
|
|
continue;
|
|
|
|
if (t > s_t)
|
2016-06-05 22:41:32 +08:00
|
|
|
memset(&cb->args[1], 0,
|
|
|
|
sizeof(cb->args)-sizeof(cb->args[0]));
|
2005-04-17 06:20:36 +08:00
|
|
|
if (cb->args[1] == 0) {
|
2016-06-05 22:41:32 +08:00
|
|
|
if (tcf_fill_node(net, skb, tp, 0,
|
|
|
|
NETLINK_CB(cb->skb).portid,
|
2008-01-21 18:26:41 +08:00
|
|
|
cb->nlh->nlmsg_seq, NLM_F_MULTI,
|
|
|
|
RTM_NEWTFILTER) <= 0)
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
2008-01-21 18:26:41 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
cb->args[1] = 1;
|
|
|
|
}
|
|
|
|
if (tp->ops->walk == NULL)
|
|
|
|
continue;
|
|
|
|
arg.w.fn = tcf_node_dump;
|
|
|
|
arg.skb = skb;
|
|
|
|
arg.cb = cb;
|
|
|
|
arg.w.stop = 0;
|
2011-01-20 03:26:56 +08:00
|
|
|
arg.w.skip = cb->args[1] - 1;
|
2005-04-17 06:20:36 +08:00
|
|
|
arg.w.count = 0;
|
|
|
|
tp->ops->walk(tp, &arg.w);
|
2011-01-20 03:26:56 +08:00
|
|
|
cb->args[1] = arg.w.count + 1;
|
2005-04-17 06:20:36 +08:00
|
|
|
if (arg.w.stop)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
cb->args[0] = t;
|
|
|
|
|
|
|
|
errout:
|
|
|
|
if (cl)
|
|
|
|
cops->put(q, cl);
|
|
|
|
out:
|
|
|
|
return skb->len;
|
|
|
|
}
|
|
|
|
|
2014-09-26 01:26:37 +08:00
|
|
|
/*
 * Release the actions attached to @exts.
 *
 * With CONFIG_NET_CLS_ACT the actions are collected on a temporary list,
 * destroyed with TCA_ACT_UNBIND, the actions array itself is freed and the
 * action count is reset to zero.  Without it this is a no-op.
 */
void tcf_exts_destroy(struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
	LIST_HEAD(doomed);

	tcf_exts_to_list(exts, &doomed);
	tcf_action_destroy(&doomed, TCA_ACT_UNBIND);

	kfree(exts->actions);
	exts->nr_actions = 0;
#endif
}
EXPORT_SYMBOL(tcf_exts_destroy);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2013-01-14 13:15:39 +08:00
|
|
|
/*
 * Parse the action/police attributes of a filter request into @exts.
 *
 * @net:      network namespace passed to the action init helpers
 * @tp:       classifier instance (not used by this function)
 * @tb:       parsed attribute table, indexed by attribute type
 * @rate_tlv: rate estimator attribute passed to the action init helpers
 * @exts:     extension state; exts->police and exts->action name the
 *            attribute types to look at
 * @ovr:      passed through to the action init helpers
 *
 * With CONFIG_NET_CLS_ACT: a police attribute, when present, takes
 * precedence and yields a single action marked TCA_OLD_COMPAT; otherwise
 * an action attribute is parsed into a list whose entries are stored in
 * exts->actions.  Without CONFIG_NET_CLS_ACT the presence of either
 * attribute is rejected with -EOPNOTSUPP.
 *
 * Returns 0 on success or a negative error from the action init helpers.
 */
int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
		      struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr)
{
#ifdef CONFIG_NET_CLS_ACT
	struct tc_action *act;

	if (exts->police && tb[exts->police]) {
		/* Old-style policer: exactly one bound action. */
		act = tcf_action_init_1(net, tb[exts->police], rate_tlv,
					"police", ovr, TCA_ACT_BIND);
		if (IS_ERR(act))
			return PTR_ERR(act);

		exts->type = TCA_OLD_COMPAT;
		act->type = exts->type;
		exts->actions[0] = act;
		exts->nr_actions = 1;
		return 0;
	}

	if (exts->action && tb[exts->action]) {
		LIST_HEAD(parsed);
		int count = 0;
		int err;

		err = tcf_action_init(net, tb[exts->action], rate_tlv,
				      NULL, ovr, TCA_ACT_BIND,
				      &parsed);
		if (err)
			return err;

		/* Copy the parsed list into the extension's array. */
		list_for_each_entry(act, &parsed, list)
			exts->actions[count++] = act;
		exts->nr_actions = count;
	}

	return 0;
#else
	if ((exts->action && tb[exts->action]) ||
	    (exts->police && tb[exts->police]))
		return -EOPNOTSUPP;

	return 0;
#endif
}
EXPORT_SYMBOL(tcf_exts_validate);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-01-21 18:26:41 +08:00
|
|
|
/*
 * Replace the actions of @dst with those of @src.
 *
 * The previous contents of @dst are snapshotted first, the action count,
 * action array pointer and type are then copied from @src under the tree
 * lock of @tp, and finally the snapshot is destroyed outside the lock.
 * Without CONFIG_NET_CLS_ACT this is a no-op.
 */
void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst,
		     struct tcf_exts *src)
{
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_exts previous;

	previous = *dst;

	tcf_tree_lock(tp);
	dst->nr_actions = src->nr_actions;
	dst->actions = src->actions;
	dst->type = src->type;
	tcf_tree_unlock(tp);

	/* Only the snapshot still refers to the old actions. */
	tcf_exts_destroy(&previous);
#endif
}
EXPORT_SYMBOL(tcf_exts_change);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2016-08-14 13:35:00 +08:00
|
|
|
#ifdef CONFIG_NET_CLS_ACT
/* Return the first action stored in @exts, or NULL when it holds none. */
static struct tc_action *tcf_exts_first_act(struct tcf_exts *exts)
{
	return exts->nr_actions != 0 ? exts->actions[0] : NULL;
}
#endif
|
2013-12-16 12:15:05 +08:00
|
|
|
|
2013-12-16 12:15:07 +08:00
|
|
|
/*
 * Dump the actions of @exts into @skb as a nested attribute.
 *
 * Nothing is written when @exts has no action attribute type configured or
 * holds no actions.  Otherwise the actions are dumped either as a list
 * nested under exts->action or, for TCA_OLD_COMPAT extensions that have a
 * police attribute type, as a single old-style action nested under
 * exts->police.
 *
 * Returns 0 on success (including "nothing to dump") and -1 on failure,
 * after cancelling the partially written nest.  Without CONFIG_NET_CLS_ACT
 * it always returns 0.
 */
int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
	struct nlattr *nest;

	if (!exts->action || !exts->nr_actions)
		return 0;

	/*
	 * again for backward compatible mode - we want
	 * to work with both old and new modes of entering
	 * tc data even if iproute2 was newer - jhs
	 */
	if (exts->type != TCA_OLD_COMPAT) {
		LIST_HEAD(acts);

		nest = nla_nest_start(skb, exts->action);
		if (nest == NULL)
			goto cancel;

		tcf_exts_to_list(exts, &acts);
		if (tcf_action_dump(skb, &acts, 0, 0) < 0)
			goto cancel;
		nla_nest_end(skb, nest);
	} else if (exts->police) {
		struct tc_action *first = tcf_exts_first_act(exts);

		nest = nla_nest_start(skb, exts->police);
		if (nest == NULL || !first)
			goto cancel;
		if (tcf_action_dump_old(skb, first, 0, 0) < 0)
			goto cancel;
		nla_nest_end(skb, nest);
	}

	return 0;

cancel:
	nla_nest_cancel(skb, nest);
	return -1;
#else
	return 0;
#endif
}
EXPORT_SYMBOL(tcf_exts_dump);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-01-21 18:26:41 +08:00
|
|
|
|
2013-12-16 12:15:07 +08:00
|
|
|
/*
 * Copy the statistics of the first action of @exts into @skb.
 *
 * Returns -1 when copying the statistics fails, 0 otherwise (including
 * when @exts has no actions or CONFIG_NET_CLS_ACT is disabled).
 */
int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
	struct tc_action *first = tcf_exts_first_act(exts);

	if (first == NULL)
		return 0;
	if (tcf_action_copy_stats(skb, first, 1) < 0)
		return -1;
#endif
	return 0;
}
EXPORT_SYMBOL(tcf_exts_dump_stats);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2016-12-01 20:06:37 +08:00
|
|
|
int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts,
|
|
|
|
struct net_device **hw_dev)
|
|
|
|
{
|
|
|
|
#ifdef CONFIG_NET_CLS_ACT
|
|
|
|
const struct tc_action *a;
|
|
|
|
LIST_HEAD(actions);
|
|
|
|
|
|
|
|
if (tc_no_actions(exts))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
tcf_exts_to_list(exts, &actions);
|
|
|
|
list_for_each_entry(a, &actions, list) {
|
|
|
|
if (a->ops->get_dev) {
|
|
|
|
a->ops->get_dev(a, dev_net(dev), hw_dev);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (*hw_dev)
|
|
|
|
return 0;
|
|
|
|
#endif
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(tcf_exts_get_dev);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
 * Register the rtnetlink handlers for traffic-control filters.
 *
 * New, delete and get requests are all handled by tc_ctl_tfilter(); get
 * requests additionally register tc_dump_tfilter() as their dump handler.
 * Runs as a subsys initcall and always returns 0.
 */
static int __init tc_filter_init(void)
{
	/* Modifying requests: no dump handler. */
	rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, NULL);

	/* Query requests: single get plus dump. */
	rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter,
		      tc_dump_tfilter, NULL);

	return 0;
}

subsys_initcall(tc_filter_init);
|