2013-12-30 01:27:10 +08:00
|
|
|
/*
|
|
|
|
* net/core/netclassid_cgroup.c Classid Cgroupfs Handling
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public License
|
|
|
|
* as published by the Free Software Foundation; either version
|
|
|
|
* 2 of the License, or (at your option) any later version.
|
|
|
|
*
|
|
|
|
* Authors: Thomas Graf <tgraf@suug.ch>
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/cgroup.h>
|
|
|
|
#include <linux/fdtable.h>
|
|
|
|
#include <net/cls_cgroup.h>
|
|
|
|
#include <net/sock.h>
|
|
|
|
|
|
|
|
static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state *css)
|
|
|
|
{
|
|
|
|
return css ? container_of(css, struct cgroup_cls_state, css) : NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct cgroup_cls_state *task_cls_state(struct task_struct *p)
|
|
|
|
{
|
2015-07-22 17:23:20 +08:00
|
|
|
return css_cls_state(task_css_check(p, net_cls_cgrp_id,
|
|
|
|
rcu_read_lock_bh_held()));
|
2013-12-30 01:27:10 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(task_cls_state);
|
|
|
|
|
|
|
|
static struct cgroup_subsys_state *
|
|
|
|
cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
|
|
|
|
{
|
|
|
|
struct cgroup_cls_state *cs;
|
|
|
|
|
|
|
|
cs = kzalloc(sizeof(*cs), GFP_KERNEL);
|
|
|
|
if (!cs)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
return &cs->css;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int cgrp_css_online(struct cgroup_subsys_state *css)
|
|
|
|
{
|
|
|
|
struct cgroup_cls_state *cs = css_cls_state(css);
|
2014-05-17 01:22:48 +08:00
|
|
|
struct cgroup_cls_state *parent = css_cls_state(css->parent);
|
2013-12-30 01:27:10 +08:00
|
|
|
|
|
|
|
if (parent)
|
|
|
|
cs->classid = parent->classid;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Release the cgroup_cls_state that embeds @css. */
static void cgrp_css_free(struct cgroup_subsys_state *css)
{
	struct cgroup_cls_state *state = css_cls_state(css);

	kfree(state);
}
|
|
|
|
|
2015-11-21 04:31:39 +08:00
|
|
|
static int update_classid_sock(const void *v, struct file *file, unsigned n)
|
2013-12-30 01:27:10 +08:00
|
|
|
{
|
|
|
|
int err;
|
|
|
|
struct socket *sock = sock_from_file(file, &err);
|
|
|
|
|
sock, cgroup: add sock->sk_cgroup
In cgroup v1, dealing with cgroup membership was difficult because the
number of membership associations was unbound. As a result, cgroup v1
grew several controllers whose primary purpose is either tagging
membership or pull in configuration knobs from other subsystems so
that cgroup membership test can be avoided.
net_cls and net_prio controllers are examples of the latter. They
allow configuring network-specific attributes from cgroup side so that
network subsystem can avoid testing cgroup membership; unfortunately,
these are not only cumbersome but also problematic.
Both net_cls and net_prio aren't properly hierarchical. Both inherit
configuration from the parent on creation but there's no interaction
afterwards. An ancestor doesn't restrict the behavior in its subtree
in anyway and configuration changes aren't propagated downwards.
Especially when combined with cgroup delegation, this is problematic
because delegatees can mess up whatever network configuration
implemented at the system level. net_prio would allow the delegatees
to set whatever priority value regardless of CAP_NET_ADMIN and net_cls
the same for classid.
While it is possible to solve these issues from controller side by
implementing hierarchical allowable ranges in both controllers, it
would involve quite a bit of complexity in the controllers and further
obfuscate network configuration as it becomes even more difficult to
tell what's actually being configured looking from the network side.
While not much can be done for v1 at this point, as membership
handling is sane on cgroup v2, it'd be better to make cgroup matching
behave like other network matches and classifiers than introducing
further complications.
In preparation, this patch updates sock->sk_cgrp_data handling so that
it points to the v2 cgroup that sock was created in until either
net_prio or net_cls is used. Once either of the two is used,
sock->sk_cgrp_data reverts to its previous role of carrying prioidx
and classid. This is to avoid adding yet another cgroup related field
to struct sock.
As the mode switching can happen at most once per boot, the switching
mechanism is aimed at lowering hot path overhead. It may leak a
finite, likely small, number of cgroup refs and report spurious
prioidx or classid on switching; however, dynamic updates of prioidx
and classid have always been racy and lossy - socks between creation
and fd installation are never updated, config changes don't update
existing sockets at all, and prioidx may index with dead and recycled
cgroup IDs. Non-critical inaccuracies from small race windows won't
make any noticeable difference.
This patch doesn't make use of the pointer yet. The following patch
will implement netfilter match for cgroup2 membership.
v2: Use sock_cgroup_data to avoid inflating struct sock w/ another
cgroup specific field.
v3: Add comments explaining why sock_data_prioidx() and
sock_data_classid() use different fallback values.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Daniel Wagner <daniel.wagner@bmw-carit.de>
CC: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-12-08 06:38:53 +08:00
|
|
|
if (sock) {
|
|
|
|
spin_lock(&cgroup_sk_update_lock);
|
2015-12-08 06:38:52 +08:00
|
|
|
sock_cgroup_set_classid(&sock->sk->sk_cgrp_data,
|
|
|
|
(unsigned long)v);
|
sock, cgroup: add sock->sk_cgroup
In cgroup v1, dealing with cgroup membership was difficult because the
number of membership associations was unbound. As a result, cgroup v1
grew several controllers whose primary purpose is either tagging
membership or pull in configuration knobs from other subsystems so
that cgroup membership test can be avoided.
net_cls and net_prio controllers are examples of the latter. They
allow configuring network-specific attributes from cgroup side so that
network subsystem can avoid testing cgroup membership; unfortunately,
these are not only cumbersome but also problematic.
Both net_cls and net_prio aren't properly hierarchical. Both inherit
configuration from the parent on creation but there's no interaction
afterwards. An ancestor doesn't restrict the behavior in its subtree
in anyway and configuration changes aren't propagated downwards.
Especially when combined with cgroup delegation, this is problematic
because delegatees can mess up whatever network configuration
implemented at the system level. net_prio would allow the delegatees
to set whatever priority value regardless of CAP_NET_ADMIN and net_cls
the same for classid.
While it is possible to solve these issues from controller side by
implementing hierarchical allowable ranges in both controllers, it
would involve quite a bit of complexity in the controllers and further
obfuscate network configuration as it becomes even more difficult to
tell what's actually being configured looking from the network side.
While not much can be done for v1 at this point, as membership
handling is sane on cgroup v2, it'd be better to make cgroup matching
behave like other network matches and classifiers than introducing
further complications.
In preparation, this patch updates sock->sk_cgrp_data handling so that
it points to the v2 cgroup that sock was created in until either
net_prio or net_cls is used. Once either of the two is used,
sock->sk_cgrp_data reverts to its previous role of carrying prioidx
and classid. This is to avoid adding yet another cgroup related field
to struct sock.
As the mode switching can happen at most once per boot, the switching
mechanism is aimed at lowering hot path overhead. It may leak a
finite, likely small, number of cgroup refs and report spurious
prioidx or classid on switching; however, dynamic updates of prioidx
and classid have always been racy and lossy - socks between creation
and fd installation are never updated, config changes don't update
existing sockets at all, and prioidx may index with dead and recycled
cgroup IDs. Non-critical inaccuracies from small race windows won't
make any noticeable difference.
This patch doesn't make use of the pointer yet. The following patch
will implement netfilter match for cgroup2 membership.
v2: Use sock_cgroup_data to avoid inflating struct sock w/ another
cgroup specific field.
v3: Add comments explaining why sock_data_prioidx() and
sock_data_classid() use different fallback values.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Daniel Wagner <daniel.wagner@bmw-carit.de>
CC: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-12-08 06:38:53 +08:00
|
|
|
spin_unlock(&cgroup_sk_update_lock);
|
|
|
|
}
|
2013-12-30 01:27:10 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-11-21 04:31:39 +08:00
|
|
|
static void update_classid(struct cgroup_subsys_state *css, void *v)
|
2013-12-30 01:27:10 +08:00
|
|
|
{
|
2015-11-21 04:31:39 +08:00
|
|
|
struct css_task_iter it;
|
2013-12-30 01:27:10 +08:00
|
|
|
struct task_struct *p;
|
|
|
|
|
2015-11-21 04:31:39 +08:00
|
|
|
css_task_iter_start(css, &it);
|
|
|
|
while ((p = css_task_iter_next(&it))) {
|
2013-12-30 01:27:10 +08:00
|
|
|
task_lock(p);
|
2015-11-21 04:31:39 +08:00
|
|
|
iterate_fd(p->files, 0, update_classid_sock, v);
|
2013-12-30 01:27:10 +08:00
|
|
|
task_unlock(p);
|
|
|
|
}
|
2015-11-21 04:31:39 +08:00
|
|
|
css_task_iter_end(&it);
|
|
|
|
}
|
|
|
|
|
2015-12-07 23:09:03 +08:00
|
|
|
static void cgrp_attach(struct cgroup_taskset *tset)
|
2015-11-21 04:31:39 +08:00
|
|
|
{
|
2015-12-07 23:09:03 +08:00
|
|
|
struct cgroup_subsys_state *css;
|
|
|
|
|
|
|
|
cgroup_taskset_first(tset, &css);
|
2015-11-21 04:31:39 +08:00
|
|
|
update_classid(css,
|
|
|
|
(void *)(unsigned long)css_cls_state(css)->classid);
|
2013-12-30 01:27:10 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft)
|
|
|
|
{
|
|
|
|
return css_cls_state(css)->classid;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
|
|
|
|
u64 value)
|
|
|
|
{
|
2015-11-21 04:31:39 +08:00
|
|
|
struct cgroup_cls_state *cs = css_cls_state(css);
|
|
|
|
|
sock, cgroup: add sock->sk_cgroup
In cgroup v1, dealing with cgroup membership was difficult because the
number of membership associations was unbound. As a result, cgroup v1
grew several controllers whose primary purpose is either tagging
membership or pull in configuration knobs from other subsystems so
that cgroup membership test can be avoided.
net_cls and net_prio controllers are examples of the latter. They
allow configuring network-specific attributes from cgroup side so that
network subsystem can avoid testing cgroup membership; unfortunately,
these are not only cumbersome but also problematic.
Both net_cls and net_prio aren't properly hierarchical. Both inherit
configuration from the parent on creation but there's no interaction
afterwards. An ancestor doesn't restrict the behavior in its subtree
in anyway and configuration changes aren't propagated downwards.
Especially when combined with cgroup delegation, this is problematic
because delegatees can mess up whatever network configuration
implemented at the system level. net_prio would allow the delegatees
to set whatever priority value regardless of CAP_NET_ADMIN and net_cls
the same for classid.
While it is possible to solve these issues from controller side by
implementing hierarchical allowable ranges in both controllers, it
would involve quite a bit of complexity in the controllers and further
obfuscate network configuration as it becomes even more difficult to
tell what's actually being configured looking from the network side.
While not much can be done for v1 at this point, as membership
handling is sane on cgroup v2, it'd be better to make cgroup matching
behave like other network matches and classifiers than introducing
further complications.
In preparation, this patch updates sock->sk_cgrp_data handling so that
it points to the v2 cgroup that sock was created in until either
net_prio or net_cls is used. Once either of the two is used,
sock->sk_cgrp_data reverts to its previous role of carrying prioidx
and classid. This is to avoid adding yet another cgroup related field
to struct sock.
As the mode switching can happen at most once per boot, the switching
mechanism is aimed at lowering hot path overhead. It may leak a
finite, likely small, number of cgroup refs and report spurious
prioidx or classid on switching; however, dynamic updates of prioidx
and classid have always been racy and lossy - socks between creation
and fd installation are never updated, config changes don't update
existing sockets at all, and prioidx may index with dead and recycled
cgroup IDs. Non-critical inaccuracies from small race windows won't
make any noticeable difference.
This patch doesn't make use of the pointer yet. The following patch
will implement netfilter match for cgroup2 membership.
v2: Use sock_cgroup_data to avoid inflating struct sock w/ another
cgroup specific field.
v3: Add comments explaining why sock_data_prioidx() and
sock_data_classid() use different fallback values.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Daniel Wagner <daniel.wagner@bmw-carit.de>
CC: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-12-08 06:38:53 +08:00
|
|
|
cgroup_sk_alloc_disable();
|
|
|
|
|
2015-11-21 04:31:39 +08:00
|
|
|
cs->classid = (u32)value;
|
2013-12-30 01:27:10 +08:00
|
|
|
|
2015-11-21 04:31:39 +08:00
|
|
|
update_classid(css, (void *)(unsigned long)cs->classid);
|
2013-12-30 01:27:10 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct cftype ss_files[] = {
|
|
|
|
{
|
|
|
|
.name = "classid",
|
|
|
|
.read_u64 = read_classid,
|
|
|
|
.write_u64 = write_classid,
|
|
|
|
},
|
|
|
|
{ } /* terminate */
|
|
|
|
};
|
|
|
|
|
2014-02-08 23:36:58 +08:00
|
|
|
struct cgroup_subsys net_cls_cgrp_subsys = {
|
2013-12-30 01:27:10 +08:00
|
|
|
.css_alloc = cgrp_css_alloc,
|
|
|
|
.css_online = cgrp_css_online,
|
|
|
|
.css_free = cgrp_css_free,
|
|
|
|
.attach = cgrp_attach,
|
2014-07-15 23:05:09 +08:00
|
|
|
.legacy_cftypes = ss_files,
|
2013-12-30 01:27:10 +08:00
|
|
|
};
|