mirror of
https://mirrors.bfsu.edu.cn/git/linux.git
synced 2025-01-02 20:04:49 +08:00
8520e224f5
Fix cgroup v1 interference when non-root cgroup v2 BPF programs are used. Back in the days, commit bd1060a1d6
("sock, cgroup: add sock->sk_cgroup") embedded per-socket cgroup information into sock->sk_cgrp_data and in order to save 8 bytes in struct sock made both mutually exclusive, that is, when cgroup v1 socket tagging (e.g. net_cls/net_prio) is used, then cgroup v2 falls back to the root cgroup in sock_cgroup_ptr() (&cgrp_dfl_root.cgrp). The assumption made was "there is no reason to mix the two and this is in line with how legacy and v2 compatibility is handled" as stated in bd1060a1d6
. However, with Kubernetes more widely supporting cgroups v2 as well nowadays, this assumption no longer holds, and the possibility of the v1/v2 mixed mode with the v2 root fallback being hit becomes a real security issue. Many of the cgroup v2 BPF programs are also used for policy enforcement, just to pick _one_ example, that is, to programmatically deny socket related system calls like connect(2) or bind(2). A v2 root fallback would implicitly cause a policy bypass for the affected Pods. In production environments, we have recently seen this case due to various circumstances: i) a different 3rd party agent and/or ii) a container runtime such as [0] in the user's environment configuring legacy cgroup v1 net_cls tags, which triggered implicitly mentioned root fallback. Another case is Kubernetes projects like kind [1] which create Kubernetes nodes in a container and also add cgroup namespaces to the mix, meaning programs which are attached to the cgroup v2 root of the cgroup namespace get attached to a non-root cgroup v2 path from init namespace point of view. And the latter's root is out of reach for agents on a kind Kubernetes node to configure. Meaning, any entity on the node setting cgroup v1 net_cls tag will trigger the bypass despite cgroup v2 BPF programs attached to the namespace root. Generally, this mutual exclusiveness does not hold anymore in today's user environments and makes cgroup v2 usage from BPF side fragile and unreliable. This fix adds proper struct cgroup pointer for the cgroup v2 case to struct sock_cgroup_data in order to address these issues; this implicitly also fixes the tradeoffs being made back then with regards to races and refcount leaks as stated in bd1060a1d6
, and removes the fallback, so that cgroup v2 BPF programs always operate as expected. [0] https://github.com/nestybox/sysbox/ [1] https://kind.sigs.k8s.io/ Fixes: bd1060a1d6
("sock, cgroup: add sock->sk_cgroup") Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Signed-off-by: Alexei Starovoitov <ast@kernel.org> Acked-by: Stanislav Fomichev <sdf@google.com> Acked-by: Tejun Heo <tj@kernel.org> Link: https://lore.kernel.org/bpf/20210913230759.2313-1-daniel@iogearbox.net
147 lines
3.2 KiB
C
147 lines
3.2 KiB
C
// SPDX-License-Identifier: GPL-2.0-or-later
|
|
/*
|
|
* net/core/netclassid_cgroup.c Classid Cgroupfs Handling
|
|
*
|
|
* Authors: Thomas Graf <tgraf@suug.ch>
|
|
*/
|
|
|
|
#include <linux/slab.h>
|
|
#include <linux/cgroup.h>
|
|
#include <linux/fdtable.h>
|
|
#include <linux/sched/task.h>
|
|
|
|
#include <net/cls_cgroup.h>
|
|
#include <net/sock.h>
|
|
|
|
static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state *css)
|
|
{
|
|
return css ? container_of(css, struct cgroup_cls_state, css) : NULL;
|
|
}
|
|
|
|
struct cgroup_cls_state *task_cls_state(struct task_struct *p)
|
|
{
|
|
return css_cls_state(task_css_check(p, net_cls_cgrp_id,
|
|
rcu_read_lock_bh_held()));
|
|
}
|
|
EXPORT_SYMBOL_GPL(task_cls_state);
|
|
|
|
static struct cgroup_subsys_state *
|
|
cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
|
|
{
|
|
struct cgroup_cls_state *cs;
|
|
|
|
cs = kzalloc(sizeof(*cs), GFP_KERNEL);
|
|
if (!cs)
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
return &cs->css;
|
|
}
|
|
|
|
static int cgrp_css_online(struct cgroup_subsys_state *css)
|
|
{
|
|
struct cgroup_cls_state *cs = css_cls_state(css);
|
|
struct cgroup_cls_state *parent = css_cls_state(css->parent);
|
|
|
|
if (parent)
|
|
cs->classid = parent->classid;
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
 * css_free callback: release the cgroup_cls_state that embeds @css.
 */
static void cgrp_css_free(struct cgroup_subsys_state *css)
{
	struct cgroup_cls_state *state = css_cls_state(css);

	kfree(state);
}
|
|
|
|
/*
|
|
* To avoid freezing of sockets creation for tasks with big number of threads
|
|
* and opened sockets, let's release file_lock every 1000 iterated descriptors.
|
|
* New sockets will already have been created with new classid.
|
|
*/
|
|
|
|
struct update_classid_context {
|
|
u32 classid;
|
|
unsigned int batch;
|
|
};
|
|
|
|
#define UPDATE_CLASSID_BATCH 1000
|
|
|
|
static int update_classid_sock(const void *v, struct file *file, unsigned n)
|
|
{
|
|
struct update_classid_context *ctx = (void *)v;
|
|
struct socket *sock = sock_from_file(file);
|
|
|
|
if (sock)
|
|
sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid);
|
|
if (--ctx->batch == 0) {
|
|
ctx->batch = UPDATE_CLASSID_BATCH;
|
|
return n + 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Tag every socket in the descriptor table of task @p with @classid.
 *
 * The table is walked in passes of UPDATE_CLASSID_BATCH descriptors. Each
 * pass runs under task_lock(@p); between passes the lock is dropped and
 * cond_resched() is called. The walk ends when iterate_fd() returns 0.
 */
static void update_classid_task(struct task_struct *p, u32 classid)
{
	struct update_classid_context ctx;
	unsigned int next_fd = 0;

	ctx.classid = classid;
	ctx.batch = UPDATE_CLASSID_BATCH;

	for (;;) {
		task_lock(p);
		next_fd = iterate_fd(p->files, next_fd, update_classid_sock, &ctx);
		task_unlock(p);

		cond_resched();

		/* A zero return means the callback never asked to pause. */
		if (!next_fd)
			break;
	}
}
|
|
|
|
static void cgrp_attach(struct cgroup_taskset *tset)
|
|
{
|
|
struct cgroup_subsys_state *css;
|
|
struct task_struct *p;
|
|
|
|
cgroup_taskset_for_each(p, css, tset) {
|
|
update_classid_task(p, css_cls_state(css)->classid);
|
|
}
|
|
}
|
|
|
|
static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft)
|
|
{
|
|
return css_cls_state(css)->classid;
|
|
}
|
|
|
|
static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
|
|
u64 value)
|
|
{
|
|
struct cgroup_cls_state *cs = css_cls_state(css);
|
|
struct css_task_iter it;
|
|
struct task_struct *p;
|
|
|
|
cs->classid = (u32)value;
|
|
|
|
css_task_iter_start(css, 0, &it);
|
|
while ((p = css_task_iter_next(&it)))
|
|
update_classid_task(p, cs->classid);
|
|
css_task_iter_end(&it);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static struct cftype ss_files[] = {
|
|
{
|
|
.name = "classid",
|
|
.read_u64 = read_classid,
|
|
.write_u64 = write_classid,
|
|
},
|
|
{ } /* terminate */
|
|
};
|
|
|
|
struct cgroup_subsys net_cls_cgrp_subsys = {
|
|
.css_alloc = cgrp_css_alloc,
|
|
.css_online = cgrp_css_online,
|
|
.css_free = cgrp_css_free,
|
|
.attach = cgrp_attach,
|
|
.legacy_cftypes = ss_files,
|
|
};
|