2019-05-27 14:55:01 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
2011-11-22 13:10:51 +08:00
|
|
|
/*
|
|
|
|
* net/core/netprio_cgroup.c Priority Control Group
|
|
|
|
*
|
|
|
|
* Authors: Neil Horman <nhorman@tuxdriver.com>
|
|
|
|
*/
|
|
|
|
|
2012-05-17 03:58:40 +08:00
|
|
|
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
|
|
|
|
2017-03-29 05:45:06 +08:00
|
|
|
#include <linux/module.h>
|
2011-11-22 13:10:51 +08:00
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/string.h>
|
|
|
|
#include <linux/errno.h>
|
|
|
|
#include <linux/skbuff.h>
|
|
|
|
#include <linux/cgroup.h>
|
|
|
|
#include <linux/rcupdate.h>
|
|
|
|
#include <linux/atomic.h>
|
2017-02-06 17:57:33 +08:00
|
|
|
#include <linux/sched/task.h>
|
|
|
|
|
2011-11-22 13:10:51 +08:00
|
|
|
#include <net/rtnetlink.h>
|
|
|
|
#include <net/pkt_cls.h>
|
|
|
|
#include <net/sock.h>
|
|
|
|
#include <net/netprio_cgroup.h>
|
|
|
|
|
2012-07-20 18:39:25 +08:00
|
|
|
#include <linux/fdtable.h>
|
|
|
|
|
2015-12-08 06:38:51 +08:00
|
|
|
/*
|
|
|
|
* netprio allocates per-net_device priomap array which is indexed by
|
|
|
|
* css->id. Limiting css ID to 16bits doesn't lose anything.
|
|
|
|
*/
|
|
|
|
#define NETPRIO_ID_MAX USHRT_MAX
|
|
|
|
|
2012-11-22 23:32:46 +08:00
|
|
|
#define PRIOMAP_MIN_SZ 128
|
2011-11-22 13:10:51 +08:00
|
|
|
|
2012-11-22 23:32:46 +08:00
|
|
|
/*
|
2013-12-09 04:15:44 +08:00
|
|
|
* Extend @dev->priomap so that it's large enough to accommodate
|
2012-11-22 23:32:46 +08:00
|
|
|
* @target_idx. @dev->priomap.priomap_len > @target_idx after successful
|
|
|
|
* return. Must be called under rtnl lock.
|
|
|
|
*/
|
|
|
|
static int extend_netdev_table(struct net_device *dev, u32 target_idx)
|
2011-11-22 13:10:51 +08:00
|
|
|
{
|
2012-11-22 23:32:46 +08:00
|
|
|
struct netprio_map *old, *new;
|
|
|
|
size_t new_sz, new_len;
|
2011-11-22 13:10:51 +08:00
|
|
|
|
2012-11-22 23:32:46 +08:00
|
|
|
/* is the existing priomap large enough? */
|
2012-11-22 23:32:46 +08:00
|
|
|
old = rtnl_dereference(dev->priomap);
|
2012-11-22 23:32:46 +08:00
|
|
|
if (old && old->priomap_len > target_idx)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Determine the new size. Let's keep it power-of-two. We start
|
|
|
|
* from PRIOMAP_MIN_SZ and double it until it's large enough to
|
|
|
|
* accommodate @target_idx.
|
|
|
|
*/
|
|
|
|
new_sz = PRIOMAP_MIN_SZ;
|
|
|
|
while (true) {
|
|
|
|
new_len = (new_sz - offsetof(struct netprio_map, priomap)) /
|
|
|
|
sizeof(new->priomap[0]);
|
|
|
|
if (new_len > target_idx)
|
|
|
|
break;
|
|
|
|
new_sz *= 2;
|
|
|
|
/* overflowed? */
|
|
|
|
if (WARN_ON(new_sz < PRIOMAP_MIN_SZ))
|
|
|
|
return -ENOSPC;
|
|
|
|
}
|
2011-11-22 13:10:51 +08:00
|
|
|
|
2012-11-22 23:32:46 +08:00
|
|
|
/* allocate & copy */
|
|
|
|
new = kzalloc(new_sz, GFP_KERNEL);
|
2013-02-05 00:48:16 +08:00
|
|
|
if (!new)
|
net: cgroup: fix access the unallocated memory in netprio cgroup
there are some out of bound accesses in netprio cgroup.
now before accessing the dev->priomap.priomap array,we only check
if the dev->priomap exist.and because we don't want to see
additional bound checkings in fast path, so we should make sure
that dev->priomap is null or array size of dev->priomap.priomap
is equal to max_prioidx + 1;
so in write_priomap logic,we should call extend_netdev_table when
dev->priomap is null and dev->priomap.priomap_len < max_len.
and in cgrp_create->update_netdev_tables logic,we should call
extend_netdev_table only when dev->priomap exist and
dev->priomap.priomap_len < max_len.
and it's not needed to call update_netdev_tables in write_priomap,
we can only allocate the net device's priomap which we change through
net_prio.ifpriomap.
this patch also add a return value for update_netdev_tables &
extend_netdev_table, so when new_priomap is allocated failed,
write_priomap will stop to access the priomap,and return -ENOMEM
back to the userspace to tell the user what happend.
Change From v3:
1. add rtnl protect when reading max_prioidx in write_priomap.
2. only call extend_netdev_table when map->priomap_len < max_len,
this will make sure array size of dev->map->priomap always
bigger than any prioidx.
3. add a function write_update_netdev_table to make codes clear.
Change From v2:
1. protect extend_netdev_table by RTNL.
2. when extend_netdev_table failed,call dev_put to reduce device's refcount.
Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Eric Dumazet <edumazet@google.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-07-12 05:50:15 +08:00
|
|
|
return -ENOMEM;
|
2011-11-22 13:10:51 +08:00
|
|
|
|
2012-11-22 23:32:46 +08:00
|
|
|
if (old)
|
|
|
|
memcpy(new->priomap, old->priomap,
|
|
|
|
old->priomap_len * sizeof(old->priomap[0]));
|
2011-11-22 13:10:51 +08:00
|
|
|
|
2012-11-22 23:32:46 +08:00
|
|
|
new->priomap_len = new_len;
|
2011-11-22 13:10:51 +08:00
|
|
|
|
2012-11-22 23:32:46 +08:00
|
|
|
/* install the new priomap */
|
2012-11-22 23:32:46 +08:00
|
|
|
rcu_assign_pointer(dev->priomap, new);
|
|
|
|
if (old)
|
|
|
|
kfree_rcu(old, rcu);
|
net: cgroup: fix access the unallocated memory in netprio cgroup
there are some out of bound accesses in netprio cgroup.
now before accessing the dev->priomap.priomap array,we only check
if the dev->priomap exist.and because we don't want to see
additional bound checkings in fast path, so we should make sure
that dev->priomap is null or array size of dev->priomap.priomap
is equal to max_prioidx + 1;
so in write_priomap logic,we should call extend_netdev_table when
dev->priomap is null and dev->priomap.priomap_len < max_len.
and in cgrp_create->update_netdev_tables logic,we should call
extend_netdev_table only when dev->priomap exist and
dev->priomap.priomap_len < max_len.
and it's not needed to call update_netdev_tables in write_priomap,
we can only allocate the net device's priomap which we change through
net_prio.ifpriomap.
this patch also add a return value for update_netdev_tables &
extend_netdev_table, so when new_priomap is allocated failed,
write_priomap will stop to access the priomap,and return -ENOMEM
back to the userspace to tell the user what happend.
Change From v3:
1. add rtnl protect when reading max_prioidx in write_priomap.
2. only call extend_netdev_table when map->priomap_len < max_len,
this will make sure array size of dev->map->priomap always
bigger than any prioidx.
3. add a function write_update_netdev_table to make codes clear.
Change From v2:
1. protect extend_netdev_table by RTNL.
2. when extend_netdev_table failed,call dev_put to reduce device's refcount.
Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Eric Dumazet <edumazet@google.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-07-12 05:50:15 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2012-11-22 23:32:47 +08:00
|
|
|
/**
|
|
|
|
* netprio_prio - return the effective netprio of a cgroup-net_device pair
|
2013-08-09 08:11:22 +08:00
|
|
|
* @css: css part of the target pair
|
2012-11-22 23:32:47 +08:00
|
|
|
* @dev: net_device part of the target pair
|
|
|
|
*
|
|
|
|
* Should be called under RCU read or rtnl lock.
|
|
|
|
*/
|
2013-08-09 08:11:22 +08:00
|
|
|
static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev)
|
2012-11-22 23:32:47 +08:00
|
|
|
{
|
|
|
|
struct netprio_map *map = rcu_dereference_rtnl(dev->priomap);
|
2019-11-05 07:54:29 +08:00
|
|
|
int id = css->id;
|
2012-11-22 23:32:47 +08:00
|
|
|
|
2013-08-09 08:11:22 +08:00
|
|
|
if (map && id < map->priomap_len)
|
|
|
|
return map->priomap[id];
|
2012-11-22 23:32:47 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* netprio_set_prio - set netprio on a cgroup-net_device pair
|
2013-08-09 08:11:22 +08:00
|
|
|
* @css: css part of the target pair
|
2012-11-22 23:32:47 +08:00
|
|
|
* @dev: net_device part of the target pair
|
|
|
|
* @prio: prio to set
|
|
|
|
*
|
2013-08-09 08:11:22 +08:00
|
|
|
* Set netprio to @prio on @css-@dev pair. Should be called under rtnl
|
2012-11-22 23:32:47 +08:00
|
|
|
* lock and may fail under memory pressure for non-zero @prio.
|
|
|
|
*/
|
2013-08-09 08:11:22 +08:00
|
|
|
static int netprio_set_prio(struct cgroup_subsys_state *css,
|
|
|
|
struct net_device *dev, u32 prio)
|
2012-11-22 23:32:47 +08:00
|
|
|
{
|
|
|
|
struct netprio_map *map;
|
2019-11-05 07:54:29 +08:00
|
|
|
int id = css->id;
|
2012-11-22 23:32:47 +08:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
/* avoid extending priomap for zero writes */
|
|
|
|
map = rtnl_dereference(dev->priomap);
|
2013-08-09 08:11:22 +08:00
|
|
|
if (!prio && (!map || map->priomap_len <= id))
|
2012-11-22 23:32:47 +08:00
|
|
|
return 0;
|
|
|
|
|
2013-08-09 08:11:22 +08:00
|
|
|
ret = extend_netdev_table(dev, id);
|
2012-11-22 23:32:47 +08:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
map = rtnl_dereference(dev->priomap);
|
2013-08-09 08:11:22 +08:00
|
|
|
map->priomap[id] = prio;
|
2012-11-22 23:32:47 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-08-09 08:11:23 +08:00
|
|
|
static struct cgroup_subsys_state *
|
|
|
|
cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
|
2011-11-22 13:10:51 +08:00
|
|
|
{
|
2013-08-09 08:11:22 +08:00
|
|
|
struct cgroup_subsys_state *css;
|
2012-11-22 23:32:47 +08:00
|
|
|
|
2013-08-09 08:11:22 +08:00
|
|
|
css = kzalloc(sizeof(*css), GFP_KERNEL);
|
|
|
|
if (!css)
|
2011-11-22 13:10:51 +08:00
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
2013-08-09 08:11:22 +08:00
|
|
|
return css;
|
2011-11-22 13:10:51 +08:00
|
|
|
}
|
|
|
|
|
2013-08-09 08:11:23 +08:00
|
|
|
static int cgrp_css_online(struct cgroup_subsys_state *css)
|
2011-11-22 13:10:51 +08:00
|
|
|
{
|
2014-05-17 01:22:48 +08:00
|
|
|
struct cgroup_subsys_state *parent_css = css->parent;
|
2011-11-22 13:10:51 +08:00
|
|
|
struct net_device *dev;
|
2012-11-22 23:32:47 +08:00
|
|
|
int ret = 0;
|
|
|
|
|
2015-12-08 06:38:51 +08:00
|
|
|
if (css->id > NETPRIO_ID_MAX)
|
|
|
|
return -ENOSPC;
|
|
|
|
|
2013-08-09 08:11:23 +08:00
|
|
|
if (!parent_css)
|
2012-11-22 23:32:47 +08:00
|
|
|
return 0;
|
2011-11-22 13:10:51 +08:00
|
|
|
|
|
|
|
rtnl_lock();
|
2012-11-22 23:32:47 +08:00
|
|
|
/*
|
|
|
|
* Inherit prios from the parent. As all prios are set during
|
|
|
|
* onlining, there is no need to clear them on offline.
|
|
|
|
*/
|
|
|
|
for_each_netdev(&init_net, dev) {
|
2013-08-09 08:11:22 +08:00
|
|
|
u32 prio = netprio_prio(parent_css, dev);
|
2012-11-22 23:32:47 +08:00
|
|
|
|
2013-08-09 08:11:22 +08:00
|
|
|
ret = netprio_set_prio(css, dev, prio);
|
2012-11-22 23:32:47 +08:00
|
|
|
if (ret)
|
|
|
|
break;
|
|
|
|
}
|
2011-11-22 13:10:51 +08:00
|
|
|
rtnl_unlock();
|
2012-11-22 23:32:47 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-08-09 08:11:23 +08:00
|
|
|
/* css_free callback: release the css allocated by cgrp_css_alloc(). */
static void cgrp_css_free(struct cgroup_subsys_state *css)
{
	kfree(css);
}
|
|
|
|
|
2013-08-09 08:11:24 +08:00
|
|
|
static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft)
|
2011-11-22 13:10:51 +08:00
|
|
|
{
|
2019-11-05 07:54:29 +08:00
|
|
|
return css->id;
|
2011-11-22 13:10:51 +08:00
|
|
|
}
|
|
|
|
|
2013-12-06 01:28:04 +08:00
|
|
|
static int read_priomap(struct seq_file *sf, void *v)
|
2011-11-22 13:10:51 +08:00
|
|
|
{
|
|
|
|
struct net_device *dev;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
2012-11-22 23:32:47 +08:00
|
|
|
for_each_netdev_rcu(&init_net, dev)
|
2013-12-06 01:28:04 +08:00
|
|
|
seq_printf(sf, "%s %u\n", dev->name,
|
|
|
|
netprio_prio(seq_css(sf), dev));
|
2011-11-22 13:10:51 +08:00
|
|
|
rcu_read_unlock();
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2014-05-14 00:16:21 +08:00
|
|
|
/*
 * write handler of the "ifpriomap" file.
 *
 * Expects "<ifname> <prio>" in @buf and stores <prio> for the css behind
 * @of on the named device in init_net.  Returns @nbytes on success,
 * -EINVAL if @buf does not parse, -ENODEV if no such device exists, or
 * the error returned by netprio_set_prio().
 */
static ssize_t write_priomap(struct kernfs_open_file *of,
			     char *buf, size_t nbytes, loff_t off)
{
	char ifname[IFNAMSIZ + 1];
	struct net_device *netdev;
	u32 prio;
	int err;

	/* the field width caps the name at IFNAMSIZ chars plus the NUL */
	if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", ifname, &prio) != 2)
		return -EINVAL;

	/* takes a reference on the device; dropped by dev_put() below */
	netdev = dev_get_by_name(&init_net, ifname);
	if (!netdev)
		return -ENODEV;

	rtnl_lock();
	err = netprio_set_prio(of_css(of), netdev, prio);
	rtnl_unlock();

	dev_put(netdev);

	if (err)
		return err;

	return nbytes;
}
|
|
|
|
|
2012-08-22 10:32:06 +08:00
|
|
|
static int update_netprio(const void *v, struct file *file, unsigned n)
|
|
|
|
{
|
2020-12-04 19:36:04 +08:00
|
|
|
struct socket *sock = sock_from_file(file);
|
bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode
Fix cgroup v1 interference when non-root cgroup v2 BPF programs are used.
Back in the days, commit bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup")
embedded per-socket cgroup information into sock->sk_cgrp_data and in order
to save 8 bytes in struct sock made both mutually exclusive, that is, when
cgroup v1 socket tagging (e.g. net_cls/net_prio) is used, then cgroup v2
falls back to the root cgroup in sock_cgroup_ptr() (&cgrp_dfl_root.cgrp).
The assumption made was "there is no reason to mix the two and this is in line
with how legacy and v2 compatibility is handled" as stated in bd1060a1d671.
However, with Kubernetes more widely supporting cgroups v2 as well nowadays,
this assumption no longer holds, and the possibility of the v1/v2 mixed mode
with the v2 root fallback being hit becomes a real security issue.
Many of the cgroup v2 BPF programs are also used for policy enforcement, just
to pick _one_ example, that is, to programmatically deny socket related system
calls like connect(2) or bind(2). A v2 root fallback would implicitly cause
a policy bypass for the affected Pods.
In production environments, we have recently seen this case due to various
circumstances: i) a different 3rd party agent and/or ii) a container runtime
such as [0] in the user's environment configuring legacy cgroup v1 net_cls
tags, which triggered implicitly mentioned root fallback. Another case is
Kubernetes projects like kind [1] which create Kubernetes nodes in a container
and also add cgroup namespaces to the mix, meaning programs which are attached
to the cgroup v2 root of the cgroup namespace get attached to a non-root
cgroup v2 path from init namespace point of view. And the latter's root is
out of reach for agents on a kind Kubernetes node to configure. Meaning, any
entity on the node setting cgroup v1 net_cls tag will trigger the bypass
despite cgroup v2 BPF programs attached to the namespace root.
Generally, this mutual exclusiveness does not hold anymore in today's user
environments and makes cgroup v2 usage from BPF side fragile and unreliable.
This fix adds proper struct cgroup pointer for the cgroup v2 case to struct
sock_cgroup_data in order to address these issues; this implicitly also fixes
the tradeoffs being made back then with regards to races and refcount leaks
as stated in bd1060a1d671, and removes the fallback, so that cgroup v2 BPF
programs always operate as expected.
[0] https://github.com/nestybox/sysbox/
[1] https://kind.sigs.k8s.io/
Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Stanislav Fomichev <sdf@google.com>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lore.kernel.org/bpf/20210913230759.2313-1-daniel@iogearbox.net
2021-09-14 07:07:57 +08:00
|
|
|
|
|
|
|
if (sock)
|
2015-12-08 06:38:52 +08:00
|
|
|
sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data,
|
|
|
|
(unsigned long)v);
|
2012-08-22 10:32:06 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
cgroup: fix handling of multi-destination migration from subtree_control enabling
Consider the following v2 hierarchy.
P0 (+memory) --- P1 (-memory) --- A
\- B
P0 has memory enabled in its subtree_control while P1 doesn't. If
both A and B contain processes, they would belong to the memory css of
P1. Now if memory is enabled on P1's subtree_control, memory csses
should be created on both A and B and A's processes should be moved to
the former and B's processes the latter. IOW, enabling controllers
can cause atomic migrations into different csses.
The core cgroup migration logic has been updated accordingly but the
controller migration methods haven't and still assume that all tasks
migrate to a single target css; furthermore, the methods were fed the
css in which subtree_control was updated which is the parent of the
target csses. pids controller depends on the migration methods to
move charges and this made the controller attribute charges to the
wrong csses often triggering the following warning by driving a
counter negative.
WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
Modules linked in:
CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
...
ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
Call Trace:
[<ffffffff81551ffc>] dump_stack+0x4e/0x82
[<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
[<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
[<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
[<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
[<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
[<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
[<ffffffff81189016>] cgroup_attach_task+0x176/0x200
[<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
[<ffffffff81189684>] cgroup_procs_write+0x14/0x20
[<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
[<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
[<ffffffff81265f88>] __vfs_write+0x28/0xe0
[<ffffffff812666fc>] vfs_write+0xac/0x1a0
[<ffffffff81267019>] SyS_write+0x49/0xb0
[<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76
This patch fixes the bug by removing @css parameter from the three
migration methods, ->can_attach, ->cancel_attach() and ->attach() and
updating cgroup_taskset iteration helpers also return the destination
css in addition to the task being migrated. All controllers are
updated accordingly.
* Controllers which don't care whether there are one or multiple
target csses can be converted trivially. cpu, io, freezer, perf,
netclassid and netprio fall in this category.
* cpuset's current implementation assumes that there's single source
and destination and thus doesn't support v2 hierarchy already. The
only change made by this patchset is how that single destination css
is obtained.
* memory migration path already doesn't do anything on v2. How the
single destination css is obtained is updated and the prep stage of
mem_cgroup_can_attach() is reordered to accommodate the change.
* pids is the only controller which was affected by this bug. It now
correctly handles multi-destination migrations and no longer causes
counter underflow from incorrect accounting.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
2015-12-03 23:18:21 +08:00
|
|
|
static void net_prio_attach(struct cgroup_taskset *tset)
|
2012-07-20 18:39:25 +08:00
|
|
|
{
|
|
|
|
struct task_struct *p;
|
cgroup: fix handling of multi-destination migration from subtree_control enabling
Consider the following v2 hierarchy.
P0 (+memory) --- P1 (-memory) --- A
\- B
P0 has memory enabled in its subtree_control while P1 doesn't. If
both A and B contain processes, they would belong to the memory css of
P1. Now if memory is enabled on P1's subtree_control, memory csses
should be created on both A and B and A's processes should be moved to
the former and B's processes the latter. IOW, enabling controllers
can cause atomic migrations into different csses.
The core cgroup migration logic has been updated accordingly but the
controller migration methods haven't and still assume that all tasks
migrate to a single target css; furthermore, the methods were fed the
css in which subtree_control was updated which is the parent of the
target csses. pids controller depends on the migration methods to
move charges and this made the controller attribute charges to the
wrong csses often triggering the following warning by driving a
counter negative.
WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
Modules linked in:
CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
...
ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
Call Trace:
[<ffffffff81551ffc>] dump_stack+0x4e/0x82
[<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
[<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
[<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
[<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
[<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
[<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
[<ffffffff81189016>] cgroup_attach_task+0x176/0x200
[<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
[<ffffffff81189684>] cgroup_procs_write+0x14/0x20
[<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
[<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
[<ffffffff81265f88>] __vfs_write+0x28/0xe0
[<ffffffff812666fc>] vfs_write+0xac/0x1a0
[<ffffffff81267019>] SyS_write+0x49/0xb0
[<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76
This patch fixes the bug by removing @css parameter from the three
migration methods, ->can_attach, ->cancel_attach() and ->attach() and
updating cgroup_taskset iteration helpers also return the destination
css in addition to the task being migrated. All controllers are
updated accordingly.
* Controllers which don't care whether there are one or multiple
target csses can be converted trivially. cpu, io, freezer, perf,
netclassid and netprio fall in this category.
* cpuset's current implementation assumes that there's single source
and destination and thus doesn't support v2 hierarchy already. The
only change made by this patchset is how that single destination css
is obtained.
* memory migration path already doesn't do anything on v2. How the
single destination css is obtained is updated and the prep stage of
mem_cgroup_can_attach() is reordered to accomodate the change.
* pids is the only controller which was affected by this bug. It now
correctly handles multi-destination migrations and no longer causes
counter underflow from incorrect accounting.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
2015-12-03 23:18:21 +08:00
|
|
|
struct cgroup_subsys_state *css;
|
|
|
|
|
|
|
|
cgroup_taskset_for_each(p, css, tset) {
|
2019-11-05 07:54:29 +08:00
|
|
|
void *v = (void *)(unsigned long)css->id;
|
2012-07-20 18:39:25 +08:00
|
|
|
|
|
|
|
task_lock(p);
|
2012-08-22 10:32:06 +08:00
|
|
|
iterate_fd(p->files, 0, update_netprio, v);
|
2012-07-20 18:39:25 +08:00
|
|
|
task_unlock(p);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-11-22 13:10:51 +08:00
|
|
|
static struct cftype ss_files[] = {
|
|
|
|
{
|
|
|
|
.name = "prioidx",
|
|
|
|
.read_u64 = read_prioidx,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "ifpriomap",
|
2013-12-06 01:28:04 +08:00
|
|
|
.seq_show = read_priomap,
|
2014-05-14 00:16:21 +08:00
|
|
|
.write = write_priomap,
|
2011-11-22 13:10:51 +08:00
|
|
|
},
|
2012-04-02 03:09:55 +08:00
|
|
|
{ } /* terminate */
|
2011-11-22 13:10:51 +08:00
|
|
|
};
|
|
|
|
|
2014-02-08 23:36:58 +08:00
|
|
|
struct cgroup_subsys net_prio_cgrp_subsys = {
|
2012-11-20 00:13:38 +08:00
|
|
|
.css_alloc = cgrp_css_alloc,
|
2012-11-22 23:32:47 +08:00
|
|
|
.css_online = cgrp_css_online,
|
2012-11-20 00:13:38 +08:00
|
|
|
.css_free = cgrp_css_free,
|
2012-07-20 18:39:25 +08:00
|
|
|
.attach = net_prio_attach,
|
2014-07-15 23:05:09 +08:00
|
|
|
.legacy_cftypes = ss_files,
|
2012-04-02 03:09:55 +08:00
|
|
|
};
|
2011-11-22 13:10:51 +08:00
|
|
|
|
|
|
|
static int netprio_device_event(struct notifier_block *unused,
|
|
|
|
unsigned long event, void *ptr)
|
|
|
|
{
|
2013-05-28 09:30:21 +08:00
|
|
|
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
|
2011-11-22 13:10:51 +08:00
|
|
|
struct netprio_map *old;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Note this is called with rtnl_lock held so we have update side
|
|
|
|
* protection on our rcu assignments
|
|
|
|
*/
|
|
|
|
|
|
|
|
switch (event) {
|
|
|
|
case NETDEV_UNREGISTER:
|
|
|
|
old = rtnl_dereference(dev->priomap);
|
2011-11-23 15:09:32 +08:00
|
|
|
RCU_INIT_POINTER(dev->priomap, NULL);
|
2011-11-22 13:10:51 +08:00
|
|
|
if (old)
|
|
|
|
kfree_rcu(old, rcu);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
return NOTIFY_DONE;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct notifier_block netprio_device_notifier = {
|
|
|
|
.notifier_call = netprio_device_event
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Register the netdevice notifier that drops a device's priomap when the
 * device is unregistered.
 *
 * Returns 0 on success or the negative errno from
 * register_netdevice_notifier(), so a failed registration is reported to
 * the initcall machinery instead of being silently discarded.
 */
static int __init init_cgroup_netprio(void)
{
	return register_netdevice_notifier(&netprio_device_notifier);
}
|
2014-02-08 23:36:58 +08:00
|
|
|
subsys_initcall(init_cgroup_netprio);
|