diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index dc5e2dcdbef4..558c3a739baf 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -149,6 +149,16 @@ during boot, before manual intervention is possible. To make testing and experimenting easier, the kernel parameter cgroup_no_v1= allows disabling controllers in v1 and make them always available in v2. +cgroup v2 currently supports the following mount options. + + nsdelegate + + Consider cgroup namespaces as delegation boundaries. This + option is system wide and can only be set on mount or modified + through remount from the init namespace. The mount option is + ignored on non-init namespace mounts. Please refer to the + Delegation section for details. + 2-2. Organizing Processes @@ -308,18 +318,27 @@ file. 2-5-1. Model of Delegation -A cgroup can be delegated to a less privileged user by granting write -access of the directory and its "cgroup.procs" file to the user. Note -that resource control interface files in a given directory control the -distribution of the parent's resources and thus must not be delegated -along with the directory. +A cgroup can be delegated in two ways. First, to a less privileged +user by granting write access of the directory and its "cgroup.procs" +and "cgroup.subtree_control" files to the user. Second, if the +"nsdelegate" mount option is set, automatically to a cgroup namespace +on namespace creation. -Once delegated, the user can build sub-hierarchy under the directory, -organize processes as it sees fit and further distribute the resources -it received from the parent. The limits and other settings of all -resource controllers are hierarchical and regardless of what happens -in the delegated sub-hierarchy, nothing can escape the resource -restrictions imposed by the parent. 
+Because the resource control interface files in a given directory +control the distribution of the parent's resources, the delegatee +shouldn't be allowed to write to them. For the first method, this is +achieved by not granting access to these files. For the second, the +kernel rejects writes to all files other than "cgroup.procs" and +"cgroup.subtree_control" on a namespace root from inside the +namespace. + +The end results are equivalent for both delegation types. Once +delegated, the user can build sub-hierarchy under the directory, +organize processes inside it as it sees fit and further distribute the +resources it received from the parent. The limits and other settings +of all resource controllers are hierarchical and regardless of what +happens in the delegated sub-hierarchy, nothing can escape the +resource restrictions imposed by the parent. Currently, cgroup doesn't impose any restrictions on the number of cgroups in or nesting depth of a delegated sub-hierarchy; however, @@ -329,10 +348,12 @@ this may be limited explicitly in the future. 2-5-2. Delegation Containment A delegated sub-hierarchy is contained in the sense that processes -can't be moved into or out of the sub-hierarchy by the delegatee. For -a process with a non-root euid to migrate a target process into a -cgroup by writing its PID to the "cgroup.procs" file, the following -conditions must be met. +can't be moved into or out of the sub-hierarchy by the delegatee. + +For delegations to a less privileged user, this is achieved by +requiring the following conditions for a process with a non-root euid +to migrate a target process into a cgroup by writing its PID to the +"cgroup.procs" file. - The writer must have write access to the "cgroup.procs" file. @@ -359,6 +380,11 @@ destination cgroup C00 is above the points of delegation and U0 would not have write access to its "cgroup.procs" files and thus the write will be denied with -EACCES. 
+For delegations to namespaces, containment is achieved by requiring +that both the source and destination cgroups are reachable from the +namespace of the process which is attempting the migration. If either +is not reachable, the migration is rejected with -ENOENT. + 2-6. Guidelines @@ -1413,7 +1439,7 @@ D. Deprecated v1 Core Features - Multiple hierarchies including named ones are not supported. -- All mount options and remounting are not supported. +- All v1 mount options are not supported. - The "tasks" file is removed and "cgroup.procs" is not sorted. diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ec47101cb1bf..09f4c7df1478 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -67,12 +67,21 @@ enum { enum { CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ + + /* + * Consider namespaces as delegation boundaries. If this flag is + * set, controller specific interface files in a namespace root + * aren't writeable from inside the namespace. + */ + CGRP_ROOT_NS_DELEGATE = (1 << 3), }; /* cftype->flags */ enum { CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ + CFTYPE_NS_DELEGATABLE = (1 << 2), /* writeable beyond delegation boundaries */ + CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ @@ -166,6 +175,9 @@ struct css_set { /* the default cgroup associated with this css_set */ struct cgroup *dfl_cgrp; + /* internal task count, protected by css_set_lock */ + int nr_tasks; + /* * Lists running through all tasks using this cgroup group. 
* mg_tasks lists tasks which belong to this cset but are in the diff --git a/init/Kconfig b/init/Kconfig index ee0f03b69d11..b0fcbb2c6f56 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -859,11 +859,14 @@ config CGROUP_BPF inet sockets. config CGROUP_DEBUG - bool "Example controller" + bool "Debug controller" default n + depends on DEBUG_KERNEL help This option enables a simple controller that exports - debugging information about the cgroups framework. + debugging information about the cgroups framework. This + controller is for control cgroup debugging only. Its + interfaces are not stable. Say N. diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index 387348a40c64..ce693ccb8c58 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_CGROUP_FREEZER) += freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o obj-$(CONFIG_CGROUP_RDMA) += rdma.o obj-$(CONFIG_CPUSETS) += cpuset.o +obj-$(CONFIG_CGROUP_DEBUG) += debug.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 00f4d6bf048f..793565c05742 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -192,6 +192,8 @@ int cgroup_rmdir(struct kernfs_node *kn); int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, struct kernfs_root *kf_root); +int cgroup_task_count(const struct cgroup *cgrp); + /* * namespace.c */ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 85d75152402d..7bf4b1533f34 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -334,19 +334,15 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, /** * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question - * - * Return the number of tasks in the cgroup. The returned number can be - * higher than the actual number of tasks due to css_set references from - * namespace roots and temporary usages. 
*/ -static int cgroup_task_count(const struct cgroup *cgrp) +int cgroup_task_count(const struct cgroup *cgrp) { int count = 0; struct cgrp_cset_link *link; spin_lock_irq(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) - count += refcount_read(&link->cset->refcount); + count += link->cset->nr_tasks; spin_unlock_irq(&css_set_lock); return count; } @@ -1263,150 +1259,3 @@ static int __init cgroup_no_v1(char *str) return 1; } __setup("cgroup_no_v1=", cgroup_no_v1); - - -#ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state * -debug_css_alloc(struct cgroup_subsys_state *parent_css) -{ - struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); - - if (!css) - return ERR_PTR(-ENOMEM); - - return css; -} - -static void debug_css_free(struct cgroup_subsys_state *css) -{ - kfree(css); -} - -static u64 debug_taskcount_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return cgroup_task_count(css->cgroup); -} - -static u64 current_css_set_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return (u64)(unsigned long)current->cgroups; -} - -static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - u64 count; - - rcu_read_lock(); - count = refcount_read(&task_css_set(current)->refcount); - rcu_read_unlock(); - return count; -} - -static int current_css_set_cg_links_read(struct seq_file *seq, void *v) -{ - struct cgrp_cset_link *link; - struct css_set *cset; - char *name_buf; - - name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); - if (!name_buf) - return -ENOMEM; - - spin_lock_irq(&css_set_lock); - rcu_read_lock(); - cset = rcu_dereference(current->cgroups); - list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { - struct cgroup *c = link->cgrp; - - cgroup_name(c, name_buf, NAME_MAX + 1); - seq_printf(seq, "Root %d group %s\n", - c->root->hierarchy_id, name_buf); - } - rcu_read_unlock(); - spin_unlock_irq(&css_set_lock); - kfree(name_buf); - return 0; -} - -#define 
MAX_TASKS_SHOWN_PER_CSS 25 -static int cgroup_css_links_read(struct seq_file *seq, void *v) -{ - struct cgroup_subsys_state *css = seq_css(seq); - struct cgrp_cset_link *link; - - spin_lock_irq(&css_set_lock); - list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { - struct css_set *cset = link->cset; - struct task_struct *task; - int count = 0; - - seq_printf(seq, "css_set %pK\n", cset); - - list_for_each_entry(task, &cset->tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) - goto overflow; - seq_printf(seq, " task %d\n", task_pid_vnr(task)); - } - - list_for_each_entry(task, &cset->mg_tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) - goto overflow; - seq_printf(seq, " task %d\n", task_pid_vnr(task)); - } - continue; - overflow: - seq_puts(seq, " ...\n"); - } - spin_unlock_irq(&css_set_lock); - return 0; -} - -static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) -{ - return (!cgroup_is_populated(css->cgroup) && - !css_has_online_children(&css->cgroup->self)); -} - -static struct cftype debug_files[] = { - { - .name = "taskcount", - .read_u64 = debug_taskcount_read, - }, - - { - .name = "current_css_set", - .read_u64 = current_css_set_read, - }, - - { - .name = "current_css_set_refcount", - .read_u64 = current_css_set_refcount_read, - }, - - { - .name = "current_css_set_cg_links", - .seq_show = current_css_set_cg_links_read, - }, - - { - .name = "cgroup_css_links", - .seq_show = cgroup_css_links_read, - }, - - { - .name = "releasable", - .read_u64 = releasable_read, - }, - - { } /* terminate */ -}; - -struct cgroup_subsys debug_cgrp_subsys = { - .css_alloc = debug_css_alloc, - .css_free = debug_css_free, - .legacy_cftypes = debug_files, -}; -#endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 8d4e85eae42c..620794a20a33 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -573,6 +573,11 @@ static int css_set_count = 1; /* 1 for init_css_set */ /** 
* css_set_populated - does a css_set contain any tasks? * @cset: target css_set + * + * css_set_populated() should be the same as !!cset->nr_tasks at steady + * state. However, css_set_populated() can be called while a task is being + * added to or removed from the linked list before the nr_tasks is + * properly updated. Hence, we can't just look at ->nr_tasks here. */ static bool css_set_populated(struct css_set *cset) { @@ -1542,10 +1547,56 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, return len; } +static int parse_cgroup_root_flags(char *data, unsigned int *root_flags) +{ + char *token; + + *root_flags = 0; + + if (!data) + return 0; + + while ((token = strsep(&data, ",")) != NULL) { + if (!strcmp(token, "nsdelegate")) { + *root_flags |= CGRP_ROOT_NS_DELEGATE; + continue; + } + + pr_err("cgroup2: unknown option \"%s\"\n", token); + return -EINVAL; + } + + return 0; +} + +static void apply_cgroup_root_flags(unsigned int root_flags) +{ + if (current->nsproxy->cgroup_ns == &init_cgroup_ns) { + if (root_flags & CGRP_ROOT_NS_DELEGATE) + cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; + else + cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; + } +} + +static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) +{ + if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) + seq_puts(seq, ",nsdelegate"); + return 0; +} + static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) { - pr_err("remount is not allowed\n"); - return -EINVAL; + unsigned int root_flags; + int ret; + + ret = parse_cgroup_root_flags(data, &root_flags); + if (ret) + return ret; + + apply_cgroup_root_flags(root_flags); + return 0; } /* @@ -1598,6 +1649,7 @@ static void cgroup_enable_task_cg_lists(void) css_set_update_populated(cset, true); list_add_tail(&p->cg_list, &cset->tasks); get_css_set(cset); + cset->nr_tasks++; } spin_unlock(&p->sighand->siglock); } while_each_thread(g, p); @@ -1784,6 +1836,7 @@ static struct dentry 
*cgroup_mount(struct file_system_type *fs_type, { struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct dentry *dentry; + int ret; get_cgroup_ns(ns); @@ -1801,16 +1854,21 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, cgroup_enable_task_cg_lists(); if (fs_type == &cgroup2_fs_type) { - if (data) { - pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); + unsigned int root_flags; + + ret = parse_cgroup_root_flags(data, &root_flags); + if (ret) { put_cgroup_ns(ns); - return ERR_PTR(-EINVAL); + return ERR_PTR(ret); } + cgrp_dfl_visible = true; cgroup_get_live(&cgrp_dfl_root.cgrp); dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, CGROUP2_SUPER_MAGIC, ns); + if (!IS_ERR(dentry)) + apply_cgroup_root_flags(root_flags); } else { dentry = cgroup1_mount(&cgroup_fs_type, flags, data, CGROUP_SUPER_MAGIC, ns); @@ -2064,8 +2122,10 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) struct css_set *to_cset = cset->mg_dst_cset; get_css_set(to_cset); + to_cset->nr_tasks++; css_set_move_task(task, from_cset, to_cset, true); + from_cset->nr_tasks--; put_css_set_locked(from_cset); } } spin_unlock_irq(&css_set_lock); @@ -2355,27 +2415,14 @@ static int cgroup_procs_write_permission(struct task_struct *task, struct cgroup *dst_cgrp, struct kernfs_open_file *of) { - int ret = 0; + struct super_block *sb = of->file->f_path.dentry->d_sb; + struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; + struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp; + struct cgroup *src_cgrp, *com_cgrp; + struct inode *inode; + int ret; - if (cgroup_on_dfl(dst_cgrp)) { - struct super_block *sb = of->file->f_path.dentry->d_sb; - struct cgroup *cgrp; - struct inode *inode; - - spin_lock_irq(&css_set_lock); - cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); - spin_unlock_irq(&css_set_lock); - - while (!cgroup_is_descendant(dst_cgrp, cgrp)) - cgrp = cgroup_parent(cgrp); - - ret = -ENOMEM; - inode = kernfs_get_inode(sb,
cgrp->procs_file.kn); - if (inode) { - ret = inode_permission(inode, MAY_WRITE); - iput(inode); - } - } else { + if (!cgroup_on_dfl(dst_cgrp)) { const struct cred *cred = current_cred(); const struct cred *tcred = get_task_cred(task); @@ -2383,14 +2430,47 @@ static int cgroup_procs_write_permission(struct task_struct *task, * even if we're attaching all tasks in the thread group, * we only need to check permissions on one of them. */ - if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && - !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) + if (uid_eq(cred->euid, GLOBAL_ROOT_UID) || + uid_eq(cred->euid, tcred->uid) || + uid_eq(cred->euid, tcred->suid)) + ret = 0; + else ret = -EACCES; + put_cred(tcred); + return ret; } - return ret; + /* find the source cgroup */ + spin_lock_irq(&css_set_lock); + src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); + spin_unlock_irq(&css_set_lock); + + /* and the common ancestor */ + com_cgrp = src_cgrp; + while (!cgroup_is_descendant(dst_cgrp, com_cgrp)) + com_cgrp = cgroup_parent(com_cgrp); + + /* %current should be authorized to migrate to the common ancestor */ + inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn); + if (!inode) + return -ENOMEM; + + ret = inode_permission(inode, MAY_WRITE); + iput(inode); + if (ret) + return ret; + + /* + * If namespaces are delegation boundaries, %current must be able + * to see both source and destination cgroups from its namespace. 
+ */ + if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) && + (!cgroup_is_descendant(src_cgrp, root_cgrp) || + !cgroup_is_descendant(dst_cgrp, root_cgrp))) + return -ENOENT; + + return 0; } /* @@ -2954,11 +3034,23 @@ static void cgroup_file_release(struct kernfs_open_file *of) static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { + struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct cgroup *cgrp = of->kn->parent->priv; struct cftype *cft = of->kn->priv; struct cgroup_subsys_state *css; int ret; + /* + * If namespaces are delegation boundaries, disallow writes to + * files in a non-init namespace root from inside the namespace + * except for the files explicitly marked delegatable - + * cgroup.procs and cgroup.subtree_control. + */ + if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && + !(cft->flags & CFTYPE_NS_DELEGATABLE) && + ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp) + return -EPERM; + if (cft->write) return cft->write(of, buf, nbytes, off); @@ -3792,6 +3884,7 @@ static int cgroup_procs_show(struct seq_file *s, void *v) static struct cftype cgroup_base_files[] = { { .name = "cgroup.procs", + .flags = CFTYPE_NS_DELEGATABLE, .file_offset = offsetof(struct cgroup, procs_file), .release = cgroup_procs_release, .seq_start = cgroup_procs_start, @@ -3805,6 +3898,7 @@ static struct cftype cgroup_base_files[] = { }, { .name = "cgroup.subtree_control", + .flags = CFTYPE_NS_DELEGATABLE, .seq_show = cgroup_subtree_control_show, .write = cgroup_subtree_control_write, }, @@ -4393,6 +4487,7 @@ int cgroup_rmdir(struct kernfs_node *kn) } static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { + .show_options = cgroup_show_options, .remount_fs = cgroup_remount, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, @@ -4789,6 +4884,7 @@ void cgroup_post_fork(struct task_struct *child) cset = task_css_set(current); if (list_empty(&child->cg_list)) { get_css_set(cset); + cset->nr_tasks++;
css_set_move_task(child, NULL, cset, false); } spin_unlock_irq(&css_set_lock); @@ -4838,6 +4934,7 @@ void cgroup_exit(struct task_struct *tsk) if (!list_empty(&tsk->cg_list)) { spin_lock_irq(&css_set_lock); css_set_move_task(tsk, cset, NULL, false); + cset->nr_tasks--; spin_unlock_irq(&css_set_lock); } else { get_css_set(cset); diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c new file mode 100644 index 000000000000..dac46af22782 --- /dev/null +++ b/kernel/cgroup/debug.c @@ -0,0 +1,357 @@ +/* + * Debug controller + * + * WARNING: This controller is for cgroup core debugging only. + * Its interfaces are unstable and subject to changes at any time. + */ +#include <linux/ctype.h> +#include <linux/mm.h> +#include <linux/slab.h> + +#include "cgroup-internal.h" + +static struct cgroup_subsys_state * +debug_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); + + if (!css) + return ERR_PTR(-ENOMEM); + + return css; +} + +static void debug_css_free(struct cgroup_subsys_state *css) +{ + kfree(css); +} + +/* + * debug_taskcount_read - return the number of tasks in a cgroup. + * @css: the css whose cgroup is in question + */ +static u64 debug_taskcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return cgroup_task_count(css->cgroup); +} + +static int current_css_set_read(struct seq_file *seq, void *v) +{ + struct kernfs_open_file *of = seq->private; + struct css_set *cset; + struct cgroup_subsys *ss; + struct cgroup_subsys_state *css; + int i, refcnt; + + if (!cgroup_kn_lock_live(of->kn, false)) + return -ENODEV; + + spin_lock_irq(&css_set_lock); + rcu_read_lock(); + cset = rcu_dereference(current->cgroups); + refcnt = refcount_read(&cset->refcount); + seq_printf(seq, "css_set %pK %d", cset, refcnt); + if (refcnt > cset->nr_tasks) + seq_printf(seq, " +%d", refcnt - cset->nr_tasks); + seq_puts(seq, "\n"); + + /* + * Print the css'es stored in the current css_set.
+ */ + for_each_subsys(ss, i) { + css = cset->subsys[ss->id]; + if (!css) + continue; + seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name, + (unsigned long)css, css->id); + } + rcu_read_unlock(); + spin_unlock_irq(&css_set_lock); + cgroup_kn_unlock(of->kn); + return 0; +} + +static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + u64 count; + + rcu_read_lock(); + count = refcount_read(&task_css_set(current)->refcount); + rcu_read_unlock(); + return count; +} + +static int current_css_set_cg_links_read(struct seq_file *seq, void *v) +{ + struct cgrp_cset_link *link; + struct css_set *cset; + char *name_buf; + + name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!name_buf) + return -ENOMEM; + + spin_lock_irq(&css_set_lock); + rcu_read_lock(); + cset = rcu_dereference(current->cgroups); + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + struct cgroup *c = link->cgrp; + + cgroup_name(c, name_buf, NAME_MAX + 1); + seq_printf(seq, "Root %d group %s\n", + c->root->hierarchy_id, name_buf); + } + rcu_read_unlock(); + spin_unlock_irq(&css_set_lock); + kfree(name_buf); + return 0; +} + +#define MAX_TASKS_SHOWN_PER_CSS 25 +static int cgroup_css_links_read(struct seq_file *seq, void *v) +{ + struct cgroup_subsys_state *css = seq_css(seq); + struct cgrp_cset_link *link; + int dead_cnt = 0, extra_refs = 0; + + spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { + struct css_set *cset = link->cset; + struct task_struct *task; + int count = 0; + int refcnt = refcount_read(&cset->refcount); + + seq_printf(seq, " %d", refcnt); + if (refcnt - cset->nr_tasks > 0) { + int extra = refcnt - cset->nr_tasks; + + seq_printf(seq, " +%d", extra); + /* + * Take out the one additional reference in + * init_css_set. 
+ */ + if (cset == &init_css_set) + extra--; + extra_refs += extra; + } + seq_puts(seq, "\n"); + + list_for_each_entry(task, &cset->tasks, cg_list) { + if (count++ < MAX_TASKS_SHOWN_PER_CSS) + seq_printf(seq, " task %d\n", + task_pid_vnr(task)); + } + + list_for_each_entry(task, &cset->mg_tasks, cg_list) { + if (count++ < MAX_TASKS_SHOWN_PER_CSS) + seq_printf(seq, " task %d\n", + task_pid_vnr(task)); + } + /* show # of overflowed tasks */ + if (count > MAX_TASKS_SHOWN_PER_CSS) + seq_printf(seq, " ... (%d)\n", + count - MAX_TASKS_SHOWN_PER_CSS); + + if (cset->dead) { + seq_puts(seq, " [dead]\n"); + dead_cnt++; + } + + WARN_ON(count != cset->nr_tasks); + } + spin_unlock_irq(&css_set_lock); + + if (!dead_cnt && !extra_refs) + return 0; + + seq_puts(seq, "\n"); + if (extra_refs) + seq_printf(seq, "extra references = %d\n", extra_refs); + if (dead_cnt) + seq_printf(seq, "dead css_sets = %d\n", dead_cnt); + + return 0; +} + +static int cgroup_subsys_states_read(struct seq_file *seq, void *v) +{ + struct kernfs_open_file *of = seq->private; + struct cgroup *cgrp; + struct cgroup_subsys *ss; + struct cgroup_subsys_state *css; + char pbuf[16]; + int i; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; + + for_each_subsys(ss, i) { + css = rcu_dereference_check(cgrp->subsys[ss->id], true); + if (!css) + continue; + + pbuf[0] = '\0'; + + /* Show the parent CSS if applicable */ + if (css->parent) + snprintf(pbuf, sizeof(pbuf), " P=%d", + css->parent->id); + seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, ss->name, + (unsigned long)css, css->id, + atomic_read(&css->online_cnt), pbuf); + } + + cgroup_kn_unlock(of->kn); + return 0; +} + +static void cgroup_masks_read_one(struct seq_file *seq, const char *name, + u16 mask) +{ + struct cgroup_subsys *ss; + int ssid; + bool first = true; + + seq_printf(seq, "%-17s: ", name); + for_each_subsys(ss, ssid) { + if (!(mask & (1 << ssid))) + continue; + if (!first) + seq_puts(seq, ", "); +
seq_puts(seq, ss->name); + first = false; + } + seq_putc(seq, '\n'); +} + +static int cgroup_masks_read(struct seq_file *seq, void *v) +{ + struct kernfs_open_file *of = seq->private; + struct cgroup *cgrp; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; + + cgroup_masks_read_one(seq, "subtree_control", cgrp->subtree_control); + cgroup_masks_read_one(seq, "subtree_ss_mask", cgrp->subtree_ss_mask); + + cgroup_kn_unlock(of->kn); + return 0; +} + +static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + return (!cgroup_is_populated(css->cgroup) && + !css_has_online_children(&css->cgroup->self)); +} + +static struct cftype debug_legacy_files[] = { + { + .name = "taskcount", + .read_u64 = debug_taskcount_read, + }, + + { + .name = "current_css_set", + .seq_show = current_css_set_read, + .flags = CFTYPE_ONLY_ON_ROOT, + }, + + { + .name = "current_css_set_refcount", + .read_u64 = current_css_set_refcount_read, + .flags = CFTYPE_ONLY_ON_ROOT, + }, + + { + .name = "current_css_set_cg_links", + .seq_show = current_css_set_cg_links_read, + .flags = CFTYPE_ONLY_ON_ROOT, + }, + + { + .name = "cgroup_css_links", + .seq_show = cgroup_css_links_read, + }, + + { + .name = "cgroup_subsys_states", + .seq_show = cgroup_subsys_states_read, + }, + + { + .name = "cgroup_masks", + .seq_show = cgroup_masks_read, + }, + + { + .name = "releasable", + .read_u64 = releasable_read, + }, + + { } /* terminate */ +}; + +static struct cftype debug_files[] = { + { + .name = "taskcount", + .read_u64 = debug_taskcount_read, + }, + + { + .name = "current_css_set", + .seq_show = current_css_set_read, + .flags = CFTYPE_ONLY_ON_ROOT, + }, + + { + .name = "current_css_set_refcount", + .read_u64 = current_css_set_refcount_read, + .flags = CFTYPE_ONLY_ON_ROOT, + }, + + { + .name = "current_css_set_cg_links", + .seq_show = current_css_set_cg_links_read, + .flags = CFTYPE_ONLY_ON_ROOT, + }, + + { + .name = "css_links", + .seq_show = 
cgroup_css_links_read, + }, + + { + .name = "csses", + .seq_show = cgroup_subsys_states_read, + }, + + { + .name = "masks", + .seq_show = cgroup_masks_read, + }, + + { } /* terminate */ +}; + +struct cgroup_subsys debug_cgrp_subsys = { + .css_alloc = debug_css_alloc, + .css_free = debug_css_free, + .legacy_cftypes = debug_legacy_files, +}; + +/* + * On v2, debug is an implicit controller enabled by "cgroup_debug" boot + * parameter. + */ +static int __init enable_cgroup_debug(char *str) +{ + debug_cgrp_subsys.dfl_cftypes = debug_files; + debug_cgrp_subsys.implicit_on_dfl = true; + return 1; +} +__setup("cgroup_debug", enable_cgroup_debug);