From 73a7242a06ff995d771fbe243e72b516feaa6e3d Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 13 Jun 2017 17:18:01 -0400
Subject: [PATCH 01/10] cgroup: Keep accurate count of tasks in each css_set

The reference count in the css_set data structure was used as a
proxy of the number of tasks attached to that css_set. However, that
count is actually not an accurate measure especially with thread mode
support. So a new variable nr_tasks is added to the css_set to keep
track of the actual task count. This new variable is protected by
the css_set_lock. Functions that require the actual task count are
updated to use the new variable.

tj: s/task_count/nr_tasks/ for consistency with cgroup_root->nr_cgrps.
    Refreshed on top of cgroup/for-v4.13 which dropped on
    css_set_populated() -> nr_tasks conversion.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h |  3 +++
 kernel/cgroup/cgroup-v1.c   |  6 +-----
 kernel/cgroup/cgroup.c      | 10 ++++++++++
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index ec47101cb1bf..3bc4196bf217 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -166,6 +166,9 @@ struct css_set {
 	/* the default cgroup associated with this css_set */
 	struct cgroup *dfl_cgrp;
 
+	/* internal task count, protected by css_set_lock */
+	int nr_tasks;
+
 	/*
 	 * Lists running through all tasks using this cgroup group.
 	 * mg_tasks lists tasks which belong to this cset but are in the
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 85d75152402d..e9ea5f201fac 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -334,10 +334,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
 /**
  * cgroup_task_count - count the number of tasks in a cgroup.
  * @cgrp: the cgroup in question
- *
- * Return the number of tasks in the cgroup.  The returned number can be
- * higher than the actual number of tasks due to css_set references from
- * namespace roots and temporary usages.
  */
 static int cgroup_task_count(const struct cgroup *cgrp)
 {
@@ -346,7 +342,7 @@ static int cgroup_task_count(const struct cgroup *cgrp)
 
 	spin_lock_irq(&css_set_lock);
 	list_for_each_entry(link, &cgrp->cset_links, cset_link)
-		count += refcount_read(&link->cset->refcount);
+		count += link->cset->nr_tasks;
 	spin_unlock_irq(&css_set_lock);
 	return count;
 }
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 8d4e85eae42c..dbfd7028b1c6 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -573,6 +573,11 @@ static int css_set_count	= 1;	/* 1 for init_css_set */
 /**
  * css_set_populated - does a css_set contain any tasks?
  * @cset: target css_set
+ *
+ * css_set_populated() should be the same as !!cset->nr_tasks at steady
+ * state. However, css_set_populated() can be called while a task is being
+ * added to or removed from the linked list before the nr_tasks is
+ * properly updated. Hence, we can't just look at ->nr_tasks here.
  */
 static bool css_set_populated(struct css_set *cset)
 {
@@ -1598,6 +1603,7 @@ static void cgroup_enable_task_cg_lists(void)
 				css_set_update_populated(cset, true);
 			list_add_tail(&p->cg_list, &cset->tasks);
 			get_css_set(cset);
+			cset->nr_tasks++;
 		}
 		spin_unlock(&p->sighand->siglock);
 	} while_each_thread(g, p);
@@ -2064,8 +2070,10 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
 			struct css_set *to_cset = cset->mg_dst_cset;
 
 			get_css_set(to_cset);
+			to_cset->nr_tasks++;
 			css_set_move_task(task, from_cset, to_cset, true);
 			put_css_set_locked(from_cset);
+			from_cset->nr_tasks--;
 		}
 	}
 	spin_unlock_irq(&css_set_lock);
@@ -4789,6 +4797,7 @@ void cgroup_post_fork(struct task_struct *child)
 		cset = task_css_set(current);
 		if (list_empty(&child->cg_list)) {
 			get_css_set(cset);
+			cset->nr_tasks++;
 			css_set_move_task(child, NULL, cset, false);
 		}
 		spin_unlock_irq(&css_set_lock);
@@ -4838,6 +4847,7 @@ void cgroup_exit(struct task_struct *tsk)
 	if (!list_empty(&tsk->cg_list)) {
 		spin_lock_irq(&css_set_lock);
 		css_set_move_task(tsk, cset, NULL, false);
+		cset->nr_tasks--;
 		spin_unlock_irq(&css_set_lock);
 	} else {
 		get_css_set(cset);

From a28f8f5e995fe5964ae304444913536058f26e37 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 13 Jun 2017 17:18:02 -0400
Subject: [PATCH 02/10] cgroup: Move debug cgroup to its own file

The debug cgroup currently resides within cgroup-v1.c and is enabled
only for v1 cgroup. To enable the debug cgroup also for v2, it makes
sense to put the code into its own file as it will no longer be v1
specific. There is no change to the debug cgroup specific code.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/Makefile          |   1 +
 kernel/cgroup/cgroup-internal.h |   2 +
 kernel/cgroup/cgroup-v1.c       | 149 +------------------------------
 kernel/cgroup/debug.c           | 153 ++++++++++++++++++++++++++++++++
 4 files changed, 157 insertions(+), 148 deletions(-)
 create mode 100644 kernel/cgroup/debug.c

diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index 387348a40c64..ce693ccb8c58 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
 obj-$(CONFIG_CGROUP_PIDS) += pids.o
 obj-$(CONFIG_CGROUP_RDMA) += rdma.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CGROUP_DEBUG) += debug.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 00f4d6bf048f..793565c05742 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -192,6 +192,8 @@ int cgroup_rmdir(struct kernfs_node *kn);
 int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 		     struct kernfs_root *kf_root);
 
+int cgroup_task_count(const struct cgroup *cgrp);
+
 /*
  * namespace.c
  */
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index e9ea5f201fac..7bf4b1533f34 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -335,7 +335,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
  * cgroup_task_count - count the number of tasks in a cgroup.
  * @cgrp: the cgroup in question
  */
-static int cgroup_task_count(const struct cgroup *cgrp)
+int cgroup_task_count(const struct cgroup *cgrp)
 {
 	int count = 0;
 	struct cgrp_cset_link *link;
@@ -1259,150 +1259,3 @@ static int __init cgroup_no_v1(char *str)
 	return 1;
 }
 __setup("cgroup_no_v1=", cgroup_no_v1);
-
-
-#ifdef CONFIG_CGROUP_DEBUG
-static struct cgroup_subsys_state *
-debug_css_alloc(struct cgroup_subsys_state *parent_css)
-{
-	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
-
-	if (!css)
-		return ERR_PTR(-ENOMEM);
-
-	return css;
-}
-
-static void debug_css_free(struct cgroup_subsys_state *css)
-{
-	kfree(css);
-}
-
-static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
-				struct cftype *cft)
-{
-	return cgroup_task_count(css->cgroup);
-}
-
-static u64 current_css_set_read(struct cgroup_subsys_state *css,
-				struct cftype *cft)
-{
-	return (u64)(unsigned long)current->cgroups;
-}
-
-static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
-					 struct cftype *cft)
-{
-	u64 count;
-
-	rcu_read_lock();
-	count = refcount_read(&task_css_set(current)->refcount);
-	rcu_read_unlock();
-	return count;
-}
-
-static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
-{
-	struct cgrp_cset_link *link;
-	struct css_set *cset;
-	char *name_buf;
-
-	name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
-	if (!name_buf)
-		return -ENOMEM;
-
-	spin_lock_irq(&css_set_lock);
-	rcu_read_lock();
-	cset = rcu_dereference(current->cgroups);
-	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
-		struct cgroup *c = link->cgrp;
-
-		cgroup_name(c, name_buf, NAME_MAX + 1);
-		seq_printf(seq, "Root %d group %s\n",
-			   c->root->hierarchy_id, name_buf);
-	}
-	rcu_read_unlock();
-	spin_unlock_irq(&css_set_lock);
-	kfree(name_buf);
-	return 0;
-}
-
-#define MAX_TASKS_SHOWN_PER_CSS 25
-static int cgroup_css_links_read(struct seq_file *seq, void *v)
-{
-	struct cgroup_subsys_state *css = seq_css(seq);
-	struct cgrp_cset_link *link;
-
-	spin_lock_irq(&css_set_lock);
-	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
-		struct css_set *cset = link->cset;
-		struct task_struct *task;
-		int count = 0;
-
-		seq_printf(seq, "css_set %pK\n", cset);
-
-		list_for_each_entry(task, &cset->tasks, cg_list) {
-			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
-				goto overflow;
-			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
-		}
-
-		list_for_each_entry(task, &cset->mg_tasks, cg_list) {
-			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
-				goto overflow;
-			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
-		}
-		continue;
-	overflow:
-		seq_puts(seq, "  ...\n");
-	}
-	spin_unlock_irq(&css_set_lock);
-	return 0;
-}
-
-static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
-{
-	return (!cgroup_is_populated(css->cgroup) &&
-		!css_has_online_children(&css->cgroup->self));
-}
-
-static struct cftype debug_files[] =  {
-	{
-		.name = "taskcount",
-		.read_u64 = debug_taskcount_read,
-	},
-
-	{
-		.name = "current_css_set",
-		.read_u64 = current_css_set_read,
-	},
-
-	{
-		.name = "current_css_set_refcount",
-		.read_u64 = current_css_set_refcount_read,
-	},
-
-	{
-		.name = "current_css_set_cg_links",
-		.seq_show = current_css_set_cg_links_read,
-	},
-
-	{
-		.name = "cgroup_css_links",
-		.seq_show = cgroup_css_links_read,
-	},
-
-	{
-		.name = "releasable",
-		.read_u64 = releasable_read,
-	},
-
-	{ }	/* terminate */
-};
-
-struct cgroup_subsys debug_cgrp_subsys = {
-	.css_alloc = debug_css_alloc,
-	.css_free = debug_css_free,
-	.legacy_cftypes = debug_files,
-};
-#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
new file mode 100644
index 000000000000..1c209aa43733
--- /dev/null
+++ b/kernel/cgroup/debug.c
@@ -0,0 +1,153 @@
+#include <linux/ctype.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include "cgroup-internal.h"
+
+static struct cgroup_subsys_state *
+debug_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
+
+	if (!css)
+		return ERR_PTR(-ENOMEM);
+
+	return css;
+}
+
+static void debug_css_free(struct cgroup_subsys_state *css)
+{
+	kfree(css);
+}
+
+/*
+ * debug_taskcount_read - return the number of tasks in a cgroup.
+ * @cgrp: the cgroup in question
+ */
+static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
+				struct cftype *cft)
+{
+	return cgroup_task_count(css->cgroup);
+}
+
+static u64 current_css_set_read(struct cgroup_subsys_state *css,
+				struct cftype *cft)
+{
+	return (u64)(unsigned long)current->cgroups;
+}
+
+static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
+					 struct cftype *cft)
+{
+	u64 count;
+
+	rcu_read_lock();
+	count = refcount_read(&task_css_set(current)->refcount);
+	rcu_read_unlock();
+	return count;
+}
+
+static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
+{
+	struct cgrp_cset_link *link;
+	struct css_set *cset;
+	char *name_buf;
+
+	name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+	if (!name_buf)
+		return -ENOMEM;
+
+	spin_lock_irq(&css_set_lock);
+	rcu_read_lock();
+	cset = rcu_dereference(current->cgroups);
+	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+		struct cgroup *c = link->cgrp;
+
+		cgroup_name(c, name_buf, NAME_MAX + 1);
+		seq_printf(seq, "Root %d group %s\n",
+			   c->root->hierarchy_id, name_buf);
+	}
+	rcu_read_unlock();
+	spin_unlock_irq(&css_set_lock);
+	kfree(name_buf);
+	return 0;
+}
+
+#define MAX_TASKS_SHOWN_PER_CSS 25
+static int cgroup_css_links_read(struct seq_file *seq, void *v)
+{
+	struct cgroup_subsys_state *css = seq_css(seq);
+	struct cgrp_cset_link *link;
+
+	spin_lock_irq(&css_set_lock);
+	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
+		struct css_set *cset = link->cset;
+		struct task_struct *task;
+		int count = 0;
+
+		seq_printf(seq, "css_set %pK\n", cset);
+
+		list_for_each_entry(task, &cset->tasks, cg_list) {
+			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+				goto overflow;
+			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
+		}
+
+		list_for_each_entry(task, &cset->mg_tasks, cg_list) {
+			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+				goto overflow;
+			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
+		}
+		continue;
+	overflow:
+		seq_puts(seq, "  ...\n");
+	}
+	spin_unlock_irq(&css_set_lock);
+	return 0;
+}
+
+static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+	return (!cgroup_is_populated(css->cgroup) &&
+		!css_has_online_children(&css->cgroup->self));
+}
+
+static struct cftype debug_files[] =  {
+	{
+		.name = "taskcount",
+		.read_u64 = debug_taskcount_read,
+	},
+
+	{
+		.name = "current_css_set",
+		.read_u64 = current_css_set_read,
+	},
+
+	{
+		.name = "current_css_set_refcount",
+		.read_u64 = current_css_set_refcount_read,
+	},
+
+	{
+		.name = "current_css_set_cg_links",
+		.seq_show = current_css_set_cg_links_read,
+	},
+
+	{
+		.name = "cgroup_css_links",
+		.seq_show = cgroup_css_links_read,
+	},
+
+	{
+		.name = "releasable",
+		.read_u64 = releasable_read,
+	},
+
+	{ }	/* terminate */
+};
+
+struct cgroup_subsys debug_cgrp_subsys = {
+	.css_alloc = debug_css_alloc,
+	.css_free = debug_css_free,
+	.legacy_cftypes = debug_files,
+};

From 23b0be480f341db26ce0dee7d3f6e67f8e0e166f Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 13 Jun 2017 17:18:03 -0400
Subject: [PATCH 03/10] cgroup: Make Kconfig prompt of debug cgroup more
 accurate

The Kconfig prompt and description of the debug cgroup controller
more accurate by saying that it is for debug purpose only and its
interfaces are unstable.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 init/Kconfig          | 7 +++++--
 kernel/cgroup/debug.c | 6 ++++++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index 1d3475fc9496..5f9d13929ae0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1205,11 +1205,14 @@ config CGROUP_BPF
 	  inet sockets.
 
 config CGROUP_DEBUG
-	bool "Example controller"
+	bool "Debug controller"
 	default n
+	depends on DEBUG_KERNEL
 	help
 	  This option enables a simple controller that exports
-	  debugging information about the cgroups framework.
+	  debugging information about the cgroups framework. This
+	  controller is for control cgroup debugging only. Its
+	  interfaces are not stable.
 
 	  Say N.
 
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 1c209aa43733..cbe77a2087c5 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -1,3 +1,9 @@
+/*
+ * Debug controller
+ *
+ * WARNING: This controller is for cgroup core debugging only.
+ * Its interfaces are unstable and subject to changes at any time.
+ */
 #include <linux/ctype.h>
 #include <linux/mm.h>
 #include <linux/slab.h>

From 575313f40ff33d0c2aff2701dfb2ccfcd6211d55 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 13 Jun 2017 17:18:04 -0400
Subject: [PATCH 04/10] cgroup: Make debug cgroup support v2 and thread mode

Besides supporting cgroup v2 and thread mode, the following changes
are also made:
 1) current_* cgroup files now resides only at the root as we don't
    need duplicated files of the same function all over the cgroup
    hierarchy.
 2) The cgroup_css_links_read() function is modified to report
    the number of tasks that are skipped because of overflow.
 3) The number of extra unaccounted references are displayed.
 4) The current_css_set_read() function now prints out the addresses of
    the css'es associated with the current css_set.
 5) A new cgroup_subsys_states file is added to display the css objects
    associated with a cgroup.
 6) A new cgroup_masks file is added to display the various controller
    bit masks in the cgroup.

tj: Dropped thread mode related information for now so that debug
    controller changes aren't blocked on the thread mode.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/debug.c | 170 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 153 insertions(+), 17 deletions(-)

diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index cbe77a2087c5..057d9b07f461 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -36,10 +36,37 @@ static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
 	return cgroup_task_count(css->cgroup);
 }
 
-static u64 current_css_set_read(struct cgroup_subsys_state *css,
-				struct cftype *cft)
+static int current_css_set_read(struct seq_file *seq, void *v)
 {
-	return (u64)(unsigned long)current->cgroups;
+	struct css_set *cset;
+	struct cgroup_subsys *ss;
+	struct cgroup_subsys_state *css;
+	int i, refcnt;
+
+	mutex_lock(&cgroup_mutex);
+	spin_lock_irq(&css_set_lock);
+	rcu_read_lock();
+	cset = rcu_dereference(current->cgroups);
+	refcnt = refcount_read(&cset->refcount);
+	seq_printf(seq, "css_set %pK %d", cset, refcnt);
+	if (refcnt > cset->nr_tasks)
+		seq_printf(seq, " +%d", refcnt - cset->nr_tasks);
+	seq_puts(seq, "\n");
+
+	/*
+	 * Print the css'es stored in the current css_set.
+	 */
+	for_each_subsys(ss, i) {
+		css = cset->subsys[ss->id];
+		if (!css)
+			continue;
+		seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name,
+			  (unsigned long)css, css->id);
+	}
+	rcu_read_unlock();
+	spin_unlock_irq(&css_set_lock);
+	mutex_unlock(&cgroup_mutex);
+	return 0;
 }
 
 static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
@@ -84,31 +111,126 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
 {
 	struct cgroup_subsys_state *css = seq_css(seq);
 	struct cgrp_cset_link *link;
+	int dead_cnt = 0, extra_refs = 0;
 
 	spin_lock_irq(&css_set_lock);
 	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
 		struct css_set *cset = link->cset;
 		struct task_struct *task;
 		int count = 0;
+		int refcnt = refcount_read(&cset->refcount);
 
-		seq_printf(seq, "css_set %pK\n", cset);
+		seq_printf(seq, " %d", refcnt);
+		if (refcnt - cset->nr_tasks > 0) {
+			int extra = refcnt - cset->nr_tasks;
+
+			seq_printf(seq, " +%d", extra);
+			/*
+			 * Take out the one additional reference in
+			 * init_css_set.
+			 */
+			if (cset == &init_css_set)
+				extra--;
+			extra_refs += extra;
+		}
+		seq_puts(seq, "\n");
 
 		list_for_each_entry(task, &cset->tasks, cg_list) {
-			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
-				goto overflow;
-			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
+			if (count++ <= MAX_TASKS_SHOWN_PER_CSS)
+				seq_printf(seq, "  task %d\n",
+					   task_pid_vnr(task));
 		}
 
 		list_for_each_entry(task, &cset->mg_tasks, cg_list) {
-			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
-				goto overflow;
-			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
+			if (count++ <= MAX_TASKS_SHOWN_PER_CSS)
+				seq_printf(seq, "  task %d\n",
+					   task_pid_vnr(task));
 		}
-		continue;
-	overflow:
-		seq_puts(seq, "  ...\n");
+		/* show # of overflowed tasks */
+		if (count > MAX_TASKS_SHOWN_PER_CSS)
+			seq_printf(seq, "  ... (%d)\n",
+				   count - MAX_TASKS_SHOWN_PER_CSS);
+
+		if (cset->dead) {
+			seq_puts(seq, "    [dead]\n");
+			dead_cnt++;
+		}
+
+		WARN_ON(count != cset->nr_tasks);
 	}
 	spin_unlock_irq(&css_set_lock);
+
+	if (!dead_cnt && !extra_refs)
+		return 0;
+
+	seq_puts(seq, "\n");
+	if (extra_refs)
+		seq_printf(seq, "extra references = %d\n", extra_refs);
+	if (dead_cnt)
+		seq_printf(seq, "dead css_sets = %d\n", dead_cnt);
+
+	return 0;
+}
+
+static int cgroup_subsys_states_read(struct seq_file *seq, void *v)
+{
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
+	struct cgroup_subsys *ss;
+	struct cgroup_subsys_state *css;
+	char pbuf[16];
+	int i;
+
+	mutex_lock(&cgroup_mutex);
+	for_each_subsys(ss, i) {
+		css = rcu_dereference_check(cgrp->subsys[ss->id], true);
+		if (!css)
+			continue;
+
+		pbuf[0] = '\0';
+
+		/* Show the parent CSS if applicable*/
+		if (css->parent)
+			snprintf(pbuf, sizeof(pbuf) - 1, " P=%d",
+				 css->parent->id);
+		seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, ss->name,
+			  (unsigned long)css, css->id,
+			  atomic_read(&css->online_cnt), pbuf);
+	}
+	mutex_unlock(&cgroup_mutex);
+	return 0;
+}
+
+static int cgroup_masks_read(struct seq_file *seq, void *v)
+{
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
+	struct cgroup_subsys *ss;
+	int i, j;
+	struct {
+		u16  *mask;
+		char *name;
+	} mask_list[] = {
+		{ &cgrp->subtree_control,  "subtree_control"  },
+		{ &cgrp->subtree_ss_mask,  "subtree_ss_mask"  },
+	};
+
+	mutex_lock(&cgroup_mutex);
+	for (i = 0; i < ARRAY_SIZE(mask_list); i++) {
+		u16 mask = *mask_list[i].mask;
+		bool first = true;
+
+		seq_printf(seq, "%-17s: ", mask_list[i].name);
+		for_each_subsys(ss, j) {
+			if (!(mask & (1 << ss->id)))
+				continue;
+			if (!first)
+				seq_puts(seq, ", ");
+			seq_puts(seq, ss->name);
+			first = false;
+		}
+		seq_putc(seq, '\n');
+	}
+
+	mutex_unlock(&cgroup_mutex);
 	return 0;
 }
 
@@ -126,17 +248,20 @@ static struct cftype debug_files[] =  {
 
 	{
 		.name = "current_css_set",
-		.read_u64 = current_css_set_read,
+		.seq_show = current_css_set_read,
+		.flags = CFTYPE_ONLY_ON_ROOT,
 	},
 
 	{
 		.name = "current_css_set_refcount",
 		.read_u64 = current_css_set_refcount_read,
+		.flags = CFTYPE_ONLY_ON_ROOT,
 	},
 
 	{
 		.name = "current_css_set_cg_links",
 		.seq_show = current_css_set_cg_links_read,
+		.flags = CFTYPE_ONLY_ON_ROOT,
 	},
 
 	{
@@ -144,6 +269,16 @@ static struct cftype debug_files[] =  {
 		.seq_show = cgroup_css_links_read,
 	},
 
+	{
+		.name = "cgroup_subsys_states",
+		.seq_show = cgroup_subsys_states_read,
+	},
+
+	{
+		.name = "cgroup_masks",
+		.seq_show = cgroup_masks_read,
+	},
+
 	{
 		.name = "releasable",
 		.read_u64 = releasable_read,
@@ -153,7 +288,8 @@ static struct cftype debug_files[] =  {
 };
 
 struct cgroup_subsys debug_cgrp_subsys = {
-	.css_alloc = debug_css_alloc,
-	.css_free = debug_css_free,
-	.legacy_cftypes = debug_files,
+	.css_alloc	= debug_css_alloc,
+	.css_free	= debug_css_free,
+	.legacy_cftypes	= debug_files,
+	.dfl_cftypes	= debug_files,
 };

From 8cc38fa7fa317d44710f24475576b1f9ee205da9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 14 Jun 2017 16:01:32 -0400
Subject: [PATCH 05/10] cgroup: make debug an implicit controller on cgroup2

Make debug an implicit controller on cgroup2 which is enabled by
"cgroup_debug" boot param.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Waiman Long <longman@redhat.com>
---
 kernel/cgroup/debug.c | 59 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 56 insertions(+), 3 deletions(-)

diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 057d9b07f461..d61e692a5338 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -240,7 +240,7 @@ static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
 		!css_has_online_children(&css->cgroup->self));
 }
 
-static struct cftype debug_files[] =  {
+static struct cftype debug_legacy_files[] =  {
 	{
 		.name = "taskcount",
 		.read_u64 = debug_taskcount_read,
@@ -287,9 +287,62 @@ static struct cftype debug_files[] =  {
 	{ }	/* terminate */
 };
 
+static struct cftype debug_files[] =  {
+	{
+		.name = "taskcount",
+		.read_u64 = debug_taskcount_read,
+	},
+
+	{
+		.name = "current_css_set",
+		.seq_show = current_css_set_read,
+		.flags = CFTYPE_ONLY_ON_ROOT,
+	},
+
+	{
+		.name = "current_css_set_refcount",
+		.read_u64 = current_css_set_refcount_read,
+		.flags = CFTYPE_ONLY_ON_ROOT,
+	},
+
+	{
+		.name = "current_css_set_cg_links",
+		.seq_show = current_css_set_cg_links_read,
+		.flags = CFTYPE_ONLY_ON_ROOT,
+	},
+
+	{
+		.name = "css_links",
+		.seq_show = cgroup_css_links_read,
+	},
+
+	{
+		.name = "csses",
+		.seq_show = cgroup_subsys_states_read,
+	},
+
+	{
+		.name = "masks",
+		.seq_show = cgroup_masks_read,
+	},
+
+	{ }	/* terminate */
+};
+
 struct cgroup_subsys debug_cgrp_subsys = {
 	.css_alloc	= debug_css_alloc,
 	.css_free	= debug_css_free,
-	.legacy_cftypes	= debug_files,
-	.dfl_cftypes	= debug_files,
+	.legacy_cftypes	= debug_legacy_files,
 };
+
+/*
+ * On v2, debug is an implicit controller enabled by "cgroup_debug" boot
+ * parameter.
+ */
+static int __init enable_cgroup_debug(char *str)
+{
+	debug_cgrp_subsys.dfl_cftypes = debug_files;
+	debug_cgrp_subsys.implicit_on_dfl = true;
+	return 1;
+}
+__setup("cgroup_debug", enable_cgroup_debug);

From 2866c0b4cf25274040f0b4cc045ad1f44b885dd8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 14 Jun 2017 16:01:36 -0400
Subject: [PATCH 06/10] cgroup: refactor cgroup_masks_read() in the debug
 controller

Factor out cgroup_masks_read_one() out of cgroup_masks_read() for
simplicity.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Waiman Long <longman@redhat.com>
---
 kernel/cgroup/debug.c | 46 ++++++++++++++++++++-----------------------
 1 file changed, 21 insertions(+), 25 deletions(-)

diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index d61e692a5338..163fdbd7adf6 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -200,36 +200,32 @@ static int cgroup_subsys_states_read(struct seq_file *seq, void *v)
 	return 0;
 }
 
+static void cgroup_masks_read_one(struct seq_file *seq, const char *name,
+				  u16 mask)
+{
+	struct cgroup_subsys *ss;
+	int ssid;
+	bool first = true;
+
+	seq_printf(seq, "%-17s: ", name);
+	for_each_subsys(ss, ssid) {
+		if (!(mask & (1 << ssid)))
+			continue;
+		if (!first)
+			seq_puts(seq, ", ");
+		seq_puts(seq, ss->name);
+		first = false;
+	}
+	seq_putc(seq, '\n');
+}
+
 static int cgroup_masks_read(struct seq_file *seq, void *v)
 {
 	struct cgroup *cgrp = seq_css(seq)->cgroup;
-	struct cgroup_subsys *ss;
-	int i, j;
-	struct {
-		u16  *mask;
-		char *name;
-	} mask_list[] = {
-		{ &cgrp->subtree_control,  "subtree_control"  },
-		{ &cgrp->subtree_ss_mask,  "subtree_ss_mask"  },
-	};
 
 	mutex_lock(&cgroup_mutex);
-	for (i = 0; i < ARRAY_SIZE(mask_list); i++) {
-		u16 mask = *mask_list[i].mask;
-		bool first = true;
-
-		seq_printf(seq, "%-17s: ", mask_list[i].name);
-		for_each_subsys(ss, j) {
-			if (!(mask & (1 << ss->id)))
-				continue;
-			if (!first)
-				seq_puts(seq, ", ");
-			seq_puts(seq, ss->name);
-			first = false;
-		}
-		seq_putc(seq, '\n');
-	}
-
+	cgroup_masks_read_one(seq, "subtree_control", cgrp->subtree_control);
+	cgroup_masks_read_one(seq, "subtree_ss_mask", cgrp->subtree_ss_mask);
 	mutex_unlock(&cgroup_mutex);
 	return 0;
 }

From b6053d40e3387c916d56d319ba6bfdb2c42a67c7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 14 Jun 2017 16:01:41 -0400
Subject: [PATCH 07/10] cgroup: fix lockdep warning in debug controller

The debug controller grabs cgroup_mutex from interface file show
functions which can deadlock and triggers lockdep warnings.  Fix it by
using cgroup_kn_lock_live()/cgroup_kn_unlock() instead.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Waiman Long <longman@redhat.com>
---
 kernel/cgroup/debug.c | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 163fdbd7adf6..dac46af22782 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -38,12 +38,15 @@ static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
 
 static int current_css_set_read(struct seq_file *seq, void *v)
 {
+	struct kernfs_open_file *of = seq->private;
 	struct css_set *cset;
 	struct cgroup_subsys *ss;
 	struct cgroup_subsys_state *css;
 	int i, refcnt;
 
-	mutex_lock(&cgroup_mutex);
+	if (!cgroup_kn_lock_live(of->kn, false))
+		return -ENODEV;
+
 	spin_lock_irq(&css_set_lock);
 	rcu_read_lock();
 	cset = rcu_dereference(current->cgroups);
@@ -65,7 +68,7 @@ static int current_css_set_read(struct seq_file *seq, void *v)
 	}
 	rcu_read_unlock();
 	spin_unlock_irq(&css_set_lock);
-	mutex_unlock(&cgroup_mutex);
+	cgroup_kn_unlock(of->kn);
 	return 0;
 }
 
@@ -174,13 +177,17 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
 
 static int cgroup_subsys_states_read(struct seq_file *seq, void *v)
 {
-	struct cgroup *cgrp = seq_css(seq)->cgroup;
+	struct kernfs_open_file *of = seq->private;
+	struct cgroup *cgrp;
 	struct cgroup_subsys *ss;
 	struct cgroup_subsys_state *css;
 	char pbuf[16];
 	int i;
 
-	mutex_lock(&cgroup_mutex);
+	cgrp = cgroup_kn_lock_live(of->kn, false);
+	if (!cgrp)
+		return -ENODEV;
+
 	for_each_subsys(ss, i) {
 		css = rcu_dereference_check(cgrp->subsys[ss->id], true);
 		if (!css)
@@ -196,7 +203,8 @@ static int cgroup_subsys_states_read(struct seq_file *seq, void *v)
 			  (unsigned long)css, css->id,
 			  atomic_read(&css->online_cnt), pbuf);
 	}
-	mutex_unlock(&cgroup_mutex);
+
+	cgroup_kn_unlock(of->kn);
 	return 0;
 }
 
@@ -221,12 +229,17 @@ static void cgroup_masks_read_one(struct seq_file *seq, const char *name,
 
 static int cgroup_masks_read(struct seq_file *seq, void *v)
 {
-	struct cgroup *cgrp = seq_css(seq)->cgroup;
+	struct kernfs_open_file *of = seq->private;
+	struct cgroup *cgrp;
+
+	cgrp = cgroup_kn_lock_live(of->kn, false);
+	if (!cgrp)
+		return -ENODEV;
 
-	mutex_lock(&cgroup_mutex);
 	cgroup_masks_read_one(seq, "subtree_control", cgrp->subtree_control);
 	cgroup_masks_read_one(seq, "subtree_ss_mask", cgrp->subtree_ss_mask);
-	mutex_unlock(&cgroup_mutex);
+
+	cgroup_kn_unlock(of->kn);
 	return 0;
 }
 

From 39fd64ae9f471f2d0aee46c5bc8646e9f2494e64 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 25 Jun 2017 00:27:17 -0400
Subject: [PATCH 08/10] cgroup: "cgroup.subtree_control" should be writeable by
 delegatee

"cgroup.subtree_control" determines which resource types a cgroup
wants to control.  Unlike actual resource knobs, this is an attribute
which belongs to the cgroup itself instead of its parent and thus
should be writeable by the delegatee in a delegated cgroup.

Update delegation documentation accordingly.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/cgroup-v2.txt | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index dc5e2dcdbef4..0260ed053efd 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -309,10 +309,11 @@ file.
 2-5-1. Model of Delegation
 
 A cgroup can be delegated to a less privileged user by granting write
-access of the directory and its "cgroup.procs" file to the user.  Note
-that resource control interface files in a given directory control the
-distribution of the parent's resources and thus must not be delegated
-along with the directory.
+access of the directory and its "cgroup.procs" and
+"cgroup.subtree_control" files to the user.  Note that resource
+control interface files in a given directory control the distribution
+of the parent's resources and thus must not be delegated along with
+the directory.
 
 Once delegated, the user can build sub-hierarchy under the directory,
 organize processes as it sees fit and further distribute the resources

From 824ecbe01c5d833b8c8a371c209e3ac3a76cd18a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 25 Jun 2017 00:27:59 -0400
Subject: [PATCH 09/10] cgroup: restructure cgroup_procs_write_permission()

Restructure cgroup_procs_write_permission() to make extending
permission logic easier.

This patch doesn't cause any functional changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup.c | 57 ++++++++++++++++++++++++------------------
 1 file changed, 33 insertions(+), 24 deletions(-)

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index dbfd7028b1c6..d48069ee84c2 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2363,27 +2363,12 @@ static int cgroup_procs_write_permission(struct task_struct *task,
 					 struct cgroup *dst_cgrp,
 					 struct kernfs_open_file *of)
 {
-	int ret = 0;
+	struct super_block *sb = of->file->f_path.dentry->d_sb;
+	struct cgroup *src_cgrp, *com_cgrp;
+	struct inode *inode;
+	int ret;
 
-	if (cgroup_on_dfl(dst_cgrp)) {
-		struct super_block *sb = of->file->f_path.dentry->d_sb;
-		struct cgroup *cgrp;
-		struct inode *inode;
-
-		spin_lock_irq(&css_set_lock);
-		cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
-		spin_unlock_irq(&css_set_lock);
-
-		while (!cgroup_is_descendant(dst_cgrp, cgrp))
-			cgrp = cgroup_parent(cgrp);
-
-		ret = -ENOMEM;
-		inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
-		if (inode) {
-			ret = inode_permission(inode, MAY_WRITE);
-			iput(inode);
-		}
-	} else {
+	if (!cgroup_on_dfl(dst_cgrp)) {
 		const struct cred *cred = current_cred();
 		const struct cred *tcred = get_task_cred(task);
 
@@ -2391,14 +2376,38 @@ static int cgroup_procs_write_permission(struct task_struct *task,
 		 * even if we're attaching all tasks in the thread group,
 		 * we only need to check permissions on one of them.
 		 */
-		if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
-		    !uid_eq(cred->euid, tcred->uid) &&
-		    !uid_eq(cred->euid, tcred->suid))
+		if (uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
+		    uid_eq(cred->euid, tcred->uid) ||
+		    uid_eq(cred->euid, tcred->suid))
+			ret = 0;
+		else
 			ret = -EACCES;
+
 		put_cred(tcred);
+		return ret;
 	}
 
-	return ret;
+	/* find the source cgroup */
+	spin_lock_irq(&css_set_lock);
+	src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+	spin_unlock_irq(&css_set_lock);
+
+	/* and the common ancestor */
+	com_cgrp = src_cgrp;
+	while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
+		com_cgrp = cgroup_parent(com_cgrp);
+
+	/* %current should be authorized to migrate to the common ancestor */
+	inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
+	if (!inode)
+		return -ENOMEM;
+
+	ret = inode_permission(inode, MAY_WRITE);
+	iput(inode);
+	if (ret)
+		return ret;
+
+	return 0;
 }
 
 /*

From 5136f6365ce3eace5a926e10f16ed2a233db5ba9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 27 Jun 2017 14:30:28 -0400
Subject: [PATCH 10/10] cgroup: implement "nsdelegate" mount option

Currently, cgroup only supports delegation to !root users and cgroup
namespaces don't get any special treatments.  This limits the
usefulness of cgroup namespaces as they by themselves can't be safe
delegation boundaries.  A process inside a cgroup can change the
resource control knobs of the parent in the namespace root and may
move processes in and out of the namespace if cgroups outside its
namespace are visible somehow.

This patch adds a new mount option "nsdelegate" which makes cgroup
namespaces delegation boundaries.  If set, cgroup behaves as if write
permission based delegation took place at namespace boundaries -
writes to the resource control knobs from the namespace root are
denied and migration crossing the namespace boundary aren't allowed
from inside the namespace.

This allows cgroup namespace to function as a delegation boundary by
itself.

v2: Silently ignore nsdelegate specified on !init mounts.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Aravind Anbudurai <aru7@fb.com>
Cc: Serge Hallyn <serge@hallyn.com>
Cc: Eric Biederman <ebiederm@xmission.com>
---
 Documentation/cgroup-v2.txt | 59 ++++++++++++++++++-------
 include/linux/cgroup-defs.h |  9 ++++
 kernel/cgroup/cgroup.c      | 88 ++++++++++++++++++++++++++++++++++---
 3 files changed, 134 insertions(+), 22 deletions(-)

diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index 0260ed053efd..558c3a739baf 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -149,6 +149,16 @@ during boot, before manual intervention is possible. To make testing
 and experimenting easier, the kernel parameter cgroup_no_v1= allows
 disabling controllers in v1 and make them always available in v2.
 
+cgroup v2 currently supports the following mount options.
+
+  nsdelegate
+
+	Consider cgroup namespaces as delegation boundaries.  This
+	option is system wide and can only be set on mount or modified
+	through remount from the init namespace.  The mount option is
+	ignored on non-init namespace mounts.  Please refer to the
+	Delegation section for details.
+
 
 2-2. Organizing Processes
 
@@ -308,19 +318,27 @@ file.
 
 2-5-1. Model of Delegation
 
-A cgroup can be delegated to a less privileged user by granting write
-access of the directory and its "cgroup.procs" and
-"cgroup.subtree_control" files to the user.  Note that resource
-control interface files in a given directory control the distribution
-of the parent's resources and thus must not be delegated along with
-the directory.
+A cgroup can be delegated in two ways.  First, to a less privileged
+user by granting write access of the directory and its "cgroup.procs"
+and "cgroup.subtree_control" files to the user.  Second, if the
+"nsdelegate" mount option is set, automatically to a cgroup namespace
+on namespace creation.
 
-Once delegated, the user can build sub-hierarchy under the directory,
-organize processes as it sees fit and further distribute the resources
-it received from the parent.  The limits and other settings of all
-resource controllers are hierarchical and regardless of what happens
-in the delegated sub-hierarchy, nothing can escape the resource
-restrictions imposed by the parent.
+Because the resource control interface files in a given directory
+control the distribution of the parent's resources, the delegatee
+shouldn't be allowed to write to them.  For the first method, this is
+achieved by not granting access to these files.  For the second, the
+kernel rejects writes to all files other than "cgroup.procs" and
+"cgroup.subtree_control" on a namespace root from inside the
+namespace.
+
+The end results are equivalent for both delegation types.  Once
+delegated, the user can build sub-hierarchy under the directory,
+organize processes inside it as it sees fit and further distribute the
+resources it received from the parent.  The limits and other settings
+of all resource controllers are hierarchical and regardless of what
+happens in the delegated sub-hierarchy, nothing can escape the
+resource restrictions imposed by the parent.
 
 Currently, cgroup doesn't impose any restrictions on the number of
 cgroups in or nesting depth of a delegated sub-hierarchy; however,
@@ -330,10 +348,12 @@ this may be limited explicitly in the future.
 2-5-2. Delegation Containment
 
 A delegated sub-hierarchy is contained in the sense that processes
-can't be moved into or out of the sub-hierarchy by the delegatee.  For
-a process with a non-root euid to migrate a target process into a
-cgroup by writing its PID to the "cgroup.procs" file, the following
-conditions must be met.
+can't be moved into or out of the sub-hierarchy by the delegatee.
+
+For delegations to a less privileged user, this is achieved by
+requiring the following conditions for a process with a non-root euid
+to migrate a target process into a cgroup by writing its PID to the
+"cgroup.procs" file.
 
 - The writer must have write access to the "cgroup.procs" file.
 
@@ -360,6 +380,11 @@ destination cgroup C00 is above the points of delegation and U0 would
 not have write access to its "cgroup.procs" files and thus the write
 will be denied with -EACCES.
 
+For delegations to namespaces, containment is achieved by requiring
+that both the source and destination cgroups are reachable from the
+namespace of the process which is attempting the migration.  If either
+is not reachable, the migration is rejected with -ENOENT.
+
 
 2-6. Guidelines
 
@@ -1414,7 +1439,7 @@ D. Deprecated v1 Core Features
 
 - Multiple hierarchies including named ones are not supported.
 
-- All mount options and remounting are not supported.
+- All v1 mount options are not supported.
 
 - The "tasks" file is removed and "cgroup.procs" is not sorted.
 
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 3bc4196bf217..09f4c7df1478 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -67,12 +67,21 @@ enum {
 enum {
 	CGRP_ROOT_NOPREFIX	= (1 << 1), /* mounted subsystems have no named prefix */
 	CGRP_ROOT_XATTR		= (1 << 2), /* supports extended attributes */
+
+	/*
+	 * Consider namespaces as delegation boundaries.  If this flag is
+	 * set, controller specific interface files in a namespace root
+	 * aren't writeable from inside the namespace.
+	 */
+	CGRP_ROOT_NS_DELEGATE	= (1 << 3),
 };
 
 /* cftype->flags */
 enum {
 	CFTYPE_ONLY_ON_ROOT	= (1 << 0),	/* only create on root cgrp */
 	CFTYPE_NOT_ON_ROOT	= (1 << 1),	/* don't create on root cgrp */
+	CFTYPE_NS_DELEGATABLE	= (1 << 2),	/* writeable beyond delegation boundaries */
+
 	CFTYPE_NO_PREFIX	= (1 << 3),	/* (DON'T USE FOR NEW FILES) no subsys prefix */
 	CFTYPE_WORLD_WRITABLE	= (1 << 4),	/* (DON'T USE FOR NEW FILES) S_IWUGO */
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d48069ee84c2..620794a20a33 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1547,10 +1547,56 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 	return len;
 }
 
+static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
+{
+	char *token;
+
+	*root_flags = 0;
+
+	if (!data)
+		return 0;
+
+	while ((token = strsep(&data, ",")) != NULL) {
+		if (!strcmp(token, "nsdelegate")) {
+			*root_flags |= CGRP_ROOT_NS_DELEGATE;
+			continue;
+		}
+
+		pr_err("cgroup2: unknown option \"%s\"\n", token);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void apply_cgroup_root_flags(unsigned int root_flags)
+{
+	if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
+		if (root_flags & CGRP_ROOT_NS_DELEGATE)
+			cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
+		else
+			cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
+	}
+}
+
+static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
+{
+	if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
+		seq_puts(seq, ",nsdelegate");
+	return 0;
+}
+
 static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
 {
-	pr_err("remount is not allowed\n");
-	return -EINVAL;
+	unsigned int root_flags;
+	int ret;
+
+	ret = parse_cgroup_root_flags(data, &root_flags);
+	if (ret)
+		return ret;
+
+	apply_cgroup_root_flags(root_flags);
+	return 0;
 }
 
 /*
@@ -1790,6 +1836,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 {
 	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
 	struct dentry *dentry;
+	int ret;
 
 	get_cgroup_ns(ns);
 
@@ -1807,16 +1854,21 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		cgroup_enable_task_cg_lists();
 
 	if (fs_type == &cgroup2_fs_type) {
-		if (data) {
-			pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
+		unsigned int root_flags;
+
+		ret = parse_cgroup_root_flags(data, &root_flags);
+		if (ret) {
 			put_cgroup_ns(ns);
-			return ERR_PTR(-EINVAL);
+			return ERR_PTR(ret);
 		}
+
 		cgrp_dfl_visible = true;
 		cgroup_get_live(&cgrp_dfl_root.cgrp);
 
 		dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
 					 CGROUP2_SUPER_MAGIC, ns);
+		if (!IS_ERR(dentry))
+			apply_cgroup_root_flags(root_flags);
 	} else {
 		dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
 				       CGROUP_SUPER_MAGIC, ns);
@@ -2364,6 +2416,8 @@ static int cgroup_procs_write_permission(struct task_struct *task,
 					 struct kernfs_open_file *of)
 {
 	struct super_block *sb = of->file->f_path.dentry->d_sb;
+	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+	struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp;
 	struct cgroup *src_cgrp, *com_cgrp;
 	struct inode *inode;
 	int ret;
@@ -2407,6 +2461,15 @@ static int cgroup_procs_write_permission(struct task_struct *task,
 	if (ret)
 		return ret;
 
+	/*
+	 * If namespaces are delegation boundaries, %current must be able
+	 * to see both source and destination cgroups from its namespace.
+	 */
+	if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
+	    (!cgroup_is_descendant(src_cgrp, root_cgrp) ||
+	     !cgroup_is_descendant(dst_cgrp, root_cgrp)))
+		return -ENOENT;
+
 	return 0;
 }
 
@@ -2971,11 +3034,23 @@ static void cgroup_file_release(struct kernfs_open_file *of)
 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
 				 size_t nbytes, loff_t off)
 {
+	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
 	struct cgroup *cgrp = of->kn->parent->priv;
 	struct cftype *cft = of->kn->priv;
 	struct cgroup_subsys_state *css;
 	int ret;
 
+	/*
+	 * If namespaces are delegation boundaries, disallow writes to
+	 * files in an non-init namespace root from inside the namespace
+	 * except for the files explicitly marked delegatable -
+	 * cgroup.procs and cgroup.subtree_control.
+	 */
+	if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
+	    !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
+	    ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
+		return -EPERM;
+
 	if (cft->write)
 		return cft->write(of, buf, nbytes, off);
 
@@ -3809,6 +3884,7 @@ static int cgroup_procs_show(struct seq_file *s, void *v)
 static struct cftype cgroup_base_files[] = {
 	{
 		.name = "cgroup.procs",
+		.flags = CFTYPE_NS_DELEGATABLE,
 		.file_offset = offsetof(struct cgroup, procs_file),
 		.release = cgroup_procs_release,
 		.seq_start = cgroup_procs_start,
@@ -3822,6 +3898,7 @@ static struct cftype cgroup_base_files[] = {
 	},
 	{
 		.name = "cgroup.subtree_control",
+		.flags = CFTYPE_NS_DELEGATABLE,
 		.seq_show = cgroup_subtree_control_show,
 		.write = cgroup_subtree_control_write,
 	},
@@ -4410,6 +4487,7 @@ int cgroup_rmdir(struct kernfs_node *kn)
 }
 
 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
+	.show_options		= cgroup_show_options,
 	.remount_fs		= cgroup_remount,
 	.mkdir			= cgroup_mkdir,
 	.rmdir			= cgroup_rmdir,