cgroup: make css_set_lock a rwsem and rename it to css_set_rwsem

Currently there are two ways to walk tasks of a cgroup -
css_task_iter_start/next/end() and css_scan_tasks().  The latter
builds on the former but allows blocking while iterating.
Unfortunately, the way css_scan_tasks() is implemented is rather
nasty: it uses a priority heap of pointers to extract some number of
tasks in task creation order, loops over them invoking the callback,
and repeats that until it reaches the end.  It either requires a
preallocated heap or may fail under memory pressure.  While that is
unlikely to be problematic, the complexity is O(N^2), and the approach
is in general just nasty.

We're gonna convert all css_scan_tasks() users to
css_task_iter_start/next/end() and remove css_scan_tasks().  As
css_scan_tasks() users may block, let's convert css_set_lock to a
rwsem so that tasks can block while css_task_iter_*() is in progress.

While this does increase the chance of possible deadlock scenarios,
given the current usage, the probability is relatively low, and even
if that happens, the right thing to do is to update the iteration in
a way similar to css iterators so that it can handle blocking.

Most conversions are trivial; however, task_cgroup_from_root() now
expects to be called with css_set_rwsem held instead of taking the
lock itself.  This is because the function is called with the RCU read
lock held, and rwsem locking should nest outside the RCU read lock.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
This commit is contained in:
Tejun Heo 2014-02-13 06:58:40 -05:00
parent e406d1cfff
commit 96d365e0b8

View File

@ -42,6 +42,7 @@
#include <linux/sched.h> #include <linux/sched.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/spinlock.h> #include <linux/spinlock.h>
#include <linux/rwsem.h>
#include <linux/string.h> #include <linux/string.h>
#include <linux/sort.h> #include <linux/sort.h>
#include <linux/kmod.h> #include <linux/kmod.h>
@ -341,11 +342,10 @@ static struct css_set init_css_set;
static struct cgrp_cset_link init_cgrp_cset_link; static struct cgrp_cset_link init_cgrp_cset_link;
/* /*
* css_set_lock protects the list of css_set objects, and the chain of * css_set_rwsem protects the list of css_set objects, and the chain of
* tasks off each css_set. Nests outside task->alloc_lock due to * tasks off each css_set.
* css_task_iter_start().
*/ */
static DEFINE_RWLOCK(css_set_lock); static DECLARE_RWSEM(css_set_rwsem);
static int css_set_count; static int css_set_count;
/* /*
@ -380,9 +380,9 @@ static void __put_css_set(struct css_set *cset, int taskexit)
*/ */
if (atomic_add_unless(&cset->refcount, -1, 1)) if (atomic_add_unless(&cset->refcount, -1, 1))
return; return;
write_lock(&css_set_lock); down_write(&css_set_rwsem);
if (!atomic_dec_and_test(&cset->refcount)) { if (!atomic_dec_and_test(&cset->refcount)) {
write_unlock(&css_set_lock); up_write(&css_set_rwsem);
return; return;
} }
@ -396,7 +396,7 @@ static void __put_css_set(struct css_set *cset, int taskexit)
list_del(&link->cset_link); list_del(&link->cset_link);
list_del(&link->cgrp_link); list_del(&link->cgrp_link);
/* @cgrp can't go away while we're holding css_set_lock */ /* @cgrp can't go away while we're holding css_set_rwsem */
if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
if (taskexit) if (taskexit)
set_bit(CGRP_RELEASABLE, &cgrp->flags); set_bit(CGRP_RELEASABLE, &cgrp->flags);
@ -406,7 +406,7 @@ static void __put_css_set(struct css_set *cset, int taskexit)
kfree(link); kfree(link);
} }
write_unlock(&css_set_lock); up_write(&css_set_rwsem);
kfree_rcu(cset, rcu_head); kfree_rcu(cset, rcu_head);
} }
@ -627,11 +627,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
/* First see if we already have a cgroup group that matches /* First see if we already have a cgroup group that matches
* the desired set */ * the desired set */
read_lock(&css_set_lock); down_read(&css_set_rwsem);
cset = find_existing_css_set(old_cset, cgrp, template); cset = find_existing_css_set(old_cset, cgrp, template);
if (cset) if (cset)
get_css_set(cset); get_css_set(cset);
read_unlock(&css_set_lock); up_read(&css_set_rwsem);
if (cset) if (cset)
return cset; return cset;
@ -655,7 +655,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
* find_existing_css_set() */ * find_existing_css_set() */
memcpy(cset->subsys, template, sizeof(cset->subsys)); memcpy(cset->subsys, template, sizeof(cset->subsys));
write_lock(&css_set_lock); down_write(&css_set_rwsem);
/* Add reference counts and links from the new css_set. */ /* Add reference counts and links from the new css_set. */
list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp; struct cgroup *c = link->cgrp;
@ -673,7 +673,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
key = css_set_hash(cset->subsys); key = css_set_hash(cset->subsys);
hash_add(css_set_table, &cset->hlist, key); hash_add(css_set_table, &cset->hlist, key);
write_unlock(&css_set_lock); up_write(&css_set_rwsem);
return cset; return cset;
} }
@ -739,14 +739,14 @@ static void cgroup_destroy_root(struct cgroupfs_root *root)
* Release all the links from cset_links to this hierarchy's * Release all the links from cset_links to this hierarchy's
* root cgroup * root cgroup
*/ */
write_lock(&css_set_lock); down_write(&css_set_rwsem);
list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
list_del(&link->cset_link); list_del(&link->cset_link);
list_del(&link->cgrp_link); list_del(&link->cgrp_link);
kfree(link); kfree(link);
} }
write_unlock(&css_set_lock); up_write(&css_set_rwsem);
if (!list_empty(&root->root_list)) { if (!list_empty(&root->root_list)) {
list_del(&root->root_list); list_del(&root->root_list);
@ -764,7 +764,7 @@ static void cgroup_destroy_root(struct cgroupfs_root *root)
/* /*
* Return the cgroup for "task" from the given hierarchy. Must be * Return the cgroup for "task" from the given hierarchy. Must be
* called with cgroup_mutex held. * called with cgroup_mutex and css_set_rwsem held.
*/ */
static struct cgroup *task_cgroup_from_root(struct task_struct *task, static struct cgroup *task_cgroup_from_root(struct task_struct *task,
struct cgroupfs_root *root) struct cgroupfs_root *root)
@ -772,8 +772,9 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
struct css_set *cset; struct css_set *cset;
struct cgroup *res = NULL; struct cgroup *res = NULL;
BUG_ON(!mutex_is_locked(&cgroup_mutex)); lockdep_assert_held(&cgroup_mutex);
read_lock(&css_set_lock); lockdep_assert_held(&css_set_rwsem);
/* /*
* No need to lock the task - since we hold cgroup_mutex the * No need to lock the task - since we hold cgroup_mutex the
* task can't change groups, so the only thing that can happen * task can't change groups, so the only thing that can happen
@ -794,7 +795,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
} }
} }
} }
read_unlock(&css_set_lock);
BUG_ON(!res); BUG_ON(!res);
return res; return res;
} }
@ -1310,7 +1311,7 @@ static void cgroup_enable_task_cg_lists(void)
{ {
struct task_struct *p, *g; struct task_struct *p, *g;
write_lock(&css_set_lock); down_write(&css_set_rwsem);
if (use_task_css_set_links) if (use_task_css_set_links)
goto out_unlock; goto out_unlock;
@ -1343,7 +1344,7 @@ static void cgroup_enable_task_cg_lists(void)
} while_each_thread(g, p); } while_each_thread(g, p);
read_unlock(&tasklist_lock); read_unlock(&tasklist_lock);
out_unlock: out_unlock:
write_unlock(&css_set_lock); up_write(&css_set_rwsem);
} }
static void init_cgroup_housekeeping(struct cgroup *cgrp) static void init_cgroup_housekeeping(struct cgroup *cgrp)
@ -1408,7 +1409,7 @@ static int cgroup_setup_root(struct cgroupfs_root *root, unsigned long ss_mask)
root_cgrp->id = ret; root_cgrp->id = ret;
/* /*
* We're accessing css_set_count without locking css_set_lock here, * We're accessing css_set_count without locking css_set_rwsem here,
* but that's OK - it can only be increased by someone holding * but that's OK - it can only be increased by someone holding
* cgroup_lock, and that's us. The worst that can happen is that we * cgroup_lock, and that's us. The worst that can happen is that we
* have some link structures left over * have some link structures left over
@ -1451,10 +1452,10 @@ static int cgroup_setup_root(struct cgroupfs_root *root, unsigned long ss_mask)
* Link the top cgroup in this hierarchy into all the css_set * Link the top cgroup in this hierarchy into all the css_set
* objects. * objects.
*/ */
write_lock(&css_set_lock); down_write(&css_set_rwsem);
hash_for_each(css_set_table, i, cset, hlist) hash_for_each(css_set_table, i, cset, hlist)
link_css_set(&tmp_links, cset, root_cgrp); link_css_set(&tmp_links, cset, root_cgrp);
write_unlock(&css_set_lock); up_write(&css_set_rwsem);
BUG_ON(!list_empty(&root_cgrp->children)); BUG_ON(!list_empty(&root_cgrp->children));
BUG_ON(atomic_read(&root->nr_cgrps) != 1); BUG_ON(atomic_read(&root->nr_cgrps) != 1);
@ -1617,6 +1618,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
char *path = NULL; char *path = NULL;
mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_mutex);
down_read(&css_set_rwsem);
root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
@ -1629,6 +1631,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
path = buf; path = buf;
} }
up_read(&css_set_rwsem);
mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_mutex);
return path; return path;
} }
@ -1739,9 +1742,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
rcu_assign_pointer(tsk->cgroups, new_cset); rcu_assign_pointer(tsk->cgroups, new_cset);
task_unlock(tsk); task_unlock(tsk);
write_lock(&css_set_lock); down_write(&css_set_rwsem);
list_move(&tsk->cg_list, &new_cset->tasks); list_move(&tsk->cg_list, &new_cset->tasks);
write_unlock(&css_set_lock); up_write(&css_set_rwsem);
/* /*
* We just gained a reference on old_cset by taking it from the * We just gained a reference on old_cset by taking it from the
@ -1799,6 +1802,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
* already PF_EXITING could be freed from underneath us unless we * already PF_EXITING could be freed from underneath us unless we
* take an rcu_read_lock. * take an rcu_read_lock.
*/ */
down_read(&css_set_rwsem);
rcu_read_lock(); rcu_read_lock();
do { do {
struct task_and_cgroup ent; struct task_and_cgroup ent;
@ -1826,6 +1830,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
break; break;
} while_each_thread(leader, tsk); } while_each_thread(leader, tsk);
rcu_read_unlock(); rcu_read_unlock();
up_read(&css_set_rwsem);
/* remember the number of threads in the array for later. */ /* remember the number of threads in the array for later. */
group_size = i; group_size = i;
tset.tc_array = group; tset.tc_array = group;
@ -2003,7 +2008,11 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_mutex);
for_each_active_root(root) { for_each_active_root(root) {
struct cgroup *from_cgrp = task_cgroup_from_root(from, root); struct cgroup *from_cgrp;
down_read(&css_set_rwsem);
from_cgrp = task_cgroup_from_root(from, root);
up_read(&css_set_rwsem);
retval = cgroup_attach_task(from_cgrp, tsk, false); retval = cgroup_attach_task(from_cgrp, tsk, false);
if (retval) if (retval)
@ -2396,10 +2405,10 @@ static int cgroup_task_count(const struct cgroup *cgrp)
int count = 0; int count = 0;
struct cgrp_cset_link *link; struct cgrp_cset_link *link;
read_lock(&css_set_lock); down_read(&css_set_rwsem);
list_for_each_entry(link, &cgrp->cset_links, cset_link) list_for_each_entry(link, &cgrp->cset_links, cset_link)
count += atomic_read(&link->cset->refcount); count += atomic_read(&link->cset->refcount);
read_unlock(&css_set_lock); up_read(&css_set_rwsem);
return count; return count;
} }
@ -2630,12 +2639,12 @@ static void css_advance_task_iter(struct css_task_iter *it)
*/ */
void css_task_iter_start(struct cgroup_subsys_state *css, void css_task_iter_start(struct cgroup_subsys_state *css,
struct css_task_iter *it) struct css_task_iter *it)
__acquires(css_set_lock) __acquires(css_set_rwsem)
{ {
/* no one should try to iterate before mounting cgroups */ /* no one should try to iterate before mounting cgroups */
WARN_ON_ONCE(!use_task_css_set_links); WARN_ON_ONCE(!use_task_css_set_links);
read_lock(&css_set_lock); down_read(&css_set_rwsem);
it->origin_css = css; it->origin_css = css;
it->cset_link = &css->cgroup->cset_links; it->cset_link = &css->cgroup->cset_links;
@ -2683,9 +2692,9 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
* Finish task iteration started by css_task_iter_start(). * Finish task iteration started by css_task_iter_start().
*/ */
void css_task_iter_end(struct css_task_iter *it) void css_task_iter_end(struct css_task_iter *it)
__releases(css_set_lock) __releases(css_set_rwsem)
{ {
read_unlock(&css_set_lock); up_read(&css_set_rwsem);
} }
static inline int started_after_time(struct task_struct *t1, static inline int started_after_time(struct task_struct *t1,
@ -2735,7 +2744,7 @@ static inline int started_after(void *p1, void *p2)
* *
* @test may be NULL, meaning always true (select all tasks), which * @test may be NULL, meaning always true (select all tasks), which
* effectively duplicates css_task_iter_{start,next,end}() but does not * effectively duplicates css_task_iter_{start,next,end}() but does not
* lock css_set_lock for the call to @process. * lock css_set_rwsem for the call to @process.
* *
* It is guaranteed that @process will act on every task that is a member * It is guaranteed that @process will act on every task that is a member
* of @css for the duration of this call. This function may or may not * of @css for the duration of this call. This function may or may not
@ -3867,12 +3876,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&cgroup_mutex);
/* /*
* css_set_lock synchronizes access to ->cset_links and prevents * css_set_rwsem synchronizes access to ->cset_links and prevents
* @cgrp from being removed while __put_css_set() is in progress. * @cgrp from being removed while __put_css_set() is in progress.
*/ */
read_lock(&css_set_lock); down_read(&css_set_rwsem);
empty = list_empty(&cgrp->cset_links); empty = list_empty(&cgrp->cset_links);
read_unlock(&css_set_lock); up_read(&css_set_rwsem);
if (!empty) if (!empty)
return -EBUSY; return -EBUSY;
@ -4208,6 +4217,7 @@ int proc_cgroup_show(struct seq_file *m, void *v)
retval = 0; retval = 0;
mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_mutex);
down_read(&css_set_rwsem);
for_each_active_root(root) { for_each_active_root(root) {
struct cgroup_subsys *ss; struct cgroup_subsys *ss;
@ -4233,6 +4243,7 @@ int proc_cgroup_show(struct seq_file *m, void *v)
} }
out_unlock: out_unlock:
up_read(&css_set_rwsem);
mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_mutex);
put_task_struct(tsk); put_task_struct(tsk);
out_free: out_free:
@ -4328,12 +4339,12 @@ void cgroup_post_fork(struct task_struct *child)
* lock on fork. * lock on fork.
*/ */
if (use_task_css_set_links) { if (use_task_css_set_links) {
write_lock(&css_set_lock); down_write(&css_set_rwsem);
task_lock(child); task_lock(child);
if (list_empty(&child->cg_list)) if (list_empty(&child->cg_list))
list_add(&child->cg_list, &task_css_set(child)->tasks); list_add(&child->cg_list, &task_css_set(child)->tasks);
task_unlock(child); task_unlock(child);
write_unlock(&css_set_lock); up_write(&css_set_rwsem);
} }
/* /*
@ -4390,15 +4401,14 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
int i; int i;
/* /*
* Unlink from the css_set task list if necessary. * Unlink from the css_set task list if necessary. Optimistically
* Optimistically check cg_list before taking * check cg_list before taking css_set_rwsem.
* css_set_lock
*/ */
if (!list_empty(&tsk->cg_list)) { if (!list_empty(&tsk->cg_list)) {
write_lock(&css_set_lock); down_write(&css_set_rwsem);
if (!list_empty(&tsk->cg_list)) if (!list_empty(&tsk->cg_list))
list_del_init(&tsk->cg_list); list_del_init(&tsk->cg_list);
write_unlock(&css_set_lock); up_write(&css_set_rwsem);
} }
/* Reassign the task to the init_css_set. */ /* Reassign the task to the init_css_set. */
@ -4650,7 +4660,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
if (!name_buf) if (!name_buf)
return -ENOMEM; return -ENOMEM;
read_lock(&css_set_lock); down_read(&css_set_rwsem);
rcu_read_lock(); rcu_read_lock();
cset = rcu_dereference(current->cgroups); cset = rcu_dereference(current->cgroups);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
@ -4666,7 +4676,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
c->root->hierarchy_id, name); c->root->hierarchy_id, name);
} }
rcu_read_unlock(); rcu_read_unlock();
read_unlock(&css_set_lock); up_read(&css_set_rwsem);
kfree(name_buf); kfree(name_buf);
return 0; return 0;
} }
@ -4677,7 +4687,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
struct cgroup_subsys_state *css = seq_css(seq); struct cgroup_subsys_state *css = seq_css(seq);
struct cgrp_cset_link *link; struct cgrp_cset_link *link;
read_lock(&css_set_lock); down_read(&css_set_rwsem);
list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
struct css_set *cset = link->cset; struct css_set *cset = link->cset;
struct task_struct *task; struct task_struct *task;
@ -4693,7 +4703,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
} }
} }
} }
read_unlock(&css_set_lock); up_read(&css_set_rwsem);
return 0; return 0;
} }