cgroup: Changes for v6.6
* Per-cpu cpu usage stats are now tracked. This currently isn't printed out
in the cgroupfs interface and can only be accessed through e.g. BPF.
Should decide on a not-too-ugly way to show per-cpu stats in cgroupfs.
* cpuset received some cleanups and preparatory patches for the pending
cpus.exclusive patchset which will allow cpuset partitions to be created
below non-partition parents, which should ease the management of partition
cpusets.
* A lot of code and documentation cleanup patches.
* tools/testing/selftests/cgroup/test_cpuset.c is added. This causes trivial
conflicts in .gitignore and Makefile under the directory against
fe3b1bf19b ("selftests: cgroup: add test_zswap program"). They can be
resolved by keeping lines from both branches.
-----BEGIN PGP SIGNATURE-----
iIQEABYIACwWIQTfIjM1kS57o3GsC/uxYfJx3gVYGQUCZPENTg4cdGpAa2VybmVs
Lm9yZwAKCRCxYfJx3gVYGcyBAP44cHwpSFxXe3cehxAzb1l/2BZXtzU5l48OqUQd
MwHyrwEAm7+MTVAR2xOF4f+oVM9KWmKj7oV7Clpixl1S7hHyjwE=
=FCc9
-----END PGP SIGNATURE-----
Merge tag 'cgroup-for-6.6' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:
- Per-cpu cpu usage stats are now tracked
This currently isn't printed out in the cgroupfs interface and can
only be accessed through e.g. BPF. Should decide on a not-too-ugly
way to show per-cpu stats in cgroupfs
- cpuset received some cleanups and preparatory patches for the pending
cpus.exclusive patchset which will allow cpuset partitions to be
created below non-partition parents, which should ease the management
of partition cpusets
- A lot of code and documentation cleanup patches
- tools/testing/selftests/cgroup/test_cpuset.c added
* tag 'cgroup-for-6.6' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (32 commits)
cgroup: Avoid -Wstringop-overflow warnings
cgroup:namespace: Remove unused cgroup_namespaces_init()
cgroup/rstat: Record the cumulative per-cpu time of cgroup and its descendants
cgroup: clean up if condition in cgroup_pidlist_start()
cgroup: fix obsolete function name in cgroup_destroy_locked()
Documentation: cgroup-v2.rst: Correct number of stats entries
cgroup: fix obsolete function name above css_free_rwork_fn()
cgroup/cpuset: fix kernel-doc
cgroup: clean up printk()
cgroup: fix obsolete comment above cgroup_create()
docs: cgroup-v1: fix typo
docs: cgroup-v1: correct the term of Page Cache organization in inode
cgroup/misc: Store atomic64_t reads to u64
cgroup/misc: Change counters to be explicit 64bit types
cgroup/misc: update struct members descriptions
cgroup: remove cgrp->kn check in css_populate_dir()
cgroup: fix obsolete function name
cgroup: use cached local variable parent in for loop
cgroup: remove obsolete comment above struct cgroupstats
cgroup: put cgroup_tryget_css() inside CONFIG_CGROUP_SCHED
...
This commit is contained in: commit 7716f383a5
Documentation/admin-guide/cgroup-v1/memory.rst
@@ -195,11 +195,11 @@ are not accounted. We just account pages under usual VM management.
 RSS pages are accounted at page_fault unless they've already been accounted
 for earlier. A file page will be accounted for as Page Cache when it's
-inserted into inode (radix-tree). While it's mapped into the page tables of
+inserted into inode (xarray). While it's mapped into the page tables of
 processes, duplicate accounting is carefully avoided.

 An RSS page is unaccounted when it's fully unmapped. A PageCache page is
-unaccounted when it's removed from radix-tree. Even if RSS pages are fully
+unaccounted when it's removed from xarray. Even if RSS pages are fully
 unmapped (by kswapd), they may exist as SwapCache in the system until they
 are really freed. Such SwapCaches are also accounted.
 A swapped-in page is accounted after adding into swapcache.
@@ -907,7 +907,7 @@ experiences some pressure. In this situation, only group C will receive the
 notification, i.e. groups A and B will not receive it. This is done to avoid
 excessive "broadcasting" of messages, which disturbs the system and which is
 especially bad if we are low on memory or thrashing. Group B, will receive
-notification only if there are no event listers for group C.
+notification only if there are no event listeners for group C.

 There are three optional modes that specify different propagation behavior:
Documentation/admin-guide/cgroup-v2.rst
@@ -1045,7 +1045,7 @@ All time durations are in microseconds.
 	- user_usec
 	- system_usec

-	and the following three when the controller is enabled:
+	and the following five when the controller is enabled:

 	- nr_periods
 	- nr_throttled
MAINTAINERS
@@ -5255,6 +5255,8 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
 F:	Documentation/admin-guide/cgroup-v1/cpusets.rst
 F:	include/linux/cpuset.h
 F:	kernel/cgroup/cpuset.c
+F:	tools/testing/selftests/cgroup/test_cpuset.c
+F:	tools/testing/selftests/cgroup/test_cpuset_prs.sh

 CONTROL GROUP - MEMORY RESOURCE CONTROLLER (MEMCG)
 M:	Johannes Weiner <hannes@cmpxchg.org>
include/linux/cgroup-defs.h
@@ -341,6 +341,20 @@ struct cgroup_rstat_cpu {
 	 */
 	struct cgroup_base_stat last_bstat;

+	/*
+	 * This field is used to record the cumulative per-cpu time of
+	 * the cgroup and its descendants. Currently it can be read via
+	 * eBPF/drgn etc, and we are still trying to determine how to
+	 * expose it in the cgroupfs interface.
+	 */
+	struct cgroup_base_stat subtree_bstat;
+
+	/*
+	 * Snapshots at the last reading. These are used to calculate the
+	 * deltas to propagate to the per-cpu subtree_bstat.
+	 */
+	struct cgroup_base_stat last_subtree_bstat;
+
 	/*
 	 * Child cgroups with stat updates on this cpu since the last read
 	 * are linked on the parent's ->updated_children through
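As the new comment notes, subtree_bstat is currently reachable only from eBPF/drgn. Purely as an illustration (nothing in this merge adds such a helper), kernel code with access to the per-cpu structures could total it roughly as in the sketch below; note that cgroup_rstat_cpu() is a static helper inside kernel/cgroup/rstat.c, so this is a hypothetical consumer, not a usable API.

/*
 * Illustrative sketch only -- not part of this merge. Assumes kernel context
 * and visibility of cgroup_rstat_cpu(), which in reality is file-local to
 * kernel/cgroup/rstat.c; real consumers currently go through eBPF/drgn.
 */
static void sum_subtree_cputime(struct cgroup *cgrp, struct cgroup_base_stat *sum)
{
	int cpu;

	memset(sum, 0, sizeof(*sum));
	for_each_possible_cpu(cpu) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

		/* cumulative time of cgrp and its descendants on this CPU */
		sum->cputime.utime += rstatc->subtree_bstat.cputime.utime;
		sum->cputime.stime += rstatc->subtree_bstat.cputime.stime;
		sum->cputime.sum_exec_runtime += rstatc->subtree_bstat.cputime.sum_exec_runtime;
	}
}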
include/linux/misc_cgroup.h
@@ -31,17 +31,18 @@ struct misc_cg;
  * struct misc_res: Per cgroup per misc type resource
  * @max: Maximum limit on the resource.
  * @usage: Current usage of the resource.
- * @failed: True if charged failed for the resource in a cgroup.
+ * @events: Number of times, the resource limit exceeded.
  */
 struct misc_res {
-	unsigned long max;
-	atomic_long_t usage;
-	atomic_long_t events;
+	u64 max;
+	atomic64_t usage;
+	atomic64_t events;
 };

 /**
  * struct misc_cg - Miscellaneous controller's cgroup structure.
  * @css: cgroup subsys state object.
+ * @events_file: Handle for the misc resources events file.
  * @res: Array of misc resources usage in the cgroup.
  */
 struct misc_cg {
@@ -53,12 +54,10 @@ struct misc_cg {
 	struct misc_res res[MISC_CG_RES_TYPES];
 };

-unsigned long misc_cg_res_total_usage(enum misc_res_type type);
-int misc_cg_set_capacity(enum misc_res_type type, unsigned long capacity);
-int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
-		       unsigned long amount);
-void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg,
-		      unsigned long amount);
+u64 misc_cg_res_total_usage(enum misc_res_type type);
+int misc_cg_set_capacity(enum misc_res_type type, u64 capacity);
+int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount);
+void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount);

 /**
  * css_misc() - Get misc cgroup from the css.
@@ -99,27 +98,26 @@ static inline void put_misc_cg(struct misc_cg *cg)

 #else /* !CONFIG_CGROUP_MISC */

-static inline unsigned long misc_cg_res_total_usage(enum misc_res_type type)
+static inline u64 misc_cg_res_total_usage(enum misc_res_type type)
 {
 	return 0;
 }

-static inline int misc_cg_set_capacity(enum misc_res_type type,
-				       unsigned long capacity)
+static inline int misc_cg_set_capacity(enum misc_res_type type, u64 capacity)
 {
 	return 0;
 }

 static inline int misc_cg_try_charge(enum misc_res_type type,
 				     struct misc_cg *cg,
-				     unsigned long amount)
+				     u64 amount)
 {
 	return 0;
 }

 static inline void misc_cg_uncharge(enum misc_res_type type,
 				    struct misc_cg *cg,
-				    unsigned long amount)
+				    u64 amount)
 {
 }
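The prototypes above are the whole in-kernel surface of the misc controller, so the switch from unsigned long to u64 is largely mechanical for callers. A minimal, hypothetical caller might look like the sketch below; MISC_CG_RES_EXAMPLE and the 8 GiB capacity are invented for illustration (the existing resource types are the SEV/SEV-ES ASID counts), while the helper signatures are exactly the ones declared in the header above.

/*
 * Hypothetical caller of the misc controller after the u64 conversion.
 * MISC_CG_RES_EXAMPLE is a made-up resource type used only for this sketch.
 */
#include <linux/init.h>
#include <linux/misc_cgroup.h>

static int example_reserve(struct misc_cg *cg, u64 amount)
{
	/* -EBUSY when the cgroup limit or the registered capacity would be exceeded */
	return misc_cg_try_charge(MISC_CG_RES_EXAMPLE, cg, amount);
}

static void example_release(struct misc_cg *cg, u64 amount)
{
	misc_cg_uncharge(MISC_CG_RES_EXAMPLE, cg, amount);
}

static int __init example_init(void)
{
	/* advertise total capacity; values above 4G now also work on 32-bit builds */
	return misc_cg_set_capacity(MISC_CG_RES_EXAMPLE, 8ULL << 30);
}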
include/uapi/linux/cgroupstats.h
@@ -24,8 +24,6 @@
  * basis. This data is shared using taskstats.
  *
  * Most of these states are derived by looking at the task->state value
  * For the nr_io_wait state, a flag in the delay accounting structure
  * indicates that the task is waiting on IO
  *
  * Each member is aligned to a 8 byte boundary.
  */
kernel/cgroup/cgroup.c
@@ -431,7 +431,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
 			if (l->list[mid] == pid) {
 				index = mid;
 				break;
-			} else if (l->list[mid] <= pid)
+			} else if (l->list[mid] < pid)
 				index = mid + 1;
 			else
 				end = mid;
@ -492,28 +492,6 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
|
||||
return &cgrp->self;
|
||||
}
|
||||
|
||||
/**
|
||||
* cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
|
||||
* @cgrp: the cgroup of interest
|
||||
* @ss: the subsystem of interest
|
||||
*
|
||||
* Find and get @cgrp's css associated with @ss. If the css doesn't exist
|
||||
* or is offline, %NULL is returned.
|
||||
*/
|
||||
static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
|
||||
struct cgroup_subsys *ss)
|
||||
{
|
||||
struct cgroup_subsys_state *css;
|
||||
|
||||
rcu_read_lock();
|
||||
css = cgroup_css(cgrp, ss);
|
||||
if (css && !css_tryget_online(css))
|
||||
css = NULL;
|
||||
rcu_read_unlock();
|
||||
|
||||
return css;
|
||||
}
|
||||
|
||||
/**
|
||||
* cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
|
||||
* @cgrp: the cgroup of interest
|
||||
@ -679,7 +657,7 @@ EXPORT_SYMBOL_GPL(of_css);
|
||||
* @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
|
||||
* @cgrp: the target cgroup to iterate css's of
|
||||
*
|
||||
* Should be called under cgroup_[tree_]mutex.
|
||||
* Should be called under cgroup_mutex.
|
||||
*/
|
||||
#define for_each_css(css, ssid, cgrp) \
|
||||
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
|
||||
@ -929,7 +907,7 @@ static void css_set_move_task(struct task_struct *task,
|
||||
#define CSS_SET_HASH_BITS 7
|
||||
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
|
||||
|
||||
static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
|
||||
static unsigned long css_set_hash(struct cgroup_subsys_state **css)
|
||||
{
|
||||
unsigned long key = 0UL;
|
||||
struct cgroup_subsys *ss;
|
||||
@ -1070,7 +1048,7 @@ static bool compare_css_sets(struct css_set *cset,
|
||||
*/
|
||||
static struct css_set *find_existing_css_set(struct css_set *old_cset,
|
||||
struct cgroup *cgrp,
|
||||
struct cgroup_subsys_state *template[])
|
||||
struct cgroup_subsys_state **template)
|
||||
{
|
||||
struct cgroup_root *root = cgrp->root;
|
||||
struct cgroup_subsys *ss;
|
||||
@ -1736,7 +1714,7 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
|
||||
struct cftype *cfts, *failed_cfts;
|
||||
int ret;
|
||||
|
||||
if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
|
||||
if (css->flags & CSS_VISIBLE)
|
||||
return 0;
|
||||
|
||||
if (!css->ss) {
|
||||
@ -2499,7 +2477,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
|
||||
|
||||
/*
|
||||
* This function may be called both before and
|
||||
* after cgroup_taskset_migrate(). The two cases
|
||||
* after cgroup_migrate_execute(). The two cases
|
||||
* can be distinguished by looking at whether @cset
|
||||
* has its ->mg_dst_cset set.
|
||||
*/
|
||||
@ -3654,9 +3632,32 @@ static int cgroup_stat_show(struct seq_file *seq, void *v)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
|
||||
struct cgroup *cgrp, int ssid)
|
||||
#ifdef CONFIG_CGROUP_SCHED
|
||||
/**
|
||||
* cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
|
||||
* @cgrp: the cgroup of interest
|
||||
* @ss: the subsystem of interest
|
||||
*
|
||||
* Find and get @cgrp's css associated with @ss. If the css doesn't exist
|
||||
* or is offline, %NULL is returned.
|
||||
*/
|
||||
static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
|
||||
struct cgroup_subsys *ss)
|
||||
{
|
||||
struct cgroup_subsys_state *css;
|
||||
|
||||
rcu_read_lock();
|
||||
css = cgroup_css(cgrp, ss);
|
||||
if (css && !css_tryget_online(css))
|
||||
css = NULL;
|
||||
rcu_read_unlock();
|
||||
|
||||
return css;
|
||||
}
|
||||
|
||||
static int cgroup_extra_stat_show(struct seq_file *seq, int ssid)
|
||||
{
|
||||
struct cgroup *cgrp = seq_css(seq)->cgroup;
|
||||
struct cgroup_subsys *ss = cgroup_subsys[ssid];
|
||||
struct cgroup_subsys_state *css;
|
||||
int ret;
|
||||
@ -3672,15 +3673,15 @@ static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
|
||||
css_put(css);
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
static int cpu_stat_show(struct seq_file *seq, void *v)
|
||||
{
|
||||
struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
|
||||
int ret = 0;
|
||||
|
||||
cgroup_base_stat_cputime_show(seq);
|
||||
#ifdef CONFIG_CGROUP_SCHED
|
||||
ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
|
||||
ret = cgroup_extra_stat_show(seq, cpu_cgrp_id);
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
@ -4350,14 +4351,13 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int cgroup_rm_cftypes_locked(struct cftype *cfts)
|
||||
static void cgroup_rm_cftypes_locked(struct cftype *cfts)
|
||||
{
|
||||
lockdep_assert_held(&cgroup_mutex);
|
||||
|
||||
list_del(&cfts->node);
|
||||
cgroup_apply_cftypes(cfts, false);
|
||||
cgroup_exit_cftypes(cfts);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -4373,8 +4373,6 @@ static int cgroup_rm_cftypes_locked(struct cftype *cfts)
|
||||
*/
|
||||
int cgroup_rm_cftypes(struct cftype *cfts)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if (!cfts || cfts[0].name[0] == '\0')
|
||||
return 0;
|
||||
|
||||
@ -4382,9 +4380,9 @@ int cgroup_rm_cftypes(struct cftype *cfts)
|
||||
return -ENOENT;
|
||||
|
||||
cgroup_lock();
|
||||
ret = cgroup_rm_cftypes_locked(cfts);
|
||||
cgroup_rm_cftypes_locked(cfts);
|
||||
cgroup_unlock();
|
||||
return ret;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -5337,7 +5335,7 @@ static struct cftype cgroup_psi_files[] = {
|
||||
* RCU callback.
|
||||
*
|
||||
* 4. After the grace period, the css can be freed. Implemented in
|
||||
* css_free_work_fn().
|
||||
* css_free_rwork_fn().
|
||||
*
|
||||
* It is actually hairier because both step 2 and 4 require process context
|
||||
* and thus involve punting to css->destroy_work adding two additional
|
||||
@ -5581,8 +5579,7 @@ err_free_css:
|
||||
|
||||
/*
|
||||
* The returned cgroup is fully initialized including its control mask, but
|
||||
* it isn't associated with its kernfs_node and doesn't have the control
|
||||
* mask applied.
|
||||
* it doesn't have the control mask applied.
|
||||
*/
|
||||
static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
|
||||
umode_t mode)
|
||||
@ -5908,7 +5905,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
|
||||
/*
|
||||
* Mark @cgrp and the associated csets dead. The former prevents
|
||||
* further task migration and child creation by disabling
|
||||
* cgroup_lock_live_group(). The latter makes the csets ignored by
|
||||
* cgroup_kn_lock_live(). The latter makes the csets ignored by
|
||||
* the migration path.
|
||||
*/
|
||||
cgrp->self.flags &= ~CSS_ONLINE;
|
||||
@ -5930,7 +5927,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
|
||||
parent->nr_threaded_children--;
|
||||
|
||||
spin_lock_irq(&css_set_lock);
|
||||
for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
|
||||
for (tcgrp = parent; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
|
||||
tcgrp->nr_descendants--;
|
||||
tcgrp->nr_dying_descendants++;
|
||||
/*
|
||||
@@ -6123,8 +6120,8 @@ int __init cgroup_init(void)
 			continue;

 		if (cgroup1_ssid_disabled(ssid))
-			printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
-			       ss->name);
+			pr_info("Disabling %s control group subsystem in v1 mounts\n",
+				ss->name);

 		cgrp_dfl_root.subsys_mask |= 1 << ss->id;
@ -1230,7 +1230,7 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
|
||||
/*
|
||||
* Percpu kthreads in top_cpuset are ignored
|
||||
*/
|
||||
if ((task->flags & PF_KTHREAD) && kthread_is_per_cpu(task))
|
||||
if (kthread_is_per_cpu(task))
|
||||
continue;
|
||||
cpumask_andnot(new_cpus, possible_mask, cs->subparts_cpus);
|
||||
} else {
|
||||
@ -1255,7 +1255,7 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
|
||||
static void compute_effective_cpumask(struct cpumask *new_cpus,
|
||||
struct cpuset *cs, struct cpuset *parent)
|
||||
{
|
||||
if (parent->nr_subparts_cpus) {
|
||||
if (parent->nr_subparts_cpus && is_partition_valid(cs)) {
|
||||
cpumask_or(new_cpus, parent->effective_cpus,
|
||||
parent->subparts_cpus);
|
||||
cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
|
||||
@ -1277,6 +1277,52 @@ enum subparts_cmd {
|
||||
|
||||
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
|
||||
int turning_on);
|
||||
static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
|
||||
struct tmpmasks *tmp);
|
||||
|
||||
/*
|
||||
* Update partition exclusive flag
|
||||
*
|
||||
* Return: 0 if successful, an error code otherwise
|
||||
*/
|
||||
static int update_partition_exclusive(struct cpuset *cs, int new_prs)
|
||||
{
|
||||
bool exclusive = (new_prs > 0);
|
||||
|
||||
if (exclusive && !is_cpu_exclusive(cs)) {
|
||||
if (update_flag(CS_CPU_EXCLUSIVE, cs, 1))
|
||||
return PERR_NOTEXCL;
|
||||
} else if (!exclusive && is_cpu_exclusive(cs)) {
|
||||
/* Turning off CS_CPU_EXCLUSIVE will not return error */
|
||||
update_flag(CS_CPU_EXCLUSIVE, cs, 0);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Update partition load balance flag and/or rebuild sched domain
|
||||
*
|
||||
* Changing load balance flag will automatically call
|
||||
* rebuild_sched_domains_locked().
|
||||
*/
|
||||
static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
|
||||
{
|
||||
int new_prs = cs->partition_root_state;
|
||||
bool new_lb = (new_prs != PRS_ISOLATED);
|
||||
bool rebuild_domains = (new_prs > 0) || (old_prs > 0);
|
||||
|
||||
if (new_lb != !!is_sched_load_balance(cs)) {
|
||||
rebuild_domains = true;
|
||||
if (new_lb)
|
||||
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
|
||||
else
|
||||
clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
|
||||
}
|
||||
|
||||
if (rebuild_domains)
|
||||
rebuild_sched_domains_locked();
|
||||
}
|
||||
|
||||
/**
|
||||
* update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
|
||||
* @cs: The cpuset that requests change in partition root state
|
||||
@ -1336,8 +1382,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
|
||||
return is_partition_invalid(parent)
|
||||
? PERR_INVPARENT : PERR_NOTPART;
|
||||
}
|
||||
if ((newmask && cpumask_empty(newmask)) ||
|
||||
(!newmask && cpumask_empty(cs->cpus_allowed)))
|
||||
if (!newmask && cpumask_empty(cs->cpus_allowed))
|
||||
return PERR_CPUSEMPTY;
|
||||
|
||||
/*
|
||||
@ -1403,11 +1448,16 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
|
||||
cpumask_and(tmp->addmask, newmask, parent->cpus_allowed);
|
||||
adding = cpumask_andnot(tmp->addmask, tmp->addmask,
|
||||
parent->subparts_cpus);
|
||||
/*
|
||||
* Empty cpumask is not allowed
|
||||
*/
|
||||
if (cpumask_empty(newmask)) {
|
||||
part_error = PERR_CPUSEMPTY;
|
||||
/*
|
||||
* Make partition invalid if parent's effective_cpus could
|
||||
* become empty and there are tasks in the parent.
|
||||
*/
|
||||
if (adding &&
|
||||
} else if (adding &&
|
||||
cpumask_subset(parent->effective_cpus, tmp->addmask) &&
|
||||
!cpumask_intersects(tmp->delmask, cpu_active_mask) &&
|
||||
partition_is_populated(parent, cs)) {
|
||||
@ -1480,14 +1530,13 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
|
||||
|
||||
/*
|
||||
* Transitioning between invalid to valid or vice versa may require
|
||||
* changing CS_CPU_EXCLUSIVE and CS_SCHED_LOAD_BALANCE.
|
||||
* changing CS_CPU_EXCLUSIVE.
|
||||
*/
|
||||
if (old_prs != new_prs) {
|
||||
if (is_prs_invalid(old_prs) && !is_cpu_exclusive(cs) &&
|
||||
(update_flag(CS_CPU_EXCLUSIVE, cs, 1) < 0))
|
||||
return PERR_NOTEXCL;
|
||||
if (is_prs_invalid(new_prs) && is_cpu_exclusive(cs))
|
||||
update_flag(CS_CPU_EXCLUSIVE, cs, 0);
|
||||
int err = update_partition_exclusive(cs, new_prs);
|
||||
|
||||
if (err)
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1520,23 +1569,33 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
|
||||
|
||||
spin_unlock_irq(&callback_lock);
|
||||
|
||||
if (adding || deleting)
|
||||
if (adding || deleting) {
|
||||
update_tasks_cpumask(parent, tmp->addmask);
|
||||
if (parent->child_ecpus_count)
|
||||
update_sibling_cpumasks(parent, cs, tmp);
|
||||
}
|
||||
|
||||
/*
|
||||
* Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary.
|
||||
* rebuild_sched_domains_locked() may be called.
|
||||
* For partcmd_update without newmask, it is being called from
|
||||
* cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken.
|
||||
* Update the load balance flag and scheduling domain if
|
||||
* cpus_read_trylock() is successful.
|
||||
*/
|
||||
if (old_prs != new_prs) {
|
||||
if (old_prs == PRS_ISOLATED)
|
||||
update_flag(CS_SCHED_LOAD_BALANCE, cs, 1);
|
||||
else if (new_prs == PRS_ISOLATED)
|
||||
update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
|
||||
if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) {
|
||||
update_partition_sd_lb(cs, old_prs);
|
||||
cpus_read_unlock();
|
||||
}
|
||||
|
||||
notify_partition_change(cs, old_prs);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* update_cpumasks_hier() flags
|
||||
*/
|
||||
#define HIER_CHECKALL 0x01 /* Check all cpusets with no skipping */
|
||||
#define HIER_NO_SD_REBUILD 0x02 /* Don't rebuild sched domains */
|
||||
|
||||
/*
|
||||
* update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
|
||||
* @cs: the cpuset to consider
|
||||
@ -1551,7 +1610,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
|
||||
* Called with cpuset_mutex held
|
||||
*/
|
||||
static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
|
||||
bool force)
|
||||
int flags)
|
||||
{
|
||||
struct cpuset *cp;
|
||||
struct cgroup_subsys_state *pos_css;
|
||||
@ -1588,11 +1647,16 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
|
||||
}
|
||||
|
||||
/*
|
||||
* Skip the whole subtree if the cpumask remains the same
|
||||
* and has no partition root state and force flag not set.
|
||||
* Skip the whole subtree if
|
||||
* 1) the cpumask remains the same,
|
||||
* 2) has no partition root state,
|
||||
* 3) HIER_CHECKALL flag not set, and
|
||||
* 4) for v2 load balance state same as its parent.
|
||||
*/
|
||||
if (!cp->partition_root_state && !force &&
|
||||
cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
|
||||
if (!cp->partition_root_state && !(flags & HIER_CHECKALL) &&
|
||||
cpumask_equal(tmp->new_cpus, cp->effective_cpus) &&
|
||||
(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
|
||||
(is_sched_load_balance(parent) == is_sched_load_balance(cp)))) {
|
||||
pos_css = css_rightmost_descendant(pos_css);
|
||||
continue;
|
||||
}
|
||||
@ -1675,6 +1739,20 @@ update_parent_subparts:
|
||||
|
||||
update_tasks_cpumask(cp, tmp->new_cpus);
|
||||
|
||||
/*
|
||||
* On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE
|
||||
* from parent if current cpuset isn't a valid partition root
|
||||
* and their load balance states differ.
|
||||
*/
|
||||
if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
|
||||
!is_partition_valid(cp) &&
|
||||
(is_sched_load_balance(parent) != is_sched_load_balance(cp))) {
|
||||
if (is_sched_load_balance(parent))
|
||||
set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
|
||||
else
|
||||
clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
|
||||
}
|
||||
|
||||
/*
|
||||
* On legacy hierarchy, if the effective cpumask of any non-
|
||||
* empty cpuset is changed, we need to rebuild sched domains.
|
||||
@ -1692,7 +1770,7 @@ update_parent_subparts:
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
if (need_rebuild_sched_domains)
|
||||
if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD))
|
||||
rebuild_sched_domains_locked();
|
||||
}
|
||||
|
||||
@ -1716,7 +1794,9 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
|
||||
* to use the right effective_cpus value.
|
||||
*
|
||||
* The update_cpumasks_hier() function may sleep. So we have to
|
||||
* release the RCU read lock before calling it.
|
||||
* release the RCU read lock before calling it. HIER_NO_SD_REBUILD
|
||||
* flag is used to suppress rebuild of sched domains as the callers
|
||||
* will take care of that.
|
||||
*/
|
||||
rcu_read_lock();
|
||||
cpuset_for_each_child(sibling, pos_css, parent) {
|
||||
@ -1728,7 +1808,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
|
||||
continue;
|
||||
|
||||
rcu_read_unlock();
|
||||
update_cpumasks_hier(sibling, tmp, false);
|
||||
update_cpumasks_hier(sibling, tmp, HIER_NO_SD_REBUILD);
|
||||
rcu_read_lock();
|
||||
css_put(&sibling->css);
|
||||
}
|
||||
@ -1747,6 +1827,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
|
||||
int retval;
|
||||
struct tmpmasks tmp;
|
||||
bool invalidate = false;
|
||||
int old_prs = cs->partition_root_state;
|
||||
|
||||
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
|
||||
if (cs == &top_cpuset)
|
||||
@ -1774,18 +1855,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
|
||||
if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
|
||||
return 0;
|
||||
|
||||
#ifdef CONFIG_CPUMASK_OFFSTACK
|
||||
/*
|
||||
* Use the cpumasks in trialcs for tmpmasks when they are pointers
|
||||
* to allocated cpumasks.
|
||||
*
|
||||
* Note that update_parent_subparts_cpumask() uses only addmask &
|
||||
* delmask, but not new_cpus.
|
||||
*/
|
||||
tmp.addmask = trialcs->subparts_cpus;
|
||||
tmp.delmask = trialcs->effective_cpus;
|
||||
tmp.new_cpus = NULL;
|
||||
#endif
|
||||
if (alloc_cpumasks(NULL, &tmp))
|
||||
return -ENOMEM;
|
||||
|
||||
retval = validate_change(cs, trialcs);
|
||||
|
||||
@ -1814,7 +1885,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
|
||||
retval = 0;
|
||||
}
|
||||
if (retval < 0)
|
||||
return retval;
|
||||
goto out_free;
|
||||
|
||||
if (cs->partition_root_state) {
|
||||
if (invalidate)
|
||||
@ -1849,13 +1920,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
|
||||
}
|
||||
spin_unlock_irq(&callback_lock);
|
||||
|
||||
#ifdef CONFIG_CPUMASK_OFFSTACK
|
||||
/* Now trialcs->cpus_allowed is available */
|
||||
tmp.new_cpus = trialcs->cpus_allowed;
|
||||
#endif
|
||||
|
||||
/* effective_cpus will be updated here */
|
||||
update_cpumasks_hier(cs, &tmp, false);
|
||||
update_cpumasks_hier(cs, &tmp, 0);
|
||||
|
||||
if (cs->partition_root_state) {
|
||||
struct cpuset *parent = parent_cs(cs);
|
||||
@ -1866,7 +1932,12 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
|
||||
*/
|
||||
if (parent->child_ecpus_count)
|
||||
update_sibling_cpumasks(parent, cs, &tmp);
|
||||
|
||||
/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains */
|
||||
update_partition_sd_lb(cs, old_prs);
|
||||
}
|
||||
out_free:
|
||||
free_cpumasks(NULL, &tmp);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -2242,7 +2313,6 @@ out:
|
||||
static int update_prstate(struct cpuset *cs, int new_prs)
|
||||
{
|
||||
int err = PERR_NONE, old_prs = cs->partition_root_state;
|
||||
bool sched_domain_rebuilt = false;
|
||||
struct cpuset *parent = parent_cs(cs);
|
||||
struct tmpmasks tmpmask;
|
||||
|
||||
@ -2261,45 +2331,26 @@ static int update_prstate(struct cpuset *cs, int new_prs)
|
||||
if (alloc_cpumasks(NULL, &tmpmask))
|
||||
return -ENOMEM;
|
||||
|
||||
err = update_partition_exclusive(cs, new_prs);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
if (!old_prs) {
|
||||
/*
|
||||
* Turning on partition root requires setting the
|
||||
* CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
|
||||
* cannot be empty.
|
||||
* cpus_allowed cannot be empty.
|
||||
*/
|
||||
if (cpumask_empty(cs->cpus_allowed)) {
|
||||
err = PERR_CPUSEMPTY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
|
||||
if (err) {
|
||||
err = PERR_NOTEXCL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
err = update_parent_subparts_cpumask(cs, partcmd_enable,
|
||||
NULL, &tmpmask);
|
||||
if (err) {
|
||||
update_flag(CS_CPU_EXCLUSIVE, cs, 0);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (new_prs == PRS_ISOLATED) {
|
||||
/*
|
||||
* Disable the load balance flag should not return an
|
||||
* error unless the system is running out of memory.
|
||||
*/
|
||||
update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
|
||||
sched_domain_rebuilt = true;
|
||||
}
|
||||
} else if (old_prs && new_prs) {
|
||||
/*
|
||||
* A change in load balance state only, no change in cpumasks.
|
||||
*/
|
||||
update_flag(CS_SCHED_LOAD_BALANCE, cs, (new_prs != PRS_ISOLATED));
|
||||
sched_domain_rebuilt = true;
|
||||
goto out; /* Sched domain is rebuilt in update_flag() */
|
||||
;
|
||||
} else {
|
||||
/*
|
||||
* Switching back to member is always allowed even if it
|
||||
@ -2318,40 +2369,31 @@ static int update_prstate(struct cpuset *cs, int new_prs)
|
||||
compute_effective_cpumask(cs->effective_cpus, cs, parent);
|
||||
spin_unlock_irq(&callback_lock);
|
||||
}
|
||||
|
||||
/* Turning off CS_CPU_EXCLUSIVE will not return error */
|
||||
update_flag(CS_CPU_EXCLUSIVE, cs, 0);
|
||||
|
||||
if (!is_sched_load_balance(cs)) {
|
||||
/* Make sure load balance is on */
|
||||
update_flag(CS_SCHED_LOAD_BALANCE, cs, 1);
|
||||
sched_domain_rebuilt = true;
|
||||
}
|
||||
}
|
||||
|
||||
update_tasks_cpumask(parent, tmpmask.new_cpus);
|
||||
|
||||
if (parent->child_ecpus_count)
|
||||
update_sibling_cpumasks(parent, cs, &tmpmask);
|
||||
|
||||
if (!sched_domain_rebuilt)
|
||||
rebuild_sched_domains_locked();
|
||||
out:
|
||||
/*
|
||||
* Make partition invalid if an error happen
|
||||
* Make partition invalid & disable CS_CPU_EXCLUSIVE if an error
|
||||
* happens.
|
||||
*/
|
||||
if (err)
|
||||
if (err) {
|
||||
new_prs = -new_prs;
|
||||
update_partition_exclusive(cs, new_prs);
|
||||
}
|
||||
|
||||
spin_lock_irq(&callback_lock);
|
||||
cs->partition_root_state = new_prs;
|
||||
WRITE_ONCE(cs->prs_err, err);
|
||||
spin_unlock_irq(&callback_lock);
|
||||
|
||||
/*
|
||||
* Update child cpusets, if present.
|
||||
* Force update if switching back to member.
|
||||
*/
|
||||
if (!list_empty(&cs->css.children))
|
||||
update_cpumasks_hier(cs, &tmpmask, !new_prs);
|
||||
update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0);
|
||||
|
||||
/* Update sched domains and load balance flag */
|
||||
update_partition_sd_lb(cs, old_prs);
|
||||
|
||||
notify_partition_change(cs, old_prs);
|
||||
free_cpumasks(NULL, &tmpmask);
|
||||
@ -2487,6 +2529,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
|
||||
struct cgroup_subsys_state *css;
|
||||
struct cpuset *cs, *oldcs;
|
||||
struct task_struct *task;
|
||||
bool cpus_updated, mems_updated;
|
||||
int ret;
|
||||
|
||||
/* used later by cpuset_attach() */
|
||||
@ -2501,13 +2544,25 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
|
||||
cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);
|
||||
mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
|
||||
|
||||
cgroup_taskset_for_each(task, css, tset) {
|
||||
ret = task_can_attach(task);
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
ret = security_task_setscheduler(task);
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
|
||||
/*
|
||||
* Skip rights over task check in v2 when nothing changes,
|
||||
* migration permission derives from hierarchy ownership in
|
||||
* cgroup_procs_write_permission()).
|
||||
*/
|
||||
if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
|
||||
(cpus_updated || mems_updated)) {
|
||||
ret = security_task_setscheduler(task);
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
if (dl_task(task)) {
|
||||
cs->nr_migrate_dl_tasks++;
|
||||
@ -3222,6 +3277,14 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
|
||||
cs->use_parent_ecpus = true;
|
||||
parent->child_ecpus_count++;
|
||||
}
|
||||
|
||||
/*
|
||||
* For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
|
||||
*/
|
||||
if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
|
||||
!is_sched_load_balance(parent))
|
||||
clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
|
||||
|
||||
spin_unlock_irq(&callback_lock);
|
||||
|
||||
if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
|
||||
@ -3521,17 +3584,16 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
|
||||
is_empty = cpumask_empty(cs->cpus_allowed) ||
|
||||
nodes_empty(cs->mems_allowed);
|
||||
|
||||
mutex_unlock(&cpuset_mutex);
|
||||
|
||||
/*
|
||||
* Move tasks to the nearest ancestor with execution resources,
|
||||
* This is full cgroup operation which will also call back into
|
||||
* cpuset. Should be done outside any lock.
|
||||
*/
|
||||
if (is_empty)
|
||||
if (is_empty) {
|
||||
mutex_unlock(&cpuset_mutex);
|
||||
remove_tasks_in_empty_cpuset(cs);
|
||||
|
||||
mutex_lock(&cpuset_mutex);
|
||||
mutex_lock(&cpuset_mutex);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
@ -3691,6 +3753,7 @@ unlock:
|
||||
|
||||
/**
|
||||
* cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
|
||||
* @work: unused
|
||||
*
|
||||
* This function is called after either CPU or memory configuration has
|
||||
* changed and updates cpuset accordingly. The top_cpuset is always
|
||||
@ -4073,6 +4136,7 @@ bool cpuset_node_allowed(int node, gfp_t gfp_mask)
|
||||
|
||||
/**
|
||||
* cpuset_spread_node() - On which node to begin search for a page
|
||||
* @rotor: round robin rotor
|
||||
*
|
||||
* If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
|
||||
* tasks in a cpuset with is_spread_page or is_spread_slab set),
|
||||
|
@ -14,7 +14,7 @@
|
||||
#include <linux/misc_cgroup.h>
|
||||
|
||||
#define MAX_STR "max"
|
||||
#define MAX_NUM ULONG_MAX
|
||||
#define MAX_NUM U64_MAX
|
||||
|
||||
/* Miscellaneous res name, keep it in sync with enum misc_res_type */
|
||||
static const char *const misc_res_name[] = {
|
||||
@ -37,7 +37,7 @@ static struct misc_cg root_cg;
|
||||
* more than the actual capacity. We are using Limits resource distribution
|
||||
* model of cgroup for miscellaneous controller.
|
||||
*/
|
||||
static unsigned long misc_res_capacity[MISC_CG_RES_TYPES];
|
||||
static u64 misc_res_capacity[MISC_CG_RES_TYPES];
|
||||
|
||||
/**
|
||||
* parent_misc() - Get the parent of the passed misc cgroup.
|
||||
@ -74,10 +74,10 @@ static inline bool valid_type(enum misc_res_type type)
|
||||
* Context: Any context.
|
||||
* Return: Current total usage of the resource.
|
||||
*/
|
||||
unsigned long misc_cg_res_total_usage(enum misc_res_type type)
|
||||
u64 misc_cg_res_total_usage(enum misc_res_type type)
|
||||
{
|
||||
if (valid_type(type))
|
||||
return atomic_long_read(&root_cg.res[type].usage);
|
||||
return atomic64_read(&root_cg.res[type].usage);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -95,7 +95,7 @@ EXPORT_SYMBOL_GPL(misc_cg_res_total_usage);
|
||||
* * %0 - Successfully registered the capacity.
|
||||
* * %-EINVAL - If @type is invalid.
|
||||
*/
|
||||
int misc_cg_set_capacity(enum misc_res_type type, unsigned long capacity)
|
||||
int misc_cg_set_capacity(enum misc_res_type type, u64 capacity)
|
||||
{
|
||||
if (!valid_type(type))
|
||||
return -EINVAL;
|
||||
@ -114,9 +114,9 @@ EXPORT_SYMBOL_GPL(misc_cg_set_capacity);
|
||||
* Context: Any context.
|
||||
*/
|
||||
static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg,
|
||||
unsigned long amount)
|
||||
u64 amount)
|
||||
{
|
||||
WARN_ONCE(atomic_long_add_negative(-amount, &cg->res[type].usage),
|
||||
WARN_ONCE(atomic64_add_negative(-amount, &cg->res[type].usage),
|
||||
"misc cgroup resource %s became less than 0",
|
||||
misc_res_name[type]);
|
||||
}
|
||||
@ -137,13 +137,12 @@ static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg,
|
||||
* * -EBUSY - If max limit will be crossed or total usage will be more than the
|
||||
* capacity.
|
||||
*/
|
||||
int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
|
||||
unsigned long amount)
|
||||
int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount)
|
||||
{
|
||||
struct misc_cg *i, *j;
|
||||
int ret;
|
||||
struct misc_res *res;
|
||||
int new_usage;
|
||||
u64 new_usage;
|
||||
|
||||
if (!(valid_type(type) && cg && READ_ONCE(misc_res_capacity[type])))
|
||||
return -EINVAL;
|
||||
@ -154,7 +153,7 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
|
||||
for (i = cg; i; i = parent_misc(i)) {
|
||||
res = &i->res[type];
|
||||
|
||||
new_usage = atomic_long_add_return(amount, &res->usage);
|
||||
new_usage = atomic64_add_return(amount, &res->usage);
|
||||
if (new_usage > READ_ONCE(res->max) ||
|
||||
new_usage > READ_ONCE(misc_res_capacity[type])) {
|
||||
ret = -EBUSY;
|
||||
@ -165,7 +164,7 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
|
||||
|
||||
err_charge:
|
||||
for (j = i; j; j = parent_misc(j)) {
|
||||
atomic_long_inc(&j->res[type].events);
|
||||
atomic64_inc(&j->res[type].events);
|
||||
cgroup_file_notify(&j->events_file);
|
||||
}
|
||||
|
||||
@ -184,8 +183,7 @@ EXPORT_SYMBOL_GPL(misc_cg_try_charge);
|
||||
*
|
||||
* Context: Any context.
|
||||
*/
|
||||
void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg,
|
||||
unsigned long amount)
|
||||
void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount)
|
||||
{
|
||||
struct misc_cg *i;
|
||||
|
||||
@ -209,7 +207,7 @@ static int misc_cg_max_show(struct seq_file *sf, void *v)
|
||||
{
|
||||
int i;
|
||||
struct misc_cg *cg = css_misc(seq_css(sf));
|
||||
unsigned long max;
|
||||
u64 max;
|
||||
|
||||
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
|
||||
if (READ_ONCE(misc_res_capacity[i])) {
|
||||
@ -217,7 +215,7 @@ static int misc_cg_max_show(struct seq_file *sf, void *v)
|
||||
if (max == MAX_NUM)
|
||||
seq_printf(sf, "%s max\n", misc_res_name[i]);
|
||||
else
|
||||
seq_printf(sf, "%s %lu\n", misc_res_name[i],
|
||||
seq_printf(sf, "%s %llu\n", misc_res_name[i],
|
||||
max);
|
||||
}
|
||||
}
|
||||
@ -241,13 +239,13 @@ static int misc_cg_max_show(struct seq_file *sf, void *v)
|
||||
* Return:
|
||||
* * >= 0 - Number of bytes processed in the input.
|
||||
* * -EINVAL - If buf is not valid.
|
||||
* * -ERANGE - If number is bigger than the unsigned long capacity.
|
||||
* * -ERANGE - If number is bigger than the u64 capacity.
|
||||
*/
|
||||
static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
|
||||
size_t nbytes, loff_t off)
|
||||
{
|
||||
struct misc_cg *cg;
|
||||
unsigned long max;
|
||||
u64 max;
|
||||
int ret = 0, i;
|
||||
enum misc_res_type type = MISC_CG_RES_TYPES;
|
||||
char *token;
|
||||
@ -271,7 +269,7 @@ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
|
||||
if (!strcmp(MAX_STR, buf)) {
|
||||
max = MAX_NUM;
|
||||
} else {
|
||||
ret = kstrtoul(buf, 0, &max);
|
||||
ret = kstrtou64(buf, 0, &max);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
@ -297,13 +295,13 @@ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
|
||||
static int misc_cg_current_show(struct seq_file *sf, void *v)
|
||||
{
|
||||
int i;
|
||||
unsigned long usage;
|
||||
u64 usage;
|
||||
struct misc_cg *cg = css_misc(seq_css(sf));
|
||||
|
||||
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
|
||||
usage = atomic_long_read(&cg->res[i].usage);
|
||||
usage = atomic64_read(&cg->res[i].usage);
|
||||
if (READ_ONCE(misc_res_capacity[i]) || usage)
|
||||
seq_printf(sf, "%s %lu\n", misc_res_name[i], usage);
|
||||
seq_printf(sf, "%s %llu\n", misc_res_name[i], usage);
|
||||
}
|
||||
|
||||
return 0;
|
||||
@ -322,12 +320,12 @@ static int misc_cg_current_show(struct seq_file *sf, void *v)
|
||||
static int misc_cg_capacity_show(struct seq_file *sf, void *v)
|
||||
{
|
||||
int i;
|
||||
unsigned long cap;
|
||||
u64 cap;
|
||||
|
||||
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
|
||||
cap = READ_ONCE(misc_res_capacity[i]);
|
||||
if (cap)
|
||||
seq_printf(sf, "%s %lu\n", misc_res_name[i], cap);
|
||||
seq_printf(sf, "%s %llu\n", misc_res_name[i], cap);
|
||||
}
|
||||
|
||||
return 0;
|
||||
@ -336,12 +334,13 @@ static int misc_cg_capacity_show(struct seq_file *sf, void *v)
|
||||
static int misc_events_show(struct seq_file *sf, void *v)
|
||||
{
|
||||
struct misc_cg *cg = css_misc(seq_css(sf));
|
||||
unsigned long events, i;
|
||||
u64 events;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
|
||||
events = atomic_long_read(&cg->res[i].events);
|
||||
events = atomic64_read(&cg->res[i].events);
|
||||
if (READ_ONCE(misc_res_capacity[i]) || events)
|
||||
seq_printf(sf, "%s.max %lu\n", misc_res_name[i], events);
|
||||
seq_printf(sf, "%s.max %llu\n", misc_res_name[i], events);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@ -397,7 +396,7 @@ misc_cg_alloc(struct cgroup_subsys_state *parent_css)
|
||||
|
||||
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
|
||||
WRITE_ONCE(cg->res[i].max, MAX_NUM);
|
||||
atomic_long_set(&cg->res[i].usage, 0);
|
||||
atomic64_set(&cg->res[i].usage, 0);
|
||||
}
|
||||
|
||||
return &cg->css;
|
||||
|
kernel/cgroup/namespace.c
@@ -149,9 +149,3 @@ const struct proc_ns_operations cgroupns_operations = {
 	.install	= cgroupns_install,
 	.owner		= cgroupns_owner,
 };
-
-static __init int cgroup_namespaces_init(void)
-{
-	return 0;
-}
-subsys_initcall(cgroup_namespaces_init);
kernel/cgroup/rstat.c
@@ -344,6 +344,7 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
 {
 	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
 	struct cgroup *parent = cgroup_parent(cgrp);
+	struct cgroup_rstat_cpu *prstatc;
 	struct cgroup_base_stat delta;
 	unsigned seq;

@@ -357,17 +358,24 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
 		delta = rstatc->bstat;
 	} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));

-	/* propagate percpu delta to global */
+	/* propagate per-cpu delta to cgroup and per-cpu global statistics */
 	cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
 	cgroup_base_stat_add(&cgrp->bstat, &delta);
 	cgroup_base_stat_add(&rstatc->last_bstat, &delta);
+	cgroup_base_stat_add(&rstatc->subtree_bstat, &delta);

-	/* propagate global delta to parent (unless that's root) */
+	/* propagate cgroup and per-cpu global delta to parent (unless that's root) */
 	if (cgroup_parent(parent)) {
 		delta = cgrp->bstat;
 		cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
 		cgroup_base_stat_add(&parent->bstat, &delta);
 		cgroup_base_stat_add(&cgrp->last_bstat, &delta);
+
+		delta = rstatc->subtree_bstat;
+		prstatc = cgroup_rstat_cpu(parent, cpu);
+		cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat);
+		cgroup_base_stat_add(&prstatc->subtree_bstat, &delta);
+		cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta);
 	}
 }
tools/testing/selftests/cgroup/.gitignore
@@ -5,5 +5,6 @@ test_freezer
 test_kmem
 test_kill
 test_cpu
+test_cpuset
 test_zswap
 wait_inotify
tools/testing/selftests/cgroup/Makefile
@@ -12,6 +12,7 @@ TEST_GEN_PROGS += test_core
 TEST_GEN_PROGS += test_freezer
 TEST_GEN_PROGS += test_kill
 TEST_GEN_PROGS += test_cpu
+TEST_GEN_PROGS += test_cpuset
 TEST_GEN_PROGS += test_zswap

 LOCAL_HDRS += $(selfdir)/clone3/clone3_selftests.h $(selfdir)/pidfd/pidfd.h
@@ -24,4 +25,5 @@ $(OUTPUT)/test_core: cgroup_util.c
 $(OUTPUT)/test_freezer: cgroup_util.c
 $(OUTPUT)/test_kill: cgroup_util.c
 $(OUTPUT)/test_cpu: cgroup_util.c
+$(OUTPUT)/test_cpuset: cgroup_util.c
 $(OUTPUT)/test_zswap: cgroup_util.c
tools/testing/selftests/cgroup/cgroup_util.c
@@ -286,6 +286,8 @@ int cg_destroy(const char *cgroup)
 {
 	int ret;

+	if (!cgroup)
+		return 0;
 retry:
 	ret = rmdir(cgroup);
 	if (ret && errno == EBUSY) {
tools/testing/selftests/cgroup/cgroup_util.h
@@ -11,6 +11,8 @@
 #define USEC_PER_SEC 1000000L
 #define NSEC_PER_SEC 1000000000L

+#define TEST_UID 65534 /* usually nobody, any !root is fine */
+
 /*
  * Checks if two given values differ by less than err% of their sum.
  */
tools/testing/selftests/cgroup/test_core.c
@@ -683,7 +683,7 @@ cleanup:
  */
static int test_cgcore_lesser_euid_open(const char *root)
{
-	const uid_t test_euid = 65534;	/* usually nobody, any !root is fine */
+	const uid_t test_euid = TEST_UID;
 	int ret = KSFT_FAIL;
 	char *cg_test_a = NULL, *cg_test_b = NULL;
 	char *cg_test_a_procs = NULL, *cg_test_b_procs = NULL;
tools/testing/selftests/cgroup/test_cpuset.c (new file, 275 lines added)
@ -0,0 +1,275 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include <linux/limits.h>
|
||||
#include <signal.h>
|
||||
|
||||
#include "../kselftest.h"
|
||||
#include "cgroup_util.h"
|
||||
|
||||
static int idle_process_fn(const char *cgroup, void *arg)
|
||||
{
|
||||
(void)pause();
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int do_migration_fn(const char *cgroup, void *arg)
|
||||
{
|
||||
int object_pid = (int)(size_t)arg;
|
||||
|
||||
if (setuid(TEST_UID))
|
||||
return EXIT_FAILURE;
|
||||
|
||||
// XXX checking /proc/$pid/cgroup would be quicker than wait
|
||||
if (cg_enter(cgroup, object_pid) ||
|
||||
cg_wait_for_proc_count(cgroup, 1))
|
||||
return EXIT_FAILURE;
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
||||
static int do_controller_fn(const char *cgroup, void *arg)
|
||||
{
|
||||
const char *child = cgroup;
|
||||
const char *parent = arg;
|
||||
|
||||
if (setuid(TEST_UID))
|
||||
return EXIT_FAILURE;
|
||||
|
||||
if (!cg_read_strstr(child, "cgroup.controllers", "cpuset"))
|
||||
return EXIT_FAILURE;
|
||||
|
||||
if (cg_write(parent, "cgroup.subtree_control", "+cpuset"))
|
||||
return EXIT_FAILURE;
|
||||
|
||||
if (cg_read_strstr(child, "cgroup.controllers", "cpuset"))
|
||||
return EXIT_FAILURE;
|
||||
|
||||
if (cg_write(parent, "cgroup.subtree_control", "-cpuset"))
|
||||
return EXIT_FAILURE;
|
||||
|
||||
if (!cg_read_strstr(child, "cgroup.controllers", "cpuset"))
|
||||
return EXIT_FAILURE;
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Migrate a process between two sibling cgroups.
|
||||
* The success should only depend on the parent cgroup permissions and not the
|
||||
* migrated process itself (cpuset controller is in place because it uses
|
||||
* security_task_setscheduler() in cgroup v1).
|
||||
*
|
||||
* Deliberately don't set cpuset.cpus in children to avoid definining migration
|
||||
* permissions between two different cpusets.
|
||||
*/
|
||||
static int test_cpuset_perms_object(const char *root, bool allow)
|
||||
{
|
||||
char *parent = NULL, *child_src = NULL, *child_dst = NULL;
|
||||
char *parent_procs = NULL, *child_src_procs = NULL, *child_dst_procs = NULL;
|
||||
const uid_t test_euid = TEST_UID;
|
||||
int object_pid = 0;
|
||||
int ret = KSFT_FAIL;
|
||||
|
||||
parent = cg_name(root, "cpuset_test_0");
|
||||
if (!parent)
|
||||
goto cleanup;
|
||||
parent_procs = cg_name(parent, "cgroup.procs");
|
||||
if (!parent_procs)
|
||||
goto cleanup;
|
||||
if (cg_create(parent))
|
||||
goto cleanup;
|
||||
|
||||
child_src = cg_name(parent, "cpuset_test_1");
|
||||
if (!child_src)
|
||||
goto cleanup;
|
||||
child_src_procs = cg_name(child_src, "cgroup.procs");
|
||||
if (!child_src_procs)
|
||||
goto cleanup;
|
||||
if (cg_create(child_src))
|
||||
goto cleanup;
|
||||
|
||||
child_dst = cg_name(parent, "cpuset_test_2");
|
||||
if (!child_dst)
|
||||
goto cleanup;
|
||||
child_dst_procs = cg_name(child_dst, "cgroup.procs");
|
||||
if (!child_dst_procs)
|
||||
goto cleanup;
|
||||
if (cg_create(child_dst))
|
||||
goto cleanup;
|
||||
|
||||
if (cg_write(parent, "cgroup.subtree_control", "+cpuset"))
|
||||
goto cleanup;
|
||||
|
||||
if (cg_read_strstr(child_src, "cgroup.controllers", "cpuset") ||
|
||||
cg_read_strstr(child_dst, "cgroup.controllers", "cpuset"))
|
||||
goto cleanup;
|
||||
|
||||
/* Enable permissions along src->dst tree path */
|
||||
if (chown(child_src_procs, test_euid, -1) ||
|
||||
chown(child_dst_procs, test_euid, -1))
|
||||
goto cleanup;
|
||||
|
||||
if (allow && chown(parent_procs, test_euid, -1))
|
||||
goto cleanup;
|
||||
|
||||
/* Fork a privileged child as a test object */
|
||||
object_pid = cg_run_nowait(child_src, idle_process_fn, NULL);
|
||||
if (object_pid < 0)
|
||||
goto cleanup;
|
||||
|
||||
/* Carry out migration in a child process that can drop all privileges
|
||||
* (including capabilities), the main process must remain privileged for
|
||||
* cleanup.
|
||||
* Child process's cgroup is irrelevant but we place it into child_dst
|
||||
* as hacky way to pass information about migration target to the child.
|
||||
*/
|
||||
if (allow ^ (cg_run(child_dst, do_migration_fn, (void *)(size_t)object_pid) == EXIT_SUCCESS))
|
||||
goto cleanup;
|
||||
|
||||
ret = KSFT_PASS;
|
||||
|
||||
cleanup:
|
||||
if (object_pid > 0) {
|
||||
(void)kill(object_pid, SIGTERM);
|
||||
(void)clone_reap(object_pid, WEXITED);
|
||||
}
|
||||
|
||||
cg_destroy(child_dst);
|
||||
free(child_dst_procs);
|
||||
free(child_dst);
|
||||
|
||||
cg_destroy(child_src);
|
||||
free(child_src_procs);
|
||||
free(child_src);
|
||||
|
||||
cg_destroy(parent);
|
||||
free(parent_procs);
|
||||
free(parent);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int test_cpuset_perms_object_allow(const char *root)
|
||||
{
|
||||
return test_cpuset_perms_object(root, true);
|
||||
}
|
||||
|
||||
static int test_cpuset_perms_object_deny(const char *root)
|
||||
{
|
||||
return test_cpuset_perms_object(root, false);
|
||||
}
|
||||
|
||||
/*
|
||||
* Migrate a process between parent and child implicitely
|
||||
* Implicit migration happens when a controller is enabled/disabled.
|
||||
*
|
||||
*/
|
||||
static int test_cpuset_perms_subtree(const char *root)
|
||||
{
|
||||
char *parent = NULL, *child = NULL;
|
||||
char *parent_procs = NULL, *parent_subctl = NULL, *child_procs = NULL;
|
||||
const uid_t test_euid = TEST_UID;
|
||||
int object_pid = 0;
|
||||
int ret = KSFT_FAIL;
|
||||
|
||||
parent = cg_name(root, "cpuset_test_0");
|
||||
if (!parent)
|
||||
goto cleanup;
|
||||
parent_procs = cg_name(parent, "cgroup.procs");
|
||||
if (!parent_procs)
|
||||
goto cleanup;
|
||||
parent_subctl = cg_name(parent, "cgroup.subtree_control");
|
||||
if (!parent_subctl)
|
||||
goto cleanup;
|
||||
if (cg_create(parent))
|
||||
goto cleanup;
|
||||
|
||||
child = cg_name(parent, "cpuset_test_1");
|
||||
if (!child)
|
||||
goto cleanup;
|
||||
child_procs = cg_name(child, "cgroup.procs");
|
||||
if (!child_procs)
|
||||
goto cleanup;
|
||||
if (cg_create(child))
|
||||
goto cleanup;
|
||||
|
||||
/* Enable permissions as in a delegated subtree */
|
||||
if (chown(parent_procs, test_euid, -1) ||
|
||||
chown(parent_subctl, test_euid, -1) ||
|
||||
chown(child_procs, test_euid, -1))
|
||||
goto cleanup;
|
||||
|
||||
/* Put a privileged child in the subtree and modify controller state
|
||||
* from an unprivileged process, the main process remains privileged
|
||||
* for cleanup.
|
||||
* The unprivileged child runs in subtree too to avoid parent and
|
||||
* internal-node constraing violation.
|
||||
*/
|
||||
object_pid = cg_run_nowait(child, idle_process_fn, NULL);
|
||||
if (object_pid < 0)
|
||||
goto cleanup;
|
||||
|
||||
if (cg_run(child, do_controller_fn, parent) != EXIT_SUCCESS)
|
||||
goto cleanup;
|
||||
|
||||
ret = KSFT_PASS;
|
||||
|
||||
cleanup:
|
||||
if (object_pid > 0) {
|
||||
(void)kill(object_pid, SIGTERM);
|
||||
(void)clone_reap(object_pid, WEXITED);
|
||||
}
|
||||
|
||||
cg_destroy(child);
|
||||
free(child_procs);
|
||||
free(child);
|
||||
|
||||
cg_destroy(parent);
|
||||
free(parent_subctl);
|
||||
free(parent_procs);
|
||||
free(parent);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
#define T(x) { x, #x }
|
||||
struct cpuset_test {
|
||||
int (*fn)(const char *root);
|
||||
const char *name;
|
||||
} tests[] = {
|
||||
T(test_cpuset_perms_object_allow),
|
||||
T(test_cpuset_perms_object_deny),
|
||||
T(test_cpuset_perms_subtree),
|
||||
};
|
||||
#undef T
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
char root[PATH_MAX];
|
||||
int i, ret = EXIT_SUCCESS;
|
||||
|
||||
if (cg_find_unified_root(root, sizeof(root)))
|
||||
ksft_exit_skip("cgroup v2 isn't mounted\n");
|
||||
|
||||
if (cg_read_strstr(root, "cgroup.subtree_control", "cpuset"))
|
||||
if (cg_write(root, "cgroup.subtree_control", "+cpuset"))
|
||||
ksft_exit_skip("Failed to set cpuset controller\n");
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(tests); i++) {
|
||||
switch (tests[i].fn(root)) {
|
||||
case KSFT_PASS:
|
||||
ksft_test_result_pass("%s\n", tests[i].name);
|
||||
break;
|
||||
case KSFT_SKIP:
|
||||
ksft_test_result_skip("%s\n", tests[i].name);
|
||||
break;
|
||||
default:
|
||||
ret = EXIT_FAILURE;
|
||||
ksft_test_result_fail("%s\n", tests[i].name);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
tools/testing/selftests/cgroup/test_cpuset_prs.sh
@@ -10,7 +10,7 @@
 skip_test() {
 	echo "$1"
 	echo "Test SKIPPED"
-	exit 0
+	exit 4 # ksft_skip
 }

 [[ $(id -u) -eq 0 ]] || skip_test "Test must be run as root!"