cgroup: Changes for v6.13

- cpu.stat now also shows niced CPU time.
 
 - Freezer and cpuset optimizations.
 
 - Other misc changes.
 -----BEGIN PGP SIGNATURE-----
 
 iIQEABYKACwWIQTfIjM1kS57o3GsC/uxYfJx3gVYGQUCZztlgg4cdGpAa2VybmVs
 Lm9yZwAKCRCxYfJx3gVYGbohAQDE/enqpAX9vSOpQPne4ZzgcPlGTrCwBcka3Z5z
 4aOF0AD/SmdjcJ/EULisD/2O27ovsGAtqDjngrrZwNUTbCNkTQQ=
 =pKyo
 -----END PGP SIGNATURE-----

Merge tag 'cgroup-for-6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup updates from Tejun Heo:

 - cpu.stat now also shows niced CPU time

 - Freezer and cpuset optimizations

 - Other misc changes

* tag 'cgroup-for-6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup/cpuset: Disable cpuset_cpumask_can_shrink() test if not load balancing
  cgroup/cpuset: Further optimize code if CONFIG_CPUSETS_V1 not set
  cgroup/cpuset: Enforce at most one rebuild_sched_domains_locked() call per operation
  cgroup/cpuset: Revert "Allow suppression of sched domain rebuild in update_cpumasks_hier()"
  MAINTAINERS: remove Zefan Li
  cgroup/freezer: Add cgroup CGRP_FROZEN flag update helper
  cgroup/freezer: Reduce redundant traversal for cgroup_freeze
  cgroup/bpf: only cgroup v2 can be attached by bpf programs
  Revert "cgroup: Fix memory leak caused by missing cgroup_bpf_offline"
  selftests/cgroup: Fix compile error in test_cpu.c
  cgroup/rstat: Selftests for niced CPU statistics
  cgroup/rstat: Tracking cgroup-level niced CPU time
  cgroup/cpuset: Fix spelling errors in file kernel/cgroup/cpuset.c
This commit is contained in:
Linus Torvalds 2024-11-20 09:54:49 -08:00
commit 7586d52765
8 changed files with 239 additions and 138 deletions

View File

@ -579,6 +579,9 @@ N: Zach Brown
E: zab@zabbo.net
D: maestro pci sound
N: Zefan Li
D: Contribution to control group stuff
N: David Brownell
D: Kernel engineer, mentor, and friend. Maintained USB EHCI and
D: gadget layers, SPI subsystem, GPIO subsystem, and more than a few

View File

@ -5756,7 +5756,6 @@ F: kernel/context_tracking.c
CONTROL GROUP (CGROUP)
M: Tejun Heo <tj@kernel.org>
M: Zefan Li <lizefan.x@bytedance.com>
M: Johannes Weiner <hannes@cmpxchg.org>
M: Michal Koutný <mkoutny@suse.com>
L: cgroups@vger.kernel.org
@ -5785,7 +5784,6 @@ F: include/linux/blk-cgroup.h
CONTROL GROUP - CPUSET
M: Waiman Long <longman@redhat.com>
M: Zefan Li <lizefan.x@bytedance.com>
L: cgroups@vger.kernel.org
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git

View File

@ -327,6 +327,7 @@ struct cgroup_base_stat {
#ifdef CONFIG_SCHED_CORE
u64 forceidle_sum;
#endif
u64 ntime;
};
/*
@ -397,7 +398,7 @@ struct cgroup_freezer_state {
bool freeze;
/* Should the cgroup actually be frozen? */
int e_freeze;
bool e_freeze;
/* Fields below are protected by css_set_lock */

View File

@ -2140,8 +2140,10 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
if (ret)
goto exit_stats;
if (root == &cgrp_dfl_root) {
ret = cgroup_bpf_inherit(root_cgrp);
WARN_ON_ONCE(ret);
}
trace_cgroup_setup_root(root);
@ -2314,10 +2316,8 @@ static void cgroup_kill_sb(struct super_block *sb)
* And don't kill the default root.
*/
if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
!percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
cgroup_bpf_offline(&root->cgrp);
!percpu_ref_is_dying(&root->cgrp.self.refcnt))
percpu_ref_kill(&root->cgrp.self.refcnt);
}
cgroup_put(&root->cgrp);
kernfs_kill_sb(sb);
}
@ -5710,9 +5710,11 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
if (ret)
goto out_kernfs_remove;
if (cgrp->root == &cgrp_dfl_root) {
ret = cgroup_bpf_inherit(cgrp);
if (ret)
goto out_psi_free;
}
/*
* New cgroup inherits effective freeze counter, and
@ -6026,6 +6028,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
cgroup1_check_for_release(parent);
if (cgrp->root == &cgrp_dfl_root)
cgroup_bpf_offline(cgrp);
/* put the base reference */

View File

@ -84,9 +84,19 @@ static bool have_boot_isolcpus;
static struct list_head remote_children;
/*
* A flag to force sched domain rebuild at the end of an operation while
* inhibiting it in the intermediate stages when set. Currently it is only
* set in hotplug code.
* A flag to force sched domain rebuild at the end of an operation.
* It can be set in
* - update_partition_sd_lb()
* - remote_partition_check()
* - update_cpumasks_hier()
* - cpuset_update_flag()
* - cpuset_hotplug_update_tasks()
* - cpuset_handle_hotplug()
*
* Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock.
*
* Note that update_relax_domain_level() in cpuset-v1.c can still call
* rebuild_sched_domains_locked() directly without using this flag.
*/
static bool force_sd_rebuild;
@ -283,6 +293,12 @@ static inline void dec_attach_in_progress(struct cpuset *cs)
mutex_unlock(&cpuset_mutex);
}
static inline bool cpuset_v2(void)
{
return !IS_ENABLED(CONFIG_CPUSETS_V1) ||
cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
}
/*
* Cgroup v2 behavior is used on the "cpus" and "mems" control files when
* on default hierarchy or when the cpuset_v2_mode flag is set by mounting
@ -293,7 +309,7 @@ static inline void dec_attach_in_progress(struct cpuset *cs)
*/
static inline bool is_in_v2_mode(void)
{
return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
return cpuset_v2() ||
(cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}
@ -565,12 +581,24 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
/*
* We can't shrink if we won't have enough room for SCHED_DEADLINE
* tasks.
* tasks. This check is not done when scheduling is disabled as the
* users should know what they are doing.
*
* For v1, effective_cpus == cpus_allowed & user_xcpus() returns
* cpus_allowed.
*
* For v2, is_cpu_exclusive() & is_sched_load_balance() are true only
* for non-isolated partition root. At this point, the target
* effective_cpus isn't computed yet. user_xcpus() is the best
* approximation.
*
* TBD: May need to precompute the real effective_cpus here in case
* incorrect scheduling of SCHED_DEADLINE tasks in a partition
* becomes an issue.
*/
ret = -EBUSY;
if (is_cpu_exclusive(cur) &&
!cpuset_cpumask_can_shrink(cur->cpus_allowed,
trial->cpus_allowed))
if (is_cpu_exclusive(cur) && is_sched_load_balance(cur) &&
!cpuset_cpumask_can_shrink(cur->effective_cpus, user_xcpus(trial)))
goto out;
/*
@ -728,7 +756,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
int nslot; /* next empty doms[] struct cpumask slot */
struct cgroup_subsys_state *pos_css;
bool root_load_balance = is_sched_load_balance(&top_cpuset);
bool cgrpv2 = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
bool cgrpv2 = cpuset_v2();
int nslot_update;
doms = NULL;
@ -990,6 +1018,7 @@ void rebuild_sched_domains_locked(void)
lockdep_assert_cpus_held();
lockdep_assert_held(&cpuset_mutex);
force_sd_rebuild = false;
/*
* If we have raced with CPU hotplug, return early to avoid
@ -1164,8 +1193,8 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}
if (rebuild_domains && !force_sd_rebuild)
rebuild_sched_domains_locked();
if (rebuild_domains)
cpuset_force_rebuild();
}
/*
@ -1187,7 +1216,7 @@ static void reset_partition_data(struct cpuset *cs)
{
struct cpuset *parent = parent_cs(cs);
if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
if (!cpuset_v2())
return;
lockdep_assert_held(&callback_lock);
@ -1339,7 +1368,7 @@ static inline bool is_local_partition(struct cpuset *cs)
* remote_partition_enable - Enable current cpuset as a remote partition root
* @cs: the cpuset to update
* @new_prs: new partition_root_state
* @tmp: temparary masks
* @tmp: temporary masks
* Return: 0 if successful, errcode if error
*
* Enable the current cpuset to become a remote partition root taking CPUs
@ -1377,7 +1406,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
update_unbound_workqueue_cpumask(isolcpus_updated);
/*
* Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
* Propagate changes in top_cpuset's effective_cpus down the hierarchy.
*/
cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
update_sibling_cpumasks(&top_cpuset, NULL, tmp);
@ -1387,7 +1416,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
/*
* remote_partition_disable - Remove current cpuset from remote partition list
* @cs: the cpuset to update
* @tmp: temparary masks
* @tmp: temporary masks
*
* The effective_cpus is also updated.
*
@ -1413,7 +1442,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
update_unbound_workqueue_cpumask(isolcpus_updated);
/*
* Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
* Propagate changes in top_cpuset's effective_cpus down the hierarchy.
*/
cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
update_sibling_cpumasks(&top_cpuset, NULL, tmp);
@ -1423,7 +1452,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
* remote_cpus_update - cpus_exclusive change of remote partition
* @cs: the cpuset to be updated
* @newmask: the new effective_xcpus mask
* @tmp: temparary masks
* @tmp: temporary masks
*
* top_cpuset and subpartitions_cpus will be updated or partition can be
* invalidated.
@ -1465,7 +1494,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
update_unbound_workqueue_cpumask(isolcpus_updated);
/*
* Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
* Propagate changes in top_cpuset's effective_cpus down the hierarchy.
*/
cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
update_sibling_cpumasks(&top_cpuset, NULL, tmp);
@ -1480,7 +1509,7 @@ invalidate:
* @cs: the cpuset to be updated
* @newmask: the new effective_xcpus mask
* @delmask: temporary mask for deletion (not in tmp)
* @tmp: temparary masks
* @tmp: temporary masks
*
* This should be called before the given cs has updated its cpus_allowed
* and/or effective_xcpus.
@ -1512,8 +1541,8 @@ static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask,
remote_partition_disable(child, tmp);
disable_cnt++;
}
if (disable_cnt && !force_sd_rebuild)
rebuild_sched_domains_locked();
if (disable_cnt)
cpuset_force_rebuild();
}
/*
@ -1922,12 +1951,6 @@ static void compute_partition_effective_cpumask(struct cpuset *cs,
rcu_read_unlock();
}
/*
* update_cpumasks_hier() flags
*/
#define HIER_CHECKALL 0x01 /* Check all cpusets with no skipping */
#define HIER_NO_SD_REBUILD 0x02 /* Don't rebuild sched domains */
/*
* update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
* @cs: the cpuset to consider
@ -1942,7 +1965,7 @@ static void compute_partition_effective_cpumask(struct cpuset *cs,
* Called with cpuset_mutex held
*/
static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
int flags)
bool force)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
@ -2007,12 +2030,12 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
* Skip the whole subtree if
* 1) the cpumask remains the same,
* 2) has no partition root state,
* 3) HIER_CHECKALL flag not set, and
* 3) force flag not set, and
* 4) for v2 load balance state same as its parent.
*/
if (!cp->partition_root_state && !(flags & HIER_CHECKALL) &&
if (!cp->partition_root_state && !force &&
cpumask_equal(tmp->new_cpus, cp->effective_cpus) &&
(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
(!cpuset_v2() ||
(is_sched_load_balance(parent) == is_sched_load_balance(cp)))) {
pos_css = css_rightmost_descendant(pos_css);
continue;
@ -2086,8 +2109,7 @@ get_css:
* from parent if current cpuset isn't a valid partition root
* and their load balance states differ.
*/
if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
!is_partition_valid(cp) &&
if (cpuset_v2() && !is_partition_valid(cp) &&
(is_sched_load_balance(parent) != is_sched_load_balance(cp))) {
if (is_sched_load_balance(parent))
set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
@ -2103,8 +2125,7 @@ get_css:
*/
if (!cpumask_empty(cp->cpus_allowed) &&
is_sched_load_balance(cp) &&
(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
is_partition_valid(cp)))
(!cpuset_v2() || is_partition_valid(cp)))
need_rebuild_sched_domains = true;
rcu_read_lock();
@ -2112,9 +2133,8 @@ get_css:
}
rcu_read_unlock();
if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD) &&
!force_sd_rebuild)
rebuild_sched_domains_locked();
if (need_rebuild_sched_domains)
cpuset_force_rebuild();
}
/**
@ -2141,9 +2161,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
* directly.
*
* The update_cpumasks_hier() function may sleep. So we have to
* release the RCU read lock before calling it. HIER_NO_SD_REBUILD
* flag is used to suppress rebuild of sched domains as the callers
* will take care of that.
* release the RCU read lock before calling it.
*/
rcu_read_lock();
cpuset_for_each_child(sibling, pos_css, parent) {
@ -2159,7 +2177,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
continue;
rcu_read_unlock();
update_cpumasks_hier(sibling, tmp, HIER_NO_SD_REBUILD);
update_cpumasks_hier(sibling, tmp, false);
rcu_read_lock();
css_put(&sibling->css);
}
@ -2179,7 +2197,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
struct tmpmasks tmp;
struct cpuset *parent = parent_cs(cs);
bool invalidate = false;
int hier_flags = 0;
bool force = false;
int old_prs = cs->partition_root_state;
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
@ -2206,7 +2224,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
return -EINVAL;
/*
* When exclusive_cpus isn't explicitly set, it is constrainted
* When exclusive_cpus isn't explicitly set, it is constrained
* by cpus_allowed and parent's effective_xcpus. Otherwise,
* trialcs->effective_xcpus is used as a temporary cpumask
* for checking validity of the partition root.
@ -2240,12 +2258,11 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* Check all the descendants in update_cpumasks_hier() if
* effective_xcpus is to be changed.
*/
if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
hier_flags = HIER_CHECKALL;
force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
retval = validate_change(cs, trialcs);
if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
if ((retval == -EINVAL) && cpuset_v2()) {
struct cgroup_subsys_state *css;
struct cpuset *cp;
@ -2309,7 +2326,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
spin_unlock_irq(&callback_lock);
/* effective_cpus/effective_xcpus will be updated here */
update_cpumasks_hier(cs, &tmp, hier_flags);
update_cpumasks_hier(cs, &tmp, force);
/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
if (cs->partition_root_state)
@ -2334,7 +2351,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
struct tmpmasks tmp;
struct cpuset *parent = parent_cs(cs);
bool invalidate = false;
int hier_flags = 0;
bool force = false;
int old_prs = cs->partition_root_state;
if (!*buf) {
@ -2357,8 +2374,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* Check all the descendants in update_cpumasks_hier() if
* effective_xcpus is to be changed.
*/
if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
hier_flags = HIER_CHECKALL;
force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
retval = validate_change(cs, trialcs);
if (retval)
@ -2411,8 +2427,8 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* of the subtree when it is a valid partition root or effective_xcpus
* is updated.
*/
if (is_partition_valid(cs) || hier_flags)
update_cpumasks_hier(cs, &tmp, hier_flags);
if (is_partition_valid(cs) || force)
update_cpumasks_hier(cs, &tmp, force);
/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
if (cs->partition_root_state)
@ -2737,9 +2753,12 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
cs->flags = trialcs->flags;
spin_unlock_irq(&callback_lock);
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed &&
!force_sd_rebuild)
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) {
if (cpuset_v2())
cpuset_force_rebuild();
else
rebuild_sched_domains_locked();
}
if (spread_flag_changed)
cpuset1_update_tasks_flags(cs);
@ -2853,12 +2872,14 @@ out:
update_unbound_workqueue_cpumask(new_xcpus_state);
/* Force update if switching back to member */
update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0);
update_cpumasks_hier(cs, &tmpmask, !new_prs);
/* Update sched domains and load balance flag */
update_partition_sd_lb(cs, old_prs);
notify_partition_change(cs, old_prs);
if (force_sd_rebuild)
rebuild_sched_domains_locked();
free_cpumasks(NULL, &tmpmask);
return 0;
}
@ -2919,8 +2940,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
* migration permission derives from hierarchy ownership in
* cgroup_procs_write_permission()).
*/
if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
(cpus_updated || mems_updated)) {
if (!cpuset_v2() || (cpus_updated || mems_updated)) {
ret = security_task_setscheduler(task);
if (ret)
goto out_unlock;
@ -3034,8 +3054,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
* in effective cpus and mems. In that case, we can optimize out
* by skipping the task iteration and update.
*/
if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
!cpus_updated && !mems_updated) {
if (cpuset_v2() && !cpus_updated && !mems_updated) {
cpuset_attach_nodemask_to = cs->effective_mems;
goto out;
}
@ -3152,6 +3171,8 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
}
free_cpuset(trialcs);
if (force_sd_rebuild)
rebuild_sched_domains_locked();
out_unlock:
mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
@ -3383,7 +3404,7 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
INIT_LIST_HEAD(&cs->remote_sibling);
/* Set CS_MEMORY_MIGRATE for default hierarchy */
if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
if (cpuset_v2())
__set_bit(CS_MEMORY_MIGRATE, &cs->flags);
return &cs->css;
@ -3410,8 +3431,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
/*
* For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
*/
if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
!is_sched_load_balance(parent))
if (cpuset_v2() && !is_sched_load_balance(parent))
clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpuset_inc();
@ -3481,8 +3501,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
if (is_partition_valid(cs))
update_prstate(cs, 0);
if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
is_sched_load_balance(cs))
if (!cpuset_v2() && is_sched_load_balance(cs))
cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
cpuset_dec();
@ -3896,11 +3915,9 @@ static void cpuset_handle_hotplug(void)
rcu_read_unlock();
}
/* rebuild sched domains if cpus_allowed has changed */
if (force_sd_rebuild) {
force_sd_rebuild = false;
/* rebuild sched domains if necessary */
if (force_sd_rebuild)
rebuild_sched_domains_cpuslocked();
}
free_cpumasks(NULL, ptmp);
}

View File

@ -8,6 +8,28 @@
#include <trace/events/cgroup.h>
/*
* Update CGRP_FROZEN of cgroup.flag
* Return true if flags is updated; false if flags has no change
*/
static bool cgroup_update_frozen_flag(struct cgroup *cgrp, bool frozen)
{
lockdep_assert_held(&css_set_lock);
/* Already there? */
if (test_bit(CGRP_FROZEN, &cgrp->flags) == frozen)
return false;
if (frozen)
set_bit(CGRP_FROZEN, &cgrp->flags);
else
clear_bit(CGRP_FROZEN, &cgrp->flags);
cgroup_file_notify(&cgrp->events_file);
TRACE_CGROUP_PATH(notify_frozen, cgrp, frozen);
return true;
}
/*
* Propagate the cgroup frozen state upwards by the cgroup tree.
*/
@ -24,26 +46,18 @@ static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen)
while ((cgrp = cgroup_parent(cgrp))) {
if (frozen) {
cgrp->freezer.nr_frozen_descendants += desc;
if (!test_bit(CGRP_FROZEN, &cgrp->flags) &&
test_bit(CGRP_FREEZE, &cgrp->flags) &&
cgrp->freezer.nr_frozen_descendants ==
cgrp->nr_descendants) {
set_bit(CGRP_FROZEN, &cgrp->flags);
cgroup_file_notify(&cgrp->events_file);
TRACE_CGROUP_PATH(notify_frozen, cgrp, 1);
desc++;
}
if (!test_bit(CGRP_FREEZE, &cgrp->flags) ||
(cgrp->freezer.nr_frozen_descendants !=
cgrp->nr_descendants))
continue;
} else {
cgrp->freezer.nr_frozen_descendants -= desc;
if (test_bit(CGRP_FROZEN, &cgrp->flags)) {
clear_bit(CGRP_FROZEN, &cgrp->flags);
cgroup_file_notify(&cgrp->events_file);
TRACE_CGROUP_PATH(notify_frozen, cgrp, 0);
}
if (cgroup_update_frozen_flag(cgrp, frozen))
desc++;
}
}
}
}
/*
* Revisit the cgroup frozen state.
@ -53,8 +67,6 @@ void cgroup_update_frozen(struct cgroup *cgrp)
{
bool frozen;
lockdep_assert_held(&css_set_lock);
/*
* If the cgroup has to be frozen (CGRP_FREEZE bit set),
* and all tasks are frozen and/or stopped, let's consider
@ -63,23 +75,8 @@ void cgroup_update_frozen(struct cgroup *cgrp)
frozen = test_bit(CGRP_FREEZE, &cgrp->flags) &&
cgrp->freezer.nr_frozen_tasks == __cgroup_task_count(cgrp);
if (frozen) {
/* Already there? */
if (test_bit(CGRP_FROZEN, &cgrp->flags))
return;
set_bit(CGRP_FROZEN, &cgrp->flags);
} else {
/* Already there? */
if (!test_bit(CGRP_FROZEN, &cgrp->flags))
return;
clear_bit(CGRP_FROZEN, &cgrp->flags);
}
cgroup_file_notify(&cgrp->events_file);
TRACE_CGROUP_PATH(notify_frozen, cgrp, frozen);
/* Update the state of ancestor cgroups. */
/* If flags is updated, update the state of ancestor cgroups. */
if (cgroup_update_frozen_flag(cgrp, frozen))
cgroup_propagate_frozen(cgrp, frozen);
}
@ -260,8 +257,10 @@ void cgroup_freezer_migrate_task(struct task_struct *task,
void cgroup_freeze(struct cgroup *cgrp, bool freeze)
{
struct cgroup_subsys_state *css;
struct cgroup *parent;
struct cgroup *dsct;
bool applied = false;
bool old_e;
lockdep_assert_held(&cgroup_mutex);
@ -282,22 +281,18 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze)
if (cgroup_is_dead(dsct))
continue;
if (freeze) {
dsct->freezer.e_freeze++;
/*
* Already frozen because of ancestor's settings?
* e_freeze is affected by parent's e_freeze and dst's freeze.
* If old e_freeze eq new e_freeze, no change, its children
* will not be affected. So do nothing and skip the subtree
*/
if (dsct->freezer.e_freeze > 1)
old_e = dsct->freezer.e_freeze;
parent = cgroup_parent(dsct);
dsct->freezer.e_freeze = (dsct->freezer.freeze ||
parent->freezer.e_freeze);
if (dsct->freezer.e_freeze == old_e) {
css = css_rightmost_descendant(css);
continue;
} else {
dsct->freezer.e_freeze--;
/*
* Still frozen because of ancestor's settings?
*/
if (dsct->freezer.e_freeze > 0)
continue;
WARN_ON_ONCE(dsct->freezer.e_freeze < 0);
}
/*

View File

@ -444,6 +444,7 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
#ifdef CONFIG_SCHED_CORE
dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
#endif
dst_bstat->ntime += src_bstat->ntime;
}
static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
@ -455,6 +456,7 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
#ifdef CONFIG_SCHED_CORE
dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
#endif
dst_bstat->ntime -= src_bstat->ntime;
}
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
@ -534,8 +536,10 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
switch (index) {
case CPUTIME_USER:
case CPUTIME_NICE:
rstatc->bstat.ntime += delta_exec;
fallthrough;
case CPUTIME_USER:
rstatc->bstat.cputime.utime += delta_exec;
break;
case CPUTIME_SYSTEM:
@ -591,6 +595,7 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
#ifdef CONFIG_SCHED_CORE
bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
#endif
bstat->ntime += cpustat[CPUTIME_NICE];
}
}
@ -608,13 +613,14 @@ static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat
void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
u64 usage, utime, stime;
u64 usage, utime, stime, ntime;
if (cgroup_parent(cgrp)) {
cgroup_rstat_flush_hold(cgrp);
usage = cgrp->bstat.cputime.sum_exec_runtime;
cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
&utime, &stime);
ntime = cgrp->bstat.ntime;
cgroup_rstat_flush_release(cgrp);
} else {
/* cgrp->bstat of root is not actually used, reuse it */
@ -622,16 +628,19 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
usage = cgrp->bstat.cputime.sum_exec_runtime;
utime = cgrp->bstat.cputime.utime;
stime = cgrp->bstat.cputime.stime;
ntime = cgrp->bstat.ntime;
}
do_div(usage, NSEC_PER_USEC);
do_div(utime, NSEC_PER_USEC);
do_div(stime, NSEC_PER_USEC);
do_div(ntime, NSEC_PER_USEC);
seq_printf(seq, "usage_usec %llu\n"
"user_usec %llu\n"
"system_usec %llu\n",
usage, utime, stime);
"system_usec %llu\n"
"nice_usec %llu\n",
usage, utime, stime, ntime);
cgroup_force_idle_show(seq, &cgrp->bstat);
}

View File

@ -8,6 +8,7 @@
#include <pthread.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>
#include "../kselftest.h"
#include "cgroup_util.h"
@ -229,6 +230,79 @@ cleanup:
return ret;
}
/*
* Creates a nice process that consumes CPU and checks that the elapsed
* usertime in the cgroup is close to the expected time.
*/
static int test_cpucg_nice(const char *root)
{
int ret = KSFT_FAIL;
int status;
long user_usec, nice_usec;
long usage_seconds = 2;
long expected_nice_usec = usage_seconds * USEC_PER_SEC;
char *cpucg;
pid_t pid;
cpucg = cg_name(root, "cpucg_test");
if (!cpucg)
goto cleanup;
if (cg_create(cpucg))
goto cleanup;
user_usec = cg_read_key_long(cpucg, "cpu.stat", "user_usec");
nice_usec = cg_read_key_long(cpucg, "cpu.stat", "nice_usec");
if (nice_usec == -1)
ret = KSFT_SKIP;
if (user_usec != 0 || nice_usec != 0)
goto cleanup;
/*
* We fork here to create a new process that can be niced without
* polluting the nice value of other selftests
*/
pid = fork();
if (pid < 0) {
goto cleanup;
} else if (pid == 0) {
struct cpu_hog_func_param param = {
.nprocs = 1,
.ts = {
.tv_sec = usage_seconds,
.tv_nsec = 0,
},
.clock_type = CPU_HOG_CLOCK_PROCESS,
};
char buf[64];
snprintf(buf, sizeof(buf), "%d", getpid());
if (cg_write(cpucg, "cgroup.procs", buf))
goto cleanup;
/* Try to keep niced CPU usage as constrained to hog_cpu as possible */
nice(1);
hog_cpus_timed(cpucg, &param);
exit(0);
} else {
waitpid(pid, &status, 0);
if (!WIFEXITED(status))
goto cleanup;
user_usec = cg_read_key_long(cpucg, "cpu.stat", "user_usec");
nice_usec = cg_read_key_long(cpucg, "cpu.stat", "nice_usec");
if (!values_close(nice_usec, expected_nice_usec, 1))
goto cleanup;
ret = KSFT_PASS;
}
cleanup:
cg_destroy(cpucg);
free(cpucg);
return ret;
}
static int
run_cpucg_weight_test(
const char *root,
@ -686,6 +760,7 @@ struct cpucg_test {
} tests[] = {
T(test_cpucg_subtree_control),
T(test_cpucg_stats),
T(test_cpucg_nice),
T(test_cpucg_weight_overprovisioned),
T(test_cpucg_weight_underprovisioned),
T(test_cpucg_nested_weight_overprovisioned),