Scheduler changes for v6.11:

- Update Daniel Bristot de Oliveira's entry in MAINTAINERS, and credit him in CREDITS. - Harmonize the lock-yielding behavior on dynamically selected preemption models with static ones. - Reorganize the code a bit: split out sched/syscalls.c to reduce the size of sched/core.c - Micro-optimize psi_group_change() - Fix set_load_weight() for SCHED_IDLE tasks - Misc cleanups & fixes Signed-off-by: Ingo Molnar <mingo@kernel.org> -----BEGIN PGP SIGNATURE----- iQJFBAABCgAvFiEEBpT5eoXrXCwVQwEKEnMQ0APhK1gFAmaVtVARHG1pbmdvQGtl cm5lbC5vcmcACgkQEnMQ0APhK1iqTQ/9GLNzNBnl0oBWCiybeQjyWsZ6BiZi48R0 C1g9/RKy++OyGOjn/yqYK0Kg8cdfoGzHGioMMAucHFW1nXZwVw17xAJK127N0apF 83up7AnFJw/JGr1bI0FwuozqHAs4Z5KzHTv2KBxhYuO77lyYna6/t0liRUbF8ZUZ I/nqav7wDB8RBIB5hEJ/uYLDX7qWdUlyFB+mcvV4ANA99yr++OgipCp6Ob3Rz3cP O676nKJY4vpNbZ/B6bpKg8ezULRP8re2qD3GJRf2huS63uu/Z5ct7ouLVZ1DwN53 mFDBTYUMI2ToV0pseikuqwnmrjxAKcEajTyZpD3vckafd2TlWIopkQZoQ9XLLlIZ DxO+KoekaHTSVy8FWlO8O+iE3IAdUUgECEpNveX45Pb7nFP+5dtFqqnVIdNqCq5e zEuQvizaa5m+A1POZhZKya+z9jbLXXx+gtPCbbADTBWtuyl8azUIh3vjn0bykmv4 IVV/wvUm+BPEIhnKusZZOgB0vLtxUdntBBfUSxqoSOad9L+0/UtSKoKI6wvW00q8 ZkW+85yS3YFiN9W61276RLis2j7OAjE0eDJ96wfhooma2JRDJU4Wmg5oWg8x3WuA JRmK0s63Qik5gpwG5rHQsR5jNqYWTj5Lp7So+M1kRfFsOM/RXQ/AneSXZu/P7d65 LnYWzbKu76c= =lLab -----END PGP SIGNATURE----- Merge tag 'sched-core-2024-07-16' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip Pull scheduler updates from Ingo Molnar: - Update Daniel Bristot de Oliveira's entry in MAINTAINERS, and credit him in CREDITS - Harmonize the lock-yielding behavior on dynamically selected preemption models with static ones - Reorganize the code a bit: split out sched/syscalls.c to reduce the size of sched/core.c - Micro-optimize psi_group_change() - Fix set_load_weight() for SCHED_IDLE tasks - Misc cleanups & fixes * tag 'sched-core-2024-07-16' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: sched: Update MAINTAINERS and CREDITS sched/fair: set_load_weight() must also call reweight_task() for SCHED_IDLE tasks sched/psi: Optimise psi_group_change a bit sched/core: Drop spinlocks on contention iff kernel is preemptible sched/core: Move preempt_model_*() helpers from sched.h to preempt.h sched/balance: Skip unnecessary updates to idle load balancer's flags idle: Remove stale RCU comment sched/headers: Move struct pre-declarations to the beginning of the header sched/core: Clean up kernel/sched/sched.h a bit sched/core: Simplify prefetch_curr_exec_start() sched: Fix spelling in comments sched/syscalls: Split out kernel/sched/syscalls.c from kernel/sched/core.c
2024-11-11 04:18:39 +08:00 · 2024-07-16 17:00:50 -07:00 · 2024-07-16 17:00:50 -07:00 · 4a996d90b9
commit 4a996d90b9
parent 0c182ac2eb db43a609d0
23 changed files with 2183 additions and 2095 deletions
--- a/3
+++ b/3
@ -271,6 +271,9 @@ D: Driver for WaveFront soundcards (Turtle Beach Maui, Tropez, Tropez+)
 D: Various bugfixes and changes to sound drivers
 S: USA

+N: Daniel Bristot de Oliveira
+D: Scheduler contributions, notably: SCHED_DEADLINE
+
 N: Carlos Henrique Bauer
 E: chbauer@acm.org
 E: bauer@atlas.unisinos.br
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@ -4728,7 +4728,9 @@
 			none - Limited to cond_resched() calls
 			voluntary - Limited to cond_resched() and might_sleep() calls
 			full - Any section that isn't explicitly preempt disabled
-			       can be preempted anytime.
+			       can be preempted anytime.  Tasks will also yield
+			       contended spinlocks (if the critical section isn't
+			       explicitly preempt disabled beyond the lock itself).

 	print-fatal-signals=
 			[KNL] debug: print fatal signals
--- a/1
+++ b/1
@ -20047,7 +20047,6 @@ R:	Dietmar Eggemann <dietmar.eggemann@arm.com> (SCHED_NORMAL)
 R:	Steven Rostedt <rostedt@goodmis.org> (SCHED_FIFO/SCHED_RR)
 R:	Ben Segall <bsegall@google.com> (CONFIG_CFS_BANDWIDTH)
 R:	Mel Gorman <mgorman@suse.de> (CONFIG_NUMA_BALANCING)
-R:	Daniel Bristot de Oliveira <bristot@redhat.com> (SCHED_DEADLINE)
 R:	Valentin Schneider <vschneid@redhat.com> (TOPOLOGY)
 L:	linux-kernel@vger.kernel.org
 S:	Maintained
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@ -481,4 +481,45 @@ DEFINE_LOCK_GUARD_0(preempt, preempt_disable(), preempt_enable())
 DEFINE_LOCK_GUARD_0(preempt_notrace, preempt_disable_notrace(), preempt_enable_notrace())
 DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())

+#ifdef CONFIG_PREEMPT_DYNAMIC
+
+extern bool preempt_model_none(void);
+extern bool preempt_model_voluntary(void);
+extern bool preempt_model_full(void);
+
+#else
+
+static inline bool preempt_model_none(void)
+{
+	return IS_ENABLED(CONFIG_PREEMPT_NONE);
+}
+static inline bool preempt_model_voluntary(void)
+{
+	return IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY);
+}
+static inline bool preempt_model_full(void)
+{
+	return IS_ENABLED(CONFIG_PREEMPT);
+}
+
+#endif
+
+static inline bool preempt_model_rt(void)
+{
+	return IS_ENABLED(CONFIG_PREEMPT_RT);
+}
+
+/*
+ * Does the preemption model allow non-cooperative preemption?
+ *
+ * For !CONFIG_PREEMPT_DYNAMIC kernels this is an exact match with
+ * CONFIG_PREEMPTION; for CONFIG_PREEMPT_DYNAMIC this doesn't work as the
+ * kernel is *built* with CONFIG_PREEMPTION=y but may run with e.g. the
+ * PREEMPT_NONE model.
+ */
+static inline bool preempt_model_preemptible(void)
+{
+	return preempt_model_full() || preempt_model_rt();
+}
+
 #endif /* __LINUX_PREEMPT_H */
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@ -2064,47 +2064,6 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock);
 	__cond_resched_rwlock_write(lock);					\
 })

-#ifdef CONFIG_PREEMPT_DYNAMIC
-
-extern bool preempt_model_none(void);
-extern bool preempt_model_voluntary(void);
-extern bool preempt_model_full(void);
-
-#else
-
-static inline bool preempt_model_none(void)
-{
-	return IS_ENABLED(CONFIG_PREEMPT_NONE);
-}
-static inline bool preempt_model_voluntary(void)
-{
-	return IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY);
-}
-static inline bool preempt_model_full(void)
-{
-	return IS_ENABLED(CONFIG_PREEMPT);
-}
-
-#endif
-
-static inline bool preempt_model_rt(void)
-{
-	return IS_ENABLED(CONFIG_PREEMPT_RT);
-}
-
-/*
- * Does the preemption model allow non-cooperative preemption?
- *
- * For !CONFIG_PREEMPT_DYNAMIC kernels this is an exact match with
- * CONFIG_PREEMPTION; for CONFIG_PREEMPT_DYNAMIC this doesn't work as the
- * kernel is *built* with CONFIG_PREEMPTION=y but may run with e.g. the
- * PREEMPT_NONE model.
- */
-static inline bool preempt_model_preemptible(void)
-{
-	return preempt_model_full() || preempt_model_rt();
-}
-
 static __always_inline bool need_resched(void)
 {
 	return unlikely(tif_need_resched());
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@ -462,11 +462,10 @@ static __always_inline int spin_is_contended(spinlock_t *lock)
 */
 static inline int spin_needbreak(spinlock_t *lock)
 {
-#ifdef CONFIG_PREEMPTION
+	if (!preempt_model_preemptible())
+		return 0;
+
 	return spin_is_contended(lock);
-#else
-	return 0;
-#endif
 }

 /*
@ -479,11 +478,10 @@ static inline int spin_needbreak(spinlock_t *lock)
 */
 static inline int rwlock_needbreak(rwlock_t *lock)
 {
-#ifdef CONFIG_PREEMPTION
+	if (!preempt_model_preemptible())
+		return 0;
+
 	return rwlock_is_contended(lock);
-#else
-	return 0;
-#endif
 }

 /*
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@ -52,3 +52,4 @@
 #include "cputime.c"
 #include "deadline.c"

+#include "syscalls.c"
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@ -340,7 +340,7 @@ again:
 	this_clock = sched_clock_local(my_scd);
 	/*
 	 * We must enforce atomic readout on 32-bit, otherwise the
-	 * update on the remote CPU can hit inbetween the readout of
+	 * update on the remote CPU can hit in between the readout of
 	 * the low 32-bit and the high 32-bit portion.
 	 */
 	remote_clock = cmpxchg64(&scd->clock, 0, 0);
@ -444,7 +444,7 @@ notrace void sched_clock_tick_stable(void)
 }

 /*
- * We are going deep-idle (irqs are disabled):
+ * We are going deep-idle (IRQs are disabled):
 */
 notrace void sched_clock_idle_sleep_event(void)
 {
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@ -279,7 +279,7 @@ void __sched_core_account_forceidle(struct rq *rq)
 			continue;

 		/*
-		 * Note: this will account forceidle to the current cpu, even
+		 * Note: this will account forceidle to the current CPU, even
 		 * if it comes from our SMT sibling.
 		 */
 		__account_forceidle_time(p, delta);
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@ -14,11 +14,11 @@
 * They are only modified in vtime_account, on corresponding CPU
 * with interrupts disabled. So, writes are safe.
 * They are read and saved off onto struct rq in update_rq_clock().
- * This may result in other CPU reading this CPU's irq time and can
+ * This may result in other CPU reading this CPU's IRQ time and can
 * race with irq/vtime_account on this CPU. We would either get old
- * or new value with a side effect of accounting a slice of irq time to wrong
- * task when irq is in progress while we read rq->clock. That is a worthy
- * compromise in place of having locks on each irq in account_system_time.
+ * or new value with a side effect of accounting a slice of IRQ time to wrong
+ * task when IRQ is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each IRQ in account_system_time.
 */
 DEFINE_PER_CPU(struct irqtime, cpu_irqtime);

@ -269,7 +269,7 @@ static __always_inline u64 steal_account_process_time(u64 maxtime)
 }

 /*
- * Account how much elapsed time was spent in steal, irq, or softirq time.
+ * Account how much elapsed time was spent in steal, IRQ, or softirq time.
 */
 static inline u64 account_other_time(u64 max)
 {
@ -370,7 +370,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 * Check for hardirq is done both for system and user time as there is
 * no timer going off while we are on hardirq and hence we may never get an
 * opportunity to update it solely in system time.
- * p->stime and friends are only updated on system time and not on irq
+ * p->stime and friends are only updated on system time and not on IRQ
 * softirq as those do not count in task exec_runtime any more.
 */
 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
@ -380,7 +380,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,

 	/*
 	 * When returning from idle, many ticks can get accounted at
-	 * once, including some ticks of steal, irq, and softirq time.
+	 * once, including some ticks of steal, IRQ, and softirq time.
 	 * Subtract those ticks from the amount of time accounted to
 	 * idle, or potentially user or system time. Due to rounding,
 	 * other time can exceed ticks occasionally.
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@ -708,7 +708,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
 	}

 	/*
-	 * And we finally need to fixup root_domain(s) bandwidth accounting,
+	 * And we finally need to fix up root_domain(s) bandwidth accounting,
 	 * since p is still hanging out in the old (now moved to default) root
 	 * domain.
 	 */
@ -992,7 +992,7 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
 * is detected, the runtime and deadline need to be updated.
 *
 * If the task has an implicit deadline, i.e., deadline == period, the Original
- * CBS is applied. the runtime is replenished and a new absolute deadline is
+ * CBS is applied. The runtime is replenished and a new absolute deadline is
 * set, as in the previous cases.
 *
 * However, the Original CBS does not work properly for tasks with
@ -1294,7 +1294,7 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
 * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations multiplied
 * by 2^BW_SHIFT, the result has to be shifted right by BW_SHIFT.
 * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT, dl_bw
- * is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
+ * is multiplied by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
 * Since delta is a 64 bit variable, to have an overflow its value should be
 * larger than 2^(64 - 20 - 8), which is more than 64 seconds. So, overflow is
 * not an issue here.
@ -2493,7 +2493,7 @@ static void pull_dl_task(struct rq *this_rq)
 		src_rq = cpu_rq(cpu);

 		/*
-		 * It looks racy, abd it is! However, as in sched_rt.c,
+		 * It looks racy, and it is! However, as in sched_rt.c,
 		 * we are fine with this.
 		 */
 		if (this_rq->dl.dl_nr_running &&
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@ -61,7 +61,7 @@
 * Options are:
 *
 *   SCHED_TUNABLESCALING_NONE - unscaled, always *1
- *   SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
+ *   SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
 *   SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
 *
 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
@ -3835,15 +3835,14 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 	}
 }

-void reweight_task(struct task_struct *p, int prio)
+void reweight_task(struct task_struct *p, const struct load_weight *lw)
 {
 	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	struct load_weight *load = &se->load;
-	unsigned long weight = scale_load(sched_prio_to_weight[prio]);

-	reweight_entity(cfs_rq, se, weight);
-	load->inv_weight = sched_prio_to_wmult[prio];
+	reweight_entity(cfs_rq, se, lw->weight);
+	load->inv_weight = lw->inv_weight;
 }

 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
@ -8719,7 +8718,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
 * topology where each level pairs two lower groups (or better). This results
 * in O(log n) layers. Furthermore we reduce the number of CPUs going up the
 * tree to only the first of the previous level and we decrease the frequency
- * of load-balance at each level inv. proportional to the number of CPUs in
+ * of load-balance at each level inversely proportional to the number of CPUs in
 * the groups.
 *
 * This yields:
@ -11885,6 +11884,13 @@ static void kick_ilb(unsigned int flags)
 	if (ilb_cpu < 0)
 		return;

+	/*
+	 * Don't bother if no new NOHZ balance work items for ilb_cpu,
+	 * i.e. all bits in flags are already set in ilb_cpu.
+	 */
+	if ((atomic_read(nohz_flags(ilb_cpu)) & flags) == flags)
+		return;
+
 	/*
 	 * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
 	 * the first flag owns it; cleared by nohz_csd_func().
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@ -172,19 +172,13 @@ static void cpuidle_idle_call(void)

 	/*
 	 * Check if the idle task must be rescheduled. If it is the
-	 * case, exit the function after re-enabling the local irq.
+	 * case, exit the function after re-enabling the local IRQ.
 	 */
 	if (need_resched()) {
 		local_irq_enable();
 		return;
 	}

-	/*
-	 * The RCU framework needs to be told that we are entering an idle
-	 * section, so no more rcu read side critical sections and one more
-	 * step to the grace period
-	 */
-
 	if (cpuidle_not_available(drv, dev)) {
 		tick_nohz_idle_stop_tick();

@ -244,7 +238,7 @@ exit_idle:
 	__current_set_polling();

 	/*
-	 * It is up to the idle functions to reenable local interrupts
+	 * It is up to the idle functions to re-enable local interrupts
 	 */
 	if (WARN_ON_ONCE(irqs_disabled()))
 		local_irq_enable();
@ -320,7 +314,7 @@ static void do_idle(void)
 		rcu_nocb_flush_deferred_wakeup();

 		/*
-		 * In poll mode we reenable interrupts and spin. Also if we
+		 * In poll mode we re-enable interrupts and spin. Also if we
 		 * detected in the wakeup from idle path that the tick
 		 * broadcast device expired for us, we don't want to go deep
 		 * idle as we know that the IPI is going to arrive right away.
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@ -45,7 +45,7 @@
 *    again, being late doesn't loose the delta, just wrecks the sample.
 *
 *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because
- *    this would add another cross-CPU cacheline miss and atomic operation
+ *    this would add another cross-CPU cache-line miss and atomic operation
 *    to the wakeup path. Instead we increment on whatever CPU the task ran
 *    when it went into uninterruptible state and decrement on whatever CPU
 *    did the wakeup. This means that only the sum of nr_uninterruptible over
@ -62,7 +62,7 @@ EXPORT_SYMBOL(avenrun); /* should be removed */

 /**
 * get_avenrun - get the load average array
- * @loads:	pointer to dest load array
+ * @loads:	pointer to destination load array
 * @offset:	offset to add
 * @shift:	shift count to shift the result left
 *
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@ -417,7 +417,7 @@ int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)

 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
 /*
- * irq:
+ * IRQ:
 *
 *   util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
 *   util_sum = cpu_scale * load_sum
@ -432,7 +432,7 @@ int update_irq_load_avg(struct rq *rq, u64 running)
 	int ret = 0;

 	/*
-	 * We can't use clock_pelt because irq time is not accounted in
+	 * We can't use clock_pelt because IRQ time is not accounted in
 	 * clock_task. Instead we directly scale the running time to
 	 * reflect the real amount of computation
 	 */
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@ -41,7 +41,7 @@
 * What it means for a task to be productive is defined differently
 * for each resource. For IO, productive means a running task. For
 * memory, productive means a running task that isn't a reclaimer. For
- * CPU, productive means an oncpu task.
+ * CPU, productive means an on-CPU task.
 *
 * Naturally, the FULL state doesn't exist for the CPU resource at the
 * system level, but exist at the cgroup level. At the cgroup level,
@ -49,7 +49,7 @@
 * resource which is being used by others outside of the cgroup or
 * throttled by the cgroup cpu.max configuration.
 *
- * The percentage of wallclock time spent in those compound stall
+ * The percentage of wall clock time spent in those compound stall
 * states gives pressure numbers between 0 and 100 for each resource,
 * where the SOME percentage indicates workload slowdowns and the FULL
 * percentage indicates reduced CPU utilization:
@ -218,28 +218,32 @@ void __init psi_init(void)
 	group_init(&psi_system);
 }

-static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu)
+static u32 test_states(unsigned int *tasks, u32 state_mask)
 {
-	switch (state) {
-	case PSI_IO_SOME:
-		return unlikely(tasks[NR_IOWAIT]);
-	case PSI_IO_FULL:
-		return unlikely(tasks[NR_IOWAIT] && !tasks[NR_RUNNING]);
-	case PSI_MEM_SOME:
-		return unlikely(tasks[NR_MEMSTALL]);
-	case PSI_MEM_FULL:
-		return unlikely(tasks[NR_MEMSTALL] &&
-			tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
-	case PSI_CPU_SOME:
-		return unlikely(tasks[NR_RUNNING] > oncpu);
-	case PSI_CPU_FULL:
-		return unlikely(tasks[NR_RUNNING] && !oncpu);
-	case PSI_NONIDLE:
-		return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
-			tasks[NR_RUNNING];
-	default:
-		return false;
+	const bool oncpu = state_mask & PSI_ONCPU;
+
+	if (tasks[NR_IOWAIT]) {
+		state_mask |= BIT(PSI_IO_SOME);
+		if (!tasks[NR_RUNNING])
+			state_mask |= BIT(PSI_IO_FULL);
 	}
+
+	if (tasks[NR_MEMSTALL]) {
+		state_mask |= BIT(PSI_MEM_SOME);
+		if (tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING])
+			state_mask |= BIT(PSI_MEM_FULL);
+	}
+
+	if (tasks[NR_RUNNING] > oncpu)
+		state_mask |= BIT(PSI_CPU_SOME);
+
+	if (tasks[NR_RUNNING] && !oncpu)
+		state_mask |= BIT(PSI_CPU_FULL);
+
+	if (tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING])
+		state_mask |= BIT(PSI_NONIDLE);
+
+	return state_mask;
 }

 static void get_recent_times(struct psi_group *group, int cpu,
@ -345,7 +349,7 @@ static void collect_percpu_times(struct psi_group *group,

 	/*
 	 * Collect the per-cpu time buckets and average them into a
-	 * single time sample that is normalized to wallclock time.
+	 * single time sample that is normalized to wall clock time.
 	 *
 	 * For averaging, each CPU is weighted by its non-idle time in
 	 * the sampling period. This eliminates artifacts from uneven
@ -770,7 +774,6 @@ static void psi_group_change(struct psi_group *group, int cpu,
 {
 	struct psi_group_cpu *groupc;
 	unsigned int t, m;
-	enum psi_states s;
 	u32 state_mask;

 	lockdep_assert_rq_held(cpu_rq(cpu));
@ -842,10 +845,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
 		return;
 	}

-	for (s = 0; s < NR_PSI_STATES; s++) {
-		if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU))
-			state_mask |= (1 << s);
-	}
+	state_mask = test_states(groupc->tasks, state_mask);

 	/*
 	 * Since we care about lost potential, a memstall is FULL
@ -1205,7 +1205,7 @@ void psi_cgroup_restart(struct psi_group *group)
 	/*
 	 * After we disable psi_group->enabled, we don't actually
 	 * stop percpu tasks accounting in each psi_group_cpu,
-	 * instead only stop test_state() loop, record_times()
+	 * instead only stop test_states() loop, record_times()
 	 * and averaging worker, see psi_group_change() for details.
 	 *
 	 * When disable cgroup PSI, this function has nothing to sync
@ -1213,7 +1213,7 @@ void psi_cgroup_restart(struct psi_group *group)
 	 * would see !psi_group->enabled and only do task accounting.
 	 *
 	 * When re-enable cgroup PSI, this function use psi_group_change()
-	 * to get correct state mask from test_state() loop on tasks[],
+	 * to get correct state mask from test_states() loop on tasks[],
 	 * and restart groupc->state_start from now, use .clear = .set = 0
 	 * here since no task status really changed.
 	 */
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@ -140,7 +140,7 @@ void init_rt_rq(struct rt_rq *rt_rq)
 		INIT_LIST_HEAD(array->queue + i);
 		__clear_bit(i, array->bitmap);
 	}
-	/* delimiter for bitsearch: */
+	/* delimiter for bit-search: */
 	__set_bit(MAX_RT_PRIO, array->bitmap);

 #if defined CONFIG_SMP
@ -1135,7 +1135,7 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)

 		/*
 		 * This may have been our highest task, and therefore
-		 * we may have some recomputation to do
+		 * we may have some re-computation to do
 		 */
 		if (prio == prev_prio) {
 			struct rt_prio_array *array = &rt_rq->active;
@ -1571,7 +1571,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int flags)
 	 *
 	 * For equal prio tasks, we just let the scheduler sort it out.
 	 *
-	 * Otherwise, just let it ride on the affined RQ and the
+	 * Otherwise, just let it ride on the affine RQ and the
 	 * post-schedule router will push the preempted task away
 	 *
 	 * This test is optimistic, if we get it wrong the load-balancer
@ -2147,14 +2147,14 @@ static void push_rt_tasks(struct rq *rq)
 * if its the only CPU with multiple RT tasks queued, and a large number
 * of CPUs scheduling a lower priority task at the same time.
 *
- * Each root domain has its own irq work function that can iterate over
+ * Each root domain has its own IRQ work function that can iterate over
 * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
 * task must be checked if there's one or many CPUs that are lowering
- * their priority, there's a single irq work iterator that will try to
+ * their priority, there's a single IRQ work iterator that will try to
 * push off RT tasks that are waiting to run.
 *
 * When a CPU schedules a lower priority task, it will kick off the
- * irq work iterator that will jump to each CPU with overloaded RT tasks.
+ * IRQ work iterator that will jump to each CPU with overloaded RT tasks.
 * As it only takes the first CPU that schedules a lower priority task
 * to start the process, the rto_start variable is incremented and if
 * the atomic result is one, then that CPU will try to take the rto_lock.
@ -2162,7 +2162,7 @@ static void push_rt_tasks(struct rq *rq)
 * CPUs scheduling lower priority tasks.
 *
 * All CPUs that are scheduling a lower priority task will increment the
- * rt_loop_next variable. This will make sure that the irq work iterator
+ * rt_loop_next variable. This will make sure that the IRQ work iterator
 * checks all RT overloaded CPUs whenever a CPU schedules a new lower
 * priority task, even if the iterator is in the middle of a scan. Incrementing
 * the rt_loop_next will cause the iterator to perform another scan.
@ -2242,7 +2242,7 @@ static void tell_cpu_to_push(struct rq *rq)
 	 * The rto_cpu is updated under the lock, if it has a valid CPU
 	 * then the IPI is still running and will continue due to the
 	 * update to loop_next, and nothing needs to be done here.
-	 * Otherwise it is finishing up and an ipi needs to be sent.
+	 * Otherwise it is finishing up and an IPI needs to be sent.
 	 */
 	if (rq->rd->rto_cpu < 0)
 		cpu = rto_next_cpu(rq->rd);
@ -2594,7 +2594,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
 	watchdog(rq, p);

 	/*
-	 * RR tasks need a special form of timeslice management.
+	 * RR tasks need a special form of time-slice management.
 	 * FIFO tasks have no timeslices.
 	 */
 	if (p->policy != SCHED_RR)
@ -2900,7 +2900,7 @@ static int sched_rt_global_constraints(void)

 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
 {
-	/* Don't accept realtime tasks when there is no way for them to run */
+	/* Don't accept real-time tasks when there is no way for them to run */
 	if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
 		return 0;

@ -3001,7 +3001,7 @@ static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
 	ret = proc_dointvec(table, write, buffer, lenp, ppos);
 	/*
 	 * Make sure that internally we keep jiffies.
-	 * Also, writing zero resets the timeslice to default:
+	 * Also, writing zero resets the time-slice to default:
 	 */
 	if (!ret && write) {
 		sched_rr_timeslice =
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@ -224,7 +224,7 @@ static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t)
 /*
 * Called when a task finally hits the CPU.  We can now calculate how
 * long it was waiting to run.  We also note when it began so that we
- * can keep stats on how long its timeslice is.
+ * can keep stats on how long its time-slice is.
 */
 static void sched_info_arrive(struct rq *rq, struct task_struct *t)
 {
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@ -501,7 +501,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
 		cpumask_clear_cpu(rq->cpu, old_rd->span);

 		/*
-		 * If we dont want to free the old_rd yet then
+		 * If we don't want to free the old_rd yet then
 		 * set old_rd to NULL to skip the freeing later
 		 * in this function:
 		 */
@ -1176,7 +1176,7 @@ fail:
 * uniquely identify each group (for a given domain):
 *
 *  - The first is the balance_cpu (see should_we_balance() and the
- *    load-balance blub in fair.c); for each group we only want 1 CPU to
+ *    load-balance blurb in fair.c); for each group we only want 1 CPU to
 *    continue balancing at a higher domain.
 *
 *  - The second is the sched_group_capacity; we want all identical groups
@ -1388,7 +1388,7 @@ static inline void asym_cpu_capacity_update_data(int cpu)

 	/*
 	 * Search if capacity already exits. If not, track which the entry
-	 * where we should insert to keep the list ordered descendingly.
+	 * where we should insert to keep the list ordered descending.
 	 */
 	list_for_each_entry(entry, &asym_cap_list, link) {
 		if (capacity == entry->capacity)
@ -1853,7 +1853,7 @@ void sched_init_numa(int offline_node)
 	struct cpumask ***masks;

 	/*
-	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+	 * O(nr_nodes^2) de-duplicating selection sort -- in order to find the
 	 * unique distances in the node_distance() table.
 	 */
 	distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
@ -2750,7 +2750,7 @@ match2:
 	}

 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
-	/* Build perf. domains: */
+	/* Build perf domains: */
 	for (i = 0; i < ndoms_new; i++) {
 		for (j = 0; j < n && !sched_energy_update; j++) {
 			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
@ -2759,7 +2759,7 @@ match2:
 				goto match3;
 			}
 		}
-		/* No match - add perf. domains for a new rd */
+		/* No match - add perf domains for a new rd */
 		has_eas |= build_perf_domains(doms_new[i]);
 match3:
 		;
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@ -33,7 +33,7 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync
 EXPORT_SYMBOL(wake_bit_function);

 /*
- * To allow interruptible waiting and asynchronous (i.e. nonblocking)
+ * To allow interruptible waiting and asynchronous (i.e. non-blocking)
 * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are
 * permitted return codes. Nonzero return codes halt waiting and return.
 */
@ -133,7 +133,7 @@ EXPORT_SYMBOL(__wake_up_bit);
 * @bit: the bit of the word being waited on
 *
 * There is a standard hashed waitqueue table for generic use. This
- * is the part of the hashtable's accessor API that wakes up waiters
+ * is the part of the hash-table's accessor API that wakes up waiters
 * on a bit. For instance, if one were to have waiters on a bitflag,
 * one would call wake_up_bit() after clearing the bit.
 *