From 8cec3dd9e5930c82c6bd0af3fdb3a36bcd428310 Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Fri, 16 Feb 2024 11:44:33 +0530 Subject: [PATCH 01/13] sched/core: Simplify code by removing duplicate #ifdefs There's a few cases of nested #ifdefs in the scheduler code that can be simplified: #ifdef DEFINE_A ...code block... #ifdef DEFINE_A <-- This is a duplicate. ...code block... #endif #else #ifndef DEFINE_A <-- This is also duplicate. ...code block... #endif #endif More details about the script and methods used to find these code patterns can be found at: https://lore.kernel.org/all/20240118080326.13137-1-sshegde@linux.ibm.com/ No change in functionality intended. [ mingo: Clarified the changelog. ] Signed-off-by: Shrikanth Hegde Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240216061433.535522-1-sshegde@linux.ibm.com --- kernel/sched/core.c | 4 +--- kernel/sched/fair.c | 2 -- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9116bcc90346..a76c7095f736 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1792,7 +1792,6 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css); #endif #ifdef CONFIG_SYSCTL -#ifdef CONFIG_UCLAMP_TASK #ifdef CONFIG_UCLAMP_TASK_GROUP static void uclamp_update_root_tg(void) { @@ -1898,7 +1897,6 @@ undo: return result; } #endif -#endif static int uclamp_validate(struct task_struct *p, const struct sched_attr *attr) @@ -2065,7 +2063,7 @@ static void __init init_uclamp(void) } } -#else /* CONFIG_UCLAMP_TASK */ +#else /* !CONFIG_UCLAMP_TASK */ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { } static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } static inline int uclamp_validate(struct task_struct *p, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 533547e3c90a..8e30e2bb77a0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10182,10 +10182,8 @@ static int idle_cpu_without(int cpu, struct task_struct *p) * be computed and tested before calling idle_cpu_without(). */ -#ifdef CONFIG_SMP if (rq->ttwu_pending) return 0; -#endif return 1; } From 8b936fc1d84f7d70009ea34d66bbf6c54c09fae7 Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Mon, 1 Jan 2024 21:16:23 +0530 Subject: [PATCH 02/13] sched/fair: Use existing helper functions to access ->avg_rt and ->avg_dl There are helper functions called cpu_util_dl() and cpu_util_rt() which give the average utilization of DL and RT respectively. But there are a few places in code where access to these variables is open-coded. Instead use the helper function so that code becomes simpler and easier to maintain later on. No functional changes intended. 
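[ Illustration, not part of the patch: a standalone userspace sketch of the accessor-helper pattern being applied here, using simplified stand-in types rather than the real struct rq / struct sched_avg. Readers go through one small inline helper instead of open-coding the field access, so any access rule (such as the READ_ONCE() added for ->avg_irq in the next patch) only has to live in one place. ]

#include <stdio.h>

struct avg { unsigned long util_avg; };
struct rq  { struct avg avg_rt, avg_dl; };

/* These helpers play the role of cpu_util_rt()/cpu_util_dl(). */
static inline unsigned long util_rt(const struct rq *rq) { return rq->avg_rt.util_avg; }
static inline unsigned long util_dl(const struct rq *rq) { return rq->avg_dl.util_avg; }

/* Callers compose the helpers instead of reaching into the fields. */
static int others_have_blocked(const struct rq *rq)
{
	return util_rt(rq) || util_dl(rq);
}

int main(void)
{
	struct rq rq = { .avg_rt = { 128 }, .avg_dl = { 0 } };

	printf("rt=%lu dl=%lu blocked=%d\n",
	       util_rt(&rq), util_dl(&rq), others_have_blocked(&rq));
	return 0;
}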
Signed-off-by: Shrikanth Hegde Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240101154624.100981-2-sshegde@linux.vnet.ibm.com --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8e30e2bb77a0..127e727fb7b4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9237,10 +9237,10 @@ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) static inline bool others_have_blocked(struct rq *rq) { - if (READ_ONCE(rq->avg_rt.util_avg)) + if (cpu_util_rt(rq)) return true; - if (READ_ONCE(rq->avg_dl.util_avg)) + if (cpu_util_dl(rq)) return true; if (thermal_load_avg(rq)) @@ -9506,8 +9506,8 @@ static unsigned long scale_rt_capacity(int cpu) * avg_thermal.load_avg tracks thermal pressure and the weighted * average uses the actual delta max capacity(load). */ - used = READ_ONCE(rq->avg_rt.util_avg); - used += READ_ONCE(rq->avg_dl.util_avg); + used = cpu_util_rt(rq); + used += cpu_util_dl(rq); used += thermal_load_avg(rq); if (unlikely(used >= max)) From a6965b31888501f889261a6783f0de6afff84f8d Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Mon, 1 Jan 2024 21:16:24 +0530 Subject: [PATCH 03/13] sched/fair: Add READ_ONCE() and use existing helper function to access ->avg_irq Use existing helper function cpu_util_irq() instead of open-coding access to ->avg_irq. During review it was noted that ->avg_irq could be updated by a different CPU than the one which is trying to access it. ->avg_irq is updated with WRITE_ONCE(), use READ_ONCE to access it in order to avoid any compiler optimizations. Signed-off-by: Shrikanth Hegde Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240101154624.100981-3-sshegde@linux.vnet.ibm.com --- kernel/sched/fair.c | 4 +--- kernel/sched/sched.h | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 127e727fb7b4..ba3633940f6f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9246,10 +9246,8 @@ static inline bool others_have_blocked(struct rq *rq) if (thermal_load_avg(rq)) return true; -#ifdef CONFIG_HAVE_SCHED_AVG_IRQ - if (READ_ONCE(rq->avg_irq.util_avg)) + if (cpu_util_irq(rq)) return true; -#endif return false; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 001fe047bd5d..d2242679239e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3136,7 +3136,7 @@ static inline bool uclamp_rq_is_idle(struct rq *rq) #ifdef CONFIG_HAVE_SCHED_AVG_IRQ static inline unsigned long cpu_util_irq(struct rq *rq) { - return rq->avg_irq.util_avg; + return READ_ONCE(rq->avg_irq.util_avg); } static inline From 8aeaffef8c6eceab0e1498486fdd4f3dc3b7066c Mon Sep 17 00:00:00 2001 From: Keisuke Nishimura Date: Wed, 10 Jan 2024 14:17:06 +0100 Subject: [PATCH 04/13] sched/fair: Take the scheduling domain into account in select_idle_smt() When picking a CPU on task wakeup, select_idle_smt() has to take into account the scheduling domain of @target. This is because the "isolcpus" kernel command line option can remove CPUs from the domain to isolate them from other SMT siblings. This fix checks if the candidate CPU is in the target scheduling domain. Commit: df3cb4ea1fb6 ("sched/fair: Fix wrong cpu selecting from isolated domain") ... originally introduced this fix by adding the check of the scheduling domain in the loop. However, commit: 3e6efe87cd5cc ("sched/fair: Remove redundant check in select_idle_smt()") ... 
accidentally removed the check. Bring it back. Fixes: 3e6efe87cd5c ("sched/fair: Remove redundant check in select_idle_smt()") Signed-off-by: Keisuke Nishimura Signed-off-by: Julia Lawall Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240110131707.437301-1-keisuke.nishimura@inria.fr --- kernel/sched/fair.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ba3633940f6f..005f6d31e8f6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7311,13 +7311,19 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu /* * Scan the local SMT mask for idle CPUs. */ -static int select_idle_smt(struct task_struct *p, int target) +static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) { int cpu; for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) { if (cpu == target) continue; + /* + * Check if the CPU is in the LLC scheduling domain of @target. + * Due to isolcpus, there is no guarantee that all the siblings are in the domain. + */ + if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) + continue; if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) return cpu; } @@ -7341,7 +7347,7 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma return __select_idle_cpu(core, p); } -static inline int select_idle_smt(struct task_struct *p, int target) +static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) { return -1; } @@ -7591,7 +7597,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) has_idle_core = test_idle_cores(target); if (!has_idle_core && cpus_share_cache(prev, target)) { - i = select_idle_smt(p, prev); + i = select_idle_smt(p, sd, prev); if ((unsigned int)i < nr_cpumask_bits) return i; } From 23d04d8c6b8ec339057264659b7834027f3e6a63 Mon Sep 17 00:00:00 2001 From: Keisuke Nishimura Date: Wed, 10 Jan 2024 14:17:07 +0100 Subject: [PATCH 05/13] sched/fair: Take the scheduling domain into account in select_idle_core() When picking a CPU on task wakeup, select_idle_core() has to take into account the scheduling domain where the function looks for the CPU. This is because the "isolcpus" kernel command line option can remove CPUs from the domain to isolate them from other SMT siblings. This change replaces the set of CPUs allowed to run the task from p->cpus_ptr by the intersection of p->cpus_ptr and sched_domain_span(sd) which is stored in the 'cpus' argument provided by select_idle_cpu(). 
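[ Illustration, not part of the patch: a tiny userspace model of why the intersection matters, using plain 64-bit masks in place of struct cpumask. A candidate CPU must be allowed by the task's affinity mask *and* lie inside the domain being scanned; CPUs isolated with "isolcpus" are absent from the domain span even when the task's mask still contains them. ]

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t task_allowed = 0xffULL;  /* task may run on CPUs 0-7      */
	uint64_t domain_span  = 0x0fULL;  /* scanned domain spans CPUs 0-3 */

	/* Models cpumask_and(cpus, p->cpus_ptr, sched_domain_span(sd)). */
	uint64_t cpus = task_allowed & domain_span;

	for (int cpu = 0; cpu < 8; cpu++) {
		/* Models cpumask_test_cpu(cpu, cpus) in select_idle_core(). */
		if (cpus & (1ULL << cpu))
			printf("CPU%d: candidate\n", cpu);
		else
			printf("CPU%d: outside the domain, skip\n", cpu);
	}
	return 0;
}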
Fixes: 9fe1f127b913 ("sched/fair: Merge select_idle_core/cpu()") Signed-off-by: Keisuke Nishimura Signed-off-by: Julia Lawall Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240110131707.437301-2-keisuke.nishimura@inria.fr --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 005f6d31e8f6..352222d09c90 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7289,7 +7289,7 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu if (!available_idle_cpu(cpu)) { idle = false; if (*idle_cpu == -1) { - if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) { + if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) { *idle_cpu = cpu; break; } @@ -7297,7 +7297,7 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu } break; } - if (*idle_cpu == -1 && cpumask_test_cpu(cpu, p->cpus_ptr)) + if (*idle_cpu == -1 && cpumask_test_cpu(cpu, cpus)) *idle_cpu = cpu; } From 9dfbc26d27aaf0f5958c5972188f16fe977e5af5 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 5 Feb 2024 22:39:19 -0600 Subject: [PATCH 06/13] sched/fair: Remove unnecessary goto in update_sd_lb_stats() In update_sd_lb_stats(), when we're iterating over the sched groups that comprise a sched domain, we're skipping the call to update_sd_pick_busiest() for the sched group that contains the local / destination CPU. We use a goto to skip the call, but we could just as easily check !local_group, as there's no other logic that we need to skip with the goto. Let's remove the goto, and check for !local_group in the if statement instead. Signed-off-by: David Vernet Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Reviewed-by: Valentin Schneider Link: https://lore.kernel.org/r/20240206043921.850302-2-void@manifault.com --- kernel/sched/fair.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 352222d09c90..41dda5311770 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10580,16 +10580,11 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd update_sg_lb_stats(env, sds, sg, sgs, &sg_status); - if (local_group) - goto next_group; - - - if (update_sd_pick_busiest(env, sds, sg, sgs)) { + if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; sds->busiest_stat = *sgs; } -next_group: /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity; From 7f1a7229718d788f26a711374da83adc2689837f Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 5 Feb 2024 22:39:20 -0600 Subject: [PATCH 07/13] sched/fair: Do strict inequality check for busiest misfit task group In update_sd_pick_busiest(), when comparing two sched groups that are both of type group_misfit_task, we currently consider the new group as busier than the current busiest group even if the new group has the same misfit task load as the current busiest group. We can avoid some unnecessary writes if we instead only consider the newest group to be the busiest if it has a higher load than the current busiest. This matches the behavior of other group types where we compare load, such as two groups that are both overloaded. Let's update the group_misfit_task type comparison to also only update the busiest group in the event of strict inequality. 
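[ Toy illustration with invented numbers, not kernel code: once "equal" no longer counts as busier, sds->busiest and the copied statistics are rewritten only when the candidate's misfit load is a strict improvement. ]

#include <stdio.h>

struct stats { unsigned long misfit_load; };

/* Mirrors the new comparison: only a strictly larger misfit load wins. */
static int is_busier(const struct stats *candidate, const struct stats *busiest)
{
	return candidate->misfit_load > busiest->misfit_load;
}

int main(void)
{
	struct stats busiest = { .misfit_load = 1024 };
	struct stats equal   = { .misfit_load = 1024 };
	struct stats bigger  = { .misfit_load = 2048 };

	printf("equal candidate replaces busiest:  %d\n", is_busier(&equal, &busiest));  /* 0 */
	printf("bigger candidate replaces busiest: %d\n", is_busier(&bigger, &busiest)); /* 1 */
	return 0;
}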
Signed-off-by: David Vernet Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Reviewed-by: Valentin Schneider Link: https://lore.kernel.org/r/20240206043921.850302-3-void@manifault.com --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 41dda5311770..448520f4fe83 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10032,7 +10032,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, * If we have more than one misfit sg go with the biggest * misfit. */ - if (sgs->group_misfit_task_load < busiest->group_misfit_task_load) + if (sgs->group_misfit_task_load <= busiest->group_misfit_task_load) return false; break; From 7e9f7d17fe6c23432fda9a3648a858b7589cb4aa Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 5 Feb 2024 22:39:21 -0600 Subject: [PATCH 08/13] sched/fair: Simplify the update_sd_pick_busiest() logic When comparing the current struct sched_group with the yet-busiest domain in update_sd_pick_busiest(), if the two groups have the same group type, we're currently doing a bit of unnecessary work for any group >= group_misfit_task. We're comparing the two groups, and then returning only if false (the group in question is not the busiest). Otherwise, we break out, do an extra unnecessary conditional check that's vacuously false for any group type > group_fully_busy, and then always return true. Let's just return directly in the switch statement instead. This doesn't change the size of vmlinux with llvm 17 (not surprising given that all of this is inlined in load_balance()), but it does shrink load_balance() by 88 bytes on x86. Given that it also improves readability, this seems worth doing. Signed-off-by: David Vernet Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Reviewed-by: Valentin Schneider Link: https://lore.kernel.org/r/20240206043921.850302-4-void@manifault.com --- kernel/sched/fair.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 448520f4fe83..51fe17f10ef8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10010,9 +10010,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, switch (sgs->group_type) { case group_overloaded: /* Select the overloaded group with highest avg_load. */ - if (sgs->avg_load <= busiest->avg_load) - return false; - break; + return sgs->avg_load > busiest->avg_load; case group_imbalanced: /* @@ -10023,18 +10021,14 @@ static bool update_sd_pick_busiest(struct lb_env *env, case group_asym_packing: /* Prefer to move from lowest priority CPU's work */ - if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu)) - return false; - break; + return sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu); case group_misfit_task: /* * If we have more than one misfit sg go with the biggest * misfit. */ - if (sgs->group_misfit_task_load <= busiest->group_misfit_task_load) - return false; - break; + return sgs->group_misfit_task_load > busiest->group_misfit_task_load; case group_smt_balance: /* From d654c8ddde84b9d1a30a40917e588b5a1e53dada Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Sat, 10 Feb 2024 19:39:19 +0800 Subject: [PATCH 09/13] sched/topology: Remove duplicate descriptions from TOPOLOGY_SD_FLAGS These flags are already documented in include/linux/sched/sd_flags.h. Also, add missing SD_CLUSTER and keep the comment on SD_ASYM_PACKING as it is a special case. 
Suggested-by: Ricardo Neri Signed-off-by: Alex Shi Signed-off-by: Ingo Molnar Reviewed-by: Ricardo Neri Reviewed-by: Valentin Schneider Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240210113924.1130448-1-alexs@kernel.org --- kernel/sched/topology.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 10d1391e7416..0b33f7b05d21 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1551,11 +1551,12 @@ static struct cpumask ***sched_domains_numa_masks; * * These flags are purely descriptive of the topology and do not prescribe * behaviour. Behaviour is artificial and mapped in the below sd_init() - * function: + * function. For details, see include/linux/sched/sd_flags.h. * - * SD_SHARE_CPUCAPACITY - describes SMT topologies - * SD_SHARE_PKG_RESOURCES - describes shared caches - * SD_NUMA - describes NUMA topologies + * SD_SHARE_CPUCAPACITY + * SD_SHARE_PKG_RESOURCES + * SD_CLUSTER + * SD_NUMA * * Odd one out, which beside describing the topology has a quirk also * prescribes the desired behaviour that goes along with it: From 5a64983731566f3b102b4ed12445b8a1b2f46a46 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Sat, 10 Feb 2024 19:39:20 +0800 Subject: [PATCH 10/13] sched/fair: Remove unused parameter from sched_asym() The 'sds' argument is not used in the sched_asym() function anymore, remove it. Fixes: c9ca07886aaa ("sched/fair: Do not even the number of busy CPUs via asym_packing") Signed-off-by: Alex Shi Signed-off-by: Ingo Molnar Reviewed-by: Ricardo Neri Reviewed-by: Valentin Schneider Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240210113924.1130448-2-alexs@kernel.org --- kernel/sched/fair.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 51fe17f10ef8..300d1bfe6d2b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9753,7 +9753,6 @@ static bool sched_use_asym_prio(struct sched_domain *sd, int cpu) /** * sched_asym - Check if the destination CPU can do asym_packing load balance * @env: The load balancing environment - * @sds: Load-balancing data with statistics of the local group * @sgs: Load-balancing statistics of the candidate busiest group * @group: The candidate busiest group * @@ -9772,8 +9771,7 @@ static bool sched_use_asym_prio(struct sched_domain *sd, int cpu) * otherwise. */ static inline bool -sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs, - struct sched_group *group) +sched_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group *group) { /* Ensure that the whole local core is idle, if applicable. */ if (!sched_use_asym_prio(env->sd, env->dst_cpu)) @@ -9944,7 +9942,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, /* Check if dst CPU is idle and preferred to this group */ if (!local_group && env->sd->flags & SD_ASYM_PACKING && env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running && - sched_asym(env, sds, sgs, group)) { + sched_asym(env, sgs, group)) { sgs->group_asym_packing = 1; } From 45de20623475049c424bc0b89f42efca54995edd Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Sat, 10 Feb 2024 19:39:21 +0800 Subject: [PATCH 11/13] sched/fair: Rework sched_use_asym_prio() and sched_asym_prefer() sched_use_asym_prio() and sched_asym_prefer() are used together in various places. Consolidate them into a single function sched_asym(). 
The existing sched_asym() function is only used when collecting statistics of a scheduling group. Rename it as sched_group_asym(), and remove the obsolete function description. This makes the code easier to read. No functional changes. Signed-off-by: Alex Shi Signed-off-by: Ingo Molnar Tested-by: Ricardo Neri Reviewed-by: Ricardo Neri Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240210113924.1130448-3-alexs@kernel.org --- kernel/sched/fair.c | 45 ++++++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 300d1bfe6d2b..475e2ca66b63 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9750,8 +9750,18 @@ static bool sched_use_asym_prio(struct sched_domain *sd, int cpu) return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu); } +static inline bool sched_asym(struct sched_domain *sd, int dst_cpu, int src_cpu) +{ + /* + * First check if @dst_cpu can do asym_packing load balance. Only do it + * if it has higher priority than @src_cpu. + */ + return sched_use_asym_prio(sd, dst_cpu) && + sched_asym_prefer(dst_cpu, src_cpu); +} + /** - * sched_asym - Check if the destination CPU can do asym_packing load balance + * sched_group_asym - Check if the destination CPU can do asym_packing balance * @env: The load balancing environment * @sgs: Load-balancing statistics of the candidate busiest group * @group: The candidate busiest group @@ -9759,34 +9769,21 @@ static bool sched_use_asym_prio(struct sched_domain *sd, int cpu) * @env::dst_cpu can do asym_packing if it has higher priority than the * preferred CPU of @group. * - * SMT is a special case. If we are balancing load between cores, @env::dst_cpu - * can do asym_packing balance only if all its SMT siblings are idle. Also, it - * can only do it if @group is an SMT group and has exactly on busy CPU. Larger - * imbalances in the number of CPUS are dealt with in find_busiest_group(). - * - * If we are balancing load within an SMT core, or at PKG domain level, always - * proceed. - * * Return: true if @env::dst_cpu can do with asym_packing load balance. False * otherwise. */ static inline bool -sched_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group *group) +sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group *group) { - /* Ensure that the whole local core is idle, if applicable. */ - if (!sched_use_asym_prio(env->sd, env->dst_cpu)) - return false; - /* - * CPU priorities does not make sense for SMT cores with more than one + * CPU priorities do not make sense for SMT cores with more than one * busy sibling. 
*/ - if (group->flags & SD_SHARE_CPUCAPACITY) { - if (sgs->group_weight - sgs->idle_cpus != 1) - return false; - } + if ((group->flags & SD_SHARE_CPUCAPACITY) && + (sgs->group_weight - sgs->idle_cpus != 1)) + return false; - return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu); + return sched_asym(env->sd, env->dst_cpu, group->asym_prefer_cpu); } /* One group has more than one SMT CPU while the other group does not */ @@ -9942,7 +9939,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, /* Check if dst CPU is idle and preferred to this group */ if (!local_group && env->sd->flags & SD_ASYM_PACKING && env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running && - sched_asym(env, sgs, group)) { + sched_group_asym(env, sgs, group)) { sgs->group_asym_packing = 1; } @@ -11028,8 +11025,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, * SMT cores with more than one busy sibling. */ if ((env->sd->flags & SD_ASYM_PACKING) && - sched_use_asym_prio(env->sd, i) && - sched_asym_prefer(i, env->dst_cpu) && + sched_asym(env->sd, i, env->dst_cpu) && nr_running == 1) continue; @@ -11899,8 +11895,7 @@ static void nohz_balancer_kick(struct rq *rq) * preferred CPU must be idle. */ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { - if (sched_use_asym_prio(sd, i) && - sched_asym_prefer(i, cpu)) { + if (sched_asym(sd, i, cpu)) { flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } From fbc449864e0d2ee2c16f3af2d1e9093b9b8d7ad0 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Sat, 10 Feb 2024 19:39:22 +0800 Subject: [PATCH 12/13] sched/fair: Check the SD_ASYM_PACKING flag in sched_use_asym_prio() sched_use_asym_prio() checks whether CPU priorities should be used. It makes sense to check for the SD_ASYM_PACKING() inside the function. Since both sched_asym() and sched_group_asym() use sched_use_asym_prio(), remove the now superfluous checks for the flag in various places. Signed-off-by: Alex Shi Signed-off-by: Ingo Molnar Tested-by: Ricardo Neri Reviewed-by: Ricardo Neri Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240210113924.1130448-4-alexs@kernel.org --- kernel/sched/fair.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 475e2ca66b63..39781a666c08 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9744,6 +9744,9 @@ group_type group_classify(unsigned int imbalance_pct, */ static bool sched_use_asym_prio(struct sched_domain *sd, int cpu) { + if (!(sd->flags & SD_ASYM_PACKING)) + return false; + if (!sched_smt_active()) return true; @@ -9937,11 +9940,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_weight = group->group_weight; /* Check if dst CPU is idle and preferred to this group */ - if (!local_group && env->sd->flags & SD_ASYM_PACKING && - env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running && - sched_group_asym(env, sgs, group)) { + if (!local_group && env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running && + sched_group_asym(env, sgs, group)) sgs->group_asym_packing = 1; - } /* Check for loaded SMT group to be balanced to dst CPU */ if (!local_group && smt_balance(env, sgs, group)) @@ -11024,9 +11025,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, * If balancing between cores, let lower priority CPUs help * SMT cores with more than one busy sibling. 
*/ - if ((env->sd->flags & SD_ASYM_PACKING) && - sched_asym(env->sd, i, env->dst_cpu) && - nr_running == 1) + if (sched_asym(env->sd, i, env->dst_cpu) && nr_running == 1) continue; switch (env->migration_type) { @@ -11122,8 +11121,7 @@ asym_active_balance(struct lb_env *env) * the lower priority @env::dst_cpu help it. Do not follow * CPU priority. */ - return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) && - sched_use_asym_prio(env->sd, env->dst_cpu) && + return env->idle != CPU_NOT_IDLE && sched_use_asym_prio(env->sd, env->dst_cpu) && (sched_asym_prefer(env->dst_cpu, env->src_cpu) || !sched_use_asym_prio(env->sd, env->src_cpu)); } From 54de442747037485da1fc4eca9636287a61e97e3 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Sat, 10 Feb 2024 19:39:23 +0800 Subject: [PATCH 13/13] sched/topology: Rename SD_SHARE_PKG_RESOURCES to SD_SHARE_LLC SD_SHARE_PKG_RESOURCES is a bit of a misnomer: its naming suggests that it's sharing all 'package resources' - while in reality it's specifically for sharing the LLC only. Rename it to SD_SHARE_LLC to reduce confusion. [ mingo: Rewrote the confusing changelog as well. ] Suggested-by: Valentin Schneider Signed-off-by: Alex Shi Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Reviewed-by: Ricardo Neri Reviewed-by: Barry Song Link: https://lore.kernel.org/r/20240210113924.1130448-5-alexs@kernel.org --- arch/powerpc/kernel/smp.c | 6 +++--- include/linux/sched/sd_flags.h | 4 ++-- include/linux/sched/topology.h | 6 +++--- kernel/sched/fair.c | 2 +- kernel/sched/topology.c | 28 ++++++++++++++-------------- 5 files changed, 23 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 693334c20d07..a60e4139214b 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -984,7 +984,7 @@ static bool shared_caches __ro_after_init; /* cpumask of CPUs with asymmetric SMT dependency */ static int powerpc_smt_flags(void) { - int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES; + int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_LLC; if (cpu_has_feature(CPU_FTR_ASYM_SMT)) { printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); @@ -1010,9 +1010,9 @@ static __ro_after_init DEFINE_STATIC_KEY_FALSE(splpar_asym_pack); static int powerpc_shared_cache_flags(void) { if (static_branch_unlikely(&splpar_asym_pack)) - return SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING; + return SD_SHARE_LLC | SD_ASYM_PACKING; - return SD_SHARE_PKG_RESOURCES; + return SD_SHARE_LLC; } static int powerpc_shared_proc_flags(void) diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h index a8b28647aafc..b04a5d04dee9 100644 --- a/include/linux/sched/sd_flags.h +++ b/include/linux/sched/sd_flags.h @@ -117,13 +117,13 @@ SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) SD_FLAG(SD_CLUSTER, SDF_NEEDS_GROUPS) /* - * Domain members share CPU package resources (i.e. caches) + * Domain members share CPU Last Level Caches * * SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share * the same cache(s). * NEEDS_GROUPS: Caches are shared between groups. 
*/ -SD_FLAG(SD_SHARE_PKG_RESOURCES, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) +SD_FLAG(SD_SHARE_LLC, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) /* * Only a single load balancing instance diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index a6e04b4a21d7..191b122158fb 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -38,21 +38,21 @@ extern const struct sd_flag_debug sd_flag_debug[]; #ifdef CONFIG_SCHED_SMT static inline int cpu_smt_flags(void) { - return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES; + return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC; } #endif #ifdef CONFIG_SCHED_CLUSTER static inline int cpu_cluster_flags(void) { - return SD_CLUSTER | SD_SHARE_PKG_RESOURCES; + return SD_CLUSTER | SD_SHARE_LLC; } #endif #ifdef CONFIG_SCHED_MC static inline int cpu_core_flags(void) { - return SD_SHARE_PKG_RESOURCES; + return SD_SHARE_LLC; } #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 39781a666c08..6a16129f9a5c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10678,7 +10678,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ if (local->group_type == group_has_spare) { if ((busiest->group_type > group_fully_busy) && - !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) { + !(env->sd->flags & SD_SHARE_LLC)) { /* * If busiest is overloaded, try to fill spare * capacity. This might end up creating spare capacity diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 0b33f7b05d21..99ea5986038c 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -657,13 +657,13 @@ static void destroy_sched_domains(struct sched_domain *sd) } /* - * Keep a special pointer to the highest sched_domain that has - * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this - * allows us to avoid some pointer chasing select_idle_sibling(). + * Keep a special pointer to the highest sched_domain that has SD_SHARE_LLC set + * (Last Level Cache Domain) for this allows us to avoid some pointer chasing + * select_idle_sibling(). * - * Also keep a unique ID per domain (we use the first CPU number in - * the cpumask of the domain), this allows us to quickly tell if - * two CPUs are in the same cache domain, see cpus_share_cache(). + * Also keep a unique ID per domain (we use the first CPU number in the cpumask + * of the domain), this allows us to quickly tell if two CPUs are in the same + * cache domain, see cpus_share_cache(). */ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); @@ -684,7 +684,7 @@ static void update_top_cache_domain(int cpu) int id = cpu; int size = 1; - sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); + sd = highest_flag_domain(cpu, SD_SHARE_LLC); if (sd) { id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); @@ -1554,7 +1554,7 @@ static struct cpumask ***sched_domains_numa_masks; * function. For details, see include/linux/sched/sd_flags.h. 
* * SD_SHARE_CPUCAPACITY - * SD_SHARE_PKG_RESOURCES + * SD_SHARE_LLC * SD_CLUSTER * SD_NUMA * @@ -1566,7 +1566,7 @@ static struct cpumask ***sched_domains_numa_masks; #define TOPOLOGY_SD_FLAGS \ (SD_SHARE_CPUCAPACITY | \ SD_CLUSTER | \ - SD_SHARE_PKG_RESOURCES | \ + SD_SHARE_LLC | \ SD_NUMA | \ SD_ASYM_PACKING) @@ -1609,7 +1609,7 @@ sd_init(struct sched_domain_topology_level *tl, | 0*SD_BALANCE_WAKE | 1*SD_WAKE_AFFINE | 0*SD_SHARE_CPUCAPACITY - | 0*SD_SHARE_PKG_RESOURCES + | 0*SD_SHARE_LLC | 0*SD_SERIALIZE | 1*SD_PREFER_SIBLING | 0*SD_NUMA @@ -1646,7 +1646,7 @@ sd_init(struct sched_domain_topology_level *tl, if (sd->flags & SD_SHARE_CPUCAPACITY) { sd->imbalance_pct = 110; - } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { + } else if (sd->flags & SD_SHARE_LLC) { sd->imbalance_pct = 117; sd->cache_nice_tries = 1; @@ -1671,7 +1671,7 @@ sd_init(struct sched_domain_topology_level *tl, * For all levels sharing cache; connect a sched_domain_shared * instance. */ - if (sd->flags & SD_SHARE_PKG_RESOURCES) { + if (sd->flags & SD_SHARE_LLC) { sd->shared = *per_cpu_ptr(sdd->sds, sd_id); atomic_inc(&sd->shared->ref); atomic_set(&sd->shared->nr_busy_cpus, sd_weight); @@ -2446,8 +2446,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { struct sched_domain *child = sd->child; - if (!(sd->flags & SD_SHARE_PKG_RESOURCES) && child && - (child->flags & SD_SHARE_PKG_RESOURCES)) { + if (!(sd->flags & SD_SHARE_LLC) && child && + (child->flags & SD_SHARE_LLC)) { struct sched_domain __rcu *top_p; unsigned int nr_llcs;
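[ Closing illustration for the sched_use_asym_prio() change in patch 12/13; a userspace sketch with arbitrary stand-in flag values, not the real scheduler code. Folding the SD_ASYM_PACKING test into the predicate means every caller inherits the precondition instead of restating it, which is what lets the callers in fair.c drop their explicit flag checks. ]

#include <stdbool.h>
#include <stdio.h>

#define SD_ASYM_PACKING      0x1  /* stand-in value for the example */
#define SD_SHARE_CPUCAPACITY 0x2  /* stand-in value for the example */

struct domain { unsigned int flags; };

/* The predicate now owns its precondition: no asym packing, no priorities. */
static bool use_asym_prio(const struct domain *sd)
{
	if (!(sd->flags & SD_ASYM_PACKING))
		return false;
	/* The SMT-specific conditions would follow here in the real code. */
	return true;
}

int main(void)
{
	struct domain smt = { .flags = SD_SHARE_CPUCAPACITY };
	struct domain pkg = { .flags = SD_ASYM_PACKING };

	/* Callers no longer repeat "(sd->flags & SD_ASYM_PACKING) && ...". */
	printf("SMT domain uses asym prio: %d\n", use_asym_prio(&smt)); /* 0 */
	printf("PKG domain uses asym prio: %d\n", use_asym_prio(&pkg)); /* 1 */
	return 0;
}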