mirror of
https://mirrors.bfsu.edu.cn/git/linux.git
synced 2024-11-23 20:24:12 +08:00
Scheduler changes for v6.7 are:
- Fair scheduler (SCHED_OTHER) improvements: - Remove the old and now unused SIS_PROP code & option - Scan cluster before LLC in the wake-up path - Use candidate prev/recent_used CPU if scanning failed for cluster wakeup - NUMA scheduling improvements: - Improve the VMA access-PID code to better skip/scan VMAs - Extend tracing to cover VMA-skipping decisions - Improve/fix the recently introduced sched_numa_find_nth_cpu() code - Generalize numa_map_to_online_node() - Energy scheduling improvements: - Remove the EM_MAX_COMPLEXITY limit - Add tracepoints to track energy computation - Make the behavior of the 'sched_energy_aware' sysctl more consistent - Consolidate and clean up access to a CPU's max compute capacity - Fix uclamp code corner cases - RT scheduling improvements: - Drive dl_rq->overloaded with dl_rq->pushable_dl_tasks updates - Drive the ->rto_mask with rt_rq->pushable_tasks updates - Scheduler scalability improvements: - Rate-limit updates to tg->load_avg - On x86 disable IBRS when CPU is offline to improve single-threaded performance - Micro-optimize in_task() and in_interrupt() - Micro-optimize the PSI code - Avoid updating PSI triggers and ->rtpoll_total when there are no state changes - Core scheduler infrastructure improvements: - Use saved_state to reduce some spurious freezer wakeups - Bring in a handful of fast-headers improvements to scheduler headers - Make the scheduler UAPI headers more widely usable by user-space - Simplify the control flow of scheduler syscalls by using lock guards - Fix sched_setaffinity() vs. CPU hotplug race - Scheduler debuggability improvements: - Disallow writing invalid values to sched_rt_period_us - Fix a race in the rq-clock debugging code triggering warnings - Fix a warning in the bandwidth distribution code - Micro-optimize in_atomic_preempt_off() checks - Enforce that the tasklist_lock is held in for_each_thread() - Print the TGID in sched_show_task() - Remove the /proc/sys/kernel/sched_child_runs_first sysctl - Misc cleanups & fixes Signed-off-by: Ingo Molnar <mingo@kernel.org> -----BEGIN PGP SIGNATURE----- iQJFBAABCgAvFiEEBpT5eoXrXCwVQwEKEnMQ0APhK1gFAmU8/NoRHG1pbmdvQGtl cm5lbC5vcmcACgkQEnMQ0APhK1gN+xAAvKGYNZBCBG4jowxccgqAbCx81KOhhsy/ KUaOmdLPg9WaXuqjZ5sggXQCMT0wUqBYAmqV7ts53VhWcma2I1ap4dCM6Jj+RLrc vNwkeNetsikiZtarMoCJs5NahL8ULh3liBaoAkkToPjQ5r43aZ/eKwDovEdIKc+g +Vgn7jUY8ssIrAOKT1midSwY1y8kAU2AzWOSFDTgedkJP4PgOu9/lBl9jSJ2sYaX N4XqONYPXTwOHUtvmzkYILxLz0k0GgJ7hmt78E8Xy2rC4taGCRwCfCMBYxREuwiP huo3O1P/iIe5svm4/EBUvcpvf44eAWTV+CD0dnJPwOc9IvFhpSzqSZZAsyy/JQKt Lnzmc/xmyc1PnXCYJfHuXrw2/m+MyUHaegPzh5iLJFrlqa79GavOElj0jNTAMzbZ 39fybzPtuFP+64faRfu0BBlQZfORPBNc/oWMpPKqgP58YGuveKTWaUF5rl5lM7Ne nm07uOmq02JVR8YzPl/FcfhU2dPMawWuMwUjEr2eU+lAunY3PF88vu0FALj7iOBd 66F8qrtpDHJanOxrdEUwSJ7hgw79qY1iw66Db7cQYjMazFKZONxArQPqFUZ0ngLI n9hVa7brg1bAQKrQflqjcIAIbpVu3SjPEl15cKpAJTB/gn5H66TQgw8uQ6HfG+h2 GtOsn1nlvuk= =GDqb -----END PGP SIGNATURE----- Merge tag 'sched-core-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip Pull scheduler updates from Ingo Molnar: "Fair scheduler (SCHED_OTHER) improvements: - Remove the old and now unused SIS_PROP code & option - Scan cluster before LLC in the wake-up path - Use candidate prev/recent_used CPU if scanning failed for cluster wakeup NUMA scheduling improvements: - Improve the VMA access-PID code to better skip/scan VMAs - Extend tracing to cover VMA-skipping decisions - Improve/fix the recently introduced sched_numa_find_nth_cpu() code - Generalize numa_map_to_online_node() Energy scheduling improvements: 
- Remove the EM_MAX_COMPLEXITY limit - Add tracepoints to track energy computation - Make the behavior of the 'sched_energy_aware' sysctl more consistent - Consolidate and clean up access to a CPU's max compute capacity - Fix uclamp code corner cases RT scheduling improvements: - Drive dl_rq->overloaded with dl_rq->pushable_dl_tasks updates - Drive the ->rto_mask with rt_rq->pushable_tasks updates Scheduler scalability improvements: - Rate-limit updates to tg->load_avg - On x86 disable IBRS when CPU is offline to improve single-threaded performance - Micro-optimize in_task() and in_interrupt() - Micro-optimize the PSI code - Avoid updating PSI triggers and ->rtpoll_total when there are no state changes Core scheduler infrastructure improvements: - Use saved_state to reduce some spurious freezer wakeups - Bring in a handful of fast-headers improvements to scheduler headers - Make the scheduler UAPI headers more widely usable by user-space - Simplify the control flow of scheduler syscalls by using lock guards - Fix sched_setaffinity() vs. CPU hotplug race Scheduler debuggability improvements: - Disallow writing invalid values to sched_rt_period_us - Fix a race in the rq-clock debugging code triggering warnings - Fix a warning in the bandwidth distribution code - Micro-optimize in_atomic_preempt_off() checks - Enforce that the tasklist_lock is held in for_each_thread() - Print the TGID in sched_show_task() - Remove the /proc/sys/kernel/sched_child_runs_first sysctl ... and misc cleanups & fixes" * tag 'sched-core-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (82 commits) sched/fair: Remove SIS_PROP sched/fair: Use candidate prev/recent_used CPU if scanning failed for cluster wakeup sched/fair: Scan cluster before scanning LLC in wake-up path sched: Add cpus_share_resources API sched/core: Fix RQCF_ACT_SKIP leak sched/fair: Remove unused 'curr' argument from pick_next_entity() sched/nohz: Update comments about NEWILB_KICK sched/fair: Remove duplicate #include sched/psi: Update poll => rtpoll in relevant comments sched: Make PELT acronym definition searchable sched: Fix stop_one_cpu_nowait() vs hotplug sched/psi: Bail out early from irq time accounting sched/topology: Rename 'DIE' domain to 'PKG' sched/psi: Delete the 'update_total' function parameter from update_triggers() sched/psi: Avoid updating PSI triggers and ->rtpoll_total when there are no state changes sched/headers: Remove comment referring to rq::cpu_load, since this has been removed sched/numa: Complete scanning of inactive VMAs when there is no alternative sched/numa: Complete scanning of partial VMAs regardless of PID activity sched/numa: Move up the access pid reset logic sched/numa: Trace decisions related to skipping VMAs ...
This commit is contained in:
commit
63ce50fff9
@ -170,7 +170,7 @@ and ``idle=nomwait``. If any of them is present in the kernel command line, the
|
||||
``MWAIT`` instruction is not allowed to be used, so the initialization of
|
||||
``intel_idle`` will fail.
|
||||
|
||||
Apart from that there are four module parameters recognized by ``intel_idle``
|
||||
Apart from that there are five module parameters recognized by ``intel_idle``
|
||||
itself that can be set via the kernel command line (they cannot be updated via
|
||||
sysfs, so that is the only way to change their values).
|
||||
|
||||
@ -216,6 +216,21 @@ are ignored).
|
||||
The idle states disabled this way can be enabled (on a per-CPU basis) from user
|
||||
space via ``sysfs``.
|
||||
|
||||
The ``ibrs_off`` module parameter is a boolean flag (defaults to
|
||||
false). If set, it is used to control if IBRS (Indirect Branch Restricted
|
||||
Speculation) should be turned off when the CPU enters an idle state.
|
||||
This flag does not affect CPUs that use Enhanced IBRS which can remain
|
||||
on with little performance impact.
|
||||
|
||||
For some CPUs, IBRS will be selected as mitigation for Spectre v2 and Retbleed
|
||||
security vulnerabilities by default. Leaving the IBRS mode on while idling may
|
||||
have a performance impact on its sibling CPU. The IBRS mode will be turned off
|
||||
by default when the CPU enters into a deep idle state, but not in some
|
||||
shallower ones. Setting the ``ibrs_off`` module parameter will force the IBRS
|
||||
mode to off when the CPU is in any one of the available idle states. This may
|
||||
help performance of a sibling CPU at the expense of a slightly higher wakeup
|
||||
latency for the idle CPU.
|
||||
|
||||
|
||||
.. _intel-idle-core-and-package-idle-states:
|
||||
|
||||
|
@ -1182,7 +1182,8 @@ automatically on platforms where it can run (that is,
|
||||
platforms with asymmetric CPU topologies and having an Energy
|
||||
Model available). If your platform happens to meet the
|
||||
requirements for EAS but you do not want to use it, change
|
||||
this value to 0.
|
||||
this value to 0. On Non-EAS platforms, write operation fails and
|
||||
read doesn't return anything.
|
||||
|
||||
task_delayacct
|
||||
===============
|
||||
|
@ -39,14 +39,15 @@ per Hz, leading to::
|
||||
-------------------
|
||||
|
||||
Two different capacity values are used within the scheduler. A CPU's
|
||||
``capacity_orig`` is its maximum attainable capacity, i.e. its maximum
|
||||
attainable performance level. A CPU's ``capacity`` is its ``capacity_orig`` to
|
||||
which some loss of available performance (e.g. time spent handling IRQs) is
|
||||
subtracted.
|
||||
``original capacity`` is its maximum attainable capacity, i.e. its maximum
|
||||
attainable performance level. This original capacity is returned by
|
||||
the function arch_scale_cpu_capacity(). A CPU's ``capacity`` is its ``original
|
||||
capacity`` to which some loss of available performance (e.g. time spent
|
||||
handling IRQs) is subtracted.
|
||||
|
||||
Note that a CPU's ``capacity`` is solely intended to be used by the CFS class,
|
||||
while ``capacity_orig`` is class-agnostic. The rest of this document will use
|
||||
the term ``capacity`` interchangeably with ``capacity_orig`` for the sake of
|
||||
while ``original capacity`` is class-agnostic. The rest of this document will use
|
||||
the term ``capacity`` interchangeably with ``original capacity`` for the sake of
|
||||
brevity.
|
||||
|
||||
1.3 Platform examples
|
||||
|
@ -359,32 +359,9 @@ in milli-Watts or in an 'abstract scale'.
|
||||
6.3 - Energy Model complexity
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The task wake-up path is very latency-sensitive. When the EM of a platform is
|
||||
too complex (too many CPUs, too many performance domains, too many performance
|
||||
states, ...), the cost of using it in the wake-up path can become prohibitive.
|
||||
The energy-aware wake-up algorithm has a complexity of:
|
||||
|
||||
C = Nd * (Nc + Ns)
|
||||
|
||||
with: Nd the number of performance domains; Nc the number of CPUs; and Ns the
|
||||
total number of OPPs (ex: for two perf. domains with 4 OPPs each, Ns = 8).
|
||||
|
||||
A complexity check is performed at the root domain level, when scheduling
|
||||
domains are built. EAS will not start on a root domain if its C happens to be
|
||||
higher than the completely arbitrary EM_MAX_COMPLEXITY threshold (2048 at the
|
||||
time of writing).
|
||||
|
||||
If you really want to use EAS but the complexity of your platform's Energy
|
||||
Model is too high to be used with a single root domain, you're left with only
|
||||
two possible options:
|
||||
|
||||
1. split your system into separate, smaller, root domains using exclusive
|
||||
cpusets and enable EAS locally on each of them. This option has the
|
||||
benefit to work out of the box but the drawback of preventing load
|
||||
balance between root domains, which can result in an unbalanced system
|
||||
overall;
|
||||
2. submit patches to reduce the complexity of the EAS wake-up algorithm,
|
||||
hence enabling it to cope with larger EMs in reasonable time.
|
||||
EAS does not impose any complexity limit on the number of PDs/OPPs/CPUs but
|
||||
restricts the number of CPUs to EM_MAX_NUM_CPUS to prevent overflows during
|
||||
the energy estimation.
|
||||
|
||||
|
||||
6.4 - Schedutil governor
|
||||
|
@ -39,10 +39,10 @@ Most notable:
|
||||
1.1 The problem
|
||||
---------------
|
||||
|
||||
Realtime scheduling is all about determinism, a group has to be able to rely on
|
||||
Real-time scheduling is all about determinism, a group has to be able to rely on
|
||||
the amount of bandwidth (eg. CPU time) being constant. In order to schedule
|
||||
multiple groups of realtime tasks, each group must be assigned a fixed portion
|
||||
of the CPU time available. Without a minimum guarantee a realtime group can
|
||||
multiple groups of real-time tasks, each group must be assigned a fixed portion
|
||||
of the CPU time available. Without a minimum guarantee a real-time group can
|
||||
obviously fall short. A fuzzy upper limit is of no use since it cannot be
|
||||
relied upon. Which leaves us with just the single fixed portion.
|
||||
|
||||
@ -50,14 +50,14 @@ relied upon. Which leaves us with just the single fixed portion.
|
||||
----------------
|
||||
|
||||
CPU time is divided by means of specifying how much time can be spent running
|
||||
in a given period. We allocate this "run time" for each realtime group which
|
||||
the other realtime groups will not be permitted to use.
|
||||
in a given period. We allocate this "run time" for each real-time group which
|
||||
the other real-time groups will not be permitted to use.
|
||||
|
||||
Any time not allocated to a realtime group will be used to run normal priority
|
||||
Any time not allocated to a real-time group will be used to run normal priority
|
||||
tasks (SCHED_OTHER). Any allocated run time not used will also be picked up by
|
||||
SCHED_OTHER.
|
||||
|
||||
Let's consider an example: a frame fixed realtime renderer must deliver 25
|
||||
Let's consider an example: a frame fixed real-time renderer must deliver 25
|
||||
frames a second, which yields a period of 0.04s per frame. Now say it will also
|
||||
have to play some music and respond to input, leaving it with around 80% CPU
|
||||
time dedicated for the graphics. We can then give this group a run time of 0.8
|
||||
@ -70,7 +70,7 @@ needs only about 3% CPU time to do so, it can do with a 0.03 * 0.005s =
|
||||
of 0.00015s.
|
||||
|
||||
The remaining CPU time will be used for user input and other tasks. Because
|
||||
realtime tasks have explicitly allocated the CPU time they need to perform
|
||||
real-time tasks have explicitly allocated the CPU time they need to perform
|
||||
their tasks, buffer underruns in the graphics or audio can be eliminated.
|
||||
|
||||
NOTE: the above example is not fully implemented yet. We still
|
||||
@ -87,18 +87,20 @@ lack an EDF scheduler to make non-uniform periods usable.
|
||||
The system wide settings are configured under the /proc virtual file system:
|
||||
|
||||
/proc/sys/kernel/sched_rt_period_us:
|
||||
The scheduling period that is equivalent to 100% CPU bandwidth
|
||||
The scheduling period that is equivalent to 100% CPU bandwidth.
|
||||
|
||||
/proc/sys/kernel/sched_rt_runtime_us:
|
||||
A global limit on how much time realtime scheduling may use. Even without
|
||||
CONFIG_RT_GROUP_SCHED enabled, this will limit time reserved to realtime
|
||||
processes. With CONFIG_RT_GROUP_SCHED it signifies the total bandwidth
|
||||
available to all realtime groups.
|
||||
A global limit on how much time real-time scheduling may use. This is always
|
||||
less or equal to the period_us, as it denotes the time allocated from the
|
||||
period_us for the real-time tasks. Even without CONFIG_RT_GROUP_SCHED enabled,
|
||||
this will limit time reserved to real-time processes. With
|
||||
CONFIG_RT_GROUP_SCHED=y it signifies the total bandwidth available to all
|
||||
real-time groups.
|
||||
|
||||
* Time is specified in us because the interface is s32. This gives an
|
||||
operating range from 1us to about 35 minutes.
|
||||
* sched_rt_period_us takes values from 1 to INT_MAX.
|
||||
* sched_rt_runtime_us takes values from -1 to (INT_MAX - 1).
|
||||
* sched_rt_runtime_us takes values from -1 to sched_rt_period_us.
|
||||
* A run time of -1 specifies runtime == period, ie. no limit.
|
||||
|
||||
|
||||
@ -108,7 +110,7 @@ The system wide settings are configured under the /proc virtual file system:
|
||||
The default values for sched_rt_period_us (1000000 or 1s) and
|
||||
sched_rt_runtime_us (950000 or 0.95s). This gives 0.05s to be used by
|
||||
SCHED_OTHER (non-RT tasks). These defaults were chosen so that a run-away
|
||||
realtime tasks will not lock up the machine but leave a little time to recover
|
||||
real-time tasks will not lock up the machine but leave a little time to recover
|
||||
it. By setting runtime to -1 you'd get the old behaviour back.
|
||||
|
||||
By default all bandwidth is assigned to the root group and new groups get the
|
||||
@ -116,10 +118,10 @@ period from /proc/sys/kernel/sched_rt_period_us and a run time of 0. If you
|
||||
want to assign bandwidth to another group, reduce the root group's bandwidth
|
||||
and assign some or all of the difference to another group.
|
||||
|
||||
Realtime group scheduling means you have to assign a portion of total CPU
|
||||
bandwidth to the group before it will accept realtime tasks. Therefore you will
|
||||
not be able to run realtime tasks as any user other than root until you have
|
||||
done that, even if the user has the rights to run processes with realtime
|
||||
Real-time group scheduling means you have to assign a portion of total CPU
|
||||
bandwidth to the group before it will accept real-time tasks. Therefore you will
|
||||
not be able to run real-time tasks as any user other than root until you have
|
||||
done that, even if the user has the rights to run processes with real-time
|
||||
priority!
|
||||
|
||||
|
||||
|
@ -1051,7 +1051,7 @@ static struct sched_domain_topology_level powerpc_topology[] = {
|
||||
#endif
|
||||
{ shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
|
||||
{ cpu_mc_mask, SD_INIT_NAME(MC) },
|
||||
{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
|
||||
{ cpu_cpu_mask, SD_INIT_NAME(PKG) },
|
||||
{ NULL, },
|
||||
};
|
||||
|
||||
@ -1595,7 +1595,7 @@ static void add_cpu_to_masks(int cpu)
|
||||
/* Skip all CPUs already part of current CPU core mask */
|
||||
cpumask_andnot(mask, cpu_online_mask, cpu_core_mask(cpu));
|
||||
|
||||
/* If chip_id is -1; limit the cpu_core_mask to within DIE*/
|
||||
/* If chip_id is -1; limit the cpu_core_mask to within PKG */
|
||||
if (chip_id == -1)
|
||||
cpumask_and(mask, mask, cpu_cpu_mask(cpu));
|
||||
|
||||
|
@ -522,7 +522,7 @@ static struct sched_domain_topology_level s390_topology[] = {
|
||||
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
|
||||
{ cpu_book_mask, SD_INIT_NAME(BOOK) },
|
||||
{ cpu_drawer_mask, SD_INIT_NAME(DRAWER) },
|
||||
{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
|
||||
{ cpu_cpu_mask, SD_INIT_NAME(PKG) },
|
||||
{ NULL, },
|
||||
};
|
||||
|
||||
|
@ -4,6 +4,7 @@
|
||||
|
||||
#include <linux/thread_info.h>
|
||||
#include <asm/nospec-branch.h>
|
||||
#include <asm/msr.h>
|
||||
|
||||
/*
|
||||
* On VMENTER we must preserve whatever view of the SPEC_CTRL MSR
|
||||
@ -76,6 +77,16 @@ static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn)
|
||||
return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* This can be used in noinstr functions & should only be called in bare
|
||||
* metal context.
|
||||
*/
|
||||
static __always_inline void __update_spec_ctrl(u64 val)
|
||||
{
|
||||
__this_cpu_write(x86_spec_ctrl_current, val);
|
||||
native_wrmsrl(MSR_IA32_SPEC_CTRL, val);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
extern void speculative_store_bypass_ht_init(void);
|
||||
#else
|
||||
|
@ -87,6 +87,7 @@
|
||||
#include <asm/hw_irq.h>
|
||||
#include <asm/stackprotector.h>
|
||||
#include <asm/sev.h>
|
||||
#include <asm/spec-ctrl.h>
|
||||
|
||||
/* representing HT siblings of each logical CPU */
|
||||
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
|
||||
@ -640,13 +641,13 @@ static void __init build_sched_topology(void)
|
||||
};
|
||||
#endif
|
||||
/*
|
||||
* When there is NUMA topology inside the package skip the DIE domain
|
||||
* When there is NUMA topology inside the package skip the PKG domain
|
||||
* since the NUMA domains will auto-magically create the right spanning
|
||||
* domains based on the SLIT.
|
||||
*/
|
||||
if (!x86_has_numa_in_package) {
|
||||
x86_topology[i++] = (struct sched_domain_topology_level){
|
||||
cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(DIE)
|
||||
cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(PKG)
|
||||
};
|
||||
}
|
||||
|
||||
@ -1596,8 +1597,15 @@ void __noreturn hlt_play_dead(void)
|
||||
native_halt();
|
||||
}
|
||||
|
||||
/*
|
||||
* native_play_dead() is essentially a __noreturn function, but it can't
|
||||
* be marked as such as the compiler may complain about it.
|
||||
*/
|
||||
void native_play_dead(void)
|
||||
{
|
||||
if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
|
||||
__update_spec_ctrl(0);
|
||||
|
||||
play_dead_common();
|
||||
tboot_shutdown(TB_SHUTDOWN_WFS);
|
||||
|
||||
|
@ -53,9 +53,8 @@
|
||||
#include <linux/moduleparam.h>
|
||||
#include <asm/cpu_device_id.h>
|
||||
#include <asm/intel-family.h>
|
||||
#include <asm/nospec-branch.h>
|
||||
#include <asm/mwait.h>
|
||||
#include <asm/msr.h>
|
||||
#include <asm/spec-ctrl.h>
|
||||
#include <asm/fpu/api.h>
|
||||
|
||||
#define INTEL_IDLE_VERSION "0.5.1"
|
||||
@ -69,6 +68,7 @@ static int max_cstate = CPUIDLE_STATE_MAX - 1;
|
||||
static unsigned int disabled_states_mask __read_mostly;
|
||||
static unsigned int preferred_states_mask __read_mostly;
|
||||
static bool force_irq_on __read_mostly;
|
||||
static bool ibrs_off __read_mostly;
|
||||
|
||||
static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;
|
||||
|
||||
@ -182,12 +182,12 @@ static __cpuidle int intel_idle_ibrs(struct cpuidle_device *dev,
|
||||
int ret;
|
||||
|
||||
if (smt_active)
|
||||
native_wrmsrl(MSR_IA32_SPEC_CTRL, 0);
|
||||
__update_spec_ctrl(0);
|
||||
|
||||
ret = __intel_idle(dev, drv, index);
|
||||
|
||||
if (smt_active)
|
||||
native_wrmsrl(MSR_IA32_SPEC_CTRL, spec_ctrl);
|
||||
__update_spec_ctrl(spec_ctrl);
|
||||
|
||||
return ret;
|
||||
}
|
||||
@ -1853,11 +1853,13 @@ static void state_update_enter_method(struct cpuidle_state *state, int cstate)
|
||||
}
|
||||
|
||||
if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) &&
|
||||
state->flags & CPUIDLE_FLAG_IBRS) {
|
||||
((state->flags & CPUIDLE_FLAG_IBRS) || ibrs_off)) {
|
||||
/*
|
||||
* IBRS mitigation requires that C-states are entered
|
||||
* with interrupts disabled.
|
||||
*/
|
||||
if (ibrs_off && (state->flags & CPUIDLE_FLAG_IRQ_ENABLE))
|
||||
state->flags &= ~CPUIDLE_FLAG_IRQ_ENABLE;
|
||||
WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE);
|
||||
state->enter = intel_idle_ibrs;
|
||||
return;
|
||||
@ -2176,3 +2178,9 @@ MODULE_PARM_DESC(preferred_cstates, "Mask of preferred idle states");
|
||||
* 'CPUIDLE_FLAG_INIT_XSTATE' and 'CPUIDLE_FLAG_IBRS' flags.
|
||||
*/
|
||||
module_param(force_irq_on, bool, 0444);
|
||||
/*
|
||||
* Force the disabling of IBRS when X86_FEATURE_KERNEL_IBRS is on and
|
||||
* CPUIDLE_FLAG_IRQ_ENABLE isn't set.
|
||||
*/
|
||||
module_param(ibrs_off, bool, 0444);
|
||||
MODULE_PARM_DESC(ibrs_off, "Disable IBRS when idle");
|
||||
|
@ -155,6 +155,8 @@ static inline int remove_cpu(unsigned int cpu) { return -EPERM; }
|
||||
static inline void smp_shutdown_nonboot_cpus(unsigned int primary_cpu) { }
|
||||
#endif /* !CONFIG_HOTPLUG_CPU */
|
||||
|
||||
DEFINE_LOCK_GUARD_0(cpus_read_lock, cpus_read_lock(), cpus_read_unlock())
|
||||
|
||||
#ifdef CONFIG_PM_SLEEP_SMP
|
||||
extern int freeze_secondary_cpus(int primary);
|
||||
extern void thaw_secondary_cpus(void);
|
||||
|
@ -686,6 +686,14 @@ static inline void list_splice_tail_init(struct list_head *list,
|
||||
#define list_for_each(pos, head) \
|
||||
for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next)
|
||||
|
||||
/**
|
||||
* list_for_each_reverse - iterate backwards over a list
|
||||
* @pos: the &struct list_head to use as a loop cursor.
|
||||
* @head: the head for your list.
|
||||
*/
|
||||
#define list_for_each_reverse(pos, head) \
|
||||
for (pos = (head)->prev; pos != (head); pos = pos->prev)
|
||||
|
||||
/**
|
||||
* list_for_each_rcu - Iterate over a list in an RCU-safe fashion
|
||||
* @pos: the &struct list_head to use as a loop cursor.
|
||||
|
@ -1726,8 +1726,8 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
|
||||
unsigned int pid_bit;
|
||||
|
||||
pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
|
||||
if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids[1])) {
|
||||
__set_bit(pid_bit, &vma->numab_state->access_pids[1]);
|
||||
if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->pids_active[1])) {
|
||||
__set_bit(pid_bit, &vma->numab_state->pids_active[1]);
|
||||
}
|
||||
}
|
||||
#else /* !CONFIG_NUMA_BALANCING */
|
||||
|
@ -551,9 +551,36 @@ struct vma_lock {
|
||||
};
|
||||
|
||||
struct vma_numab_state {
|
||||
/*
|
||||
* Initialised as time in 'jiffies' after which VMA
|
||||
* should be scanned. Delays first scan of new VMA by at
|
||||
* least sysctl_numa_balancing_scan_delay:
|
||||
*/
|
||||
unsigned long next_scan;
|
||||
unsigned long next_pid_reset;
|
||||
unsigned long access_pids[2];
|
||||
|
||||
/*
|
||||
* Time in jiffies when pids_active[] is reset to
|
||||
* detect phase change behaviour:
|
||||
*/
|
||||
unsigned long pids_active_reset;
|
||||
|
||||
/*
|
||||
* Approximate tracking of PIDs that trapped a NUMA hinting
|
||||
* fault. May produce false positives due to hash collisions.
|
||||
*
|
||||
* [0] Previous PID tracking
|
||||
* [1] Current PID tracking
|
||||
*
|
||||
* Window moves after next_pid_reset has expired approximately
|
||||
* every VMA_PID_RESET_PERIOD jiffies:
|
||||
*/
|
||||
unsigned long pids_active[2];
|
||||
|
||||
/*
|
||||
* MM scan sequence ID when the VMA was last completely scanned.
|
||||
* A VMA is not eligible for scanning if prev_scan_seq == numa_scan_seq
|
||||
*/
|
||||
int prev_scan_seq;
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -25,7 +25,7 @@
|
||||
#include <asm/sparsemem.h>
|
||||
|
||||
/* Generic implementation available */
|
||||
int numa_map_to_online_node(int node);
|
||||
int numa_nearest_node(int node, unsigned int state);
|
||||
|
||||
#ifndef memory_add_physaddr_to_nid
|
||||
static inline int memory_add_physaddr_to_nid(u64 start)
|
||||
@ -44,10 +44,11 @@ static inline int phys_to_target_node(u64 start)
|
||||
}
|
||||
#endif
|
||||
#else /* !CONFIG_NUMA */
|
||||
static inline int numa_map_to_online_node(int node)
|
||||
static inline int numa_nearest_node(int node, unsigned int state)
|
||||
{
|
||||
return NUMA_NO_NODE;
|
||||
}
|
||||
|
||||
static inline int memory_add_physaddr_to_nid(u64 start)
|
||||
{
|
||||
return 0;
|
||||
@ -58,6 +59,8 @@ static inline int phys_to_target_node(u64 start)
|
||||
}
|
||||
#endif
|
||||
|
||||
#define numa_map_to_online_node(node) numa_nearest_node(node, N_ONLINE)
|
||||
|
||||
#ifdef CONFIG_HAVE_ARCH_NODE_DEV_GROUP
|
||||
extern const struct attribute_group arch_node_dev_group;
|
||||
#endif
|
||||
|
@ -99,14 +99,21 @@ static __always_inline unsigned char interrupt_context_level(void)
|
||||
return level;
|
||||
}
|
||||
|
||||
/*
|
||||
* These macro definitions avoid redundant invocations of preempt_count()
|
||||
* because such invocations would result in redundant loads given that
|
||||
* preempt_count() is commonly implemented with READ_ONCE().
|
||||
*/
|
||||
|
||||
#define nmi_count() (preempt_count() & NMI_MASK)
|
||||
#define hardirq_count() (preempt_count() & HARDIRQ_MASK)
|
||||
#ifdef CONFIG_PREEMPT_RT
|
||||
# define softirq_count() (current->softirq_disable_cnt & SOFTIRQ_MASK)
|
||||
# define irq_count() ((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | softirq_count())
|
||||
#else
|
||||
# define softirq_count() (preempt_count() & SOFTIRQ_MASK)
|
||||
# define irq_count() (preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK))
|
||||
#endif
|
||||
#define irq_count() (nmi_count() | hardirq_count() | softirq_count())
|
||||
|
||||
/*
|
||||
* Macros to retrieve the current execution context:
|
||||
@ -119,7 +126,11 @@ static __always_inline unsigned char interrupt_context_level(void)
|
||||
#define in_nmi() (nmi_count())
|
||||
#define in_hardirq() (hardirq_count())
|
||||
#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
|
||||
#define in_task() (!(in_nmi() | in_hardirq() | in_serving_softirq()))
|
||||
#ifdef CONFIG_PREEMPT_RT
|
||||
# define in_task() (!((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | in_serving_softirq()))
|
||||
#else
|
||||
# define in_task() (!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
|
||||
#endif
|
||||
|
||||
/*
|
||||
* The following macros are deprecated and should not be used in new code:
|
||||
|
@ -63,7 +63,6 @@ struct robust_list_head;
|
||||
struct root_domain;
|
||||
struct rq;
|
||||
struct sched_attr;
|
||||
struct sched_param;
|
||||
struct seq_file;
|
||||
struct sighand_struct;
|
||||
struct signal_struct;
|
||||
@ -370,6 +369,10 @@ extern struct root_domain def_root_domain;
|
||||
extern struct mutex sched_domains_mutex;
|
||||
#endif
|
||||
|
||||
struct sched_param {
|
||||
int sched_priority;
|
||||
};
|
||||
|
||||
struct sched_info {
|
||||
#ifdef CONFIG_SCHED_INFO
|
||||
/* Cumulative counters: */
|
||||
@ -750,10 +753,8 @@ struct task_struct {
|
||||
#endif
|
||||
unsigned int __state;
|
||||
|
||||
#ifdef CONFIG_PREEMPT_RT
|
||||
/* saved state for "spinlock sleepers" */
|
||||
unsigned int saved_state;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* This begins the randomizable portion of task_struct. Only
|
||||
|
@ -1,4 +1,6 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _LINUX_SCHED_DEADLINE_H
|
||||
#define _LINUX_SCHED_DEADLINE_H
|
||||
|
||||
/*
|
||||
* SCHED_DEADLINE tasks has negative priorities, reflecting
|
||||
@ -34,3 +36,5 @@ extern void dl_add_task_root_domain(struct task_struct *p);
|
||||
extern void dl_clear_root_domain(struct root_domain *rd);
|
||||
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
#endif /* _LINUX_SCHED_DEADLINE_H */
|
||||
|
@ -15,6 +15,16 @@
|
||||
#define TNF_FAULT_LOCAL 0x08
|
||||
#define TNF_MIGRATE_FAIL 0x10
|
||||
|
||||
enum numa_vmaskip_reason {
|
||||
NUMAB_SKIP_UNSUITABLE,
|
||||
NUMAB_SKIP_SHARED_RO,
|
||||
NUMAB_SKIP_INACCESSIBLE,
|
||||
NUMAB_SKIP_SCAN_DELAY,
|
||||
NUMAB_SKIP_PID_INACTIVE,
|
||||
NUMAB_SKIP_IGNORE_PID,
|
||||
NUMAB_SKIP_SEQ_COMPLETED,
|
||||
};
|
||||
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
extern void task_numa_fault(int last_node, int node, int pages, int flags);
|
||||
extern pid_t task_numa_group_id(struct task_struct *p);
|
||||
|
@ -109,6 +109,13 @@ SD_FLAG(SD_ASYM_CPUCAPACITY_FULL, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
|
||||
*/
|
||||
SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
|
||||
|
||||
/*
|
||||
* Domain members share CPU cluster (LLC tags or L2 cache)
|
||||
*
|
||||
* NEEDS_GROUPS: Clusters are shared between groups.
|
||||
*/
|
||||
SD_FLAG(SD_CLUSTER, SDF_NEEDS_GROUPS)
|
||||
|
||||
/*
|
||||
* Domain members share CPU package resources (i.e. caches)
|
||||
*
|
||||
|
@ -656,7 +656,8 @@ extern bool current_is_single_threaded(void);
|
||||
while ((t = next_thread(t)) != g)
|
||||
|
||||
#define __for_each_thread(signal, t) \
|
||||
list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node)
|
||||
list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \
|
||||
lockdep_is_held(&tasklist_lock))
|
||||
|
||||
#define for_each_thread(p, t) \
|
||||
__for_each_thread((p)->signal, t)
|
||||
|
@ -17,4 +17,4 @@ static inline bool sched_smt_active(void) { return false; }
|
||||
|
||||
void arch_smt_update(void);
|
||||
|
||||
#endif
|
||||
#endif /* _LINUX_SCHED_SMT_H */
|
||||
|
@ -45,7 +45,7 @@ static inline int cpu_smt_flags(void)
|
||||
#ifdef CONFIG_SCHED_CLUSTER
|
||||
static inline int cpu_cluster_flags(void)
|
||||
{
|
||||
return SD_SHARE_PKG_RESOURCES;
|
||||
return SD_CLUSTER | SD_SHARE_PKG_RESOURCES;
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -109,8 +109,6 @@ struct sched_domain {
|
||||
u64 max_newidle_lb_cost;
|
||||
unsigned long last_decay_max_lb_cost;
|
||||
|
||||
u64 avg_scan_cost; /* select_idle_sibling */
|
||||
|
||||
#ifdef CONFIG_SCHEDSTATS
|
||||
/* load_balance() stats */
|
||||
unsigned int lb_count[CPU_MAX_IDLE_TYPES];
|
||||
@ -179,6 +177,7 @@ cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
|
||||
void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
|
||||
|
||||
bool cpus_share_cache(int this_cpu, int that_cpu);
|
||||
bool cpus_share_resources(int this_cpu, int that_cpu);
|
||||
|
||||
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
|
||||
typedef int (*sched_domain_flags_f)(void);
|
||||
@ -232,6 +231,11 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline bool cpus_share_resources(int this_cpu, int that_cpu)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
#endif /* !CONFIG_SMP */
|
||||
|
||||
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
|
||||
|
@ -20,4 +20,4 @@ struct task_cputime {
|
||||
unsigned long long sum_exec_runtime;
|
||||
};
|
||||
|
||||
#endif
|
||||
#endif /* _LINUX_SCHED_TYPES_H */
|
||||
|
@ -1,7 +1,6 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _LINUX_VHOST_TASK_H
|
||||
#define _LINUX_VHOST_TASK_H
|
||||
|
||||
#ifndef _LINUX_SCHED_VHOST_TASK_H
|
||||
#define _LINUX_SCHED_VHOST_TASK_H
|
||||
|
||||
struct vhost_task;
|
||||
|
||||
@ -11,4 +10,4 @@ void vhost_task_start(struct vhost_task *vtsk);
|
||||
void vhost_task_stop(struct vhost_task *vtsk);
|
||||
void vhost_task_wake(struct vhost_task *vtsk);
|
||||
|
||||
#endif
|
||||
#endif /* _LINUX_SCHED_VHOST_TASK_H */
|
||||
|
@ -251,7 +251,7 @@ extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int
|
||||
#else
|
||||
static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
|
||||
{
|
||||
return cpumask_nth(cpu, cpus);
|
||||
return cpumask_nth_and(cpu, cpus, cpu_online_mask);
|
||||
}
|
||||
|
||||
static inline const struct cpumask *
|
||||
|
@ -664,6 +664,58 @@ DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa,
|
||||
TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu)
|
||||
);
|
||||
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
#define NUMAB_SKIP_REASON \
|
||||
EM( NUMAB_SKIP_UNSUITABLE, "unsuitable" ) \
|
||||
EM( NUMAB_SKIP_SHARED_RO, "shared_ro" ) \
|
||||
EM( NUMAB_SKIP_INACCESSIBLE, "inaccessible" ) \
|
||||
EM( NUMAB_SKIP_SCAN_DELAY, "scan_delay" ) \
|
||||
EM( NUMAB_SKIP_PID_INACTIVE, "pid_inactive" ) \
|
||||
EM( NUMAB_SKIP_IGNORE_PID, "ignore_pid_inactive" ) \
|
||||
EMe(NUMAB_SKIP_SEQ_COMPLETED, "seq_completed" )
|
||||
|
||||
/* Redefine for export. */
|
||||
#undef EM
|
||||
#undef EMe
|
||||
#define EM(a, b) TRACE_DEFINE_ENUM(a);
|
||||
#define EMe(a, b) TRACE_DEFINE_ENUM(a);
|
||||
|
||||
NUMAB_SKIP_REASON
|
||||
|
||||
/* Redefine for symbolic printing. */
|
||||
#undef EM
|
||||
#undef EMe
|
||||
#define EM(a, b) { a, b },
|
||||
#define EMe(a, b) { a, b }
|
||||
|
||||
TRACE_EVENT(sched_skip_vma_numa,
|
||||
|
||||
TP_PROTO(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
enum numa_vmaskip_reason reason),
|
||||
|
||||
TP_ARGS(mm, vma, reason),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(unsigned long, numa_scan_offset)
|
||||
__field(unsigned long, vm_start)
|
||||
__field(unsigned long, vm_end)
|
||||
__field(enum numa_vmaskip_reason, reason)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->numa_scan_offset = mm->numa_scan_offset;
|
||||
__entry->vm_start = vma->vm_start;
|
||||
__entry->vm_end = vma->vm_end;
|
||||
__entry->reason = reason;
|
||||
),
|
||||
|
||||
TP_printk("numa_scan_offset=%lX vm_start=%lX vm_end=%lX reason=%s",
|
||||
__entry->numa_scan_offset,
|
||||
__entry->vm_start,
|
||||
__entry->vm_end,
|
||||
__print_symbolic(__entry->reason, NUMAB_SKIP_REASON))
|
||||
);
|
||||
#endif /* CONFIG_NUMA_BALANCING */
|
||||
|
||||
/*
|
||||
* Tracepoint for waking a polling cpu without an IPI.
|
||||
@ -735,6 +787,11 @@ DECLARE_TRACE(sched_update_nr_running_tp,
|
||||
TP_PROTO(struct rq *rq, int change),
|
||||
TP_ARGS(rq, change));
|
||||
|
||||
DECLARE_TRACE(sched_compute_energy_tp,
|
||||
TP_PROTO(struct task_struct *p, int dst_cpu, unsigned long energy,
|
||||
unsigned long max_util, unsigned long busy_time),
|
||||
TP_ARGS(p, dst_cpu, energy, max_util, busy_time));
|
||||
|
||||
#endif /* _TRACE_SCHED_H */
|
||||
|
||||
/* This part must be outside protection */
|
||||
|
@ -4,10 +4,6 @@
|
||||
|
||||
#include <linux/types.h>
|
||||
|
||||
struct sched_param {
|
||||
int sched_priority;
|
||||
};
|
||||
|
||||
#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */
|
||||
#define SCHED_ATTR_SIZE_VER1 56 /* add: util_{min,max} */
|
||||
|
||||
|
@ -71,7 +71,11 @@ bool __refrigerator(bool check_kthr_stop)
|
||||
for (;;) {
|
||||
bool freeze;
|
||||
|
||||
raw_spin_lock_irq(¤t->pi_lock);
|
||||
set_current_state(TASK_FROZEN);
|
||||
/* unstale saved_state so that __thaw_task() will wake us up */
|
||||
current->saved_state = TASK_RUNNING;
|
||||
raw_spin_unlock_irq(¤t->pi_lock);
|
||||
|
||||
spin_lock_irq(&freezer_lock);
|
||||
freeze = freezing(current) && !(check_kthr_stop && kthread_should_stop());
|
||||
@ -129,6 +133,7 @@ static int __set_task_frozen(struct task_struct *p, void *arg)
|
||||
WARN_ON_ONCE(debug_locks && p->lockdep_depth);
|
||||
#endif
|
||||
|
||||
p->saved_state = p->__state;
|
||||
WRITE_ONCE(p->__state, TASK_FROZEN);
|
||||
return TASK_FROZEN;
|
||||
}
|
||||
@ -170,42 +175,34 @@ bool freeze_task(struct task_struct *p)
|
||||
}
|
||||
|
||||
/*
|
||||
* The special task states (TASK_STOPPED, TASK_TRACED) keep their canonical
|
||||
* state in p->jobctl. If either of them got a wakeup that was missed because
|
||||
* TASK_FROZEN, then their canonical state reflects that and the below will
|
||||
* refuse to restore the special state and instead issue the wakeup.
|
||||
* Restore the saved_state before the task entered freezer. For typical task
|
||||
* in the __refrigerator(), saved_state == TASK_RUNNING so nothing happens
|
||||
* here. For tasks which were TASK_NORMAL | TASK_FREEZABLE, their initial state
|
||||
* is restored unless they got an expected wakeup (see ttwu_state_match()).
|
||||
* Returns 1 if the task state was restored.
|
||||
*/
|
||||
static int __set_task_special(struct task_struct *p, void *arg)
|
||||
static int __restore_freezer_state(struct task_struct *p, void *arg)
|
||||
{
|
||||
unsigned int state = 0;
|
||||
unsigned int state = p->saved_state;
|
||||
|
||||
if (p->jobctl & JOBCTL_TRACED)
|
||||
state = TASK_TRACED;
|
||||
|
||||
else if (p->jobctl & JOBCTL_STOPPED)
|
||||
state = TASK_STOPPED;
|
||||
|
||||
if (state)
|
||||
if (state != TASK_RUNNING) {
|
||||
WRITE_ONCE(p->__state, state);
|
||||
return 1;
|
||||
}
|
||||
|
||||
return state;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void __thaw_task(struct task_struct *p)
|
||||
{
|
||||
unsigned long flags, flags2;
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&freezer_lock, flags);
|
||||
if (WARN_ON_ONCE(freezing(p)))
|
||||
goto unlock;
|
||||
|
||||
if (lock_task_sighand(p, &flags2)) {
|
||||
/* TASK_FROZEN -> TASK_{STOPPED,TRACED} */
|
||||
bool ret = task_call_func(p, __set_task_special, NULL);
|
||||
unlock_task_sighand(p, &flags2);
|
||||
if (ret)
|
||||
goto unlock;
|
||||
}
|
||||
if (task_call_func(p, __restore_freezer_state, NULL))
|
||||
goto unlock;
|
||||
|
||||
wake_up_state(p, TASK_FROZEN);
|
||||
unlock:
|
||||
|
@ -34,7 +34,6 @@
|
||||
#include <linux/nospec.h>
|
||||
#include <linux/proc_fs.h>
|
||||
#include <linux/psi.h>
|
||||
#include <linux/psi.h>
|
||||
#include <linux/ptrace_api.h>
|
||||
#include <linux/sched_clock.h>
|
||||
#include <linux/security.h>
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -131,7 +131,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
|
||||
if (!dl_task_fits_capacity(p, cpu)) {
|
||||
cpumask_clear_cpu(cpu, later_mask);
|
||||
|
||||
cap = capacity_orig_of(cpu);
|
||||
cap = arch_scale_cpu_capacity(cpu);
|
||||
|
||||
if (cap > max_cap ||
|
||||
(cpu == task_cpu(p) && cap == max_cap)) {
|
||||
|
@ -132,7 +132,7 @@ static inline unsigned long __dl_bw_capacity(const struct cpumask *mask)
|
||||
int i;
|
||||
|
||||
for_each_cpu_and(i, mask, cpu_active_mask)
|
||||
cap += capacity_orig_of(i);
|
||||
cap += arch_scale_cpu_capacity(i);
|
||||
|
||||
return cap;
|
||||
}
|
||||
@ -144,7 +144,7 @@ static inline unsigned long __dl_bw_capacity(const struct cpumask *mask)
|
||||
static inline unsigned long dl_bw_capacity(int i)
|
||||
{
|
||||
if (!sched_asym_cpucap_active() &&
|
||||
capacity_orig_of(i) == SCHED_CAPACITY_SCALE) {
|
||||
arch_scale_cpu_capacity(i) == SCHED_CAPACITY_SCALE) {
|
||||
return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT;
|
||||
} else {
|
||||
RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
|
||||
@ -509,7 +509,6 @@ void init_dl_rq(struct dl_rq *dl_rq)
|
||||
/* zero means no -deadline tasks */
|
||||
dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0;
|
||||
|
||||
dl_rq->dl_nr_migratory = 0;
|
||||
dl_rq->overloaded = 0;
|
||||
dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED;
|
||||
#else
|
||||
@ -553,39 +552,6 @@ static inline void dl_clear_overload(struct rq *rq)
|
||||
cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask);
|
||||
}
|
||||
|
||||
static void update_dl_migration(struct dl_rq *dl_rq)
|
||||
{
|
||||
if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) {
|
||||
if (!dl_rq->overloaded) {
|
||||
dl_set_overload(rq_of_dl_rq(dl_rq));
|
||||
dl_rq->overloaded = 1;
|
||||
}
|
||||
} else if (dl_rq->overloaded) {
|
||||
dl_clear_overload(rq_of_dl_rq(dl_rq));
|
||||
dl_rq->overloaded = 0;
|
||||
}
|
||||
}
|
||||
|
||||
static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
|
||||
{
|
||||
struct task_struct *p = dl_task_of(dl_se);
|
||||
|
||||
if (p->nr_cpus_allowed > 1)
|
||||
dl_rq->dl_nr_migratory++;
|
||||
|
||||
update_dl_migration(dl_rq);
|
||||
}
|
||||
|
||||
static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
|
||||
{
|
||||
struct task_struct *p = dl_task_of(dl_se);
|
||||
|
||||
if (p->nr_cpus_allowed > 1)
|
||||
dl_rq->dl_nr_migratory--;
|
||||
|
||||
update_dl_migration(dl_rq);
|
||||
}
|
||||
|
||||
#define __node_2_pdl(node) \
|
||||
rb_entry((node), struct task_struct, pushable_dl_tasks)
|
||||
|
||||
@ -594,6 +560,11 @@ static inline bool __pushable_less(struct rb_node *a, const struct rb_node *b)
|
||||
return dl_entity_preempt(&__node_2_pdl(a)->dl, &__node_2_pdl(b)->dl);
|
||||
}
|
||||
|
||||
static inline int has_pushable_dl_tasks(struct rq *rq)
|
||||
{
|
||||
return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root);
|
||||
}
|
||||
|
||||
/*
|
||||
* The list of pushable -deadline task is not a plist, like in
|
||||
* sched_rt.c, it is an rb-tree with tasks ordered by deadline.
|
||||
@ -609,6 +580,11 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
|
||||
__pushable_less);
|
||||
if (leftmost)
|
||||
rq->dl.earliest_dl.next = p->dl.deadline;
|
||||
|
||||
if (!rq->dl.overloaded) {
|
||||
dl_set_overload(rq);
|
||||
rq->dl.overloaded = 1;
|
||||
}
|
||||
}
|
||||
|
||||
static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
|
||||
@ -625,11 +601,11 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
|
||||
dl_rq->earliest_dl.next = __node_2_pdl(leftmost)->dl.deadline;
|
||||
|
||||
RB_CLEAR_NODE(&p->pushable_dl_tasks);
|
||||
}
|
||||
|
||||
static inline int has_pushable_dl_tasks(struct rq *rq)
|
||||
{
|
||||
return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root);
|
||||
if (!has_pushable_dl_tasks(rq) && rq->dl.overloaded) {
|
||||
dl_clear_overload(rq);
|
||||
rq->dl.overloaded = 0;
|
||||
}
|
||||
}
|
||||
|
||||
static int push_dl_task(struct rq *rq);
|
||||
@ -763,7 +739,7 @@ static inline void deadline_queue_pull_task(struct rq *rq)
|
||||
|
||||
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
|
||||
static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
|
||||
static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags);
|
||||
static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags);
|
||||
|
||||
static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se,
|
||||
struct rq *rq)
|
||||
@ -1175,7 +1151,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
|
||||
|
||||
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
|
||||
if (dl_task(rq->curr))
|
||||
check_preempt_curr_dl(rq, p, 0);
|
||||
wakeup_preempt_dl(rq, p, 0);
|
||||
else
|
||||
resched_curr(rq);
|
||||
|
||||
@ -1504,7 +1480,6 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
|
||||
add_nr_running(rq_of_dl_rq(dl_rq), 1);
|
||||
|
||||
inc_dl_deadline(dl_rq, deadline);
|
||||
inc_dl_migration(dl_se, dl_rq);
|
||||
}
|
||||
|
||||
static inline
|
||||
@ -1518,7 +1493,6 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
|
||||
sub_nr_running(rq_of_dl_rq(dl_rq), 1);
|
||||
|
||||
dec_dl_deadline(dl_rq, dl_se->deadline);
|
||||
dec_dl_migration(dl_se, dl_rq);
|
||||
}
|
||||
|
||||
static inline bool __dl_less(struct rb_node *a, const struct rb_node *b)
|
||||
@ -1939,7 +1913,7 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
|
||||
* Only called when both the current and waking task are -deadline
|
||||
* tasks.
|
||||
*/
|
||||
static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
|
||||
static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
|
||||
int flags)
|
||||
{
|
||||
if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
|
||||
@ -2291,9 +2265,6 @@ static int push_dl_task(struct rq *rq)
|
||||
struct rq *later_rq;
|
||||
int ret = 0;
|
||||
|
||||
if (!rq->dl.overloaded)
|
||||
return 0;
|
||||
|
||||
next_task = pick_next_pushable_dl_task(rq);
|
||||
if (!next_task)
|
||||
return 0;
|
||||
@ -2449,9 +2420,11 @@ skip:
|
||||
double_unlock_balance(this_rq, src_rq);
|
||||
|
||||
if (push_task) {
|
||||
preempt_disable();
|
||||
raw_spin_rq_unlock(this_rq);
|
||||
stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
|
||||
push_task, &src_rq->push_work);
|
||||
preempt_enable();
|
||||
raw_spin_rq_lock(this_rq);
|
||||
}
|
||||
}
|
||||
@ -2652,7 +2625,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
|
||||
deadline_queue_push_tasks(rq);
|
||||
#endif
|
||||
if (dl_task(rq->curr))
|
||||
check_preempt_curr_dl(rq, p, 0);
|
||||
wakeup_preempt_dl(rq, p, 0);
|
||||
else
|
||||
resched_curr(rq);
|
||||
} else {
|
||||
@ -2721,7 +2694,7 @@ DEFINE_SCHED_CLASS(dl) = {
|
||||
.dequeue_task = dequeue_task_dl,
|
||||
.yield_task = yield_task_dl,
|
||||
|
||||
.check_preempt_curr = check_preempt_curr_dl,
|
||||
.wakeup_preempt = wakeup_preempt_dl,
|
||||
|
||||
.pick_next_task = pick_next_task_dl,
|
||||
.put_prev_task = put_prev_task_dl,
|
||||
|
@ -8,7 +8,7 @@
|
||||
*/
|
||||
|
||||
/*
|
||||
* This allows printing both to /proc/sched_debug and
|
||||
* This allows printing both to /sys/kernel/debug/sched/debug and
|
||||
* to the console
|
||||
*/
|
||||
#define SEQ_printf(m, x...) \
|
||||
@ -724,9 +724,6 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
|
||||
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
|
||||
|
||||
PU(rt_nr_running);
|
||||
#ifdef CONFIG_SMP
|
||||
PU(rt_nr_migratory);
|
||||
#endif
|
||||
P(rt_throttled);
|
||||
PN(rt_time);
|
||||
PN(rt_runtime);
|
||||
@ -748,7 +745,6 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
|
||||
|
||||
PU(dl_nr_running);
|
||||
#ifdef CONFIG_SMP
|
||||
PU(dl_nr_migratory);
|
||||
dl_bw = &cpu_rq(cpu)->rd->dl_bw;
|
||||
#else
|
||||
dl_bw = &dl_rq->dl_bw;
|
||||
@ -864,7 +860,6 @@ static void sched_debug_header(struct seq_file *m)
|
||||
#define PN(x) \
|
||||
SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
|
||||
PN(sysctl_sched_base_slice);
|
||||
P(sysctl_sched_child_runs_first);
|
||||
P(sysctl_sched_features);
|
||||
#undef PN
|
||||
#undef P
|
||||
|
@ -51,8 +51,6 @@
|
||||
|
||||
#include <asm/switch_to.h>
|
||||
|
||||
#include <linux/sched/cond_resched.h>
|
||||
|
||||
#include "sched.h"
|
||||
#include "stats.h"
|
||||
#include "autogroup.h"
|
||||
@ -78,12 +76,6 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
|
||||
unsigned int sysctl_sched_base_slice = 750000ULL;
|
||||
static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
|
||||
|
||||
/*
|
||||
* After fork, child runs first. If set to 0 (default) then
|
||||
* parent will (try to) run first.
|
||||
*/
|
||||
unsigned int sysctl_sched_child_runs_first __read_mostly;
|
||||
|
||||
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
|
||||
|
||||
int sched_thermal_decay_shift;
|
||||
@ -145,13 +137,6 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
|
||||
|
||||
#ifdef CONFIG_SYSCTL
|
||||
static struct ctl_table sched_fair_sysctls[] = {
|
||||
{
|
||||
.procname = "sched_child_runs_first",
|
||||
.data = &sysctl_sched_child_runs_first,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
#ifdef CONFIG_CFS_BANDWIDTH
|
||||
{
|
||||
.procname = "sched_cfs_bandwidth_slice_us",
|
||||
@ -2899,19 +2884,7 @@ static void task_numa_placement(struct task_struct *p)
|
||||
}
|
||||
|
||||
/* Cannot migrate task to CPU-less node */
|
||||
if (max_nid != NUMA_NO_NODE && !node_state(max_nid, N_CPU)) {
|
||||
int near_nid = max_nid;
|
||||
int distance, near_distance = INT_MAX;
|
||||
|
||||
for_each_node_state(nid, N_CPU) {
|
||||
distance = node_distance(max_nid, nid);
|
||||
if (distance < near_distance) {
|
||||
near_nid = nid;
|
||||
near_distance = distance;
|
||||
}
|
||||
}
|
||||
max_nid = near_nid;
|
||||
}
|
||||
max_nid = numa_nearest_node(max_nid, N_CPU);
|
||||
|
||||
if (ng) {
|
||||
numa_group_count_active_nodes(ng);
|
||||
@ -3182,7 +3155,7 @@ static void reset_ptenuma_scan(struct task_struct *p)
|
||||
p->mm->numa_scan_offset = 0;
|
||||
}
|
||||
|
||||
static bool vma_is_accessed(struct vm_area_struct *vma)
|
||||
static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
|
||||
{
|
||||
unsigned long pids;
|
||||
/*
|
||||
@ -3194,8 +3167,20 @@ static bool vma_is_accessed(struct vm_area_struct *vma)
|
||||
if (READ_ONCE(current->mm->numa_scan_seq) < 2)
|
||||
return true;
|
||||
|
||||
pids = vma->numab_state->access_pids[0] | vma->numab_state->access_pids[1];
|
||||
return test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids);
|
||||
pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
|
||||
if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids))
|
||||
return true;
|
||||
|
||||
/*
|
||||
* Complete a scan that has already started regardless of PID access, or
|
||||
* some VMAs may never be scanned in multi-threaded applications:
|
||||
*/
|
||||
if (mm->numa_scan_offset > vma->vm_start) {
|
||||
trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
#define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay)
|
||||
@ -3215,6 +3200,8 @@ static void task_numa_work(struct callback_head *work)
|
||||
unsigned long nr_pte_updates = 0;
|
||||
long pages, virtpages;
|
||||
struct vma_iterator vmi;
|
||||
bool vma_pids_skipped;
|
||||
bool vma_pids_forced = false;
|
||||
|
||||
SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
|
||||
|
||||
@ -3257,7 +3244,6 @@ static void task_numa_work(struct callback_head *work)
|
||||
*/
|
||||
p->node_stamp += 2 * TICK_NSEC;
|
||||
|
||||
start = mm->numa_scan_offset;
|
||||
pages = sysctl_numa_balancing_scan_size;
|
||||
pages <<= 20 - PAGE_SHIFT; /* MB in pages */
|
||||
virtpages = pages * 8; /* Scan up to this much virtual space */
|
||||
@ -3267,6 +3253,16 @@ static void task_numa_work(struct callback_head *work)
|
||||
|
||||
if (!mmap_read_trylock(mm))
|
||||
return;
|
||||
|
||||
/*
|
||||
* VMAs are skipped if the current PID has not trapped a fault within
|
||||
* the VMA recently. Allow scanning to be forced if there is no
|
||||
* suitable VMA remaining.
|
||||
*/
|
||||
vma_pids_skipped = false;
|
||||
|
||||
retry_pids:
|
||||
start = mm->numa_scan_offset;
|
||||
vma_iter_init(&vmi, mm, start);
|
||||
vma = vma_next(&vmi);
|
||||
if (!vma) {
|
||||
@ -3279,6 +3275,7 @@ static void task_numa_work(struct callback_head *work)
|
||||
do {
|
||||
if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
|
||||
is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
|
||||
trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE);
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -3289,15 +3286,19 @@ static void task_numa_work(struct callback_head *work)
|
||||
* as migrating the pages will be of marginal benefit.
|
||||
*/
|
||||
if (!vma->vm_mm ||
|
||||
(vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
|
||||
(vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) {
|
||||
trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO);
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Skip inaccessible VMAs to avoid any confusion between
|
||||
* PROT_NONE and NUMA hinting ptes
|
||||
*/
|
||||
if (!vma_is_accessible(vma))
|
||||
if (!vma_is_accessible(vma)) {
|
||||
trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Initialise new per-VMA NUMAB state. */
|
||||
if (!vma->numab_state) {
|
||||
@ -3310,8 +3311,15 @@ static void task_numa_work(struct callback_head *work)
|
||||
msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
|
||||
|
||||
/* Reset happens after 4 times scan delay of scan start */
|
||||
vma->numab_state->next_pid_reset = vma->numab_state->next_scan +
|
||||
vma->numab_state->pids_active_reset = vma->numab_state->next_scan +
|
||||
msecs_to_jiffies(VMA_PID_RESET_PERIOD);
|
||||
|
||||
/*
|
||||
* Ensure prev_scan_seq does not match numa_scan_seq,
|
||||
* to prevent VMAs being skipped prematurely on the
|
||||
* first scan:
|
||||
*/
|
||||
vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -3319,23 +3327,35 @@ static void task_numa_work(struct callback_head *work)
|
||||
* delay the scan for new VMAs.
|
||||
*/
|
||||
if (mm->numa_scan_seq && time_before(jiffies,
|
||||
vma->numab_state->next_scan))
|
||||
vma->numab_state->next_scan)) {
|
||||
trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Do not scan the VMA if task has not accessed */
|
||||
if (!vma_is_accessed(vma))
|
||||
/* RESET access PIDs regularly for old VMAs. */
|
||||
if (mm->numa_scan_seq &&
|
||||
time_after(jiffies, vma->numab_state->pids_active_reset)) {
|
||||
vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset +
|
||||
msecs_to_jiffies(VMA_PID_RESET_PERIOD);
|
||||
vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]);
|
||||
vma->numab_state->pids_active[1] = 0;
|
||||
}
|
||||
|
||||
/* Do not rescan VMAs twice within the same sequence. */
|
||||
if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) {
|
||||
mm->numa_scan_offset = vma->vm_end;
|
||||
trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED);
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* RESET access PIDs regularly for old VMAs. Resetting after checking
|
||||
* vma for recent access to avoid clearing PID info before access..
|
||||
* Do not scan the VMA if task has not accessed it, unless no other
|
||||
* VMA candidate exists.
|
||||
*/
|
||||
if (mm->numa_scan_seq &&
|
||||
time_after(jiffies, vma->numab_state->next_pid_reset)) {
|
||||
vma->numab_state->next_pid_reset = vma->numab_state->next_pid_reset +
|
||||
msecs_to_jiffies(VMA_PID_RESET_PERIOD);
|
||||
vma->numab_state->access_pids[0] = READ_ONCE(vma->numab_state->access_pids[1]);
|
||||
vma->numab_state->access_pids[1] = 0;
|
||||
if (!vma_pids_forced && !vma_is_accessed(mm, vma)) {
|
||||
vma_pids_skipped = true;
|
||||
trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE);
|
||||
continue;
|
||||
}
|
||||
|
||||
do {
|
||||
@ -3362,8 +3382,28 @@ static void task_numa_work(struct callback_head *work)
|
||||
|
||||
cond_resched();
|
||||
} while (end != vma->vm_end);
|
||||
|
||||
/* VMA scan is complete, do not scan until next sequence. */
|
||||
vma->numab_state->prev_scan_seq = mm->numa_scan_seq;
|
||||
|
||||
/*
|
||||
* Only force scan within one VMA at a time, to limit the
|
||||
* cost of scanning a potentially uninteresting VMA.
|
||||
*/
|
||||
if (vma_pids_forced)
|
||||
break;
|
||||
} for_each_vma(vmi, vma);
|
||||
|
||||
/*
|
||||
* If no VMAs are remaining and VMAs were skipped due to the PID
|
||||
* not accessing the VMA previously, then force a scan to ensure
|
||||
* forward progress:
|
||||
*/
|
||||
if (!vma && !vma_pids_forced && vma_pids_skipped) {
|
||||
vma_pids_forced = true;
|
||||
goto retry_pids;
|
||||
}
|
||||
|
||||
out:
|
||||
/*
|
||||
* It is possible to reach the end of the VMA list but the last few
|
||||
@@ -3942,7 +3982,8 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
*/
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
long delta;
u64 now;

/*
* No need to update load_avg for root_task_group as it is not used.
@@ -3950,9 +3991,19 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
if (cfs_rq->tg == &root_task_group)
return;

/*
* For migration heavy workloads, access to tg->load_avg can be
* unbound. Limit the update rate to at most once per ms.
*/
now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC)
return;

delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
atomic_long_add(delta, &cfs_rq->tg->load_avg);
cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
cfs_rq->last_update_tg_load_avg = now;
}
}
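
The rewritten update_tg_load_avg() above bounds contention on the shared tg->load_avg counter in two ways: a per-cfs_rq timestamp limits publishing to once per millisecond, and the existing 1/64 significance filter still skips negligible deltas. A stand-alone sketch of that pattern, where NSEC_PER_MSEC and the 1/64 threshold mirror the hunk and everything else (names, clock choice) is an illustrative assumption:

#include <stdatomic.h>
#include <stdint.h>
#include <stdlib.h>
#include <time.h>

#define NSEC_PER_MSEC 1000000ULL

/* Shared, contended accumulator (stands in for tg->load_avg). */
static atomic_long shared_load;

struct local_state {
	long load_avg;		/* locally tracked value */
	long contrib;		/* what we last published to shared_load */
	uint64_t last_update;	/* timestamp of the last publish, in ns */
};

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

/* Publish at most once per millisecond, and only for significant deltas. */
static void update_shared_load(struct local_state *s)
{
	uint64_t now = now_ns();
	long delta;

	if (now - s->last_update < NSEC_PER_MSEC)
		return;

	delta = s->load_avg - s->contrib;
	if (labs(delta) > s->contrib / 64) {
		atomic_fetch_add(&shared_load, delta);
		s->contrib = s->load_avg;
		s->last_update = now;
	}
}

int main(void)
{
	struct local_state s = { .load_avg = 1024 };

	update_shared_load(&s);		/* first call publishes the full delta */
	return 0;
}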
|
||||
|
||||
@ -4626,22 +4677,6 @@ static inline unsigned long task_util_est(struct task_struct *p)
|
||||
return max(task_util(p), _task_util_est(p));
|
||||
}
|
||||
|
||||
#ifdef CONFIG_UCLAMP_TASK
|
||||
static inline unsigned long uclamp_task_util(struct task_struct *p,
|
||||
unsigned long uclamp_min,
|
||||
unsigned long uclamp_max)
|
||||
{
|
||||
return clamp(task_util_est(p), uclamp_min, uclamp_max);
|
||||
}
|
||||
#else
|
||||
static inline unsigned long uclamp_task_util(struct task_struct *p,
|
||||
unsigned long uclamp_min,
|
||||
unsigned long uclamp_max)
|
||||
{
|
||||
return task_util_est(p);
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
|
||||
struct task_struct *p)
|
||||
{
|
||||
@ -4745,7 +4780,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
|
||||
* To avoid overestimation of actual task utilization, skip updates if
|
||||
* we cannot grant there is idle time in this CPU.
|
||||
*/
|
||||
if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
|
||||
if (task_util(p) > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
|
||||
return;
|
||||
|
||||
/*
|
||||
@ -4793,14 +4828,14 @@ static inline int util_fits_cpu(unsigned long util,
|
||||
return fits;
|
||||
|
||||
/*
|
||||
* We must use capacity_orig_of() for comparing against uclamp_min and
|
||||
* We must use arch_scale_cpu_capacity() for comparing against uclamp_min and
|
||||
* uclamp_max. We only care about capacity pressure (by using
|
||||
* capacity_of()) for comparing against the real util.
|
||||
*
|
||||
* If a task is boosted to 1024 for example, we don't want a tiny
|
||||
* pressure to skew the check whether it fits a CPU or not.
|
||||
*
|
||||
* Similarly if a task is capped to capacity_orig_of(little_cpu), it
|
||||
* Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it
|
||||
* should fit a little cpu even if there's some pressure.
|
||||
*
|
||||
* Only exception is for thermal pressure since it has a direct impact
|
||||
@ -4812,7 +4847,7 @@ static inline int util_fits_cpu(unsigned long util,
|
||||
* For uclamp_max, we can tolerate a drop in performance level as the
|
||||
* goal is to cap the task. So it's okay if it's getting less.
|
||||
*/
|
||||
capacity_orig = capacity_orig_of(cpu);
|
||||
capacity_orig = arch_scale_cpu_capacity(cpu);
|
||||
capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
|
||||
|
||||
/*
|
||||
@ -4932,7 +4967,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
|
||||
|
||||
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
return true;
|
||||
return !cfs_rq->nr_running;
|
||||
}
|
||||
|
||||
#define UPDATE_TG 0x0
|
||||
@ -5267,7 +5302,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
* 4) do not run the "skip" process, if something else is available
|
||||
*/
|
||||
static struct sched_entity *
|
||||
pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
|
||||
pick_next_entity(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
/*
|
||||
* Enabling NEXT_BUDDY will affect latency but not fairness.
|
||||
@ -5811,13 +5846,13 @@ static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
|
||||
|
||||
static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
|
||||
{
|
||||
struct cfs_rq *local_unthrottle = NULL;
|
||||
int this_cpu = smp_processor_id();
|
||||
u64 runtime, remaining = 1;
|
||||
bool throttled = false;
|
||||
struct cfs_rq *cfs_rq;
|
||||
struct cfs_rq *cfs_rq, *tmp;
|
||||
struct rq_flags rf;
|
||||
struct rq *rq;
|
||||
LIST_HEAD(local_unthrottle);
|
||||
|
||||
rcu_read_lock();
|
||||
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
|
||||
@ -5833,11 +5868,9 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
|
||||
if (!cfs_rq_throttled(cfs_rq))
|
||||
goto next;
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
/* Already queued for async unthrottle */
|
||||
if (!list_empty(&cfs_rq->throttled_csd_list))
|
||||
goto next;
|
||||
#endif
|
||||
|
||||
/* By the above checks, this should never be true */
|
||||
SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
|
||||
@ -5854,11 +5887,17 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
|
||||
|
||||
/* we check whether we're throttled above */
|
||||
if (cfs_rq->runtime_remaining > 0) {
|
||||
if (cpu_of(rq) != this_cpu ||
|
||||
SCHED_WARN_ON(local_unthrottle))
|
||||
if (cpu_of(rq) != this_cpu) {
|
||||
unthrottle_cfs_rq_async(cfs_rq);
|
||||
else
|
||||
local_unthrottle = cfs_rq;
|
||||
} else {
|
||||
/*
|
||||
* We currently only expect to be unthrottling
|
||||
* a single cfs_rq locally.
|
||||
*/
|
||||
SCHED_WARN_ON(!list_empty(&local_unthrottle));
|
||||
list_add_tail(&cfs_rq->throttled_csd_list,
|
||||
&local_unthrottle);
|
||||
}
|
||||
} else {
|
||||
throttled = true;
|
||||
}
|
||||
@ -5866,15 +5905,23 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
|
||||
next:
|
||||
rq_unlock_irqrestore(rq, &rf);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
if (local_unthrottle) {
|
||||
rq = cpu_rq(this_cpu);
|
||||
list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle,
|
||||
throttled_csd_list) {
|
||||
struct rq *rq = rq_of(cfs_rq);
|
||||
|
||||
rq_lock_irqsave(rq, &rf);
|
||||
if (cfs_rq_throttled(local_unthrottle))
|
||||
unthrottle_cfs_rq(local_unthrottle);
|
||||
|
||||
list_del_init(&cfs_rq->throttled_csd_list);
|
||||
|
||||
if (cfs_rq_throttled(cfs_rq))
|
||||
unthrottle_cfs_rq(cfs_rq);
|
||||
|
||||
rq_unlock_irqrestore(rq, &rf);
|
||||
}
|
||||
SCHED_WARN_ON(!list_empty(&local_unthrottle));
|
||||
|
||||
rcu_read_unlock();
|
||||
|
||||
return throttled;
|
||||
}
|
||||
@ -6204,9 +6251,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
cfs_rq->runtime_enabled = 0;
|
||||
INIT_LIST_HEAD(&cfs_rq->throttled_list);
|
||||
#ifdef CONFIG_SMP
|
||||
INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
|
||||
#endif
|
||||
}
|
||||
|
||||
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
|
||||
@ -7164,45 +7209,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
|
||||
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
|
||||
int i, cpu, idle_cpu = -1, nr = INT_MAX;
|
||||
struct sched_domain_shared *sd_share;
|
||||
struct rq *this_rq = this_rq();
|
||||
int this = smp_processor_id();
|
||||
struct sched_domain *this_sd = NULL;
|
||||
u64 time = 0;
|
||||
|
||||
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
|
||||
|
||||
if (sched_feat(SIS_PROP) && !has_idle_core) {
|
||||
u64 avg_cost, avg_idle, span_avg;
|
||||
unsigned long now = jiffies;
|
||||
|
||||
this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
|
||||
if (!this_sd)
|
||||
return -1;
|
||||
|
||||
/*
|
||||
* If we're busy, the assumption that the last idle period
|
||||
* predicts the future is flawed; age away the remaining
|
||||
* predicted idle time.
|
||||
*/
|
||||
if (unlikely(this_rq->wake_stamp < now)) {
|
||||
while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) {
|
||||
this_rq->wake_stamp++;
|
||||
this_rq->wake_avg_idle >>= 1;
|
||||
}
|
||||
}
|
||||
|
||||
avg_idle = this_rq->wake_avg_idle;
|
||||
avg_cost = this_sd->avg_scan_cost + 1;
|
||||
|
||||
span_avg = sd->span_weight * avg_idle;
|
||||
if (span_avg > 4*avg_cost)
|
||||
nr = div_u64(span_avg, avg_cost);
|
||||
else
|
||||
nr = 4;
|
||||
|
||||
time = cpu_clock(this);
|
||||
}
|
||||
|
||||
if (sched_feat(SIS_UTIL)) {
|
||||
sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
|
||||
if (sd_share) {
|
||||
@ -7214,6 +7223,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
|
||||
}
|
||||
}
|
||||
|
||||
if (static_branch_unlikely(&sched_cluster_active)) {
struct sched_group *sg = sd->groups;

if (sg->flags & SD_CLUSTER) {
for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) {
if (!cpumask_test_cpu(cpu, cpus))
continue;

if (has_idle_core) {
i = select_idle_core(p, cpu, cpus, &idle_cpu);
if ((unsigned int)i < nr_cpumask_bits)
return i;
} else {
if (--nr <= 0)
return -1;
idle_cpu = __select_idle_cpu(cpu, p);
if ((unsigned int)idle_cpu < nr_cpumask_bits)
return idle_cpu;
}
}
cpumask_andnot(cpus, cpus, sched_group_span(sg));
}
}
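
The added block gives wakeups a two-stage search: if the sched domain's first group is an SD_CLUSTER group, its CPUs are probed first (wrapping from target + 1 and still honoring the nr scan budget), and only when that finds nothing are the cluster CPUs removed from the candidate mask so the remaining LLC CPUs get scanned. A stand-alone sketch of the shape of that search, with plain 8-bit masks and an is-idle array as illustrative stand-ins for cpumasks:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 8

static bool cpu_is_idle[NR_CPUS] = { [5] = true };

/* Return the first idle CPU set in @mask, or -1 if none. */
static int scan_mask(uint8_t mask)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		if ((mask & (1u << cpu)) && cpu_is_idle[cpu])
			return cpu;
	return -1;
}

/* Scan the target's cluster first, then the rest of the LLC. */
static int select_idle_cpu_sketch(uint8_t llc_mask, uint8_t cluster_mask)
{
	int cpu = scan_mask(llc_mask & cluster_mask);

	if (cpu >= 0)
		return cpu;

	/* Cluster had nothing idle: drop it and scan the remaining LLC CPUs. */
	return scan_mask(llc_mask & ~cluster_mask);
}

int main(void)
{
	/* LLC = CPUs 0-7, target's cluster = CPUs 0-3, only CPU 5 is idle. */
	printf("%d\n", select_idle_cpu_sketch(0xff, 0x0f));	/* prints 5 */
	return 0;
}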
|
||||
|
||||
for_each_cpu_wrap(cpu, cpus, target + 1) {
|
||||
if (has_idle_core) {
|
||||
i = select_idle_core(p, cpu, cpus, &idle_cpu);
|
||||
@ -7221,7 +7254,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
|
||||
return i;
|
||||
|
||||
} else {
|
||||
if (!--nr)
|
||||
if (--nr <= 0)
|
||||
return -1;
|
||||
idle_cpu = __select_idle_cpu(cpu, p);
|
||||
if ((unsigned int)idle_cpu < nr_cpumask_bits)
|
||||
@ -7232,18 +7265,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
|
||||
if (has_idle_core)
|
||||
set_idle_cores(target, false);
|
||||
|
||||
if (sched_feat(SIS_PROP) && this_sd && !has_idle_core) {
|
||||
time = cpu_clock(this) - time;
|
||||
|
||||
/*
|
||||
* Account for the scan cost of wakeups against the average
|
||||
* idle time.
|
||||
*/
|
||||
this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time);
|
||||
|
||||
update_avg(&this_sd->avg_scan_cost, time);
|
||||
}
|
||||
|
||||
return idle_cpu;
|
||||
}
|
||||
|
||||
@ -7283,7 +7304,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
|
||||
* Look for the CPU with best capacity.
|
||||
*/
|
||||
else if (fits < 0)
|
||||
cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu));
|
||||
cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu));
|
||||
|
||||
/*
|
||||
* First, select CPU which fits better (-1 being better than 0).
|
||||
@@ -7323,7 +7344,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
bool has_idle_core = false;
struct sched_domain *sd;
unsigned long task_util, util_min, util_max;
int i, recent_used_cpu;
int i, recent_used_cpu, prev_aff = -1;

/*
* On asymmetric system, update task utilization because we will check
@@ -7350,8 +7371,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
*/
if (prev != target && cpus_share_cache(prev, target) &&
(available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
asym_fits_cpu(task_util, util_min, util_max, prev))
return prev;
asym_fits_cpu(task_util, util_min, util_max, prev)) {

if (!static_branch_unlikely(&sched_cluster_active) ||
cpus_share_resources(prev, target))
return prev;

prev_aff = prev;
}

/*
* Allow a per-cpu kthread to stack with the wakee if the
@@ -7378,7 +7405,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
(available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
return recent_used_cpu;

if (!static_branch_unlikely(&sched_cluster_active) ||
cpus_share_resources(recent_used_cpu, target))
return recent_used_cpu;

} else {
recent_used_cpu = -1;
}

/*
@@ -7419,6 +7452,17 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if ((unsigned)i < nr_cpumask_bits)
return i;

/*
* For cluster machines which have lower sharing cache like L2 or
* LLC Tag, we tend to find an idle CPU in the target's cluster
* first. But prev_cpu or recent_used_cpu may also be a good candidate,
* use them if possible when no idle CPU found in select_idle_cpu().
*/
if ((unsigned int)prev_aff < nr_cpumask_bits)
return prev_aff;
if ((unsigned int)recent_used_cpu < nr_cpumask_bits)
return recent_used_cpu;

return target;
}
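
Note the fallback idiom used here: prev_aff and recent_used_cpu start at -1, and '(unsigned int)cpu < nr_cpumask_bits' is the usual test for "this holds a real CPU number", because -1 converts to a value far above any valid CPU index. A compressed sketch of that defer-then-fall-back flow (constants and helpers are illustrative):

#include <stdio.h>

#define NR_CPUMASK_BITS 64

/* -1 cast to unsigned is > NR_CPUMASK_BITS, so this rejects "no candidate". */
static int cpu_is_valid(int cpu)
{
	return (unsigned int)cpu < NR_CPUMASK_BITS;
}

static int select_sketch(int prev_aff, int recent_used_cpu, int scanned, int target)
{
	if (cpu_is_valid(scanned))	/* idle CPU found by the cluster/LLC scan */
		return scanned;
	if (cpu_is_valid(prev_aff))	/* else fall back to the deferred prev CPU */
		return prev_aff;
	if (cpu_is_valid(recent_used_cpu))
		return recent_used_cpu;
	return target;
}

int main(void)
{
	printf("%d\n", select_sketch(-1, 3, -1, 0));	/* prints 3 */
	return 0;
}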
|
||||
|
||||
@ -7525,7 +7569,7 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
|
||||
util = max(util, util_est);
|
||||
}
|
||||
|
||||
return min(util, capacity_orig_of(cpu));
|
||||
return min(util, arch_scale_cpu_capacity(cpu));
|
||||
}
|
||||
|
||||
unsigned long cpu_util_cfs(int cpu)
|
||||
@@ -7677,11 +7721,16 @@ compute_energy(struct energy_env *eenv, struct perf_domain *pd,
{
unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
unsigned long busy_time = eenv->pd_busy_time;
unsigned long energy;

if (dst_cpu >= 0)
busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);

return em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);

trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time);

return energy;
}
|
||||
|
||||
/*
|
||||
@ -7756,7 +7805,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
|
||||
target = prev_cpu;
|
||||
|
||||
sync_entity_load_avg(&p->se);
|
||||
if (!uclamp_task_util(p, p_util_min, p_util_max))
|
||||
if (!task_util_est(p) && p_util_min == 0)
|
||||
goto unlock;
|
||||
|
||||
eenv_task_busy_time(&eenv, p, prev_cpu);
|
||||
@ -7764,11 +7813,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
|
||||
for (; pd; pd = pd->next) {
|
||||
unsigned long util_min = p_util_min, util_max = p_util_max;
|
||||
unsigned long cpu_cap, cpu_thermal_cap, util;
|
||||
unsigned long cur_delta, max_spare_cap = 0;
|
||||
long prev_spare_cap = -1, max_spare_cap = -1;
|
||||
unsigned long rq_util_min, rq_util_max;
|
||||
unsigned long prev_spare_cap = 0;
|
||||
unsigned long cur_delta, base_energy;
|
||||
int max_spare_cap_cpu = -1;
|
||||
unsigned long base_energy;
|
||||
int fits, max_fits = -1;
|
||||
|
||||
cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
|
||||
@ -7831,7 +7879,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
|
||||
prev_spare_cap = cpu_cap;
|
||||
prev_fits = fits;
|
||||
} else if ((fits > max_fits) ||
|
||||
((fits == max_fits) && (cpu_cap > max_spare_cap))) {
|
||||
((fits == max_fits) && ((long)cpu_cap > max_spare_cap))) {
|
||||
/*
|
||||
* Find the CPU with the maximum spare capacity
|
||||
* among the remaining CPUs in the performance
|
||||
@ -7843,7 +7891,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
|
||||
}
|
||||
}
|
||||
|
||||
if (max_spare_cap_cpu < 0 && prev_spare_cap == 0)
|
||||
if (max_spare_cap_cpu < 0 && prev_spare_cap < 0)
|
||||
continue;
|
||||
|
||||
eenv_pd_busy_time(&eenv, cpus, p);
|
||||
@ -7851,7 +7899,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
|
||||
base_energy = compute_energy(&eenv, pd, cpus, p, -1);
|
||||
|
||||
/* Evaluate the energy impact of using prev_cpu. */
|
||||
if (prev_spare_cap > 0) {
|
||||
if (prev_spare_cap > -1) {
|
||||
prev_delta = compute_energy(&eenv, pd, cpus, p,
|
||||
prev_cpu);
|
||||
/* CPU utilization has changed */
|
||||
@ -8052,7 +8100,7 @@ static void set_next_buddy(struct sched_entity *se)
|
||||
/*
|
||||
* Preempt the current task with a newly woken task if needed:
|
||||
*/
|
||||
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
|
||||
static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
|
||||
{
|
||||
struct task_struct *curr = rq->curr;
|
||||
struct sched_entity *se = &curr->se, *pse = &p->se;
|
||||
@ -8065,7 +8113,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
|
||||
|
||||
/*
|
||||
* This is possible from callers such as attach_tasks(), in which we
|
||||
* unconditionally check_preempt_curr() after an enqueue (which may have
|
||||
* unconditionally wakeup_preempt() after an enqueue (which may have
|
||||
* lead to a throttle). This both saves work and prevents false
|
||||
* next-buddy nomination below.
|
||||
*/
|
||||
@ -8157,7 +8205,7 @@ again:
|
||||
goto again;
|
||||
}
|
||||
|
||||
se = pick_next_entity(cfs_rq, curr);
|
||||
se = pick_next_entity(cfs_rq);
|
||||
cfs_rq = group_cfs_rq(se);
|
||||
} while (cfs_rq);
|
||||
|
||||
@ -8220,7 +8268,7 @@ again:
|
||||
}
|
||||
}
|
||||
|
||||
se = pick_next_entity(cfs_rq, curr);
|
||||
se = pick_next_entity(cfs_rq);
|
||||
cfs_rq = group_cfs_rq(se);
|
||||
} while (cfs_rq);
|
||||
|
||||
@ -8259,7 +8307,7 @@ simple:
|
||||
put_prev_task(rq, prev);
|
||||
|
||||
do {
|
||||
se = pick_next_entity(cfs_rq, NULL);
|
||||
se = pick_next_entity(cfs_rq);
|
||||
set_next_entity(cfs_rq, se);
|
||||
cfs_rq = group_cfs_rq(se);
|
||||
} while (cfs_rq);
|
||||
@ -8972,7 +9020,7 @@ static void attach_task(struct rq *rq, struct task_struct *p)
|
||||
|
||||
WARN_ON_ONCE(task_rq(p) != rq);
|
||||
activate_task(rq, p, ENQUEUE_NOCLOCK);
|
||||
check_preempt_curr(rq, p, 0);
|
||||
wakeup_preempt(rq, p, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -9312,8 +9360,6 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
|
||||
unsigned long capacity = scale_rt_capacity(cpu);
|
||||
struct sched_group *sdg = sd->groups;
|
||||
|
||||
cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
|
||||
|
||||
if (!capacity)
|
||||
capacity = 1;
|
||||
|
||||
@ -9389,7 +9435,7 @@ static inline int
|
||||
check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
|
||||
{
|
||||
return ((rq->cpu_capacity * sd->imbalance_pct) <
|
||||
(rq->cpu_capacity_orig * 100));
|
||||
(arch_scale_cpu_capacity(cpu_of(rq)) * 100));
|
||||
}
|
||||
|
||||
/*
|
||||
@ -9400,7 +9446,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
|
||||
static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
|
||||
{
|
||||
return rq->misfit_task_load &&
|
||||
(rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
|
||||
(arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity ||
|
||||
check_cpu_capacity(rq, sd));
|
||||
}
|
||||
|
||||
@ -9552,7 +9598,7 @@ static bool sched_use_asym_prio(struct sched_domain *sd, int cpu)
|
||||
* can only do it if @group is an SMT group and has exactly on busy CPU. Larger
|
||||
* imbalances in the number of CPUS are dealt with in find_busiest_group().
|
||||
*
|
||||
* If we are balancing load within an SMT core, or at DIE domain level, always
|
||||
* If we are balancing load within an SMT core, or at PKG domain level, always
|
||||
* proceed.
|
||||
*
|
||||
* Return: true if @env::dst_cpu can do with asym_packing load balance. False
|
||||
@ -11251,13 +11297,15 @@ more_balance:
|
||||
busiest->push_cpu = this_cpu;
|
||||
active_balance = 1;
|
||||
}
|
||||
raw_spin_rq_unlock_irqrestore(busiest, flags);
|
||||
|
||||
preempt_disable();
|
||||
raw_spin_rq_unlock_irqrestore(busiest, flags);
|
||||
if (active_balance) {
|
||||
stop_one_cpu_nowait(cpu_of(busiest),
|
||||
active_load_balance_cpu_stop, busiest,
|
||||
&busiest->active_balance_work);
|
||||
}
|
||||
preempt_enable();
|
||||
}
|
||||
} else {
|
||||
sd->nr_balance_failed = 0;
|
||||
@ -11565,36 +11613,39 @@ static inline int on_null_domain(struct rq *rq)
|
||||
|
||||
#ifdef CONFIG_NO_HZ_COMMON
|
||||
/*
|
||||
* idle load balancing details
|
||||
* - When one of the busy CPUs notice that there may be an idle rebalancing
|
||||
* NOHZ idle load balancing (ILB) details:
|
||||
*
|
||||
* - When one of the busy CPUs notices that there may be an idle rebalancing
|
||||
* needed, they will kick the idle load balancer, which then does idle
|
||||
* load balancing for all the idle CPUs.
|
||||
* - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED not set
|
||||
*
|
||||
* - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not set
|
||||
* anywhere yet.
|
||||
*/
|
||||
|
||||
static inline int find_new_ilb(void)
|
||||
{
|
||||
int ilb;
|
||||
const struct cpumask *hk_mask;
|
||||
int ilb_cpu;
|
||||
|
||||
hk_mask = housekeeping_cpumask(HK_TYPE_MISC);
|
||||
|
||||
for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {
|
||||
for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
|
||||
|
||||
if (ilb == smp_processor_id())
|
||||
if (ilb_cpu == smp_processor_id())
|
||||
continue;
|
||||
|
||||
if (idle_cpu(ilb))
|
||||
return ilb;
|
||||
if (idle_cpu(ilb_cpu))
|
||||
return ilb_cpu;
|
||||
}
|
||||
|
||||
return nr_cpu_ids;
|
||||
return -1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Kick a CPU to do the nohz balancing, if it is time for it. We pick any
|
||||
* idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
|
||||
* Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU
|
||||
* SMP function call (IPI).
|
||||
*
|
||||
* We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
|
||||
*/
|
||||
static void kick_ilb(unsigned int flags)
|
||||
{
|
||||
@ -11608,8 +11659,7 @@ static void kick_ilb(unsigned int flags)
|
||||
nohz.next_balance = jiffies+1;
|
||||
|
||||
ilb_cpu = find_new_ilb();
|
||||
|
||||
if (ilb_cpu >= nr_cpu_ids)
|
||||
if (ilb_cpu < 0)
|
||||
return;
|
||||
|
||||
/*
|
||||
@ -11622,7 +11672,7 @@ static void kick_ilb(unsigned int flags)
|
||||
|
||||
/*
|
||||
* This way we generate an IPI on the target CPU which
|
||||
* is idle. And the softirq performing nohz idle load balance
|
||||
* is idle, and the softirq performing NOHZ idle load balancing
|
||||
* will be run before returning from the IPI.
|
||||
*/
|
||||
smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
|
||||
@ -11651,7 +11701,7 @@ static void nohz_balancer_kick(struct rq *rq)
|
||||
|
||||
/*
|
||||
* None are in tickless mode and hence no need for NOHZ idle load
|
||||
* balancing.
|
||||
* balancing:
|
||||
*/
|
||||
if (likely(!atomic_read(&nohz.nr_cpus)))
|
||||
return;
|
||||
@ -11673,9 +11723,8 @@ static void nohz_balancer_kick(struct rq *rq)
|
||||
sd = rcu_dereference(rq->sd);
|
||||
if (sd) {
|
||||
/*
|
||||
* If there's a CFS task and the current CPU has reduced
|
||||
* capacity; kick the ILB to see if there's a better CPU to run
|
||||
* on.
|
||||
* If there's a runnable CFS task and the current CPU has reduced
|
||||
* capacity, kick the ILB to see if there's a better CPU to run on:
|
||||
*/
|
||||
if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
|
||||
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
|
||||
@ -11727,11 +11776,11 @@ static void nohz_balancer_kick(struct rq *rq)
|
||||
if (sds) {
|
||||
/*
|
||||
* If there is an imbalance between LLC domains (IOW we could
|
||||
* increase the overall cache use), we need some less-loaded LLC
|
||||
* domain to pull some load. Likewise, we may need to spread
|
||||
* increase the overall cache utilization), we need a less-loaded LLC
|
||||
* domain to pull some load from. Likewise, we may need to spread
|
||||
* load within the current LLC domain (e.g. packed SMT cores but
|
||||
* other CPUs are idle). We can't really know from here how busy
|
||||
* the others are - so just get a nohz balance going if it looks
|
||||
* the others are - so just get a NOHZ balance going if it looks
|
||||
* like this LLC domain has tasks we could move.
|
||||
*/
|
||||
nr_busy = atomic_read(&sds->nr_busy_cpus);
|
||||
@ -12001,8 +12050,19 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if we need to run the ILB for updating blocked load before entering
|
||||
* idle state.
|
||||
* Check if we need to directly run the ILB for updating blocked load before
|
||||
* entering idle state. Here we run ILB directly without issuing IPIs.
|
||||
*
|
||||
* Note that when this function is called, the tick may not yet be stopped on
|
||||
* this CPU yet. nohz.idle_cpus_mask is updated only when tick is stopped and
|
||||
* cleared on the next busy tick. In other words, nohz.idle_cpus_mask updates
|
||||
* don't align with CPUs enter/exit idle to avoid bottlenecks due to high idle
|
||||
* entry/exit rate (usec). So it is possible that _nohz_idle_balance() is
|
||||
* called from this function on (this) CPU that's not yet in the mask. That's
|
||||
* OK because the goal of nohz_run_idle_balance() is to run ILB only for
|
||||
* updating the blocked load of already idle CPUs without waking up one of
|
||||
* those idle CPUs and outside the preempt disable / irq off phase of the local
|
||||
* cpu about to enter idle, because it can take a long time.
|
||||
*/
|
||||
void nohz_run_idle_balance(int cpu)
|
||||
{
|
||||
@ -12447,7 +12507,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
|
||||
if (p->prio > oldprio)
|
||||
resched_curr(rq);
|
||||
} else
|
||||
check_preempt_curr(rq, p, 0);
|
||||
wakeup_preempt(rq, p, 0);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
@ -12549,7 +12609,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
|
||||
if (task_current(rq, p))
|
||||
resched_curr(rq);
|
||||
else
|
||||
check_preempt_curr(rq, p, 0);
|
||||
wakeup_preempt(rq, p, 0);
|
||||
}
|
||||
}
|
||||
|
||||
@ -12908,7 +12968,7 @@ DEFINE_SCHED_CLASS(fair) = {
|
||||
.yield_task = yield_task_fair,
|
||||
.yield_to_task = yield_to_task_fair,
|
||||
|
||||
.check_preempt_curr = check_preempt_wakeup,
|
||||
.wakeup_preempt = check_preempt_wakeup_fair,
|
||||
|
||||
.pick_next_task = __pick_next_task_fair,
|
||||
.put_prev_task = put_prev_task_fair,
|
||||
|
@ -49,7 +49,6 @@ SCHED_FEAT(TTWU_QUEUE, true)
|
||||
/*
|
||||
* When doing wakeups, attempt to limit superfluous scans of the LLC domain.
|
||||
*/
|
||||
SCHED_FEAT(SIS_PROP, false)
|
||||
SCHED_FEAT(SIS_UTIL, true)
|
||||
|
||||
/*
|
||||
|
@ -401,7 +401,7 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
|
||||
/*
|
||||
* Idle tasks are unconditionally rescheduled:
|
||||
*/
|
||||
static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
|
||||
static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
|
||||
{
|
||||
resched_curr(rq);
|
||||
}
|
||||
@ -482,7 +482,7 @@ DEFINE_SCHED_CLASS(idle) = {
|
||||
/* dequeue is not valid, we print a debug message there: */
|
||||
.dequeue_task = dequeue_task_idle,
|
||||
|
||||
.check_preempt_curr = check_preempt_curr_idle,
|
||||
.wakeup_preempt = wakeup_preempt_idle,
|
||||
|
||||
.pick_next_task = pick_next_task_idle,
|
||||
.put_prev_task = put_prev_task_idle,
|
||||
|
@ -1,6 +1,6 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Per Entity Load Tracking
|
||||
* Per Entity Load Tracking (PELT)
|
||||
*
|
||||
* Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
|
||||
*
|
||||
|
@ -434,14 +434,13 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value)
|
||||
return growth;
|
||||
}
|
||||
|
||||
static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total,
|
||||
static void update_triggers(struct psi_group *group, u64 now,
|
||||
enum psi_aggregators aggregator)
|
||||
{
|
||||
struct psi_trigger *t;
|
||||
u64 *total = group->total[aggregator];
|
||||
struct list_head *triggers;
|
||||
u64 *aggregator_total;
|
||||
*update_total = false;
|
||||
|
||||
if (aggregator == PSI_AVGS) {
|
||||
triggers = &group->avg_triggers;
|
||||
@ -471,14 +470,6 @@ static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total,
|
||||
* events without dropping any).
|
||||
*/
|
||||
if (new_stall) {
|
||||
/*
|
||||
* Multiple triggers might be looking at the same state,
|
||||
* remember to update group->polling_total[] once we've
|
||||
* been through all of them. Also remember to extend the
|
||||
* polling time if we see new stall activity.
|
||||
*/
|
||||
*update_total = true;
|
||||
|
||||
/* Calculate growth since last update */
|
||||
growth = window_update(&t->win, now, total[t->state]);
|
||||
if (!t->pending_event) {
|
||||
@ -503,8 +494,6 @@ static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total,
|
||||
/* Reset threshold breach flag once event got generated */
|
||||
t->pending_event = false;
|
||||
}
|
||||
|
||||
return now + group->rtpoll_min_period;
|
||||
}
|
||||
|
||||
static u64 update_averages(struct psi_group *group, u64 now)
|
||||
@ -565,7 +554,6 @@ static void psi_avgs_work(struct work_struct *work)
|
||||
struct delayed_work *dwork;
|
||||
struct psi_group *group;
|
||||
u32 changed_states;
|
||||
bool update_total;
|
||||
u64 now;
|
||||
|
||||
dwork = to_delayed_work(work);
|
||||
@ -584,7 +572,7 @@ static void psi_avgs_work(struct work_struct *work)
|
||||
* go - see calc_avgs() and missed_periods.
|
||||
*/
|
||||
if (now >= group->avg_next_update) {
|
||||
update_triggers(group, now, &update_total, PSI_AVGS);
|
||||
update_triggers(group, now, PSI_AVGS);
|
||||
group->avg_next_update = update_averages(group, now);
|
||||
}
|
||||
|
||||
@ -608,7 +596,7 @@ static void init_rtpoll_triggers(struct psi_group *group, u64 now)
|
||||
group->rtpoll_next_update = now + group->rtpoll_min_period;
|
||||
}
|
||||
|
||||
/* Schedule polling if it's not already scheduled or forced. */
|
||||
/* Schedule rtpolling if it's not already scheduled or forced. */
|
||||
static void psi_schedule_rtpoll_work(struct psi_group *group, unsigned long delay,
|
||||
bool force)
|
||||
{
|
||||
@ -640,7 +628,6 @@ static void psi_rtpoll_work(struct psi_group *group)
|
||||
{
|
||||
bool force_reschedule = false;
|
||||
u32 changed_states;
|
||||
bool update_total;
|
||||
u64 now;
|
||||
|
||||
mutex_lock(&group->rtpoll_trigger_lock);
|
||||
@ -649,37 +636,37 @@ static void psi_rtpoll_work(struct psi_group *group)
|
||||
|
||||
if (now > group->rtpoll_until) {
|
||||
/*
|
||||
* We are either about to start or might stop polling if no
|
||||
* state change was recorded. Resetting poll_scheduled leaves
|
||||
* We are either about to start or might stop rtpolling if no
|
||||
* state change was recorded. Resetting rtpoll_scheduled leaves
|
||||
* a small window for psi_group_change to sneak in and schedule
|
||||
* an immediate poll_work before we get to rescheduling. One
|
||||
* potential extra wakeup at the end of the polling window
|
||||
* should be negligible and polling_next_update still keeps
|
||||
* an immediate rtpoll_work before we get to rescheduling. One
|
||||
* potential extra wakeup at the end of the rtpolling window
|
||||
* should be negligible and rtpoll_next_update still keeps
|
||||
* updates correctly on schedule.
|
||||
*/
|
||||
atomic_set(&group->rtpoll_scheduled, 0);
|
||||
/*
|
||||
* A task change can race with the poll worker that is supposed to
|
||||
* A task change can race with the rtpoll worker that is supposed to
|
||||
* report on it. To avoid missing events, ensure ordering between
|
||||
* poll_scheduled and the task state accesses, such that if the poll
|
||||
* worker misses the state update, the task change is guaranteed to
|
||||
* reschedule the poll worker:
|
||||
* rtpoll_scheduled and the task state accesses, such that if the
|
||||
* rtpoll worker misses the state update, the task change is
|
||||
* guaranteed to reschedule the rtpoll worker:
|
||||
*
|
||||
* poll worker:
|
||||
* atomic_set(poll_scheduled, 0)
|
||||
* rtpoll worker:
|
||||
* atomic_set(rtpoll_scheduled, 0)
|
||||
* smp_mb()
|
||||
* LOAD states
|
||||
*
|
||||
* task change:
|
||||
* STORE states
|
||||
* if atomic_xchg(poll_scheduled, 1) == 0:
|
||||
* schedule poll worker
|
||||
* if atomic_xchg(rtpoll_scheduled, 1) == 0:
|
||||
* schedule rtpoll worker
|
||||
*
|
||||
* The atomic_xchg() implies a full barrier.
|
||||
*/
|
||||
smp_mb();
|
||||
} else {
|
||||
/* Polling window is not over, keep rescheduling */
|
||||
/* The rtpolling window is not over, keep rescheduling */
|
||||
force_reschedule = true;
|
||||
}
|
||||
|
||||
@ -687,7 +674,7 @@ static void psi_rtpoll_work(struct psi_group *group)
|
||||
collect_percpu_times(group, PSI_POLL, &changed_states);
|
||||
|
||||
if (changed_states & group->rtpoll_states) {
|
||||
/* Initialize trigger windows when entering polling mode */
|
||||
/* Initialize trigger windows when entering rtpolling mode */
|
||||
if (now > group->rtpoll_until)
|
||||
init_rtpoll_triggers(group, now);
|
||||
|
||||
@@ -706,10 +693,12 @@ static void psi_rtpoll_work(struct psi_group *group)
}

if (now >= group->rtpoll_next_update) {
group->rtpoll_next_update = update_triggers(group, now, &update_total, PSI_POLL);
if (update_total)
if (changed_states & group->rtpoll_states) {
update_triggers(group, now, PSI_POLL);
memcpy(group->rtpoll_total, group->total[PSI_POLL],
sizeof(group->rtpoll_total));
}
group->rtpoll_next_update = now + group->rtpoll_min_period;
}
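
The rewritten block makes the trigger walk and the rtpoll_total snapshot conditional on changed_states, so when no polled state changed during the window nothing is recomputed, while rtpoll_next_update still advances unconditionally. The same control flow in isolation; the struct layout and names below are illustrative, not the psi_group definition:

#include <stdint.h>
#include <string.h>

#define RTPOLL_MIN_PERIOD_NS 1000000ULL
#define NR_PSI_STATES 8

struct rtpoll_sketch {
	uint64_t next_update;
	uint64_t total[NR_PSI_STATES];		/* live per-state counters */
	uint64_t rtpoll_total[NR_PSI_STATES];	/* snapshot consumed by triggers */
	uint32_t rtpoll_states;			/* states we are polling for */
};

static void check_triggers(struct rtpoll_sketch *g) { (void)g; /* walk triggers here */ }

static void rtpoll_tick(struct rtpoll_sketch *g, uint64_t now, uint32_t changed_states)
{
	if (now < g->next_update)
		return;

	/* Only do the expensive work when a polled state actually changed. */
	if (changed_states & g->rtpoll_states) {
		check_triggers(g);
		memcpy(g->rtpoll_total, g->total, sizeof(g->rtpoll_total));
	}
	/* The next update time is advanced either way. */
	g->next_update = now + RTPOLL_MIN_PERIOD_NS;
}

int main(void)
{
	struct rtpoll_sketch g = { .rtpoll_states = 0x3 };

	rtpoll_tick(&g, 2 * RTPOLL_MIN_PERIOD_NS, 0);	/* no change: only the timer moves */
	rtpoll_tick(&g, 4 * RTPOLL_MIN_PERIOD_NS, 0x1);	/* change: triggers + snapshot run */
	return 0;
}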
|
||||
|
||||
psi_schedule_rtpoll_work(group,
|
||||
@ -1009,6 +998,9 @@ void psi_account_irqtime(struct task_struct *task, u32 delta)
|
||||
struct psi_group_cpu *groupc;
|
||||
u64 now;
|
||||
|
||||
if (static_branch_likely(&psi_disabled))
|
||||
return;
|
||||
|
||||
if (!task->pid)
|
||||
return;
|
||||
|
||||
|
@@ -16,7 +16,7 @@ struct rt_bandwidth def_rt_bandwidth;
* period over which we measure -rt task CPU usage in us.
* default: 1s
*/
unsigned int sysctl_sched_rt_period = 1000000;
int sysctl_sched_rt_period = 1000000;

/*
* part of the period that we allow rt tasks to run in us.
@@ -34,9 +34,11 @@ static struct ctl_table sched_rt_sysctls[] = {
{
.procname = "sched_rt_period_us",
.data = &sysctl_sched_rt_period,
.maxlen = sizeof(unsigned int),
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = sched_rt_handler,
.extra1 = SYSCTL_ONE,
.extra2 = SYSCTL_INT_MAX,
},
{
.procname = "sched_rt_runtime_us",
@@ -44,6 +46,8 @@ static struct ctl_table sched_rt_sysctls[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = sched_rt_handler,
.extra1 = SYSCTL_NEG_ONE,
.extra2 = (void *)&sysctl_sched_rt_period,
},
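
With .extra1/.extra2 added and sched_rt_handler switched to proc_dointvec_minmax() in a later hunk, writes outside [1, INT_MAX] for sched_rt_period_us, or outside [-1, period] for sched_rt_runtime_us, are rejected at the sysctl layer. A small stand-alone helper showing the intended semantics (a sketch of the range check only, not the procfs plumbing):

#include <errno.h>
#include <limits.h>
#include <stdio.h>

/*
 * Accept a new value only if it lies within [min, max], as
 * proc_dointvec_minmax() does for a ctl_table entry with
 * .extra1 = &min and .extra2 = &max.
 */
static int sysctl_set_int_minmax(int *var, int val, int min, int max)
{
	if (val < min || val > max)
		return -EINVAL;
	*var = val;
	return 0;
}

int main(void)
{
	int sched_rt_period_us = 1000000;

	/* Zero and negative periods are now rejected up front. */
	printf("%d\n", sysctl_set_int_minmax(&sched_rt_period_us, 0, 1, INT_MAX));
	printf("%d\n", sysctl_set_int_minmax(&sched_rt_period_us, 500000, 1, INT_MAX));
	return 0;
}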
|
||||
{
|
||||
.procname = "sched_rr_timeslice_ms",
|
||||
@ -143,7 +147,6 @@ void init_rt_rq(struct rt_rq *rt_rq)
|
||||
#if defined CONFIG_SMP
|
||||
rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
|
||||
rt_rq->highest_prio.next = MAX_RT_PRIO-1;
|
||||
rt_rq->rt_nr_migratory = 0;
|
||||
rt_rq->overloaded = 0;
|
||||
plist_head_init(&rt_rq->pushable_tasks);
|
||||
#endif /* CONFIG_SMP */
|
||||
@ -358,53 +361,6 @@ static inline void rt_clear_overload(struct rq *rq)
|
||||
cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
|
||||
}
|
||||
|
||||
static void update_rt_migration(struct rt_rq *rt_rq)
|
||||
{
|
||||
if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
|
||||
if (!rt_rq->overloaded) {
|
||||
rt_set_overload(rq_of_rt_rq(rt_rq));
|
||||
rt_rq->overloaded = 1;
|
||||
}
|
||||
} else if (rt_rq->overloaded) {
|
||||
rt_clear_overload(rq_of_rt_rq(rt_rq));
|
||||
rt_rq->overloaded = 0;
|
||||
}
|
||||
}
|
||||
|
||||
static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
|
||||
{
|
||||
struct task_struct *p;
|
||||
|
||||
if (!rt_entity_is_task(rt_se))
|
||||
return;
|
||||
|
||||
p = rt_task_of(rt_se);
|
||||
rt_rq = &rq_of_rt_rq(rt_rq)->rt;
|
||||
|
||||
rt_rq->rt_nr_total++;
|
||||
if (p->nr_cpus_allowed > 1)
|
||||
rt_rq->rt_nr_migratory++;
|
||||
|
||||
update_rt_migration(rt_rq);
|
||||
}
|
||||
|
||||
static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
|
||||
{
|
||||
struct task_struct *p;
|
||||
|
||||
if (!rt_entity_is_task(rt_se))
|
||||
return;
|
||||
|
||||
p = rt_task_of(rt_se);
|
||||
rt_rq = &rq_of_rt_rq(rt_rq)->rt;
|
||||
|
||||
rt_rq->rt_nr_total--;
|
||||
if (p->nr_cpus_allowed > 1)
|
||||
rt_rq->rt_nr_migratory--;
|
||||
|
||||
update_rt_migration(rt_rq);
|
||||
}
|
||||
|
||||
static inline int has_pushable_tasks(struct rq *rq)
|
||||
{
|
||||
return !plist_head_empty(&rq->rt.pushable_tasks);
|
||||
@ -438,6 +394,11 @@ static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
|
||||
/* Update the highest prio pushable task */
|
||||
if (p->prio < rq->rt.highest_prio.next)
|
||||
rq->rt.highest_prio.next = p->prio;
|
||||
|
||||
if (!rq->rt.overloaded) {
|
||||
rt_set_overload(rq);
|
||||
rq->rt.overloaded = 1;
|
||||
}
|
||||
}
|
||||
|
||||
static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
|
||||
@ -451,6 +412,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
|
||||
rq->rt.highest_prio.next = p->prio;
|
||||
} else {
|
||||
rq->rt.highest_prio.next = MAX_RT_PRIO-1;
|
||||
|
||||
if (rq->rt.overloaded) {
|
||||
rt_clear_overload(rq);
|
||||
rq->rt.overloaded = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -464,16 +430,6 @@ static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
}
|
||||
|
||||
static inline
|
||||
void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
|
||||
{
|
||||
}
|
||||
|
||||
static inline
|
||||
void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void rt_queue_push_tasks(struct rq *rq)
|
||||
{
|
||||
}
|
||||
@ -515,7 +471,7 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
|
||||
min_cap = uclamp_eff_value(p, UCLAMP_MIN);
|
||||
max_cap = uclamp_eff_value(p, UCLAMP_MAX);
|
||||
|
||||
cpu_cap = capacity_orig_of(cpu);
|
||||
cpu_cap = arch_scale_cpu_capacity(cpu);
|
||||
|
||||
return cpu_cap >= min(min_cap, max_cap);
|
||||
}
|
||||
@ -953,7 +909,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
|
||||
|
||||
/*
|
||||
* When we're idle and a woken (rt) task is
|
||||
* throttled check_preempt_curr() will set
|
||||
* throttled wakeup_preempt() will set
|
||||
* skip_update and the time between the wakeup
|
||||
* and this unthrottle will get accounted as
|
||||
* 'runtime'.
|
||||
@ -1281,7 +1237,6 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
|
||||
rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
|
||||
|
||||
inc_rt_prio(rt_rq, prio);
|
||||
inc_rt_migration(rt_se, rt_rq);
|
||||
inc_rt_group(rt_se, rt_rq);
|
||||
}
|
||||
|
||||
@ -1294,7 +1249,6 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
|
||||
rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
|
||||
|
||||
dec_rt_prio(rt_rq, rt_se_prio(rt_se));
|
||||
dec_rt_migration(rt_se, rt_rq);
|
||||
dec_rt_group(rt_se, rt_rq);
|
||||
}
|
||||
|
||||
@ -1715,7 +1669,7 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
|
||||
/*
|
||||
* Preempt the current task with a newly woken task if needed:
|
||||
*/
|
||||
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
|
||||
static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
|
||||
{
|
||||
if (p->prio < rq->curr->prio) {
|
||||
resched_curr(rq);
|
||||
@ -2109,9 +2063,11 @@ retry:
|
||||
*/
|
||||
push_task = get_push_task(rq);
|
||||
if (push_task) {
|
||||
preempt_disable();
|
||||
raw_spin_rq_unlock(rq);
|
||||
stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
|
||||
push_task, &rq->push_work);
|
||||
preempt_enable();
|
||||
raw_spin_rq_lock(rq);
|
||||
}
|
||||
|
||||
@ -2448,9 +2404,11 @@ skip:
|
||||
double_unlock_balance(this_rq, src_rq);
|
||||
|
||||
if (push_task) {
|
||||
preempt_disable();
|
||||
raw_spin_rq_unlock(this_rq);
|
||||
stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
|
||||
push_task, &src_rq->push_work);
|
||||
preempt_enable();
|
||||
raw_spin_rq_lock(this_rq);
|
||||
}
|
||||
}
|
||||
@ -2702,7 +2660,7 @@ DEFINE_SCHED_CLASS(rt) = {
|
||||
.dequeue_task = dequeue_task_rt,
|
||||
.yield_task = yield_task_rt,
|
||||
|
||||
.check_preempt_curr = check_preempt_curr_rt,
|
||||
.wakeup_preempt = wakeup_preempt_rt,
|
||||
|
||||
.pick_next_task = pick_next_task_rt,
|
||||
.put_prev_task = put_prev_task_rt,
|
||||
@ -2985,9 +2943,6 @@ static int sched_rt_global_constraints(void)
|
||||
#ifdef CONFIG_SYSCTL
|
||||
static int sched_rt_global_validate(void)
|
||||
{
|
||||
if (sysctl_sched_rt_period <= 0)
|
||||
return -EINVAL;
|
||||
|
||||
if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
|
||||
((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
|
||||
((u64)sysctl_sched_rt_runtime *
|
||||
@ -3018,7 +2973,7 @@ static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
|
||||
old_period = sysctl_sched_rt_period;
|
||||
old_runtime = sysctl_sched_rt_runtime;
|
||||
|
||||
ret = proc_dointvec(table, write, buffer, lenp, ppos);
|
||||
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
|
||||
|
||||
if (!ret && write) {
|
||||
ret = sched_rt_global_validate();
|
||||
|
@ -74,15 +74,6 @@
|
||||
|
||||
#include "../workqueue_internal.h"
|
||||
|
||||
#ifdef CONFIG_CGROUP_SCHED
|
||||
#include <linux/cgroup.h>
|
||||
#include <linux/psi.h>
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SCHED_DEBUG
|
||||
# include <linux/static_key.h>
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_PARAVIRT
|
||||
# include <asm/paravirt.h>
|
||||
# include <asm/paravirt_api_clock.h>
|
||||
@ -109,14 +100,12 @@ extern __read_mostly int scheduler_running;
|
||||
extern unsigned long calc_load_update;
|
||||
extern atomic_long_t calc_load_tasks;
|
||||
|
||||
extern unsigned int sysctl_sched_child_runs_first;
|
||||
|
||||
extern void calc_global_load_tick(struct rq *this_rq);
|
||||
extern long calc_load_fold_active(struct rq *this_rq, long adjust);
|
||||
|
||||
extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
|
||||
|
||||
extern unsigned int sysctl_sched_rt_period;
|
||||
extern int sysctl_sched_rt_period;
|
||||
extern int sysctl_sched_rt_runtime;
|
||||
extern int sched_rr_timeslice;
|
||||
|
||||
@ -594,6 +583,7 @@ struct cfs_rq {
|
||||
} removed;
|
||||
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
u64 last_update_tg_load_avg;
|
||||
unsigned long tg_load_avg_contrib;
|
||||
long propagate;
|
||||
long prop_runnable_sum;
|
||||
@ -644,9 +634,7 @@ struct cfs_rq {
|
||||
int throttled;
|
||||
int throttle_count;
|
||||
struct list_head throttled_list;
|
||||
#ifdef CONFIG_SMP
|
||||
struct list_head throttled_csd_list;
|
||||
#endif
|
||||
#endif /* CONFIG_CFS_BANDWIDTH */
|
||||
#endif /* CONFIG_FAIR_GROUP_SCHED */
|
||||
};
|
||||
@ -675,8 +663,6 @@ struct rt_rq {
|
||||
} highest_prio;
|
||||
#endif
|
||||
#ifdef CONFIG_SMP
|
||||
unsigned int rt_nr_migratory;
|
||||
unsigned int rt_nr_total;
|
||||
int overloaded;
|
||||
struct plist_head pushable_tasks;
|
||||
|
||||
@ -721,7 +707,6 @@ struct dl_rq {
|
||||
u64 next;
|
||||
} earliest_dl;
|
||||
|
||||
unsigned int dl_nr_migratory;
|
||||
int overloaded;
|
||||
|
||||
/*
|
||||
@ -963,10 +948,6 @@ struct rq {
|
||||
/* runqueue lock: */
|
||||
raw_spinlock_t __lock;
|
||||
|
||||
/*
|
||||
* nr_running and cpu_load should be in the same cacheline because
|
||||
* remote CPUs use both these fields when doing load calculation.
|
||||
*/
|
||||
unsigned int nr_running;
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
unsigned int nr_numa_running;
|
||||
@ -1048,7 +1029,6 @@ struct rq {
|
||||
struct sched_domain __rcu *sd;
|
||||
|
||||
unsigned long cpu_capacity;
|
||||
unsigned long cpu_capacity_orig;
|
||||
|
||||
struct balance_callback *balance_callback;
|
||||
|
||||
@ -1079,9 +1059,6 @@ struct rq {
|
||||
u64 idle_stamp;
|
||||
u64 avg_idle;
|
||||
|
||||
unsigned long wake_stamp;
|
||||
u64 wake_avg_idle;
|
||||
|
||||
/* This is used to determine avg_idle's max value */
|
||||
u64 max_idle_balance_cost;
|
||||
|
||||
@ -1658,6 +1635,11 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
|
||||
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
|
||||
}
|
||||
|
||||
DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct,
|
||||
_T->rq = task_rq_lock(_T->lock, &_T->rf),
|
||||
task_rq_unlock(_T->rq, _T->lock, &_T->rf),
|
||||
struct rq *rq; struct rq_flags rf)
|
||||
|
||||
static inline void
|
||||
rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
|
||||
__acquires(rq->lock)
|
||||
@ -1868,11 +1850,13 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
|
||||
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
|
||||
DECLARE_PER_CPU(int, sd_llc_size);
|
||||
DECLARE_PER_CPU(int, sd_llc_id);
|
||||
DECLARE_PER_CPU(int, sd_share_id);
|
||||
DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
|
||||
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
|
||||
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
|
||||
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
|
||||
extern struct static_key_false sched_asym_cpucapacity;
|
||||
extern struct static_key_false sched_cluster_active;
|
||||
|
||||
static __always_inline bool sched_asym_cpucap_active(void)
|
||||
{
|
||||
@ -2239,7 +2223,7 @@ struct sched_class {
|
||||
void (*yield_task) (struct rq *rq);
|
||||
bool (*yield_to_task)(struct rq *rq, struct task_struct *p);
|
||||
|
||||
void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
|
||||
void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags);
|
||||
|
||||
struct task_struct *(*pick_next_task)(struct rq *rq);
|
||||
|
||||
@ -2513,7 +2497,7 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
|
||||
extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
|
||||
extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
|
||||
|
||||
extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
|
||||
extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags);
|
||||
|
||||
#ifdef CONFIG_PREEMPT_RT
|
||||
#define SCHED_NR_MIGRATE_BREAK 8
|
||||
@ -2977,11 +2961,6 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
static inline unsigned long capacity_orig_of(int cpu)
|
||||
{
|
||||
return cpu_rq(cpu)->cpu_capacity_orig;
|
||||
}
|
||||
|
||||
/**
|
||||
* enum cpu_util_type - CPU utilization type
|
||||
* @FREQUENCY_UTIL: Utilization used to select frequency
|
||||
@ -3219,6 +3198,8 @@ static inline bool sched_energy_enabled(void)
|
||||
return static_branch_unlikely(&sched_energy_present);
|
||||
}
|
||||
|
||||
extern struct cpufreq_governor schedutil_gov;
|
||||
|
||||
#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
|
||||
|
||||
#define perf_domain_span(pd) NULL
|
||||
|
@ -23,7 +23,7 @@ balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
static void
|
||||
check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
|
||||
wakeup_preempt_stop(struct rq *rq, struct task_struct *p, int flags)
|
||||
{
|
||||
/* we're never preempted */
|
||||
}
|
||||
@ -120,7 +120,7 @@ DEFINE_SCHED_CLASS(stop) = {
|
||||
.dequeue_task = dequeue_task_stop,
|
||||
.yield_task = yield_task_stop,
|
||||
|
||||
.check_preempt_curr = check_preempt_curr_stop,
|
||||
.wakeup_preempt = wakeup_preempt_stop,
|
||||
|
||||
.pick_next_task = pick_next_task_stop,
|
||||
.put_prev_task = put_prev_task_stop,
|
||||
|
@ -212,6 +212,69 @@ static unsigned int sysctl_sched_energy_aware = 1;
|
||||
static DEFINE_MUTEX(sched_energy_mutex);
|
||||
static bool sched_energy_update;
|
||||
|
||||
static bool sched_is_eas_possible(const struct cpumask *cpu_mask)
|
||||
{
|
||||
bool any_asym_capacity = false;
|
||||
struct cpufreq_policy *policy;
|
||||
struct cpufreq_governor *gov;
|
||||
int i;
|
||||
|
||||
/* EAS is enabled for asymmetric CPU capacity topologies. */
|
||||
for_each_cpu(i, cpu_mask) {
|
||||
if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, i))) {
|
||||
any_asym_capacity = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!any_asym_capacity) {
|
||||
if (sched_debug()) {
|
||||
pr_info("rd %*pbl: Checking EAS, CPUs do not have asymmetric capacities\n",
|
||||
cpumask_pr_args(cpu_mask));
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/* EAS definitely does *not* handle SMT */
|
||||
if (sched_smt_active()) {
|
||||
if (sched_debug()) {
|
||||
pr_info("rd %*pbl: Checking EAS, SMT is not supported\n",
|
||||
cpumask_pr_args(cpu_mask));
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!arch_scale_freq_invariant()) {
|
||||
if (sched_debug()) {
|
||||
pr_info("rd %*pbl: Checking EAS: frequency-invariant load tracking not yet supported",
|
||||
cpumask_pr_args(cpu_mask));
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Do not attempt EAS if schedutil is not being used. */
|
||||
for_each_cpu(i, cpu_mask) {
|
||||
policy = cpufreq_cpu_get(i);
|
||||
if (!policy) {
|
||||
if (sched_debug()) {
|
||||
pr_info("rd %*pbl: Checking EAS, cpufreq policy not set for CPU: %d",
|
||||
cpumask_pr_args(cpu_mask), i);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
gov = policy->governor;
|
||||
cpufreq_cpu_put(policy);
|
||||
if (gov != &schedutil_gov) {
|
||||
if (sched_debug()) {
|
||||
pr_info("rd %*pbl: Checking EAS, schedutil is mandatory\n",
|
||||
cpumask_pr_args(cpu_mask));
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void rebuild_sched_domains_energy(void)
|
||||
{
|
||||
mutex_lock(&sched_energy_mutex);
|
||||
@@ -230,6 +293,15 @@ static int sched_energy_aware_handler(struct ctl_table *table, int write,
if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;

if (!sched_is_eas_possible(cpu_active_mask)) {
if (write) {
return -EOPNOTSUPP;
} else {
*lenp = 0;
return 0;
}
}

ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
|
||||
if (!ret && write) {
|
||||
state = static_branch_unlikely(&sched_energy_present);
|
||||
@ -348,103 +420,33 @@ static void sched_energy_set(bool has_eas)
|
||||
* 1. an Energy Model (EM) is available;
|
||||
* 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy.
|
||||
* 3. no SMT is detected.
|
||||
* 4. the EM complexity is low enough to keep scheduling overheads low;
|
||||
* 5. schedutil is driving the frequency of all CPUs of the rd;
|
||||
* 6. frequency invariance support is present;
|
||||
*
|
||||
* The complexity of the Energy Model is defined as:
|
||||
*
|
||||
* C = nr_pd * (nr_cpus + nr_ps)
|
||||
*
|
||||
* with parameters defined as:
|
||||
* - nr_pd: the number of performance domains
|
||||
* - nr_cpus: the number of CPUs
|
||||
* - nr_ps: the sum of the number of performance states of all performance
|
||||
* domains (for example, on a system with 2 performance domains,
|
||||
* with 10 performance states each, nr_ps = 2 * 10 = 20).
|
||||
*
|
||||
* It is generally not a good idea to use such a model in the wake-up path on
|
||||
* very complex platforms because of the associated scheduling overheads. The
|
||||
* arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
|
||||
* with per-CPU DVFS and less than 8 performance states each, for example.
|
||||
* 4. schedutil is driving the frequency of all CPUs of the rd;
|
||||
* 5. frequency invariance support is present;
|
||||
*/
|
||||
#define EM_MAX_COMPLEXITY 2048
|
||||
|
||||
extern struct cpufreq_governor schedutil_gov;
|
||||
static bool build_perf_domains(const struct cpumask *cpu_map)
|
||||
{
|
||||
int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map);
|
||||
int i;
|
||||
struct perf_domain *pd = NULL, *tmp;
|
||||
int cpu = cpumask_first(cpu_map);
|
||||
struct root_domain *rd = cpu_rq(cpu)->rd;
|
||||
struct cpufreq_policy *policy;
|
||||
struct cpufreq_governor *gov;
|
||||
|
||||
if (!sysctl_sched_energy_aware)
|
||||
goto free;
|
||||
|
||||
/* EAS is enabled for asymmetric CPU capacity topologies. */
|
||||
if (!per_cpu(sd_asym_cpucapacity, cpu)) {
|
||||
if (sched_debug()) {
|
||||
pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
|
||||
cpumask_pr_args(cpu_map));
|
||||
}
|
||||
if (!sched_is_eas_possible(cpu_map))
|
||||
goto free;
|
||||
}
|
||||
|
||||
/* EAS definitely does *not* handle SMT */
|
||||
if (sched_smt_active()) {
|
||||
pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n",
|
||||
cpumask_pr_args(cpu_map));
|
||||
goto free;
|
||||
}
|
||||
|
||||
if (!arch_scale_freq_invariant()) {
|
||||
if (sched_debug()) {
|
||||
pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported",
|
||||
cpumask_pr_args(cpu_map));
|
||||
}
|
||||
goto free;
|
||||
}
|
||||
|
||||
for_each_cpu(i, cpu_map) {
|
||||
/* Skip already covered CPUs. */
|
||||
if (find_pd(pd, i))
|
||||
continue;
|
||||
|
||||
/* Do not attempt EAS if schedutil is not being used. */
|
||||
policy = cpufreq_cpu_get(i);
|
||||
if (!policy)
|
||||
goto free;
|
||||
gov = policy->governor;
|
||||
cpufreq_cpu_put(policy);
|
||||
if (gov != &schedutil_gov) {
|
||||
if (rd->pd)
|
||||
pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n",
|
||||
cpumask_pr_args(cpu_map));
|
||||
goto free;
|
||||
}
|
||||
|
||||
/* Create the new pd and add it to the local list. */
|
||||
tmp = pd_init(i);
|
||||
if (!tmp)
|
||||
goto free;
|
||||
tmp->next = pd;
|
||||
pd = tmp;
|
||||
|
||||
/*
|
||||
* Count performance domains and performance states for the
|
||||
* complexity check.
|
||||
*/
|
||||
nr_pd++;
|
||||
nr_ps += em_pd_nr_perf_states(pd->em_pd);
|
||||
}
|
||||
|
||||
/* Bail out if the Energy Model complexity is too high. */
|
||||
if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) {
|
||||
WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
|
||||
cpumask_pr_args(cpu_map));
|
||||
goto free;
|
||||
}
|
||||
|
||||
perf_domain_debug(cpu_map, pd);
|
||||
@ -666,11 +668,14 @@ static void destroy_sched_domains(struct sched_domain *sd)
|
||||
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
|
||||
DEFINE_PER_CPU(int, sd_llc_size);
|
||||
DEFINE_PER_CPU(int, sd_llc_id);
|
||||
DEFINE_PER_CPU(int, sd_share_id);
|
||||
DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
|
||||
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
|
||||
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
|
||||
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
|
||||
|
||||
DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
|
||||
DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
|
||||
|
||||
static void update_top_cache_domain(int cpu)
|
||||
{
|
||||
@@ -691,6 +696,17 @@ static void update_top_cache_domain(int cpu)
 	per_cpu(sd_llc_id, cpu) = id;
 	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
 
+	sd = lowest_flag_domain(cpu, SD_CLUSTER);
+	if (sd)
+		id = cpumask_first(sched_domain_span(sd));
+
+	/*
+	 * This assignment should be placed after the sd_llc_id as
+	 * we want this id equals to cluster id on cluster machines
+	 * but equals to LLC id on non-Cluster machines.
+	 */
+	per_cpu(sd_share_id, cpu) = id;
+
 	sd = lowest_flag_domain(cpu, SD_NUMA);
 	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
 
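The per-CPU sd_share_id initialized here gives wakeup code a cheap way to ask whether two CPUs share cluster-level (or, absent clusters, LLC-level) resources. The consumer is not part of this hunk; the sketch below shows the obvious query one would build on top of it, with the helper name chosen purely for illustration.

/*
 * Sketch only (assumption): a "do these CPUs share their lowest cache /
 * cluster?" query built on the sd_share_id set up above. The helper name
 * is illustrative; only sd_share_id comes from this patch.
 */
static inline bool example_cpus_share_cluster(int this_cpu, int that_cpu)
{
	if (this_cpu == that_cpu)
		return true;

	return per_cpu(sd_share_id, this_cpu) == per_cpu(sd_share_id, that_cpu);
}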
@@ -1117,7 +1133,7 @@ fail:
  *
  * - Simultaneous multithreading (SMT)
  * - Multi-Core Cache (MC)
- * - Package (DIE)
+ * - Package (PKG)
  *
  * Where the last one more or less denotes everything up to a NUMA node.
  *
@@ -1139,13 +1155,13 @@ fail:
  *
  * CPU   0 1 2 3 4 5 6 7
  *
- * DIE  [                             ]
+ * PKG  [                             ]
  * MC   [             ] [             ]
 * SMT  [     ] [     ] [     ] [     ]
  *
  *  - or -
  *
- * DIE  0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
+ * PKG  0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
 * MC   0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
 * SMT  0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
  *
@@ -1548,6 +1564,7 @@ static struct cpumask ***sched_domains_numa_masks;
  */
 #define TOPOLOGY_SD_FLAGS		\
 	(SD_SHARE_CPUCAPACITY	|	\
+	 SD_CLUSTER		|	\
 	 SD_SHARE_PKG_RESOURCES |	\
 	 SD_NUMA		|	\
 	 SD_ASYM_PACKING)
@@ -1679,7 +1696,7 @@ static struct sched_domain_topology_level default_topology[] = {
 #ifdef CONFIG_SCHED_MC
 	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
 #endif
-	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
+	{ cpu_cpu_mask, SD_INIT_NAME(PKG) },
 	{ NULL, },
 };
 
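The same DIE-to-PKG rename applies to architectures that install their own topology table with set_sched_topology(). The array below is a made-up example showing the shape of such a table after the rename; it is not taken from any architecture touched by this merge.

/*
 * Illustrative example only: an arch-style topology table registered via
 * set_sched_topology(), with the package level now named PKG instead of DIE.
 */
static struct sched_domain_topology_level example_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_CLUSTER
	{ cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) },
#endif
#ifdef CONFIG_SCHED_MC
	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
	{ cpu_cpu_mask, SD_INIT_NAME(PKG) },
	{ NULL, },
};

static void __init example_set_topology(void)
{
	set_sched_topology(example_topology);
}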
@@ -2112,22 +2129,31 @@ static int hop_cmp(const void *a, const void *b)
 	return -1;
 }
 
-/*
- * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth next cpu
- * closest to @cpu from @cpumask.
- * cpumask: cpumask to find a cpu from
- * cpu: Nth cpu to find
+/**
+ * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth closest CPU
+ *                             from @cpus to @cpu, taking into account distance
+ *                             from a given @node.
+ * @cpus: cpumask to find a cpu from
+ * @cpu: CPU to start searching
+ * @node: NUMA node to order CPUs by distance
  *
- * returns: cpu, or nr_cpu_ids when nothing found.
+ * Return: cpu, or nr_cpu_ids when nothing found.
  */
 int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
 {
-	struct __cmp_key k = { .cpus = cpus, .node = node, .cpu = cpu };
+	struct __cmp_key k = { .cpus = cpus, .cpu = cpu };
 	struct cpumask ***hop_masks;
 	int hop, ret = nr_cpu_ids;
 
+	if (node == NUMA_NO_NODE)
+		return cpumask_nth_and(cpu, cpus, cpu_online_mask);
+
 	rcu_read_lock();
 
+	/* CPU-less node entries are uninitialized in sched_domains_numa_masks */
+	node = numa_nearest_node(node, N_CPU);
+	k.node = node;
+
 	k.masks = rcu_dereference(sched_domains_numa_masks);
 	if (!k.masks)
 		goto unlock;
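Since NUMA_NO_NODE is now handled inside sched_numa_find_nth_cpu() and CPU-less nodes are remapped via numa_nearest_node(), callers can pass a device's node straight through. A usage sketch follows; the wrapper and its names are invented for illustration, and only sched_numa_find_nth_cpu() is the real interface.

/*
 * Usage sketch (assumption): pick the CPU for a device queue, preferring
 * CPUs close to the device's NUMA node. dev_node may be NUMA_NO_NODE.
 */
static int example_pick_queue_cpu(int dev_node, unsigned int qidx)
{
	int cpu = sched_numa_find_nth_cpu(cpu_online_mask,
					  qidx % num_online_cpus(), dev_node);

	return cpu < nr_cpu_ids ? cpu : cpumask_first(cpu_online_mask);
}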
@@ -2362,6 +2388,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
 	struct rq *rq = NULL;
 	int i, ret = -ENOMEM;
 	bool has_asym = false;
+	bool has_cluster = false;
 
 	if (WARN_ON(cpumask_empty(cpu_map)))
 		goto error;
@@ -2479,20 +2506,29 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
 	/* Attach the domains */
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map) {
+		unsigned long capacity;
+
 		rq = cpu_rq(i);
 		sd = *per_cpu_ptr(d.sd, i);
 
+		capacity = arch_scale_cpu_capacity(i);
 		/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
-		if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
-			WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
+		if (capacity > READ_ONCE(d.rd->max_cpu_capacity))
+			WRITE_ONCE(d.rd->max_cpu_capacity, capacity);
 
 		cpu_attach_domain(sd, d.rd, i);
+
+		if (lowest_flag_domain(i, SD_CLUSTER))
+			has_cluster = true;
 	}
 	rcu_read_unlock();
 
 	if (has_asym)
 		static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
 
+	if (has_cluster)
+		static_branch_inc_cpuslocked(&sched_cluster_active);
+
 	if (rq && sched_debug_verbose) {
 		pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
 			cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
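The sched_cluster_active static key raised here lets hot paths skip cluster-aware scanning entirely when no SD_CLUSTER level exists. A sketch of that gating pattern follows; the helper and its body are invented, while the real consumer is the wakeup path in fair.c.

/*
 * Sketch only (assumption): gate cluster-aware idle scanning on the static
 * key so machines without a cluster level pay nothing extra.
 */
static int example_scan_cluster_for_idle(int target, const struct cpumask *allowed)
{
	int cpu;

	if (!static_branch_unlikely(&sched_cluster_active))
		return target;

	/* Prefer an idle CPU that shares the target's cluster. */
	for_each_cpu_and(cpu, cpu_clustergroup_mask(target), allowed) {
		if (idle_cpu(cpu))
			return cpu;
	}

	return target;
}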
@@ -2592,6 +2628,9 @@ static void detach_destroy_domains(const struct cpumask *cpu_map)
 	if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
 		static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
 
+	if (static_branch_unlikely(&sched_cluster_active))
+		static_branch_dec_cpuslocked(&sched_cluster_active);
+
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map)
 		cpu_attach_domain(NULL, &def_root_domain, i);
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -146,9 +146,7 @@ unsigned int cpumask_local_spread(unsigned int i, int node)
 	/* Wrap: we always want a cpu. */
 	i %= num_online_cpus();
 
-	cpu = (node == NUMA_NO_NODE) ?
-		cpumask_nth(i, cpu_online_mask) :
-		sched_numa_find_nth_cpu(cpu_online_mask, i, node);
+	cpu = sched_numa_find_nth_cpu(cpu_online_mask, i, node);
 
 	WARN_ON(cpu >= nr_cpu_ids);
 	return cpu;
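cpumask_local_spread() now routes every request through sched_numa_find_nth_cpu(), which copes with NUMA_NO_NODE itself. Below is a usage sketch of the classic pattern this helper serves, spreading queue interrupts over node-local CPUs; the surrounding driver code is invented.

/*
 * Usage sketch (assumption): spread per-queue IRQ affinity hints over CPUs
 * near the device's node. Only cpumask_local_spread() and
 * irq_set_affinity_hint() are real APIs here.
 */
static void example_spread_queue_irqs(const int *irqs, unsigned int nr_queues,
				      int dev_node)
{
	unsigned int q;

	for (q = 0; q < nr_queues; q++) {
		unsigned int cpu = cpumask_local_spread(q, dev_node);

		irq_set_affinity_hint(irqs[q], cpumask_of(cpu));
	}
}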
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -131,22 +131,26 @@ static struct mempolicy default_policy = {
 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 
 /**
- * numa_map_to_online_node - Find closest online node
+ * numa_nearest_node - Find nearest node by state
  * @node: Node id to start the search
+ * @state: State to filter the search
  *
- * Lookup the next closest node by distance if @nid is not online.
+ * Lookup the closest node by distance if @nid is not in state.
  *
- * Return: this @node if it is online, otherwise the closest node by distance
+ * Return: this @node if it is in state, otherwise the closest node by distance
  */
-int numa_map_to_online_node(int node)
+int numa_nearest_node(int node, unsigned int state)
 {
 	int min_dist = INT_MAX, dist, n, min_node;
 
-	if (node == NUMA_NO_NODE || node_online(node))
+	if (state >= NR_NODE_STATES)
+		return -EINVAL;
+
+	if (node == NUMA_NO_NODE || node_state(node, state))
 		return node;
 
 	min_node = node;
-	for_each_online_node(n) {
+	for_each_node_state(n, state) {
 		dist = node_distance(node, n);
 		if (dist < min_dist) {
 			min_dist = dist;
@@ -156,7 +160,7 @@ int numa_map_to_online_node(int node)
 
 	return min_node;
 }
-EXPORT_SYMBOL_GPL(numa_map_to_online_node);
+EXPORT_SYMBOL_GPL(numa_nearest_node);
 
 struct mempolicy *get_task_policy(struct task_struct *p)
 {
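With the extra state argument, callers can ask for the nearest node that has CPUs (N_CPU), memory (N_MEMORY), or any other node state, instead of merely the nearest online node. A usage sketch follows; the wrapper is invented, and only numa_nearest_node() is the real interface.

/*
 * Usage sketch (assumption): fall back to the nearest node that actually
 * has memory before performing a node-local allocation.
 */
static void *example_alloc_near(int nid, size_t size)
{
	int target = numa_nearest_node(nid, N_MEMORY);

	return kmalloc_node(size, GFP_KERNEL, target);
}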