mirror of
https://mirrors.bfsu.edu.cn/git/linux.git
synced 2024-11-27 14:14:24 +08:00
The changes in this cycle are:
- Optimize the task wakeup CPU selection logic, to improve scalability and reduce wakeup latency spikes - PELT enhancements - CFS bandwidth handling fixes - Optimize the wakeup path by remove rq->wake_list and replacing it with ->ttwu_pending - Optimize IPI cross-calls by making flush_smp_call_function_queue() process sync callbacks first. - Misc fixes and enhancements. Signed-off-by: Ingo Molnar <mingo@kernel.org> -----BEGIN PGP SIGNATURE----- iQJFBAABCgAvFiEEBpT5eoXrXCwVQwEKEnMQ0APhK1gFAl7WPL0RHG1pbmdvQGtl cm5lbC5vcmcACgkQEnMQ0APhK1i0ThAAs0fbvMzNJ5SWFdwOQ4KZIlA+Im4dEBMK sx/XAZqa/hGxvkm1jS0RDVQl1V1JdOlru5UF4C42ctnAFGtBBHDriO5rn9oCpkSw DAoLc4eZqzldIXN6sDZ0xMtC14Eu15UAP40OyM4qxBc4GqGlOnnale6Vhn+n+pLQ jAuZlMJIkmmzeA6cuvtultevrVh+QUqJ/5oNUANlTER4OM48umjr5rNTOb8cIW53 9K3vbS3nmqSvJuIyqfRFoMy5GFM6+Jj2+nYuq8aTuYLEtF4qqWzttS3wBzC9699g XYRKILkCK8ZP4RB5Ps/DIKj6maZGZoICBxTJEkIgXujJlxlKKTD3mddk+0LBXChW Ijznanxn67akoAFpqi/Dnkhieg7cUrE9v1OPRS2J0xy550synSPFcSgOK3viizga iqbjptY4scUWkCwHQNjABerxc7MWzrwbIrRt+uNvCaqJLweUh0GnEcV5va8R+4I8 K20XwOdrzuPLo5KdDWA/BKOEv49guHZDvoykzlwMlR3gFfwHS/UsjzmSQIWK3gZG 9OMn8ibO2f1OzhRcEpDLFzp7IIj6NJmPFVSW+7xHyL9/vTveUx3ZXPLteb2qxJVP BYPsduVx8YeGRBlLya0PJriB23ajQr0lnHWo15g0uR9o/0Ds1ephcymiF3QJmCaA To3CyIuQN8M= =C2OP -----END PGP SIGNATURE----- Merge tag 'sched-core-2020-06-02' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip Pull scheduler updates from Ingo Molnar: "The changes in this cycle are: - Optimize the task wakeup CPU selection logic, to improve scalability and reduce wakeup latency spikes - PELT enhancements - CFS bandwidth handling fixes - Optimize the wakeup path by remove rq->wake_list and replacing it with ->ttwu_pending - Optimize IPI cross-calls by making flush_smp_call_function_queue() process sync callbacks first. 
- Misc fixes and enhancements" * tag 'sched-core-2020-06-02' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (36 commits) irq_work: Define irq_work_single() on !CONFIG_IRQ_WORK too sched/headers: Split out open-coded prototypes into kernel/sched/smp.h sched: Replace rq::wake_list sched: Add rq::ttwu_pending irq_work, smp: Allow irq_work on call_single_queue smp: Optimize send_call_function_single_ipi() smp: Move irq_work_run() out of flush_smp_call_function_queue() smp: Optimize flush_smp_call_function_queue() sched: Fix smp_call_function_single_async() usage for ILB sched/core: Offload wakee task activation if it the wakee is descheduling sched/core: Optimize ttwu() spinning on p->on_cpu sched: Defend cfs and rt bandwidth quota against overflow sched/cpuacct: Fix charge cpuacct.usage_sys sched/fair: Replace zero-length array with flexible-array sched/pelt: Sync util/runnable_sum with PELT window when propagating sched/cpuacct: Use __this_cpu_add() instead of this_cpu_ptr() sched/fair: Optimize enqueue_task_fair() sched: Make scheduler_ipi inline sched: Clean up scheduler_ipi() sched/core: Simplify sched_init() ...
This commit is contained in:
commit
d479c5a191
@ -167,7 +167,6 @@ static void pnv_smp_cpu_kill_self(void)
|
||||
/* Standard hot unplug procedure */
|
||||
|
||||
idle_task_exit();
|
||||
current->active_mm = NULL; /* for sanity */
|
||||
cpu = smp_processor_id();
|
||||
DBG("CPU%d offline\n", cpu);
|
||||
generic_set_cpu_dead(cpu);
|
||||
|
@ -13,6 +13,8 @@
|
||||
* busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
|
||||
*/
|
||||
|
||||
/* flags share CSD_FLAG_ space */
|
||||
|
||||
#define IRQ_WORK_PENDING BIT(0)
|
||||
#define IRQ_WORK_BUSY BIT(1)
|
||||
|
||||
@ -23,9 +25,12 @@
|
||||
|
||||
#define IRQ_WORK_CLAIMED (IRQ_WORK_PENDING | IRQ_WORK_BUSY)
|
||||
|
||||
/*
|
||||
* structure shares layout with single_call_data_t.
|
||||
*/
|
||||
struct irq_work {
|
||||
atomic_t flags;
|
||||
struct llist_node llnode;
|
||||
atomic_t flags;
|
||||
void (*func)(struct irq_work *);
|
||||
};
|
||||
|
||||
@ -53,9 +58,11 @@ void irq_work_sync(struct irq_work *work);
|
||||
|
||||
void irq_work_run(void);
|
||||
bool irq_work_needs_cpu(void);
|
||||
void irq_work_single(void *arg);
|
||||
#else
|
||||
static inline bool irq_work_needs_cpu(void) { return false; }
|
||||
static inline void irq_work_run(void) { }
|
||||
static inline void irq_work_single(void *arg) { }
|
||||
#endif
|
||||
|
||||
#endif /* _LINUX_IRQ_WORK_H */
|
||||
|
@ -654,6 +654,7 @@ struct task_struct {
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
struct llist_node wake_entry;
|
||||
unsigned int wake_entry_type;
|
||||
int on_cpu;
|
||||
#ifdef CONFIG_THREAD_INFO_IN_TASK
|
||||
/* Current CPU: */
|
||||
@ -1730,7 +1731,15 @@ extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk);
|
||||
})
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
void scheduler_ipi(void);
|
||||
static __always_inline void scheduler_ipi(void)
|
||||
{
|
||||
/*
|
||||
* Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
|
||||
* TIF_NEED_RESCHED remotely (for the first time) will also send
|
||||
* this IPI.
|
||||
*/
|
||||
preempt_fold_need_resched();
|
||||
}
|
||||
extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
|
||||
#else
|
||||
static inline void scheduler_ipi(void) { }
|
||||
|
@ -49,6 +49,8 @@ static inline void mmdrop(struct mm_struct *mm)
|
||||
__mmdrop(mm);
|
||||
}
|
||||
|
||||
void mmdrop(struct mm_struct *mm);
|
||||
|
||||
/*
|
||||
* This has to be called after a get_task_mm()/mmget_not_zero()
|
||||
* followed by taking the mmap_sem for writing before modifying the
|
||||
|
@ -11,21 +11,20 @@
|
||||
*/
|
||||
#ifdef CONFIG_SMP
|
||||
|
||||
#define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */
|
||||
#define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */
|
||||
#define SD_BALANCE_EXEC 0x0004 /* Balance on exec */
|
||||
#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
|
||||
#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */
|
||||
#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
|
||||
#define SD_ASYM_CPUCAPACITY 0x0040 /* Domain members have different CPU capacities */
|
||||
#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share CPU capacity */
|
||||
#define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */
|
||||
#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share CPU pkg resources */
|
||||
#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
|
||||
#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */
|
||||
#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
|
||||
#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */
|
||||
#define SD_NUMA 0x4000 /* cross-node balancing */
|
||||
#define SD_BALANCE_NEWIDLE 0x0001 /* Balance when about to become idle */
|
||||
#define SD_BALANCE_EXEC 0x0002 /* Balance on exec */
|
||||
#define SD_BALANCE_FORK 0x0004 /* Balance on fork, clone */
|
||||
#define SD_BALANCE_WAKE 0x0008 /* Balance on wakeup */
|
||||
#define SD_WAKE_AFFINE 0x0010 /* Wake task to waking CPU */
|
||||
#define SD_ASYM_CPUCAPACITY 0x0020 /* Domain members have different CPU capacities */
|
||||
#define SD_SHARE_CPUCAPACITY 0x0040 /* Domain members share CPU capacity */
|
||||
#define SD_SHARE_POWERDOMAIN 0x0080 /* Domain members share power domain */
|
||||
#define SD_SHARE_PKG_RESOURCES 0x0100 /* Domain members share CPU pkg resources */
|
||||
#define SD_SERIALIZE 0x0200 /* Only a single load balancing instance */
|
||||
#define SD_ASYM_PACKING 0x0400 /* Place busy groups earlier in the domain */
|
||||
#define SD_PREFER_SIBLING 0x0800 /* Prefer to place tasks in a sibling domain */
|
||||
#define SD_OVERLAP 0x1000 /* sched_domains of this level overlap */
|
||||
#define SD_NUMA 0x2000 /* cross-node balancing */
|
||||
|
||||
#ifdef CONFIG_SCHED_SMT
|
||||
static inline int cpu_smt_flags(void)
|
||||
|
@ -16,17 +16,39 @@
|
||||
|
||||
typedef void (*smp_call_func_t)(void *info);
|
||||
typedef bool (*smp_cond_func_t)(int cpu, void *info);
|
||||
|
||||
enum {
|
||||
CSD_FLAG_LOCK = 0x01,
|
||||
|
||||
/* IRQ_WORK_flags */
|
||||
|
||||
CSD_TYPE_ASYNC = 0x00,
|
||||
CSD_TYPE_SYNC = 0x10,
|
||||
CSD_TYPE_IRQ_WORK = 0x20,
|
||||
CSD_TYPE_TTWU = 0x30,
|
||||
CSD_FLAG_TYPE_MASK = 0xF0,
|
||||
};
|
||||
|
||||
/*
|
||||
* structure shares (partial) layout with struct irq_work
|
||||
*/
|
||||
struct __call_single_data {
|
||||
struct llist_node llist;
|
||||
unsigned int flags;
|
||||
smp_call_func_t func;
|
||||
void *info;
|
||||
unsigned int flags;
|
||||
};
|
||||
|
||||
/* Use __aligned() to avoid using 2 cache lines for 1 csd */
|
||||
typedef struct __call_single_data call_single_data_t
|
||||
__aligned(sizeof(struct __call_single_data));
|
||||
|
||||
/*
|
||||
* Enqueue a llist_node on the call_single_queue; be very careful, read
|
||||
* flush_smp_call_function_queue() in detail.
|
||||
*/
|
||||
extern void __smp_call_single_queue(int cpu, struct llist_node *node);
|
||||
|
||||
/* total number of cpus in this system (may exceed NR_CPUS) */
|
||||
extern unsigned int total_cpus;
|
||||
|
||||
|
@ -9,23 +9,10 @@
|
||||
#include <asm/current.h>
|
||||
|
||||
/*
|
||||
* BROKEN wait-queues.
|
||||
*
|
||||
* These "simple" wait-queues are broken garbage, and should never be
|
||||
* used. The comments below claim that they are "similar" to regular
|
||||
* wait-queues, but the semantics are actually completely different, and
|
||||
* every single user we have ever had has been buggy (or pointless).
|
||||
*
|
||||
* A "swake_up_one()" only wakes up _one_ waiter, which is not at all what
|
||||
* "wake_up()" does, and has led to problems. In other cases, it has
|
||||
* been fine, because there's only ever one waiter (kvm), but in that
|
||||
* case the whole "simple" wait-queue is just pointless to begin with,
|
||||
* since there is no "queue". Use "wake_up_process()" with a direct
|
||||
* pointer instead.
|
||||
*
|
||||
* While these are very similar to regular wait queues (wait.h) the most
|
||||
* important difference is that the simple waitqueue allows for deterministic
|
||||
* behaviour -- IOW it has strictly bounded IRQ and lock hold times.
|
||||
* Simple waitqueues are semantically very different to regular wait queues
|
||||
* (wait.h). The most important difference is that the simple waitqueue allows
|
||||
* for deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold
|
||||
* times.
|
||||
*
|
||||
* Mainly, this is accomplished by two things. Firstly not allowing swake_up_all
|
||||
* from IRQ disabled, and dropping the lock upon every wakeup, giving a higher
|
||||
@ -39,7 +26,7 @@
|
||||
* sleeper state.
|
||||
*
|
||||
* - the !exclusive mode; because that leads to O(n) wakeups, everything is
|
||||
* exclusive.
|
||||
* exclusive. As such swake_up_one will only ever awake _one_ waiter.
|
||||
*
|
||||
* - custom wake callback functions; because you cannot give any guarantees
|
||||
* about random code. This also allows swait to be used in RT, such that
|
||||
|
18
kernel/cpu.c
18
kernel/cpu.c
@ -3,6 +3,7 @@
|
||||
*
|
||||
* This code is licenced under the GPL.
|
||||
*/
|
||||
#include <linux/sched/mm.h>
|
||||
#include <linux/proc_fs.h>
|
||||
#include <linux/smp.h>
|
||||
#include <linux/init.h>
|
||||
@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu)
|
||||
return bringup_wait_for_ap(cpu);
|
||||
}
|
||||
|
||||
static int finish_cpu(unsigned int cpu)
|
||||
{
|
||||
struct task_struct *idle = idle_thread_get(cpu);
|
||||
struct mm_struct *mm = idle->active_mm;
|
||||
|
||||
/*
|
||||
* idle_task_exit() will have switched to &init_mm, now
|
||||
* clean up any remaining active_mm state.
|
||||
*/
|
||||
if (mm != &init_mm)
|
||||
idle->active_mm = &init_mm;
|
||||
mmdrop(mm);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Hotplug state machine related functions
|
||||
*/
|
||||
@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
|
||||
[CPUHP_BRINGUP_CPU] = {
|
||||
.name = "cpu:bringup",
|
||||
.startup.single = bringup_cpu,
|
||||
.teardown.single = NULL,
|
||||
.teardown.single = finish_cpu,
|
||||
.cant_stop = true,
|
||||
},
|
||||
/* Final state before CPU kills itself */
|
||||
|
@ -708,8 +708,12 @@ void __noreturn do_exit(long code)
|
||||
struct task_struct *tsk = current;
|
||||
int group_dead;
|
||||
|
||||
profile_task_exit(tsk);
|
||||
kcov_task_exit(tsk);
|
||||
/*
|
||||
* We can get here from a kernel oops, sometimes with preemption off.
|
||||
* Start by checking for critical errors.
|
||||
* Then fix up important state like USER_DS and preemption.
|
||||
* Then do everything else.
|
||||
*/
|
||||
|
||||
WARN_ON(blk_needs_flush_plug(tsk));
|
||||
|
||||
@ -727,6 +731,16 @@ void __noreturn do_exit(long code)
|
||||
*/
|
||||
set_fs(USER_DS);
|
||||
|
||||
if (unlikely(in_atomic())) {
|
||||
pr_info("note: %s[%d] exited with preempt_count %d\n",
|
||||
current->comm, task_pid_nr(current),
|
||||
preempt_count());
|
||||
preempt_count_set(PREEMPT_ENABLED);
|
||||
}
|
||||
|
||||
profile_task_exit(tsk);
|
||||
kcov_task_exit(tsk);
|
||||
|
||||
ptrace_event(PTRACE_EVENT_EXIT, code);
|
||||
|
||||
validate_creds_for_do_exit(tsk);
|
||||
@ -744,13 +758,6 @@ void __noreturn do_exit(long code)
|
||||
|
||||
exit_signals(tsk); /* sets PF_EXITING */
|
||||
|
||||
if (unlikely(in_atomic())) {
|
||||
pr_info("note: %s[%d] exited with preempt_count %d\n",
|
||||
current->comm, task_pid_nr(current),
|
||||
preempt_count());
|
||||
preempt_count_set(PREEMPT_ENABLED);
|
||||
}
|
||||
|
||||
/* sync mm's RSS info before statistics gathering */
|
||||
if (tsk->mm)
|
||||
sync_mm_rss(tsk->mm);
|
||||
|
@ -31,7 +31,7 @@ static bool irq_work_claim(struct irq_work *work)
|
||||
{
|
||||
int oflags;
|
||||
|
||||
oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags);
|
||||
oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->flags);
|
||||
/*
|
||||
* If the work is already pending, no need to raise the IPI.
|
||||
* The pairing atomic_fetch_andnot() in irq_work_run() makes sure
|
||||
@ -102,8 +102,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
|
||||
if (cpu != smp_processor_id()) {
|
||||
/* Arch remote IPI send/receive backend aren't NMI safe */
|
||||
WARN_ON_ONCE(in_nmi());
|
||||
if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
|
||||
arch_send_call_function_single_ipi(cpu);
|
||||
__smp_call_single_queue(cpu, &work->llnode);
|
||||
} else {
|
||||
__irq_work_queue_local(work);
|
||||
}
|
||||
@ -131,19 +130,11 @@ bool irq_work_needs_cpu(void)
|
||||
return true;
|
||||
}
|
||||
|
||||
static void irq_work_run_list(struct llist_head *list)
|
||||
void irq_work_single(void *arg)
|
||||
{
|
||||
struct irq_work *work, *tmp;
|
||||
struct llist_node *llnode;
|
||||
|
||||
BUG_ON(!irqs_disabled());
|
||||
|
||||
if (llist_empty(list))
|
||||
return;
|
||||
|
||||
llnode = llist_del_all(list);
|
||||
llist_for_each_entry_safe(work, tmp, llnode, llnode) {
|
||||
struct irq_work *work = arg;
|
||||
int flags;
|
||||
|
||||
/*
|
||||
* Clear the PENDING bit, after this point the @work
|
||||
* can be re-used.
|
||||
@ -163,6 +154,20 @@ static void irq_work_run_list(struct llist_head *list)
|
||||
flags &= ~IRQ_WORK_PENDING;
|
||||
(void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
|
||||
}
|
||||
|
||||
static void irq_work_run_list(struct llist_head *list)
|
||||
{
|
||||
struct irq_work *work, *tmp;
|
||||
struct llist_node *llnode;
|
||||
|
||||
BUG_ON(!irqs_disabled());
|
||||
|
||||
if (llist_empty(list))
|
||||
return;
|
||||
|
||||
llnode = llist_del_all(list);
|
||||
llist_for_each_entry_safe(work, tmp, llnode, llnode)
|
||||
irq_work_single(work);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -21,6 +21,7 @@
|
||||
#include "../smpboot.h"
|
||||
|
||||
#include "pelt.h"
|
||||
#include "smp.h"
|
||||
|
||||
#define CREATE_TRACE_POINTS
|
||||
#include <trace/events/sched.h>
|
||||
@ -220,6 +221,13 @@ void update_rq_clock(struct rq *rq)
|
||||
update_rq_clock_task(rq, delta);
|
||||
}
|
||||
|
||||
static inline void
|
||||
rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func)
|
||||
{
|
||||
csd->flags = 0;
|
||||
csd->func = func;
|
||||
csd->info = rq;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SCHED_HRTICK
|
||||
/*
|
||||
@ -315,16 +323,14 @@ void hrtick_start(struct rq *rq, u64 delay)
|
||||
hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
|
||||
HRTIMER_MODE_REL_PINNED_HARD);
|
||||
}
|
||||
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
static void hrtick_rq_init(struct rq *rq)
|
||||
{
|
||||
#ifdef CONFIG_SMP
|
||||
rq->hrtick_csd.flags = 0;
|
||||
rq->hrtick_csd.func = __hrtick_start;
|
||||
rq->hrtick_csd.info = rq;
|
||||
rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
|
||||
#endif
|
||||
|
||||
hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
|
||||
rq->hrtick_timer.function = hrtick;
|
||||
}
|
||||
@ -633,29 +639,23 @@ void wake_up_nohz_cpu(int cpu)
|
||||
wake_up_idle_cpu(cpu);
|
||||
}
|
||||
|
||||
static inline bool got_nohz_idle_kick(void)
|
||||
static void nohz_csd_func(void *info)
|
||||
{
|
||||
int cpu = smp_processor_id();
|
||||
|
||||
if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
|
||||
return false;
|
||||
|
||||
if (idle_cpu(cpu) && !need_resched())
|
||||
return true;
|
||||
struct rq *rq = info;
|
||||
int cpu = cpu_of(rq);
|
||||
unsigned int flags;
|
||||
|
||||
/*
|
||||
* We can't run Idle Load Balance on this CPU for this time so we
|
||||
* cancel it and clear NOHZ_BALANCE_KICK
|
||||
* Release the rq::nohz_csd.
|
||||
*/
|
||||
atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
|
||||
return false;
|
||||
flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
|
||||
WARN_ON(!(flags & NOHZ_KICK_MASK));
|
||||
|
||||
rq->idle_balance = idle_cpu(cpu);
|
||||
if (rq->idle_balance && !need_resched()) {
|
||||
rq->nohz_idle_balance = flags;
|
||||
raise_softirq_irqoff(SCHED_SOFTIRQ);
|
||||
}
|
||||
|
||||
#else /* CONFIG_NO_HZ_COMMON */
|
||||
|
||||
static inline bool got_nohz_idle_kick(void)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
#endif /* CONFIG_NO_HZ_COMMON */
|
||||
@ -1540,7 +1540,7 @@ static int migration_cpu_stop(void *data)
|
||||
* __migrate_task() such that we will not miss enforcing cpus_ptr
|
||||
* during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
|
||||
*/
|
||||
sched_ttwu_pending();
|
||||
flush_smp_call_function_from_idle();
|
||||
|
||||
raw_spin_lock(&p->pi_lock);
|
||||
rq_lock(rq, &rf);
|
||||
@ -2274,16 +2274,23 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
void sched_ttwu_pending(void)
|
||||
void sched_ttwu_pending(void *arg)
|
||||
{
|
||||
struct llist_node *llist = arg;
|
||||
struct rq *rq = this_rq();
|
||||
struct llist_node *llist = llist_del_all(&rq->wake_list);
|
||||
struct task_struct *p, *t;
|
||||
struct rq_flags rf;
|
||||
|
||||
if (!llist)
|
||||
return;
|
||||
|
||||
/*
|
||||
* rq::ttwu_pending is a racy indication of outstanding wakeups.
|
||||
* Races such that false-negatives are possible, since they
|
||||
* are shorter lived than false-positives would be.
|
||||
*/
|
||||
WRITE_ONCE(rq->ttwu_pending, 0);
|
||||
|
||||
rq_lock_irqsave(rq, &rf);
|
||||
update_rq_clock(rq);
|
||||
|
||||
@ -2293,56 +2300,30 @@ void sched_ttwu_pending(void)
|
||||
rq_unlock_irqrestore(rq, &rf);
|
||||
}
|
||||
|
||||
void scheduler_ipi(void)
|
||||
void send_call_function_single_ipi(int cpu)
|
||||
{
|
||||
/*
|
||||
* Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
|
||||
* TIF_NEED_RESCHED remotely (for the first time) will also send
|
||||
* this IPI.
|
||||
*/
|
||||
preempt_fold_need_resched();
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
|
||||
if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
|
||||
return;
|
||||
|
||||
/*
|
||||
* Not all reschedule IPI handlers call irq_enter/irq_exit, since
|
||||
* traditionally all their work was done from the interrupt return
|
||||
* path. Now that we actually do some work, we need to make sure
|
||||
* we do call them.
|
||||
*
|
||||
* Some archs already do call them, luckily irq_enter/exit nest
|
||||
* properly.
|
||||
*
|
||||
* Arguably we should visit all archs and update all handlers,
|
||||
* however a fair share of IPIs are still resched only so this would
|
||||
* somewhat pessimize the simple resched case.
|
||||
*/
|
||||
irq_enter();
|
||||
sched_ttwu_pending();
|
||||
|
||||
/*
|
||||
* Check if someone kicked us for doing the nohz idle load balance.
|
||||
*/
|
||||
if (unlikely(got_nohz_idle_kick())) {
|
||||
this_rq()->idle_balance = 1;
|
||||
raise_softirq_irqoff(SCHED_SOFTIRQ);
|
||||
}
|
||||
irq_exit();
|
||||
if (!set_nr_if_polling(rq->idle))
|
||||
arch_send_call_function_single_ipi(cpu);
|
||||
else
|
||||
trace_sched_wake_idle_without_ipi(cpu);
|
||||
}
|
||||
|
||||
static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
|
||||
/*
|
||||
* Queue a task on the target CPU's wake_list and wake the CPU via IPI if
|
||||
* necessary. The wakee CPU on receipt of the IPI will queue the task
|
||||
* via sched_ttwu_wakeup() for activation so the wakee incurs the cost
|
||||
* of the wakeup instead of the waker.
|
||||
*/
|
||||
static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
|
||||
{
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
|
||||
p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
|
||||
|
||||
if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
|
||||
if (!set_nr_if_polling(rq->idle))
|
||||
smp_send_reschedule(cpu);
|
||||
else
|
||||
trace_sched_wake_idle_without_ipi(cpu);
|
||||
}
|
||||
WRITE_ONCE(rq->ttwu_pending, 1);
|
||||
__smp_call_single_queue(cpu, &p->wake_entry);
|
||||
}
|
||||
|
||||
void wake_up_if_idle(int cpu)
|
||||
@ -2373,6 +2354,38 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
|
||||
{
|
||||
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
|
||||
}
|
||||
|
||||
static inline bool ttwu_queue_cond(int cpu, int wake_flags)
|
||||
{
|
||||
/*
|
||||
* If the CPU does not share cache, then queue the task on the
|
||||
* remote rq's wakelist to avoid accessing remote data.
|
||||
*/
|
||||
if (!cpus_share_cache(smp_processor_id(), cpu))
|
||||
return true;
|
||||
|
||||
/*
|
||||
* If the task is descheduling and the only running task on the
|
||||
* CPU then use the wakelist to offload the task activation to
|
||||
* the soon-to-be-idle CPU as the current CPU is likely busy.
|
||||
* nr_running is checked to avoid unnecessary task stacking.
|
||||
*/
|
||||
if ((wake_flags & WF_ON_RQ) && cpu_rq(cpu)->nr_running <= 1)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
|
||||
{
|
||||
if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
|
||||
sched_clock_cpu(cpu); /* Sync clocks across CPUs */
|
||||
__ttwu_queue_wakelist(p, cpu, wake_flags);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
|
||||
@ -2381,11 +2394,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
|
||||
struct rq_flags rf;
|
||||
|
||||
#if defined(CONFIG_SMP)
|
||||
if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
|
||||
sched_clock_cpu(cpu); /* Sync clocks across CPUs */
|
||||
ttwu_queue_remote(p, cpu, wake_flags);
|
||||
if (ttwu_queue_wakelist(p, cpu, wake_flags))
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
rq_lock(rq, &rf);
|
||||
@ -2569,7 +2579,15 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
|
||||
if (p->on_rq && ttwu_remote(p, wake_flags))
|
||||
goto unlock;
|
||||
|
||||
if (p->in_iowait) {
|
||||
delayacct_blkio_end(p);
|
||||
atomic_dec(&task_rq(p)->nr_iowait);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
p->sched_contributes_to_load = !!task_contributes_to_load(p);
|
||||
p->state = TASK_WAKING;
|
||||
|
||||
/*
|
||||
* Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
|
||||
* possible to, falsely, observe p->on_cpu == 0.
|
||||
@ -2591,6 +2609,16 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
|
||||
*/
|
||||
smp_rmb();
|
||||
|
||||
/*
|
||||
* If the owning (remote) CPU is still in the middle of schedule() with
|
||||
* this task as prev, consider queueing p on the remote CPU's wake_list
|
||||
* which potentially sends an IPI instead of spinning on p->on_cpu to
|
||||
* let the waker make forward progress. This is safe because IRQs are
|
||||
* disabled and the IPI will deliver after on_cpu is cleared.
|
||||
*/
|
||||
if (READ_ONCE(p->on_cpu) && ttwu_queue_wakelist(p, cpu, wake_flags | WF_ON_RQ))
|
||||
goto unlock;
|
||||
|
||||
/*
|
||||
* If the owning (remote) CPU is still in the middle of schedule() with
|
||||
* this task as prev, wait until its done referencing the task.
|
||||
@ -2602,28 +2630,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
|
||||
*/
|
||||
smp_cond_load_acquire(&p->on_cpu, !VAL);
|
||||
|
||||
p->sched_contributes_to_load = !!task_contributes_to_load(p);
|
||||
p->state = TASK_WAKING;
|
||||
|
||||
if (p->in_iowait) {
|
||||
delayacct_blkio_end(p);
|
||||
atomic_dec(&task_rq(p)->nr_iowait);
|
||||
}
|
||||
|
||||
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
|
||||
if (task_cpu(p) != cpu) {
|
||||
wake_flags |= WF_MIGRATED;
|
||||
psi_ttwu_dequeue(p);
|
||||
set_task_cpu(p, cpu);
|
||||
}
|
||||
|
||||
#else /* CONFIG_SMP */
|
||||
|
||||
if (p->in_iowait) {
|
||||
delayacct_blkio_end(p);
|
||||
atomic_dec(&task_rq(p)->nr_iowait);
|
||||
}
|
||||
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
ttwu_queue(p, cpu, wake_flags);
|
||||
@ -2751,6 +2763,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
|
||||
p->capture_control = NULL;
|
||||
#endif
|
||||
init_numa_balancing(clone_flags, p);
|
||||
#ifdef CONFIG_SMP
|
||||
p->wake_entry_type = CSD_TYPE_TTWU;
|
||||
#endif
|
||||
}
|
||||
|
||||
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
|
||||
@ -3951,6 +3966,28 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
|
||||
schedstat_inc(this_rq()->sched_count);
|
||||
}
|
||||
|
||||
static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
|
||||
struct rq_flags *rf)
|
||||
{
|
||||
#ifdef CONFIG_SMP
|
||||
const struct sched_class *class;
|
||||
/*
|
||||
* We must do the balancing pass before put_prev_task(), such
|
||||
* that when we release the rq->lock the task is in the same
|
||||
* state as before we took rq->lock.
|
||||
*
|
||||
* We can terminate the balance pass as soon as we know there is
|
||||
* a runnable task of @class priority or higher.
|
||||
*/
|
||||
for_class_range(class, prev->sched_class, &idle_sched_class) {
|
||||
if (class->balance(rq, prev, rf))
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
|
||||
put_prev_task(rq, prev);
|
||||
}
|
||||
|
||||
/*
|
||||
* Pick up the highest-prio task:
|
||||
*/
|
||||
@ -3984,22 +4021,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
|
||||
}
|
||||
|
||||
restart:
|
||||
#ifdef CONFIG_SMP
|
||||
/*
|
||||
* We must do the balancing pass before put_next_task(), such
|
||||
* that when we release the rq->lock the task is in the same
|
||||
* state as before we took rq->lock.
|
||||
*
|
||||
* We can terminate the balance pass as soon as we know there is
|
||||
* a runnable task of @class priority or higher.
|
||||
*/
|
||||
for_class_range(class, prev->sched_class, &idle_sched_class) {
|
||||
if (class->balance(rq, prev, rf))
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
|
||||
put_prev_task(rq, prev);
|
||||
put_prev_task_balance(rq, prev, rf);
|
||||
|
||||
for_each_class(class) {
|
||||
p = class->pick_next_task(rq);
|
||||
@ -4689,7 +4711,7 @@ int idle_cpu(int cpu)
|
||||
return 0;
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
if (!llist_empty(&rq->wake_list))
|
||||
if (rq->ttwu_pending)
|
||||
return 0;
|
||||
#endif
|
||||
|
||||
@ -6243,13 +6265,14 @@ void idle_task_exit(void)
|
||||
struct mm_struct *mm = current->active_mm;
|
||||
|
||||
BUG_ON(cpu_online(smp_processor_id()));
|
||||
BUG_ON(current != this_rq()->idle);
|
||||
|
||||
if (mm != &init_mm) {
|
||||
switch_mm(mm, &init_mm, current);
|
||||
current->active_mm = &init_mm;
|
||||
finish_arch_post_lock_switch();
|
||||
}
|
||||
mmdrop(mm);
|
||||
|
||||
/* finish_cpu(), as run on the BP, will clean up the active_mm state */
|
||||
}
|
||||
|
||||
/*
|
||||
@ -6539,7 +6562,6 @@ int sched_cpu_dying(unsigned int cpu)
|
||||
struct rq_flags rf;
|
||||
|
||||
/* Handle pending wakeups and then migrate everything off */
|
||||
sched_ttwu_pending();
|
||||
sched_tick_stop(cpu);
|
||||
|
||||
rq_lock_irqsave(rq, &rf);
|
||||
@ -6642,6 +6664,8 @@ void __init sched_init(void)
|
||||
root_task_group.cfs_rq = (struct cfs_rq **)ptr;
|
||||
ptr += nr_cpu_ids * sizeof(void **);
|
||||
|
||||
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
|
||||
init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
|
||||
#endif /* CONFIG_FAIR_GROUP_SCHED */
|
||||
#ifdef CONFIG_RT_GROUP_SCHED
|
||||
root_task_group.rt_se = (struct sched_rt_entity **)ptr;
|
||||
@ -6694,7 +6718,6 @@ void __init sched_init(void)
|
||||
init_rt_rq(&rq->rt);
|
||||
init_dl_rq(&rq->dl);
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
|
||||
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
|
||||
rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
|
||||
/*
|
||||
@ -6716,7 +6739,6 @@ void __init sched_init(void)
|
||||
* We achieve this by letting root_task_group's tasks sit
|
||||
* directly in rq->cfs (i.e root_task_group->se[] = NULL).
|
||||
*/
|
||||
init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
|
||||
init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
|
||||
#endif /* CONFIG_FAIR_GROUP_SCHED */
|
||||
|
||||
@ -6744,6 +6766,8 @@ void __init sched_init(void)
|
||||
#ifdef CONFIG_NO_HZ_COMMON
|
||||
rq->last_blocked_load_update_tick = jiffies;
|
||||
atomic_set(&rq->nohz_flags, 0);
|
||||
|
||||
rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
|
||||
#endif
|
||||
#endif /* CONFIG_SMP */
|
||||
hrtick_rq_init(rq);
|
||||
@ -7438,6 +7462,8 @@ static DEFINE_MUTEX(cfs_constraints_mutex);
|
||||
|
||||
const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
|
||||
static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
|
||||
/* More than 203 days if BW_SHIFT equals 20. */
|
||||
static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
|
||||
|
||||
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
|
||||
|
||||
@ -7465,6 +7491,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
|
||||
if (period > max_cfs_quota_period)
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
* Bound quota to defend quota against overflow during bandwidth shift.
|
||||
*/
|
||||
if (quota != RUNTIME_INF && quota > max_cfs_runtime)
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
* Prevent race between setting of cfs_rq->runtime_enabled and
|
||||
* unthrottle_offline_cfs_rqs().
|
||||
|
@ -5,6 +5,7 @@
|
||||
* Based on the work by Paul Menage (menage@google.com) and Balbir Singh
|
||||
* (balbir@in.ibm.com).
|
||||
*/
|
||||
#include <asm/irq_regs.h>
|
||||
#include "sched.h"
|
||||
|
||||
/* Time spent by the tasks of the CPU accounting group executing in ... */
|
||||
@ -339,7 +340,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
|
||||
{
|
||||
struct cpuacct *ca;
|
||||
int index = CPUACCT_STAT_SYSTEM;
|
||||
struct pt_regs *regs = task_pt_regs(tsk);
|
||||
struct pt_regs *regs = get_irq_regs() ? : task_pt_regs(tsk);
|
||||
|
||||
if (regs && user_mode(regs))
|
||||
index = CPUACCT_STAT_USER;
|
||||
@ -347,7 +348,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
|
||||
rcu_read_lock();
|
||||
|
||||
for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
|
||||
this_cpu_ptr(ca->cpuusage)->usages[index] += cputime;
|
||||
__this_cpu_add(ca->cpuusage->usages[index], cputime);
|
||||
|
||||
rcu_read_unlock();
|
||||
}
|
||||
@ -363,7 +364,7 @@ void cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
|
||||
|
||||
rcu_read_lock();
|
||||
for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca))
|
||||
this_cpu_ptr(ca->cpustat)->cpustat[index] += val;
|
||||
__this_cpu_add(ca->cpustat->cpustat[index], val);
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
|
@ -258,7 +258,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
|
||||
set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax);
|
||||
set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax);
|
||||
set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax);
|
||||
set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax);
|
||||
set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, proc_dointvec_minmax);
|
||||
set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
|
||||
set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring);
|
||||
/* &table[8] is terminator */
|
||||
@ -467,7 +467,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
|
||||
SEQ_printf(m, " S task PID tree-key switches prio"
|
||||
" wait-time sum-exec sum-sleep\n");
|
||||
SEQ_printf(m, "-------------------------------------------------------"
|
||||
"----------------------------------------------------\n");
|
||||
"------------------------------------------------------\n");
|
||||
|
||||
rcu_read_lock();
|
||||
for_each_process_thread(g, p) {
|
||||
@ -638,7 +638,6 @@ do { \
|
||||
|
||||
P(nr_running);
|
||||
P(nr_switches);
|
||||
P(nr_load_updates);
|
||||
P(nr_uninterruptible);
|
||||
PN(next_balance);
|
||||
SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
|
||||
|
@ -191,7 +191,7 @@ static void update_sysctl(void)
|
||||
#undef SET_SYSCTL
|
||||
}
|
||||
|
||||
void sched_init_granularity(void)
|
||||
void __init sched_init_granularity(void)
|
||||
{
|
||||
update_sysctl();
|
||||
}
|
||||
@ -1094,7 +1094,7 @@ struct numa_group {
|
||||
* more by CPU use than by memory faults.
|
||||
*/
|
||||
unsigned long *faults_cpu;
|
||||
unsigned long faults[0];
|
||||
unsigned long faults[];
|
||||
};
|
||||
|
||||
/*
|
||||
@ -3441,52 +3441,46 @@ static inline void
|
||||
update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
|
||||
{
|
||||
long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
|
||||
/*
|
||||
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
|
||||
* See ___update_load_avg() for details.
|
||||
*/
|
||||
u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
|
||||
|
||||
/* Nothing to update */
|
||||
if (!delta)
|
||||
return;
|
||||
|
||||
/*
|
||||
* The relation between sum and avg is:
|
||||
*
|
||||
* LOAD_AVG_MAX - 1024 + sa->period_contrib
|
||||
*
|
||||
* however, the PELT windows are not aligned between grq and gse.
|
||||
*/
|
||||
|
||||
/* Set new sched_entity's utilization */
|
||||
se->avg.util_avg = gcfs_rq->avg.util_avg;
|
||||
se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
|
||||
se->avg.util_sum = se->avg.util_avg * divider;
|
||||
|
||||
/* Update parent cfs_rq utilization */
|
||||
add_positive(&cfs_rq->avg.util_avg, delta);
|
||||
cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
|
||||
cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
|
||||
}
|
||||
|
||||
static inline void
|
||||
update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
|
||||
{
|
||||
long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
|
||||
/*
|
||||
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
|
||||
* See ___update_load_avg() for details.
|
||||
*/
|
||||
u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
|
||||
|
||||
/* Nothing to update */
|
||||
if (!delta)
|
||||
return;
|
||||
|
||||
/*
|
||||
* The relation between sum and avg is:
|
||||
*
|
||||
* LOAD_AVG_MAX - 1024 + sa->period_contrib
|
||||
*
|
||||
* however, the PELT windows are not aligned between grq and gse.
|
||||
*/
|
||||
|
||||
/* Set new sched_entity's runnable */
|
||||
se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
|
||||
se->avg.runnable_sum = se->avg.runnable_avg * LOAD_AVG_MAX;
|
||||
se->avg.runnable_sum = se->avg.runnable_avg * divider;
|
||||
|
||||
/* Update parent cfs_rq runnable */
|
||||
add_positive(&cfs_rq->avg.runnable_avg, delta);
|
||||
cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * LOAD_AVG_MAX;
|
||||
cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
|
||||
}
|
||||
|
||||
static inline void
|
||||
@ -3496,19 +3490,26 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
|
||||
unsigned long load_avg;
|
||||
u64 load_sum = 0;
|
||||
s64 delta_sum;
|
||||
u32 divider;
|
||||
|
||||
if (!runnable_sum)
|
||||
return;
|
||||
|
||||
gcfs_rq->prop_runnable_sum = 0;
|
||||
|
||||
/*
|
||||
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
|
||||
* See ___update_load_avg() for details.
|
||||
*/
|
||||
divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
|
||||
|
||||
if (runnable_sum >= 0) {
|
||||
/*
|
||||
* Add runnable; clip at LOAD_AVG_MAX. Reflects that until
|
||||
* the CPU is saturated running == runnable.
|
||||
*/
|
||||
runnable_sum += se->avg.load_sum;
|
||||
runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
|
||||
runnable_sum = min_t(long, runnable_sum, divider);
|
||||
} else {
|
||||
/*
|
||||
* Estimate the new unweighted runnable_sum of the gcfs_rq by
|
||||
@ -3533,7 +3534,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
|
||||
runnable_sum = max(runnable_sum, running_sum);
|
||||
|
||||
load_sum = (s64)se_weight(se) * runnable_sum;
|
||||
load_avg = div_s64(load_sum, LOAD_AVG_MAX);
|
||||
load_avg = div_s64(load_sum, divider);
|
||||
|
||||
delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
|
||||
delta_avg = load_avg - se->avg.load_avg;
|
||||
@ -3697,6 +3698,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
|
||||
*/
|
||||
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
{
|
||||
/*
|
||||
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
|
||||
* See ___update_load_avg() for details.
|
||||
*/
|
||||
u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
|
||||
|
||||
/*
|
||||
@ -3873,6 +3878,8 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
|
||||
return cfs_rq->avg.load_avg;
|
||||
}
|
||||
|
||||
static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
|
||||
|
||||
static inline unsigned long task_util(struct task_struct *p)
|
||||
{
|
||||
return READ_ONCE(p->se.avg.util_avg);
|
||||
@ -4054,7 +4061,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
|
||||
static inline void
|
||||
detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
|
||||
|
||||
static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
|
||||
static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
@ -4588,16 +4595,16 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
|
||||
}
|
||||
|
||||
/* returns 0 on failure to allocate runtime */
|
||||
static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
||||
static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
|
||||
struct cfs_rq *cfs_rq, u64 target_runtime)
|
||||
{
|
||||
struct task_group *tg = cfs_rq->tg;
|
||||
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
|
||||
u64 amount = 0, min_amount;
|
||||
u64 min_amount, amount = 0;
|
||||
|
||||
lockdep_assert_held(&cfs_b->lock);
|
||||
|
||||
/* note: this is a positive sum as runtime_remaining <= 0 */
|
||||
min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
|
||||
min_amount = target_runtime - cfs_rq->runtime_remaining;
|
||||
|
||||
raw_spin_lock(&cfs_b->lock);
|
||||
if (cfs_b->quota == RUNTIME_INF)
|
||||
amount = min_amount;
|
||||
else {
|
||||
@ -4609,13 +4616,25 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
||||
cfs_b->idle = 0;
|
||||
}
|
||||
}
|
||||
raw_spin_unlock(&cfs_b->lock);
|
||||
|
||||
cfs_rq->runtime_remaining += amount;
|
||||
|
||||
return cfs_rq->runtime_remaining > 0;
|
||||
}
|
||||
|
||||
/* returns 0 on failure to allocate runtime */
|
||||
static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
|
||||
int ret;
|
||||
|
||||
raw_spin_lock(&cfs_b->lock);
|
||||
ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
|
||||
raw_spin_unlock(&cfs_b->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
|
||||
{
|
||||
/* dock delta_exec before expiring quota (as it could span periods) */
|
||||
@ -4704,13 +4723,33 @@ static int tg_throttle_down(struct task_group *tg, void *data)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
struct rq *rq = rq_of(cfs_rq);
|
||||
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
|
||||
struct sched_entity *se;
|
||||
long task_delta, idle_task_delta, dequeue = 1;
|
||||
bool empty;
|
||||
|
||||
raw_spin_lock(&cfs_b->lock);
|
||||
/* This will start the period timer if necessary */
|
||||
if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
|
||||
/*
|
||||
* We have raced with bandwidth becoming available, and if we
|
||||
* actually throttled the timer might not unthrottle us for an
|
||||
* entire period. We additionally needed to make sure that any
|
||||
* subsequent check_cfs_rq_runtime calls agree not to throttle
|
||||
* us, as we may commit to do cfs put_prev+pick_next, so we ask
|
||||
* for 1ns of runtime rather than just check cfs_b.
|
||||
*/
|
||||
dequeue = 0;
|
||||
} else {
|
||||
list_add_tail_rcu(&cfs_rq->throttled_list,
|
||||
&cfs_b->throttled_cfs_rq);
|
||||
}
|
||||
raw_spin_unlock(&cfs_b->lock);
|
||||
|
||||
if (!dequeue)
|
||||
return false; /* Throttle no longer required. */
|
||||
|
||||
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
|
||||
|
||||
@ -4744,29 +4783,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
if (!se)
|
||||
sub_nr_running(rq, task_delta);
|
||||
|
||||
/*
|
||||
* Note: distribution will already see us throttled via the
|
||||
* throttled-list. rq->lock protects completion.
|
||||
*/
|
||||
cfs_rq->throttled = 1;
|
||||
cfs_rq->throttled_clock = rq_clock(rq);
|
||||
raw_spin_lock(&cfs_b->lock);
|
||||
empty = list_empty(&cfs_b->throttled_cfs_rq);
|
||||
|
||||
/*
|
||||
* Add to the _head_ of the list, so that an already-started
|
||||
* distribute_cfs_runtime will not see us. If disribute_cfs_runtime is
|
||||
* not running add to the tail so that later runqueues don't get starved.
|
||||
*/
|
||||
if (cfs_b->distribute_running)
|
||||
list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
|
||||
else
|
||||
list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
|
||||
|
||||
/*
|
||||
* If we're the first throttled task, make sure the bandwidth
|
||||
* timer is running.
|
||||
*/
|
||||
if (empty)
|
||||
start_cfs_bandwidth(cfs_b);
|
||||
|
||||
raw_spin_unlock(&cfs_b->lock);
|
||||
return true;
|
||||
}
|
||||
|
||||
void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
@ -4933,14 +4956,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
|
||||
/*
|
||||
* This check is repeated as we release cfs_b->lock while we unthrottle.
|
||||
*/
|
||||
while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
|
||||
cfs_b->distribute_running = 1;
|
||||
while (throttled && cfs_b->runtime > 0) {
|
||||
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
|
||||
/* we can't nest cfs_b->lock while distributing bandwidth */
|
||||
distribute_cfs_runtime(cfs_b);
|
||||
raw_spin_lock_irqsave(&cfs_b->lock, flags);
|
||||
|
||||
cfs_b->distribute_running = 0;
|
||||
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
|
||||
}
|
||||
|
||||
@ -5054,10 +5075,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
|
||||
/* confirm we're still not at a refresh boundary */
|
||||
raw_spin_lock_irqsave(&cfs_b->lock, flags);
|
||||
cfs_b->slack_started = false;
|
||||
if (cfs_b->distribute_running) {
|
||||
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
|
||||
return;
|
||||
}
|
||||
|
||||
if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
|
||||
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
|
||||
@ -5067,9 +5084,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
|
||||
if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
|
||||
runtime = cfs_b->runtime;
|
||||
|
||||
if (runtime)
|
||||
cfs_b->distribute_running = 1;
|
||||
|
||||
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
|
||||
|
||||
if (!runtime)
|
||||
@ -5078,7 +5092,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
|
||||
distribute_cfs_runtime(cfs_b);
|
||||
|
||||
raw_spin_lock_irqsave(&cfs_b->lock, flags);
|
||||
cfs_b->distribute_running = 0;
|
||||
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
|
||||
}
|
||||
|
||||
@ -5139,8 +5152,7 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
||||
if (cfs_rq_throttled(cfs_rq))
|
||||
return true;
|
||||
|
||||
throttle_cfs_rq(cfs_rq);
|
||||
return true;
|
||||
return throttle_cfs_rq(cfs_rq);
|
||||
}
|
||||
|
||||
static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
|
||||
@ -5170,6 +5182,8 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
|
||||
if (!overrun)
|
||||
break;
|
||||
|
||||
idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
|
||||
|
||||
if (++count > 3) {
|
||||
u64 new, old = ktime_to_ns(cfs_b->period);
|
||||
|
||||
@ -5199,8 +5213,6 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
|
||||
/* reset count so we don't come right back in here */
|
||||
count = 0;
|
||||
}
|
||||
|
||||
idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
|
||||
}
|
||||
if (idle)
|
||||
cfs_b->period_active = 0;
|
||||
@ -5221,7 +5233,6 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
|
||||
cfs_b->period_timer.function = sched_cfs_period_timer;
|
||||
hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
|
||||
cfs_b->slack_timer.function = sched_cfs_slack_timer;
|
||||
cfs_b->distribute_running = 0;
|
||||
cfs_b->slack_started = false;
|
||||
}
|
||||
|
||||
@ -5506,9 +5517,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
||||
list_add_leaf_cfs_rq(cfs_rq);
|
||||
}
|
||||
|
||||
enqueue_throttle:
|
||||
if (!se) {
|
||||
/* At this point se is NULL and we are at root level */
|
||||
add_nr_running(rq, 1);
|
||||
|
||||
/*
|
||||
* Since new tasks are assigned an initial util_avg equal to
|
||||
* half of the spare capacity of their CPU, tiny tasks have the
|
||||
@ -5526,8 +5537,7 @@ enqueue_throttle:
|
||||
if (flags & ENQUEUE_WAKEUP)
|
||||
update_overutilized_status(rq);
|
||||
|
||||
}
|
||||
|
||||
enqueue_throttle:
|
||||
if (cfs_bandwidth_used()) {
|
||||
/*
|
||||
* When bandwidth control is enabled; the cfs_rq_throttled()
|
||||
@ -5737,7 +5747,7 @@ static int wake_wide(struct task_struct *p)
|
||||
{
|
||||
unsigned int master = current->wakee_flips;
|
||||
unsigned int slave = p->wakee_flips;
|
||||
int factor = this_cpu_read(sd_llc_size);
|
||||
int factor = __this_cpu_read(sd_llc_size);
|
||||
|
||||
if (master < slave)
|
||||
swap(master, slave);
|
||||
@ -5846,8 +5856,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
|
||||
}
|
||||
|
||||
static struct sched_group *
|
||||
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
|
||||
int this_cpu, int sd_flag);
|
||||
find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
|
||||
|
||||
/*
|
||||
* find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
|
||||
@ -5930,7 +5939,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
|
||||
continue;
|
||||
}
|
||||
|
||||
group = find_idlest_group(sd, p, cpu, sd_flag);
|
||||
group = find_idlest_group(sd, p, cpu);
|
||||
if (!group) {
|
||||
sd = sd->child;
|
||||
continue;
|
||||
@ -6671,9 +6680,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
|
||||
|
||||
rcu_read_lock();
|
||||
for_each_domain(cpu, tmp) {
|
||||
if (!(tmp->flags & SD_LOAD_BALANCE))
|
||||
break;
|
||||
|
||||
/*
|
||||
* If both 'cpu' and 'prev_cpu' are part of this domain,
|
||||
* cpu is a valid SD_WAKE_AFFINE target.
|
||||
@ -8584,7 +8590,7 @@ static int idle_cpu_without(int cpu, struct task_struct *p)
|
||||
*/
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
if (!llist_empty(&rq->wake_list))
|
||||
if (rq->ttwu_pending)
|
||||
return 0;
|
||||
#endif
|
||||
|
||||
@ -8702,8 +8708,7 @@ static bool update_pick_idlest(struct sched_group *idlest,
|
||||
* Assumes p is allowed on at least one CPU in sd.
|
||||
*/
|
||||
static struct sched_group *
|
||||
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
|
||||
int this_cpu, int sd_flag)
|
||||
find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
|
||||
{
|
||||
struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
|
||||
struct sg_lb_stats local_sgs, tmp_sgs;
|
||||
@ -9434,7 +9439,7 @@ static int active_load_balance_cpu_stop(void *data);
|
||||
static int should_we_balance(struct lb_env *env)
|
||||
{
|
||||
struct sched_group *sg = env->sd->groups;
|
||||
int cpu, balance_cpu = -1;
|
||||
int cpu;
|
||||
|
||||
/*
|
||||
* Ensure the balancing environment is consistent; can happen
|
||||
@ -9455,18 +9460,12 @@ static int should_we_balance(struct lb_env *env)
|
||||
if (!idle_cpu(cpu))
|
||||
continue;
|
||||
|
||||
balance_cpu = cpu;
|
||||
break;
|
||||
/* Are we the first idle CPU? */
|
||||
return cpu == env->dst_cpu;
|
||||
}
|
||||
|
||||
if (balance_cpu == -1)
|
||||
balance_cpu = group_balance_cpu(sg);
|
||||
|
||||
/*
|
||||
* First idle CPU or the first CPU(busiest) in this sched group
|
||||
* is eligible for doing load balancing at this and above domains.
|
||||
*/
|
||||
return balance_cpu == env->dst_cpu;
|
||||
/* Are we the first CPU of this group? */
|
||||
return group_balance_cpu(sg) == env->dst_cpu;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -9819,8 +9818,7 @@ static int active_load_balance_cpu_stop(void *data)
|
||||
/* Search for an sd spanning us and the target CPU. */
|
||||
rcu_read_lock();
|
||||
for_each_domain(target_cpu, sd) {
|
||||
if ((sd->flags & SD_LOAD_BALANCE) &&
|
||||
cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
|
||||
if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
|
||||
break;
|
||||
}
|
||||
|
||||
@ -9910,9 +9908,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
|
||||
}
|
||||
max_cost += sd->max_newidle_lb_cost;
|
||||
|
||||
if (!(sd->flags & SD_LOAD_BALANCE))
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Stop the load balance at this level. There is another
|
||||
* CPU in our sched group which is doing load balancing more
|
||||
@ -10029,17 +10024,20 @@ static void kick_ilb(unsigned int flags)
|
||||
if (ilb_cpu >= nr_cpu_ids)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
|
||||
* the first flag owns it; cleared by nohz_csd_func().
|
||||
*/
|
||||
flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
|
||||
if (flags & NOHZ_KICK_MASK)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Use smp_send_reschedule() instead of resched_cpu().
|
||||
* This way we generate a sched IPI on the target CPU which
|
||||
* This way we generate an IPI on the target CPU which
|
||||
* is idle. And the softirq performing nohz idle load balance
|
||||
* will be run before returning from the IPI.
|
||||
*/
|
||||
smp_send_reschedule(ilb_cpu);
|
||||
smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -10377,20 +10375,14 @@ abort:
|
||||
*/
|
||||
static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
|
||||
{
|
||||
int this_cpu = this_rq->cpu;
|
||||
unsigned int flags;
|
||||
unsigned int flags = this_rq->nohz_idle_balance;
|
||||
|
||||
if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
|
||||
if (!flags)
|
||||
return false;
|
||||
|
||||
if (idle != CPU_IDLE) {
|
||||
atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
|
||||
return false;
|
||||
}
|
||||
this_rq->nohz_idle_balance = 0;
|
||||
|
||||
/* could be _relaxed() */
|
||||
flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
|
||||
if (!(flags & NOHZ_KICK_MASK))
|
||||
if (idle != CPU_IDLE)
|
||||
return false;
|
||||
|
||||
_nohz_idle_balance(this_rq, flags, idle);
|
||||
@ -10450,7 +10442,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
|
||||
* 0 - failed, no new tasks
|
||||
* > 0 - success, new (fair) tasks present
|
||||
*/
|
||||
int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
|
||||
static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
|
||||
{
|
||||
unsigned long next_balance = jiffies + HZ;
|
||||
int this_cpu = this_rq->cpu;
|
||||
@ -10501,9 +10493,6 @@ int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
|
||||
int continue_balancing = 1;
|
||||
u64 t0, domain_cost;
|
||||
|
||||
if (!(sd->flags & SD_LOAD_BALANCE))
|
||||
continue;
|
||||
|
||||
if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
|
||||
update_next_balance(sd, &next_balance);
|
||||
break;
|
||||
|
@ -289,7 +289,11 @@ static void do_idle(void)
|
||||
*/
|
||||
smp_mb__after_atomic();
|
||||
|
||||
sched_ttwu_pending();
|
||||
/*
|
||||
* RCU relies on this call to be done outside of an RCU read-side
|
||||
* critical section.
|
||||
*/
|
||||
flush_smp_call_function_from_idle();
|
||||
schedule_idle();
|
||||
|
||||
if (unlikely(klp_patch_pending(current)))
|
||||
|
@ -237,6 +237,30 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* When syncing *_avg with *_sum, we must take into account the current
|
||||
* position in the PELT segment otherwise the remaining part of the segment
|
||||
* will be considered as idle time whereas it's not yet elapsed and this will
|
||||
* generate unwanted oscillation in the range [1002..1024[.
|
||||
*
|
||||
* The max value of *_sum varies with the position in the time segment and is
|
||||
* equal to:
|
||||
*
|
||||
* LOAD_AVG_MAX*y + sa->period_contrib
|
||||
*
|
||||
* which can be simplified into:
|
||||
*
|
||||
* LOAD_AVG_MAX - 1024 + sa->period_contrib
|
||||
*
|
||||
* because LOAD_AVG_MAX*y == LOAD_AVG_MAX-1024
|
||||
*
|
||||
* The same care must be taken when a sched entity is added, updated or
|
||||
* removed from a cfs_rq and we need to update sched_avg. Scheduler entities
|
||||
* and the cfs rq, to which they are attached, have the same position in the
|
||||
* time segment because they use the same clock. This means that we can use
|
||||
* the period_contrib of cfs_rq when updating the sched_avg of a sched_entity
|
||||
* if it's more convenient.
|
||||
*/
|
||||
static __always_inline void
|
||||
___update_load_avg(struct sched_avg *sa, unsigned long load)
|
||||
{
|
||||
|
@ -9,6 +9,8 @@
|
||||
|
||||
int sched_rr_timeslice = RR_TIMESLICE;
|
||||
int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
|
||||
/* More than 4 hours if BW_SHIFT equals 20. */
|
||||
static const u64 max_rt_runtime = MAX_BW;
|
||||
|
||||
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
|
||||
|
||||
@ -2585,6 +2587,12 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
|
||||
if (rt_period == 0)
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
* Bound quota to defend quota against overflow during bandwidth shift.
|
||||
*/
|
||||
if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
|
||||
return -EINVAL;
|
||||
|
||||
mutex_lock(&rt_constraints_mutex);
|
||||
err = __rt_schedulable(tg, rt_period, rt_runtime);
|
||||
if (err)
|
||||
@ -2702,7 +2710,9 @@ static int sched_rt_global_validate(void)
|
||||
return -EINVAL;
|
||||
|
||||
if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
|
||||
(sysctl_sched_rt_runtime > sysctl_sched_rt_period))
|
||||
((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
|
||||
((u64)sysctl_sched_rt_runtime *
|
||||
NSEC_PER_USEC > max_rt_runtime)))
|
||||
return -EINVAL;
|
||||
|
||||
return 0;
|
||||
|
@ -349,7 +349,6 @@ struct cfs_bandwidth {
|
||||
|
||||
u8 idle;
|
||||
u8 period_active;
|
||||
u8 distribute_running;
|
||||
u8 slack_started;
|
||||
struct hrtimer period_timer;
|
||||
struct hrtimer slack_timer;
|
||||
@ -890,12 +889,15 @@ struct rq {
|
||||
#ifdef CONFIG_SMP
|
||||
unsigned long last_blocked_load_update_tick;
|
||||
unsigned int has_blocked_load;
|
||||
call_single_data_t nohz_csd;
|
||||
#endif /* CONFIG_SMP */
|
||||
unsigned int nohz_tick_stopped;
|
||||
atomic_t nohz_flags;
|
||||
#endif /* CONFIG_NO_HZ_COMMON */
|
||||
|
||||
unsigned long nr_load_updates;
|
||||
#ifdef CONFIG_SMP
|
||||
unsigned int ttwu_pending;
|
||||
#endif
|
||||
u64 nr_switches;
|
||||
|
||||
#ifdef CONFIG_UCLAMP_TASK
|
||||
@ -951,6 +953,7 @@ struct rq {
|
||||
|
||||
struct callback_head *balance_callback;
|
||||
|
||||
unsigned char nohz_idle_balance;
|
||||
unsigned char idle_balance;
|
||||
|
||||
unsigned long misfit_task_load;
|
||||
@ -979,7 +982,7 @@ struct rq {
|
||||
|
||||
/* This is used to determine avg_idle's max value */
|
||||
u64 max_idle_balance_cost;
|
||||
#endif
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
||||
u64 prev_irq_time;
|
||||
@ -1020,10 +1023,6 @@ struct rq {
|
||||
unsigned int ttwu_local;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
struct llist_head wake_list;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_CPU_IDLE
|
||||
/* Must be inspected within a rcu lock section */
|
||||
struct cpuidle_state *idle_state;
|
||||
@ -1367,8 +1366,6 @@ queue_balance_callback(struct rq *rq,
|
||||
rq->balance_callback = head;
|
||||
}
|
||||
|
||||
extern void sched_ttwu_pending(void);
|
||||
|
||||
#define rcu_dereference_check_sched_domain(p) \
|
||||
rcu_dereference_check((p), \
|
||||
lockdep_is_held(&sched_domains_mutex))
|
||||
@ -1461,7 +1458,7 @@ struct sched_group {
|
||||
* by attaching extra space to the end of the structure,
|
||||
* depending on how many CPUs the kernel has booted up with)
|
||||
*/
|
||||
unsigned long cpumask[0];
|
||||
unsigned long cpumask[];
|
||||
};
|
||||
|
||||
static inline struct cpumask *sched_group_span(struct sched_group *sg)
|
||||
@ -1504,15 +1501,11 @@ static inline void unregister_sched_domain_sysctl(void)
|
||||
}
|
||||
#endif
|
||||
|
||||
extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
|
||||
extern void flush_smp_call_function_from_idle(void);
|
||||
|
||||
#else
|
||||
|
||||
static inline void sched_ttwu_pending(void) { }
|
||||
|
||||
static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; }
|
||||
|
||||
#endif /* CONFIG_SMP */
|
||||
#else /* !CONFIG_SMP: */
|
||||
static inline void flush_smp_call_function_from_idle(void) { }
|
||||
#endif
|
||||
|
||||
#include "stats.h"
|
||||
#include "autogroup.h"
|
||||
@ -1688,7 +1681,8 @@ static inline int task_on_rq_migrating(struct task_struct *p)
|
||||
*/
|
||||
#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */
|
||||
#define WF_FORK 0x02 /* Child wakeup after fork */
|
||||
#define WF_MIGRATED 0x4 /* Internal use, task got migrated */
|
||||
#define WF_MIGRATED 0x04 /* Internal use, task got migrated */
|
||||
#define WF_ON_RQ 0x08 /* Wakee is on_rq */
|
||||
|
||||
/*
|
||||
* To aid in avoiding the subversion of "niceness" due to uneven distribution
|
||||
@ -1918,6 +1912,8 @@ extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
|
||||
#define BW_SHIFT 20
|
||||
#define BW_UNIT (1 << BW_SHIFT)
|
||||
#define RATIO_SHIFT 8
|
||||
#define MAX_BW_BITS (64 - BW_SHIFT)
|
||||
#define MAX_BW ((1ULL << MAX_BW_BITS) - 1)
|
||||
unsigned long to_ratio(u64 period, u64 runtime);
|
||||
|
||||
extern void init_entity_runnable_average(struct sched_entity *se);
|
||||
|
9
kernel/sched/smp.h
Normal file
9
kernel/sched/smp.h
Normal file
@ -0,0 +1,9 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* Scheduler internal SMP callback types and methods between the scheduler
|
||||
* and other internal parts of the core kernel:
|
||||
*/
|
||||
|
||||
extern void sched_ttwu_pending(void *arg);
|
||||
|
||||
extern void send_call_function_single_ipi(int cpu);
|
@ -33,14 +33,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
|
||||
cpumask_clear(groupmask);
|
||||
|
||||
printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
|
||||
|
||||
if (!(sd->flags & SD_LOAD_BALANCE)) {
|
||||
printk("does not load-balance\n");
|
||||
if (sd->parent)
|
||||
printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
|
||||
return -1;
|
||||
}
|
||||
|
||||
printk(KERN_CONT "span=%*pbl level=%s\n",
|
||||
cpumask_pr_args(sched_domain_span(sd)), sd->name);
|
||||
|
||||
@ -151,8 +143,7 @@ static int sd_degenerate(struct sched_domain *sd)
|
||||
return 1;
|
||||
|
||||
/* Following flags need at least 2 groups */
|
||||
if (sd->flags & (SD_LOAD_BALANCE |
|
||||
SD_BALANCE_NEWIDLE |
|
||||
if (sd->flags & (SD_BALANCE_NEWIDLE |
|
||||
SD_BALANCE_FORK |
|
||||
SD_BALANCE_EXEC |
|
||||
SD_SHARE_CPUCAPACITY |
|
||||
@ -183,8 +174,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
|
||||
|
||||
/* Flags needing groups don't count if only 1 group in parent */
|
||||
if (parent->groups == parent->groups->next) {
|
||||
pflags &= ~(SD_LOAD_BALANCE |
|
||||
SD_BALANCE_NEWIDLE |
|
||||
pflags &= ~(SD_BALANCE_NEWIDLE |
|
||||
SD_BALANCE_FORK |
|
||||
SD_BALANCE_EXEC |
|
||||
SD_ASYM_CPUCAPACITY |
|
||||
@ -1351,8 +1341,7 @@ sd_init(struct sched_domain_topology_level *tl,
|
||||
|
||||
.cache_nice_tries = 0,
|
||||
|
||||
.flags = 1*SD_LOAD_BALANCE
|
||||
| 1*SD_BALANCE_NEWIDLE
|
||||
.flags = 1*SD_BALANCE_NEWIDLE
|
||||
| 1*SD_BALANCE_EXEC
|
||||
| 1*SD_BALANCE_FORK
|
||||
| 0*SD_BALANCE_WAKE
|
||||
|
183
kernel/smp.c
183
kernel/smp.c
@ -22,11 +22,9 @@
|
||||
#include <linux/hypervisor.h>
|
||||
|
||||
#include "smpboot.h"
|
||||
#include "sched/smp.h"
|
||||
|
||||
enum {
|
||||
CSD_FLAG_LOCK = 0x01,
|
||||
CSD_FLAG_SYNCHRONOUS = 0x02,
|
||||
};
|
||||
#define CSD_TYPE(_csd) ((_csd)->flags & CSD_FLAG_TYPE_MASK)
|
||||
|
||||
struct call_function_data {
|
||||
call_single_data_t __percpu *csd;
|
||||
@ -84,6 +82,7 @@ int smpcfd_dying_cpu(unsigned int cpu)
|
||||
* still pending.
|
||||
*/
|
||||
flush_smp_call_function_queue(false);
|
||||
irq_work_run();
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -134,15 +133,33 @@ static __always_inline void csd_unlock(call_single_data_t *csd)
|
||||
|
||||
static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
|
||||
|
||||
void __smp_call_single_queue(int cpu, struct llist_node *node)
|
||||
{
|
||||
/*
|
||||
* The list addition should be visible before sending the IPI
|
||||
* handler locks the list to pull the entry off it because of
|
||||
* normal cache coherency rules implied by spinlocks.
|
||||
*
|
||||
* If IPIs can go out of order to the cache coherency protocol
|
||||
* in an architecture, sufficient synchronisation should be added
|
||||
* to arch code to make it appear to obey cache coherency WRT
|
||||
* locking and barrier primitives. Generic code isn't really
|
||||
* equipped to do the right thing...
|
||||
*/
|
||||
if (llist_add(node, &per_cpu(call_single_queue, cpu)))
|
||||
send_call_function_single_ipi(cpu);
|
||||
}
|
||||
|
||||
/*
|
||||
* Insert a previously allocated call_single_data_t element
|
||||
* for execution on the given CPU. data must already have
|
||||
* ->func, ->info, and ->flags set.
|
||||
*/
|
||||
static int generic_exec_single(int cpu, call_single_data_t *csd,
|
||||
smp_call_func_t func, void *info)
|
||||
static int generic_exec_single(int cpu, call_single_data_t *csd)
|
||||
{
|
||||
if (cpu == smp_processor_id()) {
|
||||
smp_call_func_t func = csd->func;
|
||||
void *info = csd->info;
|
||||
unsigned long flags;
|
||||
|
||||
/*
|
||||
@ -156,28 +173,12 @@ static int generic_exec_single(int cpu, call_single_data_t *csd,
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) {
|
||||
csd_unlock(csd);
|
||||
return -ENXIO;
|
||||
}
|
||||
|
||||
csd->func = func;
|
||||
csd->info = info;
|
||||
|
||||
/*
|
||||
* The list addition should be visible before sending the IPI
|
||||
* handler locks the list to pull the entry off it because of
|
||||
* normal cache coherency rules implied by spinlocks.
|
||||
*
|
||||
* If IPIs can go out of order to the cache coherency protocol
|
||||
* in an architecture, sufficient synchronisation should be added
|
||||
* to arch code to make it appear to obey cache coherency WRT
|
||||
* locking and barrier primitives. Generic code isn't really
|
||||
* equipped to do the right thing...
|
||||
*/
|
||||
if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
|
||||
arch_send_call_function_single_ipi(cpu);
|
||||
__smp_call_single_queue(cpu, &csd->llist);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -209,9 +210,9 @@ void generic_smp_call_function_single_interrupt(void)
|
||||
*/
|
||||
static void flush_smp_call_function_queue(bool warn_cpu_offline)
|
||||
{
|
||||
struct llist_head *head;
|
||||
struct llist_node *entry;
|
||||
call_single_data_t *csd, *csd_next;
|
||||
struct llist_node *entry, *prev;
|
||||
struct llist_head *head;
|
||||
static bool warned;
|
||||
|
||||
lockdep_assert_irqs_disabled();
|
||||
@ -230,32 +231,99 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
|
||||
* We don't have to use the _safe() variant here
|
||||
* because we are not invoking the IPI handlers yet.
|
||||
*/
|
||||
llist_for_each_entry(csd, entry, llist)
|
||||
llist_for_each_entry(csd, entry, llist) {
|
||||
switch (CSD_TYPE(csd)) {
|
||||
case CSD_TYPE_ASYNC:
|
||||
case CSD_TYPE_SYNC:
|
||||
case CSD_TYPE_IRQ_WORK:
|
||||
pr_warn("IPI callback %pS sent to offline CPU\n",
|
||||
csd->func);
|
||||
break;
|
||||
|
||||
case CSD_TYPE_TTWU:
|
||||
pr_warn("IPI task-wakeup sent to offline CPU\n");
|
||||
break;
|
||||
|
||||
default:
|
||||
pr_warn("IPI callback, unknown type %d, sent to offline CPU\n",
|
||||
CSD_TYPE(csd));
|
||||
break;
|
||||
}
|
||||
|
||||
llist_for_each_entry_safe(csd, csd_next, entry, llist) {
|
||||
smp_call_func_t func = csd->func;
|
||||
void *info = csd->info;
|
||||
|
||||
/* Do we wait until *after* callback? */
|
||||
if (csd->flags & CSD_FLAG_SYNCHRONOUS) {
|
||||
func(info);
|
||||
csd_unlock(csd);
|
||||
} else {
|
||||
csd_unlock(csd);
|
||||
func(info);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle irq works queued remotely by irq_work_queue_on().
|
||||
* Smp functions above are typically synchronous so they
|
||||
* better run first since some other CPUs may be busy waiting
|
||||
* for them.
|
||||
* First; run all SYNC callbacks, people are waiting for us.
|
||||
*/
|
||||
irq_work_run();
|
||||
prev = NULL;
|
||||
llist_for_each_entry_safe(csd, csd_next, entry, llist) {
|
||||
/* Do we wait until *after* callback? */
|
||||
if (CSD_TYPE(csd) == CSD_TYPE_SYNC) {
|
||||
smp_call_func_t func = csd->func;
|
||||
void *info = csd->info;
|
||||
|
||||
if (prev) {
|
||||
prev->next = &csd_next->llist;
|
||||
} else {
|
||||
entry = &csd_next->llist;
|
||||
}
|
||||
|
||||
func(info);
|
||||
csd_unlock(csd);
|
||||
} else {
|
||||
prev = &csd->llist;
|
||||
}
|
||||
}
|
||||
|
||||
if (!entry)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Second; run all !SYNC callbacks.
|
||||
*/
|
||||
prev = NULL;
|
||||
llist_for_each_entry_safe(csd, csd_next, entry, llist) {
|
||||
int type = CSD_TYPE(csd);
|
||||
|
||||
if (type != CSD_TYPE_TTWU) {
|
||||
if (prev) {
|
||||
prev->next = &csd_next->llist;
|
||||
} else {
|
||||
entry = &csd_next->llist;
|
||||
}
|
||||
|
||||
if (type == CSD_TYPE_ASYNC) {
|
||||
smp_call_func_t func = csd->func;
|
||||
void *info = csd->info;
|
||||
|
||||
csd_unlock(csd);
|
||||
func(info);
|
||||
} else if (type == CSD_TYPE_IRQ_WORK) {
|
||||
irq_work_single(csd);
|
||||
}
|
||||
|
||||
} else {
|
||||
prev = &csd->llist;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Third; only CSD_TYPE_TTWU is left, issue those.
|
||||
*/
|
||||
if (entry)
|
||||
sched_ttwu_pending(entry);
|
||||
}
|
||||
|
||||
void flush_smp_call_function_from_idle(void)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
if (llist_empty(this_cpu_ptr(&call_single_queue)))
|
||||
return;
|
||||
|
||||
local_irq_save(flags);
|
||||
flush_smp_call_function_queue(true);
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -271,7 +339,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
|
||||
{
|
||||
call_single_data_t *csd;
|
||||
call_single_data_t csd_stack = {
|
||||
.flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS,
|
||||
.flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC,
|
||||
};
|
||||
int this_cpu;
|
||||
int err;
|
||||
@ -305,7 +373,10 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
|
||||
csd_lock(csd);
|
||||
}
|
||||
|
||||
err = generic_exec_single(cpu, csd, func, info);
|
||||
csd->func = func;
|
||||
csd->info = info;
|
||||
|
||||
err = generic_exec_single(cpu, csd);
|
||||
|
||||
if (wait)
|
||||
csd_lock_wait(csd);
|
||||
@ -351,7 +422,7 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd)
|
||||
csd->flags = CSD_FLAG_LOCK;
|
||||
smp_wmb();
|
||||
|
||||
err = generic_exec_single(cpu, csd, csd->func, csd->info);
|
||||
err = generic_exec_single(cpu, csd);
|
||||
|
||||
out:
|
||||
preempt_enable();
|
||||
@ -466,7 +537,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
|
||||
|
||||
csd_lock(csd);
|
||||
if (wait)
|
||||
csd->flags |= CSD_FLAG_SYNCHRONOUS;
|
||||
csd->flags |= CSD_TYPE_SYNC;
|
||||
csd->func = func;
|
||||
csd->info = info;
|
||||
if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
|
||||
@ -598,6 +669,24 @@ void __init smp_init(void)
|
||||
{
|
||||
int num_nodes, num_cpus;
|
||||
|
||||
/*
|
||||
* Ensure struct irq_work layout matches so that
|
||||
* flush_smp_call_function_queue() can do horrible things.
|
||||
*/
|
||||
BUILD_BUG_ON(offsetof(struct irq_work, llnode) !=
|
||||
offsetof(struct __call_single_data, llist));
|
||||
BUILD_BUG_ON(offsetof(struct irq_work, func) !=
|
||||
offsetof(struct __call_single_data, func));
|
||||
BUILD_BUG_ON(offsetof(struct irq_work, flags) !=
|
||||
offsetof(struct __call_single_data, flags));
|
||||
|
||||
/*
|
||||
* Assert the CSD_TYPE_TTWU layout is similar enough
|
||||
* for task_struct to be on the @call_single_queue.
|
||||
*/
|
||||
BUILD_BUG_ON(offsetof(struct task_struct, wake_entry_type) - offsetof(struct task_struct, wake_entry) !=
|
||||
offsetof(struct __call_single_data, flags) - offsetof(struct __call_single_data, llist));
|
||||
|
||||
idle_threads_init();
|
||||
cpuhp_threads_init();
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user