linux/kernel/sched/idle.c
Peter Zijlstra e78a7614f3 idle: Prevent late-arriving interrupts from disrupting offline
Scheduling-clock interrupts can arrive late in the CPU-offline process,
after idle entry and the subsequent call to cpuhp_report_idle_dead().
Once execution passes the call to rcu_report_dead(), RCU is ignoring
the CPU, which results in lockdep complaints when the interrupt handler
uses RCU:

------------------------------------------------------------------------

=============================
WARNING: suspicious RCU usage
5.2.0-rc1+ #681 Not tainted
-----------------------------
kernel/sched/fair.c:9542 suspicious rcu_dereference_check() usage!

other info that might help us debug this:

RCU used illegally from offline CPU!
rcu_scheduler_active = 2, debug_locks = 1
no locks held by swapper/5/0.

stack backtrace:
CPU: 5 PID: 0 Comm: swapper/5 Not tainted 5.2.0-rc1+ #681
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS Bochs 01/01/2011
Call Trace:
 <IRQ>
 dump_stack+0x5e/0x8b
 trigger_load_balance+0xa8/0x390
 ? tick_sched_do_timer+0x60/0x60
 update_process_times+0x3b/0x50
 tick_sched_handle+0x2f/0x40
 tick_sched_timer+0x32/0x70
 __hrtimer_run_queues+0xd3/0x3b0
 hrtimer_interrupt+0x11d/0x270
 ? sched_clock_local+0xc/0x74
 smp_apic_timer_interrupt+0x79/0x200
 apic_timer_interrupt+0xf/0x20
 </IRQ>
RIP: 0010:delay_tsc+0x22/0x50
Code: ff 0f 1f 80 00 00 00 00 65 44 8b 05 18 a7 11 48 0f ae e8 0f 31 48 89 d6 48 c1 e6 20 48 09 c6 eb 0e f3 90 65 8b 05 fe a6 11 48 <41> 39 c0 75 18 0f ae e8 0f 31 48 c1 e2 20 48 09 c2 48 89 d0 48 29
RSP: 0000:ffff8f92c0157ed0 EFLAGS: 00000212 ORIG_RAX: ffffffffffffff13
RAX: 0000000000000005 RBX: ffff8c861f356400 RCX: ffff8f92c0157e64
RDX: 000000321214c8cc RSI: 00000032120daa7f RDI: 0000000000260f15
RBP: 0000000000000005 R08: 0000000000000005 R09: 0000000000000000
R10: 0000000000000001 R11: 0000000000000001 R12: 0000000000000000
R13: 0000000000000000 R14: ffff8c861ee18000 R15: ffff8c861ee18000
 cpuhp_report_idle_dead+0x31/0x60
 do_idle+0x1d5/0x200
 ? _raw_spin_unlock_irqrestore+0x2d/0x40
 cpu_startup_entry+0x14/0x20
 start_secondary+0x151/0x170
 secondary_startup_64+0xa4/0xb0

------------------------------------------------------------------------

This happens rarely, but can be forced by happen more often by
placing delays in cpuhp_report_idle_dead() following the call to
rcu_report_dead().  With this in place, the following rcutorture
scenario reproduces the problem within a few minutes:

tools/testing/selftests/rcutorture/bin/kvm.sh --cpus 8 --duration 5 --kconfig "CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y" --configs "TREE04"

This commit uses the crude but effective expedient of moving the disabling
of interrupts within the idle loop to precede the cpu_is_offline()
check.  It also invokes tick_nohz_idle_stop_tick() instead of
tick_nohz_idle_stop_tick_protected() to shut off the scheduling-clock
interrupt.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
[ paulmck: Revert tick_nohz_idle_stop_tick_protected() removal, new callers. ]
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
2019-08-12 11:23:56 -07:00

470 lines
11 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* Generic entry points for the idle threads and
* implementation of the idle task scheduling class.
*
* (NOTE: these are not related to SCHED_IDLE batch scheduled
* tasks which are handled in sched/fair.c )
*/
#include "sched.h"
#include <trace/events/power.h>
/* Linker adds these: start and end of __cpuidle functions */
extern char __cpuidle_text_start[], __cpuidle_text_end[];
/**
* sched_idle_set_state - Record idle state for the current CPU.
* @idle_state: State to record.
*/
void sched_idle_set_state(struct cpuidle_state *idle_state)
{
idle_set_state(this_rq(), idle_state);
}
static int __read_mostly cpu_idle_force_poll;
void cpu_idle_poll_ctrl(bool enable)
{
if (enable) {
cpu_idle_force_poll++;
} else {
cpu_idle_force_poll--;
WARN_ON_ONCE(cpu_idle_force_poll < 0);
}
}
#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP
static int __init cpu_idle_poll_setup(char *__unused)
{
cpu_idle_force_poll = 1;
return 1;
}
__setup("nohlt", cpu_idle_poll_setup);
static int __init cpu_idle_nopoll_setup(char *__unused)
{
cpu_idle_force_poll = 0;
return 1;
}
__setup("hlt", cpu_idle_nopoll_setup);
#endif
static noinline int __cpuidle cpu_idle_poll(void)
{
rcu_idle_enter();
trace_cpu_idle_rcuidle(0, smp_processor_id());
local_irq_enable();
stop_critical_timings();
while (!tif_need_resched() &&
(cpu_idle_force_poll || tick_check_broadcast_expired()))
cpu_relax();
start_critical_timings();
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
rcu_idle_exit();
return 1;
}
/* Weak implementations for optional arch specific functions */
void __weak arch_cpu_idle_prepare(void) { }
void __weak arch_cpu_idle_enter(void) { }
void __weak arch_cpu_idle_exit(void) { }
void __weak arch_cpu_idle_dead(void) { }
void __weak arch_cpu_idle(void)
{
cpu_idle_force_poll = 1;
local_irq_enable();
}
/**
* default_idle_call - Default CPU idle routine.
*
* To use when the cpuidle framework cannot be used.
*/
void __cpuidle default_idle_call(void)
{
if (current_clr_polling_and_test()) {
local_irq_enable();
} else {
stop_critical_timings();
arch_cpu_idle();
start_critical_timings();
}
}
static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
int next_state)
{
/*
* The idle task must be scheduled, it is pointless to go to idle, just
* update no idle residency and return.
*/
if (current_clr_polling_and_test()) {
dev->last_residency = 0;
local_irq_enable();
return -EBUSY;
}
/*
* Enter the idle state previously returned by the governor decision.
* This function will block until an interrupt occurs and will take
* care of re-enabling the local interrupts
*/
return cpuidle_enter(drv, dev, next_state);
}
/**
* cpuidle_idle_call - the main idle function
*
* NOTE: no locks or semaphores should be used here
*
* On archs that support TIF_POLLING_NRFLAG, is called with polling
* set, and it returns with polling set. If it ever stops polling, it
* must clear the polling bit.
*/
static void cpuidle_idle_call(void)
{
struct cpuidle_device *dev = cpuidle_get_device();
struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
int next_state, entered_state;
/*
* Check if the idle task must be rescheduled. If it is the
* case, exit the function after re-enabling the local irq.
*/
if (need_resched()) {
local_irq_enable();
return;
}
/*
* The RCU framework needs to be told that we are entering an idle
* section, so no more rcu read side critical sections and one more
* step to the grace period
*/
if (cpuidle_not_available(drv, dev)) {
tick_nohz_idle_stop_tick();
rcu_idle_enter();
default_idle_call();
goto exit_idle;
}
/*
* Suspend-to-idle ("s2idle") is a system state in which all user space
* has been frozen, all I/O devices have been suspended and the only
* activity happens here and in iterrupts (if any). In that case bypass
* the cpuidle governor and go stratight for the deepest idle state
* available. Possibly also suspend the local tick and the entire
* timekeeping to prevent timer interrupts from kicking us out of idle
* until a proper wakeup interrupt happens.
*/
if (idle_should_enter_s2idle() || dev->use_deepest_state) {
if (idle_should_enter_s2idle()) {
rcu_idle_enter();
entered_state = cpuidle_enter_s2idle(drv, dev);
if (entered_state > 0) {
local_irq_enable();
goto exit_idle;
}
rcu_idle_exit();
}
tick_nohz_idle_stop_tick();
rcu_idle_enter();
next_state = cpuidle_find_deepest_state(drv, dev);
call_cpuidle(drv, dev, next_state);
} else {
bool stop_tick = true;
/*
* Ask the cpuidle framework to choose a convenient idle state.
*/
next_state = cpuidle_select(drv, dev, &stop_tick);
if (stop_tick || tick_nohz_tick_stopped())
tick_nohz_idle_stop_tick();
else
tick_nohz_idle_retain_tick();
rcu_idle_enter();
entered_state = call_cpuidle(drv, dev, next_state);
/*
* Give the governor an opportunity to reflect on the outcome
*/
cpuidle_reflect(dev, entered_state);
}
exit_idle:
__current_set_polling();
/*
* It is up to the idle functions to reenable local interrupts
*/
if (WARN_ON_ONCE(irqs_disabled()))
local_irq_enable();
rcu_idle_exit();
}
/*
* Generic idle loop implementation
*
* Called with polling cleared.
*/
static void do_idle(void)
{
int cpu = smp_processor_id();
/*
* If the arch has a polling bit, we maintain an invariant:
*
* Our polling bit is clear if we're not scheduled (i.e. if rq->curr !=
* rq->idle). This means that, if rq->idle has the polling bit set,
* then setting need_resched is guaranteed to cause the CPU to
* reschedule.
*/
__current_set_polling();
tick_nohz_idle_enter();
while (!need_resched()) {
check_pgt_cache();
rmb();
local_irq_disable();
if (cpu_is_offline(cpu)) {
tick_nohz_idle_stop_tick();
cpuhp_report_idle_dead();
arch_cpu_idle_dead();
}
arch_cpu_idle_enter();
/*
* In poll mode we reenable interrupts and spin. Also if we
* detected in the wakeup from idle path that the tick
* broadcast device expired for us, we don't want to go deep
* idle as we know that the IPI is going to arrive right away.
*/
if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
tick_nohz_idle_restart_tick();
cpu_idle_poll();
} else {
cpuidle_idle_call();
}
arch_cpu_idle_exit();
}
/*
* Since we fell out of the loop above, we know TIF_NEED_RESCHED must
* be set, propagate it into PREEMPT_NEED_RESCHED.
*
* This is required because for polling idle loops we will not have had
* an IPI to fold the state for us.
*/
preempt_set_need_resched();
tick_nohz_idle_exit();
__current_clr_polling();
/*
* We promise to call sched_ttwu_pending() and reschedule if
* need_resched() is set while polling is set. That means that clearing
* polling needs to be visible before doing these things.
*/
smp_mb__after_atomic();
sched_ttwu_pending();
schedule_idle();
if (unlikely(klp_patch_pending(current)))
klp_update_patch_state(current);
}
bool cpu_in_idle(unsigned long pc)
{
return pc >= (unsigned long)__cpuidle_text_start &&
pc < (unsigned long)__cpuidle_text_end;
}
struct idle_timer {
struct hrtimer timer;
int done;
};
static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer)
{
struct idle_timer *it = container_of(timer, struct idle_timer, timer);
WRITE_ONCE(it->done, 1);
set_tsk_need_resched(current);
return HRTIMER_NORESTART;
}
void play_idle(unsigned long duration_ms)
{
struct idle_timer it;
/*
* Only FIFO tasks can disable the tick since they don't need the forced
* preemption.
*/
WARN_ON_ONCE(current->policy != SCHED_FIFO);
WARN_ON_ONCE(current->nr_cpus_allowed != 1);
WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
WARN_ON_ONCE(!duration_ms);
rcu_sleep_check();
preempt_disable();
current->flags |= PF_IDLE;
cpuidle_use_deepest_state(true);
it.done = 0;
hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
it.timer.function = idle_inject_timer_fn;
hrtimer_start(&it.timer, ms_to_ktime(duration_ms), HRTIMER_MODE_REL_PINNED);
while (!READ_ONCE(it.done))
do_idle();
cpuidle_use_deepest_state(false);
current->flags &= ~PF_IDLE;
preempt_fold_need_resched();
preempt_enable();
}
EXPORT_SYMBOL_GPL(play_idle);
void cpu_startup_entry(enum cpuhp_state state)
{
arch_cpu_idle_prepare();
cpuhp_online_idle(state);
while (1)
do_idle();
}
/*
* idle-task scheduling class.
*/
#ifdef CONFIG_SMP
static int
select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
{
return task_cpu(p); /* IDLE tasks as never migrated */
}
#endif
/*
* Idle tasks are unconditionally rescheduled:
*/
static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
{
resched_curr(rq);
}
static struct task_struct *
pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
put_prev_task(rq, prev);
update_idle_core(rq);
schedstat_inc(rq->sched_goidle);
return rq->idle;
}
/*
* It is not legal to sleep in the idle task - print a warning
* message if some code attempts to do it:
*/
static void
dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
{
raw_spin_unlock_irq(&rq->lock);
printk(KERN_ERR "bad: scheduling from the idle thread!\n");
dump_stack();
raw_spin_lock_irq(&rq->lock);
}
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
}
/*
* scheduler tick hitting a task of our scheduling class.
*
* NOTE: This function can be called remotely by the tick offload that
* goes along full dynticks. Therefore no local assumption can be made
* and everything must be accessed through the @rq and @curr passed in
* parameters.
*/
static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
{
}
static void set_curr_task_idle(struct rq *rq)
{
}
static void switched_to_idle(struct rq *rq, struct task_struct *p)
{
BUG();
}
static void
prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
{
BUG();
}
static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
{
return 0;
}
static void update_curr_idle(struct rq *rq)
{
}
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
const struct sched_class idle_sched_class = {
/* .next is NULL */
/* no enqueue/yield_task for idle tasks */
/* dequeue is not valid, we print a debug message there: */
.dequeue_task = dequeue_task_idle,
.check_preempt_curr = check_preempt_curr_idle,
.pick_next_task = pick_next_task_idle,
.put_prev_task = put_prev_task_idle,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_idle,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
.set_curr_task = set_curr_task_idle,
.task_tick = task_tick_idle,
.get_rr_interval = get_rr_interval_idle,
.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,
.update_curr = update_curr_idle,
};