From d65d411c9259a2499081e1e7ed91088232666b57 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 25 Jul 2023 12:08:50 +0100 Subject: [PATCH 01/90] treewide: context_tracking: Rename CONTEXT_* into CT_STATE_* Context tracking state related symbols currently use a mix of the CONTEXT_ (e.g. CONTEXT_KERNEL) and CT_SATE_ (e.g. CT_STATE_MASK) prefixes. Clean up the naming and make the ctx_state enum use the CT_STATE_ prefix. Suggested-by: Frederic Weisbecker Signed-off-by: Valentin Schneider Acked-by: Frederic Weisbecker Acked-by: Thomas Gleixner Signed-off-by: Neeraj Upadhyay --- arch/Kconfig | 2 +- arch/arm64/kernel/entry-common.c | 2 +- arch/powerpc/include/asm/interrupt.h | 6 +++--- arch/powerpc/kernel/interrupt.c | 6 +++--- arch/powerpc/kernel/syscall.c | 2 +- arch/x86/entry/common.c | 2 +- include/linux/context_tracking.h | 16 ++++++++-------- include/linux/context_tracking_state.h | 20 ++++++++++---------- include/linux/entry-common.h | 2 +- kernel/context_tracking.c | 12 ++++++------ kernel/entry/common.c | 2 +- kernel/sched/core.c | 4 ++-- 12 files changed, 38 insertions(+), 38 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 975dd22a2dbd..4e2eaba9e305 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -862,7 +862,7 @@ config HAVE_CONTEXT_TRACKING_USER_OFFSTACK Architecture neither relies on exception_enter()/exception_exit() nor on schedule_user(). Also preempt_schedule_notrace() and preempt_schedule_irq() can't be called in a preemptible section - while context tracking is CONTEXT_USER. This feature reflects a sane + while context tracking is CT_STATE_USER. This feature reflects a sane entry implementation where the following requirements are met on critical entry code, ie: before user_exit() or after user_enter(): diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index b77a15955f28..3fcd9d080bf2 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -103,7 +103,7 @@ static void noinstr exit_to_kernel_mode(struct pt_regs *regs) static __always_inline void __enter_from_user_mode(void) { lockdep_hardirqs_off(CALLER_ADDR0); - CT_WARN_ON(ct_state() != CONTEXT_USER); + CT_WARN_ON(ct_state() != CT_STATE_USER); user_exit_irqoff(); trace_hardirqs_off_finish(); mte_disable_tco_entry(current); diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index 2d6c886b40f4..23638d4e73ac 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -177,7 +177,7 @@ static inline void interrupt_enter_prepare(struct pt_regs *regs) if (user_mode(regs)) { kuap_lock(); - CT_WARN_ON(ct_state() != CONTEXT_USER); + CT_WARN_ON(ct_state() != CT_STATE_USER); user_exit_irqoff(); account_cpu_user_entry(); @@ -189,8 +189,8 @@ static inline void interrupt_enter_prepare(struct pt_regs *regs) * so avoid recursion. */ if (TRAP(regs) != INTERRUPT_PROGRAM) - CT_WARN_ON(ct_state() != CONTEXT_KERNEL && - ct_state() != CONTEXT_IDLE); + CT_WARN_ON(ct_state() != CT_STATE_KERNEL && + ct_state() != CT_STATE_IDLE); INT_SOFT_MASK_BUG_ON(regs, is_implicit_soft_masked(regs)); INT_SOFT_MASK_BUG_ON(regs, arch_irq_disabled_regs(regs) && search_kernel_restart_table(regs->nip)); diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index eca293794a1e..af62ec974b97 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -266,7 +266,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3, unsigned long ret = 0; bool is_not_scv = !IS_ENABLED(CONFIG_PPC_BOOK3S_64) || !scv; - CT_WARN_ON(ct_state() == CONTEXT_USER); + CT_WARN_ON(ct_state() == CT_STATE_USER); kuap_assert_locked(); @@ -344,7 +344,7 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs) BUG_ON(regs_is_unrecoverable(regs)); BUG_ON(arch_irq_disabled_regs(regs)); - CT_WARN_ON(ct_state() == CONTEXT_USER); + CT_WARN_ON(ct_state() == CT_STATE_USER); /* * We don't need to restore AMR on the way back to userspace for KUAP. @@ -386,7 +386,7 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs) if (!IS_ENABLED(CONFIG_PPC_BOOK3E_64) && TRAP(regs) != INTERRUPT_PROGRAM && TRAP(regs) != INTERRUPT_PERFMON) - CT_WARN_ON(ct_state() == CONTEXT_USER); + CT_WARN_ON(ct_state() == CT_STATE_USER); kuap = kuap_get_and_assert_locked(); diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c index f6f868e817e6..be159ad4b77b 100644 --- a/arch/powerpc/kernel/syscall.c +++ b/arch/powerpc/kernel/syscall.c @@ -27,7 +27,7 @@ notrace long system_call_exception(struct pt_regs *regs, unsigned long r0) trace_hardirqs_off(); /* finish reconciling */ - CT_WARN_ON(ct_state() == CONTEXT_KERNEL); + CT_WARN_ON(ct_state() == CT_STATE_KERNEL); user_exit_irqoff(); BUG_ON(regs_is_unrecoverable(regs)); diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 51cc9c7cb9bd..94941c5a10ac 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -150,7 +150,7 @@ early_param("ia32_emulation", ia32_emulation_override_cmdline); #endif /* - * Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. + * Invoke a 32-bit syscall. Called with IRQs on in CT_STATE_KERNEL. */ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr) { diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 6e76b9dba00e..28fcfa184903 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -26,26 +26,26 @@ extern void user_exit_callable(void); static inline void user_enter(void) { if (context_tracking_enabled()) - ct_user_enter(CONTEXT_USER); + ct_user_enter(CT_STATE_USER); } static inline void user_exit(void) { if (context_tracking_enabled()) - ct_user_exit(CONTEXT_USER); + ct_user_exit(CT_STATE_USER); } /* Called with interrupts disabled. */ static __always_inline void user_enter_irqoff(void) { if (context_tracking_enabled()) - __ct_user_enter(CONTEXT_USER); + __ct_user_enter(CT_STATE_USER); } static __always_inline void user_exit_irqoff(void) { if (context_tracking_enabled()) - __ct_user_exit(CONTEXT_USER); + __ct_user_exit(CT_STATE_USER); } static inline enum ctx_state exception_enter(void) @@ -57,7 +57,7 @@ static inline enum ctx_state exception_enter(void) return 0; prev_ctx = __ct_state(); - if (prev_ctx != CONTEXT_KERNEL) + if (prev_ctx != CT_STATE_KERNEL) ct_user_exit(prev_ctx); return prev_ctx; @@ -67,7 +67,7 @@ static inline void exception_exit(enum ctx_state prev_ctx) { if (!IS_ENABLED(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK) && context_tracking_enabled()) { - if (prev_ctx != CONTEXT_KERNEL) + if (prev_ctx != CT_STATE_KERNEL) ct_user_enter(prev_ctx); } } @@ -75,7 +75,7 @@ static inline void exception_exit(enum ctx_state prev_ctx) static __always_inline bool context_tracking_guest_enter(void) { if (context_tracking_enabled()) - __ct_user_enter(CONTEXT_GUEST); + __ct_user_enter(CT_STATE_GUEST); return context_tracking_enabled_this_cpu(); } @@ -83,7 +83,7 @@ static __always_inline bool context_tracking_guest_enter(void) static __always_inline void context_tracking_guest_exit(void) { if (context_tracking_enabled()) - __ct_user_exit(CONTEXT_GUEST); + __ct_user_exit(CT_STATE_GUEST); } #define CT_WARN_ON(cond) WARN_ON(context_tracking_enabled() && (cond)) diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index bbff5f7f8803..f1c53125edee 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -10,18 +10,18 @@ #define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1) enum ctx_state { - CONTEXT_DISABLED = -1, /* returned by ct_state() if unknown */ - CONTEXT_KERNEL = 0, - CONTEXT_IDLE = 1, - CONTEXT_USER = 2, - CONTEXT_GUEST = 3, - CONTEXT_MAX = 4, + CT_STATE_DISABLED = -1, /* returned by ct_state() if unknown */ + CT_STATE_KERNEL = 0, + CT_STATE_IDLE = 1, + CT_STATE_USER = 2, + CT_STATE_GUEST = 3, + CT_STATE_MAX = 4, }; /* Even value for idle, else odd. */ -#define RCU_DYNTICKS_IDX CONTEXT_MAX +#define RCU_DYNTICKS_IDX CT_STATE_MAX -#define CT_STATE_MASK (CONTEXT_MAX - 1) +#define CT_STATE_MASK (CT_STATE_MAX - 1) #define CT_DYNTICKS_MASK (~CT_STATE_MASK) struct context_tracking { @@ -123,14 +123,14 @@ static inline bool context_tracking_enabled_this_cpu(void) * * Returns the current cpu's context tracking state if context tracking * is enabled. If context tracking is disabled, returns - * CONTEXT_DISABLED. This should be used primarily for debugging. + * CT_STATE_DISABLED. This should be used primarily for debugging. */ static __always_inline int ct_state(void) { int ret; if (!context_tracking_enabled()) - return CONTEXT_DISABLED; + return CT_STATE_DISABLED; preempt_disable(); ret = __ct_state(); diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index b0fb775a600d..1e50cdb83ae5 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -108,7 +108,7 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs) arch_enter_from_user_mode(regs); lockdep_hardirqs_off(CALLER_ADDR0); - CT_WARN_ON(__ct_state() != CONTEXT_USER); + CT_WARN_ON(__ct_state() != CT_STATE_USER); user_exit_irqoff(); instrumentation_begin(); diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 24b1e1143260..4bb5751af994 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -317,7 +317,7 @@ void noinstr ct_nmi_enter(void) void noinstr ct_idle_enter(void) { WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled()); - ct_kernel_exit(false, RCU_DYNTICKS_IDX + CONTEXT_IDLE); + ct_kernel_exit(false, RCU_DYNTICKS_IDX + CT_STATE_IDLE); } EXPORT_SYMBOL_GPL(ct_idle_enter); @@ -335,7 +335,7 @@ void noinstr ct_idle_exit(void) unsigned long flags; raw_local_irq_save(flags); - ct_kernel_enter(false, RCU_DYNTICKS_IDX - CONTEXT_IDLE); + ct_kernel_enter(false, RCU_DYNTICKS_IDX - CT_STATE_IDLE); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(ct_idle_exit); @@ -485,7 +485,7 @@ void noinstr __ct_user_enter(enum ctx_state state) * user_exit() or ct_irq_enter(). Let's remove RCU's dependency * on the tick. */ - if (state == CONTEXT_USER) { + if (state == CT_STATE_USER) { instrumentation_begin(); trace_user_enter(0); vtime_user_enter(current); @@ -621,7 +621,7 @@ void noinstr __ct_user_exit(enum ctx_state state) * run a RCU read side critical section anytime. */ ct_kernel_enter(true, RCU_DYNTICKS_IDX - state); - if (state == CONTEXT_USER) { + if (state == CT_STATE_USER) { instrumentation_begin(); vtime_user_exit(current); trace_user_exit(0); @@ -634,12 +634,12 @@ void noinstr __ct_user_exit(enum ctx_state state) * In this we case we don't care about any concurrency/ordering. */ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) - raw_atomic_set(&ct->state, CONTEXT_KERNEL); + raw_atomic_set(&ct->state, CT_STATE_KERNEL); } else { if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) { /* Tracking for vtime only, no concurrent RCU EQS accounting */ - raw_atomic_set(&ct->state, CONTEXT_KERNEL); + raw_atomic_set(&ct->state, CT_STATE_KERNEL); } else { /* * Tracking for vtime and RCU EQS. Make sure we don't race diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 90843cc38588..5b6934e23c21 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -182,7 +182,7 @@ static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) unsigned long work = READ_ONCE(current_thread_info()->syscall_work); unsigned long nr = syscall_get_nr(current, regs); - CT_WARN_ON(ct_state() != CONTEXT_KERNEL); + CT_WARN_ON(ct_state() != CT_STATE_KERNEL); if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr)) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a9f655025607..93a845fe8449 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5762,7 +5762,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) preempt_count_set(PREEMPT_DISABLED); } rcu_sleep_check(); - SCHED_WARN_ON(ct_state() == CONTEXT_USER); + SCHED_WARN_ON(ct_state() == CT_STATE_USER); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -6658,7 +6658,7 @@ asmlinkage __visible void __sched schedule_user(void) * we find a better solution. * * NB: There are buggy callers of this function. Ideally we - * should warn if prev_state != CONTEXT_USER, but that will trigger + * should warn if prev_state != CT_STATE_USER, but that will trigger * too frequently to make sense yet. */ enum ctx_state prev_state = exception_enter(); From 4aa35e0db6d758e8f952ed30d7ac3117c376562c Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 25 Jul 2023 12:19:01 +0100 Subject: [PATCH 02/90] context_tracking, rcu: Rename RCU_DYNTICKS_IDX into CT_RCU_WATCHING The symbols relating to the CT_STATE part of context_tracking.state are now all prefixed with CT_STATE. The RCU dynticks counter part of that atomic variable still involves symbols with different prefixes, align them all to be prefixed with CT_RCU_WATCHING. Suggested-by: "Paul E. McKenney" Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Acked-by: Thomas Gleixner Signed-off-by: Neeraj Upadhyay --- include/linux/context_tracking.h | 6 +++--- include/linux/context_tracking_state.h | 12 ++++++------ kernel/context_tracking.c | 22 +++++++++++----------- kernel/rcu/tree.c | 12 ++++++------ 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 28fcfa184903..a6c36780cc3b 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -119,7 +119,7 @@ extern void ct_idle_exit(void); */ static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) { - return !(raw_atomic_read(this_cpu_ptr(&context_tracking.state)) & RCU_DYNTICKS_IDX); + return !(raw_atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_RCU_WATCHING); } /* @@ -142,7 +142,7 @@ static __always_inline bool warn_rcu_enter(void) preempt_disable_notrace(); if (rcu_dynticks_curr_cpu_in_eqs()) { ret = true; - ct_state_inc(RCU_DYNTICKS_IDX); + ct_state_inc(CT_RCU_WATCHING); } return ret; @@ -151,7 +151,7 @@ static __always_inline bool warn_rcu_enter(void) static __always_inline void warn_rcu_exit(bool rcu) { if (rcu) - ct_state_inc(RCU_DYNTICKS_IDX); + ct_state_inc(CT_RCU_WATCHING); preempt_enable_notrace(); } diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index f1c53125edee..94d6a935af3b 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -18,11 +18,11 @@ enum ctx_state { CT_STATE_MAX = 4, }; -/* Even value for idle, else odd. */ -#define RCU_DYNTICKS_IDX CT_STATE_MAX +/* Odd value for watching, else even. */ +#define CT_RCU_WATCHING CT_STATE_MAX #define CT_STATE_MASK (CT_STATE_MAX - 1) -#define CT_DYNTICKS_MASK (~CT_STATE_MASK) +#define CT_RCU_WATCHING_MASK (~CT_STATE_MASK) struct context_tracking { #ifdef CONFIG_CONTEXT_TRACKING_USER @@ -58,21 +58,21 @@ static __always_inline int __ct_state(void) #ifdef CONFIG_CONTEXT_TRACKING_IDLE static __always_inline int ct_dynticks(void) { - return atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_DYNTICKS_MASK; + return atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_RCU_WATCHING_MASK; } static __always_inline int ct_dynticks_cpu(int cpu) { struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); - return atomic_read(&ct->state) & CT_DYNTICKS_MASK; + return atomic_read(&ct->state) & CT_RCU_WATCHING_MASK; } static __always_inline int ct_dynticks_cpu_acquire(int cpu) { struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); - return atomic_read_acquire(&ct->state) & CT_DYNTICKS_MASK; + return atomic_read_acquire(&ct->state) & CT_RCU_WATCHING_MASK; } static __always_inline long ct_dynticks_nesting(void) diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 4bb5751af994..b2589bc59e18 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -31,7 +31,7 @@ DEFINE_PER_CPU(struct context_tracking, context_tracking) = { .dynticks_nesting = 1, .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, #endif - .state = ATOMIC_INIT(RCU_DYNTICKS_IDX), + .state = ATOMIC_INIT(CT_RCU_WATCHING), }; EXPORT_SYMBOL_GPL(context_tracking); @@ -90,7 +90,7 @@ static noinstr void ct_kernel_exit_state(int offset) rcu_dynticks_task_trace_enter(); // Before ->dynticks update! seq = ct_state_inc(offset); // RCU is no longer watching. Better be in extended quiescent state! - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICKS_IDX)); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & CT_RCU_WATCHING)); } /* @@ -110,7 +110,7 @@ static noinstr void ct_kernel_enter_state(int offset) seq = ct_state_inc(offset); // RCU is now watching. Better not be in an extended quiescent state! rcu_dynticks_task_trace_exit(); // After ->dynticks update! - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICKS_IDX)); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & CT_RCU_WATCHING)); } /* @@ -236,7 +236,7 @@ void noinstr ct_nmi_exit(void) instrumentation_end(); // RCU is watching here ... - ct_kernel_exit_state(RCU_DYNTICKS_IDX); + ct_kernel_exit_state(CT_RCU_WATCHING); // ... but is no longer watching here. if (!in_nmi()) @@ -277,7 +277,7 @@ void noinstr ct_nmi_enter(void) rcu_dynticks_task_exit(); // RCU is not watching here ... - ct_kernel_enter_state(RCU_DYNTICKS_IDX); + ct_kernel_enter_state(CT_RCU_WATCHING); // ... but is watching here. instrumentation_begin(); @@ -317,7 +317,7 @@ void noinstr ct_nmi_enter(void) void noinstr ct_idle_enter(void) { WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled()); - ct_kernel_exit(false, RCU_DYNTICKS_IDX + CT_STATE_IDLE); + ct_kernel_exit(false, CT_RCU_WATCHING + CT_STATE_IDLE); } EXPORT_SYMBOL_GPL(ct_idle_enter); @@ -335,7 +335,7 @@ void noinstr ct_idle_exit(void) unsigned long flags; raw_local_irq_save(flags); - ct_kernel_enter(false, RCU_DYNTICKS_IDX - CT_STATE_IDLE); + ct_kernel_enter(false, CT_RCU_WATCHING - CT_STATE_IDLE); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(ct_idle_exit); @@ -504,7 +504,7 @@ void noinstr __ct_user_enter(enum ctx_state state) * CPU doesn't need to maintain the tick for RCU maintenance purposes * when the CPU runs in userspace. */ - ct_kernel_exit(true, RCU_DYNTICKS_IDX + state); + ct_kernel_exit(true, CT_RCU_WATCHING + state); /* * Special case if we only track user <-> kernel transitions for tickless @@ -534,7 +534,7 @@ void noinstr __ct_user_enter(enum ctx_state state) /* * Tracking for vtime and RCU EQS. Make sure we don't race * with NMIs. OTOH we don't care about ordering here since - * RCU only requires RCU_DYNTICKS_IDX increments to be fully + * RCU only requires CT_RCU_WATCHING increments to be fully * ordered. */ raw_atomic_add(state, &ct->state); @@ -620,7 +620,7 @@ void noinstr __ct_user_exit(enum ctx_state state) * Exit RCU idle mode while entering the kernel because it can * run a RCU read side critical section anytime. */ - ct_kernel_enter(true, RCU_DYNTICKS_IDX - state); + ct_kernel_enter(true, CT_RCU_WATCHING - state); if (state == CT_STATE_USER) { instrumentation_begin(); vtime_user_exit(current); @@ -644,7 +644,7 @@ void noinstr __ct_user_exit(enum ctx_state state) /* * Tracking for vtime and RCU EQS. Make sure we don't race * with NMIs. OTOH we don't care about ordering here since - * RCU only requires RCU_DYNTICKS_IDX increments to be fully + * RCU only requires CT_RCU_WATCHING increments to be fully * ordered. */ raw_atomic_sub(state, &ct->state); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e641cc681901..04f87c44385c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -294,9 +294,9 @@ void rcu_softirq_qs(void) */ static void rcu_dynticks_eqs_online(void) { - if (ct_dynticks() & RCU_DYNTICKS_IDX) + if (ct_dynticks() & CT_RCU_WATCHING) return; - ct_state_inc(RCU_DYNTICKS_IDX); + ct_state_inc(CT_RCU_WATCHING); } /* @@ -305,7 +305,7 @@ static void rcu_dynticks_eqs_online(void) */ static bool rcu_dynticks_in_eqs(int snap) { - return !(snap & RCU_DYNTICKS_IDX); + return !(snap & CT_RCU_WATCHING); } /* @@ -335,7 +335,7 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) int snap; // If not quiescent, force back to earlier extended quiescent state. - snap = ct_dynticks_cpu(cpu) & ~RCU_DYNTICKS_IDX; + snap = ct_dynticks_cpu(cpu) & ~CT_RCU_WATCHING; smp_rmb(); // Order ->dynticks and *vp reads. if (READ_ONCE(*vp)) return false; // Non-zero, so report failure; @@ -361,9 +361,9 @@ notrace void rcu_momentary_dyntick_idle(void) int seq; raw_cpu_write(rcu_data.rcu_need_heavy_qs, false); - seq = ct_state_inc(2 * RCU_DYNTICKS_IDX); + seq = ct_state_inc(2 * CT_RCU_WATCHING); /* It is illegal to call this from idle state. */ - WARN_ON_ONCE(!(seq & RCU_DYNTICKS_IDX)); + WARN_ON_ONCE(!(seq & CT_RCU_WATCHING)); rcu_preempt_deferred_qs(current); } EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle); From a4a7921ec08d016f9d692752e2f82f66369c7ffa Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 16 Apr 2024 10:45:56 +0200 Subject: [PATCH 03/90] context_tracking, rcu: Rename ct_dynticks() into ct_rcu_watching() The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, reflect that change in the related helpers. Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- include/linux/context_tracking_state.h | 2 +- kernel/context_tracking.c | 10 +++++----- kernel/rcu/tree.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index 94d6a935af3b..cb90d8c17810 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -56,7 +56,7 @@ static __always_inline int __ct_state(void) #endif #ifdef CONFIG_CONTEXT_TRACKING_IDLE -static __always_inline int ct_dynticks(void) +static __always_inline int ct_rcu_watching(void) { return atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_RCU_WATCHING_MASK; } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index b2589bc59e18..868ae0bcd4be 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -137,7 +137,7 @@ static void noinstr ct_kernel_exit(bool user, int offset) instrumentation_begin(); lockdep_assert_irqs_disabled(); - trace_rcu_dyntick(TPS("Start"), ct_dynticks_nesting(), 0, ct_dynticks()); + trace_rcu_dyntick(TPS("Start"), ct_dynticks_nesting(), 0, ct_rcu_watching()); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); rcu_preempt_deferred_qs(current); @@ -182,7 +182,7 @@ static void noinstr ct_kernel_enter(bool user, int offset) // instrumentation for the noinstr ct_kernel_enter_state() instrument_atomic_write(&ct->state, sizeof(ct->state)); - trace_rcu_dyntick(TPS("End"), ct_dynticks_nesting(), 1, ct_dynticks()); + trace_rcu_dyntick(TPS("End"), ct_dynticks_nesting(), 1, ct_rcu_watching()); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); WRITE_ONCE(ct->dynticks_nesting, 1); WARN_ON_ONCE(ct_dynticks_nmi_nesting()); @@ -220,7 +220,7 @@ void noinstr ct_nmi_exit(void) */ if (ct_dynticks_nmi_nesting() != 1) { trace_rcu_dyntick(TPS("--="), ct_dynticks_nmi_nesting(), ct_dynticks_nmi_nesting() - 2, - ct_dynticks()); + ct_rcu_watching()); WRITE_ONCE(ct->dynticks_nmi_nesting, /* No store tearing. */ ct_dynticks_nmi_nesting() - 2); instrumentation_end(); @@ -228,7 +228,7 @@ void noinstr ct_nmi_exit(void) } /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ - trace_rcu_dyntick(TPS("Startirq"), ct_dynticks_nmi_nesting(), 0, ct_dynticks()); + trace_rcu_dyntick(TPS("Startirq"), ct_dynticks_nmi_nesting(), 0, ct_rcu_watching()); WRITE_ONCE(ct->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ // instrumentation for the noinstr ct_kernel_exit_state() @@ -296,7 +296,7 @@ void noinstr ct_nmi_enter(void) trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), ct_dynticks_nmi_nesting(), - ct_dynticks_nmi_nesting() + incby, ct_dynticks()); + ct_dynticks_nmi_nesting() + incby, ct_rcu_watching()); instrumentation_end(); WRITE_ONCE(ct->dynticks_nmi_nesting, /* Prevent store tearing. */ ct_dynticks_nmi_nesting() + incby); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 04f87c44385c..3a2f0325aa43 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -294,7 +294,7 @@ void rcu_softirq_qs(void) */ static void rcu_dynticks_eqs_online(void) { - if (ct_dynticks() & CT_RCU_WATCHING) + if (ct_rcu_watching() & CT_RCU_WATCHING) return; ct_state_inc(CT_RCU_WATCHING); } From a9fde9d1a5dd42edadf61cf4e64aa234b4c5bd3f Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 16 Apr 2024 10:47:14 +0200 Subject: [PATCH 04/90] context_tracking, rcu: Rename ct_dynticks_cpu() into ct_rcu_watching_cpu() The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, reflect that change in the related helpers. Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- include/linux/context_tracking_state.h | 2 +- kernel/rcu/tree.c | 10 +++++----- kernel/rcu/tree_stall.h | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index cb90d8c17810..ad5a06a42b4a 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -61,7 +61,7 @@ static __always_inline int ct_rcu_watching(void) return atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_RCU_WATCHING_MASK; } -static __always_inline int ct_dynticks_cpu(int cpu) +static __always_inline int ct_rcu_watching_cpu(int cpu) { struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3a2f0325aa43..e7f612e9f7e5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -335,14 +335,14 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) int snap; // If not quiescent, force back to earlier extended quiescent state. - snap = ct_dynticks_cpu(cpu) & ~CT_RCU_WATCHING; - smp_rmb(); // Order ->dynticks and *vp reads. + snap = ct_rcu_watching_cpu(cpu) & ~CT_RCU_WATCHING; + smp_rmb(); // Order CT state and *vp reads. if (READ_ONCE(*vp)) return false; // Non-zero, so report failure; - smp_rmb(); // Order *vp read and ->dynticks re-read. + smp_rmb(); // Order *vp read and CT state re-read. // If still in the same extended quiescent state, we are good! - return snap == ct_dynticks_cpu(cpu); + return snap == ct_rcu_watching_cpu(cpu); } /* @@ -4805,7 +4805,7 @@ rcu_boot_init_percpu_data(int cpu) rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); INIT_WORK(&rdp->strict_work, strict_work_handler); WARN_ON_ONCE(ct->dynticks_nesting != 1); - WARN_ON_ONCE(rcu_dynticks_in_eqs(ct_dynticks_cpu(cpu))); + WARN_ON_ONCE(rcu_dynticks_in_eqs(ct_rcu_watching_cpu(cpu))); rdp->barrier_seq_snap = rcu_state.barrier_sequence; rdp->rcu_ofl_gp_seq = rcu_state.gp_seq; rdp->rcu_ofl_gp_state = RCU_GP_CLEANED; diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 4b0e9d7c4c68..d65974448e81 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -501,7 +501,7 @@ static void print_cpu_stall_info(int cpu) } delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); falsepositive = rcu_is_gp_kthread_starving(NULL) && - rcu_dynticks_in_eqs(ct_dynticks_cpu(cpu)); + rcu_dynticks_in_eqs(ct_rcu_watching_cpu(cpu)); rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j); if (rcuc_starved) // Print signed value, as negative values indicate a probable bug. @@ -515,7 +515,7 @@ static void print_cpu_stall_info(int cpu) rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : "!."[!delta], ticks_value, ticks_title, - ct_dynticks_cpu(cpu) & 0xffff, + ct_rcu_watching_cpu(cpu) & 0xffff, ct_dynticks_nesting_cpu(cpu), ct_dynticks_nmi_nesting_cpu(cpu), rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, From 125716c393889f9035567d4d78da239483f63ece Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 16 Apr 2024 14:47:12 +0200 Subject: [PATCH 05/90] context_tracking, rcu: Rename ct_dynticks_cpu_acquire() into ct_rcu_watching_cpu_acquire() The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, reflect that change in the related helpers. Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- .../RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst | 2 +- include/linux/context_tracking_state.h | 2 +- kernel/rcu/tree.c | 4 ++-- kernel/rcu/tree_exp.h | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst index 728b1e690c64..2d7036ad7476 100644 --- a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst +++ b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst @@ -149,7 +149,7 @@ This case is handled by calls to the strongly ordered ``atomic_add_return()`` read-modify-write atomic operation that is invoked within ``rcu_dynticks_eqs_enter()`` at idle-entry time and within ``rcu_dynticks_eqs_exit()`` at idle-exit time. -The grace-period kthread invokes first ``ct_dynticks_cpu_acquire()`` +The grace-period kthread invokes first ``ct_rcu_watching_cpu_acquire()`` (preceded by a full memory barrier) and ``rcu_dynticks_in_eqs_since()`` (both of which rely on acquire semantics) to detect idle CPUs. diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index ad5a06a42b4a..ad6570ffeff3 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -68,7 +68,7 @@ static __always_inline int ct_rcu_watching_cpu(int cpu) return atomic_read(&ct->state) & CT_RCU_WATCHING_MASK; } -static __always_inline int ct_dynticks_cpu_acquire(int cpu) +static __always_inline int ct_rcu_watching_cpu_acquire(int cpu) { struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e7f612e9f7e5..45a9f3667c2a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -323,7 +323,7 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap) * performed by the remote CPU prior to entering idle and therefore can * rely solely on acquire semantics. */ - return snap != ct_dynticks_cpu_acquire(rdp->cpu); + return snap != ct_rcu_watching_cpu_acquire(rdp->cpu); } /* @@ -782,7 +782,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) * Ordering between remote CPU's pre idle accesses and post grace period * updater's accesses is enforced by the below acquire semantic. */ - rdp->dynticks_snap = ct_dynticks_cpu_acquire(rdp->cpu); + rdp->dynticks_snap = ct_rcu_watching_cpu_acquire(rdp->cpu); if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti")); rcu_gpnum_ovf(rdp->mynode, rdp); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 4acd29d16fdb..daa87fec703f 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -376,7 +376,7 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp) * post grace period updater's accesses is enforced by the * below acquire semantic. */ - snap = ct_dynticks_cpu_acquire(cpu); + snap = ct_rcu_watching_cpu_acquire(cpu); if (rcu_dynticks_in_eqs(snap)) mask_ofl_test |= mask; else From 7aeba709a048d870c15940af8b620b16281c3b9e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 May 2024 15:45:42 +0200 Subject: [PATCH 06/90] rcu/nocb: Introduce RCU_NOCB_LOCKDEP_WARN() Checking for races against concurrent (de-)offloading implies the creation of !CONFIG_RCU_NOCB_CPU stubs to check if each relevant lock is held. For now this only implies the nocb_lock but more are to be expected. Create instead a NOCB specific version of RCU_LOCKDEP_WARN() to avoid the proliferation of stubs. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- include/linux/rcupdate.h | 7 +++++++ kernel/rcu/tree_nocb.h | 14 -------------- kernel/rcu/tree_plugin.h | 4 ++-- 3 files changed, 9 insertions(+), 16 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 13f6f00aecf9..d48d3c237305 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -144,11 +144,18 @@ void rcu_init_nohz(void); int rcu_nocb_cpu_offload(int cpu); int rcu_nocb_cpu_deoffload(int cpu); void rcu_nocb_flush_deferred_wakeup(void); + +#define RCU_NOCB_LOCKDEP_WARN(c, s) RCU_LOCKDEP_WARN(c, s) + #else /* #ifdef CONFIG_RCU_NOCB_CPU */ + static inline void rcu_init_nohz(void) { } static inline int rcu_nocb_cpu_offload(int cpu) { return -EINVAL; } static inline int rcu_nocb_cpu_deoffload(int cpu) { return 0; } static inline void rcu_nocb_flush_deferred_wakeup(void) { } + +#define RCU_NOCB_LOCKDEP_WARN(c, s) + #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ /* diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 3ce30841119a..f4112fc663a7 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -16,10 +16,6 @@ #ifdef CONFIG_RCU_NOCB_CPU static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ -static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) -{ - return lockdep_is_held(&rdp->nocb_lock); -} static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) { @@ -1653,16 +1649,6 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) #else /* #ifdef CONFIG_RCU_NOCB_CPU */ -static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) -{ - return 0; -} - -static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) -{ - return false; -} - /* No ->nocb_lock to acquire. */ static void rcu_nocb_lock(struct rcu_data *rdp) { diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c569da65b421..f752b2a1d887 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -24,10 +24,10 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp) * timers have their own means of synchronization against the * offloaded state updaters. */ - RCU_LOCKDEP_WARN( + RCU_NOCB_LOCKDEP_WARN( !(lockdep_is_held(&rcu_state.barrier_mutex) || (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) || - rcu_lockdep_is_held_nocb(rdp) || + lockdep_is_held(&rdp->nocb_lock) || (!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) && rdp == this_cpu_ptr(&rcu_data)) || rcu_current_is_nocb_kthread(rdp)), From ff81428ede8a290fc5fd85135e39be1065ae6176 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 May 2024 15:45:43 +0200 Subject: [PATCH 07/90] rcu/nocb: Move nocb field at the end of state struct nocb_is_setup is a rarely used field, mostly on boot and CPU hotplug. It shouldn't occupy the middle of the rcu state hot fields cacheline. Move it to the end and build it conditionally while at it. More cold NOCB fields are to come. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index fcf2b4aa3441..a297dc89a09c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -411,7 +411,6 @@ struct rcu_state { arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp; /* Synchronize offline with */ /* GP pre-initialization. */ - int nocb_is_setup; /* nocb is setup from boot */ /* synchronize_rcu() part. */ struct llist_head srs_next; /* request a GP users. */ @@ -420,6 +419,10 @@ struct rcu_state { struct sr_wait_node srs_wait_nodes[SR_NORMAL_GP_WAIT_HEAD_MAX]; struct work_struct srs_cleanup_work; atomic_t srs_cleanups_pending; /* srs inflight worker cleanups. */ + +#ifdef CONFIG_RCU_NOCB_CPU + int nocb_is_setup; /* nocb is setup from boot */ +#endif }; /* Values for rcu_state structure's gp_flags field. */ From 7be88a857eb84d2e0677690b81ee423dee51c93d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 May 2024 15:45:44 +0200 Subject: [PATCH 08/90] rcu/nocb: Assert no callbacks while nocb kthread allocation fails When a NOCB CPU fails to create a nocb kthread on bringup, the CPU is then deoffloaded. The barrier mutex is locked at this stage. It is typically used to protect against concurrent (de-)offloading and/or concurrent rcu_barrier() that would otherwise risk a nocb locking imbalance. However: * rcu_barrier() can't run concurrently if it's the boot CPU on early boot-up. * rcu_barrier() can run concurrently if it's a secondary CPU but it is expected to see 0 callbacks on this target because it's the first time it boots. * (de-)offloading can't happen concurrently with smp_init(), as rcutorture is initialized later, at least not before device_initcall(), and userspace isn't available yet. * (de-)offloading can't happen concurrently with cpu_up(), courtesy of cpu_hotplug_lock. But: * The lazy shrinker might run concurrently with cpu_up(). It shouldn't try to grab the nocb_lock and risk an imbalance due to lazy_len supposed to be 0 but be extra cautious. * Also be cautious against resume from hibernation potential subtleties. So keep the locking and add some assertions and comments. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree_nocb.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index f4112fc663a7..fdd0616f2fd1 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1442,7 +1442,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) "rcuog/%d", rdp_gp->cpu); if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) { mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); - goto end; + goto err; } WRITE_ONCE(rdp_gp->nocb_gp_kthread, t); if (kthread_prio) @@ -1454,7 +1454,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) t = kthread_create(rcu_nocb_cb_kthread, rdp, "rcuo%c/%d", rcu_state.abbr, cpu); if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__)) - goto end; + goto err; if (rcu_rdp_is_offloaded(rdp)) wake_up_process(t); @@ -1467,7 +1467,15 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) WRITE_ONCE(rdp->nocb_cb_kthread, t); WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); return; -end: + +err: + /* + * No need to protect against concurrent rcu_barrier() + * because the number of callbacks should be 0 for a non-boot CPU, + * therefore rcu_barrier() shouldn't even try to grab the nocb_lock. + * But hold barrier_mutex to avoid nocb_lock imbalance from shrinker. + */ + WARN_ON_ONCE(system_state > SYSTEM_BOOTING && rcu_segcblist_n_cbs(&rdp->cblist)); mutex_lock(&rcu_state.barrier_mutex); if (rcu_rdp_is_offloaded(rdp)) { rcu_nocb_rdp_deoffload(rdp); From 7121dd915a262aee1f5a88cf86b1543f3a9439d7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 May 2024 15:45:45 +0200 Subject: [PATCH 09/90] rcu/nocb: Introduce nocb mutex The barrier_mutex is used currently to protect (de-)offloading operations and prevent from nocb_lock locking imbalance in rcu_barrier() and shrinker, and also from misordered RCU barrier invocation. Now since RCU (de-)offloading is going to happen on offline CPUs, an RCU barrier will have to be executed while transitionning from offloaded to de-offloaded state. And this can't happen while holding the barrier_mutex. Introduce a NOCB mutex to protect (de-)offloading transitions. The barrier_mutex is still held for now when necessary to avoid barrier callbacks reordering and nocb_lock imbalance. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree.c | 3 +++ kernel/rcu/tree.h | 1 + kernel/rcu/tree_nocb.h | 20 ++++++++++++-------- kernel/rcu/tree_plugin.h | 1 + 4 files changed, 17 insertions(+), 8 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e641cc681901..2b9e713854b0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -97,6 +97,9 @@ static struct rcu_state rcu_state = { .srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work, rcu_sr_normal_gp_cleanup_work), .srs_cleanups_pending = ATOMIC_INIT(0), +#ifdef CONFIG_RCU_NOCB_CPU + .nocb_mutex = __MUTEX_INITIALIZER(rcu_state.nocb_mutex), +#endif }; /* Dump rcu_node combining tree at boot to verify correct setup. */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index a297dc89a09c..16e6fe63d93c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -421,6 +421,7 @@ struct rcu_state { atomic_t srs_cleanups_pending; /* srs inflight worker cleanups. */ #ifdef CONFIG_RCU_NOCB_CPU + struct mutex nocb_mutex; /* Guards (de-)offloading */ int nocb_is_setup; /* nocb is setup from boot */ #endif }; diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index fdd0616f2fd1..16bcb8b13a5e 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1141,6 +1141,7 @@ int rcu_nocb_cpu_deoffload(int cpu) int ret = 0; cpus_read_lock(); + mutex_lock(&rcu_state.nocb_mutex); mutex_lock(&rcu_state.barrier_mutex); if (rcu_rdp_is_offloaded(rdp)) { if (cpu_online(cpu)) { @@ -1153,6 +1154,7 @@ int rcu_nocb_cpu_deoffload(int cpu) } } mutex_unlock(&rcu_state.barrier_mutex); + mutex_unlock(&rcu_state.nocb_mutex); cpus_read_unlock(); return ret; @@ -1228,6 +1230,7 @@ int rcu_nocb_cpu_offload(int cpu) int ret = 0; cpus_read_lock(); + mutex_lock(&rcu_state.nocb_mutex); mutex_lock(&rcu_state.barrier_mutex); if (!rcu_rdp_is_offloaded(rdp)) { if (cpu_online(cpu)) { @@ -1240,6 +1243,7 @@ int rcu_nocb_cpu_offload(int cpu) } } mutex_unlock(&rcu_state.barrier_mutex); + mutex_unlock(&rcu_state.nocb_mutex); cpus_read_unlock(); return ret; @@ -1257,7 +1261,7 @@ lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) return 0; /* Protect rcu_nocb_mask against concurrent (de-)offloading. */ - if (!mutex_trylock(&rcu_state.barrier_mutex)) + if (!mutex_trylock(&rcu_state.nocb_mutex)) return 0; /* Snapshot count of all CPUs */ @@ -1267,7 +1271,7 @@ lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) count += READ_ONCE(rdp->lazy_len); } - mutex_unlock(&rcu_state.barrier_mutex); + mutex_unlock(&rcu_state.nocb_mutex); return count ? count : SHRINK_EMPTY; } @@ -1285,9 +1289,9 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) * Protect against concurrent (de-)offloading. Otherwise nocb locking * may be ignored or imbalanced. */ - if (!mutex_trylock(&rcu_state.barrier_mutex)) { + if (!mutex_trylock(&rcu_state.nocb_mutex)) { /* - * But really don't insist if barrier_mutex is contended since we + * But really don't insist if nocb_mutex is contended since we * can't guarantee that it will never engage in a dependency * chain involving memory allocation. The lock is seldom contended * anyway. @@ -1326,7 +1330,7 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) break; } - mutex_unlock(&rcu_state.barrier_mutex); + mutex_unlock(&rcu_state.nocb_mutex); return count ? count : SHRINK_STOP; } @@ -1473,15 +1477,15 @@ err: * No need to protect against concurrent rcu_barrier() * because the number of callbacks should be 0 for a non-boot CPU, * therefore rcu_barrier() shouldn't even try to grab the nocb_lock. - * But hold barrier_mutex to avoid nocb_lock imbalance from shrinker. + * But hold nocb_mutex to avoid nocb_lock imbalance from shrinker. */ WARN_ON_ONCE(system_state > SYSTEM_BOOTING && rcu_segcblist_n_cbs(&rdp->cblist)); - mutex_lock(&rcu_state.barrier_mutex); + mutex_lock(&rcu_state.nocb_mutex); if (rcu_rdp_is_offloaded(rdp)) { rcu_nocb_rdp_deoffload(rdp); cpumask_clear_cpu(cpu, rcu_nocb_mask); } - mutex_unlock(&rcu_state.barrier_mutex); + mutex_unlock(&rcu_state.nocb_mutex); } /* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f752b2a1d887..c662376c8af0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -28,6 +28,7 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp) !(lockdep_is_held(&rcu_state.barrier_mutex) || (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) || lockdep_is_held(&rdp->nocb_lock) || + lockdep_is_held(&rcu_state.nocb_mutex) || (!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) && rdp == this_cpu_ptr(&rcu_data)) || rcu_current_is_nocb_kthread(rdp)), From 4e26ce423116e9f48f73723cc33add295e56ad31 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 May 2024 15:45:46 +0200 Subject: [PATCH 10/90] rcu/nocb: (De-)offload callbacks on offline CPUs only Currently callbacks can be (de-)offloaded only on online CPUs. This involves an overly elaborated state machine in order to make sure that callbacks are always handled during the process while ensuring synchronization between rcu_core and NOCB kthreads. The only potential user of NOCB (de-)offloading appears to be a nohz_full toggling interface through cpusets. And the general agreement is now to work toward toggling the nohz_full state on offline CPUs to simplify the whole picture. Therefore, convert the (de-)offloading to only support offline CPUs. This involves the following changes: * Call rcu_barrier() before deoffloading. An offline offloaded CPU may still carry callbacks in its queue ignored by rcutree_migrate_callbacks(). Those callbacks must all be flushed before switching to a regular queue because no more kthreads will handle those before the CPU ever gets re-onlined. This means that further calls to rcu_barrier() will find an empty queue until the CPU goes through rcutree_report_cpu_starting(). As a result it is guaranteed that further rcu_barrier() won't try to lock the nocb_lock for that target and thus won't risk an imbalance. Therefore barrier_mutex doesn't need to be locked anymore upon deoffloading. * Assume the queue is empty before offloading, as rcutree_migrate_callbacks() took care of everything. This means that further calls to rcu_barrier() will find an empty queue until the CPU goes through rcutree_report_cpu_starting(). As a result it is guaranteed that further rcu_barrier() won't risk a nocb_lock imbalance. Therefore barrier_mutex doesn't need to be locked anymore upon offloading. * No need to flush bypass anymore. Further simplifications will follow in upcoming patches. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree_nocb.h | 82 +++++++++++------------------------------- 1 file changed, 21 insertions(+), 61 deletions(-) diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 16bcb8b13a5e..8e766396df3a 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1049,43 +1049,26 @@ static int rdp_offload_toggle(struct rcu_data *rdp, return wake_gp; } -static long rcu_nocb_rdp_deoffload(void *arg) +static int rcu_nocb_rdp_deoffload(struct rcu_data *rdp) { - struct rcu_data *rdp = arg; struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; int wake_gp; struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; - /* - * rcu_nocb_rdp_deoffload() may be called directly if - * rcuog/o[p] spawn failed, because at this time the rdp->cpu - * is not online yet. - */ - WARN_ON_ONCE((rdp->cpu != raw_smp_processor_id()) && cpu_online(rdp->cpu)); + /* CPU must be offline, unless it's early boot */ + WARN_ON_ONCE(cpu_online(rdp->cpu) && rdp->cpu != raw_smp_processor_id()); pr_info("De-offloading %d\n", rdp->cpu); + /* Flush all callbacks from segcblist and bypass */ + rcu_barrier(); + rcu_nocb_lock_irqsave(rdp, flags); - /* - * Flush once and for all now. This suffices because we are - * running on the target CPU holding ->nocb_lock (thus having - * interrupts disabled), and because rdp_offload_toggle() - * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED. - * Thus future calls to rcu_segcblist_completely_offloaded() will - * return false, which means that future calls to rcu_nocb_try_bypass() - * will refuse to put anything into the bypass. - */ - WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false)); - /* - * Start with invoking rcu_core() early. This way if the current thread - * happens to preempt an ongoing call to rcu_core() in the middle, - * leaving some work dismissed because rcu_core() still thinks the rdp is - * completely offloaded, we are guaranteed a nearby future instance of - * rcu_core() to catch up. - */ + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist)); + rcu_segcblist_set_flags(cblist, SEGCBLIST_RCU_CORE); - invoke_rcu_core(); wake_gp = rdp_offload_toggle(rdp, false, flags); mutex_lock(&rdp_gp->nocb_gp_kthread_mutex); @@ -1128,10 +1111,6 @@ static long rcu_nocb_rdp_deoffload(void *arg) */ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - /* Sanity check */ - WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); - - return 0; } @@ -1142,18 +1121,16 @@ int rcu_nocb_cpu_deoffload(int cpu) cpus_read_lock(); mutex_lock(&rcu_state.nocb_mutex); - mutex_lock(&rcu_state.barrier_mutex); if (rcu_rdp_is_offloaded(rdp)) { - if (cpu_online(cpu)) { - ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp); + if (!cpu_online(cpu)) { + ret = rcu_nocb_rdp_deoffload(rdp); if (!ret) cpumask_clear_cpu(cpu, rcu_nocb_mask); } else { - pr_info("NOCB: Cannot CB-deoffload offline CPU %d\n", rdp->cpu); + pr_info("NOCB: Cannot CB-deoffload online CPU %d\n", rdp->cpu); ret = -EINVAL; } } - mutex_unlock(&rcu_state.barrier_mutex); mutex_unlock(&rcu_state.nocb_mutex); cpus_read_unlock(); @@ -1161,15 +1138,14 @@ int rcu_nocb_cpu_deoffload(int cpu) } EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload); -static long rcu_nocb_rdp_offload(void *arg) +static int rcu_nocb_rdp_offload(struct rcu_data *rdp) { - struct rcu_data *rdp = arg; struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; int wake_gp; struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; - WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); + WARN_ON_ONCE(cpu_online(rdp->cpu)); /* * For now we only support re-offload, ie: the rdp must have been * offloaded on boot first. @@ -1182,28 +1158,15 @@ static long rcu_nocb_rdp_offload(void *arg) pr_info("Offloading %d\n", rdp->cpu); + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist)); + /* * Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING * is set. */ raw_spin_lock_irqsave(&rdp->nocb_lock, flags); - /* - * We didn't take the nocb lock while working on the - * rdp->cblist with SEGCBLIST_LOCKING cleared (pure softirq/rcuc mode). - * Every modifications that have been done previously on - * rdp->cblist must be visible remotely by the nocb kthreads - * upon wake up after reading the cblist flags. - * - * The layout against nocb_lock enforces that ordering: - * - * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait() - * ------------------------- ---------------------------- - * WRITE callbacks rcu_nocb_lock() - * rcu_nocb_lock() READ flags - * WRITE flags READ callbacks - * rcu_nocb_unlock() rcu_nocb_unlock() - */ wake_gp = rdp_offload_toggle(rdp, true, flags); if (wake_gp) wake_up_process(rdp_gp->nocb_gp_kthread); @@ -1214,8 +1177,7 @@ static long rcu_nocb_rdp_offload(void *arg) rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); /* - * All kthreads are ready to work, we can finally relieve rcu_core() and - * enable nocb bypass. + * All kthreads are ready to work, we can finally enable nocb bypass. */ rcu_nocb_lock_irqsave(rdp, flags); rcu_segcblist_clear_flags(cblist, SEGCBLIST_RCU_CORE); @@ -1231,18 +1193,16 @@ int rcu_nocb_cpu_offload(int cpu) cpus_read_lock(); mutex_lock(&rcu_state.nocb_mutex); - mutex_lock(&rcu_state.barrier_mutex); if (!rcu_rdp_is_offloaded(rdp)) { - if (cpu_online(cpu)) { - ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp); + if (!cpu_online(cpu)) { + ret = rcu_nocb_rdp_offload(rdp); if (!ret) cpumask_set_cpu(cpu, rcu_nocb_mask); } else { - pr_info("NOCB: Cannot CB-offload offline CPU %d\n", rdp->cpu); + pr_info("NOCB: Cannot CB-offload online CPU %d\n", rdp->cpu); ret = -EINVAL; } } - mutex_unlock(&rcu_state.barrier_mutex); mutex_unlock(&rcu_state.nocb_mutex); cpus_read_unlock(); From d2e7f55ff2e3d878a63149c90b360cbec101a83e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 May 2024 15:45:47 +0200 Subject: [PATCH 11/90] rcu/nocb: Remove halfway (de-)offloading handling from bypass Bypass enqueue can't happen anymore in the middle of (de-)offloading since this sort of transition now only applies to offline CPUs. The related safety check can therefore be removed. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree_nocb.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 8e766396df3a..af44e75eb0cd 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -409,14 +409,6 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, return false; } - // In the process of (de-)offloading: no bypassing, but - // locking. - if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) { - rcu_nocb_lock(rdp); - *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); - return false; /* Not offloaded, no bypassing. */ - } - // Don't use ->nocb_bypass during early boot. if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) { rcu_nocb_lock(rdp); From 5a4f9059a8f46fc1fbbae74781a5f7c8d74c732b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 May 2024 15:45:48 +0200 Subject: [PATCH 12/90] rcu/nocb: Remove halfway (de-)offloading handling from rcu_core()'s QS reporting RCU core can't be running anymore while in the middle of (de-)offloading since this sort of transition now only applies to offline CPUs. The locked callback acceleration handling during the transition can therefore be removed. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2b9e713854b0..60f271f5c079 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2386,7 +2386,6 @@ rcu_report_qs_rdp(struct rcu_data *rdp) { unsigned long flags; unsigned long mask; - bool needacc = false; struct rcu_node *rnp; WARN_ON_ONCE(rdp->cpu != smp_processor_id()); @@ -2423,23 +2422,11 @@ rcu_report_qs_rdp(struct rcu_data *rdp) * to return true. So complain, but don't awaken. */ WARN_ON_ONCE(rcu_accelerate_cbs(rnp, rdp)); - } else if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) { - /* - * ...but NOCB kthreads may miss or delay callbacks acceleration - * if in the middle of a (de-)offloading process. - */ - needacc = true; } rcu_disable_urgency_upon_qs(rdp); rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); /* ^^^ Released rnp->lock */ - - if (needacc) { - rcu_nocb_lock_irqsave(rdp, flags); - rcu_accelerate_cbs_unlocked(rnp, rdp); - rcu_nocb_unlock_irqrestore(rdp, flags); - } } } From df7c249a0ed464b3f690c6f04b8cbc0fdbf30afc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 May 2024 15:45:49 +0200 Subject: [PATCH 13/90] rcu/nocb: Remove halfway (de-)offloading handling from rcu_core RCU core can't be running anymore while in the middle of (de-)offloading since this sort of transition now only applies to offline CPUs. The locked callback acceleration handling during the transition can therefore be removed, along with concurrent batch execution. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree.c | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 60f271f5c079..1a272c678533 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2781,24 +2781,6 @@ static __latent_entropy void rcu_core(void) unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; - /* - * On RT rcu_core() can be preempted when IRQs aren't disabled. - * Therefore this function can race with concurrent NOCB (de-)offloading - * on this CPU and the below condition must be considered volatile. - * However if we race with: - * - * _ Offloading: In the worst case we accelerate or process callbacks - * concurrently with NOCB kthreads. We are guaranteed to - * call rcu_nocb_lock() if that happens. - * - * _ Deoffloading: In the worst case we miss callbacks acceleration or - * processing. This is fine because the early stage - * of deoffloading invokes rcu_core() after setting - * SEGCBLIST_RCU_CORE. So we guarantee that we'll process - * what could have been dismissed without the need to wait - * for the next rcu_pending() check in the next jiffy. - */ - const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist); if (cpu_is_offline(smp_processor_id())) return; @@ -2818,17 +2800,17 @@ static __latent_entropy void rcu_core(void) /* No grace period and unregistered callbacks? */ if (!rcu_gp_in_progress() && - rcu_segcblist_is_enabled(&rdp->cblist) && do_batch) { - rcu_nocb_lock_irqsave(rdp, flags); + rcu_segcblist_is_enabled(&rdp->cblist) && !rcu_rdp_is_offloaded(rdp)) { + local_irq_save(flags); if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) rcu_accelerate_cbs_unlocked(rnp, rdp); - rcu_nocb_unlock_irqrestore(rdp, flags); + local_irq_restore(flags); } rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); /* If there are callbacks ready, invoke them. */ - if (do_batch && rcu_segcblist_ready_cbs(&rdp->cblist) && + if (!rcu_rdp_is_offloaded(rdp) && rcu_segcblist_ready_cbs(&rdp->cblist) && likely(READ_ONCE(rcu_scheduler_fully_active))) { rcu_do_batch(rdp); /* Re-invoke RCU core processing if there are callbacks remaining. */ From bae6076ebbd14cc1f1bd4de65c2db21d3ab109d7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 May 2024 15:45:50 +0200 Subject: [PATCH 14/90] rcu/nocb: Remove SEGCBLIST_RCU_CORE RCU core can't be running anymore while in the middle of (de-)offloading since this sort of transition now only applies to offline CPUs. The SEGCBLIST_RCU_CORE state can therefore be removed. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- include/linux/rcu_segcblist.h | 9 ++++----- kernel/rcu/rcu_segcblist.h | 9 --------- kernel/rcu/tree.c | 3 --- kernel/rcu/tree_nocb.h | 9 --------- 4 files changed, 4 insertions(+), 26 deletions(-) diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index ba95c06675e1..5469c54cd778 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -185,11 +185,10 @@ struct rcu_cblist { * ---------------------------------------------------------------------------- */ #define SEGCBLIST_ENABLED BIT(0) -#define SEGCBLIST_RCU_CORE BIT(1) -#define SEGCBLIST_LOCKING BIT(2) -#define SEGCBLIST_KTHREAD_CB BIT(3) -#define SEGCBLIST_KTHREAD_GP BIT(4) -#define SEGCBLIST_OFFLOADED BIT(5) +#define SEGCBLIST_LOCKING BIT(1) +#define SEGCBLIST_KTHREAD_CB BIT(2) +#define SEGCBLIST_KTHREAD_GP BIT(3) +#define SEGCBLIST_OFFLOADED BIT(4) struct rcu_segcblist { struct rcu_head *head; diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 4fe877f5f654..7a0962dfee86 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -95,15 +95,6 @@ static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) return false; } -static inline bool rcu_segcblist_completely_offloaded(struct rcu_segcblist *rsclp) -{ - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - !rcu_segcblist_test_flags(rsclp, SEGCBLIST_RCU_CORE)) - return true; - - return false; -} - /* * Are all segments following the specified segment of the specified * rcu_segcblist structure empty of callbacks? (The specified diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1a272c678533..82e831b969e4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -79,9 +79,6 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *); static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .gpwrap = true, -#ifdef CONFIG_RCU_NOCB_CPU - .cblist.flags = SEGCBLIST_RCU_CORE, -#endif }; static struct rcu_state rcu_state = { .level = { &rcu_state.node[0] }, diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index af44e75eb0cd..24daf606de0c 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1060,7 +1060,6 @@ static int rcu_nocb_rdp_deoffload(struct rcu_data *rdp) WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist)); - rcu_segcblist_set_flags(cblist, SEGCBLIST_RCU_CORE); wake_gp = rdp_offload_toggle(rdp, false, flags); mutex_lock(&rdp_gp->nocb_gp_kthread_mutex); @@ -1168,13 +1167,6 @@ static int rcu_nocb_rdp_offload(struct rcu_data *rdp) swait_event_exclusive(rdp->nocb_state_wq, rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); - /* - * All kthreads are ready to work, we can finally enable nocb bypass. - */ - rcu_nocb_lock_irqsave(rdp, flags); - rcu_segcblist_clear_flags(cblist, SEGCBLIST_RCU_CORE); - rcu_nocb_unlock_irqrestore(rdp, flags); - return 0; } @@ -1350,7 +1342,6 @@ void __init rcu_init_nohz(void) rcu_segcblist_init(&rdp->cblist); rcu_segcblist_offload(&rdp->cblist, true); rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); - rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_RCU_CORE); } rcu_organize_nocb_kthreads(); } From 91e43b9044a4f9b6976dc28b3f1446b733cc68de Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 May 2024 15:45:51 +0200 Subject: [PATCH 15/90] rcu/nocb: Remove SEGCBLIST_KTHREAD_CB This state excerpt from the (de-)offloading state machine was used to implement an ad-hoc kthread parking of rcuo kthreads. This code has been removed and therefore the related state can be erased as well. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- include/linux/rcu_segcblist.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index 5469c54cd778..1ef1bb54853d 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -186,9 +186,8 @@ struct rcu_cblist { */ #define SEGCBLIST_ENABLED BIT(0) #define SEGCBLIST_LOCKING BIT(1) -#define SEGCBLIST_KTHREAD_CB BIT(2) -#define SEGCBLIST_KTHREAD_GP BIT(3) -#define SEGCBLIST_OFFLOADED BIT(4) +#define SEGCBLIST_KTHREAD_GP BIT(2) +#define SEGCBLIST_OFFLOADED BIT(3) struct rcu_segcblist { struct rcu_head *head; From cf3b1501a8aa8e817baede5e3e1f2beb26a09527 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 14 Jun 2024 13:35:47 -0700 Subject: [PATCH 16/90] rcutorture: Remove redundant rcu_torture_ops get_gp_completed fields The rcu_torture_ops structure's ->get_gp_completed and ->get_gp_completed_full fields are redundant with its ->get_comp_state and ->get_comp_state_full fields. This commit therefore removes the former in favor of the latter. Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcutorture.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 08bf7c669dd3..836e27250a8b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -366,8 +366,6 @@ struct rcu_torture_ops { bool (*same_gp_state_full)(struct rcu_gp_oldstate *rgosp1, struct rcu_gp_oldstate *rgosp2); unsigned long (*get_gp_state)(void); void (*get_gp_state_full)(struct rcu_gp_oldstate *rgosp); - unsigned long (*get_gp_completed)(void); - void (*get_gp_completed_full)(struct rcu_gp_oldstate *rgosp); unsigned long (*start_gp_poll)(void); void (*start_gp_poll_full)(struct rcu_gp_oldstate *rgosp); bool (*poll_gp_state)(unsigned long oldstate); @@ -553,8 +551,6 @@ static struct rcu_torture_ops rcu_ops = { .get_comp_state_full = get_completed_synchronize_rcu_full, .get_gp_state = get_state_synchronize_rcu, .get_gp_state_full = get_state_synchronize_rcu_full, - .get_gp_completed = get_completed_synchronize_rcu, - .get_gp_completed_full = get_completed_synchronize_rcu_full, .start_gp_poll = start_poll_synchronize_rcu, .start_gp_poll_full = start_poll_synchronize_rcu_full, .poll_gp_state = poll_state_synchronize_rcu, @@ -1437,8 +1433,8 @@ rcu_torture_writer(void *arg) rcu_torture_writer_state_getname(), rcu_torture_writer_state, cookie, cur_ops->get_gp_state()); - if (cur_ops->get_gp_completed) { - cookie = cur_ops->get_gp_completed(); + if (cur_ops->get_comp_state) { + cookie = cur_ops->get_comp_state(); WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie)); } cur_ops->readunlock(idx); @@ -1452,8 +1448,8 @@ rcu_torture_writer(void *arg) rcu_torture_writer_state_getname(), rcu_torture_writer_state, cpumask_pr_args(cpu_online_mask)); - if (cur_ops->get_gp_completed_full) { - cur_ops->get_gp_completed_full(&cookie_full); + if (cur_ops->get_comp_state_full) { + cur_ops->get_comp_state_full(&cookie_full); WARN_ON_ONCE(!cur_ops->poll_gp_state_full(&cookie_full)); } cur_ops->readunlock(idx); From 1bc5bb9a61378bbb2ca73ab06651d5b174876a8e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 14 Jun 2024 16:47:12 -0700 Subject: [PATCH 17/90] rcutorture: Add SRCU ->same_gp_state and ->get_comp_state functions This commit points the SRCU ->same_gp_state and ->get_comp_state fields to same_state_synchronize_srcu() and get_completed_synchronize_srcu(), allowing them to be tested. Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcutorture.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 836e27250a8b..b2e6201b4569 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -736,6 +736,8 @@ static struct rcu_torture_ops srcu_ops = { .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, + .same_gp_state = same_state_synchronize_srcu, + .get_comp_state = get_completed_synchronize_srcu, .get_gp_state = srcu_torture_get_gp_state, .start_gp_poll = srcu_torture_start_gp_poll, .poll_gp_state = srcu_torture_poll_gp_state, @@ -776,6 +778,8 @@ static struct rcu_torture_ops srcud_ops = { .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, + .same_gp_state = same_state_synchronize_srcu, + .get_comp_state = get_completed_synchronize_srcu, .get_gp_state = srcu_torture_get_gp_state, .start_gp_poll = srcu_torture_start_gp_poll, .poll_gp_state = srcu_torture_poll_gp_state, From 72ed29f68e6370841ba10edce5b9a213259eb9a9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 14 Jun 2024 21:34:17 -0700 Subject: [PATCH 18/90] rcutorture: Generic test for NUM_ACTIVE_*RCU_POLL* The rcutorture test suite has specific tests for both of the NUM_ACTIVE_RCU_POLL_OLDSTATE and NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE macros provided for RCU polled grace periods. However, with the advent of NUM_ACTIVE_SRCU_POLL_OLDSTATE, a more generic test is needed. This commit therefore adds ->poll_active and ->poll_active_full fields to the rcu_torture_ops structure and converts the existing specific tests to use these fields, when present. Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcutorture.c | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index b2e6201b4569..acf9f9945d2b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -373,6 +373,8 @@ struct rcu_torture_ops { bool (*poll_need_2gp)(bool poll, bool poll_full); void (*cond_sync)(unsigned long oldstate); void (*cond_sync_full)(struct rcu_gp_oldstate *rgosp); + int poll_active; + int poll_active_full; call_rcu_func_t call; void (*cb_barrier)(void); void (*fqs)(void); @@ -558,6 +560,8 @@ static struct rcu_torture_ops rcu_ops = { .poll_need_2gp = rcu_poll_need_2gp, .cond_sync = cond_synchronize_rcu, .cond_sync_full = cond_synchronize_rcu_full, + .poll_active = NUM_ACTIVE_RCU_POLL_OLDSTATE, + .poll_active_full = NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE, .get_gp_state_exp = get_state_synchronize_rcu, .start_gp_poll_exp = start_poll_synchronize_rcu_expedited, .start_gp_poll_exp_full = start_poll_synchronize_rcu_expedited_full, @@ -741,6 +745,7 @@ static struct rcu_torture_ops srcu_ops = { .get_gp_state = srcu_torture_get_gp_state, .start_gp_poll = srcu_torture_start_gp_poll, .poll_gp_state = srcu_torture_poll_gp_state, + .poll_active = NUM_ACTIVE_SRCU_POLL_OLDSTATE, .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, @@ -783,6 +788,7 @@ static struct rcu_torture_ops srcud_ops = { .get_gp_state = srcu_torture_get_gp_state, .start_gp_poll = srcu_torture_start_gp_poll, .poll_gp_state = srcu_torture_poll_gp_state, + .poll_active = NUM_ACTIVE_SRCU_POLL_OLDSTATE, .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, @@ -1374,13 +1380,15 @@ rcu_torture_writer(void *arg) int i; int idx; int oldnice = task_nice(current); - struct rcu_gp_oldstate rgo[NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE]; + struct rcu_gp_oldstate *rgo = NULL; + int rgo_size = 0; struct rcu_torture *rp; struct rcu_torture *old_rp; static DEFINE_TORTURE_RANDOM(rand); unsigned long stallsdone = jiffies; bool stutter_waited; - unsigned long ulo[NUM_ACTIVE_RCU_POLL_OLDSTATE]; + unsigned long *ulo = NULL; + int ulo_size = 0; // If a new stall test is added, this must be adjusted. if (stall_cpu_holdoff + stall_gp_kthread + stall_cpu) @@ -1401,6 +1409,16 @@ rcu_torture_writer(void *arg) torture_kthread_stopping("rcu_torture_writer"); return 0; } + if (cur_ops->poll_active > 0) { + ulo = kzalloc(cur_ops->poll_active * sizeof(ulo[0]), GFP_KERNEL); + if (!WARN_ON(!ulo)) + ulo_size = cur_ops->poll_active; + } + if (cur_ops->poll_active_full > 0) { + rgo = kzalloc(cur_ops->poll_active_full * sizeof(rgo[0]), GFP_KERNEL); + if (!WARN_ON(!rgo)) + rgo_size = cur_ops->poll_active_full; + } do { rcu_torture_writer_state = RTWS_FIXED_DELAY; @@ -1502,19 +1520,19 @@ rcu_torture_writer(void *arg) break; case RTWS_POLL_GET: rcu_torture_writer_state = RTWS_POLL_GET; - for (i = 0; i < ARRAY_SIZE(ulo); i++) + for (i = 0; i < ulo_size; i++) ulo[i] = cur_ops->get_comp_state(); gp_snap = cur_ops->start_gp_poll(); rcu_torture_writer_state = RTWS_POLL_WAIT; while (!cur_ops->poll_gp_state(gp_snap)) { gp_snap1 = cur_ops->get_gp_state(); - for (i = 0; i < ARRAY_SIZE(ulo); i++) + for (i = 0; i < ulo_size; i++) if (cur_ops->poll_gp_state(ulo[i]) || cur_ops->same_gp_state(ulo[i], gp_snap1)) { ulo[i] = gp_snap1; break; } - WARN_ON_ONCE(i >= ARRAY_SIZE(ulo)); + WARN_ON_ONCE(ulo_size > 0 && i >= ulo_size); torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); } @@ -1522,20 +1540,20 @@ rcu_torture_writer(void *arg) break; case RTWS_POLL_GET_FULL: rcu_torture_writer_state = RTWS_POLL_GET_FULL; - for (i = 0; i < ARRAY_SIZE(rgo); i++) + for (i = 0; i < rgo_size; i++) cur_ops->get_comp_state_full(&rgo[i]); cur_ops->start_gp_poll_full(&gp_snap_full); rcu_torture_writer_state = RTWS_POLL_WAIT_FULL; while (!cur_ops->poll_gp_state_full(&gp_snap_full)) { cur_ops->get_gp_state_full(&gp_snap1_full); - for (i = 0; i < ARRAY_SIZE(rgo); i++) + for (i = 0; i < rgo_size; i++) if (cur_ops->poll_gp_state_full(&rgo[i]) || cur_ops->same_gp_state_full(&rgo[i], &gp_snap1_full)) { rgo[i] = gp_snap1_full; break; } - WARN_ON_ONCE(i >= ARRAY_SIZE(rgo)); + WARN_ON_ONCE(rgo_size > 0 && i >= rgo_size); torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); } @@ -1617,6 +1635,8 @@ rcu_torture_writer(void *arg) pr_alert("%s" TORTURE_FLAG " Dynamic grace-period expediting was disabled.\n", torture_type); + kfree(ulo); + kfree(rgo); rcu_torture_writer_state = RTWS_STOPPING; torture_kthread_stopping("rcu_torture_writer"); return 0; From dc86460e77e8cea4896ff19de905001922653311 Mon Sep 17 00:00:00 2001 From: Zhouyi Zhou Date: Wed, 19 Jun 2024 23:06:58 +0000 Subject: [PATCH 19/90] rcutorture: Add CFcommon.arch for arch-specific Kconfig options Add CFcommon.arch for arch-specific Kconfig options. In accordance with [1], [2] and [3], move the x86-specific kernel option CONFIG_HYPERVISOR_GUEST to CFcommon.i686 and CFcommon.x86_64, and also move the x86/PowerPC CONFIG_KVM_GUEST Kconfig option to CFcommon.i686, CFcommon.x86_64, and CFcommon.ppc64le. The "arch" in CFcommon.arch is taken from the "uname -m" command. [1] https://lore.kernel.org/all/20240427005626.1365935-1-zhouzhouyi@gmail.com/ [2] https://lore.kernel.org/all/059d36ce-6453-42be-a31e-895abd35d590@paulmck-laptop/ [3] https://lore.kernel.org/all/ZnBkHosMDhsh4H8g@J2N7QTR9R3/ Tested in x86_64 and PPC VM of Open Source Lab of Oregon State University. Fixes: a6fda6dab93c ("rcutorture: Tweak kvm options") Suggested-by: Paul E. McKenney Suggested-by: Mark Rutland Signed-off-by: Zhouyi Zhou Acked-by: Mark Rutland Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 2 ++ tools/testing/selftests/rcutorture/configs/rcu/CFcommon | 2 -- tools/testing/selftests/rcutorture/configs/rcu/CFcommon.i686 | 2 ++ tools/testing/selftests/rcutorture/configs/rcu/CFcommon.ppc64le | 1 + tools/testing/selftests/rcutorture/configs/rcu/CFcommon.x86_64 | 2 ++ 5 files changed, 7 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/CFcommon.i686 create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/CFcommon.ppc64le create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/CFcommon.x86_64 diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index b33cd8753689..ad79784e552d 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -68,6 +68,8 @@ config_override_param "--gdb options" KcList "$TORTURE_KCONFIG_GDB_ARG" config_override_param "--kasan options" KcList "$TORTURE_KCONFIG_KASAN_ARG" config_override_param "--kcsan options" KcList "$TORTURE_KCONFIG_KCSAN_ARG" config_override_param "--kconfig argument" KcList "$TORTURE_KCONFIG_ARG" +config_override_param "$config_dir/CFcommon.$(uname -m)" KcList \ + "`cat $config_dir/CFcommon.$(uname -m) 2> /dev/null`" cp $T/KcList $resdir/ConfigFragment base_resdir=`echo $resdir | sed -e 's/\.[0-9]\+$//'` diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon index 0e92d85313aa..217597e84905 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon @@ -1,7 +1,5 @@ CONFIG_RCU_TORTURE_TEST=y CONFIG_PRINTK_TIME=y -CONFIG_HYPERVISOR_GUEST=y CONFIG_PARAVIRT=y -CONFIG_KVM_GUEST=y CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.i686 b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.i686 new file mode 100644 index 000000000000..d8b2f555686f --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.i686 @@ -0,0 +1,2 @@ +CONFIG_HYPERVISOR_GUEST=y +CONFIG_KVM_GUEST=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.ppc64le b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.ppc64le new file mode 100644 index 000000000000..133da04247ee --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.ppc64le @@ -0,0 +1 @@ +CONFIG_KVM_GUEST=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.x86_64 b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.x86_64 new file mode 100644 index 000000000000..d8b2f555686f --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.x86_64 @@ -0,0 +1,2 @@ +CONFIG_HYPERVISOR_GUEST=y +CONFIG_KVM_GUEST=y From 20cee0b3daa315237edded31ddf72a4e948d5e1d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 2 Jul 2024 09:18:33 -0700 Subject: [PATCH 20/90] rcutorture: Make rcu_torture_write_types() print number of update types This commit follows the list of update types with their count, resulting in console output like this: rcu_torture_write_types: Testing conditional GPs. rcu_torture_write_types: Testing conditional expedited GPs. rcu_torture_write_types: Testing conditional full-state GPs. rcu_torture_write_types: Testing expedited GPs. rcu_torture_write_types: Testing asynchronous GPs. rcu_torture_write_types: Testing polling GPs. rcu_torture_write_types: Testing polling full-state GPs. rcu_torture_write_types: Testing polling expedited GPs. rcu_torture_write_types: Testing polling full-state expedited GPs. rcu_torture_write_types: Testing normal GPs. rcu_torture_write_types: Testing 10 update types This commit adds the final line giving the count. Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcutorture.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index acf9f9945d2b..9a2ecb8b88ec 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1324,6 +1324,7 @@ static void rcu_torture_write_types(void) } else if (gp_sync && !cur_ops->sync) { pr_alert("%s: gp_sync without primitives.\n", __func__); } + pr_alert("%s: Testing %d update types.\n", __func__, nsynctypes); } /* From 9a13a324f40fc3846cf7867daffaccd785beb10c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Jul 2024 19:51:02 -0700 Subject: [PATCH 21/90] tools/rcu: Remove RCU Tasks Rude asynchronous APIs from rcu-updaters.sh The call_rcu_tasks_rude() and rcu_barrier_tasks_rude() APIs are no longer. This commit therefore removes them from the rcu-updaters.sh script. Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- tools/rcu/rcu-updaters.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/rcu/rcu-updaters.sh b/tools/rcu/rcu-updaters.sh index 4ef1397927bb..8a5df3f22550 100755 --- a/tools/rcu/rcu-updaters.sh +++ b/tools/rcu/rcu-updaters.sh @@ -21,12 +21,10 @@ fi bpftrace -e 'kprobe:kvfree_call_rcu, kprobe:call_rcu, kprobe:call_rcu_tasks, - kprobe:call_rcu_tasks_rude, kprobe:call_rcu_tasks_trace, kprobe:call_srcu, kprobe:rcu_barrier, kprobe:rcu_barrier_tasks, - kprobe:rcu_barrier_tasks_rude, kprobe:rcu_barrier_tasks_trace, kprobe:srcu_barrier, kprobe:synchronize_rcu, From 3471e96bcf53731639587580fd02e5a297940b91 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 18 Jun 2024 10:40:30 -0700 Subject: [PATCH 22/90] rcu/kfree: Warn on unexpected tail state Within the rcu_sr_normal_gp_cleanup_work() function, there is an acquire load from rcu_state.srs_done_tail, which is expected to be non-NULL. This commit adds a WARN_ON_ONCE() to check this expectation. Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e641cc681901..0f41a81138dc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1649,7 +1649,7 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work) * the done tail list manipulations are protected here. */ done = smp_load_acquire(&rcu_state.srs_done_tail); - if (!done) + if (WARN_ON_ONCE(!done)) return; WARN_ON_ONCE(!rcu_sr_is_wait_head(done)); From c1972c8dc987769eac8e5dede536d3cff489c6ee Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Jul 2024 09:49:49 -0700 Subject: [PATCH 23/90] locking/csd_lock: Print large numbers as negatives The CSD-lock-hold diagnostics from CONFIG_CSD_LOCK_WAIT_DEBUG are printed in nanoseconds as unsigned long longs, which is a bit obtuse for human readers when timing bugs result in negative CSD-lock hold times. Yes, there are some people to whom it is immediately obvious that 18446744073709551615 is really -1, but for the rest of us... Therefore, print these numbers as signed long longs, making the negative hold times immediately apparent. Reported-by: Rik van Riel Signed-off-by: Paul E. McKenney Cc: Imran Khan Cc: Ingo Molnar Cc: Leonardo Bras Cc: "Peter Zijlstra (Intel)" Cc: Rik van Riel Reviewed-by: Rik van Riel Signed-off-by: Neeraj Upadhyay --- kernel/smp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/smp.c b/kernel/smp.c index aaffecdad319..e87953729230 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -249,8 +249,8 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */ /* How long since this CSD lock was stuck. */ ts_delta = ts2 - ts0; - pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n", - firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts_delta, + pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %lld ns for CPU#%02d %pS(%ps).\n", + firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), (s64)ts_delta, cpu, csd->func, csd->info); /* * If the CSD lock is still stuck after 5 minutes, it is unlikely From 9aed3b51fd6186582a95abd9fa67782982540749 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 2 Jul 2024 18:49:35 -0700 Subject: [PATCH 24/90] rcu: Better define "atomic" for list replacement The kernel-doc headers for list_replace_rcu() and hlist_replace_rcu() claim that the replacement is atomic, which it is, but only for readers. Avoid confusion by making it clear that the atomic nature of these functions applies only to readers, not to concurrent updaters. Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- include/linux/rculist.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 3dc1e58865f7..14dfa6008467 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -191,7 +191,10 @@ static inline void hlist_del_init_rcu(struct hlist_node *n) * @old : the element to be replaced * @new : the new element to insert * - * The @old entry will be replaced with the @new entry atomically. + * The @old entry will be replaced with the @new entry atomically from + * the perspective of concurrent readers. It is the caller's responsibility + * to synchronize with concurrent updaters, if any. + * * Note: @old should not be empty. */ static inline void list_replace_rcu(struct list_head *old, @@ -519,7 +522,9 @@ static inline void hlist_del_rcu(struct hlist_node *n) * @old : the element to be replaced * @new : the new element to insert * - * The @old entry will be replaced with the @new entry atomically. + * The @old entry will be replaced with the @new entry atomically from + * the perspective of concurrent readers. It is the caller's responsibility + * to synchronize with concurrent updaters, if any. */ static inline void hlist_replace_rcu(struct hlist_node *old, struct hlist_node *new) From cfdbfb94b3824b8da3a6b21252d24754d0162ab6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:37:48 -0700 Subject: [PATCH 25/90] rcutorture: Add rcutree.nohz_full_patience_delay to TREE07 This commit adds the rcutree.nohz_full_patience_delay=1000 kernel boot parameter to the TREE07 scenario, on the observation that "if it ain't tested, it don't work". Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot index 979edbf4c820..55ce305b2a3d 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot @@ -2,3 +2,4 @@ nohz_full=2-9 rcutorture.stall_cpu=14 rcutorture.stall_cpu_holdoff=90 rcutorture.fwd_progress=0 +rcutree.nohz_full_patience_delay=1000 From bf66471987b4bd537bc800353ca7d39bfd1d1022 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 16 Apr 2024 10:51:10 +0200 Subject: [PATCH 26/90] context_tracking, rcu: Rename struct context_tracking .dynticks_nesting into .nesting The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, reflect that change in the related helpers. [ neeraj.upadhyay: Fix htmldocs build error reported by Stephen Rothwell ] Suggested-by: Frederic Weisbecker Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- .../RCU/Design/Data-Structures/Data-Structures.rst | 10 +++++----- include/linux/context_tracking_state.h | 6 +++--- include/trace/events/rcu.h | 2 +- kernel/context_tracking.c | 10 +++++----- kernel/rcu/tree.c | 8 ++++---- 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.rst b/Documentation/RCU/Design/Data-Structures/Data-Structures.rst index b34990c7c377..69860bbec202 100644 --- a/Documentation/RCU/Design/Data-Structures/Data-Structures.rst +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.rst @@ -935,7 +935,7 @@ This portion of the rcu_data structure is declared as follows: :: - 1 long dynticks_nesting; + 1 long nesting; 2 long dynticks_nmi_nesting; 3 atomic_t dynticks; 4 bool rcu_need_heavy_qs; @@ -945,7 +945,7 @@ These fields in the rcu_data structure maintain the per-CPU dyntick-idle state for the corresponding CPU. The fields may be accessed only from the corresponding CPU (and from tracing) unless otherwise stated. -The ``->dynticks_nesting`` field counts the nesting depth of process +The ``->nesting`` field counts the nesting depth of process execution, so that in normal circumstances this counter has value zero or one. NMIs, irqs, and tracers are counted by the ``->dynticks_nmi_nesting`` field. Because NMIs cannot be masked, changes @@ -960,9 +960,9 @@ process-level transitions. However, it turns out that when running in non-idle kernel context, the Linux kernel is fully capable of entering interrupt handlers that never exit and perhaps also vice versa. Therefore, whenever the -``->dynticks_nesting`` field is incremented up from zero, the +``->nesting`` field is incremented up from zero, the ``->dynticks_nmi_nesting`` field is set to a large positive number, and -whenever the ``->dynticks_nesting`` field is decremented down to zero, +whenever the ``->nesting`` field is decremented down to zero, the ``->dynticks_nmi_nesting`` field is set to zero. Assuming that the number of misnested interrupts is not sufficient to overflow the counter, this approach corrects the ``->dynticks_nmi_nesting`` field @@ -992,7 +992,7 @@ code. +-----------------------------------------------------------------------+ | **Quick Quiz**: | +-----------------------------------------------------------------------+ -| Why not simply combine the ``->dynticks_nesting`` and | +| Why not simply combine the ``->nesting`` and | | ``->dynticks_nmi_nesting`` counters into a single counter that just | | counts the number of reasons that the corresponding CPU is non-idle? | +-----------------------------------------------------------------------+ diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index ad6570ffeff3..65290e7677e6 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -39,7 +39,7 @@ struct context_tracking { atomic_t state; #endif #ifdef CONFIG_CONTEXT_TRACKING_IDLE - long dynticks_nesting; /* Track process nesting level. */ + long nesting; /* Track process nesting level. */ long dynticks_nmi_nesting; /* Track irq/NMI nesting level. */ #endif }; @@ -77,14 +77,14 @@ static __always_inline int ct_rcu_watching_cpu_acquire(int cpu) static __always_inline long ct_dynticks_nesting(void) { - return __this_cpu_read(context_tracking.dynticks_nesting); + return __this_cpu_read(context_tracking.nesting); } static __always_inline long ct_dynticks_nesting_cpu(int cpu) { struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); - return ct->dynticks_nesting; + return ct->nesting; } static __always_inline long ct_dynticks_nmi_nesting(void) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 31b3e0d3e65f..4066b6d51e46 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -469,7 +469,7 @@ TRACE_EVENT(rcu_stall_warning, * polarity: "Start", "End", "StillNonIdle" for entering, exiting or still not * being in dyntick-idle mode. * context: "USER" or "IDLE" or "IRQ". - * NMIs nested in IRQs are inferred with dynticks_nesting > 1 in IRQ context. + * NMIs nested in IRQs are inferred with nesting > 1 in IRQ context. * * These events also take a pair of numbers, which indicate the nesting * depth before and after the event of interest, and a third number that is diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 868ae0bcd4be..5cfdfc03b401 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -28,7 +28,7 @@ DEFINE_PER_CPU(struct context_tracking, context_tracking) = { #ifdef CONFIG_CONTEXT_TRACKING_IDLE - .dynticks_nesting = 1, + .nesting = 1, .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, #endif .state = ATOMIC_INIT(CT_RCU_WATCHING), @@ -131,7 +131,7 @@ static void noinstr ct_kernel_exit(bool user, int offset) ct_dynticks_nesting() == 0); if (ct_dynticks_nesting() != 1) { // RCU will still be watching, so just do accounting and leave. - ct->dynticks_nesting--; + ct->nesting--; return; } @@ -145,7 +145,7 @@ static void noinstr ct_kernel_exit(bool user, int offset) instrument_atomic_write(&ct->state, sizeof(ct->state)); instrumentation_end(); - WRITE_ONCE(ct->dynticks_nesting, 0); /* Avoid irq-access tearing. */ + WRITE_ONCE(ct->nesting, 0); /* Avoid irq-access tearing. */ // RCU is watching here ... ct_kernel_exit_state(offset); // ... but is no longer watching here. @@ -170,7 +170,7 @@ static void noinstr ct_kernel_enter(bool user, int offset) WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); if (oldval) { // RCU was already watching, so just do accounting and leave. - ct->dynticks_nesting++; + ct->nesting++; return; } rcu_dynticks_task_exit(); @@ -184,7 +184,7 @@ static void noinstr ct_kernel_enter(bool user, int offset) trace_rcu_dyntick(TPS("End"), ct_dynticks_nesting(), 1, ct_rcu_watching()); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); - WRITE_ONCE(ct->dynticks_nesting, 1); + WRITE_ONCE(ct->nesting, 1); WARN_ON_ONCE(ct_dynticks_nmi_nesting()); WRITE_ONCE(ct->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); instrumentation_end(); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 45a9f3667c2a..0d5a539acd58 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -389,7 +389,7 @@ static int rcu_is_cpu_rrupt_from_idle(void) /* Check for counter underflows */ RCU_LOCKDEP_WARN(ct_dynticks_nesting() < 0, - "RCU dynticks_nesting counter underflow!"); + "RCU nesting counter underflow!"); RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() <= 0, "RCU dynticks_nmi_nesting counter underflow/zero!"); @@ -597,7 +597,7 @@ void rcu_irq_exit_check_preempt(void) lockdep_assert_irqs_disabled(); RCU_LOCKDEP_WARN(ct_dynticks_nesting() <= 0, - "RCU dynticks_nesting counter underflow/zero!"); + "RCU nesting counter underflow/zero!"); RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() != DYNTICK_IRQ_NONIDLE, "Bad RCU dynticks_nmi_nesting counter\n"); @@ -4804,7 +4804,7 @@ rcu_boot_init_percpu_data(int cpu) /* Set up local state, ensuring consistent view of global state. */ rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); INIT_WORK(&rdp->strict_work, strict_work_handler); - WARN_ON_ONCE(ct->dynticks_nesting != 1); + WARN_ON_ONCE(ct->nesting != 1); WARN_ON_ONCE(rcu_dynticks_in_eqs(ct_rcu_watching_cpu(cpu))); rdp->barrier_seq_snap = rcu_state.barrier_sequence; rdp->rcu_ofl_gp_seq = rcu_state.gp_seq; @@ -4898,7 +4898,7 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs); rdp->blimit = blimit; - ct->dynticks_nesting = 1; /* CPU not up, no tearing. */ + ct->nesting = 1; /* CPU not up, no tearing. */ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ /* From 1089c0078b69bce0c2d540e3cc43470ba9e5dcfd Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 16 Apr 2024 14:45:30 +0200 Subject: [PATCH 27/90] context_tracking, rcu: Rename ct_dynticks_nesting() into ct_nesting() The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, reflect that change in the related helpers. Suggested-by: Frederic Weisbecker Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- include/linux/context_tracking_state.h | 2 +- kernel/context_tracking.c | 10 +++++----- kernel/rcu/tree.c | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index 65290e7677e6..586c1ff22c2e 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -75,7 +75,7 @@ static __always_inline int ct_rcu_watching_cpu_acquire(int cpu) return atomic_read_acquire(&ct->state) & CT_RCU_WATCHING_MASK; } -static __always_inline long ct_dynticks_nesting(void) +static __always_inline long ct_nesting(void) { return __this_cpu_read(context_tracking.nesting); } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 5cfdfc03b401..a951bde0bbcb 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -128,8 +128,8 @@ static void noinstr ct_kernel_exit(bool user, int offset) WARN_ON_ONCE(ct_dynticks_nmi_nesting() != DYNTICK_IRQ_NONIDLE); WRITE_ONCE(ct->dynticks_nmi_nesting, 0); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - ct_dynticks_nesting() == 0); - if (ct_dynticks_nesting() != 1) { + ct_nesting() == 0); + if (ct_nesting() != 1) { // RCU will still be watching, so just do accounting and leave. ct->nesting--; return; @@ -137,7 +137,7 @@ static void noinstr ct_kernel_exit(bool user, int offset) instrumentation_begin(); lockdep_assert_irqs_disabled(); - trace_rcu_dyntick(TPS("Start"), ct_dynticks_nesting(), 0, ct_rcu_watching()); + trace_rcu_dyntick(TPS("Start"), ct_nesting(), 0, ct_rcu_watching()); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); rcu_preempt_deferred_qs(current); @@ -166,7 +166,7 @@ static void noinstr ct_kernel_enter(bool user, int offset) long oldval; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled()); - oldval = ct_dynticks_nesting(); + oldval = ct_nesting(); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); if (oldval) { // RCU was already watching, so just do accounting and leave. @@ -182,7 +182,7 @@ static void noinstr ct_kernel_enter(bool user, int offset) // instrumentation for the noinstr ct_kernel_enter_state() instrument_atomic_write(&ct->state, sizeof(ct->state)); - trace_rcu_dyntick(TPS("End"), ct_dynticks_nesting(), 1, ct_rcu_watching()); + trace_rcu_dyntick(TPS("End"), ct_nesting(), 1, ct_rcu_watching()); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); WRITE_ONCE(ct->nesting, 1); WARN_ON_ONCE(ct_dynticks_nmi_nesting()); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0d5a539acd58..cf69a234080f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -388,7 +388,7 @@ static int rcu_is_cpu_rrupt_from_idle(void) lockdep_assert_irqs_disabled(); /* Check for counter underflows */ - RCU_LOCKDEP_WARN(ct_dynticks_nesting() < 0, + RCU_LOCKDEP_WARN(ct_nesting() < 0, "RCU nesting counter underflow!"); RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() <= 0, "RCU dynticks_nmi_nesting counter underflow/zero!"); @@ -404,7 +404,7 @@ static int rcu_is_cpu_rrupt_from_idle(void) WARN_ON_ONCE(!nesting && !is_idle_task(current)); /* Does CPU appear to be idle from an RCU standpoint? */ - return ct_dynticks_nesting() == 0; + return ct_nesting() == 0; } #define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10) @@ -596,7 +596,7 @@ void rcu_irq_exit_check_preempt(void) { lockdep_assert_irqs_disabled(); - RCU_LOCKDEP_WARN(ct_dynticks_nesting() <= 0, + RCU_LOCKDEP_WARN(ct_nesting() <= 0, "RCU nesting counter underflow/zero!"); RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() != DYNTICK_IRQ_NONIDLE, From bca9455da531a3c2520c28ff349d0dab4e471d28 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 16 Apr 2024 14:48:26 +0200 Subject: [PATCH 28/90] context_tracking, rcu: Rename ct_dynticks_nesting_cpu() into ct_nesting_cpu() The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, and the 'dynticks' prefix can be dropped without losing any meaning. Suggested-by: Frederic Weisbecker Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- include/linux/context_tracking_state.h | 2 +- kernel/rcu/tree_stall.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index 586c1ff22c2e..fd42d8120ac2 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -80,7 +80,7 @@ static __always_inline long ct_nesting(void) return __this_cpu_read(context_tracking.nesting); } -static __always_inline long ct_dynticks_nesting_cpu(int cpu) +static __always_inline long ct_nesting_cpu(int cpu) { struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index d65974448e81..59b1d84a4749 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -516,7 +516,7 @@ static void print_cpu_stall_info(int cpu) "!."[!delta], ticks_value, ticks_title, ct_rcu_watching_cpu(cpu) & 0xffff, - ct_dynticks_nesting_cpu(cpu), ct_dynticks_nmi_nesting_cpu(cpu), + ct_nesting_cpu(cpu), ct_dynticks_nmi_nesting_cpu(cpu), rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, rcuc_starved ? buf : "", From dc5fface4b30508933b515a4985401c8c8f6bcfd Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 16 Apr 2024 10:52:03 +0200 Subject: [PATCH 29/90] context_tracking, rcu: Rename struct context_tracking .dynticks_nmi_nesting into .nmi_nesting The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, and the 'dynticks' prefix can be dropped without losing any meaning. [ neeraj.upadhyay: Fix htmldocs build error reported by Stephen Rothwell ] Suggested-by: Frederic Weisbecker Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- .../Data-Structures/Data-Structures.rst | 14 ++++----- include/linux/context_tracking_state.h | 6 ++-- kernel/context_tracking.c | 30 +++++++++---------- kernel/rcu/tree.c | 4 +-- 4 files changed, 27 insertions(+), 27 deletions(-) diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.rst b/Documentation/RCU/Design/Data-Structures/Data-Structures.rst index 69860bbec202..28d9f9e1783f 100644 --- a/Documentation/RCU/Design/Data-Structures/Data-Structures.rst +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.rst @@ -936,7 +936,7 @@ This portion of the rcu_data structure is declared as follows: :: 1 long nesting; - 2 long dynticks_nmi_nesting; + 2 long nmi_nesting; 3 atomic_t dynticks; 4 bool rcu_need_heavy_qs; 5 bool rcu_urgent_qs; @@ -948,11 +948,11 @@ the corresponding CPU (and from tracing) unless otherwise stated. The ``->nesting`` field counts the nesting depth of process execution, so that in normal circumstances this counter has value zero or one. NMIs, irqs, and tracers are counted by the -``->dynticks_nmi_nesting`` field. Because NMIs cannot be masked, changes +``->nmi_nesting`` field. Because NMIs cannot be masked, changes to this variable have to be undertaken carefully using an algorithm provided by Andy Lutomirski. The initial transition from idle adds one, and nested transitions add two, so that a nesting level of five is -represented by a ``->dynticks_nmi_nesting`` value of nine. This counter +represented by a ``->nmi_nesting`` value of nine. This counter can therefore be thought of as counting the number of reasons why this CPU cannot be permitted to enter dyntick-idle mode, aside from process-level transitions. @@ -961,11 +961,11 @@ However, it turns out that when running in non-idle kernel context, the Linux kernel is fully capable of entering interrupt handlers that never exit and perhaps also vice versa. Therefore, whenever the ``->nesting`` field is incremented up from zero, the -``->dynticks_nmi_nesting`` field is set to a large positive number, and +``->nmi_nesting`` field is set to a large positive number, and whenever the ``->nesting`` field is decremented down to zero, -the ``->dynticks_nmi_nesting`` field is set to zero. Assuming that +the ``->nmi_nesting`` field is set to zero. Assuming that the number of misnested interrupts is not sufficient to overflow the -counter, this approach corrects the ``->dynticks_nmi_nesting`` field +counter, this approach corrects the ``->nmi_nesting`` field every time the corresponding CPU enters the idle loop from process context. @@ -993,7 +993,7 @@ code. | **Quick Quiz**: | +-----------------------------------------------------------------------+ | Why not simply combine the ``->nesting`` and | -| ``->dynticks_nmi_nesting`` counters into a single counter that just | +| ``->nmi_nesting`` counters into a single counter that just | | counts the number of reasons that the corresponding CPU is non-idle? | +-----------------------------------------------------------------------+ | **Answer**: | diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index fd42d8120ac2..12d00adf29e1 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -40,7 +40,7 @@ struct context_tracking { #endif #ifdef CONFIG_CONTEXT_TRACKING_IDLE long nesting; /* Track process nesting level. */ - long dynticks_nmi_nesting; /* Track irq/NMI nesting level. */ + long nmi_nesting; /* Track irq/NMI nesting level. */ #endif }; @@ -89,14 +89,14 @@ static __always_inline long ct_nesting_cpu(int cpu) static __always_inline long ct_dynticks_nmi_nesting(void) { - return __this_cpu_read(context_tracking.dynticks_nmi_nesting); + return __this_cpu_read(context_tracking.nmi_nesting); } static __always_inline long ct_dynticks_nmi_nesting_cpu(int cpu) { struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); - return ct->dynticks_nmi_nesting; + return ct->nmi_nesting; } #endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */ diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index a951bde0bbcb..ae94215aa132 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -29,7 +29,7 @@ DEFINE_PER_CPU(struct context_tracking, context_tracking) = { #ifdef CONFIG_CONTEXT_TRACKING_IDLE .nesting = 1, - .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, + .nmi_nesting = DYNTICK_IRQ_NONIDLE, #endif .state = ATOMIC_INIT(CT_RCU_WATCHING), }; @@ -117,7 +117,7 @@ static noinstr void ct_kernel_enter_state(int offset) * Enter an RCU extended quiescent state, which can be either the * idle loop or adaptive-tickless usermode execution. * - * We crowbar the ->dynticks_nmi_nesting field to zero to allow for + * We crowbar the ->nmi_nesting field to zero to allow for * the possibility of usermode upcalls having messed up our count * of interrupt nesting level during the prior busy period. */ @@ -126,7 +126,7 @@ static void noinstr ct_kernel_exit(bool user, int offset) struct context_tracking *ct = this_cpu_ptr(&context_tracking); WARN_ON_ONCE(ct_dynticks_nmi_nesting() != DYNTICK_IRQ_NONIDLE); - WRITE_ONCE(ct->dynticks_nmi_nesting, 0); + WRITE_ONCE(ct->nmi_nesting, 0); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && ct_nesting() == 0); if (ct_nesting() != 1) { @@ -156,7 +156,7 @@ static void noinstr ct_kernel_exit(bool user, int offset) * Exit an RCU extended quiescent state, which can be either the * idle loop or adaptive-tickless usermode execution. * - * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to + * We crowbar the ->nmi_nesting field to DYNTICK_IRQ_NONIDLE to * allow for the possibility of usermode upcalls messing up our count of * interrupt nesting level during the busy period that is just now starting. */ @@ -186,7 +186,7 @@ static void noinstr ct_kernel_enter(bool user, int offset) WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); WRITE_ONCE(ct->nesting, 1); WARN_ON_ONCE(ct_dynticks_nmi_nesting()); - WRITE_ONCE(ct->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); + WRITE_ONCE(ct->nmi_nesting, DYNTICK_IRQ_NONIDLE); instrumentation_end(); } @@ -194,7 +194,7 @@ static void noinstr ct_kernel_enter(bool user, int offset) * ct_nmi_exit - inform RCU of exit from NMI context * * If we are returning from the outermost NMI handler that interrupted an - * RCU-idle period, update ct->state and ct->dynticks_nmi_nesting + * RCU-idle period, update ct->state and ct->nmi_nesting * to let the RCU grace-period handling know that the CPU is back to * being RCU-idle. * @@ -207,7 +207,7 @@ void noinstr ct_nmi_exit(void) instrumentation_begin(); /* - * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. + * Check for ->nmi_nesting underflow and bad ->dynticks. * (We are exiting an NMI handler, so RCU better be paying attention * to us!) */ @@ -221,7 +221,7 @@ void noinstr ct_nmi_exit(void) if (ct_dynticks_nmi_nesting() != 1) { trace_rcu_dyntick(TPS("--="), ct_dynticks_nmi_nesting(), ct_dynticks_nmi_nesting() - 2, ct_rcu_watching()); - WRITE_ONCE(ct->dynticks_nmi_nesting, /* No store tearing. */ + WRITE_ONCE(ct->nmi_nesting, /* No store tearing. */ ct_dynticks_nmi_nesting() - 2); instrumentation_end(); return; @@ -229,7 +229,7 @@ void noinstr ct_nmi_exit(void) /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ trace_rcu_dyntick(TPS("Startirq"), ct_dynticks_nmi_nesting(), 0, ct_rcu_watching()); - WRITE_ONCE(ct->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ + WRITE_ONCE(ct->nmi_nesting, 0); /* Avoid store tearing. */ // instrumentation for the noinstr ct_kernel_exit_state() instrument_atomic_write(&ct->state, sizeof(ct->state)); @@ -247,7 +247,7 @@ void noinstr ct_nmi_exit(void) * ct_nmi_enter - inform RCU of entry to NMI context * * If the CPU was idle from RCU's viewpoint, update ct->state and - * ct->dynticks_nmi_nesting to let the RCU grace-period handling know + * ct->nmi_nesting to let the RCU grace-period handling know * that the CPU is active. This implementation permits nested NMIs, as * long as the nesting level does not overflow an int. (You will probably * run out of stack space first.) @@ -264,10 +264,10 @@ void noinstr ct_nmi_enter(void) WARN_ON_ONCE(ct_dynticks_nmi_nesting() < 0); /* - * If idle from RCU viewpoint, atomically increment ->dynticks - * to mark non-idle and increment ->dynticks_nmi_nesting by one. - * Otherwise, increment ->dynticks_nmi_nesting by two. This means - * if ->dynticks_nmi_nesting is equal to one, we are guaranteed + * If idle from RCU viewpoint, atomically increment CT state + * to mark non-idle and increment ->nmi_nesting by one. + * Otherwise, increment ->nmi_nesting by two. This means + * if ->nmi_nesting is equal to one, we are guaranteed * to be in the outermost NMI handler that interrupted an RCU-idle * period (observation due to Andy Lutomirski). */ @@ -298,7 +298,7 @@ void noinstr ct_nmi_enter(void) ct_dynticks_nmi_nesting(), ct_dynticks_nmi_nesting() + incby, ct_rcu_watching()); instrumentation_end(); - WRITE_ONCE(ct->dynticks_nmi_nesting, /* Prevent store tearing. */ + WRITE_ONCE(ct->nmi_nesting, /* Prevent store tearing. */ ct_dynticks_nmi_nesting() + incby); barrier(); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cf69a234080f..934f6b34a551 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -391,7 +391,7 @@ static int rcu_is_cpu_rrupt_from_idle(void) RCU_LOCKDEP_WARN(ct_nesting() < 0, "RCU nesting counter underflow!"); RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() <= 0, - "RCU dynticks_nmi_nesting counter underflow/zero!"); + "RCU nmi_nesting counter underflow/zero!"); /* Are we at first interrupt nesting level? */ nesting = ct_dynticks_nmi_nesting(); @@ -600,7 +600,7 @@ void rcu_irq_exit_check_preempt(void) "RCU nesting counter underflow/zero!"); RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() != DYNTICK_IRQ_NONIDLE, - "Bad RCU dynticks_nmi_nesting counter\n"); + "Bad RCU nmi_nesting counter\n"); RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), "RCU in extended quiescent state!"); } From 8375cb260d7e43610934af63748d1a51201449f8 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 16 Apr 2024 14:46:14 +0200 Subject: [PATCH 30/90] context_tracking, rcu: Rename ct_dynticks_nmi_nesting() into ct_nmi_nesting() The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, and the 'dynticks' prefix can be dropped without losing any meaning. Suggested-by: Frederic Weisbecker Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- include/linux/context_tracking_state.h | 2 +- kernel/context_tracking.c | 24 ++++++++++++------------ kernel/rcu/tree.c | 6 +++--- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index 12d00adf29e1..8f32fe599c5c 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -87,7 +87,7 @@ static __always_inline long ct_nesting_cpu(int cpu) return ct->nesting; } -static __always_inline long ct_dynticks_nmi_nesting(void) +static __always_inline long ct_nmi_nesting(void) { return __this_cpu_read(context_tracking.nmi_nesting); } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index ae94215aa132..115843eeb030 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -125,7 +125,7 @@ static void noinstr ct_kernel_exit(bool user, int offset) { struct context_tracking *ct = this_cpu_ptr(&context_tracking); - WARN_ON_ONCE(ct_dynticks_nmi_nesting() != DYNTICK_IRQ_NONIDLE); + WARN_ON_ONCE(ct_nmi_nesting() != DYNTICK_IRQ_NONIDLE); WRITE_ONCE(ct->nmi_nesting, 0); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && ct_nesting() == 0); @@ -185,7 +185,7 @@ static void noinstr ct_kernel_enter(bool user, int offset) trace_rcu_dyntick(TPS("End"), ct_nesting(), 1, ct_rcu_watching()); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); WRITE_ONCE(ct->nesting, 1); - WARN_ON_ONCE(ct_dynticks_nmi_nesting()); + WARN_ON_ONCE(ct_nmi_nesting()); WRITE_ONCE(ct->nmi_nesting, DYNTICK_IRQ_NONIDLE); instrumentation_end(); } @@ -207,28 +207,28 @@ void noinstr ct_nmi_exit(void) instrumentation_begin(); /* - * Check for ->nmi_nesting underflow and bad ->dynticks. + * Check for ->nmi_nesting underflow and bad CT state. * (We are exiting an NMI handler, so RCU better be paying attention * to us!) */ - WARN_ON_ONCE(ct_dynticks_nmi_nesting() <= 0); + WARN_ON_ONCE(ct_nmi_nesting() <= 0); WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs()); /* * If the nesting level is not 1, the CPU wasn't RCU-idle, so * leave it in non-RCU-idle state. */ - if (ct_dynticks_nmi_nesting() != 1) { - trace_rcu_dyntick(TPS("--="), ct_dynticks_nmi_nesting(), ct_dynticks_nmi_nesting() - 2, + if (ct_nmi_nesting() != 1) { + trace_rcu_dyntick(TPS("--="), ct_nmi_nesting(), ct_nmi_nesting() - 2, ct_rcu_watching()); WRITE_ONCE(ct->nmi_nesting, /* No store tearing. */ - ct_dynticks_nmi_nesting() - 2); + ct_nmi_nesting() - 2); instrumentation_end(); return; } /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ - trace_rcu_dyntick(TPS("Startirq"), ct_dynticks_nmi_nesting(), 0, ct_rcu_watching()); + trace_rcu_dyntick(TPS("Startirq"), ct_nmi_nesting(), 0, ct_rcu_watching()); WRITE_ONCE(ct->nmi_nesting, 0); /* Avoid store tearing. */ // instrumentation for the noinstr ct_kernel_exit_state() @@ -261,7 +261,7 @@ void noinstr ct_nmi_enter(void) struct context_tracking *ct = this_cpu_ptr(&context_tracking); /* Complain about underflow. */ - WARN_ON_ONCE(ct_dynticks_nmi_nesting() < 0); + WARN_ON_ONCE(ct_nmi_nesting() < 0); /* * If idle from RCU viewpoint, atomically increment CT state @@ -295,11 +295,11 @@ void noinstr ct_nmi_enter(void) } trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), - ct_dynticks_nmi_nesting(), - ct_dynticks_nmi_nesting() + incby, ct_rcu_watching()); + ct_nmi_nesting(), + ct_nmi_nesting() + incby, ct_rcu_watching()); instrumentation_end(); WRITE_ONCE(ct->nmi_nesting, /* Prevent store tearing. */ - ct_dynticks_nmi_nesting() + incby); + ct_nmi_nesting() + incby); barrier(); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 934f6b34a551..14cc314eedad 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -390,11 +390,11 @@ static int rcu_is_cpu_rrupt_from_idle(void) /* Check for counter underflows */ RCU_LOCKDEP_WARN(ct_nesting() < 0, "RCU nesting counter underflow!"); - RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() <= 0, + RCU_LOCKDEP_WARN(ct_nmi_nesting() <= 0, "RCU nmi_nesting counter underflow/zero!"); /* Are we at first interrupt nesting level? */ - nesting = ct_dynticks_nmi_nesting(); + nesting = ct_nmi_nesting(); if (nesting > 1) return false; @@ -598,7 +598,7 @@ void rcu_irq_exit_check_preempt(void) RCU_LOCKDEP_WARN(ct_nesting() <= 0, "RCU nesting counter underflow/zero!"); - RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() != + RCU_LOCKDEP_WARN(ct_nmi_nesting() != DYNTICK_IRQ_NONIDLE, "Bad RCU nmi_nesting counter\n"); RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), From 2ef2890b7a94fbbfa8fdf0a90499ae1df476b8e8 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 16 Apr 2024 14:49:13 +0200 Subject: [PATCH 31/90] context_tracking, rcu: Rename ct_dynticks_nmi_nesting_cpu() into ct_nmi_nesting_cpu() The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, and the 'dynticks' prefix can be dropped without losing any meaning. Suggested-by: Frederic Weisbecker Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- include/linux/context_tracking_state.h | 2 +- kernel/rcu/tree_stall.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index 8f32fe599c5c..34fd504e53a8 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -92,7 +92,7 @@ static __always_inline long ct_nmi_nesting(void) return __this_cpu_read(context_tracking.nmi_nesting); } -static __always_inline long ct_dynticks_nmi_nesting_cpu(int cpu) +static __always_inline long ct_nmi_nesting_cpu(int cpu) { struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 59b1d84a4749..ec49f0155bec 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -516,7 +516,7 @@ static void print_cpu_stall_info(int cpu) "!."[!delta], ticks_value, ticks_title, ct_rcu_watching_cpu(cpu) & 0xffff, - ct_nesting_cpu(cpu), ct_dynticks_nmi_nesting_cpu(cpu), + ct_nesting_cpu(cpu), ct_nmi_nesting_cpu(cpu), rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, rcuc_starved ? buf : "", From e1de438336222dbe27f2d4baa8c01074fcac2bef Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 29 Apr 2024 15:52:32 +0200 Subject: [PATCH 32/90] context_tracking, rcu: Rename DYNTICK_IRQ_NONIDLE into CT_NESTING_IRQ_NONIDLE The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, and the 'dynticks' prefix can be dropped without losing any meaning. Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- include/linux/context_tracking_state.h | 2 +- kernel/context_tracking.c | 8 ++++---- kernel/rcu/tree.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index 34fd504e53a8..0dbda59c9f37 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -7,7 +7,7 @@ #include /* Offset to allow distinguishing irq vs. task-based idle entry/exit. */ -#define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1) +#define CT_NESTING_IRQ_NONIDLE ((LONG_MAX / 2) + 1) enum ctx_state { CT_STATE_DISABLED = -1, /* returned by ct_state() if unknown */ diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 115843eeb030..8262f57a4363 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -29,7 +29,7 @@ DEFINE_PER_CPU(struct context_tracking, context_tracking) = { #ifdef CONFIG_CONTEXT_TRACKING_IDLE .nesting = 1, - .nmi_nesting = DYNTICK_IRQ_NONIDLE, + .nmi_nesting = CT_NESTING_IRQ_NONIDLE, #endif .state = ATOMIC_INIT(CT_RCU_WATCHING), }; @@ -125,7 +125,7 @@ static void noinstr ct_kernel_exit(bool user, int offset) { struct context_tracking *ct = this_cpu_ptr(&context_tracking); - WARN_ON_ONCE(ct_nmi_nesting() != DYNTICK_IRQ_NONIDLE); + WARN_ON_ONCE(ct_nmi_nesting() != CT_NESTING_IRQ_NONIDLE); WRITE_ONCE(ct->nmi_nesting, 0); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && ct_nesting() == 0); @@ -156,7 +156,7 @@ static void noinstr ct_kernel_exit(bool user, int offset) * Exit an RCU extended quiescent state, which can be either the * idle loop or adaptive-tickless usermode execution. * - * We crowbar the ->nmi_nesting field to DYNTICK_IRQ_NONIDLE to + * We crowbar the ->nmi_nesting field to CT_NESTING_IRQ_NONIDLE to * allow for the possibility of usermode upcalls messing up our count of * interrupt nesting level during the busy period that is just now starting. */ @@ -186,7 +186,7 @@ static void noinstr ct_kernel_enter(bool user, int offset) WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); WRITE_ONCE(ct->nesting, 1); WARN_ON_ONCE(ct_nmi_nesting()); - WRITE_ONCE(ct->nmi_nesting, DYNTICK_IRQ_NONIDLE); + WRITE_ONCE(ct->nmi_nesting, CT_NESTING_IRQ_NONIDLE); instrumentation_end(); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 14cc314eedad..4778c873f4ac 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -599,7 +599,7 @@ void rcu_irq_exit_check_preempt(void) RCU_LOCKDEP_WARN(ct_nesting() <= 0, "RCU nesting counter underflow/zero!"); RCU_LOCKDEP_WARN(ct_nmi_nesting() != - DYNTICK_IRQ_NONIDLE, + CT_NESTING_IRQ_NONIDLE, "Bad RCU nmi_nesting counter\n"); RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), "RCU in extended quiescent state!"); From fb579e6656a8d5e042e65062f68ca41321817237 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Wed, 7 Aug 2024 11:55:00 +0200 Subject: [PATCH 33/90] rcu: Annotate struct kvfree_rcu_bulk_data with __counted_by() Add the __counted_by compiler attribute to the flexible array member records to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and CONFIG_FORTIFY_SOURCE. Increment nr_records before adding a new pointer to the records array. Signed-off-by: Thorsten Blum Reviewed-by: "Gustavo A. R. Silva" Reviewed-by: "Uladzislau Rezki (Sony)" Reviewed-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0f41a81138dc..d5bf824159da 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3227,7 +3227,7 @@ struct kvfree_rcu_bulk_data { struct list_head list; struct rcu_gp_oldstate gp_snap; unsigned long nr_records; - void *records[]; + void *records[] __counted_by(nr_records); }; /* @@ -3767,7 +3767,8 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, } // Finally insert and update the GP for this page. - bnode->records[bnode->nr_records++] = ptr; + bnode->nr_records++; + bnode->records[bnode->nr_records - 1] = ptr; get_state_synchronize_rcu_full(&bnode->gp_snap); atomic_inc(&(*krcp)->bulk_count[idx]); From 0aac9daef6763e6efef398faff71f8c593651cce Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 23 Jul 2024 14:10:25 -0400 Subject: [PATCH 34/90] rcu: Use system_unbound_wq to avoid disturbing isolated CPUs It was discovered that isolated CPUs could sometimes be disturbed by kworkers processing kfree_rcu() works causing higher than expected latency. It is because the RCU core uses "system_wq" which doesn't have the WQ_UNBOUND flag to handle all its work items. Fix this violation of latency limits by using "system_unbound_wq" in the RCU core instead. This will ensure that those work items will not be run on CPUs marked as isolated. Beside the WQ_UNBOUND flag, the other major difference between system_wq and system_unbound_wq is their max_active count. The system_unbound_wq has a max_active of WQ_MAX_ACTIVE (512) while system_wq's max_active is WQ_DFL_ACTIVE (256) which is half of WQ_MAX_ACTIVE. Reported-by: Vratislav Bendel Closes: https://issues.redhat.com/browse/RHEL-50220 Signed-off-by: Waiman Long Reviewed-by: "Uladzislau Rezki (Sony)" Tested-by: Breno Leitao Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e641cc681901..494aa9513d0b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3539,10 +3539,10 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) if (delayed_work_pending(&krcp->monitor_work)) { delay_left = krcp->monitor_work.timer.expires - jiffies; if (delay < delay_left) - mod_delayed_work(system_wq, &krcp->monitor_work, delay); + mod_delayed_work(system_unbound_wq, &krcp->monitor_work, delay); return; } - queue_delayed_work(system_wq, &krcp->monitor_work, delay); + queue_delayed_work(system_unbound_wq, &krcp->monitor_work, delay); } static void @@ -3634,7 +3634,7 @@ static void kfree_rcu_monitor(struct work_struct *work) // be that the work is in the pending state when // channels have been detached following by each // other. - queue_rcu_work(system_wq, &krwp->rcu_work); + queue_rcu_work(system_unbound_wq, &krwp->rcu_work); } } @@ -3704,7 +3704,7 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp) if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && !atomic_xchg(&krcp->work_in_progress, 1)) { if (atomic_read(&krcp->backoff_page_cache_fill)) { - queue_delayed_work(system_wq, + queue_delayed_work(system_unbound_wq, &krcp->page_cache_work, msecs_to_jiffies(rcu_delay_page_cache_fill_msec)); } else { From 29bc83e4d90546aa794a9584786086b141a6ba4d Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Mon, 15 Jul 2024 16:23:24 -0700 Subject: [PATCH 35/90] srcu: faster gp seq wrap-around Using a higher value for the initial gp sequence counters allows for wrapping to occur faster. It can help with surfacing any issues that may be happening as a result of the wrap around. Signed-off-by: JP Kobryn Tested-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- include/linux/rcupdate.h | 3 +++ include/linux/srcutree.h | 15 ++++++++++++++- kernel/rcu/rcu.h | 3 --- kernel/rcu/srcutree.c | 7 ++++--- 4 files changed, 21 insertions(+), 7 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 13f6f00aecf9..8d56db70d417 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -34,6 +34,9 @@ #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) +#define RCU_SEQ_CTR_SHIFT 2 +#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1) + /* Exported common interfaces */ void call_rcu(struct rcu_head *head, rcu_callback_t func); void rcu_barrier_tasks(void); diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 8f3f72480e78..ed57598394de 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -129,10 +129,23 @@ struct srcu_struct { #define SRCU_STATE_SCAN1 1 #define SRCU_STATE_SCAN2 2 +/* + * Values for initializing gp sequence fields. Higher values allow wrap arounds to + * occur earlier. + * The second value with state is useful in the case of static initialization of + * srcu_usage where srcu_gp_seq_needed is expected to have some state value in its + * lower bits (or else it will appear to be already initialized within + * the call check_init_srcu_struct()). + */ +#define SRCU_GP_SEQ_INITIAL_VAL ((0UL - 100UL) << RCU_SEQ_CTR_SHIFT) +#define SRCU_GP_SEQ_INITIAL_VAL_WITH_STATE (SRCU_GP_SEQ_INITIAL_VAL - 1) + #define __SRCU_USAGE_INIT(name) \ { \ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ - .srcu_gp_seq_needed = -1UL, \ + .srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL, \ + .srcu_gp_seq_needed = SRCU_GP_SEQ_INITIAL_VAL_WITH_STATE, \ + .srcu_gp_seq_needed_exp = SRCU_GP_SEQ_INITIAL_VAL, \ .work = __DELAYED_WORK_INITIALIZER(name.work, NULL, 0), \ } diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 38238e595a61..2bfed9855d67 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -54,9 +54,6 @@ * grace-period sequence number. */ -#define RCU_SEQ_CTR_SHIFT 2 -#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1) - /* Low-order bit definition for polled grace-period APIs. */ #define RCU_GET_STATE_COMPLETED 0x1 diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index b24db425f16d..6fd9c914ce64 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -247,7 +247,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) mutex_init(&ssp->srcu_sup->srcu_cb_mutex); mutex_init(&ssp->srcu_sup->srcu_gp_mutex); ssp->srcu_idx = 0; - ssp->srcu_sup->srcu_gp_seq = 0; + ssp->srcu_sup->srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL; ssp->srcu_sup->srcu_barrier_seq = 0; mutex_init(&ssp->srcu_sup->srcu_barrier_mutex); atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0); @@ -258,7 +258,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) if (!ssp->sda) goto err_free_sup; init_srcu_struct_data(ssp); - ssp->srcu_sup->srcu_gp_seq_needed_exp = 0; + ssp->srcu_sup->srcu_gp_seq_needed_exp = SRCU_GP_SEQ_INITIAL_VAL; ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns(); if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) { if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) @@ -266,7 +266,8 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG); } ssp->srcu_sup->srcu_ssp = ssp; - smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, 0); /* Init done. */ + smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, + SRCU_GP_SEQ_INITIAL_VAL); /* Init done. */ return 0; err_free_sda: From c8c3ae83e0bbed6d56a00068c7d10ebf64db48d9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:36:21 -0700 Subject: [PATCH 36/90] srcu: Check for concurrent updates of heuristics SRCU maintains the ->srcu_n_exp_nodelay and ->reschedule_count values to guide heuristics governing auto-expediting of normal SRCU grace periods and grace-period-state-machine delays. This commit adds KCSAN ASSERT_EXCLUSIVE_WRITER() calls to check for concurrent updates to these fields. Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/srcutree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 6fd9c914ce64..aaee09a6748c 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -629,6 +629,7 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) if (time_after(j, gpstart)) jbase += j - gpstart; if (!jbase) { + ASSERT_EXCLUSIVE_WRITER(sup->srcu_n_exp_nodelay); WRITE_ONCE(sup->srcu_n_exp_nodelay, READ_ONCE(sup->srcu_n_exp_nodelay) + 1); if (READ_ONCE(sup->srcu_n_exp_nodelay) > srcu_max_nodelay_phase) jbase = 1; @@ -1819,6 +1820,7 @@ static void process_srcu(struct work_struct *work) } else { j = jiffies; if (READ_ONCE(sup->reschedule_jiffies) == j) { + ASSERT_EXCLUSIVE_WRITER(sup->reschedule_count); WRITE_ONCE(sup->reschedule_count, READ_ONCE(sup->reschedule_count) + 1); if (READ_ONCE(sup->reschedule_count) > srcu_max_nodelay) curdelay = 1; From e53cef031bfac2369d249cfd726706c30a5a3351 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:36:22 -0700 Subject: [PATCH 37/90] srcu: Mark callbacks not currently participating in barrier operation SRCU keeps a count of the number of callbacks that the current srcu_barrier() is waiting on, but there is currently no easy way to work out which callback is stuck. One way to do this is to mark idle SRCU-barrier callbacks by making the ->next pointer point to the callback itself, and this commit does just that. Later commits will use this for debug output. Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/srcutree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index aaee09a6748c..31706e3293bc 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -137,6 +137,7 @@ static void init_srcu_struct_data(struct srcu_struct *ssp) sdp->srcu_cblist_invoking = false; sdp->srcu_gp_seq_needed = ssp->srcu_sup->srcu_gp_seq; sdp->srcu_gp_seq_needed_exp = ssp->srcu_sup->srcu_gp_seq; + sdp->srcu_barrier_head.next = &sdp->srcu_barrier_head; sdp->mynode = NULL; sdp->cpu = cpu; INIT_WORK(&sdp->work, srcu_invoke_callbacks); @@ -1562,6 +1563,7 @@ static void srcu_barrier_cb(struct rcu_head *rhp) struct srcu_data *sdp; struct srcu_struct *ssp; + rhp->next = rhp; // Mark the callback as having been invoked. sdp = container_of(rhp, struct srcu_data, srcu_barrier_head); ssp = sdp->ssp; if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt)) From 58cb3210543591c1783a9a884adf27708f19d2a1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:37:49 -0700 Subject: [PATCH 38/90] rcutorture: Add a stall_cpu_repeat module parameter This commit adds an stall_cpu_repeat kernel, which is also the rcutorture.stall_cpu_repeat boot parameter, to test repeated CPU stalls. Note that only the first stall will pay attention to the stall_cpu_irqsoff module parameter. For the second and subsequent stalls, interrupts will be enabled. This is helpful when testing the interaction between RCU CPU stall warnings and CSD-lock stall warnings. Reported-by: Rik van Riel Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- .../admin-guide/kernel-parameters.txt | 8 ++- kernel/rcu/rcutorture.c | 56 +++++++++++++------ 2 files changed, 47 insertions(+), 17 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f1384c7b59c9..db1be767c91a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5384,7 +5384,13 @@ Time to wait (s) after boot before inducing stall. rcutorture.stall_cpu_irqsoff= [KNL] - Disable interrupts while stalling if set. + Disable interrupts while stalling if set, but only + on the first stall in the set. + + rcutorture.stall_cpu_repeat= [KNL] + Number of times to repeat the stall sequence, + so that rcutorture.stall_cpu_repeat=3 will result + in four stall sequences. rcutorture.stall_gp_kthread= [KNL] Duration (s) of forced sleep within RCU diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 9a2ecb8b88ec..4fd23cae0e9e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -115,6 +115,7 @@ torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s torture_param(bool, stall_no_softlockup, false, "Avoid softlockup warning during cpu stall."); torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling."); torture_param(int, stall_cpu_block, 0, "Sleep while stalling."); +torture_param(int, stall_cpu_repeat, 0, "Number of additional stalls after the first one."); torture_param(int, stall_gp_kthread, 0, "Grace-period kthread stall duration (s)."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of seconds to run/halt test"); @@ -1393,7 +1394,8 @@ rcu_torture_writer(void *arg) // If a new stall test is added, this must be adjusted. if (stall_cpu_holdoff + stall_gp_kthread + stall_cpu) - stallsdone += (stall_cpu_holdoff + stall_gp_kthread + stall_cpu + 60) * HZ; + stallsdone += (stall_cpu_holdoff + stall_gp_kthread + stall_cpu + 60) * + HZ * (stall_cpu_repeat + 1); VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); if (!can_expedite) pr_alert("%s" TORTURE_FLAG @@ -2391,7 +2393,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "test_boost=%d/%d test_boost_interval=%d " "test_boost_duration=%d shutdown_secs=%d " "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d " - "stall_cpu_block=%d " + "stall_cpu_block=%d stall_cpu_repeat=%d " "n_barrier_cbs=%d " "onoff_interval=%d onoff_holdoff=%d " "read_exit_delay=%d read_exit_burst=%d " @@ -2403,7 +2405,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) test_boost, cur_ops->can_boost, test_boost_interval, test_boost_duration, shutdown_secs, stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff, - stall_cpu_block, + stall_cpu_block, stall_cpu_repeat, n_barrier_cbs, onoff_interval, onoff_holdoff, read_exit_delay, read_exit_burst, @@ -2481,19 +2483,11 @@ static struct notifier_block rcu_torture_stall_block = { * induces a CPU stall for the time specified by stall_cpu. If a new * stall test is added, stallsdone in rcu_torture_writer() must be adjusted. */ -static int rcu_torture_stall(void *args) +static void rcu_torture_stall_one(int rep, int irqsoff) { int idx; - int ret; unsigned long stop_at; - VERBOSE_TOROUT_STRING("rcu_torture_stall task started"); - if (rcu_cpu_stall_notifiers) { - ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block); - if (ret) - pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n", - __func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : ""); - } if (stall_cpu_holdoff > 0) { VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff"); schedule_timeout_interruptible(stall_cpu_holdoff * HZ); @@ -2513,12 +2507,12 @@ static int rcu_torture_stall(void *args) stop_at = ktime_get_seconds() + stall_cpu; /* RCU CPU stall is expected behavior in following code. */ idx = cur_ops->readlock(); - if (stall_cpu_irqsoff) + if (irqsoff) local_irq_disable(); else if (!stall_cpu_block) preempt_disable(); - pr_alert("%s start on CPU %d.\n", - __func__, raw_smp_processor_id()); + pr_alert("%s start stall episode %d on CPU %d.\n", + __func__, rep + 1, raw_smp_processor_id()); while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), stop_at) && !kthread_should_stop()) if (stall_cpu_block) { @@ -2530,12 +2524,42 @@ static int rcu_torture_stall(void *args) } else if (stall_no_softlockup) { touch_softlockup_watchdog(); } - if (stall_cpu_irqsoff) + if (irqsoff) local_irq_enable(); else if (!stall_cpu_block) preempt_enable(); cur_ops->readunlock(idx); } +} + +/* + * CPU-stall kthread. Invokes rcu_torture_stall_one() once, and then as many + * additional times as specified by the stall_cpu_repeat module parameter. + * Note that stall_cpu_irqsoff is ignored on the second and subsequent + * stall. + */ +static int rcu_torture_stall(void *args) +{ + int i; + int repeat = stall_cpu_repeat; + int ret; + + VERBOSE_TOROUT_STRING("rcu_torture_stall task started"); + if (repeat < 0) { + repeat = 0; + WARN_ON_ONCE(IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)); + } + if (rcu_cpu_stall_notifiers) { + ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block); + if (ret) + pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n", + __func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : ""); + } + for (i = 0; i <= repeat; i++) { + if (kthread_should_stop()) + break; + rcu_torture_stall_one(i, i == 0 ? stall_cpu_irqsoff : 0); + } pr_alert("%s end.\n", __func__); if (rcu_cpu_stall_notifiers && !ret) { ret = rcu_stall_chain_notifier_unregister(&rcu_torture_stall_block); From 1c5144a066fd17eba681b3d6b0f86bda8e06fbfa Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:37:50 -0700 Subject: [PATCH 39/90] torture: Add torture.sh --guest-cpu-limit argument for limited hosts Some servers have limitations on the number of CPUs a given guest OS can use. In my earlier experience, such limitations have been at least half of the host's CPUs, but in a recent example, this limit is less than 40%. This commit therefore adds a --guest-cpu-limit argument that allows such low limits to be made known to torture.sh. Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- .../selftests/rcutorture/bin/torture.sh | 38 +++++++++++++------ 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index 990d24696fd3..0447c4a00cc4 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -19,10 +19,10 @@ PATH=${RCUTORTURE}/bin:$PATH; export PATH TORTURE_ALLOTED_CPUS="`identify_qemu_vcpus`" MAKE_ALLOTED_CPUS=$((TORTURE_ALLOTED_CPUS*2)) -HALF_ALLOTED_CPUS=$((TORTURE_ALLOTED_CPUS/2)) -if test "$HALF_ALLOTED_CPUS" -lt 1 +SCALE_ALLOTED_CPUS=$((TORTURE_ALLOTED_CPUS/2)) +if test "$SCALE_ALLOTED_CPUS" -lt 1 then - HALF_ALLOTED_CPUS=1 + SCALE_ALLOTED_CPUS=1 fi VERBOSE_BATCH_CPUS=$((TORTURE_ALLOTED_CPUS/16)) if test "$VERBOSE_BATCH_CPUS" -lt 2 @@ -90,6 +90,7 @@ usage () { echo " --do-scftorture / --do-no-scftorture / --no-scftorture" echo " --do-srcu-lockdep / --do-no-srcu-lockdep / --no-srcu-lockdep" echo " --duration [ | h | d ]" + echo " --guest-cpu-limit N" echo " --kcsan-kmake-arg kernel-make-arguments" exit 1 } @@ -203,6 +204,21 @@ do duration_base=$(($ts*mult)) shift ;; + --guest-cpu-limit|--guest-cpu-lim) + checkarg --guest-cpu-limit "(number)" "$#" "$2" '^[0-9]*$' '^--' + if (("$2" <= "$TORTURE_ALLOTED_CPUS" / 2)) + then + SCALE_ALLOTED_CPUS="$2" + VERBOSE_BATCH_CPUS="$((SCALE_ALLOTED_CPUS/8))" + if (("$VERBOSE_BATCH_CPUS" < 2)) + then + VERBOSE_BATCH_CPUS=0 + fi + else + echo "Ignoring value of $2 for --guest-cpu-limit which is greater than (("$TORTURE_ALLOTED_CPUS" / 2))." + fi + shift + ;; --kcsan-kmake-arg|--kcsan-kmake-args) checkarg --kcsan-kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$' kcsan_kmake_args="`echo "$kcsan_kmake_args $2" | sed -e 's/^ *//' -e 's/ *$//'`" @@ -425,9 +441,9 @@ fi if test "$do_scftorture" = "yes" then # Scale memory based on the number of CPUs. - scfmem=$((3+HALF_ALLOTED_CPUS/16)) - torture_bootargs="scftorture.nthreads=$HALF_ALLOTED_CPUS torture.disable_onoff_at_boot csdlock_debug=1" - torture_set "scftorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration "$duration_scftorture" --configs "$configs_scftorture" --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory ${scfmem}G --trust-make + scfmem=$((3+SCALE_ALLOTED_CPUS/16)) + torture_bootargs="scftorture.nthreads=$SCALE_ALLOTED_CPUS torture.disable_onoff_at_boot csdlock_debug=1" + torture_set "scftorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration "$duration_scftorture" --configs "$configs_scftorture" --kconfig "CONFIG_NR_CPUS=$SCALE_ALLOTED_CPUS" --memory ${scfmem}G --trust-make fi if test "$do_rt" = "yes" @@ -471,8 +487,8 @@ for prim in $primlist do if test -n "$firsttime" then - torture_bootargs="refscale.scale_type="$prim" refscale.nreaders=$HALF_ALLOTED_CPUS refscale.loops=10000 refscale.holdoff=20 torture.disable_onoff_at_boot" - torture_set "refscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --bootargs "refscale.verbose_batched=$VERBOSE_BATCH_CPUS torture.verbose_sleep_frequency=8 torture.verbose_sleep_duration=$VERBOSE_BATCH_CPUS" --trust-make + torture_bootargs="refscale.scale_type="$prim" refscale.nreaders=$SCALE_ALLOTED_CPUS refscale.loops=10000 refscale.holdoff=20 torture.disable_onoff_at_boot" + torture_set "refscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$SCALE_ALLOTED_CPUS" --bootargs "refscale.verbose_batched=$VERBOSE_BATCH_CPUS torture.verbose_sleep_frequency=8 torture.verbose_sleep_duration=$VERBOSE_BATCH_CPUS" --trust-make mv $T/last-resdir-nodebug $T/first-resdir-nodebug || : if test -f "$T/last-resdir-kasan" then @@ -520,8 +536,8 @@ for prim in $primlist do if test -n "$firsttime" then - torture_bootargs="rcuscale.scale_type="$prim" rcuscale.nwriters=$HALF_ALLOTED_CPUS rcuscale.holdoff=20 torture.disable_onoff_at_boot" - torture_set "rcuscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --trust-make + torture_bootargs="rcuscale.scale_type="$prim" rcuscale.nwriters=$SCALE_ALLOTED_CPUS rcuscale.holdoff=20 torture.disable_onoff_at_boot" + torture_set "rcuscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$SCALE_ALLOTED_CPUS" --trust-make mv $T/last-resdir-nodebug $T/first-resdir-nodebug || : if test -f "$T/last-resdir-kasan" then @@ -559,7 +575,7 @@ do_kcsan="$do_kcsan_save" if test "$do_kvfree" = "yes" then torture_bootargs="rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 rcuscale.holdoff=20 rcuscale.kfree_loops=10000 torture.disable_onoff_at_boot" - torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration $duration_rcutorture --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 2G --trust-make + torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration $duration_rcutorture --kconfig "CONFIG_NR_CPUS=$SCALE_ALLOTED_CPUS" --memory 2G --trust-make fi if test "$do_clocksourcewd" = "yes" From 0ff92d145a2be4da47e2ee6287a731084fba112f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Jul 2024 19:52:29 -0700 Subject: [PATCH 40/90] doc: Remove RCU Tasks Rude asynchronous APIs The call_rcu_tasks_rude() and rcu_barrier_tasks_rude() APIs are no longer. This commit therefore removes them from the documentation. Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- .../RCU/Design/Requirements/Requirements.rst | 3 +- Documentation/RCU/checklist.rst | 61 +++++++++---------- Documentation/RCU/whatisRCU.rst | 2 +- .../admin-guide/kernel-parameters.txt | 8 --- 4 files changed, 30 insertions(+), 44 deletions(-) diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst index f511476b4550..6125e7068d2c 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.rst +++ b/Documentation/RCU/Design/Requirements/Requirements.rst @@ -2649,8 +2649,7 @@ those that are idle from RCU's perspective) and then Tasks Rude RCU can be removed from the kernel. The tasks-rude-RCU API is also reader-marking-free and thus quite compact, -consisting of call_rcu_tasks_rude(), synchronize_rcu_tasks_rude(), -and rcu_barrier_tasks_rude(). +consisting solely of synchronize_rcu_tasks_rude(). Tasks Trace RCU ~~~~~~~~~~~~~~~ diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst index 3e6407de231c..7de3e308f330 100644 --- a/Documentation/RCU/checklist.rst +++ b/Documentation/RCU/checklist.rst @@ -194,14 +194,13 @@ over a rather long period of time, but improvements are always welcome! when publicizing a pointer to a structure that can be traversed by an RCU read-side critical section. -5. If any of call_rcu(), call_srcu(), call_rcu_tasks(), - call_rcu_tasks_rude(), or call_rcu_tasks_trace() is used, - the callback function may be invoked from softirq context, - and in any case with bottom halves disabled. In particular, - this callback function cannot block. If you need the callback - to block, run that code in a workqueue handler scheduled from - the callback. The queue_rcu_work() function does this for you - in the case of call_rcu(). +5. If any of call_rcu(), call_srcu(), call_rcu_tasks(), or + call_rcu_tasks_trace() is used, the callback function may be + invoked from softirq context, and in any case with bottom halves + disabled. In particular, this callback function cannot block. + If you need the callback to block, run that code in a workqueue + handler scheduled from the callback. The queue_rcu_work() + function does this for you in the case of call_rcu(). 6. Since synchronize_rcu() can block, it cannot be called from any sort of irq context. The same rule applies @@ -254,10 +253,10 @@ over a rather long period of time, but improvements are always welcome! corresponding readers must use rcu_read_lock_trace() and rcu_read_unlock_trace(). - c. If an updater uses call_rcu_tasks_rude() or - synchronize_rcu_tasks_rude(), then the corresponding - readers must use anything that disables preemption, - for example, preempt_disable() and preempt_enable(). + c. If an updater uses synchronize_rcu_tasks_rude(), + then the corresponding readers must use anything that + disables preemption, for example, preempt_disable() + and preempt_enable(). Mixing things up will result in confusion and broken kernels, and has even resulted in an exploitable security issue. Therefore, @@ -326,11 +325,9 @@ over a rather long period of time, but improvements are always welcome! d. Periodically invoke rcu_barrier(), permitting a limited number of updates per grace period. - The same cautions apply to call_srcu(), call_rcu_tasks(), - call_rcu_tasks_rude(), and call_rcu_tasks_trace(). This is - why there is an srcu_barrier(), rcu_barrier_tasks(), - rcu_barrier_tasks_rude(), and rcu_barrier_tasks_rude(), - respectively. + The same cautions apply to call_srcu(), call_rcu_tasks(), and + call_rcu_tasks_trace(). This is why there is an srcu_barrier(), + rcu_barrier_tasks(), and rcu_barrier_tasks_trace(), respectively. Note that although these primitives do take action to avoid memory exhaustion when any given CPU has too many callbacks, @@ -383,17 +380,17 @@ over a rather long period of time, but improvements are always welcome! must use whatever locking or other synchronization is required to safely access and/or modify that data structure. - Do not assume that RCU callbacks will be executed on - the same CPU that executed the corresponding call_rcu(), - call_srcu(), call_rcu_tasks(), call_rcu_tasks_rude(), or - call_rcu_tasks_trace(). For example, if a given CPU goes offline - while having an RCU callback pending, then that RCU callback - will execute on some surviving CPU. (If this was not the case, - a self-spawning RCU callback would prevent the victim CPU from - ever going offline.) Furthermore, CPUs designated by rcu_nocbs= - might well *always* have their RCU callbacks executed on some - other CPUs, in fact, for some real-time workloads, this is the - whole point of using the rcu_nocbs= kernel boot parameter. + Do not assume that RCU callbacks will be executed on the same + CPU that executed the corresponding call_rcu(), call_srcu(), + call_rcu_tasks(), or call_rcu_tasks_trace(). For example, if + a given CPU goes offline while having an RCU callback pending, + then that RCU callback will execute on some surviving CPU. + (If this was not the case, a self-spawning RCU callback would + prevent the victim CPU from ever going offline.) Furthermore, + CPUs designated by rcu_nocbs= might well *always* have their + RCU callbacks executed on some other CPUs, in fact, for some + real-time workloads, this is the whole point of using the + rcu_nocbs= kernel boot parameter. In addition, do not assume that callbacks queued in a given order will be invoked in that order, even if they all are queued on the @@ -507,9 +504,9 @@ over a rather long period of time, but improvements are always welcome! These debugging aids can help you find problems that are otherwise extremely difficult to spot. -17. If you pass a callback function defined within a module to one of - call_rcu(), call_srcu(), call_rcu_tasks(), call_rcu_tasks_rude(), - or call_rcu_tasks_trace(), then it is necessary to wait for all +17. If you pass a callback function defined within a module + to one of call_rcu(), call_srcu(), call_rcu_tasks(), or + call_rcu_tasks_trace(), then it is necessary to wait for all pending callbacks to be invoked before unloading that module. Note that it is absolutely *not* sufficient to wait for a grace period! For example, synchronize_rcu() implementation is *not* @@ -522,7 +519,6 @@ over a rather long period of time, but improvements are always welcome! - call_rcu() -> rcu_barrier() - call_srcu() -> srcu_barrier() - call_rcu_tasks() -> rcu_barrier_tasks() - - call_rcu_tasks_rude() -> rcu_barrier_tasks_rude() - call_rcu_tasks_trace() -> rcu_barrier_tasks_trace() However, these barrier functions are absolutely *not* guaranteed @@ -539,7 +535,6 @@ over a rather long period of time, but improvements are always welcome! - Either synchronize_srcu() or synchronize_srcu_expedited(), together with and srcu_barrier() - synchronize_rcu_tasks() and rcu_barrier_tasks() - - synchronize_tasks_rude() and rcu_barrier_tasks_rude() - synchronize_tasks_trace() and rcu_barrier_tasks_trace() If necessary, you can use something like workqueues to execute diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst index d585a5490aee..1ef5784c1b84 100644 --- a/Documentation/RCU/whatisRCU.rst +++ b/Documentation/RCU/whatisRCU.rst @@ -1103,7 +1103,7 @@ RCU-Tasks-Rude:: Critical sections Grace period Barrier - N/A call_rcu_tasks_rude rcu_barrier_tasks_rude + N/A N/A synchronize_rcu_tasks_rude diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f1384c7b59c9..a6107b745076 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5572,14 +5572,6 @@ of zero will disable batching. Batching is always disabled for synchronize_rcu_tasks(). - rcupdate.rcu_tasks_rude_lazy_ms= [KNL] - Set timeout in milliseconds RCU Tasks - Rude asynchronous callback batching for - call_rcu_tasks_rude(). A negative value - will take the default. A value of zero will - disable batching. Batching is always disabled - for synchronize_rcu_tasks_rude(). - rcupdate.rcu_tasks_trace_lazy_ms= [KNL] Set timeout in milliseconds RCU Tasks Trace asynchronous callback batching for From b428a9de9bba8877aeb5de4691cc9334467065d6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Jul 2024 09:04:08 -0700 Subject: [PATCH 41/90] rcutorture: Stop testing RCU Tasks Rude asynchronous APIs The call_rcu_tasks_rude() and rcu_barrier_tasks_rude() APIs are currently unused. Furthermore, the idea is to get rid of RCU Tasks Rude entirely once all architectures have their deep-idle and entry/exit code correctly marked as inline or noinstr. As a first step towards this goal, this commit therefore removes these two functions from rcutorture testing. Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcutorture.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 08bf7c669dd3..cd8752b2a960 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -915,11 +915,6 @@ static struct rcu_torture_ops tasks_ops = { * Definitions for rude RCU-tasks torture testing. */ -static void rcu_tasks_rude_torture_deferred_free(struct rcu_torture *p) -{ - call_rcu_tasks_rude(&p->rtort_rcu, rcu_torture_cb); -} - static struct rcu_torture_ops tasks_rude_ops = { .ttype = RCU_TASKS_RUDE_FLAVOR, .init = rcu_sync_torture_init, @@ -927,11 +922,8 @@ static struct rcu_torture_ops tasks_rude_ops = { .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_torture_read_unlock_trivial, .get_gp_seq = rcu_no_completed, - .deferred_free = rcu_tasks_rude_torture_deferred_free, .sync = synchronize_rcu_tasks_rude, .exp_sync = synchronize_rcu_tasks_rude, - .call = call_rcu_tasks_rude, - .cb_barrier = rcu_barrier_tasks_rude, .gp_kthread_dbg = show_rcu_tasks_rude_gp_kthread, .get_gp_data = rcu_tasks_rude_get_gp_data, .cbflood_max = 50000, From 599194d01459670049079ff54d52442f54d38432 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Jul 2024 19:46:00 -0700 Subject: [PATCH 42/90] rcuscale: Stop testing RCU Tasks Rude asynchronous APIs The call_rcu_tasks_rude() and rcu_barrier_tasks_rude() APIs are currently unused. Furthermore, the idea is to get rid of RCU Tasks Rude entirely once all architectures have their deep-idle and entry/exit code correctly marked as inline or noinstr. As a step towards this goal, this commit therefore removes these two functions from rcuscale testing. Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcuscale.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index b53a9e8f5904..d534d4ec2314 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -333,8 +333,6 @@ static struct rcu_scale_ops tasks_rude_ops = { .readunlock = tasks_rude_scale_read_unlock, .get_gp_seq = rcu_no_completed, .gp_diff = rcu_seq_diff, - .async = call_rcu_tasks_rude, - .gp_barrier = rcu_barrier_tasks_rude, .sync = synchronize_rcu_tasks_rude, .exp_sync = synchronize_rcu_tasks_rude, .rso_gp_kthread = get_rcu_tasks_rude_gp_kthread, From 7945b741d1fc071a621366c512a060ea08848955 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Jul 2024 19:47:47 -0700 Subject: [PATCH 43/90] rcu-tasks: Remove RCU Tasks Rude asynchronous APIs The call_rcu_tasks_rude() and rcu_barrier_tasks_rude() APIs are currently unused. This commit therefore removes their definitions and boot-time self-tests. Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Signed-off-by: Neeraj Upadhyay --- include/linux/rcupdate.h | 2 -- kernel/rcu/tasks.h | 55 +++++++++++++--------------------------- 2 files changed, 17 insertions(+), 40 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 13f6f00aecf9..31e679c7110e 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -37,7 +37,6 @@ /* Exported common interfaces */ void call_rcu(struct rcu_head *head, rcu_callback_t func); void rcu_barrier_tasks(void); -void rcu_barrier_tasks_rude(void); void synchronize_rcu(void); struct rcu_gp_oldstate; @@ -202,7 +201,6 @@ do { \ } while (0) # ifdef CONFIG_TASKS_RUDE_RCU -void call_rcu_tasks_rude(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks_rude(void); # endif diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index ba3440a45b6d..4bc038bcc016 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -396,7 +396,7 @@ static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp) // Wait for all in-flight callbacks for the specified RCU Tasks flavor. // Operates in a manner similar to rcu_barrier(). -static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp) +static void __maybe_unused rcu_barrier_tasks_generic(struct rcu_tasks *rtp) { int cpu; unsigned long flags; @@ -1244,13 +1244,12 @@ void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } //////////////////////////////////////////////////////////////////////// // -// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's trick of -// passing an empty function to schedule_on_each_cpu(). This approach -// provides an asynchronous call_rcu_tasks_rude() API and batching of -// concurrent calls to the synchronous synchronize_rcu_tasks_rude() API. -// This invokes schedule_on_each_cpu() in order to send IPIs far and wide -// and induces otherwise unnecessary context switches on all online CPUs, -// whether idle or not. +// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's +// trick of passing an empty function to schedule_on_each_cpu(). +// This approach provides batching of concurrent calls to the synchronous +// synchronize_rcu_tasks_rude() API. This invokes schedule_on_each_cpu() +// in order to send IPIs far and wide and induces otherwise unnecessary +// context switches on all online CPUs, whether idle or not. // // Callback handling is provided by the rcu_tasks_kthread() function. // @@ -1268,11 +1267,11 @@ static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp) schedule_on_each_cpu(rcu_tasks_be_rude); } -void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func); +static void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func); DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude, "RCU Tasks Rude"); -/** +/* * call_rcu_tasks_rude() - Queue a callback rude task-based grace period * @rhp: structure to be used for queueing the RCU updates. * @func: actual callback function to be invoked after the grace period @@ -1289,12 +1288,14 @@ DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude, * * See the description of call_rcu() for more detailed information on * memory ordering guarantees. + * + * This is no longer exported, and is instead reserved for use by + * synchronize_rcu_tasks_rude(). */ -void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func) +static void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func) { call_rcu_tasks_generic(rhp, func, &rcu_tasks_rude); } -EXPORT_SYMBOL_GPL(call_rcu_tasks_rude); /** * synchronize_rcu_tasks_rude - wait for a rude rcu-tasks grace period @@ -1320,26 +1321,9 @@ void synchronize_rcu_tasks_rude(void) } EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude); -/** - * rcu_barrier_tasks_rude - Wait for in-flight call_rcu_tasks_rude() callbacks. - * - * Although the current implementation is guaranteed to wait, it is not - * obligated to, for example, if there are no pending callbacks. - */ -void rcu_barrier_tasks_rude(void) -{ - rcu_barrier_tasks_generic(&rcu_tasks_rude); -} -EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude); - -int rcu_tasks_rude_lazy_ms = -1; -module_param(rcu_tasks_rude_lazy_ms, int, 0444); - static int __init rcu_spawn_tasks_rude_kthread(void) { rcu_tasks_rude.gp_sleep = HZ / 10; - if (rcu_tasks_rude_lazy_ms >= 0) - rcu_tasks_rude.lazy_jiffies = msecs_to_jiffies(rcu_tasks_rude_lazy_ms); rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude); return 0; } @@ -2069,11 +2053,6 @@ static struct rcu_tasks_test_desc tests[] = { /* If not defined, the test is skipped. */ .notrun = IS_ENABLED(CONFIG_TASKS_RCU), }, - { - .name = "call_rcu_tasks_rude()", - /* If not defined, the test is skipped. */ - .notrun = IS_ENABLED(CONFIG_TASKS_RUDE_RCU), - }, { .name = "call_rcu_tasks_trace()", /* If not defined, the test is skipped. */ @@ -2081,6 +2060,7 @@ static struct rcu_tasks_test_desc tests[] = { } }; +#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) static void test_rcu_tasks_callback(struct rcu_head *rhp) { struct rcu_tasks_test_desc *rttd = @@ -2090,6 +2070,7 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp) rttd->notrun = false; } +#endif // #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) static void rcu_tasks_initiate_self_tests(void) { @@ -2102,16 +2083,14 @@ static void rcu_tasks_initiate_self_tests(void) #ifdef CONFIG_TASKS_RUDE_RCU pr_info("Running RCU Tasks Rude wait API self tests\n"); - tests[1].runstart = jiffies; synchronize_rcu_tasks_rude(); - call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback); #endif #ifdef CONFIG_TASKS_TRACE_RCU pr_info("Running RCU Tasks Trace wait API self tests\n"); - tests[2].runstart = jiffies; + tests[1].runstart = jiffies; synchronize_rcu_tasks_trace(); - call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback); + call_rcu_tasks_trace(&tests[1].rh, test_rcu_tasks_callback); #endif } From fd70e9f1d85f5323096ad313ba73f5fe3d15ea41 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Wed, 10 Jul 2024 12:45:42 +0800 Subject: [PATCH 44/90] rcu-tasks: Fix access non-existent percpu rtpcp variable in rcu_tasks_need_gpcb() For kernels built with CONFIG_FORCE_NR_CPUS=y, the nr_cpu_ids is defined as NR_CPUS instead of the number of possible cpus, this will cause the following system panic: smpboot: Allowing 4 CPUs, 0 hotplug CPUs ... setup_percpu: NR_CPUS:512 nr_cpumask_bits:512 nr_cpu_ids:512 nr_node_ids:1 ... BUG: unable to handle page fault for address: ffffffff9911c8c8 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 0 PID: 15 Comm: rcu_tasks_trace Tainted: G W 6.6.21 #1 5dc7acf91a5e8e9ac9dcfc35bee0245691283ea6 RIP: 0010:rcu_tasks_need_gpcb+0x25d/0x2c0 RSP: 0018:ffffa371c00a3e60 EFLAGS: 00010082 CR2: ffffffff9911c8c8 CR3: 000000040fa20005 CR4: 00000000001706f0 Call Trace: ? __die+0x23/0x80 ? page_fault_oops+0xa4/0x180 ? exc_page_fault+0x152/0x180 ? asm_exc_page_fault+0x26/0x40 ? rcu_tasks_need_gpcb+0x25d/0x2c0 ? __pfx_rcu_tasks_kthread+0x40/0x40 rcu_tasks_one_gp+0x69/0x180 rcu_tasks_kthread+0x94/0xc0 kthread+0xe8/0x140 ? __pfx_kthread+0x40/0x40 ret_from_fork+0x34/0x80 ? __pfx_kthread+0x40/0x40 ret_from_fork_asm+0x1b/0x80 Considering that there may be holes in the CPU numbers, use the maximum possible cpu number, instead of nr_cpu_ids, for configuring enqueue and dequeue limits. [ neeraj.upadhyay: Fix htmldocs build error reported by Stephen Rothwell ] Closes: https://lore.kernel.org/linux-input/CALMA0xaTSMN+p4xUXkzrtR5r6k7hgoswcaXx7baR_z9r5jjskw@mail.gmail.com/T/#u Reported-by: Zhixu Liu Signed-off-by: Zqiang Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tasks.h | 82 ++++++++++++++++++++++++++++++---------------- 1 file changed, 53 insertions(+), 29 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 4bc038bcc016..72d564c84499 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -34,6 +34,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @rtp_blkd_tasks: List of tasks blocked as readers. * @rtp_exit_list: List of tasks in the latter portion of do_exit(). * @cpu: CPU number corresponding to this entry. + * @index: Index of this CPU in rtpcp_array of the rcu_tasks structure. * @rtpp: Pointer to the rcu_tasks structure. */ struct rcu_tasks_percpu { @@ -49,6 +50,7 @@ struct rcu_tasks_percpu { struct list_head rtp_blkd_tasks; struct list_head rtp_exit_list; int cpu; + int index; struct rcu_tasks *rtpp; }; @@ -76,6 +78,7 @@ struct rcu_tasks_percpu { * @call_func: This flavor's call_rcu()-equivalent function. * @wait_state: Task state for synchronous grace-period waits (default TASK_UNINTERRUPTIBLE). * @rtpcpu: This flavor's rcu_tasks_percpu structure. + * @rtpcp_array: Array of pointers to rcu_tasks_percpu structure of CPUs in cpu_possible_mask. * @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks. * @percpu_enqueue_lim: Number of per-CPU callback queues in use for enqueuing. * @percpu_dequeue_lim: Number of per-CPU callback queues in use for dequeuing. @@ -110,6 +113,7 @@ struct rcu_tasks { call_rcu_func_t call_func; unsigned int wait_state; struct rcu_tasks_percpu __percpu *rtpcpu; + struct rcu_tasks_percpu **rtpcp_array; int percpu_enqueue_shift; int percpu_enqueue_lim; int percpu_dequeue_lim; @@ -182,6 +186,8 @@ module_param(rcu_task_collapse_lim, int, 0444); static int rcu_task_lazy_lim __read_mostly = 32; module_param(rcu_task_lazy_lim, int, 0444); +static int rcu_task_cpu_ids; + /* RCU tasks grace-period state for debugging. */ #define RTGS_INIT 0 #define RTGS_WAIT_WAIT_CBS 1 @@ -245,6 +251,8 @@ static void cblist_init_generic(struct rcu_tasks *rtp) int cpu; int lim; int shift; + int maxcpu; + int index = 0; if (rcu_task_enqueue_lim < 0) { rcu_task_enqueue_lim = 1; @@ -254,14 +262,9 @@ static void cblist_init_generic(struct rcu_tasks *rtp) } lim = rcu_task_enqueue_lim; - if (lim > nr_cpu_ids) - lim = nr_cpu_ids; - shift = ilog2(nr_cpu_ids / lim); - if (((nr_cpu_ids - 1) >> shift) >= lim) - shift++; - WRITE_ONCE(rtp->percpu_enqueue_shift, shift); - WRITE_ONCE(rtp->percpu_dequeue_lim, lim); - smp_store_release(&rtp->percpu_enqueue_lim, lim); + rtp->rtpcp_array = kcalloc(num_possible_cpus(), sizeof(struct rcu_tasks_percpu *), GFP_KERNEL); + BUG_ON(!rtp->rtpcp_array); + for_each_possible_cpu(cpu) { struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); @@ -273,14 +276,29 @@ static void cblist_init_generic(struct rcu_tasks *rtp) INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq); rtpcp->cpu = cpu; rtpcp->rtpp = rtp; + rtpcp->index = index; + rtp->rtpcp_array[index] = rtpcp; + index++; if (!rtpcp->rtp_blkd_tasks.next) INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks); if (!rtpcp->rtp_exit_list.next) INIT_LIST_HEAD(&rtpcp->rtp_exit_list); + maxcpu = cpu; } - pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d.\n", rtp->name, - data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim), rcu_task_cb_adjust); + rcu_task_cpu_ids = maxcpu + 1; + if (lim > rcu_task_cpu_ids) + lim = rcu_task_cpu_ids; + shift = ilog2(rcu_task_cpu_ids / lim); + if (((rcu_task_cpu_ids - 1) >> shift) >= lim) + shift++; + WRITE_ONCE(rtp->percpu_enqueue_shift, shift); + WRITE_ONCE(rtp->percpu_dequeue_lim, lim); + smp_store_release(&rtp->percpu_enqueue_lim, lim); + + pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d rcu_task_cpu_ids=%d.\n", + rtp->name, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim), + rcu_task_cb_adjust, rcu_task_cpu_ids); } // Compute wakeup time for lazy callback timer. @@ -348,7 +366,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, rtpcp->rtp_n_lock_retries = 0; } if (rcu_task_cb_adjust && ++rtpcp->rtp_n_lock_retries > rcu_task_contend_lim && - READ_ONCE(rtp->percpu_enqueue_lim) != nr_cpu_ids) + READ_ONCE(rtp->percpu_enqueue_lim) != rcu_task_cpu_ids) needadjust = true; // Defer adjustment to avoid deadlock. } // Queuing callbacks before initialization not yet supported. @@ -368,10 +386,10 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); if (unlikely(needadjust)) { raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); - if (rtp->percpu_enqueue_lim != nr_cpu_ids) { + if (rtp->percpu_enqueue_lim != rcu_task_cpu_ids) { WRITE_ONCE(rtp->percpu_enqueue_shift, 0); - WRITE_ONCE(rtp->percpu_dequeue_lim, nr_cpu_ids); - smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids); + WRITE_ONCE(rtp->percpu_dequeue_lim, rcu_task_cpu_ids); + smp_store_release(&rtp->percpu_enqueue_lim, rcu_task_cpu_ids); pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name); } raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); @@ -444,6 +462,8 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) dequeue_limit = smp_load_acquire(&rtp->percpu_dequeue_lim); for (cpu = 0; cpu < dequeue_limit; cpu++) { + if (!cpu_possible(cpu)) + continue; struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); /* Advance and accelerate any new callbacks. */ @@ -481,7 +501,7 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) if (rcu_task_cb_adjust && ncbs <= rcu_task_collapse_lim) { raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); if (rtp->percpu_enqueue_lim > 1) { - WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(nr_cpu_ids)); + WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(rcu_task_cpu_ids)); smp_store_release(&rtp->percpu_enqueue_lim, 1); rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu(); gpdone = false; @@ -496,7 +516,9 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name); } if (rtp->percpu_dequeue_lim == 1) { - for (cpu = rtp->percpu_dequeue_lim; cpu < nr_cpu_ids; cpu++) { + for (cpu = rtp->percpu_dequeue_lim; cpu < rcu_task_cpu_ids; cpu++) { + if (!cpu_possible(cpu)) + continue; struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); WARN_ON_ONCE(rcu_segcblist_n_cbs(&rtpcp->cblist)); @@ -511,30 +533,32 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) // Advance callbacks and invoke any that are ready. static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu *rtpcp) { - int cpu; - int cpunext; int cpuwq; unsigned long flags; int len; + int index; struct rcu_head *rhp; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); struct rcu_tasks_percpu *rtpcp_next; - cpu = rtpcp->cpu; - cpunext = cpu * 2 + 1; - if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) { - rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); - cpuwq = rcu_cpu_beenfullyonline(cpunext) ? cpunext : WORK_CPU_UNBOUND; - queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work); - cpunext++; - if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) { - rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); - cpuwq = rcu_cpu_beenfullyonline(cpunext) ? cpunext : WORK_CPU_UNBOUND; + index = rtpcp->index * 2 + 1; + if (index < num_possible_cpus()) { + rtpcp_next = rtp->rtpcp_array[index]; + if (rtpcp_next->cpu < smp_load_acquire(&rtp->percpu_dequeue_lim)) { + cpuwq = rcu_cpu_beenfullyonline(rtpcp_next->cpu) ? rtpcp_next->cpu : WORK_CPU_UNBOUND; queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work); + index++; + if (index < num_possible_cpus()) { + rtpcp_next = rtp->rtpcp_array[index]; + if (rtpcp_next->cpu < smp_load_acquire(&rtp->percpu_dequeue_lim)) { + cpuwq = rcu_cpu_beenfullyonline(rtpcp_next->cpu) ? rtpcp_next->cpu : WORK_CPU_UNBOUND; + queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work); + } + } } } - if (rcu_segcblist_empty(&rtpcp->cblist) || !cpu_possible(cpu)) + if (rcu_segcblist_empty(&rtpcp->cblist)) return; raw_spin_lock_irqsave_rcu_node(rtpcp, flags); rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); From 49f49266ec9d61a4d038cb4ddff4417ba793198f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:34:21 -0700 Subject: [PATCH 45/90] rcu/tasks: Check processor-ID assumptions The current mapping of smp_processor_id() to a CPU processing Tasks-RCU callbacks makes some assumptions about layout. This commit therefore adds a WARN_ON() to check these assumptions. [ neeraj.upadhyay: Replace nr_cpu_ids with rcu_task_cpu_ids. ] Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tasks.h | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 72d564c84499..9fb8bb0afff7 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -357,6 +357,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, rcu_read_lock(); ideal_cpu = smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift); chosen_cpu = cpumask_next(ideal_cpu - 1, cpu_possible_mask); + WARN_ON_ONCE(chosen_cpu >= rcu_task_cpu_ids); rtpcp = per_cpu_ptr(rtp->rtpcpu, chosen_cpu); if (!raw_spin_trylock_rcu_node(rtpcp)) { // irqs already disabled. raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. From 522295774db51ab39ea3d46926555401c8f10130 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:34:22 -0700 Subject: [PATCH 46/90] rcu/tasks: Update rtp->tasks_gp_seq comment The rtp->tasks_gp_seq grace-period sequence number is not a strict count, but rather the usual RCU sequence number with the lower few bits tracking per-grace-period state and the upper bits the count of grace periods since boot, give or take the initial value. This commit therefore adjusts this comment. Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tasks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 9fb8bb0afff7..127018280618 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -65,7 +65,7 @@ struct rcu_tasks_percpu { * @init_fract: Initial backoff sleep interval. * @gp_jiffies: Time of last @gp_state transition. * @gp_start: Most recent grace-period start in jiffies. - * @tasks_gp_seq: Number of grace periods completed since boot. + * @tasks_gp_seq: Number of grace periods completed since boot in upper bits. * @n_ipis: Number of IPIs sent to encourage grace periods to end. * @n_ipis_fails: Number of IPI-send failures. * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. From 54973cdd166b13195a99940cd21a827c3a99cac2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:34:23 -0700 Subject: [PATCH 47/90] rcu: Provide rcu_barrier_cb_is_done() to check rcu_barrier() CBs This commit provides a rcu_barrier_cb_is_done() function that returns true if the *rcu_barrier*() callback passed in is done. This will be used when printing grace-period debugging information. Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcu.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 38238e595a61..caaed27e476b 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -255,6 +255,11 @@ static inline void debug_rcu_head_callback(struct rcu_head *rhp) kmem_dump_obj(rhp); } +static inline bool rcu_barrier_cb_is_done(struct rcu_head *rhp) +{ + return rhp->next == rhp; +} + extern int rcu_cpu_stall_suppress_at_boot; static inline bool rcu_stall_is_suppressed_at_boot(void) From d3f84aeb71d61ece24164bb2ce3161c60922fd8c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:34:24 -0700 Subject: [PATCH 48/90] rcu/tasks: Mark callbacks not currently participating in barrier operation Each Tasks RCU flavor keeps a count of the number of callbacks that the current rcu_barrier_tasks*() is waiting on, but there is currently no easy way to work out which callback is stuck. One way to do this is to mark idle RCU-barrier callbacks by making the ->next pointer point to the callback itself, and this commit does just that. Later commits will use this for debug output. Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tasks.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 127018280618..d44abcd656d6 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -283,6 +283,7 @@ static void cblist_init_generic(struct rcu_tasks *rtp) INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks); if (!rtpcp->rtp_exit_list.next) INIT_LIST_HEAD(&rtpcp->rtp_exit_list); + rtpcp->barrier_q_head.next = &rtpcp->barrier_q_head; maxcpu = cpu; } @@ -407,6 +408,7 @@ static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp) struct rcu_tasks *rtp; struct rcu_tasks_percpu *rtpcp; + rhp->next = rhp; // Mark the callback as having been invoked. rtpcp = container_of(rhp, struct rcu_tasks_percpu, barrier_q_head); rtp = rtpcp->rtpp; if (atomic_dec_and_test(&rtp->barrier_q_count)) From fe91cf39db0939ac8d523b5a1c31840f7cbe205c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:34:25 -0700 Subject: [PATCH 49/90] rcu/tasks: Add detailed grace-period and barrier diagnostics This commit adds rcu_tasks_torture_stats_print(), rcu_tasks_trace_torture_stats_print(), and rcu_tasks_rude_torture_stats_print() functions that provide detailed diagnostics on grace-period, callback, and barrier state. Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- include/linux/rcupdate.h | 3 ++ kernel/rcu/tasks.h | 66 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 67 insertions(+), 2 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 31e679c7110e..17463e95b6ef 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -164,6 +164,7 @@ static inline void rcu_nocb_flush_deferred_wakeup(void) { } } while (0) void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks(void); +void rcu_tasks_torture_stats_print(char *tt, char *tf); # else # define rcu_tasks_classic_qs(t, preempt) do { } while (0) # define call_rcu_tasks call_rcu @@ -190,6 +191,7 @@ void rcu_tasks_trace_qs_blkd(struct task_struct *t); rcu_tasks_trace_qs_blkd(t); \ } \ } while (0) +void rcu_tasks_trace_torture_stats_print(char *tt, char *tf); # else # define rcu_tasks_trace_qs(t) do { } while (0) # endif @@ -202,6 +204,7 @@ do { \ # ifdef CONFIG_TASKS_RUDE_RCU void synchronize_rcu_tasks_rude(void); +void rcu_tasks_rude_torture_stats_print(char *tt, char *tf); # endif #define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t, false) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d44abcd656d6..5f6d80ce1e47 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -714,9 +714,7 @@ static void __init rcu_tasks_bootup_oddness(void) #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } -#endif /* #ifndef CONFIG_TINY_RCU */ -#ifndef CONFIG_TINY_RCU /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { @@ -750,6 +748,52 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) rtp->lazy_jiffies, s); } + +/* Dump out more rcutorture-relevant state common to all RCU-tasks flavors. */ +static void rcu_tasks_torture_stats_print_generic(struct rcu_tasks *rtp, char *tt, + char *tf, char *tst) +{ + cpumask_var_t cm; + int cpu; + bool gotcb = false; + unsigned long j = jiffies; + + pr_alert("%s%s Tasks%s RCU g%ld gp_start %lu gp_jiffies %lu gp_state %d (%s).\n", + tt, tf, tst, data_race(rtp->tasks_gp_seq), + j - data_race(rtp->gp_start), j - data_race(rtp->gp_jiffies), + data_race(rtp->gp_state), tasks_gp_state_getname(rtp)); + pr_alert("\tEnqueue shift %d limit %d Dequeue limit %d gpseq %lu.\n", + data_race(rtp->percpu_enqueue_shift), + data_race(rtp->percpu_enqueue_lim), + data_race(rtp->percpu_dequeue_lim), + data_race(rtp->percpu_dequeue_gpseq)); + (void)zalloc_cpumask_var(&cm, GFP_KERNEL); + pr_alert("\tCallback counts:"); + for_each_possible_cpu(cpu) { + long n; + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + if (cpumask_available(cm) && !rcu_barrier_cb_is_done(&rtpcp->barrier_q_head)) + cpumask_set_cpu(cpu, cm); + n = rcu_segcblist_n_cbs(&rtpcp->cblist); + if (!n) + continue; + pr_cont(" %d:%ld", cpu, n); + gotcb = true; + } + if (gotcb) + pr_cont(".\n"); + else + pr_cont(" (none).\n"); + pr_alert("\tBarrier seq %lu count %d holdout CPUs ", + data_race(rtp->barrier_q_seq), atomic_read(&rtp->barrier_q_count)); + if (cpumask_available(cm) && !cpumask_empty(cm)) + pr_cont(" %*pbl.\n", cpumask_pr_args(cm)); + else + pr_cont("(none).\n"); + free_cpumask_var(cm); +} + #endif // #ifndef CONFIG_TINY_RCU static void exit_tasks_rcu_finish_trace(struct task_struct *t); @@ -1201,6 +1245,12 @@ void show_rcu_tasks_classic_gp_kthread(void) show_rcu_tasks_generic_gp_kthread(&rcu_tasks, ""); } EXPORT_SYMBOL_GPL(show_rcu_tasks_classic_gp_kthread); + +void rcu_tasks_torture_stats_print(char *tt, char *tf) +{ + rcu_tasks_torture_stats_print_generic(&rcu_tasks, tt, tf, ""); +} +EXPORT_SYMBOL_GPL(rcu_tasks_torture_stats_print); #endif // !defined(CONFIG_TINY_RCU) struct task_struct *get_rcu_tasks_gp_kthread(void) @@ -1361,6 +1411,12 @@ void show_rcu_tasks_rude_gp_kthread(void) show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, ""); } EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread); + +void rcu_tasks_rude_torture_stats_print(char *tt, char *tf) +{ + rcu_tasks_torture_stats_print_generic(&rcu_tasks_rude, tt, tf, ""); +} +EXPORT_SYMBOL_GPL(rcu_tasks_rude_torture_stats_print); #endif // !defined(CONFIG_TINY_RCU) struct task_struct *get_rcu_tasks_rude_gp_kthread(void) @@ -2038,6 +2094,12 @@ void show_rcu_tasks_trace_gp_kthread(void) show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf); } EXPORT_SYMBOL_GPL(show_rcu_tasks_trace_gp_kthread); + +void rcu_tasks_trace_torture_stats_print(char *tt, char *tf) +{ + rcu_tasks_torture_stats_print_generic(&rcu_tasks_trace, tt, tf, ""); +} +EXPORT_SYMBOL_GPL(rcu_tasks_trace_torture_stats_print); #endif // !defined(CONFIG_TINY_RCU) struct task_struct *get_rcu_tasks_trace_gp_kthread(void) From 591ce640819ff10385cd072fb19c2ca5d5181fe5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:34:26 -0700 Subject: [PATCH 50/90] rcu/tasks: Add rcu_barrier_tasks*() start time to diagnostics This commit adds the start time, in jiffies, of the most recently started rcu_barrier_tasks*() operation to the diagnostic output used by rcuscale. This information can be helpful in distinguishing a hung barrier operation from a long series of barrier operations. Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tasks.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 5f6d80ce1e47..36be92efb0da 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -87,6 +87,7 @@ struct rcu_tasks_percpu { * @barrier_q_count: Number of queues being waited on. * @barrier_q_completion: Barrier wait/wakeup mechanism. * @barrier_q_seq: Sequence number for barrier operations. + * @barrier_q_start: Most recent barrier start in jiffies. * @name: This flavor's textual name. * @kname: This flavor's kthread name. */ @@ -122,6 +123,7 @@ struct rcu_tasks { atomic_t barrier_q_count; struct completion barrier_q_completion; unsigned long barrier_q_seq; + unsigned long barrier_q_start; char *name; char *kname; }; @@ -430,6 +432,7 @@ static void __maybe_unused rcu_barrier_tasks_generic(struct rcu_tasks *rtp) mutex_unlock(&rtp->barrier_q_mutex); return; } + rtp->barrier_q_start = jiffies; rcu_seq_start(&rtp->barrier_q_seq); init_completion(&rtp->barrier_q_completion); atomic_set(&rtp->barrier_q_count, 2); @@ -785,8 +788,9 @@ static void rcu_tasks_torture_stats_print_generic(struct rcu_tasks *rtp, char *t pr_cont(".\n"); else pr_cont(" (none).\n"); - pr_alert("\tBarrier seq %lu count %d holdout CPUs ", - data_race(rtp->barrier_q_seq), atomic_read(&rtp->barrier_q_count)); + pr_alert("\tBarrier seq %lu start %lu count %d holdout CPUs ", + data_race(rtp->barrier_q_seq), j - data_race(rtp->barrier_q_start), + atomic_read(&rtp->barrier_q_count)); if (cpumask_available(cm) && !cpumask_empty(cm)) pr_cont(" %*pbl.\n", cpumask_pr_args(cm)); else From 3e49aea71d5bac93b892f954d14bc8070e4cc651 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 5 Jul 2024 19:57:47 -0700 Subject: [PATCH 51/90] refscale: Add TINY scenario This commit adds a TINY scenario in order to support tests of Tiny RCU and Tiny SRCU. Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- .../rcutorture/configs/refscale/TINY | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 tools/testing/selftests/rcutorture/configs/refscale/TINY diff --git a/tools/testing/selftests/rcutorture/configs/refscale/TINY b/tools/testing/selftests/rcutorture/configs/refscale/TINY new file mode 100644 index 000000000000..759343980b80 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/refscale/TINY @@ -0,0 +1,20 @@ +CONFIG_SMP=n +CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=n +CONFIG_PREEMPT_DYNAMIC=n +#CHECK#CONFIG_PREEMPT_RCU=n +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_HOTPLUG_CPU=n +CONFIG_SUSPEND=n +CONFIG_HIBERNATION=n +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +CONFIG_RCU_BOOST=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y +CONFIG_KPROBES=n +CONFIG_FTRACE=n From 4e39bb49c2de445848c2668d7f9fec6ba70724ae Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 21 Jul 2024 09:23:46 +0200 Subject: [PATCH 52/90] refscale: Optimize process_durations() process_durations() is not a hot path, but there is no good reason to iterate over and over the data already in 'buf'. Using a seq_buf saves some useless strcat() and the need of a temp buffer. Data is written directly at the correct place. Signed-off-by: Christophe JAILLET Tested-by: "Paul E. McKenney" Reviewed-by: Davidlohr Bueso Signed-off-by: Neeraj Upadhyay --- kernel/rcu/refscale.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index f4ea5b1ec068..cfec0648e141 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -891,32 +892,34 @@ static u64 process_durations(int n) { int i; struct reader_task *rt; - char buf1[64]; + struct seq_buf s; char *buf; u64 sum = 0; buf = kmalloc(800 + 64, GFP_KERNEL); if (!buf) return 0; - buf[0] = 0; - sprintf(buf, "Experiment #%d (Format: :)", - exp_idx); + seq_buf_init(&s, buf, 800 + 64); + + seq_buf_printf(&s, "Experiment #%d (Format: :)", + exp_idx); for (i = 0; i < n && !torture_must_stop(); i++) { rt = &(reader_tasks[i]); - sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns); if (i % 5 == 0) - strcat(buf, "\n"); - if (strlen(buf) >= 800) { - pr_alert("%s", buf); - buf[0] = 0; + seq_buf_putc(&s, '\n'); + + if (seq_buf_used(&s) >= 800) { + pr_alert("%s", seq_buf_str(&s)); + seq_buf_clear(&s); } - strcat(buf, buf1); + + seq_buf_printf(&s, "%d: %llu\t", i, rt->last_duration_ns); sum += rt->last_duration_ns; } - pr_alert("%s\n", buf); + pr_alert("%s\n", seq_buf_str(&s)); kfree(buf); return sum; From ea793764b5c677cf16f114a5ee3fb620560a78f6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:42:58 -0700 Subject: [PATCH 53/90] rcuscale: Save a few lines with whitespace-only change This whitespace-only commit fuses a few lines of code, taking advantage of the newish 100-character-per-line limit to save a few lines of code. Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcuscale.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index d534d4ec2314..3269dd9c639f 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -1015,13 +1015,9 @@ rcu_scale_init(void) } while (atomic_read(&n_rcu_scale_reader_started) < nrealreaders) schedule_timeout_uninterruptible(1); - writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]), - GFP_KERNEL); - writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), - GFP_KERNEL); - writer_n_durations = - kcalloc(nrealwriters, sizeof(*writer_n_durations), - GFP_KERNEL); + writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]), GFP_KERNEL); + writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), GFP_KERNEL); + writer_n_durations = kcalloc(nrealwriters, sizeof(*writer_n_durations), GFP_KERNEL); if (!writer_tasks || !writer_durations || !writer_n_durations) { SCALEOUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; From 42a8a2695ccbb98c15d73703cb39af70cdcf45fb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:42:59 -0700 Subject: [PATCH 54/90] rcuscale: Dump stacks of stalled rcu_scale_writer() instances This commit improves debuggability by dumping the stacks of rcu_scale_writer() instances that have not completed in a reasonable timeframe. These stacks are dumped remotely, but they will be accurate in the thus-far common case where the stalled rcu_scale_writer() instances are blocked. [ paulmck: Apply kernel test robot feedback. ] Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcuscale.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 3269dd9c639f..5087ca7062d9 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -39,6 +39,7 @@ #include #include #include +#include #include "rcu.h" @@ -111,6 +112,7 @@ static struct task_struct **reader_tasks; static struct task_struct *shutdown_task; static u64 **writer_durations; +static bool *writer_done; static int *writer_n_durations; static atomic_t n_rcu_scale_reader_started; static atomic_t n_rcu_scale_writer_started; @@ -524,6 +526,7 @@ retry: started = true; if (!done && i >= MIN_MEAS && time_after(jiffies, jdone)) { done = true; + WRITE_ONCE(writer_done[me], true); sched_set_normal(current, 0); pr_alert("%s%s rcu_scale_writer %ld has %d measurements\n", scale_type, SCALE_FLAG, me, MIN_MEAS); @@ -549,6 +552,19 @@ retry: if (done && !alldone && atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters) alldone = true; + if (done && !alldone && time_after(jiffies, jdone + HZ * 60)) { + static atomic_t dumped; + int i; + + if (!atomic_xchg(&dumped, 1)) { + for (i = 0; i < nrealwriters; i++) { + if (writer_done[i]) + continue; + pr_info("%s: Task %ld flags writer %d:\n", __func__, me, i); + sched_show_task(writer_tasks[i]); + } + } + } if (started && !alldone && i < MAX_MEAS - 1) i++; rcu_scale_wait_shutdown(); @@ -921,6 +937,8 @@ rcu_scale_cleanup(void) kfree(writer_tasks); kfree(writer_durations); kfree(writer_n_durations); + kfree(writer_done); + writer_done = NULL; } /* Do torture-type-specific cleanup operations. */ @@ -1015,10 +1033,11 @@ rcu_scale_init(void) } while (atomic_read(&n_rcu_scale_reader_started) < nrealreaders) schedule_timeout_uninterruptible(1); - writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]), GFP_KERNEL); + writer_tasks = kcalloc(nrealwriters, sizeof(writer_tasks[0]), GFP_KERNEL); writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), GFP_KERNEL); writer_n_durations = kcalloc(nrealwriters, sizeof(*writer_n_durations), GFP_KERNEL); - if (!writer_tasks || !writer_durations || !writer_n_durations) { + writer_done = kcalloc(nrealwriters, sizeof(writer_done[0]), GFP_KERNEL); + if (!writer_tasks || !writer_durations || !writer_n_durations || !writer_done) { SCALEOUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; From 48c21c020fcacba6deb065c78ac1876418b5be11 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:43:00 -0700 Subject: [PATCH 55/90] rcuscale: Dump grace-period statistics when rcu_scale_writer() stalls This commit adds a .stats function pointer to the rcu_scale_ops structure, and if this is non-NULL, it is invoked after stack traces are dumped in response to a rcu_scale_writer() stall. Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcuscale.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 5087ca7062d9..1b9a43653d7e 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -145,6 +145,7 @@ struct rcu_scale_ops { void (*sync)(void); void (*exp_sync)(void); struct task_struct *(*rso_gp_kthread)(void); + void (*stats)(void); const char *name; }; @@ -226,6 +227,11 @@ static void srcu_scale_synchronize(void) synchronize_srcu(srcu_ctlp); } +static void srcu_scale_stats(void) +{ + srcu_torture_stats_print(srcu_ctlp, scale_type, SCALE_FLAG); +} + static void srcu_scale_synchronize_expedited(void) { synchronize_srcu_expedited(srcu_ctlp); @@ -243,6 +249,7 @@ static struct rcu_scale_ops srcu_ops = { .gp_barrier = srcu_rcu_barrier, .sync = srcu_scale_synchronize, .exp_sync = srcu_scale_synchronize_expedited, + .stats = srcu_scale_stats, .name = "srcu" }; @@ -272,6 +279,7 @@ static struct rcu_scale_ops srcud_ops = { .gp_barrier = srcu_rcu_barrier, .sync = srcu_scale_synchronize, .exp_sync = srcu_scale_synchronize_expedited, + .stats = srcu_scale_stats, .name = "srcud" }; @@ -563,6 +571,8 @@ retry: pr_info("%s: Task %ld flags writer %d:\n", __func__, me, i); sched_show_task(writer_tasks[i]); } + if (cur_ops->stats) + cur_ops->stats(); } } if (started && !alldone && i < MAX_MEAS - 1) From 0616f7e981968743edb1ef758ba1a0e984240419 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:43:01 -0700 Subject: [PATCH 56/90] rcu: Mark callbacks not currently participating in barrier operation RCU keeps a count of the number of callbacks that the current rcu_barrier() is waiting on, but there is currently no easy way to work out which callback is stuck. One way to do this is to mark idle RCU-barrier callbacks by making the ->next pointer point to the callback itself, and this commit does just that. Later commits will use this for debug output. Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e641cc681901..f931171daecd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4403,6 +4403,7 @@ static void rcu_barrier_callback(struct rcu_head *rhp) { unsigned long __maybe_unused s = rcu_state.barrier_sequence; + rhp->next = rhp; // Mark the callback as having been invoked. if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) { rcu_barrier_trace(TPS("LastCB"), -1, s); complete(&rcu_state.barrier_completion); @@ -5424,6 +5425,8 @@ static void __init rcu_init_one(void) while (i > rnp->grphi) rnp++; per_cpu_ptr(&rcu_data, i)->mynode = rnp; + per_cpu_ptr(&rcu_data, i)->barrier_head.next = + &per_cpu_ptr(&rcu_data, i)->barrier_head; rcu_boot_init_percpu_data(i); } } From 674fc922f06f7632ac94536ca92dd8addb606f46 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:43:02 -0700 Subject: [PATCH 57/90] rcuscale: Print detailed grace-period and barrier diagnostics This commit uses the new rcu_tasks_torture_stats_print(), rcu_tasks_trace_torture_stats_print(), and rcu_tasks_rude_torture_stats_print() functions in order to provide detailed diagnostics on grace-period, callback, and barrier state when rcu_scale_writer() hangs. [ paulmck: Apply kernel test robot feedback. ] Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcuscale.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 1b9a43653d7e..c507750e94d8 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -298,6 +298,11 @@ static void tasks_scale_read_unlock(int idx) { } +static void rcu_tasks_scale_stats(void) +{ + rcu_tasks_torture_stats_print(scale_type, SCALE_FLAG); +} + static struct rcu_scale_ops tasks_ops = { .ptype = RCU_TASKS_FLAVOR, .init = rcu_sync_scale_init, @@ -310,6 +315,7 @@ static struct rcu_scale_ops tasks_ops = { .sync = synchronize_rcu_tasks, .exp_sync = synchronize_rcu_tasks, .rso_gp_kthread = get_rcu_tasks_gp_kthread, + .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_scale_stats, .name = "tasks" }; @@ -336,6 +342,11 @@ static void tasks_rude_scale_read_unlock(int idx) { } +static void rcu_tasks_rude_scale_stats(void) +{ + rcu_tasks_rude_torture_stats_print(scale_type, SCALE_FLAG); +} + static struct rcu_scale_ops tasks_rude_ops = { .ptype = RCU_TASKS_RUDE_FLAVOR, .init = rcu_sync_scale_init, @@ -346,6 +357,7 @@ static struct rcu_scale_ops tasks_rude_ops = { .sync = synchronize_rcu_tasks_rude, .exp_sync = synchronize_rcu_tasks_rude, .rso_gp_kthread = get_rcu_tasks_rude_gp_kthread, + .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_rude_scale_stats, .name = "tasks-rude" }; @@ -374,6 +386,11 @@ static void tasks_trace_scale_read_unlock(int idx) rcu_read_unlock_trace(); } +static void rcu_tasks_trace_scale_stats(void) +{ + rcu_tasks_trace_torture_stats_print(scale_type, SCALE_FLAG); +} + static struct rcu_scale_ops tasks_tracing_ops = { .ptype = RCU_TASKS_FLAVOR, .init = rcu_sync_scale_init, @@ -386,6 +403,7 @@ static struct rcu_scale_ops tasks_tracing_ops = { .sync = synchronize_rcu_tasks_trace, .exp_sync = synchronize_rcu_tasks_trace, .rso_gp_kthread = get_rcu_tasks_trace_gp_kthread, + .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_trace_scale_stats, .name = "tasks-tracing" }; From ac9d45544cd571decca395715d0b0a3b617d02f4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Jul 2024 13:33:58 -0700 Subject: [PATCH 58/90] locking/csd_lock: Provide an indication of ongoing CSD-lock stall If a CSD-lock stall goes on long enough, it will cause an RCU CPU stall warning. This additional warning provides much additional console-log traffic and little additional information. Therefore, provide a new csd_lock_is_stuck() function that returns true if there is an ongoing CSD-lock stall. This function will be used by the RCU CPU stall warnings to provide a one-line indication of the stall when this function returns true. [ neeraj.upadhyay: Apply Rik van Riel feedback. ] [ neeraj.upadhyay: Apply kernel test robot feedback. ] Signed-off-by: Paul E. McKenney Cc: Imran Khan Cc: Ingo Molnar Cc: Leonardo Bras Cc: "Peter Zijlstra (Intel)" Cc: Rik van Riel Signed-off-by: Neeraj Upadhyay --- include/linux/smp.h | 6 ++++++ kernel/smp.c | 16 ++++++++++++++++ lib/Kconfig.debug | 1 + 3 files changed, 23 insertions(+) diff --git a/include/linux/smp.h b/include/linux/smp.h index fcd61dfe2af3..3871bd32018f 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -294,4 +294,10 @@ int smpcfd_prepare_cpu(unsigned int cpu); int smpcfd_dead_cpu(unsigned int cpu); int smpcfd_dying_cpu(unsigned int cpu); +#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG +bool csd_lock_is_stuck(void); +#else +static inline bool csd_lock_is_stuck(void) { return false; } +#endif + #endif /* __LINUX_SMP_H */ diff --git a/kernel/smp.c b/kernel/smp.c index e87953729230..202cda4d2a55 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -208,6 +208,19 @@ static int csd_lock_wait_getcpu(call_single_data_t *csd) return -1; } +static atomic_t n_csd_lock_stuck; + +/** + * csd_lock_is_stuck - Has a CSD-lock acquisition been stuck too long? + * + * Returns @true if a CSD-lock acquisition is stuck and has been stuck + * long enough for a "non-responsive CSD lock" message to be printed. + */ +bool csd_lock_is_stuck(void) +{ + return !!atomic_read(&n_csd_lock_stuck); +} + /* * Complain if too much time spent waiting. Note that only * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU, @@ -229,6 +242,7 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in cpu = csd_lock_wait_getcpu(csd); pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n", *bug_id, raw_smp_processor_id(), cpu); + atomic_dec(&n_csd_lock_stuck); return true; } @@ -252,6 +266,8 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %lld ns for CPU#%02d %pS(%ps).\n", firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), (s64)ts_delta, cpu, csd->func, csd->info); + if (firsttime) + atomic_inc(&n_csd_lock_stuck); /* * If the CSD lock is still stuck after 5 minutes, it is unlikely * to become unstuck. Use a signed comparison to avoid triggering diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a30c03a66172..4e5f61cba8e4 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1614,6 +1614,7 @@ config SCF_TORTURE_TEST config CSD_LOCK_WAIT_DEBUG bool "Debugging for csd_lock_wait(), called from smp_call_function*()" depends on DEBUG_KERNEL + depends on SMP depends on 64BIT default n help From d40760d6811d55172ba6b90ebd2a60a75f88bffe Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 2 Jul 2024 14:32:20 -0700 Subject: [PATCH 59/90] locking/csd-lock: Use backoff for repeated reports of same incident Currently, the CSD-lock diagnostics in CONFIG_CSD_LOCK_WAIT_DEBUG=y kernels are emitted at five-second intervals. Although this has proven to be a good time interval for the first diagnostic, if the target CPU keeps interrupts disabled for way longer than five seconds, the ratio of useful new information to pointless repetition increases considerably. Therefore, back off the time period for repeated reports of the same incident, increasing linearly with the number of reports and logarithmicly with the number of online CPUs. [ paulmck: Apply Dan Carpenter feedback. ] Signed-off-by: Paul E. McKenney Cc: Imran Khan Cc: Ingo Molnar Cc: Leonardo Bras Cc: "Peter Zijlstra (Intel)" Cc: Rik van Riel Reviewed-by: Rik van Riel Signed-off-by: Neeraj Upadhyay --- kernel/smp.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/smp.c b/kernel/smp.c index 202cda4d2a55..b484ee6dcaf6 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -226,7 +226,7 @@ bool csd_lock_is_stuck(void) * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU, * so waiting on other types gets much less information. */ -static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id) +static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id, unsigned long *nmessages) { int cpu = -1; int cpux; @@ -249,7 +249,9 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in ts2 = sched_clock(); /* How long since we last checked for a stuck CSD lock.*/ ts_delta = ts2 - *ts1; - if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0)) + if (likely(ts_delta <= csd_lock_timeout_ns * (*nmessages + 1) * + (!*nmessages ? 1 : (ilog2(num_online_cpus()) / 2 + 1)) || + csd_lock_timeout_ns == 0)) return false; firsttime = !*bug_id; @@ -266,6 +268,7 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %lld ns for CPU#%02d %pS(%ps).\n", firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), (s64)ts_delta, cpu, csd->func, csd->info); + (*nmessages)++; if (firsttime) atomic_inc(&n_csd_lock_stuck); /* @@ -306,12 +309,13 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in */ static void __csd_lock_wait(call_single_data_t *csd) { + unsigned long nmessages = 0; int bug_id = 0; u64 ts0, ts1; ts1 = ts0 = sched_clock(); for (;;) { - if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id)) + if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id, &nmessages)) break; cpu_relax(); } From 9fbaa44114ca6b69074a488295e86a9fa7e685f9 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 15 Jul 2024 13:49:41 -0400 Subject: [PATCH 60/90] smp: print only local CPU info when sched_clock goes backward About 40% of all csd_lock warnings observed in our fleet appear to be due to sched_clock() going backward in time (usually only a little bit), resulting in ts0 being larger than ts2. When the local CPU is at fault, we should print out a message reflecting that, rather than trying to get the remote CPU's stack trace. Signed-off-by: Rik van Riel Tested-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/smp.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/smp.c b/kernel/smp.c index b484ee6dcaf6..f25e20617b7e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -254,6 +254,14 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in csd_lock_timeout_ns == 0)) return false; + if (ts0 > ts2) { + /* Our own sched_clock went backward; don't blame another CPU. */ + ts_delta = ts0 - ts2; + pr_alert("sched_clock on CPU %d went backward by %llu ns\n", raw_smp_processor_id(), ts_delta); + *ts1 = ts2; + return false; + } + firsttime = !*bug_id; if (firsttime) *bug_id = atomic_inc_return(&csd_bug_count); From 1dd01c06506c42ab3a8026272ee93c466ae0cb4b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Jul 2024 19:30:56 -0700 Subject: [PATCH 61/90] rcu: Summarize RCU CPU stall warnings during CSD-lock stalls During CSD-lock stalls, the additional information output by RCU CPU stall warnings is usually redundant, flooding the console for not good reason. However, this has been the way things work for a few years. This commit therefore adds an rcutree.csd_lock_suppress_rcu_stall kernel boot parameter that causes RCU CPU stall warnings to be abbreviated to a single line when there is at least one CPU that has been stuck waiting for CSD lock for more than five seconds. To make this abbreviated message happen with decent probability: tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 8 \ --configs "2*TREE01" --kconfig "CONFIG_CSD_LOCK_WAIT_DEBUG=y" \ --bootargs "csdlock_debug=1 rcutorture.stall_cpu=200 \ rcutorture.stall_cpu_holdoff=120 rcutorture.stall_cpu_irqsoff=1 \ rcutree.csd_lock_suppress_rcu_stall=1 \ rcupdate.rcu_exp_cpu_stall_timeout=5000" --trust-make [ paulmck: Apply kernel test robot feedback. ] Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- Documentation/admin-guide/kernel-parameters.txt | 4 ++++ kernel/rcu/tree_stall.h | 8 +++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f1384c7b59c9..d56356c13184 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4937,6 +4937,10 @@ Set maximum number of finished RCU callbacks to process in one batch. + rcutree.csd_lock_suppress_rcu_stall= [KNL] + Do only a one-line RCU CPU stall warning when + there is an ongoing too-long CSD-lock wait. + rcutree.do_rcu_barrier= [KNL] Request a call to rcu_barrier(). This is throttled so that userspace tests can safely diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 4b0e9d7c4c68..b497d4c6dabd 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -9,6 +9,7 @@ #include #include +#include ////////////////////////////////////////////////////////////////////////////// // @@ -719,6 +720,9 @@ static void print_cpu_stall(unsigned long gps) set_preempt_need_resched(); } +static bool csd_lock_suppress_rcu_stall; +module_param(csd_lock_suppress_rcu_stall, bool, 0644); + static void check_cpu_stall(struct rcu_data *rdp) { bool self_detected; @@ -791,7 +795,9 @@ static void check_cpu_stall(struct rcu_data *rdp) return; rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_NORM, (void *)j - gps); - if (self_detected) { + if (READ_ONCE(csd_lock_suppress_rcu_stall) && csd_lock_is_stuck()) { + pr_err("INFO: %s detected stall, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name); + } else if (self_detected) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(gps); } else { From 27d5749d075578b105e958369a0263a0e15a8d3f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Jul 2024 21:19:06 -0700 Subject: [PATCH 62/90] rcu: Extract synchronize_rcu_expedited_stall() from synchronize_rcu_expedited_wait() This commit extracts the RCU CPU stall-warning report code from synchronize_rcu_expedited_wait() and places it in a new function named synchronize_rcu_expedited_stall(). This is strictly a code-movement commit. A later commit will use this reorganization to avoid printing expedited RCU CPU stall warnings while there are ongoing CSD-lock stall reports. Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree_exp.h | 111 +++++++++++++++++++++++------------------- 1 file changed, 60 insertions(+), 51 deletions(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 4acd29d16fdb..17dd5169012d 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -542,6 +542,65 @@ static bool synchronize_rcu_expedited_wait_once(long tlimit) return false; } +/* + * Print out an expedited RCU CPU stall warning message. + */ +static void synchronize_rcu_expedited_stall(unsigned long jiffies_start, unsigned long j) +{ + int cpu; + unsigned long mask; + int ndetected; + struct rcu_node *rnp; + struct rcu_node *rnp_root = rcu_get_root(); + + pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rcu_state.name); + ndetected = 0; + rcu_for_each_leaf_node(rnp) { + ndetected += rcu_print_task_exp_stall(rnp); + for_each_leaf_node_possible_cpu(rnp, cpu) { + struct rcu_data *rdp; + + mask = leaf_node_cpu_bit(rnp, cpu); + if (!(READ_ONCE(rnp->expmask) & mask)) + continue; + ndetected++; + rdp = per_cpu_ptr(&rcu_data, cpu); + pr_cont(" %d-%c%c%c%c", cpu, + "O."[!!cpu_online(cpu)], + "o."[!!(rdp->grpmask & rnp->expmaskinit)], + "N."[!!(rdp->grpmask & rnp->expmaskinitnext)], + "D."[!!data_race(rdp->cpu_no_qs.b.exp)]); + } + } + pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", + j - jiffies_start, rcu_state.expedited_sequence, data_race(rnp_root->expmask), + ".T"[!!data_race(rnp_root->exp_tasks)]); + if (ndetected) { + pr_err("blocking rcu_node structures (internal RCU debug):"); + rcu_for_each_node_breadth_first(rnp) { + if (rnp == rnp_root) + continue; /* printed unconditionally */ + if (sync_rcu_exp_done_unlocked(rnp)) + continue; + pr_cont(" l=%u:%d-%d:%#lx/%c", + rnp->level, rnp->grplo, rnp->grphi, data_race(rnp->expmask), + ".T"[!!data_race(rnp->exp_tasks)]); + } + pr_cont("\n"); + } + rcu_for_each_leaf_node(rnp) { + for_each_leaf_node_possible_cpu(rnp, cpu) { + mask = leaf_node_cpu_bit(rnp, cpu); + if (!(READ_ONCE(rnp->expmask) & mask)) + continue; + preempt_disable(); // For smp_processor_id() in dump_cpu_task(). + dump_cpu_task(cpu); + preempt_enable(); + } + rcu_exp_print_detail_task_stall_rnp(rnp); + } +} + /* * Wait for the expedited grace period to elapse, issuing any needed * RCU CPU stall warnings along the way. @@ -553,10 +612,8 @@ static void synchronize_rcu_expedited_wait(void) unsigned long jiffies_stall; unsigned long jiffies_start; unsigned long mask; - int ndetected; struct rcu_data *rdp; struct rcu_node *rnp; - struct rcu_node *rnp_root = rcu_get_root(); unsigned long flags; trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait")); @@ -593,55 +650,7 @@ static void synchronize_rcu_expedited_wait(void) j = jiffies; rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_EXP, (void *)(j - jiffies_start)); trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall")); - pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", - rcu_state.name); - ndetected = 0; - rcu_for_each_leaf_node(rnp) { - ndetected += rcu_print_task_exp_stall(rnp); - for_each_leaf_node_possible_cpu(rnp, cpu) { - struct rcu_data *rdp; - - mask = leaf_node_cpu_bit(rnp, cpu); - if (!(READ_ONCE(rnp->expmask) & mask)) - continue; - ndetected++; - rdp = per_cpu_ptr(&rcu_data, cpu); - pr_cont(" %d-%c%c%c%c", cpu, - "O."[!!cpu_online(cpu)], - "o."[!!(rdp->grpmask & rnp->expmaskinit)], - "N."[!!(rdp->grpmask & rnp->expmaskinitnext)], - "D."[!!data_race(rdp->cpu_no_qs.b.exp)]); - } - } - pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", - j - jiffies_start, rcu_state.expedited_sequence, - data_race(rnp_root->expmask), - ".T"[!!data_race(rnp_root->exp_tasks)]); - if (ndetected) { - pr_err("blocking rcu_node structures (internal RCU debug):"); - rcu_for_each_node_breadth_first(rnp) { - if (rnp == rnp_root) - continue; /* printed unconditionally */ - if (sync_rcu_exp_done_unlocked(rnp)) - continue; - pr_cont(" l=%u:%d-%d:%#lx/%c", - rnp->level, rnp->grplo, rnp->grphi, - data_race(rnp->expmask), - ".T"[!!data_race(rnp->exp_tasks)]); - } - pr_cont("\n"); - } - rcu_for_each_leaf_node(rnp) { - for_each_leaf_node_possible_cpu(rnp, cpu) { - mask = leaf_node_cpu_bit(rnp, cpu); - if (!(READ_ONCE(rnp->expmask) & mask)) - continue; - preempt_disable(); // For smp_processor_id() in dump_cpu_task(). - dump_cpu_task(cpu); - preempt_enable(); - } - rcu_exp_print_detail_task_stall_rnp(rnp); - } + synchronize_rcu_expedited_stall(jiffies_start, j); jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3; panic_on_rcu_stall(); } From 7c72dedb0079e62c1c75dbab038332017f34a6b8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Jul 2024 21:24:29 -0700 Subject: [PATCH 63/90] rcu: Summarize expedited RCU CPU stall warnings during CSD-lock stalls During CSD-lock stalls, the additional information output by expedited RCU CPU stall warnings is usually redundant, flooding the console for not good reason. However, this has been the way things work for a few years. This commit therefore uses rcutree.csd_lock_suppress_rcu_stall kernel boot parameter that causes expedited RCU CPU stall warnings to be abbreviated to a single line when there is at least one CPU that has been stuck waiting for CSD lock for more than five seconds. Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree_exp.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 17dd5169012d..d4be644afb50 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -553,6 +553,10 @@ static void synchronize_rcu_expedited_stall(unsigned long jiffies_start, unsigne struct rcu_node *rnp; struct rcu_node *rnp_root = rcu_get_root(); + if (READ_ONCE(csd_lock_suppress_rcu_stall) && csd_lock_is_stuck()) { + pr_err("INFO: %s detected expedited stalls, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name); + return; + } pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rcu_state.name); ndetected = 0; rcu_for_each_leaf_node(rnp) { From 51b739990cc81316768b85db519eac3999ba92a9 Mon Sep 17 00:00:00 2001 From: Ryo Takakura Date: Fri, 28 Jun 2024 13:18:26 +0900 Subject: [PATCH 64/90] rcu: Let dump_cpu_task() be used without preemption disabled The commit 2d7f00b2f0130 ("rcu: Suppress smp_processor_id() complaint in synchronize_rcu_expedited_wait()") disabled preemption around dump_cpu_task() to suppress warning on its usage within preemtible context. Calling dump_cpu_task() doesn't required to be in non-preemptible context except for suppressing the smp_processor_id() warning. As the smp_processor_id() is evaluated along with in_hardirq() to check if it's in interrupt context, this patch removes the need for its preemtion disablement by reordering the condition so that smp_processor_id() only gets evaluated when it's in interrupt context. Signed-off-by: Ryo Takakura Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree_exp.h | 2 -- kernel/sched/core.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index d4be644afb50..c5d9a7eb0803 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -597,9 +597,7 @@ static void synchronize_rcu_expedited_stall(unsigned long jiffies_start, unsigne mask = leaf_node_cpu_bit(rnp, cpu); if (!(READ_ONCE(rnp->expmask) & mask)) continue; - preempt_disable(); // For smp_processor_id() in dump_cpu_task(). dump_cpu_task(cpu); - preempt_enable(); } rcu_exp_print_detail_task_stall_rnp(rnp); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a9f655025607..78ae888a1fa2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9726,7 +9726,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { void dump_cpu_task(int cpu) { - if (cpu == smp_processor_id() && in_hardirq()) { + if (in_hardirq() && cpu == smp_processor_id()) { struct pt_regs *regs; regs = get_irq_regs(); From 11377947b5861fa59bf77c827e1dd7c081842cc9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:43:03 -0700 Subject: [PATCH 65/90] rcuscale: Provide clear error when async specified without primitives Currently, if the rcuscale module's async module parameter is specified for RCU implementations that do not have async primitives such as RCU Tasks Rude (which now lacks a call_rcu_tasks_rude() function), there will be a series of splats due to calls to a NULL pointer. This commit therefore warns of this situation, but switches to non-async testing. Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcuscale.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index c507750e94d8..79e1c32d5c0f 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -525,7 +525,7 @@ rcu_scale_writer(void *arg) schedule_timeout_idle(torture_random(&tr) % writer_holdoff_jiffies + 1); wdp = &wdpp[i]; *wdp = ktime_get_mono_fast_ns(); - if (gp_async) { + if (gp_async && !WARN_ON_ONCE(!cur_ops->async)) { retry: if (!rhp) rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); @@ -597,7 +597,7 @@ retry: i++; rcu_scale_wait_shutdown(); } while (!torture_must_stop()); - if (gp_async) { + if (gp_async && cur_ops->async) { cur_ops->gp_barrier(); } writer_n_durations[me] = i_max + 1; From abaf1322adbfd4faa28072fd5412e54b9722e477 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:43:04 -0700 Subject: [PATCH 66/90] rcuscale: Make all writer tasks report upon hang This commit causes all writer tasks to provide a brief report after a hang has been reported, spaced at one-second intervals. Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcuscale.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 79e1c32d5c0f..dfe8e0faa4d8 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -483,6 +483,7 @@ rcu_scale_writer(void *arg) unsigned long jdone; long me = (long)arg; struct rcu_head *rhp = NULL; + bool selfreport = false; bool started = false, done = false, alldone = false; u64 t; DEFINE_TORTURE_RANDOM(tr); @@ -593,6 +594,11 @@ retry: cur_ops->stats(); } } + if (!selfreport && time_after(jiffies, jdone + HZ * (70 + me))) { + pr_info("%s: Writer %ld self-report: started %d done %d/%d->%d i %d jdone %lu.\n", + __func__, me, started, done, writer_done[me], atomic_read(&n_rcu_scale_writer_finished), i, jiffies - jdone); + selfreport = true; + } if (started && !alldone && i < MAX_MEAS - 1) i++; rcu_scale_wait_shutdown(); From 3e3c4f0e2753e92aa2a248d34c6866c0a264ac96 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:43:05 -0700 Subject: [PATCH 67/90] rcuscale: Make rcu_scale_writer() tolerate repeated GFP_KERNEL failure Under some conditions, kmalloc(GFP_KERNEL) allocations have been observed to repeatedly fail. This situation has been observed to cause one of the rcu_scale_writer() instances to loop indefinitely retrying memory allocation for an asynchronous grace-period primitive. The problem is that if memory is short, all the other instances will allocate all available memory before the looping task is awakened from its rcu_barrier*() call. This in turn results in hangs, so that rcuscale fails to complete. This commit therefore removes the tight retry loop, so that when this condition occurs, the affected task is still passing through the full loop with its full set of termination checks. This spreads the risk of indefinite memory-allocation retry failures across all instances of rcu_scale_writer() tasks, which in turn prevents the hangs. Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcuscale.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index dfe8e0faa4d8..80518662273b 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -520,6 +520,8 @@ rcu_scale_writer(void *arg) jdone = jiffies + minruntime * HZ; do { + bool gp_succeeded = false; + if (writer_holdoff) udelay(writer_holdoff); if (writer_holdoff_jiffies) @@ -527,23 +529,24 @@ rcu_scale_writer(void *arg) wdp = &wdpp[i]; *wdp = ktime_get_mono_fast_ns(); if (gp_async && !WARN_ON_ONCE(!cur_ops->async)) { -retry: if (!rhp) rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) { atomic_inc(this_cpu_ptr(&n_async_inflight)); cur_ops->async(rhp, rcu_scale_async_cb); rhp = NULL; + gp_succeeded = true; } else if (!kthread_should_stop()) { cur_ops->gp_barrier(); - goto retry; } else { kfree(rhp); /* Because we are stopping. */ } } else if (gp_exp) { cur_ops->exp_sync(); + gp_succeeded = true; } else { cur_ops->sync(); + gp_succeeded = true; } t = ktime_get_mono_fast_ns(); *wdp = t - *wdp; @@ -599,7 +602,7 @@ retry: __func__, me, started, done, writer_done[me], atomic_read(&n_rcu_scale_writer_finished), i, jiffies - jdone); selfreport = true; } - if (started && !alldone && i < MAX_MEAS - 1) + if (gp_succeeded && started && !alldone && i < MAX_MEAS - 1) i++; rcu_scale_wait_shutdown(); } while (!torture_must_stop()); From 1c3e6e7903fa939601fdd326b04507304ba22337 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:43:06 -0700 Subject: [PATCH 68/90] rcuscale: Use special allocator for rcu_scale_writer() The rcu_scale_writer() function needs only a fixed number of rcu_head structures per kthread, which means that a trivial allocator suffices. This commit therefore uses an llist-based allocator using a fixed array of structures per kthread. This allows aggressive testing of RCU performance without stressing the slab allocators. Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcuscale.c | 123 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 113 insertions(+), 10 deletions(-) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 80518662273b..bc7cca979c06 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -105,6 +105,19 @@ static char *scale_type = "rcu"; module_param(scale_type, charp, 0444); MODULE_PARM_DESC(scale_type, "Type of RCU to scalability-test (rcu, srcu, ...)"); +// Structure definitions for custom fixed-per-task allocator. +struct writer_mblock { + struct rcu_head wmb_rh; + struct llist_node wmb_node; + struct writer_freelist *wmb_wfl; +}; + +struct writer_freelist { + struct llist_head ws_lhg; + struct llist_head ____cacheline_internodealigned_in_smp ws_lhp; + struct writer_mblock *ws_mblocks; +}; + static int nrealreaders; static int nrealwriters; static struct task_struct **writer_tasks; @@ -113,6 +126,7 @@ static struct task_struct *shutdown_task; static u64 **writer_durations; static bool *writer_done; +static struct writer_freelist *writer_freelists; static int *writer_n_durations; static atomic_t n_rcu_scale_reader_started; static atomic_t n_rcu_scale_writer_started; @@ -463,13 +477,52 @@ rcu_scale_reader(void *arg) return 0; } +/* + * Allocate a writer_mblock structure for the specified rcu_scale_writer + * task. + */ +static struct writer_mblock *rcu_scale_alloc(long me) +{ + struct llist_node *llnp; + struct writer_freelist *wflp; + struct writer_mblock *wmbp; + + if (WARN_ON_ONCE(!writer_freelists)) + return NULL; + wflp = &writer_freelists[me]; + if (llist_empty(&wflp->ws_lhp)) { + // ->ws_lhp is private to its rcu_scale_writer task. + wmbp = container_of(llist_del_all(&wflp->ws_lhg), struct writer_mblock, wmb_node); + wflp->ws_lhp.first = &wmbp->wmb_node; + } + llnp = llist_del_first(&wflp->ws_lhp); + if (!llnp) + return NULL; + return container_of(llnp, struct writer_mblock, wmb_node); +} + +/* + * Free a writer_mblock structure to its rcu_scale_writer task. + */ +static void rcu_scale_free(struct writer_mblock *wmbp) +{ + struct writer_freelist *wflp; + + if (!wmbp) + return; + wflp = wmbp->wmb_wfl; + llist_add(&wmbp->wmb_node, &wflp->ws_lhg); +} + /* * Callback function for asynchronous grace periods from rcu_scale_writer(). */ static void rcu_scale_async_cb(struct rcu_head *rhp) { + struct writer_mblock *wmbp = container_of(rhp, struct writer_mblock, wmb_rh); + atomic_dec(this_cpu_ptr(&n_async_inflight)); - kfree(rhp); + rcu_scale_free(wmbp); } /* @@ -482,13 +535,13 @@ rcu_scale_writer(void *arg) int i_max; unsigned long jdone; long me = (long)arg; - struct rcu_head *rhp = NULL; bool selfreport = false; bool started = false, done = false, alldone = false; u64 t; DEFINE_TORTURE_RANDOM(tr); u64 *wdp; u64 *wdpp = writer_durations[me]; + struct writer_mblock *wmbp = NULL; VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started"); WARN_ON(!wdpp); @@ -529,17 +582,18 @@ rcu_scale_writer(void *arg) wdp = &wdpp[i]; *wdp = ktime_get_mono_fast_ns(); if (gp_async && !WARN_ON_ONCE(!cur_ops->async)) { - if (!rhp) - rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); - if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) { + if (!wmbp) + wmbp = rcu_scale_alloc(me); + if (wmbp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) { atomic_inc(this_cpu_ptr(&n_async_inflight)); - cur_ops->async(rhp, rcu_scale_async_cb); - rhp = NULL; + cur_ops->async(&wmbp->wmb_rh, rcu_scale_async_cb); + wmbp = NULL; gp_succeeded = true; } else if (!kthread_should_stop()) { cur_ops->gp_barrier(); } else { - kfree(rhp); /* Because we are stopping. */ + rcu_scale_free(wmbp); /* Because we are stopping. */ + wmbp = NULL; } } else if (gp_exp) { cur_ops->exp_sync(); @@ -607,6 +661,7 @@ rcu_scale_writer(void *arg) rcu_scale_wait_shutdown(); } while (!torture_must_stop()); if (gp_async && cur_ops->async) { + rcu_scale_free(wmbp); cur_ops->gp_barrier(); } writer_n_durations[me] = i_max + 1; @@ -970,12 +1025,30 @@ rcu_scale_cleanup(void) schedule_timeout_uninterruptible(1); } kfree(writer_durations[i]); + if (writer_freelists) { + int ctr = 0; + struct llist_node *llnp; + struct writer_freelist *wflp = &writer_freelists[i]; + + if (wflp->ws_mblocks) { + llist_for_each(llnp, wflp->ws_lhg.first) + ctr++; + llist_for_each(llnp, wflp->ws_lhp.first) + ctr++; + WARN_ONCE(ctr != gp_async_max, + "%s: ctr = %d gp_async_max = %d\n", + __func__, ctr, gp_async_max); + kfree(wflp->ws_mblocks); + } + } } kfree(writer_tasks); kfree(writer_durations); kfree(writer_n_durations); kfree(writer_done); writer_done = NULL; + kfree(writer_freelists); + writer_freelists = NULL; } /* Do torture-type-specific cleanup operations. */ @@ -1002,8 +1075,9 @@ rcu_scale_shutdown(void *arg) static int __init rcu_scale_init(void) { - long i; int firsterr = 0; + long i; + long j; static struct rcu_scale_ops *scale_ops[] = { &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS }; @@ -1074,7 +1148,18 @@ rcu_scale_init(void) writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), GFP_KERNEL); writer_n_durations = kcalloc(nrealwriters, sizeof(*writer_n_durations), GFP_KERNEL); writer_done = kcalloc(nrealwriters, sizeof(writer_done[0]), GFP_KERNEL); - if (!writer_tasks || !writer_durations || !writer_n_durations || !writer_done) { + if (gp_async) { + if (gp_async_max <= 0) { + pr_warn("%s: gp_async_max = %d must be greater than zero.\n", + __func__, gp_async_max); + WARN_ON_ONCE(IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)); + firsterr = -EINVAL; + goto unwind; + } + writer_freelists = kcalloc(nrealwriters, sizeof(writer_freelists[0]), GFP_KERNEL); + } + if (!writer_tasks || !writer_durations || !writer_n_durations || !writer_done || + (gp_async && !writer_freelists)) { SCALEOUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; @@ -1087,6 +1172,24 @@ rcu_scale_init(void) firsterr = -ENOMEM; goto unwind; } + if (writer_freelists) { + struct writer_freelist *wflp = &writer_freelists[i]; + + init_llist_head(&wflp->ws_lhg); + init_llist_head(&wflp->ws_lhp); + wflp->ws_mblocks = kcalloc(gp_async_max, sizeof(wflp->ws_mblocks[0]), + GFP_KERNEL); + if (!wflp->ws_mblocks) { + firsterr = -ENOMEM; + goto unwind; + } + for (j = 0; j < gp_async_max; j++) { + struct writer_mblock *wmbp = &wflp->ws_mblocks[j]; + + wmbp->wmb_wfl = wflp; + llist_add(&wmbp->wmb_node, &wflp->ws_lhp); + } + } firsterr = torture_create_kthread(rcu_scale_writer, (void *)i, writer_tasks[i]); if (torture_init_error(firsterr)) From 554f07a119866a78606b36123fb28e9451b1f994 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:43:07 -0700 Subject: [PATCH 69/90] rcuscale: NULL out top-level pointers to heap memory Currently, if someone modprobes and rmmods rcuscale successfully, but the next run errors out during the modprobe, non-NULL pointers to freed memory will remain. If the run after that also errors out during the modprobe, there will be double-free bugs. This commit therefore NULLs out top-level pointers to memory that has just been freed. Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcuscale.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index bc7cca979c06..61a178914256 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -819,6 +819,7 @@ kfree_scale_cleanup(void) torture_stop_kthread(kfree_scale_thread, kfree_reader_tasks[i]); kfree(kfree_reader_tasks); + kfree_reader_tasks = NULL; } torture_cleanup_end(); @@ -987,6 +988,7 @@ rcu_scale_cleanup(void) torture_stop_kthread(rcu_scale_reader, reader_tasks[i]); kfree(reader_tasks); + reader_tasks = NULL; } if (writer_tasks) { @@ -1043,8 +1045,11 @@ rcu_scale_cleanup(void) } } kfree(writer_tasks); + writer_tasks = NULL; kfree(writer_durations); + writer_durations = NULL; kfree(writer_n_durations); + writer_n_durations = NULL; kfree(writer_done); writer_done = NULL; kfree(writer_freelists); From f1fd0e0bb12d09ca0564425b8af76fddf2e380fb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:43:08 -0700 Subject: [PATCH 70/90] rcuscale: Count outstanding callbacks per-task rather than per-CPU The current rcu_scale_writer() asynchronous grace-period testing uses a per-CPU counter to track the number of outstanding callbacks. This is subject to CPU-imbalance errors when tasks migrate from one CPU to another between the time that the counter is incremented and the callback is queued, and additionally in kernels configured such that callbacks can be invoked on some CPU other than the one that queued it. This commit therefore arranges for per-task callback counts, thus avoiding any issues with migration of either tasks or callbacks. Reported-by: Vlastimil Babka Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcuscale.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 61a178914256..6d37596deb1f 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -114,6 +114,7 @@ struct writer_mblock { struct writer_freelist { struct llist_head ws_lhg; + atomic_t ws_inflight; struct llist_head ____cacheline_internodealigned_in_smp ws_lhp; struct writer_mblock *ws_mblocks; }; @@ -136,7 +137,6 @@ static u64 t_rcu_scale_writer_started; static u64 t_rcu_scale_writer_finished; static unsigned long b_rcu_gp_test_started; static unsigned long b_rcu_gp_test_finished; -static DEFINE_PER_CPU(atomic_t, n_async_inflight); #define MAX_MEAS 10000 #define MIN_MEAS 100 @@ -520,8 +520,9 @@ static void rcu_scale_free(struct writer_mblock *wmbp) static void rcu_scale_async_cb(struct rcu_head *rhp) { struct writer_mblock *wmbp = container_of(rhp, struct writer_mblock, wmb_rh); + struct writer_freelist *wflp = wmbp->wmb_wfl; - atomic_dec(this_cpu_ptr(&n_async_inflight)); + atomic_dec(&wflp->ws_inflight); rcu_scale_free(wmbp); } @@ -541,6 +542,7 @@ rcu_scale_writer(void *arg) DEFINE_TORTURE_RANDOM(tr); u64 *wdp; u64 *wdpp = writer_durations[me]; + struct writer_freelist *wflp = &writer_freelists[me]; struct writer_mblock *wmbp = NULL; VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started"); @@ -584,8 +586,8 @@ rcu_scale_writer(void *arg) if (gp_async && !WARN_ON_ONCE(!cur_ops->async)) { if (!wmbp) wmbp = rcu_scale_alloc(me); - if (wmbp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) { - atomic_inc(this_cpu_ptr(&n_async_inflight)); + if (wmbp && atomic_read(&wflp->ws_inflight) < gp_async_max) { + atomic_inc(&wflp->ws_inflight); cur_ops->async(&wmbp->wmb_rh, rcu_scale_async_cb); wmbp = NULL; gp_succeeded = true; From 8f35fefad06323ab8ad5915bb7c6bc1a684cb8b5 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 6 Aug 2024 15:30:16 +0200 Subject: [PATCH 71/90] refscale: Constify struct ref_scale_ops 'struct ref_scale_ops' are not modified in these drivers. Constifying this structure moves some data to a read-only section, so increase overall security. On a x86_64, with allmodconfig: Before: ====== text data bss dec hex filename 34231 4167 736 39134 98de kernel/rcu/refscale.o After: ===== text data bss dec hex filename 35175 3239 736 39150 98ee kernel/rcu/refscale.o Signed-off-by: Christophe JAILLET Tested-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/refscale.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index cfec0648e141..0db9db73f57f 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -135,7 +135,7 @@ struct ref_scale_ops { const char *name; }; -static struct ref_scale_ops *cur_ops; +static const struct ref_scale_ops *cur_ops; static void un_delay(const int udl, const int ndl) { @@ -171,7 +171,7 @@ static bool rcu_sync_scale_init(void) return true; } -static struct ref_scale_ops rcu_ops = { +static const struct ref_scale_ops rcu_ops = { .init = rcu_sync_scale_init, .readsection = ref_rcu_read_section, .delaysection = ref_rcu_delay_section, @@ -205,7 +205,7 @@ static void srcu_ref_scale_delay_section(const int nloops, const int udl, const } } -static struct ref_scale_ops srcu_ops = { +static const struct ref_scale_ops srcu_ops = { .init = rcu_sync_scale_init, .readsection = srcu_ref_scale_read_section, .delaysection = srcu_ref_scale_delay_section, @@ -232,7 +232,7 @@ static void rcu_tasks_ref_scale_delay_section(const int nloops, const int udl, c un_delay(udl, ndl); } -static struct ref_scale_ops rcu_tasks_ops = { +static const struct ref_scale_ops rcu_tasks_ops = { .init = rcu_sync_scale_init, .readsection = rcu_tasks_ref_scale_read_section, .delaysection = rcu_tasks_ref_scale_delay_section, @@ -271,7 +271,7 @@ static void rcu_trace_ref_scale_delay_section(const int nloops, const int udl, c } } -static struct ref_scale_ops rcu_trace_ops = { +static const struct ref_scale_ops rcu_trace_ops = { .init = rcu_sync_scale_init, .readsection = rcu_trace_ref_scale_read_section, .delaysection = rcu_trace_ref_scale_delay_section, @@ -310,7 +310,7 @@ static void ref_refcnt_delay_section(const int nloops, const int udl, const int } } -static struct ref_scale_ops refcnt_ops = { +static const struct ref_scale_ops refcnt_ops = { .init = rcu_sync_scale_init, .readsection = ref_refcnt_section, .delaysection = ref_refcnt_delay_section, @@ -347,7 +347,7 @@ static void ref_rwlock_delay_section(const int nloops, const int udl, const int } } -static struct ref_scale_ops rwlock_ops = { +static const struct ref_scale_ops rwlock_ops = { .init = ref_rwlock_init, .readsection = ref_rwlock_section, .delaysection = ref_rwlock_delay_section, @@ -384,7 +384,7 @@ static void ref_rwsem_delay_section(const int nloops, const int udl, const int n } } -static struct ref_scale_ops rwsem_ops = { +static const struct ref_scale_ops rwsem_ops = { .init = ref_rwsem_init, .readsection = ref_rwsem_section, .delaysection = ref_rwsem_delay_section, @@ -419,7 +419,7 @@ static void ref_lock_delay_section(const int nloops, const int udl, const int nd preempt_enable(); } -static struct ref_scale_ops lock_ops = { +static const struct ref_scale_ops lock_ops = { .readsection = ref_lock_section, .delaysection = ref_lock_delay_section, .name = "lock" @@ -454,7 +454,7 @@ static void ref_lock_irq_delay_section(const int nloops, const int udl, const in preempt_enable(); } -static struct ref_scale_ops lock_irq_ops = { +static const struct ref_scale_ops lock_irq_ops = { .readsection = ref_lock_irq_section, .delaysection = ref_lock_irq_delay_section, .name = "lock-irq" @@ -490,7 +490,7 @@ static void ref_acqrel_delay_section(const int nloops, const int udl, const int preempt_enable(); } -static struct ref_scale_ops acqrel_ops = { +static const struct ref_scale_ops acqrel_ops = { .readsection = ref_acqrel_section, .delaysection = ref_acqrel_delay_section, .name = "acqrel" @@ -524,7 +524,7 @@ static void ref_clock_delay_section(const int nloops, const int udl, const int n stopopts = x; } -static struct ref_scale_ops clock_ops = { +static const struct ref_scale_ops clock_ops = { .readsection = ref_clock_section, .delaysection = ref_clock_delay_section, .name = "clock" @@ -556,7 +556,7 @@ static void ref_jiffies_delay_section(const int nloops, const int udl, const int stopopts = x; } -static struct ref_scale_ops jiffies_ops = { +static const struct ref_scale_ops jiffies_ops = { .readsection = ref_jiffies_section, .delaysection = ref_jiffies_delay_section, .name = "jiffies" @@ -706,9 +706,9 @@ static void refscale_typesafe_ctor(void *rtsp_in) preempt_enable(); } -static struct ref_scale_ops typesafe_ref_ops; -static struct ref_scale_ops typesafe_lock_ops; -static struct ref_scale_ops typesafe_seqlock_ops; +static const struct ref_scale_ops typesafe_ref_ops; +static const struct ref_scale_ops typesafe_lock_ops; +static const struct ref_scale_ops typesafe_seqlock_ops; // Initialize for a typesafe test. static bool typesafe_init(void) @@ -769,7 +769,7 @@ static void typesafe_cleanup(void) } // The typesafe_init() function distinguishes these structures by address. -static struct ref_scale_ops typesafe_ref_ops = { +static const struct ref_scale_ops typesafe_ref_ops = { .init = typesafe_init, .cleanup = typesafe_cleanup, .readsection = typesafe_read_section, @@ -777,7 +777,7 @@ static struct ref_scale_ops typesafe_ref_ops = { .name = "typesafe_ref" }; -static struct ref_scale_ops typesafe_lock_ops = { +static const struct ref_scale_ops typesafe_lock_ops = { .init = typesafe_init, .cleanup = typesafe_cleanup, .readsection = typesafe_read_section, @@ -785,7 +785,7 @@ static struct ref_scale_ops typesafe_lock_ops = { .name = "typesafe_lock" }; -static struct ref_scale_ops typesafe_seqlock_ops = { +static const struct ref_scale_ops typesafe_seqlock_ops = { .init = typesafe_init, .cleanup = typesafe_cleanup, .readsection = typesafe_read_section, @@ -1026,7 +1026,7 @@ end: } static void -ref_scale_print_module_parms(struct ref_scale_ops *cur_ops, const char *tag) +ref_scale_print_module_parms(const struct ref_scale_ops *cur_ops, const char *tag) { pr_alert("%s" SCALE_FLAG "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, @@ -1081,7 +1081,7 @@ ref_scale_init(void) { long i; int firsterr = 0; - static struct ref_scale_ops *scale_ops[] = { + static const struct ref_scale_ops *scale_ops[] = { &rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, &jiffies_ops, &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, From b1b91fd1bece511dfe05d41675c624a4506e409d Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 16 Apr 2024 10:53:45 +0200 Subject: [PATCH 72/90] context_tracking, rcu: Rename rcu_dynticks_task*() into rcu_task*() The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, and the 'dynticks' prefix can be dropped without losing any meaning. While at it, flip the suffixes of these helpers. We are not telling that we are entering dynticks mode from an RCU-task perspective anymore; we are telling that we are exiting RCU-tasks because we are in eqs mode. [ neeraj.upadhyay: Incorporate Frederic Weisbecker feedback. ] Suggested-by: Frederic Weisbecker Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- kernel/context_tracking.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 8262f57a4363..00c8f128885a 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -38,24 +38,24 @@ EXPORT_SYMBOL_GPL(context_tracking); #ifdef CONFIG_CONTEXT_TRACKING_IDLE #define TPS(x) tracepoint_string(x) -/* Record the current task on dyntick-idle entry. */ -static __always_inline void rcu_dynticks_task_enter(void) +/* Record the current task on exiting RCU-tasks (dyntick-idle entry). */ +static __always_inline void rcu_task_exit(void) { #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id()); #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ } -/* Record no current task on dyntick-idle exit. */ -static __always_inline void rcu_dynticks_task_exit(void) +/* Record no current task on entering RCU-tasks (dyntick-idle exit). */ +static __always_inline void rcu_task_enter(void) { #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) WRITE_ONCE(current->rcu_tasks_idle_cpu, -1); #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ } -/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */ -static __always_inline void rcu_dynticks_task_trace_enter(void) +/* Turn on heavyweight RCU tasks trace readers on kernel exit. */ +static __always_inline void rcu_task_trace_heavyweight_enter(void) { #ifdef CONFIG_TASKS_TRACE_RCU if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) @@ -63,8 +63,8 @@ static __always_inline void rcu_dynticks_task_trace_enter(void) #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } -/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */ -static __always_inline void rcu_dynticks_task_trace_exit(void) +/* Turn off heavyweight RCU tasks trace readers on kernel entry. */ +static __always_inline void rcu_task_trace_heavyweight_exit(void) { #ifdef CONFIG_TASKS_TRACE_RCU if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) @@ -87,7 +87,7 @@ static noinstr void ct_kernel_exit_state(int offset) * critical sections, and we also must force ordering with the * next idle sojourn. */ - rcu_dynticks_task_trace_enter(); // Before ->dynticks update! + rcu_task_trace_heavyweight_enter(); // Before CT state update! seq = ct_state_inc(offset); // RCU is no longer watching. Better be in extended quiescent state! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & CT_RCU_WATCHING)); @@ -109,7 +109,7 @@ static noinstr void ct_kernel_enter_state(int offset) */ seq = ct_state_inc(offset); // RCU is now watching. Better not be in an extended quiescent state! - rcu_dynticks_task_trace_exit(); // After ->dynticks update! + rcu_task_trace_heavyweight_exit(); // After CT state update! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & CT_RCU_WATCHING)); } @@ -149,7 +149,7 @@ static void noinstr ct_kernel_exit(bool user, int offset) // RCU is watching here ... ct_kernel_exit_state(offset); // ... but is no longer watching here. - rcu_dynticks_task_enter(); + rcu_task_exit(); } /* @@ -173,7 +173,7 @@ static void noinstr ct_kernel_enter(bool user, int offset) ct->nesting++; return; } - rcu_dynticks_task_exit(); + rcu_task_enter(); // RCU is not watching here ... ct_kernel_enter_state(offset); // ... but is watching here. @@ -240,7 +240,7 @@ void noinstr ct_nmi_exit(void) // ... but is no longer watching here. if (!in_nmi()) - rcu_dynticks_task_enter(); + rcu_task_exit(); } /** @@ -274,7 +274,7 @@ void noinstr ct_nmi_enter(void) if (rcu_dynticks_curr_cpu_in_eqs()) { if (!in_nmi()) - rcu_dynticks_task_exit(); + rcu_task_enter(); // RCU is not watching here ... ct_kernel_enter_state(CT_RCU_WATCHING); From fda70207135b60dd1a7c8f16cc036bb1cc344490 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 16 Apr 2024 11:01:23 +0200 Subject: [PATCH 73/90] context_tracking, rcu: Rename rcu_dynticks_curr_cpu_in_eqs() into rcu_is_watching_curr_cpu() The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, reflect that change in the related helpers. Note that "watching" is the opposite of "in EQS", so the negation is lifted out of the helper and into the callsites. Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- include/linux/context_tracking.h | 12 ++++++++---- kernel/context_tracking.c | 6 +++--- kernel/rcu/tree.c | 6 +++--- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index a6c36780cc3b..d53092ffa9db 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -113,13 +113,17 @@ extern void ct_idle_enter(void); extern void ct_idle_exit(void); /* - * Is the current CPU in an extended quiescent state? + * Is RCU watching the current CPU (IOW, it is not in an extended quiescent state)? + * + * Note that this returns the actual boolean data (watching / not watching), + * whereas ct_rcu_watching() returns the RCU_WATCHING subvariable of + * context_tracking.state. * * No ordering, as we are sampling CPU-local information. */ -static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) +static __always_inline bool rcu_is_watching_curr_cpu(void) { - return !(raw_atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_RCU_WATCHING); + return raw_atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_RCU_WATCHING; } /* @@ -140,7 +144,7 @@ static __always_inline bool warn_rcu_enter(void) * lots of the actual reporting also relies on RCU. */ preempt_disable_notrace(); - if (rcu_dynticks_curr_cpu_in_eqs()) { + if (!rcu_is_watching_curr_cpu()) { ret = true; ct_state_inc(CT_RCU_WATCHING); } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 00c8f128885a..31c6d0c1b79d 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -212,7 +212,7 @@ void noinstr ct_nmi_exit(void) * to us!) */ WARN_ON_ONCE(ct_nmi_nesting() <= 0); - WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs()); + WARN_ON_ONCE(!rcu_is_watching_curr_cpu()); /* * If the nesting level is not 1, the CPU wasn't RCU-idle, so @@ -271,7 +271,7 @@ void noinstr ct_nmi_enter(void) * to be in the outermost NMI handler that interrupted an RCU-idle * period (observation due to Andy Lutomirski). */ - if (rcu_dynticks_curr_cpu_in_eqs()) { + if (!rcu_is_watching_curr_cpu()) { if (!in_nmi()) rcu_task_enter(); @@ -281,7 +281,7 @@ void noinstr ct_nmi_enter(void) // ... but is watching here. instrumentation_begin(); - // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs() + // instrumentation for the noinstr rcu_is_watching_curr_cpu() instrument_atomic_read(&ct->state, sizeof(ct->state)); // instrumentation for the noinstr ct_kernel_enter_state() instrument_atomic_write(&ct->state, sizeof(ct->state)); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4778c873f4ac..68eca7d3fdd1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -601,7 +601,7 @@ void rcu_irq_exit_check_preempt(void) RCU_LOCKDEP_WARN(ct_nmi_nesting() != CT_NESTING_IRQ_NONIDLE, "Bad RCU nmi_nesting counter\n"); - RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), + RCU_LOCKDEP_WARN(!rcu_is_watching_curr_cpu(), "RCU in extended quiescent state!"); } #endif /* #ifdef CONFIG_PROVE_RCU */ @@ -641,7 +641,7 @@ void __rcu_irq_enter_check_tick(void) if (in_nmi()) return; - RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), + RCU_LOCKDEP_WARN(!rcu_is_watching_curr_cpu(), "Illegal rcu_irq_enter_check_tick() from extended quiescent state"); if (!tick_nohz_full_cpu(rdp->cpu) || @@ -723,7 +723,7 @@ notrace bool rcu_is_watching(void) bool ret; preempt_disable_notrace(); - ret = !rcu_dynticks_curr_cpu_in_eqs(); + ret = rcu_is_watching_curr_cpu(); preempt_enable_notrace(); return ret; } From 654b578e4db04a68f9bfd0532edaeb7a87abc945 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 16 Apr 2024 12:25:31 +0200 Subject: [PATCH 74/90] rcu: Rename rcu_dynticks_eqs_online() into rcu_watching_online() The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, reflect that change in the related helpers. Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 68eca7d3fdd1..47ec9f2daaed 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -283,16 +283,16 @@ void rcu_softirq_qs(void) } /* - * Reset the current CPU's ->dynticks counter to indicate that the + * Reset the current CPU's RCU_WATCHING counter to indicate that the * newly onlined CPU is no longer in an extended quiescent state. * This will either leave the counter unchanged, or increment it * to the next non-quiescent value. * * The non-atomic test/increment sequence works because the upper bits - * of the ->dynticks counter are manipulated only by the corresponding CPU, + * of the ->state variable are manipulated only by the corresponding CPU, * or when the corresponding CPU is offline. */ -static void rcu_dynticks_eqs_online(void) +static void rcu_watching_online(void) { if (ct_rcu_watching() & CT_RCU_WATCHING) return; @@ -5058,7 +5058,7 @@ void rcutree_report_cpu_starting(unsigned int cpu) rnp = rdp->mynode; mask = rdp->grpmask; arch_spin_lock(&rcu_state.ofl_lock); - rcu_dynticks_eqs_online(); + rcu_watching_online(); raw_spin_lock(&rcu_state.barrier_lock); raw_spin_lock_rcu_node(rnp); WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask); From 9629936d06d05540d4dc83cee1167422a477a818 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 16 Apr 2024 17:18:18 +0200 Subject: [PATCH 75/90] rcu: Rename rcu_dynticks_in_eqs() into rcu_watching_snap_in_eqs() The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, reflect that change in the related helpers. While at it, update a comment that still refers to rcu_dynticks_snap(), which was removed by commit: 7be2e6323b9b ("rcu: Remove full memory barrier on RCU stall printout") Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree.c | 8 ++++---- kernel/rcu/tree_exp.h | 2 +- kernel/rcu/tree_stall.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 47ec9f2daaed..73516d76b70a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -300,10 +300,10 @@ static void rcu_watching_online(void) } /* - * Return true if the snapshot returned from rcu_dynticks_snap() + * Return true if the snapshot returned from ct_rcu_watching() * indicates that RCU is in an extended quiescent state. */ -static bool rcu_dynticks_in_eqs(int snap) +static bool rcu_watching_snap_in_eqs(int snap) { return !(snap & CT_RCU_WATCHING); } @@ -783,7 +783,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) * updater's accesses is enforced by the below acquire semantic. */ rdp->dynticks_snap = ct_rcu_watching_cpu_acquire(rdp->cpu); - if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { + if (rcu_watching_snap_in_eqs(rdp->dynticks_snap)) { trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti")); rcu_gpnum_ovf(rdp->mynode, rdp); return 1; @@ -4805,7 +4805,7 @@ rcu_boot_init_percpu_data(int cpu) rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); INIT_WORK(&rdp->strict_work, strict_work_handler); WARN_ON_ONCE(ct->nesting != 1); - WARN_ON_ONCE(rcu_dynticks_in_eqs(ct_rcu_watching_cpu(cpu))); + WARN_ON_ONCE(rcu_watching_snap_in_eqs(ct_rcu_watching_cpu(cpu))); rdp->barrier_seq_snap = rcu_state.barrier_sequence; rdp->rcu_ofl_gp_seq = rcu_state.gp_seq; rdp->rcu_ofl_gp_state = RCU_GP_CLEANED; diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index daa87fec703f..137559c84309 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -377,7 +377,7 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp) * below acquire semantic. */ snap = ct_rcu_watching_cpu_acquire(cpu); - if (rcu_dynticks_in_eqs(snap)) + if (rcu_watching_snap_in_eqs(snap)) mask_ofl_test |= mask; else rdp->exp_dynticks_snap = snap; diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index ec49f0155bec..e933c6a58d5f 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -501,7 +501,7 @@ static void print_cpu_stall_info(int cpu) } delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); falsepositive = rcu_is_gp_kthread_starving(NULL) && - rcu_dynticks_in_eqs(ct_rcu_watching_cpu(cpu)); + rcu_watching_snap_in_eqs(ct_rcu_watching_cpu(cpu)); rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j); if (rcuc_starved) // Print signed value, as negative values indicate a probable bug. From 3116a32eb40464de61828787c21f0629a1c26cfa Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 16 Apr 2024 17:21:25 +0200 Subject: [PATCH 76/90] rcu: Rename rcu_dynticks_in_eqs_since() into rcu_watching_snap_stopped_since() The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, the dynticks prefix can go. While at it, this helper is only meant to be called after failing an earlier call to rcu_watching_snap_in_eqs(), document this in the comments and add a WARN_ON_ONCE() for good measure. Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- .../Tree-RCU-Memory-Ordering.rst | 2 +- kernel/rcu/tree.c | 23 ++++++++++++++----- kernel/rcu/tree_exp.h | 2 +- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst index 2d7036ad7476..7163d0def34e 100644 --- a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst +++ b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst @@ -150,7 +150,7 @@ This case is handled by calls to the strongly ordered is invoked within ``rcu_dynticks_eqs_enter()`` at idle-entry time and within ``rcu_dynticks_eqs_exit()`` at idle-exit time. The grace-period kthread invokes first ``ct_rcu_watching_cpu_acquire()`` -(preceded by a full memory barrier) and ``rcu_dynticks_in_eqs_since()`` +(preceded by a full memory barrier) and ``rcu_watching_snap_stopped_since()`` (both of which rely on acquire semantics) to detect idle CPUs. +-----------------------------------------------------------------------+ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 73516d76b70a..93570cb5e99e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -308,12 +308,20 @@ static bool rcu_watching_snap_in_eqs(int snap) return !(snap & CT_RCU_WATCHING); } -/* - * Return true if the CPU corresponding to the specified rcu_data - * structure has spent some time in an extended quiescent state since - * rcu_dynticks_snap() returned the specified snapshot. +/** + * rcu_watching_snap_stopped_since() - Has RCU stopped watching a given CPU + * since the specified @snap? + * + * @rdp: The rcu_data corresponding to the CPU for which to check EQS. + * @snap: rcu_watching snapshot taken when the CPU wasn't in an EQS. + * + * Returns true if the CPU corresponding to @rdp has spent some time in an + * extended quiescent state since @snap. Note that this doesn't check if it + * /still/ is in an EQS, just that it went through one since @snap. + * + * This is meant to be used in a loop waiting for a CPU to go through an EQS. */ -static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap) +static bool rcu_watching_snap_stopped_since(struct rcu_data *rdp, int snap) { /* * The first failing snapshot is already ordered against the accesses @@ -323,6 +331,9 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap) * performed by the remote CPU prior to entering idle and therefore can * rely solely on acquire semantics. */ + if (WARN_ON_ONCE(rcu_watching_snap_in_eqs(snap))) + return true; + return snap != ct_rcu_watching_cpu_acquire(rdp->cpu); } @@ -815,7 +826,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * read-side critical section that started before the beginning * of the current RCU grace period. */ - if (rcu_dynticks_in_eqs_since(rdp, rdp->dynticks_snap)) { + if (rcu_watching_snap_stopped_since(rdp, rdp->dynticks_snap)) { trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti")); rcu_gpnum_ovf(rnp, rdp); return 1; diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 137559c84309..48ad8b868d83 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -400,7 +400,7 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp) unsigned long mask = rdp->grpmask; retry_ipi: - if (rcu_dynticks_in_eqs_since(rdp, rdp->exp_dynticks_snap)) { + if (rcu_watching_snap_stopped_since(rdp, rdp->exp_dynticks_snap)) { mask_ofl_test |= mask; continue; } From fc1096ab1f318fd7cabb842279e88978bb31768e Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 29 Apr 2024 13:11:34 +0200 Subject: [PATCH 77/90] rcu: Rename rcu_dynticks_zero_in_eqs() into rcu_watching_zero_in_eqs() The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, reflect that change in the related helpers. Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- kernel/rcu/rcu.h | 4 ++-- kernel/rcu/tasks.h | 2 +- kernel/rcu/tree.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 38238e595a61..5564402af4cb 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -606,7 +606,7 @@ void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags, #endif #ifdef CONFIG_TINY_RCU -static inline bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) { return false; } +static inline bool rcu_watching_zero_in_eqs(int cpu, int *vp) { return false; } static inline unsigned long rcu_get_gp_seq(void) { return 0; } static inline unsigned long rcu_exp_batches_completed(void) { return 0; } static inline unsigned long @@ -619,7 +619,7 @@ static inline void rcu_fwd_progress_check(unsigned long j) { } static inline void rcu_gp_slow_register(atomic_t *rgssp) { } static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { } #else /* #ifdef CONFIG_TINY_RCU */ -bool rcu_dynticks_zero_in_eqs(int cpu, int *vp); +bool rcu_watching_zero_in_eqs(int cpu, int *vp); unsigned long rcu_get_gp_seq(void); unsigned long rcu_exp_batches_completed(void); unsigned long srcu_batches_completed(struct srcu_struct *sp); diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index ba3440a45b6d..2484c1a8e051 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1613,7 +1613,7 @@ static int trc_inspect_reader(struct task_struct *t, void *bhp_in) // However, we cannot safely change its state. n_heavy_reader_attempts++; // Check for "running" idle tasks on offline CPUs. - if (!rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting)) + if (!rcu_watching_zero_in_eqs(cpu, &t->trc_reader_nesting)) return -EINVAL; // No quiescent state, do it the hard way. n_heavy_reader_updates++; nesting = 0; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 93570cb5e99e..ac3577ac10bd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -341,7 +341,7 @@ static bool rcu_watching_snap_stopped_since(struct rcu_data *rdp, int snap) * Return true if the referenced integer is zero while the specified * CPU remains within a single extended quiescent state. */ -bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) +bool rcu_watching_zero_in_eqs(int cpu, int *vp) { int snap; From 2dba2230f9e26ad492fcd70e9ef2edc28313ea50 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 29 Apr 2024 14:05:29 +0200 Subject: [PATCH 78/90] rcu: Rename struct rcu_data .dynticks_snap into .watching_snap The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, and the snapshot helpers are now prefix by "rcu_watching". Reflect that change into the storage variables for these snapshots. Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- .../RCU/Design/Data-Structures/Data-Structures.rst | 4 ++-- kernel/rcu/tree.c | 6 +++--- kernel/rcu/tree.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.rst b/Documentation/RCU/Design/Data-Structures/Data-Structures.rst index 28d9f9e1783f..04e16775c752 100644 --- a/Documentation/RCU/Design/Data-Structures/Data-Structures.rst +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.rst @@ -921,10 +921,10 @@ This portion of the ``rcu_data`` structure is declared as follows: :: - 1 int dynticks_snap; + 1 int watching_snap; 2 unsigned long dynticks_fqs; -The ``->dynticks_snap`` field is used to take a snapshot of the +The ``->watching_snap`` field is used to take a snapshot of the corresponding CPU's dyntick-idle state when forcing quiescent states, and is therefore accessed from other CPUs. Finally, the ``->dynticks_fqs`` field is used to count the number of times this CPU diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ac3577ac10bd..a969d8ddc58e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -793,8 +793,8 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) * Ordering between remote CPU's pre idle accesses and post grace period * updater's accesses is enforced by the below acquire semantic. */ - rdp->dynticks_snap = ct_rcu_watching_cpu_acquire(rdp->cpu); - if (rcu_watching_snap_in_eqs(rdp->dynticks_snap)) { + rdp->watching_snap = ct_rcu_watching_cpu_acquire(rdp->cpu); + if (rcu_watching_snap_in_eqs(rdp->watching_snap)) { trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti")); rcu_gpnum_ovf(rdp->mynode, rdp); return 1; @@ -826,7 +826,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * read-side critical section that started before the beginning * of the current RCU grace period. */ - if (rcu_watching_snap_stopped_since(rdp, rdp->dynticks_snap)) { + if (rcu_watching_snap_stopped_since(rdp, rdp->watching_snap)) { trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti")); rcu_gpnum_ovf(rnp, rdp); return 1; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index fcf2b4aa3441..f5361a7d7269 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -206,7 +206,7 @@ struct rcu_data { long blimit; /* Upper limit on a processed batch */ /* 3) dynticks interface. */ - int dynticks_snap; /* Per-GP tracking for dynticks. */ + int watching_snap; /* Per-GP tracking for dynticks. */ bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */ bool rcu_urgent_qs; /* GP old need light quiescent state. */ bool rcu_forced_tick; /* Forced tick to provide QS. */ From 76ce2b3d75305d02174655062e3a68a8f45117cd Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 29 Apr 2024 14:05:29 +0200 Subject: [PATCH 79/90] rcu: Rename struct rcu_data .exp_dynticks_snap into .exp_watching_snap The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, and the snapshot helpers are now prefix by "rcu_watching". Reflect that change into the storage variables for these snapshots. Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree.h | 2 +- kernel/rcu/tree_exp.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f5361a7d7269..13fdcc2aa081 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -215,7 +215,7 @@ struct rcu_data { /* 4) rcu_barrier(), OOM callbacks, and expediting. */ unsigned long barrier_seq_snap; /* Snap of rcu_state.barrier_sequence. */ struct rcu_head barrier_head; - int exp_dynticks_snap; /* Double-check need for IPI. */ + int exp_watching_snap; /* Double-check need for IPI. */ /* 5) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 48ad8b868d83..eec42df0c018 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -380,7 +380,7 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp) if (rcu_watching_snap_in_eqs(snap)) mask_ofl_test |= mask; else - rdp->exp_dynticks_snap = snap; + rdp->exp_watching_snap = snap; } } mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; @@ -400,7 +400,7 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp) unsigned long mask = rdp->grpmask; retry_ipi: - if (rcu_watching_snap_stopped_since(rdp, rdp->exp_dynticks_snap)) { + if (rcu_watching_snap_stopped_since(rdp, rdp->exp_watching_snap)) { mask_ofl_test |= mask; continue; } From 49f82c64fdc66447d839e77d2bd1c5b5b797590c Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 29 Apr 2024 16:38:19 +0200 Subject: [PATCH 80/90] rcu: Rename dyntick_save_progress_counter() into rcu_watching_snap_save() The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, and the 'dynticks' prefix can be dropped without losing any meaning. Suggested-by: Frederic Weisbecker Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- .../RCU/Design/Memory-Ordering/TreeRCU-dyntick.svg | 2 +- .../RCU/Design/Memory-Ordering/TreeRCU-gp-fqs.svg | 2 +- Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg | 2 +- .../RCU/Design/Memory-Ordering/TreeRCU-hotplug.svg | 2 +- kernel/rcu/tree.c | 8 ++++---- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-dyntick.svg b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-dyntick.svg index 423df00c4df9..13956baf748a 100644 --- a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-dyntick.svg +++ b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-dyntick.svg @@ -528,7 +528,7 @@ font-style="normal" y="-8652.5312" x="2466.7822" - xml:space="preserve">dyntick_save_progress_counter() + xml:space="preserve">rcu_watching_snap_save() dyntick_save_progress_counter() + xml:space="preserve">rcu_watching_snap_save() dyntick_save_progress_counter() + xml:space="preserve">rcu_watching_snap_save() dyntick_save_progress_counter() + xml:space="preserve">rcu_watching_snap_save() Date: Mon, 29 Apr 2024 16:41:37 +0200 Subject: [PATCH 81/90] rcu: Rename rcu_implicit_dynticks_qs() into rcu_watching_snap_recheck() The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, drop the dyntick reference and update the name of this helper to express that it rechecks rdp->watching_snap after an earlier rcu_watching_snap_save(). Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- Documentation/RCU/Design/Memory-Ordering/TreeRCU-dyntick.svg | 2 +- Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-fqs.svg | 2 +- Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg | 2 +- Documentation/RCU/Design/Memory-Ordering/TreeRCU-hotplug.svg | 2 +- kernel/rcu/tree.c | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-dyntick.svg b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-dyntick.svg index 13956baf748a..ab9707f04e66 100644 --- a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-dyntick.svg +++ b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-dyntick.svg @@ -537,7 +537,7 @@ font-style="normal" y="-8368.1475" x="2463.3262" - xml:space="preserve">rcu_implicit_dynticks_qs() + xml:space="preserve">rcu_watching_snap_recheck() rcu_implicit_dynticks_qs() + xml:space="preserve">rcu_watching_snap_recheck() rcu_implicit_dynticks_qs() + xml:space="preserve">rcu_watching_snap_recheck() rcu_implicit_dynticks_qs() + xml:space="preserve">rcu_watching_snap_recheck() Date: Mon, 29 Apr 2024 16:43:36 +0200 Subject: [PATCH 82/90] rcu: Rename rcu_momentary_dyntick_idle() into rcu_momentary_eqs() The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, replace "dyntick_idle" into "eqs" to drop the dyntick reference. Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- include/linux/rcutiny.h | 2 +- include/linux/rcutree.h | 2 +- kernel/rcu/rcutorture.c | 4 ++-- kernel/rcu/tree.c | 4 ++-- kernel/rcu/tree_nocb.h | 2 +- kernel/rcu/tree_plugin.h | 6 +++--- kernel/stop_machine.c | 2 +- kernel/trace/trace_osnoise.c | 4 ++-- 8 files changed, 13 insertions(+), 13 deletions(-) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index d9ac7b136aea..cf2b5a188f78 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -158,7 +158,7 @@ void rcu_scheduler_starting(void); static inline void rcu_end_inkernel_boot(void) { } static inline bool rcu_inkernel_boot_has_ended(void) { return true; } static inline bool rcu_is_watching(void) { return true; } -static inline void rcu_momentary_dyntick_idle(void) { } +static inline void rcu_momentary_eqs(void) { } static inline void kfree_rcu_scheduler_running(void) { } static inline bool rcu_gp_might_be_stalled(void) { return false; } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 254244202ea9..7dbde2b6f714 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -37,7 +37,7 @@ void synchronize_rcu_expedited(void); void kvfree_call_rcu(struct rcu_head *head, void *ptr); void rcu_barrier(void); -void rcu_momentary_dyntick_idle(void); +void rcu_momentary_eqs(void); void kfree_rcu_scheduler_running(void); bool rcu_gp_might_be_stalled(void); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 08bf7c669dd3..ef757c3e84a2 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2680,7 +2680,7 @@ static unsigned long rcu_torture_fwd_prog_cbfree(struct rcu_fwd *rfp) rcu_torture_fwd_prog_cond_resched(freed); if (tick_nohz_full_enabled()) { local_irq_save(flags); - rcu_momentary_dyntick_idle(); + rcu_momentary_eqs(); local_irq_restore(flags); } } @@ -2830,7 +2830,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs); if (tick_nohz_full_enabled()) { local_irq_save(flags); - rcu_momentary_dyntick_idle(); + rcu_momentary_eqs(); local_irq_restore(flags); } } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b53b354417ae..e0f6fc272906 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -367,7 +367,7 @@ bool rcu_watching_zero_in_eqs(int cpu, int *vp) * * The caller must have disabled interrupts and must not be idle. */ -notrace void rcu_momentary_dyntick_idle(void) +notrace void rcu_momentary_eqs(void) { int seq; @@ -377,7 +377,7 @@ notrace void rcu_momentary_dyntick_idle(void) WARN_ON_ONCE(!(seq & CT_RCU_WATCHING)); rcu_preempt_deferred_qs(current); } -EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle); +EXPORT_SYMBOL_GPL(rcu_momentary_eqs); /** * rcu_is_cpu_rrupt_from_idle - see if 'interrupted' from idle diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 3ce30841119a..e629646ac6f5 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -917,7 +917,7 @@ static void nocb_cb_wait(struct rcu_data *rdp) WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp)); local_irq_save(flags); - rcu_momentary_dyntick_idle(); + rcu_momentary_eqs(); local_irq_restore(flags); /* * Disable BH to provide the expected environment. Also, when diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c569da65b421..354fbf2e411c 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -869,7 +869,7 @@ static void rcu_qs(void) /* * Register an urgently needed quiescent state. If there is an - * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight + * emergency, invoke rcu_momentary_eqs() to do a heavy-weight * dyntick-idle quiescent state visible to other CPUs, which will in * some cases serve for expedited as well as normal grace periods. * Either way, register a lightweight quiescent state. @@ -889,7 +889,7 @@ void rcu_all_qs(void) this_cpu_write(rcu_data.rcu_urgent_qs, false); if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) { local_irq_save(flags); - rcu_momentary_dyntick_idle(); + rcu_momentary_eqs(); local_irq_restore(flags); } rcu_qs(); @@ -909,7 +909,7 @@ void rcu_note_context_switch(bool preempt) goto out; this_cpu_write(rcu_data.rcu_urgent_qs, false); if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) - rcu_momentary_dyntick_idle(); + rcu_momentary_eqs(); out: rcu_tasks_qs(current, preempt); trace_rcu_utilization(TPS("End context switch")); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index cedb17ba158a..da821ce258ea 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -251,7 +251,7 @@ static int multi_cpu_stop(void *data) */ touch_nmi_watchdog(); } - rcu_momentary_dyntick_idle(); + rcu_momentary_eqs(); } while (curstate != MULTI_STOP_EXIT); local_irq_restore(flags); diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 66a871553d4a..ad86b1ba2e43 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1535,7 +1535,7 @@ static int run_osnoise(void) * This will eventually cause unwarranted noise as PREEMPT_RCU * will force preemption as the means of ending the current * grace period. We avoid this problem by calling - * rcu_momentary_dyntick_idle(), which performs a zero duration + * rcu_momentary_eqs(), which performs a zero duration * EQS allowing PREEMPT_RCU to end the current grace period. * This call shouldn't be wrapped inside an RCU critical * section. @@ -1547,7 +1547,7 @@ static int run_osnoise(void) if (!disable_irq) local_irq_disable(); - rcu_momentary_dyntick_idle(); + rcu_momentary_eqs(); if (!disable_irq) local_irq_enable(); From c3dcd90b8f34f7fa796eccbfbd59a6bb47740832 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 29 Apr 2024 13:47:27 +0200 Subject: [PATCH 83/90] rcu: Update stray documentation references to rcu_dynticks_eqs_{enter, exit}() rcu_dynticks_eqs_{enter, exit}() have been replaced by their context-tracking counterparts since commit: 171476775d32 ("context_tracking: Convert state to atomic_t") Update the stray documentation references. Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- .../RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst | 4 ++-- Documentation/RCU/Design/Memory-Ordering/TreeRCU-dyntick.svg | 4 ++-- Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-fqs.svg | 4 ++-- Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst index 7163d0def34e..1a5ff1a9f02e 100644 --- a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst +++ b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst @@ -147,8 +147,8 @@ RCU read-side critical sections preceding and following the current idle sojourn. This case is handled by calls to the strongly ordered ``atomic_add_return()`` read-modify-write atomic operation that -is invoked within ``rcu_dynticks_eqs_enter()`` at idle-entry -time and within ``rcu_dynticks_eqs_exit()`` at idle-exit time. +is invoked within ``ct_kernel_exit_state()`` at idle-entry +time and within ``ct_kernel_enter_state()`` at idle-exit time. The grace-period kthread invokes first ``ct_rcu_watching_cpu_acquire()`` (preceded by a full memory barrier) and ``rcu_watching_snap_stopped_since()`` (both of which rely on acquire semantics) to detect idle CPUs. diff --git a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-dyntick.svg b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-dyntick.svg index ab9707f04e66..3fbc19c48a58 100644 --- a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-dyntick.svg +++ b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-dyntick.svg @@ -607,7 +607,7 @@ font-weight="bold" font-size="192" id="text202-7-5-3-27-6" - style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_dynticks_eqs_enter() + style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">ct_kernel_exit_state() rcu_dynticks_eqs_exit() + style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">ct_kernel_enter_state() rcu_dynticks_eqs_enter() + style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">ct_kernel_exit_state() rcu_dynticks_eqs_exit() + style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">ct_kernel_enter_state() rcu_dynticks_eqs_enter() + style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">ct_kernel_exit_state() rcu_dynticks_eqs_exit() + style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">ct_kernel_enter_state() Date: Mon, 22 Apr 2024 13:57:29 +0200 Subject: [PATCH 84/90] context_tracking, rcu: Rename rcu_dyntick trace event into rcu_watching The "rcu_dyntick" naming convention has been turned into "rcu_watching" for all helpers now, align the trace event to that. To add to the confusion, the strings passed to the trace event are now reversed: when RCU "starts" the dyntick / EQS state, it "stops" watching. Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- include/trace/events/rcu.h | 18 +++++++++--------- kernel/context_tracking.c | 10 +++++----- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 4066b6d51e46..e81431deaa50 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -466,40 +466,40 @@ TRACE_EVENT(rcu_stall_warning, /* * Tracepoint for dyntick-idle entry/exit events. These take 2 strings * as argument: - * polarity: "Start", "End", "StillNonIdle" for entering, exiting or still not - * being in dyntick-idle mode. + * polarity: "Start", "End", "StillWatching" for entering, exiting or still not + * being in EQS mode. * context: "USER" or "IDLE" or "IRQ". * NMIs nested in IRQs are inferred with nesting > 1 in IRQ context. * * These events also take a pair of numbers, which indicate the nesting * depth before and after the event of interest, and a third number that is - * the ->dynticks counter. Note that task-related and interrupt-related + * the RCU_WATCHING counter. Note that task-related and interrupt-related * events use two separate counters, and that the "++=" and "--=" events * for irq/NMI will change the counter by two, otherwise by one. */ -TRACE_EVENT_RCU(rcu_dyntick, +TRACE_EVENT_RCU(rcu_watching, - TP_PROTO(const char *polarity, long oldnesting, long newnesting, int dynticks), + TP_PROTO(const char *polarity, long oldnesting, long newnesting, int counter), - TP_ARGS(polarity, oldnesting, newnesting, dynticks), + TP_ARGS(polarity, oldnesting, newnesting, counter), TP_STRUCT__entry( __field(const char *, polarity) __field(long, oldnesting) __field(long, newnesting) - __field(int, dynticks) + __field(int, counter) ), TP_fast_assign( __entry->polarity = polarity; __entry->oldnesting = oldnesting; __entry->newnesting = newnesting; - __entry->dynticks = dynticks; + __entry->counter = counter; ), TP_printk("%s %lx %lx %#3x", __entry->polarity, __entry->oldnesting, __entry->newnesting, - __entry->dynticks & 0xfff) + __entry->counter & 0xfff) ); /* diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 31c6d0c1b79d..938c48952d26 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -137,7 +137,7 @@ static void noinstr ct_kernel_exit(bool user, int offset) instrumentation_begin(); lockdep_assert_irqs_disabled(); - trace_rcu_dyntick(TPS("Start"), ct_nesting(), 0, ct_rcu_watching()); + trace_rcu_watching(TPS("End"), ct_nesting(), 0, ct_rcu_watching()); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); rcu_preempt_deferred_qs(current); @@ -182,7 +182,7 @@ static void noinstr ct_kernel_enter(bool user, int offset) // instrumentation for the noinstr ct_kernel_enter_state() instrument_atomic_write(&ct->state, sizeof(ct->state)); - trace_rcu_dyntick(TPS("End"), ct_nesting(), 1, ct_rcu_watching()); + trace_rcu_watching(TPS("Start"), ct_nesting(), 1, ct_rcu_watching()); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); WRITE_ONCE(ct->nesting, 1); WARN_ON_ONCE(ct_nmi_nesting()); @@ -219,7 +219,7 @@ void noinstr ct_nmi_exit(void) * leave it in non-RCU-idle state. */ if (ct_nmi_nesting() != 1) { - trace_rcu_dyntick(TPS("--="), ct_nmi_nesting(), ct_nmi_nesting() - 2, + trace_rcu_watching(TPS("--="), ct_nmi_nesting(), ct_nmi_nesting() - 2, ct_rcu_watching()); WRITE_ONCE(ct->nmi_nesting, /* No store tearing. */ ct_nmi_nesting() - 2); @@ -228,7 +228,7 @@ void noinstr ct_nmi_exit(void) } /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ - trace_rcu_dyntick(TPS("Startirq"), ct_nmi_nesting(), 0, ct_rcu_watching()); + trace_rcu_watching(TPS("Endirq"), ct_nmi_nesting(), 0, ct_rcu_watching()); WRITE_ONCE(ct->nmi_nesting, 0); /* Avoid store tearing. */ // instrumentation for the noinstr ct_kernel_exit_state() @@ -294,7 +294,7 @@ void noinstr ct_nmi_enter(void) instrumentation_begin(); } - trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), + trace_rcu_watching(incby == 1 ? TPS("Startirq") : TPS("++="), ct_nmi_nesting(), ct_nmi_nesting() + incby, ct_rcu_watching()); instrumentation_end(); From 4040b1139904b3f72b56862fc9a8d3e9abb69ffb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 17 Jun 2024 08:51:14 -0700 Subject: [PATCH 85/90] context_tracking: Tag context_tracking_enabled_this_cpu() __always_inline Force context_tracking_enabled_this_cpu() to be inlined so that invoking it from guest_context_enter_irqoff(), which KVM uses in non-instrumentable code, doesn't unexpectedly leave a noinstr section. vmlinux.o: warning: objtool: vmx_vcpu_enter_exit+0x1c7: call to context_tracking_enabled_this_cpu() leaves .noinstr.text section vmlinux.o: warning: objtool: svm_vcpu_enter_exit+0x83: call to context_tracking_enabled_this_cpu() leaves .noinstr.text section Note, the CONFIG_CONTEXT_TRACKING_USER=n stub is already __always_inline. Signed-off-by: Sean Christopherson Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- include/linux/context_tracking_state.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index 0dbda59c9f37..7b8433d5a8ef 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -113,7 +113,7 @@ static __always_inline bool context_tracking_enabled_cpu(int cpu) return context_tracking_enabled() && per_cpu(context_tracking.active, cpu); } -static inline bool context_tracking_enabled_this_cpu(void) +static __always_inline bool context_tracking_enabled_this_cpu(void) { return context_tracking_enabled() && __this_cpu_read(context_tracking.active); } From 1fcb932c8b5ce86219d7dedcd63659351a43291c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 4 Jul 2024 00:56:40 +0200 Subject: [PATCH 86/90] rcu/nocb: Simplify (de-)offloading state machine Now that the (de-)offloading process can only apply to offline CPUs, there is no more concurrency between rcu_core and nocb kthreads. Also the mutation now happens on empty queues. Therefore the state machine can be reduced to a single bit called SEGCBLIST_OFFLOADED. Simplify the transition as follows: * Upon offloading: queue the rdp to be added to the rcuog list and wait for the rcuog kthread to set the SEGCBLIST_OFFLOADED bit. Unpark rcuo kthread. * Upon de-offloading: Park rcuo kthread. Queue the rdp to be removed from the rcuog list and wait for the rcuog kthread to clear the SEGCBLIST_OFFLOADED bit. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- include/linux/rcu_segcblist.h | 4 +- kernel/rcu/rcu_segcblist.c | 11 --- kernel/rcu/rcu_segcblist.h | 2 +- kernel/rcu/tree_nocb.h | 138 ++++++++++++++++------------------ 4 files changed, 68 insertions(+), 87 deletions(-) diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index 1ef1bb54853d..2fdc2208f1ca 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -185,9 +185,7 @@ struct rcu_cblist { * ---------------------------------------------------------------------------- */ #define SEGCBLIST_ENABLED BIT(0) -#define SEGCBLIST_LOCKING BIT(1) -#define SEGCBLIST_KTHREAD_GP BIT(2) -#define SEGCBLIST_OFFLOADED BIT(3) +#define SEGCBLIST_OFFLOADED BIT(1) struct rcu_segcblist { struct rcu_head *head; diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 1693ea22ef1b..298a2c573f02 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -260,17 +260,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) rcu_segcblist_clear_flags(rsclp, SEGCBLIST_ENABLED); } -/* - * Mark the specified rcu_segcblist structure as offloaded (or not) - */ -void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload) -{ - if (offload) - rcu_segcblist_set_flags(rsclp, SEGCBLIST_LOCKING | SEGCBLIST_OFFLOADED); - else - rcu_segcblist_clear_flags(rsclp, SEGCBLIST_OFFLOADED); -} - /* * Does the specified rcu_segcblist structure contain callbacks that * are ready to be invoked? diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 7a0962dfee86..259904075636 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -89,7 +89,7 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) { if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - rcu_segcblist_test_flags(rsclp, SEGCBLIST_LOCKING)) + rcu_segcblist_test_flags(rsclp, SEGCBLIST_OFFLOADED)) return true; return false; diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 24daf606de0c..857024ee0349 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -604,37 +604,33 @@ static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head, } } -static int nocb_gp_toggle_rdp(struct rcu_data *rdp) +static void nocb_gp_toggle_rdp(struct rcu_data *rdp_gp, struct rcu_data *rdp) { struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; - int ret; - rcu_nocb_lock_irqsave(rdp, flags); - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) && - !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { + /* + * Locking orders future de-offloaded callbacks enqueue against previous + * handling of this rdp. Ie: Make sure rcuog is done with this rdp before + * deoffloaded callbacks can be enqueued. + */ + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { /* * Offloading. Set our flag and notify the offload worker. * We will handle this rdp until it ever gets de-offloaded. */ - rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP); - ret = 1; - } else if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) && - rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { + list_add_tail(&rdp->nocb_entry_rdp, &rdp_gp->nocb_head_rdp); + rcu_segcblist_set_flags(cblist, SEGCBLIST_OFFLOADED); + } else { /* * De-offloading. Clear our flag and notify the de-offload worker. * We will ignore this rdp until it ever gets re-offloaded. */ - rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); - ret = 0; - } else { - WARN_ON_ONCE(1); - ret = -1; + list_del(&rdp->nocb_entry_rdp); + rcu_segcblist_clear_flags(cblist, SEGCBLIST_OFFLOADED); } - - rcu_nocb_unlock_irqrestore(rdp, flags); - - return ret; + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); } static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu) @@ -841,14 +837,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) } if (rdp_toggling) { - int ret; - - ret = nocb_gp_toggle_rdp(rdp_toggling); - if (ret == 1) - list_add_tail(&rdp_toggling->nocb_entry_rdp, &my_rdp->nocb_head_rdp); - else if (ret == 0) - list_del(&rdp_toggling->nocb_entry_rdp); - + nocb_gp_toggle_rdp(my_rdp, rdp_toggling); swake_up_one(&rdp_toggling->nocb_state_wq); } @@ -1018,16 +1007,11 @@ void rcu_nocb_flush_deferred_wakeup(void) } EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup); -static int rdp_offload_toggle(struct rcu_data *rdp, - bool offload, unsigned long flags) - __releases(rdp->nocb_lock) +static int rcu_nocb_queue_toggle_rdp(struct rcu_data *rdp) { - struct rcu_segcblist *cblist = &rdp->cblist; struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; bool wake_gp = false; - - rcu_segcblist_offload(cblist, offload); - rcu_nocb_unlock_irqrestore(rdp, flags); + unsigned long flags; raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); // Queue this rdp for add/del to/from the list to iterate on rcuog @@ -1041,9 +1025,25 @@ static int rdp_offload_toggle(struct rcu_data *rdp, return wake_gp; } +static bool rcu_nocb_rdp_deoffload_wait_cond(struct rcu_data *rdp) +{ + unsigned long flags; + bool ret; + + /* + * Locking makes sure rcuog is done handling this rdp before deoffloaded + * enqueue can happen. Also it keeps the SEGCBLIST_OFFLOADED flag stable + * while the ->nocb_lock is held. + */ + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + ret = !rcu_segcblist_test_flags(&rdp->cblist, SEGCBLIST_OFFLOADED); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + + return ret; +} + static int rcu_nocb_rdp_deoffload(struct rcu_data *rdp) { - struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; int wake_gp; struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; @@ -1056,51 +1056,42 @@ static int rcu_nocb_rdp_deoffload(struct rcu_data *rdp) /* Flush all callbacks from segcblist and bypass */ rcu_barrier(); + /* + * Make sure the rcuoc kthread isn't in the middle of a nocb locked + * sequence while offloading is deactivated, along with nocb locking. + */ + if (rdp->nocb_cb_kthread) + kthread_park(rdp->nocb_cb_kthread); + rcu_nocb_lock_irqsave(rdp, flags); WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist)); + rcu_nocb_unlock_irqrestore(rdp, flags); - wake_gp = rdp_offload_toggle(rdp, false, flags); + wake_gp = rcu_nocb_queue_toggle_rdp(rdp); mutex_lock(&rdp_gp->nocb_gp_kthread_mutex); + if (rdp_gp->nocb_gp_kthread) { if (wake_gp) wake_up_process(rdp_gp->nocb_gp_kthread); swait_event_exclusive(rdp->nocb_state_wq, - !rcu_segcblist_test_flags(cblist, - SEGCBLIST_KTHREAD_GP)); - if (rdp->nocb_cb_kthread) - kthread_park(rdp->nocb_cb_kthread); + rcu_nocb_rdp_deoffload_wait_cond(rdp)); } else { /* * No kthread to clear the flags for us or remove the rdp from the nocb list * to iterate. Do it here instead. Locking doesn't look stricly necessary * but we stick to paranoia in this rare path. */ - rcu_nocb_lock_irqsave(rdp, flags); - rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); - rcu_nocb_unlock_irqrestore(rdp, flags); + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_OFFLOADED); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); list_del(&rdp->nocb_entry_rdp); } - mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); - /* - * Lock one last time to acquire latest callback updates from kthreads - * so we can later handle callbacks locally without locking. - */ - rcu_nocb_lock_irqsave(rdp, flags); - /* - * Theoretically we could clear SEGCBLIST_LOCKING after the nocb - * lock is released but how about being paranoid for once? - */ - rcu_segcblist_clear_flags(cblist, SEGCBLIST_LOCKING); - /* - * Without SEGCBLIST_LOCKING, we can't use - * rcu_nocb_unlock_irqrestore() anymore. - */ - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); return 0; } @@ -1129,10 +1120,20 @@ int rcu_nocb_cpu_deoffload(int cpu) } EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload); +static bool rcu_nocb_rdp_offload_wait_cond(struct rcu_data *rdp) +{ + unsigned long flags; + bool ret; + + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + ret = rcu_segcblist_test_flags(&rdp->cblist, SEGCBLIST_OFFLOADED); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + + return ret; +} + static int rcu_nocb_rdp_offload(struct rcu_data *rdp) { - struct rcu_segcblist *cblist = &rdp->cblist; - unsigned long flags; int wake_gp; struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; @@ -1152,20 +1153,14 @@ static int rcu_nocb_rdp_offload(struct rcu_data *rdp) WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist)); - /* - * Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING - * is set. - */ - raw_spin_lock_irqsave(&rdp->nocb_lock, flags); - - wake_gp = rdp_offload_toggle(rdp, true, flags); + wake_gp = rcu_nocb_queue_toggle_rdp(rdp); if (wake_gp) wake_up_process(rdp_gp->nocb_gp_kthread); - kthread_unpark(rdp->nocb_cb_kthread); - swait_event_exclusive(rdp->nocb_state_wq, - rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); + rcu_nocb_rdp_offload_wait_cond(rdp)); + + kthread_unpark(rdp->nocb_cb_kthread); return 0; } @@ -1340,8 +1335,7 @@ void __init rcu_init_nohz(void) rdp = per_cpu_ptr(&rcu_data, cpu); if (rcu_segcblist_empty(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); - rcu_segcblist_offload(&rdp->cblist, true); - rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); + rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_OFFLOADED); } rcu_organize_nocb_kthreads(); } From 9139f93209d1ffd7f489ab19dee01b7c3a1a43d2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 14 Aug 2024 00:56:40 +0200 Subject: [PATCH 87/90] rcu/nocb: Fix RT throttling hrtimer armed from offline CPU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After a CPU is marked offline and until it reaches its final trip to idle, rcuo has several opportunities to be woken up, either because a callback has been queued in the meantime or because rcutree_report_cpu_dead() has issued the final deferred NOCB wake up. If RCU-boosting is enabled, RCU kthreads are set to SCHED_FIFO policy. And if RT-bandwidth is enabled, the related hrtimer might be armed. However this then happens after hrtimers have been migrated at the CPUHP_AP_HRTIMERS_DYING stage, which is broken as reported by the following warning: Call trace: enqueue_hrtimer+0x7c/0xf8 hrtimer_start_range_ns+0x2b8/0x300 enqueue_task_rt+0x298/0x3f0 enqueue_task+0x94/0x188 ttwu_do_activate+0xb4/0x27c try_to_wake_up+0x2d8/0x79c wake_up_process+0x18/0x28 __wake_nocb_gp+0x80/0x1a0 do_nocb_deferred_wakeup_common+0x3c/0xcc rcu_report_dead+0x68/0x1ac cpuhp_report_idle_dead+0x48/0x9c do_idle+0x288/0x294 cpu_startup_entry+0x34/0x3c secondary_start_kernel+0x138/0x158 Fix this with waking up rcuo using an IPI if necessary. Since the existing API to deal with this situation only handles swait queue, rcuo is only woken up from offline CPUs if it's not already waiting on a grace period. In the worst case some callbacks will just wait for a grace period to complete before being assigned to a subsequent one. Reported-by: "Cheng-Jui Wang (王正睿)" Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier") Signed-off-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree_nocb.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 857024ee0349..e3cf354f7a7f 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -216,7 +216,10 @@ static bool __wake_nocb_gp(struct rcu_data *rdp_gp, raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); if (needwake) { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake")); - wake_up_process(rdp_gp->nocb_gp_kthread); + if (cpu_is_offline(raw_smp_processor_id())) + swake_up_one_online(&rdp_gp->nocb_gp_wq); + else + wake_up_process(rdp_gp->nocb_gp_kthread); } return needwake; From 1b022b8763fde84c51ca06218306d7d88fb090cb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 14 Aug 2024 00:56:41 +0200 Subject: [PATCH 88/90] rcu/nocb: Conditionally wake up rcuo if not already waiting on GP A callback enqueuer currently wakes up the rcuo kthread if it is adding the first non-done callback of a CPU, whether the kthread is waiting on a grace period or not (unless the CPU is offline). This looks like a desired behaviour because then the rcuo kthread doesn't wait for the end of the current grace period to handle the callback. It is accelerated right away and assigned to the next grace period. The GP kthread is notified about that fact and iterates with the upcoming GP without sleeping in-between. However this best-case scenario is contradicted by a few details, depending on the situation: 1) If the callback is a non-bypass one queued with IRQs enabled, the wake up only occurs if no other pending callbacks are on the list. Therefore the theoretical "optimization" actually applies on rare occasions. 2) If the callback is a non-bypass one queued with IRQs disabled, the situation is similar with even more uncertainty due to the deferred wake up. 3) If the callback is lazy, a few jiffies don't make any difference. 4) If the callback is bypass, the wake up timer is programmed 2 jiffies ahead by rcuo in case the regular pending queue has been handled in the meantime. The rare storm of callbacks can otherwise wait for the currently elapsing grace period to be flushed and handled. For all those reasons, the optimization is only theoretical and occasional. Therefore it is reasonable that callbacks enqueuers only wake up the rcuo kthread when it is not already waiting on a grace period to complete. Signed-off-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree_nocb.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index e3cf354f7a7f..14d183438423 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -216,10 +216,7 @@ static bool __wake_nocb_gp(struct rcu_data *rdp_gp, raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); if (needwake) { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake")); - if (cpu_is_offline(raw_smp_processor_id())) - swake_up_one_online(&rdp_gp->nocb_gp_wq); - else - wake_up_process(rdp_gp->nocb_gp_kthread); + swake_up_one_online(&rdp_gp->nocb_gp_wq); } return needwake; From 7562eed272b49e233c430524e684b957f34f2fd2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 14 Aug 2024 00:56:42 +0200 Subject: [PATCH 89/90] rcu/nocb: Remove superfluous memory barrier after bypass enqueue Pre-GP accesses performed by the update side must be ordered against post-GP accesses performed by the readers. This is ensured by the bypass or nocb locking on enqueue time, followed by the fully ordered rnp locking initiated while callbacks are accelerated, and then propagated throughout the whole GP lifecyle associated with the callbacks. Therefore the explicit barrier advertizing ordering between bypass enqueue and rcuo wakeup is superfluous. If anything, it would even only order the first bypass callback enqueue against the rcuo wakeup and ignore all the subsequent ones. Remove the needless barrier. Signed-off-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree_nocb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 14d183438423..4ffb8c90d695 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -493,7 +493,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ")); } rcu_nocb_bypass_unlock(rdp); - smp_mb(); /* Order enqueue before wake. */ + // A wake up of the grace period kthread or timer adjustment // needs to be done only if: // 1. Bypass list was fully empty before (this is the first From 1ecd9d68eb44e4b7972aee2840eb4fdf29b9de2b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 23 Aug 2024 14:15:12 -0700 Subject: [PATCH 90/90] rcu: Defer printing stall-warning backtrace when holding rcu_node lock The rcu_dump_cpu_stacks() holds the leaf rcu_node structure's ->lock when dumping the stakcks of any CPUs stalling the current grace period. This lock is held to prevent confusion that would otherwise occur when the stalled CPU reported its quiescent state (and then went on to do unrelated things) just as the backtrace NMI was heading towards it. This has worked well, but on larger systems has recently been observed to cause severe lock contention resulting in CSD-lock stalls and other general unhappiness. This commit therefore does printk_deferred_enter() before acquiring the lock and printk_deferred_exit() after releasing it, thus deferring the overhead of actually outputting the stack trace out of that lock's critical section. Reported-by: Rik van Riel Suggested-by: Rik van Riel Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree_stall.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index b497d4c6dabd..9772e1ffcf6e 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -371,6 +371,7 @@ static void rcu_dump_cpu_stacks(void) struct rcu_node *rnp; rcu_for_each_leaf_node(rnp) { + printk_deferred_enter(); raw_spin_lock_irqsave_rcu_node(rnp, flags); for_each_leaf_node_possible_cpu(rnp, cpu) if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { @@ -380,6 +381,7 @@ static void rcu_dump_cpu_stacks(void) dump_cpu_task(cpu); } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + printk_deferred_exit(); } }