From 546a9d8519ed137b2804a3f5a3659003039dd49c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Jun 2014 14:57:10 -0700 Subject: [PATCH 01/34] rcu: Export debug_init_rcu_head() and and debug_init_rcu_head() Currently, call_rcu() relies on implicit allocation and initialization for the debug-objects handling of RCU callbacks. If you hammer the kernel hard enough with Sasha's modified version of trinity, you can end up with the sl*b allocators recursing into themselves via this implicit call_rcu() allocation. This commit therefore exports the debug_init_rcu_head() and debug_rcu_head_free() functions, which permits the allocators to allocated and pre-initialize the debug-objects information, so that there no longer any need for call_rcu() to do that initialization, which in turn prevents the recursion into the memory allocators. Reported-by: Sasha Levin Suggested-by: Thomas Gleixner Signed-off-by: Paul E. McKenney Acked-by: Thomas Gleixner Looks-good-to: Christoph Lameter --- include/linux/rcupdate.h | 10 ++++++++++ kernel/rcu/update.c | 4 ++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 5a75d19aa661..13bbfbde41b9 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -358,9 +358,19 @@ void wait_rcu_gp(call_rcu_func_t crf); * initialization. */ #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD +void init_rcu_head(struct rcu_head *head); +void destroy_rcu_head(struct rcu_head *head); void init_rcu_head_on_stack(struct rcu_head *head); void destroy_rcu_head_on_stack(struct rcu_head *head); #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +static inline void init_rcu_head(struct rcu_head *head) +{ +} + +static inline void destroy_rcu_head(struct rcu_head *head) +{ +} + static inline void init_rcu_head_on_stack(struct rcu_head *head) { } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index a2aeb4df0f60..0fb691e63ce6 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -200,12 +200,12 @@ void wait_rcu_gp(call_rcu_func_t crf) EXPORT_SYMBOL_GPL(wait_rcu_gp); #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD -static inline void debug_init_rcu_head(struct rcu_head *head) +void init_rcu_head(struct rcu_head *head) { debug_object_init(head, &rcuhead_debug_descr); } -static inline void debug_rcu_head_free(struct rcu_head *head) +void destroy_rcu_head(struct rcu_head *head) { debug_object_free(head, &rcuhead_debug_descr); } From 4a81e8328d3791a4f99bf5b436d050f6dc5ffea3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 20 Jun 2014 16:49:01 -0700 Subject: [PATCH 02/34] rcu: Reduce overhead of cond_resched() checks for RCU Commit ac1bea85781e (Make cond_resched() report RCU quiescent states) fixed a problem where a CPU looping in the kernel with but one runnable task would give RCU CPU stall warnings, even if the in-kernel loop contained cond_resched() calls. Unfortunately, in so doing, it introduced performance regressions in Anton Blanchard's will-it-scale "open1" test. The problem appears to be not so much the increased cond_resched() path length as an increase in the rate at which grace periods complete, which increased per-update grace-period overhead. This commit takes a different approach to fixing this bug, mainly by moving the RCU-visible quiescent state from cond_resched() to rcu_note_context_switch(), and by further reducing the check to a simple non-zero test of a single per-CPU variable. However, this approach requires that the force-quiescent-state processing send resched IPIs to the offending CPUs. These will be sent only once the grace period has reached an age specified by the boot/sysfs parameter rcutree.jiffies_till_sched_qs, or once the grace period reaches an age halfway to the point at which RCU CPU stall warnings will be emitted, whichever comes first. Reported-by: Dave Hansen Signed-off-by: Paul E. McKenney Cc: Andi Kleen Cc: Christoph Lameter Cc: Mike Galbraith Cc: Eric Dumazet Reviewed-by: Josh Triplett [ paulmck: Made rcu_momentary_dyntick_idle() as suggested by the ktest build robot. Also fixed smp_mb() comment as noted by Oleg Nesterov. ] Merge with e552592e (Reduce overhead of cond_resched() checks for RCU) Signed-off-by: Paul E. McKenney --- Documentation/kernel-parameters.txt | 6 ++ include/linux/rcupdate.h | 36 ------- kernel/rcu/tree.c | 140 ++++++++++++++++++++++------ kernel/rcu/tree.h | 6 +- kernel/rcu/tree_plugin.h | 2 +- kernel/rcu/update.c | 18 ---- kernel/sched/core.c | 7 +- 7 files changed, 125 insertions(+), 90 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 6eaa9cdb7094..910c3829f81d 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2785,6 +2785,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. leaf rcu_node structure. Useful for very large systems. + rcutree.jiffies_till_sched_qs= [KNL] + Set required age in jiffies for a + given grace period before RCU starts + soliciting quiescent-state help from + rcu_note_context_switch(). + rcutree.jiffies_till_first_fqs= [KNL] Set delay from grace-period initialization to first attempt to force quiescent states. diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 13bbfbde41b9..6a94cc8b1ca0 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -44,7 +44,6 @@ #include #include #include -#include #include extern int rcu_expedited; /* for sysctl */ @@ -299,41 +298,6 @@ static inline void rcu_user_hooks_switch(struct task_struct *prev, bool __rcu_is_watching(void); #endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) */ -/* - * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings. - */ - -#define RCU_COND_RESCHED_LIM 256 /* ms vs. 100s of ms. */ -DECLARE_PER_CPU(int, rcu_cond_resched_count); -void rcu_resched(void); - -/* - * Is it time to report RCU quiescent states? - * - * Note unsynchronized access to rcu_cond_resched_count. Yes, we might - * increment some random CPU's count, and possibly also load the result from - * yet another CPU's count. We might even clobber some other CPU's attempt - * to zero its counter. This is all OK because the goal is not precision, - * but rather reasonable amortization of rcu_note_context_switch() overhead - * and extremely high probability of avoiding RCU CPU stall warnings. - * Note that this function has to be preempted in just the wrong place, - * many thousands of times in a row, for anything bad to happen. - */ -static inline bool rcu_should_resched(void) -{ - return raw_cpu_inc_return(rcu_cond_resched_count) >= - RCU_COND_RESCHED_LIM; -} - -/* - * Report quiscent states to RCU if it is time to do so. - */ -static inline void rcu_cond_resched(void) -{ - if (unlikely(rcu_should_resched())) - rcu_resched(); -} - /* * Infrastructure to implement the synchronize_() primitives in * TREE_RCU and rcu_barrier_() primitives in TINY_RCU. diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f1ba77363fbb..625d0b0cd75a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -206,6 +206,70 @@ void rcu_bh_qs(int cpu) rdp->passed_quiesce = 1; } +static DEFINE_PER_CPU(int, rcu_sched_qs_mask); + +static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { + .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, + .dynticks = ATOMIC_INIT(1), +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE + .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, + .dynticks_idle = ATOMIC_INIT(1), +#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ +}; + +/* + * Let the RCU core know that this CPU has gone through the scheduler, + * which is a quiescent state. This is called when the need for a + * quiescent state is urgent, so we burn an atomic operation and full + * memory barriers to let the RCU core know about it, regardless of what + * this CPU might (or might not) do in the near future. + * + * We inform the RCU core by emulating a zero-duration dyntick-idle + * period, which we in turn do by incrementing the ->dynticks counter + * by two. + */ +static void rcu_momentary_dyntick_idle(void) +{ + unsigned long flags; + struct rcu_data *rdp; + struct rcu_dynticks *rdtp; + int resched_mask; + struct rcu_state *rsp; + + local_irq_save(flags); + + /* + * Yes, we can lose flag-setting operations. This is OK, because + * the flag will be set again after some delay. + */ + resched_mask = raw_cpu_read(rcu_sched_qs_mask); + raw_cpu_write(rcu_sched_qs_mask, 0); + + /* Find the flavor that needs a quiescent state. */ + for_each_rcu_flavor(rsp) { + rdp = raw_cpu_ptr(rsp->rda); + if (!(resched_mask & rsp->flavor_mask)) + continue; + smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */ + if (ACCESS_ONCE(rdp->mynode->completed) != + ACCESS_ONCE(rdp->cond_resched_completed)) + continue; + + /* + * Pretend to be momentarily idle for the quiescent state. + * This allows the grace-period kthread to record the + * quiescent state, with no need for this CPU to do anything + * further. + */ + rdtp = this_cpu_ptr(&rcu_dynticks); + smp_mb__before_atomic(); /* Earlier stuff before QS. */ + atomic_add(2, &rdtp->dynticks); /* QS. */ + smp_mb__after_atomic(); /* Later stuff after QS. */ + break; + } + local_irq_restore(flags); +} + /* * Note a context switch. This is a quiescent state for RCU-sched, * and requires special handling for preemptible RCU. @@ -216,19 +280,12 @@ void rcu_note_context_switch(int cpu) trace_rcu_utilization(TPS("Start context switch")); rcu_sched_qs(cpu); rcu_preempt_note_context_switch(cpu); + if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) + rcu_momentary_dyntick_idle(); trace_rcu_utilization(TPS("End context switch")); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); -static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { - .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, - .dynticks = ATOMIC_INIT(1), -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE - .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, - .dynticks_idle = ATOMIC_INIT(1), -#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ -}; - static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ static long qhimark = 10000; /* If this many pending, ignore blimit. */ static long qlowmark = 100; /* Once only this many pending, use blimit. */ @@ -243,6 +300,13 @@ static ulong jiffies_till_next_fqs = ULONG_MAX; module_param(jiffies_till_first_fqs, ulong, 0644); module_param(jiffies_till_next_fqs, ulong, 0644); +/* + * How long the grace period must be before we start recruiting + * quiescent-state help from rcu_note_context_switch(). + */ +static ulong jiffies_till_sched_qs = HZ / 20; +module_param(jiffies_till_sched_qs, ulong, 0644); + static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp); static void force_qs_rnp(struct rcu_state *rsp, @@ -853,6 +917,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) { unsigned int curr; + int *rcrmp; unsigned int snap; curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); @@ -893,27 +958,43 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, } /* - * There is a possibility that a CPU in adaptive-ticks state - * might run in the kernel with the scheduling-clock tick disabled - * for an extended time period. Invoke rcu_kick_nohz_cpu() to - * force the CPU to restart the scheduling-clock tick in this - * CPU is in this state. + * A CPU running for an extended time within the kernel can + * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode, + * even context-switching back and forth between a pair of + * in-kernel CPU-bound tasks cannot advance grace periods. + * So if the grace period is old enough, make the CPU pay attention. + * Note that the unsynchronized assignments to the per-CPU + * rcu_sched_qs_mask variable are safe. Yes, setting of + * bits can be lost, but they will be set again on the next + * force-quiescent-state pass. So lost bit sets do not result + * in incorrect behavior, merely in a grace period lasting + * a few jiffies longer than it might otherwise. Because + * there are at most four threads involved, and because the + * updates are only once every few jiffies, the probability of + * lossage (and thus of slight grace-period extension) is + * quite low. + * + * Note that if the jiffies_till_sched_qs boot/sysfs parameter + * is set too high, we override with half of the RCU CPU stall + * warning delay. */ - rcu_kick_nohz_cpu(rdp->cpu); - - /* - * Alternatively, the CPU might be running in the kernel - * for an extended period of time without a quiescent state. - * Attempt to force the CPU through the scheduler to gain the - * needed quiescent state, but only if the grace period has gone - * on for an uncommonly long time. If there are many stuck CPUs, - * we will beat on the first one until it gets unstuck, then move - * to the next. Only do this for the primary flavor of RCU. - */ - if (rdp->rsp == rcu_state_p && + rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu); + if (ULONG_CMP_GE(jiffies, + rdp->rsp->gp_start + jiffies_till_sched_qs) || ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { - rdp->rsp->jiffies_resched += 5; - resched_cpu(rdp->cpu); + if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { + ACCESS_ONCE(rdp->cond_resched_completed) = + ACCESS_ONCE(rdp->mynode->completed); + smp_mb(); /* ->cond_resched_completed before *rcrmp. */ + ACCESS_ONCE(*rcrmp) = + ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask; + resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ + rdp->rsp->jiffies_resched += 5; /* Enable beating. */ + } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { + /* Time to beat on that CPU again! */ + resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ + rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ + } } return 0; @@ -3491,6 +3572,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ + static u8 fl_mask = 0x1; int cpustride = 1; int i; int j; @@ -3509,6 +3591,8 @@ static void __init rcu_init_one(struct rcu_state *rsp, for (i = 1; i < rcu_num_lvls; i++) rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; rcu_init_levelspread(rsp); + rsp->flavor_mask = fl_mask; + fl_mask <<= 1; /* Initialize the elements themselves, starting from the leaves. */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index bf2c1e669691..0f69a79c5b7d 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -307,6 +307,9 @@ struct rcu_data { /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ unsigned long offline_fqs; /* Kicked due to being offline. */ + unsigned long cond_resched_completed; + /* Grace period that needs help */ + /* from cond_resched(). */ /* 5) __rcu_pending() statistics. */ unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ @@ -392,6 +395,7 @@ struct rcu_state { struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ + u8 flavor_mask; /* bit in flavor mask. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ void (*func)(struct rcu_head *head)); @@ -563,7 +567,7 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); static void do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); -static void rcu_kick_nohz_cpu(int cpu); +static void __maybe_unused rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index cbc2c45265e2..02ac0fb186b8 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2404,7 +2404,7 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) * if an adaptive-ticks CPU is failing to respond to the current grace * period and has not be idle from an RCU perspective, kick it. */ -static void rcu_kick_nohz_cpu(int cpu) +static void __maybe_unused rcu_kick_nohz_cpu(int cpu) { #ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_cpu(cpu)) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 0fb691e63ce6..bc7883570530 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -350,21 +350,3 @@ static int __init check_cpu_stall_init(void) early_initcall(check_cpu_stall_init); #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ - -/* - * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings. - */ - -DEFINE_PER_CPU(int, rcu_cond_resched_count); - -/* - * Report a set of RCU quiescent states, for use by cond_resched() - * and friends. Out of line due to being called infrequently. - */ -void rcu_resched(void) -{ - preempt_disable(); - __this_cpu_write(rcu_cond_resched_count, 0); - rcu_note_context_switch(smp_processor_id()); - preempt_enable(); -} diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3bdf01b494fe..bc1638b33449 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4147,7 +4147,6 @@ static void __cond_resched(void) int __sched _cond_resched(void) { - rcu_cond_resched(); if (should_resched()) { __cond_resched(); return 1; @@ -4166,18 +4165,15 @@ EXPORT_SYMBOL(_cond_resched); */ int __cond_resched_lock(spinlock_t *lock) { - bool need_rcu_resched = rcu_should_resched(); int resched = should_resched(); int ret = 0; lockdep_assert_held(lock); - if (spin_needbreak(lock) || resched || need_rcu_resched) { + if (spin_needbreak(lock) || resched) { spin_unlock(lock); if (resched) __cond_resched(); - else if (unlikely(need_rcu_resched)) - rcu_resched(); else cpu_relax(); ret = 1; @@ -4191,7 +4187,6 @@ int __sched __cond_resched_softirq(void) { BUG_ON(!in_softirq()); - rcu_cond_resched(); /* BH disabled OK, just recording QSes. */ if (should_resched()) { local_bh_enable(); __cond_resched(); From 57e60d0b9e6c36c607471cdcd82d3465fb7f4c37 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 19 Apr 2014 11:00:04 -0700 Subject: [PATCH 03/34] torture: Enable versions without CFcommon to function correctly The CFcommon file must now be present, which makes using the current scripts against old kernel versions cumbersome. This commit therefore makes the CFcommon file be optional, so that old kernel versions can be used with current torture scripts. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 27e544e29510..fa37010c34db 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -42,6 +42,7 @@ grace=120 T=/tmp/kvm-test-1-run.sh.$$ trap 'rm -rf $T' 0 +touch $T . $KVM/bin/functions.sh . $KVPATH/ver_functions.sh From 9bee2c6fb49d93678e4fb506a911c0b5ffe455ee Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Apr 2014 09:53:34 -0700 Subject: [PATCH 04/34] torture: Clean up diagnostics from --buildonly runs Currently the post-processing complains about the lack of rcutorture output when --buildonly is set and also emits misleading messages about kernels being started and finishing. This commit suppresses these complaints and messages. Signed-off-by: Paul E. McKenney --- .../selftests/rcutorture/bin/kvm-recheck.sh | 8 ++++++- .../rcutorture/bin/kvm-test-1-run.sh | 8 +++++-- tools/testing/selftests/rcutorture/bin/kvm.sh | 21 +++++++++++-------- 3 files changed, 25 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh index ee1f6cae3d70..3f6c9b78d177 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh @@ -54,10 +54,16 @@ do if test -f "$i/qemu-cmd" then print_bug qemu failed + echo " $i" + elif test -f "$i/buildonly" + then + echo Build-only run, no boot/test + configcheck.sh $i/.config $i/ConfigFragment + parse-build.sh $i/Make.out $configfile else print_bug Build failed + echo " $i" fi - echo " $i" fi done done diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index fa37010c34db..0f69dcbf9def 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -132,7 +132,10 @@ boot_args=$6 cd $KVM kstarttime=`awk 'BEGIN { print systime() }' < /dev/null` -echo ' ---' `date`: Starting kernel +if test -z "$TORTURE_BUILDONLY" +then + echo ' ---' `date`: Starting kernel +fi # Generate -smp qemu argument. qemu_args="-nographic $qemu_args" @@ -158,12 +161,13 @@ boot_args="`configfrag_boot_params "$boot_args" "$config_template"`" # Generate kernel-version-specific boot parameters boot_args="`per_version_boot_params "$boot_args" $builddir/.config $seconds`" -echo $QEMU $qemu_args -m 512 -kernel $builddir/$BOOT_IMAGE -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd if test -n "$TORTURE_BUILDONLY" then echo Build-only run specified, boot/test omitted. + touch $resdir/buildonly exit 0 fi +echo $QEMU $qemu_args -m 512 -kernel $builddir/$BOOT_IMAGE -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd ( $QEMU $qemu_args -m 512 -kernel $builddir/$BOOT_IMAGE -append "$qemu_append $boot_args"; echo $? > $resdir/qemu-retval ) & qemu_pid=$! commandcompleted=0 diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 40285c58653e..589e9c38413b 100644 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -340,12 +340,18 @@ function dump(first, pastlast) for (j = 1; j < jn; j++) { builddir=KVM "/b" j print "rm -f " builddir ".ready" - print "echo ----", cfr[j], cpusr[j] ovf ": Starting kernel. `date`"; - print "echo ----", cfr[j], cpusr[j] ovf ": Starting kernel. `date` >> " rd "/log"; + print "if test -z \"$TORTURE_BUILDONLY\"" + print "then" + print "\techo ----", cfr[j], cpusr[j] ovf ": Starting kernel. `date`"; + print "\techo ----", cfr[j], cpusr[j] ovf ": Starting kernel. `date` >> " rd "/log"; + print "fi" } print "wait" - print "echo ---- All kernel runs complete. `date`"; - print "echo ---- All kernel runs complete. `date` >> " rd "/log"; + print "if test -z \"$TORTURE_BUILDONLY\"" + print "then" + print "\techo ---- All kernel runs complete. `date`"; + print "\techo ---- All kernel runs complete. `date` >> " rd "/log"; + print "fi" for (j = 1; j < jn; j++) { builddir=KVM "/b" j print "echo ----", cfr[j], cpusr[j] ovf ": Build/run results:"; @@ -385,10 +391,7 @@ echo echo echo " --- `date` Test summary:" echo Results directory: $resdir/$ds -if test -z "$TORTURE_BUILDONLY" -then - kvm-recheck.sh $resdir/$ds -fi +kvm-recheck.sh $resdir/$ds ___EOF___ if test "$dryrun" = script @@ -403,7 +406,7 @@ then sed -e 's/:.*$//' -e 's/^echo //' exit 0 else - # Not a dryru, so run the script. + # Not a dryrun, so run the script. sh $T/script fi From eafbaac3093760d1fd3b2a5b9f016362dd68af36 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 Jun 2014 09:06:37 -0700 Subject: [PATCH 05/34] MAINTAINERS: Add "R:" designated-reviewers tag A ksummit-discuss email thread looked at the difficulty recruiting and retaining reviewers. Paul Walmsley also noted the need for patch submitters to know who the key reviewers are and suggested adding an "R:" tag to the MAINTAINERS file to record this information on a per-subsystem basis. This commit does just that, and a subsequent commit tags the designated reviewer for the RCU-related subsystems. http://lists.linuxfoundation.org/pipermail/ksummit-discuss/2014-May/000830.html Suggested-by: Paul Walmsley Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Acked-by: Steven Rostedt --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 134483f206e4..1814075c0e74 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -70,6 +70,8 @@ Descriptions of section entries: P: Person (obsolete) M: Mail patches to: FullName + R: Designated reviewer: FullName + These reviewers should be CCed on patches. L: Mailing list that is relevant to this area W: Web-page with status/info Q: Patchwork web based patch tracking system site From 4632a1919ce1b42e93321c8eeb3747cb26bcc5ac Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 30 May 2014 15:37:24 -0700 Subject: [PATCH 06/34] rcu: Add designated reviewers for RCU Adding Steven Rostedt, Mathieu Desnoyers, and Lai Jiangshan as designated RCU reviewers based on recent emails: o https://lkml.org/lkml/2014/6/2/578 (Steven) o https://lkml.org/lkml/2014/6/2/621 (Mathieu) o https://lkml.org/lkml/2014/6/3/897 (Lai) Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- MAINTAINERS | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 1814075c0e74..92f2bf91eec1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7405,6 +7405,9 @@ F: kernel/rcu/torture.c RCUTORTURE TEST FRAMEWORK M: "Paul E. McKenney" +R: Steven Rostedt +R: Mathieu Desnoyers +R: Lai Jiangshan L: linux-kernel@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git @@ -7429,6 +7432,9 @@ F: net/rds/ READ-COPY UPDATE (RCU) M: Dipankar Sarma M: "Paul E. McKenney" +R: Steven Rostedt +R: Mathieu Desnoyers +R: Lai Jiangshan L: linux-kernel@vger.kernel.org W: http://www.rdrop.com/users/paulmck/RCU/ S: Supported @@ -8208,6 +8214,8 @@ F: mm/sl?b.c SLEEPABLE READ-COPY UPDATE (SRCU) M: Lai Jiangshan M: "Paul E. McKenney" +R: Steven Rostedt +R: Mathieu Desnoyers L: linux-kernel@vger.kernel.org W: http://www.rdrop.com/users/paulmck/RCU/ S: Supported From 6945915e7f949e213c525d4eb244be87ec92f3d1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 22 May 2014 11:51:04 -0700 Subject: [PATCH 07/34] torture: Avoid format string leak to thead name Since the torture-test thread creation interface does not include format string arguments, this commit makes sure the name can never be accidentally processed as a format string. Signed-off-by: Kees Cook Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/torture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/torture.c b/kernel/torture.c index 40bb511cca48..d600af21f022 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -708,7 +708,7 @@ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m, int ret = 0; VERBOSE_TOROUT_STRING(m); - *tp = kthread_run(fn, arg, s); + *tp = kthread_run(fn, arg, "%s", s); if (IS_ERR(*tp)) { ret = PTR_ERR(*tp); VERBOSE_TOROUT_ERRSTRING(f); From fbce7497ee5af800a1c350c73f3c3f103cb27a15 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Jun 2014 09:26:11 -0700 Subject: [PATCH 08/34] rcu: Parallelize and economize NOCB kthread wakeups An 80-CPU system with a context-switch-heavy workload can require so many NOCB kthread wakeups that the RCU grace-period kthreads spend several tens of percent of a CPU just awakening things. This clearly will not scale well: If you add enough CPUs, the RCU grace-period kthreads would get behind, increasing grace-period latency. To avoid this problem, this commit divides the NOCB kthreads into leaders and followers, where the grace-period kthreads awaken the leaders each of whom in turn awakens its followers. By default, the number of groups of kthreads is the square root of the number of CPUs, but this default may be overridden using the rcutree.rcu_nocb_leader_stride boot parameter. This reduces the number of wakeups done per grace period by the RCU grace-period kthread by the square root of the number of CPUs, but of course by shifting those wakeups to the leaders. In addition, because the leaders do grace periods on behalf of their respective followers, the number of wakeups of the followers decreases by up to a factor of two. Instead of being awakened once when new callbacks arrive and again at the end of the grace period, the followers are awakened only at the end of the grace period. For a numerical example, in a 4096-CPU system, the grace-period kthread would awaken 64 leaders, each of which would awaken its 63 followers at the end of the grace period. This compares favorably with the 79 wakeups for the grace-period kthread on an 80-CPU system. Reported-by: Rik van Riel Signed-off-by: Paul E. McKenney --- Documentation/kernel-parameters.txt | 7 + kernel/rcu/tree.h | 28 +++- kernel/rcu/tree_plugin.h | 250 +++++++++++++++++++++++----- 3 files changed, 243 insertions(+), 42 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 910c3829f81d..770662c42c9f 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2802,6 +2802,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted. quiescent states. Units are jiffies, minimum value is one, and maximum value is HZ. + rcutree.rcu_nocb_leader_stride= [KNL] + Set the number of NOCB kthread groups, which + defaults to the square root of the number of + CPUs. Larger numbers reduces the wakeup overhead + on the per-CPU grace-period kthreads, but increases + that same overhead on each group's leader. + rcutree.qhimark= [KNL] Set threshold of queued RCU callbacks beyond which batch limiting is disabled. diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 0f69a79c5b7d..e996d1e53c84 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -334,11 +334,29 @@ struct rcu_data { struct rcu_head **nocb_tail; atomic_long_t nocb_q_count; /* # CBs waiting for kthread */ atomic_long_t nocb_q_count_lazy; /* (approximate). */ + struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ + struct rcu_head **nocb_follower_tail; + atomic_long_t nocb_follower_count; /* # CBs ready to invoke. */ + atomic_long_t nocb_follower_count_lazy; /* (approximate). */ int nocb_p_count; /* # CBs being invoked by kthread */ int nocb_p_count_lazy; /* (approximate). */ wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_kthread; bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ + + /* The following fields are used by the leader, hence own cacheline. */ + struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; + /* CBs waiting for GP. */ + struct rcu_head **nocb_gp_tail; + long nocb_gp_count; + long nocb_gp_count_lazy; + bool nocb_leader_wake; /* Is the nocb leader thread awake? */ + struct rcu_data *nocb_next_follower; + /* Next follower in wakeup chain. */ + + /* The following fields are used by the follower, hence new cachline. */ + struct rcu_data *nocb_leader ____cacheline_internodealigned_in_smp; + /* Leader CPU takes GP-end wakeups. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ /* 8) RCU CPU stall data. */ @@ -587,8 +605,14 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp); /* Sum up queue lengths for tracing. */ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) { - *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count; - *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy; + *ql = atomic_long_read(&rdp->nocb_q_count) + + rdp->nocb_p_count + + atomic_long_read(&rdp->nocb_follower_count) + + rdp->nocb_p_count + rdp->nocb_gp_count; + *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + + rdp->nocb_p_count_lazy + + atomic_long_read(&rdp->nocb_follower_count_lazy) + + rdp->nocb_p_count_lazy + rdp->nocb_gp_count_lazy; } #else /* #ifdef CONFIG_RCU_NOCB_CPU */ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 02ac0fb186b8..b27b86c7bbfa 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2059,6 +2059,22 @@ bool rcu_is_nocb_cpu(int cpu) } #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ +/* + * Kick the leader kthread for this NOCB group. + */ +static void wake_nocb_leader(struct rcu_data *rdp, bool force) +{ + struct rcu_data *rdp_leader = rdp->nocb_leader; + + if (!ACCESS_ONCE(rdp_leader->nocb_kthread)) + return; + if (!ACCESS_ONCE(rdp_leader->nocb_leader_wake) || force) { + /* Prior xchg orders against prior callback enqueue. */ + ACCESS_ONCE(rdp_leader->nocb_leader_wake) = true; + wake_up(&rdp_leader->nocb_wq); + } +} + /* * Enqueue the specified string of rcu_head structures onto the specified * CPU's no-CBs lists. The CPU is specified by rdp, the head of the @@ -2093,7 +2109,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, len = atomic_long_read(&rdp->nocb_q_count); if (old_rhpp == &rdp->nocb_head) { if (!irqs_disabled_flags(flags)) { - wake_up(&rdp->nocb_wq); /* ... if queue was empty ... */ + /* ... if queue was empty ... */ + wake_nocb_leader(rdp, false); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); } else { @@ -2103,7 +2120,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, } rdp->qlen_last_fqs_check = 0; } else if (len > rdp->qlen_last_fqs_check + qhimark) { - wake_up_process(t); /* ... or if many callbacks queued. */ + /* ... or if many callbacks queued. */ + wake_nocb_leader(rdp, true); rdp->qlen_last_fqs_check = LONG_MAX / 2; trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); } else { @@ -2212,14 +2230,151 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) smp_mb(); /* Ensure that CB invocation happens after GP end. */ } +/* + * Leaders come here to wait for additional callbacks to show up. + * This function does not return until callbacks appear. + */ +static void nocb_leader_wait(struct rcu_data *my_rdp) +{ + bool firsttime = true; + bool gotcbs; + struct rcu_data *rdp; + struct rcu_head **tail; + +wait_again: + + /* Wait for callbacks to appear. */ + if (!rcu_nocb_poll) { + trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); + wait_event_interruptible(my_rdp->nocb_wq, + ACCESS_ONCE(my_rdp->nocb_leader_wake)); + /* Memory barrier handled by smp_mb() calls below and repoll. */ + } else if (firsttime) { + firsttime = false; /* Don't drown trace log with "Poll"! */ + trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll"); + } + + /* + * Each pass through the following loop checks a follower for CBs. + * We are our own first follower. Any CBs found are moved to + * nocb_gp_head, where they await a grace period. + */ + gotcbs = false; + for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { + rdp->nocb_gp_head = ACCESS_ONCE(rdp->nocb_head); + if (!rdp->nocb_gp_head) + continue; /* No CBs here, try next follower. */ + + /* Move callbacks to wait-for-GP list, which is empty. */ + ACCESS_ONCE(rdp->nocb_head) = NULL; + rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); + rdp->nocb_gp_count = atomic_long_xchg(&rdp->nocb_q_count, 0); + rdp->nocb_gp_count_lazy = + atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); + gotcbs = true; + } + + /* + * If there were no callbacks, sleep a bit, rescan after a + * memory barrier, and go retry. + */ + if (unlikely(!gotcbs)) { + if (!rcu_nocb_poll) + trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, + "WokeEmpty"); + flush_signals(current); + schedule_timeout_interruptible(1); + + /* Rescan in case we were a victim of memory ordering. */ + my_rdp->nocb_leader_wake = false; + smp_mb(); /* Ensure _wake false before scan. */ + for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) + if (ACCESS_ONCE(rdp->nocb_head)) { + /* Found CB, so short-circuit next wait. */ + my_rdp->nocb_leader_wake = true; + break; + } + goto wait_again; + } + + /* Wait for one grace period. */ + rcu_nocb_wait_gp(my_rdp); + + /* + * We left ->nocb_leader_wake set to reduce cache thrashing. + * We clear it now, but recheck for new callbacks while + * traversing our follower list. + */ + my_rdp->nocb_leader_wake = false; + smp_mb(); /* Ensure _wake false before scan of ->nocb_head. */ + + /* Each pass through the following loop wakes a follower, if needed. */ + for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { + if (ACCESS_ONCE(rdp->nocb_head)) + my_rdp->nocb_leader_wake = true; /* No need to wait. */ + if (!rdp->nocb_gp_head) + continue; /* No CBs, so no need to wake follower. */ + + /* Append callbacks to follower's "done" list. */ + tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail); + *tail = rdp->nocb_gp_head; + atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count); + atomic_long_add(rdp->nocb_gp_count_lazy, + &rdp->nocb_follower_count_lazy); + if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { + /* + * List was empty, wake up the follower. + * Memory barriers supplied by atomic_long_add(). + */ + wake_up(&rdp->nocb_wq); + } + } + + /* If we (the leader) don't have CBs, go wait some more. */ + if (!my_rdp->nocb_follower_head) + goto wait_again; +} + +/* + * Followers come here to wait for additional callbacks to show up. + * This function does not return until callbacks appear. + */ +static void nocb_follower_wait(struct rcu_data *rdp) +{ + bool firsttime = true; + + for (;;) { + if (!rcu_nocb_poll) { + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + "FollowerSleep"); + wait_event_interruptible(rdp->nocb_wq, + ACCESS_ONCE(rdp->nocb_follower_head)); + } else if (firsttime) { + /* Don't drown trace log with "Poll"! */ + firsttime = false; + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll"); + } + if (smp_load_acquire(&rdp->nocb_follower_head)) { + /* ^^^ Ensure CB invocation follows _head test. */ + return; + } + if (!rcu_nocb_poll) + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + "WokeEmpty"); + flush_signals(current); + schedule_timeout_interruptible(1); + } +} + /* * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes - * callbacks queued by the corresponding no-CBs CPU. + * callbacks queued by the corresponding no-CBs CPU, however, there is + * an optional leader-follower relationship so that the grace-period + * kthreads don't have to do quite so many wakeups. */ static int rcu_nocb_kthread(void *arg) { int c, cl; - bool firsttime = 1; struct rcu_head *list; struct rcu_head *next; struct rcu_head **tail; @@ -2227,41 +2382,22 @@ static int rcu_nocb_kthread(void *arg) /* Each pass through this loop invokes one batch of callbacks */ for (;;) { - /* If not polling, wait for next batch of callbacks. */ - if (!rcu_nocb_poll) { - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("Sleep")); - wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); - /* Memory barrier provide by xchg() below. */ - } else if (firsttime) { - firsttime = 0; - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("Poll")); - } - list = ACCESS_ONCE(rdp->nocb_head); - if (!list) { - if (!rcu_nocb_poll) - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WokeEmpty")); - schedule_timeout_interruptible(1); - flush_signals(current); - continue; - } - firsttime = 1; - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WokeNonEmpty")); + /* Wait for callbacks. */ + if (rdp->nocb_leader == rdp) + nocb_leader_wait(rdp); + else + nocb_follower_wait(rdp); - /* - * Extract queued callbacks, update counts, and wait - * for a grace period to elapse. - */ - ACCESS_ONCE(rdp->nocb_head) = NULL; - tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); - c = atomic_long_xchg(&rdp->nocb_q_count, 0); - cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); - ACCESS_ONCE(rdp->nocb_p_count) += c; - ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; - rcu_nocb_wait_gp(rdp); + /* Pull the ready-to-invoke callbacks onto local list. */ + list = ACCESS_ONCE(rdp->nocb_follower_head); + BUG_ON(!list); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); + ACCESS_ONCE(rdp->nocb_follower_head) = NULL; + tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head); + c = atomic_long_xchg(&rdp->nocb_follower_count, 0); + cl = atomic_long_xchg(&rdp->nocb_follower_count_lazy, 0); + rdp->nocb_p_count += c; + rdp->nocb_p_count_lazy += cl; /* Each pass through the following loop invokes a callback. */ trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); @@ -2305,7 +2441,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) if (!rcu_nocb_need_deferred_wakeup(rdp)) return; ACCESS_ONCE(rdp->nocb_defer_wakeup) = false; - wake_up(&rdp->nocb_wq); + wake_nocb_leader(rdp, false); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty")); } @@ -2314,19 +2450,53 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { rdp->nocb_tail = &rdp->nocb_head; init_waitqueue_head(&rdp->nocb_wq); + rdp->nocb_follower_tail = &rdp->nocb_follower_head; } -/* Create a kthread for each RCU flavor for each no-CBs CPU. */ +/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */ +static int rcu_nocb_leader_stride = -1; +module_param(rcu_nocb_leader_stride, int, 0444); + +/* + * Create a kthread for each RCU flavor for each no-CBs CPU. + * Also initialize leader-follower relationships. + */ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) { int cpu; + int ls = rcu_nocb_leader_stride; + int nl = 0; /* Next leader. */ struct rcu_data *rdp; + struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */ + struct rcu_data *rdp_prev = NULL; struct task_struct *t; if (rcu_nocb_mask == NULL) return; + if (ls == -1) { + ls = int_sqrt(nr_cpu_ids); + rcu_nocb_leader_stride = ls; + } + + /* + * Each pass through this loop sets up one rcu_data structure and + * spawns one rcu_nocb_kthread(). + */ for_each_cpu(cpu, rcu_nocb_mask) { rdp = per_cpu_ptr(rsp->rda, cpu); + if (rdp->cpu >= nl) { + /* New leader, set up for followers & next leader. */ + nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; + rdp->nocb_leader = rdp; + rdp_leader = rdp; + } else { + /* Another follower, link to previous leader. */ + rdp->nocb_leader = rdp_leader; + rdp_prev->nocb_next_follower = rdp; + } + rdp_prev = rdp; + + /* Spawn the kthread for this CPU. */ t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%c/%d", rsp->abbr, cpu); BUG_ON(IS_ERR(t)); From b58cc46c5f6b57f1c814e374dbc47176e6b4938e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Jul 2014 10:13:24 -0700 Subject: [PATCH 09/34] rcu: Don't offload callbacks unless specifically requested Enabling NO_HZ_FULL currently has the side effect of enabling callback offloading on all CPUs. This results in lots of additional rcuo kthreads, and can also increase context switching and wakeups, even in cases where callback offloading is neither needed nor particularly desirable. This commit therefore enables callback offloading on a given CPU only if specifically requested at build time or boot time, or if that CPU has been specifically designated (again, either at build time or boot time) as a nohz_full CPU. Signed-off-by: Paul E. McKenney --- init/Kconfig | 4 ++-- kernel/rcu/tree_plugin.h | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 9d76b99af1b9..9332d33346ac 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -737,7 +737,7 @@ choice config RCU_NOCB_CPU_NONE bool "No build_forced no-CBs CPUs" - depends on RCU_NOCB_CPU && !NO_HZ_FULL + depends on RCU_NOCB_CPU && !NO_HZ_FULL_ALL help This option does not force any of the CPUs to be no-CBs CPUs. Only CPUs designated by the rcu_nocbs= boot parameter will be @@ -751,7 +751,7 @@ config RCU_NOCB_CPU_NONE config RCU_NOCB_CPU_ZERO bool "CPU 0 is a build_forced no-CBs CPU" - depends on RCU_NOCB_CPU && !NO_HZ_FULL + depends on RCU_NOCB_CPU && !NO_HZ_FULL_ALL help This option forces CPU 0 to be a no-CBs CPU, so that its RCU callbacks are invoked by a per-CPU kthread whose name begins diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index b27b86c7bbfa..17eed0856b03 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2473,6 +2473,9 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) if (rcu_nocb_mask == NULL) return; +#ifdef CONFIG_NO_HZ_FULL + cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); +#endif /* #ifdef CONFIG_NO_HZ_FULL */ if (ls == -1) { ls = int_sqrt(nr_cpu_ids); rcu_nocb_leader_stride = ls; From 5726ce06ad6bcd8dd75a204d1465c99a2f897d3a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 13 May 2014 10:14:51 -0700 Subject: [PATCH 10/34] documentation: Clarify wake-up/memory-barrier relationship This commit adds an example demonstrating that if a wake_up() doesn't actually wake something up, no memory ordering is provided. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Reviewed-by: Lai Jiangshan Acked-by: Peter Zijlstra --- Documentation/memory-barriers.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index f1dc4a215593..a6ca533a73fc 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -1893,6 +1893,21 @@ between the STORE to indicate the event and the STORE to set TASK_RUNNING: STORE current->state LOAD event_indicated +To repeat, this write memory barrier is present if and only if something +is actually awakened. To see this, consider the following sequence of +events, where X and Y are both initially zero: + + CPU 1 CPU 2 + =============================== =============================== + X = 1; STORE event_indicated + smp_mb(); wake_up(); + Y = 1; wait_event(wq, Y == 1); + wake_up(); load from Y sees 1, no memory barrier + load from X might see 0 + +In contrast, if a wakeup does occur, CPU 2's load from X would be guaranteed +to see 1. + The available waker functions include: complete(); From 5e40ad7f6a038cfa42e0605764366d994701eab7 Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Mon, 9 Jun 2014 11:51:23 -0400 Subject: [PATCH 11/34] documentation: Update reference, kerneltrap.org no longer works The kerneltrap.org site no longer works, so this commit updates it to a working reference, namely gmane. Signed-off-by: Pranith Kumar Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- Documentation/RCU/RTFP.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/RCU/RTFP.txt b/Documentation/RCU/RTFP.txt index 2f0fcb2112d2..f29bcbc463e7 100644 --- a/Documentation/RCU/RTFP.txt +++ b/Documentation/RCU/RTFP.txt @@ -2451,8 +2451,8 @@ lot of {Linux} into your technology!!!" ,month="February" ,year="2010" ,note="Available: -\url{http://kerneltrap.com/mailarchive/linux-netdev/2010/2/26/6270589} -[Viewed March 20, 2011]" +\url{http://thread.gmane.org/gmane.linux.network/153338} +[Viewed June 9, 2014]" ,annotation={ Use a pair of list_head structures to support RCU-protected resizable hash tables. From 128ea442b12ba63614dccc9b54726cf753aa4758 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Jun 2014 10:01:23 -0700 Subject: [PATCH 12/34] documentation: Add acquire/release barriers to pairing rules It is possible to pair acquire and release barriers with other barriers, so this commit adds them to the list in the SMP barrier pairing section. Reported-by: Lai Jiangshan Signed-off-by: Paul E. McKenney Reviewed-by: Tejun Heo Reviewed-by: Josh Triplett [ paulmck: Updated pairing discussion as suggested by Peter Zijlstra. ] --- Documentation/memory-barriers.txt | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index a6ca533a73fc..a4de88fb55f0 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -757,10 +757,14 @@ SMP BARRIER PAIRING When dealing with CPU-CPU interactions, certain types of memory barrier should always be paired. A lack of appropriate pairing is almost certainly an error. -A write barrier should always be paired with a data dependency barrier or read -barrier, though a general barrier would also be viable. Similarly a read -barrier or a data dependency barrier should always be paired with at least an -write barrier, though, again, a general barrier is viable: +General barriers pair with each other, though they also pair with +most other types of barriers, albeit without transitivity. An acquire +barrier pairs with a release barrier, but both may also pair with other +barriers, including of course general barriers. A write barrier pairs +with a data dependency barrier, an acquire barrier, a release barrier, +a read barrier, or a general barrier. Similarly a read barrier or a +data dependency barrier pairs with a write barrier, an acquire barrier, +a release barrier, or a general barrier: CPU 1 CPU 2 =============== =============== From 9963185c04727769913a2758c0fcccb821c77098 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Jun 2014 11:52:59 -0700 Subject: [PATCH 13/34] documentation: Add pointer to percpu-ref for RCU and refcount Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Reviewed-by: Lai Jiangshan --- Documentation/RCU/rcuref.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/RCU/rcuref.txt b/Documentation/RCU/rcuref.txt index 141d531aa14b..613033ff2b9b 100644 --- a/Documentation/RCU/rcuref.txt +++ b/Documentation/RCU/rcuref.txt @@ -1,5 +1,14 @@ Reference-count design for elements of lists/arrays protected by RCU. + +Please note that the percpu-ref feature is likely your first +stop if you need to combine reference counts and RCU. Please see +include/linux/percpu-refcount.h for more information. However, in +those unusual cases where percpu-ref would consume too much memory, +please read on. + +------------------------------------------------------------------------ + Reference counting on elements of lists which are protected by traditional reader/writer spinlocks or semaphores are straightforward: From ab0afd6c24948e1f261adf510408824c793c1206 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Jun 2014 08:17:06 -0700 Subject: [PATCH 14/34] rcu: Update RCU maintainership Drop Dipankar Sarma at his request (https://lkml.org/lkml/2014/6/2/628), add Josh Triplett based on long-term review, contributions, and agreement to take on this role (https://lkml.org/lkml/2014/6/2/554). Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- MAINTAINERS | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 92f2bf91eec1..922661d17083 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7405,6 +7405,7 @@ F: kernel/rcu/torture.c RCUTORTURE TEST FRAMEWORK M: "Paul E. McKenney" +M: Josh Triplett R: Steven Rostedt R: Mathieu Desnoyers R: Lai Jiangshan @@ -7430,8 +7431,8 @@ S: Supported F: net/rds/ READ-COPY UPDATE (RCU) -M: Dipankar Sarma M: "Paul E. McKenney" +M: Josh Triplett R: Steven Rostedt R: Mathieu Desnoyers R: Lai Jiangshan @@ -8214,6 +8215,7 @@ F: mm/sl?b.c SLEEPABLE READ-COPY UPDATE (SRCU) M: Lai Jiangshan M: "Paul E. McKenney" +M: Josh Triplett R: Steven Rostedt R: Mathieu Desnoyers L: linux-kernel@vger.kernel.org From 34e2d560bcdd639717367a570063011ae74b6782 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 3 Jul 2014 15:08:00 -0700 Subject: [PATCH 15/34] rcu: Update rcu torture maintainership filename patterns Commit 51b1130eb582 ("rcutorture: Abstract rcu_torture_random()") moved the file, so this commit updates the patterns. Signed-off-by: Joe Perches cc: "Paul E. McKenney" Signed-off-by: Paul E. McKenney --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 922661d17083..4619a91d5d8f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7401,7 +7401,7 @@ L: linux-kernel@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git F: Documentation/RCU/torture.txt -F: kernel/rcu/torture.c +F: kernel/rcu/rcutorture.c RCUTORTURE TEST FRAMEWORK M: "Paul E. McKenney" @@ -7445,7 +7445,7 @@ X: Documentation/RCU/torture.txt F: include/linux/rcu* X: include/linux/srcu.h F: kernel/rcu/ -X: kernel/rcu/torture.c +X: kernel/torture.c REAL TIME CLOCK (RTC) SUBSYSTEM M: Alessandro Zummo From c1c3f2c906e35bcb6e4cdf5b8e077660fead14fe Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 2 Jun 2014 12:05:17 -0700 Subject: [PATCH 16/34] scripts: Teach get_maintainer.pl about the new "R:" tag We can now designate reviewers in the MAINTAINERS file with the new "R:" tag, so this commit teaches get_maintainers.pl to add their email addresses. Signed-off-by: Joe Perches Signed-off-by: Paul E. McKenney --- scripts/get_maintainer.pl | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 41987885bd31..d7016279ec2b 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -21,6 +21,7 @@ my $lk_path = "./"; my $email = 1; my $email_usename = 1; my $email_maintainer = 1; +my $email_reviewer = 1; my $email_list = 1; my $email_subscriber_list = 0; my $email_git_penguin_chiefs = 0; @@ -202,6 +203,7 @@ if (!GetOptions( 'remove-duplicates!' => \$email_remove_duplicates, 'mailmap!' => \$email_use_mailmap, 'm!' => \$email_maintainer, + 'r!' => \$email_reviewer, 'n!' => \$email_usename, 'l!' => \$email_list, 's!' => \$email_subscriber_list, @@ -260,7 +262,8 @@ if ($sections) { } if ($email && - ($email_maintainer + $email_list + $email_subscriber_list + + ($email_maintainer + $email_reviewer + + $email_list + $email_subscriber_list + $email_git + $email_git_penguin_chiefs + $email_git_blame) == 0) { die "$P: Please select at least 1 email option\n"; } @@ -750,6 +753,7 @@ MAINTAINER field selection options: --hg-since => hg history to use (default: $email_hg_since) --interactive => display a menu (mostly useful if used with the --git option) --m => include maintainer(s) if any + --r => include reviewer(s) if any --n => include name 'Full Name ' --l => include list(s) if any --s => include subscriber only list(s) if any @@ -1064,6 +1068,22 @@ sub add_categories { my $role = get_maintainer_role($i); push_email_addresses($pvalue, $role); } + } elsif ($ptype eq "R") { + my ($name, $address) = parse_email($pvalue); + if ($name eq "") { + if ($i > 0) { + my $tv = $typevalue[$i - 1]; + if ($tv =~ m/^(\C):\s*(.*)/) { + if ($1 eq "P") { + $name = $2; + $pvalue = format_email($name, $address, $email_usename); + } + } + } + } + if ($email_reviewer) { + push_email_addresses($pvalue, 'reviewer'); + } } elsif ($ptype eq "T") { push(@scm, $pvalue); } elsif ($ptype eq "W") { From f27bc4873fa8b75cc1eba7b641eda7375dc72ccf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 4 May 2014 15:38:38 -0700 Subject: [PATCH 17/34] rcu: Document deadlock-avoidance information for rcu_read_unlock() Reported-by: Oleg Nesterov Signed-off-by: Paul E. McKenney Reviewed-by: Lai Jiangshan --- include/linux/rcupdate.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 6a94cc8b1ca0..c56ad15204ec 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -858,6 +858,34 @@ static inline void rcu_read_lock(void) /** * rcu_read_unlock() - marks the end of an RCU read-side critical section. * + * In most situations, rcu_read_unlock() is immune from deadlock. + * However, in kernels built with CONFIG_RCU_BOOST, rcu_read_unlock() + * is responsible for deboosting, which it does via rt_mutex_unlock(). + * Unfortunately, this function acquires the scheduler's runqueue and + * priority-inheritance spinlocks. This means that deadlock could result + * if the caller of rcu_read_unlock() already holds one of these locks or + * any lock that is ever acquired while holding them. + * + * That said, RCU readers are never priority boosted unless they were + * preempted. Therefore, one way to avoid deadlock is to make sure + * that preemption never happens within any RCU read-side critical + * section whose outermost rcu_read_unlock() is called with one of + * rt_mutex_unlock()'s locks held. Such preemption can be avoided in + * a number of ways, for example, by invoking preempt_disable() before + * critical section's outermost rcu_read_lock(). + * + * Given that the set of locks acquired by rt_mutex_unlock() might change + * at any time, a somewhat more future-proofed approach is to make sure + * that that preemption never happens within any RCU read-side critical + * section whose outermost rcu_read_unlock() is called with irqs disabled. + * This approach relies on the fact that rt_mutex_unlock() currently only + * acquires irq-disabled locks. + * + * The second of these two approaches is best in most situations, + * however, the first approach can also be useful, at least to those + * developers willing to keep abreast of the set of locks acquired by + * rt_mutex_unlock(). + * * See rcu_read_lock() for more information. */ static inline void rcu_read_unlock(void) From ab74fdfd4e11ec040f21cf87edc14fc9f62cc934 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 4 May 2014 15:41:21 -0700 Subject: [PATCH 18/34] rcu: Handle obsolete references to TINY_PREEMPT_RCU Signed-off-by: Paul E. McKenney Reviewed-by: Lai Jiangshan --- include/linux/rcupdate.h | 17 ++++++++--------- init/Kconfig | 2 +- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index c56ad15204ec..d231aa17b1d7 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -826,15 +826,14 @@ static inline void rcu_preempt_sleep_check(void) * read-side critical section that would block in a !PREEMPT kernel. * But if you want the full story, read on! * - * In non-preemptible RCU implementations (TREE_RCU and TINY_RCU), it - * is illegal to block while in an RCU read-side critical section. In - * preemptible RCU implementations (TREE_PREEMPT_RCU and TINY_PREEMPT_RCU) - * in CONFIG_PREEMPT kernel builds, RCU read-side critical sections may - * be preempted, but explicit blocking is illegal. Finally, in preemptible - * RCU implementations in real-time (with -rt patchset) kernel builds, - * RCU read-side critical sections may be preempted and they may also - * block, but only when acquiring spinlocks that are subject to priority - * inheritance. + * In non-preemptible RCU implementations (TREE_RCU and TINY_RCU), + * it is illegal to block while in an RCU read-side critical section. + * In preemptible RCU implementations (TREE_PREEMPT_RCU) in CONFIG_PREEMPT + * kernel builds, RCU read-side critical sections may be preempted, + * but explicit blocking is illegal. Finally, in preemptible RCU + * implementations in real-time (with -rt patchset) kernel builds, RCU + * read-side critical sections may be preempted and they may also block, but + * only when acquiring spinlocks that are subject to priority inheritance. */ static inline void rcu_read_lock(void) { diff --git a/init/Kconfig b/init/Kconfig index 9d76b99af1b9..977b37806e95 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -505,7 +505,7 @@ config PREEMPT_RCU def_bool TREE_PREEMPT_RCU help This option enables preemptible-RCU code that is common between - the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations. + TREE_PREEMPT_RCU and, in the old days, TINY_PREEMPT_RCU. config RCU_STALL_COMMON def_bool ( TREE_RCU || TREE_PREEMPT_RCU || RCU_TRACE ) From c41247e1d4864c863ee25e029dd53acdb2abc000 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 5 May 2014 08:18:30 -0700 Subject: [PATCH 19/34] signal: Explain local_irq_save() call The explicit local_irq_save() in __lock_task_sighand() is needed to avoid a potential deadlock condition, as noted in a841796f11c90d53 (signal: align __lock_task_sighand() irq disabling and RCU). However, someone reading the code might be forgiven for concluding that this separate local_irq_save() was completely unnecessary. This commit therefore adds a comment referencing the shiny new block comment on rcu_read_unlock(). Reported-by: Oleg Nesterov Signed-off-by: Paul E. McKenney Acked-by: Oleg Nesterov Reviewed-by: Lai Jiangshan --- kernel/signal.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/signal.c b/kernel/signal.c index a4077e90f19f..40b76e351e64 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1263,6 +1263,10 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, struct sighand_struct *sighand; for (;;) { + /* + * Disable interrupts early to avoid deadlocks. + * See rcu_read_unlock() comment header for details. + */ local_irq_save(*flags); rcu_read_lock(); sighand = rcu_dereference(tsk->sighand); From b4426b49c65e0d266f8a9181ca51d5bf11407714 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 6 May 2014 19:21:14 +0200 Subject: [PATCH 20/34] rcu: Make rcu node arrays static const char * const Those two arrays are being passed to lockdep_init_map(), which expects const char *, and are stored in lockdep_map the same way. Cc: Dipankar Sarma Cc: Andrew Morton Signed-off-by: Fabian Frederick Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 625d0b0cd75a..ebd99af2214e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3564,14 +3564,16 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) static void __init rcu_init_one(struct rcu_state *rsp, struct rcu_data __percpu *rda) { - static char *buf[] = { "rcu_node_0", - "rcu_node_1", - "rcu_node_2", - "rcu_node_3" }; /* Match MAX_RCU_LVLS */ - static char *fqs[] = { "rcu_node_fqs_0", - "rcu_node_fqs_1", - "rcu_node_fqs_2", - "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ + static const char * const buf[] = { + "rcu_node_0", + "rcu_node_1", + "rcu_node_2", + "rcu_node_3" }; /* Match MAX_RCU_LVLS */ + static const char * const fqs[] = { + "rcu_node_fqs_0", + "rcu_node_fqs_1", + "rcu_node_fqs_2", + "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ static u8 fl_mask = 0x1; int cpustride = 1; int i; From 4da117cfa72e6cde3d9e8f5ed932381863cdeec9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 9 May 2014 18:06:51 -0700 Subject: [PATCH 21/34] rcu: Remove redundant ACCESS_ONCE() from tick_do_timer_cpu In kernels built with CONFIG_NO_HZ_FULL, tick_do_timer_cpu is constant once boot completes. Thus, there is no need to wrap it in ACCESS_ONCE() in code that is built only when CONFIG_NO_HZ_FULL. This commit therefore removes the redundant ACCESS_ONCE(). Signed-off-by: Paul E. McKenney Acked-by: Frederic Weisbecker Reviewed-by: Lai Jiangshan --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 02ac0fb186b8..5da9f9b3abc9 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2844,7 +2844,7 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp) static void rcu_bind_gp_kthread(void) { #ifdef CONFIG_NO_HZ_FULL - int cpu = ACCESS_ONCE(tick_do_timer_cpu); + int cpu = tick_do_timer_cpu; if (cpu < 0 || cpu >= nr_cpu_ids) return; From a792563bd47632d85158c72e2acf4484eed0ec32 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 Jun 2014 14:54:34 -0700 Subject: [PATCH 22/34] rcu: Eliminate read-modify-write ACCESS_ONCE() calls RCU contains code of the following forms: ACCESS_ONCE(x)++; ACCESS_ONCE(x) += y; ACCESS_ONCE(x) -= y; Now these constructs do operate correctly, but they really result in a pair of volatile accesses, one to do the load and another to do the store. This can be confusing, as the casual reader might well assume that (for example) gcc might generate a memory-to-memory add instruction for each of these three cases. In fact, gcc will do no such thing. Also, there is a good chance that the kernel will move to separate load and store variants of ACCESS_ONCE(), and constructs like the above could easily confuse both people and scripts attempting to make that sort of change. Finally, most of RCU's read-modify-write uses of ACCESS_ONCE() really only need the store to be volatile, so that the read-modify-write form might be misleading. This commit therefore changes the above forms in RCU so that each instance of ACCESS_ONCE() either does a load or a store, but not both. In a few cases, ACCESS_ONCE() was not critical, for example, for maintaining statisitics. In these cases, ACCESS_ONCE() has been dispensed with entirely. Suggested-by: Linus Torvalds Signed-off-by: Paul E. McKenney --- kernel/rcu/srcu.c | 4 ++-- kernel/rcu/tree.c | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index c639556f3fa0..e037f3eb2f7b 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -298,9 +298,9 @@ int __srcu_read_lock(struct srcu_struct *sp) idx = ACCESS_ONCE(sp->completed) & 0x1; preempt_disable(); - ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; + __this_cpu_inc(sp->per_cpu_ref->c[idx]); smp_mb(); /* B */ /* Avoid leaking the critical section. */ - ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; + __this_cpu_inc(sp->per_cpu_ref->seq[idx]); preempt_enable(); return idx; } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ebd99af2214e..6bf7daebcc6b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2347,7 +2347,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) } smp_mb(); /* List handling before counting for rcu_barrier(). */ rdp->qlen_lazy -= count_lazy; - ACCESS_ONCE(rdp->qlen) -= count; + ACCESS_ONCE(rdp->qlen) = rdp->qlen - count; rdp->n_cbs_invoked += count; /* Reinstate batch limit if we have worked down the excess. */ @@ -2492,7 +2492,7 @@ static void force_quiescent_state(struct rcu_state *rsp) if (rnp_old != NULL) raw_spin_unlock(&rnp_old->fqslock); if (ret) { - ACCESS_ONCE(rsp->n_force_qs_lh)++; + rsp->n_force_qs_lh++; return; } rnp_old = rnp; @@ -2504,7 +2504,7 @@ static void force_quiescent_state(struct rcu_state *rsp) smp_mb__after_unlock_lock(); raw_spin_unlock(&rnp_old->fqslock); if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { - ACCESS_ONCE(rsp->n_force_qs_lh)++; + rsp->n_force_qs_lh++; raw_spin_unlock_irqrestore(&rnp_old->lock, flags); return; /* Someone beat us to it. */ } @@ -2693,7 +2693,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), local_irq_restore(flags); return; } - ACCESS_ONCE(rdp->qlen)++; + ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1; if (lazy) rdp->qlen_lazy++; else @@ -3257,7 +3257,7 @@ static void _rcu_barrier(struct rcu_state *rsp) * ACCESS_ONCE() to prevent the compiler from speculating * the increment to precede the early-exit check. */ - ACCESS_ONCE(rsp->n_barrier_done)++; + ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1; WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done); smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ @@ -3307,7 +3307,7 @@ static void _rcu_barrier(struct rcu_state *rsp) /* Increment ->n_barrier_done to prevent duplicate work. */ smp_mb(); /* Keep increment after above mechanism. */ - ACCESS_ONCE(rsp->n_barrier_done)++; + ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1; WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done); smp_mb(); /* Keep increment before caller's subsequent code. */ From 1146edcbef3789228454c4aa42c08ddc2c275990 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Jun 2014 08:24:17 -0700 Subject: [PATCH 23/34] rcu: Loosen __call_rcu()'s rcu_head alignment constraint The m68k architecture aligns only to 16-bit boundaries, which can cause the align-to-32-bits check in __call_rcu() to trigger. Because there is currently no known potential need for more than one low-order bit, this commit loosens the check to 16-bit boundaries. Reported-by: Greg Ungerer Signed-off-by: Paul E. McKenney Reviewed-by: Lai Jiangshan --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6bf7daebcc6b..bcd635e42841 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2662,7 +2662,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), unsigned long flags; struct rcu_data *rdp; - WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ + WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */ if (debug_rcu_head_queue(head)) { /* Probable double call_rcu(), so leak the callback. */ ACCESS_ONCE(head->func) = rcu_leak_callback; From dfeb9765ce3c33cb3cbc5f16db423f1c58a4cc55 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 10 Jun 2014 16:31:55 -0700 Subject: [PATCH 24/34] rcu: Allow post-unlock reference for rt_mutex The current approach to RCU priority boosting uses an rt_mutex strictly for its priority-boosting side effects. The rt_mutex_init_proxy_locked() function is used by the booster to initialize the lock as held by the boostee. The booster then uses rt_mutex_lock() to acquire this rt_mutex, which priority-boosts the boostee. When the boostee reaches the end of its outermost RCU read-side critical section, it checks a field in its task structure to see whether it has been boosted, and, if so, uses rt_mutex_unlock() to release the rt_mutex. The booster can then go on to boost the next task that is blocking the current RCU grace period. But reasonable implementations of rt_mutex_unlock() might result in the boostee referencing the rt_mutex's data after releasing it. But the booster might have re-initialized the rt_mutex between the time that the boostee released it and the time that it later referenced it. This is clearly asking for trouble, so this commit introduces a completion that forces the booster to wait until the boostee has completely finished with the rt_mutex, thus avoiding the case where the booster is re-initializing the rt_mutex before the last boostee's last reference to that rt_mutex. This of course does introduce some overhead, but the priority-boosting code paths are miles from any possible fastpath, and the overhead of executing the completion will normally be quite small compared to the overhead of priority boosting and deboosting, so this should be OK. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 5 +++++ kernel/rcu/tree_plugin.h | 8 +++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 0f69a79c5b7d..3eeb919e26a2 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -172,6 +172,11 @@ struct rcu_node { /* queued on this rcu_node structure that */ /* are blocking the current grace period, */ /* there can be no such task. */ + struct completion boost_completion; + /* Used to ensure that the rt_mutex used */ + /* to carry out the boosting is fully */ + /* released with no future boostee accesses */ + /* before that rt_mutex is re-initialized. */ unsigned long boost_time; /* When to start boosting (jiffies). */ struct task_struct *boost_kthread_task; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 5da9f9b3abc9..9c811879d31e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -427,8 +427,10 @@ void rcu_read_unlock_special(struct task_struct *t) #ifdef CONFIG_RCU_BOOST /* Unboost if we were boosted. */ - if (rbmp) + if (rbmp) { rt_mutex_unlock(rbmp); + complete(&rnp->boost_completion); + } #endif /* #ifdef CONFIG_RCU_BOOST */ /* @@ -1202,10 +1204,14 @@ static int rcu_boost(struct rcu_node *rnp) t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&mtx, t); t->rcu_boost_mutex = &mtx; + init_completion(&rnp->boost_completion); raw_spin_unlock_irqrestore(&rnp->lock, flags); rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ + /* Wait until boostee is done accessing mtx before reinitializing. */ + wait_for_completion(&rnp->boost_completion); + return ACCESS_ONCE(rnp->exp_tasks) != NULL || ACCESS_ONCE(rnp->boost_tasks) != NULL; } From 48bd8e9b82a750b983823f391c67e70553757afa Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Wed, 11 Jun 2014 10:32:47 -0700 Subject: [PATCH 25/34] rcu: Check both root and current rcu_node when setting up future grace period The rcu_start_future_gp() function checks the current rcu_node's ->gpnum and ->completed twice, once without ACCESS_ONCE() and once with it. Which is pointless because we hold that rcu_node's ->lock at that point. The intent was to check the current rcu_node structure and the root rcu_node structure, the latter locklessly with ACCESS_ONCE(). This commit therefore makes that change. The reason that it is safe to locklessly check the root rcu_nodes's ->gpnum and ->completed fields is that we hold the current rcu_node's ->lock, which constrains the root rcu_node's ability to change its ->gpnum and ->completed fields. Of course, if there is a single rcu_node structure, then rnp_root==rnp, and holding the lock prevents all changes. If there is more than one rcu_node structure, then the code updates the fields in the following order: 1. Increment rnp_root->gpnum to start new grace period. 2. Increment rnp->gpnum to initialize the current rcu_node, continuing initialization for the new grace period. 3. Increment rnp_root->completed to end the current grace period. 4. Increment rnp->completed to continue cleaning up after the old grace period. So there are four possible combinations of relative values of these four fields: N N N N: RCU idle, new grace period must be initiated. Although rnp_root->gpnum might be incremented immediately after we check, that will just result in unnecessary work. The grace period already started, and we try to start it. N+1 N N N: RCU grace period just started. No further change is possible because we hold rnp->lock, so the checks of rnp_root->gpnum and rnp_root->completed are stable. We know that our request for a future grace period will be seen during grace-period cleanup. N+1 N N+1 N: RCU grace period is ongoing. Because rnp->gpnum is different than rnp->completed, we won't even look at rnp_root->gpnum and rnp_root->completed, so the possible concurrent change to rnp_root->completed does not matter. We know that our request for a future grace period will be seen during grace-period cleanup, which cannot pass this rcu_node because we hold its ->lock. N+1 N+1 N+1 N: RCU grace period has ended, but not yet been cleaned up. Because rnp->gpnum is different than rnp->completed, we won't look at rnp_root->gpnum and rnp_root->completed, so the possible concurrent change to rnp_root->completed does not matter. We know that our request for a future grace period will be seen during grace-period cleanup, which cannot pass this rcu_node because we hold its ->lock. Therefore, despite initial appearances, the lockless check is safe. Signed-off-by: Pranith Kumar [ paulmck: Update comment to say why the lockless check is safe. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bcd635e42841..3f93033d3c61 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1305,10 +1305,16 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, * believe that a grace period is in progress, then we must wait * for the one following, which is in "c". Because our request * will be noticed at the end of the current grace period, we don't - * need to explicitly start one. + * need to explicitly start one. We only do the lockless check + * of rnp_root's fields if the current rcu_node structure thinks + * there is no grace period in flight, and because we hold rnp->lock, + * the only possible change is when rnp_root's two fields are + * equal, in which case rnp_root->gpnum might be concurrently + * incremented. But that is OK, as it will just result in our + * doing some extra useless work. */ if (rnp->gpnum != rnp->completed || - ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { + ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) { rnp->need_future_gp[c & 0x1]++; trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); goto out; From abaa93d9e1de2c29297e69ddba8ddd38f15064cf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Jun 2014 13:30:25 -0700 Subject: [PATCH 26/34] rcu: Simplify priority boosting by putting rt_mutex in rcu_node RCU priority boosting currently checks for boosting via a pointer in task_struct. However, this is not needed: As Oleg noted, if the rt_mutex is placed in the rcu_node instead of on the booster's stack, the boostee can simply check it see if it owns the lock. This commit makes this change, shrinking task_struct by one pointer and the kernel by thirteen lines. Suggested-by: Oleg Nesterov Signed-off-by: Paul E. McKenney --- include/linux/init_task.h | 9 +-------- include/linux/sched.h | 6 ------ kernel/rcu/tree.h | 3 +++ kernel/rcu/tree_plugin.h | 25 +++++++++++-------------- 4 files changed, 15 insertions(+), 28 deletions(-) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 6df7f9fe0d01..2bb4c4f3531a 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -102,12 +102,6 @@ extern struct group_info init_groups; #define INIT_IDS #endif -#ifdef CONFIG_RCU_BOOST -#define INIT_TASK_RCU_BOOST() \ - .rcu_boost_mutex = NULL, -#else -#define INIT_TASK_RCU_BOOST() -#endif #ifdef CONFIG_TREE_PREEMPT_RCU #define INIT_TASK_RCU_TREE_PREEMPT() \ .rcu_blocked_node = NULL, @@ -119,8 +113,7 @@ extern struct group_info init_groups; .rcu_read_lock_nesting = 0, \ .rcu_read_unlock_special = 0, \ .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), \ - INIT_TASK_RCU_TREE_PREEMPT() \ - INIT_TASK_RCU_BOOST() + INIT_TASK_RCU_TREE_PREEMPT() #else #define INIT_TASK_RCU_PREEMPT(tsk) #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 306f4f0c987a..3cfbc05e66e6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1270,9 +1270,6 @@ struct task_struct { #ifdef CONFIG_TREE_PREEMPT_RCU struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -#ifdef CONFIG_RCU_BOOST - struct rt_mutex *rcu_boost_mutex; -#endif /* #ifdef CONFIG_RCU_BOOST */ #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; @@ -2009,9 +2006,6 @@ static inline void rcu_copy_process(struct task_struct *p) #ifdef CONFIG_TREE_PREEMPT_RCU p->rcu_blocked_node = NULL; #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -#ifdef CONFIG_RCU_BOOST - p->rcu_boost_mutex = NULL; -#endif /* #ifdef CONFIG_RCU_BOOST */ INIT_LIST_HEAD(&p->rcu_node_entry); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 3eeb919e26a2..60fb0eaa2d16 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -177,6 +177,9 @@ struct rcu_node { /* to carry out the boosting is fully */ /* released with no future boostee accesses */ /* before that rt_mutex is re-initialized. */ + struct rt_mutex boost_mtx; + /* Used only for the priority-boosting */ + /* side effect, not as a lock. */ unsigned long boost_time; /* When to start boosting (jiffies). */ struct task_struct *boost_kthread_task; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 9c811879d31e..719587af7b10 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -33,6 +33,7 @@ #define RCU_KTHREAD_PRIO 1 #ifdef CONFIG_RCU_BOOST +#include "../locking/rtmutex_common.h" #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO #else #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO @@ -336,7 +337,7 @@ void rcu_read_unlock_special(struct task_struct *t) unsigned long flags; struct list_head *np; #ifdef CONFIG_RCU_BOOST - struct rt_mutex *rbmp = NULL; + bool drop_boost_mutex = false; #endif /* #ifdef CONFIG_RCU_BOOST */ struct rcu_node *rnp; int special; @@ -398,11 +399,8 @@ void rcu_read_unlock_special(struct task_struct *t) #ifdef CONFIG_RCU_BOOST if (&t->rcu_node_entry == rnp->boost_tasks) rnp->boost_tasks = np; - /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */ - if (t->rcu_boost_mutex) { - rbmp = t->rcu_boost_mutex; - t->rcu_boost_mutex = NULL; - } + /* Snapshot ->boost_mtx ownership with rcu_node lock held. */ + drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; #endif /* #ifdef CONFIG_RCU_BOOST */ /* @@ -427,8 +425,8 @@ void rcu_read_unlock_special(struct task_struct *t) #ifdef CONFIG_RCU_BOOST /* Unboost if we were boosted. */ - if (rbmp) { - rt_mutex_unlock(rbmp); + if (drop_boost_mutex) { + rt_mutex_unlock(&rnp->boost_mtx); complete(&rnp->boost_completion); } #endif /* #ifdef CONFIG_RCU_BOOST */ @@ -1151,7 +1149,6 @@ static void rcu_wake_cond(struct task_struct *t, int status) static int rcu_boost(struct rcu_node *rnp) { unsigned long flags; - struct rt_mutex mtx; struct task_struct *t; struct list_head *tb; @@ -1202,14 +1199,14 @@ static int rcu_boost(struct rcu_node *rnp) * section. */ t = container_of(tb, struct task_struct, rcu_node_entry); - rt_mutex_init_proxy_locked(&mtx, t); - t->rcu_boost_mutex = &mtx; + rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); init_completion(&rnp->boost_completion); raw_spin_unlock_irqrestore(&rnp->lock, flags); - rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ - rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ + /* Lock only for side effect: boosts task t's priority. */ + rt_mutex_lock(&rnp->boost_mtx); + rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ - /* Wait until boostee is done accessing mtx before reinitializing. */ + /* Wait for boostee to be done w/boost_mtx before reinitializing. */ wait_for_completion(&rnp->boost_completion); return ACCESS_ONCE(rnp->exp_tasks) != NULL || From c0f489d2c6fec8994c642c2ec925eb858727dc7b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Jun 2014 13:46:03 -0700 Subject: [PATCH 27/34] rcu: Bind grace-period kthreads to non-NO_HZ_FULL CPUs Binding the grace-period kthreads to the timekeeping CPU resulted in significant performance decreases for some workloads. For more detail, see: https://lkml.org/lkml/2014/6/3/395 for benchmark numbers https://lkml.org/lkml/2014/6/4/218 for CPU statistics It turns out that it is necessary to bind the grace-period kthreads to the timekeeping CPU only when all but CPU 0 is a nohz_full CPU on the one hand or if CONFIG_NO_HZ_FULL_SYSIDLE=y on the other. In other cases, it suffices to bind the grace-period kthreads to the set of non-nohz_full CPUs. This commit therefore creates a tick_nohz_not_full_mask that is the complement of tick_nohz_full_mask, and then binds the grace-period kthread to the set of CPUs indicated by this new mask, which covers the CONFIG_NO_HZ_FULL_SYSIDLE=n case. The CONFIG_NO_HZ_FULL_SYSIDLE=y case still binds the grace-period kthreads to the timekeeping CPU. This commit also includes the tick_nohz_full_enabled() check suggested by Frederic Weisbecker. Reported-by: Jet Chen Signed-off-by: Paul E. McKenney [ paulmck: Created housekeeping_affine() and housekeeping_mask per fweisbec feedback. ] --- include/linux/tick.h | 20 ++++++++++++++++++++ kernel/rcu/tree_plugin.h | 14 +++++++++----- kernel/time/tick-sched.c | 10 ++++++++++ 3 files changed, 39 insertions(+), 5 deletions(-) diff --git a/include/linux/tick.h b/include/linux/tick.h index b84773cb9f4c..06cc093ab7ad 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef CONFIG_GENERIC_CLOCKEVENTS @@ -162,6 +163,7 @@ static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } #ifdef CONFIG_NO_HZ_FULL extern bool tick_nohz_full_running; extern cpumask_var_t tick_nohz_full_mask; +extern cpumask_var_t housekeeping_mask; static inline bool tick_nohz_full_enabled(void) { @@ -194,6 +196,24 @@ static inline void tick_nohz_full_kick_all(void) { } static inline void __tick_nohz_task_switch(struct task_struct *tsk) { } #endif +static inline bool is_housekeeping_cpu(int cpu) +{ +#ifdef CONFIG_NO_HZ_FULL + if (tick_nohz_full_enabled()) + return cpumask_test_cpu(cpu, housekeeping_mask); +#endif + return true; +} + +static inline void housekeeping_affine(struct task_struct *t) +{ +#ifdef CONFIG_NO_HZ_FULL + if (tick_nohz_full_enabled()) + set_cpus_allowed_ptr(t, housekeeping_mask); + +#endif +} + static inline void tick_nohz_full_check(void) { if (tick_nohz_full_enabled()) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 719587af7b10..b39ba7239bd6 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2846,12 +2846,16 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp) */ static void rcu_bind_gp_kthread(void) { -#ifdef CONFIG_NO_HZ_FULL - int cpu = tick_do_timer_cpu; + int __maybe_unused cpu; - if (cpu < 0 || cpu >= nr_cpu_ids) + if (!tick_nohz_full_enabled()) return; - if (raw_smp_processor_id() != cpu) +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE + cpu = tick_do_timer_cpu; + if (cpu >= 0 && cpu < nr_cpu_ids && raw_smp_processor_id() != cpu) set_cpus_allowed_ptr(current, cpumask_of(cpu)); -#endif /* #ifdef CONFIG_NO_HZ_FULL */ +#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + if (!is_housekeeping_cpu(raw_smp_processor_id())) + housekeeping_affine(current); +#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6558b7ac112d..f784d83e29f1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -154,6 +154,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; +cpumask_var_t housekeeping_mask; bool tick_nohz_full_running; static bool can_stop_full_tick(void) @@ -281,6 +282,7 @@ static int __init tick_nohz_full_setup(char *str) int cpu; alloc_bootmem_cpumask_var(&tick_nohz_full_mask); + alloc_bootmem_cpumask_var(&housekeeping_mask); if (cpulist_parse(str, tick_nohz_full_mask) < 0) { pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); return 1; @@ -291,6 +293,8 @@ static int __init tick_nohz_full_setup(char *str) pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); cpumask_clear_cpu(cpu, tick_nohz_full_mask); } + cpumask_andnot(housekeeping_mask, + cpu_possible_mask, tick_nohz_full_mask); tick_nohz_full_running = true; return 1; @@ -332,9 +336,15 @@ static int tick_nohz_init_all(void) pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); return err; } + if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { + pr_err("NO_HZ: Can't allocate not-full dynticks cpumask\n"); + return err; + } err = 0; cpumask_setall(tick_nohz_full_mask); cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask); + cpumask_clear(housekeeping_mask); + cpumask_set_cpu(smp_processor_id(), housekeeping_mask); tick_nohz_full_running = true; #endif return err; From bc1dce514e9b29b64df28a533015885862f47814 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Jun 2014 09:18:31 -0700 Subject: [PATCH 28/34] rcu: Don't use NMIs to dump other CPUs' stacks Although NMI-based stack dumps are in principle more accurate, they are also more likely to trigger deadlocks. This commit therefore replaces all uses of trigger_all_cpu_backtrace() with rcu_dump_cpu_stacks(), so that the CPU detecting an RCU CPU stall does the stack dumping. Signed-off-by: Paul E. McKenney Reviewed-by: Lai Jiangshan --- kernel/rcu/tree.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3f93033d3c61..8f3e4d43d736 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1013,10 +1013,7 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) } /* - * Dump stacks of all tasks running on stalled CPUs. This is a fallback - * for architectures that do not implement trigger_all_cpu_backtrace(). - * The NMI-triggered stack traces are more accurate because they are - * printed by the target CPU. + * Dump stacks of all tasks running on stalled CPUs. */ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) { @@ -1094,7 +1091,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) (long)rsp->gpnum, (long)rsp->completed, totqlen); if (ndetected == 0) pr_err("INFO: Stall ended before state dump start\n"); - else if (!trigger_all_cpu_backtrace()) + else rcu_dump_cpu_stacks(rsp); /* Complain about tasks blocking the grace period. */ @@ -1125,8 +1122,7 @@ static void print_cpu_stall(struct rcu_state *rsp) pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", jiffies - rsp->gp_start, (long)rsp->gpnum, (long)rsp->completed, totqlen); - if (!trigger_all_cpu_backtrace()) - dump_stack(); + rcu_dump_cpu_stacks(rsp); raw_spin_lock_irqsave(&rnp->lock, flags); if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall))) From d860d40327dde251d508a234fa00bd0d90fbb656 Mon Sep 17 00:00:00 2001 From: Shan Wei Date: Thu, 19 Jun 2014 14:12:44 -0700 Subject: [PATCH 29/34] rcu: Use __this_cpu_read() instead of per_cpu_ptr() The __this_cpu_read() function produces better code than does per_cpu_ptr() on both ARM and x86. For example, gcc (Ubuntu/Linaro 4.7.3-12ubuntu1) 4.7.3 produces the following: ARMv7 per_cpu_ptr(): force_quiescent_state: mov r3, sp @, bic r1, r3, #8128 @ tmp171,, ldr r2, .L98 @ tmp169, bic r1, r1, #63 @ tmp170, tmp171, ldr r3, [r0, #220] @ __ptr, rsp_6(D)->rda ldr r1, [r1, #20] @ D.35903_68->cpu, D.35903_68->cpu mov r6, r0 @ rsp, rsp ldr r2, [r2, r1, asl #2] @ tmp173, __per_cpu_offset add r3, r3, r2 @ tmp175, __ptr, tmp173 ldr r5, [r3, #12] @ rnp_old, D.29162_13->mynode ARMv7 __this_cpu_read(): force_quiescent_state: ldr r3, [r0, #220] @ rsp_7(D)->rda, rsp_7(D)->rda mov r6, r0 @ rsp, rsp add r3, r3, #12 @ __ptr, rsp_7(D)->rda, ldr r5, [r2, r3] @ rnp_old, *D.29176_13 Using gcc 4.8.2: x86_64 per_cpu_ptr(): movl %gs:cpu_number,%edx # cpu_number, pscr_ret__ movslq %edx, %rdx # pscr_ret__, pscr_ret__ movq __per_cpu_offset(,%rdx,8), %rdx # __per_cpu_offset, tmp93 movq %rdi, %r13 # rsp, rsp movq 1000(%rdi), %rax # rsp_9(D)->rda, __ptr movq 24(%rdx,%rax), %r12 # _15->mynode, rnp_old x86_64 __this_cpu_read(): movq %rdi, %r13 # rsp, rsp movq 1000(%rdi), %rax # rsp_9(D)->rda, rsp_9(D)->rda movq %gs:24(%rax),%r12 # _10->mynode, rnp_old Because this change produces significant benefits for these two very diverse architectures, this commit makes this change. Signed-off-by: Shan Wei Acked-by: Christoph Lameter Signed-off-by: Pranith Kumar Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Reviewed-by: Lai Jiangshan --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8f3e4d43d736..a6c5424ffa38 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2487,7 +2487,7 @@ static void force_quiescent_state(struct rcu_state *rsp) struct rcu_node *rnp_old = NULL; /* Funnel through hierarchy to reduce memory contention. */ - rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; + rnp = __this_cpu_read(rsp->rda->mynode); for (; rnp != NULL; rnp = rnp->parent) { ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) || !raw_spin_trylock(&rnp->fqslock); From 11992c703a1c7d95f5d8759498d7617d4a504819 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Jun 2014 12:09:52 -0700 Subject: [PATCH 30/34] rcu: Remove CONFIG_PROVE_RCU_DELAY The CONFIG_PROVE_RCU_DELAY Kconfig parameter doesn't appear to be very effective at finding race conditions, so this commit removes it. Signed-off-by: Paul E. McKenney Cc: Andi Kleen [ paulmck: Remove definition and uses as noted by Paul Bolle. ] --- kernel/rcu/tree.c | 5 ----- kernel/rcu/update.c | 3 --- lib/Kconfig.debug | 14 -------------- .../selftests/rcutorture/configs/rcu/TREE01 | 1 - .../selftests/rcutorture/configs/rcu/TREE02 | 1 - .../selftests/rcutorture/configs/rcu/TREE02-T | 1 - .../selftests/rcutorture/configs/rcu/TREE03 | 1 - .../selftests/rcutorture/configs/rcu/TREE04 | 1 - .../selftests/rcutorture/configs/rcu/TREE05 | 1 - .../selftests/rcutorture/configs/rcu/TREE06 | 1 - .../selftests/rcutorture/configs/rcu/TREE07 | 1 - .../selftests/rcutorture/configs/rcu/TREE08 | 1 - .../selftests/rcutorture/configs/rcu/TREE08-T | 1 - .../selftests/rcutorture/configs/rcu/TREE09 | 1 - .../configs/rcu/v0.0/P5-U-T-NH-sd-SMP-hp | 1 - .../configs/rcu/v3.12/P5-U-T-NH-sd-SMP-hp | 1 - .../configs/rcu/v3.3/P5-U-T-NH-sd-SMP-hp | 1 - .../configs/rcu/v3.5/P5-U-T-NH-sd-SMP-hp | 1 - .../selftests/rcutorture/doc/TREE_RCU-kconfig.txt | 1 - 19 files changed, 38 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a6c5424ffa38..1b70cb6fbe3c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1647,11 +1647,6 @@ static int rcu_gp_init(struct rcu_state *rsp) rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); raw_spin_unlock_irq(&rnp->lock); -#ifdef CONFIG_PROVE_RCU_DELAY - if ((prandom_u32() % (rcu_num_nodes + 1)) == 0 && - system_state == SYSTEM_RUNNING) - udelay(200); -#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ cond_resched(); } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index bc7883570530..4056d7992a6c 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -90,9 +90,6 @@ void __rcu_read_unlock(void) } else { barrier(); /* critical section before exit code. */ t->rcu_read_lock_nesting = INT_MIN; -#ifdef CONFIG_PROVE_RCU_DELAY - udelay(10); /* Make preemption more probable. */ -#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ barrier(); /* assign before ->rcu_read_unlock_special load */ if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) rcu_read_unlock_special(t); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7cfcc1b8e101..4c37d5459b41 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1131,20 +1131,6 @@ config PROVE_RCU_REPEATEDLY Say N if you are unsure. -config PROVE_RCU_DELAY - bool "RCU debugging: preemptible RCU race provocation" - depends on DEBUG_KERNEL && PREEMPT_RCU - default n - help - There is a class of races that involve an unlikely preemption - of __rcu_read_unlock() just after ->rcu_read_lock_nesting has - been set to INT_MIN. This feature inserts a delay at that - point to increase the probability of these races. - - Say Y to increase probability of preemption of __rcu_read_unlock(). - - Say N if you are unsure. - config SPARSE_RCU_POINTER bool "RCU debugging: sparse-based checks for pointer usage" default n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE01 b/tools/testing/selftests/rcutorture/configs/rcu/TREE01 index 9c827ec59a97..063b7079c621 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE01 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE01 @@ -15,7 +15,6 @@ CONFIG_RCU_FANOUT_EXACT=n CONFIG_RCU_NOCB_CPU=y CONFIG_RCU_NOCB_CPU_ZERO=y CONFIG_DEBUG_LOCK_ALLOC=n -CONFIG_PROVE_RCU_DELAY=n CONFIG_RCU_CPU_STALL_INFO=n CONFIG_RCU_CPU_STALL_VERBOSE=n CONFIG_RCU_BOOST=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE02 b/tools/testing/selftests/rcutorture/configs/rcu/TREE02 index 1a777b5f68b5..ea119ba2f7d4 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE02 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE02 @@ -18,7 +18,6 @@ CONFIG_RCU_FANOUT_EXACT=n CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=n -CONFIG_PROVE_RCU_DELAY=n CONFIG_RCU_CPU_STALL_INFO=n CONFIG_RCU_CPU_STALL_VERBOSE=y CONFIG_RCU_BOOST=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE02-T b/tools/testing/selftests/rcutorture/configs/rcu/TREE02-T index 61c8d9ce5bb2..19cf9485f48a 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE02-T +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE02-T @@ -18,7 +18,6 @@ CONFIG_RCU_FANOUT_EXACT=n CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=n -CONFIG_PROVE_RCU_DELAY=n CONFIG_RCU_CPU_STALL_INFO=n CONFIG_RCU_CPU_STALL_VERBOSE=y CONFIG_RCU_BOOST=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03 b/tools/testing/selftests/rcutorture/configs/rcu/TREE03 index c1f111c1561b..f4567fb3e332 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE03 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03 @@ -14,7 +14,6 @@ CONFIG_RCU_FANOUT_LEAF=4 CONFIG_RCU_FANOUT_EXACT=n CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=n -CONFIG_PROVE_RCU_DELAY=n CONFIG_RCU_CPU_STALL_INFO=n CONFIG_RCU_CPU_STALL_VERBOSE=n CONFIG_RCU_BOOST=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 index 7dbd27ce17a4..0a262fbb0c12 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 @@ -18,7 +18,6 @@ CONFIG_RCU_FANOUT_LEAF=2 CONFIG_RCU_FANOUT_EXACT=n CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=n -CONFIG_PROVE_RCU_DELAY=n CONFIG_RCU_CPU_STALL_INFO=y CONFIG_RCU_CPU_STALL_VERBOSE=y CONFIG_DEBUG_OBJECTS_RCU_HEAD=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE05 b/tools/testing/selftests/rcutorture/configs/rcu/TREE05 index d0f32e574743..3a06b97e9a73 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE05 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE05 @@ -18,7 +18,6 @@ CONFIG_RCU_NOCB_CPU_NONE=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y CONFIG_PROVE_RCU=y -CONFIG_PROVE_RCU_DELAY=y CONFIG_RCU_CPU_STALL_INFO=n CONFIG_RCU_CPU_STALL_VERBOSE=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE06 b/tools/testing/selftests/rcutorture/configs/rcu/TREE06 index 2e477dfb9c57..8f084cca91bf 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE06 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE06 @@ -19,7 +19,6 @@ CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y CONFIG_PROVE_RCU=y -CONFIG_PROVE_RCU_DELAY=n CONFIG_RCU_CPU_STALL_INFO=n CONFIG_RCU_CPU_STALL_VERBOSE=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 b/tools/testing/selftests/rcutorture/configs/rcu/TREE07 index 042f86ef362a..ab6225506909 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE07 @@ -17,7 +17,6 @@ CONFIG_RCU_FANOUT_LEAF=2 CONFIG_RCU_FANOUT_EXACT=n CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=n -CONFIG_PROVE_RCU_DELAY=n CONFIG_RCU_CPU_STALL_INFO=y CONFIG_RCU_CPU_STALL_VERBOSE=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08 b/tools/testing/selftests/rcutorture/configs/rcu/TREE08 index 3438cee1e3c5..69a2e255bf98 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE08 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE08 @@ -18,7 +18,6 @@ CONFIG_RCU_FANOUT_LEAF=2 CONFIG_RCU_NOCB_CPU=y CONFIG_RCU_NOCB_CPU_ALL=y CONFIG_DEBUG_LOCK_ALLOC=n -CONFIG_PROVE_RCU_DELAY=n CONFIG_RCU_CPU_STALL_INFO=n CONFIG_RCU_CPU_STALL_VERBOSE=n CONFIG_RCU_BOOST=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T b/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T index bf4523d3e44c..a0f32fb8f17e 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T @@ -18,7 +18,6 @@ CONFIG_RCU_FANOUT_LEAF=2 CONFIG_RCU_NOCB_CPU=y CONFIG_RCU_NOCB_CPU_ALL=y CONFIG_DEBUG_LOCK_ALLOC=n -CONFIG_PROVE_RCU_DELAY=n CONFIG_RCU_CPU_STALL_INFO=n CONFIG_RCU_CPU_STALL_VERBOSE=n CONFIG_RCU_BOOST=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE09 b/tools/testing/selftests/rcutorture/configs/rcu/TREE09 index 81e4f7c0bf0b..b7a62a540ad1 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE09 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE09 @@ -13,7 +13,6 @@ CONFIG_SUSPEND=n CONFIG_HIBERNATION=n CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=n -CONFIG_PROVE_RCU_DELAY=n CONFIG_RCU_CPU_STALL_INFO=n CONFIG_RCU_CPU_STALL_VERBOSE=n CONFIG_RCU_BOOST=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/v0.0/P5-U-T-NH-sd-SMP-hp b/tools/testing/selftests/rcutorture/configs/rcu/v0.0/P5-U-T-NH-sd-SMP-hp index ef624ce73d8e..a55c00877fe4 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/v0.0/P5-U-T-NH-sd-SMP-hp +++ b/tools/testing/selftests/rcutorture/configs/rcu/v0.0/P5-U-T-NH-sd-SMP-hp @@ -13,7 +13,6 @@ CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=y #CHECK#CONFIG_TREE_PREEMPT_RCU=y CONFIG_DEBUG_KERNEL=y -CONFIG_PROVE_RCU_DELAY=y CONFIG_DEBUG_OBJECTS=y CONFIG_DEBUG_OBJECTS_RCU_HEAD=y CONFIG_RT_MUTEXES=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/v3.12/P5-U-T-NH-sd-SMP-hp b/tools/testing/selftests/rcutorture/configs/rcu/v3.12/P5-U-T-NH-sd-SMP-hp index ef624ce73d8e..a55c00877fe4 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/v3.12/P5-U-T-NH-sd-SMP-hp +++ b/tools/testing/selftests/rcutorture/configs/rcu/v3.12/P5-U-T-NH-sd-SMP-hp @@ -13,7 +13,6 @@ CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=y #CHECK#CONFIG_TREE_PREEMPT_RCU=y CONFIG_DEBUG_KERNEL=y -CONFIG_PROVE_RCU_DELAY=y CONFIG_DEBUG_OBJECTS=y CONFIG_DEBUG_OBJECTS_RCU_HEAD=y CONFIG_RT_MUTEXES=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/v3.3/P5-U-T-NH-sd-SMP-hp b/tools/testing/selftests/rcutorture/configs/rcu/v3.3/P5-U-T-NH-sd-SMP-hp index ef624ce73d8e..a55c00877fe4 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/v3.3/P5-U-T-NH-sd-SMP-hp +++ b/tools/testing/selftests/rcutorture/configs/rcu/v3.3/P5-U-T-NH-sd-SMP-hp @@ -13,7 +13,6 @@ CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=y #CHECK#CONFIG_TREE_PREEMPT_RCU=y CONFIG_DEBUG_KERNEL=y -CONFIG_PROVE_RCU_DELAY=y CONFIG_DEBUG_OBJECTS=y CONFIG_DEBUG_OBJECTS_RCU_HEAD=y CONFIG_RT_MUTEXES=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/v3.5/P5-U-T-NH-sd-SMP-hp b/tools/testing/selftests/rcutorture/configs/rcu/v3.5/P5-U-T-NH-sd-SMP-hp index ef624ce73d8e..a55c00877fe4 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/v3.5/P5-U-T-NH-sd-SMP-hp +++ b/tools/testing/selftests/rcutorture/configs/rcu/v3.5/P5-U-T-NH-sd-SMP-hp @@ -13,7 +13,6 @@ CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=y #CHECK#CONFIG_TREE_PREEMPT_RCU=y CONFIG_DEBUG_KERNEL=y -CONFIG_PROVE_RCU_DELAY=y CONFIG_DEBUG_OBJECTS=y CONFIG_DEBUG_OBJECTS_RCU_HEAD=y CONFIG_RT_MUTEXES=y diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt index adbb76cffb49..3e588db86a17 100644 --- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt +++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt @@ -14,7 +14,6 @@ CONFIG_NO_HZ_FULL_SYSIDLE -- Do one. CONFIG_PREEMPT -- Do half. (First three and #8.) CONFIG_PROVE_LOCKING -- Do all but two, covering CONFIG_PROVE_RCU and not. CONFIG_PROVE_RCU -- Do all but one under CONFIG_PROVE_LOCKING. -CONFIG_PROVE_RCU_DELAY -- Do one. CONFIG_RCU_BOOST -- one of TREE_PREEMPT_RCU. CONFIG_RCU_BOOST_PRIO -- set to 2 for _BOOST testing. CONFIG_RCU_CPU_STALL_INFO -- do one with and without _VERBOSE. From 406e3e536550bcb87ccbedddcd483776b1828761 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Jun 2014 13:48:28 -0700 Subject: [PATCH 31/34] rcu: Fix __rcu_reclaim() to use true/false for bool The __rcu_reclaim() function returned 0/1, which is not proper for a function of type bool. This commit therefore converts to false/true. Signed-off-by: Paul E. McKenney Reviewed-by: Lai Jiangshan --- kernel/rcu/rcu.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index bfda2726ca45..ff1a6de62f17 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -99,6 +99,10 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) void kfree(const void *); +/* + * Reclaim the specified callback, either by invoking it (non-lazy case) + * or freeing it directly (lazy case). Return true if lazy, false otherwise. + */ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) { unsigned long offset = (unsigned long)head->func; @@ -108,12 +112,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); kfree((void *)head - offset); rcu_lock_release(&rcu_callback_map); - return 1; + return true; } else { RCU_TRACE(trace_rcu_invoke_callback(rn, head)); head->func(head); rcu_lock_release(&rcu_callback_map); - return 0; + return false; } } From 615e41c6050a4878b2b68297f4672287941b93cd Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Wed, 11 Jun 2014 16:39:40 -0400 Subject: [PATCH 32/34] rcu: Fix a sparse warning in rcu_initiate_boost() This commit annotates rcu_initiate_boost() fixes the following sparse warning: kernel/rcu/tree_plugin.h:1494:13: warning: context imbalance in 'rcu_initiate_boost' - unexpected unlock Signed-off-by: Pranith Kumar Signed-off-by: Paul E. McKenney Reviewed-by: Lai Jiangshan --- kernel/rcu/tree_plugin.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index b39ba7239bd6..0409ba34a05c 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1259,6 +1259,7 @@ static int rcu_boost_kthread(void *arg) * about it going away. */ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) + __releases(rnp->lock) { struct task_struct *t; @@ -1494,6 +1495,7 @@ static void rcu_prepare_kthreads(int cpu) #else /* #ifdef CONFIG_RCU_BOOST */ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) + __releases(rnp->lock) { raw_spin_unlock_irqrestore(&rnp->lock, flags); } From b41d1b924d0bd41a225a17f39297b9de0dca93d9 Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Wed, 11 Jun 2014 16:39:41 -0400 Subject: [PATCH 33/34] rcu: Fix a sparse warning in rcu_report_unblock_qs_rnp() This commit annotates rcu_report_unblock_qs_rnp() in order to fix the following sparse warning: kernel/rcu/tree_plugin.h:990:13: warning: context imbalance in 'rcu_report_unblock_qs_rnp' - unexpected unlock Signed-off-by: Pranith Kumar Signed-off-by: Paul E. McKenney Reviewed-by: Lai Jiangshan --- kernel/rcu/tree_plugin.h | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0409ba34a05c..c66bdcb20c82 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -988,6 +988,7 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) /* Because preemptible RCU does not exist, no quieting of tasks. */ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) + __releases(rnp->lock) { raw_spin_unlock_irqrestore(&rnp->lock, flags); } From 187497fa5e9e9383820d33e48b87f8200a747c2a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Jul 2014 07:37:06 -0700 Subject: [PATCH 34/34] rcu: Allow for NULL tick_nohz_full_mask when nohz_full= missing If there isn't a nohz_full= kernel parameter specified, then tick_nohz_full_mask can legitimately be NULL. This can cause problems when RCU's boot code tries to cpumask_or() this value into rcu_nocb_mask. In addition, if NO_HZ_FULL_ALL=y, there is no point in doing the cpumask_or() in the first place because this will cause RCU_NOCB_CPU_ALL=y, which in turn will have all bits already set in rcu_nocb_mask. This commit therefore avoids the cpumask_or() if NO_HZ_FULL_ALL=y and checks for !tick_nohz_full_running otherwise, this latter check catching cases when there was no nohz_full= kernel parameter specified. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f62b7f2f6abd..00dc411e9676 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2479,9 +2479,10 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) if (rcu_nocb_mask == NULL) return; -#ifdef CONFIG_NO_HZ_FULL - cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); -#endif /* #ifdef CONFIG_NO_HZ_FULL */ +#if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) + if (tick_nohz_full_running) + cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); +#endif /* #if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) */ if (ls == -1) { ls = int_sqrt(nr_cpu_ids); rcu_nocb_leader_stride = ls;