2019-01-18 02:23:39 +08:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0+ */
|
2009-08-23 04:56:45 +08:00
|
|
|
/*
|
|
|
|
* Read-Copy Update mechanism for mutual exclusion (tree-based version)
|
|
|
|
* Internal non-public definitions.
|
|
|
|
*
|
|
|
|
* Copyright IBM Corporation, 2008
|
|
|
|
*
|
|
|
|
* Author: Ingo Molnar <mingo@elte.hu>
|
2019-01-18 02:23:39 +08:00
|
|
|
* Paul E. McKenney <paulmck@linux.ibm.com>
|
2009-08-23 04:56:45 +08:00
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/cache.h>
|
|
|
|
#include <linux/spinlock.h>
|
2017-02-03 17:08:30 +08:00
|
|
|
#include <linux/rtmutex.h>
|
2009-08-23 04:56:45 +08:00
|
|
|
#include <linux/threads.h>
|
|
|
|
#include <linux/cpumask.h>
|
|
|
|
#include <linux/seqlock.h>
|
2016-02-19 16:46:41 +08:00
|
|
|
#include <linux/swait.h>
|
2017-03-15 03:42:30 +08:00
|
|
|
#include <linux/rcu_node_tree.h>
|
2012-04-24 06:52:53 +08:00
|
|
|
|
2017-05-02 16:31:18 +08:00
|
|
|
#include "rcu_segcblist.h"
|
|
|
|
|
2018-02-02 14:05:38 +08:00
|
|
|
/* Communicate arguments to a workqueue handler. */
|
|
|
|
struct rcu_exp_work {
|
|
|
|
unsigned long rew_s;
|
|
|
|
struct work_struct rew_work;
|
|
|
|
};
|
|
|
|
|
2011-03-30 08:48:28 +08:00
|
|
|
/* RCU's kthread states for tracing. */
|
|
|
|
#define RCU_KTHREAD_STOPPED 0
|
|
|
|
#define RCU_KTHREAD_RUNNING 1
|
|
|
|
#define RCU_KTHREAD_WAITING 2
|
2011-04-07 07:01:16 +08:00
|
|
|
#define RCU_KTHREAD_OFFCPU 3
|
|
|
|
#define RCU_KTHREAD_YIELDING 4
|
|
|
|
#define RCU_KTHREAD_MAX 4
|
2011-03-30 08:48:28 +08:00
|
|
|
|
2009-08-23 04:56:45 +08:00
|
|
|
/*
|
|
|
|
* Definition for node within the RCU grace-period-detection hierarchy.
|
|
|
|
*/
|
|
|
|
struct rcu_node {
|
2015-12-29 12:18:47 +08:00
|
|
|
raw_spinlock_t __private lock; /* Root rcu_node's lock protects */
|
|
|
|
/* some rcu_state fields as well as */
|
|
|
|
/* following. */
|
2020-05-15 04:34:34 +08:00
|
|
|
unsigned long gp_seq; /* Track rsp->gp_seq. */
|
2018-09-23 07:41:26 +08:00
|
|
|
unsigned long gp_seq_needed; /* Track furthest future GP request. */
|
2017-11-28 07:13:56 +08:00
|
|
|
unsigned long completedqs; /* All QSes done for this node. */
|
2009-08-23 04:56:45 +08:00
|
|
|
unsigned long qsmask; /* CPUs or groups that need to switch in */
|
|
|
|
/* order for current grace period to proceed.*/
|
2009-09-24 00:50:42 +08:00
|
|
|
/* In leaf rcu_node, each bit corresponds to */
|
|
|
|
/* an rcu_data structure, otherwise, each */
|
|
|
|
/* bit corresponds to a child rcu_node */
|
|
|
|
/* structure. */
|
2018-05-16 07:23:23 +08:00
|
|
|
unsigned long rcu_gp_init_mask; /* Mask of offline CPUs at GP init. */
|
2009-08-23 04:56:45 +08:00
|
|
|
unsigned long qsmaskinit;
|
2015-08-01 07:04:45 +08:00
|
|
|
/* Per-GP initial value for qsmask. */
|
rcu: Process offlining and onlining only at grace-period start
Races between CPU hotplug and grace periods can be difficult to resolve,
so the ->onoff_mutex is used to exclude the two events. Unfortunately,
this means that it is impossible for an outgoing CPU to perform the
last bits of its offlining from its last pass through the idle loop,
because sleeplocks cannot be acquired in that context.
This commit avoids these problems by buffering online and offline events
in a new ->qsmaskinitnext field in the leaf rcu_node structures. When a
grace period starts, the events accumulated in this mask are applied to
the ->qsmaskinit field, and, if needed, up the rcu_node tree. The special
case of all CPUs corresponding to a given leaf rcu_node structure being
offline while there are still elements in that structure's ->blkd_tasks
list is handled using a new ->wait_blkd_tasks field. In this case,
propagating the offline bits up the tree is deferred until the beginning
of the grace period after all of the tasks have exited their RCU read-side
critical sections and removed themselves from the list, at which point
the ->wait_blkd_tasks flag is cleared. If one of that leaf rcu_node
structure's CPUs comes back online before the list empties, then the
->wait_blkd_tasks flag is simply cleared.
This of course means that RCU's notion of which CPUs are offline can be
out of date. This is OK because RCU need only wait on CPUs that were
online at the time that the grace period started. In addition, RCU's
force-quiescent-state actions will handle the case where a CPU goes
offline after the grace period starts.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2015-01-24 13:52:37 +08:00
|
|
|
/* Initialized from ->qsmaskinitnext at the */
|
|
|
|
/* beginning of each grace period. */
|
|
|
|
unsigned long qsmaskinitnext;
|
rcu: Prevent lockdep-RCU splats on lock acquisition/release
The rcu_cpu_starting() and rcu_report_dead() functions transition the
current CPU between online and offline state from an RCU perspective.
Unfortunately, this means that the rcu_cpu_starting() function's lock
acquisition and the rcu_report_dead() function's lock releases happen
while the CPU is offline from an RCU perspective, which can result
in lockdep-RCU splats about using RCU from an offline CPU. And this
situation can also result in too-short grace periods, especially in
guest OSes that are subject to vCPU preemption.
This commit therefore uses sequence-count-like synchronization to forgive
use of RCU while RCU thinks a CPU is offline across the full extent of
the rcu_cpu_starting() and rcu_report_dead() function's lock acquisitions
and releases.
One approach would have been to use the actual sequence-count primitives
provided by the Linux kernel. Unfortunately, the resulting code looks
completely broken and wrong, and is likely to result in patches that
break RCU in an attempt to address this appearance of broken wrongness.
Plus there is no net savings in lines of code, given the additional
explicit memory barriers required.
Therefore, this sequence count is instead implemented by a new ->ofl_seq
field in the rcu_node structure. If this counter's value is an odd
number, RCU forgives RCU read-side critical sections on other CPUs covered
by the same rcu_node structure, even if those CPUs are offline from
an RCU perspective. In addition, if a given leaf rcu_node structure's
->ofl_seq counter value is an odd number, rcu_gp_init() delays starting
the grace period until that counter value changes.
[ paulmck: Apply Peter Zijlstra feedback. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
2020-10-14 03:39:23 +08:00
|
|
|
unsigned long ofl_seq; /* CPU-hotplug operation sequence count. */
|
rcu: Process offlining and onlining only at grace-period start
Races between CPU hotplug and grace periods can be difficult to resolve,
so the ->onoff_mutex is used to exclude the two events. Unfortunately,
this means that it is impossible for an outgoing CPU to perform the
last bits of its offlining from its last pass through the idle loop,
because sleeplocks cannot be acquired in that context.
This commit avoids these problems by buffering online and offline events
in a new ->qsmaskinitnext field in the leaf rcu_node structures. When a
grace period starts, the events accumulated in this mask are applied to
the ->qsmaskinit field, and, if needed, up the rcu_node tree. The special
case of all CPUs corresponding to a given leaf rcu_node structure being
offline while there are still elements in that structure's ->blkd_tasks
list is handled using a new ->wait_blkd_tasks field. In this case,
propagating the offline bits up the tree is deferred until the beginning
of the grace period after all of the tasks have exited their RCU read-side
critical sections and removed themselves from the list, at which point
the ->wait_blkd_tasks flag is cleared. If one of that leaf rcu_node
structure's CPUs comes back online before the list empties, then the
->wait_blkd_tasks flag is simply cleared.
This of course means that RCU's notion of which CPUs are offline can be
out of date. This is OK because RCU need only wait on CPUs that were
online at the time that the grace period started. In addition, RCU's
force-quiescent-state actions will handle the case where a CPU goes
offline after the grace period starts.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2015-01-24 13:52:37 +08:00
|
|
|
/* Online CPUs for next grace period. */
|
2015-08-01 07:04:45 +08:00
|
|
|
unsigned long expmask; /* CPUs or groups that need to check in */
|
|
|
|
/* to allow the current expedited GP */
|
|
|
|
/* to complete. */
|
|
|
|
unsigned long expmaskinit;
|
|
|
|
/* Per-GP initial values for expmask. */
|
|
|
|
/* Initialized from ->expmaskinitnext at the */
|
|
|
|
/* beginning of each expedited GP. */
|
|
|
|
unsigned long expmaskinitnext;
|
|
|
|
/* Online CPUs for next expedited GP. */
|
2015-09-30 00:45:00 +08:00
|
|
|
/* Any CPU that has ever been online will */
|
|
|
|
/* have its bit set. */
|
2019-10-31 02:56:10 +08:00
|
|
|
unsigned long cbovldmask;
|
|
|
|
/* CPUs experiencing callback overload. */
|
2017-08-18 08:05:59 +08:00
|
|
|
unsigned long ffmask; /* Fully functional CPUs. */
|
2009-08-23 04:56:45 +08:00
|
|
|
unsigned long grpmask; /* Mask to apply to parent qsmask. */
|
2009-09-24 00:50:42 +08:00
|
|
|
/* Only one bit will be set in this mask. */
|
2020-06-12 10:07:53 +08:00
|
|
|
int grplo; /* lowest-numbered CPU here. */
|
|
|
|
int grphi; /* highest-numbered CPU here. */
|
2020-06-12 10:07:54 +08:00
|
|
|
u8 grpnum; /* group number for next level up. */
|
2009-08-23 04:56:45 +08:00
|
|
|
u8 level; /* root is at level 0. */
|
rcu: Process offlining and onlining only at grace-period start
Races between CPU hotplug and grace periods can be difficult to resolve,
so the ->onoff_mutex is used to exclude the two events. Unfortunately,
this means that it is impossible for an outgoing CPU to perform the
last bits of its offlining from its last pass through the idle loop,
because sleeplocks cannot be acquired in that context.
This commit avoids these problems by buffering online and offline events
in a new ->qsmaskinitnext field in the leaf rcu_node structures. When a
grace period starts, the events accumulated in this mask are applied to
the ->qsmaskinit field, and, if needed, up the rcu_node tree. The special
case of all CPUs corresponding to a given leaf rcu_node structure being
offline while there are still elements in that structure's ->blkd_tasks
list is handled using a new ->wait_blkd_tasks field. In this case,
propagating the offline bits up the tree is deferred until the beginning
of the grace period after all of the tasks have exited their RCU read-side
critical sections and removed themselves from the list, at which point
the ->wait_blkd_tasks flag is cleared. If one of that leaf rcu_node
structure's CPUs comes back online before the list empties, then the
->wait_blkd_tasks flag is simply cleared.
This of course means that RCU's notion of which CPUs are offline can be
out of date. This is OK because RCU need only wait on CPUs that were
online at the time that the grace period started. In addition, RCU's
force-quiescent-state actions will handle the case where a CPU goes
offline after the grace period starts.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2015-01-24 13:52:37 +08:00
|
|
|
bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */
|
|
|
|
/* exit RCU read-side critical sections */
|
|
|
|
/* before propagating offline up the */
|
|
|
|
/* rcu_node tree? */
|
2009-08-23 04:56:45 +08:00
|
|
|
struct rcu_node *parent;
|
2010-11-30 13:56:39 +08:00
|
|
|
struct list_head blkd_tasks;
|
|
|
|
/* Tasks blocked in RCU read-side critical */
|
|
|
|
/* section. Tasks are placed at the head */
|
|
|
|
/* of this list and age towards the tail. */
|
|
|
|
struct list_head *gp_tasks;
|
|
|
|
/* Pointer to the first task blocking the */
|
|
|
|
/* current grace period, or NULL if there */
|
|
|
|
/* is no such task. */
|
|
|
|
struct list_head *exp_tasks;
|
|
|
|
/* Pointer to the first task blocking the */
|
|
|
|
/* current expedited grace period, or NULL */
|
|
|
|
/* if there is no such task. If there */
|
|
|
|
/* is no current expedited grace period, */
|
|
|
|
/* then there can cannot be any such task. */
|
2011-02-08 04:47:15 +08:00
|
|
|
struct list_head *boost_tasks;
|
|
|
|
/* Pointer to first task that needs to be */
|
|
|
|
/* priority boosted, or NULL if no priority */
|
|
|
|
/* boosting is needed for this rcu_node */
|
|
|
|
/* structure. If there are no tasks */
|
|
|
|
/* queued on this rcu_node structure that */
|
|
|
|
/* are blocking the current grace period, */
|
|
|
|
/* there can be no such task. */
|
2014-06-13 04:30:25 +08:00
|
|
|
struct rt_mutex boost_mtx;
|
|
|
|
/* Used only for the priority-boosting */
|
|
|
|
/* side effect, not as a lock. */
|
2011-02-08 04:47:15 +08:00
|
|
|
unsigned long boost_time;
|
|
|
|
/* When to start boosting (jiffies). */
|
|
|
|
struct task_struct *boost_kthread_task;
|
|
|
|
/* kthread that takes care of priority */
|
|
|
|
/* boosting for this rcu_node structure. */
|
2011-03-30 08:48:28 +08:00
|
|
|
unsigned int boost_kthread_status;
|
|
|
|
/* State of boost_kthread_task for tracing. */
|
2013-02-11 12:48:58 +08:00
|
|
|
#ifdef CONFIG_RCU_NOCB_CPU
|
2016-02-19 16:46:41 +08:00
|
|
|
struct swait_queue_head nocb_gp_wq[2];
|
2013-02-11 12:48:58 +08:00
|
|
|
/* Place for rcu_nocb_kthread() to wait GP. */
|
|
|
|
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
|
2012-06-27 08:00:35 +08:00
|
|
|
raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
|
2015-06-25 05:20:08 +08:00
|
|
|
|
2016-01-31 09:57:35 +08:00
|
|
|
spinlock_t exp_lock ____cacheline_internodealigned_in_smp;
|
|
|
|
unsigned long exp_seq_rq;
|
2016-03-17 07:47:55 +08:00
|
|
|
wait_queue_head_t exp_wq[4];
|
2018-02-02 14:05:38 +08:00
|
|
|
struct rcu_exp_work rew;
|
|
|
|
bool exp_need_flush; /* Need to flush workitem? */
|
2009-08-23 04:56:45 +08:00
|
|
|
} ____cacheline_internodealigned_in_smp;
|
|
|
|
|
rcu: Correctly handle sparse possible cpus
In many cases in the RCU tree code, we iterate over the set of cpus for
a leaf node described by rcu_node::grplo and rcu_node::grphi, checking
per-cpu data for each cpu in this range. However, if the set of possible
cpus is sparse, some cpus described in this range are not possible, and
thus no per-cpu region will have been allocated (or initialised) for
them by the generic percpu code.
Erroneous accesses to a per-cpu area for these !possible cpus may fault
or may hit other data depending on the addressed generated when the
erroneous per cpu offset is applied. In practice, both cases have been
observed on arm64 hardware (the former being silent, but detectable with
additional patches).
To avoid issues resulting from this, we must iterate over the set of
*possible* cpus for a given leaf node. This patch add a new helper,
for_each_leaf_node_possible_cpu, to enable this. As iteration is often
intertwined with rcu_node local bitmask manipulation, a new
leaf_node_cpu_bit helper is added to make this simpler and more
consistent. The RCU tree code is made to use both of these where
appropriate.
Without this patch, running reboot at a shell can result in an oops
like:
[ 3369.075979] Unable to handle kernel paging request at virtual address ffffff8008b21b4c
[ 3369.083881] pgd = ffffffc3ecdda000
[ 3369.087270] [ffffff8008b21b4c] *pgd=00000083eca48003, *pud=00000083eca48003, *pmd=0000000000000000
[ 3369.096222] Internal error: Oops: 96000007 [#1] PREEMPT SMP
[ 3369.101781] Modules linked in:
[ 3369.104825] CPU: 2 PID: 1817 Comm: NetworkManager Tainted: G W 4.6.0+ #3
[ 3369.121239] task: ffffffc0fa13e000 ti: ffffffc3eb940000 task.ti: ffffffc3eb940000
[ 3369.128708] PC is at sync_rcu_exp_select_cpus+0x188/0x510
[ 3369.134094] LR is at sync_rcu_exp_select_cpus+0x104/0x510
[ 3369.139479] pc : [<ffffff80081109a8>] lr : [<ffffff8008110924>] pstate: 200001c5
[ 3369.146860] sp : ffffffc3eb9435a0
[ 3369.150162] x29: ffffffc3eb9435a0 x28: ffffff8008be4f88
[ 3369.155465] x27: ffffff8008b66c80 x26: ffffffc3eceb2600
[ 3369.160767] x25: 0000000000000001 x24: ffffff8008be4f88
[ 3369.166070] x23: ffffff8008b51c3c x22: ffffff8008b66c80
[ 3369.171371] x21: 0000000000000001 x20: ffffff8008b21b40
[ 3369.176673] x19: ffffff8008b66c80 x18: 0000000000000000
[ 3369.181975] x17: 0000007fa951a010 x16: ffffff80086a30f0
[ 3369.187278] x15: 0000007fa9505590 x14: 0000000000000000
[ 3369.192580] x13: ffffff8008b51000 x12: ffffffc3eb940000
[ 3369.197882] x11: 0000000000000006 x10: ffffff8008b51b78
[ 3369.203184] x9 : 0000000000000001 x8 : ffffff8008be4000
[ 3369.208486] x7 : ffffff8008b21b40 x6 : 0000000000001003
[ 3369.213788] x5 : 0000000000000000 x4 : ffffff8008b27280
[ 3369.219090] x3 : ffffff8008b21b4c x2 : 0000000000000001
[ 3369.224406] x1 : 0000000000000001 x0 : 0000000000000140
...
[ 3369.972257] [<ffffff80081109a8>] sync_rcu_exp_select_cpus+0x188/0x510
[ 3369.978685] [<ffffff80081128b4>] synchronize_rcu_expedited+0x64/0xa8
[ 3369.985026] [<ffffff80086b987c>] synchronize_net+0x24/0x30
[ 3369.990499] [<ffffff80086ddb54>] dev_deactivate_many+0x28c/0x298
[ 3369.996493] [<ffffff80086b6bb8>] __dev_close_many+0x60/0xd0
[ 3370.002052] [<ffffff80086b6d48>] __dev_close+0x28/0x40
[ 3370.007178] [<ffffff80086bf62c>] __dev_change_flags+0x8c/0x158
[ 3370.012999] [<ffffff80086bf718>] dev_change_flags+0x20/0x60
[ 3370.018558] [<ffffff80086cf7f0>] do_setlink+0x288/0x918
[ 3370.023771] [<ffffff80086d0798>] rtnl_newlink+0x398/0x6a8
[ 3370.029158] [<ffffff80086cee84>] rtnetlink_rcv_msg+0xe4/0x220
[ 3370.034891] [<ffffff80086e274c>] netlink_rcv_skb+0xc4/0xf8
[ 3370.040364] [<ffffff80086ced8c>] rtnetlink_rcv+0x2c/0x40
[ 3370.045663] [<ffffff80086e1fe8>] netlink_unicast+0x160/0x238
[ 3370.051309] [<ffffff80086e24b8>] netlink_sendmsg+0x2f0/0x358
[ 3370.056956] [<ffffff80086a0070>] sock_sendmsg+0x18/0x30
[ 3370.062168] [<ffffff80086a21cc>] ___sys_sendmsg+0x26c/0x280
[ 3370.067728] [<ffffff80086a30ac>] __sys_sendmsg+0x44/0x88
[ 3370.073027] [<ffffff80086a3100>] SyS_sendmsg+0x10/0x20
[ 3370.078153] [<ffffff8008085e70>] el0_svc_naked+0x24/0x28
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reported-by: Dennis Chen <dennis.chen@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Steve Capper <steve.capper@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2016-06-03 22:20:04 +08:00
|
|
|
/*
|
|
|
|
* Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and
|
|
|
|
* are indexed relative to this interval rather than the global CPU ID space.
|
|
|
|
* This generates the bit for a CPU in node-local masks.
|
|
|
|
*/
|
2018-08-01 00:49:20 +08:00
|
|
|
#define leaf_node_cpu_bit(rnp, cpu) (BIT((cpu) - (rnp)->grplo))
|
rcu: Correctly handle sparse possible cpus
In many cases in the RCU tree code, we iterate over the set of cpus for
a leaf node described by rcu_node::grplo and rcu_node::grphi, checking
per-cpu data for each cpu in this range. However, if the set of possible
cpus is sparse, some cpus described in this range are not possible, and
thus no per-cpu region will have been allocated (or initialised) for
them by the generic percpu code.
Erroneous accesses to a per-cpu area for these !possible cpus may fault
or may hit other data depending on the addressed generated when the
erroneous per cpu offset is applied. In practice, both cases have been
observed on arm64 hardware (the former being silent, but detectable with
additional patches).
To avoid issues resulting from this, we must iterate over the set of
*possible* cpus for a given leaf node. This patch add a new helper,
for_each_leaf_node_possible_cpu, to enable this. As iteration is often
intertwined with rcu_node local bitmask manipulation, a new
leaf_node_cpu_bit helper is added to make this simpler and more
consistent. The RCU tree code is made to use both of these where
appropriate.
Without this patch, running reboot at a shell can result in an oops
like:
[ 3369.075979] Unable to handle kernel paging request at virtual address ffffff8008b21b4c
[ 3369.083881] pgd = ffffffc3ecdda000
[ 3369.087270] [ffffff8008b21b4c] *pgd=00000083eca48003, *pud=00000083eca48003, *pmd=0000000000000000
[ 3369.096222] Internal error: Oops: 96000007 [#1] PREEMPT SMP
[ 3369.101781] Modules linked in:
[ 3369.104825] CPU: 2 PID: 1817 Comm: NetworkManager Tainted: G W 4.6.0+ #3
[ 3369.121239] task: ffffffc0fa13e000 ti: ffffffc3eb940000 task.ti: ffffffc3eb940000
[ 3369.128708] PC is at sync_rcu_exp_select_cpus+0x188/0x510
[ 3369.134094] LR is at sync_rcu_exp_select_cpus+0x104/0x510
[ 3369.139479] pc : [<ffffff80081109a8>] lr : [<ffffff8008110924>] pstate: 200001c5
[ 3369.146860] sp : ffffffc3eb9435a0
[ 3369.150162] x29: ffffffc3eb9435a0 x28: ffffff8008be4f88
[ 3369.155465] x27: ffffff8008b66c80 x26: ffffffc3eceb2600
[ 3369.160767] x25: 0000000000000001 x24: ffffff8008be4f88
[ 3369.166070] x23: ffffff8008b51c3c x22: ffffff8008b66c80
[ 3369.171371] x21: 0000000000000001 x20: ffffff8008b21b40
[ 3369.176673] x19: ffffff8008b66c80 x18: 0000000000000000
[ 3369.181975] x17: 0000007fa951a010 x16: ffffff80086a30f0
[ 3369.187278] x15: 0000007fa9505590 x14: 0000000000000000
[ 3369.192580] x13: ffffff8008b51000 x12: ffffffc3eb940000
[ 3369.197882] x11: 0000000000000006 x10: ffffff8008b51b78
[ 3369.203184] x9 : 0000000000000001 x8 : ffffff8008be4000
[ 3369.208486] x7 : ffffff8008b21b40 x6 : 0000000000001003
[ 3369.213788] x5 : 0000000000000000 x4 : ffffff8008b27280
[ 3369.219090] x3 : ffffff8008b21b4c x2 : 0000000000000001
[ 3369.224406] x1 : 0000000000000001 x0 : 0000000000000140
...
[ 3369.972257] [<ffffff80081109a8>] sync_rcu_exp_select_cpus+0x188/0x510
[ 3369.978685] [<ffffff80081128b4>] synchronize_rcu_expedited+0x64/0xa8
[ 3369.985026] [<ffffff80086b987c>] synchronize_net+0x24/0x30
[ 3369.990499] [<ffffff80086ddb54>] dev_deactivate_many+0x28c/0x298
[ 3369.996493] [<ffffff80086b6bb8>] __dev_close_many+0x60/0xd0
[ 3370.002052] [<ffffff80086b6d48>] __dev_close+0x28/0x40
[ 3370.007178] [<ffffff80086bf62c>] __dev_change_flags+0x8c/0x158
[ 3370.012999] [<ffffff80086bf718>] dev_change_flags+0x20/0x60
[ 3370.018558] [<ffffff80086cf7f0>] do_setlink+0x288/0x918
[ 3370.023771] [<ffffff80086d0798>] rtnl_newlink+0x398/0x6a8
[ 3370.029158] [<ffffff80086cee84>] rtnetlink_rcv_msg+0xe4/0x220
[ 3370.034891] [<ffffff80086e274c>] netlink_rcv_skb+0xc4/0xf8
[ 3370.040364] [<ffffff80086ced8c>] rtnetlink_rcv+0x2c/0x40
[ 3370.045663] [<ffffff80086e1fe8>] netlink_unicast+0x160/0x238
[ 3370.051309] [<ffffff80086e24b8>] netlink_sendmsg+0x2f0/0x358
[ 3370.056956] [<ffffff80086a0070>] sock_sendmsg+0x18/0x30
[ 3370.062168] [<ffffff80086a21cc>] ___sys_sendmsg+0x26c/0x280
[ 3370.067728] [<ffffff80086a30ac>] __sys_sendmsg+0x44/0x88
[ 3370.073027] [<ffffff80086a3100>] SyS_sendmsg+0x10/0x20
[ 3370.078153] [<ffffff8008085e70>] el0_svc_naked+0x24/0x28
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reported-by: Dennis Chen <dennis.chen@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Steve Capper <steve.capper@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2016-06-03 22:20:04 +08:00
|
|
|
|
2015-08-07 06:16:57 +08:00
|
|
|
/*
|
|
|
|
* Union to allow "aggregate OR" operation on the need for a quiescent
|
|
|
|
* state by the normal and expedited grace periods.
|
|
|
|
*/
|
|
|
|
union rcu_noqs {
|
|
|
|
struct {
|
|
|
|
u8 norm;
|
|
|
|
u8 exp;
|
|
|
|
} b; /* Bits. */
|
|
|
|
u16 s; /* Set of bits, aggregate OR here. */
|
|
|
|
};
|
|
|
|
|
2009-08-23 04:56:45 +08:00
|
|
|
/* Per-CPU data for read-copy update. */
|
|
|
|
struct rcu_data {
|
|
|
|
/* 1) quiescent-state and grace-period handling : */
|
2020-05-15 04:34:34 +08:00
|
|
|
unsigned long gp_seq; /* Track rsp->gp_seq counter. */
|
2018-09-23 07:41:26 +08:00
|
|
|
unsigned long gp_seq_needed; /* Track furthest future GP request. */
|
2015-08-07 06:16:57 +08:00
|
|
|
union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */
|
2015-08-07 02:31:51 +08:00
|
|
|
bool core_needs_qs; /* Core waits for quiesc state. */
|
2009-08-23 04:56:45 +08:00
|
|
|
bool beenonline; /* CPU online at least once. */
|
2018-05-02 05:34:08 +08:00
|
|
|
bool gpwrap; /* Possible ->gp_seq wrap. */
|
2019-03-28 06:51:25 +08:00
|
|
|
bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */
|
2020-07-25 11:22:05 +08:00
|
|
|
bool cpu_started; /* RCU watching this onlining CPU. */
|
2009-08-23 04:56:45 +08:00
|
|
|
struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
|
|
|
|
unsigned long grpmask; /* Mask to apply to leaf qsmask. */
|
2012-01-17 05:29:10 +08:00
|
|
|
unsigned long ticks_this_gp; /* The number of scheduling-clock */
|
|
|
|
/* ticks this CPU has handled */
|
|
|
|
/* during and after the last grace */
|
|
|
|
/* period it is aware of. */
|
2019-04-05 03:19:25 +08:00
|
|
|
struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */
|
|
|
|
bool defer_qs_iw_pending; /* Scheduler attention pending? */
|
2020-08-08 22:56:31 +08:00
|
|
|
struct work_struct strict_work; /* Schedule readers for strict GPs. */
|
2009-08-23 04:56:45 +08:00
|
|
|
|
|
|
|
/* 2) batch handling */
|
srcu: Abstract multi-tail callback list handling
RCU has only one multi-tail callback list, which is implemented via
the nxtlist, nxttail, nxtcompleted, qlen_lazy, and qlen fields in the
rcu_data structure, and whose operations are open-code throughout the
Tree RCU implementation. This has been more or less OK in the past,
but upcoming callback-list optimizations in SRCU could really use
a multi-tail callback list there as well.
This commit therefore abstracts the multi-tail callback list handling
into a new kernel/rcu/rcu_segcblist.h file, and uses this new API.
The simple head-and-tail pointer callback list is also abstracted and
applied everywhere except for the NOCB callback-offload lists. (Yes,
the plan is to apply them there as well, but this commit is already
bigger than would be good.)
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2017-02-09 04:36:42 +08:00
|
|
|
struct rcu_segcblist cblist; /* Segmented callback list, with */
|
|
|
|
/* different callbacks waiting for */
|
|
|
|
/* different grace periods. */
|
2009-10-15 01:15:55 +08:00
|
|
|
long qlen_last_fqs_check;
|
|
|
|
/* qlen at last check for QS forcing */
|
2020-05-02 07:49:48 +08:00
|
|
|
unsigned long n_cbs_invoked; /* # callbacks invoked since boot. */
|
2009-10-15 01:15:55 +08:00
|
|
|
unsigned long n_force_qs_snap;
|
|
|
|
/* did other CPU force QS recently? */
|
2009-08-23 04:56:45 +08:00
|
|
|
long blimit; /* Upper limit on a processed batch */
|
|
|
|
|
|
|
|
/* 3) dynticks interface. */
|
|
|
|
int dynticks_snap; /* Per-GP tracking for dynticks. */
|
2018-08-04 12:00:38 +08:00
|
|
|
long dynticks_nesting; /* Track process nesting level. */
|
|
|
|
long dynticks_nmi_nesting; /* Track irq/NMI nesting level. */
|
|
|
|
atomic_t dynticks; /* Even value for idle, else odd. */
|
|
|
|
bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */
|
|
|
|
bool rcu_urgent_qs; /* GP old need light quiescent state. */
|
2019-08-13 07:14:00 +08:00
|
|
|
bool rcu_forced_tick; /* Forced tick to provide QS. */
|
2019-11-28 08:36:45 +08:00
|
|
|
bool rcu_forced_tick_exp; /* ... provide QS to expedited GP. */
|
2018-08-04 10:31:39 +08:00
|
|
|
#ifdef CONFIG_RCU_FAST_NO_HZ
|
2018-08-04 12:00:38 +08:00
|
|
|
unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */
|
|
|
|
unsigned long last_advance_all; /* Last jiffy CBs were all advanced. */
|
|
|
|
int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
|
2018-08-04 10:31:39 +08:00
|
|
|
#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
|
2009-08-23 04:56:45 +08:00
|
|
|
|
2018-08-05 11:32:07 +08:00
|
|
|
/* 4) rcu_barrier(), OOM callbacks, and expediting. */
|
2012-05-29 14:57:46 +08:00
|
|
|
struct rcu_head barrier_head;
|
2016-10-11 21:09:59 +08:00
|
|
|
int exp_dynticks_snap; /* Double-check need for IPI. */
|
2012-05-29 14:57:46 +08:00
|
|
|
|
2018-08-05 11:32:07 +08:00
|
|
|
/* 5) Callback offloading. */
|
2012-08-20 12:35:53 +08:00
|
|
|
#ifdef CONFIG_RCU_NOCB_CPU
|
rcu/nocb: Provide separate no-CBs grace-period kthreads
Currently, there is one no-CBs rcuo kthread per CPU, and these kthreads
are divided into groups. The first rcuo kthread to come online in a
given group is that group's leader, and the leader both waits for grace
periods and invokes its CPU's callbacks. The non-leader rcuo kthreads
only invoke callbacks.
This works well in the real-time/embedded environments for which it was
intended because such environments tend not to generate all that many
callbacks. However, given huge floods of callbacks, it is possible for
the leader kthread to be stuck invoking callbacks while its followers
wait helplessly while their callbacks pile up. This is a good recipe
for an OOM, and rcutorture's new callback-flood capability does generate
such OOMs.
One strategy would be to wait until such OOMs start happening in
production, but similar OOMs have in fact happened starting in 2018.
It would therefore be wise to take a more proactive approach.
This commit therefore features per-CPU rcuo kthreads that do nothing
but invoke callbacks. Instead of having one of these kthreads act as
leader, each group has a separate rcog kthread that handles grace periods
for its group. Because these rcuog kthreads do not invoke callbacks,
callback floods on one CPU no longer block callbacks from reaching the
rcuc callback-invocation kthreads on other CPUs.
This change does introduce additional kthreads, however:
1. The number of additional kthreads is about the square root of
the number of CPUs, so that a 4096-CPU system would have only
about 64 additional kthreads. Note that recent changes
decreased the number of rcuo kthreads by a factor of two
(CONFIG_PREEMPT=n) or even three (CONFIG_PREEMPT=y), so
this still represents a significant improvement on most systems.
2. The leading "rcuo" of the rcuog kthreads should allow existing
scripting to affinity these additional kthreads as needed, the
same as for the rcuop and rcuos kthreads. (There are no longer
any rcuob kthreads.)
3. A state-machine approach was considered and rejected. Although
this would allow the rcuo kthreads to continue their dual
leader/follower roles, it complicates callback invocation
and makes it more difficult to consolidate rcuo callback
invocation with existing softirq callback invocation.
The introduction of rcuog kthreads should thus be acceptable.
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
2019-03-30 07:43:51 +08:00
|
|
|
struct swait_queue_head nocb_cb_wq; /* For nocb kthreads to sleep on. */
|
|
|
|
struct task_struct *nocb_gp_kthread;
|
2017-04-30 11:03:20 +08:00
|
|
|
raw_spinlock_t nocb_lock; /* Guard following pair of fields. */
|
2019-05-28 22:18:08 +08:00
|
|
|
atomic_t nocb_lock_contended; /* Contention experienced. */
|
2014-07-30 05:50:47 +08:00
|
|
|
int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
|
2017-04-30 11:03:20 +08:00
|
|
|
struct timer_list nocb_timer; /* Enforce finite deferral. */
|
rcu/nocb: Add bypass callback queueing
Use of the rcu_data structure's segmented ->cblist for no-CBs CPUs
takes advantage of unrelated grace periods, thus reducing the memory
footprint in the face of floods of call_rcu() invocations. However,
the ->cblist field is a more-complex rcu_segcblist structure which must
be protected via locking. Even though there are only three entities
which can acquire this lock (the CPU invoking call_rcu(), the no-CBs
grace-period kthread, and the no-CBs callbacks kthread), the contention
on this lock is excessive under heavy stress.
This commit therefore greatly reduces contention by provisioning
an rcu_cblist structure field named ->nocb_bypass within the
rcu_data structure. Each no-CBs CPU is permitted only a limited
number of enqueues onto the ->cblist per jiffy, controlled by a new
nocb_nobypass_lim_per_jiffy kernel boot parameter that defaults to
about 16 enqueues per millisecond (16 * 1000 / HZ). When that limit is
exceeded, the CPU instead enqueues onto the new ->nocb_bypass.
The ->nocb_bypass is flushed into the ->cblist every jiffy or when
the number of callbacks on ->nocb_bypass exceeds qhimark, whichever
happens first. During call_rcu() floods, this flushing is carried out
by the CPU during the course of its call_rcu() invocations. However,
a CPU could simply stop invoking call_rcu() at any time. The no-CBs
grace-period kthread therefore carries out less-aggressive flushing
(every few jiffies or when the number of callbacks on ->nocb_bypass
exceeds (2 * qhimark), whichever comes first). This means that the
no-CBs grace-period kthread cannot be permitted to do unbounded waits
while there are callbacks on ->nocb_bypass. A ->nocb_bypass_timer is
used to provide the needed wakeups.
[ paulmck: Apply Coverity feedback reported by Colin Ian King. ]
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
2019-07-03 07:03:33 +08:00
|
|
|
unsigned long nocb_gp_adv_time; /* Last call_rcu() CB adv (jiffies). */
|
|
|
|
|
|
|
|
/* The following fields are used by call_rcu, hence own cacheline. */
|
|
|
|
raw_spinlock_t nocb_bypass_lock ____cacheline_internodealigned_in_smp;
|
|
|
|
struct rcu_cblist nocb_bypass; /* Lock-contention-bypass CB list. */
|
|
|
|
unsigned long nocb_bypass_first; /* Time (jiffies) of first enqueue. */
|
|
|
|
unsigned long nocb_nobypass_last; /* Last ->cblist enqueue (jiffies). */
|
|
|
|
int nocb_nobypass_count; /* # ->cblist enqueues at ^^^ time. */
|
rcu: Parallelize and economize NOCB kthread wakeups
An 80-CPU system with a context-switch-heavy workload can require so
many NOCB kthread wakeups that the RCU grace-period kthreads spend several
tens of percent of a CPU just awakening things. This clearly will not
scale well: If you add enough CPUs, the RCU grace-period kthreads would
get behind, increasing grace-period latency.
To avoid this problem, this commit divides the NOCB kthreads into leaders
and followers, where the grace-period kthreads awaken the leaders each of
whom in turn awakens its followers. By default, the number of groups of
kthreads is the square root of the number of CPUs, but this default may
be overridden using the rcutree.rcu_nocb_leader_stride boot parameter.
This reduces the number of wakeups done per grace period by the RCU
grace-period kthread by the square root of the number of CPUs, but of
course by shifting those wakeups to the leaders. In addition, because
the leaders do grace periods on behalf of their respective followers,
the number of wakeups of the followers decreases by up to a factor of two.
Instead of being awakened once when new callbacks arrive and again
at the end of the grace period, the followers are awakened only at
the end of the grace period.
For a numerical example, in a 4096-CPU system, the grace-period kthread
would awaken 64 leaders, each of which would awaken its 63 followers
at the end of the grace period. This compares favorably with the 79
wakeups for the grace-period kthread on an 80-CPU system.
Reported-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2014-06-25 00:26:11 +08:00
|
|
|
|
2019-03-29 06:44:18 +08:00
|
|
|
/* The following fields are used by GP kthread, hence own cacheline. */
|
2019-06-03 04:41:08 +08:00
|
|
|
raw_spinlock_t nocb_gp_lock ____cacheline_internodealigned_in_smp;
|
rcu/nocb: Add bypass callback queueing
Use of the rcu_data structure's segmented ->cblist for no-CBs CPUs
takes advantage of unrelated grace periods, thus reducing the memory
footprint in the face of floods of call_rcu() invocations. However,
the ->cblist field is a more-complex rcu_segcblist structure which must
be protected via locking. Even though there are only three entities
which can acquire this lock (the CPU invoking call_rcu(), the no-CBs
grace-period kthread, and the no-CBs callbacks kthread), the contention
on this lock is excessive under heavy stress.
This commit therefore greatly reduces contention by provisioning
an rcu_cblist structure field named ->nocb_bypass within the
rcu_data structure. Each no-CBs CPU is permitted only a limited
number of enqueues onto the ->cblist per jiffy, controlled by a new
nocb_nobypass_lim_per_jiffy kernel boot parameter that defaults to
about 16 enqueues per millisecond (16 * 1000 / HZ). When that limit is
exceeded, the CPU instead enqueues onto the new ->nocb_bypass.
The ->nocb_bypass is flushed into the ->cblist every jiffy or when
the number of callbacks on ->nocb_bypass exceeds qhimark, whichever
happens first. During call_rcu() floods, this flushing is carried out
by the CPU during the course of its call_rcu() invocations. However,
a CPU could simply stop invoking call_rcu() at any time. The no-CBs
grace-period kthread therefore carries out less-aggressive flushing
(every few jiffies or when the number of callbacks on ->nocb_bypass
exceeds (2 * qhimark), whichever comes first). This means that the
no-CBs grace-period kthread cannot be permitted to do unbounded waits
while there are callbacks on ->nocb_bypass. A ->nocb_bypass_timer is
used to provide the needed wakeups.
[ paulmck: Apply Coverity feedback reported by Colin Ian King. ]
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
2019-07-03 07:03:33 +08:00
|
|
|
struct timer_list nocb_bypass_timer; /* Force nocb_bypass flush. */
|
2019-06-26 04:32:51 +08:00
|
|
|
u8 nocb_gp_sleep; /* Is the nocb GP thread asleep? */
|
|
|
|
u8 nocb_gp_bypass; /* Found a bypass on last scan? */
|
|
|
|
u8 nocb_gp_gp; /* GP to wait for on last scan? */
|
|
|
|
unsigned long nocb_gp_seq; /* If so, ->gp_seq to wait for. */
|
|
|
|
unsigned long nocb_gp_loops; /* # passes through wait code. */
|
rcu/nocb: Provide separate no-CBs grace-period kthreads
Currently, there is one no-CBs rcuo kthread per CPU, and these kthreads
are divided into groups. The first rcuo kthread to come online in a
given group is that group's leader, and the leader both waits for grace
periods and invokes its CPU's callbacks. The non-leader rcuo kthreads
only invoke callbacks.
This works well in the real-time/embedded environments for which it was
intended because such environments tend not to generate all that many
callbacks. However, given huge floods of callbacks, it is possible for
the leader kthread to be stuck invoking callbacks while its followers
wait helplessly while their callbacks pile up. This is a good recipe
for an OOM, and rcutorture's new callback-flood capability does generate
such OOMs.
One strategy would be to wait until such OOMs start happening in
production, but similar OOMs have in fact happened starting in 2018.
It would therefore be wise to take a more proactive approach.
This commit therefore features per-CPU rcuo kthreads that do nothing
but invoke callbacks. Instead of having one of these kthreads act as
leader, each group has a separate rcog kthread that handles grace periods
for its group. Because these rcuog kthreads do not invoke callbacks,
callback floods on one CPU no longer block callbacks from reaching the
rcuc callback-invocation kthreads on other CPUs.
This change does introduce additional kthreads, however:
1. The number of additional kthreads is about the square root of
the number of CPUs, so that a 4096-CPU system would have only
about 64 additional kthreads. Note that recent changes
decreased the number of rcuo kthreads by a factor of two
(CONFIG_PREEMPT=n) or even three (CONFIG_PREEMPT=y), so
this still represents a significant improvement on most systems.
2. The leading "rcuo" of the rcuog kthreads should allow existing
scripting to affinity these additional kthreads as needed, the
same as for the rcuop and rcuos kthreads. (There are no longer
any rcuob kthreads.)
3. A state-machine approach was considered and rejected. Although
this would allow the rcuo kthreads to continue their dual
leader/follower roles, it complicates callback invocation
and makes it more difficult to consolidate rcuo callback
invocation with existing softirq callback invocation.
The introduction of rcuog kthreads should thus be acceptable.
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
2019-03-30 07:43:51 +08:00
|
|
|
struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */
|
2019-05-16 00:56:40 +08:00
|
|
|
bool nocb_cb_sleep; /* Is the nocb CB thread asleep? */
|
rcu/nocb: Provide separate no-CBs grace-period kthreads
Currently, there is one no-CBs rcuo kthread per CPU, and these kthreads
are divided into groups. The first rcuo kthread to come online in a
given group is that group's leader, and the leader both waits for grace
periods and invokes its CPU's callbacks. The non-leader rcuo kthreads
only invoke callbacks.
This works well in the real-time/embedded environments for which it was
intended because such environments tend not to generate all that many
callbacks. However, given huge floods of callbacks, it is possible for
the leader kthread to be stuck invoking callbacks while its followers
wait helplessly while their callbacks pile up. This is a good recipe
for an OOM, and rcutorture's new callback-flood capability does generate
such OOMs.
One strategy would be to wait until such OOMs start happening in
production, but similar OOMs have in fact happened starting in 2018.
It would therefore be wise to take a more proactive approach.
This commit therefore features per-CPU rcuo kthreads that do nothing
but invoke callbacks. Instead of having one of these kthreads act as
leader, each group has a separate rcog kthread that handles grace periods
for its group. Because these rcuog kthreads do not invoke callbacks,
callback floods on one CPU no longer block callbacks from reaching the
rcuc callback-invocation kthreads on other CPUs.
This change does introduce additional kthreads, however:
1. The number of additional kthreads is about the square root of
the number of CPUs, so that a 4096-CPU system would have only
about 64 additional kthreads. Note that recent changes
decreased the number of rcuo kthreads by a factor of two
(CONFIG_PREEMPT=n) or even three (CONFIG_PREEMPT=y), so
this still represents a significant improvement on most systems.
2. The leading "rcuo" of the rcuog kthreads should allow existing
scripting to affinity these additional kthreads as needed, the
same as for the rcuop and rcuos kthreads. (There are no longer
any rcuob kthreads.)
3. A state-machine approach was considered and rejected. Although
this would allow the rcuo kthreads to continue their dual
leader/follower roles, it complicates callback invocation
and makes it more difficult to consolidate rcuo callback
invocation with existing softirq callback invocation.
The introduction of rcuog kthreads should thus be acceptable.
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
2019-03-30 07:43:51 +08:00
|
|
|
struct task_struct *nocb_cb_kthread;
|
2019-03-29 06:33:59 +08:00
|
|
|
struct rcu_data *nocb_next_cb_rdp;
|
|
|
|
/* Next rcu_data in wakeup chain. */
|
rcu: Parallelize and economize NOCB kthread wakeups
An 80-CPU system with a context-switch-heavy workload can require so
many NOCB kthread wakeups that the RCU grace-period kthreads spend several
tens of percent of a CPU just awakening things. This clearly will not
scale well: If you add enough CPUs, the RCU grace-period kthreads would
get behind, increasing grace-period latency.
To avoid this problem, this commit divides the NOCB kthreads into leaders
and followers, where the grace-period kthreads awaken the leaders each of
whom in turn awakens its followers. By default, the number of groups of
kthreads is the square root of the number of CPUs, but this default may
be overridden using the rcutree.rcu_nocb_leader_stride boot parameter.
This reduces the number of wakeups done per grace period by the RCU
grace-period kthread by the square root of the number of CPUs, but of
course by shifting those wakeups to the leaders. In addition, because
the leaders do grace periods on behalf of their respective followers,
the number of wakeups of the followers decreases by up to a factor of two.
Instead of being awakened once when new callbacks arrive and again
at the end of the grace period, the followers are awakened only at
the end of the grace period.
For a numerical example, in a 4096-CPU system, the grace-period kthread
would awaken 64 leaders, each of which would awaken its 63 followers
at the end of the grace period. This compares favorably with the 79
wakeups for the grace-period kthread on an 80-CPU system.
Reported-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2014-06-25 00:26:11 +08:00
|
|
|
|
rcu/nocb: Add bypass callback queueing
Use of the rcu_data structure's segmented ->cblist for no-CBs CPUs
takes advantage of unrelated grace periods, thus reducing the memory
footprint in the face of floods of call_rcu() invocations. However,
the ->cblist field is a more-complex rcu_segcblist structure which must
be protected via locking. Even though there are only three entities
which can acquire this lock (the CPU invoking call_rcu(), the no-CBs
grace-period kthread, and the no-CBs callbacks kthread), the contention
on this lock is excessive under heavy stress.
This commit therefore greatly reduces contention by provisioning
an rcu_cblist structure field named ->nocb_bypass within the
rcu_data structure. Each no-CBs CPU is permitted only a limited
number of enqueues onto the ->cblist per jiffy, controlled by a new
nocb_nobypass_lim_per_jiffy kernel boot parameter that defaults to
about 16 enqueues per millisecond (16 * 1000 / HZ). When that limit is
exceeded, the CPU instead enqueues onto the new ->nocb_bypass.
The ->nocb_bypass is flushed into the ->cblist every jiffy or when
the number of callbacks on ->nocb_bypass exceeds qhimark, whichever
happens first. During call_rcu() floods, this flushing is carried out
by the CPU during the course of its call_rcu() invocations. However,
a CPU could simply stop invoking call_rcu() at any time. The no-CBs
grace-period kthread therefore carries out less-aggressive flushing
(every few jiffies or when the number of callbacks on ->nocb_bypass
exceeds (2 * qhimark), whichever comes first). This means that the
no-CBs grace-period kthread cannot be permitted to do unbounded waits
while there are callbacks on ->nocb_bypass. A ->nocb_bypass_timer is
used to provide the needed wakeups.
[ paulmck: Apply Coverity feedback reported by Colin Ian King. ]
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
2019-07-03 07:03:33 +08:00
|
|
|
/* The following fields are used by CB kthread, hence new cacheline. */
|
2019-03-29 06:33:59 +08:00
|
|
|
struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp;
|
2019-03-29 06:44:18 +08:00
|
|
|
/* GP rdp takes GP-end wakeups. */
|
2012-08-20 12:35:53 +08:00
|
|
|
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
|
|
|
|
|
2018-12-01 08:11:14 +08:00
|
|
|
/* 6) RCU priority boosting. */
|
|
|
|
struct task_struct *rcu_cpu_kthread_task;
|
|
|
|
/* rcuc per-CPU kthread or NULL. */
|
2018-12-01 08:43:05 +08:00
|
|
|
unsigned int rcu_cpu_kthread_status;
|
2018-12-01 10:21:32 +08:00
|
|
|
char rcu_cpu_has_work;
|
2018-12-01 08:11:14 +08:00
|
|
|
|
|
|
|
/* 7) Diagnostic data, including RCU CPU stall warnings. */
|
2013-03-07 05:37:09 +08:00
|
|
|
unsigned int softirq_snap; /* Snapshot of softirq activity. */
|
2017-08-18 08:05:59 +08:00
|
|
|
/* ->rcu_iw* fields protected by leaf rcu_node ->lock. */
|
|
|
|
struct irq_work rcu_iw; /* Check for non-irq activity. */
|
|
|
|
bool rcu_iw_pending; /* Is ->rcu_iw pending? */
|
2018-04-29 05:15:40 +08:00
|
|
|
unsigned long rcu_iw_gp_seq; /* ->gp_seq associated with ->rcu_iw. */
|
2018-05-09 05:18:57 +08:00
|
|
|
unsigned long rcu_ofl_gp_seq; /* ->gp_seq at last offline. */
|
|
|
|
short rcu_ofl_gp_flags; /* ->gp_flags at last offline. */
|
|
|
|
unsigned long rcu_onl_gp_seq; /* ->gp_seq at last online. */
|
|
|
|
short rcu_onl_gp_flags; /* ->gp_flags at last online. */
|
2018-07-26 02:49:47 +08:00
|
|
|
unsigned long last_fqs_resched; /* Time of last rcu_resched(). */
|
2013-03-07 05:37:09 +08:00
|
|
|
|
2009-08-23 04:56:45 +08:00
|
|
|
int cpu;
|
|
|
|
};
|
|
|
|
|
2014-07-30 05:50:47 +08:00
|
|
|
/* Values for nocb_defer_wakeup field in struct rcu_data. */
|
2017-04-29 08:04:09 +08:00
|
|
|
#define RCU_NOCB_WAKE_NOT 0
|
|
|
|
#define RCU_NOCB_WAKE 1
|
|
|
|
#define RCU_NOCB_WAKE_FORCE 2
|
2014-07-30 05:50:47 +08:00
|
|
|
|
2013-04-04 13:14:11 +08:00
|
|
|
#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
|
|
|
|
/* For jiffies_till_first_fqs and */
|
|
|
|
/* and jiffies_till_next_fqs. */
|
2010-03-06 07:03:26 +08:00
|
|
|
|
2013-04-04 13:14:11 +08:00
|
|
|
#define RCU_JIFFIES_FQS_DIV 256 /* Very large systems need more */
|
|
|
|
/* delay between bouts of */
|
|
|
|
/* quiescent-state forcing. */
|
|
|
|
|
|
|
|
#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time to take */
|
|
|
|
/* at least one scheduling clock */
|
|
|
|
/* irq before ratting on them. */
|
2009-08-23 04:56:45 +08:00
|
|
|
|
2011-05-21 07:06:29 +08:00
|
|
|
#define rcu_wait(cond) \
|
|
|
|
do { \
|
|
|
|
for (;;) { \
|
|
|
|
set_current_state(TASK_INTERRUPTIBLE); \
|
|
|
|
if (cond) \
|
|
|
|
break; \
|
|
|
|
schedule(); \
|
|
|
|
} \
|
|
|
|
__set_current_state(TASK_RUNNING); \
|
|
|
|
} while (0)
|
2009-08-23 04:56:45 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* RCU global state, including node hierarchy. This hierarchy is
|
|
|
|
* represented in "heap" form in a dense array. The root (first level)
|
|
|
|
* of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
|
|
|
|
* level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
|
|
|
|
* and the third level in ->node[m+1] and following (->node[m+1] referenced
|
|
|
|
* by ->level[2]). The number of levels is determined by the number of
|
|
|
|
* CPUs and by CONFIG_RCU_FANOUT. Small systems will have a "hierarchy"
|
|
|
|
* consisting of a single rcu_node.
|
|
|
|
*/
|
|
|
|
struct rcu_state {
|
|
|
|
struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
|
2015-07-09 21:34:23 +08:00
|
|
|
struct rcu_node *level[RCU_NUM_LVLS + 1];
|
|
|
|
/* Hierarchy levels (+1 to */
|
|
|
|
/* shut bogus gcc warning) */
|
2015-08-01 07:04:45 +08:00
|
|
|
int ncpus; /* # CPUs seen so far. */
|
rcu: Fix single-CPU check in rcu_blocking_is_gp()
Currently, for CONFIG_PREEMPTION=n kernels, rcu_blocking_is_gp() uses
num_online_cpus() to determine whether there is only one CPU online. When
there is only a single CPU online, the simple fact that synchronize_rcu()
could be legally called implies that a full grace period has elapsed.
Therefore, in the single-CPU case, synchronize_rcu() simply returns
immediately. Unfortunately, num_online_cpus() is unreliable while a
CPU-hotplug operation is transitioning to or from single-CPU operation
because:
1. num_online_cpus() uses atomic_read(&__num_online_cpus) to
locklessly sample the number of online CPUs. The hotplug locks
are not held, which means that an incoming CPU can concurrently
update this count. This in turn means that an RCU read-side
critical section on the incoming CPU might observe updates
prior to the grace period, but also that this critical section
might extend beyond the end of the optimized synchronize_rcu().
This breaks RCU's fundamental guarantee.
2. In addition, num_online_cpus() does no ordering, thus providing
another way that RCU's fundamental guarantee can be broken by
the current code.
3. The most probable failure mode happens on outgoing CPUs.
The outgoing CPU updates the count of online CPUs in the
CPUHP_TEARDOWN_CPU stop-machine handler, which is fine in
and of itself due to preemption being disabled at the call
to num_online_cpus(). Unfortunately, after that stop-machine
handler returns, the CPU takes one last trip through the
scheduler (which has RCU readers) and, after the resulting
context switch, one final dive into the idle loop. During this
time, RCU needs to keep track of two CPUs, but num_online_cpus()
will say that there is only one, which in turn means that the
surviving CPU will incorrectly ignore the outgoing CPU's RCU
read-side critical sections.
This problem is illustrated by the following litmus test in which P0()
corresponds to synchronize_rcu() and P1() corresponds to the incoming CPU.
The herd7 tool confirms that the "exists" clause can be satisfied,
thus demonstrating that this breakage can happen according to the Linux
kernel memory model.
{
int x = 0;
atomic_t numonline = ATOMIC_INIT(1);
}
P0(int *x, atomic_t *numonline)
{
int r0;
WRITE_ONCE(*x, 1);
r0 = atomic_read(numonline);
if (r0 == 1) {
smp_mb();
} else {
synchronize_rcu();
}
WRITE_ONCE(*x, 2);
}
P1(int *x, atomic_t *numonline)
{
int r0; int r1;
atomic_inc(numonline);
smp_mb();
rcu_read_lock();
r0 = READ_ONCE(*x);
smp_rmb();
r1 = READ_ONCE(*x);
rcu_read_unlock();
}
locations [x;numonline;]
exists (1:r0=0 /\ 1:r1=2)
It is important to note that these problems arise only when the system
is transitioning to or from single-CPU operation.
One solution would be to hold the CPU-hotplug locks while sampling
num_online_cpus(), which was in fact the intent of the (redundant)
preempt_disable() and preempt_enable() surrounding this call to
num_online_cpus(). Actually blocking CPU hotplug would not only result
in excessive overhead, but would also unnecessarily impede CPU-hotplug
operations.
This commit therefore follows long-standing RCU tradition by maintaining
a separate RCU-specific set of CPU-hotplug books.
This separate set of books is implemented by a new ->n_online_cpus field
in the rcu_state structure that maintains RCU's count of the online CPUs.
This count is incremented early in the CPU-online process, so that
the critical transition away from single-CPU operation will occur when
there is only a single CPU. Similarly for the critical transition to
single-CPU operation, the counter is decremented late in the CPU-offline
process, again while there is only a single CPU. Because there is only
ever a single CPU when the ->n_online_cpus field undergoes the critical
1->2 and 2->1 transitions, full memory ordering and mutual exclusion is
provided implicitly and, better yet, for free.
In the case where the CPU is coming online, nothing will happen until
the current CPU helps it come online. Therefore, the new CPU will see
all accesses prior to the optimized grace period, which means that RCU
does not need to further delay this new CPU. In the case where the CPU
is going offline, the outgoing CPU is totally out of the picture before
the optimized grace period starts, which means that this outgoing CPU
cannot see any of the accesses following that grace period. Again,
RCU needs no further interaction with the outgoing CPU.
This does mean that synchronize_rcu() will unnecessarily do a few grace
periods the hard way just before the second CPU comes online and just
after the second-to-last CPU goes offline, but it is not worth optimizing
this uncommon case.
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Neeraj Upadhyay <neeraju@codeaurora.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
2020-09-23 15:29:33 +08:00
|
|
|
int n_online_cpus; /* # CPUs online for RCU. */
|
2009-08-23 04:56:45 +08:00
|
|
|
|
|
|
|
/* The following fields are guarded by the root rcu_node's lock. */
|
|
|
|
|
2015-09-10 03:09:49 +08:00
|
|
|
u8 boost ____cacheline_internodealigned_in_smp;
|
|
|
|
/* Subject to priority boost. */
|
2018-04-27 02:52:09 +08:00
|
|
|
unsigned long gp_seq; /* Grace-period sequence #. */
|
2020-06-12 10:07:52 +08:00
|
|
|
unsigned long gp_max; /* Maximum GP duration in */
|
|
|
|
/* jiffies. */
|
2012-06-19 09:36:08 +08:00
|
|
|
struct task_struct *gp_kthread; /* Task for grace periods. */
|
2016-02-19 16:46:41 +08:00
|
|
|
struct swait_queue_head gp_wq; /* Where GP task waits. */
|
2014-03-12 22:10:41 +08:00
|
|
|
short gp_flags; /* Commands for GP task. */
|
|
|
|
short gp_state; /* GP kthread sleep state. */
|
2018-12-11 08:09:49 +08:00
|
|
|
unsigned long gp_wake_time; /* Last GP kthread wake. */
|
|
|
|
unsigned long gp_wake_seq; /* ->gp_seq at ^^^. */
|
2009-09-24 00:50:42 +08:00
|
|
|
|
2009-12-03 04:10:15 +08:00
|
|
|
/* End of fields guarded by root rcu_node's lock. */
|
2009-09-24 00:50:42 +08:00
|
|
|
|
2012-05-29 20:18:53 +08:00
|
|
|
struct mutex barrier_mutex; /* Guards barrier fields. */
|
2012-05-29 15:34:56 +08:00
|
|
|
atomic_t barrier_cpu_count; /* # CPUs waiting on. */
|
2012-05-29 18:03:37 +08:00
|
|
|
struct completion barrier_completion; /* Wake at barrier end. */
|
2015-06-27 02:20:00 +08:00
|
|
|
unsigned long barrier_sequence; /* ++ at start and end of */
|
2018-07-11 09:37:30 +08:00
|
|
|
/* rcu_barrier(). */
|
2012-10-07 23:36:12 +08:00
|
|
|
/* End of fields guarded by barrier_mutex. */
|
|
|
|
|
2016-01-31 09:57:35 +08:00
|
|
|
struct mutex exp_mutex; /* Serialize expedited GP. */
|
2016-03-17 07:47:55 +08:00
|
|
|
struct mutex exp_wake_mutex; /* Serialize wakeup. */
|
2015-06-25 01:46:30 +08:00
|
|
|
unsigned long expedited_sequence; /* Take a ticket. */
|
2015-06-26 02:27:10 +08:00
|
|
|
atomic_t expedited_need_qs; /* # CPUs left to check in. */
|
2016-02-19 16:46:41 +08:00
|
|
|
struct swait_queue_head expedited_wq; /* Wait for check-ins. */
|
2015-08-01 07:04:45 +08:00
|
|
|
int ncpus_snap; /* # CPUs seen last time. */
|
2019-10-31 02:56:10 +08:00
|
|
|
u8 cbovld; /* Callback overload now? */
|
|
|
|
u8 cbovldnext; /* ^ ^ next time? */
|
2012-10-12 06:24:03 +08:00
|
|
|
|
2009-08-23 04:56:45 +08:00
|
|
|
unsigned long jiffies_force_qs; /* Time at which to invoke */
|
|
|
|
/* force_quiescent_state(). */
|
2016-01-04 12:29:57 +08:00
|
|
|
unsigned long jiffies_kick_kthreads; /* Time at which to kick */
|
|
|
|
/* kthreads, if configured. */
|
2009-08-23 04:56:45 +08:00
|
|
|
unsigned long n_force_qs; /* Number of calls to */
|
|
|
|
/* force_quiescent_state(). */
|
|
|
|
unsigned long gp_start; /* Time at which GP started, */
|
|
|
|
/* but in jiffies. */
|
2018-10-04 08:25:33 +08:00
|
|
|
unsigned long gp_end; /* Time last GP ended, again */
|
|
|
|
/* in jiffies. */
|
2014-12-12 02:20:59 +08:00
|
|
|
unsigned long gp_activity; /* Time of last GP kthread */
|
|
|
|
/* activity in jiffies. */
|
2018-04-22 11:44:11 +08:00
|
|
|
unsigned long gp_req_activity; /* Time of last GP request */
|
|
|
|
/* in jiffies. */
|
2009-08-23 04:56:45 +08:00
|
|
|
unsigned long jiffies_stall; /* Time at which to check */
|
|
|
|
/* for CPU stalls. */
|
2013-09-24 04:57:18 +08:00
|
|
|
unsigned long jiffies_resched; /* Time at which to resched */
|
|
|
|
/* a reluctant CPU. */
|
2014-12-09 01:57:48 +08:00
|
|
|
unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */
|
|
|
|
/* GP start. */
|
2013-07-13 04:50:28 +08:00
|
|
|
const char *name; /* Name of structure. */
|
rcu: Distinguish "rcuo" kthreads by RCU flavor
Currently, the per-no-CBs-CPU kthreads are named "rcuo" followed by
the CPU number, for example, "rcuo". This is problematic given that
there are either two or three RCU flavors, each of which gets a per-CPU
kthread with exactly the same name. This commit therefore introduces
a one-letter abbreviation for each RCU flavor, namely 'b' for RCU-bh,
'p' for RCU-preempt, and 's' for RCU-sched. This abbreviation is used
to distinguish the "rcuo" kthreads, for example, for CPU 0 we would have
"rcuob/0", "rcuop/0", and "rcuos/0".
Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
2012-12-04 00:16:28 +08:00
|
|
|
char abbr; /* Abbreviated name. */
|
rcu: Fix grace-period hangs due to race with CPU offline
Without special fail-safe quiescent-state-propagation checks, grace-period
hangs can result from the following scenario:
1. CPU 1 goes offline.
2. Because CPU 1 is the only CPU in the system blocking the current
grace period, the grace period ends as soon as
rcu_cleanup_dying_idle_cpu()'s call to rcu_report_qs_rnp()
returns.
3. At this point, the leaf rcu_node structure's ->lock is no longer
held: rcu_report_qs_rnp() has released it, as it must in order
to awaken the RCU grace-period kthread.
4. At this point, that same leaf rcu_node structure's ->qsmaskinitnext
field still records CPU 1 as being online. This is absolutely
necessary because the scheduler uses RCU (in this case on the
wake-up path while awakening RCU's grace-period kthread), and
->qsmaskinitnext contains RCU's idea as to which CPUs are online.
Therefore, invoking rcu_report_qs_rnp() after clearing CPU 1's
bit from ->qsmaskinitnext would result in a lockdep-RCU splat
due to RCU being used from an offline CPU.
5. RCU's grace-period kthread awakens, sees that the old grace period
has completed and that a new one is needed. It therefore starts
a new grace period, but because CPU 1's leaf rcu_node structure's
->qsmaskinitnext field still shows CPU 1 as being online, this new
grace period is initialized to wait for a quiescent state from the
now-offline CPU 1.
6. Without the fail-safe force-quiescent-state checks, there would
be no quiescent state from the now-offline CPU 1, which would
eventually result in RCU CPU stall warnings and memory exhaustion.
It would be good to get rid of the special fail-safe quiescent-state
propagation checks, and thus it would be good to fix things so that
the above scenario cannot happen. This commit therefore adds a new
->ofl_lock to the rcu_state structure. This lock is held by rcu_gp_init()
across the applying of buffered online and offline operations to the
rcu_node tree, and it is also held by rcu_cleanup_dying_idle_cpu()
when buffering a new offline operation. This prevents rcu_gp_init()
from acquiring the leaf rcu_node structure's lock during the interval
between when rcu_cleanup_dying_idle_cpu() invokes rcu_report_qs_rnp(),
which releases ->lock and the re-acquisition of that same lock.
This in turn prevents the failure scenario outlined above, and will
hopefully eventually allow removal of the offline-CPU checks from the
force-quiescent-state code path.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2018-05-26 10:23:09 +08:00
|
|
|
|
2018-08-16 00:05:29 +08:00
|
|
|
raw_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp;
|
rcu: Fix grace-period hangs due to race with CPU offline
Without special fail-safe quiescent-state-propagation checks, grace-period
hangs can result from the following scenario:
1. CPU 1 goes offline.
2. Because CPU 1 is the only CPU in the system blocking the current
grace period, the grace period ends as soon as
rcu_cleanup_dying_idle_cpu()'s call to rcu_report_qs_rnp()
returns.
3. At this point, the leaf rcu_node structure's ->lock is no longer
held: rcu_report_qs_rnp() has released it, as it must in order
to awaken the RCU grace-period kthread.
4. At this point, that same leaf rcu_node structure's ->qsmaskinitnext
field still records CPU 1 as being online. This is absolutely
necessary because the scheduler uses RCU (in this case on the
wake-up path while awakening RCU's grace-period kthread), and
->qsmaskinitnext contains RCU's idea as to which CPUs are online.
Therefore, invoking rcu_report_qs_rnp() after clearing CPU 1's
bit from ->qsmaskinitnext would result in a lockdep-RCU splat
due to RCU being used from an offline CPU.
5. RCU's grace-period kthread awakens, sees that the old grace period
has completed and that a new one is needed. It therefore starts
a new grace period, but because CPU 1's leaf rcu_node structure's
->qsmaskinitnext field still shows CPU 1 as being online, this new
grace period is initialized to wait for a quiescent state from the
now-offline CPU 1.
6. Without the fail-safe force-quiescent-state checks, there would
be no quiescent state from the now-offline CPU 1, which would
eventually result in RCU CPU stall warnings and memory exhaustion.
It would be good to get rid of the special fail-safe quiescent-state
propagation checks, and thus it would be good to fix things so that
the above scenario cannot happen. This commit therefore adds a new
->ofl_lock to the rcu_state structure. This lock is held by rcu_gp_init()
across the applying of buffered online and offline operations to the
rcu_node tree, and it is also held by rcu_cleanup_dying_idle_cpu()
when buffering a new offline operation. This prevents rcu_gp_init()
from acquiring the leaf rcu_node structure's lock during the interval
between when rcu_cleanup_dying_idle_cpu() invokes rcu_report_qs_rnp(),
which releases ->lock and the re-acquisition of that same lock.
This in turn prevents the failure scenario outlined above, and will
hopefully eventually allow removal of the offline-CPU checks from the
force-quiescent-state code path.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2018-05-26 10:23:09 +08:00
|
|
|
/* Synchronize offline with */
|
|
|
|
/* GP pre-initialization. */
|
2009-08-23 04:56:45 +08:00
|
|
|
};
|
|
|
|
|
2012-06-23 08:06:26 +08:00
|
|
|
/* Values for rcu_state structure's gp_flags field. */
|
|
|
|
#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */
|
|
|
|
#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */
|
2020-02-23 12:07:09 +08:00
|
|
|
#define RCU_GP_FLAG_OVLD 0x4 /* Experiencing callback overload. */
|
2012-06-23 08:06:26 +08:00
|
|
|
|
2015-09-11 02:21:28 +08:00
|
|
|
/* Values for rcu_state structure's gp_state field. */
|
2015-09-10 03:09:49 +08:00
|
|
|
#define RCU_GP_IDLE 0 /* Initial state and no GP in progress. */
|
2014-03-12 22:10:41 +08:00
|
|
|
#define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */
|
2015-05-20 05:16:52 +08:00
|
|
|
#define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */
|
2018-05-16 06:47:30 +08:00
|
|
|
#define RCU_GP_ONOFF 3 /* Grace-period initialization hotplug. */
|
|
|
|
#define RCU_GP_INIT 4 /* Grace-period initialization. */
|
|
|
|
#define RCU_GP_WAIT_FQS 5 /* Wait for force-quiescent-state time. */
|
|
|
|
#define RCU_GP_DOING_FQS 6 /* Wait done for force-quiescent-state time. */
|
|
|
|
#define RCU_GP_CLEANUP 7 /* Grace-period cleanup started. */
|
|
|
|
#define RCU_GP_CLEANED 8 /* Grace-period cleanup complete. */
|
2014-03-12 22:10:41 +08:00
|
|
|
|
2018-07-04 05:15:31 +08:00
|
|
|
/*
|
|
|
|
* In order to export the rcu_state name to the tracing tools, it
|
|
|
|
* needs to be added in the __tracepoint_string section.
|
|
|
|
* This requires defining a separate variable tp_<sname>_varname
|
|
|
|
* that points to the string being used, and this will allow
|
|
|
|
* the tracing userspace tools to be able to decipher the string
|
|
|
|
* address to the matching string.
|
|
|
|
*/
|
|
|
|
#ifdef CONFIG_PREEMPT_RCU
|
|
|
|
#define RCU_ABBR 'p'
|
|
|
|
#define RCU_NAME_RAW "rcu_preempt"
|
|
|
|
#else /* #ifdef CONFIG_PREEMPT_RCU */
|
|
|
|
#define RCU_ABBR 's'
|
|
|
|
#define RCU_NAME_RAW "rcu_sched"
|
|
|
|
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
|
|
|
|
#ifndef CONFIG_TRACING
|
|
|
|
#define RCU_NAME RCU_NAME_RAW
|
|
|
|
#else /* #ifdef CONFIG_TRACING */
|
|
|
|
static char rcu_name[] = RCU_NAME_RAW;
|
|
|
|
static const char *tp_rcu_varname __used __tracepoint_string = rcu_name;
|
|
|
|
#define RCU_NAME rcu_name
|
|
|
|
#endif /* #else #ifdef CONFIG_TRACING */
|
2015-11-18 06:39:26 +08:00
|
|
|
|
2019-01-12 08:57:41 +08:00
|
|
|
/* Forward declarations for tree_plugin.h */
|
2009-11-11 05:37:19 +08:00
|
|
|
static void rcu_bootup_announce(void);
|
2018-07-03 05:30:37 +08:00
|
|
|
static void rcu_qs(void);
|
2011-02-08 04:47:15 +08:00
|
|
|
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
|
rcu: Fix grace-period-stall bug on large systems with CPU hotplug
When the last CPU of a given leaf rcu_node structure goes
offline, all of the tasks queued on that leaf rcu_node structure
(due to having blocked in their current RCU read-side critical
sections) are requeued onto the root rcu_node structure. This
requeuing is carried out by rcu_preempt_offline_tasks().
However, it is possible that these queued tasks are the only
thing preventing the leaf rcu_node structure from reporting a
quiescent state up the rcu_node hierarchy. Unfortunately, the
old code would fail to do this reporting, resulting in a
grace-period stall given the following sequence of events:
1. Kernel built for more than 32 CPUs on 32-bit systems or for more
than 64 CPUs on 64-bit systems, so that there is more than one
rcu_node structure. (Or CONFIG_RCU_FANOUT is artificially set
to a number smaller than CONFIG_NR_CPUS.)
2. The kernel is built with CONFIG_TREE_PREEMPT_RCU.
3. A task running on a CPU associated with a given leaf rcu_node
structure blocks while in an RCU read-side critical section
-and- that CPU has not yet passed through a quiescent state
for the current RCU grace period. This will cause the task
to be queued on the leaf rcu_node's blocked_tasks[] array, in
particular, on the element of this array corresponding to the
current grace period.
4. Each of the remaining CPUs corresponding to this same leaf rcu_node
structure pass through a quiescent state. However, the task is
still in its RCU read-side critical section, so these quiescent
states cannot be reported further up the rcu_node hierarchy.
Nevertheless, all bits in the leaf rcu_node structure's ->qsmask
field are now zero.
5. Each of the remaining CPUs go offline. (The events in step
#4 and #5 can happen in any order as long as each CPU passes
through a quiescent state before going offline.)
6. When the last CPU goes offline, __rcu_offline_cpu() will invoke
rcu_preempt_offline_tasks(), which will move the task to the
root rcu_node structure, but without reporting a quiescent state
up the rcu_node hierarchy (and this failure to report a quiescent
state is the bug).
But because this leaf rcu_node structure's ->qsmask field is
already zero and its ->block_tasks[] entries are all empty,
force_quiescent_state() will skip this rcu_node structure.
Therefore, grace periods are now hung.
This patch abstracts some code out of rcu_read_unlock_special(),
calling the result task_quiet() by analogy with cpu_quiet(), and
invokes task_quiet() from both rcu_read_lock_special() and
__rcu_offline_cpu(). Invoking task_quiet() from
__rcu_offline_cpu() reports the quiescent state up the rcu_node
hierarchy, fixing the bug. This ends up requiring a separate
lock_class_key per level of the rcu_node hierarchy, which this
patch also provides.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <12589088301770-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-11-23 00:53:48 +08:00
|
|
|
#ifdef CONFIG_HOTPLUG_CPU
|
2014-11-01 02:22:37 +08:00
|
|
|
static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
|
rcu: Fix grace-period-stall bug on large systems with CPU hotplug
When the last CPU of a given leaf rcu_node structure goes
offline, all of the tasks queued on that leaf rcu_node structure
(due to having blocked in their current RCU read-side critical
sections) are requeued onto the root rcu_node structure. This
requeuing is carried out by rcu_preempt_offline_tasks().
However, it is possible that these queued tasks are the only
thing preventing the leaf rcu_node structure from reporting a
quiescent state up the rcu_node hierarchy. Unfortunately, the
old code would fail to do this reporting, resulting in a
grace-period stall given the following sequence of events:
1. Kernel built for more than 32 CPUs on 32-bit systems or for more
than 64 CPUs on 64-bit systems, so that there is more than one
rcu_node structure. (Or CONFIG_RCU_FANOUT is artificially set
to a number smaller than CONFIG_NR_CPUS.)
2. The kernel is built with CONFIG_TREE_PREEMPT_RCU.
3. A task running on a CPU associated with a given leaf rcu_node
structure blocks while in an RCU read-side critical section
-and- that CPU has not yet passed through a quiescent state
for the current RCU grace period. This will cause the task
to be queued on the leaf rcu_node's blocked_tasks[] array, in
particular, on the element of this array corresponding to the
current grace period.
4. Each of the remaining CPUs corresponding to this same leaf rcu_node
structure pass through a quiescent state. However, the task is
still in its RCU read-side critical section, so these quiescent
states cannot be reported further up the rcu_node hierarchy.
Nevertheless, all bits in the leaf rcu_node structure's ->qsmask
field are now zero.
5. Each of the remaining CPUs go offline. (The events in step
#4 and #5 can happen in any order as long as each CPU passes
through a quiescent state before going offline.)
6. When the last CPU goes offline, __rcu_offline_cpu() will invoke
rcu_preempt_offline_tasks(), which will move the task to the
root rcu_node structure, but without reporting a quiescent state
up the rcu_node hierarchy (and this failure to report a quiescent
state is the bug).
But because this leaf rcu_node structure's ->qsmask field is
already zero and its ->block_tasks[] entries are all empty,
force_quiescent_state() will skip this rcu_node structure.
Therefore, grace periods are now hung.
This patch abstracts some code out of rcu_read_unlock_special(),
calling the result task_quiet() by analogy with cpu_quiet(), and
invokes task_quiet() from both rcu_read_lock_special() and
__rcu_offline_cpu(). Invoking task_quiet() from
__rcu_offline_cpu() reports the quiescent state up the rcu_node
hierarchy, fixing the bug. This ends up requiring a separate
lock_class_key per level of the rcu_node hierarchy, which this
patch also provides.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <12589088301770-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-11-23 00:53:48 +08:00
|
|
|
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
|
2015-08-19 01:20:43 +08:00
|
|
|
static int rcu_print_task_exp_stall(struct rcu_node *rnp);
|
2018-07-04 08:22:34 +08:00
|
|
|
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
|
2018-11-22 03:35:03 +08:00
|
|
|
static void rcu_flavor_sched_clock_irq(int user);
|
2018-07-04 08:22:34 +08:00
|
|
|
static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck);
|
2011-05-05 12:43:49 +08:00
|
|
|
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
|
2011-06-16 06:47:09 +08:00
|
|
|
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
|
2011-11-30 07:57:13 +08:00
|
|
|
static bool rcu_is_callbacks_kthread(void);
|
2019-03-21 05:13:33 +08:00
|
|
|
static void rcu_cpu_kthread_setup(unsigned int cpu);
|
2014-07-14 03:00:53 +08:00
|
|
|
static void __init rcu_spawn_boost_kthreads(void);
|
2013-06-20 02:52:21 +08:00
|
|
|
static void rcu_prepare_kthreads(int cpu);
|
2014-10-23 06:07:37 +08:00
|
|
|
static void rcu_cleanup_after_idle(void);
|
2014-10-23 06:03:43 +08:00
|
|
|
static void rcu_prepare_for_idle(void);
|
rcu: Process offlining and onlining only at grace-period start
Races between CPU hotplug and grace periods can be difficult to resolve,
so the ->onoff_mutex is used to exclude the two events. Unfortunately,
this means that it is impossible for an outgoing CPU to perform the
last bits of its offlining from its last pass through the idle loop,
because sleeplocks cannot be acquired in that context.
This commit avoids these problems by buffering online and offline events
in a new ->qsmaskinitnext field in the leaf rcu_node structures. When a
grace period starts, the events accumulated in this mask are applied to
the ->qsmaskinit field, and, if needed, up the rcu_node tree. The special
case of all CPUs corresponding to a given leaf rcu_node structure being
offline while there are still elements in that structure's ->blkd_tasks
list is handled using a new ->wait_blkd_tasks field. In this case,
propagating the offline bits up the tree is deferred until the beginning
of the grace period after all of the tasks have exited their RCU read-side
critical sections and removed themselves from the list, at which point
the ->wait_blkd_tasks flag is cleared. If one of that leaf rcu_node
structure's CPUs comes back online before the list empties, then the
->wait_blkd_tasks flag is simply cleared.
This of course means that RCU's notion of which CPUs are offline can be
out of date. This is OK because RCU need only wait on CPUs that were
online at the time that the grace period started. In addition, RCU's
force-quiescent-state actions will handle the case where a CPU goes
offline after the grace period starts.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2015-01-24 13:52:37 +08:00
|
|
|
static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
|
rcu: Defer reporting RCU-preempt quiescent states when disabled
This commit defers reporting of RCU-preempt quiescent states at
rcu_read_unlock_special() time when any of interrupts, softirq, or
preemption are disabled. These deferred quiescent states are reported
at a later RCU_SOFTIRQ, context switch, idle entry, or CPU-hotplug
offline operation. Of course, if another RCU read-side critical
section has started in the meantime, the reporting of the quiescent
state will be further deferred.
This also means that disabling preemption, interrupts, and/or
softirqs will act as an RCU-preempt read-side critical section.
This is enforced by checking preempt_count() as needed.
Some special cases must be handled on an ad-hoc basis, for example,
context switch is a quiescent state even though both the scheduler and
do_exit() disable preemption. In these cases, additional calls to
rcu_preempt_deferred_qs() override the preemption disabling. Similar
logic overrides disabled interrupts in rcu_preempt_check_callbacks()
because in this case the quiescent state happened just before the
corresponding scheduling-clock interrupt.
In theory, this change lifts a long-standing restriction that required
that if interrupts were disabled across a call to rcu_read_unlock()
that the matching rcu_read_lock() also be contained within that
interrupts-disabled region of code. Because the reporting of the
corresponding RCU-preempt quiescent state is now deferred until
after interrupts have been enabled, it is no longer possible for this
situation to result in deadlocks involving the scheduler's runqueue and
priority-inheritance locks. This may allow some code simplification that
might reduce interrupt latency a bit. Unfortunately, in practice this
would also defer deboosting a low-priority task that had been subjected
to RCU priority boosting, so real-time-response considerations might
well force this restriction to remain in place.
Because RCU-preempt grace periods are now blocked not only by RCU
read-side critical sections, but also by disabling of interrupts,
preemption, and softirqs, it will be possible to eliminate RCU-bh and
RCU-sched in favor of RCU-preempt in CONFIG_PREEMPT=y kernels. This may
require some additional plumbing to provide the network denial-of-service
guarantees that have been traditionally provided by RCU-bh. Once these
are in place, CONFIG_PREEMPT=n kernels will be able to fold RCU-bh
into RCU-sched. This would mean that all kernels would have but
one flavor of RCU, which would open the door to significant code
cleanup.
Moving to a single flavor of RCU would also have the beneficial effect
of reducing the NOCB kthreads by at least a factor of two.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
[ paulmck: Apply rcu_read_unlock_special() preempt_count() feedback
from Joel Fernandes. ]
[ paulmck: Adjust rcu_eqs_enter() call to rcu_preempt_deferred_qs() in
response to bug reports from kbuild test robot. ]
[ paulmck: Fix bug located by kbuild test robot involving recursion
via rcu_preempt_deferred_qs(). ]
2018-06-22 03:50:01 +08:00
|
|
|
static bool rcu_preempt_need_deferred_qs(struct task_struct *t);
|
|
|
|
static void rcu_preempt_deferred_qs(struct task_struct *t);
|
2012-01-17 05:29:10 +08:00
|
|
|
static void zero_cpu_stall_ticks(struct rcu_data *rdp);
|
2016-02-19 16:46:41 +08:00
|
|
|
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
|
|
|
|
static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
|
2013-02-11 12:48:58 +08:00
|
|
|
static void rcu_init_one_nocb(struct rcu_node *rnp);
|
rcu/nocb: Add bypass callback queueing
Use of the rcu_data structure's segmented ->cblist for no-CBs CPUs
takes advantage of unrelated grace periods, thus reducing the memory
footprint in the face of floods of call_rcu() invocations. However,
the ->cblist field is a more-complex rcu_segcblist structure which must
be protected via locking. Even though there are only three entities
which can acquire this lock (the CPU invoking call_rcu(), the no-CBs
grace-period kthread, and the no-CBs callbacks kthread), the contention
on this lock is excessive under heavy stress.
This commit therefore greatly reduces contention by provisioning
an rcu_cblist structure field named ->nocb_bypass within the
rcu_data structure. Each no-CBs CPU is permitted only a limited
number of enqueues onto the ->cblist per jiffy, controlled by a new
nocb_nobypass_lim_per_jiffy kernel boot parameter that defaults to
about 16 enqueues per millisecond (16 * 1000 / HZ). When that limit is
exceeded, the CPU instead enqueues onto the new ->nocb_bypass.
The ->nocb_bypass is flushed into the ->cblist every jiffy or when
the number of callbacks on ->nocb_bypass exceeds qhimark, whichever
happens first. During call_rcu() floods, this flushing is carried out
by the CPU during the course of its call_rcu() invocations. However,
a CPU could simply stop invoking call_rcu() at any time. The no-CBs
grace-period kthread therefore carries out less-aggressive flushing
(every few jiffies or when the number of callbacks on ->nocb_bypass
exceeds (2 * qhimark), whichever comes first). This means that the
no-CBs grace-period kthread cannot be permitted to do unbounded waits
while there are callbacks on ->nocb_bypass. A ->nocb_bypass_timer is
used to provide the needed wakeups.
[ paulmck: Apply Coverity feedback reported by Colin Ian King. ]
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
2019-07-03 07:03:33 +08:00
|
|
|
static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
|
|
|
|
unsigned long j);
|
|
|
|
static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
|
|
|
|
bool *was_alldone, unsigned long flags);
|
2019-05-16 00:56:40 +08:00
|
|
|
static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
|
|
|
|
unsigned long flags);
|
2014-07-30 05:50:47 +08:00
|
|
|
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
|
rcu: Break call_rcu() deadlock involving scheduler and perf
Dave Jones got the following lockdep splat:
> ======================================================
> [ INFO: possible circular locking dependency detected ]
> 3.12.0-rc3+ #92 Not tainted
> -------------------------------------------------------
> trinity-child2/15191 is trying to acquire lock:
> (&rdp->nocb_wq){......}, at: [<ffffffff8108ff43>] __wake_up+0x23/0x50
>
> but task is already holding lock:
> (&ctx->lock){-.-...}, at: [<ffffffff81154c19>] perf_event_exit_task+0x109/0x230
>
> which lock already depends on the new lock.
>
>
> the existing dependency chain (in reverse order) is:
>
> -> #3 (&ctx->lock){-.-...}:
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff81733f90>] _raw_spin_lock+0x40/0x80
> [<ffffffff811500ff>] __perf_event_task_sched_out+0x2df/0x5e0
> [<ffffffff81091b83>] perf_event_task_sched_out+0x93/0xa0
> [<ffffffff81732052>] __schedule+0x1d2/0xa20
> [<ffffffff81732f30>] preempt_schedule_irq+0x50/0xb0
> [<ffffffff817352b6>] retint_kernel+0x26/0x30
> [<ffffffff813eed04>] tty_flip_buffer_push+0x34/0x50
> [<ffffffff813f0504>] pty_write+0x54/0x60
> [<ffffffff813e900d>] n_tty_write+0x32d/0x4e0
> [<ffffffff813e5838>] tty_write+0x158/0x2d0
> [<ffffffff811c4850>] vfs_write+0xc0/0x1f0
> [<ffffffff811c52cc>] SyS_write+0x4c/0xa0
> [<ffffffff8173d4e4>] tracesys+0xdd/0xe2
>
> -> #2 (&rq->lock){-.-.-.}:
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff81733f90>] _raw_spin_lock+0x40/0x80
> [<ffffffff810980b2>] wake_up_new_task+0xc2/0x2e0
> [<ffffffff81054336>] do_fork+0x126/0x460
> [<ffffffff81054696>] kernel_thread+0x26/0x30
> [<ffffffff8171ff93>] rest_init+0x23/0x140
> [<ffffffff81ee1e4b>] start_kernel+0x3f6/0x403
> [<ffffffff81ee1571>] x86_64_start_reservations+0x2a/0x2c
> [<ffffffff81ee1664>] x86_64_start_kernel+0xf1/0xf4
>
> -> #1 (&p->pi_lock){-.-.-.}:
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff8173419b>] _raw_spin_lock_irqsave+0x4b/0x90
> [<ffffffff810979d1>] try_to_wake_up+0x31/0x350
> [<ffffffff81097d62>] default_wake_function+0x12/0x20
> [<ffffffff81084af8>] autoremove_wake_function+0x18/0x40
> [<ffffffff8108ea38>] __wake_up_common+0x58/0x90
> [<ffffffff8108ff59>] __wake_up+0x39/0x50
> [<ffffffff8110d4f8>] __call_rcu_nocb_enqueue+0xa8/0xc0
> [<ffffffff81111450>] __call_rcu+0x140/0x820
> [<ffffffff81111b8d>] call_rcu+0x1d/0x20
> [<ffffffff81093697>] cpu_attach_domain+0x287/0x360
> [<ffffffff81099d7e>] build_sched_domains+0xe5e/0x10a0
> [<ffffffff81efa7fc>] sched_init_smp+0x3b7/0x47a
> [<ffffffff81ee1f4e>] kernel_init_freeable+0xf6/0x202
> [<ffffffff817200be>] kernel_init+0xe/0x190
> [<ffffffff8173d22c>] ret_from_fork+0x7c/0xb0
>
> -> #0 (&rdp->nocb_wq){......}:
> [<ffffffff810cb7ca>] __lock_acquire+0x191a/0x1be0
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff8173419b>] _raw_spin_lock_irqsave+0x4b/0x90
> [<ffffffff8108ff43>] __wake_up+0x23/0x50
> [<ffffffff8110d4f8>] __call_rcu_nocb_enqueue+0xa8/0xc0
> [<ffffffff81111450>] __call_rcu+0x140/0x820
> [<ffffffff81111bb0>] kfree_call_rcu+0x20/0x30
> [<ffffffff81149abf>] put_ctx+0x4f/0x70
> [<ffffffff81154c3e>] perf_event_exit_task+0x12e/0x230
> [<ffffffff81056b8d>] do_exit+0x30d/0xcc0
> [<ffffffff8105893c>] do_group_exit+0x4c/0xc0
> [<ffffffff810589c4>] SyS_exit_group+0x14/0x20
> [<ffffffff8173d4e4>] tracesys+0xdd/0xe2
>
> other info that might help us debug this:
>
> Chain exists of:
> &rdp->nocb_wq --> &rq->lock --> &ctx->lock
>
> Possible unsafe locking scenario:
>
> CPU0 CPU1
> ---- ----
> lock(&ctx->lock);
> lock(&rq->lock);
> lock(&ctx->lock);
> lock(&rdp->nocb_wq);
>
> *** DEADLOCK ***
>
> 1 lock held by trinity-child2/15191:
> #0: (&ctx->lock){-.-...}, at: [<ffffffff81154c19>] perf_event_exit_task+0x109/0x230
>
> stack backtrace:
> CPU: 2 PID: 15191 Comm: trinity-child2 Not tainted 3.12.0-rc3+ #92
> ffffffff82565b70 ffff880070c2dbf8 ffffffff8172a363 ffffffff824edf40
> ffff880070c2dc38 ffffffff81726741 ffff880070c2dc90 ffff88022383b1c0
> ffff88022383aac0 0000000000000000 ffff88022383b188 ffff88022383b1c0
> Call Trace:
> [<ffffffff8172a363>] dump_stack+0x4e/0x82
> [<ffffffff81726741>] print_circular_bug+0x200/0x20f
> [<ffffffff810cb7ca>] __lock_acquire+0x191a/0x1be0
> [<ffffffff810c6439>] ? get_lock_stats+0x19/0x60
> [<ffffffff8100b2f4>] ? native_sched_clock+0x24/0x80
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff8108ff43>] ? __wake_up+0x23/0x50
> [<ffffffff8173419b>] _raw_spin_lock_irqsave+0x4b/0x90
> [<ffffffff8108ff43>] ? __wake_up+0x23/0x50
> [<ffffffff8108ff43>] __wake_up+0x23/0x50
> [<ffffffff8110d4f8>] __call_rcu_nocb_enqueue+0xa8/0xc0
> [<ffffffff81111450>] __call_rcu+0x140/0x820
> [<ffffffff8109bc8f>] ? local_clock+0x3f/0x50
> [<ffffffff81111bb0>] kfree_call_rcu+0x20/0x30
> [<ffffffff81149abf>] put_ctx+0x4f/0x70
> [<ffffffff81154c3e>] perf_event_exit_task+0x12e/0x230
> [<ffffffff81056b8d>] do_exit+0x30d/0xcc0
> [<ffffffff810c9af5>] ? trace_hardirqs_on_caller+0x115/0x1e0
> [<ffffffff810c9bcd>] ? trace_hardirqs_on+0xd/0x10
> [<ffffffff8105893c>] do_group_exit+0x4c/0xc0
> [<ffffffff810589c4>] SyS_exit_group+0x14/0x20
> [<ffffffff8173d4e4>] tracesys+0xdd/0xe2
The underlying problem is that perf is invoking call_rcu() with the
scheduler locks held, but in NOCB mode, call_rcu() will with high
probability invoke the scheduler -- which just might want to use its
locks. The reason that call_rcu() needs to invoke the scheduler is
to wake up the corresponding rcuo callback-offload kthread, which
does the job of starting up a grace period and invoking the callbacks
afterwards.
One solution (championed on a related problem by Lai Jiangshan) is to
simply defer the wakeup to some point where scheduler locks are no longer
held. Since we don't want to unnecessarily incur the cost of such
deferral, the task before us is threefold:
1. Determine when it is likely that a relevant scheduler lock is held.
2. Defer the wakeup in such cases.
3. Ensure that all deferred wakeups eventually happen, preferably
sooner rather than later.
We use irqs_disabled_flags() as a proxy for relevant scheduler locks
being held. This works because the relevant locks are always acquired
with interrupts disabled. We may defer more often than needed, but that
is at least safe.
The wakeup deferral is tracked via a new field in the per-CPU and
per-RCU-flavor rcu_data structure, namely ->nocb_defer_wakeup.
This flag is checked by the RCU core processing. The __rcu_pending()
function now checks this flag, which causes rcu_check_callbacks()
to initiate RCU core processing at each scheduling-clock interrupt
where this flag is set. Of course this is not sufficient because
scheduling-clock interrupts are often turned off (the things we used to
be able to count on!). So the flags are also checked on entry to any
state that RCU considers to be idle, which includes both NO_HZ_IDLE idle
state and NO_HZ_FULL user-mode-execution state.
This approach should allow call_rcu() to be invoked regardless of what
locks you might be holding, the key word being "should".
Reported-by: Dave Jones <davej@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
2013-10-05 05:33:34 +08:00
|
|
|
static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
|
2012-08-20 12:35:53 +08:00
|
|
|
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
|
2018-11-28 05:55:53 +08:00
|
|
|
static void rcu_spawn_cpu_nocb_kthread(int cpu);
|
2014-07-12 02:30:24 +08:00
|
|
|
static void __init rcu_spawn_nocb_kthreads(void);
|
2019-06-26 04:32:51 +08:00
|
|
|
static void show_rcu_nocb_state(struct rcu_data *rdp);
|
2019-05-16 00:56:40 +08:00
|
|
|
static void rcu_nocb_lock(struct rcu_data *rdp);
|
|
|
|
static void rcu_nocb_unlock(struct rcu_data *rdp);
|
|
|
|
static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
|
|
|
|
unsigned long flags);
|
rcu/nocb: Add bypass callback queueing
Use of the rcu_data structure's segmented ->cblist for no-CBs CPUs
takes advantage of unrelated grace periods, thus reducing the memory
footprint in the face of floods of call_rcu() invocations. However,
the ->cblist field is a more-complex rcu_segcblist structure which must
be protected via locking. Even though there are only three entities
which can acquire this lock (the CPU invoking call_rcu(), the no-CBs
grace-period kthread, and the no-CBs callbacks kthread), the contention
on this lock is excessive under heavy stress.
This commit therefore greatly reduces contention by provisioning
an rcu_cblist structure field named ->nocb_bypass within the
rcu_data structure. Each no-CBs CPU is permitted only a limited
number of enqueues onto the ->cblist per jiffy, controlled by a new
nocb_nobypass_lim_per_jiffy kernel boot parameter that defaults to
about 16 enqueues per millisecond (16 * 1000 / HZ). When that limit is
exceeded, the CPU instead enqueues onto the new ->nocb_bypass.
The ->nocb_bypass is flushed into the ->cblist every jiffy or when
the number of callbacks on ->nocb_bypass exceeds qhimark, whichever
happens first. During call_rcu() floods, this flushing is carried out
by the CPU during the course of its call_rcu() invocations. However,
a CPU could simply stop invoking call_rcu() at any time. The no-CBs
grace-period kthread therefore carries out less-aggressive flushing
(every few jiffies or when the number of callbacks on ->nocb_bypass
exceeds (2 * qhimark), whichever comes first). This means that the
no-CBs grace-period kthread cannot be permitted to do unbounded waits
while there are callbacks on ->nocb_bypass. A ->nocb_bypass_timer is
used to provide the needed wakeups.
[ paulmck: Apply Coverity feedback reported by Colin Ian King. ]
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
2019-07-03 07:03:33 +08:00
|
|
|
static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp);
|
2014-07-12 02:30:24 +08:00
|
|
|
#ifdef CONFIG_RCU_NOCB_CPU
|
2018-07-04 08:22:34 +08:00
|
|
|
static void __init rcu_organize_nocb_kthreads(void);
|
2019-05-28 22:18:08 +08:00
|
|
|
#define rcu_nocb_lock_irqsave(rdp, flags) \
|
|
|
|
do { \
|
rcu/nocb: Add bypass callback queueing
Use of the rcu_data structure's segmented ->cblist for no-CBs CPUs
takes advantage of unrelated grace periods, thus reducing the memory
footprint in the face of floods of call_rcu() invocations. However,
the ->cblist field is a more-complex rcu_segcblist structure which must
be protected via locking. Even though there are only three entities
which can acquire this lock (the CPU invoking call_rcu(), the no-CBs
grace-period kthread, and the no-CBs callbacks kthread), the contention
on this lock is excessive under heavy stress.
This commit therefore greatly reduces contention by provisioning
an rcu_cblist structure field named ->nocb_bypass within the
rcu_data structure. Each no-CBs CPU is permitted only a limited
number of enqueues onto the ->cblist per jiffy, controlled by a new
nocb_nobypass_lim_per_jiffy kernel boot parameter that defaults to
about 16 enqueues per millisecond (16 * 1000 / HZ). When that limit is
exceeded, the CPU instead enqueues onto the new ->nocb_bypass.
The ->nocb_bypass is flushed into the ->cblist every jiffy or when
the number of callbacks on ->nocb_bypass exceeds qhimark, whichever
happens first. During call_rcu() floods, this flushing is carried out
by the CPU during the course of its call_rcu() invocations. However,
a CPU could simply stop invoking call_rcu() at any time. The no-CBs
grace-period kthread therefore carries out less-aggressive flushing
(every few jiffies or when the number of callbacks on ->nocb_bypass
exceeds (2 * qhimark), whichever comes first). This means that the
no-CBs grace-period kthread cannot be permitted to do unbounded waits
while there are callbacks on ->nocb_bypass. A ->nocb_bypass_timer is
used to provide the needed wakeups.
[ paulmck: Apply Coverity feedback reported by Colin Ian King. ]
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
2019-07-03 07:03:33 +08:00
|
|
|
if (!rcu_segcblist_is_offloaded(&(rdp)->cblist)) \
|
2019-05-28 22:18:08 +08:00
|
|
|
local_irq_save(flags); \
|
rcu/nocb: Add bypass callback queueing
Use of the rcu_data structure's segmented ->cblist for no-CBs CPUs
takes advantage of unrelated grace periods, thus reducing the memory
footprint in the face of floods of call_rcu() invocations. However,
the ->cblist field is a more-complex rcu_segcblist structure which must
be protected via locking. Even though there are only three entities
which can acquire this lock (the CPU invoking call_rcu(), the no-CBs
grace-period kthread, and the no-CBs callbacks kthread), the contention
on this lock is excessive under heavy stress.
This commit therefore greatly reduces contention by provisioning
an rcu_cblist structure field named ->nocb_bypass within the
rcu_data structure. Each no-CBs CPU is permitted only a limited
number of enqueues onto the ->cblist per jiffy, controlled by a new
nocb_nobypass_lim_per_jiffy kernel boot parameter that defaults to
about 16 enqueues per millisecond (16 * 1000 / HZ). When that limit is
exceeded, the CPU instead enqueues onto the new ->nocb_bypass.
The ->nocb_bypass is flushed into the ->cblist every jiffy or when
the number of callbacks on ->nocb_bypass exceeds qhimark, whichever
happens first. During call_rcu() floods, this flushing is carried out
by the CPU during the course of its call_rcu() invocations. However,
a CPU could simply stop invoking call_rcu() at any time. The no-CBs
grace-period kthread therefore carries out less-aggressive flushing
(every few jiffies or when the number of callbacks on ->nocb_bypass
exceeds (2 * qhimark), whichever comes first). This means that the
no-CBs grace-period kthread cannot be permitted to do unbounded waits
while there are callbacks on ->nocb_bypass. A ->nocb_bypass_timer is
used to provide the needed wakeups.
[ paulmck: Apply Coverity feedback reported by Colin Ian King. ]
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
2019-07-03 07:03:33 +08:00
|
|
|
else \
|
2019-05-28 22:18:08 +08:00
|
|
|
raw_spin_lock_irqsave(&(rdp)->nocb_lock, (flags)); \
|
|
|
|
} while (0)
|
|
|
|
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
|
|
|
|
#define rcu_nocb_lock_irqsave(rdp, flags) local_irq_save(flags)
|
|
|
|
#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
|
|
|
|
|
2013-06-22 08:10:40 +08:00
|
|
|
static void rcu_bind_gp_kthread(void);
|
2018-07-04 08:22:34 +08:00
|
|
|
static bool rcu_nohz_full_cpu(void);
|
2014-08-05 08:43:50 +08:00
|
|
|
static void rcu_dynticks_task_enter(void);
|
|
|
|
static void rcu_dynticks_task_exit(void);
|
rcu-tasks: Avoid IPIing userspace/idle tasks if kernel is so built
Systems running CPU-bound real-time task do not want IPIs sent to CPUs
executing nohz_full userspace tasks. Battery-powered systems don't
want IPIs sent to idle CPUs in low-power mode. Unfortunately, RCU tasks
trace can and will send such IPIs in some cases.
Both of these situations occur only when the target CPU is in RCU
dyntick-idle mode, in other words, when RCU is not watching the
target CPU. This suggests that CPUs in dyntick-idle mode should use
memory barriers in outermost invocations of rcu_read_lock_trace()
and rcu_read_unlock_trace(), which would allow the RCU tasks trace
grace period to directly read out the target CPU's read-side state.
One challenge is that RCU tasks trace is not targeting a specific
CPU, but rather a task. And that task could switch from one CPU to
another at any time.
This commit therefore uses try_invoke_on_locked_down_task()
and checks for task_curr() in trc_inspect_reader_notrunning().
When this condition holds, the target task is running and cannot move.
If CONFIG_TASKS_TRACE_RCU_READ_MB=y, the new rcu_dynticks_zero_in_eqs()
function can be used to check if the specified integer (in this case,
t->trc_reader_nesting) is zero while the target CPU remains in that same
dyntick-idle sojourn. If so, the target task is in a quiescent state.
If not, trc_read_check_handler() must indicate failure so that the
grace-period kthread can take appropriate action or retry after an
appropriate delay, as the case may be.
With this change, given CONFIG_TASKS_TRACE_RCU_READ_MB=y, if a given
CPU remains idle or a given task continues executing in nohz_full mode,
the RCU tasks trace grace-period kthread will detect this without the
need to send an IPI.
Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
2020-03-20 06:33:12 +08:00
|
|
|
static void rcu_dynticks_task_trace_enter(void);
|
|
|
|
static void rcu_dynticks_task_trace_exit(void);
|
2019-01-12 08:57:41 +08:00
|
|
|
|
|
|
|
/* Forward declarations for tree_stall.h */
|
|
|
|
static void record_gp_stall_check_time(void);
|
2019-01-15 02:19:20 +08:00
|
|
|
static void rcu_iw_handler(struct irq_work *iwp);
|
2019-01-12 08:57:41 +08:00
|
|
|
static void check_cpu_stall(struct rcu_data *rdp);
|
2019-01-15 23:01:33 +08:00
|
|
|
static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
|
|
|
|
const unsigned long gpssdelay);
|