linux/drivers/base/arch_topology.c
Wang ShaoBo 4cc4cc28ec arch_topology: Fix missing clear cluster_cpumask in remove_cpu_topology()
When testing cpu online and offline, warning happened like this:

[  146.746743] WARNING: CPU: 92 PID: 974 at kernel/sched/topology.c:2215 build_sched_domains+0x81c/0x11b0
[  146.749988] CPU: 92 PID: 974 Comm: kworker/92:2 Not tainted 5.15.0 #9
[  146.750402] Hardware name: Huawei TaiShan 2280 V2/BC82AMDDA, BIOS 1.79 08/21/2021
[  146.751213] Workqueue: events cpuset_hotplug_workfn
[  146.751629] pstate: 00400009 (nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[  146.752048] pc : build_sched_domains+0x81c/0x11b0
[  146.752461] lr : build_sched_domains+0x414/0x11b0
[  146.752860] sp : ffff800040a83a80
[  146.753247] x29: ffff800040a83a80 x28: ffff20801f13a980 x27: ffff20800448ae00
[  146.753644] x26: ffff800012a858e8 x25: ffff800012ea48c0 x24: 0000000000000000
[  146.754039] x23: ffff800010ab7d60 x22: ffff800012f03758 x21: 000000000000005f
[  146.754427] x20: 000000000000005c x19: ffff004080012840 x18: ffffffffffffffff
[  146.754814] x17: 3661613030303230 x16: 30303078303a3239 x15: ffff800011f92b48
[  146.755197] x14: ffff20be3f95cef6 x13: 2e6e69616d6f642d x12: 6465686373204c4c
[  146.755578] x11: ffff20bf7fc83a00 x10: 0000000000000040 x9 : 0000000000000000
[  146.755957] x8 : 0000000000000002 x7 : ffffffffe0000000 x6 : 0000000000000002
[  146.756334] x5 : 0000000090000000 x4 : 00000000f0000000 x3 : 0000000000000001
[  146.756705] x2 : 0000000000000080 x1 : ffff800012f03860 x0 : 0000000000000001
[  146.757070] Call trace:
[  146.757421]  build_sched_domains+0x81c/0x11b0
[  146.757771]  partition_sched_domains_locked+0x57c/0x978
[  146.758118]  rebuild_sched_domains_locked+0x44c/0x7f0
[  146.758460]  rebuild_sched_domains+0x2c/0x48
[  146.758791]  cpuset_hotplug_workfn+0x3fc/0x888
[  146.759114]  process_one_work+0x1f4/0x480
[  146.759429]  worker_thread+0x48/0x460
[  146.759734]  kthread+0x158/0x168
[  146.760030]  ret_from_fork+0x10/0x20
[  146.760318] ---[ end trace 82c44aad6900e81a ]---

For some architectures like risc-v and arm64 which use common code
clear_cpu_topology() in shutting down CPUx, When CONFIG_SCHED_CLUSTER
is set, cluster_sibling in cpu_topology of each sibling adjacent
to CPUx is missed clearing, this causes checking failed in
topology_span_sane() and rebuilding topology failure at end when CPU online.

Different sibling's cluster_sibling in cpu_topology[] when CPU92 offline
(CPU 92, 93, 94, 95 are in one cluster):

Before revision:
CPU                 [92]      [93]      [94]      [95]
cluster_sibling     [92]     [92-95]   [92-95]   [92-95]

After revision:
CPU                 [92]      [93]      [94]      [95]
cluster_sibling     [92]     [93-95]   [93-95]   [93-95]

Signed-off-by: Wang ShaoBo <bobo.shaobowang@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Acked-by: Barry Song <song.bao.hua@hisilicon.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://lore.kernel.org/r/20211110095856.469360-1-bobo.shaobowang@huawei.com
2021-11-11 13:09:33 +01:00

708 lines
17 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Arch specific cpu topology information
*
* Copyright (C) 2016, ARM Ltd.
* Written by: Juri Lelli, ARM Ltd.
*/
#include <linux/acpi.h>
#include <linux/cpu.h>
#include <linux/cpufreq.h>
#include <linux/device.h>
#include <linux/of.h>
#include <linux/slab.h>
#include <linux/sched/topology.h>
#include <linux/cpuset.h>
#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data);
static struct cpumask scale_freq_counters_mask;
static bool scale_freq_invariant;
static bool supports_scale_freq_counters(const struct cpumask *cpus)
{
return cpumask_subset(cpus, &scale_freq_counters_mask);
}
bool topology_scale_freq_invariant(void)
{
return cpufreq_supports_freq_invariance() ||
supports_scale_freq_counters(cpu_online_mask);
}
static void update_scale_freq_invariant(bool status)
{
if (scale_freq_invariant == status)
return;
/*
* Task scheduler behavior depends on frequency invariance support,
* either cpufreq or counter driven. If the support status changes as
* a result of counter initialisation and use, retrigger the build of
* scheduling domains to ensure the information is propagated properly.
*/
if (topology_scale_freq_invariant() == status) {
scale_freq_invariant = status;
rebuild_sched_domains_energy();
}
}
void topology_set_scale_freq_source(struct scale_freq_data *data,
const struct cpumask *cpus)
{
struct scale_freq_data *sfd;
int cpu;
/*
* Avoid calling rebuild_sched_domains() unnecessarily if FIE is
* supported by cpufreq.
*/
if (cpumask_empty(&scale_freq_counters_mask))
scale_freq_invariant = topology_scale_freq_invariant();
rcu_read_lock();
for_each_cpu(cpu, cpus) {
sfd = rcu_dereference(*per_cpu_ptr(&sft_data, cpu));
/* Use ARCH provided counters whenever possible */
if (!sfd || sfd->source != SCALE_FREQ_SOURCE_ARCH) {
rcu_assign_pointer(per_cpu(sft_data, cpu), data);
cpumask_set_cpu(cpu, &scale_freq_counters_mask);
}
}
rcu_read_unlock();
update_scale_freq_invariant(true);
}
EXPORT_SYMBOL_GPL(topology_set_scale_freq_source);
void topology_clear_scale_freq_source(enum scale_freq_source source,
const struct cpumask *cpus)
{
struct scale_freq_data *sfd;
int cpu;
rcu_read_lock();
for_each_cpu(cpu, cpus) {
sfd = rcu_dereference(*per_cpu_ptr(&sft_data, cpu));
if (sfd && sfd->source == source) {
rcu_assign_pointer(per_cpu(sft_data, cpu), NULL);
cpumask_clear_cpu(cpu, &scale_freq_counters_mask);
}
}
rcu_read_unlock();
/*
* Make sure all references to previous sft_data are dropped to avoid
* use-after-free races.
*/
synchronize_rcu();
update_scale_freq_invariant(false);
}
EXPORT_SYMBOL_GPL(topology_clear_scale_freq_source);
void topology_scale_freq_tick(void)
{
struct scale_freq_data *sfd = rcu_dereference_sched(*this_cpu_ptr(&sft_data));
if (sfd)
sfd->set_freq_scale();
}
DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);
void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq,
unsigned long max_freq)
{
unsigned long scale;
int i;
if (WARN_ON_ONCE(!cur_freq || !max_freq))
return;
/*
* If the use of counters for FIE is enabled, just return as we don't
* want to update the scale factor with information from CPUFREQ.
* Instead the scale factor will be updated from arch_scale_freq_tick.
*/
if (supports_scale_freq_counters(cpus))
return;
scale = (cur_freq << SCHED_CAPACITY_SHIFT) / max_freq;
for_each_cpu(i, cpus)
per_cpu(arch_freq_scale, i) = scale;
}
DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
EXPORT_PER_CPU_SYMBOL_GPL(cpu_scale);
void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity)
{
per_cpu(cpu_scale, cpu) = capacity;
}
DEFINE_PER_CPU(unsigned long, thermal_pressure);
void topology_set_thermal_pressure(const struct cpumask *cpus,
unsigned long th_pressure)
{
int cpu;
for_each_cpu(cpu, cpus)
WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure);
}
EXPORT_SYMBOL_GPL(topology_set_thermal_pressure);
static ssize_t cpu_capacity_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
struct cpu *cpu = container_of(dev, struct cpu, dev);
return sysfs_emit(buf, "%lu\n", topology_get_cpu_scale(cpu->dev.id));
}
static void update_topology_flags_workfn(struct work_struct *work);
static DECLARE_WORK(update_topology_flags_work, update_topology_flags_workfn);
static DEVICE_ATTR_RO(cpu_capacity);
static int register_cpu_capacity_sysctl(void)
{
int i;
struct device *cpu;
for_each_possible_cpu(i) {
cpu = get_cpu_device(i);
if (!cpu) {
pr_err("%s: too early to get CPU%d device!\n",
__func__, i);
continue;
}
device_create_file(cpu, &dev_attr_cpu_capacity);
}
return 0;
}
subsys_initcall(register_cpu_capacity_sysctl);
static int update_topology;
int topology_update_cpu_topology(void)
{
return update_topology;
}
/*
* Updating the sched_domains can't be done directly from cpufreq callbacks
* due to locking, so queue the work for later.
*/
static void update_topology_flags_workfn(struct work_struct *work)
{
update_topology = 1;
rebuild_sched_domains();
pr_debug("sched_domain hierarchy rebuilt, flags updated\n");
update_topology = 0;
}
static DEFINE_PER_CPU(u32, freq_factor) = 1;
static u32 *raw_capacity;
static int free_raw_capacity(void)
{
kfree(raw_capacity);
raw_capacity = NULL;
return 0;
}
void topology_normalize_cpu_scale(void)
{
u64 capacity;
u64 capacity_scale;
int cpu;
if (!raw_capacity)
return;
capacity_scale = 1;
for_each_possible_cpu(cpu) {
capacity = raw_capacity[cpu] * per_cpu(freq_factor, cpu);
capacity_scale = max(capacity, capacity_scale);
}
pr_debug("cpu_capacity: capacity_scale=%llu\n", capacity_scale);
for_each_possible_cpu(cpu) {
capacity = raw_capacity[cpu] * per_cpu(freq_factor, cpu);
capacity = div64_u64(capacity << SCHED_CAPACITY_SHIFT,
capacity_scale);
topology_set_cpu_scale(cpu, capacity);
pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n",
cpu, topology_get_cpu_scale(cpu));
}
}
bool __init topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu)
{
struct clk *cpu_clk;
static bool cap_parsing_failed;
int ret;
u32 cpu_capacity;
if (cap_parsing_failed)
return false;
ret = of_property_read_u32(cpu_node, "capacity-dmips-mhz",
&cpu_capacity);
if (!ret) {
if (!raw_capacity) {
raw_capacity = kcalloc(num_possible_cpus(),
sizeof(*raw_capacity),
GFP_KERNEL);
if (!raw_capacity) {
cap_parsing_failed = true;
return false;
}
}
raw_capacity[cpu] = cpu_capacity;
pr_debug("cpu_capacity: %pOF cpu_capacity=%u (raw)\n",
cpu_node, raw_capacity[cpu]);
/*
* Update freq_factor for calculating early boot cpu capacities.
* For non-clk CPU DVFS mechanism, there's no way to get the
* frequency value now, assuming they are running at the same
* frequency (by keeping the initial freq_factor value).
*/
cpu_clk = of_clk_get(cpu_node, 0);
if (!PTR_ERR_OR_ZERO(cpu_clk)) {
per_cpu(freq_factor, cpu) =
clk_get_rate(cpu_clk) / 1000;
clk_put(cpu_clk);
}
} else {
if (raw_capacity) {
pr_err("cpu_capacity: missing %pOF raw capacity\n",
cpu_node);
pr_err("cpu_capacity: partial information: fallback to 1024 for all CPUs\n");
}
cap_parsing_failed = true;
free_raw_capacity();
}
return !ret;
}
#ifdef CONFIG_CPU_FREQ
static cpumask_var_t cpus_to_visit;
static void parsing_done_workfn(struct work_struct *work);
static DECLARE_WORK(parsing_done_work, parsing_done_workfn);
static int
init_cpu_capacity_callback(struct notifier_block *nb,
unsigned long val,
void *data)
{
struct cpufreq_policy *policy = data;
int cpu;
if (!raw_capacity)
return 0;
if (val != CPUFREQ_CREATE_POLICY)
return 0;
pr_debug("cpu_capacity: init cpu capacity for CPUs [%*pbl] (to_visit=%*pbl)\n",
cpumask_pr_args(policy->related_cpus),
cpumask_pr_args(cpus_to_visit));
cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->related_cpus);
for_each_cpu(cpu, policy->related_cpus)
per_cpu(freq_factor, cpu) = policy->cpuinfo.max_freq / 1000;
if (cpumask_empty(cpus_to_visit)) {
topology_normalize_cpu_scale();
schedule_work(&update_topology_flags_work);
free_raw_capacity();
pr_debug("cpu_capacity: parsing done\n");
schedule_work(&parsing_done_work);
}
return 0;
}
static struct notifier_block init_cpu_capacity_notifier = {
.notifier_call = init_cpu_capacity_callback,
};
static int __init register_cpufreq_notifier(void)
{
int ret;
/*
* on ACPI-based systems we need to use the default cpu capacity
* until we have the necessary code to parse the cpu capacity, so
* skip registering cpufreq notifier.
*/
if (!acpi_disabled || !raw_capacity)
return -EINVAL;
if (!alloc_cpumask_var(&cpus_to_visit, GFP_KERNEL))
return -ENOMEM;
cpumask_copy(cpus_to_visit, cpu_possible_mask);
ret = cpufreq_register_notifier(&init_cpu_capacity_notifier,
CPUFREQ_POLICY_NOTIFIER);
if (ret)
free_cpumask_var(cpus_to_visit);
return ret;
}
core_initcall(register_cpufreq_notifier);
static void parsing_done_workfn(struct work_struct *work)
{
cpufreq_unregister_notifier(&init_cpu_capacity_notifier,
CPUFREQ_POLICY_NOTIFIER);
free_cpumask_var(cpus_to_visit);
}
#else
core_initcall(free_raw_capacity);
#endif
#if defined(CONFIG_ARM64) || defined(CONFIG_RISCV)
/*
* This function returns the logic cpu number of the node.
* There are basically three kinds of return values:
* (1) logic cpu number which is > 0.
* (2) -ENODEV when the device tree(DT) node is valid and found in the DT but
* there is no possible logical CPU in the kernel to match. This happens
* when CONFIG_NR_CPUS is configure to be smaller than the number of
* CPU nodes in DT. We need to just ignore this case.
* (3) -1 if the node does not exist in the device tree
*/
static int __init get_cpu_for_node(struct device_node *node)
{
struct device_node *cpu_node;
int cpu;
cpu_node = of_parse_phandle(node, "cpu", 0);
if (!cpu_node)
return -1;
cpu = of_cpu_node_to_id(cpu_node);
if (cpu >= 0)
topology_parse_cpu_capacity(cpu_node, cpu);
else
pr_info("CPU node for %pOF exist but the possible cpu range is :%*pbl\n",
cpu_node, cpumask_pr_args(cpu_possible_mask));
of_node_put(cpu_node);
return cpu;
}
static int __init parse_core(struct device_node *core, int package_id,
int core_id)
{
char name[20];
bool leaf = true;
int i = 0;
int cpu;
struct device_node *t;
do {
snprintf(name, sizeof(name), "thread%d", i);
t = of_get_child_by_name(core, name);
if (t) {
leaf = false;
cpu = get_cpu_for_node(t);
if (cpu >= 0) {
cpu_topology[cpu].package_id = package_id;
cpu_topology[cpu].core_id = core_id;
cpu_topology[cpu].thread_id = i;
} else if (cpu != -ENODEV) {
pr_err("%pOF: Can't get CPU for thread\n", t);
of_node_put(t);
return -EINVAL;
}
of_node_put(t);
}
i++;
} while (t);
cpu = get_cpu_for_node(core);
if (cpu >= 0) {
if (!leaf) {
pr_err("%pOF: Core has both threads and CPU\n",
core);
return -EINVAL;
}
cpu_topology[cpu].package_id = package_id;
cpu_topology[cpu].core_id = core_id;
} else if (leaf && cpu != -ENODEV) {
pr_err("%pOF: Can't get CPU for leaf core\n", core);
return -EINVAL;
}
return 0;
}
static int __init parse_cluster(struct device_node *cluster, int depth)
{
char name[20];
bool leaf = true;
bool has_cores = false;
struct device_node *c;
static int package_id __initdata;
int core_id = 0;
int i, ret;
/*
* First check for child clusters; we currently ignore any
* information about the nesting of clusters and present the
* scheduler with a flat list of them.
*/
i = 0;
do {
snprintf(name, sizeof(name), "cluster%d", i);
c = of_get_child_by_name(cluster, name);
if (c) {
leaf = false;
ret = parse_cluster(c, depth + 1);
of_node_put(c);
if (ret != 0)
return ret;
}
i++;
} while (c);
/* Now check for cores */
i = 0;
do {
snprintf(name, sizeof(name), "core%d", i);
c = of_get_child_by_name(cluster, name);
if (c) {
has_cores = true;
if (depth == 0) {
pr_err("%pOF: cpu-map children should be clusters\n",
c);
of_node_put(c);
return -EINVAL;
}
if (leaf) {
ret = parse_core(c, package_id, core_id++);
} else {
pr_err("%pOF: Non-leaf cluster with core %s\n",
cluster, name);
ret = -EINVAL;
}
of_node_put(c);
if (ret != 0)
return ret;
}
i++;
} while (c);
if (leaf && !has_cores)
pr_warn("%pOF: empty cluster\n", cluster);
if (leaf)
package_id++;
return 0;
}
static int __init parse_dt_topology(void)
{
struct device_node *cn, *map;
int ret = 0;
int cpu;
cn = of_find_node_by_path("/cpus");
if (!cn) {
pr_err("No CPU information found in DT\n");
return 0;
}
/*
* When topology is provided cpu-map is essentially a root
* cluster with restricted subnodes.
*/
map = of_get_child_by_name(cn, "cpu-map");
if (!map)
goto out;
ret = parse_cluster(map, 0);
if (ret != 0)
goto out_map;
topology_normalize_cpu_scale();
/*
* Check that all cores are in the topology; the SMP code will
* only mark cores described in the DT as possible.
*/
for_each_possible_cpu(cpu)
if (cpu_topology[cpu].package_id == -1)
ret = -EINVAL;
out_map:
of_node_put(map);
out:
of_node_put(cn);
return ret;
}
#endif
/*
* cpu topology table
*/
struct cpu_topology cpu_topology[NR_CPUS];
EXPORT_SYMBOL_GPL(cpu_topology);
const struct cpumask *cpu_coregroup_mask(int cpu)
{
const cpumask_t *core_mask = cpumask_of_node(cpu_to_node(cpu));
/* Find the smaller of NUMA, core or LLC siblings */
if (cpumask_subset(&cpu_topology[cpu].core_sibling, core_mask)) {
/* not numa in package, lets use the package siblings */
core_mask = &cpu_topology[cpu].core_sibling;
}
if (cpu_topology[cpu].llc_id != -1) {
if (cpumask_subset(&cpu_topology[cpu].llc_sibling, core_mask))
core_mask = &cpu_topology[cpu].llc_sibling;
}
return core_mask;
}
const struct cpumask *cpu_clustergroup_mask(int cpu)
{
return &cpu_topology[cpu].cluster_sibling;
}
void update_siblings_masks(unsigned int cpuid)
{
struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
int cpu;
/* update core and thread sibling masks */
for_each_online_cpu(cpu) {
cpu_topo = &cpu_topology[cpu];
if (cpuid_topo->llc_id == cpu_topo->llc_id) {
cpumask_set_cpu(cpu, &cpuid_topo->llc_sibling);
cpumask_set_cpu(cpuid, &cpu_topo->llc_sibling);
}
if (cpuid_topo->package_id != cpu_topo->package_id)
continue;
if (cpuid_topo->cluster_id == cpu_topo->cluster_id &&
cpuid_topo->cluster_id != -1) {
cpumask_set_cpu(cpu, &cpuid_topo->cluster_sibling);
cpumask_set_cpu(cpuid, &cpu_topo->cluster_sibling);
}
cpumask_set_cpu(cpuid, &cpu_topo->core_sibling);
cpumask_set_cpu(cpu, &cpuid_topo->core_sibling);
if (cpuid_topo->core_id != cpu_topo->core_id)
continue;
cpumask_set_cpu(cpuid, &cpu_topo->thread_sibling);
cpumask_set_cpu(cpu, &cpuid_topo->thread_sibling);
}
}
static void clear_cpu_topology(int cpu)
{
struct cpu_topology *cpu_topo = &cpu_topology[cpu];
cpumask_clear(&cpu_topo->llc_sibling);
cpumask_set_cpu(cpu, &cpu_topo->llc_sibling);
cpumask_clear(&cpu_topo->cluster_sibling);
cpumask_set_cpu(cpu, &cpu_topo->cluster_sibling);
cpumask_clear(&cpu_topo->core_sibling);
cpumask_set_cpu(cpu, &cpu_topo->core_sibling);
cpumask_clear(&cpu_topo->thread_sibling);
cpumask_set_cpu(cpu, &cpu_topo->thread_sibling);
}
void __init reset_cpu_topology(void)
{
unsigned int cpu;
for_each_possible_cpu(cpu) {
struct cpu_topology *cpu_topo = &cpu_topology[cpu];
cpu_topo->thread_id = -1;
cpu_topo->core_id = -1;
cpu_topo->cluster_id = -1;
cpu_topo->package_id = -1;
cpu_topo->llc_id = -1;
clear_cpu_topology(cpu);
}
}
void remove_cpu_topology(unsigned int cpu)
{
int sibling;
for_each_cpu(sibling, topology_core_cpumask(cpu))
cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
for_each_cpu(sibling, topology_sibling_cpumask(cpu))
cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
for_each_cpu(sibling, topology_cluster_cpumask(cpu))
cpumask_clear_cpu(cpu, topology_cluster_cpumask(sibling));
for_each_cpu(sibling, topology_llc_cpumask(cpu))
cpumask_clear_cpu(cpu, topology_llc_cpumask(sibling));
clear_cpu_topology(cpu);
}
__weak int __init parse_acpi_topology(void)
{
return 0;
}
#if defined(CONFIG_ARM64) || defined(CONFIG_RISCV)
void __init init_cpu_topology(void)
{
reset_cpu_topology();
/*
* Discard anything that was parsed if we hit an error so we
* don't use partial information.
*/
if (parse_acpi_topology())
reset_cpu_topology();
else if (of_have_populated_dt() && parse_dt_topology())
reset_cpu_topology();
}
#endif