From 8a3decac087aa897df5af04358c2089e52e70ac4 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Tue, 18 Aug 2020 22:24:25 +0800 Subject: [PATCH 01/13] ACPI: Add out of bounds and numa_off protections to pxm_to_node() The function should check the validity of the pxm value before using it to index the pxm_to_node_map[] array. Whilst hardening this code may be good in general, the main intent here is to enable following patches that use this function to replace acpi_map_pxm_to_node() for non SRAT usecases which should return NO_NUMA_NODE for PXM entries not matching with those in SRAT. Signed-off-by: Jonathan Cameron Reviewed-by: Barry Song Reviewed-by: Hanjun Guo Signed-off-by: Rafael J. Wysocki --- drivers/acpi/numa/srat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index 15bbaab8500b..1fb486f46ee2 100644 --- a/drivers/acpi/numa/srat.c +++ b/drivers/acpi/numa/srat.c @@ -31,7 +31,7 @@ int acpi_numa __initdata; int pxm_to_node(int pxm) { - if (pxm < 0) + if (pxm < 0 || pxm >= MAX_PXM_DOMAINS || numa_off) return NUMA_NO_NODE; return pxm_to_node_map[pxm]; } From 01feba590cd610780c463aa3d498200ba4503703 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Tue, 18 Aug 2020 22:24:26 +0800 Subject: [PATCH 02/13] ACPI: Do not create new NUMA domains from ACPI static tables that are not SRAT Several ACPI static tables contain references to proximity domains. ACPI 6.3 has clarified that only entries in SRAT may define a new domain (sec 5.2.16). Those tables described in the ACPI spec have additional clarifying text. NFIT: Table 5-132, "Integer that represents the proximity domain to which the memory belongs. This number must match with corresponding entry in the SRAT table." HMAT: Table 5-145, "... This number must match with the corresponding entry in the SRAT table's processor affinity structure ... if the initiator is a processor, or the Generic Initiator Affinity Structure if the initiator is a generic initiator". IORT and DMAR are defined by external specifications. Intel Virtualization Technology for Directed I/O Rev 3.1 does not make any explicit statements, but the general SRAT statement above will still apply. https://software.intel.com/sites/default/files/managed/c5/15/vt-directed-io-spec.pdf IO Remapping Table, Platform Design Document rev D, also makes not explicit statement, but refers to ACPI SRAT table for more information and again the generic SRAT statement above applies. https://developer.arm.com/documentation/den0049/d/ In conclusion, any proximity domain specified in these tables, should be a reference to a proximity domain also found in SRAT, and they should not be able to instantiate a new domain. Hence we switch to pxm_to_node() which will only return existing nodes. Signed-off-by: Jonathan Cameron Reviewed-by: Barry Song Reviewed-by: Hanjun Guo Signed-off-by: Rafael J. Wysocki --- drivers/acpi/arm64/iort.c | 2 +- drivers/acpi/nfit/core.c | 3 +-- drivers/acpi/numa/hmat.c | 2 +- drivers/iommu/intel/dmar.c | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index ec782e4a0fe4..26005a15cb8b 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -1335,7 +1335,7 @@ static int __init arm_smmu_v3_set_proximity(struct device *dev, smmu = (struct acpi_iort_smmu_v3 *)node->node_data; if (smmu->flags & ACPI_IORT_SMMU_V3_PXM_VALID) { - int dev_node = acpi_map_pxm_to_node(smmu->pxm); + int dev_node = pxm_to_node(smmu->pxm); if (dev_node != NUMA_NO_NODE && !node_online(dev_node)) return -EINVAL; diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 26dd208a0d63..ea0557cb54f7 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -3008,8 +3008,7 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, if (spa->flags & ACPI_NFIT_PROXIMITY_VALID) { ndr_desc->numa_node = acpi_map_pxm_to_online_node( spa->proximity_domain); - ndr_desc->target_node = acpi_map_pxm_to_node( - spa->proximity_domain); + ndr_desc->target_node = pxm_to_node(spa->proximity_domain); } else { ndr_desc->numa_node = NUMA_NO_NODE; ndr_desc->target_node = NUMA_NO_NODE; diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 2c32cfb72370..cf6df2df26cd 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -666,7 +666,7 @@ static void hmat_register_target_device(struct memory_target *target, pdev->dev.numa_node = acpi_map_pxm_to_online_node(target->memory_pxm); info = (struct memregion_info) { - .target_node = acpi_map_pxm_to_node(target->memory_pxm), + .target_node = pxm_to_node(target->memory_pxm), }; rc = platform_device_add_data(pdev, &info, sizeof(info)); if (rc < 0) { diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 93e6345f3414..2f3badd41e1b 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -473,7 +473,7 @@ static int dmar_parse_one_rhsa(struct acpi_dmar_header *header, void *arg) rhsa = (struct acpi_dmar_rhsa *)header; for_each_drhd_unit(drhd) { if (drhd->reg_base_addr == rhsa->base_address) { - int node = acpi_map_pxm_to_node(rhsa->proximity_domain); + int node = pxm_to_node(rhsa->proximity_domain); if (!node_online(node)) node = NUMA_NO_NODE; From fe205d984e7730f4d21f6f8ebc60f0698404ac31 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Tue, 18 Aug 2020 22:24:27 +0800 Subject: [PATCH 03/13] ACPI: Remove side effect of partly creating a node in acpi_map_pxm_to_online_node() While this function will only return an online node, it can have the side effect of partially creating a new node. The existing comments suggest this is intentional, but the usecases of this function are related to NFIT and HMAT parsing, neither of which should be able to define new nodes. One route by which the existing behaviour would cause a crash is to have a _PXM entry in ACPI DSDT attempt to place a device within this partly created proximity domain. A subsequent call to devm_kzalloc() or similar would result in an attempt to allocate memory on a node for which zone lists have not been set up and a NULL pointer dereference. Prevent such cases by switching to pxm_to_node() within acpi_map_pxm_to_online_node() which cannot cause a new node to be partly created. If one would previously have been created we now return NO_NUMA_NODE. Documentation updated to reflect this change. Signed-off-by: Jonathan Cameron Reviewed-by: Hanjun Guo Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 1e4cdc6c7ae2..a9fd122ae878 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -430,13 +430,12 @@ int acpi_get_node(acpi_handle handle); * ACPI device drivers, which are called after the NUMA initialization has * completed in the kernel, can call this interface to obtain their device * NUMA topology from ACPI tables. Such drivers do not have to deal with - * offline nodes. A node may be offline when a device proximity ID is - * unique, SRAT memory entry does not exist, or NUMA is disabled, ex. - * "numa=off" on x86. + * offline nodes. A node may be offline when SRAT memory entry does not exist, + * or NUMA is disabled, ex. "numa=off" on x86. */ static inline int acpi_map_pxm_to_online_node(int pxm) { - int node = acpi_map_pxm_to_node(pxm); + int node = pxm_to_node(pxm); return numa_map_to_online_node(node); } From 4eb3723f18e9ba4d4b13d82b6f7e68dd50a852ea Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Tue, 18 Aug 2020 22:24:28 +0800 Subject: [PATCH 04/13] ACPI: Rename acpi_map_pxm_to_online_node() to pxm_to_online_node() As this function is no longer allowed to create new mappings let us rename it to reflect this. Note all nodes should already exist before any of the users of this function are called. Signed-off-by: Jonathan Cameron Reviewed-by: Hanjun Guo Signed-off-by: Rafael J. Wysocki --- drivers/acpi/nfit/core.c | 3 +-- drivers/acpi/numa/hmat.c | 2 +- include/linux/acpi.h | 8 ++++---- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index ea0557cb54f7..8df2f16d1fdf 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -3006,8 +3006,7 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, ndr_desc->provider_data = nfit_spa; ndr_desc->attr_groups = acpi_nfit_region_attribute_groups; if (spa->flags & ACPI_NFIT_PROXIMITY_VALID) { - ndr_desc->numa_node = acpi_map_pxm_to_online_node( - spa->proximity_domain); + ndr_desc->numa_node = pxm_to_online_node(spa->proximity_domain); ndr_desc->target_node = pxm_to_node(spa->proximity_domain); } else { ndr_desc->numa_node = NUMA_NO_NODE; diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index cf6df2df26cd..e7add2609c03 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -664,7 +664,7 @@ static void hmat_register_target_device(struct memory_target *target, goto out_pdev; } - pdev->dev.numa_node = acpi_map_pxm_to_online_node(target->memory_pxm); + pdev->dev.numa_node = pxm_to_online_node(target->memory_pxm); info = (struct memregion_info) { .target_node = pxm_to_node(target->memory_pxm), }; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index a9fd122ae878..e9f6cd67943e 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -420,10 +420,10 @@ int acpi_map_pxm_to_node(int pxm); int acpi_get_node(acpi_handle handle); /** - * acpi_map_pxm_to_online_node - Map proximity ID to online node + * pxm_to_online_node - Map proximity ID to online node * @pxm: ACPI proximity ID * - * This is similar to acpi_map_pxm_to_node(), but always returns an online + * This is similar to pxm_to_node(), but always returns an online * node. When the mapped node from a given proximity ID is offline, it * looks up the node distance table and returns the nearest online node. * @@ -433,14 +433,14 @@ int acpi_get_node(acpi_handle handle); * offline nodes. A node may be offline when SRAT memory entry does not exist, * or NUMA is disabled, ex. "numa=off" on x86. */ -static inline int acpi_map_pxm_to_online_node(int pxm) +static inline int pxm_to_online_node(int pxm) { int node = pxm_to_node(pxm); return numa_map_to_online_node(node); } #else -static inline int acpi_map_pxm_to_online_node(int pxm) +static inline int pxm_to_online_node(int pxm) { return 0; } From a62d07e0006a3a3ce77041ca07f3c488ec880790 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Tue, 18 Aug 2020 22:24:29 +0800 Subject: [PATCH 05/13] ACPI: Remove side effect of partly creating a node in acpi_get_node() acpi_get_node() calls acpi_get_pxm() to evaluate the _PXM AML method for entries found in DSDT/SSDT. ACPI 6.3 sec 6.2.14 states "_PXM evaluates to an integer that identifies a device as belonging to a Proximity Domain defined in the System Resource Affinity Table (SRAT)." Hence a _PXM method should not result in creation of a new NUMA node. Before this patch, _PXM could result in partial instantiation of NUMA node, missing elements such as zone lists. A call to devm_kzalloc(), for example, results in a NULL pointer dereference. This patch therefore replaces the acpi_map_pxm_to_node() with a call to pxm_to_node(). Signed-off-by: Jonathan Cameron Reviewed-by: Hanjun Guo Reviewed-by: Barry Song Signed-off-by: Rafael J. Wysocki --- drivers/acpi/numa/srat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index 1fb486f46ee2..2c9a66c203ff 100644 --- a/drivers/acpi/numa/srat.c +++ b/drivers/acpi/numa/srat.c @@ -436,6 +436,6 @@ int acpi_get_node(acpi_handle handle) pxm = acpi_get_pxm(handle); - return acpi_map_pxm_to_node(pxm); + return pxm_to_node(pxm); } EXPORT_SYMBOL(acpi_get_node); From 95ac5bf4e471ab1f37bca522e5ea4e474fbcec31 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Tue, 18 Aug 2020 22:24:30 +0800 Subject: [PATCH 06/13] irq-chip/gic-v3-its: Fix crash if ITS is in a proximity domain without processor or memory Note this crash is present before any of the patches in this series, but as explained below it is highly unlikely anyone is shipping a firmware that causes it. Tests were done using an overriden SRAT. On ARM64, the gic-v3 driver directly parses SRAT to locate GIC Interrupt Translation Service (ITS) Affinity Structures. This is done much later in the boot than the parses of SRAT which identify proximity domains. As a result, an ITS placed in a proximity domain that is not defined by another SRAT structure will result in a NUMA node that is not completely configured and a crash. ITS [mem 0x202100000-0x20211ffff] ITS@0x0000000202100000: Using ITS number 0 Unable to handle kernel paging request at virtual address 0000000000001a08 ... Call trace: __alloc_pages_nodemask+0xe8/0x338 alloc_pages_node.constprop.0+0x34/0x40 its_probe_one+0x2f8/0xb18 gic_acpi_parse_madt_its+0x108/0x150 acpi_table_parse_entries_array+0x17c/0x264 acpi_table_parse_entries+0x48/0x6c acpi_table_parse_madt+0x30/0x3c its_init+0x1c4/0x644 gic_init_bases+0x4b8/0x4ec gic_acpi_init+0x134/0x264 acpi_match_madt+0x4c/0x84 acpi_table_parse_entries_array+0x17c/0x264 acpi_table_parse_entries+0x48/0x6c acpi_table_parse_madt+0x30/0x3c __acpi_probe_device_table+0x8c/0xe8 irqchip_init+0x3c/0x48 init_IRQ+0xcc/0x100 start_kernel+0x33c/0x548 ACPI 6.3 allows any set of Affinity Structures in SRAT to define a proximity domain. However, as we do not see this crash, we can conclude that no firmware is currently placing an ITS in a node that is separate from those containing memory and / or processors. We could modify the SRAT parsing behavior to identify the existence of Proximity Domains unique to the ITS structures, and handle them as a special case of a generic initiator (once support for those merges). This patch avoids the complexity that would be needed to handle this corner case, by not allowing the ITS entry parsing code to instantiate new NUMA Nodes. If one is encountered that does not already exist, then NO_NUMA_NODE is assigned and a warning printed just as if the value had been greater than allowed NUMA Nodes. "SRAT: Invalid NUMA node -1 in ITS affinity" Whilst this does not provide the full flexibility allowed by ACPI, it does fix the problem. We can revisit a more sophisticated solution if needed by future platforms. Change is simply to replace acpi_map_pxm_to_node with pxm_to_node reflecting the fact a new mapping is not created. Signed-off-by: Jonathan Cameron Reviewed-by: Hanjun Guo Signed-off-by: Rafael J. Wysocki --- drivers/irqchip/irq-gic-v3-its.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 548de7538632..c8ffec5d9b8a 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -5263,7 +5263,12 @@ static int __init gic_acpi_parse_srat_its(union acpi_subtable_headers *header, return -EINVAL; } - node = acpi_map_pxm_to_node(its_affinity->proximity_domain); + /* + * Note that in theory a new proximity node could be created by this + * entry as it is an SRAT resource allocation structure. + * We do not currently support doing so. + */ + node = pxm_to_node(its_affinity->proximity_domain); if (node == NUMA_NO_NODE || node >= MAX_NUMNODES) { pr_err("SRAT: Invalid NUMA node %d in ITS affinity\n", node); From 4849bc777049b568a35d5d63ae326e93f0fff9de Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 28 Sep 2020 12:45:55 -0700 Subject: [PATCH 07/13] ACPI / NUMA: Add stub function for pxm_to_node() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After commit 01feba590cd6 ("ACPI: Do not create new NUMA domains from ACPI static tables that are not SRAT"): $ scripts/config --file arch/x86/configs/x86_64_defconfig -d NUMA -e ACPI_NFIT $ make -skj"$(nproc)" distclean defconfig drivers/acpi/nfit/ drivers/acpi/nfit/core.c: In function ‘acpi_nfit_register_region’: drivers/acpi/nfit/core.c:3010:27: error: implicit declaration of function ‘pxm_to_node’; did you mean ‘xa_to_node’? [-Werror=implicit-function-declaration] 3010 | ndr_desc->target_node = pxm_to_node(spa->proximity_domain); | ^~~~~~~~~~~ | xa_to_node cc1: some warnings being treated as errors ... Add a stub function like acpi_map_pxm_to_node() had so that the build continues to work. Fixes: 01feba590cd6 ("ACPI: Do not create new NUMA domains from ACPI static tables that are not SRAT") Signed-off-by: Nathan Chancellor Acked-by: Randy Dunlap # build-tested Acked-by: Jonathan Cameron Reviewed-by: Hanjun Guo Signed-off-by: Rafael J. Wysocki --- include/acpi/acpi_numa.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/acpi/acpi_numa.h b/include/acpi/acpi_numa.h index fdebcfc6c8df..09eb3bc20ff5 100644 --- a/include/acpi/acpi_numa.h +++ b/include/acpi/acpi_numa.h @@ -22,5 +22,10 @@ extern int acpi_numa __initdata; extern void bad_srat(void); extern int srat_disabled(void); +#else /* CONFIG_ACPI_NUMA */ +static inline int pxm_to_node(int pxm) +{ + return 0; +} #endif /* CONFIG_ACPI_NUMA */ #endif /* __ACP_NUMA_H */ From 894c26a1c274b8eafbb4b1dad67e70e51a106061 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Wed, 30 Sep 2020 22:05:42 +0800 Subject: [PATCH 08/13] ACPI: Support Generic Initiator only domains Generic Initiators are a new ACPI concept that allows for the description of proximity domains that contain a device which performs memory access (such as a network card) but neither host CPU nor Memory. This patch has the parsing code and provides the infrastructure for an architecture to associate these new domains with their nearest memory processing node. Signed-off-by: Jonathan Cameron Signed-off-by: Rafael J. Wysocki --- drivers/acpi/numa/srat.c | 69 +++++++++++++++++++++++++++++++++++++++- drivers/base/node.c | 3 ++ include/linux/nodemask.h | 1 + 3 files changed, 72 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index 2c9a66c203ff..979e76b94e54 100644 --- a/drivers/acpi/numa/srat.c +++ b/drivers/acpi/numa/srat.c @@ -130,6 +130,36 @@ acpi_table_print_srat_entry(struct acpi_subtable_header *header) } break; + case ACPI_SRAT_TYPE_GENERIC_AFFINITY: + { + struct acpi_srat_generic_affinity *p = + (struct acpi_srat_generic_affinity *)header; + + if (p->device_handle_type == 0) { + /* + * For pci devices this may be the only place they + * are assigned a proximity domain + */ + pr_debug("SRAT Generic Initiator(Seg:%u BDF:%u) in proximity domain %d %s\n", + *(u16 *)(&p->device_handle[0]), + *(u16 *)(&p->device_handle[2]), + p->proximity_domain, + (p->flags & ACPI_SRAT_GENERIC_AFFINITY_ENABLED) ? + "enabled" : "disabled"); + } else { + /* + * In this case we can rely on the device having a + * proximity domain reference + */ + pr_debug("SRAT Generic Initiator(HID=%.8s UID=%.4s) in proximity domain %d %s\n", + (char *)(&p->device_handle[0]), + (char *)(&p->device_handle[8]), + p->proximity_domain, + (p->flags & ACPI_SRAT_GENERIC_AFFINITY_ENABLED) ? + "enabled" : "disabled"); + } + } + break; default: pr_warn("Found unsupported SRAT entry (type = 0x%x)\n", header->type); @@ -332,6 +362,41 @@ acpi_parse_gicc_affinity(union acpi_subtable_headers *header, return 0; } +#if defined(CONFIG_X86) || defined(CONFIG_ARM64) +static int __init +acpi_parse_gi_affinity(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_srat_generic_affinity *gi_affinity; + int node; + + gi_affinity = (struct acpi_srat_generic_affinity *)header; + if (!gi_affinity) + return -EINVAL; + acpi_table_print_srat_entry(&header->common); + + if (!(gi_affinity->flags & ACPI_SRAT_GENERIC_AFFINITY_ENABLED)) + return -EINVAL; + + node = acpi_map_pxm_to_node(gi_affinity->proximity_domain); + if (node == NUMA_NO_NODE || node >= MAX_NUMNODES) { + pr_err("SRAT: Too many proximity domains.\n"); + return -EINVAL; + } + node_set(node, numa_nodes_parsed); + node_set_state(node, N_GENERIC_INITIATOR); + + return 0; +} +#else +static int __init +acpi_parse_gi_affinity(union acpi_subtable_headers *header, + const unsigned long end) +{ + return 0; +} +#endif /* defined(CONFIG_X86) || defined (CONFIG_ARM64) */ + static int __initdata parsed_numa_memblks; static int __init @@ -385,7 +450,7 @@ int __init acpi_numa_init(void) /* SRAT: System Resource Affinity Table */ if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) { - struct acpi_subtable_proc srat_proc[3]; + struct acpi_subtable_proc srat_proc[4]; memset(srat_proc, 0, sizeof(srat_proc)); srat_proc[0].id = ACPI_SRAT_TYPE_CPU_AFFINITY; @@ -394,6 +459,8 @@ int __init acpi_numa_init(void) srat_proc[1].handler = acpi_parse_x2apic_affinity; srat_proc[2].id = ACPI_SRAT_TYPE_GICC_AFFINITY; srat_proc[2].handler = acpi_parse_gicc_affinity; + srat_proc[3].id = ACPI_SRAT_TYPE_GENERIC_AFFINITY; + srat_proc[3].handler = acpi_parse_gi_affinity; acpi_table_parse_entries_array(ACPI_SIG_SRAT, sizeof(struct acpi_table_srat), diff --git a/drivers/base/node.c b/drivers/base/node.c index 508b80f6329b..53383f1f683c 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -980,6 +980,8 @@ static struct node_attr node_state_attr[] = { #endif [N_MEMORY] = _NODE_ATTR(has_memory, N_MEMORY), [N_CPU] = _NODE_ATTR(has_cpu, N_CPU), + [N_GENERIC_INITIATOR] = _NODE_ATTR(has_generic_initiator, + N_GENERIC_INITIATOR), }; static struct attribute *node_state_attrs[] = { @@ -991,6 +993,7 @@ static struct attribute *node_state_attrs[] = { #endif &node_state_attr[N_MEMORY].attr.attr, &node_state_attr[N_CPU].attr.attr, + &node_state_attr[N_GENERIC_INITIATOR].attr.attr, NULL }; diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 27e7fa36f707..3334ce056335 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -399,6 +399,7 @@ enum node_states { #endif N_MEMORY, /* The node has memory(regular, high, movable) */ N_CPU, /* The node has one or more cpus */ + N_GENERIC_INITIATOR, /* The node has one or more Generic Initiators */ NR_NODE_STATES }; From 73bf7382debb1a93fef5ff38222c6a3b62dfea44 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Wed, 30 Sep 2020 22:05:43 +0800 Subject: [PATCH 09/13] x86: Support Generic Initiator only proximity domains In common with memoryless domains only register GI domains if the proximity node is not online. If a domain is already a memory containing domain, or a memoryless domain there is nothing to do just because it also contains a Generic Initiator. Signed-off-by: Jonathan Cameron Acked-by: Borislav Petkov Signed-off-by: Rafael J. Wysocki --- arch/x86/include/asm/numa.h | 2 ++ arch/x86/kernel/setup.c | 1 + arch/x86/mm/numa.c | 21 +++++++++++++++++++++ 3 files changed, 24 insertions(+) diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index bbfde3d2662f..f631467272a3 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -62,12 +62,14 @@ extern void numa_clear_node(int cpu); extern void __init init_cpu_to_node(void); extern void numa_add_cpu(int cpu); extern void numa_remove_cpu(int cpu); +extern void init_gi_nodes(void); #else /* CONFIG_NUMA */ static inline void numa_set_node(int cpu, int node) { } static inline void numa_clear_node(int cpu) { } static inline void init_cpu_to_node(void) { } static inline void numa_add_cpu(int cpu) { } static inline void numa_remove_cpu(int cpu) { } +static inline void init_gi_nodes(void) { } #endif /* CONFIG_NUMA */ #ifdef CONFIG_DEBUG_PER_CPU_MAPS diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3511736fbc74..9062c146f03a 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1218,6 +1218,7 @@ void __init setup_arch(char **cmdline_p) prefill_possible_map(); init_cpu_to_node(); + init_gi_nodes(); io_apic_init_mappings(); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index aa76ec2d359b..22d3e5ade3ae 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -747,6 +747,27 @@ static void __init init_memory_less_node(int nid) */ } +/* + * A node may exist which has one or more Generic Initiators but no CPUs and no + * memory. + * + * This function must be called after init_cpu_to_node(), to ensure that any + * memoryless CPU nodes have already been brought online, and before the + * node_data[nid] is needed for zone list setup in build_all_zonelists(). + * + * When this function is called, any nodes containing either memory and/or CPUs + * will already be online and there is no need to do anything extra, even if + * they also contain one or more Generic Initiators. + */ +void __init init_gi_nodes(void) +{ + int nid; + + for_each_node_state(nid, N_GENERIC_INITIATOR) + if (!node_online(nid)) + init_memory_less_node(nid); +} + /* * Setup early cpu_to_node. * From 01aabca2fd545554905321029e6c4c5b2fedb345 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Wed, 30 Sep 2020 22:05:44 +0800 Subject: [PATCH 10/13] ACPI: Let ACPI know we support Generic Initiator Affinity Structures Until we tell ACPI that we support generic initiators, it will have to operate in fall back domain mode and all _PXM entries should be on existing non GI domains. This patch sets the relevant OSC bit to make that happen. Signed-off-by: Jonathan Cameron Signed-off-by: Rafael J. Wysocki --- drivers/acpi/bus.c | 4 ++++ include/linux/acpi.h | 1 + 2 files changed, 5 insertions(+) diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index 54002670cb7a..113c661eb848 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -303,7 +303,11 @@ static void acpi_bus_osc_support(void) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_HOTPLUG_OST_SUPPORT; capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PCLPI_SUPPORT; +#ifdef CONFIG_ARM64 + capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_GENERIC_INITIATOR_SUPPORT; +#endif #ifdef CONFIG_X86 + capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_GENERIC_INITIATOR_SUPPORT; if (boot_cpu_has(X86_FEATURE_HWP)) { capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPC_SUPPORT; capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPCV2_SUPPORT; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index e9f6cd67943e..edbf3c4116b4 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -545,6 +545,7 @@ acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context); #define OSC_SB_PCLPI_SUPPORT 0x00000080 #define OSC_SB_OSLPI_SUPPORT 0x00000100 #define OSC_SB_CPC_DIVERSE_HIGH_SUPPORT 0x00001000 +#define OSC_SB_GENERIC_INITIATOR_SUPPORT 0x00002000 extern bool osc_sb_apei_support_acked; extern bool osc_pc_lpi_support_confirmed; From 2c5b9bde95c96942f2873cea6ef383c02800e4a8 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Wed, 30 Sep 2020 22:05:45 +0800 Subject: [PATCH 11/13] ACPI: HMAT: Fix handling of changes from ACPI 6.2 to ACPI 6.3 In ACPI 6.3, the Memory Proximity Domain Attributes Structure changed substantially. One of those changes was that the flag for "Memory Proximity Domain field is valid" was deprecated. This was because the field "Proximity Domain for the Memory" became a required field and hence having a validity flag makes no sense. So the correct logic is to always assume the field is there. Current code assumes it never is. Signed-off-by: Jonathan Cameron Signed-off-by: Rafael J. Wysocki --- drivers/acpi/numa/hmat.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index e7add2609c03..5264aee93642 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -424,7 +424,8 @@ static int __init hmat_parse_proximity_domain(union acpi_subtable_headers *heade pr_info("HMAT: Memory Flags:%04x Processor Domain:%u Memory Domain:%u\n", p->flags, p->processor_PD, p->memory_PD); - if (p->flags & ACPI_HMAT_MEMORY_PD_VALID && hmat_revision == 1) { + if ((hmat_revision == 1 && p->flags & ACPI_HMAT_MEMORY_PD_VALID) || + hmat_revision > 1) { target = find_mem_target(p->memory_PD); if (!target) { pr_debug("HMAT: Memory Domain missing from SRAT\n"); From b9fffe47212c70114f1d91e773ce5ecff8936ef5 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Wed, 30 Sep 2020 22:05:46 +0800 Subject: [PATCH 12/13] node: Add access1 class to represent CPU to memory characteristics New access1 class is nearly the same as access0, but always provides characteristics for CPUs to memory. The existing access0 class provides characteristics to nearest or direct connnect initiator which may be a Generic Initiator such as a GPU or network adapter. This new class allows thread placement on CPUs to be performed so as to give optimal access characteristics to memory, even if that memory is for example attached to a GPU or similar and only accessible to the CPU via an appropriate bus. Suggested-by: Dan Willaims Signed-off-by: Jonathan Cameron Signed-off-by: Rafael J. Wysocki --- drivers/acpi/numa/hmat.c | 88 +++++++++++++++++++++++++++++++--------- 1 file changed, 69 insertions(+), 19 deletions(-) diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 5264aee93642..6ea95b367a31 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -56,7 +56,7 @@ struct memory_target { unsigned int memory_pxm; unsigned int processor_pxm; struct resource memregions; - struct node_hmem_attrs hmem_attrs; + struct node_hmem_attrs hmem_attrs[2]; struct list_head caches; struct node_cache_attrs cache_attrs; bool registered; @@ -65,6 +65,7 @@ struct memory_target { struct memory_initiator { struct list_head node; unsigned int processor_pxm; + bool has_cpu; }; struct memory_locality { @@ -108,6 +109,7 @@ static __init void alloc_memory_initiator(unsigned int cpu_pxm) return; initiator->processor_pxm = cpu_pxm; + initiator->has_cpu = node_state(pxm_to_node(cpu_pxm), N_CPU); list_add_tail(&initiator->node, &initiators); } @@ -215,28 +217,28 @@ static u32 hmat_normalize(u16 entry, u64 base, u8 type) } static void hmat_update_target_access(struct memory_target *target, - u8 type, u32 value) + u8 type, u32 value, int access) { switch (type) { case ACPI_HMAT_ACCESS_LATENCY: - target->hmem_attrs.read_latency = value; - target->hmem_attrs.write_latency = value; + target->hmem_attrs[access].read_latency = value; + target->hmem_attrs[access].write_latency = value; break; case ACPI_HMAT_READ_LATENCY: - target->hmem_attrs.read_latency = value; + target->hmem_attrs[access].read_latency = value; break; case ACPI_HMAT_WRITE_LATENCY: - target->hmem_attrs.write_latency = value; + target->hmem_attrs[access].write_latency = value; break; case ACPI_HMAT_ACCESS_BANDWIDTH: - target->hmem_attrs.read_bandwidth = value; - target->hmem_attrs.write_bandwidth = value; + target->hmem_attrs[access].read_bandwidth = value; + target->hmem_attrs[access].write_bandwidth = value; break; case ACPI_HMAT_READ_BANDWIDTH: - target->hmem_attrs.read_bandwidth = value; + target->hmem_attrs[access].read_bandwidth = value; break; case ACPI_HMAT_WRITE_BANDWIDTH: - target->hmem_attrs.write_bandwidth = value; + target->hmem_attrs[access].write_bandwidth = value; break; default: break; @@ -329,8 +331,12 @@ static __init int hmat_parse_locality(union acpi_subtable_headers *header, if (mem_hier == ACPI_HMAT_MEMORY) { target = find_mem_target(targs[targ]); - if (target && target->processor_pxm == inits[init]) - hmat_update_target_access(target, type, value); + if (target && target->processor_pxm == inits[init]) { + hmat_update_target_access(target, type, value, 0); + /* If the node has a CPU, update access 1 */ + if (node_state(pxm_to_node(inits[init]), N_CPU)) + hmat_update_target_access(target, type, value, 1); + } } } } @@ -567,6 +573,7 @@ static void hmat_register_target_initiators(struct memory_target *target) unsigned int mem_nid, cpu_nid; struct memory_locality *loc = NULL; u32 best = 0; + bool access0done = false; int i; mem_nid = pxm_to_node(target->memory_pxm); @@ -578,7 +585,11 @@ static void hmat_register_target_initiators(struct memory_target *target) if (target->processor_pxm != PXM_INVAL) { cpu_nid = pxm_to_node(target->processor_pxm); register_memory_node_under_compute_node(mem_nid, cpu_nid, 0); - return; + access0done = true; + if (node_state(cpu_nid, N_CPU)) { + register_memory_node_under_compute_node(mem_nid, cpu_nid, 1); + return; + } } if (list_empty(&localities)) @@ -592,6 +603,41 @@ static void hmat_register_target_initiators(struct memory_target *target) */ bitmap_zero(p_nodes, MAX_NUMNODES); list_sort(p_nodes, &initiators, initiator_cmp); + if (!access0done) { + for (i = WRITE_LATENCY; i <= READ_BANDWIDTH; i++) { + loc = localities_types[i]; + if (!loc) + continue; + + best = 0; + list_for_each_entry(initiator, &initiators, node) { + u32 value; + + if (!test_bit(initiator->processor_pxm, p_nodes)) + continue; + + value = hmat_initiator_perf(target, initiator, + loc->hmat_loc); + if (hmat_update_best(loc->hmat_loc->data_type, value, &best)) + bitmap_clear(p_nodes, 0, initiator->processor_pxm); + if (value != best) + clear_bit(initiator->processor_pxm, p_nodes); + } + if (best) + hmat_update_target_access(target, loc->hmat_loc->data_type, + best, 0); + } + + for_each_set_bit(i, p_nodes, MAX_NUMNODES) { + cpu_nid = pxm_to_node(i); + register_memory_node_under_compute_node(mem_nid, cpu_nid, 0); + } + } + + /* Access 1 ignores Generic Initiators */ + bitmap_zero(p_nodes, MAX_NUMNODES); + list_sort(p_nodes, &initiators, initiator_cmp); + best = 0; for (i = WRITE_LATENCY; i <= READ_BANDWIDTH; i++) { loc = localities_types[i]; if (!loc) @@ -601,6 +647,10 @@ static void hmat_register_target_initiators(struct memory_target *target) list_for_each_entry(initiator, &initiators, node) { u32 value; + if (!initiator->has_cpu) { + clear_bit(initiator->processor_pxm, p_nodes); + continue; + } if (!test_bit(initiator->processor_pxm, p_nodes)) continue; @@ -611,12 +661,11 @@ static void hmat_register_target_initiators(struct memory_target *target) clear_bit(initiator->processor_pxm, p_nodes); } if (best) - hmat_update_target_access(target, loc->hmat_loc->data_type, best); + hmat_update_target_access(target, loc->hmat_loc->data_type, best, 1); } - for_each_set_bit(i, p_nodes, MAX_NUMNODES) { cpu_nid = pxm_to_node(i); - register_memory_node_under_compute_node(mem_nid, cpu_nid, 0); + register_memory_node_under_compute_node(mem_nid, cpu_nid, 1); } } @@ -629,10 +678,10 @@ static void hmat_register_target_cache(struct memory_target *target) node_add_cache(mem_nid, &tcache->cache_attrs); } -static void hmat_register_target_perf(struct memory_target *target) +static void hmat_register_target_perf(struct memory_target *target, int access) { unsigned mem_nid = pxm_to_node(target->memory_pxm); - node_set_perf_attrs(mem_nid, &target->hmem_attrs, 0); + node_set_perf_attrs(mem_nid, &target->hmem_attrs[access], access); } static void hmat_register_target_device(struct memory_target *target, @@ -734,7 +783,8 @@ static void hmat_register_target(struct memory_target *target) if (!target->registered) { hmat_register_target_initiators(target); hmat_register_target_cache(target); - hmat_register_target_perf(target); + hmat_register_target_perf(target, 0); + hmat_register_target_perf(target, 1); target->registered = true; } mutex_unlock(&target_lock); From dc9e7860df91625fef04d7016182653e8f678f05 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Wed, 30 Sep 2020 22:05:47 +0800 Subject: [PATCH 13/13] docs: mm: numaperf.rst Add brief description for access class 1. Try to make minimal changes to the document which already describes access class 0 in a generic fashion (including IO initiatiors that are not CPUs). Signed-off-by: Jonathan Cameron Signed-off-by: Rafael J. Wysocki --- Documentation/admin-guide/mm/numaperf.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/admin-guide/mm/numaperf.rst b/Documentation/admin-guide/mm/numaperf.rst index 4d69ef1de830..86f2a3c4b638 100644 --- a/Documentation/admin-guide/mm/numaperf.rst +++ b/Documentation/admin-guide/mm/numaperf.rst @@ -56,6 +56,11 @@ nodes' access characteristics share the same performance relative to other linked initiator nodes. Each target within an initiator's access class, though, do not necessarily perform the same as each other. +The access class "1" is used to allow differentiation between initiators +that are CPUs and hence suitable for generic task scheduling, and +IO initiators such as GPUs and NICs. Unlike access class 0, only +nodes containing CPUs are considered. + ================ NUMA Performance ================ @@ -88,6 +93,9 @@ The latency attributes are provided in nanoseconds. The values reported here correspond to the rated latency and bandwidth for the platform. +Access class 1 takes the same form but only includes values for CPU to +memory activity. + ========== NUMA Cache ==========