mirror of
https://mirrors.bfsu.edu.cn/git/linux.git
synced 2025-01-07 22:34:18 +08:00
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (50 commits)
  x86, mm: Allow ZONE_DMA to be configurable
  x86, NUMA: Trim numa meminfo with max_pfn in a separate loop
  x86, NUMA: Rename setup_node_bootmem() to setup_node_data()
  x86, NUMA: Enable emulation on 32bit too
  x86, NUMA: Enable CONFIG_AMD_NUMA on 32bit too
  x86, NUMA: Rename amdtopology_64.c to amdtopology.c
  x86, NUMA: Make numa_init_array() static
  x86, NUMA: Make 32bit use common NUMA init path
  x86, NUMA: Initialize and use remap allocator from setup_node_bootmem()
  x86-32, NUMA: Add @start and @end to init_alloc_remap()
  x86, NUMA: Remove long 64bit assumption from numa.c
  x86, NUMA: Enable build of generic NUMA init code on 32bit
  x86, NUMA: Move NUMA init logic from numa_64.c to numa.c
  x86-32, NUMA: Update numaq to use new NUMA init protocol
  x86-32, NUMA: Replace srat_32.c with srat.c
  x86-32, NUMA: implement temporary NUMA init shims
  x86, NUMA: Move numa_nodes_parsed to numa.[hc]
  x86-32, NUMA: Move get_memcfg_numa() into numa_32.c
  x86, NUMA: make srat.c 32bit safe
  x86, NUMA: rename srat_64.c to srat.c
  ...
This commit is contained in:
commit
13588209aa
@ -112,7 +112,14 @@ config MMU
|
|||||||
def_bool y
|
def_bool y
|
||||||
|
|
||||||
config ZONE_DMA
|
config ZONE_DMA
|
||||||
def_bool y
|
bool "DMA memory allocation support" if EXPERT
|
||||||
|
default y
|
||||||
|
help
|
||||||
|
DMA memory allocation support allows devices with less than 32-bit
|
||||||
|
addressing to allocate within the first 16MB of address space.
|
||||||
|
Disable if no such devices will be used.
|
||||||
|
|
||||||
|
If unsure, say Y.
|
||||||
|
|
||||||
config SBUS
|
config SBUS
|
||||||
bool
|
bool
|
||||||
@ -1164,7 +1171,7 @@ comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
|
|||||||
config AMD_NUMA
|
config AMD_NUMA
|
||||||
def_bool y
|
def_bool y
|
||||||
prompt "Old style AMD Opteron NUMA detection"
|
prompt "Old style AMD Opteron NUMA detection"
|
||||||
depends on X86_64 && NUMA && PCI
|
depends on NUMA && PCI
|
||||||
---help---
|
---help---
|
||||||
Enable AMD NUMA node topology detection. You should say Y here if
|
Enable AMD NUMA node topology detection. You should say Y here if
|
||||||
you have a multi processor AMD system. This uses an old method to
|
you have a multi processor AMD system. This uses an old method to
|
||||||
@ -1191,7 +1198,7 @@ config NODES_SPAN_OTHER_NODES
|
|||||||
|
|
||||||
config NUMA_EMU
|
config NUMA_EMU
|
||||||
bool "NUMA emulation"
|
bool "NUMA emulation"
|
||||||
depends on X86_64 && NUMA
|
depends on NUMA
|
||||||
---help---
|
---help---
|
||||||
Enable NUMA emulation. A flat machine will be split
|
Enable NUMA emulation. A flat machine will be split
|
||||||
into virtual nodes when booted with "numa=fake=N", where N is the
|
into virtual nodes when booted with "numa=fake=N", where N is the
|
||||||
@ -1213,6 +1220,10 @@ config HAVE_ARCH_BOOTMEM
|
|||||||
def_bool y
|
def_bool y
|
||||||
depends on X86_32 && NUMA
|
depends on X86_32 && NUMA
|
||||||
|
|
||||||
|
config HAVE_ARCH_ALLOC_REMAP
|
||||||
|
def_bool y
|
||||||
|
depends on X86_32 && NUMA
|
||||||
|
|
||||||
config ARCH_HAVE_MEMORY_PRESENT
|
config ARCH_HAVE_MEMORY_PRESENT
|
||||||
def_bool y
|
def_bool y
|
||||||
depends on X86_32 && DISCONTIGMEM
|
depends on X86_32 && DISCONTIGMEM
|
||||||
@ -1221,13 +1232,9 @@ config NEED_NODE_MEMMAP_SIZE
|
|||||||
def_bool y
|
def_bool y
|
||||||
depends on X86_32 && (DISCONTIGMEM || SPARSEMEM)
|
depends on X86_32 && (DISCONTIGMEM || SPARSEMEM)
|
||||||
|
|
||||||
config HAVE_ARCH_ALLOC_REMAP
|
|
||||||
def_bool y
|
|
||||||
depends on X86_32 && NUMA
|
|
||||||
|
|
||||||
config ARCH_FLATMEM_ENABLE
|
config ARCH_FLATMEM_ENABLE
|
||||||
def_bool y
|
def_bool y
|
||||||
depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && !NUMA
|
depends on X86_32 && !NUMA
|
||||||
|
|
||||||
config ARCH_DISCONTIGMEM_ENABLE
|
config ARCH_DISCONTIGMEM_ENABLE
|
||||||
def_bool y
|
def_bool y
|
||||||
@ -1237,20 +1244,16 @@ config ARCH_DISCONTIGMEM_DEFAULT
|
|||||||
def_bool y
|
def_bool y
|
||||||
depends on NUMA && X86_32
|
depends on NUMA && X86_32
|
||||||
|
|
||||||
config ARCH_PROC_KCORE_TEXT
|
|
||||||
def_bool y
|
|
||||||
depends on X86_64 && PROC_KCORE
|
|
||||||
|
|
||||||
config ARCH_SPARSEMEM_DEFAULT
|
|
||||||
def_bool y
|
|
||||||
depends on X86_64
|
|
||||||
|
|
||||||
config ARCH_SPARSEMEM_ENABLE
|
config ARCH_SPARSEMEM_ENABLE
|
||||||
def_bool y
|
def_bool y
|
||||||
depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD
|
depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD
|
||||||
select SPARSEMEM_STATIC if X86_32
|
select SPARSEMEM_STATIC if X86_32
|
||||||
select SPARSEMEM_VMEMMAP_ENABLE if X86_64
|
select SPARSEMEM_VMEMMAP_ENABLE if X86_64
|
||||||
|
|
||||||
|
config ARCH_SPARSEMEM_DEFAULT
|
||||||
|
def_bool y
|
||||||
|
depends on X86_64
|
||||||
|
|
||||||
config ARCH_SELECT_MEMORY_MODEL
|
config ARCH_SELECT_MEMORY_MODEL
|
||||||
def_bool y
|
def_bool y
|
||||||
depends on ARCH_SPARSEMEM_ENABLE
|
depends on ARCH_SPARSEMEM_ENABLE
|
||||||
@ -1259,6 +1262,10 @@ config ARCH_MEMORY_PROBE
|
|||||||
def_bool X86_64
|
def_bool X86_64
|
||||||
depends on MEMORY_HOTPLUG
|
depends on MEMORY_HOTPLUG
|
||||||
|
|
||||||
|
config ARCH_PROC_KCORE_TEXT
|
||||||
|
def_bool y
|
||||||
|
depends on X86_64 && PROC_KCORE
|
||||||
|
|
||||||
config ILLEGAL_POINTER_VALUE
|
config ILLEGAL_POINTER_VALUE
|
||||||
hex
|
hex
|
||||||
default 0 if X86_32
|
default 0 if X86_32
|
||||||
@ -1693,10 +1700,6 @@ config ARCH_ENABLE_MEMORY_HOTREMOVE
|
|||||||
def_bool y
|
def_bool y
|
||||||
depends on MEMORY_HOTPLUG
|
depends on MEMORY_HOTPLUG
|
||||||
|
|
||||||
config HAVE_ARCH_EARLY_PFN_TO_NID
|
|
||||||
def_bool X86_64
|
|
||||||
depends on NUMA
|
|
||||||
|
|
||||||
config USE_PERCPU_NUMA_NODE_ID
|
config USE_PERCPU_NUMA_NODE_ID
|
||||||
def_bool y
|
def_bool y
|
||||||
depends on NUMA
|
depends on NUMA
|
||||||
|
@ -183,8 +183,6 @@ static inline void disable_acpi(void) { }
|
|||||||
|
|
||||||
#define ARCH_HAS_POWER_INIT 1
|
#define ARCH_HAS_POWER_INIT 1
|
||||||
|
|
||||||
struct bootnode;
|
|
||||||
|
|
||||||
#ifdef CONFIG_ACPI_NUMA
|
#ifdef CONFIG_ACPI_NUMA
|
||||||
extern int acpi_numa;
|
extern int acpi_numa;
|
||||||
extern int x86_acpi_numa_init(void);
|
extern int x86_acpi_numa_init(void);
|
||||||
|
@ -11,7 +11,6 @@ struct amd_nb_bus_dev_range {
|
|||||||
|
|
||||||
extern const struct pci_device_id amd_nb_misc_ids[];
|
extern const struct pci_device_id amd_nb_misc_ids[];
|
||||||
extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
|
extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
|
||||||
struct bootnode;
|
|
||||||
|
|
||||||
extern bool early_is_amd_nb(u32 value);
|
extern bool early_is_amd_nb(u32 value);
|
||||||
extern int amd_cache_northbridges(void);
|
extern int amd_cache_northbridges(void);
|
||||||
|
@ -363,7 +363,12 @@ struct apic {
|
|||||||
*/
|
*/
|
||||||
int (*x86_32_early_logical_apicid)(int cpu);
|
int (*x86_32_early_logical_apicid)(int cpu);
|
||||||
|
|
||||||
/* determine CPU -> NUMA node mapping */
|
/*
|
||||||
|
* Optional method called from setup_local_APIC() after logical
|
||||||
|
* apicid is guaranteed to be known to initialize apicid -> node
|
||||||
|
* mapping if NUMA initialization hasn't done so already. Don't
|
||||||
|
* add new users.
|
||||||
|
*/
|
||||||
int (*x86_32_numa_cpu_node)(int cpu);
|
int (*x86_32_numa_cpu_node)(int cpu);
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
@ -537,8 +542,6 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
|
|||||||
return cpuid_apic >> index_msb;
|
return cpuid_apic >> index_msb;
|
||||||
}
|
}
|
||||||
|
|
||||||
extern int default_x86_32_numa_cpu_node(int cpu);
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static inline unsigned int
|
static inline unsigned int
|
||||||
|
@ -208,8 +208,7 @@ extern const char * const x86_power_flags[32];
|
|||||||
#define test_cpu_cap(c, bit) \
|
#define test_cpu_cap(c, bit) \
|
||||||
test_bit(bit, (unsigned long *)((c)->x86_capability))
|
test_bit(bit, (unsigned long *)((c)->x86_capability))
|
||||||
|
|
||||||
#define cpu_has(c, bit) \
|
#define REQUIRED_MASK_BIT_SET(bit) \
|
||||||
(__builtin_constant_p(bit) && \
|
|
||||||
( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \
|
( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \
|
||||||
(((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \
|
(((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \
|
||||||
(((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \
|
(((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \
|
||||||
@ -219,10 +218,16 @@ extern const char * const x86_power_flags[32];
|
|||||||
(((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \
|
(((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \
|
||||||
(((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \
|
(((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \
|
||||||
(((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \
|
(((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \
|
||||||
(((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) \
|
(((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) )
|
||||||
? 1 : \
|
|
||||||
|
#define cpu_has(c, bit) \
|
||||||
|
(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
|
||||||
test_cpu_cap(c, bit))
|
test_cpu_cap(c, bit))
|
||||||
|
|
||||||
|
#define this_cpu_has(bit) \
|
||||||
|
(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
|
||||||
|
x86_this_cpu_test_bit(bit, (unsigned long *)&cpu_info.x86_capability))
|
||||||
|
|
||||||
#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit)
|
#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit)
|
||||||
|
|
||||||
#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability))
|
#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability))
|
||||||
|
@ -69,22 +69,18 @@
|
|||||||
|
|
||||||
#define MAX_DMA_CHANNELS 8
|
#define MAX_DMA_CHANNELS 8
|
||||||
|
|
||||||
#ifdef CONFIG_X86_32
|
|
||||||
|
|
||||||
/* The maximum address that we can perform a DMA transfer to on this platform */
|
|
||||||
#define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000)
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
/* 16MB ISA DMA zone */
|
/* 16MB ISA DMA zone */
|
||||||
#define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT)
|
#define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT)
|
||||||
|
|
||||||
/* 4GB broken PCI/AGP hardware bus master zone */
|
/* 4GB broken PCI/AGP hardware bus master zone */
|
||||||
#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT)
|
#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT)
|
||||||
|
|
||||||
|
#ifdef CONFIG_X86_32
|
||||||
|
/* The maximum address that we can perform a DMA transfer to on this platform */
|
||||||
|
#define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000)
|
||||||
|
#else
|
||||||
/* Compat define for old dma zone */
|
/* Compat define for old dma zone */
|
||||||
#define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT))
|
#define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT))
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* 8237 DMA controllers */
|
/* 8237 DMA controllers */
|
||||||
|
@ -13,31 +13,11 @@ extern struct pglist_data *node_data[];
|
|||||||
#define NODE_DATA(nid) (node_data[nid])
|
#define NODE_DATA(nid) (node_data[nid])
|
||||||
|
|
||||||
#include <asm/numaq.h>
|
#include <asm/numaq.h>
|
||||||
/* summit or generic arch */
|
|
||||||
#include <asm/srat.h>
|
|
||||||
|
|
||||||
extern int get_memcfg_numa_flat(void);
|
|
||||||
/*
|
|
||||||
* This allows any one NUMA architecture to be compiled
|
|
||||||
* for, and still fall back to the flat function if it
|
|
||||||
* fails.
|
|
||||||
*/
|
|
||||||
static inline void get_memcfg_numa(void)
|
|
||||||
{
|
|
||||||
|
|
||||||
if (get_memcfg_numaq())
|
|
||||||
return;
|
|
||||||
if (get_memcfg_from_srat())
|
|
||||||
return;
|
|
||||||
get_memcfg_numa_flat();
|
|
||||||
}
|
|
||||||
|
|
||||||
extern void resume_map_numa_kva(pgd_t *pgd);
|
extern void resume_map_numa_kva(pgd_t *pgd);
|
||||||
|
|
||||||
#else /* !CONFIG_NUMA */
|
#else /* !CONFIG_NUMA */
|
||||||
|
|
||||||
#define get_memcfg_numa get_memcfg_numa_flat
|
|
||||||
|
|
||||||
static inline void resume_map_numa_kva(pgd_t *pgd) {}
|
static inline void resume_map_numa_kva(pgd_t *pgd) {}
|
||||||
|
|
||||||
#endif /* CONFIG_NUMA */
|
#endif /* CONFIG_NUMA */
|
||||||
|
@ -4,36 +4,13 @@
|
|||||||
#ifndef _ASM_X86_MMZONE_64_H
|
#ifndef _ASM_X86_MMZONE_64_H
|
||||||
#define _ASM_X86_MMZONE_64_H
|
#define _ASM_X86_MMZONE_64_H
|
||||||
|
|
||||||
|
|
||||||
#ifdef CONFIG_NUMA
|
#ifdef CONFIG_NUMA
|
||||||
|
|
||||||
#include <linux/mmdebug.h>
|
#include <linux/mmdebug.h>
|
||||||
|
|
||||||
#include <asm/smp.h>
|
#include <asm/smp.h>
|
||||||
|
|
||||||
/* Simple perfect hash to map physical addresses to node numbers */
|
|
||||||
struct memnode {
|
|
||||||
int shift;
|
|
||||||
unsigned int mapsize;
|
|
||||||
s16 *map;
|
|
||||||
s16 embedded_map[64 - 8];
|
|
||||||
} ____cacheline_aligned; /* total size = 128 bytes */
|
|
||||||
extern struct memnode memnode;
|
|
||||||
#define memnode_shift memnode.shift
|
|
||||||
#define memnodemap memnode.map
|
|
||||||
#define memnodemapsize memnode.mapsize
|
|
||||||
|
|
||||||
extern struct pglist_data *node_data[];
|
extern struct pglist_data *node_data[];
|
||||||
|
|
||||||
static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
|
|
||||||
{
|
|
||||||
unsigned nid;
|
|
||||||
VIRTUAL_BUG_ON(!memnodemap);
|
|
||||||
nid = memnodemap[addr >> memnode_shift];
|
|
||||||
VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
|
|
||||||
return nid;
|
|
||||||
}
|
|
||||||
|
|
||||||
#define NODE_DATA(nid) (node_data[nid])
|
#define NODE_DATA(nid) (node_data[nid])
|
||||||
|
|
||||||
#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
|
#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
|
||||||
|
@ -1,12 +1,24 @@
|
|||||||
#ifndef _ASM_X86_NUMA_H
|
#ifndef _ASM_X86_NUMA_H
|
||||||
#define _ASM_X86_NUMA_H
|
#define _ASM_X86_NUMA_H
|
||||||
|
|
||||||
|
#include <linux/nodemask.h>
|
||||||
|
|
||||||
#include <asm/topology.h>
|
#include <asm/topology.h>
|
||||||
#include <asm/apicdef.h>
|
#include <asm/apicdef.h>
|
||||||
|
|
||||||
#ifdef CONFIG_NUMA
|
#ifdef CONFIG_NUMA
|
||||||
|
|
||||||
#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
|
#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
|
||||||
|
#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Too small node sizes may confuse the VM badly. Usually they
|
||||||
|
* result from BIOS bugs. So dont recognize nodes as standalone
|
||||||
|
* NUMA entities that have less than this amount of RAM listed:
|
||||||
|
*/
|
||||||
|
#define NODE_MIN_SIZE (4*1024*1024)
|
||||||
|
|
||||||
|
extern int numa_off;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* __apicid_to_node[] stores the raw mapping between physical apicid and
|
* __apicid_to_node[] stores the raw mapping between physical apicid and
|
||||||
@ -17,15 +29,27 @@
|
|||||||
* numa_cpu_node().
|
* numa_cpu_node().
|
||||||
*/
|
*/
|
||||||
extern s16 __apicid_to_node[MAX_LOCAL_APIC];
|
extern s16 __apicid_to_node[MAX_LOCAL_APIC];
|
||||||
|
extern nodemask_t numa_nodes_parsed __initdata;
|
||||||
|
|
||||||
|
extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
|
||||||
|
extern void __init numa_set_distance(int from, int to, int distance);
|
||||||
|
|
||||||
static inline void set_apicid_to_node(int apicid, s16 node)
|
static inline void set_apicid_to_node(int apicid, s16 node)
|
||||||
{
|
{
|
||||||
__apicid_to_node[apicid] = node;
|
__apicid_to_node[apicid] = node;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
extern int __cpuinit numa_cpu_node(int cpu);
|
||||||
|
|
||||||
#else /* CONFIG_NUMA */
|
#else /* CONFIG_NUMA */
|
||||||
static inline void set_apicid_to_node(int apicid, s16 node)
|
static inline void set_apicid_to_node(int apicid, s16 node)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline int numa_cpu_node(int cpu)
|
||||||
|
{
|
||||||
|
return NUMA_NO_NODE;
|
||||||
|
}
|
||||||
#endif /* CONFIG_NUMA */
|
#endif /* CONFIG_NUMA */
|
||||||
|
|
||||||
#ifdef CONFIG_X86_32
|
#ifdef CONFIG_X86_32
|
||||||
@ -37,14 +61,12 @@ static inline void set_apicid_to_node(int apicid, s16 node)
|
|||||||
#ifdef CONFIG_NUMA
|
#ifdef CONFIG_NUMA
|
||||||
extern void __cpuinit numa_set_node(int cpu, int node);
|
extern void __cpuinit numa_set_node(int cpu, int node);
|
||||||
extern void __cpuinit numa_clear_node(int cpu);
|
extern void __cpuinit numa_clear_node(int cpu);
|
||||||
extern void __init numa_init_array(void);
|
|
||||||
extern void __init init_cpu_to_node(void);
|
extern void __init init_cpu_to_node(void);
|
||||||
extern void __cpuinit numa_add_cpu(int cpu);
|
extern void __cpuinit numa_add_cpu(int cpu);
|
||||||
extern void __cpuinit numa_remove_cpu(int cpu);
|
extern void __cpuinit numa_remove_cpu(int cpu);
|
||||||
#else /* CONFIG_NUMA */
|
#else /* CONFIG_NUMA */
|
||||||
static inline void numa_set_node(int cpu, int node) { }
|
static inline void numa_set_node(int cpu, int node) { }
|
||||||
static inline void numa_clear_node(int cpu) { }
|
static inline void numa_clear_node(int cpu) { }
|
||||||
static inline void numa_init_array(void) { }
|
|
||||||
static inline void init_cpu_to_node(void) { }
|
static inline void init_cpu_to_node(void) { }
|
||||||
static inline void numa_add_cpu(int cpu) { }
|
static inline void numa_add_cpu(int cpu) { }
|
||||||
static inline void numa_remove_cpu(int cpu) { }
|
static inline void numa_remove_cpu(int cpu) { }
|
||||||
@ -54,4 +76,10 @@ static inline void numa_remove_cpu(int cpu) { }
|
|||||||
void debug_cpumask_set_cpu(int cpu, int node, bool enable);
|
void debug_cpumask_set_cpu(int cpu, int node, bool enable);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef CONFIG_NUMA_EMU
|
||||||
|
#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
|
||||||
|
#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
|
||||||
|
void numa_emu_cmdline(char *);
|
||||||
|
#endif /* CONFIG_NUMA_EMU */
|
||||||
|
|
||||||
#endif /* _ASM_X86_NUMA_H */
|
#endif /* _ASM_X86_NUMA_H */
|
||||||
|
@ -1,16 +1,6 @@
|
|||||||
#ifndef _ASM_X86_NUMA_32_H
|
#ifndef _ASM_X86_NUMA_32_H
|
||||||
#define _ASM_X86_NUMA_32_H
|
#define _ASM_X86_NUMA_32_H
|
||||||
|
|
||||||
extern int numa_off;
|
|
||||||
|
|
||||||
extern int pxm_to_nid(int pxm);
|
|
||||||
|
|
||||||
#ifdef CONFIG_NUMA
|
|
||||||
extern int __cpuinit numa_cpu_node(int cpu);
|
|
||||||
#else /* CONFIG_NUMA */
|
|
||||||
static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; }
|
|
||||||
#endif /* CONFIG_NUMA */
|
|
||||||
|
|
||||||
#ifdef CONFIG_HIGHMEM
|
#ifdef CONFIG_HIGHMEM
|
||||||
extern void set_highmem_pages_init(void);
|
extern void set_highmem_pages_init(void);
|
||||||
#else
|
#else
|
||||||
|
@ -1,42 +1,6 @@
|
|||||||
#ifndef _ASM_X86_NUMA_64_H
|
#ifndef _ASM_X86_NUMA_64_H
|
||||||
#define _ASM_X86_NUMA_64_H
|
#define _ASM_X86_NUMA_64_H
|
||||||
|
|
||||||
#include <linux/nodemask.h>
|
|
||||||
|
|
||||||
struct bootnode {
|
|
||||||
u64 start;
|
|
||||||
u64 end;
|
|
||||||
};
|
|
||||||
|
|
||||||
#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
|
|
||||||
|
|
||||||
extern int numa_off;
|
|
||||||
|
|
||||||
extern unsigned long numa_free_all_bootmem(void);
|
extern unsigned long numa_free_all_bootmem(void);
|
||||||
extern void setup_node_bootmem(int nodeid, unsigned long start,
|
|
||||||
unsigned long end);
|
|
||||||
|
|
||||||
#ifdef CONFIG_NUMA
|
|
||||||
/*
|
|
||||||
* Too small node sizes may confuse the VM badly. Usually they
|
|
||||||
* result from BIOS bugs. So dont recognize nodes as standalone
|
|
||||||
* NUMA entities that have less than this amount of RAM listed:
|
|
||||||
*/
|
|
||||||
#define NODE_MIN_SIZE (4*1024*1024)
|
|
||||||
|
|
||||||
extern nodemask_t numa_nodes_parsed __initdata;
|
|
||||||
|
|
||||||
extern int __cpuinit numa_cpu_node(int cpu);
|
|
||||||
extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
|
|
||||||
extern void __init numa_set_distance(int from, int to, int distance);
|
|
||||||
|
|
||||||
#ifdef CONFIG_NUMA_EMU
|
|
||||||
#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
|
|
||||||
#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
|
|
||||||
void numa_emu_cmdline(char *);
|
|
||||||
#endif /* CONFIG_NUMA_EMU */
|
|
||||||
#else
|
|
||||||
static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; }
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif /* _ASM_X86_NUMA_64_H */
|
#endif /* _ASM_X86_NUMA_64_H */
|
||||||
|
@ -29,7 +29,7 @@
|
|||||||
#ifdef CONFIG_X86_NUMAQ
|
#ifdef CONFIG_X86_NUMAQ
|
||||||
|
|
||||||
extern int found_numaq;
|
extern int found_numaq;
|
||||||
extern int get_memcfg_numaq(void);
|
extern int numaq_numa_init(void);
|
||||||
extern int pci_numaq_init(void);
|
extern int pci_numaq_init(void);
|
||||||
|
|
||||||
extern void *xquad_portio;
|
extern void *xquad_portio;
|
||||||
@ -166,11 +166,6 @@ struct sys_cfg_data {
|
|||||||
|
|
||||||
void numaq_tsc_disable(void);
|
void numaq_tsc_disable(void);
|
||||||
|
|
||||||
#else
|
|
||||||
static inline int get_memcfg_numaq(void)
|
|
||||||
{
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
#endif /* CONFIG_X86_NUMAQ */
|
#endif /* CONFIG_X86_NUMAQ */
|
||||||
#endif /* _ASM_X86_NUMAQ_H */
|
#endif /* _ASM_X86_NUMAQ_H */
|
||||||
|
|
||||||
|
@ -542,6 +542,33 @@ do { \
|
|||||||
old__; \
|
old__; \
|
||||||
})
|
})
|
||||||
|
|
||||||
|
static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr,
|
||||||
|
const unsigned long __percpu *addr)
|
||||||
|
{
|
||||||
|
unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG;
|
||||||
|
|
||||||
|
return ((1UL << (nr % BITS_PER_LONG)) & percpu_read(*a)) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline int x86_this_cpu_variable_test_bit(int nr,
|
||||||
|
const unsigned long __percpu *addr)
|
||||||
|
{
|
||||||
|
int oldbit;
|
||||||
|
|
||||||
|
asm volatile("bt "__percpu_arg(2)",%1\n\t"
|
||||||
|
"sbb %0,%0"
|
||||||
|
: "=r" (oldbit)
|
||||||
|
: "m" (*(unsigned long *)addr), "Ir" (nr));
|
||||||
|
|
||||||
|
return oldbit;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define x86_this_cpu_test_bit(nr, addr) \
|
||||||
|
(__builtin_constant_p((nr)) \
|
||||||
|
? x86_this_cpu_constant_test_bit((nr), (addr)) \
|
||||||
|
: x86_this_cpu_variable_test_bit((nr), (addr)))
|
||||||
|
|
||||||
|
|
||||||
#include <asm-generic/percpu.h>
|
#include <asm-generic/percpu.h>
|
||||||
|
|
||||||
/* We can use this directly for local CPU (faster). */
|
/* We can use this directly for local CPU (faster). */
|
||||||
|
@ -1,39 +0,0 @@
|
|||||||
/*
|
|
||||||
* Some of the code in this file has been gleaned from the 64 bit
|
|
||||||
* discontigmem support code base.
|
|
||||||
*
|
|
||||||
* Copyright (C) 2002, IBM Corp.
|
|
||||||
*
|
|
||||||
* All rights reserved.
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation; either version 2 of the License, or
|
|
||||||
* (at your option) any later version.
|
|
||||||
*
|
|
||||||
* This program is distributed in the hope that it will be useful, but
|
|
||||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
|
||||||
* NON INFRINGEMENT. See the GNU General Public License for more
|
|
||||||
* details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License
|
|
||||||
* along with this program; if not, write to the Free Software
|
|
||||||
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
|
||||||
*
|
|
||||||
* Send feedback to Pat Gaughen <gone@us.ibm.com>
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef _ASM_X86_SRAT_H
|
|
||||||
#define _ASM_X86_SRAT_H
|
|
||||||
|
|
||||||
#ifdef CONFIG_ACPI_NUMA
|
|
||||||
extern int get_memcfg_from_srat(void);
|
|
||||||
#else
|
|
||||||
static inline int get_memcfg_from_srat(void)
|
|
||||||
{
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif /* _ASM_X86_SRAT_H */
|
|
@ -93,19 +93,11 @@ extern void setup_node_to_cpumask_map(void);
|
|||||||
#define pcibus_to_node(bus) __pcibus_to_node(bus)
|
#define pcibus_to_node(bus) __pcibus_to_node(bus)
|
||||||
|
|
||||||
#ifdef CONFIG_X86_32
|
#ifdef CONFIG_X86_32
|
||||||
extern unsigned long node_start_pfn[];
|
|
||||||
extern unsigned long node_end_pfn[];
|
|
||||||
extern unsigned long node_remap_size[];
|
|
||||||
#define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid])
|
|
||||||
|
|
||||||
# define SD_CACHE_NICE_TRIES 1
|
# define SD_CACHE_NICE_TRIES 1
|
||||||
# define SD_IDLE_IDX 1
|
# define SD_IDLE_IDX 1
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
# define SD_CACHE_NICE_TRIES 2
|
# define SD_CACHE_NICE_TRIES 2
|
||||||
# define SD_IDLE_IDX 2
|
# define SD_IDLE_IDX 2
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* sched_domains SD_NODE_INIT for NUMA machines */
|
/* sched_domains SD_NODE_INIT for NUMA machines */
|
||||||
|
@ -505,7 +505,7 @@ static void __cpuinit setup_APIC_timer(void)
|
|||||||
{
|
{
|
||||||
struct clock_event_device *levt = &__get_cpu_var(lapic_events);
|
struct clock_event_device *levt = &__get_cpu_var(lapic_events);
|
||||||
|
|
||||||
if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_ARAT)) {
|
if (this_cpu_has(X86_FEATURE_ARAT)) {
|
||||||
lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP;
|
lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP;
|
||||||
/* Make LAPIC timer preferrable over percpu HPET */
|
/* Make LAPIC timer preferrable over percpu HPET */
|
||||||
lapic_clockevent.rating = 150;
|
lapic_clockevent.rating = 150;
|
||||||
@ -1237,6 +1237,17 @@ void __cpuinit setup_local_APIC(void)
|
|||||||
/* always use the value from LDR */
|
/* always use the value from LDR */
|
||||||
early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
|
early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
|
||||||
logical_smp_processor_id();
|
logical_smp_processor_id();
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Some NUMA implementations (NUMAQ) don't initialize apicid to
|
||||||
|
* node mapping during NUMA init. Now that logical apicid is
|
||||||
|
* guaranteed to be known, give it another chance. This is already
|
||||||
|
* a bit too late - percpu allocation has already happened without
|
||||||
|
* proper NUMA affinity.
|
||||||
|
*/
|
||||||
|
if (apic->x86_32_numa_cpu_node)
|
||||||
|
set_apicid_to_node(early_per_cpu(x86_cpu_to_apicid, cpu),
|
||||||
|
apic->x86_32_numa_cpu_node(cpu));
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -2014,21 +2025,6 @@ void default_init_apic_ldr(void)
|
|||||||
apic_write(APIC_LDR, val);
|
apic_write(APIC_LDR, val);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CONFIG_X86_32
|
|
||||||
int default_x86_32_numa_cpu_node(int cpu)
|
|
||||||
{
|
|
||||||
#ifdef CONFIG_NUMA
|
|
||||||
int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
|
|
||||||
|
|
||||||
if (apicid != BAD_APICID)
|
|
||||||
return __apicid_to_node[apicid];
|
|
||||||
return NUMA_NO_NODE;
|
|
||||||
#else
|
|
||||||
return 0;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Power management
|
* Power management
|
||||||
*/
|
*/
|
||||||
|
@ -119,14 +119,6 @@ static void noop_apic_write(u32 reg, u32 v)
|
|||||||
WARN_ON_ONCE(cpu_has_apic && !disable_apic);
|
WARN_ON_ONCE(cpu_has_apic && !disable_apic);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CONFIG_X86_32
|
|
||||||
static int noop_x86_32_numa_cpu_node(int cpu)
|
|
||||||
{
|
|
||||||
/* we're always on node 0 */
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
struct apic apic_noop = {
|
struct apic apic_noop = {
|
||||||
.name = "noop",
|
.name = "noop",
|
||||||
.probe = noop_probe,
|
.probe = noop_probe,
|
||||||
@ -195,6 +187,5 @@ struct apic apic_noop = {
|
|||||||
|
|
||||||
#ifdef CONFIG_X86_32
|
#ifdef CONFIG_X86_32
|
||||||
.x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
|
.x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
|
||||||
.x86_32_numa_cpu_node = noop_x86_32_numa_cpu_node,
|
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
@ -253,5 +253,4 @@ struct apic apic_bigsmp = {
|
|||||||
.safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
|
.safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
|
||||||
|
|
||||||
.x86_32_early_logical_apicid = bigsmp_early_logical_apicid,
|
.x86_32_early_logical_apicid = bigsmp_early_logical_apicid,
|
||||||
.x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
|
|
||||||
};
|
};
|
||||||
|
@ -510,11 +510,6 @@ static void es7000_setup_apic_routing(void)
|
|||||||
nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
|
nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int es7000_numa_cpu_node(int cpu)
|
|
||||||
{
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int es7000_cpu_present_to_apicid(int mps_cpu)
|
static int es7000_cpu_present_to_apicid(int mps_cpu)
|
||||||
{
|
{
|
||||||
if (!mps_cpu)
|
if (!mps_cpu)
|
||||||
@ -688,7 +683,6 @@ struct apic __refdata apic_es7000_cluster = {
|
|||||||
.safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
|
.safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
|
||||||
|
|
||||||
.x86_32_early_logical_apicid = es7000_early_logical_apicid,
|
.x86_32_early_logical_apicid = es7000_early_logical_apicid,
|
||||||
.x86_32_numa_cpu_node = es7000_numa_cpu_node,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct apic __refdata apic_es7000 = {
|
struct apic __refdata apic_es7000 = {
|
||||||
@ -752,5 +746,4 @@ struct apic __refdata apic_es7000 = {
|
|||||||
.safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
|
.safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
|
||||||
|
|
||||||
.x86_32_early_logical_apicid = es7000_early_logical_apicid,
|
.x86_32_early_logical_apicid = es7000_early_logical_apicid,
|
||||||
.x86_32_numa_cpu_node = es7000_numa_cpu_node,
|
|
||||||
};
|
};
|
||||||
|
@ -48,8 +48,6 @@
|
|||||||
#include <asm/e820.h>
|
#include <asm/e820.h>
|
||||||
#include <asm/ipi.h>
|
#include <asm/ipi.h>
|
||||||
|
|
||||||
#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
|
|
||||||
|
|
||||||
int found_numaq;
|
int found_numaq;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -79,31 +77,20 @@ int quad_local_to_mp_bus_id[NR_CPUS/4][4];
|
|||||||
static inline void numaq_register_node(int node, struct sys_cfg_data *scd)
|
static inline void numaq_register_node(int node, struct sys_cfg_data *scd)
|
||||||
{
|
{
|
||||||
struct eachquadmem *eq = scd->eq + node;
|
struct eachquadmem *eq = scd->eq + node;
|
||||||
|
u64 start = (u64)(eq->hi_shrd_mem_start - eq->priv_mem_size) << 20;
|
||||||
|
u64 end = (u64)(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size) << 20;
|
||||||
|
int ret;
|
||||||
|
|
||||||
node_set_online(node);
|
node_set(node, numa_nodes_parsed);
|
||||||
|
ret = numa_add_memblk(node, start, end);
|
||||||
/* Convert to pages */
|
BUG_ON(ret < 0);
|
||||||
node_start_pfn[node] =
|
|
||||||
MB_TO_PAGES(eq->hi_shrd_mem_start - eq->priv_mem_size);
|
|
||||||
|
|
||||||
node_end_pfn[node] =
|
|
||||||
MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
|
|
||||||
|
|
||||||
memblock_x86_register_active_regions(node, node_start_pfn[node],
|
|
||||||
node_end_pfn[node]);
|
|
||||||
|
|
||||||
memory_present(node, node_start_pfn[node], node_end_pfn[node]);
|
|
||||||
|
|
||||||
node_remap_size[node] = node_memmap_size_bytes(node,
|
|
||||||
node_start_pfn[node],
|
|
||||||
node_end_pfn[node]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Function: smp_dump_qct()
|
* Function: smp_dump_qct()
|
||||||
*
|
*
|
||||||
* Description: gets memory layout from the quad config table. This
|
* Description: gets memory layout from the quad config table. This
|
||||||
* function also updates node_online_map with the nodes (quads) present.
|
* function also updates numa_nodes_parsed with the nodes (quads) present.
|
||||||
*/
|
*/
|
||||||
static void __init smp_dump_qct(void)
|
static void __init smp_dump_qct(void)
|
||||||
{
|
{
|
||||||
@ -112,7 +99,6 @@ static void __init smp_dump_qct(void)
|
|||||||
|
|
||||||
scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR);
|
scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR);
|
||||||
|
|
||||||
nodes_clear(node_online_map);
|
|
||||||
for_each_node(node) {
|
for_each_node(node) {
|
||||||
if (scd->quads_present31_0 & (1 << node))
|
if (scd->quads_present31_0 & (1 << node))
|
||||||
numaq_register_node(node, scd);
|
numaq_register_node(node, scd);
|
||||||
@ -282,14 +268,14 @@ static __init void early_check_numaq(void)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int __init get_memcfg_numaq(void)
|
int __init numaq_numa_init(void)
|
||||||
{
|
{
|
||||||
early_check_numaq();
|
early_check_numaq();
|
||||||
if (!found_numaq)
|
if (!found_numaq)
|
||||||
return 0;
|
return -ENOENT;
|
||||||
smp_dump_qct();
|
smp_dump_qct();
|
||||||
|
|
||||||
return 1;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
|
#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
|
||||||
|
@ -172,7 +172,6 @@ struct apic apic_default = {
|
|||||||
.safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
|
.safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
|
||||||
|
|
||||||
.x86_32_early_logical_apicid = default_x86_32_early_logical_apicid,
|
.x86_32_early_logical_apicid = default_x86_32_early_logical_apicid,
|
||||||
.x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
extern struct apic apic_numaq;
|
extern struct apic apic_numaq;
|
||||||
|
@ -551,5 +551,4 @@ struct apic apic_summit = {
|
|||||||
.safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
|
.safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
|
||||||
|
|
||||||
.x86_32_early_logical_apicid = summit_early_logical_apicid,
|
.x86_32_early_logical_apicid = summit_early_logical_apicid,
|
||||||
.x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
|
|
||||||
};
|
};
|
||||||
|
@ -353,7 +353,6 @@ static void notify_thresholds(__u64 msr_val)
|
|||||||
static void intel_thermal_interrupt(void)
|
static void intel_thermal_interrupt(void)
|
||||||
{
|
{
|
||||||
__u64 msr_val;
|
__u64 msr_val;
|
||||||
struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
|
|
||||||
|
|
||||||
rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
|
rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
|
||||||
|
|
||||||
@ -365,19 +364,19 @@ static void intel_thermal_interrupt(void)
|
|||||||
CORE_LEVEL) != 0)
|
CORE_LEVEL) != 0)
|
||||||
mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
|
mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
|
||||||
|
|
||||||
if (cpu_has(c, X86_FEATURE_PLN))
|
if (this_cpu_has(X86_FEATURE_PLN))
|
||||||
if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
|
if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
|
||||||
POWER_LIMIT_EVENT,
|
POWER_LIMIT_EVENT,
|
||||||
CORE_LEVEL) != 0)
|
CORE_LEVEL) != 0)
|
||||||
mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
|
mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
|
||||||
|
|
||||||
if (cpu_has(c, X86_FEATURE_PTS)) {
|
if (this_cpu_has(X86_FEATURE_PTS)) {
|
||||||
rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
|
rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
|
||||||
if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
|
if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
|
||||||
THERMAL_THROTTLING_EVENT,
|
THERMAL_THROTTLING_EVENT,
|
||||||
PACKAGE_LEVEL) != 0)
|
PACKAGE_LEVEL) != 0)
|
||||||
mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
|
mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
|
||||||
if (cpu_has(c, X86_FEATURE_PLN))
|
if (this_cpu_has(X86_FEATURE_PLN))
|
||||||
if (therm_throt_process(msr_val &
|
if (therm_throt_process(msr_val &
|
||||||
PACKAGE_THERM_STATUS_POWER_LIMIT,
|
PACKAGE_THERM_STATUS_POWER_LIMIT,
|
||||||
POWER_LIMIT_EVENT,
|
POWER_LIMIT_EVENT,
|
||||||
|
@ -715,7 +715,7 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static int
|
static int __init
|
||||||
check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
|
check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
|
||||||
{
|
{
|
||||||
if (!mpc_new_phys || count <= mpc_new_length) {
|
if (!mpc_new_phys || count <= mpc_new_length) {
|
||||||
|
@ -449,7 +449,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
|
|||||||
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
|
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
|
||||||
{
|
{
|
||||||
if (!need_resched()) {
|
if (!need_resched()) {
|
||||||
if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR))
|
if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
|
||||||
clflush((void *)&current_thread_info()->flags);
|
clflush((void *)&current_thread_info()->flags);
|
||||||
|
|
||||||
__monitor((void *)&current_thread_info()->flags, 0, 0);
|
__monitor((void *)&current_thread_info()->flags, 0, 0);
|
||||||
@ -465,7 +465,7 @@ static void mwait_idle(void)
|
|||||||
if (!need_resched()) {
|
if (!need_resched()) {
|
||||||
trace_power_start(POWER_CSTATE, 1, smp_processor_id());
|
trace_power_start(POWER_CSTATE, 1, smp_processor_id());
|
||||||
trace_cpu_idle(1, smp_processor_id());
|
trace_cpu_idle(1, smp_processor_id());
|
||||||
if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR))
|
if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
|
||||||
clflush((void *)&current_thread_info()->flags);
|
clflush((void *)&current_thread_info()->flags);
|
||||||
|
|
||||||
__monitor((void *)&current_thread_info()->flags, 0, 0);
|
__monitor((void *)&current_thread_info()->flags, 0, 0);
|
||||||
|
@ -1332,9 +1332,9 @@ static inline void mwait_play_dead(void)
|
|||||||
void *mwait_ptr;
|
void *mwait_ptr;
|
||||||
struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info);
|
struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info);
|
||||||
|
|
||||||
if (!(cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)))
|
if (!this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c))
|
||||||
return;
|
return;
|
||||||
if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLSH))
|
if (!this_cpu_has(X86_FEATURE_CLFLSH))
|
||||||
return;
|
return;
|
||||||
if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
|
if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
|
||||||
return;
|
return;
|
||||||
|
@ -23,8 +23,8 @@ mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
|
|||||||
obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
|
obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
|
||||||
|
|
||||||
obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
|
obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
|
||||||
obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o
|
obj-$(CONFIG_AMD_NUMA) += amdtopology.o
|
||||||
obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
|
obj-$(CONFIG_ACPI_NUMA) += srat.o
|
||||||
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
|
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
|
||||||
|
|
||||||
obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
|
obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
|
||||||
|
@ -12,6 +12,7 @@
|
|||||||
#include <linux/module.h>
|
#include <linux/module.h>
|
||||||
#include <linux/nodemask.h>
|
#include <linux/nodemask.h>
|
||||||
#include <linux/memblock.h>
|
#include <linux/memblock.h>
|
||||||
|
#include <linux/bootmem.h>
|
||||||
|
|
||||||
#include <asm/io.h>
|
#include <asm/io.h>
|
||||||
#include <linux/pci_ids.h>
|
#include <linux/pci_ids.h>
|
||||||
@ -69,10 +70,10 @@ static __init void early_get_boot_cpu_id(void)
|
|||||||
|
|
||||||
int __init amd_numa_init(void)
|
int __init amd_numa_init(void)
|
||||||
{
|
{
|
||||||
unsigned long start = PFN_PHYS(0);
|
u64 start = PFN_PHYS(0);
|
||||||
unsigned long end = PFN_PHYS(max_pfn);
|
u64 end = PFN_PHYS(max_pfn);
|
||||||
unsigned numnodes;
|
unsigned numnodes;
|
||||||
unsigned long prevbase;
|
u64 prevbase;
|
||||||
int i, j, nb;
|
int i, j, nb;
|
||||||
u32 nodeid, reg;
|
u32 nodeid, reg;
|
||||||
unsigned int bits, cores, apicid_base;
|
unsigned int bits, cores, apicid_base;
|
||||||
@ -95,7 +96,7 @@ int __init amd_numa_init(void)
|
|||||||
|
|
||||||
prevbase = 0;
|
prevbase = 0;
|
||||||
for (i = 0; i < 8; i++) {
|
for (i = 0; i < 8; i++) {
|
||||||
unsigned long base, limit;
|
u64 base, limit;
|
||||||
|
|
||||||
base = read_pci_config(0, nb, 1, 0x40 + i*8);
|
base = read_pci_config(0, nb, 1, 0x40 + i*8);
|
||||||
limit = read_pci_config(0, nb, 1, 0x44 + i*8);
|
limit = read_pci_config(0, nb, 1, 0x44 + i*8);
|
||||||
@ -107,18 +108,18 @@ int __init amd_numa_init(void)
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (nodeid >= numnodes) {
|
if (nodeid >= numnodes) {
|
||||||
pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid,
|
pr_info("Ignoring excess node %d (%Lx:%Lx)\n", nodeid,
|
||||||
base, limit);
|
base, limit);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!limit) {
|
if (!limit) {
|
||||||
pr_info("Skipping node entry %d (base %lx)\n",
|
pr_info("Skipping node entry %d (base %Lx)\n",
|
||||||
i, base);
|
i, base);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if ((base >> 8) & 3 || (limit >> 8) & 3) {
|
if ((base >> 8) & 3 || (limit >> 8) & 3) {
|
||||||
pr_err("Node %d using interleaving mode %lx/%lx\n",
|
pr_err("Node %d using interleaving mode %Lx/%Lx\n",
|
||||||
nodeid, (base >> 8) & 3, (limit >> 8) & 3);
|
nodeid, (base >> 8) & 3, (limit >> 8) & 3);
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
}
|
}
|
||||||
@ -150,19 +151,19 @@ int __init amd_numa_init(void)
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (limit < base) {
|
if (limit < base) {
|
||||||
pr_err("Node %d bogus settings %lx-%lx.\n",
|
pr_err("Node %d bogus settings %Lx-%Lx.\n",
|
||||||
nodeid, base, limit);
|
nodeid, base, limit);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Could sort here, but punt for now. Should not happen anyroads. */
|
/* Could sort here, but punt for now. Should not happen anyroads. */
|
||||||
if (prevbase > base) {
|
if (prevbase > base) {
|
||||||
pr_err("Node map not sorted %lx,%lx\n",
|
pr_err("Node map not sorted %Lx,%Lx\n",
|
||||||
prevbase, base);
|
prevbase, base);
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
}
|
}
|
||||||
|
|
||||||
pr_info("Node %d MemBase %016lx Limit %016lx\n",
|
pr_info("Node %d MemBase %016Lx Limit %016Lx\n",
|
||||||
nodeid, base, limit);
|
nodeid, base, limit);
|
||||||
|
|
||||||
prevbase = base;
|
prevbase = base;
|
@ -678,8 +678,10 @@ static void __init zone_sizes_init(void)
|
|||||||
{
|
{
|
||||||
unsigned long max_zone_pfns[MAX_NR_ZONES];
|
unsigned long max_zone_pfns[MAX_NR_ZONES];
|
||||||
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
|
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
|
||||||
|
#ifdef CONFIG_ZONE_DMA
|
||||||
max_zone_pfns[ZONE_DMA] =
|
max_zone_pfns[ZONE_DMA] =
|
||||||
virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
|
virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
|
||||||
|
#endif
|
||||||
max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
|
max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
|
||||||
#ifdef CONFIG_HIGHMEM
|
#ifdef CONFIG_HIGHMEM
|
||||||
max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
|
max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
|
||||||
@ -716,6 +718,7 @@ void __init paging_init(void)
|
|||||||
* NOTE: at this point the bootmem allocator is fully available.
|
* NOTE: at this point the bootmem allocator is fully available.
|
||||||
*/
|
*/
|
||||||
olpc_dt_build_devicetree();
|
olpc_dt_build_devicetree();
|
||||||
|
sparse_memory_present_with_active_regions(MAX_NUMNODES);
|
||||||
sparse_init();
|
sparse_init();
|
||||||
zone_sizes_init();
|
zone_sizes_init();
|
||||||
}
|
}
|
||||||
|
@ -616,7 +616,9 @@ void __init paging_init(void)
|
|||||||
unsigned long max_zone_pfns[MAX_NR_ZONES];
|
unsigned long max_zone_pfns[MAX_NR_ZONES];
|
||||||
|
|
||||||
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
|
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
|
||||||
|
#ifdef CONFIG_ZONE_DMA
|
||||||
max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
|
max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
|
||||||
|
#endif
|
||||||
max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
|
max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
|
||||||
max_zone_pfns[ZONE_NORMAL] = max_pfn;
|
max_zone_pfns[ZONE_NORMAL] = max_pfn;
|
||||||
|
|
||||||
@ -679,14 +681,6 @@ int arch_add_memory(int nid, u64 start, u64 size)
|
|||||||
}
|
}
|
||||||
EXPORT_SYMBOL_GPL(arch_add_memory);
|
EXPORT_SYMBOL_GPL(arch_add_memory);
|
||||||
|
|
||||||
#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
|
|
||||||
int memory_add_physaddr_to_nid(u64 start)
|
|
||||||
{
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif /* CONFIG_MEMORY_HOTPLUG */
|
#endif /* CONFIG_MEMORY_HOTPLUG */
|
||||||
|
|
||||||
static struct kcore_list kcore_vsyscall;
|
static struct kcore_list kcore_vsyscall;
|
||||||
|
@ -90,13 +90,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
|
|||||||
if (is_ISA_range(phys_addr, last_addr))
|
if (is_ISA_range(phys_addr, last_addr))
|
||||||
return (__force void __iomem *)phys_to_virt(phys_addr);
|
return (__force void __iomem *)phys_to_virt(phys_addr);
|
||||||
|
|
||||||
/*
|
|
||||||
* Check if the request spans more than any BAR in the iomem resource
|
|
||||||
* tree.
|
|
||||||
*/
|
|
||||||
WARN_ONCE(iomem_map_sanity_check(phys_addr, size),
|
|
||||||
KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Don't allow anybody to remap normal RAM that we're using..
|
* Don't allow anybody to remap normal RAM that we're using..
|
||||||
*/
|
*/
|
||||||
@ -170,6 +163,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
|
|||||||
ret_addr = (void __iomem *) (vaddr + offset);
|
ret_addr = (void __iomem *) (vaddr + offset);
|
||||||
mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
|
mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Check if the request spans more than any BAR in the iomem resource
|
||||||
|
* tree.
|
||||||
|
*/
|
||||||
|
WARN_ONCE(iomem_map_sanity_check(unaligned_phys_addr, unaligned_size),
|
||||||
|
KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
|
||||||
|
|
||||||
return ret_addr;
|
return ret_addr;
|
||||||
err_free_area:
|
err_free_area:
|
||||||
free_vm_area(area);
|
free_vm_area(area);
|
||||||
|
@ -1,11 +1,39 @@
|
|||||||
/* Common code for 32 and 64-bit NUMA */
|
/* Common code for 32 and 64-bit NUMA */
|
||||||
#include <linux/topology.h>
|
#include <linux/kernel.h>
|
||||||
#include <linux/module.h>
|
#include <linux/mm.h>
|
||||||
|
#include <linux/string.h>
|
||||||
|
#include <linux/init.h>
|
||||||
#include <linux/bootmem.h>
|
#include <linux/bootmem.h>
|
||||||
#include <asm/numa.h>
|
#include <linux/memblock.h>
|
||||||
|
#include <linux/mmzone.h>
|
||||||
|
#include <linux/ctype.h>
|
||||||
|
#include <linux/module.h>
|
||||||
|
#include <linux/nodemask.h>
|
||||||
|
#include <linux/sched.h>
|
||||||
|
#include <linux/topology.h>
|
||||||
|
|
||||||
|
#include <asm/e820.h>
|
||||||
|
#include <asm/proto.h>
|
||||||
|
#include <asm/dma.h>
|
||||||
#include <asm/acpi.h>
|
#include <asm/acpi.h>
|
||||||
|
#include <asm/amd_nb.h>
|
||||||
|
|
||||||
|
#include "numa_internal.h"
|
||||||
|
|
||||||
int __initdata numa_off;
|
int __initdata numa_off;
|
||||||
|
nodemask_t numa_nodes_parsed __initdata;
|
||||||
|
|
||||||
|
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
|
||||||
|
EXPORT_SYMBOL(node_data);
|
||||||
|
|
||||||
|
static struct numa_meminfo numa_meminfo
|
||||||
|
#ifndef CONFIG_MEMORY_HOTPLUG
|
||||||
|
__initdata
|
||||||
|
#endif
|
||||||
|
;
|
||||||
|
|
||||||
|
static int numa_distance_cnt;
|
||||||
|
static u8 *numa_distance;
|
||||||
|
|
||||||
static __init int numa_setup(char *opt)
|
static __init int numa_setup(char *opt)
|
||||||
{
|
{
|
||||||
@ -32,6 +60,15 @@ s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
|
|||||||
[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
|
[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
|
||||||
};
|
};
|
||||||
|
|
||||||
|
int __cpuinit numa_cpu_node(int cpu)
|
||||||
|
{
|
||||||
|
int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
|
||||||
|
|
||||||
|
if (apicid != BAD_APICID)
|
||||||
|
return __apicid_to_node[apicid];
|
||||||
|
return NUMA_NO_NODE;
|
||||||
|
}
|
||||||
|
|
||||||
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
|
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
|
||||||
EXPORT_SYMBOL(node_to_cpumask_map);
|
EXPORT_SYMBOL(node_to_cpumask_map);
|
||||||
|
|
||||||
@ -95,6 +132,407 @@ void __init setup_node_to_cpumask_map(void)
|
|||||||
pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
|
pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
|
||||||
|
struct numa_meminfo *mi)
|
||||||
|
{
|
||||||
|
/* ignore zero length blks */
|
||||||
|
if (start == end)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
/* whine about and ignore invalid blks */
|
||||||
|
if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
|
||||||
|
pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
|
||||||
|
nid, start, end);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (mi->nr_blks >= NR_NODE_MEMBLKS) {
|
||||||
|
pr_err("NUMA: too many memblk ranges\n");
|
||||||
|
return -EINVAL;
|
||||||
|
}
|
||||||
|
|
||||||
|
mi->blk[mi->nr_blks].start = start;
|
||||||
|
mi->blk[mi->nr_blks].end = end;
|
||||||
|
mi->blk[mi->nr_blks].nid = nid;
|
||||||
|
mi->nr_blks++;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
|
||||||
|
* @idx: Index of memblk to remove
|
||||||
|
* @mi: numa_meminfo to remove memblk from
|
||||||
|
*
|
||||||
|
* Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
|
||||||
|
* decrementing @mi->nr_blks.
|
||||||
|
*/
|
||||||
|
void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
|
||||||
|
{
|
||||||
|
mi->nr_blks--;
|
||||||
|
memmove(&mi->blk[idx], &mi->blk[idx + 1],
|
||||||
|
(mi->nr_blks - idx) * sizeof(mi->blk[0]));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* numa_add_memblk - Add one numa_memblk to numa_meminfo
|
||||||
|
* @nid: NUMA node ID of the new memblk
|
||||||
|
* @start: Start address of the new memblk
|
||||||
|
* @end: End address of the new memblk
|
||||||
|
*
|
||||||
|
* Add a new memblk to the default numa_meminfo.
|
||||||
|
*
|
||||||
|
* RETURNS:
|
||||||
|
* 0 on success, -errno on failure.
|
||||||
|
*/
|
||||||
|
int __init numa_add_memblk(int nid, u64 start, u64 end)
|
||||||
|
{
|
||||||
|
return numa_add_memblk_to(nid, start, end, &numa_meminfo);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Initialize NODE_DATA for a node on the local memory */
|
||||||
|
static void __init setup_node_data(int nid, u64 start, u64 end)
|
||||||
|
{
|
||||||
|
const u64 nd_low = PFN_PHYS(MAX_DMA_PFN);
|
||||||
|
const u64 nd_high = PFN_PHYS(max_pfn_mapped);
|
||||||
|
const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
|
||||||
|
bool remapped = false;
|
||||||
|
u64 nd_pa;
|
||||||
|
void *nd;
|
||||||
|
int tnid;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Don't confuse VM with a node that doesn't have the
|
||||||
|
* minimum amount of memory:
|
||||||
|
*/
|
||||||
|
if (end && (end - start) < NODE_MIN_SIZE)
|
||||||
|
return;
|
||||||
|
|
||||||
|
/* initialize remap allocator before aligning to ZONE_ALIGN */
|
||||||
|
init_alloc_remap(nid, start, end);
|
||||||
|
|
||||||
|
start = roundup(start, ZONE_ALIGN);
|
||||||
|
|
||||||
|
printk(KERN_INFO "Initmem setup node %d %016Lx-%016Lx\n",
|
||||||
|
nid, start, end);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Allocate node data. Try remap allocator first, node-local
|
||||||
|
* memory and then any node. Never allocate in DMA zone.
|
||||||
|
*/
|
||||||
|
nd = alloc_remap(nid, nd_size);
|
||||||
|
if (nd) {
|
||||||
|
nd_pa = __pa(nd);
|
||||||
|
remapped = true;
|
||||||
|
} else {
|
||||||
|
nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high,
|
||||||
|
nd_size, SMP_CACHE_BYTES);
|
||||||
|
if (nd_pa == MEMBLOCK_ERROR)
|
||||||
|
nd_pa = memblock_find_in_range(nd_low, nd_high,
|
||||||
|
nd_size, SMP_CACHE_BYTES);
|
||||||
|
if (nd_pa == MEMBLOCK_ERROR) {
|
||||||
|
pr_err("Cannot find %zu bytes in node %d\n",
|
||||||
|
nd_size, nid);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA");
|
||||||
|
nd = __va(nd_pa);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* report and initialize */
|
||||||
|
printk(KERN_INFO " NODE_DATA [%016Lx - %016Lx]%s\n",
|
||||||
|
nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : "");
|
||||||
|
tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
|
||||||
|
if (!remapped && tnid != nid)
|
||||||
|
printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
|
||||||
|
|
||||||
|
node_data[nid] = nd;
|
||||||
|
memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
|
||||||
|
NODE_DATA(nid)->node_id = nid;
|
||||||
|
NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT;
|
||||||
|
NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT;
|
||||||
|
|
||||||
|
node_set_online(nid);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* numa_cleanup_meminfo - Cleanup a numa_meminfo
|
||||||
|
* @mi: numa_meminfo to clean up
|
||||||
|
*
|
||||||
|
* Sanitize @mi by merging and removing unnecessary memblks. Also check for
|
||||||
|
* conflicts and clear unused memblks.
|
||||||
|
*
|
||||||
|
* RETURNS:
|
||||||
|
* 0 on success, -errno on failure.
|
||||||
|
*/
|
||||||
|
int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
|
||||||
|
{
|
||||||
|
const u64 low = 0;
|
||||||
|
const u64 high = PFN_PHYS(max_pfn);
|
||||||
|
int i, j, k;
|
||||||
|
|
||||||
|
/* first, trim all entries */
|
||||||
|
for (i = 0; i < mi->nr_blks; i++) {
|
||||||
|
struct numa_memblk *bi = &mi->blk[i];
|
||||||
|
|
||||||
|
/* make sure all blocks are inside the limits */
|
||||||
|
bi->start = max(bi->start, low);
|
||||||
|
bi->end = min(bi->end, high);
|
||||||
|
|
||||||
|
/* and there's no empty block */
|
||||||
|
if (bi->start >= bi->end)
|
||||||
|
numa_remove_memblk_from(i--, mi);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* merge neighboring / overlapping entries */
|
||||||
|
for (i = 0; i < mi->nr_blks; i++) {
|
||||||
|
struct numa_memblk *bi = &mi->blk[i];
|
||||||
|
|
||||||
|
for (j = i + 1; j < mi->nr_blks; j++) {
|
||||||
|
struct numa_memblk *bj = &mi->blk[j];
|
||||||
|
u64 start, end;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* See whether there are overlapping blocks. Whine
|
||||||
|
* about but allow overlaps of the same nid. They
|
||||||
|
* will be merged below.
|
||||||
|
*/
|
||||||
|
if (bi->end > bj->start && bi->start < bj->end) {
|
||||||
|
if (bi->nid != bj->nid) {
|
||||||
|
pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
|
||||||
|
bi->nid, bi->start, bi->end,
|
||||||
|
bj->nid, bj->start, bj->end);
|
||||||
|
return -EINVAL;
|
||||||
|
}
|
||||||
|
pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
|
||||||
|
bi->nid, bi->start, bi->end,
|
||||||
|
bj->start, bj->end);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Join together blocks on the same node, holes
|
||||||
|
* between which don't overlap with memory on other
|
||||||
|
* nodes.
|
||||||
|
*/
|
||||||
|
if (bi->nid != bj->nid)
|
||||||
|
continue;
|
||||||
|
start = min(bi->start, bj->start);
|
||||||
|
end = max(bi->end, bj->end);
|
||||||
|
for (k = 0; k < mi->nr_blks; k++) {
|
||||||
|
struct numa_memblk *bk = &mi->blk[k];
|
||||||
|
|
||||||
|
if (bi->nid == bk->nid)
|
||||||
|
continue;
|
||||||
|
if (start < bk->end && end > bk->start)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (k < mi->nr_blks)
|
||||||
|
continue;
|
||||||
|
printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%Lx,%Lx)\n",
|
||||||
|
bi->nid, bi->start, bi->end, bj->start, bj->end,
|
||||||
|
start, end);
|
||||||
|
bi->start = start;
|
||||||
|
bi->end = end;
|
||||||
|
numa_remove_memblk_from(j--, mi);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* clear unused ones */
|
||||||
|
for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
|
||||||
|
mi->blk[i].start = mi->blk[i].end = 0;
|
||||||
|
mi->blk[i].nid = NUMA_NO_NODE;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Set nodes, which have memory in @mi, in *@nodemask.
|
||||||
|
*/
|
||||||
|
static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
|
||||||
|
const struct numa_meminfo *mi)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
|
||||||
|
if (mi->blk[i].start != mi->blk[i].end &&
|
||||||
|
mi->blk[i].nid != NUMA_NO_NODE)
|
||||||
|
node_set(mi->blk[i].nid, *nodemask);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* numa_reset_distance - Reset NUMA distance table
|
||||||
|
*
|
||||||
|
* The current table is freed. The next numa_set_distance() call will
|
||||||
|
* create a new one.
|
||||||
|
*/
|
||||||
|
void __init numa_reset_distance(void)
|
||||||
|
{
|
||||||
|
size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
|
||||||
|
|
||||||
|
/* numa_distance could be 1LU marking allocation failure, test cnt */
|
||||||
|
if (numa_distance_cnt)
|
||||||
|
memblock_x86_free_range(__pa(numa_distance),
|
||||||
|
__pa(numa_distance) + size);
|
||||||
|
numa_distance_cnt = 0;
|
||||||
|
numa_distance = NULL; /* enable table creation */
|
||||||
|
}
|
||||||
|
|
||||||
|
static int __init numa_alloc_distance(void)
|
||||||
|
{
|
||||||
|
nodemask_t nodes_parsed;
|
||||||
|
size_t size;
|
||||||
|
int i, j, cnt = 0;
|
||||||
|
u64 phys;
|
||||||
|
|
||||||
|
/* size the new table and allocate it */
|
||||||
|
nodes_parsed = numa_nodes_parsed;
|
||||||
|
numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
|
||||||
|
|
||||||
|
for_each_node_mask(i, nodes_parsed)
|
||||||
|
cnt = i;
|
||||||
|
cnt++;
|
||||||
|
size = cnt * cnt * sizeof(numa_distance[0]);
|
||||||
|
|
||||||
|
phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
|
||||||
|
size, PAGE_SIZE);
|
||||||
|
if (phys == MEMBLOCK_ERROR) {
|
||||||
|
pr_warning("NUMA: Warning: can't allocate distance table!\n");
|
||||||
|
/* don't retry until explicitly reset */
|
||||||
|
numa_distance = (void *)1LU;
|
||||||
|
return -ENOMEM;
|
||||||
|
}
|
||||||
|
memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");
|
||||||
|
|
||||||
|
numa_distance = __va(phys);
|
||||||
|
numa_distance_cnt = cnt;
|
||||||
|
|
||||||
|
/* fill with the default distances */
|
||||||
|
for (i = 0; i < cnt; i++)
|
||||||
|
for (j = 0; j < cnt; j++)
|
||||||
|
numa_distance[i * cnt + j] = i == j ?
|
||||||
|
LOCAL_DISTANCE : REMOTE_DISTANCE;
|
||||||
|
printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* numa_set_distance - Set NUMA distance from one NUMA to another
|
||||||
|
* @from: the 'from' node to set distance
|
||||||
|
* @to: the 'to' node to set distance
|
||||||
|
* @distance: NUMA distance
|
||||||
|
*
|
||||||
|
* Set the distance from node @from to @to to @distance. If distance table
|
||||||
|
* doesn't exist, one which is large enough to accommodate all the currently
|
||||||
|
* known nodes will be created.
|
||||||
|
*
|
||||||
|
* If such table cannot be allocated, a warning is printed and further
|
||||||
|
* calls are ignored until the distance table is reset with
|
||||||
|
* numa_reset_distance().
|
||||||
|
*
|
||||||
|
* If @from or @to is higher than the highest known node at the time of
|
||||||
|
* table creation or @distance doesn't make sense, the call is ignored.
|
||||||
|
* This is to allow simplification of specific NUMA config implementations.
|
||||||
|
*/
|
||||||
|
void __init numa_set_distance(int from, int to, int distance)
|
||||||
|
{
|
||||||
|
if (!numa_distance && numa_alloc_distance() < 0)
|
||||||
|
return;
|
||||||
|
|
||||||
|
if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
|
||||||
|
printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
|
||||||
|
from, to, distance);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((u8)distance != distance ||
|
||||||
|
(from == to && distance != LOCAL_DISTANCE)) {
|
||||||
|
pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
|
||||||
|
from, to, distance);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
numa_distance[from * numa_distance_cnt + to] = distance;
|
||||||
|
}
|
||||||
|
|
||||||
|
int __node_distance(int from, int to)
|
||||||
|
{
|
||||||
|
if (from >= numa_distance_cnt || to >= numa_distance_cnt)
|
||||||
|
return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
|
||||||
|
return numa_distance[from * numa_distance_cnt + to];
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(__node_distance);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Sanity check to catch more bad NUMA configurations (they are amazingly
|
||||||
|
* common). Make sure the nodes cover all memory.
|
||||||
|
*/
|
||||||
|
static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
|
||||||
|
{
|
||||||
|
u64 numaram, e820ram;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
numaram = 0;
|
||||||
|
for (i = 0; i < mi->nr_blks; i++) {
|
||||||
|
u64 s = mi->blk[i].start >> PAGE_SHIFT;
|
||||||
|
u64 e = mi->blk[i].end >> PAGE_SHIFT;
|
||||||
|
numaram += e - s;
|
||||||
|
numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
|
||||||
|
if ((s64)numaram < 0)
|
||||||
|
numaram = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
e820ram = max_pfn - (memblock_x86_hole_size(0,
|
||||||
|
PFN_PHYS(max_pfn)) >> PAGE_SHIFT);
|
||||||
|
/* We seem to lose 3 pages somewhere. Allow 1M of slack. */
|
||||||
|
if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
|
||||||
|
printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n",
|
||||||
|
(numaram << PAGE_SHIFT) >> 20,
|
||||||
|
(e820ram << PAGE_SHIFT) >> 20);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int __init numa_register_memblks(struct numa_meminfo *mi)
|
||||||
|
{
|
||||||
|
int i, nid;
|
||||||
|
|
||||||
|
/* Account for nodes with cpus and no memory */
|
||||||
|
node_possible_map = numa_nodes_parsed;
|
||||||
|
numa_nodemask_from_meminfo(&node_possible_map, mi);
|
||||||
|
if (WARN_ON(nodes_empty(node_possible_map)))
|
||||||
|
return -EINVAL;
|
||||||
|
|
||||||
|
for (i = 0; i < mi->nr_blks; i++)
|
||||||
|
memblock_x86_register_active_regions(mi->blk[i].nid,
|
||||||
|
mi->blk[i].start >> PAGE_SHIFT,
|
||||||
|
mi->blk[i].end >> PAGE_SHIFT);
|
||||||
|
|
||||||
|
/* for out of order entries */
|
||||||
|
sort_node_map();
|
||||||
|
if (!numa_meminfo_cover_memory(mi))
|
||||||
|
return -EINVAL;
|
||||||
|
|
||||||
|
/* Finally register nodes. */
|
||||||
|
for_each_node_mask(nid, node_possible_map) {
|
||||||
|
u64 start = PFN_PHYS(max_pfn);
|
||||||
|
u64 end = 0;
|
||||||
|
|
||||||
|
for (i = 0; i < mi->nr_blks; i++) {
|
||||||
|
if (nid != mi->blk[i].nid)
|
||||||
|
continue;
|
||||||
|
start = min(mi->blk[i].start, start);
|
||||||
|
end = max(mi->blk[i].end, end);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (start < end)
|
||||||
|
setup_node_data(nid, start, end);
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* There are unfortunately some poorly designed mainboards around that
|
* There are unfortunately some poorly designed mainboards around that
|
||||||
* only connect memory to a single CPU. This breaks the 1:1 cpu->node
|
* only connect memory to a single CPU. This breaks the 1:1 cpu->node
|
||||||
@ -102,7 +540,7 @@ void __init setup_node_to_cpumask_map(void)
|
|||||||
* as the number of CPUs is not known yet. We round robin the existing
|
* as the number of CPUs is not known yet. We round robin the existing
|
||||||
* nodes.
|
* nodes.
|
||||||
*/
|
*/
|
||||||
void __init numa_init_array(void)
|
static void __init numa_init_array(void)
|
||||||
{
|
{
|
||||||
int rr, i;
|
int rr, i;
|
||||||
|
|
||||||
@ -117,6 +555,95 @@ void __init numa_init_array(void)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int __init numa_init(int (*init_func)(void))
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
for (i = 0; i < MAX_LOCAL_APIC; i++)
|
||||||
|
set_apicid_to_node(i, NUMA_NO_NODE);
|
||||||
|
|
||||||
|
nodes_clear(numa_nodes_parsed);
|
||||||
|
nodes_clear(node_possible_map);
|
||||||
|
nodes_clear(node_online_map);
|
||||||
|
memset(&numa_meminfo, 0, sizeof(numa_meminfo));
|
||||||
|
remove_all_active_ranges();
|
||||||
|
numa_reset_distance();
|
||||||
|
|
||||||
|
ret = init_func();
|
||||||
|
if (ret < 0)
|
||||||
|
return ret;
|
||||||
|
ret = numa_cleanup_meminfo(&numa_meminfo);
|
||||||
|
if (ret < 0)
|
||||||
|
return ret;
|
||||||
|
|
||||||
|
numa_emulation(&numa_meminfo, numa_distance_cnt);
|
||||||
|
|
||||||
|
ret = numa_register_memblks(&numa_meminfo);
|
||||||
|
if (ret < 0)
|
||||||
|
return ret;
|
||||||
|
|
||||||
|
for (i = 0; i < nr_cpu_ids; i++) {
|
||||||
|
int nid = early_cpu_to_node(i);
|
||||||
|
|
||||||
|
if (nid == NUMA_NO_NODE)
|
||||||
|
continue;
|
||||||
|
if (!node_online(nid))
|
||||||
|
numa_clear_node(i);
|
||||||
|
}
|
||||||
|
numa_init_array();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* dummy_numa_init - Fallback dummy NUMA init
|
||||||
|
*
|
||||||
|
* Used if there's no underlying NUMA architecture, NUMA initialization
|
||||||
|
* fails, or NUMA is disabled on the command line.
|
||||||
|
*
|
||||||
|
* Must online at least one node and add memory blocks that cover all
|
||||||
|
* allowed memory. This function must not fail.
|
||||||
|
*/
|
||||||
|
static int __init dummy_numa_init(void)
|
||||||
|
{
|
||||||
|
printk(KERN_INFO "%s\n",
|
||||||
|
numa_off ? "NUMA turned off" : "No NUMA configuration found");
|
||||||
|
printk(KERN_INFO "Faking a node at %016Lx-%016Lx\n",
|
||||||
|
0LLU, PFN_PHYS(max_pfn));
|
||||||
|
|
||||||
|
node_set(0, numa_nodes_parsed);
|
||||||
|
numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* x86_numa_init - Initialize NUMA
|
||||||
|
*
|
||||||
|
* Try each configured NUMA initialization method until one succeeds. The
|
||||||
|
* last fallback is dummy single node config encomapssing whole memory and
|
||||||
|
* never fails.
|
||||||
|
*/
|
||||||
|
void __init x86_numa_init(void)
|
||||||
|
{
|
||||||
|
if (!numa_off) {
|
||||||
|
#ifdef CONFIG_X86_NUMAQ
|
||||||
|
if (!numa_init(numaq_numa_init))
|
||||||
|
return;
|
||||||
|
#endif
|
||||||
|
#ifdef CONFIG_ACPI_NUMA
|
||||||
|
if (!numa_init(x86_acpi_numa_init))
|
||||||
|
return;
|
||||||
|
#endif
|
||||||
|
#ifdef CONFIG_AMD_NUMA
|
||||||
|
if (!numa_init(amd_numa_init))
|
||||||
|
return;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
numa_init(dummy_numa_init);
|
||||||
|
}
|
||||||
|
|
||||||
static __init int find_near_online_node(int node)
|
static __init int find_near_online_node(int node)
|
||||||
{
|
{
|
||||||
int n, val;
|
int n, val;
|
||||||
@ -282,3 +809,18 @@ const struct cpumask *cpumask_of_node(int node)
|
|||||||
EXPORT_SYMBOL(cpumask_of_node);
|
EXPORT_SYMBOL(cpumask_of_node);
|
||||||
|
|
||||||
#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
|
#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
|
||||||
|
|
||||||
|
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||||
|
int memory_add_physaddr_to_nid(u64 start)
|
||||||
|
{
|
||||||
|
struct numa_meminfo *mi = &numa_meminfo;
|
||||||
|
int nid = mi->blk[0].nid;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for (i = 0; i < mi->nr_blks; i++)
|
||||||
|
if (mi->blk[i].start <= start && mi->blk[i].end > start)
|
||||||
|
nid = mi->blk[i].nid;
|
||||||
|
return nid;
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
|
||||||
|
#endif
|
||||||
|
@ -22,39 +22,11 @@
|
|||||||
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <linux/mm.h>
|
|
||||||
#include <linux/bootmem.h>
|
#include <linux/bootmem.h>
|
||||||
#include <linux/memblock.h>
|
#include <linux/memblock.h>
|
||||||
#include <linux/mmzone.h>
|
|
||||||
#include <linux/highmem.h>
|
|
||||||
#include <linux/initrd.h>
|
|
||||||
#include <linux/nodemask.h>
|
|
||||||
#include <linux/module.h>
|
#include <linux/module.h>
|
||||||
#include <linux/kexec.h>
|
|
||||||
#include <linux/pfn.h>
|
|
||||||
#include <linux/swap.h>
|
|
||||||
#include <linux/acpi.h>
|
|
||||||
|
|
||||||
#include <asm/e820.h>
|
|
||||||
#include <asm/setup.h>
|
|
||||||
#include <asm/mmzone.h>
|
|
||||||
#include <asm/bios_ebda.h>
|
|
||||||
#include <asm/proto.h>
|
|
||||||
|
|
||||||
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
|
|
||||||
EXPORT_SYMBOL(node_data);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* numa interface - we expect the numa architecture specific code to have
|
|
||||||
* populated the following initialisation.
|
|
||||||
*
|
|
||||||
* 1) node_online_map - the map of all nodes configured (online) in the system
|
|
||||||
* 2) node_start_pfn - the starting page frame number for a node
|
|
||||||
* 3) node_end_pfn - the ending page fram number for a node
|
|
||||||
*/
|
|
||||||
unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly;
|
|
||||||
unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
|
|
||||||
|
|
||||||
|
#include "numa_internal.h"
|
||||||
|
|
||||||
#ifdef CONFIG_DISCONTIGMEM
|
#ifdef CONFIG_DISCONTIGMEM
|
||||||
/*
|
/*
|
||||||
@ -99,108 +71,46 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
extern unsigned long find_max_low_pfn(void);
|
|
||||||
extern unsigned long highend_pfn, highstart_pfn;
|
extern unsigned long highend_pfn, highstart_pfn;
|
||||||
|
|
||||||
#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
|
#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
|
||||||
|
|
||||||
unsigned long node_remap_size[MAX_NUMNODES];
|
|
||||||
static void *node_remap_start_vaddr[MAX_NUMNODES];
|
static void *node_remap_start_vaddr[MAX_NUMNODES];
|
||||||
void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
|
void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
|
||||||
|
|
||||||
static unsigned long kva_start_pfn;
|
|
||||||
static unsigned long kva_pages;
|
|
||||||
|
|
||||||
int __cpuinit numa_cpu_node(int cpu)
|
|
||||||
{
|
|
||||||
return apic->x86_32_numa_cpu_node(cpu);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* FLAT - support for basic PC memory model with discontig enabled, essentially
|
* Remap memory allocator
|
||||||
* a single node with all available processors in it with a flat
|
|
||||||
* memory map.
|
|
||||||
*/
|
|
||||||
int __init get_memcfg_numa_flat(void)
|
|
||||||
{
|
|
||||||
printk(KERN_DEBUG "NUMA - single node, flat memory mode\n");
|
|
||||||
|
|
||||||
node_start_pfn[0] = 0;
|
|
||||||
node_end_pfn[0] = max_pfn;
|
|
||||||
memblock_x86_register_active_regions(0, 0, max_pfn);
|
|
||||||
memory_present(0, 0, max_pfn);
|
|
||||||
node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
|
|
||||||
|
|
||||||
/* Indicate there is one node available. */
|
|
||||||
nodes_clear(node_online_map);
|
|
||||||
node_set_online(0);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Find the highest page frame number we have available for the node
|
|
||||||
*/
|
|
||||||
static void __init propagate_e820_map_node(int nid)
|
|
||||||
{
|
|
||||||
if (node_end_pfn[nid] > max_pfn)
|
|
||||||
node_end_pfn[nid] = max_pfn;
|
|
||||||
/*
|
|
||||||
* if a user has given mem=XXXX, then we need to make sure
|
|
||||||
* that the node _starts_ before that, too, not just ends
|
|
||||||
*/
|
|
||||||
if (node_start_pfn[nid] > max_pfn)
|
|
||||||
node_start_pfn[nid] = max_pfn;
|
|
||||||
BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Allocate memory for the pg_data_t for this node via a crude pre-bootmem
|
|
||||||
* method. For node zero take this from the bottom of memory, for
|
|
||||||
* subsequent nodes place them at node_remap_start_vaddr which contains
|
|
||||||
* node local data in physically node local memory. See setup_memory()
|
|
||||||
* for details.
|
|
||||||
*/
|
|
||||||
static void __init allocate_pgdat(int nid)
|
|
||||||
{
|
|
||||||
char buf[16];
|
|
||||||
|
|
||||||
if (node_has_online_mem(nid) && node_remap_start_vaddr[nid])
|
|
||||||
NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
|
|
||||||
else {
|
|
||||||
unsigned long pgdat_phys;
|
|
||||||
pgdat_phys = memblock_find_in_range(min_low_pfn<<PAGE_SHIFT,
|
|
||||||
max_pfn_mapped<<PAGE_SHIFT,
|
|
||||||
sizeof(pg_data_t),
|
|
||||||
PAGE_SIZE);
|
|
||||||
NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
|
|
||||||
memset(buf, 0, sizeof(buf));
|
|
||||||
sprintf(buf, "NODE_DATA %d", nid);
|
|
||||||
memblock_x86_reserve_range(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf);
|
|
||||||
}
|
|
||||||
printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
|
|
||||||
nid, (unsigned long)NODE_DATA(nid));
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel
|
|
||||||
* virtual address space (KVA) is reserved and portions of nodes are mapped
|
|
||||||
* using it. This is to allow node-local memory to be allocated for
|
|
||||||
* structures that would normally require ZONE_NORMAL. The memory is
|
|
||||||
* allocated with alloc_remap() and callers should be prepared to allocate
|
|
||||||
* from the bootmem allocator instead.
|
|
||||||
*/
|
*/
|
||||||
static unsigned long node_remap_start_pfn[MAX_NUMNODES];
|
static unsigned long node_remap_start_pfn[MAX_NUMNODES];
|
||||||
static void *node_remap_end_vaddr[MAX_NUMNODES];
|
static void *node_remap_end_vaddr[MAX_NUMNODES];
|
||||||
static void *node_remap_alloc_vaddr[MAX_NUMNODES];
|
static void *node_remap_alloc_vaddr[MAX_NUMNODES];
|
||||||
static unsigned long node_remap_offset[MAX_NUMNODES];
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* alloc_remap - Allocate remapped memory
|
||||||
|
* @nid: NUMA node to allocate memory from
|
||||||
|
* @size: The size of allocation
|
||||||
|
*
|
||||||
|
* Allocate @size bytes from the remap area of NUMA node @nid. The
|
||||||
|
* size of the remap area is predetermined by init_alloc_remap() and
|
||||||
|
* only the callers considered there should call this function. For
|
||||||
|
* more info, please read the comment on top of init_alloc_remap().
|
||||||
|
*
|
||||||
|
* The caller must be ready to handle allocation failure from this
|
||||||
|
* function and fall back to regular memory allocator in such cases.
|
||||||
|
*
|
||||||
|
* CONTEXT:
|
||||||
|
* Single CPU early boot context.
|
||||||
|
*
|
||||||
|
* RETURNS:
|
||||||
|
* Pointer to the allocated memory on success, %NULL on failure.
|
||||||
|
*/
|
||||||
void *alloc_remap(int nid, unsigned long size)
|
void *alloc_remap(int nid, unsigned long size)
|
||||||
{
|
{
|
||||||
void *allocation = node_remap_alloc_vaddr[nid];
|
void *allocation = node_remap_alloc_vaddr[nid];
|
||||||
|
|
||||||
size = ALIGN(size, L1_CACHE_BYTES);
|
size = ALIGN(size, L1_CACHE_BYTES);
|
||||||
|
|
||||||
if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid])
|
if (!allocation || (allocation + size) > node_remap_end_vaddr[nid])
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
node_remap_alloc_vaddr[nid] += size;
|
node_remap_alloc_vaddr[nid] += size;
|
||||||
@ -209,26 +119,6 @@ void *alloc_remap(int nid, unsigned long size)
|
|||||||
return allocation;
|
return allocation;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void __init remap_numa_kva(void)
|
|
||||||
{
|
|
||||||
void *vaddr;
|
|
||||||
unsigned long pfn;
|
|
||||||
int node;
|
|
||||||
|
|
||||||
for_each_online_node(node) {
|
|
||||||
printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
|
|
||||||
for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
|
|
||||||
vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
|
|
||||||
printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
|
|
||||||
(unsigned long)vaddr,
|
|
||||||
node_remap_start_pfn[node] + pfn);
|
|
||||||
set_pmd_pfn((ulong) vaddr,
|
|
||||||
node_remap_start_pfn[node] + pfn,
|
|
||||||
PAGE_KERNEL_LARGE);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef CONFIG_HIBERNATION
|
#ifdef CONFIG_HIBERNATION
|
||||||
/**
|
/**
|
||||||
* resume_map_numa_kva - add KVA mapping to the temporary page tables created
|
* resume_map_numa_kva - add KVA mapping to the temporary page tables created
|
||||||
@ -240,15 +130,16 @@ void resume_map_numa_kva(pgd_t *pgd_base)
|
|||||||
int node;
|
int node;
|
||||||
|
|
||||||
for_each_online_node(node) {
|
for_each_online_node(node) {
|
||||||
unsigned long start_va, start_pfn, size, pfn;
|
unsigned long start_va, start_pfn, nr_pages, pfn;
|
||||||
|
|
||||||
start_va = (unsigned long)node_remap_start_vaddr[node];
|
start_va = (unsigned long)node_remap_start_vaddr[node];
|
||||||
start_pfn = node_remap_start_pfn[node];
|
start_pfn = node_remap_start_pfn[node];
|
||||||
size = node_remap_size[node];
|
nr_pages = (node_remap_end_vaddr[node] -
|
||||||
|
node_remap_start_vaddr[node]) >> PAGE_SHIFT;
|
||||||
|
|
||||||
printk(KERN_DEBUG "%s: node %d\n", __func__, node);
|
printk(KERN_DEBUG "%s: node %d\n", __func__, node);
|
||||||
|
|
||||||
for (pfn = 0; pfn < size; pfn += PTRS_PER_PTE) {
|
for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) {
|
||||||
unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
|
unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
|
||||||
pgd_t *pgd = pgd_base + pgd_index(vaddr);
|
pgd_t *pgd = pgd_base + pgd_index(vaddr);
|
||||||
pud_t *pud = pud_offset(pgd, vaddr);
|
pud_t *pud = pud_offset(pgd, vaddr);
|
||||||
@ -264,132 +155,89 @@ void resume_map_numa_kva(pgd_t *pgd_base)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static __init unsigned long calculate_numa_remap_pages(void)
|
/**
|
||||||
|
* init_alloc_remap - Initialize remap allocator for a NUMA node
|
||||||
|
* @nid: NUMA node to initizlie remap allocator for
|
||||||
|
*
|
||||||
|
* NUMA nodes may end up without any lowmem. As allocating pgdat and
|
||||||
|
* memmap on a different node with lowmem is inefficient, a special
|
||||||
|
* remap allocator is implemented which can be used by alloc_remap().
|
||||||
|
*
|
||||||
|
* For each node, the amount of memory which will be necessary for
|
||||||
|
* pgdat and memmap is calculated and two memory areas of the size are
|
||||||
|
* allocated - one in the node and the other in lowmem; then, the area
|
||||||
|
* in the node is remapped to the lowmem area.
|
||||||
|
*
|
||||||
|
* As pgdat and memmap must be allocated in lowmem anyway, this
|
||||||
|
* doesn't waste lowmem address space; however, the actual lowmem
|
||||||
|
* which gets remapped over is wasted. The amount shouldn't be
|
||||||
|
* problematic on machines this feature will be used.
|
||||||
|
*
|
||||||
|
* Initialization failure isn't fatal. alloc_remap() is used
|
||||||
|
* opportunistically and the callers will fall back to other memory
|
||||||
|
* allocation mechanisms on failure.
|
||||||
|
*/
|
||||||
|
void __init init_alloc_remap(int nid, u64 start, u64 end)
|
||||||
{
|
{
|
||||||
int nid;
|
unsigned long start_pfn = start >> PAGE_SHIFT;
|
||||||
unsigned long size, reserve_pages = 0;
|
unsigned long end_pfn = end >> PAGE_SHIFT;
|
||||||
|
unsigned long size, pfn;
|
||||||
for_each_online_node(nid) {
|
u64 node_pa, remap_pa;
|
||||||
u64 node_kva_target;
|
void *remap_va;
|
||||||
u64 node_kva_final;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The acpi/srat node info can show hot-add memroy zones
|
* The acpi/srat node info can show hot-add memroy zones where
|
||||||
* where memory could be added but not currently present.
|
* memory could be added but not currently present.
|
||||||
*/
|
*/
|
||||||
printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
|
printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
|
||||||
nid, node_start_pfn[nid], node_end_pfn[nid]);
|
nid, start_pfn, end_pfn);
|
||||||
if (node_start_pfn[nid] > max_pfn)
|
|
||||||
continue;
|
|
||||||
if (!node_end_pfn[nid])
|
|
||||||
continue;
|
|
||||||
if (node_end_pfn[nid] > max_pfn)
|
|
||||||
node_end_pfn[nid] = max_pfn;
|
|
||||||
|
|
||||||
/* ensure the remap includes space for the pgdat. */
|
/* calculate the necessary space aligned to large page size */
|
||||||
size = node_remap_size[nid] + sizeof(pg_data_t);
|
size = node_memmap_size_bytes(nid, start_pfn, end_pfn);
|
||||||
|
size += ALIGN(sizeof(pg_data_t), PAGE_SIZE);
|
||||||
|
size = ALIGN(size, LARGE_PAGE_BYTES);
|
||||||
|
|
||||||
/* convert size to large (pmd size) pages, rounding up */
|
/* allocate node memory and the lowmem remap area */
|
||||||
size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
|
node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES);
|
||||||
/* now the roundup is correct, convert to PAGE_SIZE pages */
|
if (node_pa == MEMBLOCK_ERROR) {
|
||||||
size = size * PTRS_PER_PTE;
|
pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n",
|
||||||
|
size, nid);
|
||||||
node_kva_target = round_down(node_end_pfn[nid] - size,
|
return;
|
||||||
PTRS_PER_PTE);
|
|
||||||
node_kva_target <<= PAGE_SHIFT;
|
|
||||||
do {
|
|
||||||
node_kva_final = memblock_find_in_range(node_kva_target,
|
|
||||||
((u64)node_end_pfn[nid])<<PAGE_SHIFT,
|
|
||||||
((u64)size)<<PAGE_SHIFT,
|
|
||||||
LARGE_PAGE_BYTES);
|
|
||||||
node_kva_target -= LARGE_PAGE_BYTES;
|
|
||||||
} while (node_kva_final == MEMBLOCK_ERROR &&
|
|
||||||
(node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
|
|
||||||
|
|
||||||
if (node_kva_final == MEMBLOCK_ERROR)
|
|
||||||
panic("Can not get kva ram\n");
|
|
||||||
|
|
||||||
node_remap_size[nid] = size;
|
|
||||||
node_remap_offset[nid] = reserve_pages;
|
|
||||||
reserve_pages += size;
|
|
||||||
printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of"
|
|
||||||
" node %d at %llx\n",
|
|
||||||
size, nid, node_kva_final>>PAGE_SHIFT);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* prevent kva address below max_low_pfn want it on system
|
|
||||||
* with less memory later.
|
|
||||||
* layout will be: KVA address , KVA RAM
|
|
||||||
*
|
|
||||||
* we are supposed to only record the one less then max_low_pfn
|
|
||||||
* but we could have some hole in high memory, and it will only
|
|
||||||
* check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
|
|
||||||
* to use it as free.
|
|
||||||
* So memblock_x86_reserve_range here, hope we don't run out of that array
|
|
||||||
*/
|
|
||||||
memblock_x86_reserve_range(node_kva_final,
|
|
||||||
node_kva_final+(((u64)size)<<PAGE_SHIFT),
|
|
||||||
"KVA RAM");
|
|
||||||
|
|
||||||
node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
|
|
||||||
}
|
}
|
||||||
printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
|
memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM");
|
||||||
reserve_pages);
|
|
||||||
return reserve_pages;
|
remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
|
||||||
|
max_low_pfn << PAGE_SHIFT,
|
||||||
|
size, LARGE_PAGE_BYTES);
|
||||||
|
if (remap_pa == MEMBLOCK_ERROR) {
|
||||||
|
pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n",
|
||||||
|
size, nid);
|
||||||
|
memblock_x86_free_range(node_pa, node_pa + size);
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG");
|
||||||
|
remap_va = phys_to_virt(remap_pa);
|
||||||
|
|
||||||
static void init_remap_allocator(int nid)
|
/* perform actual remap */
|
||||||
{
|
for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE)
|
||||||
node_remap_start_vaddr[nid] = pfn_to_kaddr(
|
set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT),
|
||||||
kva_start_pfn + node_remap_offset[nid]);
|
(node_pa >> PAGE_SHIFT) + pfn,
|
||||||
node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
|
PAGE_KERNEL_LARGE);
|
||||||
(node_remap_size[nid] * PAGE_SIZE);
|
|
||||||
node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
|
|
||||||
ALIGN(sizeof(pg_data_t), PAGE_SIZE);
|
|
||||||
|
|
||||||
printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid,
|
/* initialize remap allocator parameters */
|
||||||
(ulong) node_remap_start_vaddr[nid],
|
node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT;
|
||||||
(ulong) node_remap_end_vaddr[nid]);
|
node_remap_start_vaddr[nid] = remap_va;
|
||||||
|
node_remap_end_vaddr[nid] = remap_va + size;
|
||||||
|
node_remap_alloc_vaddr[nid] = remap_va;
|
||||||
|
|
||||||
|
printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n",
|
||||||
|
nid, node_pa, node_pa + size, remap_va, remap_va + size);
|
||||||
}
|
}
|
||||||
|
|
||||||
void __init initmem_init(void)
|
void __init initmem_init(void)
|
||||||
{
|
{
|
||||||
int nid;
|
x86_numa_init();
|
||||||
long kva_target_pfn;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* When mapping a NUMA machine we allocate the node_mem_map arrays
|
|
||||||
* from node local memory. They are then mapped directly into KVA
|
|
||||||
* between zone normal and vmalloc space. Calculate the size of
|
|
||||||
* this space and use it to adjust the boundary between ZONE_NORMAL
|
|
||||||
* and ZONE_HIGHMEM.
|
|
||||||
*/
|
|
||||||
|
|
||||||
get_memcfg_numa();
|
|
||||||
numa_init_array();
|
|
||||||
|
|
||||||
kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
|
|
||||||
|
|
||||||
kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
|
|
||||||
do {
|
|
||||||
kva_start_pfn = memblock_find_in_range(kva_target_pfn<<PAGE_SHIFT,
|
|
||||||
max_low_pfn<<PAGE_SHIFT,
|
|
||||||
kva_pages<<PAGE_SHIFT,
|
|
||||||
PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
|
|
||||||
kva_target_pfn -= PTRS_PER_PTE;
|
|
||||||
} while (kva_start_pfn == MEMBLOCK_ERROR && kva_target_pfn > min_low_pfn);
|
|
||||||
|
|
||||||
if (kva_start_pfn == MEMBLOCK_ERROR)
|
|
||||||
panic("Can not get kva space\n");
|
|
||||||
|
|
||||||
printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
|
|
||||||
kva_start_pfn, max_low_pfn);
|
|
||||||
printk(KERN_INFO "max_pfn = %lx\n", max_pfn);
|
|
||||||
|
|
||||||
/* avoid clash with initrd */
|
|
||||||
memblock_x86_reserve_range(kva_start_pfn<<PAGE_SHIFT,
|
|
||||||
(kva_start_pfn + kva_pages)<<PAGE_SHIFT,
|
|
||||||
"KVA PG");
|
|
||||||
#ifdef CONFIG_HIGHMEM
|
#ifdef CONFIG_HIGHMEM
|
||||||
highstart_pfn = highend_pfn = max_pfn;
|
highstart_pfn = highend_pfn = max_pfn;
|
||||||
if (max_pfn > max_low_pfn)
|
if (max_pfn > max_low_pfn)
|
||||||
@ -409,51 +257,9 @@ void __init initmem_init(void)
|
|||||||
|
|
||||||
printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
|
printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
|
||||||
(ulong) pfn_to_kaddr(max_low_pfn));
|
(ulong) pfn_to_kaddr(max_low_pfn));
|
||||||
for_each_online_node(nid) {
|
|
||||||
init_remap_allocator(nid);
|
|
||||||
|
|
||||||
allocate_pgdat(nid);
|
|
||||||
}
|
|
||||||
remap_numa_kva();
|
|
||||||
|
|
||||||
printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
|
printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
|
||||||
(ulong) pfn_to_kaddr(highstart_pfn));
|
(ulong) pfn_to_kaddr(highstart_pfn));
|
||||||
for_each_online_node(nid)
|
|
||||||
propagate_e820_map_node(nid);
|
|
||||||
|
|
||||||
for_each_online_node(nid) {
|
|
||||||
memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
|
|
||||||
NODE_DATA(nid)->node_id = nid;
|
|
||||||
}
|
|
||||||
|
|
||||||
setup_bootmem_allocator();
|
setup_bootmem_allocator();
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CONFIG_MEMORY_HOTPLUG
|
|
||||||
static int paddr_to_nid(u64 addr)
|
|
||||||
{
|
|
||||||
int nid;
|
|
||||||
unsigned long pfn = PFN_DOWN(addr);
|
|
||||||
|
|
||||||
for_each_node(nid)
|
|
||||||
if (node_start_pfn[nid] <= pfn &&
|
|
||||||
pfn < node_end_pfn[nid])
|
|
||||||
return nid;
|
|
||||||
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* This function is used to ask node id BEFORE memmap and mem_section's
|
|
||||||
* initialization (pfn_to_nid() can't be used yet).
|
|
||||||
* If _PXM is not defined on ACPI's DSDT, node id must be found by this.
|
|
||||||
*/
|
|
||||||
int memory_add_physaddr_to_nid(u64 addr)
|
|
||||||
{
|
|
||||||
int nid = paddr_to_nid(addr);
|
|
||||||
return (nid >= 0) ? nid : 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
@ -2,646 +2,13 @@
|
|||||||
* Generic VM initialization for x86-64 NUMA setups.
|
* Generic VM initialization for x86-64 NUMA setups.
|
||||||
* Copyright 2002,2003 Andi Kleen, SuSE Labs.
|
* Copyright 2002,2003 Andi Kleen, SuSE Labs.
|
||||||
*/
|
*/
|
||||||
#include <linux/kernel.h>
|
|
||||||
#include <linux/mm.h>
|
|
||||||
#include <linux/string.h>
|
|
||||||
#include <linux/init.h>
|
|
||||||
#include <linux/bootmem.h>
|
#include <linux/bootmem.h>
|
||||||
#include <linux/memblock.h>
|
|
||||||
#include <linux/mmzone.h>
|
|
||||||
#include <linux/ctype.h>
|
|
||||||
#include <linux/module.h>
|
|
||||||
#include <linux/nodemask.h>
|
|
||||||
#include <linux/sched.h>
|
|
||||||
#include <linux/acpi.h>
|
|
||||||
|
|
||||||
#include <asm/e820.h>
|
|
||||||
#include <asm/proto.h>
|
|
||||||
#include <asm/dma.h>
|
|
||||||
#include <asm/acpi.h>
|
|
||||||
#include <asm/amd_nb.h>
|
|
||||||
|
|
||||||
#include "numa_internal.h"
|
#include "numa_internal.h"
|
||||||
|
|
||||||
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
|
|
||||||
EXPORT_SYMBOL(node_data);
|
|
||||||
|
|
||||||
nodemask_t numa_nodes_parsed __initdata;
|
|
||||||
|
|
||||||
struct memnode memnode;
|
|
||||||
|
|
||||||
static unsigned long __initdata nodemap_addr;
|
|
||||||
static unsigned long __initdata nodemap_size;
|
|
||||||
|
|
||||||
static struct numa_meminfo numa_meminfo __initdata;
|
|
||||||
|
|
||||||
static int numa_distance_cnt;
|
|
||||||
static u8 *numa_distance;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Given a shift value, try to populate memnodemap[]
|
|
||||||
* Returns :
|
|
||||||
* 1 if OK
|
|
||||||
* 0 if memnodmap[] too small (of shift too small)
|
|
||||||
* -1 if node overlap or lost ram (shift too big)
|
|
||||||
*/
|
|
||||||
static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift)
|
|
||||||
{
|
|
||||||
unsigned long addr, end;
|
|
||||||
int i, res = -1;
|
|
||||||
|
|
||||||
memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
|
|
||||||
for (i = 0; i < mi->nr_blks; i++) {
|
|
||||||
addr = mi->blk[i].start;
|
|
||||||
end = mi->blk[i].end;
|
|
||||||
if (addr >= end)
|
|
||||||
continue;
|
|
||||||
if ((end >> shift) >= memnodemapsize)
|
|
||||||
return 0;
|
|
||||||
do {
|
|
||||||
if (memnodemap[addr >> shift] != NUMA_NO_NODE)
|
|
||||||
return -1;
|
|
||||||
memnodemap[addr >> shift] = mi->blk[i].nid;
|
|
||||||
addr += (1UL << shift);
|
|
||||||
} while (addr < end);
|
|
||||||
res = 1;
|
|
||||||
}
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int __init allocate_cachealigned_memnodemap(void)
|
|
||||||
{
|
|
||||||
unsigned long addr;
|
|
||||||
|
|
||||||
memnodemap = memnode.embedded_map;
|
|
||||||
if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
addr = 0x8000;
|
|
||||||
nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
|
|
||||||
nodemap_addr = memblock_find_in_range(addr, get_max_mapped(),
|
|
||||||
nodemap_size, L1_CACHE_BYTES);
|
|
||||||
if (nodemap_addr == MEMBLOCK_ERROR) {
|
|
||||||
printk(KERN_ERR
|
|
||||||
"NUMA: Unable to allocate Memory to Node hash map\n");
|
|
||||||
nodemap_addr = nodemap_size = 0;
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
memnodemap = phys_to_virt(nodemap_addr);
|
|
||||||
memblock_x86_reserve_range(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");
|
|
||||||
|
|
||||||
printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
|
|
||||||
nodemap_addr, nodemap_addr + nodemap_size);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* The LSB of all start and end addresses in the node map is the value of the
|
|
||||||
* maximum possible shift.
|
|
||||||
*/
|
|
||||||
static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi)
|
|
||||||
{
|
|
||||||
int i, nodes_used = 0;
|
|
||||||
unsigned long start, end;
|
|
||||||
unsigned long bitfield = 0, memtop = 0;
|
|
||||||
|
|
||||||
for (i = 0; i < mi->nr_blks; i++) {
|
|
||||||
start = mi->blk[i].start;
|
|
||||||
end = mi->blk[i].end;
|
|
||||||
if (start >= end)
|
|
||||||
continue;
|
|
||||||
bitfield |= start;
|
|
||||||
nodes_used++;
|
|
||||||
if (end > memtop)
|
|
||||||
memtop = end;
|
|
||||||
}
|
|
||||||
if (nodes_used <= 1)
|
|
||||||
i = 63;
|
|
||||||
else
|
|
||||||
i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
|
|
||||||
memnodemapsize = (memtop >> i)+1;
|
|
||||||
return i;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int __init compute_hash_shift(const struct numa_meminfo *mi)
|
|
||||||
{
|
|
||||||
int shift;
|
|
||||||
|
|
||||||
shift = extract_lsb_from_nodes(mi);
|
|
||||||
if (allocate_cachealigned_memnodemap())
|
|
||||||
return -1;
|
|
||||||
printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
|
|
||||||
shift);
|
|
||||||
|
|
||||||
if (populate_memnodemap(mi, shift) != 1) {
|
|
||||||
printk(KERN_INFO "Your memory is not aligned you need to "
|
|
||||||
"rebuild your kernel with a bigger NODEMAPSIZE "
|
|
||||||
"shift=%d\n", shift);
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
return shift;
|
|
||||||
}
|
|
||||||
|
|
||||||
int __meminit __early_pfn_to_nid(unsigned long pfn)
|
|
||||||
{
|
|
||||||
return phys_to_nid(pfn << PAGE_SHIFT);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void * __init early_node_mem(int nodeid, unsigned long start,
|
|
||||||
unsigned long end, unsigned long size,
|
|
||||||
unsigned long align)
|
|
||||||
{
|
|
||||||
unsigned long mem;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* put it on high as possible
|
|
||||||
* something will go with NODE_DATA
|
|
||||||
*/
|
|
||||||
if (start < (MAX_DMA_PFN<<PAGE_SHIFT))
|
|
||||||
start = MAX_DMA_PFN<<PAGE_SHIFT;
|
|
||||||
if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
|
|
||||||
end > (MAX_DMA32_PFN<<PAGE_SHIFT))
|
|
||||||
start = MAX_DMA32_PFN<<PAGE_SHIFT;
|
|
||||||
mem = memblock_x86_find_in_range_node(nodeid, start, end, size, align);
|
|
||||||
if (mem != MEMBLOCK_ERROR)
|
|
||||||
return __va(mem);
|
|
||||||
|
|
||||||
/* extend the search scope */
|
|
||||||
end = max_pfn_mapped << PAGE_SHIFT;
|
|
||||||
start = MAX_DMA_PFN << PAGE_SHIFT;
|
|
||||||
mem = memblock_find_in_range(start, end, size, align);
|
|
||||||
if (mem != MEMBLOCK_ERROR)
|
|
||||||
return __va(mem);
|
|
||||||
|
|
||||||
printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
|
|
||||||
size, nodeid);
|
|
||||||
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Append the range [@start, @end) of node @nid to @mi.
 *
 * Zero-length ranges are silently skipped; ranges with start > end or an
 * out-of-range node id are warned about and skipped.  Both cases return 0.
 * Returns -EINVAL when @mi has no free slot left.
 */
static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
				     struct numa_meminfo *mi)
{
	struct numa_memblk *slot;

	/* ignore zero length blks */
	if (start == end)
		return 0;

	/* whine about and ignore invalid blks */
	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
		pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
			   nid, start, end);
		return 0;
	}

	if (mi->nr_blks >= NR_NODE_MEMBLKS) {
		pr_err("NUMA: too many memblk ranges\n");
		return -EINVAL;
	}

	slot = &mi->blk[mi->nr_blks];
	slot->start = start;
	slot->end = end;
	slot->nid = nid;
	mi->nr_blks++;

	return 0;
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
|
|
||||||
* @idx: Index of memblk to remove
|
|
||||||
* @mi: numa_meminfo to remove memblk from
|
|
||||||
*
|
|
||||||
* Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
|
|
||||||
* decrementing @mi->nr_blks.
|
|
||||||
*/
|
|
||||||
void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
|
|
||||||
{
|
|
||||||
mi->nr_blks--;
|
|
||||||
memmove(&mi->blk[idx], &mi->blk[idx + 1],
|
|
||||||
(mi->nr_blks - idx) * sizeof(mi->blk[0]));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* numa_add_memblk - Add one numa_memblk to numa_meminfo
|
|
||||||
* @nid: NUMA node ID of the new memblk
|
|
||||||
* @start: Start address of the new memblk
|
|
||||||
* @end: End address of the new memblk
|
|
||||||
*
|
|
||||||
* Add a new memblk to the default numa_meminfo.
|
|
||||||
*
|
|
||||||
* RETURNS:
|
|
||||||
* 0 on success, -errno on failure.
|
|
||||||
*/
|
|
||||||
int __init numa_add_memblk(int nid, u64 start, u64 end)
|
|
||||||
{
|
|
||||||
return numa_add_memblk_to(nid, start, end, &numa_meminfo);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Initialize bootmem allocator for a node */
|
|
||||||
void __init
|
|
||||||
setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
|
|
||||||
{
|
|
||||||
unsigned long start_pfn, last_pfn, nodedata_phys;
|
|
||||||
const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
|
|
||||||
int nid;
|
|
||||||
|
|
||||||
if (!end)
|
|
||||||
return;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Don't confuse VM with a node that doesn't have the
|
|
||||||
* minimum amount of memory:
|
|
||||||
*/
|
|
||||||
if (end && (end - start) < NODE_MIN_SIZE)
|
|
||||||
return;
|
|
||||||
|
|
||||||
start = roundup(start, ZONE_ALIGN);
|
|
||||||
|
|
||||||
printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid,
|
|
||||||
start, end);
|
|
||||||
|
|
||||||
start_pfn = start >> PAGE_SHIFT;
|
|
||||||
last_pfn = end >> PAGE_SHIFT;
|
|
||||||
|
|
||||||
node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
|
|
||||||
SMP_CACHE_BYTES);
|
|
||||||
if (node_data[nodeid] == NULL)
|
|
||||||
return;
|
|
||||||
nodedata_phys = __pa(node_data[nodeid]);
|
|
||||||
memblock_x86_reserve_range(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
|
|
||||||
printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
|
|
||||||
nodedata_phys + pgdat_size - 1);
|
|
||||||
nid = phys_to_nid(nodedata_phys);
|
|
||||||
if (nid != nodeid)
|
|
||||||
printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
|
|
||||||
|
|
||||||
memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
|
|
||||||
NODE_DATA(nodeid)->node_id = nodeid;
|
|
||||||
NODE_DATA(nodeid)->node_start_pfn = start_pfn;
|
|
||||||
NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
|
|
||||||
|
|
||||||
node_set_online(nodeid);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* numa_cleanup_meminfo - Cleanup a numa_meminfo
|
|
||||||
* @mi: numa_meminfo to clean up
|
|
||||||
*
|
|
||||||
* Sanitize @mi by merging and removing unncessary memblks. Also check for
|
|
||||||
* conflicts and clear unused memblks.
|
|
||||||
*
|
|
||||||
* RETURNS:
|
|
||||||
* 0 on success, -errno on failure.
|
|
||||||
*/
|
|
||||||
int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
|
|
||||||
{
|
|
||||||
const u64 low = 0;
|
|
||||||
const u64 high = (u64)max_pfn << PAGE_SHIFT;
|
|
||||||
int i, j, k;
|
|
||||||
|
|
||||||
for (i = 0; i < mi->nr_blks; i++) {
|
|
||||||
struct numa_memblk *bi = &mi->blk[i];
|
|
||||||
|
|
||||||
/* make sure all blocks are inside the limits */
|
|
||||||
bi->start = max(bi->start, low);
|
|
||||||
bi->end = min(bi->end, high);
|
|
||||||
|
|
||||||
/* and there's no empty block */
|
|
||||||
if (bi->start >= bi->end) {
|
|
||||||
numa_remove_memblk_from(i--, mi);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (j = i + 1; j < mi->nr_blks; j++) {
|
|
||||||
struct numa_memblk *bj = &mi->blk[j];
|
|
||||||
unsigned long start, end;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* See whether there are overlapping blocks. Whine
|
|
||||||
* about but allow overlaps of the same nid. They
|
|
||||||
* will be merged below.
|
|
||||||
*/
|
|
||||||
if (bi->end > bj->start && bi->start < bj->end) {
|
|
||||||
if (bi->nid != bj->nid) {
|
|
||||||
pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
|
|
||||||
bi->nid, bi->start, bi->end,
|
|
||||||
bj->nid, bj->start, bj->end);
|
|
||||||
return -EINVAL;
|
|
||||||
}
|
|
||||||
pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
|
|
||||||
bi->nid, bi->start, bi->end,
|
|
||||||
bj->start, bj->end);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Join together blocks on the same node, holes
|
|
||||||
* between which don't overlap with memory on other
|
|
||||||
* nodes.
|
|
||||||
*/
|
|
||||||
if (bi->nid != bj->nid)
|
|
||||||
continue;
|
|
||||||
start = max(min(bi->start, bj->start), low);
|
|
||||||
end = min(max(bi->end, bj->end), high);
|
|
||||||
for (k = 0; k < mi->nr_blks; k++) {
|
|
||||||
struct numa_memblk *bk = &mi->blk[k];
|
|
||||||
|
|
||||||
if (bi->nid == bk->nid)
|
|
||||||
continue;
|
|
||||||
if (start < bk->end && end > bk->start)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (k < mi->nr_blks)
|
|
||||||
continue;
|
|
||||||
printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
|
|
||||||
bi->nid, bi->start, bi->end, bj->start, bj->end,
|
|
||||||
start, end);
|
|
||||||
bi->start = start;
|
|
||||||
bi->end = end;
|
|
||||||
numa_remove_memblk_from(j--, mi);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
|
|
||||||
mi->blk[i].start = mi->blk[i].end = 0;
|
|
||||||
mi->blk[i].nid = NUMA_NO_NODE;
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Set nodes, which have memory in @mi, in *@nodemask.
|
|
||||||
*/
|
|
||||||
static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
|
|
||||||
const struct numa_meminfo *mi)
|
|
||||||
{
|
|
||||||
int i;
|
|
||||||
|
|
||||||
for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
|
|
||||||
if (mi->blk[i].start != mi->blk[i].end &&
|
|
||||||
mi->blk[i].nid != NUMA_NO_NODE)
|
|
||||||
node_set(mi->blk[i].nid, *nodemask);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* numa_reset_distance - Reset NUMA distance table
|
|
||||||
*
|
|
||||||
* The current table is freed. The next numa_set_distance() call will
|
|
||||||
* create a new one.
|
|
||||||
*/
|
|
||||||
void __init numa_reset_distance(void)
|
|
||||||
{
|
|
||||||
size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
|
|
||||||
|
|
||||||
/* numa_distance could be 1LU marking allocation failure, test cnt */
|
|
||||||
if (numa_distance_cnt)
|
|
||||||
memblock_x86_free_range(__pa(numa_distance),
|
|
||||||
__pa(numa_distance) + size);
|
|
||||||
numa_distance_cnt = 0;
|
|
||||||
numa_distance = NULL; /* enable table creation */
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Allocate the distance table, sized by the highest node id found in
 * numa_nodes_parsed or numa_meminfo, and fill it with default distances.
 *
 * Returns 0 on success.  On allocation failure returns -ENOMEM and leaves
 * numa_distance set to a non-NULL marker so the allocation is not retried
 * until numa_reset_distance() is called.
 */
static int __init numa_alloc_distance(void)
{
	nodemask_t nodes_parsed = numa_nodes_parsed;
	int nid, row, col, cnt = 0;
	size_t size;
	u64 phys;

	/* size the new table and allocate it */
	numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);

	/* cnt ends up as the highest node id seen, plus one */
	for_each_node_mask(nid, nodes_parsed)
		cnt = nid;
	cnt++;
	size = cnt * cnt * sizeof(numa_distance[0]);

	phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT,
				      size, PAGE_SIZE);
	if (phys == MEMBLOCK_ERROR) {
		pr_warning("NUMA: Warning: can't allocate distance table!\n");
		/* don't retry until explicitly reset */
		numa_distance = (void *)1LU;
		return -ENOMEM;
	}
	memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");

	numa_distance = __va(phys);
	numa_distance_cnt = cnt;

	/* fill with the default distances */
	for (row = 0; row < cnt; row++) {
		for (col = 0; col < cnt; col++) {
			if (row == col)
				numa_distance[row * cnt + col] = LOCAL_DISTANCE;
			else
				numa_distance[row * cnt + col] = REMOTE_DISTANCE;
		}
	}
	printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);

	return 0;
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* numa_set_distance - Set NUMA distance from one NUMA to another
|
|
||||||
* @from: the 'from' node to set distance
|
|
||||||
* @to: the 'to' node to set distance
|
|
||||||
* @distance: NUMA distance
|
|
||||||
*
|
|
||||||
* Set the distance from node @from to @to to @distance. If distance table
|
|
||||||
* doesn't exist, one which is large enough to accommodate all the currently
|
|
||||||
* known nodes will be created.
|
|
||||||
*
|
|
||||||
* If such table cannot be allocated, a warning is printed and further
|
|
||||||
* calls are ignored until the distance table is reset with
|
|
||||||
* numa_reset_distance().
|
|
||||||
*
|
|
||||||
* If @from or @to is higher than the highest known node at the time of
|
|
||||||
* table creation or @distance doesn't make sense, the call is ignored.
|
|
||||||
* This is to allow simplification of specific NUMA config implementations.
|
|
||||||
*/
|
|
||||||
void __init numa_set_distance(int from, int to, int distance)
|
|
||||||
{
|
|
||||||
if (!numa_distance && numa_alloc_distance() < 0)
|
|
||||||
return;
|
|
||||||
|
|
||||||
if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
|
|
||||||
printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
|
|
||||||
from, to, distance);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ((u8)distance != distance ||
|
|
||||||
(from == to && distance != LOCAL_DISTANCE)) {
|
|
||||||
pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
|
|
||||||
from, to, distance);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
numa_distance[from * numa_distance_cnt + to] = distance;
|
|
||||||
}
|
|
||||||
|
|
||||||
int __node_distance(int from, int to)
|
|
||||||
{
|
|
||||||
if (from >= numa_distance_cnt || to >= numa_distance_cnt)
|
|
||||||
return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
|
|
||||||
return numa_distance[from * numa_distance_cnt + to];
|
|
||||||
}
|
|
||||||
EXPORT_SYMBOL(__node_distance);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Sanity check to catch more bad NUMA configurations (they are amazingly
|
|
||||||
* common). Make sure the nodes cover all memory.
|
|
||||||
*/
|
|
||||||
static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
|
|
||||||
{
|
|
||||||
unsigned long numaram, e820ram;
|
|
||||||
int i;
|
|
||||||
|
|
||||||
numaram = 0;
|
|
||||||
for (i = 0; i < mi->nr_blks; i++) {
|
|
||||||
unsigned long s = mi->blk[i].start >> PAGE_SHIFT;
|
|
||||||
unsigned long e = mi->blk[i].end >> PAGE_SHIFT;
|
|
||||||
numaram += e - s;
|
|
||||||
numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
|
|
||||||
if ((long)numaram < 0)
|
|
||||||
numaram = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
e820ram = max_pfn - (memblock_x86_hole_size(0,
|
|
||||||
max_pfn << PAGE_SHIFT) >> PAGE_SHIFT);
|
|
||||||
/* We seem to lose 3 pages somewhere. Allow 1M of slack. */
|
|
||||||
if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
|
|
||||||
printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n",
|
|
||||||
(numaram << PAGE_SHIFT) >> 20,
|
|
||||||
(e820ram << PAGE_SHIFT) >> 20);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Register the memblks of @mi as active regions and set up every possible
 * node that has memory.
 *
 * Returns 0 on success, -EINVAL when no node is known, no hash shift can
 * be computed, or the nodes do not cover the memory.
 */
static int __init numa_register_memblks(struct numa_meminfo *mi)
{
	int i, nid;

	/* Account for nodes with cpus and no memory */
	node_possible_map = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&node_possible_map, mi);
	if (WARN_ON(nodes_empty(node_possible_map)))
		return -EINVAL;

	memnode_shift = compute_hash_shift(mi);
	if (memnode_shift < 0) {
		printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n");
		return -EINVAL;
	}

	for (i = 0; i < mi->nr_blks; i++) {
		const struct numa_memblk *blk = &mi->blk[i];

		memblock_x86_register_active_regions(blk->nid,
						     blk->start >> PAGE_SHIFT,
						     blk->end >> PAGE_SHIFT);
	}

	/* for out of order entries */
	sort_node_map();
	if (!numa_meminfo_cover_memory(mi))
		return -EINVAL;

	/* Finally register nodes. */
	for_each_node_mask(nid, node_possible_map) {
		u64 lowest = (u64)max_pfn << PAGE_SHIFT;
		u64 highest = 0;

		/* span from the lowest start to the highest end of the node */
		for (i = 0; i < mi->nr_blks; i++) {
			const struct numa_memblk *blk = &mi->blk[i];

			if (blk->nid != nid)
				continue;
			lowest = min(blk->start, lowest);
			highest = max(blk->end, highest);
		}

		/* nodes without any memblk keep lowest >= highest */
		if (lowest < highest)
			setup_node_bootmem(nid, lowest, highest);
	}

	return 0;
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* dummy_numma_init - Fallback dummy NUMA init
|
|
||||||
*
|
|
||||||
* Used if there's no underlying NUMA architecture, NUMA initialization
|
|
||||||
* fails, or NUMA is disabled on the command line.
|
|
||||||
*
|
|
||||||
* Must online at least one node and add memory blocks that cover all
|
|
||||||
* allowed memory. This function must not fail.
|
|
||||||
*/
|
|
||||||
static int __init dummy_numa_init(void)
|
|
||||||
{
|
|
||||||
printk(KERN_INFO "%s\n",
|
|
||||||
numa_off ? "NUMA turned off" : "No NUMA configuration found");
|
|
||||||
printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
|
|
||||||
0LU, max_pfn << PAGE_SHIFT);
|
|
||||||
|
|
||||||
node_set(0, numa_nodes_parsed);
|
|
||||||
numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT);
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Reset the NUMA state, run @init_func to discover the configuration and
 * then clean up, optionally emulate, and register the result.
 *
 * Returns 0 on success or the first negative value returned by one of the
 * steps.
 */
static int __init numa_init(int (*init_func)(void))
{
	int apicid, cpu, err;

	/* start from a clean slate */
	for (apicid = 0; apicid < MAX_LOCAL_APIC; apicid++)
		set_apicid_to_node(apicid, NUMA_NO_NODE);

	nodes_clear(numa_nodes_parsed);
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
	remove_all_active_ranges();
	numa_reset_distance();

	err = init_func();
	if (err < 0)
		return err;

	err = numa_cleanup_meminfo(&numa_meminfo);
	if (err < 0)
		return err;

	numa_emulation(&numa_meminfo, numa_distance_cnt);

	err = numa_register_memblks(&numa_meminfo);
	if (err < 0)
		return err;

	/* drop the node mapping of cpus whose node did not come online */
	for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
		int nid = early_cpu_to_node(cpu);

		if (nid != NUMA_NO_NODE && !node_online(nid))
			numa_clear_node(cpu);
	}
	numa_init_array();
	return 0;
}
|
|
||||||
|
|
||||||
void __init initmem_init(void)
|
void __init initmem_init(void)
|
||||||
{
|
{
|
||||||
int ret;
|
x86_numa_init();
|
||||||
|
|
||||||
if (!numa_off) {
|
|
||||||
#ifdef CONFIG_ACPI_NUMA
|
|
||||||
ret = numa_init(x86_acpi_numa_init);
|
|
||||||
if (!ret)
|
|
||||||
return;
|
|
||||||
#endif
|
|
||||||
#ifdef CONFIG_AMD_NUMA
|
|
||||||
ret = numa_init(amd_numa_init);
|
|
||||||
if (!ret)
|
|
||||||
return;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
numa_init(dummy_numa_init);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned long __init numa_free_all_bootmem(void)
|
unsigned long __init numa_free_all_bootmem(void)
|
||||||
@ -656,12 +23,3 @@ unsigned long __init numa_free_all_bootmem(void)
|
|||||||
|
|
||||||
return pages;
|
return pages;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
 * Look up the node of @cpu through its APIC id.  Returns NUMA_NO_NODE when
 * the cpu's APIC id is BAD_APICID.
 */
int __cpuinit numa_cpu_node(int cpu)
{
	int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);

	if (apicid == BAD_APICID)
		return NUMA_NO_NODE;

	return __apicid_to_node[apicid];
}
|
|
||||||
|
@ -5,6 +5,7 @@
|
|||||||
#include <linux/errno.h>
|
#include <linux/errno.h>
|
||||||
#include <linux/topology.h>
|
#include <linux/topology.h>
|
||||||
#include <linux/memblock.h>
|
#include <linux/memblock.h>
|
||||||
|
#include <linux/bootmem.h>
|
||||||
#include <asm/dma.h>
|
#include <asm/dma.h>
|
||||||
|
|
||||||
#include "numa_internal.h"
|
#include "numa_internal.h"
|
||||||
@ -84,7 +85,13 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
|
|||||||
nr_nodes = MAX_NUMNODES;
|
nr_nodes = MAX_NUMNODES;
|
||||||
}
|
}
|
||||||
|
|
||||||
size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
|
/*
|
||||||
|
* Calculate target node size. x86_32 freaks on __udivdi3() so do
|
||||||
|
* the division in ulong number of pages and convert back.
|
||||||
|
*/
|
||||||
|
size = max_addr - addr - memblock_x86_hole_size(addr, max_addr);
|
||||||
|
size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Calculate the number of big nodes that can be allocated as a result
|
* Calculate the number of big nodes that can be allocated as a result
|
||||||
* of consolidating the remainder.
|
* of consolidating the remainder.
|
||||||
@ -226,7 +233,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
|
|||||||
*/
|
*/
|
||||||
while (nodes_weight(physnode_mask)) {
|
while (nodes_weight(physnode_mask)) {
|
||||||
for_each_node_mask(i, physnode_mask) {
|
for_each_node_mask(i, physnode_mask) {
|
||||||
u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
|
u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
|
||||||
u64 start, limit, end;
|
u64 start, limit, end;
|
||||||
int phys_blk;
|
int phys_blk;
|
||||||
|
|
||||||
@ -298,7 +305,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
|
|||||||
{
|
{
|
||||||
static struct numa_meminfo ei __initdata;
|
static struct numa_meminfo ei __initdata;
|
||||||
static struct numa_meminfo pi __initdata;
|
static struct numa_meminfo pi __initdata;
|
||||||
const u64 max_addr = max_pfn << PAGE_SHIFT;
|
const u64 max_addr = PFN_PHYS(max_pfn);
|
||||||
u8 *phys_dist = NULL;
|
u8 *phys_dist = NULL;
|
||||||
size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
|
size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
|
||||||
int max_emu_nid, dfl_phys_nid;
|
int max_emu_nid, dfl_phys_nid;
|
||||||
@ -342,8 +349,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
|
|||||||
if (numa_dist_cnt) {
|
if (numa_dist_cnt) {
|
||||||
u64 phys;
|
u64 phys;
|
||||||
|
|
||||||
phys = memblock_find_in_range(0,
|
phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
|
||||||
(u64)max_pfn_mapped << PAGE_SHIFT,
|
|
||||||
phys_size, PAGE_SIZE);
|
phys_size, PAGE_SIZE);
|
||||||
if (phys == MEMBLOCK_ERROR) {
|
if (phys == MEMBLOCK_ERROR) {
|
||||||
pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
|
pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
|
||||||
|
@ -19,6 +19,14 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
|
|||||||
int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
|
int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
|
||||||
void __init numa_reset_distance(void);
|
void __init numa_reset_distance(void);
|
||||||
|
|
||||||
|
void __init x86_numa_init(void);
|
||||||
|
|
||||||
|
#ifdef CONFIG_X86_64
|
||||||
|
static inline void init_alloc_remap(int nid, u64 start, u64 end) { }
|
||||||
|
#else
|
||||||
|
void __init init_alloc_remap(int nid, u64 start, u64 end);
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef CONFIG_NUMA_EMU
|
#ifdef CONFIG_NUMA_EMU
|
||||||
void __init numa_emulation(struct numa_meminfo *numa_meminfo,
|
void __init numa_emulation(struct numa_meminfo *numa_meminfo,
|
||||||
int numa_dist_cnt);
|
int numa_dist_cnt);
|
||||||
|
@ -26,8 +26,6 @@
|
|||||||
|
|
||||||
int acpi_numa __initdata;
|
int acpi_numa __initdata;
|
||||||
|
|
||||||
static struct bootnode nodes_add[MAX_NUMNODES];
|
|
||||||
|
|
||||||
static __init int setup_node(int pxm)
|
static __init int setup_node(int pxm)
|
||||||
{
|
{
|
||||||
return acpi_map_pxm_to_node(pxm);
|
return acpi_map_pxm_to_node(pxm);
|
||||||
@ -37,7 +35,6 @@ static __init void bad_srat(void)
|
|||||||
{
|
{
|
||||||
printk(KERN_ERR "SRAT: SRAT not used.\n");
|
printk(KERN_ERR "SRAT: SRAT not used.\n");
|
||||||
acpi_numa = -1;
|
acpi_numa = -1;
|
||||||
memset(nodes_add, 0, sizeof(nodes_add));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static __init inline int srat_disabled(void)
|
static __init inline int srat_disabled(void)
|
||||||
@ -131,73 +128,17 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
|
|||||||
pxm, apic_id, node);
|
pxm, apic_id, node);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
|
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||||
static inline int save_add_info(void) {return 1;}
|
static inline int save_add_info(void) {return 1;}
|
||||||
#else
|
#else
|
||||||
static inline int save_add_info(void) {return 0;}
|
static inline int save_add_info(void) {return 0;}
|
||||||
#endif
|
#endif
|
||||||
/*
|
|
||||||
* Update nodes_add[]
|
|
||||||
* This code supports one contiguous hot add area per node
|
|
||||||
*/
|
|
||||||
static void __init
|
|
||||||
update_nodes_add(int node, unsigned long start, unsigned long end)
|
|
||||||
{
|
|
||||||
unsigned long s_pfn = start >> PAGE_SHIFT;
|
|
||||||
unsigned long e_pfn = end >> PAGE_SHIFT;
|
|
||||||
int changed = 0;
|
|
||||||
struct bootnode *nd = &nodes_add[node];
|
|
||||||
|
|
||||||
/* I had some trouble with strange memory hotadd regions breaking
|
|
||||||
the boot. Be very strict here and reject anything unexpected.
|
|
||||||
If you want working memory hotadd write correct SRATs.
|
|
||||||
|
|
||||||
The node size check is a basic sanity check to guard against
|
|
||||||
mistakes */
|
|
||||||
if ((signed long)(end - start) < NODE_MIN_SIZE) {
|
|
||||||
printk(KERN_ERR "SRAT: Hotplug area too small\n");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* This check might be a bit too strict, but I'm keeping it for now. */
|
|
||||||
if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
|
|
||||||
printk(KERN_ERR
|
|
||||||
"SRAT: Hotplug area %lu -> %lu has existing memory\n",
|
|
||||||
s_pfn, e_pfn);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Looks good */
|
|
||||||
|
|
||||||
if (nd->start == nd->end) {
|
|
||||||
nd->start = start;
|
|
||||||
nd->end = end;
|
|
||||||
changed = 1;
|
|
||||||
} else {
|
|
||||||
if (nd->start == end) {
|
|
||||||
nd->start = start;
|
|
||||||
changed = 1;
|
|
||||||
}
|
|
||||||
if (nd->end == start) {
|
|
||||||
nd->end = end;
|
|
||||||
changed = 1;
|
|
||||||
}
|
|
||||||
if (!changed)
|
|
||||||
printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (changed) {
|
|
||||||
node_set(node, numa_nodes_parsed);
|
|
||||||
printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
|
|
||||||
nd->start, nd->end);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
|
/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
|
||||||
void __init
|
void __init
|
||||||
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
|
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
|
||||||
{
|
{
|
||||||
unsigned long start, end;
|
u64 start, end;
|
||||||
int node, pxm;
|
int node, pxm;
|
||||||
|
|
||||||
if (srat_disabled())
|
if (srat_disabled())
|
||||||
@ -226,11 +167,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
|
printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
|
||||||
start, end);
|
start, end);
|
||||||
|
|
||||||
if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)
|
|
||||||
update_nodes_add(node, start, end);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void __init acpi_numa_arch_fixup(void) {}
|
void __init acpi_numa_arch_fixup(void) {}
|
||||||
@ -244,17 +182,3 @@ int __init x86_acpi_numa_init(void)
|
|||||||
return ret;
|
return ret;
|
||||||
return srat_disabled() ? -EINVAL : 0;
|
return srat_disabled() ? -EINVAL : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
/*
 * Return the node whose recorded hot add area contains @start.  When
 * several areas match, the last matching node wins; when none does, the
 * result is 0.
 */
int memory_add_physaddr_to_nid(u64 start)
{
	int nid, match = 0;

	for_each_node(nid) {
		/* @start lies outside this node's hot add area */
		if (start < nodes_add[nid].start || start >= nodes_add[nid].end)
			continue;
		match = nid;
	}

	return match;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif
|
|
@ -1,288 +0,0 @@
|
|||||||
/*
|
|
||||||
* Some of the code in this file has been gleaned from the 64 bit
|
|
||||||
* discontigmem support code base.
|
|
||||||
*
|
|
||||||
* Copyright (C) 2002, IBM Corp.
|
|
||||||
*
|
|
||||||
* All rights reserved.
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation; either version 2 of the License, or
|
|
||||||
* (at your option) any later version.
|
|
||||||
*
|
|
||||||
* This program is distributed in the hope that it will be useful, but
|
|
||||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
|
||||||
* NON INFRINGEMENT. See the GNU General Public License for more
|
|
||||||
* details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License
|
|
||||||
* along with this program; if not, write to the Free Software
|
|
||||||
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
|
||||||
*
|
|
||||||
* Send feedback to Pat Gaughen <gone@us.ibm.com>
|
|
||||||
*/
|
|
||||||
#include <linux/mm.h>
|
|
||||||
#include <linux/bootmem.h>
|
|
||||||
#include <linux/memblock.h>
|
|
||||||
#include <linux/mmzone.h>
|
|
||||||
#include <linux/acpi.h>
|
|
||||||
#include <linux/nodemask.h>
|
|
||||||
#include <asm/srat.h>
|
|
||||||
#include <asm/topology.h>
|
|
||||||
#include <asm/smp.h>
|
|
||||||
#include <asm/e820.h>
|
|
||||||
|
|
||||||
/*
|
|
||||||
* proximity macros and definitions
|
|
||||||
*/
|
|
||||||
#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */
|
|
||||||
#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */
|
|
||||||
#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
|
|
||||||
#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
|
|
||||||
/* bitmap length; _PXM is at most 255 */
|
|
||||||
#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
|
|
||||||
static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
|
|
||||||
|
|
||||||
#define MAX_CHUNKS_PER_NODE 3
|
|
||||||
#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
|
|
||||||
struct node_memory_chunk_s {
|
|
||||||
unsigned long start_pfn;
|
|
||||||
unsigned long end_pfn;
|
|
||||||
u8 pxm; // proximity domain of node
|
|
||||||
u8 nid; // which cnode contains this chunk?
|
|
||||||
u8 bank; // which mem bank on this node
|
|
||||||
};
|
|
||||||
static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
|
|
||||||
|
|
||||||
static int __initdata num_memory_chunks; /* total number of memory chunks */
|
|
||||||
static u8 __initdata apicid_to_pxm[MAX_LOCAL_APIC];
|
|
||||||
|
|
||||||
int acpi_numa __initdata;
|
|
||||||
|
|
||||||
static __init void bad_srat(void)
|
|
||||||
{
|
|
||||||
printk(KERN_ERR "SRAT: SRAT not used.\n");
|
|
||||||
acpi_numa = -1;
|
|
||||||
num_memory_chunks = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static __init inline int srat_disabled(void)
|
|
||||||
{
|
|
||||||
return numa_off || acpi_numa < 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Identify CPU proximity domains */
|
|
||||||
void __init
|
|
||||||
acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
|
|
||||||
{
|
|
||||||
if (srat_disabled())
|
|
||||||
return;
|
|
||||||
if (cpu_affinity->header.length !=
|
|
||||||
sizeof(struct acpi_srat_cpu_affinity)) {
|
|
||||||
bad_srat();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
|
|
||||||
return; /* empty entry */
|
|
||||||
|
|
||||||
/* mark this node as "seen" in node bitmap */
|
|
||||||
BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
|
|
||||||
|
|
||||||
/* don't need to check apic_id here, because it is always 8 bits */
|
|
||||||
apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
|
|
||||||
|
|
||||||
printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
|
|
||||||
cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Identify memory proximity domains and hot-remove capabilities.
|
|
||||||
* Fill node memory chunk list structure.
|
|
||||||
*/
|
|
||||||
void __init
|
|
||||||
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity)
|
|
||||||
{
|
|
||||||
unsigned long long paddr, size;
|
|
||||||
unsigned long start_pfn, end_pfn;
|
|
||||||
u8 pxm;
|
|
||||||
struct node_memory_chunk_s *p, *q, *pend;
|
|
||||||
|
|
||||||
if (srat_disabled())
|
|
||||||
return;
|
|
||||||
if (memory_affinity->header.length !=
|
|
||||||
sizeof(struct acpi_srat_mem_affinity)) {
|
|
||||||
bad_srat();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
|
|
||||||
return; /* empty entry */
|
|
||||||
|
|
||||||
pxm = memory_affinity->proximity_domain & 0xff;
|
|
||||||
|
|
||||||
/* mark this node as "seen" in node bitmap */
|
|
||||||
BMAP_SET(pxm_bitmap, pxm);
|
|
||||||
|
|
||||||
/* calculate info for memory chunk structure */
|
|
||||||
paddr = memory_affinity->base_address;
|
|
||||||
size = memory_affinity->length;
|
|
||||||
|
|
||||||
start_pfn = paddr >> PAGE_SHIFT;
|
|
||||||
end_pfn = (paddr + size) >> PAGE_SHIFT;
|
|
||||||
|
|
||||||
|
|
||||||
if (num_memory_chunks >= MAXCHUNKS) {
|
|
||||||
printk(KERN_WARNING "Too many mem chunks in SRAT."
|
|
||||||
" Ignoring %lld MBytes at %llx\n",
|
|
||||||
size/(1024*1024), paddr);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Insertion sort based on base address */
|
|
||||||
pend = &node_memory_chunk[num_memory_chunks];
|
|
||||||
for (p = &node_memory_chunk[0]; p < pend; p++) {
|
|
||||||
if (start_pfn < p->start_pfn)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (p < pend) {
|
|
||||||
for (q = pend; q >= p; q--)
|
|
||||||
*(q + 1) = *q;
|
|
||||||
}
|
|
||||||
p->start_pfn = start_pfn;
|
|
||||||
p->end_pfn = end_pfn;
|
|
||||||
p->pxm = pxm;
|
|
||||||
|
|
||||||
num_memory_chunks++;
|
|
||||||
|
|
||||||
printk(KERN_DEBUG "Memory range %08lx to %08lx"
|
|
||||||
" in proximity domain %02x %s\n",
|
|
||||||
start_pfn, end_pfn,
|
|
||||||
pxm,
|
|
||||||
((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
|
|
||||||
"enabled and removable" : "enabled" ) );
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Callback for SLIT parsing.
 *
 * @slit: the SLIT table handed to the callback.
 *
 * The body is empty: @slit is not examined and no state is updated here.
 */
|
|
||||||
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Arch fixup hook. The body is empty: no fixup work is performed here. */
void acpi_numa_arch_fixup(void)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
/*
|
|
||||||
* The SRAT table always lists ascending addresses, so can always
|
|
||||||
* assume that the first "start" address that you see is the real
|
|
||||||
* start of the node, and that the current "end" address is after
|
|
||||||
* the previous one.
|
|
||||||
*/
|
|
||||||
static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
* Only add present memory as told by the e820.
|
|
||||||
* There is no guarantee from the SRAT that the memory it
|
|
||||||
* enumerates is present at boot time because it represents
|
|
||||||
* *possible* memory hotplug areas the same as normal RAM.
|
|
||||||
*/
|
|
||||||
if (memory_chunk->start_pfn >= max_pfn) {
|
|
||||||
printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
|
|
||||||
memory_chunk->start_pfn, memory_chunk->end_pfn);
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
if (memory_chunk->nid != nid)
|
|
||||||
return -1;
|
|
||||||
|
|
||||||
if (!node_has_online_mem(nid))
|
|
||||||
node_start_pfn[nid] = memory_chunk->start_pfn;
|
|
||||||
|
|
||||||
if (node_start_pfn[nid] > memory_chunk->start_pfn)
|
|
||||||
node_start_pfn[nid] = memory_chunk->start_pfn;
|
|
||||||
|
|
||||||
if (node_end_pfn[nid] < memory_chunk->end_pfn)
|
|
||||||
node_end_pfn[nid] = memory_chunk->end_pfn;
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Build the NUMA memory configuration from the ACPI SRAT.
 *
 * Returns 1 when the per-node pfn ranges were set up from the SRAT, and 0
 * when SRAT handling is disabled, acpi_numa_init() fails, or no memory
 * chunk was recorded.
 */
int __init get_memcfg_from_srat(void)
{
	int pxm, idx, apicid, nid;

	/* Same order as before: acpi_numa_init() only runs if SRAT is enabled. */
	if (srat_disabled() || acpi_numa_init() < 0)
		goto out_fail;

	if (num_memory_chunks == 0) {
		printk(KERN_DEBUG
			 "could not find any ACPI SRAT memory areas.\n");
		goto out_fail;
	}

	/*
	 * Rebuild the online node map from the proximity domains that were
	 * marked in pxm_bitmap: every marked PXM is mapped to a node id and
	 * that node is set online.
	 */
	nodes_clear(node_online_map);
	for (pxm = 0; pxm < MAX_PXM_DOMAINS; pxm++) {
		int online_nid;

		if (!BMAP_TEST(pxm_bitmap, pxm))
			continue;

		online_nid = acpi_map_pxm_to_node(pxm);
		node_set_online(online_nid);
	}
	BUG_ON(num_online_nodes() == 0);

	/* Record the node id of every chunk, derived from its PXM. */
	for (idx = 0; idx < num_memory_chunks; idx++) {
		struct node_memory_chunk_s *entry = &node_memory_chunk[idx];

		entry->nid = pxm_to_node(entry->pxm);
	}

	/* Debug dump of the PXM bitmap and the resulting counts. */
	printk(KERN_DEBUG "pxm bitmap: ");
	for (idx = 0; idx < sizeof(pxm_bitmap); idx++)
		printk(KERN_CONT "%02x ", pxm_bitmap[idx]);
	printk(KERN_CONT "\n");
	printk(KERN_DEBUG "Number of logical nodes in system = %d\n",
	       num_online_nodes());
	printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
	       num_memory_chunks);

	/* Translate the per-APIC-id PXM table into node ids. */
	for (apicid = 0; apicid < MAX_LOCAL_APIC; apicid++)
		set_apicid_to_node(apicid, pxm_to_node(apicid_to_pxm[apicid]));

	/*
	 * Merge each chunk into its node's pfn range; chunks accepted by
	 * node_read_chunk() are registered as active regions, clipped to
	 * max_pfn.
	 */
	for (idx = 0; idx < num_memory_chunks; idx++) {
		struct node_memory_chunk_s *chunk = &node_memory_chunk[idx];

		printk(KERN_DEBUG
			"chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
		       idx, chunk->nid, chunk->start_pfn, chunk->end_pfn);

		if (node_read_chunk(chunk->nid, chunk) == 0)
			memblock_x86_register_active_regions(chunk->nid,
					chunk->start_pfn,
					min(chunk->end_pfn, max_pfn));
	}

	/* SRAT entries may be out of order, so sort the node map. */
	sort_node_map();

	/* Per online node: mark its range present and size its remap area. */
	for_each_online_node(nid) {
		unsigned long first_pfn = node_start_pfn[nid];
		unsigned long last_pfn = min(node_end_pfn[nid], max_pfn);

		memory_present(nid, first_pfn, last_pfn);
		node_remap_size[nid] = node_memmap_size_bytes(nid, first_pfn,
							      last_pfn);
	}

	return 1;

out_fail:
	printk(KERN_DEBUG "failed to get NUMA memory information from SRAT"
			" table\n");
	return 0;
}
|
|
@ -710,20 +710,14 @@ static int acpi_processor_get_throttling_fadt(struct acpi_processor *pr)
|
|||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CONFIG_X86
|
#ifdef CONFIG_X86
|
||||||
static int acpi_throttling_rdmsr(struct acpi_processor *pr,
|
static int acpi_throttling_rdmsr(u64 *value)
|
||||||
u64 *value)
|
|
||||||
{
|
{
|
||||||
struct cpuinfo_x86 *c;
|
|
||||||
u64 msr_high, msr_low;
|
u64 msr_high, msr_low;
|
||||||
unsigned int cpu;
|
|
||||||
u64 msr = 0;
|
u64 msr = 0;
|
||||||
int ret = -1;
|
int ret = -1;
|
||||||
|
|
||||||
cpu = pr->id;
|
if ((this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_INTEL) ||
|
||||||
c = &cpu_data(cpu);
|
!this_cpu_has(X86_FEATURE_ACPI)) {
|
||||||
|
|
||||||
if ((c->x86_vendor != X86_VENDOR_INTEL) ||
|
|
||||||
!cpu_has(c, X86_FEATURE_ACPI)) {
|
|
||||||
printk(KERN_ERR PREFIX
|
printk(KERN_ERR PREFIX
|
||||||
"HARDWARE addr space,NOT supported yet\n");
|
"HARDWARE addr space,NOT supported yet\n");
|
||||||
} else {
|
} else {
|
||||||
@ -738,18 +732,13 @@ static int acpi_throttling_rdmsr(struct acpi_processor *pr,
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int acpi_throttling_wrmsr(struct acpi_processor *pr, u64 value)
|
static int acpi_throttling_wrmsr(u64 value)
|
||||||
{
|
{
|
||||||
struct cpuinfo_x86 *c;
|
|
||||||
unsigned int cpu;
|
|
||||||
int ret = -1;
|
int ret = -1;
|
||||||
u64 msr;
|
u64 msr;
|
||||||
|
|
||||||
cpu = pr->id;
|
if ((this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_INTEL) ||
|
||||||
c = &cpu_data(cpu);
|
!this_cpu_has(X86_FEATURE_ACPI)) {
|
||||||
|
|
||||||
if ((c->x86_vendor != X86_VENDOR_INTEL) ||
|
|
||||||
!cpu_has(c, X86_FEATURE_ACPI)) {
|
|
||||||
printk(KERN_ERR PREFIX
|
printk(KERN_ERR PREFIX
|
||||||
"HARDWARE addr space,NOT supported yet\n");
|
"HARDWARE addr space,NOT supported yet\n");
|
||||||
} else {
|
} else {
|
||||||
@ -761,15 +750,14 @@ static int acpi_throttling_wrmsr(struct acpi_processor *pr, u64 value)
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
static int acpi_throttling_rdmsr(struct acpi_processor *pr,
|
static int acpi_throttling_rdmsr(u64 *value)
|
||||||
u64 *value)
|
|
||||||
{
|
{
|
||||||
printk(KERN_ERR PREFIX
|
printk(KERN_ERR PREFIX
|
||||||
"HARDWARE addr space,NOT supported yet\n");
|
"HARDWARE addr space,NOT supported yet\n");
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int acpi_throttling_wrmsr(struct acpi_processor *pr, u64 value)
|
static int acpi_throttling_wrmsr(u64 value)
|
||||||
{
|
{
|
||||||
printk(KERN_ERR PREFIX
|
printk(KERN_ERR PREFIX
|
||||||
"HARDWARE addr space,NOT supported yet\n");
|
"HARDWARE addr space,NOT supported yet\n");
|
||||||
@ -801,7 +789,7 @@ static int acpi_read_throttling_status(struct acpi_processor *pr,
|
|||||||
ret = 0;
|
ret = 0;
|
||||||
break;
|
break;
|
||||||
case ACPI_ADR_SPACE_FIXED_HARDWARE:
|
case ACPI_ADR_SPACE_FIXED_HARDWARE:
|
||||||
ret = acpi_throttling_rdmsr(pr, value);
|
ret = acpi_throttling_rdmsr(value);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
printk(KERN_ERR PREFIX "Unknown addr space %d\n",
|
printk(KERN_ERR PREFIX "Unknown addr space %d\n",
|
||||||
@ -834,7 +822,7 @@ static int acpi_write_throttling_state(struct acpi_processor *pr,
|
|||||||
ret = 0;
|
ret = 0;
|
||||||
break;
|
break;
|
||||||
case ACPI_ADR_SPACE_FIXED_HARDWARE:
|
case ACPI_ADR_SPACE_FIXED_HARDWARE:
|
||||||
ret = acpi_throttling_wrmsr(pr, value);
|
ret = acpi_throttling_wrmsr(value);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
printk(KERN_ERR PREFIX "Unknown addr space %d\n",
|
printk(KERN_ERR PREFIX "Unknown addr space %d\n",
|
||||||
|
Loading…
Reference in New Issue
Block a user