2008-01-30 20:30:37 +08:00
|
|
|
/*
|
2005-04-17 06:20:36 +08:00
|
|
|
* Generic VM initialization for x86-64 NUMA setups.
|
|
|
|
* Copyright 2002,2003 Andi Kleen, SuSE Labs.
|
2008-01-30 20:30:37 +08:00
|
|
|
*/
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/string.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/bootmem.h>
|
|
|
|
#include <linux/mmzone.h>
|
|
|
|
#include <linux/ctype.h>
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/nodemask.h>
|
2008-01-30 20:33:11 +08:00
|
|
|
#include <linux/sched.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#include <asm/e820.h>
|
|
|
|
#include <asm/proto.h>
|
|
|
|
#include <asm/dma.h>
|
|
|
|
#include <asm/numa.h>
|
|
|
|
#include <asm/acpi.h>
|
2008-01-30 20:30:16 +08:00
|
|
|
#include <asm/k8.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#ifndef Dprintk
|
|
|
|
#define Dprintk(x...)
|
|
|
|
#endif
|
|
|
|
|
2005-09-07 06:17:45 +08:00
|
|
|
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
|
2008-01-30 20:30:37 +08:00
|
|
|
EXPORT_SYMBOL(node_data);
|
|
|
|
|
2008-05-12 21:43:36 +08:00
|
|
|
static bootmem_data_t plat_node_bdata[MAX_NUMNODES];
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-03-25 23:31:46 +08:00
|
|
|
struct memnode memnode;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-01-30 20:33:25 +08:00
|
|
|
s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
|
2008-01-30 20:30:37 +08:00
|
|
|
[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
|
2005-09-13 00:49:24 +08:00
|
|
|
};
|
2008-01-30 20:30:37 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
int numa_off __initdata;
|
2008-05-12 21:43:36 +08:00
|
|
|
static unsigned long __initdata nodemap_addr;
|
|
|
|
static unsigned long __initdata nodemap_size;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-11-06 00:25:54 +08:00
|
|
|
/*
|
|
|
|
* Given a shift value, try to populate memnodemap[]
|
|
|
|
* Returns :
|
|
|
|
* 1 if OK
|
|
|
|
* 0 if memnodmap[] too small (of shift too small)
|
|
|
|
* -1 if node overlap or lost ram (shift too big)
|
|
|
|
*/
|
2008-01-30 20:30:37 +08:00
|
|
|
static int __init populate_memnodemap(const struct bootnode *nodes,
|
2008-03-26 01:14:35 +08:00
|
|
|
int numnodes, int shift, int *nodeids)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2005-11-06 00:25:54 +08:00
|
|
|
unsigned long addr, end;
|
2008-01-30 20:30:37 +08:00
|
|
|
int i, res = -1;
|
2005-07-29 12:15:38 +08:00
|
|
|
|
2008-01-30 20:33:25 +08:00
|
|
|
memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
|
2005-07-29 12:15:38 +08:00
|
|
|
for (i = 0; i < numnodes; i++) {
|
2005-11-06 00:25:54 +08:00
|
|
|
addr = nodes[i].start;
|
|
|
|
end = nodes[i].end;
|
|
|
|
if (addr >= end)
|
2005-07-29 12:15:38 +08:00
|
|
|
continue;
|
2007-02-13 20:26:19 +08:00
|
|
|
if ((end >> shift) >= memnodemapsize)
|
2005-11-06 00:25:54 +08:00
|
|
|
return 0;
|
|
|
|
do {
|
2008-01-30 20:33:25 +08:00
|
|
|
if (memnodemap[addr >> shift] != NUMA_NO_NODE)
|
2005-07-29 12:15:38 +08:00
|
|
|
return -1;
|
2008-03-26 01:14:35 +08:00
|
|
|
|
|
|
|
if (!nodeids)
|
|
|
|
memnodemap[addr >> shift] = i;
|
|
|
|
else
|
|
|
|
memnodemap[addr >> shift] = nodeids[i];
|
|
|
|
|
2007-02-13 20:26:19 +08:00
|
|
|
addr += (1UL << shift);
|
2005-11-06 00:25:54 +08:00
|
|
|
} while (addr < end);
|
|
|
|
res = 1;
|
2008-01-30 20:30:37 +08:00
|
|
|
}
|
2005-11-06 00:25:54 +08:00
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
2007-02-13 20:26:19 +08:00
|
|
|
/*
 * Point memnodemap at storage large enough for memnodemapsize entries.
 *
 * The map embedded in struct memnode is used when it is big enough;
 * otherwise an L1-cache-aligned area is searched for in the e820 map
 * (starting at 0x8000) and reserved early.
 *
 * Returns 0 on success, -1 if no suitable area could be found.
 */
static int __init allocate_cachealigned_memnodemap(void)
{
	unsigned long search_from = 0x8000;

	memnodemap = memnode.embedded_map;
	if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
		return 0;

	nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
	nodemap_addr = find_e820_area(search_from, max_pfn<<PAGE_SHIFT,
				      nodemap_size, L1_CACHE_BYTES);

	if (nodemap_addr != -1UL) {
		memnodemap = phys_to_virt(nodemap_addr);
		reserve_early(nodemap_addr, nodemap_addr + nodemap_size,
			      "MEMNODEMAP");

		printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
		       nodemap_addr, nodemap_addr + nodemap_size);
		return 0;
	}

	/* Nothing found: forget the attempted allocation. */
	printk(KERN_ERR
	       "NUMA: Unable to allocate Memory to Node hash map\n");
	nodemap_addr = nodemap_size = 0;
	return -1;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The LSB of all start and end addresses in the node map is the value of the
|
|
|
|
* maximum possible shift.
|
|
|
|
*/
|
2008-01-30 20:30:37 +08:00
|
|
|
static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
|
|
|
|
int numnodes)
|
2005-11-06 00:25:54 +08:00
|
|
|
{
|
2007-02-13 20:26:20 +08:00
|
|
|
int i, nodes_used = 0;
|
2007-02-13 20:26:19 +08:00
|
|
|
unsigned long start, end;
|
|
|
|
unsigned long bitfield = 0, memtop = 0;
|
|
|
|
|
|
|
|
for (i = 0; i < numnodes; i++) {
|
|
|
|
start = nodes[i].start;
|
|
|
|
end = nodes[i].end;
|
|
|
|
if (start >= end)
|
|
|
|
continue;
|
2007-02-13 20:26:20 +08:00
|
|
|
bitfield |= start;
|
|
|
|
nodes_used++;
|
2007-02-13 20:26:19 +08:00
|
|
|
if (end > memtop)
|
|
|
|
memtop = end;
|
|
|
|
}
|
2007-02-13 20:26:20 +08:00
|
|
|
if (nodes_used <= 1)
|
|
|
|
i = 63;
|
|
|
|
else
|
|
|
|
i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
|
2007-02-13 20:26:19 +08:00
|
|
|
memnodemapsize = (memtop >> i)+1;
|
|
|
|
return i;
|
|
|
|
}
|
2005-11-06 00:25:54 +08:00
|
|
|
|
2008-03-26 01:14:35 +08:00
|
|
|
/*
 * Work out the address shift for the memory->node hash and fill the map.
 *
 * Derives the shift from @nodes, allocates memnodemap[] and populates
 * it (using @nodeids when non-NULL).
 *
 * Returns the shift on success, -1 if the map could not be allocated
 * or populated.
 */
int __init compute_hash_shift(struct bootnode *nodes, int numnodes,
			      int *nodeids)
{
	int shift = extract_lsb_from_nodes(nodes, numnodes);
	int populated;

	if (allocate_cachealigned_memnodemap() != 0)
		return -1;

	printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
	       shift);

	populated = populate_memnodemap(nodes, numnodes, shift, nodeids);
	if (populated == 1)
		return shift;

	printk(KERN_INFO "Your memory is not aligned you need to "
	       "rebuild your kernel with a bigger NODEMAPSIZE "
	       "shift=%d\n", shift);
	return -1;
}
|
|
|
|
|
2005-06-23 15:08:07 +08:00
|
|
|
int early_pfn_to_nid(unsigned long pfn)
|
|
|
|
{
|
|
|
|
return phys_to_nid(pfn << PAGE_SHIFT);
|
|
|
|
}
|
|
|
|
|
2008-01-30 20:30:37 +08:00
|
|
|
/*
 * Find @size bytes (aligned to @align) for early per-node data.
 *
 * First looks for a free e820 area inside [@start, @end); if that
 * fails, falls back to the non-panicking bootmem allocator.
 *
 * Returns a kernel virtual address, or NULL (after logging an error
 * naming @nodeid) when both attempts fail.
 */
static void * __init early_node_mem(int nodeid, unsigned long start,
				    unsigned long end, unsigned long size,
				    unsigned long align)
{
	unsigned long phys = find_e820_area(start, end, size, align);
	void *fallback;

	if (phys != -1L)
		return __va(phys);

	fallback = __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
	if (!fallback)
		printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
		       size, nodeid);

	return fallback;
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Initialize bootmem allocator for a node */
|
2008-01-30 20:30:37 +08:00
|
|
|
void __init setup_node_bootmem(int nodeid, unsigned long start,
|
|
|
|
unsigned long end)
|
|
|
|
{
|
2008-05-12 21:43:36 +08:00
|
|
|
unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
|
2008-01-30 20:30:37 +08:00
|
|
|
unsigned long bootmap_start, nodedata_phys;
|
2006-04-08 01:49:21 +08:00
|
|
|
void *bootmap;
|
2005-04-17 06:20:36 +08:00
|
|
|
const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
|
2008-03-19 03:52:37 +08:00
|
|
|
int nid;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-01-30 20:30:37 +08:00
|
|
|
start = round_up(start, ZONE_ALIGN);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-01-30 20:30:37 +08:00
|
|
|
printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
|
|
|
|
start, end);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
start_pfn = start >> PAGE_SHIFT;
|
2008-05-12 21:43:36 +08:00
|
|
|
last_pfn = end >> PAGE_SHIFT;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-02-02 00:49:41 +08:00
|
|
|
node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
|
|
|
|
SMP_CACHE_BYTES);
|
2006-04-08 01:49:21 +08:00
|
|
|
if (node_data[nodeid] == NULL)
|
|
|
|
return;
|
|
|
|
nodedata_phys = __pa(node_data[nodeid]);
|
2008-02-04 23:47:56 +08:00
|
|
|
printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
|
|
|
|
nodedata_phys + pgdat_size - 1);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
|
|
|
|
NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
|
|
|
|
NODE_DATA(nodeid)->node_start_pfn = start_pfn;
|
2008-05-12 21:43:36 +08:00
|
|
|
NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-03-19 03:52:37 +08:00
|
|
|
/*
|
|
|
|
* Find a place for the bootmem map
|
|
|
|
* nodedata_phys could be on other nodes by alloc_bootmem,
|
|
|
|
* so need to sure bootmap_start not to be small, otherwise
|
|
|
|
* early_node_mem will get that with find_e820_area instead
|
|
|
|
* of alloc_bootmem, that could clash with reserved range
|
|
|
|
*/
|
2008-05-12 21:43:36 +08:00
|
|
|
bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
|
2008-03-19 03:52:37 +08:00
|
|
|
nid = phys_to_nid(nodedata_phys);
|
|
|
|
if (nid == nodeid)
|
|
|
|
bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
|
|
|
|
else
|
|
|
|
bootmap_start = round_up(start, PAGE_SIZE);
|
2008-02-02 00:49:41 +08:00
|
|
|
/*
|
2008-05-14 23:15:10 +08:00
|
|
|
* SMP_CACHE_BYTES could be enough, but init_bootmem_node like
|
2008-02-02 00:49:41 +08:00
|
|
|
* to use that to align to PAGE_SIZE
|
|
|
|
*/
|
2006-04-08 01:49:21 +08:00
|
|
|
bootmap = early_node_mem(nodeid, bootmap_start, end,
|
2008-02-02 00:49:41 +08:00
|
|
|
bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
|
2006-04-08 01:49:21 +08:00
|
|
|
if (bootmap == NULL) {
|
|
|
|
if (nodedata_phys < start || nodedata_phys >= end)
|
2008-03-19 03:40:04 +08:00
|
|
|
free_bootmem(nodedata_phys, pgdat_size);
|
2006-04-08 01:49:21 +08:00
|
|
|
node_data[nodeid] = NULL;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
bootmap_start = __pa(bootmap);
|
2008-01-30 20:30:37 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
|
2008-01-30 20:30:37 +08:00
|
|
|
bootmap_start >> PAGE_SHIFT,
|
2008-05-12 21:43:36 +08:00
|
|
|
start_pfn, last_pfn);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-02-04 23:47:56 +08:00
|
|
|
printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
|
|
|
|
bootmap_start, bootmap_start + bootmap_size - 1,
|
|
|
|
bootmap_pages);
|
|
|
|
|
2006-09-27 16:49:52 +08:00
|
|
|
free_bootmem_with_active_regions(nodeid, end);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-03-19 03:52:37 +08:00
|
|
|
/*
|
|
|
|
* convert early reserve to bootmem reserve earlier
|
|
|
|
* otherwise early_node_mem could use early reserved mem
|
|
|
|
* on previous node
|
|
|
|
*/
|
|
|
|
early_res_to_bootmem(start, end);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* in some case early_node_mem could use alloc_bootmem
|
|
|
|
* to get range on other node, don't reserve that again
|
|
|
|
*/
|
|
|
|
if (nid != nodeid)
|
|
|
|
printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
|
|
|
|
else
|
|
|
|
reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys,
|
|
|
|
pgdat_size, BOOTMEM_DEFAULT);
|
|
|
|
nid = phys_to_nid(bootmap_start);
|
|
|
|
if (nid != nodeid)
|
|
|
|
printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid);
|
|
|
|
else
|
|
|
|
reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
|
|
|
|
bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);
|
|
|
|
|
2006-04-08 01:49:18 +08:00
|
|
|
#ifdef CONFIG_ACPI_NUMA
|
|
|
|
srat_reserve_add_area(nodeid);
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
node_set_online(nodeid);
|
2008-01-30 20:30:37 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-01-30 20:30:37 +08:00
|
|
|
/*
|
|
|
|
* There are unfortunately some poorly designed mainboards around that
|
|
|
|
* only connect memory to a single CPU. This breaks the 1:1 cpu->node
|
|
|
|
* mapping. To avoid this fill in the mapping for all possible CPUs,
|
|
|
|
* as the number of CPUs is not known yet. We round robin the existing
|
|
|
|
* nodes.
|
|
|
|
*/
|
2005-04-17 06:20:36 +08:00
|
|
|
void __init numa_init_array(void)
|
|
|
|
{
|
|
|
|
int rr, i;
|
2008-01-30 20:30:37 +08:00
|
|
|
|
2005-10-01 02:59:22 +08:00
|
|
|
rr = first_node(node_online_map);
|
2005-04-17 06:20:36 +08:00
|
|
|
for (i = 0; i < NR_CPUS; i++) {
|
2008-01-30 20:33:33 +08:00
|
|
|
if (early_cpu_to_node(i) != NUMA_NO_NODE)
|
2005-04-17 06:20:36 +08:00
|
|
|
continue;
|
2008-01-30 20:30:37 +08:00
|
|
|
numa_set_node(i, rr);
|
2005-04-17 06:20:36 +08:00
|
|
|
rr = next_node(rr, node_online_map);
|
|
|
|
if (rr == MAX_NUMNODES)
|
|
|
|
rr = first_node(node_online_map);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_NUMA_EMU
|
2007-02-13 20:26:22 +08:00
|
|
|
/* Numa emulation */
|
2008-05-12 21:43:36 +08:00
|
|
|
static char *cmdline __initdata;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-02-13 20:26:22 +08:00
|
|
|
/*
|
2008-01-30 20:30:37 +08:00
|
|
|
* Setups up nid to range from addr to addr + size. If the end
|
|
|
|
* boundary is greater than max_addr, then max_addr is used instead.
|
|
|
|
* The return value is 0 if there is additional memory left for
|
|
|
|
* allocation past addr and -1 otherwise. addr is adjusted to be at
|
|
|
|
* the end of the node.
|
2007-02-13 20:26:22 +08:00
|
|
|
*/
|
2007-05-03 01:27:09 +08:00
|
|
|
static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
|
|
|
|
u64 size, u64 max_addr)
|
2007-02-13 20:26:22 +08:00
|
|
|
{
|
2007-05-03 01:27:09 +08:00
|
|
|
int ret = 0;
|
2008-01-30 20:30:37 +08:00
|
|
|
|
2007-05-03 01:27:09 +08:00
|
|
|
nodes[nid].start = *addr;
|
|
|
|
*addr += size;
|
|
|
|
if (*addr >= max_addr) {
|
|
|
|
*addr = max_addr;
|
|
|
|
ret = -1;
|
|
|
|
}
|
|
|
|
nodes[nid].end = *addr;
|
2007-05-03 01:27:20 +08:00
|
|
|
node_set(nid, node_possible_map);
|
2007-05-03 01:27:09 +08:00
|
|
|
printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
|
|
|
|
nodes[nid].start, nodes[nid].end,
|
|
|
|
(nodes[nid].end - nodes[nid].start) >> 20);
|
|
|
|
return ret;
|
2007-02-13 20:26:22 +08:00
|
|
|
}
|
|
|
|
|
2007-05-03 01:27:09 +08:00
|
|
|
/*
|
|
|
|
* Splits num_nodes nodes up equally starting at node_start. The return value
|
|
|
|
* is the number of nodes split up and addr is adjusted to be at the end of the
|
|
|
|
* last node allocated.
|
|
|
|
*/
|
|
|
|
static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
|
|
|
|
u64 max_addr, int node_start,
|
|
|
|
int num_nodes)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2007-05-03 01:27:09 +08:00
|
|
|
unsigned int big;
|
|
|
|
u64 size;
|
|
|
|
int i;
|
2007-02-13 20:26:22 +08:00
|
|
|
|
2007-05-03 01:27:09 +08:00
|
|
|
if (num_nodes <= 0)
|
|
|
|
return -1;
|
|
|
|
if (num_nodes > MAX_NUMNODES)
|
|
|
|
num_nodes = MAX_NUMNODES;
|
2007-07-21 23:11:29 +08:00
|
|
|
size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
|
2007-05-03 01:27:09 +08:00
|
|
|
num_nodes;
|
2007-02-13 20:26:22 +08:00
|
|
|
/*
|
2007-05-03 01:27:09 +08:00
|
|
|
* Calculate the number of big nodes that can be allocated as a result
|
|
|
|
* of consolidating the leftovers.
|
2007-02-13 20:26:22 +08:00
|
|
|
*/
|
2007-05-03 01:27:09 +08:00
|
|
|
big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
|
|
|
|
FAKE_NODE_MIN_SIZE;
|
|
|
|
|
|
|
|
/* Round down to nearest FAKE_NODE_MIN_SIZE. */
|
|
|
|
size &= FAKE_NODE_MIN_HASH_MASK;
|
|
|
|
if (!size) {
|
|
|
|
printk(KERN_ERR "Not enough memory for each node. "
|
|
|
|
"NUMA emulation disabled.\n");
|
|
|
|
return -1;
|
2007-02-13 20:26:22 +08:00
|
|
|
}
|
2007-05-03 01:27:09 +08:00
|
|
|
|
|
|
|
for (i = node_start; i < num_nodes + node_start; i++) {
|
|
|
|
u64 end = *addr + size;
|
2008-01-30 20:30:37 +08:00
|
|
|
|
2007-02-13 20:26:22 +08:00
|
|
|
if (i < big)
|
|
|
|
end += FAKE_NODE_MIN_SIZE;
|
|
|
|
/*
|
2007-05-03 01:27:09 +08:00
|
|
|
* The final node can have the remaining system RAM. Other
|
|
|
|
* nodes receive roughly the same amount of available pages.
|
2007-02-13 20:26:22 +08:00
|
|
|
*/
|
2007-05-03 01:27:09 +08:00
|
|
|
if (i == num_nodes + node_start - 1)
|
|
|
|
end = max_addr;
|
|
|
|
else
|
2007-07-21 23:11:29 +08:00
|
|
|
while (end - *addr - e820_hole_size(*addr, end) <
|
2007-05-03 01:27:09 +08:00
|
|
|
size) {
|
|
|
|
end += FAKE_NODE_MIN_SIZE;
|
|
|
|
if (end > max_addr) {
|
|
|
|
end = max_addr;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
return i - node_start + 1;
|
|
|
|
}
|
|
|
|
|
2007-05-03 01:27:09 +08:00
|
|
|
/*
|
|
|
|
* Splits the remaining system RAM into chunks of size. The remaining memory is
|
|
|
|
* always assigned to a final node and can be asymmetric. Returns the number of
|
|
|
|
* nodes split.
|
|
|
|
*/
|
|
|
|
static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
|
|
|
|
u64 max_addr, int node_start, u64 size)
|
|
|
|
{
|
|
|
|
int i = node_start;
|
|
|
|
size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
|
|
|
|
while (!setup_node_range(i++, nodes, addr, size, max_addr))
|
|
|
|
;
|
|
|
|
return i - node_start;
|
|
|
|
}
|
|
|
|
|
2007-05-03 01:27:09 +08:00
|
|
|
/*
|
2008-05-12 21:43:36 +08:00
|
|
|
* Sets up the system RAM area from start_pfn to last_pfn according to the
|
2007-05-03 01:27:09 +08:00
|
|
|
* numa=fake command-line option.
|
|
|
|
*/
|
2008-04-05 09:11:09 +08:00
|
|
|
static struct bootnode nodes[MAX_NUMNODES] __initdata;
|
|
|
|
|
2008-05-12 21:43:36 +08:00
|
|
|
static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
|
2007-05-03 01:27:09 +08:00
|
|
|
{
|
2008-01-30 20:30:37 +08:00
|
|
|
u64 size, addr = start_pfn << PAGE_SHIFT;
|
2008-05-12 21:43:36 +08:00
|
|
|
u64 max_addr = last_pfn << PAGE_SHIFT;
|
2008-01-30 20:30:37 +08:00
|
|
|
int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
|
2007-05-03 01:27:09 +08:00
|
|
|
|
|
|
|
memset(&nodes, 0, sizeof(nodes));
|
|
|
|
/*
|
|
|
|
* If the numa=fake command-line is just a single number N, split the
|
|
|
|
* system RAM into N fake nodes.
|
|
|
|
*/
|
|
|
|
if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
|
2008-01-30 20:30:37 +08:00
|
|
|
long n = simple_strtol(cmdline, NULL, 0);
|
|
|
|
|
|
|
|
num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
|
2007-05-03 01:27:09 +08:00
|
|
|
if (num_nodes < 0)
|
|
|
|
return num_nodes;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Parse the command line. */
|
2007-05-03 01:27:09 +08:00
|
|
|
for (coeff_flag = 0; ; cmdline++) {
|
2007-05-03 01:27:09 +08:00
|
|
|
if (*cmdline && isdigit(*cmdline)) {
|
|
|
|
num = num * 10 + *cmdline - '0';
|
|
|
|
continue;
|
2007-02-13 20:26:22 +08:00
|
|
|
}
|
2007-05-03 01:27:09 +08:00
|
|
|
if (*cmdline == '*') {
|
|
|
|
if (num > 0)
|
|
|
|
coeff = num;
|
|
|
|
coeff_flag = 1;
|
|
|
|
}
|
2007-05-03 01:27:09 +08:00
|
|
|
if (!*cmdline || *cmdline == ',') {
|
2007-05-03 01:27:09 +08:00
|
|
|
if (!coeff_flag)
|
|
|
|
coeff = 1;
|
2007-05-03 01:27:09 +08:00
|
|
|
/*
|
|
|
|
* Round down to the nearest FAKE_NODE_MIN_SIZE.
|
|
|
|
* Command-line coefficients are in megabytes.
|
|
|
|
*/
|
|
|
|
size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
|
2007-05-03 01:27:09 +08:00
|
|
|
if (size)
|
2007-05-03 01:27:09 +08:00
|
|
|
for (i = 0; i < coeff; i++, num_nodes++)
|
|
|
|
if (setup_node_range(num_nodes, nodes,
|
|
|
|
&addr, size, max_addr) < 0)
|
|
|
|
goto done;
|
2007-05-03 01:27:09 +08:00
|
|
|
if (!*cmdline)
|
|
|
|
break;
|
|
|
|
coeff_flag = 0;
|
|
|
|
coeff = -1;
|
2007-02-13 20:26:22 +08:00
|
|
|
}
|
2007-05-03 01:27:09 +08:00
|
|
|
num = 0;
|
|
|
|
}
|
|
|
|
done:
|
|
|
|
if (!num_nodes)
|
|
|
|
return -1;
|
2007-05-03 01:27:09 +08:00
|
|
|
/* Fill remainder of system RAM, if appropriate. */
|
2007-05-03 01:27:09 +08:00
|
|
|
if (addr < max_addr) {
|
2007-05-03 01:27:09 +08:00
|
|
|
if (coeff_flag && coeff < 0) {
|
|
|
|
/* Split remaining nodes into num-sized chunks */
|
|
|
|
num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
|
|
|
|
num_nodes, num);
|
|
|
|
goto out;
|
|
|
|
}
|
2007-05-03 01:27:09 +08:00
|
|
|
switch (*(cmdline - 1)) {
|
|
|
|
case '*':
|
|
|
|
/* Split remaining nodes into coeff chunks */
|
|
|
|
if (coeff <= 0)
|
|
|
|
break;
|
|
|
|
num_nodes += split_nodes_equally(nodes, &addr, max_addr,
|
|
|
|
num_nodes, coeff);
|
|
|
|
break;
|
|
|
|
case ',':
|
|
|
|
/* Do not allocate remaining system RAM */
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
/* Give one final node */
|
|
|
|
setup_node_range(num_nodes, nodes, &addr,
|
|
|
|
max_addr - addr, max_addr);
|
|
|
|
num_nodes++;
|
|
|
|
}
|
2007-05-03 01:27:09 +08:00
|
|
|
}
|
|
|
|
out:
|
2008-03-26 01:14:35 +08:00
|
|
|
memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);
|
2007-05-03 01:27:09 +08:00
|
|
|
if (memnode_shift < 0) {
|
|
|
|
memnode_shift = 0;
|
|
|
|
printk(KERN_ERR "No NUMA hash function found. NUMA emulation "
|
|
|
|
"disabled.\n");
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We need to vacate all active ranges that may have been registered by
|
2007-07-21 23:11:30 +08:00
|
|
|
* SRAT and set acpi_numa to -1 so that srat_disabled() always returns
|
|
|
|
* true. NUMA emulation has succeeded so we will not scan ACPI nodes.
|
2007-05-03 01:27:09 +08:00
|
|
|
*/
|
|
|
|
remove_all_active_ranges();
|
2007-07-21 23:11:30 +08:00
|
|
|
#ifdef CONFIG_ACPI_NUMA
|
|
|
|
acpi_numa = -1;
|
|
|
|
#endif
|
2007-05-03 01:27:20 +08:00
|
|
|
for_each_node_mask(i, node_possible_map) {
|
2006-09-27 16:49:52 +08:00
|
|
|
e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
|
|
|
|
nodes[i].end >> PAGE_SHIFT);
|
2008-01-30 20:30:37 +08:00
|
|
|
setup_node_bootmem(i, nodes[i].start, nodes[i].end);
|
2006-09-27 16:49:52 +08:00
|
|
|
}
|
2007-07-21 23:10:32 +08:00
|
|
|
acpi_fake_nodes(nodes, num_nodes);
|
2008-01-30 20:30:37 +08:00
|
|
|
numa_init_array();
|
|
|
|
return 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2007-05-03 01:27:09 +08:00
|
|
|
#endif /* CONFIG_NUMA_EMU */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-06-22 17:44:49 +08:00
|
|
|
/*
 * Discover the NUMA layout for the RAM between start_pfn and last_pfn.
 *
 * Tries, in order and depending on the kernel configuration: NUMA
 * emulation (when a numa=fake option was given), ACPI and K8 node
 * scanning.  The possible/online node maps are cleared before each
 * attempt.  If none succeeds (or NUMA is turned off) a single dummy
 * node 0 covering all memory is set up and every CPU is assigned to it.
 */
void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
{
	unsigned long mem_start = start_pfn << PAGE_SHIFT;
	unsigned long mem_end = last_pfn << PAGE_SHIFT;
	int cpu;

	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);

#ifdef CONFIG_NUMA_EMU
	if (cmdline && !numa_emulation(start_pfn, last_pfn))
		return;
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
#endif

#ifdef CONFIG_ACPI_NUMA
	if (!numa_off && !acpi_scan_nodes(mem_start, mem_end))
		return;
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
#endif

#ifdef CONFIG_K8_NUMA
	if (!numa_off && !k8_scan_nodes(mem_start, mem_end))
		return;
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
#endif
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");

	printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
	       mem_start,
	       mem_end);

	/* setup dummy node covering all memory */
	memnode_shift = 63;
	memnodemap = memnode.embedded_map;
	memnodemap[0] = 0;
	node_set_online(0);
	node_set(0, node_possible_map);
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		numa_set_node(cpu, 0);
	e820_register_active_regions(0, start_pfn, last_pfn);
	setup_node_bootmem(0, mem_start, mem_end);
}
|
|
|
|
|
2008-01-30 20:30:37 +08:00
|
|
|
/*
 * Release the bootmem of every online node and return the sum of the
 * values reported by free_all_bootmem_node().
 */
unsigned long __init numa_free_all_bootmem(void)
{
	unsigned long freed = 0;
	int nid;

	for_each_online_node(nid) {
		freed += free_all_bootmem_node(NODE_DATA(nid));
	}

	return freed;
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
 * Set the upper pfn limit of each zone, register the sparse memory
 * sections of all nodes and initialise the per-node free areas.
 */
void __init paging_init(void)
{
	/* Zones not listed here keep a limit of 0. */
	unsigned long max_zone_pfns[MAX_NR_ZONES] = {
		[ZONE_DMA]	= MAX_DMA_PFN,
		[ZONE_DMA32]	= MAX_DMA32_PFN,
		[ZONE_NORMAL]	= max_pfn,
	};

	sparse_memory_present_with_active_regions(MAX_NUMNODES);
	sparse_init();

	free_area_init_nodes(max_zone_pfns);
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-09-26 16:52:32 +08:00
|
|
|
static __init int numa_setup(char *opt)
|
2008-01-30 20:30:37 +08:00
|
|
|
{
|
2006-09-26 16:52:32 +08:00
|
|
|
if (!opt)
|
|
|
|
return -EINVAL;
|
2008-01-30 20:30:37 +08:00
|
|
|
if (!strncmp(opt, "off", 3))
|
2005-04-17 06:20:36 +08:00
|
|
|
numa_off = 1;
|
|
|
|
#ifdef CONFIG_NUMA_EMU
|
2007-05-03 01:27:09 +08:00
|
|
|
if (!strncmp(opt, "fake=", 5))
|
|
|
|
cmdline = opt + 5;
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_ACPI_NUMA
|
2008-01-30 20:30:37 +08:00
|
|
|
if (!strncmp(opt, "noacpi", 6))
|
|
|
|
acpi_numa = -1;
|
|
|
|
if (!strncmp(opt, "hotadd=", 7))
|
2006-04-08 01:49:18 +08:00
|
|
|
hotadd_percent = simple_strtoul(opt+7, NULL, 10);
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif
|
2006-09-26 16:52:32 +08:00
|
|
|
return 0;
|
2008-01-30 20:30:37 +08:00
|
|
|
}
|
2006-09-26 16:52:32 +08:00
|
|
|
early_param("numa", numa_setup);
|
|
|
|
|
x86: cleanup early per cpu variables/accesses v4
* Introduce a new PER_CPU macro called "EARLY_PER_CPU". This is
used by some per_cpu variables that are initialized and accessed
before there are per_cpu areas allocated.
["Early" in respect to per_cpu variables is "earlier than the per_cpu
areas have been setup".]
This patchset adds these new macros:
DEFINE_EARLY_PER_CPU(_type, _name, _initvalue)
EXPORT_EARLY_PER_CPU_SYMBOL(_name)
DECLARE_EARLY_PER_CPU(_type, _name)
early_per_cpu_ptr(_name)
early_per_cpu_map(_name, _idx)
early_per_cpu(_name, _cpu)
The DEFINE macro defines the per_cpu variable as well as the early
map and pointer. It also initializes the per_cpu variable and map
elements to "_initvalue". The early_* macros provide access to
the initial map (usually setup during system init) and the early
pointer. This pointer is initialized to point to the early map
but is then NULL'ed when the actual per_cpu areas are setup. After
that the per_cpu variable is the correct access to the variable.
The early_per_cpu() macro is not very efficient but does show how to
access the variable if you have a function that can be called both
"early" and "late". It tests the early ptr to be NULL, and if not
then it's still valid. Otherwise, the per_cpu variable is used
instead:
#define early_per_cpu(_name, _cpu) \
(early_per_cpu_ptr(_name) ? \
early_per_cpu_ptr(_name)[_cpu] : \
per_cpu(_name, _cpu))
A better method is to actually check the pointer manually. In the
case below, numa_set_node can be called both "early" and "late":
void __cpuinit numa_set_node(int cpu, int node)
{
int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
if (cpu_to_node_map)
cpu_to_node_map[cpu] = node;
else
per_cpu(x86_cpu_to_node_map, cpu) = node;
}
* Add a flag "arch_provides_topology_pointers" that indicates pointers
to topology cpumask_t maps are available. Otherwise, use the function
returning the cpumask_t value. This is useful if cpumask_t set size
is very large to avoid copying data on to/off of the stack.
* The coverage of CONFIG_DEBUG_PER_CPU_MAPS has been increased while
the non-debug case has been optimized a bit.
* Remove an unreferenced compiler warning in drivers/base/topology.c
* Clean up #ifdef in setup.c
For inclusion into sched-devel/latest tree.
Based on:
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
+ sched-devel/latest .../mingo/linux-2.6-sched-devel.git
Signed-off-by: Mike Travis <travis@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2008-05-13 03:21:12 +08:00
|
|
|
#ifdef CONFIG_NUMA
|
2006-01-12 05:45:36 +08:00
|
|
|
/*
|
|
|
|
* Setup early cpu_to_node.
|
|
|
|
*
|
|
|
|
* Populate cpu_to_node[] only if x86_cpu_to_apicid[],
|
|
|
|
* and apicid_to_node[] tables have valid entries for a CPU.
|
|
|
|
* This means we skip cpu_to_node[] initialisation for NUMA
|
|
|
|
* emulation and faking node case (when running a kernel compiled
|
|
|
|
* for NUMA on a non NUMA box), which is OK as cpu_to_node[]
|
|
|
|
* is already initialized in a round robin manner at numa_init_array,
|
|
|
|
* prior to this call, and this initialization is good enough
|
|
|
|
* for the fake NUMA cases.
|
x86: cleanup early per cpu variables/accesses v4
* Introduce a new PER_CPU macro called "EARLY_PER_CPU". This is
used by some per_cpu variables that are initialized and accessed
before there are per_cpu areas allocated.
["Early" in respect to per_cpu variables is "earlier than the per_cpu
areas have been setup".]
This patchset adds these new macros:
DEFINE_EARLY_PER_CPU(_type, _name, _initvalue)
EXPORT_EARLY_PER_CPU_SYMBOL(_name)
DECLARE_EARLY_PER_CPU(_type, _name)
early_per_cpu_ptr(_name)
early_per_cpu_map(_name, _idx)
early_per_cpu(_name, _cpu)
The DEFINE macro defines the per_cpu variable as well as the early
map and pointer. It also initializes the per_cpu variable and map
elements to "_initvalue". The early_* macros provide access to
the initial map (usually setup during system init) and the early
pointer. This pointer is initialized to point to the early map
but is then NULL'ed when the actual per_cpu areas are setup. After
that the per_cpu variable is the correct access to the variable.
The early_per_cpu() macro is not very efficient but does show how to
access the variable if you have a function that can be called both
"early" and "late". It tests the early ptr to be NULL, and if not
then it's still valid. Otherwise, the per_cpu variable is used
instead:
#define early_per_cpu(_name, _cpu) \
(early_per_cpu_ptr(_name) ? \
early_per_cpu_ptr(_name)[_cpu] : \
per_cpu(_name, _cpu))
A better method is to actually check the pointer manually. In the
case below, numa_set_node can be called both "early" and "late":
void __cpuinit numa_set_node(int cpu, int node)
{
int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
if (cpu_to_node_map)
cpu_to_node_map[cpu] = node;
else
per_cpu(x86_cpu_to_node_map, cpu) = node;
}
* Add a flag "arch_provides_topology_pointers" that indicates pointers
to topology cpumask_t maps are available. Otherwise, use the function
returning the cpumask_t value. This is useful if cpumask_t set size
is very large to avoid copying data on to/off of the stack.
* The coverage of CONFIG_DEBUG_PER_CPU_MAPS has been increased while
the non-debug case has been optimized a bit.
* Remove an unreferenced compiler warning in drivers/base/topology.c
* Clean up #ifdef in setup.c
For inclusion into sched-devel/latest tree.
Based on:
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
+ sched-devel/latest .../mingo/linux-2.6-sched-devel.git
Signed-off-by: Mike Travis <travis@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2008-05-13 03:21:12 +08:00
|
|
|
*
|
|
|
|
* Called before the per_cpu areas are setup.
|
2006-01-12 05:45:36 +08:00
|
|
|
*/
|
|
|
|
void __init init_cpu_to_node(void)
{
	u16 *apicid_map = early_per_cpu_ptr(x86_cpu_to_apicid);
	int i;

	/*
	 * The early pointer is NULL'ed once the real per_cpu areas are
	 * set up; this function must run before that happens.
	 */
	BUG_ON(!apicid_map);

	for_each_possible_cpu(i) {
		u16 id = apicid_map[i];
		int nid;

		/* No APIC ID recorded for this CPU: leave its mapping alone. */
		if (id == BAD_APICID)
			continue;

		/*
		 * Only record the mapping when the APIC ID resolves to a
		 * node and that node is online.
		 */
		nid = apicid_to_node[id];
		if (nid != NUMA_NO_NODE && node_online(nid))
			numa_set_node(i, nid);
	}
}
|
x86: cleanup early per cpu variables/accesses v4
* Introduce a new PER_CPU macro called "EARLY_PER_CPU". This is
used by some per_cpu variables that are initialized and accessed
before there are per_cpu areas allocated.
["Early" in respect to per_cpu variables is "earlier than the per_cpu
areas have been setup".]
This patchset adds these new macros:
DEFINE_EARLY_PER_CPU(_type, _name, _initvalue)
EXPORT_EARLY_PER_CPU_SYMBOL(_name)
DECLARE_EARLY_PER_CPU(_type, _name)
early_per_cpu_ptr(_name)
early_per_cpu_map(_name, _idx)
early_per_cpu(_name, _cpu)
The DEFINE macro defines the per_cpu variable as well as the early
map and pointer. It also initializes the per_cpu variable and map
elements to "_initvalue". The early_* macros provide access to
the initial map (usually setup during system init) and the early
pointer. This pointer is initialized to point to the early map
but is then NULL'ed when the actual per_cpu areas are setup. After
that the per_cpu variable is the correct access to the variable.
The early_per_cpu() macro is not very efficient but does show how to
access the variable if you have a function that can be called both
"early" and "late". It tests the early ptr to be NULL, and if not
then it's still valid. Otherwise, the per_cpu variable is used
instead:
#define early_per_cpu(_name, _cpu) \
(early_per_cpu_ptr(_name) ? \
early_per_cpu_ptr(_name)[_cpu] : \
per_cpu(_name, _cpu))
A better method is to actually check the pointer manually. In the
case below, numa_set_node can be called both "early" and "late":
void __cpuinit numa_set_node(int cpu, int node)
{
int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
if (cpu_to_node_map)
cpu_to_node_map[cpu] = node;
else
per_cpu(x86_cpu_to_node_map, cpu) = node;
}
* Add a flag "arch_provides_topology_pointers" that indicates pointers
to topology cpumask_t maps are available. Otherwise, use the function
returning the cpumask_t value. This is useful if cpumask_t set size
is very large to avoid copying data on to/off of the stack.
* The coverage of CONFIG_DEBUG_PER_CPU_MAPS has been increased while
the non-debug case has been optimized a bit.
* Remove an unreferenced compiler warning in drivers/base/topology.c
* Clean up #ifdef in setup.c
For inclusion into sched-devel/latest tree.
Based on:
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
+ sched-devel/latest .../mingo/linux-2.6-sched-devel.git
Signed-off-by: Mike Travis <travis@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2008-05-13 03:21:12 +08:00
|
|
|
#endif
|
2006-01-12 05:45:36 +08:00
|
|
|
|
2006-01-12 05:46:27 +08:00
|
|
|
|