2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
|
|
|
|
*/
|
|
|
|
#ifndef _LINUX_BOOTMEM_H
|
|
|
|
#define _LINUX_BOOTMEM_H
|
|
|
|
|
|
|
|
#include <linux/mmzone.h>
|
2006-09-26 14:31:06 +08:00
|
|
|
#include <asm/dma.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* simple boot-time physical memory area allocator.
|
|
|
|
*/
|
|
|
|
|
|
|
|
extern unsigned long max_low_pfn;
|
|
|
|
extern unsigned long min_low_pfn;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* highest page
|
|
|
|
*/
|
|
|
|
extern unsigned long max_pfn;
|
|
|
|
|
2005-06-26 05:58:18 +08:00
|
|
|
#ifdef CONFIG_CRASH_DUMP
|
|
|
|
extern unsigned long saved_max_pfn;	/* declared only when CONFIG_CRASH_DUMP is set */
|
|
|
|
#endif /* CONFIG_CRASH_DUMP */
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* node_bootmem_map is a map pointer - the bits represent all physical
|
|
|
|
* memory pages (including holes) on the node.
|
|
|
|
*/
|
|
|
|
/* State record handed to the boot-time allocator (see __alloc_bootmem_core()); also usable as bootmem_data_t. */
typedef struct bootmem_data {
|
|
|
|
unsigned long node_boot_start;	/* NOTE(review): presumably where this node's bootmem range starts - confirm units */
|
|
|
|
unsigned long node_low_pfn;	/* NOTE(review): presumably a page frame number bounding the node's memory - confirm */
|
|
|
|
void *node_bootmem_map;	/* map pointer: bits represent all physical pages on the node, holes included */
|
|
|
|
unsigned long last_offset;	/* NOTE(review): allocator bookkeeping; meaning not visible in this header */
|
|
|
|
unsigned long last_pos;	/* NOTE(review): allocator bookkeeping; meaning not visible in this header */
|
|
|
|
unsigned long last_success; /* Previous allocation point. To speed
|
|
|
|
* up searching */
|
2006-03-27 17:15:58 +08:00
|
|
|
struct list_head list;	/* list linkage; NOTE(review): presumably chains bootmem_data records - confirm */
|
2005-04-17 06:20:36 +08:00
|
|
|
} bootmem_data_t;
|
|
|
|
|
2006-09-26 14:31:08 +08:00
|
|
|
extern unsigned long bootmem_bootmap_pages(unsigned long);
|
|
|
|
extern unsigned long init_bootmem(unsigned long addr, unsigned long memend);
|
|
|
|
extern void free_bootmem(unsigned long addr, unsigned long size);
|
|
|
|
extern void *__alloc_bootmem(unsigned long size,
|
|
|
|
unsigned long align,
|
|
|
|
unsigned long goal);
|
|
|
|
extern void *__alloc_bootmem_nopanic(unsigned long size,
|
|
|
|
unsigned long align,
|
|
|
|
unsigned long goal);
|
|
|
|
extern void *__alloc_bootmem_low(unsigned long size,
|
|
|
|
unsigned long align,
|
|
|
|
unsigned long goal);
|
|
|
|
extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
|
|
|
|
unsigned long size,
|
|
|
|
unsigned long align,
|
|
|
|
unsigned long goal);
|
|
|
|
extern void *__alloc_bootmem_core(struct bootmem_data *bdata,
|
|
|
|
unsigned long size,
|
2006-09-26 14:31:05 +08:00
|
|
|
unsigned long align,
|
2006-09-26 14:31:08 +08:00
|
|
|
unsigned long goal,
|
|
|
|
unsigned long limit);
|
|
|
|
|
2008-02-07 16:15:17 +08:00
|
|
|
/*
|
|
|
|
* flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE,
|
|
|
|
* the architecture-specific code should honor this)
|
|
|
|
*/
|
|
|
|
#define BOOTMEM_DEFAULT 0	/* no flag bits set */
|
|
|
|
#define BOOTMEM_EXCLUSIVE (1<<0)	/* bit 0: reserve_bootmem() returns -EBUSY if the range was already reserved */
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
|
2008-02-07 16:15:17 +08:00
|
|
|
/*
|
|
|
|
* If flags is 0, then the return value is always 0 (success). If
|
|
|
|
* flags contains BOOTMEM_EXCLUSIVE, then -EBUSY is returned if the
|
|
|
|
* memory was already reserved.
|
|
|
|
*/
|
|
|
|
extern int reserve_bootmem(unsigned long addr, unsigned long size, int flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
/* alloc_bootmem(x): __alloc_bootmem() of size x, align SMP_CACHE_BYTES, goal __pa(MAX_DMA_ADDRESS). */
#define alloc_bootmem(x) \
|
2006-09-26 14:31:05 +08:00
|
|
|
__alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
|
2005-04-17 06:20:36 +08:00
|
|
|
/* alloc_bootmem_low(x): __alloc_bootmem_low() of size x, align SMP_CACHE_BYTES, goal 0. */
#define alloc_bootmem_low(x) \
|
2006-09-26 14:31:05 +08:00
|
|
|
__alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
|
2005-04-17 06:20:36 +08:00
|
|
|
/* alloc_bootmem_pages(x): like alloc_bootmem(x) but aligned to PAGE_SIZE. */
#define alloc_bootmem_pages(x) \
|
2006-09-26 14:31:05 +08:00
|
|
|
__alloc_bootmem(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
|
2005-04-17 06:20:36 +08:00
|
|
|
/* alloc_bootmem_low_pages(x): like alloc_bootmem_low(x) but aligned to PAGE_SIZE. */
#define alloc_bootmem_low_pages(x) \
|
2006-09-26 14:31:05 +08:00
|
|
|
__alloc_bootmem_low(x, PAGE_SIZE, 0)
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
|
2006-09-26 14:31:08 +08:00
|
|
|
|
|
|
|
extern unsigned long free_all_bootmem(void);
|
|
|
|
extern unsigned long free_all_bootmem_node(pg_data_t *pgdat);
|
|
|
|
extern void *__alloc_bootmem_node(pg_data_t *pgdat,
|
|
|
|
unsigned long size,
|
|
|
|
unsigned long align,
|
|
|
|
unsigned long goal);
|
|
|
|
extern unsigned long init_bootmem_node(pg_data_t *pgdat,
|
|
|
|
unsigned long freepfn,
|
|
|
|
unsigned long startpfn,
|
|
|
|
unsigned long endpfn);
|
|
|
|
extern void reserve_bootmem_node(pg_data_t *pgdat,
|
|
|
|
unsigned long physaddr,
|
2008-02-07 16:15:17 +08:00
|
|
|
unsigned long size,
|
|
|
|
int flags);
|
2006-09-26 14:31:08 +08:00
|
|
|
extern void free_bootmem_node(pg_data_t *pgdat,
|
|
|
|
unsigned long addr,
|
|
|
|
unsigned long size);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
|
|
|
|
/* alloc_bootmem_node(pgdat, x): __alloc_bootmem_node() on pgdat, size x, align SMP_CACHE_BYTES, goal __pa(MAX_DMA_ADDRESS). */
#define alloc_bootmem_node(pgdat, x) \
|
2006-09-26 14:31:05 +08:00
|
|
|
__alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
|
2005-04-17 06:20:36 +08:00
|
|
|
/* alloc_bootmem_pages_node(pgdat, x): like alloc_bootmem_node() but aligned to PAGE_SIZE. */
#define alloc_bootmem_pages_node(pgdat, x) \
|
2006-09-26 14:31:05 +08:00
|
|
|
__alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
|
2005-04-17 06:20:36 +08:00
|
|
|
/* alloc_bootmem_low_pages_node(pgdat, x): __alloc_bootmem_low_node() on pgdat, size x, align PAGE_SIZE, goal 0. */
#define alloc_bootmem_low_pages_node(pgdat, x) \
|
2006-09-26 14:31:05 +08:00
|
|
|
__alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0)
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
|
|
|
|
|
2005-06-23 15:07:39 +08:00
|
|
|
/*
 * alloc_remap(): declared extern when CONFIG_HAVE_ARCH_ALLOC_REMAP is set;
 * otherwise an inline stub that always returns NULL.
 */
#ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP
|
|
|
|
extern void *alloc_remap(int nid, unsigned long size);
|
|
|
|
#else /* !CONFIG_HAVE_ARCH_ALLOC_REMAP */
|
|
|
|
/* Stub: ignores nid and size and returns NULL. */
static inline void *alloc_remap(int nid, unsigned long size)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
2006-09-26 14:31:08 +08:00
|
|
|
#endif /* CONFIG_HAVE_ARCH_ALLOC_REMAP */
|
2005-06-23 15:07:39 +08:00
|
|
|
|
2006-07-10 19:44:51 +08:00
|
|
|
extern unsigned long __meminitdata nr_kernel_pages;
|
2007-03-22 16:11:22 +08:00
|
|
|
extern unsigned long __meminitdata nr_all_pages;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-09-26 14:31:08 +08:00
|
|
|
extern void *alloc_large_system_hash(const char *tablename,
|
|
|
|
unsigned long bucketsize,
|
|
|
|
unsigned long numentries,
|
|
|
|
int scale,
|
|
|
|
int flags,
|
|
|
|
unsigned int *_hash_shift,
|
|
|
|
unsigned int *_hash_mask,
|
|
|
|
unsigned long limit);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-12-07 12:37:33 +08:00
|
|
|
#define HASH_EARLY 0x00000001 /* Allocating during early boot? */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Only NUMA needs hash distribution.
|
[PATCH] x86-64: Set HASHDIST_DEFAULT to 1 for x86_64 NUMA
Enable system hashtable memory to be distributed among nodes on x86_64 NUMA
Forcing the kernel to use node interleaved vmalloc instead of bootmem for
the system hashtable memory (alloc_large_system_hash) reduces the memory
imbalance on node 0 by around 40MB on an 8-node x86_64 NUMA box:
Before the following patch, on bootup of an 8-node box:
Node 0 MemTotal: 3407488 kB
Node 0 MemFree: 3206296 kB
Node 0 MemUsed: 201192 kB
Node 0 Active: 7012 kB
Node 0 Inactive: 512 kB
Node 0 Dirty: 0 kB
Node 0 Writeback: 0 kB
Node 0 FilePages: 1912 kB
Node 0 Mapped: 420 kB
Node 0 AnonPages: 5612 kB
Node 0 PageTables: 468 kB
Node 0 NFS_Unstable: 0 kB
Node 0 Bounce: 0 kB
Node 0 Slab: 5408 kB
Node 0 SReclaimable: 644 kB
Node 0 SUnreclaim: 4764 kB
After the patch (or using hashdist=1 on the kernel command line):
Node 0 MemTotal: 3407488 kB
Node 0 MemFree: 3247608 kB
Node 0 MemUsed: 159880 kB
Node 0 Active: 3012 kB
Node 0 Inactive: 616 kB
Node 0 Dirty: 0 kB
Node 0 Writeback: 0 kB
Node 0 FilePages: 2424 kB
Node 0 Mapped: 380 kB
Node 0 AnonPages: 1200 kB
Node 0 PageTables: 396 kB
Node 0 NFS_Unstable: 0 kB
Node 0 Bounce: 0 kB
Node 0 Slab: 6304 kB
Node 0 SReclaimable: 1596 kB
Node 0 SUnreclaim: 4708 kB
I guess it is a good idea to keep HASHDIST_DEFAULT "on" for x86_64 NUMA
since x86_64 has no dearth of vmalloc space? Or maybe enable hash
distribution for all 64bit NUMA arches? The following patch does it only
for x86_64.
I ran a HPC MPI benchmark -- 'Ansys wingsolid', which takes up quite a bit of
memory and uses up tlb entries. This was on a 4 way, 2 socket
Tyan AMD box (non vsmp), with 8G total memory (4G pernode).
The results with and without hash distribution are:
1. Vanilla - runtime of 1188.000s
2. With hashdist=1 runtime of 1154.000s
Oprofile output for the duration of run is:
1. Vanilla:
CPU: AMD64 processors, speed 2411.16 MHz (estimated)
Counted L1_AND_L2_DTLB_MISSES events (L1 and L2 DTLB misses) with a unit
mask of 0x00 (No unit mask) count 500
samples % app name symbol name
163054 6.5513 libansys1.so MultiFront::decompose(int, int,
Elemset *, int *, int, int, int)
162061 6.5114 libansys3.so blockSaxpy6L_fd
162042 6.5107 libansys3.so blockInnerProduct6L_fd
156286 6.2794 libansys3.so maxb33_
87879 3.5309 libansys1.so elmatrixmultpcg_
84857 3.4095 libansys4.so saxpy_pcg
58637 2.3560 libansys4.so .st4560
46612 1.8728 libansys4.so .st4282
43043 1.7294 vmlinux-t copy_user_generic_string
41326 1.6604 libansys3.so blockSaxpyBackSolve6L_fd
41288 1.6589 libansys3.so blockInnerProductBackSolve6L_fd
2. With hashdist=1
CPU: AMD64 processors, speed 2411.13 MHz (estimated)
Counted L1_AND_L2_DTLB_MISSES events (L1 and L2 DTLB misses) with a unit
mask of 0x00 (No unit mask) count 500
samples % app name symbol name
162993 6.9814 libansys1.so MultiFront::decompose(int, int,
Elemset *, int *, int, int, int)
160799 6.8874 libansys3.so blockInnerProduct6L_fd
160459 6.8729 libansys3.so blockSaxpy6L_fd
156018 6.6826 libansys3.so maxb33_
84700 3.6279 libansys4.so saxpy_pcg
83434 3.5737 libansys1.so elmatrixmultpcg_
58074 2.4875 libansys4.so .st4560
46000 1.9703 libansys4.so .st4282
41166 1.7632 libansys3.so blockSaxpyBackSolve6L_fd
41033 1.7575 libansys3.so blockInnerProductBackSolve6L_fd
35762 1.5318 libansys1.so inner_product_sub
35591 1.5245 libansys1.so inner_product_sub2
28259 1.2104 libansys4.so addVectors
Signed-off-by: Pravin B. Shelar <pravin.shelar@calsoftinc.com>
Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Signed-off-by: Shai Fultheim <shai@scalex86.org>
Signed-off-by: Andi Kleen <ak@suse.de>
Acked-by: Christoph Lameter <clameter@engr.sgi.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-03 01:27:08 +08:00
|
|
|
* IA64 and x86_64 have sufficient vmalloc space.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
[PATCH] x86-64: Set HASHDIST_DEFAULT to 1 for x86_64 NUMA
Enable system hashtable memory to be distributed among nodes on x86_64 NUMA
Forcing the kernel to use node interleaved vmalloc instead of bootmem for
the system hashtable memory (alloc_large_system_hash) reduces the memory
imbalance on node 0 by around 40MB on an 8-node x86_64 NUMA box:
Before the following patch, on bootup of an 8-node box:
Node 0 MemTotal: 3407488 kB
Node 0 MemFree: 3206296 kB
Node 0 MemUsed: 201192 kB
Node 0 Active: 7012 kB
Node 0 Inactive: 512 kB
Node 0 Dirty: 0 kB
Node 0 Writeback: 0 kB
Node 0 FilePages: 1912 kB
Node 0 Mapped: 420 kB
Node 0 AnonPages: 5612 kB
Node 0 PageTables: 468 kB
Node 0 NFS_Unstable: 0 kB
Node 0 Bounce: 0 kB
Node 0 Slab: 5408 kB
Node 0 SReclaimable: 644 kB
Node 0 SUnreclaim: 4764 kB
After the patch (or using hashdist=1 on the kernel command line):
Node 0 MemTotal: 3407488 kB
Node 0 MemFree: 3247608 kB
Node 0 MemUsed: 159880 kB
Node 0 Active: 3012 kB
Node 0 Inactive: 616 kB
Node 0 Dirty: 0 kB
Node 0 Writeback: 0 kB
Node 0 FilePages: 2424 kB
Node 0 Mapped: 380 kB
Node 0 AnonPages: 1200 kB
Node 0 PageTables: 396 kB
Node 0 NFS_Unstable: 0 kB
Node 0 Bounce: 0 kB
Node 0 Slab: 6304 kB
Node 0 SReclaimable: 1596 kB
Node 0 SUnreclaim: 4708 kB
I guess it is a good idea to keep HASHDIST_DEFAULT "on" for x86_64 NUMA
since x86_64 has no dearth of vmalloc space? Or maybe enable hash
distribution for all 64bit NUMA arches? The following patch does it only
for x86_64.
I ran a HPC MPI benchmark -- 'Ansys wingsolid', which takes up quite a bit of
memory and uses up tlb entries. This was on a 4 way, 2 socket
Tyan AMD box (non vsmp), with 8G total memory (4G pernode).
The results with and without hash distribution are:
1. Vanilla - runtime of 1188.000s
2. With hashdist=1 runtime of 1154.000s
Oprofile output for the duration of run is:
1. Vanilla:
CPU: AMD64 processors, speed 2411.16 MHz (estimated)
Counted L1_AND_L2_DTLB_MISSES events (L1 and L2 DTLB misses) with a unit
mask of 0x00 (No unit mask) count 500
samples % app name symbol name
163054 6.5513 libansys1.so MultiFront::decompose(int, int,
Elemset *, int *, int, int, int)
162061 6.5114 libansys3.so blockSaxpy6L_fd
162042 6.5107 libansys3.so blockInnerProduct6L_fd
156286 6.2794 libansys3.so maxb33_
87879 3.5309 libansys1.so elmatrixmultpcg_
84857 3.4095 libansys4.so saxpy_pcg
58637 2.3560 libansys4.so .st4560
46612 1.8728 libansys4.so .st4282
43043 1.7294 vmlinux-t copy_user_generic_string
41326 1.6604 libansys3.so blockSaxpyBackSolve6L_fd
41288 1.6589 libansys3.so blockInnerProductBackSolve6L_fd
2. With hashdist=1
CPU: AMD64 processors, speed 2411.13 MHz (estimated)
Counted L1_AND_L2_DTLB_MISSES events (L1 and L2 DTLB misses) with a unit
mask of 0x00 (No unit mask) count 500
samples % app name symbol name
162993 6.9814 libansys1.so MultiFront::decompose(int, int,
Elemset *, int *, int, int, int)
160799 6.8874 libansys3.so blockInnerProduct6L_fd
160459 6.8729 libansys3.so blockSaxpy6L_fd
156018 6.6826 libansys3.so maxb33_
84700 3.6279 libansys4.so saxpy_pcg
83434 3.5737 libansys1.so elmatrixmultpcg_
58074 2.4875 libansys4.so .st4560
46000 1.9703 libansys4.so .st4282
41166 1.7632 libansys3.so blockSaxpyBackSolve6L_fd
41033 1.7575 libansys3.so blockInnerProductBackSolve6L_fd
35762 1.5318 libansys1.so inner_product_sub
35591 1.5245 libansys1.so inner_product_sub2
28259 1.2104 libansys4.so addVectors
Signed-off-by: Pravin B. Shelar <pravin.shelar@calsoftinc.com>
Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Signed-off-by: Shai Fultheim <shai@scalex86.org>
Signed-off-by: Andi Kleen <ak@suse.de>
Acked-by: Christoph Lameter <clameter@engr.sgi.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-03 01:27:08 +08:00
|
|
|
/* HASHDIST_DEFAULT is 1 only for NUMA builds on IA64 or x86_64, and 0 otherwise. */
#if defined(CONFIG_NUMA) && (defined(CONFIG_IA64) || defined(CONFIG_X86_64))
|
2005-04-17 06:20:36 +08:00
|
|
|
#define HASHDIST_DEFAULT 1
|
|
|
|
#else /* !CONFIG_NUMA, or neither CONFIG_IA64 nor CONFIG_X86_64 */
|
|
|
|
#define HASHDIST_DEFAULT 0
|
|
|
|
#endif /* CONFIG_NUMA && (CONFIG_IA64 || CONFIG_X86_64) */
|
2006-08-24 18:08:07 +08:00
|
|
|
extern int hashdist; /* Distribute hashes across NUMA nodes? */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
|
|
|
|
#endif /* _LINUX_BOOTMEM_H */
|