mirror of
https://mirrors.bfsu.edu.cn/git/linux.git
synced 2025-01-05 21:35:04 +08:00
96db800f5d
alloc_pages_exact_node() was introduced in commit 6484eb3e2a
("page allocator: do not check NUMA node ID when the caller knows the node is valid") as an optimized variant of alloc_pages_node(), that doesn't fall back to the current node for nid == NUMA_NO_NODE. Unfortunately the name of the function can easily suggest that the allocation is restricted to the given node and fails otherwise. In truth, the node is only preferred, unless __GFP_THISNODE is passed among the gfp flags. The misleading name has led to mistakes in the past, see for example commits 5265047ac3
("mm, thp: really limit transparent hugepage allocation to local node") and b360edb43f
("mm, mempolicy: migrate_to_node should only migrate to node"). Another issue with the name is that there's a family of alloc_pages_exact*() functions where 'exact' means exact size (instead of page order), which leads to more confusion. To prevent further mistakes, this patch effectively renames alloc_pages_exact_node() to __alloc_pages_node() to better convey that it's an optimized variant of alloc_pages_node() not intended for general usage. Both functions get described in comments. It has also been considered to really provide a convenience function for allocations restricted to a node, but the majority opinion seems to be that __GFP_THISNODE already provides that functionality and we shouldn't duplicate the API needlessly. The number of users would be small anyway. Existing callers of alloc_pages_exact_node() are simply converted to call __alloc_pages_node(), with the exception of sba_alloc_coherent() which open-codes the check for NUMA_NO_NODE, so it is converted to use alloc_pages_node() instead. This means it no longer performs some VM_BUG_ON checks, and since the current check for nid in alloc_pages_node() uses a 'nid < 0' comparison (which includes NUMA_NO_NODE), it may hide wrong values which would be previously exposed. Both differences will be rectified by the next patch. To sum up, this patch makes no functional changes, except temporarily hiding potentially buggy callers. Restricting the checks in alloc_pages_node() is left for the next patch which can in turn expose more existing buggy callers. 
Signed-off-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Acked-by: Robin Holt <robinmholt@gmail.com> Acked-by: Michal Hocko <mhocko@suse.com> Acked-by: Christoph Lameter <cl@linux.com> Acked-by: Michael Ellerman <mpe@ellerman.id.au> Cc: Mel Gorman <mgorman@suse.de> Cc: David Rientjes <rientjes@google.com> Cc: Greg Thelen <gthelen@google.com> Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> Cc: Pekka Enberg <penberg@kernel.org> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: Tony Luck <tony.luck@intel.com> Cc: Fenghua Yu <fenghua.yu@intel.com> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mackerras <paulus@samba.org> Cc: Gleb Natapov <gleb@kernel.org> Cc: Paolo Bonzini <pbonzini@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Ingo Molnar <mingo@redhat.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Cliff Whickman <cpw@sgi.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
282 lines
7.5 KiB
C
282 lines
7.5 KiB
C
/*
|
|
* Copyright (C) 2001-2008 Silicon Graphics, Inc. All rights reserved.
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify it
|
|
* under the terms of version 2 of the GNU General Public License
|
|
* as published by the Free Software Foundation.
|
|
*
|
|
* A simple uncached page allocator using the generic allocator. This
|
|
* allocator first utilizes the spare (spill) pages found in the EFI
|
|
* memmap and will then start converting cached pages to uncached ones
|
|
* at a granule at a time. Node awareness is implemented by having a
|
|
* pool of pages per node.
|
|
*/
|
|
|
|
#include <linux/types.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/module.h>
|
|
#include <linux/init.h>
|
|
#include <linux/errno.h>
|
|
#include <linux/string.h>
|
|
#include <linux/efi.h>
|
|
#include <linux/genalloc.h>
|
|
#include <linux/gfp.h>
|
|
#include <asm/page.h>
|
|
#include <asm/pal.h>
|
|
#include <asm/pgtable.h>
|
|
#include <linux/atomic.h>
|
|
#include <asm/tlbflush.h>
|
|
#include <asm/sn/arch.h>
|
|
|
|
|
|
extern void __init efi_memmap_walk_uc(efi_freemem_callback_t, void *);
|
|
|
|
struct uncached_pool {
|
|
struct gen_pool *pool;
|
|
struct mutex add_chunk_mutex; /* serialize adding a converted chunk */
|
|
int nchunks_added; /* #of converted chunks added to pool */
|
|
atomic_t status; /* smp called function's return status*/
|
|
};
|
|
|
|
#define MAX_CONVERTED_CHUNKS_PER_NODE 2
|
|
|
|
struct uncached_pool uncached_pools[MAX_NUMNODES];
|
|
|
|
|
|
static void uncached_ipi_visibility(void *data)
|
|
{
|
|
int status;
|
|
struct uncached_pool *uc_pool = (struct uncached_pool *)data;
|
|
|
|
status = ia64_pal_prefetch_visibility(PAL_VISIBILITY_PHYSICAL);
|
|
if ((status != PAL_VISIBILITY_OK) &&
|
|
(status != PAL_VISIBILITY_OK_REMOTE_NEEDED))
|
|
atomic_inc(&uc_pool->status);
|
|
}
|
|
|
|
|
|
static void uncached_ipi_mc_drain(void *data)
|
|
{
|
|
int status;
|
|
struct uncached_pool *uc_pool = (struct uncached_pool *)data;
|
|
|
|
status = ia64_pal_mc_drain();
|
|
if (status != PAL_STATUS_SUCCESS)
|
|
atomic_inc(&uc_pool->status);
|
|
}
|
|
|
|
|
|
/*
|
|
* Add a new chunk of uncached memory pages to the specified pool.
|
|
*
|
|
* @pool: pool to add new chunk of uncached memory to
|
|
* @nid: node id of node to allocate memory from, or -1
|
|
*
|
|
* This is accomplished by first allocating a granule of cached memory pages
|
|
* and then converting them to uncached memory pages.
|
|
*/
|
|
static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid)
|
|
{
|
|
struct page *page;
|
|
int status, i, nchunks_added = uc_pool->nchunks_added;
|
|
unsigned long c_addr, uc_addr;
|
|
|
|
if (mutex_lock_interruptible(&uc_pool->add_chunk_mutex) != 0)
|
|
return -1; /* interrupted by a signal */
|
|
|
|
if (uc_pool->nchunks_added > nchunks_added) {
|
|
/* someone added a new chunk while we were waiting */
|
|
mutex_unlock(&uc_pool->add_chunk_mutex);
|
|
return 0;
|
|
}
|
|
|
|
if (uc_pool->nchunks_added >= MAX_CONVERTED_CHUNKS_PER_NODE) {
|
|
mutex_unlock(&uc_pool->add_chunk_mutex);
|
|
return -1;
|
|
}
|
|
|
|
/* attempt to allocate a granule's worth of cached memory pages */
|
|
|
|
page = __alloc_pages_node(nid,
|
|
GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
|
|
IA64_GRANULE_SHIFT-PAGE_SHIFT);
|
|
if (!page) {
|
|
mutex_unlock(&uc_pool->add_chunk_mutex);
|
|
return -1;
|
|
}
|
|
|
|
/* convert the memory pages from cached to uncached */
|
|
|
|
c_addr = (unsigned long)page_address(page);
|
|
uc_addr = c_addr - PAGE_OFFSET + __IA64_UNCACHED_OFFSET;
|
|
|
|
/*
|
|
* There's a small race here where it's possible for someone to
|
|
* access the page through /dev/mem halfway through the conversion
|
|
* to uncached - not sure it's really worth bothering about
|
|
*/
|
|
for (i = 0; i < (IA64_GRANULE_SIZE / PAGE_SIZE); i++)
|
|
SetPageUncached(&page[i]);
|
|
|
|
flush_tlb_kernel_range(uc_addr, uc_addr + IA64_GRANULE_SIZE);
|
|
|
|
status = ia64_pal_prefetch_visibility(PAL_VISIBILITY_PHYSICAL);
|
|
if (status == PAL_VISIBILITY_OK_REMOTE_NEEDED) {
|
|
atomic_set(&uc_pool->status, 0);
|
|
status = smp_call_function(uncached_ipi_visibility, uc_pool, 1);
|
|
if (status || atomic_read(&uc_pool->status))
|
|
goto failed;
|
|
} else if (status != PAL_VISIBILITY_OK)
|
|
goto failed;
|
|
|
|
preempt_disable();
|
|
|
|
if (ia64_platform_is("sn2"))
|
|
sn_flush_all_caches(uc_addr, IA64_GRANULE_SIZE);
|
|
else
|
|
flush_icache_range(uc_addr, uc_addr + IA64_GRANULE_SIZE);
|
|
|
|
/* flush the just introduced uncached translation from the TLB */
|
|
local_flush_tlb_all();
|
|
|
|
preempt_enable();
|
|
|
|
status = ia64_pal_mc_drain();
|
|
if (status != PAL_STATUS_SUCCESS)
|
|
goto failed;
|
|
atomic_set(&uc_pool->status, 0);
|
|
status = smp_call_function(uncached_ipi_mc_drain, uc_pool, 1);
|
|
if (status || atomic_read(&uc_pool->status))
|
|
goto failed;
|
|
|
|
/*
|
|
* The chunk of memory pages has been converted to uncached so now we
|
|
* can add it to the pool.
|
|
*/
|
|
status = gen_pool_add(uc_pool->pool, uc_addr, IA64_GRANULE_SIZE, nid);
|
|
if (status)
|
|
goto failed;
|
|
|
|
uc_pool->nchunks_added++;
|
|
mutex_unlock(&uc_pool->add_chunk_mutex);
|
|
return 0;
|
|
|
|
/* failed to convert or add the chunk so give it back to the kernel */
|
|
failed:
|
|
for (i = 0; i < (IA64_GRANULE_SIZE / PAGE_SIZE); i++)
|
|
ClearPageUncached(&page[i]);
|
|
|
|
free_pages(c_addr, IA64_GRANULE_SHIFT-PAGE_SHIFT);
|
|
mutex_unlock(&uc_pool->add_chunk_mutex);
|
|
return -1;
|
|
}
|
|
|
|
|
|
/*
|
|
* uncached_alloc_page
|
|
*
|
|
* @starting_nid: node id of node to start with, or -1
|
|
* @n_pages: number of contiguous pages to allocate
|
|
*
|
|
* Allocate the specified number of contiguous uncached pages on the
|
|
* the requested node. If not enough contiguous uncached pages are available
|
|
* on the requested node, roundrobin starting with the next higher node.
|
|
*/
|
|
unsigned long uncached_alloc_page(int starting_nid, int n_pages)
|
|
{
|
|
unsigned long uc_addr;
|
|
struct uncached_pool *uc_pool;
|
|
int nid;
|
|
|
|
if (unlikely(starting_nid >= MAX_NUMNODES))
|
|
return 0;
|
|
|
|
if (starting_nid < 0)
|
|
starting_nid = numa_node_id();
|
|
nid = starting_nid;
|
|
|
|
do {
|
|
if (!node_state(nid, N_HIGH_MEMORY))
|
|
continue;
|
|
uc_pool = &uncached_pools[nid];
|
|
if (uc_pool->pool == NULL)
|
|
continue;
|
|
do {
|
|
uc_addr = gen_pool_alloc(uc_pool->pool,
|
|
n_pages * PAGE_SIZE);
|
|
if (uc_addr != 0)
|
|
return uc_addr;
|
|
} while (uncached_add_chunk(uc_pool, nid) == 0);
|
|
|
|
} while ((nid = (nid + 1) % MAX_NUMNODES) != starting_nid);
|
|
|
|
return 0;
|
|
}
|
|
EXPORT_SYMBOL(uncached_alloc_page);
|
|
|
|
|
|
/*
|
|
* uncached_free_page
|
|
*
|
|
* @uc_addr: uncached address of first page to free
|
|
* @n_pages: number of contiguous pages to free
|
|
*
|
|
* Free the specified number of uncached pages.
|
|
*/
|
|
void uncached_free_page(unsigned long uc_addr, int n_pages)
|
|
{
|
|
int nid = paddr_to_nid(uc_addr - __IA64_UNCACHED_OFFSET);
|
|
struct gen_pool *pool = uncached_pools[nid].pool;
|
|
|
|
if (unlikely(pool == NULL))
|
|
return;
|
|
|
|
if ((uc_addr & (0XFUL << 60)) != __IA64_UNCACHED_OFFSET)
|
|
panic("uncached_free_page invalid address %lx\n", uc_addr);
|
|
|
|
gen_pool_free(pool, uc_addr, n_pages * PAGE_SIZE);
|
|
}
|
|
EXPORT_SYMBOL(uncached_free_page);
|
|
|
|
|
|
/*
|
|
* uncached_build_memmap,
|
|
*
|
|
* @uc_start: uncached starting address of a chunk of uncached memory
|
|
* @uc_end: uncached ending address of a chunk of uncached memory
|
|
* @arg: ignored, (NULL argument passed in on call to efi_memmap_walk_uc())
|
|
*
|
|
* Called at boot time to build a map of pages that can be used for
|
|
* memory special operations.
|
|
*/
|
|
static int __init uncached_build_memmap(u64 uc_start, u64 uc_end, void *arg)
|
|
{
|
|
int nid = paddr_to_nid(uc_start - __IA64_UNCACHED_OFFSET);
|
|
struct gen_pool *pool = uncached_pools[nid].pool;
|
|
size_t size = uc_end - uc_start;
|
|
|
|
touch_softlockup_watchdog();
|
|
|
|
if (pool != NULL) {
|
|
memset((char *)uc_start, 0, size);
|
|
(void) gen_pool_add(pool, uc_start, size, nid);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
|
|
/*
 * Boot-time setup: create a pool and its mutex for every online node, then
 * let efi_memmap_walk_uc() feed the chunks it reports into those pools
 * through uncached_build_memmap(). Always returns 0.
 */
static int __init uncached_init(void)
{
	int nid;

	for_each_node_state(nid, N_ONLINE) {
		struct uncached_pool *uc_pool = &uncached_pools[nid];

		/* the result is not checked here; users test pool for NULL */
		uc_pool->pool = gen_pool_create(PAGE_SHIFT, nid);
		mutex_init(&uc_pool->add_chunk_mutex);
	}

	efi_memmap_walk_uc(uncached_build_memmap, NULL);
	return 0;
}

__initcall(uncached_init);
|