mirror of
https://github.com/edk2-porting/linux-next.git
synced 2025-01-21 03:54:03 +08:00
e3246d8f52
Patch series "sparse-vmemmap: memory savings for compound devmaps (device-dax)", v9. This series minimizes 'struct page' overhead by pursuing a similar approach as Muchun Song series "Free some vmemmap pages of hugetlb page" (now merged since v5.14), but applied to devmap with @vmemmap_shift (device-dax). The original idea of vmemmap deduplication (already used in HugeTLB) is to reuse/deduplicate tail page vmemmap areas, in particular the area which only describes tail pages. So a vmemmap page describes 64 struct pages, and the first page for a given ZONE_DEVICE vmemmap would contain the head page and 63 tail pages. The second vmemmap page would contain only tail pages, and that's what gets reused across the rest of the subsection/section. The bigger the page size, the bigger the savings (2M hpage -> save 6 vmemmap pages; 1G hpage -> save 4094 vmemmap pages). This is done for PMEM /specifically only/ on device-dax configured namespaces, not fsdax. In other words, a devmap with a @vmemmap_shift. In terms of savings, per 1Tb of memory, the struct page cost would go down with compound devmap: * with 2M pages we lose 4G instead of 16G (0.39% instead of 1.5% of total memory) * with 1G pages we lose 40MB instead of 16G (0.0014% instead of 1.5% of total memory) The series is mostly summed up by patch 4, and to summarize what the series does: Patches 1 - 3: Minor cleanups in preparation for patch 4. Move the very nice docs of hugetlb_vmemmap.c into a Documentation/vm/ entry. Patch 4: Patch 4 is the one that takes care of the struct page savings (also referred to here as tail-page/vmemmap deduplication). Much like Muchun series, we reuse the second PTE tail page vmemmap areas across a given @vmemmap_shift. One important difference, though, is that contrary to the hugetlbfs series, there's no vmemmap for the area because we are late-populating it as opposed to remapping a system-ram range. 
IOW no freeing of pages of already initialized vmemmap like the case for hugetlbfs, which greatly simplifies the logic (besides not being arch-specific). altmap case unchanged and still goes via the vmemmap_populate(). Also adjust the newly added docs to the device-dax case. [Note that device-dax is still a little behind HugeTLB in terms of savings. I have an additional simple patch that reuses the head vmemmap page too, as a follow-up. That will double the savings and namespaces initialization.] Patch 5: Initialize fewer struct pages depending on the page size with DRAM backed struct pages -- because fewer pages are unique and most tail pages (with bigger vmemmap_shift). NVDIMM namespace bootstrap improves from ~268-358 ms to ~80-110/<1ms on 128G NVDIMMs with 2M and 1G respectively. And struct page needed capacity will be 3.8x / 1071x smaller for 2M and 1G respectively. Tested on x86 with 1.5Tb of pmem (including pinning, and RDMA registration/deregistration scalability with 2M MRs) This patch (of 5): In support of using compound pages for devmap mappings, plumb the pgmap down to the vmemmap_populate implementation. Note that while altmap is retrievable from pgmap the memory hotplug code passes altmap without pgmap[*], so both need to be independently plumbed. So in addition to @altmap, pass @pgmap to sparse section populate functions namely: sparse_add_section section_activate populate_section_memmap __populate_section_memmap Passing @pgmap allows __populate_section_memmap() to both fetch the vmemmap_shift in which memmap metadata is created for and also to let sparse-vmemmap fetch pgmap ranges to co-relate to a given section and pick whether to just reuse tail pages from past onlined sections. While at it, fix the kdoc for @altmap for sparse_add_section(). 
[*] https://lore.kernel.org/linux-mm/20210319092635.6214-1-osalvador@suse.de/ Link: https://lkml.kernel.org/r/20220420155310.9712-1-joao.m.martins@oracle.com Link: https://lkml.kernel.org/r/20220420155310.9712-2-joao.m.martins@oracle.com Signed-off-by: Joao Martins <joao.m.martins@oracle.com> Reviewed-by: Dan Williams <dan.j.williams@intel.com> Reviewed-by: Muchun Song <songmuchun@bytedance.com> Cc: Vishal Verma <vishal.l.verma@intel.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Jason Gunthorpe <jgg@ziepe.ca> Cc: Jane Chu <jane.chu@oracle.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Christoph Hellwig <hch@lst.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
355 lines
11 KiB
C
355 lines
11 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef __LINUX_MEMORY_HOTPLUG_H
|
|
#define __LINUX_MEMORY_HOTPLUG_H
|
|
|
|
#include <linux/mmzone.h>
|
|
#include <linux/spinlock.h>
|
|
#include <linux/notifier.h>
|
|
#include <linux/bug.h>
|
|
|
|
struct page;
|
|
struct zone;
|
|
struct pglist_data;
|
|
struct mem_section;
|
|
struct memory_block;
|
|
struct memory_group;
|
|
struct resource;
|
|
struct vmem_altmap;
|
|
struct dev_pagemap;
|
|
|
|
#ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION
|
|
/*
|
|
* For supporting node-hotadd, we have to allocate a new pgdat.
|
|
*
|
|
* If an arch has generic style NODE_DATA(),
|
|
* node_data[nid] = kzalloc() works well. But it depends on the architecture.
|
|
*
|
|
* In general, generic_alloc_nodedata() is used.
|
|
*
|
|
*/
|
|
extern pg_data_t *arch_alloc_nodedata(int nid);
|
|
extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat);
|
|
|
|
#else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
|
|
|
|
#define arch_alloc_nodedata(nid) generic_alloc_nodedata(nid)
|
|
|
|
#ifdef CONFIG_NUMA
|
|
/*
|
|
* XXX: node aware allocation can't work well to get new node's memory at this time.
|
|
* Because, pgdat for the new node is not allocated/initialized yet itself.
|
|
* To use new node's memory, more consideration will be necessary.
|
|
*/
|
|
/*
 * Allocate the pgdat for a newly hot-added node from memblock.
 * @nid is deliberately unused here: node-aware allocation cannot work
 * before the node's own pgdat exists (see the note above).
 *
 * NOTE(review): previously this expanded to sizeof(*pgdat), which
 * silently required a 'pg_data_t *pgdat' variable to be in scope at
 * every expansion site.  sizeof(pg_data_t) is identical in value for
 * those callers and makes the macro self-contained.
 */
#define generic_alloc_nodedata(nid)					\
({									\
	memblock_alloc(sizeof(pg_data_t), SMP_CACHE_BYTES);		\
})
|
|
/*
 * This definition is just for the error path in node hotadd.
 * For node hotremove, we have to replace this.
 *
 * NOTE(review): frees with kfree() even though generic_alloc_nodedata()
 * allocates from memblock -- per the comment above this is only meant
 * for the hotadd error path; confirm before using it anywhere else.
 */
#define generic_free_nodedata(pgdat)	kfree(pgdat)
|
|
|
|
extern pg_data_t *node_data[];
|
|
/*
 * Publish a newly allocated pgdat for @nid by installing it into the
 * generic node_data[] array (generic NODE_DATA() style, see above).
 */
static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
{
	node_data[nid] = pgdat;
}
|
|
|
|
#else /* !CONFIG_NUMA */
|
|
|
|
/* never called */
|
|
/*
 * !CONFIG_NUMA: there is only the single statically set up node, so a
 * new pgdat must never be requested -- BUG() if it ever is.
 */
static inline pg_data_t *generic_alloc_nodedata(int nid)
{
	BUG();
	return NULL;	/* unreachable after BUG(); satisfies the return type */
}
|
|
/* !CONFIG_NUMA: no pgdat is ever dynamically allocated, nothing to free. */
static inline void generic_free_nodedata(pg_data_t *pgdat)
{
}
|
|
/* !CONFIG_NUMA: no node_data[] array to update -- nothing to do. */
static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
{
}
|
|
#endif /* CONFIG_NUMA */
|
|
#endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
|
|
|
|
#ifdef CONFIG_MEMORY_HOTPLUG
|
|
struct page *pfn_to_online_page(unsigned long pfn);
|
|
|
|
/* Types for control the zone type of onlined and offlined memory */
|
|
enum {
|
|
/* Offline the memory. */
|
|
MMOP_OFFLINE = 0,
|
|
/* Online the memory. Zone depends, see default_zone_for_pfn(). */
|
|
MMOP_ONLINE,
|
|
/* Online the memory to ZONE_NORMAL. */
|
|
MMOP_ONLINE_KERNEL,
|
|
/* Online the memory to ZONE_MOVABLE. */
|
|
MMOP_ONLINE_MOVABLE,
|
|
};
|
|
|
|
/* Flags for add_memory() and friends to specify memory hotplug details. */
|
|
typedef int __bitwise mhp_t;
|
|
|
|
/* No special request */
|
|
#define MHP_NONE ((__force mhp_t)0)
|
|
/*
|
|
* Allow merging of the added System RAM resource with adjacent,
|
|
* mergeable resources. After a successful call to add_memory_resource()
|
|
* with this flag set, the resource pointer must no longer be used as it
|
|
* might be stale, or the resource might have changed.
|
|
*/
|
|
#define MHP_MERGE_RESOURCE ((__force mhp_t)BIT(0))
|
|
|
|
/*
|
|
* We want memmap (struct page array) to be self contained.
|
|
* To do so, we will use the beginning of the hot-added range to build
|
|
* the page tables for the memmap array that describes the entire range.
|
|
* Only selected architectures support it with SPARSE_VMEMMAP.
|
|
*/
|
|
#define MHP_MEMMAP_ON_MEMORY ((__force mhp_t)BIT(1))
|
|
/*
|
|
* The nid field specifies a memory group id (mgid) instead. The memory group
|
|
* implies the node id (nid).
|
|
*/
|
|
#define MHP_NID_IS_MGID ((__force mhp_t)BIT(2))
|
|
|
|
/*
|
|
* Extended parameters for memory hotplug:
|
|
* altmap: alternative allocator for memmap array (optional)
|
|
* pgprot: page protection flags to apply to newly created page tables
|
|
* (required)
|
|
*/
|
|
struct mhp_params {
|
|
struct vmem_altmap *altmap;
|
|
pgprot_t pgprot;
|
|
struct dev_pagemap *pgmap;
|
|
};
|
|
|
|
bool mhp_range_allowed(u64 start, u64 size, bool need_mapping);
|
|
struct range mhp_get_pluggable_range(bool need_mapping);
|
|
|
|
/*
|
|
* Zone resizing functions
|
|
*
|
|
 * Note: any attempt to resize a zone should have pgdat_resize_lock()
 * and zone_span_writelock() both held. This ensures the size of a zone
 * can't be changed while pgdat_resize_lock() is held.
|
|
*/
|
|
/* Begin a read-side critical section over the zone's span seqlock. */
static inline unsigned zone_span_seqbegin(struct zone *zone)
{
	return read_seqbegin(&zone->span_seqlock);
}

/*
 * Non-zero when a span update raced with the read started at @iv;
 * the caller must retry its read.
 */
static inline int zone_span_seqretry(struct zone *zone, unsigned iv)
{
	return read_seqretry(&zone->span_seqlock, iv);
}

/* Writer side: held while a zone's span is being resized (see above). */
static inline void zone_span_writelock(struct zone *zone)
{
	write_seqlock(&zone->span_seqlock);
}

static inline void zone_span_writeunlock(struct zone *zone)
{
	write_sequnlock(&zone->span_seqlock);
}

/* Initialize the span seqlock; must run before any of the above. */
static inline void zone_seqlock_init(struct zone *zone)
{
	seqlock_init(&zone->span_seqlock);
}
|
|
extern void adjust_present_page_count(struct page *page,
|
|
struct memory_group *group,
|
|
long nr_pages);
|
|
/* VM interface that may be used by firmware interface */
|
|
extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
|
|
struct zone *zone);
|
|
extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages);
|
|
extern int online_pages(unsigned long pfn, unsigned long nr_pages,
|
|
struct zone *zone, struct memory_group *group);
|
|
extern void __offline_isolated_pages(unsigned long start_pfn,
|
|
unsigned long end_pfn);
|
|
|
|
typedef void (*online_page_callback_t)(struct page *page, unsigned int order);
|
|
|
|
extern void generic_online_page(struct page *page, unsigned int order);
|
|
extern int set_online_page_callback(online_page_callback_t callback);
|
|
extern int restore_online_page_callback(online_page_callback_t callback);
|
|
|
|
extern int try_online_node(int nid);
|
|
|
|
extern int arch_add_memory(int nid, u64 start, u64 size,
|
|
struct mhp_params *params);
|
|
extern u64 max_mem_size;
|
|
|
|
extern int mhp_online_type_from_str(const char *str);
|
|
|
|
/* Default online_type (MMOP_*) when new memory blocks are added. */
|
|
extern int mhp_default_online_type;
|
|
/* If movable_node boot option specified */
|
|
extern bool movable_node_enabled;
|
|
/* True when the "movable_node" kernel boot option was specified. */
static inline bool movable_node_is_enabled(void)
{
	return movable_node_enabled;
}
|
|
|
|
extern void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap);
|
|
extern void __remove_pages(unsigned long start_pfn, unsigned long nr_pages,
|
|
struct vmem_altmap *altmap);
|
|
|
|
/* reasonably generic interface to expand the physical pages */
|
|
extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
|
|
struct mhp_params *params);
|
|
|
|
#ifndef CONFIG_ARCH_HAS_ADD_PAGES
|
|
/*
 * Default add_pages(): architectures without CONFIG_ARCH_HAS_ADD_PAGES
 * have no arch-specific work to do and defer directly to the generic
 * __add_pages() section/memmap setup.
 */
static inline int add_pages(int nid, unsigned long start_pfn,
		unsigned long nr_pages, struct mhp_params *params)
{
	return __add_pages(nid, start_pfn, nr_pages, params);
}
|
|
#else /* ARCH_HAS_ADD_PAGES */
|
|
int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
|
|
struct mhp_params *params);
|
|
#endif /* ARCH_HAS_ADD_PAGES */
|
|
|
|
void get_online_mems(void);
|
|
void put_online_mems(void);
|
|
|
|
void mem_hotplug_begin(void);
|
|
void mem_hotplug_done(void);
|
|
|
|
#else /* ! CONFIG_MEMORY_HOTPLUG */
|
|
/*
 * Without CONFIG_MEMORY_HOTPLUG every valid pfn's memmap is online:
 * map a valid @pfn straight to its struct page, otherwise yield NULL.
 * (Plain conditional expression; no GNU statement expression needed.)
 */
#define pfn_to_online_page(pfn) \
	(pfn_valid(pfn) ? pfn_to_page(pfn) : NULL)
|
|
|
|
/* Span-seqlock stubs: without hotplug a zone's span never changes. */
static inline unsigned zone_span_seqbegin(struct zone *zone)
{
	return 0;
}

static inline int zone_span_seqretry(struct zone *zone, unsigned iv)
{
	return 0;	/* never retry: the span is immutable */
}

static inline void zone_span_writelock(struct zone *zone) {}
static inline void zone_span_writeunlock(struct zone *zone) {}
static inline void zone_seqlock_init(struct zone *zone) {}
|
|
|
|
/*
 * No-op stub: without CONFIG_MEMORY_HOTPLUG nodes cannot be onlined at
 * runtime; return 0 so callers' error handling treats this as success.
 */
static inline int try_online_node(int nid)
{
	return 0;
}
|
|
|
|
/*
 * Hotplug exclusion stubs: with hotplug compiled out there is no
 * concurrent memory add/remove to protect against, so these are empty.
 */
static inline void get_online_mems(void) {}
static inline void put_online_mems(void) {}

static inline void mem_hotplug_begin(void) {}
static inline void mem_hotplug_done(void) {}
|
|
|
|
/* movable_node is only meaningful when memory hotplug is enabled. */
static inline bool movable_node_is_enabled(void)
{
	return false;
}
|
|
#endif /* ! CONFIG_MEMORY_HOTPLUG */
|
|
|
|
/*
|
|
* Keep this declaration outside CONFIG_MEMORY_HOTPLUG as some
|
|
* platforms might override and use arch_get_mappable_range()
|
|
* for internal non memory hotplug purposes.
|
|
*/
|
|
struct range arch_get_mappable_range(void);
|
|
|
|
#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
|
|
/*
|
|
* pgdat resizing functions
|
|
*/
|
|
/*
 * Acquire the node's size lock, disabling local IRQs and saving the
 * previous IRQ state into *@flags (spin_lock_irqsave semantics).
 */
static inline
void pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags)
{
	spin_lock_irqsave(&pgdat->node_size_lock, *flags);
}

/* Release the node's size lock and restore the IRQ state from *@flags. */
static inline
void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags)
{
	spin_unlock_irqrestore(&pgdat->node_size_lock, *flags);
}

/* Initialize the node's size lock; must run before lock/unlock are used. */
static inline
void pgdat_resize_init(struct pglist_data *pgdat)
{
	spin_lock_init(&pgdat->node_size_lock);
}
|
|
#else /* !(CONFIG_MEMORY_HOTPLUG || CONFIG_DEFERRED_STRUCT_PAGE_INIT) */
|
|
/*
|
|
* Stub functions for when hotplug is off
|
|
*/
|
|
/* No-ops: nothing to serialize when pgdat resizing cannot happen. */
static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f) {}
static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f) {}
static inline void pgdat_resize_init(struct pglist_data *pgdat) {}
|
|
#endif /* !(CONFIG_MEMORY_HOTPLUG || CONFIG_DEFERRED_STRUCT_PAGE_INIT) */
|
|
|
|
#ifdef CONFIG_MEMORY_HOTREMOVE
|
|
|
|
extern void try_offline_node(int nid);
|
|
extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
|
|
struct zone *zone, struct memory_group *group);
|
|
extern int remove_memory(u64 start, u64 size);
|
|
extern void __remove_memory(u64 start, u64 size);
|
|
extern int offline_and_remove_memory(u64 start, u64 size);
|
|
|
|
#else
|
|
/* CONFIG_MEMORY_HOTREMOVE disabled: offlining a node is a no-op. */
static inline void try_offline_node(int nid) {}

/* Offlining pages is unsupported without hotremove: always -EINVAL. */
static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
		struct zone *zone, struct memory_group *group)
{
	return -EINVAL;
}

/* Memory can never be removed without hotremove: always -EBUSY. */
static inline int remove_memory(u64 start, u64 size)
{
	return -EBUSY;
}

static inline void __remove_memory(u64 start, u64 size) {}
|
|
#endif /* CONFIG_MEMORY_HOTREMOVE */
|
|
|
|
extern void set_zone_contiguous(struct zone *zone);
|
|
extern void clear_zone_contiguous(struct zone *zone);
|
|
|
|
#ifdef CONFIG_MEMORY_HOTPLUG
|
|
extern void __ref free_area_init_core_hotplug(struct pglist_data *pgdat);
|
|
extern int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags);
|
|
extern int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags);
|
|
extern int add_memory_resource(int nid, struct resource *resource,
|
|
mhp_t mhp_flags);
|
|
extern int add_memory_driver_managed(int nid, u64 start, u64 size,
|
|
const char *resource_name,
|
|
mhp_t mhp_flags);
|
|
extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
|
|
unsigned long nr_pages,
|
|
struct vmem_altmap *altmap, int migratetype);
|
|
extern void remove_pfn_range_from_zone(struct zone *zone,
|
|
unsigned long start_pfn,
|
|
unsigned long nr_pages);
|
|
extern bool is_memblock_offlined(struct memory_block *mem);
|
|
extern int sparse_add_section(int nid, unsigned long pfn,
|
|
unsigned long nr_pages, struct vmem_altmap *altmap,
|
|
struct dev_pagemap *pgmap);
|
|
extern void sparse_remove_section(struct mem_section *ms,
|
|
unsigned long pfn, unsigned long nr_pages,
|
|
unsigned long map_offset, struct vmem_altmap *altmap);
|
|
extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
|
|
unsigned long pnum);
|
|
extern struct zone *zone_for_pfn_range(int online_type, int nid,
|
|
struct memory_group *group, unsigned long start_pfn,
|
|
unsigned long nr_pages);
|
|
extern int arch_create_linear_mapping(int nid, u64 start, u64 size,
|
|
struct mhp_params *params);
|
|
void arch_remove_linear_mapping(u64 start, u64 size);
|
|
extern bool mhp_supports_memmap_on_memory(unsigned long size);
|
|
#endif /* CONFIG_MEMORY_HOTPLUG */
|
|
|
|
#endif /* __LINUX_MEMORY_HOTPLUG_H */
|