// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm_init.c - Memory initialisation verification and debugging
 *
 * Copyright 2008 IBM Corporation, 2008
 * Author Mel Gorman <mel@csn.ul.ie>
 *
 */
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/kobject.h>
#include <linux/export.h>
#include <linux/memory.h>
#include <linux/notifier.h>
#include <linux/sched.h>
#include <linux/mman.h>
#include <linux/memblock.h>
#include <linux/page-isolation.h>
#include <linux/padata.h>
#include <linux/nmi.h>
#include <linux/buffer_head.h>
#include <linux/kmemleak.h>
#include <linux/kfence.h>
#include <linux/page_ext.h>
#include <linux/pti.h>
#include <linux/pgtable.h>
#include <linux/stackdepot.h>
#include <linux/swap.h>
#include <linux/cma.h>
#include <linux/crash_dump.h>
#include <linux/execmem.h>
#include <linux/vmstat.h>
#include "internal.h"
#include "slab.h"
#include "shuffle.h"

#include <asm/setup.h>

#ifdef CONFIG_DEBUG_MEMORY_INIT
int __meminitdata mminit_loglevel;

/* The zonelists are simply reported, validation is manual. */
void __init mminit_verify_zonelist(void)
{
	int nid;

	if (mminit_loglevel < MMINIT_VERIFY)
		return;

	for_each_online_node(nid) {
		pg_data_t *pgdat = NODE_DATA(nid);
		struct zone *zone;
		struct zoneref *z;
		struct zonelist *zonelist;
		int i, listid, zoneid;

		for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {

			/* Identify the zone and nodelist */
			zoneid = i % MAX_NR_ZONES;
			listid = i / MAX_NR_ZONES;
			zonelist = &pgdat->node_zonelists[listid];
			zone = &pgdat->node_zones[zoneid];
			if (!populated_zone(zone))
				continue;

			/* Print information about the zonelist */
			printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ",
				listid > 0 ? "thisnode" : "general", nid,
				zone->name);

			/* Iterate the zonelist */
			for_each_zone_zonelist(zone, z, zonelist, zoneid)
				pr_cont("%d:%s ", zone_to_nid(zone), zone->name);
			pr_cont("\n");
		}
	}
}

void __init mminit_verify_pageflags_layout(void)
{
	int shift, width;
	unsigned long or_mask, add_mask;

	shift = BITS_PER_LONG;
	width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
		- LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
|
2008-07-24 12:26:51 +08:00
|
|
|
SECTIONS_WIDTH,
|
|
|
|
NODES_WIDTH,
|
|
|
|
ZONES_WIDTH,
|
2013-10-07 18:29:20 +08:00
|
|
|
LAST_CPUPID_WIDTH,
|
2020-06-02 12:52:49 +08:00
|
|
|
KASAN_TAG_WIDTH,
		LRU_GEN_WIDTH,
		LRU_REFS_WIDTH,
		NR_PAGEFLAGS);
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
		"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
		SECTIONS_SHIFT,
		NODES_SHIFT,
		ZONES_SHIFT,
		LAST_CPUPID_SHIFT,
		KASAN_TAG_WIDTH);
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
		"Section %lu Node %lu Zone %lu Lastcpupid %lu Kasantag %lu\n",
		(unsigned long)SECTIONS_PGSHIFT,
		(unsigned long)NODES_PGSHIFT,
		(unsigned long)ZONES_PGSHIFT,
		(unsigned long)LAST_CPUPID_PGSHIFT,
		(unsigned long)KASAN_TAG_PGSHIFT);
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
		"Node/Zone ID: %lu -> %lu\n",
		(unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
		(unsigned long)ZONEID_PGOFF);
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
		"location: %d -> %d layout %d -> %d unused %d -> %d page-flags\n",
		shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);
#ifdef NODE_NOT_IN_PAGE_FLAGS
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
		"Node not in page flags");
#endif
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
		"Last cpupid not in page flags");
#endif

	if (SECTIONS_WIDTH) {
		shift -= SECTIONS_WIDTH;
		BUG_ON(shift != SECTIONS_PGSHIFT);
	}
	if (NODES_WIDTH) {
		shift -= NODES_WIDTH;
		BUG_ON(shift != NODES_PGSHIFT);
	}
	if (ZONES_WIDTH) {
		shift -= ZONES_WIDTH;
		BUG_ON(shift != ZONES_PGSHIFT);
	}

	/* Check for bitmask overlaps */
	or_mask = (ZONES_MASK << ZONES_PGSHIFT) |
			(NODES_MASK << NODES_PGSHIFT) |
			(SECTIONS_MASK << SECTIONS_PGSHIFT);
	add_mask = (ZONES_MASK << ZONES_PGSHIFT) +
			(NODES_MASK << NODES_PGSHIFT) +
			(SECTIONS_MASK << SECTIONS_PGSHIFT);
	BUG_ON(or_mask != add_mask);
}
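
/*
 * Illustrative note (not from the original source): the fields verified
 * above are packed into page->flags from the most significant bit
 * downward, in the order SECTION, NODE, ZONE, LAST_CPUPID, KASAN_TAG,
 * LRU_GEN and LRU_REFS, while the individual page flag bits grow up from
 * bit 0. The exact widths depend on the configuration; a width of 0 means
 * the corresponding information lives outside page->flags (see
 * NODE_NOT_IN_PAGE_FLAGS / LAST_CPUPID_NOT_IN_PAGE_FLAGS above).
 */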

static __init int set_mminit_loglevel(char *str)
{
	get_option(&str, &mminit_loglevel);
	return 0;
}
early_param("mminit_loglevel", set_mminit_loglevel);
#endif /* CONFIG_DEBUG_MEMORY_INIT */

struct kobject *mm_kobj;

#ifdef CONFIG_SMP
s32 vm_committed_as_batch = 32;

void mm_compute_batch(int overcommit_policy)
{
	u64 memsized_batch;
	s32 nr = num_present_cpus();
	s32 batch = max_t(s32, nr*2, 32);
	unsigned long ram_pages = totalram_pages();

	/*
	 * For policy OVERCOMMIT_NEVER, set batch size to 0.4% of
	 * (total memory/#cpus), and lift it to 25% for other policies
	 * to ease the possible lock contention for percpu_counter
	 * vm_committed_as, while the max limit is INT_MAX
	 */
	if (overcommit_policy == OVERCOMMIT_NEVER)
		memsized_batch = min_t(u64, ram_pages/nr/256, INT_MAX);
	else
		memsized_batch = min_t(u64, ram_pages/nr/4, INT_MAX);

	vm_committed_as_batch = max_t(s32, memsized_batch, batch);
}
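
/*
 * Worked example (illustrative, assuming 4 KiB pages; not from the
 * original source): on a machine with 64 GiB of RAM (16,777,216 pages)
 * and 16 present CPUs, OVERCOMMIT_NEVER gives memsized_batch =
 * 16777216 / 16 / 256 = 4096 pages (~0.4% of the per-CPU share), while
 * the other policies give 16777216 / 16 / 4 = 262144 pages (25%). Both
 * exceed the floor of max(nr * 2, 32) = 32, so vm_committed_as_batch
 * becomes 4096 or 262144 respectively.
 */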

static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
					unsigned long action, void *arg)
{
	switch (action) {
	case MEM_ONLINE:
	case MEM_OFFLINE:
		mm_compute_batch(sysctl_overcommit_memory);
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static int __init mm_compute_batch_init(void)
{
	mm_compute_batch(sysctl_overcommit_memory);
	hotplug_memory_notifier(mm_compute_batch_notifier, MM_COMPUTE_BATCH_PRI);
	return 0;
}

__initcall(mm_compute_batch_init);

#endif

static int __init mm_sysfs_init(void)
{
	mm_kobj = kobject_create_and_add("mm", kernel_kobj);
	if (!mm_kobj)
		return -ENOMEM;

	return 0;
}
postcore_initcall(mm_sysfs_init);
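
/*
 * Note (illustrative, not from the original source): kernel_kobj is the
 * kobject behind /sys/kernel, so the call above creates the /sys/kernel/mm
 * directory that later mm sysfs files (e.g. hugepages, ksm,
 * transparent_hugepage) are attached to.
 */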

static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;

static unsigned long required_kernelcore __initdata;
static unsigned long required_kernelcore_percent __initdata;
static unsigned long required_movablecore __initdata;
static unsigned long required_movablecore_percent __initdata;

static unsigned long nr_kernel_pages __initdata;
static unsigned long nr_all_pages __initdata;

static bool deferred_struct_pages __meminitdata;

static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);

static int __init cmdline_parse_core(char *p, unsigned long *core,
				     unsigned long *percent)
{
	unsigned long long coremem;
	char *endptr;

	if (!p)
		return -EINVAL;

	/* Value may be a percentage of total memory, otherwise bytes */
	coremem = simple_strtoull(p, &endptr, 0);
	if (*endptr == '%') {
		/* Paranoid check for percent values greater than 100 */
		WARN_ON(coremem > 100);

		*percent = coremem;
	} else {
		coremem = memparse(p, &p);
		/* Paranoid check that UL is enough for the coremem value */
		WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);

		*core = coremem >> PAGE_SHIFT;
		*percent = 0UL;
	}
	return 0;
}
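
/*
 * Illustrative examples (not from the original source), assuming 4 KiB
 * pages: "kernelcore=512M" is parsed by memparse() into 536870912 bytes,
 * so *core = 536870912 >> 12 = 131072 pages and *percent = 0, whereas
 * "kernelcore=30%" leaves *core untouched and sets *percent = 30; the
 * percentage is converted into pages later against total memory.
 */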

bool mirrored_kernelcore __initdata_memblock;

/*
 * kernelcore=size sets the amount of memory for use for allocations that
 * cannot be reclaimed or migrated.
 */
static int __init cmdline_parse_kernelcore(char *p)
{
	/* parse kernelcore=mirror */
	if (parse_option_str(p, "mirror")) {
		mirrored_kernelcore = true;
		return 0;
	}

	return cmdline_parse_core(p, &required_kernelcore,
		&required_kernelcore_percent);
}
early_param("kernelcore", cmdline_parse_kernelcore);

/*
 * movablecore=size sets the amount of memory for use for allocations that
 * can be reclaimed or migrated.
 */
static int __init cmdline_parse_movablecore(char *p)
{
	return cmdline_parse_core(p, &required_movablecore,
		&required_movablecore_percent);
}
early_param("movablecore", cmdline_parse_movablecore);

/*
 * early_calculate_totalpages()
 * Sum pages in active regions for movable zone.
 * Populate N_MEMORY for calculating usable_nodes.
 */
static unsigned long __init early_calculate_totalpages(void)
{
	unsigned long totalpages = 0;
	unsigned long start_pfn, end_pfn;
	int i, nid;

	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
		unsigned long pages = end_pfn - start_pfn;

		totalpages += pages;
		if (pages)
			node_set_state(nid, N_MEMORY);
	}
	return totalpages;
}

/*
 * This finds a zone that can be used for ZONE_MOVABLE pages. The
 * assumption is made that zones within a node are ordered in monotonic
 * increasing memory addresses so that the "highest" populated zone is used
 */
static void __init find_usable_zone_for_movable(void)
{
	int zone_index;
	for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
		if (zone_index == ZONE_MOVABLE)
			continue;

		if (arch_zone_highest_possible_pfn[zone_index] >
				arch_zone_lowest_possible_pfn[zone_index])
			break;
	}

	VM_BUG_ON(zone_index == -1);
	movable_zone = zone_index;
}
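
/*
 * Note (illustrative, not from the original source): on most 64-bit
 * systems without highmem the highest populated zone found here is
 * ZONE_NORMAL, so ZONE_MOVABLE ends up being carved out of the upper
 * part of ZONE_NORMAL on each node.
 */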

/*
 * Find the PFN the Movable zone begins in each node. Kernel memory
 * is spread evenly between nodes as long as the nodes have enough
 * memory. When they don't, some nodes will have more kernelcore than
 * others
 */
static void __init find_zone_movable_pfns_for_nodes(void)
{
	int i, nid;
	unsigned long usable_startpfn;
	unsigned long kernelcore_node, kernelcore_remaining;
	/* save the state before borrowing the nodemask */
	nodemask_t saved_node_state = node_states[N_MEMORY];
	unsigned long totalpages = early_calculate_totalpages();
	int usable_nodes = nodes_weight(node_states[N_MEMORY]);
	struct memblock_region *r;

	/* Need to find movable_zone earlier when movable_node is specified. */
	find_usable_zone_for_movable();

	/*
	 * If movable_node is specified, ignore kernelcore and movablecore
	 * options.
	 */
	if (movable_node_is_enabled()) {
		for_each_mem_region(r) {
			if (!memblock_is_hotpluggable(r))
				continue;

			nid = memblock_get_region_node(r);

			usable_startpfn = memblock_region_memory_base_pfn(r);
			zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
				min(usable_startpfn, zone_movable_pfn[nid]) :
				usable_startpfn;
		}

		goto out2;
	}

	/*
	 * If kernelcore=mirror is specified, ignore movablecore option
	 */
	if (mirrored_kernelcore) {
		bool mem_below_4gb_not_mirrored = false;

		if (!memblock_has_mirror()) {
			pr_warn("The system has no mirror memory, ignore kernelcore=mirror.\n");
			goto out;
		}

		if (is_kdump_kernel()) {
			pr_warn("The system is under kdump, ignore kernelcore=mirror.\n");
			goto out;
		}

		for_each_mem_region(r) {
			if (memblock_is_mirror(r))
				continue;

			nid = memblock_get_region_node(r);

			usable_startpfn = memblock_region_memory_base_pfn(r);

			if (usable_startpfn < PHYS_PFN(SZ_4G)) {
				mem_below_4gb_not_mirrored = true;
				continue;
			}

			zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
				min(usable_startpfn, zone_movable_pfn[nid]) :
				usable_startpfn;
		}

		if (mem_below_4gb_not_mirrored)
			pr_warn("This configuration results in unmirrored kernel memory.\n");

		goto out2;
	}

	/*
	 * If kernelcore=nn% or movablecore=nn% was specified, calculate the
	 * amount of necessary memory.
	 */
	if (required_kernelcore_percent)
		required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /
				       10000UL;
	if (required_movablecore_percent)
		required_movablecore = (totalpages * 100 * required_movablecore_percent) /
					10000UL;
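
	/*
	 * Illustrative arithmetic (not from the original source): the
	 * expression above is totalpages * percent / 100 written with an
	 * extra factor of 100 on both sides, e.g. kernelcore=25% on a
	 * machine with 4194304 total pages yields required_kernelcore =
	 * (4194304 * 100 * 25) / 10000 = 1048576 pages.
	 */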

	/*
	 * If movablecore= was specified, calculate what size of
	 * kernelcore that corresponds so that memory usable for
	 * any allocation type is evenly spread. If both kernelcore
	 * and movablecore are specified, then the value of kernelcore
	 * will be used for required_kernelcore if it's greater than
	 * what movablecore would have allowed.
	 */
	if (required_movablecore) {
		unsigned long corepages;

		/*
		 * Round-up so that ZONE_MOVABLE is at least as large as what
		 * was requested by the user
		 */
		required_movablecore =
			roundup(required_movablecore, MAX_ORDER_NR_PAGES);
		required_movablecore = min(totalpages, required_movablecore);
		corepages = totalpages - required_movablecore;

		required_kernelcore = max(required_kernelcore, corepages);
	}

	/*
	 * If kernelcore was not specified or kernelcore size is larger
	 * than totalpages, there is no ZONE_MOVABLE.
	 */
	if (!required_kernelcore || required_kernelcore >= totalpages)
		goto out;

	/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
	usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];

restart:
	/* Spread kernelcore memory as evenly as possible throughout nodes */
	kernelcore_node = required_kernelcore / usable_nodes;
	for_each_node_state(nid, N_MEMORY) {
		unsigned long start_pfn, end_pfn;

		/*
		 * Recalculate kernelcore_node if the division per node
		 * now exceeds what is necessary to satisfy the requested
		 * amount of memory for the kernel
		 */
		if (required_kernelcore < kernelcore_node)
			kernelcore_node = required_kernelcore / usable_nodes;

		/*
		 * As the map is walked, we track how much memory is usable
		 * by the kernel using kernelcore_remaining. When it is
		 * 0, the rest of the node is usable by ZONE_MOVABLE
		 */
		kernelcore_remaining = kernelcore_node;

		/* Go through each range of PFNs within this node */
		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
			unsigned long size_pages;

			start_pfn = max(start_pfn, zone_movable_pfn[nid]);
			if (start_pfn >= end_pfn)
				continue;

			/* Account for what is only usable for kernelcore */
			if (start_pfn < usable_startpfn) {
				unsigned long kernel_pages;
				kernel_pages = min(end_pfn, usable_startpfn)
							- start_pfn;

				kernelcore_remaining -= min(kernel_pages,
							kernelcore_remaining);
				required_kernelcore -= min(kernel_pages,
							required_kernelcore);

				/* Continue if range is now fully accounted */
				if (end_pfn <= usable_startpfn) {

					/*
					 * Push zone_movable_pfn to the end so
					 * that if we have to rebalance
					 * kernelcore across nodes, we will
					 * not double account here
					 */
					zone_movable_pfn[nid] = end_pfn;
					continue;
				}
				start_pfn = usable_startpfn;
			}

			/*
			 * The usable PFN range for ZONE_MOVABLE is from
			 * start_pfn->end_pfn. Calculate size_pages as the
			 * number of pages used as kernelcore
			 */
			size_pages = end_pfn - start_pfn;
			if (size_pages > kernelcore_remaining)
				size_pages = kernelcore_remaining;
			zone_movable_pfn[nid] = start_pfn + size_pages;

			/*
			 * Some kernelcore has been met, update counts and
			 * break if the kernelcore for this node has been
			 * satisfied
			 */
			required_kernelcore -= min(required_kernelcore,
						size_pages);
			kernelcore_remaining -= size_pages;
			if (!kernelcore_remaining)
				break;
		}
	}

	/*
	 * If there is still required_kernelcore, we do another pass with one
	 * less node in the count. This will push zone_movable_pfn[nid] further
	 * along on the nodes that still have memory until kernelcore is
	 * satisfied
	 */
	usable_nodes--;
	if (usable_nodes && required_kernelcore > usable_nodes)
		goto restart;

out2:
	/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
	for (nid = 0; nid < MAX_NUMNODES; nid++) {
		unsigned long start_pfn, end_pfn;

		zone_movable_pfn[nid] =
			roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);

		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
		if (zone_movable_pfn[nid] >= end_pfn)
			zone_movable_pfn[nid] = 0;
	}

out:
	/* restore the node_state */
	node_states[N_MEMORY] = saved_node_state;
}
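
/*
 * Worked example (illustrative, not from the original source): with
 * kernelcore=2G (524288 pages at 4 KiB) spread over two nodes that both
 * have memory, each node first gets kernelcore_node = 262144 pages kept
 * for kernel allocations; whatever lies above that point in each node
 * becomes ZONE_MOVABLE, with its start rounded up to MAX_ORDER_NR_PAGES
 * by the out2: loop above.
 */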

void __meminit __init_single_page(struct page *page, unsigned long pfn,
				unsigned long zone, int nid)
{
	mm_zero_struct_page(page);
	set_page_links(page, zone, nid, pfn);
	init_page_count(page);
	atomic_set(&page->_mapcount, -1);
	page_cpupid_reset_last(page);
	page_kasan_tag_reset(page);

	INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
	/* The shift won't overflow because ZONE_NORMAL is below 4G. */
	if (!is_highmem_idx(zone))
		set_page_address(page, __va(pfn << PAGE_SHIFT));
#endif
}
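
/*
 * Note (illustrative, not from the original source): after this
 * initialisation the struct page has a reference count of 1, a _mapcount
 * of -1 (meaning "no mappings"), freshly reset cpupid and KASAN tag
 * fields, and an empty LRU list head; callers then adjust flags such as
 * PG_reserved as appropriate.
 */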

#ifdef CONFIG_NUMA
/*
 * During memory init memblocks map pfns to nids. The search is expensive and
 * this caches recent lookups. The implementation of __early_pfn_to_nid
 * treats start/end as pfns.
 */
struct mminit_pfnnid_cache {
	unsigned long last_start;
	unsigned long last_end;
	int last_nid;
};

static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;

/*
 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
 */
static int __meminit __early_pfn_to_nid(unsigned long pfn,
					struct mminit_pfnnid_cache *state)
{
	unsigned long start_pfn, end_pfn;
	int nid;

	if (state->last_start <= pfn && pfn < state->last_end)
		return state->last_nid;

	nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
	if (nid != NUMA_NO_NODE) {
		state->last_start = start_pfn;
		state->last_end = end_pfn;
		state->last_nid = nid;
	}

	return nid;
}

int __meminit early_pfn_to_nid(unsigned long pfn)
{
	static DEFINE_SPINLOCK(early_pfn_lock);
	int nid;

	spin_lock(&early_pfn_lock);
	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
	if (nid < 0)
		nid = first_online_node;
	spin_unlock(&early_pfn_lock);

	return nid;
}

int hashdist = HASHDIST_DEFAULT;

static int __init set_hashdist(char *str)
{
	if (!str)
		return 0;
	hashdist = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("hashdist=", set_hashdist);

static inline void fixup_hashdist(void)
{
	if (num_node_state(N_MEMORY) == 1)
		hashdist = 0;
}
#else
static inline void fixup_hashdist(void) {}
#endif /* CONFIG_NUMA */

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
{
	pgdat->first_deferred_pfn = ULONG_MAX;
}

/* Returns true if the struct page for the pfn is initialised */
static inline bool __meminit early_page_initialised(unsigned long pfn, int nid)
{
	if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
		return false;

	return true;
}

/*
 * Returns true when the remaining initialisation should be deferred until
 * later in the boot cycle when it can be parallelised.
 */
static bool __meminit
defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
	static unsigned long prev_end_pfn, nr_initialised;

	if (early_page_ext_enabled())
		return false;

	/* Always populate low zones for address-constrained allocations */
	if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
		return false;

	if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
		return true;

	/*
	 * prev_end_pfn is a static that contains the end of the previous zone.
	 * No need to protect because called very early in boot before smp_init.
	 */
	if (prev_end_pfn != end_pfn) {
		prev_end_pfn = end_pfn;
		nr_initialised = 0;
	}

	/*
	 * We start only with one section of pages, more pages are added as
	 * needed until the rest of deferred pages are initialized.
	 */
	nr_initialised++;
	if ((nr_initialised > PAGES_PER_SECTION) &&
	    (pfn & (PAGES_PER_SECTION - 1)) == 0) {
		NODE_DATA(nid)->first_deferred_pfn = pfn;
		return true;
	}
	return false;
}
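
/*
 * Illustrative example (not from the original source): on x86_64 with
 * 4 KiB pages a sparsemem section is 128 MiB (PAGES_PER_SECTION = 32768),
 * so for the highest zone of each node only roughly the first section is
 * initialised eagerly at boot; the remaining struct pages are filled in
 * later by the deferred-init kthreads, in parallel.
 */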

static void __meminit init_reserved_page(unsigned long pfn, int nid)
{
	pg_data_t *pgdat;
	int zid;

	if (early_page_initialised(pfn, nid))
		return;

	pgdat = NODE_DATA(nid);

	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
		struct zone *zone = &pgdat->node_zones[zid];

		if (zone_spans_pfn(zone, pfn))
			break;
	}
	__init_single_page(pfn_to_page(pfn), pfn, zid, nid);
}
#else
static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}

static inline bool early_page_initialised(unsigned long pfn, int nid)
{
	return true;
}

static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
	return false;
}

static inline void init_reserved_page(unsigned long pfn, int nid)
{
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

/*
 * Initialised pages do not have PageReserved set. This function is
 * called for each range allocated by the bootmem allocator and
 * marks the pages PageReserved. The remaining valid pages are later
 * sent to the buddy page allocator.
 */
void __meminit reserve_bootmem_region(phys_addr_t start,
				      phys_addr_t end, int nid)
{
	unsigned long start_pfn = PFN_DOWN(start);
	unsigned long end_pfn = PFN_UP(end);

	for (; start_pfn < end_pfn; start_pfn++) {
		if (pfn_valid(start_pfn)) {
			struct page *page = pfn_to_page(start_pfn);

			init_reserved_page(start_pfn, nid);

			/*
			 * no need for atomic set_bit because the struct
			 * page is not visible yet so nobody should
			 * access it yet.
			 */
			__SetPageReserved(page);
		}
	}
}

/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
static bool __meminit
overlap_memmap_init(unsigned long zone, unsigned long *pfn)
{
	static struct memblock_region *r;

	if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
		if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
			for_each_mem_region(r) {
				if (*pfn < memblock_region_memory_end_pfn(r))
					break;
			}
		}
		if (*pfn >= memblock_region_memory_base_pfn(r) &&
		    memblock_is_mirror(r)) {
			*pfn = memblock_region_memory_end_pfn(r);
			return true;
		}
	}
	return false;
}

/*
 * Only struct pages that correspond to ranges defined by memblock.memory
 * are zeroed and initialized by going through __init_single_page() during
 * memmap_init_zone_range().
 *
 * But, there could be struct pages that correspond to holes in
 * memblock.memory. This can happen because of the following reasons:
 *  - physical memory bank size is not necessarily the exact multiple of the
 *    arbitrary section size
 *  - early reserved memory may not be listed in memblock.memory
 *  - non-memory regions covered by the contiguous flatmem mapping
 *  - memory layouts defined with memmap= kernel parameter may not align
 *    nicely with memmap sections
 *
 * Explicitly initialize those struct pages so that:
 *  - PG_Reserved is set
 *  - zone and node links point to zone and node that span the page if the
 *    hole is in the middle of a zone
 *  - zone and node links point to adjacent zone/node if the hole falls on
 *    the zone boundary; the pages in such holes will be prepended to the
 *    zone/node above the hole except for the trailing pages in the last
 *    section that will be appended to the zone/node below.
 */
static void __init init_unavailable_range(unsigned long spfn,
					  unsigned long epfn,
					  int zone, int node)
{
	unsigned long pfn;
	u64 pgcnt = 0;

	for (pfn = spfn; pfn < epfn; pfn++) {
		if (!pfn_valid(pageblock_start_pfn(pfn))) {
			pfn = pageblock_end_pfn(pfn) - 1;
			continue;
		}
		__init_single_page(pfn_to_page(pfn), pfn, zone, node);
		__SetPageReserved(pfn_to_page(pfn));
		pgcnt++;
	}

	if (pgcnt)
		pr_info("On node %d, zone %s: %lld pages in unavailable ranges\n",
			node, zone_names[zone], pgcnt);
}

/*
 * Initially all pages are reserved - free ones are freed
 * up by memblock_free_all() once the early boot process is
 * done. Non-atomic initialization, single-pass.
 *
 * All aligned pageblocks are initialized to the specified migratetype
 * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
 * zone stats (e.g., nr_isolate_pageblock) are touched.
 */
void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone,
		unsigned long start_pfn, unsigned long zone_end_pfn,
		enum meminit_context context,
		struct vmem_altmap *altmap, int migratetype)
{
	unsigned long pfn, end_pfn = start_pfn + size;
	struct page *page;

	if (highest_memmap_pfn < end_pfn - 1)
		highest_memmap_pfn = end_pfn - 1;

#ifdef CONFIG_ZONE_DEVICE
	/*
	 * Honor reservation requested by the driver for this ZONE_DEVICE
	 * memory. We limit the total number of pages to initialize to just
	 * those that might contain the memory mapping. We will defer the
	 * ZONE_DEVICE page initialization until after we have released
	 * the hotplug lock.
	 */
	if (zone == ZONE_DEVICE) {
		if (!altmap)
			return;

		if (start_pfn == altmap->base_pfn)
			start_pfn += altmap->reserve;
		end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
	}
#endif

	for (pfn = start_pfn; pfn < end_pfn; ) {
		/*
		 * There can be holes in boot-time mem_map[]s handed to this
		 * function.  They do not exist on hotplugged memory.
		 */
		if (context == MEMINIT_EARLY) {
			if (overlap_memmap_init(zone, &pfn))
				continue;
			if (defer_init(nid, pfn, zone_end_pfn)) {
				deferred_struct_pages = true;
				break;
			}
		}

		page = pfn_to_page(pfn);
		__init_single_page(page, pfn, zone, nid);
		if (context == MEMINIT_HOTPLUG) {
#ifdef CONFIG_ZONE_DEVICE
			if (zone == ZONE_DEVICE)
				__SetPageReserved(page);
			else
#endif
				__SetPageOffline(page);
		}

		/*
		 * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
		 * such that unmovable allocations won't be scattered all
		 * over the place during system boot.
		 */
		if (pageblock_aligned(pfn)) {
			set_pageblock_migratetype(page, migratetype);
			cond_resched();
		}
		pfn++;
	}
}

static void __init memmap_init_zone_range(struct zone *zone,
					  unsigned long start_pfn,
					  unsigned long end_pfn,
					  unsigned long *hole_pfn)
{
	unsigned long zone_start_pfn = zone->zone_start_pfn;
	unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
	int nid = zone_to_nid(zone), zone_id = zone_idx(zone);

	start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
	end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);

	if (start_pfn >= end_pfn)
		return;

	memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn,
			  zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);

	if (*hole_pfn < start_pfn)
		init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);

	*hole_pfn = end_pfn;
}

static void __init memmap_init(void)
{
	unsigned long start_pfn, end_pfn;
	unsigned long hole_pfn = 0;
	int i, j, zone_id = 0, nid;

	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
		struct pglist_data *node = NODE_DATA(nid);

		for (j = 0; j < MAX_NR_ZONES; j++) {
			struct zone *zone = node->node_zones + j;

			if (!populated_zone(zone))
				continue;

			memmap_init_zone_range(zone, start_pfn, end_pfn,
					       &hole_pfn);
			zone_id = j;
		}
	}

#ifdef CONFIG_SPARSEMEM
	/*
	 * Initialize the memory map for hole in the range [memory_end,
	 * section_end].
	 * Append the pages in this hole to the highest zone in the last
	 * node.
	 * The call to init_unavailable_range() is outside the ifdef to
	 * silence the compiler warning about zone_id set but not used;
	 * for FLATMEM it is a nop anyway
	 */
	end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
	if (hole_pfn < end_pfn)
#endif
		init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
}

#ifdef CONFIG_ZONE_DEVICE
static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
					  unsigned long zone_idx, int nid,
					  struct dev_pagemap *pgmap)
{

	__init_single_page(page, pfn, zone_idx, nid);

	/*
	 * Mark page reserved as it will need to wait for onlining
	 * phase for it to be fully associated with a zone.
	 *
	 * We can use the non-atomic __set_bit operation for setting
	 * the flag as we are still initializing the pages.
	 */
	__SetPageReserved(page);

	/*
	 * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
	 * and zone_device_data.  It is a bug if a ZONE_DEVICE page is
	 * ever freed or placed on a driver-private list.
	 */
	page->pgmap = pgmap;
	page->zone_device_data = NULL;

	/*
	 * Mark the block movable so that blocks are reserved for
	 * movable at startup. This will force kernel allocations
	 * to reserve their blocks rather than leaking throughout
	 * the address space during boot when many long-lived
	 * kernel allocations are made.
	 *
	 * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
	 * because this is done early in section_activate()
	 */
	if (pageblock_aligned(pfn)) {
		set_pageblock_migratetype(page, MIGRATE_MOVABLE);
		cond_resched();
	}

	/*
	 * ZONE_DEVICE pages are released directly to the driver page allocator
	 * which will set the page count to 1 when allocating the page.
	 */
	if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
	    pgmap->type == MEMORY_DEVICE_COHERENT)
		set_page_count(page, 0);
}

/*
 * With compound page geometry and when struct pages are stored in ram most
 * tail pages are reused. Consequently, the amount of unique struct pages to
 * initialize is a lot smaller than the total amount of struct pages being
 * mapped. This is a paired / mild layering violation with explicit knowledge
 * of how the sparse_vmemmap internals handle compound pages in the absence
 * of an altmap. See vmemmap_populate_compound_pages().
 */
static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
|
					      struct dev_pagemap *pgmap)
{
	if (!vmemmap_can_optimize(altmap, pgmap))
		return pgmap_vmemmap_nr(pgmap);

	return VMEMMAP_RESERVE_NR * (PAGE_SIZE / sizeof(struct page));
}

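/*
 * Worked example (assuming 4 KiB base pages, a 64-byte struct page and
 * VMEMMAP_RESERVE_NR == 2; treat these values as an illustration rather than
 * a guarantee): PAGE_SIZE / sizeof(struct page) == 64, so an optimized
 * compound devmap only needs 2 * 64 == 128 distinct struct pages per
 * compound page, while the unoptimized case walks all
 * pgmap_vmemmap_nr(pgmap) tail pages (512 for a 2 MiB, order-9 compound).
 */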
static void __ref memmap_init_compound(struct page *head,
				       unsigned long head_pfn,
				       unsigned long zone_idx, int nid,
				       struct dev_pagemap *pgmap,
				       unsigned long nr_pages)
{
	unsigned long pfn, end_pfn = head_pfn + nr_pages;
	unsigned int order = pgmap->vmemmap_shift;

	__SetPageHead(head);
	for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) {
		struct page *page = pfn_to_page(pfn);

		__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
		prep_compound_tail(head, pfn - head_pfn);
		set_page_count(page, 0);

		/*
		 * The first tail page stores important compound page info.
		 * Call prep_compound_head() after the first tail page has
		 * been initialized, to not have the data overwritten.
		 */
		if (pfn == head_pfn + 1)
			prep_compound_head(head, order);
	}
}

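/*
 * For instance, with a pgmap whose vmemmap_shift is 9 (2 MiB compound pages
 * on a 4 KiB base page size, given here purely as an illustration), each
 * call initialises pfns head_pfn + 1 .. head_pfn + 511 as tail pages and
 * promotes the head once the first tail has been set up.
 */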
void __ref memmap_init_zone_device(struct zone *zone,
				   unsigned long start_pfn,
				   unsigned long nr_pages,
				   struct dev_pagemap *pgmap)
{
	unsigned long pfn, end_pfn = start_pfn + nr_pages;
	struct pglist_data *pgdat = zone->zone_pgdat;
	struct vmem_altmap *altmap = pgmap_altmap(pgmap);
	unsigned int pfns_per_compound = pgmap_vmemmap_nr(pgmap);
	unsigned long zone_idx = zone_idx(zone);
	unsigned long start = jiffies;
	int nid = pgdat->node_id;

	if (WARN_ON_ONCE(!pgmap || zone_idx != ZONE_DEVICE))
		return;

	/*
	 * The call to memmap_init should have already taken care
	 * of the pages reserved for the memmap, so we can just jump to
	 * the end of that region and start processing the device pages.
	 */
	if (altmap) {
		start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
		nr_pages = end_pfn - start_pfn;
	}

	for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) {
		struct page *page = pfn_to_page(pfn);

		__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);

		if (pfns_per_compound == 1)
			continue;

		memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
				     compound_nr_pages(altmap, pgmap));
	}

	pr_debug("%s initialised %lu pages in %ums\n", __func__,
		 nr_pages, jiffies_to_msecs(jiffies - start));
}
#endif

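/*
 * A sketch of the flow above, assuming a device-dax region whose altmap
 * reserves the first pages of the range for the memmap itself: start_pfn is
 * bumped past base_pfn + vmem_altmap_offset(), and the loop then steps
 * through the remainder in strides of pfns_per_compound, initialising one
 * head page per stride and delegating the tails to memmap_init_compound().
 */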
/*
 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
 * because its size is independent of the architecture. Unlike the other
 * zones, the starting point for ZONE_MOVABLE is not fixed. It may be
 * different in each node depending on the size of each node and how evenly
 * kernelcore is distributed. This helper function adjusts the zone ranges
 * provided by the architecture for a given node by using the end of the
 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
 * zones within a node are in order of monotonically increasing memory
 * addresses.
 */
static void __init adjust_zone_range_for_zone_movable(int nid,
					unsigned long zone_type,
					unsigned long node_end_pfn,
					unsigned long *zone_start_pfn,
					unsigned long *zone_end_pfn)
{
	/* Only adjust if ZONE_MOVABLE is on this node */
	if (zone_movable_pfn[nid]) {
		/* Size ZONE_MOVABLE */
		if (zone_type == ZONE_MOVABLE) {
			*zone_start_pfn = zone_movable_pfn[nid];
			*zone_end_pfn = min(node_end_pfn,
				arch_zone_highest_possible_pfn[movable_zone]);

		/* Adjust for ZONE_MOVABLE starting within this range */
		} else if (!mirrored_kernelcore &&
			*zone_start_pfn < zone_movable_pfn[nid] &&
			*zone_end_pfn > zone_movable_pfn[nid]) {
			*zone_end_pfn = zone_movable_pfn[nid];

		/* Check if this whole range is within ZONE_MOVABLE */
		} else if (*zone_start_pfn >= zone_movable_pfn[nid])
			*zone_start_pfn = *zone_end_pfn;
	}
}

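/*
 * Worked example (numbers are purely illustrative): on a node spanning pfns
 * 0x100000-0x200000 with zone_movable_pfn[nid] == 0x180000, ZONE_NORMAL is
 * clipped to end at 0x180000, ZONE_MOVABLE is sized 0x180000-0x200000, and a
 * zone that starts at or above 0x180000 collapses to an empty range.
 */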
/*
 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
 * then all holes in the requested range will be accounted for.
 */
static unsigned long __init __absent_pages_in_range(int nid,
				unsigned long range_start_pfn,
				unsigned long range_end_pfn)
{
	unsigned long nr_absent = range_end_pfn - range_start_pfn;
	unsigned long start_pfn, end_pfn;
	int i;

	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
		start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
		end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
		nr_absent -= end_pfn - start_pfn;
	}
	return nr_absent;
}

/**
 * absent_pages_in_range - Return number of page frames in holes within a range
 * @start_pfn: The start PFN to start searching for holes
 * @end_pfn: The end PFN to stop searching for holes
 *
 * Return: the number of page frames in memory holes within a range.
 */
unsigned long __init absent_pages_in_range(unsigned long start_pfn,
					   unsigned long end_pfn)
{
	return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
}

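/*
 * Illustrative numbers: if the requested range is pfns 0-1000 and memblock
 * reports memory at 0-400 and 600-1000, the helper starts from 1000 absent
 * pages, subtracts the two clamped present pieces (400 and 400 pages) and
 * reports the 200-page hole between them.
 */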
/* Return the number of page frames in holes in a zone on a node */
static unsigned long __init zone_absent_pages_in_node(int nid,
					unsigned long zone_type,
					unsigned long zone_start_pfn,
					unsigned long zone_end_pfn)
{
	unsigned long nr_absent;

	/* zone is empty, we don't have any absent pages */
	if (zone_start_pfn == zone_end_pfn)
		return 0;

	nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);

	/*
	 * ZONE_MOVABLE handling.
	 * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
	 * and vice versa.
	 */
	if (mirrored_kernelcore && zone_movable_pfn[nid]) {
		unsigned long start_pfn, end_pfn;
		struct memblock_region *r;

		for_each_mem_region(r) {
			start_pfn = clamp(memblock_region_memory_base_pfn(r),
					  zone_start_pfn, zone_end_pfn);
			end_pfn = clamp(memblock_region_memory_end_pfn(r),
					zone_start_pfn, zone_end_pfn);

			if (zone_type == ZONE_MOVABLE &&
			    memblock_is_mirror(r))
				nr_absent += end_pfn - start_pfn;

			if (zone_type == ZONE_NORMAL &&
			    !memblock_is_mirror(r))
				nr_absent += end_pfn - start_pfn;
		}
	}

	return nr_absent;
}

/*
 * Return the number of pages a zone spans in a node, including holes
 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
 */
static unsigned long __init zone_spanned_pages_in_node(int nid,
					unsigned long zone_type,
					unsigned long node_start_pfn,
					unsigned long node_end_pfn,
					unsigned long *zone_start_pfn,
					unsigned long *zone_end_pfn)
{
	unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
	unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];

	/* Get the start and end of the zone */
	*zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
	*zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
	adjust_zone_range_for_zone_movable(nid, zone_type, node_end_pfn,
					   zone_start_pfn, zone_end_pfn);

	/* Check that this node has pages within the zone's required range */
	if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
		return 0;

	/* Move the zone boundaries inside the node if necessary */
	*zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
	*zone_start_pfn = max(*zone_start_pfn, node_start_pfn);

	/* Return the spanned pages */
	return *zone_end_pfn - *zone_start_pfn;
}

static void __init reset_memoryless_node_totalpages(struct pglist_data *pgdat)
{
	struct zone *z;

	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) {
		z->zone_start_pfn = 0;
		z->spanned_pages = 0;
		z->present_pages = 0;
#if defined(CONFIG_MEMORY_HOTPLUG)
		z->present_early_pages = 0;
#endif
	}

	pgdat->node_spanned_pages = 0;
	pgdat->node_present_pages = 0;
	pr_debug("On node %d totalpages: 0\n", pgdat->node_id);
}

static void __init calc_nr_kernel_pages(void)
{
	unsigned long start_pfn, end_pfn;
	phys_addr_t start_addr, end_addr;
	u64 u;
#ifdef CONFIG_HIGHMEM
	unsigned long high_zone_low = arch_zone_lowest_possible_pfn[ZONE_HIGHMEM];
#endif

	for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) {
		start_pfn = PFN_UP(start_addr);
		end_pfn = PFN_DOWN(end_addr);

		if (start_pfn < end_pfn) {
			nr_all_pages += end_pfn - start_pfn;
#ifdef CONFIG_HIGHMEM
			start_pfn = clamp(start_pfn, 0, high_zone_low);
			end_pfn = clamp(end_pfn, 0, high_zone_low);
#endif
			nr_kernel_pages += end_pfn - start_pfn;
		}
	}
}

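/*
 * Rounding note, with invented numbers for illustration only: a free
 * memblock range [0x1000200, 0x1009000) contributes PFN_UP(0x1000200) ==
 * pfn 0x1001 through PFN_DOWN(0x1009000) == pfn 0x1009, i.e. 8 whole pages;
 * partially covered pages at either end are never counted as kernel pages.
 */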
static void __init calculate_node_totalpages(struct pglist_data *pgdat,
					     unsigned long node_start_pfn,
					     unsigned long node_end_pfn)
{
	unsigned long realtotalpages = 0, totalpages = 0;
	enum zone_type i;

	for (i = 0; i < MAX_NR_ZONES; i++) {
		struct zone *zone = pgdat->node_zones + i;
		unsigned long zone_start_pfn, zone_end_pfn;
		unsigned long spanned, absent;
		unsigned long real_size;

		spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
						     node_start_pfn,
						     node_end_pfn,
						     &zone_start_pfn,
						     &zone_end_pfn);
		absent = zone_absent_pages_in_node(pgdat->node_id, i,
						   zone_start_pfn,
						   zone_end_pfn);

		real_size = spanned - absent;

		if (spanned)
			zone->zone_start_pfn = zone_start_pfn;
		else
			zone->zone_start_pfn = 0;
		zone->spanned_pages = spanned;
		zone->present_pages = real_size;
#if defined(CONFIG_MEMORY_HOTPLUG)
		zone->present_early_pages = real_size;
#endif

		totalpages += spanned;
		realtotalpages += real_size;
	}

	pgdat->node_spanned_pages = totalpages;
	pgdat->node_present_pages = realtotalpages;
	pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void pgdat_init_split_queue(struct pglist_data *pgdat)
{
	struct deferred_split *ds_queue = &pgdat->deferred_split_queue;

	spin_lock_init(&ds_queue->split_queue_lock);
	INIT_LIST_HEAD(&ds_queue->split_queue);
	ds_queue->split_queue_len = 0;
}
#else
static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
#endif

#ifdef CONFIG_COMPACTION
static void pgdat_init_kcompactd(struct pglist_data *pgdat)
{
	init_waitqueue_head(&pgdat->kcompactd_wait);
}
#else
static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
#endif

static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
{
	int i;

	pgdat_resize_init(pgdat);
	pgdat_kswapd_lock_init(pgdat);

	pgdat_init_split_queue(pgdat);
	pgdat_init_kcompactd(pgdat);

	init_waitqueue_head(&pgdat->kswapd_wait);
	init_waitqueue_head(&pgdat->pfmemalloc_wait);

	for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
		init_waitqueue_head(&pgdat->reclaim_wait[i]);

	pgdat_page_ext_init(pgdat);
	lruvec_init(&pgdat->__lruvec);
}

static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
					  unsigned long remaining_pages)
{
	atomic_long_set(&zone->managed_pages, remaining_pages);
	zone_set_nid(zone, nid);
	zone->name = zone_names[idx];
	zone->zone_pgdat = NODE_DATA(nid);
	spin_lock_init(&zone->lock);
	zone_seqlock_init(zone);
	zone_pcp_init(zone);
}

static void __meminit zone_init_free_lists(struct zone *zone)
{
	unsigned int order, t;

	for_each_migratetype_order(order, t) {
		INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
		zone->free_area[order].nr_free = 0;
	}

#ifdef CONFIG_UNACCEPTED_MEMORY
	INIT_LIST_HEAD(&zone->unaccepted_pages);
#endif
}

void __meminit init_currently_empty_zone(struct zone *zone,
					 unsigned long zone_start_pfn,
					 unsigned long size)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int zone_idx = zone_idx(zone) + 1;

	if (zone_idx > pgdat->nr_zones)
		pgdat->nr_zones = zone_idx;

	zone->zone_start_pfn = zone_start_pfn;

	mminit_dprintk(MMINIT_TRACE, "memmap_init",
			"Initialising map node %d zone %lu pfns %lu -> %lu\n",
			pgdat->node_id,
			(unsigned long)zone_idx(zone),
			zone_start_pfn, (zone_start_pfn + size));

	zone_init_free_lists(zone);
	zone->initialized = 1;
}

#ifndef CONFIG_SPARSEMEM
/*
 * Calculate the size of the zone->blockflags rounded to an unsigned long
 * Start by making sure zonesize is a multiple of pageblock_order by rounding
 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
 * round what is now in bits to nearest long in bits, then return it in
 * bytes.
 */
static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
{
	unsigned long usemapsize;

	zonesize += zone_start_pfn & (pageblock_nr_pages-1);
	usemapsize = roundup(zonesize, pageblock_nr_pages);
	usemapsize = usemapsize >> pageblock_order;
	usemapsize *= NR_PAGEBLOCK_BITS;
	usemapsize = roundup(usemapsize, BITS_PER_LONG);

	return usemapsize / BITS_PER_BYTE;
}

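/*
 * Worked example (assuming 4 KiB pages, pageblock_order == 9 and
 * NR_PAGEBLOCK_BITS == 4, i.e. common defaults, stated here as assumptions):
 * a 1 GiB zone has 262144 pages == 512 pageblocks, needing 512 * 4 == 2048
 * bits, which is already a multiple of BITS_PER_LONG and works out to
 * 2048 / 8 == 256 bytes of pageblock flags.
 */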
static void __ref setup_usemap(struct zone *zone)
{
	unsigned long usemapsize = usemap_size(zone->zone_start_pfn,
					       zone->spanned_pages);
	zone->pageblock_flags = NULL;
	if (usemapsize) {
		zone->pageblock_flags =
			memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
					    zone_to_nid(zone));
		if (!zone->pageblock_flags)
			panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
			      usemapsize, zone->name, zone_to_nid(zone));
	}
}
#else
static inline void setup_usemap(struct zone *zone) {}
#endif /* CONFIG_SPARSEMEM */

#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE

/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
void __init set_pageblock_order(void)
{
	unsigned int order = MAX_PAGE_ORDER;

	/* Check that pageblock_nr_pages has not already been setup */
	if (pageblock_order)
		return;

	/* Don't let pageblocks exceed the maximum allocation granularity. */
	if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order)
		order = HUGETLB_PAGE_ORDER;

	/*
	 * Assume the largest contiguous order of interest is a huge page.
	 * This value may be variable depending on boot parameters on powerpc.
	 */
	pageblock_order = order;
}
#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */

/*
 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
 * is unused as pageblock_order is set at compile-time. See
 * include/linux/pageblock-flags.h for the values of pageblock_order based on
 * the kernel config
 */
void __init set_pageblock_order(void)
{
}

#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */

/*
 * Set up the zone data structures
 * - init pgdat internals
 * - init all zones belonging to this node
 *
 * NOTE: this function is only called during memory hotplug
 */
#ifdef CONFIG_MEMORY_HOTPLUG
void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
{
	int nid = pgdat->node_id;
	enum zone_type z;
	int cpu;

	pgdat_init_internals(pgdat);

	if (pgdat->per_cpu_nodestats == &boot_nodestats)
		pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);

	/*
	 * Reset the nr_zones, order and highest_zoneidx before reuse.
	 * Note that kswapd will init kswapd_highest_zoneidx properly
	 * when it starts in the near future.
	 */
	pgdat->nr_zones = 0;
	pgdat->kswapd_order = 0;
	pgdat->kswapd_highest_zoneidx = 0;
	pgdat->node_start_pfn = 0;
	pgdat->node_present_pages = 0;

	for_each_online_cpu(cpu) {
		struct per_cpu_nodestat *p;

		p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
		memset(p, 0, sizeof(*p));
	}

	/*
	 * When memory is hot-added, all the memory is in offline state. So
	 * clear all zones' present_pages and managed_pages because they will
	 * be updated in online_pages() and offline_pages().
	 */
	for (z = 0; z < MAX_NR_ZONES; z++) {
		struct zone *zone = pgdat->node_zones + z;

		zone->present_pages = 0;
		zone_init_internals(zone, z, nid, 0);
	}
}
#endif

static void __init free_area_init_core(struct pglist_data *pgdat)
{
	enum zone_type j;
	int nid = pgdat->node_id;

	pgdat_init_internals(pgdat);
	pgdat->per_cpu_nodestats = &boot_nodestats;

	for (j = 0; j < MAX_NR_ZONES; j++) {
		struct zone *zone = pgdat->node_zones + j;
		unsigned long size = zone->spanned_pages;

		/*
		 * Initialize zone->managed_pages as 0, it will be reset
		 * when memblock allocator frees pages into buddy system.
		 */
		zone_init_internals(zone, j, nid, zone->present_pages);

		if (!size)
			continue;

		setup_usemap(zone);
		init_currently_empty_zone(zone, zone->zone_start_pfn, size);
	}
}

void __init *memmap_alloc(phys_addr_t size, phys_addr_t align,
			  phys_addr_t min_addr, int nid, bool exact_nid)
{
	void *ptr;

	if (exact_nid)
		ptr = memblock_alloc_exact_nid_raw(size, align, min_addr,
						   MEMBLOCK_ALLOC_ACCESSIBLE,
						   nid);
	else
		ptr = memblock_alloc_try_nid_raw(size, align, min_addr,
						 MEMBLOCK_ALLOC_ACCESSIBLE,
						 nid);

	if (ptr && size > 0)
		page_init_poison(ptr, size);

	return ptr;
}

#ifdef CONFIG_FLATMEM
static void __init alloc_node_mem_map(struct pglist_data *pgdat)
{
	unsigned long start, offset, size, end;
	struct page *map;

	/* Skip empty nodes */
	if (!pgdat->node_spanned_pages)
		return;

	start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
	offset = pgdat->node_start_pfn - start;
	/*
	 * The zone's endpoints aren't required to be MAX_PAGE_ORDER
	 * aligned but the node_mem_map endpoints must be in order
	 * for the buddy allocator to function correctly.
	 */
	end = ALIGN(pgdat_end_pfn(pgdat), MAX_ORDER_NR_PAGES);
	size = (end - start) * sizeof(struct page);
	map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT,
			   pgdat->node_id, false);
	if (!map)
		panic("Failed to allocate %ld bytes for node %d memory map\n",
		      size, pgdat->node_id);
	pgdat->node_mem_map = map + offset;
	/* The boot memmap is accounted system-wide rather than per node. */
	memmap_boot_pages_add(DIV_ROUND_UP(size, PAGE_SIZE));
	pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
		 __func__, pgdat->node_id, (unsigned long)pgdat,
		 (unsigned long)pgdat->node_mem_map);
#ifndef CONFIG_NUMA
	/* the global mem_map is just set as node 0's */
	if (pgdat == NODE_DATA(0)) {
		mem_map = NODE_DATA(0)->node_mem_map;
		if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
			mem_map -= offset;
	}
#endif
}
#else
static inline void alloc_node_mem_map(struct pglist_data *pgdat) { }
#endif /* CONFIG_FLATMEM */

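/*
 * Example of the alignment above, with made-up numbers: if a node starts at
 * pfn 0x10100 and MAX_ORDER_NR_PAGES is 0x400, the map is allocated from
 * pfn 0x10000, offset is 0x100 struct pages, and node_mem_map points at the
 * entry for the node's real first pfn while the allocation itself stays
 * MAX_PAGE_ORDER aligned for the buddy allocator.
 */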
/**
 * get_pfn_range_for_nid - Return the start and end page frames for a node
 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
 *
 * It returns the start and end page frame of a node based on information
 * provided by memblock_set_node(). If called for a node
 * with no available memory, the start and end PFNs will be 0.
 */
void __init get_pfn_range_for_nid(unsigned int nid,
			unsigned long *start_pfn, unsigned long *end_pfn)
{
	unsigned long this_start_pfn, this_end_pfn;
	int i;

	*start_pfn = -1UL;
	*end_pfn = 0;

	for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
		*start_pfn = min(*start_pfn, this_start_pfn);
		*end_pfn = max(*end_pfn, this_end_pfn);
	}

	if (*start_pfn == -1UL)
		*start_pfn = 0;
}

static void __init free_area_init_node(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	unsigned long start_pfn = 0;
	unsigned long end_pfn = 0;

	/* pg_data_t should be reset to zero when it's allocated */
	WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);

	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);

	pgdat->node_id = nid;
	pgdat->node_start_pfn = start_pfn;
	pgdat->per_cpu_nodestats = NULL;

	if (start_pfn != end_pfn) {
		pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
			(u64)start_pfn << PAGE_SHIFT,
			end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);

		calculate_node_totalpages(pgdat, start_pfn, end_pfn);
	} else {
		pr_info("Initmem setup node %d as memoryless\n", nid);

		reset_memoryless_node_totalpages(pgdat);
	}

	alloc_node_mem_map(pgdat);
	pgdat_set_deferred_range(pgdat);

	free_area_init_core(pgdat);
	lru_gen_init_pgdat(pgdat);
}

/* Any regular or high memory on that node ? */
static void __init check_for_memory(pg_data_t *pgdat)
{
	enum zone_type zone_type;

	for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
		struct zone *zone = &pgdat->node_zones[zone_type];
		if (populated_zone(zone)) {
			if (IS_ENABLED(CONFIG_HIGHMEM))
				node_set_state(pgdat->node_id, N_HIGH_MEMORY);
			if (zone_type <= ZONE_NORMAL)
				node_set_state(pgdat->node_id, N_NORMAL_MEMORY);
			break;
		}
	}
}

#if MAX_NUMNODES > 1
/*
 * Figure out the number of possible node ids.
 */
void __init setup_nr_node_ids(void)
{
	unsigned int highest;

	highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
	nr_node_ids = highest + 1;
}
#endif

/*
 * Some architectures, e.g. ARC, may have ZONE_HIGHMEM below ZONE_NORMAL. For
 * such cases we allow max_zone_pfn to be sorted in descending order.
 */
static bool arch_has_descending_max_zone_pfns(void)
{
	return IS_ENABLED(CONFIG_ARC) && !IS_ENABLED(CONFIG_ARC_HAS_PAE40);
}

/**
 * free_area_init - Initialise all pg_data_t and zone data
 * @max_zone_pfn: an array of max PFNs for each zone
 *
 * This will call free_area_init_node() for each active node in the system.
 * Using the page ranges provided by memblock_set_node(), the size of each
 * zone in each node and their holes is calculated. If the maximum PFN
 * between two adjacent zones match, it is assumed that the zone is empty.
 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
 * starts where the previous one ended. For example, ZONE_DMA32 starts
 * at arch_max_dma_pfn.
 */
void __init free_area_init(unsigned long *max_zone_pfn)
{
	unsigned long start_pfn, end_pfn;
	int i, nid, zone;
	bool descending;

	/* Record where the zone boundaries are */
	memset(arch_zone_lowest_possible_pfn, 0,
				sizeof(arch_zone_lowest_possible_pfn));
	memset(arch_zone_highest_possible_pfn, 0,
				sizeof(arch_zone_highest_possible_pfn));

	start_pfn = PHYS_PFN(memblock_start_of_DRAM());
	descending = arch_has_descending_max_zone_pfns();

	for (i = 0; i < MAX_NR_ZONES; i++) {
		if (descending)
			zone = MAX_NR_ZONES - i - 1;
		else
			zone = i;

		if (zone == ZONE_MOVABLE)
			continue;

		end_pfn = max(max_zone_pfn[zone], start_pfn);
		arch_zone_lowest_possible_pfn[zone] = start_pfn;
		arch_zone_highest_possible_pfn[zone] = end_pfn;

		start_pfn = end_pfn;
	}

	/* Find the PFNs that ZONE_MOVABLE begins at in each node */
	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
	find_zone_movable_pfns_for_nodes();

	/* Print out the zone ranges */
	pr_info("Zone ranges:\n");
	for (i = 0; i < MAX_NR_ZONES; i++) {
		if (i == ZONE_MOVABLE)
			continue;
		pr_info("  %-8s ", zone_names[i]);
		if (arch_zone_lowest_possible_pfn[i] ==
				arch_zone_highest_possible_pfn[i])
			pr_cont("empty\n");
		else
			pr_cont("[mem %#018Lx-%#018Lx]\n",
				(u64)arch_zone_lowest_possible_pfn[i]
					<< PAGE_SHIFT,
				((u64)arch_zone_highest_possible_pfn[i]
					<< PAGE_SHIFT) - 1);
	}

	/* Print out the PFNs ZONE_MOVABLE begins at in each node */
	pr_info("Movable zone start for each node\n");
	for (i = 0; i < MAX_NUMNODES; i++) {
		if (zone_movable_pfn[i])
			pr_info("  Node %d: %#018Lx\n", i,
				(u64)zone_movable_pfn[i] << PAGE_SHIFT);
	}

	/*
	 * Print out the early node map, and initialize the
	 * subsection-map relative to active online memory ranges to
	 * enable future "sub-section" extensions of the memory map.
	 */
	pr_info("Early memory node ranges\n");
	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
		pr_info("  node %3d: [mem %#018Lx-%#018Lx]\n", nid,
			(u64)start_pfn << PAGE_SHIFT,
			((u64)end_pfn << PAGE_SHIFT) - 1);
		subsection_map_init(start_pfn, end_pfn - start_pfn);
	}

	/* Initialise every node */
	mminit_verify_pageflags_layout();
	setup_nr_node_ids();
	set_pageblock_order();

	for_each_node(nid) {
		pg_data_t *pgdat;

		if (!node_online(nid)) {
			/* Allocator not initialized yet */
			pgdat = arch_alloc_nodedata(nid);
			if (!pgdat)
				panic("Cannot allocate %zuB for node %d.\n",
				       sizeof(*pgdat), nid);
			arch_refresh_nodedata(nid, pgdat);
		}

		pgdat = NODE_DATA(nid);
		free_area_init_node(nid);

		/*
		 * No sysfs hierarchy will be created via register_one_node()
		 * for a memory-less node because here it's not marked as
		 * N_MEMORY and won't be set online later. The benefit is that
		 * userspace programs won't be confused by sysfs
		 * files/directories of a memory-less node. The pgdat will get
		 * fully initialized by hotadd_init_pgdat() when memory is
		 * hotplugged into this node.
		 */
		if (pgdat->node_present_pages) {
			node_set_state(nid, N_MEMORY);
			check_for_memory(pgdat);
		}
	}

	calc_nr_kernel_pages();
	memmap_init();

	/* disable hash distribution for systems with a single node */
	fixup_hashdist();
}

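/*
 * How the zone boundaries fall out of max_zone_pfn, with invented numbers:
 * if max_zone_pfn[ZONE_DMA32] == 0x100000 and max_zone_pfn[ZONE_NORMAL] ==
 * 0x400000, then ZONE_DMA32 covers DRAM start .. pfn 0x100000 and
 * ZONE_NORMAL covers pfns 0x100000 .. 0x400000; had both entries been equal,
 * ZONE_NORMAL would have been recorded as empty, exactly as the kernel-doc
 * above describes.
 */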
/**
 * node_map_pfn_alignment - determine the maximum internode alignment
 *
 * This function should be called after node map is populated and sorted.
 * It calculates the maximum power of two alignment which can distinguish
 * all the nodes.
 *
 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
 * shifted, 1GiB is enough and this function will indicate so.
 *
 * This is used to test whether pfn -> nid mapping of the chosen memory
 * model has fine enough granularity to avoid incorrect mapping for the
 * populated node map.
 *
 * Return: the determined alignment in pfn's. 0 if there is no alignment
 * requirement (single node).
 */
unsigned long __init node_map_pfn_alignment(void)
{
	unsigned long accl_mask = 0, last_end = 0;
	unsigned long start, end, mask;
	int last_nid = NUMA_NO_NODE;
	int i, nid;

	for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
		if (!start || last_nid < 0 || last_nid == nid) {
			last_nid = nid;
			last_end = end;
			continue;
		}

		/*
		 * Start with a mask granular enough to pin-point to the
		 * start pfn and tick off bits one-by-one until it becomes
		 * too coarse to separate the current node from the last.
		 */
		mask = ~((1 << __ffs(start)) - 1);
		while (mask && last_end <= (start & (mask << 1)))
			mask <<= 1;

		/* accumulate all internode masks */
		accl_mask |= mask;
	}

	/* convert mask to number of pages */
	return ~accl_mask + 1;
}

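/*
 * Small worked case for the mask loop above (PFNs invented for illustration):
 * with node 0 ending at pfn 0x40000 and node 1 starting at pfn 0x48000,
 * blocks aligned to 0x40000 pfns still keep the two nodes apart (the block
 * at 0x40000 starts exactly where node 0 ends), while 0x80000-pfn blocks
 * would mix them, so the function reports an alignment of 0x40000 pages.
 */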
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static void __init deferred_free_pages(unsigned long pfn,
				       unsigned long nr_pages)
{
	struct page *page;
	unsigned long i;

	if (!nr_pages)
		return;

	page = pfn_to_page(pfn);

	/* Free a large naturally-aligned chunk if possible */
	if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) {
		for (i = 0; i < nr_pages; i += pageblock_nr_pages)
			set_pageblock_migratetype(page + i, MIGRATE_MOVABLE);
__free_pages_core(page, MAX_PAGE_ORDER, MEMINIT_EARLY);
|
2023-03-22 01:05:02 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2023-12-28 22:47:04 +08:00
|
|
|
/* Accept chunks smaller than MAX_PAGE_ORDER upfront */
|
2023-06-06 22:26:29 +08:00
|
|
|
accept_memory(PFN_PHYS(pfn), PFN_PHYS(pfn + nr_pages));
|
|
|
|
|
2023-03-22 01:05:02 +08:00
|
|
|
for (i = 0; i < nr_pages; i++, page++, pfn++) {
|
|
|
|
if (pageblock_aligned(pfn))
|
|
|
|
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
|
2024-06-07 17:09:36 +08:00
|
|
|
__free_pages_core(page, 0, MEMINIT_EARLY);
|
2023-03-22 01:05:02 +08:00
|
|
|
}
|
|
|
|
}
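/*
 * Illustration (assuming the default MAX_PAGE_ORDER of 10 and 4KiB pages):
 * a naturally aligned 1024-page (4MiB) range above is handed to the buddy
 * allocator as a single order-10 free; any other range is first accepted
 * (relevant only for unaccepted guest memory) and then freed page by page,
 * with the migratetype fixed up at each pageblock boundary.
 */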
|
|
|
|
|
|
|
|
/* Completion tracking for deferred_init_memmap() threads */
|
|
|
|
static atomic_t pgdat_init_n_undone __initdata;
|
|
|
|
static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
|
|
|
|
|
|
|
|
static inline void __init pgdat_init_report_one_done(void)
|
|
|
|
{
|
|
|
|
if (atomic_dec_and_test(&pgdat_init_n_undone))
|
|
|
|
complete(&pgdat_init_all_done_comp);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Initialize struct pages. We minimize pfn page lookups and scheduler checks
|
2023-03-21 08:24:15 +08:00
|
|
|
 * by performing them only once every MAX_ORDER_NR_PAGES.
|
2023-03-22 01:05:02 +08:00
|
|
|
* Return number of pages initialized.
|
|
|
|
*/
|
2024-06-12 10:04:21 +08:00
|
|
|
static unsigned long __init deferred_init_pages(struct zone *zone,
|
|
|
|
unsigned long pfn, unsigned long end_pfn)
|
2023-03-22 01:05:02 +08:00
|
|
|
{
|
|
|
|
int nid = zone_to_nid(zone);
|
2024-06-12 10:04:21 +08:00
|
|
|
unsigned long nr_pages = end_pfn - pfn;
|
2023-03-22 01:05:02 +08:00
|
|
|
int zid = zone_idx(zone);
|
2024-06-12 10:04:21 +08:00
|
|
|
struct page *page = pfn_to_page(pfn);
|
2023-03-22 01:05:02 +08:00
|
|
|
|
2024-06-12 10:04:21 +08:00
|
|
|
for (; pfn < end_pfn; pfn++, page++)
|
2023-03-22 01:05:02 +08:00
|
|
|
__init_single_page(page, pfn, zid, nid);
|
2024-03-26 14:11:31 +08:00
|
|
|
return nr_pages;
|
2023-03-22 01:05:02 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2024-06-05 15:13:39 +08:00
|
|
|
* This function is meant to pre-load the iterator for the zone init from
|
|
|
|
* a given point.
|
|
|
|
* Specifically it walks through the ranges starting with initial index
|
|
|
|
* passed to it until we are caught up to the first_init_pfn value and
|
|
|
|
* exits there. If we never encounter the value we return false indicating
|
|
|
|
* there are no valid ranges left.
|
2023-03-22 01:05:02 +08:00
|
|
|
*/
|
|
|
|
static bool __init
|
|
|
|
deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
|
|
|
|
unsigned long *spfn, unsigned long *epfn,
|
|
|
|
unsigned long first_init_pfn)
|
|
|
|
{
|
2024-06-05 15:13:39 +08:00
|
|
|
u64 j = *i;
|
|
|
|
|
|
|
|
if (j == 0)
|
|
|
|
__next_mem_pfn_range_in_zone(&j, zone, spfn, epfn);
|
2023-03-22 01:05:02 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Start out by walking through the ranges in this zone that have
|
|
|
|
* already been initialized. We don't need to do anything with them
|
|
|
|
* so we just need to flush them out of the system.
|
|
|
|
*/
|
2024-06-05 15:13:39 +08:00
|
|
|
for_each_free_mem_pfn_range_in_zone_from(j, zone, spfn, epfn) {
|
2023-03-22 01:05:02 +08:00
|
|
|
if (*epfn <= first_init_pfn)
|
|
|
|
continue;
|
|
|
|
if (*spfn < first_init_pfn)
|
|
|
|
*spfn = first_init_pfn;
|
|
|
|
*i = j;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
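/*
 * Typical use, mirroring deferred_init_memmap_chunk() below (sketch only):
 * start with a zeroed iterator cookie, e.g.
 *
 *	u64 i = 0;
 *
 *	if (deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, pfn))
 *		walk_from(i, spfn, epfn);
 *
 * where walk_from() is a placeholder name, not a real helper, standing in
 * for for_each_free_mem_pfn_range_in_zone_from() continuing from the
 * returned position.
 */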
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Initialize and free pages. We do it in two loops: first we initialize
|
|
|
|
* struct page, then free to buddy allocator, because while we are
|
|
|
|
* freeing pages we can access pages that are ahead (computing buddy
|
|
|
|
* page in __free_one_page()).
|
|
|
|
*
|
|
|
|
* In order to try and keep some memory in the cache we have the loop
|
|
|
|
* broken along max page order boundaries. This way we will not cause
|
|
|
|
* any issues with the buddy page computation.
|
|
|
|
*/
|
|
|
|
static unsigned long __init
|
|
|
|
deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
|
|
|
|
unsigned long *end_pfn)
|
|
|
|
{
|
|
|
|
unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
|
|
|
|
unsigned long spfn = *start_pfn, epfn = *end_pfn;
|
|
|
|
unsigned long nr_pages = 0;
|
|
|
|
u64 j = *i;
|
|
|
|
|
|
|
|
/* First we loop through and initialize the page values */
|
|
|
|
for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
|
|
|
|
unsigned long t;
|
|
|
|
|
|
|
|
if (mo_pfn <= *start_pfn)
|
|
|
|
break;
|
|
|
|
|
|
|
|
t = min(mo_pfn, *end_pfn);
|
|
|
|
nr_pages += deferred_init_pages(zone, *start_pfn, t);
|
|
|
|
|
|
|
|
if (mo_pfn < *end_pfn) {
|
|
|
|
*start_pfn = mo_pfn;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Reset values and now loop through freeing pages as needed */
|
|
|
|
swap(j, *i);
|
|
|
|
|
|
|
|
for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
|
|
|
|
unsigned long t;
|
|
|
|
|
|
|
|
if (mo_pfn <= spfn)
|
|
|
|
break;
|
|
|
|
|
|
|
|
t = min(mo_pfn, epfn);
|
2024-06-12 10:04:21 +08:00
|
|
|
deferred_free_pages(spfn, t - spfn);
|
2023-03-22 01:05:02 +08:00
|
|
|
|
|
|
|
if (mo_pfn <= epfn)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return nr_pages;
|
|
|
|
}
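/*
 * Example (illustrative, MAX_ORDER_NR_PAGES == 1024): entering with
 * *start_pfn == 1000 gives mo_pfn == ALIGN(1001, 1024) == 1024, so this call
 * initializes and then frees at most PFNs 1000-1023 before returning; the
 * caller loops to cover the next max-order-aligned block.
 */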
|
|
|
|
|
|
|
|
static void __init
|
|
|
|
deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
|
|
|
|
void *arg)
|
|
|
|
{
|
|
|
|
unsigned long spfn, epfn;
|
|
|
|
struct zone *zone = arg;
|
2024-06-05 15:13:39 +08:00
|
|
|
u64 i = 0;
|
2023-03-22 01:05:02 +08:00
|
|
|
|
|
|
|
deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
|
|
|
|
|
|
|
|
/*
|
2023-12-28 22:47:04 +08:00
|
|
|
* Initialize and free pages in MAX_PAGE_ORDER sized increments so that
|
|
|
|
* we can avoid introducing any issues with the buddy allocator.
|
2023-03-22 01:05:02 +08:00
|
|
|
*/
|
|
|
|
while (spfn < end_pfn) {
|
|
|
|
deferred_init_maxorder(&i, zone, &spfn, &epfn);
|
|
|
|
cond_resched();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-05-23 04:38:01 +08:00
|
|
|
static unsigned int __init
|
2023-03-22 01:05:02 +08:00
|
|
|
deferred_page_init_max_threads(const struct cpumask *node_cpumask)
|
|
|
|
{
|
2024-05-23 04:38:01 +08:00
|
|
|
return max(cpumask_weight(node_cpumask), 1U);
|
2023-03-22 01:05:02 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Initialise remaining memory on a node */
|
|
|
|
static int __init deferred_init_memmap(void *data)
|
|
|
|
{
|
|
|
|
pg_data_t *pgdat = data;
|
|
|
|
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
|
|
|
|
unsigned long spfn = 0, epfn = 0;
|
|
|
|
unsigned long first_init_pfn, flags;
|
|
|
|
unsigned long start = jiffies;
|
|
|
|
struct zone *zone;
|
2024-06-05 15:13:37 +08:00
|
|
|
int max_threads;
|
2024-06-05 15:13:39 +08:00
|
|
|
u64 i = 0;
|
2023-03-22 01:05:02 +08:00
|
|
|
|
|
|
|
/* Bind memory initialisation thread to a local node if possible */
|
|
|
|
if (!cpumask_empty(cpumask))
|
|
|
|
set_cpus_allowed_ptr(current, cpumask);
|
|
|
|
|
|
|
|
pgdat_resize_lock(pgdat, &flags);
|
|
|
|
first_init_pfn = pgdat->first_deferred_pfn;
|
|
|
|
if (first_init_pfn == ULONG_MAX) {
|
|
|
|
pgdat_resize_unlock(pgdat, &flags);
|
|
|
|
pgdat_init_report_one_done();
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Sanity check boundaries */
|
|
|
|
BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
|
|
|
|
BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
|
|
|
|
pgdat->first_deferred_pfn = ULONG_MAX;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Once we unlock here, the zone cannot be grown anymore, thus if an
|
|
|
|
* interrupt thread must allocate this early in boot, zone must be
|
|
|
|
* pre-grown prior to start of deferred page initialization.
|
|
|
|
*/
|
|
|
|
pgdat_resize_unlock(pgdat, &flags);
|
|
|
|
|
2024-06-05 15:13:37 +08:00
|
|
|
/* Only the highest zone is deferred */
|
|
|
|
zone = pgdat->node_zones + pgdat->nr_zones - 1;
|
2023-03-22 01:05:02 +08:00
|
|
|
|
|
|
|
max_threads = deferred_page_init_max_threads(cpumask);
|
|
|
|
|
2024-06-05 15:13:38 +08:00
|
|
|
while (deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, first_init_pfn)) {
|
|
|
|
first_init_pfn = ALIGN(epfn, PAGES_PER_SECTION);
|
2023-03-22 01:05:02 +08:00
|
|
|
struct padata_mt_job job = {
|
|
|
|
.thread_fn = deferred_init_memmap_chunk,
|
|
|
|
.fn_arg = zone,
|
|
|
|
.start = spfn,
|
2024-06-05 15:13:38 +08:00
|
|
|
.size = first_init_pfn - spfn,
|
2023-03-22 01:05:02 +08:00
|
|
|
.align = PAGES_PER_SECTION,
|
|
|
|
.min_chunk = PAGES_PER_SECTION,
|
|
|
|
.max_threads = max_threads,
|
2024-03-07 05:04:17 +08:00
|
|
|
.numa_aware = false,
|
2023-03-22 01:05:02 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
padata_do_multithreaded(&job);
|
|
|
|
}
|
2024-06-05 15:13:38 +08:00
|
|
|
|
2023-03-22 01:05:02 +08:00
|
|
|
/* Sanity check that the next zone really is unpopulated */
|
2024-06-05 15:13:37 +08:00
|
|
|
WARN_ON(pgdat->nr_zones < MAX_NR_ZONES && populated_zone(++zone));
|
2023-03-22 01:05:02 +08:00
|
|
|
|
|
|
|
pr_info("node %d deferred pages initialised in %ums\n",
|
|
|
|
pgdat->node_id, jiffies_to_msecs(jiffies - start));
|
|
|
|
|
|
|
|
pgdat_init_report_one_done();
|
|
|
|
return 0;
|
|
|
|
}
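/*
 * Illustration: each loop iteration above submits one padata job covering
 * [spfn, ALIGN(epfn, PAGES_PER_SECTION)) with chunk boundaries aligned to
 * sections, so with PAGES_PER_SECTION == 32768 (128MiB sections on x86-64;
 * this varies by architecture) no worker thread ever splits a section, and
 * the work is fanned out across up to max_threads CPUs of the local node.
 */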
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If this zone has deferred pages, try to grow it by initializing enough
|
|
|
|
* deferred pages to satisfy the allocation specified by order, rounded up to
|
|
|
|
* the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
|
|
|
|
* of SECTION_SIZE bytes by initializing struct pages in increments of
|
|
|
|
* PAGES_PER_SECTION * sizeof(struct page) bytes.
|
|
|
|
*
|
|
|
|
* Return true when zone was grown, otherwise return false. We return true even
|
|
|
|
* when we grow less than requested, to let the caller decide if there are
|
|
|
|
* enough pages to satisfy the allocation.
|
|
|
|
*/
|
|
|
|
bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
|
|
|
|
{
|
|
|
|
unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
|
|
|
|
pg_data_t *pgdat = zone->zone_pgdat;
|
|
|
|
unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
|
|
|
|
unsigned long spfn, epfn, flags;
|
|
|
|
unsigned long nr_pages = 0;
|
2024-06-05 15:13:39 +08:00
|
|
|
u64 i = 0;
|
2023-03-22 01:05:02 +08:00
|
|
|
|
|
|
|
/* Only the last zone may have deferred pages */
|
|
|
|
if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
pgdat_resize_lock(pgdat, &flags);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If someone grew this zone while we were waiting for spinlock, return
|
|
|
|
* true, as there might be enough pages already.
|
|
|
|
*/
|
|
|
|
if (first_deferred_pfn != pgdat->first_deferred_pfn) {
|
|
|
|
pgdat_resize_unlock(pgdat, &flags);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* If the zone is empty somebody else may have cleared out the zone */
|
|
|
|
if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
|
|
|
|
first_deferred_pfn)) {
|
|
|
|
pgdat->first_deferred_pfn = ULONG_MAX;
|
|
|
|
pgdat_resize_unlock(pgdat, &flags);
|
|
|
|
/* Retry only once. */
|
|
|
|
return first_deferred_pfn != ULONG_MAX;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2023-12-28 22:47:04 +08:00
|
|
|
* Initialize and free pages in MAX_PAGE_ORDER sized increments so
|
2023-03-22 01:05:02 +08:00
|
|
|
* that we can avoid introducing any issues with the buddy
|
|
|
|
* allocator.
|
|
|
|
*/
|
|
|
|
while (spfn < epfn) {
|
|
|
|
/* update our first deferred PFN for this section */
|
|
|
|
first_deferred_pfn = spfn;
|
|
|
|
|
|
|
|
nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
|
|
|
|
touch_nmi_watchdog();
|
|
|
|
|
|
|
|
/* We should only stop along section boundaries */
|
|
|
|
if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
/* If our quota has been met we can stop here */
|
|
|
|
if (nr_pages >= nr_pages_needed)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
pgdat->first_deferred_pfn = spfn;
|
|
|
|
pgdat_resize_unlock(pgdat, &flags);
|
|
|
|
|
|
|
|
return nr_pages > 0;
|
|
|
|
}
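/*
 * Caller context (not visible in this file, noted here for orientation):
 * deferred_grow_zone() is reached from the page allocator slow path while
 * the deferred_pages static key is still enabled, so early boot-time
 * allocations can pull in just enough struct pages instead of waiting for
 * the deferred_init_memmap() kthreads to finish.
 */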
|
|
|
|
|
|
|
|
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
|
|
|
|
|
|
|
|
#ifdef CONFIG_CMA
|
|
|
|
void __init init_cma_reserved_pageblock(struct page *page)
|
|
|
|
{
|
|
|
|
unsigned i = pageblock_nr_pages;
|
|
|
|
struct page *p = page;
|
|
|
|
|
|
|
|
do {
|
|
|
|
__ClearPageReserved(p);
|
|
|
|
set_page_count(p, 0);
|
|
|
|
} while (++p, --i);
|
|
|
|
|
|
|
|
set_pageblock_migratetype(page, MIGRATE_CMA);
|
|
|
|
set_page_refcounted(page);
|
2024-08-13 23:07:57 +08:00
|
|
|
/* pages were reserved and not allocated */
|
|
|
|
clear_page_tag_ref(page);
|
2023-03-22 01:05:02 +08:00
|
|
|
__free_pages(page, pageblock_order);
|
|
|
|
|
|
|
|
adjust_managed_page_count(page, pageblock_nr_pages);
|
|
|
|
page_zone(page)->cma_pages += pageblock_nr_pages;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2023-05-16 14:38:11 +08:00
|
|
|
void set_zone_contiguous(struct zone *zone)
|
|
|
|
{
|
|
|
|
unsigned long block_start_pfn = zone->zone_start_pfn;
|
|
|
|
unsigned long block_end_pfn;
|
|
|
|
|
|
|
|
block_end_pfn = pageblock_end_pfn(block_start_pfn);
|
|
|
|
for (; block_start_pfn < zone_end_pfn(zone);
|
|
|
|
block_start_pfn = block_end_pfn,
|
|
|
|
block_end_pfn += pageblock_nr_pages) {
|
|
|
|
|
|
|
|
block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
|
|
|
|
|
|
|
|
if (!__pageblock_pfn_to_page(block_start_pfn,
|
|
|
|
block_end_pfn, zone))
|
|
|
|
return;
|
|
|
|
cond_resched();
|
|
|
|
}
|
|
|
|
|
|
|
|
/* We confirm that there is no hole */
|
|
|
|
zone->contiguous = true;
|
|
|
|
}
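/*
 * Note: zone->contiguous is consumed by pageblock_pfn_to_page() (defined in
 * mm/internal.h in recent kernels), which can then skip the per-pageblock
 * hole checks that compaction would otherwise repeat on every scan.
 */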
|
|
|
|
|
2024-06-11 22:52:23 +08:00
|
|
|
static void __init mem_init_print_info(void);
|
2023-03-22 01:05:02 +08:00
|
|
|
void __init page_alloc_init_late(void)
|
|
|
|
{
|
|
|
|
struct zone *zone;
|
|
|
|
int nid;
|
|
|
|
|
|
|
|
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
|
|
|
|
|
|
|
|
/* There will be num_node_state(N_MEMORY) threads */
|
|
|
|
atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
|
|
|
|
for_each_node_state(nid, N_MEMORY) {
|
|
|
|
kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Block until all are initialised */
|
|
|
|
wait_for_completion(&pgdat_init_all_done_comp);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We initialized the rest of the deferred pages. Permanently disable
|
|
|
|
* on-demand struct page initialization.
|
|
|
|
*/
|
|
|
|
static_branch_disable(&deferred_pages);
|
|
|
|
|
|
|
|
/* Reinit limits that are based on free pages after the kernel is up */
|
|
|
|
files_maxfiles_init();
|
|
|
|
#endif
|
|
|
|
|
2024-06-11 22:52:23 +08:00
|
|
|
/* Accounting of total+free memory is stable at this point. */
|
|
|
|
mem_init_print_info();
|
2023-03-22 01:05:02 +08:00
|
|
|
buffer_init();
|
|
|
|
|
|
|
|
/* Discard memblock private memory */
|
|
|
|
memblock_discard();
|
|
|
|
|
|
|
|
for_each_node_state(nid, N_MEMORY)
|
|
|
|
shuffle_free_memory(NODE_DATA(nid));
|
|
|
|
|
|
|
|
for_each_populated_zone(zone)
|
|
|
|
set_zone_contiguous(zone);
|
2023-03-22 01:05:09 +08:00
|
|
|
|
|
|
|
/* Initialize page ext after all struct pages are initialized. */
|
|
|
|
if (deferred_struct_pages)
|
|
|
|
page_ext_init();
|
2023-05-16 14:38:20 +08:00
|
|
|
|
|
|
|
page_alloc_sysctl_init();
|
2023-03-22 01:05:02 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Adaptive scale is meant to reduce sizes of hash tables on large memory
|
|
|
|
* machines. As memory size is increased the scale is also increased but at
|
|
|
|
* slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory
|
|
|
|
* quadruples the scale is increased by one, which means the size of hash table
|
|
|
|
* only doubles, instead of quadrupling as well.
|
|
|
|
* Because 32-bit systems cannot have large physical memory, where this scaling
|
|
|
|
* makes sense, it is disabled on such platforms.
|
|
|
|
*/
|
|
|
|
#if __BITS_PER_LONG > 32
|
|
|
|
#define ADAPT_SCALE_BASE (64ul << 30)
|
|
|
|
#define ADAPT_SCALE_SHIFT 2
|
|
|
|
#define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT)
|
|
|
|
#endif
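/*
 * Illustrative effect (4KiB pages assumed): ADAPT_SCALE_NPAGES is 16M pages
 * (64GiB). On a 512GiB machine (128M pages) the adaptive loop in
 * alloc_large_system_hash() bumps scale twice (16M -> 64M -> 256M), so an
 * 8x increase in memory over 64GiB grows a hash table only 2x.
 */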
|
|
|
|
|
|
|
|
/*
|
|
|
|
* allocate a large system hash table from bootmem
|
|
|
|
* - it is assumed that the hash table must contain an exact power-of-2
|
|
|
|
* quantity of entries
|
|
|
|
* - limit is the number of hash buckets, not the total allocation size
|
|
|
|
*/
|
|
|
|
void *__init alloc_large_system_hash(const char *tablename,
|
|
|
|
unsigned long bucketsize,
|
|
|
|
unsigned long numentries,
|
|
|
|
int scale,
|
|
|
|
int flags,
|
|
|
|
unsigned int *_hash_shift,
|
|
|
|
unsigned int *_hash_mask,
|
|
|
|
unsigned long low_limit,
|
|
|
|
unsigned long high_limit)
|
|
|
|
{
|
|
|
|
unsigned long long max = high_limit;
|
|
|
|
unsigned long log2qty, size;
|
|
|
|
void *table;
|
|
|
|
gfp_t gfp_flags;
|
|
|
|
bool virt;
|
|
|
|
bool huge;
|
|
|
|
|
|
|
|
/* allow the kernel cmdline to have a say */
|
|
|
|
if (!numentries) {
|
|
|
|
/* round applicable memory size up to nearest megabyte */
|
|
|
|
numentries = nr_kernel_pages;
|
|
|
|
|
|
|
|
/* It isn't necessary when PAGE_SIZE >= 1MB */
|
|
|
|
if (PAGE_SIZE < SZ_1M)
|
|
|
|
numentries = round_up(numentries, SZ_1M / PAGE_SIZE);
|
|
|
|
|
|
|
|
#if __BITS_PER_LONG > 32
|
|
|
|
if (!high_limit) {
|
|
|
|
unsigned long adapt;
|
|
|
|
|
|
|
|
for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
|
|
|
|
adapt <<= ADAPT_SCALE_SHIFT)
|
|
|
|
scale++;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* limit to 1 bucket per 2^scale bytes of low memory */
|
|
|
|
if (scale > PAGE_SHIFT)
|
|
|
|
numentries >>= (scale - PAGE_SHIFT);
|
|
|
|
else
|
|
|
|
numentries <<= (PAGE_SHIFT - scale);
|
|
|
|
|
2023-06-25 10:13:23 +08:00
|
|
|
if (unlikely((numentries * bucketsize) < PAGE_SIZE))
|
2023-03-22 01:05:02 +08:00
|
|
|
numentries = PAGE_SIZE / bucketsize;
|
|
|
|
}
|
|
|
|
numentries = roundup_pow_of_two(numentries);
|
|
|
|
|
|
|
|
/* limit allocation size to 1/16 total memory by default */
|
|
|
|
if (max == 0) {
|
|
|
|
max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
|
|
|
|
do_div(max, bucketsize);
|
|
|
|
}
|
|
|
|
max = min(max, 0x80000000ULL);
|
|
|
|
|
|
|
|
if (numentries < low_limit)
|
|
|
|
numentries = low_limit;
|
|
|
|
if (numentries > max)
|
|
|
|
numentries = max;
|
|
|
|
|
|
|
|
log2qty = ilog2(numentries);
|
|
|
|
|
|
|
|
gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
|
|
|
|
do {
|
|
|
|
virt = false;
|
|
|
|
size = bucketsize << log2qty;
|
|
|
|
if (flags & HASH_EARLY) {
|
|
|
|
if (flags & HASH_ZERO)
|
|
|
|
table = memblock_alloc(size, SMP_CACHE_BYTES);
|
|
|
|
else
|
|
|
|
table = memblock_alloc_raw(size,
|
|
|
|
SMP_CACHE_BYTES);
|
2023-12-28 22:47:04 +08:00
|
|
|
} else if (get_order(size) > MAX_PAGE_ORDER || hashdist) {
|
2023-03-22 01:05:02 +08:00
|
|
|
table = vmalloc_huge(size, gfp_flags);
|
|
|
|
virt = true;
|
|
|
|
if (table)
|
|
|
|
huge = is_vm_area_hugepages(table);
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
			 * If bucketsize is not a power of two, we may end up with
|
|
|
|
			 * unused pages at the end of the hash table, which
|
|
|
|
			 * alloc_pages_exact() automatically frees.
|
|
|
|
*/
|
|
|
|
table = alloc_pages_exact(size, gfp_flags);
|
|
|
|
kmemleak_alloc(table, size, 1, gfp_flags);
|
|
|
|
}
|
|
|
|
} while (!table && size > PAGE_SIZE && --log2qty);
|
|
|
|
|
|
|
|
if (!table)
|
|
|
|
panic("Failed to allocate %s hash table\n", tablename);
|
|
|
|
|
|
|
|
pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
|
|
|
|
tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
|
|
|
|
virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
|
|
|
|
|
|
|
|
if (_hash_shift)
|
|
|
|
*_hash_shift = log2qty;
|
|
|
|
if (_hash_mask)
|
|
|
|
*_hash_mask = (1 << log2qty) - 1;
|
|
|
|
|
|
|
|
return table;
|
|
|
|
}
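/*
 * Illustrative caller (argument values borrowed from fs/dcache.c and may
 * differ between kernel versions):
 *
 *	dentry_hashtable = alloc_large_system_hash("Dentry cache",
 *				sizeof(struct hlist_bl_head), dhash_entries,
 *				13, HASH_EARLY | HASH_ZERO, &d_hash_shift,
 *				NULL, 0, 0);
 *
 * i.e. roughly one bucket per 8KiB of memory (scale == 13) unless the
 * "dhash_entries=" boot option supplies an explicit count.
 */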
|
|
|
|
|
|
|
|
void __init memblock_free_pages(struct page *page, unsigned long pfn,
|
|
|
|
unsigned int order)
|
|
|
|
{
|
2023-06-19 10:34:06 +08:00
|
|
|
if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) {
|
|
|
|
int nid = early_pfn_to_nid(pfn);
|
|
|
|
|
|
|
|
if (!early_page_initialised(pfn, nid))
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2023-03-22 01:05:02 +08:00
|
|
|
if (!kmsan_memblock_free_pages(page, order)) {
|
|
|
|
/* KMSAN will take care of these pages. */
|
|
|
|
return;
|
|
|
|
}
|
2024-03-22 00:36:56 +08:00
|
|
|
|
|
|
|
/* pages were reserved and not allocated */
|
2024-08-13 23:07:56 +08:00
|
|
|
clear_page_tag_ref(page);
|
2024-06-07 17:09:36 +08:00
|
|
|
__free_pages_core(page, order, MEMINIT_EARLY);
|
2023-03-22 01:05:02 +08:00
|
|
|
}
|
2023-03-22 01:05:06 +08:00
|
|
|
|
2023-05-16 14:38:10 +08:00
|
|
|
DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
|
|
|
|
EXPORT_SYMBOL(init_on_alloc);
|
|
|
|
|
|
|
|
DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
|
|
|
|
EXPORT_SYMBOL(init_on_free);
|
|
|
|
|
2023-03-22 01:05:08 +08:00
|
|
|
static bool _init_on_alloc_enabled_early __read_mostly
|
|
|
|
= IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
|
|
|
|
static int __init early_init_on_alloc(char *buf)
|
|
|
|
{
|
|
|
|
|
|
|
|
return kstrtobool(buf, &_init_on_alloc_enabled_early);
|
|
|
|
}
|
|
|
|
early_param("init_on_alloc", early_init_on_alloc);
|
|
|
|
|
|
|
|
static bool _init_on_free_enabled_early __read_mostly
|
|
|
|
= IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
|
|
|
|
static int __init early_init_on_free(char *buf)
|
|
|
|
{
|
|
|
|
return kstrtobool(buf, &_init_on_free_enabled_early);
|
|
|
|
}
|
|
|
|
early_param("init_on_free", early_init_on_free);
|
|
|
|
|
|
|
|
DEFINE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Enable static keys related to various memory debugging and hardening options.
|
|
|
|
* Some override others, and depend on early params that are evaluated in the
|
|
|
|
* order of appearance. So we need to first gather the full picture of what was
|
|
|
|
* enabled, and then make decisions.
|
|
|
|
*/
|
|
|
|
static void __init mem_debugging_and_hardening_init(void)
|
|
|
|
{
|
|
|
|
bool page_poisoning_requested = false;
|
|
|
|
bool want_check_pages = false;
|
|
|
|
|
|
|
|
#ifdef CONFIG_PAGE_POISONING
|
|
|
|
/*
|
|
|
|
* Page poisoning is debug page alloc for some arches. If
|
|
|
|
* either of those options are enabled, enable poisoning.
|
|
|
|
*/
|
|
|
|
if (page_poisoning_enabled() ||
|
|
|
|
(!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
|
|
|
|
debug_pagealloc_enabled())) {
|
|
|
|
static_branch_enable(&_page_poisoning_enabled);
|
|
|
|
page_poisoning_requested = true;
|
|
|
|
want_check_pages = true;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2024-06-05 17:17:10 +08:00
|
|
|
if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) &&
|
2023-03-22 01:05:08 +08:00
|
|
|
page_poisoning_requested) {
|
|
|
|
pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
|
2024-06-05 17:17:10 +08:00
|
|
|
"will take precedence over init_on_alloc and init_on_free\n");
|
2023-03-22 01:05:08 +08:00
|
|
|
_init_on_alloc_enabled_early = false;
|
|
|
|
_init_on_free_enabled_early = false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (_init_on_alloc_enabled_early) {
|
|
|
|
want_check_pages = true;
|
|
|
|
static_branch_enable(&init_on_alloc);
|
|
|
|
} else {
|
|
|
|
static_branch_disable(&init_on_alloc);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (_init_on_free_enabled_early) {
|
|
|
|
want_check_pages = true;
|
|
|
|
static_branch_enable(&init_on_free);
|
|
|
|
} else {
|
|
|
|
static_branch_disable(&init_on_free);
|
|
|
|
}
|
|
|
|
|
2024-06-05 17:17:10 +08:00
|
|
|
if (IS_ENABLED(CONFIG_KMSAN) &&
|
|
|
|
(_init_on_alloc_enabled_early || _init_on_free_enabled_early))
|
|
|
|
pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n");
|
2023-03-22 01:05:08 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_DEBUG_PAGEALLOC
|
|
|
|
if (debug_pagealloc_enabled()) {
|
|
|
|
want_check_pages = true;
|
|
|
|
static_branch_enable(&_debug_pagealloc_enabled);
|
|
|
|
|
|
|
|
if (debug_guardpage_minorder())
|
|
|
|
static_branch_enable(&_debug_guardpage_enabled);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Any page debugging or hardening option also enables sanity checking
|
|
|
|
* of struct pages being allocated or freed. With CONFIG_DEBUG_VM it's
|
|
|
|
* enabled already.
|
|
|
|
*/
|
|
|
|
if (!IS_ENABLED(CONFIG_DEBUG_VM) && want_check_pages)
|
|
|
|
static_branch_enable(&check_pages_enabled);
|
|
|
|
}
|
|
|
|
|
2023-03-22 01:05:06 +08:00
|
|
|
/* Report memory auto-initialization states for this boot. */
|
|
|
|
static void __init report_meminit(void)
|
|
|
|
{
|
|
|
|
const char *stack;
|
|
|
|
|
|
|
|
if (IS_ENABLED(CONFIG_INIT_STACK_ALL_PATTERN))
|
|
|
|
stack = "all(pattern)";
|
|
|
|
else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO))
|
|
|
|
stack = "all(zero)";
|
|
|
|
else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL))
|
|
|
|
stack = "byref_all(zero)";
|
|
|
|
else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF))
|
|
|
|
stack = "byref(zero)";
|
|
|
|
else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER))
|
|
|
|
stack = "__user(zero)";
|
|
|
|
else
|
|
|
|
stack = "off";
|
|
|
|
|
2024-06-05 17:17:10 +08:00
|
|
|
pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s\n",
|
2023-03-22 01:05:06 +08:00
|
|
|
stack, want_init_on_alloc(GFP_KERNEL) ? "on" : "off",
|
2024-06-05 17:17:10 +08:00
|
|
|
want_init_on_free() ? "on" : "off");
|
2023-03-22 01:05:06 +08:00
|
|
|
if (want_init_on_free())
|
|
|
|
pr_info("mem auto-init: clearing system memory may take some time...\n");
|
|
|
|
}
|
|
|
|
|
2023-03-22 01:05:10 +08:00
|
|
|
static void __init mem_init_print_info(void)
|
|
|
|
{
|
|
|
|
unsigned long physpages, codesize, datasize, rosize, bss_size;
|
|
|
|
unsigned long init_code_size, init_data_size;
|
|
|
|
|
|
|
|
physpages = get_num_physpages();
|
|
|
|
codesize = _etext - _stext;
|
|
|
|
datasize = _edata - _sdata;
|
|
|
|
rosize = __end_rodata - __start_rodata;
|
|
|
|
bss_size = __bss_stop - __bss_start;
|
|
|
|
init_data_size = __init_end - __init_begin;
|
|
|
|
init_code_size = _einittext - _sinittext;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Detect special cases and adjust section sizes accordingly:
|
|
|
|
* 1) .init.* may be embedded into .data sections
|
|
|
|
* 2) .init.text.* may be out of [__init_begin, __init_end],
|
|
|
|
* please refer to arch/tile/kernel/vmlinux.lds.S.
|
|
|
|
* 3) .rodata.* may be embedded into .text or .data sections.
|
|
|
|
*/
|
|
|
|
#define adj_init_size(start, end, size, pos, adj) \
|
|
|
|
do { \
|
|
|
|
if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
|
|
|
|
size -= adj; \
|
|
|
|
} while (0)
|
|
|
|
|
|
|
|
adj_init_size(__init_begin, __init_end, init_data_size,
|
|
|
|
_sinittext, init_code_size);
|
|
|
|
adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
|
|
|
|
adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
|
|
|
|
adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
|
|
|
|
adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
|
|
|
|
|
|
|
|
#undef adj_init_size
|
|
|
|
|
|
|
|
pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
|
|
|
|
#ifdef CONFIG_HIGHMEM
|
|
|
|
", %luK highmem"
|
|
|
|
#endif
|
|
|
|
")\n",
|
|
|
|
K(nr_free_pages()), K(physpages),
|
|
|
|
codesize / SZ_1K, datasize / SZ_1K, rosize / SZ_1K,
|
|
|
|
(init_data_size + init_code_size) / SZ_1K, bss_size / SZ_1K,
|
|
|
|
K(physpages - totalram_pages() - totalcma_pages),
|
|
|
|
K(totalcma_pages)
|
|
|
|
#ifdef CONFIG_HIGHMEM
|
|
|
|
, K(totalhigh_pages())
|
|
|
|
#endif
|
|
|
|
);
|
|
|
|
}
|
|
|
|
|
2023-03-22 01:05:06 +08:00
|
|
|
/*
|
|
|
|
* Set up kernel memory allocators
|
|
|
|
*/
|
|
|
|
void __init mm_core_init(void)
|
|
|
|
{
|
|
|
|
/* Initializations relying on SMP setup */
|
2024-06-19 09:06:10 +08:00
|
|
|
BUILD_BUG_ON(MAX_ZONELISTS > 2);
|
2023-03-22 01:05:06 +08:00
|
|
|
build_all_zonelists(NULL);
|
|
|
|
page_alloc_init_cpuhp();
|
|
|
|
|
|
|
|
/*
|
|
|
|
* page_ext requires contiguous pages,
|
2023-12-28 22:47:04 +08:00
|
|
|
* bigger than MAX_PAGE_ORDER unless SPARSEMEM.
|
2023-03-22 01:05:06 +08:00
|
|
|
*/
|
|
|
|
page_ext_init_flatmem();
|
2023-03-22 01:05:08 +08:00
|
|
|
mem_debugging_and_hardening_init();
|
2023-07-18 15:30:19 +08:00
|
|
|
kfence_alloc_pool_and_metadata();
|
2023-03-22 01:05:06 +08:00
|
|
|
report_meminit();
|
|
|
|
kmsan_init_shadow();
|
|
|
|
stack_depot_early_init();
|
|
|
|
mem_init();
|
|
|
|
kmem_cache_init();
|
|
|
|
/*
|
|
|
|
* page_owner must be initialized after buddy is ready, and also after
|
|
|
|
* slab is ready so that stack_depot_init() works properly
|
|
|
|
*/
|
|
|
|
page_ext_init_flatmem_late();
|
|
|
|
kmemleak_init();
|
2023-03-22 01:05:07 +08:00
|
|
|
ptlock_cache_init();
|
|
|
|
pgtable_cache_init();
|
2023-03-22 01:05:06 +08:00
|
|
|
debug_objects_mem_init();
|
|
|
|
vmalloc_init();
|
|
|
|
	/* If page_ext init was not deferred, do it now that vmap is fully initialized */
|
|
|
|
if (!deferred_struct_pages)
|
|
|
|
page_ext_init();
|
|
|
|
/* Should be run before the first non-init thread is created */
|
|
|
|
init_espfix_bsp();
|
|
|
|
/* Should be run after espfix64 is set up. */
|
|
|
|
pti_init();
|
|
|
|
kmsan_init_runtime();
|
|
|
|
mm_cache_init();
|
2024-05-06 00:06:19 +08:00
|
|
|
execmem_init();
|
2023-03-22 01:05:06 +08:00
|
|
|
}
|