// SPDX-License-Identifier: GPL-2.0-only
/*
 * Based on arch/arm/mm/init.c
 *
 * Copyright (C) 1995-2005 Russell King
 * Copyright (C) 2012 ARM Ltd.
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/errno.h>
#include <linux/swap.h>
#include <linux/init.h>
#include <linux/cache.h>
#include <linux/mman.h>
#include <linux/nodemask.h>
#include <linux/initrd.h>
#include <linux/gfp.h>
#include <linux/memblock.h>
#include <linux/sort.h>
#include <linux/of.h>
#include <linux/of_fdt.h>
#include <linux/dma-direct.h>
#include <linux/dma-map-ops.h>
#include <linux/efi.h>
#include <linux/swiotlb.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/crash_dump.h>
#include <linux/hugetlb.h>
#include <linux/acpi_iort.h>

#include <asm/boot.h>
#include <asm/fixmap.h>
#include <asm/kasan.h>
#include <asm/kernel-pgtable.h>
#include <asm/kvm_host.h>
#include <asm/memory.h>
#include <asm/numa.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <linux/sizes.h>
#include <asm/tlb.h>
#include <asm/alternative.h>
#include <asm/xen/swiotlb-xen.h>

/*
 * We need to be able to catch inadvertent references to memstart_addr
 * that occur (potentially in generic code) before arm64_memblock_init()
 * executes, which assigns it its actual value. So use a default value
 * that cannot be mistaken for a real physical address.
 */
s64 memstart_addr __ro_after_init = -1;
EXPORT_SYMBOL(memstart_addr);

/*
 * If the corresponding config options are enabled, we create both ZONE_DMA
 * and ZONE_DMA32. By default ZONE_DMA covers the 32-bit addressable memory
 * unless restricted on specific platforms (e.g. 30-bit on Raspberry Pi 4).
 * In such a case, ZONE_DMA32 covers the rest of the 32-bit addressable
 * memory; otherwise it is empty.
 */
phys_addr_t arm64_dma_phys_limit __ro_after_init;
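
/*
 * Illustrative example (not part of the original source): on a Raspberry
 * Pi 4 with 4 GB of DRAM, whose peripherals can only address the first
 * 1 GB, the zones end up as ZONE_DMA = [0, 1 GB), ZONE_DMA32 = [1 GB, 4 GB)
 * and ZONE_NORMAL empty; arm64_dma_phys_limit is then 0x40000000.
 */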

#ifdef CONFIG_KEXEC_CORE
/*
 * reserve_crashkernel() - reserves memory for crash kernel
 *
 * This function reserves the memory area specified by the "crashkernel="
 * kernel command line parameter. The reserved memory is used by the dump
 * capture kernel when the primary kernel is crashing.
 */
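/*
 * Example usage (illustrative): booting with "crashkernel=512M" lets
 * memblock pick a 2 MB aligned base below arm64_dma_phys_limit for a
 * 512 MB reservation, while "crashkernel=512M@0x60000000" requests that
 * exact base and is rejected if the region is not usable memory, overlaps
 * an existing reservation, or is not 2 MB aligned.
 */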
static void __init reserve_crashkernel(void)
{
	unsigned long long crash_base, crash_size;
	int ret;

	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
				&crash_size, &crash_base);
	/* no crashkernel= or invalid value specified */
	if (ret || !crash_size)
		return;

	crash_size = PAGE_ALIGN(crash_size);

	if (crash_base == 0) {
		/* Current arm64 boot protocol requires 2MB alignment */
		crash_base = memblock_find_in_range(0, arm64_dma_phys_limit,
						    crash_size, SZ_2M);
		if (crash_base == 0) {
			pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
				crash_size);
			return;
		}
	} else {
		/* User specifies base address explicitly. */
		if (!memblock_is_region_memory(crash_base, crash_size)) {
			pr_warn("cannot reserve crashkernel: region is not memory\n");
			return;
		}

		if (memblock_is_region_reserved(crash_base, crash_size)) {
			pr_warn("cannot reserve crashkernel: region overlaps reserved memory\n");
			return;
		}

		if (!IS_ALIGNED(crash_base, SZ_2M)) {
			pr_warn("cannot reserve crashkernel: base address is not 2MB aligned\n");
			return;
		}
	}
	memblock_reserve(crash_base, crash_size);

	pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
		crash_base, crash_base + crash_size, crash_size >> 20);

	crashk_res.start = crash_base;
	crashk_res.end = crash_base + crash_size - 1;
}
#else
static void __init reserve_crashkernel(void)
{
}
#endif /* CONFIG_KEXEC_CORE */

#ifdef CONFIG_CRASH_DUMP
static int __init early_init_dt_scan_elfcorehdr(unsigned long node,
		const char *uname, int depth, void *data)
{
	const __be32 *reg;
	int len;

	if (depth != 1 || strcmp(uname, "chosen") != 0)
		return 0;

	reg = of_get_flat_dt_prop(node, "linux,elfcorehdr", &len);
	if (!reg || (len < (dt_root_addr_cells + dt_root_size_cells)))
		return 1;

	elfcorehdr_addr = dt_mem_next_cell(dt_root_addr_cells, &reg);
	elfcorehdr_size = dt_mem_next_cell(dt_root_size_cells, &reg);

	return 1;
}
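
/*
 * Illustrative device-tree fragment (assuming 2 address and 2 size cells,
 * values are made up): the crash dump kernel is handed the ELF core header
 * location via a /chosen property, e.g.
 *
 *	chosen {
 *		linux,elfcorehdr = <0x0 0x9fe00000 0x0 0x10000>;
 *	};
 */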

/*
 * reserve_elfcorehdr() - reserves memory for elf core header
 *
 * This function reserves the memory occupied by an elf core header
 * described in the device tree. This region contains all the
 * information about the primary kernel's core image and is used by a
 * dump capture kernel to access the system memory of the primary kernel.
 */
static void __init reserve_elfcorehdr(void)
{
	of_scan_flat_dt(early_init_dt_scan_elfcorehdr, NULL);

	if (!elfcorehdr_size)
		return;

	if (memblock_is_region_reserved(elfcorehdr_addr, elfcorehdr_size)) {
		pr_warn("elfcorehdr is overlapped\n");
		return;
	}

	memblock_reserve(elfcorehdr_addr, elfcorehdr_size);

	pr_info("Reserving %lldKB of memory at 0x%llx for elfcorehdr\n",
		elfcorehdr_size >> 10, elfcorehdr_addr);
}
#else
static void __init reserve_elfcorehdr(void)
{
}
#endif /* CONFIG_CRASH_DUMP */

/*
 * Return the maximum physical address for a zone given the number of
 * address bits it can access. If DRAM starts above 32-bit, expand the
 * zone to cover the maximum available memory; otherwise cap it at 32-bit.
 */
static phys_addr_t __init max_zone_phys(unsigned int zone_bits)
{
	phys_addr_t zone_mask = DMA_BIT_MASK(zone_bits);
	phys_addr_t phys_start = memblock_start_of_DRAM();

	if (phys_start > U32_MAX)
		zone_mask = PHYS_ADDR_MAX;
	else if (phys_start > zone_mask)
		zone_mask = U32_MAX;

	return min(zone_mask, memblock_end_of_DRAM() - 1) + 1;
}
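
/*
 * Worked example (illustrative): with zone_bits == 32 and DRAM spanning
 * [0x80000000, 0x100000000), phys_start is below both U32_MAX and the
 * 32-bit mask, so the limit is min(0xffffffff, 0xffffffff) + 1, i.e. the
 * zone ends exactly at 4 GB. If DRAM instead started at 0x8000000000, the
 * limit would expand to cover all of the available memory.
 */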

static void __init zone_sizes_init(unsigned long min, unsigned long max)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES] = {0};
	unsigned int __maybe_unused acpi_zone_dma_bits;
	unsigned int __maybe_unused dt_zone_dma_bits;
	phys_addr_t __maybe_unused dma32_phys_limit = max_zone_phys(32);

#ifdef CONFIG_ZONE_DMA
	acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address());
	dt_zone_dma_bits = fls64(of_dma_get_max_cpu_address(NULL));
	zone_dma_bits = min3(32U, dt_zone_dma_bits, acpi_zone_dma_bits);
	arm64_dma_phys_limit = max_zone_phys(zone_dma_bits);
	max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
#endif
#ifdef CONFIG_ZONE_DMA32
	max_zone_pfns[ZONE_DMA32] = PFN_DOWN(dma32_phys_limit);
	if (!arm64_dma_phys_limit)
		arm64_dma_phys_limit = dma32_phys_limit;
#endif
	if (!arm64_dma_phys_limit)
		arm64_dma_phys_limit = PHYS_MASK + 1;
	max_zone_pfns[ZONE_NORMAL] = max;

	free_area_init(max_zone_pfns);
}
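
/*
 * Illustrative note: zone_dma_bits is the smallest of 32 and the widest CPU
 * address reachable by every DMA master described in the devicetree
 * (of_dma_get_max_cpu_address()) or the ACPI IORT
 * (acpi_iort_dma_get_max_cpu_address()). On a Raspberry Pi 4 this yields
 * 30 bits, so ZONE_DMA ends at 1 GB; on systems with no constrained masters
 * it stays at 32 bits and ZONE_DMA spans the same range as ZONE_DMA32.
 */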

int pfn_valid(unsigned long pfn)
{
	phys_addr_t addr = PFN_PHYS(pfn);
	struct mem_section *ms;

	/*
	 * Ensure the upper PAGE_SHIFT bits are clear in the
	 * pfn. Otherwise they might lead to false positives when
	 * some of the upper bits are set, but the lower bits
	 * match a valid pfn.
	 */
	if (PHYS_PFN(addr) != pfn)
		return 0;
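
	/*
	 * Illustrative example: with PAGE_SHIFT == 12, a pfn such as
	 * (1UL << 55) | 0x80000 loses its top bit when shifted into a
	 * 64-bit physical address, so PHYS_PFN(PFN_PHYS(pfn)) != pfn and
	 * the bogus pfn is rejected instead of aliasing a valid frame.
	 */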

	if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
		return 0;

	ms = __pfn_to_section(pfn);
	if (!valid_section(ms))
		return 0;

	/*
	 * ZONE_DEVICE memory does not have memblock entries, so the
	 * memblock_is_map_memory() check for ZONE_DEVICE based
	 * addresses will always fail. Normal hotplugged memory will
	 * likewise never have the MEMBLOCK_NOMAP flag set in its
	 * memblock entries. Skip the memblock search for all non-early
	 * memory sections, covering all hotplugged memory, both normal
	 * and ZONE_DEVICE based.
	 */
	if (!early_section(ms))
		return pfn_section_valid(ms, pfn);

	return memblock_is_map_memory(addr);
}
EXPORT_SYMBOL(pfn_valid);

static phys_addr_t memory_limit = PHYS_ADDR_MAX;

/*
 * Limit the memory size that was specified via FDT.
 */
static int __init early_mem(char *p)
{
	if (!p)
		return 1;

	memory_limit = memparse(p, &p) & PAGE_MASK;
	pr_notice("Memory limited to %lldMB\n", memory_limit >> 20);

	return 0;
}
early_param("mem", early_mem);
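
/*
 * Example usage (illustrative): passing "mem=2G" on the kernel command line
 * caps the usable memory at 2 GB (the parsed value is rounded down to a page
 * boundary by the PAGE_MASK above); the excess is then trimmed in
 * arm64_memblock_init() via memblock_mem_limit_remove_map().
 */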

static int __init early_init_dt_scan_usablemem(unsigned long node,
		const char *uname, int depth, void *data)
{
	struct memblock_region *usablemem = data;
	const __be32 *reg;
	int len;

	if (depth != 1 || strcmp(uname, "chosen") != 0)
		return 0;

	reg = of_get_flat_dt_prop(node, "linux,usable-memory-range", &len);
	if (!reg || (len < (dt_root_addr_cells + dt_root_size_cells)))
		return 1;

	usablemem->base = dt_mem_next_cell(dt_root_addr_cells, &reg);
	usablemem->size = dt_mem_next_cell(dt_root_size_cells, &reg);

	return 1;
}

static void __init fdt_enforce_memory_region(void)
{
	struct memblock_region reg = {
		.size = 0,
	};

	of_scan_flat_dt(early_init_dt_scan_usablemem, &reg);

	if (reg.size)
		memblock_cap_memory_range(reg.base, reg.size);
}

void __init arm64_memblock_init(void)
{
	const s64 linear_region_size = PAGE_END - _PAGE_OFFSET(vabits_actual);

	/* Handle linux,usable-memory-range property */
	fdt_enforce_memory_region();

	/* Remove memory above our supported physical address size */
	memblock_remove(1ULL << PHYS_MASK_SHIFT, ULLONG_MAX);

	/*
	 * Select a suitable value for the base of physical memory.
	 */
	memstart_addr = round_down(memblock_start_of_DRAM(),
				   ARM64_MEMSTART_ALIGN);

	if ((memblock_end_of_DRAM() - memstart_addr) > linear_region_size)
		pr_warn("Memory doesn't fit in the linear mapping, VA_BITS too small\n");

	/*
	 * Remove the memory that we will not be able to cover with the
	 * linear mapping. Take care not to clip the kernel which may be
	 * high in memory.
	 */
	memblock_remove(max_t(u64, memstart_addr + linear_region_size,
			__pa_symbol(_end)), ULLONG_MAX);
	if (memstart_addr + linear_region_size < memblock_end_of_DRAM()) {
		/* ensure that memstart_addr remains sufficiently aligned */
		memstart_addr = round_up(memblock_end_of_DRAM() - linear_region_size,
					 ARM64_MEMSTART_ALIGN);
		memblock_remove(0, memstart_addr);
	}

	/*
	 * If we are running with a 52-bit kernel VA config on a system that
	 * does not support it, we have to place the available physical
	 * memory in the 48-bit addressable part of the linear region, i.e.,
	 * we have to move it upward. Since memstart_addr represents the
	 * physical address of PAGE_OFFSET, we have to *subtract* from it.
	 */
	if (IS_ENABLED(CONFIG_ARM64_VA_BITS_52) && (vabits_actual != 52))
		memstart_addr -= _PAGE_OFFSET(48) - _PAGE_OFFSET(52);
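
	/*
	 * Illustrative arithmetic (assuming _PAGE_OFFSET(va) == -(1UL << va)):
	 * _PAGE_OFFSET(48) - _PAGE_OFFSET(52) == (1UL << 52) - (1UL << 48),
	 * so memstart_addr drops by the size of the VA range the hardware
	 * cannot address, shifting all of DRAM into the 48-bit reachable
	 * portion of the linear map.
	 */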

	/*
	 * Apply the memory limit if it was set. Since the kernel may be loaded
	 * high up in memory, add back the kernel region that must be accessible
	 * via the linear mapping.
	 */
	if (memory_limit != PHYS_ADDR_MAX) {
		memblock_mem_limit_remove_map(memory_limit);
		memblock_add(__pa_symbol(_text), (u64)(_end - _text));
	}

	if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
		/*
		 * Add back the memory we just removed if it results in the
		 * initrd becoming inaccessible via the linear mapping.
		 * Otherwise, this is a no-op.
		 */
		u64 base = phys_initrd_start & PAGE_MASK;
		u64 size = PAGE_ALIGN(phys_initrd_start + phys_initrd_size) - base;

		/*
		 * We can only add back the initrd memory if we don't end up
		 * with more memory than we can address via the linear mapping.
		 * It is up to the bootloader to position the kernel and the
		 * initrd reasonably close to each other (i.e., within 32 GB of
		 * each other) so that all granule/#levels combinations can
		 * always access both.
		 */
		if (WARN(base < memblock_start_of_DRAM() ||
			 base + size > memblock_start_of_DRAM() +
				       linear_region_size,
			"initrd not fully accessible via the linear mapping -- please check your bootloader ...\n")) {
			phys_initrd_size = 0;
		} else {
			memblock_remove(base, size); /* clear MEMBLOCK_ flags */
			memblock_add(base, size);
			memblock_reserve(base, size);
		}
	}

	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
		extern u16 memstart_offset_seed;
		u64 mmfr0 = read_cpuid(ID_AA64MMFR0_EL1);
		int parange = cpuid_feature_extract_unsigned_field(
					mmfr0, ID_AA64MMFR0_PARANGE_SHIFT);
		s64 range = linear_region_size -
			    BIT(id_aa64mmfr0_parange_to_phys_shift(parange));

		/*
		 * If the size of the linear region exceeds, by a sufficient
		 * margin, the size of the region that the physical memory can
		 * span, randomize the linear region as well.
		 */
		if (memstart_offset_seed > 0 && range >= (s64)ARM64_MEMSTART_ALIGN) {
			range /= ARM64_MEMSTART_ALIGN;
			memstart_addr -= ARM64_MEMSTART_ALIGN *
					 ((range * memstart_offset_seed) >> 16);
		}
	}
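
	/*
	 * Illustrative arithmetic: memstart_offset_seed is a 16-bit value, so
	 * (range * memstart_offset_seed) >> 16 selects one of "range" slots
	 * of ARM64_MEMSTART_ALIGN bytes roughly uniformly; the slack between
	 * the linear region and the physical address range is what gets
	 * randomized here.
	 */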

	/*
	 * Register the kernel text, kernel data, initrd, and initial
	 * pagetables with memblock.
	 */
	memblock_reserve(__pa_symbol(_stext), _end - _stext);
	if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
		/* the generic initrd code expects virtual addresses */
		initrd_start = __phys_to_virt(phys_initrd_start);
		initrd_end = initrd_start + phys_initrd_size;
	}

	early_init_fdt_scan_reserved_mem();

	reserve_elfcorehdr();

	high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
}

void __init bootmem_init(void)
{
	unsigned long min, max;

	min = PFN_UP(memblock_start_of_DRAM());
	max = PFN_DOWN(memblock_end_of_DRAM());

	early_memtest(min << PAGE_SHIFT, max << PAGE_SHIFT);

	max_pfn = max_low_pfn = max;
	min_low_pfn = min;

	arch_numa_init();

	/*
	 * This must be done after arch_numa_init(), which calls numa_init()
	 * to initialize node_online_map, used by hugetlb_cma_reserve() when
	 * allocating the required CMA size across online nodes.
	 */
#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
	arm64_hugetlb_cma_reserve();
#endif
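
	/*
	 * Example usage (from the hugetlb_cma changelog): boot with
	 * "hugetlb_cma=10G" to reserve a per-node CMA area, then allocate
	 * gigantic pages at runtime with
	 * "echo 10 > /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages";
	 * without the option, or if the CMA reservation fails, the previous
	 * behaviour is unchanged.
	 */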

	dma_pernuma_cma_reserve();

	kvm_hyp_reserve();

	/*
	 * sparse_init() tries to allocate memory from memblock, so it must be
	 * done after the fixed reservations.
	 */
	sparse_init();
	zone_sizes_init(min, max);

	/*
	 * Reserve the CMA area after arm64_dma_phys_limit was initialised.
	 */
	dma_contiguous_reserve(arm64_dma_phys_limit);

	/*
	 * request_standard_resources() depends on crashkernel's memory being
	 * reserved, so do it here.
	 */
	reserve_crashkernel();

	memblock_dump_all();
}

/*
 * mem_init() marks the free areas in the mem_map and tells us how much memory
 * is free. This is done after various parts of the system have claimed their
 * memory after the kernel image.
 */
void __init mem_init(void)
{
	if (swiotlb_force == SWIOTLB_FORCE ||
	    max_pfn > PFN_DOWN(arm64_dma_phys_limit))
		swiotlb_init(1);
	else if (!xen_swiotlb_detect())
		swiotlb_force = SWIOTLB_NO_FORCE;

	set_max_mapnr(max_pfn - PHYS_PFN_OFFSET);

	/* this will put all unused low memory onto the freelists */
	memblock_free_all();

	/*
	 * Check boundaries twice: Some fundamental inconsistencies can be
	 * detected at build time already.
	 */
#ifdef CONFIG_COMPAT
	BUILD_BUG_ON(TASK_SIZE_32 > DEFAULT_MAP_WINDOW_64);
#endif

	if (PAGE_SIZE >= 16384 && get_num_physpages() <= 128) {
		extern int sysctl_overcommit_memory;
		/*
		 * On a machine this small we won't get anywhere without
		 * overcommit, so turn it on by default.
		 */
		sysctl_overcommit_memory = OVERCOMMIT_ALWAYS;
	}
}

void free_initmem(void)
{
	free_reserved_area(lm_alias(__init_begin),
			   lm_alias(__init_end),
			   POISON_FREE_INITMEM, "unused kernel");
	/*
	 * Unmap the __init region but leave the VM area in place. This
	 * prevents the region from being reused for kernel modules, which
	 * is not supported by kallsyms.
	 */
	vunmap_range((u64)__init_begin, (u64)__init_end);
}

void dump_mem_limit(void)
{
	if (memory_limit != PHYS_ADDR_MAX) {
		pr_emerg("Memory Limit: %llu MB\n", memory_limit >> 20);
	} else {
		pr_emerg("Memory Limit: none\n");
	}
}