// SPDX-License-Identifier: GPL-2.0-only
/*
 * Based on arch/arm/mm/mmu.c
 *
 * Copyright (C) 1995-2005 Russell King
 * Copyright (C) 2012 ARM Ltd.
 */

#include <linux/cache.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/ioport.h>
#include <linux/kexec.h>
#include <linux/libfdt.h>
#include <linux/mman.h>
#include <linux/nodemask.h>
#include <linux/memblock.h>
#include <linux/memremap.h>
#include <linux/memory.h>
#include <linux/fs.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/set_memory.h>

#include <asm/barrier.h>
#include <asm/cputype.h>
#include <asm/fixmap.h>
#include <asm/kasan.h>
#include <asm/kernel-pgtable.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <linux/sizes.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/ptdump.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>

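/*
 * Flags accepted by __create_pgd_mapping_locked() and its helpers below to
 * restrict how a mapping may be constructed: no block (section) entries, no
 * contiguous hints, and/or PXN set in the table descriptors so that nothing
 * mapped beneath them can be executable.
 */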
#define NO_BLOCK_MAPPINGS	BIT(0)
#define NO_CONT_MAPPINGS	BIT(1)
#define NO_EXEC_MAPPINGS	BIT(2)	/* assumes FEAT_HPDS is not used */

int idmap_t0sz __ro_after_init;

#if VA_BITS > 48
u64 vabits_actual __ro_after_init = VA_BITS_MIN;
EXPORT_SYMBOL(vabits_actual);
#endif

u64 kimage_vaddr __ro_after_init = (u64)&_text;
EXPORT_SYMBOL(kimage_vaddr);

u64 kimage_voffset __ro_after_init;
EXPORT_SYMBOL(kimage_voffset);

u32 __boot_cpu_mode[] = { BOOT_CPU_MODE_EL2, BOOT_CPU_MODE_EL1 };

/*
 * The booting CPU updates the failed status @__early_cpu_boot_status,
 * with MMU turned off.
 */
long __section(".mmuoff.data.write") __early_cpu_boot_status;

/*
 * Empty_zero_page is a special page that is used for zero-initialized data
 * and COW.
 */
unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss;
EXPORT_SYMBOL(empty_zero_page);

static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss;
static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused;
static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused;

static DEFINE_SPINLOCK(swapper_pgdir_lock);
static DEFINE_MUTEX(fixmap_lock);

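/*
 * Update a swapper_pg_dir entry. The write is done through the PGD fixmap
 * slot (rather than through the kernel's own mapping of swapper_pg_dir),
 * serialised by swapper_pgdir_lock.
 */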
void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd)
{
	pgd_t *fixmap_pgdp;

	spin_lock(&swapper_pgdir_lock);
	fixmap_pgdp = pgd_set_fixmap(__pa_symbol(pgdp));
	WRITE_ONCE(*fixmap_pgdp, pgd);
	/*
	 * We need dsb(ishst) here to ensure the page-table-walker sees
	 * our new entry before set_p?d() returns. The fixmap's
	 * flush_tlb_kernel_range() via clear_fixmap() does this for us.
	 */
	pgd_clear_fixmap();
	spin_unlock(&swapper_pgdir_lock);
}

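/*
 * Choose the protections for userspace mappings of physical memory (e.g.
 * /dev/mem): Device attributes for pfns outside the linear map, Normal
 * Non-cacheable for O_SYNC mappings of RAM, and the caller's protections
 * otherwise.
 */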
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
			      unsigned long size, pgprot_t vma_prot)
{
	if (!pfn_is_map_memory(pfn))
		return pgprot_noncached(vma_prot);
	else if (file->f_flags & O_SYNC)
		return pgprot_writecombine(vma_prot);
	return vma_prot;
}
EXPORT_SYMBOL(phys_mem_access_prot);

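/*
 * Allocate one zeroed page of page-table memory from memblock, for use
 * before the page allocator is up (@shift is unused; one page is always
 * allocated). The page is cleared through the FIX_PTE fixmap slot and its
 * physical address is returned.
 */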
static phys_addr_t __init early_pgtable_alloc(int shift)
{
	phys_addr_t phys;
	void *ptr;

	phys = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0,
					 MEMBLOCK_ALLOC_NOLEAKTRACE);
	if (!phys)
		panic("Failed to allocate page table page\n");

	/*
	 * The FIX_{PGD,PUD,PMD} slots may be in active use, but the FIX_PTE
	 * slot will be free, so we can (ab)use the FIX_PTE slot to initialise
	 * any level of table.
	 */
	ptr = pte_set_fixmap(phys);

	memset(ptr, 0, PAGE_SIZE);

	/*
	 * Implicit barriers also ensure the zeroed page is visible to the page
	 * table walker.
	 */
	pte_clear_fixmap();

	return phys;
}

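/*
 * Check whether an existing live mapping may be updated from @old to @new
 * without a break-before-make sequence, i.e. whether only attributes that
 * may safely change on live entries would differ.
 */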
static bool pgattr_change_is_safe(u64 old, u64 new)
{
	/*
	 * The following mapping attributes may be updated in live
	 * kernel mappings without the need for break-before-make.
	 */
	pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE | PTE_NG;

	/* creating or taking down mappings is always safe */
	if (old == 0 || new == 0)
		return true;

	/* live contiguous mappings may not be manipulated at all */
	if ((old | new) & PTE_CONT)
		return false;

	/* Transitioning from Non-Global to Global is unsafe */
	if (old & ~new & PTE_NG)
		return false;

	/*
	 * Changing the memory type between Normal and Normal-Tagged is safe
	 * since Tagged is considered a permission attribute from the
	 * mismatched attribute aliases perspective.
	 */
	if (((old & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL) ||
	     (old & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL_TAGGED)) &&
	    ((new & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL) ||
	     (new & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL_TAGGED)))
		mask |= PTE_ATTRINDX_MASK;

	return ((old ^ new) & ~mask) == 0;
}

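/*
 * Populate the PTE level for [addr, end), mapping it to @phys with @prot.
 * The PTE table is written through the FIX_PTE fixmap slot, and existing
 * entries may only have their permission attributes updated.
 */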
static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end,
		     phys_addr_t phys, pgprot_t prot)
{
	pte_t *ptep;

	ptep = pte_set_fixmap_offset(pmdp, addr);
	do {
		pte_t old_pte = READ_ONCE(*ptep);

		set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot));

		/*
		 * After the PTE entry has been populated once, we
		 * only allow updates to the permission attributes.
		 */
		BUG_ON(!pgattr_change_is_safe(pte_val(old_pte),
					      READ_ONCE(pte_val(*ptep))));

		phys += PAGE_SIZE;
	} while (ptep++, addr += PAGE_SIZE, addr != end);

	pte_clear_fixmap();
}

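/*
 * Populate a PMD entry's PTE table (allocating it first if necessary), then
 * map [addr, end) in CONT_PTE_SIZE chunks, applying the contiguous hint to
 * chunks that are suitably aligned unless NO_CONT_MAPPINGS is set.
 */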
static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
				unsigned long end, phys_addr_t phys,
				pgprot_t prot,
				phys_addr_t (*pgtable_alloc)(int),
				int flags)
{
	unsigned long next;
	pmd_t pmd = READ_ONCE(*pmdp);

	BUG_ON(pmd_sect(pmd));
	if (pmd_none(pmd)) {
		pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
		phys_addr_t pte_phys;

		if (flags & NO_EXEC_MAPPINGS)
			pmdval |= PMD_TABLE_PXN;
		BUG_ON(!pgtable_alloc);
		pte_phys = pgtable_alloc(PAGE_SHIFT);
		__pmd_populate(pmdp, pte_phys, pmdval);
		pmd = READ_ONCE(*pmdp);
	}
	BUG_ON(pmd_bad(pmd));

	do {
		pgprot_t __prot = prot;

		next = pte_cont_addr_end(addr, end);

		/* use a contiguous mapping if the range is suitably aligned */
		if ((((addr | next | phys) & ~CONT_PTE_MASK) == 0) &&
		    (flags & NO_CONT_MAPPINGS) == 0)
			__prot = __pgprot(pgprot_val(prot) | PTE_CONT);

		init_pte(pmdp, addr, next, phys, __prot);

		phys += next - addr;
	} while (addr = next, addr != end);
}

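/*
 * Populate the PMD level for [addr, end). Suitably aligned, PMD-sized ranges
 * are mapped with section entries unless NO_BLOCK_MAPPINGS is set; anything
 * else is handed down to alloc_init_cont_pte().
 */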
static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end,
		     phys_addr_t phys, pgprot_t prot,
		     phys_addr_t (*pgtable_alloc)(int), int flags)
{
	unsigned long next;
	pmd_t *pmdp;

	pmdp = pmd_set_fixmap_offset(pudp, addr);
	do {
		pmd_t old_pmd = READ_ONCE(*pmdp);

		next = pmd_addr_end(addr, end);

		/* try section mapping first */
		if (((addr | next | phys) & ~PMD_MASK) == 0 &&
		    (flags & NO_BLOCK_MAPPINGS) == 0) {
			pmd_set_huge(pmdp, phys, prot);

			/*
			 * After the PMD entry has been populated once, we
			 * only allow updates to the permission attributes.
			 */
			BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd),
						      READ_ONCE(pmd_val(*pmdp))));
		} else {
			alloc_init_cont_pte(pmdp, addr, next, phys, prot,
					    pgtable_alloc, flags);

			BUG_ON(pmd_val(old_pmd) != 0 &&
			       pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp)));
		}
		phys += next - addr;
	} while (pmdp++, addr = next, addr != end);

	pmd_clear_fixmap();
}

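/*
 * Populate a PUD entry's PMD table (allocating it first if necessary), then
 * map [addr, end) in CONT_PMD_SIZE chunks, applying the contiguous hint to
 * chunks that are suitably aligned unless NO_CONT_MAPPINGS is set.
 */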
static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
				unsigned long end, phys_addr_t phys,
				pgprot_t prot,
				phys_addr_t (*pgtable_alloc)(int), int flags)
{
	unsigned long next;
	pud_t pud = READ_ONCE(*pudp);

	/*
	 * Check for initial section mappings in the pgd/pud.
	 */
	BUG_ON(pud_sect(pud));
	if (pud_none(pud)) {
		pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN;
		phys_addr_t pmd_phys;

		if (flags & NO_EXEC_MAPPINGS)
			pudval |= PUD_TABLE_PXN;
		BUG_ON(!pgtable_alloc);
		pmd_phys = pgtable_alloc(PMD_SHIFT);
		__pud_populate(pudp, pmd_phys, pudval);
		pud = READ_ONCE(*pudp);
	}
	BUG_ON(pud_bad(pud));

	do {
		pgprot_t __prot = prot;

		next = pmd_cont_addr_end(addr, end);

		/* use a contiguous mapping if the range is suitably aligned */
		if ((((addr | next | phys) & ~CONT_PMD_MASK) == 0) &&
		    (flags & NO_CONT_MAPPINGS) == 0)
			__prot = __pgprot(pgprot_val(prot) | PTE_CONT);

		init_pmd(pudp, addr, next, phys, __prot, pgtable_alloc, flags);

		phys += next - addr;
	} while (addr = next, addr != end);
}

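/*
 * Populate the PUD level for [addr, end): allocate the PUD table under the
 * P4D entry if needed, use 1GB block mappings where the granule supports
 * them and the range is suitably aligned, and recurse into
 * alloc_init_cont_pmd() otherwise.
 */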
static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
			   phys_addr_t phys, pgprot_t prot,
			   phys_addr_t (*pgtable_alloc)(int),
			   int flags)
{
	unsigned long next;
	pud_t *pudp;
	p4d_t *p4dp = p4d_offset(pgdp, addr);
	p4d_t p4d = READ_ONCE(*p4dp);

	if (p4d_none(p4d)) {
		p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN;
		phys_addr_t pud_phys;

		if (flags & NO_EXEC_MAPPINGS)
			p4dval |= P4D_TABLE_PXN;
		BUG_ON(!pgtable_alloc);
		pud_phys = pgtable_alloc(PUD_SHIFT);
		__p4d_populate(p4dp, pud_phys, p4dval);
		p4d = READ_ONCE(*p4dp);
	}
	BUG_ON(p4d_bad(p4d));

	pudp = pud_set_fixmap_offset(p4dp, addr);
	do {
		pud_t old_pud = READ_ONCE(*pudp);

		next = pud_addr_end(addr, end);

		/*
		 * For 4K granule only, attempt to put down a 1GB block
		 */
		if (pud_sect_supported() &&
		   ((addr | next | phys) & ~PUD_MASK) == 0 &&
		    (flags & NO_BLOCK_MAPPINGS) == 0) {
			pud_set_huge(pudp, phys, prot);

			/*
			 * After the PUD entry has been populated once, we
			 * only allow updates to the permission attributes.
			 */
			BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
						      READ_ONCE(pud_val(*pudp))));
		} else {
			alloc_init_cont_pmd(pudp, addr, next, phys, prot,
					    pgtable_alloc, flags);

			BUG_ON(pud_val(old_pud) != 0 &&
			       pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
		}
		phys += next - addr;
	} while (pudp++, addr = next, addr != end);

	pud_clear_fixmap();
}

static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
|
|
|
|
unsigned long virt, phys_addr_t size,
|
|
|
|
pgprot_t prot,
|
|
|
|
phys_addr_t (*pgtable_alloc)(int),
|
|
|
|
int flags)
|
2012-03-05 19:49:27 +08:00
|
|
|
{
|
2019-11-03 20:35:58 +08:00
|
|
|
unsigned long addr, end, next;
|
2020-06-09 12:33:10 +08:00
|
|
|
pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);
|
2012-03-05 19:49:27 +08:00
|
|
|
|
2015-11-23 21:26:19 +08:00
|
|
|
/*
|
|
|
|
* If the virtual and physical addresses don't have the same offset
|
|
|
|
* within a page, we cannot map the region as the caller expects.
|
|
|
|
*/
|
|
|
|
if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
|
|
|
|
return;
|
|
|
|
|
2015-11-23 21:26:20 +08:00
|
|
|
phys &= PAGE_MASK;
|
2012-03-05 19:49:27 +08:00
|
|
|
addr = virt & PAGE_MASK;
|
2019-11-03 20:35:58 +08:00
|
|
|
end = PAGE_ALIGN(virt + size);
|
2012-03-05 19:49:27 +08:00
|
|
|
|
|
|
|
do {
|
|
|
|
next = pgd_addr_end(addr, end);
|
2018-02-15 19:14:56 +08:00
|
|
|
alloc_init_pud(pgdp, addr, next, phys, prot, pgtable_alloc,
|
2017-03-10 04:52:07 +08:00
|
|
|
flags);
|
2012-03-05 19:49:27 +08:00
|
|
|
phys += next - addr;
|
2018-02-15 19:14:56 +08:00
|
|
|
} while (pgdp++, addr = next, addr != end);
|
2012-03-05 19:49:27 +08:00
|
|
|
}
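The walk above uses the standard page-table iteration idiom; a brief illustrative note (not a comment from mmu.c):

/*
 * pgd_addr_end(addr, end) returns the smaller of 'end' and the next PGD
 * boundary, so each iteration hands alloc_init_pud() exactly the sub-range
 * covered by one top-level entry before pgdp advances to the next one.
 */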
|
|
|
|
|
arm64: mm: don't acquire mutex when rewriting swapper
Since commit:
47546a1912fc4a03 ("arm64: mm: install KPTI nG mappings with MMU enabled")
... when building with CONFIG_DEBUG_ATOMIC_SLEEP=y and booting under
QEMU TCG with '-cpu max', there's a boot-time splat:
| BUG: sleeping function called from invalid context at kernel/locking/mutex.c:580
| in_atomic(): 1, irqs_disabled(): 128, non_block: 0, pid: 15, name: migration/0
| preempt_count: 1, expected: 0
| RCU nest depth: 0, expected: 0
| no locks held by migration/0/15.
| irq event stamp: 28
| hardirqs last enabled at (27): [<ffff8000091ed180>] _raw_spin_unlock_irq+0x3c/0x7c
| hardirqs last disabled at (28): [<ffff8000081b8d74>] multi_cpu_stop+0x150/0x18c
| softirqs last enabled at (0): [<ffff80000809a314>] copy_process+0x594/0x1964
| softirqs last disabled at (0): [<0000000000000000>] 0x0
| CPU: 0 PID: 15 Comm: migration/0 Not tainted 6.0.0-rc3-00002-g419b42ff7eef #3
| Hardware name: linux,dummy-virt (DT)
| Stopper: multi_cpu_stop+0x0/0x18c <- stop_cpus.constprop.0+0xa0/0xfc
| Call trace:
| dump_backtrace.part.0+0xd0/0xe0
| show_stack+0x1c/0x5c
| dump_stack_lvl+0x88/0xb4
| dump_stack+0x1c/0x38
| __might_resched+0x180/0x230
| __might_sleep+0x4c/0xa0
| __mutex_lock+0x5c/0x450
| mutex_lock_nested+0x30/0x40
| create_kpti_ng_temp_pgd+0x4fc/0x6d0
| kpti_install_ng_mappings+0x2b8/0x3b0
| cpu_enable_non_boot_scope_capabilities+0x7c/0xd0
| multi_cpu_stop+0xa0/0x18c
| cpu_stopper_thread+0x88/0x11c
| smpboot_thread_fn+0x1ec/0x290
| kthread+0x118/0x120
| ret_from_fork+0x10/0x20
Since commit:
ee017ee353506fce ("arm64/mm: avoid fixmap race condition when create pud mapping")
... once the kernel leaves the SYSTEM_BOOTING state, the fixmap pagetable
entries are protected by the fixmap_lock mutex.
The new KPTI rewrite code uses __create_pgd_mapping() to create a
temporary pagetable. This happens in atomic context, after secondary
CPUs are brought up and the kernel has left the SYSTEM_BOOTING state.
Hence we try to acquire a mutex in atomic context, which is generally
unsound (though benign in this case as the mutex should be free and all
other CPUs are quiescent).
This patch avoids the issue by pulling the mutex out of alloc_init_pud()
and taking it at a higher level in the pagetable manipulation code.
This allows it to be used without locking where one CPU is known to be
in exclusive control of the machine, even after having left the
SYSTEM_BOOTING state.
Fixes: 47546a1912fc ("arm64: mm: install KPTI nG mappings with MMU enabled")
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20220920134731.1625740-1-mark.rutland@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
2022-09-20 21:47:31 +08:00
|
|
|
static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
|
|
|
|
unsigned long virt, phys_addr_t size,
|
|
|
|
pgprot_t prot,
|
|
|
|
phys_addr_t (*pgtable_alloc)(int),
|
|
|
|
int flags)
|
|
|
|
{
|
|
|
|
mutex_lock(&fixmap_lock);
|
|
|
|
__create_pgd_mapping_locked(pgdir, phys, virt, size, prot,
|
|
|
|
pgtable_alloc, flags);
|
|
|
|
mutex_unlock(&fixmap_lock);
|
|
|
|
}
|
|
|
|
|
arm64: mm: install KPTI nG mappings with MMU enabled
In cases where we unmap the kernel while running in user space, we rely
on ASIDs to distinguish the minimal trampoline from the full kernel
mapping, and this means we must use non-global attributes for those
mappings, to ensure they are scoped by ASID and will not hit in the TLB
inadvertently.
We only do this when needed, as this is generally more costly in terms
of TLB pressure, and so we boot without these non-global attributes, and
apply them to all existing kernel mappings once all CPUs are up and we
know whether or not the non-global attributes are needed. At this point,
we cannot simply unmap and remap the entire address space, so we have to
update all existing block and page descriptors in place.
Currently, we go through a lot of trouble to perform these updates with
the MMU and caches off, to avoid violating break before make (BBM) rules
imposed by the architecture. Since we make changes to page tables that
are not covered by the ID map, we gain access to those descriptors by
disabling translations altogether. This means that the stores to memory
are issued with device attributes, and require extra care in terms of
coherency, which is costly. We also rely on the ID map to access a
shared flag, which requires the ID map to be executable and writable at
the same time, which is another thing we'd prefer to avoid.
So let's switch to an approach where we replace the kernel mapping with
a minimal mapping of a few pages that can be used for a minimal, ad-hoc
fixmap that we can use to map each page table in turn as we traverse the
hierarchy.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20220609174320.4035379-3-ardb@kernel.org
Signed-off-by: Will Deacon <will@kernel.org>
2022-06-10 01:43:20 +08:00
|
|
|
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
|
arm64: mm: don't acquire mutex when rewriting swapper
Since commit:
47546a1912fc4a03 ("arm64: mm: install KPTI nG mappings with MMU enabled")
... when building with CONFIG_DEBUG_ATOMIC_SLEEP=y and booting under
QEMU TCG with '-cpu max', there's a boot-time splat:
| BUG: sleeping function called from invalid context at kernel/locking/mutex.c:580
| in_atomic(): 1, irqs_disabled(): 128, non_block: 0, pid: 15, name: migration/0
| preempt_count: 1, expected: 0
| RCU nest depth: 0, expected: 0
| no locks held by migration/0/15.
| irq event stamp: 28
| hardirqs last enabled at (27): [<ffff8000091ed180>] _raw_spin_unlock_irq+0x3c/0x7c
| hardirqs last disabled at (28): [<ffff8000081b8d74>] multi_cpu_stop+0x150/0x18c
| softirqs last enabled at (0): [<ffff80000809a314>] copy_process+0x594/0x1964
| softirqs last disabled at (0): [<0000000000000000>] 0x0
| CPU: 0 PID: 15 Comm: migration/0 Not tainted 6.0.0-rc3-00002-g419b42ff7eef #3
| Hardware name: linux,dummy-virt (DT)
| Stopper: multi_cpu_stop+0x0/0x18c <- stop_cpus.constprop.0+0xa0/0xfc
| Call trace:
| dump_backtrace.part.0+0xd0/0xe0
| show_stack+0x1c/0x5c
| dump_stack_lvl+0x88/0xb4
| dump_stack+0x1c/0x38
| __might_resched+0x180/0x230
| __might_sleep+0x4c/0xa0
| __mutex_lock+0x5c/0x450
| mutex_lock_nested+0x30/0x40
| create_kpti_ng_temp_pgd+0x4fc/0x6d0
| kpti_install_ng_mappings+0x2b8/0x3b0
| cpu_enable_non_boot_scope_capabilities+0x7c/0xd0
| multi_cpu_stop+0xa0/0x18c
| cpu_stopper_thread+0x88/0x11c
| smpboot_thread_fn+0x1ec/0x290
| kthread+0x118/0x120
| ret_from_fork+0x10/0x20
Since commit:
ee017ee353506fce ("arm64/mm: avoid fixmap race condition when create pud mapping")
... once the kernel leaves the SYSTEM_BOOTING state, the fixmap pagetable
entries are protected by the fixmap_lock mutex.
The new KPTI rewrite code uses __create_pgd_mapping() to create a
temporary pagetable. This happens in atomic context, after secondary
CPUs are brought up and the kernel has left the SYSTEM_BOOTING state.
Hence we try to acquire a mutex in atomic context, which is generally
unsound (though benign in this case as the mutex should be free and all
other CPUs are quiescent).
This patch avoids the issue by pulling the mutex out of alloc_init_pud()
and taking it at a higher level in the pagetable manipulation code.
This allows it to be used without locking where one CPU is known to be
in exclusive control of the machine, even after having left the
SYSTEM_BOOTING state.
Fixes: 47546a1912fc ("arm64: mm: install KPTI nG mappings with MMU enabled")
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20220920134731.1625740-1-mark.rutland@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
2022-09-20 21:47:31 +08:00
|
|
|
extern __alias(__create_pgd_mapping_locked)
|
arm64: mm: install KPTI nG mappings with MMU enabled
In cases where we unmap the kernel while running in user space, we rely
on ASIDs to distinguish the minimal trampoline from the full kernel
mapping, and this means we must use non-global attributes for those
mappings, to ensure they are scoped by ASID and will not hit in the TLB
inadvertently.
We only do this when needed, as this is generally more costly in terms
of TLB pressure, and so we boot without these non-global attributes, and
apply them to all existing kernel mappings once all CPUs are up and we
know whether or not the non-global attributes are needed. At this point,
we cannot simply unmap and remap the entire address space, so we have to
update all existing block and page descriptors in place.
Currently, we go through a lot of trouble to perform these updates with
the MMU and caches off, to avoid violating break before make (BBM) rules
imposed by the architecture. Since we make changes to page tables that
are not covered by the ID map, we gain access to those descriptors by
disabling translations altogether. This means that the stores to memory
are issued with device attributes, and require extra care in terms of
coherency, which is costly. We also rely on the ID map to access a
shared flag, which requires the ID map to be executable and writable at
the same time, which is another thing we'd prefer to avoid.
So let's switch to an approach where we replace the kernel mapping with
a minimal mapping of a few pages that can be used for a minimal, ad-hoc
fixmap that we can use to map each page table in turn as we traverse the
hierarchy.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20220609174320.4035379-3-ardb@kernel.org
Signed-off-by: Will Deacon <will@kernel.org>
2022-06-10 01:43:20 +08:00
|
|
|
void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
|
|
|
|
phys_addr_t size, pgprot_t prot,
|
|
|
|
phys_addr_t (*pgtable_alloc)(int), int flags);
|
|
|
|
#endif
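The __alias above gives the KPTI rewrite a lock-free entry point into the same implementation. A minimal usage sketch, assuming a caller running under stop_machine() and a hypothetical allocator kpti_ng_pgtable_alloc(); this is an illustration, not code from cpufeature.c:

#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
/* Hypothetical allocator returning a zeroed page for the temporary tables. */
static phys_addr_t kpti_ng_pgtable_alloc(int shift);

static int __init kpti_temp_pgd_example(pgd_t *temp_pgd)
{
	/*
	 * All other CPUs are parked in multi_cpu_stop(), so nothing can race
	 * with the fixmap here; the _locked variant is safe to call without
	 * fixmap_lock, which must not be taken in this atomic context.
	 */
	create_kpti_ng_temp_pgd(temp_pgd, __pa_symbol(_stext),
				(unsigned long)_stext,
				(unsigned long)_etext - (unsigned long)_stext,
				PAGE_KERNEL_ROX, kpti_ng_pgtable_alloc, 0);
	return 0;
}
#endif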
|
|
|
|
|
2019-04-08 18:23:48 +08:00
|
|
|
static phys_addr_t __pgd_pgtable_alloc(int shift)
|
2019-03-12 08:57:47 +08:00
|
|
|
{
|
2019-07-12 11:58:02 +08:00
|
|
|
void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL);
|
2019-03-12 08:57:47 +08:00
|
|
|
BUG_ON(!ptr);
|
|
|
|
|
|
|
|
/* Ensure the zeroed page is visible to the page table walker */
|
|
|
|
dsb(ishst);
|
|
|
|
return __pa(ptr);
|
|
|
|
}
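One detail that is easy to miss, spelled out as an illustrative comment (not from mmu.c):

/*
 * GFP_PGTABLE_KERNEL carries __GFP_ZERO, so the page returned by
 * __get_free_page() above is already zeroed; the dsb(ishst) only has to
 * make those zeroing stores visible to the MMU table walker before the
 * page is linked into a live table.
 */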
|
|
|
|
|
2019-03-12 08:57:46 +08:00
|
|
|
static phys_addr_t pgd_pgtable_alloc(int shift)
|
2015-01-22 09:36:06 +08:00
|
|
|
{
|
2019-04-08 18:23:48 +08:00
|
|
|
phys_addr_t pa = __pgd_pgtable_alloc(shift);
|
2019-03-12 08:57:46 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Call proper page table ctor in case later we need to
|
|
|
|
* call core mm functions like apply_to_page_range() on
|
|
|
|
* this pre-allocated page table.
|
|
|
|
*
|
|
|
|
* We don't select ARCH_ENABLE_SPLIT_PMD_PTLOCK if pmd is
|
|
|
|
* folded, and if so pgtable_pmd_page_ctor() becomes a no-op.
|
|
|
|
*/
|
|
|
|
if (shift == PAGE_SHIFT)
|
2019-09-26 07:49:46 +08:00
|
|
|
BUG_ON(!pgtable_pte_page_ctor(phys_to_page(pa)));
|
2019-03-12 08:57:46 +08:00
|
|
|
else if (shift == PMD_SHIFT)
|
2019-04-08 18:23:48 +08:00
|
|
|
BUG_ON(!pgtable_pmd_page_ctor(phys_to_page(pa)));
|
2016-01-25 19:44:56 +08:00
|
|
|
|
2019-04-08 18:23:48 +08:00
|
|
|
return pa;
|
2015-01-22 09:36:06 +08:00
|
|
|
}
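To make the ctor comment above concrete, here is a minimal sketch of the kind of later use it enables; the visitor and wrapper names are hypothetical, only apply_to_page_range() is the real core-mm helper the comment refers to:

static int example_pte_visitor(pte_t *ptep, unsigned long addr, void *data)
{
	return 0;	/* no-op visitor; a real user would inspect *ptep */
}

static int example_walk(struct mm_struct *mm, unsigned long addr,
			unsigned long size)
{
	/*
	 * apply_to_page_range() takes the split page-table locks set up by
	 * pgtable_pte_page_ctor()/pgtable_pmd_page_ctor(), which is why
	 * pgd_pgtable_alloc() must call the ctors on each new table page.
	 */
	return apply_to_page_range(mm, addr, size, example_pte_visitor, NULL);
}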
|
|
|
|
|
2016-02-06 08:24:46 +08:00
|
|
|
/*
|
|
|
|
* This function can only be used to modify existing table entries,
|
|
|
|
* without allocating new levels of table. Note that this permits the
|
|
|
|
* creation of new section or page entries.
|
|
|
|
*/
|
|
|
|
static void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt,
|
2015-01-22 09:36:06 +08:00
|
|
|
phys_addr_t size, pgprot_t prot)
|
2014-03-13 00:28:06 +08:00
|
|
|
{
|
2019-08-14 21:28:48 +08:00
|
|
|
if ((virt >= PAGE_END) && (virt < VMALLOC_START)) {
|
2014-03-13 00:28:06 +08:00
|
|
|
pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n",
|
|
|
|
&phys, virt);
|
|
|
|
return;
|
|
|
|
}
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
__create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL,
|
|
|
|
NO_CONT_MAPPINGS);
|
2014-03-13 00:28:06 +08:00
|
|
|
}
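The contiguous range sizes tabulated in the commit message above follow directly from the CONT_* constants in <asm/pgtable-hwdef.h>; a compile-time sketch of that relationship (illustrative only, not part of mmu.c):

static void __init __maybe_unused cont_size_example(void)
{
	/* e.g. with a 4 KB granule: 16 PTEs * 4 KB = 64 KB */
	BUILD_BUG_ON(CONT_PTE_SIZE != CONT_PTES * PAGE_SIZE);
	/* e.g. with a 4 KB granule: 16 PMDs * 2 MB = 32 MB */
	BUILD_BUG_ON(CONT_PMD_SIZE != CONT_PMDS * PMD_SIZE);
}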
|
|
|
|
|
2014-10-20 21:42:07 +08:00
|
|
|
void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
|
|
|
|
unsigned long virt, phys_addr_t size,
|
2016-10-21 19:22:57 +08:00
|
|
|
pgprot_t prot, bool page_mappings_only)
|
2014-10-20 21:42:07 +08:00
|
|
|
{
|
2017-03-10 04:52:07 +08:00
|
|
|
int flags = 0;
|
|
|
|
|
2016-07-23 01:32:25 +08:00
|
|
|
BUG_ON(mm == &init_mm);
|
|
|
|
|
2017-03-10 04:52:07 +08:00
|
|
|
if (page_mappings_only)
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
|
2017-03-10 04:52:07 +08:00
|
|
|
|
2016-01-25 19:45:10 +08:00
|
|
|
__create_pgd_mapping(mm->pgd, phys, virt, size, prot,
|
2017-03-10 04:52:07 +08:00
|
|
|
pgd_pgtable_alloc, flags);
|
2014-03-13 00:28:06 +08:00
|
|
|
}
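create_pgd_mapping() is the entry point for populating non-init_mm page tables (the EFI runtime services mm is an in-tree user); a hedged usage sketch along those lines, with made-up function and parameter names:

static void example_map_firmware_region(struct mm_struct *mm,
					phys_addr_t pa, unsigned long va,
					phys_addr_t size, pgprot_t prot)
{
	/*
	 * page_mappings_only=true forces NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS,
	 * so per-page permissions can later be applied without having to
	 * split live block mappings.
	 */
	create_pgd_mapping(mm, pa, va, size, prot, true);
}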
|
|
|
|
|
2017-03-10 04:52:00 +08:00
|
|
|
static void update_mapping_prot(phys_addr_t phys, unsigned long virt,
|
|
|
|
phys_addr_t size, pgprot_t prot)
|
2015-01-22 09:36:06 +08:00
|
|
|
{
|
2019-08-14 21:28:48 +08:00
|
|
|
if ((virt >= PAGE_END) && (virt < VMALLOC_START)) {
|
2017-03-10 04:52:00 +08:00
|
|
|
pr_warn("BUG: not updating mapping for %pa at 0x%016lx - outside kernel range\n",
|
2015-01-22 09:36:06 +08:00
|
|
|
&phys, virt);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
__create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL,
|
|
|
|
NO_CONT_MAPPINGS);
|
2017-03-10 04:52:00 +08:00
|
|
|
|
|
|
|
/* flush the TLBs after updating live kernel mappings */
|
|
|
|
flush_tlb_kernel_range(virt, virt + size);
|
2015-01-22 09:36:06 +08:00
|
|
|
}
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start,
|
2017-04-03 10:24:34 +08:00
|
|
|
phys_addr_t end, pgprot_t prot, int flags)
|
|
|
|
{
|
2018-02-15 19:14:56 +08:00
|
|
|
__create_pgd_mapping(pgdp, start, __phys_to_virt(start), end - start,
|
2017-04-03 10:24:34 +08:00
|
|
|
prot, early_pgtable_alloc, flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
void __init mark_linear_text_alias_ro(void)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Remove the write permissions from the linear alias of .text/.rodata
|
|
|
|
*/
|
arm64: omit [_text, _stext) from permanent kernel mapping
In a previous patch, we increased the size of the EFI PE/COFF header
to 64 KB, which resulted in the _stext symbol appearing at a fixed
offset of 64 KB into the image.
Since 64 KB is also the largest page size we support, this completely
removes the need to map the first 64 KB of the kernel image, given that
it only contains the arm64 Image header and the EFI header, neither of
which we ever access again after booting the kernel. More importantly,
we should avoid an executable mapping of non-executable and not entirely
predictable data, to deal with the unlikely event that we inadvertently
emitted something that looks like an opcode that could be used as a
gadget for speculative execution.
So let's limit the kernel mapping of .text to the [_stext, _etext)
region, which matches the view of generic code (such as kallsyms) when
it reasons about the boundaries of the kernel's .text section.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20201117124729.12642-2-ardb@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2020-11-17 20:47:27 +08:00
|
|
|
update_mapping_prot(__pa_symbol(_stext), (unsigned long)lm_alias(_stext),
|
|
|
|
(unsigned long)__init_begin - (unsigned long)_stext,
|
2017-04-03 10:24:34 +08:00
|
|
|
PAGE_KERNEL_RO);
|
|
|
|
}
|
|
|
|
|
2020-11-20 01:55:56 +08:00
|
|
|
static bool crash_mem_map __initdata;
|
|
|
|
|
|
|
|
static int __init enable_crash_mem_map(char *arg)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Proper parameter parsing is done by reserve_crashkernel(). We only
|
|
|
|
* need to know if the linear map has to avoid block mappings so that
|
|
|
|
* the crashkernel reservations can be unmapped later.
|
|
|
|
*/
|
|
|
|
crash_mem_map = true;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
early_param("crashkernel", enable_crash_mem_map);
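For illustration, this is how the parameter reaches the hook above; the size value is just an example:

/*
 * Booting with "crashkernel=512M" on the kernel command line makes
 * enable_crash_mem_map() set crash_mem_map, and map_mem() below then
 * either forces page-granular mappings for the linear map or temporarily
 * marks the reservation NOMAP, so the crashkernel region can later be
 * unmapped or shrunk.
 */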
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
static void __init map_mem(pgd_t *pgdp)
|
2015-01-22 09:36:06 +08:00
|
|
|
{
|
2021-03-10 18:49:41 +08:00
|
|
|
static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
|
arm64: omit [_text, _stext) from permanent kernel mapping
In a previous patch, we increased the size of the EFI PE/COFF header
to 64 KB, which resulted in the _stext symbol appearing at a fixed
offset of 64 KB into the image.
Since 64 KB is also the largest page size we support, this completely
removes the need to map the first 64 KB of the kernel image, given that
it only contains the arm64 Image header and the EFI header, neither of
which we ever access again after booting the kernel. More importantly,
we should avoid an executable mapping of non-executable and not entirely
predictable data, to deal with the unlikely event that we inadvertently
emitted something that looks like an opcode that could be used as a
gadget for speculative execution.
So let's limit the kernel mapping of .text to the [_stext, _etext)
region, which matches the view of generic code (such as kallsyms) when
it reasons about the boundaries of the kernel's .text section.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20201117124729.12642-2-ardb@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2020-11-17 20:47:27 +08:00
|
|
|
phys_addr_t kernel_start = __pa_symbol(_stext);
|
2017-01-13 13:59:35 +08:00
|
|
|
phys_addr_t kernel_end = __pa_symbol(__init_begin);
|
2020-10-14 07:58:08 +08:00
|
|
|
phys_addr_t start, end;
|
2021-03-10 18:49:41 +08:00
|
|
|
int flags = NO_EXEC_MAPPINGS;
|
2020-10-14 07:58:08 +08:00
|
|
|
u64 i;
|
2017-03-10 04:52:07 +08:00
|
|
|
|
2021-03-10 18:49:41 +08:00
|
|
|
/*
|
|
|
|
* Setting hierarchical PXNTable attributes on table entries covering
|
|
|
|
* the linear region is only possible if it is guaranteed that no table
|
|
|
|
* entries at any level are being shared between the linear region and
|
|
|
|
* the vmalloc region. Check whether this is true for the PGD level, in
|
|
|
|
* which case it is guaranteed to be true for all other levels as well.
|
|
|
|
*/
|
|
|
|
BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end));
|
|
|
|
|
2022-09-21 15:48:41 +08:00
|
|
|
if (can_set_direct_map())
|
2021-03-10 18:49:41 +08:00
|
|
|
flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
|
arm64: mm: create new fine-grained mappings at boot
At boot we may change the granularity of the tables mapping the kernel
(by splitting or making sections). This may happen when we create the
linear mapping (in __map_memblock), or at any point we try to apply
fine-grained permissions to the kernel (e.g. fixup_executable,
mark_rodata_ro, fixup_init).
Changing the active page tables in this manner may result in multiple
entries for the same address being allocated into TLBs, risking problems
such as TLB conflict aborts or issues derived from the amalgamation of
TLB entries. Generally, a break-before-make (BBM) approach is necessary
to avoid conflicts, but we cannot do this for the kernel tables as it
risks unmapping text or data being used to do so.
Instead, we can create a new set of tables from scratch in the safety of
the existing mappings, and subsequently migrate over to these using the
new cpu_replace_ttbr1 helper, which avoids the two sets of tables being
active simultaneously.
To avoid issues when we later modify permissions of the page tables
(e.g. in fixup_init), we must create the page tables at a granularity
such that later modification does not result in splitting of tables.
This patch applies this strategy, creating a new set of fine-grained
page tables from scratch, and safely migrating to them. The existing
fixmap and kasan shadow page tables are reused in the new fine-grained
tables.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-25 19:45:12 +08:00
|
|
|
|
2015-01-22 09:36:06 +08:00
|
|
|
/*
|
2016-02-16 20:52:40 +08:00
|
|
|
* Take care not to create a writable alias for the
|
|
|
|
* read-only text and rodata sections of the kernel image.
|
2017-04-03 10:24:34 +08:00
|
|
|
* So temporarily mark them as NOMAP to skip mappings in
|
|
|
|
* the following for-loop
|
2015-01-22 09:36:06 +08:00
|
|
|
*/
|
2017-04-03 10:24:34 +08:00
|
|
|
memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
|
arm64: mm: create new fine-grained mappings at boot
At boot we may change the granularity of the tables mapping the kernel
(by splitting or making sections). This may happen when we create the
linear mapping (in __map_memblock), or at any point we try to apply
fine-grained permissions to the kernel (e.g. fixup_executable,
mark_rodata_ro, fixup_init).
Changing the active page tables in this manner may result in multiple
entries for the same address being allocated into TLBs, risking problems
such as TLB conflict aborts or issues derived from the amalgamation of
TLB entries. Generally, a break-before-make (BBM) approach is necessary
to avoid conflicts, but we cannot do this for the kernel tables as it
risks unmapping text or data being used to do so.
Instead, we can create a new set of tables from scratch in the safety of
the existing mappings, and subsequently migrate over to these using the
new cpu_replace_ttbr1 helper, which avoids the two sets of tables being
active simultaneously.
To avoid issues when we later modify permissions of the page tables
(e.g. in fixup_init), we must create the page tables at a granularity
such that later modification does not result in splitting of tables.
This patch applies this strategy, creating a new set of fine-grained
page tables from scratch, and safely migrating to them. The existing
fixmap and kasan shadow page tables are reused in the new fine-grained
tables.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-25 19:45:12 +08:00
|
|
|
|
2022-03-03 01:38:09 +08:00
|
|
|
#ifdef CONFIG_KEXEC_CORE
|
|
|
|
if (crash_mem_map) {
|
2022-07-05 14:25:56 +08:00
|
|
|
if (defer_reserve_crashkernel())
|
2022-03-03 01:38:09 +08:00
|
|
|
flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
|
|
|
|
else if (crashk_res.end)
|
|
|
|
memblock_mark_nomap(crashk_res.start,
|
|
|
|
resource_size(&crashk_res));
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2017-04-03 10:24:34 +08:00
|
|
|
/* map all the memory banks */
|
2020-10-14 07:58:08 +08:00
|
|
|
for_each_mem_range(i, &start, &end) {
|
2017-04-03 10:24:34 +08:00
|
|
|
if (start >= end)
|
|
|
|
break;
|
2019-11-27 17:51:13 +08:00
|
|
|
/*
|
|
|
|
* The linear map must allow allocation tags reading/writing
|
|
|
|
* if MTE is present. Otherwise, it has the same attributes as
|
|
|
|
* PAGE_KERNEL.
|
|
|
|
*/
|
2021-03-09 20:26:01 +08:00
|
|
|
__map_memblock(pgdp, start, end, pgprot_tagged(PAGE_KERNEL),
|
|
|
|
flags);
|
2017-04-03 10:24:34 +08:00
|
|
|
}
|
2016-02-16 20:52:40 +08:00
|
|
|
|
|
|
|
/*
|
arm64: omit [_text, _stext) from permanent kernel mapping
In a previous patch, we increased the size of the EFI PE/COFF header
to 64 KB, which resulted in the _stext symbol appearing at a fixed
offset of 64 KB into the image.
Since 64 KB is also the largest page size we support, this completely
removes the need to map the first 64 KB of the kernel image, given that
it only contains the arm64 Image header and the EFI header, neither of
which we ever access again after booting the kernel. More importantly,
we should avoid an executable mapping of non-executable and not entirely
predictable data, to deal with the unlikely event that we inadvertently
emitted something that looks like an opcode that could be used as a
gadget for speculative execution.
So let's limit the kernel mapping of .text to the [_stext, _etext)
region, which matches the view of generic code (such as kallsyms) when
it reasons about the boundaries of the kernel's .text section.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20201117124729.12642-2-ardb@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2020-11-17 20:47:27 +08:00
|
|
|
* Map the linear alias of the [_stext, __init_begin) interval
|
2017-03-10 04:52:01 +08:00
|
|
|
* as non-executable now, and remove the write permission in
|
|
|
|
* mark_linear_text_alias_ro() below (which will be called after
|
|
|
|
* alternative patching has completed). This makes the contents
|
|
|
|
* of the region accessible to subsystems such as hibernate,
|
|
|
|
* but protects it from inadvertent modification or execution.
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
* Note that contiguous mappings cannot be remapped in this way,
|
|
|
|
* so we should avoid them here.
|
2016-02-16 20:52:40 +08:00
|
|
|
*/
|
2018-02-15 19:14:56 +08:00
|
|
|
__map_memblock(pgdp, kernel_start, kernel_end,
|
2017-04-03 10:24:34 +08:00
|
|
|
PAGE_KERNEL, NO_CONT_MAPPINGS);
|
|
|
|
memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
|
2022-03-03 01:38:09 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Use page-level mappings here so that we can shrink the region
|
|
|
|
* at page granularity and return unused memory to the buddy allocator
|
|
|
|
* through the /sys/kernel/kexec_crash_size interface.
|
|
|
|
*/
|
|
|
|
#ifdef CONFIG_KEXEC_CORE
|
2022-07-05 14:25:56 +08:00
|
|
|
if (crash_mem_map && !defer_reserve_crashkernel()) {
|
2022-03-03 01:38:09 +08:00
|
|
|
if (crashk_res.end) {
|
|
|
|
__map_memblock(pgdp, crashk_res.start,
|
|
|
|
crashk_res.end + 1,
|
|
|
|
PAGE_KERNEL,
|
|
|
|
NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
|
|
|
|
memblock_clear_nomap(crashk_res.start,
|
|
|
|
resource_size(&crashk_res));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
2012-03-05 19:49:27 +08:00
|
|
|
}
|
|
|
|
|
2015-01-22 09:36:06 +08:00
|
|
|
void mark_rodata_ro(void)
|
|
|
|
{
|
2016-02-20 01:50:32 +08:00
|
|
|
unsigned long section_size;
|
2016-02-16 20:52:40 +08:00
|
|
|
|
2016-02-20 01:50:32 +08:00
|
|
|
/*
|
arm64: mm: fix location of _etext
As Kees Cook notes in the ARM counterpart of this patch [0]:
The _etext position is defined to be the end of the kernel text code,
and should not include any part of the data segments. This interferes
with things that might check memory ranges and expect executable code
up to _etext.
In particular, Kees is referring to the HARDENED_USERCOPY patch set [1],
which rejects attempts to call copy_to_user() on kernel ranges containing
executable code, but does allow access to the .rodata segment. Regardless
of whether one may or may not agree with the distinction, it makes sense
for _etext to have the same meaning across architectures.
So let's put _etext where it belongs, between .text and .rodata, and fix
up existing references to use __init_begin instead, which unlike _end_rodata
includes the exception and notes sections as well.
The _etext references in kaslr.c are left untouched, since its references
to [_stext, _etext) are meant to capture potential jump instruction targets,
and so disregarding .rodata is actually an improvement here.
[0] http://article.gmane.org/gmane.linux.kernel/2245084
[1] http://thread.gmane.org/gmane.linux.kernel.hardened.devel/2502
Reported-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-06-23 21:53:17 +08:00
|
|
|
* mark .rodata as read only. Use __init_begin rather than __end_rodata
|
|
|
|
* to cover NOTES and EXCEPTION_TABLE.
|
2016-02-20 01:50:32 +08:00
|
|
|
*/
|
arm64: mm: fix location of _etext
As Kees Cook notes in the ARM counterpart of this patch [0]:
The _etext position is defined to be the end of the kernel text code,
and should not include any part of the data segments. This interferes
with things that might check memory ranges and expect executable code
up to _etext.
In particular, Kees is referring to the HARDENED_USERCOPY patch set [1],
which rejects attempts to call copy_to_user() on kernel ranges containing
executable code, but does allow access to the .rodata segment. Regardless
of whether one may or may not agree with the distinction, it makes sense
for _etext to have the same meaning across architectures.
So let's put _etext where it belongs, between .text and .rodata, and fix
up existing references to use __init_begin instead, which unlike _end_rodata
includes the exception and notes sections as well.
The _etext references in kaslr.c are left untouched, since its references
to [_stext, _etext) are meant to capture potential jump instruction targets,
and so disregarding .rodata is actually an improvement here.
[0] http://article.gmane.org/gmane.linux.kernel/2245084
[1] http://thread.gmane.org/gmane.linux.kernel.hardened.devel/2502
Reported-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-06-23 21:53:17 +08:00
|
|
|
section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata;
|
2017-03-10 04:52:00 +08:00
|
|
|
update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata,
|
2016-02-20 01:50:32 +08:00
|
|
|
section_size, PAGE_KERNEL_RO);
|
2016-10-21 19:22:56 +08:00
|
|
|
|
2016-10-28 00:27:34 +08:00
|
|
|
debug_checkwx();
|
2015-01-22 09:36:06 +08:00
|
|
|
}
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
static void __init map_kernel_segment(pgd_t *pgdp, void *va_start, void *va_end,
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
pgprot_t prot, struct vm_struct *vma,
|
2017-07-24 18:46:09 +08:00
|
|
|
int flags, unsigned long vm_flags)
|
arm64: mm: create new fine-grained mappings at boot
At boot we may change the granularity of the tables mapping the kernel
(by splitting or making sections). This may happen when we create the
linear mapping (in __map_memblock), or at any point we try to apply
fine-grained permissions to the kernel (e.g. fixup_executable,
mark_rodata_ro, fixup_init).
Changing the active page tables in this manner may result in multiple
entries for the same address being allocated into TLBs, risking problems
such as TLB conflict aborts or issues derived from the amalgamation of
TLB entries. Generally, a break-before-make (BBM) approach is necessary
to avoid conflicts, but we cannot do this for the kernel tables as it
risks unmapping text or data being used to do so.
Instead, we can create a new set of tables from scratch in the safety of
the existing mappings, and subsequently migrate over to these using the
new cpu_replace_ttbr1 helper, which avoids the two sets of tables being
active simultaneously.
To avoid issues when we later modify permissions of the page tables
(e.g. in fixup_init), we must create the page tables at a granularity
such that later modification does not result in splitting of tables.
This patch applies this strategy, creating a new set of fine-grained
page tables from scratch, and safely migrating to them. The existing
fixmap and kasan shadow page tables are reused in the new fine-grained
tables.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-25 19:45:12 +08:00
|
|
|
{
|
2017-01-11 05:35:49 +08:00
|
|
|
phys_addr_t pa_start = __pa_symbol(va_start);
|
arm64: mm: create new fine-grained mappings at boot
At boot we may change the granularity of the tables mapping the kernel
(by splitting or making sections). This may happen when we create the
linear mapping (in __map_memblock), or at any point we try to apply
fine-grained permissions to the kernel (e.g. fixup_executable,
mark_rodata_ro, fixup_init).
Changing the active page tables in this manner may result in multiple
entries for the same address being allocated into TLBs, risking problems
such as TLB conflict aborts or issues derived from the amalgamation of
TLB entries. Generally, a break-before-make (BBM) approach is necessary
to avoid conflicts, but we cannot do this for the kernel tables as it
risks unmapping text or data being used to do so.
Instead, we can create a new set of tables from scratch in the safety of
the existing mappings, and subsequently migrate over to these using the
new cpu_replace_ttbr1 helper, which avoids the two sets of tables being
active simultaneously.
To avoid issues when we later modify permissions of the page tables
(e.g. in fixup_init), we must create the page tables at a granularity
such that later modification does not result in splitting of tables.
This patch applies this strategy, creating a new set of fine-grained
page tables from scratch, and safely migrating to them. The existing
fixmap and kasan shadow page tables are reused in the new fine-grained
tables.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-25 19:45:12 +08:00
|
|
|
unsigned long size = va_end - va_start;
|
|
|
|
|
|
|
|
BUG_ON(!PAGE_ALIGNED(pa_start));
|
|
|
|
BUG_ON(!PAGE_ALIGNED(size));
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
__create_pgd_mapping(pgdp, pa_start, (unsigned long)va_start, size, prot,
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
early_pgtable_alloc, flags);
|
2016-02-16 20:52:40 +08:00
|
|
|
|
2017-07-24 18:46:09 +08:00
|
|
|
if (!(vm_flags & VM_NO_GUARD))
|
|
|
|
size += PAGE_SIZE;
|
|
|
|
|
2016-02-16 20:52:40 +08:00
|
|
|
vma->addr = va_start;
|
|
|
|
vma->phys_addr = pa_start;
|
|
|
|
vma->size = size;
|
2017-07-24 18:46:09 +08:00
|
|
|
vma->flags = VM_MAP | vm_flags;
|
2016-02-16 20:52:40 +08:00
|
|
|
vma->caller = __builtin_return_address(0);
|
|
|
|
|
|
|
|
vm_area_add_early(vma);
|
arm64: mm: create new fine-grained mappings at boot
At boot we may change the granularity of the tables mapping the kernel
(by splitting or making sections). This may happen when we create the
linear mapping (in __map_memblock), or at any point we try to apply
fine-grained permissions to the kernel (e.g. fixup_executable,
mark_rodata_ro, fixup_init).
Changing the active page tables in this manner may result in multiple
entries for the same address being allocated into TLBs, risking problems
such as TLB conflict aborts or issues derived from the amalgamation of
TLB entries. Generally, a break-before-make (BBM) approach is necessary
to avoid conflicts, but we cannot do this for the kernel tables as it
risks unmapping text or data being used to do so.
Instead, we can create a new set of tables from scratch in the safety of
the existing mappings, and subsequently migrate over to these using the
new cpu_replace_ttbr1 helper, which avoids the two sets of tables being
active simultaneously.
To avoid issues when we later modify permissions of the page tables
(e.g. in fixup_init), we must create the page tables at a granularity
such that later modification does not result in splitting of tables.
This patch applies this strategy, creating a new set of fine-grained
page tables from scratch, and safely migrating to them. The existing
fixmap and kasan shadow page tables are reused in the new fine-grained
tables.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-25 19:45:12 +08:00
|
|
|
}
|
|
|
|
|
2017-11-14 22:14:17 +08:00
|
|
|
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
|
|
|
|
static int __init map_entry_trampoline(void)
|
|
|
|
{
|
2021-11-18 23:04:32 +08:00
|
|
|
int i;
|
|
|
|
|
2017-11-14 22:14:17 +08:00
|
|
|
pgprot_t prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
|
|
|
|
phys_addr_t pa_start = __pa_symbol(__entry_tramp_text_start);
|
|
|
|
|
|
|
|
/* The trampoline is always mapped and can therefore be global */
|
|
|
|
pgprot_val(prot) &= ~PTE_NG;
|
|
|
|
|
|
|
|
/* Map only the text into the trampoline page table */
|
|
|
|
memset(tramp_pg_dir, 0, PGD_SIZE);
|
2021-11-18 23:04:32 +08:00
|
|
|
__create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS,
|
|
|
|
entry_tramp_text_size(), prot,
|
|
|
|
__pgd_pgtable_alloc, NO_BLOCK_MAPPINGS);
|
2017-11-14 22:14:17 +08:00
|
|
|
|
2017-12-06 19:24:02 +08:00
|
|
|
/* Map both the text and data into the kernel page table */
|
2021-11-18 23:04:32 +08:00
|
|
|
for (i = 0; i < DIV_ROUND_UP(entry_tramp_text_size(), PAGE_SIZE); i++)
|
|
|
|
__set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i,
|
|
|
|
pa_start + i * PAGE_SIZE, prot);
|
|
|
|
|
2022-06-23 00:10:10 +08:00
|
|
|
if (IS_ENABLED(CONFIG_RELOCATABLE))
|
|
|
|
__set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i,
|
|
|
|
pa_start + i * PAGE_SIZE, PAGE_KERNEL_RO);
|
2017-12-06 19:24:02 +08:00
|
|
|
|
2017-11-14 22:14:17 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
core_initcall(map_entry_trampoline);
|
|
|
|
#endif
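A short note on why the trampoline text is mapped twice above, offered as a hedged aside rather than as a comment from mmu.c:

/*
 * The trampoline must appear at the same virtual address (TRAMP_VALIAS, in
 * the fixmap region) in both tramp_pg_dir and the kernel's own tables, so
 * that execution can continue across the TTBR1 switch performed on kernel
 * entry/exit when KPTI is enabled.
 */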
|
|
|
|
|
2020-05-07 03:51:31 +08:00
|
|
|
/*
|
|
|
|
* Open coded check for BTI, only for use to determine configuration
|
|
|
|
* for early mappings before the cpufeature code has run.
|
|
|
|
*/
|
|
|
|
static bool arm64_early_this_cpu_has_bti(void)
|
|
|
|
{
|
|
|
|
u64 pfr1;
|
|
|
|
|
|
|
|
if (!IS_ENABLED(CONFIG_ARM64_BTI_KERNEL))
|
|
|
|
return false;
|
|
|
|
|
2021-02-08 17:57:29 +08:00
|
|
|
pfr1 = __read_sysreg_by_encoding(SYS_ID_AA64PFR1_EL1);
|
2020-05-07 03:51:31 +08:00
|
|
|
return cpuid_feature_extract_unsigned_field(pfr1,
|
2022-09-06 06:54:04 +08:00
|
|
|
ID_AA64PFR1_EL1_BT_SHIFT);
|
2020-05-07 03:51:31 +08:00
|
|
|
}
|
|
|
|
|
arm64: mm: create new fine-grained mappings at boot
At boot we may change the granularity of the tables mapping the kernel
(by splitting or making sections). This may happen when we create the
linear mapping (in __map_memblock), or at any point we try to apply
fine-grained permissions to the kernel (e.g. fixup_executable,
mark_rodata_ro, fixup_init).
Changing the active page tables in this manner may result in multiple
entries for the same address being allocated into TLBs, risking problems
such as TLB conflict aborts or issues derived from the amalgamation of
TLB entries. Generally, a break-before-make (BBM) approach is necessary
to avoid conflicts, but we cannot do this for the kernel tables as it
risks unmapping text or data being used to do so.
Instead, we can create a new set of tables from scratch in the safety of
the existing mappings, and subsequently migrate over to these using the
new cpu_replace_ttbr1 helper, which avoids the two sets of tables being
active simultaneously.
To avoid issues when we later modify permissions of the page tables
(e.g. in fixup_init), we must create the page tables at a granularity
such that later modification does not result in splitting of tables.
This patch applies this strategy, creating a new set of fine-grained
page tables from scratch, and safely migrating to them. The existing
fixmap and kasan shadow page tables are reused in the new fine-grained
tables.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-25 19:45:12 +08:00
|
|
|
/*
|
|
|
|
* Create fine-grained mappings for the kernel.
|
|
|
|
*/
|
2018-02-15 19:14:56 +08:00
|
|
|
static void __init map_kernel(pgd_t *pgdp)
|
arm64: mm: create new fine-grained mappings at boot
At boot we may change the granularity of the tables mapping the kernel
(by splitting or making sections). This may happen when we create the
linear mapping (in __map_memblock), or at any point we try to apply
fine-grained permissions to the kernel (e.g. fixup_executable,
mark_rodata_ro, fixup_init).
Changing the active page tables in this manner may result in multiple
entries for the same address being allocated into TLBs, risking problems
such as TLB conflict aborts or issues derived from the amalgamation of
TLB entries. Generally, a break-before-make (BBM) approach is necessary
to avoid conflicts, but we cannot do this for the kernel tables as it
risks unmapping text or data being used to do so.
Instead, we can create a new set of tables from scratch in the safety of
the existing mappings, and subsequently migrate over to these using the
new cpu_replace_ttbr1 helper, which avoids the two sets of tables being
active simultaneously.
To avoid issues when we later modify permissions of the page tables
(e.g. in fixup_init), we must create the page tables at a granularity
such that later modification does not result in splitting of tables.
This patch applies this strategy, creating a new set of fine-grained
page tables from scratch, and safely migrating to them. The existing
fixmap and kasan shadow page tables are reused in the new fine-grained
tables.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-25 19:45:12 +08:00
|
|
|
{
|
2017-03-10 04:52:03 +08:00
|
|
|
static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_inittext,
|
|
|
|
vmlinux_initdata, vmlinux_data;
|
arm64: mm: create new fine-grained mappings at boot
At boot we may change the granularity of the tables mapping the kernel
(by splitting or making sections). This may happen when we create the
linear mapping (in __map_memblock), or at any point we try to apply
fine-grained permissions to the kernel (e.g. fixup_executable,
mark_rodata_ro, fixup_init).
Changing the active page tables in this manner may result in multiple
entries for the same address being allocated into TLBs, risking problems
such as TLB conflict aborts or issues derived from the amalgamation of
TLB entries. Generally, a break-before-make (BBM) approach is necessary
to avoid conflicts, but we cannot do this for the kernel tables as it
risks unmapping text or data being used to do so.
Instead, we can create a new set of tables from scratch in the safety of
the existing mappings, and subsequently migrate over to these using the
new cpu_replace_ttbr1 helper, which avoids the two sets of tables being
active simultaneously.
To avoid issues when we later modify permissions of the page tables
(e.g. in fixup_init), we must create the page tables at a granularity
such that later modification does not result in splitting of tables.
This patch applies this strategy, creating a new set of fine-grained
page tables from scratch, and safely migrating to them. The existing
fixmap and kasan shadow page tables are reused in the new fine-grained
tables.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-25 19:45:12 +08:00
|
|
|
|
2017-03-10 04:52:02 +08:00
|
|
|
/*
|
|
|
|
* External debuggers may need to write directly to the text
|
|
|
|
* mapping to install SW breakpoints. Allow this (only) when
|
|
|
|
* explicitly requested with rodata=off.
|
|
|
|
*/
|
|
|
|
pgprot_t text_prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
|
|
|
|
|
2020-05-07 03:51:31 +08:00
|
|
|
/*
|
|
|
|
* If we have a CPU that supports BTI and a kernel built for
|
|
|
|
* BTI then mark the kernel executable text as guarded pages
|
|
|
|
* now so we don't have to rewrite the page tables later.
|
|
|
|
*/
|
|
|
|
if (arm64_early_this_cpu_has_bti())
|
|
|
|
text_prot = __pgprot_modify(text_prot, PTE_GP, PTE_GP);
|
|
|
|
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
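For example, taking the architectural contiguous-hint entry counts as given (16 PTEs/16 PMDs for the 4 KB granule, 128 PTEs/32 PMDs for 16 KB, 32 PTEs/32 PMDs for 64 KB; these counts come from the ARMv8 contiguous-bit rules, not from this patch), the table follows directly: 16 x 4 KB = 64 KB and 16 x 2 MB = 32 MB; 128 x 16 KB = 2 MB and 32 x 32 MB = 1 GB; 32 x 64 KB = 2 MB and 32 x 512 MB = 16 GB.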
|
|
|
/*
|
|
|
|
* Only rodata will be remapped with different permissions later on,
|
|
|
|
* all other segments are allowed to use contiguous mappings.
|
|
|
|
*/
|
arm64: omit [_text, _stext) from permanent kernel mapping
In a previous patch, we increased the size of the EFI PE/COFF header
to 64 KB, which resulted in the _stext symbol appearing at a fixed
offset of 64 KB into the image.
Since 64 KB is also the largest page size we support, this completely
removes the need to map the first 64 KB of the kernel image, given that
it only contains the arm64 Image header and the EFI header, neither of
which we ever access again after booting the kernel. More importantly,
we should avoid an executable mapping of non-executable and not entirely
predictable data, to deal with the unlikely event that we inadvertently
emitted something that looks like an opcode that could be used as a
gadget for speculative execution.
So let's limit the kernel mapping of .text to the [_stext, _etext)
region, which matches the view of generic code (such as kallsyms) when
it reasons about the boundaries of the kernel's .text section.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20201117124729.12642-2-ardb@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2020-11-17 20:47:27 +08:00
|
|
|
map_kernel_segment(pgdp, _stext, _etext, text_prot, &vmlinux_text, 0,
|
2017-07-24 18:46:09 +08:00
|
|
|
VM_NO_GUARD);
|
2018-02-15 19:14:56 +08:00
|
|
|
map_kernel_segment(pgdp, __start_rodata, __inittext_begin, PAGE_KERNEL,
|
2017-07-24 18:46:09 +08:00
|
|
|
&vmlinux_rodata, NO_CONT_MAPPINGS, VM_NO_GUARD);
|
2018-02-15 19:14:56 +08:00
|
|
|
map_kernel_segment(pgdp, __inittext_begin, __inittext_end, text_prot,
|
2017-07-24 18:46:09 +08:00
|
|
|
&vmlinux_inittext, 0, VM_NO_GUARD);
|
2018-02-15 19:14:56 +08:00
|
|
|
map_kernel_segment(pgdp, __initdata_begin, __initdata_end, PAGE_KERNEL,
|
2017-07-24 18:46:09 +08:00
|
|
|
&vmlinux_initdata, 0, VM_NO_GUARD);
|
2018-02-15 19:14:56 +08:00
|
|
|
map_kernel_segment(pgdp, _data, _end, PAGE_KERNEL, &vmlinux_data, 0, 0);
|
2016-01-25 19:45:12 +08:00
|
|
|
|
2020-06-09 12:33:10 +08:00
|
|
|
if (!READ_ONCE(pgd_val(*pgd_offset_pgd(pgdp, FIXADDR_START)))) {
|
2016-02-16 20:52:40 +08:00
|
|
|
/*
|
|
|
|
* The fixmap falls in a separate pgd to the kernel, and doesn't
|
|
|
|
* live in the carveout for the swapper_pg_dir. We can simply
|
|
|
|
* re-use the existing dir for the fixmap.
|
|
|
|
*/
|
2020-06-09 12:33:10 +08:00
|
|
|
set_pgd(pgd_offset_pgd(pgdp, FIXADDR_START),
|
2018-02-15 19:14:56 +08:00
|
|
|
READ_ONCE(*pgd_offset_k(FIXADDR_START)));
|
2016-02-16 20:52:40 +08:00
|
|
|
} else if (CONFIG_PGTABLE_LEVELS > 3) {
|
2019-08-27 23:57:08 +08:00
|
|
|
pgd_t *bm_pgdp;
|
2020-06-05 07:46:23 +08:00
|
|
|
p4d_t *bm_p4dp;
|
2019-08-27 23:57:08 +08:00
|
|
|
pud_t *bm_pudp;
|
2016-02-16 20:52:40 +08:00
|
|
|
/*
|
|
|
|
* The fixmap shares its top level pgd entry with the kernel
|
|
|
|
* mapping. This can really only occur when we are running
|
|
|
|
* with 16k/4 levels, so we can simply reuse the pud level
|
|
|
|
* entry instead.
|
|
|
|
*/
|
|
|
|
BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
|
2020-06-09 12:33:10 +08:00
|
|
|
bm_pgdp = pgd_offset_pgd(pgdp, FIXADDR_START);
|
2020-06-05 07:46:23 +08:00
|
|
|
bm_p4dp = p4d_offset(bm_pgdp, FIXADDR_START);
|
|
|
|
bm_pudp = pud_set_fixmap_offset(bm_p4dp, FIXADDR_START);
|
2019-08-27 23:57:08 +08:00
|
|
|
pud_populate(&init_mm, bm_pudp, lm_alias(bm_pmd));
|
2016-02-16 20:52:40 +08:00
|
|
|
pud_clear_fixmap();
|
|
|
|
} else {
|
|
|
|
BUG();
|
|
|
|
}
|
2016-01-25 19:45:12 +08:00
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
kasan_copy_shadow(pgdp);
|
2016-01-25 19:45:12 +08:00
|
|
|
}
|
|
|
|
|
2022-06-24 23:06:42 +08:00
|
|
|
static void __init create_idmap(void)
|
|
|
|
{
|
|
|
|
u64 start = __pa_symbol(__idmap_text_start);
|
|
|
|
u64 size = __pa_symbol(__idmap_text_end) - start;
|
|
|
|
pgd_t *pgd = idmap_pg_dir;
|
|
|
|
u64 pgd_phys;
|
|
|
|
|
|
|
|
/* check if we need an additional level of translation */
|
|
|
|
if (VA_BITS < 48 && idmap_t0sz < (64 - VA_BITS_MIN)) {
|
|
|
|
pgd_phys = early_pgtable_alloc(PAGE_SHIFT);
|
|
|
|
set_pgd(&idmap_pg_dir[start >> VA_BITS],
|
|
|
|
__pgd(pgd_phys | P4D_TYPE_TABLE));
|
|
|
|
pgd = __va(pgd_phys);
|
|
|
|
}
|
|
|
|
__create_pgd_mapping(pgd, start, start, size, PAGE_KERNEL_ROX,
|
|
|
|
early_pgtable_alloc, 0);
|
|
|
|
|
|
|
|
if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0)) {
|
|
|
|
extern u32 __idmap_kpti_flag;
|
|
|
|
u64 pa = __pa_symbol(&__idmap_kpti_flag);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The KPTI G-to-nG conversion code needs a read-write mapping
|
|
|
|
* of its synchronization flag in the ID map.
|
|
|
|
*/
|
|
|
|
__create_pgd_mapping(pgd, pa, pa, sizeof(u32), PAGE_KERNEL,
|
|
|
|
early_pgtable_alloc, 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-03-05 19:49:27 +08:00
|
|
|
void __init paging_init(void)
|
|
|
|
{
|
2018-09-25 00:15:02 +08:00
|
|
|
pgd_t *pgdp = pgd_set_fixmap(__pa_symbol(swapper_pg_dir));
|
2022-06-24 23:06:42 +08:00
|
|
|
extern pgd_t init_idmap_pg_dir[];
|
2016-01-25 19:45:12 +08:00
|
|
|
|
2022-06-24 23:06:33 +08:00
|
|
|
idmap_t0sz = 63UL - __fls(__pa_symbol(_end) | GENMASK(VA_BITS_MIN - 1, 0));
|
2016-01-25 19:45:12 +08:00
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
map_kernel(pgdp);
|
|
|
|
map_mem(pgdp);
|
2016-01-25 19:45:12 +08:00
|
|
|
|
|
|
|
pgd_clear_fixmap();
|
|
|
|
|
2022-06-24 23:06:42 +08:00
|
|
|
cpu_replace_ttbr1(lm_alias(swapper_pg_dir), init_idmap_pg_dir);
|
arm64/mm: Separate boot-time page tables from swapper_pg_dir
Since the address of swapper_pg_dir is fixed for a given kernel image,
it is an attractive target for manipulation via an arbitrary write. To
mitigate this we'd like to make it read-only by moving it into the
rodata section.
We require that swapper_pg_dir is at a fixed offset from tramp_pg_dir
and reserved_ttbr0, so these will also need to move into rodata.
However, swapper_pg_dir is allocated along with some transient page
tables used for boot which we do not want to move into rodata.
As a step towards this, this patch separates the boot-time page tables
into a new init_pg_dir, and reduces swapper_pg_dir to the single page it
needs to be. This allows us to retain the relationship between
swapper_pg_dir, tramp_pg_dir, and reserved_ttbr0, while cleanly
separating these from the boot-time page tables.
The init_pg_dir holds all of the pgd/pud/pmd/pte levels needed during
boot, and all of these levels will be freed when we switch to the
swapper_pg_dir, which is initialized by the existing code in
paging_init(). Since we start off on the init_pg_dir, we no longer need
to allocate a transient page table in paging_init() in order to ensure
that swapper_pg_dir isn't live while we initialize it.
There should be no functional change as a result of this patch.
Signed-off-by: Jun Yao <yaojun8558363@gmail.com>
Reviewed-by: James Morse <james.morse@arm.com>
[Mark: place init_pg_dir after BSS, fold mm changes, commit message]
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2018-09-24 22:47:49 +08:00
|
|
|
init_mm.pgd = swapper_pg_dir;
|
2016-01-25 19:45:12 +08:00
|
|
|
|
2021-11-06 04:43:19 +08:00
|
|
|
memblock_phys_free(__pa_symbol(init_pg_dir),
|
|
|
|
__pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir));
|
2018-11-07 22:16:06 +08:00
|
|
|
|
|
|
|
memblock_allow_resize();
|
2022-06-24 23:06:42 +08:00
|
|
|
|
|
|
|
create_idmap();
|
2012-03-05 19:49:27 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check whether a kernel address is valid (derived from arch/x86/).
|
|
|
|
*/
|
|
|
|
int kern_addr_valid(unsigned long addr)
|
|
|
|
{
|
2018-02-15 19:14:56 +08:00
|
|
|
pgd_t *pgdp;
|
2020-06-05 07:46:23 +08:00
|
|
|
p4d_t *p4dp;
|
2018-02-15 19:14:56 +08:00
|
|
|
pud_t *pudp, pud;
|
|
|
|
pmd_t *pmdp, pmd;
|
|
|
|
pte_t *ptep, pte;
|
2012-03-05 19:49:27 +08:00
|
|
|
|
2020-06-10 19:09:44 +08:00
|
|
|
addr = arch_kasan_reset_tag(addr);
|
2012-03-05 19:49:27 +08:00
|
|
|
if ((((long)addr) >> VA_BITS) != -1UL)
|
|
|
|
return 0;
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
pgdp = pgd_offset_k(addr);
|
|
|
|
if (pgd_none(READ_ONCE(*pgdp)))
|
2012-03-05 19:49:27 +08:00
|
|
|
return 0;
|
|
|
|
|
2020-06-05 07:46:23 +08:00
|
|
|
p4dp = p4d_offset(pgdp, addr);
|
|
|
|
if (p4d_none(READ_ONCE(*p4dp)))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
pudp = pud_offset(p4dp, addr);
|
2018-02-15 19:14:56 +08:00
|
|
|
pud = READ_ONCE(*pudp);
|
|
|
|
if (pud_none(pud))
|
2012-03-05 19:49:27 +08:00
|
|
|
return 0;
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
if (pud_sect(pud))
|
|
|
|
return pfn_valid(pud_pfn(pud));
|
2014-05-06 21:02:27 +08:00
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
pmdp = pmd_offset(pudp, addr);
|
|
|
|
pmd = READ_ONCE(*pmdp);
|
|
|
|
if (pmd_none(pmd))
|
2012-03-05 19:49:27 +08:00
|
|
|
return 0;
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
if (pmd_sect(pmd))
|
|
|
|
return pfn_valid(pmd_pfn(pmd));
|
2014-04-16 01:53:24 +08:00
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
ptep = pte_offset_kernel(pmdp, addr);
|
|
|
|
pte = READ_ONCE(*ptep);
|
|
|
|
if (pte_none(pte))
|
2012-03-05 19:49:27 +08:00
|
|
|
return 0;
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
return pfn_valid(pte_pfn(pte));
|
2012-03-05 19:49:27 +08:00
|
|
|
}
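A minimal usage sketch (hypothetical debug helper, not part of this file), built only on the walk above: refuse to read through a kernel virtual address unless it resolves to a valid pfn.

static int __maybe_unused debug_peek_byte(unsigned long addr, u8 *val)
{
	/* kern_addr_valid() walks pgd/p4d/pud/pmd/pte as shown above. */
	if (!kern_addr_valid(addr))
		return -EFAULT;

	*val = *(u8 *)addr;
	return 0;
}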
|
2020-03-04 12:28:43 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_MEMORY_HOTPLUG
|
2020-08-07 14:23:29 +08:00
|
|
|
static void free_hotplug_page_range(struct page *page, size_t size,
|
|
|
|
struct vmem_altmap *altmap)
|
2020-03-04 12:28:43 +08:00
|
|
|
{
|
2020-08-07 14:23:29 +08:00
|
|
|
if (altmap) {
|
|
|
|
vmem_altmap_free(altmap, size >> PAGE_SHIFT);
|
|
|
|
} else {
|
|
|
|
WARN_ON(PageReserved(page));
|
|
|
|
free_pages((unsigned long)page_address(page), get_order(size));
|
|
|
|
}
|
2020-03-04 12:28:43 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void free_hotplug_pgtable_page(struct page *page)
|
|
|
|
{
|
2020-08-07 14:23:29 +08:00
|
|
|
free_hotplug_page_range(page, PAGE_SIZE, NULL);
|
2020-03-04 12:28:43 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static bool pgtable_range_aligned(unsigned long start, unsigned long end,
|
|
|
|
unsigned long floor, unsigned long ceiling,
|
|
|
|
unsigned long mask)
|
|
|
|
{
|
|
|
|
start &= mask;
|
|
|
|
if (start < floor)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (ceiling) {
|
|
|
|
ceiling &= mask;
|
|
|
|
if (!ceiling)
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (end - 1 > ceiling - 1)
|
|
|
|
return false;
|
|
|
|
return true;
|
|
|
|
}
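Roughly speaking, when free_empty_pte_table() below calls this with PMD_MASK on a 4 KB granule, start is rounded down to its enclosing 2 MiB block; the PTE page is only a candidate for freeing when that block begins at or after floor and the range ends at or before the 2 MiB-aligned ceiling, so a table that may still serve a neighbouring region is never torn down.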
|
|
|
|
|
|
|
|
static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
|
2020-08-07 14:23:29 +08:00
|
|
|
unsigned long end, bool free_mapped,
|
|
|
|
struct vmem_altmap *altmap)
|
2020-03-04 12:28:43 +08:00
|
|
|
{
|
|
|
|
pte_t *ptep, pte;
|
|
|
|
|
|
|
|
do {
|
|
|
|
ptep = pte_offset_kernel(pmdp, addr);
|
|
|
|
pte = READ_ONCE(*ptep);
|
|
|
|
if (pte_none(pte))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
WARN_ON(!pte_present(pte));
|
|
|
|
pte_clear(&init_mm, addr, ptep);
|
|
|
|
flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
|
|
|
|
if (free_mapped)
|
2020-08-07 14:23:29 +08:00
|
|
|
free_hotplug_page_range(pte_page(pte),
|
|
|
|
PAGE_SIZE, altmap);
|
2020-03-04 12:28:43 +08:00
|
|
|
} while (addr += PAGE_SIZE, addr < end);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
|
2020-08-07 14:23:29 +08:00
|
|
|
unsigned long end, bool free_mapped,
|
|
|
|
struct vmem_altmap *altmap)
|
2020-03-04 12:28:43 +08:00
|
|
|
{
|
|
|
|
unsigned long next;
|
|
|
|
pmd_t *pmdp, pmd;
|
|
|
|
|
|
|
|
do {
|
|
|
|
next = pmd_addr_end(addr, end);
|
|
|
|
pmdp = pmd_offset(pudp, addr);
|
|
|
|
pmd = READ_ONCE(*pmdp);
|
|
|
|
if (pmd_none(pmd))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
WARN_ON(!pmd_present(pmd));
|
|
|
|
if (pmd_sect(pmd)) {
|
|
|
|
pmd_clear(pmdp);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* One TLBI should be sufficient here as the PMD_SIZE
|
|
|
|
* range is mapped with a single block entry.
|
|
|
|
*/
|
|
|
|
flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
|
|
|
|
if (free_mapped)
|
|
|
|
free_hotplug_page_range(pmd_page(pmd),
|
2020-08-07 14:23:29 +08:00
|
|
|
PMD_SIZE, altmap);
|
2020-03-04 12:28:43 +08:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
WARN_ON(!pmd_table(pmd));
|
2020-08-07 14:23:29 +08:00
|
|
|
unmap_hotplug_pte_range(pmdp, addr, next, free_mapped, altmap);
|
2020-03-04 12:28:43 +08:00
|
|
|
} while (addr = next, addr < end);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
|
2020-08-07 14:23:29 +08:00
|
|
|
unsigned long end, bool free_mapped,
|
|
|
|
struct vmem_altmap *altmap)
|
2020-03-04 12:28:43 +08:00
|
|
|
{
|
|
|
|
unsigned long next;
|
|
|
|
pud_t *pudp, pud;
|
|
|
|
|
|
|
|
do {
|
|
|
|
next = pud_addr_end(addr, end);
|
|
|
|
pudp = pud_offset(p4dp, addr);
|
|
|
|
pud = READ_ONCE(*pudp);
|
|
|
|
if (pud_none(pud))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
WARN_ON(!pud_present(pud));
|
|
|
|
if (pud_sect(pud)) {
|
|
|
|
pud_clear(pudp);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* One TLBI should be sufficient here as the PUD_SIZE
|
|
|
|
* range is mapped with a single block entry.
|
|
|
|
*/
|
|
|
|
flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
|
|
|
|
if (free_mapped)
|
|
|
|
free_hotplug_page_range(pud_page(pud),
|
2020-08-07 14:23:29 +08:00
|
|
|
PUD_SIZE, altmap);
|
2020-03-04 12:28:43 +08:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
WARN_ON(!pud_table(pud));
|
2020-08-07 14:23:29 +08:00
|
|
|
unmap_hotplug_pmd_range(pudp, addr, next, free_mapped, altmap);
|
2020-03-04 12:28:43 +08:00
|
|
|
} while (addr = next, addr < end);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
|
2020-08-07 14:23:29 +08:00
|
|
|
unsigned long end, bool free_mapped,
|
|
|
|
struct vmem_altmap *altmap)
|
2020-03-04 12:28:43 +08:00
|
|
|
{
|
|
|
|
unsigned long next;
|
|
|
|
p4d_t *p4dp, p4d;
|
|
|
|
|
|
|
|
do {
|
|
|
|
next = p4d_addr_end(addr, end);
|
|
|
|
p4dp = p4d_offset(pgdp, addr);
|
|
|
|
p4d = READ_ONCE(*p4dp);
|
|
|
|
if (p4d_none(p4d))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
WARN_ON(!p4d_present(p4d));
|
2020-08-07 14:23:29 +08:00
|
|
|
unmap_hotplug_pud_range(p4dp, addr, next, free_mapped, altmap);
|
2020-03-04 12:28:43 +08:00
|
|
|
} while (addr = next, addr < end);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void unmap_hotplug_range(unsigned long addr, unsigned long end,
|
2020-08-07 14:23:29 +08:00
|
|
|
bool free_mapped, struct vmem_altmap *altmap)
|
2020-03-04 12:28:43 +08:00
|
|
|
{
|
|
|
|
unsigned long next;
|
|
|
|
pgd_t *pgdp, pgd;
|
|
|
|
|
2020-08-07 14:23:29 +08:00
|
|
|
/*
|
|
|
|
* altmap can only be used as vmemmap mapping backing memory.
|
|
|
|
* In case the backing memory itself is not being freed, then
|
|
|
|
* altmap is irrelevant. Warn about this inconsistency when
|
|
|
|
* encountered.
|
|
|
|
*/
|
|
|
|
WARN_ON(!free_mapped && altmap);
|
|
|
|
|
2020-03-04 12:28:43 +08:00
|
|
|
do {
|
|
|
|
next = pgd_addr_end(addr, end);
|
|
|
|
pgdp = pgd_offset_k(addr);
|
|
|
|
pgd = READ_ONCE(*pgdp);
|
|
|
|
if (pgd_none(pgd))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
WARN_ON(!pgd_present(pgd));
|
2020-08-07 14:23:29 +08:00
|
|
|
unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped, altmap);
|
2020-03-04 12:28:43 +08:00
|
|
|
} while (addr = next, addr < end);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
|
|
|
|
unsigned long end, unsigned long floor,
|
|
|
|
unsigned long ceiling)
|
|
|
|
{
|
|
|
|
pte_t *ptep, pte;
|
|
|
|
unsigned long i, start = addr;
|
|
|
|
|
|
|
|
do {
|
|
|
|
ptep = pte_offset_kernel(pmdp, addr);
|
|
|
|
pte = READ_ONCE(*ptep);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is just a sanity check here which verifies that
|
|
|
|
* pte clearing has been done by earlier unmap loops.
|
|
|
|
*/
|
|
|
|
WARN_ON(!pte_none(pte));
|
|
|
|
} while (addr += PAGE_SIZE, addr < end);
|
|
|
|
|
|
|
|
if (!pgtable_range_aligned(start, end, floor, ceiling, PMD_MASK))
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check whether we can free the pte page if the rest of the
|
|
|
|
* entries are empty. Overlap with other regions have been
|
|
|
|
* handled by the floor/ceiling check.
|
|
|
|
*/
|
|
|
|
ptep = pte_offset_kernel(pmdp, 0UL);
|
|
|
|
for (i = 0; i < PTRS_PER_PTE; i++) {
|
|
|
|
if (!pte_none(READ_ONCE(ptep[i])))
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
pmd_clear(pmdp);
|
|
|
|
__flush_tlb_kernel_pgtable(start);
|
|
|
|
free_hotplug_pgtable_page(virt_to_page(ptep));
|
|
|
|
}
|
|
|
|
|
|
|
|
static void free_empty_pmd_table(pud_t *pudp, unsigned long addr,
|
|
|
|
unsigned long end, unsigned long floor,
|
|
|
|
unsigned long ceiling)
|
|
|
|
{
|
|
|
|
pmd_t *pmdp, pmd;
|
|
|
|
unsigned long i, next, start = addr;
|
|
|
|
|
|
|
|
do {
|
|
|
|
next = pmd_addr_end(addr, end);
|
|
|
|
pmdp = pmd_offset(pudp, addr);
|
|
|
|
pmd = READ_ONCE(*pmdp);
|
|
|
|
if (pmd_none(pmd))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
WARN_ON(!pmd_present(pmd) || !pmd_table(pmd) || pmd_sect(pmd));
|
|
|
|
free_empty_pte_table(pmdp, addr, next, floor, ceiling);
|
|
|
|
} while (addr = next, addr < end);
|
|
|
|
|
|
|
|
if (CONFIG_PGTABLE_LEVELS <= 2)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (!pgtable_range_aligned(start, end, floor, ceiling, PUD_MASK))
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check whether we can free the pmd page if the rest of the
|
|
|
|
* entries are empty. Overlap with other regions have been
|
|
|
|
* handled by the floor/ceiling check.
|
|
|
|
*/
|
|
|
|
pmdp = pmd_offset(pudp, 0UL);
|
|
|
|
for (i = 0; i < PTRS_PER_PMD; i++) {
|
|
|
|
if (!pmd_none(READ_ONCE(pmdp[i])))
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
pud_clear(pudp);
|
|
|
|
__flush_tlb_kernel_pgtable(start);
|
|
|
|
free_hotplug_pgtable_page(virt_to_page(pmdp));
|
|
|
|
}
|
|
|
|
|
|
|
|
static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr,
|
|
|
|
unsigned long end, unsigned long floor,
|
|
|
|
unsigned long ceiling)
|
|
|
|
{
|
|
|
|
pud_t *pudp, pud;
|
|
|
|
unsigned long i, next, start = addr;
|
|
|
|
|
|
|
|
do {
|
|
|
|
next = pud_addr_end(addr, end);
|
|
|
|
pudp = pud_offset(p4dp, addr);
|
|
|
|
pud = READ_ONCE(*pudp);
|
|
|
|
if (pud_none(pud))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
WARN_ON(!pud_present(pud) || !pud_table(pud) || pud_sect(pud));
|
|
|
|
free_empty_pmd_table(pudp, addr, next, floor, ceiling);
|
|
|
|
} while (addr = next, addr < end);
|
|
|
|
|
|
|
|
if (CONFIG_PGTABLE_LEVELS <= 3)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (!pgtable_range_aligned(start, end, floor, ceiling, PGDIR_MASK))
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check whether we can free the pud page if the rest of the
|
|
|
|
* entries are empty. Overlap with other regions have been
|
|
|
|
* handled by the floor/ceiling check.
|
|
|
|
*/
|
|
|
|
pudp = pud_offset(p4dp, 0UL);
|
|
|
|
for (i = 0; i < PTRS_PER_PUD; i++) {
|
|
|
|
if (!pud_none(READ_ONCE(pudp[i])))
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
p4d_clear(p4dp);
|
|
|
|
__flush_tlb_kernel_pgtable(start);
|
|
|
|
free_hotplug_pgtable_page(virt_to_page(pudp));
|
|
|
|
}
|
|
|
|
|
|
|
|
static void free_empty_p4d_table(pgd_t *pgdp, unsigned long addr,
|
|
|
|
unsigned long end, unsigned long floor,
|
|
|
|
unsigned long ceiling)
|
|
|
|
{
|
|
|
|
unsigned long next;
|
|
|
|
p4d_t *p4dp, p4d;
|
|
|
|
|
|
|
|
do {
|
|
|
|
next = p4d_addr_end(addr, end);
|
|
|
|
p4dp = p4d_offset(pgdp, addr);
|
|
|
|
p4d = READ_ONCE(*p4dp);
|
|
|
|
if (p4d_none(p4d))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
WARN_ON(!p4d_present(p4d));
|
|
|
|
free_empty_pud_table(p4dp, addr, next, floor, ceiling);
|
|
|
|
} while (addr = next, addr < end);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void free_empty_tables(unsigned long addr, unsigned long end,
|
|
|
|
unsigned long floor, unsigned long ceiling)
|
|
|
|
{
|
|
|
|
unsigned long next;
|
|
|
|
pgd_t *pgdp, pgd;
|
|
|
|
|
|
|
|
do {
|
|
|
|
next = pgd_addr_end(addr, end);
|
|
|
|
pgdp = pgd_offset_k(addr);
|
|
|
|
pgd = READ_ONCE(*pgdp);
|
|
|
|
if (pgd_none(pgd))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
WARN_ON(!pgd_present(pgd));
|
|
|
|
free_empty_p4d_table(pgdp, addr, next, floor, ceiling);
|
|
|
|
} while (addr = next, addr < end);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2017-12-29 15:53:54 +08:00
|
|
|
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
|
|
|
|
struct vmem_altmap *altmap)
|
2012-03-05 19:49:27 +08:00
|
|
|
{
|
2013-04-30 06:07:50 +08:00
|
|
|
unsigned long addr = start;
|
2012-03-05 19:49:27 +08:00
|
|
|
unsigned long next;
|
2018-02-15 19:14:56 +08:00
|
|
|
pgd_t *pgdp;
|
2020-06-05 07:46:23 +08:00
|
|
|
p4d_t *p4dp;
|
2018-02-15 19:14:56 +08:00
|
|
|
pud_t *pudp;
|
|
|
|
pmd_t *pmdp;
|
2012-03-05 19:49:27 +08:00
|
|
|
|
2021-01-05 19:24:11 +08:00
|
|
|
WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
|
2022-09-20 09:49:51 +08:00
|
|
|
|
|
|
|
if (!ARM64_KERNEL_USES_PMD_MAPS)
|
|
|
|
return vmemmap_populate_basepages(start, end, node, altmap);
|
|
|
|
|
2012-03-05 19:49:27 +08:00
|
|
|
do {
|
|
|
|
next = pmd_addr_end(addr, end);
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
pgdp = vmemmap_pgd_populate(addr, node);
|
|
|
|
if (!pgdp)
|
2012-03-05 19:49:27 +08:00
|
|
|
return -ENOMEM;
|
|
|
|
|
2020-06-05 07:46:23 +08:00
|
|
|
p4dp = vmemmap_p4d_populate(pgdp, addr, node);
|
|
|
|
if (!p4dp)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
pudp = vmemmap_pud_populate(p4dp, addr, node);
|
2018-02-15 19:14:56 +08:00
|
|
|
if (!pudp)
|
2012-03-05 19:49:27 +08:00
|
|
|
return -ENOMEM;
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
pmdp = pmd_offset(pudp, addr);
|
|
|
|
if (pmd_none(READ_ONCE(*pmdp))) {
|
2012-03-05 19:49:27 +08:00
|
|
|
void *p = NULL;
|
|
|
|
|
2020-08-07 14:23:29 +08:00
|
|
|
p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
|
2020-10-15 08:51:23 +08:00
|
|
|
if (!p) {
|
|
|
|
if (vmemmap_populate_basepages(addr, next, node, altmap))
|
|
|
|
return -ENOMEM;
|
|
|
|
continue;
|
|
|
|
}
|
2012-03-05 19:49:27 +08:00
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL));
|
2012-03-05 19:49:27 +08:00
|
|
|
} else
|
2018-02-15 19:14:56 +08:00
|
|
|
vmemmap_verify((pte_t *)pmdp, node, addr, next);
|
2012-03-05 19:49:27 +08:00
|
|
|
} while (addr = next, addr != end);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
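As a rough worked example (assuming the common 64-byte struct page and a 4 KB base page): one PMD-sized vmemmap block of 2 MiB holds 2 MiB / 64 B = 32768 struct pages and therefore describes 32768 x 4 KB = 128 MiB of memory; when such a block cannot be allocated, the loop above falls back to base-page mappings for just that range.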
|
2021-05-24 15:40:30 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_MEMORY_HOTPLUG
|
2017-12-29 15:53:56 +08:00
|
|
|
void vmemmap_free(unsigned long start, unsigned long end,
|
|
|
|
struct vmem_altmap *altmap)
|
2013-02-23 08:33:08 +08:00
|
|
|
{
|
2020-03-04 12:28:43 +08:00
|
|
|
WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
|
|
|
|
|
2020-08-07 14:23:29 +08:00
|
|
|
unmap_hotplug_range(start, end, true, altmap);
|
2020-03-04 12:28:43 +08:00
|
|
|
free_empty_tables(start, end, VMEMMAP_START, VMEMMAP_END);
|
2013-02-23 08:33:08 +08:00
|
|
|
}
|
2021-05-24 15:40:30 +08:00
|
|
|
#endif /* CONFIG_MEMORY_HOTPLUG */
|
2014-11-22 05:50:42 +08:00
|
|
|
|
2021-02-22 09:43:51 +08:00
|
|
|
static inline pud_t *fixmap_pud(unsigned long addr)
|
2014-11-22 05:50:42 +08:00
|
|
|
{
|
2018-02-15 19:14:56 +08:00
|
|
|
pgd_t *pgdp = pgd_offset_k(addr);
|
2020-06-05 07:46:23 +08:00
|
|
|
p4d_t *p4dp = p4d_offset(pgdp, addr);
|
|
|
|
p4d_t p4d = READ_ONCE(*p4dp);
|
2014-11-22 05:50:42 +08:00
|
|
|
|
2020-06-05 07:46:23 +08:00
|
|
|
BUG_ON(p4d_none(p4d) || p4d_bad(p4d));
|
2014-11-22 05:50:42 +08:00
|
|
|
|
2020-06-05 07:46:23 +08:00
|
|
|
return pud_offset_kimg(p4dp, addr);
|
2014-11-22 05:50:42 +08:00
|
|
|
}
|
|
|
|
|
2021-02-22 09:43:51 +08:00
|
|
|
static inline pmd_t *fixmap_pmd(unsigned long addr)
|
2014-11-22 05:50:42 +08:00
|
|
|
{
|
2018-02-15 19:14:56 +08:00
|
|
|
pud_t *pudp = fixmap_pud(addr);
|
|
|
|
pud_t pud = READ_ONCE(*pudp);
|
2014-11-22 05:50:42 +08:00
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
BUG_ON(pud_none(pud) || pud_bad(pud));
|
2014-11-22 05:50:42 +08:00
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
return pmd_offset_kimg(pudp, addr);
|
2014-11-22 05:50:42 +08:00
|
|
|
}
|
|
|
|
|
2021-02-22 09:43:51 +08:00
|
|
|
static inline pte_t *fixmap_pte(unsigned long addr)
|
2014-11-22 05:50:42 +08:00
|
|
|
{
|
2016-02-16 20:52:38 +08:00
|
|
|
return &bm_pte[pte_index(addr)];
|
2014-11-22 05:50:42 +08:00
|
|
|
}
|
|
|
|
|
2017-01-11 05:35:49 +08:00
|
|
|
/*
|
|
|
|
* The p*d_populate functions call virt_to_phys implicitly so they can't be used
|
|
|
|
* directly on kernel symbols (bm_p*d). This function is called too early to use
|
|
|
|
* lm_alias so __p*d_populate functions must be used to populate with the
|
|
|
|
* physical address from __pa_symbol.
|
|
|
|
*/
|
2014-11-22 05:50:42 +08:00
|
|
|
void __init early_fixmap_init(void)
|
|
|
|
{
|
2020-06-05 07:46:23 +08:00
|
|
|
pgd_t *pgdp;
|
|
|
|
p4d_t *p4dp, p4d;
|
2018-02-15 19:14:56 +08:00
|
|
|
pud_t *pudp;
|
|
|
|
pmd_t *pmdp;
|
2014-11-22 05:50:42 +08:00
|
|
|
unsigned long addr = FIXADDR_START;
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
pgdp = pgd_offset_k(addr);
|
2020-06-05 07:46:23 +08:00
|
|
|
p4dp = p4d_offset(pgdp, addr);
|
|
|
|
p4d = READ_ONCE(*p4dp);
|
arm64: add support for kernel ASLR
This adds support for KASLR, implemented based on entropy provided by
the bootloader in the /chosen/kaslr-seed DT property. Depending on the size
of the address space (VA_BITS) and the page size, the entropy in the
virtual displacement is up to 13 bits (16k/2 levels) and up to 25 bits (all
4 levels), with the sidenote that displacements that result in the kernel
image straddling a 1GB/32MB/512MB alignment boundary (for 4KB/16KB/64KB
granule kernels, respectively) are not allowed, and will be rounded up to
an acceptable value.
If CONFIG_RANDOMIZE_MODULE_REGION_FULL is enabled, the module region is
randomized independently from the core kernel. This makes it less likely
that the location of core kernel data structures can be determined by an
adversary, but causes all function calls from modules into the core kernel
to be resolved via entries in the module PLTs.
If CONFIG_RANDOMIZE_MODULE_REGION_FULL is not enabled, the module region is
randomized by choosing a page aligned 128 MB region inside the interval
[_etext - 128 MB, _stext + 128 MB). This gives between 10 and 14 bits of
entropy (depending on page size), independently of the kernel randomization,
but still guarantees that modules are within the range of relative branch
and jump instructions (with the caveat that, since the module region is
shared with other uses of the vmalloc area, modules may need to be loaded
further away if the module region is exhausted)
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-26 21:12:01 +08:00
|
|
|
if (CONFIG_PGTABLE_LEVELS > 3 &&
|
2020-06-05 07:46:23 +08:00
|
|
|
!(p4d_none(p4d) || p4d_page_paddr(p4d) == __pa_symbol(bm_pud))) {
|
2016-02-16 20:52:40 +08:00
|
|
|
/*
|
|
|
|
* We only end up here if the kernel mapping and the fixmap
|
|
|
|
* share the top level pgd entry, which should only happen on
|
|
|
|
* 16k/4 levels configurations.
|
|
|
|
*/
|
|
|
|
BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
|
2020-06-05 07:46:23 +08:00
|
|
|
pudp = pud_offset_kimg(p4dp, addr);
|
2016-02-16 20:52:40 +08:00
|
|
|
} else {
|
2020-06-05 07:46:23 +08:00
|
|
|
if (p4d_none(p4d))
|
2021-03-10 18:49:40 +08:00
|
|
|
__p4d_populate(p4dp, __pa_symbol(bm_pud), P4D_TYPE_TABLE);
|
2018-02-15 19:14:56 +08:00
|
|
|
pudp = fixmap_pud(addr);
|
2016-02-16 20:52:40 +08:00
|
|
|
}
|
2018-02-15 19:14:56 +08:00
|
|
|
if (pud_none(READ_ONCE(*pudp)))
|
2021-03-10 18:49:40 +08:00
|
|
|
__pud_populate(pudp, __pa_symbol(bm_pmd), PUD_TYPE_TABLE);
|
2018-02-15 19:14:56 +08:00
|
|
|
pmdp = fixmap_pmd(addr);
|
|
|
|
__pmd_populate(pmdp, __pa_symbol(bm_pte), PMD_TYPE_TABLE);
|
2014-11-22 05:50:42 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The boot-ioremap range spans multiple pmds, for which
|
2016-02-16 20:52:38 +08:00
|
|
|
* we are not prepared:
|
2014-11-22 05:50:42 +08:00
|
|
|
*/
|
|
|
|
BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
|
|
|
|
!= (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
if ((pmdp != fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)))
|
|
|
|
|| pmdp != fixmap_pmd(fix_to_virt(FIX_BTMAP_END))) {
|
2014-11-22 05:50:42 +08:00
|
|
|
WARN_ON(1);
|
2018-02-15 19:14:56 +08:00
|
|
|
pr_warn("pmdp %p != %p, %p\n",
|
|
|
|
pmdp, fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)),
|
2014-11-22 05:50:42 +08:00
|
|
|
fixmap_pmd(fix_to_virt(FIX_BTMAP_END)));
|
|
|
|
pr_warn("fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
|
|
|
|
fix_to_virt(FIX_BTMAP_BEGIN));
|
|
|
|
pr_warn("fix_to_virt(FIX_BTMAP_END): %08lx\n",
|
|
|
|
fix_to_virt(FIX_BTMAP_END));
|
|
|
|
|
|
|
|
pr_warn("FIX_BTMAP_END: %d\n", FIX_BTMAP_END);
|
|
|
|
pr_warn("FIX_BTMAP_BEGIN: %d\n", FIX_BTMAP_BEGIN);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-11-07 02:44:26 +08:00
|
|
|
/*
|
|
|
|
* Unusually, this is also called in IRQ context (ghes_iounmap_irq) so if we
|
|
|
|
* ever need to use IPIs for TLB broadcasting, then we're in trouble here.
|
|
|
|
*/
|
2014-11-22 05:50:42 +08:00
|
|
|
void __set_fixmap(enum fixed_addresses idx,
|
|
|
|
phys_addr_t phys, pgprot_t flags)
|
|
|
|
{
|
|
|
|
unsigned long addr = __fix_to_virt(idx);
|
2018-02-15 19:14:56 +08:00
|
|
|
pte_t *ptep;
|
2014-11-22 05:50:42 +08:00
|
|
|
|
2015-03-04 21:27:35 +08:00
|
|
|
BUG_ON(idx <= FIX_HOLE || idx >= __end_of_fixed_addresses);
|
2014-11-22 05:50:42 +08:00
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
ptep = fixmap_pte(addr);
|
2014-11-22 05:50:42 +08:00
|
|
|
|
|
|
|
if (pgprot_val(flags)) {
|
2018-02-15 19:14:56 +08:00
|
|
|
set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags));
|
2014-11-22 05:50:42 +08:00
|
|
|
} else {
|
2018-02-15 19:14:56 +08:00
|
|
|
pte_clear(&init_mm, addr, ptep);
|
2014-11-22 05:50:42 +08:00
|
|
|
flush_tlb_kernel_range(addr, addr+PAGE_SIZE);
|
|
|
|
}
|
|
|
|
}
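A minimal usage sketch (hypothetical helper; FIX_PTE is used purely as an example slot and phys is assumed to be page aligned): map one physical page at a fixed slot, read from it, then clear the slot again by passing an empty pgprot, which takes the pte_clear()/TLB-flush path above.

static u32 __maybe_unused peek_fixmap_word(phys_addr_t phys)
{
	u32 val;

	__set_fixmap(FIX_PTE, phys, PAGE_KERNEL);
	val = *(u32 *)__fix_to_virt(FIX_PTE);
	__set_fixmap(FIX_PTE, 0, __pgprot(0));	/* pgprot_val() == 0 => clear */

	return val;
}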
|
2015-06-01 19:40:32 +08:00
|
|
|
|
2019-08-23 14:24:50 +08:00
|
|
|
void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot)
|
2015-06-01 19:40:32 +08:00
|
|
|
{
|
|
|
|
const u64 dt_virt_base = __fix_to_virt(FIX_FDT);
|
2016-01-26 21:12:01 +08:00
|
|
|
int offset;
|
2015-06-01 19:40:32 +08:00
|
|
|
void *dt_virt;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check whether the physical FDT address is set and meets the minimum
|
|
|
|
* alignment requirement. Since we are relying on MIN_FDT_ALIGN to be
|
2016-08-01 19:29:31 +08:00
|
|
|
* at least 8 bytes so that we can always access the magic and size
|
|
|
|
* fields of the FDT header after mapping the first chunk, double check
|
|
|
|
* here if that is indeed the case.
|
2015-06-01 19:40:32 +08:00
|
|
|
*/
|
|
|
|
BUILD_BUG_ON(MIN_FDT_ALIGN < 8);
|
|
|
|
if (!dt_phys || dt_phys % MIN_FDT_ALIGN)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Make sure that the FDT region can be mapped without the need to
|
|
|
|
* allocate additional translation table pages, so that it is safe
|
2016-02-06 08:24:46 +08:00
|
|
|
* to call create_mapping_noalloc() this early.
|
2015-06-01 19:40:32 +08:00
|
|
|
*
|
|
|
|
* On 64k pages, the FDT will be mapped using PTEs, so we need to
|
|
|
|
* be in the same PMD as the rest of the fixmap.
|
|
|
|
* On 4k pages, we'll use section mappings for the FDT so we only
|
|
|
|
* have to be in the same PUD.
|
|
|
|
*/
|
|
|
|
BUILD_BUG_ON(dt_virt_base % SZ_2M);
|
|
|
|
|
2015-10-19 21:19:28 +08:00
|
|
|
BUILD_BUG_ON(__fix_to_virt(FIX_FDT_END) >> SWAPPER_TABLE_SHIFT !=
|
|
|
|
__fix_to_virt(FIX_BTMAP_BEGIN) >> SWAPPER_TABLE_SHIFT);
|
2015-06-01 19:40:32 +08:00
|
|
|
|
2015-10-19 21:19:28 +08:00
|
|
|
offset = dt_phys % SWAPPER_BLOCK_SIZE;
|
2015-06-01 19:40:32 +08:00
|
|
|
dt_virt = (void *)dt_virt_base + offset;
|
|
|
|
|
|
|
|
/* map the first chunk so we can read the size from the header */
|
2016-02-06 08:24:46 +08:00
|
|
|
create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE),
|
|
|
|
dt_virt_base, SWAPPER_BLOCK_SIZE, prot);
|
2015-06-01 19:40:32 +08:00
|
|
|
|
2016-08-01 19:29:31 +08:00
|
|
|
if (fdt_magic(dt_virt) != FDT_MAGIC)
|
2015-06-01 19:40:32 +08:00
|
|
|
return NULL;
|
|
|
|
|
2016-01-26 21:12:01 +08:00
|
|
|
*size = fdt_totalsize(dt_virt);
|
|
|
|
if (*size > MAX_FDT_SIZE)
|
2015-06-01 19:40:32 +08:00
|
|
|
return NULL;
|
|
|
|
|
2016-01-26 21:12:01 +08:00
|
|
|
if (offset + *size > SWAPPER_BLOCK_SIZE)
|
2016-02-06 08:24:46 +08:00
|
|
|
create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base,
|
2016-01-26 21:12:01 +08:00
|
|
|
round_up(offset + *size, SWAPPER_BLOCK_SIZE), prot);
|
2015-06-01 19:40:32 +08:00
|
|
|
|
2016-01-26 21:12:01 +08:00
|
|
|
return dt_virt;
|
|
|
|
}
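A simplified, hypothetical caller sketch (the real call site lives in the arch setup code and may differ): map the blob handed over by the boot loader and bail out if it cannot be mapped.

static void __init example_map_fdt(phys_addr_t dt_phys)
{
	int size;
	void *dt_virt = fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL_RO);

	if (!dt_virt)
		panic("Invalid device tree blob\n");

	/* dt_virt/size can now be handed to the FDT parser. */
}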
|
2015-06-01 19:40:32 +08:00
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
|
2016-02-16 20:52:35 +08:00
|
|
|
{
|
2019-05-27 11:58:15 +08:00
|
|
|
pud_t new_pud = pfn_pud(__phys_to_pfn(phys), mk_pud_sect_prot(prot));
|
2018-02-21 20:59:27 +08:00
|
|
|
|
2018-05-24 02:43:46 +08:00
|
|
|
/* Only allow permission changes for now */
|
|
|
|
if (!pgattr_change_is_safe(READ_ONCE(pud_val(*pudp)),
|
|
|
|
pud_val(new_pud)))
|
2018-02-21 20:59:27 +08:00
|
|
|
return 0;
|
|
|
|
|
2019-05-27 15:03:29 +08:00
|
|
|
VM_BUG_ON(phys & ~PUD_MASK);
|
2018-05-24 02:43:46 +08:00
|
|
|
set_pud(pudp, new_pud);
|
2016-02-16 20:52:35 +08:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
{
        pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), mk_pmd_sect_prot(prot));

        /* Only allow permission changes for now */
        if (!pgattr_change_is_safe(READ_ONCE(pmd_val(*pmdp)),
                                   pmd_val(new_pmd)))
                return 0;

        VM_BUG_ON(phys & ~PMD_MASK);
        set_pmd(pmdp, new_pmd);
        return 1;
}

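pud_set_huge() and pmd_set_huge() refuse anything other than a permission change on a live entry, so that no break-before-make sequence is required. The snippet below is a toy user-space model of that rule only; the descriptor bits in PERM_MASK are invented for illustration, and the real check is pgattr_change_is_safe() against the actual arm64 descriptor layout.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DESC_VALID  (1ULL << 0)                 /* hypothetical "valid" bit      */
#define PERM_MASK   ((1ULL << 6) | (1ULL << 7) | \
                     (1ULL << 53) | (1ULL << 54)) /* hypothetical permission bits */

/* Toy version of the rule: a live entry may only change permission bits. */
static bool attr_change_is_safe(uint64_t old_desc, uint64_t new_desc)
{
        if (!(old_desc & DESC_VALID) || !(new_desc & DESC_VALID))
                return true;            /* installing or tearing down a mapping */
        return ((old_desc ^ new_desc) & ~PERM_MASK) == 0;
}

int main(void)
{
        uint64_t old_desc = 0x0040000080000701ULL;     /* invented descriptor   */
        uint64_t ro       = old_desc | (1ULL << 7);    /* flip a permission bit */
        uint64_t moved    = old_desc + 0x200000;       /* change output address */

        printf("permission change safe?     %d\n", attr_change_is_safe(old_desc, ro));
        printf("output address change safe? %d\n", attr_change_is_safe(old_desc, moved));
        return 0;
}
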
int pud_clear_huge(pud_t *pudp)
{
        if (!pud_sect(READ_ONCE(*pudp)))
                return 0;
        pud_clear(pudp);
        return 1;
}

int pmd_clear_huge(pmd_t *pmdp)
{
        if (!pmd_sect(READ_ONCE(*pmdp)))
                return 0;
        pmd_clear(pmdp);
        return 1;
}

mm/vmalloc: add interfaces to free unmapped page table
On architectures with CONFIG_HAVE_ARCH_HUGE_VMAP set, ioremap() may
create pud/pmd mappings. A kernel panic was observed on arm64 systems
with Cortex-A75 in the following steps as described by Hanjun Guo.
1. ioremap a 4K size; a valid page table is built,
2. iounmap it; pte0 is set to 0,
3. ioremap the same address with a 2M size; the pgd/pmd is unchanged,
   and a new value is then set for the pmd,
4. pte0 is leaked,
5. the CPU may take an exception because the old pmd is still in the TLB,
   which leads to a kernel panic.
This panic is not reproducible on x86: INVLPG, called from iounmap,
purges all levels of entries associated with the purged address on x86.
x86 still has a memory leak, however.
The patch changes the ioremap path to free unmapped page table(s) since
doing so in the unmap path has the following issues:
- The iounmap() path is shared with vunmap(). Since vmap() only
supports pte mappings, making vunmap() to free a pte page is an
overhead for regular vmap users as they do not need a pte page freed
up.
- Checking if all entries in a pte page are cleared in the unmap path
is racy, and serializing this check is expensive.
- The unmap path calls free_vmap_area_noflush() to do lazy TLB purges.
Clearing a pud/pmd entry before the lazy TLB purges needs extra TLB
purge.
Add two interfaces, pud_free_pmd_page() and pmd_free_pte_page(), which
clear a given pud/pmd entry and free up a page for the lower level
entries.
This patch implements their stub functions on x86 and arm64, which work
as a workaround.
[akpm@linux-foundation.org: fix typo in pmd_free_pte_page() stub]
Link: http://lkml.kernel.org/r/20180314180155.19492-2-toshi.kani@hpe.com
Fixes: e61ce6ade404e ("mm: change ioremap to set up huge I/O mappings")
Reported-by: Lei Li <lious.lilei@hisilicon.com>
Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Wang Xuefeng <wxf.wang@hisilicon.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Chintan Pandya <cpandya@codeaurora.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-03-23 07:17:20 +08:00
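The leak described above can be reproduced in a toy two-level model: tearing down a 4K mapping clears only the leaf entry, so a later 2M (pmd-level) mapping of the same range overwrites the table pointer and the pte page becomes unreachable. This is a deliberately simplified user-space illustration, not kernel code; pmd_free_pte_page() below is what breaks the cycle by freeing the stale table before the block entry is installed.

#include <stdio.h>
#include <stdlib.h>

#define PTRS_PER_PTE 512

struct pmd_entry {
        long *pte_table;   /* non-NULL: points to a pte page */
        int   huge;        /* non-zero: 2M block mapping     */
};

/* 4K map: allocate a pte page on demand and fill one slot. */
static void map_4k(struct pmd_entry *pmd, int idx)
{
        if (!pmd->pte_table)
                pmd->pte_table = calloc(PTRS_PER_PTE, sizeof(long));
        pmd->pte_table[idx] = 1;
}

/* iounmap-style teardown: clears the leaf only, keeps the pte page. */
static void unmap_4k(struct pmd_entry *pmd, int idx)
{
        pmd->pte_table[idx] = 0;
}

/* 2M map without freeing the old table: this is the leak. */
static void map_2m_buggy(struct pmd_entry *pmd)
{
        pmd->huge = 1;
        pmd->pte_table = NULL;   /* old pte page is now unreachable */
}

int main(void)
{
        struct pmd_entry pmd = { 0 };

        map_4k(&pmd, 0);         /* step 1: 4K ioremap builds a pte page   */
        unmap_4k(&pmd, 0);       /* step 2: iounmap clears pte0 only       */
        map_2m_buggy(&pmd);      /* step 3: 2M ioremap replaces the pmd... */
        printf("pte page leaked: %s\n", pmd.huge ? "yes" : "no");
        return 0;                /* ...and the calloc'd page is never freed */
}
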
int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
{
        pte_t *table;
        pmd_t pmd;

        pmd = READ_ONCE(*pmdp);

        if (!pmd_table(pmd)) {
                VM_WARN_ON(1);
                return 1;
        }

        table = pte_offset_kernel(pmdp, addr);
        pmd_clear(pmdp);
        __flush_tlb_kernel_pgtable(addr);
        pte_free_kernel(NULL, table);
        return 1;
}

int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
{
        pmd_t *table;
        pmd_t *pmdp;
        pud_t pud;
        unsigned long next, end;

        pud = READ_ONCE(*pudp);

        if (!pud_table(pud)) {
                VM_WARN_ON(1);
                return 1;
        }

        table = pmd_offset(pudp, addr);
        pmdp = table;
        next = addr;
        end = addr + PUD_SIZE;
        do {
                pmd_free_pte_page(pmdp, next);
        } while (pmdp++, next += PMD_SIZE, next != end);

        pud_clear(pudp);
        __flush_tlb_kernel_pgtable(addr);
        pmd_free(NULL, table);
        return 1;
}

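pud_free_pmd_page() has to visit every pmd slot covering the pud range so that any pte pages hanging off them are freed first. The loop bounds are plain address arithmetic, modelled below in user space; the 2 MiB/1 GiB sizes assume 4K pages and are illustrative only.

#include <stdint.h>
#include <stdio.h>

#define PMD_SIZE (2ULL << 20)    /* assumption: 4K pages -> 2 MiB */
#define PUD_SIZE (1ULL << 30)    /* assumption: 4K pages -> 1 GiB */

int main(void)
{
        uint64_t addr = 0xffff800080000000ULL;   /* example, PUD aligned */
        uint64_t next = addr;
        uint64_t end  = addr + PUD_SIZE;
        unsigned int slots = 0;

        /* Same shape as the do/while loop in pud_free_pmd_page(). */
        do {
                slots++;                 /* one pmd_free_pte_page() call */
        } while (next += PMD_SIZE, next != end);

        printf("pmd slots visited: %u\n", slots);   /* expect 512 */
        return 0;
}
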
#ifdef CONFIG_MEMORY_HOTPLUG
static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
{
        unsigned long end = start + size;

        WARN_ON(pgdir != init_mm.pgd);
        WARN_ON((start < PAGE_OFFSET) || (end > PAGE_END));

        unmap_hotplug_range(start, end, false, NULL);
        free_empty_tables(start, end, PAGE_OFFSET, PAGE_END);
}

struct range arch_get_mappable_range(void)
{
        struct range mhp_range;
        u64 start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual));
        u64 end_linear_pa = __pa(PAGE_END - 1);

        if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
                /*
                 * Check for a wrap: because of the randomized linear
                 * mapping, the start physical address may actually be
                 * bigger than the end physical address. In that case,
                 * set start to zero, because the [0, end_linear_pa]
                 * range must still be able to cover all addressable
                 * physical addresses.
                 */
                if (start_linear_pa > end_linear_pa)
                        start_linear_pa = 0;
        }

        WARN_ON(start_linear_pa > end_linear_pa);

        /*
         * The linear mapping region is the range
         * [PAGE_OFFSET..(PAGE_END - 1)], accommodating both of its ends
         * but excluding PAGE_END. The maximum physical range which can be
         * mapped inside this linear mapping range must also be derived
         * from its end points.
         */
        mhp_range.start = start_linear_pa;
        mhp_range.end = end_linear_pa;

        return mhp_range;
}

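The wrap check above is pure address arithmetic: with a randomized linear map, __pa(PAGE_OFFSET) can end up above __pa(PAGE_END - 1), and the hotplug-mappable range then falls back to [0, end_linear_pa]. A small user-space sketch of that clamp, using invented physical addresses:

#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start, end; };

/* Toy version of the clamp in arch_get_mappable_range(). */
static struct range mappable_range(uint64_t start_linear_pa, uint64_t end_linear_pa)
{
        struct range r;

        if (start_linear_pa > end_linear_pa)   /* wrapped: randomized linear map */
                start_linear_pa = 0;

        r.start = start_linear_pa;
        r.end = end_linear_pa;
        return r;
}

int main(void)
{
        /* Invented example: the linear map "wrapped" after randomization. */
        struct range r = mappable_range(0x880000000ULL, 0x17fffffffULL);

        printf("hotplug range: [%#llx, %#llx]\n",
               (unsigned long long)r.start, (unsigned long long)r.end);
        return 0;
}
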
int arch_add_memory(int nid, u64 start, u64 size,
                    struct mhp_params *params)
{
        int ret, flags = NO_EXEC_MAPPINGS;

        VM_BUG_ON(!mhp_range_allowed(start, size, true));

        if (can_set_direct_map())
                flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;

        __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
                             size, params->pgprot, __pgd_pgtable_alloc,
                             flags);

mm/memory_hotplug: add pgprot_t to mhp_params
devm_memremap_pages() is currently used by the PCI P2PDMA code to create
struct page mappings for IO memory. At present, these mappings are
created with PAGE_KERNEL which implies setting the PAT bits to be WB.
However, on x86, an mtrr register will typically override this and force
the cache type to be UC-. In the case firmware doesn't set this
register it is effectively WB and will typically result in a machine
check exception when it's accessed.
Other arches are not currently likely to function correctly seeing they
don't have any MTRR registers to fall back on.
To solve this, provide a way to specify the pgprot value explicitly to
arch_add_memory().
Of the arches that support MEMORY_HOTPLUG: x86_64, and arm64 need a
simple change to pass the pgprot_t down to their respective functions
which set up the page tables. For x86_32, set the page tables
explicitly using _set_memory_prot() (seeing they are already mapped).
For ia64, s390 and sh, reject anything but PAGE_KERNEL settings -- this
should be fine, for now, seeing these architectures don't support
ZONE_DEVICE.
A check in __add_pages() is also added to ensure the pgprot parameter
was set for all arches.
Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Badger <ebadger@gigaio.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Link: http://lkml.kernel.org/r/20200306170846.9333-7-logang@deltatee.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-04-11 05:33:36 +08:00
        memblock_clear_nomap(start, size);

        ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
                          params);
        if (ret)
                __remove_pgd_mapping(swapper_pg_dir,
                                     __phys_to_virt(start), size);
        else {
                max_pfn = PFN_UP(start + size);
                max_low_pfn = max_pfn;
        }

        return ret;
}

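On success, arch_add_memory() bumps max_pfn to PFN_UP(start + size), i.e. the first page frame number at or beyond the end of the hot-added range. The rounding is simple but worth spelling out; the sketch assumes 4K pages and an invented hotplug range.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12                         /* assumption: 4K pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Round a byte address up to a page frame number, like the kernel's PFN_UP(). */
static unsigned long pfn_up(uint64_t addr)
{
        return (unsigned long)((addr + PAGE_SIZE - 1) >> PAGE_SHIFT);
}

int main(void)
{
        uint64_t start = 0x100000000ULL;      /* 4 GiB, example hotplug base */
        uint64_t size  = 0x40000000ULL;       /* 1 GiB hot-added             */

        printf("new max_pfn = %#lx\n", pfn_up(start + size));
        return 0;
}
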
void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;

        __remove_pages(start_pfn, nr_pages, altmap);
        __remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
}

/*
 * This memory hotplug notifier helps prevent boot memory from being
 * inadvertently removed, as it blocks the pfn range offlining process in
 * __offline_pages(). Hence it prevents both offlining and removal of
 * boot memory, which is initially always online.
 * In future, if and when boot memory could be removed, this notifier
 * should be dropped and free_hotplug_page_range() should handle any
 * reserved pages allocated during boot.
 */
static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
                                           unsigned long action, void *data)
{
        struct mem_section *ms;
        struct memory_notify *arg = data;
        unsigned long end_pfn = arg->start_pfn + arg->nr_pages;
        unsigned long pfn = arg->start_pfn;

        if ((action != MEM_GOING_OFFLINE) && (action != MEM_OFFLINE))
                return NOTIFY_OK;

        for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
                unsigned long start = PFN_PHYS(pfn);
                unsigned long end = start + (1UL << PA_SECTION_SHIFT);

                ms = __pfn_to_section(pfn);
                if (!early_section(ms))
                        continue;

                if (action == MEM_GOING_OFFLINE) {
                        /*
                         * Boot memory removal is not supported. Prevent
                         * it by blocking any attempted offline request
                         * for the boot memory and just report it.
                         */
                        pr_warn("Boot memory [%lx %lx] offlining attempted\n", start, end);
                        return NOTIFY_BAD;
                } else if (action == MEM_OFFLINE) {
                        /*
                         * This should never have happened. Boot memory
                         * offlining should have been prevented by this
                         * very notifier. Probably some memory removal
                         * procedure has changed, which would then
                         * require further debugging.
                         */
                        pr_err("Boot memory [%lx %lx] offlined\n", start, end);

                        /*
                         * Core memory hotplug does not process a return
                         * code from the notifier for MEM_OFFLINE events.
                         * The error condition has been reported. Return
                         * from here as if it were ignored.
                         */
                        return NOTIFY_DONE;
                }
        }
        return NOTIFY_OK;
}

static struct notifier_block prevent_bootmem_remove_nb = {
        .notifier_call = prevent_bootmem_remove_notifier,
};

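The notifier walks the offline request one memory section at a time (pfn += PAGES_PER_SECTION) and converts each section back to a physical range via PFN_PHYS() and PA_SECTION_SHIFT. The stride is modelled below in user space, assuming 4K pages and 128 MiB sections; the real section size depends on the kernel configuration.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT        12                          /* assumption: 4K pages */
#define PA_SECTION_SHIFT  27                          /* assumption: 128 MiB  */
#define PAGES_PER_SECTION (1UL << (PA_SECTION_SHIFT - PAGE_SHIFT))
#define PFN_PHYS(pfn)     ((uint64_t)(pfn) << PAGE_SHIFT)

int main(void)
{
        /* Invented offline request: 256 MiB starting at 4 GiB. */
        unsigned long start_pfn = (unsigned long)(0x100000000ULL >> PAGE_SHIFT);
        unsigned long nr_pages  = (unsigned long)((256ULL << 20) >> PAGE_SHIFT);
        unsigned long end_pfn   = start_pfn + nr_pages;
        unsigned long pfn;

        for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
                uint64_t start = PFN_PHYS(pfn);
                uint64_t end   = start + (1ULL << PA_SECTION_SHIFT);

                printf("section [%#llx %#llx]\n",
                       (unsigned long long)start, (unsigned long long)end);
        }
        return 0;
}
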
/*
 * This ensures that boot memory sections on the platform are online
 * from early boot. Memory sections cannot be prevented from being
 * offlined if, for some reason, they are not online to begin with.
 * This helps validate the basic assumption on which the above memory
 * event notifier works to prevent boot memory section offlining and
 * its possible removal.
 */
static void validate_bootmem_online(void)
{
        phys_addr_t start, end, addr;
        struct mem_section *ms;
        u64 i;

        /*
         * Scanning across all of memblock might be expensive
         * on some big-memory systems, hence enable this
         * validation only with DEBUG_VM.
         */
        if (!IS_ENABLED(CONFIG_DEBUG_VM))
                return;

        for_each_mem_range(i, &start, &end) {
                for (addr = start; addr < end; addr += (1UL << PA_SECTION_SHIFT)) {
                        ms = __pfn_to_section(PHYS_PFN(addr));

                        /*
                         * All memory ranges in the system at this point
                         * should have been marked as early sections.
                         */
                        WARN_ON(!early_section(ms));

                        /*
                         * The memory notifier mechanism here, which prevents
                         * boot memory offlining, depends on the fact that each
                         * early memory section on the system is initially
                         * online. Otherwise a given memory section which
                         * is already offline will be overlooked and can
                         * be removed completely. Call out such sections.
                         */
                        if (!online_section(ms))
                                pr_err("Boot memory [%llx %llx] is offline, can be removed\n",
                                        addr, addr + (1UL << PA_SECTION_SHIFT));
                }
        }
}

static int __init prevent_bootmem_remove_init(void)
{
        int ret = 0;

        if (!IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
                return ret;

        validate_bootmem_online();
        ret = register_memory_notifier(&prevent_bootmem_remove_nb);
        if (ret)
                pr_err("%s: Notifier registration failed %d\n", __func__, ret);

        return ret;
}
early_initcall(prevent_bootmem_remove_init);
#endif