mirror of
https://mirrors.bfsu.edu.cn/git/linux.git
synced 2024-12-01 08:04:22 +08:00
412cb3801d
We use cpus_have_const_cap() to check for ARM64_WORKAROUND_2645198 but this is not necessary and alternative_has_cap() would be preferable. For historical reasons, cpus_have_const_cap() is more complicated than it needs to be. Before cpucaps are finalized, it will perform a bitmap test of the system_cpucaps bitmap, and once cpucaps are finalized it will use an alternative branch. This used to be necessary to handle some race conditions in the window between cpucap detection and the subsequent patching of alternatives and static branches, where different branches could be out-of-sync with one another (or w.r.t. alternative sequences). Now that we use alternative branches instead of static branches, these are all patched atomically w.r.t. one another, and there are only a handful of cases that need special care in the window between cpucap detection and alternative patching. Due to the above, it would be nice to remove cpus_have_const_cap(), and migrate callers over to alternative_has_cap_*(), cpus_have_final_cap(), or cpus_have_cap() depending on their requirements. This will remove redundant instructions and improve code generation, and will make it easier to determine how each callsite will behave before, during, and after alternative patching. The ARM64_WORKAROUND_2645198 cpucap is detected and patched before any userspace translation tables exist, and the workaround is only necessary when manipulating userspace translation tables which are in use. Thus it is not necessary to use cpus_have_const_cap(), and alternative_has_cap() is equivalent. This patch replaces the use of cpus_have_const_cap() with alternative_has_cap_unlikely(), which will avoid generating code to test the system_cpucaps bitmap and should be better for all subsequent calls at runtime. The ARM64_WORKAROUND_2645198 cpucap is added to cpucap_is_possible() so that code can be elided entirely when this is not possible, and redundant IS_ENABLED() checks are removed. 
Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Suzuki K Poulose <suzuki.poulose@arm.com> Cc: Will Deacon <will@kernel.org> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
1489 lines
39 KiB
C
1489 lines
39 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
|
|
* Based on arch/arm/mm/mmu.c
|
|
*
|
|
* Copyright (C) 1995-2005 Russell King
|
|
* Copyright (C) 2012 ARM Ltd.
|
|
*/
|
|
|
|
#include <linux/cache.h>
|
|
#include <linux/export.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/errno.h>
|
|
#include <linux/init.h>
|
|
#include <linux/ioport.h>
|
|
#include <linux/kexec.h>
|
|
#include <linux/libfdt.h>
|
|
#include <linux/mman.h>
|
|
#include <linux/nodemask.h>
|
|
#include <linux/memblock.h>
|
|
#include <linux/memremap.h>
|
|
#include <linux/memory.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/io.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/vmalloc.h>
|
|
#include <linux/set_memory.h>
|
|
#include <linux/kfence.h>
|
|
|
|
#include <asm/barrier.h>
|
|
#include <asm/cputype.h>
|
|
#include <asm/fixmap.h>
|
|
#include <asm/kasan.h>
|
|
#include <asm/kernel-pgtable.h>
|
|
#include <asm/sections.h>
|
|
#include <asm/setup.h>
|
|
#include <linux/sizes.h>
|
|
#include <asm/tlb.h>
|
|
#include <asm/mmu_context.h>
|
|
#include <asm/ptdump.h>
|
|
#include <asm/tlbflush.h>
|
|
#include <asm/pgalloc.h>
|
|
#include <asm/kfence.h>
|
|
|
|
#define NO_BLOCK_MAPPINGS BIT(0)
|
|
#define NO_CONT_MAPPINGS BIT(1)
|
|
#define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */
|
|
|
|
int idmap_t0sz __ro_after_init;
|
|
|
|
#if VA_BITS > 48
|
|
u64 vabits_actual __ro_after_init = VA_BITS_MIN;
|
|
EXPORT_SYMBOL(vabits_actual);
|
|
#endif
|
|
|
|
u64 kimage_vaddr __ro_after_init = (u64)&_text;
|
|
EXPORT_SYMBOL(kimage_vaddr);
|
|
|
|
u64 kimage_voffset __ro_after_init;
|
|
EXPORT_SYMBOL(kimage_voffset);
|
|
|
|
u32 __boot_cpu_mode[] = { BOOT_CPU_MODE_EL2, BOOT_CPU_MODE_EL1 };
|
|
|
|
/*
|
|
* The booting CPU updates the failed status @__early_cpu_boot_status,
|
|
* with MMU turned off.
|
|
*/
|
|
long __section(".mmuoff.data.write") __early_cpu_boot_status;
|
|
|
|
/*
|
|
* Empty_zero_page is a special page that is used for zero-initialized data
|
|
* and COW.
|
|
*/
|
|
unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss;
|
|
EXPORT_SYMBOL(empty_zero_page);
|
|
|
|
static DEFINE_SPINLOCK(swapper_pgdir_lock);
|
|
static DEFINE_MUTEX(fixmap_lock);
|
|
|
|
/*
 * Store @pgd into the swapper page-table entry @pgdp.
 *
 * The entry is written through a temporary fixmap alias of its physical
 * address rather than through @pgdp directly, and the whole sequence is
 * serialised by swapper_pgdir_lock.
 */
void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd)
{
	pgd_t *alias;

	spin_lock(&swapper_pgdir_lock);

	alias = pgd_set_fixmap(__pa_symbol(pgdp));
	WRITE_ONCE(*alias, pgd);

	/*
	 * The page-table walker must observe the new entry before
	 * set_p?d() returns, which needs a dsb(ishst). Tearing down the
	 * fixmap slot supplies it: clear_fixmap() goes through
	 * flush_tlb_kernel_range().
	 */
	pgd_clear_fixmap();

	spin_unlock(&swapper_pgdir_lock);
}
|
|
|
|
/*
 * Choose the protection bits for a mapping of physical memory.
 *
 * @file:     file being mapped; only its f_flags are consulted
 * @pfn:      first page frame of the mapping
 * @size:     length of the mapping (not used here)
 * @vma_prot: protection requested by the caller
 *
 * A pfn for which pfn_is_map_memory() is false is mapped non-cached.
 * Otherwise a file opened with O_SYNC gets a write-combining mapping,
 * and anything else keeps @vma_prot unchanged.
 */
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
			      unsigned long size, pgprot_t vma_prot)
{
	if (!pfn_is_map_memory(pfn))
		return pgprot_noncached(vma_prot);

	if (file->f_flags & O_SYNC)
		return pgprot_writecombine(vma_prot);

	return vma_prot;
}
EXPORT_SYMBOL(phys_mem_access_prot);
|
|
|
|
/*
 * Boot-time page-table allocator: grab one page from memblock, zero it
 * and return its physical address. Panics if memblock has nothing left.
 * @shift is accepted for signature compatibility and is not used.
 */
static phys_addr_t __init early_pgtable_alloc(int shift)
{
	phys_addr_t table_pa;
	void *table;

	table_pa = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0,
					     MEMBLOCK_ALLOC_NOLEAKTRACE);
	if (table_pa == 0)
		panic("Failed to allocate page table page\n");

	/*
	 * FIX_{PGD,PUD,PMD} may be busy at this point, whereas FIX_PTE is
	 * known to be free, so the FIX_PTE slot is borrowed to initialise
	 * a table of any level.
	 */
	table = pte_set_fixmap(table_pa);
	memset(table, 0, PAGE_SIZE);

	/*
	 * Unmapping the slot carries implicit barriers, which also make
	 * the zeroed page visible to the page table walker.
	 */
	pte_clear_fixmap();

	return table_pa;
}
|
|
|
|
/*
 * Decide whether a live kernel mapping may go from descriptor @old to
 * descriptor @new without a break-before-make sequence.
 *
 * Returns true when the transition is permitted, false otherwise.
 */
bool pgattr_change_is_safe(u64 old, u64 new)
{
	/* Attributes that may be changed in place on a live mapping. */
	pteval_t changeable = PTE_PXN | PTE_RDONLY | PTE_WRITE | PTE_NG;
	u64 old_attr = old & PTE_ATTRINDX_MASK;
	u64 new_attr = new & PTE_ATTRINDX_MASK;

	/* Creating a mapping or tearing one down is always fine. */
	if (!pte_valid(__pte(old)) || !pte_valid(__pte(new)))
		return true;

	/* The output address of a live entry must stay put. */
	if (pte_pfn(__pte(old)) != pte_pfn(__pte(new)))
		return false;

	/* Live contiguous mappings are off limits entirely. */
	if ((old | new) & PTE_CONT)
		return false;

	/* Going from Non-Global to Global is unsafe. */
	if (old & ~new & PTE_NG)
		return false;

	/*
	 * Switching between Normal and Normal-Tagged is allowed: from the
	 * mismatched-attribute-alias point of view, Tagged counts as a
	 * permission attribute.
	 */
	if ((old_attr == PTE_ATTRINDX(MT_NORMAL) ||
	     old_attr == PTE_ATTRINDX(MT_NORMAL_TAGGED)) &&
	    (new_attr == PTE_ATTRINDX(MT_NORMAL) ||
	     new_attr == PTE_ATTRINDX(MT_NORMAL_TAGGED)))
		changeable |= PTE_ATTRINDX_MASK;

	/* Any differing bit outside the changeable set makes it unsafe. */
	return !((old ^ new) & ~changeable);
}
|
|
|
|
static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end,
|
|
phys_addr_t phys, pgprot_t prot)
|
|
{
|
|
pte_t *ptep;
|
|
|
|
ptep = pte_set_fixmap_offset(pmdp, addr);
|
|
do {
|
|
pte_t old_pte = READ_ONCE(*ptep);
|
|
|
|
set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot));
|
|
|
|
/*
|
|
* After the PTE entry has been populated once, we
|
|
* only allow updates to the permission attributes.
|
|
*/
|
|
BUG_ON(!pgattr_change_is_safe(pte_val(old_pte),
|
|
READ_ONCE(pte_val(*ptep))));
|
|
|
|
phys += PAGE_SIZE;
|
|
} while (ptep++, addr += PAGE_SIZE, addr != end);
|
|
|
|
pte_clear_fixmap();
|
|
}
|
|
|
|
static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
|
|
unsigned long end, phys_addr_t phys,
|
|
pgprot_t prot,
|
|
phys_addr_t (*pgtable_alloc)(int),
|
|
int flags)
|
|
{
|
|
unsigned long next;
|
|
pmd_t pmd = READ_ONCE(*pmdp);
|
|
|
|
BUG_ON(pmd_sect(pmd));
|
|
if (pmd_none(pmd)) {
|
|
pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
|
|
phys_addr_t pte_phys;
|
|
|
|
if (flags & NO_EXEC_MAPPINGS)
|
|
pmdval |= PMD_TABLE_PXN;
|
|
BUG_ON(!pgtable_alloc);
|
|
pte_phys = pgtable_alloc(PAGE_SHIFT);
|
|
__pmd_populate(pmdp, pte_phys, pmdval);
|
|
pmd = READ_ONCE(*pmdp);
|
|
}
|
|
BUG_ON(pmd_bad(pmd));
|
|
|
|
do {
|
|
pgprot_t __prot = prot;
|
|
|
|
next = pte_cont_addr_end(addr, end);
|
|
|
|
/* use a contiguous mapping if the range is suitably aligned */
|
|
if ((((addr | next | phys) & ~CONT_PTE_MASK) == 0) &&
|
|
(flags & NO_CONT_MAPPINGS) == 0)
|
|
__prot = __pgprot(pgprot_val(prot) | PTE_CONT);
|
|
|
|
init_pte(pmdp, addr, next, phys, __prot);
|
|
|
|
phys += next - addr;
|
|
} while (addr = next, addr != end);
|
|
}
|
|
|
|
static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end,
|
|
phys_addr_t phys, pgprot_t prot,
|
|
phys_addr_t (*pgtable_alloc)(int), int flags)
|
|
{
|
|
unsigned long next;
|
|
pmd_t *pmdp;
|
|
|
|
pmdp = pmd_set_fixmap_offset(pudp, addr);
|
|
do {
|
|
pmd_t old_pmd = READ_ONCE(*pmdp);
|
|
|
|
next = pmd_addr_end(addr, end);
|
|
|
|
/* try section mapping first */
|
|
if (((addr | next | phys) & ~PMD_MASK) == 0 &&
|
|
(flags & NO_BLOCK_MAPPINGS) == 0) {
|
|
pmd_set_huge(pmdp, phys, prot);
|
|
|
|
/*
|
|
* After the PMD entry has been populated once, we
|
|
* only allow updates to the permission attributes.
|
|
*/
|
|
BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd),
|
|
READ_ONCE(pmd_val(*pmdp))));
|
|
} else {
|
|
alloc_init_cont_pte(pmdp, addr, next, phys, prot,
|
|
pgtable_alloc, flags);
|
|
|
|
BUG_ON(pmd_val(old_pmd) != 0 &&
|
|
pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp)));
|
|
}
|
|
phys += next - addr;
|
|
} while (pmdp++, addr = next, addr != end);
|
|
|
|
pmd_clear_fixmap();
|
|
}
|
|
|
|
static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
|
|
unsigned long end, phys_addr_t phys,
|
|
pgprot_t prot,
|
|
phys_addr_t (*pgtable_alloc)(int), int flags)
|
|
{
|
|
unsigned long next;
|
|
pud_t pud = READ_ONCE(*pudp);
|
|
|
|
/*
|
|
* Check for initial section mappings in the pgd/pud.
|
|
*/
|
|
BUG_ON(pud_sect(pud));
|
|
if (pud_none(pud)) {
|
|
pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN;
|
|
phys_addr_t pmd_phys;
|
|
|
|
if (flags & NO_EXEC_MAPPINGS)
|
|
pudval |= PUD_TABLE_PXN;
|
|
BUG_ON(!pgtable_alloc);
|
|
pmd_phys = pgtable_alloc(PMD_SHIFT);
|
|
__pud_populate(pudp, pmd_phys, pudval);
|
|
pud = READ_ONCE(*pudp);
|
|
}
|
|
BUG_ON(pud_bad(pud));
|
|
|
|
do {
|
|
pgprot_t __prot = prot;
|
|
|
|
next = pmd_cont_addr_end(addr, end);
|
|
|
|
/* use a contiguous mapping if the range is suitably aligned */
|
|
if ((((addr | next | phys) & ~CONT_PMD_MASK) == 0) &&
|
|
(flags & NO_CONT_MAPPINGS) == 0)
|
|
__prot = __pgprot(pgprot_val(prot) | PTE_CONT);
|
|
|
|
init_pmd(pudp, addr, next, phys, __prot, pgtable_alloc, flags);
|
|
|
|
phys += next - addr;
|
|
} while (addr = next, addr != end);
|
|
}
|
|
|
|
static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
|
|
phys_addr_t phys, pgprot_t prot,
|
|
phys_addr_t (*pgtable_alloc)(int),
|
|
int flags)
|
|
{
|
|
unsigned long next;
|
|
pud_t *pudp;
|
|
p4d_t *p4dp = p4d_offset(pgdp, addr);
|
|
p4d_t p4d = READ_ONCE(*p4dp);
|
|
|
|
if (p4d_none(p4d)) {
|
|
p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN;
|
|
phys_addr_t pud_phys;
|
|
|
|
if (flags & NO_EXEC_MAPPINGS)
|
|
p4dval |= P4D_TABLE_PXN;
|
|
BUG_ON(!pgtable_alloc);
|
|
pud_phys = pgtable_alloc(PUD_SHIFT);
|
|
__p4d_populate(p4dp, pud_phys, p4dval);
|
|
p4d = READ_ONCE(*p4dp);
|
|
}
|
|
BUG_ON(p4d_bad(p4d));
|
|
|
|
pudp = pud_set_fixmap_offset(p4dp, addr);
|
|
do {
|
|
pud_t old_pud = READ_ONCE(*pudp);
|
|
|
|
next = pud_addr_end(addr, end);
|
|
|
|
/*
|
|
* For 4K granule only, attempt to put down a 1GB block
|
|
*/
|
|
if (pud_sect_supported() &&
|
|
((addr | next | phys) & ~PUD_MASK) == 0 &&
|
|
(flags & NO_BLOCK_MAPPINGS) == 0) {
|
|
pud_set_huge(pudp, phys, prot);
|
|
|
|
/*
|
|
* After the PUD entry has been populated once, we
|
|
* only allow updates to the permission attributes.
|
|
*/
|
|
BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
|
|
READ_ONCE(pud_val(*pudp))));
|
|
} else {
|
|
alloc_init_cont_pmd(pudp, addr, next, phys, prot,
|
|
pgtable_alloc, flags);
|
|
|
|
BUG_ON(pud_val(old_pud) != 0 &&
|
|
pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
|
|
}
|
|
phys += next - addr;
|
|
} while (pudp++, addr = next, addr != end);
|
|
|
|
pud_clear_fixmap();
|
|
}
|
|
|
|
static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
|
|
unsigned long virt, phys_addr_t size,
|
|
pgprot_t prot,
|
|
phys_addr_t (*pgtable_alloc)(int),
|
|
int flags)
|
|
{
|
|
unsigned long addr, end, next;
|
|
pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);
|
|
|
|
/*
|
|
* If the virtual and physical address don't have the same offset
|
|
* within a page, we cannot map the region as the caller expects.
|
|
*/
|
|
if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
|
|
return;
|
|
|
|
phys &= PAGE_MASK;
|
|
addr = virt & PAGE_MASK;
|
|
end = PAGE_ALIGN(virt + size);
|
|
|
|
do {
|
|
next = pgd_addr_end(addr, end);
|
|
alloc_init_pud(pgdp, addr, next, phys, prot, pgtable_alloc,
|
|
flags);
|
|
phys += next - addr;
|
|
} while (pgdp++, addr = next, addr != end);
|
|
}
|
|
|
|
static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
|
|
unsigned long virt, phys_addr_t size,
|
|
pgprot_t prot,
|
|
phys_addr_t (*pgtable_alloc)(int),
|
|
int flags)
|
|
{
|
|
mutex_lock(&fixmap_lock);
|
|
__create_pgd_mapping_locked(pgdir, phys, virt, size, prot,
|
|
pgtable_alloc, flags);
|
|
mutex_unlock(&fixmap_lock);
|
|
}
|
|
|
|
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
|
|
extern __alias(__create_pgd_mapping_locked)
|
|
void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
|
|
phys_addr_t size, pgprot_t prot,
|
|
phys_addr_t (*pgtable_alloc)(int), int flags);
|
|
#endif
|
|
|
|
static phys_addr_t __pgd_pgtable_alloc(int shift)
|
|
{
|
|
void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL);
|
|
BUG_ON(!ptr);
|
|
|
|
/* Ensure the zeroed page is visible to the page table walker */
|
|
dsb(ishst);
|
|
return __pa(ptr);
|
|
}
|
|
|
|
static phys_addr_t pgd_pgtable_alloc(int shift)
|
|
{
|
|
phys_addr_t pa = __pgd_pgtable_alloc(shift);
|
|
struct ptdesc *ptdesc = page_ptdesc(phys_to_page(pa));
|
|
|
|
/*
|
|
* Call proper page table ctor in case later we need to
|
|
* call core mm functions like apply_to_page_range() on
|
|
* this pre-allocated page table.
|
|
*
|
|
* We don't select ARCH_ENABLE_SPLIT_PMD_PTLOCK if pmd is
|
|
* folded, and if so pagetable_pte_ctor() becomes nop.
|
|
*/
|
|
if (shift == PAGE_SHIFT)
|
|
BUG_ON(!pagetable_pte_ctor(ptdesc));
|
|
else if (shift == PMD_SHIFT)
|
|
BUG_ON(!pagetable_pmd_ctor(ptdesc));
|
|
|
|
return pa;
|
|
}
|
|
|
|
/*
|
|
* This function can only be used to modify existing table entries,
|
|
* without allocating new levels of table. Note that this permits the
|
|
* creation of new section or page entries.
|
|
*/
|
|
void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt,
|
|
phys_addr_t size, pgprot_t prot)
|
|
{
|
|
if (virt < PAGE_OFFSET) {
|
|
pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n",
|
|
&phys, virt);
|
|
return;
|
|
}
|
|
__create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL,
|
|
NO_CONT_MAPPINGS);
|
|
}
|
|
|
|
/*
 * Map @size bytes at @phys to @virt in the page table of @mm, which
 * must not be init_mm. New tables come from pgd_pgtable_alloc(). When
 * @page_mappings_only is set, block and contiguous mappings are both
 * disabled.
 */
void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
			       unsigned long virt, phys_addr_t size,
			       pgprot_t prot, bool page_mappings_only)
{
	int map_flags;

	BUG_ON(mm == &init_mm);

	if (page_mappings_only)
		map_flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
	else
		map_flags = 0;

	__create_pgd_mapping(mm->pgd, phys, virt, size, prot,
			     pgd_pgtable_alloc, map_flags);
}
|
|
|
|
/*
 * Re-apply protection @prot to the existing init_mm mapping of
 * [@virt, @virt + @size) and flush the TLB for that range. No allocator
 * is passed down, so no new tables can be created. Requests below
 * PAGE_OFFSET are refused with a warning.
 */
static void update_mapping_prot(phys_addr_t phys, unsigned long virt,
				phys_addr_t size, pgprot_t prot)
{
	if (virt < PAGE_OFFSET) {
		pr_warn("BUG: not updating mapping for %pa at 0x%016lx - outside kernel range\n",
			&phys, virt);
		return;
	}

	__create_pgd_mapping(init_mm.pgd, phys, virt, size, prot,
			     NULL, NO_CONT_MAPPINGS);

	/* Live kernel mappings were changed, so the TLBs need flushing. */
	flush_tlb_kernel_range(virt, virt + size);
}
|
|
|
|
/*
 * Map the physical range [@start, @end) at its __phys_to_virt() address
 * in the table rooted at @pgdp, using the early allocator for any new
 * tables.
 */
static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start,
				  phys_addr_t end, pgprot_t prot, int flags)
{
	__create_pgd_mapping(pgdp, start, __phys_to_virt(start),
			     end - start, prot,
			     early_pgtable_alloc, flags);
}
|
|
|
|
/*
 * Make the linear-map alias of [_stext, __init_begin) read-only, which
 * removes write permission from the alias of .text/.rodata.
 */
void __init mark_linear_text_alias_ro(void)
{
	unsigned long alias = (unsigned long)lm_alias(_stext);
	unsigned long span = (unsigned long)__init_begin - (unsigned long)_stext;

	update_mapping_prot(__pa_symbol(_stext), alias, span, PAGE_KERNEL_RO);
}
|
|
|
|
#ifdef CONFIG_KFENCE
|
|
|
|
bool __ro_after_init kfence_early_init = !!CONFIG_KFENCE_SAMPLE_INTERVAL;
|
|
|
|
/* early_param() will be parsed before map_mem() below. */
|
|
static int __init parse_kfence_early_init(char *arg)
|
|
{
|
|
int val;
|
|
|
|
if (get_option(&arg, &val))
|
|
kfence_early_init = !!val;
|
|
return 0;
|
|
}
|
|
early_param("kfence.sample_interval", parse_kfence_early_init);
|
|
|
|
/*
 * Reserve the KFENCE pool from memblock when early KFENCE init is
 * enabled. Returns the pool's physical address, or 0 when KFENCE is
 * disabled or the allocation failed (which also clears
 * kfence_early_init). On success the pool is left marked NOMAP.
 */
static phys_addr_t __init arm64_kfence_alloc_pool(void)
{
	phys_addr_t pool_pa;

	if (!kfence_early_init)
		return 0;

	pool_pa = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
	if (pool_pa == 0) {
		pr_err("failed to allocate kfence pool\n");
		kfence_early_init = false;
		return 0;
	}

	/* NOMAP for now; arm64_kfence_map_pool() clears it again. */
	memblock_mark_nomap(pool_pa, KFENCE_POOL_SIZE);

	return pool_pa;
}
|
|
|
|
/*
 * Map a pool obtained from arm64_kfence_alloc_pool() into @pgdp, clear
 * its NOMAP marking and publish its address in __kfence_pool. A zero
 * @kfence_pool means there is no pool and nothing is done.
 */
static void __init arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp)
{
	phys_addr_t pool_end;

	if (!kfence_pool)
		return;

	pool_end = kfence_pool + KFENCE_POOL_SIZE;

	/* The KFENCE pool has to be mapped at page granularity. */
	__map_memblock(pgdp, kfence_pool, pool_end,
		       pgprot_tagged(PAGE_KERNEL),
		       NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
	memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
	__kfence_pool = phys_to_virt(kfence_pool);
}
|
|
#else /* CONFIG_KFENCE */
|
|
|
|
static inline phys_addr_t arm64_kfence_alloc_pool(void) { return 0; }
|
|
static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) { }
|
|
|
|
#endif /* CONFIG_KFENCE */
|
|
|
|
/*
 * Build the linear mapping of all memblock memory in the table rooted
 * at @pgdp. The kernel image range [_stext, __init_begin) and the
 * optional KFENCE pool are mapped separately from the general loop.
 */
static void __init map_mem(pgd_t *pgdp)
{
	static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
	phys_addr_t image_start = __pa_symbol(_stext);
	phys_addr_t image_end = __pa_symbol(__init_begin);
	phys_addr_t bank_start, bank_end;
	phys_addr_t kfence_pool_pa;
	int map_flags = NO_EXEC_MAPPINGS;
	u64 idx;

	/*
	 * Hierarchical PXNTable attributes on the table entries covering
	 * the linear region are only usable when no table entry at any
	 * level is shared between the linear and vmalloc regions. Checking
	 * this at the PGD level is enough: if it holds there, it holds at
	 * every other level too.
	 */
	BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end));

	kfence_pool_pa = arm64_kfence_alloc_pool();

	if (can_set_direct_map())
		map_flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;

	/*
	 * The kernel's read-only text and rodata must not gain a writable
	 * alias. Mark them NOMAP for now so that the loop below skips
	 * them.
	 */
	memblock_mark_nomap(image_start, image_end - image_start);

	/* Map every memory bank. */
	for_each_mem_range(idx, &bank_start, &bank_end) {
		if (bank_start >= bank_end)
			break;

		/*
		 * With MTE present the linear map has to permit reading
		 * and writing allocation tags; otherwise it carries the
		 * same attributes as PAGE_KERNEL.
		 */
		__map_memblock(pgdp, bank_start, bank_end,
			       pgprot_tagged(PAGE_KERNEL), map_flags);
	}

	/*
	 * Map the linear alias of [_stext, __init_begin) non-executable
	 * now; write permission is dropped later by
	 * mark_linear_text_alias_ro(), which is called once alternative
	 * patching has completed. The region thus stays readable for
	 * subsystems such as hibernate while being protected against
	 * inadvertent modification or execution. Contiguous mappings
	 * cannot be remapped like this, so they are avoided here.
	 */
	__map_memblock(pgdp, image_start, image_end,
		       PAGE_KERNEL, NO_CONT_MAPPINGS);
	memblock_clear_nomap(image_start, image_end - image_start);

	arm64_kfence_map_pool(kfence_pool_pa, pgdp);
}
|
|
|
|
void mark_rodata_ro(void)
|
|
{
|
|
unsigned long section_size;
|
|
|
|
/*
|
|
* mark .rodata as read only. Use __init_begin rather than __end_rodata
|
|
* to cover NOTES and EXCEPTION_TABLE.
|
|
*/
|
|
section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata;
|
|
update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata,
|
|
section_size, PAGE_KERNEL_RO);
|
|
|
|
debug_checkwx();
|
|
}
|
|
|
|
/*
 * Map the kernel image segment [@va_start, @va_end) into @pgdp with
 * protection @prot, then describe it in @vma and register that with
 * vm_area_add_early(). Both the segment's physical start and its size
 * must be page aligned. Unless @vm_flags contains VM_NO_GUARD, the
 * recorded size is one page larger than the mapped size.
 */
static void __init map_kernel_segment(pgd_t *pgdp, void *va_start, void *va_end,
				      pgprot_t prot, struct vm_struct *vma,
				      int flags, unsigned long vm_flags)
{
	phys_addr_t seg_pa = __pa_symbol(va_start);
	unsigned long seg_len = va_end - va_start;

	BUG_ON(!PAGE_ALIGNED(seg_pa));
	BUG_ON(!PAGE_ALIGNED(seg_len));

	__create_pgd_mapping(pgdp, seg_pa, (unsigned long)va_start, seg_len,
			     prot, early_pgtable_alloc, flags);

	/* Account for the guard page in the recorded area size. */
	if (!(vm_flags & VM_NO_GUARD))
		seg_len += PAGE_SIZE;

	vma->addr	= va_start;
	vma->phys_addr	= seg_pa;
	vma->size	= seg_len;
	vma->flags	= VM_MAP | vm_flags;
	vma->caller	= __builtin_return_address(0);

	vm_area_add_early(vma);
}
|
|
|
|
static pgprot_t kernel_exec_prot(void)
|
|
{
|
|
return rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
|
|
}
|
|
|
|
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
|
|
/*
 * Initcall that maps the entry trampoline text into tramp_pg_dir at
 * TRAMP_VALIAS and into the kernel page table through the
 * FIX_ENTRY_TRAMP_TEXT* fixmap slots. With CONFIG_RELOCATABLE, one
 * further page following the text is mapped read-only in the next
 * slot. Always returns 0.
 */
static int __init map_entry_trampoline(void)
{
	phys_addr_t tramp_pa = __pa_symbol(__entry_tramp_text_start);
	pgprot_t prot = kernel_exec_prot();
	int i;

	/* The trampoline is always mapped, so it may be global. */
	pgprot_val(prot) &= ~PTE_NG;

	/* The trampoline page table gets the text and nothing else. */
	memset(tramp_pg_dir, 0, PGD_SIZE);
	__create_pgd_mapping(tramp_pg_dir, tramp_pa, TRAMP_VALIAS,
			     entry_tramp_text_size(), prot,
			     __pgd_pgtable_alloc, NO_BLOCK_MAPPINGS);

	/* The kernel page table gets both the text and the data. */
	for (i = 0; i < DIV_ROUND_UP(entry_tramp_text_size(), PAGE_SIZE); i++) {
		__set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i,
			     tramp_pa + i * PAGE_SIZE, prot);
	}

	/* i now indexes the page right after the last text page. */
	if (IS_ENABLED(CONFIG_RELOCATABLE)) {
		__set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i,
			     tramp_pa + i * PAGE_SIZE, PAGE_KERNEL_RO);
	}

	return 0;
}
core_initcall(map_entry_trampoline);
|
|
#endif
|
|
|
|
/*
|
|
* Open coded check for BTI, only for use to determine configuration
|
|
* for early mappings for before the cpufeature code has run.
|
|
*/
|
|
static bool arm64_early_this_cpu_has_bti(void)
|
|
{
|
|
u64 pfr1;
|
|
|
|
if (!IS_ENABLED(CONFIG_ARM64_BTI_KERNEL))
|
|
return false;
|
|
|
|
pfr1 = __read_sysreg_by_encoding(SYS_ID_AA64PFR1_EL1);
|
|
return cpuid_feature_extract_unsigned_field(pfr1,
|
|
ID_AA64PFR1_EL1_BT_SHIFT);
|
|
}
|
|
|
|
/*
|
|
* Create fine-grained mappings for the kernel.
|
|
*/
|
|
static void __init map_kernel(pgd_t *pgdp)
|
|
{
|
|
static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_inittext,
|
|
vmlinux_initdata, vmlinux_data;
|
|
|
|
/*
|
|
* External debuggers may need to write directly to the text
|
|
* mapping to install SW breakpoints. Allow this (only) when
|
|
* explicitly requested with rodata=off.
|
|
*/
|
|
pgprot_t text_prot = kernel_exec_prot();
|
|
|
|
/*
|
|
* If we have a CPU that supports BTI and a kernel built for
|
|
* BTI then mark the kernel executable text as guarded pages
|
|
* now so we don't have to rewrite the page tables later.
|
|
*/
|
|
if (arm64_early_this_cpu_has_bti())
|
|
text_prot = __pgprot_modify(text_prot, PTE_GP, PTE_GP);
|
|
|
|
/*
|
|
* Only rodata will be remapped with different permissions later on,
|
|
* all other segments are allowed to use contiguous mappings.
|
|
*/
|
|
map_kernel_segment(pgdp, _stext, _etext, text_prot, &vmlinux_text, 0,
|
|
VM_NO_GUARD);
|
|
map_kernel_segment(pgdp, __start_rodata, __inittext_begin, PAGE_KERNEL,
|
|
&vmlinux_rodata, NO_CONT_MAPPINGS, VM_NO_GUARD);
|
|
map_kernel_segment(pgdp, __inittext_begin, __inittext_end, text_prot,
|
|
&vmlinux_inittext, 0, VM_NO_GUARD);
|
|
map_kernel_segment(pgdp, __initdata_begin, __initdata_end, PAGE_KERNEL,
|
|
&vmlinux_initdata, 0, VM_NO_GUARD);
|
|
map_kernel_segment(pgdp, _data, _end, PAGE_KERNEL, &vmlinux_data, 0, 0);
|
|
|
|
fixmap_copy(pgdp);
|
|
kasan_copy_shadow(pgdp);
|
|
}
|
|
|
|
/*
 * Build the identity map for [__idmap_text_start, __idmap_text_end) in
 * idmap_pg_dir, inserting an extra translation level first when one is
 * needed. With CONFIG_UNMAP_KERNEL_AT_EL0, __idmap_kpti_flag is also
 * identity-mapped read-write.
 */
static void __init create_idmap(void)
{
	u64 text_pa = __pa_symbol(__idmap_text_start);
	u64 text_len = __pa_symbol(__idmap_text_end) - text_pa;
	pgd_t *root = idmap_pg_dir;
	u64 extra_pa;

	/* Is an additional level of translation required? */
	if (VA_BITS < 48 && idmap_t0sz < (64 - VA_BITS_MIN)) {
		extra_pa = early_pgtable_alloc(PAGE_SHIFT);
		set_pgd(&idmap_pg_dir[text_pa >> VA_BITS],
			__pgd(extra_pa | P4D_TYPE_TABLE));
		root = __va(extra_pa);
	}

	__create_pgd_mapping(root, text_pa, text_pa, text_len,
			     PAGE_KERNEL_ROX, early_pgtable_alloc, 0);

	if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0)) {
		extern u32 __idmap_kpti_flag;
		u64 flag_pa = __pa_symbol(&__idmap_kpti_flag);

		/*
		 * The KPTI G-to-nG conversion code requires its
		 * synchronization flag to be mapped read-write in the
		 * ID map.
		 */
		__create_pgd_mapping(root, flag_pa, flag_pa, sizeof(u32),
				     PAGE_KERNEL, early_pgtable_alloc, 0);
	}
}
|
|
|
|
/*
 * Populate swapper_pg_dir with the kernel and linear mappings, switch
 * TTBR1 over to it, free the init_pg_dir range back to memblock, allow
 * memblock to resize, and finally build the identity map.
 */
void __init paging_init(void)
{
	extern pgd_t init_idmap_pg_dir[];
	pgd_t *swapper = pgd_set_fixmap(__pa_symbol(swapper_pg_dir));

	idmap_t0sz = 63UL - __fls(__pa_symbol(_end) | GENMASK(VA_BITS_MIN - 1, 0));

	/* Fill swapper_pg_dir through its fixmap alias. */
	map_kernel(swapper);
	map_mem(swapper);

	pgd_clear_fixmap();

	cpu_replace_ttbr1(lm_alias(swapper_pg_dir), init_idmap_pg_dir);
	init_mm.pgd = swapper_pg_dir;

	/* Hand the init_pg_dir range back to memblock. */
	memblock_phys_free(__pa_symbol(init_pg_dir),
			   __pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir));

	memblock_allow_resize();

	create_idmap();
}
|
|
|
|
#ifdef CONFIG_MEMORY_HOTPLUG
|
|
static void free_hotplug_page_range(struct page *page, size_t size,
|
|
struct vmem_altmap *altmap)
|
|
{
|
|
if (altmap) {
|
|
vmem_altmap_free(altmap, size >> PAGE_SHIFT);
|
|
} else {
|
|
WARN_ON(PageReserved(page));
|
|
free_pages((unsigned long)page_address(page), get_order(size));
|
|
}
|
|
}
|
|
|
|
static void free_hotplug_pgtable_page(struct page *page)
|
|
{
|
|
free_hotplug_page_range(page, PAGE_SIZE, NULL);
|
|
}
|
|
|
|
/*
 * Check that [start, end) stays within [floor, ceiling) once @start and
 * @ceiling have been rounded down with @mask.
 *
 * Returns false when the rounded-down @start lies below @floor, when a
 * non-zero @ceiling rounds down to zero, or when @end lies beyond the
 * rounded-down @ceiling. A @ceiling of zero imposes no upper limit.
 */
static bool pgtable_range_aligned(unsigned long start, unsigned long end,
				  unsigned long floor, unsigned long ceiling,
				  unsigned long mask)
{
	unsigned long lo = start & mask;
	unsigned long hi = ceiling;

	if (lo < floor)
		return false;

	if (ceiling != 0) {
		hi = ceiling & mask;
		if (hi == 0)
			return false;
	}

	/* The "- 1" makes a zero ceiling wrap to the largest value. */
	return end - 1 <= hi - 1;
}
|
|
|
|
/*
 * Clear every populated PTE covering [addr, end) under @pmdp and flush
 * the TLB for each cleared page. When @free_mapped is set, the page the
 * entry pointed at is freed too, via @altmap if one is given.
 */
static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
				    unsigned long end, bool free_mapped,
				    struct vmem_altmap *altmap)
{
	pte_t *entry, val;

	do {
		entry = pte_offset_kernel(pmdp, addr);
		val = READ_ONCE(*entry);

		/* Empty slots need no work. */
		if (!pte_none(val)) {
			WARN_ON(!pte_present(val));
			pte_clear(&init_mm, addr, entry);
			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
			if (free_mapped)
				free_hotplug_page_range(pte_page(val),
							PAGE_SIZE, altmap);
		}
	} while (addr += PAGE_SIZE, addr < end);
}
|
|
|
|
/*
 * Tear down the PMD-level mappings covering [addr, end) under @pudp.
 * Block entries are cleared here (and their memory freed when
 * @free_mapped is set); table entries are handed down to
 * unmap_hotplug_pte_range().
 */
static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
				    unsigned long end, bool free_mapped,
				    struct vmem_altmap *altmap)
{
	unsigned long next;
	pmd_t *entry, val;

	do {
		next = pmd_addr_end(addr, end);
		entry = pmd_offset(pudp, addr);
		val = READ_ONCE(*entry);
		if (pmd_none(val))
			continue;

		WARN_ON(!pmd_present(val));
		if (!pmd_sect(val)) {
			/* Not a block: expect a table and descend. */
			WARN_ON(!pmd_table(val));
			unmap_hotplug_pte_range(entry, addr, next,
						free_mapped, altmap);
			continue;
		}

		pmd_clear(entry);

		/*
		 * The whole PMD_SIZE range was mapped by one block entry,
		 * so a single TLBI should be enough.
		 */
		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
		if (free_mapped)
			free_hotplug_page_range(pmd_page(val),
						PMD_SIZE, altmap);
	} while (addr = next, addr < end);
}
|
|
|
|
/*
 * Tear down the PUD-level mappings covering [addr, end) under @p4dp.
 * Block entries are cleared here (and their memory freed when
 * @free_mapped is set); table entries are handed down to
 * unmap_hotplug_pmd_range().
 */
static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
				    unsigned long end, bool free_mapped,
				    struct vmem_altmap *altmap)
{
	unsigned long next;
	pud_t *entry, val;

	do {
		next = pud_addr_end(addr, end);
		entry = pud_offset(p4dp, addr);
		val = READ_ONCE(*entry);
		if (pud_none(val))
			continue;

		WARN_ON(!pud_present(val));
		if (!pud_sect(val)) {
			/* Not a block: expect a table and descend. */
			WARN_ON(!pud_table(val));
			unmap_hotplug_pmd_range(entry, addr, next,
						free_mapped, altmap);
			continue;
		}

		pud_clear(entry);

		/*
		 * The whole PUD_SIZE range was mapped by one block entry,
		 * so a single TLBI should be enough.
		 */
		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
		if (free_mapped)
			free_hotplug_page_range(pud_page(val),
						PUD_SIZE, altmap);
	} while (addr = next, addr < end);
}
|
|
|
|
/*
 * Walk the P4D entries covering [addr, end) under @pgdp and hand every
 * populated one down to unmap_hotplug_pud_range().
 */
static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
				    unsigned long end, bool free_mapped,
				    struct vmem_altmap *altmap)
{
	unsigned long next;
	p4d_t *entry, val;

	do {
		next = p4d_addr_end(addr, end);
		entry = p4d_offset(pgdp, addr);
		val = READ_ONCE(*entry);

		/* Empty slots need no work. */
		if (!p4d_none(val)) {
			WARN_ON(!p4d_present(val));
			unmap_hotplug_pud_range(entry, addr, next,
						free_mapped, altmap);
		}
	} while (addr = next, addr < end);
}
|
|
|
|
static void unmap_hotplug_range(unsigned long addr, unsigned long end,
|
|
bool free_mapped, struct vmem_altmap *altmap)
|
|
{
|
|
unsigned long next;
|
|
pgd_t *pgdp, pgd;
|
|
|
|
/*
|
|
* altmap can only be used as vmemmap mapping backing memory.
|
|
* In case the backing memory itself is not being freed, then
|
|
* altmap is irrelevant. Warn about this inconsistency when
|
|
* encountered.
|
|
*/
|
|
WARN_ON(!free_mapped && altmap);
|
|
|
|
do {
|
|
next = pgd_addr_end(addr, end);
|
|
pgdp = pgd_offset_k(addr);
|
|
pgd = READ_ONCE(*pgdp);
|
|
if (pgd_none(pgd))
|
|
continue;
|
|
|
|
WARN_ON(!pgd_present(pgd));
|
|
unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped, altmap);
|
|
} while (addr = next, addr < end);
|
|
}
|
|
|
|
/*
 * Free the PTE table referenced by @pmdp if it has become empty.
 *
 * The PTEs for [addr, end) are first checked to be clear already. The
 * table is only released when the range passes the @floor/@ceiling
 * check at PMD granularity and every entry in the table is empty.
 */
static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
				 unsigned long end, unsigned long floor,
				 unsigned long ceiling)
{
	unsigned long idx, start = addr;
	pte_t *table, val;

	do {
		table = pte_offset_kernel(pmdp, addr);
		val = READ_ONCE(*table);

		/*
		 * Sanity check only: the earlier unmap loops should have
		 * cleared this pte already.
		 */
		WARN_ON(!pte_none(val));
	} while (addr += PAGE_SIZE, addr < end);

	if (!pgtable_range_aligned(start, end, floor, ceiling, PMD_MASK))
		return;

	/*
	 * See whether the remaining entries are empty too, in which case
	 * the pte page can go. Overlap with other regions has already
	 * been dealt with by the floor/ceiling check.
	 */
	table = pte_offset_kernel(pmdp, 0UL);
	for (idx = 0; idx < PTRS_PER_PTE; idx++) {
		if (!pte_none(READ_ONCE(table[idx])))
			return;
	}

	pmd_clear(pmdp);
	__flush_tlb_kernel_pgtable(start);
	free_hotplug_pgtable_page(virt_to_page(table));
}
|
|
|
|
/*
 * Visit every populated PMD entry covering [addr, end), let
 * free_empty_pte_table() release its PTE table where possible, and then
 * free the PMD table referenced by @pudp if it has become entirely empty.
 *
 * Nothing is freed at this level when CONFIG_PGTABLE_LEVELS <= 2 or when
 * pgtable_range_aligned() rejects the range against @floor/@ceiling.
 */
static void free_empty_pmd_table(pud_t *pudp, unsigned long addr,
				 unsigned long end, unsigned long floor,
				 unsigned long ceiling)
{
	const unsigned long start = addr;
	unsigned long idx, boundary;
	pmd_t *pmdp, *table, pmd;

	do {
		boundary = pmd_addr_end(addr, end);
		pmdp = pmd_offset(pudp, addr);
		pmd = READ_ONCE(*pmdp);

		if (!pmd_none(pmd)) {
			/* Only table entries are expected at this point. */
			WARN_ON(!pmd_present(pmd) || !pmd_table(pmd) || pmd_sect(pmd));
			free_empty_pte_table(pmdp, addr, boundary, floor, ceiling);
		}

		addr = boundary;
	} while (addr < end);

	if (CONFIG_PGTABLE_LEVELS <= 2 ||
	    !pgtable_range_aligned(start, end, floor, ceiling, PUD_MASK))
		return;

	/*
	 * Overlap with other regions was handled by the floor/ceiling
	 * check; the PMD page can go only if every entry in it is empty.
	 */
	table = pmd_offset(pudp, 0UL);
	for (idx = 0; idx < PTRS_PER_PMD; idx++) {
		if (!pmd_none(READ_ONCE(table[idx])))
			return;
	}

	pud_clear(pudp);
	__flush_tlb_kernel_pgtable(start);
	free_hotplug_pgtable_page(virt_to_page(table));
}
|
|
|
|
/*
 * Visit every populated PUD entry covering [addr, end), let
 * free_empty_pmd_table() release its PMD table where possible, and then
 * free the PUD table referenced by @p4dp if it has become entirely empty.
 *
 * Nothing is freed at this level when CONFIG_PGTABLE_LEVELS <= 3 or when
 * pgtable_range_aligned() rejects the range against @floor/@ceiling.
 */
static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr,
				 unsigned long end, unsigned long floor,
				 unsigned long ceiling)
{
	const unsigned long start = addr;
	unsigned long idx, boundary;
	pud_t *pudp, *table, pud;

	do {
		boundary = pud_addr_end(addr, end);
		pudp = pud_offset(p4dp, addr);
		pud = READ_ONCE(*pudp);

		if (!pud_none(pud)) {
			/* Only table entries are expected at this point. */
			WARN_ON(!pud_present(pud) || !pud_table(pud) || pud_sect(pud));
			free_empty_pmd_table(pudp, addr, boundary, floor, ceiling);
		}

		addr = boundary;
	} while (addr < end);

	if (CONFIG_PGTABLE_LEVELS <= 3 ||
	    !pgtable_range_aligned(start, end, floor, ceiling, PGDIR_MASK))
		return;

	/*
	 * Overlap with other regions was handled by the floor/ceiling
	 * check; the PUD page can go only if every entry in it is empty.
	 */
	table = pud_offset(p4dp, 0UL);
	for (idx = 0; idx < PTRS_PER_PUD; idx++) {
		if (!pud_none(READ_ONCE(table[idx])))
			return;
	}

	p4d_clear(p4dp);
	__flush_tlb_kernel_pgtable(start);
	free_hotplug_pgtable_page(virt_to_page(table));
}
|
|
|
|
/*
 * Step through the P4D entries covering [addr, end) and call
 * free_empty_pud_table() on each populated one. No table is freed at this
 * level; @floor and @ceiling are simply handed down.
 */
static void free_empty_p4d_table(pgd_t *pgdp, unsigned long addr,
				 unsigned long end, unsigned long floor,
				 unsigned long ceiling)
{
	p4d_t *p4dp, p4d;
	unsigned long boundary;

	do {
		boundary = p4d_addr_end(addr, end);
		p4dp = p4d_offset(pgdp, addr);
		p4d = READ_ONCE(*p4dp);

		if (!p4d_none(p4d)) {
			WARN_ON(!p4d_present(p4d));
			free_empty_pud_table(p4dp, addr, boundary, floor, ceiling);
		}

		addr = boundary;
	} while (addr < end);
}
|
|
|
|
static void free_empty_tables(unsigned long addr, unsigned long end,
|
|
unsigned long floor, unsigned long ceiling)
|
|
{
|
|
unsigned long next;
|
|
pgd_t *pgdp, pgd;
|
|
|
|
do {
|
|
next = pgd_addr_end(addr, end);
|
|
pgdp = pgd_offset_k(addr);
|
|
pgd = READ_ONCE(*pgdp);
|
|
if (pgd_none(pgd))
|
|
continue;
|
|
|
|
WARN_ON(!pgd_present(pgd));
|
|
free_empty_p4d_table(pgdp, addr, next, floor, ceiling);
|
|
} while (addr = next, addr < end);
|
|
}
|
|
#endif
|
|
|
|
/*
 * Install a PROT_SECT_NORMAL huge mapping at @pmdp for the memory at
 * kernel virtual address @p, via pmd_set_huge().
 *
 * @node, @addr and @next are not used by this implementation.
 */
void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
			       unsigned long addr, unsigned long next)
{
	pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL));
}
|
|
|
|
/*
 * Run vmemmap_verify() on the entry at @pmdp, viewed as a pte_t, for the
 * range [addr, next) on @node.
 *
 * Always returns 1.
 */
int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
				unsigned long addr, unsigned long next)
{
	pte_t *ptep = (pte_t *)pmdp;

	vmemmap_verify(ptep, node, addr, next);

	return 1;
}
|
|
|
|
/*
 * Populate the vmemmap for [start, end) on @node.
 *
 * With CONFIG_ARM64_4K_PAGES the work is done by
 * vmemmap_populate_hugepages(); every other configuration uses
 * vmemmap_populate_basepages(). A warning is raised if the range strays
 * outside [VMEMMAP_START, VMEMMAP_END].
 *
 * Returns whatever the selected helper returns.
 */
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
			       struct vmem_altmap *altmap)
{
	WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));

	if (IS_ENABLED(CONFIG_ARM64_4K_PAGES))
		return vmemmap_populate_hugepages(start, end, node, altmap);

	return vmemmap_populate_basepages(start, end, node, altmap);
}
|
|
|
|
#ifdef CONFIG_MEMORY_HOTPLUG
|
|
void vmemmap_free(unsigned long start, unsigned long end,
|
|
struct vmem_altmap *altmap)
|
|
{
|
|
WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
|
|
|
|
unmap_hotplug_range(start, end, true, altmap);
|
|
free_empty_tables(start, end, VMEMMAP_START, VMEMMAP_END);
|
|
}
|
|
#endif /* CONFIG_MEMORY_HOTPLUG */
|
|
|
|
/*
 * Try to install a section mapping of @phys with protection @prot at @pudp.
 *
 * Returns 1 once the entry has been written, or 0 (entry untouched) when
 * pgattr_change_is_safe() rejects the transition from the current value.
 */
int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
{
	pud_t new_pud;

	new_pud = pfn_pud(__phys_to_pfn(phys), mk_pud_sect_prot(prot));

	/* For now only permission changes are let through. */
	if (pgattr_change_is_safe(READ_ONCE(pud_val(*pudp)),
				  pud_val(new_pud))) {
		VM_BUG_ON(phys & ~PUD_MASK);
		set_pud(pudp, new_pud);
		return 1;
	}

	return 0;
}
|
|
|
|
/*
 * Try to install a section mapping of @phys with protection @prot at @pmdp.
 *
 * Returns 1 once the entry has been written, or 0 (entry untouched) when
 * pgattr_change_is_safe() rejects the transition from the current value.
 */
int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
{
	pmd_t new_pmd;

	new_pmd = pfn_pmd(__phys_to_pfn(phys), mk_pmd_sect_prot(prot));

	/* For now only permission changes are let through. */
	if (pgattr_change_is_safe(READ_ONCE(pmd_val(*pmdp)),
				  pmd_val(new_pmd))) {
		VM_BUG_ON(phys & ~PMD_MASK);
		set_pmd(pmdp, new_pmd);
		return 1;
	}

	return 0;
}
|
|
|
|
/*
 * Clear the entry at @pudp if it currently holds a section mapping.
 *
 * Returns 1 when the entry was cleared, 0 when it was not a section entry
 * and has been left alone.
 */
int pud_clear_huge(pud_t *pudp)
{
	if (pud_sect(READ_ONCE(*pudp))) {
		pud_clear(pudp);
		return 1;
	}

	return 0;
}
|
|
|
|
/*
 * Clear the entry at @pmdp if it currently holds a section mapping.
 *
 * Returns 1 when the entry was cleared, 0 when it was not a section entry
 * and has been left alone.
 */
int pmd_clear_huge(pmd_t *pmdp)
{
	if (pmd_sect(READ_ONCE(*pmdp))) {
		pmd_clear(pmdp);
		return 1;
	}

	return 0;
}
|
|
|
|
/*
 * Detach and free the PTE table that @pmdp points at.
 *
 * When the entry is a table entry it is cleared, the kernel page-table TLB
 * entry for @addr is flushed and the table is handed to pte_free_kernel().
 * A non-table entry only triggers VM_WARN_ON().
 *
 * Always returns 1.
 */
int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
{
	pmd_t pmd = READ_ONCE(*pmdp);
	pte_t *table;

	if (pmd_table(pmd)) {
		/* Look the table up before the entry referencing it is cleared. */
		table = pte_offset_kernel(pmdp, addr);
		pmd_clear(pmdp);
		__flush_tlb_kernel_pgtable(addr);
		pte_free_kernel(NULL, table);
	} else {
		VM_WARN_ON(1);
	}

	return 1;
}
|
|
|
|
/*
 * Detach and free the PMD table that @pudp points at, together with the
 * PTE tables hanging off it.
 *
 * For a table entry, pmd_free_pte_page() is invoked on each PMD slot
 * covering [addr, addr + PUD_SIZE); then the PUD entry is cleared, the
 * kernel page-table TLB entry for @addr is flushed and the PMD table is
 * handed to pmd_free(). A non-table entry only triggers VM_WARN_ON().
 *
 * Always returns 1.
 */
int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
{
	unsigned long cur, end;
	pmd_t *table, *pmdp;
	pud_t pud;

	pud = READ_ONCE(*pudp);
	if (!pud_table(pud)) {
		VM_WARN_ON(1);
		return 1;
	}

	/* Remember the table base; it is what gets freed at the end. */
	table = pmd_offset(pudp, addr);
	end = addr + PUD_SIZE;

	pmdp = table;
	cur = addr;
	do {
		pmd_free_pte_page(pmdp, cur);
		pmdp++;
		cur += PMD_SIZE;
	} while (cur != end);

	pud_clear(pudp);
	__flush_tlb_kernel_pgtable(addr);
	pmd_free(NULL, table);

	return 1;
}
|
|
|
|
#ifdef CONFIG_MEMORY_HOTPLUG
|
|
/*
 * Remove the kernel mapping of [start, start + size): unmap the range
 * without freeing the mapped memory (free_mapped == false, no altmap),
 * then release page-table pages left empty, bounded by PAGE_OFFSET and
 * PAGE_END.
 *
 * @pgdir is only checked against init_mm.pgd; the walk itself does not
 * take it as an argument.
 */
static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
{
	const unsigned long end = start + size;

	/* Warn about anything other than init_mm's page table ... */
	WARN_ON(pgdir != init_mm.pgd);
	/* ... and about ranges outside [PAGE_OFFSET, PAGE_END]. */
	WARN_ON(start < PAGE_OFFSET || end > PAGE_END);

	unmap_hotplug_range(start, end, false, NULL);
	free_empty_tables(start, end, PAGE_OFFSET, PAGE_END);
}
|
|
|
|
struct range arch_get_mappable_range(void)
|
|
{
|
|
struct range mhp_range;
|
|
u64 start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual));
|
|
u64 end_linear_pa = __pa(PAGE_END - 1);
|
|
|
|
if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
|
|
/*
|
|
* Check for a wrap, it is possible because of randomized linear
|
|
* mapping the start physical address is actually bigger than
|
|
* the end physical address. In this case set start to zero
|
|
* because [0, end_linear_pa] range must still be able to cover
|
|
* all addressable physical addresses.
|
|
*/
|
|
if (start_linear_pa > end_linear_pa)
|
|
start_linear_pa = 0;
|
|
}
|
|
|
|
WARN_ON(start_linear_pa > end_linear_pa);
|
|
|
|
/*
|
|
* Linear mapping region is the range [PAGE_OFFSET..(PAGE_END - 1)]
|
|
* accommodating both its ends but excluding PAGE_END. Max physical
|
|
* range which can be mapped inside this linear mapping range, must
|
|
* also be derived from its end points.
|
|
*/
|
|
mhp_range.start = start_linear_pa;
|
|
mhp_range.end = end_linear_pa;
|
|
|
|
return mhp_range;
|
|
}
|
|
|
|
/*
 * Hot-add the physical range [start, start + size) on node @nid.
 *
 * The range is mapped into swapper_pg_dir at its linear address using
 * @params->pgprot with NO_EXEC_MAPPINGS (plus NO_BLOCK_MAPPINGS and
 * NO_CONT_MAPPINGS when can_set_direct_map() is true), its memblock
 * "nomap" marking is cleared, and the pages are registered through
 * __add_pages().
 *
 * Returns 0 on success. If __add_pages() fails its error code is returned
 * and the linear mapping created here is removed again.
 */
int arch_add_memory(int nid, u64 start, u64 size,
		    struct mhp_params *params)
{
	int ret, flags = NO_EXEC_MAPPINGS;
	unsigned long end_pfn;

	VM_BUG_ON(!mhp_range_allowed(start, size, true));

	if (can_set_direct_map())
		flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;

	__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
			     size, params->pgprot, __pgd_pgtable_alloc,
			     flags);

	memblock_clear_nomap(start, size);

	ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
			   params);
	if (ret) {
		/* Undo the linear mapping set up above. */
		__remove_pgd_mapping(swapper_pg_dir,
				     __phys_to_virt(start), size);
		return ret;
	}

	/*
	 * Only ever raise max_pfn/max_low_pfn. A range hot-added below the
	 * current end of memory must not pull them down, otherwise memory
	 * above the new range would fall outside [0, max_pfn).
	 */
	end_pfn = PFN_UP(start + size);
	if (end_pfn > max_pfn) {
		max_pfn = end_pfn;
		max_low_pfn = end_pfn;
	}

	return 0;
}
|
|
|
|
/*
 * Hot-remove the physical range [start, start + size): the pages are
 * removed through __remove_pages() first, and only then is the linear
 * mapping of the range torn down from swapper_pg_dir.
 */
void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
	__remove_pages(start >> PAGE_SHIFT, size >> PAGE_SHIFT, altmap);
	__remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
}
|
|
|
|
/*
|
|
* This memory hotplug notifier helps prevent boot memory from being
|
|
* inadvertently removed as it blocks pfn range offlining process in
|
|
* __offline_pages(). Hence this prevents both offlining as well as
|
|
* removal process for boot memory which is initially always online.
|
|
* In future if and when boot memory could be removed, this notifier
|
|
* should be dropped and free_hotplug_page_range() should handle any
|
|
* reserved pages allocated during boot.
|
|
*/
|
|
static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
|
|
unsigned long action, void *data)
|
|
{
|
|
struct mem_section *ms;
|
|
struct memory_notify *arg = data;
|
|
unsigned long end_pfn = arg->start_pfn + arg->nr_pages;
|
|
unsigned long pfn = arg->start_pfn;
|
|
|
|
if ((action != MEM_GOING_OFFLINE) && (action != MEM_OFFLINE))
|
|
return NOTIFY_OK;
|
|
|
|
for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
|
|
unsigned long start = PFN_PHYS(pfn);
|
|
unsigned long end = start + (1UL << PA_SECTION_SHIFT);
|
|
|
|
ms = __pfn_to_section(pfn);
|
|
if (!early_section(ms))
|
|
continue;
|
|
|
|
if (action == MEM_GOING_OFFLINE) {
|
|
/*
|
|
* Boot memory removal is not supported. Prevent
|
|
* it via blocking any attempted offline request
|
|
* for the boot memory and just report it.
|
|
*/
|
|
pr_warn("Boot memory [%lx %lx] offlining attempted\n", start, end);
|
|
return NOTIFY_BAD;
|
|
} else if (action == MEM_OFFLINE) {
|
|
/*
|
|
* This should have never happened. Boot memory
|
|
* offlining should have been prevented by this
|
|
* very notifier. Probably some memory removal
|
|
* procedure might have changed which would then
|
|
* require further debug.
|
|
*/
|
|
pr_err("Boot memory [%lx %lx] offlined\n", start, end);
|
|
|
|
/*
|
|
* Core memory hotplug does not process a return
|
|
* code from the notifier for MEM_OFFLINE events.
|
|
* The error condition has been reported. Return
|
|
* from here as if ignored.
|
|
*/
|
|
return NOTIFY_DONE;
|
|
}
|
|
}
|
|
return NOTIFY_OK;
|
|
}
|
|
|
|
static struct notifier_block prevent_bootmem_remove_nb = {
|
|
.notifier_call = prevent_bootmem_remove_notifier,
|
|
};
|
|
|
|
/*
|
|
* This ensures that boot memory sections on the platform are online
|
|
* from early boot. Memory sections could not be prevented from being
|
|
* offlined, unless for some reason they are not online to begin with.
|
|
* This helps validate the basic assumption on which the above memory
|
|
* event notifier works to prevent boot memory section offlining and
|
|
* its possible removal.
|
|
*/
|
|
static void validate_bootmem_online(void)
|
|
{
|
|
phys_addr_t start, end, addr;
|
|
struct mem_section *ms;
|
|
u64 i;
|
|
|
|
/*
|
|
* Scanning across all memblock might be expensive
|
|
* on some big memory systems. Hence enable this
|
|
* validation only with DEBUG_VM.
|
|
*/
|
|
if (!IS_ENABLED(CONFIG_DEBUG_VM))
|
|
return;
|
|
|
|
for_each_mem_range(i, &start, &end) {
|
|
for (addr = start; addr < end; addr += (1UL << PA_SECTION_SHIFT)) {
|
|
ms = __pfn_to_section(PHYS_PFN(addr));
|
|
|
|
/*
|
|
* All memory ranges in the system at this point
|
|
* should have been marked as early sections.
|
|
*/
|
|
WARN_ON(!early_section(ms));
|
|
|
|
/*
|
|
* Memory notifier mechanism here to prevent boot
|
|
* memory offlining depends on the fact that each
|
|
* early section memory on the system is initially
|
|
* online. Otherwise a given memory section which
|
|
* is already offline will be overlooked and can
|
|
* be removed completely. Call out such sections.
|
|
*/
|
|
if (!online_section(ms))
|
|
pr_err("Boot memory [%llx %llx] is offline, can be removed\n",
|
|
addr, addr + (1UL << PA_SECTION_SHIFT));
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
 * Early initcall: with CONFIG_MEMORY_HOTREMOVE enabled, run
 * validate_bootmem_online() and register prevent_bootmem_remove_nb as a
 * memory notifier.
 *
 * Returns 0 when hot-remove is not configured, otherwise the result of
 * register_memory_notifier() (a failure is also logged).
 */
static int __init prevent_bootmem_remove_init(void)
{
	int ret;

	if (!IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
		return 0;

	validate_bootmem_online();

	ret = register_memory_notifier(&prevent_bootmem_remove_nb);
	if (ret)
		pr_err("%s: Notifier registration failed %d\n", __func__, ret);

	return ret;
}
early_initcall(prevent_bootmem_remove_init);
|
|
#endif
|
|
|
|
/*
 * Begin a protection change on the PTE at @ptep: the entry is cleared and
 * its previous value returned.
 *
 * The clear is normally done with ptep_get_and_clear(). On CPUs with the
 * ARM64_WORKAROUND_2645198 capability, an entry that is currently user
 * executable is cleared with ptep_clear_flush() instead.
 */
pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
{
	/*
	 * CPUs affected by erratum #2645198 require break-before-make (BBM)
	 * for every user space mapping whose permission changes from
	 * executable to non-executable.
	 */
	if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198) &&
	    pte_user_exec(READ_ONCE(*ptep)))
		return ptep_clear_flush(vma, addr, ptep);

	return ptep_get_and_clear(vma->vm_mm, addr, ptep);
}
|
|
|
|
/*
 * Finish a protection change by writing @pte at @ptep for @addr in the
 * mm that owns @vma, via set_pte_at().
 *
 * @old_pte is not used by this implementation.
 */
void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep,
			     pte_t old_pte, pte_t pte)
{
	set_pte_at(vma->vm_mm, addr, ptep, pte);
}
|